Example #1
0
 def _test_percents(self, human_classified_pickle, language):
     """Count how often automatic classification agrees with human labels.

     Loads the human classification from ``human_classified_pickle``,
     classifies every listed entry and compares the score with the
     ``_low`` / ``_high`` thresholds.

     Returns the 7-tuple ``(matches, true_positive, true_negative,
     false_positive, false_negative, unknown, entry_count)``. The six
     counters are floats; ``entry_count`` is the number of labelled entries.
     """
     reference = HumanClassification(human_classified_pickle)
     reference.load()
     labels = reference.classification
     total = len(labels)
     matches = true_positive = true_negative = 0.0
     false_positive = false_negative = unknown = 0.0
     for entry_id in labels:
         stored_entry = self.db.get_entry_by_id(entry_id)
         score = self.classify(stored_entry.original_entry, language)
         # Any falsy label (False, but also None) counts as "not relevant".
         marked_relevant = bool(labels[entry_id])
         if score < self._low:
             # Classifier says "not relevant".
             if marked_relevant:
                 false_negative += 1
             else:
                 matches += 1
                 true_negative += 1
         elif score >= self._high:
             # Classifier says "relevant".
             if marked_relevant:
                 matches += 1
                 true_positive += 1
             else:
                 false_positive += 1
         else:
             # Score fell between the two thresholds.
             unknown += 1
     return (matches, true_positive, true_negative, false_positive,
             false_negative, unknown, total)
Example #2
0
    def _test_percents(self, human_classified_pickle, language):
        """Count how often automatic classification agrees with human labels.

        Loads the human classification from ``human_classified_pickle``,
        classifies every listed entry and compares the score with the
        ``_low`` / ``_high`` thresholds.

        Returns the 7-tuple ``(matches, true_positive, true_negative,
        false_positive, false_negative, unknown, entry_count)``. The six
        counters are floats; ``entry_count`` is the number of labelled
        entries.
        """
        reference = HumanClassification(human_classified_pickle)
        reference.load()
        labels = reference.classification
        total = len(labels)
        matches = true_positive = true_negative = 0.0
        false_positive = false_negative = unknown = 0.0
        for entry_id in labels:
            stored_entry = self.db.get_entry_by_id(entry_id)
            score = self.classify(stored_entry.original_entry, language)
            # Any falsy label (False, but also None) counts as "not relevant".
            marked_relevant = bool(labels[entry_id])
            if score < self._low:
                # Classifier says "not relevant".
                if marked_relevant:
                    false_negative += 1
                else:
                    matches += 1
                    true_negative += 1
            elif score >= self._high:
                # Classifier says "relevant".
                if marked_relevant:
                    matches += 1
                    true_positive += 1
                else:
                    false_positive += 1
            else:
                # Score fell between the two thresholds.
                unknown += 1
        return (matches, true_positive, true_negative, false_positive,
                false_negative, unknown, total)
Example #3
0
    def run_tests(self, input_file, language):
        """Evaluate the classifier against ``input_file`` and print the report.

        Connects to the database, records the sizes of the test and training
        sets in a ``Tests`` object, then stores the correlation and the
        accuracy counters computed for ``language`` and prints the object.
        """
        self.db.connect(user='******', database='meco',
                        host='localhost', port=5432)
        test_set = HumanClassification(input_file)
        test_set.load()
        log = self._logger
        log.info('Running tests...')

        report = Tests()
        report.set_test_len(len(test_set.classification))
        report.set_train_len(len(self.human.classification))
        positive_count = self.human.get_positively_classified_count(language)
        report.set_train_positive_len(positive_count)
        negative_count = self.human.get_negatively_classified_count(language)
        report.set_train_negative_len(negative_count)

        log.info('Calculating corelation...')
        correlation = self._test_corelation(input_file, language)
        report.set_corelation(correlation)

        log.info('Calculating percentage of classification accuracy...')
        percents = self._test_percents(input_file, language)
        report.set_percents(percents)
        # Single-argument parenthesised print behaves the same on Python 2.
        print(report)
Example #4
0
    def _test_corelation(self, human_classified_pickle, language):
        """Return the correlation between human and automatic classification.

        Loads the human classification from ``human_classified_pickle`` and
        classifies every listed entry for ``language``.

        Returns the correlation coefficient as a float. When it is undefined
        (no entries, or one of the two series has zero variance) ``0.0`` is
        returned instead of raising ``ZeroDivisionError``.
        """
        #
        #                 C(X,Y)                   E(XY) - E(X)E(Y)
        # corelation = ------------ = -------------------------------------------
        #               d(X) d(Y)     sqrt(E(X^2) - E(X)^2) sqrt(E(Y^2) - E(Y)^2)
        #
        # C(X,Y) - covariance, d(X), d(Y) - standard deviations
        # X - automatically calculated probabilities
        # Y - human input probabilities
        #
        human_classified = HumanClassification(human_classified_pickle)
        human_classified.load()
        entry_count = len(human_classified.classification)
        if not entry_count:
            # Nothing to correlate; avoid dividing by zero below.
            return 0.0

        sum_xy = sum_x = sum_y = sum_xx = sum_yy = 0.0
        for entry_id in human_classified.classification:
            processed_entry = self.db.get_entry_by_id(entry_id)
            probability_auto = self.classify(processed_entry.original_entry,
                                             language)
            # Any falsy label (False, but also None) is rated as negative.
            if human_classified.classification[entry_id]:
                probability_human = self.HUMAN_RATING_PROBABILITY
            else:
                probability_human = 1 - self.HUMAN_RATING_PROBABILITY

            sum_xy += probability_human * probability_auto
            sum_x += probability_auto
            sum_y += probability_human
            sum_xx += probability_auto * probability_auto
            sum_yy += probability_human * probability_human

        # E() values
        mean_xy = sum_xy / entry_count
        mean_x = sum_x / entry_count
        mean_y = sum_y / entry_count
        mean_xx = sum_xx / entry_count
        mean_yy = sum_yy / entry_count

        # Rounding can push a zero variance slightly below zero, which would
        # make sqrt() raise a domain error; clamp to zero.
        variance_x = max(mean_xx - mean_x * mean_x, 0.0)
        variance_y = max(mean_yy - mean_y * mean_y, 0.0)
        denominator = sqrt(variance_x) * sqrt(variance_y)
        if denominator == 0:
            # A constant series has no defined correlation.
            return 0.0
        return (mean_xy - mean_x * mean_y) / denominator
Example #5
0
    def _test_corelation(self, human_classified_pickle, language):
        """Return the correlation between human and automatic classification.

        Loads the human classification from ``human_classified_pickle`` and
        classifies every listed entry for ``language``.

        Returns the correlation coefficient as a float. When it is undefined
        (no entries, or one of the two series has zero variance) ``0.0`` is
        returned instead of raising ``ZeroDivisionError``.
        """
        #
        #                 C(X,Y)                   E(XY) - E(X)E(Y)
        # corelation = ------------ = -------------------------------------------
        #               d(X) d(Y)     sqrt(E(X^2) - E(X)^2) sqrt(E(Y^2) - E(Y)^2)
        #
        # C(X,Y) - covariance, d(X), d(Y) - standard deviations
        # X - automatically calculated probabilities
        # Y - human input probabilities
        #
        human_classified = HumanClassification(human_classified_pickle)
        human_classified.load()
        entry_count = len(human_classified.classification)
        if not entry_count:
            # Nothing to correlate; avoid dividing by zero below.
            return 0.0

        sum_xy = sum_x = sum_y = sum_xx = sum_yy = 0.0
        for entry_id in human_classified.classification:
            processed_entry = self.db.get_entry_by_id(entry_id)
            probability_auto = self.classify(processed_entry.original_entry,
                                             language)
            # Any falsy label (False, but also None) is rated as negative.
            if human_classified.classification[entry_id]:
                probability_human = self.HUMAN_RATING_PROBABILITY
            else:
                probability_human = 1 - self.HUMAN_RATING_PROBABILITY

            sum_xy += probability_human * probability_auto
            sum_x += probability_auto
            sum_y += probability_human
            sum_xx += probability_auto * probability_auto
            sum_yy += probability_human * probability_human

        # E() values
        mean_xy = sum_xy / entry_count
        mean_x = sum_x / entry_count
        mean_y = sum_y / entry_count
        mean_xx = sum_xx / entry_count
        mean_yy = sum_yy / entry_count

        # Rounding can push a zero variance slightly below zero, which would
        # make sqrt() raise a domain error; clamp to zero.
        variance_x = max(mean_xx - mean_x * mean_x, 0.0)
        variance_y = max(mean_yy - mean_y * mean_y, 0.0)
        denominator = sqrt(variance_x) * sqrt(variance_y)
        if denominator == 0:
            # A constant series has no defined correlation.
            return 0.0
        return (mean_xy - mean_x * mean_y) / denominator
Example #6
0
 def run_tests(self, input_file, language):
     """Evaluate the classifier against ``input_file`` and print the report.

     Connects to the database, records the sizes of the test and training
     sets in a ``Tests`` object, then stores the correlation and the
     accuracy counters computed for ``language`` and prints the object.
     """
     self.db.connect(user='******', database='meco',
                     host='localhost', port=5432)
     test_set = HumanClassification(input_file)
     test_set.load()
     log = self._logger
     log.info('Running tests...')

     report = Tests()
     report.set_test_len(len(test_set.classification))
     report.set_train_len(len(self.human.classification))
     positive_count = self.human.get_positively_classified_count(language)
     report.set_train_positive_len(positive_count)
     negative_count = self.human.get_negatively_classified_count(language)
     report.set_train_negative_len(negative_count)

     log.info('Calculating corelation...')
     correlation = self._test_corelation(input_file, language)
     report.set_corelation(correlation)

     log.info('Calculating percentage of classification accuracy...')
     percents = self._test_percents(input_file, language)
     report.set_percents(percents)
     # Single-argument parenthesised print behaves the same on Python 2.
     print(report)
Example #7
0
 def __init__(self, low=0.5, high=0.5):
     """Set thresholds, logging and the database handle, then load pickles.

     ``low`` and ``high`` are converted to float and kept as the
     classification thresholds ``_low`` and ``_high``.
     """
     human_pickle = '/mnt/minerva1/nlp/projects/twitter_classification/TwitterClassifier/pickles/HumanClassification'
     words_pickle = '/mnt/minerva1/nlp/projects/twitter_classification/TwitterClassifier/pickles/WordDictionary'

     # classification thresholds
     self._low, self._high = float(low), float(high)

     # logging goes through the root logger at DEBUG level
     logging.basicConfig(level=logging.DEBUG)
     self._logger = logging.getLogger()

     # database connection object
     self.db = Connection()

     # entries that were already classified by a human
     self._logger.info('Loading Allready classified entries...')
     already_classified = HumanClassification(human_pickle)
     self.human = already_classified
     already_classified.load()

     # dictionary of known words
     self._logger.info('Loading word dictionary...')
     dictionary = WordDictionary(words_pickle)
     self.word_dict = dictionary
     dictionary.load()

     # timer
     self._timer = timeit.Timer()
Example #8
0
    def human_classify(self, output_pickle, language):
        """Interactively label entries and store the answers in output_pickle.

        May be used for creating test data. Every entry of ``language`` that
        is not yet present in the classification is shown together with its
        automatic score, and the user answers:

        * ``y`` / ``n`` - stored as ``True`` / ``False``
        * ``?``         - stored as ``None`` (undecided)
        * ``END``       - stop asking
        * anything else - accept the automatic suggestion shown in brackets

        The classification is stored when the loop ends, including when it is
        interrupted with Ctrl-C.
        """
        self.db.connect(user='******', database='meco',
                        host='localhost', port=5432)
        new_human_classify = HumanClassification(output_pickle)
        new_human_classify.load()
        try:
            for entry in self.db.entries(language=language, entry_count=None,
                                         entry_offset=0):
                # skip entries that were already processed
                if entry.id in new_human_classify.classification:
                    continue
                # Classify once and reuse the score for display and defaults.
                automatic_classification = self.classify(entry.original_entry,
                                                         language)
                print('Original entry: \n"' + entry.original_entry
                      + '"\n automatic classification = '
                      + str(automatic_classification))
                if automatic_classification < self._low:
                    auto, default_label = 'n', False
                    # TODO: remove - entries scored below the low threshold
                    # are currently skipped without asking.
                    continue
                elif automatic_classification >= self._high:
                    auto, default_label = 'y', True
                else:
                    auto, default_label = '?', None
                answer = raw_input('Is this entry relevant? (y/n/?/END))['
                                   + auto + ']: ')
                if answer == 'END':
                    break
                if answer == 'y':
                    label = True
                elif answer == 'n':
                    label = False
                elif answer == '?':
                    # An explicit "undecided" must not be replaced by the
                    # automatic suggestion.
                    label = None
                else:
                    label = default_label
                new_human_classify.classification[entry.id] = label
                print('Cassified count = '
                      + str(len(new_human_classify.classification)))
        except KeyboardInterrupt:
            # Ctrl-C ends the session; what was collected is still stored.
            pass
        new_human_classify.store()
Example #9
0
    def __init__(self, low=0.5, high=0.5):
        """Set thresholds, logging and the database handle, then load pickles.

        ``low`` and ``high`` are converted to float and kept as the
        classification thresholds ``_low`` and ``_high``.
        """
        human_pickle = '/mnt/minerva1/nlp/projects/twitter_classification/TwitterClassifier/pickles/HumanClassification'
        words_pickle = '/mnt/minerva1/nlp/projects/twitter_classification/TwitterClassifier/pickles/WordDictionary'

        # classification thresholds
        self._low, self._high = float(low), float(high)

        # logging goes through the root logger at DEBUG level
        logging.basicConfig(level=logging.DEBUG)
        self._logger = logging.getLogger()

        # database connection object
        self.db = Connection()

        # entries that were already classified by a human
        self._logger.info('Loading Allready classified entries...')
        already_classified = HumanClassification(human_pickle)
        self.human = already_classified
        already_classified.load()

        # dictionary of known words
        self._logger.info('Loading word dictionary...')
        dictionary = WordDictionary(words_pickle)
        self.word_dict = dictionary
        dictionary.load()

        # timer
        self._timer = timeit.Timer()
Example #10
0
 def human_classify(self, output_pickle, language):
     """Interactively label entries and store the answers in output_pickle.

     May be used for creating test data. Every entry of ``language`` that
     is not yet present in the classification is shown together with its
     automatic score, and the user answers:

     * ``y`` / ``n`` - stored as ``True`` / ``False``
     * ``?``         - stored as ``None`` (undecided)
     * ``END``       - stop asking
     * anything else - accept the automatic suggestion shown in brackets

     The classification is stored when the loop ends, including when it is
     interrupted with Ctrl-C.
     """
     self.db.connect(user='******', database='meco',
                     host='localhost', port=5432)
     new_human_classify = HumanClassification(output_pickle)
     new_human_classify.load()
     try:
         for entry in self.db.entries(language=language, entry_count=None,
                                      entry_offset=0):
             # skip entries that were already processed
             if entry.id in new_human_classify.classification:
                 continue
             # Classify once and reuse the score for display and defaults.
             automatic_classification = self.classify(entry.original_entry,
                                                      language)
             print('Original entry: \n"' + entry.original_entry
                   + '"\n automatic classification = '
                   + str(automatic_classification))
             if automatic_classification < self._low:
                 auto, default_label = 'n', False
                 # TODO: remove - entries scored below the low threshold
                 # are currently skipped without asking.
                 continue
             elif automatic_classification >= self._high:
                 auto, default_label = 'y', True
             else:
                 auto, default_label = '?', None
             answer = raw_input('Is this entry relevant? (y/n/?/END))['
                                + auto + ']: ')
             if answer == 'END':
                 break
             if answer == 'y':
                 label = True
             elif answer == 'n':
                 label = False
             elif answer == '?':
                 # An explicit "undecided" must not be replaced by the
                 # automatic suggestion.
                 label = None
             else:
                 label = default_label
             new_human_classify.classification[entry.id] = label
             print('Cassified count = '
                   + str(len(new_human_classify.classification)))
     except KeyboardInterrupt:
         # Ctrl-C ends the session; what was collected is still stored.
         pass
     new_human_classify.store()
Example #11
0
class Classifier:
    """Bayesian relevance classifier for tweets.

    Use classify() to score a text and train() to teach the filter
    interactively. Indentation uses spaces only and every ``print`` is the
    single-argument parenthesised form, which prints the same on Python 2.
    """
    MAX_TOKEN_SIZE = 6  # defines word count in dictionary tuples
    # Weight added to a token for a positive rating; a negative rating adds
    # (1 - HUMAN_RATING_PROBABILITY).
    HUMAN_RATING_PROBABILITY = 0.99

    def __init__(self, low=0.5, high=0.5):
        """Set thresholds, logging and the database handle, then load pickles.

        ``low`` and ``high`` are converted to float and kept as the
        classification thresholds ``_low`` and ``_high``.
        """
        # classification thresholds
        self._low = float(low)
        self._high = float(high)
        # add and setup logger
        self._logger = logging.getLogger()
        logging.basicConfig(level=logging.DEBUG)
        # db connection
        self.db = Connection()
        # load info about already classified entries
        self._logger.info('Loading Allready classified entries...')
        self.human = HumanClassification('/mnt/minerva1/nlp/projects/twitter_classification/TwitterClassifier/pickles/HumanClassification')
        self.human.load()
        # load database of words
        self._logger.info('Loading word dictionary...')
        self.word_dict = WordDictionary('/mnt/minerva1/nlp/projects/twitter_classification/TwitterClassifier/pickles/WordDictionary')
        self.word_dict.load()
        # timer
        self._timer = timeit.Timer()

    def _add_classification(self, entry, classification):
        """Add each token of ``entry`` to the word dictionary.

        Every token of size 1..MAX_TOKEN_SIZE gets its ``count`` increased
        by one and its ``weight`` increased by HUMAN_RATING_PROBABILITY for
        a truthy ``classification``, or by its complement otherwise.
        """
        language = entry.get_language()
        if classification:
            weight = self.HUMAN_RATING_PROBABILITY
        else:
            weight = 1 - self.HUMAN_RATING_PROBABILITY
        for size in range(1, self.MAX_TOKEN_SIZE + 1):
            for token in entry.get_token(size):
                stats = self.word_dict.words.setdefault(
                    language, {}).setdefault(token, {'count': 0, 'weight': 0})
                stats['count'] += 1
                stats['weight'] += weight

    def _add_to_human_classification(self, entry, classification):
        """Record a human classification and feed it to the word dictionary.

        Stores ``(classification, guid, original text, language)`` under the
        entry id. A ``None`` classification is recorded but not learned.
        """
        self.human.classification[entry.get_id()] = (
            classification, entry.get_guid(), entry.get_original_entry(),
            entry.get_language())
        if classification is None:
            return
        self._add_classification(entry, classification)

    def train(self, language, count=None, offset=0):
        """Interactively train the filter on database entries of ``language``.

        ``count`` limits how many entries are requested from the database
        (10000 when omitted) and ``offset`` is passed on as the entry offset.
        Answers: ``y`` / ``n`` teach the entry as relevant / irrelevant,
        ``?`` skips it, ``END`` stops, anything else accepts the automatic
        suggestion shown in brackets. Human input and the word dictionary
        are stored at the end, including after Ctrl-C.
        """
        self.db.connect(user='******', database='meco',
                        host='localhost', port=5432)
        # ``count`` used to be ignored; 10000 stays the default batch size.
        entry_count = 10000 if count is None else count
        try:
            for entry in self.db.entries(language=language,
                                         entry_count=entry_count,
                                         entry_offset=offset):
                # skip entries that were already processed
                if entry.id in self.human.classification:
                    continue
                # ask whether entry is relevant
                automatic_classification = self.classify(entry.original_entry,
                                                         language)
                print('Original entry(' + str(entry.id) + '): \n"'
                      + entry.original_entry
                      + '"\n automatic classification = '
                      + str(automatic_classification))
                if automatic_classification < self._low:
                    auto, default_label = 'n', False
                    # TODO: remove - entries scored below the low threshold
                    # are currently skipped without asking.
                    continue
                elif automatic_classification >= self._high:
                    auto, default_label = 'y', True
                else:
                    auto, default_label = '?', None

                answer = raw_input('Is this entry relevant? (y/n/?/END))['
                                   + auto + ']: ')
                if answer == 'END':
                    break
                if answer == 'y':
                    label = True
                elif answer == 'n':
                    label = False
                elif answer == '?':
                    label = None
                else:
                    label = default_label
                if label is None:
                    # undecided entries are not learned
                    continue
                self._add_to_human_classification(entry, label)

                print('after classification: '
                      + str(self.classify(entry.original_entry, language)))
        except KeyboardInterrupt:
            # Ctrl-C ends the session; what was collected is still stored.
            pass
        # store human input and word_dictionary
        self.human.store()
        self.word_dict.store()

    def manual_train(self, text, language, classification):
        """Train the filter on one ``text`` without recording human input.

        Only ``True`` or ``False`` classifications are learned; the word
        dictionary is stored in every case.
        """
        # Keyword arguments as in classify(); the former positional call
        # passed the text in the guid position and the language as the text.
        e = Entry(id=None, guid=None, entry=text, language=language)
        if classification is True:
            self._add_classification(e, True)
        if classification is False:
            self._add_classification(e, False)
        self.word_dict.store()

    def train_from_human_classification(self, filename, language):
        """Train the filter from an external human classification pickle.

        Entries whose language equals ``language`` are added to the human
        classification and learned; both stores are written afterwards.
        """
        # NOTE: pickle can execute arbitrary code while loading; only use
        # files from a trusted source.
        with open(filename, 'rb') as filehandler:
            content = pickle.load(filehandler)

        for entry_id in content:
            record = content[entry_id]
            # NOTE(review): _add_to_human_classification stores
            # (classification, guid, text, language); with that layout
            # record[1] is the guid and record[2] the text - confirm the
            # expected pickle layout and the Entry signature.
            e = Entry(entry_id, record[1], record[2])
            if e.get_language() == language:
                self._add_to_human_classification(e, record[0])
        self.human.store()
        self.word_dict.store()

    def regenerate_word_dict(self):
        """Rebuild the word dictionary from the stored human classification."""
        print(self.human.classification)
        self.word_dict.words = {}
        # go through human classification and create new word dictionary
        for entry_id in self.human.classification:
            record = self.human.classification[entry_id]
            # NOTE(review): same record layout question as in
            # train_from_human_classification - confirm.
            e = Entry(entry_id, record[1], record[2])
            label = record[0]
            # Only records equal to True or False are learned.
            if label in (True, False):
                self._add_classification(e, bool(label))
        self.word_dict.store()

    def human_classify(self, output_pickle, language):
        """Interactively label entries and store the answers in output_pickle.

        May be used for creating test data. Every entry of ``language`` that
        is not yet present in the classification is shown together with its
        automatic score, and the user answers:

        * ``y`` / ``n`` - stored as ``True`` / ``False``
        * ``?``         - stored as ``None`` (undecided)
        * ``END``       - stop asking
        * anything else - accept the automatic suggestion shown in brackets

        The classification is stored when the loop ends, including when it is
        interrupted with Ctrl-C.
        """
        self.db.connect(user='******', database='meco',
                        host='localhost', port=5432)
        new_human_classify = HumanClassification(output_pickle)
        new_human_classify.load()
        try:
            for entry in self.db.entries(language=language, entry_count=None,
                                         entry_offset=0):
                # skip entries that were already processed
                if entry.id in new_human_classify.classification:
                    continue
                # Classify once and reuse the score for display and defaults.
                automatic_classification = self.classify(entry.original_entry,
                                                         language)
                print('Original entry: \n"' + entry.original_entry
                      + '"\n automatic classification = '
                      + str(automatic_classification))
                if automatic_classification < self._low:
                    auto, default_label = 'n', False
                    # TODO: remove - entries scored below the low threshold
                    # are currently skipped without asking.
                    continue
                elif automatic_classification >= self._high:
                    auto, default_label = 'y', True
                else:
                    auto, default_label = '?', None
                answer = raw_input('Is this entry relevant? (y/n/?/END))['
                                   + auto + ']: ')
                if answer == 'END':
                    break
                if answer == 'y':
                    label = True
                elif answer == 'n':
                    label = False
                elif answer == '?':
                    # An explicit "undecided" must not be replaced by the
                    # automatic suggestion.
                    label = None
                else:
                    label = default_label
                new_human_classify.classification[entry.id] = label
                print('Cassified count = '
                      + str(len(new_human_classify.classification)))
        except KeyboardInterrupt:
            # Ctrl-C ends the session; what was collected is still stored.
            pass
        new_human_classify.store()

    def classify(self, text, language):
        """Return the probability that ``text`` is relevant to the topic.

        Returns ``0`` when both products underflow to zero and ``-1`` when
        the result is exactly 0.5 (for example when no token is known);
        otherwise a value between 0 and 1.
        """
        input_entry = Entry(id=None, guid=None, entry=text, language=language)
        known_words = self.word_dict.words.setdefault(language, {})
        # for each token calculate probability of being relevant to topic
        # and combine according to Bayes' theorem
        #
        #                 p1 p2 ... pn                         a
        # P = -------------------------------------------- = -------
        #     p1 p2 ... pn + (1 - p1)(1 - p2) ... (1 - pn)    a + b
        #
        a = 1.0
        b = 1.0
        for size in range(1, self.MAX_TOKEN_SIZE + 1):
            for token in input_entry.get_token(size):
                if token not in known_words:
                    # unknown tokens are neutral
                    probability = 0.5
                else:
                    token_stats = known_words[token]
                    probability = token_stats['weight'] / token_stats['count']
                a *= probability
                b *= 1 - probability

        if a + b == 0:
            return 0
        result = a / (a + b)
        if result == 0.5:
            return -1
        return result

    def _test_corelation(self, human_classified_pickle, language):
        """Return the correlation between human and automatic classification.

        Loads the human classification from ``human_classified_pickle`` and
        classifies every listed entry for ``language``.

        Returns the correlation coefficient as a float. When it is undefined
        (no entries, or one of the two series has zero variance) ``0.0`` is
        returned instead of raising ``ZeroDivisionError``.
        """
        #
        #                 C(X,Y)                   E(XY) - E(X)E(Y)
        # corelation = ------------ = -------------------------------------------
        #               d(X) d(Y)     sqrt(E(X^2) - E(X)^2) sqrt(E(Y^2) - E(Y)^2)
        #
        # C(X,Y) - covariance, d(X), d(Y) - standard deviations
        # X - automatically calculated probabilities
        # Y - human input probabilities
        #
        human_classified = HumanClassification(human_classified_pickle)
        human_classified.load()
        entry_count = len(human_classified.classification)
        if not entry_count:
            # Nothing to correlate; avoid dividing by zero below.
            return 0.0

        sum_xy = sum_x = sum_y = sum_xx = sum_yy = 0.0
        for entry_id in human_classified.classification:
            processed_entry = self.db.get_entry_by_id(entry_id)
            probability_auto = self.classify(processed_entry.original_entry,
                                             language)
            # Any falsy label (False, but also None) is rated as negative.
            if human_classified.classification[entry_id]:
                probability_human = self.HUMAN_RATING_PROBABILITY
            else:
                probability_human = 1 - self.HUMAN_RATING_PROBABILITY

            sum_xy += probability_human * probability_auto
            sum_x += probability_auto
            sum_y += probability_human
            sum_xx += probability_auto * probability_auto
            sum_yy += probability_human * probability_human

        # E() values
        mean_xy = sum_xy / entry_count
        mean_x = sum_x / entry_count
        mean_y = sum_y / entry_count
        mean_xx = sum_xx / entry_count
        mean_yy = sum_yy / entry_count

        # Rounding can push a zero variance slightly below zero, which would
        # make sqrt() raise a domain error; clamp to zero.
        variance_x = max(mean_xx - mean_x * mean_x, 0.0)
        variance_y = max(mean_yy - mean_y * mean_y, 0.0)
        denominator = sqrt(variance_x) * sqrt(variance_y)
        if denominator == 0:
            # A constant series has no defined correlation.
            return 0.0
        return (mean_xy - mean_x * mean_y) / denominator

    def _test_percents(self, human_classified_pickle, language):
        """Count how often automatic classification agrees with human labels.

        Returns the 7-tuple ``(matches, true_positive, true_negative,
        false_positive, false_negative, unknown, entry_count)``. The six
        counters are floats; ``entry_count`` is the number of labelled
        entries.
        """
        human_classified = HumanClassification(human_classified_pickle)
        human_classified.load()
        entry_count = len(human_classified.classification)
        true_positive = 0.0
        true_negative = 0.0
        matches = 0.0
        false_positive = 0.0
        false_negative = 0.0
        unknown = 0.0
        for entry_id in human_classified.classification:
            processed_entry = self.db.get_entry_by_id(entry_id)
            probability = self.classify(processed_entry.original_entry,
                                        language)
            # Any falsy label (False, but also None) counts as "not relevant".
            human_label = human_classified.classification[entry_id]
            if probability < self._low:
                if not human_label:
                    matches += 1
                    true_negative += 1
                else:
                    false_negative += 1
            elif probability >= self._high:
                if human_label:
                    matches += 1
                    true_positive += 1
                else:
                    false_positive += 1
            else:
                # score fell between the two thresholds
                unknown += 1
        return (matches, true_positive, true_negative, false_positive,
                false_negative, unknown, entry_count)

    def run_tests(self, input_file, language):
        """Evaluate the classifier against ``input_file`` and print the report.

        Connects to the database, records the sizes of the test and training
        sets in a ``Tests`` object, then stores the correlation and the
        accuracy counters computed for ``language`` and prints the object.
        """
        self.db.connect(user='******', database='meco',
                        host='localhost', port=5432)
        tmp = HumanClassification(input_file)
        tmp.load()
        self._logger.info('Running tests...')
        tests = Tests()
        tests.set_test_len(len(tmp.classification))
        tests.set_train_len(len(self.human.classification))
        tests.set_train_positive_len(
            self.human.get_positively_classified_count(language))
        tests.set_train_negative_len(
            self.human.get_negatively_classified_count(language))
        self._logger.info('Calculating corelation...')
        tests.set_corelation(self._test_corelation(input_file, language))
        self._logger.info(
            'Calculating percentage of classification accuracy...')
        tests.set_percents(self._test_percents(input_file, language))
        print(tests)

    def get_time(self):
        """Measure how many entries can be classified in one second.

        Fetches up to 1000 English entries, classifies each of them 100
        times and returns a sentence with the resulting rate.

        Raises ValueError when the database yields no entries.
        """
        self._logger.info('Downloading entries to run tests on...')
        imax = 1000
        entries = []
        for entry in self.db.entries(language='en', entry_count=imax):
            if len(entries) >= imax:
                break
            entries.append(entry.original_entry)
        if not entries:
            raise ValueError('No entries available to measure classification time.')

        self._logger.info('Masuring amount of entries to be calculated in 1sec')
        repetitions = 100
        elapsed = 0.0
        for _ in range(repetitions):
            for text in entries:
                start = time.time()
                self.classify(text, 'en')
                elapsed += time.time() - start
        # Average over the classifications actually performed, so a short
        # entry list neither skews the result nor raises IndexError.
        average = elapsed / (repetitions * len(entries))
        # A coarse clock can report zero elapsed time.
        rate = 1 / average if average else float('inf')
        return 'Classifier is able to classify ' + str(round(rate, 2)) + ' entries in one second.'

    def export_to_xml(self, language, specification):
        """Export the word dictionary and human classification to XML files."""
        self.word_dict.to_xml(filename='XML/word_dict',
                              specification=specification)
        self.human.to_xml(self.db, 'XML/human_classification',
                          language=language)

    def fix_old_human_classification(self, filename):
        """Convert an old human classification file to one with tweet text.

        Writes ``filename + 'new'`` holding ``(old value, original text,
        language)`` for every entry that is still found in the database.
        """
        self.db.connect(user='******', database='meco',
                        host='localhost', port=5432)
        # NOTE: pickle can execute arbitrary code while loading; only use
        # files from a trusted source.
        with open(filename, 'rb') as old_file:
            content = pickle.load(old_file)
        new_content = {}

        for entry_id in content:
            e = self.db.get_entry_by_id(entry_id)
            if e:
                new_content[entry_id] = (content[entry_id], e.original_entry,
                                         e.get_language())

        # The context manager closes the file so the dump reaches the disk.
        with open(filename + 'new', 'wb') as new_file:
            pickle.dump(new_content, new_file)

    def fix_old_human_classification2(self, filename):
        """Convert a human classification file to the layout with guid.

        Writes ``filename + '_new'`` holding ``(classification, guid,
        original text, language)`` for every entry that is still found in
        the database.
        """
        self.db.connect(user='******', database='meco',
                        host='localhost', port=5432)
        # NOTE: pickle can execute arbitrary code while loading; only use
        # files from a trusted source.
        with open(filename, 'rb') as old_file:
            content = pickle.load(old_file)
        new_content = {}

        for entry_id in content:
            e = self.db.get_entry_by_id(entry_id)
            if e:
                new_content[entry_id] = (content[entry_id][0], e.get_guid(),
                                         e.get_original_entry(),
                                         e.get_language())

        # The context manager closes the file so the dump reaches the disk.
        with open(filename + '_new', 'wb') as new_file:
            pickle.dump(new_content, new_file)

    def train_from_file(self, filename, language, classification):
        """Train the classifier with every line of ``filename``.

        Each line is learned with the same ``classification``; human input
        and the word dictionary are stored afterwards.
        """
        with open(filename, 'r') as source:
            for line in source:
                # NOTE(review): every Entry is built with id None, so the
                # human classification record is presumably overwritten for
                # each line - confirm against Entry.get_id().
                e = Entry(None, None, line, language)
                self._add_to_human_classification(e, classification)
        self.human.store()
        self.word_dict.store()
Example #12
0
class Classifier:
    '''Class using for classification of tweets. Use classify() method for classification, train() method for training of bayesian filter.'''
    MAX_TOKEN_SIZE = 6  # defines word count in dictionary tuples
    HUMAN_RATING_PROBABILITY = 0.99

    def __init__(self, low=0.5, high=0.5):
        '''Set the thresholds, configure logging and load the stored training data.

        `low` and `high` are converted to float and kept as the two
        classification thresholds.  The human classification and the word
        dictionary are loaded from their fixed pickle locations.
        '''
        human_pickle = '/mnt/minerva1/nlp/projects/twitter_classification/TwitterClassifier/pickles/HumanClassification'
        words_pickle = '/mnt/minerva1/nlp/projects/twitter_classification/TwitterClassifier/pickles/WordDictionary'

        # thresholds used when turning a probability into a verdict
        self._low = float(low)
        self._high = float(high)

        # root logger, configured for DEBUG output
        logger = logging.getLogger()
        logging.basicConfig(level=logging.DEBUG)
        self._logger = logger

        # database connection object (connected later by individual methods)
        self.db = Connection()

        # entries that were already classified by a human
        logger.info('Loading Allready classified entries...')
        human = HumanClassification(human_pickle)
        self.human = human
        human.load()

        # per-language token statistics
        logger.info('Loading word dictionary...')
        word_dict = WordDictionary(words_pickle)
        self.word_dict = word_dict
        word_dict.load()

        # timer
        self._timer = timeit.Timer()

    def _add_classification(self, entry, classification):
        'Add each token to word dictionary for futher classification.'
        language = entry.get_language()
        for i in xrange(1, self.MAX_TOKEN_SIZE + 1):
            # for each token add to word dictionary
            for token in entry.get_token(i):
                self.word_dict.words.setdefault(language, {}).setdefault(
                    token, {
                        'count': 0,
                        'weight': 0
                    })['count'] += 1
                if classification:
                    self.word_dict.words[language][token][
                        'weight'] += self.HUMAN_RATING_PROBABILITY
                else:
                    self.word_dict.words[language][token]['weight'] += (
                        1 - self.HUMAN_RATING_PROBABILITY)

    def _add_to_human_classification(self, entry, classification):
        'Adds classified text to human classification. Stores classification, text and language of each text.'
        self.human.classification[entry.get_id()] = (
            classification, entry.get_guid(), entry.get_original_entry(),
            entry.get_language())
        if classification is None:
            return
        self._add_classification(entry, classification)

    def train(self, language, count=None, offset=0):
        '''Interactively train the filter on database entries of `language`.

        language -- language of the entries to read and classify
        count    -- number of entries requested from the database; when
                    None the historical batch size of 10000 is used
        offset   -- offset passed to the database query

        Every entry that is not yet in the human classification is shown
        together with its automatic rating and the user is asked whether it
        is relevant ('y', 'n', '?' to skip, 'END' to stop; any other answer
        accepts the automatic verdict).  Ctrl-C ends the dialog.  The human
        classification and the word dictionary are stored at the end.
        '''
        self.db.connect(user='******',
                        database='meco',
                        host='localhost',
                        port=5432)
        # `count` used to be ignored; keep 10000 as the default batch size
        entry_count = 10000 if count is None else count
        try:
            for entry in self.db.entries(language=language,
                                         entry_count=entry_count,
                                         entry_offset=offset):
                # an entry that was already processed is skipped
                if entry.id in self.human.classification:
                    continue
                # show the entry and its automatic rating
                automatic_classification = self.classify(
                    entry.original_entry, language)
                print('Original entry(' + str(entry.id) + '): \n"' +
                      entry.original_entry +
                      '"\n automatic classification = ' +
                      str(automatic_classification))
                if automatic_classification < self._low:
                    # TODO: remove -- entries rated below the low threshold
                    # are currently skipped without asking the user
                    continue
                elif automatic_classification >= self._high:
                    auto = 'y'
                else:
                    auto = '?'

                answer = raw_input('Is this entry relevant? (y/n/?/END))[' +
                                   auto + ']: ')
                if answer == 'y':
                    self._add_to_human_classification(entry, True)
                elif answer == 'n':
                    self._add_to_human_classification(entry, False)
                elif answer == '?':
                    continue
                elif answer == 'END':
                    break
                else:
                    # any other answer accepts the automatic verdict; the
                    # first branch only becomes reachable once the skip
                    # above is removed
                    if automatic_classification < self._low:
                        self._add_to_human_classification(entry, False)
                    elif automatic_classification >= self._high:
                        self._add_to_human_classification(entry, True)
                    else:
                        continue

                print('after classification: ' +
                      str(self.classify(entry.original_entry, language)))
        except KeyboardInterrupt:
            # Ctrl-C is the intended way to leave the dialog early
            pass
        # store human input and word_dictionary
        self.human.store()
        self.word_dict.store()

    def manual_train(self, text, language, classification):
        '''Train the filter manually with one piece of text.

        text           -- text to learn from
        language       -- language of the text
        classification -- True (relevant) or False (not relevant); any
                          other value leaves the word dictionary untouched

        The word dictionary is stored afterwards in every case.
        '''
        # Entry takes (id, guid, entry, language); the previous three-argument
        # call shifted the text into `guid` and the language into `entry`.
        e = Entry(id=None, guid=None, entry=text, language=language)
        if classification is True or classification is False:
            self._add_classification(e, classification)
        self.word_dict.store()

    def train_from_human_classification(self, filename, language):
        '''Train the filter from an external human classification pickle.

        filename -- pickle holding a mapping entry_id -> record, where
                    record[0] is the classification and record[1],
                    record[2] are passed on to Entry
        language -- only entries whose Entry reports this language are used

        The human classification and the word dictionary are stored at the
        end.
        '''
        # NOTE(review): pickle.load can execute arbitrary code -- only feed
        # this method files that come from a trusted source.
        with open(filename, 'rb') as filehandler:
            content = pickle.load(filehandler)

        for entry_id in content:
            record = list(content[entry_id])
            # NOTE(review): no language is passed to Entry here although
            # other call sites pass four arguments -- confirm Entry's default.
            e = Entry(entry_id, record[1], record[2])
            if e.get_language() == language:
                self._add_to_human_classification(e, record[0])
        self.human.store()
        self.word_dict.store()

    def regenerate_word_dict(self):
        '''Rebuild the word dictionary from scratch using the human classification.

        The current human classification is printed, the word dictionary is
        emptied, and every record whose first element equals True or False
        is learned again.  The word dictionary is stored afterwards.
        '''
        print(self.human.classification)
        self.word_dict.words = {}
        # replay every stored verdict into the fresh dictionary
        for entry_id in self.human.classification:
            record = list(self.human.classification[entry_id])
            e = Entry(entry_id, record[1], record[2])
            verdict = record[0]
            for flag in (True, False):
                if verdict == flag:
                    self._add_classification(e, flag)
        self.word_dict.store()

    def human_classify(self, output_pickle, language):
        '''Interactively build a pickle of user-defined classifications.

        output_pickle -- file of the HumanClassification that is loaded,
                         extended and stored again
        language      -- language of the entries read from the database

        Entries already present in the pickle are skipped.  For every other
        entry the automatic rating is shown and the user answers 'y', 'n' or
        'END'; any other answer stores the automatic verdict (None when the
        rating lies between the thresholds).  Ctrl-C ends the dialog.  The
        result may be used as test data.
        '''
        self.db.connect(user='******',
                        database='meco',
                        host='localhost',
                        port=5432)
        new_human_classify = HumanClassification(output_pickle)
        new_human_classify.load()
        try:
            for entry in self.db.entries(language=language,
                                         entry_count=None,
                                         entry_offset=0):
                # an entry that was already processed is skipped
                if entry.id in new_human_classify.classification:
                    continue
                # classify once and reuse the value for the prompt and the
                # decisions below (it used to be computed twice)
                automatic_classification = self.classify(
                    entry.original_entry, language)
                print('Original entry: \n"' + entry.original_entry +
                      '"\n automatic classification = ' +
                      str(automatic_classification))
                if automatic_classification < self._low:
                    # TODO: remove -- entries rated below the low threshold
                    # are currently skipped without asking the user
                    continue
                elif automatic_classification >= self._high:
                    auto = 'y'
                else:
                    auto = '?'
                answer = raw_input('Is this entry relevant? (y/n/?/END))[' +
                                   auto + ']: ')
                if answer == 'y':
                    new_human_classify.classification[entry.id] = True
                elif answer == 'n':
                    new_human_classify.classification[entry.id] = False
                elif answer == 'END':
                    break
                else:
                    # any other answer stores the automatic verdict; the
                    # first branch only becomes reachable once the skip
                    # above is removed
                    if automatic_classification < self._low:
                        new_human_classify.classification[entry.id] = False
                    elif automatic_classification >= self._high:
                        new_human_classify.classification[entry.id] = True
                    else:
                        new_human_classify.classification[entry.id] = None
                print('Cassified count = ' +
                      str(len(new_human_classify.classification)))
        except KeyboardInterrupt:
            # Ctrl-C is the intended way to leave the dialog early
            pass
        new_human_classify.store()

    def classify(self, text, language):
        '''Rate how likely `text` (written in `language`) is relevant to the topic.

        Returns the combined probability, 0 when both products vanish, or
        -1 when the combined probability is exactly 0.5.
        '''
        input_entry = Entry(id=None, guid=None, entry=text, language=language)
        known_tokens = self.word_dict.words.setdefault(language, {})
        # Per-token probabilities are combined with Bayes' theorem:
        #
        #              p1 p2 ... pn                          relevant
        # P = --------------------------------------- = ---------------------
        #     p1 p2 ... pn + (1-p1)(1-p2)...(1-pn)      relevant + irrelevant
        #
        relevant = 1.0
        irrelevant = 1.0
        for size in xrange(1, self.MAX_TOKEN_SIZE + 1):
            for token in input_entry.get_token(size):
                if token in known_tokens:
                    stats = known_tokens[token]
                    probability = stats['weight'] / stats['count']
                else:
                    # a token never seen before is neutral
                    probability = 0.5
                relevant *= probability
                irrelevant *= 1 - probability

        total = relevant + irrelevant
        if total == 0:
            return 0
        result = relevant / total
        if result == 0.5:
            return -1
        return result

    def _test_corelation(self, human_classified_pickle, language):
        '''Return the correlation between the human verdicts in `human_classified_pickle` and the automatic ratings.'''
        #
        #                 C(X,Y)                 E(XY) - E(X)E(Y)
        # correlation = ---------- = -------------------------------------------
        #                d(X)d(Y)    sqrt(E(X^2) - E(X)^2) sqrt(E(Y^2) - E(Y)^2)
        #
        # C    - covariance, d - standard deviation
        # X    - automatically calculated probabilities
        # Y    - human input probabilities
        #
        human_classified = HumanClassification(human_classified_pickle)
        human_classified.load()
        entry_count = len(human_classified.classification)
        mean_xy = 0.0
        mean_x = 0.0
        mean_y = 0.0
        mean_xx = 0.0
        mean_yy = 0.0
        for entry_id in human_classified.classification:
            processed_entry = self.db.get_entry_by_id(entry_id)
            probability_auto = self.classify(processed_entry.original_entry,
                                             language)
            # a truthy human verdict counts as "almost certainly relevant"
            if human_classified.classification[entry_id]:
                probability_human = self.HUMAN_RATING_PROBABILITY
            else:
                probability_human = (1 - self.HUMAN_RATING_PROBABILITY)

            mean_xy += probability_human * probability_auto
            mean_x += probability_auto
            mean_y += probability_human
            mean_xx += probability_auto * probability_auto
            mean_yy += probability_human * probability_human

        # turn the sums into expected values
        mean_xy /= entry_count
        mean_x /= entry_count
        mean_y /= entry_count
        mean_xx /= entry_count
        mean_yy /= entry_count

        covariance = mean_xy - (mean_x * mean_y)
        deviation_x = sqrt(mean_xx - (mean_x * mean_x))
        deviation_y = sqrt(mean_yy - (mean_y * mean_y))
        return covariance / (deviation_x * deviation_y)

    def _test_percents(self, human_classified_pickle, language):
        '''Compare the automatic verdicts with the human ones from `human_classified_pickle`.

        Returns the tuple (matches, true_positive, true_negative,
        false_positive, false_negative, unknown, entry_count); all counters
        are floats, entry_count is the number of human-classified entries.
        '''
        human_classified = HumanClassification(human_classified_pickle)
        human_classified.load()
        verdicts = human_classified.classification
        entry_count = len(verdicts)
        matches = true_positive = true_negative = 0.0
        false_positive = false_negative = unknown = 0.0
        for entry_id in verdicts:
            processed_entry = self.db.get_entry_by_id(entry_id)
            probability = self.classify(processed_entry.original_entry,
                                        language)
            expected = verdicts[entry_id]
            if probability < self._low:
                # automatic verdict: not relevant
                if expected:
                    false_negative += 1
                else:
                    matches += 1
                    true_negative += 1
            elif probability >= self._high:
                # automatic verdict: relevant
                if expected:
                    matches += 1
                    true_positive += 1
                else:
                    false_positive += 1
            else:
                # rating between the two thresholds
                unknown += 1
        return (matches, true_positive, true_negative, false_positive,
                false_negative, unknown, entry_count)

    def run_tests(self, input_file, language):
        '''Evaluate the classifier against the human classification in `input_file` and print the report.

        The report collects the sizes of the test and training sets, the
        correlation and the classification accuracy counters.
        '''
        self.db.connect(user='******',
                        database='meco',
                        host='localhost',
                        port=5432)
        test_set = HumanClassification(input_file)
        test_set.load()
        self._logger.info('Running tests...')
        report = Tests()

        # sizes of the test set and of the training set
        report.set_test_len(len(test_set.classification))
        report.set_train_len(len(self.human.classification))
        positive_count = self.human.get_positively_classified_count(language)
        report.set_train_positive_len(positive_count)
        negative_count = self.human.get_negatively_classified_count(language)
        report.set_train_negative_len(negative_count)

        self._logger.info('Calculating corelation...')
        corelation = self._test_corelation(input_file, language)
        report.set_corelation(corelation)

        self._logger.info(
            'Calculating percentage of classification accuracy...')
        percents = self._test_percents(input_file, language)
        report.set_percents(percents)
        print(report)

    def get_time(self, language='en', sample_size=1000, repetitions=100):
        '''Measure how many entries the classifier can rate per second.

        language    -- language of the sample entries (default 'en')
        sample_size -- maximum number of entries taken from the database
        repetitions -- how many times the whole sample is classified

        Returns a human-readable sentence with the measured throughput.
        Raises ValueError when `repetitions` is smaller than 1 or when the
        database yields no entries.
        '''
        # NOTE(review): no connect() is issued here -- presumably the
        # connection was opened by the caller; confirm.
        if repetitions < 1:
            raise ValueError('repetitions must be at least 1')

        self._logger.info('Downloading entries to run tests on...')
        entries = []
        for entry in self.db.entries(language=language,
                                     entry_count=sample_size):
            if len(entries) >= sample_size:
                break
            entries.append(entry.original_entry)
        if not entries:
            raise ValueError('no entries available for the time measurement')

        self._logger.info(
            'Masuring amount of entries to be calculated in 1sec')
        result_avg = 0.0
        for _ in xrange(0, repetitions):
            elapsed = 0.0
            # time exactly the entries that were fetched, so a small sample
            # can neither raise IndexError nor skew the mean
            for text in entries:
                start = time.time()
                self.classify(text, language)
                elapsed += time.time() - start
            result_avg += elapsed / len(entries)
        result_avg /= repetitions
        return 'Classifier is able to classify ' + str(round(
            1 / result_avg, 2)) + ' entries in one second.'

    def export_to_xml(self, language, specification):
        '''Export the word dictionary and the human classification to XML.

        The word dictionary export is handed the file name 'XML/word_dict'
        together with `specification`; the human classification export is
        handed the database connection, the file name
        'XML/human_classification' and `language`.
        '''
        word_dict_target = 'XML/word_dict'
        human_target = 'XML/human_classification'
        self.word_dict.to_xml(filename=word_dict_target,
                              specification=specification)
        self.human.to_xml(self.db, human_target, language=language)

    def fix_old_human_classification(self, filename):
        '''Convert an old human classification pickle into one that also stores the tweet text.

        Every entry id found in the pickle `filename` is looked up in the
        database; ids that cannot be found are dropped.  The converted
        mapping ``entry_id -> (old value, original text, language)`` is
        pickled to ``filename + 'new'``.
        '''
        self.db.connect(user='******',
                        database='meco',
                        host='localhost',
                        port=5432)
        # NOTE(review): pickle.load can execute arbitrary code -- only feed
        # this method files that come from a trusted source.
        with open(filename, 'rb') as old_file:
            content = pickle.load(old_file)

        new_content = {}
        for entry_id in content:
            e = self.db.get_entry_by_id(entry_id)
            if e:
                new_content[entry_id] = (content[entry_id], e.original_entry,
                                         e.get_language())

        # the context manager guarantees the output is flushed and closed
        with open(filename + 'new', 'wb') as new_file:
            pickle.dump(new_content, new_file)

    def fix_old_human_classification2(self, filename):
        '''Convert a human classification pickle into the format that also stores the guid.

        Every entry id found in the pickle `filename` is looked up in the
        database; ids that cannot be found are dropped.  The converted
        mapping ``entry_id -> (first element of the old value, guid,
        original text, language)`` is pickled to ``filename + '_new'``.
        '''
        self.db.connect(user='******',
                        database='meco',
                        host='localhost',
                        port=5432)
        # NOTE(review): pickle.load can execute arbitrary code -- only feed
        # this method files that come from a trusted source.
        with open(filename, 'rb') as old_file:
            content = pickle.load(old_file)

        new_content = {}
        for entry_id in content:
            e = self.db.get_entry_by_id(entry_id)
            if e:
                new_content[entry_id] = (list(content[entry_id])[0],
                                         e.get_guid(), e.get_original_entry(),
                                         e.get_language())

        # the context manager guarantees the output is flushed and closed
        with open(filename + '_new', 'wb') as new_file:
            pickle.dump(new_content, new_file)

    def train_from_file(self, filename, language, classification):
        '''Train the classifier from a text file, one entry per line.

        Each line of `filename` is wrapped in an Entry for `language` and
        registered with the given `classification`; afterwards the human
        classification and the word dictionary are stored.
        '''
        # NOTE(review): every Entry is created with id None -- presumably
        # get_id() then returns the same key for all lines; verify against
        # Entry before relying on the stored human classification.
        with open(filename, 'r') as training_file:
            for line in training_file:
                e = Entry(None, None, line, language)
                self._add_to_human_classification(e, classification)
        self.human.store()
        self.word_dict.store()