Example #1
0
 def _test_percents(self, human_classified_pickle, language):
     """Count how automatic classification agrees with human labels.

     Loads the human classification stored in human_classified_pickle,
     classifies every entry it references and sorts the result by the
     thresholds: a probability below self._low is a negative prediction,
     a probability at or above self._high is a positive prediction, and
     anything else is counted as unknown.

     Returns a 7-tuple
     (matches, true_positive, true_negative, false_positive,
     false_negative, unknown, entry_count). The six counters are floats;
     entry_count is the number of human-classified entries.
     """
     reference = HumanClassification(human_classified_pickle)
     reference.load()
     labels = reference.classification
     total = len(labels)

     hits = 0.0
     hit_positive = 0.0
     hit_negative = 0.0
     miss_positive = 0.0
     miss_negative = 0.0
     undecided = 0.0

     for entry_id in labels:
         entry = self.db.get_entry_by_id(entry_id)
         score = self.classify(entry.original_entry, language)

         # Turn the score into a verdict; scores between the two
         # thresholds yield no verdict at all.
         if score < self._low:
             predicted_relevant = False
         elif score >= self._high:
             predicted_relevant = True
         else:
             undecided += 1
             continue

         # Any falsy human label counts as "not relevant".
         human_relevant = bool(labels[entry_id])
         if predicted_relevant == human_relevant:
             hits += 1
             if predicted_relevant:
                 hit_positive += 1
             else:
                 hit_negative += 1
         elif predicted_relevant:
             miss_positive += 1
         else:
             miss_negative += 1

     return (hits, hit_positive, hit_negative, miss_positive,
             miss_negative, undecided, total)
Example #2
0
 def human_classify(self, output_pickle, language):
     """Interactively label entries and store the labels in output_pickle.

     Connects to the database, loads any labels already present in
     output_pickle and walks over the entries of the given language.
     Entries that already have a label are skipped. For every other entry
     the text and its automatic classification are printed and the user is
     asked whether it is relevant:

     * 'y'   -> stored as True
     * 'n'   -> stored as False
     * 'END' -> stop asking
     * anything else -> the automatic verdict is stored (True when the
       probability is at or above self._high, None when it lies between
       the thresholds)

     Ctrl-C or end-of-input at the prompt also ends the session. In every
     one of these cases the labels collected so far are written back with
     store(). May be used for creating test data.
     """
     self.db.connect(user='******',
                     database='meco',
                     host='localhost',
                     port=5432)
     new_human_classify = HumanClassification(output_pickle)
     new_human_classify.load()
     try:
         for entry in self.db.entries(language=language,
                                      entry_count=None,
                                      entry_offset=0):
             # skip entries that were already processed
             if entry.id in new_human_classify.classification:
                 continue
             # Classify once and reuse the value for both the printout and
             # the threshold checks below.
             automatic_classification = self.classify(
                 entry.original_entry, language)
             print('Original entry: \n"' + entry.original_entry +
                   '"\n automatic classification = ' +
                   str(automatic_classification))
             if automatic_classification < self._low:
                 auto = 'n'
                 default_label = False
                 # TODO: remove -- for now entries rated below the low
                 # threshold are printed but never offered for labelling.
                 continue
             elif automatic_classification >= self._high:
                 auto = 'y'
                 default_label = True
             else:
                 auto = '?'
                 default_label = None
             answer = raw_input('Is this entry relevant? (y/n/?/END))[' +
                                auto + ']: ')
             if answer == 'END':
                 break
             if answer == 'y':
                 label = True
             elif answer == 'n':
                 label = False
             else:
                 # Any other answer accepts the automatic verdict.
                 label = default_label
             new_human_classify.classification[entry.id] = label
             print('Cassified count = ' + str(
                 len(new_human_classify.classification)))
     except (KeyboardInterrupt, EOFError):
         # The user aborted (Ctrl-C) or the input ended (Ctrl-D / closed
         # stdin): fall through so the answers given so far are saved.
         pass
     new_human_classify.store()
Example #3
0
    def _test_corelation(self, human_classified_pickle, language):
        """Return the correlation between automatic and human ratings.

        X are the automatically calculated probabilities, Y the human
        input probabilities (self.HUMAN_RATING_PROBABILITY for a truthy
        human label, 1 - self.HUMAN_RATING_PROBABILITY otherwise).

                        C(X,Y)               E(XY) - E(X)E(Y)
        correlation = ---------- = ---------------------------------------
                       d(X)d(Y)    sqrt(E(X^2)-E(X)^2) sqrt(E(Y^2)-E(Y)^2)

        where C is the covariance and d the standard deviation.

        Returns the correlation as a float. When it is undefined -- there
        are no human-classified entries, or one of the two series has zero
        variance (for example all human labels are identical) -- NaN is
        returned instead of raising ZeroDivisionError / ValueError.
        """
        human_classified = HumanClassification(human_classified_pickle)
        human_classified.load()
        entry_count = len(human_classified.classification)
        if not entry_count:
            # No samples: every expectation below would divide by zero.
            return float('nan')

        sum_xy = 0.0  # accumulates X*Y
        sum_x = 0.0   # accumulates X
        sum_y = 0.0   # accumulates Y
        sum_xx = 0.0  # accumulates X^2
        sum_yy = 0.0  # accumulates Y^2
        for entry_id in human_classified.classification:
            processed_entry = self.db.get_entry_by_id(entry_id)
            probability_auto = self.classify(processed_entry.original_entry,
                                             language)
            # NOTE(review): any falsy label (False, but also None) is rated
            # as a negative human answer here -- confirm that is intended.
            if human_classified.classification[entry_id]:
                probability_human = self.HUMAN_RATING_PROBABILITY
            else:
                probability_human = (1 - self.HUMAN_RATING_PROBABILITY)

            sum_xy += probability_human * probability_auto
            sum_x += probability_auto
            sum_y += probability_human
            sum_xx += probability_auto * probability_auto
            sum_yy += probability_human * probability_human

        # E() values
        mean_xy = sum_xy / entry_count
        mean_x = sum_x / entry_count
        mean_y = sum_y / entry_count
        mean_xx = sum_xx / entry_count
        mean_yy = sum_yy / entry_count

        # Rounding can push a (mathematically non-negative) variance a hair
        # below zero, which would make sqrt() raise; clamp it at zero.
        variance_x = max(mean_xx - (mean_x * mean_x), 0.0)
        variance_y = max(mean_yy - (mean_y * mean_y), 0.0)
        denominator = sqrt(variance_x) * sqrt(variance_y)
        if denominator == 0.0:
            # A constant series has no defined correlation.
            return float('nan')

        return (mean_xy - (mean_x * mean_y)) / denominator
Example #4
0
 def run_tests(self, input_file, language):
     """Evaluate the classifier against input_file and print the results.

     Connects to the database, loads the human classification stored in
     input_file and fills a Tests object with the size of that test set,
     the size of the training set (self.human) together with its positive
     and negative counts for the given language, the correlation from
     _test_corelation and the counters from _test_percents. The Tests
     object is printed at the end.
     """
     connection_settings = {
         'user': '******',
         'database': 'meco',
         'host': 'localhost',
         'port': 5432,
     }
     self.db.connect(**connection_settings)

     test_set = HumanClassification(input_file)
     test_set.load()

     self._logger.info('Running tests...')
     report = Tests()
     report.set_test_len(len(test_set.classification))
     report.set_train_len(len(self.human.classification))

     positive_count = self.human.get_positively_classified_count(language)
     report.set_train_positive_len(positive_count)
     negative_count = self.human.get_negatively_classified_count(language)
     report.set_train_negative_len(negative_count)

     self._logger.info('Calculating corelation...')
     correlation = self._test_corelation(input_file, language)
     report.set_corelation(correlation)

     self._logger.info(
         'Calculating percentage of classification accuracy...')
     counters = self._test_percents(input_file, language)
     report.set_percents(counters)

     print(report)
Example #5
0
 def __init__(self, low=0.5, high=0.5,
              human_classification_path=None, word_dictionary_path=None):
     """Set up thresholds, logging, the db connection and the stored data.

     low -- probabilities below this threshold are treated as negative.
     high -- probabilities at or above this threshold are treated as
         positive.
     human_classification_path -- pickle with the already classified
         entries; None selects the project's default location.
     word_dictionary_path -- pickle with the word dictionary; None selects
         the project's default location.

     Both pickles are loaded immediately.
     """
     # Resolve the default pickle locations; passing explicit paths lets
     # the classifier run against other data sets without editing code.
     if human_classification_path is None:
         human_classification_path = (
             '/mnt/minerva1/nlp/projects/twitter_classification/TwitterClassifier/pickles/HumanClassification'
         )
     if word_dictionary_path is None:
         word_dictionary_path = (
             '/mnt/minerva1/nlp/projects/twitter_classification/TwitterClassifier/pickles/WordDictionary'
         )
     # classification thresholds
     self._low = float(low)
     self._high = float(high)
     # add and setup logger
     self._logger = logging.getLogger()
     logging.basicConfig(level=logging.DEBUG)
     # db connection
     self.db = Connection()
     # load info about already classified entries
     self._logger.info('Loading Allready classified entries...')
     self.human = HumanClassification(human_classification_path)
     self.human.load()
     # load database of words
     self._logger.info('Loading word dictionary...')
     self.word_dict = WordDictionary(word_dictionary_path)
     self.word_dict.load()
     # timer
     self._timer = timeit.Timer()