def results(self, name, set_name="Test", test_classes=None, test_auxs=None, v=False): if test_classes == None: test_classes = self.test_classes if test_auxs == None: print "WOIJOWIRJWOIRJWOIRJWORIQJWRPOWQJRPOWQJRPOJQWRPOQWJR" # test_auxs = self.all_auxiliaries if len(self.predictions) != len(test_classes): raise Exception("The number of test vectors != the number of test classes!") result_vector = [] tp, fp, fn = 0.0, 0.0, 0.0 for i in range(len(test_classes)): if v: sent = self.sentences.get_sentence(test_auxs[i].sentnum) if test_classes[i] == self.predictions[i] == vc.bool_to_int(True): result_vector.append(("tp", i)) if v: print "TP", sent.file, sent print test_auxs[i], "\n" tp += 1 elif test_classes[i] == vc.bool_to_int(True) and self.predictions[i] == vc.bool_to_int(False): result_vector.append(("fn", i)) if v: print "FN", sent.file, sent print test_auxs[i], "\n" fn += 1 elif test_classes[i] == vc.bool_to_int(False) and self.predictions[i] == vc.bool_to_int(True): result_vector.append(("fp", i)) if v: print "FP", sent.file, sent print test_auxs[i], "\n" fp += 1 try: precision = tp / (tp + fp) except ZeroDivisionError: precision = 0.0 try: recall = tp / (tp + fn) except ZeroDivisionError: recall = 0.0 if precision == 0.0 or recall == 0.0: f1 = 0.0 else: f1 = 2 * precision * recall / (precision + recall) print '\nResults from applying "%s" on the %s set.' % (name, set_name) print "TP: %d, FP: %d, FN: %d" % (tp, fp, fn) print "Precision: %0.3f" % precision print "Recall: %0.3f" % recall print "F1: %0.3f\n" % f1 result_vector += [("precision", precision), ("recall", recall), ("f1", f1)] self.result_vector = result_vector
def make_feature_vectors(self, make_test_vectors=True, make_train_vectors=True, use_old_vectors=False):
    """Build sparse train/test feature vectors for every auxiliary.

    Sentences whose section falls in [self.start_train, self.end_train]
    contribute training vectors; sections in [self.start_test,
    self.end_test] contribute test vectors (NOTE(review): the ranges may
    overlap - an auxiliary can land in both sets; confirm intended).

    Args:
        make_test_vectors: rebuild self.test_vectors / self.test_classes.
        make_train_vectors: rebuild self.train_vectors / self.train_classes.
        use_old_vectors: forwarded to vc.make_vector as make_old.

    Side effects: also records self.pre_oversample_length, the training-set
    size before any oversampling is applied.
    """
    if make_train_vectors:
        self.train_vectors, self.train_classes = [], []
    if make_test_vectors:
        self.test_vectors, self.test_classes = [], []
    # Feature resources shared by every vector: frequent words near an
    # auxiliary, the POS tag inventory, and its bigrams.
    frequent_words = self.file_names.extract_data_from_file(self.file_names.EACH_UNIQUE_WORD_NEAR_AUX)
    all_pos = self.file_names.extract_data_from_file(self.file_names.EACH_UNIQUE_POS_FILE)
    pos_bigrams = wc.pos_bigrams(all_pos)
    for aux in self.all_auxiliaries:
        sentdict = self.sentences.get_sentence(aux.sentnum)
        if make_train_vectors and self.start_train <= sentdict.get_section() <= self.end_train:
            self.train_vectors.append(
                csr_matrix(
                    vc.make_vector(
                        sentdict, aux, self.features, vpe.ALL_CATEGORIES,
                        vpe.AUX_LEMMAS, vpe.ALL_AUXILIARIES, frequent_words,
                        all_pos, pos_bigrams, make_old=use_old_vectors,
                    )
                )
            )
            # Class label: whether this auxiliary triggers VP ellipsis.
            self.train_classes.append(vc.bool_to_int(aux.is_trigger))
            # Progress report for the first vector and every 1000th after.
            if len(self.train_vectors) % 1000 == 0 or len(self.train_vectors) == 1:
                print "Making the %dth training vector..." % (len(self.train_vectors))
        if make_test_vectors and self.start_test <= sentdict.get_section() <= self.end_test:
            self.test_vectors.append(
                csr_matrix(
                    vc.make_vector(
                        sentdict, aux, self.features, vpe.ALL_CATEGORIES,
                        vpe.AUX_LEMMAS, vpe.ALL_AUXILIARIES, frequent_words,
                        all_pos, pos_bigrams, make_old=use_old_vectors,
                    )
                )
            )
            self.test_classes.append(vc.bool_to_int(aux.is_trigger))
            if len(self.test_vectors) % 1000 == 0 or len(self.test_vectors) == 1:
                print "Making the %dth testing vector..." % (len(self.test_vectors))
    self.pre_oversample_length = len(self.train_vectors)
def oversample(self, multiplier=None): if not multiplier: multiplier = self.train_classes.count(vc.bool_to_int(False)) / self.train_classes.count( vc.bool_to_int(True) ) print "Oversampling by x%d" % multiplier new_features = [] new_classes = [] for i in range(0, len(self.train_vectors)): if self.train_classes[i] == vc.bool_to_int(True): for _ in range(0, multiplier): new_features.append(self.train_vectors[i]) new_classes.append(vc.bool_to_int(True)) else: new_features.append(self.train_vectors[i]) new_classes.append(vc.bool_to_int(False)) self.train_vectors = new_features self.train_classes = new_classes
def oversample(self, multiplier=None): if not multiplier: multiplier = self.train_classes.count( vc.bool_to_int(False)) / self.train_classes.count( vc.bool_to_int(True)) print 'Oversampling by x%d' % multiplier new_features = [] new_classes = [] for i in range(0, len(self.train_vectors)): if self.train_classes[i] == vc.bool_to_int(True): for _ in range(0, multiplier): new_features.append(self.train_vectors[i]) new_classes.append(vc.bool_to_int(True)) else: new_features.append(self.train_vectors[i]) new_classes.append(vc.bool_to_int(False)) self.train_vectors = new_features self.train_classes = new_classes
def test_my_rules(self, original_rules=False, idxs=None): self.predictions = [] print "Length of test set: %d, length of All_auxs-training vectors: %d" % ( len(self.test_classes), len(self.all_auxiliaries) - len(self.train_vectors), ) for i in range(self.pre_oversample_length, len(self.all_auxiliaries)): if idxs == None or i in idxs: aux = self.all_auxiliaries.get_aux(i) sendict = self.sentences.get_sentence(aux.sentnum) tree = sendict.get_nltk_tree() word_subtree_positions = nt.get_smallest_subtree_positions(tree) if not original_rules: if aux.type == "modal": self.predictions.append( vc.bool_to_int(wc.modal_rule(sendict, aux, tree, word_subtree_positions)) ) elif aux.type == "be": self.predictions.append(vc.bool_to_int(wc.be_rule(sendict, aux))) elif aux.type == "have": self.predictions.append(vc.bool_to_int(wc.have_rule(sendict, aux))) elif aux.type == "do": self.predictions.append(vc.bool_to_int(wc.do_rule(sendict, aux, tree, word_subtree_positions))) elif aux.type == "so": self.predictions.append(vc.bool_to_int(wc.so_rule(sendict, aux))) elif aux.type == "to": self.predictions.append(vc.bool_to_int(wc.to_rule(sendict, aux))) else: auxidx = aux.wordnum if aux.type == "modal": self.predictions.append( vc.bool_to_int(dv.modalcheck(sendict, auxidx, tree, word_subtree_positions)) ) elif aux.type == "be": self.predictions.append( vc.bool_to_int(dv.becheck(sendict, auxidx, tree, word_subtree_positions)) ) elif aux.type == "have": self.predictions.append( vc.bool_to_int(dv.havecheck(sendict, auxidx, tree, word_subtree_positions)) ) elif aux.type == "do": self.predictions.append( vc.bool_to_int(dv.docheck(sendict, auxidx, tree, word_subtree_positions)) ) elif aux.type == "so": self.predictions.append( vc.bool_to_int(dv.socheck(sendict, auxidx, tree, word_subtree_positions)) ) elif aux.type == "to": self.predictions.append( vc.bool_to_int(dv.tocheck(sendict, auxidx, tree, word_subtree_positions)) )
def results(self, name, set_name='Test', test_classes=None, test_auxs=None, v=False): if test_classes == None: test_classes = self.test_classes if test_auxs == None: print 'WOIJOWIRJWOIRJWOIRJWORIQJWRPOWQJRPOWQJRPOJQWRPOQWJR' # test_auxs = self.all_auxiliaries if len(self.predictions) != len(test_classes): raise Exception( 'The number of test vectors != the number of test classes!') result_vector = [] tp, fp, fn = 0.0, 0.0, 0.0 for i in range(len(test_classes)): if v: sent = self.sentences.get_sentence(test_auxs[i].sentnum) if test_classes[i] == self.predictions[i] == vc.bool_to_int(True): result_vector.append(('tp', i)) if v: print 'TP', sent.file, sent print test_auxs[i], '\n' tp += 1 elif test_classes[i] == vc.bool_to_int( True) and self.predictions[i] == vc.bool_to_int(False): result_vector.append(('fn', i)) if v: print 'FN', sent.file, sent print test_auxs[i], '\n' fn += 1 elif test_classes[i] == vc.bool_to_int( False) and self.predictions[i] == vc.bool_to_int(True): result_vector.append(('fp', i)) if v: print 'FP', sent.file, sent print test_auxs[i], '\n' fp += 1 try: precision = tp / (tp + fp) except ZeroDivisionError: precision = 0.0 try: recall = tp / (tp + fn) except ZeroDivisionError: recall = 0.0 if precision == 0.0 or recall == 0.0: f1 = 0.0 else: f1 = 2 * precision * recall / (precision + recall) print '\nResults from applying \"%s\" on the %s set.' % (name, set_name) print 'TP: %d, FP: %d, FN: %d' % (tp, fp, fn) print 'Precision: %0.3f' % precision print 'Recall: %0.3f' % recall print 'F1: %0.3f\n' % f1 result_vector += [('precision', precision), ('recall', recall), ('f1', f1)] self.result_vector = result_vector
def test_my_rules(self, original_rules=False, idxs=None): self.predictions = [] print 'Length of test set: %d, length of All_auxs-training vectors: %d' % ( len(self.test_classes), len(self.all_auxiliaries) - len(self.train_vectors)) for i in range(self.pre_oversample_length, len(self.all_auxiliaries)): if idxs == None or i in idxs: aux = self.all_auxiliaries.get_aux(i) sendict = self.sentences.get_sentence(aux.sentnum) tree = sendict.get_nltk_tree() word_subtree_positions = nt.get_smallest_subtree_positions( tree) if not original_rules: if aux.type == 'modal': self.predictions.append( vc.bool_to_int( wc.modal_rule(sendict, aux, tree, word_subtree_positions))) elif aux.type == 'be': self.predictions.append( vc.bool_to_int(wc.be_rule(sendict, aux))) elif aux.type == 'have': self.predictions.append( vc.bool_to_int(wc.have_rule(sendict, aux))) elif aux.type == 'do': self.predictions.append( vc.bool_to_int( wc.do_rule(sendict, aux, tree, word_subtree_positions))) elif aux.type == 'so': self.predictions.append( vc.bool_to_int(wc.so_rule(sendict, aux))) elif aux.type == 'to': self.predictions.append( vc.bool_to_int(wc.to_rule(sendict, aux))) else: auxidx = aux.wordnum if aux.type == 'modal': self.predictions.append( vc.bool_to_int( dv.modalcheck(sendict, auxidx, tree, word_subtree_positions))) elif aux.type == 'be': self.predictions.append( vc.bool_to_int( dv.becheck(sendict, auxidx, tree, word_subtree_positions))) elif aux.type == 'have': self.predictions.append( vc.bool_to_int( dv.havecheck(sendict, auxidx, tree, word_subtree_positions))) elif aux.type == 'do': self.predictions.append( vc.bool_to_int( dv.docheck(sendict, auxidx, tree, word_subtree_positions))) elif aux.type == 'so': self.predictions.append( vc.bool_to_int( dv.socheck(sendict, auxidx, tree, word_subtree_positions))) elif aux.type == 'to': self.predictions.append( vc.bool_to_int( dv.tocheck(sendict, auxidx, tree, word_subtree_positions)))
def make_feature_vectors(self, make_test_vectors=True, make_train_vectors=True, use_old_vectors=False): if make_train_vectors: self.train_vectors, self.train_classes = [], [] if make_test_vectors: self.test_vectors, self.test_classes = [], [] frequent_words = self.file_names.extract_data_from_file( self.file_names.EACH_UNIQUE_WORD_NEAR_AUX) all_pos = self.file_names.extract_data_from_file( self.file_names.EACH_UNIQUE_POS_FILE) pos_bigrams = wc.pos_bigrams(all_pos) for aux in self.all_auxiliaries: sentdict = self.sentences.get_sentence(aux.sentnum) if make_train_vectors and self.start_train <= sentdict.get_section( ) <= self.end_train: self.train_vectors.append( csr_matrix( vc.make_vector(sentdict, aux, self.features, vpe.ALL_CATEGORIES, vpe.AUX_LEMMAS, vpe.ALL_AUXILIARIES, frequent_words, all_pos, pos_bigrams, make_old=use_old_vectors))) self.train_classes.append(vc.bool_to_int(aux.is_trigger)) if len(self.train_vectors) % 1000 == 0 or len( self.train_vectors) == 1: print 'Making the %dth training vector...' % (len( self.train_vectors)) if make_test_vectors and self.start_test <= sentdict.get_section( ) <= self.end_test: self.test_vectors.append( csr_matrix( vc.make_vector(sentdict, aux, self.features, vpe.ALL_CATEGORIES, vpe.AUX_LEMMAS, vpe.ALL_AUXILIARIES, frequent_words, all_pos, pos_bigrams, make_old=use_old_vectors))) self.test_classes.append(vc.bool_to_int(aux.is_trigger)) if len(self.test_vectors) % 1000 == 0 or len( self.test_vectors) == 1: print 'Making the %dth testing vector...' % (len( self.test_vectors)) self.pre_oversample_length = len(self.train_vectors)