def __init__(self, start_train, end_train, start_test, end_test):
    # NOTE(review): this __init__ appears detached from any class at this point
    # in the file; an identical definition exists inside VPEDetectionClassifier
    # below -- confirm whether this copy is dead code.
    #
    # start_train/end_train and start_test/end_test are inclusive section-number
    # bounds used elsewhere (import_data / make_feature_vectors) to decide which
    # sections belong to the training vs. test split.
    self.sentences = vpe.AllSentences()
    self.annotations = vpe.Annotations()
    self.file_names = Files()
    self.all_auxiliaries = vpe.Auxiliaries()
    self.gold_standard_auxs = vpe.Auxiliaries()
    self.hyperplane = None  # the classifier object; None until assigned
    self.features = []
    """ Train and test_vectors are lists of csr_matrices in order to save memory. """
    self.m = None   # cached stacked training matrix (built lazily in train())
    self.m2 = None  # cached stacked test matrix (built lazily in test())
    self.train_vectors = []
    self.train_classes = []
    self.test_vectors = []
    self.test_classes = []
    self.predictions = []
    self.result_vector = []
    self.pre_oversample_length = 0  # number of training vectors before oversampling
    self.start_train = start_train
    self.end_train = end_train
    self.start_test = start_test
    self.end_test = end_test
class VPEDetectionClassifier:
    """Detects VPE (verb-phrase ellipsis) trigger auxiliaries.

    Imports annotated treebank sections, builds one feature vector per
    auxiliary, trains an sklearn classifier (the "hyperplane") on sections
    [start_train, end_train] and evaluates trigger predictions on sections
    [start_test, end_test] against the gold-standard annotations.

    Fixes relative to the previous revision:
      * identity tests use ``is None`` -- ``== None`` on a scipy sparse matrix
        performs an elementwise comparison instead of a None check;
      * ``normalize()`` no longer silently discards the scaled matrices;
      * matrices are stacked with a single ``vstack(list)`` call instead of an
        O(n^2) incremental loop;
      * ``print`` statements use the single-argument call form, which behaves
        identically under Python 2 and 3.
    """

    # Symbolic names accepted by set_classifier().
    SVM = "SVM"
    NUSVM = "NuSVC"
    LINEAR_SVM = "Linear SVC"
    LOGREG = "Logistic regression"
    NAIVE_BAYES = "Naive Bayes"
    LOGREGCV = "Logistic regression CV"
    DECISION_TREE = "Decision Tree"
    DECISION_TREE_WITH_OPTIONS = "Decision Tree with options"
    RANDOMFOREST = "Random Forest"
    ADABOOST = "Adaboost"

    def __init__(self, start_train, end_train, start_test, end_test):
        """Initialize empty containers.

        The four arguments are inclusive section-number bounds selecting the
        training and test splits (see import_data / make_feature_vectors).
        """
        self.sentences = vpe.AllSentences()
        self.annotations = vpe.Annotations()
        self.file_names = Files()
        self.all_auxiliaries = vpe.Auxiliaries()
        self.gold_standard_auxs = vpe.Auxiliaries()
        self.hyperplane = None  # the sklearn classifier; set via set_classifier()
        self.features = []
        # Train and test_vectors are lists of csr_matrices in order to save memory.
        self.m = None   # cached stacked training matrix (built lazily in train())
        self.m2 = None  # cached stacked test matrix (built lazily in test())
        self.train_vectors = []
        self.train_classes = []
        self.test_vectors = []
        self.test_classes = []
        self.predictions = []
        self.result_vector = []
        self.pre_oversample_length = 0  # training-set size before oversampling
        self.start_train = start_train
        self.end_train = end_train
        self.start_test = start_test
        self.end_test = end_test

    def set_classifier(self, classifier):
        """Instantiate the classifier named by one of the class constants.

        Any unrecognized value is assumed to be a ready-made sklearn-style
        estimator and is stored as-is.

        NOTE(review): MultinomialNB, DecisionTreeClassifier and
        AdaBoostClassifier are not among the imports visible in this file --
        confirm they are imported elsewhere.
        """
        if classifier == self.SVM:
            self.hyperplane = SVC()
        elif classifier == self.NUSVM:
            self.hyperplane = NuSVC(nu=0.9)
        elif classifier == self.LINEAR_SVM:
            self.hyperplane = LinearSVC()
        elif classifier == self.LOGREG:
            self.hyperplane = LogisticRegression()
        elif classifier == self.NAIVE_BAYES:
            self.hyperplane = MultinomialNB()
        elif classifier == self.LOGREGCV:
            self.hyperplane = LogisticRegressionCV()
        elif classifier == self.DECISION_TREE:
            self.hyperplane = DecisionTreeClassifier()
        elif classifier == self.DECISION_TREE_WITH_OPTIONS:
            self.hyperplane = DecisionTreeClassifier(max_depth=10, min_samples_leaf=3)
        elif classifier == self.RANDOMFOREST:
            self.hyperplane = RandomForestClassifier(n_estimators=100, min_samples_leaf=4)
        elif classifier == self.ADABOOST:
            self.hyperplane = AdaBoostClassifier(random_state=1917, n_estimators=100)
        else:
            self.hyperplane = classifier

    def set_features(self, features):
        """Set the list of feature names used by make_feature_vectors()."""
        self.features = features

    def import_data(self, test=None):
        """Load MRG/POS XML files and gold-standard VPE annotations.

        test: optional collection of file names; when given, only those files
        are imported. Populates self.sentences, self.annotations,
        self.all_auxiliaries and self.gold_standard_auxs, then flags every
        auxiliary matching a gold-standard one via its ``is_trigger``
        attribute.
        """
        dirs = listdir(self.file_names.XML_MRG)
        dirs.sort()
        sentnum_modifier = -1
        dnum = 0
        for d in dirs:
            subdir = d + self.file_names.SLASH_CHAR
            if subdir.startswith("."):
                continue
            if (self.start_train <= dnum <= self.end_train) or (self.start_test <= dnum <= self.end_test):
                section_annotation = vpe.AnnotationSection(subdir, self.file_names.VPE_ANNOTATIONS)
                vpe_files = list(set([annotation.file for annotation in section_annotation]))
                vpe_files.sort()
                for f in vpe_files:
                    # Fix: the original "not test or (test and f in test)" --
                    # the inner "test and" was redundant.
                    if not test or f in test:
                        # Here we are now getting the non-MRG POS file that we
                        # had neglected to get before.
                        try:
                            mrg_matrix = vpe.XMLMatrix(f + ".mrg.xml", self.file_names.XML_MRG + subdir)
                        except IOError:
                            mrg_matrix = vpe.XMLMatrix(f + ".pos.xml", self.file_names.XML_POS, pos_file=True)
                        for sentdict in mrg_matrix:
                            self.all_auxiliaries.add_auxs(sentdict.get_auxiliaries(), sentnum_modifier=sentnum_modifier)
                        self.gold_standard_auxs.add_auxs(
                            mrg_matrix.get_gs_auxiliaries(section_annotation.get_anns_for_file(f), sentnum_modifier)
                        )
                        self.annotations.add_section(section_annotation)
                        self.sentences.add_mrg(mrg_matrix)
                        # Offset so auxiliaries of the next file get global
                        # sentence numbers.
                        sentnum_modifier = len(self.sentences) - 1
            dnum += 1
        # We now just have to say which auxs are the gold standard ones within
        # the 'all_auxiliaries' object by changing their "is_trigger" attribute.
        # Both lists are in document order, so a single linear merge suffices.
        crt_gold_aux_idx = 0
        crt_gold_aux = self.gold_standard_auxs.get_aux(crt_gold_aux_idx)
        for aux in self.all_auxiliaries:
            if crt_gold_aux.equals(aux):
                aux.is_trigger = True
                crt_gold_aux_idx += 1
                try:
                    crt_gold_aux = self.gold_standard_auxs.get_aux(crt_gold_aux_idx)
                except IndexError:
                    # No gold auxiliaries left to match.
                    break

    def fix_test_set_triggers(self):
        """Some triggers annotated by B&S were missed in our data importation
        step, here we manually set them as actual triggers.
        """
        for i, aux in enumerate(self.all_auxiliaries.auxs):
            if (aux.sentnum, aux.wordnum) in [(12072, 39), (10989, 30), (11804, 12), (11499, 11)]:
                print(self.sentences.get_sentence(aux.sentnum))
                print(aux)
                print("")
                aux.is_trigger = True
                # test_classes is indexed relative to the end of the
                # (pre-oversampling) training block.
                self.test_classes[i - self.pre_oversample_length] = 1

    def save_classifier(self):
        """Pickle the trained classifier to vpe_trained_classifier.npy."""
        a = np.array([self.hyperplane])
        np.save("vpe_trained_classifier", a)

    def load_classifier(self):
        """Load the classifier saved by save_classifier()."""
        a = np.load("vpe_trained_classifier.npy")
        self.hyperplane = a[0]

    def save_data_npy(self, val=False):
        """Serialize the imported data and feature vectors to one .npy file.

        val selects the validation-split file name instead of the test one.
        """
        a = np.array(
            [
                self.gold_standard_auxs,
                self.annotations,
                self.sentences,
                self.all_auxiliaries,
                self.train_vectors,
                self.train_classes,
                self.test_vectors,
                self.test_classes,
            ]
        )
        if val:
            np.save("vpe_detect_data_val", a)
        else:
            np.save("vpe_detect_data_test", a)

    def load_data_npy(self, val=False, all_data=True):
        """Inverse of save_data_npy(); also resets pre_oversample_length.

        all_data appends the "_NON_MRG" suffix to the file name.
        """
        string = "_NON_MRG" if all_data else ""
        if val:
            a = np.load("vpe_detect_data_val" + string + ".npy")
        else:
            a = np.load("vpe_detect_data_test" + string + ".npy")
        self.gold_standard_auxs = a[0]
        self.annotations = a[1]
        self.sentences = a[2]
        self.all_auxiliaries = a[3]
        self.train_vectors = a[4]
        self.train_classes = a[5]
        self.test_vectors = a[6]
        self.test_classes = a[7]
        self.pre_oversample_length = len(self.train_vectors)

    def normalize(self):
        """Fit a StandardScaler on the training matrix and scale both splits.

        Returns (train_matrix, test_matrix).

        Fix: the previous revision discarded the scaled matrices returned by
        fit_transform/transform, so calling it had no effect; the results are
        now returned to the caller.
        """
        print("Normalizing the data...")
        s = StandardScaler(with_mean=False)  # No need to do mean on sparse
        train_m = s.fit_transform(self.vecs_to_mat(train=True))
        test_m = s.transform(self.vecs_to_mat(train=False))
        return train_m, test_m

    def make_feature_vectors(self, make_test_vectors=True, make_train_vectors=True, use_old_vectors=False):
        """Build one sparse feature vector per auxiliary.

        Vectors land in train_vectors/test_vectors according to the section
        of the auxiliary's sentence; classes are 1 for gold triggers else 0.
        NOTE(review): the ``vc`` (vector creation) module is not among the
        imports visible in this file -- confirm it is imported elsewhere.
        """
        if make_train_vectors:
            self.train_vectors, self.train_classes = [], []
        if make_test_vectors:
            self.test_vectors, self.test_classes = [], []
        frequent_words = self.file_names.extract_data_from_file(self.file_names.EACH_UNIQUE_WORD_NEAR_AUX)
        all_pos = self.file_names.extract_data_from_file(self.file_names.EACH_UNIQUE_POS_FILE)
        pos_bigrams = wc.pos_bigrams(all_pos)
        for aux in self.all_auxiliaries:
            sentdict = self.sentences.get_sentence(aux.sentnum)
            if make_train_vectors and self.start_train <= sentdict.get_section() <= self.end_train:
                self.train_vectors.append(
                    csr_matrix(
                        vc.make_vector(
                            sentdict,
                            aux,
                            self.features,
                            vpe.ALL_CATEGORIES,
                            vpe.AUX_LEMMAS,
                            vpe.ALL_AUXILIARIES,
                            frequent_words,
                            all_pos,
                            pos_bigrams,
                            make_old=use_old_vectors,
                        )
                    )
                )
                self.train_classes.append(vc.bool_to_int(aux.is_trigger))
                # Progress report on the first vector and every 1000th one.
                if len(self.train_vectors) % 1000 == 0 or len(self.train_vectors) == 1:
                    print("Making the %dth training vector..." % len(self.train_vectors))
            if make_test_vectors and self.start_test <= sentdict.get_section() <= self.end_test:
                self.test_vectors.append(
                    csr_matrix(
                        vc.make_vector(
                            sentdict,
                            aux,
                            self.features,
                            vpe.ALL_CATEGORIES,
                            vpe.AUX_LEMMAS,
                            vpe.ALL_AUXILIARIES,
                            frequent_words,
                            all_pos,
                            pos_bigrams,
                            make_old=use_old_vectors,
                        )
                    )
                )
                self.test_classes.append(vc.bool_to_int(aux.is_trigger))
                if len(self.test_vectors) % 1000 == 0 or len(self.test_vectors) == 1:
                    print("Making the %dth testing vector..." % len(self.test_vectors))
        self.pre_oversample_length = len(self.train_vectors)

    def oversample(self, multiplier=None):
        """Duplicate every positive training example ``multiplier`` times.

        When multiplier is None it defaults to the negative/positive class
        ratio (integer division -- ``//`` keeps the Python 2 int-division
        behavior and stays correct under Python 3).
        """
        if not multiplier:
            multiplier = self.train_classes.count(vc.bool_to_int(False)) // self.train_classes.count(
                vc.bool_to_int(True)
            )
        print("Oversampling by x%d" % multiplier)
        new_features = []
        new_classes = []
        for i in range(0, len(self.train_vectors)):
            if self.train_classes[i] == vc.bool_to_int(True):
                for _ in range(0, multiplier):
                    new_features.append(self.train_vectors[i])
                    new_classes.append(vc.bool_to_int(True))
            else:
                new_features.append(self.train_vectors[i])
                new_classes.append(vc.bool_to_int(False))
        self.train_vectors = new_features
        self.train_classes = new_classes

    def vecs_to_mat(self, train=True):
        """Stack the chosen list of csr row vectors into one csr matrix."""
        vecs = self.train_vectors if train else self.test_vectors
        # Fix: one vstack over the whole list replaces the previous O(n^2)
        # pairwise stacking loop.
        return vstack(vecs, format="csr")

    def train(self):
        """Fit the classifier on the stacked training matrix (cached in self.m)."""
        print("Training the model...")
        if self.m is None:  # fix: "== None" is elementwise on sparse matrices
            self.m = vstack(self.train_vectors, format="csr")
        self.hyperplane.fit(self.m, np.array(self.train_classes))

    def make_so(self):
        """Retype auxiliaries followed by 'so'/'likewise'/'the same|opposite' as 'so'."""
        for aux in self.all_auxiliaries:
            sent = self.sentences[aux.sentnum].words
            try:
                if sent[aux.wordnum + 1] == "so" or sent[aux.wordnum + 1] == "likewise":
                    aux.type = "so"
                if sent[aux.wordnum + 1] == "the" and sent[aux.wordnum + 2] in ["same", "opposite"]:
                    aux.type = "so"
            except IndexError:
                # Auxiliary is at (or next to) the end of the sentence.
                pass

    def set_aux_type(self, type_):
        """Restrict vectors, classes and auxiliaries to a single aux type.

        Auxiliaries with index < len(train_vectors) belong to the training
        split; the rest index into test_vectors with an offset.
        """
        new_train, new_test = [], []
        new_train_classes, new_test_classes = [], []
        new_auxs = vpe.Auxiliaries()
        for i in range(len(self.train_vectors)):
            if self.all_auxiliaries.get_aux(i).type == type_:
                new_train.append(self.train_vectors[i])
                new_train_classes.append(self.train_classes[i])
                new_auxs.add_aux(self.all_auxiliaries.get_aux(i))
        for i in range(len(self.train_vectors), len(self.all_auxiliaries)):
            if self.all_auxiliaries.get_aux(i).type == type_:
                new_test.append(self.test_vectors[i - len(self.train_vectors)])
                new_test_classes.append(self.test_classes[i - len(self.train_vectors)])
                new_auxs.add_aux(self.all_auxiliaries.get_aux(i))
        self.train_vectors = new_train
        self.train_classes = new_train_classes
        self.test_vectors = new_test
        self.test_classes = new_test_classes
        self.all_auxiliaries = new_auxs

    def analyze_auxs(self):
        """Print a per-type breakdown of gold-standard trigger auxiliaries."""
        d = {}
        for aux in self.all_auxiliaries:
            if aux.is_trigger:
                # Fix: setdefault replaces the removed-in-Py3 dict.has_key().
                d.setdefault(aux.type, []).append(aux)
        print(d.keys())
        total = 0
        for k in d:
            total += len(d[k])
            print("%s %d" % (k, len(d[k])))
        print(total)

    def test(self, mat=None):
        """Predict on ``mat`` or on the cached stacked test matrix (self.m2).

        Fix: ``mat == None`` / ``self.m2 == None`` performed elementwise
        comparison once the value is a sparse matrix; replaced by ``is None``.
        """
        print("Testing the model...")
        if mat is None:
            if self.m2 is None:
                self.m2 = vstack(self.test_vectors, format="csr")
            self.predictions = self.hyperplane.predict(self.m2)
        else:
            self.predictions = self.hyperplane.predict(mat)

    def test_my_rules(self, original_rules=False, idxs=None):
        """Predict triggers with hand-written rules instead of the classifier.

        original_rules selects the older ``dv`` rule set over the ``wc`` one;
        idxs optionally restricts evaluation to those auxiliary indices.
        NOTE(review): the ``dv`` module is not among the imports visible in
        this file -- confirm it is imported elsewhere.
        """
        self.predictions = []
        print(
            "Length of test set: %d, length of All_auxs-training vectors: %d"
            % (len(self.test_classes), len(self.all_auxiliaries) - len(self.train_vectors))
        )
        for i in range(self.pre_oversample_length, len(self.all_auxiliaries)):
            if idxs is None or i in idxs:
                aux = self.all_auxiliaries.get_aux(i)
                sendict = self.sentences.get_sentence(aux.sentnum)
                tree = sendict.get_nltk_tree()
                word_subtree_positions = nt.get_smallest_subtree_positions(tree)
                if not original_rules:
                    if aux.type == "modal":
                        self.predictions.append(
                            vc.bool_to_int(wc.modal_rule(sendict, aux, tree, word_subtree_positions))
                        )
                    elif aux.type == "be":
                        self.predictions.append(vc.bool_to_int(wc.be_rule(sendict, aux)))
                    elif aux.type == "have":
                        self.predictions.append(vc.bool_to_int(wc.have_rule(sendict, aux)))
                    elif aux.type == "do":
                        self.predictions.append(vc.bool_to_int(wc.do_rule(sendict, aux, tree, word_subtree_positions)))
                    elif aux.type == "so":
                        self.predictions.append(vc.bool_to_int(wc.so_rule(sendict, aux)))
                    elif aux.type == "to":
                        self.predictions.append(vc.bool_to_int(wc.to_rule(sendict, aux)))
                else:
                    auxidx = aux.wordnum
                    if aux.type == "modal":
                        self.predictions.append(
                            vc.bool_to_int(dv.modalcheck(sendict, auxidx, tree, word_subtree_positions))
                        )
                    elif aux.type == "be":
                        self.predictions.append(
                            vc.bool_to_int(dv.becheck(sendict, auxidx, tree, word_subtree_positions))
                        )
                    elif aux.type == "have":
                        self.predictions.append(
                            vc.bool_to_int(dv.havecheck(sendict, auxidx, tree, word_subtree_positions))
                        )
                    elif aux.type == "do":
                        self.predictions.append(
                            vc.bool_to_int(dv.docheck(sendict, auxidx, tree, word_subtree_positions))
                        )
                    elif aux.type == "so":
                        self.predictions.append(
                            vc.bool_to_int(dv.socheck(sendict, auxidx, tree, word_subtree_positions))
                        )
                    elif aux.type == "to":
                        self.predictions.append(
                            vc.bool_to_int(dv.tocheck(sendict, auxidx, tree, word_subtree_positions))
                        )

    def results(self, name, set_name="Test", test_classes=None, test_auxs=None, v=False):
        """Score self.predictions against test_classes, print P/R/F1.

        Stores [(outcome, index), ..., ('precision', p), ('recall', r),
        ('f1', f)] in self.result_vector. ``v`` additionally prints each
        tp/fn/fp sentence (requires test_auxs).
        Raises ValueError when prediction/class lengths differ.
        """
        if test_classes is None:
            test_classes = self.test_classes
        if test_auxs is None:
            print("WOIJOWIRJWOIRJWOIRJWORIQJWRPOWQJRPOWQJRPOJQWRPOQWJR")
        if len(self.predictions) != len(test_classes):
            raise ValueError("The number of test vectors != the number of test classes!")
        result_vector = []
        # Floats so the precision/recall divisions below are true divisions
        # under Python 2 as well.
        tp, fp, fn = 0.0, 0.0, 0.0
        for i in range(len(test_classes)):
            if v:
                sent = self.sentences.get_sentence(test_auxs[i].sentnum)
            if test_classes[i] == self.predictions[i] == vc.bool_to_int(True):
                result_vector.append(("tp", i))
                if v:
                    print("TP %s %s" % (sent.file, sent))
                    print("%s \n" % test_auxs[i])
                tp += 1
            elif test_classes[i] == vc.bool_to_int(True) and self.predictions[i] == vc.bool_to_int(False):
                result_vector.append(("fn", i))
                if v:
                    print("FN %s %s" % (sent.file, sent))
                    print("%s \n" % test_auxs[i])
                fn += 1
            elif test_classes[i] == vc.bool_to_int(False) and self.predictions[i] == vc.bool_to_int(True):
                result_vector.append(("fp", i))
                if v:
                    print("FP %s %s" % (sent.file, sent))
                    print("%s \n" % test_auxs[i])
                fp += 1
        try:
            precision = tp / (tp + fp)
        except ZeroDivisionError:
            precision = 0.0
        try:
            recall = tp / (tp + fn)
        except ZeroDivisionError:
            recall = 0.0
        if precision == 0.0 or recall == 0.0:
            f1 = 0.0
        else:
            f1 = 2 * precision * recall / (precision + recall)
        print('\nResults from applying "%s" on the %s set.' % (name, set_name))
        print("TP: %d, FP: %d, FN: %d" % (tp, fp, fn))
        print("Precision: %0.3f" % precision)
        print("Recall: %0.3f" % recall)
        print("F1: %0.3f\n" % f1)
        result_vector += [("precision", precision), ("recall", recall), ("f1", f1)]
        self.result_vector = result_vector

    def log_results(self, file_name):
        """Write self.result_vector (tp/fp/fn sentences plus P/R/F1) to a log file."""
        train_length = self.pre_oversample_length
        with open(self.file_names.RESULT_LOGS_LOCATION + file_name + ".txt", "w") as f:
            for pair in self.result_vector:
                if pair[0] in ["tp", "fp", "fn"]:
                    # result_vector indices are test-set-relative; shift past
                    # the training block to address all_auxiliaries.
                    aux = self.all_auxiliaries.get_aux(pair[1] + train_length)
                    sentdict = self.sentences.get_sentence(aux.sentnum)
                    f.write("%s\n%s: %s\n\n" % (str(aux), pair[0].upper(), sentdict.words_to_string()))
                else:
                    f.write("\n%s: %0.3f\n" % (pair[0], pair[1]))

    def initialize2(self, aux_type=None, rules_test=False, oversample=5):
        """Optionally restrict to one aux type, then oversample unless rule-testing."""
        if aux_type:
            self.set_aux_type(aux_type)
        if not rules_test:
            self.oversample(multiplier=oversample)
from numpy import dot from copy import copy, deepcopy from random import shuffle MODALS = [ 'can', 'could', 'may', 'must', 'might', 'will', 'would', 'shall', 'should' ] BE = ['be'] HAVE = ['have'] DO = ['do'] TO = ['to'] SO = ['so', 'same', 'likewise', 'opposite'] AUX_LEMMAS = MODALS + BE + HAVE + DO + TO + SO ALL_CATEGORIES = [MODALS, BE, HAVE, DO, TO, SO] ALL_AUXILIARIES = Files().extract_data_from_file(Files.UNIQUE_AUXILIARIES_FILE) EMPTY_DEP = 'NONE' """ ---- Exception classes. ---- """ class AuxiliaryHasNoTypeException(BaseException): def __init__(self, aux_name): print 'The following auxiliary, %s, has no category!' % aux_name class EmptySentDictException(BaseException): def __init__(self): pass class GoldStandardComesFromRawException(BaseException):
import word_characteristics as wc import numpy as np import nltktree as nt import warnings from file_names import Files from os import listdir from sys import argv from sklearn.cross_validation import KFold from sklearn.preprocessing import StandardScaler from sklearn.metrics import precision_score, recall_score, f1_score from sklearn.linear_model import LogisticRegression, LogisticRegressionCV from sklearn.svm import SVC, LinearSVC, NuSVC from sklearn.ensemble import RandomForestClassifier from scipy.sparse import csr_matrix, vstack files = Files() MRG_DATA_FILE = 'dataset_with_features_ALL_AUXS.npy' AUTO_PARSE_FILE = '../npy_data/auto_parse_with_features_FULL_DATASET.npy' AUTO_PARSE_XML_DIR = '/Users/kian/Documents/HONOR/xml_annotations/raw_auto_parse/' class Dataset(object): def __init__(self): self.sentences = [] self.auxs = [] self.gold_auxs = [] self.X = [] self.Y = [] self.section_ends = {k: -1 for k in range(0, 25)} def add(self, section):
class VPEDetectionClassifier:
    """Detects VPE (verb-phrase ellipsis) trigger auxiliaries.

    Imports annotated treebank sections, builds one feature vector per
    auxiliary, trains an sklearn classifier (the "hyperplane") on sections
    [start_train, end_train] and evaluates trigger predictions on sections
    [start_test, end_test] against the gold-standard annotations.

    NOTE(review): this is a second, near-identical copy of a class defined
    earlier in this file -- confirm which copy is the live one.
    """

    # Symbolic names accepted by set_classifier().
    SVM = 'SVM'
    NUSVM = 'NuSVC'
    LINEAR_SVM = 'Linear SVC'
    LOGREG = 'Logistic regression'
    NAIVE_BAYES = 'Naive Bayes'
    LOGREGCV = 'Logistic regression CV'
    DECISION_TREE = 'Decision Tree'
    DECISION_TREE_WITH_OPTIONS = 'Decision Tree with options'
    RANDOMFOREST = 'Random Forest'
    ADABOOST = 'Adaboost'

    def __init__(self, start_train, end_train, start_test, end_test):
        # The four arguments are inclusive section-number bounds selecting the
        # training and test splits (see import_data / make_feature_vectors).
        self.sentences = vpe.AllSentences()
        self.annotations = vpe.Annotations()
        self.file_names = Files()
        self.all_auxiliaries = vpe.Auxiliaries()
        self.gold_standard_auxs = vpe.Auxiliaries()
        self.hyperplane = None  # the sklearn classifier; set via set_classifier()
        self.features = []
        """ Train and test_vectors are lists of csr_matrices in order to save memory. """
        self.m = None   # cached stacked training matrix (built lazily in train())
        self.m2 = None  # cached stacked test matrix (built lazily in test())
        self.train_vectors = []
        self.train_classes = []
        self.test_vectors = []
        self.test_classes = []
        self.predictions = []
        self.result_vector = []
        self.pre_oversample_length = 0  # training-set size before oversampling
        self.start_train = start_train
        self.end_train = end_train
        self.start_test = start_test
        self.end_test = end_test

    def set_classifier(self, classifier):
        # Instantiate the classifier named by one of the class constants; any
        # unrecognized value is stored as-is (assumed to be an estimator).
        # NOTE(review): MultinomialNB, DecisionTreeClassifier and
        # AdaBoostClassifier are not among the imports visible in this file --
        # confirm they are imported elsewhere.
        if classifier == self.SVM:
            self.hyperplane = SVC()
        elif classifier == self.NUSVM:
            self.hyperplane = NuSVC(nu=0.9)
        elif classifier == self.LINEAR_SVM:
            self.hyperplane = LinearSVC()
        elif classifier == self.LOGREG:
            self.hyperplane = LogisticRegression()
        elif classifier == self.NAIVE_BAYES:
            self.hyperplane = MultinomialNB()
        elif classifier == self.LOGREGCV:
            self.hyperplane = LogisticRegressionCV()
        elif classifier == self.DECISION_TREE:
            self.hyperplane = DecisionTreeClassifier()
        elif classifier == self.DECISION_TREE_WITH_OPTIONS:
            self.hyperplane = DecisionTreeClassifier(max_depth=10, min_samples_leaf=3)
        elif classifier == self.RANDOMFOREST:
            self.hyperplane = RandomForestClassifier(n_estimators=100, min_samples_leaf=4)
        elif classifier == self.ADABOOST:
            self.hyperplane = AdaBoostClassifier(random_state=1917, n_estimators=100)
        else:
            self.hyperplane = classifier

    def set_features(self, features):
        # Feature names consumed later by make_feature_vectors().
        self.features = features

    def import_data(self, test=None):
        # Load MRG/POS XML files plus gold-standard VPE annotations for the
        # sections in the train/test ranges. ``test``, when given, restricts
        # importing to that collection of file names.
        dirs = listdir(self.file_names.XML_MRG)
        dirs.sort()
        sentnum_modifier = -1
        dnum = 0
        for d in dirs:
            subdir = d + self.file_names.SLASH_CHAR
            if subdir.startswith('.'):
                continue
            if (self.start_train <= dnum <= self.end_train) or (
                    self.start_test <= dnum <= self.end_test):
                section_annotation = vpe.AnnotationSection(
                    subdir, self.file_names.VPE_ANNOTATIONS)
                vpe_files = list(
                    set([annotation.file for annotation in section_annotation]))
                vpe_files.sort()
                for f in vpe_files:
                    if not test or (test and f in test):
                        # Here we are now getting the non-MRG POS file that we had neglected to get before.
                        try:
                            mrg_matrix = vpe.XMLMatrix(
                                f + '.mrg.xml', self.file_names.XML_MRG + subdir)
                        except IOError:
                            mrg_matrix = vpe.XMLMatrix(f + '.pos.xml',
                                                       self.file_names.XML_POS,
                                                       pos_file=True)
                        for sentdict in mrg_matrix:
                            self.all_auxiliaries.add_auxs(
                                sentdict.get_auxiliaries(),
                                sentnum_modifier=sentnum_modifier)
                        self.gold_standard_auxs.add_auxs(
                            mrg_matrix.get_gs_auxiliaries(
                                section_annotation.get_anns_for_file(f),
                                sentnum_modifier))
                        self.annotations.add_section(section_annotation)
                        self.sentences.add_mrg(mrg_matrix)
                        # Offset so the next file's auxiliaries get global sentence numbers.
                        sentnum_modifier = len(self.sentences) - 1
            dnum += 1
        # We now just have to say which auxs are the gold standard ones within the 'all_auxiliaries' object by changing their "is_trigger" attribute.
        # Both sequences are in document order, so one linear merge suffices.
        crt_gold_aux_idx = 0
        crt_gold_aux = self.gold_standard_auxs.get_aux(crt_gold_aux_idx)
        for aux in self.all_auxiliaries:
            if crt_gold_aux.equals(aux):
                aux.is_trigger = True
                crt_gold_aux_idx += 1
                try:
                    crt_gold_aux = self.gold_standard_auxs.get_aux(
                        crt_gold_aux_idx)
                except IndexError:
                    # No gold auxiliaries left to match.
                    break

    def fix_test_set_triggers(self):
        """ Some triggers annotated by B&S were missed in our data importation step, here we manually set them as actual triggers. """
        for i, aux in enumerate(self.all_auxiliaries.auxs):
            if (aux.sentnum, aux.wordnum) in [(12072, 39), (10989, 30),
                                              (11804, 12), (11499, 11)]:
                print self.sentences.get_sentence(aux.sentnum)
                print aux
                print
                aux.is_trigger = True
                # test_classes is indexed relative to the end of the
                # (pre-oversampling) training block.
                self.test_classes[i - self.pre_oversample_length] = 1

    def save_classifier(self):
        # Pickle the trained classifier inside a 1-element numpy array.
        a = np.array([self.hyperplane])
        np.save('vpe_trained_classifier', a)

    def load_classifier(self):
        # Inverse of save_classifier().
        a = np.load('vpe_trained_classifier.npy')
        self.hyperplane = a[0]

    def save_data_npy(self, val=False):
        # Serialize imported data + feature vectors; ``val`` picks the
        # validation file name instead of the test one.
        a = np.array([
            self.gold_standard_auxs, self.annotations, self.sentences,
            self.all_auxiliaries, self.train_vectors, self.train_classes,
            self.test_vectors, self.test_classes
        ])
        if val:
            np.save('vpe_detect_data_val', a)
        else:
            np.save('vpe_detect_data_test', a)

    def load_data_npy(self, val=False, all_data=True):
        # Inverse of save_data_npy(); ``all_data`` selects the "_NON_MRG"
        # variant. Also resets pre_oversample_length.
        string = '_NON_MRG' if all_data else ''
        if val:
            a = np.load('vpe_detect_data_val' + string + '.npy')
        else:
            a = np.load('vpe_detect_data_test' + string + '.npy')
        self.gold_standard_auxs = a[0]
        self.annotations = a[1]
        self.sentences = a[2]
        self.all_auxiliaries = a[3]
        self.train_vectors = a[4]
        self.train_classes = a[5]
        self.test_vectors = a[6]
        self.test_classes = a[7]
        self.pre_oversample_length = len(self.train_vectors)

    def normalize(self):
        # NOTE(review): the scaled matrices returned by fit_transform and
        # transform are discarded, so this call currently has no effect on
        # the stored vectors -- confirm intent.
        print 'Normalizing the data...'
        s = StandardScaler(with_mean=False)  # No need to do mean on sparse
        s.fit_transform(self.vecs_to_mat(train=True))
        s.transform(self.vecs_to_mat(train=False))

    def make_feature_vectors(self,
                             make_test_vectors=True,
                             make_train_vectors=True,
                             use_old_vectors=False):
        # Build one sparse feature vector per auxiliary; vectors land in the
        # train or test split according to the section of their sentence.
        # Classes are 1 for gold triggers, 0 otherwise.
        # NOTE(review): the ``vc`` module is not among the imports visible in
        # this file -- confirm it is imported elsewhere.
        if make_train_vectors:
            self.train_vectors, self.train_classes = [], []
        if make_test_vectors:
            self.test_vectors, self.test_classes = [], []
        frequent_words = self.file_names.extract_data_from_file(
            self.file_names.EACH_UNIQUE_WORD_NEAR_AUX)
        all_pos = self.file_names.extract_data_from_file(
            self.file_names.EACH_UNIQUE_POS_FILE)
        pos_bigrams = wc.pos_bigrams(all_pos)
        for aux in self.all_auxiliaries:
            sentdict = self.sentences.get_sentence(aux.sentnum)
            if make_train_vectors and self.start_train <= sentdict.get_section(
            ) <= self.end_train:
                self.train_vectors.append(
                    csr_matrix(
                        vc.make_vector(sentdict,
                                       aux,
                                       self.features,
                                       vpe.ALL_CATEGORIES,
                                       vpe.AUX_LEMMAS,
                                       vpe.ALL_AUXILIARIES,
                                       frequent_words,
                                       all_pos,
                                       pos_bigrams,
                                       make_old=use_old_vectors)))
                self.train_classes.append(vc.bool_to_int(aux.is_trigger))
                # Progress report on the first vector and every 1000th one.
                if len(self.train_vectors) % 1000 == 0 or len(
                        self.train_vectors) == 1:
                    print 'Making the %dth training vector...' % (len(
                        self.train_vectors))
            if make_test_vectors and self.start_test <= sentdict.get_section(
            ) <= self.end_test:
                self.test_vectors.append(
                    csr_matrix(
                        vc.make_vector(sentdict,
                                       aux,
                                       self.features,
                                       vpe.ALL_CATEGORIES,
                                       vpe.AUX_LEMMAS,
                                       vpe.ALL_AUXILIARIES,
                                       frequent_words,
                                       all_pos,
                                       pos_bigrams,
                                       make_old=use_old_vectors)))
                self.test_classes.append(vc.bool_to_int(aux.is_trigger))
                if len(self.test_vectors) % 1000 == 0 or len(
                        self.test_vectors) == 1:
                    print 'Making the %dth testing vector...' % (len(
                        self.test_vectors))
        self.pre_oversample_length = len(self.train_vectors)

    def oversample(self, multiplier=None):
        # Duplicate every positive training example ``multiplier`` times.
        # The default is the negative/positive ratio (Python 2 int division).
        if not multiplier:
            multiplier = self.train_classes.count(
                vc.bool_to_int(False)) / self.train_classes.count(
                    vc.bool_to_int(True))
        print 'Oversampling by x%d' % multiplier
        new_features = []
        new_classes = []
        for i in range(0, len(self.train_vectors)):
            if self.train_classes[i] == vc.bool_to_int(True):
                for _ in range(0, multiplier):
                    new_features.append(self.train_vectors[i])
                    new_classes.append(vc.bool_to_int(True))
            else:
                new_features.append(self.train_vectors[i])
                new_classes.append(vc.bool_to_int(False))
        self.train_vectors = new_features
        self.train_classes = new_classes

    def vecs_to_mat(self, train=True):
        # Stack the chosen list of csr row vectors into one csr matrix.
        if train:
            vecs = self.train_vectors
        else:
            vecs = self.test_vectors
        m = vecs[0]
        for i in range(1, len(vecs)):
            m = vstack((m, vecs[i]), format='csr')
        return m

    def train(self):
        # Fit the classifier on the stacked training matrix, cached in self.m.
        # NOTE(review): "== None" performs elementwise comparison once self.m
        # holds a sparse matrix -- "is None" would be the safe test.
        print 'Training the model...'
        if self.m == None:
            self.m = self.train_vectors[0]
            for i in range(1, len(self.train_vectors)):
                self.m = vstack((self.m, self.train_vectors[i]), format='csr')
        self.hyperplane.fit(self.m, np.array(self.train_classes))

    def make_so(self):
        # Retype auxiliaries followed by 'so'/'likewise'/'the same|opposite'
        # as type 'so'.
        for aux in self.all_auxiliaries:
            sent = self.sentences[aux.sentnum].words
            try:
                if sent[aux.wordnum + 1] == 'so' or sent[aux.wordnum +
                                                         1] == 'likewise':
                    aux.type = 'so'
                if (sent[aux.wordnum + 1] == 'the'
                        and sent[aux.wordnum + 2] in ['same', 'opposite']):
                    aux.type = 'so'
            except IndexError:
                # Auxiliary sits at (or next to) the end of its sentence.
                pass

    def set_aux_type(self, type_):
        # Restrict vectors/classes/auxiliaries to a single aux type.
        # Auxiliaries with index < len(train_vectors) belong to the training
        # split; the rest index into test_vectors with an offset.
        # assert type_ in
        new_train, new_test = [], []
        new_train_classes, new_test_classes = [], []
        new_auxs = vpe.Auxiliaries()
        for i in range(len(self.train_vectors)):
            if self.all_auxiliaries.get_aux(i).type == type_:
                new_train.append(self.train_vectors[i])
                new_train_classes.append(self.train_classes[i])
                new_auxs.add_aux(self.all_auxiliaries.get_aux(i))
        for i in range(len(self.train_vectors), len(self.all_auxiliaries)):
            if self.all_auxiliaries.get_aux(i).type == type_:
                new_test.append(self.test_vectors[i - len(self.train_vectors)])
                new_test_classes.append(
                    self.test_classes[i - len(self.train_vectors)])
                new_auxs.add_aux(self.all_auxiliaries.get_aux(i))
        self.train_vectors = new_train
        self.train_classes = new_train_classes
        self.test_vectors = new_test
        self.test_classes = new_test_classes
        self.all_auxiliaries = new_auxs

    def analyze_auxs(self):
        # Print a per-type breakdown of gold-standard trigger auxiliaries.
        d = {}
        for aux in self.all_auxiliaries:
            if aux.is_trigger:
                if not d.has_key(aux.type):
                    d[aux.type] = [aux]
                else:
                    d[aux.type].append(aux)
        print d.keys()
        total = 0
        for k in d:
            total += len(d[k])
            print k, len(d[k])
        print total

    def test(self, mat=None):
        # Predict on ``mat``, or on the cached stacked test matrix self.m2.
        # NOTE(review): "mat == None" / "self.m2 == None" perform elementwise
        # comparison when the value is a sparse matrix -- "is None" would be
        # the safe test.
        print 'Testing the model...'
        if mat == None:
            if self.m2 == None:
                self.m2 = self.test_vectors[0]
                for j in range(1, len(self.test_vectors)):
                    self.m2 = vstack((self.m2, self.test_vectors[j]),
                                     format='csr')
            self.predictions = self.hyperplane.predict(self.m2)
        else:
            self.predictions = self.hyperplane.predict(mat)

    def test_my_rules(self, original_rules=False, idxs=None):
        # Predict triggers with hand-written rules instead of the classifier.
        # ``original_rules`` selects the older ``dv`` rule set over the ``wc``
        # one; ``idxs`` optionally restricts evaluation to those indices.
        # NOTE(review): the ``dv`` module is not among the imports visible in
        # this file -- confirm it is imported elsewhere.
        self.predictions = []
        print 'Length of test set: %d, length of All_auxs-training vectors: %d' % (
            len(self.test_classes),
            len(self.all_auxiliaries) - len(self.train_vectors))
        for i in range(self.pre_oversample_length, len(self.all_auxiliaries)):
            if idxs == None or i in idxs:
                aux = self.all_auxiliaries.get_aux(i)
                sendict = self.sentences.get_sentence(aux.sentnum)
                tree = sendict.get_nltk_tree()
                word_subtree_positions = nt.get_smallest_subtree_positions(
                    tree)
                if not original_rules:
                    if aux.type == 'modal':
                        self.predictions.append(
                            vc.bool_to_int(
                                wc.modal_rule(sendict, aux, tree,
                                              word_subtree_positions)))
                    elif aux.type == 'be':
                        self.predictions.append(
                            vc.bool_to_int(wc.be_rule(sendict, aux)))
                    elif aux.type == 'have':
                        self.predictions.append(
                            vc.bool_to_int(wc.have_rule(sendict, aux)))
                    elif aux.type == 'do':
                        self.predictions.append(
                            vc.bool_to_int(
                                wc.do_rule(sendict, aux, tree,
                                           word_subtree_positions)))
                    elif aux.type == 'so':
                        self.predictions.append(
                            vc.bool_to_int(wc.so_rule(sendict, aux)))
                    elif aux.type == 'to':
                        self.predictions.append(
                            vc.bool_to_int(wc.to_rule(sendict, aux)))
                else:
                    auxidx = aux.wordnum
                    if aux.type == 'modal':
                        self.predictions.append(
                            vc.bool_to_int(
                                dv.modalcheck(sendict, auxidx, tree,
                                              word_subtree_positions)))
                    elif aux.type == 'be':
                        self.predictions.append(
                            vc.bool_to_int(
                                dv.becheck(sendict, auxidx, tree,
                                           word_subtree_positions)))
                    elif aux.type == 'have':
                        self.predictions.append(
                            vc.bool_to_int(
                                dv.havecheck(sendict, auxidx, tree,
                                             word_subtree_positions)))
                    elif aux.type == 'do':
                        self.predictions.append(
                            vc.bool_to_int(
                                dv.docheck(sendict, auxidx, tree,
                                           word_subtree_positions)))
                    elif aux.type == 'so':
                        self.predictions.append(
                            vc.bool_to_int(
                                dv.socheck(sendict, auxidx, tree,
                                           word_subtree_positions)))
                    elif aux.type == 'to':
                        self.predictions.append(
                            vc.bool_to_int(
                                dv.tocheck(sendict, auxidx, tree,
                                           word_subtree_positions)))

    def results(self,
                name,
                set_name='Test',
                test_classes=None,
                test_auxs=None,
                v=False):
        # Score self.predictions against test_classes and print P/R/F1.
        # Stores [(outcome, index), ..., ('precision', p), ('recall', r),
        # ('f1', f)] in self.result_vector. ``v`` additionally prints each
        # tp/fn/fp sentence (requires test_auxs).
        if test_classes == None:
            test_classes = self.test_classes
        if test_auxs == None:
            print 'WOIJOWIRJWOIRJWOIRJWORIQJWRPOWQJRPOWQJRPOJQWRPOQWJR'
            # test_auxs = self.all_auxiliaries
        if len(self.predictions) != len(test_classes):
            raise Exception(
                'The number of test vectors != the number of test classes!')
        result_vector = []
        # Floats so the precision/recall divisions below are true divisions
        # under Python 2 as well.
        tp, fp, fn = 0.0, 0.0, 0.0
        for i in range(len(test_classes)):
            if v:
                sent = self.sentences.get_sentence(test_auxs[i].sentnum)
            if test_classes[i] == self.predictions[i] == vc.bool_to_int(True):
                result_vector.append(('tp', i))
                if v:
                    print 'TP', sent.file, sent
                    print test_auxs[i], '\n'
                tp += 1
            elif test_classes[i] == vc.bool_to_int(
                    True) and self.predictions[i] == vc.bool_to_int(False):
                result_vector.append(('fn', i))
                if v:
                    print 'FN', sent.file, sent
                    print test_auxs[i], '\n'
                fn += 1
            elif test_classes[i] == vc.bool_to_int(
                    False) and self.predictions[i] == vc.bool_to_int(True):
                result_vector.append(('fp', i))
                if v:
                    print 'FP', sent.file, sent
                    print test_auxs[i], '\n'
                fp += 1
        try:
            precision = tp / (tp + fp)
        except ZeroDivisionError:
            precision = 0.0
        try:
            recall = tp / (tp + fn)
        except ZeroDivisionError:
            recall = 0.0
        if precision == 0.0 or recall == 0.0:
            f1 = 0.0
        else:
            f1 = 2 * precision * recall / (precision + recall)
        print '\nResults from applying \"%s\" on the %s set.' % (name,
                                                                 set_name)
        print 'TP: %d, FP: %d, FN: %d' % (tp, fp, fn)
        print 'Precision: %0.3f' % precision
        print 'Recall: %0.3f' % recall
        print 'F1: %0.3f\n' % f1
        result_vector += [('precision', precision), ('recall', recall),
                          ('f1', f1)]
        self.result_vector = result_vector

    def log_results(self, file_name):
        # Write self.result_vector (tp/fp/fn sentences plus P/R/F1) to a log
        # file under RESULT_LOGS_LOCATION.
        train_length = self.pre_oversample_length
        with open(self.file_names.RESULT_LOGS_LOCATION + file_name + '.txt',
                  'w') as f:
            for pair in self.result_vector:
                if pair[0] in ['tp', 'fp', 'fn']:
                    # result_vector indices are test-set-relative; shift past
                    # the training block to address all_auxiliaries.
                    aux = self.all_auxiliaries.get_aux(pair[1] + train_length)
                    sentdict = self.sentences.get_sentence(aux.sentnum)
                    # print aux
                    # print pair[0].upper(),
                    # sentdict.print_sentence()
                    # print
                    f.write('%s\n%s: %s\n\n' % (str(aux), pair[0].upper(),
                                                sentdict.words_to_string()))
                else:
                    f.write('\n%s: %0.3f\n' % (pair[0], pair[1]))

    def initialize2(self, aux_type=None, rules_test=False, oversample=5):
        # Optionally restrict to one aux type, then oversample the positives
        # unless we are only testing hand-written rules.
        if aux_type:
            self.set_aux_type(aux_type)
        if not rules_test:
            self.oversample(multiplier=oversample)