def make_feature_vectors(self, make_test_vectors=True, make_train_vectors=True, use_old_vectors=False):
    """Build sparse feature vectors and class labels for the train/test splits.

    Every auxiliary in ``self.all_auxiliaries`` is looked up in
    ``self.sentences``; when its sentence's section lies inside the train
    range (``start_train``..``end_train``) and/or the test range
    (``start_test``..``end_test``), a vector and its class label are appended
    to the matching lists on ``self``.

    Args:
        make_test_vectors: when true, reset and refill ``self.test_vectors``
            and ``self.test_classes``.
        make_train_vectors: when true, reset and refill ``self.train_vectors``
            and ``self.train_classes``.
        use_old_vectors: forwarded to ``vc.make_vector`` as ``make_old``.
    """
    # Only the splits that are being rebuilt are emptied.
    if make_train_vectors:
        self.train_vectors, self.train_classes = [], []
    if make_test_vectors:
        self.test_vectors, self.test_classes = [], []

    # Resources shared by every vector; loaded once up front.
    frequent_words = self.file_names.extract_data_from_file(
        self.file_names.EACH_UNIQUE_WORD_NEAR_AUX)
    all_pos = self.file_names.extract_data_from_file(
        self.file_names.EACH_UNIQUE_POS_FILE)
    pos_bigrams = wc.pos_bigrams(all_pos)

    def vectorize(sentence, auxiliary):
        # Single place where the feature vector for one auxiliary is built.
        return csr_matrix(
            vc.make_vector(
                sentence,
                auxiliary,
                self.features,
                vpe.ALL_CATEGORIES,
                vpe.AUX_LEMMAS,
                vpe.ALL_AUXILIARIES,
                frequent_words,
                all_pos,
                pos_bigrams,
                make_old=use_old_vectors,
            )
        )

    for auxiliary in self.all_auxiliaries:
        sentence = self.sentences.get_sentence(auxiliary.sentnum)

        if make_train_vectors and self.start_train <= sentence.get_section() <= self.end_train:
            self.train_vectors.append(vectorize(sentence, auxiliary))
            self.train_classes.append(vc.bool_to_int(auxiliary.is_trigger))
            train_count = len(self.train_vectors)
            # Progress message on the first vector and every 1000th after it.
            if train_count == 1 or train_count % 1000 == 0:
                print("Making the %dth training vector..." % train_count)

        if make_test_vectors and self.start_test <= sentence.get_section() <= self.end_test:
            self.test_vectors.append(vectorize(sentence, auxiliary))
            self.test_classes.append(vc.bool_to_int(auxiliary.is_trigger))
            test_count = len(self.test_vectors)
            if test_count == 1 or test_count % 1000 == 0:
                print("Making the %dth testing vector..." % test_count)

    # Reads self.train_vectors even when make_train_vectors is false, so that
    # attribute must already exist in that case.
    self.pre_oversample_length = len(self.train_vectors)
def all_auxs_to_features(self, features):
    """Return one ``csr_matrix`` feature vector per auxiliary in ``self.auxs``.

    Args:
        features: feature selection forwarded unchanged to ``vc.make_vector``.

    Returns:
        A list, in ``self.auxs`` order, of ``csr_matrix(vc.make_vector(...))``
        results.
    """
    # Resources shared by every vector; loaded once before iterating.
    frequent_words = files.extract_data_from_file(Files.EACH_UNIQUE_WORD_NEAR_AUX)
    all_pos = files.extract_data_from_file(Files.EACH_UNIQUE_POS_FILE)
    pos_bigrams = wc.pos_bigrams(all_pos)

    return [
        csr_matrix(
            vc.make_vector(
                self.sentences[aux.sentnum],
                aux,
                features,
                vpe.ALL_CATEGORIES,
                vpe.AUX_LEMMAS,
                vpe.ALL_AUXILIARIES,
                frequent_words,
                all_pos,
                pos_bigrams,
            )
        )
        for aux in self.auxs
    ]
def all_auxs_to_features(self, features):
    """Convert every auxiliary in ``self.auxs`` into a sparse feature vector.

    Args:
        features: feature selection passed straight through to
            ``vc.make_vector``.

    Returns:
        A list holding one ``csr_matrix`` per auxiliary, in iteration order.
    """
    vectors = []

    # Word / POS resources are read once and reused for every auxiliary.
    frequent_words = files.extract_data_from_file(
        Files.EACH_UNIQUE_WORD_NEAR_AUX)
    all_pos = files.extract_data_from_file(Files.EACH_UNIQUE_POS_FILE)
    pos_bigrams = wc.pos_bigrams(all_pos)

    for auxiliary in self.auxs:
        # The sentence is looked up by the auxiliary's sentence number.
        sentence = self.sentences[auxiliary.sentnum]
        dense = vc.make_vector(
            sentence,
            auxiliary,
            features,
            vpe.ALL_CATEGORIES,
            vpe.AUX_LEMMAS,
            vpe.ALL_AUXILIARIES,
            frequent_words,
            all_pos,
            pos_bigrams,
        )
        vectors.append(csr_matrix(dense))

    return vectors
def make_feature_vectors(self, make_test_vectors=True, make_train_vectors=True, use_old_vectors=False):
    """Populate the train and/or test vector and class lists on ``self``.

    For each auxiliary in ``self.all_auxiliaries`` the owning sentence is
    fetched and its section compared against the configured train and test
    section ranges. A matching auxiliary contributes a ``csr_matrix`` feature
    vector and an integer class label (from ``aux.is_trigger``) to the
    corresponding split.

    Args:
        make_test_vectors: rebuild ``self.test_vectors`` / ``self.test_classes``.
        make_train_vectors: rebuild ``self.train_vectors`` /
            ``self.train_classes``.
        use_old_vectors: passed to ``vc.make_vector`` as ``make_old``.
    """
    if make_train_vectors:
        self.train_vectors = []
        self.train_classes = []

    if make_test_vectors:
        self.test_vectors = []
        self.test_classes = []

    names = self.file_names
    frequent_words = names.extract_data_from_file(
        names.EACH_UNIQUE_WORD_NEAR_AUX)
    all_pos = names.extract_data_from_file(names.EACH_UNIQUE_POS_FILE)
    pos_bigrams = wc.pos_bigrams(all_pos)

    for aux in self.all_auxiliaries:
        sentdict = self.sentences.get_sentence(aux.sentnum)

        # Training split: section must fall inside [start_train, end_train].
        wanted_for_train = make_train_vectors and (
            self.start_train <= sentdict.get_section() <= self.end_train)
        if wanted_for_train:
            dense = vc.make_vector(
                sentdict, aux, self.features,
                vpe.ALL_CATEGORIES, vpe.AUX_LEMMAS, vpe.ALL_AUXILIARIES,
                frequent_words, all_pos, pos_bigrams,
                make_old=use_old_vectors)
            self.train_vectors.append(csr_matrix(dense))
            self.train_classes.append(vc.bool_to_int(aux.is_trigger))

            built = len(self.train_vectors)
            # Report progress for the first vector and each multiple of 1000.
            if built % 1000 == 0 or built == 1:
                print('Making the %dth training vector...' % built)

        # Testing split: evaluated independently, so an auxiliary may land in
        # both splits when the section ranges overlap.
        wanted_for_test = make_test_vectors and (
            self.start_test <= sentdict.get_section() <= self.end_test)
        if wanted_for_test:
            dense = vc.make_vector(
                sentdict, aux, self.features,
                vpe.ALL_CATEGORIES, vpe.AUX_LEMMAS, vpe.ALL_AUXILIARIES,
                frequent_words, all_pos, pos_bigrams,
                make_old=use_old_vectors)
            self.test_vectors.append(csr_matrix(dense))
            self.test_classes.append(vc.bool_to_int(aux.is_trigger))

            built = len(self.test_vectors)
            if built % 1000 == 0 or built == 1:
                print('Making the %dth testing vector...' % built)

    # Size of the training set as built here; self.train_vectors must already
    # exist when make_train_vectors is false.
    self.pre_oversample_length = len(self.train_vectors)