Example #1
0
    def make_feature_vectors(self, make_test_vectors=True, make_train_vectors=True, use_old_vectors=False):
        """Build feature vectors and class labels for the train and/or test split.

        Each auxiliary in ``self.all_auxiliaries`` is looked up in
        ``self.sentences``. When the section of its sentence lies inside the
        inclusive train range (``self.start_train``..``self.end_train``) or
        test range (``self.start_test``..``self.end_test``), the result of
        ``vc.make_vector`` wrapped in ``csr_matrix`` is appended to the
        matching vector list and ``vc.bool_to_int(aux.is_trigger)`` to the
        matching class list. An auxiliary whose section lies in both ranges
        is added to both splits.

        Args:
            make_test_vectors: If true, reset ``self.test_vectors`` and
                ``self.test_classes`` to empty lists and refill them.
            make_train_vectors: If true, reset ``self.train_vectors`` and
                ``self.train_classes`` to empty lists and refill them.
            use_old_vectors: Forwarded to ``vc.make_vector`` as ``make_old``.

        Side effects:
            Sets ``self.pre_oversample_length`` to ``len(self.train_vectors)``
            and prints a progress line for the first and every 1000th vector
            of each split.
        """
        if make_train_vectors:
            self.train_vectors, self.train_classes = [], []
        if make_test_vectors:
            self.test_vectors, self.test_classes = [], []

        # Loaded once up front; every vector is built from the same resources.
        frequent_words = self.file_names.extract_data_from_file(self.file_names.EACH_UNIQUE_WORD_NEAR_AUX)
        all_pos = self.file_names.extract_data_from_file(self.file_names.EACH_UNIQUE_POS_FILE)
        pos_bigrams = wc.pos_bigrams(all_pos)

        def build_vector(sentdict, aux):
            # Single definition of the vector recipe shared by both splits.
            return csr_matrix(
                vc.make_vector(
                    sentdict,
                    aux,
                    self.features,
                    vpe.ALL_CATEGORIES,
                    vpe.AUX_LEMMAS,
                    vpe.ALL_AUXILIARIES,
                    frequent_words,
                    all_pos,
                    pos_bigrams,
                    make_old=use_old_vectors,
                )
            )

        for aux in self.all_auxiliaries:
            sentdict = self.sentences.get_sentence(aux.sentnum)

            if make_train_vectors and self.start_train <= sentdict.get_section() <= self.end_train:
                self.train_vectors.append(build_vector(sentdict, aux))
                self.train_classes.append(vc.bool_to_int(aux.is_trigger))

                train_count = len(self.train_vectors)
                if train_count % 1000 == 0 or train_count == 1:
                    # Parenthesised single argument: same output under the
                    # Python 2 print statement and the Python 3 print function.
                    print("Making the %dth training vector..." % train_count)

            if make_test_vectors and self.start_test <= sentdict.get_section() <= self.end_test:
                self.test_vectors.append(build_vector(sentdict, aux))
                self.test_classes.append(vc.bool_to_int(aux.is_trigger))

                test_count = len(self.test_vectors)
                if test_count % 1000 == 0 or test_count == 1:
                    print("Making the %dth testing vector..." % test_count)

        # NOTE(review): with make_train_vectors=False this relies on
        # self.train_vectors having been set earlier - confirm callers
        # guarantee that.
        self.pre_oversample_length = len(self.train_vectors)
Example #2
0
    def all_auxs_to_features(self, features):
        """Return a list with one ``csr_matrix`` feature vector per auxiliary.

        Args:
            features: Passed through to ``vc.make_vector`` as its third
                argument.

        Returns:
            A list, in the order of ``self.auxs``, of
            ``csr_matrix(vc.make_vector(...))`` results.
        """
        # Resources shared by every vector; loaded once before iterating.
        words_near_aux = files.extract_data_from_file(Files.EACH_UNIQUE_WORD_NEAR_AUX)
        pos_tags = files.extract_data_from_file(Files.EACH_UNIQUE_POS_FILE)
        tag_bigrams = wc.pos_bigrams(pos_tags)

        return [
            csr_matrix(
                vc.make_vector(
                    self.sentences[aux.sentnum],
                    aux,
                    features,
                    vpe.ALL_CATEGORIES,
                    vpe.AUX_LEMMAS,
                    vpe.ALL_AUXILIARIES,
                    words_near_aux,
                    pos_tags,
                    tag_bigrams,
                )
            )
            for aux in self.auxs
        ]
Example #3
0
    def all_auxs_to_features(self, features):
        """Convert every auxiliary in ``self.auxs`` into a feature vector.

        Args:
            features: Handed to ``vc.make_vector`` as its third argument.

        Returns:
            A list of ``csr_matrix`` objects, one per auxiliary, in the same
            order as ``self.auxs``.
        """
        word_vocab = files.extract_data_from_file(
            Files.EACH_UNIQUE_WORD_NEAR_AUX)
        pos_vocab = files.extract_data_from_file(Files.EACH_UNIQUE_POS_FILE)
        bigram_vocab = wc.pos_bigrams(pos_vocab)

        def vectorise(aux):
            # The sentence is fetched by the auxiliary's own sentence number.
            sentence = self.sentences[aux.sentnum]
            dense = vc.make_vector(sentence, aux, features,
                                   vpe.ALL_CATEGORIES, vpe.AUX_LEMMAS,
                                   vpe.ALL_AUXILIARIES, word_vocab, pos_vocab,
                                   bigram_vocab)
            return csr_matrix(dense)

        return [vectorise(aux) for aux in self.auxs]
Example #4
0
    def make_feature_vectors(self,
                             make_test_vectors=True,
                             make_train_vectors=True,
                             use_old_vectors=False):
        """Populate the train and/or test vector and class lists.

        For every auxiliary in ``self.all_auxiliaries`` the owning sentence
        is fetched from ``self.sentences`` and its section is compared with
        the inclusive bounds ``self.start_train``..``self.end_train`` and
        ``self.start_test``..``self.end_test``. For each range that matches,
        ``csr_matrix(vc.make_vector(...))`` is appended to the corresponding
        vector list and ``vc.bool_to_int(aux.is_trigger)`` to the
        corresponding class list. An auxiliary matching both ranges is added
        to both.

        Args:
            make_test_vectors: If true, ``self.test_vectors`` and
                ``self.test_classes`` are reset to empty lists and refilled.
            make_train_vectors: If true, ``self.train_vectors`` and
                ``self.train_classes`` are reset to empty lists and refilled.
            use_old_vectors: Passed to ``vc.make_vector`` as ``make_old``.

        Side effects:
            Stores ``len(self.train_vectors)`` in
            ``self.pre_oversample_length`` and prints progress for the first
            and every 1000th vector of each split.
        """
        if make_train_vectors:
            self.train_vectors, self.train_classes = [], []
        if make_test_vectors:
            self.test_vectors, self.test_classes = [], []

        frequent_words = self.file_names.extract_data_from_file(
            self.file_names.EACH_UNIQUE_WORD_NEAR_AUX)
        all_pos = self.file_names.extract_data_from_file(
            self.file_names.EACH_UNIQUE_POS_FILE)
        pos_bigrams = wc.pos_bigrams(all_pos)

        def vector_for(sentdict, aux):
            # One shared recipe so the train and test vectors cannot drift
            # apart through a copy-paste edit.
            return csr_matrix(
                vc.make_vector(sentdict,
                               aux,
                               self.features,
                               vpe.ALL_CATEGORIES,
                               vpe.AUX_LEMMAS,
                               vpe.ALL_AUXILIARIES,
                               frequent_words,
                               all_pos,
                               pos_bigrams,
                               make_old=use_old_vectors))

        for aux in self.all_auxiliaries:
            sentdict = self.sentences.get_sentence(aux.sentnum)

            in_train = (make_train_vectors and
                        self.start_train <= sentdict.get_section() <= self.end_train)
            if in_train:
                self.train_vectors.append(vector_for(sentdict, aux))
                self.train_classes.append(vc.bool_to_int(aux.is_trigger))

                n_train = len(self.train_vectors)
                if n_train % 1000 == 0 or n_train == 1:
                    # A parenthesised single argument prints identically on
                    # Python 2 and is also valid Python 3.
                    print('Making the %dth training vector...' % n_train)

            in_test = (make_test_vectors and
                       self.start_test <= sentdict.get_section() <= self.end_test)
            if in_test:
                self.test_vectors.append(vector_for(sentdict, aux))
                self.test_classes.append(vc.bool_to_int(aux.is_trigger))

                n_test = len(self.test_vectors)
                if n_test % 1000 == 0 or n_test == 1:
                    print('Making the %dth testing vector...' % n_test)

        # NOTE(review): when make_train_vectors is False this needs
        # self.train_vectors to exist already - verify against callers.
        self.pre_oversample_length = len(self.train_vectors)