コード例 #1
0
ファイル: new_vpesvm.py プロジェクト: kiankd/vpe
    def results(self, name, set_name="Test", test_classes=None, test_auxs=None, v=False):
        """Score ``self.predictions`` against gold labels and print a report.

        Counts true positives, false positives and false negatives, derives
        precision, recall and F1 from them, prints the figures, and stores the
        per-example outcomes plus the three metrics in ``self.result_vector``.

        Args:
            name: Label of the classifier or rule set, used in the report.
            set_name: Label of the evaluated data set, used in the report.
            test_classes: Gold labels, one per prediction. Defaults to
                ``self.test_classes``.
            test_auxs: Auxiliaries aligned index-for-index with
                ``test_classes``; only read for verbose output.
            v: When true, print the sentence and the auxiliary behind every
                true positive, false negative and false positive.

        Raises:
            Exception: If the number of predictions differs from the number
                of gold labels.
        """
        # Identity checks: ``== None`` compares element-wise when an array is
        # passed, which makes the ``if`` itself fail.
        if test_classes is None:
            test_classes = self.test_classes

        verbose = bool(v)
        if verbose and test_auxs is None:
            # Without the auxiliaries there is nothing to show per example;
            # degrade gracefully instead of failing on ``test_auxs[i]``.
            print("Verbose output requested but no test_auxs were given; skipping per-example details.")
            verbose = False

        if len(self.predictions) != len(test_classes):
            raise Exception("The number of test vectors != the number of test classes!")

        positive = vc.bool_to_int(True)
        negative = vc.bool_to_int(False)

        # Floats keep the precision/recall divisions below true divisions
        # under Python 2 as well.
        counts = {"tp": 0.0, "fp": 0.0, "fn": 0.0}
        result_vector = []
        for i, (gold, predicted) in enumerate(zip(test_classes, self.predictions)):
            if gold == positive and predicted == positive:
                outcome = "tp"
            elif gold == positive and predicted == negative:
                outcome = "fn"
            elif gold == negative and predicted == positive:
                outcome = "fp"
            else:
                # True negatives (and unrecognised labels) are not recorded.
                continue

            result_vector.append((outcome, i))
            if verbose:
                aux = test_auxs[i]
                sent = self.sentences.get_sentence(aux.sentnum)
                # Single-argument print(...) emits the same text under the
                # Python 2 print statement and the Python 3 function.
                print("%s %s %s" % (outcome.upper(), sent.file, sent))
                print("%s \n" % (aux,))
            counts[outcome] += 1

        tp, fp, fn = counts["tp"], counts["fp"], counts["fn"]
        precision = tp / (tp + fp) if tp + fp else 0.0
        recall = tp / (tp + fn) if tp + fn else 0.0

        if precision == 0.0 or recall == 0.0:
            f1 = 0.0
        else:
            f1 = 2 * precision * recall / (precision + recall)

        print('\nResults from applying "%s" on the %s set.' % (name, set_name))
        print("TP: %d, FP: %d, FN: %d" % (tp, fp, fn))
        print("Precision: %0.3f" % precision)
        print("Recall: %0.3f" % recall)
        print("F1: %0.3f\n" % f1)

        result_vector += [("precision", precision), ("recall", recall), ("f1", f1)]
        self.result_vector = result_vector
コード例 #2
0
ファイル: new_vpesvm.py プロジェクト: kiankd/vpe
    def make_feature_vectors(self, make_test_vectors=True, make_train_vectors=True, use_old_vectors=False):
        """Build sparse feature vectors and labels for the train/test sections.

        Every auxiliary in ``self.all_auxiliaries`` whose sentence section lies
        within ``[self.start_train, self.end_train]`` contributes a training
        vector, and every one within ``[self.start_test, self.end_test]``
        contributes a test vector. Labels come from ``aux.is_trigger``.

        Args:
            make_test_vectors: Reset and rebuild ``self.test_vectors`` and
                ``self.test_classes``.
            make_train_vectors: Reset and rebuild ``self.train_vectors`` and
                ``self.train_classes``.
            use_old_vectors: Forwarded to ``vc.make_vector`` as ``make_old``.

        Also records ``self.pre_oversample_length`` as the number of training
        vectors present when the method finishes.
        """
        if make_train_vectors:
            self.train_vectors = []
            self.train_classes = []
        if make_test_vectors:
            self.test_vectors = []
            self.test_classes = []

        names = self.file_names
        frequent_words = names.extract_data_from_file(names.EACH_UNIQUE_WORD_NEAR_AUX)
        all_pos = names.extract_data_from_file(names.EACH_UNIQUE_POS_FILE)
        pos_bigrams = wc.pos_bigrams(all_pos)

        def sparse_vector(sentence, auxiliary):
            # One feature vector for this auxiliary, stored in sparse form.
            dense = vc.make_vector(
                sentence,
                auxiliary,
                self.features,
                vpe.ALL_CATEGORIES,
                vpe.AUX_LEMMAS,
                vpe.ALL_AUXILIARIES,
                frequent_words,
                all_pos,
                pos_bigrams,
                make_old=use_old_vectors,
            )
            return csr_matrix(dense)

        for aux in self.all_auxiliaries:
            sentdict = self.sentences.get_sentence(aux.sentnum)

            if make_train_vectors:
                if self.start_train <= sentdict.get_section() <= self.end_train:
                    self.train_vectors.append(sparse_vector(sentdict, aux))
                    self.train_classes.append(vc.bool_to_int(aux.is_trigger))

                    # Progress message on the first vector and every 1000th.
                    built = len(self.train_vectors)
                    if built == 1 or built % 1000 == 0:
                        print("Making the %dth training vector..." % built)

            if make_test_vectors:
                if self.start_test <= sentdict.get_section() <= self.end_test:
                    self.test_vectors.append(sparse_vector(sentdict, aux))
                    self.test_classes.append(vc.bool_to_int(aux.is_trigger))

                    built = len(self.test_vectors)
                    if built == 1 or built % 1000 == 0:
                        print("Making the %dth testing vector..." % built)

        self.pre_oversample_length = len(self.train_vectors)
コード例 #3
0
ファイル: new_vpesvm.py プロジェクト: kiankd/vpe
    def oversample(self, multiplier=None):
        """Duplicate positive training examples to reduce class imbalance.

        Rebuilds ``self.train_vectors`` and ``self.train_classes`` so that each
        positive example appears ``multiplier`` times; every other example is
        kept once and labelled negative.

        Args:
            multiplier: Number of copies of each positive example. When
                omitted (or 0), the floor of negatives / positives in
                ``self.train_classes`` is used, never less than 1.
        """
        positive = vc.bool_to_int(True)
        negative = vc.bool_to_int(False)

        if not multiplier:
            num_positive = self.train_classes.count(positive)
            num_negative = self.train_classes.count(negative)
            if num_positive:
                # Explicit floor division, clamped to 1: when positives
                # outnumber negatives the ratio floors to 0, which would
                # silently drop every positive example.
                multiplier = max(1, num_negative // num_positive)
            else:
                # No positives to duplicate; avoid dividing by zero.
                multiplier = 1

        print("Oversampling by x%d" % multiplier)

        new_features = []
        new_classes = []
        for vector, label in zip(self.train_vectors, self.train_classes):
            if label == positive:
                new_features.extend([vector] * multiplier)
                new_classes.extend([positive] * multiplier)
            else:
                new_features.append(vector)
                new_classes.append(negative)

        self.train_vectors = new_features
        self.train_classes = new_classes
コード例 #4
0
    def oversample(self, multiplier=None):
        """Duplicate positive training examples to reduce class imbalance.

        Rebuilds ``self.train_vectors`` and ``self.train_classes`` so that each
        positive example appears ``multiplier`` times; every other example is
        kept once and labelled negative.

        Args:
            multiplier: Number of copies of each positive example. When
                omitted (or 0), the floor of negatives / positives in
                ``self.train_classes`` is used, never less than 1.
        """
        positive = vc.bool_to_int(True)
        negative = vc.bool_to_int(False)

        if not multiplier:
            num_positive = self.train_classes.count(positive)
            num_negative = self.train_classes.count(negative)
            if num_positive:
                # Explicit floor division, clamped to 1: when positives
                # outnumber negatives the ratio floors to 0, which would
                # silently drop every positive example.
                multiplier = max(1, num_negative // num_positive)
            else:
                # No positives to duplicate; avoid dividing by zero.
                multiplier = 1

        print('Oversampling by x%d' % multiplier)

        new_features = []
        new_classes = []
        for vector, label in zip(self.train_vectors, self.train_classes):
            if label == positive:
                new_features.extend([vector] * multiplier)
                new_classes.extend([positive] * multiplier)
            else:
                new_features.append(vector)
                new_classes.append(negative)

        self.train_vectors = new_features
        self.train_classes = new_classes
コード例 #5
0
ファイル: new_vpesvm.py プロジェクト: kiankd/vpe
    def test_my_rules(self, original_rules=False, idxs=None):
        """Run the hand-written rules over the auxiliaries after the training range.

        Resets ``self.predictions`` and, for each auxiliary index from
        ``self.pre_oversample_length`` up to ``len(self.all_auxiliaries) - 1``,
        appends the rule's verdict converted with ``vc.bool_to_int``.

        Args:
            original_rules: When true, use the ``dv.*check`` functions, which
                receive the auxiliary's word index; otherwise use the
                ``wc.*_rule`` functions, which receive the auxiliary itself.
            idxs: Optional container of auxiliary indices; when given, only
                those indices are evaluated.

        Note:
            An auxiliary whose ``type`` is not one of modal/be/have/do/so/to
            gets no prediction, so ``self.predictions`` may be shorter than
            the number of auxiliaries visited.
        """
        self.predictions = []
        print(
            "Length of test set: %d, length of All_auxs-training vectors: %d"
            % (len(self.test_classes), len(self.all_auxiliaries) - len(self.train_vectors))
        )

        # Dispatch tables keyed by auxiliary type. Lambdas keep the lookup of
        # each rule function lazy, as in an if/elif chain.
        if original_rules:
            rules = {
                "modal": lambda s, a, t, p: dv.modalcheck(s, a.wordnum, t, p),
                "be": lambda s, a, t, p: dv.becheck(s, a.wordnum, t, p),
                "have": lambda s, a, t, p: dv.havecheck(s, a.wordnum, t, p),
                "do": lambda s, a, t, p: dv.docheck(s, a.wordnum, t, p),
                "so": lambda s, a, t, p: dv.socheck(s, a.wordnum, t, p),
                "to": lambda s, a, t, p: dv.tocheck(s, a.wordnum, t, p),
            }
        else:
            # Only the modal and do rules take the tree and subtree positions.
            rules = {
                "modal": lambda s, a, t, p: wc.modal_rule(s, a, t, p),
                "be": lambda s, a, t, p: wc.be_rule(s, a),
                "have": lambda s, a, t, p: wc.have_rule(s, a),
                "do": lambda s, a, t, p: wc.do_rule(s, a, t, p),
                "so": lambda s, a, t, p: wc.so_rule(s, a),
                "to": lambda s, a, t, p: wc.to_rule(s, a),
            }

        for i in range(self.pre_oversample_length, len(self.all_auxiliaries)):
            # Identity test: ``idxs == None`` compares element-wise when an
            # array is passed, which makes the condition itself fail.
            if idxs is not None and i not in idxs:
                continue

            aux = self.all_auxiliaries.get_aux(i)
            sendict = self.sentences.get_sentence(aux.sentnum)
            tree = sendict.get_nltk_tree()
            word_subtree_positions = nt.get_smallest_subtree_positions(tree)

            rule = rules.get(aux.type)
            if rule is not None:
                verdict = rule(sendict, aux, tree, word_subtree_positions)
                self.predictions.append(vc.bool_to_int(verdict))
コード例 #6
0
    def results(self,
                name,
                set_name='Test',
                test_classes=None,
                test_auxs=None,
                v=False):
        """Score ``self.predictions`` against gold labels and print a report.

        Counts true positives, false positives and false negatives, derives
        precision, recall and F1 from them, prints the figures, and stores the
        per-example outcomes plus the three metrics in ``self.result_vector``.

        Args:
            name: Label of the classifier or rule set, used in the report.
            set_name: Label of the evaluated data set, used in the report.
            test_classes: Gold labels, one per prediction. Defaults to
                ``self.test_classes``.
            test_auxs: Auxiliaries aligned index-for-index with
                ``test_classes``; only read for verbose output.
            v: When true, print the sentence and the auxiliary behind every
                true positive, false negative and false positive.

        Raises:
            Exception: If the number of predictions differs from the number
                of gold labels.
        """
        # Identity checks: ``== None`` compares element-wise when an array is
        # passed, which makes the ``if`` itself fail.
        if test_classes is None:
            test_classes = self.test_classes

        verbose = bool(v)
        if verbose and test_auxs is None:
            # Without the auxiliaries there is nothing to show per example;
            # degrade gracefully instead of failing on ``test_auxs[i]``.
            print('Verbose output requested but no test_auxs were given; '
                  'skipping per-example details.')
            verbose = False

        if len(self.predictions) != len(test_classes):
            raise Exception(
                'The number of test vectors != the number of test classes!')

        positive = vc.bool_to_int(True)
        negative = vc.bool_to_int(False)

        # Floats keep the precision/recall divisions below true divisions
        # under Python 2 as well.
        counts = {'tp': 0.0, 'fp': 0.0, 'fn': 0.0}
        result_vector = []
        for i, (gold, predicted) in enumerate(
                zip(test_classes, self.predictions)):
            if gold == positive and predicted == positive:
                outcome = 'tp'
            elif gold == positive and predicted == negative:
                outcome = 'fn'
            elif gold == negative and predicted == positive:
                outcome = 'fp'
            else:
                # True negatives (and unrecognised labels) are not recorded.
                continue

            result_vector.append((outcome, i))
            if verbose:
                aux = test_auxs[i]
                sent = self.sentences.get_sentence(aux.sentnum)
                # Single-argument print(...) emits the same text under the
                # Python 2 print statement and the Python 3 function.
                print('%s %s %s' % (outcome.upper(), sent.file, sent))
                print('%s \n' % (aux,))
            counts[outcome] += 1

        tp, fp, fn = counts['tp'], counts['fp'], counts['fn']
        precision = tp / (tp + fp) if tp + fp else 0.0
        recall = tp / (tp + fn) if tp + fn else 0.0

        if precision == 0.0 or recall == 0.0:
            f1 = 0.0
        else:
            f1 = 2 * precision * recall / (precision + recall)

        print('\nResults from applying \"%s\" on the %s set.' % (name,
                                                                 set_name))
        print('TP: %d, FP: %d, FN: %d' % (tp, fp, fn))
        print('Precision: %0.3f' % precision)
        print('Recall: %0.3f' % recall)
        print('F1: %0.3f\n' % f1)

        result_vector += [('precision', precision), ('recall', recall),
                          ('f1', f1)]
        self.result_vector = result_vector
コード例 #7
0
    def test_my_rules(self, original_rules=False, idxs=None):
        """Run the hand-written rules over the auxiliaries after the training range.

        Resets ``self.predictions`` and, for each auxiliary index from
        ``self.pre_oversample_length`` up to ``len(self.all_auxiliaries) - 1``,
        appends the rule's verdict converted with ``vc.bool_to_int``.

        Args:
            original_rules: When true, use the ``dv.*check`` functions, which
                receive the auxiliary's word index; otherwise use the
                ``wc.*_rule`` functions, which receive the auxiliary itself.
            idxs: Optional container of auxiliary indices; when given, only
                those indices are evaluated.

        Note:
            An auxiliary whose ``type`` is not one of modal/be/have/do/so/to
            gets no prediction, so ``self.predictions`` may be shorter than
            the number of auxiliaries visited.
        """
        self.predictions = []
        print('Length of test set: %d, length of All_auxs-training vectors: %d' % (
            len(self.test_classes),
            len(self.all_auxiliaries) - len(self.train_vectors)))

        # Dispatch tables keyed by auxiliary type. Lambdas keep the lookup of
        # each rule function lazy, as in an if/elif chain.
        if original_rules:
            rules = {
                'modal': lambda s, a, t, p: dv.modalcheck(s, a.wordnum, t, p),
                'be': lambda s, a, t, p: dv.becheck(s, a.wordnum, t, p),
                'have': lambda s, a, t, p: dv.havecheck(s, a.wordnum, t, p),
                'do': lambda s, a, t, p: dv.docheck(s, a.wordnum, t, p),
                'so': lambda s, a, t, p: dv.socheck(s, a.wordnum, t, p),
                'to': lambda s, a, t, p: dv.tocheck(s, a.wordnum, t, p),
            }
        else:
            # Only the modal and do rules take the tree and subtree positions.
            rules = {
                'modal': lambda s, a, t, p: wc.modal_rule(s, a, t, p),
                'be': lambda s, a, t, p: wc.be_rule(s, a),
                'have': lambda s, a, t, p: wc.have_rule(s, a),
                'do': lambda s, a, t, p: wc.do_rule(s, a, t, p),
                'so': lambda s, a, t, p: wc.so_rule(s, a),
                'to': lambda s, a, t, p: wc.to_rule(s, a),
            }

        for i in range(self.pre_oversample_length, len(self.all_auxiliaries)):
            # Identity test: ``idxs == None`` compares element-wise when an
            # array is passed, which makes the condition itself fail.
            if idxs is not None and i not in idxs:
                continue

            aux = self.all_auxiliaries.get_aux(i)
            sendict = self.sentences.get_sentence(aux.sentnum)
            tree = sendict.get_nltk_tree()
            word_subtree_positions = nt.get_smallest_subtree_positions(
                tree)

            rule = rules.get(aux.type)
            if rule is not None:
                verdict = rule(sendict, aux, tree, word_subtree_positions)
                self.predictions.append(vc.bool_to_int(verdict))
コード例 #8
0
    def make_feature_vectors(self,
                             make_test_vectors=True,
                             make_train_vectors=True,
                             use_old_vectors=False):
        """Build sparse feature vectors and labels for the train/test sections.

        Every auxiliary in ``self.all_auxiliaries`` whose sentence section lies
        within ``[self.start_train, self.end_train]`` contributes a training
        vector, and every one within ``[self.start_test, self.end_test]``
        contributes a test vector. Labels come from ``aux.is_trigger``.

        Args:
            make_test_vectors: Reset and rebuild ``self.test_vectors`` and
                ``self.test_classes``.
            make_train_vectors: Reset and rebuild ``self.train_vectors`` and
                ``self.train_classes``.
            use_old_vectors: Forwarded to ``vc.make_vector`` as ``make_old``.

        Also records ``self.pre_oversample_length`` as the number of training
        vectors present when the method finishes.
        """
        if make_train_vectors:
            self.train_vectors = []
            self.train_classes = []
        if make_test_vectors:
            self.test_vectors = []
            self.test_classes = []

        names = self.file_names
        frequent_words = names.extract_data_from_file(
            names.EACH_UNIQUE_WORD_NEAR_AUX)
        all_pos = names.extract_data_from_file(names.EACH_UNIQUE_POS_FILE)
        pos_bigrams = wc.pos_bigrams(all_pos)

        def sparse_vector(sentence, auxiliary):
            # One feature vector for this auxiliary, stored in sparse form.
            dense = vc.make_vector(sentence,
                                   auxiliary,
                                   self.features,
                                   vpe.ALL_CATEGORIES,
                                   vpe.AUX_LEMMAS,
                                   vpe.ALL_AUXILIARIES,
                                   frequent_words,
                                   all_pos,
                                   pos_bigrams,
                                   make_old=use_old_vectors)
            return csr_matrix(dense)

        for aux in self.all_auxiliaries:
            sentdict = self.sentences.get_sentence(aux.sentnum)

            if make_train_vectors:
                if self.start_train <= sentdict.get_section() <= self.end_train:
                    self.train_vectors.append(sparse_vector(sentdict, aux))
                    self.train_classes.append(
                        vc.bool_to_int(aux.is_trigger))

                    # Progress message on the first vector and every 1000th.
                    built = len(self.train_vectors)
                    if built == 1 or built % 1000 == 0:
                        print('Making the %dth training vector...' % built)

            if make_test_vectors:
                if self.start_test <= sentdict.get_section() <= self.end_test:
                    self.test_vectors.append(sparse_vector(sentdict, aux))
                    self.test_classes.append(vc.bool_to_int(aux.is_trigger))

                    built = len(self.test_vectors)
                    if built == 1 or built % 1000 == 0:
                        print('Making the %dth testing vector...' % built)

        self.pre_oversample_length = len(self.train_vectors)