Example #1
File: forms.py  Project: XI-lab/axel
    def build_features(article):
        from collections import defaultdict

        import networkx as nx
        from axel.stats.scores import compress_pos_tag

        # Map every DBpedia graph node to the size of its connected component
        component_size_dict = defaultdict(lambda: 0)
        dbpedia_graph = article.dbpedia_graph(redirects=True)
        dblp_component_set = set()
        for component in nx.connected_components(dbpedia_graph):
            nodes = [node for node in component if 'Category' not in node]
            stats_ngrams = article.CollocationModel.COLLECTION_MODEL.objects.filter(ngram__in=nodes)
            is_dblp_inside = any('dblp' in ngram.source for ngram in stats_ngrams)
            if is_dblp_inside:
                dblp_component_set.update(nodes)
            comp_len = len(nodes)
            for node in component:
                component_size_dict[node] = comp_len

        features = []
        for colloc in article.CollocationModel.objects.filter(article=article):
            max_pos_tag = colloc.max_pos_tag
            collection_ngram = colloc.COLLECTION_MODEL.objects.get(ngram=colloc.ngram)
            pos_tag_start = str(compress_pos_tag(max_pos_tag, RULES_DICT_START))
            pos_tag_end = str(compress_pos_tag(max_pos_tag, RULES_DICT_END))
            # Feature vector: DBLP component membership, DBLP source flag,
            # component size, and compressed POS-tag start/end indicators
            feature = [
                int(colloc.ngram in dblp_component_set),
                'dblp' in collection_ngram.source,
                component_size_dict[colloc.ngram],
                int('NN_STARTS' == pos_tag_start),
                int('JJ_STARTS' == pos_tag_start),
                int('NN_ENDS' == pos_tag_end),
                int('VB_ENDS' == pos_tag_end),
            ]
            features.append((colloc.ngram, feature))
        return features
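A minimal usage sketch for the function above (hypothetical and not part of the original file; the import path and primary key are illustrative assumptions):

# Hypothetical usage: build and inspect per-ngram feature vectors for one article
from axel.articles.models import Article  # assumed import path, for illustration only

article = Article.objects.get(pk=1)             # any existing Article instance
ngram_features = dict(build_features(article))  # maps ngram -> feature vector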
Example #2
File: views.py  Project: XI-lab/axel
    def get_context_data(self, **kwargs):
        """Add nodes and links to the context"""
        context = super(NgramPOSView, self).get_context_data(**kwargs)
        model = _get_model_from_string(self.kwargs["model_name"])
        ct = ContentType.objects.get_for_model(model, for_concrete_model=False)

        rules_dict = self._parse_rules()

        relevant_ids = set(
            TaggedCollection.objects.filter(content_type=ct, is_relevant=True).values_list("object_id", flat=True)
        )
        irrelevant_ids = set(
            TaggedCollection.objects.filter(content_type=ct, is_relevant=False).values_list("object_id", flat=True)
        )

        all_tags = set()

        correct_data = defaultdict(lambda: 0)
        incorrect_data = defaultdict(lambda: 0)
        unjudged_data = defaultdict(lambda: 0)
        for obj in model.objects.all():
            # obj.pos_tag holds (tag sequence, count) pairs; keep the most
            # frequent sequence and compress it with the parsed rules
            max_pos_tag = " ".join(max(obj.pos_tag, key=lambda x: x[1])[0])
            tag = str(scores.compress_pos_tag(max_pos_tag, rules_dict))
            all_tags.add(tag)
            if obj.id in relevant_ids:
                correct_data[tag] += 1
            elif obj.id in irrelevant_ids:
                incorrect_data[tag] += 1
            else:
                unjudged_data[tag] += 1

        all_tags = sorted(all_tags)
        context["categories"] = all_tags
        context["top_relevant_data"] = [
            (tag, correct_data[tag], incorrect_data[tag], unjudged_data[tag])
            for tag in all_tags
            if int(correct_data[tag] / (incorrect_data[tag] + 0.1)) > 10
        ]

        context["top_irrelevant_data"] = [
            (tag, correct_data[tag], incorrect_data[tag], unjudged_data[tag])
            for tag in all_tags
            if int(incorrect_data[tag] / (correct_data[tag] + 0.1)) > 10
        ]

        context["correct_data"] = [correct_data[tag] for tag in all_tags]
        context["incorrect_data"] = [incorrect_data[tag] for tag in all_tags]
        context["unjudged_data"] = [unjudged_data[tag] for tag in all_tags]

        # Add regex groups to populate forms
        context["regex_groups"] = self.regex_groups

        return context
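The core of the loop above is picking the most frequent POS-tag sequence for each object. A self-contained sketch of that selection step, using toy data shaped like the `pos_tag` field the code implies (a list of (tag sequence, count) pairs):

# Toy data shaped like obj.pos_tag: (tag sequence, occurrence count) pairs
pos_tag = [(('NN', 'NN'), 3), (('JJ', 'NN'), 7), (('VB', 'NN'), 1)]
# Pick the sequence with the highest count and join it into a single string
max_pos_tag = " ".join(max(pos_tag, key=lambda x: x[1])[0])
# max_pos_tag is now "JJ NN"; it would then be compressed via compress_pos_tag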
Example #3
File: ml_classify.py  Project: XI-lab/axel
    def fit_ml_algo(self, scored_ngrams, cv_num):
        """
        :param scored_ngrams: list of tuple of type (ngram, score) after initial scoring
        """
        # 1. Calculate scores with float numbers for ngram bindings, as a dict
        collection = []
        collection_labels = []
        # Per-article maps: node -> connected-component size, and the set of nodes
        # sharing a component with at least one DBLP-sourced ngram
        component_size_dict = {}
        dblp_component_dict = defaultdict(set)

        # Calculate max pos tag count and build pos_tag_list
        start_pos_tag_list = []
        end_pos_tag_list = []
        pos_tag_list = []

        # Populate component size dict
        for article in Article.objects.filter(cluster_id=self.cluster_id):
            temp_dict = defaultdict(lambda: 0)
            dbpedia_graph = article.dbpedia_graph(redirects=self.redirects)
            for component in nx.connected_components(dbpedia_graph):
                nodes = [node for node in component if 'Category' not in node]
                stats_ngrams = self.StatsModel.objects.filter(ngram__in=nodes)
                is_dblp_inside = any('dblp' in ngram.source for ngram in stats_ngrams)
                # ScienceWISE
                #is_dblp_inside = bool([True for ngram in stats_ngrams if ngram.is_ontological])
                if is_dblp_inside:
                    dblp_component_dict[article.id].update(nodes)
                comp_len = len(nodes)
                for node in component:
                    temp_dict[node] = comp_len
            component_size_dict[article.id] = temp_dict

        if self.global_pos_tag:
            queryset = self.StatsModel.objects.all()
        else:
            queryset = self.Model.objects.all()
        # Enumerate the distinct (compressed) POS tags to size the one-hot encodings
        for ngram in queryset:
            max_pos_tag = ngram.max_pos_tag
            pos_tag_start = str(compress_pos_tag(max_pos_tag, RULES_DICT_START))
            pos_tag_end = str(compress_pos_tag(max_pos_tag, RULES_DICT_END))
            if pos_tag_start not in start_pos_tag_list:
                start_pos_tag_list.append(pos_tag_start)
            if pos_tag_end not in end_pos_tag_list:
                end_pos_tag_list.append(pos_tag_end)
            if max_pos_tag not in pos_tag_list:
                pos_tag_list.append(max_pos_tag)
        max_pos_tag_start_len = len(start_pos_tag_list)
        max_pos_tag_end_len = len(end_pos_tag_list)
        max_pos_tag_len = len(pos_tag_list)

        # 2. Iterate through all ngrams, add scores - POS tag (to number), DBLP, DBPEDIA, IS_REL
        for article, score_dict in scored_ngrams:
            ngram = score_dict['ngram']
            collection_ngram = score_dict['collection_ngram']

            # POS TAG enumeration
            if self.global_pos_tag:
                max_pos_tag = collection_ngram.max_pos_tag
                pos_tag_prev = collection_ngram.pos_tag_prev
                pos_tag_after = collection_ngram.pos_tag_after
            else:
                max_pos_tag = ngram.max_pos_tag
                pos_tag_prev = ngram.pos_tag_prev
                pos_tag_after = ngram.pos_tag_after
            pos_tag_start = str(compress_pos_tag(max_pos_tag, RULES_DICT_START))
            pos_tag_end = str(compress_pos_tag(max_pos_tag, RULES_DICT_END))

            pos_tag_extra = set([' '.join(set(tags)) for tags in zip(*ngram.pos_tag)[0]])  # note: unused below

            wiki_edges_count = len(article.wikilinks_graph.edges([ngram.ngram]))
            feature = [
                ngram.ngram in article.wiki_text_index,
                ngram.ngram in dblp_component_dict[article.id],
                ngram.ngram.isupper(),
                'dblp' in collection_ngram.source,
                component_size_dict[article.id][ngram.ngram],
                wiki_edges_count,
                #collection_ngram.is_ontological,
                #'dbpedia' in collection_ngram.source,
                'wiki_redirect' in collection_ngram.source,
                bool({'.', ',', ':', ';'}.intersection(zip(*pos_tag_prev)[0])),
                bool({'.', ',', ':', ';'}.intersection(zip(*pos_tag_after)[0])),
                len(ngram.ngram.split()),
                score_dict['participation_count']
            ]

            if not self.uncompressed:
                # One-hot encode the compressed POS-tag start and end categories
                extended_feature = [1 if i == start_pos_tag_list.index(pos_tag_start) else 0
                                    for i in range(max_pos_tag_start_len)]
                feature.extend(extended_feature)
                extended_feature = [1 if i == end_pos_tag_list.index(pos_tag_end) else 0
                                    for i in range(max_pos_tag_end_len)]
                feature.extend(extended_feature)
            else:
                # One-hot encode the full (uncompressed) POS tag
                extended_feature = [1 if i == pos_tag_list.index(max_pos_tag) else 0 for i in
                                    range(max_pos_tag_len)]
                feature.extend(extended_feature)
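            # Worked example of the one-hot extension above: if start_pos_tag_list is
            # ['NN_STARTS', 'JJ_STARTS', 'VB_STARTS'] and pos_tag_start == 'JJ_STARTS',
            # the feature vector is extended with [0, 1, 0] (illustrative values)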

            collection.append(feature)
            collection_labels.append(score_dict['is_rel'])

        feature_names = [
            'is_wiki_text',
            'dblp_inside',
            'is_upper',
            'dblp',
            'comp_size',
            'wikilinks',
            #'ScienceWISE',
            #'is_wiki',
            'is_redirect',
            'punkt_prev',
            'punkt_after',
            'word_len',
            'part_count'
        ]
        if not self.uncompressed:
            feature_names.extend(start_pos_tag_list)
            feature_names.extend(end_pos_tag_list)
        else:
            feature_names.extend(pos_tag_list)

        from sklearn.ensemble import ExtraTreesClassifier
        # Rank features by importance and keep only the most informative ones
        e_clf = ExtraTreesClassifier(random_state=0, compute_importances=True, n_estimators=100)
        new_collection = e_clf.fit(collection, collection_labels).transform(collection)
        print sorted(zip(list(e_clf.feature_importances_), feature_names), key=lambda x: x[0],
                     reverse=True)[:new_collection.shape[1]]
        print new_collection.shape

        datas = []
        for depth, min_split in ((5, 50), (5, 100), (5, 200), (3, 50), (3, 100), (3, 200)):
            print 'Parameters: depth {0}, split {1}'.format(depth, min_split)
            clf = DecisionTreeClassifier(max_depth=depth, min_samples_split=min_split)
            #for tag, values in pos_tag_counts.iteritems():
            #    print tag, values[1]/values[0]
            # clf.fit(new_collection, collection_labels)
            #import StringIO, pydot
            #from sklearn import tree
            #dot_data = StringIO.StringIO()
            #tree.export_graphviz(clf, out_file=dot_data, feature_names=feature_names)
            #graph = pydot.graph_from_dot_data(dot_data.getvalue())
            #graph.write_pdf("decision.pdf")
            #
            # for i, vector in enumerate(collection):
            #     value = clf.predict(vector)[0]
            #     if value != collection_labels[i] and value:
            #         print scored_ngrams[i][1]['ngram'], vector, value, collection_labels[i]

            # K-fold cross-validation
            print 'Performing cross validation'
            scores = cross_validation.cross_val_score(clf, new_collection, np.array(collection_labels),
                                                      cv=cv_num, score_func=precision_score)
            print("Precision: %0.4f (+/- %0.4f)" % (scores.mean(), scores.std() / 2))
            #print "Precision full scores (for t-test:): ", '\n'.join([str(score) for score in scores])
            scores = cross_validation.cross_val_score(clf, new_collection, np.array(collection_labels),
                                                      cv=cv_num, score_func=recall_score)
            print("Recall: %0.4f (+/- %0.4f)" % (scores.mean(), scores.std() / 2))
            #print "Recall full scores (for t-test:): ", '\n'.join([str(score) for score in scores])
            scores = cross_validation.cross_val_score(clf, new_collection, np.array(collection_labels),
                                                      cv=cv_num, score_func=f1_score)
            # TODO: update recall with full collection labels
            print("F1: %0.4f (+/- %0.4f)" % (scores.mean(), scores.std() / 2))
            #print "F1 full scores (for t-test:): ", '\n'.join([str(score) for score in scores])

            data = {'f1': scores.mean(), 'min_split': min_split, 'depth': depth}

            scores = cross_validation.cross_val_score(clf, new_collection, np.array(collection_labels),
                                                      cv=cv_num)
            print("Accuracy: %0.4f (+/- %0.4f)" % (scores.mean(), scores.std() / 2))
            datas.append(data)

        # Pick the parameter combination with the best F1 score
        max_data = max(datas, key=lambda data: data['f1'])
        print 'Best result:'
        print max_data
        print
        # Refit on the full collection with the best parameters and persist the model
        clf = DecisionTreeClassifier(max_depth=max_data['depth'],
                                     min_samples_split=max_data['min_split'])
        clf.fit(new_collection, collection_labels)
        pickle.dump(clf, open('ngram_clf.pcl', 'wb'))
        return max_data
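Note that this snippet targets an old scikit-learn API (`sklearn.cross_validation`, `score_func=`, `compute_importances=`). A rough sketch of the same cross-validation scoring with the current API, assuming `X` and `y` stand in for `new_collection` and `collection_labels`:

# Rough modern-sklearn equivalent of the cross-validation block above (sketch only)
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier

def evaluate(X, y, depth, min_split, cv_num):
    """Print precision/recall/F1/accuracy for one (depth, min_split) combination."""
    clf = DecisionTreeClassifier(max_depth=depth, min_samples_split=min_split)
    for metric in ('precision', 'recall', 'f1', 'accuracy'):
        scores = cross_val_score(clf, X, np.array(y), cv=cv_num, scoring=metric)
        print("%s: %0.4f (+/- %0.4f)" % (metric, scores.mean(), scores.std() / 2))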