Example #1
def linked_score(collection_ngram, ngram, text, article_dict, ngram_abs_count, corr_dict1=None,
                 corr_dict2=None, score_func='weight_both_ngram4'):
    """
    :type collection_ngram: Collocation
    :type ngram: ArticleCollocation
    :type text: unicode
    """
    ngram = ngram.ngram
    nb = NgramBindings(ngram, text, corr_dict1=corr_dict1, corr_dict2=corr_dict2)
    if len(ngram.split()) == 2:
        score = getattr(nb, score_func)()
    else:
        smaller_ngrams = set(build_ngram_index(ngram).keys()).intersection(article_dict.keys())
        # select max split combination
        if smaller_ngrams:
            if len(smaller_ngrams) == 1:
                smaller_ngram = smaller_ngrams.pop()
                values = article_dict[smaller_ngram]
                score = values['score'] * ngram_abs_count / values['abs_count']
                # discount the consumed ngram's score by the amount reused here
                article_dict[smaller_ngram]['score'] = values['score'] - score
                score = (score + getattr(nb, score_func)(split_ngram=smaller_ngram)) / 2
            else:
                score = 0
                smaller_ngrams = sorted(smaller_ngrams, key=lambda x: len(x.split()), reverse=True)
                for smaller_ngram in smaller_ngrams:
                    values = article_dict[smaller_ngram]
                    local_score = values['score'] * ngram_abs_count / values['abs_count']
                    article_dict[smaller_ngram]['score'] = values['score'] - local_score
                    score += local_score
        # no smaller ngrams to reuse - score the full ngram directly
        else:
            score = getattr(nb, score_func)()
    return score, nb.ddict1, nb.ddict2
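
All of the examples on this page intersect the keys of build_ngram_index(...) with dictionaries keyed by ngram, and Example #4 serializes the result with json.dumps, so the helper is assumed to return a plain dict whose keys are the contiguous word sub-ngrams of its input. A minimal sketch of that assumed behaviour; build_ngram_index_sketch, min_n and max_n are hypothetical names, not part of axel:

from collections import defaultdict

def build_ngram_index_sketch(text, min_n=2, max_n=4):
    """Map every contiguous word ngram of length min_n..max_n in text to its count."""
    words = text.split()
    index = defaultdict(int)
    for n in range(min_n, max_n + 1):
        for start in range(len(words) - n + 1):
            index[' '.join(words[start:start + n])] += 1
    return dict(index)

# build_ngram_index_sketch('latent semantic analysis') ->
# {'latent semantic': 1, 'semantic analysis': 1, 'latent semantic analysis': 1}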
Example #2
    def get_context_data(self, **kwargs):
        """Add nodes and links to the context"""
        context = super(NgramParticipationView, self).get_context_data(**kwargs)
        # nodes are simply ngrams
        links = []
        irrel_ngrams = set(self.queryset.filter(tags__is_relevant=False).values_list("ngram", flat=True))
        rel_ngrams = set(self.queryset.filter(tags__is_relevant=True).values_list("ngram", flat=True))

        all_ngrams = list(self.queryset)
        # Sort from longest to shortest; this order is relied on when computing connections
        all_ngrams.sort(key=lambda x: len(x.ngram) + len(x.ngram.split()), reverse=True)

        ngrams_set = set(self.queryset.values_list("ngram", flat=True))
        participation_dict = defaultdict(list)
        for ngram_obj in all_ngrams:
            ngram = ngram_obj.ngram
            if ngram in participation_dict:
                for ngram_1 in participation_dict[ngram]:
                    links.append((ngram, ngram_1))
                # replace with current ngram
                for ngram_i in ngrams_set.intersection(build_ngram_index(ngram).keys()):
                    participation_dict[ngram_i] = [ngram]
            else:
                # append current ngram
                for ngram_i in ngrams_set.intersection(build_ngram_index(ngram).keys()):
                    participation_dict[ngram_i].append(ngram)

        # keep only nodes that participate in at least one link
        connected_nodes = list(set(zip(*links)[0]).union(set(zip(*links)[1])))
        node_dict = dict([(node, i) for i, node in enumerate(connected_nodes)])

        links = [{"source": node_dict[source], "target": node_dict[target]} for source, target in links]

        def _get_rel_info(ngram):
            if ngram in rel_ngrams:
                return 1
            elif ngram in irrel_ngrams:
                return -1
            return 0

        nodes = [{"name": ngram, "rel": _get_rel_info(ngram)} for ngram in connected_nodes]
        context["data"] = json.dumps({"nodes": nodes, "links": links})
        return context
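
The JSON payload handed to the template therefore has the nodes/links shape typical of a force-layout graph. An illustration of the structure produced above; the ngrams and their relevance tags are made-up placeholders:

# Illustrative only: structure of the string stored in context["data"].
example_payload = {
    "nodes": [
        {"name": "latent semantic analysis", "rel": 1},  # tagged relevant
        {"name": "semantic analysis", "rel": 0},         # untagged
    ],
    "links": [
        # shorter ngram (source) participates in the longer ngram (target)
        {"source": 1, "target": 0},
    ],
}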
Example #3
def populate_wiki_index(cls, cluster_id):
    import networkx as nx
    from axel.stats.models import STATS_CLUSTERS_DICT
    for article in print_progress(cls.objects.filter(cluster_id=cluster_id)):
        text = ''
        dbpedia_graph = article.dbpedia_graph(redirects=True)
        # take the largest connected component; newer networkx returns an unordered generator
        max_comp = max(nx.connected_components(dbpedia_graph), key=len)
        nodes = [node for node in max_comp if 'Category' not in node]
        stats_model = STATS_CLUSTERS_DICT[cluster_id]
        ngrams = stats_model.objects.filter(ngram__in=nodes)
        for ngram in ngrams:
            text += ngram.wikipedia_text + '\n'
        article.wiki_text_index = nlp.build_ngram_index(nlp.Stemmer.stem_wordnet(text))
        article.save()
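
Assuming populate_wiki_index is defined as a classmethod on the article model (Example #4 imports Article from axel.articles.models), rebuilding the wiki index for one cluster would look roughly like this; the cluster id is the one used in Example #4:

# Hypothetical usage, assuming the method above is a @classmethod on Article.
from axel.articles.models import Article

Article.populate_wiki_index('CS_COLLOCS')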
Example #4
def generate_temp_article(text):
    # TODO: make this Article class method
    from axel.articles.models import Article, Venue, TestCollocations
    import json
    venue = Venue.objects.get(acronym='SIGIR')
    stemmed_text = nlp.Stemmer.stem_wordnet(text)
    index = json.dumps(nlp.build_ngram_index(stemmed_text))
    article = Article(text=text, cluster_id='CS_COLLOCS', venue=venue, year=2013,
                      stemmed_text=stemmed_text, index=index)
    # TODO: extract title and abstract
    article.save_base(raw=True)
    article._create_collocations(True)
    for test_colloc in TestCollocations.objects.filter(article=article):
        obj = article.CollocationModel(ngram=test_colloc.ngram, count=test_colloc.count,
                                       article=article, total_count=0, extra_fields={})
        obj.save()
    TestCollocations.objects.filter(article=article).delete()
    return article
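
A minimal usage sketch of generate_temp_article; the input path is a placeholder and the file is assumed to contain plain UTF-8 text:

# Hypothetical usage; 'paper.txt' is a placeholder path.
with open('paper.txt') as f:
    article = generate_temp_article(f.read())
print(article.index)  # JSON-encoded ngram index built by nlp.build_ngram_index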