Beispiel #1
0
    def precompute_related(self, senses):
        """
        Precompute relatedness scores for every unordered pair of sense ids
        drawn from two different spots.

        @param senses: list of sense-id lists, one per spot.  The list is
                       rotated in place while iterating (pop front / append
                       back) and is left in a rotated order; its contents
                       are unchanged.
        @return: dict mapping the normalized pair (min_id, max_id) to
                 self.relatedness(min_id, max_id)
        """
        with profiled("Precomputing scores in %s"):
            scores = {}

            operation = 0
            # Rotating len(senses) - 1 times lets every group reach the head
            # of the list at least once via the groups before it, so every
            # cross-group pair is scored.  The previous bound of
            # len(senses) / 2 skipped every pair whose two groups both sat
            # in the second half of the list (e.g. with 3 groups, the pair
            # of groups 1 and 2 was never scored).
            maxoperation = len(senses) - 1

            while operation < maxoperation:
                spot_senses = senses.pop(0)

                for other_senses in senses:
                    for source in spot_senses:
                        for dest in other_senses:
                            # Normalize so the key always has a <= b;
                            # relatedness is assumed symmetric.
                            a, b = source, dest

                            if a > b:
                                a, b = b, a

                            if (a, b) in scores:
                                continue

                            scores[(a, b)] = self.relatedness(a, b)

                senses.append(spot_senses)
                operation += 1

            return scores
Beispiel #2
0
    def match(self, text, context=5, threshold=0.8):
        """
        Scan *text* for anchor candidates of up to *context* words.

        For every word position the widest window is probed first and then
        shrunk; the first (i.e. longest) window with surviving wiki titles
        wins for that start position.

        @param text: the raw input text
        @param context: maximum number of words per candidate anchor
        @param threshold: forwarded to filter_wiki_titles
        @return: dict mapping (begin, end) character offsets to the list of
                 matching wiki titles
        """
        anchors = {}

        with profiled("Annotated in %s"):
            text_words = extract_words(text)
            total = len(text_words)

            for start in range(total):
                # Longest window first; shrink until something matches.
                stop = min(start + context, total)

                while stop >= start:
                    words = [pair[0] for pair in text_words[start:stop]]
                    target = u' '.join(words)

                    # Only consider strings which are at least 3 characters long
                    if len(target) >= 3:
                        begin = text_words[start][1][0]
                        end = text_words[stop - 1][1][1]

                        #assert target == text[begin:end].lower()

                        candidates = self.trie.keys(target)
                        titles = list(self.filter_wiki_titles(
                            words, candidates, threshold=threshold))

                        if titles:
                            anchors[(begin, end)] = titles
                            # Hit: stop shrinking the window for this start.
                            break

                    stop -= 1

        return anchors
Beispiel #3
0
    def match(self, text, context=5, threshold=0.8):
        """
        Find candidate anchors in *text*: for every word position, probe
        windows of up to *context* words (longest first) against the trie
        and keep the titles surviving filter_wiki_titles.

        @param text: the raw input text
        @param context: maximum anchor length in words
        @param threshold: forwarded to filter_wiki_titles
        @return: {(begin, end): [wiki titles]} keyed by character offsets
        """
        found = {}

        with profiled("Annotated in %s"):
            text_words = extract_words(text)
            start = 0

            while start < len(text_words):
                # Probe the widest window first, then narrow it.
                stop = min(start + context, len(text_words))

                while stop >= start:
                    window = text_words[start:stop]
                    words = map(lambda item: item[0], window)
                    target = u' '.join(words)

                    # Only consider strings which are at least 3 characters long
                    if len(target) < 3:
                        stop -= 1
                        continue

                    begin = text_words[start][1][0]
                    end = text_words[stop - 1][1][1]

                    #assert target == text[begin:end].lower()

                    matching = self.filter_wiki_titles(
                        words, self.trie.keys(target), threshold=threshold)
                    wiki_titles = [title for title in matching]

                    if wiki_titles:
                        # Longest match wins for this start position.
                        found[(begin, end)] = wiki_titles
                        break

                    stop -= 1
                start += 1

        return found
Beispiel #4
0
    def precompute_related(self, senses):
        """
        Precompute relatedness scores for every unordered pair of sense ids
        drawn from two different spots.

        @param senses: list of sense-id lists, one per spot.  Rotated in
                       place while iterating; left in a rotated order with
                       contents unchanged.
        @return: dict mapping the normalized pair (min_id, max_id) to
                 self.relatedness(min_id, max_id)
        """
        with profiled("Precomputing scores in %s"):
            scores = {}

            operation = 0
            # len(senses) - 1 rotations guarantee every group meets every
            # other group while it sits at the head of the list.  The old
            # len(senses) / 2 bound never scored pairs whose two groups
            # both lay in the second half of the list (e.g. with 3 groups,
            # groups 1 and 2 were never paired).
            maxoperation = len(senses) - 1

            while operation < maxoperation:
                spot_senses = senses.pop(0)

                for other_senses in senses:
                    for source in spot_senses:
                        for dest in other_senses:
                            # Normalize so the key always has a <= b;
                            # relatedness is assumed symmetric.
                            a, b = source, dest

                            if a > b:
                                a, b = b, a

                            if (a, b) in scores:
                                continue

                            scores[(a, b)] = self.relatedness(a, b)

                senses.append(spot_senses)
                operation += 1

            return scores
Beispiel #5
0
    def annotate(self, text):
        with profiled("Disambiguated in %s"):
            matches = self.match(text)

            spots = []
            indices = {}
            for (start, stop), anchors in matches.items():
                if anchors[0] in self.stopwords:
                    print "Ignoring stopword", anchors[0], anchors
                    continue

                spots.append(anchors[0])
                indices[anchors[0]] = (start, stop)

            ret = {}
            results = self.disambig.disambiguate(spots)

            for spot in results:
                ret[spot] = results[spot]
                ret[spot]['indices'] = indices[spot]
                start, stop = indices[spot]
                ret[spot]['spot'] = text[start:stop]

            return [v for(k, v) in ret.items()]
Beispiel #6
0
    def annotate(self, text):
        with profiled("Disambiguated in %s"):
            matches = self.match(text)

            spots = []
            indices = {}
            for (start, stop), anchors in matches.items():
                if anchors[0] in self.stopwords:
                    print "Ignoring stopword", anchors[0], anchors
                    continue

                spots.append(anchors[0])
                indices[anchors[0]] = (start, stop)

            ret = {}
            results = self.disambig.disambiguate(spots)

            for spot in results:
                ret[spot] = results[spot]
                ret[spot]['indices'] = indices[spot]
                start, stop = indices[spot]
                ret[spot]['spot'] = text[start:stop]

            return [v for (k, v) in ret.items()]
Beispiel #7
0
    def disambiguate(self, spots):
        db = self.anchors_db

        allsenses = []

        with profiled("Disambiguation in %s"):
            # First we get all the page connected to each spot

            pages = defaultdict(list)
            lp = {}

            index = 0
            while index < len(spots):
                spot = spots[index]
                link_prob, senses = self.get_senses(spot)

                allsenses.append(map(lambda x: x[0], senses))

                if link_prob > 0:
                    lp[spot], pages[spot] = link_prob, senses
                    #print "Link probability", spot, link_prob
                    index += 1
                else:
                    del spots[index]
                    print "Removing spot", spot, link_prob

            scores = self.precompute_related(allsenses)

            # Then we remove each spot and append it to the end
            # so we can apply the voting scheme

            candidates = defaultdict(list)

            for spot in spots:
                sense_ids = pages.pop(spot)

                #print "Voting for spot", spot

                for p_a, pr_a in sense_ids:
                    score = 0
                    counter = 0

                    for counter, (p_b, pg_pr_b) in enumerate(pages.items()):
                        pg_b = map(lambda x: x[0], pg_pr_b)
                        prg_b = map(lambda x: x[1], pg_pr_b)

                        score += self.vote_for(p_a, pg_b, prg_b, scores)

                    score /= float(counter + 1)
                    #print "Vote to spot", spot, p_a, "is", score

                    print "Link probability", lp[spot]

                    rho = (score + lp[spot]) / 2.0
                    candidates[spot].append((pr_a, rho, p_a))

                pages[spot] = sense_ids

            winning = {}

            for spot, champion_list in candidates.items():
                print "Spot", spot, "has the following:"


                for pr_a, score, p_a in sorted(champion_list, reverse=True, key=lambda x: x[1]):
                    #if score < 0.05:
                    #    continue

                    print "\tProability:", pr_a, "Score:", score, "Page id:", p_a, "Title", self.catalog.get_title(p_a)

                _, rho, wiki_id = sorted(champion_list, reverse=True, key=lambda x: x[1])[0]

                print _, rho, wiki_id

                #if rho > 0.05:
                winning[spot] = {
                    "rho": rho,
                    "id": wiki_id,
                    "title": self.catalog.get_title(wiki_id),
                    "portals": self.catalog.get_portals(wiki_id),
                }

            return winning
Beispiel #8
0
    def disambiguate(self, spots):
        db = self.anchors_db

        allsenses = []

        with profiled("Disambiguation in %s"):
            # First we get all the page connected to each spot

            pages = defaultdict(list)
            lp = {}

            index = 0
            while index < len(spots):
                spot = spots[index]
                link_prob, senses = self.get_senses(spot)

                allsenses.append(map(lambda x: x[0], senses))

                if link_prob > 0:
                    lp[spot], pages[spot] = link_prob, senses
                    #print "Link probability", spot, link_prob
                    index += 1
                else:
                    del spots[index]
                    print "Removing spot", spot, link_prob

            scores = self.precompute_related(allsenses)

            # Then we remove each spot and append it to the end
            # so we can apply the voting scheme

            candidates = defaultdict(list)

            for spot in spots:
                sense_ids = pages.pop(spot)

                #print "Voting for spot", spot

                for p_a, pr_a in sense_ids:
                    score = 0
                    counter = 0

                    for counter, (p_b, pg_pr_b) in enumerate(pages.items()):
                        pg_b = map(lambda x: x[0], pg_pr_b)
                        prg_b = map(lambda x: x[1], pg_pr_b)

                        score += self.vote_for(p_a, pg_b, prg_b, scores)

                    score /= float(counter + 1)
                    #print "Vote to spot", spot, p_a, "is", score

                    print "Link probability", lp[spot]

                    rho = (score + lp[spot]) / 2.0
                    candidates[spot].append((pr_a, rho, p_a))

                pages[spot] = sense_ids

            winning = {}

            for spot, champion_list in candidates.items():
                print "Spot", spot, "has the following:"

                for pr_a, score, p_a in sorted(champion_list,
                                               reverse=True,
                                               key=lambda x: x[1]):
                    #if score < 0.05:
                    #    continue

                    print "\tProability:", pr_a, "Score:", score, "Page id:", p_a, "Title", self.catalog.get_title(
                        p_a)

                _, rho, wiki_id = sorted(champion_list,
                                         reverse=True,
                                         key=lambda x: x[1])[0]

                print _, rho, wiki_id

                #if rho > 0.05:
                winning[spot] = {
                    "rho": rho,
                    "id": wiki_id,
                    "title": self.catalog.get_title(wiki_id),
                    "portals": self.catalog.get_portals(wiki_id),
                }

            return winning