def precompute_related(self, senses):
    """Precompute pairwise relatedness scores between sense ids.

    @param senses: list of sense-id collections, one per spot; the list
        is rotated in place (pop front / append back) but ends up with
        the same elements it started with
    @return: dict mapping an ordered id pair (low, high) to the score
        returned by self.relatedness
    """
    with profiled("Precomputing scores in %s"):
        scores = {}
        # NOTE(review): only the first half of the groups ever act as
        # the pivot, so pairs confined entirely to the second half are
        # never scored -- confirm this truncation is intentional.
        rounds = len(senses) // 2
        for _ in range(rounds):
            pivot = senses.pop(0)
            for group in senses:
                for src in pivot:
                    for dst in group:
                        # Normalize the pair so (a, b) and (b, a)
                        # share a single cache entry.
                        lo, hi = (src, dst) if src <= dst else (dst, src)
                        if (lo, hi) not in scores:
                            scores[(lo, hi)] = self.relatedness(lo, hi)
            senses.append(pivot)
        return scores
def match(self, text, context=5, threshold=0.8):
    """Find candidate wiki-title anchors in *text*.

    Scans every word window of at most *context* words, longest first;
    the first (longest) window that matches any title wins for that
    starting position.

    @param text: raw input string
    @param context: maximum window length, in words
    @param threshold: forwarded to filter_wiki_titles
    @return: dict mapping (begin, end) character offsets to the list of
        matching wiki titles
    """
    anchors = {}
    with profiled("Annotated in %s"):
        text_words = extract_words(text)
        total = len(text_words)
        for start in range(total):
            stop = min(start + context, total)
            # Shrink the window from the right; a hit ends the shrink.
            while stop >= start:
                words = map(lambda x: x[0], text_words[start:stop])
                target = u' '.join(words)
                # Only consider strings which are at least 3 characters long
                if len(target) >= 3:
                    begin = text_words[start][1][0]
                    end = text_words[stop - 1][1][1]
                    #assert target == text[begin:end].lower()
                    candidates = self.trie.keys(target)
                    matched = list(self.filter_wiki_titles(
                        words, candidates, threshold=threshold))
                    if matched:
                        anchors[(begin, end)] = matched
                        break
                stop -= 1
    return anchors
def match(self, text, context=5, threshold=0.8):
    """Locate wiki-title anchors inside *text*.

    For each starting word, windows of decreasing length (at most
    *context* words) are looked up against the title trie; the longest
    matching window is recorded under its character span.

    @return: {(begin, end): [wiki titles]} for every matched span
    """
    anchors = {}
    with profiled("Annotated in %s"):
        position = 0
        tokens = extract_words(text)
        n_tokens = len(tokens)
        while position < n_tokens:
            window_end = min(position + context, n_tokens)
            while window_end >= position:
                surface = map(lambda pair: pair[0],
                              tokens[position:window_end])
                phrase = u' '.join(surface)
                # Only consider strings which are at least 3 characters long
                if len(phrase) >= 3:
                    span_begin = tokens[position][1][0]
                    span_end = tokens[window_end - 1][1][1]
                    #assert phrase == text[span_begin:span_end].lower()
                    titles = [t for t in self.filter_wiki_titles(
                        surface, self.trie.keys(phrase),
                        threshold=threshold)]
                    if titles:
                        anchors[(span_begin, span_end)] = titles
                        # Longest match found: abandon shorter windows.
                        window_end = 0
                window_end -= 1
            position += 1
    return anchors
def annotate(self, text): with profiled("Disambiguated in %s"): matches = self.match(text) spots = [] indices = {} for (start, stop), anchors in matches.items(): if anchors[0] in self.stopwords: print "Ignoring stopword", anchors[0], anchors continue spots.append(anchors[0]) indices[anchors[0]] = (start, stop) ret = {} results = self.disambig.disambiguate(spots) for spot in results: ret[spot] = results[spot] ret[spot]['indices'] = indices[spot] start, stop = indices[spot] ret[spot]['spot'] = text[start:stop] return [v for(k, v) in ret.items()]
def annotate(self, text): with profiled("Disambiguated in %s"): matches = self.match(text) spots = [] indices = {} for (start, stop), anchors in matches.items(): if anchors[0] in self.stopwords: print "Ignoring stopword", anchors[0], anchors continue spots.append(anchors[0]) indices[anchors[0]] = (start, stop) ret = {} results = self.disambig.disambiguate(spots) for spot in results: ret[spot] = results[spot] ret[spot]['indices'] = indices[spot] start, stop = indices[spot] ret[spot]['spot'] = text[start:stop] return [v for (k, v) in ret.items()]
def disambiguate(self, spots):
    """Resolve each spot to a single wiki page via collective voting.

    Each candidate sense of a spot is scored by how related it is to
    the senses of every other spot; the score is averaged with the
    spot's link probability into rho, and the highest-rho sense wins.

    @param spots: list of surface strings; pruned IN PLACE -- spots
        whose link probability is not positive are removed
    @return: dict mapping each surviving spot to a dict with keys
        'rho', 'id', 'title' and 'portals'
    """
    db = self.anchors_db
    allsenses = []
    with profiled("Disambiguation in %s"):
        # First we get all the pages connected to each spot
        pages = defaultdict(list)
        lp = {}
        index = 0
        # Manual index instead of a for-loop because entries may be
        # deleted from spots during the scan.
        while index < len(spots):
            spot = spots[index]
            link_prob, senses = self.get_senses(spot)
            allsenses.append(map(lambda x: x[0], senses))
            if link_prob > 0:
                lp[spot], pages[spot] = link_prob, senses
                #print "Link probability", spot, link_prob
                index += 1
            else:
                del spots[index]
                print "Removing spot", spot, link_prob
        scores = self.precompute_related(allsenses)
        # Then we remove each spot and append it to the end
        # so we can apply the voting scheme
        candidates = defaultdict(list)
        for spot in spots:
            # Temporarily remove this spot so it does not vote for itself.
            sense_ids = pages.pop(spot)
            #print "Voting for spot", spot
            for p_a, pr_a in sense_ids:
                score = 0
                counter = 0
                for counter, (p_b, pg_pr_b) in enumerate(pages.items()):
                    pg_b = map(lambda x: x[0], pg_pr_b)
                    prg_b = map(lambda x: x[1], pg_pr_b)
                    score += self.vote_for(p_a, pg_b, prg_b, scores)
                # counter keeps its last enumerate value here (0 when
                # pages is empty), so this averages over the voters.
                score /= float(counter + 1)
                #print "Vote to spot", spot, p_a, "is", score
                print "Link probability", lp[spot]
                # rho: mean of the voting score and the link probability.
                rho = (score + lp[spot]) / 2.0
                candidates[spot].append((pr_a, rho, p_a))
            pages[spot] = sense_ids
        winning = {}
        for spot, champion_list in candidates.items():
            print "Spot", spot, "has the following:"
            for pr_a, score, p_a in sorted(champion_list, reverse=True, key=lambda x: x[1]):
                #if score < 0.05:
                #    continue
                print "\tProability:", pr_a, "Score:", score, "Page id:", p_a, "Title", self.catalog.get_title(p_a)
            # The winner is the candidate with the highest rho.
            _, rho, wiki_id = sorted(champion_list, reverse=True, key=lambda x: x[1])[0]
            print _, rho, wiki_id
            #if rho > 0.05:
            winning[spot] = {
                "rho": rho,
                "id": wiki_id,
                "title": self.catalog.get_title(wiki_id),
                "portals": self.catalog.get_portals(wiki_id),
            }
        return winning
def disambiguate(self, spots):
    """Collectively disambiguate *spots* to wiki pages.

    Implements a voting scheme: every candidate sense of a spot gathers
    relatedness votes from the senses of all other spots; the averaged
    vote is combined with the spot's link probability into rho and the
    best-rho candidate is selected.

    @param spots: list of surface strings; modified IN PLACE -- spots
        with non-positive link probability are dropped
    @return: {spot: {'rho', 'id', 'title', 'portals'}} for surviving spots
    """
    db = self.anchors_db
    allsenses = []
    with profiled("Disambiguation in %s"):
        # First we get all the pages connected to each spot
        pages = defaultdict(list)
        lp = {}
        index = 0
        # while + explicit index because spots shrinks when a spot is
        # rejected below.
        while index < len(spots):
            spot = spots[index]
            link_prob, senses = self.get_senses(spot)
            allsenses.append(map(lambda x: x[0], senses))
            if link_prob > 0:
                lp[spot], pages[spot] = link_prob, senses
                #print "Link probability", spot, link_prob
                index += 1
            else:
                del spots[index]
                print "Removing spot", spot, link_prob
        scores = self.precompute_related(allsenses)
        # Then we remove each spot and append it to the end
        # so we can apply the voting scheme
        candidates = defaultdict(list)
        for spot in spots:
            # Pop the current spot so its own senses do not vote.
            sense_ids = pages.pop(spot)
            #print "Voting for spot", spot
            for p_a, pr_a in sense_ids:
                score = 0
                counter = 0
                for counter, (p_b, pg_pr_b) in enumerate(pages.items()):
                    pg_b = map(lambda x: x[0], pg_pr_b)
                    prg_b = map(lambda x: x[1], pg_pr_b)
                    score += self.vote_for(p_a, pg_b, prg_b, scores)
                # counter holds the final loop index (0 if no voters),
                # so the sum is averaged over the number of voters.
                score /= float(counter + 1)
                #print "Vote to spot", spot, p_a, "is", score
                print "Link probability", lp[spot]
                # rho averages the vote score with the link probability.
                rho = (score + lp[spot]) / 2.0
                candidates[spot].append((pr_a, rho, p_a))
            pages[spot] = sense_ids
        winning = {}
        for spot, champion_list in candidates.items():
            print "Spot", spot, "has the following:"
            for pr_a, score, p_a in sorted(champion_list, reverse=True, key=lambda x: x[1]):
                #if score < 0.05:
                #    continue
                print "\tProability:", pr_a, "Score:", score, "Page id:", p_a, "Title", self.catalog.get_title(
                    p_a)
            # Highest rho wins for this spot.
            _, rho, wiki_id = sorted(champion_list, reverse=True, key=lambda x: x[1])[0]
            print _, rho, wiki_id
            #if rho > 0.05:
            winning[spot] = {
                "rho": rho,
                "id": wiki_id,
                "title": self.catalog.get_title(wiki_id),
                "portals": self.catalog.get_portals(wiki_id),
            }
        return winning