def __init__(self, filetrie="anchors.marisa", filestop="stop.txt"): self.trie = marisa_trie.Trie() self.disambig = Disambiguator() self.stopwords = set() with open(filestop, 'r') as stopfile: self.stopwords = set( filter(lambda x: x and x[0] != '#', map(lambda x: x.rstrip(), stopfile.readlines()))) with open(filetrie, 'r') as inputfile: self.trie.read(inputfile) print "Loaded %d anchors" % len(self.trie)
def __init__(self, filetrie="anchors.marisa", filestop="stop.txt"): self.trie = marisa_trie.Trie() self.disambig = Disambiguator() self.stopwords = set() with open(filestop, 'r') as stopfile: self.stopwords = set(filter(lambda x: x and x[0] != '#', map(lambda x: x.rstrip(), stopfile.readlines()))) with open(filetrie, 'r') as inputfile: self.trie.read(inputfile) print "Loaded %d anchors" % len(self.trie)
class LightTag(object): def __init__(self, filetrie="anchors.marisa", filestop="stop.txt"): self.trie = marisa_trie.Trie() self.disambig = Disambiguator() self.stopwords = set() with open(filestop, 'r') as stopfile: self.stopwords = set(filter(lambda x: x and x[0] != '#', map(lambda x: x.rstrip(), stopfile.readlines()))) with open(filetrie, 'r') as inputfile: self.trie.read(inputfile) print "Loaded %d anchors" % len(self.trie) def annotate(self, text): with profiled("Disambiguated in %s"): matches = self.match(text) spots = [] indices = {} for (start, stop), anchors in matches.items(): if anchors[0] in self.stopwords: print "Ignoring stopword", anchors[0], anchors continue spots.append(anchors[0]) indices[anchors[0]] = (start, stop) ret = {} results = self.disambig.disambiguate(spots) for spot in results: ret[spot] = results[spot] ret[spot]['indices'] = indices[spot] start, stop = indices[spot] ret[spot]['spot'] = text[start:stop] return [v for(k, v) in ret.items()] def match(self, text, context=5, threshold=0.8): anchors = {} with profiled("Annotated in %s"): start = 0 text_words = extract_words(text) while start < len(text_words): stop = min(start + context, len(text_words)) while stop >= start: words = map(lambda x: x[0], text_words[start:stop]) target = u' '.join(words) # Only consider strings which are at least 3 characters long if len(target) >= 3: begin = text_words[start][1][0] end = text_words[stop - 1][1][1] #assert target == text[begin:end].lower() wiki_titles = self.trie.keys(target) wiki_titles = [title for title in self.filter_wiki_titles(words, wiki_titles, threshold=threshold)] if wiki_titles: anchors[(begin, end)] = wiki_titles stop = 0 stop -= 1 start += 1 return anchors def filter_wiki_titles(self, words, wiki_titles, threshold=0.8): """ Filter a list of wiki titles based on similarity """ #print "Filtering", words, wiki_titles for title in wiki_titles: similarities = 0 for counter, (word1, word2) in enumerate(izip_longest(words, title.split())): if not word1 or not word2: continue if word1 == word2: similarities += 1 sim = similarities / float(counter + 1) if sim > threshold: yield title if sim == 1.0: raise StopIteration
class LightTag(object): def __init__(self, filetrie="anchors.marisa", filestop="stop.txt"): self.trie = marisa_trie.Trie() self.disambig = Disambiguator() self.stopwords = set() with open(filestop, 'r') as stopfile: self.stopwords = set( filter(lambda x: x and x[0] != '#', map(lambda x: x.rstrip(), stopfile.readlines()))) with open(filetrie, 'r') as inputfile: self.trie.read(inputfile) print "Loaded %d anchors" % len(self.trie) def annotate(self, text): with profiled("Disambiguated in %s"): matches = self.match(text) spots = [] indices = {} for (start, stop), anchors in matches.items(): if anchors[0] in self.stopwords: print "Ignoring stopword", anchors[0], anchors continue spots.append(anchors[0]) indices[anchors[0]] = (start, stop) ret = {} results = self.disambig.disambiguate(spots) for spot in results: ret[spot] = results[spot] ret[spot]['indices'] = indices[spot] start, stop = indices[spot] ret[spot]['spot'] = text[start:stop] return [v for (k, v) in ret.items()] def match(self, text, context=5, threshold=0.8): anchors = {} with profiled("Annotated in %s"): start = 0 text_words = extract_words(text) while start < len(text_words): stop = min(start + context, len(text_words)) while stop >= start: words = map(lambda x: x[0], text_words[start:stop]) target = u' '.join(words) # Only consider strings which are at least 3 characters long if len(target) >= 3: begin = text_words[start][1][0] end = text_words[stop - 1][1][1] #assert target == text[begin:end].lower() wiki_titles = self.trie.keys(target) wiki_titles = [ title for title in self.filter_wiki_titles( words, wiki_titles, threshold=threshold) ] if wiki_titles: anchors[(begin, end)] = wiki_titles stop = 0 stop -= 1 start += 1 return anchors def filter_wiki_titles(self, words, wiki_titles, threshold=0.8): """ Filter a list of wiki titles based on similarity """ #print "Filtering", words, wiki_titles for title in wiki_titles: similarities = 0 for counter, (word1, word2) in enumerate( izip_longest(words, title.split())): if not word1 or not word2: continue if word1 == word2: similarities += 1 sim = similarities / float(counter + 1) if sim > threshold: yield title if sim == 1.0: raise StopIteration