def getCombinedScores(arttxt, title_txts, comp, occp, wt=True, vt=True):
    """Rank candidate titles by a weighted mix of comparator and keyword scores.

    arttxt     -- text of the article being matched
    title_txts -- iterable of (candidate_name, candidate_text) pairs
    comp       -- comparator callable taking two lowercased word lists
    occp       -- occupation keyword collection
    wt         -- if True, include the comparator score scaled by 100
    vt         -- if True, include the summed keyword-vector score
    Returns a list of (name, score) pairs sorted by score, descending.
    """
    global NAME
    article_words = [w.lower() for w in tools.splitToWords(arttxt)]
    ranked = []
    for cand_name, cand_text in title_txts:
        # Side channel: NAME is read by helpers elsewhere in the module.
        NAME = cand_name
        cand_words = [w.lower() for w in tools.splitToWords(cand_text)]
        proximity = comp(cand_words, article_words)  # note reversed arg order
        keyword_vec = getKeywordVectorScore(arttxt, cand_name, cand_text, occp)
        keyword_total = sum(keyword_vec.values())
        total = 0
        if wt:
            total += proximity * 100
        if vt:
            total += keyword_total
        ranked.append((cand_name, total))
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked
def getKeywordVectorScore(art_text, cand_title, cand_text, occp):
    """Score a candidate wiki page against an article, split by evidence type.

    Builds a keyword dictionary from the candidate (birth/death years from
    its categories, occupation words from its first 150 words), then scans
    the first 150 words of the article and awards points for matching years
    (exact: 100, within a decade: 80), title words seen early (80) and
    occupation words (20).

    Returns a dict with keys 'years', 'titles', 'occp' and 'other' holding
    the summed scores per evidence category.
    """
    score_dict = dict()
    title_words = cand_title.lower().split('_')
    # Birth/death years advertised by the candidate's categories.
    # FIX: raw strings for the regexes -- '\d' in a plain string relies on
    # Python not interpreting the escape.
    years = (re.findall(r'Category: *(\d+).+births', cand_text) +
             re.findall(r'Category: *(\d+).+deaths', cand_text))
    for year in years:
        score_dict[year] = 0
    # Occupation keywords appearing early in the candidate text.
    for word in tools.splitToWords(cand_text)[:150]:
        if word in occp:
            score_dict[word] = 0
    nyear = 0
    nword = 0
    for word in tools.splitToWords(art_text)[:150]:
        nword += 1
        if tools.isNumeric(word):
            year = fixYear(word)
            nyear += 1
            if year in score_dict and nyear < 4:
                score_dict[year] = 100  # exact year match, early in article
            elif nyear < 3:
                # Near miss: within ten years of a known birth/death year.
                for wyear in years[:4]:
                    try:
                        iwyear = int(wyear)
                        iyear = int(year)
                        if math.fabs(iwyear - iyear) < 10:
                            score_dict[year] = 80
                    except (ValueError, TypeError):
                        # FIX: was a bare `except:`; only int() conversion
                        # failures (e.g. fixYear yielding a non-numeric value)
                        # should be skipped, not arbitrary errors.
                        continue
        elif nword < 20 and word.lower() in title_words:
            score_dict[word.lower()] = 80  # title word near the article start
        elif word in occp:
            if word in score_dict:
                score_dict[word] = 20
    # Fold per-keyword scores into the four evidence buckets; every key lands
    # in exactly one bucket via the if/elif chain.
    score = {'years': 0, 'titles': 0, 'occp': 0, 'other': 0}
    for k in score_dict.keys():
        if k in years:
            score['years'] += score_dict[k]
        elif k in title_words:
            score['titles'] += score_dict[k]
        elif k in occp:
            score['occp'] += score_dict[k]
        else:
            score['other'] += score_dict[k]
    #print cand_title, sorted(score_dict.items(), key=lambda x: x[1], reverse=True)
    return score
def getProximityScores(arttxt, title_txts, comp):
    """Score every candidate text against the article using comparator `comp`.

    Returns (name, score) pairs sorted best-first. Also publishes the first
    candidate's name through the NAME global for helpers that read it.
    """
    global NAME
    article_words = [w.lower() for w in tools.splitToWords(arttxt)]
    NAME = title_txts[0][0]
    ranked = [
        (name,
         comp(article_words, [w.lower() for w in tools.splitToWords(text)]))
        for name, text in title_txts
    ]
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked
def getWordDict(arts):
    """Build an inverted index: lowercased body word -> set of article ids.

    arts -- iterable of article dicts with "txt" and "id" keys.
    """
    index = dict()
    for article in arts:
        for word in tools.splitToWords(article["txt"].lower()):
            # setdefault replaces the explicit membership-test-then-insert.
            index.setdefault(word, set()).add(article["id"])
    return index
def getOffsetWordDict(arts):
    """Build an inverted index over article *names*: word -> list of ids.

    Words in BLACKLIST are skipped. Values are lists (not sets), so an id
    may appear more than once when a name repeats a word.
    """
    index = dict()
    for article in arts:
        name_words = tools.splitToWords(article["name"].lower())
        for word in name_words:
            if word in BLACKLIST:
                continue
            index.setdefault(word, []).append(article["id"])
    return index
def getOffsetDirectMentions(arts, wd=None, wlen=3): ad = dict() for a in arts: ad[a["id"]] = a if wd == None: wd = getOffsetWordDict(arts) artcount = 0 for a in arts: words = tools.splitToWords(a["txt"].lower()) window = words[:wlen] a["direct_refs"] = set() for w in words[wlen:] + [""]: matchd = dict() for word in list(set(window)): if word in wd: for aid in set(wd[word]): if aid not in matchd: matchd[aid] = 1 else: matchd[aid] += 1 for m in matchd.keys(): txtwords = filter(lambda x: x not in BLACKLIST, tools.splitToWords(ad[m]["name"])) namelen = len(txtwords) if namelen <= matchd[m] or matchd[m] == wlen: a["direct_refs"].add((ad[m]["name"], ad[m]["id"])) # print namelen, matchd[m], ad[m]['name'] if len(window) > 0: window.pop(0) window.append(w) print "at", artcount, "from", len(arts) artcount += 1
def getWikiCandidates(wdict, title):
    """Return candidate article ids from `wdict` for the words of `title`.

    NOTE(review): `first = False` is commented out below, so `first` stays
    True and every word contributes via union -- the intersection branch is
    dead code. Possibly a deliberate recall-over-precision choice; confirm
    with the author before re-enabling.
    """
    words = tools.splitToWords(title.lower())
    first = True
    candids = set()
    for w in words:
        if w in wdict:
            if first:
                candids |= set(wdict[w])
                # first = False
            else:
                candids &= set(wdict[w])
    if len(candids) == 0:
        print "no match for", title
    return list(candids)
def getDirectMentions(arts, wd):
    """Resolve each article's name words against index `wd` by intersection.

    Seeds the match set from the first name word present in `wd`, then
    intersects with every later indexed word. Each article dict is mutated
    to gain a "direct_refs" list of (name, id) pairs.
    """
    ad = dict()
    for a in arts:
        ad[a["id"]] = a
    for a in arts:
        words = tools.splitToWords(a["name"].lower())
        match = set()
        stop = 0
        # Find the first indexed word; it seeds the candidate set.
        for stop, w in enumerate(words, 1):
            if w in wd:
                match = set(wd[w])
                break
        # Narrow by intersecting with every later indexed word.
        for w in words[stop:]:
            if w in wd:
                match &= wd[w]
        a["direct_refs"] = [(ad[m]["name"], ad[m]["id"]) for m in list(match)]
def getNeighbors(artd, wikititles, range=3, BING=True, WIKIBOT=True, params=PARAMS):
    """Collect candidate Wikipedia titles for the article dict `artd`.

    Combines several sources: titles alphabetically near the (reordered)
    name in `wikititles`, Bing site-search results, wiki interface queries,
    search suggestions and wiki search. Missing and disambiguation pages
    are filtered out and recorded in artd['missing'] / artd['disambig']
    (both mutated on `artd`). Returns the surviving titles as a list with
    spaces replaced by underscores.

    NOTE(review): parameter `range` shadows the builtin; left unchanged to
    preserve the signature.
    """
    #return titles from the vicinity in the range
    title = artd['name']
    #put last name last: "Doe, John" -> "John Doe"
    title = ' '.join(artd['name'].split(',')[::-1]).strip()
    #title = reformat(title)
    print title
    # Alphabetical neighborhood in the sorted title list.
    index = bisect.bisect_left(wikititles, title)
    res = set()
    for wt in wikititles[index-range:index+range]:
        # 'isambiguation' matches both 'Disambiguation' and 'disambiguation'.
        if 'isambiguation' not in wt:
            res.add(wiki.reformat(wt))
    #bing it
    blim1 = params.getNumWordsQ1()
    blim2 = params.getNumWordsQ2()
    if BING:
        # Query 1: title plus the first blim1 words of the article text.
        firstwords = '%20'.join(tools.splitToWords(artd['txt'])[:blim1])
        q = 'site%3Awikipedia.org%20'+ re.sub(' \(.*', '', title)+'%20'+firstwords
        res = addBingLinks(q, res)
        #second time with more words (blim2)
        firstwords = '%20'.join(tools.splitToWords(artd['txt'])[:blim2])
        q = 'site%3Awikipedia.org%20'+ re.sub(' \(.*', '', title)+'%20'+firstwords
        res = addBingLinks(q, res)
        #print q
        # Query 3: bare title only.
        q = 'site%3Awikipedia.org%20'+ re.sub(' \(.*', '', title)
        res = addBingLinks(q, res)
    if WIKIBOT:
        res |= set(wiki.queryInterface(title))
        res |= set(wiki.queryInterface('_'.join( [title] + tools.splitToWords(artd['txt'])[:blim1])))
    #print '\n\n', res, '\n\n'
    #wiki suggestions
    res = res | set(wiki.getSearchSuggestions(title, limit=5))
    #wiki search
    res = res | set(wiki.search(title))
    #print res
    #deal with disambiguation pages
    artd['missing'] = set()
    artd['disambig'] = set()
    for wt in list(res):
        if 'Talk:' in wt:
            # Keep the subject page behind a Talk: page.
            res.add(wt.split('Talk:')[1])
            continue
        if ':' in wt or 'List_of' in wt or 'Category:' in wt:
            continue
        if wiki.isMissing(wt, rfmt=lambda x: x):
            res.remove(wt)
            if wt not in artd['missing']:
                artd['missing'].add(wt)
                # Recover via suggestions for the missing title.
                res |= set(wiki.getSearchSuggestions(wt))
                print 'miss', wt
            continue
        # Replace a disambiguation page by the titles it points to.
        dtitles = removeDisambigCandidates(wt)
        if dtitles != None:
            artd['disambig'].add(wt)
            res.remove(wt)
            res |= dtitles
    res = set(map(lambda x: re.sub(' ', '_', x), list(res)))
    # Drop purely numeric (3+ digit) titles, e.g. year pages.
    for wt in list(res):
        if re.match('\d\d\d*', wt):
            res.remove(wt)
    #print res
    return list(res)
def getKeywordScore(art_text, cand_title, cand_text, occp):
    """Scalar keyword score for a candidate wiki page against an article.

    The original body duplicated getKeywordVectorScore's entire scoring loop
    (plus inert triple-quoted pseudo-comment blocks); since every keyword
    lands in exactly one category bucket, the scalar total equals the sum of
    the per-category scores. Delegate so the two functions cannot drift.

    Returns the summed keyword score (int).
    """
    vector = getKeywordVectorScore(art_text, cand_title, cand_text, occp)
    return sum(vector.values())