def fixWikiMatched(path, wtitles, dfd, N): arts = loadDict("dicts/" + path) c = 0 for a in arts: print "at", c, "from", path, "proc", os.getpid() c += 1 for m in a["matched"]: if wiki.isDisambiguationPage(m): print "disambig" a["matched"] = wikiMatcher.matchArticle(a, wtitles, dfd, N) break elif wiki.isMissing(m): print "missing" a["matched"] = wikiMatcher.matchArticle(a, wtitles, dfd, N) break print "saving", path saveDict(arts, "dicts/fixed_" + path)
def getNeighbors(artd, wikititles, range=3, BING=True, WIKIBOT=True, params=PARAMS): #return titles from the vicinity in the range title = artd['name'] #put last name last title = ' '.join(artd['name'].split(',')[::-1]).strip() #title = reformat(title) print title index = bisect.bisect_left(wikititles, title) res = set() for wt in wikititles[index-range:index+range]: if 'isambiguation' not in wt: res.add(wiki.reformat(wt)) #bing it blim1 = params.getNumWordsQ1() blim2 = params.getNumWordsQ2() if BING: firstwords = '%20'.join(tools.splitToWords(artd['txt'])[:blim1]) q = 'site%3Awikipedia.org%20'+ re.sub(' \(.*', '', title)+'%20'+firstwords res = addBingLinks(q, res) #secnod time with more words firstwords = '%20'.join(tools.splitToWords(artd['txt'])[:blim2]) q = 'site%3Awikipedia.org%20'+ re.sub(' \(.*', '', title)+'%20'+firstwords res = addBingLinks(q, res) #print q q = 'site%3Awikipedia.org%20'+ re.sub(' \(.*', '', title) res = addBingLinks(q, res) if WIKIBOT: res |= set(wiki.queryInterface(title)) res |= set(wiki.queryInterface('_'.join( [title] + tools.splitToWords(artd['txt'])[:blim1]))) #print '\n\n', res, '\n\n' #wiki suggestions res = res | set(wiki.getSearchSuggestions(title, limit=5)) #wiki search res = res | set(wiki.search(title)) #print res #deal with disambiguation pages artd['missing'] = set() artd['disambig'] = set() for wt in list(res): if 'Talk:' in wt: res.add(wt.split('Talk:')[1]) continue if ':' in wt or 'List_of' in wt or 'Category:' in wt: continue if wiki.isMissing(wt, rfmt=lambda x: x): res.remove(wt) if wt not in artd['missing']: artd['missing'].add(wt) res |= set(wiki.getSearchSuggestions(wt)) print 'miss', wt continue dtitles = removeDisambigCandidates(wt) if dtitles != None: artd['disambig'].add(wt) res.remove(wt) res |= dtitles res = set(map(lambda x: re.sub(' ', '_', x), list(res))) for wt in list(res): if re.match('\d\d\d*', wt): res.remove(wt) #print res return list(res)