def properFilterArts(arts, params=PARAMS): #iF year and title - its matched #iF no year - find first with year and title if there is one. #iF diff greater than 80 then it's bad res = [] c = 0 MAX_DIFF = 80 for a in arts: matched = [] first_name, first_score = a['candids'][0] if wiki.isDisambiguationPage(first_name): print name, 'dis' else: first_sum = sum(first_score.values()) if first_score['years'] > 0 and first_score['titles'] > 0: matched = [wiki.getRedirect(first_name)] a['matched'] = matched res.append(a) continue #not first match for name, score in a['candids'][1:]: if wiki.isDisambiguationPage(name): print name, 'dis' continue if score['years'] > 0 and score['titles'] > 0: asum = sum(score.values()) if first_sum - asum < MAX_DIFF: a['matched'] = [wiki.getRedirect(name)] res.append(a) break else: print a['name'],'DIFFBIG:',asum,name break return res
def fixWikiMatched(path, wtitles, dfd, N): arts = loadDict("dicts/" + path) c = 0 for a in arts: print "at", c, "from", path, "proc", os.getpid() c += 1 for m in a["matched"]: if wiki.isDisambiguationPage(m): print "disambig" a["matched"] = wikiMatcher.matchArticle(a, wtitles, dfd, N) break elif wiki.isMissing(m): print "missing" a["matched"] = wikiMatcher.matchArticle(a, wtitles, dfd, N) break print "saving", path saveDict(arts, "dicts/fixed_" + path)
def removeDisambigCandidates(wt): if wiki.isDisambiguationPage(wt): print 'disambig', wt dtitles = wiki.getLinks(wt) return set(dtitles)