import os
import random
import time

# Project-local pieces (the wiki module, logger, consts, GPIO, and the
# GREEN_LED/BLUE_LED pin constants) are assumed to be provided elsewhere
# in the repo.


def getActualRefsForArticle(a, w):
    # Collect every wiki page linked from the article's matched pages.
    wlinks = set()
    a['wiki_derived_refs'] = []
    for m in a['matched']:
        wlinks |= set(wiki.getLinks(m))

    # A candidate counts as a derived reference if any of its matched pages
    # also appears in the article's outgoing wiki links.
    some = False
    for candid in a['ref_candidates']:
        inter = set(candid['matched']) & wlinks
        if len(inter) > 0:
            print a['name'], 'to', candid['name'], 'on', inter, 'and', a['ref'], '\n',
            a['wiki_derived_refs'].append(candid)
            some = True

    # Emit one HTML table row per article: derived refs (or None) next to
    # the article's actual refs for side-by-side comparison.
    if not some:
        print 'none and', a['ref']
        w.write('<tr><td>')
        w.write(str(a['name']))
        w.write('</td><td>None</td><td>')
        w.write(str(a['ref']))
        w.write('</td></tr>\n')
    else:
        w.write('<tr><td>')
        w.write(str(a['name']))
        w.write('</td><td>')
        w.write(str(set(x['name'] for x in a['wiki_derived_refs'])))
        w.write('</td><td>')
        w.write(str(a['ref']))
        w.write('</td></tr>\n')
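# A hedged usage sketch (not from the repo): how getActualRefsForArticle
# might be driven to build the comparison table. The writeDerivedRefsReport
# name, the 'articles' list, and the output path handling are assumptions;
# only the function signature and the HTML row format come from the code above.
def writeDerivedRefsReport(articles, outPath):
    w = open(outPath, 'w')
    w.write('<table>\n')
    for a in articles:
        getActualRefsForArticle(a, w)
    w.write('</table>\n')
    w.close()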
def scrapeWiki():
    # Seed pages for the crawl.
    pages = ['python', 'programming', 'computer', 'resistor']
    scraped = []
    while len(pages) > 0:
        GPIO.output(GREEN_LED, GPIO.HIGH)
        nextPage = pages.pop(0)
        if nextPage not in scraped:
            try:
                # write data to file
                dataPath = os.path.join(consts.USB_FOLDER, nextPage + ".txt")
                f = open(dataPath, "w+")
                f.write(wiki.getText(nextPage))
                f.close()
                # log the success and grab next pages
                logger.log(nextPage)
                newLinks = wiki.getLinks(nextPage)
                # shuffle the links so that it isn't
                # geared toward alphabetical searches
                random.shuffle(newLinks)
                for p in newLinks:
                    if p in pages:
                        # already queued: move it to a random position
                        # (len(pages) + 1 so insertion works even when the
                        # queue is otherwise empty)
                        pages.remove(p)
                        index = random.randrange(0, len(pages) + 1)
                        pages.insert(index, p)
                    else:
                        pages.append(p)
            except Exception:
                # don't let a single bad page kill the crawl
                logger.log('ERROR: ' + nextPage)
            scraped.append(nextPage)
        else:
            logger.log("page already scraped " + nextPage)
        # flip between green and blue to show
        # when we are scraping a page
        GPIO.output(GREEN_LED, GPIO.LOW)
        GPIO.output(BLUE_LED, GPIO.HIGH)
        time.sleep(10)
        GPIO.output(BLUE_LED, GPIO.LOW)
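# A hedged sketch of the GPIO setup this loop presumes (assuming RPi.GPIO
# on a Raspberry Pi). The runScraper name, pin setup, and numbering scheme
# are assumptions; only GREEN_LED, BLUE_LED, and scrapeWiki() come from the
# code above.
def runScraper():
    GPIO.setmode(GPIO.BCM)             # assumed numbering scheme
    GPIO.setup(GREEN_LED, GPIO.OUT)
    GPIO.setup(BLUE_LED, GPIO.OUT)
    try:
        scrapeWiki()
    finally:
        GPIO.cleanup()                 # release the pins even if the crawl dies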
def removeDisambigCandidates(wt):
    # If the title points at a disambiguation page, return the set of page
    # titles it links to.
    if wiki.isDisambiguationPage(wt):
        print 'disambig', wt
        dtitles = wiki.getLinks(wt)
        return set(dtitles)
    # Not a disambiguation page: return an empty set instead of the implicit
    # None so callers can always treat the result as a set.
    return set()
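# A hedged usage sketch: filterCandidates, candidateTitles, and the
# subtraction step are assumptions about how the returned set is used;
# only removeDisambigCandidates comes from the code above.
def filterCandidates(candidateTitles, wt):
    disambigTitles = removeDisambigCandidates(wt)
    return [t for t in candidateTitles if t not in disambigTitles]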