Example #1
def getActualRefsForArticle(a, w):
    # Collect every link reachable from the article's matched Wikipedia titles.
    wlinks = set()
    a['wiki_derived_refs'] = []
    for m in a['matched']:
        wlinks |= set(wiki.getLinks(m))

    # A candidate becomes a derived reference when its matched titles
    # overlap the article's link set.
    some = False
    for candid in a['ref_candidates']:
        inter = set(candid['matched']) & wlinks
        if inter:
            print(a['name'], 'to', candid['name'], 'on', inter, 'and', a['ref'])
            a['wiki_derived_refs'].append(candid)
            some = True

    # Write one HTML table row: article name, derived references, original ref.
    if not some:
        print('none and', a['ref'])
        w.write('<tr><td>')
        w.write(str(a['name']))
        w.write('</td><td>None</td><td>')
        w.write(str(a['ref']))
        w.write('</td></tr>\n')
    else:
        w.write('<tr><td>')
        w.write(str(a['name']))
        w.write('</td><td>')
        w.write(str({x['name'] for x in a['wiki_derived_refs']}))
        w.write('</td><td>')
        w.write(str(a['ref']))
        w.write('</td></tr>\n')
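The examples in this section call a project-specific wiki helper that is not shown. As a rough point of reference only, a minimal getLinks could be backed by the third-party "wikipedia" package; the sketch below is an assumption about that helper, not the original module.

# Minimal stand-in for the project's wiki helper (assumption: the PyPI
# "wikipedia" package is an acceptable backend; the original module is not shown).
import wikipedia

def getLinks(title):
    """Return the list of article titles that `title` links to."""
    try:
        return wikipedia.page(title, auto_suggest=False).links
    except wikipedia.exceptions.DisambiguationError as e:
        # A disambiguation page: treat its listed options as its links.
        return e.options
    except wikipedia.exceptions.PageError:
        return []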
Example #2
def scrapeWiki():
    pages = ['python', 'programming', 'computer', 'resistor']
    scraped = []
    while len(pages) > 0:
        # Green LED on while a page is being fetched and written out.
        GPIO.output(GREEN_LED, GPIO.HIGH)
        nextPage = pages.pop(0)
        if nextPage not in scraped:
            try:
                # write the page text to a file on the USB drive
                dataPath = os.path.join(consts.USB_FOLDER, nextPage + ".txt")
                with open(dataPath, "w+") as f:
                    f.write(wiki.getText(nextPage))

                # log the success and grab the next pages
                logger.log(nextPage)
                newLinks = wiki.getLinks(nextPage)

                # shuffle the links so the crawl isn't
                #  geared toward alphabetical searches
                random.shuffle(newLinks)
                for p in newLinks:
                    if p in pages:
                        # already queued: move it to a random position
                        # (len(pages) + 1 slots, so an empty queue is handled too)
                        pages.remove(p)
                        index = random.randrange(len(pages) + 1)
                        pages.insert(index, p)
                    else:
                        pages.append(p)

            except Exception:
                logger.log('ERROR: ' + nextPage)

            scraped.append(nextPage)
        else:
            logger.log("page already scraped " + nextPage)

        # flip between green and blue to show
        #  when we are scraping a page
        GPIO.output(GREEN_LED, GPIO.LOW)
        GPIO.output(BLUE_LED, GPIO.HIGH)
        time.sleep(10)
        GPIO.output(BLUE_LED, GPIO.LOW)
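The excerpt omits the module-level imports, GPIO pin constants, and project modules it depends on. On a Raspberry Pi the top of the file would typically look something like the sketch below; the pin numbers and the consts/logger/wiki module names are placeholders, not taken from the original project.

# Assumed module header for the excerpt above (pin numbers and the
# consts/logger/wiki modules are placeholders).
import os
import random
import time
import RPi.GPIO as GPIO

import consts    # project module expected to define USB_FOLDER
import logger    # project module expected to expose log()
import wiki      # project module wrapping the Wikipedia API

GREEN_LED = 18   # placeholder BCM pin numbers
BLUE_LED = 23

GPIO.setmode(GPIO.BCM)
GPIO.setup(GREEN_LED, GPIO.OUT)
GPIO.setup(BLUE_LED, GPIO.OUT)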
Example #3
def removeDisambigCandidates(wt):
    # If the title is a disambiguation page, return the set of titles it
    # links to so they can be dropped from the candidate pool.
    if wiki.isDisambiguationPage(wt):
        print('disambig', wt)
        dtitles = wiki.getLinks(wt)
        return set(dtitles)
    # Not a disambiguation page: nothing to remove.
    return set()
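One plausible way the returned set might be applied is to prune a pool of candidate titles; the candidate titles and the calling pattern below are assumptions for illustration, not code from the original project.

# Illustrative pruning loop (candidate titles and calling pattern are assumed).
# Relies on the non-disambiguation branch returning an empty set, as above.
candidates = {'Mercury', 'Python', 'Resistor'}
to_remove = set()
for title in candidates:
    to_remove |= removeDisambigCandidates(title)
candidates -= to_remove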