def main():
    """Crawl every goodgame.ru video listing page and write an HTML report.

    Side effects: issues HTTP GETs via ``requests`` and writes RESULT_FILE.
    The report is an index of streamer links followed by one anchored
    section per streamer listing that streamer's stream links.
    """
    startUrl = "http://goodgame.ru/video/"
    page = 'http://goodgame.ru/video/page/%s/'
    # Total number of listing pages is parsed out of the first page's HTML.
    maxPages = dumper.extractMaxPages(requests.get(startUrl).text)
    crawledData = {}
    for i in range(1, int(maxPages) + 1):
        print("Grabbing %s page" % i)
        # grabDataFromPage mutates crawledData in place — presumably keyed by
        # streamer name with a "streams" list; see its definition to confirm.
        grabDataFromPage(page % i, crawledData)
    # Context manager guarantees the file is closed even if a write raises
    # (the original leaked the handle on error and only closed on success).
    with codecs.open(RESULT_FILE, "w+", "UTF-8") as f:
        f.write('<head><meta charset="UTF-8"></head>')
        # Sort once and reuse for both passes so the index order matches the
        # section order (the original sorted only the index, leaving the
        # sections in arbitrary dict order).
        streamers = sorted(crawledData.keys())
        # Pass 1: index — one link per streamer with its stream count.
        for s in streamers:
            count = len(crawledData[s]["streams"])
            f.write("<a href='#%s'>%s %s</a><br>" % (s, s, count))
        # Pass 2: anchored sections with the individual stream links.
        for k in streamers:
            f.write("<a name='%s'></a>" % k)
            f.write("<h1>%s</h1>" % k)
            for stream in crawledData[k]["streams"]:
                f.write("<br>")
                f.write("<a href='%s' target='_blank'>%s</a>" % (stream["url"], stream["title"]))
def test_extractMaxPages():
    """extractMaxPages reads the total listing-page count from page 1 HTML."""
    from dumper import extractMaxPages

    first_page_html = fileAsString("video.html")
    expected_pages = 147
    assert extractMaxPages(first_page_html) == expected_pages