# Author: Yunqiao Zhang
# Email: [email protected]
"""Queue every URL listed in test.txt and dump fetched pages to <n>.html.

Each line of test.txt is stripped and handed to the CrawlerScheduler; pages
are then pulled through a PageInitializer and written to consecutively
numbered files in the current directory.
"""

from scheduler import CrawlerScheduler
import share
import time
from page_intializer import PageInitializer

if __name__ == '__main__':
    cs = CrawlerScheduler(share.GPR)
    cs.start()

    # Close the URL list deterministically instead of leaking the handle.
    with open("test.txt", 'r') as url_file:
        for line in url_file:
            cs.add_request(line.strip())

    # Fixed pause before inspecting the queue (presumably to let the
    # scheduler fetch some pages first -- confirm against CrawlerScheduler).
    time.sleep(40)
    # Single-argument form prints the same text under either print syntax.
    print('size: %s' % (share.GPR.qsize(),))

    # Get content out
    n = 0
    pi = PageInitializer(share.GPR)
    # NOTE(review): this loop has no exit condition, and when get_page()
    # yields a None url an empty <n>.html is still created and the counter
    # still advances. Kept as-is because get_page()'s contract is not
    # visible here -- confirm whether empty files are intended.
    while True:
        url, content = pi.get_page()
        # 'with' guarantees the output file is closed even if write() raises.
        with open(str(n) + '.html', 'w') as fp:
            if url is not None:
                fp.write(content)
        n += 1
# Author: Yunqiao Zhang
# Email: [email protected]
"""Queue every URL listed in test.txt and dump the raw results to <n>.html.

Each line of test.txt is stripped and handed to the CrawlerScheduler. After
a fixed wait, every record currently in share.GPR is taken out: fields 0-4
are printed and field 6 is written to a consecutively numbered file.
"""

from scheduler import CrawlerScheduler
import share
import time

try:
    from Queue import Empty  # Python 2 module name
except ImportError:
    from queue import Empty  # Python 3 module name

if __name__ == '__main__':
    cs = CrawlerScheduler(share.GPR)
    cs.start()

    # Close the URL list deterministically instead of leaking the handle.
    with open("test.txt", 'r') as url_file:
        for line in url_file:
            cs.add_request(line.strip())

    # Fixed pause before inspecting the queue (presumably to let the
    # scheduler fetch some pages first -- confirm against CrawlerScheduler).
    time.sleep(40)
    # Single-argument form prints the same text under either print syntax.
    print('size: %s' % (share.GPR.qsize(),))

    # Get content out
    n = 0
    while True:
        try:
            r = share.GPR.get(block=False)
        except Empty:
            # Queue drained: stop instead of dying with an uncaught
            # exception, so that cs.join() below is actually reached.
            # Assumes share.GPR follows the standard Queue API (it is used
            # with qsize() and get(block=False)) -- confirm in share.py.
            break
        print('%s %s %s %s %s' % (r[0], r[1], r[2], r[3], r[4]))
        # Field 6 of the record is what gets written to disk.
        with open(str(n) + '.html', 'w') as fp:
            fp.write(r[6])
        n += 1

    cs.join()
# Author: Yunqiao Zhang
# Email: [email protected]
"""Queue the URLs listed in chan.txt, remembering the title paired with each.

chan.txt is read as non-empty lines taken two at a time from the END of the
file: the later line of each pair is used as the URL and the line before it
as its title. The url -> title mapping is kept in ``sitedata``.
"""

from scheduler import CrawlerScheduler
import share
import time
from page_intializer import PageInitializer

if __name__ == '__main__':
    cs = CrawlerScheduler(share.GPR)
    cs.start()

    sitedata = {}
    # Close the input file deterministically instead of leaking the handle.
    with open("chan.txt", 'r') as channel_file:
        stripped = [line.strip() for line in channel_file]
    # Drop blank lines; a real list is needed because it is popped below.
    lines = [line for line in stripped if line]

    # Consume pairs from the end of the list, exactly as before: the last
    # remaining line is the URL, the one before it is its title.
    while len(lines) >= 2:
        url = lines.pop()
        title = lines.pop()
        sitedata[url] = title
        cs.add_request(url)

    if lines:
        # An odd number of lines used to crash with IndexError on the second
        # pop(), after the other requests had already been queued. The
        # unpaired line was never requested then either, so report it and
        # carry on.
        print('warning: unpaired line in chan.txt ignored: %s' % (lines.pop(),))

    # NOTE: there is no wait here before the queue size is read, so the
    # value reflects whatever is in share.GPR at this instant.
    # Single-argument form prints the same text under either print syntax.
    print('size: %s' % (share.GPR.qsize(),))

    # Get content out
    n = 0
    pi = PageInitializer(share.GPR)