def spider_link():
    """Pop one URL off the Redis link queue, crawl it and gzip the HTML.

    On success the gzipped filename is pushed onto the html queue for the
    next pipeline stage; if gzipping fails, or any exception occurs, the
    URL is pushed back onto the link queue for a retry.
    """
    # Fix: pre-initialize so the except handler cannot hit a NameError
    # when rdb.consumers() itself raises before `url` is bound.
    url = None
    try:
        url = rdb.consumers(cf.get('redis', 'link_queue'))
        if url is not None:
            logging.info('crawler url is: ' + url)
            page = crawler_url.chrome_crawler(url, '', '')
            if page is not None:
                page = page.encode('utf-8')
                filename = 'link_' + tools.get_md5(url) + '.html'
                logging.info(filename)
                status = tools.gzip_file(filename, page)
                if status:
                    rdb.producers(cf.get('redis', 'html_queue'), filename)
                else:
                    # gzip/save failed: requeue for another attempt
                    rdb.producers(cf.get('redis', 'link_queue'), url)
    except Exception as e:
        logging.info(e)
        # Only requeue if a URL was actually dequeued.
        if url is not None:
            rdb.producers(cf.get('redis', 'link_queue'), url)
def spider_down():
    """Pop one image URL off the Redis down queue, fetch it and save it.

    NOTE(review): this definition is shadowed by a later ``spider_down``
    in the same file and is therefore dead code — confirm which version
    is intended and delete the other.

    On save failure, or on any exception, the URL is pushed back onto
    the down queue for a retry.
    """
    # Fix: pre-initialize so the except handler cannot hit a NameError
    # when rdb.consumers() itself raises before `get_url` is bound.
    get_url = None
    try:
        get_url = rdb.consumers(cf.get('redis', 'down_queue'))
        headers = {
            'User-agent': 'Mozilla/5.0 (Windows NT 6.2; WOW64; rv:22.0) Gecko/20100101 Firefox/22.0'
        }
        req = urllib2.Request(url=get_url, headers=headers)
        response = urllib2.urlopen(req, timeout=5)
        try:
            binary_data = response.read()
        finally:
            # Fix: the response was never closed (socket leak).
            response.close()
        filename = tools.get_md5(get_url) + '.jpg'
        status = tools.save_to_file(filename, binary_data)
        if status:
            logging.info('img download ok !')
        else:
            rdb.producers(cf.get('redis', 'down_queue'), get_url)
    except Exception as e:
        logging.info(e)
        # Only requeue if a URL was actually dequeued.
        if get_url is not None:
            rdb.producers(cf.get('redis', 'down_queue'), get_url)
def spider_down():
    """Pop one JSON job off the Redis down queue and download the image.

    The queued payload is a JSON object with keys ``url`` (used as the
    Referer header) and ``href`` (the image to fetch).  On save failure,
    or on any exception, the original raw payload is pushed back onto
    the down queue for a retry.
    """
    # Fix: pre-initialize so the except handler cannot hit a NameError
    # when rdb.consumers() itself raises before `down` is bound.
    down = None
    try:
        down = rdb.consumers(cf.get('redis', 'down_queue'))
        js = json.loads(down)
        url = js['url']
        href = js['href']
        headers = {
            'User-agent': 'Mozilla/5.0 (Windows NT 6.2; WOW64; rv:22.0) Gecko/20100101 Firefox/22.0',
            'Referer': url
        }
        req = urllib2.Request(url=href, headers=headers)
        openurl = urllib2.urlopen(req, timeout=5)
        try:
            binary_data = openurl.read()
        finally:
            # Fix: the response was never closed (socket leak).
            openurl.close()
        filename = tools.get_md5(href) + '.jpg'
        status = tools.save_to_file(filename, binary_data)
        if status:
            logging.info('img download ok !')
        else:
            rdb.producers(cf.get('redis', 'down_queue'), down)
    except Exception as e:
        logging.info(e)
        # Only requeue if a payload was actually dequeued.
        if down is not None:
            rdb.producers(cf.get('redis', 'down_queue'), down)