# Standard-library and SQLAlchemy imports for the code below. Session, Url,
# URL_TYPE, BestSell, logger, executor, config, num, string_proxies and
# spider_categories are assumed to be defined or imported elsewhere in this
# module, as is the module-level worker list `bs` used by category(),
# product() and category_next().
import time
import traceback
from queue import Queue, Empty
from threading import Thread

from sqlalchemy import and_


def do():
    # Bounded queues: workers read URL dicts from in_queue and report
    # ('add' | 'error' | 'over') operations on out_queue.
    in_queue = Queue(maxsize=2000)
    out_queue = Queue(maxsize=2000)

    # Start one BestSell worker per slot, cycling through the proxies.
    bs = []
    for i in range(num):
        string_proxy = string_proxies[i % len(string_proxies)]
        b = BestSell(string_proxy=string_proxy, input=in_queue, output=out_queue,
                     domain='amazon.es', cache_root='I:\\cache_amazon_es')
        bs.append(b)
        b.start()

    # Single writer thread that applies the queued URL operations to the database.
    t = Thread(target=urlopt, args=[out_queue])
    t.start()

    session = Session()
    # On a fresh database, seed the Url table with the top-level categories.
    if session.query(Url).count() < 1:
        bs[0].categories_first(categories=spider_categories)
        session.commit()

    # Keep feeding uncrawled URLs (fewer than 3 failed tries) to the workers.
    while session.query(Url).filter(and_(Url.has_crawled == False, Url.tries < 3)).count() > 0:
        for url_obj in session.query(Url).filter(
                and_(Url.has_crawled == False, Url.tries < 3)).limit(1000 * len(string_proxies)):
            try:
                logger.info('put: %s' % url_obj.url)
                in_queue.put(url_obj.as_dict())
            except Exception:
                logger.error(traceback.format_exc())
        logger.info('wait 5 seconds...')
        time.sleep(5)
    logger.info('over')
def urlopt(queue):
    """Apply ('add' | 'error' | 'over', urls) operations from the workers to the Url table."""
    session = Session(autocommit=True)
    while True:
        try:
            opt, urls = queue.get(timeout=30)
            if opt == 'add':
                # Insert only URLs whose key is not already stored.
                url_objs = []
                for url in urls:
                    if session.query(Url).filter_by(key=url.key).count() < 1:
                        logger.info('Opt: Add %s' % url.url)
                        url_objs.append(url)
                session.bulk_save_objects(url_objs)
            elif opt == 'error':
                # A failed fetch: bump the retry counter.
                for url in urls:
                    logger.info('Opt: Error %s' % url.url)
                    session.query(Url).filter_by(id=url.id).update({Url.tries: url.tries + 1})
            elif opt == 'over':
                # A successful fetch: mark the URL as crawled.
                for url in urls:
                    logger.info('Opt: Over %s' % url.url)
                    session.query(Url).filter_by(id=url.id).update({Url.has_crawled: True})
        except Empty:
            logger.info('Empty')
        except Exception:
            logger.error(traceback.format_exc())
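# For reference, a hedged sketch of the message shape urlopt() expects on the
# queue: an (opt, urls) tuple where opt is 'add', 'error' or 'over' and urls
# is a list of Url ORM objects. The BestSell workers are assumed to produce
# these; the Url constructor arguments below are illustrative, inferred from
# the attributes (key, url, type) used elsewhere in this module.
#
#     output.put(('add', [Url(key=key, url=url, type=URL_TYPE.PRODUCT_URL)]))
#     output.put(('over', [url_obj]))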
def category():
    session = Session()
    # Dispatch uncrawled category pages across the workers until none remain.
    while session.query(Url).filter_by(type=URL_TYPE.BEST_SELL_CATEGORY, has_crawled=False).count() > 0:
        for i, url_obj in enumerate(
                session.query(Url).filter_by(
                    type=URL_TYPE.BEST_SELL_CATEGORY, has_crawled=False)):
            executor.submit(bs[i % len(bs)].categories, url_obj=url_obj)
            time.sleep(0.1)
        time.sleep(30)
def product():
    try:
        session = Session()
        # Dispatch uncrawled product pages across the workers until none remain.
        while session.query(Url).filter_by(type=URL_TYPE.PRODUCT_URL, has_crawled=False).count() > 0:
            for i, url_obj in enumerate(
                    session.query(Url).filter_by(
                        type=URL_TYPE.PRODUCT_URL, has_crawled=False)):
                executor.submit(bs[i % len(bs)].product, url_obj=url_obj)
                time.sleep(0.1)
            time.sleep(30)
    except Exception:
        logger.error(traceback.format_exc())
def category_next():
    try:
        session = Session()
        # Dispatch uncrawled "next page" category URLs across the workers until none remain.
        while session.query(Url).filter_by(
                type=URL_TYPE.BEST_SELL_CATEGORY_NEXT, has_crawled=False).count() > 0:
            for i, url_obj in enumerate(
                    session.query(Url).filter_by(
                        type=URL_TYPE.BEST_SELL_CATEGORY_NEXT, has_crawled=False)):
                executor.submit(bs[i % len(bs)].category_next, url_obj=url_obj)
                time.sleep(0.1)
            time.sleep(30)
    except Exception:
        logger.error(traceback.format_exc())
def first():
    # Seed the Url table with the configured top-level categories when there
    # are no uncrawled category URLs yet.
    session = Session()
    if session.query(Url).filter_by(type=URL_TYPE.BEST_SELL_CATEGORY, has_crawled=False).count() < 1:
        b = BestSell()
        b.categories_first(categories=config.spider_categories)
    session.close()
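
# A minimal entry-point sketch, not part of the original module: it assumes
# the intended order of operations is to seed the Url table via first() and
# then run the crawl loop via do() (do() also seeds categories itself when
# the table is empty). Adjust to however this module is actually driven.
if __name__ == '__main__':
    first()  # seed top-level categories if none are pending
    do()     # start the workers, the DB writer thread, and the feed loop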