Code example #1
File: main.py Project: xtwxfxk/left5
def first():
    """Seed the crawl with the top-level best-seller categories."""
    session = Session()
    # only seed when no uncrawled category URLs are pending
    if session.query(Url).filter_by(type=URL_TYPE.BEST_SELL_CATEGORY,
                                    has_crawled=False).count() < 1:
        b = BestSell()  # string_proxy='socks4://192.168.1.188:1080')
        # b.product('B01FQN3W5U')
        b.categories_first(categories=config.spider_categories)
        # b.categories(url_obj=url_obj)
    session.close()
Code example #2
File: main_es.py Project: xtwxfxk/left5
def urlopt(queue):
    """Single DB-writer thread: apply (opt, urls) operations from the queue."""
    session = Session(autocommit=True)

    while True:
        try:
            opt, urls = queue.get(timeout=30)

            if opt == 'add':
                # insert only URLs whose key is not already stored
                url_objs = []
                for url in urls:
                    if session.query(Url).filter_by(key=url.key).count() < 1:
                        logger.info('Opt: Add %s' % url.url)
                        url_objs.append(url)
                session.bulk_save_objects(url_objs)

            elif opt == 'error':
                # a fetch failed: bump the retry counter
                for url in urls:
                    logger.info('Opt: Error %s' % url.url)
                    session.query(Url).filter_by(id=url.id).update(
                        {Url.tries: url.tries + 1})

            elif opt == 'over':
                # crawl finished: mark the URL as done
                for url in urls:
                    logger.info('Opt: Over %s' % url.url)
                    session.query(Url).filter_by(id=url.id).update(
                        {Url.has_crawled: True})

            # session.commit()
        except Empty:
            logger.info('Empty')
        except Exception:
            logger.error(traceback.format_exc())
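
urlopt() drains (opt, urls) tuples where opt is one of 'add' (insert new URLs, deduplicated by key), 'error' (increment the retry counter), or 'over' (mark as crawled). A minimal sketch of wiring it up as the writer thread, assuming the same Session/Url setup as above; crawled_urls is a hypothetical list of Url objects a worker just finished:

from queue import Queue
from threading import Thread

output = Queue(maxsize=2000)
writer = Thread(target=urlopt, args=[output])
writer.start()

# report a finished batch (crawled_urls: hypothetical list of Url objects)
output.put(('over', crawled_urls))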
Code example #3
def category():
    """Poll uncrawled best-seller category URLs and fan them out to workers."""
    session = Session()
    while session.query(Url).filter_by(type=URL_TYPE.BEST_SELL_CATEGORY,
                                       has_crawled=False).count() > 0:
        for i, url_obj in enumerate(
                session.query(Url).filter_by(
                    type=URL_TYPE.BEST_SELL_CATEGORY,
                    has_crawled=False)):  # .limit(20*len(string_proxies))):
            # round-robin across the BestSell workers
            executor.submit(bs[i % len(bs)].categories, url_obj=url_obj)
            time.sleep(0.1)

        time.sleep(30)
Code example #4
def product():
    """Poll uncrawled product URLs and fan them out to workers."""
    try:
        session = Session()
        while session.query(Url).filter_by(type=URL_TYPE.PRODUCT_URL,
                                           has_crawled=False).count() > 0:
            for i, url_obj in enumerate(
                    session.query(Url).filter_by(
                        type=URL_TYPE.PRODUCT_URL,
                        has_crawled=False)):  # .limit(20*len(string_proxies))):
                executor.submit(bs[i % len(bs)].product, url_obj=url_obj)
                time.sleep(0.1)
            time.sleep(30)
    except Exception:
        logger.error(traceback.format_exc())
Code example #5
def category_next():
    """Poll uncrawled "next page" category URLs and fan them out to workers."""
    try:
        session = Session()
        while session.query(Url).filter_by(
                type=URL_TYPE.BEST_SELL_CATEGORY_NEXT,
                has_crawled=False).count() > 0:
            for i, url_obj in enumerate(
                    session.query(Url).filter_by(
                        type=URL_TYPE.BEST_SELL_CATEGORY_NEXT,
                        has_crawled=False)):  # .limit(20*len(string_proxies))):
                executor.submit(bs[i % len(bs)].category_next, url_obj=url_obj)
                time.sleep(0.1)

            time.sleep(30)
    except Exception:
        logger.error(traceback.format_exc())
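
The three polling loops above (category(), product(), category_next()) differ only in the URL type they poll and the worker method they call. A hedged consolidation under the same assumptions (Session, Url, URL_TYPE, executor, bs, logger, time, traceback already defined as in the examples):

def poll_and_submit(url_type, method_name):
    # Generic polling loop: dispatch uncrawled URLs of one type to the
    # BestSell workers, round-robin, until none are left.
    session = Session()
    try:
        while session.query(Url).filter_by(type=url_type,
                                           has_crawled=False).count() > 0:
            for i, url_obj in enumerate(
                    session.query(Url).filter_by(type=url_type,
                                                 has_crawled=False)):
                executor.submit(getattr(bs[i % len(bs)], method_name),
                                url_obj=url_obj)
                time.sleep(0.1)
            time.sleep(30)
    except Exception:
        logger.error(traceback.format_exc())

# product() above would then be equivalent to:
# poll_and_submit(URL_TYPE.PRODUCT_URL, 'product')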
Code example #6
File: main_es.py Project: xtwxfxk/left5
def do():
    """Wire up proxy-backed workers, a DB-writer thread, and the feed loop."""
    input = Queue(maxsize=2000)   # URLs to crawl (note: shadows the builtin)
    output = Queue(maxsize=2000)  # (opt, urls) results consumed by urlopt()

    # one BestSell worker per slot, proxies assigned round-robin
    bs = []
    for i in range(num):
        string_proxy = string_proxies[i % len(string_proxies)]
        b = BestSell(string_proxy=string_proxy, input=input, output=output,
                     domain='amazon.es', cache_root='I:\\cache_amazon_es')
        bs.append(b)
        b.start()

    # single writer thread applies all DB updates coming back from workers
    t = Thread(target=urlopt, args=[output])
    t.start()

    # seed the table with the top-level categories on first run
    session = Session()
    if session.query(Url).count() < 1:
        bs[0].categories_first(categories=spider_categories)
        session.commit()

    # keep feeding URLs that are uncrawled and have fewer than 3 failed tries
    while session.query(Url).filter(and_(Url.has_crawled == False, Url.tries < 3)).count() > 0:
        for url_obj in session.query(Url).filter(and_(Url.has_crawled == False, Url.tries < 3)).limit(1000 * len(string_proxies)):
            try:
                # m = Map(url_obj.as_dict())
                logger.info('put: %s' % url_obj.url)
                input.put(url_obj.as_dict())
            except Exception:
                logger.error(traceback.format_exc())

        logger.info('wait 5 seconds...')
        time.sleep(5)

    logger.info('oooooooooooooooooooooooooooooooover')
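
Taken together, do() and urlopt() implement the retry policy: each fetch failure reported as ('error', urls) increments Url.tries, and the Url.tries < 3 filter in do() stops re-queuing a URL after its third failed attempt, so the feed loop terminates even when some URLs never succeed.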