Esempio n. 1
0
def bsrData_save(dataQ, debug_log, db_log):
    """Drain the ``bsrData`` Redis queue and hand every BSR record to the DB writer.

    Each batch returned by ``dataQ.get_new_bsrData()`` is iterated as a mapping
    of ASIN -> list of records. Every non-empty tuple record is read as
    ``(bsr, bsrc, aday, ...)`` and passed to ``DataOutput.save_data_to_db``
    together with a timestamp taken from ``DataOutput.get_redis_time()``.

    The cursor and connection are always closed, even when a save raises.

    Args:
        dataQ: Queue wrapper exposing ``RedisQ.llen`` and ``get_new_bsrData``.
        debug_log: Logger forwarded to ``DataOutput``.
        db_log: Logger forwarded to ``DataOutput``; its ``war`` method is also
            used here to report that the job finished or the queue was empty.
    """
    print('\nbsrData_save init\n')
    data_type = 'bsr'
    if dataQ.RedisQ.llen('bsrData') <= 0:
        db_log.war('%s, %s数据队列为空\n' %
                   (return_PST().strftime("%Y-%m-%d %H:%M:%S"), data_type))
        return

    dbObj = GetDbObj().get_db_obj()
    cur = dbObj.cursor()
    try:
        dataOutput = DataOutput(dbObj, cur, db_log, debug_log, dataQ)
        db_name = SqlConfig.bsrData_db_name
        update_sql = SqlConfig.bsrData_update_sql
        insert_sql = SqlConfig.bsrData_insert_sql
        while True:
            datas = dataQ.get_new_bsrData()
            if not datas:
                # An empty read does not prove the queue is drained: re-check
                # its length and try once more before giving up.
                if dataQ.RedisQ.llen('bsrData') > 0:
                    datas = dataQ.get_new_bsrData()
                else:
                    break
            for asin in datas:
                tuple_list = datas[asin]
                # One timestamp per ASIN, shared by all of its records.
                tm = int(DataOutput.get_redis_time())
                for record in tuple_list:
                    # Skip empty or non-tuple entries.
                    if not (record and isinstance(record, tuple)):
                        continue
                    data_dict = dict(asin=asin,
                                     bsr=record[0],
                                     bsrc=record[1],
                                     tm=tm,
                                     aday=record[2])
                    dataOutput.save_data_to_db(update_sql,
                                               insert_sql,
                                               asin,
                                               data_dict,
                                               db_name=db_name)
    finally:
        # Release the cursor and the connection on every exit path.
        try:
            cur.close()
        finally:
            dbObj.close()

    db_log.war('%s, %s线程任务已完成\n' %
               (return_PST().strftime("%Y-%m-%d %H:%M:%S"), data_type))
Esempio n. 2
0
def init_crawler_state():
    """Reset ``crawler_state`` to 0 for rows whose ``getinfo_tm`` is before now.

    Issues one UPDATE per crawler table, prints the statement and the number
    of affected rows for each, then commits all three together. The cursor and
    connection are closed even if one of the statements fails; in that case
    nothing is committed.
    """
    time_now = int(time.time() * 1000)  # current time in milliseconds
    tables = (
        'public.amazon_product_data',
        'public.amazon_product_data_tosell',
        'public.amazon_keyword_data',
    )
    dbObj = GetDbObj().get_db_obj()
    cur = dbObj.cursor()
    try:
        for step, table in enumerate(tables, start=1):
            # Both interpolated values are produced locally (a fixed table
            # name and an int), so the statement text is safe to build and is
            # printed exactly as executed.
            update_sql = 'update %s set crawler_state=0 where getinfo_tm < %s;' % (
                table, time_now)
            cur.execute(update_sql)
            print('%s %s %s行更新成功' % (step, update_sql, cur.rowcount))
        dbObj.commit()
    finally:
        try:
            cur.close()
        finally:
            dbObj.close()
Esempio n. 3
0
def goods_data_save(dataQ, debug_log, db_log):
    """Drain the ``goodsData`` Redis queue and persist each product record.

    For every ``asin -> data`` entry of a batch the function:

    1. stamps ``data['getinfo_tm']`` with ``DataOutput.get_redis_time()``;
    2. looks up a row for the same ASIN from the last three days and, when one
       exists, back-fills missing price/title, keeps the larger review count,
       and falls back to the previous quantity when the new one is ``-1``;
    3. defaults ``dpre`` and ``cart_price`` to ``price`` when they are unset;
    4. unless the record is held back (before 09:00 PST with a negative
       quantity and ``asin_state == 3``), sends it to ``druidData_to_db``,
       strips the history-only fields and sends it to
       ``DataOutput.save_data_to_db``, then records the crawl time.

    The cursor and connection are always closed, even when a step raises.

    Args:
        dataQ: Queue wrapper exposing ``RedisQ.llen`` and ``get_new_goods_data``.
        debug_log: Logger forwarded to ``DataOutput``.
        db_log: Logger forwarded to ``DataOutput``; its ``war`` method is also
            used here to report that the job finished or the queue was empty.
    """
    # Function-scope import, hoisted out of the per-item loop.
    from pprint import pprint

    print('\ngoods_save init\n')
    data_type = 'goods'
    if dataQ.RedisQ.llen('goodsData') <= 0:
        db_log.war('%s, %s数据队列为空\n' %
                   (return_PST().strftime("%Y-%m-%d %H:%M:%S"), data_type))
        return

    three_days_ms = 3 * 24 * 3600 * 1000
    # Fields that the history store needs but the update table does not.
    history_only_fields = ('dpre', 'bs1', 'qc', 'qtydt', 'aday')

    dbObj = GetDbObj().get_db_obj()
    cur = dbObj.cursor()
    try:
        dataOutput = DataOutput(dbObj, cur, db_log, debug_log, dataQ)
        db_name = SqlConfig.goods_db_name
        update_sql = SqlConfig.goods_update_sql
        insert_sql = SqlConfig.goods_insert_sql
        while True:
            datas = dataQ.get_new_goods_data()
            the_hour = return_PST().hour
            if not datas:
                # An empty read does not prove the queue is drained: re-check
                # its length and try once more before giving up.
                if dataQ.RedisQ.llen('goodsData') > 0:
                    datas = dataQ.get_new_goods_data()
                else:
                    break
            tm = DataOutput.get_redis_time()
            for asin, data in datas.items():
                pprint(data)

                print(data['getinfo_tm'], 1)
                data['getinfo_tm'] = tm
                print(data['getinfo_tm'], 2)
                print('rc1: ', data['rc'])
                print('quantity1', data['quantity'])
                sql = "select rc, quantity, price, title, bsr from public.amazon_product_data where asin=%(asin)s and getinfo_tm>%(the_tm)s ;"
                select_dict = {
                    'asin': data['asin'],
                    # Millisecond cutoff three days back, computed without a
                    # round trip through float seconds.
                    'the_tm': tm - three_days_ms,
                }
                cur.execute(sql, select_dict)
                select_rows = cur.fetchall()
                if len(select_rows) > 0:
                    print(select_rows, type(select_rows), type(select_rows[0]),
                          type(select_rows[0][0]))
                    (the_old_rc, the_old_qty, the_old_price, the_old_title,
                     the_old_bsr) = select_rows[0]
                    print(the_old_rc, the_old_qty, the_old_price,
                          the_old_title, the_old_bsr)
                    the_new_qty = data['quantity']
                    print('price1', data['price'], the_old_price)
                    # No price this time: reuse the previous one.
                    if (data['price'] <= 0 and the_old_price > 0
                            and data['asin_state'] == 3):
                        data['price'] = the_old_price
                    # No title this time: reuse the previous one.
                    if not data['title'] and the_old_title:
                        data['title'] = the_old_title
                    print('the_old_rc', the_old_rc, type(the_old_rc))
                    print('old quantity', the_old_qty, type(the_old_qty))
                    print('new quantity', the_new_qty, type(the_new_qty))
                    print("data['rc']", data['rc'], type(data['rc']))
                    # Review count went down: keep the previous value.
                    if data.get('rc', 0) < the_old_rc:
                        data['rc'] = the_old_rc
                    # Quantity crawl failed (-1): fall back to the previous
                    # quantity and log the substitution to a CSV file.
                    if (the_new_qty == -1 and the_old_qty >= 0
                            and data['asin_state'] == 3):
                        data['quantity'] = the_old_qty
                        data['qtydt'] = 4
                        with open('quantity_fail.csv', 'a',
                                  encoding='utf-8') as f:
                            f.write('asin, %s, old qty, %s, new qty, %s\n' %
                                    (data['asin'], the_old_qty, the_new_qty))

                    # NOTE(review): this "not for sale" reset only runs when a
                    # previous row exists -- confirm that is intended.
                    if data['asin_state'] == 2:
                        data['quantity'] = 0
                        data['byb'] = 0
                        data['qtydt'] = 5  # not for sale

                # No dpre: fall back to price.
                if data['dpre'] <= 0 and data['price'] > 0:
                    data['dpre'] = data['price']
                # No cart_price: fall back to price.
                if data['cart_price'] <= 0 and data['price'] > 0:
                    data['cart_price'] = data['price']
                print('price2', data['price'])
                print('quantity2', data['quantity'])
                print('rc2: ', data['rc'])

                # Before 09:00 a still-negative quantity is not written at
                # all; the record is simply skipped for now.
                hold_back = (the_hour < 9 and data['quantity'] < 0
                             and data['asin_state'] == 3)
                if hold_back:
                    continue

                # First send the full record to the history store ...
                druidData_to_db(asin, data, dataOutput)

                # ... then strip the fields the update table does not need
                # and send the rest to the update table.
                for field in history_only_fields:
                    data.pop(field)
                dataOutput.save_data_to_db(update_sql,
                                           insert_sql,
                                           asin,
                                           data,
                                           db_name=db_name)
                # Record the update time.
                dataOutput.crawler_tm(asin, data_type)
    finally:
        # Release the cursor and the connection on every exit path.
        try:
            cur.close()
        finally:
            dbObj.close()

    db_log.war('%s, %s线程任务已完成\n' %
               (return_PST().strftime("%Y-%m-%d %H:%M:%S"), data_type))
Esempio n. 4
0
def tosell_save(dataQ, debug_log, db_log):
    """Drain the ``tosellData`` Redis queue and persist seller/offer records.

    Each batch maps an ASIN to a pair ``(summary_dict, offers_list)``. The
    summary dict carries keys such as ``asin``, ``getinfo_tm``, ``sname`` and
    ``seller_id``; each offer dict carries keys such as ``aday``, ``price``,
    ``qty``, ``seller_id`` and ``tm``.

    For every ASIN that has no ``public.amazon_product_tosell`` row for the
    offer day yet, the summary is written (with ``sname``/``seller_id`` looked
    up from ``public.amazon_product_data`` when missing), every offer is
    written with ``tm`` set to the current time in seconds, and the crawl time
    is recorded.

    The cursor and connection are always closed, even when a step raises.

    Args:
        dataQ: Queue wrapper exposing ``RedisQ.llen`` and ``get_new_tosellData``.
        debug_log: Logger forwarded to ``DataOutput``.
        db_log: Logger forwarded to ``DataOutput``; its ``war`` method is also
            used here to report that the job finished or the queue was empty.
    """
    print('\ntosell_save init\n')
    data_type = 'tosell'
    if dataQ.RedisQ.llen('tosellData') <= 0:
        db_log.war('%s, %s数据队列为空\n' %
                   (return_PST().strftime("%Y-%m-%d %H:%M:%S"), data_type))
        return

    dbObj = GetDbObj().get_db_obj()
    cur = dbObj.cursor()
    try:
        dataOutput = DataOutput(dbObj, cur, db_log, debug_log, dataQ)
        data_tosell_db_name = SqlConfig.data_tosell_db_name
        data_tosell_update_sql = SqlConfig.data_tosell_update_sql
        data_tosell_insert_sql = SqlConfig.data_tosell_insert_sql

        druid_tosell_db_name = SqlConfig.druid_tosell_db_name
        # Offer rows are insert-only: no update statement is supplied.
        druid_tosell_update_sql = None
        druid_tosell_insert_sql = SqlConfig.druid_tosell_insert_sql
        while True:
            datas = dataQ.get_new_tosellData()
            pprint(datas)
            if not datas:
                # An empty read does not prove the queue is drained: re-check
                # its length and try once more before giving up.
                if dataQ.RedisQ.llen('tosellData') > 0:
                    datas = dataQ.get_new_tosellData()
                else:
                    break
            tm = DataOutput.get_redis_time()
            for asin in datas:
                tosell_datas = datas[asin][0]
                tosell_list = datas[asin][1]

                pprint(tosell_datas)
                pprint(tosell_list)
                print(tosell_datas['getinfo_tm'], 1)
                tosell_datas['getinfo_tm'] = tm
                print(tosell_datas['getinfo_tm'], 2)
                sql = "select asin, aday from public.amazon_product_tosell where asin=%(asin)s and aday=%(aday)s limit 1;"
                # Use the day of the first offer, or today (PST) when the
                # offer list is empty.
                aday = tosell_list[0]['aday'] if len(
                    tosell_list) > 0 else return_PST().strftime('%Y%m%d')
                select_dict = {'asin': asin, 'aday': aday}
                cur.execute(sql, select_dict)
                select_rows = cur.fetchall()
                dbObj.commit()
                if len(select_rows) >= 1:
                    # A row for this ASIN and day already exists.
                    continue

                if not tosell_datas.get('sname'):
                    print(222222)
                    # Parameterised: the ASIN comes from the queue and must
                    # not be spliced into the SQL text.
                    sql1 = ("select sname, seller_id from public.amazon_product_data "
                            "where asin=%(asin)s and getinfo_tm > %(the_tm)s")
                    cur.execute(sql1, {
                        'asin': asin,
                        'the_tm': tm - 24 * 3600 * 1000,
                    })
                    seller_rows = cur.fetchall()
                    dbObj.commit()
                    # Only an unambiguous single match is used; otherwise
                    # both fields are left blank.
                    sname, seller_id = seller_rows[0] if len(
                        seller_rows) == 1 else ('', '')
                    print('seller_id: ', seller_id)
                    print('sname ', sname)
                    tosell_datas['sname'] = sname
                    tosell_datas['seller_id'] = seller_id

                dataOutput.save_data_to_db(
                    data_tosell_update_sql,
                    data_tosell_insert_sql,
                    asin,
                    tosell_datas,
                    db_name=data_tosell_db_name)

                for offer in tosell_list:
                    offer['tm'] = int(tm / 1000)  # milliseconds -> seconds
                    dataOutput.save_data_to_db(
                        druid_tosell_update_sql,
                        druid_tosell_insert_sql,
                        asin,
                        offer,
                        db_name=druid_tosell_db_name)

                # Record the update time.
                dataOutput.crawler_tm(asin, data_type)
    finally:
        # Release the cursor and the connection on every exit path.
        try:
            cur.close()
        finally:
            dbObj.close()

    db_log.war('%s, %s线程任务已完成\n' %
               (return_PST().strftime("%Y-%m-%d %H:%M:%S"), data_type))
Esempio n. 5
0
def tosell_save(dataQ, debug_log, db_log):
    """Drain the ``tosellData`` Redis queue and persist seller/offer records.

    Each batch maps an ASIN to a pair ``(summary_dict, offers_list)``. For
    every ASIN that has no ``public.amazon_product_tosell`` row for the offer
    day yet, the summary is written (with ``sname``/``seller_id`` looked up
    from ``public.amazon_product_data`` when missing), every offer is written
    with ``tm`` set to the current time in seconds, and the crawl time is
    recorded.

    The cursor and connection are always closed, even when a step raises.

    Args:
        dataQ: Queue wrapper exposing ``RedisQ.llen`` and ``get_new_tosellData``.
        debug_log: Logger forwarded to ``DataOutput``.
        db_log: Logger forwarded to ``DataOutput``; its ``war`` method is also
            used here to report that the job finished or the queue was empty.
    """
    print('\ntosell_save init\n')
    data_type = 'tosell'
    if dataQ.RedisQ.llen('tosellData') <= 0:
        db_log.war('%s, %s数据队列为空\n' %
                   (return_PST().strftime("%Y-%m-%d %H:%M:%S"), data_type))
        return

    dbObj = GetDbObj().get_db_obj()
    cur = dbObj.cursor()
    try:
        dataOutput = DataOutput(dbObj, cur, db_log, debug_log, dataQ)
        data_tosell_db_name = SqlConfig.data_tosell_db_name
        data_tosell_update_sql = SqlConfig.data_tosell_update_sql
        data_tosell_insert_sql = SqlConfig.data_tosell_insert_sql

        druid_tosell_db_name = SqlConfig.druid_tosell_db_name
        # Offer rows are insert-only: no update statement is supplied.
        druid_tosell_update_sql = None
        druid_tosell_insert_sql = SqlConfig.druid_tosell_insert_sql
        while True:
            datas = dataQ.get_new_tosellData()
            if not datas:
                # An empty read does not prove the queue is drained: re-check
                # its length and try once more before giving up.
                if dataQ.RedisQ.llen('tosellData') > 0:
                    datas = dataQ.get_new_tosellData()
                else:
                    break
            tm = DataOutput.get_redis_time()
            for asin in datas:
                tosell_datas = datas[asin][0]
                tosell_list = datas[asin][1]
                print(tosell_datas['getinfo_tm'], 1)
                tosell_datas['getinfo_tm'] = tm
                print(tosell_datas['getinfo_tm'], 2)
                sql = "select asin, aday from public.amazon_product_tosell where asin=%(asin)s and aday=%(aday)s limit 1;"
                # Use the day of the first offer, or today (PST) when the
                # offer list is empty.
                aday = tosell_list[0]['aday'] if len(
                    tosell_list) > 0 else return_PST().strftime('%Y%m%d')
                select_dict = {'asin': asin, 'aday': aday}
                cur.execute(sql, select_dict)
                select_rows = cur.fetchall()
                dbObj.commit()
                if len(select_rows) >= 1:
                    # A row for this ASIN and day already exists.
                    continue

                print(tosell_datas)
                if not tosell_datas.get('sname'):
                    # Parameterised: the ASIN comes from the queue and must
                    # not be spliced into the SQL text.
                    sql1 = ("select sname, seller_id from public.amazon_product_data "
                            "where asin=%(asin)s and getinfo_tm > %(the_tm)s")
                    cur.execute(sql1, {
                        'asin': asin,
                        'the_tm': tm - 24 * 3600 * 1000,
                    })
                    seller_rows = cur.fetchall()
                    dbObj.commit()
                    # Only an unambiguous single match is used; otherwise
                    # both fields are left blank.
                    sname, seller_id = seller_rows[0] if len(
                        seller_rows) == 1 else ('', '')
                    print('seller_id: ', seller_id)
                    print('sname ', sname)
                    tosell_datas['sname'] = sname
                    tosell_datas['seller_id'] = seller_id

                dataOutput.save_data_to_db(
                    data_tosell_update_sql,
                    data_tosell_insert_sql,
                    asin,
                    tosell_datas,
                    db_name=data_tosell_db_name)
                for offer in tosell_list:
                    offer['tm'] = int(tm / 1000)  # milliseconds -> seconds
                    dataOutput.save_data_to_db(
                        druid_tosell_update_sql,
                        druid_tosell_insert_sql,
                        asin,
                        offer,
                        db_name=druid_tosell_db_name)

                # Record the update time.
                dataOutput.crawler_tm(asin, data_type)
    finally:
        # Release the cursor and the connection on every exit path.
        try:
            cur.close()
        finally:
            dbObj.close()

    db_log.war('%s, %s线程任务已完成\n' %
               (return_PST().strftime("%Y-%m-%d %H:%M:%S"), data_type))
Esempio n. 6
0
def reviews_save(dataQ, debug_log, db_log):
    """Drain the ``reviewsData`` Redis queue and persist review records.

    Each batch maps an ASIN to a list of review dicts whose ``date`` is
    compared against integer ``YYYYMMDD`` cutoffs. On the first download of an
    ASIN (per ``dataQ.is_first_download``) reviews from the last 90 days are
    written; otherwise only reviews dated yesterday or later are written. The
    crawl time is recorded for every ASIN in the batch.

    The cursor and connection are always closed, even when a step raises.

    Args:
        dataQ: Queue wrapper exposing ``RedisQ.llen``, ``get_new_reviewsData``
            and ``is_first_download``.
        debug_log: Logger forwarded to ``DataOutput``.
        db_log: Logger forwarded to ``DataOutput``; its ``war`` method is also
            used here to report that the job finished or the queue was empty.
    """
    print('\nreviews_save init\n')
    data_type = 'reviews'
    if dataQ.RedisQ.llen('reviewsData') <= 0:
        db_log.war('%s, %s数据队列为空\n' %
                   (return_PST().strftime("%Y-%m-%d %H:%M:%S"), data_type))
        return

    dbObj = GetDbObj().get_db_obj()
    cur = dbObj.cursor()
    try:
        dataOutput = DataOutput(dbObj, cur, db_log, debug_log, dataQ)
        reviews_db_name = SqlConfig.reivews_db_name
        reviews_update_sql = SqlConfig.reivews_update_sql
        reviews_insert_sql = SqlConfig.reivews_insert_sql

        # Date cutoffs as YYYYMMDD integers, relative to the current PST time.
        now_pst = return_PST()
        three_mon_date = int((now_pst - timedelta(days=90)).strftime('%Y%m%d'))
        yesterday_date = int((now_pst - timedelta(days=1)).strftime('%Y%m%d'))

        while True:
            datas = dataQ.get_new_reviewsData()
            if not datas:
                # An empty read does not prove the queue is drained: re-check
                # its length and try once more before giving up.
                if dataQ.RedisQ.llen('reviewsData') > 0:
                    datas = dataQ.get_new_reviewsData()
                else:
                    break

            for asin in datas:
                dict_list = datas[asin]
                md5key = DataOutput.get_md5_key(asin + 'reviewsFirst')
                first = dataQ.is_first_download(md5key)
                for position, review in enumerate(dict_list, start=1):
                    if first:
                        # First download: keep reviews from the last 90 days.
                        if review['date'] >= three_mon_date:
                            # The md5key accompanies only the first two items
                            # of the list, to avoid needless repeated writes
                            # while still covering a failed first write.
                            # NOTE(review): the original comment said "3
                            # times" but the check has always been `< 3`.
                            dataOutput.save_data_to_db(
                                reviews_update_sql,
                                reviews_insert_sql,
                                asin,
                                review,
                                db_name=reviews_db_name,
                                md5key=md5key if position < 3 else None)
                    elif review['date'] >= yesterday_date:
                        # Later downloads: only reviews from yesterday onward.
                        dataOutput.save_data_to_db(
                            reviews_update_sql,
                            reviews_insert_sql,
                            asin,
                            review,
                            db_name=reviews_db_name)
                # Record the update time.
                dataOutput.crawler_tm(asin, data_type)
    finally:
        # Release the cursor and the connection on every exit path.
        try:
            cur.close()
        finally:
            dbObj.close()

    db_log.war('%s, %s线程任务已完成\n' %
               (return_PST().strftime("%Y-%m-%d %H:%M:%S"), data_type))