コード例 #1
0
 def record_disabled_tosell(the_asin, data_type=''):
     """Flag an ASIN's to-sell record with crawler_state=2 and stamp the monitor row.

     An UPDATE on ``amazon_product_data_tosell`` is attempted first; when it
     affects no row, an INSERT is issued instead.  After either write is
     committed, ``amazon_product_monitor.tosell_tm_crawler`` is refreshed for
     the same ASIN.  Database errors are rolled back and printed, not raised.

     Args:
         the_asin: ASIN whose to-sell record is written.
         data_type: Unused; kept so existing callers keep working.
     """
     data_dict = {
         'asin': the_asin,
         'getinfo_tm': int(DataOutput.get_redis_time())
     }
     update_sql = "update public.amazon_product_data_tosell set is_sync=0, crawler_state=2, getinfo_tm=%(getinfo_tm)s where asin=%(asin)s;"
     insert_sql = "INSERT INTO public.amazon_product_data_tosell(asin, getinfo_tm, crawler_state) VALUES (%(asin)s, %(getinfo_tm)s, 2);"
     monitor_sql = "update public.amazon_product_monitor set tosell_tm_crawler=%(tosell_tm_crawler)s where asin=%(asin)s;"

     def monitor_params():
         # Built lazily so the timestamp is read right before the monitor write.
         return {
             'asin': the_asin,
             'tosell_tm_crawler': int(DataOutput.get_redis_time() / 1000)
         }

     conn = psycopg2.connect(**DATADB_CONFIG[BASE_TYPE])
     cur = conn.cursor()
     try:
         print(update_sql)
         cur.execute(update_sql, data_dict)
         row = cur.rowcount
         print(1, row)
         if row > 0:
             conn.commit()
             print('\namazon_product_data_tosell,%s,%s,%s行,更新成功\n' %
                   (return_PST().strftime("%Y-%m-%d %H:%M:%S"), the_asin,
                    row))
             cur.execute(monitor_sql, monitor_params())
             conn.commit()
         else:
             # Nothing to update: discard the empty transaction and insert.
             conn.rollback()
             cur.execute(insert_sql, data_dict)
             row = cur.rowcount
             print(2, row)
             if row > 0:
                 conn.commit()
                 cur.execute(monitor_sql, monitor_params())
                 conn.commit()
                 print('\namazon_product_data_tosell,%s,%s,%s行,插入成功\n' %
                       (return_PST().strftime("%Y-%m-%d %H:%M:%S"),
                        the_asin, row))
             else:
                 conn.rollback()
     except Exception as e:
         conn.rollback()
         datas = {the_asin: data_dict}
         print('失败 amazon_product_data ', e, datas)
     finally:
         # Single cleanup point: runs on success, on handled errors, and even
         # when the rollback above itself fails.
         cur.close()
         conn.close()
コード例 #2
0
 def record_not_found_keyword(the_kw, data_type=''):
     """Record a keyword as "not found" (search_num=-2) and stamp its monitor row.

     An UPDATE on ``amazon_keyword_data`` is attempted first; when it affects
     no row, an INSERT is issued instead.  After either write is committed,
     ``amazon_keyword_monitor.crawler_tm`` is refreshed for the keyword.
     Database errors are rolled back and swallowed.

     Args:
         the_kw: Keyword to record.
         data_type: Unused; kept so existing callers keep working.
     """
     data_dict = {
         'kw': the_kw,
         'search_num': -2,
         'getinfo_tm': int(DataOutput.get_redis_time())
     }
     update_sql = "update public.amazon_keyword_data set kw=%(kw)s, search_num=%(search_num)s, getinfo_tm=%(getinfo_tm)s, is_sync=0, crawler_state=1 where kw=%(kw)s;"
     insert_sql = "INSERT INTO public.amazon_keyword_data(kw, search_num, getinfo_tm, crawler_state) VALUES (%(kw)s, %(search_num)s, %(getinfo_tm)s, 1);"
     monitor_sql = "update public.amazon_keyword_monitor set crawler_tm=%(crawler_tm)s where kw=%(kw)s;"

     def monitor_params():
         # The key must be 'kw' to match the %(kw)s placeholder in monitor_sql;
         # the previous 'asin' key made every monitor update fail with KeyError.
         return {
             'kw': the_kw,
             'crawler_tm': int(DataOutput.get_redis_time() / 1000)
         }

     conn = psycopg2.connect(**DATADB_CONFIG[BASE_TYPE])
     cur = conn.cursor()
     try:
         cur.execute(update_sql, data_dict)
         row = cur.rowcount
         if row > 0:
             conn.commit()
             print(
                 '\namazon_keyword_data,%s,%s,%s行,更新成功\n' %
                 (return_PST().strftime("%Y-%m-%d %H:%M:%S"), the_kw, row))
             cur.execute(monitor_sql, monitor_params())
             conn.commit()
         else:
             # Nothing to update: discard the empty transaction and insert.
             conn.rollback()
             cur.execute(insert_sql, data_dict)
             row = cur.rowcount
             if row > 0:
                 conn.commit()
                 cur.execute(monitor_sql, monitor_params())
                 conn.commit()
                 print('\namazon_keyword_data,%s,%s,%s行,插入成功\n' %
                       (return_PST().strftime("%Y-%m-%d %H:%M:%S"), the_kw,
                        row))
             else:
                 conn.rollback()
     except Exception:
         # Failure reporting was disabled in the original code, so errors
         # remain silent here; only the open transaction is undone.
         conn.rollback()
     finally:
         # Single cleanup point for both the success and the failure path.
         cur.close()
         conn.close()
コード例 #3
0
ファイル: bsrDataOutput.py プロジェクト: xusu12/hs_code
def bsrData_save(dataQ, debug_log, db_log):
    """Drain the 'bsrData' Redis queue and store every BSR record it holds.

    Each batch returned by ``dataQ.get_new_bsrData()`` maps an ASIN to a list
    of tuples whose first three elements are used as ``(bsr, bsrc, aday)``.
    Every tuple is written through ``DataOutput.save_data_to_db``.  Draining
    stops once a fetch returns nothing and the queue length is zero.

    Args:
        dataQ: Queue wrapper exposing ``RedisQ.llen`` and ``get_new_bsrData``.
        debug_log: Logger handed to ``DataOutput``.
        db_log: Logger handed to ``DataOutput``; also receives the final
            status line via ``war``.
    """
    print('\nbsrData_save init\n')
    data_type = 'bsr'
    if not dataQ.RedisQ.llen('bsrData') > 0:
        db_log.war('%s, %s数据队列为空\n' %
                   (return_PST().strftime("%Y-%m-%d %H:%M:%S"), data_type))
        return

    dbObj = GetDbObj().get_db_obj()
    cur = dbObj.cursor()
    try:
        dataOutput = DataOutput(dbObj, cur, db_log, debug_log, dataQ)
        db_name = SqlConfig.bsrData_db_name
        update_sql = SqlConfig.bsrData_update_sql
        insert_sql = SqlConfig.bsrData_insert_sql
        while True:
            datas = dataQ.get_new_bsrData()
            if not datas:
                if dataQ.RedisQ.llen('bsrData') > 0:
                    # The queue still has entries: fetch again from the top
                    # instead of iterating a possibly empty/None result.
                    continue
                break
            for asin in datas:
                tuple_list = datas[asin]
                # One timestamp per ASIN, shared by all of its records.
                tm = int(DataOutput.get_redis_time())
                for record in tuple_list:
                    if record and type(record) is tuple:
                        data_dict = dict(asin=asin,
                                         bsr=record[0],
                                         bsrc=record[1],
                                         tm=tm,
                                         aday=record[2])
                        dataOutput.save_data_to_db(update_sql,
                                                   insert_sql,
                                                   asin,
                                                   data_dict,
                                                   db_name=db_name)
    finally:
        # Release the DB handles even when draining fails midway.
        cur.close()
        dbObj.close()
    db_log.war('%s, %s线程任务已完成\n' %
               (return_PST().strftime("%Y-%m-%d %H:%M:%S"), data_type))
コード例 #4
0
ファイル: url_retry.py プロジェクト: xusu12/hs_code
def goods_retry(urlQ):
    """Refill the goods URL queue with monitored products that need a retry.

    The 'goodsUrlQueue' queue is emptied first.  When ``get_init_updae_tm``
    yields a truthy threshold, up to 8000 rows are selected from
    ``amazon_product_monitor`` (state = 1, monitor_type > 0, crawled before
    the threshold, and without a druid row for the date string produced by
    ``return_PST()``) and handed to ``add_url_to_queue``.

    Args:
        urlQ: Queue wrapper exposing ``RedisQ`` and ``retrieve_asin``.
    """
    queue_key = 'goodsUrlQueue'
    kind = 'goods'
    empty_url_queue(urlQ.RedisQ, queue_key)
    sql_times = get_init_updae_tm(urlQ)
    if not sql_times:
        return

    # Retry products that were not downloaded.  A second query for failed
    # inventory downloads existed in the original code but was disabled.
    query_template = (
        "select asin, monitor_type, aid, info_tm from public.amazon_product_monitor where state = 1 "
        "and monitor_type > 0 and info_tm_crawler < %s and asin not in (select asin from public.amazon_druid_product_data where aday='%s') order by info_tm_crawler limit 8000;"
    )
    query = query_template % (sql_times, return_PST().strftime('%Y%m%d'))

    print('\ngoods 重试进程 %s\n' % (sql_times))
    pending = urlQ.retrieve_asin(query)
    pending_total = len(pending)
    print('需要重试的商品数 %s' % (pending_total))
    if pending_total > 0:
        add_url_to_queue(urlQ,
                         pending,
                         url_type=kind,
                         sql_times=sql_times)
コード例 #5
0
    def update_getdata_tm(data_dict, data_type, dataQ=None, db_log=None):
        """Stamp the crawler timestamp column that belongs to ``data_type``.

        Supported types are 'goods', 'reviews', 'tosell' (columns on
        ``amazon_product_monitor``) and 'keyword' (``amazon_keyword_monitor``).
        Any other ``data_type`` is a no-op.  The update is committed when it
        affects at least one row and rolled back otherwise.

        Args:
            data_dict: Parameters for the statement; must contain the keys
                named by the placeholders of the selected SQL.
            data_type: Selects which column/table is updated.
            dataQ: Unused; kept so existing callers keep working.
            db_log: Optional logger; receives one info line on success.
        """
        # data_type -> (label used in the log line, parameterized statement)
        targets = {
            'goods': (
                'public.amazon_product_monitor.info_tm_crawler',
                "update public.amazon_product_monitor set info_tm_crawler=%(info_tm_crawler)s where asin=%(asin)s;",
            ),
            'reviews': (
                'public.amazon_product_monitor.comment_tm_crawler',
                "update public.amazon_product_monitor set comment_tm_crawler=%(comment_tm_crawler)s where asin=%(asin)s;",
            ),
            'tosell': (
                'public.amazon_product_monitor.tosell_tm_crawler',
                "update public.amazon_product_monitor set tosell_tm_crawler=%(tosell_tm_crawler)s where asin=%(asin)s;",
            ),
            'keyword': (
                'public.amazon_keyword_monitor.crawler_tm',
                "update public.amazon_keyword_monitor set crawler_tm=%(crawler_tm)s where kw=%(kw)s;",
            ),
        }
        db_type, sql = targets.get(data_type, ('', ''))
        if not sql:
            return

        conn = psycopg2.connect(**DATADB_CONFIG[BASE_TYPE])
        cur = conn.cursor()
        try:
            cur.execute(sql, data_dict)
            row = cur.rowcount
            if row > 0:
                if db_log:
                    db_log.info('爬虫数据更新时间 %s,%s,%s,%s行,标记成功' %
                            (db_type, return_PST().strftime("%Y-%m-%d %H:%M:%S"), data_type, row))
                conn.commit()
            else:
                conn.rollback()
        finally:
            # Errors still propagate to the caller, but the connection is no
            # longer leaked when execute() or commit() raises.
            cur.close()
            conn.close()
コード例 #6
0
def select_analyzer_state(asins, kws, urlQ, aid, tid, aday=None):
    """Check crawl progress for a task and mark it finished when nearly complete.

    Counts product rows and collects keywords that have fresh data (newer
    than the second value of ``get_the_time()`` minus two hours).  When fewer
    than 10% of the distinct keywords are still missing, the analyzer state
    is set to 2 through ``update_analyzer_state``.

    Args:
        asins: ASINs belonging to the task (duplicates are ignored).
        kws: Keywords belonging to the task (duplicates are ignored).
        urlQ: Object exposing ``retrieve_asin(sql, params)``.
        aid: Analyzer id forwarded to ``update_analyzer_state``.
        tid: Task id forwarded to ``update_analyzer_state``.
        aday: Unused; kept so existing callers keep working.

    Returns:
        True when the task was marked finished, otherwise None.
    """
    _, init_time = get_the_time()
    # NOTE(review): init_time looks like epoch seconds, while getinfo_tm is
    # written elsewhere from an un-divided get_redis_time() value - confirm
    # that both sides of the comparison use the same unit.
    since = str(init_time - 3600 * 2)
    asin_tuple = tuple(set(asins))
    kw_tulep = tuple(set(kws))

    # The space before "and" is required: without it the number and the
    # keyword fuse into one token ("...> 123.0and asin in ...").
    asin_sql = 'select count(*) from amazon_product_data where getinfo_tm > ' + since + ' and asin in %s;'
    # "IN ()" is not valid SQL, so an empty tuple must not reach the query.
    asin_rows = urlQ.retrieve_asin(asin_sql, [asin_tuple]) if asin_tuple else []
    if len(asin_rows) > 0 and type(asin_rows[0]) is tuple and len(asin_rows[0]) > 0:
        asin_count = asin_rows[0][0]
    else:
        asin_count = 0
    print(asin_count, type(asin_count))

    sql2 = ("select kw from public.amazon_druid_keyword_data where tm > " + since
            + " and kw in %s group by kw;")
    print(sql2)
    kw_rows = urlQ.retrieve_asin(sql2, [kw_tulep]) if kw_tulep else []
    rows = [x[0] for x in kw_rows if type(x) is tuple and len(x) > 0]
    finish_kws = json.dumps(rows)
    finish_count = len(rows)
    print(finish_kws, type(finish_kws))
    print('总共 %s 个asin, %s 个关键词\n已更新%s个asin, %s个关键词' % (len(asin_tuple), len(kw_tulep), asin_count, finish_count))
    # Once (more than) 90% of the keywords are done the task counts as finished.
    if len(kw_tulep) - finish_count < len(kw_tulep) * 0.1:
        # Mark the crawl as completed (crawler_state=2).
        update_analyzer_state(tid, aid, 2, finish_count, finish_kws=finish_kws)
        return True
コード例 #7
0
def get_the_time():
    """Return a pair of epoch timestamps anchored at 15:00:00 local time.

    Normally the pair is (today 15:00, yesterday 15:00).  When the hour from
    ``return_PST()`` lies in 0..10 while the local hour lies in 15..23, the
    pair is shifted forward one day: (tomorrow 15:00, today 15:00).

    Returns:
        tuple: ``(later_timestamp, earlier_timestamp)``, 86400 seconds apart.
    """
    local_now = datetime.now()
    pst_now = return_PST()

    # Build "<YYYYmmdd>150000" and turn it into a local epoch timestamp.
    anchor_text = local_now.strftime('%Y%m%d') + '150000'
    anchor_ts = time.mktime(time.strptime(anchor_text, '%Y%m%d%H%M%S'))

    # The Pacific date has already rolled over while the local day is still
    # in its evening hours, so the window moves one day ahead.
    pst_early_hours = 0 <= pst_now.hour <= 10
    local_evening = 15 <= local_now.hour <= 23
    if pst_early_hours and local_evening:
        return anchor_ts + 86400, anchor_ts
    return anchor_ts, anchor_ts - 86400
コード例 #8
0
 def record_not_found_reviews(the_asin, data_type=''):
     """Stamp ``comment_tm_crawler`` on the monitor row of an ASIN.

     The update is committed only when it affects at least one row.
     Database errors are rolled back and swallowed.

     Args:
         the_asin: ASIN whose monitor row is stamped.
         data_type: Unused; kept so existing callers keep working.
     """
     data_dict = {
         'asin': the_asin,
         'comment_tm_crawler': int(DataOutput.get_redis_time() / 1000)
     }
     db_name = 'public.amazon_product_monitor.comment_tm_crawler'
     update_sql = "update public.amazon_product_monitor set comment_tm_crawler=%(comment_tm_crawler)s where asin=%(asin)s;"
     conn = psycopg2.connect(**DATADB_CONFIG[BASE_TYPE])
     cur = conn.cursor()
     try:
         cur.execute(update_sql, data_dict)
         row = cur.rowcount
         if row > 0:
             conn.commit()
             print('%s,%s,%s,%s行,更新成功' %
                   (return_PST().strftime("%Y-%m-%d %H:%M:%S"), db_name,
                    the_asin, row))
     except Exception:
         # Failure reporting was disabled in the original code, so errors
         # remain silent here; only the open transaction is undone.
         conn.rollback()
     finally:
         # Single cleanup point for both the success and the failure path.
         cur.close()
         conn.close()
コード例 #9
0
ファイル: productModel.py プロジェクト: xusu12/hs_code
    def __init__(self):
        """Set up DB configuration, table handles and the date window.

        A private copy of ``DATADB_CONFIG[BASE_TYPE]`` is passed to the parent
        initializer.  For each of the four tables used by this model the table
        name and the object returned by ``get_class`` are stored on the
        instance.  ``today`` comes from ``return_PST()`` and ``old_date`` is
        three days before it.
        """
        # Work on a copy so the shared configuration is never mutated.
        config = deepcopy(DATADB_CONFIG[BASE_TYPE])
        self.db_config = config
        self.database = config['database']
        super(ProductModel, self).__init__(config)

        # amazon_product_data table (used for CRUD)
        product_table = 'amazon_product_data'
        self.product_name = product_table
        self.product_class = self.get_class(product_table)

        # amazon_druid_product_data table (used for CRUD)
        druid_table = 'amazon_druid_product_data'
        self.druid_name = druid_table
        self.druid_class = self.get_class(druid_table)

        # amazon_druid_product_data_bsr table
        bsr_table = 'amazon_druid_product_data_bsr'
        self.bsr_name = bsr_table
        self.bsr_class = self.get_class(bsr_table)

        # amazon_product_monitor table
        monitor_table = 'amazon_product_monitor'
        self.monitor_name = monitor_table
        self.monitor_class = self.get_class(monitor_table)

        # Date helpers.
        self.date_fmt = '%Y%m%d'
        now = return_PST()
        self.today = now
        self.old_date = now - timedelta(days=3)
コード例 #10
0
    def record_not_found_goods(the_asin, data_type=''):
        """Reset a product row to the "not found" state and stamp the monitor row.

        An UPDATE that zeroes the product's data columns (asin_state=0,
        crawler_state=1) is attempted first; when it affects no row, a minimal
        INSERT is issued instead.  After either write is committed,
        ``amazon_product_monitor.info_tm_crawler`` is refreshed for the ASIN.
        Database errors are rolled back and swallowed.

        Args:
            the_asin: ASIN to record.
            data_type: Unused; kept so existing callers keep working.
        """
        data_dict = {'asin': the_asin, 'asin_state': 0, 'getinfo_tm': int(DataOutput.get_redis_time())}
        update_sql = (
            "update public.amazon_product_data set "
            "price=0, sale_price=0, sname='', ts_min_price=0, to_sell=0, byb=0, bsr=0, rc=0, rrg=0,"
            "r5p=0, r4p=0, r3p=0, r2p=0, r1p=0, feature='', brand='', release_date=0,"
            "collect_tm=0, variant='', cart_price=0, quantity=0, seller_id='',"
            "asin_state=%(asin_state)s, getinfo_tm=%(getinfo_tm)s, is_sync=0, crawler_state=1  where asin=%(asin)s;"
        )
        insert_sql = "INSERT INTO public.amazon_product_data(asin, asin_state, getinfo_tm, crawler_state) VALUES (%(asin)s, %(asin_state)s, %(getinfo_tm)s, 1);"
        monitor_sql = "update public.amazon_product_monitor set info_tm_crawler=%(info_tm_crawler)s where asin=%(asin)s;"

        def monitor_params():
            # Built lazily so the timestamp is read right before the monitor write.
            return {'asin': the_asin, 'info_tm_crawler': int(DataOutput.get_redis_time() / 1000)}

        conn = psycopg2.connect(**DATADB_CONFIG[BASE_TYPE])
        cur = conn.cursor()
        try:
            cur.execute(update_sql, data_dict)
            row = cur.rowcount
            if row > 0:
                conn.commit()
                print('\namazon_product_data,%s,%s,%s行,更新成功\n' % (return_PST().strftime("%Y-%m-%d %H:%M:%S"), the_asin, row))
                cur.execute(monitor_sql, monitor_params())
                conn.commit()
            else:
                # Nothing to update: discard the empty transaction and insert.
                conn.rollback()
                cur.execute(insert_sql, data_dict)
                row = cur.rowcount
                if row > 0:
                    conn.commit()
                    cur.execute(monitor_sql, monitor_params())
                    conn.commit()
                    print('\namazon_product_data,%s,%s,%s行,插入成功\n' % (return_PST().strftime("%Y-%m-%d %H:%M:%S"), the_asin, row))
                else:
                    conn.rollback()
        except Exception:
            # Failure reporting was disabled in the original code, so errors
            # remain silent here; only the open transaction is undone.
            conn.rollback()
        finally:
            # Single cleanup point for both the success and the failure path.
            cur.close()
            conn.close()
コード例 #11
0
ファイル: BaseCrawler.py プロジェクト: xusu12/hs_code
 def record_log(self, asin_keyword, time1, msgInt, msgType, startTime, ip,
                proxyInfo):
     """Write one comma-separated timing line to ``self.info_log``.

     msgInt codes: 1 success, 2 failure, 3 error (these refer to data);
     4 failure, 5 captcha, 6 error, 7 page not found (these refer to HTML).

     The line contains, in order: ip, asin_keyword, msgType, startTime, the
     end time taken from ``return_PST()``, the seconds elapsed since
     ``time1``, msgInt and proxyInfo.
     """
     elapsed = time.time() - time1
     finished_at = return_PST().strftime("%Y-%m-%d %H:%M:%S")
     fields = (ip, asin_keyword, msgType, startTime, finished_at, elapsed,
               msgInt, proxyInfo)
     self.info_log.info('%s, %s, %s, %s, %s, %s, %s, %s' % fields)
コード例 #12
0
ファイル: BaseCrawler.py プロジェクト: xusu12/hs_code
 def save_discard_url(self, asin, url, num, discard_type):
     """Append a "discarded URL" line to the per-day discard log.

     The log file is ``discard_url_<YYYY_mm_dd>.log`` under ``DATA_DIR``,
     dated with the value returned by ``return_PST()``.

     Args:
         asin: ASIN the URL belongs to.
         url: URL that was given up on.
         num: Value written as the last bracketed field before the marker.
         discard_type: Value written as the discard-type field.
     """
     pstNow = return_PST()
     timeNow = pstNow.strftime("%Y-%m-%d %H:%M:%S")
     dateNow = pstNow.strftime("%Y_%m_%d")
     filepath = os.path.join(DATA_DIR, 'discard_url_%s.log' % (dateNow))
     msg = '[%s][%s][%s] [%s] [%s] [被放弃]\n' % (timeNow, asin, url,
                                               discard_type, num)
     # The message contains non-ASCII text.  The former bare
     # msg.encode('utf-8') discarded its result, so the file was written in
     # the locale's default encoding; request UTF-8 explicitly instead.
     with open(filepath, 'a', encoding='utf-8') as f:
         f.write(msg)
コード例 #13
0
 def save_data_to_db(self, update_sql, insert_sql, the_asin_or_kw, data_dict, db_name='', md5key=''):
     """Persist one record using an update-then-insert strategy.

     With both statements given, the update runs first and the insert runs
     only when the update affected no row.  With a single statement given,
     only that statement runs.  A statement is committed when it affects at
     least one row and rolled back otherwise.  ``md5key`` is reported to
     ``dataQ.the_reviews_first_download`` only in the two-statement mode.

     Args:
         update_sql: Parameterized UPDATE statement, or a falsy value.
         insert_sql: Parameterized INSERT statement, or a falsy value.
         the_asin_or_kw: Identifier used in log lines.
         data_dict: Parameters passed to the statements.
         db_name: Table label used in log lines.
         md5key: Optional key used to mark the first reviews download.

     Returns:
         None normally; ``{the_asin_or_kw: data_dict}`` when an exception
         was caught (after rollback and error logging).
     """
     self.dataQ.record_dbSum_times()

     def attempt(statement, message, mark_first):
         # Run one statement; commit and log when rows were affected,
         # otherwise roll back.  Returns True when the write was committed.
         self.cur.execute(statement, data_dict)
         affected = self.cur.rowcount
         if not affected > 0:
             self.dbObj.rollback()
             return False
         self.dbObj.commit()
         self.dataQ.record_db_ok_times()
         self.db_log.info(message %
                          (return_PST().strftime("%Y-%m-%d %H:%M:%S"), db_name, the_asin_or_kw, affected))
         # The reviews process marks its first download here.
         if mark_first and md5key:
             self.dataQ.the_reviews_first_download(md5key)
         return True

     try:
         if update_sql and insert_sql:
             if not attempt(update_sql, '%s,%s,%s,%s行,更新成功', True):
                 attempt(insert_sql, '%s,%s,%s,%s行,插入成功', True)
         elif update_sql:
             attempt(update_sql, '%s,%s,%s,%s行,更新成功\n', False)
         elif insert_sql:
             attempt(insert_sql, '%s,%s,%s,%s行,插入成功', False)
     except Exception as e:
         self.dbObj.rollback()
         datas = {the_asin_or_kw: data_dict}
         failure = '%s,%s,%s,入库失败,原因%s,失败数据[%s];'
         self.debug_log.error(failure % (
             return_PST().strftime("%Y-%m-%d %H:%M:%S"), db_name, the_asin_or_kw, e, datas))
         self.db_log.error(failure % (
             return_PST().strftime("%Y-%m-%d %H:%M:%S"), db_name, the_asin_or_kw, e, datas))
         return datas
コード例 #14
0
    def wrap(*args, **kwargs):
        """Call the wrapped parser, then best-effort save the HTML it received.

        Decorator body for the product-detail and inventory parsing methods.
        For functions whose name contains 'qty' the ASIN and HTML are read
        from the ``asin`` / ``html_code`` keyword arguments; otherwise from
        positional arguments 2 and 1.  A non-empty ``str`` HTML is written to
        ``../../data/devtest/save_asin_html/<YYYYmmdd>/``.  Any error while
        saving is printed and never affects the wrapped function's result.

        Returns:
            Whatever the wrapped function returned.
        """
        # Run the original function first; its result is returned untouched.
        result = func(*args, **kwargs)
        try:
            if 'qty' in func.__name__:
                # Inventory functions receive their inputs as keyword arguments.
                asin = kwargs.get('asin', '')
                html = kwargs.get('html_code', '')
                html_type = 'inventory'
                print(asin, html)
            else:
                # All other functions receive them positionally.
                asin = args[2]
                html = args[1]
                html_type = 'product'
            print('*' * 20)
            print(func.__name__, html_type, asin, len(html), type(html))
            print('*' * 20)
            # Every valid HTML is saved (an ASIN allow-list filter used to be
            # applied here and is currently disabled).
            if type(html) is str and html:
                print(1111111111)
                # Timestamp used for both the day directory and the file name.
                datenow = return_PST()
                save_dir = os.path.join('../../data/devtest/',
                                        'save_asin_html/',
                                        datenow.strftime('%Y%m%d'))
                # Creates all missing levels at once and tolerates a directory
                # that another worker created in the meantime.
                os.makedirs(save_dir, exist_ok=True)
                file_path = os.path.join(
                    save_dir, '%s_%s_%s.html' %
                    (asin, html_type, datenow.strftime('%Y%m%d_%H_%M_%S')))
                with open(file_path, 'w', encoding='utf-8') as f:
                    f.write(html)

                print(2222222)
        except Exception as e:
            print(e)
        return result
コード例 #15
0
    def wrap(*args, **kwargs):
        """Call the wrapped keyword parser, then best-effort save its HTML pages.

        Decorator body for the keyword parsing method.  The keyword is read
        from positional argument 2 and the list of HTML pages from positional
        argument 1.  Each page of a non-empty list is written to
        ``../../data/devtest/save_asin_html/<YYYYmmdd>/`` with a 1-based page
        index in the file name.  Any error while saving is printed and never
        affects the wrapped function's result.

        Returns:
            Whatever the wrapped function returned.
        """
        # Run the original function first; its result is returned untouched.
        result = func(*args, **kwargs)

        try:
            keyword = args[2]
            html_list = args[1]
            html_type = 'keyword'

            print('*' * 20)
            print(func.__name__, html_type, keyword, len(html_list),
                  type(html_list))
            print('*' * 20)
            # Every valid HTML is saved (an ASIN allow-list filter used to be
            # applied here and is currently disabled).
            if type(html_list) is list and html_list:
                # Spaces become underscores for the file name; this does not
                # depend on the page, so it is done once.
                keyword = '_'.join(keyword.split(' '))
                for i, html in enumerate(html_list, 1):
                    # Timestamp used for both the day directory and file name.
                    datenow = return_PST()
                    save_dir = os.path.join('../../data/devtest/',
                                            'save_asin_html/',
                                            datenow.strftime('%Y%m%d'))
                    # Creates all missing levels at once and tolerates a
                    # directory that already exists.
                    os.makedirs(save_dir, exist_ok=True)
                    print(keyword, i)
                    file_path = os.path.join(
                        save_dir, '%s_%s_%s_%s.html' %
                        (keyword, html_type,
                         datenow.strftime('%Y%m%d_%H_%M_%S'), i))
                    with open(file_path, 'w', encoding='utf-8') as f:
                        f.write(html)
        except Exception as e:
            print(e)
        return result
コード例 #16
0
ファイル: init_url.py プロジェクト: xusu12/hs_code
def all_url_init(urlQ, kwQ):
    """Initialise all four task queues and append a summary to the daily report.

    Runs the keyword, goods, reviews and to-sell initialisers (in that
    order), reports the combined task count through
    ``urlQ.update_mission_attempts`` and appends one line with the individual
    counts to ``statistics_<YYYY_mm_dd>.csv`` under ``REPORT_DIR``.

    Args:
        urlQ: Queue object used by the goods/reviews/to-sell initialisers.
        kwQ: Queue object used by the keyword initialiser.
    """
    kw_total = kw_init(kwQ)
    goods_total = goods_init(urlQ)
    reviews_total = reviews_init(urlQ)
    tosell_total = tosell_init(urlQ)

    task_total = goods_total + reviews_total + tosell_total + kw_total
    urlQ.update_mission_attempts(task_total)

    now = return_PST()
    started_at = now.strftime("%Y-%m-%d %H:%M:%S")
    report_path = os.path.join(REPORT_DIR,
                               'statistics_%s.csv' % (now.strftime("%Y_%m_%d")))

    # One report line, assembled from its four segments.
    report_line = ''.join([
        '\n[,%s,] [,初始化报告,], 任务总数, %s, 成功加入商品队列, %s, ' % (started_at, task_total, goods_total),
        '成功加入评论队列, %s, ' % (reviews_total),
        '成功加入跟卖队列, %s,' % (tosell_total),
        '成功加入关键字队列, %s, 任务开始!' % (kw_total),
    ])
    with open(report_path, 'a') as f:
        f.write(report_line)
コード例 #17
0
def analyzer_start(urlQ, dataQ, kwQ, info_log, debug_log, i):
    """Worker loop: register each analyzer task for monitoring and wait for it.

    For every task taken from the queue, each ASIN and each keyword is saved
    through ``save_task_to_db``, the analyzer state is set to 1, and
    ``select_analyzer_state`` is polled every five minutes until it returns
    a truthy value.  The process exits via ``sys.exit()`` once the queue
    length drops below 1.

    Args:
        urlQ: Queue/DB wrapper used for all queue and task operations.
        dataQ: Not used in this function.
        kwQ: Not used in this function.
        info_log: Not used in this function.
        debug_log: Not used in this function.
        i: Worker index, only used in the start-up message.
    """
    print('\nanalyzer_start%s 启动成功' % (i))
    while True:
        pending = urlQ._get_queue_len(queue_name)
        print('当前analyzer任务队列长度 %s' % (pending))
        if pending < 1:
            sys.exit()

        # Fetch the task and pull out monitor time, analyzer id and task id.
        task = get_task(urlQ)
        monitor_tm = task.get('mtm', int(time.time()))
        aid = task.get('aid', -1)
        tid = task.get('tid')

        # Register every ASIN for monitoring.
        asin_list = task.get('asins', [])
        for asin in asin_list:
            save_task_to_db(
                urlQ,
                dict(asin=asin, monitor_tm=monitor_tm, aid=aid, utp='goods'),
                url_type='goods')

        # Flatten the keyword groups and register every keyword.
        kw_groups = task.get('kws', {})
        kw_list = [kw for group in kw_groups.values() for kw in group]
        for kw in kw_list:
            save_task_to_db(
                urlQ,
                dict(aid=aid, monitor_tm=monitor_tm, kw=kw, utp='keyword'),
                url_type='keyword')

        # Switch the analyzer state to 1 (analysing).
        update_analyzer_state(tid, aid, 1)
        aday = return_PST().strftime("%Y%m%d")
        # Poll until the task is reported as finished.
        while not select_analyzer_state(asin_list, kw_list, urlQ, aid, tid, aday=aday):
            time.sleep(60 * 5)
コード例 #18
0
 def is_not_turn_the_page(self, first, html, page_num=0, asin=''):
     """Decide whether review pagination can stop at the current page.

     Rule: on a repeat download, paging stops once the page contains a
     review dated before yesterday.  On the first download, paging stops
     once the page contains a review older than 90 days.  Dates are compared
     as ``YYYYmmdd`` integers relative to ``return_PST()``.  A page without
     any review date is treated as dated today, so paging continues.

     Args:
         first: Truthy when this is the first download for the ASIN.
         html: Review page HTML passed to ``ReviewsParser``.
         page_num: Page number, used only in the log output.
         asin: ASIN, used only in the log output.

     Returns:
         True when no further page needs to be fetched, otherwise False.
     """
     reviews_date_list = ReviewsParser.get_reviews_date_list(html)
     # Named "now" so the datetime module/class name is not shadowed.
     now = return_PST()
     if reviews_date_list:
         min_reviews_date = min(reviews_date_list)
     else:
         min_reviews_date = int(now.strftime('%Y%m%d'))
     theYesterDete = int((now - timedelta(days=1)).strftime('%Y%m%d'))
     three_mon_date = int((now - timedelta(days=90)).strftime('%Y%m%d'))
     print(
         '\n%s: min_reviews_date: %s\ntheYesterDete: %s\nthree_mon_date: %s\n'
         % (asin, min_reviews_date, theYesterDete, three_mon_date))
     # NOTE: if Redis lost its data, the "first download" flag has to be
     # repaired from the database.
     if first:
         cutoff = three_mon_date
         label = '是第一次下载'
     else:
         cutoff = theYesterDete
         label = '非第一次下载'
     # The label now matches the branch; the messages used to be swapped.
     if min_reviews_date < cutoff:
         print('%s < %s' % (min_reviews_date, cutoff))
         print('%s, 当前是%s的第%s页评论, 不再需要继续翻页' % (label, asin, page_num))
         return True
     print('%s, 当前是%s的第%s页评论, 还需要继续翻页' % (label, asin, page_num))
     return False
コード例 #19
0
    def parser_not_found(self, the_asin, goods_html, html_code_list=None):
        """Build to-sell data for an ASIN from its product page alone.

        A single seller entry and a summary dict are derived from the product
        page.  When the first to-sell page reports no listings, or the product
        is currently unavailable, the summary is zeroed out and the seller
        list is returned empty.

        Args:
            the_asin: ASIN being parsed.
            goods_html: Product detail page HTML.
            html_code_list: Optional list of to-sell page HTML; only the
                first element is inspected.

        Returns:
            dict: ``{the_asin: (summary_dict, seller_list)}``.
        """
        # Store the page and its derived fragments on the instance
        # (presumably read by the _get_* helpers below - verify).
        self.html_code = goods_html
        self.head_html = self.get_head_html(goods_html)
        self.xpath_obj = etree.HTML(goods_html)
        self.desc_html = self.get_description_html(goods_html)
        self.buy_box_html = self.get_buy_box_html(goods_html)

        sn = 0 if self._is_Currently_unavailable() else 1
        sname = self._get_seller_name()  # seller
        seller_id = self._get_seller_id()
        if not sname:
            seller_id = ''
        price = self._get_discount_price()
        total_ratings = self._get_review_count()  # number of reviews
        reivew_count = self._get_review_rating()  # overall rating
        # Positive rate = 5-star percentage + 4-star percentage.
        positive = self._get_review_5_percent() + self._get_review_4_percent()
        byb = self._has_buy_button()
        fba = self._get_fba()

        seller_entry = {
            'asin': the_asin,
            'condition': '',  # item condition
            'sname': sname,  # seller
            'stype': '',  # shipping type
            'price': price,  # price
            'demo': '',  # description
            'positive': positive,  # positive rate
            'total_ratings': total_ratings,  # total number of ratings
            'tm': int(BaseParser.get_redis_time() / 1000),  # update time
            'fba': fba,  # FBA flag
            'seller_id': seller_id,  # seller id
            'reivew_count': reivew_count,  # rating
            'delivery': '',  # delivery method
            'aday': return_PST().strftime("%Y%m%d"),  # date from return_PST()
        }
        seller_entries = [seller_entry]

        fba_sn = 1 if fba > 0 else 0
        if byb > 0:
            the_sname, the_seller_id = sname, seller_id
        else:
            the_sname = the_seller_id = ''
        if not the_sname:
            the_seller_id = ''

        summary = {
            'asin': the_asin,
            'sn': sn,  # number of sellers
            'fba_sn': fba_sn,  # number of FBA sellers
            'plow': price,  # lowest price
            'plows': sname,  # name of the lowest-price seller
            'plows_id': seller_id,  # id of the lowest-price seller
            'getinfo_tm': int(BaseParser.get_redis_time()),  # fetch time
            'sname': the_sname,  # buy-box seller
            'seller_id': the_seller_id,  # buy-box seller id
        }

        if type(html_code_list) is list and len(html_code_list) > 0:
            tosell_html = html_code_list[0]
        else:
            tosell_html = ''

        if re.search('There are currently no listings for this search',
                     tosell_html):
            # No "new" offers: zero sellers, zero FBA sellers, price -1 and
            # no lowest-price seller.
            summary.update(sn=0, fba_sn=0, plow=-1, plows='', plows_id='')
            return {the_asin: (summary, [])}
        if self._is_Currently_unavailable():
            # Not for sale: same as above, and the buy-box seller is cleared.
            summary.update(sn=0, fba_sn=0, plow=-1, plows='', plows_id='',
                           sname='', seller_id='')
            return {the_asin: (summary, [])}
        return {the_asin: (summary, seller_entries)}
コード例 #20
0
    def tosell_parser(self,
                      html_code_list,
                      the_asin,
                      tosellSum=None,
                      ip='',
                      download_url='',
                      goods_html_code=''):
        """Build the follow-seller summary and per-offer rows for one ASIN.

        Every node returned by ``self.get_tosell_html`` for each page in
        ``html_code_list`` becomes one offer row. When no offer row is
        produced, the result is delegated to
        ``TosellNotFoundParser.parser_not_found``.

        ``tosellSum``, ``ip`` and ``download_url`` are accepted but not read
        by this method.

        Returns:
            ``{the_asin: (summary_dict, offer_rows)}`` when at least one offer
            was parsed, otherwise whatever ``parser_not_found`` returns.
        """
        fba_sellers = []
        prices = []
        first_seller_by_price = {}
        offer_rows = []

        # Buy-box seller, taken from the product page markup.
        buybox_sname, buybox_seller_id = self._get_byb_merchant(
            goods_html_code)
        print(buybox_sname, buybox_seller_id)
        print(self.get_to_sell_price(goods_html_code))

        for page_html in html_code_list:
            offer_nodes = self.get_tosell_html(page_html)
            print('xpathObj_list: ', offer_nodes)
            for node in offer_nodes:
                # The field getters below take no markup argument; the
                # serialized node is stored on self.html_code first
                # (presumably that is what they read -- confirm).
                self.html_code = str(tostring(node), encoding='utf-8')
                condition = self._get_condition()
                sname = self._get_sname()
                stype = self._get_stype()
                price = self._get_price()
                demo = self._get_demo()
                positive = self._get_positive(demo)
                total_ratings = self._get_total_ratings(demo)
                reivew_count = self._get_reivew_count(demo)
                seller_id = self._get_seller_id()
                if not sname:
                    # A seller id without a seller name is discarded.
                    seller_id = ''
                fba = self._get_fba()
                delivery = self._get_delivery()
                offer_rows.append({
                    'asin': the_asin,
                    'condition': condition,  # item condition
                    'sname': sname,  # seller name
                    'stype': stype,  # shipping type
                    'price': price,  # price
                    'demo': demo,  # description
                    'positive': positive,  # positive-feedback rate
                    'total_ratings': total_ratings,  # number of ratings
                    'tm': int(BaseParser.get_redis_time() / 1000),  # query time
                    'fba': fba,  # FBA flag
                    'seller_id': seller_id,  # seller id
                    'reivew_count': reivew_count,  # rating
                    'delivery': delivery,  # delivery method
                    'aday': return_PST().strftime("%Y%m%d"),  # Pacific date
                })
                if fba:
                    # Identify the FBA seller by id, falling back to the name.
                    fba_sellers.append(str(seller_id or sname))
                prices.append(price)
                # Remember only the first seller seen at each price.
                first_seller_by_price.setdefault(
                    str(price), dict(sname=sname, seller_id=seller_id))

        lowest_price = min(prices) if prices else 0
        lowest_seller = first_seller_by_price.get(str(lowest_price)) or {}

        summary = {
            'asin': the_asin,
            'sn': len(offer_rows),  # number of follow sellers
            'fba_sn': len(fba_sellers),  # number of FBA follow sellers
            'plow': lowest_price,  # lowest price
            'plows': lowest_seller.get('sname') or '',  # lowest-price seller
            'plows_id': lowest_seller.get('seller_id') or '',  # and its id
            'getinfo_tm': int(BaseParser.get_redis_time()),  # fetch time
            'sname': buybox_sname,  # buy-box seller
            'seller_id': buybox_seller_id,  # buy-box seller id
        }

        if offer_rows:
            tosell_info = {the_asin: (summary, offer_rows)}
        else:
            not_found_parser = TosellNotFoundParser(goods_html_code)
            tosell_info = not_found_parser.parser_not_found(
                the_asin, goods_html_code, html_code_list)
        print(tosell_info)
        return tosell_info
コード例 #21
0
def add_url_to_queue(theQueue, url_tuple_list, url_type='', sql_times=None):
    """Queue one crawl task per distinct keyword/ASIN and re-open stale ones.

    Tasks are de-duplicated on their first element (the first occurrence
    wins), sorted by monitor time and pushed, pickled, onto the queue that
    matches ``url_type``. Every task that was accepted has its md5 key
    registered through ``theQueue.add_asinAndKw_to_set``.

    For ``url_type`` 'goods' or 'keyword' and a truthy ``sql_times``, tasks
    flagged as downloaded whose ASIN/keyword has no row stored today after
    ``sql_times`` are removed from the success set so they are crawled again.

    Args:
        theQueue: object providing ``get_md5_key``, the ``add_*_to_queue``
            methods and ``add_asinAndKw_to_set``.
        url_tuple_list: iterable of indexable records
            ``(kw_or_asin, cid_or_monitor_type, aid, monitor_tm)``.
        url_type: 'goods', 'reviews', 'tosell' or 'keyword'; any other value
            builds the tasks but enqueues nothing.
        sql_times: lower bound for the ``tm`` column (multiplied by 1000
            before it is used). ``None``/0 disables the stale-task check; the
            previous code raised ``TypeError`` on ``None`` here.
    """
    used = 'useInterval'
    adder_names = {
        'goods': 'add_goods_url_to_queue',
        'reviews': 'add_reviews_url_to_queue',
        'tosell': 'add_tosell_url_to_queue',
        'keyword': 'add_keyword_to_queue',
    }

    def add_to_queue(url_tuple_bytes):
        """Push the pickled task onto the queue for ``url_type``."""
        adder_name = adder_names.get(url_type)
        if adder_name is None:
            return False
        return getattr(theQueue, adder_name)(url_tuple_bytes)

    is_keyword = url_type == 'keyword'
    url_data_dict = {}
    for urlTuple in url_tuple_list:
        kw_asin = urlTuple[0]
        cid_monitorType = urlTuple[1]
        md5value = kw_asin + url_type
        url_dict = dict(
            aid=urlTuple[2],
            mtm=urlTuple[3],  # monitor time, used as the sort key below
            md5=theQueue.get_md5_key(md5value),
            umd5=theQueue.get_md5_key(md5value + used),
            utp=url_type,
        )
        if is_keyword:
            url_dict['kw'] = kw_asin
            url_dict['cid'] = cid_monitorType
        else:
            url_dict['asin'] = kw_asin
            url_dict['mtp'] = cid_monitorType
        # Keep only the first task seen for each keyword/ASIN.
        url_data_dict.setdefault(kw_asin, url_dict)

    url_dict_list = sorted(url_data_dict.values(),
                           key=lambda x: x.get('mtm', int(time.time())))

    # Keywords/ASINs that already have a row stored for today.
    stored_keys = set()
    verify_downloaded = bool(sql_times) and url_type in ('goods', 'keyword')
    if verify_downloaded:
        if is_keyword:
            sql1 = "select kw from public.amazon_druid_keyword_data where tm > %(tm)s and aday=%(aday)s group by kw;"
        else:
            # Parameterized like the keyword query instead of %-formatting.
            sql1 = "select asin from public.amazon_druid_product_data where tm > %(tm)s and aday=%(aday)s;"
        data = dict(tm=sql_times * 1000,
                    aday=return_PST().strftime('%Y%m%d'))
        # NOTE(review): urlQ is a module-level name, not the theQueue
        # parameter -- confirm that this is intended.
        rows = urlQ.retrieve_asin(sql1, data) or []
        asin_set_list = [
            x[0] for x in rows if isinstance(x, tuple) and len(x) > 0
        ]
        if len(asin_set_list) > 10:
            print(asin_set_list[:10])
        print('len asin_set_list', len(asin_set_list))
        stored_keys = set(asin_set_list)

    id_field = 'kw' if is_keyword else 'asin'
    for url_dict in url_dict_list:
        if add_to_queue(pickle.dumps(url_dict)):
            theQueue.add_asinAndKw_to_set(url_dict.get('md5'))
        if not verify_downloaded:
            continue
        # Flagged as downloaded but nothing was stored: crawl it again.
        if urlQ.is_downloaded(url_dict.get('md5')):
            if url_dict.get(id_field) not in stored_keys:
                print('实际上没有下载的%s' % id_field, url_dict.get(id_field))
                urlQ.srem_successAsinSet_from_set(url_dict.get('md5'))
コード例 #22
0
    def download(self, asin_or_kw, url_dict, **kwargs):
        """Download the follow-seller pages of one ASIN and queue the result.

        Outcomes are reported through ``self.record_log`` with these codes:
        0 no follow sellers, 1 success, 2 parser returned nothing,
        3 parser error, 6 page download error. Whenever the task cannot be
        completed it is handed to ``self.the_url_is_discard``.

        Args:
            asin_or_kw: the ASIN to process.
            url_dict: task dict; ``durl`` and ``dnum`` are updated in place.
            **kwargs: forwarded to ``self.get_tosell_html_lsit``.

        Returns:
            The value of ``self.debug_log.war`` when the ASIN has no follow
            sellers, otherwise ``None``.
        """
        url_type = self.url_type
        asin = asin_or_kw
        monitor_type = url_dict.get('mtp') or 5
        print('url type: ', url_type)
        startTime = return_PST().strftime("%Y-%m-%d %H:%M:%S")
        time1 = time.time()
        url_md5key = url_dict.get('md5') or self.get_md5_key(asin + url_type)

        def discard():
            self.the_url_is_discard(asin, url_dict, url_type, url_md5key)

        def log(msg_int, proxy_info):
            self.record_log(asin, time1, msg_int, url_type, startTime,
                            proxy_info)

        goodsUrl, referer = self.make_url(asin, url_type='goods')
        if not goodsUrl:
            print(url_type, '没有url')
            discard()
            time.sleep(1)
            return

        html_list, url_list, cookiesObj, is_error_lsit, tosellSum = \
            self.get_tosell_html_lsit(asin, goodsUrl, referer, **kwargs)
        # Remember which urls were fetched and how often the task was tried.
        url_dict['durl'] = list(set((url_dict.get('durl') or []) + url_list))
        url_dict['dnum'] = (url_dict.get('dnum') or 0) + 1

        # Stop here when the ASIN was judged to have no follow sellers.
        if self.not_found:
            self.urlQ.record_tosell_notFound_times()
            log(0, 'the asin not tosell')
            return self.debug_log.war('%s没有跟卖' % (asin))

        if not len(html_list) > 0:
            # No page at all: discard only when the seller count is
            # -1 or positive.
            if tosellSum == -1 or tosellSum > 0:
                discard()
            return

        tosell_html_list = []
        for idx, html in enumerate(html_list):
            is_error = is_error_lsit[idx]
            print(is_error_lsit, is_error_lsit[idx])
            # Not used further; the lookup still fails when url_list is
            # shorter than html_list, as before.
            url = url_list[idx]
            if is_error:
                discard()
                log(6, 'get Html error')
                continue
            analyze = self.analyze_html(html, asin, url_dict, time1,
                                        startTime, html_type=url_type)
            if analyze and analyze != 404:
                tosell_html_list.append(html)

        print('html num: ', len(html_list), 'tosell_html num: ',
              len(tosell_html_list))
        # Parse only when every downloaded page passed the checks above.
        if len(tosell_html_list) != len(html_list):
            discard()
            return

        result, is_error = self.parser(tosell_html_list,
                                       html_type=url_type,
                                       asin=asin,
                                       monitor_type=monitor_type,
                                       tosellSum=tosellSum,
                                       goods_html_code=self.goods_html)
        if is_error:
            discard()
            log(3, 'get data error')
        elif not result:
            discard()
            log(2, 'get data defeated')
        else:
            self.save_success_asin_keyword(asin, url_type=url_type)
            log(1, 'get data success')
            self.dataQ.add_tosell_to_queue(pickle.dumps(result[0]))
            self.dataQ.record_data_ok_times()
            self.dataQ.record_tosell_ok_times()
コード例 #23
0
ファイル: report_tasks.py プロジェクト: xusu12/hs_code
def _percentage(part, total):
    """Return ``part / total`` as a percentage, or 0.0 when ``total`` is 0."""
    if not total:
        return 0.0
    return part / total * 100


def statistics(urlQ):
    """Append a queue snapshot to today's CSV report and e-mail summaries.

    The totals come from ``keyword_retry``, ``goods_retry``, ``qty_retry``,
    ``reviews_retry`` and ``tosell_retry``. One block of rows (totals, saved,
    unfinished, completion rate, retries, retry rate) is appended to
    ``statistics_info_<YYYYMMDD>.csv`` under ``REPORT_DIR``. A summary e-mail
    is then sent through ``send_func`` for every queue whose completion rate
    reaches its threshold.

    An empty queue (total of 0) is reported with 0.00% rates instead of
    raising ``ZeroDivisionError``.

    Args:
        urlQ: queue handle passed through to the ``*_retry`` helpers.
    """
    the_date = return_PST()
    file_name = os.path.join(
        REPORT_DIR, 'statistics_info_%s.csv' % (the_date.strftime('%Y%m%d')))
    print(file_name)

    # Each helper returns (total, saved, failed); qty_retry returns the
    # number of products whose stock could not be fetched.
    kw_sum, kw_num, kw_fail_sum = keyword_retry(urlQ)
    goods_sum, goods_num, goods_fail_sum = goods_retry(urlQ)
    not_qty_sum = qty_retry(urlQ)
    reviews_sum, reviews_num, reviews_fail_sum = reviews_retry(urlQ)
    tosell_sum, tosell_num, tosell_fail_sum = tosell_retry(urlQ)

    # Stock shares the product total; a product counts as saved for stock
    # unless its stock fetch failed.
    inventory_num = goods_num - not_qty_sum

    goods_unfinished = goods_sum - goods_num
    kw_unfinished = kw_sum - kw_num
    tosell_unfinished = tosell_sum - tosell_num
    reviews_unfinished = reviews_sum - reviews_num

    inventory_comlete_rate = _percentage(inventory_num, goods_sum)
    goods_complete_rate = _percentage(goods_num, goods_sum)
    kw_complete_rate = _percentage(kw_num, kw_sum)
    tosell_complete_rate = _percentage(tosell_num, tosell_sum)
    reviews_complete_rate = _percentage(reviews_num, reviews_sum)

    inventory_retry_rate = _percentage(not_qty_sum, goods_sum)
    goods_retry_rate = _percentage(goods_fail_sum, goods_sum)
    kw_retry_rate = _percentage(kw_fail_sum, kw_sum)
    tosell_retry_rate = _percentage(tosell_fail_sum, tosell_sum)
    reviews_retry_rate = _percentage(reviews_fail_sum, reviews_sum)

    row_format = "%s,%s,%s,%s,%s,%s\n"
    rate_format = '%.2f%%'
    # Column order: inventory, product, keyword, tosell, reviews.
    rows = [
        (the_date.strftime('%Y-%m-%d,%H:%M:%S'), 'inventory|库存',
         'product|产品', 'keyword|关键字', 'tosell|跟卖', 'reviews|评论'),
        (' ,总数', goods_sum, goods_sum, kw_sum, tosell_sum, reviews_sum),
        (' ,已入库', inventory_num, goods_num, kw_num, tosell_num,
         reviews_num),
        (' ,未完成', goods_sum - inventory_num, goods_unfinished,
         kw_unfinished, tosell_unfinished, reviews_unfinished),
        (' ,完成度', rate_format % inventory_comlete_rate,
         rate_format % goods_complete_rate, rate_format % kw_complete_rate,
         rate_format % tosell_complete_rate,
         rate_format % reviews_complete_rate),
        (' ,需重试', not_qty_sum, goods_fail_sum, kw_fail_sum,
         tosell_fail_sum, reviews_fail_sum),
        (' ,重试率', rate_format % inventory_retry_rate,
         rate_format % goods_retry_rate, rate_format % kw_retry_rate,
         rate_format % tosell_retry_rate, rate_format % reviews_retry_rate),
    ]
    with open(file_name, 'a', encoding='utf-8') as f:
        f.write(
            "\n , ,GTM+8,%s\n" %
            (time.strftime('%Y-%m-%d,%H:%M:%S', time.localtime(time.time()))))
        for row in rows:
            f.write(row_format % row)

    # E-mail settings.
    msg_format = "%s队列:总数%s条数据, 已入库%s条数据, 未入库%s条数据, 完成度%s, 需重试%s, 重试率%s"
    to_addr = '*****@*****.**'
    war_msg = '数据入库完成度超过90%'
    title_format = 'Data Save Warning! PT: %s'

    # (label, total, saved, unfinished, completion rate, failed, retry rate,
    #  completion threshold that triggers the e-mail)
    # NOTE(review): the message text says 90%, but only 'reviews' uses 90;
    # the other thresholds are 0 and therefore always fire. They are kept
    # unchanged here -- confirm which value is intended.
    queue_reports = [
        ('product', goods_sum, goods_num, goods_unfinished,
         goods_complete_rate, goods_fail_sum, goods_retry_rate, 0),
        ('keyword', kw_sum, kw_num, kw_unfinished, kw_complete_rate,
         kw_fail_sum, kw_retry_rate, 0),
        ('tosell', tosell_sum, tosell_num, tosell_unfinished,
         tosell_complete_rate, tosell_fail_sum, tosell_retry_rate, 0),
        ('reviews', reviews_sum, reviews_num, reviews_unfinished,
         reviews_complete_rate, reviews_fail_sum, reviews_retry_rate, 90),
    ]
    for report in queue_reports:
        complete_rate, threshold = report[4], report[7]
        if complete_rate >= threshold:
            # One e-mail per queue, each carrying a single message.
            send_func([msg_format % report[:7]], war_msg, title_format,
                      to_addr)
コード例 #24
0
ファイル: keywordParser.py プロジェクト: xusu12/hs_code
 def _get_date(self):
     """Return the date given by ``return_PST()`` as an int in YYYYMMDD form."""
     today = return_PST()
     return int(today.strftime('%Y%m%d'))
コード例 #25
0
ファイル: tosellDataOutput.py プロジェクト: xusu12/hs_code
def tosell_save(dataQ, debug_log, db_log):
    """Drain the 'tosellData' queue into the follow-seller tables.

    Each batch maps an ASIN to ``(summary_dict, offer_rows)``. An ASIN is
    saved only when ``public.amazon_product_tosell`` has no row for it on the
    batch's ``aday``. When the summary carries no buy-box seller name, the
    seller stored in ``public.amazon_product_data`` during the last 24 hours
    is used, provided exactly one such row exists.

    The cursor and connection are closed even when saving fails; the
    exception still propagates to the caller.

    Args:
        dataQ: data queue wrapper (``RedisQ.llen`` and ``get_new_tosellData``
            are used).
        debug_log: logger handed to ``DataOutput``.
        db_log: logger that receives the final status line.
    """
    print('\ntosell_save init\n')
    data_type = 'tosell'
    if not dataQ.RedisQ.llen('tosellData') > 0:
        db_log.war('%s, %s数据队列为空\n' %
                   (return_PST().strftime("%Y-%m-%d %H:%M:%S"), data_type))
        return

    dbObj = GetDbObj().get_db_obj()
    cur = dbObj.cursor()
    try:
        dataOutput = DataOutput(dbObj, cur, db_log, debug_log, dataQ)
        data_tosell_db_name = SqlConfig.data_tosell_db_name
        data_tosell_update_sql = SqlConfig.data_tosell_update_sql
        data_tosell_insert_sql = SqlConfig.data_tosell_insert_sql

        druid_tosell_db_name = SqlConfig.druid_tosell_db_name
        # Offer rows are insert-only: no update statement is supplied.
        druid_tosell_update_sql = None
        druid_tosell_insert_sql = SqlConfig.druid_tosell_insert_sql

        exists_sql = "select asin, aday from public.amazon_product_tosell where asin=%(asin)s and aday=%(aday)s limit 1;"
        # Parameterized: the ASIN comes from queue data.
        seller_sql = "select sname, seller_id from public.amazon_product_data where asin=%(asin)s and getinfo_tm > %(the_tm)s"

        while True:
            datas = dataQ.get_new_tosellData()
            if not datas:
                if dataQ.RedisQ.llen('tosellData') > 0:
                    datas = dataQ.get_new_tosellData()
                else:
                    break
            if not datas:
                # The retry returned nothing either; re-check the queue
                # instead of iterating over an empty/None batch.
                continue

            tm = DataOutput.get_redis_time()
            for asin in datas:
                tosell_datas = datas[asin][0]
                tosell_list = datas[asin][1]
                print(tosell_datas['getinfo_tm'], 1)
                # Stamp the record with the save time.
                tosell_datas['getinfo_tm'] = tm
                print(tosell_datas['getinfo_tm'], 2)

                aday = tosell_list[0]['aday'] if len(
                    tosell_list) > 0 else return_PST().strftime('%Y%m%d')
                cur.execute(exists_sql, {'asin': asin, 'aday': aday})
                select_rows = cur.fetchall()
                dbObj.commit()
                if len(select_rows) >= 1:
                    # Already saved for this day.
                    continue

                print(tosell_datas)
                if not tosell_datas.get('sname'):
                    cur.execute(seller_sql, {
                        'asin': asin,
                        'the_tm': tm - 24 * 3600 * 1000,
                    })
                    seller_rows = cur.fetchall()
                    dbObj.commit()
                    # Only an unambiguous single row is trusted.
                    sname, seller_id = seller_rows[0] if len(
                        seller_rows) == 1 else ('', '')
                    print('seller_id: ', seller_id)
                    print('sname ', sname)
                    tosell_datas['sname'] = sname
                    tosell_datas['seller_id'] = seller_id

                dataOutput.save_data_to_db(data_tosell_update_sql,
                                           data_tosell_insert_sql,
                                           asin,
                                           tosell_datas,
                                           db_name=data_tosell_db_name)
                for offer in tosell_list:
                    offer['tm'] = int(tm / 1000)
                    dataOutput.save_data_to_db(druid_tosell_update_sql,
                                               druid_tosell_insert_sql,
                                               asin,
                                               offer,
                                               db_name=druid_tosell_db_name)

                # Record the crawl time for this ASIN.
                dataOutput.crawler_tm(asin, data_type)
    finally:
        cur.close()
        dbObj.close()
    db_log.war('%s, %s线程任务已完成\n' %
               (return_PST().strftime("%Y-%m-%d %H:%M:%S"), data_type))
コード例 #26
0
ファイル: goodsDataOutput.py プロジェクト: xusu12/hs_code
def _fill_from_previous_goods_row(data, previous_row):
    """Patch ``data`` in place with values from the previously stored row.

    ``previous_row`` is ``(rc, quantity, price, title, bsr)``. Columns that
    come back as ``None`` (SQL NULL) are never used as a fallback.
    """
    the_old_rc, the_old_qty, the_old_price, the_old_title, the_old_bsr = \
        previous_row
    print(the_old_rc, the_old_qty, the_old_price, the_old_title, the_old_bsr)
    the_new_qty = data['quantity']
    print('price1', data['price'], the_old_price)
    # No price this time: reuse the stored one.
    if (data['price'] <= 0 and the_old_price is not None
            and the_old_price > 0 and data['asin_state'] == 3):
        data['price'] = the_old_price
    # No title this time: reuse the stored one.
    if not data['title'] and the_old_title:
        data['title'] = the_old_title
    print('the_old_rc', the_old_rc, type(the_old_rc))
    print('old quantity', the_old_qty, type(the_old_qty))
    print('new quantity', the_new_qty, type(the_new_qty))
    print("data['rc']", data['rc'], type(data['rc']))
    # The review count is not allowed to go down.
    if the_old_rc is not None and data.get('rc', 0) < the_old_rc:
        data['rc'] = the_old_rc
    # Stock fetch failed (-1): reuse the stored stock and mark it (qtydt 4).
    if (the_new_qty == -1 and the_old_qty is not None and the_old_qty >= 0
            and data['asin_state'] == 3):
        data['quantity'] = the_old_qty
        data['qtydt'] = 4
        with open('quantity_fail.csv', 'a', encoding='utf-8') as f:
            f.write('asin, %s, old qty, %s, new qty, %s\n' %
                    (data['asin'], the_old_qty, the_new_qty))
    # NOTE(review): this reset only runs when a previous row exists, as in
    # the original code -- confirm that is intended.
    if data['asin_state'] == 2:
        data['quantity'] = 0
        data['byb'] = 0
        data['qtydt'] = 5  # not for sale


def goods_data_save(dataQ, debug_log, db_log):
    """Drain the 'goodsData' queue into the product tables.

    Each queued record is first reconciled with the row stored for the same
    ASIN during the last three days. It is then written to the history store
    through ``druidData_to_db`` and to the current table through
    ``DataOutput.save_data_to_db``. Records with negative stock and
    ``asin_state`` 3 are skipped while the ``return_PST()`` hour is below 9.

    The cursor and connection are closed even when saving fails; the
    exception still propagates to the caller.

    Args:
        dataQ: data queue wrapper (``RedisQ.llen`` and ``get_new_goods_data``
            are used).
        debug_log: logger handed to ``DataOutput``.
        db_log: logger that receives the final status line.
    """
    from pprint import pprint

    print('\ngoods_save init\n')
    data_type = 'goods'
    if not dataQ.RedisQ.llen('goodsData') > 0:
        db_log.war('%s, %s数据队列为空\n' %
                   (return_PST().strftime("%Y-%m-%d %H:%M:%S"), data_type))
        return

    dbObj = GetDbObj().get_db_obj()
    cur = dbObj.cursor()
    try:
        dataOutput = DataOutput(dbObj, cur, db_log, debug_log, dataQ)
        db_name = SqlConfig.goods_db_name
        update_sql = SqlConfig.goods_update_sql
        insert_sql = SqlConfig.goods_insert_sql
        previous_sql = "select rc, quantity, price, title, bsr from public.amazon_product_data where asin=%(asin)s and getinfo_tm>%(the_tm)s ;"

        while True:
            datas = dataQ.get_new_goods_data()
            the_hour = return_PST().hour
            if not datas:
                if dataQ.RedisQ.llen('goodsData') > 0:
                    datas = dataQ.get_new_goods_data()
                else:
                    break
            if not datas:
                # The retry returned nothing either; re-check the queue
                # instead of iterating over an empty/None batch.
                continue

            tm = DataOutput.get_redis_time()
            for asin, data in datas.items():
                pprint(data)

                print(data['getinfo_tm'], 1)
                # Stamp the record with the save time.
                data['getinfo_tm'] = tm
                print(data['getinfo_tm'], 2)
                print('rc1: ', data['rc'])
                print('quantity1', data['quantity'])

                cur.execute(previous_sql, {
                    'asin': data['asin'],
                    'the_tm': (tm / 1000 - 3600 * 24 * 3) * 1000,
                })
                select_rows = cur.fetchall()
                if len(select_rows) > 0:
                    print(select_rows, type(select_rows),
                          type(select_rows[0]), type(select_rows[0][0]))
                    _fill_from_previous_goods_row(data, select_rows[0])

                # Missing dpre / cart_price default to the price.
                if data['dpre'] <= 0 and data['price'] > 0:
                    data['dpre'] = data['price']
                if data['cart_price'] <= 0 and data['price'] > 0:
                    data['cart_price'] = data['price']
                print('price2', data['price'])
                print('quantity2', data['quantity'])
                print('rc2: ', data['rc'])

                if the_hour < 9 and data['quantity'] < 0 and data[
                        'asin_state'] == 3:
                    # Do not store the record yet.
                    continue

                # The history store gets the full record first.
                druidData_to_db(asin, data, dataOutput)

                # Drop the fields the current table does not need.
                for key in ('dpre', 'bs1', 'qc', 'qtydt', 'aday'):
                    data.pop(key)
                dataOutput.save_data_to_db(update_sql,
                                           insert_sql,
                                           asin,
                                           data,
                                           db_name=db_name)
                # Record the crawl time for this ASIN.
                dataOutput.crawler_tm(asin, data_type)
    finally:
        cur.close()
        dbObj.close()
    db_log.war('%s, %s线程任务已完成\n' %
               (return_PST().strftime("%Y-%m-%d %H:%M:%S"), data_type))
コード例 #27
0
ファイル: keywordDataOutput.py プロジェクト: xusu12/hs_code
def _save_keyword_item(dbObj, cur, db_log, debug_log, dataQ, kw, payload):
    """Store one keyword: its summary row, ranking rows and crawl time.

    ``payload`` is ``(keyword_data_dict, keyword_druid_data_list)``. Nothing
    is written when ``public.amazon_druid_keyword_data`` already has a row
    for the keyword on the same ``aday``.
    """
    tm = DataOutput.get_redis_time()
    keyword_data_dict = payload[0]
    keyword_druid_data_list = payload[1]
    row_count = len(keyword_druid_data_list)
    aday = keyword_druid_data_list[0][
        'aday'] if row_count > 0 else return_PST().strftime("%Y%m%d")

    # For small result sets, make the reported search count match the number
    # of ranking rows that were collected.
    if row_count < 50 and keyword_data_dict['search_num'] < 1000:
        if keyword_data_dict['search_num'] != row_count:
            keyword_data_dict['search_num'] = row_count
            for data in keyword_druid_data_list:
                data['srn'] = row_count

    # Stamp the record with the save time.
    keyword_data_dict['getinfo_tm'] = tm
    sql = "select kw from public.amazon_druid_keyword_data where kw=%(kw)s and aday=%(aday)s limit 1;"
    cur.execute(sql, dict(kw=kw, aday=aday))
    asin_rows = cur.fetchall()
    print('asin_rows: ', len(asin_rows))
    print('keyword_druid_data_list len: ', row_count)
    if len(asin_rows) >= 1:
        # Already saved for this day.
        return

    DataOutput.save_data_to_db_pool(dbObj,
                                    cur,
                                    db_log,
                                    debug_log,
                                    dataQ,
                                    SqlConfig.keyword_data_update_sql,
                                    SqlConfig.keyword_data_insert_sql,
                                    kw,
                                    keyword_data_dict,
                                    db_name=SqlConfig.keyword_data_db_name)
    for druid in keyword_druid_data_list:
        druid['tm'] = tm
        # Ranking rows are insert-only: no update statement is supplied.
        DataOutput.save_data_to_db_pool(
            dbObj,
            cur,
            db_log,
            debug_log,
            dataQ,
            None,
            SqlConfig.druid_keyword_insert_sql,
            kw,
            druid,
            db_name=SqlConfig.druid_keyword_db_name)

    # Record the crawl time for this keyword.
    data_dict = {
        'kw': kw,
        'crawler_tm': keyword_data_dict['getinfo_tm'] / 1000
    }
    db_name = 'public.amazon_keyword_monitor.crawler_tm'
    insert_sql = ''
    update_sql = "update public.amazon_keyword_monitor set crawler_tm=%(crawler_tm)s where kw=%(kw)s;"
    DataOutput.save_data_to_db_pool(dbObj,
                                    cur,
                                    db_log,
                                    debug_log,
                                    dataQ,
                                    update_sql,
                                    insert_sql,
                                    kw,
                                    data_dict,
                                    db_name=db_name)


def keyword_data_save(dataQ, debug_log, db_log):
    """Drain the 'keywordData' queue into the keyword tables.

    Each batch is processed on a connection borrowed from a psycopg2
    connection pool and committed once the whole batch has been handled. The
    connection is borrowed only after a batch is available. Its cursor is
    closed and the connection returned after every batch, and the pool is
    closed even when saving fails; the exception still propagates.

    The completion line is logged whether or not the queue held any data.

    Args:
        dataQ: data queue wrapper (``RedisQ.llen`` and
            ``get_new_keywordData`` are used).
        debug_log: logger handed to ``DataOutput.save_data_to_db_pool``.
        db_log: logger that receives the final status line.
    """
    print('\nkeyword_data_save init\n')
    data_type = 'keyword'
    if dataQ.RedisQ.llen('keywordData') > 0:
        pool = psycopg2.pool.SimpleConnectionPool(5, 51,
                                                  **DATADB_CONFIG[BASE_TYPE])
        try:
            while True:
                datas = dataQ.get_new_keywordData()
                if not datas:
                    pending = dataQ.RedisQ.llen('keywordData')
                    if pending > 0:
                        print(pending, type(pending))
                        datas = dataQ.get_new_keywordData()
                    else:
                        break
                if not datas:
                    # The retry returned nothing either; re-check the queue
                    # instead of iterating over an empty/None batch.
                    continue

                dbObj = pool.getconn()
                try:
                    cur = dbObj.cursor()
                    try:
                        for kw, payload in datas.items():
                            _save_keyword_item(dbObj, cur, db_log, debug_log,
                                               dataQ, kw, payload)
                        dbObj.commit()
                    finally:
                        cur.close()
                finally:
                    pool.putconn(dbObj)
        finally:
            pool.closeall()
    db_log.war('%s, %s线程任务已完成\n' %
               (return_PST().strftime("%Y-%m-%d %H:%M:%S"), data_type))
コード例 #28
0
ファイル: tosellDataOutput.py プロジェクト: xusu12/hs_code
def _backfill_tosell_seller(cur, dbObj, asin, tosell_datas, tm):
    """Fill ``sname``/``seller_id`` of *tosell_datas* from ``amazon_product_data``.

    Only rows whose ``getinfo_tm`` is newer than ``tm`` minus 24 hours
    (expressed in milliseconds) are considered.  When the lookup does not
    return exactly one row, both fields are set to an empty string.
    """
    print(222222)
    # Bound parameters instead of string interpolation: the ASIN originates
    # from queued crawl data and must never be spliced into the SQL text.
    sql = ("select sname, seller_id from public.amazon_product_data "
           "where asin=%(asin)s and getinfo_tm > %(since)s")
    cur.execute(sql, {'asin': asin, 'since': tm - 24 * 3600 * 1000})
    rows = cur.fetchall()
    dbObj.commit()
    sname, seller_id = rows[0] if len(rows) == 1 else ('', '')
    print('seller_id: ', seller_id)
    print('sname ', sname)
    tosell_datas['sname'] = sname
    tosell_datas['seller_id'] = seller_id


def _save_tosell_batch(dataOutput, dbObj, cur, datas, data_type):
    """Persist one dequeued batch shaped ``{asin: (summary_dict, [offer_dict, ...])}``.

    An ASIN is skipped entirely when ``amazon_product_tosell`` already holds
    a row for the same ASIN and day (``aday``).
    """
    data_tosell_db_name = SqlConfig.data_tosell_db_name
    data_tosell_update_sql = SqlConfig.data_tosell_update_sql
    data_tosell_insert_sql = SqlConfig.data_tosell_insert_sql
    druid_tosell_db_name = SqlConfig.druid_tosell_db_name
    # No update statement is passed for the per-offer rows; the one in
    # SqlConfig (druid_tosell_update_sql) is deliberately left unused.
    druid_tosell_update_sql = None
    druid_tosell_insert_sql = SqlConfig.druid_tosell_insert_sql

    exists_sql = ("select asin, aday from public.amazon_product_tosell "
                  "where asin=%(asin)s and aday=%(aday)s limit 1;")
    # One timestamp per batch so every row written for it agrees.
    tm = DataOutput.get_redis_time()
    for asin, payload in datas.items():
        tosell_datas = payload[0]
        tosell_list = payload[1]

        pprint(tosell_datas)
        pprint(tosell_list)
        print(tosell_datas['getinfo_tm'], 1)
        tosell_datas['getinfo_tm'] = tm
        print(tosell_datas['getinfo_tm'], 2)

        # Use the day of the first offer; fall back to today's date as
        # returned by return_PST() when there are no offers.
        aday = (tosell_list[0]['aday']
                if tosell_list else return_PST().strftime('%Y%m%d'))
        cur.execute(exists_sql, {'asin': asin, 'aday': aday})
        already_saved = cur.fetchall()
        dbObj.commit()
        if len(already_saved) >= 1:
            continue

        if not tosell_datas.get('sname'):
            _backfill_tosell_seller(cur, dbObj, asin, tosell_datas, tm)

        dataOutput.save_data_to_db(data_tosell_update_sql,
                                   data_tosell_insert_sql,
                                   asin,
                                   tosell_datas,
                                   db_name=data_tosell_db_name)

        for offer in tosell_list:
            # Offer rows carry the batch timestamp divided by 1000.
            offer['tm'] = int(tm / 1000)
            dataOutput.save_data_to_db(druid_tosell_update_sql,
                                       druid_tosell_insert_sql,
                                       asin,
                                       offer,
                                       db_name=druid_tosell_db_name)

        # Record the update time for this ASIN.
        dataOutput.crawler_tm(asin, data_type)


def tosell_save(dataQ, debug_log, db_log):
    """Drain the ``tosellData`` queue and store every batch in the database.

    Batches are fetched with ``dataQ.get_new_tosellData()`` until the queue
    reports a length of zero.  The database cursor and connection are closed
    even if saving a batch raises.

    :param dataQ: queue wrapper; ``RedisQ.llen`` and ``get_new_tosellData``
        are used here, and the object is also handed to ``DataOutput``.
    :param debug_log: debug logger, passed through to ``DataOutput``.
    :param db_log: logger whose ``war`` method receives the completion or
        empty-queue message.
    :return: ``None``.
    """
    print('\ntosell_save init\n')
    data_type = 'tosell'
    if not dataQ.RedisQ.llen('tosellData') > 0:
        db_log.war('%s, %s数据队列为空\n' %
                   (return_PST().strftime("%Y-%m-%d %H:%M:%S"), data_type))
        return

    dbObj = GetDbObj().get_db_obj()
    try:
        cur = dbObj.cursor()
        try:
            dataOutput = DataOutput(dbObj, cur, db_log, debug_log, dataQ)
            while True:
                datas = dataQ.get_new_tosellData()
                pprint(datas)
                if not datas:
                    if dataQ.RedisQ.llen('tosellData') > 0:
                        # Queue still has entries: fetch again rather than
                        # iterating over an empty/None batch.
                        continue
                    break
                _save_tosell_batch(dataOutput, dbObj, cur, datas, data_type)
        finally:
            cur.close()
    finally:
        dbObj.close()
    db_log.war('%s, %s线程任务已完成\n' %
               (return_PST().strftime("%Y-%m-%d %H:%M:%S"), data_type))
コード例 #29
0
ファイル: keywordCrawler.py プロジェクト: xusu12/hs_code
    def download(self, ip, asin_or_kw, url_dict):
        """Download the search-result pages for one keyword and queue the data.

        The search URL is built from the keyword and the ``cid`` entry of
        *url_dict*.  Each downloaded page is checked with ``analyze_html``;
        only when every page passes are the pages handed to ``kw_parser``.
        Outcomes are reported through ``record_log`` with these codes:
        0 keyword not found, 1 success, 2 parser returned nothing,
        3 parser error, 6 page download error.

        :param ip: address passed to the downloader and written to the logs.
        :param asin_or_kw: the keyword to search for.
        :param url_dict: task dict; ``cid``, ``md5``, ``dnum`` and ``durl``
            are read, and ``dnum``/``durl`` are updated in place.
        :return: ``None``, except on the keyword-not-found path, where the
            result of ``debug_log.war`` is returned.
        """
        url_type = self.url_type
        kw = asin_or_kw
        cid = url_dict.get('cid') or 0
        print(ip, kw, cid, url_dict)
        kw_url = self.make_search_url(quote(kw), cid)
        print('\nkeyword_url_tuple: ', kw_url)
        self.debug_log.debug(
            'url_type: %s, asin: %s, monitor_type: %s, url %s: ' %
            (url_type, kw, cid, kw_url))

        # Without a URL there is nothing to fetch: hand the task back for a
        # retry and stop.
        if not kw_url:
            print(url_type, '没有url')
            self.add_url_to_set(url_dict, url_type, retry_type=True)
            time.sleep(1)
            return

        self.debug_log.debug('[ip %s] 工作中... [%s]' % (ip, kw_url))
        startTime = return_PST().strftime("%Y-%m-%d %H:%M:%S")
        time1 = time.time()
        referer = 'https://www.amazon.com'
        self.debug_log.debug('keyword referer: %s' % referer)
        ua = self.get_ua()
        self.debug_log.debug('keyword ua: %s' % ua)
        # Prefer the md5 key shipped with the task; derive one otherwise.
        url_md5key = url_dict.get('md5') or self.get_md5_key(kw + url_type)
        cookMd5key = None
        cookie = self.get_cookie(cookMd5key)
        self.debug_log.debug('keyword cookie: %s' % cookie)

        # Download the url(s).
        html_list, url_list, cookiesObj, is_error_lsit = \
            self.get_keyword_html_lsit(kw_url, ua, ip, cookie, referer, kw=kw)
        previous_downloads = url_dict.get('dnum') or 0
        known_urls = url_dict.get('durl') or []
        url_dict['durl'] = list(set(known_urls + url_list))
        url_dict['dnum'] = previous_downloads + 1

        # If the keyword was judged not to exist, stop here.
        if self.not_found:
            DataOutput.record_not_found_keyword(kw)
            self.dataQ.record_keyword_not_fund_times()
            self.record_log(kw, time1, 0, url_type, startTime, ip,
                            'the keyword not found')
            return self.debug_log.war('%s关键字不存在' % (kw))

        valid_pages = []
        last_url = kw_url
        for idx, html in enumerate(html_list):
            page_failed = is_error_lsit[idx]
            last_url = url_list[idx]
            if page_failed:
                self.record_log(kw, time1, 6, url_type, startTime, ip,
                                'get Html error')
                continue
            verdict = self.analyze_html(html,
                                        cookie,
                                        cookiesObj,
                                        ip,
                                        kw,
                                        url_dict,
                                        cookMd5key,
                                        time1,
                                        startTime,
                                        html_type=url_type)
            if verdict and verdict != 404:
                valid_pages.append(html)

        every_page_ok = (len(html_list) > 0
                         and len(html_list) == len(valid_pages))
        if not every_page_ok:
            # Nothing downloaded, or at least one page was rejected.
            self.the_url_is_discard(kw, url_dict, url_type, url_md5key)
        else:
            result, parse_failed = self.kw_parser(valid_pages,
                                                  kw,
                                                  cid,
                                                  ip=ip,
                                                  url=last_url)
            if parse_failed:
                self.record_log(kw, time1, 3, url_type, startTime, ip,
                                'get data error')
            elif not result:
                self.the_url_is_discard(kw, url_dict, url_type, url_md5key)
                self.record_log(kw, time1, 2, url_type, startTime, ip,
                                'get data defeated')
            else:
                keyword_datas = result[0]
                if keyword_datas:
                    self.save_success_asin_keyword(kw, url_type=url_type)
                    self.record_log(kw, time1, 1, url_type, startTime, ip,
                                    'get data success')
                    self.dataQ.add_keyword_to_queue(
                        pickle.dumps(keyword_datas))
                    self.dataQ.record_data_ok_times()
                    self.dataQ.record_keyword_ok_times()
                else:
                    # Parsed but empty: put the task back for another try.
                    self.add_url_to_queue(url_dict,
                                          url_type=url_type,
                                          retry_type=True)

        time.sleep(1)
コード例 #30
0
ファイル: keywordParser.py プロジェクト: xusu12/hs_code
    def kw_parser(self, html_code_list, keyword, cid, not_match=False):
        """Parse keyword search-result pages into product rows plus a summary.

        Every result node of every page becomes one row; rows without an
        ASIN are dropped.  Rows get a 1-based position ``pr`` across all
        pages, then only the first 50 are kept.  Price, rating (``rrg``) and
        review-count (``rc``) aggregates are computed over those rows, using
        only ``int`` values (price and rating must be > 0, review count
        >= 0).

        :param html_code_list: HTML strings of the result pages; the first
            one is used to read the total number of search results.
        :param keyword: the searched keyword.
        :param cid: category id (0 stands for all categories).
        :param not_match: when truthy, fewer than 50 rows are accepted even
            if more than 100 search results were reported.
        :return: ``{keyword: (summary_dict, rows)}``, or ``{}`` when the
            result is rejected or every headline figure is zero.
        """
        from pprint import pprint

        print('not_match', not_match)
        search_num = self._get_search_num(html_code_list[0])
        print('html_code_list len: ', len(html_code_list))

        rows = []
        for page_no, html_code in enumerate(html_code_list, start=1):
            self.html_code = html_code
            for idx, node in enumerate(KwParser.get_results_tag(html_code)):
                # Re-parse the serialized node so the field extractors work
                # on this single result only.
                fragment = etree.HTML(str(tostring(node), encoding='utf-8'))
                asin = self._get_asin(fragment, idx)
                title = self._get_title(fragment, idx)
                brand = self._get_brand(fragment, idx)
                price = self._get_price(fragment, idx)
                rrg = self._get_rrg(fragment, idx)
                rc = self._get_rc(fragment, idx)
                img = self._get_img(fragment, idx)
                issp = self._get_issp(fragment, idx)
                prime = self._get_prime(fragment, idx)
                row = {
                    'kw': keyword,  # keyword
                    'cid': cid,  # category id (0 means all categories)
                    'asin': asin,  # product
                    'title': title,  # product title
                    'img': img,  # product image
                    'brand': brand,  # brand
                    'msn': 0,  # monthly search volume
                    'issp': issp,  # sponsored (paid promotion) or not
                    'srn': search_num,  # number of search results
                    'price': price,  # product price
                    'rrg': rrg,  # product rating
                    'rc': rc,  # product review count
                    'special': 1,
                    # time the data was fetched (milliseconds)
                    'tm': int(BaseParser.get_redis_time()),
                    # Pacific date the data was fetched
                    'aday': return_PST().strftime("%Y%m%d"),
                    'is_prime': prime,  # has the prime badge or not
                    'fba': prime,
                    'category': None,  # category
                    'bsr': None,
                }
                if asin:
                    rows.append(row)
            print(page_no, '数据长度: ', len(rows))

        # Positions are assigned before the list is cut down to 50 rows.
        for position, row in enumerate(rows, start=1):
            row['pr'] = position

        print(keyword, 'keyword_data_list len: ', len(rows))
        rows = rows[:50]

        # Price
        price_list = [
            row.get('price') for row in rows
            if type(row.get('price')) is int and row.get('price') > 0
        ]
        # Rating
        rrg_list = [
            row.get('rrg') for row in rows
            if type(row.get('rrg')) is int and row.get('rrg') > 0
        ]
        # Review count
        rc_list = [
            row.get('rc') for row in rows
            if type(row.get('rc')) is int and row.get('rc') >= 0
        ]

        print('price_list: ', len(price_list), price_list)
        print('rrg: ', len(rrg_list), rrg_list)
        print('rc: ', len(rc_list), rc_list)
        price_max = self._get_price_max(price_list)
        price_min = self._get_price_min(price_list)
        price_ave = self._get_price_ave(price_list)
        rrg_max = self._get_rrg_max(rrg_list)
        rrg_min = self._get_rrg_min(rrg_list)
        rrg_ave = self._get_rrg_ave(rrg_list)
        rc_max = self._get_rc_max(rc_list)
        rc_min = self._get_rc_min(rc_list)
        rc_ave = self._get_rc_ave(rc_list)

        date = self._get_date()
        # A reported total below 50 that is smaller than what was actually
        # parsed is replaced by the parsed count, on the rows as well.
        if search_num < 50 and search_num < len(rows):
            search_num = len(rows)
            for row in rows:
                row['srn'] = search_num

        summary = {
            'kw': keyword,  # keyword
            'cid': cid,  # category id (0 means all categories)
            'mon_search': 0,  # monthly search volume (deprecated)
            'search_num': search_num,  # number of search results
            'price_max': price_max,  # highest price
            'price_min': price_min,  # lowest price
            'price_ave': price_ave,  # average price
            'rrg_max': rrg_max,  # highest rating
            'rrg_min': rrg_min,  # lowest rating
            'rrg_ave': rrg_ave,  # average rating
            'rc_max': rc_max,  # highest review count
            'rc_min': rc_min,  # lowest review count
            'rc_ave': rc_ave,  # average review count
            'date': date,  # collection date (Ymd)
            'mon_search_state': 0,  # state of monthly-search collection
            'other_state': 1,  # state of the other data collection
            # time the data was fetched (milliseconds)
            'getinfo_tm': int(BaseParser.get_redis_time()),
        }
        print(keyword, 'keyword_data_list len1: ', len(rows))

        # rows never exceeds 50 entries here, so only the "more than 100
        # results" case can reject: it needs a full 50 rows or not_match.
        if search_num > 100:
            accepted = len(rows) == 50 or not_match
        else:
            accepted = True
        result_dict = {keyword: (summary, rows)} if accepted else {}

        # All headline figures zero: treat as no data.
        all_zero = (search_num == 0 and price_max == 0 and price_ave == 0
                    and rrg_max == 0 and rc_max == 0)
        if all_zero:
            result_dict = {}
        pprint(result_dict)
        return result_dict