def weitao_spider():
    """Pop one taobao short url off the queue and parse it.

    Returns True when a url was taken and processed, False when the
    queue is empty or the url was already handled (deduplicated via a
    deterministic uuid5 of the url).
    """
    global loop

    # Guard clause: nothing queued, nothing to do.
    if my_queue.empty():
        print('空queue!')

        return False

    taobao_short_url = my_queue.get()
    # uuid5 is deterministic for a given url, so it serves as the dedupe key.
    taobao_short_url_uuid = str(
        uuid.uuid5(uuid.NAMESPACE_DNS, taobao_short_url))
    print(taobao_short_url_uuid)
    print(old_message_url_uuid_list)
    if taobao_short_url_uuid in old_message_url_uuid_list:
        return False

    print('拿到待处理url:', taobao_short_url)

    weitao = TaoBaoWeiTaoShareParse()
    try:
        loop.run_until_complete(
            weitao._deal_with_api_info(taobao_short_url))
    except RuntimeError:
        # The shared loop may already be running or closed; skip this round.
        pass

    # Best-effort cleanup; never let teardown abort the cycle.
    try:
        del weitao
        # loop.close()  # loop is reused, keep it open
    except Exception:
        pass
    gc.collect()
    restart_program()  # restart the process to avoid duplicated log handlers

    return True
def just_fuck_run():
    """Run the big crawl only inside the allowed hour window.

    A daemon process could not be made to work, so this is intended to
    run under tmux; the hour whitelist keeps it from colliding with
    other crawlers scheduled at other times.
    """
    # Shanghai-time hours (two-digit strings) during which crawling is allowed.
    _spider_run_time = ['00', '01', '02', '03', '04', '05']
    while True:
        current_hour = str(get_shanghai_time())[11:13]
        if current_hour in _spider_run_time:
            while True:
                # Re-check the clock each pass; leave the inner loop as
                # soon as we drift out of the allowed window.
                if str(get_shanghai_time())[11:13] not in _spider_run_time:
                    print('冲突时间点, 不抓取数据..., 上海时间%s' % str(get_shanghai_time()))
                    sleep(60 * 5)
                    break

                print('一次大抓取即将开始'.center(30, '-'))
                taobao_qianggou = TaoBaoQiangGou()
                loop = asyncio.get_event_loop()
                loop.run_until_complete(
                    taobao_qianggou._deal_with_all_goods_id())
                # Best-effort cleanup; never let teardown abort the cycle.
                try:
                    del taobao_qianggou
                    loop.close()
                except Exception:
                    pass
                gc.collect()
                print('一次大抓取完毕, 即将重新开始'.center(30, '-'))
                restart_program()  # restart the process to avoid duplicated log output
                sleep(60 * 30)

        else:
            print('未在脚本运行时间点...休眠中, 上海时间%s' % str(get_shanghai_time()))
            sleep(60 * 2)
# Beispiel #3
def main_2():
    """Run the updater forever, restarting the process after each pass."""
    while True:
        loop = asyncio.get_event_loop()
        loop.run_until_complete(run_forever())
        # Best-effort close; the loop may already be closed.
        try:
            loop.close()
        except Exception:
            pass
        gc.collect()
        restart_program()
# Beispiel #4
def just_fuck_run():
    """Crawl jumeiyoupin pintuan goods forever, restarting after each pass."""
    while True:
        print('一次大抓取即将开始'.center(30, '-'))
        jumeiyoupin_pintuan = JuMeiYouPinPinTuan()
        loop = asyncio.get_event_loop()
        loop.run_until_complete(jumeiyoupin_pintuan.deal_with_data())
        # Best-effort cleanup; never let teardown abort the cycle.
        try:
            del jumeiyoupin_pintuan
            loop.close()
        except Exception:
            pass
        gc.collect()
        print('一次大抓取完毕, 即将重新开始'.center(30, '-'))
        restart_program()       # restart the process to avoid duplicated log output
def just_fuck_run():
    """Run the qianggou real-time update forever, restarting each pass."""
    while True:
        print('一次大抓取即将开始'.center(30, '-'))
        taobao_qianggou = TaoBaoQiangGouRealTimesUpdate()
        loop = asyncio.get_event_loop()
        loop.run_until_complete(taobao_qianggou._run_forever())
        # Best-effort cleanup; never let teardown abort the cycle.
        try:
            del taobao_qianggou
            loop.close()
        except Exception:
            pass
        gc.collect()
        print('一次大抓取完毕, 即将重新开始'.center(30, '-'))
        restart_program()   # restart the process to avoid duplicated log output
        sleep(60*10)
# Beispiel #6
def just_fuck_run():
    """Crawl tiantiantejia goods forever, restarting after each pass."""
    while True:
        print('一次大抓取即将开始'.center(30, '-'))
        taobao_tiantaintejia = TaoBaoTianTianTeJia()
        loop = asyncio.get_event_loop()
        loop.run_until_complete(taobao_tiantaintejia.deal_with_all_goods_id())
        # Best-effort cleanup; never let teardown abort the cycle.
        try:
            del taobao_tiantaintejia
            loop.close()
        except Exception:
            pass
        gc.collect()
        print('一次大抓取完毕, 即将重新开始'.center(30, '-'))
        restart_program()  # restart the process to avoid duplicated log output
        sleep(60 * 5)
def just_fuck_run():
    """Run the jumeiyoupin real-time update forever, restarting each pass."""
    while True:
        print('一次大更新即将开始'.center(30, '-'))
        tmp = JuMeiYouPinRealTimesUpdate()
        loop = asyncio.get_event_loop()
        # NOTE: run_until_complete() must be given a coroutine that returns
        # a value; otherwise it is treated as unfinished and results are
        # printed repeatedly.
        loop.run_until_complete(tmp.run_forever(
        ))
        print('麻痹的执行完了')
        # Best-effort cleanup; never let teardown abort the cycle.
        try:
            del tmp
            loop.close()
        except Exception:
            pass
        gc.collect()
        print('一次大更新完毕'.center(30, '-'))
        restart_program()  # restart the process to avoid duplicated log output
# Beispiel #8
def run_forever():
    """Endless real-time update pass over all taobao goods ids in the DB.

    Each outer iteration: create a fresh day-stamped logger, fetch every
    goods row, re-crawl each item and write it back, then restart the
    whole process (to reset log handlers) after an hour-dependent sleep.
    """
    #### real-time data update
    while True:
        # ** Must NOT be a module-global created outside the loop, or every
        # pass would keep appending to the same (first) day's log file.
        my_lg = set_logger(log_file_name=MY_SPIDER_LOGS_PATH + '/淘宝/实时更新/' +
                           str(get_shanghai_time())[0:10] + '.txt',
                           console_log_level=INFO,
                           file_log_level=ERROR)

        # tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
        tmp_sql_server = SqlPools()  # manage db connections through a sqlalchemy pool
        try:
            # result = list(tmp_sql_server.select_taobao_all_goods_id())
            result = tmp_sql_server.select_taobao_all_goods_id()

        except TypeError:
            # A failed db connection surfaces here as a TypeError.
            my_lg.error('TypeError错误, 原因数据库连接失败...(可能维护中)')
            result = None
        if result is None:
            pass
        else:
            my_lg.info('------>>> 下面是数据库返回的所有符合条件的goods_id <<<------')
            my_lg.info(str(result))
            my_lg.info(
                '--------------------------------------------------------')

            my_lg.info('即将开始实时更新数据, 请耐心等待...'.center(100, '#'))
            index = 1
            # NOTE(review): each row appears to be
            # (goods_id, is_delete, shelf/down time, old price, old taobao price)
            # judging by the item[...] accesses below — confirm against the query.
            for item in result:  # update every goods row in real time
                taobao = TaoBaoLoginAndParse(logger=my_lg)
                if index % 50 == 0:  # reconnect every 50 items so one long-lived connection can't hang
                    my_lg.info('正在重置,并与数据库建立新连接中...')
                    # try:
                    #     del tmp_sql_server
                    # except:
                    #     pass
                    # gc.collect()
                    # tmp_sql_server = SqlServerMyPageInfoSaveItemPipeline()
                    tmp_sql_server = SqlPools()

                    my_lg.info('与数据库的新连接成功建立...')

                if tmp_sql_server.is_connect_success:
                    my_lg.info(
                        '------>>>| 正在更新的goods_id为(%s) | --------->>>@ 索引值为(%s)'
                        % (item[0], str(index)))
                    data = taobao.get_goods_data(item[0])

                    if data.get('is_delete') == 1:  # special-case goods that were ALREADY off-shelf when first inserted
                        data['goods_id'] = item[0]
                        data['my_shelf_and_down_time'], data[
                            'delete_time'] = get_my_shelf_and_down_time_and_delete_time(
                                tmp_data=data,
                                is_delete=item[1],
                                MyShelfAndDownTime=item[2])

                        # my_lg.info('------>>>| 爬取到的数据为: ' + str(data))
                        taobao.to_right_and_update_data(
                            data, pipeline=tmp_sql_server)

                        sleep(TAOBAO_REAL_TIMES_SLEEP_TIME)  # throttle: avoid updating the server too frequently
                        index += 1
                        gc.collect()
                        continue

                    data = taobao.deal_with_data(goods_id=item[0])
                    if data != {}:
                        data['goods_id'] = item[0]
                        data['my_shelf_and_down_time'], data[
                            'delete_time'] = get_my_shelf_and_down_time_and_delete_time(
                                tmp_data=data,
                                is_delete=item[1],
                                MyShelfAndDownTime=item[2])
                        data['_is_price_change'], data[
                            '_price_change_info'] = _get_price_change_info(
                                old_price=item[3],
                                old_taobao_price=item[4],
                                new_price=data['price'],
                                new_taobao_price=data['taobao_price'])

                        # my_lg.info('------>>>| 爬取到的数据为: ' + str(data))
                        taobao.to_right_and_update_data(
                            data, pipeline=tmp_sql_server)
                    else:
                        my_lg.info('------>>>| 休眠5s中...')
                        sleep(5)

                else:  # the returned data is empty: db connection is down
                    my_lg.error('数据库连接失败,数据库可能关闭或者维护中')
                    sleep(10)
                    pass

                index += 1
                # try:
                #     del taobao
                # except:
                #     pass
                gc.collect()
                # on an overseas server this can be shortened, even to 0s
                sleep(TAOBAO_REAL_TIMES_SLEEP_TIME)  # keep requests sparse, staggered away from user traffic
            my_lg.info('全部数据更新完毕'.center(100, '#'))  # sleep(60*60)
        if get_shanghai_time().hour == 0:  # no updates after midnight
            sleep(60 * 60 * 5.5)
        else:
            sleep(5)
        gc.collect()
        restart_program()
# Beispiel #9
        return data

    def __del__(self):
        """Best-effort cleanup when the instance is garbage-collected."""
        # Bare except is deliberate here: __del__ can run during interpreter
        # shutdown, when attributes (or even exception classes) may already
        # be gone; nothing here may be allowed to raise.
        try:
            del self.my_lg
            del self.msg
            del self.my_pipeline
        except:
            pass
        gc.collect()


# Sample short urls kept for manual testing.
# _short_url = 'http://m.tb.cn/h.WAjz5RP'
# _short_url = 'http://m.tb.cn/h.WA6JGoC'
_short_url = 'http://m.tb.cn/h.WA6Hp6H'

if __name__ == '__main__':
    # Interactive driver: parse one weitao share url per loop iteration.
    while True:
        taobao_short_url = input('请输入淘宝短链接:').replace(';', '')
        weitao = TaoBaoWeiTaoShareParse()
        loop = asyncio.get_event_loop()
        loop.run_until_complete(weitao._deal_with_api_info(taobao_short_url))
        # Best-effort cleanup; never let teardown abort the cycle.
        try:
            del weitao
            loop.close()
        except Exception:
            pass
        gc.collect()
        restart_program()  # restart the process to avoid duplicated log output