예제 #1
0
def send_result_detail_task(source, tasks, detail_tables, priority):
    """Fan merged hotel rows out into per-source hotel detail tasks.

    Every row of ``tasks`` is ``(hid, source_list, timestamp)`` where
    ``source_list`` is a JSON object mapping a source name to that source's
    hotel URL. URLs are grouped by ``(task_tag, source)`` -- ``task_tag`` is
    looked up in ``detail_tables`` -- and each group is inserted through its
    own ``InsertTask`` context.

    Args:
        source: Not used by this function; kept for interface compatibility.
        tasks: Sequence of ``(hid, source_list, timestamp)`` rows.
        detail_tables: Object with a ``get(source_name)`` lookup returning the
            task tag; sources without a truthy tag are skipped.
        priority: Priority forwarded to ``InsertTask``.

    Returns:
        The timestamp of the last row in ``tasks``, or ``None`` when ``tasks``
        is empty.
    """
    timestamp = None
    if not tasks:
        return timestamp

    # URL values that carry no usable address.
    placeholder_urls = ('null', '{}', None, 'http://', '', 'https://')

    grouped = defaultdict(list)
    for hid, source_list, timestamp in tasks:
        for _source, hotel_url in json.loads(source_list).items():
            task_tag = detail_tables.get(_source)
            if not task_tag:
                continue
            grouped[(task_tag, _source)].append((hid, hotel_url))

    # The loops below deliberately do not rebind ``timestamp``: the value
    # returned must stay the one of the last row of ``tasks`` rather than
    # depend on the iteration order of ``grouped``.
    for (task_tag, _source), hid_and_urls in grouped.items():
        with InsertTask(worker='proj.total_tasks.hotel_detail_task', queue='hotel_detail', routine_key='hotel_detail',
                        task_name=task_tag, source=_source.title(), _type='Hotel',
                        priority=priority) as it:
            for hid, hotel_url in hid_and_urls:
                if hotel_url in placeholder_urls:
                    continue
                # TripAdvisor URLs are not inserted by this function.
                if hotel_url.strip().startswith('https://www.tripadvisor.cn'):
                    continue

                it.insert_task({
                    'source': _source,
                    'url': hotel_url,
                    'part': task_tag,
                    'source_id': 'NULL',
                    'hid': hid,
                    'city_id': 'NULL',
                    'country_id': 'NULL'
                })
    return timestamp
예제 #2
0
def qyer_supplement_map_info(tasks):
    """Insert one supplement-field task per row and return the last ``utime``.

    Every row of ``tasks`` is unpacked as
    ``(table_name, type, source, sid, other_info, status, utime)``.

    Args:
        tasks: Sequence of 7-field rows as described above.

    Returns:
        The ``utime`` of the last row, or ``None`` when ``tasks`` is empty.
    """
    if not tasks:
        return None

    # NOTE(review): index 0 of a row is the table_name column according to the
    # unpacking below, while the source column sits at index 2 -- confirm that
    # titling the first column is really what InsertTask should receive.
    first_field = tasks[0][0]
    inserter = InsertTask(
        worker='proj.total_tasks.supplement_map_info',
        queue='supplement_field',
        routine_key='supplement_field',
        task_name='supplement_field',
        source=first_field.title(),
        _type='SupplementField',
        priority=3,
    )

    last_utime = None
    with inserter as it:
        for table_name, _kind, row_source, sid, other_info, _status, row_utime in tasks:
            last_utime = row_utime
            payload = {
                'table_name': table_name,
                'source': row_source,
                'sid': sid,
                'other_info': other_info,
            }
            it.insert_task(payload)

    return last_utime
예제 #3
0
def send_qyer_detail_task(tasks, task_tag, priority):
    """Insert one Qyer detail task per row and return the last ``utime``.

    Every row of ``tasks`` is unpacked as
    ``(source, source_id, city_id, qyer_url, utime)``; only ``qyer_url`` ends
    up in the inserted payload.

    Args:
        tasks: Iterable of 5-field rows as described above.
        task_tag: Used as the ``InsertTask`` task name and as the payload
            ``part``.
        priority: Priority forwarded to ``InsertTask``.

    Returns:
        The ``utime`` of the last row, or ``None`` when there are no rows.
    """
    last_utime = None
    inserter = InsertTask(
        worker='proj.total_tasks.qyer_detail_task',
        queue='poi_detail',
        routine_key='poi_detail',
        task_name=task_tag,
        source='Qyer',
        _type='QyerDetail',
        priority=priority,
    )
    with inserter as it:
        for _source, _source_id, _city_id, qyer_url, row_utime in tasks:
            last_utime = row_utime
            payload = {
                'target_url': qyer_url,
                'city_id': 'NULL',
                'part': task_tag
            }
            it.insert_task(payload)

    return last_utime
예제 #4
0
def send_GT_detail_task(tasks, task_tag, priority):
    """Insert one GT detail task per row and return the last ``utime``.

    Every row of ``tasks`` is unpacked as
    ``(source, source_id, city_id, country_id, url, utime)``. The ``url``
    column holds a JSON document; it is decoded, extended with ``dest_id``
    (the row's ``country_id``) and ``source``, and inserted as the task
    payload.

    Args:
        tasks: Iterable of 6-field rows as described above.
        task_tag: Used as the ``InsertTask`` task name.
        priority: Priority forwarded to ``InsertTask``.

    Returns:
        The ``utime`` of the last row, or ``None`` when there are no rows.
    """
    last_utime = None
    inserter = InsertTask(
        worker='proj.total_tasks.GT_detail_task',
        queue='grouptravel',
        routine_key='grouptravel',
        task_name=task_tag,
        source='GT',
        _type='GTDetail',
        priority=priority,
    )
    with inserter as it:
        for row_source, _source_id, _city_id, country_id, url, row_utime in tasks:
            last_utime = row_utime
            payload = json.loads(url)
            payload['dest_id'] = country_id
            payload['source'] = row_source
            it.insert_task(payload)
    return last_utime
예제 #5
0
def send_ctripPoi_detail_task(tasks, task_tag, priority):
    """Insert one Ctrip POI detail task per row and return the last ``utime``.

    Every row of ``tasks`` is unpacked as
    ``(source, source_id, city_id, country_id, url, utime)``. Note that the
    row's ``country_id`` is stored under the payload key ``tag``.

    Args:
        tasks: Iterable of 6-field rows as described above.
        task_tag: Used as the ``InsertTask`` task name.
        priority: Priority forwarded to ``InsertTask``.

    Returns:
        The ``utime`` of the last row, or ``None`` when there are no rows.
    """
    last_utime = None
    inserter = InsertTask(
        worker='proj.total_tasks.ctrip_poi_detail_task',
        queue='poi_detail',
        routine_key='poi_detail',
        task_name=task_tag,
        source='CtripPoi',
        _type='CtripPoiDetail',
        priority=priority,
    )
    with inserter as it:
        for row_source, poi_id, city_id, country_id, detail_url, row_utime in tasks:
            last_utime = row_utime
            payload = {
                'source': row_source,
                'poi_id': poi_id,
                'tag': country_id,
                'city_id': city_id,
                'detail_url': detail_url
            }
            it.insert_task(payload)
    return last_utime
예제 #6
0
def send_poi_detail_task(tasks, task_tag, priority):
    """Insert one Daodao POI detail task per row and return the last ``utime``.

    ``task_tag`` must consist of exactly four ``_``-separated fields; the
    second one is sent along as ``poi_type``. Every row of ``tasks`` is
    unpacked as ``(source, source_id, city_id, hotel_url, utime)``.

    Args:
        tasks: Iterable of 5-field rows as described above.
        task_tag: Four-part tag, also used as task name and payload ``part``.
        priority: Priority forwarded to ``InsertTask``.

    Returns:
        The ``utime`` of the last row, or ``None`` when there are no rows.

    Raises:
        ValueError: If ``task_tag`` does not split into exactly four fields.
    """
    # Only the second field is needed, but the four-way unpacking is kept so
    # that a malformed tag is rejected before any task is inserted.
    _first, poi_type, _third, _fourth = task_tag.split('_')

    last_utime = None
    inserter = InsertTask(
        worker='proj.total_tasks.poi_detail_task',
        queue='poi_detail',
        routine_key='poi_detail',
        task_name=task_tag,
        source='Daodao',
        _type='DaodaoDetail',
        priority=priority,
    )
    with inserter as it:
        for _source, _source_id, _city_id, target_url, row_utime in tasks:
            last_utime = row_utime
            payload = {
                'target_url': target_url,
                'city_id': 'NULL',
                'poi_type': poi_type,
                'country_id': 'NULL',
                'part': task_tag
            }
            it.insert_task(payload)

    return last_utime
예제 #7
0
def send_hotel_detail_task(tasks, task_tag, priority):
    """Insert one hotel detail task per row and return the last timestamp.

    Every row of ``tasks`` is unpacked as
    ``(source, source_id, city_id, hotel_url, timestamp)``. The first field of
    the first row, title-cased, is passed to ``InsertTask`` as ``source``.

    Args:
        tasks: Sequence of 5-field rows as described above.
        task_tag: Used as the ``InsertTask`` task name and as payload ``part``.
        priority: Priority forwarded to ``InsertTask``.

    Returns:
        The timestamp of the last row, or ``None`` when ``tasks`` is empty.
    """
    if not tasks:
        return None

    last_timestamp = None
    first_source = tasks[0][0]
    inserter = InsertTask(
        worker='proj.total_tasks.hotel_detail_task',
        queue='hotel_detail',
        routine_key='hotel_detail',
        task_name=task_tag,
        source=first_source.title(),
        _type='Hotel',
        priority=priority,
    )
    with inserter as it:
        for row_source, source_id, _city_id, hotel_url, row_timestamp in tasks:
            last_timestamp = row_timestamp
            payload = {
                'source': row_source,
                'url': hotel_url,
                'part': task_tag,
                'source_id': source_id,
                'city_id': 'NULL',
                'country_id': 'NULL'
            }
            it.insert_task(payload)
        # The return deliberately stays inside the ``with`` block.
        return last_timestamp
예제 #8
0
def send_result_daodao_filter(source, tasks, daodao_filter_table, priority):
    """Insert URL-filter tasks for TripAdvisor links found in merged hotel rows.

    Every row of ``tasks`` is ``(hid, source_list, timestamp)`` where
    ``source_list`` is a JSON object mapping a source name to a URL. Only
    entries whose source is one of agoda/booking/ctrip/elong/hotels and whose
    URL starts with ``https://www.tripadvisor.cn`` are inserted.

    The prefix test is done on the whitespace-stripped URL -- the same value
    that is inserted -- so a padded TripAdvisor URL is not silently dropped.

    Args:
        source: Title-cased and passed to ``InsertTask`` as ``source``.
        tasks: Sequence of ``(hid, source_list, timestamp)`` rows.
        daodao_filter_table: Table name sent with every task; the task name is
            this value plus ``'f'`` with ``'list'`` replaced by ``'detail'``.
        priority: Priority forwarded to ``InsertTask``.

    Returns:
        The timestamp of the last row in ``tasks``, or ``None`` when ``tasks``
        is empty.
    """
    timestamp = None
    if not tasks:
        return timestamp

    task_name = (daodao_filter_table + 'f').replace('list', 'detail')
    accepted_sources = ('agoda', 'booking', 'ctrip', 'elong', 'hotels')
    # URL values that carry no usable address.
    placeholder_urls = ('null', '{}', None, 'http://', '', 'https://')

    with InsertTask(worker='proj.total_tasks.result_daodao_filter', queue='hotel_detail', routine_key='hotel_detail',
                    task_name=task_name, source=source.title(), _type='daodaoURLFilter', task_type=TaskType.NORMAL,
                    priority=priority) as it:
        for hid, source_list, timestamp in tasks:
            for _source, hotel_url in json.loads(source_list).items():
                if _source not in accepted_sources:
                    continue
                if hotel_url in placeholder_urls:
                    continue
                url = hotel_url.strip()
                if not url.startswith('https://www.tripadvisor.cn'):
                    continue

                it.insert_task({
                    'url': url,
                    'source': _source,
                    'id': hid,
                    'table_name': daodao_filter_table,
                })
    return timestamp
예제 #9
0
def city2list():
    """Turn pending city-queue documents into list tasks.

    For every ``City_Queue_*`` collection in ``db`` (two GT collections are
    skipped explicitly) the function:

    1. picks the first document that has a ``task_name`` and uses its
       worker/queue/routing key/source/type/priority to open an ``InsertTask``
       of type ``TaskType.LIST_TASK`` under the task name with ``city_``
       replaced by ``list_`` (the table is created via ``create_table``);
    2. walks all documents with ``finished == 0``, marks those it considers
       complete as finished, and for the others inserts a task for the
       document's current ``date_index`` and then increments ``date_index``.

    A collection that has no document with ``task_name`` is skipped instead of
    failing or reusing the template document of a previous collection.

    Entries of ``data_count`` are indexed positionally: ``[0]`` is counted for
    distinct values, ``[1]`` is the sort key, ``[3]`` is compared with zero and
    ``[-1]`` is a truthiness flag -- presumably task id, time, result count and
    success flag; verify against the code that writes ``data_count``.
    """
    skipped_collections = (
        'City_Queue_grouptravel_TaskName_city_total_GT_20180312a',
        'City_Queue_grouptravel_TaskName_city_total_GT_20180314a',
    )
    for collection_name in db.collection_names():
        if not str(collection_name).startswith('City_Queue_'):
            continue
        if collection_name in skipped_collections:
            continue
        collections = db[collection_name]
        _count = 0

        # Fetch one document to initialise the task inserter. Documents may be
        # malformed, so take the first one that actually has a task_name.
        per_data = None
        for each in collections.find({}):
            if 'task_name' in each:
                per_data = copy.deepcopy(each)
                break
        if per_data is None:
            # Nothing usable in this collection; do not fall through with an
            # unbound or stale template document.
            logger.info('skip collection %s : no document with task_name' % collection_name)
            continue
        task_name = per_data['task_name']

        new_task_name = re.sub('city_', 'list_', task_name)
        create_table(new_task_name)
        logger.info('转换任务名  %s : %s' % (task_name, new_task_name))

        with InsertTask(worker=per_data['worker'], queue=per_data['queue'], routine_key=per_data['routing_key'],
                        task_name=new_task_name, source=per_data['source'], _type=per_data['type'],
                        priority=per_data['priority'], task_type=TaskType.LIST_TASK) as it:
            for line in collections.find({"finished": 0}):
                # The template lookup above skipped documents without a
                # task_name, so skip them here as well.
                if 'task_name' not in line:
                    continue

                # Basic info: which date this city is currently at.
                date_index = line['date_index']
                data_count = line['data_count']
                returned_count = len(set(entry[0] for entry in data_count))

                if int(date_index) > returned_count:
                    # More dates were dispatched than distinct results came
                    # back: the earlier tasks are still running, so wait.
                    continue

                if returned_count >= MAX_TASK_PER_CITY:
                    # The city reached its maximum number of tasks and is
                    # treated as finished.
                    # NOTE(review): there is no ``continue`` here, so one more
                    # task is still dispatched below -- confirm this is intended.
                    collections.update({'list_task_token': line['list_task_token']}, {"$set": {"finished": 1}})

                flagged_entries = [entry for entry in data_count if entry[-1]]
                if len(flagged_entries) > FINISHED_ZERO_COUNT:
                    # If the most recent FINISHED_ZERO_COUNT flagged entries
                    # (ordered by entry[1]) all report 0, the city is exhausted.
                    recent_entries = sorted(flagged_entries, key=lambda entry: entry[1])[-FINISHED_ZERO_COUNT:]
                    if all(int(entry[3]) == 0 for entry in recent_entries):
                        collections.update({'list_task_token': line['list_task_token']}, {"$set": {"finished": 1}})
                        continue

                _count += 1
                if _count == MAX_CITY_TASK_PER_SEARCH:
                    # Stop dispatching once the per-run city limit is reached.
                    break

                args = line['args']
                new_date = get_city_date(task_name, date_index)
                args['check_in'] = new_date
                args['date_index'] = date_index

                it.insert_task(args=args)

                # Advance the document to the next date.
                collections.update({
                    '_id': line['_id']
                }, {
                    '$inc': {'date_index': 1}
                })
예제 #10
0
def _store_crawled_md5(conn, cursor, md5_rows):
    """Insert ``md5_rows`` into ``crawled_url`` (duplicates ignored) and commit."""
    cursor.executemany('insert ignore into crawled_url(md5, update_time) values(%s, %s)', args=md5_rows)
    conn.commit()


def send_image_task(tasks, task_tag, priority, is_poi_task):
    """Insert one image download task per not-yet-seen image URL.

    Every row of ``tasks`` is unpacked as
    ``(source, source_id, city_id, img_items, update_time)`` where
    ``img_items`` is a ``|``-separated list of URLs (or ``None``). Each URL is
    fingerprinted with the MD5 of ``source + str(source_id) + url``. Unless
    ``task_tag`` contains ``20171122a`` or ``20171120a``, URLs whose
    fingerprint is already present in ``redis_md5`` are skipped. Fingerprints
    of inserted URLs are written to ``redis_md5`` and, in batches of 5000, to
    the MySQL table ``crawled_url``.

    The MySQL connection is opened only when there is work to do and is always
    closed again, including when the final batch is empty or an error occurs.

    Args:
        tasks: Sequence of 5-field rows as described above.
        task_tag: ``_``-separated tag. Everything after the first ``_`` forms
            the task-name suffix, the second field selects the bucket and the
            last field is sent as ``part``.
        priority: Priority forwarded to ``InsertTask``.
        is_poi_task: Passed through unchanged in every task payload.

    Returns:
        The ``update_time`` of the last row, or ``None`` when ``tasks`` is
        empty.

    Raises:
        IndexError: If ``tasks`` is non-empty and ``task_tag`` has no ``_``.
    """
    update_time = None
    if not tasks:
        return update_time

    source = tasks[0][0]
    suffix = task_tag.split('_', 1)[1]

    # Everything derived from task_tag alone is computed once up front.
    tag_fields = task_tag.split('_')
    task_type = tag_fields[1]
    part = tag_fields[-1]
    if task_type == 'total':
        bucket_name = 'mioji-attr'
    else:
        bucket_name = "mioji-{}".format(task_type)
    file_prefix = "huantaoyou" if bucket_name == 'mioji-wanle' else ""
    skip_known_urls = '20171122a' not in task_tag and '20171120a' not in task_tag

    _count = 0
    md5_data = []
    # NOTE(review): connection settings are hard-coded here; consider moving
    # them to configuration.
    conn = pymysql.connect(host='10.10.228.253', user='******', password='******', charset='utf8',
                           db='ServicePlatform')
    try:
        cursor = conn.cursor()
        try:
            with InsertTask(worker='proj.total_tasks.images_task', queue='file_downloader',
                            routine_key='file_downloader',
                            task_name='images_' + suffix, source=source.title(), _type='DownloadImages',
                            priority=priority) as it:
                for source, source_id, city_id, img_items, update_time in tasks:
                    if img_items is None:
                        continue
                    for url in img_items.split('|'):
                        if not url:
                            continue
                        md5 = hashlib.md5(source + str(source_id) + url).hexdigest()

                        if skip_known_urls and redis_md5.get(md5):
                            continue

                        redis_md5.set(md5, 1)
                        md5_data.append((md5, datetime.datetime.now()))
                        _count += 1

                        it.insert_task({
                            'source': source,
                            'new_part': task_tag,
                            'target_url': url,
                            'is_poi_task': is_poi_task,
                            'source_id': source_id,
                            'part': part,
                            'bucket_name': bucket_name,
                            'file_prefix': file_prefix
                        })

                        if _count % 5000 == 0:
                            _store_crawled_md5(conn, cursor, md5_data)
                            md5_data = []

                # Persist whatever is left of the last, partial batch.
                if md5_data:
                    _store_crawled_md5(conn, cursor, md5_data)
        finally:
            cursor.close()
    finally:
        conn.close()
    return update_time