Esempio n. 1
0
def hilton_to_database(tid, used_times, source, keyword, extra, spider_tag, need_cache=True):
    """Run the spider registered under ``spider_tag`` for a suggest lookup.

    ``keyword`` becomes the task content and ``extra`` is attached to the
    task unchanged. ``tid``, ``used_times`` and ``source`` are accepted but
    not read by this function.

    Returns a ``(error_code, suggest_result)`` pair, where ``suggest_result``
    is ``spider.result['suggest']``.
    """
    crawl_task = Task()
    crawl_task.content = keyword
    crawl_task.extra = extra

    spider = factory.get_spider_by_old_source(spider_tag)
    spider.task = crawl_task

    # Only the cache configuration differs between the cached and
    # non-cached crawl.
    selected_cache = cache_config if need_cache else none_cache_config
    error_code = spider.crawl(required=['suggest'], cache_config=selected_cache)

    suggestions = spider.result['suggest']
    logger.info(str(len(suggestions)) + '  --  ' + keyword)
    return error_code, suggestions
def poidetail_to_database(tid, used_times, source, url, need_cache=True):
    """Crawl one POI detail page with the ``<source>_detail`` spider.

    ``url`` is used as the task content; ``tid`` and ``used_times`` are
    forwarded to the spider through ``task.ticket_info``.

    Returns ``(error_code, poi_detail, page_store_key_list)`` where
    ``poi_detail`` is ``spider.result['POIdetail']``.
    """
    spider_name = source + '_detail'

    crawl_task = Task()
    crawl_task.content = url
    crawl_task.ticket_info = {'tid': tid, 'used_times': used_times}

    print(spider_name)
    spider = factory.get_spider_by_old_source(spider_name)
    spider.task = crawl_task

    # Only the cache configuration differs between the two crawl modes.
    selected_cache = cache_config if need_cache else none_cache_config
    error_code = spider.crawl(required=['POIdetail'], cache_config=selected_cache)
    print(error_code)

    poi_detail = spider.result['POIdetail']
    logger.info(str(poi_detail) + '  --  ' + crawl_task.content)
    return error_code, poi_detail, spider.page_store_key_list
Esempio n. 3
0
def hilton_to_database(tid, used_times, source, source_id, city_id, check_in, need_cache=True):
    """Crawl list and room data with the ``hiltonHotel2`` spider.

    The task content is the ``&``-joined string
    ``NULL&<city_id>&<source_id>&2&<check_in>``. ``source`` is accepted but
    not read by this function.

    Returns ``(error_code, room_result, page_store_key_list)`` where
    ``room_result`` is ``spider.result['room']``.
    """
    crawl_task = Task()
    crawl_task.content = '&'.join(
        ['NULL', str(city_id), str(source_id), '2', '{0}'.format(check_in)])
    crawl_task.ticket_info = {
        'tid': tid,
        'used_times': used_times,
        'room_info': [{"occ": 2, "num": 1}],
    }

    spider = factory.get_spider_by_old_source('hiltonHotel2')
    spider.task = crawl_task

    # Only the cache configuration differs between the two crawl modes.
    selected_cache = cache_config if need_cache else none_cache_config
    error_code = spider.crawl(required=['list', 'room'], cache_config=selected_cache)
    print(error_code)

    rooms = spider.result['room']
    logger.info(str(rooms) + '  --  ' + crawl_task.content)
    return error_code, rooms, spider.page_store_key_list
Esempio n. 4
0
def qyer_list_to_database(tid, used_times, source, city_id, check_in, city_url, need_cache=True):
    """Crawl a listing page with the ``qyerList`` spider.

    ``city_url`` is used as the task content; ``tid`` and ``used_times`` are
    forwarded through ``task.ticket_info``. ``source``, ``city_id`` and
    ``check_in`` are accepted but not read by this function.

    Returns ``(error_code, list_result, page_store_key_list,
    types_result_num)`` where ``list_result`` is ``spider.result['list']``.
    """
    crawl_task = Task()
    crawl_task.content = city_url
    crawl_task.ticket_info = {'tid': tid, 'used_times': used_times}

    spider = factory.get_spider_by_old_source('qyerList')
    spider.task = crawl_task

    # Only the cache configuration differs between the two crawl modes.
    selected_cache = cache_config if need_cache else none_cache_config
    error_code = spider.crawl(required=['list'], cache_config=selected_cache)
    print(error_code)

    listing = spider.result['list']
    logger.info(str(listing) + '  --  ' + crawl_task.content)
    return (error_code, listing, spider.page_store_key_list,
            spider.types_result_num)
Esempio n. 5
0
def hotel_rest_list_task(self, source, url, city_id, **kwargs):
    """Crawl a listing via ``hotel_list_database`` and merge each entry into the DB.

    Sets ``self.task_source``, ``self.task_type`` and, on a zero crawl code,
    ``self.error_code``. Every value of every dict in the crawl result is
    turned into a ``HotelRestList`` row and merged through its own
    ``DBSession_mb4`` session; a failure while storing one row is logged and
    does not abort the remaining rows.

    Args:
        source: Source name; stored on each row and title-cased into
            ``self.task_source``.
        url: Listing URL handed to ``hotel_list_database``.
        city_id: City identifier; converted with ``int()`` for each row.
        **kwargs: Accepted but not read.

    Returns:
        ``True`` once all rows have been processed.

    Raises:
        Exception: When the crawl code is non-zero or any step outside the
            per-row DB write fails. The original error is wrapped in a plain
            ``Exception`` after being logged.
    """
    try:
        self.task_source = source.title()
        self.task_type = 'DaodaoListInfo'
        logger.info("任务进行中。。。")
        code, result = hotel_list_database(source, url)
        logger.info("code : %s" % str(code))

        if int(code) != 0:
            logger.info("=======================0=========================\n")
            logger.info(str(code) + '   |   ' + str(result))
            logger.info("\n=======================1=========================")
            # Carry the code in the message so the wrapped exception raised
            # by the outer handler is not empty.
            raise Exception('crawl failed with code %s' % str(code))

        self.error_code = str(code)

        for one in result:
            # Only the values are used; the keys of each dict are ignored.
            for view in one.values():
                rest = HotelRestList()
                rest.source = source
                rest.source_id = int(view['source_id'])
                rest.city_id = int(city_id)
                rest.url = view['view_url']
                rest.name = view['view_name'].strip('\n').strip()

                ss = None
                try:
                    ss = DBSession_mb4()
                    ss.merge(rest)
                    ss.commit()
                except Exception:
                    # Best effort per row: log with traceback and continue.
                    # logger.exception already appends the active traceback,
                    # so no explicit traceback.format_exc() call is needed.
                    logger.exception(
                        "======================= sql 异常========================="
                    )
                finally:
                    # NOTE(review): assumes the session object exposes
                    # close() (as SQLAlchemy sessions do) - confirm.
                    if ss is not None:
                        ss.close()

        return True
    except Exception as e:
        # Emit the traceback once, between the two banner lines.
        logger.error('==================  异常  0==================')
        logger.exception(source + ' | ' + str(city_id) + ' | ' + url)
        logger.error('==================  异常  1==================')
        raise Exception(e)
Esempio n. 6
0
    def _execute(self, **kwargs):
        """Crawl one hotel-list task and persist the rows it produces.

        Reads ``source``, ``city_id``, ``country_id`` and the optional
        ``list_more`` flag from ``self.task.kwargs``, runs
        ``hotel_list_database`` and then follows one of two paths:

        * ``list_more`` truthy: tag every entry of ``result['filter']`` with
          the country id and source and store them in ``filter_collections``.
        * otherwise: normalise ``result['hotel']`` into
          ``(source, source_id, city_id, country_id, hotel_url)`` tuples,
          insert them into the MySQL table named ``self.task.task_name`` and
          into the Mongo collection of the same name.

        Returns:
            ``list_more`` mode: ``(result, error_code, task.error_code,
            task.task_name, suggest)``.
            Normal mode: ``(row_count, error_code, task.error_code,
            task.task_name)``.

        Raises:
            ServiceStandardError: ``EMPTY_TICKET`` when the crawl reported
                code 0 but produced no rows; the crawl's own error code when
                it is non-zero and no rows were produced; ``MYSQL_ERROR`` /
                ``MONGO_ERROR`` when the respective store fails.
        """
        source = self.task.kwargs['source']
        city_id = self.task.kwargs['city_id']
        country_id = self.task.kwargs['country_id']
        fla = self.task.kwargs.get('list_more', False)

        @func_time_logger
        def hotel_list_crawl():
            error_code, result, page_store_key = hotel_list_database(
                tid=self.task.task_id,
                used_times=self.task.used_times,
                source=source,
                city_id=city_id,
                check_in=self.task.kwargs['check_in'],
                is_new_type=self.task.kwargs.get('is_new_type', False),
                suggest_type=self.task.kwargs.get('suggest_type', '1'),
                suggest=self.task.kwargs.get('suggest', ''),
                # Cache is only used on the first attempt of a task.
                need_cache=self.task.used_times == 0,
                flag=fla)
            return error_code, result, page_store_key

        error_code, result, page_store_key = hotel_list_crawl()
        print(result)

        # more_list
        if fla:
            filters = result['filter']
            for line in filters:
                line['country_id'] = country_id
                line['source'] = source
            if len(filters) > 0:
                # insert_many rejects an empty document list, so only call
                # it when there is something to store; the empty case falls
                # through to the error handling below.
                filter_collections.insert_many(filters)
                self.task.error_code = 0
            elif int(error_code) == 0:
                raise ServiceStandardError(ServiceStandardError.EMPTY_TICKET)
            else:
                raise ServiceStandardError(error_code=error_code)
            # Use .get() like the crawl call above so a task without a
            # 'suggest' kwarg does not fail after its data has been stored.
            return (result, error_code, self.task.error_code,
                    self.task.task_name, self.task.kwargs.get('suggest', ''))

        # starwood's code 29 is remapped to 109.
        if source == 'starwood' and error_code == 29:
            self.task.error_code = 109
            error_code = 109
        else:
            self.task.error_code = error_code

        # Each source returns its hotel rows in a different layout;
        # normalise them to (source, sid, city_id, country_id, hotel_url).
        res_data = []
        if source in ('ctrip', 'ctripcn', 'starwood', 'gha'):
            for line in result['hotel']:
                sid = line[3]
                hotel_url = line[-1]
                res_data.append((source, sid, city_id, country_id, hotel_url))
        # The next three branches compare with == : the previous
        # `source in ('bestwest')` form was a substring test on a plain
        # string (parentheses without a comma do not make a tuple).
        elif source == 'bestwest':
            # The row carries its own city id, which is the one stored; a
            # separate name avoids rebinding the task-level city_id.
            for _sr, sid, row_city_id, hotel_url in result['hotel']:
                res_data.append(
                    (source, sid, row_city_id, country_id, hotel_url))
        elif source == 'fourseasons':
            for line in result['hotel']:
                sid = line[-1]
                hotel_url = line[0]
                res_data.append((source, sid, city_id, country_id, hotel_url))
        elif source == 'hyatt':
            for line in result['hotel']:
                sid = line[-1]
                hotel_url = line[1]
                res_data.append((source, sid, city_id, country_id, hotel_url))
        elif source == 'hilton':
            for dict_obj in result['hotel']:
                # list() keeps the positional access working where
                # dict.values() returns a non-indexable view.
                # NOTE(review): this depends on the value order of the
                # spider's dict - accessing by key would be safer; confirm
                # the key names with the spider.
                line = list(dict_obj.values())
                res_data.append(
                    (source, line[2], city_id, country_id, line[0]))
        else:
            for sid, hotel_url in result['hotel']:
                res_data.append((source, sid, city_id, country_id, hotel_url))

        @func_time_logger
        def hotel_list_insert_db():
            service_platform_conn = None
            cursor = None
            try:
                service_platform_conn = service_platform_pool.connection()
                cursor = service_platform_conn.cursor()
                # The table name comes from the task; row values are bound
                # as parameters by executemany.
                sql = "INSERT IGNORE INTO {} (source, source_id, city_id, country_id, hotel_url) VALUES (%s,%s,%s,%s,%s)".format(
                    self.task.task_name)
                _res = cursor.executemany(sql, res_data)
                service_platform_conn.commit()
                self.task.list_task_insert_db_count = _res
                self.task.get_data_per_times = len(res_data)
            except Exception as e:
                self.logger.exception(msg="[mysql error]", exc_info=e)
                raise ServiceStandardError(
                    error_code=ServiceStandardError.MYSQL_ERROR,
                    wrapped_exception=e)
            finally:
                # Release the cursor and connection on failure as well.
                if cursor is not None:
                    cursor.close()
                if service_platform_conn is not None:
                    service_platform_conn.close()

        hotel_list_insert_db()

        try:
            data_collections = mongo_data_client['ServicePlatform'][
                self.task.task_name]
            data_collections.create_index([('source', 1), ('source_id', 1),
                                           ('city_id', 1)],
                                          unique=True,
                                          background=True)
            # Build the documents from res_data. The previous code tested
            # the freshly created empty list, so nothing was ever inserted.
            data = [{
                'list_task_token': self.task.list_task_token,
                'task_id': self.task.task_id,
                'source': line[0],
                'source_id': line[1],
                'city_id': line[2],
                'country_id': line[3],
                'hotel_url': line[4]
            } for line in res_data]
            if data:
                data_collections.insert(data, continue_on_error=True)
        except pymongo.errors.DuplicateKeyError:
            logger.info("[Duplicate Key]")
        except Exception as exc:
            raise ServiceStandardError(
                error_code=ServiceStandardError.MONGO_ERROR,
                wrapped_exception=exc)

        # Every failure above is raised, so reaching this point means the
        # stored content is valid. For the crawl platform, data obtained in
        # this situation should count as stored, so the final error code is
        # decided from res_data rather than from the crawl's own code.
        if len(res_data) > 0:
            self.task.error_code = 0
        elif int(error_code) == 0:
            raise ServiceStandardError(ServiceStandardError.EMPTY_TICKET)
        else:
            raise ServiceStandardError(error_code=error_code)
        return len(
            res_data), error_code, self.task.error_code, self.task.task_name