def _execute(self, **kwargs):
        with MySession(need_proxies=True, need_cache=True) as session:
            keyword = self.task.kwargs['keyword']
            suggest = {}
            try:
                response = session.post(
                    url=search_url,
                    headers=headers,
                    data=json.dumps({
                        "Keyword": keyword,
                        "SaleCityId": "1",
                        "Tab": 64
                    }),
                )
                content = response.content

                city_list = json.loads(content)['Data']
                suggest['suggest'] = content
                db = client['SuggestName']
                db.CtripCitySuggestion.save(suggest)
                self.task.error_code = 0
                return {'搜索到的city数量': len(city_list)}
            except requests.exceptions.RequestException as e:
                raise ServiceStandardError(ServiceStandardError.PROXY_INVALID,
                                           wrapped_exception=e)
            except pymongo.errors.PyMongoError as e:
                raise ServiceStandardError(ServiceStandardError.MYSQL_ERROR,
                                           wrapped_exception=e)
            except Exception as e:
                raise ServiceStandardError(ServiceStandardError.UNKNOWN_ERROR,
                                           wrapped_exception=e)
Example #2
    def _execute(self, **kwargs):

        with MySession(need_proxies=True, need_cache=True) as session:
            keyword = self.task.kwargs['keyword']
            suggest = {}
            try:
                response = session.get(url=search_url,
                                       headers=headers,
                                       data={
                                           'Jsoncallback': 'jQuery',
                                           'keyword': keyword
                                       })
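                # the endpoint answers as JSONP ('jQuery(...)'); slicing off the
                # first 7 and last 1 characters leaves the bare JSON payload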
                json_data = json.loads(response.content[7:-1])
                suggest['suggest'] = json_data
                db = client['SuggestName']
                db.CtripPoiSDK.save(suggest)
                self.task.error_code = 0
                count = 1
                if isinstance(json_data, list):
                    count = len(json_data)
                return {'搜索到的suggest数量': count}
            except requests.exceptions.RequestException as e:
                raise ServiceStandardError(ServiceStandardError.PROXY_INVALID,
                                           wrapped_exception=e)
            except pymongo.errors.PyMongoError as e:
                raise ServiceStandardError(ServiceStandardError.MYSQL_ERROR,
                                           wrapped_exception=e)
            except Exception as e:
                raise ServiceStandardError(ServiceStandardError.UNKNOWN_ERROR,
                                           wrapped_exception=e)
    def _execute(self, **kwargs):
        with MySession(need_proxies=True, need_cache=True) as session:
            keyword = self.task.kwargs['keyword']
            suggest = {}
            try:
                response = session.get(url=search_url,
                                       headers=headers,
                                       params={
                                           'searchType': 'InCity',
                                           'applyGrouping': True,
                                           'isWebRequest': True,
                                           'searchTerm': keyword
                                       })

                content = response.content
                root = html.fromstring(content)
                city_list = root.xpath('//city')
                suggest['suggest'] = content
                db = client['SuggestName']
                db.MarriottCitySuggest.save(suggest)
                self.task.error_code = 0
                return {'搜索到的city数量': len(city_list)}
            except requests.exceptions.RequestException as e:
                raise ServiceStandardError(ServiceStandardError.PROXY_INVALID,
                                           wrapped_exception=e)
            except pymongo.errors.PyMongoError as e:
                raise ServiceStandardError(ServiceStandardError.MYSQL_ERROR,
                                           wrapped_exception=e)
            except Exception as e:
                raise ServiceStandardError(ServiceStandardError.UNKNOWN_ERROR,
                                           wrapped_exception=e)
    def _execute(self, **kwargs):
        with MySession(need_proxies=True,
                       need_cache=True,
                       auto_update_host=True) as session:
            keyword = self.task.kwargs['keyword']
            suggest = {}
            try:
                response = session.post(url=search_url,
                                        headers=headers,
                                        data={'searchText': keyword})

                json_data = json.loads(response.content)
                suggest['suggest'] = json_data
                db = client['SuggestName']
                db.AccorCitySuggest.save(suggest)
            except requests.exceptions.RequestException as e:
                raise ServiceStandardError(ServiceStandardError.PROXY_INVALID,
                                           wrapped_exception=e)
            except pymongo.errors.PyMongoError as e:
                raise ServiceStandardError(ServiceStandardError.MYSQL_ERROR,
                                           wrapped_exception=e)
            except Exception as e:
                raise ServiceStandardError(ServiceStandardError.UNKNOWN_ERROR,
                                           wrapped_exception=e)
        self.task.error_code = 0
        return {'搜索到的suggest数量': json_data['TotalItemsCount']}
 def _execute(self, **kwargs):
     with MySession(need_proxies=True, need_cache=True) as session:
         keyword = self.task.kwargs['keyword']
         suggest = {}
         try:
             response = session.post(url=search_url,
                                     headers=headers,
                                     params={
                                         'r': 'search/search/searchSugguestV2',
                                         'query': keyword,
                                         'format': 'json'
                                     })
             content = response.content
             suggest['suggest'] = content
             db = client['SuggestName']
             db.TuniuCitySuggestion.save(suggest)
             self.task.error_code = 0
             return {'搜索到的city数量': 1}
         except requests.exceptions.RequestException as e:
             raise ServiceStandardError(ServiceStandardError.PROXY_INVALID,
                                        wrapped_exception=e)
         except pymongo.errors.PyMongoError as e:
             raise ServiceStandardError(ServiceStandardError.MYSQL_ERROR,
                                        wrapped_exception=e)
         except Exception as e:
             raise ServiceStandardError(ServiceStandardError.UNKNOWN_ERROR,
                                        wrapped_exception=e)
    def _execute(self, **kwargs):
        with MySession(need_proxies=True, need_cache=True) as session:
            keyword = self.task.kwargs['keyword']
            suggest = {}
            try:
                response = session.get(url=search_url,
                                       headers=headers,
                                       params={
                                           'country': 'cn',
                                           'language': 'zh',
                                           'brand': 'ihg',
                                           'query': keyword
                                       })

                json_data = json.loads(response.content)
                suggest['suggest'] = json_data
                db = client['SuggestName']
                db.IhgCitySuggest.save(suggest)
                self.task.error_code = 0
                return {'搜索到的suggest数量': json_data['preFilterCount']}
            except requests.exceptions.RequestException as e:
                raise ServiceStandardError(ServiceStandardError.PROXY_INVALID,
                                           wrapped_exception=e)
            except pymongo.errors.PyMongoError as e:
                raise ServiceStandardError(ServiceStandardError.MYSQL_ERROR,
                                           wrapped_exception=e)
            except Exception as e:
                raise ServiceStandardError(ServiceStandardError.UNKNOWN_ERROR,
                                           wrapped_exception=e)
    def _execute(self, **kwargs):
        table_name = self.task.kwargs['table_name']
        source = self.task.kwargs['source']
        sid = self.task.kwargs['sid']
        other_info = self.task.kwargs['other_info']

        # fetch the address before encoding; calling .encode() on a missing
        # (None) address would raise AttributeError instead of the intended error
        address = json.loads(other_info).get('address')
        if not address:
            execute_sql(update_status % (2, source, sid, table_name))
            raise ServiceStandardError(
                error_code=ServiceStandardError.KEY_WORDS_FILTER,
                wrapped_exception=Exception(u'address 为空'))
        address = address.encode('utf8')

        map_info = google_get_map_info(address)
        if not map_info:
            execute_sql(update_status % (2, source, sid, table_name))
            raise ServiceStandardError(
                error_code=ServiceStandardError.KEY_WORDS_FILTER,
                wrapped_exception=Exception(u'mapinfo 为空'))

        sql = update_map_info % (table_name, map_info, source, sid)
        typ2 = table_name.split('_')[1]
        sql = sql.format(field='source_id' if typ2 == 'hotel' else 'id')

        execute_sql(sql)
        execute_sql(update_status % (1, source, sid, table_name))

        self.task.error_code = 0
        return source, sid
    def _execute(self, **kwargs):
        table_name = self.task.kwargs['table_name']
        source = self.task.kwargs['source']
        sid = self.task.kwargs['sid']
        url = self.task.kwargs['url']

        typ2 = table_name.split('_')[1]
        try:
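            # pull the numeric id out of the url (presumably the digits that
            # follow 'd' in the detail-page path)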
            source_id = re.compile(r'd(\d+)').findall(url)[0]
            if not source_id:
                raise ServiceStandardError(
                    error_code=ServiceStandardError.PARSE_ERROR)
        except Exception as e:
            raise ServiceStandardError(
                error_code=ServiceStandardError.PARSE_ERROR,
                wrapped_exception=Exception(
                    'can not find source_id, url    %s' % url))

        if typ2 == 'attr':
            img_url = attr_image_parser(source_id)
        elif typ2 == 'rest':
            img_url = rest_image_parser(source_id)
        else:
            img_url = None
        if not img_url:
            raise ServiceStandardError(
                error_code=ServiceStandardError.EMPTY_TICKET)
        sql = update_imgurl % (table_name, img_url, source, sid)
        execute_sql(sql)

        self.task.error_code = 0
        return source, sid
Example #9
    def _execute(self, **kwargs):
        dept_info = self.task.kwargs['dept_info']
        dest_info = self.task.kwargs['dest_info']
        source = self.task.kwargs['source']
        error_code, result, page_store_key = GT_to_database(
            tid=self.task.task_id,
            used_times=self.task.used_times,
            vacation_type=self.task.kwargs['vacation_type'],
            source=source,
            ticket=self.task.kwargs,
            need_cache=self.task.used_times == 0)

        db[source + 'GT_list'].save({
            'collections': self.task.collection,
            'task_id': self.task.task_id,
            'used_times': self.task.used_times[0],
            'stored_page_keys': page_store_key,
            'dept_info': dept_info,
            'dest_info': dest_info,
            'result': result,
            'source': self.task.kwargs['source'],
            'insert_time': datetime.datetime.now()
        })

        self.task.error_code = error_code

        sql = SQL.format(self.task.task_name)
        data = []
        if source == 'ctrip':
            for res in result:
                data.append(
                    (source, res['pid_3rd'], dept_info['id'], dest_info['id'],
                     json.dumps(res)))
        elif source == 'tuniu':
            for res in result:
                data.append(
                    (source, res['id'], dept_info['id'], dest_info['id'],
                     json.dumps(res)))
        try:
            service_platform_conn = service_platform_pool.connection()
            cursor = service_platform_conn.cursor()
            _res = cursor.executemany(sql, data)
            service_platform_conn.commit()
            cursor.close()
            service_platform_conn.close()
            self.task.get_data_per_times = len(data)
            self.task.list_task_insert_db_count = _res
        except Exception as e:
            raise ServiceStandardError(
                error_code=ServiceStandardError.MYSQL_ERROR,
                wrapped_exception=e)

        if error_code == 27 or len(data) > 0:
            self.task.error_code = 0
        else:
            raise ServiceStandardError(
                error_code=ServiceStandardError.EMPTY_TICKET)

        return len(result), error_code
Example #10
    def _execute(self, **kwargs):
        sql = SQL.format(table_name=self.task.task_name)
        poi_type = self.task.kwargs['poi_type']
        code, result, others_info, page_store_key = hotel_list_database(
            self.task.kwargs['source'],
            self.task.kwargs['url'],
            type_dict[poi_type],
            spider_name[poi_type],
            need_cache=self.task.used_times == 0)
        self.logger.info('spider    %s %s' % (str(code), str(result)))
        self.task.error_code = code

        try:
            collections.save({
                'collections': self.task.collection,
                'task_id': self.task.task_id,
                'used_times': self.task.used_times[0],
                'others_info': others_info,
                'stored_page_keys': page_store_key,
                'result': result,
                'insert_time': datetime.datetime.now()
            })
        except Exception as e:
            self.logger.exception(msg=traceback.format_exc())

        data = []
        try:
            for one in result:
                for key, view in one.items():
                    data.append(
                        (self.task.kwargs['source'], view['source_id'],
                         self.task.kwargs['city_id'],
                         self.task.kwargs['country_id'], view['view_url']))
                    logger.info('%s' % str(
                        (self.task.kwargs['source'], view['source_id'],
                         self.task.kwargs['city_id'],
                         self.task.kwargs['country_id'], view['view_url'])))
            logger.info('%s %s' % (sql, str(data)))
            res = insert(sql, data)
            self.task.list_task_insert_db_count = res
            self.task.get_data_per_times = len(data)
        except Exception as e:
            self.logger.exception(msg="[insert db error]", exc_info=e)
            raise ServiceStandardError(
                error_code=ServiceStandardError.MYSQL_ERROR,
                wrapped_exception=e)

        # Since every failure above is raised as an exception,
        # anything returned from here on is valid data and, from the
        # crawling platform's point of view, should be written to the database.
        # Use the collected data to decide the final self.task.error_code.
        if len(data) > 0:
            self.task.error_code = 0
        else:
            raise ServiceStandardError(
                error_code=ServiceStandardError.EMPTY_TICKET)

        return self.task.error_code, self.task.kwargs['url']
Example #11
    def _execute(self, **kwargs):
        city_id = self.task.kwargs['city_id']
        country_id = self.task.kwargs['country_id']
        check_in = self.task.kwargs['check_in']
        city_url = self.task.kwargs['city_url']

        error_code, result, page_store_key, types_result_num = qyer_list_to_database(
            tid=self.task.task_id,
            used_times=self.task.used_times,
            source=self.task.kwargs['source'],
            city_id=city_id,
            check_in=check_in,
            city_url=city_url,
            need_cache=self.task.used_times == 0)

        collections.save({
            'collections': self.task.collection,
            'task_id': self.task.task_id,
            'used_times': self.task.used_times[0],
            'total_num': types_result_num,
            'stored_page_keys': page_store_key,
            'result': result,
            'insert_time': datetime.datetime.now()
        })

        self.task.error_code = error_code

        sql = SQL.format(self.task.task_name)
        data = []
        for sid, url, page_id, branch, tag in result:
            data.append(('qyer', sid, city_id, country_id, url))
        try:
            service_platform_conn = service_platform_pool.connection()
            cursor = service_platform_conn.cursor()
            _res = cursor.executemany(sql, data)
            service_platform_conn.commit()
            cursor.close()
            service_platform_conn.close()
            self.task.get_data_per_times = len(data)
            self.task.list_task_insert_db_count = _res
        except Exception as e:
            raise ServiceStandardError(
                error_code=ServiceStandardError.MYSQL_ERROR,
                wrapped_exception=e)

        if len(data) > 0:
            self.task.error_code = 0
        else:
            raise ServiceStandardError(
                error_code=ServiceStandardError.EMPTY_TICKET)

        return result, error_code
    def _execute(self, **kwargs):
        with MySession(need_cache=True, need_proxies=True) as session:
            keyword = self.task.kwargs['keyword']
            page = session.get(search_url.format(keyword),
                               headers=headers,
                               timeout=240)
            city_count = 0
            try:
                json_data = json.loads(page.content)
                client = pymongo.MongoClient(**mongo_config)
                db = client['SuggestName']

                db.QyerRawSuggest.save({'suggest': json_data})

                city_list = []
                citys = json_data.get('data', {}).get('list')
                for city in citys:
                    if city.get('type_name') == 'city':
                        city_count += 1
                        city_list.append(city)
                db.QyerCity.save({'city': city_list})
                client.close()
            except Exception as e:
                raise ServiceStandardError(
                    error_code=ServiceStandardError.MYSQL_ERROR,
                    wrapped_exception=e)
        self.task.error_code = 0
        return '抓取到的城市数量:%s' % city_count
Example #13
 def _execute(self, **kwargs):
     with MySession(need_cache=True, need_proxies=True) as session:
         keyword = self.task.kwargs['keyword']
         page_info = {}
         response = session.get(url=search_url,
                                params={
                                    'ie': 'utf-8',
                                    'tn': 'baidu',
                                    'wd': keyword,
                                    'rqlang': 'cn'
                                },
                                headers=headers)
         try:
             content = response.content
             root = html.fromstring(content)
             page_info['keyword'] = keyword
             page_info['content'] = content
             city_url = []
             city_list = root.xpath(
                 '//a[contains(text(),"place.qyer.com")]/text()')
             for city in city_list:
                 url_str = urljoin('http:', city)
                 url_str = url_str.strip('.').strip('')
                 if not city_url or url_str not in city_url:
                     city_url.append(url_str)
             page_info['city_url'] = city_url
             client = pymongo.MongoClient(**mongo_config)
             db = client['SuggestName']
             db.BaiDuSuggest.save(page_info)
         except Exception as e:
             raise ServiceStandardError(
                 error_code=ServiceStandardError.MYSQL_ERROR,
                 wrapped_exception=e)
     self.task.error_code = 0
     return page_info
Example #14
    def __exit__(self, exc_type, exc_val, exc_tb):
        self.close()
        if not exc_type:
            self.update_proxy(0)
        elif exc_type in (SSLError, ProxyError):
            self.browser_log()
            self.update_proxy(22)
            raise ServiceStandardError(22,
                                       "代理异常 from Browser [proxy: {}]".format(
                                           self.p_r_o_x_y),
                                       wrapped_exception=exc_type)
        elif exc_type in (ConnectionError, ConnectTimeout):
            self.browser_log()
            self.update_proxy(23)
            raise ServiceStandardError(23,
                                       "代理被禁 from Browser [proxy: {}]".format(
                                           self.p_r_o_x_y),
                                       wrapped_exception=exc_type)

        if self.need_cache:
            # store page check
            if exc_type is None:
                # save any not stored cache
                for k, v in self.md5_resp.items():
                    if not proj.my_lib.Common.RespStore.has_cache(k):
                        logger.info('[保存缓存][md5: {}]'.format(k))
                        proj.my_lib.Common.RespStore.put_by_md5(k, v)
            else:
                # store debug page
                for k, v in self.md5_resp.items():
                    debug_key = "debug_{}".format(k)
                    logger.info('[保存 debug 缓存][md5: {}]'.format(debug_key))
                    if not proj.my_lib.Common.RespStore.has_cache(debug_key):
                        proj.my_lib.Common.RespStore.put_by_md5(debug_key, v)

                # don't store page or delete the page
                if not self.do_not_delete_cache:
                    for each_md5 in self.md5:
                        if proj.my_lib.Common.RespStore.has_cache(each_md5):
                            logger.info('[删除缓存][md5: {}]'.format(each_md5))
                            proj.my_lib.Common.RespStore.delete_cache(each_md5)
                else:
                    for each_md5 in self.md5:
                        logger.info(
                            '[出现异常不清除缓存][md5: {}]'.format(each_md5))
Example #15
    def _execute(self, **kwargs):

        error_code, result = hilton_to_database(
            tid=self.task.task_id,
            used_times=self.task.used_times,
            source='hilton',
            keyword=self.task.kwargs['keyword'],
            extra=self.task.kwargs['extra'],
            spider_tag='hiltonSuggest',
            need_cache=self.task.used_times == 0
        )


        conn = pymysql.connect(**config)
        cursor = conn.cursor()
        save_result = []

        self.task.error_code = error_code

        # for sid, url, page_id in result:
        #     data.append(('hilton', sid, city_id, country_id, url))
        # try:
        #     service_platform_conn = service_platform_pool.connection()
        #     cursor = service_platform_conn.cursor()
        #     _res = cursor.executemany(sql, data)
        #     service_platform_conn.commit()
        #     cursor.close()
        #     service_platform_conn.close()
        #     self.task.get_data_per_times = len(data)
        #     self.task.list_task_insert_db_count = _res
        # except Exception as e:
        #     raise ServiceStandardError(error_code=ServiceStandardError.MYSQL_ERROR, wrapped_exception=e)
        print(result)
        self.logger.info('this time, result count of keyword {} is {}'.format(
            self.task.kwargs['keyword'], len(result)))
        if len(result) > 0:
            try:
                for i in result:
                    # source, sid, sid_md5, s_city, s_country, suggest_type, suggest
                    # save_result.append(
                    #     (str(i['source']), str(i['sid']), str(i['sid_md5']), str(i['s_city']), str(i['s_country']),
                    #      int(i['suggest_type']), str(i['suggest'])))

                    sql = SQL.format(str(i['source']), str(i['sid']),
                                     str(i['sid_md5']), str(i['s_city']),
                                     str(i['s_country']), i['suggest_type'],
                                     str(i['suggest']))
                    cursor.execute(sql)
                # cursor.executemany(SQL, save_result)
            except Exception as e:
                conn.rollback()
                traceback.print_exc()
            finally:
                conn.commit()
                conn.close()
            self.task.error_code = 0
        else:
            raise ServiceStandardError(error_code=ServiceStandardError.EMPTY_TICKET)

        return result, error_code
Example #16
        def hotel_list_insert_daodao(table_name, res_data):
            try:
                service_platform_conn = service_platform_pool.connection()
                cursor = service_platform_conn.cursor()
                sel_sql = 'select id, localtion_id, source_list from {} where source_id = %s'.format(table_name)
                rep_sql = "replace into {} (id, name, name_en, city_id, country_id, `from`, source_id, localtion_id, status, source_list) VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)".format(
                    table_name)
                ins_sql = "insert into {} (name, name_en, city_id, country_id, `from`, source_id, localtion_id, status, source_list) VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s)".format(
                    table_name)
                cursor.execute(sel_sql, source_id)
                exists_localtion = {}
                for id, localtion_id, source_list in cursor.fetchall():
                    print id, localtion_id, source_list
                    exists_localtion[localtion_id] = (id, json.loads(source_list or "{}"))

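                # merge the hotel source_list of rows whose localtion_id already
                # exists; rows with a brand-new localtion_id are collected for insert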
                new_hotels_and_not_id = []
                if exists_localtion:
                    for _i, line in enumerate(res_data):
                        new_hotels = {}
                        mylocaltion_id, myhotels = line[6], line[8]
                        if mylocaltion_id in exists_localtion:
                            yourid, yourhotels = exists_localtion[mylocaltion_id]
                            mykeys = myhotels.keys()
                            yourkeys = yourhotels.keys()
                            if len(mykeys) > len(yourkeys):
                                for k in mykeys:
                                    new_hotels[k] = myhotels.get(k) or yourhotels.get(k)
                            else:
                                for k in yourkeys:
                                    new_hotels[k] = yourhotels.get(k) or myhotels.get(k)

                            line[8] = json.dumps(new_hotels)
                            line.insert(0, yourid)
                        else:
                            line[8] = json.dumps(line[8] or {})
                            new_hotels_and_not_id.append(line)

                    # keep only the rows that picked up an existing id (10 columns)
                    res_data = filter(lambda x: len(x) == 10, res_data)
                    cursor.executemany(rep_sql, res_data)
                    service_platform_conn.commit()

                if new_hotels_and_not_id:
                    cursor.executemany(ins_sql, new_hotels_and_not_id)
                    service_platform_conn.commit()
                if not exists_localtion:
                    for line in res_data:
                        line[8] = json.dumps(line[8])
                    cursor.executemany(ins_sql, res_data)
                    service_platform_conn.commit()

                cursor.close()
                service_platform_conn.close()
            except Exception as e:
                self.logger.exception(msg="[mysql error]", exc_info=e)
                raise ServiceStandardError(error_code=ServiceStandardError.MYSQL_ERROR, wrapped_exception=e)
Example #17
 def hotel_list_insert_db(table_name, res_data):
     try:
         service_platform_conn = service_platform_pool.connection()
         cursor = service_platform_conn.cursor()
         sql = "replace into {} (name, name_en, city_id, country_id, `from`, status, source_list) VALUES (%s,%s,%s,%s,%s,%s,%s)".format(
             table_name)
         cursor.executemany(sql, res_data)
         service_platform_conn.commit()
         cursor.close()
         service_platform_conn.close()
     except Exception as e:
         self.logger.exception(msg="[mysql error]", exc_info=e)
         raise ServiceStandardError(error_code=ServiceStandardError.MYSQL_ERROR, wrapped_exception=e)
    def _execute(self, **kwargs):
        with MySession(need_cache=True, need_proxies=True) as session:
            iata_code = self.task.kwargs['iata_code']
            request_body = {
                "union": "",
                "maker": "",
                "isStop": "0",
                "isDomestic": "1",
                "isCross": "1",
                "queryDate2": "",
                "ftype": "",
                "queryDate1": "",
                "dep": iata_code,
                "isShare": "0",
                "depType": "1",
            }
            response = session.post(
                url="http://map.variflight.com/___api/SuXAvAQ0qWkchQuUUqHN/de1",
                headers=headers,
                data=request_body
            )

            try:
                data = json.loads(response.text)
                if int(data['code']) != 0:
                    raise ServiceStandardError(error_code=ServiceStandardError.PROXY_FORBIDDEN)

                data_collections.save(
                    {
                        'iata_code': iata_code,
                        'data': data
                    }
                )

            except Exception as e:
                raise ServiceStandardError(error_code=ServiceStandardError.MYSQL_ERROR, wrapped_exception=e)
        self.task.error_code = 0
        return data
 def get_content(self, source, line):
     try:
         res = self._get_content(source, line)
         if res:
             return res[1]
         else:
             return None
     except ReportException as e:
         add_content_report(source, str(e.type))
         return None
     except Exception as e:
         raise ServiceStandardError(
             error_code=ServiceStandardError.UNKNOWN_ERROR,
             wrapped_exception=e)
Example #20
    def _execute(self, **kwargs):

        with MySession(need_proxies=True, need_cache=True) as session:
            keyword = self.task.kwargs['keyword']
            suggest = {}
            try:
                response = session.post(url=search_url,
                                        headers=headers,
                                        data={
                                            'action': 'API',
                                            'uiOrigin': 'PTPT-dest',
                                            'types': 'geo,dest',
                                            'hglt': True,
                                            'global': True,
                                            'legacy_format': True,
                                            '_ignoreMinCount': True,
                                            'query': keyword
                                        })

                json_data = json.loads(response.content)
                suggest['suggest'] = json_data
                db = client['SuggestName']
                db.DaoDaoCitySuggest.save(suggest)
                self.task.error_code = 0
                count = 1
                if isinstance(json_data, list):
                    count = len(json_data)
                return {'搜索到的suggest数量': count}
            except requests.exceptions.RequestException as e:
                raise ServiceStandardError(ServiceStandardError.PROXY_INVALID,
                                           wrapped_exception=e)
            except pymongo.errors.PyMongoError as e:
                raise ServiceStandardError(ServiceStandardError.MYSQL_ERROR,
                                           wrapped_exception=e)
            except Exception as e:
                raise ServiceStandardError(ServiceStandardError.UNKNOWN_ERROR,
                                           wrapped_exception=e)
    def _execute(self, **kwargs):
        city_id = self.task.kwargs['city_id']
        source_id = self.task.kwargs['source_id']
        check_in = self.task.kwargs['check_in']

        error_code, result, page_store_key = hilton_to_database(
            tid=self.task.task_id,
            used_times=self.task.used_times,
            source=self.task.kwargs['source'],
            source_id=source_id,
            check_in=check_in,
            city_id=city_id,
            need_cache=self.task.used_times == 0
        )

        collections.save({
            'collections': self.task.collection,
            'task_id': self.task.task_id,
            'used_times': self.task.used_times[0],
            'stored_page_keys': page_store_key,
            'check_in': self.task.kwargs['check_in'],
            'result': result,
            'insert_time': datetime.datetime.now()
        })

        self.task.error_code = error_code

        sql = SQL.format(self.task.task_name)
        data = json.dumps(result, ensure_ascii=False)
        # for sid, url, page_id in result:
        #     data.append(('hilton', sid, city_id, country_id, url))
        # try:
        #     service_platform_conn = service_platform_pool.connection()
        #     cursor = service_platform_conn.cursor()
        #     _res = cursor.executemany(sql, data)
        #     service_platform_conn.commit()
        #     cursor.close()
        #     service_platform_conn.close()
        #     self.task.get_data_per_times = len(data)
        #     self.task.list_task_insert_db_count = _res
        # except Exception as e:
        #     raise ServiceStandardError(error_code=ServiceStandardError.MYSQL_ERROR, wrapped_exception=e)

        if len(data) > 0:
            self.task.error_code = 0
        else:
            raise ServiceStandardError(error_code=ServiceStandardError.EMPTY_TICKET)

        return result, error_code
Example #22
    def _execute(self, **kwargs):
        url = self.task.kwargs['url']
        flag = self.task.kwargs['flag']
        table_name = self.task.kwargs['table_name']

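        # encode() is assumed to return the md5 digest of the url, used as the
        # `md5` column below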
        md5_url = encode(url)
        with MySession(need_proxies=True, need_cache=True) as session:
            page = session.get(url, timeout=240)
            page.encoding = 'utf8'
            if len(page.text) == 0:
                raise ServiceStandardError(
                    error_code=ServiceStandardError.PROXY_FORBIDDEN)
            else:
                content = page.text
                j_data = json.loads(content)
                if j_data['status'] not in ['OK', 'ZERO_RESULTS']:
                    raise ServiceStandardError(
                        error_code=ServiceStandardError.PROXY_FORBIDDEN)

                data = (md5_url, url, content, flag)
                conn = pymysql.connect(host='10.10.231.105',
                                       user='******',
                                       passwd='hourong',
                                       db='crawled_html',
                                       charset="utf8")
                try:
                    with conn as cursor:
                        sql = 'insert ignore into crawled_html.{0}(`md5`,`url`,`content`,`flag`) values (%s,%s,%s,%s)'.format(
                            table_name)
                        print(cursor.execute(sql, data))
                except Exception as e:
                    raise ServiceStandardError(
                        error_code=ServiceStandardError.PROXY_FORBIDDEN,
                        wrapped_exception=e)
            self.task.error_code = 0
            return 'OK', url
 def hotel_list_insert_db():
     try:
         service_platform_conn = service_platform_pool.connection()
         cursor = service_platform_conn.cursor()
         sql = "INSERT IGNORE INTO {} (source, source_id, city_id, country_id, hotel_url) VALUES (%s,%s,%s,%s,%s)".format(
             self.task.task_name)
         _res = cursor.executemany(sql, res_data)
         service_platform_conn.commit()
         cursor.close()
         service_platform_conn.close()
         self.task.list_task_insert_db_count = _res
         self.task.get_data_per_times = len(res_data)
     except Exception as e:
         self.logger.exception(msg="[mysql error]", exc_info=e)
         raise ServiceStandardError(
             error_code=ServiceStandardError.MYSQL_ERROR,
             wrapped_exception=e)
Example #24
    def _execute(self, **kwargs):
        source = self.task.kwargs['source']
        t1 = time.time()
        error_code, result, page_store_key = GTdetail_to_database(
            tid=self.task.task_id,
            used_times=self.task.used_times,
            source=source,
            ticket=self.task.kwargs,
            need_cache=self.task.used_times == 0)

        t2 = time.time()
        self.logger.info('抓取耗时:   {}'.format(t2 - t1))

        if source == 'ctrip':
            my_collections = ctrip_collections
        elif source == 'tuniu':
            my_collections = tuniu_collections
        else:
            raise Exception(u'未知的源')

        my_collections.save({
            'source': source,
            'collections': self.task.collection,
            'task_id': self.task.task_id,
            'used_times': self.task.used_times[0],
            'stored_page_keys': page_store_key,
            'result': result,
            'args': self.task.kwargs,
            'insert_time': datetime.datetime.now()
        })

        t3 = time.time()
        self.logger.info('入库耗时:   {}'.format(t3 - t2))
        # -- detail 2 mysql --
        if len(result) > 0:
            self.task.error_code = 0
        elif len(result) == 0:
            self.task.error_code = 29
        # elif len(result) == 0:
        #     raise ServiceStandardError(ServiceStandardError.EMPTY_TICKET)
        else:
            raise ServiceStandardError(error_code=error_code)
        return self.task.error_code, len(result)
Example #25
    def _execute(self, **kwargs):
        # init task val
        source = self.task.kwargs['source']
        source_id = self.task.kwargs['source_id']
        target_url = self.task.kwargs['target_url']
        bucket_name = self.task.kwargs['bucket_name']
        file_prefix = self.task.kwargs['file_prefix']
        is_poi_task = self.task.kwargs.get('is_poi_task', True)
        need_insert_db = self.task.kwargs.get('need_insert_db', True)
        special_file_name = self.task.kwargs.get('special_file_name', '')

        # /album/user/2225/43/Q0tXRx4EY00/index/980x576
        if 'qyer.com' in target_url and source == 'qyer':
            if target_url.endswith('/index'):
                target_url += '/980x576'
            elif target_url.endswith('/index/'):
                target_url += '980x576'

        if 'ahstatic.com' in target_url and source == 'accor':
            if not target_url.startswith('http://'):
                target_url = 'http://' + target_url

        if source == 'ihg':
            if target_url.endswith('4x3?fmt=png-alpha'):
                target_url += '&wid=800&hei=600'

        flag = None
        h = None
        w = None

        file_name = ''

        with MySession(need_cache=True) as session:

            @func_time_logger
            def img_file_get():
                _page = session.get(target_url, timeout=(10800, 10800))
                return _page

            page = img_file_get()

            f_stream = StringIO(page.content)

            if f_stream.len > 10485760:
                # images larger than 10 MB are not written to the database
                raise ServiceStandardError(
                    error_code=ServiceStandardError.IMG_TOO_LARGE)

            file_md5 = get_stream_md5(f_stream)
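            # is_complete_scale_ok appears to return a status flag plus the image
            # height and width; flags 1-2 are treated as incomplete images and
            # 3-5 are size-filtered further below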
            flag, h, w = is_complete_scale_ok(f_stream)

            try:
                suffix = target_url.rsplit('.', 1)[1]
                # special handling for qyer images, which have no file suffix
                if len(suffix) > 16:
                    suffix = ''
            except IndexError as e:
                suffix = page.headers['Content-Type'].split('/')[1]

            # images without a file suffix are named by md5 alone
            if suffix:
                file_name = hashlib.md5(target_url).hexdigest() + '.' + suffix
            else:
                file_name = hashlib.md5(target_url).hexdigest()

            if flag in [1, 2]:
                raise ServiceStandardError(
                    error_code=ServiceStandardError.IMG_INCOMPLETE)
            else:
                # get img p hash
                _p_hash = img_p_hash(StringIO(page.content))

                # save file stream
                r2 = True
                if bucket_name != 'mioji-wanle':
                    r1 = upload_ks_file_stream(bucket_name,
                                               file_name,
                                               StringIO(page.content),
                                               page.headers['Content-Type'],
                                               hash_check=file_md5)
                else:
                    r1 = upload_ks_file_stream(bucket_name,
                                               '{}/'.format(file_prefix) +
                                               file_name,
                                               StringIO(page.content),
                                               page.headers['Content-Type'],
                                               hash_check=file_md5)
                if bucket_name == 'mioji-attr':
                    r2 = upload_ks_file_stream('mioji-shop',
                                               file_name,
                                               StringIO(page.content),
                                               page.headers['Content-Type'],
                                               hash_check=file_md5)

                if not (r1 and r2):
                    raise ServiceStandardError(
                        ServiceStandardError.IMG_UPLOAD_ERROR)

            use_flag = 1 if flag == 0 else 0
            size = str((h, w))

            # update the file name
            if special_file_name != '':
                file_name = special_file_name

            # bucket_name = file_path.split('_')[1] + '_bucket' if is_poi_task else ''

            data = (
                source,  # source
                source_id,  # source_id
                target_url,  # pic_url
                file_name,  # pic_md5
                self.task.task_name[-9:],  # part
                size,  # size
                use_flag,  # poi use , hotel flag
                file_md5,  # file_md5
                bucket_name,  # poi rest attr shop
                json.dumps({"p_hash":
                            _p_hash}),  # img phash for check duplicate
            )

            try:
                table_name = self.task.task_name
                if need_insert_db:
                    if is_poi_task:
                        poi_make_kw(data, table_name)
                    else:
                        hotel_make_kw(data, table_name)

                # set the success flag
                self.task.error_code = 0
            except exc.SQLAlchemyError as err:
                raise ServiceStandardError(ServiceStandardError.MYSQL_ERROR,
                                           wrapped_exception=err)
            except IOError as err:
                raise ServiceStandardError(
                    ServiceStandardError.IMG_UPLOAD_ERROR,
                    wrapped_exception=err)

        # filtered-out images return a non-zero error code
        if flag in [3, 4, 5]:
            raise ServiceStandardError(ServiceStandardError.IMG_SIZE_FILTER)
        self.task.error_code = 0
        return flag, h, w, self.task.error_code, bucket_name, file_name, self.task.task_name
Example #26
    def _execute(self, **kwargs):
        target_url = self.task.kwargs['target_url']
        city_id = self.task.kwargs['city_id']
        poi_type = self.task.kwargs['poi_type']

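        # switch the Hong Kong domain to the mainland one, presumably so the
        # page is served with the expected Chinese content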
        target_url = target_url.replace('.com.hk', '.cn')
        with MySession(need_cache=True) as session:
            page = session.get(target_url, timeout=120)
            page.encoding = 'utf8'

            parser = parser_type[poi_type]
            result = parser(page.content, target_url, city_id=city_id)

            if result == 'Error':
                raise ServiceStandardError(ServiceStandardError.PARSE_ERROR)

            result['city_id'] = city_id
            # result['utime'] = datetime.datetime.now()
            sql_key = result.keys()

            name = result['name']
            # if name.find('停业') > -1:
            #     raise ServiceStandardError(error_code=ServiceStandardError.TARGET_CLOSED)
            name_en = result['name_en']
            map_info = result['map_info']
            address = result['address']

            map_info_is_legal = True
            try:
                lon, lat = map_info.split(',')
                if float(lon) == 0.0 and float(lat) == 0.0:
                    map_info_is_legal = False
            except Exception as e:
                map_info_is_legal = False
                logger.exception(msg="[map info is not legal]", exc_info=e)

            if not key_is_legal(map_info) or not map_info_is_legal:
                if not key_is_legal(address):
                    pass
                    # raise TypeCheckError(
                    #     'Error map_info and address NULL        with parser %ss    url %s' % (
                    #         parser.func_name, target_url))
                google_map_info = google_get_map_info(address)
                if not key_is_legal(google_map_info):
                    pass
                    # raise TypeCheckError(
                    #     'Error google_map_info  NULL  with [parser: {}][url: {}][address: {}][map_info: {}]'.format(
                    #         parser.func_name, target_url, address, map_info)
                    # )
                result['map_info'] = google_map_info
            if key_is_legal(name) or key_is_legal(
                    name_en) or map_info_is_legal or key_is_legal(
                        result.get('introduction')):
                logger.info(name + '  ----------  ' + name_en)
            else:
                raise TypeCheckError(
                    'Error All Keys is None with parser %s  url %s' %
                    (parser.func_name, target_url))

            try:
                session = DBSession()
                session.execute(
                    text(
                        text_2_sql(sql_key).format(
                            table_name=self.task.task_name)), [result])
                session.commit()
                session.close()
            except Exception as e:
                logger.exception(e)
                raise ServiceStandardError(
                    error_code=ServiceStandardError.MYSQL_ERROR,
                    wrapped_exception=e)

            self.task.error_code = 0
            return self.task.error_code
Example #27
    def _execute(self, **kwargs):
        with MySession(need_cache=True, need_proxies=True) as session:
            city_id = self.task.kwargs['city_id']
            target_url = self.task.kwargs['target_url']
            headers = {'Host': 'place.qyer.com'}
            page = session.get(target_url, headers=headers, timeout=240)
            page.encoding = 'utf8'
            content = page.text

            if '请输入验证码' in content:
                raise Exception("请输入验证码")

            result = page_parser(content=content, target_url=target_url)
            result.city_id = city_id
            name = result.name
            name_en = result.name_en
            map_info = result.map_info
            address = result.address

            map_info_is_legal = True
            try:
                lon, lat = map_info.split(',')
                if float(lon) == 0.0 and float(lat) == 0.0:
                    map_info_is_legal = False
            except Exception as e:
                map_info_is_legal = False
                logger.exception(msg="[map info is not legal]", exc_info=e)

            if not key_is_legal(map_info) or not map_info_is_legal:
                if not key_is_legal(address):
                    # TODO: check temporarily commented out
                    pass
                    # raise TypeCheckError(
                    #     'Error map_info and address NULL        with parser %ss    url %s' % (
                    #         page_parser.func_name, target_url))
                google_map_info = google_get_map_info(address)
                if not key_is_legal(google_map_info):
                    # TODO: check temporarily commented out
                    pass
                    # raise TypeCheckError(
                    #     'Error google_map_info  NULL  with [parser: {}][url: {}][address: {}][map_info: {}]'.format(
                    #         page_parser.func_name, target_url, address, map_info)
                    # )
                result.map_info = google_map_info

            if key_is_legal(name) or key_is_legal(
                    name_en) or map_info_is_legal or key_is_legal(
                        result.introduction):
                logger.info(name + '  ----------  ' + name_en)
            else:
                # raise TypeCheckError(
                #     'Error name and name_en Both NULL        with parser %s    url %s' % (
                #         page_parser.func_name, target_url))
                raise TypeCheckError("All Available Key is Null")

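        # persist the parsed ORM object: take its attribute dict and drop
        # SQLAlchemy's internal _sa_instance_state key before building the SQL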
        sql_result = result.__dict__
        sql_key = sql_result.keys()
        if '_sa_instance_state' in sql_key:
            sql_key.remove('_sa_instance_state')

        try:
            session = DBSession()
            session.execute(
                text(
                    text_2_sql(sql_key).format(
                        table_name=self.task.task_name)), [sql_result])
            session.commit()
            session.close()
        except Exception as e:
            self.logger.exception(msg="[mysql exec err]", exc_info=e)
            raise ServiceStandardError(
                error_code=ServiceStandardError.MYSQL_ERROR,
                wrapped_exception=e)

        self.task.error_code = 0
        return self.task.error_code
Example #28
    def _execute(self, **kwargs):

        with MySession(need_proxies=True, need_cache=True) as session:
            keyword = self.task.kwargs['keyword']
            suggest = {}
            try:
                response = session.get(url=search_url.format(keyword),
                                       headers=headers)
                #response = requests.get(search_url.format(keyword))
                res = response.content
                root = html.fromstring(res.decode('utf-8'))
                dests = root.xpath("//div[@class='breadbar_v1 cf']/ul/li")
                dest = ''
                try:
                    for de in dests[2:-1]:
                        if dest != '':
                            dest += '|'
                        dest += de.xpath("a/text()")[0]
                except:
                    pass

                print dest
                tag = {}
                try:
                    tags = root.xpath("//ul[@class='map_tab cf']/li")
                    for ta in tags:
                        t = ta.xpath('a/span/text()')[0]
                        tt = ta.xpath('a/text()')[-1].strip()
                        tag[t] = tt
                except:
                    pass
                print tag

                map_info = ''
                try:
                    map_info = re.findall('centerGeo: ({.+})',
                                          res)[0].replace('\'', '\"')
                except:
                    pass
                print map_info

                db = client['SuggestName']
                db.CtripPoiSDK_detail.save({
                    'name': self.task.kwargs['name'],
                    'dest_name': self.task.kwargs['dest_name'],
                    'keyword': keyword,
                    'dest': dest,
                    'tag_info': tag,
                    'map_info': map_info
                })
                self.task.error_code = 0
                return 'OK'
            except requests.exceptions.RequestException as e:
                raise ServiceStandardError(ServiceStandardError.PROXY_INVALID,
                                           wrapped_exception=e)
            except pymongo.errors.PyMongoError as e:
                raise ServiceStandardError(ServiceStandardError.MYSQL_ERROR,
                                           wrapped_exception=e)
            except Exception as e:
                raise ServiceStandardError(ServiceStandardError.UNKNOWN_ERROR,
                                           wrapped_exception=e)
Example #29
    def _execute(self, **kwargs):
        url = self.task.kwargs['url']
        source = self.task.kwargs['source']
        source_id = self.task.kwargs['source_id']
        city_id = self.task.kwargs['city_id']
        country_id = self.task.kwargs['country_id']
        hid = self.task.kwargs['hid']

        headers = {}
        other_info = {'source_id': source_id, 'city_id': city_id}

        if source in ['starwood', 'hyatt', 'gha', 'shangrila', 'fourseasons']:
            error_code, res, page_store_key_list = hotel_detail_database(
                url, source)

            if error_code == 0:
                result = parse_hotel_info(res)
            else:
                raise ServiceStandardError(error_code=error_code)
        else:
            with MySession(need_cache=True) as session:

                # booking start
                if source == 'booking':
                    headers['Referer'] = 'http://www.booking.com'

                # booking end

                session.headers.update(headers)
                start = time.time()
                if source not in ('hilton', 'ihg', 'holiday', 'accor',
                                  'marriott'):
                    page = session.get(url, timeout=240)
                    page.encoding = 'utf8'
                    content = page.text
                elif source == 'ihg':
                    url1, url2 = url.split('#####')
                    page1 = session.get(url1, timeout=240)
                    page1.encoding = 'utf8'
                    content1 = page1.text

                    page2 = session.get(url2, timeout=240)
                    page2.encoding = 'utf8'
                    content2 = page2.text

                    content = [content1, content2]
                elif source == 'holiday':
                    url2, url1 = url.split('#####')
                    page1 = requests.get(
                        url1,
                        headers={
                            'x-ihg-api-key':
                            'se9ym5iAzaW8pxfBjkmgbuGjJcr3Pj6Y',
                            'ihg-language': 'zh-CN'
                        },
                        timeout=240)
                    page1.encoding = 'utf8'
                    content1 = page1.text

                    page2 = requests.get(
                        url2,
                        timeout=240,
                        headers={
                            'accept': 'application/json, text/plain, */*',
                            'Content-Type': 'application/json; charset=UTF-8',
                            'user-agent':
                            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36',
                            'ihg-language': 'zh-CN',
                        })
                    page2.encoding = 'utf8'
                    content2 = page2.text

                    page3 = requests.get(url1,
                                         headers={
                                             'x-ihg-api-key':
                                             'se9ym5iAzaW8pxfBjkmgbuGjJcr3Pj6Y'
                                         },
                                         timeout=240)
                    page3.encoding = 'utf8'
                    content3 = page3.text

                    content = (content1, content2, content3)
                elif source == 'accor':
                    proxy_url = "http://10.10.239.46:8087/proxy?source=pricelineFlight&user=crawler&passwd=spidermiaoji2014"
                    r = requests.get(proxy_url)
                    proxies = {'https': "socks5://" + str(r.text)}
                    headers = {
                        "User-Agent":
                        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36"
                    }
                    page = requests.get(url,
                                        headers=headers,
                                        verify=False,
                                        proxies=proxies)
                    page.encoding = 'utf8'
                    content = page.text
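                # Marriott URLs append key=value metadata (coordinates, English hotel name) after '#####' separators; 'longtitude' mirrors the key spelling used in those URLs.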
                elif source == 'marriott':
                    url_list = url.split('#####')
                    url = url_list[0]

                    for i in url_list:
                        if len(i.split('=')) > 1:
                            key, value = i.split('=')[0], i.split('=')[1]
                            if key == 'longtitude':
                                other_info['longtitude'] = value
                            if key == 'latitude':
                                other_info['latitude'] = value
                        else:
                            if url_list.index(i) == 1:
                                other_info['hotel_name_en'] = i

                    url2 = url.replace("travel", "hotel-photos")
                    url3 = url.replace("travel/", "maps/travel/")
                    url4 = url.replace("hotels/", "hotels/fact-sheet/")
                    headers = {
                        'User-Agent':
                        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:47.0) Gecko/20100101 Firefox/47.0'
                    }
                    if "https://www.marriott.com" in url:
                        page1 = requests.get(url, headers=headers, timeout=240)
                        page2 = requests.get(url2,
                                             headers=headers,
                                             timeout=240)
                        page3 = requests.get(url3,
                                             headers=headers,
                                             timeout=240)
                        page4 = requests.get(url4,
                                             headers=headers,
                                             timeout=240)

                        page1.encoding = 'utf8'
                        page2.encoding = 'utf8'
                        page3.encoding = 'utf8'
                        page4.encoding = 'utf8'

                        content1 = page1.text
                        content2 = page2.text
                        content3 = page3.text
                        content4 = page4.text
                        content = (content1, content2, content3, content4)
                    else:
                        url2 = url + "/hotel-overview"
                        page1 = requests.get(url, headers=headers, timeout=240)
                        page2 = requests.get(url2,
                                             headers=headers,
                                             timeout=240)
                        page1.encoding = 'utf8'
                        page2.encoding = 'utf8'
                        content1 = page1.text
                        content2 = page2.text
                        content = (content1, content2)
                else:
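                    # Remaining source is Hilton: derive the detail, map and description page URLs from the landing-page URL.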
                    session.auto_update_host = False
                    hilton_index = url.find('index.html')
                    if hilton_index > -1:
                        url = url[:hilton_index]
                    split_args = url.split('/')
                    detail_url = 'http://www3.hilton.com/zh_CN/hotels/{0}/{1}/popup/hotelDetails.html'.format(
                        split_args[-3], split_args[-2])
                    map_info_url = url + 'maps-directions.html'
                    desc_url = url + 'about.html'

                    page = session.get(url)
                    map_info_page = session.get(map_info_url)
                    desc_page = session.get(desc_url)

                    detail_page = session.get(detail_url)
                    page.encoding = 'utf8'
                    detail_page.encoding = 'utf8'
                    map_info_page.encoding = 'utf8'
                    desc_page.encoding = 'utf8'
                    __content = page.text
                    logger.info(detail_url)
                    __detail_content = detail_page.text
                    __map_info_content = map_info_page.text
                    __desc_content = desc_page.text

                    content = [
                        __content, __detail_content, __map_info_content,
                        __desc_content
                    ]
                logger.debug("[crawl_data][Takes: {}]".format(time.time() -
                                                              start))

                start = time.time()
                result = parse_hotel(content=content,
                                     url=url,
                                     other_info=other_info,
                                     source=source,
                                     part=self.task.task_name,
                                     retry_count=self.task.used_times)
                logger.debug("[parse_hotel][func: {}][Takes: {}]".format(
                    parse_hotel.func_name,
                    time.time() - start))

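        # Persist the parsed result to MongoDB with a GeoJSON point built from map_info; duplicate documents only log a warning.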
        try:
            data_collections = mongo_data_client['ServicePlatform'][
                self.task.task_name]
            data_collections.create_index([('source', 1), ('source_id', 1)],
                                          unique=True,
                                          background=True)
            data_collections.create_index([('location', '2dsphere')],
                                          background=True)
            tmp_result = deepcopy(result.values(backdict=True))
            lon, lat = str(result.map_info).split(',')
            lon, lat = float(lon), float(lat)
            tmp_result.update(
                {'location': {
                    'type': "Point",
                    'coordinates': [lon, lat]
                }})
            data_collections.save(tmp_result)
        except pymongo.errors.DuplicateKeyError:
            # logger.exception("[result already in db]", exc_info=e)
            logger.warning("[result already in db]")
        except Exception as exc:
            raise ServiceStandardError(
                error_code=ServiceStandardError.MONGO_ERROR,
                wrapped_exception=exc)

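        # Write the same result into the service-platform MySQL table named after the current task.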
        start = time.time()
        try:
            service_platform_conn = service_platform_pool.connection()
            cursor = service_platform_conn.cursor()
            others_info = json.loads(result.others_info)
            others_info['hid'] = hid
            result.others_info = json.dumps(others_info)
            sql = result.generation_sql()
            sql = sql.format(table_name=self.task.task_name)
            values = result.values()
            self.logger.info(result.__dict__)
            cursor.execute(sql, values)
            service_platform_conn.commit()
            cursor.close()
            service_platform_conn.close()
        except Exception as e:
            logger.exception(e)
            raise ServiceStandardError(
                error_code=ServiceStandardError.MYSQL_ERROR,
                wrapped_exception=e)

        logger.debug("[Insert DB][Takes: {}]".format(time.time() - start))
        self.task.error_code = 0
        return self.task.error_code
Example #30
    def _execute(self, **kwargs):

        with MySession(need_cache=True, need_proxies=True) as session:
            try:
                keyword = self.task.kwargs['keyword']
                source = self.task.kwargs['source']
                map_info = self.task.kwargs['map_info']
                country_id = self.task.kwargs['country_id']
                city_id = self.task.kwargs['city_id']
                database_name = self.task.kwargs['database_name']
                local_time = urllib.unquote(
                    datetime.datetime.now(
                        pytz.timezone(pytz.country_timezones('cn')[0])
                    ).strftime('%a %b %d %Y %H:%M:%S GMT+0800 (%Z)'))
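                # Each source has its own suggest endpoint and headers; the matching parser is resolved dynamically as get_<source>_suggest.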
                if source == 'agoda':
                    url = source_interface[source].format(keyword, local_time)
                    header = {
                        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
                        'accept-encoding': 'gzip, deflate, br',
                        'accept-language': 'zh-CN,zh;q=0.9',
                        'accept': 'application/json, text/javascript, */*; q=0.01',
                        'referer': 'https://www.agoda.com/zh-cn/',
                        'authority': 'www.agoda.com',
                        'x-requested-with': 'XMLHttpRequest'
                    }
                    response = session.get(url=url, headers=header)
                    get_suggest = getattr(sys.modules[__name__], 'get_{0}_suggest'.format(source))
                elif source == 'daodao':
                    headers = {
                        'referer': 'https://www.tripadvisor.cn/',
                        'x-requested-with': 'XMLHttpRequest',
                        'accept-encoding': 'gzip, deflate, br',
                        'accept': 'text/javascript, text/html, application/xml, text/xml, */*',
                        'accept-language': 'zh-CN,zh;q=0.9',
                        'Origin': 'https://www.tripadvisor.cn',
                        'Host': 'www.tripadvisor.cn'
                    }
                    url = source_interface[source]
                    response = session.post(
                        url=url,
                        headers=headers,
                        data={
                            'action': 'API',
                            'uiOrigin': 'PTPT-dest',
                            'types': 'geo,dest',
                            'hglt': True,
                            'global': True,
                            'legacy_format': True,
                            '_ignoreMinCount': True,
                            'query': keyword
                        }
                    )
                    get_suggest = getattr(sys.modules[__name__], 'get_{0}_suggest'.format(source))
                elif source == 'qyer':
                    headers = {
                        "Referer": "http://www.qyer.com/",
                        "Host": "www.qyer.com",
                    }
                    url = source_interface[source].format(keyword)
                    response = session.get(url, headers=headers)
                    get_suggest = getattr(sys.modules[__name__], 'get_{0}_suggest'.format(source))
                elif source == 'ctrip':
                    headers = {
                        'Accept-Encoding': 'gzip, deflate',
                        'Accept-Language': 'zh-CN,zh;q=0.9',
                        'Referer': 'http://hotels.ctrip.com/international/',
                        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36',
                        'Connection': 'keep-alive'
                    }
                    url = source_interface[source].format(keyword)
                    response = session.get(url, headers=headers)
                    get_suggest = getattr(sys.modules[__name__], 'get_{0}_suggest'.format(source))
                else:
                    url = source_interface[source].format(keyword)
                    response = session.get(url=url)
                    get_suggest = getattr(sys.modules[__name__], 'get_{0}_suggest'.format(source))

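                # The parser receives the raw response plus location/database context and returns the number of suggestions handled.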
                count = get_suggest(response.content, map_info, country_id,
                                    city_id, database_name, keyword)
                if count >= 0:
                    self.task.error_code = 0
            except Exception as e:
                print(e)
                raise ServiceStandardError(ServiceStandardError.REQ_ERROR,
                                           wrapped_exception=e)

        return count