def __init__(self, task, *args, **kwargs):
    """Initialize the crawl-platform task SDK around a single task.

    :type task: Task
    :param task: the task this SDK instance will execute
    :param args: extra positionals, accepted for interface compatibility
    :param kwargs: extra keywords, accepted for interface compatibility
    """
    # Bind the task and stamp it with this SDK's "finished" error code.
    self.task = task
    self.finished_error_code = self.get_task_finished_code()
    self.task.task_finished_code = self.finished_error_code

    # Per-subclass logger; rewrite every handler's formatter so each
    # record carries the task's identity (source / type / task_id).
    self.logger = get_logger(self.__class__.__name__)
    date_format = "%Y-%m-%d %H:%M:%S"
    record_format = (
        "%(asctime)-15s %(threadName)s %(filename)s:%(lineno)d %(levelname)s "
        "[source: {}][type: {}][task_id: {}]: %(message)s".format(
            self.task.source, self.task.type, self.task.task_id)
    )
    task_formatter = logging.Formatter(record_format, date_format)
    for handler in self.logger.handlers:
        handler.setFormatter(task_formatter)
    self.logger.info("[init SDK]")
def __init__(self, worker, source, _type, task_name, routine_key, queue, **kwargs):
    """Initialize a Mongo-backed insert task.

    :param worker: worker identifier that will consume the task
    :param source: data source name
    :param _type: task type string
    :param task_name: human-readable task name
    :param routine_key: RabbitMQ routing key
    :param queue: RabbitMQ queue name
    :param kwargs: optional ``task_type`` (defaults to ``TaskType.NORMAL``)
        and ``priority`` (defaults to 3)
    """
    # Basic task identity.
    self.worker = worker
    self.source = source
    self.type = _type
    self.task_name = task_name
    self.routine_key = routine_key
    self.queue = queue
    self.task_type = kwargs.get('task_type', TaskType.NORMAL)
    self.priority = int(kwargs.get("priority", 3))

    self.logger = get_logger("InsertMongoTask")
    self.tasks = TaskList()
    self.collection_name = self.generate_collection_name()

    # Cursor offset, used to restore the cursor position after a query error.
    self.offset = 0
    # Previous cursor offset, used to restore the cursor when inserting.
    self.pre_offset = 0

    mongo_client = pymongo.MongoClient(
        'mongodb://*****:*****@10.19.2.103:27017/')
    self.db = mongo_client['MongoTask_Zxp']
    # Build every index the collections need, up front.
    self.create_mongo_indexes()

    # CITY tasks iterate over a pre-generated list of dates.
    self.date_list = (
        self.generate_list_date()
        if self.task_type == TaskType.CITY_TASK
        else None
    )

    # Rewrite every handler's formatter so each record carries the task's
    # identity (source / type / task_name / collection_name).
    date_format = "%Y-%m-%d %H:%M:%S"
    record_format = (
        "%(asctime)-15s %(threadName)s %(filename)s:%(lineno)d %(levelname)s "
        "[source: {}][type: {}][task_name: {}][collection_name: {}]: %(message)s".format(
            self.source, self.type, self.task_name, self.collection_name)
    )
    task_formatter = logging.Formatter(record_format, date_format)
    for handler in self.logger.handlers:
        handler.setFormatter(task_formatter)
    self.logger.info("[init InsertTask]")
import pymongo import pymongo.errors import requests from proj.my_lib.logger import get_logger from proj.my_lib.Common.BaseSDK import BaseSDK from proj.my_lib.Common.Browser import MySession from proj.my_lib.ServiceStandardError import ServiceStandardError from proj.my_lib.Common.Task import Task import json config = { 'host': '10.10.213.148', } logger = get_logger('accor_suggest') client = pymongo.MongoClient(**config) db = client['SuggestName'] search_url = "http://book.accorhotels.cn/Intellisense/Search" headers = { # "Cookie": "NSC_10.10.10.244-80=ffffffff090214e145525d5f4f58455e445a4a423660; language=zh-CN", "Content-Type": "application/x-www-form-urlencoded; charset=utf-8", } class AccorCitySDK(BaseSDK): def _execute(self, **kwargs): with MySession(need_proxies=True,
# -*- coding: utf-8 -*- # @Time : 2018/1/25 下午3:40 # @Author : Hou Rong # @Site : # @File : insert_rabbitmq.py # @Software: PyCharm import gevent.monkey gevent.monkey.patch_all() import pika import json import logging from proj.my_lib.logger import get_logger from proj.my_lib.Common.Utils import retry logger = get_logger("insert_rabbitmq") # test HOST = '10.10.189.213' USER = '******' PASSWD = '1220' EXCHANGE = 'GoogleDrive' ROUTINE_KEY = 'GoogleDrive' V_HOST = 'GoogleDrive' QueueName = 'GoogleDrive' logging.getLogger("pika").setLevel(logging.WARNING) logging.getLogger("pika").propagate = False # online # HOST = '10.10.38.166'
import os
import sys

import cachetools.func
# NOTE(review): redis and pymongo are used below but were never imported in
# this module — added so the module can run standalone.
import pymongo
import redis

from send_task import send_hotel_detail_task, send_poi_detail_task, send_qyer_detail_task,\
    send_image_task, send_ctripPoi_detail_task, send_GT_detail_task, send_PoiSource_detail_task, \
    send_result_detail_task, send_result_daodao_filter
from attach_send_task import qyer_supplement_map_info
from proj.my_lib.logger import get_logger
from send_email import send_email, SEND_TO, EMAIL_TITLE
from proj.my_lib.Common.Utils import get_each_task_collection, generate_collection_name
from proj.mysql_pool import service_platform_pool
from toolbox.Hash import get_token
from MongoTaskInsert import InsertTask, TaskType
from rabbitmq_func import detect_msg_num

logger = get_logger('monitor')

# Redis db 9 holds per-task statistics counters.
task_statistics = redis.Redis(host='10.10.180.145', db=9)
client = pymongo.MongoClient('mongodb://*****:*****@10.19.2.103:27017/')
db = client['MongoTask_Zxp']

# Hotel crawl sources.
HOTEL_SOURCE = (
    'agoda', 'booking', 'ctrip', 'elong', 'expedia', 'hotels', 'hoteltravel', 'hrs', 'cheaptickets', 'orbitz',
    'travelocity', 'ebookers', 'tripadvisor', 'ctripcn', 'hilton', 'ihg', 'holiday', 'accor', 'marriott',
    'starwood', 'hyatt', 'gha', 'shangrila', 'fourseasons')
RESULT_SOURCE = ['google', 'daodao']
POI_SOURCE = 'daodao'
QYER_SOURCE = 'qyer'
CTRIPPOI_SOURCE = 'ctripPoi'
# BUG FIX: ('ctripPoi') without a trailing comma is a plain string, so
# membership tests (`x in POI_S`) would do substring matching. The trailing
# comma makes it a genuine one-element tuple with exact-match semantics.
POI_S = ('ctripPoi',)
GT_SOURCE = 'GT'
PRIORITY = 3
# -*- coding:utf-8 -*- import pymongo import pymongo.errors import requests.exceptions from proj.my_lib.logger import get_logger from proj.my_lib.Common.BaseSDK import BaseSDK from proj.my_lib.Common.Browser import MySession from proj.my_lib.ServiceStandardError import ServiceStandardError from proj.my_lib.Common.Task import Task import json config = { 'host': '10.10.213.148', } logger = get_logger('ihg_suggest') client = pymongo.MongoClient(**config) db = client['SuggestName'] headers = { 'referer': 'https://www.ihg.com/hotels/cn/zh/reservation', 'x-requested-with': 'XMLHttpRequest', 'accept-encoding': 'gzip, deflate, br', 'accept': '*/*', 'accept-language': 'zh-CN,zh;q=0.9', } search_url = "https://www.ihg.com/guestapi/v1/ihg/cn/zh/web/suggestions" class IhgCitySDK(BaseSDK):
import mioji.common.pages_store
import mioji.common.pool
import mioji.common.spider
from mioji import spider_factory
from mioji.common.task_info import Task
from mioji.common.utils import simple_get_socks_proxy
from mioji.spider_factory import factory
from proj.list_config import cache_config, list_cache_path, cache_type, none_cache_config
from proj.my_lib.Common.BaseSDK import BaseSDK
from proj.my_lib.ServiceStandardError import ServiceStandardError
from proj.my_lib.logger import get_logger
from proj.mysql_pool import service_platform_pool
from proj.my_lib.Common.Browser import proxy_pool

logger = get_logger("poiDaodao")

# Configure the shared mioji spider runtime for this worker.
mioji.common.spider.NEED_FLIP_LIMIT = False
mioji.common.pool.pool.set_size(2024)
mioji.common.pages_store.cache_dir = list_cache_path
mioji.common.pages_store.STORE_TYPE = cache_type

# One-time initialisation (run once at program start).
insert_db = None
# get_proxy = simple_get_socks_proxy
get_proxy = proxy_pool.get_proxy
debug = True
# NOTE(review): config_spider is passed None for the proxy getter even though
# get_proxy is assigned above — confirm whether get_proxy was intended here.
spider_factory.config_spider(insert_db, None, debug)
# NOTE(review): duplicate assignment — NEED_FLIP_LIMIT was already set above.
mioji.common.spider.NEED_FLIP_LIMIT = False
# NOTE(review): mioji.common.logger is not explicitly imported here; presumably
# imported transitively by the mioji imports above — verify.
mioji.common.logger.logger = logger
@author: feng @date: 18-02-26 ''' import pymongo import pymongo.errors import requests.exceptions from proj.my_lib.logger import get_logger from proj.my_lib.Common.BaseSDK import BaseSDK from proj.my_lib.Common.Browser import MySession from proj.my_lib.ServiceStandardError import ServiceStandardError from proj.my_lib.Common.Task import Task import json from lxml import html import re logger = get_logger('ctripPoi_suggest') client = pymongo.MongoClient('mongodb://*****:*****@10.19.2.103:27017/') db = client['SuggestName'] headers = { "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8", "Accept-Encoding": "gzip, deflate", "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8", "Connection": "keep-alive", "Host": "you.ctrip.com",
#!/usr/bin/env python # -*- coding:utf-8 -*- import pymongo import pymongo.errors import requests.exceptions from proj.my_lib.logger import get_logger from proj.my_lib.Common.BaseSDK import BaseSDK from proj.my_lib.Common.Browser import MySession from proj.my_lib.ServiceStandardError import ServiceStandardError from proj.my_lib.Common.Task import Task import json from lxml import html logger = get_logger('ctrip_suggest') client = pymongo.MongoClient('mongodb://*****:*****@10.19.2.103:27017/') db = client['SuggestName'] headers = { 'Accept': 'application/json, text/javascript, */*; q=0.01', 'Accept-Encoding': 'gzip, deflate', 'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8', 'Cache-Control': 'no-cache', 'Connection': 'keep-alive', 'Host': 'vacations.ctrip.com', 'Upgrade-Insecure-Requests': '1', 'Origin': 'http://vacations.ctrip.com', 'Referer': 'http://vacations.ctrip.com/grouptravel/', 'Content-Type': 'application/json;charset=UTF-8' } search_url = "http://vacations.ctrip.com/tour-mainsite-vacations/api/Category/Infer"
Base = declarative_base()


class HotelRestList(Base):
    """ORM row for the `poi_rest_new` restaurant-list table."""
    __tablename__ = 'poi_rest_new'
    # Composite primary key: a row is unique per (source, source_id, city_id).
    source = Column(String(64), primary_key=True)
    source_id = Column(Integer, primary_key=True)
    city_id = Column(Integer, primary_key=True)
    url = Column(Text, nullable=False)
    name = Column(String(512), nullable=False)
    # Row timestamp; defaults to the time the row object is created.
    utime = Column(DateTime, default=datetime.datetime.now)


from proj.my_lib.logger import get_logger

logger = get_logger("restDaodao")

# Configure the shared mioji spider runtime for this worker.
mioji.common.spider.NEED_FLIP_LIMIT = False
mioji.common.pool.pool.set_size(2024)

# from proj.test_spider import DaodaoViewSpider

# One-time initialisation (run once at program start).
insert_db = None
get_proxy = simple_get_socks_proxy
debug = False
spider_factory.config_spider(insert_db, get_proxy, debug)
# NOTE(review): duplicate assignment — NEED_FLIP_LIMIT was already set above.
mioji.common.spider.NEED_FLIP_LIMIT = False
# logger = get_task_logger(__name__)
mioji.common.logger.logger = logger
# -*- coding: utf-8 -*- # @Time : 2018/1/11 下午6:34 # @Author : Hou Rong # @Site : # @File : VeriFlightSDK.py # @Software: PyCharm import json import pymongo from proj.my_lib.Common.BaseSDK import BaseSDK from proj.my_lib.Common.Browser import MySession from proj.my_lib.ServiceStandardError import ServiceStandardError from proj.my_lib.logger import get_logger from proj.my_lib.Common.Utils import retry from proj.config import MONGO_DATA_URL logger = get_logger("VeriFlightSDK") headers = { "Content-Type": "application/x-www-form-urlencoded; charset=utf-8" } client = pymongo.MongoClient(MONGO_DATA_URL) data_collections = client['Data']['veriflight'] class VeriFlightSDK(BaseSDK): @retry(times=5) def _execute(self, **kwargs): with MySession(need_cache=True, need_proxies=True) as session: iata_code = self.task.kwargs['iata_code'] request_body = { "union": "", "maker": "",
id='monitoring_PoiSource_list') schedule.add_job(city2list, 'cron', second='*/59', id='city2list') schedule.add_job(monitoring_zombies_task_by_hour, 'cron', second='*/59', id='monitoring_zombies_task_by_hour') schedule.add_job(monitoring_zombies_task_total, 'cron', second='*/59', id='monitoring_zombies_task_total') # stream_handler = logging.StreamHandler() # logger = logging.getLogger('rabbitmq_watcher') # logger.addHandler(stream_handler) # logger.setLevel(logging.DEBUG) logger = get_logger("rabbitmq_watcher") ''' 用于管理分发任务的数目 默认为 default 值 key 任务队列名称 val (队列中最少的任务数,单次插入任务数,执行时间间隔) ''' TASK_CONF = { 'default': (0, 0, 10), 'file_downloader': (2000, 3000, 10), 'hotel_detail': (2800, 4000, 10), 'hotel_list': (2800, 4000, 10), 'poi_detail': (36000, 40000, 10), 'poi_list': (36000, 40000, 10), 'supplement_field': (9000, 40000, 10), 'google_api': (9000, 40000, 10), 'merge_task': (10000, 40000, 11),
#!/usr/bin/env python # -*- coding: utf-8 -*- # @Time : 2017/10/6 下午8:28 # @Author : Hou Rong # @Site : # @File : CustomAutoScale.py # @Software: PyCharm import psutil import os import gc from celery.worker.autoscale import Autoscaler from proj.my_lib.logger import get_logger from time import sleep from celery.five import monotonic logger = get_logger('auto scale logger') INIT_POOL_PERCENT = 0.75 class CustomAutoScale(Autoscaler): def _maybe_scale(self, req=None): worker_name = self.worker.hostname memory_obj = psutil.virtual_memory() memory_percent = memory_obj.percent total_load_average = os.getloadavg() load_average = os.getloadavg()[0] load_percent = load_average / 4.0 procs = self.processes calc_percent = memory_percent * load_percent
# @File : RespStore.py # @Software: PyCharm import os import zlib import time import json import pickle import datetime import proj.my_lib.Common.Utils import proj.my_lib.Common.UFileHandler from os import path from proj.my_lib.logger import get_logger # from proj.my_lib.Common.UFileHandler import upload_stream, get_ufile_and_info, delete_ufile, has_file logger = get_logger('RespStore') cache_dir = path.abspath(path.join('/data/nfs/page_saver', 'resp_cache')) STORE_TYPE = 'ufile' def has_dir(): return path.isdir(cache_dir) def has_cache(md5): if STORE_TYPE == 'file': return path.exists(path.join(cache_dir, md5)) elif STORE_TYPE == 'ufile': return proj.my_lib.Common.UFileHandler.has_file("service_platform_{}".format(md5))
#!/usr/bin/env python # -*- coding: utf-8 -*- # @Time : 2017/12/28 下午4:10 # @Author : Hou Rong # @Site : # @File : ProxyPool.py # @Software: PyCharm import time import requests import json import random from collections import defaultdict from proj.my_lib.Common.Utils import retry from proj.my_lib.logger import get_logger logger = get_logger("proxy_pool") PROXY_NUM_RANGE = (70, 150) source_list = [ 'turbojetsail', 'elongHotel', 'ctripHotel', 'tongchengApiHotel', 'expediaHotel', 'bookingHotel', 'HotelsHotel', 'biyiHotel', 'HotelclubHotel', 'venereHotel', 'agodaHotel', 'ebookersHotel', 'ihgHotel', 'marriottHotel', 'amomaHotel', 'hrsHotel', 'HoteltravelHotel', 'accorHotel', 'travelocityHotel', 'orbitzHotel', 'cheapticketsHotel', 'miojiHotel', 'hotwireHotel', 'kempinskiHotel', 'whgHotelsHotel', 'starwoodHotelsHotel', 'hostelworldHotel', 'HotelbedsApiHotel', 'haoqiaoApiHotel', 'innstantApiHotel', 'touricoApiHotel', 'gtaApiHotel', 'daolvApiHotel', 'jacApiHotel', 'mikiApiHotel', 'dotwApiHotel', 'tripadvisorHotel', 'hiltonHotel', 'yundijieHotel', 'elongFlight', 'ryanairFlight', 'ctripFlight', 'jijitongFlight', 'tongchengFlight',
from sqlalchemy.sql import text from proj.my_lib.Common.BaseSDK import BaseSDK from proj.my_lib.Common.Browser import MySession from proj.my_lib.Common.KeyMatch import key_is_legal from proj.my_lib.Common.NetworkUtils import google_get_map_info from proj.my_lib.ServiceStandardError import ServiceStandardError from proj.my_lib.ServiceStandardError import TypeCheckError from proj.my_lib.attr_parser import parse as attr_parser from proj.my_lib.db_localhost import DBSession from proj.my_lib.logger import get_logger from proj.my_lib.new_hotel_parser.data_obj import text_2_sql from proj.my_lib.rest_parser import parse as rest_parser from proj.my_lib.shop_parser import parse as shop_parser logger = get_logger("POIDetail") parser_type = {'attr': attr_parser, 'rest': rest_parser, 'shop': shop_parser} class PoiDetailSDK(BaseSDK): def _execute(self, **kwargs): target_url = self.task.kwargs['target_url'] city_id = self.task.kwargs['city_id'] poi_type = self.task.kwargs['poi_type'] target_url = target_url.replace('.com.hk', '.cn') with MySession(need_cache=True) as session: page = session.get(target_url, timeout=120) page.encoding = 'utf8'
from copy import deepcopy from mioji.spider_factory import factory from mioji.common.task_info import Task from proj.my_lib.Common.BaseSDK import BaseSDK from proj.my_lib.Common.Browser import MySession from proj.my_lib.ServiceStandardError import ServiceStandardError from proj.list_config import cache_config, none_cache_config from proj.my_lib.logger import get_logger from proj.my_lib.new_hotel_parser.hotel_parser import parse_hotel from proj.mysql_pool import service_platform_pool from mongo_pool import mongo_data_client from proj.my_lib.models.HotelModel import CommonHotel import json logger = get_logger("HotelDetailSDK") def hotel_detail_database(url, source, need_cache=True): task = Task() task.content = url spider = factory.get_spider_by_old_source(source + 'DetailHotel') spider.task = task spider.task.source = source if need_cache: error_code = spider.crawl(required=['hotel'], cache_config=cache_config) else: error_code = spider.crawl(required=['hotel'], cache_config=none_cache_config) logger.info(
# coding=utf-8 import sys import traceback reload(sys) sys.setdefaultencoding('utf8') from proj.celery import app from proj.my_lib.new_hotel_parser.hotel_parser import parse_hotel from proj.my_lib.BaseTask import BaseTask from proj.my_lib.PageSaver import get_page_content from my_lib.new_hotel_parser.data_obj import DBSession from proj.my_lib.logger import get_logger logger = get_logger("HotelTripadvisor") @app.task(bind=True, base=BaseTask, max_retries=2, rate_limit='5/s') def hotel_static_base_data(self, parent_task_id, task_name, source, source_id, city_id, hotel_url, **kwargs): logger.info("parent task id: {0}, start task".format(parent_task_id)) self.task_source = source.title() self.task_type = 'HotelStaticDataParse' # 获取保存的页面信息 other_info = {'source_id': source_id, 'city_id': city_id} logger.info( 'http://10.10.180.145:8888/hotel_page_viewer?task_name=hotel_base_data_tripadvisor_total_new&id=' + parent_task_id) content = get_page_content(task_id=parent_task_id, task_name=task_name)
# @Author : Hou Rong # @Site : # @File : hotel_routine_tasks.py # @Software: PyCharm # coding=utf-8 import re from my_lib.new_hotel_parser.data_obj import DBSession from proj.celery import app from proj.my_lib.new_hotel_parser.hotel_parser import parse_hotel from proj.my_lib.ServiceStandardError import TypeCheckError from proj.my_lib.BaseRoutineTask import BaseRoutineTask from proj.my_lib.PageSaver import save_task_and_page_content from proj.my_lib.Common.Browser import MySession from proj.my_lib.logger import get_logger logger = get_logger("HotelDetail") @app.task(bind=True, base=BaseRoutineTask, max_retries=2, rate_limit='6/s') def hotel_routine_base_data(self, source, url, other_info, **kwargs): self.task_source = source.title() self.task_type = 'Hotel' self.error_code = 0 # 初始化任务 try: # hotels if source == 'hotels': hotel_id = re.findall('hotel-id=(\d+)', url)[0] url = 'http://zh.hotels.com/hotel/details.html?hotel-id=' + hotel_id
import pymongo
import pymongo.errors
import requests.exceptions
from proj.my_lib.logger import get_logger
from proj.my_lib.Common.BaseSDK import BaseSDK
from proj.my_lib.Common.Browser import MySession
from proj.my_lib.ServiceStandardError import ServiceStandardError
from proj.my_lib.Common.Task import Task
import json

# Mongo host that stores suggestion lookups.
config = {
    'host': '10.10.213.148',
}

logger = get_logger('daodao_suggest')
client = pymongo.MongoClient(**config)
db = client['SuggestName']

# XHR-style headers for www.tripadvisor.cn (daodao).
headers = {
    'referer': 'https://www.tripadvisor.cn/',
    'x-requested-with': 'XMLHttpRequest',
    'accept-encoding': 'gzip, deflate, br',
    'accept': 'text/javascript, text/html, application/xml, text/xml, */*',
    'accept-language': 'zh-CN,zh;q=0.9',
    'Origin': 'https://www.tripadvisor.cn',
    'Host': 'www.tripadvisor.cn'
}

# TripAdvisor type-ahead (autocomplete) endpoint.
search_url = "https://www.tripadvisor.cn/TypeAheadJson"
# -*- coding:utf-8 -*- import pymongo import pymongo.errors import requests.exceptions from proj.my_lib.logger import get_logger from proj.my_lib.Common.BaseSDK import BaseSDK from proj.my_lib.Common.Browser import MySession from proj.my_lib.ServiceStandardError import ServiceStandardError from proj.my_lib.Common.Task import Task import json from lxml import html config = { 'host': '10.10.213.148', } logger = get_logger('marriott_suggest') client = pymongo.MongoClient(**config) db = client['SuggestName'] headers = { 'referer': 'http://www.marriott.com.cn/default.mi', 'x-requested-with': 'XMLHttpRequest', 'accept-encoding': 'gzip, deflate, br', 'accept': '*/*', 'accept-language': 'zh-CN,zh;q=0.9', } search_url = "http://www.marriott.com.cn/search/autoComplete.mi" class MarriottCitySDK(BaseSDK):
from proj.my_lib.Common.KeyMatch import key_is_legal from proj.my_lib.Common.NetworkUtils import google_get_map_info from proj.my_lib.ServiceStandardError import ServiceStandardError from proj.my_lib.ServiceStandardError import TypeCheckError from proj.my_lib.logger import get_logger from proj.my_lib.my_qyer_parser.data_obj import DBSession from proj.my_lib.my_qyer_parser.my_parser import page_parser from proj.my_lib.new_hotel_parser.data_obj import text_2_sql from proj.my_lib.Common.Utils import retry from lxml import html import json import pymongo from urlparse import urljoin mongo_config = {'host': '10.10.213.148'} logger = get_logger("QyerPoiCity") search_url = 'https://www.baidu.com/s' headers = { 'Host': 'www.baidu.com', 'is_referer': 'https://www.baidu.com/', 'is_xhr': '1', 'Referer': 'https://www.baidu.com/', } class BaiDuSearchSDK(BaseSDK): @retry(times=5) def _execute(self, **kwargs): with MySession(need_cache=True, need_proxies=True) as session:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time    : 2017/10/31 7:34 PM
# @Author  : Hou Rong
# @Site    :
# @File    : img_hash.py
# @Software: PyCharm
import imagehash
from PIL import Image
from proj.my_lib.logger import get_logger, func_time_logger

logger = get_logger("img_hash")


@func_time_logger
def _img_p_hash(f_obj):
    """Compute the perceptual hash (pHash) of an image file object.

    :param f_obj: seekable binary file-like object containing image data
    :return: an ``imagehash.ImageHash`` on success, ``None`` if the image
        cannot be opened or the hash cannot be computed
    """
    f_obj.seek(0)
    try:
        try:
            img_obj = Image.open(f_obj)
        except Exception as exc:
            logger.exception(msg="[error img]", exc_info=exc)
            return None
        try:
            return imagehash.phash(img_obj)
        except Exception as exc:
            logger.exception(msg="[could not calculate phash]", exc_info=exc)
            return None
    finally:
        # BUG FIX: always rewind so callers can re-read the stream. The
        # original only rewound on the success path, leaving the file
        # position undefined after a failed open/hash.
        f_obj.seek(0)
from .my_lib.hotel_comment.expedia import parser as expedia_comment_parser from .my_lib.hotel_comment.venere import parser as venere_comment_parser from .my_lib.is_complete_scale_ok import is_complete_scale_ok from .my_lib.rest_parser import insert_db as rest_insert_db from .my_lib.rest_parser import parse as rest_parser from .my_lib.shop_parser import insert_db as shop_insert_db from .my_lib.shop_parser import parse as shop_parser from .my_lib.task_module.task_func import get_task_id, update_task, insert_task from .my_lib.tp_comment_parser import parse, long_comment_parse, insert_db platforms.C_FORCE_ROOT = True _rate_limit_dict = get_rate_limit() from proj.my_lib.logger import get_logger logger = get_logger('ImgList') @app.task def add_task(): for i in range(10): add.delay(random.randint(1, 10), random.randint(1, 10)) @app.task def add_image_url(): url_list_file = ['img_url_1101', 'img_url_1103', 'img_url_test'] count = 0 for file_name in url_list_file: path = '/search/image/' + file_name + '_celery' if os.path.exists(path):
from sqlalchemy.sql import text from proj.my_lib.Common.BaseSDK import BaseSDK from proj.my_lib.Common.Browser import MySession from proj.my_lib.Common.KeyMatch import key_is_legal from proj.my_lib.Common.NetworkUtils import google_get_map_info from proj.my_lib.ServiceStandardError import ServiceStandardError from proj.my_lib.ServiceStandardError import TypeCheckError from proj.my_lib.logger import get_logger from proj.my_lib.my_qyer_parser.data_obj import DBSession from proj.my_lib.my_qyer_parser.my_parser import page_parser from proj.my_lib.new_hotel_parser.data_obj import text_2_sql from proj.my_lib.Common.Utils import retry logger = get_logger("QyerPoiDetail") class QyerDetailSDK(BaseSDK): @retry(times=3) def _execute(self, **kwargs): with MySession(need_cache=True, need_proxies=True) as session: city_id = self.task.kwargs['city_id'] target_url = self.task.kwargs['target_url'] headers = {'Host': 'place.qyer.com'} page = session.get(target_url, headers=headers, timeout=240) page.encoding = 'utf8' content = page.text if '请输入验证码' in content: raise Exception("请输入验证码")
# -*- coding: utf-8 -*- # @Time : 2017/10/9 下午2:06 # @Author : Hou Rong # @Site : ${SITE} # @File : NetworkUtils.py # @Software: PyCharm import json from urllib import quote import pymongo import proj.my_lib.Common.Browser from proj.my_lib.Common.Utils import Coordinate from proj.my_lib.logger import get_logger from proj.my_lib.Common.Utils import retry logger = get_logger("google_map_info_logger") client = pymongo.MongoClient('10.19.2.103:27017', 27017, username='******', password='******') db = client['Google_city'] @retry(times=4, raise_exc=False) def google_get_map_info(temp, address): logger.info('google_get_map_info +++ {1}'.format(address)) result = None with proj.my_lib.Common.Browser.MySession(need_cache=True) as session: page = session.get( 'https://maps.googleapis.com/maps/api/geocode/json?address=' +
#!/usr/bin/env python
# encoding: utf-8
import pymysql
from pymysql.cursors import SSDictCursor, SSCursor
from proj.my_lib.logger import get_logger

logger = get_logger("data_source")


class MysqlSource:
    """Iterable data source over a MySQL table or ad-hoc query.

    Uses pymysql server-side (streaming) cursors so large result sets are
    fetched in chunks of ``size`` rows instead of being loaded at once.
    """

    def __init__(self, db_config, table_or_query='', size=500, is_table=True, is_dict_cursor=False):
        """
        :param db_config: pymysql connection kwargs (host, user, ...);
            copied internally, the caller's dict is never mutated
        :param table_or_query: table name when ``is_table`` is True,
            otherwise a full SQL query string
        :param size: rows fetched per chunk by the streaming cursor
        :param is_table: treat ``table_or_query`` as a table name and build
            ``select * from <table>``
        :param is_dict_cursor: yield rows as dicts (SSDictCursor) instead
            of tuples (SSCursor)
        """
        # BUG FIX: copy the config before injecting 'cursorclass' — the
        # original wrote into the caller's dict in place, silently changing
        # the cursor class for every other user of that config.
        self._db_config = dict(db_config)
        if is_dict_cursor:
            self._db_config['cursorclass'] = SSDictCursor
        else:
            self._db_config['cursorclass'] = SSCursor
        self._size = size
        self._table = table_or_query
        if is_table:
            self._sql = 'select * from {0}'.format(self._table)
        else:
            self._sql = table_or_query

    def __iter__(self):
        # cursor_gen streams rows from a fresh connection in self._size chunks.
        return cursor_gen(pymysql.Connect(**self._db_config), self._sql, self._size)
# @File : download_img.py # @Software: PyCharm import gevent.monkey gevent.monkey.patch_all() import gevent.pool import pymysql import time import os from proj.my_lib.ks_upload_file_stream import download from proj.mysql_pool import service_platform_pool, spider_data_poi_pool from proj.my_lib.logger import get_logger, func_time_logger pool = gevent.pool.Pool(size=200) logger = get_logger("pic_detect_download") PARENT_PATH = "/data/image/formatted_image" SCAN_FILTER = 6000 EACH_TIMES_PER_TASK = 1000 MAX_PIC_PER_VIEW = 10000 update_data_list = [] task_data_list = [] is_new_task = True @func_time_logger def insert_all_data(): global update_data_list
import zlib
import requests
import time
import datetime
import logging
import httplib
import functools
import proj.my_lib.Common.Utils
from ucloud.ufile import putufile, downloadufile, postufile, deleteufile
from ucloud.compact import BytesIO
from proj.my_lib.logger import get_logger
from ucloud.ufile import config

# Quiet the UCloud SDK's own logger.
u_logger = logging.getLogger("UCLOUD")
u_logger.setLevel(logging.ERROR)

logger = get_logger("ufile_uploader")

# Pick the UFile endpoint suffix based on which datacenter subnet this host
# lives in; otherwise fall back to the SDK default (public endpoint).
local_ip = proj.my_lib.Common.Utils.get_local_ip()
if local_ip.startswith('10.10'):
    config.set_default(uploadsuffix='.ufile.cn-north-03.ucloud.cn')
    config.set_default(downloadsuffix='.ufile.cn-north-03.ucloud.cn')
elif local_ip.startswith('10.19'):
    config.set_default(uploadsuffix='.ufile.cn-north-04.ucloud.cn')
    config.set_default(downloadsuffix='.ufile.cn-north-04.ucloud.cn')
else:
    logger.debug("[no ucloud machine][use public ip]")

config.set_default(connection_timeout=60)

# NOTE(review): hardcoded credentials checked into source — should be moved
# to configuration / environment variables.
# public_key = 'vCuKhG6UcHvB1UswK/q5zjMMHxt7PjKIu3/6q1JtxiZHtAuv'
# private_key = 'fdfbdf9cb0ebfeed522f664efc44f752694b15f6'
public_key = 'M7jIsudUE4Nvn6zQGjNMWxReCrSpc8HcWdBztizB38qvbXkS'
import datetime
import hashlib
import sys
import traceback
from collections import defaultdict
from warnings import filterwarnings

import mock
# NOTE(review): pymongo, redis, datetime and traceback are used below but
# were never imported in this chunk — added so the module can run standalone.
import pymongo
import pymysql
import redis
from pymongo.errors import DuplicateKeyError

import proj.my_lib.my_mongo_insert
from MongoTaskInsert import InsertTask, TaskType
from proj.my_lib.logger import get_logger
from send_email import send_email, SEND_TO, EMAIL_TITLE

# Suppress pymysql-level warnings (they would otherwise surface on inserts).
filterwarnings('ignore', category=pymysql.err.Warning)

logger = get_logger("send_task")

client = pymongo.MongoClient(host='10.10.231.105')
collections = client['MongoTask']['Task']
# Redis db 9 holds md5 de-duplication keys.
redis_md5 = redis.Redis(host='10.10.114.35', db=9)


def hourong_patch(data):
    """Bulk-insert documents into MongoTask.Task, tolerating duplicates.

    Temporarily patches ``pymongo.collection.Collection._insert`` with the
    project's custom implementation so the insert runs with
    ``continue_on_error=True`` semantics.

    :param data: documents to insert
    :return: number of inserted documents (``result['n']``), or ``None``
        when a DuplicateKeyError escapes (an alert e-mail is sent instead)
    """
    try:
        with mock.patch('pymongo.collection.Collection._insert',
                        proj.my_lib.my_mongo_insert.Collection._insert):
            result = collections.insert(data, continue_on_error=True)
            return result['n']
    except DuplicateKeyError as e:
        # Best-effort alert; the caller sees None.
        send_email(EMAIL_TITLE,
                   '%s %s \n %s' % (sys._getframe().f_code.co_name,
                                    datetime.datetime.now(),
                                    traceback.format_exc(e)),
                   SEND_TO)