コード例 #1
0
ファイル: BaseSDK.py プロジェクト: 20113261/platform_service
    def __init__(self, task, *args, **kwargs):
        """Initialize the crawling-platform task SDK.

        :type task: Task
        :param args: unused positional arguments
        :param kwargs: unused keyword arguments
        """
        # Keep a reference to the task being processed.
        self.task = task

        # Resolve the "finished" error code and attach it to the task.
        finished_code = self.get_task_finished_code()
        self.finished_error_code = finished_code
        self.task.task_finished_code = finished_code

        self.logger = get_logger(self.__class__.__name__)

        # Rebuild every handler's formatter so each log line carries the
        # task's source / type / id context.
        log_format = ("%(asctime)-15s %(threadName)s %(filename)s:%(lineno)d %(levelname)s "
                      "[source: {}][type: {}][task_id: {}]:        %(message)s").format(
            self.task.source, self.task.type, self.task.task_id)
        formatter = logging.Formatter(log_format, "%Y-%m-%d %H:%M:%S")
        for handler in self.logger.handlers:
            handler.setFormatter(formatter)

        self.logger.info("[init SDK]")
コード例 #2
0
    def __init__(self, worker, source, _type, task_name, routine_key, queue,
                 **kwargs):
        """Initialize a Mongo task-insertion job.

        NOTE(review): kwargs may carry 'task_type' (default TaskType.NORMAL)
        and 'priority' (default 3) — verify against callers.
        """
        # Basic task information.
        self.worker = worker
        self.source = source
        self.type = _type
        self.task_name = task_name
        self.routine_key = routine_key
        self.queue = queue
        self.task_type = kwargs.get('task_type', TaskType.NORMAL)

        self.priority = int(kwargs.get("priority", 3))
        self.logger = get_logger("InsertMongoTask")
        self.tasks = TaskList()

        self.collection_name = self.generate_collection_name()

        # Data cursor offset: used to restore the cursor position when a
        # query raises an exception.
        self.offset = 0
        # Previous cursor offset: used to restore the cursor position when
        # writing into the database.
        self.pre_offset = 0

        # Credentials are masked ('*****') in this snippet.
        client = pymongo.MongoClient(
            'mongodb://*****:*****@10.19.2.103:27017/')
        self.db = client['MongoTask_Zxp']

        # Create every index the task collections need.
        self.create_mongo_indexes()

        # CITY tasks additionally need a date_list.
        if self.task_type == TaskType.CITY_TASK:
            self.date_list = self.generate_list_date()
        else:
            self.date_list = None

        # Modify each handler's formatter so log lines carry
        # source / type / task_name / collection_name context.
        datefmt = "%Y-%m-%d %H:%M:%S"
        file_log_format = "%(asctime)-15s %(threadName)s %(filename)s:%(lineno)d %(levelname)s " \
                          "[source: {}][type: {}][task_name: {}][collection_name: {}]:        %(message)s".format(
            self.source, self.type, self.task_name, self.collection_name)
        formtter = logging.Formatter(file_log_format, datefmt)  # NOTE(review): 'formtter' typo kept as-is

        for each_handler in self.logger.handlers:
            each_handler.setFormatter(formtter)
        self.logger.info("[init InsertTask]")
コード例 #3
0
import pymongo
import pymongo.errors
import requests
from proj.my_lib.logger import get_logger
from proj.my_lib.Common.BaseSDK import BaseSDK
from proj.my_lib.Common.Browser import MySession
from proj.my_lib.ServiceStandardError import ServiceStandardError
from proj.my_lib.Common.Task import Task
import json

# Mongo connection settings for the suggestion store.
config = {
    'host': '10.10.213.148',
}

logger = get_logger('accor_suggest')

client = pymongo.MongoClient(**config)
db = client['SuggestName']

# Accor intellisense search endpoint used by the SDK below.
search_url = "http://book.accorhotels.cn/Intellisense/Search"

headers = {
    # "Cookie": "NSC_10.10.10.244-80=ffffffff090214e145525d5f4f58455e445a4a423660; language=zh-CN",
    "Content-Type": "application/x-www-form-urlencoded; charset=utf-8",
}


class AccorCitySDK(BaseSDK):
    def _execute(self, **kwargs):
        with MySession(need_proxies=True,
コード例 #4
0
# -*- coding: utf-8 -*-
# @Time    : 2018/1/25 下午3:40
# @Author  : Hou Rong
# @Site    :
# @File    : insert_rabbitmq.py
# @Software: PyCharm
import gevent.monkey

# Monkey-patch before importing pika so its sockets become cooperative.
gevent.monkey.patch_all()
import pika
import json
import logging
from proj.my_lib.logger import get_logger
from proj.my_lib.Common.Utils import retry

logger = get_logger("insert_rabbitmq")

# Test-environment RabbitMQ settings (user is masked in this snippet).
HOST = '10.10.189.213'
USER = '******'
PASSWD = '1220'
EXCHANGE = 'GoogleDrive'
ROUTINE_KEY = 'GoogleDrive'
V_HOST = 'GoogleDrive'
QueueName = 'GoogleDrive'

# Silence pika's own logging.
logging.getLogger("pika").setLevel(logging.WARNING)
logging.getLogger("pika").propagate = False

# Online (production) settings, kept for reference.
# HOST = '10.10.38.166'
コード例 #5
0
ファイル: monitor.py プロジェクト: 20113261/platform_service
import os
import sys
import cachetools.func
from send_task import send_hotel_detail_task, send_poi_detail_task, send_qyer_detail_task,\
    send_image_task, send_ctripPoi_detail_task, send_GT_detail_task, send_PoiSource_detail_task, \
    send_result_detail_task, send_result_daodao_filter
from attach_send_task import qyer_supplement_map_info
from proj.my_lib.logger import get_logger
from send_email import send_email, SEND_TO, EMAIL_TITLE
from proj.my_lib.Common.Utils import get_each_task_collection, generate_collection_name
from proj.mysql_pool import service_platform_pool
from toolbox.Hash import get_token
from MongoTaskInsert import InsertTask, TaskType
from rabbitmq_func import detect_msg_num

logger = get_logger('monitor')

# NOTE(review): 'redis' and 'pymongo' are not imported in the visible part of
# this file — verify they are imported above this snippet.
task_statistics = redis.Redis(host='10.10.180.145', db=9)
client = pymongo.MongoClient('mongodb://*****:*****@10.19.2.103:27017/')  # credentials masked
db = client['MongoTask_Zxp']
# Source-name constants used to dispatch monitoring tasks.
HOTEL_SOURCE = (
    'agoda', 'booking', 'ctrip', 'elong', 'expedia', 'hotels', 'hoteltravel', 'hrs', 'cheaptickets', 'orbitz',
    'travelocity', 'ebookers', 'tripadvisor', 'ctripcn', 'hilton', 'ihg', 'holiday', 'accor', 'marriott', 'starwood',
    'hyatt', 'gha', 'shangrila', 'fourseasons')
RESULT_SOURCE = ['google', 'daodao']
POI_SOURCE = 'daodao'
QYER_SOURCE = 'qyer'
CTRIPPOI_SOURCE = 'ctripPoi'
# Fixed: ('ctripPoi') without a trailing comma is just a parenthesized string,
# so membership tests matched substrings (e.g. 'ctrip' in POI_S was True).
POI_S = ('ctripPoi',)
GT_SOURCE = 'GT'
PRIORITY = 3
コード例 #6
0
# -*- coding:utf-8 -*-
import pymongo
import pymongo.errors
import requests.exceptions
from proj.my_lib.logger import get_logger
from proj.my_lib.Common.BaseSDK import BaseSDK
from proj.my_lib.Common.Browser import MySession
from proj.my_lib.ServiceStandardError import ServiceStandardError
from proj.my_lib.Common.Task import Task
import json

# Mongo connection settings for the suggestion store.
config = {
    'host': '10.10.213.148',
}

logger = get_logger('ihg_suggest')

client = pymongo.MongoClient(**config)
db = client['SuggestName']

# Browser-like headers expected by the IHG suggestion endpoint.
headers = {
    'referer': 'https://www.ihg.com/hotels/cn/zh/reservation',
    'x-requested-with': 'XMLHttpRequest',
    'accept-encoding': 'gzip, deflate, br',
    'accept': '*/*',
    'accept-language': 'zh-CN,zh;q=0.9',
}
search_url = "https://www.ihg.com/guestapi/v1/ihg/cn/zh/web/suggestions"


class IhgCitySDK(BaseSDK):
コード例 #7
0
import mioji.common.pages_store
import mioji.common.pool
import mioji.common.spider
from mioji import spider_factory
from mioji.common.task_info import Task
from mioji.common.utils import simple_get_socks_proxy
from mioji.spider_factory import factory

from proj.list_config import cache_config, list_cache_path, cache_type, none_cache_config
from proj.my_lib.Common.BaseSDK import BaseSDK
from proj.my_lib.ServiceStandardError import ServiceStandardError
from proj.my_lib.logger import get_logger
from proj.mysql_pool import service_platform_pool
from proj.my_lib.Common.Browser import proxy_pool

logger = get_logger("poiDaodao")

mioji.common.spider.NEED_FLIP_LIMIT = False
mioji.common.pool.pool.set_size(2024)
mioji.common.pages_store.cache_dir = list_cache_path
mioji.common.pages_store.STORE_TYPE = cache_type

# One-time initialization (run once at program start).
insert_db = None
# get_proxy = simple_get_socks_proxy
get_proxy = proxy_pool.get_proxy
debug = True
# NOTE(review): config_spider is passed None for the proxy getter even though
# get_proxy is assigned above — confirm whether get_proxy was meant here.
spider_factory.config_spider(insert_db, None, debug)
# NOTE(review): NEED_FLIP_LIMIT is set to False a second time (redundant).
mioji.common.spider.NEED_FLIP_LIMIT = False

# NOTE(review): mioji.common.logger is not explicitly imported here; it is
# presumably available via the mioji.common submodule imports above — verify.
mioji.common.logger.logger = logger
コード例 #8
0
@author: feng
@date: 18-02-26
'''
import pymongo
import pymongo.errors
import requests.exceptions
from proj.my_lib.logger import get_logger
from proj.my_lib.Common.BaseSDK import BaseSDK
from proj.my_lib.Common.Browser import MySession
from proj.my_lib.ServiceStandardError import ServiceStandardError
from proj.my_lib.Common.Task import Task
import json
from lxml import html
import re

logger = get_logger('ctripPoi_suggest')

# Credentials are masked ('*****') in this snippet.
client = pymongo.MongoClient('mongodb://*****:*****@10.19.2.103:27017/')
db = client['SuggestName']

headers = {
    "Accept":
    "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
    "Accept-Encoding":
    "gzip, deflate",
    "Accept-Language":
    "zh-CN,zh;q=0.9,en;q=0.8",
    "Connection":
    "keep-alive",
    "Host":
    "you.ctrip.com",
コード例 #9
0
#!/usr/bin/env python
# -*- coding:utf-8 -*-
import pymongo
import pymongo.errors
import requests.exceptions
from proj.my_lib.logger import get_logger
from proj.my_lib.Common.BaseSDK import BaseSDK
from proj.my_lib.Common.Browser import MySession
from proj.my_lib.ServiceStandardError import ServiceStandardError
from proj.my_lib.Common.Task import Task
import json
from lxml import html

logger = get_logger('ctrip_suggest')

# Credentials are masked ('*****') in this snippet.
client = pymongo.MongoClient('mongodb://*****:*****@10.19.2.103:27017/')
db = client['SuggestName']

# Browser-like headers expected by the Ctrip vacations endpoint.
headers = {
    'Accept': 'application/json, text/javascript, */*; q=0.01',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
    'Cache-Control': 'no-cache',
    'Connection': 'keep-alive',
    'Host': 'vacations.ctrip.com',
    'Upgrade-Insecure-Requests': '1',
    'Origin': 'http://vacations.ctrip.com',
    'Referer': 'http://vacations.ctrip.com/grouptravel/',
    'Content-Type': 'application/json;charset=UTF-8'
}
search_url = "http://vacations.ctrip.com/tour-mainsite-vacations/api/Category/Infer"
コード例 #10
0
# NOTE(review): declarative_base / Column / String / Integer / Text / DateTime
# come from SQLAlchemy imports above this snippet (not visible here) — verify.
Base = declarative_base()


class HotelRestList(Base):
    """ORM model for the 'poi_rest_new' table (restaurant POI list entries)."""
    __tablename__ = 'poi_rest_new'
    source = Column(String(64), primary_key=True)       # crawl source name, part of composite PK
    source_id = Column(Integer, primary_key=True)       # source-side id, part of composite PK
    city_id = Column(Integer, primary_key=True)         # internal city id, part of composite PK
    url = Column(Text, nullable=False)                  # detail-page URL
    name = Column(String(512), nullable=False)          # restaurant name
    utime = Column(DateTime, default=datetime.datetime.now)  # last-update timestamp


from proj.my_lib.logger import get_logger

logger = get_logger("restDaodao")

mioji.common.spider.NEED_FLIP_LIMIT = False
mioji.common.pool.pool.set_size(2024)

# from proj.test_spider import DaodaoViewSpider

# One-time initialization (run once at program start).
insert_db = None
get_proxy = simple_get_socks_proxy
debug = False
spider_factory.config_spider(insert_db, get_proxy, debug)
# NOTE(review): NEED_FLIP_LIMIT is set to False a second time (redundant).
mioji.common.spider.NEED_FLIP_LIMIT = False

# logger = get_task_logger(__name__)
# Route mioji's internal logging through this module's logger.
mioji.common.logger.logger = logger
コード例 #11
0
# -*- coding: utf-8 -*-
# @Time    : 2018/1/11 下午6:34
# @Author  : Hou Rong
# @Site    : 
# @File    : VeriFlightSDK.py
# @Software: PyCharm
import json
import pymongo
from proj.my_lib.Common.BaseSDK import BaseSDK
from proj.my_lib.Common.Browser import MySession
from proj.my_lib.ServiceStandardError import ServiceStandardError
from proj.my_lib.logger import get_logger
from proj.my_lib.Common.Utils import retry
from proj.config import MONGO_DATA_URL

logger = get_logger("VeriFlightSDK")
# Form-encoded request headers for the VeriFlight endpoint.
headers = {
    "Content-Type": "application/x-www-form-urlencoded; charset=utf-8"
}
client = pymongo.MongoClient(MONGO_DATA_URL)
data_collections = client['Data']['veriflight']


class VeriFlightSDK(BaseSDK):
    @retry(times=5)
    def _execute(self, **kwargs):
        with MySession(need_cache=True, need_proxies=True) as session:
            iata_code = self.task.kwargs['iata_code']
            request_body = {
                "union": "",
                "maker": "",
コード例 #12
0
                 id='monitoring_PoiSource_list')
# Cron-style monitoring jobs; second='*/59' fires roughly once per minute.
schedule.add_job(city2list, 'cron', second='*/59', id='city2list')
schedule.add_job(monitoring_zombies_task_by_hour,
                 'cron',
                 second='*/59',
                 id='monitoring_zombies_task_by_hour')
schedule.add_job(monitoring_zombies_task_total,
                 'cron',
                 second='*/59',
                 id='monitoring_zombies_task_total')

# stream_handler = logging.StreamHandler()
# logger = logging.getLogger('rabbitmq_watcher')
# logger.addHandler(stream_handler)
# logger.setLevel(logging.DEBUG)
logger = get_logger("rabbitmq_watcher")
# The string below documents TASK_CONF (translation): it manages how many
# tasks are dispatched; key = task queue name, value = (minimum number of
# tasks left in the queue, tasks inserted per batch, run interval).
'''
用于管理分发任务的数目
默认为 default 值
key 任务队列名称 val (队列中最少的任务数,单次插入任务数,执行时间间隔)
'''
TASK_CONF = {
    'default': (0, 0, 10),
    'file_downloader': (2000, 3000, 10),
    'hotel_detail': (2800, 4000, 10),
    'hotel_list': (2800, 4000, 10),
    'poi_detail': (36000, 40000, 10),
    'poi_list': (36000, 40000, 10),
    'supplement_field': (9000, 40000, 10),
    'google_api': (9000, 40000, 10),
    'merge_task': (10000, 40000, 11),
コード例 #13
0
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time    : 2017/10/6 下午8:28
# @Author  : Hou Rong
# @Site    :
# @File    : CustomAutoScale.py
# @Software: PyCharm
import psutil
import os
import gc
from celery.worker.autoscale import Autoscaler
from proj.my_lib.logger import get_logger
from time import sleep
from celery.five import monotonic

logger = get_logger('auto scale logger')

# Fraction of the configured pool size used as the initial worker count.
INIT_POOL_PERCENT = 0.75


class CustomAutoScale(Autoscaler):
    def _maybe_scale(self, req=None):
        worker_name = self.worker.hostname
        memory_obj = psutil.virtual_memory()
        memory_percent = memory_obj.percent
        total_load_average = os.getloadavg()
        load_average = os.getloadavg()[0]
        load_percent = load_average / 4.0
        procs = self.processes

        calc_percent = memory_percent * load_percent
コード例 #14
0
# @File    : RespStore.py
# @Software: PyCharm
import os
import zlib
import time
import json
import pickle
import datetime
import proj.my_lib.Common.Utils
import proj.my_lib.Common.UFileHandler
from os import path
from proj.my_lib.logger import get_logger

# from proj.my_lib.Common.UFileHandler import upload_stream, get_ufile_and_info, delete_ufile, has_file

logger = get_logger('RespStore')

# On-disk cache directory used when STORE_TYPE == 'file'.
cache_dir = path.abspath(path.join('/data/nfs/page_saver', 'resp_cache'))

# Storage backend selector: 'file' (local disk) or 'ufile' (UCloud object store).
STORE_TYPE = 'ufile'


def has_dir():
    """Return True when the local response-cache directory exists."""
    cache_exists = path.isdir(cache_dir)
    return cache_exists


def has_cache(md5):
    """Return whether a cached response exists for *md5*.

    Checks local disk when STORE_TYPE == 'file', or the UFile object store
    when STORE_TYPE == 'ufile'.
    NOTE(review): for any other STORE_TYPE this falls through and returns
    None implicitly — confirm that is intended (snippet may be truncated).
    """
    if STORE_TYPE == 'file':
        return path.exists(path.join(cache_dir, md5))
    elif STORE_TYPE == 'ufile':
        return proj.my_lib.Common.UFileHandler.has_file("service_platform_{}".format(md5))
コード例 #15
0
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time    : 2017/12/28 下午4:10
# @Author  : Hou Rong
# @Site    :
# @File    : ProxyPool.py
# @Software: PyCharm
import time
import requests
import json
import random
from collections import defaultdict
from proj.my_lib.Common.Utils import retry
from proj.my_lib.logger import get_logger

logger = get_logger("proxy_pool")

# (min, max) bounds on the number of proxies kept in the pool.
PROXY_NUM_RANGE = (70, 150)

source_list = [
    'turbojetsail', 'elongHotel', 'ctripHotel', 'tongchengApiHotel',
    'expediaHotel', 'bookingHotel', 'HotelsHotel', 'biyiHotel',
    'HotelclubHotel', 'venereHotel', 'agodaHotel', 'ebookersHotel', 'ihgHotel',
    'marriottHotel', 'amomaHotel', 'hrsHotel', 'HoteltravelHotel',
    'accorHotel', 'travelocityHotel', 'orbitzHotel', 'cheapticketsHotel',
    'miojiHotel', 'hotwireHotel', 'kempinskiHotel', 'whgHotelsHotel',
    'starwoodHotelsHotel', 'hostelworldHotel', 'HotelbedsApiHotel',
    'haoqiaoApiHotel', 'innstantApiHotel', 'touricoApiHotel', 'gtaApiHotel',
    'daolvApiHotel', 'jacApiHotel', 'mikiApiHotel', 'dotwApiHotel',
    'tripadvisorHotel', 'hiltonHotel', 'yundijieHotel', 'elongFlight',
    'ryanairFlight', 'ctripFlight', 'jijitongFlight', 'tongchengFlight',
コード例 #16
0
from sqlalchemy.sql import text

from proj.my_lib.Common.BaseSDK import BaseSDK
from proj.my_lib.Common.Browser import MySession
from proj.my_lib.Common.KeyMatch import key_is_legal
from proj.my_lib.Common.NetworkUtils import google_get_map_info
from proj.my_lib.ServiceStandardError import ServiceStandardError
from proj.my_lib.ServiceStandardError import TypeCheckError
from proj.my_lib.attr_parser import parse as attr_parser
from proj.my_lib.db_localhost import DBSession
from proj.my_lib.logger import get_logger
from proj.my_lib.new_hotel_parser.data_obj import text_2_sql
from proj.my_lib.rest_parser import parse as rest_parser
from proj.my_lib.shop_parser import parse as shop_parser

logger = get_logger("POIDetail")

# Dispatch table: POI type -> parser function.
parser_type = {'attr': attr_parser, 'rest': rest_parser, 'shop': shop_parser}


class PoiDetailSDK(BaseSDK):
    def _execute(self, **kwargs):
        target_url = self.task.kwargs['target_url']
        city_id = self.task.kwargs['city_id']
        poi_type = self.task.kwargs['poi_type']

        target_url = target_url.replace('.com.hk', '.cn')
        with MySession(need_cache=True) as session:
            page = session.get(target_url, timeout=120)
            page.encoding = 'utf8'
コード例 #17
0
from copy import deepcopy

from mioji.spider_factory import factory
from mioji.common.task_info import Task
from proj.my_lib.Common.BaseSDK import BaseSDK
from proj.my_lib.Common.Browser import MySession
from proj.my_lib.ServiceStandardError import ServiceStandardError
from proj.list_config import cache_config, none_cache_config
from proj.my_lib.logger import get_logger
from proj.my_lib.new_hotel_parser.hotel_parser import parse_hotel
from proj.mysql_pool import service_platform_pool
from mongo_pool import mongo_data_client
from proj.my_lib.models.HotelModel import CommonHotel
import json

# Module-level logger for the hotel-detail SDK.
logger = get_logger("HotelDetailSDK")


def hotel_detail_database(url, source, need_cache=True):
    task = Task()
    task.content = url
    spider = factory.get_spider_by_old_source(source + 'DetailHotel')
    spider.task = task
    spider.task.source = source
    if need_cache:
        error_code = spider.crawl(required=['hotel'],
                                  cache_config=cache_config)
    else:
        error_code = spider.crawl(required=['hotel'],
                                  cache_config=none_cache_config)
    logger.info(
コード例 #18
0
# coding=utf-8
import sys
import traceback

# Python 2-only idiom: force the default string encoding to utf8.
reload(sys)
sys.setdefaultencoding('utf8')

from proj.celery import app
from proj.my_lib.new_hotel_parser.hotel_parser import parse_hotel
from proj.my_lib.BaseTask import BaseTask
from proj.my_lib.PageSaver import get_page_content
from my_lib.new_hotel_parser.data_obj import DBSession

from proj.my_lib.logger import get_logger

logger = get_logger("HotelTripadvisor")


@app.task(bind=True, base=BaseTask, max_retries=2, rate_limit='5/s')
def hotel_static_base_data(self, parent_task_id, task_name, source, source_id,
                           city_id, hotel_url, **kwargs):
    logger.info("parent task id: {0}, start task".format(parent_task_id))
    self.task_source = source.title()
    self.task_type = 'HotelStaticDataParse'
    # 获取保存的页面信息
    other_info = {'source_id': source_id, 'city_id': city_id}
    logger.info(
        'http://10.10.180.145:8888/hotel_page_viewer?task_name=hotel_base_data_tripadvisor_total_new&id='
        + parent_task_id)

    content = get_page_content(task_id=parent_task_id, task_name=task_name)
コード例 #19
0
# @Author  : Hou Rong
# @Site    :
# @File    : hotel_routine_tasks.py
# @Software: PyCharm
# coding=utf-8
import re
from my_lib.new_hotel_parser.data_obj import DBSession
from proj.celery import app
from proj.my_lib.new_hotel_parser.hotel_parser import parse_hotel
from proj.my_lib.ServiceStandardError import TypeCheckError
from proj.my_lib.BaseRoutineTask import BaseRoutineTask
from proj.my_lib.PageSaver import save_task_and_page_content
from proj.my_lib.Common.Browser import MySession
from proj.my_lib.logger import get_logger

# Module-level logger for the hotel routine tasks.
logger = get_logger("HotelDetail")


@app.task(bind=True, base=BaseRoutineTask, max_retries=2, rate_limit='6/s')
def hotel_routine_base_data(self, source, url, other_info, **kwargs):
    self.task_source = source.title()
    self.task_type = 'Hotel'

    self.error_code = 0

    # 初始化任务
    try:
        # hotels
        if source == 'hotels':
            hotel_id = re.findall('hotel-id=(\d+)', url)[0]
            url = 'http://zh.hotels.com/hotel/details.html?hotel-id=' + hotel_id
コード例 #20
0
import pymongo
import pymongo.errors
import requests.exceptions
from proj.my_lib.logger import get_logger
from proj.my_lib.Common.BaseSDK import BaseSDK
from proj.my_lib.Common.Browser import MySession
from proj.my_lib.ServiceStandardError import ServiceStandardError
from proj.my_lib.Common.Task import Task
import json

# Mongo connection settings for the suggestion store.
config = {
    'host': '10.10.213.148',
}

logger = get_logger('daodao_suggest')

client = pymongo.MongoClient(**config)
db = client['SuggestName']

# Browser-like headers expected by the TripAdvisor type-ahead endpoint.
headers = {
    'referer': 'https://www.tripadvisor.cn/',
    'x-requested-with': 'XMLHttpRequest',
    'accept-encoding': 'gzip, deflate, br',
    'accept': 'text/javascript, text/html, application/xml, text/xml, */*',
    'accept-language': 'zh-CN,zh;q=0.9',
    'Origin': 'https://www.tripadvisor.cn',
    'Host': 'www.tripadvisor.cn'
}
search_url = "https://www.tripadvisor.cn/TypeAheadJson"
コード例 #21
0
# -*- coding:utf-8 -*-
import pymongo
import pymongo.errors
import requests.exceptions
from proj.my_lib.logger import get_logger
from proj.my_lib.Common.BaseSDK import BaseSDK
from proj.my_lib.Common.Browser import MySession
from proj.my_lib.ServiceStandardError import ServiceStandardError
from proj.my_lib.Common.Task import Task
import json
from lxml import html
# Mongo connection settings for the suggestion store.
config = {
    'host': '10.10.213.148',
}

logger = get_logger('marriott_suggest')

client = pymongo.MongoClient(**config)
db = client['SuggestName']

# Browser-like headers expected by the Marriott autocomplete endpoint.
headers = {
    'referer': 'http://www.marriott.com.cn/default.mi',
    'x-requested-with': 'XMLHttpRequest',
    'accept-encoding': 'gzip, deflate, br',
    'accept': '*/*',
    'accept-language': 'zh-CN,zh;q=0.9',
}
search_url = "http://www.marriott.com.cn/search/autoComplete.mi"


class MarriottCitySDK(BaseSDK):
コード例 #22
0
from proj.my_lib.Common.KeyMatch import key_is_legal
from proj.my_lib.Common.NetworkUtils import google_get_map_info
from proj.my_lib.ServiceStandardError import ServiceStandardError
from proj.my_lib.ServiceStandardError import TypeCheckError
from proj.my_lib.logger import get_logger
from proj.my_lib.my_qyer_parser.data_obj import DBSession
from proj.my_lib.my_qyer_parser.my_parser import page_parser
from proj.my_lib.new_hotel_parser.data_obj import text_2_sql
from proj.my_lib.Common.Utils import retry
from lxml import html
import json
import pymongo
from urlparse import urljoin

mongo_config = {'host': '10.10.213.148'}
logger = get_logger("QyerPoiCity")

# Baidu web search endpoint used by BaiDuSearchSDK below.
search_url = 'https://www.baidu.com/s'

headers = {
    'Host': 'www.baidu.com',
    'is_referer': 'https://www.baidu.com/',
    'is_xhr': '1',
    'Referer': 'https://www.baidu.com/',
}


class BaiDuSearchSDK(BaseSDK):
    @retry(times=5)
    def _execute(self, **kwargs):
        with MySession(need_cache=True, need_proxies=True) as session:
コード例 #23
0
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time    : 2017/10/31 下午7:34
# @Author  : Hou Rong
# @Site    :
# @File    : img_hash.py
# @Software: PyCharm
import imagehash
from PIL import Image
from proj.my_lib.logger import get_logger, func_time_logger

# Module-level logger for image hashing helpers.
logger = get_logger("img_hash")


@func_time_logger
def _img_p_hash(f_obj):
    """Compute the perceptual hash (phash) of an image file object.

    Returns the imagehash value, or None when the image cannot be opened
    or hashed. The file position is rewound before and after hashing.
    """
    f_obj.seek(0)
    try:
        image = Image.open(f_obj)
    except Exception as exc:
        logger.exception(msg="[error img]", exc_info=exc)
        return None

    try:
        result = imagehash.phash(image)
    except Exception as exc:
        logger.exception(msg="[could not calculate phash]", exc_info=exc)
        return None

    f_obj.seek(0)
    return result
コード例 #24
0
ファイル: tasks.py プロジェクト: 20113261/platform_service
from .my_lib.hotel_comment.expedia import parser as expedia_comment_parser
from .my_lib.hotel_comment.venere import parser as venere_comment_parser
from .my_lib.is_complete_scale_ok import is_complete_scale_ok
from .my_lib.rest_parser import insert_db as rest_insert_db
from .my_lib.rest_parser import parse as rest_parser
from .my_lib.shop_parser import insert_db as shop_insert_db
from .my_lib.shop_parser import parse as shop_parser
from .my_lib.task_module.task_func import get_task_id, update_task, insert_task
from .my_lib.tp_comment_parser import parse, long_comment_parse, insert_db

# Allow celery workers to run as root.
platforms.C_FORCE_ROOT = True

_rate_limit_dict = get_rate_limit()
from proj.my_lib.logger import get_logger

logger = get_logger('ImgList')


@app.task
def add_task():
    """Enqueue ten demo `add` tasks with random operands in [1, 10]."""
    for _ in range(10):
        add.delay(random.randint(1, 10), random.randint(1, 10))


@app.task
def add_image_url():
    url_list_file = ['img_url_1101', 'img_url_1103', 'img_url_test']
    count = 0
    for file_name in url_list_file:
        path = '/search/image/' + file_name + '_celery'
        if os.path.exists(path):
コード例 #25
0
from sqlalchemy.sql import text

from proj.my_lib.Common.BaseSDK import BaseSDK
from proj.my_lib.Common.Browser import MySession
from proj.my_lib.Common.KeyMatch import key_is_legal
from proj.my_lib.Common.NetworkUtils import google_get_map_info
from proj.my_lib.ServiceStandardError import ServiceStandardError
from proj.my_lib.ServiceStandardError import TypeCheckError
from proj.my_lib.logger import get_logger
from proj.my_lib.my_qyer_parser.data_obj import DBSession
from proj.my_lib.my_qyer_parser.my_parser import page_parser
from proj.my_lib.new_hotel_parser.data_obj import text_2_sql
from proj.my_lib.Common.Utils import retry

# Module-level logger for the Qyer POI-detail SDK.
logger = get_logger("QyerPoiDetail")


class QyerDetailSDK(BaseSDK):
    @retry(times=3)
    def _execute(self, **kwargs):
        with MySession(need_cache=True, need_proxies=True) as session:
            city_id = self.task.kwargs['city_id']
            target_url = self.task.kwargs['target_url']
            headers = {'Host': 'place.qyer.com'}
            page = session.get(target_url, headers=headers, timeout=240)
            page.encoding = 'utf8'
            content = page.text

            if '请输入验证码' in content:
                raise Exception("请输入验证码")
コード例 #26
0
# -*- coding: utf-8 -*-
# @Time    : 2017/10/9 下午2:06
# @Author  : Hou Rong
# @Site    : ${SITE}
# @File    : NetworkUtils.py
# @Software: PyCharm
import json
from urllib import quote

import pymongo
import proj.my_lib.Common.Browser
from proj.my_lib.Common.Utils import Coordinate
from proj.my_lib.logger import get_logger
from proj.my_lib.Common.Utils import retry

logger = get_logger("google_map_info_logger")

# Credentials are masked ('******') in this snippet.
# NOTE(review): the host string already contains ':27017' while 27017 is also
# passed as the port argument — confirm this is intentional.
client = pymongo.MongoClient('10.19.2.103:27017',
                             27017,
                             username='******',
                             password='******')
db = client['Google_city']


@retry(times=4, raise_exc=False)
def google_get_map_info(temp, address):
    logger.info('google_get_map_info +++ {1}'.format(address))
    result = None
    with proj.my_lib.Common.Browser.MySession(need_cache=True) as session:
        page = session.get(
            'https://maps.googleapis.com/maps/api/geocode/json?address=' +
コード例 #27
0
#!/usr/bin/env python
# encoding: utf-8
import pymysql
from pymysql.cursors import SSDictCursor, SSCursor
from proj.my_lib.logger import get_logger

logger = get_logger("data_source")


class MysqlSource:
    """Iterable MySQL data source backed by a server-side cursor.

    Iterating an instance opens a fresh connection and streams rows in
    chunks of *size* via the module's cursor_gen helper.
    """

    def __init__(self, db_config, table_or_query='', size=500, is_table=True, is_dict_cursor=False):
        """
        :param db_config: pymysql connection kwargs (not mutated).
        :param table_or_query: table name (is_table=True) or full SQL query.
        :param size: fetch chunk size for the server-side cursor.
        :param is_table: treat table_or_query as a table name and build
            'select * from ...' when True; otherwise use it as the SQL.
        :param is_dict_cursor: stream rows as dicts (SSDictCursor) instead
            of tuples (SSCursor).
        """
        # Copy the caller's config: the original wrote 'cursorclass' straight
        # into db_config, mutating a dict the caller may share or reuse.
        self._db_config = dict(db_config)
        if is_dict_cursor:
            self._db_config['cursorclass'] = SSDictCursor
        else:
            self._db_config['cursorclass'] = SSCursor
        self._size = size
        self._table = table_or_query
        if is_table:
            self._sql = 'select * from {0}'.format(self._table)
        else:
            self._sql = table_or_query

    def __iter__(self):
        # cursor_gen (defined elsewhere in this module) yields rows in
        # self._size chunks from a new connection per iteration.
        return cursor_gen(pymysql.Connect(**self._db_config), self._sql, self._size)

コード例 #28
0
# @File    : download_img.py
# @Software: PyCharm
import gevent.monkey

gevent.monkey.patch_all()
import gevent.pool
import pymysql
import time
import os
from proj.my_lib.ks_upload_file_stream import download
from proj.mysql_pool import service_platform_pool, spider_data_poi_pool
from proj.my_lib.logger import get_logger, func_time_logger

# Gevent pool bounding concurrent downloads.
pool = gevent.pool.Pool(size=200)

logger = get_logger("pic_detect_download")

# Root directory where formatted images are written.
PARENT_PATH = "/data/image/formatted_image"

SCAN_FILTER = 6000
EACH_TIMES_PER_TASK = 1000
MAX_PIC_PER_VIEW = 10000

# Module-level accumulators shared by the download/insert helpers below.
update_data_list = []
task_data_list = []
is_new_task = True


@func_time_logger
def insert_all_data():
    global update_data_list
コード例 #29
0
import zlib
import requests
import time
import datetime
import logging
import httplib
import functools
import proj.my_lib.Common.Utils
from ucloud.ufile import putufile, downloadufile, postufile, deleteufile
from ucloud.compact import BytesIO
from proj.my_lib.logger import get_logger
from ucloud.ufile import config

# Quiet the UCloud SDK's own logger; use our logger for this module.
u_logger = logging.getLogger("UCLOUD")
u_logger.setLevel(logging.ERROR)
logger = get_logger("ufile_uploader")

# Pick the UFile region suffix from the machine's internal IP range.
local_ip = proj.my_lib.Common.Utils.get_local_ip()
if local_ip.startswith('10.10'):
    config.set_default(uploadsuffix='.ufile.cn-north-03.ucloud.cn')
    config.set_default(downloadsuffix='.ufile.cn-north-03.ucloud.cn')
elif local_ip.startswith('10.19'):
    config.set_default(uploadsuffix='.ufile.cn-north-04.ucloud.cn')
    config.set_default(downloadsuffix='.ufile.cn-north-04.ucloud.cn')
else:
    logger.debug("[no ucloud machine][use public ip]")
config.set_default(connection_timeout=60)

# public_key = 'vCuKhG6UcHvB1UswK/q5zjMMHxt7PjKIu3/6q1JtxiZHtAuv'
# private_key = 'fdfbdf9cb0ebfeed522f664efc44f752694b15f6'
public_key = 'M7jIsudUE4Nvn6zQGjNMWxReCrSpc8HcWdBztizB38qvbXkS'
コード例 #30
0
import hashlib
import sys
import pymysql
import mock
from MongoTaskInsert import InsertTask, TaskType
from collections import defaultdict

import proj.my_lib.my_mongo_insert
from pymongo.errors import DuplicateKeyError
from send_email import send_email, SEND_TO, EMAIL_TITLE
from proj.my_lib.logger import get_logger

from warnings import filterwarnings

# Suppress noisy pymysql warnings globally.
filterwarnings('ignore', category=pymysql.err.Warning)
logger = get_logger("send_task")

# NOTE(review): 'pymongo' and 'redis' are not imported in the visible part of
# this file — verify they are imported above this snippet.
client = pymongo.MongoClient(host='10.10.231.105')
collections = client['MongoTask']['Task']
redis_md5 = redis.Redis(host='10.10.114.35', db=9)


def hourong_patch(data):
    """Bulk-insert *data* into the Task collection using a patched _insert.

    Temporarily replaces pymongo's Collection._insert with the project's
    custom implementation, inserts with continue_on_error=True, and returns
    the number of inserted documents ('n'). On DuplicateKeyError an alert
    e-mail is sent instead of re-raising.
    NOTE(review): 'datetime' and 'traceback' are not imported in the visible
    snippet, and the function body may be truncated here — verify upstream.
    """
    try:
        with mock.patch('pymongo.collection.Collection._insert', proj.my_lib.my_mongo_insert.Collection._insert):
            result = collections.insert(data, continue_on_error=True)
            return result['n']
    except DuplicateKeyError as e:
        send_email(EMAIL_TITLE,
                   '%s   %s \n %s' % (sys._getframe().f_code.co_name, datetime.datetime.now(), traceback.format_exc(e)),
                   SEND_TO)