def __init__(self, driver_path: str = DRIVER_PATH, proxy: dict = None, ua: str = USER_AGENT.DEFAULT_UA,
             headless: bool = False):
    """Start a Chrome webdriver and expose this wrapper's helpers on it.

    :param driver_path: path to the chromedriver executable
    :param proxy: optional mapping; its ``"host"`` and ``"port"`` entries are
        used to build an ``http://host:port`` proxy server argument
    :param ua: user-agent string; no user-agent argument is added when falsy
    :param headless: when true, Chrome is started with ``--headless``
    """
    options = webdriver.ChromeOptions()

    if proxy is not None:
        proxy_url = "http://{}:{}".format(proxy["host"], proxy["port"])
        options.add_argument("--proxy-server={}".format(proxy_url))
        # NOTE(review): assumed to belong to the proxy branch (the source
        # formatting was flattened) -- confirm against the original layout.
        options.add_argument("--ignore-certificate-errors")

    if ua:
        options.add_argument("user-agent=" + ua)

    if headless:
        for flag in ("--headless", "--disable-gpu"):
            options.add_argument(flag)

    self.driver = webdriver.Chrome(executable_path=driver_path, chrome_options=options)

    # Re-export the wrapper's helper methods on the raw driver object:
    # (attribute name on the driver, method name on this wrapper).
    exported = (
        ("click", "click"),
        ("tap", "tap"),
        ("send", "send_keys"),
        ("execute_js", "execute_js"),
        ("flick", "flick"),
        ("scroll", "scroll"),
        ("request", "request"),
        ("drag_and_drop", "drag_and_drop"),
    )
    for driver_attr, method_name in exported:
        setattr(self.driver, driver_attr, getattr(self, method_name))

    self.logger = logger_util.get_logger(ChromeDriver)
    self.driver.logger = self.logger
def __init__(self, urls, headers, proxies, timeout=8, log_file_path=""):
    """Store the request settings and configure the unit's logger.

    :param urls: stored unchanged on ``self.urls``
    :param headers: stored unchanged on ``self.headers``
    :param proxies: stored unchanged on ``self.proxies``
    :param timeout: stored unchanged on ``self.timeout`` (default ``8``)
    :param log_file_path: file that log records are appended to; when empty
        (the default) only the stream handler is attached
    """
    self.urls = urls
    self.headers = headers
    self.proxies = proxies
    self.timeout = timeout
    self.logger = get_logger(SingleThreadApiTestUnit)

    handler_format = logging.Formatter(
        "[%(asctime)s]-[%(name)s]-[%(levelname)s]: %(message)s")

    handlers = [logging.StreamHandler()]
    # ``logging.FileHandler("")`` resolves the empty path to the current
    # directory and fails when opening it, so the default argument used to
    # crash the constructor. Only create the file handler for a real path.
    if log_file_path:
        handlers.append(logging.FileHandler(log_file_path, "a"))

    # NOTE(review): handlers are added on every instantiation; if get_logger
    # returns a shared logger, repeated construction will duplicate output.
    for handler in handlers:
        handler.setFormatter(handler_format)
        self.logger.addHandler(handler)
def __init__(self, proxy: dict = None, ua: str = USER_AGENT.DEFAULT_UA,
             driver_path: str = "/opt/package/phantomjs-2.1.1-linux-x86_64/bin/phantomjs"):
    """Start a PhantomJS webdriver with optional proxy and user agent.

    :param proxy: optional mapping; its ``"host"``, ``"port"`` and ``"type"``
        entries are turned into PhantomJS ``--proxy`` / ``--proxy-type``
        service arguments (SSL errors are ignored when a proxy is used)
    :param ua: user-agent string; when falsy the capability is left untouched
        so PhantomJS keeps its own default
    :param driver_path: path to the PhantomJS executable; defaults to the
        location that was previously hard-coded
    :raises KeyError: if ``proxy`` is given without one of the keys above
    """
    self.logger = logger_util.get_logger(PhantomjsDriver)

    dcap = dict(DesiredCapabilities.PHANTOMJS)

    service_args = []
    if proxy is not None:
        service_args = [
            "--proxy={}:{}".format(proxy["host"], proxy["port"]),
            "--proxy-type={}".format(proxy["type"]),
            '--ignore-ssl-errors=true',
        ]

    # The original if/else assigned ``ua`` in both branches, which wrote a
    # falsy value (None / "") into the capabilities. Only set it when a
    # user agent was actually supplied.
    if ua:
        dcap["phantomjs.page.settings.userAgent"] = ua

    self.driver = webdriver.PhantomJS(
        executable_path=driver_path,
        service_args=service_args,
        desired_capabilities=dcap)
    self.driver.logger = self.logger
import traceback

import pymongo

from dio_core.network.downloader import Downloader
from dio_core.network.downloader.downloader import Setting
from dio_core.utils import logger_util, time_util, md5_util, url_util
from dio_core.utils.file_util import csv_util

# Module-level logger, named after this file's path.
logger = logger_util.get_logger(__file__)

# Comma-separated list of record field names, kept as a single string
# (the two literals below are concatenated, so "averagePri" + "ceTip" forms
# one field name).
# NOTE(review): presumably the column order for CSV export -- confirm at the call site.
fields = (
    "_id,productList,name,monthSalesTip,wmPoiScore,distance,shippingFeeTip,minPriceTip,deliveryTimeTip,averagePri"
    "ceTip,thirdCategory,recommendInfo,activityList,labelInfoList,keyword,url")

# Home page URL, search URL, shop-list URL (takes one format argument),
# food URL and comment URL.
MAIN_URL = "http://h5.waimai.meituan.com/waimai/mindex/home"
SEARCH_URL = "http://i.waimai.meituan.com/openh5/search/poi"
SHOP_SEARCH_URL = "http://i.waimai.meituan.com/openh5/homepage/poilist?_={}"
FOOD_URL = "http://i.waimai.meituan.com/openh5/poi/food"
COMMENT_URL = "http://i.waimai.meituan.com/openh5/poi/comments"

# MongoDB configuration: the client object is created when this module is
# imported and targets localhost:27017, database "meituan".
client = pymongo.MongoClient(host='localhost', port=27017)
db = client['meituan']
meituanwaimai_shop_list = db['meituanwaimai_shop_list_v1']
meituanwaimai_search_list = db['meituanwaimai_search_list']
meituanwaimai_food_list = db['meituanwaimai_food_list_v1']
meituanwaimai_comment_list = db['meituanwaimai_comment_list_v1']
decrypt_collection = db["meituanwaimai_decrypt"]
def __new__(cls, *args, **kwargs):
    """Create the instance, lazily attaching a class-level logger first.

    The logger is created only while ``cls.logger`` is ``None`` and is stored
    on the class, so later instantiations reuse it. Positional and keyword
    arguments are accepted but not forwarded to the parent ``__new__``.
    """
    if cls.logger is None:
        # ``cls`` is already the class here; ``cls.__class__`` would be its
        # metaclass (normally ``type``), which named every logger "type".
        cls.logger = logger_util.get_logger(cls.__name__)
    return super().__new__(cls)
import pymysql from dio_core.utils import json_util from dio_core.utils.logger_util import get_logger logger = get_logger(__file__) def getRhino() -> pymysql.Connection: """获取 rhino connect""" return pymysql.connect("devrhino1", "rhino", "rhino", "db_datatub_rhino", port=3306, cursorclass=pymysql.cursors.DictCursor, charset='utf8') def updateSourceCrawlId(taskId: int, mapping: dict): """ 更新 mapping = { "redis-link": [1, 3] } """ conn = getRhino() cur = conn.cursor() # 获取taskConfig querySql = "SELECT t.* FROM t_rhino_task_config t WHERE id = {};".format(
def __init__(self):
    """Attach a logger obtained from ``logger_util`` for this instance's class."""
    owner_class = self.__class__
    self.logger = logger_util.get_logger(owner_class)
# @Time : 18-5-26 8:13 PM
# @Author : DioMryang
# @File : mysql_util.py
# @Description :
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker

from dio_core.utils import logger_util

logger = logger_util.get_logger("mysql_util")


def create_connection(**config):
    """
    Create a MySQL connection and return a session bound to it.

    The keyword arguments must provide ``driver``, ``user``, ``password``,
    ``host``, ``port`` and ``db_name``; they are substituted into the
    connection URL. Only host, port and database name are logged.

    :return: a new session produced by a ``sessionmaker`` bound to a newly
        created engine
    """
    logger.info("create mysql connect {host}:{port}/{db_name}".format(**config))

    connection_url = 'mysql+{driver}://{user}:{password}@{host}:{port}/{db_name}?charset=utf8'.format(**config)
    engine = create_engine(connection_url, encoding='utf-8')

    session_factory = sessionmaker(bind=engine)
    return session_factory()