예제 #1
0
 def __init__(self, pinyin, version=None):
     """
     Set up the per-instance state.

     :param pinyin:  stored as ``self.pinyin`` and used as the ``Logging`` name
     :param version: stored as ``self.version``
     """
     self.pinyin, self.version = pinyin, version
     self.ua = cu.get_user_agent()
     self.logging = Logging(name=pinyin)
     self.recChar = None
     self.yzm_count = 0
     # ``debug`` is 1 only when [setting] debug in Config.ini reads "true"
     # (case-insensitive); "false" is the fallback passed to ``get``.
     debug_opt = ConfigGet('Config.ini').get("setting", "debug", "false")
     if debug_opt.lower() == "true":
         self.debug = 1
     else:
         self.debug = 0
예제 #2
0
 def firstPorxy(self,proxy_num):
     """
     Decide whether to keep the current proxy or switch to a new one.

     When ConfigProxy has no ``series_num`` option for ``self.pro_name``,
     a proxy on port 42271 or 42272 is replaced on every call and any other
     proxy is replaced once ``proxy_num`` reaches 50.  When the option
     exists, the proxy is replaced once ``proxy_num`` reaches the configured
     value.  With no current proxy a new one is always taken.

     :param proxy_num: how many times the current proxy has been used
     :return: tuple ``(proxy_num, self.proxy)`` -- the counter is reset to 0
              when the proxy was replaced, otherwise incremented by one
     """
     renew = True
     if self.proxy:
         if not ConfigGet(cfpath).has_option('series_num', self.pro_name):
             port = str(self.proxy.split(":")[-1])
             # The counter checked here is the one that is returned; the
             # original mixed in ``self.proxy_num`` for this test.
             renew = port in ["42271", "42272"] or proxy_num >= 50
         else:
             # Compare the usage counter with the configured limit.  The
             # original compared ``self.pro_name`` (a name) with the int,
             # which is always true on Python 2, so the proxy was never kept.
             renew = proxy_num >= int(f(cfpath, 'series_num', self.pro_name))
     if renew:
         proxy_num = 0
         # NOTE(review): ``getPorxy`` is referenced without a call, exactly
         # as before; presumably it is exposed as a property -- confirm.
         self.proxy = self.getPorxy
     else:
         proxy_num += 1
     return proxy_num,self.proxy
예제 #3
0
 def getPorxy(self):
     """
     Return the proxy to use, or ``None`` when debugging.

     If [setting] debug in Config.ini reads "true" (case-insensitive) no
     proxy is used; otherwise ``self.httpProxyApi`` is stored in
     ``self.proxy`` and returned.
     """
     debug_flag = ConfigGet('Config.ini').get('setting','debug')
     if debug_flag.lower() == 'true':
         return None
     self.proxy = self.httpProxyApi
     return self.proxy
예제 #4
0
 def __init__(self,
              work,
              log_name="ProcessControl",
              conf_file="ItemConfig.ini",
              conf_key="qyxx",
              seed_key=None):
     """
     Initialise the process-management parameters.

     :param work:      main function run by each process
     :param log_name:  name handed to ``logging.getLogger``
     :param conf_file: configuration file name
     :param conf_key:  key used to read the configuration; it defines which
                       crawler types are loaded and how many processes each gets
     :param seed_key:  key that defines whether the loaded crawlers need seeds
     """
     self.work = work
     self.conf_key, self.seed_key = conf_key, seed_key
     self.conf_file = conf_file
     self.logger = logging.getLogger(log_name)
     self.conf_getter = ConfigGet(conf_file)
예제 #5
0
# -*- coding: utf-8 -*-
"""
企业信息网代理模块
"""
import sys
reload(sys)
sys.setdefaultencoding("utf-8")
import time
from requests import get
import re
sys.path.append('../')
from Config.ConfigGet import ConfigGet
from CommonLib.DB.DBManager import DBManager


def f(x, y, z):
    """Return ``ConfigGet(x).get(y, z)`` (presumably file ``x``, section ``y``, option ``z``)."""
    return ConfigGet(x).get(y, z)


cfpath = 'ConfigProxy.ini'

class Proxy(object):
    def __init__(self,pro_name):
        """
        Prepare the proxy helper for ``pro_name``.

        Runs ``proxyInit``, then opens the ``<pro_name>_black_proxy`` ssdb
        store using the host and port from the ``db`` settings in ConfigProxy.ini.
        No proxy is selected yet.

        :param pro_name: name stored as ``self.pro_name`` and used as the
                         prefix of the black-proxy table
        """
        self.pro_name = pro_name
        self.proxyInit()
        db_host = f(cfpath, 'db', 'host')
        db_port = int(f(cfpath, 'db', 'port'))
        table = '%s_black_proxy' % self.pro_name
        self.__db = DBManager.getInstance('ssdb', table, host=db_host, port=db_port)
        self.proxy = None

    def proxyInit(self):
        """
        代理统计初始化
예제 #6
0
# -*- coding: utf-8 -*-
"""
年报信息抓取接口模块
"""
import sys
reload(sys)
sys.setdefaultencoding("utf-8")
import time
sys.path.append("../")
from CommonLib.DB.DBManager import DBManager
from Config.ConfigGet import ConfigGet
fp = 'NbxxApiControler.ini'


def f(x):
    """Return ``ConfigGet(fp).get('db', x)``, i.e. entry ``x`` under ``'db'`` in ``fp``."""
    return ConfigGet(fp).get('db', x)


class NbxxApiControler(object):
    def __init__(self):
        """
        Open the database handle described by the ``db`` settings
        (``type``, ``table`` and ``server``) read through ``f``.
        """
        db_type = f('type')
        table = f('table')
        server = f('server')
        self.__db = DBManager.getInstance(db_type, table, server=server)

    def getRowkey(self, **kwargs):
        '''
        获取rowkey
        :param kwargs:mapping:company_name,company_zch or zch
        :return:
        '''

        company_name = kwargs.get('company_name', '')
        zch = kwargs.get('company_zch', '')
        zch = zch if zch else kwargs.get('zch', '')
예제 #7
0
class ProcessControl(object):
    """
    Start, supervise and resize groups of worker processes, one group per
    bbd_type.

    The wanted number of processes per bbd_type is read from the
    configuration through ``ConfigGet``; ``processMonitor`` polls it and
    restarts dead processes or grows/shrinks the groups when it changes.
    """

    def __init__(self,
                 work,
                 log_name="ProcessControl",
                 conf_file="ItemConfig.ini",
                 conf_key="qyxx",
                 seed_key=None):
        """
        Initialise the process-management parameters.

        :param work:      main function run by each process
        :param log_name:  name handed to ``logging.getLogger``
        :param conf_file: configuration file name
        :param conf_key:  key used to read the configuration; it defines which
                          crawler types are loaded and how many processes each gets
        :param seed_key:  key that defines whether the loaded crawlers need seeds
        """
        self.work = work
        self.conf_file = conf_file
        self.logger = logging.getLogger(log_name)
        self.conf_getter = ConfigGet(self.conf_file)
        self.conf_key = conf_key
        self.seed_key = seed_key

    def loadConfig(self):
        """
        Snapshot the configuration.

        Stores the config MD5 (compared later to detect changes), the
        ``conf_key`` items as ``PROV_DICT`` and, when ``seed_key`` is set,
        the ``seed_key`` items as ``SEED_DICT`` (``None`` otherwise).

        :return: None
        """
        self.conf_md5 = self.conf_getter.cfMd5()
        self.PROV_DICT = self.conf_getter.itemsToDict(self.conf_key)
        if self.seed_key:
            self.SEED_DICT = self.conf_getter.itemsToDict(self.seed_key)
        else:
            self.SEED_DICT = None

    def startProcess(self, bbd_type, process_name):
        """
        Start one process for ``bbd_type``.

        The process runs ``self.work(bbd_type)``, or
        ``self.work(bbd_type, seed)`` when ``SEED_DICT`` has an entry for
        this bbd_type.

        :param bbd_type: selects which kind of crawler is loaded
        :param process_name: process name, used when logging process info
        :return: the started ``Process`` object
        """
        if self.SEED_DICT and bbd_type in self.SEED_DICT:
            args = (bbd_type, self.SEED_DICT[bbd_type])
        else:
            args = (bbd_type,)
        p = Process(target=self.work, name=process_name, args=args)
        p.start()
        return p

    def startBulkProcess(self, bbd_type, num, idx=0):
        """
        Start ``num`` processes for ``bbd_type`` via ``startProcess``.

        Processes are named ``"<bbd_type>:<n>"`` with ``n`` counting up
        from ``idx``.

        :param bbd_type: used to init the worker
        :param num: number of processes to start (anything ``int()`` accepts)
        :param idx: 0 for a fresh group; pass the number of processes that
                    are already running when extending a group
        :return: list of the processes that were started
        """
        num = int(num)
        return [self.startProcess(bbd_type, bbd_type + ":" + str(index + idx))
                for index in range(num)]

    def _terminate(self, bbd_type, process):
        """Terminate and join one process, then log that it was killed."""
        process.terminate()
        process.join()
        self.logger.warning("Type:%s PID:%s killed", bbd_type, process.pid)

    def endTypeProcess(self, bbd_type, p_dict):
        """
        Terminate every process of one bbd_type.

        :param bbd_type: process bbd_type
        :param p_dict: dict that maps bbd_type to its list of processes
        :return: ``p_dict`` with the ``bbd_type`` key removed
        """
        for process in p_dict.get(bbd_type) or []:
            self._terminate(bbd_type, process)
        p_dict.pop(bbd_type, None)
        return p_dict

    def endBulkProcess(self, bbd_type, p_dict, num):
        """
        Terminate ``abs(num)`` processes of ``bbd_type``, newest first.

        :param bbd_type: process bbd_type
        :param p_dict: dict that maps bbd_type to its list of processes
        :param num: how many processes to kill; the sign is ignored
        :return: ``p_dict`` without the killed processes
        """
        p_list = p_dict[bbd_type]
        # Never pop more entries than exist: asking for too many used to
        # raise IndexError halfway through the shutdown.
        for _ in range(min(abs(num), len(p_list))):
            self._terminate(bbd_type, p_list.pop())
        p_dict[bbd_type] = p_list
        return p_dict

    def run(self):
        """
        Main entry point: load the configuration, start every configured
        group of processes and hand over to ``processMonitor``.

        :return: None (``processMonitor`` loops forever)
        """
        self.loadConfig()
        process_dict = {}
        for bbd_type, num in self.PROV_DICT.items():
            process_dict[bbd_type] = self.startBulkProcess(bbd_type, int(num))
        self.processMonitor(process_dict)

    def processMonitor(self, p_dict):
        """
        Monitor the processes forever, polling every 5 seconds.

        1. if a process terminated unexpectedly, restart it
        2. add new processes
        3. kill processes the user wants killed
        Actions 2 and 3 are driven by changes of the configuration.

        :param p_dict: dict that maps bbd_type to its list of processes
        :return: never returns
        """
        while True:
            self.conf_getter.reload()
            new_conf_md5 = self.conf_getter.cfMd5()
            if self.conf_md5 == new_conf_md5:
                self.logger.info("configuration file no change , print info, logger=%s", str(self.logger))
                self._reviveDeadProcesses(p_dict)
            else:
                self.logger.info("configuration file changed ,Reload")
                p_dict = self._applyConfigChange(p_dict)
                self.conf_md5 = new_conf_md5
            time.sleep(5)

    def _reviveDeadProcesses(self, p_dict):
        """Log the status of every process and replace dead ones in place."""
        for bbd_type, p_list in list(p_dict.items()):
            if not p_list:
                continue
            # Build a fresh list instead of appending to the one being
            # iterated.  A replacement stays in the list even if it dies
            # right away, so the next poll retries it and the group keeps
            # its size.
            current = []
            for p in p_list:
                alive = p.is_alive()
                if alive:
                    self.logger.info("[ " + p.name + " ]" + " status: " + str(alive) + " pid: " + str(p.pid))
                    current.append(p)
                    continue
                self.logger.info("[ " + p.name + " ]" + " status: Dead " + " pid: " + str(p.pid))
                new_p = self.startProcess(bbd_type, p.name)
                # Message text is unchanged from the original so existing
                # log consumers still match it.
                self.logger.info("Restart" + "[ " + new_p.name + " ]" + " status: Dead " + " pid: " + str(new_p.pid))
                current.append(new_p)
            p_dict[bbd_type] = current

    def _applyConfigChange(self, p_dict):
        """Re-read the process counts and start/stop processes to match them."""
        self.NEW_PROV_DICT = self.conf_getter.itemsToDict(self.conf_key)
        new_list = self.getNewProvs(self.PROV_DICT, self.NEW_PROV_DICT)
        del_list = self.getDelProvs(self.PROV_DICT, self.NEW_PROV_DICT)
        # Always computed: it used to be skipped whenever a type was
        # removed, leaving the name undefined (or stale) in the loop below.
        update_dict = self.getUpdateProvs(self.PROV_DICT, self.NEW_PROV_DICT)
        for bbd_type in new_list:
            p_dict[bbd_type] = self.startBulkProcess(bbd_type, self.NEW_PROV_DICT[bbd_type])
        for bbd_type in del_list:
            p_dict = self.endTypeProcess(bbd_type, p_dict)
        for bbd_type, new_num in update_dict.items():
            if new_num > 0:
                running = p_dict.setdefault(bbd_type, [])
                running.extend(self.startBulkProcess(bbd_type, new_num, idx=len(running)))
            else:
                p_dict = self.endBulkProcess(bbd_type, p_dict, new_num)
        self.PROV_DICT = self.NEW_PROV_DICT
        return p_dict

    def getNewProvs(self, pre_dict, new_dict):
        """
        Get the bbd_types that were added.

        :param pre_dict: previous dict of bbd_type -> process count
        :param new_dict: new dict of bbd_type -> process count
        :return: list of bbd_types only present in ``new_dict`` ([] if none)
        """
        return list(set(new_dict.keys()) - set(pre_dict.keys()))

    def getDelProvs(self, pre_dict, new_dict):
        """
        Get the bbd_types that were removed.

        :param pre_dict: previous dict of bbd_type -> process count
        :param new_dict: new dict of bbd_type -> process count
        :return: list of bbd_types only present in ``pre_dict`` ([] if none)
        """
        return list(set(pre_dict.keys()) - set(new_dict.keys()))

    def getUpdateProvs(self, pre_dict, new_dict):
        """
        Get the change in process count for bbd_types present in both dicts.

        :param pre_dict: previous dict of bbd_type -> process count
        :param new_dict: new dict of bbd_type -> process count
        :return: dict of bbd_type -> ``int(new) - int(old)`` for every
                 bbd_type whose value differs; removed bbd_types are
                 skipped (``getDelProvs`` reports those)
        """
        n_dict = {}
        for k, v in pre_dict.items():
            if k not in new_dict:
                continue
            if v != new_dict[k]:
                n_dict[k] = int(new_dict[k]) - int(v)
        return n_dict