Code example #1
    def __downloadFile(self, url, req):
        # http://stackoverflow.com/questions/862173/how-to-download-a-file-using-python-in-a-smarter-way
        """
        File-download logic. Carried over from Wang Yang's code; not yet debugged.
        :param url: str download URL
        :param req: streaming requests response for the URL
        """
        # Requires: import urllib.request; from os.path import basename; from urllib.parse import urlsplit
        reqheaders = req.headers
        revealfn = url.split('/')[-1]

        # Default to the last URL segment so fileName is always bound
        fileName = revealfn
        if "." not in revealfn[-6:]:
            # No extension in the URL: try the Content-Disposition header first
            if 'Content-Disposition' in reqheaders:
                fileName = reqheaders['Content-Disposition'].split('filename=')[1]
                fileName = fileName.replace('"', '').replace("'", "")
            else:
                # Follow redirects and use the final URL's basename
                r = urllib.request.urlopen(url)
                if r.url != url:
                    fileName = basename(urlsplit(r.url)[2])
            self.URL_inf.FileName = fileName

        if self.URL_inf.FilePath:
            _FileName = str(self.URL_inf.FilePath) + fileName
        else:
            _FileName = fileName

        with open(_FileName, "wb") as donefile:
            for chunk in req.iter_content(chunk_size=1024):
                if chunk:
                    donefile.write(chunk)

        Log.i("File: " + _FileName + " downloaded")
Code example #2
 def find_many(self, collection_name, filter_dict=None, projection_dict=None, limit_size=0, skip_index=0):
     """
     Find multiple records; returns an empty list by default.
     :param collection_name: str collection name
     :param filter_dict: dict filter condition, e.g. {'campaignId': 123}
     :param projection_dict: dict fields to return, e.g. {'campaign.status': 1, 'updated': 1, '_id': 0}
     :param limit_size: int maximum number of records to return (0 = no limit)
     :param skip_index: int cursor offset (0 = no offset)
     :return: pymongo cursor over the matching records, each element a dict
     """
     result = []
     try:
         collection = self.database.get_collection(collection_name)
         # skip(0) and limit(0) are no-ops in pymongo, so the four original branches collapse into one chain
         result = collection.find(filter_dict, projection_dict).skip(skip_index).limit(limit_size)
     except Exception as e:
         Log.e('find data failed: %s' % e)
     finally:
         return result
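Since find() returns a lazy pymongo cursor, callers iterate it or wrap it in list(...). A hedged usage sketch (instance and field names are placeholders):

docs = db.find_many('campaigns',
                    filter_dict={'campaign.status': 1},
                    projection_dict={'campaignId': 1, '_id': 0},
                    limit_size=10,
                    skip_index=20)
for doc in docs:        # the cursor fetches batches from the server lazily
    print(doc['campaignId'])
# or materialize everything at once:
# docs = list(docs)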
Code example #3
 def gets_html(url,
               params=None,
               headers=None,
               cookies=None,
               proxies=None,
               charset='UTF-8'):
     '''
     Send an HTTPS GET request.
     :param url: str request URL
     :param params: dict query parameters
     :param headers: dict custom request headers
     :param cookies: dict site cookies
     :param proxies: dict proxies
     :param charset: str encoding used to decode the response body
     :return: str response body text, or None on failure
     '''
     html = None
     try:
         r = requests.get(url,
                          params=params,
                          headers=headers,
                          cookies=cookies,
                          proxies=proxies,
                          verify=False)  # certificate verification disabled on purpose
         r.encoding = charset
         html = r.text
     except Exception as e:
         Log.e("https get html failed -> " + str(e))
     return html
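Because the request is sent with verify=False, urllib3 emits an InsecureRequestWarning on every call; a minimal sketch of a call that silences it first (URL and headers are placeholders):

import urllib3

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

html = gets_html('https://example.com/search',
                 params={'q': 'scrapy'},
                 headers={'User-Agent': 'Mozilla/5.0'})
if html is not None:
    print(html[:200])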
Code example #4
 def posts_html(url,
                data=None,
                headers=None,
                cookies=None,
                proxies=None,
                charset='UTF-8'):
     '''
     Send an HTTPS POST request.
     :param url: str request URL
     :param data: dict POST payload
     :param headers: dict custom request headers
     :param cookies: dict site cookies
     :param proxies: dict proxies
     :param charset: str encoding used to decode the response body
     :return: str response body text, or None on failure
     '''
     html = None
     try:
         r = requests.post(url,
                           data=data,
                           headers=headers,
                           cookies=cookies,
                           proxies=proxies,
                           verify=False)  # certificate verification disabled on purpose
         r.encoding = charset
         html = r.text
     except Exception as e:
         Log.e("https post html failed -> " + str(e))
     return html
Code example #5
 def __init__(self, host, port, db_name, default_collection):
     Log.i('Init MongoDB')
     self.client = pymongo.MongoClient(
         host=host, port=port, connect=False
     )  # MongoClient is designed to be thread-safe and can be shared across threads
     self.db = self.client.get_database(db_name)
     self.collection = self.db.get_collection(default_collection)
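A minimal construction sketch, assuming the class is named MongoDB (host, port, and names are placeholders); connect=False defers the actual connection until first use, which avoids sharing sockets across fork() in multiprocess setups:

import pymongo

store = MongoDB('127.0.0.1', 27017, 'scrapy_x', 'tasks')
store.collection.insert_one({'state': 'ready'})
store.client.close()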
Code example #6
 def get_file(file_name,
              url,
              params=None,
              headers=None,
              cookies=None,
              proxies=None):
     '''
     Download a file over HTTP GET.
     :param file_name: str local path to write the file to
     :return: bool True on success, False on failure
     '''
     success = True
     try:
         r = requests.get(url,
                          params=params,
                          headers=headers,
                          cookies=cookies,
                          proxies=proxies)
         with open(file_name, 'wb') as fd:
             for chunk in r.iter_content(512):
                 fd.write(chunk)
     except Exception as e:
         Log.e("http get file failed -> " + str(e))
         success = False
     return success
Code example #7
 def update(self, collection_name, filter_dict, update_dict, insert=False, multi=False, auto_uptime=True):
     """
     Update records; returns False by default.
     :param collection_name: str collection name
     :param filter_dict: dict filter condition, e.g. {'campaignId': {'$in': [1, 2, 3]}}
     :param update_dict: dict update operators, e.g. {'$set': {'status_key': 0, 'campaign.status': 1}, '$unset': {'campaign.name': ''}}
     :param insert: bool upsert: insert the record if it does not exist
     :param multi: bool update all matching records if True, only the first one if False
     :param auto_uptime: bool automatically stamp uptime/uptimestamp fields via $set
     :return: bool whether the update succeeded
     """
     result = False
     try:
         if auto_uptime:
             timestamp = time.time()
             uptimestamp = int(round(timestamp * 1000))
             uptime = datetime.datetime.fromtimestamp(timestamp).strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]
             if '$set' in update_dict:
                 update_dict['$set']['uptime'] = uptime
                 update_dict['$set']['uptimestamp'] = uptimestamp
             else:
                 update_dict['$set'] = {'uptime': uptime, 'uptimestamp': uptimestamp}
         collection = self.database.get_collection(collection_name)
         collection.update(filter_dict, update_dict, insert, multi)
         result = True
         Log.d("update success!")
     except Exception as e:
         Log.e('update failed: %s' % e)
         traceback.print_exc()
     finally:
         return result
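For illustration, this is what the auto_uptime branch effectively does to the operator document before it reaches pymongo, assuming an instance named db (values shown are examples):

update_dict = {'$set': {'campaign.status': 1}}
# after the auto_uptime block:
# {'$set': {'campaign.status': 1,
#           'uptime': '2024-01-01 12:00:00.000',   # local time, trimmed to milliseconds
#           'uptimestamp': 1704110400000}}         # epoch milliseconds
ok = db.update('campaigns', {'campaignId': 123}, update_dict, insert=True)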
Code example #8
File: SchedulerX.py Project: yajie100/ScrapyX
 def run(self):
     '''
     Loop: read one task per iteration and hand it to a Downloader,
     sleeping a random interval (from config) between iterations.
     :return:
     '''
     # Create a process pool
     pool = Pool()
     while True:
         # Fetch one ready task and mark it as doing
         task = self.taskUtil.get_ready()
         if task is not None and len(task) > 0:
             Log.i('-----------------------------')
             # Run the Downloader in the process pool
             pool.apply_async(self.run_downloader, args=(task, ))
         # Sleep n seconds (range read from the config file)
         items = ConfigUtil.getItems('scheduler')
         interval_min = items['interval_min']
         interval_max = items['interval_max']
         seconds = random.randint(int(interval_min), int(interval_max))
         Log.i('Start sleep ' + str(seconds) + ' seconds')
         time.sleep(seconds)
     # Unreachable while the loop above runs forever; kept for a future exit condition
     pool.close()
     pool.join()
     Log.i('All subprocesses done.')
Code example #9
File: PipelineX.py Project: slj886/ScrapyX
 def run(self):
     '''
     Dispatch.
     :return: nothing
     '''
     Log.i('Pipeline.run()')
     if self.task['results'] is not None and len(self.task['results']) > 0:
         # Enqueue the follow-up tasks
         if self.task['next_tasks'] is not None:
             for next_task in self.task['next_tasks']:
                 self.taskUtil.insert_one(next_task)
         # Persist this round's parse results:
         # use reflection to call pipeline_<parser name>(), falling back to pipeline_default()
         if hasattr(self, 'pipeline_' + self.task['parser']):
             func = getattr(self, 'pipeline_' + self.task['parser'])
             func(self.task['table'])
         else:
             self.pipeline_default(self.task['table'])
         # Store the complete task in mongo and mark it as done
         self.task['state'] = 'done'
         self.taskUtil.replace_one(self.task['_id'], self.task)
     else:
         # No parse results means something failed upstream; leave the task for the next run
         pass
     Log.i('this task is finished')
Code example #10
 def insert(self, collection_name, insert_data, auto_uptime=True):
     """
     Insert records; returns False by default.
     :param collection_name: str collection name
     :param insert_data: dict or list of dicts to insert, e.g. {'campaignId': 123}
     :param auto_uptime: bool automatically stamp uptime/uptimestamp fields
     :return: bool whether the insert succeeded
     """
     result = False
     try:
         if auto_uptime:
             timestamp = time.time()
             uptimestamp = int(round(timestamp * 1000))
             uptime = datetime.datetime.fromtimestamp(timestamp).strftime(
                 '%Y-%m-%d %H:%M:%S.%f')[:-3]
             if isinstance(insert_data, dict):
                 insert_data['uptime'] = uptime
                 insert_data['uptimestamp'] = uptimestamp
             elif isinstance(insert_data, list):
                 for data in insert_data:
                     data['uptime'] = uptime
                     data['uptimestamp'] = uptimestamp
         collection = self.database.get_collection(collection_name)
         collection.insert(insert_data)
         result = True
         Log.d("insert success!")
     except Exception as e:
         Log.e('insert failed: %s' % e)
     finally:
         return result
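Usage sketch, assuming an instance named db; both forms get uptime/uptimestamp stamped before insertion:

db.insert('campaigns', {'campaignId': 123, 'name': 'camp_a'})
db.insert('campaigns', [
    {'campaignId': 124, 'name': 'camp_b'},
    {'campaignId': 125, 'name': 'camp_c'},
])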
Code example #11
 def replace(self,
             collection_name,
             filter_dict,
             replace_data,
             auto_uptime=True):
     """
     Replace a document; returns False by default.
     :param collection_name: str collection name
     :param filter_dict: dict filter condition, e.g. {'campaignId': {'$in': [1, 2, 3]}}
     :param replace_data: dict replacement document, e.g. {'campaignId': {'$in': [4, 5, 6]}}
     :param auto_uptime: bool automatically stamp uptime/uptimestamp fields
     :return: bool whether the replace succeeded
     """
     result = False
     try:
         if auto_uptime:
             timestamp = time.time()
             uptimestamp = int(round(timestamp * 1000))
             uptime = datetime.datetime.fromtimestamp(timestamp).strftime(
                 '%Y-%m-%d %H:%M:%S.%f')[:-3]
             replace_data['uptime'] = uptime
             replace_data['uptimestamp'] = uptimestamp
         collection = self.database.get_collection(collection_name)
         collection.replace_one(filter_dict, replace_data)
         result = True
         Log.d("replace success!")
     except Exception as e:
         Log.e('replace failed: %s' % e)
     finally:
         return result
Code example #12
    def __setconsumer(self):
        """
        Return a consumer for the parent-link topic.
        :return: KafkaConsumer, or None if construction failed
        """
        conf = kafka_setting
        # Build an SSL context that verifies the broker certificate against CACERT_FILENAME
        context = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
        context.verify_mode = ssl.CERT_REQUIRED
        # context.check_hostname = True
        context.load_verify_locations(CACERT_FILENAME)

        consumer = None
        try:
            consumer = KafkaConsumer(bootstrap_servers=conf['bootstrap_servers'],
                                     group_id=conf['consumer_id'],
                                     sasl_mechanism="PLAIN",
                                     ssl_context=context,
                                     security_protocol='SASL_SSL',
                                     api_version=(0, 10),
                                     sasl_plain_username=conf['sasl_plain_username'],
                                     sasl_plain_password=conf['sasl_plain_password'])
        except KafkaError as e:
            Log.e(str(e) + ' kafkaConsumer failed')

        return consumer
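A hedged sketch of driving the returned consumer; the topic key mirrors the other snippets, and message.value is raw bytes:

# inside the same class, after __setconsumer() has been called:
consumer.subscribe([kafka_setting['topic_name']])
for message in consumer:
    payload = str(message.value, 'utf-8')
    Log.i(payload)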
Code example #13
 def get_json(url,
              params=None,
              headers=None,
              cookies=None,
              proxies=None,
              charset='UTF-8'):
     '''
     Send an HTTP GET request and parse the response as JSON.
     :param url: str request URL
     :param params: dict query parameters
     :param headers: dict custom request headers
     :param cookies: dict site cookies
     :param proxies: dict proxies
     :return: parsed JSON object, or None on failure
     '''
     result = None
     try:
         r = requests.get(url,
                          params=params,
                          headers=headers,
                          cookies=cookies,
                          proxies=proxies)
         r.encoding = charset
         result = r.json()
     except Exception as e:
         Log.e("http get json failed -> " + str(e))
     return result
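Usage sketch (endpoint and keys are placeholders). r.json() raises on non-JSON bodies, which the except block converts into a None return:

data = get_json('https://api.example.com/v1/items',
                params={'page': 1},
                headers={'Accept': 'application/json'})
if data is not None:
    for item in data.get('items', []):
        print(item)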
Code example #14
def run():
    # sys.argv elements are never None; check the argument count instead
    if len(sys.argv) < 4:
        Log.i("no params error")
        os._exit(0)
    else:
        isCrontab = sys.argv[1]
        begin = sys.argv[2]
        end = sys.argv[3]

    # Whether this was started by crontab: 1 means yes, anything else means no
    if isCrontab == '1':
        crontab = 1
    else:
        crontab = 0

    p_list = list()
    producerProcess = ProducerUrl("producer", crontab, begin, end)
    p_list.append(producerProcess)

    # start = TimeUtil.getDefaultTimeIt()

    for p in p_list:
        p.daemon = True
        p.start()
    for p in p_list:
        p.join()

    # end = TimeUtil.getDefaultTimeIt()
    # Log.i('ProducerUrlParentPid run for %.2fm' %(end - start))
    if crontab == 1:
        os._exit(0)
Code example #15
 def close_conn(self):
     """
     Close the database connection.
     :return: nothing
     """
     if self.client:
         self.client.close()
         Log.d('closed mongo connection')
Code example #16
 def consumerurl(self):
     """
     Consume parent links.
     """
     conf = localKafka_setting
     # subscribe() expects an iterable of topic names, so wrap the single topic in a list
     self.consumer.subscribe([conf['topic_name']])
     # TODO When this blocks, the consumer connection has timed out and the underlying SDK
     # reconnects on its own; this listen loop runs forever by design
     for message in self.consumer:
         jsondata = str(message.value, "utf-8")
         Log.i(jsondata)
Code example #17
 def insert(self, item, collection_name=None):
     if collection_name is not None:
         collection = self.db.get_collection(collection_name)
     else:
         collection = self.collection
     try:
         # insert() returns a single ObjectId for a dict and a list of ids for a list
         ids = collection.insert(item)
         return len(ids) if isinstance(ids, list) else 1
     except Exception as e:
         Log.e("mongo insert failed -> " + str(e))
         return 0
Code example #18
    def __setconsumer(self):
        """
        Return a consumer for the parent-link topic.
        :return: KafkaConsumer, or None if construction failed
        """
        conf = localKafka_setting
        consumer = None
        try:
            consumer = KafkaConsumer(
                bootstrap_servers=conf['bootstrap_servers'],
                group_id=conf['consumer_id'])
        except KafkaError as e:
            Log.e(str(e) + ' kafkaConsumer failed')

        return consumer
Code example #19
def getIpProxyPoolFromRemote():
    """
    Fetch a free IP proxy directly from the remote pool.
    :return: a usable proxy, or None
    """
    if USE_PROXY is False:
        return None

    try:
        # Log.i('fetching proxy...')
        resp = requests.get(PROXY_REMOTE_URL, timeout=TIMEOUT)
        return resp.text
    except Exception as e:
        Log.e('failed to fetch proxy info, check whether the proxy system is running: %s' % e)
        return None
Code example #20
 def get_header(url, params=None, headers=None, cookies=None, proxies=None):
     '''
     Send an HTTP HEAD request.
     :param url: str request URL
     :return: dict response headers, or None on failure
     '''
     result = None
     try:
         # Pass the arguments through (the original ignored them) and use HEAD,
         # which returns the headers without downloading the body
         r = requests.head(url,
                           params=params,
                           headers=headers,
                           cookies=cookies,
                           proxies=proxies)
         result = r.headers
     except Exception as e:
         Log.e("http get header failed -> " + str(e))
     return result
Code example #21
    def _sendChildUrl(self, URL_inf, mutex_lock):
        # Save the data and extract child links to feed back into the corresponding topic
        KafkaOperator = kafkaUrlinformation()
        # TODO Manage per-site logic with dedicated classes here
        parseCCGPModule = ParserCCGPModule(URL_inf, KafkaOperator)
        ccgpChildrenLink = parseCCGPModule.getLinks()

        if ccgpChildrenLink is None:
            mutex_lock.release()  # end of the critical section
            return

        for link in ccgpChildrenLink:
            # Yu Hao asked us not to send him parent links
            if link.DeepNum >= 0:
                Log.i("produce<<" + json.dumps(link.class2dict()))
                KafkaOperator.producerUrl(json.dumps(link.class2dict()))
Code example #22
 def find_one(self, collection_name, filter_dict=None, projection_dict=None):
     """
     Find a single record; returns an empty dict by default.
     :param collection_name: str collection name
     :param filter_dict: dict filter condition, e.g. {'campaignId': 123}
     :param projection_dict: dict fields to return, e.g. {'campaign.status': 1, 'updated': 1, '_id': 0}
     :return: dict the matching record (None when nothing matches)
     """
     result = {}
     try:
         collection = self.database.get_collection(collection_name)
         result = collection.find_one(filter_dict, projection_dict)
     except Exception as e:
         Log.e('find data failed: %s' % e)
     finally:
         return result
Code example #23
 def count(self, collection_name, filter_dict=None):
     """
     Count records; returns 0 by default.
     :param collection_name: str collection name
     :param filter_dict: dict filter condition
     :return: int number of matching records
     """
     tab_size = 0
     try:
         collection = self.database.get_collection(collection_name)
         tab_size = collection.find(filter_dict).count()
     except Exception as e:
         Log.e('get table size failed: %s' % e)
     finally:
         return tab_size
Code example #24
 def producerUUID(self, strurl):
     """
     Produce a uuid to the ccgp topic.
     :param strurl: str message payload
     """
     try:
         conf = kafka_setting
         # TODO This can raise kafka.errors.KafkaTimeoutError: Failed to update metadata after 60.0 secs.
         future = self.producer.send(conf['topic_name_ccgp'], bytes(strurl, 'ASCII'))
         self.producer.flush()
         future.get()
     except KafkaError as e:
         # close() does not set the producer to None, so rebuild it unconditionally
         self.producer.close()
         self.producer = self.__setproducer()
         Log.e(str(e) + ' send message failed')
Code example #25
 def delete(self, collection_name, filter_dict):
     """
     Delete records; returns False by default.
     :param collection_name: str collection name
     :param filter_dict: dict filter condition, e.g. {'campaignId': {'$in': [1, 2, 3]}}
     :return: bool whether the delete succeeded
     """
     result = False
     try:
         collection = self.database.get_collection(collection_name)
         collection.remove(filter_dict)
         result = True
         Log.d("remove success!")
     except Exception as e:
         Log.e('remove failed: %s' % e)
     finally:
         return result
Code example #26
    def run(self):

        try:
            pass  # the actual scan work goes here
        except Exception as e:
            Log.i("AsyncThreadScanner run exception<<" + str(e))

        # Remove this thread from the thread list
        AsyncThreadScanner.lck.acquire()
        AsyncThreadScanner.tList.remove(self)

        # If removing this finished thread brings the count to exactly maxThreads - 1,
        # a thread is waiting to run, so pulse the event to let it proceed
        if len(AsyncThreadScanner.tList) == AsyncThreadScanner.maxThreads - 1:
            AsyncThreadScanner.event.set()
            AsyncThreadScanner.event.clear()

        AsyncThreadScanner.lck.release()
Code example #27
    def run(self):
        '''
        Entry point for the free-IP-proxy process; loops over tasks.
        :return:
        '''
        Log.i('Downloader.run() in {0}'.format(time.ctime()))

        p_list = list()

        downloaderRun = Process(target=self.run_downloader, args=(self.pipeDictData,))
        p_list.append(downloaderRun)

        for p in p_list:
            p.daemon = True
            p.start()
        for p in p_list:
            p.join()
Code example #28
File: DownloaderX.py Project: yajie100/ScrapyX
 def run(self):
     '''
     Default thread entry point; dispatches the task.
     :return:
     '''
     Log.i('Downloader.run()')
     # Use reflection to call download_<parser name>(), falling back to download_default()
     if hasattr(self, 'download_' + self.task['parser']):
         func = getattr(self, 'download_' + self.task['parser'])
         func()
     else:
         self.download_default()
     # Start the parser
     parserModule = Setting.PARSER_MODULE
     ParserX = importlib.import_module(parserModule)
     parser = ParserX.Parser(self.task)
     parser.run()
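Downloader, Parser, and Pipeline all share this dispatch convention: look up a method named after the task's parser with getattr() and fall back to a default. A self-contained sketch of the pattern (class and handler names here are illustrative, not from the project):

class Dispatcher:
    def __init__(self, task):
        self.task = task

    def download_default(self):
        print('no specific handler, using default')

    def download_news(self):
        print('handler for the "news" parser')

    def run(self):
        # getattr's third argument supplies the fallback directly
        func = getattr(self, 'download_' + self.task['parser'], self.download_default)
        func()

Dispatcher({'parser': 'news'}).run()  # -> handler for the "news" parser
Dispatcher({'parser': 'blog'}).run()  # -> no specific handler, using default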
Code example #29
 def run(self):
     '''
     Dispatch.
     :return:
     '''
     Log.i('Parser.run()')
     # Use reflection to call parse_<parser name>(), falling back to parse_default()
     if hasattr(self, 'parse_' + self.task['parser']):
         func = getattr(self, 'parse_' + self.task['parser'])
         func()
     else:
         self.parse_default()
     # Start the Pipeline
     pipelineModule = Setting.PIPELINE_MODULE
     PipelineX = importlib.import_module(pipelineModule)
     pipeline = PipelineX.Pipeline(self.task)
     pipeline.run()
Code example #30
 def producerUrl(self, strurl):
     """
     Produce a parent link.
     :param strurl: str message payload
     """
     try:
         conf = kafka_setting
         future = self.producer.send(conf['topic_name'], bytes(strurl, 'ASCII'))
         self.producer.flush()
         future.get()
     except KafkaError as e:
         # TODO Handle kafka.errors.KafkaTimeoutError: Failed to update metadata after 60.0 secs
         # https://stackoverflow.com/questions/48261501/kafka-errors-kafkatimeouterror-kafkatimeouterror-failed-to-update-metadata-aft
         # close() does not set the producer to None, so rebuild it unconditionally
         self.producer.close()
         self.producer = self.__setproducer()
         Log.e(str(e) + ' send message failed')
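For reference, a hedged sketch of the kafka-python producer these send calls assume (broker address and topic are placeholders):

from kafka import KafkaProducer

producer = KafkaProducer(bootstrap_servers='broker1:9092')
future = producer.send('topic_name', b'{"url": "http://example.com"}')
producer.flush()                    # block until buffered records are sent
metadata = future.get(timeout=10)   # raises a KafkaError subclass on failure
print(metadata.topic, metadata.partition, metadata.offset)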