def __downlowdFile(self, url, req):
    # http://stackoverflow.com/questions/862173/how-to-download-a-file-using-python-in-a-smarter-way
    # requires: import urllib.request; from os.path import basename; from urllib.parse import urlsplit
    """
    File-download logic, carried over from Wang Yang's implementation; not yet debugged.
    :param url: download URL
    :param req: streamed requests response object
    """
    reqheaders = req.headers
    revealfn = url.split('/')[-1]
    if "." in revealfn[-6:]:
        fileName = revealfn
    elif 'Content-Disposition' in reqheaders.keys():
        fileName = reqheaders['Content-Disposition'].split('filename=')[1]
        fileName = fileName.replace('"', '').replace("'", "")
    else:
        r = urllib.request.urlopen(url)
        if r.url != url:
            fileName = basename(urlsplit(r.url)[2])
        else:
            # fall back to the last URL segment so fileName is always bound
            fileName = revealfn
    self.URL_inf.FileName = fileName
    if self.URL_inf.FilePath:
        _FileName = str(self.URL_inf.FilePath) + fileName
    else:
        _FileName = fileName
    with open(_FileName, "wb") as donefile:
        for chunk in req.iter_content(chunk_size=1024):
            if chunk:
                donefile.write(chunk)
    Log.i("File: " + _FileName + " downloaded")
def find_many(self, collection_name, filter_dict=None, projection_dict=None, limit_size=0, skip_index=0):
    """
    Find multiple records; returns an empty list by default.
    :param collection_name: str collection name
    :param filter_dict: dict filter, e.g. {'campaignId': 123}
    :param projection_dict: dict fields to return, e.g. {'campaign.status': 1, 'updated': 1, '_id': 0}
    :param limit_size: int maximum number of records to return
    :param skip_index: int cursor offset
    :return: list of matching records, each element a dict
    """
    result = []
    try:
        collection = self.database.get_collection(collection_name)
        # skip(0) and limit(0) are no-ops in pymongo, so one chained call covers all
        # four limit/skip combinations; list() materializes the cursor as documented
        result = list(collection.find(filter_dict, projection_dict).skip(skip_index).limit(limit_size))
    except Exception as e:
        Log.e('find data failed: %s' % e)
    finally:
        return result
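# A minimal usage sketch for find_many, assuming an instance of this MongoDB wrapper
# named `db` (the instance, collection and field names below are hypothetical):
#
#     rows = db.find_many('campaigns',
#                         filter_dict={'campaign.status': 1},
#                         projection_dict={'campaignId': 1, '_id': 0},
#                         limit_size=10, skip_index=20)
#     for row in rows:
#         print(row['campaignId'])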
def gets_html(url, params=None, headers=None, cookies=None, proxies=None, charset='UTF-8'):
    '''
    Send an HTTPS GET request.
    :param url: str request URL
    :param params: dict query parameters
    :param headers: dict custom request headers
    :param cookies: dict site cookies
    :param proxies: dict proxies
    :param charset: str response encoding, defaults to UTF-8
    :return: str response body, or None on failure
    '''
    html = None
    try:
        r = requests.get(url, params=params, headers=headers, cookies=cookies, proxies=proxies, verify=False)
        r.encoding = charset
        html = r.text
    except Exception as e:
        Log.e("https get html failed -> " + str(e))
    return html
def posts_html(url, data=None, headers=None, cookies=None, proxies=None, charset='UTF-8'):
    '''
    Send an HTTPS POST request.
    :param url: str request URL
    :param data: dict POST payload
    :param headers: dict custom request headers
    :param cookies: dict site cookies
    :param proxies: dict proxies
    :param charset: str response encoding, defaults to UTF-8
    :return: str response body, or None on failure
    '''
    html = None
    try:
        r = requests.post(url, data=data, headers=headers, cookies=cookies, proxies=proxies, verify=False)
        r.encoding = charset
        html = r.text
    except Exception as e:
        Log.e("https post html failed -> " + str(e))
    return html
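# A usage sketch for the two helpers above; the URL, proxy and header values are
# placeholders, but the dict shapes are what requests expects:
#
#     proxies = {'http': 'http://127.0.0.1:8888', 'https': 'http://127.0.0.1:8888'}
#     headers = {'User-Agent': 'Mozilla/5.0'}
#     page = gets_html('https://example.com/list', params={'page': 1},
#                      headers=headers, proxies=proxies)
#     result = posts_html('https://example.com/login', data={'user': 'u', 'pwd': 'p'},
#                         headers=headers)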
def __init__(self, host, port, db_name, default_collection):
    Log.i('Init MongoDB')
    # MongoClient is designed to be thread-safe and can be shared across threads;
    # connect=False defers the connection until the first operation
    self.client = pymongo.MongoClient(host=host, port=port, connect=False)
    self.db = self.client.get_database(db_name)
    self.collection = self.db.get_collection(default_collection)
def get_file(file_name, url, params=None, headers=None, cookies=None, proxies=None):
    '''
    Fetch a file over HTTP GET and write it to file_name.
    :return: bool whether the download succeeded
    '''
    success = True
    try:
        r = requests.get(url, params=params, headers=headers, cookies=cookies, proxies=proxies)
        with open(file_name, 'wb') as fd:
            for chunk in r.iter_content(512):
                fd.write(chunk)
    except Exception as e:
        Log.e("http get file failed -> " + str(e))
        success = False
    return success
def update(self, collection_name, filter_dict, update_dict, insert=False, multi=False, auto_uptime=True):
    """
    Update records; returns False by default.
    :param collection_name: str collection name
    :param filter_dict: dict filter, e.g. {'campaignId': {'$in': [1, 2, 3]}}
    :param update_dict: dict update operators, e.g. {'$set': {'status_key': 0, 'campaign.status': 1}} or {'$unset': {'campaign.name': ''}}
    :param insert: bool upsert: insert the record if it does not exist
    :param multi: bool update all matching records if True, only the first if False
    :param auto_uptime: bool automatically stamp uptime/uptimestamp into $set
    :return: bool whether the update succeeded
    """
    result = False
    try:
        if auto_uptime:
            timestamp = time.time()
            uptimestamp = int(round(timestamp * 1000))
            uptime = datetime.datetime.fromtimestamp(timestamp).strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]
            if '$set' in update_dict:
                update_dict['$set']['uptime'] = uptime
                update_dict['$set']['uptimestamp'] = uptimestamp
            else:
                update_dict['$set'] = {'uptime': uptime, 'uptimestamp': uptimestamp}
        collection = self.database.get_collection(collection_name)
        # the legacy Collection.update() takes upsert/multi as keyword arguments, so the
        # original positional call silently hit other parameters; update_one/update_many
        # is the unambiguous API
        if multi:
            collection.update_many(filter_dict, update_dict, upsert=insert)
        else:
            collection.update_one(filter_dict, update_dict, upsert=insert)
        result = True
        Log.d("update success!")
    except Exception as e:
        Log.e('update failed: %s' % e)
        traceback.print_exc()
    finally:
        return result
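# A usage sketch for update, assuming the same wrapper instance `db` as above
# (collection and field names are hypothetical):
#
#     ok = db.update('campaigns',
#                    filter_dict={'campaignId': {'$in': [1, 2, 3]}},
#                    update_dict={'$set': {'campaign.status': 1}},
#                    insert=False, multi=True)
#     # with auto_uptime=True, 'uptime' and 'uptimestamp' are merged into the $set document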
def run(self):
    '''
    Poll tasks in a loop and hand them to Downloader.
    :return:
    '''
    # create the process pool
    pool = Pool()
    while True:
        # fetch one ready task and mark it as doing
        task = self.taskUtil.get_ready()
        if task is not None and len(task) > 0:
            Log.i('-----------------------------')
            # run the Downloader in the process pool
            pool.apply_async(self.run_downloader, args=(task,))
        # sleep for a random number of seconds read from the config file
        items = ConfigUtil.getItems('scheduler')
        interval_min = items['interval_min']
        interval_max = items['interval_max']
        seconds = random.randint(int(interval_min), int(interval_max))
        Log.i('Start sleep ' + str(seconds) + ' seconds')
        time.sleep(seconds)
    # unreachable while the loop above runs forever; kept for a future exit condition
    pool.close()
    pool.join()
    Log.i('All subprocesses done.')
def run(self):
    '''
    Dispatch parsed results.
    :return: None
    '''
    Log.i('Pipeline.run()')
    if self.task['results'] is not None and len(self.task['results']) > 0:
        # enqueue the follow-up tasks
        if self.task['next_tasks'] is not None:
            for next_task in self.task['next_tasks']:
                self.taskUtil.insert_one(next_task)
        # persist this round's parse results
        # Reflection: run pipeline_<parser name>() if it exists, else pipeline_default()
        if hasattr(self, 'pipeline_' + self.task['parser']):
            func = getattr(self, 'pipeline_' + self.task['parser'])
            func(self.task['table'])
        else:
            self.pipeline_default(self.task['table'])
        # store the completed task back into mongo and mark it done
        self.task['state'] = 'done'
        self.taskUtil.replace_one(self.task['_id'], self.task)
    else:
        # no results were parsed: something failed mid-way, leave the task for the next run
        pass
    Log.i('this task is finished')
def insert(self, collection_name, insert_data, auto_uptime=True):
    """
    Insert records; returns False by default.
    :param collection_name: str collection name
    :param insert_data: dict or list of dicts to insert
    :param auto_uptime: bool automatically stamp uptime/uptimestamp on each record
    :return: bool whether the insert succeeded
    """
    result = False
    try:
        if auto_uptime:
            timestamp = time.time()
            uptimestamp = int(round(timestamp * 1000))
            uptime = datetime.datetime.fromtimestamp(timestamp).strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]
            if isinstance(insert_data, dict):
                insert_data['uptime'] = uptime
                insert_data['uptimestamp'] = uptimestamp
            elif isinstance(insert_data, list):
                for data in insert_data:
                    data['uptime'] = uptime
                    data['uptimestamp'] = uptimestamp
        collection = self.database.get_collection(collection_name)
        # insert_one/insert_many replace the deprecated Collection.insert()
        if isinstance(insert_data, list):
            collection.insert_many(insert_data)
        else:
            collection.insert_one(insert_data)
        result = True
        Log.d("insert success!")
    except Exception as e:
        Log.e('insert failed: %s' % e)
    finally:
        return result
def replace(self, collection_name, filter_dict, replace_data, auto_uptime=True):
    """
    Replace a document; returns False by default.
    :param collection_name: str collection name
    :param filter_dict: dict filter, e.g. {'campaignId': {'$in': [1, 2, 3]}}
    :param replace_data: dict replacement document, e.g. {'campaignId': {'$in': [4, 5, 6]}}
    :param auto_uptime: bool automatically stamp uptime/uptimestamp on the new document
    :return: bool whether the replace succeeded
    """
    result = False
    try:
        if auto_uptime:
            timestamp = time.time()
            uptimestamp = int(round(timestamp * 1000))
            uptime = datetime.datetime.fromtimestamp(timestamp).strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]
            replace_data['uptime'] = uptime
            replace_data['uptimestamp'] = uptimestamp
        collection = self.database.get_collection(collection_name)
        collection.replace_one(filter_dict, replace_data)
        result = True
        Log.d("replace success!")
    except Exception as e:
        Log.e('replace failed: %s' % e)
    finally:
        return result
def __setconsumer(self):
    """
    Return the consumer that reads the parent-link topic.
    :return: KafkaConsumer, or None if construction failed
    """
    conf = kafka_setting
    context = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
    context.verify_mode = ssl.CERT_REQUIRED
    # context.check_hostname = True
    context.load_verify_locations(CACERT_FILENAME)
    consumer = None
    try:
        consumer = KafkaConsumer(bootstrap_servers=conf['bootstrap_servers'],
                                 group_id=conf['consumer_id'],
                                 sasl_mechanism="PLAIN",
                                 ssl_context=context,
                                 security_protocol='SASL_SSL',
                                 api_version=(0, 10),
                                 sasl_plain_username=conf['sasl_plain_username'],
                                 sasl_plain_password=conf['sasl_plain_password'])
    except KafkaError as e:
        Log.e('kafkaConsumer failed: %s' % e)
    return consumer
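# The kafka_setting dict read above (and by the producer methods below) is expected to
# carry at least these keys; a hedged sketch of its shape, all values placeholders:
#
#     kafka_setting = {
#         'bootstrap_servers': ['broker1:9093', 'broker2:9093'],
#         'consumer_id': 'spider-consumer-group',
#         'sasl_plain_username': 'user',
#         'sasl_plain_password': 'secret',
#         'topic_name': 'parent_links',
#         'topic_name_ccgp': 'ccgp_uuid',
#     }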
def get_json(url, params=None, headers=None, cookies=None, proxies=None, charset='UTF-8'):
    '''
    Send an HTTP GET request and decode the response as JSON.
    :param url: str request URL
    :param params: dict query parameters
    :param headers: dict custom request headers
    :param cookies: dict site cookies
    :param proxies: dict proxies
    :param charset: str response encoding, defaults to UTF-8
    :return: decoded JSON object, or None on failure
    '''
    html = None
    try:
        r = requests.get(url, params=params, headers=headers, cookies=cookies, proxies=proxies)
        r.encoding = charset
        html = r.json()
    except Exception as e:
        Log.e("http get json failed -> " + str(e))
    return html
def run():
    # sys.argv entries are never None; check the argument count instead
    if len(sys.argv) < 4:
        Log.i("no params error")
        os._exit(0)
    else:
        isCrontab = sys.argv[1]
        begin = sys.argv[2]
        end = sys.argv[3]
    # whether the script was started by crontab: 1 means yes, anything else means no
    if isCrontab == str(1):
        crontab = 1
    else:
        crontab = 0
    p_list = list()
    producerProcess = ProducerUrl("producer", crontab, begin, end)
    p_list.append(producerProcess)
    # start = TimeUtil.getDefaultTimeIt()
    for p in p_list:
        p.daemon = True
        p.start()
    for p in p_list:
        p.join()
    # end = TimeUtil.getDefaultTimeIt()
    # Log.i('ProducerUrlParentPid run for %.2fm' % (end - start))
    if crontab == 1:
        os._exit(0)
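# Invocation sketch: the three positional arguments are <isCrontab> <begin> <end>;
# the script name and argument values below are hypothetical:
#
#     python producer_url.py 1 20180101 20180131   # crontab start: force-exits when done
#     python producer_url.py 0 20180101 20180131   # manual start: skips the forced exit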
def close_conn(self):
    """
    Close the database connection.
    :return: None
    """
    if self.client:
        self.client.close()
        Log.d('closed mongo connection')
def consumerurl(self):
    """
    Consume parent links.
    """
    conf = localKafka_setting
    # subscribe expects a list of topics; a bare string would be iterated per character
    self.consumer.subscribe([conf['topic_name']])
    # TODO: blocking here means the consumer connection timed out; the underlying SDK
    # reconnects on its own, and this loop listens for messages forever
    for message in self.consumer:
        jsondata = str(message.value, "utf-8")
        Log.i(jsondata)
def insert(self, item, collection_name=None):
    collection = self.collection if collection_name is None else self.db.get_collection(collection_name)
    try:
        # insert_many/insert_one replace the deprecated Collection.insert(), whose single-document
        # return value (an ObjectId) has no len(); return the number of documents written
        if isinstance(item, list):
            return len(collection.insert_many(item).inserted_ids)
        collection.insert_one(item)
        return 1
    except Exception as e:
        Log.e("mongo insert failed -> " + str(e))
        return 0
def __setconsumer(self):
    """
    Return the consumer that reads the parent-link topic.
    :return: KafkaConsumer, or None if construction failed
    """
    conf = localKafka_setting
    consumer = None
    try:
        consumer = KafkaConsumer(
            bootstrap_servers=conf['bootstrap_servers'],
            group_id=conf['consumer_id'])
    except KafkaError as e:
        Log.e('kafkaConsumer failed: %s' % e)
    return consumer
def getIpProxyPoolFromeRemote():
    """
    Fetch a free IP proxy directly from the remote pool.
    :return: a usable proxy, or None
    """
    if USE_PROXY is False:
        return None
    try:
        # Log.i('fetching proxy...')
        resp = requests.get(PROXY_REMOTE_URL, timeout=TIMEOUT)
        return resp.text
    except Exception as e:
        Log.e('failed to fetch proxy info, check that the proxy service is running: %s' % e)
        return None
def get_header(url, params=None, headers=None, cookies=None, proxies=None):
    '''
    Send an HTTP HEAD request.
    :param url: str request URL
    :return: dict response headers, or None on failure
    '''
    header = None
    try:
        # use HEAD so only the headers are fetched, and pass the caller's options through
        r = requests.head(url, params=params, headers=headers, cookies=cookies, proxies=proxies)
        header = r.headers
    except Exception as e:
        Log.e("http get header failed -> " + str(e))
    return header
def _sendChildUrl(self, URL_inf, mutex_lock):
    # save the data, extract child links, and feed them back into the matching topic
    KafkaOperator = kafkaUrlinformation()
    # TODO: manage per-site logic with dedicated classes
    parseCCGPModule = ParserCCGPModule(URL_inf, KafkaOperator)
    ccgpChildrenLink = parseCCGPModule.getLinks()
    if ccgpChildrenLink is None:
        mutex_lock.release()  # end of the critical section
        return
    for link in ccgpChildrenLink:
        # Yu Hao asked that parent links not be forwarded, so only send deeper links
        if link.DeepNum >= 0:
            Log.i("produce<<" + json.dumps(link.class2dict()))
            KafkaOperator.producerUrl(json.dumps(link.class2dict()))
def find_one(self, collection_name, filter_dict=None, projection_dict=None):
    """
    Find a single record; returns an empty dict by default.
    :param collection_name: str collection name
    :param filter_dict: dict filter, e.g. {'campaignId': 123}
    :param projection_dict: dict fields to return, e.g. {'campaign.status': 1, 'updated': 1, '_id': 0}
    :return: dict the matching record
    """
    result = {}
    try:
        collection = self.database.get_collection(collection_name)
        result = collection.find_one(filter_dict, projection_dict)
    except Exception as e:
        Log.e('find data failed: %s' % e)
    finally:
        return result
def count(self, collection_name, filter_dict=None):
    """
    Count matching records; returns 0 by default.
    :param collection_name: str collection name
    :param filter_dict: dict filter
    :return: int number of matching records
    """
    tab_size = 0
    try:
        collection = self.database.get_collection(collection_name)
        # count_documents replaces the deprecated cursor.count() and requires a dict filter
        tab_size = collection.count_documents(filter_dict or {})
    except Exception as e:
        Log.e('get table size failed: %s' % e)
    finally:
        return tab_size
def producterUUID(self, strurl):
    """
    Produce a uuid to the ccgp topic.
    :param strurl:
    """
    try:
        conf = kafka_setting
        # TODO: kafka.errors.KafkaTimeoutError ("Failed to update metadata after 60.0 secs")
        # can be raised here
        future = self.producer.send(conf['topic_name_ccgp'], bytes(strurl, 'ASCII'))
        self.producer.flush()
        future.get()
    except KafkaError as e:
        Log.e('send message failed: %s' % e)
        # rebuild the producer after a failure (the old check against None never fired,
        # since close() does not set the attribute to None)
        self.producer.close()
        self.producer = self.__setproducer()
def delete(self, collection_name, filter_dict):
    """
    Delete records; returns False by default.
    :param collection_name: str collection name
    :param filter_dict: dict filter, e.g. {'campaignId': {'$in': [1, 2, 3]}}
    :return: bool whether the delete succeeded
    """
    result = False
    try:
        collection = self.database.get_collection(collection_name)
        # delete_many replaces the deprecated Collection.remove()
        collection.delete_many(filter_dict)
        result = True
        Log.d("remove success!")
    except Exception as e:
        Log.e('remove failed: %s' % e)
    finally:
        return result
def run(self):
    try:
        pass
    except Exception as e:
        Log.i("AsyncThreadScanner run exception<<" + str(e))
    # remove this thread from the thread list
    AsyncThreadScanner.lck.acquire()
    AsyncThreadScanner.tList.remove(self)
    # if removing this finished thread drops the count to exactly maxThreads - 1,
    # a thread is waiting to run, so pulse the event to release it
    if len(AsyncThreadScanner.tList) == AsyncThreadScanner.maxThreads - 1:
        AsyncThreadScanner.event.set()
        AsyncThreadScanner.event.clear()
    AsyncThreadScanner.lck.release()
def run(self):
    '''
    Entry point for the free-IP-proxy process; polls tasks in a loop.
    :return:
    '''
    Log.i('Downloader.run() in {0}'.format(time.ctime()))
    p_list = list()
    downloaderRun = Process(target=self.run_downloader, args=(self.pipeDictData,))
    p_list.append(downloaderRun)
    for p in p_list:
        p.daemon = True
        p.start()
    for p in p_list:
        p.join()
def run(self):
    '''
    Default thread entry point; dispatches the task.
    :return:
    '''
    Log.i('Downloader.run()')
    # Reflection: run download_<parser name>() if it exists, else download_default()
    if hasattr(self, 'download_' + self.task['parser']):
        func = getattr(self, 'download_' + self.task['parser'])
        func()
    else:
        self.download_default()
    # start the parser
    parserModule = Setting.PARSER_MODULE
    ParserX = importlib.import_module(parserModule)
    parser = ParserX.Parser(self.task)
    parser.run()
def run(self):
    '''
    Dispatch parsing.
    :return:
    '''
    Log.i('Parser.run()')
    # Reflection: run parse_<parser name>() if it exists, else parse_default()
    if hasattr(self, 'parse_' + self.task['parser']):
        func = getattr(self, 'parse_' + self.task['parser'])
        func()
    else:
        self.parse_default()
    # start the Pipeline
    pipelineModule = Setting.PIPELINE_MODULE
    PipelineX = importlib.import_module(pipelineModule)
    pipeline = PipelineX.Pipeline(self.task)
    pipeline.run()
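# Downloader, Parser and Pipeline all dispatch through the same reflection idiom;
# a minimal self-contained sketch (the class and parser names are hypothetical):
#
#     class Demo:
#         def parse_default(self):
#             print('default parser')
#         def parse_ccgp(self):
#             print('ccgp parser')
#         def run(self, parser_name):
#             # getattr with a default collapses the hasattr/getattr pair into one call
#             func = getattr(self, 'parse_' + parser_name, self.parse_default)
#             func()
#
#     Demo().run('ccgp')     # -> ccgp parser
#     Demo().run('unknown')  # -> default parser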
def producerUrl(self, strurl):
    """
    Produce a parent link.
    :param strurl:
    """
    try:
        conf = kafka_setting
        future = self.producer.send(conf['topic_name'], bytes(strurl, 'ASCII'))
        self.producer.flush()
        future.get()
    except KafkaError as e:
        # TODO: handle kafka.errors.KafkaTimeoutError ("Failed to update metadata after 60.0 secs")
        # https://stackoverflow.com/questions/48261501/kafka-errors-kafkatimeouterror-kafkatimeouterror-failed-to-update-metadata-aft
        Log.e('send message failed: %s' % e)
        # rebuild the producer after a failure (the old check against None never fired,
        # since close() does not set the attribute to None)
        self.producer.close()
        self.producer = self.__setproducer()
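# The KafkaTimeoutError mentioned in the TODO usually means the client cannot fetch
# metadata (wrong bootstrap_servers, unreachable broker, or a missing topic with
# auto-creation disabled). A hedged retry sketch around the send, reusing the names above:
#
#     for attempt in range(3):
#         try:
#             self.producer.send(conf['topic_name'], payload).get(timeout=30)
#             break
#         except KafkaError:
#             self.producer.close()
#             self.producer = self.__setproducer()  # rebuild the client, then retry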