def create_book(self, command, counter):
    Path.reset_path()
    Debug.logger.info(u"Start creating e-book no. {}".format(counter))
    Debug.logger.info(u"Analyzing record {}".format(command))
    task_list = CommandParser.get_task_list(command)  # parse the command
    if len(task_list) == 0:
        return
    for task in task_list:
        if not Config.debug_for_create_book:
            Worker.distribute(task)
    Debug.logger.info(u"Page crawling finished")
    task_result_list = []
    for task in task_list:
        task_result = TaskResult(task)
        task_result.extract_data()
        task_result_list.append(task_result)
    Debug.logger.info(u"Database records loaded")
    # download images
    for task_result in task_result_list:
        task_result.download_img()
    Debug.logger.info(u"Images for all tasks downloaded")
    # auto-split into volumes by size
    # render html && pack into an e-book
    book = Book(task_result_list)
    book_list = book.auto_split(Config.max_book_size_mb * 1024)
    for chapter in book_list:
        chapter.create_book()
    return
def test_initialization():
    config = Config.from_environ()

    config['Blockchain']['GasPrice'] = 'fast'
    worker = Worker(config)
    worker.web3.eth.setGasPriceStrategy.assert_called_once_with(
        fast_gas_price_strategy)
    assert worker.dynamic_gas_price_strategy
    assert worker.gas_price == worker.web3.eth.generateGasPrice()

    config['Blockchain']['GasPrice'] = 'medium'
    worker = Worker(config)
    worker.web3.eth.setGasPriceStrategy.assert_called_once_with(
        medium_gas_price_strategy)
    assert worker.dynamic_gas_price_strategy
    assert worker.gas_price == worker.web3.eth.generateGasPrice()

    config['Blockchain']['GasPrice'] = 6000000
    worker = Worker(config)
    assert not worker.dynamic_gas_price_strategy
    assert worker.gas_price == config['Blockchain']['GasPrice']
    worker.web3.eth.setGasPriceStrategy.assert_not_called()

    config['Blockchain']['GasPrice'] = 'slow'
    with pytest.raises(Exception) as einfo:
        worker = Worker(config)
    assert str(einfo.value) == "Invalid gas price strategy:'slow'"

    config['Blockchain']['GasPrice'] = None
    with pytest.raises(Exception) as einfo:
        worker = Worker(config)
    assert str(einfo.value) == 'Invalid gas price strategy:None'
def catch(account_id):
    u"""
    :param target_url: https://xueqiu.com/4065977305
    :return:
    """
    # this is where the key work happens
    mock_sleep_time = 0.5
    article_url_index_list = []
    # fetch the maximum page number
    # url = 'http://chuansong.me/account/{}'.format(account_id)
    # front_page_content = Http.get_content(url)
    # max_page = XueQiuWorker.parse_max_page(front_page_content)
    column_info = {}
    column_info[u'column_id'] = account_id
    column_info[u'title'] = ""
    column_info['article_count'] = 0
    column_info['follower_count'] = 0
    column_info['description'] = ''
    column_info['image_url'] = ''
    from src.worker import Worker
    Worker.save_record_list(u'Column', [column_info])
    strtT = '1558513651020'
    # https://api.wallstreetcn.com/apiv1/content/themes/stream/1005680?type=newest&cursor=1558066610478&limit=20
    max_page = 2
    index_work_set = OrderedDict()
    # collect the article URLs on every page
    for raw_front_page_index in range(1, max_page):
        resuorcecatch(account_id, strtT)
def test23hostPermissions(self):
    worker = Worker()
    self.source += 'test23hostPermissions'
    self.destination = self.source + '_delete'
    worker.work(self.source)
    expected = 3
    actual = worker.wrapper.getManifestVersion()
    self.assertEqual(actual, expected, 'manifest_version')
    manifest = worker.wrapper.manifest
    key = 'permissions'
    self.assertIn(key, manifest)
    self.assertEqual(len(manifest[key]), 2)
    key = 'optional_permissions'
    self.assertIn(key, manifest)
    self.assertEqual(len(manifest[key]), 1)
    key = 'host_permissions'
    self.assertIn(key, manifest)
    self.assertEqual(len(manifest[key]), 2)
    shutil.rmtree(self.destination)
def distribute(self, files: List[str]) -> None:
    with Manager() as manager:
        worker = Worker(self.sender_factory, len(files), self.queue,
                        manager.Value('i', 0), manager.Value('i', 0))
        with Pool(self.processes) as pool:
            pool.map(worker.upload, files)
        self.active.value = False
        worker.notify('')
def __init__(self):
    self.log = Logger.get_logger_instance()
    self.__config = Config.get_config_instance()
    self.__worker = Worker()
    self.queue_handler = TaskHandler(
        self.__config["queue"]["job_type"],
        self.__config["queue"]["task_type"],
        self.__config["queue"]["job_manager_url"],
        self.__config["queue"]["heartbeat_manager_url"],
        self.__config["queue"]["heartbeat_interval_seconds"],
        self.log)
def test_mv3_to_mv2_B(self):
    worker = Worker()
    self.source += 'tabstourls_mv3'
    self.destination = self.source + '_delete'
    worker.work(self.source)
    expected = 2
    actual = worker.wrapper.getManifestVersion()
    self.assertEqual(actual, expected, 'manifest_version')
    shutil.rmtree(self.destination)
def test(unprocessed_queue, unprocessed_bucket, issued_bucket, unwrap, wrap):
    config = Config.from_environ()
    document_store_address = config['DocumentStore']['Address']
    # overriding safe VisibilityTimeout
    config['Worker']['Polling']['VisibilityTimeout'] = 1
    queue_test_wait_time_seconds = config['Worker']['Polling'][
        'VisibilityTimeout'] * 2

    document_v2 = DOCUMENT_V2_TEMPLATE.substitute(
        DocumentStoreAddress=document_store_address)
    document_v2 = json.loads(document_v2)
    wrapped_document_v2 = wrap(document_v2, '2.0')

    document_v3 = DOCUMENT_V3_TEMPLATE.substitute(
        DocumentStoreAddress=document_store_address)
    document_v3 = json.loads(document_v3)
    wrapped_document_v3 = wrap(document_v3, '3.0')

    worker = Worker(config)

    index = 1
    # checking both schema versions to test auto version definition
    for document in [document_v2, document_v3]:
        key = f'document-{index}'
        unprocessed_bucket.Object(key).put(Body=json.dumps(document))
        worker.poll()
        issued_document = json.load(issued_bucket.Object(key).get()['Body'])
        assert unwrap(issued_document) == document
        index += 1
        time.sleep(queue_test_wait_time_seconds)

    index = 1
    # checking both schema versions to test auto version definition for wrapped documents
    for document in [wrapped_document_v2, wrapped_document_v3]:
        key = f'wrapped-document-{index}'
        unprocessed_bucket.Object(key).put(Body=json.dumps(document))
        worker.poll()
        issued_document = json.load(issued_bucket.Object(key).get()['Body'])
        assert unwrap(issued_document) == unwrap(document)
        index += 1
        time.sleep(queue_test_wait_time_seconds)

    # check that all messages were processed
    assert not unprocessed_queue.receive_messages(
        WaitTimeSeconds=queue_test_wait_time_seconds,
        MaxNumberOfMessages=1,
        VisibilityTimeout=0)

    # Checking issuing an already issued wrapped document:
    # it should be moved to the issued bucket without calling contract.issue
    # after signature and document store verifications pass
    key = 'issued-wrapped-document'
    assert worker.is_issued_document(wrapped_document_v2)
    unprocessed_bucket.Object(key).put(Body=json.dumps(wrapped_document_v2))
    worker.poll()
    issued_document = json.load(issued_bucket.Object(key).get()['Body'])
    assert issued_document == wrapped_document_v2
def test_mv3_to_mv2_C(self):
    worker = Worker()
    self.source += 'timebadge_mv3'
    self.destination = self.source + '_delete'
    worker.work(self.source)
    expected = 2
    actual = worker.wrapper.getManifestVersion()
    self.assertEqual(actual, expected, 'manifest_version')
    manifest = worker.wrapper.manifest
    self.assertIn('background', manifest)
    self.assertIn('scripts', manifest['background'])
    shutil.rmtree(self.destination)
def add_all(self, subject):
    if subject == self.form.ID:
        # use cached unmapped
        unmapped = self._current['unmapped']
        adder = self.wc_adder
    else:
        # use cached wc_unmapped
        unmapped = self._current['wcunmapped']
        adder = self.moein_adder
    # get checked-unmapped ids
    checked = [int(row[0]) for row in self.options_list.getChecked()]
    # filter unmapped by checked
    unmapped = [um for um in unmapped if um['id'] in checked]
    # check for unmapped
    if unmapped:
        pd = Progress(self.options_list, self.messages[8], 0, len(unmapped))
        pd.show()
        worker = Worker(adder, unmapped)
        worker.signals.progress.connect(pd.setValue)
        worker.signals.error.connect(pd.close)
        worker.signals.error.connect(self.add_all_error)
        worker.signals.done.connect(self.add_all_done)
        QThreadPool.globalInstance().start(worker)
        self.options_list.btnAddAll.setDisabled(True)
def run(self):
    for item in self.target:
        logger.debug('start ' + item.__name__)
        worker = Worker(item)
        self.pool.start(worker)
    self.pool.waitForDone()
    logger.debug('thread finished')
def setUp(self) -> None:
    self.manager = Manager()
    self.queue = self.manager.Queue()
    self.done = self.manager.Value('i', 0)
    self.failed = self.manager.Value('i', 0)
    self.worker = Worker(MockSenderFactory(), len(self.files),
                         self.queue, self.done, self.failed)
def test23contentSecurityPolicy(self):
    worker = Worker()
    self.source += 'test23contentSecurityPolicy'
    self.destination = self.source + '_delete'
    worker.work(self.source)
    expected = 3
    actual = worker.wrapper.getManifestVersion()
    self.assertEqual(actual, expected, 'manifest_version')
    manifest = worker.wrapper.manifest
    key = 'content_security_policy'
    self.assertIn(key, manifest)
    self.assertIn('extension_pages', manifest[key])
    self.assertIn('sandbox', manifest[key])
    shutil.rmtree(self.destination)
def create_book(self, command, counter):
    Path.reset_path()
    Debug.logger.info(u"Start creating e-book no. {}".format(counter))
    Debug.logger.info(u"Analyzing record {}".format(command))
    task_list = CommandParser.get_task_list(command)  # parse the command
    if len(task_list) == 0:
        return
    for task in task_list:
        if not Config.debug_for_create_book:
            Worker.distribute(task)
    Debug.logger.info(u"Page crawling finished")
    task_result_list = []
    toTo_list = [
        Type.wechat, Type.huxiu, Type.huawei, Type.xueqiu, Type.sina,
        Type.zhengshitang, Type.jinwankansa, Type.wuxia, Type.doc360,
        Type.todo, Type.todo1, Type.todo2, Type.fiel, Type.taoguba_article
    ]
    for task in task_list:
        if task.get_task_type() in toTo_list:
            task = ColumnTask(task.account_id)
        task_result = TaskResult(task)
        task_result.extract_data()
        task_result_list.append(task_result)
    Debug.logger.info(u"Database records loaded")
    # download images
    for task_result in task_result_list:
        task_result.download_img()
    Debug.logger.info(u"Images for all tasks downloaded")
    # auto-split into volumes by size
    # render html && pack into an e-book
    book = Book(task_result_list)
    book_list = book.auto_split(Config.max_book_size_mb * 1024)
    for chapter in book_list:
        chapter.create_book()
    return
def test23executeScript(self):
    worker = Worker()
    self.source += 'test23executeScript'
    self.destination = self.source + '_delete'
    worker.work(self.source)
    expected = 3
    actual = worker.wrapper.getManifestVersion()
    self.assertEqual(actual, expected, 'manifest_version')
    manifest = worker.wrapper.manifest
    self.assertIn('background', manifest)
    self.assertIn('service_worker', manifest['background'])
    self.assertEqual(manifest['background']['service_worker'],
                     'service_worker.js')
    self.assertIn('permissions', manifest)
    self.assertIn('scripting', manifest['permissions'])
    shutil.rmtree(self.destination)
def test_mv2_C(self):
    worker = Worker()
    self.source += 'backgroundScripts_mv2'
    self.destination = self.source + '_delete'
    worker.work(self.source)
    expected = 3
    actual = worker.wrapper.getManifestVersion()
    self.assertEqual(actual, expected, 'manifest_version')
    manifest = worker.wrapper.manifest
    self.assertIn('background', manifest)
    self.assertIn('service_worker', manifest['background'])
    self.assertEqual(manifest['background']['service_worker'],
                     'service_worker.js')
    self.assertFalse(
        os.path.exists(worker.wrapper.destination + os.sep + 'script1.js'))
    self.assertFalse(
        os.path.exists(worker.wrapper.destination + os.sep + 'script2.js'))
    shutil.rmtree(self.destination)
def main():
    # PBT_Quadratic environment initialization
    convergenceTolerance = 10e-4  # i.e. 1e-3 ???
    maxStep = 30
    # updateInterval = 4  # every 4 iterations, do an update
    init_theta = [0.9, 0.9]  # set initial weights

    # create the worker population
    numOfWorkers = 2
    init_hyperParam = [[0, 1], [1, 0]]
    worker_list = [
        Worker(init_theta, init_hyperParam[i]) for i in range(numOfWorkers)
    ]

    run1 = train(worker_list, step, eval, ready, exploit, explore, lossFunc,
                 convergenceTolerance, maxStep)

    # Visualization
    # def plot_value(run, i, steps, title):
    #     plt.subplot(2, 4, i)
    #     plt.plot(run[0].eval_history, color='b', lw=0.7)
    #     plt.plot(run[1].eval_history, color='r', lw=0.7)
    #     plt.axhline(y=1.2, linestyle='dotted', color='k')
    #     axes = plt.gca()
    #     axes.set_xlim([0, steps])
    #     axes.set_ylim([0.0, 1.21])
    #     plt.title(title)
    #     plt.xlabel('Step')
    #     plt.ylabel('Q')
    #     return

    def plot_theta(run, i, steps, title):
        x_b = [_[0] for _ in run[0].theta_history]
        y_b = [_[1] for _ in run[0].theta_history]
        x_r = [_[0] for _ in run[1].theta_history]
        y_r = [_[1] for _ in run[1].theta_history]
        plt.subplot(2, 4, i)
        plt.scatter(x_b, y_b, color='b', s=2)
        plt.scatter(x_r, y_r, color='r', s=2)
        plt.title(title)
        plt.xlabel('theta0')
        plt.ylabel('theta1')
        return

    plot_theta(run1, 1, steps=maxStep, title='PBT')
    plt.show()
def start(self):
    # check for updates
    self.check_update()
    # log in
    login = Login()
    zhihu_client = login.get_login_client()
    Worker.set_zhihu_client(zhihu_client)
    Debug.logger.info(u"Start reading settings from ReadList.txt")
    if not Path.is_file(u'./ReadList.txt'):
        # create ReadList.txt automatically when it does not exist
        with open(u'./ReadList.txt', u'w') as read_list:
            read_list.close()
        Debug.logger.info(u"ReadList.txt is empty, exiting automatically")
        return
    book_counter = self.read_list()
    Debug.logger.info(u"All books created.")
    Debug.logger.info(u"Created {0} book(s) in this run".format(book_counter))
    Debug.logger.info(u"Thanks for using this tool")
    Debug.logger.info(u"Press any key to exit")
    return
def run(self):
    self._create_socket()
    print(f'server started on {self.config.host}:{self.config.port}')
    workers = []
    for x in range(self.config.cpu_limit):
        w = Worker(self._sock, self.config)
        workers.append(w)
        w.start()
    try:
        for w in workers:
            w.join()
    except KeyboardInterrupt:
        for w in workers:
            w.terminate()
    finally:
        self._sock.close()
def test23webAccessibleResources(self):
    worker = Worker()
    self.source += 'test23webAccessibleResources'
    self.destination = self.source + '_delete'
    worker.work(self.source)
    expected = 3
    actual = worker.wrapper.getManifestVersion()
    self.assertEqual(actual, expected, 'manifest_version')
    manifest = worker.wrapper.manifest
    self.assertIn('background', manifest)
    self.assertIn('service_worker', manifest['background'])
    self.assertEqual(manifest['background']['service_worker'],
                     'service_worker.js')
    self.assertIn('permissions', manifest)
    self.assertIn('scripting', manifest['permissions'])
    key = 'web_accessible_resources'
    self.assertIn(key, manifest)
    self.assertEqual(len(manifest[key][0]['resources']), 2)
    shutil.rmtree(self.destination)
def test_gas_price_update(load_unprocessed_document, generate_gas_price):
    gas_price = 111
    generate_gas_price.return_value = gas_price

    config = Config.from_environ()
    config['Blockchain']['GasPrice'] = 'fast'
    config['Blockchain']['GasPriceRefreshRate'] = 2

    key = 'document-key'
    document = DOCUMENT_V2_TEMPLATE.substitute(
        DocumentStoreAddress=config['DocumentStore']['Address'])
    document = json.loads(document)
    load_unprocessed_document.return_value = key, document

    message = mock.Mock()
    message.body = json.dumps({'Records': [{}]})

    worker = Worker(config)
    worker.web3.eth.sendRawTransaction.return_value = b'transaction-hash'
    worker.web3.eth.waitForTransactionReceipt().status = 1
    assert worker.gas_price == gas_price

    # testing transaction timeout causing gas price increase
    generate_gas_price.reset_mock()
    worker.web3.eth.waitForTransactionReceipt.side_effect = TimeExhausted
    assert not worker.process_message(message)
    generate_gas_price.assert_not_called()
    assert worker.gas_price == int(gas_price * 1.1)

    # testing gas price refresh
    generate_gas_price.reset_mock()
    worker.web3.eth.waitForTransactionReceipt.side_effect = None
    for i in range(config['Blockchain']['GasPriceRefreshRate']):
        assert worker.process_message(message)
    generate_gas_price.assert_called_once()

    # testing no refresh on static gas price
    config['Blockchain']['GasPrice'] = 20
    worker = Worker(config)
    worker.web3.eth.sendRawTransaction.return_value = b'transaction-hash'
    worker.web3.eth.waitForTransactionReceipt().status = 1
    worker.web3.reset_mock()
    for i in range(config['Blockchain']['GasPriceRefreshRate']):
        assert worker.process_message(message)
    worker.web3.eth.generateGasPrice.assert_not_called()
def update(self):
    conn = connection.get()
    self.model.set_connection(conn)
    try:
        mapped = self.model.mapped(update_required=True)
    except Exception as e:
        msg = Message(self.ui, Message.ERROR, self.messages[4], str(e))
        msg.show()
    else:
        if mapped:
            pd = Progress(self.ui, self.messages[9], 0, len(mapped))
            pd.show()
            worker = Worker(self.updater, mapped)
            worker.signals.progress.connect(pd.setValue)
            worker.signals.error.connect(pd.close)
            worker.signals.error.connect(self.update_error)
            worker.signals.done.connect(self.update_done)
            QThreadPool.globalInstance().start(worker)
            self.tab.btnUpdate.setDisabled(True)
    finally:
        conn.close()
def catch(account_id):
    # this is where the key work happens
    mock_sleep_time = 0.5
    base_sleep_time = 10
    max_sleep_time = 10
    article_url_index_list = []
    # build the column record
    column_info = FileColumnParser('').get_column_info()
    column_info[u'column_id'] = account_id
    column_info[u'title'] = "毛泽东军事文选"
    from src.worker import Worker
    Worker.save_record_list(u'Column', [column_info])
    star_page = 0
    max_page = 1
    Debug.logger.info(u"Max page count fetched, {max_page} pages in total".format(max_page=max_page))
    index_work_set = OrderedDict()
    # collect the article files from the local directory
    path = '/Users/ink/Desktop/ht'
    file_list = os.listdir(path)  # list all entries under the directory
    for i in file_list:
        # print i
        if str(i).endswith('htm') or str(i).endswith('html'):
            filename = u'/Users/ink/Desktop/ht/{}'.format(i)
            convert_encoding(filename, 'utf-8')  # gb2312 -> utf-8
            f = open(filename)
            contents = f.read()
            # print(contents)
            article_info = FileArticleParser(contents).get_article_info()
            if len(article_info) > 0:
                article_info['article_id'] = i
                article_info['column_id'] = account_id
                Worker.save_record_list(u'Article', [article_info])
            f.close()
def catch(account_id):
    u"""
    :param target_url: https://xueqiu.com/4065977305
    :return:
    """
    # this is where the key work happens
    mock_sleep_time = 0.5
    article_url_index_list = []
    # fetch the maximum page number
    # url = 'http://chuansong.me/account/{}'.format(account_id)
    # front_page_content = Http.get_content(url)
    # max_page = XueQiuWorker.parse_max_page(front_page_content)
    # _url = "http://xueqiu.com/v4/statuses/user_timeline.json?user_id={0}&page={1}&type=2"
    # type: '' = all, 2 = original posts, 5 = replies
    _url = "https://xueqiu.com/v4/statuses/user_timeline.json?user_id={0}&page={1}&type=0"
    first = _url.format(account_id, 1)
    r = Http.get_json_content(first)
    max_page = 1
    try:
        jdata = json.loads(r.text, encoding='utf-8')
        max_page = jdata['maxPage'] + 1
    except KeyError as e:
        print 'open failed >>>>>>> check the Cookie'
        # max_page = 1
    # parse the page content and store it into the database
    # may require a captcha
    content_profile = Http.get_content(u'https://xueqiu.com/u/{}/profile'.format(account_id))
    column_info = XueQiuColumnParser(content_profile).get_column_info()
    column_info[u'column_id'] = account_id
    column_info[u'title'] = ""
    with open('ReadList.txt', 'r') as read_list:
        read_list = read_list.readlines()
        for line in read_list:
            split_url = line.split('#')[0]
            if split_url.split('/')[-1] == account_id:
                column_info[u'title'] = line.split('#')[1]
                column_info[u'image_url'] = str(line.split('#')[2]).strip('\n')
    from src.worker import Worker
    Worker.save_record_list(u'Column', [column_info])
    Debug.logger.info(u"Max page count fetched, {max_page} pages in total".format(max_page=max_page))
    index_work_set = OrderedDict()
    # collect the article URLs on every page
    for raw_front_page_index in range(1, max_page):
        request_url = _url.format(account_id, raw_front_page_index)
        index_work_set[raw_front_page_index] = request_url
    re_catch_counter = 0
    while len(index_work_set) > 0 and re_catch_counter <= 20:
        re_catch_counter += 1
        # iterate over a snapshot so entries can be deleted while looping
        for article_url_index in list(index_work_set):
            request_url = index_work_set[article_url_index]
            Debug.logger.info(u"Fetching page no. {countert}, {article_count} left".format(
                countert=article_url_index, article_count=len(index_work_set)))
            content = Http.get_content(request_url)
            if not content:
                return
            jdata = json.loads(content)
            articles = jdata['statuses']
            for article in articles:
                # print article
                article_info = XueQiuArticleParser(article).get_article_info()
                if len(article_info) > 0:
                    article_info['column_id'] = account_id
                    Worker.save_record_list(u'Article', [article_info])
            del index_work_set[article_url_index]
            Debug.logger.debug(u'content of {} fetched'.format(request_url))
    return
def start():
    '''Start the Application'''
    log = configure_log()
    log.info('Starting Cloud Worker Node Agent')
    log.info('--------------------------')

    args = parse_args()
    settings = {'base_url': args.server,
                'secret': args.secret,
                'client_id': C.CLIENT_ID,
                'client_secret': C.CLIENT_SECRET,
                'username': C.USERNAME,
                'password': C.PASSWORD}

    server = Server(settings)
    node = Node(server)

    # Send the hostname, ip etc to the server
    node.send_info()
    # Update the node status to ready
    node.update_node_status(C.STATUS_READY)

    # Get Config
    config = Config(server, node)
    actions = Action(server, node)
    processor = Processor()
    workers = Worker(server, node, processor)
    output = Output(server, node)

    finished = False
    # Loop forever (kind of)
    while not finished:
        log.info('Looping')
        log.info('--------------------------')
        # Update last seen date
        node.update_node_date()
        # Get config
        config.refresh()
        # Get actions
        num_pending = actions.get_pending()
        # Respond to actions
        if actions.has_pending():
            message = 'Responding to %d Actions ...' % num_pending
            output.send(message)
            actions.respond_to_pending()
        # Get workers/commands
        workers.refresh()
        workers.process_workers()
        # TODO: Respond to/run commands; send output to server
        log.info('Sleeping for %d seconds ...',
                 config.get(C.CONFIG_POLL_PERIOD))
        time.sleep(config.get(C.CONFIG_POLL_PERIOD))
def catch(account_id):
    # this is where the key work happens
    mock_sleep_time = 0.5
    base_sleep_time = 10
    max_sleep_time = 10
    article_url_index_list = []
    # fetch the maximum page number
    url = 'https://www.huxiu.com/{}'.format(account_id)
    front_page_content = Http.get_content(url)
    # Config.now_id_likeName = account_id
    # Config.save()
    column_info = HuXiuColumnParser(front_page_content).get_column_info()
    column_info[u'column_id'] = account_id
    column_info[u'title'] = account_id
    from src.worker import Worker
    Worker.save_record_list(u'Column', [column_info])
    u_result = urllib.quote(
        account_id.decode(sys.stdin.encoding).encode('utf8'))
    print account_id
    max_page = 2
    idds = ''
    with open('ReadList.txt', 'r') as read_list:
        read_list = read_list.readlines()
        for line in read_list:
            split_url = line.split('#')[0]
            if split_url.split('/')[-1] == account_id:
                # Config.now_id_likeName = line.split('#')[1]
                max_page = int(line.split('#')[-1]) + 1
                idds = str(line.split('#')[1])
    print max_page
    max_page = -1  # -1 empties the search loop below; only the URL appended after it is fetched
    # parse the page content and store it into the database
    Debug.logger.info(u"Max page count fetched, {max_page} pages in total".format(max_page=max_page))
    index_work_set = OrderedDict()
    # collect the article URLs on every search result page
    for raw_front_page_index in range(0, max_page + 1):
        # https://www.huxiu.com/search.html?s=%E5%B7%B4%E8%8F%B2%E7%89%B9&sort=dateline:desc
        request_url = u'https://www.huxiu.com/search.html?s={}&sort=dateline%3Adesc&per_page={}'.format(
            u_result, raw_front_page_index)
        # request_url = u'https://www.huxiu.com/member/{}/article/{}.html'.format(idds, raw_front_page_index)
        # request_url = 'https://www.huxiu.com/member/1872007.html'
        index_work_set[raw_front_page_index] = request_url
    re_catch_counter = 0
    catch_counter = 0
    while len(index_work_set) > 0 and re_catch_counter <= 20:
        re_catch_counter += 1
        # iterate over a snapshot so entries can be deleted while looping
        for raw_front_page_index in list(index_work_set):
            catch_counter += 1
            Debug.logger.info(u'data fetch pass no. {}'.format(re_catch_counter))
            request_url = index_work_set[raw_front_page_index]
            Debug.logger.info(
                u"Fetching article links on page {raw_front_page_index}, {max_page} pages left".format(
                    raw_front_page_index=raw_front_page_index,
                    max_page=len(index_work_set)))
            request_url_content = Http.get_content(request_url)
            soup = BeautifulSoup(request_url_content, "lxml")
            list_pcyc_l_ = soup.find_all('li')
            # list_pcyc_l_ = soup.find_all('div', class_='mob-ctt')
            for tgo_right in list_pcyc_l_:
                for link in tgo_right.findAll('a'):
                    hre = str(link.get('href'))
                    if hre.startswith('/article/', 0, 10):
                        print u'https://www.huxiu.com{}'.format(link.get('href'))
                        article_url_index_list.append(
                            'https://www.huxiu.com{}'.format(link.get('href')))
            del index_work_set[raw_front_page_index]
    article_url_index_list.append('https://www.huxiu.com/article/299355.html')
    article_count = len(article_url_index_list)
    Debug.logger.info(u"Article links fetched, {article_count} articles to fetch".format(
        article_count=article_count))
    index_work_set = OrderedDict()
    for article_url_index in article_url_index_list:
        print 'query : ' + article_url_index
        article_db = DB.query_row(
            'select count(*) as article_count from Article where article_id = "{}"'
            .format(article_url_index))
        if article_db['article_count'] > 0:
            continue
        request_url = article_url_index
        index_work_set[article_url_index] = request_url
    re_catch_counter = 0
    while len(index_work_set) > 0 and re_catch_counter <= 20:
        re_catch_counter += 1
        for article_url_index in list(index_work_set):
            request_url = index_work_set[article_url_index]
            Debug.logger.info(
                u"Fetching article no. {countert}, {article_count} left".format(
                    countert=article_url_index,
                    article_count=len(index_work_set)))
            request_url_content = Http.get_content(request_url)
            article_info = HuXiuArticleParser(
                request_url_content).get_article_info()
            if len(article_info) > 0:
                article_info['article_id'] = article_url_index
                article_info['column_id'] = account_id
                Worker.save_record_list(u'Article', [article_info])
            del index_work_set[article_url_index]
    return
def setUp(self):
    self.workers = [
        Worker(init_hyperParam=[0, 1], init_theta=[0.9, 0.9]),
        Worker(init_hyperParam=[1, 0], init_theta=[0.5, 0.5])
    ]
def catch(account_id):
    # this is where the key work happens
    mock_sleep_time = 0.5
    base_sleep_time = 1
    max_sleep_time = 1
    article_url_index_list = []
    # build the column record
    column_info = WeiXinColumnParser('').get_column_info()
    column_info[u'column_id'] = account_id
    column_info[u'title'] = account_id
    column_info[u'image_url'] = 'https://wpimg.wallstcn.com/3598b719-ab0d-4be7-bc09-30c3ae29a3cc.jpg?imageView2/1/w/240/h/240'
    max_page = 1
    # with open('ReadList.txt', 'r') as read_list:
    #     read_list = read_list.readlines()
    #     for line in read_list:
    #         split_url = line.split('#')[0]
    #         if str(split_url).__contains__(account_id):
    #             # Config.now_id_likeName = line.split('#')[1]
    #             max_page = int(line.split('#')[-1]) + 1
    #             column_info[u'title'] = str(line.split('#')[1])
    # # max_page = 1
    # print max_page
    from src.worker import Worker
    Worker.save_record_list(u'Column', [column_info])
    Debug.logger.info(u"Max page count fetched, {max_page} pages in total".format(max_page=max_page))
    # article_url_index_list.append('https://mp.weixin.qq.com/s?__biz=MjM5MjczNDc0Mw==&mid=2650847984&idx=2&sn=b7b111e5964d2f2fb568ba0d419e3edf&chksm=bd55d1888a22589e2f3bab0613b346427079efc6b82fac869d4f78244a500c3e5cc8cb8402ed&scene=21#wechat_redirect')
    # article_url_index_list.append('https://mp.weixin.qq.com/s/yj1BT3jWyxLjlEnzz0vEtQ')
    with open('/Users/0/Desktop/list.txt', 'r') as read_list:
        read_list = read_list.readlines()
        for line in read_list:
            article_url_index_list.append(str(line).strip('\n'))
    article_count = len(article_url_index_list)
    Debug.logger.info(u"Article links fetched, {article_count} articles to fetch".format(article_count=article_count))
    index_work_set = OrderedDict()
    for article_url_index in article_url_index_list:
        print 'query : ' + article_url_index
        article_db = DB.query_row(
            'select count(*) as article_count from Article where article_id = "{}"'.format(article_url_index))
        if article_db['article_count'] > 0:
            continue
        request_url = article_url_index
        index_work_set[article_url_index] = request_url
    re_catch_counter = 0
    while len(index_work_set) > 0 and re_catch_counter <= 20:
        re_catch_counter += 1
        # iterate over a snapshot so entries can be deleted while looping
        for article_url_index in list(index_work_set):
            request_url = index_work_set[article_url_index]
            Debug.logger.info(u"Fetching article no. {countert}, {article_count} left".format(
                countert=article_url_index, article_count=len(index_work_set)))
            request_url_content = Http.get_content(request_url)
            time.sleep(mock_sleep_time)
            if len(request_url_content) == 0:
                random_sleep_time = base_sleep_time + random.randint(0, max_sleep_time) / 100.0
                Debug.logger.info(u"Random sleep for {} seconds".format(random_sleep_time))
                time.sleep(random_sleep_time)
                continue
            # article_info = Todo2ArticleParser(request_url_content).get_article_info()
            # article_info = HuXiuArticleParser(request_url_content).get_article_info()
            article_info = WeiXinArticleParser(request_url_content).get_article_info()
            # article_info = WallStreetArticleParser(request_url_content).get_article_info()
            if len(article_info) > 0:
                article_info['article_id'] = article_url_index
                article_info['column_id'] = account_id
                Worker.save_record_list(u'Article', [article_info])
            del index_work_set[article_url_index]
    return
def catch(account_id):
    # this is where the key work happens
    article_url_index_list = []
    # fetch the maximum page number
    url = 'http://www.taoguba.com.cn/Article/' + account_id + '/1'
    front_page_content = Http.get_content(url)
    star_page = 1
    with open('ReadList.txt', 'r') as read_list:
        read_list = read_list.readlines()
        for line in read_list:
            if str(line).__contains__('#'):
                split_url = line.split('#')[0]
                trgId = split_url.split('/')[-2]
                if trgId == account_id:
                    pg = split_url.split('/')[-1]
                    print pg
                    star_page = int(pg)
                    if star_page == 0:
                        star_page = 1
                    else:
                        print star_page
    max_page = 2
    dom = BeautifulSoup(front_page_content, "lxml")
    list_pcyc_l_ = dom.find_all('div', class_="left t_page01")
    try:
        for tgo_tgo_ in list_pcyc_l_:
            linkl = tgo_tgo_.findAll('a')
            tarUrl = linkl[0].get('href')
            max_page = int(tarUrl.split('/')[3])
    except IndexError as e:
        max_page = 1
    column_info = TGBColumnParser(front_page_content).get_column_info()
    from src.worker import Worker
    Worker.save_record_list(u'Column', [column_info])
    Debug.logger.info(u"Max page count fetched, {max_page} pages in total".format(max_page=max_page))
    index_work_set = OrderedDict()
    # collect the article URLs on every page
    # star_page = 100
    for raw_front_page_index in range(star_page, max_page + 1):
        request_url = 'http://www.taoguba.com.cn/Article/' + account_id + '/' + str(
            raw_front_page_index)
        article_url_index_list.append(request_url)
    article_count = len(article_url_index_list)
    Debug.logger.info(u"Article links fetched, {article_count} articles to fetch".format(
        article_count=article_count))
    index_work_set = OrderedDict()
    for article_url_index in article_url_index_list:
        print 'query : ' + article_url_index
        article_db = DB.query_row(
            'select count(*) as article_count from Article where article_id = "{}"'
            .format(article_url_index))
        if article_db['article_count'] > 0:
            continue
        request_url = article_url_index
        index_work_set[article_url_index] = request_url
    re_catch_counter = 0
    while len(index_work_set) > 0 and re_catch_counter <= 20:
        re_catch_counter += 1
        # iterate over a snapshot so entries can be deleted while looping
        for article_url_index in list(index_work_set):
            request_url = index_work_set[article_url_index]
            Debug.logger.info(
                u"Fetching article no. {countert}, {article_count} left".format(
                    countert=article_url_index,
                    article_count=len(index_work_set)))
            request_url_content = Http.get_content(request_url)
            article_info = TGBArticleParser(
                request_url_content).get_article_info()
            if len(article_info) > 0:
                article_info['article_id'] = article_url_index
                article_info['column_id'] = account_id
                Worker.save_record_list(u'Article', [article_info])
            del index_work_set[article_url_index]
    return
if __name__ == '__main__':
    PARSER = argparse.ArgumentParser(
        description='Client tool for DoEnjoy mini-Spark')
    subparsers = PARSER.add_subparsers()
    init_wordcount_client_parser(subparsers)
    init_pagerank_client_parser(subparsers)
    init_wordcount_streaming_client_parser(subparsers)
    init_master_parser(subparsers)
    init_worker_parser(subparsers)
    ARGS = PARSER.parse_args()
    if ARGS.action == 'master':
        master = Master(ARGS.port, ARGS.debug)
        master.run()
    elif ARGS.action == 'worker':
        worker = Worker(ARGS.master_address, ARGS.self_address, ARGS.debug)
        worker.run()
    elif ARGS.action == 'page_rank':
        page_rank_client = PageRankClient(ARGS.file_path, ARGS.iterative)
        client = get_client(ARGS.master_address)
        execute_command(client, client.get_job,
                        pickle_object(page_rank_client), ARGS.self_address)
        page_rank_client.start_server("0.0.0.0")
    elif ARGS.action == 'word_count':
        word_count_client = WordCountClient(ARGS.file_path)
        client = get_client(ARGS.master_address)
        execute_command(client, client.get_job,
                        pickle_object(word_count_client), ARGS.self_address)
        word_count_client.start_server("0.0.0.0:" +
                                       ARGS.self_address.split(":")[1])
    elif ARGS.action == 'wordcount_streaming':
def catch(account_id):
    # this is where the key work happens
    mock_sleep_time = 0.5
    base_sleep_time = 10
    max_sleep_time = 10
    article_url_index_list = []
    # build the column record
    column_info = Todo3ColumnParser('').get_column_info()
    column_info[u'column_id'] = account_id
    column_info[u'title'] = "新能源汽车"
    column_info['article_count'] = 0
    column_info['follower_count'] = 0
    column_info['description'] = ''
    column_info['image_url'] = ''
    from src.worker import Worker
    Worker.save_record_list(u'Column', [column_info])
    star_page = 1
    max_page = 1
    Debug.logger.info(u"Max page count fetched, {max_page} pages in total".format(max_page=max_page))
    index_work_set = OrderedDict()
    # collect the article URLs on every page
    for raw_front_page_index in range(star_page, max_page):
        request_url = u'https://post.smzdm.com/fenlei/xinnengyuanche/p{}/'.format(
            raw_front_page_index)
        index_work_set[raw_front_page_index] = request_url
    re_catch_counter = 0
    catch_counter = 0
    while len(index_work_set) > 0 and re_catch_counter <= 20:
        re_catch_counter += 1
        # iterate over a snapshot so entries can be deleted while looping
        for raw_front_page_index in list(index_work_set):
            catch_counter += 1
            Debug.logger.info(u'data fetch pass no. {}'.format(re_catch_counter))
            request_url = index_work_set[raw_front_page_index]
            Debug.logger.info(
                u"Fetching article links on page {raw_front_page_index}, {max_page} pages left".format(
                    raw_front_page_index=raw_front_page_index,
                    max_page=len(index_work_set)))
            request_url_content = Http.get_content(request_url)
            soup = BeautifulSoup(request_url_content, 'lxml')
            list_p_list = soup.find_all('div', class_='list-border clearfix')
            for p in list_p_list:
                # print p
                list_pcyc_li = p.find_all('a')
                li = list_pcyc_li[0]
                tarUrl = li.get('href')
                ttt = str(tarUrl).split("#")[-1]
                print ttt
                if not (ttt is None):
                    article_url_index_list.append(ttt)
            del index_work_set[raw_front_page_index]
    article_count = len(article_url_index_list)
    Debug.logger.info(u"Article links fetched, {article_count} articles to fetch".format(
        article_count=article_count))
    index_work_set = OrderedDict()
    for article_url_index in article_url_index_list:
        print 'query : ' + article_url_index
        article_db = DB.query_row(
            'select count(*) as article_count from Article where article_id = "{}"'
            .format(article_url_index))
        if article_db['article_count'] > 0:
            continue
        request_url = article_url_index
        index_work_set[article_url_index] = request_url
    re_catch_counter = 0
    while len(index_work_set) > 0 and re_catch_counter <= 20:
        re_catch_counter += 1
        for article_url_index in list(index_work_set):
            request_url = index_work_set[article_url_index]
            Debug.logger.info(
                u"Fetching article no. {countert}, {article_count} left".format(
                    countert=article_url_index,
                    article_count=len(index_work_set)))
            request_url_content = Http.get_content(request_url)
            article_info = Todo3ArticleParser(
                request_url_content).get_article_info()
            if len(article_info) > 0:
                article_info['article_id'] = article_url_index
                article_info['column_id'] = account_id
                Worker.save_record_list(u'Article', [article_info])
            del index_work_set[article_url_index]
    return