def pomppu_page_crawl(lines, n_parallel=4):
    crawler = Crawler(save_path='E:\\crawl_result')
    bucket_size = 512
    for b in range(0, len(lines), bucket_size):
        print('open driver')
        # remote-driver setup kept for reference while crawling is stubbed out:
        # ports = [4444, 4445, 4446, 4447]
        # drivers = [RemoteChrome(port=4444).driver for _ in range(4)]

        jobs = []
        for i in range(b, min(b + bucket_size, len(lines))):
            job_id = i % n_parallel
            # driver = drivers[job_id]
            driver = None
            url = lines[i]
            jobs += [(driver, crawler, url, i)]

        pooling = PbarPooling(n_parallel=n_parallel,
                              func=pomppu_singlepage_crawl,
                              child_timeout=2)
        try:
            # pooling.map(func=pomppu_singlepage_crawl, jobs=jobs)
            pooling.map(func=dummy_task, jobs=jobs)
        except KeyboardInterrupt:
            print('KeyboardInterrupt')
            break
        except BaseException as e:
            log_error_trace(print, e)
        finally:
            # [driver.quit() for driver in drivers]
            pooling.save_fail_list()
            print('close drivers')
def fit(self, Xs, Ys):
    Ys = self.np_arr_to_index(Ys)
    for key in self.pack:
        try:
            self.pack[key].fit(Xs, Ys)
        except BaseException as e:
            log_error_trace(self.log.warn, e)
            self.log.warn(f'while fitting, {key} raised {e}')
def wrapper(*args, **kwargs):
    # Inner wrapper of an exception-logging decorator; `func` comes from the
    # enclosing decorator scope, and `self` is the bound instance of the
    # decorated method, which must expose a callable `log`.
    self = args[0]
    log_func = self.log
    try:
        return func(*args, **kwargs)
    except KeyboardInterrupt:
        log_func("KeyboardInterrupt detected, aborting process")
    except Exception as e:
        log_error_trace(log_func, e)
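# A minimal, self-contained sketch of how the wrapper above would be packaged
# as a decorator. `deco_log_exception` is a hypothetical name; the enclosing
# function is not shown in the original source.
import functools

def deco_log_exception(func):
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        self = args[0]
        log_func = self.log
        try:
            return func(*args, **kwargs)
        except KeyboardInterrupt:
            log_func("KeyboardInterrupt detected, aborting process")
        except Exception as e:
            log_error_trace(log_func, e)
    return wrapper

# usage on any method whose instance exposes a callable `log`:
# class ClfPack:
#     log = staticmethod(print)
#
#     @deco_log_exception
#     def fit(self, Xs, Ys):
#         ...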
def wrapper(*args, **kwargs):
    self = args[0]
    key = args[2]
    try:
        return func(*args, **kwargs)
    except BaseException as e:
        self.log.warn(f'\nfail {func.__name__}, {key}\n')
        log_error_trace(self.log.warn, e)
def dump_and_load(pickler, obj):
    try:
        path = './pkl'
        with open(path, 'wb') as f:
            pickler.dump(obj, f)
        with open(path, 'rb') as f:
            obj = pickler.load(f)
    except BaseException as e:
        log_error_trace(pprint, e)
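# Usage sketch for dump_and_load: `pickler` is any module implementing the
# pickle protocol (a `dump`/`load` pair), e.g. the standard library pickle.
# The object below is illustrative only; the round trip verifies that it
# serializes and deserializes without raising.
import pickle

dump_and_load(pickle, {'weights': [0.1, 0.2, 0.3], 'epoch': 3})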
def predict_confidence(self, Xs):
    confidences = {}
    for key, clf in self.pack.items():
        try:
            confidences[key] = clf.predict_confidence(Xs)
        except BaseException as e:
            log_error_trace(self.log.warn, e,
                            f'while executing predict_confidence at {key},\n')
    return confidences
def fit(self, x, y):
    x, y = self._if_df_encode(x, y)
    label_Ys = self.np_arr_to_index(y)
    self._check_n_class(y)
    for key in tqdm(self.pack):
        tqdm.write(f'fit {key}')
        try:
            self.pack[key].fit(x, label_Ys)
        except BaseException as e:
            log_error_trace(self.log.warn, e)
            self.log.warn(f'while fitting, {key} raised {e}')
def predict_confidence(self, x):
    x = self._if_df_encode_x(x)
    confidences = {}
    for key, clf in tqdm(self.pack.items()):
        tqdm.write(f'predict_confidence {key}')
        try:
            confidences[key] = clf.predict_confidence(x)
        except BaseException as e:
            log_error_trace(self.log.warn, e,
                            f'while executing predict_confidence at {key},\n')
    return confidences
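# Usage sketch: since predict_confidence returns a {name: confidence} dict
# keyed by classifier, the per-model outputs can be averaged into a simple
# ensemble score. `clf_pack` and `Xs` are illustrative names, not from the
# original source.
confidences = clf_pack.predict_confidence(Xs)
if confidences:
    ensemble_confidence = sum(confidences.values()) / len(confidences)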
def get(self):
    rets = []
    # Drain every per-pool queue; a child that fails or exceeds
    # child_timeout kills its pool, and the pool is rebuilt with the
    # remaining jobs requeued.
    while sum([q.qsize() for q in self.queues]) > 0:
        for pool_id in range(self.n_parallel):
            ret = None
            if self.queues[pool_id].qsize() == 0:
                continue
            child, job = self.queues[pool_id].get()
            try:
                ret = child.get(timeout=self.child_timeout)
            except KeyboardInterrupt:
                self.log("KeyboardInterrupt terminate pools\n"
                         "{fail}/{total} fail".format(
                             fail=len(self.fail_list),
                             total=len(self.jobs)))
                self.terminate()
                raise
            except BaseException as e:
                log_error_trace(self.log, e)
                self.log("job fail, kill job={job}, child={child}".format(
                    child=str(None), job=str(job[3])))
                self.pbar.update(1)
                self.fail_list += [job]

                # Replace the dead pool and requeue its pending jobs.
                self.pools[pool_id].terminate()
                self.pools[pool_id].join()
                self.pools[pool_id] = Pool(1, initializer=self.initializer,
                                           initargs=self.initargs)
                new_queue = Queue()
                while self.queues[pool_id].qsize() > 0:
                    _, job = self.queues[pool_id].get()
                    child = self.pools[pool_id].apply_async(
                        self.func, job, callback=self.update_pbar)
                    new_queue.put((child, job))
                self.queues[pool_id] = new_queue
            finally:
                rets += [ret]

    self.pbar.close()
    self.log("{fail}/{total} fail".format(fail=len(self.fail_list),
                                          total=len(self.jobs)))
    self.log('end pooling queue')
    return rets
def pomppu_singlepage_crawl(driver, crawler, url, job_id):
    try:
        # time.sleep(10)
        query = url_query_parser(url)
        driver.get(url)

        try:
            if driver.find_element_by_class_name("pre"):
                execute_js_show_comment_page(driver, query)
        except BaseException:
            pass

        str_soup = []
        html = driver.page_source
        soup = BeautifulSoup(html, 'lxml')

        # main content
        main_content_args = ('td', {'class': "board-contents"})
        for item in soup.find_all(*main_content_args):
            str_soup += [item.text]

        # first page of comments
        comment_args = ('textarea', {'class': 'ori_comment'})
        for item in soup.find_all(*comment_args):
            str_soup += [item.text]

        # remaining comment pages, fetched via in-page javascript
        try:
            max_page = get_max_comment_page(soup)
            # print('{} max_page {}'.format(job_id, max_page))
            for i in range(2, max_page + 1):
                execute_js_go_comment_page(driver, query, i)
                element = driver.find_element_by_id("quote")
                part = str(element.get_attribute('innerHTML'))
                part = BeautifulSoup(part, 'lxml')
                # fresh loop name so the outer `soup` is not shadowed
                for comment in part.find_all(*comment_args):
                    str_soup += [str(comment.text)]
        except BaseException:
            pass

        str_soup = "\n".join(str_soup)
        crawler.save_html(str_soup, path_tail=str(job_id) + ".txt")
    except BaseException as e:
        log_error_trace(print, e)
def wrapper(*args, **kwargs):
    # Inner wrapper of a timing decorator; `func` comes from the enclosing
    # decorator scope. Runs longer than a minute are also logged with the
    # full wall-clock timedelta.
    date = datetime.now()
    start = time.time()
    try:
        ret = func(*args, **kwargs)
    except BaseException as e:
        log_error_trace(print, e)
        ret = None
    finally:
        elapse_time = time.time() - start
        msg = f"in {func.__name__}(), time {elapse_time:.4f}'s elapsed"
        if elapse_time > 60:
            msg += f", {datetime.now() - date}"
        print(msg)
    return ret
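# Hypothetical packaging of the timing wrapper above (`deco_timeit` is an
# assumed name; the enclosing decorator is not shown in the original source):
#
# @deco_timeit
# def train(model, Xs, Ys):
#     ...
#
# train(model, Xs, Ys)  # prints: in train(), time 12.3456's elapsed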
def deco_hyperOpt_fn_wrapper(params):
    # `func`, `feed_args`, `feed_kwargs`, `min_best`, and `pbar` come from
    # the enclosing decorator scope.
    start_time = time.time()
    trial = {
        'loss': None,
        'status': None,
        'eval_time': None,
        # 'other_stuff': None,
        # -- attachments are handled differently
        # 'attachments':
        #     {'time_module': None},
        # 'params': kwargs['params']
        'params': params,
    }
    try:
        # guard with isinstance(func, type): issubclass raises TypeError
        # when `func` is a plain function rather than a class
        if isinstance(func, type) and issubclass(func, HyperOpt_fn):
            ret = func.fn(params, feed_args, feed_kwargs)
        else:
            ret = func(params, feed_args, feed_kwargs)

        if isinstance(ret, dict):
            trial.update(ret)
        else:
            trial['loss'] = ret
        trial['status'] = STATUS_OK
    except BaseException as e:
        log_error_trace(print, e)
        trial['loss'] = np.inf
        trial['status'] = STATUS_FAIL
    finally:
        trial['eval_time'] = time.time() - start_time
        # negate only successful losses when maximizing, so a failed trial
        # keeps +inf instead of becoming -inf (the "best" possible loss)
        if min_best is False and trial['status'] == STATUS_OK:
            trial['loss'] = -trial['loss']
        try:
            pbar.update(1)
        except BaseException:
            pass
    return trial
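# A minimal usage sketch, assuming the wrapper above is produced by an
# enclosing factory (called `deco_hyperOpt` here, a hypothetical name) and
# handed to hyperopt's real fmin/tpe/hp API. Since the wrapper returns a dict
# with 'loss' and 'status', fmin can consume it directly; `objective_fn` and
# the search space are illustrative.
from hyperopt import Trials, fmin, hp, tpe

space = {'lr': hp.loguniform('lr', -7, 0)}
trials = Trials()
best = fmin(fn=deco_hyperOpt(objective_fn), space=space,
            algo=tpe.suggest, max_evals=50, trials=trials)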
def join(self):
    pbar = tqdm.tqdm(total=len(self.childs))
    q = Queue()
    for id_, child in self.childs.items():
        q.put((id_, child))

    # Poll each child; a TimeoutError requeues the child instead of
    # blocking the whole join on one slow worker.
    while q.qsize() > 0:
        id_, child = q.get()
        try:
            child.get(self.TIMEOUT)
            pbar.update(1)
        except TimeoutError:
            q.put((id_, child))
        except KeyboardInterrupt:
            raise
        except BaseException as e:
            pbar.update(1)
            self.log.warn('job fail')
            log_error_trace(self.log.info, e)
    pbar.close()