コード例 #1
0
def pomppu_page_crawl(lines, n_parallel=4):
    """Crawl pomppu pages in buckets of 512 URLs using a process pool.

    :param lines: list of page URLs to crawl
    :param n_parallel: number of parallel workers.  Bug fix: this used to
        be immediately overwritten with the hard-coded value 4, silently
        ignoring the caller's argument.
    Results are saved by ``Crawler``; failed jobs are recorded via
    ``pooling.save_fail_list()``.  Ctrl-C stops the whole crawl.
    """
    crawler = Crawler(save_path='E:\\crawl_result')

    bucket_size = 512
    for b in range(0, len(lines), bucket_size):
        print('open driver')
        # ports = [4444, 4445, 4446, 4447]
        # drivers = [RemoteChrome(port=4444).driver for _ in range(4)]

        # Build (driver, crawler, url, id_) job tuples for this bucket.
        # driver is None while the RemoteChrome pool above stays disabled.
        jobs = []
        for i in range(b, min(b + bucket_size, len(lines))):
            # driver = drivers[i % n_parallel]
            jobs += [(None, crawler, lines[i], i)]

        pooling = PbarPooling(n_parallel=n_parallel,
                              func=pomppu_singlepage_crawl,
                              child_timeout=2)
        try:
            # pooling.map(func=pomppu_singlepage_crawl, jobs=jobs)
            pooling.map(func=dummy_task, jobs=jobs)
        except KeyboardInterrupt:
            print('KeyboardInterrupt')
            break
        except BaseException as e:
            log_error_trace(print, e)
        finally:
            # [driver.quit() for driver in drivers]
            pooling.save_fail_list()
            print('close drivers')
コード例 #2
0
 def fit(self, Xs, Ys):
     """Fit every classifier in ``self.pack`` on (Xs, Ys).

     Ys is converted to index labels first.  A classifier that raises is
     logged with a warning and skipped; the remaining ones still fit.
     """
     Ys = self.np_arr_to_index(Ys)
     for name, clf in self.pack.items():
         try:
             clf.fit(Xs, Ys)
         except BaseException as e:
             log_error_trace(self.log.warn, e)
             self.log.warn(f'while fitting, {name} raise {e}')
コード例 #3
0
 def wrapper(*args, **kwargs):
     """Invoke the wrapped func, logging instead of propagating errors.

     KeyboardInterrupt is swallowed with a log message; any other
     Exception is logged with its traceback.  Returns func's result,
     or None when an error was intercepted.
     """
     log_func = args[0].log
     try:
         return func(*args, **kwargs)
     except KeyboardInterrupt:
         log_func("KeyboardInterrupt detected abort process")
     except Exception as e:
         log_error_trace(log_func, e)
コード例 #4
0
    def wrapper(*args, **kwargs):
        """Invoke the wrapped func; on any failure log a warning naming
        the key (args[2]) plus the full traceback, and return None."""
        instance, key = args[0], args[2]

        try:
            return func(*args, **kwargs)
        except BaseException as e:
            instance.log.warn(f'\nfail {func.__name__}, {key}\n')
            log_error_trace(instance.log.warn, e)
コード例 #5
0
def dump_and_load(pickler, obj, path='./pkl'):
    """Round-trip *obj* through *pickler* and return the reloaded copy.

    :param pickler: object exposing the pickle-module interface
        (``dump(obj, file)`` / ``load(file)``)
    :param obj: object to serialize
    :param path: intermediate file path (previously hard-coded to './pkl')
    :return: the object reloaded from disk; on failure the error is
        logged and *obj* is returned unchanged.  Bug fix: the reloaded
        object used to be discarded and the function always returned None.
    """
    try:
        with open(path, 'wb') as f:
            pickler.dump(obj, f)
        with open(path, 'rb') as f:
            obj = pickler.load(f)
    except BaseException as e:
        log_error_trace(pprint, e)
    return obj
コード例 #6
0
    def predict_confidence(self, Xs):
        """Collect confidence scores from every classifier in ``self.pack``.

        Classifiers that raise are logged and omitted from the result.

        :return: dict mapping classifier key -> confidence scores
        """
        result = {}
        for name in self.pack:
            try:
                result[name] = self.pack[name].predict_confidence(Xs)
            except BaseException as e:
                log_error_trace(self.log.warn, e,
                                f'while execute confidence at {name},\n')

        return result
コード例 #7
0
    def fit(self, x, y):
        """Fit every classifier in ``self.pack`` with a tqdm progress bar.

        Inputs are dataframe-decoded first, y is converted to index
        labels, and the class count is validated.  A classifier that
        raises is logged with a warning and skipped.
        """
        x, y = self._if_df_encode(x, y)
        label_Ys = self.np_arr_to_index(y)

        self._check_n_class(y)

        for name, clf in tqdm(self.pack.items()):
            tqdm.write(f'fit {name}')

            try:
                clf.fit(x, label_Ys)
            except BaseException as e:
                log_error_trace(self.log.warn, e)
                self.log.warn(f'while fit, {name} raise {e}')
コード例 #8
0
    def predict_confidence(self, x):
        """Collect per-classifier confidence scores with a tqdm progress bar.

        Failing classifiers are logged and omitted from the result.

        :return: dict mapping classifier key -> confidence scores
        """
        x = self._if_df_encode_x(x)

        result = {}
        for name in tqdm(self.pack):
            tqdm.write(f'predict_confidence {name}')

            try:
                result[name] = self.pack[name].predict_confidence(x)
            except BaseException as e:
                log_error_trace(self.log.warn, e,
                                f'while execute confidence at {name},\n')

        return result
コード例 #9
0
ファイル: PbarPooling.py プロジェクト: demetoir/MLtools
    def get(self):
        """Drain every per-pool result queue and return the collected results.

        Polls each pool's queue round-robin.  A child that fails (or times
        out after ``child_timeout``) is recorded in ``self.fail_list``, its
        pool is terminated and rebuilt, and that pool's remaining queued
        jobs are resubmitted to the fresh pool.  KeyboardInterrupt tears
        down all pools and re-raises.

        :return: list of child results; a failed or interrupted slot
            contributes None.
        """
        rets = []
        # keep polling until every pool's queue is empty
        while sum([q.qsize() for q in self.queues]) > 0:
            for pool_id in range(self.n_parallel):
                ret = None
                if self.queues[pool_id].qsize() == 0:
                    continue
                child, job = self.queues[pool_id].get()
                try:
                    # blocks up to child_timeout; raises on timeout or if
                    # the child raised
                    ret = child.get(timeout=self.child_timeout)
                except KeyboardInterrupt:
                    self.log("KeyboardInterrupt terminate pools\n"
                             "{fail}/{total} fail".format(
                                 fail=len(self.fail_list),
                                 total=len(self.jobs)))
                    self.terminate()
                    raise KeyboardInterrupt
                except BaseException as e:
                    log_error_trace(self.log, e)
                    # job[3] appears to be the job id — TODO confirm against
                    # how callers build job tuples
                    self.log("job fail, kill job={job}, child={child}".format(
                        child=str(None), job=str(job[3])))
                    self.pbar.update(1)
                    self.fail_list += [job]
                    # the worker may be wedged: kill the whole single-worker
                    # pool and replace it with a fresh one
                    self.pools[pool_id].terminate()
                    self.pools[pool_id].join()
                    self.pools[pool_id] = Pool(1,
                                               initializer=self.initializer,
                                               initargs=self.initargs)

                    # resubmit this pool's still-queued jobs to the new pool
                    new_queue = Queue()
                    while self.queues[pool_id].qsize() > 0:
                        _, job = self.queues[pool_id].get()
                        child = self.pools[pool_id].apply_async(
                            self.func, job, callback=self.update_pbar)
                        new_queue.put((child, job))

                    self.queues[pool_id] = new_queue

                finally:
                    # NOTE(review): also runs on the KeyboardInterrupt path,
                    # appending a trailing None before the re-raise
                    rets += [ret]

        self.pbar.close()
        self.log("{fail}/{total} fail".format(fail=len(self.fail_list),
                                              total=len(self.jobs)))
        self.log('end pooling queue')
        return rets
コード例 #10
0
def pomppu_singlepage_crawl(driver, crawler, url, job_id):
    """Crawl one pomppu board page plus its comment pages and save the text.

    :param driver: selenium webdriver already connected to a browser
    :param crawler: object with a ``save_html(text, path_tail)`` method
    :param url: page URL to fetch
    :param job_id: used to name the saved file (``<job_id>.txt``)
    Any failure is logged to stdout and swallowed; returns None.
    """
    try:
        # time.sleep(10)
        query = url_query_parser(url)
        driver.get(url)

        # if the page has a "pre" element, run JS to reveal the comment
        # pager (best-effort: a missing element just raises and is ignored)
        try:
            if driver.find_element_by_class_name("pre"):
                execute_js_show_comment_page(driver, query)
        except BaseException:
            pass

        # accumulates text fragments; joined with newlines at the end
        str_soup = []

        html = driver.page_source
        soup = BeautifulSoup(html, 'lxml')

        # main content
        main_content_args = ('td', {'class': "board-contents"})
        for item in soup.find_all(*main_content_args):
            str_soup += [item.text]

        # comment
        comment_args = ('textarea', {'class': 'ori_comment'})
        for item in soup.find_all(*comment_args):
            str_soup += [item.text]

        # walk comment pages 2..max_page, scraping comments from each page's
        # "quote" element (best-effort: any error aborts the walk silently)
        try:
            max_page = get_max_comment_page(soup)
            # print('{} max_page {}'.format(job_id, max_page))
            for i in range(2, max_page + 1):
                execute_js_go_comment_page(driver, query, i)
                element = driver.find_element_by_id("quote")

                part = str(element.get_attribute('innerHTML'))
                part = BeautifulSoup(part, 'lxml')

                # NOTE(review): this loop rebinds `soup`, shadowing the page
                # soup; harmless here since it is not used afterwards
                for soup in part.find_all(*comment_args):
                    str_soup += [str(soup.text)]
        except BaseException:
            pass

        str_soup = "\n".join(str_soup)
        crawler.save_html(str_soup, path_tail=str(job_id) + ".txt")
    except BaseException as e:
        log_error_trace(print, e)
コード例 #11
0
    def wrapper(*args, **kwargs):
        """Time the wrapped func and print the elapsed seconds.

        If the call took more than 60 seconds, the wall-clock timedelta is
        appended to the message.  Returns func's result, or None if it
        raised (the error is logged).
        """
        date = datetime.now()
        start = time.time()
        try:
            ret = func(*args, **kwargs)
        except BaseException as e:  # NOTE: also intercepts KeyboardInterrupt
            log_error_trace(print, e)
            ret = None
        finally:
            elapse_time = time.time() - start
            # bug fix: reuse the measured elapse_time instead of calling
            # time.time() a second time, which printed a slightly different
            # number than the one compared against the 60s threshold below
            msg = f"in {func.__name__}(), time {elapse_time:.4f}'s elapsed"

            if elapse_time > 60:
                now = datetime.now() - date
                msg += f", {now}"

            print(msg)
        return ret
コード例 #12
0
    def deco_hyperOpt_fn_wrapper(params):
        """Evaluate one hyperopt trial: run func on params, return a trial dict.

        The returned dict always carries 'loss', 'status', 'eval_time' and
        'params'.  On failure loss is np.inf and status is STATUS_FAIL.
        """
        start_time = time.time()
        trial = {
            'loss': None,
            'status': None,
            'eval_time': None,
            # 'other_stuff': None,
            # -- attachments are handled differently
            # 'attachments':
            #     {'time_module': None},
            # 'params': kwargs['params']
            'params': params
        }

        try:
            # NOTE(review): issubclass raises TypeError when func is a plain
            # function, landing in the BaseException handler below — confirm
            # that func is always a class or that this fallback is intended
            if issubclass(func, HyperOpt_fn):
                ret = func.fn(params, feed_args, feed_kwargs)
            else:
                ret = func(params, feed_args, feed_kwargs)

            # func may return a full trial dict or a bare loss value
            if type(ret) is dict:
                trial.update(ret)
            else:
                trial['loss'] = ret

            trial['status'] = STATUS_OK

        except BaseException as e:
            log_error_trace(print, e)
            trial['loss'] = np.inf
            trial['status'] = STATUS_FAIL
        finally:
            trial['eval_time'] = time.time() - start_time

            # NOTE(review): when maximizing (min_best False) a failed trial's
            # inf loss becomes -inf, which would rank as the best trial —
            # confirm this is intended
            if min_best is False:
                trial['loss'] = -trial['loss']

            # progress update is best-effort; pbar may be absent or closed
            try:
                pbar.update(1)
            except BaseException:
                pass
            # NOTE(review): `return` inside finally suppresses any exception
            # raised earlier in this finally block
            return trial
コード例 #13
0
    def join(self):
        """Wait for all child jobs, showing a tqdm progress bar.

        Children that time out are re-queued and polled again; children
        that fail are logged and counted as done.  KeyboardInterrupt
        propagates to the caller.
        """
        pbar = tqdm.tqdm(total=len(self.childs))

        pending = Queue()
        for job_id, async_result in self.childs.items():
            pending.put((job_id, async_result))

        while pending.qsize() > 0:
            job_id, async_result = pending.get()
            try:
                async_result.get(self.TIMEOUT)
                pbar.update(1)
            except TimeoutError:
                # not finished yet — push back and try again later
                pending.put((job_id, async_result))
            except KeyboardInterrupt:
                raise KeyboardInterrupt
            except BaseException as e:
                pbar.update(1)
                self.log.warn(f'job fail')
                log_error_trace(self.log.info, e)

        pbar.close()