Example #1
def _get_job_rule(pool: happybase.ConnectionPool,
                  job_name) -> crawler.CrawlJobCore:
    '''
        Fetch the crawl_job_core (crawl rule) stored in HBase.
    '''
    with pool.connection() as conn:
        try:
            conn: happybase.Connection
            table = conn.table(job_name)
            row = table.row(rule_row_key, columns=[
                rule_col,
            ])
            rule = row[bytes(rule_col, encoding="utf-8")].decode("utf-8")
            common.print_info("get crawl rule: {}".format(rule))
            crawl_job_core = crawler.CrawlJobCore.loads(rule)
            # TODO the key handling is still a bit off
            return crawl_job_core
        except Exception as e:
            common.print_exception(e)
            return None
        finally:
            conn.close()  # close the connection
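These HBase helpers reference names the excerpt never defines: a shared happybase.ConnectionPool plus constants such as rule_row_key, rule_col, results_family, rule_max_version, results_max_version and the results_col_pattern helper. The definitions below are only guesses, shown so the snippets can be read in isolation:

import happybase

# Assumed constants; the real project may use different names and values.
rule_row_key = "rule"               # row key under which the rule JSON is stored
rule_col = "rule:content"           # "family:qualifier" of the rule column
rule_max_version = 1
results_family = "results"          # column family that holds crawled data
results_max_version = 1


def results_col_pattern(i):
    # one qualifier per extracted element: "results:0", "results:1", ...
    return "{}:{}".format(results_family, i)


# One process-wide pool shared by every helper (host/port are placeholders).
pool = happybase.ConnectionPool(size=4, host="localhost", port=9090)

Note that Example #11 also passes rule_col as a column-family name to create_table, so the real schema may use a bare family name instead; the TODO in Example #1 suggests the key scheme was still in flux.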
Example #2
def _save_results(pool: happybase.ConnectionPool, crawl_job_core, url,
                  result_list) -> bool:
    '''
        Save crawl results to HBase.
        If result_list is empty, do nothing.
    '''
    if not result_list:
        return False
    core = crawl_job_core
    with pool.connection() as conn:
        try:
            conn: happybase.Connection
            table = conn.table(core.name)
            row_key = url
            table.put(row_key, {
                results_col_pattern(i): ele
                for i, ele in enumerate(result_list)
            })
            return True
        except Exception as e:
            common.print_exception(e)
            return False
        finally:
            conn.close()  # close the connection
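A hypothetical call, reusing the pool and a crawl_job_core loaded via _get_job_rule; the URL and result values are made up:

# hypothetical usage: store three extracted strings under the page's URL
saved = _save_results(pool, crawl_job_core,
                      url="http://example.com/page/1",
                      result_list=["title text", "author", "2020-01-01"])
if not saved:
    common.print_info("nothing saved (empty result list or HBase error)")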
Example #3
def _get_job_result(pool: happybase.ConnectionPool, crawl_job_name) -> list:
    '''
        Fetch the crawl results for a job.
    '''
    with pool.connection() as conn:
        try:
            conn: happybase.Connection
            table = conn.table(crawl_job_name)
            result_list = []
            for key, value in table.scan(include_timestamp=True):
                tmp = {}
                tmp['url'] = key.decode("utf-8")
                # map each column to a (cell value, timestamp) pair
                tmp['result'] = {
                    ele.decode("utf-8"):
                    (value[ele][0].decode("utf-8"), value[ele][1])
                    for ele in value
                }
                result_list.append(tmp)
            return result_list
        except Exception as e:
            common.print_exception(e)
            return None
        finally:
            conn.close()  # close the connection
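With include_timestamp=True, happybase returns each cell as a (value, timestamp) pair, so the list built above should look roughly like this (values are illustrative only):

# illustrative shape of _get_job_result(pool, "my_job")
sample_result = [
    {
        'url': 'http://example.com/page/1',
        'result': {
            # column -> (cell value, HBase timestamp in ms)
            'results:0': ('title text', 1597309200000),
            'results:1': ('author', 1597309200000),
        },
    },
]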
Example #4
def run_task_fetcher(self):
    '''
        Fetch tasks from Redis.
    '''
    while not self.end_flag:
        try:
            obj_tuple = QUEUE.get_wait(timeout=self.timeout)
            if obj_tuple is None:
                # nothing returned: the wait timed out
                continue
            _, content = obj_tuple
            task_info = CrawlTaskJson.from_json_str(content)
            # a job present in close_set has been closed manually
            if CLOSE_SET.is_member(task_info.job_name):
                common.print_info(
                    "this crawl_job has been closed: {}".format(
                        task_info.job_name))
                continue
            # make sure every URL is well-formed
            for url in task_info.urls:
                assert common.urltools.check_url(url)
            # block briefly so other nodes also get a chance to grab tasks
            time.sleep(0.5)
            self.add_urls(task_info.job_name, task_info.layer,
                          task_info.urls)
        except Exception as e:
            common.print_exception(e)
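QUEUE and CLOSE_SET are not part of the excerpt. Judging by the tuple unpacking and the None-on-timeout check, QUEUE.get_wait behaves like a blocking Redis pop and CLOSE_SET like a Redis set; a rough sketch of wrappers with that behaviour, purely as an assumption:

import redis

_r = redis.Redis()          # connection details are placeholders


class _TaskQueue:
    def __init__(self, key="crawl:tasks"):
        self.key = key

    def get_wait(self, timeout=5):
        # BLPOP returns (key, value) or None when the timeout expires,
        # matching how run_task_fetcher unpacks the result.
        item = _r.blpop(self.key, timeout=timeout)
        if item is None:
            return None
        key, content = item
        return key, content.decode("utf-8")


class _ClosedJobs:
    def __init__(self, key="crawl:closed_jobs"):
        self.key = key

    def is_member(self, job_name):
        return _r.sismember(self.key, job_name)


QUEUE = _TaskQueue()
CLOSE_SET = _ClosedJobs()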
Example #5
def work(self, tasks: common.LockedIterator):
    '''
        Receive crawl_tasks and execute them.
    '''
    while not self.end_flag:
        try:
            self.activate()
            for task in tasks:
                task(self.driver)
        except Exception as e:
            common.print_exception(e)
            continue
        finally:
            self.close()
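common.LockedIterator is not included in the excerpt; the name suggests an iterator that several worker threads can consume safely. A minimal sketch of such a wrapper, as an assumption:

import threading


class LockedIterator:
    '''Wrap an iterator so that next() calls are serialized across threads.'''

    def __init__(self, iterable):
        self._lock = threading.Lock()
        self._it = iter(iterable)

    def __iter__(self):
        return self

    def __next__(self):
        with self._lock:
            return next(self._it)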
Example #6
def process(self):
    while not self.quit:
        try:
            self.polling()
            found = self.queue.get(True, self.default_polling_interval)
            found.process(self)
        except queue.Empty:
            found = None
        except KeyboardInterrupt:
            found = None
            self.quit = True
        except Exception as e:
            print_exception()
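The loop above expects every queued item to expose a process(handler) method and relies on queue.Empty to pace the polling. A toy work item that would satisfy that contract, for illustration only:

import queue


class PrintJob:
    '''Example work item: anything with a process(handler) method fits.'''

    def __init__(self, message):
        self.message = message

    def process(self, handler):
        # `handler` is the polling object itself; real jobs would use it.
        print("processing:", self.message)


# enqueue work for the polling loop to pick up
work_queue = queue.Queue()
work_queue.put(PrintJob("hello"))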
Example #7
def _get_job_list(pool: happybase.ConnectionPool) -> list:
    '''
        Get the list of job (table) names stored in HBase.
    '''
    with pool.connection() as conn:
        try:
            conn: happybase.Connection
            table_list = conn.tables()
            return table_list
        except Exception as e:
            common.print_exception(e)
            return None
        finally:
            conn.close()  # close the connection
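happybase returns table names as bytes, so callers will usually want to decode them; a hypothetical usage:

tables = _get_job_list(pool)
if tables is not None:
    job_names = [name.decode("utf-8") for name in tables]
    common.print_info("jobs in HBase: {}".format(job_names))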
Example #8
def _remove_job(pool, crawl_job_name) -> bool:
    '''
        Delete a job (drop the table named after job_name).
        An HBase table must be disabled before it can be deleted.
    '''
    with pool.connection() as conn:
        try:
            conn: happybase.Connection
            conn.delete_table(crawl_job_name, disable=True)
            return True
        except Exception as e:
            common.print_exception(e)
            return False
        finally:
            conn.close()  # close the connection
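A hypothetical way to exercise it, reusing the pool and _get_job_list from the other examples:

# hypothetical usage: drop a job's table and confirm it is gone
if _remove_job(pool, "my_job"):
    remaining = _get_job_list(pool) or []
    assert b"my_job" not in remaining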
Example #9
def export_csv(service, docid, filename_template='%(title)s - %(sheet)s.csv'):

    error_count = common.error_count  # note the original error count

    # accept either a bare document id or a full Sheets URL
    if docid and re.match("^https://docs.google.com/spreadsheets/d/", docid, re.IGNORECASE):
        docid = docid.split("/")[5]

    try:
        for (doc, sheet), rows in itersheets(service, docid):
            file_name = filename_template % {'title': doc, 'sheet': sheet}
            file_path = os.path.join(common.args[TITLE_DIR], common.args[TITLE_PREFIX], file_name)
            common.print_notice("Saving \"%s\" sheet to file: %s" % (common.colour_text(sheet), common.colour_text(file_name, common.COLOUR_GREEN)))
            with open(file_path, 'wb') as fd:
                write_csv(service, fd, rows)
    except Exception as e:
        common.print_exception(e)
    return error_count == common.error_count
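itersheets, write_csv and the authenticated service object come from the surrounding project and are not shown. A hypothetical call, passing a full Sheets URL (a bare document id would also work):

# hypothetical usage; `service` is an already-authorized Google Sheets client
ok = export_csv(service,
                "https://docs.google.com/spreadsheets/d/EXAMPLE_DOC_ID/edit")
if not ok:
    common.print_notice("export finished with errors")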
Example #10
def _set_job_rule(pool: happybase.ConnectionPool, crawl_job_core) -> bool:
    '''
        Update the crawl rule for an existing job.
    '''
    core = crawl_job_core
    with pool.connection() as conn:
        try:
            conn: happybase.Connection
            table = conn.table(core.name)
            table.put(rule_row_key, {
                rule_col: core.dumps(),
            })
            return True
        except Exception as e:
            common.print_exception(e)
            return False
        finally:
            conn.close()  # close the connection
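Combined with _get_job_rule from Example #1, a rule can be updated with a read-modify-write round trip; a hypothetical sketch (the seeds attribute is assumed, only name, dumps() and loads() appear in the excerpts):

# hypothetical usage: fetch the stored rule, tweak it, write it back
core = _get_job_rule(pool, "my_job")
if core is not None:
    core.seeds.append("http://example.com/new-seed")   # assumed attribute
    if not _set_job_rule(pool, core):
        common.print_info("failed to update rule for {}".format(core.name))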
Example #11
def _save_job(pool: happybase.ConnectionPool, crawl_job_core) -> bool:
    '''
        Store the crawl_job_core (crawl rule) in HBase.
    '''
    core = crawl_job_core
    with pool.connection() as conn:
        try:
            conn: happybase.Connection
            conn.create_table(name=core.name, families={
                rule_col: dict(max_versions=rule_max_version),
                results_family: dict(max_versions=results_max_version),
            })
            table = conn.table(core.name)
            table.put(rule_row_key, {
                rule_col: core.dumps()
            })
            return True
        except Exception as e:
            common.print_exception(e)
            return False
        finally:
            conn.close()  # close the connection
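Creating a brand-new job would combine a CrawlJobCore with this helper. The constructor call below is an assumption; the excerpts only confirm the name attribute and the dumps()/loads() methods:

# hypothetical usage
new_core = crawler.CrawlJobCore(name="my_job")   # assumed constructor
if _save_job(pool, new_core):
    common.print_info("created HBase table and stored rule for my_job")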