コード例 #1
0
ファイル: AddInitUrl.py プロジェクト: logonmy/XX
    def addInitUrlFromCheck(hcfg, rcfg, getRow, ts=0):
        """Promote unchecked start URLs from ``*:start_urls:check`` lists.

        Runs forever. For every Redis list matching ``*:start_urls:check`` one
        JSON payload is popped per pass; its ``url`` is pushed onto the list
        named like the key without the trailing ``:check`` unless the row
        lookup reports that the URL is already stored.

        Args:
            hcfg: HBase configuration passed to
                ``HappyBaseHeleper.get_connection_by_cfg``.
            rcfg: Redis configuration; ``rcfg["host"]`` is used in the idle
                message.
            getRow: callable invoked as ``getRow(url=url)``; a falsy result
                means no row key could be derived for the URL.
            ts: value forwarded to ``BF.print_from_head`` when idle and the
                number of seconds slept after each key.
        """
        import XX.DB.HappyBaseHelper as HaB

        conn_redis = RedisHelper.get_redis_connect_by_cfg(rcfg)
        conn_hbase = HaB.HappyBaseHeleper.get_connection_by_cfg(hcfg)
        while 1:
            keys = conn_redis.keys("*:start_urls:check")
            if not keys:
                BF.print_from_head("No More Check IU in " + str(rcfg["host"]),
                                   ts=ts)
                continue
            for key in keys:
                raw = conn_redis.lpop(key)
                if raw is None:
                    # The list was emptied between KEYS and LPOP (e.g. by
                    # another consumer); json.loads(None) would raise.
                    time.sleep(ts)
                    continue
                jd = json.loads(raw)
                url = jd["url"]
                if url:
                    # NOTE(review): ``table`` is obtained but never passed to
                    # get_row() below -- confirm whether get_row should
                    # receive it. The call is kept in case it has side effects.
                    table = HaB.HappyBaseHeleper.get_table("crawl_" +
                                                           jd["project"],
                                                           conn=conn_hbase)
                    # Does the row already exist in HBase?
                    row = getRow(url=url)
                    if row:
                        exists = HaB.HappyBaseHeleper.get_row(row)
                        if not exists:
                            # key[:-6] drops the trailing ":check".
                            res = conn_redis.lpush(key[:-6], url)
                            print("Add new IU res \t\t" + str(res))
                        else:
                            print("Already Crawled!\t\t" + url)
                    else:
                        print("==== No row key", jd)
                time.sleep(ts)
コード例 #2
0
ファイル: PipeLine.py プロジェクト: logonmy/XX
 def process_item(self, item, spider):
     """Publish the processed item as JSON to the topic named after the spider.

     The topic is looked up in ``self.client.topics`` by ``spider.name``. The
     item is first passed through ``chtml.parseDict``; that processed item is
     both serialized and returned.
     """
     producer = self.client.topics[spider.name].get_producer()
     # Data processing happens before serialization.
     item = chtml.parseDict(item)
     payload = json.dumps(item, ensure_ascii=False)
     producer.produce(payload)
     bf.printFromHead(spider.name + "\tAdd kafka")
     return item
コード例 #3
0
def re_add_not200(rcfg=RC.ali2_cfg(db=0), ts=10):
    """Re-queue URLs found in ``*not200*`` Redis sets, once per URL.

    Runs forever. One member is popped from every matching set per pass. A URL
    is pushed onto the list named ``key[:-7]`` only the first time it is added
    to the ``s_not_200_urls`` set; later sightings are reported as already
    retried.

    Args:
        rcfg: Redis configuration; ``rcfg["host"]`` is used in the idle
            message.
        ts: value forwarded to ``BF.print_from_head`` when idle and the number
            of seconds slept after each key.
    """
    conn_redis = RedisHelper.get_redis_connect_by_cfg(rcfg)
    while True:
        not200_keys = conn_redis.keys("*not200*")
        if not not200_keys:
            BF.print_from_head("No More not 200 Spider in " + str(rcfg["host"]), ts=ts)
            continue
        for key in not200_keys:
            url = conn_redis.spop(key)
            if not url:
                logger.info("No url in set \t" + str(key))
            elif conn_redis.sadd("s_not_200_urls", url):
                # presumably strips a 7-character suffix such as ":not200"
                # to get the spider's queue name -- confirm against producers
                target = key[:-7]
                pushed = conn_redis.lpush(target, url)
                logger.info("Readd url res is\t" + str(pushed) + "\tkey is\t" + target + "\t url is \t" + url)
            else:
                print("Retry already!")
            time.sleep(ts)
コード例 #4
0
 def redis2mysql(self, **kw):
     """Drain ``<spider>:items`` from Redis and hand each item to ``func``.

     Keyword Args:
         spider: queue name prefix; items are popped from ``spider + ":items"``.
         func: callable invoked as ``func(json_data, self.conn_mysql)``.
         debug: when truthy, every popped item is pushed back onto the list
             before it is processed.
         ts: seconds to sleep when the queue is empty (default 5).
         once: when truthy, return after the first time the queue is found
             empty; otherwise keep polling forever.
     """
     queue_key = kw.get("spider") + ":items"
     spider = kw.get("spider")
     func = kw.get("func")
     while 1:
         json_str = self.conn_redis.lpop(queue_key)
         if json_str is not None and kw.get("debug"):
             # Put the item back; never push None when the queue is empty.
             self.conn_redis.lpush(queue_key, json_str)
         if json_str:
             # json.loads() lost its ``encoding`` keyword in Python 3.9
             # (it had been ignored since 3.1), so it must not be passed.
             json_data = json.loads(json_str)
             func(json_data, self.conn_mysql)
         else:
             bf.print_no_end(spider + "\tNo more item")
             time.sleep(kw.get("ts", 5))
             if kw.get("once"):
                 print("One circle over")
                 break
コード例 #5
0
def redis2mysql(**kw):
    """Drain ``<spider>:items`` from Redis, process each item and commit.

    Keyword Args:
        redis_cfg: configuration passed to
            ``udr.RedisHelper.get_redis_connect_by_cfg``.
        mysql_cfg: configuration passed to
            ``sa.SqlAlchemyHelper.get_session_by_cfg``.
        spider: queue name prefix; items are popped from ``spider + ":items"``.
        func: callable invoked as ``func(json_data, conn_mysql)``; the session
            is committed after every call.
        debug: when truthy, every popped item is pushed back onto the list
            before it is processed.
        ts: seconds to sleep when the queue is empty (default 5).
        once: when truthy, return after the first time the queue is found
            empty; otherwise keep polling forever.
    """
    conn_redis = udr.RedisHelper.get_redis_connect_by_cfg(kw.get("redis_cfg"))
    conn_mysql = sa.SqlAlchemyHelper.get_session_by_cfg(kw.get("mysql_cfg"))
    spider = kw.get("spider")
    queue_key = spider + ":items"
    func = kw.get("func")
    while 1:
        json_str = conn_redis.rpop(queue_key)
        if json_str is not None and kw.get("debug"):
            # Put the item back; never push None when the queue is empty.
            print("+++>>> Readd=====")
            conn_redis.lpush(queue_key, json_str)
        if json_str:
            # json.loads() lost its ``encoding`` keyword in Python 3.9
            # (it had been ignored since 3.1), so it must not be passed.
            json_data = json.loads(json_str)
            func(json_data, conn_mysql)
            conn_mysql.commit()
        else:
            bf.print_from_head(spider + "\tNo more item \t")
            time.sleep(kw.get("ts", 5))
            if kw.get("once"):
                print("One circle over")
                break
コード例 #6
0
ファイル: CacheHelper.py プロジェクト: logonmy/XX
def cache_file_2_hbase(root_path, hb_cfg, table_name, pro_num=0):
    """Load pickled responses found under ``root_path`` into an HBase table.

    Only files whose name starts with ``cc.WORDS16[pro_num]`` are handled.
    Each file is unpickled and stored under the row key
    ``<spider>_<md5(response.url)>`` in table ``"crawl_" + table_name``; rows
    that already exist are skipped.

    Args:
        root_path: directory handed to ``FH.FileHelper.getFileList``.
        hb_cfg: keyword arguments for ``happybase.Connection``.
        table_name: table name without the ``crawl_`` prefix.
        pro_num: index into ``cc.WORDS16`` selecting the file-name prefix this
            call is responsible for.
    """
    conn_hbase = happybase.Connection(**hb_cfg)
    try:
        table = conn_hbase.table("crawl_" + table_name)
        for fp, fn in FH.FileHelper.getFileList(root_path):
            if not fn.startswith(cc.WORDS16[pro_num]):
                continue
            # The spider name is the 4th path component from the end of fp.
            spider = fp.split(os.sep)[-4]
            # NOTE(review): pickle.load can execute arbitrary code; this is
            # only safe if the cache files are written by trusted code.
            with open(os.path.join(fp, fn), "rb") as cache_file:
                response = pickle.load(cache_file)
            row = spider + "_" + Enc.Encrypt.md5(response.url)
            if table.row(row):
                BF.print_from_head("Exists\t" + row)
                continue
            data = {
                "source:url": str(response.url),
                "source:status_code": str(response.status),
                "source:html": str(response.text),
                "source:type": "html",
                "source:size": str(len(response.text)),
                "source:encoding": response.encoding
            }
            table.put(row, data)
            logger.info(row)
    finally:
        # Release the HBase transport even if a file fails to load.
        conn_hbase.close()
コード例 #7
0
ファイル: DataProducer.py プロジェクト: logonmy/XX
    def json_2_redis(*args, **kw):
        """Push every line of a text file onto the ``<spider>:items`` list.

        Keyword Args:
            rcfg: Redis configuration; the call is a no-op (with a message)
                when it is missing or falsy.
            fp: path of the UTF-8 file to read, one item per line.
            spider: queue name prefix; lines go to ``spider + ":items"``.
            ts: seconds to sleep before pushing a line when the list already
                holds more than 50000 entries (default 1).
            rename: when truthy, the file is renamed to ``str(fp) + "1"``
                after it has been read, and files whose path starts with
                ``dt.get_today()`` (with ``-`` replaced by ``_``) are skipped.
        """
        rcfg = kw.get("rcfg")
        if not rcfg:
            print("No rcfg" + "===" * 10)
            return
        rename = kw.get("rename", 0)
        conn_redis = dr.RedisHelper.get_redis_connect_by_cfg(rcfg)
        try:
            fp = kw.get("fp", "")
            ts = kw.get("ts", 1)
            spider = kw.get("spider")
            if rename and str(fp).startswith(dt.get_today().replace("-", "_")):
                return

            queue_key = spider + ":items"
            # Use the context manager so the file is closed before renaming.
            with open(fp, encoding="utf-8") as src:
                for line in src:
                    length = conn_redis.llen(queue_key)
                    if length > 50000:
                        # Throttle once per line; the line is still pushed.
                        bf.print_from_head(str(fp) +
                                           "\t Too much,Please customer\t" +
                                           str(length) + "\t\t")
                        time.sleep(ts)
                    bf.print_blank_end(conn_redis.lpush(queue_key, line))
            if rename:
                uf.FileHelper.rename_file(fp, str(fp) + "1")
            print("=====File Over\t" + str(fp) + "=====")
        finally:
            # Release pooled connections on success, early return and error.
            conn_redis.connection_pool.disconnect()
コード例 #8
0
ファイル: AddInitUrl.py プロジェクト: logonmy/XX
    def add_table_column2redis(pro_num,
                               *args,
                               column=None,
                               url_fun=None,
                               process_num=10,
                               fn="",
                               spider="",
                               module_name="",
                               class_name=None,
                               r_cfg=None,
                               m_cfg=None,
                               service=True,
                               from_id=None,
                               limit=3000,
                               **kwargs):
        """Feed URLs taken from a table column into a spider's Redis queue.

        Rows are fetched in batches of 10 through
        ``model_class.getByFromIdAndMod(from_id, process_num, pro_num, session,
        limit=10)``. The id of the last handled row is written to Redis after
        every row so a later call can resume from it.

        Args:
            pro_num: number of this worker; also scales the idle sleep
                (``2 * (pro_num + 1)`` seconds).
            column: attribute name read from each row's ``__dict__``.
            url_fun: optional callable turning the column value into a URL.
            process_num: forwarded to ``getByFromIdAndMod``.
            fn: part of the Redis key that stores the resume id.
            spider: queue name prefix and bloom filter key.
            module_name: module that defines the model class.
            class_name: name of the model class inside ``module_name``.
            r_cfg: Redis configuration.
            m_cfg: database configuration for the SQLAlchemy session helper.
            service: when true keep polling after the table is exhausted,
                otherwise return.
            from_id: id to resume from; read from Redis (default 0) when None.
            limit: producing pauses while the queue is longer than this.
            **kwargs: ``suffix`` (queue suffix, default ``":start_urls"``),
                ``del_q`` (delete the queue first), ``bf`` (skip URLs already
                known to the bloom filter and record newly pushed ones).
        """
        session = sa.SqlAlchemyHelper.get_session_by_cfg(m_cfg)
        conn_redis = ur.RedisHelper.get_redis_connect_by_cfg(r_cfg)

        # Loop invariants are computed once instead of on every iteration.
        queue_key = spider + kwargs.get("suffix", ":start_urls")
        progress_key = ("kid_" + str(fn) + "_" + class_name + "_" +
                        str(pro_num) + "_from_id")
        idle_seconds = 2 * (pro_num + 1)

        if kwargs.get("del_q"):
            conn_redis.delete(queue_key)
        if from_id is None:
            from_id = conn_redis.get(progress_key) or 0
            logger.info("From id is \t" + str(from_id))

        model_class = getattr(importlib.import_module(module_name),
                              class_name)
        bloom_filter = None
        if kwargs.get("bf"):
            bloom_filter = BloomFilter.BloomFilter(conn_redis, key=spider)

        while 1:
            if conn_redis.llen(queue_key) > limit:
                BF.print_from_head("===Too much\t" + class_name + "\t")
                time.sleep(idle_seconds)
                continue
            infos = model_class.getByFromIdAndMod(from_id,
                                                  process_num,
                                                  pro_num,
                                                  session,
                                                  limit=10)
            if not infos:
                if not service:
                    return
                BF.print_from_head("No More\t" + class_name + "\t")
                time.sleep(idle_seconds)
                # presumably ends the current transaction so the next query
                # sees newly committed rows -- TODO confirm
                session.commit()
                continue

            for info in infos:
                value = info.__dict__.get(column)
                url = url_fun(value) if url_fun else value
                if url:
                    url = url.strip()
                    if bloom_filter is not None and bloom_filter.is_exists(url):
                        BF.print_no_end("-")
                    else:
                        res = conn_redis.lpush(queue_key, url)
                        logger.info(str((spider, res, info.id, url)))
                        if bloom_filter is not None:
                            bloom_filter.add(url)

                # Persist progress after every row, pushed or not.
                from_id = info.id
                conn_redis.set(progress_key, from_id)