def addInitUrlFromCheck(hcfg, rcfg, getRow, ts=0):
    import XX.DB.HappyBaseHelper as HaB
    conn_redis = RedisHelper.get_redis_connect_by_cfg(rcfg)
    conn_hbase = HaB.HappyBaseHeleper.get_connection_by_cfg(hcfg)
    # pool = HaB.HappyBaseHeleper.getPoolByCfg(hcfg)
    while 1:
        keys = conn_redis.keys("*:start_urls:check")
        if not keys:
            BF.print_from_head("No More Check IU in " + str(rcfg["host"]), ts=ts)
            continue
        for key in keys:
            jd = json.loads(conn_redis.lpop(key))
            url = jd["url"]
            if url:
                # table = HaB.HappyBaseHeleper.getTable("crawl_" + jd["project"], pool=pool)
                table = HaB.HappyBaseHeleper.get_table("crawl_" + jd["project"], conn=conn_hbase)
                # Check whether the row already exists in HBase
                row = getRow(url=url)
                if row:
                    exists = HaB.HappyBaseHeleper.get_row(row)
                    if not exists:
                        # Not crawled yet: push the URL back onto the real start_urls queue
                        # (key[:-6] strips the trailing ":check" suffix)
                        res = conn_redis.lpush(key[:-6], url)
                        print("Add new IU res \t\t" + str(res))
                    else:
                        print("Already Crawled!\t\t" + url)
                else:
                    print("==== No row key", jd)
        time.sleep(ts)
def process_item(self, item, spider):
    topicdocu = self.client.topics[spider.name]
    producer = topicdocu.get_producer()
    # Process the item data before serialization
    item = chtml.parseDict(item)
    json_str = json.dumps(item, ensure_ascii=False)
    producer.produce(json_str)
    bf.printFromHead(spider.name + "\tAdd kafka")
    return item
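# The process_item pipeline above assumes self.client is already a Kafka
# client exposing topics[...] / get_producer(), which matches pykafka's API.
# A minimal sketch (an assumption, not the project's actual setup) of wiring
# that client up in a Scrapy pipeline; KAFKA_HOSTS is a hypothetical setting.
# Note that pykafka's produce() expects bytes, so json_str may need
# .encode("utf-8") before being produced.
from pykafka import KafkaClient


class KafkaPipeline(object):
    def __init__(self, hosts):
        # One client per pipeline; topics and producers are looked up per
        # spider inside process_item
        self.client = KafkaClient(hosts=hosts)

    @classmethod
    def from_crawler(cls, crawler):
        # Assumed setting name, e.g. "127.0.0.1:9092"
        return cls(crawler.settings.get("KAFKA_HOSTS", "127.0.0.1:9092"))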
def re_add_not200(rcfg=RC.ali2_cfg(db=0), ts=10):
    conn_redis = RedisHelper.get_redis_connect_by_cfg(rcfg)
    while 1:
        keys = conn_redis.keys("*not200*")
        if not keys:
            BF.print_from_head("No More not 200 Spider in " + str(rcfg["host"]), ts=ts)
            continue
        for key in keys:
            url = conn_redis.spop(key)
            if url:
                if conn_redis.sadd("s_not_200_urls", url):
                    logger.info("Readd url res is\t" + str(conn_redis.lpush(key[:-7], url)) + "\tkey is\t" + key[:-7] + "\t url is \t" + url)
                else:
                    print("Retry already!")
            else:
                logger.info("No url in set \t" + str(key))
        time.sleep(ts)
def redis2mysql(self, **kw):
    while 1:
        spider = kw.get("spider")
        json_str = self.conn_redis.lpop(spider + ":items")
        if kw.get("debug") and json_str:
            # Put the item back so debug runs do not consume the queue
            self.conn_redis.lpush(spider + ":items", json_str)
        if json_str:
            json_data = json.loads(json_str)
            func = kw.get("func")
            # func(json_data, kw.get("mysql_cfg"))
            func(json_data, self.conn_mysql)
        else:
            bf.print_no_end(spider + "\tNo more item")
            time.sleep(kw.get("ts", 5))
        if kw.get("once"):
            print("One circle over")
            break
def redis2mysql(**kw):
    conn_redis = udr.RedisHelper.get_redis_connect_by_cfg(kw.get("redis_cfg"))
    conn_mysql = sa.SqlAlchemyHelper.get_session_by_cfg(kw.get("mysql_cfg"))
    while 1:
        spider = kw.get("spider")
        json_str = conn_redis.rpop(spider + ":items")
        if kw.get("debug") and json_str:
            # Put the item back so debug runs do not consume the queue
            print("+++>>> Readd=====")
            conn_redis.lpush(spider + ":items", json_str)
        if json_str:
            json_data = json.loads(json_str)
            func = kw.get("func")
            func(json_data, conn_mysql)
            conn_mysql.commit()
        else:
            # bf.printFromHead(spider + "\tNo more item")
            bf.print_from_head(spider + "\tNo more item \t")
            time.sleep(kw.get("ts", 5))
        if kw.get("once"):
            print("One circle over")
            break
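# Both redis2mysql variants delegate the actual write to a caller-supplied
# func(json_data, session). A minimal sketch of such a handler, assuming a
# hypothetical SQLAlchemy model NewsItem (not part of the source); the
# handler does not commit, since redis2mysql calls conn_mysql.commit() itself.
from sqlalchemy import Column, Integer, String
from sqlalchemy.ext.declarative import declarative_base

Base = declarative_base()


class NewsItem(Base):
    # Hypothetical table; the real models live in the project's model modules
    __tablename__ = "news_item"
    id = Column(Integer, primary_key=True)
    url = Column(String(512))
    title = Column(String(256))


def save_news_item(json_data, session):
    # Skip rows whose url is already stored
    if session.query(NewsItem).filter_by(url=json_data.get("url")).first():
        return
    session.add(NewsItem(url=json_data.get("url"), title=json_data.get("title")))

# Example invocation (redis_cfg / mysql_cfg are assumed config dicts):
# redis2mysql(spider="news", func=save_news_item,
#             redis_cfg=redis_cfg, mysql_cfg=mysql_cfg, ts=5)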
def cache_file_2_hbase(root_path, hb_cfg, table_name, pro_num=0):
    conn_hbase = happybase.Connection(**hb_cfg)
    table = conn_hbase.table("crawl_" + table_name)
    for fp, fn in FH.FileHelper.getFileList(root_path):
        if not fn.startswith(cc.WORDS16[pro_num]):
            continue
        spider = fp.split(os.sep)[-4]
        response = pickle.load(open(fp + os.sep + fn, "rb"))
        row = spider + "_" + Enc.Encrypt.md5(response.url)
        if table.row(row):
            BF.print_from_head("Exists\t" + row)
            continue
        data = {
            "source:url": str(response.url),
            "source:status_code": str(response.status),
            "source:html": str(response.text),
            "source:type": "html",
            "source:size": str(len(response.text)),
            "source:encoding": response.encoding
        }
        table.put(row, data)
        logger.info(row)
def json_2_redis(*args, **kw):
    rcfg = kw.get("rcfg")
    if not rcfg:
        print("No rcfg" + "===" * 10)
        return
    rename = kw.get("rename", 0)
    conn_redis = dr.RedisHelper.get_redis_connect_by_cfg(rcfg)
    fp = kw.get("fp", "")
    ts = kw.get("ts", 1)
    spider = kw.get("spider")
    # In rename mode, skip files whose name starts with today's date
    if rename and str(fp).startswith(dt.get_today().replace("-", "_")):
        return
    for line in open(fp, encoding="utf-8"):
        length = conn_redis.llen(spider + ":items")
        if length > 50000:
            bf.print_from_head(fp + "\t Too much, please consume\t" + str(length) + "\t\t")
            time.sleep(ts)
        bf.print_blank_end(conn_redis.lpush(spider + ":items", line))
    if rename:
        uf.FileHelper.rename_file(fp, str(fp) + "1")
    print("=====File Over\t" + fp + "=====")
    conn_redis.connection_pool.disconnect()
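# A hedged usage sketch for json_2_redis: walk a directory of exported item
# files (one JSON object per line) and push each file into the spider's items
# queue. The directory layout and the rcfg contents are assumptions, not
# taken from the source.
import os

rcfg = {"host": "127.0.0.1", "port": 6379, "db": 0}
export_dir = "./exports/news"
for name in sorted(os.listdir(export_dir)):
    if name.endswith(".json"):
        json_2_redis(rcfg=rcfg, fp=os.path.join(export_dir, name),
                     spider="news", rename=1, ts=1)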
def add_table_column2redis(pro_num, *args, column=None, url_fun=None, process_num=10, fn="", spider="",
                           module_name="", class_name=None, r_cfg=None, m_cfg=None, service=True,
                           from_id=None, limit=3000, **kwargs):
    session = sa.SqlAlchemyHelper.get_session_by_cfg(m_cfg)
    conn_redis = ur.RedisHelper.get_redis_connect_by_cfg(r_cfg)
    if kwargs.get("del_q"):
        conn_redis.delete(spider + kwargs.get("suffix", ":start_urls"))
    if from_id is None:
        from_id = conn_redis.get("kid_" + str(fn) + "_" + class_name + "_" + str(pro_num) + "_from_id")
    from_id = from_id if from_id else 0
    logger.info("From id is \t" + str(from_id))
    while 1:
        # Back off while the target queue is longer than the limit
        if conn_redis.llen(spider + kwargs.get("suffix", ":start_urls")) > limit:
            BF.print_from_head("===Too much\t" + class_name + "\t")
            time.sleep(2 * (pro_num + 1))
            continue
        model_class = getattr(importlib.import_module(module_name), class_name)
        infos = model_class.getByFromIdAndMod(from_id, process_num, pro_num, session, limit=10)
        if infos:
            for info in infos:
                if url_fun:
                    url = url_fun(info.__dict__.get(column))
                else:
                    url = info.__dict__.get(column)
                if url:
                    url = url.strip()
                    if kwargs.get("bf"):
                        # Deduplicate with a Redis-backed Bloom filter before pushing
                        bloomFilter = BloomFilter.BloomFilter(conn_redis, key=spider)
                        if bloomFilter.is_exists(url):
                            BF.print_no_end("-")
                        else:
                            res = conn_redis.lpush(spider + kwargs.get("suffix", ":start_urls"), url)
                            logger.info(str((spider, res, info.id, url)))
                            bloomFilter.add(url)
                    else:
                        res = conn_redis.lpush(spider + kwargs.get("suffix", ":start_urls"), url)
                        logger.info(str((spider, res, info.id, url)))
                from_id = info.id
                # Persist progress so a restart can resume from the last processed id
                conn_redis.set("kid_" + str(fn) + "_" + class_name + "_" + str(pro_num) + "_from_id", from_id)
        else:
            if service:
                BF.print_from_head("No More\t" + class_name + "\t")
                time.sleep(2 * (pro_num + 1))
                session.commit()
            else:
                return
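# The pro_num / process_num pair suggests the table scan is sharded across
# workers, with getByFromIdAndMod presumably selecting rows whose id modulo
# process_num equals pro_num. A hedged sketch of launching one worker per
# shard with multiprocessing; the model module, class name, column and the
# config dict shapes below are all hypothetical.
from multiprocessing import Process

r_cfg = {"host": "127.0.0.1", "port": 6379, "db": 0}          # assumed Redis config shape
m_cfg = {"host": "127.0.0.1", "user": "root", "db": "crawl"}  # assumed MySQL config shape


def start_workers(process_num=10):
    workers = []
    for pro_num in range(process_num):
        p = Process(target=add_table_column2redis,
                    kwargs=dict(pro_num=pro_num, process_num=process_num,
                                column="url", spider="news",
                                module_name="models.news", class_name="NewsModel",
                                r_cfg=r_cfg, m_cfg=m_cfg, service=True))
        p.start()
        workers.append(p)
    for p in workers:
        p.join()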