def process_request(self, request, spider):
    """Record an outgoing request in the daily "to crawl" log.

    Builds the log path from the ``ROOT_PATH_LOG`` entry of
    ``ToCrawlUrl.settings``, the value of ``ctime.get_today()`` and the
    ``_to_crawl.log`` suffix, then hands ``log_file`` one tab-separated,
    newline-terminated record: current time, spider name, request URL.

    Args:
        request: Object exposing the target address as ``request.url``.
        spider: Object exposing its identifier as ``spider.name``.

    Returns:
        None (implicitly).
    """
    # Settings are read from the class attribute, not from ``self``.
    log_root = ToCrawlUrl.settings.get("ROOT_PATH_LOG")
    target_path = log_root + ctime.get_today() + "_to_crawl.log"

    fields = [str(ctime.get_now_time()), spider.name, request.url]
    record = "\t".join(fields) + "\n"

    # method="a" is forwarded to log_file as-is (presumably append mode --
    # confirm against log_file's definition).
    log_file(target_path, record, method="a")
def process_request(self, request, spider):
    """Compute the path of the daily "to crawl" log for this request.

    The path is the ``ROOT_PATH_LOG`` entry of ``self.settings`` followed by
    the value of ``Dt.get_today()`` and the ``_to_crawl.log`` suffix.
    Nothing is written yet: emitting the record is still a TODO.

    Args:
        request: The outgoing request (currently unused).
        spider: The spider issuing the request (currently unused).

    Returns:
        None (implicitly).
    """
    log_root = self.settings.get("ROOT_PATH_LOG")
    save_path = log_root + Dt.get_today() + "_to_crawl.log"
    # TODO: write a "<now>\t<spider.name>\t<request.url>\n" record for
    # save_path.  The drafted call
    #   logging.info(save_path, str(Dt.get_now_time()) + "\t" + spider.name
    #                + "\t" + request.url + "\n", method="a")
    # cannot be enabled as-is: logging.info() accepts no ``method`` keyword.
def process_response(self, request, response, spider):
    """Record a received response in the daily "crawled" log.

    Builds the log path from the ``ROOT_PATH_LOG`` entry of
    ``CrawledUrl.settings``, the value of ``ctime.get_today()`` and the
    ``_carwled.log`` suffix, then hands ``log_file`` one tab-separated,
    newline-terminated record: current time, spider name, response status,
    request URL.

    Args:
        request: Object exposing the fetched address as ``request.url``.
        response: Object exposing the result code as ``response.status``.
        spider: Object exposing its identifier as ``spider.name``.

    Returns:
        The ``response`` argument, unchanged.
    """
    # Settings are read from the class attribute, not from ``self``.
    log_root = CrawledUrl.settings.get("ROOT_PATH_LOG")
    # The "_carwled" spelling is the on-disk file name; kept exactly as is.
    target_path = log_root + ctime.get_today() + "_carwled.log"

    fields = [
        str(ctime.get_now_time()),
        spider.name,
        str(response.status),
        request.url,
    ]
    record = "\t".join(fields) + "\n"

    # method="a" is forwarded to log_file as-is (presumably append mode --
    # confirm against log_file's definition).
    log_file(target_path, record, method="a")
    return response
def process_response(self, request, response, spider):
    """Pass the response through; logging of the crawl record is still a TODO.

    The path of the daily "crawled" log is computed from the
    ``ROOT_PATH_LOG`` entry of ``self.settings``, the value of
    ``Dt.get_today()`` and the ``_carwled.log`` suffix, but nothing is
    written to it yet.

    Args:
        request: The request that produced ``response``.
        response: The response being processed.
        spider: The spider the response is intended for.

    Returns:
        The ``response`` argument, unchanged.  Previously the ``return``
        statement was swallowed by the trailing TODO comment, so the hook
        returned ``None``; a Scrapy ``process_response`` hook must return a
        Response or Request object.
    """
    # The "_carwled" spelling is the on-disk file name; kept exactly as is.
    # save_path is not consumed yet -- it is kept for the TODO below.
    save_path = self.settings.get("ROOT_PATH_LOG") + Dt.get_today() + "_carwled.log"
    # TODO: write a "<now>\t<spider.name>\t<response.status>\t<request.url>\n"
    # record for save_path.  The drafted call
    #   logging.info(save_path, str(Dt.get_now_time()) + "\t" + spider.name
    #                + "\t" + str(response.status) + "\t" + request.url
    #                + "\n", method="a")
    # cannot be enabled as-is: logging.info() accepts no ``method`` keyword.
    return response
def get_date_time(s):
    """Extract the first date-time stamp found in ``s``.

    A stamp is ``<year><sep><month><sep><day> HH:MM[:SS]`` where the year has
    2-4 digits, the separators are ``-``, ``/`` or the markers 年 / 月, and
    the day may be followed by 日.  In the returned text 年 and 月 become
    ``-`` and 日 / 号 are dropped; ``/`` separators are left untouched.

    Args:
        s: Text to search.

    Returns:
        ``""`` when ``s`` is empty or not a ``str``; the normalised text of
        the first match otherwise; ``None`` when nothing matches.
    """
    if not s or not isinstance(s, str):
        return ""

    # The optional seconds part must be a NON-capturing group.  With a
    # capturing group, re.findall() returned only that group (":30" or ""),
    # so the date-time itself was never produced.
    pattern = re.compile(
        r"\d{2,4}[-/年]\d{1,2}[-/月]\d{1,2}日? \d{2}:\d{2}(?::\d{2})?"
    )
    match = pattern.search(s)
    if match is None:
        # Same result as before (None), without printing an IndexError.
        return None

    # A match is at least 12 characters long and always contains digits, so
    # the former "今天" and length-4/5 (month-day only) branches could never
    # fire for this pattern and have been removed.
    return (
        match.group(0)
        .replace('年', '-')
        .replace('月', '-')
        .replace('日', '')
        .replace('号', '')
    )
def get_date(s):
    """Extract the first date found in ``s``.

    Recognised forms, tried at each position in this order:

    * ``YYYY<sep>M<sep>D`` with ``-``, ``/`` or ``.`` as separators;
    * ``YYYY年M月D日`` (or ``号`` instead of ``日``);
    * the literal ``今天``;
    * ``M月D日`` / ``M月D号`` without a year.

    In the returned text 年 and 月 become ``-`` and 日 / 号 are dropped;
    ``/`` and ``.`` separators are left untouched.

    Args:
        s: Text to search.

    Returns:
        ``""`` when ``s`` is empty or not a ``str``; ``None`` when nothing
        matches; ``DT.get_today()`` for ``今天``; otherwise the normalised
        date text.  A year-less match is prefixed with the part of
        ``DT.get_today()`` before its first ``-`` (assumed to be the year --
        confirm against DT) and a ``-`` separator.
    """
    if not s or not isinstance(s, str):
        return ""

    # Inside a character class "|" is a literal, so the former "[-|/|.]"
    # also accepted "2020|01|02".  "[-/.]" is what was meant.
    pattern = re.compile(
        r'\d{4}[-/.]\d{1,2}[-/.]\d{1,2}'
        r'|\d{4}年\d{1,2}月\d{1,2}[日号]'
        r'|今天'
        r'|\d{1,2}月\d{1,2}[日号]'
    )
    match = pattern.search(s)
    if match is None:
        # Same result as before (None), without printing an IndexError.
        return None

    raw = match.group(0)
    if raw == "今天":
        return DT.get_today()

    normalized = (
        raw.replace('年', '-').replace('月', '-').replace('日', '').replace('号', '')
    )

    # Only the year-less alternative contains 月 without 年.  The previous
    # length test (4 or 5) missed "1-2" (length 3), and the year was glued
    # on without a separator ("2024" + "12-25" -> "202412-25").
    if '月' in raw and '年' not in raw:
        year = DT.get_today().split("-")[0]
        return year + "-" + normalized
    return normalized
def inner(*args, **kwargs):
    """Append ``json_data`` to a per-spider daily JSON file, then call ``func``.

    Argument lookup:

    * fewer than two positional arguments: ``json_data``, ``spider`` and
      ``fp`` are read from the keyword arguments (``None`` when absent);
    * otherwise ``json_data`` is ``args[0]``, ``spider`` is ``args[1]`` and
      ``fp`` is ``args[3]``, falling back to the ``fp`` keyword when fewer
      than four positional arguments were given (this used to raise
      ``IndexError``).

    When ``json_data`` is truthy it is dumped (``ensure_ascii=False``) in
    append mode to ``fp + spider + os.sep + <today>.json``, where ``<today>``
    is ``dt.get_today()`` with ``-`` replaced by ``_``.

    Returns:
        Whatever ``func(*args, **kwargs)`` returns.
    """
    if len(args) < 2:
        json_data = kwargs.get("json_data")
        spider = kwargs.get("spider")
        fp = kwargs.get("fp")
    else:
        json_data = args[0]
        spider = args[1]
        # fp sits at positional index 3 (index 2 is not used here).
        fp = args[3] if len(args) > 3 else kwargs.get("fp")

    if json_data:
        today = dt.get_today().replace("-", "_")
        target_dir = fp + spider
        uf.FileHelper.mkdir(target_dir)
        dump_path = target_dir + os.sep + today + ".json"
        # Close the handle deterministically; it used to be left open.
        # NOTE(review): records are appended with no separator between
        # them -- confirm that whatever reads this file expects that.
        with open(dump_path, "a", encoding="utf-8") as out:
            json.dump(json_data, out, ensure_ascii=False)

    return func(*args, **kwargs)
def json_2_redis(*args, **kw):
    """Push every line of a text file onto the Redis list ``<spider>:items``.

    Positional arguments are ignored.  Keyword arguments:

    * ``rcfg``: configuration passed to
      ``dr.RedisHelper.get_redis_connect_by_cfg``; when missing or falsy a
      notice is printed and nothing else happens.
    * ``fp``: path of the file to read (default ``""``).
    * ``spider``: prefix of the Redis list key.
    * ``ts``: seconds to sleep when the list holds more than 50000 entries
      (default ``1``).
    * ``rename``: when truthy, a file whose ``fp`` starts with today's date
      (``dt.get_today()`` with ``-`` replaced by ``_``) is skipped, and a
      processed file is renamed to ``fp + "1"``.

    The input file is closed before the rename, and the Redis connection
    pool is disconnected on every exit path after the connection was
    obtained (including the skip and any exception).

    Returns:
        None.
    """
    rcfg = kw.get("rcfg")
    if not rcfg:
        print("No rcfg" + "===" * 10)
        return

    rename = kw.get("rename", 0)
    fp = kw.get("fp", "")
    ts = kw.get("ts", 1)
    spider = kw.get("spider")

    conn_redis = dr.RedisHelper.get_redis_connect_by_cfg(rcfg)
    try:
        # NOTE(review): the prefix test runs on the whole of fp, so it only
        # matches when fp is a bare file name -- confirm with callers.
        if rename and str(fp).startswith(dt.get_today().replace("-", "_")):
            return

        with open(fp, encoding="utf-8") as source:
            for line in source:
                queue_key = spider + ":items"
                length = conn_redis.llen(queue_key)
                if length > 50000:
                    # Back off once, then push regardless of the new length.
                    bf.print_from_head(
                        str(fp) + "\t Too much,Please customer\t"
                        + str(length) + "\t\t"
                    )
                    time.sleep(ts)
                bf.print_blank_end(conn_redis.lpush(queue_key, line))

        if rename:
            uf.FileHelper.rename_file(fp, str(fp) + "1")
        print("=====File Over\t" + str(fp) + "=====")
    finally:
        conn_redis.connection_pool.disconnect()