def est_html_cdc(conp, f, **args):
    """Fetch pages for hrefs present in ``gg`` but missing from ``gg_html``.

    Selects the distinct ``href`` values of ``<schema>.gg`` that do not yet
    appear in ``<schema>.gg_html`` and whose ``info`` JSON either has no
    ``hreftype`` key or has ``hreftype`` equal to '可抓网页', then hands them
    to ``page().write``.

    Args:
        conp: Connection parameters forwarded to ``db_query`` and ``write``.
            ``conp[4]`` is interpolated into the SQL as the schema name.
        f: Forwarded unchanged to ``write`` as ``f`` (presumably the per-page
            fetch callback -- confirm against ``page.write``).
        **args: Extra keyword arguments. ``html_total`` caps how many hrefs
            are processed. Every entry (``html_total`` included) is merged
            over the default settings and forwarded to ``write``, so ``num``,
            ``headless`` etc. can be overridden here.

    Returns:
        None. When there is nothing to fetch a message is printed and
        ``write`` is not called.
    """
    m = page()
    schema = conp[4]
    sql = (
        "select distinct href from %s.gg "
        "where href not in(select href from %s.gg_html ) "
        "and (not coalesce(info,'{}')::jsonb?'hreftype' "
        "or coalesce(info,'{}')::jsonb->>'hreftype'='可抓网页')"
    ) % (schema, schema)
    df = db_query(sql, dbtype="postgresql", conp=conp)
    arr = df["href"].values

    # `arr == []` is never truthy for an empty NumPy array (and raises for
    # mismatched shapes on recent NumPy), so test the length instead. This
    # also works if `arr` is a plain list.
    if len(arr) == 0:
        print("无href更新")
        return None

    if "html_total" in args:
        arr = arr[:args["html_total"]]

    # Defaults first, caller-supplied keyword arguments win on conflict.
    setting = {
        "num": 5,
        "arr": arr,
        "f": f,
        "conp": conp,
        "tb": "gg_html",
        "headless": True,
        **args,
    }

    # Large backlogs get at least 20 for `num`, whatever was requested.
    if len(arr) > 2000 and setting["num"] < 20:
        setting["num"] = 20

    m.write(**setting)
def est_html_work(conp, f, **args):
    """Fetch pages for every eligible href in ``<schema>.gg``.

    Eligible rows are those whose ``info`` JSON has no ``hreftype`` key or
    has ``hreftype`` equal to '可抓网页'. Unlike the incremental variant, the
    query does not exclude hrefs already present in ``gg_html``.

    Args:
        conp: Connection parameters forwarded to ``db_query`` and ``write``.
            ``conp[4]`` is interpolated into the SQL as the schema name.
        f: Forwarded unchanged to ``write`` as ``f``.
        **args: Extra keyword arguments. ``size`` adds a SQL ``limit``;
            ``html_total`` truncates the fetched href array. Every entry is
            merged over the default settings and forwarded to ``write``.

    Returns:
        None.
    """
    row_limit = args.get("size")
    m = page()
    schema = conp[4]

    if row_limit is None:
        sql = (
            "select distinct href from %s.gg "
            "where not coalesce(info,'{}')::jsonb?'hreftype' "
            "or coalesce(info,'{}')::jsonb->>'hreftype'='可抓网页' "
        ) % schema
    else:
        sql = (
            "select distinct href from %s.gg "
            "where not coalesce(info,'{}')::jsonb?'hreftype' "
            "or coalesce(info,'{}')::jsonb->>'hreftype'='可抓网页' limit %d"
        ) % (schema, row_limit)

    frame = db_query(sql, dbtype="postgresql", conp=conp)
    hrefs = frame["href"].values
    if "html_total" in args:
        hrefs = hrefs[:args["html_total"]]

    # Show a small sample of what is about to be processed.
    print(hrefs[:3])

    options = {
        "num": 20,
        "arr": hrefs,
        "f": f,
        "conp": conp,
        "tb": "gg_html",
        "headless": True,
    }
    # Caller-supplied keyword arguments (including "num") override defaults.
    options.update(args)
    m.write(**options)
def html_cdc(conp, f, headless=True):
    """Fetch pages for hrefs present in ``gg`` but missing from ``gg_html``.

    Selects the distinct ``href`` values of ``<schema>.gg`` that do not yet
    appear in ``<schema>.gg_html`` and whose ``info`` JSON either has no
    ``hreftype`` key or has ``hreftype`` equal to '可抓网页', then hands them
    to ``page().write`` with ``num=5`` and ``tb="gg_html"``.

    Args:
        conp: Connection parameters forwarded to ``db_query`` and ``write``.
            ``conp[4]`` is interpolated into the SQL as the schema name.
        f: Forwarded unchanged to ``write`` as ``f`` (presumably the per-page
            fetch callback -- confirm against ``page.write``).
        headless: Forwarded to ``write`` as ``headless``.

    Returns:
        None. When there is nothing to fetch a message is printed and
        ``write`` is not called.
    """
    m = page()
    schema = conp[4]
    sql = (
        "select distinct href from %s.gg "
        "where href not in(select href from %s.gg_html ) "
        "and (not coalesce(info,'{}')::jsonb?'hreftype' "
        "or coalesce(info,'{}')::jsonb->>'hreftype'='可抓网页')"
    ) % (schema, schema)
    df = db_query(sql, dbtype="postgresql", conp=conp)
    arr = df["href"].values

    # `arr == []` is never truthy for an empty NumPy array (and raises for
    # mismatched shapes on recent NumPy), so test the length instead. This
    # also works if `arr` is a plain list.
    if len(arr) == 0:
        print("无href更新")
        return None

    setting = {
        "num": 5,
        "arr": arr,
        "f": f,
        "conp": conp,
        "tb": "gg_html",
        "headless": headless,
    }
    m.write(**setting)
def html_work(conp, f, size=None, headless=True):
    """Fetch pages for every eligible href in ``<schema>.gg``.

    Eligible rows are those whose ``info`` JSON has no ``hreftype`` key or
    has ``hreftype`` equal to '可抓网页'. The hrefs are handed to
    ``page().write`` with ``num=20`` and ``tb="gg_html"``.

    Args:
        conp: Connection parameters forwarded to ``db_query`` and ``write``.
            ``conp[4]`` is interpolated into the SQL as the schema name.
        f: Forwarded unchanged to ``write`` as ``f``.
        size: Optional row cap; when not None it is appended to the query as
            a SQL ``limit``.
        headless: Forwarded to ``write`` as ``headless``.

    Returns:
        None.
    """
    m = page()
    schema = conp[4]

    if size is None:
        sql = (
            "select distinct href from %s.gg "
            "where not coalesce(info,'{}')::jsonb?'hreftype' "
            "or coalesce(info,'{}')::jsonb->>'hreftype'='可抓网页' "
        ) % schema
    else:
        sql = (
            "select distinct href from %s.gg "
            "where not coalesce(info,'{}')::jsonb?'hreftype' "
            "or coalesce(info,'{}')::jsonb->>'hreftype'='可抓网页' limit %d"
        ) % (schema, size)

    frame = db_query(sql, dbtype="postgresql", conp=conp)
    hrefs = frame["href"].values

    # Show a small sample of what is about to be processed.
    print(hrefs[:3])

    m.write(
        num=20,
        arr=hrefs,
        f=f,
        conp=conp,
        tb="gg_html",
        headless=headless,
    )