def prepare_lrus(lru, lruLinks, crawlMetas=None):
    """Build the annotated stem-node sequences for a crawled page and its links.

    The first entry of the returned list is ``LRUs.lru_to_stemnodes(lru)``
    with its last element flagged as crawled and filled with crawl metadata.
    It is followed by one entry per item of ``lruLinks`` (same order), whose
    last element is flagged as linked and placed one depth level below the
    page.

    Args:
        lru: LRU of the crawled page.
        lruLinks: iterable of LRUs linked from the page.
        crawlMetas: optional mapping that may hold ``"depth"``,
            ``"timestamp"``, ``"status"``, ``"error"`` and ``"encoding"``.
            Missing keys default to ``0``, the current time, ``200``,
            ``None`` and ``"utf-8"`` respectively.

    Returns:
        A list of the sequences returned by ``LRUs.lru_to_stemnodes``.
    """
    # A ``None`` sentinel avoids sharing one mutable default dict across calls.
    metas = {} if crawlMetas is None else crawlMetas
    now = int(time())

    # These two values are shared by the page and every link: read them once.
    depth = metas.get("depth", 0)
    timestamp = metas.get("timestamp", now)

    page_nodes = LRUs.lru_to_stemnodes(lru)
    page_leaf = page_nodes[-1]
    page_leaf["crawled"] = True
    page_leaf["crawlDepth"] = depth
    page_leaf["crawlTimestamp"] = timestamp
    page_leaf["crawlHTTPCode"] = metas.get("status", 200)
    page_leaf["crawlError"] = metas.get("error", None)
    page_leaf["pageEncoding"] = metas.get("encoding", "utf-8")
    lrus = [page_nodes]

    for link in lruLinks:
        link_nodes = LRUs.lru_to_stemnodes(link)
        link_leaf = link_nodes[-1]
        link_leaf["linked"] = True
        # Linked pages sit one level deeper than the page that links to them.
        link_leaf["crawlDepth"] = depth + 1
        link_leaf["crawlTimestamp"] = timestamp
        lrus.append(link_nodes)

    return lrus
def define_webentities(neo4j, lrus=TEST_DATA["manual_webentities"]):
    """Index the alternative prefixes of each LRU and create its web entity.

    For every LRU a web entity description is built, named with
    ``LRUs.name_lru`` and carrying the prefixes returned by
    ``LRUs.get_alt_prefixes``. All prefixes are first sent to the
    ``"index_lrus"`` write query (converted with ``LRUs.lru_to_stemnodes``),
    then the descriptions are sent to the ``"create_wes"`` write query.

    Args:
        neo4j: object exposing ``write_query(name, **params)``.
        lrus: iterable of LRUs; defaults to ``TEST_DATA["manual_webentities"]``.
    """
    wes = []
    prefixes_to_index = []
    # Single pass: the prefixes are computed once per LRU and reused for both
    # queries, so one-shot iterables are handled as well.
    for lru in lrus:
        name = LRUs.name_lru(lru)
        prefixes = LRUs.get_alt_prefixes(lru)
        wes.append({"name": name, "prefixes": prefixes})
        prefixes_to_index.extend(prefixes)

    neo4j.write_query(
        "index_lrus",
        lrus=[LRUs.lru_to_stemnodes(prefix) for prefix in prefixes_to_index],
    )
    neo4j.write_query("create_wes", webentities=wes)
def create_webentities(neo4j, lrus):
    """Index the prefixes of the given LRUs and create their web entities.

    One web entity description (``"prefixes"`` from
    ``LRUs.get_alt_prefixes``, ``"name"`` from ``LRUs.name_lru``) is built per
    LRU. Every prefix is indexed through the ``"index_lrus"`` write query,
    then the descriptions go to the ``"create_wes"`` write query. The
    counters of each query result are printed.

    Args:
        neo4j: object exposing ``write_query(name, **params)``.
        lrus: iterable of LRUs to turn into web entities.
    """
    webentities = [
        {
            "prefixes": LRUs.get_alt_prefixes(lru),
            "name": LRUs.name_lru(lru),
        }
        for lru in lrus
    ]
    # Flatten the prefixes of all entities, keeping entity order.
    all_prefixes = [
        prefix for entity in webentities for prefix in entity["prefixes"]
    ]
    stemnodes = [LRUs.lru_to_stemnodes(prefix) for prefix in all_prefixes]

    # NOTE: the counters are read through the result's private ``_summary``.
    index_result = neo4j.write_query("index_lrus", lrus=stemnodes)
    print(index_result._summary.counters.__dict__)

    creation_result = neo4j.write_query("create_wes", webentities=webentities)
    print(creation_result._summary.counters.__dict__)
def init_WE_creation_rules(neo4j, rules=TEST_DATA["WECRs"]):
    """Store the web entity creation rules and return their compiled regexps.

    Each rule is duplicated for every prefix returned by
    ``LRUs.get_alt_prefixes(rule["prefix"])``. The non-empty prefixes are
    indexed through the ``"index_lrus"`` write query and the expanded rules
    are sent to the ``"create_wecreationrules"`` write query.

    Args:
        neo4j: object exposing ``write_query(name, **params)``.
        rules: iterable of mappings with ``"prefix"`` and ``"pattern"`` keys;
            defaults to ``TEST_DATA["WECRs"]``.

    Returns:
        A dict keyed by ``prefix + pattern`` whose values are the compiled
        form of ``getPreset(pattern, prefix)``.
    """
    extended_rules = []
    for rule in rules:
        for alt_prefix in LRUs.get_alt_prefixes(rule["prefix"]):
            extended_rules.append(
                {"prefix": alt_prefix, "pattern": rule["pattern"]}
            )

    # Compile each rule's regexp once so it can be reused at runtime.
    WECR_regexps = {}
    for rule in extended_rules:
        key = rule["prefix"] + rule["pattern"]
        WECR_regexps[key] = re.compile(
            getPreset(rule["pattern"], rule["prefix"])
        )

    # Rules with an empty prefix are kept as rules but not indexed.
    prefixes_to_index = [
        rule["prefix"] for rule in extended_rules if rule["prefix"]
    ]
    stemnodes = [LRUs.lru_to_stemnodes(prefix) for prefix in prefixes_to_index]
    neo4j.write_query("index_lrus", lrus=stemnodes)
    neo4j.write_query("create_wecreationrules", rules=extended_rules)
    return WECR_regexps
def run_WE_creation_rule(neo4j, lastcheck):
    """Create web entities for the LRUs selected by the creation rule query.

    Runs the ``"we_apply_creation_rule"`` read query with ``lastcheck``,
    collects the ``"lru"`` field of every returned record and hands those
    LRUs to ``create_webentities``, which indexes their prefixes, creates the
    web entities and prints the counters of both write queries.

    Note: an earlier, disabled variant used the ``"we_default_creation_rule"``
    query and read an ``"lrus"`` field from its first record.

    Args:
        neo4j: object exposing ``read_query`` and ``write_query``.
        lastcheck: value forwarded to the read query as ``lastcheck``.
    """
    we_prefixes = neo4j.read_query("we_apply_creation_rule", lastcheck=lastcheck)
    lrus = [record["lru"] for record in we_prefixes.records()]
    # The creation steps are identical to create_webentities: reuse it rather
    # than keeping a second copy of the same code.
    create_webentities(neo4j, lrus)