def read_cns_core_jsonld(version, path="data"):
    """Return the ``@graph`` entries of a released cns-core JSON-LD file.

    The file is looked up at ``../<path>/releases/<version>/cns-core.jsonld``,
    resolved relative to this source file via ``file2abspath``.

    Args:
        version: release directory name substituted into the path.
        path: top-level directory that holds ``releases`` (default ``"data"``).

    Returns:
        Whatever is stored under the ``"@graph"`` key of the loaded JSON.
    """
    relative = "../{}/releases/{}/{}.jsonld".format(path, version, "cns-core")
    document = file2json(file2abspath(relative, __file__))
    return document["@graph"]
def load_data(self):
    """Return the schema data, using ``schemaorg.json`` as an on-disk cache.

    If ``<self.dir_output>/schemaorg.json`` exists, its JSON content is
    returned as-is and nothing is recomputed. Otherwise the data is built
    by the three ``_init_*`` steps, ``self.map_id_node`` is written to the
    cache file, and ``self.map_id_node`` is returned.
    """
    cache_path = os.path.join(self.dir_output, "schemaorg.json")

    if not os.path.exists(cache_path):
        # No cache yet: build everything. The order matters because the
        # schema step uses information from the examples and the stats.
        self._init_examples()    # examples
        self._init_stat2015()    # the word count stats 2015
        self._init_schema()      # schema, fed by examples and stats

        json2file(self.map_id_node, cache_path)
        return self.map_id_node

    return file2json(cache_path)
def task_superclasses(args):
    """Write the class -> [class, superclasses...] table for release 3.2.

    Loads ``../local/releases/3.2/schema_taxonomy.json`` (relative to this
    file), passes it to ``loadmapping`` to collect ``pairs``, and then
    builds, for every ``pair["to"]``, a list that starts with the class
    itself followed by every entry of ``pair["path"]`` in first-seen order.
    The result is logged and saved to
    ``../data/releases/3.2/schema.superclass.json``.

    Every name appears at most once per list. Previously the class itself
    was appended once per pair, so a class occurring in several pairs was
    repeated at the head of its own list while its parents were not.

    Args:
        args: accepted for the task-dispatch signature; not used here.
    """
    filename = "../local/releases/3.2/schema_taxonomy.json"
    filename = file2abspath(filename, __file__)
    data = file2json(filename)

    # NOTE(review): loadmapping is expected to fill `pairs` with dicts that
    # carry "to" and "path" keys -- confirm against its definition.
    pairs = []
    loadmapping(data, [], pairs)
    logging.info(json.dumps(pairs, indent=4, ensure_ascii=False))

    mapping = collections.defaultdict(list)
    for pair in pairs:
        key = pair["to"]
        lineage = mapping[key]
        # The class itself goes first, and only once, even when several
        # pairs share the same "to".
        if key not in lineage:
            lineage.append(key)
        for parent in pair["path"]:
            if parent not in lineage:
                lineage.append(parent)
    logging.info(json.dumps(mapping, indent=4, ensure_ascii=False))

    filename = "../data/releases/3.2/schema.superclass.json"
    filename = file2abspath(filename, __file__)
    json2file(mapping, filename)
def _load_item_data(self, version):
    """Yield one indexing record per cns-core item of the given release.

    Reads ``../data/releases/<version>/cns-core.jsonld`` (relative to this
    file) and walks its ``@graph`` list. Each item is augmented in place
    with:

    * ``id``             -- ``any2sha1`` of the item's ``@id``
    * ``index_wildcard`` -- concatenation of the ``name``/``nameZh`` values
    * ``index_search``   -- concatenation of the searchable text values
    * ``index_suggest``  -- ``{"input": [...]}`` holding the same values
      used for ``index_wildcard``

    Unicode values have markup tags and http(s) URLs stripped before they
    are collected; empty/falsy values are skipped.

    Args:
        version: release directory name substituted into the data path.

    Yields:
        dict with ``_id``, ``_index``, ``_type`` and ``_source`` keys, where
        ``_index``/``_type`` come from ``self.es_config``. (The shape looks
        like an Elasticsearch bulk action -- confirm against the caller.)

    Raises:
        SystemExit: after logging the item, when its ``index_wildcard`` is
            empty, i.e. no usable ``name``/``nameZh`` text was found.
    """
    # load cns-core data
    filename = "../data/releases/{}/cns-core.jsonld".format(version)
    filename = file2abspath(filename, __file__)
    items = file2json(filename)["@graph"]
    logging.info(len(items))

    suggest_fields = ("name", "nameZh")
    search_fields = (
        "name",
        "nameZh",
        "description",
        "descriptionZh",
        "wikidataName",
    )

    # Compiled once instead of being looked up for every value.
    markup_re = re.compile(u"<[^>]+>")
    # "[sS]?" rather than "[s|S]?": inside a character class "|" is a
    # literal, so the old pattern also accepted "http|://".
    url_re = re.compile(u"[hH][tT][tT][pP][sS]?://\\S+")

    es_index = self.es_config["es_index"]
    es_type = self.es_config["es_type"]

    for item in items:
        index_suggest = []
        index_search = []

        for p, v in item.items():
            if not v:
                continue

            vx = v
            if isinstance(v, unicode):
                # remove markups, then remove urls (e.g. in descriptions)
                vx = markup_re.sub("", vx)
                vx = url_re.sub("", vx)

            if p in suggest_fields:
                index_suggest.append(vx)
            if p in search_fields:
                index_search.append(vx)

        item["id"] = any2sha1(item["@id"])
        item["index_wildcard"] = u"".join(index_suggest)
        item["index_search"] = u"".join(index_search)

        if not item["index_wildcard"]:
            # An item without any name text aborts the whole run; dump it
            # first so the offending record can be identified.
            logging.info(json.dumps(item, indent=4))
            # Same effect as the site-provided exit(), without relying on
            # the interactive helper being installed.
            raise SystemExit()

        # Only the input list is set on the suggester entry.
        item["index_suggest"] = {
            "input": index_suggest,
        }

        yield {
            "_id": item["id"],
            "_index": es_index,
            "_type": es_type,
            "_source": item,
        }