def object_date_add(obj):
    """Fill in dateCreated/dateLastUpdated (from the id_date cache or git
    history) and derive a top-level "date" for *obj*, normalizing the
    type-specific date fields of Media, Dataset and Paper objects.

    NOTE(review): a second object_date_add is defined later in this file and
    shadows this one at import time — confirm which version is intended.
    """
    today = datetime.date.today().strftime("%Y-%m")
    for key in ["dateCreated", "dateLastUpdated"]:
        # Prefer the cached date unless a forced lookup was requested.
        if not date_lookup_force and obj["id"] in id_date and key in id_date[obj["id"]]:
            obj[key] = id_date[obj["id"]][key]
        else:
            if not re_placeholder.search(obj["filename"]):
                # First (add) commit for creation date, last commit for
                # the update date.
                if key == "dateCreated":
                    cmd = ["git", "log", "--diff-filter=A", "--follow",
                           "--format=%aD", "-1", "--"]
                else:
                    cmd = ["git", "log", "--format=%aD", "-1", "--"]
                # Pass argv as a list (shell=False): filenames containing
                # spaces or shell metacharacters no longer break the
                # command or allow shell injection.
                result = subprocess.check_output(cmd + [obj["filename"]])
                values = result.decode().lower().split(" ")
            else:
                values = []
            date = today
            # %aD output looks like "mon, 02 jan 2006 ..." once lowercased:
            # values[2] is the month name, values[3] the year.
            if len(values) >= 4:
                if values[2] in mon_index:
                    date = values[3] + "." + mon_index[values[2]]
            obj[key] = date
            if obj["id"] not in id_date:
                id_date[obj["id"]] = {}
    if obj["__typename"] == "Media" and "presenters" in obj:
        for person_venue in obj["presenters"]:
            # Media objects take their date from the presenter entries.
            if "date" in person_venue:
                obj["date"] = person_venue["date"]
            # Resolve "venue:..." ids to the venue's display name.
            if "venue" in person_venue and "venue" == person_venue["venue"][:5]:
                vid = person_venue["venue"]
                if vid in id_object:
                    person_venue["venue"] = id_object[vid]["name"]
                else:
                    print(" missing venue:", person_venue["venue"])
    else:
        # Datasets/papers take their date from a type-specific field.
        for type_key in [["Dataset", "dateStart"], ["Paper", "datePublished"]]:
            type_, key = type_key
            if obj["__typename"] == type_ and key in obj:
                date = utils.date_parse(obj[key])
                if date:
                    obj["date"] = date
                    obj[key] = date
        key = "dateEnd"
        if key in obj:
            # "ongoing" objects end (and are dated) today.
            if obj[key].lower() == "ongoing":
                obj[key] = obj["date"] = today
            else:
                date = utils.date_parse(obj[key])
                if date:
                    obj[key] = date
    if "date" not in obj:
        obj["date"] = obj["dateLastUpdated"]
    obj["date"] = utils.date_parse(obj["date"])
def object_date_add(obj):
    # NOTE(review): duplicate of the object_date_add defined earlier in this
    # file; being defined later, this version is the one in effect at
    # runtime — confirm which definition is intended.
    """Set dateCreated/dateLastUpdated from the id_date cache or git
    history, then derive a top-level "date" for *obj*."""
    for key in ["dateCreated", "dateLastUpdated"]:
        # Use the cached value unless a fresh git lookup is forced.
        if not date_lookup_force and obj["id"] in id_date and key in id_date[obj["id"]]:
            obj[key] = id_date[obj["id"]][key]
        else:
            # First (add) commit for creation, most recent commit for update.
            if key == "dateCreated":
                cmd = "git log --diff-filter=A --follow --format=%aD -1 -- "
            else:
                cmd = "git log --format=%aD -1 -- "
            result = subprocess.check_output(cmd + " " + obj["filename"], shell=True)
            values = result.decode().lower().split(" ")
            # Fall back to the current year.month when git gives no date.
            date = datetime.date.today().strftime("%Y.%m")
            # %aD output looks like "mon, 02 jan 2006 ..." once lowercased:
            # values[2] is the month name, values[3] the year.
            if len(values) >= 4:
                if values[2] in mon_index:
                    date = values[3] + "." + mon_index[values[2]]
            obj[key] = date
            if obj["id"] not in id_date:
                id_date[obj["id"]] = {}
    if obj["__typename"] == "Media" and "presenters" in obj:
        # Media objects take their date from the presenter entries.
        for person_venue in obj["presenters"]:
            if "date" in person_venue:
                obj["date"] = person_venue["date"]
    else:
        # Datasets/papers take their date from a type-specific field.
        for type_key in [["Dataset", "dateStart"], ["Paper", "datePublished"]]:
            type_, key = type_key
            if obj["__typename"] == type_ and key in obj:
                obj["date"] = obj[key]
    if "date" not in obj:
        obj["date"] = obj["dateLastUpdated"]
    obj["date"] = utils.date_parse(obj["date"])
def main():
    """Load pubdb paper/media ids, ingest every sources/*/*.json object,
    then normalize each loaded object (links, resources, bibtex fields,
    dates) and write it back to disk.

    Exits with status 1 if any source file fails to parse as JSON.
    """
    load_ids("paper", "papers", args.papers_file)
    load_ids("media", "presentations", args.media_file)

    # ---- pass 1: parse every source file, registering ids and names ----
    error = False
    for type_ in os.listdir("sources"):
        p = "sources/" + type_
        if os.path.isdir(p):
            for fname in os.listdir(p):
                fname = p + "/" + fname
                if re.search("json$", fname) and "___pubdb" not in fname:
                    try:
                        # with-block closes the handle even on parse errors
                        # (the original open() leaked the descriptor).
                        with open(fname, "r") as fin:
                            obj = json.load(fin)
                    except json.decoder.JSONDecodeError as e:
                        # Record the failure but keep scanning so that all
                        # broken files are reported in a single run.
                        error = True
                        print("error", fname, e)
                        continue
                    except ValueError as e:
                        print("-----------\nJSON ERROR in ", fname, "\n")
                        raise e
                    id_add(fname, type_, obj["id"])
                    if "name" in obj:
                        name = utils.id_create(fname, type_, obj["name"])
                        name_id[name] = utils.id_create(fname, type_, obj["id"])
                    if type_ == "person":
                        utils.person_seen_add(fname, obj)
    if error:
        sys.exit(1)

    # ---- pass 2: normalize each object and write it back ----
    print("processing objects")
    for obj in objects:
        obj["tags"].append("caida")
        key_to_key(obj, "pubdb_presentation_id", "pubdb_id")
        key_to_key(obj, "venue", "publisher")
        resources_front = []
        resources_back = []
        if "presenters" in obj:
            obj["type"] = "PRESENTATION"
            for info in obj["presenters"]:
                key_to_key(info, "name", "person")
                key_to_key(info, "organization", "organizations")
                for key in ["name", "person"]:
                    if key in info:
                        info["person"] = "person:" + info[key]
                        person_create(obj["id"], info["person"])
                        if key != "person":
                            del info[key]
                if "date" in info:
                    date = utils.date_parse(info["date"])
                    if date is not None:
                        info["date"] = date
                        # The object's date is the latest presenter date.
                        if "date" not in obj or obj["date"] < info["date"]:
                            obj["date"] = info["date"]
        if "authors" in obj:
            for info in obj["authors"]:
                key_to_key(info, "organization", "organizations")
        if "links" in obj:
            links = []
            for link in obj["links"]:
                if link["label"] == "DOI":
                    obj["doi"] = link["to"]
                # Raw strings: "\d" / "\/" inside plain literals are invalid
                # escape sequences (DeprecationWarning on current Pythons).
                m = re.search(
                    r"https://www.caida.org/publications/([^\/]+)/(\d\d\d\d)\/([^/]+)/$",
                    link["to"])
                id_ = None
                if m:
                    type_, date, id_ = m.groups()
                    if type_ == "papers":
                        type_ = "paper"
                    elif type_ == "presentations":
                        type_ = "media"
                m = re.search(
                    r"https://catalog.caida.org/details/([^\/]+)/([^/]+)",
                    link["to"])
                if m:
                    type_, id_ = m.groups()
                    id_ = utils.id_create(obj["filename"], type_, id_)
                # Known catalog objects become internal links; everything
                # else becomes an external resource.
                if id_ is not None and id_ in seen:
                    links.append({"to": id_, "label": link["label"]})
                else:
                    resource = {
                        "name": link["label"],
                        "url": link["to"],
                        "tags": []
                    }
                    # PDFs sort to the front of the resource list.
                    if re.search("^pdf$", resource["name"], re.IGNORECASE):
                        resources_front.append(resource)
                    else:
                        resources_back.append(resource)
            obj["links"] = links
        if obj["__typename"] == "paper":
            # Move publication metadata into a bibtexFields sub-object.
            obj["bibtexFields"] = {}
            for key_from in ["type", "booktitle", "institution", "journal",
                             "volume", "venue", "pages", "peerReviewedYes",
                             "bibtex", "year", "mon"]:
                if key_from in obj and len(obj[key_from]) > 0:
                    if key_from == "booktitle":
                        key_to = "bookTitle"
                    else:
                        key_to = key_from
                    obj["bibtexFields"][key_to] = obj[key_from]
                    del obj[key_from]
            # Paper ids start "YYYY_name"; build the bibtex.html URL from it.
            resources_front.append({
                "name": "bibtex",
                "url": "https://www.caida.org/publications/papers/"
                       + obj["id"][:4] + "/" + obj["id"][5:] + "/bibtex.html"
            })
        resources_front.extend(resources_back)
        obj["resources"] = resources_front
        if "datePublished" in obj:
            obj["date"] = utils.date_parse(obj["datePublished"])
        if "linkedObjects" in obj and len(obj["linkedObjects"]) > 0:
            linked = obj["linkedObjects"].lower().strip()
            if re_ids_only.search(linked):
                for to_id in re_whitespace.split(linked):
                    obj["links"].append(to_id)
            else:
                print(obj["id"], "failed to parse linkedObject `" + linked + "'")
        # with-block flushes and closes the output file deterministically.
        with open(obj["filename"], "w") as fout:
            json.dump(obj, fout, indent=4)

    # Write out the person objects that were newly created in this run.
    for obj in id_person.values():
        if "already_exists" not in obj:
            with open(obj["filename"], "w") as fout:
                json.dump(obj, fout, indent=4)
def object_finish(obj):
    """Final normalization pass for *obj*: convert "links" into link_add
    edges, resolve tag names to ids, parse date-like string fields,
    resolve person references, and register licenses.
    """
    ############
    # links
    ############
    if "links" in obj:
        for link in obj["links"]:
            link_add(obj, link)
        del obj["links"]
    if "tags" not in obj:
        obj["tags"] = []
    # list(obj): values are reassigned in place, no keys added/removed.
    for key in list(obj):
        if key == "tags":
            # Replace tag names with canonical tag ids where known.
            for i, tag in enumerate(obj["tags"]):
                o = object_lookup_type_name(obj["filename"], "tag", tag)
                if o is not None:
                    tag = obj["tags"][i] = o["id"]
                link_add(obj, tag)
        elif re_date_key.search(key) and type(obj[key]) == str:
            # Normalize any date-ish string field in place.
            date = utils.date_parse(obj[key])
            if date:
                obj[key] = date
        elif key in ("persons", "venues", "presenters", "authors"):
            i = 0
            persons = set()
            while i < len(obj[key]):
                person_org = obj[key][i]
                error = False
                if type(person_org) == dict:
                    # Tag people from CAIDA-affiliated organizations.
                    caida = False
                    if "organizations" in person_org:
                        for org in person_org["organizations"]:
                            if re.search("caida", org, re.IGNORECASE):
                                caida = True
                    for k in ["person", "presenter"]:
                        if k in person_org:
                            person = person_lookup_id(obj["filename"], person_org[k])
                            # BUG FIX: check for a failed lookup before
                            # dereferencing person["id"] (previously raised
                            # TypeError when the lookup returned None).
                            if person is not None:
                                persons.add(person["id"])
                                if caida:
                                    if "tags" not in person:
                                        person["tags"] = ["caida"]
                                    else:
                                        person["tags"].append("caida")
                                person_org[k] = person["id"]
                            else:
                                error = True
                # BUG FIX: the prefix test was person_org[7:] == "person:",
                # which compares the string's tail; [:7] checks the prefix.
                elif type(person_org) == str and person_org[:7] == "person:":
                    person = person_lookup_id(obj["filename"], person_org)
                    if person is not None:
                        persons.add(person["id"])
                        obj[key][i] = person["id"]
                    else:
                        error = True
                # Drop entries whose person could not be resolved; only
                # advance the index when the current entry is kept.
                if error:
                    del obj[key][i]
                else:
                    i += 1
            for person_id in persons:
                link_add(obj, person_id)
                personName_add(obj, person_id)
        elif key == "licenses":
            # Register unseen licenses, then canonicalize the stored ids.
            licenses = list(obj[key])
            for i, id_ in enumerate(licenses):
                id_2 = utils.id_create(obj["filename"], "license", id_)
                if id_2 not in id_object:
                    object_add("License", {
                        "id": id_2,
                        "name": id_[8:],  # strip the "license:" prefix
                        "filename": obj["filename"]
                    })
                obj[key][i] = id_object[id_2]["id"]
        else:
            obj[key] = tag_convert(obj["filename"], obj[key])