def object_add(type_, info):
    info["__typename"] = type_ = type_.title()
    error = False
    if type_ == "Person":
        person_add_names(info)
        if "name" in info:
            if "id" not in info:
                info["id"] = utils.id_create(info["filename"], info["__typename"], info["name"])
            else:
                info["id"] = utils.id_create(info["filename"], info["__typename"], info["id"])
        else:
            error_add(info["filename"], "failed to find name: " + json.dumps(info))
            error = True
    if type_ == "Paper":  # type_ was title-cased above
        if "datePublished" in info:
            info["date"] = info["datePublished"]
        else:
            error_add(info["filename"], "failed to find paper's date")
            error = True
        m = re.search(r"^paper:(\d\d\d\d)_(.+)", info["id"])
        if m:
            date, id_short = m.groups()
            id_paper[id_short] = info
        else:
            info["id"] = utils.id_create(info["filename"], info["__typename"], info["id"])
    if not error:
        id_object[info["id"]] = info
        return info
    return None
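# Usage sketch (illustrative values; assumes the module-level id_object/id_paper
# registries and utils.id_create defined elsewhere in this file):
#
#   info = {"filename": "sources/paper/2020_example.json",
#           "id": "paper:2020_example", "datePublished": "2020"}
#   obj = object_add("paper", info)
#   # obj["__typename"] == "Paper" and id_paper["example"] is obj;
#   # object_add returns None when a required field (name/date) is missing.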
def link_add(obj, info):
    # Accept either a bare id string or a dict with a "to" key.
    if type(info) == str:
        to_original = info
        to = utils.id_create(None, None, info)
        info = {"to": to}
    else:
        if "to" in info:
            to_original = info["to"]
            to = info["to"] = utils.id_create(None, None, info["to"])
        else:
            error_add(obj["filename"], "link has no to " + json.dumps(info))
            return None
    if to is None:
        error_add(obj["filename"], "invalid id " + to_original)
        return None
    if to not in id_object:
        error_add(obj["filename"], "missing id " + to)
        return False
    info["from"] = obj["id"]
    # Register the link under both endpoints (from->to and to->from).
    for a, b in [("from", "to"), ("to", "from")]:
        a_id = info[a]
        b_id = info[b]
        link = {"from": a_id, "to": b_id}
        if a + "_label" in info:
            link["from_label"] = info[a + "_label"]
        if b + "_label" in info:
            link["to_label"] = info[b + "_label"]
        if "label" in info:
            link["label"] = info["label"]
        if a_id not in id_id_link:
            id_id_link[a_id] = {}
        if b_id in id_id_link[a_id]:
            # Merge into the existing link; keys already present win.
            for key, value in link.items():
                if key not in id_id_link[a_id][b_id]:
                    id_id_link[a_id][b_id][key] = value
        else:
            id_id_link[a_id][b_id] = link
    return True
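# Standalone sketch of the bidirectional registration above: every link is
# stored under both endpoints so lookups work from either side, and keys
# already present in an existing link are preserved. (Illustrative only; the
# real code stores into the module-level id_id_link.)
def _link_register_demo(registry, from_id, to_id, label=None):
    for a_id, b_id in [(from_id, to_id), (to_id, from_id)]:
        link = {"from": a_id, "to": b_id}
        if label is not None:
            link["label"] = label
        existing = registry.setdefault(a_id, {}).setdefault(b_id, {})
        for key, value in link.items():
            existing.setdefault(key, value)

# registry = {}
# _link_register_demo(registry, "paper:2020_x", "dataset:ark", label="uses")
# registry["dataset:ark"]["paper:2020_x"]
#   -> {"from": "dataset:ark", "to": "paper:2020_x", "label": "uses"}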
def add_seen_ids(source_dir):
    global seen_id
    re_placeholder = re.compile(r"___caida")
    for fname in sorted(os.listdir(source_dir)):
        path = source_dir + "/" + fname
        if os.path.isdir(path):
            type_ = fname
            for filename in sorted(os.listdir(path)):
                file_path = path + "/" + filename
                if re.search(r"\.json$", filename, re.IGNORECASE) \
                        and not re_placeholder.search(filename):
                    try:
                        info = json.load(open(file_path))
                        info["filename"] = file_path
                        id_ = info["id"] = utils.id_create(info["filename"], type_, info["id"])
                        if id_ in seen_id:
                            print("duplicate id found in\n   ", filename, "\n   ", seen_id[id_])
                        else:
                            seen_id[id_] = file_path
                    except Exception as e:
                        print("\nerror", file_path)
                        print("   ", e)
                        sys.exit(1)
def object_lookup_type_name(filename, type_, name):
    # Strip a leading "type:" prefix from the name before building the id.
    if name[:len(type_) + 1] == type_ + ":":
        name = name[len(type_) + 1:]
    id_ = utils.id_create(filename, type_, name)
    return object_lookup({
        "id": id_,
        "filename": filename,
        "__typename": type_,
        "name": name
    })
def person_create(filename, obj):
    id_ = utils.id_create(filename, "person", obj)
    if id_ not in id_person:
        # obj looks like "person:last__first" or "last__first".
        if obj[:7] == "person:":
            nameLast, nameFirst = obj[7:].split("__")
        else:
            nameLast, nameFirst = obj.split("__")
        person = {
            "id": id_,
            "__typename": "person",
            "filename": "sources/person/" + id_[7:] + "__pubdb.json",
            "nameLast": nameLast.replace("_", " ").title(),
            "nameFirst": nameFirst.replace("_", " ").title()
        }
        id_person[id_] = person
# Variant of person_create that consults utils.person_seen_check before
# creating a new record.
def person_create(filename, obj):
    if obj[:7] == "person:":
        nameLast, nameFirst = obj[7:].split("__")
    else:
        nameLast, nameFirst = obj.split("__")
    person = utils.person_seen_check(nameLast, nameFirst)
    if person is None:
        id_ = utils.id_create(filename, "person", obj)
        if id_ not in id_person:
            person = {
                "id": id_,
                "__typename": "person",
                "filename": "sources/person/" + id_[7:] + "__pubdb.json",
                "nameLast": nameLast.replace("_", " ").title(),
                "nameFirst": nameFirst.replace("_", " ").title()
            }
            id_person[id_] = person
    elif person["id"] not in id_person:
        person["already_exists"] = True
        id_person[person["id"]] = person
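# Usage sketch (hypothetical name; assumes utils.id_create produces ids of
# the form "person:last__first"):
#
#   person_create("sources/media/example.json", "person:doe__jane")
#   # id_person["person:doe__jane"] ->
#   #   {"id": "person:doe__jane", "__typename": "person",
#   #    "filename": "sources/person/doe__jane__pubdb.json",
#   #    "nameLast": "Doe", "nameFirst": "Jane"}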
def object_lookup_id(filename, id_):
    id_ = utils.id_create(filename, None, id_)
    if id_ in id_object:
        return id_object[id_]
    m = re_type_name.search(id_)
    if m:
        type_, name = m.groups()
        # Persons are never created implicitly from an id.
        if type_.title() == "Person":
            return None
        return object_lookup({
            "id": id_,
            "filename": filename,
            "__typename": type_.title(),
            "name": name.replace("_", " ").title()
        })
    else:
        print("failed to parse id", id_)
        sys.exit(1)
def object_lookup(info):
    type_ = info["__typename"].lower()
    info["__typename"] = type_.title()
    if "id" not in info:
        if "name" in info:
            info["id"] = utils.id_create(info["filename"], info["__typename"], info["name"])
        else:
            print("no id or name/__typename", info)
            sys.exit(1)
    else:
        # Make sure the id carries its lowercase type prefix.
        if not re.search("^" + type_, info["id"]):
            info["id"] = type_ + ":" + info["id"]
    id_ = info["id"]
    if id_ not in id_object:
        obj = object_add(info["__typename"], info)
        if obj is not None:
            object_finish(obj)
        return obj
    return id_object[id_]
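# Usage sketch (illustrative; object_lookup returns the registered object if
# its id is already known, otherwise creates it via object_add/object_finish):
#
#   tag = object_lookup({"__typename": "tag", "name": "topology",
#                        "filename": "sources/paper/example.json"})
#   # tag["id"] carries the "tag:" prefix; repeated calls return the same
#   # registered object.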
def add_author(fname, last_name, first_name):
    global author_data
    person = utils.person_seen_check(last_name, first_name)
    if person is None:
        type_, author_id = utils.id_create(
            fname, "person", last_name + "__" + first_name).split(":")
        if author_id not in author_data:
            file_path = "sources/person/{}___externallinks.json".format(author_id)
            author_data[author_id] = {
                "id": "person:{}".format(author_id),
                "__typename": "person",
                "filename": file_path,
                "nameLast": last_name,
                "nameFirst": first_name,
                "organizations": []
            }
    else:
        author_id = person["id"]
    return author_id
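# Usage sketch (hypothetical names; utils.person_seen_check is assumed to
# return a previously seen person record or None):
#
#   author_id = add_author("data/PANDA-Papers-json.pl.json", "Doe", "Jane")
#   paper["authors"].append({"person": author_id})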
def main():
    load_ids("paper", "papers", args.papers_file)
    load_ids("media", "presentations", args.media_file)

    # Load every source object, recording ids, names, and seen persons.
    error = False
    for type_ in os.listdir("sources"):
        p = "sources/" + type_
        if os.path.isdir(p):
            for fname in os.listdir(p):
                fname = p + "/" + fname
                if re.search("json$", fname) and "___pubdb" not in fname:
                    try:
                        obj = json.load(open(fname, "r"))
                    except json.decoder.JSONDecodeError as e:
                        error = True
                        print("error", fname, e)
                        continue
                    except ValueError as e:
                        print("-----------\nJSON ERROR in ", fname, "\n")
                        raise e
                    id_add(fname, type_, obj["id"])
                    if "name" in obj:
                        name = utils.id_create(fname, type_, obj["name"])
                        name_id[name] = utils.id_create(fname, type_, obj["id"])
                    if type_ == "person":
                        utils.person_seen_add(fname, obj)
    if error:
        sys.exit(1)

    print("processing objects")
    for obj in objects:
        obj["tags"].append("caida")
        key_to_key(obj, "pubdb_presentation_id", "pubdb_id")
        key_to_key(obj, "venue", "publisher")
        resources_front = []
        resources_back = []

        if "presenters" in obj:
            obj["type"] = "PRESENTATION"
            for info in obj["presenters"]:
                key_to_key(info, "name", "person")
                key_to_key(info, "organization", "organizations")
                for key in ["name", "person"]:
                    if key in info:
                        info["person"] = "person:" + info[key]
                        person_create(obj["id"], info["person"])
                        if key != "person":
                            del info[key]
                if "date" in info:
                    date = utils.date_parse(info["date"])
                    if date is not None:
                        info["date"] = date
                    # A presentation's date is its latest presenter date.
                    if "date" not in obj or obj["date"] < info["date"]:
                        obj["date"] = info["date"]

        if "authors" in obj:
            for info in obj["authors"]:
                key_to_key(info, "organization", "organizations")

        if "links" in obj:
            links = []
            for link in obj["links"]:
                if link["label"] == "DOI":
                    obj["doi"] = link["to"]
                # Try to resolve www.caida.org and catalog.caida.org urls
                # to catalog ids.
                id_ = None
                m = re.search(
                    r"https://www.caida.org/publications/([^/]+)/(\d\d\d\d)/([^/]+)/$",
                    link["to"])
                if m:
                    type_, date, id_ = m.groups()
                    if type_ == "papers":
                        type_ = "paper"
                    elif type_ == "presentations":
                        type_ = "media"
                m = re.search(
                    r"https://catalog.caida.org/details/([^/]+)/([^/]+)", link["to"])
                if m:
                    type_, id_ = m.groups()
                if id_ is not None:
                    id_ = utils.id_create(obj["filename"], type_, id_)
                if id_ is not None and id_ in seen:
                    links.append({"to": id_, "label": link["label"]})
                else:
                    # Unresolved links are kept as plain resources instead.
                    resource = {
                        "name": link["label"],
                        "url": link["to"],
                        "tags": []
                    }
                    if re.search("^pdf$", resource["name"], re.IGNORECASE):
                        resources_front.append(resource)
                    else:
                        resources_back.append(resource)
            obj["links"] = links

        if obj["__typename"] == "paper":
            # Move bibtex-related keys into obj["bibtexFields"].
            obj["bibtexFields"] = {}
            for key_from in ["type", "booktitle", "institution", "journal",
                             "volume", "venue", "pages", "peerReviewedYes",
                             "bibtex", "year", "mon"]:
                if key_from in obj and len(obj[key_from]) > 0:
                    key_to = "bookTitle" if key_from == "booktitle" else key_from
                    obj["bibtexFields"][key_to] = obj[key_from]
                    del obj[key_from]
            resources_front.append({
                "name": "bibtex",
                "url": "https://www.caida.org/publications/papers/"
                       + obj["id"][:4] + "/" + obj["id"][5:] + "/bibtex.html"
            })

        resources_front.extend(resources_back)
        obj["resources"] = resources_front

        if "datePublished" in obj:
            obj["date"] = utils.date_parse(obj["datePublished"])

        if "linkedObjects" in obj and len(obj["linkedObjects"]) > 0:
            linked = obj["linkedObjects"].lower().strip()
            if re_ids_only.search(linked):
                for to_id in re_whitespace.split(linked):
                    obj["links"].append(to_id)
            else:
                print(obj["id"], "failed to parse linkedObject `" + linked + "'")

        json.dump(obj, open(obj["filename"], "w"), indent=4)

    # Write out newly created person records.
    for obj in id_person.values():
        if "already_exists" not in obj:
            json.dump(obj, open(obj["filename"], "w"), indent=4)
def id_add(filename, type_, id_):
    id_ = utils.id_create(filename, type_, id_)
    yearless = id_yearless(id_)
    name_id[yearless] = id_
    seen.add(id_)
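# Usage sketch (assumes id_yearless strips the year from an id, e.g.
# "paper:2020_example" -> "paper:example", so a re-dated object can still be
# found under its yearless name):
#
#   id_add("sources/paper/2020_example.json", "paper", "2020_example")
#   # seen contains the full id; name_id maps the yearless id to it.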
def parse_catalog_data_caida(source_dir):
    global id_2_object
    global seen_datasets
    global seen_softwares
    global seen_urls
    global re_mkdn
    global re_mdta
    global re_dlim

    # Number of files skipped because they have no description.
    number_skipped_no_description = 0
    re_md = re.compile(r"\.md$", re.IGNORECASE)

    # Iterate over each file in catalog-data-caida/sources.
    for type_ in sorted(os.listdir(source_dir)):
        path = source_dir + type_ + "/"
        if os.path.isdir(path):
            for file in sorted(os.listdir(path)):
                file_name = file[:file.index(".")].replace("-", "_")
                file_path = path + file

                # Parse .md and .json files; skip everything else.
                if re_mkdn.search(file):
                    metadata = parse_metadata(file_path)
                elif re_json.search(file):
                    try:
                        with open(file_path) as f:
                            metadata = json.load(f)
                    except Exception as e:
                        print("\nerror:", file_path)
                        print("    ", e)
                        sys.exit(1)
                else:
                    print("   skipping", file)
                    continue

                # Edge Case: Replace missing names with the ID.
                if "name" not in metadata:
                    metadata["name"] = metadata["id"].replace("_", " ").upper()

                id_ = metadata["id"] = utils.id_create(file_path, type_, metadata["id"])

                # Not including private datasets.
                if id_ in seen_id:
                    print("duplicate id", id_)
                    print("   ", file_path)
                    print("   ", seen_id[id_])
                    continue
                if id_ in id_2_object:
                    print("duplicate", id_)
                    print("   ", id_2_object[id_]["filename"])
                    print("   ", metadata["filename"])
                    continue
                id_2_object[metadata["id"]] = metadata

                # If it has no description, skip it.
                if "description" not in metadata or re.search(r"^\s*$", metadata["description"]):
                    number_skipped_no_description += 1
                    continue

                # Edge Case: Add CAIDA as organization if the key is missing.
                if "organization" not in metadata:
                    metadata["organization"] = "CAIDA"

                # Edge Case: Add a "caida" tag to all CAIDA objects.
                if "tags" not in metadata:
                    metadata["tags"] = []
                if "CAIDA" in metadata["organization"]:
                    if "caida" not in metadata["tags"]:
                        metadata["tags"].append("caida")

                # Edge Case: Remove empty-string values from objects.
                keys = []
                for key, value in metadata.items():
                    if type(value) == str and re.search(r"^\s*$", value):
                        keys.append(key)
                for key in keys:
                    del metadata[key]
def object_finish(obj):
    ############
    # links
    ############
    if "links" in obj:
        for link in obj["links"]:
            link_add(obj, link)
        del obj["links"]

    for key, value in obj.items():
        if key == "tags":
            # Convert tag names to tag ids and link the object to each tag.
            for i, tag in enumerate(obj["tags"]):
                o = object_lookup_type_name(obj["filename"], "tag", tag)
                if o is not None:
                    tag = obj["tags"][i] = o["id"]
                    link_add(obj, tag)
        elif re_date_key.search(key) and type(obj[key]) == str:
            # Normalize date strings to "YYYY/MM/DD HH:MM:SS", filling
            # missing fields with defaults.
            values = re_not_digit.split(obj[key])
            digits = ["1990", "01", "01", "00", "00", "00"]
            for i, value in enumerate(values):
                digits[i] = value
            obj[key] = "%s/%s/%s %s:%s:%s" % tuple(digits)
        elif key in ("persons", "venues", "presenters", "authors"):
            # Resolve person references; drop entries that fail to resolve.
            i = 0
            while i < len(obj[key]):
                person_org = obj[key][i]
                error = False
                if type(person_org) == dict:
                    for k in ["person", "presenter"]:
                        if k in person_org:
                            person = person_lookup_id(obj["filename"], person_org[k])
                            if person is not None:
                                person_org[k] = person["id"]
                            else:
                                error = True
                elif type(person_org) == str and person_org[:7] == "person:":
                    person = person_lookup_id(obj["filename"], person_org)
                    if person is not None:
                        obj[key][i] = person["id"]
                    else:
                        error = True
                if error:
                    del obj[key][i]
                else:
                    i += 1
        elif key == "licenses":
            # Create License objects for any license ids not yet registered.
            licenses = list(obj[key])
            for i, id_ in enumerate(licenses):
                id_2 = utils.id_create(obj["filename"], None, id_)
                if id_2 not in id_object:
                    object_add("License", {
                        "id": id_2,
                        "name": id_[8:],
                        "filename": obj["filename"]
                    })
                obj[key][i] = id_object[id_2]["id"]
        else:
            obj[key] = tag_convert(obj["filename"], obj[key])
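# Standalone sketch of the date normalization above (illustrative; assumes
# re_not_digit is compiled as r"[^\d]+"):
def _date_normalize_demo(raw):
    values = re.split(r"[^\d]+", raw)
    digits = ["1990", "01", "01", "00", "00", "00"]
    for i, value in enumerate(values):
        digits[i] = value
    return "%s/%s/%s %s:%s:%s" % tuple(digits)

# _date_normalize_demo("2021-06-15")  -> "2021/06/15 00:00:00"
# _date_normalize_demo("2021")        -> "2021/01/01 00:00:00"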
def parse_paper(fname, curr_paper):
    global author_data
    global type_2_bibtex
    global papers
    global alternate_links

    # Dictionary that will be printed as a JSON.
    paper = {
        "__typename": "paper",
        "type": "paper",
        "authors": [],
        "bibtexFields": {},
        "links": [],
        "resources": [],
    }

    # Split the current paper into lines.
    curr_paper = curr_paper.split("\n")
    re_year = re.compile(r"(\d\d\d\d)")
    re_year_month = re.compile(r"(\d\d\d\d).(\d\d)")

    # Iterate over each line of the current paper.
    for line in curr_paper:
        # Split the current line between the TOPKEY and its value.
        line = line.split(":")
        # Edge Case: Skip empty lines.
        if len(line) <= 1:
            continue
        # Rejoin the value, then remove whitespace and surrounding quotes.
        line[1] = ":".join(map(str, line[1:]))
        line[1] = line[1].replace('"', "").strip()

        # Check which TOPKEY is used for the current line.
        if "MARKER" in line[0]:
            paper["id"] = utils.id_create(fname, "paper", line[1])
        elif "TYPE" in line[0]:
            paper["bibtexFields"]["type"] = line[1]
        elif "AUTHOR" in line[0]:
            # Handle the two separate ways that authors can be stored.
            authors = []
            for author in re.split(r";\s*", re.sub(r"\.\s*,", ";", line[1])):
                names = re.split(r"\s*,\s*", author)
                if len(names) == 4:
                    authors.append(names[0] + ", " + names[1])
                    authors.append(names[2] + ", " + names[3])
                else:
                    authors.append(author)
            # Iterate over each author and add an object for them.
            for author in authors:
                author = author.strip()
                if re.search(r"\s*,\s*", author):
                    last_name, first_name = re.split(r"\s*,\s*", author)
                else:
                    # No comma: treat the whole string as the last name and
                    # flag anything that is not a single word as unparseable.
                    if not re.search("^[a-z]+$", author, re.IGNORECASE):
                        print("unparseable", line[1])
                    first_name = ""
                    last_name = author
                author_id = add_author(fname, last_name, first_name)
                paper["authors"].append({"person": author_id})
        # GEOLOC is skipped: it is the country the data request came from,
        # not the organization.
        elif "TITLE" in line[0] and "CTITLE" not in line[0]:
            paper["name"] = line[1]
        elif "YEAR" in line[0]:
            date_str = line[1]
            m = re_year_month.search(date_str)
            date = None
            year = None
            month = None
            if m:
                year = m.group(1)
                month = m.group(2)
                date = year + "." + month
            else:
                m = re_year.search(date_str)
                if m:
                    year = m.group(1)
                    date = year
            if date:
                paper["datePublished"] = date
                paper["date"] = date
                paper["bibtexFields"]["year"] = year
                if month:
                    paper["bibtexFields"]["month"] = month
        elif "TOPKEY" in line[0]:
            datasets = line[1].split(",")
            # Iterate over each dataset and link it to a catalog dataset.
            for dataset in datasets:
                # Remove any whitespace.
                dataset = dataset.strip().lower()
                # Try to map the current dataset to a catalog dataset.
                if dataset in topkey_2_dataset:
                    dataset = topkey_2_dataset[dataset]
                elif len(dataset) == 0:
                    continue
                elif dataset.replace(" ", "-") in topkey_2_dataset:
                    dataset = topkey_2_dataset[dataset.replace(" ", "-")]
                elif dataset.replace("_", "-") in topkey_2_dataset:
                    dataset = topkey_2_dataset[dataset.replace("_", "-")]
                else:
                    keys = topkey_2_dataset.keys()
                    closest_match = difflib.get_close_matches(dataset, keys, 1)
                    # Edge Case: Reverse the dataset if no match, then give up.
                    if len(closest_match) == 0:
                        dataset = dataset.replace(" ", "-").replace("_", "-")
                        dataset = dataset.split("-")
                        dataset.reverse()
                        dataset = "-".join(map(str, dataset))
                        if dataset in topkey_2_dataset:
                            dataset = topkey_2_dataset[dataset]
                        else:
                            continue
                    else:
                        dataset = topkey_2_dataset[closest_match[0]]

                # Append a link to the dataset.
                alternate_link = False
                for alternate in alternate_links:
                    if alternate in dataset:
                        paper["links"].append({"to": "{}".format(dataset)})
                        alternate_link = True
                # So long as the dataset isn't an alternate link, add it.
                if not alternate_link:
                    # Edge Case: Handles datasets that are mapped to lists.
                    if type(dataset) is list:
                        for data in dataset:
                            paper["links"].append({"to": "dataset:{}".format(data)})
                    else:
                        paper["links"].append({"to": "dataset:{}".format(dataset)})
        elif "SERIAL" in line[0]:
            publisher = line[1]
            paper["publisher"] = publisher
            paper["bibtexFields"]["journal"] = publisher
        elif "VOLUME" in line[0]:
            paper["bibtexFields"]["volume"] = line[1]
        elif "CHAPTER" in line[0] or "ARTICLE" in line[0]:
            paper["number"] = line[1]
        elif "PAGE" in line[0]:
            pages = line[1].replace("(", "").replace(")", "")
            paper["pages"] = pages
            paper["bibtexFields"]["pages"] = pages
        elif "CTITLE" in line[0]:
            conference_title = line[1]
            paper["publisher"] = conference_title
            paper["bibtexFields"]["bookTitle"] = conference_title
        elif "DOI" in line[0]:
            paper["resources"].append({
                "name": "DOI",
                "url": "https://dx.doi.org/" + line[1]
            })
        elif "URL" in line[0]:
            paper["resources"].append({"name": "URL", "url": line[1]})
        elif "ABS" in line[0]:
            paper["description"] = line[1]
        elif "PUBLISH" in line[0]:
            paper["bibtexFields"]["institutions"] = line[1]
        elif "REMARK" in line[0] or "PLACE" in line[0]:
            # Append to an existing annotation rather than overwriting it.
            if "annotation" not in paper:
                paper["annotation"] = line[1]
            else:
                paper["annotation"] += " {}".format(line[1])

    # Only add papers that have an ID.
    if "id" in paper:
        papers[paper["id"]] = paper
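# Illustrative example of the pubdb record format parse_paper expects (field
# values are made up):
#
#   MARKER: "2020_example_paper"
#   AUTHOR: "Doe, Jane; Smith, John"
#   TITLE: "An Example Paper"
#   YEAR: "2020.05"
#   TOPKEY: "ark, telescope"
#
# Each line is split on the first ":"; keys are matched by substring, so
# CTITLE must be ruled out before a line is treated as TITLE.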
def main():
    links = set()
    if not os.path.exists(files_dir):
        print("error:", files_dir, "does not exist", file=sys.stderr)
        sys.exit(1)

    # Map resource urls to the ids of the objects that own them.
    for type_ in os.listdir("sources"):
        p = "sources/" + type_
        if os.path.isdir(p):
            for fname in os.listdir(p):
                fname = p + "/" + fname
                if re.search("json$", fname) and "__" not in fname:
                    try:
                        obj = json.load(open(fname, "r"))
                        id_ = utils.id_create(fname, type_, obj["id"])
                        if "resources" in obj:
                            for resource in obj["resources"]:
                                if "url" in resource and len(resource["url"]) > 10:
                                    url = url_cleaner(resource["url"])
                                    if "media" not in id_ or resource["name"] == "pdf":
                                        url_id[url] = id_
                    except ValueError as e:
                        print(fname)
                        raise e

    # Scan pubdb pdfs for urls that resolve to known objects.
    for type_, filename in [["media", "data/PANDA-Presentations-json.pl.json"],
                            ["paper", "data/PANDA-Papers-json.pl.json"]]:
        for obj in json.load(open(filename, "r")):
            if "linkedObjects" in obj and len(obj["linkedObjects"]) > 0:
                continue
            id_ = utils.id_create(filename, type_, obj["id"])
            if "links" in obj:
                for link in obj["links"]:
                    if "to" not in link:
                        continue
                    m = re.search(r"(\d\d\d\d/[^/]+/[^/]+.pdf$)", link["to"])
                    if not m:
                        continue
                    fname = data_dir + "/" + m.group(1)
                    found = None
                    if os.path.exists(fname):
                        found = fname
                    else:
                        fname = "data/presentations/" + m.group(1)
                        if os.path.exists(fname):
                            found = fname
                    if found:
                        # Extract the pdf's text and look for known urls.
                        fname_txt = re.sub("pdf", "txt", fname)
                        if not os.path.exists(fname_txt):
                            subprocess.run(["pdftotext", found])
                        with open(fname_txt, "r") as f:
                            for line in f:
                                m = re.search(r"(http[^\s]+)", line)
                                if m:
                                    url = url_cleaner(m.group(1))
                                    if url in url_id:
                                        link = [id_, url_id[url]]
                                        links.add(json.dumps(link))
                                    else:
                                        m = re.search("www.caida.org/data/([^/]+)", url)
                                        if m:
                                            # Candidate dataset ids are parsed
                                            # but not currently used.
                                            i = utils.id_create("", "dataset", m.group(1))

    with open(pubdb_links_file, "w") as f:
        print("writing", pubdb_links_file)
        json.dump(list(links), f, indent=4)
#! /usr/bin/env python3
import json
import subprocess
import sys

import lib.utils as utils

# Fill in missing name fields from the id, normalize the organization key,
# rebuild the id, and rename the file to match.
for fname in sys.argv[1:]:
    person = json.load(open(fname, "r"))
    if "nameFirst" not in person or "nameLast" not in person:
        if person["id"][:7] == "person:":
            names = person["id"][7:].split("_")
        else:
            names = person["id"].split("_")
        person["nameLast"] = names[0].title()
        person["nameFirst"] = " ".join(names[1:]).title()
    person["name"] = person["nameLast"] + ", " + person["nameFirst"]
    if "organization" in person:
        person["organizations"] = person["organization"]
        del person["organization"]
    person["id"] = utils.id_create(
        fname, "person", person["nameLast"] + "__" + person["nameFirst"])
    filename = person["id"][7:] + ".json"
    subprocess.run(["git", "mv", fname, filename])
    json.dump(person, open(filename, "w"), indent=4)
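# Usage (hypothetical script name, run from the repo root):
#
#   ./person_fix.py sources/person/doe__jane.json
#
# The file is rewritten with normalized name fields and "git mv"ed to
# "<id-without-prefix>.json".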