コード例 #1
0
ファイル: data-build.py プロジェクト: CyberCF/catalog-data
def object_date_add(obj):
    """Fill in ``dateCreated``, ``dateLastUpdated`` and ``date`` on *obj*.

    *obj* is a dict that must contain ``id``, ``filename`` and
    ``__typename``.  It is modified in place; nothing is returned.

    For each of ``dateCreated`` / ``dateLastUpdated`` the value is taken
    from the module-level ``id_date`` cache when present (and
    ``date_lookup_force`` is false).  Otherwise ``git log`` is queried for
    the file and the year/month are extracted from its output; when that
    yields nothing usable, or the filename matches ``re_placeholder``,
    the current month is used.

    Raises:
        subprocess.CalledProcessError: if ``git log`` exits non-zero.
    """
    today = datetime.date.today().strftime("%Y-%m")

    for key in ["dateCreated", "dateLastUpdated"]:
        cached = (not date_lookup_force and obj["id"] in id_date
                  and key in id_date[obj["id"]])
        if cached:
            obj[key] = id_date[obj["id"]][key]
            continue

        values = []
        if not re_placeholder.search(obj["filename"]):
            # Pass an argument list (no shell) so filenames containing
            # spaces or shell metacharacters are handed to git verbatim.
            cmd = ["git", "log"]
            if key == "dateCreated":
                # Only the commit that added the file, following renames.
                cmd += ["--diff-filter=A", "--follow"]
            cmd += ["--format=%aD", "-1", "--", obj["filename"]]
            result = subprocess.check_output(cmd)
            values = result.decode().lower().split(" ")

        date = today
        # %aD prints e.g. "Tue, 3 Mar 2020 12:00:00 -0800": after the
        # split, index 2 is the month name and index 3 is the year.
        if len(values) >= 4 and values[2] in mon_index:
            date = values[3] + "." + mon_index[values[2]]
        obj[key] = date

        # NOTE(review): only an empty cache entry is created here; the
        # computed date itself is not stored in id_date by this function.
        if obj["id"] not in id_date:
            id_date[obj["id"]] = {}

    if obj["__typename"] == "Media" and "presenters" in obj:
        for person_venue in obj["presenters"]:
            # The last presenter carrying a date determines obj["date"].
            if "date" in person_venue:
                obj["date"] = person_venue["date"]
            if "venue" in person_venue and person_venue["venue"][:5] == "venue":
                # Replace a venue id with the venue's name when known.
                vid = person_venue["venue"]
                if vid in id_object:
                    person_venue["venue"] = id_object[vid]["name"]
                else:
                    print("    missing venue:", person_venue["venue"])
    else:
        for type_, key in (("Dataset", "dateStart"), ("Paper", "datePublished")):
            if obj["__typename"] == type_ and key in obj:
                date = utils.date_parse(obj[key])
                if date:
                    obj["date"] = date
                    obj[key] = date

    key = "dateEnd"
    if key in obj:
        if obj[key].lower() == "ongoing":
            # An ongoing object ends "now" and is dated accordingly.
            obj[key] = obj["date"] = today
        else:
            date = utils.date_parse(obj[key])
            if date:
                obj[key] = date

    # Fall back to the last-updated date, then normalise via date_parse.
    if "date" not in obj:
        obj["date"] = obj["dateLastUpdated"]
    obj["date"] = utils.date_parse(obj["date"])
コード例 #2
0
def object_date_add(obj):
    """Fill in ``dateCreated``, ``dateLastUpdated`` and ``date`` on *obj*.

    *obj* is a dict that must contain ``id``, ``filename`` and
    ``__typename``.  It is modified in place; nothing is returned.

    For each of ``dateCreated`` / ``dateLastUpdated`` the value is taken
    from the module-level ``id_date`` cache when present (and
    ``date_lookup_force`` is false).  Otherwise ``git log`` is queried for
    the file and the year/month are extracted from its output, falling
    back to the current month when the output is not usable.

    Raises:
        subprocess.CalledProcessError: if ``git log`` exits non-zero.
    """
    for key in ["dateCreated", "dateLastUpdated"]:
        cached = (not date_lookup_force and obj["id"] in id_date
                  and key in id_date[obj["id"]])
        if cached:
            obj[key] = id_date[obj["id"]][key]
            continue

        # Pass an argument list (no shell) so filenames containing spaces
        # or shell metacharacters are handed to git verbatim.
        cmd = ["git", "log"]
        if key == "dateCreated":
            # Only the commit that added the file, following renames.
            cmd += ["--diff-filter=A", "--follow"]
        cmd += ["--format=%aD", "-1", "--", obj["filename"]]
        result = subprocess.check_output(cmd)
        values = result.decode().lower().split(" ")

        date = datetime.date.today().strftime("%Y.%m")
        # %aD prints e.g. "Tue, 3 Mar 2020 12:00:00 -0800": after the
        # split, index 2 is the month name and index 3 is the year.
        if len(values) >= 4 and values[2] in mon_index:
            date = values[3] + "." + mon_index[values[2]]
        obj[key] = date

        # NOTE(review): only an empty cache entry is created here; the
        # computed date itself is not stored in id_date by this function.
        if obj["id"] not in id_date:
            id_date[obj["id"]] = {}

    if obj["__typename"] == "Media" and "presenters" in obj:
        for person_venue in obj["presenters"]:
            # The last presenter carrying a date determines obj["date"].
            if "date" in person_venue:
                obj["date"] = person_venue["date"]
    else:
        for type_, key in (("Dataset", "dateStart"), ("Paper", "datePublished")):
            if obj["__typename"] == type_ and key in obj:
                obj["date"] = obj[key]

    # Fall back to the last-updated date, then normalise via date_parse.
    if "date" not in obj:
        obj["date"] = obj["dateLastUpdated"]
    obj["date"] = utils.date_parse(obj["date"])
コード例 #3
0
def main():
    """Load ids, index the existing ``sources`` tree, then rewrite objects.

    Steps, in order:

    1. ``load_ids`` is called for the papers and media input files.
    2. Every ``sources/<type>/*json`` file (except names containing
       ``___pubdb``) is parsed and registered via ``id_add``; names are
       recorded in ``name_id`` and person files are passed to
       ``utils.person_seen_add``.  If any file fails to decode, the
       process exits with status 1 after the scan.
    3. Each entry of the module-level ``objects`` list is normalised
       (presenters, authors, links/resources, bibtex fields, date,
       linked objects) and written as JSON to ``obj["filename"]``.
    4. Every person in ``id_person`` without an ``already_exists`` key is
       written to its own file.
    """
    load_ids("paper", "papers", args.papers_file)
    load_ids("media", "presentations", args.media_file)
    error = False
    for type_ in os.listdir("sources"):
        p = "sources/" + type_
        if os.path.isdir(p):
            for fname in os.listdir(p):
                fname = p + "/" + fname
                if re.search("json$", fname) and "___pubdb" not in fname:
                    try:
                        # Context manager so the handle is closed even
                        # when parsing fails.
                        with open(fname, "r") as f:
                            obj = json.load(f)
                    except json.decoder.JSONDecodeError as e:
                        # Keep scanning so every broken file is reported.
                        error = True
                        print("error", fname, e)
                        continue
                    except ValueError:
                        print("-----------\nJSON ERROR in ", fname, "\n")
                        raise
                    id_add(fname, type_, obj["id"])
                    if "name" in obj:
                        name = utils.id_create(fname, type_, obj["name"])
                        name_id[name] = utils.id_create(
                            fname, type_, obj["id"])
                    if type_ == "person":
                        utils.person_seen_add(fname, obj)

    if error:
        sys.exit(1)

    print("processing objects")
    for obj in objects:
        obj["tags"].append("caida")
        key_to_key(obj, "pubdb_presentation_id", "pubdb_id")
        key_to_key(obj, "venue", "publisher")
        # "pdf" (and, for papers, "bibtex") resources go in front; all
        # other non-catalog links follow.
        resources_front = []
        resources_back = []
        if "presenters" in obj:
            obj["type"] = "PRESENTATION"
            for info in obj["presenters"]:
                key_to_key(info, "name", "person")
                key_to_key(info, "organization", "organizations")
                for key in ["name", "person"]:
                    if key in info:
                        info["person"] = "person:" + info[key]
                        person_create(obj["id"], info["person"])
                        if key != "person":
                            del info[key]
                if "date" in info:
                    date = utils.date_parse(info["date"])
                    if date is not None:
                        info["date"] = date
                        # The object takes the latest presenter date.
                        if "date" not in obj or obj["date"] < info["date"]:
                            obj["date"] = info["date"]
        if "authors" in obj:
            for info in obj["authors"]:
                key_to_key(info, "organization", "organizations")

        if "links" in obj:
            links = []
            for link in obj["links"]:
                if link["label"] == "DOI":
                    obj["doi"] = link["to"]

                # Raw strings: "\d" and "\/" are invalid escape sequences
                # in ordinary string literals.
                m = re.search(
                    r"https://www.caida.org/publications/([^\/]+)/(\d\d\d\d)\/([^/]+)/$",
                    link["to"])
                id_ = None
                if m:
                    type_, date, id_ = m.groups()
                    if type_ == "papers":
                        type_ = "paper"
                    elif type_ == "presentations":
                        type_ = "media"

                m = re.search(
                    r"https://catalog.caida.org/details/([^\/]+)/([^/]+)",
                    link["to"])
                if m:
                    type_, id_ = m.groups()
                    id_ = utils.id_create(obj["filename"], type_, id_)

                if id_ is not None and id_ in seen:
                    # Link to an object that is already known.
                    links.append({"to": id_, "label": link["label"]})
                else:
                    # Anything else becomes an external resource.
                    resource = {
                        "name": link["label"],
                        "url": link["to"],
                        "tags": []
                    }
                    if re.search("^pdf$", resource["name"], re.IGNORECASE):
                        resources_front.append(resource)
                    else:
                        resources_back.append(resource)
            obj["links"] = links
        if obj["__typename"] == "paper":
            # Move bibliographic keys off the object into "bibtexFields".
            obj["bibtexFields"] = {}
            for key_from in [
                    "type", "booktitle", "institution", "journal", "volume",
                    "venue", "pages", "peerReviewedYes", "bibtex", "year",
                    "mon"
            ]:
                if key_from in obj and len(obj[key_from]) > 0:
                    if key_from == "booktitle":
                        key_to = "bookTitle"
                    else:
                        key_to = key_from

                    obj["bibtexFields"][key_to] = obj[key_from]
                    del obj[key_from]

            # The URL is assembled from obj["id"][:4] and obj["id"][5:].
            resources_front.append({
                "name":
                "bibtex",
                "url":
                "https://www.caida.org/publications/papers/" + obj["id"][:4] +
                "/" + obj["id"][5:] + "/bibtex.html"
            })
        resources_front.extend(resources_back)
        obj["resources"] = resources_front

        if "datePublished" in obj:
            obj["date"] = utils.date_parse(obj["datePublished"])

        if "linkedObjects" in obj and len(obj["linkedObjects"]) > 0:
            linked = obj["linkedObjects"].lower().strip()
            if re_ids_only.search(linked):
                # setdefault: the object may have linkedObjects but no
                # "links" list of its own.
                obj_links = obj.setdefault("links", [])
                for to_id in re_whitespace.split(linked):
                    obj_links.append(to_id)
            else:
                print(obj["id"],
                      "failed to parse linkedObject `" + linked + "'")

        with open(obj["filename"], "w") as f:
            json.dump(obj, f, indent=4)

    for obj in id_person.values():
        if "already_exists" not in obj:
            with open(obj["filename"], "w") as f:
                json.dump(obj, f, indent=4)
コード例 #4
0
ファイル: data-build.py プロジェクト: CyberCF/catalog-data
def object_finish(obj):
    """Resolve the references held in *obj*'s fields, in place.

    *obj* is a dict that must contain ``filename``.  Processing:

    * every entry of ``links`` is passed to ``link_add`` and the key is
      then removed;
    * ``tags`` is created if missing; tags that resolve through
      ``object_lookup_type_name`` are replaced by the tag object's id and
      linked;
    * string values whose key matches ``re_date_key`` are normalised with
      ``utils.date_parse``;
    * ``persons`` / ``venues`` / ``presenters`` / ``authors`` entries have
      their person references replaced by person ids; entries whose
      lookup fails are removed from the list;
    * ``licenses`` entries are replaced by license object ids, creating
      the license object when it does not exist yet;
    * every other value is passed through ``tag_convert``.

    Nothing is returned.
    """
    if "links" in obj:
        for link in obj["links"]:
            link_add(obj, link)
        del obj["links"]

    if "tags" not in obj:
        obj["tags"] = []

    # Iterate over a snapshot of the keys: the helpers called below
    # receive obj and must not disturb the iteration.
    for key in list(obj):
        if key == "tags":
            for i, tag in enumerate(obj["tags"]):
                o = object_lookup_type_name(obj["filename"], "tag", tag)
                if o is not None:
                    tag = obj["tags"][i] = o["id"]
                    link_add(obj, tag)

        elif re_date_key.search(key) and isinstance(obj[key], str):
            date = utils.date_parse(obj[key])
            if date:
                obj[key] = date

        elif key in ("persons", "venues", "presenters", "authors"):
            i = 0
            persons = set()
            # Index-based loop because failed entries are deleted in place.
            while i < len(obj[key]):
                person_org = obj[key][i]
                error = False
                if isinstance(person_org, dict):
                    # A person is tagged "caida" when any of the entry's
                    # organizations mentions caida (case-insensitive).
                    caida = False
                    if "organizations" in person_org:
                        for org in person_org["organizations"]:
                            if re.search("caida", org, re.IGNORECASE):
                                caida = True
                    for k in ["person", "presenter"]:
                        if k in person_org:
                            person = person_lookup_id(obj["filename"],
                                                      person_org[k])
                            # Check for a failed lookup before touching
                            # person["id"].
                            if person is not None:
                                persons.add(person["id"])
                                if caida:
                                    if "tags" not in person:
                                        person["tags"] = ["caida"]
                                    else:
                                        person["tags"].append("caida")
                                person_org[k] = person["id"]
                            else:
                                error = True
                elif isinstance(person_org, str) and person_org.startswith("person:"):
                    person = person_lookup_id(obj["filename"], person_org)
                    if person is not None:
                        persons.add(person["id"])
                        obj[key][i] = person["id"]
                    else:
                        error = True
                if error:
                    # Drop the entry; the next one shifts into slot i.
                    del obj[key][i]
                else:
                    i += 1
            for person_id in persons:
                link_add(obj, person_id)
                personName_add(obj, person_id)

        elif key == "licenses":
            # Copy the list: obj[key] is rewritten while enumerating.
            licenses = list(obj[key])
            for i, id_ in enumerate(licenses):
                id_2 = utils.id_create(obj["filename"], "license", id_)
                if id_2 not in id_object:
                    # The license name is the text after the first eight
                    # characters of the original id string.
                    object_add("License", {
                        "id": id_2,
                        "name": id_[8:],
                        "filename": obj["filename"]
                    })
                obj[key][i] = id_object[id_2]["id"]

        else:
            obj[key] = tag_convert(obj["filename"], obj[key])