def object_add(type_, info):
    info["__typename"] = type_ = type_.title()
    error = False
    if type_ == "Person":
        person_add_names(info)
        if "name" in info:
            if "id" not in info:
                info["id"] = utils.id_create(info["filename"], info["__typename"], info["name"])
            else:
                info["id"] = utils.id_create(info["filename"], info["__typename"], info["id"])
        else:
            error_add(info["filename"], "failed to find name: " + json.dumps(info))
            error = True
    if type_ == "Paper":  # type_ was title-cased above
        if "datePublished" in info:
            info["date"] = info["datePublished"]
        else:
            error_add(info["filename"], "failed to find paper's date")
            error = True
        m = re.search(r"^paper:(\d\d\d\d)_(.+)", info["id"])
        if m:
            date, id_short = m.groups()
            id_paper[id_short] = info
        else:
            info["id"] = utils.id_create(info["filename"], info["__typename"], info["id"])
    if not error:
        id_object[info["id"]] = info
        return info
    return None
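# Usage sketch (illustrative values; assumes the module-level id_object/id_paper
# registries and utils.id_create defined elsewhere in this file):
#
#   info = {"filename": "sources/paper/2020_example.json",
#           "id": "paper:2020_example", "datePublished": "2020"}
#   obj = object_add("paper", info)
#   # obj["__typename"] == "Paper" and id_paper["example"] is obj;
#   # object_add returns None when a required field (name/date) is missing.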
def link_add(obj, info):
    # Accept either a bare id string or a dict with a "to" key.
    if type(info) == str:
        to_original = info
        to = utils.id_create(None, None, info)
        info = {"to": to}
    else:
        if "to" in info:
            to_original = info["to"]
            to = info["to"] = utils.id_create(None, None, info["to"])
        else:
            error_add(obj["filename"], "link has no to " + json.dumps(info))
            return None
    if to is None:
        error_add(obj["filename"], "invalid id " + to_original)
        return None
    if to not in id_object:
        error_add(obj["filename"], "missing id " + to)
        return False
    info["from"] = obj["id"]
    # Register the link under both endpoints (from->to and to->from).
    for a, b in [("from", "to"), ("to", "from")]:
        a_id = info[a]
        b_id = info[b]
        link = {"from": a_id, "to": b_id}
        if a + "_label" in info:
            link["from_label"] = info[a + "_label"]
        if b + "_label" in info:
            link["to_label"] = info[b + "_label"]
        if "label" in info:
            link["label"] = info["label"]
        if a_id not in id_id_link:
            id_id_link[a_id] = {}
        if b_id in id_id_link[a_id]:
            # Merge into the existing link; keys already present win.
            for key, value in link.items():
                if key not in id_id_link[a_id][b_id]:
                    id_id_link[a_id][b_id][key] = value
        else:
            id_id_link[a_id][b_id] = link
    return True
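# Standalone sketch of the bidirectional registration above: every link is
# stored under both endpoints so lookups work from either side, and keys
# already present in an existing link are preserved. (Illustrative only; the
# real code stores into the module-level id_id_link.)
def _link_register_demo(registry, from_id, to_id, label=None):
    for a_id, b_id in [(from_id, to_id), (to_id, from_id)]:
        link = {"from": a_id, "to": b_id}
        if label is not None:
            link["label"] = label
        existing = registry.setdefault(a_id, {}).setdefault(b_id, {})
        for key, value in link.items():
            existing.setdefault(key, value)

# registry = {}
# _link_register_demo(registry, "paper:2020_x", "dataset:ark", label="uses")
# registry["dataset:ark"]["paper:2020_x"]
#   -> {"from": "dataset:ark", "to": "paper:2020_x", "label": "uses"}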
def add_seen_ids(source_dir):
    global seen_id
    re_placeholder = re.compile(r"___caida")
    for fname in sorted(os.listdir(source_dir)):
        path = source_dir + "/" + fname
        if os.path.isdir(path):
            type_ = fname
            for filename in sorted(os.listdir(path)):
                file_path = path + "/" + filename
                if re.search(r"\.json$", filename, re.IGNORECASE) \
                        and not re_placeholder.search(filename):
                    try:
                        info = json.load(open(file_path))
                        info["filename"] = file_path
                        id_ = info["id"] = utils.id_create(info["filename"], type_, info["id"])
                        if id_ in seen_id:
                            print("duplicate id found in\n   ", filename, "\n   ", seen_id[id_])
                        else:
                            seen_id[id_] = file_path
                    except Exception as e:
                        print("\nerror", file_path)
                        print("   ", e)
                        sys.exit(1)
def object_lookup_type_name(filename, type_, name):
    # Strip a leading "type:" prefix from the name before building the id.
    if name[:len(type_) + 1] == type_ + ":":
        name = name[len(type_) + 1:]
    id_ = utils.id_create(filename, type_, name)
    return object_lookup({
        "id": id_,
        "filename": filename,
        "__typename": type_,
        "name": name
    })
def person_create(filename, obj):
    id_ = utils.id_create(filename, "person", obj)
    if id_ not in id_person:
        # obj looks like "person:last__first" or "last__first".
        if obj[:7] == "person:":
            nameLast, nameFirst = obj[7:].split("__")
        else:
            nameLast, nameFirst = obj.split("__")
        person = {
            "id": id_,
            "__typename": "person",
            "filename": "sources/person/" + id_[7:] + "__pubdb.json",
            "nameLast": nameLast.replace("_", " ").title(),
            "nameFirst": nameFirst.replace("_", " ").title()
        }
        id_person[id_] = person
# Variant of person_create that consults utils.person_seen_check before
# creating a new record.
def person_create(filename, obj):
    if obj[:7] == "person:":
        nameLast, nameFirst = obj[7:].split("__")
    else:
        nameLast, nameFirst = obj.split("__")
    person = utils.person_seen_check(nameLast, nameFirst)
    if person is None:
        id_ = utils.id_create(filename, "person", obj)
        if id_ not in id_person:
            person = {
                "id": id_,
                "__typename": "person",
                "filename": "sources/person/" + id_[7:] + "__pubdb.json",
                "nameLast": nameLast.replace("_", " ").title(),
                "nameFirst": nameFirst.replace("_", " ").title()
            }
            id_person[id_] = person
    elif person["id"] not in id_person:
        person["already_exists"] = True
        id_person[person["id"]] = person
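# Usage sketch (hypothetical name; assumes utils.id_create produces ids of
# the form "person:last__first"):
#
#   person_create("sources/media/example.json", "person:doe__jane")
#   # id_person["person:doe__jane"] ->
#   #   {"id": "person:doe__jane", "__typename": "person",
#   #    "filename": "sources/person/doe__jane__pubdb.json",
#   #    "nameLast": "Doe", "nameFirst": "Jane"}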
def object_lookup_id(filename, id_):
    id_ = utils.id_create(filename, None, id_)
    if id_ in id_object:
        return id_object[id_]
    m = re_type_name.search(id_)
    if m:
        type_, name = m.groups()
        # Persons are never created implicitly from an id.
        if type_.title() == "Person":
            return None
        return object_lookup({
            "id": id_,
            "filename": filename,
            "__typename": type_.title(),
            "name": name.replace("_", " ").title()
        })
    else:
        print("failed to parse id", id_)
        sys.exit(1)
def object_lookup(info):
    type_ = info["__typename"].lower()
    info["__typename"] = type_.title()
    if "id" not in info:
        if "name" in info:
            info["id"] = utils.id_create(info["filename"], info["__typename"], info["name"])
        else:
            print("no id or name/__typename", info)
            sys.exit(1)
    else:
        # Make sure the id carries its lowercase type prefix.
        if not re.search("^" + type_, info["id"]):
            info["id"] = type_ + ":" + info["id"]
    id_ = info["id"]
    if id_ not in id_object:
        obj = object_add(info["__typename"], info)
        if obj is not None:
            object_finish(obj)
        return obj
    return id_object[id_]
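# Usage sketch (illustrative; object_lookup returns the registered object if
# its id is already known, otherwise creates it via object_add/object_finish):
#
#   tag = object_lookup({"__typename": "tag", "name": "topology",
#                        "filename": "sources/paper/example.json"})
#   # tag["id"] carries the "tag:" prefix; repeated calls return the same
#   # registered object.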
def add_author(fname, last_name, first_name):
    global author_data
    person = utils.person_seen_check(last_name, first_name)
    if person is None:
        type_, author_id = utils.id_create(
            fname, "person", last_name + "__" + first_name).split(":")
        if author_id not in author_data:
            file_path = "sources/person/{}___externallinks.json".format(author_id)
            author_data[author_id] = {
                "id": "person:{}".format(author_id),
                "__typename": "person",
                "filename": file_path,
                "nameLast": last_name,
                "nameFirst": first_name,
                "organizations": []
            }
    else:
        author_id = person["id"]
    return author_id
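# Usage sketch (hypothetical names; utils.person_seen_check is assumed to
# return a previously seen person record or None):
#
#   author_id = add_author("data/PANDA-Papers-json.pl.json", "Doe", "Jane")
#   paper["authors"].append({"person": author_id})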
def main():
    load_ids("paper", "papers", args.papers_file)
    load_ids("media", "presentations", args.media_file)

    # Load every source object, recording ids, names, and seen persons.
    error = False
    for type_ in os.listdir("sources"):
        p = "sources/" + type_
        if os.path.isdir(p):
            for fname in os.listdir(p):
                fname = p + "/" + fname
                if re.search("json$", fname) and "___pubdb" not in fname:
                    try:
                        obj = json.load(open(fname, "r"))
                    except json.decoder.JSONDecodeError as e:
                        error = True
                        print("error", fname, e)
                        continue
                    except ValueError as e:
                        print("-----------\nJSON ERROR in ", fname, "\n")
                        raise e
                    id_add(fname, type_, obj["id"])
                    if "name" in obj:
                        name = utils.id_create(fname, type_, obj["name"])
                        name_id[name] = utils.id_create(fname, type_, obj["id"])
                    if type_ == "person":
                        utils.person_seen_add(fname, obj)
    if error:
        sys.exit(1)

    print("processing objects")
    for obj in objects:
        obj["tags"].append("caida")
        key_to_key(obj, "pubdb_presentation_id", "pubdb_id")
        key_to_key(obj, "venue", "publisher")
        resources_front = []
        resources_back = []

        if "presenters" in obj:
            obj["type"] = "PRESENTATION"
            for info in obj["presenters"]:
                key_to_key(info, "name", "person")
                key_to_key(info, "organization", "organizations")
                for key in ["name", "person"]:
                    if key in info:
                        info["person"] = "person:" + info[key]
                        person_create(obj["id"], info["person"])
                        if key != "person":
                            del info[key]
                if "date" in info:
                    date = utils.date_parse(info["date"])
                    if date is not None:
                        info["date"] = date
                    # A presentation's date is its latest presenter date.
                    if "date" not in obj or obj["date"] < info["date"]:
                        obj["date"] = info["date"]

        if "authors" in obj:
            for info in obj["authors"]:
                key_to_key(info, "organization", "organizations")

        if "links" in obj:
            links = []
            for link in obj["links"]:
                if link["label"] == "DOI":
                    obj["doi"] = link["to"]
                # Try to resolve www.caida.org and catalog.caida.org urls
                # to catalog ids.
                id_ = None
                m = re.search(
                    r"https://www.caida.org/publications/([^/]+)/(\d\d\d\d)/([^/]+)/$",
                    link["to"])
                if m:
                    type_, date, id_ = m.groups()
                    if type_ == "papers":
                        type_ = "paper"
                    elif type_ == "presentations":
                        type_ = "media"
                m = re.search(
                    r"https://catalog.caida.org/details/([^/]+)/([^/]+)", link["to"])
                if m:
                    type_, id_ = m.groups()
                if id_ is not None:
                    id_ = utils.id_create(obj["filename"], type_, id_)
                if id_ is not None and id_ in seen:
                    links.append({"to": id_, "label": link["label"]})
                else:
                    # Unresolved links are kept as plain resources instead.
                    resource = {
                        "name": link["label"],
                        "url": link["to"],
                        "tags": []
                    }
                    if re.search("^pdf$", resource["name"], re.IGNORECASE):
                        resources_front.append(resource)
                    else:
                        resources_back.append(resource)
            obj["links"] = links

        if obj["__typename"] == "paper":
            # Move bibtex-related keys into obj["bibtexFields"].
            obj["bibtexFields"] = {}
            for key_from in ["type", "booktitle", "institution", "journal",
                             "volume", "venue", "pages", "peerReviewedYes",
                             "bibtex", "year", "mon"]:
                if key_from in obj and len(obj[key_from]) > 0:
                    key_to = "bookTitle" if key_from == "booktitle" else key_from
                    obj["bibtexFields"][key_to] = obj[key_from]
                    del obj[key_from]
            resources_front.append({
                "name": "bibtex",
                "url": "https://www.caida.org/publications/papers/"
                       + obj["id"][:4] + "/" + obj["id"][5:] + "/bibtex.html"
            })

        resources_front.extend(resources_back)
        obj["resources"] = resources_front

        if "datePublished" in obj:
            obj["date"] = utils.date_parse(obj["datePublished"])

        if "linkedObjects" in obj and len(obj["linkedObjects"]) > 0:
            linked = obj["linkedObjects"].lower().strip()
            if re_ids_only.search(linked):
                for to_id in re_whitespace.split(linked):
                    obj["links"].append(to_id)
            else:
                print(obj["id"], "failed to parse linkedObject `" + linked + "'")

        json.dump(obj, open(obj["filename"], "w"), indent=4)

    # Write out newly created person records.
    for obj in id_person.values():
        if "already_exists" not in obj:
            json.dump(obj, open(obj["filename"], "w"), indent=4)
def id_add(filename, type_, id_):
    id_ = utils.id_create(filename, type_, id_)
    yearless = id_yearless(id_)
    name_id[yearless] = id_
    seen.add(id_)
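# Usage sketch (assumes id_yearless strips the year from an id, e.g.
# "paper:2020_example" -> "paper:example", so a re-dated object can still be
# found under its yearless name):
#
#   id_add("sources/paper/2020_example.json", "paper", "2020_example")
#   # seen contains the full id; name_id maps the yearless id to it.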
def parse_catalog_data_caida(source_dir):
    global id_2_object
    global seen_datasets
    global seen_softwares
    global seen_urls
    global re_mkdn
    global re_mdta
    global re_dlim

    # Number of files skipped because they have no description.
    number_skipped_no_description = 0
    re_md = re.compile(r"\.md$", re.IGNORECASE)

    # Iterate over each file in catalog-data-caida/sources.
    for type_ in sorted(os.listdir(source_dir)):
        path = source_dir + type_ + "/"
        if os.path.isdir(path):
            for file in sorted(os.listdir(path)):
                file_name = file[:file.index(".")].replace("-", "_")
                file_path = path + file

                # Parse .md and .json files; skip everything else.
                if re_mkdn.search(file):
                    metadata = parse_metadata(file_path)
                elif re_json.search(file):
                    try:
                        with open(file_path) as f:
                            metadata = json.load(f)
                    except Exception as e:
                        print("\nerror:", file_path)
                        print("    ", e)
                        sys.exit(1)
                else:
                    print("   skipping", file)
                    continue

                # Edge Case: Replace missing names with the ID.
                if "name" not in metadata:
                    metadata["name"] = metadata["id"].replace("_", " ").upper()

                id_ = metadata["id"] = utils.id_create(file_path, type_, metadata["id"])

                # Not including private datasets.
                if id_ in seen_id:
                    print("duplicate id", id_)
                    print("   ", file_path)
                    print("   ", seen_id[id_])
                    continue
                if id_ in id_2_object:
                    print("duplicate", id_)
                    print("   ", id_2_object[id_]["filename"])
                    print("   ", metadata["filename"])
                    continue
                id_2_object[metadata["id"]] = metadata

                # If it has no description, skip it.
                if "description" not in metadata or re.search(r"^\s*$", metadata["description"]):
                    number_skipped_no_description += 1
                    continue

                # Edge Case: Add CAIDA as organization if the key is missing.
                if "organization" not in metadata:
                    metadata["organization"] = "CAIDA"

                # Edge Case: Add a "caida" tag to all CAIDA objects.
                if "tags" not in metadata:
                    metadata["tags"] = []
                if "CAIDA" in metadata["organization"]:
                    if "caida" not in metadata["tags"]:
                        metadata["tags"].append("caida")

                # Edge Case: Remove empty-string values from objects.
                keys = []
                for key, value in metadata.items():
                    if type(value) == str and re.search(r"^\s*$", value):
                        keys.append(key)
                for key in keys:
                    del metadata[key]
def object_finish(obj):
    ############
    # links
    ############
    if "links" in obj:
        for link in obj["links"]:
            link_add(obj, link)
        del obj["links"]

    for key, value in obj.items():
        if key == "tags":
            # Convert tag names to tag ids and link the object to each tag.
            for i, tag in enumerate(obj["tags"]):
                o = object_lookup_type_name(obj["filename"], "tag", tag)
                if o is not None:
                    tag = obj["tags"][i] = o["id"]
                    link_add(obj, tag)
        elif re_date_key.search(key) and type(obj[key]) == str:
            # Normalize date strings to "YYYY/MM/DD HH:MM:SS", filling
            # missing fields with defaults.
            values = re_not_digit.split(obj[key])
            digits = ["1990", "01", "01", "00", "00", "00"]
            for i, value in enumerate(values):
                digits[i] = value
            obj[key] = "%s/%s/%s %s:%s:%s" % tuple(digits)
        elif key in ("persons", "venues", "presenters", "authors"):
            # Resolve person references; drop entries that fail to resolve.
            i = 0
            while i < len(obj[key]):
                person_org = obj[key][i]
                error = False
                if type(person_org) == dict:
                    for k in ["person", "presenter"]:
                        if k in person_org:
                            person = person_lookup_id(obj["filename"], person_org[k])
                            if person is not None:
                                person_org[k] = person["id"]
                            else:
                                error = True
                elif type(person_org) == str and person_org[:7] == "person:":
                    person = person_lookup_id(obj["filename"], person_org)
                    if person is not None:
                        obj[key][i] = person["id"]
                    else:
                        error = True
                if error:
                    del obj[key][i]
                else:
                    i += 1
        elif key == "licenses":
            # Create License objects for any license ids not yet registered.
            licenses = list(obj[key])
            for i, id_ in enumerate(licenses):
                id_2 = utils.id_create(obj["filename"], None, id_)
                if id_2 not in id_object:
                    object_add("License", {
                        "id": id_2,
                        "name": id_[8:],
                        "filename": obj["filename"]
                    })
                obj[key][i] = id_object[id_2]["id"]
        else:
            obj[key] = tag_convert(obj["filename"], obj[key])
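# Standalone sketch of the date normalization above (illustrative; assumes
# re_not_digit is compiled as r"[^\d]+"):
def _date_normalize_demo(raw):
    values = re.split(r"[^\d]+", raw)
    digits = ["1990", "01", "01", "00", "00", "00"]
    for i, value in enumerate(values):
        digits[i] = value
    return "%s/%s/%s %s:%s:%s" % tuple(digits)

# _date_normalize_demo("2021-06-15")  -> "2021/06/15 00:00:00"
# _date_normalize_demo("2021")        -> "2021/01/01 00:00:00"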
def parse_paper(fname, curr_paper):
    global author_data
    global type_2_bibtex
    global papers
    global alternate_links

    # Dictionary that will be printed as a JSON.
    paper = {
        "__typename": "paper",
        "type": "paper",
        "authors": [],
        "bibtexFields": {},
        "links": [],
        "resources": [],
    }

    # Split the current paper into lines.
    curr_paper = curr_paper.split("\n")
    re_year = re.compile(r"(\d\d\d\d)")
    re_year_month = re.compile(r"(\d\d\d\d).(\d\d)")

    # Iterate over each line of the current paper.
    for line in curr_paper:
        # Split the current line between the TOPKEY and its value.
        line = line.split(":")
        # Edge Case: Skip empty lines.
        if len(line) <= 1:
            continue
        # Rejoin the value, then remove whitespace and surrounding quotes.
        line[1] = ":".join(map(str, line[1:]))
        line[1] = line[1].replace('"', "").strip()

        # Check which TOPKEY is used for the current line.
        if "MARKER" in line[0]:
            paper["id"] = utils.id_create(fname, "paper", line[1])
        elif "TYPE" in line[0]:
            paper["bibtexFields"]["type"] = line[1]
        elif "AUTHOR" in line[0]:
            # Handle the two separate ways that authors can be stored.
            authors = []
            for author in re.split(r";\s*", re.sub(r"\.\s*,", ";", line[1])):
                names = re.split(r"\s*,\s*", author)
                if len(names) == 4:
                    authors.append(names[0] + ", " + names[1])
                    authors.append(names[2] + ", " + names[3])
                else:
                    authors.append(author)
            # Iterate over each author and add an object for them.
            for author in authors:
                author = author.strip()
                if re.search(r"\s*,\s*", author):
                    last_name, first_name = re.split(r"\s*,\s*", author)
                else:
                    # No comma: treat the whole string as the last name and
                    # flag anything that is not a single word as unparseable.
                    if not re.search("^[a-z]+$", author, re.IGNORECASE):
                        print("unparseable", line[1])
                    first_name = ""
                    last_name = author
                author_id = add_author(fname, last_name, first_name)
                paper["authors"].append({"person": author_id})
        # GEOLOC is skipped: it is the country the data request came from,
        # not the organization.
        elif "TITLE" in line[0] and "CTITLE" not in line[0]:
            paper["name"] = line[1]
        elif "YEAR" in line[0]:
            date_str = line[1]
            m = re_year_month.search(date_str)
            date = None
            year = None
            month = None
            if m:
                year = m.group(1)
                month = m.group(2)
                date = year + "." + month
            else:
                m = re_year.search(date_str)
                if m:
                    year = m.group(1)
                    date = year
            if date:
                paper["datePublished"] = date
                paper["date"] = date
                paper["bibtexFields"]["year"] = year
                if month:
                    paper["bibtexFields"]["month"] = month
        elif "TOPKEY" in line[0]:
            datasets = line[1].split(",")
            # Iterate over each dataset and link it to a catalog dataset.
            for dataset in datasets:
                # Remove any whitespace.
                dataset = dataset.strip().lower()
                # Try to map the current dataset to a catalog dataset.
                if dataset in topkey_2_dataset:
                    dataset = topkey_2_dataset[dataset]
                elif len(dataset) == 0:
                    continue
                elif dataset.replace(" ", "-") in topkey_2_dataset:
                    dataset = topkey_2_dataset[dataset.replace(" ", "-")]
                elif dataset.replace("_", "-") in topkey_2_dataset:
                    dataset = topkey_2_dataset[dataset.replace("_", "-")]
                else:
                    keys = topkey_2_dataset.keys()
                    closest_match = difflib.get_close_matches(dataset, keys, 1)
                    # Edge Case: Reverse the dataset if no match, then give up.
                    if len(closest_match) == 0:
                        dataset = dataset.replace(" ", "-").replace("_", "-")
                        dataset = dataset.split("-")
                        dataset.reverse()
                        dataset = "-".join(map(str, dataset))
                        if dataset in topkey_2_dataset:
                            dataset = topkey_2_dataset[dataset]
                        else:
                            continue
                    else:
                        dataset = topkey_2_dataset[closest_match[0]]

                # Append a link to the dataset.
                alternate_link = False
                for alternate in alternate_links:
                    if alternate in dataset:
                        paper["links"].append({"to": "{}".format(dataset)})
                        alternate_link = True
                # So long as the dataset isn't an alternate link, add it.
                if not alternate_link:
                    # Edge Case: Handles datasets that are mapped to lists.
                    if type(dataset) is list:
                        for data in dataset:
                            paper["links"].append({"to": "dataset:{}".format(data)})
                    else:
                        paper["links"].append({"to": "dataset:{}".format(dataset)})
        elif "SERIAL" in line[0]:
            publisher = line[1]
            paper["publisher"] = publisher
            paper["bibtexFields"]["journal"] = publisher
        elif "VOLUME" in line[0]:
            paper["bibtexFields"]["volume"] = line[1]
        elif "CHAPTER" in line[0] or "ARTICLE" in line[0]:
            paper["number"] = line[1]
        elif "PAGE" in line[0]:
            pages = line[1].replace("(", "").replace(")", "")
            paper["pages"] = pages
            paper["bibtexFields"]["pages"] = pages
        elif "CTITLE" in line[0]:
            conference_title = line[1]
            paper["publisher"] = conference_title
            paper["bibtexFields"]["bookTitle"] = conference_title
        elif "DOI" in line[0]:
            paper["resources"].append({
                "name": "DOI",
                "url": "https://dx.doi.org/" + line[1]
            })
        elif "URL" in line[0]:
            paper["resources"].append({"name": "URL", "url": line[1]})
        elif "ABS" in line[0]:
            paper["description"] = line[1]
        elif "PUBLISH" in line[0]:
            paper["bibtexFields"]["institutions"] = line[1]
        elif "REMARK" in line[0] or "PLACE" in line[0]:
            # Append to an existing annotation rather than overwriting it.
            if "annotation" not in paper:
                paper["annotation"] = line[1]
            else:
                paper["annotation"] += " {}".format(line[1])

    # Only add papers that have an ID.
    if "id" in paper:
        papers[paper["id"]] = paper
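# Illustrative example of the pubdb record format parse_paper expects (field
# values are made up):
#
#   MARKER: "2020_example_paper"
#   AUTHOR: "Doe, Jane; Smith, John"
#   TITLE: "An Example Paper"
#   YEAR: "2020.05"
#   TOPKEY: "ark, telescope"
#
# Each line is split on the first ":"; keys are matched by substring, so
# CTITLE must be ruled out before a line is treated as TITLE.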
def main():
    links = set()
    if not os.path.exists(files_dir):
        print("error:", files_dir, "does not exist", file=sys.stderr)
        sys.exit(1)

    # Map resource urls to the ids of the objects that own them.
    for type_ in os.listdir("sources"):
        p = "sources/" + type_
        if os.path.isdir(p):
            for fname in os.listdir(p):
                fname = p + "/" + fname
                if re.search("json$", fname) and "__" not in fname:
                    try:
                        obj = json.load(open(fname, "r"))
                        id_ = utils.id_create(fname, type_, obj["id"])
                        if "resources" in obj:
                            for resource in obj["resources"]:
                                if "url" in resource and len(resource["url"]) > 10:
                                    url = url_cleaner(resource["url"])
                                    if "media" not in id_ or resource["name"] == "pdf":
                                        url_id[url] = id_
                    except ValueError as e:
                        print(fname)
                        raise e

    # Scan pubdb pdfs for urls that resolve to known objects.
    for type_, filename in [["media", "data/PANDA-Presentations-json.pl.json"],
                            ["paper", "data/PANDA-Papers-json.pl.json"]]:
        for obj in json.load(open(filename, "r")):
            if "linkedObjects" in obj and len(obj["linkedObjects"]) > 0:
                continue
            id_ = utils.id_create(filename, type_, obj["id"])
            if "links" in obj:
                for link in obj["links"]:
                    if "to" not in link:
                        continue
                    m = re.search(r"(\d\d\d\d/[^/]+/[^/]+.pdf$)", link["to"])
                    if not m:
                        continue
                    fname = data_dir + "/" + m.group(1)
                    found = None
                    if os.path.exists(fname):
                        found = fname
                    else:
                        fname = "data/presentations/" + m.group(1)
                        if os.path.exists(fname):
                            found = fname
                    if found:
                        # Extract the pdf's text and look for known urls.
                        fname_txt = re.sub("pdf", "txt", fname)
                        if not os.path.exists(fname_txt):
                            subprocess.run(["pdftotext", found])
                        with open(fname_txt, "r") as f:
                            for line in f:
                                m = re.search(r"(http[^\s]+)", line)
                                if m:
                                    url = url_cleaner(m.group(1))
                                    if url in url_id:
                                        link = [id_, url_id[url]]
                                        links.add(json.dumps(link))
                                    else:
                                        m = re.search("www.caida.org/data/([^/]+)", url)
                                        if m:
                                            # Candidate dataset ids are parsed
                                            # but not currently used.
                                            i = utils.id_create("", "dataset", m.group(1))

    with open(pubdb_links_file, "w") as f:
        print("writing", pubdb_links_file)
        json.dump(list(links), f, indent=4)
#! /usr/bin/env python3
import json
import subprocess
import sys

import lib.utils as utils

# Fill in missing name fields from the id, normalize the organization key,
# rebuild the id, and rename the file to match.
for fname in sys.argv[1:]:
    person = json.load(open(fname, "r"))
    if "nameFirst" not in person or "nameLast" not in person:
        if person["id"][:7] == "person:":
            names = person["id"][7:].split("_")
        else:
            names = person["id"].split("_")
        person["nameLast"] = names[0].title()
        person["nameFirst"] = " ".join(names[1:]).title()
    person["name"] = person["nameLast"] + ", " + person["nameFirst"]
    if "organization" in person:
        person["organizations"] = person["organization"]
        del person["organization"]
    person["id"] = utils.id_create(
        fname, "person", person["nameLast"] + "__" + person["nameFirst"])
    filename = person["id"][7:] + ".json"
    subprocess.run(["git", "mv", fname, filename])
    json.dump(person, open(filename, "w"), indent=4)
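# Usage (hypothetical script name, run from the repo root):
#
#   ./person_fix.py sources/person/doe__jane.json
#
# The file is rewritten with normalized name fields and "git mv"ed to
# "<id-without-prefix>.json".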