Example #1
0
def routine():
    """Export organizational units (OUs) as CSV node/edge lists.

    Reads the OU collection from OUS_DIR/all.json and writes
    GRAPH_DIR/pure--ous_nodes.csv (id, cleaned name) plus
    GRAPH_DIR/pure--ous_ous_edges.csv (unit -> parent affiliation).
    Console output is redirected to LOG_DIR/graph_ous.log while running.
    """
    # exist_ok avoids the check-then-create race of exists() + makedirs()
    os.makedirs(LOG_DIR, exist_ok=True)
    os.makedirs(GRAPH_DIR, exist_ok=True)

    print("console output is redirected to graph_ous.log ...")

    stdout = sys.stdout

    log = open(os.path.join(LOG_DIR, "graph_ous.log"), "w+")
    sys.stdout = log
    # try/finally guarantees the log is closed and stdout restored even if
    # reading or writing fails halfway through.
    try:
        ous = utils.read_json(os.path.join(OUS_DIR, "all.json"))

        org_nodes = [['Id', 'Label']]
        org_edges = [['Source', 'Target']]

        for record in ous['records']:
            org_unit_id = record['data']['objectId']
            org_unit_name = utils.clean_string(record['data']['name'])
            org_nodes.append([org_unit_id, org_unit_name])
            # top-level units have no parentAffiliation and get no edge
            if 'parentAffiliation' in record['data']:
                parent = record['data']['parentAffiliation']['objectId']
                org_edges.append([org_unit_id, parent])

        utils.write_csv(os.path.join(GRAPH_DIR, 'pure--ous_nodes.csv'),
                        org_nodes)
        utils.write_csv(os.path.join(GRAPH_DIR, 'pure--ous_ous_edges.csv'),
                        org_edges)
    finally:
        log.close()
        sys.stdout = stdout
Example #2
0
def routine():
    """Tokenize cleaned title files into per-line (Doc, Term) CSV tables.

    Walks TITLES_DIR for *.txt files (skipping *raw.txt) and, for every
    file found, writes a sibling .csv in which each word of line N becomes
    a row ("<basename>:N", word).
    """
    data_paths = []
    for root, dirs, files in os.walk(TITLES_DIR):
        for name in files:
            # take the cleaned .txt files only, skip the raw dumps
            if name.endswith(".txt") and not name.endswith("raw.txt"):
                data_paths.append(os.path.realpath(os.path.join(root, name)))

    for dp in data_paths:
        # os.path is portable; the original split paths on "/" by hand
        out_dir = os.path.dirname(dp)
        # keep the first-dot split of the original (basename up to first ".")
        out_pre = os.path.basename(dp).split(".")[0]
        out_path = os.path.join(out_dir, out_pre + ".csv")
        doc = [["Doc", "Term"]]
        lines = utils.read_plain_clean(dp)
        # every input line is one "document"; ids are 1-based and advance
        # even for lines that contain no words
        for doc_id, line in enumerate(lines, start=1):
            for word in line.split():
                doc.append([out_pre + ":" + str(doc_id), word])
        utils.write_csv(out_path, doc)
Example #3
0
def routine():
    """Export languages as a CSV node list with label and coordinates.

    Reads LANG_DIR/collection.json and writes
    GRAPH_DIR/pure--lang_nodes.csv with one row per language
    (id, dc:title name, Google-Earth KML coordinates). Console output is
    redirected to LOG_DIR/graph_languages.log while running.
    """
    # exist_ok avoids the check-then-create race of exists() + makedirs()
    os.makedirs(LOG_DIR, exist_ok=True)
    os.makedirs(GRAPH_DIR, exist_ok=True)

    stdout = sys.stdout

    print("console output is redirected to graph_languages.log ...")

    log = open(os.path.join(LOG_DIR, "graph_languages.log"), "w+")
    sys.stdout = log
    # try/finally guarantees the log is closed and stdout restored even if
    # reading or writing fails halfway through.
    try:
        languages_raw = utils.read_json(
            os.path.join(LANG_DIR, 'collection.json'))

        languages = [['Id', 'Label', 'Coordinates']]

        # flattened RDF-style property keys used in the collection
        dc_title = 'http_purl_org_dc_elements_1_1_title'
        # dc_idx = 'http_purl_org_dc_elements_1_1_identifier'
        google_coordinates = 'http_earth_google_com_kml_2_1_coordinates'

        for lang in languages_raw:
            entry = languages_raw[lang]
            name = ''
            if dc_title in entry:
                name = entry[dc_title]
                # the title may be a single value or a list; take the first
                if isinstance(name, list):
                    name = name[0]
            else:
                print("no name found for language", lang)
            coordinates = entry.get(google_coordinates, '')
            languages.append([lang, name, coordinates])

        utils.write_csv(os.path.join(GRAPH_DIR, 'pure--lang_nodes.csv'),
                        languages)
    finally:
        log.close()
        sys.stdout = stdout
Example #4
0
def routine():
    """Export publication contexts as CSV node/edge lists.

    Reads CTX_DIR/all.json and writes GRAPH_DIR/pure--ctx_nodes.csv
    (id, name, creation year) plus GRAPH_DIR/pure--ctx_ous_edges.csv
    (context -> responsible affiliation). Console output is redirected to
    LOG_DIR/graph_contexts.log while running.
    """
    # exist_ok avoids the check-then-create race of exists() + makedirs()
    os.makedirs(LOG_DIR, exist_ok=True)
    os.makedirs(GRAPH_DIR, exist_ok=True)

    print("console output is redirected to graph_contexts.log ...")

    stdout = sys.stdout

    log = open(os.path.join(LOG_DIR, "graph_contexts.log"), "w+")
    sys.stdout = log
    # try/finally guarantees the log is closed and stdout restored even if
    # reading or writing fails halfway through.
    try:
        ctxs = utils.read_json(os.path.join(CTX_DIR, "all.json"))

        ctx_nodes = [["Id", "Label", "Created"]]
        ctx_edges = [["Source", "Target"]]

        for rec in ctxs['records']:
            data = rec['data']
            objectId = data['objectId']
            # creationDate appears to be dash-separated (ISO-like);
            # keep only the leading year component
            created = data['creationDate'].split("-")[0]
            ctx_nodes.append([objectId, data['name'], created])
            for m in data['responsibleAffiliations']:
                ctx_edges.append([objectId, m['objectId']])

        utils.write_csv(os.path.join(GRAPH_DIR, "pure--ctx_nodes.csv"),
                        ctx_nodes)
        utils.write_csv(os.path.join(GRAPH_DIR, "pure--ctx_ous_edges.csv"),
                        ctx_edges)
    finally:
        log.close()
        sys.stdout = stdout
Example #5
0
def routine():
    """Export institutes and the contexts they maintain as CSV graph files.

    Joins OUS_DIR/all.json, CTX_DIR/all.json and MAPPED_DIR/ous_ctx.json,
    writing MPI nodes, MPI->context edges and context nodes into GRAPH_DIR.
    Console output is redirected to LOG_DIR/graph_contexts_mpis.log.
    """
    # exist_ok avoids the check-then-create race of exists() + makedirs()
    os.makedirs(LOG_DIR, exist_ok=True)
    os.makedirs(GRAPH_DIR, exist_ok=True)

    print("console output is redirected to graph_contexts_mpis.log ...")

    stdout = sys.stdout

    log = open(os.path.join(LOG_DIR, "graph_contexts_mpis.log"), "w+")
    sys.stdout = log
    # try/finally guarantees the log is closed and stdout restored even if
    # reading or writing fails halfway through.
    try:
        ctxs = utils.read_json(os.path.join(CTX_DIR, "all.json"))
        ous = utils.read_json(os.path.join(OUS_DIR, "all.json"))
        mpis = utils.read_json(os.path.join(MAPPED_DIR, "ous_ctx.json"))

        institutes = [['Id', 'Label']]
        contexts = [['Id', 'Label', 'Created']]

        for rec in ous['records']:
            data = rec['data']
            if data['objectId'] in mpis:
                institutes.append([data['objectId'],
                                   utils.clean_string(data['name'])])

        utils.write_csv(os.path.join(GRAPH_DIR, 'mpis--ous_nodes--ctx.csv'),
                        institutes)

        institutes_contexts = [['Source', 'Target']]
        # set gives O(1) membership tests below (original used a list)
        mpis_ctx = set()

        for mpi in mpis:
            for context in mpis[mpi]:
                institutes_contexts.append([mpi, context])
                mpis_ctx.add(context)

        utils.write_csv(os.path.join(GRAPH_DIR, 'mpis--ous_ctx_edges.csv'),
                        institutes_contexts)

        for rec in ctxs['records']:
            data = rec['data']
            objectId = data['objectId']
            if objectId in mpis_ctx:
                name = data['name'].replace('"', '')
                # keep only the leading year of the dash-separated date
                created = data['creationDate'].split("-")[0]
                contexts.append([objectId, name, created])

        utils.write_csv(os.path.join(GRAPH_DIR, 'mpis--ctx_nodes--ous.csv'),
                        contexts)
    finally:
        log.close()
        sys.stdout = stdout
Example #6
0
def routine():
    """Export tag and category relations of institutes as CSV graph files.

    Produces node/edge CSVs in GRAPH_DIR linking institutes (MPIs) to their
    descriptive tags, to their categories, and categories to tags. Console
    output is redirected to LOG_DIR/graph_descriptor.log while running.
    """
    # exist_ok avoids the check-then-create race of exists() + makedirs()
    os.makedirs(LOG_DIR, exist_ok=True)
    os.makedirs(GRAPH_DIR, exist_ok=True)

    stdout = sys.stdout

    print("console output is redirected to graph_descriptor.log ...")

    # NOTE(fix): the original opened "graph_description.log", contradicting
    # the message printed above; the file name now matches the announcement.
    log = open(os.path.join(LOG_DIR, "graph_descriptor.log"), "w+")
    sys.stdout = log
    # try/finally guarantees the log is closed and stdout restored even if
    # reading or writing fails halfway through.
    try:
        # --- Tags of Institutes ---

        ous_tags = utils.read_json(os.path.join(MAPPED_DIR, 'ous_tags.json'))
        tags = sorted(utils.read_json(
            os.path.join(MAPPED_DIR, 'tags_ous.json')).keys())

        tag_nodes = [["Id", "Label"]]
        # precompute tag -> "tag_<n>" once; the original called
        # list.index() per edge, which is quadratic overall
        tag_ids = {}
        for i, t in enumerate(tags, start=1):
            tag_id = 'tag_' + str(i)
            tag_ids[t] = tag_id
            tag_nodes.append([tag_id, t])

        utils.write_csv(os.path.join(GRAPH_DIR, "mpis--tags_nodes.csv"),
                        tag_nodes)

        mpis_tags = [['Source', 'Target']]

        print("try to find tags for", len(ous_tags), "institutes")

        for mpi in ous_tags:
            for tag in ous_tags[mpi]:
                mpis_tags.append([mpi, tag_ids[tag]])

        print("found", len(mpis_tags) - 1, "edges from",
              len(ous_tags), "institutes to",
              len(tag_nodes) - 1, "tags")

        utils.write_csv(os.path.join(
            GRAPH_DIR, 'mpis--ous_tags_edges.csv'), mpis_tags)

        # --- Categories of Institutes ---

        mpis = utils.read_json(os.path.join(MAPPED_DIR, 'ous_mpi.json'))
        cats = utils.read_json(os.path.join(MAPPED_DIR, 'cat_ous.json'))

        cat_nodes = [["Id", "Label"]]
        cat_edges = [["Source", "Target"]]

        mpis_nodes = [["Id", "Label"]]

        # set gives O(1) de-duplication (original used list membership)
        all_mpis = set()
        all_cats = sorted(cats.keys())

        print("try to find categories for", len(mpis), "institutes")

        for i, category in enumerate(all_cats, start=1):
            cat_idx = "category_" + str(i)
            cat_nodes.append([cat_idx, category])
            for ou_idx in cats[category]:
                if ou_idx not in all_mpis:
                    all_mpis.add(ou_idx)
                    mpis_nodes.append([ou_idx, mpis[ou_idx]])
                cat_edges.append([ou_idx, cat_idx])

        print("found", len(cat_edges) - 1, "edges from",
              len(all_mpis), "institutes to",
              len(all_cats), "categories")

        utils.write_csv(os.path.join(
            GRAPH_DIR, "mpis--ous_nodes--cats.csv"), mpis_nodes)
        utils.write_csv(os.path.join(GRAPH_DIR, "mpis--cats_nodes.csv"),
                        cat_nodes)
        utils.write_csv(os.path.join(
            GRAPH_DIR, "mpis--ous_cat_edges.csv"), cat_edges)

        # --- Tags of Institutes of Categories ---
        # (renamed locals: the original reused `tags` for the ous_tags dict
        # and again for a per-category edge list, shadowing the earlier list)

        cats = utils.read_json(os.path.join(MAPPED_DIR, 'cat_ous.json'))
        ou_tags = utils.read_json(os.path.join(MAPPED_DIR, 'ous_tags.json'))

        cat_names = sorted(cats.keys())

        seen_ous = set()
        seen_tags = set()
        cat_tags = {}

        # collect, per category, the unique tags of all its institutes
        for cat in cat_names:
            cat_tags[cat] = []
            for ou_idx in cats[cat]:
                seen_ous.add(ou_idx)
                for ou_tag in ou_tags[ou_idx]:
                    seen_tags.add(ou_tag)
                    if ou_tag not in cat_tags[cat]:
                        cat_tags[cat].append(ou_tag)

        # "tag_<n>" ids follow the sorted order of every tag encountered
        ordered_tags = sorted(seen_tags)
        tag_index = {tag: "tag_" + str(i)
                     for i, tag in enumerate(ordered_tags, start=1)}

        cat_edges = [["Source", "Target"]]

        # one edge per (category, tag) pair, tags in sorted order per category
        for i, cat in enumerate(cat_names, start=1):
            cat_idx = "category_" + str(i)
            members = set(cat_tags[cat])
            for tag in ordered_tags:
                if tag in members:
                    cat_edges.append([cat_idx, tag_index[tag]])

        print("found categories for", len(seen_ous), "institutes")

        utils.write_csv(os.path.join(
            GRAPH_DIR, "mpis--cats-tags_edges.csv"), cat_edges)
    finally:
        log.close()
        sys.stdout = stdout
Example #7
0
def routine():
    """Build the MPI organizational-unit tree as CSV node/edge files.

    Starting from the institutes in MAPPED_DIR/ous_mpi.json, collects all
    transitive child OUs from OUS_DIR/all.json and writes node/edge CSVs
    (full tree, institutes only, children only) into GRAPH_DIR. Console
    output is redirected to LOG_DIR/graph_ous_mpis.log while running.
    """
    # exist_ok avoids the check-then-create race of exists() + makedirs()
    os.makedirs(LOG_DIR, exist_ok=True)
    os.makedirs(GRAPH_DIR, exist_ok=True)

    print("console output is redirected to graph_ous_mpis.log ...")

    stdout = sys.stdout

    log = open(os.path.join(LOG_DIR, "graph_ous_mpis.log"), "w+")
    sys.stdout = log
    # try/finally guarantees the log is closed and stdout restored even if
    # reading or writing fails halfway through.
    try:
        mpis = utils.read_json(os.path.join(MAPPED_DIR, "ous_mpi.json"))
        ous = utils.read_json(os.path.join(OUS_DIR, "all.json"))

        ous_nodes = [["Id", "Label"]]
        ous_edges = [["Source", "Target"]]

        # sets give O(1) membership tests; the original used lists, making
        # every sweep of the fixed-point loop below quadratic
        children = set()
        ous_collected = set()

        # first sweep: institutes themselves plus their direct children
        for rec in ous['records']:
            data = rec['data']
            objectId = data['objectId']
            parent = (data['parentAffiliation']['objectId']
                      if 'parentAffiliation' in data else None)
            if objectId in mpis:
                ous_nodes.append([objectId, utils.clean_string(data['name'])])
                ous_collected.add(objectId)
                if parent is not None:
                    ous_edges.append([objectId, parent])
                else:
                    print("no parent:", objectId)
            elif parent is not None and (parent in mpis or parent in children):
                ous_nodes.append([objectId, utils.clean_string(data['name'])])
                ous_collected.add(objectId)
                ous_edges.append([objectId, parent])
                if data['hasChildren']:
                    children.add(objectId)

        # fixed-point iteration: keep sweeping the records until a full pass
        # discovers no new descendant of a collected unit
        changed = True
        while changed:
            changed = False
            for rec in ous['records']:
                data = rec['data']
                objectId = data['objectId']
                if objectId in ous_collected or 'parentAffiliation' not in data:
                    continue
                parent = data['parentAffiliation']['objectId']
                if parent in mpis or parent in children:
                    ous_nodes.append([objectId,
                                      utils.clean_string(data['name'])])
                    ous_collected.add(objectId)
                    changed = True
                    ous_edges.append([objectId, parent])
                    if data['hasChildren']:
                        children.add(objectId)

        utils.write_csv(os.path.join(GRAPH_DIR,
                                     "mpis--ous_nodes--tree-full.csv"),
                        ous_nodes)
        utils.write_csv(os.path.join(GRAPH_DIR,
                                     "mpis--ous_ous_edges--tree.csv"),
                        ous_edges)

        # Institutes

        institutes = [['Id', 'Label']]

        for rec in ous['records']:
            data = rec['data']
            if data['objectId'] in mpis:
                institutes.append([data['objectId'],
                                   utils.clean_string(data['name'])])

        utils.write_csv(os.path.join(GRAPH_DIR, 'mpis--ous_nodes.csv'),
                        institutes)

        # Children of Institutes (reads back the tree nodes written above)

        kids_names = [["Id", "Label"]]

        mpis_kids_nodes = utils.read_csv_with_header(
            os.path.join(GRAPH_DIR, 'mpis--ous_nodes--tree-full.csv'))
        # NOTE(review): assumes values() yields the columns [ids, labels]
        # in file order, with index 0 unused — confirm read_csv_with_header
        mpis_kids_nodes = list(mpis_kids_nodes.values())

        for i in range(1, len(mpis_kids_nodes[0])):
            kid_id = mpis_kids_nodes[0][i]
            kid_name = utils.clean_string(mpis_kids_nodes[1][i])
            if kid_id not in mpis:
                kids_names.append([kid_id, kid_name])

        utils.write_csv(
            os.path.join(GRAPH_DIR, 'mpis--ous_nodes--tree-children.csv'),
            kids_names)
    finally:
        log.close()
        sys.stdout = stdout
Example #8
0
                        pub_src_org_nodes.append([src_org_id, src_org_name])
                        all_src_orgs.append(src_org_id)
                src_org_table_i += 1

            src_table_i += 1

print("")
print("processed", len(items_total), "records from", len(data_paths),
      "collections!")
print("found", len(src_table) - 1, "sources!")
print("found", aut_table_i, "internal authorships!")
print("found", ext_table_i, "external authorships!")

print("")

# Persist every collected table to its CSV file in TABLES_DIR,
# in the same order the originals were written.
output_tables = (
    ("publications.csv", pub_table),
    ("publications_sources.csv", src_table),
    ("publications_authors.csv", aut_table),
    ("publications_externals.csv", ext_table),
    ("publications_organizations.csv", org_table),
    ("publications_sources_authors.csv", src_aut_table),
    ("publications_sources_externals.csv", src_ext_table),
    ("publications_sources_organizations.csv", src_org_table),
)

for csv_name, table in output_tables:
    utils.write_csv(os.path.join(TABLES_DIR, csv_name), table)