def routine():
    """Export Pure organizational units as Gephi-ready CSV node/edge lists.

    Reads OUS_DIR/all.json and writes pure--ous_nodes.csv (Id, Label) and
    pure--ous_ous_edges.csv (child -> parent) to GRAPH_DIR. Console output
    is redirected to LOG_DIR/graph_ous.log while the routine runs.
    """
    # exist_ok avoids the race between an exists() check and makedirs()
    os.makedirs(LOG_DIR, exist_ok=True)
    os.makedirs(GRAPH_DIR, exist_ok=True)
    print("console output is redirected to graph_ous.log ...")
    stdout = sys.stdout
    log = open(os.path.join(LOG_DIR, "graph_ous.log"), "w+")
    sys.stdout = log
    try:
        ous = utils.read_json(os.path.join(OUS_DIR, "all.json"))
        org_nodes = [['Id', 'Label']]
        org_edges = [['Source', 'Target']]
        for record in ous['records']:
            org_unit_id = record['data']['objectId']
            org_unit_name = utils.clean_string(record['data']['name'])
            org_nodes.append([org_unit_id, org_unit_name])
            # root units have no parentAffiliation and therefore no edge
            if 'parentAffiliation' in record['data']:
                parent = record['data']['parentAffiliation']['objectId']
                org_edges.append([org_unit_id, parent])
        utils.write_csv(os.path.join(GRAPH_DIR, 'pure--ous_nodes.csv'),
                        org_nodes)
        utils.write_csv(os.path.join(GRAPH_DIR, 'pure--ous_ous_edges.csv'),
                        org_edges)
    finally:
        # restore stdout and release the log handle even if an exception
        # occurred above (the original leaked both on failure)
        log.close()
        sys.stdout = stdout
def routine():
    """Tokenize cleaned title files under TITLES_DIR into (Doc, Term) CSVs.

    Every *.txt file (except the unprocessed *raw.txt companions) is read
    line by line; each whitespace-separated token becomes one CSV row whose
    Doc id is "<basename>:<line number>". The CSV is written next to its
    source file, with the same basename and a .csv extension.
    """
    data_paths = []
    for root, dirs, files in os.walk(TITLES_DIR):
        for name in files:
            if name.endswith(".txt") and not name.endswith("raw.txt"):
                data_paths.append(os.path.realpath(os.path.join(root, name)))
    for dp in data_paths:
        # use os.path instead of manual "/" splitting/joining so the code
        # also works on platforms with a different path separator
        out_dir = os.path.dirname(dp)
        # everything before the FIRST dot, as in the original behavior
        out_pre = os.path.basename(dp).split(".")[0]
        out_path = os.path.join(out_dir, out_pre + ".csv")
        doc = [["Doc", "Term"]]
        doc_id = 1
        lines = utils.read_plain_clean(dp)
        for line in lines:
            for word in line.split():
                doc.append([out_pre + ":" + str(doc_id), word])
            doc_id += 1
        utils.write_csv(out_path, doc)
def routine():
    """Export language metadata as a Gephi-ready CSV node list.

    Reads LANG_DIR/collection.json and writes pure--lang_nodes.csv
    (Id, Label, Coordinates) to GRAPH_DIR. Console output is redirected
    to LOG_DIR/graph_languages.log while the routine runs.
    """
    # exist_ok avoids the race between an exists() check and makedirs()
    os.makedirs(LOG_DIR, exist_ok=True)
    os.makedirs(GRAPH_DIR, exist_ok=True)
    stdout = sys.stdout
    print("console output is redirected to graph_languages.log ...")
    log = open(os.path.join(LOG_DIR, "graph_languages.log"), "w+")
    sys.stdout = log
    try:
        languages_raw = utils.read_json(
            os.path.join(LANG_DIR, 'collection.json'))
        languages = [['Id', 'Label', 'Coordinates']]
        dc_title = 'http_purl_org_dc_elements_1_1_title'
        # dc_idx = 'http_purl_org_dc_elements_1_1_identifier'
        google_coordinates = 'http_earth_google_com_kml_2_1_coordinates'
        for lang in languages_raw:
            name = ''
            if dc_title in languages_raw[lang]:
                name = languages_raw[lang][dc_title]
                # the title field may hold a single string or a list of them
                if isinstance(name, list):
                    name = name[0]
            else:
                print("no name found for language", lang)
            coordinates = ''
            if google_coordinates in languages_raw[lang]:
                coordinates = languages_raw[lang][google_coordinates]
            languages.append([lang, name, coordinates])
        utils.write_csv(os.path.join(GRAPH_DIR, 'pure--lang_nodes.csv'),
                        languages)
    finally:
        # restore stdout and release the log handle even if an exception
        # occurred above (the original leaked both on failure)
        log.close()
        sys.stdout = stdout
def routine():
    """Export Pure contexts as Gephi-ready CSV node/edge lists.

    Reads CTX_DIR/all.json and writes pure--ctx_nodes.csv
    (Id, Label, Created) plus pure--ctx_ous_edges.csv
    (context -> responsible OU) to GRAPH_DIR. Console output is
    redirected to LOG_DIR/graph_contexts.log while the routine runs.
    """
    # exist_ok avoids the race between an exists() check and makedirs()
    os.makedirs(LOG_DIR, exist_ok=True)
    os.makedirs(GRAPH_DIR, exist_ok=True)
    print("console output is redirected to graph_contexts.log ...")
    stdout = sys.stdout
    log = open(os.path.join(LOG_DIR, "graph_contexts.log"), "w+")
    sys.stdout = log
    try:
        ctxs = utils.read_json(os.path.join(CTX_DIR, "all.json"))
        ctx_nodes = [["Id", "Label", "Created"]]
        ctx_edges = [["Source", "Target"]]
        for rec in ctxs['records']:
            objectId = rec['data']['objectId']
            name = rec['data']['name']
            # keep only the year portion of the "YYYY-..." creation date
            created = rec['data']['creationDate'].split("-")[0]
            ctx_nodes.append([objectId, name, created])
            maintainers = rec['data']['responsibleAffiliations']
            for m in maintainers:
                maintainer = m['objectId']
                ctx_edges.append([objectId, maintainer])
        utils.write_csv(os.path.join(GRAPH_DIR, "pure--ctx_nodes.csv"),
                        ctx_nodes)
        utils.write_csv(os.path.join(GRAPH_DIR, "pure--ctx_ous_edges.csv"),
                        ctx_edges)
    finally:
        # restore stdout and release the log handle even if an exception
        # occurred above (the original leaked both on failure)
        log.close()
        sys.stdout = stdout
def routine():
    """Export MPI institutes, their contexts, and the edges between them.

    Joins OUS_DIR/all.json and CTX_DIR/all.json against the
    MAPPED_DIR/ous_ctx.json mapping and writes three CSVs to GRAPH_DIR:
    mpis--ous_nodes--ctx.csv, mpis--ous_ctx_edges.csv and
    mpis--ctx_nodes--ous.csv. Console output is redirected to
    LOG_DIR/graph_contexts_mpis.log while the routine runs.
    """
    # exist_ok avoids the race between an exists() check and makedirs()
    os.makedirs(LOG_DIR, exist_ok=True)
    os.makedirs(GRAPH_DIR, exist_ok=True)
    print("console output is redirected to graph_contexts_mpis.log ...")
    stdout = sys.stdout
    log = open(os.path.join(LOG_DIR, "graph_contexts_mpis.log"), "w+")
    sys.stdout = log
    try:
        ctxs = utils.read_json(os.path.join(CTX_DIR, "all.json"))
        ous = utils.read_json(os.path.join(OUS_DIR, "all.json"))
        mpis = utils.read_json(os.path.join(MAPPED_DIR, "ous_ctx.json"))
        institutes = [['Id', 'Label']]
        contexts = [['Id', 'Label', 'Created']]
        # institute nodes: only OUs that appear in the ous->ctx mapping
        for rec in ous['records']:
            if rec['data']['objectId'] in mpis:
                objectId = rec['data']['objectId']
                name = utils.clean_string(rec['data']['name'])
                institutes.append([objectId, name])
        utils.write_csv(os.path.join(GRAPH_DIR, 'mpis--ous_nodes--ctx.csv'),
                        institutes)
        # institute -> context edges; remember which contexts were seen
        institutes_contexts = [['Source', 'Target']]
        mpis_ctx = []
        for mpi in mpis:
            for context in mpis[mpi]:
                institutes_contexts.append([mpi, context])
                mpis_ctx.append(context)
        utils.write_csv(os.path.join(GRAPH_DIR, 'mpis--ous_ctx_edges.csv'),
                        institutes_contexts)
        # context nodes: only contexts referenced by some institute
        for rec in ctxs['records']:
            objectId = rec['data']['objectId']
            if objectId in mpis_ctx:
                # NOTE(review): plain quote-stripping here, while other
                # routines use utils.clean_string — confirm intentional
                name = rec['data']['name'].replace('"', '')
                created = rec['data']['creationDate'].split("-")[0]
                contexts.append([objectId, name, created])
        utils.write_csv(os.path.join(GRAPH_DIR, 'mpis--ctx_nodes--ous.csv'),
                        contexts)
    finally:
        # restore stdout and release the log handle even if an exception
        # occurred above (the original leaked both on failure)
        log.close()
        sys.stdout = stdout
def routine():
    """Export institute tags, categories, and category-tag edges as CSVs.

    Reads the MAPPED_DIR json mappings (ous_tags, tags_ous, ous_mpi,
    cat_ous) and writes the mpis--tags/cats node and edge CSVs to
    GRAPH_DIR. Console output is redirected to
    LOG_DIR/graph_description.log while the routine runs.
    """
    # exist_ok avoids the race between an exists() check and makedirs()
    os.makedirs(LOG_DIR, exist_ok=True)
    os.makedirs(GRAPH_DIR, exist_ok=True)
    stdout = sys.stdout
    # fixed: the original message advertised "graph_descriptor.log" while
    # the file actually opened below is "graph_description.log"
    print("console output is redirected to graph_description.log ...")
    log = open(os.path.join(LOG_DIR, "graph_description.log"), "w+")
    sys.stdout = log
    try:
        # Tags of Institutes
        ous_tags = utils.read_json(os.path.join(MAPPED_DIR, 'ous_tags.json'))
        tags = list(utils.read_json(
            os.path.join(MAPPED_DIR, 'tags_ous.json')).keys())
        tag_nodes = [["Id", "Label"]]
        tags.sort()
        # tag ids are 1-based positions in the sorted tag list; a dict
        # lookup replaces the original O(n) tags.index() per edge
        tag_ids = {}
        for i, t in enumerate(tags):
            tag_id = 'tag_' + str(i + 1)
            tag_ids[t] = tag_id
            tag_nodes.append([tag_id, t])
        utils.write_csv(os.path.join(GRAPH_DIR, "mpis--tags_nodes.csv"),
                        tag_nodes)
        mpis_tags = [['Source', 'Target']]
        print("try to find tags for", len(ous_tags), "institutes")
        for mpi in ous_tags:
            for tag in ous_tags[mpi]:
                mpis_tags.append([mpi, tag_ids[tag]])
        print("found", len(mpis_tags) - 1, "edges from", len(ous_tags),
              "institutes to", len(tag_nodes) - 1, "tags")
        utils.write_csv(os.path.join(
            GRAPH_DIR, 'mpis--ous_tags_edges.csv'), mpis_tags)
        # Categories of Institutes
        mpis = utils.read_json(os.path.join(MAPPED_DIR, 'ous_mpi.json'))
        cats = utils.read_json(os.path.join(MAPPED_DIR, 'cat_ous.json'))
        cat_nodes = [["Id", "Label"]]
        cat_edges = [["Source", "Target"]]
        mpis_nodes = [["Id", "Label"]]
        all_mpis = []
        all_cats = list(cats.keys())
        all_cats.sort()
        print("try to find categories for", len(mpis), "institutes")
        for i, category in enumerate(all_cats):
            cat_idx = "category_" + str(i + 1)
            cat_nodes.append([cat_idx, category])
            for ou_idx in cats[category]:
                # emit each institute node only once, but an edge for
                # every category membership
                if ou_idx not in all_mpis:
                    all_mpis.append(ou_idx)
                    mpis_nodes.append([ou_idx, mpis[ou_idx]])
                cat_edges.append([ou_idx, cat_idx])
        print("found", len(cat_edges) - 1, "edges from", len(all_mpis),
              "institutes to", len(all_cats), "categories")
        utils.write_csv(os.path.join(
            GRAPH_DIR, "mpis--ous_nodes--cats.csv"), mpis_nodes)
        utils.write_csv(os.path.join(GRAPH_DIR, "mpis--cats_nodes.csv"),
                        cat_nodes)
        utils.write_csv(os.path.join(
            GRAPH_DIR, "mpis--ous_cat_edges.csv"), cat_edges)
        # Tags of Institutes of Categories
        cats = utils.read_json(os.path.join(MAPPED_DIR, 'cat_ous.json'))
        tags = utils.read_json(os.path.join(MAPPED_DIR, 'ous_tags.json'))
        t = list(tags.keys())
        t.sort()
        c = list(cats.keys())
        c.sort()
        all_c = []
        all_t = []
        cat_tags = {}
        tags_cat = {}
        # collect, per category, the distinct tags of its institutes
        # (and the inverse tag -> categories map)
        for cat in c:
            cat_tags[cat] = []
            for ou_idx in cats[cat]:
                if ou_idx not in all_c:
                    all_c.append(ou_idx)
                for ou_tag in tags[ou_idx]:
                    if ou_tag not in all_t:
                        all_t.append(ou_tag)
                    if ou_tag not in tags_cat:
                        tags_cat[ou_tag] = [cat]
                    elif cat not in tags_cat[ou_tag]:
                        tags_cat[ou_tag].append(cat)
                    if ou_tag not in cat_tags[cat]:
                        cat_tags[cat].append(ou_tag)
        all_c.sort()
        # re-key categories by their "category_<n>" node ids
        ctags = {}
        for i, cat in enumerate(c):
            cat_idx = "category_" + str(i + 1)
            ctags[cat_idx] = cat_tags[cat]
        ct_edge = {cat: [] for cat in ctags}
        all_t.sort()
        # tag ids here are 1-based positions in the sorted list of tags
        # that actually occur on categorized institutes
        for i, tag in enumerate(all_t):
            tag_idx = "tag_" + str(i + 1)
            for cat in ctags:
                if tag in ctags[cat]:
                    ct_edge[cat].append(tag_idx)
        cat_edges = [["Source", "Target"]]
        for cat in ct_edge:
            for cat_tag in ct_edge[cat]:
                cat_edges.append([cat, cat_tag])
        print("found categories for", len(all_c), "institutes")
        utils.write_csv(os.path.join(
            GRAPH_DIR, "mpis--cats-tags_edges.csv"), cat_edges)
    finally:
        # restore stdout and release the log handle even if an exception
        # occurred above (the original leaked both on failure)
        log.close()
        sys.stdout = stdout
def routine():
    """Export the MPI organizational-unit tree as CSV node/edge lists.

    Starting from the institutes listed in MAPPED_DIR/ous_mpi.json, this
    collects all transitive child OUs from OUS_DIR/all.json (fixed-point
    iteration, since records are not ordered parent-first) and writes the
    tree, the institute-only list, and the children-only list to GRAPH_DIR.
    Console output is redirected to LOG_DIR/graph_ous_mpis.log.
    """
    # exist_ok avoids the race between an exists() check and makedirs()
    os.makedirs(LOG_DIR, exist_ok=True)
    os.makedirs(GRAPH_DIR, exist_ok=True)
    print("console output is redirected to graph_ous_mpis.log ...")
    stdout = sys.stdout
    log = open(os.path.join(LOG_DIR, "graph_ous_mpis.log"), "w+")
    sys.stdout = log
    try:
        mpis = utils.read_json(os.path.join(MAPPED_DIR, "ous_mpi.json"))
        ous = utils.read_json(os.path.join(OUS_DIR, "all.json"))
        ous_nodes = [["Id", "Label"]]
        ous_edges = [["Source", "Target"]]
        # sets: both are only ever tested for membership, and the original
        # O(n) list scans made the fixed-point loop below quadratic
        children = set()
        ous_collected = set()
        for rec in ous['records']:
            if rec['data']['objectId'] in mpis:
                # institute itself
                objectId = rec['data']['objectId']
                name = utils.clean_string(rec['data']['name'])
                ous_nodes.append([objectId, name])
                ous_collected.add(objectId)
                if 'parentAffiliation' in rec['data']:
                    parent = rec['data']['parentAffiliation']['objectId']
                    ous_edges.append([objectId, parent])
                else:
                    print("no parent:", objectId)
            if rec['data']['objectId'] not in mpis \
                    and 'parentAffiliation' in rec['data']:
                # direct child of an institute (or of an already-seen child)
                if rec['data']['parentAffiliation']['objectId'] in mpis \
                        or rec['data']['parentAffiliation']['objectId'] in children:
                    objectId = rec['data']['objectId']
                    name = utils.clean_string(rec['data']['name'])
                    ous_nodes.append([objectId, name])
                    ous_collected.add(objectId)
                    parent = rec['data']['parentAffiliation']['objectId']
                    ous_edges.append([objectId, parent])
                    if rec['data']['hasChildren']:
                        children.add(objectId)
        # fixed point: keep re-scanning until no new descendant is found,
        # because a child record may precede its parent in the input
        found = True
        while found:
            changed = False
            for rec in ous['records']:
                if rec['data']['objectId'] not in ous_collected \
                        and 'parentAffiliation' in rec['data']:
                    if rec['data']['parentAffiliation']['objectId'] in mpis \
                            or rec['data']['parentAffiliation']['objectId'] in children:
                        objectId = rec['data']['objectId']
                        name = utils.clean_string(rec['data']['name'])
                        ous_nodes.append([objectId, name])
                        ous_collected.add(objectId)
                        changed = True
                        parent = rec['data']['parentAffiliation']['objectId']
                        ous_edges.append([objectId, parent])
                        if rec['data']['hasChildren']:
                            children.add(objectId)
            if not changed:
                found = False
        utils.write_csv(os.path.join(GRAPH_DIR,
                                     "mpis--ous_nodes--tree-full.csv"),
                        ous_nodes)
        utils.write_csv(os.path.join(GRAPH_DIR,
                                     "mpis--ous_ous_edges--tree.csv"),
                        ous_edges)
        # Institutes
        institutes = [['Id', 'Label']]
        for rec in ous['records']:
            if rec['data']['objectId'] in mpis:
                objectId = rec['data']['objectId']
                name = utils.clean_string(rec['data']['name'])
                institutes.append([objectId, name])
        utils.write_csv(os.path.join(GRAPH_DIR, 'mpis--ous_nodes.csv'),
                        institutes)
        # Children of Institutes
        kids_names = [["Id", "Label"]]
        mpis_kids_nodes = utils.read_csv_with_header(
            os.path.join(GRAPH_DIR, 'mpis--ous_nodes--tree-full.csv'))
        # column-oriented: values() yields the Id column and Label column
        mpis_kids_nodes = list(mpis_kids_nodes.values())
        for i in range(1, len(mpis_kids_nodes[0])):
            kid_id = mpis_kids_nodes[0][i]
            kid_name = utils.clean_string(mpis_kids_nodes[1][i])
            # keep only non-institute nodes, i.e. the children
            if kid_id not in mpis:
                kids_names.append([kid_id, kid_name])
        utils.write_csv(
            os.path.join(GRAPH_DIR, 'mpis--ous_nodes--tree-children.csv'),
            kids_names)
    finally:
        # restore stdout and release the log handle even if an exception
        # occurred above (the original leaked both on failure)
        log.close()
        sys.stdout = stdout
pub_src_org_nodes.append([src_org_id, src_org_name]) all_src_orgs.append(src_org_id) src_org_table_i += 1 src_table_i += 1 print("") print("processed", len(items_total), "records from", len(data_paths), "collections!") print("found", len(src_table) - 1, "sources!") print("found", aut_table_i, "internal authorships!") print("found", ext_table_i, "external authorships!") print("") utils.write_csv(os.path.join(TABLES_DIR, "publications.csv"), pub_table) utils.write_csv(os.path.join(TABLES_DIR, "publications_sources.csv"), src_table) utils.write_csv(os.path.join(TABLES_DIR, "publications_authors.csv"), aut_table) utils.write_csv(os.path.join(TABLES_DIR, "publications_externals.csv"), ext_table) utils.write_csv(os.path.join(TABLES_DIR, "publications_organizations.csv"), org_table) utils.write_csv(os.path.join(TABLES_DIR, "publications_sources_authors.csv"), src_aut_table) utils.write_csv(os.path.join(TABLES_DIR, "publications_sources_externals.csv"), src_ext_table) utils.write_csv( os.path.join(TABLES_DIR, "publications_sources_organizations.csv"), src_org_table)