def routine():
    """Build CSV node/edge files linking Max Planck Institutes (MPIs) to contexts.

    Reads the context records, OU records and the OU->context mapping and writes
    three files into GRAPH_DIR:
      - mpis--ous_nodes--ctx.csv : institute nodes (Id, Label)
      - mpis--ous_ctx_edges.csv  : institute -> context edges
      - mpis--ctx_nodes--ous.csv : context nodes referenced by some institute

    Console output is redirected to graph_contexts_mpis.log while running.
    """
    os.makedirs(LOG_DIR, exist_ok=True)
    os.makedirs(GRAPH_DIR, exist_ok=True)
    print("console output is redirected to graph_contexts_mpis.log ...")
    stdout = sys.stdout
    log = open(os.path.join(LOG_DIR, "graph_contexts_mpis.log"), "w+")
    sys.stdout = log
    try:
        ctxs = utils.read_json(os.path.join(CTX_DIR, "all.json"))
        ous = utils.read_json(os.path.join(OUS_DIR, "all.json"))
        mpis = utils.read_json(os.path.join(MAPPED_DIR, "ous_ctx.json"))

        institutes = [['Id', 'Label']]
        contexts = [['Id', 'Label', 'Created']]

        # Institute nodes: only OUs that appear in the OU->context mapping.
        for rec in ous['records']:
            if rec['data']['objectId'] in mpis:
                objectId = rec['data']['objectId']
                name = utils.clean_string(rec['data']['name'])
                institutes.append([objectId, name])
        utils.write_csv(os.path.join(GRAPH_DIR, 'mpis--ous_nodes--ctx.csv'),
                        institutes)

        # Institute -> context edges; remember which contexts were referenced.
        # A set gives O(1) membership tests below (the list was O(n) per test).
        institutes_contexts = [['Source', 'Target']]
        mpis_ctx = set()
        for mpi in mpis:
            for context in mpis[mpi]:
                institutes_contexts.append([mpi, context])
                mpis_ctx.add(context)
        utils.write_csv(os.path.join(GRAPH_DIR, 'mpis--ous_ctx_edges.csv'),
                        institutes_contexts)

        # Context nodes: only contexts referenced by at least one institute.
        for rec in ctxs['records']:
            objectId = rec['data']['objectId']
            if objectId in mpis_ctx:
                name = rec['data']['name'].replace('"', '')
                # keep only the year part of the creation date
                created = rec['data']['creationDate'].split("-")[0]
                contexts.append([objectId, name, created])
        utils.write_csv(os.path.join(GRAPH_DIR, 'mpis--ctx_nodes--ous.csv'),
                        contexts)
    finally:
        # restore stdout even if an exception occurred mid-run
        log.close()
        sys.stdout = stdout
def routine():
    """Build CSV node/edge files for the full PURE organizational-unit tree.

    Writes into GRAPH_DIR:
      - pure--ous_nodes.csv     : one node per OU (Id, Label)
      - pure--ous_ous_edges.csv : child -> parent affiliation edges

    Console output is redirected to graph_ous.log while running.
    """
    os.makedirs(LOG_DIR, exist_ok=True)
    os.makedirs(GRAPH_DIR, exist_ok=True)
    print("console output is redirected to graph_ous.log ...")
    stdout = sys.stdout
    log = open(os.path.join(LOG_DIR, "graph_ous.log"), "w+")
    sys.stdout = log
    try:
        ous = utils.read_json(os.path.join(OUS_DIR, "all.json"))
        org_nodes = [['Id', 'Label']]
        org_edges = [['Source', 'Target']]
        for record in ous['records']:
            org_unit_id = record['data']['objectId']
            org_unit_name = utils.clean_string(record['data']['name'])
            org_nodes.append([org_unit_id, org_unit_name])
            # root OUs have no parentAffiliation and contribute no edge
            if 'parentAffiliation' in record['data']:
                parent = record['data']['parentAffiliation']['objectId']
                org_edges.append([org_unit_id, parent])
        utils.write_csv(os.path.join(GRAPH_DIR, 'pure--ous_nodes.csv'),
                        org_nodes)
        utils.write_csv(os.path.join(GRAPH_DIR, 'pure--ous_ous_edges.csv'),
                        org_edges)
    finally:
        # restore stdout even if an exception occurred mid-run
        log.close()
        sys.stdout = stdout
def routine():
    """Download the item records of every publication context and cache them.

    Reads the list of contexts from CTX_DIR/all.json, then fetches each
    context's collection via the REST client and writes it to
    ITEMS_DIR/<ctx_id>.json.
    """
    os.makedirs(ITEMS_DIR, exist_ok=True)

    ################################
    ### RETRIEVE RECORDS OF CTXs ###
    ################################

    ctxs = utils.read_json(os.path.join(CTX_DIR, "all.json"))
    ctx_meta = {}
    for rec in ctxs['records']:
        objectId = rec['data']['objectId']
        ctx_meta[objectId] = rec['data']['name']

    # deterministic retrieval order
    ctx_ids = sorted(ctx_meta)

    client = Client()
    for ctx_idx in tqdm(ctx_ids):
        print("retrieve data of context:", ctx_meta[ctx_idx])
        ctx_data = client.get_data(ctx_id=ctx_idx)
        utils.write_json(os.path.join(ITEMS_DIR, ctx_idx + ".json"),
                         ctx_data.collection)
def routine():
    """Merge the English and German MPI->OU match files into mpi_ous.json.

    English matches take precedence: a German entry is only added when its OU
    id has not already been claimed by an English name, so each OU appears at
    most once in the merged mapping.
    """
    seen_ous = set()  # OU ids already mapped (set for O(1) membership tests)
    all_fnd = {}

    eng_fnd = utils.read_json(os.path.join(MAP_DIR, "ous_fnd_eng.json"))
    for eng_ou, ou_id in eng_fnd.items():
        all_fnd[eng_ou] = ou_id
        seen_ous.add(ou_id)

    deu_fnd = utils.read_json(os.path.join(MAP_DIR, "ous_fnd_deu.json"))
    for deu_ou, ou_id in deu_fnd.items():
        if ou_id not in seen_ous:
            all_fnd[deu_ou] = ou_id
            seen_ous.add(ou_id)

    utils.write_json(os.path.join(MAP_DIR, "mpi_ous.json"), all_fnd)
def routine():
    """Build a CSV of language nodes (Id, Label, Coordinates) for the graph.

    Reads the raw language collection from LANG_DIR and writes
    GRAPH_DIR/pure--lang_nodes.csv. Console output is redirected to
    graph_languages.log while running.
    """
    os.makedirs(LOG_DIR, exist_ok=True)
    os.makedirs(GRAPH_DIR, exist_ok=True)
    stdout = sys.stdout
    print("console output is redirected to graph_languages.log ...")
    log = open(os.path.join(LOG_DIR, "graph_languages.log"), "w+")
    sys.stdout = log
    try:
        languages_raw = utils.read_json(os.path.join(LANG_DIR, 'collection.json'))
        languages = [['Id', 'Label', 'Coordinates']]

        # predicate keys used in the raw language records
        dc_title = 'http_purl_org_dc_elements_1_1_title'
        # dc_idx = 'http_purl_org_dc_elements_1_1_identifier'
        google_coordinates = 'http_earth_google_com_kml_2_1_coordinates'

        for lang in languages_raw:
            name = ''
            if dc_title in languages_raw[lang]:
                name = languages_raw[lang][dc_title]
                # the title predicate may hold a list of names; take the first
                if isinstance(name, list):
                    name = name[0]
            else:
                print("no name found for language", lang)
            coordinates = ''
            if google_coordinates in languages_raw[lang]:
                coordinates = languages_raw[lang][google_coordinates]
            languages.append([lang, name, coordinates])

        utils.write_csv(os.path.join(GRAPH_DIR, 'pure--lang_nodes.csv'),
                        languages)
    finally:
        # restore stdout even if an exception occurred mid-run
        log.close()
        sys.stdout = stdout
def routine():
    """Build CSV node/edge files for publication contexts and their maintainers.

    Writes into GRAPH_DIR:
      - pure--ctx_nodes.csv     : context nodes (Id, Label, creation year)
      - pure--ctx_ous_edges.csv : context -> responsible-OU edges

    Console output is redirected to graph_contexts.log while running.
    """
    os.makedirs(LOG_DIR, exist_ok=True)
    os.makedirs(GRAPH_DIR, exist_ok=True)
    print("console output is redirected to graph_contexts.log ...")
    stdout = sys.stdout
    log = open(os.path.join(LOG_DIR, "graph_contexts.log"), "w+")
    sys.stdout = log
    try:
        ctxs = utils.read_json(os.path.join(CTX_DIR, "all.json"))
        ctx_nodes = [["Id", "Label", "Created"]]
        ctx_edges = [["Source", "Target"]]
        for rec in ctxs['records']:
            objectId = rec['data']['objectId']
            name = rec['data']['name']
            # keep only the year part of the creation date
            created = rec['data']['creationDate'].split("-")[0]
            ctx_nodes.append([objectId, name, created])
            # one edge per responsible affiliation (maintainer) of the context
            maintainers = rec['data']['responsibleAffiliations']
            for m in maintainers:
                maintainer = m['objectId']
                ctx_edges.append([objectId, maintainer])
        utils.write_csv(os.path.join(GRAPH_DIR, "pure--ctx_nodes.csv"),
                        ctx_nodes)
        utils.write_csv(os.path.join(GRAPH_DIR, "pure--ctx_ous_edges.csv"),
                        ctx_edges)
    finally:
        # restore stdout even if an exception occurred mid-run
        log.close()
        sys.stdout = stdout
# Flat script: map scraped German MPI names to PURE organizational units.
# NOTE(review): this chunk appears truncated — it ends right after building
# `m`, before any matching against `names` happens; confirm the remainder
# in the full file.
from pybman import utils
from .utils_paths import BASE_DIR

PURE_DIR = BASE_DIR + 'pure/'
MPIS_DIR = BASE_DIR + 'mpis/'

##############################################################
### READ FILES CONTAINING ALL MAX PLANCK INSTITUTES (MPIs) ###
### AND ORGANIZATIONAL UNITS (OUs) FROM PURE #################
##############################################################

mpis = utils.read_json(MPIS_DIR + 'scrape/all_deu.json')
pure_ous = utils.read_json(PURE_DIR + 'ous/all.json')

############################
### EXTRACT NAMES OF OUs ###
############################

# map every OU name (primary and alternative, stripped) to its object id
names = {}
for record in pure_ous['records']:
    idx = record['data']['objectId']
    metadata = record['data']['metadata']
    name = metadata['name'].strip()
    names[name] = idx
    # guard against an alternativeNames list whose first entry is empty/falsy
    if 'alternativeNames' in metadata and metadata['alternativeNames'][0]:
        for altname in metadata['alternativeNames']:
            names[altname.strip()] = idx

# scraped institute names to be matched against `names`
m = list(mpis.keys())
def routine():
    """Match scraped German MPI names against PURE organizational-unit names.

    Exact matches (optionally with hyphens replaced by spaces) are written to
    ous_fnd_deu.json; heuristic substring candidates to ous_ideas_deu.json;
    names without any candidate to ous_not_fnd_deu.txt. Progress is printed
    to stdout (the caller may have redirected it to a log).
    """
    os.makedirs(MAP_DIR, exist_ok=True)

    ######################################################################
    ### READ FILES CONTAINING METADATA ON MAX PLANCK INSTITUTES (MPIs) ###
    ### AND ORGANIZATIONAL UNITS (OUs) FROM PURE #########################
    ######################################################################

    mpis = utils.read_json(os.path.join(SCRAPE_DIR, 'all_deu.json'))
    pure_ous = utils.read_json(os.path.join(OUS_DIR, 'all.json'))

    ############################
    ### EXTRACT NAMES OF OUs ###
    ############################

    # map every OU name (primary and alternative, stripped) to its object id
    names = {}
    for record in pure_ous['records']:
        idx = record['data']['objectId']
        metadata = record['data']['metadata']
        names[metadata['name'].strip()] = idx
        # guard against an alternativeNames list whose first entry is empty
        if 'alternativeNames' in metadata and metadata['alternativeNames'][0]:
            for altname in metadata['alternativeNames']:
                names[altname.strip()] = idx

    m = list(mpis.keys())
    n = list(names.keys())

    #############################
    ### MAP MPIs TO NAMES/OUs ###
    #############################

    not_fnd = []
    fnd = {}
    for mpi in m:
        if mpi in n:
            fnd[mpi] = names[mpi]
        elif mpi.replace('-', ' ') in n:
            # e.g. "Max-Planck-Institut" vs "Max Planck Institut"
            fnd[mpi] = names[mpi.replace('-', ' ')]
        else:
            # print("no equivalent found for", mpi)
            not_fnd.append(mpi)

    idea = {}

    def _collect_candidates(splitter, part_index):
        # For every unmatched MPI name, split on `splitter` and look for OUs
        # whose name contains the chosen fragment (case-insensitive). Appends
        # candidates to `idea`, avoiding duplicates, preserving order.
        for no_eq in not_fnd:
            parts = no_eq.split(splitter)
            if len(parts) > 1:
                fragment = parts[part_index].strip().lower()
                for ou in n:
                    if fragment in ou.lower():
                        if no_eq in idea:
                            if ou not in idea[no_eq]:
                                idea[no_eq].append(ou)
                        else:
                            idea[no_eq] = [ou]

    # three passes with increasingly loose split heuristics (same order as
    # the candidates accumulate in `idea`)
    _collect_candidates('Max-Planck-Institut für', 1)
    _collect_candidates('für', 1)
    _collect_candidates(',', 0)

    ###############################
    ### PRINT AND WRITE RESULTS ###
    ###############################

    print("")
    print("found matches for")
    counter = 0
    for mpi in m:
        if mpi not in not_fnd:
            counter += 1
            print(mpi)
    print(str(counter), "in total")
    utils.write_json(os.path.join(MAP_DIR, "ous_fnd_deu.json"), fnd)

    print("")
    print("found possible matches for")
    counter = 0
    for nt_eq in idea:
        counter += 1
        print(nt_eq)
    print(str(counter), "in total")
    utils.write_json(os.path.join(MAP_DIR, "ous_ideas_deu.json"), idea)

    print("")
    print("no match found for")
    counter = 0
    for nt_eq in not_fnd:
        if nt_eq not in idea:
            counter += 1
            print(nt_eq)
    print(str(counter), "in total")
    print("")
    # NOTE(review): JSON content written to a .txt path — kept for
    # compatibility with downstream consumers
    utils.write_json(os.path.join(MAP_DIR, "ous_not_fnd_deu.txt"), not_fnd)
def routine():
    """Build CSV node/edge files describing institutes via tags and categories.

    Produces, in GRAPH_DIR:
      - mpis--tags_nodes.csv / mpis--ous_tags_edges.csv : tag nodes, MPI->tag edges
      - mpis--cats_nodes.csv / mpis--ous_cat_edges.csv  : category nodes, MPI->category edges
      - mpis--ous_nodes--cats.csv                        : MPI nodes referenced by categories
      - mpis--cats-tags_edges.csv                        : category->tag edges

    Console output is redirected to graph_description.log while running.
    """
    os.makedirs(LOG_DIR, exist_ok=True)
    os.makedirs(GRAPH_DIR, exist_ok=True)
    stdout = sys.stdout
    # message now matches the actual log filename (was "graph_descriptor.log")
    print("console output is redirected to graph_description.log ...")
    log = open(os.path.join(LOG_DIR, "graph_description.log"), "w+")
    sys.stdout = log
    try:
        # --- Tags of Institutes -------------------------------------------
        ous_tags = utils.read_json(os.path.join(MAPPED_DIR, 'ous_tags.json'))
        tags = list(utils.read_json(os.path.join(MAPPED_DIR,
                                                 'tags_ous.json')).keys())
        tag_nodes = [["Id", "Label"]]
        tags.sort()
        # tag ids are 1-based positions in the sorted tag list
        for i, t in enumerate(tags):
            tag_id = 'tag_' + str(i + 1)
            tag_nodes.append([tag_id, t])
        utils.write_csv(os.path.join(GRAPH_DIR, "mpis--tags_nodes.csv"),
                        tag_nodes)

        mpis_tags = [['Source', 'Target']]
        print("try to find tags for", len(ous_tags), "institutes")
        for mpi in ous_tags:
            mpi_tags = ous_tags[mpi]
            for tag in mpi_tags:
                tag_id = tags.index(tag) + 1
                tag_id = 'tag_' + str(tag_id)
                mpis_tags.append([mpi, tag_id])
        print("found", len(mpis_tags) - 1, "edges from", len(ous_tags),
              "institutes to", len(tag_nodes) - 1, "tags")
        utils.write_csv(os.path.join(GRAPH_DIR, 'mpis--ous_tags_edges.csv'),
                        mpis_tags)

        # --- Categories of Institutes -------------------------------------
        mpis = utils.read_json(os.path.join(MAPPED_DIR, 'ous_mpi.json'))
        cats = utils.read_json(os.path.join(MAPPED_DIR, 'cat_ous.json'))
        cat_nodes = [["Id", "Label"]]
        cat_edges = [["Source", "Target"]]
        mpis_nodes = [["Id", "Label"]]
        all_mpis = []
        all_cats = list(cats.keys())
        all_cats.sort()
        print("try to find categories for", len(mpis), "institutes")
        for i, category in enumerate(all_cats):
            cat_idx = "category_" + str(i + 1)
            cat_nodes.append([cat_idx, category])
            ous_idx = cats[category]
            for ou_idx in ous_idx:
                # node is emitted once per institute; edge once per pair
                if ou_idx not in all_mpis:
                    all_mpis.append(ou_idx)
                    mpis_nodes.append([ou_idx, mpis[ou_idx]])
                cat_edges.append([ou_idx, cat_idx])
        print("found", len(cat_edges) - 1, "edges from", len(all_mpis),
              "institutes to", len(all_cats), "categories")
        utils.write_csv(os.path.join(GRAPH_DIR, "mpis--ous_nodes--cats.csv"),
                        mpis_nodes)
        utils.write_csv(os.path.join(GRAPH_DIR, "mpis--cats_nodes.csv"),
                        cat_nodes)
        utils.write_csv(os.path.join(GRAPH_DIR, "mpis--ous_cat_edges.csv"),
                        cat_edges)

        # --- Tags of Institutes of Categories -----------------------------
        cats = utils.read_json(os.path.join(MAPPED_DIR, 'cat_ous.json'))
        tags = utils.read_json(os.path.join(MAPPED_DIR, 'ous_tags.json'))
        t = list(tags.keys())
        t.sort()
        c = list(cats.keys())
        c.sort()
        all_c = []
        all_t = []
        cat_tags = {}
        tags_cat = {}
        for cat in c:
            cat_tags[cat] = []
            for ou_idx in cats[cat]:
                if ou_idx not in all_c:
                    all_c.append(ou_idx)
                ou_tags = tags[ou_idx]
                for ou_tag in ou_tags:
                    if ou_tag not in all_t:
                        all_t.append(ou_tag)
                    if ou_tag not in tags_cat:
                        tags_cat[ou_tag] = [cat]
                    else:
                        if cat not in tags_cat[ou_tag]:
                            tags_cat[ou_tag].append(cat)
                    if ou_tag not in cat_tags[cat]:
                        cat_tags[cat].append(ou_tag)
        all_c.sort()
        # re-key cat_tags by the synthetic category_<i> ids
        ctags = {}
        for i, cat in enumerate(c):
            cat_idx = "category_" + str(i + 1)
            ctags[cat_idx] = cat_tags[cat]
        ct_edge = {}
        for cat in ctags:
            ct_edge[cat] = []
        all_t.sort()
        # NOTE(review): tag ids here are positions in sorted all_t, which is
        # only guaranteed to line up with mpis--tags_nodes.csv if every tag in
        # tags_ous.json also occurs in ous_tags.json — verify upstream data.
        for i, tag in enumerate(all_t):
            tag_idx = "tag_" + str(i + 1)
            for cat in ctags:
                if tag in ctags[cat]:
                    ct_edge[cat].append(tag_idx)
        cat_edges = [["Source", "Target"]]
        for cat in ct_edge:
            for cat_tag in ct_edge[cat]:
                cat_edges.append([cat, cat_tag])
        print("found categories for", len(all_c), "institutes")
        utils.write_csv(os.path.join(GRAPH_DIR, "mpis--cats-tags_edges.csv"),
                        cat_edges)
    finally:
        # restore stdout even if an exception occurred mid-run
        log.close()
        sys.stdout = stdout
# Output directories for title statistics, bucketed by combinations of
# institute (mpi), person (pers), category (cat), language, year and genre.
MPI_LANG_GENRE = os.path.join(TITLES_DIR, 'mpi-lang-genre/')
MPI_LANG_YEARS_GENRE = os.path.join(TITLES_DIR, 'mpi-lang-year-genre/')
PERS_LANG = os.path.join(TITLES_DIR, 'pers-lang/')
PERS_LANG_YEARS = os.path.join(TITLES_DIR, 'pers-lang-year/')
PERS_LANG_GENRE = os.path.join(TITLES_DIR, 'pers-lang-genre/')
PERS_LANG_YEARS_GENRE = os.path.join(TITLES_DIR, 'pers-lang-year-genre/')
CAT_LANG = os.path.join(TITLES_DIR, 'cat-lang/')
CAT_LANG_GENRE = os.path.join(TITLES_DIR, 'cat-lang-genre/')
CAT_LANG_YEARS = os.path.join(TITLES_DIR, 'cat-lang-year/')
CAT_LANG_YEARS_GENRE = os.path.join(TITLES_DIR, 'cat-lang-year-genre/')

# publication years considered (2000..2019 inclusive)
YEARS = list(range(2000, 2020))

langs = utils.read_json(os.path.join(LANG_DIR, 'collection.json'))
cat_ous = utils.read_json(os.path.join(MAPPED_DIR, 'cat_ous.json'))
ous_ctx = utils.read_json(os.path.join(EXTDATA_DIR, 'selected.json'))

# sorted key lists for deterministic iteration order
cats = list(cat_ous.keys())
mpis = list(ous_ctx.keys())
cats.sort()
mpis.sort()


def titles_from_ctx_in_language(ctx_id='ctx_1542176', lang_id='eng',
                                preprocess=True):
    # Collect titles of released items of one context in one language.
    # NOTE(review): the function body appears truncated in this view — it
    # loads the context's released items and their language data, but the
    # use of lang_id/preprocess and the return value are outside this chunk;
    # confirm against the full file.
    total = ld.get_data(ctx_id)[0]
    data_set = DataSet(data_id=ctx_id + "_released",
                       raw=total.get_items_released())
    lang_data = data_set.get_languages_data()
def routine():
    """Match scraped English MPI names against PURE organizational-unit names.

    Exact matches (with a few name-normalization fallbacks) are written to
    ous_fnd_eng.json; heuristic substring candidates to ous_ideas_eng.json;
    names without any candidate to ous_not_fnd_eng.txt. Progress is printed
    to stdout (the caller may have redirected it to a log).
    """
    os.makedirs(MAP_DIR, exist_ok=True)

    ##############################################################
    ### READ FILES CONTAINING ALL MAX PLANCK INSTITUTES (MPIs) ###
    ### AND ORGANIZATIONAL UNITS (OUs) FROM PURE #################
    ##############################################################

    mpis = utils.read_json(os.path.join(SCRAPE_DIR, 'all.json'))
    pure_ous = utils.read_json(os.path.join(OUS_DIR, 'all.json'))

    ############################
    ### EXTRACT NAMES OF OUs ###
    ############################

    # map every OU name (primary and alternative, stripped) to its object id
    names = {}
    for record in pure_ous['records']:
        idx = record['data']['objectId']
        metadata = record['data']['metadata']
        names[metadata['name'].strip()] = idx
        # guard against an alternativeNames list whose first entry is empty
        if 'alternativeNames' in metadata and metadata['alternativeNames'][0]:
            for altname in metadata['alternativeNames']:
                names[altname.strip()] = idx

    m = list(mpis.keys())
    n = list(names.keys())

    #############################
    ### MAP MPIs TO NAMES/OUs ###
    #############################

    not_fnd = []
    fnd = {}
    for mpi in m:
        if mpi in n:
            fnd[mpi] = names[mpi]
        elif mpi.replace('Max Planck Institute', 'MPI') in n:
            fnd[mpi] = names[mpi.replace('Max Planck Institute', 'MPI')]
        elif mpi.split(",")[0] in n:
            fnd[mpi] = names[mpi.split(",")[0]]
            # Max Planck Institute for Software Systems, Kaiserslautern site
            # Max Planck Institute for Software Systems, Saarbrücken site
            # Max Planck Institute for Intelligent Systems, Stuttgart site
            # Max Planck Institute for Intelligent Systems, Tübingen site
        elif mpi.split(" (")[0] in n:
            fnd[mpi] = names[mpi.split(" (")[0]]
            # Max Planck Institute for Gravitational Physics (Hannover)
            # Max Planck Institute for Ornithology (Radolfzell)
            # Max Planck Institute for Plasma Physics (Greifswald)
        elif mpi == 'Research Group Social Neuroscience':
            # part of the Max Planck Institute for Human Cognitive and
            # Brain Sciences — deliberately skipped
            continue
        else:
            # print("no equivalent found for", mpi)
            not_fnd.append(mpi)

    idea = {}

    def _collect_candidates(splitter, part_index):
        # For every unmatched MPI name, split on `splitter` and look for OUs
        # whose name contains the chosen fragment (case-insensitive). Appends
        # candidates to `idea`, avoiding duplicates, preserving order.
        for no_eq in not_fnd:
            parts = no_eq.split(splitter)
            if len(parts) > 1:
                fragment = parts[part_index].strip().lower()
                for ou in n:
                    if fragment in ou.lower():
                        if no_eq in idea:
                            if ou not in idea[no_eq]:
                                idea[no_eq].append(ou)
                        else:
                            idea[no_eq] = [ou]

    # four passes with increasingly loose split heuristics (same order as
    # the candidates accumulate in `idea`)
    _collect_candidates('Max Planck Institute for', 1)
    _collect_candidates('for', 1)
    _collect_candidates('of', 1)
    _collect_candidates(',', 0)

    ###############################
    ### PRINT AND WRITE RESULTS ###
    ###############################

    print("")
    print("found matches for")
    counter = 0
    for mpi in m:
        if mpi not in not_fnd:
            counter += 1
            print(mpi)
    print(str(counter), "in total")
    utils.write_json(os.path.join(MAP_DIR, "ous_fnd_eng.json"), fnd)

    print("")
    print("found possible matches for")
    counter = 0
    for nt_eq in idea:
        counter += 1
        print(nt_eq)
    print(str(counter), "in total")
    utils.write_json(os.path.join(MAP_DIR, "ous_ideas_eng.json"), idea)

    print("")
    print("no match found for")
    counter = 0
    for nt_eq in not_fnd:
        if nt_eq not in idea:
            counter += 1
            print(nt_eq)
    print(str(counter), "in total")
    print("")
    # NOTE(review): JSON content written to a .txt path — kept for
    # compatibility with downstream consumers
    utils.write_json(os.path.join(MAP_DIR, "ous_not_fnd_eng.txt"), not_fnd)
# Flat script: retrieve item data for every publication context.
# NOTE(review): this chunk appears truncated — `ctx_data` is fetched but
# never written within this view; confirm the remainder in the full file.
from tqdm import tqdm
from pybman import utils
from pybman import Client
from pybman.rest import ContextRestController
from .utils_paths import BASE_DIR

PURE_DIR = BASE_DIR + 'pure/'
ITEMS_DIR = BASE_DIR + 'items/'

################################
### RETRIEVE RECORDS OF CTXs ###
################################

ctxs = utils.read_json(PURE_DIR + "ctx/all.json")

# map context object id -> human-readable context name (for progress output)
ctx_meta = {}
for rec in ctxs['records']:
    objectId = rec['data']['objectId']
    ctx_meta[objectId] = rec['data']['name']

# deterministic retrieval order
ctx_ids = list(ctx_meta.keys())
ctx_ids.sort()

client = Client()
for ctx_idx in tqdm(ctx_ids):
    print("retrieve data of context:", ctx_meta[ctx_idx])
    ctx_data = client.get_data(ctx_id=ctx_idx)
# Flat script chunk: post-process the MPI->OU mapping (inverts it to OU->MPI).
# NOTE(review): this chunk appears truncated at both ends — `os`, `sys` and
# `utils` are used without visible imports, and `ous_mpi` is built but not
# yet written; confirm against the full file.
from .utils_paths import SCRAPE_DIR, CTX_DIR, MAP_DIR, MAPPED_DIR, LOG_DIR

if not os.path.exists(LOG_DIR):
    os.makedirs(LOG_DIR)
if not os.path.exists(MAPPED_DIR):
    os.makedirs(MAPPED_DIR)

# redirect console output to the log file
print("console output is redirected to map_post.log ...")
stdout = sys.stdout
log = open(os.path.join(LOG_DIR, "map_post.log"), "w+")
sys.stdout = log

mpis = utils.read_json(os.path.join(SCRAPE_DIR, "all.json"))
print("scraped", len(mpis), "institues!")

mpis_mapped = utils.read_json(os.path.join(MAP_DIR, "mpi_ous.json"))  # ous.json

# invert the mapping: OU id -> institute name
o = list(mpis_mapped.values())
m = list(mpis_mapped.keys())
ous_mpi = {}
for i in range(len(o)):
    ou = o[i]
    name = m[i]
    ous_mpi[ou] = name
# Flat script (older variant of the map_post chunk above, using BASE_DIR
# paths and a log file in the working directory).
# NOTE(review): appears truncated — `ous_mpi` is built but never written and
# stdout is never restored within this view; confirm against the full file.
import sys

from pybman import utils
from .utils_paths import BASE_DIR

MPIS_DIR = BASE_DIR + 'mpis/'

# redirect console output to the log file (created in the CWD here)
print("console output is redirected to map_post.log ...")
stdout = sys.stdout
log = open("map_post.log", "w+")
sys.stdout = log

mpis = utils.read_json(BASE_DIR + 'mpis/scrape/all.json')
print("scraped", len(mpis), "institues!")

mpis_mapped = utils.read_json(BASE_DIR + 'mpis/map/mpi_ous.json')  # ous.json

# invert the mapping: OU id -> institute name
o = list(mpis_mapped.values())
m = list(mpis_mapped.keys())
ous_mpi = {}
for i in range(len(o)):
    ou = o[i]
    name = m[i]
    ous_mpi[ou] = name
# Flat script: merge the English and German MPI->OU match files into
# mpi_ous.json. English matches take precedence — a German entry is only
# added when its OU id has not already been claimed by an English name.
from pybman import utils
from .utils_paths import BASE_DIR

MPIS_DIR = BASE_DIR + 'mpis/'

all_ous = []   # OU ids already mapped
all_fnd = {}   # merged mapping: institute name -> OU id

eng_fnd = utils.read_json(MPIS_DIR + "map/ous_fnd_eng.json")
for eng_ou in eng_fnd:
    all_fnd[eng_ou] = eng_fnd[eng_ou]
    all_ous.append(eng_fnd[eng_ou])

deu_fnd = utils.read_json(MPIS_DIR + "map/ous_fnd_deu.json")
for deu_ou in deu_fnd:
    # skip German names whose OU was already matched via its English name
    if deu_fnd[deu_ou] not in all_ous:
        all_fnd[deu_ou] = deu_fnd[deu_ou]
        all_ous.append(deu_fnd[deu_ou])

utils.write_json(MPIS_DIR + "map/mpi_ous.json", all_fnd)
def routine():
    """Build node/edge CSVs for the subtree of OUs rooted at the MPIs.

    First collects the MPI nodes themselves and their direct children, then
    repeatedly sweeps all OU records (fixed-point iteration) to pull in
    descendants whose parent was added in an earlier pass. Writes the full
    tree, the institute-only nodes, and the children-only nodes to GRAPH_DIR.
    Console output is redirected to graph_ous_mpis.log while running.
    """
    if not os.path.exists(LOG_DIR):
        os.makedirs(LOG_DIR)
    if not os.path.exists(GRAPH_DIR):
        os.makedirs(GRAPH_DIR)
    print("console output is redirected to graph_ous_mpis.log ...")
    stdout = sys.stdout
    log = open(os.path.join(LOG_DIR, "graph_ous_mpis.log"), "w+")
    sys.stdout = log
    mpis = utils.read_json(os.path.join(MAPPED_DIR, "ous_mpi.json"))
    ous = utils.read_json(os.path.join(OUS_DIR, "all.json"))
    ous_nodes = [["Id", "Label"]]
    ous_edges = [["Source", "Target"]]
    children = []       # ids of non-MPI nodes that themselves have children
    ous_collected = []  # ids already emitted as nodes (prevents duplicates)
    # first pass: MPIs themselves, plus OUs whose parent is an MPI (or an
    # already-seen child)
    for rec in ous['records']:
        if rec['data']['objectId'] in mpis:
            objectId = rec['data']['objectId']
            name = utils.clean_string(rec['data']['name'])
            ous_nodes.append([objectId, name])
            ous_collected.append(objectId)
            if 'parentAffiliation' in rec['data']:
                parent = rec['data']['parentAffiliation']['objectId']
                ous_edges.append([objectId, parent])
            else:
                # root institute without a parent OU
                print("no parent:", objectId)
        if rec['data']['objectId'] not in mpis and 'parentAffiliation' in rec[
                'data']:
            if rec['data']['parentAffiliation']['objectId'] in mpis \
                    or rec['data']['parentAffiliation']['objectId'] in children:
                objectId = rec['data']['objectId']
                name = utils.clean_string(rec['data']['name'])
                ous_nodes.append([objectId, name])
                ous_collected.append(objectId)
                parent = rec['data']['parentAffiliation']['objectId']
                ous_edges.append([objectId, parent])
                if rec['data']['hasChildren']:
                    children.append(objectId)
    # fixed-point sweeps: keep re-scanning until no new descendant is found
    # (handles descendants that appear in the records before their parent)
    found = True
    while found:
        changed = False
        for rec in ous['records']:
            if rec['data']['objectId'] not in ous_collected \
                    and 'parentAffiliation' in rec['data']:
                if rec['data']['parentAffiliation']['objectId'] in mpis \
                        or rec['data']['parentAffiliation']['objectId'] in children:
                    objectId = rec['data']['objectId']
                    name = utils.clean_string(rec['data']['name'])
                    ous_nodes.append([objectId, name])
                    ous_collected.append(objectId)
                    changed = True
                    parent = rec['data']['parentAffiliation']['objectId']
                    ous_edges.append([objectId, parent])
                    if rec['data']['hasChildren']:
                        children.append(objectId)
        if not changed:
            found = False
    utils.write_csv(os.path.join(GRAPH_DIR, "mpis--ous_nodes--tree-full.csv"),
                    ous_nodes)
    utils.write_csv(os.path.join(GRAPH_DIR, "mpis--ous_ous_edges--tree.csv"),
                    ous_edges)
    # Institutes
    institutes = [['Id', 'Label']]
    for rec in ous['records']:
        if rec['data']['objectId'] in mpis:
            objectId = rec['data']['objectId']
            name = utils.clean_string(rec['data']['name'])
            institutes.append([objectId, name])
    utils.write_csv(os.path.join(GRAPH_DIR, 'mpis--ous_nodes.csv'), institutes)
    # Children of Institutes: re-read the full tree and keep non-MPI nodes
    kids_names = [["Id", "Label"]]
    mpis_kids_nodes = utils.read_csv_with_header(
        os.path.join(GRAPH_DIR, 'mpis--ous_nodes--tree-full.csv'))
    # presumably a dict of columns: values()[0] = ids, values()[1] = labels
    # — TODO confirm against utils.read_csv_with_header
    mpis_kids_nodes = list(mpis_kids_nodes.values())
    for i in range(1, len(mpis_kids_nodes[0])):
        kid_id = mpis_kids_nodes[0][i]
        kid_name = utils.clean_string(mpis_kids_nodes[1][i])
        if kid_id not in mpis:
            kids_names.append([kid_id, kid_name])
    utils.write_csv(
        os.path.join(GRAPH_DIR, 'mpis--ous_nodes--tree-children.csv'),
        kids_names)
    log.close()
    sys.stdout = stdout
def routine():
    """Count released publications per CoNE author, per institute.

    For every institute that has publication contexts, tallies how many
    released records each CoNE-registered author contributed and writes a
    tab-separated <mpi>_pers_pub.csv (authors, publications; sorted by count
    descending) into STATS_DIR/persons. Console output is redirected to
    count_persons.log while running.
    """
    os.makedirs(LOG_DIR, exist_ok=True)
    print("console output is redirected to count_persons.log ...")
    stdout = sys.stdout
    log = open(os.path.join(LOG_DIR, "count_persons.log"), "w+")
    sys.stdout = log
    try:
        # deferred import: loading local data has side effects best kept
        # behind the log redirection
        from ..utils.local import ld

        pers_stats = os.path.join(STATS_DIR, 'persons')
        os.makedirs(pers_stats, exist_ok=True)

        ous_ctx = utils.read_json(os.path.join(EXTDATA_DIR, 'selected.json'))
        mpis = utils.read_json(os.path.join(MAPPED_DIR, 'ous_mpi.json'))

        print("start processing!")
        start_time = time.time()
        for mpi in mpis:
            if mpi not in ous_ctx:
                print(mpis[mpi] + " has no contexts!")
                print("")
                continue
            print("processing " + mpis[mpi] + "...")
            stats = {}  # author id -> number of attributed records
            for mpi_ctx in ous_ctx[mpi]:
                print("extracting " + mpi_ctx + " ...")
                # renamed from `all` (shadowed the builtin)
                ctx_data = ld.get_data(mpi_ctx)[0]
                # consider only released items
                data_set = DataSet(data_id=ctx_data.idx + "_released",
                                   raw=ctx_data.get_items_released())
                if not data_set.records:
                    print(mpi_ctx + " has no records!")
                    continue
                authors = data_set.get_creators_data(
                )  # only CoNE related authors!
                a = list(authors.keys())
                a.sort()
                print(str(len(a)) + " CoNE persons to process ...")
                records = 0
                for i in a:
                    if i in stats:
                        stats[i] += len(authors[i])
                    else:
                        stats[i] = len(authors[i])
                    records += len(authors[i])
                print("... with " + str(records) + " attributed records!")
            if not stats:
                continue
            # most-published authors first
            stats = sorted(stats.items(), key=lambda x: x[1], reverse=True)
            path = os.path.join(pers_stats, mpi + '_pers_pub.csv')
            print("write stats to file: " + path)
            with open(path, 'w', newline='') as csv_file:
                csv_writer = csv.writer(
                    csv_file, delimiter='\t', quotechar='',
                    quoting=csv.QUOTE_NONE)  # , quoting=csv.QUOTE_MINIMAL
                csv_writer.writerow(['authors', 'publications'])
                for author_id, num_pub in stats:
                    csv_writer.writerow([author_id, num_pub])
            print("finished " + mpis[mpi] + "!")
            print("")
        print("finished processing after %s sec!"
              % round(time.time() - start_time, 2))
    finally:
        # restore stdout even if an exception occurred mid-run
        log.close()
        sys.stdout = stdout
def routine():
    """Count journal articles per journal, per institute.

    For every institute that has publication contexts, collects its released
    ARTICLE records, tallies articles per source journal title, and writes a
    tab-separated <mpi>_jour_art.csv (journals, articles; sorted by count
    descending) into STATS_DIR/journals. Console output is redirected to
    count_journals.log while running.
    """
    os.makedirs(LOG_DIR, exist_ok=True)
    print("console output is redirected to count_journals.log ...")
    stdout = sys.stdout
    log = open(os.path.join(LOG_DIR, "count_journals.log"), "w+")
    sys.stdout = log
    try:
        # deferred import: loading local data has side effects best kept
        # behind the log redirection
        from ..utils.local import ld

        jour_stats = os.path.join(STATS_DIR, 'journals')
        os.makedirs(jour_stats, exist_ok=True)

        ous_ctx = utils.read_json(os.path.join(EXTDATA_DIR, 'selected.json'))
        mpis = utils.read_json(os.path.join(MAPPED_DIR, 'ous_mpi.json'))

        print("start processing!")
        start_time = time.time()
        for mpi in mpis:
            if mpi not in ous_ctx:
                print(mpis[mpi] + " has no contexts!")
                print("")
                continue
            print("processing " + mpis[mpi] + "...")
            articles = []
            journals = {}  # journal title -> article count
            counter = 0    # articles attributed to some journal
            nojour = 0     # articles with sources but no journal source
            for mpi_ctx in ous_ctx[mpi]:
                print("extracting " + mpi_ctx + " ...")
                # renamed from `all` (shadowed the builtin)
                ctx_data = ld.get_data(mpi_ctx)[0]
                # consider only released items
                data_set = DataSet(data_id=ctx_data.idx + "_released",
                                   raw=ctx_data.get_items_released())
                if not data_set.records:
                    print(mpi_ctx + " has no records!")
                    continue
                print(str(data_set.num) + " records to process...")
                for record in data_set.records:
                    data = record['data']
                    if data['publicState'] == 'RELEASED':
                        if data['metadata']['genre'] == 'ARTICLE':
                            articles.append(record)
            for article in articles:
                jour = False
                if 'sources' in article['data']['metadata']:
                    for source in article['data']['metadata']['sources']:
                        if source['genre'] == 'JOURNAL':
                            if 'title' in source:
                                jour = True
                                counter += 1
                                if source['title'] in journals:
                                    journals[source['title']] += 1
                                else:
                                    journals[source['title']] = 1
                            else:
                                print(article['data']['objectId'] +
                                      " has journal as source without title!")
                                continue
                        # stop at the first journal source with a title
                        if jour:
                            break
                    if not jour:
                        nojour += 1
                else:
                    print("found article " + article['data']['objectId'] +
                          " without any source!")
            print('found ' + str(counter) + ' articles with journals as source')
            print('found ' + str(nojour) + ' articles without a journal as souce')
            # most-used journals first
            journals = sorted(journals.items(), key=lambda x: x[1],
                              reverse=True)
            path = os.path.join(jour_stats, mpi + '_jour_art.csv')
            print("write stats to file: " + path)
            with open(path, 'w', newline='') as csv_file:
                # quoting=csv.QUOTE_NONE
                csv_writer = csv.writer(
                    csv_file, delimiter='\t', quotechar='"',
                    quoting=csv.QUOTE_MINIMAL)
                csv_writer.writerow(['journals', 'articles'])
                for jour_title, art in journals:
                    # normalize titles so they survive the TSV format
                    jour_title = jour_title.replace('\t', ' ')
                    jour_title = jour_title.replace(',', '')
                    jour_title = utils.clean_string(jour_title)
                    csv_writer.writerow([jour_title, art])
            print("finished " + mpis[mpi] + "!")
            print("")
        print("finished processing after %s sec!"
              % round(time.time() - start_time, 2))
    finally:
        # restore stdout even if an exception occurred mid-run
        log.close()
        sys.stdout = stdout