Example #1
def routine():

    if not os.path.exists(LOG_DIR):
        os.makedirs(LOG_DIR)

    if not os.path.exists(GRAPH_DIR):
        os.makedirs(GRAPH_DIR)

    print("console output is redirected to graph_contexts_mpis.log ...")

    stdout = sys.stdout

    log = open(os.path.join(LOG_DIR, "graph_contexts_mpis.log"), "w+")
    sys.stdout = log

    ctxs = utils.read_json(os.path.join(CTX_DIR, "all.json"))
    ous = utils.read_json(os.path.join(OUS_DIR, "all.json"))
    mpis = utils.read_json(os.path.join(MAPPED_DIR, "ous_ctx.json"))

    institutes = [['Id', 'Label']]
    contexts = [['Id', 'Label', 'Created']]

    for rec in ous['records']:
        if rec['data']['objectId'] in mpis:
            objectId = rec['data']['objectId']
            name = utils.clean_string(rec['data']['name'])
            institutes.append([objectId, name])

    utils.write_csv(os.path.join(GRAPH_DIR, 'mpis--ous_nodes--ctx.csv'),
                    institutes)

    institutes_contexts = [['Source', 'Target']]
    mpis_ctx = []

    for mpi in mpis:
        for context in mpis[mpi]:
            institutes_contexts.append([mpi, context])
            mpis_ctx.append(context)

    utils.write_csv(os.path.join(GRAPH_DIR, 'mpis--ous_ctx_edges.csv'),
                    institutes_contexts)

    for rec in ctxs['records']:
        objectId = rec['data']['objectId']
        if objectId in mpis_ctx:
            name = rec['data']['name'].replace('"', '')
            created = rec['data']['creationDate'].split("-")[0]
            contexts.append([objectId, name, created])

    utils.write_csv(os.path.join(GRAPH_DIR, 'mpis--ctx_nodes--ous.csv'),
                    contexts)

    log.close()
    sys.stdout = stdout
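
Every routine in this set swaps sys.stdout by hand and restores it only on the last line, so an exception mid-routine leaves all later console output going into the log file. A sketch of the same redirection using contextlib.redirect_stdout, which restores the stream even when the block raises (LOG_DIR as imported from utils_paths in these modules):

import contextlib
import os

def logged_routine():
    os.makedirs(LOG_DIR, exist_ok=True)
    log_path = os.path.join(LOG_DIR, "graph_contexts_mpis.log")
    print("console output is redirected to", log_path, "...")
    with open(log_path, "w") as log, contextlib.redirect_stdout(log):
        # every print() in this block goes to the log file; stdout is
        # restored automatically, even if an exception is raised
        print("building graph files ...")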
Example #2
def routine():

    if not os.path.exists(LOG_DIR):
        os.makedirs(LOG_DIR)

    if not os.path.exists(GRAPH_DIR):
        os.makedirs(GRAPH_DIR)

    print("console output is redirected to graph_ous.log ...")

    stdout = sys.stdout

    log = open(os.path.join(LOG_DIR, "graph_ous.log"), "w+")
    sys.stdout = log

    ous = utils.read_json(os.path.join(OUS_DIR, "all.json"))

    org_nodes = [['Id', 'Label']]
    org_edges = [['Source', 'Target']]

    for record in ous['records']:
        org_unit_id = record['data']['objectId']
        org_unit_name = utils.clean_string(record['data']['name'])
        org_nodes.append([org_unit_id, org_unit_name])
        if 'parentAffiliation' in record['data']:
            parent = record['data']['parentAffiliation']['objectId']
            org_edges.append([org_unit_id, parent])

    utils.write_csv(os.path.join(GRAPH_DIR, 'pure--ous_nodes.csv'), org_nodes)
    utils.write_csv(os.path.join(GRAPH_DIR, 'pure--ous_ous_edges.csv'),
                    org_edges)

    log.close()
    sys.stdout = stdout
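
All of these examples lean on pybman's utils helpers for I/O. Their actual implementation is not shown here; minimal stand-ins with the same call shape (a path plus, for write_csv, a list of rows with the header row first) could look like this. These are sketches for reading the examples, not pybman's real code:

import csv
import json

def read_json(path):
    with open(path, encoding='utf-8') as f:
        return json.load(f)

def write_csv(path, rows):
    # rows like [['Id', 'Label'], ['ou_123', 'Some Institute']]
    with open(path, 'w', newline='', encoding='utf-8') as f:
        csv.writer(f).writerows(rows)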
Example #3
def routine():

    if not os.path.exists(ITEMS_DIR):
        os.makedirs(ITEMS_DIR)

    ################################
    ### RETRIEVE RECORDS OF CTXs ###
    ################################

    ctxs = utils.read_json(os.path.join(CTX_DIR, "all.json"))

    ctx_meta = {}

    for rec in ctxs['records']:
        objectId = rec['data']['objectId']
        ctx_meta[objectId] = rec['data']['name']

    ctx_ids = list(ctx_meta.keys())
    ctx_ids.sort()

    client = Client()

    for ctx_idx in tqdm(ctx_ids):
        print("retrieve data of context:", ctx_meta[ctx_idx])
        ctx_data = client.get_data(ctx_id=ctx_idx)
        utils.write_json(os.path.join(ITEMS_DIR, ctx_idx+".json"),
                         ctx_data.collection)
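
The two-step exists()/makedirs() guard that opens nearly every routine can be collapsed: since Python 3.2, os.makedirs accepts exist_ok, which is also safe if the directory appears between the check and the creation:

import os

# one call replaces the exists()/makedirs() pair
os.makedirs(ITEMS_DIR, exist_ok=True)  # ITEMS_DIR as defined in the surrounding module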
Example #4
def routine():

    all_ous = []
    all_fnd = {}

    eng_fnd = utils.read_json(os.path.join(MAP_DIR, "ous_fnd_eng.json"))

    for eng_ou in eng_fnd:
        all_fnd[eng_ou] = eng_fnd[eng_ou]
        all_ous.append(eng_fnd[eng_ou])

    deu_fnd = utils.read_json(os.path.join(MAP_DIR, "ous_fnd_deu.json"))
    # deu_ous = list(deu_fnd.values())

    for deu_ou in deu_fnd:
        if deu_fnd[deu_ou] not in all_ous:
            all_fnd[deu_ou] = deu_fnd[deu_ou]
            all_ous.append(deu_fnd[deu_ou])

    utils.write_json(os.path.join(MAP_DIR, "mpi_ous.json"), all_fnd)
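
Example #4's two loops implement a value-level dedup: keep every English mapping, then add German names only for OU ids not yet covered. The same logic as a small helper (a sketch, assuming the same name -> OU id dicts):

def merge_mappings(eng_fnd, deu_fnd):
    # English entries win: a German name is added only when its
    # OU id was not already claimed under an English name
    merged = dict(eng_fnd)
    seen = set(eng_fnd.values())
    for name, ou_id in deu_fnd.items():
        if ou_id not in seen:
            merged[name] = ou_id
            seen.add(ou_id)
    return merged

# all_fnd = merge_mappings(eng_fnd, deu_fnd)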
Example #5
def routine():

    if not os.path.exists(LOG_DIR):
        os.makedirs(LOG_DIR)

    if not os.path.exists(GRAPH_DIR):
        os.makedirs(GRAPH_DIR)

    stdout = sys.stdout

    print("console output is redirected to graph_languages.log ...")

    log = open(os.path.join(LOG_DIR, "graph_languages.log"), "w+")
    sys.stdout = log

    languages_raw = utils.read_json(os.path.join(LANG_DIR, 'collection.json'))

    languages = [['Id', 'Label', 'Coordinates']]

    dc_title = 'http_purl_org_dc_elements_1_1_title'
    # dc_idx = 'http_purl_org_dc_elements_1_1_identifier'
    google_coordinates = 'http_earth_google_com_kml_2_1_coordinates'

    for lang in languages_raw:
        name = ''
        if dc_title in languages_raw[lang]:
            name = languages_raw[lang][dc_title]
            if isinstance(name, list):
                name = name[0]
        else:
            print("no name found for language", lang)
        coordinates = ''
        if google_coordinates in languages_raw[lang]:
            coordinates = languages_raw[lang][google_coordinates]
        languages.append([lang, name, coordinates])

    utils.write_csv(os.path.join(GRAPH_DIR, 'pure--lang_nodes.csv'), languages)

    log.close()
    sys.stdout = stdout
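
The title lookup in Example #5 has to cope with metadata values that arrive either as a bare string or as a list of strings. A small normaliser keeps that branching out of the loop (a sketch; field shapes as in the collection.json above):

def first_or_self(value, default=''):
    # some JSON fields hold a single string, others a list of
    # strings; return the first list element or the value itself
    if isinstance(value, list):
        return value[0] if value else default
    return value if value is not None else default

# name = first_or_self(languages_raw[lang].get(dc_title, ''))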
Example #6
def routine():

    if not os.path.exists(LOG_DIR):
        os.makedirs(LOG_DIR)

    if not os.path.exists(GRAPH_DIR):
        os.makedirs(GRAPH_DIR)

    print("console output is redirected to graph_contexts.log ...")

    stdout = sys.stdout

    log = open(os.path.join(LOG_DIR, "graph_contexts.log"), "w+")
    sys.stdout = log

    ctxs = utils.read_json(os.path.join(CTX_DIR, "all.json"))

    ctx_nodes = [["Id", "Label", "Created"]]
    ctx_edges = [["Source", "Target"]]

    for rec in ctxs['records']:
        objectId = rec['data']['objectId']
        name = rec['data']['name']
        created = rec['data']['creationDate'].split("-")[0]
        ctx_nodes.append([objectId, name, created])
        maintainers = rec['data']['responsibleAffiliations']
        for m in maintainers:
            maintainer = m['objectId']
            ctx_edges.append([objectId, maintainer])

    utils.write_csv(os.path.join(GRAPH_DIR, "pure--ctx_nodes.csv"), ctx_nodes)
    utils.write_csv(os.path.join(GRAPH_DIR, "pure--ctx_ous_edges.csv"),
                    ctx_edges)

    log.close()
    sys.stdout = stdout
Example #7
from pybman import utils

from .utils_paths import BASE_DIR

PURE_DIR = BASE_DIR + 'pure/'
MPIS_DIR = BASE_DIR + 'mpis/'

##############################################################
### READ FILES CONTAINING ALL MAX PLANCK INSTITUTES (MPIs) ###
### AND ORGANIZATIONAL UNITS (OUs) FROM PURE #################
##############################################################

mpis = utils.read_json(MPIS_DIR + 'scrape/all_deu.json')
pure_ous = utils.read_json(PURE_DIR + 'ous/all.json')

############################
### EXTRACT NAMES OF OUs ###
############################

names = {}

for record in pure_ous['records']:
    idx = record['data']['objectId']
    metadata = record['data']['metadata']
    name = metadata['name'].strip()
    names[name] = idx
    if 'alternativeNames' in metadata and metadata['alternativeNames'][0]:
        for altname in metadata['alternativeNames']:
            names[altname.strip()] = idx

m = list(mpis.keys())
Example #8
def routine():

    if not os.path.exists(MAP_DIR):
        os.makedirs(MAP_DIR)

    ######################################################################
    ### READ FILES CONTAINING METADATA ON MAX PLANCK INSTITUTES (MPIs) ###
    ### AND ORGANIZATIONAL UNITS (OUs) FROM PURE #########################
    ######################################################################

    mpis = utils.read_json(os.path.join(SCRAPE_DIR, 'all_deu.json'))
    pure_ous = utils.read_json(os.path.join(OUS_DIR, 'all.json'))

    ############################
    ### EXTRACT NAMES OF OUs ###
    ############################

    names = {}

    for record in pure_ous['records']:
        idx = record['data']['objectId']
        metadata = record['data']['metadata']
        name = metadata['name'].strip()
        names[name] = idx
        if 'alternativeNames' in metadata and metadata['alternativeNames'][0]:
            for altname in metadata['alternativeNames']:
                names[altname.strip()] = idx

    m = list(mpis.keys())
    n = list(names.keys())

    #############################
    ### MAP MPIs TO NAMES/OUs ###
    #############################

    not_fnd = []
    fnd = {}

    for mpi in m:
        if mpi in n:
            fnd[mpi] = names[mpi]
        elif mpi.replace('-', ' ') in n:
            fnd[mpi] = names[mpi.replace('-', ' ')]
        else:
            # print("no equivalent found for", mpi)
            not_fnd.append(mpi)

    idea = {}

    for no_eq in not_fnd:
        parts = no_eq.split('Max-Planck-Institut für')
        if len(parts) > 1:
            for ou in n:
                if parts[1].strip().lower() in ou.lower():
                    if no_eq in idea:
                        if ou not in idea[no_eq]:
                            idea[no_eq].append(ou)
                    else:
                        idea[no_eq] = [ou]

    for no_eq in not_fnd:
        parts = no_eq.split('für')
        if len(parts) > 1:
            for ou in n:
                if parts[1].strip().lower() in ou.lower():
                    if no_eq in idea:
                        if ou not in idea[no_eq]:
                            idea[no_eq].append(ou)
                    else:
                        idea[no_eq] = [ou]

    for no_eq in not_fnd:
        parts = no_eq.split(',')
        if len(parts) > 1:
            for ou in n:
                if parts[0].strip().lower() in ou.lower():
                    if no_eq in idea:
                        if ou not in idea[no_eq]:
                            idea[no_eq].append(ou)
                    else:
                        idea[no_eq] = [ou]

    ###############################
    ### PRINT AND WRITE RESULTS ###
    ###############################

    print("")
    print("found matches for")
    counter = 0
    for mpi in m:
        if mpi not in not_fnd:
            counter += 1
            print(mpi)

    print(str(counter), "in total")
    utils.write_json(os.path.join(MAP_DIR, "ous_fnd_deu.json"), fnd)

    print("")
    print("found possible matches for")
    counter = 0
    for nt_eq in idea:
        counter += 1
        print(nt_eq)

    print(str(counter), "in total")
    utils.write_json(os.path.join(MAP_DIR, "ous_ideas_deu.json"), idea)

    print("")
    print("no match found for")
    counter = 0
    for nt_eq in not_fnd:
        if nt_eq not in idea:
            counter += 1
            print(nt_eq)

    print(str(counter), "in total")
    print("")
    utils.write_json(os.path.join(MAP_DIR, "ous_not_fnd_deu.txt"), not_fnd)
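
The three candidate-matching loops above differ only in the split phrase and which half of the split is matched (Example #11 repeats the same shape with four phrases). A sketch factoring them into one helper, with names chosen here for illustration:

def collect_ideas(not_fnd, ou_names, splits):
    # splits is a list of (phrase, part) pairs: split each unmatched
    # name on the phrase and substring-match that part of the split
    # against the known OU names, case-insensitively
    idea = {}
    for no_eq in not_fnd:
        for phrase, part in splits:
            parts = no_eq.split(phrase)
            if len(parts) < 2:
                continue
            needle = parts[part].strip().lower()
            for ou in ou_names:
                if needle in ou.lower():
                    if ou not in idea.setdefault(no_eq, []):
                        idea[no_eq].append(ou)
    return idea

# the three loops above collapse to:
# idea = collect_ideas(not_fnd, n,
#                      [('Max-Planck-Institut für', 1), ('für', 1), (',', 0)])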
Example #9
def routine():

    if not os.path.exists(LOG_DIR):
        os.makedirs(LOG_DIR)

    if not os.path.exists(GRAPH_DIR):
        os.makedirs(GRAPH_DIR)

    stdout = sys.stdout

    print("console output is redirected to graph_descriptor.log ...")

    log = open(os.path.join(LOG_DIR, "graph_description.log"), "w+")
    sys.stdout = log

    # Tags of Institutes

    ous_tags = utils.read_json(os.path.join(MAPPED_DIR, 'ous_tags.json'))
    tags = list(utils.read_json(os.path.join(MAPPED_DIR, 'tags_ous.json')).keys())

    tag_nodes = [["Id", "Label"]]
    tags.sort()

    for i, t in enumerate(tags):
        tag_id = 'tag_' + str(i + 1)
        tag_nodes.append([tag_id, t])

    utils.write_csv(os.path.join(GRAPH_DIR, "mpis--tags_nodes.csv"), tag_nodes)

    mpis_tags = [['Source', 'Target']]

    print("try to find tags for", len(ous_tags), "institutes")

    for mpi in ous_tags:
        mpi_tags = ous_tags[mpi]
        for tag in mpi_tags:
            tag_id = tags.index(tag) + 1
            tag_id = 'tag_' + str(tag_id)
            mpis_tags.append([mpi, tag_id])

    print("found", len(mpis_tags) - 1, "edges from",
          len(ous_tags), "institutes to",
          len(tag_nodes) - 1, "tags")

    utils.write_csv(os.path.join(
        GRAPH_DIR, 'mpis--ous_tags_edges.csv'), mpis_tags)

    # Categories of Institutes

    mpis = utils.read_json(os.path.join(MAPPED_DIR, 'ous_mpi.json'))
    cats = utils.read_json(os.path.join(MAPPED_DIR, 'cat_ous.json'))

    cat_nodes = [["Id", "Label"]]
    cat_edges = [["Source", "Target"]]

    mpis_nodes = [["Id", "Label"]]

    all_mpis = []
    all_cats = list(cats.keys())
    all_cats.sort()

    print("try to find categories for", len(mpis), "institutes")

    for i, category in enumerate(all_cats):
        cat_idx = "category_" + str(i + 1)
        cat_nodes.append([cat_idx, category])
        ous_idx = cats[category]
        for ou_idx in ous_idx:
            if ou_idx not in all_mpis:
                all_mpis.append(ou_idx)
                mpis_nodes.append([ou_idx, mpis[ou_idx]])
            cat_edges.append([ou_idx, cat_idx])

    print("found", len(cat_edges) - 1, "edges from",
          len(all_mpis), "institutes to",
          len(all_cats), "categories")

    utils.write_csv(os.path.join(
        GRAPH_DIR, "mpis--ous_nodes--cats.csv"), mpis_nodes)
    utils.write_csv(os.path.join(GRAPH_DIR, "mpis--cats_nodes.csv"), cat_nodes)
    utils.write_csv(os.path.join(
        GRAPH_DIR, "mpis--ous_cat_edges.csv"), cat_edges)

    # Tags of Institutes of Categories

    cats = utils.read_json(os.path.join(MAPPED_DIR, 'cat_ous.json'))
    tags = utils.read_json(os.path.join(MAPPED_DIR, 'ous_tags.json'))

    t = list(tags.keys())
    t.sort()

    c = list(cats.keys())
    c.sort()

    all_c = []
    all_t = []

    cat_tags = {}
    tags_cat = {}

    for cat in c:
        cat_tags[cat] = []
        for ou_idx in cats[cat]:
            if ou_idx not in all_c:
                all_c.append(ou_idx)
            ou_tags = tags[ou_idx]
            for ou_tag in ou_tags:
                if ou_tag not in all_t:
                    all_t.append(ou_tag)
                if ou_tag not in tags_cat:
                    tags_cat[ou_tag] = [cat]
                else:
                    if cat not in tags_cat[ou_tag]:
                        tags_cat[ou_tag].append(cat)
                if ou_tag not in cat_tags[cat]:
                    cat_tags[cat].append(ou_tag)

    all_c.sort()

    ctags = {}

    for i, cat in enumerate(c):
        cat_idx = "category_" + str(i + 1)
        ctags[cat_idx] = cat_tags[cat]

    ct_edge = {}

    for cat in ctags:
        ct_edge[cat] = []

    all_t.sort()

    for i, tag in enumerate(all_t):
        tag_idx = "tag_" + str(i + 1)
        for cat in ctags:
            if tag in ctags[cat]:
                ct_edge[cat].append(tag_idx)
            else:
                continue

    cat_edges = [["Source", "Target"]]

    for cat in ct_edge:
        tags = ct_edge[cat]
        for cat_tag in tags:
            cat_edges.append([cat, cat_tag])

    print("found categories for", len(all_c), "institutes")

    utils.write_csv(os.path.join(
        GRAPH_DIR, "mpis--cats-tags_edges.csv"), cat_edges)

    log.close()
    sys.stdout = stdout
Example #10
MPI_LANG_GENRE = os.path.join(TITLES_DIR, 'mpi-lang-genre/')
MPI_LANG_YEARS_GENRE = os.path.join(TITLES_DIR, 'mpi-lang-year-genre/')

PERS_LANG = os.path.join(TITLES_DIR, 'pers-lang/')
PERS_LANG_YEARS = os.path.join(TITLES_DIR, 'pers-lang-year/')
PERS_LANG_GENRE = os.path.join(TITLES_DIR, 'pers-lang-genre/')
PERS_LANG_YEARS_GENRE = os.path.join(TITLES_DIR, 'pers-lang-year-genre/')

CAT_LANG = os.path.join(TITLES_DIR, 'cat-lang/')
CAT_LANG_GENRE = os.path.join(TITLES_DIR, 'cat-lang-genre/')
CAT_LANG_YEARS = os.path.join(TITLES_DIR, 'cat-lang-year/')
CAT_LANG_YEARS_GENRE = os.path.join(TITLES_DIR, 'cat-lang-year-genre/')

YEARS = list(range(2000, 2020))

langs = utils.read_json(os.path.join(LANG_DIR, 'collection.json'))
cat_ous = utils.read_json(os.path.join(MAPPED_DIR, 'cat_ous.json'))
ous_ctx = utils.read_json(os.path.join(EXTDATA_DIR, 'selected.json'))
cats = list(cat_ous.keys())
mpis = list(ous_ctx.keys())
cats.sort()
mpis.sort()


def titles_from_ctx_in_language(ctx_id='ctx_1542176',
                                lang_id='eng',
                                preprocess=True):
    total = ld.get_data(ctx_id)[0]
    data_set = DataSet(data_id=ctx_id + "_released",
                       raw=total.get_items_released())
    lang_data = data_set.get_languages_data()
Example #11
def routine():

    if not os.path.exists(MAP_DIR):
        os.makedirs(MAP_DIR)

    ##############################################################
    ### READ FILES CONTAINING ALL MAX PLANCK INSTITUTES (MPIs) ###
    ### AND ORGANIZATIONAL UNITS (OUs) FROM PURE #################
    ##############################################################

    mpis = utils.read_json(os.path.join(SCRAPE_DIR, 'all.json'))
    pure_ous = utils.read_json(os.path.join(OUS_DIR, 'all.json'))

    ############################
    ### EXTRACT NAMES OF OUs ###
    ############################

    names = {}

    for record in pure_ous['records']:
        idx = record['data']['objectId']
        metadata = record['data']['metadata']
        name = metadata['name'].strip()
        names[name] = idx
        if 'alternativeNames' in metadata and metadata['alternativeNames'][0]:
            for altname in metadata['alternativeNames']:
                names[altname.strip()] = idx

    m = list(mpis.keys())
    n = list(names.keys())

    #############################
    ### MAP MPIs TO NAMES/OUs ###
    #############################

    not_fnd = []
    fnd = {}

    for mpi in m:
        if mpi in n:
            fnd[mpi] = names[mpi]
        elif mpi.replace('Max Planck Institute', 'MPI') in n:
            fnd[mpi] = names[mpi.replace('Max Planck Institute', 'MPI')]
        elif mpi.split(",")[0] in n:
            fnd[mpi] = names[mpi.split(",")[0]]
            # Max Planck Institute for Software Systems, Kaiserslautern site
            # Max Planck Institute for Software Systems, Saarbrücken site
            # Max Planck Institute for Intelligent Systems, Stuttgart site
            # Max Planck Institute for Intelligent Systems, Tübingen site
        elif mpi.split(" (")[0] in n:
            fnd[mpi] = names[mpi.split(" (")[0]]
            # Max Planck Institute for Gravitational Physics (Hannover)
            # Max Planck Institute for Ornithology (Radolfzell)
            # Max Planck Institute for Plasma Physics (Greifswald)
        elif mpi == 'Research Group Social Neuroscience':
            # part of the Max Planck Institute for Human Cognitive and Brain Sciences
            continue
        else:
            # print("no equivalent found for", mpi)
            not_fnd.append(mpi)

    idea = {}

    for no_eq in not_fnd:
        parts = no_eq.split('Max Planck Institute for')
        if len(parts) > 1:
            for ou in n:
                if parts[1].strip().lower() in ou.lower():
                    if no_eq in idea:
                        if ou not in idea[no_eq]:
                            idea[no_eq].append(ou)
                    else:
                        idea[no_eq] = [ou]

    for no_eq in not_fnd:
        parts = no_eq.split('for')
        if len(parts) > 1:
            for ou in n:
                if parts[1].strip().lower() in ou.lower():
                    if no_eq in idea:
                        if ou not in idea[no_eq]:
                            idea[no_eq].append(ou)
                    else:
                        idea[no_eq] = [ou]

    for no_eq in not_fnd:
        parts = no_eq.split('of')
        if len(parts) > 1:
            for ou in n:
                if parts[1].strip().lower() in ou.lower():
                    if no_eq in idea:
                        if ou not in idea[no_eq]:
                            idea[no_eq].append(ou)
                    else:
                        idea[no_eq] = [ou]

    for no_eq in not_fnd:
        parts = no_eq.split(',')
        if len(parts) > 1:
            for ou in n:
                if parts[0].strip().lower() in ou.lower():
                    if no_eq in idea:
                        if ou not in idea[no_eq]:
                            idea[no_eq].append(ou)
                    else:
                        idea[no_eq] = [ou]

    ###############################
    ### PRINT AND WRITE RESULTS ###
    ###############################

    print("")
    print("found matches for")
    counter = 0
    for mpi in m:
        if mpi not in not_fnd:
            counter += 1
            print(mpi)

    print(str(counter),"in total")
    utils.write_json(os.path.join(MAP_DIR, "ous_fnd_eng.json"), fnd)

    print("")
    print("found possible matches for")
    counter = 0
    for nt_eq in idea:
        counter += 1
        print(nt_eq)

    print(str(counter),"in total")
    utils.write_json(os.path.join(MAP_DIR, "ous_ideas_eng.json"), idea)

    print("")
    print("no match found for")
    counter = 0
    for nt_eq in not_fnd:
        if nt_eq not in idea:
            counter += 1
            print(nt_eq)

    print(str(counter),"in total")
    print("")
    utils.write_json(os.path.join(MAP_DIR, "ous_not_fnd_eng.txt"), not_fnd)
Example #12
from tqdm import tqdm

from pybman import utils
from pybman import Client
from pybman.rest import ContextRestController

from .utils_paths import BASE_DIR

PURE_DIR = BASE_DIR + 'pure/'
ITEMS_DIR = BASE_DIR + 'items/'

################################
### RETRIEVE RECORDS OF CTXs ###
################################

ctxs = utils.read_json(PURE_DIR + "ctx/all.json")

ctx_meta = {}

for rec in ctxs['records']:
    objectId = rec['data']['objectId']
    ctx_meta[objectId] = rec['data']['name']

ctx_ids = list(ctx_meta.keys())
ctx_ids.sort()

client = Client()

for ctx_idx in tqdm(ctx_ids):
    print("retrieve data of context:", ctx_meta[ctx_idx])
    ctx_data = client.get_data(ctx_id=ctx_idx)
Example #13
import os
import sys

from pybman import utils

from .utils_paths import SCRAPE_DIR, CTX_DIR, MAP_DIR, MAPPED_DIR, LOG_DIR

if not os.path.exists(LOG_DIR):
    os.makedirs(LOG_DIR)

if not os.path.exists(MAPPED_DIR):
    os.makedirs(MAPPED_DIR)

print("console output is redirected to map_post.log ...")

stdout = sys.stdout

log = open(os.path.join(LOG_DIR, "map_post.log"), "w+")
sys.stdout = log

mpis = utils.read_json(os.path.join(SCRAPE_DIR, "all.json"))

print("scraped", len(mpis), "institues!")

mpis_mapped = utils.read_json(os.path.join(MAP_DIR,
                                           "mpi_ous.json"))  # ous.json

o = list(mpis_mapped.values())
m = list(mpis_mapped.keys())

ous_mpi = {}

for i in range(len(o)):
    ou = o[i]
    name = m[i]
    ous_mpi[ou] = name
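
The parallel-list walk that builds ous_mpi simply inverts the name -> OU id mapping; a dict comprehension expresses the same thing directly (like the loop, a duplicated OU id keeps the last name seen):

ous_mpi = {ou: name for name, ou in mpis_mapped.items()}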
Example #14
import sys

from pybman import utils

from .utils_paths import BASE_DIR

MPIS_DIR = BASE_DIR + 'mpis/'

print("console output is redirected to map_post.log ...")

stdout = sys.stdout

log = open("map_post.log", "w+")
sys.stdout = log

mpis = utils.read_json(BASE_DIR + 'mpis/scrape/all.json')

print("scraped",len(mpis),"institues!")

mpis_mapped = utils.read_json(BASE_DIR + 'mpis/map/mpi_ous.json') # ous.json

o = list(mpis_mapped.values())
m = list(mpis_mapped.keys())

ous_mpi = {}

for i in range(len(o)):
    ou = o[i]
    name = m[i]
    ous_mpi[ou] = name
Example #15
from pybman import utils

from .utils_paths import BASE_DIR

MPIS_DIR = BASE_DIR + 'mpis/'

all_ous = []
all_fnd = {}

eng_fnd = utils.read_json(MPIS_DIR + "map/ous_fnd_eng.json")

for eng_ou in eng_fnd:
    all_fnd[eng_ou] = eng_fnd[eng_ou]
    all_ous.append(eng_fnd[eng_ou])

deu_fnd = utils.read_json(MPIS_DIR + "map/ous_fnd_deu.json")
deu_ous = list(deu_fnd.values())

for deu_ou in deu_fnd:
    if deu_fnd[deu_ou] not in all_ous:
        all_fnd[deu_ou] = deu_fnd[deu_ou]
        all_ous.append(deu_fnd[deu_ou])

utils.write_json(MPIS_DIR + "map/mpi_ous.json", all_fnd)
Example #16
def routine():

    if not os.path.exists(LOG_DIR):
        os.makedirs(LOG_DIR)

    if not os.path.exists(GRAPH_DIR):
        os.makedirs(GRAPH_DIR)

    print("console output is redirected to graph_ous_mpis.log ...")

    stdout = sys.stdout

    log = open(os.path.join(LOG_DIR, "graph_ous_mpis.log"), "w+")
    sys.stdout = log

    mpis = utils.read_json(os.path.join(MAPPED_DIR, "ous_mpi.json"))
    ous = utils.read_json(os.path.join(OUS_DIR, "all.json"))

    ous_nodes = [["Id", "Label"]]
    ous_edges = [["Source", "Target"]]

    children = []

    ous_collected = []

    for rec in ous['records']:
        if rec['data']['objectId'] in mpis:
            objectId = rec['data']['objectId']
            name = utils.clean_string(rec['data']['name'])
            ous_nodes.append([objectId, name])
            ous_collected.append(objectId)
            if 'parentAffiliation' in rec['data']:
                parent = rec['data']['parentAffiliation']['objectId']
                ous_edges.append([objectId, parent])
            else:
                print("no parent:", objectId)
        elif 'parentAffiliation' in rec['data']:
            if rec['data']['parentAffiliation']['objectId'] in mpis \
                    or rec['data']['parentAffiliation']['objectId'] in children:
                objectId = rec['data']['objectId']
                name = utils.clean_string(rec['data']['name'])
                ous_nodes.append([objectId, name])
                ous_collected.append(objectId)
                parent = rec['data']['parentAffiliation']['objectId']
                ous_edges.append([objectId, parent])
                if rec['data']['hasChildren']:
                    children.append(objectId)

    found = True
    while found:
        changed = False
        for rec in ous['records']:
            if rec['data']['objectId'] not in ous_collected \
                    and 'parentAffiliation' in rec['data']:
                if rec['data']['parentAffiliation']['objectId'] in mpis \
                        or rec['data']['parentAffiliation']['objectId'] in children:
                    objectId = rec['data']['objectId']
                    name = utils.clean_string(rec['data']['name'])
                    ous_nodes.append([objectId, name])
                    ous_collected.append(objectId)
                    changed = True
                    parent = rec['data']['parentAffiliation']['objectId']
                    ous_edges.append([objectId, parent])
                    if rec['data']['hasChildren']:
                        children.append(objectId)
        if not changed:
            found = False

    utils.write_csv(os.path.join(GRAPH_DIR, "mpis--ous_nodes--tree-full.csv"),
                    ous_nodes)
    utils.write_csv(os.path.join(GRAPH_DIR, "mpis--ous_ous_edges--tree.csv"),
                    ous_edges)

    # Institutes

    institutes = [['Id', 'Label']]

    for rec in ous['records']:
        if rec['data']['objectId'] in mpis:
            objectId = rec['data']['objectId']
            name = utils.clean_string(rec['data']['name'])
            institutes.append([objectId, name])

    utils.write_csv(os.path.join(GRAPH_DIR, 'mpis--ous_nodes.csv'), institutes)

    # Children of Institutes

    kids_names = [["Id", "Label"]]

    mpis_kids_nodes = utils.read_csv_with_header(
        os.path.join(GRAPH_DIR, 'mpis--ous_nodes--tree-full.csv'))
    mpis_kids_nodes = list(mpis_kids_nodes.values())

    for i in range(1, len(mpis_kids_nodes[0])):
        kid_id = mpis_kids_nodes[0][i]
        kid_name = utils.clean_string(mpis_kids_nodes[1][i])
        if kid_id not in mpis:
            kids_names.append([kid_id, kid_name])

    utils.write_csv(
        os.path.join(GRAPH_DIR, 'mpis--ous_nodes--tree-children.csv'),
        kids_names)

    log.close()
    sys.stdout = stdout
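
The while/found loop in Example #16 is a fixed-point iteration: it rescans all records until no new descendant of an MPI turns up, so every pass is a full sweep. Indexing records by parent first turns the same collection step into a single breadth-first walk (a sketch, assuming the record shape used above):

from collections import defaultdict, deque

def collect_subtree_ids(records, roots):
    # index OUs by parent, then walk down from the MPI roots;
    # returns the ids of all OUs in the MPI subtrees
    children = defaultdict(list)
    for rec in records:
        parent = rec['data'].get('parentAffiliation', {}).get('objectId')
        if parent:
            children[parent].append(rec['data']['objectId'])
    seen, queue = set(roots), deque(roots)
    while queue:
        for child in children[queue.popleft()]:
            if child not in seen:
                seen.add(child)
                queue.append(child)
    return seen

# nodes and edges can then be emitted from the records whose id is in the set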
Example #17
def routine():

    if not os.path.exists(LOG_DIR):
        os.makedirs(LOG_DIR)

    print("console output is redirected to count_persons.log ...")

    stdout = sys.stdout
    log = open(os.path.join(LOG_DIR, "count_persons.log"), "w+")
    sys.stdout = log

    from ..utils.local import ld

    PERS_STATS = os.path.join(STATS_DIR, 'persons')

    if not os.path.exists(PERS_STATS):
        os.makedirs(PERS_STATS)

    ous_ctx = utils.read_json(os.path.join(EXTDATA_DIR, 'selected.json'))
    mpis = utils.read_json(os.path.join(MAPPED_DIR, 'ous_mpi.json'))

    print("start processing!")
    start_time = time.time()

    for mpi in mpis:
        if mpi not in ous_ctx:
            print(mpis[mpi] + " has no contexts!")
            print("")
            continue

        print("processing " + mpis[mpi] + "...")
        stats = {}
        mpi_ctxs = ous_ctx[mpi]
        for mpi_ctx in mpi_ctxs:
            print("extracting " + mpi_ctx + " ...")

            ctx_data = ld.get_data(mpi_ctx)[0]

            # consider only released items (avoid shadowing the built-in all())
            data_set = DataSet(data_id=ctx_data.idx + "_released",
                               raw=ctx_data.get_items_released())

            if not data_set.records:
                print(mpi_ctx + " has no records!")
                continue

            authors = data_set.get_creators_data()  # only CoNE-related authors!

            a = list(authors.keys())
            a.sort()

            print(str(len(a)) + " CoNE persons to process ...")

            records = 0

            for i in a:
                if i in stats:
                    stats[i] += len(authors[i])
                else:
                    stats[i] = len(authors[i])
                records += len(authors[i])

            print("... with " + str(records) + " attributed records!")

        if not stats:
            continue

        stats = sorted(stats.items(), key=lambda x: x[1], reverse=True)

        idx, num_pub = zip(*stats)

        total = len(idx)

        path = os.path.join(PERS_STATS, mpi + '_pers_pub.csv')

        print("write stats to file: " + path)

        with open(path, 'w', newline='') as csv_file:
            # csv.writer rejects an empty quotechar; with QUOTE_NONE the
            # quote character is unused, so it can simply be omitted
            csv_writer = csv.writer(csv_file, delimiter='\t',
                                    quoting=csv.QUOTE_NONE)
            csv_writer.writerow(['authors', 'publications'])
            for i in range(0, total):
                csv_writer.writerow([idx[i], num_pub[i]])

        print("finished " + mpis[mpi] + "!")
        print("")

    print("finished processing after %s sec!" %
          round(time.time() - start_time, 2))

    log.close()
    sys.stdout = stdout
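
The per-author tallying in Example #17 (and the journal tallying in Example #18) is exactly what collections.Counter provides. A sketch of the accumulation step, assuming the authors mapping of CoNE id -> list of records returned per context:

from collections import Counter

def count_author_records(contexts):
    # contexts: iterable of authors mappings (CoNE id -> list of
    # records), one per publication context of the institute
    stats = Counter()
    for authors in contexts:
        stats.update({idx: len(recs) for idx, recs in authors.items()})
    # most_common() replaces the manual sorted(stats.items(), ...) step
    return stats.most_common()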
Example #18
def routine():

    if not os.path.exists(LOG_DIR):
        os.makedirs(LOG_DIR)

    print("console output is redirected to count_journals.log ...")

    stdout = sys.stdout
    log = open(os.path.join(LOG_DIR, "count_journals.log"), "w+")
    sys.stdout = log

    from ..utils.local import ld

    JOUR_STATS = os.path.join(STATS_DIR, 'journals')

    if not os.path.exists(JOUR_STATS):
        os.makedirs(JOUR_STATS)

    ous_ctx = utils.read_json(os.path.join(EXTDATA_DIR, 'selected.json'))
    mpis = utils.read_json(os.path.join(MAPPED_DIR, 'ous_mpi.json'))

    print("start processing!")
    start_time = time.time()

    for mpi in mpis:
        if mpi not in ous_ctx:
            print(mpis[mpi] + " has no contexts!")
            print("")
            continue

        print("processing " + mpis[mpi] + "...")

        articles = []
        journals = {}
        counter = 0
        nojour = 0

        mpi_ctxs = ous_ctx[mpi]
        for mpi_ctx in mpi_ctxs:
            print("extracting " + mpi_ctx + " ...")

            ctx_data = ld.get_data(mpi_ctx)[0]

            # consider only released items (avoid shadowing the built-in all())
            data_set = DataSet(data_id=ctx_data.idx + "_released",
                               raw=ctx_data.get_items_released())

            if not data_set.records:
                print(mpi_ctx + " has no records!")
                continue

            print(str(data_set.num) + " records to process...")

            for record in data_set.records:
                data = record['data']
                if data['publicState'] == 'RELEASED':
                    if data['metadata']['genre'] == 'ARTICLE':
                        articles.append(record)

        # count once per institute, after all contexts have been read;
        # counting inside the context loop re-counted articles from
        # earlier contexts on every pass
        for article in articles:
            jour = False
            if 'sources' in article['data']['metadata']:
                for source in article['data']['metadata']['sources']:
                    if source['genre'] == 'JOURNAL':
                        if 'title' in source:
                            jour = True
                            counter += 1
                            if source['title'] in journals:
                                journals[source['title']] += 1
                            else:
                                journals[source['title']] = 1
                        else:
                            print(article['data']['objectId'] +
                                  " has journal as source without title!")
                            continue
                    if jour:
                        break
                if not jour:
                    nojour += 1
            else:
                print("found article " +
                      article['data']['objectId'] + " without any source!")

        print('found ' + str(counter) + ' articles with journals as source')
        print('found ' + str(nojour) + ' articles without a journal as source')

        journals = sorted(journals.items(), key=lambda x: x[1], reverse=True)

        total = len(journals)

        path = os.path.join(JOUR_STATS, mpi + '_jour_art.csv')

        print("write stats to file: " + path)

        with open(path, 'w', newline='') as csv_file:
            csv_writer = csv.writer(
                csv_file, delimiter='\t', quotechar='"', quoting=csv.QUOTE_MINIMAL)
            csv_writer.writerow(['journals', 'articles'])
            for i in range(0, total):
                jour, art = journals[i]
                jour = jour.replace('\t', ' ')
                jour = jour.replace(',', '')
                jour = utils.clean_string(jour)
                csv_writer.writerow([jour, art])

        print("finished " + mpis[mpi] + "!")
        print("")

    print("finished processing after %s sec!" %
          round(time.time() - start_time, 2))

    log.close()
    sys.stdout = stdout