Example #1
def routine():
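    """Export organisational units and their parent relations as node/edge CSV tables."""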

    if not os.path.exists(LOG_DIR):
        os.makedirs(LOG_DIR)

    if not os.path.exists(GRAPH_DIR):
        os.makedirs(GRAPH_DIR)

    print("console output is redirected to graph_ous.log ...")

    stdout = sys.stdout

    log = open(os.path.join(LOG_DIR, "graph_ous.log"), "w+")
    sys.stdout = log

    ous = utils.read_json(os.path.join(OUS_DIR, "all.json"))

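    # node and edge tables, each starting with a header row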
    org_nodes = [['Id', 'Label']]
    org_edges = [['Source', 'Target']]

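    # one node per organisational unit, one edge from each unit to its parent affiliation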
    for record in ous['records']:
        org_unit_id = record['data']['objectId']
        org_unit_name = utils.clean_string(record['data']['name'])
        org_nodes.append([org_unit_id, org_unit_name])
        if 'parentAffiliation' in record['data']:
            parent = record['data']['parentAffiliation']['objectId']
            org_edges.append([org_unit_id, parent])

    utils.write_csv(os.path.join(GRAPH_DIR, 'pure--ous_nodes.csv'), org_nodes)
    utils.write_csv(os.path.join(GRAPH_DIR, 'pure--ous_ous_edges.csv'),
                    org_edges)

    log.close()
    sys.stdout = stdout
Example #2
def routine():
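    """Export institute nodes, context nodes and institute-context edges as CSV tables."""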

    if not os.path.exists(LOG_DIR):
        os.makedirs(LOG_DIR)

    if not os.path.exists(GRAPH_DIR):
        os.makedirs(GRAPH_DIR)

    print("console output is redirected to graph_contexts_mpis.log ...")

    stdout = sys.stdout

    log = open(os.path.join(LOG_DIR, "graph_contexts_mpis.log"), "w+")
    sys.stdout = log

    ctxs = utils.read_json(os.path.join(CTX_DIR, "all.json"))
    ous = utils.read_json(os.path.join(OUS_DIR, "all.json"))
    mpis = utils.read_json(os.path.join(MAPPED_DIR, "ous_ctx.json"))

    institutes = [['Id', 'Label']]
    contexts = [['Id', 'Label', 'Created']]

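    # node rows for every organisational unit that is mapped as an institute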
    for rec in ous['records']:
        if rec['data']['objectId'] in mpis:
            objectId = rec['data']['objectId']
            name = utils.clean_string(rec['data']['name'])
            institutes.append([objectId, name])

    utils.write_csv(os.path.join(GRAPH_DIR, 'mpis--ous_nodes--ctx.csv'),
                    institutes)

    institutes_contexts = [['Source', 'Target']]
    mpis_ctx = []

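    # one edge per (institute, context) pair; collect the context ids for the node table below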
    for mpi in mpis:
        for context in mpis[mpi]:
            institutes_contexts.append([mpi, context])
            mpis_ctx.append(context)

    utils.write_csv(os.path.join(GRAPH_DIR, 'mpis--ous_ctx_edges.csv'),
                    institutes_contexts)

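    # node rows for every context that belongs to at least one institute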
    for rec in ctxs['records']:
        objectId = rec['data']['objectId']
        if objectId in mpis_ctx:
            name = rec['data']['name'].replace('"', '')
            created = rec['data']['creationDate'].split("-")[0]
            contexts.append([objectId, name, created])

    utils.write_csv(os.path.join(GRAPH_DIR, 'mpis--ctx_nodes--ous.csv'),
                    contexts)

    log.close()
    sys.stdout = stdout
Example #3
def routine():
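    """Export the organisational-unit tree below the institutes as node/edge CSV tables."""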

    if not os.path.exists(LOG_DIR):
        os.makedirs(LOG_DIR)

    if not os.path.exists(GRAPH_DIR):
        os.makedirs(GRAPH_DIR)

    print("console output is redirected to graph_ous_mpis.log ...")

    stdout = sys.stdout

    log = open(os.path.join(LOG_DIR, "graph_ous_mpis.log"), "w+")
    sys.stdout = log

    mpis = utils.read_json(os.path.join(MAPPED_DIR, "ous_mpi.json"))
    ous = utils.read_json(os.path.join(OUS_DIR, "all.json"))

    ous_nodes = [["Id", "Label"]]
    ous_edges = [["Source", "Target"]]

    children = []

    ous_collected = []

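    # first pass: institutes themselves, plus units whose parent is an institute
    # or an already collected child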
    for rec in ous['records']:
        if rec['data']['objectId'] in mpis:
            objectId = rec['data']['objectId']
            name = utils.clean_string(rec['data']['name'])
            ous_nodes.append([objectId, name])
            ous_collected.append(objectId)
            if 'parentAffiliation' in rec['data']:
                parent = rec['data']['parentAffiliation']['objectId']
                ous_edges.append([objectId, parent])
            else:
                print("no parent:", objectId)
        if rec['data']['objectId'] not in mpis \
                and 'parentAffiliation' in rec['data']:
            if rec['data']['parentAffiliation']['objectId'] in mpis \
                    or rec['data']['parentAffiliation']['objectId'] in children:
                objectId = rec['data']['objectId']
                name = utils.clean_string(rec['data']['name'])
                ous_nodes.append([objectId, name])
                ous_collected.append(objectId)
                parent = rec['data']['parentAffiliation']['objectId']
                ous_edges.append([objectId, parent])
                if rec['data']['hasChildren']:
                    children.append(objectId)

    # keep sweeping until a full pass adds no further descendants
    changed = True
    while changed:
        changed = False
        for rec in ous['records']:
            if rec['data']['objectId'] not in ous_collected \
                    and 'parentAffiliation' in rec['data']:
                if rec['data']['parentAffiliation']['objectId'] in mpis \
                        or rec['data']['parentAffiliation']['objectId'] in children:
                    objectId = rec['data']['objectId']
                    name = utils.clean_string(rec['data']['name'])
                    ous_nodes.append([objectId, name])
                    ous_collected.append(objectId)
                    changed = True
                    parent = rec['data']['parentAffiliation']['objectId']
                    ous_edges.append([objectId, parent])
                    if rec['data']['hasChildren']:
                        children.append(objectId)

    utils.write_csv(os.path.join(GRAPH_DIR, "mpis--ous_nodes--tree-full.csv"),
                    ous_nodes)
    utils.write_csv(os.path.join(GRAPH_DIR, "mpis--ous_ous_edges--tree.csv"),
                    ous_edges)

    # Institutes

    institutes = [['Id', 'Label']]

    for rec in ous['records']:
        if rec['data']['objectId'] in mpis:
            objectId = rec['data']['objectId']
            name = utils.clean_string(rec['data']['name'])
            institutes.append([objectId, name])

    utils.write_csv(os.path.join(GRAPH_DIR, 'mpis--ous_nodes.csv'), institutes)

    # Children of Institutes

    kids_names = [["Id", "Label"]]

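    # re-read the full tree node table and keep only the non-institute rows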
    mpis_kids_nodes = utils.read_csv_with_header(
        os.path.join(GRAPH_DIR, 'mpis--ous_nodes--tree-full.csv'))
    mpis_kids_nodes = list(mpis_kids_nodes.values())

    for i in range(1, len(mpis_kids_nodes[0])):
        kid_id = mpis_kids_nodes[0][i]
        kid_name = utils.clean_string(mpis_kids_nodes[1][i])
        if kid_id not in mpis:
            kids_names.append([kid_id, kid_name])

    utils.write_csv(
        os.path.join(GRAPH_DIR, 'mpis--ous_nodes--tree-children.csv'),
        kids_names)

    log.close()
    sys.stdout = stdout
Example #4
def routine():
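    """Count released journal articles per institute and write the per-journal counts to CSV."""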

    if not os.path.exists(LOG_DIR):
        os.makedirs(LOG_DIR)

    print("console output is redirected to count_journals.log ...")

    stdout = sys.stdout
    log = open(os.path.join(LOG_DIR, "count_journals.log"), "w+")
    sys.stdout = log

    from ..utils.local import ld

    JOUR_STATS = os.path.join(STATS_DIR, 'journals')

    if not os.path.exists(JOUR_STATS):
        os.makedirs(JOUR_STATS)

    ous_ctx = utils.read_json(os.path.join(EXTDATA_DIR, 'selected.json'))
    mpis = utils.read_json(os.path.join(MAPPED_DIR, 'ous_mpi.json'))

    print("start processing!")
    start_time = time.time()

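    # build one journal statistics file per institute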
    for mpi in mpis:
        if mpi not in ous_ctx:
            print(mpis[mpi] + " has no contexts!")
            print("")
            continue

        print("processing " + mpis[mpi] + "...")

        articles = []
        journals = {}
        counter = 0
        nojour = 0

        mpi_ctxs = ous_ctx[mpi]
        for mpi_ctx in mpi_ctxs:
            print("extracting " + mpi_ctx + " ...")

            # avoid shadowing the built-in all()
            all_data = ld.get_data(mpi_ctx)[0]

            # consider only released items
            data_set = DataSet(data_id=all_data.idx + "_released",
                               raw=all_data.get_items_released())

            if not data_set.records:
                print(mpi_ctx + " has no records!")
                continue

            print(str(data_set.num) + " records to process...")

            # collect this context's released articles separately so that
            # articles from earlier contexts are not counted again
            ctx_articles = []
            for record in data_set.records:
                data = record['data']
                if data['publicState'] == 'RELEASED':
                    if data['metadata']['genre'] == 'ARTICLE':
                        ctx_articles.append(record)
            articles.extend(ctx_articles)

            for article in ctx_articles:
                jour = False
                if 'sources' in article['data']['metadata']:
                    for source in article['data']['metadata']['sources']:
                        if source['genre'] == 'JOURNAL':
                            if 'title' in source:
                                jour = True
                                counter += 1
                                if source['title'] in journals:
                                    journals[source['title']] += 1
                                else:
                                    journals[source['title']] = 1
                            else:
                                print(article['data']['objectId'] +
                                      " has journal as source without title!")
                                continue
                        if jour:
                            break
                    if not jour:
                        nojour += 1
                else:
                    print("found article " +
                          article['data']['objectId'] + " without any source!")

        print('found ' + str(counter) + ' articles with journals as source')
        print('found ' + str(nojour) + ' articles without a journal as source')

        # rank journals by article count, highest first
        journals = sorted(journals.items(), key=lambda x: x[1], reverse=True)

        total = len(journals)

        path = os.path.join(JOUR_STATS, mpi + '_jour_art.csv')

        print("write stats to file: " + path)

        # tab-separated output: one row per journal with its article count
        with open(path, 'w', newline='') as csv_file:
            # quoting=csv.QUOTE_NONE
            csv_writer = csv.writer(
                csv_file, delimiter='\t', quotechar='"', quoting=csv.QUOTE_MINIMAL)
            csv_writer.writerow(['journals', 'articles'])
            for i in range(0, total):
                jour, art = journals[i]
                jour = jour.replace('\t', ' ')
                jour = jour.replace(',', '')
                jour = utils.clean_string(jour)
                csv_writer.writerow([jour, art])

        print("finished " + mpis[mpi] + "!")
        print("")

    print("finished processing after %s sec!" %
          round(time.time() - start_time, 2))

    log.close()
    sys.stdout = stdout