Esempio n. 1
0
def titles_from_ctx_in_language_by_person_from_genre(ctx_id='ctx_1542176',
                                                     lang_id='eng',
                                                     genre='ARTICLE',
                                                     preprocess=True):
    """Collect titles per creator for one context, genre and language.

    :param ctx_id: context identifier to load records for.
    :param lang_id: language code the records must match.
    :param genre: genre the records must match.
    :param preprocess: when True, titles are cleaned and titles that
        clean to an empty string are dropped.
    :return: dict mapping creator id -> list of titles; empty dict when
        the context has no publications for the genre or language.
    """
    total = ld.get_data(ctx_id)[0]
    data_set = DataSet(data_id=ctx_id + "_released",
                       raw=total.get_items_released())
    genres = data_set.get_genre_data()
    # guard clauses instead of nested else branches
    if genre not in genres:
        print(ctx_id, "has no publications with genre", genre + "!")
        return {}
    genre_data = DataSet(data_id=ctx_id + genre, raw=genres[genre])
    lang_data = genre_data.get_languages_data()
    if lang_id not in lang_data:
        print(ctx_id, "has no " + lang_id + " publications!")
        return {}
    lang_data = DataSet(data_id=ctx_id + '_' + lang_id,
                        raw=lang_data[lang_id])
    creator_rec = lang_data.get_creators_data()
    creator_titles = {}
    for c in sorted(creator_rec):
        titles = extract.titles_from_records(creator_rec[c])
        if preprocess:
            # clean each title exactly once (the original called
            # clean() twice per title) and drop empty results
            titles = [t for t in (clean(title) for title in titles) if t]
        creator_titles[c] = titles
    return creator_titles
Esempio n. 2
0
def titles_from_ctx_in_language_by_person_and_year(ctx_id='ctx_1542176',
                                                   lang_id='eng',
                                                   preprocess=True):
    """Collect titles per creator and per year for one context/language.

    :param ctx_id: context identifier to load records for.
    :param lang_id: language code the records must match.
    :param preprocess: when True, titles are cleaned and titles that
        clean to an empty string are dropped.
    :return: dict mapping creator id -> {year -> list of titles}; empty
        dict when the context has no publications in the language.
    """
    total = ld.get_data(ctx_id)[0]
    data_set = DataSet(data_id=ctx_id + "_released",
                       raw=total.get_items_released())
    lang_data = data_set.get_languages_data()
    # guard clause instead of a trailing else branch
    if lang_id not in lang_data:
        print(ctx_id, "has no " + lang_id + " publications!")
        return {}
    lang_data = DataSet(data_id=ctx_id + '_' + lang_id,
                        raw=lang_data[lang_id])
    creator_rec = lang_data.get_creators_data()
    creator_titles = {}
    for c in sorted(creator_rec):
        creator_titles[c] = {}
        creator_data = DataSet(data_id=ctx_id + "_subset",
                               raw=creator_rec[c])
        years_data = creator_data.get_years_data()
        for year in years_data:
            titles = extract.titles_from_records(years_data[year])
            if preprocess:
                # clean each title exactly once (the original called
                # clean() twice per title) and drop empty results
                titles = [t for t in (clean(title) for title in titles) if t]
            creator_titles[c][year] = titles
    return creator_titles
Esempio n. 3
0
def titles_from_ctx_in_language_by_year(ctx_id='ctx_1542176',
                                        lang_id='eng',
                                        preprocess=True):
    """Collect titles per year for one context and language.

    :param ctx_id: context identifier to load records for.
    :param lang_id: language code the records must match.
    :param preprocess: when True, titles are cleaned and titles that
        clean to an empty string are dropped.
    :return: dict mapping year -> list of titles; empty dict when the
        context has no publications in the language.
    """
    total = ld.get_data(ctx_id)[0]
    data_set = DataSet(data_id=ctx_id + "_released",
                       raw=total.get_items_released())
    lang_data = data_set.get_languages_data()
    # guard clause instead of a trailing else branch
    if lang_id not in lang_data:
        print(ctx_id, "has no " + lang_id + " publications!")
        return {}
    lang_data = DataSet(data_id=ctx_id + '_' + lang_id,
                        raw=lang_data[lang_id])
    years_data = lang_data.get_years_data()
    lang_years_titles = {}
    for year in years_data:
        lang_year_data = DataSet(data_id=lang_id + '_' + year,
                                 raw=years_data[year])
        titles = extract.titles_from_records(lang_year_data.records)
        if preprocess:
            # clean each title exactly once (the original called
            # clean() twice per title) and drop empty results
            titles = [t for t in (clean(title) for title in titles) if t]
        lang_years_titles[year] = titles
    return lang_years_titles
Esempio n. 4
0
def titles_from_ctx_in_language_and_genre(ctx_id='ctx_1542176',
                                          lang_id='eng',
                                          genre='ARTICLE',
                                          preprocess=True):
    """Collect all titles for one context, genre and language.

    :param ctx_id: context identifier to load records for.
    :param lang_id: language code the records must match.
    :param genre: genre the records must match.
    :param preprocess: when True, titles are cleaned and titles that
        clean to an empty string are dropped.
    :return: list of titles; empty list when the context has no
        publications for the genre or language.
    """
    total = ld.get_data(ctx_id)[0]
    data_set = DataSet(data_id=ctx_id + "_released",
                       raw=total.get_items_released())
    genres = data_set.get_genre_data()
    # guard clauses instead of nested else branches
    if genre not in genres:
        print(ctx_id, "has no publications with genre", genre + "!")
        return []
    genre_data = DataSet(data_id=ctx_id + genre, raw=genres[genre])
    lang_data = genre_data.get_languages_data()
    if lang_id not in lang_data:
        print(ctx_id, "has no " + lang_id + " publications with genre",
              genre + "!")
        return []
    lang_data = DataSet(data_id=ctx_id + '_' + lang_id,
                        raw=lang_data[lang_id])
    lang_titles = extract.titles_from_records(lang_data.records)
    if preprocess:
        # clean each title exactly once (the original called clean()
        # twice per title) and drop empty results
        lang_titles = [t for t in (clean(title) for title in lang_titles)
                       if t]
    return lang_titles
Esempio n. 5
0
def titles_from_ctx_in_language_by_item(ctx_id='ctx_1542176',
                                        lang_id='eng',
                                        preprocess=True):
    """Map item ids to their (optionally cleaned) title for one context
    and language.

    :param ctx_id: context identifier to load records for.
    :param lang_id: language code the records must match.
    :param preprocess: when True, titles are run through clean().
    :return: dict mapping item id -> non-empty title string; empty dict
        when the context has no publications in the language.
    """
    total = ld.get_data(ctx_id)[0]
    released = DataSet(data_id=ctx_id + "_released",
                       raw=total.get_items_released())
    by_language = released.get_languages_data()
    if lang_id in by_language:
        subset = DataSet(data_id=ctx_id + '_' + lang_id,
                         raw=by_language[lang_id])
        titles = extract.titles_from_records(subset.records)
        ids = [extract.idx_from_item(rec) for rec in subset.records]
        if preprocess:
            titles = [clean(t) for t in titles]
        # only consider non empty title strings
        return {item_id: title
                for item_id, title in zip(ids, titles)
                if title}
    else:
        print(ctx_id, "has no " + lang_id + " publications!")
        return {}
Esempio n. 6
0
def routine():
    """Count per-person publication numbers for every MPI and write one
    tab-separated CSV ('<mpi>_pers_pub.csv') per institute.

    Console output is redirected to LOG_DIR/count_persons.log for the
    duration of the run; stdout is restored and the log closed even if
    processing fails (the original leaked both on error).
    """
    if not os.path.exists(LOG_DIR):
        os.makedirs(LOG_DIR)

    print("console output is redirected to count_persons.log ...")

    stdout = sys.stdout
    log = open(os.path.join(LOG_DIR, "count_persons.log"), "w+")
    sys.stdout = log

    try:
        from ..utils.local import ld

        PERS_STATS = os.path.join(STATS_DIR, 'persons')

        if not os.path.exists(PERS_STATS):
            os.makedirs(PERS_STATS)

        ous_ctx = utils.read_json(os.path.join(EXTDATA_DIR, 'selected.json'))
        mpis = utils.read_json(os.path.join(MAPPED_DIR, 'ous_mpi.json'))

        print("start processing!")
        start_time = time.time()

        for mpi in mpis:
            if mpi not in ous_ctx:
                print(mpis[mpi] + " has no contexts!")
                print("")
                continue

            print("processing " + mpis[mpi] + "...")
            stats = {}
            for mpi_ctx in ous_ctx[mpi]:
                print("extracting " + mpi_ctx + " ...")

                # renamed from ``all`` to avoid shadowing the builtin
                ctx_data = ld.get_data(mpi_ctx)[0]

                # consider only released items
                data_set = DataSet(data_id=ctx_data.idx + "_released",
                                   raw=ctx_data.get_items_released())

                if not data_set.records:
                    print(mpi_ctx + " has no records!")
                    continue

                # only CoNE related authors!
                authors = data_set.get_creators_data()

                author_ids = sorted(authors)

                print(str(len(author_ids)) + " CoNE persons to process ...")

                records = 0
                for author in author_ids:
                    count = len(authors[author])
                    # accumulate across contexts of the same MPI
                    stats[author] = stats.get(author, 0) + count
                    records += count

                print("... with " + str(records) + " attributed records!")

            if not stats:
                continue

            # most-published authors first
            stats = sorted(stats.items(), key=lambda x: x[1], reverse=True)

            idx, num_pub = zip(*stats)

            path = os.path.join(PERS_STATS, mpi + '_pers_pub.csv')

            print("write stats to file: " + path)

            with open(path, 'w', newline='') as csv_file:
                csv_writer = csv.writer(
                    csv_file, delimiter='\t', quotechar='',
                    quoting=csv.QUOTE_NONE)  # , quoting=csv.QUOTE_MINIMAL
                csv_writer.writerow(['authors', 'publications'])
                for author, count in zip(idx, num_pub):
                    csv_writer.writerow([author, count])

            print("finished " + mpis[mpi] + "!")
            print("")

        print("finished processing after %s sec!" %
              round(time.time() - start_time, 2))
    finally:
        # always restore stdout and close the log, even on error
        log.close()
        sys.stdout = stdout
Esempio n. 7
0
# Top-level extraction loop: walk every context data file and pull the
# basic publication fields of each released record.
print("start processing data!")

start_time = time.time()

for path in data_paths:

    # context id is the file name without its ".json" extension
    idx = path.split("/")[-1].replace(".json", "")

    print("")
    print("processing", idx, "...")

    all = ld.get_data(idx)[0]  # NOTE(review): shadows the builtin ``all``

    # consider only released items
    data_set = DataSet(data_id=all.idx + "_released",
                       raw=all.get_items_released())

    print(data_set.num, "records to process...")

    # loop over every record
    for record in data_set.records:

        # ///////////////////// #
        # /// PUBLICATIONS /// #
        # /////////////////// #

        # per-record publication attributes extracted from the raw item
        item_id = extract.idx_from_item(record)
        item_year = extract.date_from_item(record)
        item_title = clean_title(extract.title_from_item(record))
        item_genre = extract.genre_from_item(record)
        # multiple languages are joined into one ";"-separated string
        item_lang = ";".join(extract.languages_from_items(record))
Esempio n. 8
0
def routine():
    """Count per-journal article numbers for every MPI and write one
    tab-separated CSV ('<mpi>_jour_art.csv') per institute.

    Console output is redirected to LOG_DIR/count_journals.log for the
    duration of the run; stdout is restored and the log closed even if
    processing fails (the original leaked both on error).
    """
    if not os.path.exists(LOG_DIR):
        os.makedirs(LOG_DIR)

    print("console output is redirected to count_journals.log ...")

    stdout = sys.stdout
    log = open(os.path.join(LOG_DIR, "count_journals.log"), "w+")
    sys.stdout = log

    try:
        from ..utils.local import ld

        JOUR_STATS = os.path.join(STATS_DIR, 'journals')

        if not os.path.exists(JOUR_STATS):
            os.makedirs(JOUR_STATS)

        ous_ctx = utils.read_json(os.path.join(EXTDATA_DIR, 'selected.json'))
        mpis = utils.read_json(os.path.join(MAPPED_DIR, 'ous_mpi.json'))

        print("start processing!")
        start_time = time.time()

        for mpi in mpis:
            if mpi not in ous_ctx:
                print(mpis[mpi] + " has no contexts!")
                print("")
                continue

            print("processing " + mpis[mpi] + "...")

            journals = {}
            counter = 0
            nojour = 0

            for mpi_ctx in ous_ctx[mpi]:
                print("extracting " + mpi_ctx + " ...")

                # renamed from ``all`` to avoid shadowing the builtin
                ctx_data = ld.get_data(mpi_ctx)[0]

                # consider only released items
                data_set = DataSet(data_id=ctx_data.idx + "_released",
                                   raw=ctx_data.get_items_released())

                if not data_set.records:
                    print(mpi_ctx + " has no records!")
                    continue

                print(str(data_set.num) + " records to process...")

                # BUG FIX: build the article list fresh per context.
                # The original accumulated articles across contexts and
                # re-iterated the full list for every context, counting
                # each earlier context's articles multiple times.
                articles = []
                for record in data_set.records:
                    data = record['data']
                    if data['publicState'] == 'RELEASED':
                        if data['metadata']['genre'] == 'ARTICLE':
                            articles.append(record)

                for article in articles:
                    jour = False
                    metadata = article['data']['metadata']
                    if 'sources' not in metadata:
                        # articles without sources are reported but not
                        # counted towards ``nojour``
                        print("found article " +
                              article['data']['objectId'] +
                              " without any source!")
                        continue
                    for source in metadata['sources']:
                        if source['genre'] == 'JOURNAL':
                            if 'title' in source:
                                jour = True
                                counter += 1
                                title = source['title']
                                journals[title] = journals.get(title, 0) + 1
                            else:
                                print(article['data']['objectId'] +
                                      " has journal as source without title!")
                                continue
                        # stop at the first journal source with a title
                        if jour:
                            break
                    if not jour:
                        nojour += 1

            print('found ' + str(counter) + ' articles with journals as source')
            # typo fix in the log message: "souce" -> "source"
            print('found ' + str(nojour) + ' articles without a journal as source')

            # most-cited journals first
            journals = sorted(journals.items(), key=lambda x: x[1],
                              reverse=True)

            path = os.path.join(JOUR_STATS, mpi + '_jour_art.csv')

            print("write stats to file: " + path)

            with open(path, 'w', newline='') as csv_file:
                # quoting=csv.QUOTE_NONE
                csv_writer = csv.writer(
                    csv_file, delimiter='\t', quotechar='"',
                    quoting=csv.QUOTE_MINIMAL)
                csv_writer.writerow(['journals', 'articles'])
                for jour, art in journals:
                    # normalise journal names for the TSV output
                    jour = jour.replace('\t', ' ')
                    jour = jour.replace(',', '')
                    jour = utils.clean_string(jour)
                    csv_writer.writerow([jour, art])

            print("finished " + mpis[mpi] + "!")
            print("")

        print("finished processing after %s sec!" %
              round(time.time() - start_time, 2))
    finally:
        # always restore stdout and close the log, even on error
        log.close()
        sys.stdout = stdout