def titles_from_ctx_in_language_by_person_from_genre(ctx_id='ctx_1542176',
                                                     lang_id='eng',
                                                     genre='ARTICLE',
                                                     preprocess=True):
    """Map each creator to their cleaned titles for one genre and language."""
    total = ld.get_data(ctx_id)[0]
    # consider only released items
    data_set = DataSet(data_id=ctx_id + "_released",
                       raw=total.get_items_released())
    genres = data_set.get_genre_data()
    if genre in genres:
        genre_data = DataSet(data_id=ctx_id + '_' + genre, raw=genres[genre])
        lang_data = genre_data.get_languages_data()
        if lang_id in lang_data:
            lang_records = lang_data[lang_id]
            lang_data = DataSet(data_id=ctx_id + '_' + lang_id,
                                raw=lang_records)
            creator_rec = lang_data.get_creators_data()
            creators = sorted(creator_rec.keys())
            creator_titles = {}
            for c in creators:
                titles = extract.titles_from_records(creator_rec[c])
                if preprocess:
                    # drop titles that are empty after cleaning
                    titles = [clean(title) for title in titles
                              if clean(title)]
                creator_titles[c] = titles
            return creator_titles
        else:
            print(ctx_id, "has no " + lang_id + " publications!")
            return {}
    else:
        print(ctx_id, "has no publications with genre", genre + "!")
        return {}
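# Usage sketch (not part of the original module, helper name made up for
# illustration): print how many cleaned English article titles each creator
# contributed, assuming the default context is reachable through ld.get_data.
def _demo_titles_by_person():
    by_person = titles_from_ctx_in_language_by_person_from_genre()
    for person in sorted(by_person):
        print(person, '->', len(by_person[person]), 'titles')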
def titles_from_ctx_in_language_by_person_and_year(ctx_id='ctx_1542176',
                                                   lang_id='eng',
                                                   preprocess=True):
    """Map each creator to a year -> cleaned-titles dictionary for one language."""
    total = ld.get_data(ctx_id)[0]
    # consider only released items
    data_set = DataSet(data_id=ctx_id + "_released",
                       raw=total.get_items_released())
    lang_data = data_set.get_languages_data()
    if lang_id in lang_data:
        lang_records = lang_data[lang_id]
        lang_data = DataSet(data_id=ctx_id + '_' + lang_id, raw=lang_records)
        creator_rec = lang_data.get_creators_data()
        creators = sorted(creator_rec.keys())
        creator_titles = {}
        for c in creators:
            creator_titles[c] = {}
            creator_data = DataSet(data_id=ctx_id + "_subset",
                                   raw=creator_rec[c])
            years_data = creator_data.get_years_data()
            for year in years_data:
                titles = extract.titles_from_records(years_data[year])
                if preprocess:
                    # drop titles that are empty after cleaning
                    titles = [clean(title) for title in titles
                              if clean(title)]
                creator_titles[c][year] = titles
        return creator_titles
    else:
        print(ctx_id, "has no " + lang_id + " publications!")
        return {}
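# Usage sketch (hypothetical helper): the return value is nested one level
# deeper than above, creator -> year -> titles, so summarising means
# aggregating over the inner dictionaries.
def _demo_titles_by_person_and_year():
    nested = titles_from_ctx_in_language_by_person_and_year()
    for person, years in nested.items():
        n_titles = sum(len(titles) for titles in years.values())
        print(person, 'published in', len(years), 'years:', n_titles, 'titles')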
def titles_from_ctx_in_language_by_year(ctx_id='ctx_1542176',
                                        lang_id='eng',
                                        preprocess=True):
    """Map each publication year to the cleaned titles of one language."""
    total = ld.get_data(ctx_id)[0]
    # consider only released items
    data_set = DataSet(data_id=ctx_id + "_released",
                       raw=total.get_items_released())
    lang_data = data_set.get_languages_data()
    if lang_id in lang_data:
        lang_records = lang_data[lang_id]
        lang_data = DataSet(data_id=ctx_id + '_' + lang_id, raw=lang_records)
        years_data = lang_data.get_years_data()
        lang_years_titles = {}
        for year in years_data:
            lang_year_data = DataSet(data_id=lang_id + '_' + year,
                                     raw=years_data[year])
            lang_year_titles = extract.titles_from_records(
                lang_year_data.records)
            if preprocess:
                # drop titles that are empty after cleaning
                lang_year_titles = [clean(title)
                                    for title in lang_year_titles
                                    if clean(title)]
            lang_years_titles[year] = lang_year_titles
        return lang_years_titles
    else:
        print(ctx_id, "has no " + lang_id + " publications!")
        return {}
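# Usage sketch (hypothetical helper): years arrive as dictionary keys, so
# sorting them gives a simple chronological title count.
def _demo_titles_by_year():
    per_year = titles_from_ctx_in_language_by_year()
    for year in sorted(per_year):
        print(year, len(per_year[year]), 'titles')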
def titles_from_ctx_in_language_and_genre(ctx_id='ctx_1542176',
                                          lang_id='eng',
                                          genre='ARTICLE',
                                          preprocess=True):
    """Return a flat list of cleaned titles for one language and genre."""
    total = ld.get_data(ctx_id)[0]
    # consider only released items
    data_set = DataSet(data_id=ctx_id + "_released",
                       raw=total.get_items_released())
    genres = data_set.get_genre_data()
    if genre in genres:
        genre_data = DataSet(data_id=ctx_id + '_' + genre, raw=genres[genre])
        lang_data = genre_data.get_languages_data()
        if lang_id in lang_data:
            lang_records = lang_data[lang_id]
            lang_data = DataSet(data_id=ctx_id + '_' + lang_id,
                                raw=lang_records)
            lang_titles = extract.titles_from_records(lang_data.records)
            if preprocess:
                # drop titles that are empty after cleaning
                lang_titles = [clean(title) for title in lang_titles
                               if clean(title)]
            return lang_titles
        else:
            print(ctx_id, "has no " + lang_id + " publications with genre",
                  genre + "!")
            return []
    else:
        print(ctx_id, "has no publications with genre", genre + "!")
        return []
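# Usage sketch (hypothetical helper): unlike the grouped variants above, this
# returns one flat list, convenient as direct input for text processing.
def _demo_titles_language_and_genre():
    titles = titles_from_ctx_in_language_and_genre()
    print(len(titles), 'cleaned English article titles, e.g.:')
    for title in titles[:3]:
        print('-', title)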
def titles_from_ctx_in_language_by_item(ctx_id='ctx_1542176',
                                        lang_id='eng',
                                        preprocess=True):
    """Map each item id to its (cleaned) title for one language."""
    total = ld.get_data(ctx_id)[0]
    # consider only released items
    data_set = DataSet(data_id=ctx_id + "_released",
                       raw=total.get_items_released())
    lang_data = data_set.get_languages_data()
    if lang_id in lang_data:
        lang_records = lang_data[lang_id]
        lang_data = DataSet(data_id=ctx_id + '_' + lang_id, raw=lang_records)
        lang_titles = extract.titles_from_records(lang_data.records)
        lang_idx = [extract.idx_from_item(r) for r in lang_data.records]
        if preprocess:
            lang_titles = [clean(title) for title in lang_titles]
        item_title = {}
        for i, j in enumerate(lang_idx):
            if lang_titles[i]:  # only consider non-empty title strings
                item_title[j] = lang_titles[i]
        return item_title
    else:
        print(ctx_id, "has no " + lang_id + " publications!")
        return {}
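# Usage sketch (hypothetical helper): the item-id -> title mapping is handy
# when titles need to be traced back to their records later on.
def _demo_titles_by_item():
    item_title = titles_from_ctx_in_language_by_item()
    for item_id in list(item_title)[:3]:
        print(item_id, ':', item_title[item_id])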
def routine():
    if not os.path.exists(LOG_DIR):
        os.makedirs(LOG_DIR)
    print("console output is redirected to count_persons.log ...")
    stdout = sys.stdout
    log = open(os.path.join(LOG_DIR, "count_persons.log"), "w+")
    sys.stdout = log
    from ..utils.local import ld
    PERS_STATS = os.path.join(STATS_DIR, 'persons')
    if not os.path.exists(PERS_STATS):
        os.makedirs(PERS_STATS)
    ous_ctx = utils.read_json(os.path.join(EXTDATA_DIR, 'selected.json'))
    mpis = utils.read_json(os.path.join(MAPPED_DIR, 'ous_mpi.json'))
    print("start processing!")
    start_time = time.time()
    for mpi in mpis:
        if mpi not in ous_ctx:
            print(mpis[mpi] + " has no contexts!")
            print("")
            continue
        print("processing " + mpis[mpi] + "...")
        stats = {}
        mpi_ctxs = ous_ctx[mpi]
        for mpi_ctx in mpi_ctxs:
            print("extracting " + mpi_ctx + " ...")
            all_data = ld.get_data(mpi_ctx)[0]
            # consider only released items
            data_set = DataSet(data_id=all_data.idx + "_released",
                               raw=all_data.get_items_released())
            if not data_set.records:
                print(mpi_ctx + " has no records!")
                continue
            authors = data_set.get_creators_data()  # only CoNE related authors!
            a = sorted(authors.keys())
            print(str(len(a)) + " CoNE persons to process ...")
            records = 0
            for i in a:
                if i in stats:
                    stats[i] += len(authors[i])
                else:
                    stats[i] = len(authors[i])
                records += len(authors[i])
            print("... with " + str(records) + " attributed records!")
        if not stats:
            continue
        stats = sorted(stats.items(), key=lambda x: x[1], reverse=True)
        idx, num_pub = zip(*stats)
        total = len(idx)
        path = os.path.join(PERS_STATS, mpi + '_pers_pub.csv')
        print("write stats to file: " + path)
        with open(path, 'w', newline='') as csv_file:
            csv_writer = csv.writer(csv_file, delimiter='\t',
                                    quotechar='', quoting=csv.QUOTE_NONE)
            csv_writer.writerow(['authors', 'publications'])
            for i in range(total):
                csv_writer.writerow([idx[i], num_pub[i]])
        print("finished " + mpis[mpi] + "!")
        print("")
    print("finished processing after %s sec!" %
          round(time.time() - start_time, 2))
    log.close()
    sys.stdout = stdout
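# Minimal entry point (a sketch; the package may instead wire routine() into
# its own CLI): running the module directly produces the per-person CSVs.
if __name__ == '__main__':
    routine()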
print("start processing data!") start_time = time.time() for path in data_paths: idx = path.split("/")[-1].replace(".json", "") print("") print("processing", idx, "...") all = ld.get_data(idx)[0] # consider only released items data_set = DataSet(data_id=all.idx + "_released", raw=all.get_items_released()) print(data_set.num, "records to process...") # loop over every record for record in data_set.records: # ///////////////////// # # /// PUBLICATIONS /// # # /////////////////// # item_id = extract.idx_from_item(record) item_year = extract.date_from_item(record) item_title = clean_title(extract.title_from_item(record)) item_genre = extract.genre_from_item(record) item_lang = ";".join(extract.languages_from_items(record))
def routine():
    if not os.path.exists(LOG_DIR):
        os.makedirs(LOG_DIR)
    print("console output is redirected to count_journals.log ...")
    stdout = sys.stdout
    log = open(os.path.join(LOG_DIR, "count_journals.log"), "w+")
    sys.stdout = log
    from ..utils.local import ld
    JOUR_STATS = os.path.join(STATS_DIR, 'journals')
    if not os.path.exists(JOUR_STATS):
        os.makedirs(JOUR_STATS)
    ous_ctx = utils.read_json(os.path.join(EXTDATA_DIR, 'selected.json'))
    mpis = utils.read_json(os.path.join(MAPPED_DIR, 'ous_mpi.json'))
    print("start processing!")
    start_time = time.time()
    for mpi in mpis:
        if mpi not in ous_ctx:
            print(mpis[mpi] + " has no contexts!")
            print("")
            continue
        print("processing " + mpis[mpi] + "...")
        articles = []
        journals = {}
        counter = 0
        nojour = 0
        mpi_ctxs = ous_ctx[mpi]
        for mpi_ctx in mpi_ctxs:
            print("extracting " + mpi_ctx + " ...")
            all_data = ld.get_data(mpi_ctx)[0]
            # consider only released items
            data_set = DataSet(data_id=all_data.idx + "_released",
                               raw=all_data.get_items_released())
            if not data_set.records:
                print(mpi_ctx + " has no records!")
                continue
            print(str(data_set.num) + " records to process...")
            for record in data_set.records:
                data = record['data']
                if data['publicState'] == 'RELEASED':
                    if data['metadata']['genre'] == 'ARTICLE':
                        articles.append(record)
        for article in articles:
            jour = False
            if 'sources' in article['data']['metadata']:
                for source in article['data']['metadata']['sources']:
                    if source['genre'] == 'JOURNAL':
                        if 'title' in source:
                            jour = True
                            counter += 1
                            if source['title'] in journals:
                                journals[source['title']] += 1
                            else:
                                journals[source['title']] = 1
                        else:
                            print(article['data']['objectId'] +
                                  " has journal as source without title!")
                            continue
                    if jour:
                        break
                if not jour:
                    nojour += 1
            else:
                print("found article " + article['data']['objectId'] +
                      " without any source!")
        print('found ' + str(counter) + ' articles with journals as source')
        print('found ' + str(nojour) + ' articles without a journal as source')
        journals = sorted(journals.items(), key=lambda x: x[1], reverse=True)
        total = len(journals)
        path = os.path.join(JOUR_STATS, mpi + '_jour_art.csv')
        print("write stats to file: " + path)
        with open(path, 'w', newline='') as csv_file:
            csv_writer = csv.writer(csv_file, delimiter='\t',
                                    quotechar='"', quoting=csv.QUOTE_MINIMAL)
            csv_writer.writerow(['journals', 'articles'])
            for i in range(total):
                jour, art = journals[i]
                # normalise journal names for the tab-separated output
                jour = jour.replace('\t', ' ')
                jour = jour.replace(',', '')
                jour = utils.clean_string(jour)
                csv_writer.writerow([jour, art])
        print("finished " + mpis[mpi] + "!")
        print("")
    print("finished processing after %s sec!" %
          round(time.time() - start_time, 2))
    log.close()
    sys.stdout = stdout
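# Aside (a sketch on made-up records): the manual dict increments above are
# equivalent to collections.Counter, which also sorts via most_common().
from collections import Counter

_sources = [{'genre': 'JOURNAL', 'title': 'Nature'},
            {'genre': 'JOURNAL', 'title': 'Nature'},
            {'genre': 'BOOK', 'title': 'Some Book'}]
_counts = Counter(s['title'] for s in _sources
                  if s['genre'] == 'JOURNAL' and 'title' in s)
print(_counts.most_common())  # [('Nature', 2)]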