def _process_springer_catalogue(max_lookups=None):
    global COVERAGE_CACHE, LOOKUPS_PERFORMED
    current_year = datetime.datetime.now().year
    years = [str(year) for year in range(2015, current_year + 1)]
    for year in years:
        # Perform a simple check before wasting any time on processing
        catalogue_file = os.path.join(SPRINGER_JOURNAL_LISTS_DIR, year + ".csv")
        if not os.path.isfile(catalogue_file):
            raise IOError("Catalogue file " + catalogue_file + " not found!")
    for year in years:
        msg = "Looking up coverage stats for Open Choice journals in " + year
        print(colorise("--- " + msg + " ---", "green"))
        catalogue_file = os.path.join(SPRINGER_JOURNAL_LISTS_DIR, year + ".csv")
        reader = csv.DictReader(open(catalogue_file, "r"))
        for line in reader:
            if max_lookups is not None and LOOKUPS_PERFORMED >= max_lookups:
                return
            title = line["Title"]
            oa_option = line["Open Access Option"]
            if oa_option != "Hybrid (Open Choice)":
                msg = u'Journal "{}" is not an Open Choice journal (oa_option={}), skipping...'
                print(colorise(msg.format(title, oa_option), "yellow"))
                continue
            journal_id = line["product_id"]
            already_cached = True
            try:
                _ = COVERAGE_CACHE[journal_id]['years'][year]["num_journal_total_articles"]
                _ = COVERAGE_CACHE[journal_id]['years'][year]["num_journal_oa_articles"]
            except KeyError:
                try:
                    _update_journal_stats(title, journal_id, year)
                except ValueError:
                    error_msg = ('Journal "{}" ({}): ValueError while obtaining journal ' +
                                 'stats, annual stats not added to cache.')
                    error_msg = colorise(error_msg.format(title, journal_id), "red")
                    print(error_msg)
                    ERROR_MSGS.append(error_msg)
                    continue
                LOOKUPS_PERFORMED += 1
                already_cached = False
            if already_cached:
                msg = 'Stats for journal "{}" in {} already cached.'
                print(colorise(msg.format(title, year), "yellow"))
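# Illustrative sketch of the expected input: the yearly catalogue files read above are
# SpringerLink journal list CSVs providing at least the columns referenced in the code
# (Title, Open Access Option, product_id). The sample row below is made up:
#
#     Title,Open Access Option,product_id
#     Example Journal,Hybrid (Open Choice),12345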
def _get_springer_journal_stats(journal_id, period, oa=False):
    if not journal_id.isdigit():
        raise ValueError("Invalid journal id " + journal_id + " (not a number)")
    url = SPRINGER_FULL_SEARCH.format(journal_id, period, period)
    if oa:
        url = SPRINGER_OA_SEARCH.format(journal_id, period, period)
    print(url)
    try:
        req = Request(url, None)
        response = urlopen(req)
        content = response.read()
        content = content.decode("utf-8")
        results = {}
    except HTTPError as httpe:
        if httpe.code == 503:
            # retry on timeout
            print(colorise("Timeout (HTTP 503), retrying...", "yellow"))
            return _get_springer_journal_stats(journal_id, period, oa)
        else:
            raise httpe
    count_match = SEARCH_RESULTS_COUNT_RE.search(content)
    if count_match:
        count = count_match.groupdict()['count']
        count = count.replace(",", "")
        results['count'] = int(count)
    else:
        raise ValueError("Regex could not detect a results count at " + url)
    title_match = SEARCH_RESULTS_TITLE_RE.search(content)
    if title_match:
        title = title_match.groupdict()['title']
        htmlparser = HTMLParser()
        results['title'] = htmlparser.unescape(title)
    else:
        raise ValueError("Regex could not detect a journal title at " + url)
    return results
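# Minimal, self-contained sketch of the kind of patterns the module-level constants
# SEARCH_RESULTS_COUNT_RE and SEARCH_RESULTS_TITLE_RE are assumed to be: named groups
# 'count' and 'title' matched against the SpringerLink search result page. The patterns
# and the sample HTML below are illustrative assumptions, not the original definitions.
import re

EXAMPLE_COUNT_RE = re.compile(r'(?P<count>[\d,]+)\s+Result')
EXAMPLE_TITLE_RE = re.compile(r'<h1[^>]*>(?P<title>[^<]+)</h1>')

_sample = '<h1 id="title">Journal of Examples</h1> <p>1,234 Results</p>'
_count = int(EXAMPLE_COUNT_RE.search(_sample).group("count").replace(",", ""))  # 1234
_title = EXAMPLE_TITLE_RE.search(_sample).group("title")  # 'Journal of Examples'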
def _update_journal_stats(title, journal_id, year, verbose=True):
    global COVERAGE_CACHE
    total = _get_springer_journal_stats(journal_id, year, oa=False)
    oa = _get_springer_journal_stats(journal_id, year, oa=True)
    if verbose:
        msg = 'Obtained stats for journal "{}" in {}: {} OA, {} Total'
        print(colorise(msg.format(title, year, oa["count"], total["count"]), "green"))
    if journal_id not in COVERAGE_CACHE:
        COVERAGE_CACHE[journal_id] = {'title': title, 'years': {}}
    if year not in COVERAGE_CACHE[journal_id]['years']:
        COVERAGE_CACHE[journal_id]['years'][year] = {}
    COVERAGE_CACHE[journal_id]['years'][year]["num_journal_total_articles"] = total["count"]
    COVERAGE_CACHE[journal_id]['years'][year]["num_journal_oa_articles"] = oa["count"]
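# A minimal sketch (with a made-up id and numbers) of the cache entry layout produced by
# _update_journal_stats() and later serialised to COVERAGE_CACHE_FILE by _shutdown():
EXAMPLE_COVERAGE_CACHE_ENTRY = {
    "12345": {                      # Springer journal id (product_id)
        "title": "Example Journal",
        "years": {
            "2017": {
                "num_journal_total_articles": 280,
                "num_journal_oa_articles": 12
            }
        }
    }
}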
def _shutdown():
    """
    Write cache content back to disk before terminating and display collected
    error messages.
    """
    print("Updating cache files..")
    with open(COVERAGE_CACHE_FILE, "w") as f:
        f.write(json.dumps(COVERAGE_CACHE, sort_keys=True, indent=4,
                           separators=(',', ': ')))
        f.flush()
    with open(PUBDATES_CACHE_FILE, "w") as f:
        f.write(json.dumps(PERSISTENT_PUBDATES_CACHE, sort_keys=True, indent=4,
                           separators=(',', ': ')))
        f.flush()
    with open(JOURNAL_ID_CACHE_FILE, "w") as f:
        f.write(json.dumps(JOURNAL_ID_CACHE, sort_keys=True, indent=4,
                           separators=(',', ': ')))
        f.flush()
    print("Done.")
    num_articles = 0
    for _, dois in PERSISTENT_PUBDATES_CACHE.items():
        num_articles += len(dois)
    print("The article cache now contains publication dates for {} DOIs".format(num_articles))
    if ERROR_MSGS:
        print(colorise("There were errors during the lookup process:", "yellow"))
        for msg in ERROR_MSGS:
            print(msg)
    sys.exit()
def create_cubes_tables(connectable, apc_file_name, transformative_agreements_file_name,
                        schema="openapc_schema"):
    apc_fields = [
        ("institution", "string"),
        ("period", "string"),
        ("euro", "float"),
        ("doi", "string"),
        ("is_hybrid", "string"),
        ("publisher", "string"),
        ("journal_full_title", "string"),
        ("issn", "string"),
        ("issn_print", "string"),
        ("issn_electronic", "string"),
        ("issn_l", "string"),
        ("license_ref", "string"),
        ("indexed_in_crossref", "string"),
        ("pmid", "string"),
        ("pmcid", "string"),
        ("ut", "string"),
        ("url", "string"),
        ("doaj", "string"),
        ("country", "string")
    ]
    transformative_agreements_fields = [
        ("institution", "string"),
        ("period", "string"),
        ("doi", "string"),
        ("is_hybrid", "string"),
        ("publisher", "string"),
        ("journal_full_title", "string"),
        ("issn", "string"),
        ("issn_print", "string"),
        ("issn_electronic", "string"),
        ("issn_l", "string"),
        ("license_ref", "string"),
        ("indexed_in_crossref", "string"),
        ("pmid", "string"),
        ("pmcid", "string"),
        ("ut", "string"),
        ("url", "string"),
        ("doaj", "string"),
        ("country", "string"),
        ("agreement", "string")
    ]
    springer_compact_coverage_fields = [
        ("period", "string"),
        ("publisher", "string"),
        ("journal_full_title", "string"),
        ("is_hybrid", "string"),
        ("num_springer_compact_articles", "float"),
        ("num_journal_total_articles", "float"),
        ("num_journal_oa_articles", "float")
    ]

    metadata = sqlalchemy.MetaData(bind=connectable)

    openapc_table = sqlalchemy.Table("openapc", metadata, autoload=False, schema=schema)
    if openapc_table.exists():
        openapc_table.drop(checkfirst=False)
    init_table(openapc_table, apc_fields)
    openapc_insert_command = openapc_table.insert()

    transformative_agreements_table = sqlalchemy.Table("transformative_agreements", metadata,
                                                       autoload=False, schema=schema)
    if transformative_agreements_table.exists():
        transformative_agreements_table.drop(checkfirst=False)
    init_table(transformative_agreements_table, transformative_agreements_fields)
    transformative_agreements_insert_command = transformative_agreements_table.insert()

    combined_table = sqlalchemy.Table("combined", metadata, autoload=False, schema=schema)
    if combined_table.exists():
        combined_table.drop(checkfirst=False)
    init_table(combined_table, apc_fields)
    combined_insert_command = combined_table.insert()

    springer_compact_coverage_table = sqlalchemy.Table("springer_compact_coverage", metadata,
                                                       autoload=False, schema=schema)
    if springer_compact_coverage_table.exists():
        springer_compact_coverage_table.drop(checkfirst=False)
    init_table(springer_compact_coverage_table, springer_compact_coverage_fields)
    springer_compact_coverage_insert_command = springer_compact_coverage_table.insert()

    # a dict to store individual insert commands for every table
    tables_insert_commands = {
        "openapc": openapc_insert_command,
        "transformative_agreements": transformative_agreements_insert_command,
        "combined": combined_insert_command,
        "springer_compact_coverage": springer_compact_coverage_insert_command
    }

    transformative_agreements_institution_countries = {}
    reader = csv.DictReader(open("static/institutions_transformative_agreements.csv", "r"))
    for row in reader:
        institution_name = row["institution"]
        country = row["country"]
        transformative_agreements_institution_countries[institution_name] = country

    journal_coverage = None
    article_pubyears = None
    try:
        cache_file = open(scc.COVERAGE_CACHE_FILE, "r")
        journal_coverage = json.loads(cache_file.read())
        cache_file.close()
        cache_file = open(scc.PUBDATES_CACHE_FILE, "r")
        article_pubyears = json.loads(cache_file.read())
        cache_file.close()
    except IOError as ioe:
        msg = "Error while trying to access cache file: {}"
        print(msg.format(ioe))
        sys.exit()
    except ValueError as ve:
        msg = "Error while trying to decode cache structure in: {}"
        print(msg.format(str(ve)))
        sys.exit()

    summarised_transformative_agreements = {}
    journal_id_title_map = {}
    reader = csv.DictReader(open(transformative_agreements_file_name, "r"))
    institution_key_errors = []
    for row in reader:
        institution = row["institution"]
        publisher = row["publisher"]
        issn = row["issn"]
        doi = row["doi"]
        # colons cannot be escaped in URL queries to the cubes server, so we have
        # to remove them here
        row["journal_full_title"] = row["journal_full_title"].replace(":", "")
        title = row["journal_full_title"]
        try:
            row["country"] = transformative_agreements_institution_countries[institution]
        except KeyError:
            if institution not in institution_key_errors:
                institution_key_errors.append(institution)
        tables_insert_commands["transformative_agreements"].execute(row)
        if row["euro"] != "NA":
            tables_insert_commands["combined"].execute(row)
        if publisher != "Springer Nature":
            continue
        journal_id = scc._get_springer_journal_id_from_doi(doi, issn)
        journal_id_title_map[journal_id] = title
        try:
            pub_year = article_pubyears[journal_id][doi]
        except KeyError:
            msg = (u"Publication year entry not found in article cache for {}. " +
                   "You might have to update the article cache with 'python " +
                   "assets_generator.py coverage_stats'. Using the 'period' " +
                   "column for now.")
            print(colorise(msg.format(doi), "yellow"))
            pub_year = row["period"]
        if journal_id not in summarised_transformative_agreements:
            summarised_transformative_agreements[journal_id] = {}
        if pub_year not in summarised_transformative_agreements[journal_id]:
            summarised_transformative_agreements[journal_id][pub_year] = 1
        else:
            summarised_transformative_agreements[journal_id][pub_year] += 1

    if institution_key_errors:
        print("KeyError: The following institutions were not found in the " +
              "institutions_transformative_agreements file:")
        for institution in institution_key_errors:
            print(institution)
        sys.exit()

    for journal_id, info in journal_coverage.items():
        for year, stats in info["years"].items():
            row = {
                "publisher": "Springer Nature",
                "journal_full_title": info["title"],
                "period": year,
                "is_hybrid": "TRUE",
                "num_journal_total_articles": stats["num_journal_total_articles"],
                "num_journal_oa_articles": stats["num_journal_oa_articles"]
            }
            try:
                row["num_springer_compact_articles"] = (
                    summarised_transformative_agreements[journal_id][year])
            except KeyError:
                row["num_springer_compact_articles"] = 0
            tables_insert_commands["springer_compact_coverage"].execute(row)

    institution_countries = {}
    reader = csv.DictReader(open("static/institutions.csv", "r"))
    for row in reader:
        cubes_name = row["institution_cubes_name"]
        institution_name = row["institution"]
        country = row["country"]
        institution_countries[institution_name] = country
        if institution_name not in tables_insert_commands:
            table = sqlalchemy.Table(cubes_name, metadata, autoload=False, schema=schema)
            if table.exists():
                table.drop(checkfirst=False)
            init_table(table, apc_fields)
            insert_command = table.insert()
            tables_insert_commands[institution_name] = insert_command

    reader = csv.DictReader(open(apc_file_name, "r"))
    for row in reader:
        institution = row["institution"]
        # colons cannot be escaped in URL queries to the cubes server, so we have
        # to remove them here
        row["journal_full_title"] = row["journal_full_title"].replace(":", "")
        row["country"] = institution_countries[institution]
        tables_insert_commands[institution].execute(row)
        tables_insert_commands["openapc"].execute(row)
        tables_insert_commands["combined"].execute(row)
def create_cubes_tables(connectable, apc_file_name, offsetting_file_name,
                        schema="openapc_schema"):
    apc_fields = [
        ("institution", "string"),
        ("period", "string"),
        ("euro", "float"),
        ("doi", "string"),
        ("is_hybrid", "string"),
        ("publisher", "string"),
        ("journal_full_title", "string"),
        ("issn", "string"),
        ("issn_print", "string"),
        ("issn_electronic", "string"),
        ("issn_l", "string"),
        ("license_ref", "string"),
        ("indexed_in_crossref", "string"),
        ("pmid", "string"),
        ("pmcid", "string"),
        ("ut", "string"),
        ("url", "string"),
        ("doaj", "string"),
        ("country", "string")
    ]
    offsetting_fields = [
        ("institution", "string"),
        ("period", "string"),
        ("doi", "string"),
        ("is_hybrid", "string"),
        ("publisher", "string"),
        ("journal_full_title", "string"),
        ("issn", "string"),
        ("issn_print", "string"),
        ("issn_electronic", "string"),
        ("issn_l", "string"),
        ("license_ref", "string"),
        ("indexed_in_crossref", "string"),
        ("pmid", "string"),
        ("pmcid", "string"),
        ("ut", "string"),
        ("url", "string"),
        ("doaj", "string"),
        ("country", "string")
    ]
    offsetting_coverage_fields = [
        ("period", "string"),
        ("publisher", "string"),
        ("journal_full_title", "string"),
        ("is_hybrid", "string"),
        ("num_offsetting_articles", "float"),
        ("num_journal_total_articles", "float"),
        ("num_journal_oa_articles", "float")
    ]

    metadata = sqlalchemy.MetaData(bind=connectable)

    openapc_table = sqlalchemy.Table("openapc", metadata, autoload=False, schema=schema)
    if openapc_table.exists():
        openapc_table.drop(checkfirst=False)
    init_table(openapc_table, apc_fields)
    openapc_insert_command = openapc_table.insert()

    offsetting_table = sqlalchemy.Table("offsetting", metadata, autoload=False, schema=schema)
    if offsetting_table.exists():
        offsetting_table.drop(checkfirst=False)
    init_table(offsetting_table, offsetting_fields)
    offsetting_insert_command = offsetting_table.insert()

    combined_table = sqlalchemy.Table("combined", metadata, autoload=False, schema=schema)
    if combined_table.exists():
        combined_table.drop(checkfirst=False)
    init_table(combined_table, apc_fields)
    combined_insert_command = combined_table.insert()

    offsetting_coverage_table = sqlalchemy.Table("offsetting_coverage", metadata,
                                                 autoload=False, schema=schema)
    if offsetting_coverage_table.exists():
        offsetting_coverage_table.drop(checkfirst=False)
    init_table(offsetting_coverage_table, offsetting_coverage_fields)
    offsetting_coverage_insert_command = offsetting_coverage_table.insert()

    # a dict to store individual insert commands for every table
    tables_insert_commands = {
        "openapc": openapc_insert_command,
        "offsetting": offsetting_insert_command,
        "combined": combined_insert_command,
        "offsetting_coverage": offsetting_coverage_insert_command
    }

    offsetting_institution_countries = {}
    reader = csv.DictReader(open("static/institutions_offsetting.csv", "r"))
    for row in reader:
        institution_name = row["institution"]
        country = row["country"]
        offsetting_institution_countries[institution_name] = country

    journal_coverage = None
    article_pubyears = None
    try:
        cache_file = open(oc.COVERAGE_CACHE_FILE, "r")
        journal_coverage = json.loads(cache_file.read())
        cache_file.close()
        cache_file = open(oc.PUBDATES_CACHE_FILE, "r")
        article_pubyears = json.loads(cache_file.read())
        cache_file.close()
    except IOError as ioe:
        msg = "Error while trying to access cache file: {}"
        print(msg.format(ioe))
        sys.exit()
    except ValueError as ve:
        msg = "Error while trying to decode cache structure in: {}"
        print(msg.format(str(ve)))
        sys.exit()

    summarised_offsetting = {}
    journal_id_title_map = {}
    reader = csv.DictReader(open(offsetting_file_name, "r"))
    institution_key_errors = []
    for row in reader:
        institution = row["institution"]
        publisher = row["publisher"]
        issn = row["issn"]
        doi = row["doi"]
        # colons cannot be escaped in URL queries to the cubes server, so we have
        # to remove them here
        row["journal_full_title"] = row["journal_full_title"].replace(":", "")
        title = row["journal_full_title"]
        try:
            row["country"] = offsetting_institution_countries[institution]
        except KeyError:
            if institution not in institution_key_errors:
                institution_key_errors.append(institution)
        tables_insert_commands["offsetting"].execute(row)
        if row["euro"] != "NA":
            tables_insert_commands["combined"].execute(row)
        if publisher != "Springer Nature":
            continue
        journal_id = oc._get_springer_journal_id_from_doi(doi, issn)
        journal_id_title_map[journal_id] = title
        try:
            pub_year = article_pubyears[journal_id][doi]
        except KeyError:
            msg = (u"Publication year entry not found in article cache for {}. " +
                   "You might have to update the article cache with 'python " +
                   "assets_generator.py coverage_stats'. Using the 'period' " +
                   "column for now.")
            print(colorise(msg.format(doi), "yellow"))
            pub_year = row["period"]
        if journal_id not in summarised_offsetting:
            summarised_offsetting[journal_id] = {}
        if pub_year not in summarised_offsetting[journal_id]:
            summarised_offsetting[journal_id][pub_year] = 1
        else:
            summarised_offsetting[journal_id][pub_year] += 1

    if institution_key_errors:
        print("KeyError: The following institutions were not found in the " +
              "institutions_offsetting file:")
        for institution in institution_key_errors:
            print(institution)
        sys.exit()

    for journal_id, info in journal_coverage.items():
        for year, stats in info["years"].items():
            row = {
                "publisher": "Springer Nature",
                "journal_full_title": info["title"],
                "period": year,
                "is_hybrid": "TRUE",
                "num_journal_total_articles": stats["num_journal_total_articles"],
                "num_journal_oa_articles": stats["num_journal_oa_articles"]
            }
            try:
                row["num_offsetting_articles"] = summarised_offsetting[journal_id][year]
            except KeyError:
                row["num_offsetting_articles"] = 0
            tables_insert_commands["offsetting_coverage"].execute(row)

    institution_countries = {}
    reader = csv.DictReader(open("static/institutions.csv", "r"))
    for row in reader:
        cubes_name = row["institution_cubes_name"]
        institution_name = row["institution"]
        country = row["country"]
        institution_countries[institution_name] = country
        if institution_name not in tables_insert_commands:
            table = sqlalchemy.Table(cubes_name, metadata, autoload=False, schema=schema)
            if table.exists():
                table.drop(checkfirst=False)
            init_table(table, apc_fields)
            insert_command = table.insert()
            tables_insert_commands[institution_name] = insert_command

    reader = csv.DictReader(open(apc_file_name, "r"))
    for row in reader:
        institution = row["institution"]
        # colons cannot be escaped in URL queries to the cubes server, so we have
        # to remove them here
        row["journal_full_title"] = row["journal_full_title"].replace(":", "")
        row["country"] = institution_countries[institution]
        tables_insert_commands[institution].execute(row)
        tables_insert_commands["openapc"].execute(row)
        tables_insert_commands["combined"].execute(row)
def update_coverage_stats(transformative_agreements_file, max_lookups, refetch=True):
    global COVERAGE_CACHE, JOURNAL_ID_CACHE, PERSISTENT_PUBDATES_CACHE, LOOKUPS_PERFORMED
    LOOKUPS_PERFORMED = 0
    if os.path.isfile(COVERAGE_CACHE_FILE):
        with open(COVERAGE_CACHE_FILE, "r") as f:
            try:
                COVERAGE_CACHE = json.loads(f.read())
                print("Coverage cache file successfully loaded.")
            except ValueError:
                print("Could not decode a cache structure from " + COVERAGE_CACHE_FILE +
                      ", starting with an empty coverage cache.")
    else:
        print("No cache file (" + COVERAGE_CACHE_FILE + ") found, starting with an " +
              "empty coverage cache.")
    if os.path.isfile(PUBDATES_CACHE_FILE):
        with open(PUBDATES_CACHE_FILE, "r") as f:
            try:
                PERSISTENT_PUBDATES_CACHE = json.loads(f.read())
                print("Pub dates cache file successfully loaded.")
            except ValueError:
                print("Could not decode a cache structure from " + PUBDATES_CACHE_FILE +
                      ", starting with an empty pub date cache.")
    else:
        print("No cache file (" + PUBDATES_CACHE_FILE + ") found, starting with an " +
              "empty pub date cache.")
    if not os.path.isdir(JOURNAL_CSV_DIR):
        raise IOError("Journal CSV directory " + JOURNAL_CSV_DIR + " not found!")
    _process_springer_catalogue(max_lookups)
    reader = csv.DictReader(open(transformative_agreements_file, "r"))
    for line in reader:
        if max_lookups is not None and LOOKUPS_PERFORMED >= max_lookups:
            print("Maximum number of lookups performed.")
            _shutdown()
        lookup_performed = False
        found = True
        publisher = line["publisher"]
        if publisher != "Springer Nature":
            continue
        issn = line["issn"]
        period = line["period"]
        title = line["journal_full_title"]
        doi = line["doi"]
        journal_id = _get_springer_journal_id_from_doi(doi, issn)
        # Retrieve publication dates for articles from CSV summaries on SpringerLink.
        # Employ a multi-level cache structure to minimize IO:
        # 1. Try to look up the DOI in the persistent publication dates cache.
        # 2. If the journal is not present, repopulate the local cache segment from a
        #    CSV file in the journal CSV dir.
        # 3a. If no CSV for the journal could be found, fetch it from SpringerLink.
        # 3b. Alternative to 3a: if a CSV was found but it does not contain the DOI,
        #     re-fetch it from SpringerLink.
        try:
            _ = PERSISTENT_PUBDATES_CACHE[journal_id][doi]
            print("Journal {} ('{}'): DOI {} already cached.".format(journal_id, title, doi))
        except KeyError:
            if journal_id not in TEMP_JOURNAL_CACHE:
                msg = "Journal {} ('{}'): Not found in temp cache, repopulating..."
                print(msg.format(journal_id, title))
                TEMP_JOURNAL_CACHE[journal_id] = _get_journal_cache_from_csv(journal_id,
                                                                             refetch=False)
            if doi not in TEMP_JOURNAL_CACHE[journal_id]:
                if refetch:
                    msg = u"Journal {} ('{}'): DOI {} not found in cache, re-fetching csv file..."
                    print(msg.format(journal_id, title, doi))
                    TEMP_JOURNAL_CACHE[journal_id] = _get_journal_cache_from_csv(journal_id,
                                                                                 refetch=True)
                if doi not in TEMP_JOURNAL_CACHE[journal_id]:
                    msg = u"Journal {} ('{}'): DOI {} NOT FOUND in SpringerLink data!"
                    msg = colorise(msg.format(journal_id, title, doi), "red")
                    print(msg)
                    ERROR_MSGS.append(msg)
                    found = False
            lookup_performed = True
            if journal_id not in PERSISTENT_PUBDATES_CACHE:
                PERSISTENT_PUBDATES_CACHE[journal_id] = {}
            if found:
                PERSISTENT_PUBDATES_CACHE[journal_id][doi] = TEMP_JOURNAL_CACHE[journal_id][doi]
                pub_year = PERSISTENT_PUBDATES_CACHE[journal_id][doi]
                compare_msg = u"DOI {} found in Springer data, Pub year is {} ".format(doi,
                                                                                       pub_year)
                if pub_year == period:
                    compare_msg += colorise("(same as transformative_agreements period)",
                                            "green")
                else:
                    compare_msg += colorise("(DIFFERENT from transformative_agreements " +
                                            "period, which is {})".format(period), "yellow")
                msg = u"Journal {} ('{}'): ".format(journal_id, title)
                print(msg.ljust(80) + compare_msg)
        if found:
            pub_year = PERSISTENT_PUBDATES_CACHE[journal_id][doi]
        else:
            # If a lookup error occurred we will retrieve coverage stats for the period
            # year instead, since the aggregation process will make use of this value.
            pub_year = period
        # Test if journal stats are present
        try:
            _ = COVERAGE_CACHE[journal_id]['years'][pub_year]["num_journal_total_articles"]
            _ = COVERAGE_CACHE[journal_id]['years'][pub_year]["num_journal_oa_articles"]
        except KeyError:
            try:
                _update_journal_stats(title, journal_id, pub_year)
                lookup_performed = True
                error_msg = ('No stats found for journal "{}" ({}) in {} albeit having ' +
                             'downloaded the full Open Choice catalogue. Stats were ' +
                             'obtained retroactively.')
                error_msg = colorise(error_msg.format(title, journal_id, pub_year), "red")
                print(error_msg)
                ERROR_MSGS.append(error_msg)
            except ValueError as ve:
                error_msg = ('Critical Error while processing DOI {}: No stats found ' +
                             'for journal "{}" ({}) in {} albeit having downloaded the ' +
                             'full Open Choice catalogue and stats could not be obtained ' +
                             'retroactively (ValueError: {}).')
                error_msg = colorise(error_msg.format(doi, title, journal_id, pub_year,
                                                      str(ve)), "red")
                print(error_msg)
                ERROR_MSGS.append(error_msg)
                _shutdown()
        if lookup_performed:
            LOOKUPS_PERFORMED += 1
    _shutdown()
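# Sketch (made-up values) of the persistent pub dates cache consulted above, as
# serialised to PUBDATES_CACHE_FILE: Springer journal id -> DOI -> publication year.
EXAMPLE_PUBDATES_CACHE_ENTRY = {
    "12345": {
        "10.1007/s00000-017-0000-1": "2017",
        "10.1007/s00000-018-0000-2": "2018"
    }
}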
def create_cubes_tables(connectable, apc_file_name, transformative_agreements_file_name,
                        schema="openapc_schema"):
    apc_fields = [
        ("institution", "string"),
        ("period", "string"),
        ("euro", "float"),
        ("doi", "string"),
        ("is_hybrid", "string"),
        ("publisher", "string"),
        ("journal_full_title", "string"),
        ("issn", "string"),
        ("issn_print", "string"),
        ("issn_electronic", "string"),
        ("issn_l", "string"),
        ("license_ref", "string"),
        ("indexed_in_crossref", "string"),
        ("pmid", "string"),
        ("pmcid", "string"),
        ("ut", "string"),
        ("url", "string"),
        ("doaj", "string"),
        ("country", "string")
    ]
    deal_fields = apc_fields + [("opt_out", "string")]
    transformative_agreements_fields = [
        ("institution", "string"),
        ("period", "string"),
        ("doi", "string"),
        ("is_hybrid", "string"),
        ("publisher", "string"),
        ("journal_full_title", "string"),
        ("issn", "string"),
        ("issn_print", "string"),
        ("issn_electronic", "string"),
        ("issn_l", "string"),
        ("license_ref", "string"),
        ("indexed_in_crossref", "string"),
        ("pmid", "string"),
        ("pmcid", "string"),
        ("ut", "string"),
        ("url", "string"),
        ("doaj", "string"),
        ("country", "string"),
        ("agreement", "string")
    ]
    bpc_fields = [
        ("institution", "string"),
        ("period", "string"),
        ("euro", "float"),
        ("doi", "string"),
        ("backlist_oa", "string"),
        ("publisher", "string"),
        ("book_title", "string"),
        ("isbn", "string"),
        ("isbn_print", "string"),
        ("isbn_electronic", "string"),
        ("license_ref", "string"),
        ("indexed_in_crossref", "string"),
        ("doab", "string"),
        ("country", "string")
    ]
    springer_compact_coverage_fields = [
        ("period", "string"),
        ("publisher", "string"),
        ("journal_full_title", "string"),
        ("is_hybrid", "string"),
        ("num_springer_compact_articles", "float"),
        ("num_journal_total_articles", "float"),
        ("num_journal_oa_articles", "float")
    ]

    metadata = sqlalchemy.MetaData(bind=connectable)

    openapc_table = sqlalchemy.Table("openapc", metadata, autoload=False, schema=schema)
    if openapc_table.exists():
        openapc_table.drop(checkfirst=False)
    init_table(openapc_table, apc_fields)
    openapc_insert_command = openapc_table.insert()

    transformative_agreements_table = sqlalchemy.Table("transformative_agreements", metadata,
                                                       autoload=False, schema=schema)
    if transformative_agreements_table.exists():
        transformative_agreements_table.drop(checkfirst=False)
    init_table(transformative_agreements_table, transformative_agreements_fields)
    transformative_agreements_insert_command = transformative_agreements_table.insert()

    bpc_table = sqlalchemy.Table("bpc", metadata, autoload=False, schema=schema)
    if bpc_table.exists():
        bpc_table.drop(checkfirst=False)
    init_table(bpc_table, bpc_fields)
    bpc_insert_command = bpc_table.insert()

    combined_table = sqlalchemy.Table("combined", metadata, autoload=False, schema=schema)
    if combined_table.exists():
        combined_table.drop(checkfirst=False)
    init_table(combined_table, apc_fields)
    combined_insert_command = combined_table.insert()

    springer_compact_coverage_table = sqlalchemy.Table("springer_compact_coverage", metadata,
                                                       autoload=False, schema=schema)
    if springer_compact_coverage_table.exists():
        springer_compact_coverage_table.drop(checkfirst=False)
    init_table(springer_compact_coverage_table, springer_compact_coverage_fields)
    springer_compact_coverage_insert_command = springer_compact_coverage_table.insert()

    deal_table = sqlalchemy.Table("deal", metadata, autoload=False, schema=schema)
    if deal_table.exists():
        deal_table.drop(checkfirst=False)
    init_table(deal_table, deal_fields)
    deal_insert_command = deal_table.insert()

    # a dict to store individual insert commands for every table
    tables_insert_commands = {
        "openapc": openapc_insert_command,
        "transformative_agreements": transformative_agreements_insert_command,
        "bpc": bpc_insert_command,
        "combined": combined_insert_command,
        "springer_compact_coverage": springer_compact_coverage_insert_command,
        "deal": deal_insert_command
    }

    bpcs_institution_countries = {}
    reader = csv.DictReader(open(INSTITUTIONS_BPC_FILE, "r"))
    for row in reader:
        institution_name = row["institution"]
        country = row["country"]
        bpcs_institution_countries[institution_name] = country

    reader = csv.DictReader(open(BPC_FILE, "r"))
    for row in reader:
        row["book_title"] = row["book_title"].replace(":", "")
        institution = row["institution"]
        row["country"] = bpcs_institution_countries[institution]
        tables_insert_commands["bpc"].execute(row)

    transformative_agreements_institution_countries = {}
    reader = csv.DictReader(open(INSTITUTIONS_TRANSFORMATIVE_AGREEMENTS_FILE, "r"))
    for row in reader:
        institution_name = row["institution"]
        country = row["country"]
        transformative_agreements_institution_countries[institution_name] = country

    journal_coverage = None
    article_pubyears = None
    try:
        cache_file = open(scc.COVERAGE_CACHE_FILE, "r")
        journal_coverage = json.loads(cache_file.read())
        cache_file.close()
        cache_file = open(scc.PUBDATES_CACHE_FILE, "r")
        article_pubyears = json.loads(cache_file.read())
        cache_file.close()
    except IOError as ioe:
        msg = "Error while trying to access cache file: {}"
        print(msg.format(ioe))
        sys.exit()
    except ValueError as ve:
        msg = "Error while trying to decode cache structure in: {}"
        print(msg.format(str(ve)))
        sys.exit()

    summarised_transformative_agreements = {}
    journal_id_title_map = {}
    institution_key_errors = []

    reader = csv.DictReader(open(DEAL_WILEY_OPT_OUT_FILE, "r"))
    for row in reader:
        row_copy = deepcopy(row)  # work on a deep copy since we make some DEAL-specific changes
        row_copy["opt_out"] = "TRUE"
        if row_copy["publisher"] in DEAL_IMPRINTS["Wiley-Blackwell"]:
            row_copy["publisher"] = "Wiley-Blackwell"
        institution = row_copy["institution"]
        try:
            row_copy["country"] = transformative_agreements_institution_countries[institution]
        except KeyError:
            if institution not in institution_key_errors:
                institution_key_errors.append(institution)
        if row_copy["period"] == "2019":
            # Special rule: Half 2019 costs since DEAL only started in 07/19
            halved = round(float(row_copy["euro"]) / 2, 2)
            row_copy["euro"] = str(halved)
        tables_insert_commands["deal"].execute(row_copy)

    reader = csv.DictReader(open(DEAL_SPRINGER_OPT_OUT_FILE, "r"))
    for row in reader:
        row_copy = deepcopy(row)  # work on a deep copy since we make some DEAL-specific changes
        row_copy["opt_out"] = "TRUE"
        if row_copy["publisher"] in DEAL_IMPRINTS["Springer Nature"]:
            row_copy["publisher"] = "Springer Nature"
        institution = row_copy["institution"]
        try:
            row_copy["country"] = transformative_agreements_institution_countries[institution]
        except KeyError:
            if institution not in institution_key_errors:
                institution_key_errors.append(institution)
        tables_insert_commands["deal"].execute(row_copy)

    reader = csv.DictReader(open(transformative_agreements_file_name, "r"))
    for row in reader:
        institution = row["institution"]
        publisher = row["publisher"]
        issn = row["issn"]
        doi = row["doi"]
        # colons cannot be escaped in URL queries to the cubes server, so we have
        # to remove them here
        row["journal_full_title"] = row["journal_full_title"].replace(":", "")
        title = row["journal_full_title"]
        try:
            row["country"] = transformative_agreements_institution_countries[institution]
        except KeyError:
            if institution not in institution_key_errors:
                institution_key_errors.append(institution)
        tables_insert_commands["transformative_agreements"].execute(row)
        if row["euro"] != "NA":
            tables_insert_commands["combined"].execute(row)
        if row["agreement"] == "DEAL Wiley Germany":
            # DEAL Wiley
            row_copy = deepcopy(row)
            row_copy["opt_out"] = "FALSE"
            if row_copy["period"] == "2019":
                # Special rule: Half 2019 costs since DEAL only started in 07/19
                halved = round(float(row["euro"]) / 2, 2)
                row_copy["euro"] = str(halved)
            if row_copy["publisher"] in DEAL_IMPRINTS["Wiley-Blackwell"]:
                row_copy["publisher"] = "Wiley-Blackwell"
            tables_insert_commands["deal"].execute(row_copy)
        if row["agreement"] == "DEAL Springer Nature Germany":
            # DEAL SN
            row_copy = deepcopy(row)
            row_copy["opt_out"] = "FALSE"
            if row_copy["publisher"] in DEAL_IMPRINTS["Springer Nature"]:
                row_copy["publisher"] = "Springer Nature"
            tables_insert_commands["deal"].execute(row_copy)
        if publisher != "Springer Nature":
            continue
        journal_id = scc._get_springer_journal_id_from_doi(doi, issn)
        journal_id_title_map[journal_id] = title
        try:
            pub_year = article_pubyears[journal_id][doi]
        except KeyError:
            msg = (u"Publication year entry not found in article cache for {}. " +
                   "You might have to update the article cache with 'python " +
                   "assets_generator.py coverage_stats'. Using the 'period' " +
                   "column for now.")
            print(colorise(msg.format(doi), "yellow"))
            pub_year = row["period"]
        if journal_id not in summarised_transformative_agreements:
            summarised_transformative_agreements[journal_id] = {}
        if pub_year not in summarised_transformative_agreements[journal_id]:
            summarised_transformative_agreements[journal_id][pub_year] = 1
        else:
            summarised_transformative_agreements[journal_id][pub_year] += 1

    if institution_key_errors:
        print("KeyError: The following institutions were not found in the " +
              "institutions_transformative_agreements file:")
        for institution in institution_key_errors:
            print(institution)
        sys.exit()

    for journal_id, info in journal_coverage.items():
        for year, stats in info["years"].items():
            row = {
                "publisher": "Springer Nature",
                "journal_full_title": info["title"],
                "period": year,
                "is_hybrid": "TRUE",
                "num_journal_total_articles": stats["num_journal_total_articles"],
                "num_journal_oa_articles": stats["num_journal_oa_articles"]
            }
            try:
                row["num_springer_compact_articles"] = (
                    summarised_transformative_agreements[journal_id][year])
            except KeyError:
                row["num_springer_compact_articles"] = 0
            tables_insert_commands["springer_compact_coverage"].execute(row)

    institution_countries = {}
    reader = csv.DictReader(open(INSTITUTIONS_FILE, "r"))
    for row in reader:
        cubes_name = row["institution_cubes_name"]
        institution_name = row["institution"]
        country = row["country"]
        institution_countries[institution_name] = country
        if institution_name not in tables_insert_commands:
            table = sqlalchemy.Table(cubes_name, metadata, autoload=False, schema=schema)
            if table.exists():
                table.drop(checkfirst=False)
            init_table(table, apc_fields)
            insert_command = table.insert()
            tables_insert_commands[institution_name] = insert_command

    reader = csv.DictReader(open(apc_file_name, "r"))
    for row in reader:
        institution = row["institution"]
        # colons cannot be escaped in URL queries to the cubes server, so we have
        # to remove them here
        row["journal_full_title"] = row["journal_full_title"].replace(":", "")
        row["country"] = institution_countries[institution]
        tables_insert_commands[institution].execute(row)
        tables_insert_commands["openapc"].execute(row)
        tables_insert_commands["combined"].execute(row)
        # DEAL Wiley
        if (row["publisher"] in DEAL_IMPRINTS["Wiley-Blackwell"] and
                row["country"] == "DEU" and row["is_hybrid"] == "FALSE"):
            if row["period"] in ["2019", "2020", "2021", "2022"]:
                row["publisher"] = "Wiley-Blackwell"  # Imprint normalization
                tables_insert_commands["deal"].execute(row)
        # DEAL Springer
        if (row["publisher"] in DEAL_IMPRINTS["Springer Nature"] and
                row["country"] == "DEU" and row["is_hybrid"] == "FALSE"):
            if row["period"] in ["2020", "2021", "2022"]:
                row["publisher"] = "Springer Nature"
                tables_insert_commands["deal"].execute(row)
def update_coverage_stats(offsetting_file, max_lookups, refetch=True):
    global COVERAGE_CACHE, JOURNAL_ID_CACHE, PERSISTENT_PUBDATES_CACHE, LOOKUPS_PERFORMED
    LOOKUPS_PERFORMED = 0
    if os.path.isfile(COVERAGE_CACHE_FILE):
        with open(COVERAGE_CACHE_FILE, "r") as f:
            try:
                COVERAGE_CACHE = json.loads(f.read())
                print("Coverage cache file successfully loaded.")
            except ValueError:
                print("Could not decode a cache structure from " + COVERAGE_CACHE_FILE +
                      ", starting with an empty coverage cache.")
    else:
        print("No cache file (" + COVERAGE_CACHE_FILE + ") found, starting with an " +
              "empty coverage cache.")
    if os.path.isfile(PUBDATES_CACHE_FILE):
        with open(PUBDATES_CACHE_FILE, "r") as f:
            try:
                PERSISTENT_PUBDATES_CACHE = json.loads(f.read())
                print("Pub dates cache file successfully loaded.")
            except ValueError:
                print("Could not decode a cache structure from " + PUBDATES_CACHE_FILE +
                      ", starting with an empty pub date cache.")
    else:
        print("No cache file (" + PUBDATES_CACHE_FILE + ") found, starting with an " +
              "empty pub date cache.")
    if not os.path.isdir(JOURNAL_CSV_DIR):
        raise IOError("Journal CSV directory " + JOURNAL_CSV_DIR + " not found!")
    _process_springer_catalogue(max_lookups)
    reader = csv.DictReader(open(offsetting_file, "r"))
    for line in reader:
        if max_lookups is not None and LOOKUPS_PERFORMED >= max_lookups:
            print("Maximum number of lookups performed.")
            _shutdown()
        lookup_performed = False
        found = True
        publisher = line["publisher"]
        if publisher != "Springer Nature":
            continue
        issn = line["issn"]
        period = line["period"]
        title = line["journal_full_title"]
        doi = line["doi"]
        journal_id = _get_springer_journal_id_from_doi(doi, issn)
        # Retrieve publication dates for articles from CSV summaries on SpringerLink.
        # Employ a multi-level cache structure to minimize IO:
        # 1. Try to look up the DOI in the persistent publication dates cache.
        # 2. If the journal is not present, repopulate the local cache segment from a
        #    CSV file in the journal CSV dir.
        # 3a. If no CSV for the journal could be found, fetch it from SpringerLink.
        # 3b. Alternative to 3a: if a CSV was found but it does not contain the DOI,
        #     re-fetch it from SpringerLink.
        try:
            _ = PERSISTENT_PUBDATES_CACHE[journal_id][doi]
            print("Journal {} ('{}'): DOI {} already cached.".format(journal_id, title, doi))
        except KeyError:
            if journal_id not in TEMP_JOURNAL_CACHE:
                msg = "Journal {} ('{}'): Not found in temp cache, repopulating..."
                print(msg.format(journal_id, title))
                TEMP_JOURNAL_CACHE[journal_id] = _get_journal_cache_from_csv(journal_id,
                                                                             refetch=False)
            if doi not in TEMP_JOURNAL_CACHE[journal_id]:
                if refetch:
                    msg = u"Journal {} ('{}'): DOI {} not found in cache, re-fetching csv file..."
                    print(msg.format(journal_id, title, doi))
                    TEMP_JOURNAL_CACHE[journal_id] = _get_journal_cache_from_csv(journal_id,
                                                                                 refetch=True)
                if doi not in TEMP_JOURNAL_CACHE[journal_id]:
                    msg = u"Journal {} ('{}'): DOI {} NOT FOUND in SpringerLink data!"
                    msg = colorise(msg.format(journal_id, title, doi), "red")
                    print(msg)
                    ERROR_MSGS.append(msg)
                    found = False
            lookup_performed = True
            if journal_id not in PERSISTENT_PUBDATES_CACHE:
                PERSISTENT_PUBDATES_CACHE[journal_id] = {}
            if found:
                PERSISTENT_PUBDATES_CACHE[journal_id][doi] = TEMP_JOURNAL_CACHE[journal_id][doi]
                pub_year = PERSISTENT_PUBDATES_CACHE[journal_id][doi]
                compare_msg = u"DOI {} found in Springer data, Pub year is {} ".format(doi,
                                                                                       pub_year)
                if pub_year == period:
                    compare_msg += colorise("(same as offsetting period)", "green")
                else:
                    compare_msg += colorise("(DIFFERENT from offsetting period, " +
                                            "which is {})".format(period), "yellow")
                msg = u"Journal {} ('{}'): ".format(journal_id, title)
                print(msg.ljust(80) + compare_msg)
        if found:
            pub_year = PERSISTENT_PUBDATES_CACHE[journal_id][doi]
        else:
            # If a lookup error occurred we will retrieve coverage stats for the period
            # year instead, since the aggregation process will make use of this value.
            pub_year = period
        # Test if journal stats are present
        try:
            _ = COVERAGE_CACHE[journal_id]['years'][pub_year]["num_journal_total_articles"]
            _ = COVERAGE_CACHE[journal_id]['years'][pub_year]["num_journal_oa_articles"]
        except KeyError:
            try:
                _update_journal_stats(title, journal_id, pub_year)
                lookup_performed = True
                error_msg = ('No stats found for journal "{}" ({}) in {} albeit having ' +
                             'downloaded the full Open Choice catalogue. Stats were ' +
                             'obtained retroactively.')
                error_msg = colorise(error_msg.format(title, journal_id, pub_year), "red")
                print(error_msg)
                ERROR_MSGS.append(error_msg)
            except ValueError as ve:
                error_msg = ('Critical Error while processing DOI {}: No stats found ' +
                             'for journal "{}" ({}) in {} albeit having downloaded the ' +
                             'full Open Choice catalogue and stats could not be obtained ' +
                             'retroactively (ValueError: {}).')
                error_msg = colorise(error_msg.format(doi, title, journal_id, pub_year,
                                                      str(ve)), "red")
                print(error_msg)
                ERROR_MSGS.append(error_msg)
                _shutdown()
        if lookup_performed:
            LOOKUPS_PERFORMED += 1
    _shutdown()