def check_apc_field_content(row_object):
    """Validate the APC-specific mandatory columns of a single data row.

    Calls fail() (which raises) with a file/line-prefixed message on the
    first violated rule. ``row_object.origin == "ta"`` rows need an
    'agreement' value; all other rows need a positive numeric 'euro' value.
    """
    __tracebackhide__ = True
    row = row_object.row
    line_str = '{}, line {}: '.format(row_object.file_name, row_object.line_number)
    title = row['journal_full_title']
    if not oat.has_value(title):
        fail(line_str + 'the column "journal_full_title" must not be empty')
    # strip() only ever removes characters, so an equality test is the same
    # check as comparing lengths before/after stripping
    if title != title.strip():
        fail(line_str + 'journal title (' + title + ') has leading or trailing whitespaces')
    if not oat.has_value(row['issn']):
        fail(line_str + 'the column "issn" must not be empty')
    for bool_column in ('doaj', 'is_hybrid'):
        if row[bool_column] not in ("TRUE", "FALSE"):
            fail(line_str + 'value in row "' + bool_column + '" must either be TRUE or FALSE')
    if row_object.origin == "ta":
        # transformative-agreement rows must name their agreement
        if not oat.has_value(row['agreement']):
            fail(line_str + 'the column "agreement" must not be empty')
    else:
        # all other rows must carry a positive euro amount
        try:
            euro = float(row['euro'])
        except ValueError:
            fail(line_str + 'value in row "euro" (' + row['euro'] + ') is no valid number')
        else:
            if euro <= 0:
                fail(line_str + 'value in row "euro" (' + row['euro'] + ') must be larger than 0')
def check_name_consistency(row_object):
    """Check a row's journal metadata against all other rows sharing an ISSN.

    For each ISSN variant present in the row (generic, print, electronic),
    every other row registered under the same ISSN must agree on publisher
    name, journal title and hybrid status; a mismatch fails the test unless
    covered by a whitelist. The three formerly copy-pasted loops are folded
    into one private helper (behavior unchanged).
    """
    __tracebackhide__ = True
    row = row_object.row
    issn = row["issn"] if oat.has_value(row["issn"]) else None
    issn_p = row["issn_print"] if oat.has_value(row["issn_print"]) else None
    issn_e = row["issn_electronic"] if oat.has_value(row["issn_electronic"]) else None
    # hybrid mismatches are tolerated if ANY of the row's ISSNs is on the
    # "hybrid status changed" whitelist (the whitelist lists only one variant)
    hybrid_status_changed = len({issn, issn_p, issn_e}.intersection(JOURNAL_HYBRID_STATUS_CHANGED)) > 0
    journal = row["journal_full_title"]
    publ = row["publisher"]
    hybrid = row["is_hybrid"]
    line_str = '{}, line {}: '.format(row_object.file_name, row_object.line_number)
    msg = (u'' + line_str + 'Two entries share a common {}ISSN ({}), but the ' +
           '{} differs ("{}" vs "{}")')

    def _check_issn_group(issn_value, issn_map, prefix):
        # Compare this row against every other row stored under issn_value.
        if issn_value is None:
            return
        for other_row in issn_map[issn_value]:
            other_publ = other_row["publisher"]
            other_journal = other_row["journal_full_title"]
            other_hybrid = other_row["is_hybrid"]
            if not other_publ == publ and not in_whitelist(issn_value, publ, other_publ):
                fail(msg.format(prefix, issn_value, "publisher name", publ, other_publ))
            if not other_journal == journal:
                fail(msg.format(prefix, issn_value, "journal title", journal, other_journal))
            if other_hybrid != hybrid and not hybrid_status_changed:
                fail(msg.format(prefix, issn_value, "hybrid status", hybrid, other_hybrid))

    _check_issn_group(issn, issn_dict, "")
    _check_issn_group(issn_p, issn_p_dict, "Print ")
    _check_issn_group(issn_e, issn_e_dict, "Electronic ")
def main():
    """Fill missing is_hybrid values in a CSV file via JournalTOCs lookups.

    Reads the source CSV, looks up the hybrid status for every row lacking
    one (bounded by --max_lookups), caches results per journal title and
    writes either the fully integrated file or a title/status mapping to
    out.csv.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("source_file", help=ARG_HELP_STRINGS["source_file"])
    parser.add_argument("journaltocs_user", help=ARG_HELP_STRINGS["journaltocs_user"])
    parser.add_argument("-i", "--integrate", action="store_true",
                        help=ARG_HELP_STRINGS["integrate"])
    parser.add_argument("-m", "--max_lookups", type=int, default=100,
                        help=ARG_HELP_STRINGS["max_lookups"])
    args = parser.parse_args()
    analysed_journals = {}  # journal title -> hybrid status ("TRUE"/"FALSE"/"NA") cache
    modified_content = []
    lookups = 0  # number of remote lookups performed so far
    header, content = oat.get_csv_file_content(args.source_file, enc="utf-8",
                                               force_header=True)
    header_line = header[0]
    modified_content = [list(header_line)]
    for line in content:
        if not oat.has_value(line[6]):  #journal_full_title
            # no title -> nothing to look up, keep row unchanged
            modified_content.append(line)
            continue
        if not oat.has_value(line[4]):  #is_hybrid
            title = line[6]
            oat.print_y('Looking up journal {}'.format(title))
            if title not in analysed_journals:
                if lookups < args.max_lookups:
                    hybrid_status = get_hybrid_status(line, args.journaltocs_user)
                    # "NA" marks journals not resolvable via JournalTOCs
                    if hybrid_status is not None:
                        analysed_journals[title] = hybrid_status
                    else:
                        analysed_journals[title] = "NA"
                    lookups += 1
                    line[4] = analysed_journals[title]
                else:
                    # budget exhausted: row is kept with its empty is_hybrid
                    oat.print_r("Maximum number of lookups reached!")
            else:
                # cache hit: reuse previously determined status
                line[4] = analysed_journals[title]
        modified_content.append(line)
    with open("out.csv", "w") as out:
        if args.integrate:
            # write the complete, modified CSV back out
            writer = oat.OpenAPCUnicodeWriter(out, QUOTE_MASK, True, True)
            writer.write_rows(modified_content)
        else:
            # write only the title -> hybrid status mapping
            out.write("journal_full_title,is_hybrid\n")
            for key, value in analysed_journals.items():
                out.write(key + "," + value + "\n")
def check_name_consistency(row_object):
    """Check a row's journal metadata against all other rows sharing an ISSN.

    For each ISSN variant present in the row (generic, print, electronic),
    every other registered row under the same ISSN must agree on publisher
    name, journal title and hybrid status; mismatches fail the test unless
    whitelisted.
    """
    __tracebackhide__ = True
    row = row_object.row
    # normalise empty ISSN fields to None so "is not None" gates each branch
    issn = row["issn"] if oat.has_value(row["issn"]) else None
    issn_p = row["issn_print"] if oat.has_value(row["issn_print"]) else None
    issn_e = row["issn_electronic"] if oat.has_value(row["issn_electronic"]) else None
    journal = row["journal_full_title"]
    publ = row["publisher"]
    hybrid = row["is_hybrid"]
    line_str = '{}, line {}: '.format(row_object.file_name, row_object.line_number)
    msg = (u'' + line_str + 'Two entries share a common {}ISSN ({}), but the ' +
           '{} differs ("{}" vs "{}")')
    if issn is not None:
        same_issn_rows = issn_dict[issn]
        for other_row in same_issn_rows:
            other_publ = other_row["publisher"]
            other_journal = other_row["journal_full_title"]
            other_hybrid = other_row["is_hybrid"]
            if not other_publ == publ and not in_whitelist(issn, publ, other_publ):
                ret = msg.format("", issn, "publisher name", publ, other_publ)
                pytest.fail(ret)
            if not other_journal == journal:
                ret = msg.format("", issn, "journal title", journal, other_journal)
                pytest.fail(ret)
            if not other_hybrid == hybrid and issn not in JOURNAL_HYBRID_STATUS_CHANGED:
                ret = msg.format("", issn, "hybrid status", hybrid, other_hybrid)
                pytest.fail(ret)
    if issn_p is not None:
        same_issn_p_rows = issn_p_dict[issn_p]
        for other_row in same_issn_p_rows:
            other_publ = other_row["publisher"]
            other_journal = other_row["journal_full_title"]
            other_hybrid = other_row["is_hybrid"]
            if not other_publ == publ and not in_whitelist(issn_p, publ, other_publ):
                ret = msg.format("Print ", issn_p, "publisher name", publ, other_publ)
                pytest.fail(ret)
            if not other_journal == journal:
                ret = msg.format("Print ", issn_p, "journal title", journal, other_journal)
                pytest.fail(ret)
            # NOTE(review): this branch tests `issn` (not `issn_p`) against
            # JOURNAL_HYBRID_STATUS_CHANGED — looks like a copy-paste slip,
            # but may be intentional if the whitelist only lists generic
            # ISSNs. Confirm before changing.
            if not other_hybrid == hybrid and issn not in JOURNAL_HYBRID_STATUS_CHANGED:
                ret = msg.format("Print ", issn_p, "hybrid status", hybrid, other_hybrid)
                pytest.fail(ret)
    if issn_e is not None:
        same_issn_e_rows = issn_e_dict[issn_e]
        for other_row in same_issn_e_rows:
            other_publ = other_row["publisher"]
            other_journal = other_row["journal_full_title"]
            other_hybrid = other_row["is_hybrid"]
            if not other_publ == publ and not in_whitelist(issn_e, publ, other_publ):
                ret = msg.format("Electronic ", issn_e, "publisher name", publ, other_publ)
                pytest.fail(ret)
            if not other_journal == journal:
                ret = msg.format("Electronic ", issn_e, "journal title", journal, other_journal)
                pytest.fail(ret)
            # NOTE(review): same `issn` vs `issn_e` question as above.
            if not other_hybrid == hybrid and issn not in JOURNAL_HYBRID_STATUS_CHANGED:
                ret = msg.format("Electronic ", issn_e, "hybrid status", hybrid, other_hybrid)
                pytest.fail(ret)
def check_field_content(row_object):
    """Validate the mandatory columns of a single data row.

    Calls fail() (which raises) with a file/line-prefixed message on the
    first violated rule: required text columns, TRUE/FALSE flags, a
    normalised DOI, no stray whitespace, and either an 'agreement' value
    (transformative-agreement rows) or a positive 'euro' amount.
    """
    __tracebackhide__ = True
    row = row_object.row
    line_str = '{}, line {}: '.format(row_object.file_name, row_object.line_number)
    if not oat.has_value(row['publisher']):
        fail(line_str + 'the column "publisher" must not be empty')
    if not oat.has_value(row['journal_full_title']):
        fail(line_str + 'the column "journal_full_title" must not be empty')
    if not oat.has_value(row['issn']):
        fail(line_str + 'the column "issn" must not be empty')
    if row['doaj'] not in ["TRUE", "FALSE"]:
        fail(line_str + 'value in row "doaj" must either be TRUE or FALSE')
    if row['indexed_in_crossref'] not in ["TRUE", "FALSE"]:
        fail(line_str + 'value in row "indexed_in_crossref" must either be TRUE or FALSE')
    if row['is_hybrid'] not in ["TRUE", "FALSE"]:
        fail(line_str + 'value in row "is_hybrid" must either be TRUE or FALSE')
    if not row['doi'] == "NA":
        # a DOI must both be valid and already be in normalised (plain) form
        doi_norm = oat.get_normalised_DOI(row['doi'])
        if doi_norm is None:
            fail(
                line_str +
                'value in row "doi" must either be NA or represent a valid DOI'
            )
        elif doi_norm != row['doi']:
            fail(
                line_str +
                'value in row "doi" contains a valid DOI, but the format ' +
                'is not correct. It should be the simple DOI name, not ' +
                'handbook notation (doi:...) or a HTTP URI (http://dx.doi.org/...)'
            )
    if len(row['publisher']) != len(row['publisher'].strip()):
        fail(line_str + 'publisher name (' + row['publisher'] +
             ') has leading or trailing whitespaces')
    if len(row['journal_full_title']) != len(row['journal_full_title'].strip()):
        fail(line_str + 'journal title (' + row['journal_full_title'] +
             ') has leading or trailing whitespaces')
    if row_object.transformative_agreements:
        # TA rows must name their agreement
        if not oat.has_value(row['agreement']):
            fail(line_str + 'the column "agreement" must not be empty')
    if not row_object.transformative_agreements:
        # non-TA rows must carry a positive euro amount;
        # fail() raises a non-ValueError, so it is not caught below
        try:
            euro = float(row['euro'])
            if euro <= 0:
                fail(line_str + 'value in row "euro" (' + row['euro'] +
                     ') must be larger than 0')
        except ValueError:
            fail(line_str + 'value in row "euro" (' + row['euro'] +
                 ') is no valid number')
def generate_metadata_section(institution, ins_content, stats, lang):
    """Build the markdown metadata section for one institution.

    Looks the institution up in the institutions table (``ins_content``),
    then assembles a bullet list: date, git revision, institution name,
    GRID/ROR identifiers, treemap link and data-directory statistics.
    NOTE(review): the ``stats`` parameter is never read — it is shadowed
    by the local assignment from get_data_dir_stats() below; confirm
    whether callers still need to pass it.
    """
    markdown = LANG[lang]["md_header"]
    ins_line = None
    for line in ins_content:
        if line[0] == institution:
            ins_line = line
            break
    else:
        # for/else: runs only when no matching institution row was found
        oat.print_r("ERROR: Entry " + institution + " not found in institutions file!")
        sys.exit()
    locale_date = format_date(date.today(), locale=lang)
    markdown += "* " + LANG[lang]["md_date"] + ": " + locale_date + "\n"
    # latest tag of the checked-out repository identifies the data revision
    git_rev = run(["git", "describe", "--tags", "--abbrev=0"],
                  capture_output=True).stdout.decode()
    git_rev = git_rev.replace("\n", "")
    rev_url = "https://github.com/OpenAPC/openapc-de/tree/" + git_rev
    markdown += "* " + LANG[lang]["md_rev"] + ": [" + git_rev + "](" + rev_url + ")\n"
    markdown += "* " + LANG[lang]["md_ins"] + ": " + ins_line[2] + "\n"
    if oat.has_value(ins_line[7]):  # column 7: GRID identifier
        grid_id = ins_line[7]
        grid_url = "https://www.grid.ac/institutes/" + grid_id
        markdown += "* " + LANG[lang]["md_grid"] + ": [" + grid_id + "](" + grid_url + ")\n"
    if oat.has_value(ins_line[8]):  # column 8: ROR id (already a full URL)
        ror_id = ins_line[8]
        markdown += "* " + LANG[lang]["md_ror"] + ": [" + ror_id + "](" + ror_id + ")\n"
    markdown += "* " + LANG[lang]["md_ins_apc"] + ": " + ins_line[0] + "\n"
    url = "https://treemaps.openapc.net/apcdata/"
    treemap_url = "<" + url + ins_line[1].replace("_", "-") + ">"
    markdown += "* " + LANG[lang]["md_treemap"] + ": " + treemap_url + "\n"
    data_dir = ins_line[6]
    if oat.has_value(data_dir):
        # shadows the unused parameter (see docstring note)
        stats = get_data_dir_stats(data_dir)
        data_url = "https://github.com/OpenAPC/openapc-de/tree/master/data/" + data_dir
        markdown += "* " + LANG[lang]["md_data_dir"] + ": [" + data_dir + "](" + data_url + ")\n"
        markdown += "* " + LANG[lang]["md_num_files"] + ": " + str(stats["orig_files"]) + "\n"
        markdown += "* " + LANG[lang]["md_readme"] + ": "
        if stats["readme"]:
            markdown += LANG[lang]["md_readme_yes"]
        else:
            markdown += LANG[lang]["md_readme_no"]
        markdown += "\n"
    else:
        oat.print_y("WARNING: No data dir entry found for " + institution + "!")
    markdown += "\n"
    return markdown
def get_hybrid_status(line, username):
    """Resolve a journal's hybrid status via JournalTOCs.

    Tries each ISSN column (indexes 7-10) of the CSV line in turn and
    returns the mapped is_hybrid value of the first resolvable ISSN, or
    None when no ISSN could be found in JournalTOCs.
    """
    for col in (7, 8, 9, 10):
        issn_value = line[col]
        if not oat.has_value(issn_value):
            continue
        oat.print_y('Looking up ISSN {}...'.format(issn_value))
        jtoc_metadata = get_jtoc_metadata(issn_value, username)
        sleep(1)  # be polite to the remote API
        if jtoc_metadata["jtoc_id"] is None:
            continue
        found_msg = ('Entry found (publisher: {}, title: {}, jtoc_ID: {}, ' +
                     'obtaining hybrid status...)')
        oat.print_g(found_msg.format(jtoc_metadata["jtoc_publisher"],
                                     jtoc_metadata["jtoc_title"],
                                     jtoc_metadata["jtoc_id"]))
        journal_type = get_jtoc_journal_type(jtoc_metadata["jtoc_id"])
        if not journal_type:
            oat.print_r("Error while obtaining hybrid status!")
            continue
        sleep(1)  # second remote call above, throttle again
        status_msg = "journaltocs type is '{}' , mapped to is_hybrid = {}"
        oat.print_g(status_msg.format(journal_type[0], journal_type[1]))
        return journal_type[1]
    oat.print_r("None of the ISSN values found in journaltocs!")
    return None
def generate_apc_deviaton_section(institution, articles, stats, lang, csv_output=False):
    """Render the APC cost-deviation report section as markdown.

    Groups the article records by journal (column 6), emits one markdown
    table per journal plus a statistics footer, and optionally mirrors the
    data into report.csv. (The function name keeps the historical
    'deviaton' spelling for caller compatibility.)
    """
    if csv_output:
        csv_content = [[
            "Journal", "Publisher", "Journal Articles in OpenAPC", "Period",
            "DOI", "Reported Costs", "OpenAPC Mean Value",
            "OpenAPC Standard Deviation", "Difference (absolute)",
            "Difference (Standard Deviations)"
        ]]
    md_content = ""
    journal_dict = {}
    for article in articles:
        journal = article[6]
        if journal not in journal_dict:
            journal_dict[journal] = [article]
        else:
            journal_dict[journal].append(article)
    journals = list(journal_dict.keys())
    journals.sort()
    md_content += LANG[lang]["ad_header"]
    md_content += LANG[lang]["ad_intro"]
    md_content += LANG[lang]["ad_disc"]
    for journal in journals:
        publisher = journal_dict[journal][0][5]
        num_articles = journal_dict[journal][0][22]
        md_content += LANG[lang]["ad_table_header"].format(
            journal, publisher, num_articles)
        md_content += LANG[lang]["ad_th"]
        for article in journal_dict[journal]:
            row = "|"
            for index in [1, 3, 2, 18, 19, 20]:
                # BUGFIX: was "\|" — an invalid escape sequence (SyntaxWarning
                # on modern Python). "\\|" yields the same backslash-pipe text.
                elem = str(article[index]).replace("|", "\\|")
                if index == 3:  # doi
                    if oat.has_value(elem):
                        elem = "[" + elem + "](https://doi.org/" + elem + ")"
                    else:  # No doi, use url instead
                        elem = "[Link](" + article[16] + ")"
                if index in [2, 18, 19, 20]:  # monetary
                    elem = elem + "€"
                row += elem + "|"
            row += "\n"
            md_content += row
            if csv_output:
                line = []
                for index in [6, 5, 22, 1, 3, 2, 18, 19, 20, 21]:
                    line.append(str(article[index]))
                csv_content.append(line)
        # blank line separates each journal table from the next section
        md_content += "\n\n"
    md_content += LANG[lang]["ad_stats_header"].format(institution)
    for stat in ["articles", "not_checked", "within_limits", "significant"]:
        md_content += "* " + LANG[lang]["ad_stats_" + stat]
        md_content += ": " + str(stats[stat]) + "\n"
    if csv_output:
        with open("report.csv", "w") as out:
            csv_writer = csv.writer(out)
            csv_writer.writerows(csv_content)
    return md_content
def check_isbns(row_object):
    """Validate a book row's ISBN and cross-check its publisher.

    Fails when the ISBN is missing or malformed, or when another row with
    the same ISBN group/publisher element reports a different publisher
    name that is not covered by the identity whitelist.
    """
    __tracebackhide__ = True
    row = row_object.row
    line_str = '{}, line {}: '.format(row_object.file_name, row_object.line_number)
    isbn = row["isbn"]
    publisher = row["publisher"]
    if not oat.has_value(isbn):
        fail(line_str + 'The isbn column may not be empty')
        return
    check = ISBNHANDLING.test_and_normalize_isbn(isbn)
    if not check["valid"]:
        fail(line_str + 'The isbn is invalid: ' + ISBNHANDLING.ISBN_ERRORS[check["error_type"]])
        return
    group_and_publisher = _get_isbn_group_publisher(isbn)
    template = line_str + (
        'Two book entries share a common group-publisher combination in ' +
        'their ISBNs ({}), but the publisher name differs ("{}" vs "{}")'
    )
    for other_publisher in isbn_dict[group_and_publisher]:
        if other_publisher == publisher:
            continue
        if wl.publisher_identity(publisher, other_publisher):
            continue
        fail(template.format(group_and_publisher, publisher, other_publisher))
def check_common_field_content(row_object):
    """Validate the columns shared by all record types in a single row.

    Fails on an empty publisher, a non-boolean indexed_in_crossref flag,
    an invalid or non-normalised DOI, or whitespace-padded publisher name.
    """
    __tracebackhide__ = True
    row = row_object.row
    line_str = '{}, line {}: '.format(row_object.file_name, row_object.line_number)
    publisher = row['publisher']
    if not oat.has_value(publisher):
        fail(line_str + 'the column "publisher" must not be empty')
    if row['indexed_in_crossref'] not in ("TRUE", "FALSE"):
        fail(line_str + 'value in row "indexed_in_crossref" must either be TRUE or FALSE')
    doi = row['doi']
    if doi != "NA":
        # a DOI must both be valid and already be in normalised (plain) form
        doi_norm = oat.get_normalised_DOI(doi)
        if doi_norm is None:
            fail(line_str +
                 'value in row "doi" must either be NA or represent a valid DOI')
        elif doi_norm != doi:
            fail(line_str +
                 'value in row "doi" contains a valid DOI, but the format ' +
                 'is not correct. It should be the simple DOI name, not ' +
                 'handbook notation (doi:...) or a HTTP URI (http://dx.doi.org/...)')
    # strip() only removes characters, so equality detects padding
    if publisher != publisher.strip():
        fail(line_str + 'publisher name (' + publisher +
             ') has leading or trailing whitespaces')
def check_bpc_field_content(row_object):
    """Validate the BPC-specific book_title column of a single row."""
    __tracebackhide__ = True
    row = row_object.row
    line_str = '{}, line {}: '.format(row_object.file_name, row_object.line_number)
    book_title = row['book_title']
    if not oat.has_value(book_title):
        fail(line_str + 'the column "book_title" must not be empty')
    # strip() only removes characters, so equality detects padding
    if book_title != book_title.strip():
        fail(line_str + 'book title (' + book_title +
             ') has leading or trailing whitespaces')
def check_optional_fields(row_object):
    """Require substitute metadata columns when a row has no DOI.

    Without a DOI, publisher, journal_full_title, issn and url become
    mandatory; the first empty one fails the test.
    """
    __tracebackhide__ = True
    row = row_object.row
    if row['doi'] == "NA":
        line_str = '{}, line {}: '.format(row_object.file_name, row_object.line_number)
        for column in ('publisher', 'journal_full_title', 'issn', 'url'):
            if not oat.has_value(row[column]):
                pytest.fail(line_str + 'if no DOI is given, the column "' +
                            column + '" must not be empty')
def check_optional_fields(row_object):
    """Require substitute metadata columns when a row has no DOI.

    Without a DOI, publisher, journal_full_title, issn and url become
    mandatory; the first empty one fails the test.
    """
    __tracebackhide__ = True
    row = row_object.row
    if row['doi'] != "NA":
        # a DOI is present, nothing else is required here
        return
    line_str = '{}, line {}: '.format(row_object.file_name, row_object.line_number)
    if not oat.has_value(row['publisher']):
        pytest.fail(line_str + 'if no DOI is given, the column "publisher" must not be empty')
    if not oat.has_value(row['journal_full_title']):
        pytest.fail(line_str + 'if no DOI is given, the column "journal_full_title" must not be empty')
    if not oat.has_value(row['issn']):
        pytest.fail(line_str + 'if no DOI is given, the column "issn" must not be empty')
    if not oat.has_value(row['url']):
        pytest.fail(line_str + 'if no DOI is given, the column "url" must not be empty')
def check_optional_identifier(row_object):
    """Require a url as fallback identifier when a row has no DOI."""
    __tracebackhide__ = True
    row = row_object.row
    if row['doi'] != "NA":
        # a DOI is present, no fallback identifier needed
        return
    line_str = '{}, line {}: '.format(row_object.file_name, row_object.line_number)
    if not oat.has_value(row['url']):
        fail(line_str + 'if no DOI is given, the column "url" ' +
             'must not be empty')
def check_for_isbn_duplicates(row_object):
    """Fail when any of this row's ISBNs occurs in more than one row.

    Relies on the module-level ``isbn_duplicate_list``, which was
    pre-populated with one entry per ISBN occurrence over all rows.
    """
    __tracebackhide__ = True
    isbn_list = []
    # prepare a deduplicated list
    for isbn_type in ["isbn", "isbn_print", "isbn_electronic"]:
        isbn = row_object.row[isbn_type]
        # skip empties, repeats within this row, and whitelisted non-duplicates
        if oat.has_value(isbn) and isbn not in isbn_list and isbn not in wl.NON_DUPLICATE_ISBNS:
            isbn_list.append(isbn)
    for isbn in isbn_list:
        # remove THIS row's own occurrence first; if the ISBN is still
        # present afterwards, some other row also carries it -> duplicate.
        # (Also mutates the shared list so each occurrence is consumed once.)
        isbn_duplicate_list.remove(isbn)
        if isbn in isbn_duplicate_list:
            line_str = '{}, line {}: '.format(row_object.file_name, row_object.line_number)
            fail(line_str + 'Duplicate: ISBN "' + isbn + '" was ' +
                 'encountered more than one time')
def is_whitelisted(field_type, new_value, established_value, issn, issn_p, issn_e, issn_l):
    """Decide whether a field mismatch is covered by a whitelist.

    is_hybrid: whitelisted when any of the row's ISSNs appears in
    wl.JOURNAL_HYBRID_STATUS_CHANGED (that list names only one variant).
    publisher: whitelisted only when every non-empty ISSN variant passes
    wl.in_whitelist for the value pair. Any other field type: never.
    """
    if field_type == "is_hybrid":
        # one matching ISSN variant is enough
        changed = {issn, issn_p, issn_e, issn_l}.intersection(
            wl.JOURNAL_HYBRID_STATUS_CHANGED)
        return len(changed) > 0
    if field_type == "publisher":
        # every non-empty ISSN variant must be whitelisted for this pair
        for candidate in (issn, issn_p, issn_e, issn_l):
            if oat.has_value(candidate) and not wl.in_whitelist(
                    candidate, established_value, new_value):
                return False
        return True
    return False
def generate_apc_deviaton_section(institution, articles, stats, lang):
    """Render the APC cost-deviation report section as markdown.

    Groups the article records by journal (column 6), emits one markdown
    table per journal and appends a statistics footer. (The function name
    keeps the historical 'deviaton' spelling for caller compatibility.)
    """
    md_content = ""
    journal_dict = {}
    for article in articles:
        journal = article[6]
        if journal not in journal_dict:
            journal_dict[journal] = [article]
        else:
            journal_dict[journal].append(article)
    journals = list(journal_dict.keys())
    journals.sort()
    md_content += LANG[lang]["ad_header"]
    md_content += LANG[lang]["ad_intro"]
    md_content += LANG[lang]["ad_disc"]
    for journal in journals:
        publisher = journal_dict[journal][0][5]
        num_articles = journal_dict[journal][0][21]
        md_content += LANG[lang]["ad_table_header"].format(
            journal, publisher, num_articles)
        md_content += LANG[lang]["ad_th"]
        for article in journal_dict[journal]:
            row = "|"
            for index in [1, 3, 2, 18, 19, 20]:
                # BUGFIX: was "\|" — an invalid escape sequence (SyntaxWarning
                # on modern Python). "\\|" yields the same backslash-pipe text.
                elem = str(article[index]).replace("|", "\\|")
                if index == 3:  # doi
                    if oat.has_value(elem):
                        elem = "[" + elem + "](https://doi.org/" + elem + ")"
                    else:  # No doi, use url instead
                        elem = "[Link](" + article[16] + ")"
                if index in [2, 18, 19, 20]:  # monetary
                    elem = elem + "€"
                row += elem + "|"
            row += "\n"
            md_content += row
        # blank line separates each journal table from the next section
        md_content += "\n\n"
    md_content += LANG[lang]["ad_stats_header"].format(institution)
    for stat in ["articles", "not_checked", "within_limits", "significant"]:
        md_content += "* " + LANG[lang]["ad_stats_" + stat]
        md_content += ": " + str(stats[stat]) + "\n"
    return md_content
def main():
    """Interactively reconcile a CSV file against established ISSN metadata.

    First builds per-ISSN-type lookup tables from the core OpenAPC data
    files, then walks the input CSV: for every ISSN whose is_hybrid,
    publisher or journal title deviates from the established value (and is
    not whitelisted), the user is prompted to accept/skip/stop. The result
    is written to out.csv.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("csv_file", help=ARG_HELP_STRINGS["csv_file"])
    args = parser.parse_args()
    # phase 1: build the ISSN -> established-metadata mapping tables
    for file_path in [APC_DE_FILE, TA_FILE]:
        with open(file_path, "r") as path:
            reader = csv.DictReader(path)
            oat.print_b("Preparing mapping tables from " + file_path + "...")
            for line in reader:
                data = {
                    "journal_full_title": line["journal_full_title"],
                    "publisher": line["publisher"],
                    "is_hybrid": line["is_hybrid"],
                    "count": 1
                }
                # NOTE(review): the SAME data dict object may be stored in
                # several ISSN_DICTS entries, so "count" increments can be
                # shared across ISSN types — confirm this is intended.
                for issn_type in ISSN_DICTS.keys():
                    issn = line[issn_type]
                    if issn not in ISSN_DICTS[issn_type]:
                        ISSN_DICTS[issn_type][issn] = data
                    else:
                        ISSN_DICTS[issn_type][issn]["count"] += 1
                if reader.line_num % 10000 == 0:
                    oat.print_b(str(reader.line_num) + " lines processed")
    # phase 2: walk the target CSV and reconcile mismatches interactively
    modified_content = []
    header = None
    with open(args.csv_file) as csv_file:
        reader = csv.DictReader(csv_file)
        header = list(reader.fieldnames)
        stopped = False  # set when the user chooses to stop processing
        for line in reader:
            if stopped:
                # after a stop request, remaining lines pass through unchanged
                modified_content.append(line)
                continue
            for issn_type in ISSN_DICTS.keys():
                issn = line[issn_type]
                if not oat.has_value(issn):
                    continue
                if issn in ISSN_DICTS[issn_type]:
                    for field_type in [
                            "is_hybrid", "publisher", "journal_full_title"
                    ]:
                        new_value = line[field_type]
                        established_value = ISSN_DICTS[issn_type][issn][
                            field_type]
                        if new_value != established_value and not is_whitelisted(
                                field_type, new_value, established_value,
                                line["issn"], line["issn_print"],
                                line["issn_electronic"], line["issn_l"]):
                            msg = MISMATCH_MSG.format(
                                reader.line_num,
                                oat.colorize(field_type, "cyan"), issn_type,
                                issn, line["is_hybrid"], line["publisher"],
                                line["journal_full_title"],
                                oat.colorize(
                                    str(ISSN_DICTS[issn_type][issn]["count"]),
                                    "cyan"),
                                ISSN_DICTS[issn_type][issn]["is_hybrid"],
                                ISSN_DICTS[issn_type][issn]["publisher"],
                                ISSN_DICTS[issn_type][issn]
                                ["journal_full_title"])
                            print(msg)
                            ask_msg = CORRECT_MSG.format(
                                field_type,
                                oat.colorize(established_value, "green"),
                                field_type,
                                oat.colorize(established_value, "green"))
                            ezb_msg = None  # lazily fetched EZB info, shown on option 5
                            ret = input(ask_msg)
                            # options 1-4 proceed; option 5 prints EZB info first
                            while ret not in ["1", "2", "3", "4"]:
                                if ret == "5":
                                    if ezb_msg is None:
                                        ezb_msg = _prepare_ezb_info(issn)
                                    print(ezb_msg)
                                ret = input(
                                    "Please select an option from 1 to 5 > ")
                            print("\n\n\n\n")
                            if ret in ["1", "2"]:
                                # accept the established value
                                line[field_type] = established_value
                            if ret in ["2", "4"]:
                                # stop interactive processing from here on
                                stopped = True
                                break
            modified_content.append(line)
    modified_lines = [header]
    for line in modified_content:
        modified_lines.append(list(line.values()))
    with open('out.csv', 'w') as out:
        writer = oat.OpenAPCUnicodeWriter(out, QUOTE_MASK, True, True, False)
        writer.write_rows(modified_lines)
line = 2 for row in reader: for field in UNUSED_FIELDS: del(row[field]) transformative_agreements = False if file_name == TRANSAGREE_FILE_PATH: transformative_agreements = True apc_data.append(RowObject(file_name, line, row, transformative_agreements)) doi_duplicate_list.append(row["doi"]) reduced_row = {} for field in ISSN_DICT_FIELDS: reduced_row[field] = row[field] issn = row["issn"] if oat.has_value(issn): if issn not in issn_dict: issn_dict[issn] = [reduced_row] elif reduced_row not in issn_dict[issn]: issn_dict[issn].append(reduced_row) issn_p = row["issn_print"] if oat.has_value(issn_p): if issn_p not in issn_p_dict: issn_p_dict[issn_p] = [reduced_row] elif reduced_row not in issn_p_dict[issn_p]: issn_p_dict[issn_p].append(reduced_row) issn_e = row["issn_electronic"] if oat.has_value(issn_e): if issn_e not in issn_e_dict: issn_e_dict[issn_e] = [reduced_row] elif reduced_row not in issn_e_dict[issn_e]:
def main():
    """Convert a CSV money column to EUR using ECB exchange rates.

    Parses the source column with the configured locale, looks up (and
    caches) the exchange rate matching the row's period and frequency,
    falling back to a preliminary annual average or up to 5 following
    days for daily rates, and writes the result to out.csv.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("source_file")
    parser.add_argument("source_column", type=int)
    parser.add_argument("currency_column", type=int)
    parser.add_argument("period_column", type=int)
    parser.add_argument("target_column", type=int)
    parser.add_argument("-f", "--force_overwrite", action="store_true")
    parser.add_argument("-l", "--locale", help=ARG_HELP_STRINGS["locale"])
    parser.add_argument("-e", "--encoding", help=ARG_HELP_STRINGS["encoding"])
    parser.add_argument("-q", "--quotemask", help=ARG_HELP_STRINGS["quotemask"])
    parser.add_argument("-o", "--openapc_quote_rules",
                        help=ARG_HELP_STRINGS["openapc_quote_rules"],
                        action="store_true", default=False)
    args = parser.parse_args()
    quote_rules = args.openapc_quote_rules
    mask = None
    if args.quotemask:
        # a quotemask is a string of 't'/'f' flags, one per column
        reduced = args.quotemask.replace("f", "").replace("t", "")
        if len(reduced) > 0:
            print("Error: A quotemask may only contain the letters 't' and" +
                  "'f'!")
            sys.exit()
        mask = [True if x == "t" else False for x in args.quotemask]
    enc = None
    if args.encoding:
        try:
            codec = codecs.lookup(args.encoding)
            msg = "Encoding '{}' found in Python's codec collection as '{}'"
            oat.print_g(msg.format(args.encoding, codec.name))
            enc = args.encoding
        except LookupError:
            print("Error: '" + args.encoding + "' not found Python's " +
                  "codec collection. Either look for a valid name here " +
                  "(https://docs.python.org/2/library/codecs.html#standard-" +
                  "encodings) or omit this argument to enable automated " +
                  "guessing.")
            sys.exit()
    if args.locale:
        norm = locale.normalize(args.locale)
        if norm != args.locale:
            msg = "locale '{}' not found, normalised to '{}'".format(
                args.locale, norm)
            oat.print_y(msg)
        try:
            loc = locale.setlocale(locale.LC_ALL, norm)
            oat.print_g("Using locale " + loc)
        except locale.Error as loce:
            # BUGFIX: exceptions have no .message attribute in Python 3;
            # str(loce) yields the error text on both 2 and 3.
            msg = "Setting locale to {} failed: {}".format(norm, str(loce))
            oat.print_r(msg)
            sys.exit()
    header, content = oat.get_csv_file_content(args.source_file, enc, True)
    fieldnames = header.pop()
    modified_content = []
    line_num = 0
    # confirm the column mapping with the user before touching anything
    for column_type in ["source_column", "currency_column", "period_column",
                        "target_column"]:
        index = getattr(args, column_type)
        msg = "Column {} ('{}') is the {}."
        oat.print_g(msg.format(index, fieldnames[index], column_type))
    start = input("\nStart conversion? (y/n):")
    while start not in ["y", "n"]:
        start = input("Please type 'y' or 'n':")
    if start == "n":
        sys.exit()
    for line in content:
        line_num += 1
        if not oat.has_value(line[args.source_column]):
            oat.print_y("WARNING: No source value found in line " +
                        str(line_num) + ", skipping...")
            modified_content.append(line)
            continue
        monetary_value = None
        try:
            # locale-aware parse (decimal comma vs point etc.)
            monetary_value = locale.atof(line[args.source_column])
        except ValueError:
            msg = "WARNING: Could not extract a valid monetary value from source column in line {} ('{}'), skipping..."
            oat.print_y(msg.format(line_num, line[args.source_column]))
            modified_content.append(line)
            continue
        currency = line[args.currency_column]
        if currency == "EUR":
            # already EUR: copy the source value verbatim
            msg = "WARNING: Currency in line {} is already EUR, skipping..."
            oat.print_y(msg.format(line_num))
            line[args.target_column] = line[args.source_column]
            modified_content.append(line)
            continue
        if not oat.has_value(currency):
            msg = "WARNING: Could not extract a valid currency indicator from currency column in line {} ('{}'), skipping..."
            oat.print_y(msg.format(line_num, currency))
            modified_content.append(line)
            continue
        period = line[args.period_column]
        # frequency: "A" (annual), "D" (daily), ... derived from the period format
        frequency = get_frequency(period)
        if frequency is None:
            msg = "WARNING: Could not extract a valid date string from period column in line {} ('{}'), skipping..."
            oat.print_y(msg.format(line_num, period))
            modified_content.append(line)
            continue
        if currency not in EXCHANGE_RATES[frequency]:
            # fetch and cache the full rate table for this currency/frequency
            msg = 'No exchange rates ({}) found for currency "{}", querying ECB data warehouse...'
            oat.print_b(msg.format(frequency, currency))
            rates = oat.get_euro_exchange_rates(currency, frequency)
            EXCHANGE_RATES[frequency][currency] = rates
        rate = EXCHANGE_RATES[frequency][currency].get(period)
        if rate is None and frequency == "A":
            # current year has no published annual average yet -> estimate one
            rate = _calulate_preliminary_annual_average(period, currency)
            if rate:
                EXCHANGE_RATES[frequency][currency][period] = rate
        if rate is None:
            if frequency != "D":
                msg = "Error: No conversion rate found for currency {} for period {} (line {}), aborting..."
                oat.print_r(msg.format(currency, period, line_num))
                sys.exit()
            # daily rates may be missing on holidays/weekends: look ahead
            day_retries = 0
            while rate is None:
                msg = "Warning: No conversion rate found for currency {} for period {} (line {}), trying next day..."
                oat.print_y(msg.format(currency, period, line_num))
                period = get_next_day(period)
                rate = EXCHANGE_RATES[frequency][currency].get(period)
                day_retries += 1
                if day_retries > 5:
                    msg = "Error: Look-ahead limit for days exceeded, aborting..."
                    oat.print_r(msg)
                    sys.exit()
        euro_value = round(monetary_value / float(rate), 2)
        line[args.target_column] = str(euro_value)
        msg = "Line {}: {} exchange rate ({}) for date {} is {} -> {} / {} = {} EUR"
        msg = msg.format(line_num, currency, frequency, period, rate,
                         monetary_value, rate, euro_value)
        oat.print_g(msg)
        modified_content.append(line)
    with open('out.csv', 'w') as out:
        writer = oat.OpenAPCUnicodeWriter(out, mask, quote_rules, True)
        writer.write_rows([fieldnames] + modified_content)
with open(metadata["file_path"], "r") as csv_file: reader = DictReader(csv_file) line = 2 for row in reader: for field in metadata["unused_fields"]: del(row[field]) metadata["target_file"].append(RowObject(metadata["file_path"], line, row, data_file)) doi_duplicate_list.append(row["doi"]) if metadata["has_issn"]: reduced_row = {} for field in ISSN_DICT_FIELDS: reduced_row[field] = row[field] issn = row["issn"] if oat.has_value(issn): if issn not in issn_dict: issn_dict[issn] = [reduced_row] elif reduced_row not in issn_dict[issn]: issn_dict[issn].append(reduced_row) issn_p = row["issn_print"] if oat.has_value(issn_p): if issn_p not in issn_p_dict: issn_p_dict[issn_p] = [reduced_row] elif reduced_row not in issn_p_dict[issn_p]: issn_p_dict[issn_p].append(reduced_row) issn_e = row["issn_electronic"] if oat.has_value(issn_e): if issn_e not in issn_e_dict: issn_e_dict[issn_e] = [reduced_row] elif reduced_row not in issn_e_dict[issn_e]:
def main():
    """Convert monetary values in a CSV file to Euro via average yearly rates.

    Reads the source CSV, parses the monetary value in the source column using
    the active locale, looks up a rate in AVG_YEARLY_CONVERSION_RATES for the
    line's currency/year pair, and writes the result file to 'out.csv' with the
    converted Euro value stored in the target column. Aborts on any missing
    conversion rate.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("source_file")
    parser.add_argument("source_column", type=int)
    parser.add_argument("currency_column", type=int)
    parser.add_argument("period_column", type=int)
    parser.add_argument("target_column", type=int)
    parser.add_argument("-f", "--force_overwrite", action="store_true")
    parser.add_argument("-l", "--locale", help=ARG_HELP_STRINGS["locale"])
    parser.add_argument("-e", "--encoding", help=ARG_HELP_STRINGS["encoding"])
    parser.add_argument("-q", "--quotemask", help=ARG_HELP_STRINGS["quotemask"])
    parser.add_argument("-o", "--openapc_quote_rules",
                        help=ARG_HELP_STRINGS["openapc_quote_rules"],
                        action="store_true", default=False)
    args = parser.parse_args()

    quote_rules = args.openapc_quote_rules

    # A quotemask is a per-column sequence of 't'/'f' flags.
    mask = None
    if args.quotemask:
        reduced = args.quotemask.replace("f", "").replace("t", "")
        if len(reduced) > 0:
            # BUGFIX: concatenation was missing a space ("and'f'!").
            print("Error: A quotemask may only contain the letters 't' and " +
                  "'f'!")
            sys.exit()
        mask = [True if x == "t" else False for x in args.quotemask]

    enc = None
    if args.encoding:
        try:
            codec = codecs.lookup(args.encoding)
            msg = "Encoding '{}' found in Python's codec collection as '{}'"
            oat.print_g(msg.format(args.encoding, codec.name))
            enc = args.encoding
        except LookupError:
            # BUGFIX: message was missing the word "in" and pointed to the
            # Python 2 documentation.
            print("Error: '" + args.encoding + "' not found in Python's " +
                  "codec collection. Either look for a valid name here " +
                  "(https://docs.python.org/3/library/codecs.html#standard-" +
                  "encodings) or omit this argument to enable automated " +
                  "guessing.")
            sys.exit()

    if args.locale:
        norm = locale.normalize(args.locale)
        if norm != args.locale:
            msg = "locale '{}' not found, normalised to '{}'".format(
                args.locale, norm)
            oat.print_y(msg)
        try:
            loc = locale.setlocale(locale.LC_ALL, norm)
            oat.print_g("Using locale " + loc)
        except locale.Error as loce:
            # BUGFIX: locale.Error has no 'message' attribute in Python 3, so
            # 'loce.message' raised an AttributeError instead of reporting the
            # actual problem. Use str(loce) instead.
            msg = "Setting locale to {} failed: {}".format(norm, str(loce))
            oat.print_r(msg)
            sys.exit()

    header, content = oat.get_csv_file_content(args.source_file, enc)
    fieldnames = header.pop()
    modified_content = []
    line_num = 0

    # Echo the column assignments so the user can verify them before starting.
    for column_type in ["source_column", "currency_column", "period_column",
                        "target_column"]:
        index = getattr(args, column_type)
        msg = "Column {} ('{}') is the {}."
        oat.print_g(msg.format(index, fieldnames[index], column_type))
    start = input("\nStart conversion? (y/n):")
    while start not in ["y", "n"]:
        start = input("Please type 'y' or 'n':")
    if start == "n":
        sys.exit()

    for line in content:
        line_num += 1
        if not oat.has_value(line[args.source_column]):
            oat.print_y("WARNING: No source value found in line " +
                        str(line_num) + ", skipping...")
            modified_content.append(line)
            continue
        monetary_value = None
        try:
            # Parse with the active locale (handles decimal comma vs point).
            monetary_value = locale.atof(line[args.source_column])
        except ValueError:
            msg = "WARNING: Could not extract a valid monetary value from source column in line {} ('{}'), skipping..."
            oat.print_y(msg.format(line_num, line[args.source_column]))
            modified_content.append(line)
            continue
        period = line[args.period_column]
        # Only plain year strings are accepted here (yearly averages).
        if not oat.has_value(period) or not period.isdigit():
            msg = "WARNING: Could not extract a valid year string from period column in line {} ('{}'), skipping..."
            oat.print_y(msg.format(line_num, period))
            modified_content.append(line)
            continue
        currency = line[args.currency_column]
        if not oat.has_value(currency):
            msg = "WARNING: Could not extract a valid currency indicator from currency column in line {} ('{}'), skipping..."
            oat.print_y(msg.format(line_num, currency))
            modified_content.append(line)
            continue
        try:
            rate = AVG_YEARLY_CONVERSION_RATES[currency][period]
        except KeyError:
            msg = "ERROR: No conversion rate found for currency {} in year {} (line {}), aborting..."
            oat.print_r(msg.format(currency, period, line_num))
            sys.exit()
        euro_value = round(monetary_value / rate, 2)
        line[args.target_column] = str(euro_value)
        modified_content.append(line)

    with open('out.csv', 'w') as out:
        writer = oat.OpenAPCUnicodeWriter(out, mask, quote_rules, True)
        writer.write_rows([fieldnames] + modified_content)
def integrate_changes(articles, file_path, enriched_file=False):
    '''
    Update existing entries in a previously created harvest file.

    Articles are matched against existing file rows via their url field
    (currently a local ID/record url of the repository).

    Args:
        articles: A list of article dicts, as returned by openapc_toolkit.oai_harvest()
        file_path: Path to the CSV file the new values should be integrated into.
        enriched_file: If true, columns which are overwritten during enrichment
                       will not be updated
    Returns:
        A tuple. The first element is a reduced collection of article dicts,
        containing those which did not find a matching url in the file
        (Order preserved). The second element is the list of column headers
        encountered in the harvest file.
    '''
    if not os.path.isfile(file_path):
        return (articles, None)
    # Columns whose values are rewritten during enrichment; they are skipped
    # when enriched_file is True so enriched data is not clobbered.
    enriched_blacklist = [
        "institution",
        "publisher",
        "journal_full_title",
        "issn",
        "license_ref",
        "pmid"
    ]
    article_dict = OrderedDict()
    for article in articles:
        # This is possible because currently all repos use a local ID/record url, but it's just
        # a workaround. We might have to change to OAI record IDs later.
        url = article["url"]
        if oat.has_value(url):
            article_dict[url] = article
    updated_lines = []
    fieldnames = None
    with open(file_path, "r") as f:
        reader = DictReader(f)
        fieldnames = reader.fieldnames
        updated_lines.append(list(fieldnames))  # header row is kept verbatim
        start_msg = "Integrating changes in harvest data into existing file {}"
        oat.print_g(start_msg.format(file_path))
        for line in reader:
            url = line["url"]
            line_num = reader.reader.line_num
            msg = "Line {}: Checking for changes ({})"
            oat.print_b(msg.format(line_num, url))
            if url in article_dict:
                for key, value in article_dict[url].items():
                    if enriched_file and key in enriched_blacklist:
                        continue
                    if key in line and value != line[key]:
                        update_msg = 'Updating value in column {} ("{}" -> "{}")'
                        oat.print_g(update_msg.format(key, line[key], value))
                        line[key] = value
                # Remove matched articles; whatever remains at the end was not
                # found in the file and is handed back to the caller.
                del (article_dict[url])
                updated_line = [line[key] for key in fieldnames]
                updated_lines.append(updated_line)
            else:
                # Row's url is absent from the harvest data -> the row is
                # dropped when the file is rewritten below.
                remove_msg = "URL {} no longer found in harvest data, removing article"
                oat.print_r(remove_msg.format(url))
    with open(file_path, "w") as f:
        mask = oat.OPENAPC_STANDARD_QUOTEMASK if enriched_file else None
        writer = oat.OpenAPCUnicodeWriter(f, quotemask=mask,
                                          openapc_quote_rules=True,
                                          has_header=True)
        writer.write_rows(updated_lines)
    # NOTE(review): this returns a dict view, not a list - callers should only
    # iterate over it (or wrap it in list() themselves).
    return (article_dict.values(), fieldnames)
def integrate_changes(articles, file_path, enriched_file=False, dry_run=False):
    '''
    Update existing entries in a previously created harvest file.

    Args:
        articles: A list of article dicts, as returned by openapc_toolkit.oai_harvest()
        file_path: Path to the CSV file the new values should be integrated into.
        enriched_file: If true, columns which are overwritten during enrichment
                       will not be updated
        dry_run: Do not make any changes to the file (but still report changes
                 and return the list of unencountered articles)
    Returns:
        A tuple. The first element is a reduced list of article dicts,
        containing those which did not find a matching PID (url field) in the
        file (Order preserved). The second element is the list of column
        headers encountered in the harvest file.
    '''
    messages = {
        'wet': {
            'start': 'Integrating changes in harvest data into existing file {}',
            # BUGFIX: this template had only 4 placeholders but is always
            # formatted with 5 arguments (line_num, url, key, old, new), which
            # shifted every reported value one position to the left (the url
            # was printed as the column name, etc.). Aligned with 'dry'.
            'line_change': 'Line {} ({}): Updating value in column {} ("{}" -> "{}")',
            'remove': 'PID {} no longer found in harvest data, removing article',
        },
        'dry': {
            'start': 'Dry Run: Comparing harvest data to existing file {}',
            'line_change': 'Line {} ({}): Change in column {} ("{}" -> "{}")',
            'remove': 'PID {} no longer found in harvest data, article would be removed',
        }
    }
    messages = messages['dry'] if dry_run else messages['wet']
    if not os.path.isfile(file_path):
        return (articles, None)
    # Columns rewritten during enrichment; untouched when enriched_file is True.
    enriched_blacklist = [
        "institution",
        "publisher",
        "journal_full_title",
        "issn",
        "license_ref",
        "pmid"
    ]
    article_dict = OrderedDict()
    for article in articles:
        # Harvested articles use OAI record IDs in the url field as PID.
        url = article["url"]
        if oat.has_value(url):
            article_dict[url] = article
    updated_lines = []
    fieldnames = None
    with open(file_path, "r") as f:
        reader = DictReader(f)
        fieldnames = reader.fieldnames
        updated_lines.append(list(fieldnames))  # header
        oat.print_y(messages["start"].format(file_path))
        for line in reader:
            url = line["url"]
            if not oat.has_value(line["institution"]):
                # Do not change empty lines
                updated_lines.append([line[key] for key in fieldnames])
                continue
            line_num = reader.reader.line_num
            if url in article_dict:
                for key, value in article_dict[url].items():
                    if enriched_file and key in enriched_blacklist:
                        continue
                    if key in line and value != line[key]:
                        oat.print_g(messages["line_change"].format(
                            line_num, line["url"], key, line[key], value))
                        line[key] = value
                # Matched articles are consumed; the remainder is returned.
                del article_dict[url]
                updated_lines.append([line[key] for key in fieldnames])
            else:
                # PID vanished from the harvest data -> the row is dropped
                # when the file is rewritten (unless this is a dry run).
                oat.print_r(messages["remove"].format(url))
    if not dry_run:
        with open(file_path, "w") as f:
            mask = oat.OPENAPC_STANDARD_QUOTEMASK if enriched_file else None
            writer = oat.OpenAPCUnicodeWriter(f, quotemask=mask,
                                              openapc_quote_rules=True,
                                              has_header=True)
            writer.write_rows(updated_lines)
    # Return a real list (not a dict view) so the result matches the
    # documented contract.
    return (list(article_dict.values()), fieldnames)
grid_list = json_dict["institutes"] for index, ins in enumerate(grid_list): deciles = { round((len(grid_list) / 10) * i): str(i * 10) + "%" for i in range(1, 10) } if index in deciles: print(deciles[index]) if ins["status"] != "active": continue grid_names = [ins["name"]] if "aliases" in ins: grid_names += ins["aliases"] for institutions_row in ins_content: if oat.has_value(institutions_row[7]): continue institutions_name = institutions_row[2] grid_name, highest_ratio = get_best_match(grid_names, institutions_name) match_type = get_match_type(highest_ratio) if match_type != None: grid_id = ins["id"] msg = '{} match: "{}" might be Grid institution "{}" ({}).' question = 'Assign Grid ID {} ({}) (y/n/q)?' msg = msg.format(match_type["name"], institutions_name, grid_name, highest_ratio) question = question.format(grid_id, ins["name"]) match_type["print_func"](msg) start = input(question) while start not in ["y", "n", "q"]:
def main():
    """Convert monetary values in a CSV file to Euro using ECB exchange rates.

    Reads the source CSV, determines the rate frequency (e.g. daily/annual)
    from each line's period column via get_frequency(), fetches missing rate
    tables from the ECB data warehouse on demand (cached in EXCHANGE_RATES),
    and writes the result file to 'out.csv' with the converted Euro value in
    the target column. Aborts on any missing conversion rate.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("source_file")
    parser.add_argument("source_column", type=int)
    parser.add_argument("currency_column", type=int)
    parser.add_argument("period_column", type=int)
    parser.add_argument("target_column", type=int)
    parser.add_argument("-f", "--force_overwrite", action="store_true")
    parser.add_argument("-l", "--locale", help=ARG_HELP_STRINGS["locale"])
    parser.add_argument("-e", "--encoding", help=ARG_HELP_STRINGS["encoding"])
    parser.add_argument("-q", "--quotemask", help=ARG_HELP_STRINGS["quotemask"])
    parser.add_argument("-o", "--openapc_quote_rules",
                        help=ARG_HELP_STRINGS["openapc_quote_rules"],
                        action="store_true", default=False)
    args = parser.parse_args()

    quote_rules = args.openapc_quote_rules

    # A quotemask is a per-column sequence of 't'/'f' flags.
    mask = None
    if args.quotemask:
        reduced = args.quotemask.replace("f", "").replace("t", "")
        if len(reduced) > 0:
            # BUGFIX: concatenation was missing a space ("and'f'!").
            print("Error: A quotemask may only contain the letters 't' and " +
                  "'f'!")
            sys.exit()
        mask = [True if x == "t" else False for x in args.quotemask]

    enc = None
    if args.encoding:
        try:
            codec = codecs.lookup(args.encoding)
            msg = "Encoding '{}' found in Python's codec collection as '{}'"
            oat.print_g(msg.format(args.encoding, codec.name))
            enc = args.encoding
        except LookupError:
            # BUGFIX: message was missing the word "in" and pointed to the
            # Python 2 documentation.
            print("Error: '" + args.encoding + "' not found in Python's " +
                  "codec collection. Either look for a valid name here " +
                  "(https://docs.python.org/3/library/codecs.html#standard-" +
                  "encodings) or omit this argument to enable automated " +
                  "guessing.")
            sys.exit()

    if args.locale:
        norm = locale.normalize(args.locale)
        if norm != args.locale:
            msg = "locale '{}' not found, normalised to '{}'".format(
                args.locale, norm)
            oat.print_y(msg)
        try:
            loc = locale.setlocale(locale.LC_ALL, norm)
            oat.print_g("Using locale " + loc)
        except locale.Error as loce:
            # BUGFIX: locale.Error has no 'message' attribute in Python 3, so
            # 'loce.message' raised an AttributeError instead of reporting the
            # actual problem. Use str(loce) instead.
            msg = "Setting locale to {} failed: {}".format(norm, str(loce))
            oat.print_r(msg)
            sys.exit()

    header, content = oat.get_csv_file_content(args.source_file, enc, True)
    fieldnames = header.pop()
    modified_content = []
    line_num = 0

    # Echo the column assignments so the user can verify them before starting.
    for column_type in ["source_column", "currency_column", "period_column",
                        "target_column"]:
        index = getattr(args, column_type)
        msg = "Column {} ('{}') is the {}."
        oat.print_g(msg.format(index, fieldnames[index], column_type))
    start = input("\nStart conversion? (y/n):")
    while start not in ["y", "n"]:
        start = input("Please type 'y' or 'n':")
    if start == "n":
        sys.exit()

    for line in content:
        line_num += 1
        if not oat.has_value(line[args.source_column]):
            oat.print_y("WARNING: No source value found in line " +
                        str(line_num) + ", skipping...")
            modified_content.append(line)
            continue
        monetary_value = None
        try:
            # Parse with the active locale (handles decimal comma vs point).
            monetary_value = locale.atof(line[args.source_column])
        except ValueError:
            msg = "WARNING: Could not extract a valid monetary value from source column in line {} ('{}'), skipping..."
            oat.print_y(msg.format(line_num, line[args.source_column]))
            modified_content.append(line)
            continue
        currency = line[args.currency_column]
        if not oat.has_value(currency):
            msg = "WARNING: Could not extract a valid currency indicator from currency column in line {} ('{}'), skipping..."
            oat.print_y(msg.format(line_num, currency))
            modified_content.append(line)
            continue
        period = line[args.period_column]
        frequency = get_frequency(period)
        if frequency is None:
            msg = "WARNING: Could not extract a valid date string from period column in line {} ('{}'), skipping..."
            oat.print_y(msg.format(line_num, period))
            modified_content.append(line)
            continue
        if currency not in EXCHANGE_RATES[frequency]:
            # Lazily fetch and cache the full rate table for this currency.
            msg = 'No exchange rates ({}) found for currency "{}", querying ECB data warehouse...'
            oat.print_b(msg.format(frequency, currency))
            rates = oat.get_euro_exchange_rates(currency, frequency)
            EXCHANGE_RATES[frequency][currency] = rates
        try:
            rate = EXCHANGE_RATES[frequency][currency][period]
        except KeyError:
            msg = "ERROR: No conversion rate found for currency {} for period {} (line {}), aborting..."
            oat.print_r(msg.format(currency, period, line_num))
            sys.exit()
        euro_value = round(monetary_value/float(rate), 2)
        line[args.target_column] = str(euro_value)
        msg = "Line {}: {} exchange rate ({}) for date {} is {} -> {} / {} = {} EUR"
        msg = msg.format(line_num, currency, frequency, period, rate,
                         monetary_value, rate, euro_value)
        oat.print_g(msg)
        modified_content.append(line)

    with open('out.csv', 'w') as out:
        writer = oat.OpenAPCUnicodeWriter(out, mask, quote_rules, True)
        writer.write_rows([fieldnames] + modified_content)
def integrate_changes(articles, file_path, enriched_file=False):
    '''
    Merge freshly harvested article data into an existing harvest CSV file.

    Args:
        articles: A list of article dicts, as returned by openapc_toolkit.oai_harvest()
        file_path: Path to the CSV file the new values should be integrated into.
        enriched_file: If true, columns which are overwritten during enrichment
                       will not be updated
    Returns:
        A tuple. The first element is the reduced collection of article dicts
        whose DOI was not encountered in the file (order preserved). The second
        element is the list of column headers found in the harvest file.
    '''
    if not os.path.isfile(file_path):
        return (articles, None)
    enriched_blacklist = ["institution", "publisher", "journal_full_title",
                          "issn", "license_ref", "pmid"]
    # Index the harvested articles by DOI for constant-time lookup while
    # scanning the existing file.
    article_dict = OrderedDict()
    for harvested in articles:
        harvested_doi = harvested["doi"]
        if oat.has_value(harvested_doi):
            article_dict[harvested_doi] = harvested
    out_rows = []
    fieldnames = None
    with open(file_path, "r") as in_handle:
        reader = DictReader(in_handle)
        fieldnames = reader.fieldnames
        out_rows.append(list(fieldnames))  # header row
        start_msg = "Integrating changes in harvest data into existing file {}"
        oat.print_g(start_msg.format(file_path))
        for row in reader:
            doi = row["doi"]
            line_num = reader.reader.line_num
            if not oat.has_value(doi):
                # No DOI -> nothing to match against, keep the row unchanged.
                msg = "Line {}: No DOI found, change check not possible"
                oat.print_y(msg.format(line_num))
                out_rows.append([row[key] for key in fieldnames])
                continue
            msg = "Line {}: Checking for changes ({})"
            oat.print_b(msg.format(line_num, doi))
            if doi not in article_dict:
                # Article vanished from the harvest source -> drop the row.
                remove_msg = "DOI {} no longer found in harvest data, removing article"
                oat.print_r(remove_msg.format(doi))
                continue
            for key, value in article_dict[doi].items():
                if enriched_file and key in enriched_blacklist:
                    continue
                if key in row and value != row[key]:
                    update_msg = 'Updating value in column {} ("{}" -> "{}")'
                    oat.print_g(update_msg.format(key, row[key], value))
                    row[key] = value
            del article_dict[doi]
            out_rows.append([row[key] for key in fieldnames])
    with open(file_path, "w") as out_handle:
        mask = oat.OPENAPC_STANDARD_QUOTEMASK if enriched_file else None
        writer = oat.OpenAPCUnicodeWriter(out_handle, quotemask=mask,
                                          openapc_quote_rules=True,
                                          has_header=True)
        writer.write_rows(out_rows)
    return (article_dict.values(), fieldnames)
issn_dict = {} issn_p_dict = {} issn_e_dict = {} for file_name in ["data/apc_de.csv", "data/offsetting/offsetting.csv"]: csv_file = open(file_name, "r") reader = oat.UnicodeDictReader(csv_file) line = 2 for row in reader: test_apc = True if file_name == "data/offsetting/offsetting.csv": test_apc = False apc_data.append(RowObject(file_name, line, row, test_apc)) doi_duplicate_list.append(row["doi"]) issn = row["issn"] if oat.has_value(issn): if issn not in issn_dict: issn_dict[issn] = [row] else: issn_dict[issn].append(row) issn_p = row["issn_print"] if oat.has_value(issn_p): if issn_p not in issn_p_dict: issn_p_dict[issn_p] = [row] else: issn_p_dict[issn_p].append(row) issn_e = row["issn_electronic"] if oat.has_value(issn_e): if issn_e not in issn_e_dict: issn_e_dict[issn_e] = [row] else: