def main():
    """Analyze an (Open)APC CSV file and enrich it with external metadata.

    Workflow (all driven by command-line arguments):
      1. Parse CLI options, set up colorised/buffered logging on stderr.
      2. Optionally switch the process locale (affects monetary parsing).
      3. Determine the CSV file's encoding, dialect and header presence.
      4. Map CSV columns to known OpenAPC column types, first via the header
         whitelist, then via heuristics (DOI / year / euro amount patterns).
      5. Report which mandatory/backup columns are present per record type
         ("articles" / "books") and abort if enrichment is impossible
         (unless --force is given).
      6. Run ``oat.process_row`` on every data row and write one output CSV
         per record type (``out_<record_type>.csv``).

    Exits via ``sys.exit()`` on any unrecoverable setup error.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("csv_file", help=ARG_HELP_STRINGS["csv_file"])
    parser.add_argument("-O", "--offsetting_mode", help=ARG_HELP_STRINGS["offsetting"])
    parser.add_argument("-b", "--bypass-cert-verification", action="store_true",
                        help=ARG_HELP_STRINGS["bypass"])
    parser.add_argument("-e", "--encoding", help=ARG_HELP_STRINGS["encoding"])
    parser.add_argument("-f", "--force", action="store_true",
                        help=ARG_HELP_STRINGS["force"])
    parser.add_argument("-i", "--ignore-header", action="store_true",
                        help=ARG_HELP_STRINGS["ignore_header"])
    parser.add_argument("-j", "--force-header", action="store_true",
                        help=ARG_HELP_STRINGS["force_header"])
    parser.add_argument("-l", "--locale", help=ARG_HELP_STRINGS["locale"])
    parser.add_argument("-a", "--add-unknown-columns", action="store_true",
                        help=ARG_HELP_STRINGS["unknown_columns"])
    parser.add_argument("-d", "--dialect", choices=["excel", "excel-tab", "unix"],
                        help=ARG_HELP_STRINGS["dialect"])
    parser.add_argument("-v", "--verbose", action="store_true",
                        help=ARG_HELP_STRINGS["verbose"])
    parser.add_argument("-o", "--overwrite", action="store_true",
                        help=ARG_HELP_STRINGS["overwrite"])
    parser.add_argument("-u", "--update", action="store_true",
                        help=ARG_HELP_STRINGS["update"])
    parser.add_argument("-r", "--round_monetary", action="store_true",
                        help=ARG_HELP_STRINGS["round_monetary"])
    parser.add_argument("--no-crossref", action="store_true",
                        help=ARG_HELP_STRINGS["no_crossref"])
    parser.add_argument("--no-pubmed", action="store_true",
                        help=ARG_HELP_STRINGS["no_pubmed"])
    parser.add_argument("--no-doaj", action="store_true",
                        help=ARG_HELP_STRINGS["no_doaj"])
    parser.add_argument("-institution", "--institution_column", type=int,
                        help=ARG_HELP_STRINGS["institution"])
    parser.add_argument("-period", "--period_column", type=int,
                        help=ARG_HELP_STRINGS["period"])
    parser.add_argument("-doi", "--doi_column", type=int,
                        help=ARG_HELP_STRINGS["doi"])
    parser.add_argument("-euro", "--euro_column", type=int,
                        help=ARG_HELP_STRINGS["euro"])
    parser.add_argument("-is_hybrid", "--is_hybrid_column", type=int,
                        help=ARG_HELP_STRINGS["is_hybrid"])
    parser.add_argument("-publisher", "--publisher_column", type=int,
                        help=ARG_HELP_STRINGS["publisher"])
    parser.add_argument("-journal_full_title", "--journal_full_title_column", type=int,
                        help=ARG_HELP_STRINGS["journal_full_title"])
    parser.add_argument("-book_title", "--book_title_column", type=int,
                        help=ARG_HELP_STRINGS["book_title"])
    parser.add_argument("-issn", "--issn_column", type=int,
                        help=ARG_HELP_STRINGS["issn"])
    parser.add_argument("-isbn", "--isbn_column", type=int,
                        help=ARG_HELP_STRINGS["isbn"])
    parser.add_argument("-backlist_oa", "--backlist_oa_column", type=int,
                        help=ARG_HELP_STRINGS["backlist_oa"])
    parser.add_argument("-additional_isbns", "--additional_isbn_columns", type=int,
                        nargs='+', help=ARG_HELP_STRINGS["additional_isbns"])
    parser.add_argument("-url", "--url_column", type=int,
                        help=ARG_HELP_STRINGS["url"])
    parser.add_argument("-start", type=int, help=ARG_HELP_STRINGS["start"])
    parser.add_argument("-end", type=int, help=ARG_HELP_STRINGS["end"])

    args = parser.parse_args()

    # Colorised logging on stderr; errors are additionally collected by the
    # buffered handler so they can be replayed in one batch at the very end.
    handler = logging.StreamHandler(sys.stderr)
    handler.setFormatter(oat.ANSIColorFormatter())
    bufferedHandler = oat.BufferedErrorHandler(handler)
    bufferedHandler.setFormatter(oat.ANSIColorFormatter())
    logging.root.addHandler(handler)
    logging.root.addHandler(bufferedHandler)
    logging.root.setLevel(logging.INFO)

    if args.locale:
        norm = locale.normalize(args.locale)
        if norm != args.locale:
            msg = "locale '{}' not found, normalised to '{}'".format(
                args.locale, norm)
            oat.print_y(msg)
        try:
            loc = locale.setlocale(locale.LC_ALL, norm)
            oat.print_g("Using locale " + loc)
        except locale.Error as loce:
            # BUGFIX: exceptions have no ".message" attribute in Python 3 -
            # the old "loce.message" raised AttributeError instead of
            # reporting the locale problem. Format the exception itself.
            msg = "Setting locale to {} failed: {}".format(norm, loce)
            oat.print_r(msg)
            sys.exit()

    enc = None  # CSV file encoding
    if args.encoding:
        try:
            codec = codecs.lookup(args.encoding)
            msg = ("Encoding '{}' found in Python's codec collection " +
                   "as '{}'").format(args.encoding, codec.name)
            oat.print_g(msg)
            enc = args.encoding
        except LookupError:
            msg = ("Error: '" + args.encoding + "' not found Python's " +
                   "codec collection. Either look for a valid name here " +
                   "(https://docs.python.org/2/library/codecs.html#standard-" +
                   "encodings) or omit this argument to enable automated " +
                   "guessing.")
            oat.print_r(msg)
            sys.exit()

    # Sniff encoding/dialect/header from the file itself.
    result = oat.analyze_csv_file(args.csv_file, enc=enc)
    if result["success"]:
        csv_analysis = result["data"]
        print(csv_analysis)
    else:
        print(result["error_msg"])
        sys.exit()

    if args.dialect:
        # An explicit -d choice overrides the sniffed dialect.
        dialect = args.dialect
        oat.print_g('Dialect sniffing results ignored, using built-in CSV dialect "' + dialect + '"')
    else:
        dialect = csv_analysis.dialect

    if enc is None:
        enc = csv_analysis.enc

    has_header = csv_analysis.has_header or args.force_header

    if enc is None:
        # BUGFIX: the message referred to a non-existent "--enc" option; the
        # actual argument is -e/--encoding.
        print("Error: No encoding given for CSV file and automated " +
              "detection failed. Please set the encoding manually via the " +
              "--encoding argument")
        sys.exit()

    csv_file = open(args.csv_file, "r", encoding=enc)
    reader = csv.reader(csv_file, dialect=dialect)
    first_row = next(reader)
    num_columns = len(first_row)
    print("\nCSV file has {} columns.".format(num_columns))

    # Rewind so the analysis below sees the file from the start again.
    csv_file.seek(0)
    reader = csv.reader(csv_file, dialect=dialect)

    if args.update and args.overwrite:
        oat.print_r("Error: Either use the -u or the -o option, not both.")
        sys.exit()
    if args.overwrite:
        for column in OVERWRITE_STRATEGY.keys():
            OVERWRITE_STRATEGY[column] = CSVColumn.OW_ALWAYS
    elif not args.update:
        for column in OVERWRITE_STRATEGY.keys():
            OVERWRITE_STRATEGY[column] = CSVColumn.OW_ASK

    # Validate user-designated additional ISBN column indexes.
    additional_isbn_columns = []
    if args.additional_isbn_columns:
        for index in args.additional_isbn_columns:
            if index > num_columns:
                msg = "Error: Additional ISBN column index {} exceeds number of columns ({})."
                oat.print_r(msg.format(index, num_columns))
                sys.exit()
            else:
                additional_isbn_columns.append(index)

    # Known column types. The requirement dict states the role of each column
    # per record type: MANDATORY / BACKUP / RECOMMENDED / NONE.
    column_map = {
        "institution": CSVColumn("institution",
                                 {"articles": CSVColumn.MANDATORY, "books": CSVColumn.MANDATORY},
                                 args.institution_column,
                                 overwrite=OVERWRITE_STRATEGY["institution"]),
        "period": CSVColumn("period",
                            {"articles": CSVColumn.MANDATORY, "books": CSVColumn.MANDATORY},
                            args.period_column,
                            overwrite=OVERWRITE_STRATEGY["period"]),
        "euro": CSVColumn("euro",
                          {"articles": CSVColumn.MANDATORY, "books": CSVColumn.MANDATORY},
                          args.euro_column,
                          overwrite=OVERWRITE_STRATEGY["euro"]),
        "doi": CSVColumn("doi",
                         {"articles": CSVColumn.MANDATORY, "books": CSVColumn.MANDATORY},
                         args.doi_column,
                         overwrite=OVERWRITE_STRATEGY["doi"]),
        "is_hybrid": CSVColumn("is_hybrid",
                               {"articles": CSVColumn.MANDATORY, "books": CSVColumn.NONE},
                               args.is_hybrid_column,
                               overwrite=OVERWRITE_STRATEGY["is_hybrid"]),
        "publisher": CSVColumn("publisher",
                               {"articles": CSVColumn.BACKUP, "books": CSVColumn.NONE},
                               args.publisher_column,
                               overwrite=OVERWRITE_STRATEGY["publisher"]),
        "journal_full_title": CSVColumn("journal_full_title",
                                        {"articles": CSVColumn.BACKUP, "books": CSVColumn.NONE},
                                        args.journal_full_title_column,
                                        overwrite=OVERWRITE_STRATEGY["journal_full_title"]),
        "issn": CSVColumn("issn",
                          {"articles": CSVColumn.BACKUP, "books": CSVColumn.NONE},
                          args.issn_column,
                          overwrite=OVERWRITE_STRATEGY["issn"]),
        "issn_print": CSVColumn("issn_print",
                                {"articles": CSVColumn.NONE, "books": CSVColumn.NONE},
                                None, overwrite=OVERWRITE_STRATEGY["issn_print"]),
        "issn_electronic": CSVColumn("issn_electronic",
                                     {"articles": CSVColumn.NONE, "books": CSVColumn.NONE},
                                     None, overwrite=OVERWRITE_STRATEGY["issn_electronic"]),
        "issn_l": CSVColumn("issn_l",
                            {"articles": CSVColumn.NONE, "books": CSVColumn.NONE},
                            None, overwrite=OVERWRITE_STRATEGY["issn_l"]),
        "license_ref": CSVColumn("license_ref",
                                 {"articles": CSVColumn.NONE, "books": CSVColumn.NONE},
                                 None, overwrite=OVERWRITE_STRATEGY["license_ref"]),
        "indexed_in_crossref": CSVColumn("indexed_in_crossref",
                                         {"articles": CSVColumn.NONE, "books": CSVColumn.NONE},
                                         None, overwrite=OVERWRITE_STRATEGY["indexed_in_crossref"]),
        "pmid": CSVColumn("pmid",
                          {"articles": CSVColumn.NONE, "books": CSVColumn.NONE},
                          None, overwrite=OVERWRITE_STRATEGY["pmid"]),
        "pmcid": CSVColumn("pmcid",
                           {"articles": CSVColumn.NONE, "books": CSVColumn.NONE},
                           None, overwrite=OVERWRITE_STRATEGY["pmcid"]),
        "ut": CSVColumn("ut",
                        {"articles": CSVColumn.NONE, "books": CSVColumn.NONE},
                        None, overwrite=OVERWRITE_STRATEGY["ut"]),
        "url": CSVColumn("url",
                         {"articles": CSVColumn.BACKUP, "books": CSVColumn.NONE},
                         args.url_column, overwrite=OVERWRITE_STRATEGY["url"]),
        "doaj": CSVColumn("doaj",
                          {"articles": CSVColumn.NONE, "books": CSVColumn.NONE},
                          None, overwrite=OVERWRITE_STRATEGY["doaj"]),
        "agreement": CSVColumn("agreement",
                               {"articles": CSVColumn.NONE, "books": CSVColumn.NONE},
                               None, overwrite=OVERWRITE_STRATEGY["agreement"]),
        "book_title": CSVColumn("book_title",
                                {"articles": CSVColumn.NONE, "books": CSVColumn.RECOMMENDED},
                                args.book_title_column,
                                overwrite=OVERWRITE_STRATEGY["book_title"]),
        "backlist_oa": CSVColumn("backlist_oa",
                                 {"articles": CSVColumn.NONE, "books": CSVColumn.MANDATORY},
                                 args.backlist_oa_column,
                                 overwrite=OVERWRITE_STRATEGY["backlist_oa"]),
        "isbn": CSVColumn("isbn",
                          {"articles": CSVColumn.NONE, "books": CSVColumn.BACKUP},
                          args.isbn_column, overwrite=OVERWRITE_STRATEGY["isbn"]),
        "isbn_print": CSVColumn("isbn_print",
                                {"articles": CSVColumn.NONE, "books": CSVColumn.NONE},
                                None, overwrite=OVERWRITE_STRATEGY["isbn_print"]),
        "isbn_electronic": CSVColumn("isbn_electronic",
                                     {"articles": CSVColumn.NONE, "books": CSVColumn.NONE},
                                     None, overwrite=OVERWRITE_STRATEGY["isbn_electronic"])
    }

    # --- Step 1: try to identify columns by their header names -------------
    header = None
    if has_header:
        for row in reader:
            if not row:  # Skip empty lines
                continue
            header = row  # First non-empty row should be the header
            if args.ignore_header:
                print("Skipping header analysis due to command line argument.")
                break
            print("\n *** Analyzing CSV header ***\n")
            for (index, item) in enumerate(header):
                if index in additional_isbn_columns:
                    msg = "Column named '{}' at index {} is designated as additional ISBN column"
                    print(msg.format(item, index))
                    continue
                column_type = oat.get_column_type_from_whitelist(item)
                if column_type is not None and column_map[column_type].index is None:
                    column_map[column_type].index = index
                    column_map[column_type].column_name = item
                    found_msg = ("Found column named '{}' at index {}, " +
                                 "assuming this to be the '{}' column.")
                    print(found_msg.format(item, index, column_type))
            break  # Only the first non-empty row is the header

    # --- Step 2: heuristics on the first data row --------------------------
    print("\n *** Starting heuristical analysis ***\n")
    for row in reader:
        if not row:
            # Skip empty lines. We analyze the first non-empty line; a
            # possible header has already been consumed above.
            continue
        column_candidates = {
            "doi": [],
            "period": [],
            "euro": []
        }
        found_msg = "The entry in column {} looks like a potential {}: {}"
        for (index, entry) in enumerate(row):
            if index in [csvcolumn.index for csvcolumn in column_map.values()] + additional_isbn_columns:
                # Skip columns already assigned
                continue
            entry = entry.strip()
            # Search for a DOI
            if column_map['doi'].index is None:
                if oat.DOI_RE.match(entry):
                    column_id = str(index)
                    # identify column either numerically or by column header
                    if header:
                        column_id += " ('" + header[index] + "')"
                    print(found_msg.format(column_id, "DOI", entry))
                    column_candidates['doi'].append(index)
                    continue
            # Search for a potential year string
            if column_map['period'].index is None:
                try:
                    maybe_period = int(entry)
                    now = datetime.date.today().year
                    # Should be a wide enough margin
                    if maybe_period >= 2000 and maybe_period <= now + 2:
                        column_id = str(index)
                        if header:
                            column_id += " ('" + header[index] + "')"
                        print(found_msg.format(column_id, "year", entry))
                        column_candidates['period'].append(index)
                        continue
                except ValueError:
                    pass
            # Search for a potential monetary amount
            if column_map['euro'].index is None:
                try:
                    maybe_euro = locale.atof(entry)
                    if maybe_euro >= 10 and maybe_euro <= 10000:
                        column_id = str(index)
                        if header:
                            column_id += " ('" + header[index] + "')"
                        print(found_msg.format(column_id, "euro amount", entry))
                        column_candidates['euro'].append(index)
                        continue
                except ValueError:
                    pass
        # Accept a heuristic candidate only if it is unambiguous.
        for column_type, candidates in column_candidates.items():
            if column_map[column_type].index is not None:
                continue
            if len(candidates) > 1:
                print("Could not reliably identify the '" + column_type +
                      "' column - more than one possible candiate!")
            elif len(candidates) < 1:
                print("No candidate found for column '" + column_type + "'!")
            else:
                index = candidates.pop()
                # CLEANUP: the index was previously assigned twice in this
                # branch; a single assignment is sufficient.
                column_map[column_type].index = index
                if header:
                    column_id = header[index]
                    column_map[column_type].column_name = column_id
                else:
                    column_id = index
                msg = "Assuming column '{}' to be the '{}' column."
                print(msg.format(column_id, column_type))
        break  # Only the first data row is analyzed heuristically

    # --- Step 3: summary ---------------------------------------------------
    print("\n *** CSV file analysis summary ***\n")
    index_dict = {csvc.index: csvc for csvc in column_map.values()}
    for index in range(num_columns):
        column_name = ""
        if header:
            column_name = header[index]
        if index in index_dict:
            column = index_dict[index]
            msg = u"column number {} ({}) is the '{}' column ({})".format(
                index, column_name, column.column_type,
                column.get_req_description())
            print(msg)
        elif index in additional_isbn_columns:
            msg = u"column number {} ({}) is an additional ISBN column".format(
                index, column_name)
            oat.print_c(msg)
        else:
            if args.add_unknown_columns:
                msg = (u"column number {} ({}) is an unknown column, it will be " +
                       "appended to the generated CSV file")
                print(msg.format(index, column_name))
                if not column_name:
                    # Use a generic name
                    column_name = "unknown"
                while column_name in column_map.keys():
                    # TODO: Replace by a numerical, increasing suffix
                    column_name += "_"
                # NOTE(review): unlike the entries above, this CSVColumn gets
                # a scalar requirement (CSVColumn.NONE) instead of a per-type
                # dict - presumably CSVColumn normalises this; confirm.
                column_map[column_name] = CSVColumn(column_name, CSVColumn.NONE, index)
            else:
                msg = (u"column number {} ({}) is an unknown column, it will be " +
                       "ignored")
                print(msg.format(index, column_name))

    print()
    for column in column_map.values():
        if column.index is None:
            msg = "The '{}' column could not be identified ({})"
            print(msg.format(column.column_type, column.get_req_description()))
    print()

    # Determine per record type which mandatory/backup columns are missing.
    article_mand_missing = [x.column_type for x in column_map.values()
                            if x.requirement["articles"] == CSVColumn.MANDATORY and x.index is None]
    article_back_missing = [x.column_type for x in column_map.values()
                            if x.requirement["articles"] == CSVColumn.BACKUP and x.index is None]
    book_mand_missing = [x.column_type for x in column_map.values()
                         if x.requirement["books"] == CSVColumn.MANDATORY and x.index is None]
    book_back_missing = [x.column_type for x in column_map.values()
                         if x.requirement["books"] == CSVColumn.BACKUP and x.index is None]

    if article_mand_missing:
        msg = "Article enrichment is not possible - mandatory columns are missing ({})"
        oat.print_y(msg.format(", ".join(article_mand_missing)))
    elif article_back_missing:
        msg = "Article enrichment is possible, but backup columns are missing ({}) - each record will need a valid DOI"
        oat.print_b(msg.format(", ".join(article_back_missing)))
    else:
        oat.print_g("Article enrichment is possible with all backup columns in place")
    if book_mand_missing:
        msg = "Book enrichment is not possible - mandatory columns are missing ({})"
        oat.print_y(msg.format(", ".join(book_mand_missing)))
    elif book_back_missing:
        msg = "Book enrichment is possible, but backup columns are missing ({}) - each record will need a valid DOI"
        oat.print_b(msg.format(", ".join(book_back_missing)))
    else:
        oat.print_g("Book enrichment is possible with all backup columns in place")
    print()

    if article_mand_missing and book_mand_missing:
        if not args.force:
            oat.print_r("ERROR: Could not detect the minimum mandatory data set for any " +
                        "publication type. There are 2 ways to fix this:")
            if not header:
                print("1) Add a header row to your file and identify the " +
                      "column(s) by assigning them an appropiate column name.")
            else:
                print("1) Identify the missing column(s) by assigning them " +
                      "a different column name in the CSV header (You can " +
                      "use the column name(s) mentioned in the message above)")
            print("2) Use command line parameters when calling this script " +
                  "to identify the missing columns (use -h for help) ")
            sys.exit()
        else:
            oat.print_y("WARNING: Could not detect the minimum mandatory data set for any " +
                        "publication type - forced to continue.")

    # Interactive confirmation before any network/aggregation work starts.
    start = input("\nStart metadata aggregation? (y/n):")
    while start not in ["y", "n"]:
        start = input("Please type 'y' or 'n':")
    if start == "n":
        sys.exit()

    print("\n *** Starting metadata aggregation ***\n")

    enriched_content = {}
    for record_type, fields in oat.COLUMN_SCHEMAS.items():
        # add headers
        enriched_content[record_type] = {
            "count": 0,
            "content": [list(fields)]
        }

    if not os.path.isdir("tempfiles"):
        os.mkdir("tempfiles")
    isbn_handling = oat.ISBNHandling("tempfiles/ISBNRangeFile.xml")
    doab_analysis = oat.DOABAnalysis(isbn_handling, "tempfiles/DOAB.csv", verbose=False)
    doaj_analysis = oat.DOAJAnalysis("tempfiles/DOAJ.csv")

    csv_file.seek(0)
    reader = csv.reader(csv_file, dialect=dialect)
    header_processed = False
    row_num = 0
    for row in reader:
        row_num += 1
        if not row:
            continue  # skip empty lines
        if not header_processed:
            header_processed = True
            if has_header:
                # If the CSV file has a header, we are currently there - skip it
                # to get to the first data row
                continue
        if args.start and args.start > row_num:
            continue
        if args.end and args.end < row_num:
            continue
        print("---Processing line number " + str(row_num) + "---")
        result_type, enriched_row = oat.process_row(row, row_num, column_map, num_columns,
                                                    additional_isbn_columns, doab_analysis,
                                                    doaj_analysis, args.no_crossref,
                                                    args.no_pubmed, args.no_doaj,
                                                    args.round_monetary,
                                                    args.offsetting_mode)
        for record_type, value in enriched_content.items():
            if record_type == result_type:
                value["content"].append(enriched_row)
                value["count"] += 1
            else:
                # Pad the other record types with an empty line - presumably
                # to keep row numbers aligned across the output files.
                empty_line = ["" for x in value["content"][0]]
                value["content"].append(empty_line)
    csv_file.close()

    # Write one output file per record type that received at least one row.
    for record_type, value in enriched_content.items():
        if value["count"] > 0:
            with open('out_' + record_type + '.csv', 'w') as out:
                writer = oat.OpenAPCUnicodeWriter(out, oat.OPENAPC_STANDARD_QUOTEMASK,
                                                  True, True, True)
                writer.write_rows(value["content"])

    if not bufferedHandler.buffer:
        oat.print_g("Metadata enrichment successful, no errors occured")
    else:
        oat.print_r("There were errors during the enrichment process:\n")
    # closing will implicitly flush the handler and print any buffered
    # messages to stderr
    bufferedHandler.close()
def main():
    """Check full-text accessibility of hybrid Elsevier articles.

    Reads an (Open)APC-style CSV file, and for every row whose publisher is
    "Elsevier" and whose is_hybrid flag is "TRUE", resolves the DOI via
    http://doi.org. If the DOI lands on sciencedirect.com, the landing page
    is scanned for a PDF download link (``pdflink_re`` / ``pdflink_multi_re``,
    defined at module level). Missing links are logged as errors and replayed
    in one batch at the end of the run.

    NOTE(review): this script uses ``urllib2`` and ``u""`` literals, i.e. it
    targets Python 2 - confirm before running under Python 3.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("csv_file", help=ARG_HELP_STRINGS["csv_file"])
    parser.add_argument("-e", "--encoding", help=ARG_HELP_STRINGS["encoding"])
    parser.add_argument("-start", type=int, default=1, help=ARG_HELP_STRINGS["start"])
    # BUGFIX: the -end option previously displayed the help text of -start
    # (copy-paste error: ARG_HELP_STRINGS["start"]).
    parser.add_argument("-end", type=int, help=ARG_HELP_STRINGS["end"])

    args = parser.parse_args()

    # Colorised logging on stderr; errors are additionally collected by the
    # buffered handler so they can be replayed in one batch at the very end.
    handler = logging.StreamHandler(sys.stderr)
    handler.setFormatter(oat.ANSIColorFormatter())
    bufferedHandler = oat.BufferedErrorHandler(handler)
    bufferedHandler.setFormatter(oat.ANSIColorFormatter())
    logging.root.addHandler(handler)
    logging.root.addHandler(bufferedHandler)
    logging.root.setLevel(logging.INFO)

    enc = None
    if args.encoding:
        try:
            codec = codecs.lookup(args.encoding)
            msg = ("Encoding '{}' found in Python's codec collection " +
                   "as '{}'").format(args.encoding, codec.name)
            oat.print_g(msg)
            enc = args.encoding
        except LookupError:
            msg = ("Error: '" + args.encoding + "' not found Python's " +
                   "codec collection. Either look for a valid name here " +
                   "(https://docs.python.org/2/library/codecs.html#standard-" +
                   "encodings) or omit this argument to enable automated " +
                   "guessing.")
            oat.print_r(msg)
            sys.exit()

    head, content = oat.get_csv_file_content(args.csv_file, enc)
    content = head + content  # line numbering includes the header row(s)

    # Plain browser-like UA - some servers reject the default urllib UA.
    header = {"User-Agent": "Mozilla/5.0 Firefox/45.0"}

    line_num = 0
    for line in content:
        line_num += 1
        if args.start and args.start > line_num:
            continue
        if args.end and args.end < line_num:
            continue
        # Fixed OpenAPC column layout (0-based): institution, period, _,
        # doi, is_hybrid, publisher, journal_full_title, ...
        institution = line[0]
        period = line[1]
        doi = line[3]
        is_hybrid = line[4]
        publisher = line[5]
        journal = line[6]
        if publisher != "Elsevier" or is_hybrid != "TRUE":
            continue
        init_msg = (u"Line {}: Checking {} article from {}, published in " +
                    "{}...").format(line_num, institution, period, journal)
        oat.print_b(init_msg)
        url = 'http://doi.org/' + doi
        req = urllib2.Request(url, None, header)
        try:
            response = urllib2.urlopen(req)
            target = response.geturl()
            resolve_msg = u"DOI {} resolved, led us to {}".format(doi, target)
            if "sciencedirect.com" not in target:
                oat.print_y(resolve_msg)
                oat.print_y("Journal not located at sciencedirect, skipping...")
                continue
            oat.print_b(resolve_msg)
            content_string = response.read()
            single_match = pdflink_re.search(content_string)
            if single_match:
                link_url = single_match.groups()[0]
                oat.print_g(u"PDF link found: " + link_url)
            else:
                multi_match = pdflink_multi_re.search(content_string)
                if multi_match:
                    link_url = multi_match.groups()[0]
                    # BUGFIX: decode the HTML entity in the scraped URL. The
                    # previous replace("&", "&") was a no-op (the "&amp;"
                    # literal had been mangled at some point).
                    link_url = link_url.replace("&amp;", "&")
                    oat.print_g(u"PDF link found (more than one document): " + link_url)
                else:
                    error_msg = (u"No PDF link found! (line {}, DOI: {}, " +
                                 "landing page: {})").format(line_num, doi, target)
                    logging.error(error_msg)
            # Throttle requests to be polite to the server.
            time.sleep(1)
        except urllib2.HTTPError as httpe:
            code = str(httpe.getcode())
            oat.print_r("HTTPError: {} - {}".format(code, httpe.reason))
        except urllib2.URLError as urle:
            oat.print_r("URLError: {}".format(urle.reason))

    if not bufferedHandler.buffer:
        oat.print_g("\nLookup finished, all articles were accessible on sciencedirect")
    else:
        oat.print_r("\nLookup finished, not all articles could be accessed on sciencedirect:\n")
    # closing will implicitly flush the handler and print any buffered
    # messages to stderr
    bufferedHandler.close()
def main(): parser = argparse.ArgumentParser() parser.add_argument("csv_file", help=ARG_HELP_STRINGS["csv_file"]) parser.add_argument("-O", "--offsetting_mode", help=ARG_HELP_STRINGS["offsetting"]) parser.add_argument("-b", "--bypass-cert-verification", action="store_true", help=ARG_HELP_STRINGS["bypass"]) parser.add_argument("-d", "--offline_doaj", help=ARG_HELP_STRINGS["offline_doaj"]) parser.add_argument("-D", "--offline_doaj_download", help=ARG_HELP_STRINGS["offline_doaj_download"]) parser.add_argument("-e", "--encoding", help=ARG_HELP_STRINGS["encoding"]) parser.add_argument("-f", "--force", action="store_true", help=ARG_HELP_STRINGS["force"]) parser.add_argument("-i", "--ignore-header", action="store_true", help=ARG_HELP_STRINGS["ignore_header"]) parser.add_argument("-j", "--force-header", action="store_true", help=ARG_HELP_STRINGS["force_header"]) parser.add_argument("-l", "--locale", help=ARG_HELP_STRINGS["locale"]) parser.add_argument("-a", "--add-unknown-columns", action="store_true", help=ARG_HELP_STRINGS["unknown_columns"]) parser.add_argument("-v", "--verbose", action="store_true", help=ARG_HELP_STRINGS["verbose"]) parser.add_argument("-o", "--overwrite", action="store_true", help=ARG_HELP_STRINGS["overwrite"]) parser.add_argument("-u", "--update", action="store_true", help=ARG_HELP_STRINGS["update"]) parser.add_argument("-r", "--round_monetary", action="store_true", help=ARG_HELP_STRINGS["round_monetary"]) parser.add_argument("--no-crossref", action="store_true", help=ARG_HELP_STRINGS["no_crossref"]) parser.add_argument("--no-pubmed", action="store_true", help=ARG_HELP_STRINGS["no_pubmed"]) parser.add_argument("--no-doaj", action="store_true", help=ARG_HELP_STRINGS["no_doaj"]) parser.add_argument("-institution", "--institution_column", type=int, help=ARG_HELP_STRINGS["institution"]) parser.add_argument("-period", "--period_column", type=int, help=ARG_HELP_STRINGS["period"]) parser.add_argument("-doi", "--doi_column", type=int, 
help=ARG_HELP_STRINGS["doi"]) parser.add_argument("-euro", "--euro_column", type=int, help=ARG_HELP_STRINGS["euro"]) parser.add_argument("-is_hybrid", "--is_hybrid_column", type=int, help=ARG_HELP_STRINGS["is_hybrid"]) parser.add_argument("-publisher", "--publisher_column", type=int, help=ARG_HELP_STRINGS["publisher"]) parser.add_argument("-journal_full_title", "--journal_full_title_column", type=int, help=ARG_HELP_STRINGS["journal_full_title"]) parser.add_argument("-issn", "--issn_column", type=int, help=ARG_HELP_STRINGS["issn"]) parser.add_argument("-url", "--url_column", type=int, help=ARG_HELP_STRINGS["url"]) parser.add_argument("-start", type=int, help=ARG_HELP_STRINGS["start"]) parser.add_argument("-end", type=int, help=ARG_HELP_STRINGS["end"]) parser.add_argument("-q", "--quotemask", default="tfftttttttttttttttt", help=ARG_HELP_STRINGS["quotemask"]) parser.add_argument("-n", "--no-openapc-quote-rules", help=ARG_HELP_STRINGS["no_openapc_quote_rules"], action="store_true", default=False) args = parser.parse_args() handler = logging.StreamHandler(sys.stderr) handler.setFormatter(oat.ANSIColorFormatter()) bufferedHandler = oat.BufferedErrorHandler(handler) bufferedHandler.setFormatter(oat.ANSIColorFormatter()) logging.root.addHandler(handler) logging.root.addHandler(bufferedHandler) logging.root.setLevel(logging.INFO) if args.offline_doaj and args.offline_doaj_download: oat.print_r("Error: Either use the -d or the -D option, not both.") sys.exit() if args.locale: norm = locale.normalize(args.locale) if norm != args.locale: msg = "locale '{}' not found, normalised to '{}'".format( args.locale, norm) oat.print_y(msg) try: loc = locale.setlocale(locale.LC_ALL, norm) oat.print_g("Using locale " + loc) except locale.Error as loce: msg = "Setting locale to {} failed: {}".format(norm, loce.message) oat.print_r(msg) sys.exit() enc = None # CSV file encoding if args.encoding: try: codec = codecs.lookup(args.encoding) msg = ("Encoding '{}' found in Python's codec 
collection " + "as '{}'").format(args.encoding, codec.name) oat.print_g(msg) enc = args.encoding except LookupError: msg = ("Error: '" + args.encoding + "' not found Python's " + "codec collection. Either look for a valid name here " + "(https://docs.python.org/2/library/codecs.html#standard-" + "encodings) or omit this argument to enable automated " + "guessing.") oat.print_r(msg) sys.exit() result = oat.analyze_csv_file(args.csv_file, enc=enc) if result["success"]: csv_analysis = result["data"] print(csv_analysis) else: print(result["error_msg"]) sys.exit() if enc is None: enc = csv_analysis.enc dialect = csv_analysis.dialect has_header = csv_analysis.has_header or args.force_header if enc is None: print("Error: No encoding given for CSV file and automated " + "detection failed. Please set the encoding manually via the " + "--enc argument") sys.exit() reduced = args.quotemask.replace("f", "").replace("t", "") if len(reduced) > 0: print("Error: A quotemask may only contain the letters 't' and " + "'f'!") sys.exit() mask = [True if x == "t" else False for x in args.quotemask] doaj_offline_analysis = None if args.offline_doaj: if os.path.isfile(args.offline_doaj): doaj_offline_analysis = oat.DOAJOfflineAnalysis(args.offline_doaj) else: oat.print_r("Error: " + args.offline_doaj + " does not seem " "to be a file!") sys.exit() elif args.offline_doaj_download: if os.path.isfile(args.offline_doaj_download): oat.print_r("Error: Target file '" + args.offline_doaj_download + "' already exists!") sys.exit() doaj_offline_analysis = oat.DOAJOfflineAnalysis( args.offline_doaj_download, download=True) csv_file = open(args.csv_file, "r", encoding=enc) reader = csv.reader(csv_file, dialect=dialect) first_row = next(reader) num_columns = len(first_row) print("\nCSV file has {} columns.".format(num_columns)) csv_file.seek(0) reader = csv.reader(csv_file, dialect=dialect) if args.update and args.overwrite: oat.print_r("Error: Either use the -u or the -o option, not both.") sys.exit() 
if args.overwrite: for column in OVERWRITE_STRATEGY.keys(): OVERWRITE_STRATEGY[column] = CSVColumn.OW_ALWAYS elif not args.update: for column in OVERWRITE_STRATEGY.keys(): OVERWRITE_STRATEGY[column] = CSVColumn.OW_ASK openapc_column_map = OrderedDict([ ("institution", CSVColumn("institution", CSVColumn.MANDATORY, args.institution_column, overwrite=OVERWRITE_STRATEGY["institution"])), ("period", CSVColumn("period", CSVColumn.MANDATORY, args.period_column, overwrite=OVERWRITE_STRATEGY["period"])), ("euro", CSVColumn("euro", CSVColumn.MANDATORY, args.euro_column, overwrite=OVERWRITE_STRATEGY["euro"])), ("doi", CSVColumn("doi", CSVColumn.MANDATORY, args.doi_column, overwrite=OVERWRITE_STRATEGY["doi"])), ("is_hybrid", CSVColumn("is_hybrid", CSVColumn.MANDATORY, args.is_hybrid_column, overwrite=OVERWRITE_STRATEGY["is_hybrid"])), ("publisher", CSVColumn("publisher", CSVColumn.OPTIONAL, args.publisher_column, overwrite=OVERWRITE_STRATEGY["publisher"])), ("journal_full_title", CSVColumn("journal_full_title", CSVColumn.OPTIONAL, args.journal_full_title_column, overwrite=OVERWRITE_STRATEGY["journal_full_title"])), ("issn", CSVColumn("issn", CSVColumn.OPTIONAL, args.issn_column, overwrite=OVERWRITE_STRATEGY["issn"])), ("issn_print", CSVColumn("issn_print", CSVColumn.NONE, None, overwrite=OVERWRITE_STRATEGY["issn_print"])), ("issn_electronic", CSVColumn("issn_electronic", CSVColumn.NONE, None, overwrite=OVERWRITE_STRATEGY["issn_electronic"])), ("issn_l", CSVColumn("issn_l", CSVColumn.NONE, None, overwrite=OVERWRITE_STRATEGY["issn_l"])), ("license_ref", CSVColumn("license_ref", CSVColumn.NONE, None, overwrite=OVERWRITE_STRATEGY["license_ref"])), ("indexed_in_crossref", CSVColumn("indexed_in_crossref", CSVColumn.NONE, None, overwrite=OVERWRITE_STRATEGY["indexed_in_crossref"])), ("pmid", CSVColumn("pmid", CSVColumn.NONE, None, overwrite=OVERWRITE_STRATEGY["pmid"])), ("pmcid", CSVColumn("pmcid", CSVColumn.NONE, None, overwrite=OVERWRITE_STRATEGY["pmcid"])), ("ut", CSVColumn("ut", 
CSVColumn.NONE, None, overwrite=OVERWRITE_STRATEGY["ut"])), ("url", CSVColumn("url", CSVColumn.OPTIONAL, args.url_column, overwrite=OVERWRITE_STRATEGY["url"])), ("doaj", CSVColumn("doaj", CSVColumn.NONE, None, overwrite=OVERWRITE_STRATEGY["doaj"])) ]) offsetting_column_map = OrderedDict([ ("institution", CSVColumn("institution", CSVColumn.MANDATORY, args.institution_column, overwrite=OVERWRITE_STRATEGY["institution"])), ("period", CSVColumn("period", CSVColumn.MANDATORY, args.period_column, overwrite=OVERWRITE_STRATEGY["period"])), ("euro", CSVColumn("euro", CSVColumn.NONE, args.euro_column, overwrite=OVERWRITE_STRATEGY["euro"])), ("doi", CSVColumn("doi", CSVColumn.MANDATORY, args.doi_column, overwrite=OVERWRITE_STRATEGY["doi"])), ("is_hybrid", CSVColumn("is_hybrid", CSVColumn.MANDATORY, args.is_hybrid_column, overwrite=OVERWRITE_STRATEGY["is_hybrid"])), ("publisher", CSVColumn("publisher", CSVColumn.OPTIONAL, args.publisher_column, overwrite=OVERWRITE_STRATEGY["publisher"])), ("journal_full_title", CSVColumn("journal_full_title", CSVColumn.OPTIONAL, args.journal_full_title_column, overwrite=OVERWRITE_STRATEGY["journal_full_title"])), ("issn", CSVColumn("issn", CSVColumn.OPTIONAL, args.issn_column, overwrite=OVERWRITE_STRATEGY["issn"])), ("issn_print", CSVColumn("issn_print", CSVColumn.NONE, None, overwrite=OVERWRITE_STRATEGY["issn_print"])), ("issn_electronic", CSVColumn("issn_electronic", CSVColumn.NONE, None, overwrite=OVERWRITE_STRATEGY["issn_electronic"])), ("issn_l", CSVColumn("issn_l", CSVColumn.NONE, None, overwrite=OVERWRITE_STRATEGY["issn_l"])), ("license_ref", CSVColumn("license_ref", CSVColumn.NONE, None, overwrite=OVERWRITE_STRATEGY["license_ref"])), ("indexed_in_crossref", CSVColumn("indexed_in_crossref", CSVColumn.NONE, None, overwrite=OVERWRITE_STRATEGY["indexed_in_crossref"])), ("pmid", CSVColumn("pmid", CSVColumn.NONE, None, overwrite=OVERWRITE_STRATEGY["pmid"])), ("pmcid", CSVColumn("pmcid", CSVColumn.NONE, None, 
overwrite=OVERWRITE_STRATEGY["pmcid"])), ("ut", CSVColumn("ut", CSVColumn.NONE, None, overwrite=OVERWRITE_STRATEGY["ut"])), ("url", CSVColumn("url", CSVColumn.OPTIONAL, args.url_column, overwrite=OVERWRITE_STRATEGY["url"])), ("doaj", CSVColumn("doaj", CSVColumn.NONE, None, overwrite=OVERWRITE_STRATEGY["doaj"])), ("agreement", CSVColumn("agreement", CSVColumn.NONE, None, overwrite=OVERWRITE_STRATEGY["agreement"])), ]) if args.offsetting_mode: column_map = offsetting_column_map else: column_map = openapc_column_map header = None if has_header: for row in reader: if not row: # Skip empty lines continue header = row # First non-empty row should be the header if args.ignore_header: print("Skipping header analysis due to command line argument.") break else: print("\n *** Analyzing CSV header ***\n") for (index, item) in enumerate(header): column_type = oat.get_column_type_from_whitelist(item) if column_type is not None and column_map[ column_type].index is None: column_map[column_type].index = index column_map[column_type].column_name = item found_msg = ("Found column named '{}' at index {}, " + "assuming this to be the {} column.") print(found_msg.format(item, index, column_type)) break print("\n *** Starting heuristical analysis ***\n") for row in reader: if not row: # Skip empty lines # We analyze the first non-empty line, a possible header should # have been processed by now. 
continue column_candidates = {"doi": [], "period": [], "euro": []} found_msg = "The entry in column {} looks like a potential {}: {}" for (index, entry) in enumerate(row): if index in [csvcolumn.index for csvcolumn in column_map.values()]: # Skip columns already assigned continue entry = entry.strip() # Search for a DOI if column_map['doi'].index is None: if oat.DOI_RE.match(entry): column_id = str(index) # identify column either numerically or by column header if header: column_id += " ('" + header[index] + "')" print(found_msg.format(column_id, "DOI", entry)) column_candidates['doi'].append(index) continue # Search for a potential year string if column_map['period'].index is None: try: maybe_period = int(entry) now = datetime.date.today().year # Should be a wide enough margin if maybe_period >= 2000 and maybe_period <= now + 2: column_id = str(index) if header: column_id += " ('" + header[index] + "')" print(found_msg.format(column_id, "year", entry)) column_candidates['period'].append(index) continue except ValueError: pass # Search for a potential monetary amount if column_map['euro'].index is None: try: maybe_euro = locale.atof(entry) if maybe_euro >= 10 and maybe_euro <= 10000: column_id = str(index) if header: column_id += " ('" + header[index] + "')" print(found_msg.format(column_id, "euro amount", entry)) column_candidates['euro'].append(index) continue except ValueError: pass for column_type, candidates in column_candidates.items(): if column_map[column_type].index is not None: continue if len(candidates) > 1: print("Could not reliably identify the '" + column_type + "' column - more than one possible candiate!") elif len(candidates) < 1: print("No candidate found for column '" + column_type + "'!") else: index = candidates.pop() column_map[column_type].index = index if header: column_id = header[index] column_map[column_type].column_name = column_id else: column_id = index msg = "Assuming column '{}' to be the '{}' column." 
print(msg.format(column_id, column_type)) column_map[column_type].index = index break # Wrap up: Check if there any mandatory column types left which have not # yet been identified - we cannot continue in that case (unless forced). unassigned = [ x for x in iter(column_map.items()) if x[1].requirement == CSVColumn.MANDATORY and x[1].index is None ] if unassigned: for item in unassigned: print("The {} column is still unidentified.".format(item[0])) if header: print("The CSV header is:\n" + dialect.delimiter.join(header)) if not args.force: print("ERROR: We cannot continue because not all mandatory " + "column types in the CSV file could be automatically " + "identified. There are 2 ways to fix this:") if not header: print("1) Add a header row to your file and identify the " + "column(s) by assigning them an appropiate column name.") else: print("1) Identify the missing column(s) by assigning them " + "a different column name in the CSV header (You can " + "use the column name(s) mentioned in the message above)") print("2) Use command line parameters when calling this script " + "to identify the missing columns (use -h for help) ") sys.exit() else: print("WARNING: Not all mandatory column types in the CSV file " + "could be automatically identified - forced to continue.") print("\n *** CSV file analysis summary ***\n") index_dict = {csvc.index: csvc for csvc in column_map.values()} for index in range(num_columns): column_name = "" if header: column_name = header[index] if index in index_dict: column = index_dict[index] msg = u"column number {} ({}) is the {} column '{}'".format( index, column_name, column.requirement, column.column_type) if column.requirement in [CSVColumn.MANDATORY, CSVColumn.OPTIONAL]: oat.print_g(msg) else: oat.print_b(msg) else: if args.add_unknown_columns: msg = ( u"column number {} ({}) is an unknown column, it will be " + "appended to the generated CSV file") oat.print_y(msg.format(index, column_name)) if not column_name: # Use a generic name 
column_name = "unknown" while column_name in column_map.keys(): # TODO: Replace by a numerical, increasing suffix column_name += "_" column_map[column_name] = CSVColumn(column_name, CSVColumn.NONE, index) else: msg = ( u"column number {} ({}) is an unknown column, it will be " + "ignored") oat.print_y(msg.format(index, column_name)) print() for column in column_map.values(): if column.index is None: msg = "The {} column '{}' could not be identified." print(msg.format(column.requirement, column.column_type)) # Check for unassigned optional column types. We can continue but should # issue a warning as all entries will need a valid DOI in this case. unassigned = [ k for k, v in column_map.items() if v.requirement == CSVColumn.OPTIONAL and v.index is None ] if unassigned: print("\nWARNING: Not all optional column types could be " + "identified. Metadata aggregation is still possible, but " + "every entry in the CSV file will need a valid DOI.") start = input("\nStart metadata aggregation? (y/n):") while start not in ["y", "n"]: start = input("Please type 'y' or 'n':") if start == "n": sys.exit() print("\n *** Starting metadata aggregation ***\n") enriched_content = [] csv_file.seek(0) reader = csv.reader(csv_file, dialect=dialect) header_processed = False row_num = 0 for row in reader: row_num += 1 if not row: continue # skip empty lines if not header_processed: header_processed = True enriched_content.append(list(column_map.keys())) if has_header: # If the CSV file has a header, we are currently there - skip it # to get to the first data row continue if args.start and args.start > row_num: continue if args.end and args.end < row_num: continue print("---Processing line number " + str(row_num) + "---") enriched_row = oat.process_row(row, row_num, column_map, num_columns, args.no_crossref, args.no_pubmed, args.no_doaj, doaj_offline_analysis, args.round_monetary, args.offsetting_mode) enriched_content.append(enriched_row) csv_file.close() with open('out.csv', 'w') as out: 
writer = oat.OpenAPCUnicodeWriter(out, mask, not args.no_openapc_quote_rules, True, True) writer.write_rows(enriched_content) if not bufferedHandler.buffer: oat.print_g("Metadata enrichment successful, no errors occured") else: oat.print_r("There were errors during the enrichment process:\n") # closing will implicitly flush the handler and print any buffered # messages to stderr bufferedHandler.close()
def main():
    """Check hybrid OA articles from a CSV file for accessible fulltext PDFs.

    Parses an OpenAPC-style CSV file, iterates over its hybrid entries
    (column 4 == "TRUE") and, for every entry whose publisher matches a
    configured landing-page lookup (lpl), fetches the article landing page
    and searches it for an Open Access PDF link. Errors are buffered by a
    BufferedErrorHandler and re-printed to stderr at the end of the run.

    Exits via sys.exit() if a user-supplied encoding name is unknown.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("csv_file", help=ARG_HELP_STRINGS["csv_file"])
    parser.add_argument("-e", "--encoding", help=ARG_HELP_STRINGS["encoding"])
    parser.add_argument("-start", type=int, default=1,
                        help=ARG_HELP_STRINGS["start"])
    # BUGFIX: the -end argument previously displayed the "start" help text.
    parser.add_argument("-end", type=int, help=ARG_HELP_STRINGS["end"])
    args = parser.parse_args()

    # Route log output through a colourised stderr handler and additionally
    # buffer all error-level records so they can be replayed after the run.
    handler = logging.StreamHandler(sys.stderr)
    handler.setFormatter(oat.ANSIColorFormatter())
    bufferedHandler = oat.BufferedErrorHandler(handler)
    bufferedHandler.setFormatter(oat.ANSIColorFormatter())
    logging.root.addHandler(handler)
    logging.root.addHandler(bufferedHandler)
    logging.root.setLevel(logging.INFO)

    enc = None  # CSV file encoding; None enables automated guessing downstream
    if args.encoding:
        try:
            codec = codecs.lookup(args.encoding)
            msg = "Encoding '{}' found in Python's codec collection as '{}'"
            oat.print_g(msg.format(args.encoding, codec.name))
            enc = args.encoding
        except LookupError:
            # BUGFIX: message previously read "not found Python's" (missing "in").
            msg = ("Error: '" + args.encoding + "' not found in Python's " +
                   "codec collection. Either look for a valid name here " +
                   "(https://docs.python.org/2/library/codecs.html#standard-" +
                   "encodings) or omit this argument to enable automated " +
                   "guessing.")
            oat.print_r(msg)
            sys.exit()

    head, content = oat.get_csv_file_content(args.csv_file, enc)
    content = head + content  # process header rows like any other line
    line_num = 0
    for line in content:
        line_num += 1
        # Honour the optional -start / -end line-number window.
        if args.start and args.start > line_num:
            continue
        if args.end and args.end < line_num:
            continue
        # Only hybrid articles are checked (is_hybrid column, index 4).
        if line[4] != "TRUE":
            continue
        institution = line[0]
        period = line[1]
        doi = line[3]
        publisher = line[5]
        journal = line[6]
        for lpl in lpl_list:
            if lpl.publisher_matches(publisher):
                init_msg = (u"Line {}: Checking {} article from {}, " +
                            "published in '{}'...").format(line_num, institution,
                                                           period, journal)
                oat.print_b(init_msg)
                page_content = get_landingpage_content(doi, lpl)
                if page_content is None:
                    # Landing page could not be retrieved; the fetch helper is
                    # expected to have logged the reason already.
                    continue
                pdf_link = lpl.search_for_oa(page_content)
                if pdf_link is None:
                    # BUGFIX: the "(line ..." parenthesis was never closed.
                    error_msg = (u"No PDF link found! (line {}, DOI: " +
                                 "http://doi.org/{})").format(line_num, doi)
                    logging.error(error_msg)
                elif pdf_link == "":
                    # BUGFIX: same unbalanced parenthesis as above.
                    warning_msg = (u"A RegexGroup matched, but no PDF " +
                                   "link was found! (line {}, DOI: " +
                                   "http://doi.org/{})").format(line_num, doi)
                    logging.warning(warning_msg)
                else:
                    oat.print_g(u"PDF link found: " + pdf_link)
                # Be polite: throttle requests to the publisher's server.
                time.sleep(1)
    if not bufferedHandler.buffer:
        oat.print_g("\nLookup finished, all articles were accessible")
    else:
        oat.print_r("\nLookup finished, not all articles could be accessed:\n")
    # closing will implicitly flush the handler and print any buffered
    # messages to stderr
    bufferedHandler.close()