def _get_csv_file_content(file_name, enc=None):
    result = oat.analyze_csv_file(file_name, 500)
    if result["success"]:
        csv_analysis = result["data"]
        print csv_analysis
    else:
        print result["error_msg"]
        sys.exit()
    if enc is None:
        enc = csv_analysis.enc
    if enc is None:
        print ("Error: No encoding given for CSV file and automated " +
               "detection failed. Please set the encoding manually via the " +
               "--encoding argument")
        sys.exit()
    dialect = csv_analysis.dialect
    csv_file = open(file_name, "r")
    content = []
    reader = oat.UnicodeReader(csv_file, dialect=dialect, encoding=enc)
    header = []
    if csv_analysis.has_header:
        header.append(reader.next())
    for row in reader:
        content.append(row)
    csv_file.close()
    return (header, content)
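# A minimal usage sketch for _get_csv_file_content. The file name is made up
# for illustration; the helper relies on the openapc_toolkit import ('oat')
# used throughout these scripts.
#
#   header, content = _get_csv_file_content("apc_data.csv", enc="utf-8")
#   if header:
#       print "Header row:", header[0]
#   print "Read {} data rows".format(len(content))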
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("csv_file", help=ARG_HELP_STRINGS["csv_file"])
    parser.add_argument("-b", "--bypass-cert-verification",
                        action="store_true", help=ARG_HELP_STRINGS["bypass"])
    parser.add_argument("-d", "--offline_doaj",
                        help=ARG_HELP_STRINGS["offline_doaj"])
    parser.add_argument("-e", "--encoding", help=ARG_HELP_STRINGS["encoding"])
    parser.add_argument("-f", "--force", action="store_true",
                        help=ARG_HELP_STRINGS["force"])
    parser.add_argument("-i", "--ignore-header", action="store_true",
                        help=ARG_HELP_STRINGS["ignore_header"])
    parser.add_argument("-j", "--force-header", action="store_true",
                        help=ARG_HELP_STRINGS["force_header"])
    parser.add_argument("-l", "--locale", help=ARG_HELP_STRINGS["locale"])
    parser.add_argument("-u", "--add-unknown-columns", action="store_true",
                        help=ARG_HELP_STRINGS["unknown_columns"])
    parser.add_argument("-v", "--verbose", action="store_true",
                        help=ARG_HELP_STRINGS["verbose"])
    parser.add_argument("-institution", "--institution_column", type=int,
                        help=ARG_HELP_STRINGS["institution"])
    parser.add_argument("-period", "--period_column", type=int,
                        help=ARG_HELP_STRINGS["period"])
    parser.add_argument("-doi", "--doi_column", type=int,
                        help=ARG_HELP_STRINGS["doi"])
    parser.add_argument("-euro", "--euro_column", type=int,
                        help=ARG_HELP_STRINGS["euro"])
    parser.add_argument("-is_hybrid", "--is_hybrid_column", type=int,
                        help=ARG_HELP_STRINGS["is_hybrid"])
    parser.add_argument("-publisher", "--publisher_column", type=int,
                        help=ARG_HELP_STRINGS["publisher"])
    parser.add_argument("-journal_full_title", "--journal_full_title_column",
                        type=int, help=ARG_HELP_STRINGS["journal_full_title"])
    parser.add_argument("-issn", "--issn_column", type=int,
                        help=ARG_HELP_STRINGS["issn"])
    parser.add_argument("-url", "--url_column", type=int,
                        help=ARG_HELP_STRINGS["url"])
    parser.add_argument("-start", type=int, help=ARG_HELP_STRINGS["start"])
    parser.add_argument("-end", type=int, help=ARG_HELP_STRINGS["end"])

    args = parser.parse_args()
    enc = None  # CSV file encoding

    handler = logging.StreamHandler(sys.stderr)
    handler.setFormatter(ANSIColorFormatter())
    bufferedHandler = BufferedErrorHandler(handler)
    bufferedHandler.setFormatter(ANSIColorFormatter())
    logging.root.addHandler(handler)
    logging.root.addHandler(bufferedHandler)
    logging.root.setLevel(logging.INFO)

    if args.locale:
        norm = locale.normalize(args.locale)
        if norm != args.locale:
            print "locale '{}' not found, normalized to '{}'".format(
                args.locale, norm)
        try:
            loc = locale.setlocale(locale.LC_ALL, norm)
            print "Using locale", loc
        except locale.Error as loce:
            print "Setting locale to " + norm + " failed: " + loce.message
            sys.exit()

    if args.encoding:
        try:
            codec = codecs.lookup(args.encoding)
            print ("Encoding '{}' found in Python's codec collection " +
                   "as '{}'").format(args.encoding, codec.name)
            enc = args.encoding
        except LookupError:
            print ("Error: '" + args.encoding + "' not found in Python's " +
                   "codec collection. Either look for a valid name here " +
                   "(https://docs.python.org/2/library/codecs.html#standard-" +
                   "encodings) or omit this argument to enable automated " +
                   "guessing.")
            sys.exit()

    result = oat.analyze_csv_file(args.csv_file)
    if result["success"]:
        csv_analysis = result["data"]
        print csv_analysis
    else:
        print result["error_msg"]
        sys.exit()

    if enc is None:
        enc = csv_analysis.enc
    dialect = csv_analysis.dialect
    has_header = csv_analysis.has_header or args.force_header

    if enc is None:
        print ("Error: No encoding given for CSV file and automated " +
               "detection failed. Please set the encoding manually via the " +
               "--encoding argument")
        sys.exit()

    doaj_offline_analysis = None
    if args.offline_doaj:
        if os.path.isfile(args.offline_doaj):
            doaj_offline_analysis = oat.DOAJOfflineAnalysis(args.offline_doaj)
        else:
            oat.print_r("Error: " + args.offline_doaj + " does not seem " +
                        "to be a file!")

    csv_file = open(args.csv_file, "r")
    reader = oat.UnicodeReader(csv_file, dialect=dialect, encoding=enc)

    first_row = reader.next()
    num_columns = len(first_row)
    print "\nCSV file has {} columns.".format(num_columns)

    csv_file.seek(0)
    reader = oat.UnicodeReader(csv_file, dialect=dialect, encoding=enc)

    column_map = OrderedDict([
        ("institution", CSVColumn("institution", CSVColumn.MANDATORY,
                                  args.institution_column)),
        ("period", CSVColumn("period", CSVColumn.MANDATORY,
                             args.period_column)),
        ("euro", CSVColumn("euro", CSVColumn.MANDATORY, args.euro_column)),
        ("doi", CSVColumn("doi", CSVColumn.MANDATORY, args.doi_column)),
        ("is_hybrid", CSVColumn("is_hybrid", CSVColumn.MANDATORY,
                                args.is_hybrid_column)),
        ("publisher", CSVColumn("publisher", CSVColumn.OPTIONAL,
                                args.publisher_column)),
        ("journal_full_title", CSVColumn("journal_full_title",
                                         CSVColumn.OPTIONAL,
                                         args.journal_full_title_column)),
        ("issn", CSVColumn("issn", CSVColumn.OPTIONAL, args.issn_column)),
        ("issn_print", CSVColumn("issn_print", CSVColumn.NONE, None)),
        ("issn_electronic", CSVColumn("issn_electronic", CSVColumn.NONE,
                                      None)),
        ("license_ref", CSVColumn("license_ref", CSVColumn.NONE, None)),
        ("indexed_in_crossref", CSVColumn("indexed_in_crossref",
                                          CSVColumn.NONE, None)),
        ("pmid", CSVColumn("pmid", CSVColumn.NONE, None)),
        ("pmcid", CSVColumn("pmcid", CSVColumn.NONE, None)),
        ("ut", CSVColumn("ut", CSVColumn.NONE, None)),
        ("url", CSVColumn("url", CSVColumn.OPTIONAL, args.url_column)),
        ("doaj", CSVColumn("doaj", CSVColumn.NONE, None))
    ])

    # Do not quote the values in the 'period' and 'euro' columns
    quotemask = [
        True, False, False, True, True, True, True, True, True,
        True, True, True, True, True, True, True, True,
    ]

    header = None
    if has_header:
        for row in reader:
            if not row:  # Skip empty lines
                continue
            header = row  # First non-empty row should be the header
            if args.ignore_header:
                print "Skipping header analysis due to command line argument."
                break
            else:
                print "\n *** Analyzing CSV header ***\n"
            for (index, item) in enumerate(header):
                column_type = oat.get_column_type_from_whitelist(item)
                if column_type is not None and \
                        column_map[column_type].index is None:
                    column_map[column_type].index = index
                    column_map[column_type].column_name = item
                    print ("Found column named '{}' at index {}, " +
                           "assuming this to be the {} column.").format(
                               item, index, column_type)
            break

    print "\n *** Starting heuristic analysis ***\n"
    for row in reader:
        if not row:  # Skip empty lines
            # We analyze the first non-empty line, a possible header should
            # have been processed by now.
            continue
        column_candidates = {"doi": [], "period": [], "euro": []}
        for (index, entry) in enumerate(row):
            if index in [csvcolumn.index for csvcolumn in column_map.values()]:
                # Skip columns already assigned
                continue
            entry = entry.strip()
            # Search for a DOI
            if column_map['doi'].index is None:
                if oat.DOI_RE.match(entry):
                    column_id = str(index)
                    # identify column either numerically or by column header
                    if header:
                        column_id += " ('" + header[index] + "')"
                    print ("The entry in column {} looks like a " +
                           "DOI: {}").format(column_id, entry)
                    column_candidates['doi'].append(index)
                    continue
            # Search for a potential year string
            if column_map['period'].index is None:
                try:
                    maybe_period = int(entry)
                    now = datetime.date.today().year
                    # Should be a wide enough margin
                    if maybe_period >= 2000 and maybe_period <= now + 2:
                        column_id = str(index)
                        if header:
                            column_id += " ('" + header[index] + "')"
                        print ("The entry in column {} looks like a " +
                               "potential period: {}").format(column_id,
                                                              entry)
                        column_candidates['period'].append(index)
                        continue
                except ValueError:
                    pass
            # Search for a potential monetary amount
            if column_map['euro'].index is None:
                try:
                    maybe_euro = locale.atof(entry)
                    # Are there APCs above 6000€?
                    if maybe_euro >= 10 and maybe_euro <= 6000:
                        column_id = str(index)
                        if header:
                            column_id += " ('" + header[index] + "')"
                        print ("The entry in column {} looks like a " +
                               "potential euro amount: {}").format(column_id,
                                                                   entry)
                        column_candidates['euro'].append(index)
                        continue
                except ValueError:
                    pass
        for column_type, candidates in column_candidates.iteritems():
            if column_map[column_type].index is not None:
                continue
            if len(candidates) > 1:
                print ("Could not reliably identify the '" + column_type +
                       "' column - more than one possible candidate!")
            elif len(candidates) < 1:
                print "No candidate found for column '" + column_type + "'!"
            else:
                index = candidates.pop()
                column_map[column_type].index = index
                if header:
                    column_id = header[index]
                    column_map[column_type].column_name = column_id
                else:
                    column_id = index
                print ("Assuming column '{}' to be the '{}' " +
                       "column.").format(column_id, column_type)
        break

    # Wrap up: Check if there are any mandatory column types left which have
    # not yet been identified - we cannot continue in that case (unless
    # forced).
    unassigned = filter(lambda (k, v): v.requirement == CSVColumn.MANDATORY
                        and v.index is None, column_map.iteritems())
    if unassigned:
        for item in unassigned:
            print "The {} column is still unidentified.".format(item[0])
        if header:
            print "The CSV header is:\n" + dialect.delimiter.join(header)
        if not args.force:
            print ("ERROR: We cannot continue because not all mandatory " +
                   "column types in the CSV file could be automatically " +
                   "identified. There are 2 ways to fix this:")
            if not header:
                print ("1) Add a header row to your file and identify the " +
                       "column(s) by assigning them an appropriate column " +
                       "name.")
            else:
                print ("1) Identify the missing column(s) by assigning them " +
                       "a different column name in the CSV header (You can " +
                       "use the column name(s) mentioned in the message " +
                       "above).")
            print ("2) Use command line parameters when calling this " +
                   "script to identify the missing columns (use -h for " +
                   "help).")
            sys.exit()
        else:
            print ("WARNING: Not all mandatory column types in the CSV " +
                   "file could be automatically identified - forced to " +
                   "continue.")

    print "\n *** CSV file analysis summary ***\n"

    index_dict = {csvc.index: csvc for csvc in column_map.values()}

    for index in range(num_columns):
        column_name = ""
        if header:
            column_name = header[index]
        if index in index_dict:
            column = index_dict[index]
            msg = "column number {} ({}) is the {} column '{}'".format(
                index, column_name, column.requirement, column.column_type)
            if column.requirement in [CSVColumn.MANDATORY,
                                      CSVColumn.OPTIONAL]:
                oat.print_g(msg)
            else:
                oat.print_b(msg)
        else:
            if args.add_unknown_columns:
                msg = ("column number {} ({}) is an unknown column, it " +
                       "will be appended to the generated CSV file")
                oat.print_y(msg.format(index, column_name))
                if not column_name:
                    # Use a generic name
                    column_name = "unknown"
                while column_name in column_map.keys():
                    # TODO: Replace by a numerical, increasing suffix
                    column_name += "_"
                column_map[column_name] = CSVColumn(column_name,
                                                    CSVColumn.NONE, index)
            else:
                msg = ("column number {} ({}) is an unknown column, it " +
                       "will be ignored")
                oat.print_y(msg.format(index, column_name))

    print ""

    for column in column_map.values():
        if column.index is None:
            msg = "The {} column '{}' could not be identified."
            print msg.format(column.requirement, column.column_type)

    # Check for unassigned optional column types. We can continue but should
    # issue a warning as all entries will need a valid DOI in this case.
    unassigned = filter(lambda (k, v): v.requirement == CSVColumn.OPTIONAL
                        and v.index is None, column_map.iteritems())
    if unassigned:
        print ("\nWARNING: Not all optional column types could be " +
               "identified. Metadata aggregation is still possible, but " +
               "every entry in the CSV file will need a valid DOI.")

    start = raw_input("\nStart metadata aggregation? (y/n):")
    while start not in ["y", "n"]:
        start = raw_input("Please type 'y' or 'n':")
    if start == "n":
        sys.exit()

    print "\n *** Starting metadata aggregation ***\n"

    enriched_content = []

    csv_file.seek(0)
    reader = oat.UnicodeReader(csv_file, dialect=dialect, encoding=enc)
    header_processed = False
    row_num = 0

    for row in reader:
        row_num += 1
        if not row:
            continue  # skip empty lines
        if not header_processed:
            header_processed = True
            enriched_content.append(column_map.keys())
            if has_header:
                # If the CSV file has a header, we are currently there -
                # skip it to get to the first data row
                continue
        if args.start and args.start > row_num:
            continue
        if args.end and args.end < row_num:
            continue
        print "---Processing line number " + str(row_num) + "---"
        enriched_row = oat.process_row(row, row_num, column_map, num_columns,
                                       doaj_offline_analysis,
                                       args.bypass_cert_verification)
        enriched_content.append(enriched_row)

    csv_file.close()

    with open('out.csv', 'w') as out:
        writer = oat.OpenAPCUnicodeWriter(out, quotemask, True, True)
        writer.write_rows(enriched_content)

    if not bufferedHandler.buffer:
        oat.print_g("Metadata enrichment successful, no errors occurred")
    else:
        oat.print_r("There were errors during the enrichment process:\n")
    # closing will implicitly flush the handler and print any buffered
    # messages to stderr
    bufferedHandler.close()
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("apc_file", help=ARG_HELP_STRINGS["apc_file"])
    parser.add_argument("issn_l_file", help=ARG_HELP_STRINGS["issn_l_file"])
    parser.add_argument("-e", "--encoding", help=ARG_HELP_STRINGS["encoding"])
    parser.add_argument("-q", "--quotemask",
                        help=ARG_HELP_STRINGS["quotemask"])
    parser.add_argument("-o", "--openapc_quote_rules",
                        help=ARG_HELP_STRINGS["openapc_quote_rules"],
                        action="store_true", default=False)
    args = parser.parse_args()

    quote_rules = args.openapc_quote_rules

    mask = None
    if args.quotemask:
        reduced = args.quotemask.replace("f", "").replace("t", "")
        if len(reduced) > 0:
            print ("Error: A quotemask may only contain the letters 't' " +
                   "and 'f'!")
            sys.exit()
        mask = [True if x == "t" else False for x in args.quotemask]

    enc = None
    if args.encoding:
        try:
            codec = codecs.lookup(args.encoding)
            print ("Encoding '{}' found in Python's codec collection " +
                   "as '{}'").format(args.encoding, codec.name)
            enc = args.encoding
        except LookupError:
            print ("Error: '" + args.encoding + "' not found in Python's " +
                   "codec collection. Either look for a valid name here " +
                   "(https://docs.python.org/2/library/codecs.html#standard-" +
                   "encodings) or omit this argument to enable automated " +
                   "guessing.")
            sys.exit()

    result = oat.analyze_csv_file(args.apc_file, 500)
    if result["success"]:
        csv_analysis = result["data"]
        print csv_analysis
    else:
        print result["error_msg"]
        sys.exit()

    if enc is None:
        enc = csv_analysis.enc
    if enc is None:
        print ("Error: No encoding given for CSV file and automated " +
               "detection failed. Please set the encoding manually via the " +
               "--encoding argument")
        sys.exit()

    dialect = csv_analysis.dialect

    csv_file = open(args.apc_file, "r")
    reader = oat.UnicodeReader(csv_file, dialect=dialect, encoding=enc)

    oat.print_g("Preparing mapping table...")
    itself = other = 0
    issn_l_re = re.compile(
        "^(?P<issn>\d{4}-\d{3}[\dxX])\t(?P<issn_l>\d{4}-\d{3}[\dxX])$")
    issn_l_file = open(args.issn_l_file, "r")
    issn_l_dict = {}
    for i, line in enumerate(issn_l_file):
        if i % 100000 == 0:
            print str(i) + " lines processed."
        match = issn_l_re.match(line)
        if match:
            match_dict = match.groupdict()
            issn_l_dict[match_dict['issn']] = match_dict['issn_l']
            if match_dict['issn'] == match_dict['issn_l']:
                itself += 1
            else:
                other += 1
    print (str(itself) + " ISSNs pointing to themselves as ISSN-L, " +
           str(other) + " to another value.")

    oat.print_g("Starting enrichment...")
    issn_matches = issn_p_matches = issn_e_matches = unmatched = different = 0
    enriched_lines = []
    for line in reader:
        if len(line) == 0:
            enriched_lines.append(line)
            continue
        issn = reformat_issn(line[7])
        issn_p = reformat_issn(line[8])
        issn_e = reformat_issn(line[9])
        target = None
        if issn in issn_l_dict:
            target = issn_l_dict[issn]
            line[10] = target
            issn_matches += 1
        elif issn_p in issn_l_dict:
            target = issn_l_dict[issn_p]
            line[10] = target
            issn_p_matches += 1
        elif issn_e in issn_l_dict:
            target = issn_l_dict[issn_e]
            line[10] = target
            issn_e_matches += 1
        else:
            unmatched += 1
        if target is not None and target not in [issn, issn_p, issn_e]:
            different += 1
        enriched_lines.append(line)

    msg = ("{} issn_l values mapped by issn, {} by issn_p, {} by issn_e. " +
           "{} could not be assigned.\nIn {} cases the ISSN-L was " +
           "different from all existing ISSN values.")
    print msg.format(issn_matches, issn_p_matches, issn_e_matches, unmatched,
                     different)

    with open('out.csv', 'w') as out:
        writer = oat.OpenAPCUnicodeWriter(out, mask, quote_rules, False)
        writer.write_rows(enriched_lines)
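# A small, self-contained sketch of the mapping file format the loop above
# expects: one tab-separated ISSN<TAB>ISSN-L pair per line (presumably the
# ISSN-to-ISSN-L table published by the ISSN International Centre). The
# sample pair is made up for illustration.
def _demo_issn_l_mapping_line():
    import re
    issn_l_re = re.compile(
        "^(?P<issn>\d{4}-\d{3}[\dxX])\t(?P<issn_l>\d{4}-\d{3}[\dxX])$")
    sample = "1234-5678\t2345-678X"
    match = issn_l_re.match(sample)
    if match:
        print match.group("issn") + " -> ISSN-L " + match.group("issn_l")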
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("csv_file", help=ARG_HELP_STRINGS["csv_file"])
    parser.add_argument("-e", "--encoding", help=ARG_HELP_STRINGS["encoding"])
    parser.add_argument("-q", "--quotemask",
                        help=ARG_HELP_STRINGS["quotemask"])
    parser.add_argument("-o", "--openapc_quote_rules",
                        help=ARG_HELP_STRINGS["openapc_quote_rules"],
                        action="store_true", default=False)

    subparsers = parser.add_subparsers(help='The column operation to perform')

    delete_parser = subparsers.add_parser("delete", help="delete a column")
    delete_parser.add_argument("column_index", type=int,
                               help='index of the column to delete')
    delete_parser.set_defaults(func=delete_column)

    insert_parser = subparsers.add_parser("insert", help="insert a column")
    insert_parser.add_argument("target_index", type=int,
                               help='index to insert the new column at')
    insert_parser.add_argument("column_name", help='name of the new column')
    insert_parser.add_argument("default_value",
                               help='default value for the new column')
    insert_parser.set_defaults(func=insert_column)

    move_parser = subparsers.add_parser("move", help="move a column")
    move_parser.add_argument("column_index", type=int,
                             help='index of the column to move')
    move_parser.add_argument("target_index", type=int,
                             help='index to move the column to')
    move_parser.set_defaults(func=move_column)

    copy_parser = subparsers.add_parser("copy", help="copy a column")
    copy_parser.set_defaults(func=copy)

    args = parser.parse_args()

    quote_rules = args.openapc_quote_rules

    enc = None  # CSV file encoding
    if args.encoding:
        try:
            codec = codecs.lookup(args.encoding)
            print ("Encoding '{}' found in Python's codec collection " +
                   "as '{}'").format(args.encoding, codec.name)
            enc = args.encoding
        except LookupError:
            print ("Error: '" + args.encoding + "' not found in Python's " +
                   "codec collection. Either look for a valid name here " +
                   "(https://docs.python.org/2/library/codecs.html#standard-" +
                   "encodings) or omit this argument to enable automated " +
                   "guessing.")
            sys.exit()

    result = oat.analyze_csv_file(args.csv_file, 500)
    if result["success"]:
        csv_analysis = result["data"]
        print csv_analysis
    else:
        print result["error_msg"]
        sys.exit()

    if enc is None:
        enc = csv_analysis.enc
    if enc is None:
        print ("Error: No encoding given for CSV file and automated " +
               "detection failed. Please set the encoding manually via the " +
               "--encoding argument")
        sys.exit()

    dialect = csv_analysis.dialect

    csv_file = open(args.csv_file, "r")
    reader = oat.UnicodeReader(csv_file, dialect=dialect, encoding=enc)

    new_rows = args.func(reader, args)
    csv_file.close()

    mask = None
    if args.quotemask:
        reduced = args.quotemask.replace("f", "").replace("t", "")
        if len(reduced) > 0:
            print ("Error: A quotemask may only contain the letters 't' " +
                   "and 'f'!")
            sys.exit()
        mask = [True if x == "t" else False for x in args.quotemask]

    with open('out.csv', 'w') as out:
        writer = oat.OpenAPCUnicodeWriter(out, mask, quote_rules, True)
        writer.write_rows(new_rows)
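# Hypothetical invocations of the column operations defined above. The
# script name, file name and index values are made up for illustration:
#
#   python csv_column_operations.py data.csv delete 3
#   python csv_column_operations.py data.csv insert 0 new_column NA
#   python csv_column_operations.py data.csv move 4 1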
return "URLError: {}".format(urle.reason) except ET.ParseError as etpe: return "ElementTree ParseError: {}".format(str(etpe)) parser = argparse.ArgumentParser() parser.add_argument( "doi_or_file", help= "An OpenAPC-compatible CSV file or a single DOI to look up in crossref.") args = parser.parse_args() arg = args.doi_or_file if os.path.isfile(arg): csv_file = open(arg, "r") reader = oat.UnicodeReader(csv_file) line_number = 0 for line in reader: if not line: prefix = "" else: prefix = get_prefix(line[3]) result = str(line_number) + ": " + prefix if prefix == "Springer (Biomed Central Ltd.)": oat.print_g(result) elif prefix == "Nature Publishing Group": oat.print_r(result) else: print result line_number += 1 else:
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("csv_file", help=ARG_HELP_STRINGS["csv_file"])
    parser.add_argument("-e", "--encoding", help=ARG_HELP_STRINGS["encoding"])
    parser.add_argument("-v", "--verbose", action="store_true",
                        help=ARG_HELP_STRINGS["verbose"])
    parser.add_argument("-l", "--locale", help=ARG_HELP_STRINGS["locale"])
    parser.add_argument("-i", "--ignore-header", action="store_true",
                        help=ARG_HELP_STRINGS["headers"])
    parser.add_argument("-f", "--force", action="store_true",
                        help=ARG_HELP_STRINGS["force"])
    parser.add_argument("-b", "--bypass-cert-verification",
                        action="store_true", help=ARG_HELP_STRINGS["bypass"])
    parser.add_argument("-institution", "--institution_column", type=int,
                        help=ARG_HELP_STRINGS["institution"])
    parser.add_argument("-period", "--period_column", type=int,
                        help=ARG_HELP_STRINGS["period"])
    parser.add_argument("-doi", "--doi_column", type=int,
                        help=ARG_HELP_STRINGS["doi"])
    parser.add_argument("-euro", "--euro_column", type=int,
                        help=ARG_HELP_STRINGS["euro"])
    parser.add_argument("-is_hybrid", "--is_hybrid_column", type=int,
                        help=ARG_HELP_STRINGS["is_hybrid"])
    parser.add_argument("-publisher", "--publisher_column", type=int,
                        help=ARG_HELP_STRINGS["publisher"])
    parser.add_argument("-journal_full_title", "--journal_full_title_column",
                        type=int, help=ARG_HELP_STRINGS["journal_full_title"])
    parser.add_argument("-issn", "--issn_column", type=int,
                        help=ARG_HELP_STRINGS["issn"])
    parser.add_argument("-url", "--url_column", type=int,
                        help=ARG_HELP_STRINGS["url"])

    args = parser.parse_args()
    enc = None  # CSV file encoding

    if args.locale:
        norm = locale.normalize(args.locale)
        if norm != args.locale:
            print "locale '{}' not found, normalized to '{}'".format(
                args.locale, norm)
        try:
            loc = locale.setlocale(locale.LC_ALL, norm)
            print "Using locale", loc
        except locale.Error as loce:
            print "Setting locale to " + norm + " failed: " + loce.message
            sys.exit()

    if args.encoding:
        try:
            codec = codecs.lookup(args.encoding)
            print ("Encoding '{}' found in Python's codec collection " +
                   "as '{}'").format(args.encoding, codec.name)
            enc = args.encoding
        except LookupError:
            print ("Error: '" + args.encoding + "' not found in Python's " +
                   "codec collection. Either look for a valid name here " +
                   "(https://docs.python.org/2/library/codecs.html#standard-" +
                   "encodings) or omit this argument to enable automated " +
                   "guessing.")
            sys.exit()

    result = oat.analyze_csv_file(args.csv_file)
    if result["success"]:
        csv_analysis = result["data"]
        print csv_analysis
    else:
        print result["error_msg"]
        sys.exit()

    if enc is None:
        enc = csv_analysis.enc
    dialect = csv_analysis.dialect
    has_header = csv_analysis.has_header

    if enc is None:
        print ("Error: No encoding given for CSV file and automated " +
               "detection failed. Please set the encoding manually via the " +
               "--encoding argument")
        sys.exit()

    csv_file = open(args.csv_file, "r")
    reader = oat.UnicodeReader(csv_file, dialect=dialect, encoding=enc)

    first_row = reader.next()
    num_columns = len(first_row)
    print "\nCSV file has {} columns.".format(num_columns)

    csv_file.seek(0)
    reader = oat.UnicodeReader(csv_file, dialect=dialect, encoding=enc)

    column_map = OrderedDict([
        ("institution", CSVColumn("institution", CSVColumn.MANDATORY,
                                  args.institution_column)),
        ("period", CSVColumn("period", CSVColumn.MANDATORY,
                             args.period_column)),
        ("euro", CSVColumn("euro", CSVColumn.MANDATORY, args.euro_column)),
        ("doi", CSVColumn("doi", CSVColumn.MANDATORY, args.doi_column)),
        ("is_hybrid", CSVColumn("is_hybrid", CSVColumn.MANDATORY,
                                args.is_hybrid_column)),
        ("publisher", CSVColumn("publisher", CSVColumn.OPTIONAL,
                                args.publisher_column)),
        ("journal_full_title", CSVColumn("journal_full_title",
                                         CSVColumn.OPTIONAL,
                                         args.journal_full_title_column)),
        ("issn", CSVColumn("issn", CSVColumn.OPTIONAL, args.issn_column)),
        ("issn_print", CSVColumn("issn_print", CSVColumn.NONE, None)),
        ("issn_electronic", CSVColumn("issn_electronic", CSVColumn.NONE,
                                      None)),
        ("license_ref", CSVColumn("license_ref", CSVColumn.NONE, None)),
        ("indexed_in_crossref", CSVColumn("indexed_in_crossref",
                                          CSVColumn.NONE, None)),
        ("pmid", CSVColumn("pmid", CSVColumn.NONE, None)),
        ("pmcid", CSVColumn("pmcid", CSVColumn.NONE, None)),
        ("ut", CSVColumn("ut", CSVColumn.NONE, None)),
        ("url", CSVColumn("url", CSVColumn.OPTIONAL, args.url_column)),
        ("doaj", CSVColumn("doaj", CSVColumn.NONE, None))
    ])

    # Do not quote the values in the 'period' and 'euro' columns
    quotemask = [
        True, False, False, True, True, True, True, True, True,
        True, True, True, True, True, True, True, True,
    ]

    header = None
    if has_header:
        for row in reader:
            if not row:  # Skip empty lines
                continue
            header = row  # First non-empty row should be the header
            if args.ignore_header:
                print "Skipping header analysis due to command line argument."
                break
            else:
                print "\n *** Analyzing CSV header ***\n"
            for (index, item) in enumerate(header):
                column_type = oat.get_column_type_from_whitelist(item)
                if column_type is not None and \
                        column_map[column_type].index is None:
                    column_map[column_type].index = index
                    column_map[column_type].column_name = item
                    print ("Found column named '{}' at index {}, " +
                           "assuming this to be the {} column.").format(
                               item, index, column_type)
            break

    print "\n *** Starting heuristic analysis ***\n"
    for row in reader:
        if not row:  # Skip empty lines
            # We analyze the first non-empty line, a possible header should
            # have been processed by now.
            continue
        column_candidates = {"doi": [], "period": [], "euro": []}
        for (index, entry) in enumerate(row):
            if index in [csvcolumn.index for csvcolumn in column_map.values()]:
                # Skip columns already assigned
                continue
            entry = entry.strip()
            # Search for a DOI
            if column_map['doi'].index is None:
                if oat.DOI_RE.match(entry):
                    column_id = str(index)
                    # identify column either numerically or by column header
                    if header:
                        column_id += " ('" + header[index] + "')"
                    print ("The entry in column {} looks like a " +
                           "DOI: {}").format(column_id, entry)
                    column_candidates['doi'].append(index)
                    continue
            # Search for a potential year string
            if column_map['period'].index is None:
                try:
                    maybe_period = int(entry)
                    now = datetime.date.today().year
                    # Should be a wide enough margin
                    if maybe_period >= 2000 and maybe_period <= now + 2:
                        column_id = str(index)
                        if header:
                            column_id += " ('" + header[index] + "')"
                        print ("The entry in column {} looks like a " +
                               "potential period: {}").format(column_id,
                                                              entry)
                        column_candidates['period'].append(index)
                        continue
                except ValueError:
                    pass
            # Search for a potential monetary amount
            if column_map['euro'].index is None:
                try:
                    maybe_euro = locale.atof(entry)
                    # Are there APCs above 6000€?
                    if maybe_euro >= 10 and maybe_euro <= 6000:
                        column_id = str(index)
                        if header:
                            column_id += " ('" + header[index] + "')"
                        print ("The entry in column {} looks like a " +
                               "potential euro amount: {}").format(column_id,
                                                                   entry)
                        column_candidates['euro'].append(index)
                        continue
                except ValueError:
                    pass
        for column_type, candidates in column_candidates.iteritems():
            if column_map[column_type].index is not None:
                continue
            if len(candidates) > 1:
                print ("Could not reliably identify the '" + column_type +
                       "' column - more than one possible candidate!")
            elif len(candidates) < 1:
                print "No candidate found for column '" + column_type + "'!"
            else:
                index = candidates.pop()
                column_map[column_type].index = index
                if header:
                    column_id = header[index]
                    column_map[column_type].column_name = column_id
                else:
                    column_id = index
                print ("Assuming column '{}' to be the '{}' " +
                       "column.").format(column_id, column_type)
        break

    # Wrap up: Check if there are any mandatory column types left which have
    # not yet been identified - we cannot continue in that case (unless
    # forced).
    unassigned = filter(lambda (k, v): v.requirement == CSVColumn.MANDATORY
                        and v.index is None, column_map.iteritems())
    if unassigned:
        for item in unassigned:
            print "The {} column is still unidentified.".format(item[0])
        if header:
            print "The CSV header is:\n" + dialect.delimiter.join(header)
        if not args.force:
            print ("ERROR: We cannot continue because not all mandatory " +
                   "column types in the CSV file could be automatically " +
                   "identified. There are 2 ways to fix this:")
            if not header:
                print ("1) Add a header row to your file and identify the " +
                       "column(s) by assigning them an appropriate column " +
                       "name.")
            else:
                print ("1) Identify the missing column(s) by assigning them " +
                       "a different column name in the CSV header (You can " +
                       "use the column name(s) mentioned in the message " +
                       "above).")
            print ("2) Use command line parameters when calling this " +
                   "script to identify the missing columns (use -h for " +
                   "help).")
            sys.exit()
        else:
            print ("WARNING: Not all mandatory column types in the CSV " +
                   "file could be automatically identified - forced to " +
                   "continue.")

    print "\n *** CSV file analysis summary ***\n"

    index_dict = {csvc.index: csvc for csvc in column_map.values()}

    for index in range(num_columns):
        column_name = ""
        if header:
            column_name = header[index]
        if index in index_dict:
            column = index_dict[index]
            msg = "column number {} ({}) is the {} column '{}'".format(
                index, column_name, column.requirement, column.column_type)
            if column.requirement in [CSVColumn.MANDATORY,
                                      CSVColumn.OPTIONAL]:
                oat.print_g(msg)
            else:
                oat.print_b(msg)
        else:
            msg = ("column number {} ({}) is an unknown column, it will " +
                   "be appended to the generated CSV file")
            oat.print_y(msg.format(index, column_name))
            if not column_name:
                # Use a generic name
                column_name = "unknown"
            while column_name in column_map.keys():
                # TODO: Replace by a numerical, increasing suffix
                column_name += "_"
            column_map[column_name] = CSVColumn(column_name, CSVColumn.NONE,
                                                index)

    print ""

    for column in column_map.values():
        if column.index is None:
            msg = "The {} column '{}' could not be identified."
            print msg.format(column.requirement, column.column_type)

    # Check for unassigned optional column types. We can continue but should
    # issue a warning as all entries will need a valid DOI in this case.
    unassigned = filter(lambda (k, v): v.requirement == CSVColumn.OPTIONAL
                        and v.index is None, column_map.iteritems())
    if unassigned:
        print ("\nWARNING: Not all optional column types could be " +
               "identified. Metadata aggregation is still possible, but " +
               "every entry in the CSV file will need a valid DOI.")

    start = raw_input("\nStart metadata aggregation? (y/n):")
    while start not in ["y", "n"]:
        start = raw_input("Please type 'y' or 'n':")
    if start == "n":
        sys.exit()

    print "\n *** Starting metadata aggregation ***\n"

    enriched_content = []
    error_messages = []

    csv_file.seek(0)
    reader = oat.UnicodeReader(csv_file, dialect=dialect, encoding=enc)
    header_processed = False
    row_num = 0

    for row in reader:
        row_num += 1
        if not row:
            continue  # skip empty lines
        if not header_processed:
            header_processed = True
            enriched_content.append(column_map.keys())
            if has_header:
                # If the CSV file has a header, we are currently there -
                # skip it to get to the first data row
                continue
        print "---Processing line number " + str(row_num) + "---"
        if len(row) != num_columns:
            error_msg = ("Syntax: the number of values in line {} ({}) " +
                         "differs from the number of columns ({}). Line " +
                         "left unchanged, please correct the error in the " +
                         "result file and re-run.")
            error_msg_fmt = error_msg.format(row_num, len(row), num_columns)
            error_messages.append("Line {}: {}".format(row_num,
                                                       error_msg_fmt))
            oat.print_r(error_msg_fmt)
            enriched_content.append(row)
            continue

        doi = row[column_map["doi"].index]

        current_row = OrderedDict()
        # Copy content of identified columns
        for csv_column in column_map.values():
            if csv_column.index is not None and len(row[csv_column.index]) > 0:
                if csv_column.column_type == "euro":
                    # special case for monetary values: Cast to float to
                    # ensure the decimal separator is a dot (instead of a
                    # comma)
                    euro_value = row[csv_column.index]
                    try:
                        euro = locale.atof(euro_value)
                        if euro.is_integer():
                            euro = int(euro)
                        current_row[csv_column.column_type] = str(euro)
                    except ValueError:
                        msg = ERROR_MSGS["locale"].format(euro_value,
                                                          csv_column.index)
                        oat.print_r(msg)
                        sys.exit()
                else:
                    current_row[csv_column.column_type] = row[csv_column.index]
            else:
                current_row[csv_column.column_type] = "NA"

        # include crossref metadata
        crossref_result = oat.get_metadata_from_crossref(doi)
        if crossref_result["success"]:
            print "Crossref: DOI resolved: " + doi
            current_row["indexed_in_crossref"] = "TRUE"
            data = crossref_result["data"]
            for key, value in data.iteritems():
                if value is not None:
                    if key == "journal_full_title":
                        unified_value = oat.get_unified_journal_title(value)
                        if unified_value != value:
                            msg = INFO_MSGS["unify"].format("journal title",
                                                            value,
                                                            unified_value)
                            oat.print_b(msg)
                        new_value = unified_value
                    elif key == "publisher":
                        unified_value = oat.get_unified_publisher_name(value)
                        if unified_value != value:
                            msg = INFO_MSGS["unify"].format("publisher name",
                                                            value,
                                                            unified_value)
                            oat.print_b(msg)
                        new_value = unified_value
                    else:
                        new_value = value
                else:
                    new_value = "NA"
                    if args.verbose:
                        print (u"WARNING: Element '{}' not found in " +
                               "response for doi {}.").format(key, doi)
                old_value = current_row[key]
                current_row[key] = column_map[key].check_overwrite(old_value,
                                                                   new_value)
        else:
            error_msg = ("Crossref: Error while trying to resolve DOI " +
                         doi + ": " + crossref_result["error_msg"])
            oat.print_r(error_msg)
            error_messages.append("Line {}: {}".format(row_num, error_msg))
            current_row["indexed_in_crossref"] = "FALSE"

        # include pubmed metadata
        pubmed_result = oat.get_metadata_from_pubmed(doi)
        if pubmed_result["success"]:
            print "Pubmed: DOI resolved: " + doi
            data = pubmed_result["data"]
            for key, value in data.iteritems():
                if value is not None:
                    new_value = value
                else:
                    new_value = "NA"
                    if args.verbose:
                        print (u"WARNING: Element '{}' not found in " +
                               "response for doi {}.").format(key, doi)
                old_value = current_row[key]
                current_row[key] = column_map[key].check_overwrite(old_value,
                                                                   new_value)
        else:
            error_msg = ("Pubmed: Error while trying to resolve DOI " + doi +
                         ": " + pubmed_result["error_msg"])
            oat.print_r(error_msg)
            error_messages.append("Line {}: {}".format(row_num, error_msg))

        # look up in DOAJ: try the e-ISSN first, then the ISSN and finally
        # the print ISSN
        if current_row["doaj"] != "TRUE":
            issns = []
            if current_row["issn_electronic"] != "NA":
                issns.append(current_row["issn_electronic"])
            if current_row["issn"] != "NA":
                issns.append(current_row["issn"])
            if current_row["issn_print"] != "NA":
                issns.append(current_row["issn_print"])
            for issn in issns:
                doaj_res = oat.lookup_journal_in_doaj(
                    issn, args.bypass_cert_verification)
                if doaj_res["data_received"]:
                    if doaj_res["data"]["in_doaj"]:
                        msg = "DOAJ: Journal ISSN ({}) found in DOAJ ('{}')."
                        print msg.format(issn, doaj_res["data"]["title"])
                        current_row["doaj"] = "TRUE"
                        break
                    else:
                        msg = "DOAJ: Journal ISSN ({}) not found in DOAJ."
                        current_row["doaj"] = "FALSE"
                        print msg.format(issn)
                else:
                    msg = "DOAJ: Error while trying to look up ISSN {}: {}"
                    msg_fmt = msg.format(issn, doaj_res["error_msg"])
                    oat.print_r(msg_fmt)
                    error_messages.append("Line {}: {}".format(row_num,
                                                               msg_fmt))
        enriched_content.append(current_row.values())

    csv_file.close()

    with open('out.csv', 'w') as out:
        writer = oat.OpenAPCUnicodeWriter(out, quotemask, True, True)
        writer.write_rows(enriched_content)

    if not error_messages:
        oat.print_g("Metadata enrichment successful, no errors occurred")
    else:
        oat.print_r("There were errors during the enrichment process:\n")
        for msg in error_messages:
            print msg + "\n"
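# A small, self-contained sketch of the monetary normalization applied in the
# euro special case above: locale.atof() parses numbers according to the
# active locale, and integral amounts are reduced to ints so they serialize
# without a trailing '.0'. The German locale name is an assumption and may
# not be installed on every system.
def _demo_euro_normalization():
    import locale
    locale.setlocale(locale.LC_ALL, "de_DE.UTF-8")  # assumed to be installed
    euro = locale.atof("1.234,56")  # German number format -> 1234.56
    if euro.is_integer():
        euro = int(euro)
    print str(euro)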