Esempio n. 1
0
def main():
    """Assemble a markdown report for one institution and render it to PDF.

    Loads the OpenAPC core data, institutions table and unresolved
    duplicates file, builds the report section by section, writes it to
    report.md and converts it with pandoc/xelatex.
    """
    args = parse()
    # Load the three OpenAPC data files (utf-8, header row forced on).
    _, apc_rows = oat.get_csv_file_content("../data/apc_de.csv", "utf-8",
                                           True)
    _, ins_rows = oat.get_csv_file_content("../data/institutions.csv",
                                           "utf-8", True)
    _, dup_rows = oat.get_csv_file_content(
        "../data/unresolved_duplicates.csv", "utf-8", True)

    sig_articles, stats = find_significant_apc_differences(
        apc_rows, args.institution, args.verbose)

    # Collect the report sections in order, then join them once.
    sections = [
        generate_header(args.lang),
        generate_metadata_section(args.institution, ins_rows, stats,
                                  args.lang),
        generate_duplicates_section(args.institution, dup_rows, ins_rows,
                                    args.lang),
    ]
    if not args.no_doi_resolve_test:
        sections.append(generate_nonresolving_dois_section(
            args.institution, apc_rows, args.lang))
    sections.append(generate_apc_deviaton_section(
        args.institution, sig_articles, stats, args.lang, args.csv_output))
    report = "".join(sections)

    # File name pattern: report_<institution_slug>_<dd_MM_yy>.pdf
    ins_slug = args.institution.lower().replace(" ", "_")
    date_part = format_date(date.today(), format="dd_MM_yy")
    file_name = "report_" + ins_slug + "_" + date_part + ".pdf"
    with open("report.md", "w") as md_file:
        md_file.write(report)
    run([
        "pandoc", "report.md", "-f", "markdown", "-o", file_name,
        "--pdf-engine=xelatex"
    ])
Esempio n. 2
0
def main():
    """Split an enriched file into a core part and a transformative-agreement part.

    Rows whose is_hybrid flag (column 4) is "TRUE" and whose publisher
    (column 5) is in PUBLISHER_LIST go to the agreement file with an extra
    "agreement" column; all other rows stay in the core file. Each output
    keeps empty placeholder rows so line positions match the input.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("enriched_file", help=ARG_HELP_STRINGS["enriched_file"])
    args = parser.parse_args()

    header, content = oat.get_csv_file_content(args.enriched_file, enc="utf-8", force_header=True)
    first_header = header[0]

    core_rows = [list(first_header)]
    ta_rows = [list(first_header) + ["agreement"]]
    print(core_rows)
    print(ta_rows)

    for row in content:
        # Hybrid articles from the listed publishers belong to the TA file.
        belongs_to_ta = row[4] == "TRUE" and row[5] in PUBLISHER_LIST
        if belongs_to_ta:
            core_rows.append(list(EMPTY_LINE_CORE))
            ta_rows.append(row + [AGREEMENT_NAME])
        else:
            core_rows.append(row)
            ta_rows.append(list(EMPTY_LINE_TA))

    for target_name, rows in (("out_orig.csv", core_rows),
                              ("out_deal_wiley.csv", ta_rows)):
        with open(target_name, "w") as out:
            writer = oat.OpenAPCUnicodeWriter(out, QUOTE_MASK, True, True)
            writer.write_rows(rows)
Esempio n. 3
0
def main():
    """Normalize publisher and journal names in an APC CSV file.

    Runs every row through oat's unification tables for publisher (column 5)
    and journal_full_title (column 6), reports each change, and writes the
    result to out.csv using the requested quoting settings.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("csv_file", help=ARG_HELP_STRINGS["csv_file"])
    parser.add_argument("-e", "--encoding", help=ARG_HELP_STRINGS["encoding"])
    parser.add_argument("-q", "--quotemask", help=ARG_HELP_STRINGS["quotemask"])
    parser.add_argument("-o", "--openapc_quote_rules",
                        help=ARG_HELP_STRINGS["openapc_quote_rules"],
                        action="store_true", default=False)
    args = parser.parse_args()

    quote_rules = args.openapc_quote_rules

    # Validate an explicitly requested encoding against Python's codec registry.
    enc = None
    if args.encoding:
        try:
            codec = codecs.lookup(args.encoding)
            oat.print_g("Encoding '{}' found in Python's codec collection "
                        "as '{}'".format(args.encoding, codec.name))
            enc = args.encoding
        except LookupError:
            print("Error: '" + args.encoding + "' not found Python's "
                  "codec collection. Either look for a valid name here "
                  "(https://docs.python.org/2/library/codecs.html#standard-"
                  "encodings) or omit this argument to enable automated "
                  "guessing.")
            sys.exit()

    # A quotemask is a string of 't'/'f' flags, one per column.
    mask = None
    if args.quotemask:
        if args.quotemask.replace("f", "").replace("t", ""):
            print("Error: A quotemask may only contain the letters 't' and"
                  "'f'!")
            sys.exit()
        mask = [char == "t" for char in args.quotemask]

    header, content = oat.get_csv_file_content(args.csv_file, enc)

    for line_num, row in enumerate(content, start=1):
        old_publisher, old_journal = row[5], row[6]
        new_journal = oat.get_unified_journal_title(old_journal)
        new_publisher = oat.get_unified_publisher_name(old_publisher)
        if new_publisher != old_publisher:
            row[5] = new_publisher
            oat.print_g(u"Line {}: Updated publisher name ({} -> {})".format(
                line_num, old_publisher, new_publisher))
        if new_journal != old_journal:
            row[6] = new_journal
            oat.print_g(u"Line {}: Updated journal_full_title ({} -> {})".format(
                line_num, old_journal, new_journal))

    with open('out.csv', 'w') as out:
        writer = oat.OpenAPCUnicodeWriter(out, mask, quote_rules, True)
        writer.write_rows(header + content)
def main():
    """Count occurrences of the values in one column of a CSV file.

    Command line arguments:
        source_file   -- path to the CSV file
        count_column  -- 0-based index of the column to count
        -e/--encoding -- optional explicit file encoding
        -s/--sort     -- sort the counts in descending order
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("source_file", help=ARG_HELP_STRINGS["source_file"])
    parser.add_argument("count_column",
                        type=int,
                        help=ARG_HELP_STRINGS["count_column"])
    parser.add_argument("-e", "--encoding", help=ARG_HELP_STRINGS["encoding"])
    parser.add_argument("-s",
                        "--sort",
                        action="store_true",
                        help=ARG_HELP_STRINGS["sort"])

    args = parser.parse_args()

    enc = None
    if args.encoding:
        try:
            codec = codecs.lookup(args.encoding)
            # BUG FIX: .format() was previously applied to print()'s return
            # value (None), raising AttributeError at runtime. Format the
            # string first, then print it.
            msg = "Encoding '{}' found in Python's codec collection as '{}'"
            print(msg.format(args.encoding, codec.name))
            enc = args.encoding
        except LookupError:
            oat.print_r(
                "Error: '" + args.encoding + "' not found Python's " +
                "codec collection. Either look for a valid name here " +
                "(https://docs.python.org/2/library/codecs.html#standard-" +
                "encodings) or omit this argument to enable automated " +
                "guessing.")
            sys.exit()

    header, content = oat.get_csv_file_content(args.source_file, enc)

    # Prefer the real column name from the header line if one is present.
    column_name = "column " + str(args.count_column)
    if header:
        header_line = header[0]
        column_name = header_line[args.count_column]

    oat.print_g("Performing occurence count in column '" + column_name + "'")
    occurence_dict = OrderedDict()

    for line in content:
        try:
            value = line[args.count_column]
        except IndexError as ie:
            # BUG FIX: Python 3 exceptions have no .message attribute
            # (ie.message raised AttributeError here); use str(ie) instead.
            oat.print_y("IndexError ({}) at line {}, skipping...".format(
                str(ie), line))
            continue
        if value not in occurence_dict:
            occurence_dict[value] = 1
        else:
            occurence_dict[value] += 1

    if args.sort:
        occurence_dict = OrderedDict(
            sorted(occurence_dict.items(), key=lambda x: x[1], reverse=True))

    # BUG FIX: Python 2 print statement -> Python 3 print() function call.
    for value, count in occurence_dict.items():
        print(value + ": " + str(count))
def main():
    """Fill in missing is_hybrid values by querying the JournalTOCs API.

    Rows with a journal title but no is_hybrid flag trigger a lookup (up to
    --max_lookups); results are cached per title. With --integrate the full
    modified file is written to out.csv, otherwise only a title/status list.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("source_file", help=ARG_HELP_STRINGS["source_file"])
    parser.add_argument("journaltocs_user",
                        help=ARG_HELP_STRINGS["journaltocs_user"])
    parser.add_argument("-i", "--integrate", action="store_true",
                        help=ARG_HELP_STRINGS["integrate"])
    parser.add_argument("-m", "--max_lookups", type=int, default=100,
                        help=ARG_HELP_STRINGS["max_lookups"])
    args = parser.parse_args()

    # Cache: journal title -> hybrid status (or "NA" on a failed lookup).
    analysed_journals = {}
    lookups = 0

    header, content = oat.get_csv_file_content(args.source_file,
                                               enc="utf-8",
                                               force_header=True)
    modified_content = [list(header[0])]
    for row in content:
        if not oat.has_value(row[6]):  #journal_full_title
            modified_content.append(row)
            continue
        if not oat.has_value(row[4]):  #is_hybrid
            title = row[6]
            oat.print_y('Looking up journal {}'.format(title))
            if title in analysed_journals:
                # Cached result from an earlier row.
                row[4] = analysed_journals[title]
            elif lookups < args.max_lookups:
                status = get_hybrid_status(row, args.journaltocs_user)
                analysed_journals[title] = status if status is not None else "NA"
                lookups += 1
                row[4] = analysed_journals[title]
            else:
                oat.print_r("Maximum number of lookups reached!")
        modified_content.append(row)

    with open("out.csv", "w") as out:
        if args.integrate:
            writer = oat.OpenAPCUnicodeWriter(out, QUOTE_MASK, True, True)
            writer.write_rows(modified_content)
        else:
            out.write("journal_full_title,is_hybrid\n")
            for title, status in analysed_journals.items():
                out.write(title + "," + status + "\n")
Esempio n. 6
0
def main():
    """Count occurrences of the values in one column of a CSV file.

    Command line arguments:
        source_file   -- path to the CSV file
        count_column  -- 0-based index of the column to count
        -e/--encoding -- optional explicit file encoding
        -s/--sort     -- sort the counts in descending order
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("source_file", help=ARG_HELP_STRINGS["source_file"])
    parser.add_argument("count_column", type=int, help=ARG_HELP_STRINGS["count_column"])
    parser.add_argument("-e", "--encoding", help=ARG_HELP_STRINGS["encoding"])
    parser.add_argument("-s", "--sort", action="store_true", help=ARG_HELP_STRINGS["sort"])

    args = parser.parse_args()

    enc = None
    if args.encoding:
        try:
            codec = codecs.lookup(args.encoding)
            msg = "Encoding '{}' found in Python's codec collection as '{}'"
            print(msg.format(args.encoding, codec.name))
            enc = args.encoding
        except LookupError:
            oat.print_r("Error: '" + args.encoding + "' not found Python's " +
                        "codec collection. Either look for a valid name here " +
                        "(https://docs.python.org/2/library/codecs.html#standard-" +
                        "encodings) or omit this argument to enable automated " +
                        "guessing.")
            sys.exit()

    header, content = oat.get_csv_file_content(args.source_file, enc)

    # Prefer the real column name from the header line if one is present.
    column_name = "column " + str(args.count_column)
    if header:
        header_line = header[0]
        column_name = header_line[args.count_column]

    oat.print_g("Performing occurence count in column '" + column_name + "'")
    occurence_dict = OrderedDict()

    for line in content:
        try:
            value = line[args.count_column]
        except IndexError as ie:
            # BUG FIX: Python 3 exceptions have no .message attribute
            # (ie.message raised AttributeError here); use str(ie) instead.
            oat.print_y("IndexError ({}) at line {}, skipping...".format(str(ie), line))
            continue
        if value not in occurence_dict:
            occurence_dict[value] = 1
        else:
            occurence_dict[value] += 1

    if args.sort:
        occurence_dict = OrderedDict(sorted(occurence_dict.items(), key=lambda x: x[1],
                                            reverse=True))

    for item in occurence_dict.items():
        print(item[0] + ": " + str(item[1]))
Esempio n. 7
0
def main():
    """Check hybrid OA articles for openly accessible full texts.

    Iterates over a CSV file of APC records; for every hybrid article
    (is_hybrid == "TRUE", column 4) whose publisher matches one of the
    landing page locators in lpl_list, fetches the DOI landing page and
    searches it for an OA PDF link. Missing links are reported through the
    logging system and re-printed as a summary at the end.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("csv_file", help=ARG_HELP_STRINGS["csv_file"])
    parser.add_argument("-e", "--encoding", help=ARG_HELP_STRINGS["encoding"])
    parser.add_argument("-start", type=int, default=1, help=ARG_HELP_STRINGS["start"])
    parser.add_argument("-end", type=int, help=ARG_HELP_STRINGS["start"])
    args = parser.parse_args()

    # Errors are duplicated into a buffering handler so they can be
    # re-emitted together after the run (see bufferedHandler.close() below).
    handler = logging.StreamHandler(sys.stderr)
    handler.setFormatter(oat.ANSIColorFormatter())
    bufferedHandler = oat.BufferedErrorHandler(handler)
    bufferedHandler.setFormatter(oat.ANSIColorFormatter())
    logging.root.addHandler(handler)
    logging.root.addHandler(bufferedHandler)
    logging.root.setLevel(logging.INFO)

    # Validate an explicitly requested encoding against Python's codec registry.
    enc = None
    if args.encoding:
        try:
            codec = codecs.lookup(args.encoding)
            msg = ("Encoding '{}' found in Python's codec collection " +
                   "as '{}'").format(args.encoding, codec.name)
            oat.print_g(msg)
            enc = args.encoding
        except LookupError:
            msg = ("Error: '" + args.encoding + "' not found Python's " +
                   "codec collection. Either look for a valid name here " +
                   "(https://docs.python.org/2/library/codecs.html#standard-" +
                   "encodings) or omit this argument to enable automated " +
                   "guessing.")
            oat.print_r(msg)
            sys.exit()

    head, content = oat.get_csv_file_content(args.csv_file, enc)
    # Re-join header and data so -start/-end line numbers count the header too.
    content = head + content

    line_num = 0
    for line in content:
        line_num += 1
        # Honor the optional -start/-end line window.
        if args.start and args.start > line_num:
            continue
        if args.end and args.end < line_num:
            continue
        # Check hybrid status
        if line[4] != "TRUE":
            continue
        institution = line[0]
        period = line[1]
        doi = line[3]
        publisher = line[5]
        journal = line[6]
        # lpl_list is defined outside this function -- presumably one landing
        # page locator object per supported publisher (TODO: confirm).
        for lpl in lpl_list:
            if lpl.publisher_matches(publisher):
                init_msg = (u"Line {}: Checking {} article from {}, published in '" +
                            "{}'...").format(line_num, institution, period, journal)
                oat.print_b(init_msg)
                page_content = get_landingpage_content(doi, lpl)
                if page_content is None:
                    # Landing page could not be fetched; already reported upstream.
                    continue
                pdf_link = lpl.search_for_oa(page_content)
                if pdf_link is None:
                    error_msg = (u"No PDF link found! (line {}, DOI: " +
                                 "http://doi.org/{}").format(line_num, doi)
                    logging.error(error_msg)
                elif pdf_link == "":
                    # A locator regex group matched but captured an empty link.
                    warning_msg = (u"A RegexGroup matched, but no PDF " +
                                   "link was found! (line {}, DOI: " +
                                   "http://doi.org/{}").format(line_num, doi)
                    logging.warning(warning_msg)
                else:
                    oat.print_g(u"PDF link found: " + pdf_link)
        # Throttle requests to be polite to the publisher's servers.
        time.sleep(1)

    if not bufferedHandler.buffer:
        oat.print_g("\nLookup finished, all articles were accessible on sciencedirect")
    else:
        oat.print_r("\nLookup finished, not all articles could be accessed on sciencedirect:\n")
    # closing will implicitly flush the handler and print any buffered
    # messages to stderr
    bufferedHandler.close()
def main():
    """Check Elsevier hybrid articles for PDF links on sciencedirect.

    Resolves each DOI, verifies the landing page is on sciencedirect, and
    scans the page content with the module-level pdflink regexes for a
    full-text PDF link. Failures are collected through a buffering log
    handler and summarized at the end.

    NOTE(review): uses urllib2, so this code runs only under Python 2; a
    Python 3 port would need urllib.request (and bytes decoding) instead.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("csv_file", help=ARG_HELP_STRINGS["csv_file"])
    parser.add_argument("-e", "--encoding", help=ARG_HELP_STRINGS["encoding"])
    parser.add_argument("-start", type=int, default=1, help=ARG_HELP_STRINGS["start"])
    parser.add_argument("-end", type=int, help=ARG_HELP_STRINGS["start"])
    args = parser.parse_args()

    # Errors are duplicated into a buffering handler so they can be
    # re-emitted together after the run (see bufferedHandler.close() below).
    handler = logging.StreamHandler(sys.stderr)
    handler.setFormatter(oat.ANSIColorFormatter())
    bufferedHandler = oat.BufferedErrorHandler(handler)
    bufferedHandler.setFormatter(oat.ANSIColorFormatter())
    logging.root.addHandler(handler)
    logging.root.addHandler(bufferedHandler)
    logging.root.setLevel(logging.INFO)

    # Validate an explicitly requested encoding against Python's codec registry.
    enc = None
    if args.encoding:
        try:
            codec = codecs.lookup(args.encoding)
            msg = ("Encoding '{}' found in Python's codec collection " +
                   "as '{}'").format(args.encoding, codec.name)
            oat.print_g(msg)
            enc = args.encoding
        except LookupError:
            msg = ("Error: '" + args.encoding + "' not found Python's " +
                   "codec collection. Either look for a valid name here " +
                   "(https://docs.python.org/2/library/codecs.html#standard-" +
                   "encodings) or omit this argument to enable automated " +
                   "guessing.")
            oat.print_r(msg)
            sys.exit()

    head, content = oat.get_csv_file_content(args.csv_file, enc)
    # Re-join header and data so -start/-end line numbers count the header too.
    content = head + content

    # Spoof a browser user agent; some publishers block default urllib clients.
    header = {"User-Agent": "Mozilla/5.0 Firefox/45.0"}

    line_num = 0
    for line in content:
        line_num += 1
        # Honor the optional -start/-end line window.
        if args.start and args.start > line_num:
            continue
        if args.end and args.end < line_num:
            continue
        institution = line[0]
        period = line[1]
        doi = line[3]
        is_hybrid = line[4]
        publisher = line[5]
        journal = line[6]
        # Only hybrid Elsevier articles are of interest here.
        if publisher != "Elsevier" or is_hybrid != "TRUE":
            continue
        init_msg = (u"Line {}: Checking {} article from {}, published in " +
                    "{}...").format(line_num, institution, period, journal)
        oat.print_b(init_msg)
        url = 'http://doi.org/' + doi
        req = urllib2.Request(url, None, header)
        ret_value = {'success': True}
        try:
            response = urllib2.urlopen(req)
            # geturl() reflects the final URL after all redirects.
            target = response.geturl()
            resolve_msg = u"DOI {} resolved, led us to {}".format(doi, target)
            if "sciencedirect.com" not in target:
                oat.print_y(resolve_msg)
                oat.print_y("Journal not located at sciencedirect, skipping...")
                continue
            oat.print_b(resolve_msg)
            content_string = response.read()
            # Try the single-document pattern first, then the multi-document one.
            single_match = pdflink_re.search(content_string)
            if single_match:
                link_url = single_match.groups()[0]
                oat.print_g(u"PDF link found: " + link_url)
            else:
                multi_match = pdflink_multi_re.search(content_string)
                if multi_match:
                   link_url = multi_match.groups()[0]
                   # Un-escape HTML entities in the extracted URL.
                   link_url = link_url.replace("&amp;", "&")
                   oat.print_g(u"PDF link found (more than one document): " + link_url)
                else:
                    error_msg = (u"No PDF link found! (line {}, DOI: {}, " +
                                 "landing page: {})").format(line_num, doi, target)
                    logging.error(error_msg)
            # Throttle requests to be polite to the publisher's servers.
            time.sleep(1)
        except urllib2.HTTPError as httpe:
            code = str(httpe.getcode())
            oat.print_r("HTTPError: {} - {}".format(code, httpe.reason))
        except urllib2.URLError as urle:
            oat.print_r("URLError: {}".format(urle.reason))

    if not bufferedHandler.buffer:
        oat.print_g("\nLookup finished, all articles were accessible on sciencedirect")
    else:
        oat.print_r("\nLookup finished, not all articles could be accessed on sciencedirect:\n")
    # closing will implicitly flush the handler and print any buffered
    # messages to stderr
    bufferedHandler.close()
Esempio n. 9
0
def main():
    """Delete (or blank out) CSV rows whose value in a given column matches.

    The match set comes from a single -v value, a file of values (-f), or
    both; -i makes matching case-insensitive. Matching rows are replaced by
    empty placeholder lines unless -d requests full deletion; -r
    additionally collects the deleted rows in del.csv. Results go to out.csv.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("csv_file", help=ARG_HELP_STRINGS["csv_file"])
    parser.add_argument("index", type=int, help=ARG_HELP_STRINGS["index"])
    parser.add_argument("-v", "--value", help=ARG_HELP_STRINGS["value"])
    parser.add_argument("-f", "--file", help=ARG_HELP_STRINGS["file"])
    parser.add_argument("-d",
                        "--full_delete",
                        action="store_true",
                        help=ARG_HELP_STRINGS["full_delete"])
    parser.add_argument("-i",
                        "--ignore_case",
                        action="store_true",
                        help=ARG_HELP_STRINGS["ignore_case"])
    parser.add_argument("-r",
                        "--results_file",
                        action="store_true",
                        help=ARG_HELP_STRINGS["results_file"])
    parser.add_argument("-e", "--encoding", help=ARG_HELP_STRINGS["encoding"])
    parser.add_argument("-q",
                        "--quotemask",
                        help=ARG_HELP_STRINGS["quotemask"])
    parser.add_argument("-o",
                        "--openapc_quote_rules",
                        help=ARG_HELP_STRINGS["openapc_quote_rules"],
                        action="store_true",
                        default=False)

    args = parser.parse_args()
    if args.value is None and args.file is None:
        parser.error("Either a single value (-v option) or a file of " +
                     "multiple values (-f option) must be given.")

    # Build the set of values to match against, lowercased if -i is given.
    values = []
    if args.file:
        if not os.path.isfile(args.file):
            print("Error: '" + args.file + "' is no valid file!")
            sys.exit()
        with open(args.file, "r") as f:
            for line in f:
                if len(line) > 0:
                    value = line.strip("\r\n")
                    if args.ignore_case:
                        values.append(value.lower())
                    else:
                        values.append(value)
        oat.print_g(str(len(values)) + " values read from file")

    if args.value is not None:
        if args.ignore_case:
            values.append(args.value.lower())
        else:
            values.append(args.value)
        if args.file:
            oat.print_y("Value argument given in addition to file " +
                        "argument, adding value to file imports...")

    quote_rules = args.openapc_quote_rules

    # Validate an explicitly requested encoding against Python's codec registry.
    enc = None
    if args.encoding:
        try:
            codec = codecs.lookup(args.encoding)
            msg = "Encoding '{}' found in Python's codec collection as '{}'"
            print(msg.format(args.encoding, codec.name))
            enc = args.encoding
        except LookupError:
            print("Error: '" + args.encoding + "' not found Python's " +
                  "codec collection. Either look for a valid name here " +
                  "(https://docs.python.org/2/library/codecs.html#standard-" +
                  "encodings) or omit this argument to enable automated " +
                  "guessing.")
            sys.exit()

    header, content = oat.get_csv_file_content(args.csv_file, enc)

    # A quotemask is a string of 't'/'f' flags, one per column.
    mask = None
    if args.quotemask:
        reduced = args.quotemask.replace("f", "").replace("t", "")
        if len(reduced) > 0:
            print("Error: A quotemask may only contain the letters 't' and" +
                  "'f'!")
            sys.exit()
        mask = [True if x == "t" else False for x in args.quotemask]

    # BUG FIX: the original derived the empty placeholder line from
    # content[0] unconditionally, which raises IndexError on a file with no
    # data rows. Derive it from the header when one exists, fall back to the
    # first content row, and use an empty list as a last resort.
    column_name = "column " + str(args.index)
    if header:
        header_line = header[0]
        column_name = header_line[args.index]
        empty_line = ['' for element in header_line]
    elif content:
        empty_line = ['' for element in content[0]]
    else:
        empty_line = []
    msg = u"Performing line deletion on condition '{}' in {}".format(
        column_name, values)
    oat.print_g(msg)

    modified_content = []
    deleted_lines = []
    num_total_lines = num_deleted_lines = 0
    for line in content:
        if len(line) == 0:
            continue
        num_total_lines += 1
        current_value = line[args.index]
        if args.ignore_case:
            current_value = current_value.lower()
        if current_value not in values:
            modified_content.append(line)
        else:
            num_deleted_lines += 1
            if not args.full_delete:
                # Keep row positions stable by writing an empty placeholder.
                modified_content.append(list(empty_line))
            if args.results_file:
                deleted_lines.append(line)

    msg = u"Process complete, deleted {} out of {} total lines"
    oat.print_g(msg.format(num_deleted_lines, num_total_lines))

    with open('out.csv', 'w') as out:
        writer = oat.OpenAPCUnicodeWriter(out, mask, quote_rules, False)
        writer.write_rows(copy.deepcopy(header) + modified_content)

    if args.results_file and len(deleted_lines) > 0:
        with open('del.csv', 'w') as out:
            writer = oat.OpenAPCUnicodeWriter(out, mask, quote_rules, False)
            writer.write_rows(copy.deepcopy(header) + deleted_lines)
Esempio n. 10
0
    for name in grid_names:
        current_ratio = ratio(name, institutions_name)
        if current_ratio > highest_ratio:
            highest_ratio = current_ratio
            grid_name = name
    return grid_name, highest_ratio


def write_out_file(ins_header, ins_content):
    """Write the institutions header plus rows to out.csv, quoting no field."""
    # 7 columns, none of them quoted.
    no_quotes = [False] * 7
    with open("out.csv", "w") as out_file:
        writer = oat.OpenAPCUnicodeWriter(out_file, no_quotes, False, False)
        writer.write_rows(ins_header + ins_content)


# Load the institutions table (utf-8, header row forced, no quote handling).
ins_header, ins_content = oat.get_csv_file_content("../data/institutions.csv",
                                                   "utf-8", True, False)

# Read the full GRID dump and keep only the list of institutes.
with open("grid.json") as grid_file:
    content = grid_file.read()
    json_dict = json.loads(content)
    grid_list = json_dict["institutes"]

for index, ins in enumerate(grid_list):
    # Progress indicator: print "10%", "20%", ... once per decile of the list.
    deciles = {
        round((len(grid_list) / 10) * i): str(i * 10) + "%"
        for i in range(1, 10)
    }
    if index in deciles:
        print(deciles[index])
    # Skip GRID records that are not active institutes.
    if ins["status"] != "active":
        continue
Esempio n. 11
0
def main():
    """Resolve DOI duplicates between a new data file and an existing target file.

    For every DOI present in both files the relative cost deviation decides
    the outcome: at or below cost_tolerance the new entry is dropped as a
    true duplicate; above it both entries are removed; on an institutional
    mismatch both entries are removed and appended to the unresolved
    duplicates file. Matching rows in the contributing enriched files are
    blanked as well, and all touched files are rewritten in place.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("new_file", help=ARG_HELP_STRINGS["new_file"])
    parser.add_argument("target_file", help=ARG_HELP_STRINGS["new_file"])
    parser.add_argument('cost_tolerance', type=float, help=ARG_HELP_STRINGS["cost_tolerance"])
    parser.add_argument('enriched_files', nargs='+', help=ARG_HELP_STRINGS["enriched_files"])
    parser.add_argument('-b', '--batch', type=int, help=ARG_HELP_STRINGS["batch"])

    args = parser.parse_args()

    target_file_name = get_filename(args.target_file)
    new_file_name = get_filename(args.new_file)

    # Cache each enriched file's header/content plus a dirty flag, keyed by path.
    for path in args.enriched_files:
        if not os.path.isfile(path):
            oat.print_r('Error: "' + path + '" is no valid file path!')
            sys.exit()
        ENRICHED_FILES[path] = {"modified": False, "file_name": get_filename(path)}
        ENRICHED_FILES[path]["header"], ENRICHED_FILES[path]["content"] = oat.get_csv_file_content(path, enc="utf-8", force_header=True)

    target_header, target_content = oat.get_csv_file_content(args.target_file, enc="utf-8", force_header=True)
    new_header, new_content = oat.get_csv_file_content(args.new_file, enc="utf-8", force_header=True)
    ud_header, ud_content = oat.get_csv_file_content(UD_FILE, enc="utf-8", force_header=True)

    # Collect (new_index, target_index) pairs of rows sharing a DOI (column 3).
    duplicates = []
    target_dois = [line[3] for line in target_content]

    for new_index, line in enumerate(new_content):
        doi = line[3]
        if doi == "NA" or doi not in target_dois:
            continue
        else:
            target_index = get_duplicate_index(target_content, doi)
            duplicates.append((new_index, target_index))

    count = 0
    for pair in duplicates:
        new_line = new_content[pair[0]]
        target_line = target_content[pair[1]]
        doi = target_line[3]
        # Relative cost deviation, normalised by the larger of the two costs.
        # NOTE(review): raises ZeroDivisionError if both costs are 0 --
        # confirm that cannot occur in the input data.
        new_cost = float(new_line[2])
        target_cost = float(target_line[2])
        if new_cost >= target_cost:
            deviation = (new_cost - target_cost) / new_cost
        else:
            deviation = (target_cost - new_cost) / target_cost
        oat.print_b("Duplicate found:")
        print("In new file " + new_file_name + ":")
        print(",".join(new_line))
        print("In target file " + target_file_name + ":")
        print(",".join(target_line))
        if new_line[0] != target_line[0]:
            msg = 'Institutional mismatch "{}"/"{}". Lines will be deleted and added to the unresolved duplicates file.'
            oat.print_r(msg.format(new_line[0],target_line[0]))
            # Blank the new row, mark the target row with the REPLACEMENT
            # sentinel (filtered out below) and record both as unresolved.
            new_content[pair[0]] = list(EMPTY_LINE)
            target_content[pair[1]] = REPLACEMENT
            ud_content += [target_line]
            ud_content += [new_line]
            path, index = find_in_enriched_files(doi)
            ENRICHED_FILES[path]["content"][index] = list(EMPTY_LINE)
            ENRICHED_FILES[path]["modified"] = True
        elif deviation <= args.cost_tolerance:
            msg = "Cost deviation between {} and {} is below tolerance threshold ({} <= {}). Entries are treated as equal, only the new one will be deleted."
            oat.print_g(msg.format(new_cost, target_cost, deviation, args.cost_tolerance))
            new_content[pair[0]] = list(EMPTY_LINE)
        else:
            msg = "Cost deviation between {} and {} exceeds tolerance threshold ({} > {}). Entries are treated as different, both will be deleted."
            oat.print_y(msg.format(new_cost, target_cost, deviation, args.cost_tolerance))
            new_content[pair[0]] = list(EMPTY_LINE)
            target_content[pair[1]] = REPLACEMENT
            path, index = find_in_enriched_files(doi)
            ENRICHED_FILES[path]["content"][index] = list(EMPTY_LINE)
            ENRICHED_FILES[path]["modified"] = True
        count += 1
        # Optional batch limit: stop after -b resolved duplicates.
        if args.batch and count >= args.batch:
            break

    # Drop all rows marked for full removal from the target file.
    while REPLACEMENT in target_content:
        target_content.remove(REPLACEMENT)

    with open(args.target_file, 'w') as out:
        writer = oat.OpenAPCUnicodeWriter(out, QUOTE_MASK, True, True)
        writer.write_rows(target_header + target_content)
    with open(args.new_file, 'w') as out:
        writer = oat.OpenAPCUnicodeWriter(out, QUOTE_MASK, True, True)
        writer.write_rows(new_header + new_content)
    with open(UD_FILE, 'w') as out:
        writer = oat.OpenAPCUnicodeWriter(out, QUOTE_MASK, True, True)
        writer.write_rows(ud_header + ud_content)
    # Rewrite only those enriched files that were actually touched.
    for path, enriched_file in ENRICHED_FILES.items():
        if enriched_file["modified"]:
            with open(path, 'w') as out:
                writer = oat.OpenAPCUnicodeWriter(out, QUOTE_MASK, True, True)
                writer.write_rows(enriched_file["header"] + enriched_file["content"])
Esempio n. 12
0
def main():
    """Enrich an OpenAPC CSV file with ISSN-L values.

    Builds an ISSN -> ISSN-L mapping table from a tab-separated issn_l_file
    and fills the issn_l column (index 10) of each apc_file row, consulting
    the issn (7), issn_print (8) and issn_electronic (9) columns in that
    order of preference. The enriched result is written to out.csv.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("apc_file", help=ARG_HELP_STRINGS["apc_file"])
    parser.add_argument("issn_l_file", help=ARG_HELP_STRINGS["issn_l_file"])
    parser.add_argument("-e", "--encoding", help=ARG_HELP_STRINGS["encoding"])
    parser.add_argument("-q",
                        "--quotemask",
                        help=ARG_HELP_STRINGS["quotemask"])
    parser.add_argument("-o",
                        "--openapc_quote_rules",
                        help=ARG_HELP_STRINGS["openapc_quote_rules"],
                        action="store_true",
                        default=False)

    args = parser.parse_args()

    quote_rules = args.openapc_quote_rules

    # Optional per-column quoting mask: 't' -> quote, 'f' -> do not quote.
    mask = None
    if args.quotemask:
        reduced = args.quotemask.replace("f", "").replace("t", "")
        if len(reduced) > 0:
            print("Error: A quotemask may only contain the letters 't' and" +
                  "'f'!")
            sys.exit()
        mask = [True if x == "t" else False for x in args.quotemask]

    enc = None

    if args.encoding:
        try:
            codec = codecs.lookup(args.encoding)
            msg = "Encoding '{}' found in Python's codec collection as '{}'"
            oat.print_g(msg.format(args.encoding, codec.name))
            enc = args.encoding
        except LookupError:
            print("Error: '" + args.encoding + "' not found Python's " +
                  "codec collection. Either look for a valid name here " +
                  "(https://docs.python.org/2/library/codecs.html#standard-" +
                  "encodings) or omit this argument to enable automated " +
                  "guessing.")
            sys.exit()

    header, content = oat.get_csv_file_content(args.apc_file, enc)

    oat.print_g("Preparing mapping table...")
    itself = other = 0
    # Raw string literal avoids the invalid-escape-sequence warning that the
    # previous non-raw "\d" pattern produces on modern Python versions.
    issn_l_re = re.compile(
        r"^(?P<issn>\d{4}-\d{3}[\dxX])\t(?P<issn_l>\d{4}-\d{3}[\dxX])$")
    issn_l_dict = {}
    # Use a context manager so the (potentially very large) mapping file is
    # closed reliably -- the previous handle was never closed.
    with open(args.issn_l_file, "r") as issn_l_file:
        for i, line in enumerate(issn_l_file):
            if i % 100000 == 0:
                print(str(i) + " lines processed.")
            match = issn_l_re.match(line)
            if match:
                match_dict = match.groupdict()
                issn_l_dict[match_dict['issn']] = match_dict['issn_l']
                if match_dict['issn'] == match_dict['issn_l']:
                    itself += 1
                else:
                    other += 1
    print(
        str(itself) + " ISSNs pointing to itself as ISSN-L, " + str(other) +
        " to another value.")
    oat.print_g("Starting enrichment...")

    issn_matches = issn_p_matches = issn_e_matches = unmatched = different = corrections = 0
    enriched_lines = []
    for line in content:
        if len(line) == 0:
            enriched_lines.append(line)
            continue
        issn = reformat_issn(line[7])
        issn_p = reformat_issn(line[8])
        issn_e = reformat_issn(line[9])
        target = None
        # First match among issn / issn_p / issn_e wins; each hit is passed
        # through oat.get_corrected_issn_l() and corrections are counted.
        if issn in issn_l_dict:
            target = issn_l_dict[issn]
            corrected_target = oat.get_corrected_issn_l(target)
            if corrected_target != target:
                corrections += 1
            line[10] = corrected_target
            issn_matches += 1
        elif issn_p in issn_l_dict:
            target = issn_l_dict[issn_p]
            corrected_target = oat.get_corrected_issn_l(target)
            if corrected_target != target:
                corrections += 1
            line[10] = corrected_target
            issn_p_matches += 1
        elif issn_e in issn_l_dict:
            target = issn_l_dict[issn_e]
            corrected_target = oat.get_corrected_issn_l(target)
            if corrected_target != target:
                corrections += 1
            line[10] = corrected_target
            issn_e_matches += 1
        else:
            unmatched += 1
        if target is not None and target not in [issn, issn_p, issn_e]:
            different += 1
        enriched_lines.append(line)

    msg = ("{} issn_l values mapped by issn, {} by issn_p, {} by issn_e. {} " +
           "could not be assigned.\n{} issn_l values were corrected during " +
           "the process.\n In {} cases the ISSN-L was different from all " +
           "existing ISSN values")
    print(
        msg.format(issn_matches, issn_p_matches, issn_e_matches, unmatched,
                   corrections, different))

    with open('out.csv', 'w') as out:
        writer = oat.OpenAPCUnicodeWriter(out, mask, quote_rules, True)
        writer.write_rows(header + enriched_lines)
def main():
    """Convert monetary values in a CSV file to EUR using ECB exchange rates.

    Reads source_file, converts the value in source_column (parsed with the
    active locale) from the currency in currency_column to EUR using the
    exchange rate matching the date in period_column, and writes the result
    into target_column of out.csv. Rates are fetched lazily per currency and
    frequency; for daily rates a small look-ahead compensates for weekends
    and holidays.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("source_file")
    parser.add_argument("source_column", type=int)
    parser.add_argument("currency_column", type=int)
    parser.add_argument("period_column", type=int)
    parser.add_argument("target_column", type=int)
    parser.add_argument("-f", "--force_overwrite", action="store_true")
    parser.add_argument("-l", "--locale", help=ARG_HELP_STRINGS["locale"])
    parser.add_argument("-e", "--encoding", help=ARG_HELP_STRINGS["encoding"])
    parser.add_argument("-q", "--quotemask", help=ARG_HELP_STRINGS["quotemask"])
    parser.add_argument("-o", "--openapc_quote_rules", 
                        help=ARG_HELP_STRINGS["openapc_quote_rules"],
                        action="store_true", default=False)
    args = parser.parse_args()

    quote_rules = args.openapc_quote_rules

    # Optional per-column quoting mask: 't' -> quote, 'f' -> do not quote.
    mask = None
    if args.quotemask:
        reduced = args.quotemask.replace("f", "").replace("t", "")
        if len(reduced) > 0:
            print ("Error: A quotemask may only contain the letters 't' and"  +
                   "'f'!")
            sys.exit()
        mask = [True if x == "t" else False for x in args.quotemask]

    enc = None

    if args.encoding:
        try:
            codec = codecs.lookup(args.encoding)
            msg = "Encoding '{}' found in Python's codec collection as '{}'"
            oat.print_g(msg.format(args.encoding, codec.name))
            enc = args.encoding
        except LookupError:
            print ("Error: '" + args.encoding + "' not found Python's " +
                   "codec collection. Either look for a valid name here " +
                   "(https://docs.python.org/2/library/codecs.html#standard-" +
                   "encodings) or omit this argument to enable automated " +
                   "guessing.")
            sys.exit()

    if args.locale:
        norm = locale.normalize(args.locale)
        if norm != args.locale:
            msg = "locale '{}' not found, normalised to '{}'".format(
                  args.locale, norm)
            oat.print_y(msg)
        try:
            loc = locale.setlocale(locale.LC_ALL, norm)
            oat.print_g("Using locale " + loc)
        except locale.Error as loce:
            # Python 3 exceptions have no ".message" attribute -- the old
            # "loce.message" raised an AttributeError inside this handler.
            msg = "Setting locale to {} failed: {}".format(norm, loce)
            oat.print_r(msg)
            sys.exit()

    header, content = oat.get_csv_file_content(args.source_file, enc, True)
    fieldnames = header.pop()

    modified_content = []
    line_num = 0

    # Echo the resolved column assignments so the user can verify them
    # before committing to the conversion.
    for column_type in ["source_column", "currency_column", "period_column", "target_column"]:
        index = getattr(args, column_type)
        msg = "Column {} ('{}') is the {}."
        oat.print_g(msg.format(index, fieldnames[index], column_type))

    start = input("\nStart conversion? (y/n):")
    while start not in ["y", "n"]:
        start = input("Please type 'y' or 'n':")
    if start == "n":
        sys.exit()

    for line in content:
        line_num += 1
        if not oat.has_value(line[args.source_column]):
            oat.print_y("WARNING: No source value found in line " + str(line_num) + ", skipping...")
            modified_content.append(line)
            continue
        monetary_value = None
        try:
            monetary_value = locale.atof(line[args.source_column])
        except ValueError:
            msg = "WARNING: Could not extract a valid monetary value from source column in line {} ('{}'), skipping..."
            oat.print_y(msg.format(line_num, line[args.source_column]))
            modified_content.append(line)
            continue
        currency = line[args.currency_column]
        if currency == "EUR":
            msg = "WARNING: Currency in line {} is already EUR, skipping..."
            oat.print_y(msg.format(line_num))
            line[args.target_column] = line[args.source_column]
            modified_content.append(line)
            continue
        if not oat.has_value(currency):
            msg = "WARNING: Could not extract a valid currency indicator from currency column in line {} ('{}'), skipping..."
            oat.print_y(msg.format(line_num, currency))
            modified_content.append(line)
            continue
        period = line[args.period_column]
        frequency = get_frequency(period)
        if frequency is None:
            msg = "WARNING: Could not extract a valid date string from period column in line {} ('{}'), skipping..."
            oat.print_y(msg.format(line_num, period))
            modified_content.append(line)
            continue
        # Rates are cached per (frequency, currency) and fetched on demand.
        if currency not in EXCHANGE_RATES[frequency]:
            msg = 'No exchange rates ({}) found for currency "{}", querying ECB data warehouse...'
            oat.print_b(msg.format(frequency, currency))
            rates = oat.get_euro_exchange_rates(currency, frequency)
            EXCHANGE_RATES[frequency][currency] = rates
        rate = EXCHANGE_RATES[frequency][currency].get(period)
        if rate is None and frequency == "A":
            # The current year has no official annual average yet; fall back
            # to a preliminary one and cache it.
            rate = _calulate_preliminary_annual_average(period, currency)
            if rate:
                EXCHANGE_RATES[frequency][currency][period] = rate
        if rate is None:
            if frequency != "D":
                msg = "Error: No conversion rate found for currency {} for period {} (line {}), aborting..."
                oat.print_r(msg.format(currency, period, line_num))
                sys.exit()
            # Daily rates: skip forward over weekends/holidays, but give up
            # after 5 look-ahead days.
            day_retries = 0
            while rate is None:
                msg = "Warning: No conversion rate found for currency {} for period {} (line {}), trying next day..."
                oat.print_y(msg.format(currency, period, line_num))
                period = get_next_day(period)
                rate = EXCHANGE_RATES[frequency][currency].get(period)
                day_retries += 1
                if day_retries > 5:
                    msg = "Error: Look-ahead limit for days exceeded, aborting..."
                    oat.print_r(msg)
                    sys.exit()

        euro_value = round(monetary_value/float(rate), 2)
        line[args.target_column] = str(euro_value)

        msg = "Line {}: {} exchange rate ({}) for date {} is {} -> {} / {} = {} EUR"
        msg = msg.format(line_num, currency, frequency, period, rate, monetary_value, rate, euro_value)
        oat.print_g(msg)

        modified_content.append(line)

    with open('out.csv', 'w') as out:
        writer = oat.OpenAPCUnicodeWriter(out, mask, quote_rules, True)
        writer.write_rows([fieldnames] + modified_content)
Esempio n. 14
0
def main():
    """Delete CSV rows whose value in a given column matches a condition set.

    Match values come from -v/--value and/or a file of values (-f). Matching
    rows are either fully removed (-d) or replaced by empty rows, and the
    result is written to out.csv; with -r the deleted rows are additionally
    written to del.csv.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("csv_file", help=ARG_HELP_STRINGS["csv_file"])
    parser.add_argument("index", type=int, help=ARG_HELP_STRINGS["index"])
    parser.add_argument("-v", "--value", help=ARG_HELP_STRINGS["value"])
    parser.add_argument("-f", "--file", help=ARG_HELP_STRINGS["file"])
    parser.add_argument("-d", "--full_delete", action="store_true", help=ARG_HELP_STRINGS["full_delete"])
    parser.add_argument("-i", "--ignore_case", action="store_true", help=ARG_HELP_STRINGS["ignore_case"])
    parser.add_argument("-r", "--results_file", action="store_true", help=ARG_HELP_STRINGS["results_file"])
    parser.add_argument("-e", "--encoding", help=ARG_HELP_STRINGS["encoding"])
    parser.add_argument("-q", "--quotemask", help=ARG_HELP_STRINGS["quotemask"])
    parser.add_argument("-o", "--openapc_quote_rules", 
                        help=ARG_HELP_STRINGS["openapc_quote_rules"],
                        action="store_true", default=False)
    
    args = parser.parse_args()
    if args.value is None and args.file is None:
        parser.error("Either a single value (-v option) or a file of " +
                     "multiple values (-f option) must be given.")
    
    values = []
    if args.file:
        if not os.path.isfile(args.file):
            print("Error: '" + args.file + "' is no valid file!")
            sys.exit() 
        with open(args.file, "r") as f:
            for line in f:
                if len(line) > 0:
                    value = line.strip("\r\n")
                    # Case folding happens once at import time so the
                    # per-row comparison below stays a plain membership test.
                    if args.ignore_case:
                        values.append(value.lower())
                    else:
                        values.append(value)
        oat.print_g(str(len(values)) + " values read from file")
    
    if args.value is not None:
        if args.ignore_case:
            values.append(args.value.lower())
        else:
            values.append(args.value)
        if args.file:
            oat.print_y("Value argument given in addition to file " +
                        "argument, adding value to file imports...")
    
    quote_rules = args.openapc_quote_rules
    
    enc = None
    if args.encoding:
        try:
            codec = codecs.lookup(args.encoding)
            msg = "Encoding '{}' found in Python's codec collection as '{}'"
            print (msg.format(args.encoding, codec.name))
            enc = args.encoding
        except LookupError:
            print ("Error: '" + args.encoding + "' not found Python's " +
                   "codec collection. Either look for a valid name here " +
                   "(https://docs.python.org/2/library/codecs.html#standard-" +
                   "encodings) or omit this argument to enable automated " +
                   "guessing.")
            sys.exit()
            
    header, content = oat.get_csv_file_content(args.csv_file, enc)
        
    # Optional per-column quoting mask: 't' -> quote, 'f' -> do not quote.
    mask = None
    if args.quotemask:
        reduced = args.quotemask.replace("f", "").replace("t", "")
        if len(reduced) > 0:
            print ("Error: A quotemask may only contain the letters 't' and" +
                   "'f'!")
            sys.exit()
        mask = [True if x == "t" else False for x in args.quotemask]
    
    # Derive the empty replacement row from the header if present, otherwise
    # from the first content row. Guarding against empty content avoids the
    # IndexError the previous unconditional "content[0]" access caused on an
    # empty, headerless file (empty_line is never used in that case anyway).
    column_name = "column " + str(args.index)
    if header:
        header_line = header[0]
        column_name = header_line[args.index]
        empty_line = ['' for element in header_line]
    elif content:
        empty_line = ['' for element in content[0]]
    else:
        empty_line = []
    msg = u"Performing line deletion on condition '{}' in {}".format(column_name, values)
    oat.print_g(msg)
    
    modified_content = []
    deleted_lines = []
    num_total_lines = num_deleted_lines = 0
    for line in content:
        if len(line) == 0:
            continue
        num_total_lines += 1
        current_value = line[args.index]
        if args.ignore_case:
            current_value = current_value.lower()
        if current_value not in values:
            modified_content.append(line)
        else:
            num_deleted_lines += 1
            if not args.full_delete:
                modified_content.append(list(empty_line))
            if args.results_file:
                deleted_lines.append(line)
            
    msg = u"Process complete, deleted {} out of {} total lines"        
    oat.print_g(msg.format(num_deleted_lines, num_total_lines))
    
    with open('out.csv', 'w') as out:
        writer = oat.OpenAPCUnicodeWriter(out, mask, quote_rules, False)
        writer.write_rows(copy.deepcopy(header) + modified_content)

    if args.results_file and len(deleted_lines) > 0:
        with open('del.csv', 'w') as out:
            writer = oat.OpenAPCUnicodeWriter(out, mask, quote_rules, False)
            writer.write_rows(copy.deepcopy(header) + deleted_lines)
Esempio n. 15
0
def main():
    """Perform a column operation (delete/insert/move/copy) on a CSV file.

    The operation is chosen via an argparse subcommand which stores the
    matching handler in args.func; the handler receives (header, content,
    args) and returns the full list of rows to write to out.csv.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("csv_file", help=ARG_HELP_STRINGS["csv_file"])
    parser.add_argument("-e", "--encoding", help=ARG_HELP_STRINGS["encoding"])
    parser.add_argument("-q", "--quotemask", help=ARG_HELP_STRINGS["quotemask"])
    parser.add_argument("-o", "--openapc_quote_rules", 
                        help=ARG_HELP_STRINGS["openapc_quote_rules"],
                        action="store_true", default=False)
    subparsers = parser.add_subparsers(help='The column operation to perform')
    
    delete_parser = subparsers.add_parser("delete", help="delete help")
    delete_parser.add_argument("column_index", type=int, help='bar help')
    delete_parser.set_defaults(func=delete_column)
    
    insert_parser = subparsers.add_parser("insert", help="insert help")
    insert_parser.add_argument("target_index", type=int, help='bar help')
    insert_parser.add_argument("column_name", help='bar help')
    insert_parser.add_argument("default_value", help='bar help')
    insert_parser.set_defaults(func=insert_column)
    
    move_parser = subparsers.add_parser("move", help="move help")
    move_parser.add_argument("column_index", type=int, help='bar help')
    move_parser.add_argument("target_index", type=int, help='bar help')
    move_parser.set_defaults(func=move_column)
    
    copy_parser = subparsers.add_parser("copy", help="copy help")
    copy_parser.set_defaults(func=copy)
    
    args = parser.parse_args()
    
    # Subcommands are optional by default in Python 3's argparse; without
    # this guard a missing subcommand would crash much later with an
    # AttributeError on args.func instead of a proper usage error.
    if not hasattr(args, "func"):
        parser.error("a column operation (delete, insert, move or copy) " +
                     "must be given")
    
    quote_rules = args.openapc_quote_rules
    
    enc = None #CSV file encoding
    
    if args.encoding:
        try:
            codec = codecs.lookup(args.encoding)
            msg = "Encoding '{}' found in Python's codec collection as '{}'"
            print(msg.format(args.encoding, codec.name))
            enc = args.encoding
        except LookupError:
            print ("Error: '" + args.encoding + "' not found Python's " +
                   "codec collection. Either look for a valid name here " +
                   "(https://docs.python.org/2/library/codecs.html#standard-" +
                   "encodings) or omit this argument to enable automated " +
                   "guessing.")
            sys.exit()
    
    header, content = oat.get_csv_file_content(args.csv_file, enc)

    # Optional per-column quoting mask: 't' -> quote, 'f' -> do not quote.
    mask = None
    if args.quotemask:
        reduced = args.quotemask.replace("f", "").replace("t", "")
        if len(reduced) > 0:
            print ("Error: A quotemask may only contain the letters 't' and"  +
                   "'f'!")
            sys.exit()
        mask = [True if x == "t" else False for x in args.quotemask]
    
    new_rows = args.func(header, content, args)
    
    with open('out.csv', 'w') as out:
        writer = oat.OpenAPCUnicodeWriter(out, mask, quote_rules, True)
        writer.write_rows(new_rows)
Esempio n. 16
0
def main():
    """Rearrange the rows of a CSV file and write the result to out.csv.

    With only one file given, rows are sorted by the chosen column. With a
    second file given, rows of the first file are reordered to follow the
    row order of the second file, matching on the given column(s); unmatched
    rows are appended at the end in their original order.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("csv_file", help=ARG_HELP_STRINGS["csv_file"])
    parser.add_argument("column", type=int, help=ARG_HELP_STRINGS["column"])
    parser.add_argument("other_csv_file", nargs="?", help=ARG_HELP_STRINGS["other_csv_file"])
    parser.add_argument("other_column", type=int, nargs="?", help=ARG_HELP_STRINGS["other_column"])
    parser.add_argument("-e2", "--other_encoding", help=ARG_HELP_STRINGS["other_encoding"])
    parser.add_argument("-e", "--encoding", help=ARG_HELP_STRINGS["encoding"])
    parser.add_argument("-i", "--ignore_case", action="store_true", default=False,
                        help=ARG_HELP_STRINGS["ignore_case"])
    parser.add_argument("-q", "--quotemask", help=ARG_HELP_STRINGS["quotemask"])
    parser.add_argument("-o", "--openapc_quote_rules",
                        help=ARG_HELP_STRINGS["openapc_quote_rules"],
                        action="store_true", default=False)

    args = parser.parse_args()

    quote_rules = args.openapc_quote_rules

    encs = [] #CSV file encodings

    for encoding in [args.encoding, args.other_encoding]:
        if encoding:
            try:
                codec = codecs.lookup(encoding)
                msg = "Encoding '{}' found in Python's codec collection as '{}'"
                print(msg.format(encoding, codec.name))
            except LookupError:
                print("Error: '" + encoding + "' not found Python's " +
                      "codec collection. Either look for a valid name here " +
                      "(https://docs.python.org/2/library/codecs.html#standard-" +
                      "encodings) or omit this argument to enable automated " +
                      "guessing.")
                sys.exit()
        encs.append(encoding)

    # Optional per-column quoting mask: 't' -> quote, 'f' -> do not quote.
    mask = None
    if args.quotemask:
        reduced = args.quotemask.replace("f", "").replace("t", "")
        if reduced:
            print("Error: A quotemask may only contain the letters 't' and 'f'!")
            sys.exit()
        mask = [True if x == "t" else False for x in args.quotemask]

    header, content = oat.get_csv_file_content(args.csv_file, enc=encs[0])
    column = args.column

    if not args.other_csv_file:
        rearranged_content = header + sorted(content, key=lambda x: x[column])
    else:
        rearranged_content = []
        _, second_content = oat.get_csv_file_content(args.other_csv_file, enc=encs[1])
        other_column = column # default: use same column index as in first file
        # Explicit None check: 0 is a perfectly valid column index, but the
        # previous truthiness test silently ignored it.
        if args.other_column is not None:
            other_column = args.other_column

        for other_row in second_content:
            if args.ignore_case:
                matching_rows = [row for row in content if row[column].lower() == other_row[other_column].lower()]
            else:
                matching_rows = [row for row in content if row[column] == other_row[other_column]]
            rearranged_content += matching_rows
            # Remove matched rows so each source row is emitted at most once
            # and leftovers can be appended below.
            for matching_row in matching_rows:
                content.remove(matching_row)
        unmatched_msg = ("{} rows could not be rearranged (unmatched in second csv file) " +
                         "and were appended to the end of the result file " +
                         "in original order.")
        if content:
            oat.print_y(unmatched_msg.format(len(content)))
        else:
            oat.print_g("All rows matched.")
        rearranged_content = header + rearranged_content + content # append any unmatched rows

    with open('out.csv', 'w') as out:
        writer = oat.OpenAPCUnicodeWriter(out, mask, quote_rules, False)
        writer.write_rows(rearranged_content)
Esempio n. 17
0
def main():
    """Convert monetary values in a CSV file to EUR using yearly averages.

    Reads source_file, parses the value in source_column with the active
    locale, looks up the average conversion rate for the currency
    (currency_column) and year (period_column) in the module-level
    AVG_YEARLY_CONVERSION_RATES table, and writes the EUR value into
    target_column of out.csv.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("source_file")
    parser.add_argument("source_column", type=int)
    parser.add_argument("currency_column", type=int)
    parser.add_argument("period_column", type=int)
    parser.add_argument("target_column", type=int)
    parser.add_argument("-f", "--force_overwrite", action="store_true")
    parser.add_argument("-l", "--locale", help=ARG_HELP_STRINGS["locale"])
    parser.add_argument("-e", "--encoding", help=ARG_HELP_STRINGS["encoding"])
    parser.add_argument("-q",
                        "--quotemask",
                        help=ARG_HELP_STRINGS["quotemask"])
    parser.add_argument("-o",
                        "--openapc_quote_rules",
                        help=ARG_HELP_STRINGS["openapc_quote_rules"],
                        action="store_true",
                        default=False)
    args = parser.parse_args()

    quote_rules = args.openapc_quote_rules

    # Optional per-column quoting mask: 't' -> quote, 'f' -> do not quote.
    mask = None
    if args.quotemask:
        reduced = args.quotemask.replace("f", "").replace("t", "")
        if len(reduced) > 0:
            print("Error: A quotemask may only contain the letters 't' and" +
                  "'f'!")
            sys.exit()
        mask = [True if x == "t" else False for x in args.quotemask]

    enc = None

    if args.encoding:
        try:
            codec = codecs.lookup(args.encoding)
            msg = "Encoding '{}' found in Python's codec collection as '{}'"
            oat.print_g(msg.format(args.encoding, codec.name))
            enc = args.encoding
        except LookupError:
            print("Error: '" + args.encoding + "' not found Python's " +
                  "codec collection. Either look for a valid name here " +
                  "(https://docs.python.org/2/library/codecs.html#standard-" +
                  "encodings) or omit this argument to enable automated " +
                  "guessing.")
            sys.exit()

    if args.locale:
        norm = locale.normalize(args.locale)
        if norm != args.locale:
            msg = "locale '{}' not found, normalised to '{}'".format(
                args.locale, norm)
            oat.print_y(msg)
        try:
            loc = locale.setlocale(locale.LC_ALL, norm)
            oat.print_g("Using locale " + loc)
        except locale.Error as loce:
            # Python 3 exceptions have no ".message" attribute -- the old
            # "loce.message" raised an AttributeError inside this handler.
            msg = "Setting locale to {} failed: {}".format(norm, loce)
            oat.print_r(msg)
            sys.exit()

    header, content = oat.get_csv_file_content(args.source_file, enc)
    fieldnames = header.pop()

    modified_content = []
    line_num = 0

    # Echo the resolved column assignments so the user can verify them
    # before committing to the conversion.
    for column_type in [
            "source_column", "currency_column", "period_column",
            "target_column"
    ]:
        index = getattr(args, column_type)
        msg = "Column {} ('{}') is the {}."
        oat.print_g(msg.format(index, fieldnames[index], column_type))

    start = input("\nStart conversion? (y/n):")
    while start not in ["y", "n"]:
        start = input("Please type 'y' or 'n':")
    if start == "n":
        sys.exit()

    for line in content:
        line_num += 1
        if not oat.has_value(line[args.source_column]):
            oat.print_y("WARNING: No source value found in line " +
                        str(line_num) + ", skipping...")
            modified_content.append(line)
            continue
        monetary_value = None
        try:
            monetary_value = locale.atof(line[args.source_column])
        except ValueError:
            msg = "WARNING: Could not extract a valid monetary value from source column in line {} ('{}'), skipping..."
            oat.print_y(msg.format(line_num, line[args.source_column]))
            modified_content.append(line)
            continue
        period = line[args.period_column]
        if not oat.has_value(period) or not period.isdigit():
            msg = "WARNING: Could not extract a valid year string from period column in line {} ('{}'), skipping..."
            oat.print_y(msg.format(line_num, period))
            modified_content.append(line)
            continue
        currency = line[args.currency_column]
        if not oat.has_value(currency):
            msg = "WARNING: Could not extract a valid currency indicator from currency column in line {} ('{}'), skipping..."
            oat.print_y(msg.format(line_num, currency))
            modified_content.append(line)
            continue
        try:
            rate = AVG_YEARLY_CONVERSION_RATES[currency][period]
        except KeyError:
            msg = "ERROR: No conversion rate found for currency {} in year {} (line {}), aborting..."
            oat.print_r(msg.format(currency, period, line_num))
            sys.exit()

        euro_value = round(monetary_value / rate, 2)
        line[args.target_column] = str(euro_value)

        modified_content.append(line)

    with open('out.csv', 'w') as out:
        writer = oat.OpenAPCUnicodeWriter(out, mask, quote_rules, True)
        writer.write_rows([fieldnames] + modified_content)
Esempio n. 18
0
def main():
    """Copy values between two CSV files via a key -> value mapping.

    Builds a mapping table from the source file's key/value columns and uses
    it to fill or overwrite (-f) the value column of the target file wherever
    the key matches. With -s (strict), keys with conflicting source values
    are dropped from the mapping entirely. The result is written to out.csv.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("source_file", help=ARG_HELP_STRINGS["source_file"])
    parser.add_argument("source_file_key_column", type=int, help=ARG_HELP_STRINGS["source_file_key_column"])
    parser.add_argument("source_file_value_column", type=int, help=ARG_HELP_STRINGS["source_file_value_column"])
    parser.add_argument("target_file", help=ARG_HELP_STRINGS["target_file"])
    parser.add_argument("target_file_key_column", type=int, help=ARG_HELP_STRINGS["target_file_key_column"])
    parser.add_argument("target_file_value_column", type=int, help=ARG_HELP_STRINGS["target_file_value_column"])
    parser.add_argument("-s", "--strict", action="store_true", help=ARG_HELP_STRINGS["strict"])
    parser.add_argument("-f", "--force_overwrite", action="store_true", help=ARG_HELP_STRINGS["force_overwrite"])
    parser.add_argument("-e", "--encoding", help=ARG_HELP_STRINGS["encoding"])
    parser.add_argument("-e2", "--other_encoding", help=ARG_HELP_STRINGS["other_encoding"])
    parser.add_argument("-q", "--quotemask", help=ARG_HELP_STRINGS["quotemask"])
    parser.add_argument("-o", "--openapc_quote_rules", 
                        help=ARG_HELP_STRINGS["openapc_quote_rules"],
                        action="store_true", default=False)
    
    args = parser.parse_args()
    
    quote_rules = args.openapc_quote_rules
    
    encs = [] #CSV file encodings
    
    for encoding in [args.encoding, args.other_encoding]:
        if encoding:
            try:
                codec = codecs.lookup(encoding)
                # Format before printing: the old Python-2-style
                # print(...).format(...) called .format on print's None
                # return value and raised an AttributeError here.
                print (("Encoding '{}' found in Python's codec collection " +
                        "as '{}'").format(encoding, codec.name))
            except LookupError:
                print ("Error: '" + encoding + "' not found Python's " +
                       "codec collection. Either look for a valid name here " +
                       "(https://docs.python.org/2/library/codecs.html#standard-" +
                       "encodings) or omit this argument to enable automated " +
                       "guessing.")
                sys.exit()
        encs.append(encoding)
        
    # Optional per-column quoting mask: 't' -> quote, 'f' -> do not quote.
    mask = None
    if args.quotemask:
        reduced = args.quotemask.replace("f", "").replace("t", "")
        if len(reduced) > 0:
            print ("Error: A quotemask may only contain the letters 't' and" +
                   "'f'!")
            sys.exit()
        mask = [True if x == "t" else False for x in args.quotemask]
    
    source_header, source_content = oat.get_csv_file_content(args.source_file, enc=encs[0])
    
    key_column_name = "column " + str(args.source_file_key_column)
    value_column_name = "column " + str(args.source_file_value_column)
    if source_header:
        header = source_header[0]
        key_column_name = header[args.source_file_key_column]
        value_column_name = header[args.source_file_value_column]
    msg = u"Creating mapping table ({} -> {}) for source file {}...".format(key_column_name, value_column_name, args.source_file)
    oat.print_g(msg)
    mapping_table = {}
    ambiguous_keys = []
    for line in source_content:
        if line:
            key = line[args.source_file_key_column]
            # 'NA' is the OpenAPC placeholder for missing data, never a key.
            if key == 'NA':
                continue
            value = line[args.source_file_value_column]
            if key not in mapping_table:
                mapping_table[key] = value
            else:
                if mapping_table[key] != value:
                    if not args.strict:
                        msg = u"WARNING: Replacing existing value '{}' for key '{}' with new value '{}'".format(mapping_table[key], key, value)
                        mapping_table[key] = value
                        oat.print_y(msg)
                    else:
                        if key not in ambiguous_keys:
                            ambiguous_keys.append(key)
    if args.strict:
        # In strict mode ambiguous keys are removed instead of letting the
        # last occurrence win.
        for key in ambiguous_keys:
            del(mapping_table[key])
            msg = u"INFO: Ambiguous key '{}' dropped from mapping table".format(key)
            oat.print_b(msg)
    
    oat.print_g("mapping table created, contains " + str(len(mapping_table)) + " entries")
    
    target_header, target_content = oat.get_csv_file_content(args.target_file, enc=encs[1])
    
    
    line_num = 0 if not target_header else 1
    
    replace_msg = u"Line {}: Found matching key '{}', replaced old value '{}' by '{}'"
    modified_content = []
    for line in target_content:
        key = line[args.target_file_key_column]
        if key in mapping_table:
            new_value = mapping_table[key]
            old_value = line[args.target_file_value_column]
            if old_value != new_value:
                # Empty/'NA' cells are always filled; non-empty cells are
                # only overwritten with -f.
                if len(old_value) == 0 or old_value == "NA":
                    line[args.target_file_value_column] = new_value
                    msg = replace_msg.format(line_num, key, old_value, new_value)
                    oat.print_g(msg)
                else:
                    if args.force_overwrite:
                        line[args.target_file_value_column] = new_value
                        msg = replace_msg.format(line_num, key, old_value, new_value)
                        oat.print_y(msg)
        modified_content.append(line)
        line_num += 1
    
    with open('out.csv', 'w') as out:
        writer = oat.OpenAPCUnicodeWriter(out, mask, quote_rules, False)
        writer.write_rows(target_header + modified_content)
Esempio n. 19
0
def main():
    """Check hybrid OA articles for an accessible fulltext PDF.

    Iterates over an OpenAPC-style CSV file, resolves each hybrid
    article's DOI landing page via a matching lpl (landing page locator)
    and reports whether an OA PDF link could be found. Errors and
    warnings are logged immediately and also buffered, so a collected
    summary can be printed after the run has finished.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("csv_file", help=ARG_HELP_STRINGS["csv_file"])
    parser.add_argument("-e", "--encoding", help=ARG_HELP_STRINGS["encoding"])
    parser.add_argument("-start", type=int, default=1, help=ARG_HELP_STRINGS["start"])
    # Fix: the -end option previously reused the help text for "start".
    parser.add_argument("-end", type=int, help=ARG_HELP_STRINGS["end"])
    args = parser.parse_args()

    # Two handlers: one prints immediately to stderr, the buffering one
    # collects errors for the summary emitted at the end of the run.
    handler = logging.StreamHandler(sys.stderr)
    handler.setFormatter(oat.ANSIColorFormatter())
    bufferedHandler = oat.BufferedErrorHandler(handler)
    bufferedHandler.setFormatter(oat.ANSIColorFormatter())
    logging.root.addHandler(handler)
    logging.root.addHandler(bufferedHandler)
    logging.root.setLevel(logging.INFO)

    # Validate a user-supplied encoding name before using it; otherwise
    # leave enc as None so the CSV reader falls back to guessing.
    enc = None
    if args.encoding:
        try:
            codec = codecs.lookup(args.encoding)
            msg = "Encoding '{}' found in Python's codec collection as '{}'"
            oat.print_g(msg.format(args.encoding, codec.name))
            enc = args.encoding
        except LookupError:
            msg = ("Error: '" + args.encoding + "' not found Python's " +
                   "codec collection. Either look for a valid name here " +
                   "(https://docs.python.org/2/library/codecs.html#standard-" +
                   "encodings) or omit this argument to enable automated " +
                   "guessing.")
            oat.print_r(msg)
            sys.exit()

    head, content = oat.get_csv_file_content(args.csv_file, enc)
    # Re-join the header so line numbers below refer to file lines.
    content = head + content

    line_num = 0
    for line in content:
        line_num += 1
        if args.start and args.start > line_num:
            continue
        if args.end and args.end < line_num:
            continue
        # Check hybrid status (only hybrid articles are tested)
        if line[4] != "TRUE":
            continue
        institution = line[0]
        period = line[1]
        doi = line[3]
        publisher = line[5]
        journal = line[6]
        for lpl in lpl_list:
            if lpl.publisher_matches(publisher):
                init_msg = (u"Line {}: Checking {} article from {}, published in '" +
                            "{}'...").format(line_num, institution, period, journal)
                oat.print_b(init_msg)
                page_content = get_landingpage_content(doi, lpl)
                if page_content is None:
                    # Landing page could not be retrieved; already reported.
                    continue
                pdf_link = lpl.search_for_oa(page_content)
                if pdf_link is None:
                    error_msg = (u"No PDF link found! (line {}, DOI: " +
                                 "http://doi.org/{}").format(line_num, doi)
                    logging.error(error_msg)
                elif pdf_link == "":
                    # A locator group matched the page, but the captured
                    # link text was empty -> suspicious, warn only.
                    warning_msg = (u"A RegexGroup matched, but no PDF " +
                                   "link was found! (line {}, DOI: " +
                                   "http://doi.org/{}").format(line_num, doi)
                    logging.warning(warning_msg)
                else:
                    oat.print_g(u"PDF link found: " + pdf_link)
        time.sleep(1)  # rate-limit requests to publisher servers

    if not bufferedHandler.buffer:
        oat.print_g("\nLookup finished, all articles were accessible")
    else:
        oat.print_r("\nLookup finished, not all articles could be accessed:\n")
    # closing will implicitly flush the handler and print any buffered
    # messages to stderr
    bufferedHandler.close()
Esempio n. 20
0
def main():
    """Convert a CSV monetary column to Euro using ECB exchange rates.

    Reads a source column (numeric value, parsed with the configured
    locale), a currency column and a period (date) column, looks up the
    matching ECB exchange rate (cached in EXCHANGE_RATES, fetched on
    demand) and writes the rounded Euro value to the target column.
    The result is written to out.csv.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("source_file")
    parser.add_argument("source_column", type=int)
    parser.add_argument("currency_column", type=int)
    parser.add_argument("period_column", type=int)
    parser.add_argument("target_column", type=int)
    parser.add_argument("-f", "--force_overwrite", action="store_true")
    parser.add_argument("-l", "--locale", help=ARG_HELP_STRINGS["locale"])
    parser.add_argument("-e", "--encoding", help=ARG_HELP_STRINGS["encoding"])
    parser.add_argument("-q", "--quotemask", help=ARG_HELP_STRINGS["quotemask"])
    parser.add_argument("-o", "--openapc_quote_rules", 
                        help=ARG_HELP_STRINGS["openapc_quote_rules"],
                        action="store_true", default=False)
    args = parser.parse_args()
    
    quote_rules = args.openapc_quote_rules

    # A quotemask is a string of 't'/'f' flags, one per output column.
    mask = None
    if args.quotemask:
        reduced = args.quotemask.replace("f", "").replace("t", "")
        if len(reduced) > 0:
            print ("Error: A quotemask may only contain the letters 't' and"  +
                   "'f'!")
            sys.exit()
        mask = [True if x == "t" else False for x in args.quotemask]
    
    enc = None
    
    if args.encoding:
        try:
            codec = codecs.lookup(args.encoding)
            msg = "Encoding '{}' found in Python's codec collection as '{}'"
            oat.print_g(msg.format(args.encoding, codec.name))
            enc = args.encoding
        except LookupError:
            print ("Error: '" + args.encoding + "' not found Python's " +
                   "codec collection. Either look for a valid name here " +
                   "(https://docs.python.org/2/library/codecs.html#standard-" +
                   "encodings) or omit this argument to enable automated " +
                   "guessing.")
            sys.exit()
    
    # The locale controls how locale.atof() parses the source values
    # (decimal/thousands separators).
    if args.locale:
        norm = locale.normalize(args.locale)
        if norm != args.locale:
            msg = "locale '{}' not found, normalised to '{}'".format(
                  args.locale, norm)
            oat.print_y(msg)
        try:
            loc = locale.setlocale(locale.LC_ALL, norm)
            oat.print_g("Using locale " + loc)
        except locale.Error as loce:
            # Fix: Python 3 exceptions have no ".message" attribute;
            # format the exception itself instead.
            msg = "Setting locale to {} failed: {}".format(norm, loce)
            oat.print_r(msg)
            sys.exit()
        
    header, content = oat.get_csv_file_content(args.source_file, enc, True)
    fieldnames = header.pop()
    
    modified_content = []
    line_num = 0
    
    # Echo the column assignments so the user can verify them before
    # the conversion starts.
    for column_type in ["source_column", "currency_column", "period_column", "target_column"]:
        index = getattr(args, column_type)
        msg = "Column {} ('{}') is the {}."
        oat.print_g(msg.format(index, fieldnames[index], column_type))
    
    start = input("\nStart conversion? (y/n):")
    while start not in ["y", "n"]:
        start = input("Please type 'y' or 'n':")
    if start == "n":
        sys.exit()
    
    for line in content:
        line_num += 1
        # Lines with missing or unparseable data are kept unchanged.
        if not oat.has_value(line[args.source_column]):
            oat.print_y("WARNING: No source value found in line " + str(line_num) + ", skipping...")
            modified_content.append(line)
            continue
        monetary_value = None
        try: 
            monetary_value = locale.atof(line[args.source_column])
        except ValueError:
            msg = "WARNING: Could not extract a valid monetary value from source column in line {} ('{}'), skipping..."
            oat.print_y(msg.format(line_num, line[args.source_column]))
            modified_content.append(line)
            continue
        currency = line[args.currency_column]
        if not oat.has_value(currency):
            msg = "WARNING: Could not extract a valid currency indicator from currency column in line {} ('{}'), skipping..."
            oat.print_y(msg.format(line_num, currency))
            modified_content.append(line)
            continue
        period = line[args.period_column]
        # frequency (e.g. daily/monthly/yearly) is derived from the date
        # string format and selects which rate table to use.
        frequency = get_frequency(period)
        if frequency is None:
            msg = "WARNING: Could not extract a valid date string from period column in line {} ('{}'), skipping..."
            oat.print_y(msg.format(line_num, period))
            modified_content.append(line)
            continue
        # Fetch and cache rates for currencies not seen yet.
        if currency not in EXCHANGE_RATES[frequency]:
            msg = 'No exchange rates ({}) found for currency "{}", querying ECB data warehouse...'
            oat.print_b(msg.format(frequency, currency))
            rates = oat.get_euro_exchange_rates(currency, frequency)
            EXCHANGE_RATES[frequency][currency] = rates
        try:
            rate = EXCHANGE_RATES[frequency][currency][period]
        except KeyError:
            msg = "ERROR: No conversion rate found for currency {} for period {} (line {}), aborting..."
            oat.print_r(msg.format(currency, period, line_num))
            sys.exit()
        
        euro_value = round(monetary_value/float(rate), 2)
        line[args.target_column] = str(euro_value)
        
        msg = "Line {}: {} exchange rate ({}) for date {} is {} -> {} / {} = {} EUR"
        msg = msg.format(line_num, currency, frequency, period, rate, monetary_value, rate, euro_value)
        oat.print_g(msg)
        
        modified_content.append(line)
    
    with open('out.csv', 'w') as out:
        writer = oat.OpenAPCUnicodeWriter(out, mask, quote_rules, True)
        writer.write_rows([fieldnames] + modified_content)
Esempio n. 21
0
def main():
    """Compute pairwise Levenshtein ratios for unique values of a CSV column.

    Collects the distinct entries of the given column (in order of first
    appearance), calculates the Levenshtein ratio for every unordered
    pair and writes all pairs reaching min_ratio to out.csv, sorted by
    ratio in descending order.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("csv_file", help=ARG_HELP_STRINGS["csv_file"])
    parser.add_argument("index", type=int, help=ARG_HELP_STRINGS["index"])
    parser.add_argument("-e", "--encoding", help=ARG_HELP_STRINGS["encoding"])
    parser.add_argument("-m", "--min_ratio", type=float, help=ARG_HELP_STRINGS["min_ratio"],
                        default=0.0)

    args = parser.parse_args()

    if args.min_ratio < 0.0 or args.min_ratio > 1.0:
        oat.print_r("Error: min_ratio parameter must be a float between 0.0 and 1.0")
        sys.exit()

    enc = None
    if args.encoding:
        try:
            codec = codecs.lookup(args.encoding)
            msg = "Encoding '{}' found in Python's codec collection as '{}'"
            print(msg.format(args.encoding, codec.name))
            enc = args.encoding
        except LookupError:
            print("Error: '" + args.encoding + "' not found Python's " +
                  "codec collection. Either look for a valid name here " +
                  "(https://docs.python.org/2/library/codecs.html#standard-" +
                  "encodings) or omit this argument to enable automated " +
                  "guessing.")
            sys.exit()

    header, content = oat.get_csv_file_content(args.csv_file, enc)
    header = header.pop()

    # Keep a parallel set for O(1) membership tests (the previous list
    # lookup made deduplication quadratic); the list preserves first-seen
    # order for the pairing loop below.
    entities = []
    seen = set()
    line_num = 0
    msg = "Processed {} entries in column '{}', {} unique entities found."
    last_msg = None
    for line in content:
        line_num += 1
        value = line[args.index]
        if value not in seen:
            seen.add(value)
            entities.append(value)
        if line_num == len(content) or line_num % 100 == 0:
            last_msg = msg.format(line_num, header[args.index], len(entities))
            print(last_msg, end="\r")
    print(last_msg)

    sim_pairs = []
    # n*(n+1)/2 unordered pairs for n+1 entities.
    n = len(entities) - 1
    num_pairs = int((n*n + n) / 2)
    msg = ("Calculated Levenshtein ratio for {} out of {} possible entity combinations ({}%), " +
           "{} have passed the minimum ratio so far.")
    last_msg = None
    num_calcs = 0
    # Pop the head entity and pair it with every remaining one, so each
    # unordered pair is visited exactly once.
    while entities:
        first_part = entities.pop(0)
        for second_part in entities:
            lev_ratio = ratio(first_part, second_part)
            num_calcs += 1
            if lev_ratio >= args.min_ratio:
                sim_pairs.append([first_part, second_part, str(lev_ratio)])
            if num_calcs == num_pairs or num_calcs % 100 == 0:
                last_msg = msg.format(num_calcs, num_pairs, round(num_calcs/num_pairs * 100, 1),
                                      len(sim_pairs))
                print(last_msg, end="\r")
    print(last_msg)

    # Fix: sort numerically. Sorting by the stored string representation
    # misorders values rendered in scientific notation (e.g. '1e-05').
    sim_pairs.sort(key=lambda pair: float(pair[2]), reverse=True)
    sim_pairs.insert(0, ["first_item", "second_item", "levenshtein_ratio"])
    with open("out.csv", "w") as out_file:
        writer = oat.OpenAPCUnicodeWriter(out_file)
        writer.write_rows(sim_pairs)
Esempio n. 22
0
def main():
    """Sort or rearrange a CSV file by one of its columns.

    Without a second file, the rows are simply sorted by the given
    column. With a second file, rows are reordered to match the order of
    the corresponding key column in that file; rows without a match are
    appended at the end in their original order. The result is written
    to out.csv.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("csv_file", help=ARG_HELP_STRINGS["csv_file"])
    parser.add_argument("column", type=int, help=ARG_HELP_STRINGS["column"])
    parser.add_argument("other_csv_file",
                        nargs="?",
                        help=ARG_HELP_STRINGS["other_csv_file"])
    parser.add_argument("other_column",
                        type=int,
                        nargs="?",
                        help=ARG_HELP_STRINGS["other_column"])
    parser.add_argument("-e2",
                        "--other_encoding",
                        help=ARG_HELP_STRINGS["other_encoding"])
    parser.add_argument("-e", "--encoding", help=ARG_HELP_STRINGS["encoding"])
    parser.add_argument("-i",
                        "--ignore_case",
                        action="store_true",
                        default=False,
                        help=ARG_HELP_STRINGS["ignore_case"])
    parser.add_argument("-q",
                        "--quotemask",
                        help=ARG_HELP_STRINGS["quotemask"])
    parser.add_argument("-o",
                        "--openapc_quote_rules",
                        help=ARG_HELP_STRINGS["openapc_quote_rules"],
                        action="store_true",
                        default=False)

    args = parser.parse_args()

    quote_rules = args.openapc_quote_rules

    encs = []  #CSV file encodings

    # Validate both optional encoding names up front.
    for encoding in [args.encoding, args.other_encoding]:
        if encoding:
            try:
                codec = codecs.lookup(encoding)
                msg = "Encoding '{}' found in Python's codec collection as '{}'"
                print(msg.format(encoding, codec.name))
            except LookupError:
                print(
                    "Error: '" + encoding + "' not found Python's " +
                    "codec collection. Either look for a valid name here " +
                    "(https://docs.python.org/2/library/codecs.html#standard-"
                    + "encodings) or omit this argument to enable automated " +
                    "guessing.")
                sys.exit()
        encs.append(encoding)

    # A quotemask is a string of 't'/'f' flags, one per output column.
    mask = None
    if args.quotemask:
        reduced = args.quotemask.replace("f", "").replace("t", "")
        if reduced:
            print(
                "Error: A quotemask may only contain the letters 't' and 'f'!")
            sys.exit()
        mask = [True if x == "t" else False for x in args.quotemask]

    header, content = oat.get_csv_file_content(args.csv_file, enc=encs[0])
    column = args.column

    if not args.other_csv_file:
        rearranged_content = header + sorted(content, key=lambda x: x[column])
    else:
        rearranged_content = []
        _, second_content = oat.get_csv_file_content(args.other_csv_file,
                                                     enc=encs[1])
        other_column = column  # default: use same column index as in first file
        # Fix: test against None; a plain truthiness check wrongly
        # discarded an explicit other_column of 0 (first column).
        if args.other_column is not None:
            other_column = args.other_column

        # Move matched rows out of `content` so duplicates are consumed
        # once and leftovers can be appended at the end.
        for other_row in second_content:
            if args.ignore_case:
                matching_rows = [
                    row for row in content
                    if row[column].lower() == other_row[other_column].lower()
                ]
            else:
                matching_rows = [
                    row for row in content
                    if row[column] == other_row[other_column]
                ]
            rearranged_content += matching_rows
            for matching_row in matching_rows:
                content.remove(matching_row)
        unmatched_msg = (
            "{} rows could not be rearranged (unmatched in second csv file) " +
            "and were appended to the end of the result file " +
            "in original order.")
        if content:
            oat.print_y(unmatched_msg.format(len(content)))
        else:
            oat.print_g("All rows matched.")
        rearranged_content = header + rearranged_content + content  # append any unmatched rows

    with open('out.csv', 'w') as out:
        writer = oat.OpenAPCUnicodeWriter(out, mask, quote_rules, False)
        writer.write_rows(rearranged_content)
Esempio n. 23
0
def main():
    """Enrich an OpenAPC APC file with ISSN-L values.

    Builds an ISSN -> ISSN-L mapping from the issn_l_file (one
    tab-separated "ISSN<TAB>ISSN-L" pair per line) and fills column 10
    (issn_l) of the APC file, trying the issn, issn_print and
    issn_electronic columns in that order. The enriched table is written
    to out.csv.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("apc_file", help=ARG_HELP_STRINGS["apc_file"])
    parser.add_argument("issn_l_file", help=ARG_HELP_STRINGS["issn_l_file"])
    parser.add_argument("-e", "--encoding", help=ARG_HELP_STRINGS["encoding"])
    parser.add_argument("-q", "--quotemask", help=ARG_HELP_STRINGS["quotemask"])
    parser.add_argument("-o", "--openapc_quote_rules", 
                        help=ARG_HELP_STRINGS["openapc_quote_rules"],
                        action="store_true", default=False)
    
    args = parser.parse_args()
    
    quote_rules = args.openapc_quote_rules

    # A quotemask is a string of 't'/'f' flags, one per output column.
    mask = None
    if args.quotemask:
        reduced = args.quotemask.replace("f", "").replace("t", "")
        if len(reduced) > 0:
            print ("Error: A quotemask may only contain the letters 't' and"  +
                   "'f'!")
            sys.exit()
        mask = [True if x == "t" else False for x in args.quotemask]
    
    enc = None
    
    if args.encoding:
        try:
            codec = codecs.lookup(args.encoding)
            msg = "Encoding '{}' found in Python's codec collection as '{}'"
            oat.print_g(msg.format(args.encoding, codec.name))
            enc = args.encoding
        except LookupError:
            print ("Error: '" + args.encoding + "' not found Python's " +
                   "codec collection. Either look for a valid name here " +
                   "(https://docs.python.org/2/library/codecs.html#standard-" +
                   "encodings) or omit this argument to enable automated " +
                   "guessing.")
            sys.exit()
        
    header, content = oat.get_csv_file_content(args.apc_file, enc)
    
    oat.print_g("Preparing mapping table...")
    itself = other = 0
    # Fix: raw string — "\d" in a plain string is an invalid escape
    # sequence (SyntaxWarning on modern Python).
    issn_l_re = re.compile(r"^(?P<issn>\d{4}-\d{3}[\dxX])\t(?P<issn_l>\d{4}-\d{3}[\dxX])$")
    issn_l_dict = {}
    # Fix: with-statement closes the (potentially very large) mapping
    # file, which was previously never closed.
    with open(args.issn_l_file, "r") as issn_l_file:
        for i, line in enumerate(issn_l_file):
            if i % 100000 == 0:
                print(str(i) + " lines processed.")
            match = issn_l_re.match(line)
            if match:
                match_dict = match.groupdict()
                issn_l_dict[match_dict['issn']] = match_dict['issn_l']
                if match_dict['issn'] == match_dict['issn_l']:
                    itself += 1
                else:
                    other += 1
    print(str(itself) + " ISSNs pointing to itself as ISSN-L, " + str(other) + " to another value.")
    oat.print_g("Starting enrichment...")
    
    matches = {"issn": 0, "issn_p": 0, "issn_e": 0}
    unmatched = different = corrections = 0
    enriched_lines = []
    for line in content:
        if len(line) == 0:
            enriched_lines.append(line)
            continue
        issn = reformat_issn(line[7])
        issn_p = reformat_issn(line[8])
        issn_e = reformat_issn(line[9])
        target = None
        # Try the ISSN variants in order of preference; the first one
        # present in the mapping table wins (same logic for each, so the
        # previously duplicated branches are folded into one loop).
        for issn_type, value in [("issn", issn), ("issn_p", issn_p), ("issn_e", issn_e)]:
            if value in issn_l_dict:
                target = issn_l_dict[value]
                corrected_target = oat.get_corrected_issn_l(target)
                if corrected_target != target:
                    corrections += 1
                line[10] = corrected_target
                matches[issn_type] += 1
                break
        else:
            unmatched += 1
        if target is not None and target not in [issn, issn_p, issn_e]:
            different += 1
        enriched_lines.append(line)
    
    msg = ("{} issn_l values mapped by issn, {} by issn_p, {} by issn_e. {} " +
           "could not be assigned.\n{} issn_l values were corrected during " +
           "the process.\n In {} cases the ISSN-L was different from all " +
           "existing ISSN values")
    print(msg.format(matches["issn"], matches["issn_p"], matches["issn_e"],
                     unmatched, corrections, different))

    with open('out.csv', 'w') as out:
        writer = oat.OpenAPCUnicodeWriter(out, mask, quote_rules, True)
        writer.write_rows(header + enriched_lines)