def parse_reference_line(ref_line, kbs, bad_titles_count=None):
    """Parse one reference line

    @input ref_line: a string representing a single reference bullet
    @input kbs: knowledge bases object, handed unchanged to
        tag_reference_line, handle_special_journals and look_for_books
    @input bad_titles_count: dict handed to tag_reference_line, whose
        returned version is passed back to the caller; when omitted a
        fresh empty dict is created for this call
    @output a tuple (splitted_citations, line_marker, counts,
        bad_titles_count) where splitted_citations is the list of parsed
        references (each a list of elements objects)
    """
    # A `{}` default is evaluated once at definition time and then shared by
    # every call that omits the argument, so anything stored in it would leak
    # from one reference line to the next. Build a new dict per call instead.
    if bad_titles_count is None:
        bad_titles_count = {}

    # Strip the 'marker' (e.g. [1]) from this reference line:
    line_marker, ref_line = remove_reference_line_marker(ref_line)
    # Find DOI sections in citation
    ref_line, identified_dois = identify_and_tag_DOI(ref_line)
    # Identify and replace URLs in the line:
    ref_line, identified_urls = identify_and_tag_URLs(ref_line)
    # Tag <cds.JOURNAL>, etc.
    tagged_line, bad_titles_count = tag_reference_line(ref_line,
                                                       kbs,
                                                       bad_titles_count)

    # Debug print tagging (authors, titles, volumes, etc.)
    write_message('* tags %r' % tagged_line, verbose=9)

    # Using the recorded information, create a MARC XML representation
    # of the rebuilt line:
    # At the same time, get stats of citations found in the reference line
    # (titles, urls, etc):
    citation_elements, line_marker, counts = \
        parse_tagged_reference_line(line_marker,
                                    tagged_line,
                                    identified_dois,
                                    identified_urls)

    # Transformations on elements (applied in this fixed order; each step
    # receives the output of the previous one)
    citation_elements = split_volume_from_journal(citation_elements)
    citation_elements = format_volume(citation_elements)
    citation_elements = handle_special_journals(citation_elements, kbs)
    citation_elements = format_report_number(citation_elements)
    citation_elements = format_author_ed(citation_elements)
    citation_elements = look_for_books(citation_elements, kbs)
    citation_elements = format_hep(citation_elements)
    citation_elements = remove_b_for_nucl_phys(citation_elements)
    citation_elements = mangle_volume(citation_elements)

    # Split the reference in multiple ones if needed
    splitted_citations = split_citations(citation_elements)

    # Remove references with only misc text
    splitted_citations = remove_invalid_references(splitted_citations)
    # Find year
    splitted_citations = add_year_elements(splitted_citations)
    # For debugging purposes
    print_citations(splitted_citations, line_marker)

    return splitted_citations, line_marker, counts, bad_titles_count
# Example #2
0
def parse_reference_line(ref_line, kbs, bad_titles_count=None):
    """Parse one reference line

    @input ref_line: a string representing a single reference bullet
    @input kbs: knowledge bases object, forwarded as-is to
        tag_reference_line, handle_special_journals and look_for_books
    @input bad_titles_count: dict given to tag_reference_line; the value
        that helper returns is handed back to the caller. Defaults to a
        new empty dict on every call.
    @output a tuple (splitted_citations, line_marker, counts,
        bad_titles_count); splitted_citations holds the parsed references
        (each a list of elements objects)
    """
    # Do not use `{}` as the default value: Python evaluates defaults once,
    # so the same dict would be reused by every call without the argument
    # and state could carry over between unrelated reference lines.
    if bad_titles_count is None:
        bad_titles_count = {}

    # Strip the 'marker' (e.g. [1]) from this reference line:
    line_marker, ref_line = remove_reference_line_marker(ref_line)
    # Find DOI sections in citation
    ref_line, identified_dois = identify_and_tag_DOI(ref_line)
    # Identify and replace URLs in the line:
    ref_line, identified_urls = identify_and_tag_URLs(ref_line)
    # Tag <cds.JOURNAL>, etc.
    tagged_line, bad_titles_count = tag_reference_line(ref_line, kbs,
                                                       bad_titles_count)

    # Debug print tagging (authors, titles, volumes, etc.)
    write_message('* tags %r' % tagged_line, verbose=9)

    # Using the recorded information, create a MARC XML representation
    # of the rebuilt line:
    # At the same time, get stats of citations found in the reference line
    # (titles, urls, etc):
    citation_elements, line_marker, counts = \
        parse_tagged_reference_line(line_marker,
                                    tagged_line,
                                    identified_dois,
                                    identified_urls)

    # Transformations on elements; the order is kept exactly as written
    # because every step consumes the result of the one before it.
    citation_elements = split_volume_from_journal(citation_elements)
    citation_elements = format_volume(citation_elements)
    citation_elements = handle_special_journals(citation_elements, kbs)
    citation_elements = format_report_number(citation_elements)
    citation_elements = format_author_ed(citation_elements)
    citation_elements = look_for_books(citation_elements, kbs)
    citation_elements = format_hep(citation_elements)
    citation_elements = remove_b_for_nucl_phys(citation_elements)
    citation_elements = mangle_volume(citation_elements)

    # Split the reference in multiple ones if needed
    splitted_citations = split_citations(citation_elements)

    # Remove references with only misc text
    splitted_citations = remove_invalid_references(splitted_citations)
    # Find year
    splitted_citations = add_year_elements(splitted_citations)
    # For debugging purposes
    print_citations(splitted_citations, line_marker)

    return splitted_citations, line_marker, counts, bad_titles_count
def extract_journal_reference(line):
    """Pull the journal element out of a raw journal reference string.

    The line is tagged with tag_reference_line (using the knowledge bases
    from get_kbs() and a throw-away bad-titles dict), parsed with
    parse_tagged_reference_line, and the parsed elements are scanned in
    order.

    Parameter: line - field 773__x, the raw journal ref
    Return: the first parsed element whose "type" is "JOURNAL"; None when
        tagging produced None or when no element has that type
    """
    tagging_result = tag_reference_line(line, get_kbs(), {})
    tagged = tagging_result[0]
    if tagged is None:
        return None

    # No line marker, DOIs or URLs are supplied for a bare journal reference.
    parsed = parse_tagged_reference_line("", tagged, [], [])
    elements, _marker, _stats = parsed

    # Lazily stop at the first journal element; fall back to None.
    return next((item for item in elements if item["type"] == "JOURNAL"), None)
def extract_journal_reference(line):
    """Find the journal element in a raw journal reference.

    Tags the line via tag_reference_line (knowledge bases come from
    get_kbs(), with an empty bad-titles dict), parses the tagged text via
    parse_tagged_reference_line, and returns the first matching element.

    Parameter: line - field 773__x, the raw journal ref
    Return: the first element whose 'type' equals 'JOURNAL', or None if
        the tagged line is None or no such element is found
    """
    tagged = tag_reference_line(line, get_kbs(), {})[0]
    if tagged is None:
        return None

    # Empty marker and no pre-identified DOIs/URLs are passed to the parser.
    parsed_elements, _unused_marker, _unused_stats = \
        parse_tagged_reference_line('', tagged, [], [])

    for candidate in parsed_elements:
        if candidate['type'] != 'JOURNAL':
            continue
        return candidate

    # Nothing of type JOURNAL was found.
    return None