コード例 #1
0
ファイル: text_normalizer.py プロジェクト: SCOAP3/invenio
def cut_references(text_lines):
    """Return the text lines with the reference section removed.

    The section is located with find_reference_section and
    find_end_of_reference_section.  When both a start and an end line are
    found, the lines from start through end (inclusive) are deleted from
    ``text_lines`` in place.  Otherwise a warning is logged and the lines
    are left untouched.

    @param text_lines: (list) of lines of the document; modified in place
        when a reference section is cut.
    @return: (list) the same ``text_lines`` object that was passed in.
    """
    ref_sect_start = find_reference_section(text_lines)
    if ref_sect_start is None:
        log.warning("Found no references to remove.")
        return text_lines

    start = ref_sect_start["start_line"]
    end = find_end_of_reference_section(text_lines, start,
                                        ref_sect_start["marker"],
                                        ref_sect_start["marker_pattern"])
    if end is None:
        # Without an end line, ``end + 1`` would raise TypeError and the
        # range to cut is unknown, so leave the text as it is.
        log.warning("Found no end of the reference section; nothing removed.")
        return text_lines

    # Slice ends are exclusive, so ``end + 1`` also removes the end line.
    del text_lines[start:end + 1]
    return text_lines
コード例 #2
0
ファイル: text.py プロジェクト: chokribr/invenio-1
def extract_references_from_fulltext(fulltext):
    """Locate the reference section of a fulltext document and extract it.

       The document is first passed through remove_page_boundary_lines
       (intended to drop page breaks, headers and footers).  The start and
       end of the reference section are then looked up and, when both are
       found, the section lines are collected with get_reference_lines.
       Each extracted string is expected to be a single reference line,
       e.g. something like:
        '[19] Wilson, A. Unpublished (1986).'

       @param fulltext: (list) of strings, whereby each string is a line of
        the document.
       @return: (tuple) ``(refs, status, how_found_start)`` where
        refs is the result of get_reference_lines, or an empty list when
        the section start or end could not be found;
        status is 0 after extraction, 4 when no section start was found and
        5 when no section end was found;
        how_found_start is always 0 in this function.
    """
    # Flag for how the reference section was found; never changed here
    how_found_start = 0

    cleaned_lines = remove_page_boundary_lines(fulltext)
    section = get_reference_section_beginning(cleaned_lines)

    # Guard: no reference section at all
    if section is None:
        write_message(
            "* extract_references_from_fulltext: "
            "ref_sect_start is None",
            verbose=2)
        return [], 4, how_found_start

    # A section start was found (however weak): look for where it ends
    first_line = section["start_line"]
    section_end = find_end_of_reference_section(
        cleaned_lines,
        first_line,
        section["marker"],
        section["marker_pattern"])

    # Guard: a section without an end is not safe to extract
    if section_end is None:
        write_message(
            "* extract_references_from_fulltext: "
            "no end to refs!",
            verbose=2)
        return [], 5, how_found_start

    # Both boundaries are known: extract the reference lines
    reference_lines = get_reference_lines(
        cleaned_lines,
        first_line,
        section_end,
        section["title_string"],
        section["marker_pattern"],
        section["title_marker_same_line"])
    return reference_lines, 0, how_found_start
コード例 #3
0
ファイル: text_normalizer.py プロジェクト: chokribr/invenio-1
def cut_references(text_lines):
    """Return the text lines with the reference section removed.

    find_reference_section supplies the start of the section and
    find_end_of_reference_section its end.  If both are available, every
    line from the start line up to and including the end line is deleted
    from ``text_lines`` in place; if either is missing, a warning is logged
    and nothing is removed.

    @param text_lines: (list) of lines of the document; modified in place
        when a reference section is cut.
    @return: (list) the same ``text_lines`` object that was passed in.
    """
    ref_sect_start = find_reference_section(text_lines)
    if ref_sect_start is None:
        log.warning("Found no references to remove.")
        return text_lines

    start = ref_sect_start["start_line"]
    end = find_end_of_reference_section(text_lines, start,
                                        ref_sect_start["marker"],
                                        ref_sect_start["marker_pattern"])
    if end is None:
        # ``None + 1`` would raise TypeError; with no known end line the
        # safe choice is to cut nothing.
        log.warning("Found no end of the reference section; nothing removed.")
        return text_lines

    # ``end + 1`` because the slice end is exclusive and the end line goes too.
    del text_lines[start:end + 1]
    return text_lines
コード例 #4
0
ファイル: text.py プロジェクト: mhellmic/b2share
def extract_references_from_fulltext(fulltext):
    """Locate the reference section of a fulltext document and extract it.

       The document is first passed through remove_page_boundary_lines
       (intended to drop page breaks, headers and footers).  The start and
       end of the reference section are then looked up and, when both are
       found, the section lines are collected with get_reference_lines.
       Each extracted string is expected to be a single reference line,
       e.g. something like:
        '[19] Wilson, A. Unpublished (1986).'

       @param fulltext: (list) of strings, whereby each string is a line of
        the document.
       @return: (tuple) ``(refs, status, how_found_start)`` where
        refs is the result of get_reference_lines, or an empty list when
        the section start or end could not be found;
        status is 0 after extraction, 4 when no section start was found and
        5 when no section end was found;
        how_found_start is always 0 in this function.
    """
    # Flag for how the reference section was found; never changed here
    how_found_start = 0

    cleaned_lines = remove_page_boundary_lines(fulltext)
    section = get_reference_section_beginning(cleaned_lines)

    # Guard: no reference section at all
    if section is None:
        write_message("* extract_references_from_fulltext: "
                      "ref_sect_start is None", verbose=2)
        return [], 4, how_found_start

    # A section start was found (however weak): look for where it ends
    first_line = section["start_line"]
    section_end = find_end_of_reference_section(
        cleaned_lines,
        first_line,
        section["marker"],
        section["marker_pattern"])

    # Guard: a section without an end is not safe to extract
    if section_end is None:
        write_message("* extract_references_from_fulltext: "
                      "no end to refs!", verbose=2)
        return [], 5, how_found_start

    # Both boundaries are known: extract the reference lines
    reference_lines = get_reference_lines(
        cleaned_lines,
        first_line,
        section_end,
        section["title_string"],
        section["marker_pattern"],
        section["title_marker_same_line"],
        section["marker"])
    return reference_lines, 0, how_found_start