def cut_references(text_lines):
    """Return the text lines with the reference section cut out.

    The start of the reference section is looked up with
    find_reference_section() and its end with
    find_end_of_reference_section(). The lines from the start line up to
    and including the end line are then deleted from text_lines in place.

    When no reference section is found, or when no end of the section can
    be determined, a warning is logged and text_lines is returned
    untouched.

    @param text_lines: mutable sequence of document lines (sliced with
        ``del``, so it is modified in place).
    @return: the very same text_lines object that was passed in.
    """
    ref_sect_start = find_reference_section(text_lines)
    if ref_sect_start is None:
        log.warning("Found no references to remove.")
        return text_lines

    start = ref_sect_start["start_line"]
    end = find_end_of_reference_section(text_lines,
                                        start,
                                        ref_sect_start["marker"],
                                        ref_sect_start["marker_pattern"])
    if end is None:
        # Other callers in this file treat a None end as "not safe to
        # extract"; computing ``end + 1`` here would raise a TypeError, so
        # leave the lines alone instead of crashing.
        log.warning("Found no end of the reference section; "
                    "no references removed.")
        return text_lines

    # The end line itself belongs to the reference section, hence the +1.
    del text_lines[start:end + 1]
    return text_lines
def extract_references_from_fulltext(fulltext):
    """Locate the reference section of a document and pull out its lines.

    Page-boundary lines are first stripped with
    remove_page_boundary_lines(). The start of the reference section is
    then searched for, followed by its end, and the reference lines in
    between are collected with get_reference_lines().

    @param fulltext: (list) of strings, whereby each string is a line of
        the document.
    @return: (tuple) ``(refs, status, how_found_start)`` where
        - refs is whatever get_reference_lines() produced (per the original
          documentation, a list of strings such as
          '[19] Wilson, A. Unpublished (1986).'), or an empty list when
          nothing could be extracted;
        - status is 0 on success, 4 when no reference section start was
          found and 5 when no end of the section was found;
        - how_found_start is always 0 in this function.
    """
    # Get rid of page breaks, headers and footers before searching.
    fulltext = remove_page_boundary_lines(fulltext)

    # Flag describing how the section was found; never changed here.
    how_found_start = 0

    section = get_reference_section_beginning(fulltext)
    if section is None:
        # No reference section at all.
        write_message(
            "* extract_references_from_fulltext: "
            "ref_sect_start is None", verbose=2)
        return [], 4, how_found_start

    # A section start was found (however weak): look for where it ends.
    section_end = find_end_of_reference_section(
        fulltext,
        section["start_line"],
        section["marker"],
        section["marker_pattern"])
    if section_end is None:
        # Without an end it is not safe to extract anything.
        write_message(
            "* extract_references_from_fulltext: "
            "no end to refs!", verbose=2)
        return [], 5, how_found_start

    # Both boundaries are known: collect the reference lines.
    extracted = get_reference_lines(
        fulltext,
        section["start_line"],
        section_end,
        section["title_string"],
        section["marker_pattern"],
        section["title_marker_same_line"])
    return extracted, 0, how_found_start
def extract_references_from_fulltext(fulltext):
    """Find the reference section in a fulltext and extract its lines.

    The document is cleaned of page-boundary lines via
    remove_page_boundary_lines(), the beginning and the end of the
    reference section are located, and the lines of the section are
    gathered by get_reference_lines() (which here also receives the
    section's "marker" value).

    @param fulltext: (list) of strings, whereby each string is a line of
        the document.
    @return: (tuple) ``(refs, status, how_found_start)``:
        - refs: the result of get_reference_lines() (per the original
          documentation, a list of strings, each one reference line, e.g.
          '[19] Wilson, A. Unpublished (1986).'), or ``[]`` when
          extraction was not possible;
        - status: 0 when extraction ran, 4 when no start of a reference
          section was found, 5 when the section has no detectable end;
        - how_found_start: always 0 in this function.
    """
    # Strip page breaks, headers and footers first.
    fulltext = remove_page_boundary_lines(fulltext)

    # "How the reference section was found" flag; constant in this function.
    how_found_start = 0

    start_info = get_reference_section_beginning(fulltext)
    if start_info is None:
        # The document has no recognisable reference section.
        write_message("* extract_references_from_fulltext: "
                      "ref_sect_start is None", verbose=2)
        return [], 4, how_found_start

    # Some kind of section start exists; try to find its end.
    end_line = find_end_of_reference_section(fulltext,
                                             start_info["start_line"],
                                             start_info["marker"],
                                             start_info["marker_pattern"])
    if end_line is None:
        # No end to the references: extraction would not be safe.
        write_message("* extract_references_from_fulltext: "
                      "no end to refs!", verbose=2)
        return [], 5, how_found_start

    # Start and end are both known, so extract the reference lines.
    reference_lines = get_reference_lines(
        fulltext,
        start_info["start_line"],
        end_line,
        start_info["title_string"],
        start_info["marker_pattern"],
        start_info["title_marker_same_line"],
        start_info["marker"])
    return reference_lines, 0, how_found_start