def find_numeration_in_body(docbody):
    """Look through docbody for a line that carries a reference-line marker.

    Whitespace-only lines are skipped. For every other line the details are
    overwritten, so the value returned reflects the last line examined:
    either the marker that was matched, or an "unnumerated" record when the
    line matched no marker pattern. The scan stops early as soon as a
    matched marker contains the number 1.

    @param docbody: (list) of strings - the lines to examine.
    @return: (tuple) of two items:
      + (dictionary) with the keys 'marker', 'marker_pattern' and
        'title_marker_same_line' (always False here); 'marker' and
        'marker_pattern' are None for an unnumerated line.
        -- OR -- (None) when no non-blank line was seen.
      + (boolean) found_title - always False in this function.
    """
    patterns = get_reference_line_numeration_marker_patterns()
    found_title = False
    details = None

    for text in docbody:
        if text.isspace():
            # Blank line: nothing to learn from it.
            continue

        hit = regex_match_list(text, patterns)
        if not hit:
            # NOTE(review): this reset is assumed to belong to the
            # "line has no marker" case - confirm against the original
            # layout of this function.
            details = {
                'title_marker_same_line': False,
                'marker': None,
                'marker_pattern': None,
            }
            continue

        marker = hit.group('mark')
        details = {
            'marker': marker,
            'marker_pattern': hit.re.pattern,
            'title_marker_same_line': False,
        }

        # A marker such as [1] or (1) means the first reference was found.
        number = re_num.search(marker)
        if number and number.group(0) == '1':
            break

    return details, found_title
def find_reference_section(docbody):
    """Locate the first line of the reference section through its title.

    The document is walked from its last line towards its first. Each line
    matching one of the reference-section title patterns is passed, along
    with the lines that follow it (six lines in total), to
    find_numeration. A candidate is discarded when the details kept so far
    have a non-empty 'title' or 'marker' and the candidate does not.
    The walk stops as soon as find_numeration reports that the title was
    found.

    @param docbody: (list) of strings - the full document body.
    @return: (dictionary) - the details returned by find_numeration for the
      retained title line, extended with:
      { 'start_line'   : (integer) - index in docbody of the title line,
        'title_string' : (string) - the 'title' group of the title match,
      }
      The remaining keys ('marker', 'marker_pattern',
      'title_marker_same_line', ...) are presumably supplied by
      find_numeration - verify against that function.
      -- OR --
      (None) - when no reference section title could be found.
    """
    patterns = get_reference_section_title_patterns()
    last_index = len(docbody) - 1
    best = None

    # Walk the document backwards, looking for a section title:
    for offset, text in enumerate(reversed(docbody)):
        match = regex_match_list(text, patterns)
        if not match:
            continue

        title = match.group('title')
        index = last_index - offset
        candidate, found_title = find_numeration(
            docbody[index:index + 6], title)

        if candidate:
            # Never replace details that have a title/marker with a
            # candidate that lacks one.
            superseded = best and any(
                key in best and best[key] and not candidate[key]
                for key in ('title', 'marker')
            )
            if superseded:
                continue

            best = candidate
            best['start_line'] = index
            best['title_string'] = title

        if found_title:
            break

    return best
def find_reference_section(docbody):
    """Locate the first line of the reference section through its title.

    The document is walked from its last line towards its first. Each line
    matching one of the reference-section title patterns is passed, along
    with the lines that follow it (three lines in total), to
    find_numeration. A candidate is discarded when the details kept so far
    have a non-empty 'title' or 'marker' and the candidate does not.
    The walk stops as soon as find_numeration reports that the title was
    found.

    @param docbody: (list) of strings - the full document body.
    @return: (dictionary) - the details returned by find_numeration for the
      retained title line, extended with:
      { 'start_line'   : (integer) - index in docbody of the title line,
        'title_string' : (string) - the 'title' group of the title match,
      }
      The remaining keys ('marker', 'marker_pattern',
      'title_marker_same_line', ...) are presumably supplied by
      find_numeration - verify against that function.
      -- OR --
      (None) - when no reference section title could be found.
    """
    patterns = get_reference_section_title_patterns()
    last_index = len(docbody) - 1
    best = None

    # Walk the document backwards, looking for a section title:
    for offset, text in enumerate(reversed(docbody)):
        match = regex_match_list(text, patterns)
        if not match:
            continue

        title = match.group('title')
        index = last_index - offset
        candidate, found_title = find_numeration(
            docbody[index:index + 3], title)

        if candidate:
            # Never replace details that have a title/marker with a
            # candidate that lacks one.
            superseded = best and any(
                key in best and best[key] and not candidate[key]
                for key in ('title', 'marker')
            )
            if superseded:
                continue

            best = candidate
            best['start_line'] = index
            best['title_string'] = title

        if found_title:
            break

    return best
def find_numeration_in_body(docbody):
    """Find the marker of the first reference line within docbody.

    Whitespace-only lines and lines without a marker are skipped. A matched
    line is also skipped when its pattern exposes a 'marknum' group whose
    value is not '1'; patterns without that group are accepted as they are.
    The first accepted line ends the search.

    @param docbody: (list) of strings - the lines to examine.
    @return: (tuple) of two items:
      + (dictionary) with the keys 'marker', 'marker_pattern' and
        'title_marker_same_line' (always False here); 'marker' and
        'marker_pattern' stay None when no line was accepted.
      + (boolean) found_title - always False in this function.
    """
    patterns = get_reference_line_numeration_marker_patterns()
    found_title = False

    # Reported when no numeration is found:
    details = {
        'title_marker_same_line': False,
        'marker': None,
        'marker_pattern': None,
    }

    for text in docbody:
        if text.isspace():
            continue

        hit = regex_match_list(text, patterns)
        if not hit:
            continue

        try:
            number = hit.group('marknum')
        except IndexError:
            # The pattern has no 'marknum' group: no number to check.
            pass
        else:
            # Only something like [1] or (1) is the first reference.
            if number != '1':
                continue

        details = {
            'marker': hit.group('mark'),
            'marker_pattern': hit.re.pattern,
            'title_marker_same_line': False,
        }
        break

    return details, found_title
def remove_reference_line_marker(line):
    """Split a reference line into its leading marker and the remaining text.

    Leading whitespace is dropped before the marker patterns are tried.
    @param line: (string) - the reference line.
    @return: (tuple) containing two strings:
      + The 'mark' group of the matched marker, or a single space
        character when the line has no marker.
      + The line with leading whitespace removed and, when a marker was
        matched, everything up to the end of the match cut off and the
        remainder left-stripped.
    """
    patterns = get_reference_line_numeration_marker_patterns()
    stripped = line.lstrip()

    match = regex_match_list(stripped, patterns)
    if match is None:
        # No marker: report a space in its place.
        return (u" ", stripped)

    return (match.group(u'mark'), stripped[match.end():].lstrip())
def find_numeration_in_title(docbody, title):
    """Check whether the title line also carries a reference-line marker.

    Only the first line of docbody is examined. It is matched against the
    marker patterns built for the (regex-escaped) title.

    @param docbody: (list) of strings - lines starting at the title line.
    @param title: (string) - the reference section title; it is escaped
      before being used in a pattern, to cope with titles such as
      'References['.
    @return: (tuple) of two items:
      + (dictionary) with the keys 'marker', 'marker_pattern' and
        'title_marker_same_line' (always True here)
        -- OR -- (None) when docbody is empty or the first line does not
        match.
      + (boolean) found_title - True only when the matched marker contains
        the number 1.
    """
    try:
        first_line = docbody[0]
    except IndexError:
        return None, False

    patterns = get_reference_line_numeration_marker_patterns(re.escape(title))
    match = regex_match_list(first_line, patterns)
    if not match:
        return None, False

    marker = match.group('mark')
    pattern = match.re.pattern

    # The title counts as found only when the marker is the first one.
    number = re_num.search(marker)
    found_title = bool(number and number.group(0) == '1')

    details = {
        'marker': marker,
        'marker_pattern': pattern,
        'title_marker_same_line': True,
    }
    return details, found_title
# How many lines after a candidate section title are searched for the next
# reference marker before the title is accepted as the end of the section.
_REF_CONTINUATION_LOOKAHEAD = 200

# How many long all-digit lines make up a "graph axis" block.
_DIGIT_BLOCK_LINES = 4

# Characters ignored when testing whether a line consists of digits only.
_DIGIT_LINE_NOISE = (" ", ".", "-", "+", u"\u00D7", u"\u2212")


def _strip_digit_line_noise(line):
    """Return line without spaces, dots, signs and multiplication marks."""
    for char in _DIGIT_LINE_NOISE:
        line = line.replace(char, "")
    return line.strip()


def _is_long_digit_line(text):
    """Tell whether text is longer than 10 characters and digits only."""
    return len(text) > 10 and text.isdigit()


def _reference_numeration_resumes(docbody, x, mk_patterns,
                                  current_reference_count):
    """Tell whether a reference marker continuing the list follows line x."""
    stop = min(x + _REF_CONTINUATION_LOOKAHEAD, len(docbody))
    for y in range(x + 1, stop):
        num_match = regex_match_list(docbody[y].strip(), mk_patterns)
        if not num_match or num_match.group(0).isdigit():
            continue
        try:
            num = int(num_match.group('marknum'))
        except ValueError:
            # The pattern is numeric ([1], [2], ...) but this particular
            # match is not a number: ignore it.
            continue
        except IndexError:
            # Non-numerical markers: continuity cannot be checked, so any
            # marker counts as a continuation.
            return True
        if current_reference_count + 1 == num:
            return True
    return False


def _starts_digit_block(docbody, x):
    """Tell whether line x opens a block of long all-digit lines."""
    if not _is_long_digit_line(_strip_digit_line_noise(docbody[x])):
        return False

    window = _DIGIT_BLOCK_LINES
    num_digit_lines = 1
    y = x + 1
    while y < x + window and y < len(docbody):
        text = _strip_digit_line_noise(docbody[y])
        if _is_long_digit_line(text):
            num_digit_lines += 1
        elif len(text) == 0:
            # Blank line: widen the window instead of counting it, to
            # accommodate documents that are double-line spaced.
            window += 1
        y += 1

    # Compare against the fixed requirement, not the widened window;
    # otherwise a single blank line would make the test unsatisfiable.
    return num_digit_lines == _DIGIT_BLOCK_LINES


def find_end_of_reference_section(docbody,
                                  ref_start_line,
                                  ref_line_marker,
                                  ref_line_marker_ptn):
    """Given that the start of a document's reference section has already
       been recognised, find the index of its last line.

       Starting at ref_start_line, each line is tested in turn. The section
       ends just before a line matching a post-reference section title,
       unless a marker continuing the reference numeration is found in the
       lines after it. It also ends at a line that opens a block of long
       digit-only lines (probably the axis scale of a graph in a figure).
       Otherwise it runs to the end of the document.

       @param docbody: (list) of strings - the entire plain-text document
        body.
       @param ref_start_line: (integer) - the index in docbody of the first
        line of the reference section.
       @param ref_line_marker: (string) - the line marker of the first
        reference line.
       @param ref_line_marker_ptn: (string) - the pattern used to search for
        a reference line marker. When either this or ref_line_marker is
        None, the generic marker patterns are used instead.
       @return: (integer) - index in docbody of the last reference line
         -- OR --
                (None) - if ref_start_line was invalid (not an int, negative,
                or not an index of docbody).
    """
    x = ref_start_line
    # An exact type check is kept on purpose so that booleans are rejected.
    if type(x) is not int or x < 0 or x >= len(docbody):
        # The provided 'first line' of the reference section was invalid.
        # Can't safely find end of refs with this info - quit.
        return None

    # Get patterns for testing lines:
    t_patterns = get_post_reference_section_title_patterns()
    kw_patterns = get_post_reference_section_keyword_patterns()

    if None not in (ref_line_marker, ref_line_marker_ptn):
        mk_patterns = [re.compile(ref_line_marker_ptn, re.I | re.UNICODE)]
    else:
        mk_patterns = get_reference_line_numeration_marker_patterns()

    current_reference_count = 0
    while x < len(docbody):
        stripped = docbody[x].strip()

        # Remember the number of the latest reference seen:
        num_match = regex_match_list(stripped, mk_patterns)
        if num_match:
            try:
                current_reference_count = int(num_match.group('marknum'))
            except (ValueError, IndexError):
                # Non-numerical reference marking.
                pass

        # Look for a likely section title that would follow a reference
        # section:
        end_match = regex_match_list(stripped, t_patterns)
        if not end_match:
            # NOTE(review): the keyword match is evaluated but its result
            # is never used to end the section - confirm whether that is
            # intended.
            end_match = regex_match_list(stripped, kw_patterns)
        elif not _reference_numeration_resumes(docbody, x, mk_patterns,
                                               current_reference_count):
            # A real section title: the references stop on the line before.
            return x - 1

        if _starts_digit_block(docbody, x):
            return x

        x += 1

    return x - 1
def find_reference_section_no_title_generic(docbody, marker_patterns):
    """Locate the reference section from its line markers alone.

    Meant for documents whose reference section could not be located by
    its title. The document is walked from its last line towards its
    first, looking for a line whose marker number is 1. Such a line is
    accepted when fewer than five lines follow it in the ten-line window
    after it (a single reference close to the end of the document is
    assumed), or when one of the lines in that window carries the marker
    number 2.

    @param docbody: (list) of strings - each string is a line in the
     document.
    @param marker_patterns: (list) of compiled patterns used to recognise
     reference line markers; the code reads their 'mark' and 'marknum'
     groups.
    @return: (dictionary) :
      { 'start_line' : (integer) - index in docbody of 1st reference line,
        'title_string' : (None) - there is no title,
        'marker' : (string) - the stripped marker of the first reference
                              line,
        'marker_pattern' : (string) - the regexp string that matched it,
        'title_marker_same_line' : (boolean) False - no title on the
                                   marker's line.
      }
      -- OR --
      (None) - when docbody is empty or no reference section was found.
    """
    if not docbody:
        return None

    total = len(docbody)
    lookahead = 10

    for offset, text in enumerate(reversed(docbody)):
        first = regex_match_list(text.strip(), marker_patterns)
        if not (first and first.group('marknum') == '1'):
            continue

        pattern = first.re.pattern
        start = total - 1 - offset
        following = docbody[start + 1:start + 1 + lookahead]

        if len(following) >= 5:
            # Enough text follows: insist on seeing reference number 2.
            for candidate in following:
                second = regex_match_list(candidate.strip(), marker_patterns)
                if second and second.group('marknum') == '2':
                    break
            else:
                # No second reference nearby: keep walking backwards.
                continue

        return {
            'start_line': start,
            'title_string': None,
            'marker': first.group('mark').strip(),
            'marker_pattern': pattern,
            'title_marker_same_line': False,
        }

    # Didn't manage to find the reference section.
    return None
# How many lines after a candidate section title are searched for the next
# reference marker before the title is accepted as the end of the section.
_REF_CONTINUATION_LOOKAHEAD = 200

# How many long all-digit lines make up a "graph axis" block.
_DIGIT_BLOCK_LINES = 4

# Characters ignored when testing whether a line consists of digits only.
_DIGIT_LINE_NOISE = (" ", ".", "-", "+", u"\u00D7", u"\u2212")


def _strip_digit_line_noise(line):
    """Return line without spaces, dots, signs and multiplication marks."""
    for char in _DIGIT_LINE_NOISE:
        line = line.replace(char, "")
    return line.strip()


def _is_long_digit_line(text):
    """Tell whether text is longer than 10 characters and digits only."""
    return len(text) > 10 and text.isdigit()


def _reference_numeration_resumes(docbody, x, mk_patterns,
                                  current_reference_count):
    """Tell whether a reference marker continuing the list follows line x."""
    stop = min(x + _REF_CONTINUATION_LOOKAHEAD, len(docbody))
    for y in range(x + 1, stop):
        num_match = regex_match_list(docbody[y].strip(), mk_patterns)
        if not num_match or num_match.group(0).isdigit():
            continue
        try:
            num = int(num_match.group('marknum'))
        except ValueError:
            # The pattern is numeric ([1], [2], ...) but this particular
            # match is not a number: ignore it.
            continue
        except IndexError:
            # Non-numerical markers: continuity cannot be checked, so any
            # marker counts as a continuation.
            return True
        if current_reference_count + 1 == num:
            return True
    return False


def _starts_digit_block(docbody, x):
    """Tell whether line x opens a block of long all-digit lines."""
    if not _is_long_digit_line(_strip_digit_line_noise(docbody[x])):
        return False

    window = _DIGIT_BLOCK_LINES
    num_digit_lines = 1
    y = x + 1
    while y < x + window and y < len(docbody):
        text = _strip_digit_line_noise(docbody[y])
        if _is_long_digit_line(text):
            num_digit_lines += 1
        elif len(text) == 0:
            # Blank line: widen the window instead of counting it, to
            # accommodate documents that are double-line spaced.
            window += 1
        y += 1

    # Compare against the fixed requirement, not the widened window;
    # otherwise a single blank line would make the test unsatisfiable.
    return num_digit_lines == _DIGIT_BLOCK_LINES


def find_end_of_reference_section(docbody,
                                  ref_start_line,
                                  ref_line_marker,
                                  ref_line_marker_ptn):
    """Given that the start of a document's reference section has already
       been recognised, find the index of its last line.

       Starting at ref_start_line, each line is tested in turn. The section
       ends just before a line matching a post-reference section title,
       unless a marker continuing the reference numeration is found in the
       lines after it. It also ends at a line that opens a block of long
       digit-only lines (probably the axis scale of a graph in a figure).
       Otherwise it runs to the end of the document.

       @param docbody: (list) of strings - the entire plain-text document
        body.
       @param ref_start_line: (integer) - the index in docbody of the first
        line of the reference section.
       @param ref_line_marker: (string) - the line marker of the first
        reference line.
       @param ref_line_marker_ptn: (string) - the pattern used to search for
        a reference line marker. When either this or ref_line_marker is
        None, the generic marker patterns are used instead.
       @return: (integer) - index in docbody of the last reference line
         -- OR --
                (None) - if ref_start_line was invalid (not an int, negative,
                or not an index of docbody).
    """
    x = ref_start_line
    # An exact type check is kept on purpose so that booleans are rejected.
    if type(x) is not int or x < 0 or x >= len(docbody):
        # The provided 'first line' of the reference section was invalid.
        # Can't safely find end of refs with this info - quit.
        return None

    # Get patterns for testing lines:
    t_patterns = get_post_reference_section_title_patterns()
    kw_patterns = get_post_reference_section_keyword_patterns()

    if None not in (ref_line_marker, ref_line_marker_ptn):
        mk_patterns = [re.compile(ref_line_marker_ptn, re.I | re.UNICODE)]
    else:
        mk_patterns = get_reference_line_numeration_marker_patterns()

    current_reference_count = 0
    while x < len(docbody):
        stripped = docbody[x].strip()

        # Remember the number of the latest reference seen:
        num_match = regex_match_list(stripped, mk_patterns)
        if num_match:
            try:
                current_reference_count = int(num_match.group('marknum'))
            except (ValueError, IndexError):
                # Non-numerical reference marking.
                pass

        # Look for a likely section title that would follow a reference
        # section:
        end_match = regex_match_list(stripped, t_patterns)
        if not end_match:
            # NOTE(review): the keyword match is evaluated but its result
            # is never used to end the section - confirm whether that is
            # intended.
            end_match = regex_match_list(stripped, kw_patterns)
        elif not _reference_numeration_resumes(docbody, x, mk_patterns,
                                               current_reference_count):
            # A real section title: the references stop on the line before.
            return x - 1

        if _starts_digit_block(docbody, x):
            return x

        x += 1

    return x - 1
def find_reference_section_no_title_generic(docbody, marker_patterns):
    """Locate the reference section from its line markers alone.

    Meant for documents whose reference section could not be located by
    its title. The document is walked from its last line towards its
    first, looking for a line whose marker number is 1. Such a line is
    accepted when fewer than five lines follow it in the ten-line window
    after it (a single reference close to the end of the document is
    assumed), or when one of the lines in that window carries the marker
    number 2.

    @param docbody: (list) of strings - each string is a line in the
     document.
    @param marker_patterns: (list) of compiled patterns used to recognise
     reference line markers; the code reads their 'mark' and 'marknum'
     groups.
    @return: (dictionary) :
      { 'start_line' : (integer) - index in docbody of 1st reference line,
        'title_string' : (None) - there is no title,
        'marker' : (string) - the stripped marker of the first reference
                              line,
        'marker_pattern' : (string) - the regexp string that matched it,
        'title_marker_same_line' : (boolean) False - no title on the
                                   marker's line.
      }
      -- OR --
      (None) - when docbody is empty or no reference section was found.
    """
    if not docbody:
        return None

    total = len(docbody)
    lookahead = 10

    for offset, text in enumerate(reversed(docbody)):
        first = regex_match_list(text.strip(), marker_patterns)
        if not (first and first.group('marknum') == '1'):
            continue

        pattern = first.re.pattern
        start = total - 1 - offset
        following = docbody[start + 1:start + 1 + lookahead]

        if len(following) >= 5:
            # Enough text follows: insist on seeing reference number 2.
            for candidate in following:
                second = regex_match_list(candidate.strip(), marker_patterns)
                if second and second.group('marknum') == '2':
                    break
            else:
                # No second reference nearby: keep walking backwards.
                continue

        return {
            'start_line': start,
            'title_string': None,
            'marker': first.group('mark').strip(),
            'marker_pattern': pattern,
            'title_marker_same_line': False,
        }

    # Didn't manage to find the reference section.
    return None