Example #1
0
 def test_regex_match_list(self):
     """Check regex_match_list with a matching and a non-matching pattern list."""
     s = 'ABC'
     # 'C.C' needs two C characters so it cannot match 'ABC'; 'A.C' can,
     # so a truthy result is expected from the two-pattern list.
     m = refextract_re.regex_match_list(
         s, [re.compile('C.C'), re.compile('A.C')])
     # assertTrue replaces the assert_ alias, which was removed in Python 3.12.
     self.assertTrue(m)
     # With only the non-matching pattern, None is expected.
     m = refextract_re.regex_match_list(s, [re.compile('C.C')])
     self.assertIsNone(m)
 def test_regex_match_list(self):
     """Check regex_match_list with a matching and a non-matching pattern list."""
     s = 'ABC'
     # 'C.C' needs two C characters so it cannot match 'ABC'; 'A.C' can,
     # so a truthy result is expected from the two-pattern list.
     m = refextract_re.regex_match_list(s, [
         re.compile('C.C'),
         re.compile('A.C'),
     ])
     # assertTrue replaces the assert_ alias, which was removed in Python 3.12.
     self.assertTrue(m)
     # With only the non-matching pattern, None is expected.
     m = refextract_re.regex_match_list(s, [
         re.compile('C.C'),
     ])
     self.assertIsNone(m)
Example #3
0
def find_numeration_in_title(docbody, title):
    """Look for a reference marker on the same line as the section title.

    Only the first element of docbody is examined.
    @param docbody: (list) of strings - lines beginning at the title line.
    @param title: (string) - the title text; it is regex-escaped before
     being passed to get_reference_line_numeration_marker_patterns.
    @return: (tuple) (ref_details, found_title):
         + ref_details is None when docbody is empty or no marker pattern
           matches, otherwise a dictionary with the keys 'marker',
           'marker_pattern' and 'title_marker_same_line' (always True).
         + found_title is True only when the first number found in the
           marker by re_num is "1".
    """
    try:
        title_line = docbody[0]
    except IndexError:
        # Nothing to inspect
        return None, False

    # Escaping stops titles such as 'References[' from being read as
    # regex syntax when they are embedded in the marker patterns.
    escaped_title = re.escape(title)
    patterns = get_reference_line_numeration_marker_patterns(escaped_title)

    match = regex_match_list(title_line, patterns)
    if not match:
        return None, False

    marker = match.group("mark")
    details = {
        "marker": marker,
        "marker_pattern": match.re.pattern,
        "title_marker_same_line": True,
    }

    # The details are reported whatever the number is; only a marker
    # numbered 1 confirms that the title was found.
    digits = re_num.search(marker)
    is_first_reference = bool(digits and digits.group(0) == "1")
    return details, is_first_reference
Example #4
0
def find_numeration_in_body(docbody):
    """Scan lines for a reference-line numeration marker.

    Whitespace-only lines are skipped. Every other line overwrites the
    details recorded for the previous one, so the result describes the last
    line examined. Scanning stops at the first marker whose first number
    (as found by re_num) is '1'.
    @param docbody: (list) of strings - the lines to scan.
    @return: (tuple) (ref_details, found_title):
         + ref_details is None when no non-blank line was seen, otherwise a
           dictionary with the keys 'marker', 'marker_pattern' and
           'title_marker_same_line' (always False); 'marker' and
           'marker_pattern' are None when the line matched no pattern.
         + found_title is always False.
    """
    patterns = get_reference_line_numeration_marker_patterns()
    details = None

    for text in docbody:
        if text.isspace():
            # Blank line: leave the current details untouched
            continue

        hit = regex_match_list(text, patterns)
        if not hit:
            # Line is not numerated like a reference line
            details = {
                'title_marker_same_line': False,
                'marker': None,
                'marker_pattern': None,
            }
            continue

        marker = hit.group('mark')
        details = {
            'marker': marker,
            'marker_pattern': hit.re.pattern,
            'title_marker_same_line': False,
        }

        # A marker like [1] or (1) is taken as the first reference
        first_number = re_num.search(marker)
        if first_number and first_number.group(0) == '1':
            break

    return details, False
Example #5
0
def find_numeration_in_body(docbody):
    """Scan lines for a reference-line numeration marker.

    Whitespace-only lines are ignored. Each remaining line replaces the
    details recorded so far, and the scan ends at the first marker whose
    first number (as found by re_num) is "1".
    @param docbody: (list) of strings - the lines to scan.
    @return: (tuple) (ref_details, found_title):
         + ref_details is None when no non-blank line was seen, otherwise a
           dictionary with the keys 'marker', 'marker_pattern' and
           'title_marker_same_line' (always False); 'marker' and
           'marker_pattern' are None when the line matched no pattern.
         + found_title is always False.
    """
    marker_patterns = get_reference_line_numeration_marker_patterns()
    found_title = False
    latest = None

    for entry in docbody:
        if not entry.isspace():
            numbered = regex_match_list(entry, marker_patterns)
            if numbered:
                label = numbered.group("mark")
                label_pattern = numbered.re.pattern
                latest = {
                    "marker": label,
                    "marker_pattern": label_pattern,
                    "title_marker_same_line": False,
                }
                # Stop on something like [1] or (1): the first reference
                number = re_num.search(label)
                if number and number.group(0) == "1":
                    break
            else:
                # The line carries no numeration
                latest = {
                    "title_marker_same_line": False,
                    "marker": None,
                    "marker_pattern": None,
                }

    return latest, found_title
Example #6
0
def find_reference_section(docbody):
    """Locate the first line of the reference section in a document body.

    The body is walked from its last line towards its first. Each line that
    matches a reference-section title pattern is handed, together with the
    five lines after it, to find_numeration. The walk stops as soon as
    find_numeration reports that the title was found.
    @param docbody: (list) of strings - the full document body.
    @return: (dictionary) :
        { 'start_line' : (integer) - index in docbody of the title line,
          'title_string' : (string) - the matched title,
          plus whatever keys find_numeration supplied (the visible helpers
          supply 'marker', 'marker_pattern' and 'title_marker_same_line').
        }
         -- OR --
                (None) - when no candidate was accepted.
    """
    title_patterns = get_reference_section_title_patterns()
    ref_details = None

    # Walk backwards: the reference section is expected near the end
    for index in range(len(docbody) - 1, -1, -1):
        title_match = regex_match_list(docbody[index], title_patterns)
        if not title_match:
            continue

        title = title_match.group('title')
        candidate, found_title = find_numeration(
            docbody[index:index + 6], title)

        if candidate:
            # Keep the details already held when they carry a truthy
            # 'title' or 'marker' that the new candidate lacks.
            weaker = any(
                ref_details and key in ref_details
                and ref_details[key] and not candidate[key]
                for key in ('title', 'marker')
            )
            if weaker:
                continue

            ref_details = candidate
            ref_details['start_line'] = index
            ref_details['title_string'] = title

        if found_title:
            break

    return ref_details
Example #7
0
def find_reference_section(docbody):
    """Locate the first line of the reference section in a document body.

    Lines are visited from the end of the document towards the start. A
    line matching a reference-section title pattern is passed, with the
    five lines that follow it, to find_numeration. The search ends once
    find_numeration reports that the title was found.
    @param docbody: (list) of strings - the full document body.
    @return: (dictionary) :
        { 'start_line' : (integer) - index in docbody of the title line,
          'title_string' : (string) - the matched title,
          plus whatever keys find_numeration supplied (the visible helpers
          supply 'marker', 'marker_pattern' and 'title_marker_same_line').
        }
         -- OR --
                (None) - when no candidate was accepted.
    """
    best = None
    title_patterns = get_reference_section_title_patterns()

    index = len(docbody)
    while index > 0:
        index -= 1
        title_match = regex_match_list(docbody[index], title_patterns)
        if not title_match:
            continue

        title = title_match.group('title')
        window = docbody[index:index+6]
        candidate, found_title = find_numeration(window, title)

        if candidate:
            if best:
                # Do not trade details that have a title for ones without
                loses_title = ('title' in best
                               and best['title']
                               and not candidate['title'])
                if loses_title:
                    continue
                # Do not trade details that have a marker for ones without
                loses_marker = ('marker' in best
                                and best['marker']
                                and not candidate['marker'])
                if loses_marker:
                    continue

            best = candidate
            best['start_line'] = index
            best['title_string'] = title

        if found_title:
            break

    return best
Example #8
0
def find_numeration_in_body(docbody):
    """Find the marker of the first reference line among the given lines.

    Whitespace-only lines and lines matching no marker pattern are skipped.
    A matching line is accepted when its 'marknum' group equals '1', or when
    the matching pattern has no 'marknum' group at all; the scan stops at
    the first accepted line.
    @param docbody: (list) of strings - the lines to scan.
    @return: (tuple) (ref_details, found_title):
         + ref_details is a dictionary with the keys 'marker',
           'marker_pattern' and 'title_marker_same_line' (always False);
           'marker' and 'marker_pattern' are None when no line was accepted.
         + found_title is always False.
    """
    marker_patterns = get_reference_line_numeration_marker_patterns()

    # Default answer: no numeration, replaced only if a marker is accepted
    ref_details = {
        'title_marker_same_line': False,
        'marker': None,
        'marker_pattern': None,
    }

    for line in docbody:
        if line.isspace():
            continue

        mark_match = regex_match_list(line, marker_patterns)
        if not mark_match:
            continue

        try:
            marknum = mark_match.group('marknum')
        except IndexError:
            # The pattern has no 'marknum' group: accept the marker as is
            pass
        else:
            # Numbered marker: only something like [1] or (1) qualifies
            if marknum != '1':
                continue

        ref_details = {
            'marker': mark_match.group('mark'),
            'marker_pattern': mark_match.re.pattern,
            'title_marker_same_line': False,
        }
        break

    return ref_details, False
Example #9
0
def find_numeration_in_body(docbody):
    """Find the marker of the first reference line among the given lines.

    Whitespace-only lines are ignored. The first line that matches a marker
    pattern and is either numbered '1' (via its 'marknum' group) or matched
    by a pattern without a 'marknum' group supplies the result.
    @param docbody: (list) of strings - the lines to scan.
    @return: (tuple) (ref_details, found_title):
         + ref_details is a dictionary with the keys 'marker',
           'marker_pattern' and 'title_marker_same_line' (always False);
           'marker' and 'marker_pattern' are None when no line qualified.
         + found_title is always False.
    """
    patterns = get_reference_line_numeration_marker_patterns()
    found_title = False

    # Assume there is no numeration until a qualifying marker is seen
    details = {
        'title_marker_same_line': False,
        'marker': None,
        'marker_pattern': None,
    }

    for candidate in docbody:
        if candidate.isspace():
            continue

        hit = regex_match_list(candidate, patterns)
        if hit:
            # Patterns lacking a 'marknum' group raise IndexError here and
            # are treated as qualifying.
            qualifies = True
            try:
                qualifies = hit.group('marknum') == '1'
            except IndexError:
                pass
            if not qualifies:
                continue

            marker_text = hit.group('mark')
            pattern_text = hit.re.pattern
            details = {
                'marker': marker_text,
                'marker_pattern': pattern_text,
                'title_marker_same_line': False,
            }
            break

    return details, found_title
Example #10
0
def remove_reference_line_marker(line):
    """Split a reference line into its leading marker and the remaining text.

       Leading whitespace is removed from the line before matching.
       @param line: (string) - the reference line.
       @return: (tuple) containing two strings:
                 + The 'mark' group of the matched marker pattern, or a
                   single space character when no pattern matched.
                 + The line with leading whitespace removed and, when a
                   marker matched, everything up to the end of the match
                   (and the whitespace following it) removed as well.
    """
    marker_patterns = get_reference_line_numeration_marker_patterns()
    stripped = line.lstrip()

    marker_match = regex_match_list(stripped, marker_patterns)
    if marker_match is None:
        # No marker: report a space as the placeholder marker
        return (u" ", stripped)

    marker_val = marker_match.group(u'mark')
    # Drop the marker itself and any whitespace that follows it
    remainder = stripped[marker_match.end():].lstrip()
    return (marker_val, remainder)
Example #11
0
def remove_reference_line_marker(line):
    """Cut the numeration marker off the front of a reference line.

       The line is left-stripped first, then tested against the
       reference-line marker patterns.
       @param line: (string) - the reference line.
       @return: (tuple) containing two strings:
                 + The 'mark' group of the matched marker pattern, or a
                   single space character when no pattern matched.
                 + The left-stripped line, with the text up to the end of
                   the match and the whitespace after it removed when a
                   marker matched.
    """
    patterns = get_reference_line_numeration_marker_patterns()
    text = line.lstrip()
    found = regex_match_list(text, patterns)

    marker = u" "
    if found is not None:
        marker = found.group(u'mark')
        # Keep only what follows the marker
        text = text[found.end():].lstrip()

    return (marker, text)
Example #12
0
def find_numeration_in_title(docbody, title):
    """Look for a reference marker on the same line as the section title.

    Only the first element of docbody is examined.
    @param docbody: (list) of strings - lines beginning at the title line.
    @param title: (string) - the title text; it is regex-escaped before
     being passed to get_reference_line_numeration_marker_patterns.
    @return: (tuple) (ref_details, found_title):
         + ref_details is None when docbody is empty or no marker pattern
           matches, otherwise a dictionary with the keys 'marker',
           'marker_pattern' and 'title_marker_same_line' (always True).
         + found_title is True only when the first number found in the
           marker by re_num is '1'.
    """
    found_title = False

    try:
        first_line = docbody[0]
    except IndexError:
        return None, found_title

    # Escape the title so that text like 'References[' is matched literally
    patterns = get_reference_line_numeration_marker_patterns(re.escape(title))
    title_match = regex_match_list(first_line, patterns)
    if not title_match:
        return None, found_title

    marker = title_match.group('mark')
    ref_details = {
        'marker': marker,
        'marker_pattern': title_match.re.pattern,
        'title_marker_same_line': True
    }

    # The same details are returned either way; a marker numbered 1
    # additionally confirms the title.
    number = re_num.search(marker)
    if number and number.group(0) == '1':
        found_title = True

    return ref_details, found_title
Example #13
0
def _strip_numeric_punctuation(line):
    """Return line without spaces, '.', '-', '+', U+00D7 and U+2212, stripped."""
    for char in (" ", ".", "-", "+", u"\u00D7", u"\u2212"):
        line = line.replace(char, "")
    return line.strip()


def find_end_of_reference_section(docbody,
                                  ref_start_line,
                                  ref_line_marker,
                                  ref_line_marker_ptn):
    """Given that the start of a document's reference section has already been
       recognised, this function is tasked with finding the line-number in the
       document of the last line of the reference section.

       The section is considered ended either by a line matching a
       post-reference-section title pattern that is not followed (within 200
       lines) by a continuing reference marker, or by a run of long
       digits-only lines (probably the axis scale of a graph).
       @param docbody: (list) of strings - the entire plain-text document body.
       @param ref_start_line: (integer) - the index in docbody of the first line
        of the reference section.
       @param ref_line_marker: (string) - the line marker of the first reference
        line.
       @param ref_line_marker_ptn: (string) - the pattern used to search for a
        reference line marker.
       @return: (integer) - index in docbody of the last reference line
         -- OR --
                (None) - if ref_start_line was invalid (not an int, negative,
                or not a valid index into docbody).
    """
    section_ended = False
    x = ref_start_line
    # An index equal to len(docbody) is out of bounds too; this also covers
    # an empty docbody.
    if type(x) is not int or not 0 <= x < len(docbody):
        # The provided 'first line' of the reference section was invalid.
        # Either it was out of bounds in the document body, or it was not a
        # valid integer.
        # Can't safely find end of refs with this info - quit.
        return None
    # Get patterns for testing line:
    t_patterns = get_post_reference_section_title_patterns()
    kw_patterns = get_post_reference_section_keyword_patterns()

    if None not in (ref_line_marker, ref_line_marker_ptn):
        mk_patterns = [re.compile(ref_line_marker_ptn, re.I | re.UNICODE)]
    else:
        mk_patterns = get_reference_line_numeration_marker_patterns()

    current_reference_count = 0
    while x < len(docbody) and not section_ended:
        stripped_line = docbody[x].strip()
        # save the reference count
        num_match = regex_match_list(stripped_line, mk_patterns)
        if num_match:
            try:
                current_reference_count = int(num_match.group('marknum'))
            except (ValueError, IndexError):
                # non numerical references marking
                pass
        # look for a likely section title that would follow a reference section:
        end_match = regex_match_list(stripped_line, t_patterns)
        if not end_match:
            # didn't match a section title - try looking for keywords that
            # suggest the end of a reference section.
            # NOTE(review): the result of this keyword match is never read
            # afterwards, so keywords currently do not end the section.
            end_match = regex_match_list(stripped_line, kw_patterns)
        else:
            # Is it really the end of the reference section? Check within the
            # next 200 lines for other reference numeration markers:
            y = x + 1
            line_found = False
            while y < x + 200 and y < len(docbody) and not line_found:
                num_match = regex_match_list(docbody[y].strip(), mk_patterns)
                # A match consisting only of digits is not treated as a marker
                if num_match and not num_match.group(0).isdigit():
                    try:
                        num = int(num_match.group('marknum'))
                        # Only the next number in sequence continues the refs
                        if current_reference_count + 1 == num:
                            line_found = True
                    except ValueError:
                        # We have the marknum index so it is
                        # numeric pattern for references like
                        # [1], [2] but this match is not a number
                        pass
                    except IndexError:
                        # We have a non numerical references marking
                        # we don't check for a number continuity
                        line_found = True
                y += 1
            if not line_found:
                # No ref line found-end section
                section_ended = True
        if not section_ended:
            # Do this line and the ones right after it contain only numbers?
            # If yes, it's probably the axis scale of a graph in a fig.
            # End refs section
            digit_test_str = _strip_numeric_punctuation(docbody[x])
            if len(digit_test_str) > 10 and digit_test_str.isdigit():
                # The line contains only digits and is longer than 10 chars:
                y = x + 1
                # Window of 4 lines including this one; blank lines widen it
                digit_lines = 4
                num_digit_lines = 1
                while y < x + digit_lines and y < len(docbody):
                    digit_test_str = _strip_numeric_punctuation(docbody[y])
                    if len(digit_test_str) > 10 and digit_test_str.isdigit():
                        num_digit_lines += 1
                    elif len(digit_test_str) == 0:
                        # This is a blank line. Don't count it, to accommodate
                        # documents that are double-line spaced:
                        digit_lines += 1
                    y = y + 1
                if num_digit_lines == digit_lines:
                    section_ended = True
            # x only advances while the section is still open, so on exit
            # x - 1 is the last line that belongs to the section.
            x += 1
    return x - 1
Example #14
0
def find_reference_section_no_title_generic(docbody, marker_patterns):
    """Locate the start of a reference section from its markers alone.

       Intended for documents where the section could not be found by its
       title. The body is searched from the last line backwards for a line
       whose marker has 'marknum' equal to '1'. That line is accepted when
       fewer than 5 lines follow it in the document, or when one of the 10
       lines after it carries a marker with 'marknum' equal to '2'.
       @param docbody: (list) of strings -each string is a line in the document.
       @param marker_patterns: patterns handed to regex_match_list to
        recognise reference-line markers.
       @return: (dictionary) :
         { 'start_line' : (integer) - index in docbody of 1st reference line,
           'title_string' : (None) - no title is known here,
           'marker' : (string) - the stripped marker of the first reference
                                 line,
           'marker_pattern' : (string) - the regexp string that matched the
                                         marker,
           'title_marker_same_line' : False - title not on same line as
                                      marker.
         }
         -- OR --
                (None) - when the reference section could not be found.
    """
    if not docbody:
        return None

    # How many lines after a [1] are searched for a [2]
    next_test_lines = 10

    for start in range(len(docbody) - 1, -1, -1):
        first = regex_match_list(docbody[start].strip(), marker_patterns)
        if not (first and first.group('marknum') == '1'):
            continue

        pattern = first.re.pattern
        following = docbody[start + 1:start + 1 + next_test_lines]

        if len(following) < 5:
            # A 1 this close to the end of the document is assumed to be
            # the only reference
            confirmed = True
        else:
            # Otherwise insist on seeing number 2 shortly afterwards
            confirmed = any(
                second and second.group('marknum') == '2'
                for second in (
                    regex_match_list(text.strip(), marker_patterns)
                    for text in following
                )
            )

        if confirmed:
            return {
                'start_line': start,
                'title_string': None,
                'marker': first.group('mark').strip(),
                'marker_pattern': pattern,
                'title_marker_same_line': False,
            }

    # didn't manage to find the reference section
    return None
Example #15
0
def _strip_numeric_punctuation(line):
    """Return line without spaces, '.', '-', '+', U+00D7 and U+2212, stripped."""
    for char in (" ", ".", "-", "+", u"\u00D7", u"\u2212"):
        line = line.replace(char, "")
    return line.strip()


def find_end_of_reference_section(docbody, ref_start_line, ref_line_marker, ref_line_marker_ptn):
    """Given that the start of a document's reference section has already been
       recognised, this function is tasked with finding the line-number in the
       document of the last line of the reference section.

       The section is considered ended either by a line matching a
       post-reference-section title pattern that is not followed (within 200
       lines) by a continuing reference marker, or by a run of long
       digits-only lines (probably the axis scale of a graph).
       @param docbody: (list) of strings - the entire plain-text document body.
       @param ref_start_line: (integer) - the index in docbody of the first line
        of the reference section.
       @param ref_line_marker: (string) - the line marker of the first reference
        line.
       @param ref_line_marker_ptn: (string) - the pattern used to search for a
        reference line marker.
       @return: (integer) - index in docbody of the last reference line
         -- OR --
                (None) - if ref_start_line was invalid (not an int, negative,
                or not a valid index into docbody).
    """
    section_ended = False
    x = ref_start_line
    # An index equal to len(docbody) is out of bounds too; this also covers
    # an empty docbody.
    if type(x) is not int or not 0 <= x < len(docbody):
        # The provided 'first line' of the reference section was invalid.
        # Either it was out of bounds in the document body, or it was not a
        # valid integer.
        # Can't safely find end of refs with this info - quit.
        return None
    # Get patterns for testing line:
    t_patterns = get_post_reference_section_title_patterns()
    kw_patterns = get_post_reference_section_keyword_patterns()

    if None not in (ref_line_marker, ref_line_marker_ptn):
        mk_patterns = [re.compile(ref_line_marker_ptn, re.I | re.UNICODE)]
    else:
        mk_patterns = get_reference_line_numeration_marker_patterns()

    current_reference_count = 0
    while x < len(docbody) and not section_ended:
        stripped_line = docbody[x].strip()
        # save the reference count
        num_match = regex_match_list(stripped_line, mk_patterns)
        if num_match:
            try:
                current_reference_count = int(num_match.group("marknum"))
            except (ValueError, IndexError):
                # non numerical references marking
                pass
        # look for a likely section title that would follow a reference section:
        end_match = regex_match_list(stripped_line, t_patterns)
        if not end_match:
            # didn't match a section title - try looking for keywords that
            # suggest the end of a reference section.
            # NOTE(review): the result of this keyword match is never read
            # afterwards, so keywords currently do not end the section.
            end_match = regex_match_list(stripped_line, kw_patterns)
        else:
            # Is it really the end of the reference section? Check within the
            # next 200 lines for other reference numeration markers:
            y = x + 1
            line_found = False
            while y < x + 200 and y < len(docbody) and not line_found:
                num_match = regex_match_list(docbody[y].strip(), mk_patterns)
                # A match consisting only of digits is not treated as a marker
                if num_match and not num_match.group(0).isdigit():
                    try:
                        num = int(num_match.group("marknum"))
                        # Only the next number in sequence continues the refs
                        if current_reference_count + 1 == num:
                            line_found = True
                    except ValueError:
                        # We have the marknum index so it is
                        # numeric pattern for references like
                        # [1], [2] but this match is not a number
                        pass
                    except IndexError:
                        # We have a non numerical references marking
                        # we don't check for a number continuity
                        line_found = True
                y += 1
            if not line_found:
                # No ref line found-end section
                section_ended = True
        if not section_ended:
            # Do this line and the ones right after it contain only numbers?
            # If yes, it's probably the axis scale of a graph in a fig.
            # End refs section
            digit_test_str = _strip_numeric_punctuation(docbody[x])
            if len(digit_test_str) > 10 and digit_test_str.isdigit():
                # The line contains only digits and is longer than 10 chars:
                y = x + 1
                # Window of 4 lines including this one; blank lines widen it
                digit_lines = 4
                num_digit_lines = 1
                while y < x + digit_lines and y < len(docbody):
                    digit_test_str = _strip_numeric_punctuation(docbody[y])
                    if len(digit_test_str) > 10 and digit_test_str.isdigit():
                        num_digit_lines += 1
                    elif len(digit_test_str) == 0:
                        # This is a blank line. Don't count it, to accommodate
                        # documents that are double-line spaced:
                        digit_lines += 1
                    y = y + 1
                if num_digit_lines == digit_lines:
                    section_ended = True
            # x only advances while the section is still open, so on exit
            # x - 1 is the last line that belongs to the section.
            x += 1
    return x - 1
Example #16
0
def find_reference_section_no_title_generic(docbody, marker_patterns):
    """Locate the start of a reference section from its markers alone.

       Intended for documents where the section could not be found by its
       title. Lines are visited from the end of the document backwards,
       looking for a marker whose 'marknum' is "1". Such a line is accepted
       when fewer than 5 lines follow it in the document, or when one of the
       10 lines after it carries a marker whose 'marknum' is "2".
       @param docbody: (list) of strings -each string is a line in the document.
       @param marker_patterns: patterns handed to regex_match_list to
        recognise reference-line markers.
       @return: (dictionary) :
         { 'start_line' : (integer) - index in docbody of 1st reference line,
           'title_string' : (None) - no title is known here,
           'marker' : (string) - the stripped marker of the first reference
                                 line,
           'marker_pattern' : (string) - the regexp string that matched the
                                         marker,
           'title_marker_same_line' : False - title not on same line as
                                      marker.
         }
         -- OR --
                (None) - when the reference section could not be found.
    """
    if not docbody:
        return None

    total_lines = len(docbody)

    for offset_from_end, raw_line in enumerate(reversed(docbody)):
        first_match = regex_match_list(raw_line.strip(), marker_patterns)
        if not first_match:
            continue
        if first_match.group("marknum") != "1":
            continue

        # Remember which regexp recognised the marker
        matched_pattern = first_match.re.pattern

        # The zone checked for a [2] is the 10 lines after this one
        after = total_lines - offset_from_end
        zone = docbody[after : after + 10]

        if len(zone) < 5:
            # A 1 found towards the end is assumed to be a lone reference
            has_successor = True
        else:
            for zone_line in zone:
                second_match = regex_match_list(zone_line.strip(), marker_patterns)
                if second_match and second_match.group("marknum") == "2":
                    has_successor = True
                    break
            else:
                has_successor = False

        if has_successor:
            first_marker = first_match.group("mark")
            return {
                "start_line": total_lines - 1 - offset_from_end,
                "title_string": None,
                "marker": first_marker.strip(),
                "marker_pattern": matched_pattern,
                "title_marker_same_line": False,
            }

    # didn't manage to find the reference section
    return None