Example #1
0
def find_numeration_in_body(docbody):
    """Scan the given lines for a reference-line numeration marker.

    Whitespace-only lines are skipped.  Every other line is tested against
    the reference-line marker patterns and the recorded details always
    describe the most recently examined line.  Scanning stops as soon as a
    marker is met in which the first match of ``re_num`` is "1".

    @param docbody: (iterable) of strings - the lines to examine.
    @return: (tuple) of two items:
              + (dict) with the keys 'marker', 'marker_pattern' and
                'title_marker_same_line' (always False here), or None if
                no non-blank line was examined.
              + (bool) - always False in this function.
    """
    patterns = get_reference_line_numeration_marker_patterns()
    details = None

    for text in docbody:
        if text.isspace():
            # Nothing to learn from a whitespace-only line.
            continue

        hit = regex_match_list(text, patterns)
        if not hit:
            # This line is not numerated like a reference line.
            details = {"title_marker_same_line": False, "marker": None, "marker_pattern": None}
            continue

        marker = hit.group("mark")
        marker_pattern = hit.re.pattern
        details = {
            "marker": marker,
            "marker_pattern": marker_pattern,
            "title_marker_same_line": False,
        }

        # A marker such as [1] or (1) is the first reference: stop here so
        # that its details are the ones returned.
        number = re_num.search(marker)
        if number and number.group(0) == "1":
            break

    return details, False
Example #2
0
def find_numeration_in_body(docbody):
    """Look through the lines of `docbody` for a numeration marker.

    Lines made only of whitespace are ignored.  For each remaining line the
    result is overwritten: with the marker and its pattern when the line
    matches one of the marker patterns, or with an "empty" description
    otherwise.  The search ends early once a marker is seen in which the
    first match of ``re_num`` is '1'.

    @param docbody: (iterable) of strings - the lines to examine.
    @return: (tuple) - (details, found_title) where details is a dict with
             the keys 'marker', 'marker_pattern', 'title_marker_same_line'
             (or None when every line was blank or there were no lines) and
             found_title is always False.
    """
    numeration_patterns = get_reference_line_numeration_marker_patterns()
    outcome = None

    # Lazily drop the blank lines; the remaining ones are visited in order.
    meaningful_lines = (text for text in docbody if not text.isspace())

    for text in meaningful_lines:
        found = regex_match_list(text, numeration_patterns)
        if found:
            marker_text = found.group('mark')
            pattern_text = found.re.pattern
            outcome = {
                'marker': marker_text,
                'marker_pattern': pattern_text,
                'title_marker_same_line': False,
            }
            # Markers like [1] or (1) identify the first reference line,
            # after which there is no need to look any further.
            first_number = re_num.search(marker_text)
            if first_number and first_number.group(0) == '1':
                break
        else:
            # The line carries no numeration at all.
            outcome = {
                'title_marker_same_line': False,
                'marker': None,
                'marker_pattern': None,
            }

    return outcome, False
Example #3
0
def find_numeration_in_title(docbody, title):
    """Look for a numeration marker on the same line as the section title.

    Only the first line of `docbody` is inspected.  It is matched against
    the marker patterns built around the (regex-escaped) title.

    @param docbody: (sequence) of strings - the lines to examine.
    @param title: (string) - the title text; it is escaped before being
     embedded in the patterns.
    @return: (tuple) of two items:
              + (dict) with the keys 'marker', 'marker_pattern' and
                'title_marker_same_line' (True), or None when `docbody`
                is empty or its first line does not match.
              + (bool) - True only when the first match of ``re_num`` in
                the marker is "1".
    """
    try:
        opening_line = docbody[0]
    except IndexError:
        # An empty body has no first line to inspect.
        return None, False

    # Escape the title so that regex metacharacters inside it (for example
    # 'References[') are matched literally.
    patterns = get_reference_line_numeration_marker_patterns(re.escape(title))
    hit = regex_match_list(opening_line, patterns)
    if not hit:
        return None, False

    marker = hit.group("mark")
    marker_pattern = hit.re.pattern
    number = re_num.search(marker)
    # The marker details are reported whatever the number is; only the
    # "found" flag depends on the marker being the first one.
    is_first_reference = bool(number and number.group(0) == "1")
    details = {
        "marker": marker,
        "marker_pattern": marker_pattern,
        "title_marker_same_line": True,
    }
    return details, is_first_reference
Example #4
0
def find_numeration_in_body(docbody):
    """Find the numeration marker of the first reference line.

    Whitespace-only lines and lines that do not match a marker pattern are
    skipped.  A matching line is accepted when its 'marknum' group equals
    '1', or when the matching pattern has no 'marknum' group at all.

    @param docbody: (iterable) of strings - the lines to examine.
    @return: (tuple) of two items:
              + (dict) with the keys 'marker', 'marker_pattern' and
                'title_marker_same_line' (always False); the first two are
                None when no line was accepted.
              + (bool) - always False in this function.
    """
    patterns = get_reference_line_numeration_marker_patterns()

    for text in docbody:
        if text.isspace():
            continue

        hit = regex_match_list(text, patterns)
        if not hit:
            continue

        try:
            number = hit.group('marknum')
        except IndexError:
            # The pattern defines no 'marknum' group: take the marker as is.
            pass
        else:
            # Numbered markers such as [1] or (1): only the first counts.
            if number != '1':
                continue

        marker = hit.group('mark')
        marker_pattern = hit.re.pattern
        return {
            'marker': marker,
            'marker_pattern': marker_pattern,
            'title_marker_same_line': False,
        }, False

    # No line was accepted: report the absence of numeration.
    return {
        'title_marker_same_line': False,
        'marker': None,
        'marker_pattern': None,
    }, False
Example #5
0
def find_numeration_in_body(docbody):
    """Search `docbody` for the marker that numbers the first reference.

    The result starts out as "no numeration" and is replaced by the first
    non-blank line that matches a marker pattern and either has '1' in its
    'marknum' group or comes from a pattern without such a group.

    @param docbody: (iterable) of strings - the lines to examine.
    @return: (tuple) - (details, found_title); details is a dict with the
             keys 'title_marker_same_line' (always False), 'marker' and
             'marker_pattern', and found_title is always False.
    """
    numeration_patterns = get_reference_line_numeration_marker_patterns()

    # Assume there is no numeration until a suitable marker shows up.
    outcome = {
        'title_marker_same_line': False,
        'marker': None,
        'marker_pattern': None,
    }

    for text in docbody:
        if text.isspace():
            # Blank line: nothing to test.
            continue

        found = regex_match_list(text, numeration_patterns)
        if found:
            try:
                # Something like [1] or (1): only number one is wanted.
                acceptable = found.group('marknum') == '1'
            except IndexError:
                # No 'marknum' group in this pattern, so no number to check.
                acceptable = True

            if acceptable:
                marker_text = found.group('mark')
                pattern_text = found.re.pattern
                outcome = {
                    'marker': marker_text,
                    'marker_pattern': pattern_text,
                    'title_marker_same_line': False,
                }
                break

    return outcome, False
Example #6
0
def remove_reference_line_marker(line):
    """Split a reference line into its leading 'marker' and the remainder.
       @param line: (string) - the reference line.
       @return: (tuple) containing two strings:
                 + The reference line's marker (or, if there was not one,
                   a single 'space' character).
                 + The reference line with its marker removed from the
                   beginning and leading whitespace stripped.
    """
    patterns = get_reference_line_numeration_marker_patterns()
    stripped = line.lstrip()

    found = regex_match_list(stripped, patterns)
    if found is None:
        # No marker: signal it with a single space and keep the text as is.
        return (u" ", stripped)

    # Everything up to the end of the match belongs to the marker; the
    # reference text is whatever follows it.
    return (found.group(u'mark'), stripped[found.end():].lstrip())
Example #7
0
def remove_reference_line_marker(line):
    """Detach the numeration 'marker' found at the start of a reference line.
       @param line: (string) - the reference line.
       @return: (tuple) containing two strings:
                 + The marker that was found (or a single 'space' character
                   when the line does not start with a marker).
                 + The line without leading whitespace and, when a marker
                   was found, without that marker either.
    """
    # Patterns that recognise a marker at the beginning of a reference line:
    numeration_patterns = get_reference_line_numeration_marker_patterns()
    remainder = line.lstrip()

    match = regex_match_list(remainder, numeration_patterns)

    if match is None:
        # The line has no marker:
        marker = u" "
    else:
        marker = match.group(u'mark')
        # Drop the matched prefix and any whitespace that follows it:
        remainder = remainder[match.end():].lstrip()
    return (marker, remainder)
Example #8
0
def find_numeration_in_title(docbody, title):
    """Check whether the title line also carries a numeration marker.

    The first line of `docbody` is matched against the marker patterns
    generated for the (regex-escaped) title.

    @param docbody: (sequence) of strings - the lines to examine.
    @param title: (string) - the title text.
    @return: (tuple) - (details, found_title).  details is None when
             `docbody` is empty or its first line does not match, otherwise
             a dict with the keys 'marker', 'marker_pattern' and
             'title_marker_same_line' (True).  found_title is True only
             when the first match of ``re_num`` in the marker is '1'.
    """
    result = None
    title_found = False

    try:
        head = docbody[0]
    except IndexError:
        # No lines at all, hence nothing to match.
        return result, title_found

    # The title goes into a regular expression, so characters such as the
    # '[' of 'References[' have to be escaped first.
    escaped_title = re.escape(title)
    title_patterns = get_reference_line_numeration_marker_patterns(escaped_title)

    match = regex_match_list(head, title_patterns)
    if match:
        marker_text = match.group('mark')
        pattern_text = match.re.pattern
        digits = re_num.search(marker_text)
        # The title counts as found only when the marker is number one;
        # the marker details are returned in either case.
        title_found = bool(digits) and digits.group(0) == '1'
        result = {
            'marker': marker_text,
            'marker_pattern': pattern_text,
            'title_marker_same_line': True
        }

    return result, title_found
Example #9
0
 def test_get_reference_line_numeration_marker_patterns(self):
     """Check that more than two numeration marker patterns are returned."""
     r = refextract_re.get_reference_line_numeration_marker_patterns()
     # assertGreater replaces the deprecated `assert_` alias (removed in
     # Python 3.12) and reports both operands when the check fails.
     self.assertGreater(len(r), 2)
 def test_get_reference_line_numeration_marker_patterns(self):
     """Check that more than two numeration marker patterns are returned."""
     r = refextract_re.get_reference_line_numeration_marker_patterns()
     # assertGreater replaces the deprecated `assert_` alias (removed in
     # Python 3.12) and reports both operands when the check fails.
     self.assertGreater(len(r), 2)
Example #11
0
def find_end_of_reference_section(docbody,
                                  ref_start_line,
                                  ref_line_marker,
                                  ref_line_marker_ptn):
    """Given that the start of a document's reference section has already been
       recognised, this function is tasked with finding the line-number in the
       document of the last line of the reference section.

       The section is taken to end either at a line that matches a
       post-reference-section title and is not followed (within the
       look-ahead window) by the next expected reference marker, or at a
       run of digit-only lines (probably the axis scale of a graph).
       @param docbody: (list) of strings - the entire plain-text document body.
       @param ref_start_line: (integer) - the index in docbody of the first line
        of the reference section.
       @param ref_line_marker: (string) - the line marker of the first reference
        line.
       @param ref_line_marker_ptn: (string) - the pattern used to search for a
        reference line marker.
       @return: (integer) - index in docbody of the last reference line
         -- OR --
                (None) - if ref_start_line was invalid.
    """
    section_ended = False
    x = ref_start_line
    if type(x) is not int or x < 0 or \
           x > len(docbody) or len(docbody) < 1:
        # The provided 'first line' of the reference section was invalid.
        # Either it was out of bounds in the document body, or it was not a
        # valid integer.
        # Can't safely find end of refs with this info - quit.
        return None
    # Get patterns for testing line:
    t_patterns = get_post_reference_section_title_patterns()
    kw_patterns = get_post_reference_section_keyword_patterns()

    # When the marker of the first reference is known, only look for that
    # marker style; otherwise fall back to every known marker pattern.
    if None not in (ref_line_marker, ref_line_marker_ptn):
        mk_patterns = [re.compile(ref_line_marker_ptn, re.I|re.UNICODE)]
    else:
        mk_patterns = get_reference_line_numeration_marker_patterns()

    current_reference_count = 0
    while x < len(docbody) and not section_ended:
        # save the reference count
        num_match = regex_match_list(docbody[x].strip(), mk_patterns)
        if num_match:
            try:
                current_reference_count = int(num_match.group('marknum'))
            except (ValueError, IndexError):
                # non numerical references marking
                pass
        # look for a likely section title that would follow a reference section:
        end_match = regex_match_list(docbody[x].strip(), t_patterns)
        if not end_match:
            # didn't match a section title - try looking for keywords that
            # suggest the end of a reference section:
            # NOTE(review): the result of this keyword match is stored but
            # never read afterwards, so keyword lines do not end the
            # section - confirm that this is intended.
            end_match = regex_match_list(docbody[x].strip(), kw_patterns)
        else:
            # Is it really the end of the reference section? Look ahead (up
            # to line x + 200, exclusive) for other reference numeration
            # markers:
            y = x + 1
            line_found = False
            while y < x + 200 and y < len(docbody) and not line_found:
                num_match = regex_match_list(docbody[y].strip(), mk_patterns)
                # A match made only of digits is not treated as a marker.
                if num_match and not num_match.group(0).isdigit():
                    try:
                        num = int(num_match.group('marknum'))
                        # Only the marker that continues the numbering
                        # proves that the references carry on.
                        if current_reference_count + 1 == num:
                            line_found = True
                    except ValueError:
                        # We have the marknum index so it is
                        # numeric pattern for references like
                        # [1], [2] but this match is not a number
                        pass
                    except IndexError:
                        # We have a non numerical references marking
                        # we don't check for a number continuity
                        line_found = True
                y += 1
            if not line_found:
                # No ref line found-end section
                section_ended = True
        if not section_ended:
            # Do this line & the next 3 non-blank lines simply contain
            # numbers? If yes, it's probably the axis scale of a graph in a
            # fig. End refs section
            digit_test_str = docbody[x].replace(" ", "").\
                                        replace(".", "").\
                                        replace("-", "").\
                                        replace("+", "").\
                                        replace(u"\u00D7", "").\
                                        replace(u"\u2212", "").\
                                        strip()
            if len(digit_test_str) > 10 and digit_test_str.isdigit():
                # The line contains only digits and is longer than 10 chars:
                y = x + 1
                # Number of digit-only lines (this one included) required
                # to decide that the section has ended:
                digit_lines = 4
                # Exclusive end of the look-ahead window. It is widened for
                # every blank line, while the required count above stays
                # fixed so that double-spaced text can still reach it:
                window_end = x + digit_lines
                num_digit_lines = 1
                while y < window_end and y < len(docbody):
                    digit_test_str = docbody[y].replace(" ", "").\
                                     replace(".", "").\
                                     replace("-", "").\
                                     replace("+", "").\
                                     replace(u"\u00D7", "").\
                                     replace(u"\u2212", "").\
                                     strip()
                    if len(digit_test_str) > 10 and digit_test_str.isdigit():
                        num_digit_lines += 1
                    elif len(digit_test_str) == 0:
                        # This is a blank line. Don't count it, to accommodate
                        # documents that are double-line spaced:
                        window_end += 1
                    y = y + 1
                if num_digit_lines == digit_lines:
                    section_ended = True
            x += 1
    return x - 1
Example #12
0
def find_end_of_reference_section(docbody, ref_start_line, ref_line_marker, ref_line_marker_ptn):
    """Given that the start of a document's reference section has already been
       recognised, this function is tasked with finding the line-number in the
       document of the last line of the reference section.

       The section is taken to end either at a line that matches a
       post-reference-section title and is not followed (within the
       look-ahead window) by the next expected reference marker, or at a
       run of digit-only lines (probably the axis scale of a graph).
       @param docbody: (list) of strings - the entire plain-text document body.
       @param ref_start_line: (integer) - the index in docbody of the first line
        of the reference section.
       @param ref_line_marker: (string) - the line marker of the first reference
        line.
       @param ref_line_marker_ptn: (string) - the pattern used to search for a
        reference line marker.
       @return: (integer) - index in docbody of the last reference line
         -- OR --
                (None) - if ref_start_line was invalid.
    """
    section_ended = False
    x = ref_start_line
    if type(x) is not int or x < 0 or x > len(docbody) or len(docbody) < 1:
        # The provided 'first line' of the reference section was invalid.
        # Either it was out of bounds in the document body, or it was not a
        # valid integer.
        # Can't safely find end of refs with this info - quit.
        return None
    # Get patterns for testing line:
    t_patterns = get_post_reference_section_title_patterns()
    kw_patterns = get_post_reference_section_keyword_patterns()

    # When the marker of the first reference is known, only look for that
    # marker style; otherwise fall back to every known marker pattern.
    if None not in (ref_line_marker, ref_line_marker_ptn):
        mk_patterns = [re.compile(ref_line_marker_ptn, re.I | re.UNICODE)]
    else:
        mk_patterns = get_reference_line_numeration_marker_patterns()

    current_reference_count = 0
    while x < len(docbody) and not section_ended:
        # save the reference count
        num_match = regex_match_list(docbody[x].strip(), mk_patterns)
        if num_match:
            try:
                current_reference_count = int(num_match.group("marknum"))
            except (ValueError, IndexError):
                # non numerical references marking
                pass
        # look for a likely section title that would follow a reference section:
        end_match = regex_match_list(docbody[x].strip(), t_patterns)
        if not end_match:
            # didn't match a section title - try looking for keywords that
            # suggest the end of a reference section:
            # NOTE(review): the result of this keyword match is stored but
            # never read afterwards, so keyword lines do not end the
            # section - confirm that this is intended.
            end_match = regex_match_list(docbody[x].strip(), kw_patterns)
        else:
            # Is it really the end of the reference section? Look ahead (up
            # to line x + 200, exclusive) for other reference numeration
            # markers:
            y = x + 1
            line_found = False
            while y < x + 200 and y < len(docbody) and not line_found:
                num_match = regex_match_list(docbody[y].strip(), mk_patterns)
                # A match made only of digits is not treated as a marker.
                if num_match and not num_match.group(0).isdigit():
                    try:
                        num = int(num_match.group("marknum"))
                        # Only the marker that continues the numbering
                        # proves that the references carry on.
                        if current_reference_count + 1 == num:
                            line_found = True
                    except ValueError:
                        # We have the marknum index so it is
                        # numeric pattern for references like
                        # [1], [2] but this match is not a number
                        pass
                    except IndexError:
                        # We have a non numerical references marking
                        # we don't check for a number continuity
                        line_found = True
                y += 1
            if not line_found:
                # No ref line found-end section
                section_ended = True
        if not section_ended:
            # Do this line & the next 3 non-blank lines simply contain
            # numbers? If yes, it's probably the axis scale of a graph in a
            # fig. End refs section
            digit_test_str = (
                docbody[x]
                .replace(" ", "")
                .replace(".", "")
                .replace("-", "")
                .replace("+", "")
                .replace(u"\u00D7", "")
                .replace(u"\u2212", "")
                .strip()
            )
            if len(digit_test_str) > 10 and digit_test_str.isdigit():
                # The line contains only digits and is longer than 10 chars:
                y = x + 1
                # Number of digit-only lines (this one included) required
                # to decide that the section has ended:
                digit_lines = 4
                # Exclusive end of the look-ahead window. It is widened for
                # every blank line, while the required count above stays
                # fixed so that double-spaced text can still reach it:
                window_end = x + digit_lines
                num_digit_lines = 1
                while y < window_end and y < len(docbody):
                    digit_test_str = (
                        docbody[y]
                        .replace(" ", "")
                        .replace(".", "")
                        .replace("-", "")
                        .replace("+", "")
                        .replace(u"\u00D7", "")
                        .replace(u"\u2212", "")
                        .strip()
                    )
                    if len(digit_test_str) > 10 and digit_test_str.isdigit():
                        num_digit_lines += 1
                    elif len(digit_test_str) == 0:
                        # This is a blank line. Don't count it, to accommodate
                        # documents that are double-line spaced:
                        window_end += 1
                    y = y + 1
                if num_digit_lines == digit_lines:
                    section_ended = True
            x += 1
    return x - 1