コード例 #1
0
 def test_no_title_via_numbers(self):
     """A title-less section marked with bare numbers ("1", "2") is found."""
     # Every document line is its own list element. Without the comma
     # between the two reference lines Python would silently join them
     # into the single line "1 Ref12 Ref2".
     fulltext = ["Hello", "1 Ref1", "2 Ref2"]
     expected = {
         'marker': '1',
         'marker_pattern':
         u'(?P<mark>(?P<left>)\\s*(?P<marknum>\\d+)\\s*(?P<right>))',
         'start_line': 1,
         'title_string': None,
         'title_marker_same_line': False,
         'how_found_start': 4,
     }
     found = get_reference_section_beginning(fulltext)
     self.assertEqual(found, expected)
コード例 #2
0
 def test_simple(self):
     """A section introduced by a "References" title line is found."""
     fulltext = ["Hello", "References", "[1] Ref1"]
     expected = {
         'marker': '[1]',
         'marker_pattern':
         u'\\s*(?P<mark>\\[\\s*(?P<marknum>\\d+)\\s*\\])',
         'start_line': 1,
         'title_string': 'References',
         'title_marker_same_line': False,
         'how_found_start': 1,
     }
     found = get_reference_section_beginning(fulltext)
     self.assertEqual(found, expected)
コード例 #3
0
 def test_no_title_via_dots(self):
     """A title-less section marked with dotted numbers ("1.", "2.") is found."""
     from invenio.refextract_find import get_reference_section_beginning
     # Every document line is its own list element. Without the comma
     # between the two reference lines Python would silently join them
     # into the single line "1. Ref12. Ref2".
     fulltext = ["Hello", "1. Ref1", "2. Ref2"]
     expected = {
         'marker': '1.',
         'marker_pattern':
         u'(?P<mark>(?P<left>)\\s*(?P<marknum>\\d+)\\s*(?P<right>\\.))',
         'start_line': 1,
         'title_string': None,
         'title_marker_same_line': False,
         'how_found_start': 3,
     }
     found = get_reference_section_beginning(fulltext)
     self.assertEqual(found, expected)
コード例 #4
0
def extract_references_from_fulltext(fulltext):
    """Locate and extract the reference section from a fulltext document.

       Page-boundary lines are stripped first, then the start and the end of
       the reference section are located and the lines in between are
       collected, each string being considered a single reference line.
        E.g. a string could be something like:
        '[19] Wilson, A. Unpublished (1986).'
       @param fulltext: (list) of strings, whereby each string is a line of the
        document.
       @return: (tuple) of three items (refs, status, how_found_start):
        refs - (list) of strings, where each string is an extracted reference
         line; empty when nothing could be extracted.
        status - (int) 0 on success, 4 when no start of a reference section
         was found, 5 when a start was found but no end.
        how_found_start - (int) the 'how_found_start' value reported for the
         located section start, or 0 when no start was found.
    """
    # Try to remove pagebreaks, headers, footers
    fulltext = remove_page_boundary_lines(fulltext)

    # Find start of refs section
    ref_sect_start = get_reference_section_beginning(fulltext)
    if ref_sect_start is None:
        # No References
        write_message(
            "* extract_references_from_fulltext: "
            "ref_sect_start is None",
            verbose=2)
        return [], 4, 0

    # Report how the section start was detected instead of always returning
    # the initial 0; fall back to 0 if the key is absent.
    how_found_start = ref_sect_start.get("how_found_start", 0)

    # A reference section was found, however weak: look for its end
    ref_sect_end = find_end_of_reference_section(
        fulltext,
        ref_sect_start["start_line"],
        ref_sect_start["marker"],
        ref_sect_start["marker_pattern"])
    if ref_sect_end is None:
        # No End to refs? Not safe to extract
        write_message(
            "* extract_references_from_fulltext: "
            "no end to refs!",
            verbose=2)
        return [], 5, how_found_start

    # The end of the reference section was found.. start extraction
    refs = get_reference_lines(
        fulltext, ref_sect_start["start_line"], ref_sect_end,
        ref_sect_start["title_string"],
        ref_sect_start["marker_pattern"],
        ref_sect_start["title_marker_same_line"])
    return refs, 0, how_found_start
コード例 #5
0
 def test_no_title_via_brackets(self):
     """A title-less section marked with bracketed numbers ("[1]") is found."""
     # Every document line is its own list element. Without the comma
     # between the two reference lines Python would silently join them
     # into the single line "[1] Ref1[2] Ref2".
     fulltext = ["Hello", "[1] Ref1", "[2] Ref2"]
     expected = {
         'marker': '[1]',
         'marker_pattern':
         u'(?P<mark>(?P<left>\\[)\\s*(?P<marknum>\\d+)\\s*(?P<right>\\]))',
         'start_line': 1,
         'title_string': None,
         'title_marker_same_line': False,
         'how_found_start': 2,
     }
     found = get_reference_section_beginning(fulltext)
     self.assertEqual(found, expected)
コード例 #6
0
 def test_no_title_via_numbers(self):
     """A title-less section marked with bare numbers ("1", "2") is found."""
     # Trailing commas keep each document line a separate list element;
     # adjacent string literals without a comma are concatenated by Python.
     fulltext = [
         "Hello",
         "1 Ref1",
         "2 Ref2",
     ]
     found = get_reference_section_beginning(fulltext)
     self.assertEqual(found, {
         'marker': '1',
         'marker_pattern': u'(?P<mark>(?P<left>)\\s*(?P<marknum>\\d+)\\s*(?P<right>))',
         'start_line': 1,
         'title_string': None,
         'title_marker_same_line': False,
         'how_found_start': 4,
     })
コード例 #7
0
 def test_no_title_via_brackets(self):
     """A title-less section marked with bracketed numbers ("[1]") is found."""
     # Trailing commas keep each document line a separate list element;
     # adjacent string literals without a comma are concatenated by Python.
     fulltext = [
         "Hello",
         "[1] Ref1",
         "[2] Ref2",
     ]
     found = get_reference_section_beginning(fulltext)
     self.assertEqual(found, {
         'marker': '[1]',
         'marker_pattern': u'(?P<mark>(?P<left>\\[)\\s*(?P<marknum>\\d+)\\s*(?P<right>\\]))',
         'start_line': 1,
         'title_string': None,
         'title_marker_same_line': False,
         'how_found_start': 2,
     })
コード例 #8
0
 def test_simple(self):
     """A section introduced by a "References" title line is found."""
     fulltext = ["Hello", "References", "[1] Ref1"]
     expected = {
         'marker': '[1]',
         'marker_pattern': u'\\s*(?P<mark>\\[\\s*(?P<marknum>\\d+)\\s*\\])',
         'start_line': 1,
         'title_string': 'References',
         'title_marker_same_line': False,
         'how_found_start': 1,
     }
     found = get_reference_section_beginning(fulltext)
     self.assertEqual(found, expected)
コード例 #9
0
def extract_references_from_fulltext(fulltext):
    """Locate and extract the reference section from a fulltext document.

       Page-boundary lines are stripped first, then the start and the end of
       the reference section are located and the lines in between are
       collected, each string being considered a single reference line.
        E.g. a string could be something like:
        '[19] Wilson, A. Unpublished (1986).'
       @param fulltext: (list) of strings, whereby each string is a line of the
        document.
       @return: (tuple) of three items (refs, status, how_found_start):
        refs - (list) of strings, where each string is an extracted reference
         line; empty when nothing could be extracted.
        status - (int) 0 on success, 4 when no start of a reference section
         was found, 5 when a start was found but no end.
        how_found_start - (int) the 'how_found_start' value reported for the
         located section start, or 0 when no start was found.
    """
    # Try to remove pagebreaks, headers, footers
    fulltext = remove_page_boundary_lines(fulltext)
    refs = []
    status = 0
    # How ref section found flag
    how_found_start = 0
    # Find start of refs section
    ref_sect_start = get_reference_section_beginning(fulltext)

    if ref_sect_start is None:
        # No References
        status = 4
        write_message("* extract_references_from_fulltext: "
                         "ref_sect_start is None", verbose=2)
    else:
        # Propagate how the section start was detected rather than always
        # returning the initial 0; fall back to 0 if the key is absent.
        how_found_start = ref_sect_start.get("how_found_start", 0)
        # A reference section was found, however weak: look for its end
        ref_sect_end = find_end_of_reference_section(
            fulltext,
            ref_sect_start["start_line"],
            ref_sect_start["marker"],
            ref_sect_start["marker_pattern"])
        if ref_sect_end is None:
            # No End to refs? Not safe to extract
            status = 5
            write_message("* extract_references_from_fulltext: "
                             "no end to refs!", verbose=2)
        else:
            # The end of the reference section was found.. start extraction
            refs = get_reference_lines(fulltext,
                                       ref_sect_start["start_line"],
                                       ref_sect_end,
                                       ref_sect_start["title_string"],
                                       ref_sect_start["marker_pattern"],
                                       ref_sect_start["title_marker_same_line"])

    return refs, status, how_found_start
コード例 #10
0
def extract_references_from_string_xml(source, is_only_references=True):
    """Extract references from a string

    The result is given in marcxml (it is whatever parse_references()
    returns for the collected reference lines).

    @param source: (string) the document; it is split into lines on
     newline characters.
    @param is_only_references: (bool) when False, the reference section is
     first located inside the document with
     extract_references_from_fulltext(). When True (the default), the
     reference lines are rebuilt from the whole document using the marker
     pattern of the detected section start, or, if none is detected, the
     numeration found in the body.
    """
    docbody = source.split("\n")
    if not is_only_references:
        # Only the reference lines are needed; status and detection flag
        # are discarded.
        reflines, _status, _how_found = \
            extract_references_from_fulltext(docbody)
    else:
        refs_info = get_reference_section_beginning(docbody)
        if not refs_info:
            # No recognisable section start: treat the whole body as the
            # reference section.
            refs_info, _ = find_numeration_in_body(docbody)
            refs_info["start_line"] = 0
            # Plain int index of the last line (a stray trailing comma
            # previously turned this into a 1-tuple).
            refs_info["end_line"] = len(docbody) - 1

        reflines = rebuild_reference_lines(docbody, refs_info["marker_pattern"])
    return parse_references(reflines)
コード例 #11
0
ファイル: refextract_api.py プロジェクト: bopopescu/invenio
def extract_references_from_string_xml(source, is_only_references=True):
    """Extract references from a string

    The result is given in marcxml (it is whatever parse_references()
    returns for the collected reference lines).

    @param source: (string) the document; it is split into lines on
     newline characters.
    @param is_only_references: (bool) when False, the reference section is
     first located inside the document with
     extract_references_from_fulltext(). When True (the default), the
     reference lines are rebuilt from the whole document using the marker
     pattern of the detected section start, or, if none is detected, the
     numeration found in the body.
    """
    docbody = source.split('\n')
    if is_only_references:
        refs_info = get_reference_section_beginning(docbody)
        if not refs_info:
            # No recognisable section start: treat the whole body as the
            # reference section.
            refs_info, _ = find_numeration_in_body(docbody)
            refs_info['start_line'] = 0
            # Plain int index of the last line (a stray trailing comma
            # previously turned this into a 1-tuple).
            refs_info['end_line'] = len(docbody) - 1

        reflines = rebuild_reference_lines(docbody, refs_info['marker_pattern'])
    else:
        # Only the reference lines are needed; status and detection flag
        # are discarded.
        reflines, _status, _how_found = \
            extract_references_from_fulltext(docbody)
    return parse_references(reflines)
コード例 #12
0
 def test_no_section(self):
     """Empty input yields no reference section (None)."""
     found = get_reference_section_beginning("")
     expected = None
     self.assertEqual(found, expected)
コード例 #13
0
 def test_no_section(self):
     """Empty input yields no reference section (None)."""
     empty_document = ""
     self.assertEqual(get_reference_section_beginning(empty_document), None)
コード例 #14
0
 def test_no_section(self):
     """Empty input yields no reference section (None)."""
     from invenio.refextract_find import get_reference_section_beginning
     empty_document = ""
     found = get_reference_section_beginning(empty_document)
     self.assertEqual(found, None)