コード例 #1
0
 def test_no_title_via_numbers(self):
     """Expect a number-marked section start for a text without a title.

     The expected description has no title string, the bare marker '1',
     start line 1 and a 'how_found_start' value of 4.
     """
     from invenio.legacy.refextract.find import get_reference_section_beginning
     # NOTE(review): there is no comma between the last two literals, so
     # Python joins them into a single list element -- confirm whether two
     # separate lines were intended.
     document = ["Hello", "1 Ref1" "2 Ref2"]
     found = get_reference_section_beginning(document)
     expected = {
         'how_found_start': 4,
         'title_marker_same_line': False,
         'title_string': None,
         'start_line': 1,
         'marker_pattern':
         u'(?P<mark>(?P<left>)\\s*(?P<marknum>\\d+)\\s*(?P<right>))',
         'marker': '1',
     }
     self.assertEqual(found, expected)
コード例 #2
0
ファイル: text.py プロジェクト: chokribr/invenio-1
def extract_references_from_fulltext(fulltext):
    """Locate and extract the reference section from a fulltext document.

    Each extracted reference line is a string, e.g. something like:
        '[19] Wilson, A. Unpublished (1986).'

    @param fulltext: (list) of strings, whereby each string is a line of the
     document.
    @return: (tuple) ``(refs, status, how_found_start)`` where
     - refs is a (list) of strings, each one an extracted reference line
       (empty when nothing could be extracted);
     - status is 0 when extraction ran, 4 when no start of a reference
       section was found, 5 when a start but no end was found;
     - how_found_start is the 'how_found_start' value reported for the
       located section start, or 0 when no start was found.
    """
    # Try to remove pagebreaks, headers, footers
    fulltext = remove_page_boundary_lines(fulltext)
    status = 0
    # How ref section found flag; stays 0 unless a section start is located
    how_found_start = 0
    # Find start of refs section
    ref_sect_start = get_reference_section_beginning(fulltext)

    if ref_sect_start is None:
        # No References
        refs = []
        status = 4
        write_message(
            "* extract_references_from_fulltext: "
            "ref_sect_start is None",
            verbose=2)
    else:
        # Report how the start was found instead of always returning 0.
        # .get() keeps the old value of 0 if the key is ever absent.
        how_found_start = ref_sect_start.get("how_found_start", 0)
        # If a reference section was found, however weak
        ref_sect_end = find_end_of_reference_section(
            fulltext,
            ref_sect_start["start_line"],
            ref_sect_start["marker"],
            ref_sect_start["marker_pattern"])
        if ref_sect_end is None:
            # No End to refs? Not safe to extract
            refs = []
            status = 5
            write_message(
                "* extract_references_from_fulltext: "
                "no end to refs!",
                verbose=2)
        else:
            # If the end of the reference section was found.. start extraction
            refs = get_reference_lines(
                fulltext, ref_sect_start["start_line"], ref_sect_end,
                ref_sect_start["title_string"],
                ref_sect_start["marker_pattern"],
                ref_sect_start["title_marker_same_line"])

    return refs, status, how_found_start
コード例 #3
0
 def test_simple(self):
     """Expect a titled section start for 'References' followed by '[1]'.

     The expected description carries the title string 'References', the
     marker '[1]', start line 1 and a 'how_found_start' value of 1.
     """
     from invenio.legacy.refextract.find import get_reference_section_beginning
     document = ["Hello", "References", "[1] Ref1"]
     found = get_reference_section_beginning(document)
     expected = {
         'how_found_start': 1,
         'title_marker_same_line': False,
         'title_string': 'References',
         'start_line': 1,
         'marker_pattern':
         u'\\s*(?P<mark>\\[\\s*(?P<marknum>\\d+)\\s*\\])',
         'marker': '[1]',
     }
     self.assertEqual(found, expected)
コード例 #4
0
ファイル: text.py プロジェクト: mhellmic/b2share
def extract_references_from_fulltext(fulltext):
    """Locate and extract the reference section from a fulltext document.

    Each extracted reference line is a string, e.g. something like:
        '[19] Wilson, A. Unpublished (1986).'

    @param fulltext: (list) of strings, whereby each string is a line of the
     document.
    @return: (tuple) ``(refs, status, how_found_start)`` where
     - refs is a (list) of strings, each one an extracted reference line
       (empty when nothing could be extracted);
     - status is 0 when extraction ran, 4 when no start of a reference
       section was found, 5 when a start but no end was found;
     - how_found_start is the 'how_found_start' value reported for the
       located section start, or 0 when no start was found.
    """
    # Try to remove pagebreaks, headers, footers
    fulltext = remove_page_boundary_lines(fulltext)
    status = 0
    # How ref section found flag; stays 0 unless a section start is located
    how_found_start = 0
    # Find start of refs section
    ref_sect_start = get_reference_section_beginning(fulltext)

    if ref_sect_start is None:
        # No References
        refs = []
        status = 4
        write_message("* extract_references_from_fulltext: "
                      "ref_sect_start is None", verbose=2)
    else:
        # Report how the start was found instead of always returning 0.
        # .get() keeps the old value of 0 if the key is ever absent.
        how_found_start = ref_sect_start.get("how_found_start", 0)
        # If a reference section was found, however weak
        ref_sect_end = find_end_of_reference_section(
            fulltext,
            ref_sect_start["start_line"],
            ref_sect_start["marker"],
            ref_sect_start["marker_pattern"])
        if ref_sect_end is None:
            # No End to refs? Not safe to extract
            refs = []
            status = 5
            write_message("* extract_references_from_fulltext: "
                          "no end to refs!", verbose=2)
        else:
            # If the end of the reference section was found.. start extraction
            refs = get_reference_lines(fulltext,
                                       ref_sect_start["start_line"],
                                       ref_sect_end,
                                       ref_sect_start["title_string"],
                                       ref_sect_start["marker_pattern"],
                                       ref_sect_start["title_marker_same_line"],
                                       ref_sect_start["marker"])

    return refs, status, how_found_start
コード例 #5
0
 def test_no_title_via_numbers(self):
     """Check the section start reported for an untitled, numbered text.

     The result must equal a description with marker '1', no title
     string, start line 1 and 'how_found_start' set to 4.
     """
     from invenio.legacy.refextract.find import get_reference_section_beginning
     # NOTE(review): the last two literals are not separated by a comma,
     # so Python concatenates them into one list element -- confirm
     # whether separate lines were intended.
     lines = [
         "Hello",
         "1 Ref1"
         "2 Ref2"
     ]
     expected = dict([
         ('marker', '1'),
         ('marker_pattern', u'(?P<mark>(?P<left>)\\s*(?P<marknum>\\d+)\\s*(?P<right>))'),
         ('start_line', 1),
         ('title_string', None),
         ('title_marker_same_line', False),
         ('how_found_start', 4),
     ])
     self.assertEqual(get_reference_section_beginning(lines), expected)
コード例 #6
0
 def test_simple(self):
     """Check the section start reported for a titled, bracket-marked text.

     The result must equal a description with title string 'References',
     marker '[1]', start line 1 and 'how_found_start' set to 1.
     """
     from invenio.legacy.refextract.find import get_reference_section_beginning
     lines = ["Hello", "References", "[1] Ref1"]
     expected = dict([
         ('marker', '[1]'),
         ('marker_pattern', u'^\\s*(?P<mark>\\[\\s*(?P<marknum>\\d+)\\s*\\])'),
         ('start_line', 1),
         ('title_string', 'References'),
         ('title_marker_same_line', False),
         ('how_found_start', 1),
     ])
     self.assertEqual(get_reference_section_beginning(lines), expected)
コード例 #7
0
    def test_no_title_via_numbers2(self):
        """Expect a number-marked start when markers sit on their own lines.

        The input has no title line; the numbers '1' and '2' each occupy
        a separate line, with a '(3)' line in between. The expected
        description has marker '1', start line 1 and 'how_found_start' 4.
        """
        from invenio.legacy.refextract.find import get_reference_section_beginning

        document = ["Hello", "1", "Ref1", "(3)", "2", "Ref2"]
        found = get_reference_section_beginning(document)
        expected = {
            "how_found_start": 4,
            "title_marker_same_line": False,
            "title_string": None,
            "start_line": 1,
            "marker_pattern": u"(?P<mark>(?P<left>)\\s*(?P<marknum>\\d+)\\s*(?P<right>))",
            "marker": "1",
        }
        self.assertEqual(found, expected)
コード例 #8
0
    def test_simple(self):
        """Expect a titled start for a 'References' line followed by '[1]'.

        The expected description has title string 'References', marker
        '[1]', start line 1 and 'how_found_start' 1.
        """
        from invenio.legacy.refextract.find import get_reference_section_beginning

        document = ["Hello", "References", "[1] Ref1"]
        found = get_reference_section_beginning(document)
        expected = {
            "how_found_start": 1,
            "title_marker_same_line": False,
            "title_string": "References",
            "start_line": 1,
            "marker_pattern": u"\\s*(?P<mark>\\[\\s*(?P<marknum>\\d+)\\s*\\])",
            "marker": "[1]",
        }
        self.assertEqual(found, expected)
コード例 #9
0
 def test_no_section(self):
     """Expect None when the function is given an empty string."""
     from invenio.legacy.refextract.find import get_reference_section_beginning
     self.assertEqual(get_reference_section_beginning(""), None)
コード例 #10
0
 def test_no_section(self):
     """Check that an empty-string input yields no section description."""
     from invenio.legacy.refextract.find import get_reference_section_beginning
     empty_input = ""
     outcome = get_reference_section_beginning(empty_input)
     self.assertEqual(outcome, None)