def extract_references_from_file_xml(path, recid=1):
    """Extract the references contained in a local pdf file.

    ``path`` is the location of the file; ``recid`` is forwarded to
    ``parse_references``.
    FullTextNotAvailable is raised if ``path`` is not an existing file.
    The result is given in marcxml.
    """
    if not os.path.isfile(path):
        raise FullTextNotAvailable()

    def _reference_lines(**conversion_options):
        # Convert the document to plain text, then keep only the
        # reference lines (the other returned values are not needed).
        body, _ = get_plaintext_document_body(path, **conversion_options)
        lines, _, _ = extract_references_from_fulltext(body)
        return lines

    ref_lines = _reference_lines()
    if len(ref_lines) == 0:
        # Nothing found with the default conversion: retry once with the
        # layout-preserving conversion.
        ref_lines = _reference_lines(keep_layout=True)

    return parse_references(ref_lines, recid=recid)
Exemple #2
0
def extract_references_from_file_xml(path, recid=1):
    """Return the references of a local pdf file, in marcxml.

    ``path`` points to the file to process and ``recid`` is passed on to
    ``parse_references``.
    Raises FullTextNotAvailable when ``path`` is not an existing file.
    """
    file_exists = os.path.isfile(path)
    if not file_exists:
        raise FullTextNotAvailable()

    # First pass: default plain-text conversion.
    plain_text, _ = get_plaintext_document_body(path)
    ref_lines, _, _ = extract_references_from_fulltext(plain_text)

    if len(ref_lines) == 0:
        # Second pass: same extraction, but on text converted with
        # keep_layout=True.
        plain_text, _ = get_plaintext_document_body(path, keep_layout=True)
        ref_lines, _, _ = extract_references_from_fulltext(plain_text)

    return parse_references(ref_lines, recid=recid)
Exemple #3
0
def extract_references_from_file(path, recid=None):
    """Extract the references contained in a local pdf file.

    ``path`` is the location of the file; ``recid`` is forwarded to
    ``parse_references``.
    FullTextNotAvailable is raised if ``path`` is not an existing file.
    The result is given as a bibrecord class; the base name of ``path``
    is added as subfield 'v' of its first '999C6' field.
    """
    if not os.path.isfile(path):
        raise FullTextNotAvailable()

    # First pass: default plain-text conversion.
    plain_text, _ = get_plaintext_document_body(path)
    ref_lines, _, _ = extract_references_from_fulltext(plain_text)

    if len(ref_lines) == 0:
        # No reference lines found: retry on a layout-preserving conversion.
        plain_text, _ = get_plaintext_document_body(path, keep_layout=True)
        ref_lines, _, _ = extract_references_from_fulltext(plain_text)

    record = parse_references(ref_lines, recid=recid)

    # Record which file the references were extracted from.
    first_field = record['999C6'][0]
    first_field.add_subfield('v', os.path.basename(path))
    return record
def extract_references_from_string_xml(source, is_only_references=True):
    """Extract references from a string.

    ``source`` is the document text; it is split into lines on "\n".
    ``is_only_references`` selects how the reference lines are obtained:

    * false: the lines are handed to ``extract_references_from_fulltext``;
    * true (default): the reference section is looked up with
      ``get_reference_section_beginning``; when that finds nothing, the
      numeration is searched in the whole body and the section is taken
      to span every line of the document.

    The result is given in marcxml.
    """
    docbody = source.split("\n")
    if not is_only_references:
        reflines, _, _ = extract_references_from_fulltext(docbody)
    else:
        refs_info = get_reference_section_beginning(docbody)
        if not refs_info:
            refs_info, _ = find_numeration_in_body(docbody)
            # No section heading was found: treat the whole document as the
            # reference section.
            refs_info["start_line"] = 0
            # Store a plain integer index; a stray trailing comma used to
            # turn this value into a one-element tuple.
            refs_info["end_line"] = len(docbody) - 1

        reflines = rebuild_reference_lines(docbody, refs_info["marker_pattern"])
    return parse_references(reflines)
Exemple #5
0
def extract_references_from_string_xml(source, is_only_references=True):
    """Extract references from a string.

    ``source`` is the document text; it is split into lines on '\n'.
    When ``is_only_references`` is false, the lines are passed to
    ``extract_references_from_fulltext``. Otherwise (the default) the
    reference section is located with ``get_reference_section_beginning``,
    falling back to ``find_numeration_in_body`` with the section spanning
    the whole document, and the reference lines are rebuilt from the
    detected marker pattern.

    The result is given in marcxml.
    """
    docbody = source.split('\n')
    if not is_only_references:
        reflines, _, _ = extract_references_from_fulltext(docbody)
    else:
        refs_info = get_reference_section_beginning(docbody)
        if not refs_info:
            refs_info, _ = find_numeration_in_body(docbody)
            # Fallback: the section covers the document from first to last
            # line.
            refs_info['start_line'] = 0
            # Plain integer index (previously a trailing comma made this a
            # one-element tuple).
            refs_info['end_line'] = len(docbody) - 1

        reflines = rebuild_reference_lines(docbody, refs_info['marker_pattern'])
    return parse_references(reflines)