コード例 #1
0
ファイル: test_util.py プロジェクト: wo/opp-tools
def test_text_content():
    """Check util.text_content() on the 'carnap-short.xml' test document.

    The extracted text must contain no '<' and no 'sys-', and must contain
    'systems' (presumably verifying that markup is stripped and that a
    hyphenated line break is rejoined -- confirm against the fixture).
    """
    here = os.path.dirname(__file__)
    xml_path = os.path.join(os.path.abspath(here), 'testdocs', 'carnap-short.xml')
    text = util.text_content(xml_path)
    for unwanted in ('<', 'sys-'):
        assert unwanted not in text
    assert 'systems' in text
コード例 #2
0
def process_file(doc, keep_tempfiles=False):
    """Convert a document to pdf, then to xml, then extract its metadata.

    Reads ``doc.filetype`` and ``doc.tempfile``. Sets ``doc.tempfile`` (only
    when a pdf conversion was needed), ``doc.numpages``, ``doc.xmlfile``,
    ``doc.content``, ``doc.numwords``, ``doc.doctype`` and -- when the xml
    was not produced by the 'pdftohtml' engine -- ``doc.ocr``.

    Args:
        doc: the document object to process; it is modified in place.
        keep_tempfiles: passed through unchanged to ``pdf2xml`` and
            ``paperparser.parse``.

    Raises:
        Exception: with message 'pdf conversion failed' if converting to pdf
            or to xml fails, 'pdfinfo failed' if the page count cannot be
            determined, or 'parser error' if ``paperparser.parse`` returns a
            false value.
    """
    # Local import, matching the function-scope imports used further down.
    import os.path

    if doc.filetype != 'pdf':
        # convert to pdf
        try:
            doc.tempfile = convert_to_pdf(doc.tempfile)
        except Exception as e:
            # 'except Exception' (not a bare except) so that
            # KeyboardInterrupt/SystemExit are not turned into failures.
            debug(1, "converting to pdf failed: %s", e)
            raise Exception("pdf conversion failed")

    # get pdf info:
    try:
        pdfmeta = pdfinfo(doc.tempfile)
        doc.numpages = int(pdfmeta['Pages'])
    except Exception as e:
        debug(1, "pdfinfo failed: %s", e)
        raise Exception('pdfinfo failed')
    debug(2, 'pdf has %s pages', doc.numpages)

    # convert to xml:
    # Strip only the final extension; splitting on every '.' would truncate
    # paths that contain a dot in a directory name or in the file stem.
    doc.xmlfile = os.path.splitext(doc.tempfile)[0] + '.xml'
    if doc.numpages > 10:
        # ocr only the first 3 + last 3 pages if necessary:
        ocr_ranges = [(1, 3), (doc.numpages - 2, doc.numpages)]
    else:
        ocr_ranges = None
    try:
        engine = pdf2xml(doc.tempfile, doc.xmlfile,
                         keep_tempfiles=keep_tempfiles,
                         ocr_ranges=ocr_ranges)
    except Exception as e:
        debug(1, "converting pdf to xml failed: %s", e)
        raise Exception('pdf conversion failed')

    # read some basic metadata from xml file:
    doc.content = util.text_content(doc.xmlfile)
    debug(5, "text content:\n%s", doc.content)
    word_count = len(doc.content.split())
    if engine == 'pdftohtml':
        doc.numwords = word_count
    else:
        doc.ocr = True
        if ocr_ranges:
            # Only some pages were ocr'ed: extrapolate numwords from
            # numpages and the number of words on the ocr'ed pages.
            # Ranges are treated as inclusive (first, last) page numbers
            # -- NOTE(review): confirm against pdf2xml.
            ocr_pages = sum(last - first + 1 for first, last in ocr_ranges)
            # Integer division keeps the word count an int.
            doc.numwords = word_count * doc.numpages // ocr_pages
        else:
            doc.numwords = word_count

    # guess doc type (paper, book, review, etc.):
    from .doctyper import doctyper
    doc.doctype = doctyper.evaluate(doc)

    # extract metadata:
    from .docparser import paperparser
    if not paperparser.parse(doc, keep_tempfiles=keep_tempfiles):
        raise Exception('parser error')