def test_text_content():
    """util.text_content should strip markup and rejoin hyphenated words."""
    testdir = os.path.abspath(os.path.dirname(__file__))
    xmlpath = os.path.join(testdir, 'testdocs', 'carnap-short.xml')
    extracted = util.text_content(xmlpath)
    # no leftover xml/html tags:
    assert '<' not in extracted
    # hyphenation at line breaks is undone ('sys-tems' -> 'systems'):
    assert 'sys-' not in extracted
    assert 'systems' in extracted
def process_file(doc, keep_tempfiles=False):
    """Convert document to pdf, then xml, then extract text and metadata.

    Mutates *doc* in place (tempfile, numpages, xmlfile, content, numwords,
    ocr, doctype) and returns 0 on success.

    Raises Exception if any conversion or parsing stage fails.
    """
    if doc.filetype != 'pdf':
        # convert to pdf
        try:
            doc.tempfile = convert_to_pdf(doc.tempfile)
        except Exception as e:
            # chain the cause so the original conversion error isn't lost
            raise Exception("pdf conversion failed") from e
    # get pdf info:
    try:
        pdfmeta = pdfinfo(doc.tempfile)
        doc.numpages = int(pdfmeta['Pages'])
    except Exception as e:
        raise Exception('pdfinfo failed') from e
    debug(2, 'pdf has %s pages', doc.numpages)
    # convert to xml:
    # maxsplit=1 so filenames containing dots (e.g. 'a.b.pdf') keep their
    # full stem; rsplit('.')[0] would truncate it to 'a'
    doc.xmlfile = doc.tempfile.rsplit('.', 1)[0] + '.xml'
    if doc.numpages > 10:
        # ocr only first 3 + last 3 pages if necessary:
        ocr_ranges = [(1, 3), (doc.numpages-2, doc.numpages)]
    else:
        ocr_ranges = None
    try:
        engine = pdf2xml(doc.tempfile, doc.xmlfile,
                         keep_tempfiles=keep_tempfiles,
                         ocr_ranges=ocr_ranges)
    except Exception as e:
        debug(1, "converting pdf to xml failed: %s", e)
        raise Exception('pdf conversion failed') from e
    # read some basic metadata from xml file:
    doc.content = util.text_content(doc.xmlfile)
    debug(5, "text content:\n%s", doc.content)
    if engine == 'pdftohtml':
        doc.numwords = len(doc.content.split())
    else:
        doc.ocr = True
        if doc.numpages > 10:
            # extrapolate numwords from numpages and the number of words
            # on the ocr'ed pages
            # NOTE(review): only 6 pages are ocr'ed above but the divisor
            # is 10, and true division makes numwords a float — confirm
            # intended scale and whether callers expect an int
            doc.numwords = len(doc.content.split()) * doc.numpages/10
        else:
            doc.numwords = len(doc.content.split())
    # guess doc type (paper, book, review, etc.):
    from .doctyper import doctyper
    doc.doctype = doctyper.evaluate(doc)
    # extract metadata:
    from .docparser import paperparser
    if not paperparser.parse(doc, keep_tempfiles=keep_tempfiles):
        raise Exception('parser error')
    return 0