Beispiel #1
0
def extract_pdf(path, languages=None):
    """
    Convert a PDF to XML and extract the content of every page.

    The file is rendered to XML by the configured `pdftohtml` binary inside
    a temporary directory. Each `page` element directly below the document
    root is then handed to `extract_page` together with `languages` (per the
    original description, that step runs OCR on images within the file).

    Returns a dict with a single `pages` key holding the per-page results in
    document order. Raises `IngestorException` when the converter leaves no
    output file behind. The temporary directory is always removed.
    """
    work_dir = make_tempdir()
    try:
        xml_path = os.path.join(work_dir, 'pdf.xml')
        log.info("Converting PDF to XML: %r...", path)
        command = [
            get_config('PDFTOHTML_BIN'),
            '-xml', '-hidden', '-q', '-nodrm',
            path, xml_path,
        ]
        subprocess.call(command)

        # The exit status is not inspected; a missing output file is the
        # only failure signal used here.
        if not os.path.exists(xml_path):
            raise IngestorException("Could not convert PDF to XML: %s" % path)

        with open(xml_path, 'r') as fh:
            markup = string_value(fh.read())

        # Drop the encoding declaration before parsing (presumably because
        # lxml rejects already-decoded text that still carries one).
        markup = markup.replace('encoding="UTF-8"', '')
        lenient_parser = etree.XMLParser(recover=True, remove_comments=True)
        doc = etree.fromstring(markup, parser=lenient_parser)
        log.debug("Parsed XML: %r", path)

        return {
            'pages': [
                extract_page(path, work_dir, page_el, languages)
                for page_el in doc.findall('./page')
            ]
        }
    finally:
        remove_tempdir(work_dir)
Beispiel #2
0
 def ingest(self, meta, local_path):
     """
     Unpack an Outlook PST file and ingest every file it produces.

     The binary named by the `READPST_BIN` environment variable (default
     `readpst`) is run with a temporary directory as its `-o` target; see
     the readpst manual for the other flags. Its exit status is ignored.

     For each file found below the output directory a child of `meta` is
     created. Every path component of the file's directory, relative to
     the output directory, is added to the child as a keyword. The child's
     foreign id is the parent's foreign id joined with that relative
     directory and the file name. The file is then passed to `ingest_file`
     with `move=True`. The temporary directory is always removed.
     """
     work_dir = make_tempdir()
     try:
         readpst = os.environ.get('READPST_BIN', 'readpst')
         command = [
             readpst,
             '-D', '-e', '-8', '-b',
             '-o', work_dir,
             local_path,
         ]
         log.debug('Converting Outlook PST file: %r', ' '.join(command))
         subprocess.call(command)

         for root, _subdirs, names in os.walk(work_dir):
             rel_root = os.path.relpath(string_value(root),
                                        string_value(work_dir))
             for name in names:
                 name = string_value(name)
                 child = meta.make_child()

                 # One keyword per folder level of the unpacked mailbox.
                 for keyword in rel_root.split(os.path.sep):
                     child.add_keyword(keyword)

                 foreign_id = os.path.join(
                     string_value(meta.foreign_id),
                     string_value(rel_root),
                     name,
                 )
                 child.foreign_id = string_value(foreign_id)

                 extracted_path = os.path.join(string_value(root), name)
                 ingest_file(self.collection_id, child, extracted_path,
                             move=True)
     finally:
         remove_tempdir(work_dir)
Beispiel #3
0
 def ingest(self, meta, local_path):
     """
     Unpack a zipped bundle and ingest the resulting directory.

     All members of the archive at `local_path` are extracted into a
     temporary directory, which is then passed to `self.ingest_directory`
     along with `meta`. The temporary directory is removed afterwards,
     whether or not extraction or ingestion succeeded.
     """
     unpack_dir = make_tempdir()
     try:
         log.info("Unpacking bundle: %r", meta.file_name)
         with ZipFile(local_path, 'r') as archive:
             archive.extractall(unpack_dir)
         # The archive is closed before its contents are ingested.
         self.ingest_directory(meta, unpack_dir)
     finally:
         remove_tempdir(unpack_dir)
Beispiel #4
0
 def ingest(self, meta, local_path):
     """
     Unpack a package into a temporary directory and ingest its contents.

     The process temporarily changes into the directory containing
     `local_path` and calls `self.unpack` to extract the package. The
     extracted tree is then passed to `ingest_directory` with
     `meta.foreign_id` as the base path and `move=True`.

     `rarfile.NeedFirstVolume` is swallowed on purpose, so a file that is
     not the first volume of a multi-part archive is skipped. The previous
     working directory is always restored and the temporary directory is
     removed if it was created, even when set-up or clean-up fails.
     """
     prev_cwd = os.getcwd()
     temp_dir = None
     try:
         # Work-around: try to unpack multi-part files by changing into
         # the directory containing the file.
         parent_dir = os.path.dirname(local_path)
         # A bare file name has an empty dirname, and os.chdir('') raises;
         # in that case the file already lives in the current directory.
         if parent_dir:
             os.chdir(parent_dir)
         temp_dir = make_tempdir(meta.file_name)
         log.info("Descending into package: %r", meta.file_name)
         self.unpack(meta, local_path, temp_dir)
         ingest_directory(self.collection_id, meta, temp_dir,
                          base_path=meta.foreign_id, move=True)
     except rarfile.NeedFirstVolume:
         # Deliberate best-effort: later volumes cannot be unpacked alone.
         log.debug("Not the first volume, skipping: %r", meta.file_name)
     finally:
         try:
             if temp_dir is not None:
                 remove_tempdir(temp_dir)
         finally:
             # Restore the cwd even if removing the directory failed.
             os.chdir(prev_cwd)
Beispiel #5
0
 def ingest(self, meta, local_path):
     """
     Unpack an Outlook PST file and ingest every file it produces.

     The binary named by the `READPST_BIN` environment variable (default
     `readpst`) is run with a temporary directory as its `-o` target. Its
     exit status is ignored.

     For each file found below the output directory a child of `meta` is
     created. Every path component of the file's directory, relative to
     the output directory, is added to the child as a keyword. The child's
     foreign id is the parent's foreign id joined with that relative
     directory and the file name. The file is then passed to `ingest_file`
     with `move=True`. The temporary directory is always removed.
     """
     work_dir = make_tempdir()
     try:
         bin_path = os.environ.get('READPST_BIN', 'readpst')
         args = [bin_path, '-D', '-e', '-o', work_dir, local_path]
         log.debug('Converting Outlook PST file: %r', ' '.join(args))
         subprocess.call(args)
         for (dirpath, dirnames, filenames) in os.walk(work_dir):
             reldir = os.path.relpath(dirpath, work_dir)
             # The keywords depend only on the directory, so split once.
             keywords = reldir.split(os.path.sep)
             for filename in filenames:
                 child = meta.make_child()
                 for kw in keywords:
                     child.add_keyword(kw)
                 child.foreign_id = os.path.join(meta.foreign_id, reldir,
                                                 filename)
                 # Pass the per-file child rather than the parent `meta`;
                 # otherwise the keywords and foreign id set above would
                 # be discarded.
                 ingest_file(self.source_id, child,
                             os.path.join(dirpath, filename), move=True)
     finally:
         remove_tempdir(work_dir)
Beispiel #6
0
 def ingest(self, meta, local_path):
     """
     Unpack a package into a temporary directory and ingest its contents.

     The process temporarily changes into the directory containing
     `local_path` and calls `self.unpack` to extract the package. The
     extracted tree is then passed to `ingest_directory` with
     `meta.foreign_id` as the base path and `move=True`.

     `rarfile.NeedFirstVolume` is swallowed on purpose, so a file that is
     not the first volume of a multi-part archive is skipped. The previous
     working directory is restored on every exit path, including failures
     while creating or removing the temporary directory.
     """
     prev_cwd = os.getcwd()
     try:
         # Work-around: try to unpack multi-part files by changing into
         # the directory containing the file.
         source_dir = os.path.dirname(local_path)
         # os.chdir('') raises, and a bare file name is already relative
         # to the current directory, so only change when there is a
         # directory component.
         if source_dir:
             os.chdir(source_dir)

         temp_dir = make_tempdir(meta.file_name)
         try:
             log.info("Descending into package: %r", meta.file_name)
             self.unpack(meta, local_path, temp_dir)
             ingest_directory(self.collection_id,
                              meta,
                              temp_dir,
                              base_path=meta.foreign_id,
                              move=True)
         except rarfile.NeedFirstVolume:
             # Deliberate best-effort: later volumes cannot be unpacked
             # on their own, so they are skipped.
             log.debug("Not the first volume, skipping: %r",
                       meta.file_name)
         finally:
             remove_tempdir(temp_dir)
     finally:
         os.chdir(prev_cwd)
Beispiel #7
0
 def emit_bundle(self, meta, directory, files):
     """
     Pack a set of files into one uncompressed zip bundle and ingest it.

     A child of `meta` describes the bundle. Its MIME type is
     `self.BUNDLE_MIME` and its file name is the parent's file name with
     `self.BUNDLE_EXTENSION` appended after a dot. If the parent has a
     foreign id, the child's `source_path` is that id joined with the
     bundle extension.

     Each name in `files` is read from `directory` and stored in the
     archive under that same name. The archive is built in a temporary
     directory, passed to `ingest_file` with `move=True`, and the
     temporary directory is always removed afterwards.
     """
     extension = self.BUNDLE_EXTENSION
     bundle = meta.make_child()
     if meta.foreign_id:
         bundle.source_path = os.path.join(meta.foreign_id, extension)
     bundle.mime_type = self.BUNDLE_MIME
     bundle.file_name = '%s.%s' % (meta.file_name, extension)
     log.info("Creating bundle: %r", bundle.file_name)

     staging_dir = make_tempdir()
     try:
         archive_path = os.path.join(staging_dir, bundle.file_name)
         # ZIP_STORED: members are written without compression.
         with ZipFile(archive_path, 'w', ZIP_STORED) as archive:
             for member in files:
                 archive.write(os.path.join(directory, member), member)
         ingest_file(self.collection_id, bundle, archive_path, move=True)
     finally:
         remove_tempdir(staging_dir)