Beispiel #1
0
def compare(file1, file2, **kwargs):
    """Render two PDFs to text and return a diff of the results.

    Args:
        file1: path to the first PDF.
        file2: path to the second PDF.
        **kwargs: forwarded to ``high_level.extract_text_to_fp``; also
            understands the individual LAParams options, ``outfile``
            (its extension selects the diff format) and
            ``context_lines`` (unified-diff context size).

    Returns:
        An HTML diff string when ``outfile`` ends in .htm/.html,
        otherwise a unified-diff line generator.
    """
    # If no ready-made LAParams object was passed, build one and populate
    # it from any individually supplied LAParams keyword arguments.
    if kwargs.get('laparams', None) is None:
        laparams = layout.LAParams()
        for param in ("all_texts", "detect_vertical", "word_margin",
                      "char_margin", "line_margin", "boxes_flow"):
            paramv = kwargs.get(param, None)
            if paramv is not None:
                # LAParams does not support item assignment
                # (laparams[param] = ... raises TypeError); set the
                # attribute instead.
                setattr(laparams, param, paramv)
        kwargs['laparams'] = laparams

    s1 = io.StringIO()
    with open(file1, "rb") as fp:
        high_level.extract_text_to_fp(fp, s1, **kwargs)

    s2 = io.StringIO()
    with open(file2, "rb") as fp:
        high_level.extract_text_to_fp(fp, s2, **kwargs)

    import difflib
    s1.seek(0)
    s2.seek(0)
    s1, s2 = s1.readlines(), s2.readlines()

    import os.path
    try:
        extension = os.path.splitext(kwargs['outfile'])[1][1:4]
        if extension.lower() == 'htm':
            return difflib.HtmlDiff().make_file(s1, s2)
    except KeyError:
        pass
    # Fall back to difflib's default of three context lines when the
    # caller did not specify 'context_lines' (previously a KeyError).
    return difflib.unified_diff(s1, s2, n=kwargs.get('context_lines', 3))
Beispiel #2
0
def pdf_to_text(pdf):
    """Return extracted text from PDF.

    Warning: This function can be slow... up to 300ms per page
    This function does not perform optical character recognition.

    Args:
        pdf: bytestring of PDF file
    Returns:
        str of text extracted from `pdf` contents.
    """
    # Make input and output buffers (Python 2 StringIO API).
    in_buffer = StringIO.StringIO(pdf)
    out_buffer = StringIO.StringIO()

    # Configure the pdf parser / document pair (legacy pdfminer API:
    # parser and document must reference each other before use).
    parser = pdfparser.PDFParser(in_buffer)
    doc = pdfparser.PDFDocument()
    parser.set_document(doc)
    doc.set_parser(parser)
    doc.initialize(password='')
    rsrcmgr = pdfinterp.PDFResourceManager()
    laparams = layout.LAParams()

    # Convert pdf to text.
    device = converter.TextConverter(rsrcmgr,
                                     outfp=out_buffer,
                                     codec='utf-8',
                                     laparams=laparams)
    try:
        interpreter = pdfinterp.PDFPageInterpreter(rsrcmgr, device)
        for page in doc.get_pages():
            interpreter.process_page(page)
        # Capture the text before the buffers are released below.
        text = out_buffer.getvalue()
    finally:
        # Release the converter and both buffers; the original leaked them.
        device.close()
        in_buffer.close()
        out_buffer.close()
    return text
Beispiel #3
0
def reorder_objects(objects, method='LRTB'):
    """Reorder objects according to their bounding boxes.

    Parameters
    ----------
    objects : list
        A list of any type of objects defined in `pdfminer.layout`.
    method : str, optional
        Method for reordering objects.
        - 'LRTB': reorder objects from top-left to bottom-right.
        - 'TBRL': reorder objects from top-right to bottom-left.

    Returns
    -------
    objects : list
        Ordered objects (the input list, sorted in place).

    See also: https://github.com/pdfminer/pdfminer.six/blob/7254530/pdfminer/layout.py#L573-L590
    """
    if method not in ['LRTB', 'TBRL']:
        raise ValueError(f'`method` should be one of {["LRTB", "TBRL"]}')

    # pdfminer's default boxes_flow weighs x- against y-coordinates.
    flow = pmla.LAParams().boxes_flow
    if method == 'LRTB':
        def sort_key(obj):
            return (1 - flow) * obj.x0 - (1 + flow) * (obj.y0 + obj.y1)
    else:
        def sort_key(obj):
            return -(1 + flow) * (obj.x0 + obj.x1) - (1 - flow) * obj.y1

    objects.sort(key=sort_key)
    return objects
Beispiel #4
0
    def _parse_pdf(self, stream):
        """Parse a single PDF and return a dict of field name -> value.

        Args:
            stream: open binary file object for the PDF; its ``name``
                attribute is used for logging.

        Returns:
            dict mapping each field's name to its (possibly empty) value.
        """
        LOG.info("Parsing accident report data from %s" % stream.name)
        fields = self._get_fields()

        def _field_values():
            # Snapshot the current value of every field.
            return {f.name: f.value for f in fields}

        try:
            # so much pdfminer boilerplate....
            document = pdfdocument.PDFDocument(pdfparser.PDFParser(stream))
            rsrcmgr = pdfinterp.PDFResourceManager()
            device = pdfconverter.PDFPageAggregator(
                rsrcmgr, laparams=pdflayout.LAParams())
            interpreter = pdfinterp.PDFPageInterpreter(rsrcmgr, device)
        except psparser.PSException as err:
            # logging's warn() is a deprecated alias of warning().
            LOG.warning("Parsing %s failed, skipping: %s" % (stream.name, err))
            return _field_values()

        page_num = 1

        for page in pdfpage.PDFPage.create_pages(document):
            LOG.debug("Parsing page %s" % page_num)

            interpreter.process_page(page)
            layout = device.get_result()

            for field in fields:
                field.update(layout, page=page_num)
                # Give up early when a short-circuit field is still empty
                # past its maximum page.
                if (not field.value and field.short_circuit
                        and page_num >= field.maxpage):
                    LOG.warning("No %s found in %s, aborting parsing" %
                                (field.name, stream.name))
                    return _field_values()

            page_num += 1
        return _field_values()
Beispiel #5
0
def check_first_page_is_cover(pdf: bytes) -> bool:
    """Return True if the first page of *pdf* looks like a cover page.

    A first page with at most 100 whitespace-separated tokens is treated
    as a cover.
    """
    # extract_text_to_fp expects a binary file-like object, not raw
    # bytes, so wrap the input in a BytesIO first.
    with io.BytesIO(pdf) as pdf_stream, io.StringIO() as test_string:
        params = layout.LAParams(line_margin=2)
        extract_text_to_fp(pdf_stream, test_string, page_numbers=[0],
                           laparams=params)
        first_page = test_string.getvalue()
        return len(first_page.split()) <= 100
Beispiel #6
0
def extract(pdf_path):
    """Extract the text of *pdf_path* and look up a matching parser.

    Returns (text, None) when no known needle occurs in the text,
    otherwise (parser, pdf_path, text).
    NOTE(review): the two branches return tuples of different arity;
    callers must handle both shapes.
    """
    text = pdf.extract_text(pdf_path,
                            laparams=pdflayout.LAParams(char_margin=1000.0))
    matching_needles = [needle for needle in parsers.keys() if needle in text]
    if not matching_needles:
        print("Pdf", pdf_path, "ignored, not parser for it.")
        return text, None
    chosen_parser = parsers[matching_needles[0]]
    print("Loaded", pdf_path, "as", chosen_parser.__module__.split(".")[-1])
    return chosen_parser, pdf_path, text
Beispiel #7
0
def converte_pdf(nome_arquivo, semaforo=None, profundidade=None, pagina=None):
    """Convert a PDF file to cleaned plain text via an HTML rendering.

    Python 2 code (uses ``file()`` and ``cStringIO``).

    Args:
        nome_arquivo: path of the PDF file to convert.
        semaforo, profundidade: accepted but unused here.
        pagina: 1-based page number; when given, only a two-page window
            around it is processed (see NOTE below).

    Returns:
        (tipo, texto) where tipo is one of 'pdf', 'pdf_parte',
        'pdf_longo' or a '*_defeito' variant, and texto is the cleaned
        text or None on failure.
    """
    # Very large word_margin keeps words from being split apart.
    parametros = pdf_layout.LAParams(word_margin=100)
    gerenciador = pdf_interp.PDFResourceManager()
    str_saida = cStringIO.StringIO()

    arquivo_pdf = file(nome_arquivo, 'rb')

    dispositivo = pdf_converter.HTMLConverter(gerenciador, str_saida, \
     codec='utf-8', laparams=parametros)
    interpretador = pdf_interp.PDFPageInterpreter(gerenciador, dispositivo)

    if pagina == None:
        tipo_aux = 'pdf'
        # NOTE(review): slicing with pg_fim = -1 below drops the LAST
        # page of the document — confirm this is intentional.
        pg_inicio, pg_fim = 0, -1
    else:
        tipo_aux = 'pdf_parte'
        # Window covers pages (pagina - 1) and pagina, 0-based.
        pg_inicio, pg_fim = pagina - 1, pagina + 1

    tipo = tipo_aux

    try:
        # Legacy pdfminer API: parser and document reference each other.
        parser = pdf_parser.PDFParser(arquivo_pdf)
        documento = pdf_parser.PDFDocument()
        parser.set_document(documento)
        documento.set_parser(parser)

        paginas = [p for p in documento.get_pages()]

        # Refuse documents beyond the configured page limit.
        if len(paginas) > MAX_PAGINAS_PDF:
            return 'pdf_longo', None

        for pagina_atual in paginas[pg_inicio:pg_fim]:
            try:
                interpretador.process_page(pagina_atual)
            except Exception:
                # A broken page marks the result as defective but does
                # not abort the remaining pages.
                tipo = tipo_aux + '_defeito'

        dados_html = str_saida.getvalue()
    except (AssertionError, pdf_parser.PDFSyntaxError):
        # Unparseable document: report the type with no text.
        return tipo, None
    finally:
        arquivo_pdf.close()
        dispositivo.close()
        str_saida.close()

    # Strip HTML down to text, then normalize whitespace artifacts:
    # spaces wrongly inserted before 'ç', "Page N" headers with page
    # numbers, runs of spaces/tabs and blank lines.
    removido_tags = _converte_html(dados_html, 'div')
    removido_espacos_desnecessarios = re.sub("\ +ç", "ç", removido_tags, \
     flags=re.IGNORECASE)
    sem_numero_pagina = re.sub("\nPage\ [0-9]+\ *\n[0-9]+\ *\n", "\n", \
     removido_espacos_desnecessarios)
    texto_final = re.sub('(\ *\n)+', '\n', re.sub('[\ \t]+', r' ', \
     sem_numero_pagina))

    return tipo, texto_final
Beispiel #8
0
def get_ltpages(infile, caching=True):
    """Aggregate and return the layout (LTPage) object of every page in
    *infile*, with vertical-text detection disabled."""
    manager = pdfinterp.PDFResourceManager(caching=caching)
    aggregator = converter.PDFPageAggregator(
        manager, laparams=layout.LAParams(detect_vertical=False))
    engine = pdfinterp.PDFPageInterpreter(manager, aggregator)
    results = []
    for page in PDFPage.get_pages(infile, caching=caching):
        engine.process_page(page)
        results.append(aggregator.get_result())
    aggregator.close()
    return results
Beispiel #9
0
    def _parse_pages(self, document):
        """Return the info extracted for the PDF BORME pages."""
        resource_manager = pdfinterp.PDFResourceManager()
        # char_margin is a proportion of the character size, not an
        # absolute length: text chunks closer than char_margin are
        # considered continuous and grouped into one. word_margin governs
        # where blank characters (spaces) are inserted, since a blank
        # between words may be expressed only by positioning. line_margin
        # groups nearby lines into a text box — a rectangular cluster of
        # text portions. char_margin=14.0 was chosen empirically
        # (8.0 was tried earlier; 6.0 captured all but one).
        params = layout.LAParams(char_margin=14.0)
        device = converter.PDFPageAggregator(resource_manager, laparams=params)
        interpreter = pdfinterp.PDFPageInterpreter(resource_manager, device)

        # First pass: collect the raw acts of every page.
        raw_pages = []
        for page_no, page in enumerate(
                pdfpage.PDFPage.create_pages(document), 1):
            interpreter.process_page(page)
            page_layout = device.get_result()
            raw_acts = self._parse_raw_acts(page_layout, page_no == 1)
            raw_pages.append(raw_acts)
            self._log.debug(
                "Page number: %i Acts: %i" % (page_no, len(raw_acts)))

        # Second pass: parse each page's acts with access to all
        # following pages and to the last act of the previous page.
        pages = []
        for position, raw_acts in enumerate(raw_pages):
            next_pages = raw_pages[position + 1:]
            last_page_act = None
            if pages and pages[-1]:
                last_page_act = pages[-1][-1]
            pages.append(self._parse_acts(raw_acts, next_pages, last_page_act))
        return pages
Beispiel #10
0
def pdf_miner(from_file, to_txt):
    """Extract text from *from_file* into *to_txt* using pdfminer.

    Best-effort: any extraction failure is logged and swallowed so the
    caller can try another backend. Files and the converter are always
    closed, even on failure (the original leaked them on error).
    """
    log.debug('trying with pdfminer')
    pdf = codecs.open(from_file.path, mode='rb')
    output = codecs.open(to_txt.path, mode='wb')
    device = None
    try:
        resourceman = pdfinterp.PDFResourceManager()
        device = converter.TextConverter(
            resourceman, output, laparams=layout.LAParams())
        interpreter = pdfinterp.PDFPageInterpreter(resourceman, device)
        for page in pdfpage.PDFPage.get_pages(pdf):
            interpreter.process_page(page)
    # Broad catch is deliberate: this is a best-effort fallback.
    # ('except Exception, e' was Python-2-only syntax; 'as' works in both.)
    except Exception as e:
        log.critical(e)
        return
    finally:
        if device is not None:
            device.close()
        output.close()
        pdf.close()
Beispiel #11
0
 def get_layouts(self):
     """
     Yield (page index, layout object) pairs parsed from the raw PDF file.
     :return: (page idx, layout obj) generator
     """
     with open(self.file_path, 'rb') as fp:
         parser = PDFParser(fp)
         # PDFDocument stores the document structure.
         document = PDFDocument(parser, password='')
         if not document.is_extractable:
             raise PDFTextExtractionNotAllowed
         # Resource manager caches shared resources such as fonts.
         resources = PDFResourceManager()
         aggregator = PDFPageAggregator(resources, laparams=LT.LAParams())
         engine = PDFPageInterpreter(resources, aggregator)
         for page_index, page in enumerate(PDFPage.create_pages(document)):
             engine.process_page(page)
             yield page_index, aggregator.get_result()
Beispiel #12
0
def create_html(pdf_file_path, newfile, password):
    """Render *pdf_file_path* as HTML into the file *newfile*.

    Args:
        pdf_file_path: path of the source PDF.
        newfile: path of the HTML file to (over)write.
        password: password forwarded to doc_parser for decryption.
    """
    doc = doc_parser(pdf_file_path, password)
    # Manager to store shared resources such as fonts and images.
    rsrcmgr = PDFResourceManager()
    # Layout-analysis params; the analyzer returns an LTPage tree with
    # child objects such as textboxes, figures, curves and text lines.
    laparams = Layout.LAParams(all_texts=True, detect_vertical=True)
    with open(newfile, 'w+') as f:
        # PDFDevice that translates page content into HTML.
        device = HTMLConverter(rsrcmgr,
                               f,
                               laparams=laparams,
                               layoutmode='loose',
                               showpageno=False,
                               rect_colors={'curve': None})
        try:
            # Processes page contents, rendering instructions for device.
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            for page in doc.get_pages():
                interpreter.process_page(page)
        finally:
            # Close the converter so trailing HTML is flushed before the
            # file is closed (the original never closed it).
            device.close()
Beispiel #13
0
    def scan(self, file_object, options):
        """Scan a PDF: inventory its objects, flag risky keys, extract
        stream objects as child files and (optionally) the document text.

        Args:
            file_object: the file under scan; flags are appended to it
                and extracted children to ``self.children``.
            options: scanner options; honors 'extract_text' (default
                False) and 'limit' (max extracted objects, default 2000).
        """
        extract_text = options.get("extract_text", False)
        file_limit = options.get("limit", 2000)

        self.metadata["total"] = {"objects": 0, "extracted": 0}
        extracted_objects = set()

        try:
            with io.BytesIO(file_object.data) as pdf_object:
                parsed_pdf = pdfparser.PDFParser(pdf_object)
                pdf_document = pdfdocument.PDFDocument(parsed_pdf)

                self.metadata.setdefault("annotatedUris", [])
                # Walk every object referenced by every xref table.
                for xref in pdf_document.xrefs:
                    for object_id in xref.get_objids():
                        self.metadata["total"]["objects"] += 1

                        try:
                            object = pdf_document.getobj(object_id)
                            if isinstance(object, dict):
                                for (key, value) in object.items():
                                    # Automatic-action and JavaScript keys
                                    # are common malicious-PDF indicators.
                                    if key in ["AA", "OpenAction"]:
                                        file_object.flags.append(f"{self.scanner_name}::auto_action")
                                    if key in ["JS", "Javascript"]:
                                        file_object.flags.append(f"{self.scanner_name}::javascript_embedded")

                                    try:
                                        # "A" entries hold annotation
                                        # actions; record URI targets once.
                                        if key == "A":
                                            uri = value.get("URI")
                                            if (uri is not None and
                                                uri not in self.metadata["annotatedUris"]):
                                                    self.metadata["annotatedUris"].append(uri)

                                    except AttributeError:
                                        # value is not dict-like (no .get).
                                        pass

                            # Past the limit: keep counting objects but
                            # stop extracting.
                            if self.metadata["total"]["extracted"] >= file_limit:
                                continue
                            if isinstance(object, pdftypes.PDFStream):
                                try:
                                    child_filename = f"{self.scanner_name}::object_{object_id}"
                                    child_fo = objects.StrelkaFile(data=object.get_data(),
                                                                   filename=child_filename,
                                                                   depth=file_object.depth + 1,
                                                                   parent_uid=file_object.uid,
                                                                   root_uid=file_object.root_uid,
                                                                   parent_hash=file_object.hash,
                                                                   root_hash=file_object.root_hash,
                                                                   source=self.scanner_name)
                                    if object_id not in extracted_objects:
                                        self.children.append(child_fo)
                                        extracted_objects.add(object_id)
                                        self.metadata["total"]["extracted"] += 1

                                except TypeError:
                                    file_object.flags.append(f"{self.scanner_name}::type_error_{object_id}")
                                except struct.error:
                                    file_object.flags.append(f"{self.scanner_name}::struct_error_{object_id}")

                        # Per-object parse failures become flags so the
                        # remaining objects are still scanned.
                        except ValueError:
                            file_object.flags.append(f"{self.scanner_name}::value_error_{object_id}")
                        except pdftypes.PDFObjectNotFound:
                            file_object.flags.append(f"{self.scanner_name}::object_not_found_{object_id}")
                        except pdftypes.PDFNotImplementedError:
                            file_object.flags.append(f"{self.scanner_name}::not_implemented_error_{object_id}")
                        except pdftypes.PSSyntaxError:
                            file_object.flags.append(f"{self.scanner_name}::ps_syntax_error_{object_id}")

                if extract_text:
                    # Render the whole document to text and attach it as
                    # a child file.
                    rsrcmgr = pdfinterp.PDFResourceManager(caching=True)
                    retstr = io.StringIO()
                    la_params = layout.LAParams(detect_vertical=True,
                                                char_margin=1.0,
                                                line_margin=0.3,
                                                word_margin=0.3)
                    device = converter.TextConverter(rsrcmgr, retstr,
                                                     codec="utf-8",
                                                     laparams=la_params)
                    interpreter = pdfinterp.PDFPageInterpreter(rsrcmgr, device)
                    for page in pdfpage.PDFPage.get_pages(pdf_object, set()):
                        try:
                            interpreter.process_page(page)

                        except struct.error:
                            file_object.flags.append(f"{self.scanner_name}::text_struct_error")

                    pdf_object_text = retstr.getvalue()
                    child_filename = f"{self.scanner_name}::text"
                    child_fo = objects.StrelkaFile(data=pdf_object_text,
                                                   filename=child_filename,
                                                   depth=file_object.depth + 1,
                                                   parent_uid=file_object.uid,
                                                   root_uid=file_object.root_uid,
                                                   parent_hash=file_object.hash,
                                                   root_hash=file_object.root_hash,
                                                   source=self.scanner_name)
                    self.children.append(child_fo)
                    file_object.flags.append(f"{self.scanner_name}::extracted_text")
                    device.close()
                    retstr.close()

        # Document-level parser failures map to flags instead of raising.
        except IndexError:
            file_object.flags.append(f"{self.scanner_name}::index_error")
        except pdfdocument.PDFEncryptionError:
            file_object.flags.append(f"{self.scanner_name}::encrypted_pdf")
        except pdfparser.PDFSyntaxError:
            file_object.flags.append(f"{self.scanner_name}::pdf_syntax_error")
        except psparser.PSEOF:
            file_object.flags.append(f"{self.scanner_name}::ps_eof")
        except psparser.PSSyntaxError:
            file_object.flags.append(f"{self.scanner_name}::ps_syntax_error")
Beispiel #14
0
 def read_file(self):
     """Extract the text of the PDF at self.path and store it on
     self.roster_as_string as a list of lines."""
     text_sink = io.StringIO()
     with open(self.path, "rb") as pdffile:
         high_level.extract_text_to_fp(pdffile, text_sink,
                                       laparams=layout.LAParams())
     self.roster_as_string = text_sink.getvalue().split('\n')
def scrape_pdf(path_to_pdf):
    """Convert PDF to text and scrape data from text.

    Args:
        path_to_pdf (str): path to folder, containing PDFs.

    Returns:
        List of lists. Each nested list contains class objects as
        elements.
    """

    # making parameters for PDFminer for this specific PDFs
    la_params = layout.LAParams(
        line_overlap=0.4,
        char_margin=3.0,
        line_margin=1.0,
        word_margin=0.15,
        boxes_flow=0.3,
        detect_vertical=False,
        all_texts=False,
    )
    # defining fields expected in each country PDF
    fields = [
        "Chief of State",
        "Head of Government",
        "Government Type",
        "Capital",
        "Legislature",
        "Judiciary",
        "Ambassador to US",
        "US Ambassador",
        "Area",
        "Climate",
        "Natural Resources",
        "Economic Overview",
        "GDP (Purchasing Power Parity)",
        "GDP per capita (Purchasing Power Parity)",
        "Exports",
        "Imports",
        "Population",
        "Population Growth",
        "Ethnicity",
        "Language",
        "Religion",
        "Urbanization",
        "Literacy",
    ]

    # accumulators for the seven result categories, one per PDF field group
    country_general = []
    country_natural_resources = []
    country_language = []
    country_religion = []
    country_ethnicity = []
    country_import_partners = []
    country_export_partners = []

    for name in os.listdir(path_to_pdf):

        temp_general = []
        filepath = os.path.join(path_to_pdf, name)

        # scraping unformatted text using pdfminer.six
        text = high_level.extract_text(filepath, laparams=la_params)

        # here we extract country name (the first line) from text
        text = text.split("\n", 1)
        country_id = text.pop(0)
        # known OCR/extraction glitch in the source PDFs
        if country_id == "SAO TOMEAND PRINCIPE":
            country_id = "SAO TOME AND PRINCIPE"

        text = text[0].split()

        # searching for the date of last update for PDF and removing it;
        # the date follows the first "as of" token pair.
        # NOTE(review): raises IndexError below if no "as of" is found.
        last_update_index = [
            (i, i + 2) for i in range(len(text))
            if text[i: i + 2] == ["as", "of"]
        ]

        last_update = " ".join(
            text[last_update_index[0][1]: last_update_index[0][1] + 2]
        )
        last_update = last_update.strip()
        del text[last_update_index[0][0]: last_update_index[0][0] + 4]

        temp_general.append(last_update)

        # We index text with get_index function
        index_dict = create_index(fields, text, country_id)

        # we work with fieldnames in reverse order, since we need to parse
        # text from end to beginning
        for field_name in fields[::-1]:

            field_func = format_field_data(field_name)
            # this handles some exceptions for Sudan and Chad, where
            # fields Chief of State and Head of Government are joined
            if index_dict[field_name] == [] and field_name == "Chief of State":
                field_data = " ".join(text[5:])

            # some countries don't have some fields, like Literacy
            # so with such countries we set value for this fields to None
            elif index_dict[field_name] == []:
                field_data = None

            # This is main part, that works with most of the text.
            # It starts with the end of text and finds the last field
            # based on index that we got from get_index function

            else:
                start_field = index_dict[field_name][0]
                start_content = index_dict[field_name][1]

                # consume the tail of the token list for this field,
                # then truncate it so the next (earlier) field is last
                field_data = " ".join(text[start_content:])
                del text[start_field:]

            if field_data in ("NA", "N/A"):
                field_data = None

            # fields whose formatter takes (country, data, field_name)
            three_arg_fields = ["Imports", "Exports",
                                "Religion", "Ethnicity", "Language"]
            # fields whose formatter takes (country, data)
            two_arg_fields = ["Population", "Area", "Natural Resources"]

            if field_name in three_arg_fields:
                temp = field_func(country_id, field_data, field_name)

                # Imports/Exports return (general values, partner rows);
                # the others return rows only.
                if field_name == "Imports":
                    temp_general.extend(temp[0])
                    temp_list = [sc.CountryImportPartners(
                        *item) for item in temp[1]]
                    country_import_partners.extend(temp_list)

                elif field_name == "Exports":
                    temp_general.extend(temp[0])
                    temp_list = [sc.CountryExportPartners(
                        *item) for item in temp[1]]
                    country_export_partners.extend(temp_list)

                elif field_name == "Ethnicity":
                    temp_list = [sc.CountryEthnicity(*item) for item in temp]
                    country_ethnicity.extend(temp_list)

                elif field_name == "Religion":
                    temp_list = [sc.CountryReligion(*item) for item in temp]
                    country_religion.extend(temp_list)

                elif field_name == 'Language':
                    temp_list = [sc.CountryLanguage(*item) for item in temp]
                    country_language.extend(temp_list)

            elif field_name in two_arg_fields:
                temp = field_func(country_id, field_data)

                if field_name == "Natural Resources":
                    temp_list = [sc.CountryNaturalResources(
                        *item) for item in temp]
                    country_natural_resources.extend(temp_list)

                else:
                    temp_general.extend(temp)

            else:
                temp = field_func(field_data)
                temp_general.extend(temp)

        # fields were collected in reverse order; restore it and prepend
        # the country id before building the record
        temp_general.append(country_id)
        temp_general = sc.CountryGeneral(*temp_general[::-1])
        country_general.append(temp_general)

    print("Finished scraping PDF")
    return [country_general,
            country_natural_resources,
            country_export_partners,
            country_import_partners,
            country_ethnicity,
            country_language,
            country_religion]
Beispiel #16
0
    def scan(self, data, file, options, expire_at):
        """Scan a PDF: inventory objects, flag risky keys, extract stream
        objects into the cache and (optionally) the document text.

        Args:
            data: raw PDF bytes.
            file: the file being scanned (kept for the scanner interface).
            options: honors 'extract_text' (default False) and 'limit'
                (max extracted objects, default 2000).
            expire_at: cache expiry forwarded to upload_to_cache.
        """
        extract_text = options.get('extract_text', False)
        file_limit = options.get('limit', 2000)

        self.event['total'] = {'objects': 0, 'extracted': 0}
        extracted_objects = set()

        try:
            with io.BytesIO(data) as pdf_io:
                parsed = pdfparser.PDFParser(pdf_io)
                pdf = pdfdocument.PDFDocument(parsed)

                self.event.setdefault('annotated_uris', [])
                # Walk every object referenced by every xref table.
                for xref in pdf.xrefs:
                    for object_id in xref.get_objids():
                        self.event['total']['objects'] += 1

                        try:
                            object = pdf.getobj(object_id)
                            if isinstance(object, dict):
                                for (key, value) in object.items():
                                    # Automatic-action and JavaScript keys
                                    # are common malicious-PDF indicators.
                                    if key in ['AA', 'OpenAction']:
                                        self.flags.append('auto_action')
                                    if key in ['JS', 'Javascript']:
                                        self.flags.append(
                                            'javascript_embedded')

                                    try:
                                        # "A" entries hold annotation
                                        # actions; record URI targets once.
                                        # Guard against None so missing
                                        # URIs are never recorded.
                                        if key == 'A':
                                            uri = value.get('URI')
                                            if (uri is not None
                                                    and uri not in self.event[
                                                        'annotated_uris']):
                                                self.event[
                                                    'annotated_uris'].append(
                                                        uri)

                                    except AttributeError:
                                        # value is not dict-like (no .get).
                                        pass

                            # Past the limit: keep counting objects but
                            # stop extracting.
                            if self.event['total']['extracted'] >= file_limit:
                                continue
                            if isinstance(object, pdftypes.PDFStream):
                                try:
                                    if object_id not in extracted_objects:
                                        extract_file = strelka.File(
                                            name=f'object_{object_id}',
                                            source=self.name,
                                        )

                                        for c in strelka.chunk_string(
                                                object.get_data()):
                                            self.upload_to_cache(
                                                extract_file.pointer,
                                                c,
                                                expire_at,
                                            )

                                        self.files.append(extract_file)
                                        self.event['total']['extracted'] += 1
                                        extracted_objects.add(object_id)

                                # These flag strings were missing the
                                # f-prefix and recorded the literal text
                                # '{object_id}'.
                                except TypeError:
                                    self.flags.append(
                                        f'type_error_{object_id}')
                                except struct.error:
                                    self.flags.append(
                                        f'struct_error_{object_id}')

                        # Per-object parse failures become flags so the
                        # remaining objects are still scanned.
                        except ValueError:
                            self.flags.append(f'value_error_{object_id}')
                        except pdftypes.PDFObjectNotFound:
                            self.flags.append(f'object_not_found_{object_id}')
                        except pdftypes.PDFNotImplementedError:
                            self.flags.append(
                                f'not_implemented_error_{object_id}')
                        except psparser.PSSyntaxError:
                            self.flags.append(f'ps_syntax_error_{object_id}')

                if extract_text:
                    # Render the whole document to text and attach it as
                    # an extracted file.
                    rsrcmgr = pdfinterp.PDFResourceManager(caching=True)
                    retstr = io.StringIO()
                    la_params = layout.LAParams(
                        detect_vertical=True,
                        char_margin=1.0,
                        line_margin=0.3,
                        word_margin=0.3,
                    )
                    device = converter.TextConverter(
                        rsrcmgr,
                        retstr,
                        codec='utf-8',
                        laparams=la_params,
                    )
                    interpreter = pdfinterp.PDFPageInterpreter(rsrcmgr, device)
                    # PDFPage.get_pages needs a seekable file object, not
                    # raw bytes — read from the BytesIO wrapper.
                    for page in pdfpage.PDFPage.get_pages(pdf_io, set()):
                        try:
                            interpreter.process_page(page)

                        except struct.error:
                            self.flags.append('text_struct_error')

                    extract_file = strelka.File(
                        name='text',
                        source=self.name,
                    )
                    for c in strelka.chunk_string(retstr.getvalue()):
                        self.upload_to_cache(
                            extract_file.pointer,
                            c,
                            expire_at,
                        )
                    self.files.append(extract_file)

                    self.flags.append('extracted_text')
                    device.close()
                    retstr.close()

        # Document-level parser failures map to flags instead of raising.
        except IndexError:
            self.flags.append('index_error')
        except pdfdocument.PDFEncryptionError:
            self.flags.append('encrypted_pdf')
        except pdfparser.PDFSyntaxError:
            self.flags.append('pdf_syntax_error')
        except psparser.PSEOF:
            self.flags.append('ps_eof')
        except psparser.PSSyntaxError:
            self.flags.append('ps_syntax_error')