def extract(self, input_file):
    """Read a file and return its whole text as a ``Document``.

    ``input_file`` is first passed through ``self._check_input_file`` and
    the value that call returns is the path that actually gets opened.

    Returns a new ``Document`` whose ``content`` attribute holds everything
    read from the file.
    """
    input_file = self._check_input_file(input_file)
    document = Document()
    # The context manager closes the handle even when read() raises, which
    # the previous explicit open()/close() pair did not guarantee.
    with open(input_file) as source:
        document.content = source.read()
    return document
def extract(self, input_file):
    """Run the PDF extraction tool on a file and build a ``Document``.

    ``input_file`` is first passed through ``self._check_input_file``. The
    tool named by ``self._pdf_extraction_tool`` is then executed with its
    standard output captured; that output is parsed with ``BeautifulSoup``
    and handed to ``self._extract_metadata`` and ``self._extract_content``
    to fill in the returned ``Document``.

    Raises ``ExtractionError`` when the tool cannot be started or when it
    produces no output.
    """
    input_file = self._check_input_file(input_file)
    # Extraction command and its options. They may be parametrized in the
    # future.
    # NOTE(review): the flags look like pdftotext's (pages 1-2, ASCII7
    # encoding, HTML output with metadata, written to stdout) -- confirm
    # against the configured tool.
    command = [self._pdf_extraction_tool, '-q', '-f', '1', '-l', '2',
               '-enc', 'ASCII7', '-htmlmeta', input_file, '-']
    try:
        pop = subprocess.Popen(command, stdout=subprocess.PIPE)
    except OSError:
        # Popen signals a launch failure with OSError. Previously this was
        # only logged, and the code then failed on the unbound ``pop``.
        log.error('PDF extraction tool not found') #@UndefinedVariable
        raise ExtractionError('PDF extraction tool not found')
    stdout = pop.communicate()[0]
    # Popen never raises CalledProcessError, so the exit status has to be
    # inspected explicitly once the process has finished.
    if pop.returncode:
        log.error('Error executing PDF text extraction tool. Return code: ' #@UndefinedVariable
                  + repr(pop.returncode))
    if not stdout:
        raise ExtractionError('Corrupted file')
    parser = BeautifulSoup(stdout)
    document = Document()
    self._extract_metadata(parser, document)
    self._extract_content(parser, document)
    return document
def setUp(self):
    """Create a new ``Document`` and store it on ``self.document``."""
    fresh_document = Document()
    self.document = fresh_document