def leer_pdf_slate(ubicacion_archivo, password=None):
    """
    Load a PDF file with the slate3k library and extract the text of its pages.

    :param ubicacion_archivo: (str). Location of the PDF file to read.
    :param password: (str). Default: None. Optional parameter used to read
        PDF files that are protected by a password.
    :return: (list). List of strings containing the text extracted from
        each page of the PDF.
    """
    import logging

    import slate3k as slate

    # Hide the warnings emitted while parsing by raising the threshold of the
    # root logger. Note that this affects the whole process, not only slate.
    # (`propagate` is a per-Logger attribute, so nothing is assigned on the
    # logging module itself.)
    logging.getLogger().setLevel(logging.ERROR)

    # Open the file and extract the text of its pages.
    with open(ubicacion_archivo, "rb") as f:
        if password is not None:
            paginas = slate.PDF(f, password)
        else:
            paginas = slate.PDF(f)

    # Return the extracted text.
    return paginas
def process_resume_list():
    """Read the numbered resume PDFs, clean their text and build the labels.

    For each resume number in 1..97 the PDF is parsed with ``slate.PDF``,
    its string form is passed through ``remove_new_lines``, the
    ``escape_char`` pattern is replaced by spaces and punctuation is removed
    with ``remove_punctuation_marks``. The cleaned string is appended to the
    module-level ``resume_list``.

    Afterwards 36 labels of ``1`` followed by 61 labels of ``0`` are added to
    the module-level ``labelList`` and the resulting array is printed.
    """
    for resumeNo in range(1, 98):
        resume = 'C:/Users/Muskaan Ratra/Desktop/CVs/CVs/c' + str(resumeNo) + '.pdf'
        # Use a context manager so the handle is released even if parsing fails.
        with open(resume, 'rb') as resumeFile:
            resumePdf = slate.PDF(resumeFile)

        # Remove new lines
        removeNewLines = remove_new_lines(str(resumePdf))
        # Remove escape chars
        escapeCharsString = re.sub(escape_char, " ", removeNewLines)
        # Remove punctuation marks
        finalString = remove_punctuation_marks(escapeCharsString)
        resume_list.append(finalString)

    # Build the labels: 36 ones followed by 61 zeros (97 in total, one per
    # resume read above).
    labelList.extend([1] * 36)
    labelList.extend([0] * 61)
    print(np.array(labelList))
def top_words(file_path, num):
    """Return the most frequent words found in a PDF file.

    :param file_path: location of the PDF file to read.
    :param num: how many of the most common words to return.
    :return: list of ``(word, count)`` tuples as produced by
        ``collections.Counter.most_common(num)``.
    """
    # The context manager guarantees the file handle is closed.
    with open(file_path, 'rb') as pdf_file:
        pdf = sk.PDF(pdf_file)
        text = pdf.text()

    # Raw string: "\w" in a normal literal is an invalid escape sequence.
    list_words = re.findall(r"\w+", text)
    return Counter(list_words).most_common(num)
def parsePdf(file_url):
    """Download a PDF and return its extracted text and page count.

    The response body is streamed into ``temp.pdf`` in the current working
    directory, which is then parsed with ``slate.PDF`` (text) and
    ``PyPDF2.PdfFileReader`` (page count).

    :param file_url: URL of the PDF to download.
    :return: tuple ``(extracted_text, numPage)``. When anything fails
        (download, write or parsing) the fallback ``("No text", 0)`` is
        returned instead of raising.
    """
    try:
        r = requests.get(file_url, stream=True)
        with open("temp.pdf", "wb") as pdf:
            for chunk in r.iter_content(chunk_size=1024):
                # Write one chunk at a time to the pdf file.
                if chunk:
                    pdf.write(chunk)

        # Both readers share the same handle; the context manager closes it
        # on success and on failure, so no handle is left open and no
        # unbound name is touched when the download itself failed.
        with open("temp.pdf", 'rb') as pdfFileObj:
            extracted_text = slate.PDF(pdfFileObj)
            pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
            numPage = pdfReader.numPages
    except Exception:
        # Best effort: treat any failure as "not a readable PDF".
        extracted_text = "No text"
        numPage = 0

    return extracted_text, numPage
def read_pdf(filepath):
    """Return the string form of a parsed PDF with escaped whitespace removed.

    The PDF is parsed with ``slate.PDF`` and converted with ``str()``. In that
    string the two-character sequences backslash-n, backslash-t and
    backslash-r are each replaced by a single space.

    :param filepath: location of the PDF file.
    :return: the cleaned string.
    """
    with open(filepath, 'rb') as pdf_file:
        rendered = str(slate.PDF(pdf_file))

    # These are literal backslash sequences, not real control characters.
    for escaped in ('\\n', '\\t', '\\r'):
        rendered = rendered.replace(escaped, ' ')
    return rendered
def select_doc(usr_msg):
    """Return the parsed syllabus PDF that matches a user command.

    :param usr_msg: command text; one of ``/CSE``, ``/IT``, ``/ECE`` or
        ``/EEE`` (case-sensitive).
    :return: the ``slate.PDF`` object for the matching file, or ``None`` when
        the command is not recognised.
    """
    pdf_by_command = {
        "/CSE": '(1)CSE_only_theory.pdf',
        "/IT": '(1)IT_only_theory.pdf',
        "/ECE": '(1)ECE_only_theory.pdf',
        "/EEE": '(1)EEE_only_theory.pdf',
    }

    # Guard against unhashable/non-string input so unknown values simply
    # yield None, as the equality comparisons did before.
    pdf_name = pdf_by_command.get(usr_msg) if isinstance(usr_msg, str) else None
    if pdf_name is None:
        return None

    # The context manager closes the handle once the PDF has been parsed.
    with open(pdf_name, 'rb') as fp:
        return slate.PDF(fp)
def _text_to_series_of_pages(self, pdf_name: str) -> Tuple[pd.Series, int]:
    """
    Scrape a pdf and convert it into a pd.Series of cleaned page texts.

    :param pdf_name: full name of pdf (including .pdf extension) to be scraped
        and converted into a pd.Series
    :return: document_series: a pd.Series where each row contains the text of
                one pdf page (as returned by ``self._clean_text``).
             num_pages: int, the number of pages of the input pdf file
    """
    assert pdf_name.endswith(
        '.pdf'
    ), 'Input file is not in .pdf format. The file cannot be processed.'

    if not self.from_s3_bucket:
        pdf = open(os.path.join(self.pdf_folder, pdf_name), 'rb')
    else:
        pdf = s3fs.S3FileSystem().open(
            pdf_name, 'rb'
        )  # no need to join with self.pdf_folder as s3fs includes that

    # try/finally so the handle is closed even when parsing or cleaning fails.
    try:
        pdf_reader = slate3k.PDF(pdf)
        num_pages = len(pdf_reader)
        page_series_list = []
        for i, page in enumerate(pdf_reader):
            logger.debug(f'Reading page {i+1} of PDF file {pdf_name}')
            page_series_list.append(pd.Series(self._clean_text(page)))
    finally:
        pdf.close()

    # Series.append was removed in pandas 2.0 and copied the data on every
    # call; a single concat builds the same result in one pass.
    if page_series_list:
        document_series = pd.concat(page_series_list, ignore_index=True)
    else:
        # pd.concat refuses an empty list; return an empty series instead.
        document_series = pd.Series(dtype=object)

    return document_series, num_pages
def readPdf(filename):
    """Return the text of all pages of a PDF concatenated into one string.

    :param filename: location of the PDF file.
    :return: the page texts yielded by ``slate.PDF`` joined without separator.
    """
    with open(filename, 'rb') as pdf_file:
        pages = slate.PDF(pdf_file)
    return ''.join(pages)
def extractText(self) -> List[str]:
    """Extract and normalise the text of every page of ``self.fileName``.

    The PDF is opened with ``self.password`` when ``self.isPasswordEnabled``
    is truthy. ``self.totalPages`` is set to the number of parsed pages.

    :return: one string per page, each passed through
        ``self.replaceNewLineWithEmptySpace`` and then
        ``self.replaceSlashWithOr``.
    :raises Exception: any error raised while reading or converting the file
        is reported on stderr and then re-raised unchanged.
    """
    try:
        with open(self.fileName, 'rb') as f:
            if self.isPasswordEnabled:
                document = slate.PDF(f, self.password)
            else:
                document = slate.PDF(f)

        self.totalPages = len(document)
        return [
            self.replaceSlashWithOr(self.replaceNewLineWithEmptySpace(page))
            for page in document
        ]
    except Exception:
        # sys.exc_info() accepts no arguments; passing the message to it
        # raised a TypeError that hid the real failure. Report the details
        # and let the original exception propagate instead.
        exc_type, exc_value, exc_tb = sys.exc_info()
        print(
            "Unexpected Error: {}, {}, line: {}".format(
                exc_type, exc_value, exc_tb.tb_lineno),
            file=sys.stderr)
        raise
def extractFile(filePath, start, end, s, e):
    """Parse a PDF once and pass the result to the table and paragraph readers.

    :param filePath: location of the PDF file.
    :param start: forwarded to ``readTable`` together with ``end``.
    :param end: forwarded to ``readTable`` together with ``start``.
    :param s: forwarded to ``readParagraph`` together with ``e``.
    :param e: forwarded to ``readParagraph`` together with ``s``.
    """
    with open(filePath, 'rb') as pdf_handle:
        parsed_pdf = slate.PDF(pdf_handle)

    readTable(parsed_pdf, start, end)
    readParagraph(parsed_pdf, s, e)
def get_pdf_text(path):
    """Return the text of a PDF as one string.

    Every page has its non-breaking spaces (``\\xa0``) replaced by regular
    spaces and is stripped of surrounding whitespace before the pages are
    concatenated without a separator.

    :param path: location of the PDF file.
    :return: the concatenated, cleaned page texts.
    """
    with open(path, 'rb') as pdf_file:
        pages = slate.PDF(pdf_file)
    return "".join(page.replace("\xa0", " ").strip() for page in pages)
def extractText(file):
    """Return the concatenated text of all pages of a PDF.

    NOTE(review): the ``file`` argument is currently not used; the path is
    taken from the module-level name ``pdfFileName``. This looks unintended
    but is kept as-is because callers may rely on it -- confirm and switch to
    ``file`` if appropriate.

    :param file: unused (see note above).
    :return: the page texts yielded by ``slate.PDF`` joined without separator.
    """
    # Context manager so the handle is closed once the PDF has been parsed.
    with open(pdfFileName, "rb") as pdfFileObj:
        pdfPages = slate.PDF(pdfFileObj)

    # Extract text from PDF file
    return "".join(pdfPages)
def extract_text(file):
    """Return the concatenated text of all pages of a PDF.

    :param file: location of the PDF file.
    :return: the page texts yielded by ``slate.PDF`` joined without separator.
    """
    # Context manager so the handle is closed once the PDF has been parsed.
    with open(file, 'rb') as f:
        pdf = slate.PDF(f)
    return ''.join(pdf)
def extract_text(self, pdf_filename: str) -> str:
    """Extract text from a pdf using slate3k, pages separated by a blank line."""
    with open(pdf_filename, 'rb') as pdf_file:
        pages = slate3k.PDF(pdf_file)
    return "\n\n".join(pages)
def pdfextract(self,file):
    """Return the lower-cased text of every page of a PDF.

    :param file: location of the PDF file.
    :return: list with one lower-cased string per page yielded by
        ``slate.PDF``.
    """
    with open(file, 'rb') as pdf_file:
        pages = slate.PDF(pdf_file)
    return [page.lower() for page in pages]
def pdf2txt(path, logger):
    """Convert a PDF to text and save it under ``OUT_DIR``.

    The pages yielded by ``slate3k.PDF`` are joined with newlines and handed
    to ``save`` together with the output path ``OUT_DIR/<name>.txt``.

    :param path: location of the PDF file (string or ``Path``).
    :param logger: logger used to report files that could not be read.
    :return: None. Failures while opening/parsing are logged, not raised.
    """
    path = Path(path)
    try:
        with open(path, 'rb') as f:
            doc = slate3k.PDF(f)
    except Exception as e:
        logger.info(f'{path}: + {str(e)}')
    else:
        text = '\n'.join(doc)
        # Swap only the extension; a plain str.replace('pdf', 'txt') would
        # also rewrite "pdf" occurring elsewhere in the file name.
        save(text, OUT_DIR / path.with_suffix('.txt').name)
def make_prediction(resumeNo):
    """Predict the label of one resume with the persisted model.

    The resume file is ``c<resumeNo + 1>.pdf`` in the CVs folder. Only the
    first page returned by ``slate.PDF`` is vectorised and classified.

    :param resumeNo: zero-based resume index (1 is added to build the name).
    :return: the first element of ``loaded_model.predict(...)``.
    """
    resume = 'C:/Users/Muskaan Ratra/Desktop/CVs/CVs/c' + str(resumeNo+1) + '.pdf'

    # NOTE(review): pickle.load can execute arbitrary code; this is only safe
    # as long as save_model / save_vector point to trusted files.
    with open(save_model, 'rb') as model_file:
        loaded_model = pickle.load(model_file)
    with open(save_vector, 'rb') as vector_file:
        loaded_vector = pickle.load(vector_file)

    # Context manager so the resume handle is closed after parsing.
    with open(resume, 'rb') as resumeFile:
        sample_resume = slate.PDF(resumeFile)

    first_page = sample_resume[0]
    features = loaded_vector.transform([first_page])
    return loaded_model.predict(features)[0]
def slate_parser(pdf_path):
    '''
    Extract the text of the first page of a pdf document using slate3k
    :param pdf_path: Path to pdf location in the project
    :return: String form of the first element (index 0) of the parsed pdf;
             later pages are not included
    '''
    # Read the content while the file is open
    with open(pdf_path, 'rb') as pdf_file:
        pages = slate.PDF(pdf_file)

    first_page = pages[0]
    return str(first_page)
def readPdf(filename):
    """Print the base64-decoded content of the first page of a PDF.

    The first page text is stripped, padded with ``=`` to a multiple of four
    characters, base64-decoded and printed as text. This is best effort:
    a PDF without pages, invalid base64 or undecodable bytes are ignored.

    :param filename: location of the PDF file.
    :return: None.
    """
    with open(filename, 'rb') as f:
        doc = slate.PDF(f)

    try:
        b64Pdf = doc[0].strip()
        # Restore the padding that base64 requires (0-3 '=' characters).
        b64Pdf += '=' * (-len(b64Pdf) % 4)
        print(base64.b64decode(b64Pdf).decode())
    except (IndexError, ValueError):
        # IndexError: no pages. ValueError covers binascii.Error (invalid
        # base64) and UnicodeDecodeError (payload is not text).
        pass
def from_pdf(self, file):
    """Append the English paragraphs of a PDF to ``self.paragraphs``.

    Each page yielded by ``slate3k.PDF`` (parsed with ``word_margin=0``) is
    split on the separator ``'. \\n'``; every piece for which ``detect``
    returns ``'en'`` is wrapped in ``Text`` and appended.

    This is best effort: any error while reading or processing stops the
    import silently, keeping the paragraphs collected so far.

    :param file: location of the PDF file.
    :return: None.
    """
    try:
        with open(file, 'rb') as fi:
            doc = slate3k.PDF(fi, word_margin=0)

        for page_text in doc:
            for line in page_text.split('. \n'):
                if detect(line) == 'en':
                    self.paragraphs.append(Text(line))
    except Exception:
        # Deliberately ignored (best effort). Unlike a bare `except`, this
        # still lets KeyboardInterrupt / SystemExit through.
        pass
def pdf_extractor(path, vectors=False):
    """Extract the author and the per-page text (optionally vectors) of a PDF.

    :param path: location of the PDF file.
    :param vectors: when true, ``vectorizer(page, lang=detect(page))`` is
        computed for every page and the resulting mapping is returned too.
    :return: ``(Classified, creator, paragraph_repo)``, with ``vector``
        appended when ``vectors`` is true. ``Classified`` is always ``"No"``
        in the current code; ``creator`` is the ``/Author`` document info or
        ``"Unknown"``; the mappings are keyed by the page number as string.
    """
    # Read the author while the file is open; the context manager closes the
    # handle afterwards.
    with open(path, 'rb') as file_object:
        pdf_file_reader = PyPDF2.PdfFileReader(file_object)
        try:
            creator = pdf_file_reader.getDocumentInfo()["/Author"]
        except Exception:
            # Missing document info or missing /Author entry.
            creator = "Unknown"

    current_page_number = 1
    paragraph_repo = {}
    vector = {}
    Classified = "No"

    # Reliably retrieve text from pdf
    with open(path, 'rb') as f:
        doc = slate.PDF(f)

    # Loop in all the pdf pages.
    for page in doc:
        page_key = str(current_page_number)
        page_vector = None
        if vectors:
            page_vector = vectorizer(page, lang=detect(page))
        paragraph_repo[page_key] = page
        vector[page_key] = page_vector

        if not paragraph_repo[page_key]:
            # If no text could be extracted, fall back to the OCR library to
            # read the scanned pdf file.
            try:
                paragraph_repo[page_key] = fix_text(
                    textract.process(path, method='tesseract', encoding='utf-8'))
            except TimeoutError:
                # NOTE(review): the page counter is not advanced here, so the
                # next page reuses (overwrites) this key -- confirm intended.
                continue
        current_page_number += 1

    if vectors:
        return Classified, creator, paragraph_repo, vector
    else:
        return Classified, creator, paragraph_repo
def addFromPatt(path, NOP, pdfWriter):
    """Add one bookmark per page to ``pdfWriter`` based on extracted text.

    The bookmark title of page ``i`` is ``extOp(i, text, offset)`` followed by
    ``' - Sid <i + 1>'``. ``offset`` is read from the second line of the
    module-level ``instruction_file``.

    :param path: location of the PDF file whose text is parsed.
    :param NOP: number of pages to create bookmarks for.
    :param pdfWriter: writer object receiving the bookmarks; its page mode is
        set to ``/UseOutlines`` at the end.
    """
    with open(path, 'rb') as f:
        text = slate.PDF(f)

    # Context manager so the instruction file handle is closed after reading.
    with open(instruction_file, 'rb') as pattern_file:
        pattern = pattern_file.readlines()
    offset = int(pattern[1])

    for i in range(NOP):
        bookmark = extOp(i, text, offset) + ' - Sid ' + str(i + 1)
        pdfWriter.addBookmark(bookmark, i, None)

    # show bookmarks on open
    pdfWriter.setPageMode("/UseOutlines")
def loadPDF(path: str):
    """Load a given .pdf file.

    Arguments:
        path {str} -- Path to .pdf file

    Returns:
        List -- List of pages as strings
    """
    with open(path, 'rb') as pdf_file:
        return slate.PDF(pdf_file)
def search_wordclouds():
    """Return the word-cloud images of the uploaded files containing a word.

    The JSON request body must contain the key ``word``. Every regular file
    in the upload folder is read (``.txt`` as text, anything else parsed with
    ``slate.PDF``); when the word occurs in its text, the name
    ``<part of the file name before the first dot>.jpeg`` is reported.

    :return: JSON response holding the list of matching ``.jpeg`` names.
    """
    word = request.get_json()['word']
    # Expand the folder once and use the same path for listing and opening.
    upload_dir = os.path.expanduser(app.config['UPLOAD_FOLDER'])

    result_list = []
    for item in os.listdir(upload_dir):
        item_path = os.path.join(upload_dir, item)
        # Check the entry inside the upload folder, not relative to the cwd.
        if os.path.isdir(item_path):
            continue

        # splitext copes with names without a dot or with several dots.
        if os.path.splitext(item)[1] == ".txt":
            with open(item_path) as text_file:
                text = text_file.read()
        else:
            with open(item_path, 'rb') as f:
                text = "".join(slate.PDF(f))

        if word in text:
            result_list.append(item.split(".")[0] + ".jpeg")

    # `{result_list}` would build a set containing a list, which is
    # unhashable and raises TypeError; serialise the list itself.
    return jsonify(result_list)
def read_from_pdf(name_pdf_file, name_of_out_file="text"):
    """Convert ``PDF/<name>.pdf`` to plain text and store it as a .txt file.

    The page texts yielded by ``slate3k.PDF`` are concatenated, form-feed
    characters (``\\x0c``) are removed and the result is written as UTF-8 to
    ``PLAIN TEXT/<name_of_out_file>.txt``.

    :param name_pdf_file: PDF file name without folder and extension.
    :param name_of_out_file: output file name without folder and extension.
    :return: the extracted text.
    """
    source_path = f"PDF/{name_pdf_file}.pdf"
    target_path = f"PLAIN TEXT/{name_of_out_file}.txt"

    with open(source_path, mode="rb") as pdf_file:
        pages = slate3k.PDF(pdf_file)

    text = "".join(pages).replace("\x0c", "")

    with open(target_path, mode="w", encoding="utf-8") as text_file:
        text_file.write(text)
    return text
def TextIt2(self):
    """Convert every file in ``./Brothers`` to a text file in ``./TextFiles``.

    Both folders are resolved against the current working directory and
    ``self.SafeMake`` is called with the output folder first. For each input
    file the pages yielded by ``slate.PDF`` are cleaned (runs of newlines
    collapsed, non-ASCII characters removed); pages longer than six
    characters are written without their last character.
    """
    Brothers = os.path.join(os.getcwd(), "Brothers")
    TextFiles = os.path.join(os.getcwd(), "TextFiles")
    self.SafeMake(TextFiles)

    for file in os.listdir(Brothers):
        # Swap only the extension; str.replace("pdf", "txt") would also
        # rewrite "pdf" occurring elsewhere in the file name.
        out_name = os.path.splitext(file)[0] + ".txt"

        # Context managers close both files even when parsing raises.
        with open(os.path.join(TextFiles, out_name), "w+",
                  encoding="utf-8") as oFile, \
                open(os.path.join(Brothers, file), "rb") as iFile:
            doc = slate.PDF(iFile)
            for page in doc:
                page = re.sub(r'\n+ ', '\n', page)
                page = re.sub(r'\n+', '\n', page)
                page = re.sub(r'[^\x00-\x7F]+', '', page)
                if len(page) > 6:
                    oFile.write(page[:-1])
    return
def parse(self):
    """
    Parse the PDF file at ``self.path`` and fill the text attributes.

    The parsed PDF is passed through ``self.modify`` and then cleaned with
    ``TextPreprocessor().text_cleaning`` three times:
    ``self.summary`` (pattern ``'.*'``, also appended to ``self.corpus``),
    ``self.experience`` (pattern ``'Experience(.*?)Education'``) and
    ``self.name`` (pattern ``'(.*)Summary'``).
    :return:
    """
    # Hide the warnings emitted while parsing by raising the threshold of the
    # root logger (process-wide). `propagate` is a per-Logger attribute, so
    # nothing is assigned on the logging module itself.
    logging.getLogger().setLevel(logging.ERROR)

    with open(self.path, 'rb') as file:
        text = slate.PDF(file)
    text = self.modify(text)

    self.summary = TextPreprocessor().text_cleaning(text, '.*')
    self.corpus.append(self.summary)
    self.experience = TextPreprocessor().text_cleaning(
        text, 'Experience(.*?)Education')
    self.name = TextPreprocessor().text_cleaning(text, '(.*)Summary')
def PDFToText(file, wayToSaveFile, mode="simple"):
    """Convert a PDF to a text file, using OCR when it has no text layer.

    The text extracted with ``slate.PDF`` is only used as a probe: when
    ``funcoesUteis.treatTextField`` reduces it to an empty string the file is
    handed to ``PDFImgToText``; otherwise ``pdftotext64.exe`` is run with the
    given mode to write ``<wayToSaveFile>/<name>.txt``.

    :param file: location of the PDF file.
    :param wayToSaveFile: folder in which the text file is stored.
    :param mode: option passed to pdftotext as ``-<mode>``.
    :return: None. Errors during the conversion are printed, not raised.
    """
    nameFile = funcoesUteis.getOnlyNameFile(os.path.basename(file))
    wayToSave = f"{wayToSaveFile}/{nameFile}.txt"

    try:
        with open(file, 'rb') as filePdf:
            textPdf = "".join(slate.PDF(filePdf))

        has_text = funcoesUteis.treatTextField(textPdf) != ""
        if has_text:
            command = f'{fileDir}/exe/pdftotext64.exe -{mode} "{file}" "{wayToSave}"'
            os.system(command)
        else:
            PDFImgToText(file, wayToSaveFile)
    except Exception as ex:
        print(f"Nao foi possivel transformar o arquivo \"{file}\". O erro é: {str(ex)}")
def create_pdf_instance(filename):
    """Build a ``File`` holding the stripped text of all pages of a PDF.

    :param filename: name resolved to a path with ``download.get_path``.
    :return: ``File(all_text, filepath)`` where ``all_text`` is the
        concatenation of every stripped page text.
    :raises QWebValueMismatchError: when no text was extracted.
    :raises QWebFileNotFoundError: when a ``TypeError`` or ``PSEOF`` occurs
        while reading the file.
    """
    filepath = download.get_path(filename)
    try:
        with open(filepath, 'rb') as pdf_obj:
            pages = slate_pdf_reader.PDF(pdf_obj)
            all_text = ''.join(page.strip() for page in pages)

        if all_text == '':
            raise QWebValueMismatchError(
                'Text not found. Seems that the pdf is empty.')
        return File(all_text, filepath)
    except TypeError as e:
        raise QWebFileNotFoundError(
            f'File not found. Got {e} instead.') from e
    except PSEOF as e:
        raise QWebFileNotFoundError(
            f'File found, but it\'s not valid pdf-file: {e}') from e
def upload_pdf():
    """Handle the PDF upload form and render the summary page.

    On POST with an allowed file, the upload is saved to the configured
    upload folder, parsed with ``slate.PDF``, cleaned (tabs/newlines replaced
    by spaces, the Freeditorial banner removed) and joined with ``'. '``.
    The number of summary sentences is the ``num_sentences`` form value taken
    as a percentage of the tokenised sentence count. Requests without a file
    or with an empty file name are redirected to the same URL.

    :return: the rendered ``pdf.html`` template, or a redirect response.
    """
    title = "Text Summarizer"
    textsumm = " "
    article_text = " "
    num_senten = 0

    if flask.request.method == "POST":
        if 'file' not in flask.request.files:
            message = "No file is attached in request"
            return redirect(flask.request.base_url)

        file = flask.request.files['file']
        if file.filename == '':
            message = "no file selected"
            return redirect(flask.request.base_url)

        if file and allow_file(file.filename):
            newfile = secure_filename(file.filename)
            file.save(os.path.join(app.config['UPLOAD_FOLDER'], newfile))
            num_sent = float(flask.request.form['num_sentences'])

            saved_path = os.path.join(app.config['UPLOAD_FOLDER'], newfile)
            with open(saved_path, 'rb') as f:
                extracted_text = slate.PDF(f)

            # Same three replacements, in the same order, applied per page.
            banner = "Liked This Book? For More FREE e-Books visit Freeditorial.com \x0c"
            cleaned_pages = [
                page.replace("\t", " ").replace("\n", " ").replace(banner, "")
                for page in extracted_text
            ]

            article_text = '. '.join(str(page) for page in cleaned_pages)
            sentence_count = len(nltk.sent_tokenize(article_text))
            num_senten = int((num_sent * sentence_count) / 100)
            textsumm = pdf_summarizer(article_text, num_senten)

    return render_template("pdf.html",
                           title=title,
                           original_text=article_text,
                           output_summary=textsumm,
                           total=num_senten)