Beispiel #1
0
def leer_pdf_slate(ubicacion_archivo, password=None):
    """
    Load a PDF file with the slate3k library and extract the text of its pages.

    :param ubicacion_archivo: (str). Location of the PDF file to read.
    :param password: (str). Default: None. Optional password, used to read
        PDF files that are password protected.
    :return: (list). The object built by ``slate.PDF``, intended to be a
        list of strings holding the text extracted from each page.
    """
    import logging

    import slate3k as slate

    # Hide the warnings emitted while slate parses the document. This raises
    # the level of the *root* logger, so it affects the whole process.
    # (The former ``logging.propagate = False`` only created an unused
    # attribute on the logging module; ``propagate`` belongs to Logger
    # objects, so that line had no effect and was dropped.)
    logging.getLogger().setLevel(logging.ERROR)

    # Forward the password only when one was supplied, so slate keeps its own
    # default otherwise.
    argumentos_extra = () if password is None else (password,)
    with open(ubicacion_archivo, "rb") as f:
        paginas = slate.PDF(f, *argumentos_extra)
    return paginas
Beispiel #2
0
def process_resume_list():
    """
    Read the numbered resume PDFs, normalise their text and build the labels.

    For every resume ``c1.pdf`` .. ``c97.pdf`` in the hard-coded CVs folder
    the slate result is converted to a string, passed through
    ``remove_new_lines``, the ``escape_char`` substitution and
    ``remove_punctuation_marks``, and appended to the module-level
    ``resume_list``. Afterwards 36 labels of ``1`` followed by 61 labels of
    ``0`` are added to the module-level ``labelList``, which is printed as a
    numpy array.
    """
    for resumeNo in range(1, 98):
        resume = 'C:/Users/Muskaan Ratra/Desktop/CVs/CVs/c' + str(resumeNo) + '.pdf'

        # Close each file as soon as it is parsed instead of leaking one
        # open handle per resume.
        with open(resume, 'rb') as resumeFile:
            resumePdf = slate.PDF(resumeFile)

        # Remove new lines
        removeNewLines = remove_new_lines(str(resumePdf))

        # Remove escape chars
        escapeCharsString = re.sub(escape_char, " ", removeNewLines)

        # Remove punctuation marks
        finalString = remove_punctuation_marks(escapeCharsString)

        resume_list.append(finalString)

    # Labels: 36 entries of 1 followed by 61 entries of 0
    labelList.extend([1] * 36)
    labelList.extend([0] * 61)
    print(np.array(labelList))
def top_words(file_path, num):
    """
    Return the ``num`` most frequent words found in a PDF file.

    :param file_path: path of the PDF file to read.
    :param num: number of entries to return (forwarded to
        ``Counter.most_common``).
    :return: list of ``(word, count)`` tuples, most common first.
    """
    # Close the handle once the document has been parsed.
    with open(file_path, 'rb') as pdf_file:
        pdf = sk.PDF(pdf_file)
    # Raw string: "\w" in a normal literal is an invalid escape sequence.
    list_words = re.findall(r"\w+", pdf.text())
    return Counter(list_words).most_common(num)
Beispiel #4
0
def parsePdf(file_url):
    """
    Download a PDF and return its extracted text together with its page count.

    The response is streamed to ``temp.pdf`` in the current working
    directory, then parsed with slate for the text and with PyPDF2 for the
    number of pages.

    :param file_url: URL of the PDF to download.
    :return: tuple ``(extracted_text, numPage)``. When anything fails
        (download, writing the file, or parsing) the tuple is
        ``("No text", 0)``.
    """
    try:
        r = requests.get(file_url, stream=True)

        with open("temp.pdf", "wb") as pdf:
            # write one chunk at a time so the body is never held in memory
            for chunk in r.iter_content(chunk_size=1024):
                if chunk:
                    pdf.write(chunk)

        # The context manager closes the handle on every path. The former
        # close() call after the try block raised UnboundLocalError whenever
        # the failure happened before the file had been opened.
        with open("temp.pdf", 'rb') as pdfFileObj:
            extracted_text = slate.PDF(pdfFileObj)
            pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
            numPage = pdfReader.numPages
    except Exception:
        # Deliberate best-effort: anything that cannot be read as a PDF is
        # reported as an empty document.
        extracted_text = "No text"
        numPage = 0
    return extracted_text, numPage
Beispiel #5
0
def read_pdf(filepath):
    """
    Return the string form of a slate-parsed PDF without escaped whitespace.

    The slate result is converted with ``str()`` and every literal
    two-character sequence backslash-n, backslash-t and backslash-r in that
    string is replaced by a single space.

    :param filepath: path of the PDF file to read.
    :return: the cleaned string.
    """
    with open(filepath, 'rb') as handle:
        flattened = str(slate.PDF(handle))
    for escaped in ('\\n', '\\t', '\\r'):
        flattened = flattened.replace(escaped, ' ')
    return flattened
Beispiel #6
0
def select_doc(usr_msg):
    """
    Load the theory PDF that belongs to a department command.

    :param usr_msg: command text; ``/CSE``, ``/IT``, ``/ECE`` or ``/EEE``.
    :return: the ``slate.PDF`` object of the matching file, or ``None`` when
        the command is not one of the four above.
    """
    pdf_by_command = {
        "/CSE": '(1)CSE_only_theory.pdf',
        "/IT": '(1)IT_only_theory.pdf',
        "/ECE": '(1)ECE_only_theory.pdf',
        "/EEE": '(1)EEE_only_theory.pdf',
    }
    # Only strings can match a command; the guard also keeps unhashable
    # values (which compared unequal before) from breaking the lookup.
    pdf_name = pdf_by_command.get(usr_msg) if isinstance(usr_msg, str) else None
    if pdf_name is None:
        return None
    # Close the handle once slate has parsed the document.
    with open(pdf_name, 'rb') as fp:
        return slate.PDF(fp)
Beispiel #7
0
    def _text_to_series_of_pages(self, pdf_name: str) -> Tuple[pd.Series, int]:
        """
        :param pdf_name: full name of pdf (including .pdf extension) to be scraped and converted into a pd.Series
        :return: document_series: a pd.Series where each row contains the text of one pdf page.
                 num_pages: int, the number of pages of the input pdf file
        """
        assert pdf_name.endswith(
            '.pdf'
        ), 'Input file is not in .pdf format. The file cannot be processed.'
        document_series = pd.Series()
        if not self.from_s3_bucket:
            pdf = open(os.path.join(self.pdf_folder, pdf_name), 'rb')
        else:
            pdf = s3fs.S3FileSystem().open(
                pdf_name, 'rb'
            )  # no need to join with self.pdf_folder as s3fs includes that
        pdf_reader = slate3k.PDF(pdf)
        num_pages = len(pdf_reader)
        for i, page in enumerate(pdf_reader):
            logger.debug(f'Reading page {i+1} of PDF file {pdf_name}')
            page_text = self._clean_text(page)
            page_series = pd.Series(page_text)
            document_series = document_series.append(page_series,
                                                     ignore_index=True)
        pdf.close()

        return document_series, num_pages
Beispiel #8
0
def readPdf(filename):
    """
    Return the text of every page of a PDF as one string.

    :param filename: path of the PDF file to read.
    :return: the page texts concatenated in page order, with no separator.
    """
    with open(filename, 'rb') as handle:
        pages = slate.PDF(handle)
    return ''.join(pages)
 def extractText(self) -> List[str]:
     """
     Extract the text of every page of ``self.fileName``.

     The password is passed to slate only when ``self.isPasswordEnabled``
     is set. ``self.totalPages`` is updated with the number of pages read.

     :return: one string per page, after ``replaceNewLineWithEmptySpace``
         and then ``replaceSlashWithOr`` have been applied to it.
     :raises Exception: whatever opening or parsing the file raised; a
         summary of the error is written to stderr before it propagates.
     """
     try:
         with open(self.fileName, 'rb') as f:
             if self.isPasswordEnabled:
                 document = slate.PDF(f, self.password)
             else:
                 document = slate.PDF(f)
         self.totalPages = len(document)
         return [
             self.replaceSlashWithOr(self.replaceNewLineWithEmptySpace(page))
             for page in document
         ]
     except Exception as error:
         # The former handler called sys.exc_info() with an argument, which
         # itself raised TypeError and hid the real failure. Report the
         # error and let the original exception propagate instead.
         print("Unexpected Error: {}, {}, line: {}".format(
             type(error),
             error,
             error.__traceback__.tb_lineno), file=sys.stderr)
         raise
def extractFile(filePath, start, end, s, e):
    """
    Parse a PDF and hand its pages to the table and paragraph readers.

    :param filePath: path of the PDF file to parse.
    :param start: forwarded, together with ``end``, to ``readTable``.
    :param end: see ``start``.
    :param s: forwarded, together with ``e``, to ``readParagraph``.
    :param e: see ``s``.
    """
    with open(filePath, 'rb') as pdf_handle:
        pages = slate.PDF(pdf_handle)

    # Both readers work on the same parsed document; the table reader runs
    # first.
    readTable(pages, start, end)
    readParagraph(pages, s, e)
Beispiel #11
0
    def get_pdf_text(path):
        """
        Return the text of all pages of a PDF joined into one string.

        On every page non-breaking spaces (``\\xa0``) are turned into regular
        spaces and the page is stripped before it is added to the result.

        :param path: path of the PDF file to read.
        """
        with open(path, 'rb') as file:
            cleaned_pages = [
                page.replace("\xa0", " ").strip() for page in slate.PDF(file)
            ]

        return "".join(cleaned_pages)
def extractText(file):
    """
    Return the concatenated text of every page of a PDF.

    :param file: path of the PDF file to read.
    :return: one string with the text of all pages, in page order.
    """
    # The function used to ignore ``file`` and open an unrelated
    # ``pdfFileName`` name; read the file the caller asked for, and close
    # it once parsed.
    with open(file, "rb") as pdfFileObj:
        pdfPages = slate.PDF(pdfFileObj)

    # Extract text from PDF file
    return "".join(pdfPages)
Beispiel #13
0
def extract_text(file):
    """
    Return the text of every page of a PDF as one string.

    :param file: path of the PDF file to read.
    :return: the page texts concatenated in page order, with no separator.
    """
    # The handle used to stay open for the life of the process; close it as
    # soon as slate has parsed the document.
    with open(file, 'rb') as f:
        pdf = slate.PDF(f)

    return ''.join(pdf)
    def extract_text(self, pdf_filename: str) -> str:
        """Extract the text of a pdf with slate3k, pages separated by a blank line."""
        with open(pdf_filename, 'rb') as pdf_file:
            pages = slate3k.PDF(pdf_file)
        return "\n\n".join(pages)
Beispiel #15
0
 def pdfextract(self,file):
     """
     Return the lower-cased text of each page of a PDF.

     :param file: path of the PDF file to read.
     :return: list with one lower-cased string per page.
     """
     with open(file, 'rb') as handle:
         lowered_pages = [page.lower() for page in slate.PDF(handle)]
     return lowered_pages
Beispiel #16
0
def pdf2txt(path, logger):
    """
    Convert a PDF to text and save it under ``OUT_DIR``.

    The pages are joined with newlines and passed to ``save`` together with
    the output location: ``OUT_DIR`` plus the input file name with its
    extension changed to ``.txt``.

    :param path: path of the PDF file (anything ``Path`` accepts).
    :param logger: logger that receives an info message when the file cannot
        be opened or parsed.
    :return: None. Failures are logged and otherwise ignored.
    """
    path = Path(path)
    try:
        with open(path, 'rb') as f:
            doc = slate3k.PDF(f)
    except Exception as e:
        logger.info(f'{path}: + {str(e)}')
    else:
        text = '\n'.join(doc)
        # Swap only the extension: str.replace('pdf', 'txt') also rewrote
        # every other "pdf" inside the file name.
        save(text, OUT_DIR / path.with_suffix('.txt').name)
Beispiel #17
0
def make_prediction(resumeNo):
    """
    Classify one resume with the pickled model and vectoriser.

    :param resumeNo: zero-based resume index; the file ``c<resumeNo + 1>.pdf``
        in the hard-coded CVs folder is read.
    :return: first element of ``loaded_model.predict`` for the transformed
        first page of the resume.
    """
    resume = 'C:/Users/Muskaan Ratra/Desktop/CVs/CVs/c' + str(resumeNo+1) + '.pdf'
    # NOTE(review): pickle.load can execute arbitrary code from the file;
    # only load model/vectoriser files that come from a trusted source.
    # All three handles are now closed instead of being left open.
    with open(save_model, 'rb') as model_file:
        loaded_model = pickle.load(model_file)
    with open(save_vector, 'rb') as vector_file:
        loaded_vector = pickle.load(vector_file)
    with open(resume, 'rb') as resumeFile:
        sample_resume = slate.PDF(resumeFile)
    # Only the first page of the resume is used for the prediction.
    first_page = sample_resume[0]
    features = loaded_vector.transform([first_page])
    return loaded_model.predict(features)[0]
    
#print(make_prediction(4))
Beispiel #18
0
def slate_parser(pdf_path):
    '''
    Extract the text of the first page of a pdf document using slate3k
    :param pdf_path: Path to pdf location in the project
    :return: String with the content of the first page (index 0) of the pdf
    '''
    with open(pdf_path, 'rb') as pdf_file:
        pages = slate.PDF(pdf_file)

    first_page = pages[0]
    return str(first_page)
Beispiel #19
0
def readPdf(filename):
    """
    Print the base64-decoded text stored on the first page of a PDF.

    The first page is stripped, padded with ``=`` up to a multiple of four
    characters, base64-decoded, decoded as text and printed. Nothing is
    printed when the document has no pages or the page content cannot be
    decoded.

    :param filename: path of the PDF file to read.
    """
    with open(filename, 'rb') as f:
        doc = slate.PDF(f)
    try:
        b64Pdf = doc[0].strip()
        # -len % 4 is the number of '=' needed to reach a multiple of four.
        b64Pdf += '=' * (-len(b64Pdf) % 4)
        print(base64.b64decode(b64Pdf).decode())
    except (IndexError, ValueError):
        # Best-effort by design. IndexError: the document has no pages.
        # ValueError covers binascii.Error (invalid base64) and
        # UnicodeDecodeError (payload is not text). Other errors are no
        # longer hidden.
        pass
 def from_pdf(self, file):
     """
     Append the English pieces of a PDF's text to ``self.paragraphs``.

     Every page is split on a period followed by a space and a newline; each
     piece for which ``detect`` returns ``'en'`` is wrapped in ``Text`` and
     appended. Any error (unreadable file, parsing or detection failure)
     stops the extraction silently, keeping whatever was appended so far.

     :param file: path of the PDF file to read.
     """
     try:
         with open(file, 'rb') as fi:
             doc = slate3k.PDF(fi, word_margin=0)
             for page in doc:
                 for line in page.split('. \n'):
                     if detect(line) == 'en':
                         self.paragraphs.append(Text(line))
     except Exception:
         # Deliberate best-effort. Catching Exception instead of using a
         # bare except keeps KeyboardInterrupt and SystemExit from being
         # swallowed.
         pass
Beispiel #21
0
def pdf_extractor(path, vectors=False):
    """
    Extract the author, the per-page text and optionally per-page vectors.

    :param path: path of the PDF file.
    :param vectors: when true, ``vectorizer`` is run on every page (with the
        language returned by ``detect``) and the vectors are returned too.
    :return: ``(Classified, creator, paragraph_repo)`` or, with ``vectors``,
        ``(Classified, creator, paragraph_repo, vector)``. ``Classified`` is
        always ``"No"``; ``creator`` is the ``/Author`` entry of the document
        info or ``"Unknown"``; both dicts are keyed by the page counter as a
        string, starting at ``"1"``.
    """
    # PyPDF2 is only needed for the author, so its handle is closed right
    # after the lookup instead of staying open for the rest of the process.
    with open(path, 'rb') as file_object:
        pdf_file_reader = PyPDF2.PdfFileReader(file_object)
        try:
            creator = pdf_file_reader.getDocumentInfo()["/Author"]
        except Exception:
            # Missing document info or missing /Author entry.
            creator = "Unknown"

    current_page_number = 1
    paragraph_repo = {}
    vector = {}
    Classified = "No"

    # Reliably retrieve text from pdf
    with open(path, 'rb') as f:
        doc = slate.PDF(f)

    # Loop in all the pdf pages.
    for page in doc:
        page_key = str(current_page_number)
        page_vector = vectorizer(page, lang=detect(page)) if vectors else None
        paragraph_repo[page_key] = page
        vector[page_key] = page_vector

        if not page:
            # If can not extract text then use ocr lib to extract the
            # scanned pdf file.
            try:
                paragraph_repo[page_key] = fix_text(
                    textract.process(path,
                                     method='tesseract',
                                     encoding='utf-8'))
            except TimeoutError:
                # NOTE(review): the counter is not advanced on a timeout, so
                # the next page reuses this key and later pages are numbered
                # one lower. Kept as-is to preserve behaviour; confirm
                # whether that is intended.
                continue

        current_page_number += 1

    if vectors:
        return Classified, creator, paragraph_repo, vector
    return Classified, creator, paragraph_repo
Beispiel #22
0
def addFromPatt(path, NOP, pdfWriter):
    """
    Add one bookmark per page to ``pdfWriter``, titled from the page text.

    Each title is the value returned by ``extOp`` for the page followed by
    `` - Sid <page number>`` (1-based).

    :param path: path of the PDF whose text is used for the titles.
    :param NOP: number of pages to bookmark (page indexes ``0`` to
        ``NOP - 1``).
    :param pdfWriter: writer object that receives the bookmarks and the
        page-mode setting.
    """
    with open(path, 'rb') as f:
        text = slate.PDF(f)

    # The second line of the instruction file holds the offset handed to
    # extOp. The file is closed after reading instead of being left open.
    with open(instruction_file, 'rb') as instructions:
        pattern = instructions.readlines()
    offset = int(pattern[1])

    for i in range(NOP):
        bookmark = extOp(i, text, offset) + ' - Sid ' + str(i + 1)
        pdfWriter.addBookmark(bookmark, i, None)

    # show bookmarks on open
    pdfWriter.setPageMode("/UseOutlines")
Beispiel #23
0
def loadPDF(path: str):
    """Read a .pdf file and return its pages

    Arguments:
        path {str} -- Location of the .pdf file

    Returns:
        List -- The pages of the document as strings
    """
    with open(path, 'rb') as pdf_file:
        pages = slate.PDF(pdf_file)
    return pages
Beispiel #24
0
def search_wordclouds():
    """
    Return the word-cloud image names of the uploads that contain a word.

    The word is read from the ``word`` key of the JSON request body. Every
    file in ``UPLOAD_FOLDER`` is searched: ``.txt`` files are read as text,
    every other file is parsed as a PDF with slate.

    :return: JSON response holding the list of matching image names, each
        built as ``<file name up to its first dot>.jpeg``.
    """
    word = request.get_json()['word']
    upload_dir = os.path.expanduser(app.config['UPLOAD_FOLDER'])
    result_list = []
    for item in os.listdir(upload_dir):
        item_path = os.path.join(upload_dir, item)
        # isdir() needs the full path; a bare name was resolved against the
        # current working directory, so sub-directories were never skipped.
        if os.path.isdir(item_path):
            continue
        # splitext copes with names that have no dot or several dots, where
        # split(".")[1] raised IndexError or picked the wrong part.
        if os.path.splitext(item)[1] == ".txt":
            with open(item_path) as text_file:
                text = text_file.read()
        else:
            with open(item_path, 'rb') as f:
                text = "".join(slate.PDF(f))
        if word in text:
            result_list.append(item.split(".")[0] + ".jpeg")
    # ``{result_list}`` tried to build a set containing a list and always
    # raised TypeError (lists are unhashable); send the list itself.
    return jsonify(result_list)
Beispiel #25
0
def read_from_pdf(name_pdf_file, name_of_out_file="text"):
    """
    Convert ``PDF/<name_pdf_file>.pdf`` to ``PLAIN TEXT/<name_of_out_file>.txt``.

    The text of all pages is concatenated, form-feed characters (``\\x0c``)
    are removed, and the result is written as UTF-8.

    :param name_pdf_file: name of the source file, without folder or extension.
    :param name_of_out_file: name of the output file, without folder or
        extension.
    :return: the text that was written.
    """
    source_path = f"PDF/{name_pdf_file}.pdf"
    target_path = f"PLAIN TEXT/{name_of_out_file}.txt"

    with open(source_path, mode="rb") as pdf_file:
        pages = slate3k.PDF(pdf_file)

    text = "".join(pages).replace("\x0c", "")

    with open(target_path, mode="w", encoding="utf-8") as text_file:
        text_file.write(text)

    return text
Beispiel #26
0
 def TextIt2(self):
     """
     Convert every file in ./Brothers to a cleaned text file in ./TextFiles.

     Both folders are resolved against the current working directory;
     ``self.SafeMake`` is called on the output folder first. For every page,
     runs of newlines (optionally followed by a space) are collapsed to one
     newline and non-ASCII characters are removed. Pages that are longer
     than six characters after cleaning are written without their final
     character.
     """
     Brothers = os.path.join(os.getcwd(), "Brothers")
     TextFiles = os.path.join(os.getcwd(), "TextFiles")
     self.SafeMake(TextFiles)
     for file in os.listdir(Brothers):
         # Swap only the extension: str.replace("pdf", "txt") also rewrote
         # every other "pdf" inside the file name.
         out_name = os.path.splitext(file)[0] + ".txt"
         # Parse first, so a PDF that fails to parse leaves no empty output
         # file behind; the context managers close both files on any error.
         with open(os.path.join(Brothers, file), "rb") as iFile:
             doc = slate.PDF(iFile)
         with open(os.path.join(TextFiles, out_name), "w+",
                   encoding="utf-8") as oFile:
             for page in doc:
                 page = re.sub(r'\n+ ', '\n', page)
                 page = re.sub(r'\n+', '\n', page)
                 page = re.sub(r'[^\x00-\x7F]+', '', page)
                 if len(page) > 6:
                     oFile.write(page[:-1])
     return
Beispiel #27
0
    def parse(self):
        """
        Parse the PDF at ``self.path`` and fill the text-derived attributes.

        The slate result is passed through ``self.modify`` and then through
        ``TextPreprocessor.text_cleaning`` with three patterns: ``'.*'`` gives
        ``self.summary`` (which is also appended to ``self.corpus``),
        ``'Experience(.*?)Education'`` gives ``self.experience`` and
        ``'(.*)Summary'`` gives ``self.name``.
        :return: None
        """
        # Raise the root logger level so warnings emitted while parsing are
        # not shown. (The former ``logging.propagate = False`` only created
        # an unused attribute on the logging module - ``propagate`` belongs
        # to Logger objects - so that line had no effect and was dropped.)
        logging.getLogger().setLevel(logging.ERROR)

        with open(self.path, 'rb') as file:
            text = slate.PDF(file)
        text = self.modify(text)
        self.summary = TextPreprocessor().text_cleaning(text, '.*')
        self.corpus.append(self.summary)
        self.experience = TextPreprocessor().text_cleaning(
            text, 'Experience(.*?)Education')
        self.name = TextPreprocessor().text_cleaning(text, '(.*)Summary')
def PDFToText(file, wayToSaveFile, mode="simple"):
    """
    Convert a PDF to a text file, falling back to the image-based converter.

    The text is first extracted with slate. When
    ``funcoesUteis.treatTextField`` returns an empty string for it, the file
    is handed to ``PDFImgToText``; otherwise ``pdftotext64.exe`` is run to
    write ``<wayToSaveFile>/<name>.txt``.

    :param file: path of the PDF file.
    :param wayToSaveFile: folder that receives the text file.
    :param mode: option passed to pdftotext as ``-<mode>``.
    :return: None. Failures are printed and otherwise ignored.
    """
    import subprocess

    nameFile = funcoesUteis.getOnlyNameFile(os.path.basename(file))
    wayToSave = f"{wayToSaveFile}/{nameFile}.txt"
    try:
        with open(file, 'rb') as filePdf:
            textPdf = "".join(slate.PDF(filePdf))

        if funcoesUteis.treatTextField(textPdf) == "":
            PDFImgToText(file, wayToSaveFile)
        else:
            # An argument list bypasses the shell, so characters in the file
            # names can no longer be interpreted as shell syntax. As before,
            # the exit status of the tool is not checked.
            subprocess.run([
                f'{fileDir}/exe/pdftotext64.exe',
                f'-{mode}',
                file,
                wayToSave,
            ])

    except Exception as ex:
        print(f"Nao foi possivel transformar o arquivo \"{file}\". O erro é: {str(ex)}")
Beispiel #29
0
 def create_pdf_instance(filename):
     """
     Build a ``File`` from the stripped text of all pages of a PDF.

     :param filename: name resolved to a path through ``download.get_path``.
     :return: ``File(all_text, filepath)``.
     :raises QWebValueMismatchError: when no text was extracted.
     :raises QWebFileNotFoundError: when a ``TypeError`` or ``PSEOF`` is
         raised while opening or parsing the file.
     """
     filepath = download.get_path(filename)
     try:
         with open(filepath, 'rb') as pdf_obj:
             pages = slate_pdf_reader.PDF(pdf_obj)
             all_text = ''.join(page.strip() for page in pages)
             if all_text == '':
                 raise QWebValueMismatchError(
                     'Text not found. Seems that the pdf is empty.')
             return File(all_text, filepath)
     except TypeError as e:
         raise QWebFileNotFoundError(
             f'File not found. Got {e} instead.') from e
     except PSEOF as e:
         raise QWebFileNotFoundError(
             f'File found, but it\'s not valid pdf-file: {e}') from e
Beispiel #30
0
def upload_pdf():
    """
    Flask view: summarise an uploaded PDF and render ``pdf.html``.

    On a POST request with an allowed file, the file is saved in
    ``UPLOAD_FOLDER``, its pages are cleaned (tabs and newlines become
    spaces, the Freeditorial banner is removed) and joined with ``'. '``.
    The summary length is ``num_sentences`` percent of the number of
    sentences found by ``nltk.sent_tokenize``, truncated to an integer.
    A POST without a file part or without a selected file redirects back to
    the same URL. Any other request renders the page with blank texts.

    :return: the redirect or the rendered template.
    """
    title = "Text Summarizer"
    textsumm = " "
    article_text = " "
    num_senten = 0

    if flask.request.method == "POST":
        if 'file' not in flask.request.files:
            # No file part attached to the request.
            return redirect(flask.request.base_url)
        file = flask.request.files['file']
        if file.filename == '':
            # The form was submitted without selecting a file.
            return redirect(flask.request.base_url)
        if file and allow_file(file.filename):
            newfile = secure_filename(file.filename)
            # One variable for the saved path; it used to be built twice and
            # stored under a name that shadowed the builtin ``input``.
            pdf_path = os.path.join(app.config['UPLOAD_FOLDER'], newfile)
            file.save(pdf_path)
            num_sent = float(flask.request.form['num_sentences'])
            with open(pdf_path, 'rb') as f:
                extracted_text = slate.PDF(f)

            banner = "Liked This Book?  For More FREE e-Books visit Freeditorial.com              \x0c"
            # Same three replacements, in the same order, in a single pass.
            extracted_text = [
                page.replace("\t", " ").replace("\n", " ").replace(banner, "")
                for page in extracted_text
            ]

            article_text = '. '.join(str(elem) for elem in extracted_text)
            sentences_original = nltk.sent_tokenize(article_text)
            n = len(sentences_original)
            num_senten = int((num_sent * n) / 100)
            textsumm = pdf_summarizer(article_text, num_senten)

    return render_template("pdf.html",
                           title=title,
                           original_text=article_text,
                           output_summary=textsumm,
                           total=num_senten)