def _get_text_from_pdf(self, url: str) -> str:
    """Get the text from the PDF at the given URL."""
    with tempfile.NamedTemporaryFile() as tmp:
        download_file_from_url(url, tmp.name)
        with open(tmp.name, "rb") as f:
            text = extract_text(f)
    # Collapse whitespace, then drop thousands separators inside numbers.
    text = re.sub(r"\s+", " ", text)
    text = re.sub(r"(\d),(\d)", r"\1\2", text)
    return text

def parse_pdf(attachment):
    """Try to read a PDF attachment and return its contents as a string.

    Return False if extraction fails.
    """
    try:
        doc = base64.b64decode(attachment.content)
        return extract_text(io.BytesIO(doc))
    except Exception:  # not a bare except, which would also swallow KeyboardInterrupt
        return False

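# A minimal usage sketch for parse_pdf above. SimpleNamespace stands in for
# whatever attachment object the caller receives; only a base64-encoded
# `content` attribute is assumed, and "sample.pdf" is a hypothetical file.
import base64
from types import SimpleNamespace

def _demo_parse_pdf():
    with open("sample.pdf", "rb") as f:
        attachment = SimpleNamespace(content=base64.b64encode(f.read()))
    text = parse_pdf(attachment)
    print("extraction failed" if text is False else text[:200])
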
def calculate_skills_assessment(text, ca):
    # Lowercased skill titles required by the vacancy.
    vacancy_key_skills = [
        title.lower()
        for title in ca.core_vacancy.key_skills.all().values_list('title', flat=True)
    ]
    vacancy_additional_skills = [
        title.lower()
        for title in ca.core_vacancy.additional_skills.all().values_list('title', flat=True)
    ]

    # Natasha NLP pipeline for lemmatizing the CV text.
    segmenter = Segmenter()
    emb = NewsEmbedding()
    morph_tagger = NewsMorphTagger(emb)
    syntax_parser = NewsSyntaxParser(emb)
    morph_vocab = MorphVocab()

    # The `text` argument is ignored: the CV is re-extracted from the stored file.
    text = extract_text(ca.cv_file.path)
    doc = Doc(text)
    doc.segment(segmenter)
    doc.tag_morph(morph_tagger)
    doc.parse_syntax(syntax_parser)

    cv_key_skills = []
    cv_additional_skills = []
    for token in doc.tokens:
        token.lemmatize(morph_vocab)
        if token.lemma in vacancy_key_skills and token.lemma not in cv_key_skills:
            cv_key_skills.append(token.lemma)
        if token.lemma in vacancy_additional_skills and token.lemma not in cv_additional_skills:
            cv_additional_skills.append(token.lemma)

    # Guard the ratios against empty skill lists to avoid a ZeroDivisionError.
    candidate_conformity = {
        "key_skills": {
            "vacancy_key_skills": vacancy_key_skills,
            "cv_key_skills": cv_key_skills,
            "conformity_percent":
                len(cv_key_skills) / len(vacancy_key_skills) if vacancy_key_skills else 0.0,
        },
        "additional_skills": {
            "vacancy_additional_skills": vacancy_additional_skills,
            "cv_additional_skills": cv_additional_skills,
            "conformity_percent":
                len(cv_additional_skills) / len(vacancy_additional_skills)
                if vacancy_additional_skills else 0.0,
        },
    }
    return candidate_conformity

def extract(pdf_path):
    # A very large char_margin keeps each line of the PDF as a single text run.
    text = pdf.extract_text(pdf_path, laparams=pdflayout.LAParams(char_margin=1000.0))
    selected_needles = [needle for needle in parsers.keys() if needle in text]
    if not selected_needles:
        print("Pdf", pdf_path, "ignored, no parser for it.")
        # Keep the return arity consistent with the success path below.
        return None, pdf_path, text
    parser = parsers[selected_needles[0]]
    print("Loaded", pdf_path, "as", parser.__module__.split(".")[-1])
    return parser, pdf_path, text

async def planning(ctx, period: typing.Optional[str] = None):
    """`!planning [opt: vendredi|samedi|dimanche|semaine]` gives you the schedule and the link to the PDF."""
    embed = discord.Embed()
    url = 'https://www.hackingindustry.camp/Planning-HIC-2021.pdf'
    r = requests.get(url, allow_redirects=True)
    embed.add_field(name="link", value=url)
    embed.set_thumbnail(
        url='https://www.hackingindustry.camp/images/logos/Logo_HIC_White.png')
    bio = BytesIO(r.content)
    pdf = extract_text(bio)
    # Section headings as they appear in the (French) PDF, used as search needles.
    fields = [
        'planning', 'vendredi 5 février 2021', 'samedi 6 février 2021',
        'dimanche 7 février 2021',
        'du lundi 8 février au vendredi 12 février 2021'
    ]
    idxs = []
    idx_ends = []
    opt_list = {'vendredi': 1, 'samedi': 2, 'dimanche': 3, 'semaine': 4}
    for f in fields:
        try:
            idx = pdf.lower().index(f)
            idxs.append(idx)
            idx_ends.append(idx + len(f))
        except ValueError:
            pass
    if period is None:
        for i in range(len(idxs)):
            field_name = pdf[idxs[i]:idx_ends[i]]
            # None (not -1) so the last section keeps its final character.
            msg_end = None if i + 1 >= len(idxs) else idxs[i + 1]
            msg = pdf[idx_ends[i]:msg_end]
            embed.add_field(name=field_name, value=msg)
    elif period.lower() in opt_list:
        period = opt_list[period.lower()]
        field_name = pdf[idxs[period]:idx_ends[period]]
        msg_end = None if period + 1 >= len(idxs) else idxs[period + 1]
        msg = pdf[idx_ends[period]:msg_end]
        embed.add_field(name=field_name, value=msg)
    else:
        field_name = 'error'
        msg = "possible options are:\n"
        msg += "- `!planning` for the full schedule\n"
        for k in opt_list.keys():
            msg += f"- `!planning {k}`\n"
        embed.add_field(name=field_name, value=msg)
    await ctx.send(embed=embed)

def extract_text(self):
    # This method shadows pdfminer's extract_text by name; the unqualified
    # call below still resolves to the module-level function.
    full_path = os.path.join(self.pdf_path, self.file_name)
    text = extract_text(full_path)
    with open('./cv/pdf_to_text.txt', 'w') as file:
        file.write(text)

def scan_files_process(keywords: List[str], file: str) -> List[bool]:
    if file.endswith('.pdf'):
        process_file_text = extract_text(file)
    elif file.endswith('.docx'):
        process_file_text = docx2txt.process(file)
    else:
        with open(file, 'rt') as process_file:
            process_file_text = process_file.read()
    # Keywords are expected lowercase; the document text is lowercased here.
    return [k in process_file_text.lower() for k in keywords]

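# A usage sketch for scan_files_process, assuming lowercase keywords (the
# function lowercases the document text, not the keywords) and a local
# "report.pdf" (hypothetical file).
def _demo_scan_files():
    keywords = ["invoice", "total"]
    hits = scan_files_process(keywords, "report.pdf")
    for keyword, found in zip(keywords, hits):
        print(f"{keyword!r}: {'found' if found else 'missing'}")
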
def _extract_text(file_path):
    try:
        pdf_to_text = extract_text(file_path)
        # Join the non-blank lines into a single string.
        return ''.join(line for line in pdf_to_text.splitlines() if line.strip())
    except PDFSyntaxError as pse:
        logging.error('File parsing error: not a valid PDF file')
        raise Exception('Unable to parse PDF') from pse

def _parse_pdf_text(self, url: str) -> str:
    """Parse PDF text from a URL."""
    with tempfile.NamedTemporaryFile() as tmp:
        download_file_from_url(url, tmp.name)
        with open(tmp.name, "rb") as f:
            text = extract_text(f)
    # Join digits split by a space, then collapse the remaining whitespace.
    text = re.sub(r"(\d) (\d)", r"\1\2", text)
    text = re.sub(r"\s+", " ", text)
    return text

def parse_pdf(file_name, page_sep=False):
    '''Return the text of a PDF file.

    Args:
        file_name (str): the file name.
        page_sep (bool): return all text as one string if False,
            or a list with the text of each page if True.
    '''
    # Set up the pdfminer parsing environment.
    rsrcmgr = PDFResourceManager()
    retstr = io.StringIO()
    laparams = LAParams()
    device = TextConverter(rsrcmgr, retstr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    # Determine the number of pages in the file.
    fp = open(file_name, 'rb')
    length = len(list(PDFPage.get_pages(fp)))

    # Determine the start of the meeting.
    # for pageNumber, page in enumerate(PDFPage.get_pages(fp)):
    #     interpreter.process_page(page)
    #     content = retstr.getvalue().replace('\n', ' ').replace('  ', ' ').replace('\x0c', '')
    #     retstr.truncate(0)
    #     retstr.seek(0)
    #     if re.search("pengerusi", content):  # where the meeting started
    #         start_no = pageNumber
    #         print(file_name, start_no)
    #         break
    # if 'start_no' in locals():
    #     page_no = range(start_no, length)
    # else:
    #     print("Starting page not found", file_name)
    #     return None
    # Yet to determine a keyword that marks the start in all files.
    page_no = range(0, length)

    # Read the data.
    if not page_sep:
        # Get all text in the file as a single string.
        text = extract_text(file_name, page_numbers=page_no)
    else:
        # Store the content of each page separately in a list.
        text = []
        for pageNumber, page in enumerate(PDFPage.get_pages(fp)):
            if pageNumber in page_no:
                interpreter.process_page(page)
                text.append(retstr.getvalue().replace('\n', ' ').replace(
                    '  ', ' ').replace('\x0c', ''))
                retstr.truncate(0)
                retstr.seek(0)
    fp.close()
    return [file_name, text]

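# A sketch of both parse_pdf modes above: page_sep=False yields one string,
# page_sep=True a list with one entry per page. "minutes.pdf" is a
# hypothetical input file.
def _demo_parse_pdf_modes():
    name, full_text = parse_pdf("minutes.pdf", page_sep=False)
    _, pages = parse_pdf("minutes.pdf", page_sep=True)
    print(f"{name}: {len(full_text)} characters over {len(pages)} pages")
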
def readSpringerFile(filepath):
    """Open the Springer PDF file, which contains more than 400 URLs to free ebooks.

    param filepath: path to the PDF file from Springer
    returns: a list of URLs to ebooks.
    """
    pat = re.compile(r"http://link\.springer\.com/openurl.*", re.IGNORECASE)
    t = extract_text(filepath)
    links = pat.findall(t)
    return links

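# A usage sketch for readSpringerFile, assuming the Springer catalogue has
# been saved locally as "springer_free_ebooks.pdf" (hypothetical name).
def _demo_springer_links():
    for link in readSpringerFile("springer_free_ebooks.pdf"):
        print(link)
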
def _parse_drive_id_from_pdf(self, pdf_path):
    # Find the shortened link in the PDF text.
    with open(pdf_path, "rb") as f:
        text = extract_text(f)
    link = re.search(r"https://bit\.ly/.*", text).group()
    # Unshorten it by following the redirect.
    resp = requests.get(link)
    link = resp.url
    # Extract the id: the last path segment with its query string stripped
    # (assumes the unshortened URL ends in "<id>?<params>").
    return link.split("/")[-1].split("?")[-2]

def extract_from_pdf(pdf_file):
    """
    Extracts text from a PDF file.

    :param pdf_file: path to the file to be processed
    :type pdf_file: str
    :return: all the text from the file as one string
    :rtype: str
    """
    return pdf2txt.extract_text(pdf_file)

def read(self, path, html=False):
    if html:
        text = StringIO()
        with open(path, "rb") as f:
            extract_text_to_fp(f, text, laparams=LAParams(),
                               output_type="html", codec=None)
        text = text.getvalue()
    else:
        text = extract_text(path)
    return text

def read_pdf(filename):
    # Look up the PDF inside the scrapped_pdfs directory; os.path.join keeps
    # this portable instead of relying on a backslash-separated Windows path.
    path = os.path.join('scrapped_pdfs', filename + '.pdf')
    if os.path.isfile(path):
        print('File found...')
        print(extract_text(path))
    else:
        print('Path is wrong..')
        print(os.getcwd())

def parse_doc(terms, path_to_pdf):
    with open(path_to_pdf, 'rb') as patent_pdf:
        text = extract_text(patent_pdf)
    relevant_information = {}
    i = 0
    lines = text.splitlines()

    def get_element(line_number, paragraphs, start):
        # Collect `paragraphs` blank-line-terminated paragraphs starting at
        # line_number + start; return the text and the index reached.
        entry = ""
        index = line_number + start
        paragraph_count = 1
        while paragraph_count < paragraphs + 1:
            next_line = lines[index]
            entry += next_line
            if len(next_line) == 0:
                paragraph_count += 1
            index += 1
        return entry, index

    while i < len(lines):
        line = lines[i]
        if 'Sheet 1' in line or len(terms) == 0:
            break
        p = parse_for_keywords(line, terms)
        if p is not None:
            start = 1
            count = 1
            if p == 'Int. Cl.':
                count = 3
            elif p in ('Assignees', 'Inventors', 'CPC'):
                start = 0
            entry, index = get_element(i, count, start)
            relevant_information[p] = entry
            i = index
            terms.remove(p)
        else:
            i += 1

    new_info = {}
    for key, value in relevant_information.items():
        if key == 'CPC':
            cpc, uspc = parse_cpc_uspc(value)
            new_info['CPC'] = cpc
            new_info['USPC'] = uspc
        else:
            new_info[key] = parse_entries(key, value)
    return new_info

def get_pdf_content(self, url: str):
    """
    :param url: target URL to get PDF content from
    :return: string-like text of the content
    """
    r = requests.get(url)
    cache_path = './cache/' + self.fn
    with open(cache_path, 'wb+') as f:
        f.write(r.content)
    return ph.extract_text(cache_path)

def get_introduction(path):
    text = extract_text(path)
    text = clean_text(text)
    loc_begin = text.find("INTRODUCTION")
    # Stop at the first following section heading that is present.
    for stopper in ('RELATED', 'PRELIMINAR'):
        loc_end = text.find(stopper)
        if loc_end != -1:
            break
    else:
        loc_end = len(text)  # no stopper found: take everything to the end
    introduction = text[loc_begin:loc_end]
    return introduction

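# get_introduction (and read_pdf further below) call a clean_text helper that
# is not defined in this collection; a minimal sketch of what it plausibly
# does, offered purely as an assumption about the original:
import re

def clean_text(text: str) -> str:
    # Drop form feeds left by page breaks and collapse whitespace runs.
    text = text.replace("\x0c", " ")
    return re.sub(r"\s+", " ", text).strip()
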
def post_save_pdf(sender, instance, created, *args, **kwargs):
    if created:
        # Extract the CV text from the uploaded PDF and store it on the model.
        text = extract_text(instance.upload.path)
        instance.cv_text = text
        instance.save()

def extract_and_json(path, filename):
    """Extract the text of a PDF and save it as a JSON document."""
    text = extract_text(path)
    text = text.replace("\n", "")
    json_object = json.dumps({'text': [text]}, indent=4)
    # Write the JSON into the download folder under the given filename.
    with open(app.config['DOWNLOAD_FOLDER'] + filename, 'w') as output_stream:
        output_stream.write(json_object)

def convert_to_txt(pdf_docs_location, txt_files_location):
    for dirpath, dirnames, files in os.walk(pdf_docs_location):
        for file_name in files:
            raw_text = extract_text(os.path.join(dirpath, file_name), caching=False)
            # One sub-directory per document, named after the file without
            # its ".pdf" extension.
            out_dir = os.path.join(txt_files_location, file_name[:-4])
            os.makedirs(out_dir, exist_ok=True)
            with open(os.path.join(out_dir, file_name[:-4] + ".txt"), "w") as text_file:
                text_file.write(raw_text)

def extract_content_pdfMiner(file_name, directory):
    """Alternative way to extract PDF content."""
    try:
        text = extract_text(os.path.join(directory, file_name))
    except Exception:
        return np.nan
    return text

def translate_func():
    banner()
    # Open the PDF and report its page count.
    check_file(_path)
    name, ex = os.path.splitext(_path)
    with open(_path, 'rb') as fp:
        reader = PdfFileReader(fp)
        num_pages = reader.numPages
    pc.print("Pdf contains {} pages".format(num_pages), style="blue")

    # Extract the text and time the extraction.
    time_before = time.time()
    text = extract_text(_path)
    time_after = time.time()
    pc.print(
        "Extracted text from each page of {} in a total of {} seconds".format(
            _path, time_after - time_before),
        style="bright_cyan")

    # Put all the text in one new file.
    with open(f'{name}.txt', 'w') as fw:
        fw.write(text)
    pc.print('Successfully created text file for all text', style="bright_black")

    # Translate line by line using the worker pool.
    time_first = time.time()
    with open(f'{name}.txt', 'r') as fn:
        lines = fn.readlines()
    results = pool.map(request, lines)
    pool.close()
    pool.join()
    time_second = time.time()
    print("Translated %s lines in a total of %s seconds" %
          (len(lines), time_second - time_first))

    result_file = f'{name}_{lang}.txt'
    with open(result_file, 'w') as result:
        result.write(str(results))
    pc.print('Created your %s lang translated text file of %s' % (lang, _path),
             style="cyan")

def read_pdf(pdf_file_name):
    try:
        pdf_text = extract_text(pdf_file_name, laparams=laparams)
        return clean_text(pdf_text)
    except FileNotFoundError:
        err_message = f'ERROR NOFILE: {pdf_file_name}'
    except pdfminer.pdfparser.PDFSyntaxError:
        err_message = f'ERROR PDF: {pdf_file_name}'
    except Exception:
        err_message = f'ERROR FILE: {pdf_file_name}'
    return err_message

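# read_pdf above references a module-level `laparams`; a typical definition
# (an assumption, since it is not shown in this collection):
from pdfminer.layout import LAParams

laparams = LAParams(line_margin=0.5, char_margin=2.0)
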
def dl_paper(self):
    """Download the PDF for 'paper_id' into the working directory."""
    # Search for the paper id in the arXiv listing.
    search = arxiv.Search(id_list=[self.paper_id])
    # Take the first (and only) result.
    paper = next(search.results())
    # Extract the paper title.
    self.paper_title = paper.title
    # Download the paper as a PDF.
    paper.download_pdf(filename=self.paper_name)
    # Load the paper content.
    self.paper_content = extract_text(self.paper_name)

def extract_text_pdf(pdf):
    text = extract_text(pdf)
    # Re-join words hyphenated across line breaks, then treat blank lines as
    # paragraph separators.
    text = text.replace('-\n', '')
    text = text.replace('\n\n', '[SEP]')
    text = text.replace('\n', ' ')
    text = text.split('[SEP]')
    # Alternative that also drops very short chunks:
    # result = [line for line in text if len(line.split(' ')) > 8 and line.strip() != '']
    result = [line for line in text if line.strip() != '']
    return result

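# A usage sketch for extract_text_pdf: the return value is a list of
# paragraph-sized strings. "article.pdf" is a hypothetical input file.
def _demo_paragraphs():
    for paragraph in extract_text_pdf("article.pdf"):
        print(paragraph[:80])
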
def get_data(filepath):
    text = extract_text(filepath)
    lines = [x.strip() for x in text.splitlines() if x.strip()]
    data = {}
    data['date'] = lines[0]
    for i, x in enumerate(lines):
        if x == TOTAL_TESTS:
            # Read the number on the next line and stop scanning.
            data[x] = atoi(lines[i + 1])
            break
    return data

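# A usage sketch for get_data, which records the date on the first line and
# the figure following the TOTAL_TESTS label; TOTAL_TESTS and atoi are
# assumed to be defined elsewhere in the original module.
def _demo_get_data():
    data = get_data("daily_report.pdf")  # hypothetical input file
    print(data["date"], data.get(TOTAL_TESTS))
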
def main():
    text = extract_text("Projeto Grupou.pdf", page_numbers=range(14, 35)).lower()
    tagged = pre_process(text)
    chunks = filter_chunks(tagged)
    normal = []
    for c in chunks:
        if len(c) == 1:
            normal.append(c[0])
        else:
            normal.append([d[0] for d in c])
    normal.sort(key=len)
    for n in normal:
        print(' '.join(n))

def upload():
    if 'file' not in request.files:
        return {"error": 'no file submitted.'}
    if 'query' not in request.files:
        return {"error": "no query found."}
    file = request.files['file']
    query = json.loads(request.files['query'].read().decode('utf-8'))
    if file.filename == '':
        return {"error": "no file selected"}
    if file and allowed_file(file.filename):
        filename = secure_filename(file.filename)
        save_location = os.path.join(app.config['UPLOAD_FOLDER'], filename)
        if not os.path.exists(os.path.join('.', 'static', 'pdfs')):
            os.makedirs(os.path.join('.', 'static', 'pdfs'))
        file.save(save_location)
        txt = high_level.extract_text(save_location)
        conn = None
        try:
            c.addResume(txt)
            conn = psycopg2.connect("{}".format(os.getenv("URI")))
            cur = conn.cursor()
            cur.execute("SELECT * FROM jobs LIMIT 0;")
            colnames = [desc[0] for desc in cur.description]
            jobQuery = "%{}%".format(query['job'])
            locName = query['location']
            cur.execute("SELECT * FROM jobs WHERE descrip LIKE %s;", (jobQuery, ))
            results = []
            for row in cur:
                indRes = {}
                for i, colName in enumerate(colnames):
                    indRes[colName] = row[i]
                c.addJobDesc(row[1])
                indRes['grade'] = c.compareResumeToJob()
                results.append(indRes)
            conn.close()
            os.remove(save_location)
            return {"data": results}
        except Exception as e:
            print("Error occurred, closing connection.")
            print(e)
            # conn may still be None if connecting itself failed.
            if conn is not None:
                conn.close()
            return "Error occurred"
    else:
        return {"error": "no work"}

def read_pdf(filepath):
    try:
        full_text = extract_text(filepath)
    except Exception:
        full_text = ""
    doc = {
        'filepath': filepath,
        'full_text': full_text,
        'title': filepath,
        'author': "",
    }
    return doc