def _get_text_from_pdf(self, url: str) -> str:
    """Get the text from the PDF at the given URL."""
    with tempfile.NamedTemporaryFile() as tmp:
        download_file_from_url(url, tmp.name)
        with open(tmp.name, "rb") as f:
            text = extract_text(f)
    # Collapse whitespace, then drop thousands separators inside numbers.
    text = re.sub(r"\s+", " ", text)
    text = re.sub(r"(\d),(\d)", r"\1\2", text)
    return text

def parse_pdf(attachment):
    """Try to read a PDF attachment and return its contents as a string.

    Return False if extraction fails.
    """
    try:
        doc = base64.b64decode(attachment.content)
        return extract_text(io.BytesIO(doc))
    except Exception:  # not a bare except, which would also swallow KeyboardInterrupt
        return False

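# A minimal usage sketch for parse_pdf above. SimpleNamespace stands in for
# whatever attachment object the caller receives; only a base64-encoded
# `content` attribute is assumed, and "sample.pdf" is a hypothetical file.
import base64
from types import SimpleNamespace

def _demo_parse_pdf():
    with open("sample.pdf", "rb") as f:
        attachment = SimpleNamespace(content=base64.b64encode(f.read()))
    text = parse_pdf(attachment)
    print("extraction failed" if text is False else text[:200])
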
def calculate_skills_assessment(text, ca):
    # Lowercased skill titles required by the vacancy.
    vacancy_key_skills = [
        title.lower()
        for title in ca.core_vacancy.key_skills.all().values_list('title', flat=True)
    ]
    vacancy_additional_skills = [
        title.lower()
        for title in ca.core_vacancy.additional_skills.all().values_list('title', flat=True)
    ]

    # Natasha NLP pipeline for lemmatizing the CV text.
    segmenter = Segmenter()
    emb = NewsEmbedding()
    morph_tagger = NewsMorphTagger(emb)
    syntax_parser = NewsSyntaxParser(emb)
    morph_vocab = MorphVocab()

    # The `text` argument is ignored: the CV is re-extracted from the stored file.
    text = extract_text(ca.cv_file.path)
    doc = Doc(text)
    doc.segment(segmenter)
    doc.tag_morph(morph_tagger)
    doc.parse_syntax(syntax_parser)

    cv_key_skills = []
    cv_additional_skills = []
    for token in doc.tokens:
        token.lemmatize(morph_vocab)
        if token.lemma in vacancy_key_skills and token.lemma not in cv_key_skills:
            cv_key_skills.append(token.lemma)
        if token.lemma in vacancy_additional_skills and token.lemma not in cv_additional_skills:
            cv_additional_skills.append(token.lemma)

    # Guard the ratios against empty skill lists to avoid a ZeroDivisionError.
    candidate_conformity = {
        "key_skills": {
            "vacancy_key_skills": vacancy_key_skills,
            "cv_key_skills": cv_key_skills,
            "conformity_percent":
                len(cv_key_skills) / len(vacancy_key_skills) if vacancy_key_skills else 0.0,
        },
        "additional_skills": {
            "vacancy_additional_skills": vacancy_additional_skills,
            "cv_additional_skills": cv_additional_skills,
            "conformity_percent":
                len(cv_additional_skills) / len(vacancy_additional_skills)
                if vacancy_additional_skills else 0.0,
        },
    }
    return candidate_conformity

def extract(pdf_path):
    # A very large char_margin keeps each line of the PDF as a single text run.
    text = pdf.extract_text(pdf_path, laparams=pdflayout.LAParams(char_margin=1000.0))
    selected_needles = [needle for needle in parsers.keys() if needle in text]
    if not selected_needles:
        print("Pdf", pdf_path, "ignored, no parser for it.")
        # Keep the return arity consistent with the success path below.
        return None, pdf_path, text
    parser = parsers[selected_needles[0]]
    print("Loaded", pdf_path, "as", parser.__module__.split(".")[-1])
    return parser, pdf_path, text

async def planning(ctx, period: typing.Optional[str] = None):
    """`!planning [opt: vendredi|samedi|dimanche|semaine]` gives you the schedule and the link to the PDF."""
    embed = discord.Embed()
    url = 'https://www.hackingindustry.camp/Planning-HIC-2021.pdf'
    r = requests.get(url, allow_redirects=True)
    embed.add_field(name="link", value=url)
    embed.set_thumbnail(
        url='https://www.hackingindustry.camp/images/logos/Logo_HIC_White.png')
    bio = BytesIO(r.content)
    pdf = extract_text(bio)
    # Section headings as they appear in the (French) PDF, used as search needles.
    fields = [
        'planning', 'vendredi 5 février 2021', 'samedi 6 février 2021',
        'dimanche 7 février 2021',
        'du lundi 8 février au vendredi 12 février 2021'
    ]
    idxs = []
    idx_ends = []
    opt_list = {'vendredi': 1, 'samedi': 2, 'dimanche': 3, 'semaine': 4}
    for f in fields:
        try:
            idx = pdf.lower().index(f)
            idxs.append(idx)
            idx_ends.append(idx + len(f))
        except ValueError:
            pass
    if period is None:
        for i in range(len(idxs)):
            field_name = pdf[idxs[i]:idx_ends[i]]
            # None (not -1) so the last section keeps its final character.
            msg_end = None if i + 1 >= len(idxs) else idxs[i + 1]
            msg = pdf[idx_ends[i]:msg_end]
            embed.add_field(name=field_name, value=msg)
    elif period.lower() in opt_list:
        period = opt_list[period.lower()]
        field_name = pdf[idxs[period]:idx_ends[period]]
        msg_end = None if period + 1 >= len(idxs) else idxs[period + 1]
        msg = pdf[idx_ends[period]:msg_end]
        embed.add_field(name=field_name, value=msg)
    else:
        field_name = 'error'
        msg = "possible options are:\n"
        msg += "- `!planning` for the full schedule\n"
        for k in opt_list.keys():
            msg += f"- `!planning {k}`\n"
        embed.add_field(name=field_name, value=msg)
    await ctx.send(embed=embed)

def extract_text(self):
    # This method shadows pdfminer's extract_text by name; the unqualified
    # call below still resolves to the module-level function.
    full_path = os.path.join(self.pdf_path, self.file_name)
    text = extract_text(full_path)
    with open('./cv/pdf_to_text.txt', 'w') as file:
        file.write(text)

def scan_files_process(keywords: List[str], file: str) -> List[bool]:
    if file.endswith('.pdf'):
        process_file_text = extract_text(file)
    elif file.endswith('.docx'):
        process_file_text = docx2txt.process(file)
    else:
        with open(file, 'rt') as process_file:
            process_file_text = process_file.read()
    # Keywords are expected lowercase; the document text is lowercased here.
    return [k in process_file_text.lower() for k in keywords]

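# A usage sketch for scan_files_process, assuming lowercase keywords (the
# function lowercases the document text, not the keywords) and a local
# "report.pdf" (hypothetical file).
def _demo_scan_files():
    keywords = ["invoice", "total"]
    hits = scan_files_process(keywords, "report.pdf")
    for keyword, found in zip(keywords, hits):
        print(f"{keyword!r}: {'found' if found else 'missing'}")
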
def _extract_text(file_path):
    try:
        pdf_to_text = extract_text(file_path)
        # Join the non-blank lines into a single string.
        return ''.join(line for line in pdf_to_text.splitlines() if line.strip())
    except PDFSyntaxError as pse:
        logging.error('File parsing error: not a valid PDF file')
        raise Exception('Unable to parse PDF') from pse

def _parse_pdf_text(self, url: str) -> str:
    """Parse PDF text from a URL."""
    with tempfile.NamedTemporaryFile() as tmp:
        download_file_from_url(url, tmp.name)
        with open(tmp.name, "rb") as f:
            text = extract_text(f)
    # Join digits split by a space, then collapse the remaining whitespace.
    text = re.sub(r"(\d) (\d)", r"\1\2", text)
    text = re.sub(r"\s+", " ", text)
    return text

def parse_pdf(file_name, page_sep=False):
    '''Return the text of a PDF file.

    Args:
        file_name (str): the file name.
        page_sep (bool): return all text as one string if False,
            or a list with the text of each page if True.
    '''
    # Set up the pdfminer parsing environment.
    rsrcmgr = PDFResourceManager()
    retstr = io.StringIO()
    laparams = LAParams()
    device = TextConverter(rsrcmgr, retstr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    # Determine the number of pages in the file.
    fp = open(file_name, 'rb')
    length = len(list(PDFPage.get_pages(fp)))

    # Determine the start of the meeting.
    # for pageNumber, page in enumerate(PDFPage.get_pages(fp)):
    #     interpreter.process_page(page)
    #     content = retstr.getvalue().replace('\n', ' ').replace('  ', ' ').replace('\x0c', '')
    #     retstr.truncate(0)
    #     retstr.seek(0)
    #     if re.search("pengerusi", content):  # where the meeting started
    #         start_no = pageNumber
    #         print(file_name, start_no)
    #         break
    # if 'start_no' in locals():
    #     page_no = range(start_no, length)
    # else:
    #     print("Starting page not found", file_name)
    #     return None
    # Yet to determine a keyword that marks the start in all files.
    page_no = range(0, length)

    # Read the data.
    if not page_sep:
        # Get all text in the file as a single string.
        text = extract_text(file_name, page_numbers=page_no)
    else:
        # Store the content of each page separately in a list.
        text = []
        for pageNumber, page in enumerate(PDFPage.get_pages(fp)):
            if pageNumber in page_no:
                interpreter.process_page(page)
                text.append(retstr.getvalue().replace('\n', ' ').replace(
                    '  ', ' ').replace('\x0c', ''))
                retstr.truncate(0)
                retstr.seek(0)
    fp.close()
    return [file_name, text]

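# A sketch of both parse_pdf modes above: page_sep=False yields one string,
# page_sep=True a list with one entry per page. "minutes.pdf" is a
# hypothetical input file.
def _demo_parse_pdf_modes():
    name, full_text = parse_pdf("minutes.pdf", page_sep=False)
    _, pages = parse_pdf("minutes.pdf", page_sep=True)
    print(f"{name}: {len(full_text)} characters over {len(pages)} pages")
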
def readSpringerFile(filepath):
    """Open the Springer PDF file, which contains more than 400 URLs to free ebooks.

    param filepath: path to the PDF file from Springer
    returns: a list of URLs to ebooks.
    """
    pat = re.compile(r"http://link\.springer\.com/openurl.*", re.IGNORECASE)
    t = extract_text(filepath)
    links = pat.findall(t)
    return links

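# A usage sketch for readSpringerFile, assuming the Springer catalogue has
# been saved locally as "springer_free_ebooks.pdf" (hypothetical name).
def _demo_springer_links():
    for link in readSpringerFile("springer_free_ebooks.pdf"):
        print(link)
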
def _parse_drive_id_from_pdf(self, pdf_path):
    # Find the shortened link in the PDF text.
    with open(pdf_path, "rb") as f:
        text = extract_text(f)
    link = re.search(r"https://bit\.ly/.*", text).group()
    # Unshorten it by following the redirect.
    resp = requests.get(link)
    link = resp.url
    # Extract the id: the last path segment with its query string stripped
    # (assumes the unshortened URL ends in "<id>?<params>").
    return link.split("/")[-1].split("?")[-2]

def extract_from_pdf(pdf_file):
    """
    Extracts text from a PDF file.

    :param pdf_file: path to the file to be processed
    :type pdf_file: str
    :return: all the text from the file as one string
    :rtype: str
    """
    return pdf2txt.extract_text(pdf_file)

def read(self, path, html=False):
    if html:
        text = StringIO()
        with open(path, "rb") as f:
            extract_text_to_fp(f, text, laparams=LAParams(),
                               output_type="html", codec=None)
        text = text.getvalue()
    else:
        text = extract_text(path)
    return text

def read_pdf(filename):
    # Look up the PDF inside the scrapped_pdfs directory; os.path.join keeps
    # this portable instead of relying on a backslash-separated Windows path.
    path = os.path.join('scrapped_pdfs', filename + '.pdf')
    if os.path.isfile(path):
        print('File found...')
        print(extract_text(path))
    else:
        print('Path is wrong..')
        print(os.getcwd())

def parse_doc(terms, path_to_pdf):
    with open(path_to_pdf, 'rb') as patent_pdf:
        text = extract_text(patent_pdf)
    relevant_information = {}
    i = 0
    lines = text.splitlines()

    def get_element(line_number, paragraphs, start):
        # Collect `paragraphs` blank-line-terminated paragraphs starting at
        # line_number + start; return the text and the index reached.
        entry = ""
        index = line_number + start
        paragraph_count = 1
        while paragraph_count < paragraphs + 1:
            next_line = lines[index]
            entry += next_line
            if len(next_line) == 0:
                paragraph_count += 1
            index += 1
        return entry, index

    while i < len(lines):
        line = lines[i]
        if 'Sheet 1' in line or len(terms) == 0:
            break
        p = parse_for_keywords(line, terms)
        if p is not None:
            start = 1
            count = 1
            if p == 'Int. Cl.':
                count = 3
            elif p in ('Assignees', 'Inventors', 'CPC'):
                start = 0
            entry, index = get_element(i, count, start)
            relevant_information[p] = entry
            i = index
            terms.remove(p)
        else:
            i += 1

    new_info = {}
    for key, value in relevant_information.items():
        if key == 'CPC':
            cpc, uspc = parse_cpc_uspc(value)
            new_info['CPC'] = cpc
            new_info['USPC'] = uspc
        else:
            new_info[key] = parse_entries(key, value)
    return new_info

def get_pdf_content(self, url: str):
    """
    :param url: target URL to get PDF content from
    :return: string-like text of the content
    """
    r = requests.get(url)
    cache_path = './cache/' + self.fn
    with open(cache_path, 'wb+') as f:
        f.write(r.content)
    return ph.extract_text(cache_path)

def get_introduction(path):
    text = extract_text(path)
    text = clean_text(text)
    loc_begin = text.find("INTRODUCTION")
    # Stop at the first following section heading that is present.
    for stopper in ('RELATED', 'PRELIMINAR'):
        loc_end = text.find(stopper)
        if loc_end != -1:
            break
    else:
        loc_end = len(text)  # no stopper found: take everything to the end
    introduction = text[loc_begin:loc_end]
    return introduction

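# get_introduction (and read_pdf further below) call a clean_text helper that
# is not defined in this collection; a minimal sketch of what it plausibly
# does, offered purely as an assumption about the original:
import re

def clean_text(text: str) -> str:
    # Drop form feeds left by page breaks and collapse whitespace runs.
    text = text.replace("\x0c", " ")
    return re.sub(r"\s+", " ", text).strip()
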
def post_save_pdf(sender, instance, created, *args, **kwargs):
    if created:
        # Extract the CV text from the uploaded PDF and store it on the model.
        text = extract_text(instance.upload.path)
        instance.cv_text = text
        instance.save()

def extract_and_json(path, filename):
    """Extract the text of a PDF and save it as a JSON document."""
    text = extract_text(path)
    text = text.replace("\n", "")
    json_object = json.dumps({'text': [text]}, indent=4)
    # Write the JSON into the download folder under the given filename.
    with open(app.config['DOWNLOAD_FOLDER'] + filename, 'w') as output_stream:
        output_stream.write(json_object)

def convert_to_txt(pdf_docs_location, txt_files_location):
    for dirpath, dirnames, files in os.walk(pdf_docs_location):
        for file_name in files:
            raw_text = extract_text(os.path.join(dirpath, file_name), caching=False)
            # One sub-directory per document, named after the file without
            # its ".pdf" extension.
            out_dir = os.path.join(txt_files_location, file_name[:-4])
            os.makedirs(out_dir, exist_ok=True)
            with open(os.path.join(out_dir, file_name[:-4] + ".txt"), "w") as text_file:
                text_file.write(raw_text)

def extract_content_pdfMiner(file_name, directory):
    """Alternative way to extract PDF content."""
    try:
        text = extract_text(os.path.join(directory, file_name))
    except Exception:
        return np.nan
    return text

def translate_func():
    banner()
    # Open the PDF and report its page count.
    check_file(_path)
    name, ex = os.path.splitext(_path)
    with open(_path, 'rb') as fp:
        reader = PdfFileReader(fp)
        num_pages = reader.numPages
    pc.print("Pdf contains {} pages".format(num_pages), style="blue")

    # Extract the text and time the extraction.
    time_before = time.time()
    text = extract_text(_path)
    time_after = time.time()
    pc.print(
        "Extracted text from each page of {} in a total of {} seconds".format(
            _path, time_after - time_before),
        style="bright_cyan")

    # Put all the text in one new file.
    with open(f'{name}.txt', 'w') as fw:
        fw.write(text)
    pc.print('Successfully created text file for all text', style="bright_black")

    # Translate line by line using the worker pool.
    time_first = time.time()
    with open(f'{name}.txt', 'r') as fn:
        lines = fn.readlines()
    results = pool.map(request, lines)
    pool.close()
    pool.join()
    time_second = time.time()
    print("Translated %s lines in a total of %s seconds" %
          (len(lines), time_second - time_first))

    result_file = f'{name}_{lang}.txt'
    with open(result_file, 'w') as result:
        result.write(str(results))
    pc.print('Created your %s lang translated text file of %s' % (lang, _path),
             style="cyan")

def read_pdf(pdf_file_name):
    try:
        pdf_text = extract_text(pdf_file_name, laparams=laparams)
        return clean_text(pdf_text)
    except FileNotFoundError:
        err_message = f'ERROR NOFILE: {pdf_file_name}'
    except pdfminer.pdfparser.PDFSyntaxError:
        err_message = f'ERROR PDF: {pdf_file_name}'
    except Exception:
        err_message = f'ERROR FILE: {pdf_file_name}'
    return err_message

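# read_pdf above references a module-level `laparams`; a typical definition
# (an assumption, since it is not shown in this collection):
from pdfminer.layout import LAParams

laparams = LAParams(line_margin=0.5, char_margin=2.0)
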
def dl_paper(self):
    """Download the PDF for 'paper_id' into the working directory."""
    # Search for the paper id in the arXiv listing.
    search = arxiv.Search(id_list=[self.paper_id])
    # Take the first (and only) result.
    paper = next(search.results())
    # Extract the paper title.
    self.paper_title = paper.title
    # Download the paper as a PDF.
    paper.download_pdf(filename=self.paper_name)
    # Load the paper content.
    self.paper_content = extract_text(self.paper_name)

def extract_text_pdf(pdf):
    text = extract_text(pdf)
    # Re-join words hyphenated across line breaks, then treat blank lines as
    # paragraph separators.
    text = text.replace('-\n', '')
    text = text.replace('\n\n', '[SEP]')
    text = text.replace('\n', ' ')
    text = text.split('[SEP]')
    # Alternative that also drops very short chunks:
    # result = [line for line in text if len(line.split(' ')) > 8 and line.strip() != '']
    result = [line for line in text if line.strip() != '']
    return result

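# A usage sketch for extract_text_pdf: the return value is a list of
# paragraph-sized strings. "article.pdf" is a hypothetical input file.
def _demo_paragraphs():
    for paragraph in extract_text_pdf("article.pdf"):
        print(paragraph[:80])
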
def get_data(filepath):
    text = extract_text(filepath)
    lines = [x.strip() for x in text.splitlines() if x.strip()]
    data = {}
    data['date'] = lines[0]
    for i, x in enumerate(lines):
        if x == TOTAL_TESTS:
            # Read the number on the next line and stop scanning.
            data[x] = atoi(lines[i + 1])
            break
    return data

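# A usage sketch for get_data, which records the date on the first line and
# the figure following the TOTAL_TESTS label; TOTAL_TESTS and atoi are
# assumed to be defined elsewhere in the original module.
def _demo_get_data():
    data = get_data("daily_report.pdf")  # hypothetical input file
    print(data["date"], data.get(TOTAL_TESTS))
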
def main():
    text = extract_text("Projeto Grupou.pdf", page_numbers=range(14, 35)).lower()
    tagged = pre_process(text)
    chunks = filter_chunks(tagged)
    normal = []
    for c in chunks:
        if len(c) == 1:
            normal.append(c[0])
        else:
            normal.append([d[0] for d in c])
    normal.sort(key=len)
    for n in normal:
        print(' '.join(n))

def upload():
    if 'file' not in request.files:
        return {"error": 'no file submitted.'}
    if 'query' not in request.files:
        return {"error": "no query found."}
    file = request.files['file']
    query = json.loads(request.files['query'].read().decode('utf-8'))
    if file.filename == '':
        return {"error": "no file selected"}
    if file and allowed_file(file.filename):
        filename = secure_filename(file.filename)
        save_location = os.path.join(app.config['UPLOAD_FOLDER'], filename)
        if not os.path.exists(os.path.join('.', 'static', 'pdfs')):
            os.makedirs(os.path.join('.', 'static', 'pdfs'))
        file.save(save_location)
        txt = high_level.extract_text(save_location)
        conn = None
        try:
            c.addResume(txt)
            conn = psycopg2.connect("{}".format(os.getenv("URI")))
            cur = conn.cursor()
            cur.execute("SELECT * FROM jobs LIMIT 0;")
            colnames = [desc[0] for desc in cur.description]
            jobQuery = "%{}%".format(query['job'])
            locName = query['location']
            cur.execute("SELECT * FROM jobs WHERE descrip LIKE %s;", (jobQuery, ))
            results = []
            for row in cur:
                indRes = {}
                for i, colName in enumerate(colnames):
                    indRes[colName] = row[i]
                c.addJobDesc(row[1])
                indRes['grade'] = c.compareResumeToJob()
                results.append(indRes)
            conn.close()
            os.remove(save_location)
            return {"data": results}
        except Exception as e:
            print("Error occurred, closing connection.")
            print(e)
            # conn may still be None if connecting itself failed.
            if conn is not None:
                conn.close()
            return "Error occurred"
    else:
        return {"error": "no work"}

def read_pdf(filepath):
    try:
        full_text = extract_text(filepath)
    except Exception:
        full_text = ""
    doc = {
        'filepath': filepath,
        'full_text': full_text,
        'title': filepath,
        'author': "",
    }
    return doc