def parsePhone():
    tika.initVM()
    parsed = parser.from_file("MyResume.docx")
    # Matches an optional +91 prefix followed by 10 digits.
    regular_expression = re.compile(r"(\+91\s*)?[0-9]{10}", re.IGNORECASE)
    result = re.search(regular_expression, parsed["content"])
    print("Phone No: " + result.group())
async def convert_pdf_to_txt(pdf_path: str, save_dir: str) -> None:
    """
    Converts a pdf file to a txt file and cleans the text.

    Parameters:
        pdf_path (str): The path where the pdf to convert is located
        save_dir (str): The path where to save the converted pdf

    Returns:
        None
    """
    if not hasattr(convert_pdf_to_txt, 'nlp'):
        convert_pdf_to_txt.nlp = spacy.load(ACCEPTED_LANGUAGES['es'])
        convert_pdf_to_txt.nlp.add_pipe(convert_pdf_to_txt.nlp.create_pipe('sentencizer'))
    try:
        tika.initVM()
        pdf_file = parser.from_file(pdf_path)
        async with AIOFile(save_dir, 'w') as text_file:
            doc = convert_pdf_to_txt.nlp(pdf_file['content'])
            # print(doc)
            # Fix sentences that have more newlines than they should.
            text = ''.join([re.sub(r'[,|;|\b]\n+\b', '\n',
                                   re.sub(r'\b\n+\b', '\n', s.text))
                            for s in doc.sents])
            # Eliminate extra newlines between paragraphs.
            paragraphs = split_text_into_paragraphs(text)
            new_text = '\n\n'.join(paragraphs)
            new_text = re.sub(r'-\s*\n+', '', new_text)  # Join split words.
            print(new_text)
            await text_file.write(new_text)
    except Exception as e:
        raise e
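A minimal usage sketch for the coroutine above, assuming it lives alongside the `ACCEPTED_LANGUAGES` mapping and `split_text_into_paragraphs` helper it references; the file paths are placeholders.

import asyncio

# Hypothetical driver; the paths are placeholders, not paths from the snippet.
if __name__ == '__main__':
    asyncio.run(convert_pdf_to_txt('input.pdf', 'output.txt'))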
def main():
    url = 'https://www.harrisonburgva.gov/sites/default/files/Police/files/POLICIES/Use_of_Force-1.pdf'
    file_path = '/Users/dturcan/Docs/campaign_zero/use_of_force_docs/harrisonburg_va.pdf'

    # Extract all of the lines
    tika.initVM()
    parsed = parser.from_file(file_path)
    content = parsed["content"]
    uof_parser = UOFParser(content)

    # Read in config
    try:
        config = yaml.safe_load(open('config.yaml'))
    except FileNotFoundError:
        config = yaml.safe_load(open('uof_parser/config.yaml'))

    # Run indicators:
    for policy, policy_indicators in config.items():
        print('-------------')
        print("Checking", policy)
        result = uof_parser.perform_search(
            policy_indicators.get('search_terms', []),
            policy_indicators.get('phrases_for_positive_indicator', []))
        print()
        print(policy, ":", result[0])
        print('Context:')
        print(result[1])
        print()
def __init__(self, server_url=None):
    if server_url:
        os.environ['TIKA_CLIENT_ONLY'] = 'True'
        os.environ['TIKA_SERVER_ENDPOINT'] = server_url
        print("Tika Server Endpoint %s" % os.environ['TIKA_SERVER_ENDPOINT'])
    import tika
    tika.initVM()
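A hedged sketch of how a constructor like this is typically exercised; the wrapper class name `TikaClient` is an assumption, since the snippet does not show its class.

# Hypothetical class name wrapping the __init__ above.
remote = TikaClient(server_url='http://localhost:9998')  # client-only mode against a running server
local = TikaClient()  # no URL: tika.initVM() may fetch the jar and start a local server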
def parseEmail():
    tika.initVM()
    parsed = parser.from_file("MyResume.docx")
    regular_expression = re.compile(
        r"[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}", re.IGNORECASE)
    result = re.search(regular_expression, parsed["content"])
    print("Email id: " + result.group())
def main():
    count = 0
    data_files = '/home/nimesh/Desktop/1000'
    output_file = open('grobidquantity_data_test.json', 'w+')
    for root, dirs, files in os.walk(data_files):
        for file in files:
            try:
                path = ''
                if file != '.DS_Store':
                    count += 1
                    print(count)
                    path = os.path.join(root, file)
                    tika.initVM()
                    parsed = tika.parser.from_file(path)
                    if "content" in parsed.keys():
                        type = parsed.get("metadata").get("Content-Type")
                        print(type)
                        content = parsed["content"]
                        if content is not None and ('application/pdf' in type
                                                    or 'application/xml' in type
                                                    or 'text/plain' in type):
                            # content is already text on Python 3; interpolate it
                            # directly into the shell command for grobid-quantities.
                            p = os.popen('curl -GET --data-urlencode '
                                         '"text=' + content + '" '
                                         'localhost:8080/processQuantityText').read()
                            json.dump(p, output_file)
                            output_file.write('\n')
            except Exception:
                continue
    output_file.close()
def fileworker(filequeue, dbqueue, monitorqueue, uforiamodules, config, rcontext):
    """
    Receives a file item from file_scanner inside the filequeue and
    executes the file_processor for that file. The fileworker operates
    as the entry point for each process, and is therefore also
    responsible for the execution of any expensive library
    initialization code.

    filequeue - The file queue
    dbqueue - The database queue
    monitorqueue - The monitoring queue to show information about
                   the current file
    uforiamodules - The uforia module objects from modulescanner
    config - The uforia configuration file
    rcontext - The recursion context
    """
    # Start the JCC JVM runtime for Tika
    if not rcontext.jvm_initialized:
        import tika
        tika.initVM()
        rcontext.jvm_initialized = True
    while True:
        item = filequeue.get()
        if item is None:
            # Finished.
            break
        else:
            file_processor(item, dbqueue, monitorqueue, uforiamodules, config, rcontext)
            filequeue.task_done()
    # Mark the None sentinel itself as done.
    filequeue.task_done()
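A hedged sketch of the producer side this worker implies: `file_scanner` (not shown here) would enqueue paths and a closing `None` sentinel, which explains both the `break` on `None` and the extra `task_done()` after the loop. The `discovered_files` name is hypothetical.

# Hypothetical producer; filequeue is the same joinable queue the worker consumes.
for path in discovered_files:
    filequeue.put(path)
filequeue.put(None)   # one sentinel per worker so its loop can exit
filequeue.join()      # returns once every put() has a matching task_done()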
def pdf2text(file):
    tika.initVM()
    parsed = parser.from_file(file)
    data = parsed["content"]
    # str.split() treats its argument literally, so split on the regex instead.
    list_sen = re.split(r'\s{4,}', data)
    for i in range(0, len(list_sen)):
        list_sen[i] = " ".join(list_sen[i].split())
    return annotator.tokenize(list_sen[0])
def __init__(self, **kwargs):
    server_url = kwargs['tika_url']
    if server_url:
        os.environ['TIKA_CLIENT_ONLY'] = 'True'
        os.environ['TIKA_SERVER_ENDPOINT'] = server_url
        print("Tika Server Endpoint %s" % os.environ['TIKA_SERVER_ENDPOINT'])
    import tika
    tika.initVM()
def __init__(self,
             beanstalk_host: Text = "127.0.0.1",
             beanstalk_port: int = 11300):
    self.client: Optional[greenstalk.Client] = None
    self.connect(beanstalk_host, beanstalk_port)
    logging.info("initialize tika VM")
    tika.initVM()
def get_content(self, data):
    import tika
    tika.initVM()
    # parser is a submodule; it must be imported explicitly.
    from tika import parser
    try:
        parsed = parser.from_file('/path/to/file')
        print(parsed["metadata"])
        return parsed["content"]
    except Exception:
        return "Error parsing content"
def __init__(self, tika_server_url):
    super(TikaParser, self).__init__('tika_parser')
    if tika_server_url:
        os.environ['TIKA_CLIENT_ONLY'] = 'True'
        os.environ['TIKA_SERVER_ENDPOINT'] = tika_server_url
        print("Tika Server Endpoint %s" % os.environ['TIKA_SERVER_ENDPOINT'])
    tika.initVM()
def __init__(self, path, **kwargs):
    if type(self).parser is None:
        # Tika is conditionally imported here
        import tika
        # Automatically downloads the tika jar and starts a JVM process
        # if no REST API is configured in ENV.
        tika.initVM()
        from tika import parser as tk_parser
        type(self).parser = tk_parser
    super(TikaPreprocessor, self).__init__(path, **kwargs)
def extract_and_store(ftype):
    if not os.path.exists("../data/{}_raw_text".format(ftype)):
        os.mkdir("../data/{}_raw_text".format(ftype))
    tika.initVM()
    for fidx, file in enumerate(os.listdir("../data/{}_pdfs".format(ftype))):
        print("{} -- {}".format(fidx, file))
        raw_text = extract_raw_text(ftype, file).encode('utf8')
        raw_text_path = "../data/{}_raw_text/{}".format(
            ftype, file.replace(".pdf", ".txt"))
        with open(raw_text_path, "wb") as f:
            f.write(raw_text)
def parseAnyFile():
    tika.initVM()
    parsed = parser.from_file("sample resume.docx")
    print(parsed["content"])
    print("------details after parsing resume--------")
    name = re.compile(r"([a-zA-Z]){3,}\s([a-zA-Z]){3,}")
    print("Name: " + re.search(name, parsed["content"]).group())
    ph = re.compile(r"(\d{10})")
    print("Phone Number: " + re.search(ph, parsed["content"]).group())
    mail = re.compile(r"[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}", re.IGNORECASE)
    print("Mail ID: " + re.search(mail, parsed["content"]).group())
    address = re.compile(r"[a-z\S0-9\S]+,\n+[a-zA-Z,\n]+[a-zA-Z.]")
    print("Address: " + re.search(address, parsed["content"]).group())
def extractPDFwithTika(arqs):
    """Uses Apache Tika to extract PDF text - https://pypi.org/project/tika/

    Arguments:
        arqs {list} -- A list of filenames with path
    """
    # Loading the Tika .jar server takes time and slows down the first use.
    tika.initVM()
    for arq in arqs:
        timeIni = perf_counter()
        textoCompleto = parser.from_file(arq)
        fileName = os.path.basename(arq)
        timeEnd = perf_counter()
        timeTotal = timeEnd - timeIni
        printMiniReport(textoCompleto["content"], fileName, "Tika", timeTotal)
        saveText(textoCompleto["content"], fileName, "Tika")
    print("--- Tika ---")
class TikaPreprocessor(DocPreprocessor):
    """
    This preprocessor uses the `Apache Tika <http://tika.apache.org>`_ parser
    to retrieve text content from complex file types such as DOCX, HTML and
    PDFs.

    Documentation for customizing Tika is
    `here <https://github.com/chrismattmann/tika-python>`_

    Example::

        !find pdf_dir -name *.pdf > input.csv # list of files
        from snorkel.parser import (
            TikaPreprocessor, CSVPathsPreprocessor, CorpusParser
        )
        CorpusParser().apply(
            CSVPathsPreprocessor('input.csv', parser_factory=TikaPreprocessor)
        )
    """
    # Tika is conditionally imported here
    import tika
    # Automatically downloads the tika jar and starts a JVM process
    # if no REST API is configured in ENV.
    tika.initVM()
    from tika import parser as tk_parser
    parser = tk_parser

    def parse_file(self, fp, file_name):
        parsed = type(self).parser.from_file(fp)
        txt = parsed['content']
        name = os.path.basename(fp).rsplit('.', 1)[0]
        stable_id = self.get_stable_id(name)
        doc = Document(
            name=name,
            stable_id=stable_id,
            meta={'file_name': file_name})
        yield doc, txt
def run():
    """
    Starts Uforia.

    Sets up the database, modules, and then invokes the file_scanner.
    """
    recursive = rcontext.is_recursive
    if not recursive:
        print("Uforia starting...")
    db = database.Database(config)
    if not recursive:
        db.setup_main_table()
        db.setup_mimetypes_table()
    if config.ENABLEMODULES:
        if config.DEBUG:
            print("Detecting available modules...")
        uforiamodules = modules.Modules(config, db, rcontext)
        if not recursive:
            fill_mimetypes_table(uforiamodules)
    else:
        uforiamodules = ''
    # Start the JCC JVM runtime for Tika
    if not rcontext.jvm_initialized:
        import tika
        tika.initVM()
        rcontext.jvm_initialized = True
    if config.DEBUG:
        print("Starting producer...")
    if os.path.exists(config.STARTDIR):
        file_scanner(config.STARTDIR, uforiamodules, rcontext)
    else:
        print("The pathname " + config.STARTDIR + " does not exist, stopping...")
    if not recursive:
        print("\nUforia completed...\n")
def getPDF(filename):
    import tika
    tika.initVM()
    from tika import parser
    parsed = parser.from_file(filename)
    return parsed["content"].split("\n")
from logging import info
from pathlib import Path

import tika
from web.datasets.services import get_s3_client

from django.conf import settings
from django.core.exceptions import ImproperlyConfigured
from dotenv import find_dotenv, load_dotenv
from dramatiq import actor, set_broker
from dramatiq.brokers.rabbitmq import RabbitmqBroker
from tika import parser

tika.initVM()  # noqa

# This (ugly) block lets this module work both inside and outside Django
try:
    from web.datasets.models import File
except ImproperlyConfigured:
    import configurations
    import os

    os.environ.setdefault("DJANGO_CONFIGURATION", "Dev")
    os.environ.setdefault("DJANGO_SETTINGS_MODULE", "web.settings")
    load_dotenv(find_dotenv())
    configurations.setup()
    from web.datasets.models import File

rabbitmq_broker = RabbitmqBroker(url=settings.BROKER_URL)
set_broker(rabbitmq_broker)
client = get_s3_client(settings)
def textpdf(document):
    tika.initVM()
    # from_file() expects a path string; encoding it to bytes breaks on Python 3.
    parsed = parser.from_file(document)
    return parsed["content"]
def startVM(self, proxy_host, proxy_port, httpsProxy, no_proxy):
    args = self.__create_vm_args(proxy_host, proxy_port, httpsProxy, no_proxy)
    log.debug("Args for JVM: {}. Maxheap: {}".format(args, self.jvm_max_heap))
    vm = tika.initVM(maxheap=self.jvm_max_heap, vmargs=args)
    return vm
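A hedged usage sketch for the method above; the owning class (here called `TikaVM`) and the way `jvm_max_heap` gets set are assumptions based only on the method body.

# Hypothetical wrapper exposing startVM; attribute names follow the method body.
vm_manager = TikaVM()
vm_manager.jvm_max_heap = '1024m'
vm = vm_manager.startVM('proxy.example.com', 8080,
                        'https://proxy.example.com:8443', 'localhost')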
import sys
import os

import tika
tika.initVM()
from tika import parser


def read_html_file(fileop):
    parsed = parser.from_file(fileop)
    return parsed["content"]


def read_training_file(filename):
    # Open the output in text mode with an explicit encoding so the
    # parsed content can be written as str.
    filewr = open("big.txt", "w", encoding="utf-8")
    HTMLDirs = next(os.walk(filename))[2]
    print(HTMLDirs)
    count = 0
    for files in HTMLDirs:
        fileop = os.path.join(filename, files)
        text = read_html_file(fileop)
        print(text)
        filewr.write(text)
        count = count + 1
        print(count)


trainingfile = sys.argv[1]
read_training_file(trainingfile)
def test_tika(self):
    image = 'https://raw.githubusercontent.com/nmcteam/image-with-text/master/example/destination.jpg'
    tika.initVM()
    parsed = parser.from_file(image)
def fileanalisis(f_in_tika):
    global patrones
    global busqueda_pln

    # "resultados" table:
    # Stores a summary of the total findings for each type found.
    # Format: name, hash, impact, occurrence count, risk
    resultados = []

    # "resultadodetalle" table:
    # Stores the detail of each item found.
    # Format: type, data
    resultadodetalle = []

    #################################
    # Parse the file with Tika
    #################################
    tika.initVM()
    parsed = parser.from_file(f_in_tika)
    # Extract the content of the parsed file. The other option is to
    # extract the file's metadata.
    doctika = parsed["content"]

    # Load spaCy, the natural language processor (NLP)
    nlp = spacy.load('en_core_web_sm')
    # To test with Spanish, use the next line instead.
    # nlp = spacy.load('es_core_new_sm')

    # Load the elements of "patrones" into the results table, converting the
    # names to hashes. The count and risk columns start at zero; they are
    # computed later.
    for name, patron, impacto in patrones:
        resultados.append([name, nlp.vocab.strings[name], impacto, 0, 0])

    # Load the elements of "busqueda_pln" into the results table.
    # The count and risk columns start at zero; they are computed later.
    for name, hash, impacto in busqueda_pln:
        resultados.append([name, hash, impacto, 0, 0])

    print("\nName of the file to analyze:", f_in_tika)

    #########################################################################
    # Process the character string returned by Tika with spaCy
    #########################################################################
    doc = nlp(parsed["content"])

    # Create the object that holds all the matches
    matcher_obj = Matcher(nlp.vocab)

    #########################################################
    # busqueda_pln via regular expressions (the patrones list)
    #########################################################
    # Add every pattern to the matcher so spaCy can search for the
    # regular expressions.
    for nombre, pat, impacto in patrones:
        matcher_obj.add(nombre, None, pat)

    # Run the search.
    # "coincidencias" holds every regular-expression match in the
    # format hash, start, end (within the document).
    coincidencias = matcher_obj(doc)

    # Walk the list of matches and look them up in the results table using
    # the hash as id. Each finding increments the cant_ocurrencias field by 1.
    for var in range(len(coincidencias)):
        hash = coincidencias[var][0]
        for index in range(len(resultados)):
            if hash == resultados[index][1]:
                resultadodetalle.append([
                    resultados[index][0],
                    doc[coincidencias[var][1]:coincidencias[var][2]].text
                ])
                resultados[index][3] += 1

    # Remove duplicates
    resultadodetalle = removeDuplicates(resultadodetalle)

    ######################################################################
    # busqueda_pln via spaCy NLP, using Named Entity Recognition (NER)
    ######################################################################
    # Entities to search for: name, hash (all 0), impact

    # List of entities found
    entidades = []

    # Collect the entities found according to the search spec and add
    # the findings to "entidades".
    for ent in doc.ents:
        for index in range(len(busqueda_pln)):
            if ent.label_ == busqueda_pln[index][0]:
                entidades.append([ent.label_, ent.text])
                resultadodetalle.append([ent.label_, ent.text])

    # Remove duplicates
    entidades = removeDuplicates(entidades)

    print("\nEntities\n")
    print(entidades)

    # Print the detailed results sorted by event type
    print("\nDetailed findings")
    resultadodetalle = sorted(resultadodetalle, key=lambda item: item[0], reverse=False)
    for nombre, detalle in resultadodetalle:
        print(nombre, detalle)

    # Walk all the found entities again; for each finding add 1 to the
    # results table.
    for ente in range(len(entidades)):
        tipo = entidades[ente][0]
        for index in range(len(resultados)):
            if tipo == resultados[index][0]:
                resultados[index][3] += 1

    ###########################################################################
    # Compute the file risk with the formula: risk = impact * occurrence count
    ###########################################################################
    for resul in range(len(resultados)):
        resultados[resul][4] = resultados[resul][2] * resultados[resul][3]

    # Sort the list by the weight column and print it again
    resultados = sorted(resultados, key=lambda item: item[4], reverse=True)

    riesgoarchivo = 0

    # Print the final statistics
    print("\nRESULTS TABLE\n")
    print("\nTYPE - IMPACT - COUNT - RISK\n")
    for linea in range(len(resultados)):
        print(resultados[linea][0], resultados[linea][2],
              resultados[linea][3], resultados[linea][4])
        riesgoarchivo += resultados[linea][4]
    print("\nFile risk:", riesgoarchivo)

    # Check the number of findings
    print('Total matches in the document:', len(resultados))
    return riesgoarchivo, resultados, resultadodetalle, doctika
def __init__(self):
    print('initializing pdf reader...')
    tika.initVM()
    print('done.')
def getpdf(filename):
    # print("Enter the name of the PDF file to extract text from.")
    tmp3 = settings.BASE_DIR / 'images' / filename
    tmp3 = str(tmp3)
    print(tmp3, '%#$')
    # PDFfileName = 'documents/' + input() + '.pdf'

    # Open the PDF and get the device, interpreter and pages variables.
    device, interpreter, pages = pdfopen(tmp3)
    if device == -1 and interpreter == -1 and pages == -1:
        print("Invalid PDF file.")
        exit()

    # Read the PDF: get the text list and the title, correct the spacing,
    # and return the most frequently used text size.
    (text_list, textfont_list, textmiddle_list, title_num, title_data,
     image_name, image_list, textmiddle_average, textfont_average,
     char_list) = pdfread(device, interpreter, pages, filename)
    title_data = title_return(title_data).strip()
    print(title_data)

    translator = Translator()
    translator_cnt = 0
    print(char_list, '$#@')

    if len(char_list) > 0:
        # print("Summarizing the whole paper according to its format.")
        print_result = "Paper contents\n"
        # Correct the spelling.
        # print("Spell check started!")
        result = char_list.strip().split(".")
        final_result = ""
        translate_result = ""
        for y in range(len(result)):
            # print("Correcting spelling.... " + str(round((y+1) / (len(result)+1) * 100, 2)) + "%")
            if len(result[y]) > 0:
                try:
                    temp = spell_checker.check(result[y] + '.')
                    final_result += temp.as_dict()['checked']
                    print_result += temp.as_dict()['checked'] + "\n"
                    if translator_cnt == 0:
                        if translator.translate(temp.as_dict()['checked']).src == 'ko':
                            translator_cnt = 1
                        elif translator.translate(temp.as_dict()['checked']).src == 'en':
                            translate_result += translator.translate(
                                temp.as_dict()['checked'], dest='ko').text
                except:
                    final_result += result[y] + ". "
                    print_result += result[y] + ".\n"
                    if translator_cnt == 0:
                        if translator.translate(result[y]).src == 'ko':
                            translator_cnt = 1
                        elif translator.translate(result[y]).src == 'en':
                            translate_result += translator.translate(
                                result[y], dest='ko').text
        # print("Spell check finished!")
        # print("")
    else:
        try:
            # Fetch the related information from the KCI site.
            # print("Analyzing the PDF paper...")
            (link_data, title_data_ko, title_data_en, title_data_plus1,
             title_data_plus2, journalInfo1, journalInfo2, journalInfo3,
             name1, name2, content1, content2, content3, content4,
             reference) = crawling_setting(title_data)
            # print("PDF paper analysis finished!")
            # print("")
        except:
            link_data = -1
        text_list = list_return(text_list)
        collect_loc = maxsize_return(text_list, textfont_list)
        # print(collect_loc)

        # Split the columns and merge the lists with the same font size.
        text_list, textfont_list, figure_name, figure_list = pdfsort(
            text_list, textfont_list, textmiddle_list,
            textmiddle_average, textfont_average)
        text_list, textfont_list = pdfgrap(text_list, textfont_list)
        text_list, textfont_list = pdfcutter(
            text_list, textfont_list, title_num, collect_loc)

        # Join all the related text.
        result = ""
        print_result = ""
        for y in range(len(text_list)):
            result += text_list[y] + " "
        if link_data == -1:
            pass
            print("The paper is not registered in KCI, or there was a site access error.")
        else:
            # Add the related information.
print_result = "링크 : " + link_data + "\n\n" print_result += "논문 제목(한글) : " + title_data_ko + "\n\n" print_result += "논문 제목(영어) : " + title_data_en + "\n\n" print_result += "피인용 횟수 : " + str(title_data_plus1) + "\n\n" print_result += "열람 횟수 : " + str(title_data_plus2) + "\n\n" print_result += "학술지 : " + journalInfo1 + "\n\n" print_result += "논문정보 : " + journalInfo2 + "\n\n" print_result += "발행기관 : " + journalInfo3 + "\n\n" print_result += "저자 정보\n" for x in range(len(name1)): print_result += str(x) + " : " + name1[x] + " (" + name2[x] + ")\n" print_result += "\n" print_result += "논문 초록\n" print_result += content1 + "\n\n" print_result += content2 + "\n\n" print_result += "키워드\n" if len(content3) == len(content4): for x in range(len(content3)): print_result += str(x) + " : " + content3[x] + " (" + content4[x] + ")\n" else: for x in range(len(content3)): print_result += str(x) + " : " + content3[x] + "\n" print_result += "\n" if len(reference) > 0: print_result += "참고 문헌\n" for x in range(len(reference)): print_result += reference[x] + "\n" print_result += "\n" # 그림 데이터를 정제한다. figure_image_name = [] figure_image_src = [] if len(figure_name) > 0: max_cnt = max(image_list) max_list = [] count_list = [] for x in range(max_cnt+1): max_list.append(image_list.count(x)) count_list.append(0) if x >= 1: max_list[x] += max_list[x-1] # print(max_list) # print(count_list) for x in range(len(figure_name)): if (count_list[figure_list[x] - 1] + max_list[figure_list[x] - 1]) < max_list[figure_list[x]]: if image_name[(count_list[figure_list[x] - 1] + max_list[figure_list[x] - 1])].count('No Image') == 0: figure_image_name.append(figure_name[x]) figure_image_src.append("images/" + image_name[(count_list[figure_list[x] - 1] + max_list[figure_list[x] - 1])]) count_list[figure_list[x] - 1] += 1 if len(figure_image_name) > 0: print_result += "그림\n" for x in range(len(figure_image_name)): print_result += figure_image_name[x] + " " + figure_image_src[x] + "\n" print_result += "\n" print_result += "논문 내용\n" # 맞춤법을 교정한다. # print("맞춤법 교정 시작!") result = result.strip().split(".") final_result = "" translate_result = "" for y in range(len(result)): # print("맞춤법 교정 중.... " + str(round((y+1) / (len(result)+1) * 100, 2)) + "%") if len(result[y]) > 0: try: temp = spell_checker.check(result[y] + '.') final_result += temp.as_dict()['checked'] print_result += temp.as_dict()['checked'] + "\n" if translator_cnt == 0: if translator.translate(temp.as_dict()['checked']).src == 'ko': translator_cnt = 1 translate_result += temp.as_dict()['checked'] elif translator.translate(temp.as_dict()['checked']).src == 'en': translate_result += translator.translate(temp.as_dict()['checked'], dest='ko').text else: translate_result += temp.as_dict()['checked'] except: final_result += result[y] + ". " print_result += result[y] + ".\n" if translator_cnt == 0: if translator.translate(result[y]).src == 'ko': translator_cnt = 1 translate_result += result[y] + ". " elif translator.translate(result[y]).src == 'en': translate_result += translator.translate(result[y], dest='ko').text else: translate_result += result[y] + ". 
" # print("맞춤법 교정 완료!") # print("") if len(final_result) < 100: print("논문 내용이 뽑히지 않아 다시 진행중...") import tika tika.initVM() from tika import parser parsed = parser.from_file(tmp3) temp = parsed["content"] temp = temp.replace('\n', '') print_result += temp + "\n" final_result = temp result = temp.strip().split(".") final_result = "" translate_result = "" for y in range(len(result)): if translator.translate(result[y]).src == 'ko': translate_result = temp break # print("번역 중.... " + str(round((y+1) / (len(result)+1) * 100, 2)) + "%") translate_result += translator.translate(result[y], dest='ko').text + ". " # print("추출 완료!") # 요약서비스를 이용한다 # print("요약 서비스 시작!") summarize_data = lexlank_function(final_result) summarize_result = "본문 요약 (10줄)\n" for x in range(len(summarize_data)): try: if translator.translate(summarize_data[x]).src == 'en': summarize_result += "원문 : " + summarize_data[x] + "\n" summarize_result += "번역 : " + translator.translate(summarize_data[x], dest='ko').text + "\n\n" else: summarize_result += summarize_data[x] + "\n\n" except: summarize_result += summarize_data[x] + "\n\n" # print("요약 완료!") # print("") # print(translate_result) # print("키워드 추출 시작!") summarize_tags = keywords_function(translate_result) # print(summarize_tags) # visualize_function(PDFpathName, summarize_tags) # print("키워드 추출 완료!") # print("") output_name='res'+filename +'.txt' fileIn = open(settings.BASE_DIR / 'reports/algo/outputs'/ output_name, 'wt', encoding='utf-8') print(print_result, file=fileIn) fileIn.close() output1_name='final_'+filename +'.txt' fileOut = open(settings.BASE_DIR / 'reports/algo/outputs'/ output1_name, 'wt', encoding='utf-8') print(final_result, file=fileOut) fileOut.close() output2_name='summarize_'+filename +'.txt' fileOut = open(settings.BASE_DIR / 'reports/algo/outputs'/ output2_name, 'wt', encoding='utf-8') print(title_data+';^'+summarize_result, file=fileOut) fileOut.close() output3_name='tag_'+filename +'.txt' fileOut = open(settings.BASE_DIR / 'reports/algo/outputs'/ output3_name, 'wt', encoding='utf-8') print(summarize_tags, file=fileOut) fileOut.close() # fileOut = open('outputs/output2_' + filename +'.txt', 'wt', encoding='utf-8') # print(summarize_result, file=fileOut) # fileOut.close() # print(final_result,title_data,summarize_data,summarize_tags) # print( title_data,'#',summarize_data[0]) print("프로그램 완료! 종료하겠습니다.") print("")
def __init__(self):
    tika.initVM()
    nltk.download('punkt')
def pdf(filename):
    tika.initVM()
    file = parser.from_file(filename)
    texto = file['content']
    # Collapse single newlines but keep paragraph breaks.
    texto = texto.replace('\n\n', '¬').replace('\n', '').replace('¬', '\n\n')
    return texto
def get_xml(self, ocr, tikaUrl, path, image_save, save_path, fullname, pdf_save):
    tika.initVM()
    tika.TikaClientOnly = True
    os.environ['no_proxy'] = '*'
    name1 = fullname.split('/')
    name2 = name1[-1].replace('.pdf', '')
    name2 = name2 + "_ocr" + ".pdf"
    name3 = name1[-2] + '/' + name2
    parsed = parser.from_file(path, tikaUrl, xmlContent=True)
    xml2 = parsed["content"]
    xml2 = xml2.split('<div class="page">')
    convert_ocr_path = ""
    xml = ""
    if ocr == "true":
        # if not os.path.exists(save_path + name3):
        pages = convert_from_path(path, 450)
        image_counter = 1
        os.mkdir(image_save)
        os.mkdir(pdf_save)
        for page in pages:
            filename = "page_" + str(image_counter) + ".jpg"
            page.save(image_save + "/" + filename, 'JPEG')
            image_counter = image_counter + 1
        filelimit = image_counter - 1
        pageList = []
        for i in range(1, filelimit + 1):
            filename = "page_" + str(i) + ".jpg"
            filepdfname = "page_" + str(i) + ".pdf"
            pdf = pytesseract.image_to_pdf_or_hocr(
                image_save + "/" + filename, extension='pdf', config='-psm 6')
            with open(pdf_save + "/" + filepdfname, 'a+b') as f:
                f.write(pdf)
        shutil.rmtree(image_save)
        merger = PdfFileMerger()
        path = pdf_save
        for i in range(1, filelimit + 1):
            filepdfname = "page_" + str(i) + ".pdf"
            merger.append(pdf_save + "/" + filepdfname)
        shutil.rmtree(pdf_save)
        merger.write(save_path + name3)
        merger.close()
        parsed = parser.from_file(save_path + name3, tikaUrl, xmlContent=True)
        convert_ocr_path = save_path + name3
        xml = parsed["content"]
    else:
        if xml2[2] == "<p />\n</div>\n":
            if not os.path.exists(save_path + name3):
                pages = convert_from_path(path, 450)
                image_counter = 1
                os.mkdir(image_save)
                os.mkdir(pdf_save)
                for page in pages:
                    filename = "page_" + str(image_counter) + ".jpg"
                    page.save(image_save + '/' + filename, 'JPEG')
                    image_counter = image_counter + 1
                filelimit = image_counter - 1
                pageList = []
                for i in range(1, filelimit + 1):
                    filename = "page_" + str(i) + ".jpg"
                    filepdfname = "page_" + str(i) + ".pdf"
                    pdf = pytesseract.image_to_pdf_or_hocr(
                        image_save + '/' + filename, extension='pdf', config='-psm 6')
                    with open(pdf_save + "/" + filepdfname, 'a+b') as f:
                        f.write(pdf)
                shutil.rmtree(image_save)
                merger = PdfFileMerger()
                path = pdf_save
                for i in range(1, filelimit + 1):
                    filepdfname = "page_" + str(i) + ".pdf"
                    merger.append(pdf_save + "/" + filepdfname)
                shutil.rmtree(pdf_save)
                merger.write(save_path + name3)
                merger.close()
                parsed = parser.from_file(save_path + name3, tikaUrl, xmlContent=True)
                convert_ocr_path = save_path + name3
                xml = parsed["content"]
            else:
                parsed = parser.from_file(save_path + name3, tikaUrl, xmlContent=True)
                xml = parsed["content"]
        else:
            xml = parsed["content"]
    for i in rule['REPLACE']:
        xml = xml.replace(i[0], i[1])
    return xml, convert_ocr_path
# Initialize the tika server
import tika
tika.initVM()
from tika import parser

print("Enter the name of the PDF file to extract text from.")
PDFfileName = 'documents/' + input() + '.pdf'
inputpath = PDFfileName
parsed = parser.from_file(PDFfileName)
temp = parsed["content"]
fileOut = open('output.txt', 'w', encoding='utf-8')
print(temp, file=fileOut)
fileOut.close()
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# $Id$
#
# Author: mattmann
# Description: TBD

import json
import sys
import getopt
import tika
tika.initVM(tika.CLASSPATH)

_verbose = False
_helpMessage = '''
Usage: translate [-v] [-c column headers file] [-i input json file] [-j output json file] [-p cred file] [-f from] [-t to]

Options:

-i input json file, --injson=file
    The input named JSON file.

-j json file, --json=file
    Output the named JSON file.

-c column headers file, --cols=file
    Use the provided column headers to parse the TSV and to name fields in the JSON.

-f from language, --from=2 letter language code
    The 2 letter code of the language to translate from.

-t to language, --to=2 letter language code
def init():
    tika.initVM()
Required packages:
    pip install python-docx
    pip install tika

@author: PAULO.GFERREIRA
"""

"""------ Packages ------"""
import unicodedata, re, os
from docx import Document
from datetime import datetime

# Tika is the package used to import documents in any format
import tika

# On Windows the Java VM must be initialized
if os.name == 'nt':
    tika.initVM()
from tika import parser

"""------ Classes ------"""


# Objects for the changes that were found
class Alteracoes_obj:
    # instancias_criadas = []

    def __init__(self, ind_original=None, ind_novo=None, simi_difflib=None,
                 simi_bow=None, tipo=None):
        self.ind_original = ind_original
        self.ind_novo = ind_novo
def __init__(self):
    tika.initVM()