def get_req_from_pdf(fileNameIn, regExp, fileNameOut):
    """Extract requirements from a PDF file into a JSON requirements file.

    The PDF is converted to plain text with ``pdf2txt`` (written to
    ``../work/output.txt``), the text is scanned with ``regExp`` and every
    match is added to a ``pyReq`` object created for ``fileNameOut``.

    :arg fileNameIn: pdf file containing requirements
    :type fileNameIn: string
    :arg regExp: regular expression; each match must expose at least two
        groups, since ``item[0]`` and ``item[1]`` are passed to ``add``
    :type regExp: string
    :arg fileNameOut: Json file
    :type fileNameOut: string
    """
    print("Extract : %s" % fileNameIn)
    # NOTE(review): if pdf2txt.main() skips argv[0] like a command-line entry
    # point, "-A" is consumed as the program name and has no effect -- confirm.
    pdf2txt.main(["-A", "-o", "../work/output.txt", fileNameIn])

    # 1) Read the converted text; the context manager closes the file even
    #    when reading fails.
    with open("../work/output.txt", "r") as fp:
        concatenatedData = fp.read()

    # 2) Apply the regular expression to the whole text at once.
    extract = re.findall(regExp, concatenatedData)

    # 3) Fill the requirements store.
    requirements = pyReq(fileNameOut)
    for item in extract:
        requirements.add(item[0], fileNameIn, item[1])

    # 4) Free resources.
    # NOTE(review): presumably pyReq persists its content when released --
    # confirm against the pyReq implementation.
    del requirements
def get_full_text(self, storage_dir):
    """Run pdf2txt on each attachment of this object.

    Returns the string "No PDF attachments." when the object has no
    ``attachments`` attribute; otherwise returns None after invoking
    ``pdf2txt.main`` once per attachment.
    """
    if not hasattr(self, 'attachments'):
        return "No PDF attachments."

    # TODO: read text from pdf
    for pdf_attachment in self.attachments:
        pdf_path = self.format_filename(pdf_attachment, storage_dir)
        pdf2txt.main(['pdf2txt', pdf_path])
def pdftotextcovert():
    """Convert the PDF named by the global ``name`` to text, then call ``loadfiledata``.

    The text is written by pdf2txt to ``pdftotextconvertedfile.txt`` in the
    current directory and read back here.
    """
    print("***File Path :" + name)
    arr = ['arguments', '-o', 'pdftotextconvertedfile.txt', name]
    pdf2txt.main(arr)
    # Context manager so the handle is released even if reading fails.
    with open("pdftotextconvertedfile.txt", "r") as f:
        data = f.read()
    # Python 2 only: decode the bytes, dropping undecodable ones.
    # NOTE(review): ``data`` is local and not passed on; loadfiledata()
    # presumably re-reads the file or uses globals -- confirm.
    data = unicode(data, errors='ignore')
    loadfiledata()
def exec_pdf2txt(pdf_file):
    """Convert *pdf_file* to plain text with pdf2txt and return the text path.

    Equivalent command line:
        pdf2txt.py -t text -o 2900510720726.txt 2900510720726.pdf

    The output path is the input path with its extension replaced by
    ``.txt``.  Only the final extension is replaced, so a ``.pdf`` elsewhere
    in the path is left alone and an input such as ``X.PDF`` no longer maps
    onto itself (which would make pdf2txt overwrite its own input).
    """
    import os
    import pdf2txt

    txt_file = os.path.splitext(pdf_file)[0] + '.txt'
    args = ['pdf2txt.py', '-t', 'text', '-o', txt_file, pdf_file]
    pdf2txt.main(args)
    return txt_file
def exec_pdf2txt(pdf_file):
    """Run pdf2txt in text mode on *pdf_file*; return the generated text path.

    Equivalent command line:
        pdf2txt.py -t text -o 2900510720726.txt 2900510720726.pdf

    The text path is derived by swapping only the trailing extension for
    ``.txt``.  A blanket ``str.replace('.pdf', '.txt')`` would also rewrite
    directory names containing ``.pdf`` and would return the input path
    unchanged for extensions such as ``.PDF``, letting the conversion
    overwrite the source file.
    """
    import os
    import pdf2txt

    stem, _extension = os.path.splitext(pdf_file)
    txt_file = stem + '.txt'
    pdf2txt.main(['pdf2txt.py', '-t', 'text', '-o', txt_file, pdf_file])
    return txt_file
def transforme_pdf_en_txt(fichier_PDF, REPERTOIRE_TXT):
    """Copy the contracts section of a PDF agenda into a text file.

    The PDF is converted page by page with pdf2txt into a temporary file.
    Lines are copied to ``<REPERTOIRE_TXT>/<pdf stem>.txt`` from the line
    containing TITRE_SECTION_20 up to (not including) the line containing
    TITRE_SECTION_30.  On each page, copying stops at the first line that
    starts with "['Page " (page-number footer).

    :param fichier_PDF: path of the PDF file to process
    :param REPERTOIRE_TXT: directory receiving the text file
    """
    # Title of the section that holds the contracts.
    TITRE_SECTION_20 = " Affaires contractuelles"
    # Title of the section following the contracts section.
    TITRE_SECTION_30 = " Administration et finances"

    prefixe_txt = os.path.splitext(os.path.basename(fichier_PDF))[0]
    fichier_TXT_temp = os.path.join(REPERTOIRE_TXT, prefixe_txt + '_temp.txt')
    fichier_TXT = os.path.join(REPERTOIRE_TXT, prefixe_txt + '.txt')

    est_dans_section_affaires_contractuelles = False
    est_dans_section_suivante = False
    compteur_page = 0

    # The context manager guarantees the output is flushed and closed even
    # if a page conversion raises.
    with open(fichier_TXT, "w") as odj_traites:
        # NOTE(review): the loop only ends once TITRE_SECTION_30 is seen; a
        # PDF without that title would keep requesting pages -- confirm how
        # pdf2txt behaves past the last page.
        while not est_dans_section_suivante:
            compteur_page += 1
            print("Traitement de la page %s" % compteur_page)
            args = [
                'pdf2txt',
                '-p', str(compteur_page),
                '-o', fichier_TXT_temp,
                fichier_PDF,
            ]
            pdf2txt.main(args)

            with open(fichier_TXT_temp, 'r') as f:
                for ligne in f:
                    if TITRE_SECTION_20 in ligne:
                        est_dans_section_affaires_contractuelles = True
                    if TITRE_SECTION_30 in ligne:
                        est_dans_section_suivante = True
                        break
                    elif est_dans_section_affaires_contractuelles:
                        if ligne.startswith("['Page "):
                            # Do not write the footer page number.
                            break
                        # Append the line to fichier_TXT.
                        odj_traites.write(ligne)

    os.remove(fichier_TXT_temp)
def convertPDF2TXT_thread(fullpath, lock):
    """Convert one PDF to normalised text inside the txt_fmt directory.

    The raw pdf2txt output is reduced to letters, dots and single spaces,
    then stored as ``txt_fmt/pdf<name>.txt``; the raw file is deleted.

    :param fullpath: path of the PDF to convert
    :param lock: context-manager lock guarding the exit message
    :return: 0 -- conversion succeeded and a new txt file was created
             1 -- conversion failed
             2 -- already done and nothing has been changed
    """
    out_dir = "D:/EclipseWorkspace/TextbasedSixDegree/txt_fmt/"
    base = os.path.basename(fullpath)
    raw_path = out_dir + base[:-3] + "txt"
    fmt_path = out_dir + base[:-4] + "_fmt.txt"
    final_path = out_dir + "pdf" + base[:-3] + "txt"
    _argv = [
        "D:/EclipseWorkspace/TextbasedSixDegree/pdf2txt.py",
        "-o", raw_path,
        fullpath,
    ]

    def _announce_exit():
        with lock:
            print("process exits with id: %d " % os.getpid())

    # NOTE(review): a successful run removes raw_path and leaves final_path,
    # so this "already done" test looks at a file that normally no longer
    # exists -- confirm whether final_path should be checked instead.
    if os.path.isfile(raw_path):
        _announce_exit()
        return 2

    try:
        pdf2txt.main(_argv)
    except Exception as exc:
        # Exception (not a bare except) so KeyboardInterrupt/SystemExit
        # still propagate.
        print("PDF 2 TXT conversion failed. Info:")
        print(exc)
        _announce_exit()
        return 1

    # format txt: drop form feeds, flatten newlines, squeeze runs of spaces,
    # then keep only letters, dots and spaces.
    with open(raw_path, 'r') as txt:
        data = txt.read()
    data = data.replace("\f", '').replace('\n', ' ')
    data = re.sub(" [ ]+", " ", data)
    data = re.sub("[^a-zA-Z. ]+", "", data)
    with open(fmt_path, 'w') as txt_fmt:
        txt_fmt.write(data)

    os.remove(raw_path)
    os.rename(fmt_path, final_path)
    _announce_exit()
    return 0
def pdftotextcovert():
    """Convert the PDF named by the global ``name`` and show its lemmatised words.

    The converted text is split into words, each word is passed through
    ``lemmatizer.lemmatize``, the result is shown on ``lbl1`` and handed to
    ``nounphrases``.
    """
    print("***File Path :" + name)
    arr = ['arguments', '-o', 'pdftotextconvertedfile.txt', name]
    pdf2txt.main(arr)
    # Context manager so the handle is always released.
    with open("pdftotextconvertedfile.txt", "r") as f:
        data = f.read()
    # Python 2 only: decode the bytes, dropping undecodable ones.
    data = unicode(data, errors='ignore')

    # Every non-word character becomes a separator.
    wordList = re.sub(r"[^\w]", " ", data).split()
    # One join instead of repeated concatenation; each word keeps its
    # leading space, so the result still starts with " ".
    testdata = "".join(" " + lemmatizer.lemmatize(word) for word in wordList)

    lbl1.configure(text="File Data :" + testdata)
    nounphrases(testdata)
def dump_file(self, tag="text", outfile=None):
    """Dump ``self.filename`` through pdf2txt into a file under ``self.location``.

    :param tag: value passed to pdf2txt's ``-t`` option
        (xml, tag, text or html)
    :param outfile: output file name relative to ``self.location``;
        defaults to ``tmp.txt``

    Prints "Missing file!" when pdf2txt raises IOError.
    """
    # The original assigned the prefixed path to a misspelled variable
    # (``outfil``), so a caller-supplied name was never placed under
    # self.location.
    outfile = self.location + "/" + (outfile or "tmp.txt")
    cmd = ["spacer", "-t", tag, "-o", outfile, self.filename]
    try:
        pdf2txt.main(cmd)
    except IOError:
        print("Missing file!")
def pdf_to_text(file_path):
    """Return the text for *file_path*, converting the PDF only when needed.

    The text file sits next to the PDF (same name, ``.txt`` extension).
    When it already exists it is read as is; otherwise pdf2txt creates it
    first.  The absolute path of the text file is printed in both cases.
    """
    import pdf2txt

    txt_path = os.path.splitext(file_path)[0] + '.txt'
    print(os.path.abspath(txt_path))

    if not os.path.exists(txt_path):
        abs_txt = os.path.abspath(txt_path)
        abs_pdf = os.path.abspath(file_path)
        pdf2txt.main(argv=['pdf2txt', '-o', abs_txt, abs_pdf])
        return text_from_txt_file(abs_txt)

    return text_from_txt_file(txt_path)
def convertirLibros(parametros):
    """Call pdf2txt.main with the paths held by *parametros* and relay its result.

    The argument list is an empty first entry followed by
    ``parametros.ruta_general`` and ``parametros.ruta_base_txts``.

    :return: the two values returned by ``pdf2txt.main``
    """
    argumentos = ["", parametros.ruta_general, parametros.ruta_base_txts]
    resultado, pendientes = pdf2txt.main(argumentos)
    return resultado, pendientes
def read_pdfs_directory():
    """Append the text of every PDF in ``pdfs/`` to ``datafile.txt``.

    Each PDF is converted by pdf2txt into ``individualfile.txt`` (overwritten
    for every PDF) and that text is appended to ``datafile.txt``.
    ``clean_up()`` is called before processing.
    """
    print("*****PROCESS STARTED*****\n")
    print("Cleaning directory")
    clean_up()
    list_of_pdf_files = glob.glob('pdfs/*.pdf')
    print("List of files that are going to be parsed:")
    print(list_of_pdf_files)
    with open("datafile.txt", "a") as myfile:
        for individual in list_of_pdf_files:
            print("Reading: {}".format(individual))
            pdf2txt.main(['', '-o', 'individualfile.txt', '-t', 'text', individual])
            # Close the intermediate file after each PDF instead of leaking
            # one handle per iteration.
            with open('individualfile.txt', 'r') as individual_file:
                myfile.write(individual_file.read())
            print("Finished reading: {}".format(individual))
        print("Completed reading PDF files")
    print("Created datafile.txt from PDFs")
def convertPDF(fullpath):
    """Read a PDF and convert it to plain text in the txt_ori directory.

    :param fullpath: path of the PDF to convert
    :return: 0 -- conversion succeeded and a new txt file was created
             1 -- conversion failed
             2 -- already done and nothing has been changed
    """
    txt_path = ("D:/EclipseWorkspace/TextbasedSixDegree/txt_ori/"
                + os.path.basename(fullpath)[:-3] + "txt")
    _argv = [
        "D:/EclipseWorkspace/TextbasedSixDegree/pdf2txt.py",
        "-o", txt_path,
        fullpath,
    ]

    if os.path.isfile(txt_path):
        return 2

    try:
        pdf2txt.main(_argv)
    except Exception as exc:
        # Exception (not a bare except) so KeyboardInterrupt/SystemExit
        # still propagate.
        print("PDF 2 TXT conversion failed. Info:")
        print(exc)
        return 1
    return 0
def convertPDF2TXT_thread(fullpath, lock):
    """Convert a PDF to cleaned text in txt_fmt and report the outcome.

    After pdf2txt has produced the raw text, form feeds are removed,
    newlines become spaces, runs of spaces are squeezed and every character
    other than letters, dots and spaces is dropped.  The cleaned text ends
    up in ``txt_fmt/pdf<name>.txt`` and the raw file is removed.

    :param fullpath: path of the PDF to convert
    :param lock: context-manager lock held while printing the exit message
    :return: 0 -- conversion succeeded and a new txt file was created
             1 -- conversion failed
             2 -- already done and nothing has been changed
    """
    txt_dir = "D:/EclipseWorkspace/TextbasedSixDegree/txt_fmt/"
    pdf_name = os.path.basename(fullpath)
    raw_txt = txt_dir + pdf_name[:-3] + "txt"
    cleaned_txt = txt_dir + pdf_name[:-4] + "_fmt.txt"
    renamed_txt = txt_dir + "pdf" + pdf_name[:-3] + "txt"
    _argv = ["D:/EclipseWorkspace/TextbasedSixDegree/pdf2txt.py",
             "-o", raw_txt, fullpath]

    def _report_exit(code):
        with lock:
            print("process exits with id: %d " % os.getpid())
        return code

    # NOTE(review): raw_txt is deleted after every successful run, so this
    # check rarely matches -- confirm whether renamed_txt was intended.
    if os.path.isfile(raw_txt):
        return _report_exit(2)

    try:
        pdf2txt.main(_argv)
    except Exception as exc:
        # Narrower than a bare except: interpreter-exit signals pass through.
        print("PDF 2 TXT conversion failed. Info:")
        print(exc)
        return _report_exit(1)

    # format txt
    with open(raw_txt, 'r') as txt:
        data = txt.read()
    data = data.replace("\f", '')
    data = data.replace('\n', ' ')
    data = re.sub(" [ ]+", " ", data)
    data = re.sub("[^a-zA-Z. ]+", "", data)
    with open(cleaned_txt, 'w') as txt_fmt:
        txt_fmt.write(data)

    os.remove(raw_txt)
    os.rename(cleaned_txt, renamed_txt)
    return _report_exit(0)
import pdf2txt from urllib import urlretrieve import collections headers = [ "Starting Serial Num", "Ending Serial Num", "Male", "Female", "Others", "Total" ] finaldata = collections.OrderedDict() for item in range(1580001, 1580213): url = "http://ceokarnataka.kar.nic.in/FinalRoll2013_Final/English/WOIMG/AC158/AC" + str( item) + ".pdf" filename = str(item) + ".pdf" urlretrieve(url, filename) pdf2txt.main(["", "-M 40", "-p 1", "-oa.txt", filename]) infile = open("a.txt").readlines() output = collections.OrderedDict() for line in infile: if line.strip().startswith("Part"): for el in line.strip().split(): if el.isdigit(): part = el if line.startswith("Male Female"): for i in xrange(len(infile[infile.index(line) + 1].strip().split())): output[headers[i]] = infile[infile.index(line) +
import json
import pdf2txt
from urllib import urlretrieve
import collections

# Download the electoral-roll PDFs AC1580001..AC1580212, convert page 1 of
# each to text and collect the figures found under the "Male Female" header.

# Column names, assigned in order to the values of the row that follows the
# "Male Female" header line.
headers = ["Starting Serial Num", "Ending Serial Num", "Male", "Female", "Others", "Total"]
finaldata = collections.OrderedDict()
for item in range(1580001, 1580213):
    url = "http://ceokarnataka.kar.nic.in/FinalRoll2013_Final/English/WOIMG/AC158/AC" + str(item) + ".pdf"
    filename = str(item) + ".pdf"
    urlretrieve(url, filename)
    pdf2txt.main(["", "-M 40", "-p 1", "-oa.txt", filename])
    # Close the converted file once its lines are loaded.
    with open("a.txt") as converted:
        infile = converted.readlines()
    output = collections.OrderedDict()
    # enumerate gives the position of the current line directly; looking it
    # up with list.index rescans the list and returns the first duplicate.
    for position, line in enumerate(infile):
        if line.strip().startswith("Part"):
            for el in line.strip().split():
                if el.isdigit():
                    # NOTE(review): ``part`` is not used below.
                    part = el
        if line.startswith("Male Female") and position + 1 < len(infile):
            values = infile[position + 1].strip().split()
            # zip stops at the shorter sequence, so extra columns cannot
            # overrun ``headers``.
            for header, value in zip(headers, values):
                output[header] = value
    finaldata["158" + str(item)[-3:]] = output
    print("%s complete" % item)
import requests
import sys
sys.path.append('/mnt/brick1/justin/nejm/pdfminer-20140328/tools')
import pdf2txt
import logging

# Downloads each PDF into temp.pdf, then converts temp.pdf to a text file.
# sys.argv[1] is a file whose lines hold a link path in the first column and
# a label in the second; the text file is named
# "<last path segment>_<label>.txt".
logging.basicConfig(filename='save_pdfs.log', level=logging.INFO)
base = 'http://www.nejm.org'
# The context manager closes the links file when the loop ends or fails.
with open(sys.argv[1], 'r') as flinks:
    for line in flinks:
        logging.info(line)
        tokens = line.rstrip().split()
        if len(tokens) < 2:
            # Blank or one-column lines would otherwise raise IndexError.
            logging.warning("skipping malformed line: %r", line)
            continue
        linkname = tokens[0]
        response = requests.get(base + linkname)
        with open('temp.pdf', 'wb') as f:
            f.write(response.content)
        linkpathsplit = linkname.split('/')
        fname = linkpathsplit[-1] + '_' + tokens[1] + '.txt'
        pdf2txt.main(['wer', '-o', fname, 'temp.pdf'])
alchemyapi = AlchemyAPI()

# ---- Convert pdf to txt (doesn't have to be run if no new pdfs are added) ---
# Collect the file names found at the top level of pdf_path only.
pdf_name = []
for (dirpath, dirnames, filenames) in walk(pdf_path):
    pdf_name += filenames
    break

# Convert every entry whose name ends in "pdf" to a txt file of the same stem.
for f in pdf_name:
    if not f.endswith("pdf"):
        continue
    full_pdf_path = pdf_path + f
    full_text_path = out_txt_path + f[:-3] + "txt"
    pdf2txt.main(full_pdf_path, k, full_text_path)
# ---------------------------------------------------------------------------

# Collect the file names found at the top level of out_txt_path only.
txt_name = []
for (dirpath, dirnames, filenames) in walk(out_txt_path):
    txt_name += filenames
    break

json_data = {}
entity_list = []
keywords_list = []
concept_list = []
for f in txt_name:
    if f.endswith("txt"):
        full_text_path = out_txt_path + f
def pdf_to_text(pdf, text):
    """Call pdf2txt.main with *text* as the ``-o`` value and *pdf* as input.

    :param pdf: path of the source PDF
    :param text: path given to pdf2txt's ``-o`` option
    """
    pdf2txt.main(["", "-o", text, pdf])
# One pass per entry of file_list; ``excel`` starts as a table of
# ``lunghezza`` rows holding 7 zeros each.
lunghezza = len(file_list)
excel = [[0] * 7 for _ in range(lunghezza)]
for f in range(lunghezza):
    print(f)
    reportpdf = file_list[f]
    reporttxt = my_dir + '\\temp.txt'
    pdf2txt.main(['', '-o', reporttxt, reportpdf])
    # Load the converted report, one list entry per line of text.
    with open(reporttxt, 'r') as testo:
        data_raw = testo.readlines()
    ############## Data ##############
def scrape_and_parse(pdf_file_name, text_file_name):
    """Convert *pdf_file_name* to text, then parse the result.

    :param pdf_file_name: path of the PDF to convert
    :param text_file_name: path of the text file written by pdf2txt and then
        handed to ``sow_parsing_ff``
    """
    # pdf2txt.main() takes a command-line style argv: the first entry is the
    # program name and the input file follows the options.  Passing the PDF
    # first put it in the program-name slot, leaving no input file.
    pdf2txt.main(["pdf2txt", "-o", text_file_name, pdf_file_name])
    sow_parsing_ff(text_file_name)
def main():
    """Extract the contracts section of every PDF agenda into a text file.

    Each file of REPERTOIRE_PDF is converted page by page with pdf2txt.
    Rows are copied to a same-named ``.txt`` file in REPERTOIRE_TXT starting
    with the row after the one containing TITRE_SECTION_20 and ending before
    the row containing TITRE_SECTION_30.  On each page, copying stops at the
    first row whose text form starts with "['Page " (page-number footer).
    """
    print("")
    print("Debut du traitement")
    print("")

    # Directory where the PDF files are stored.
    REPERTOIRE_PDF = "C:\\ContratsOuvertsMtl\\Ordres_du_jour\\PDF"
    # Directory where the resulting text files are saved.
    REPERTOIRE_TXT = "C:\\ContratsOuvertsMtl\\Ordres_du_jour\\TXT"
    # Title of the section that holds the contracts.
    TITRE_SECTION_20 = " Affaires contractuelles"
    # Title of the section following the contracts section.
    TITRE_SECTION_30 = " Administration et finances"

    # Go through the PDF files.
    for filename in os.listdir(REPERTOIRE_PDF):
        fichier_PDF = REPERTOIRE_PDF + "\\" + filename
        fichier_TXT = REPERTOIRE_TXT + "\\" + filename.replace("pdf", "txt")

        # Per-file state.  These were initialised once before the loop, so
        # after the first PDF ``continuer`` stayed False and every later
        # file was left empty.
        est_dans_section_affaires_contractuelle = False
        continuer = True
        compteur_page = 0

        # Output file for this PDF; closed even if a page conversion fails.
        with open(fichier_TXT, "w") as odj_traites:
            fodj_traites = csv.writer(odj_traites, delimiter=';')

            # Go through the pages of the PDF file.
            # NOTE(review): the loop only ends once TITRE_SECTION_30 is
            # seen -- confirm the behaviour for PDFs lacking that title.
            while continuer:
                compteur_page = compteur_page + 1
                # Progress indicator.
                print("Traitement de la page %s" % compteur_page)
                fichier_page = ('C:\\ContratsOuvertsMtl\\Ordres_du_jour\\TXT\\page_'
                                + str(compteur_page) + '.txt')
                arg = ["", '-p', str(compteur_page), '-o', fichier_page, fichier_PDF]
                # Convert the PDF page to text.
                pdf2txt.main(arg)

                with open(fichier_page, "r") as f:
                    reader = csv.reader(f, delimiter="|")
                    for ligne in reader:
                        texte_ligne = str(ligne)
                        if TITRE_SECTION_20 in texte_ligne:
                            # Reached the contracts section.
                            est_dans_section_affaires_contractuelle = True
                        if TITRE_SECTION_30 in texte_ligne:
                            # Contracts section finished.
                            continuer = False
                            break
                        elif est_dans_section_affaires_contractuelle:
                            # NOTE(review): replaces left(str(ligne), 7),
                            # assumed to return the first 7 characters.
                            if texte_ligne.startswith("['Page "):
                                # Do not write the footer page number.
                                break
                            # Append the row to fichier_TXT.
                            fodj_traites.writerow(ligne)

    print("")
    print("Fin du traitement")
# Process every entry of file_list; ``excel`` is pre-filled with one row of
# 7 zeros per entry.
lunghezza = len(file_list)
excel = [[0] * 7 for _ in range(lunghezza)]
for f in range(lunghezza):
    print(f)
    reportpdf = file_list[f]
    reporttxt = my_dir + '\\temp.txt'
    pdf2txt.main(['', '-o', reporttxt, reportpdf])
    # Read the converted text back as a list of lines.
    with open(reporttxt, 'r') as testo:
        data_raw = testo.readlines()
import json
import pdf2txt
from urllib import urlretrieve
import collections

# For every roll number in 1580001..1580212: download the PDF, convert its
# first page to a.txt and record the row printed under "Male Female".

# Labels applied, in order, to the values of the row after the header line.
headers = ["Starting Serial Num", "Ending Serial Num", "Male", "Female", "Others", "Total"]
finaldata = collections.OrderedDict()
for item in range(1580001, 1580213):
    url = "http://ceokarnataka.kar.nic.in/FinalRoll2013_Final/English/WOIMG/AC158/AC" + str(item) + ".pdf"
    filename = str(item) + ".pdf"
    urlretrieve(url, filename)
    pdf2txt.main(["", "-M 40", "-p 1", "-oa.txt", filename])
    # Read inside a context manager so a.txt is closed before the next
    # conversion overwrites it.
    with open("a.txt") as page_text:
        infile = page_text.readlines()
    output = collections.OrderedDict()
    # Track the line number while iterating: list.index(line) would return
    # the first equal line, not necessarily the current one.
    for line_no, line in enumerate(infile):
        stripped = line.strip()
        if stripped.startswith("Part"):
            for el in stripped.split():
                if el.isdigit():
                    # NOTE(review): ``part`` is not used below.
                    part = el
        if line.startswith("Male Female") and line_no + 1 < len(infile):
            figures = infile[line_no + 1].strip().split()
            # zip ignores columns beyond the six known headers.
            for label, figure in zip(headers, figures):
                output[label] = figure
    finaldata["158" + str(item)[-3:]] = output
    print("%s complete" % item)
import json import pdf2txt import collections datalist = [] pdf2txt.main(["", "-M 40", "-ooutput.txt", "voterdata.pdf"]) finaldata = {} finaldata["assemblywise"] = [] finaldata["total"] = [] infile = open('output.txt').readlines() sanitized = [] for line in infile: if line.strip() and line.strip().split()[0].isdigit() and len(line.strip().split()) > 4: sanitized.append(line.strip()) for i in xrange(len(sanitized)): if sanitized[i].strip() and sanitized[i].strip().split()[0].isdigit(): fullstring = ','.join(w for w in sanitized[i].strip().split() if w) pointer = 0 t = collections.OrderedDict() pc_no = fullstring.split(',')[pointer] pointer += 1 t["pc_no"] = int(pc_no) pc_name = "" if not fullstring.split(',')[pointer].isdigit(): pc_name += fullstring.split(',')[pointer] pointer += 1