def file_convert_to_txt_if_necessary(Prg, FileOrigAbsPath, Converted__FileBaseNames__OrigNames):
    BaseNameNoExt, ExtensionLow = util.basename_without_extension__ext(FileOrigAbsPath, ExtensionLower=True)

    if ExtensionLow in ExtensionsConvertable:
        FilePathConvertedToText = util.filename_without_extension(FileOrigAbsPath) + ".txt"
        Converted__FileBaseNames__OrigNames[BaseNameNoExt] = FileOrigAbsPath

        if not os.path.isfile(FilePathConvertedToText):  # convert only if it's necessary
            if ExtensionLow == ".pdf":
                Converter = Prg["ConverterPdfToText"]
            if ExtensionLow == ".htm" or ExtensionLow == ".html":
                Converter = Prg["ConverterHtmlToText"]

            if Converter(Prg, FileOrigAbsPath, FilePathConvertedToText):
                info("Successful conversion to txt: " + FileOrigAbsPath)
            else:
                ConversionErrorMsg = f"Error, file conversion: {FileOrigAbsPath}"
                util.log(Prg, ConversionErrorMsg)
                info(ConversionErrorMsg)
def extract_all_labels(filenames, out_filepath=DATA_FOLDER + 'labels.p', chunk_size=2000):
    print "EXTRACTING ALL LABELS INTO {0}".format(out_filepath)
    all_labels = []
    label_dict = {}

    filenames_chunks = util.chunks(filenames, chunk_size)
    for i, chunk in enumerate(filenames_chunks):
        pool = Pool(processes=util.CPU_COUNT)
        chunk_labels = pool.map(extract_labels, chunk)
        pool.close()

        for filepath, labels in zip(chunk, chunk_labels):
            if labels is not None:
                file_id = util.filename_without_extension(filepath)
                label_dict[file_id] = labels
                all_labels += labels

        print i + 1, '/', len(filenames_chunks)

    # Write labels to file
    with open(out_filepath, 'w') as f:
        pickle.dump(label_dict, f)

    print '\nLabels:'
    print len(set(all_labels))
    print Counter(all_labels)
def frog_process_files(files, verbose=True):
    seen = []
    start_time = time.time()
    frogger = frog.Frog(frog.FrogOptions(parser=False, mwu=False, ner=False, morph=False,
                                         chunking=False, numThreads=8),
                        '/etc/frog/frog.cfg')

    for i, filename in enumerate(files):
        with open(filename, 'r') as in_file:
            output = frogger.process_raw(in_file.read())

        if verbose:
            print('> PROCESSING', filename, str(len(seen)) + '/' + str(len(files)))

        seen.append(filename)

        # Timings (estimation of time remaining)
        runtime = time.time() - start_time
        per_document_time = runtime / len(seen)
        remaining_time = (len(files) - len(seen)) * per_document_time
        total_time = remaining_time + runtime
        print("RUNTIME", duration_to_string(runtime), "(" + duration_to_string(per_document_time) + ")",
              'REMAINING', duration_to_string(remaining_time), 'TOTAL', duration_to_string(total_time))

        frogged_filename = util.filename_without_extension(filename, '.txt')
        with open(OUTPUT_FOLDER + frogged_filename + '.frog.out', 'w') as f:
            f.write(output)
def test_util_filename_extension__without_extension(self):
    if self._test_exec("test_filename_extension__without_extension"):
        Ext = util.filename_extension("file.py")
        self.assertEqual(Ext, ".py")

        Ext = util.filename_extension("noext")
        self.assertEqual(Ext, "")

        FnameOnly = util.filename_without_extension("file.py")
        self.assertEqual(FnameOnly, "file")
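# The test above pins down the behaviour of the two helpers used throughout these
# snippets. The project's own util module is not shown here; what follows is only a
# minimal sketch that satisfies the same assertions, assuming the helpers wrap
# os.path.splitext and that the optional second argument (seen as '.txt' and
# '.frog.out' in other snippets) is a suffix to strip.

import os


def filename_extension(filename):
    # "file.py" -> ".py", "noext" -> ""
    return os.path.splitext(filename)[1]


def filename_without_extension(filename, extension=None):
    # "file.py" -> "file"; a multi-part suffix such as ".frog.out" can be passed
    # explicitly, since splitext alone would only remove ".out"
    if extension is not None and filename.endswith(extension):
        return filename[:-len(extension)]
    return os.path.splitext(filename)[0]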
def test_collect_docs_from_working_dir(self):
    if self._test_exec("test_collect_docs_from_working_dir"):
        Prg = self.Prg

        FileName = "test_file_document_example.txt"
        FilePath = os.path.join(Prg["DirDocuments"], FileName)
        util.file_del(FilePath)
        util.file_write(Prg, Fname=FilePath, Content="example text")

        DocumentsAvailable = document.document_objects_collect_from_dir_documents(Prg)
        self.assertIn(util.filename_without_extension(FileName), DocumentsAvailable)

        util.file_del(FilePath)
def extract_plaintext(filepath, outpath):
    with open(filepath) as fd:
        # Filename without extension
        file_id = util.filename_without_extension(filepath)

        plain_text = []
        try:
            obj = xmltodict.parse(fd.read())
            root = obj['open-rechtspraak']
            metadata = root['rdf:RDF']

            #############
            # Extract content as plain text
            #############
            if 'inhoudsindicatie' in root:
                summary = root['inhoudsindicatie']
                as_plain_text(summary, plain_text)

            if 'uitspraak' in root:
                content = root['uitspraak']
                as_plain_text(content, plain_text)

            if 'conclusie' in root:
                content = root['conclusie']
                as_plain_text(content, plain_text)

            # Write to outfile
            with codecs.open(outpath + file_id + '.txt', 'w', 'utf-8') as f:
                for line in plain_text:
                    print >> f, line
        except KeyError:
            # Skip files with missing keys silently
            return
        except:
            # assumed: any other parsing error is skipped silently as well
            return
def extract_labels(filepath):
    with open(filepath) as fd:
        # Filename without extension
        file_id = util.filename_without_extension(filepath)

        try:
            obj = xmltodict.parse(fd.read())
            root = obj['open-rechtspraak']
            metadata = root['rdf:RDF']

            #############
            # Extract labels
            #############
            description = metadata['rdf:Description']
            if type(description) is list:
                description = description[0]

            law_areas = description['dcterms:subject']
            if type(law_areas) is not list:
                law_areas = [law_areas]

            text_labels = []
            for x in law_areas:
                labels = x['#text'].split('; ')
                text_labels += labels

            return text_labels
        except KeyError:
            # Skip files with missing keys silently
            return
        except:
            # assumed: any other parsing error is skipped silently as well
            return
def filter_and_lemma(chunk_size=2000):
    files = glob.glob(INPUT_FOLDER + '*.frog.out')
    lemmatized = {}

    # Split all files in the list into chunks
    file_chunks = util.chunks(files, chunk_size)
    for i, chunk in enumerate(tqdm(file_chunks)):
        pool = Pool(processes=util.CPU_COUNT)
        filtered_lemmatized = pool.map(process, chunk)
        pool.close()

        for filename, value in zip(chunk, filtered_lemmatized):
            file_id = util.filename_without_extension(filename, '.frog.out')
            lemmatized[file_id] = value

    # Order by key
    ordered = OrderedDict(sorted(lemmatized.items()))
    with open(DATA_FOLDER + 'processed.p', 'w') as f:
        pickle.dump(ordered, f)

    print "Done!"
def doc_objects_delete__file_abspath(Prg, FileAbsPathWithExt):
    BaseName = os.path.basename(FileAbsPathWithExt)
    BaseNameNoExt = util.filename_without_extension(BaseName)
    doc_objects_delete__basename(Prg, BaseNameNoExt)
    util.file_del(FileAbsPathWithExt)  # delete the original file in every case, even if the DocObj doesn't exist