def _is_categorie_folder(self, path):
    """Tell whether ``path`` holds no loadable sub-directory.

    Returns ``False`` as soon as one entry of ``path`` is a directory that
    ``variables.noloaddir`` does not reject, and ``True`` otherwise (which
    includes an empty directory or one holding only files).
    """
    children = (
        os.path.join(path, entry) for entry in sorted(os.listdir(path))
    )
    # ``noloaddir`` receives the joined path and is only consulted for
    # entries that are directories.
    return not any(
        os.path.isdir(child) and not variables.noloaddir(child)
        for child in children
    )
def add_folder_structure(self, fpath, offset, parent):
    """Record the sub-folder hierarchy of ``fpath`` and index its text files.

    Each sub-directory whose name is not rejected by
    ``variables.noloaddir`` gets an entry
    ``self.folder_structure[name] = [label, offset]``, is appended to
    ``self.folder_order`` and is then scanned recursively with
    ``offset + 1``.  When that recursive scan returns a non-empty list it
    is appended to the entry as a third item.

    Entries are visited in sorted order so that ``self.file_index`` values
    and ``self.folder_order`` do not depend on the arbitrary order in which
    ``os.listdir`` returns names.

    Args:
        fpath: Directory to scan.
        offset: Value stored with each folder entry and passed to
            ``self.get_categorie_label``; incremented for nested folders.
        parent: Label of the enclosing folder, or ``None``.

    Returns:
        A list of ``[file_name, file_index]`` pairs, one per ``.txt`` file
        found directly inside ``fpath``.
    """
    indexed_files = []
    for name in sorted(os.listdir(fpath)):
        if variables.noloaddir(name):
            continue
        path = os.path.join(fpath, name)
        if os.path.isdir(path):
            label = self.get_categorie_label(name, offset)
            # A folder that has both a parent label and a label of its own
            # is stored under the combined "<parent>;<name>" label.
            if parent is not None and label is not None:
                label = parent + ";" + name
            self.folder_structure[name] = [label, offset]
            self.folder_order.append(name)
            nested_files = self.add_folder_structure(path, offset + 1, label)
            if nested_files:
                self.folder_structure[name].append(nested_files)
        elif name.endswith('.txt'):
            indexed_files.append([name, self.file_index])
            self.file_index += 1
    return indexed_files
def _create_folder_clauses(self, fpath):
    """Walk ``fpath`` recursively, calling ``self._create_clauses`` on ``.txt`` files.

    Sub-directories whose name is rejected by ``variables.noloaddir`` are
    not descended into; files with another extension are ignored.
    """
    for entry in os.listdir(fpath):
        target = os.path.join(fpath, entry)
        if not os.path.isdir(target):
            if entry.endswith(".txt"):
                self._create_clauses(target)
            continue
        if not variables.noloaddir(entry):
            self._create_folder_clauses(target)
def create_ner(self, fpath):
    """Process the first level of ``fpath`` in sorted order.

    Entries rejected by ``variables.noloaddir`` are skipped.  Each remaining
    directory is announced on stdout and handed to
    ``self.create_ner_in_directory``; each remaining ``.txt`` file is passed
    to ``self.file_named_tag``.
    """
    # Only the first level is handled here; directories are delegated.
    for entry in sorted(os.listdir(fpath)):
        if variables.noloaddir(entry):
            continue
        target = os.path.join(fpath, entry)
        if not os.path.isdir(target):
            if target.endswith('.txt'):
                self.file_named_tag(target)
            continue
        print("training => " + entry)
        self.create_ner_in_directory(target)
def create_subset_model(self, path, categorie):
    """Run ``self.create_categorie_tagging`` on ``path`` or on its sub-folders.

    When ``self._is_categorie_folder(path)`` is true, ``path`` itself is
    tagged under ``categorie``.  Otherwise each sub-directory of ``path``
    not rejected by ``variables.noloaddir`` is tagged under its own name.
    """
    if self._is_categorie_folder(path):
        self.create_categorie_tagging(path, categorie)
        return

    for entry in sorted(os.listdir(path)):
        child = os.path.join(path, entry)
        # ``noloaddir`` is given the joined path, and only for directories.
        if os.path.isdir(child) and not variables.noloaddir(child):
            self.create_categorie_tagging(child, entry)
def update_corpus_folder(self, path, outfile, name):
    """Mirror the directory ``path`` into ``outfile``, updating each file.

    Nothing happens when ``variables.noloaddir(path)`` is true.  Otherwise
    ``outfile`` is created if it does not exist (including any missing
    parent directories), sub-directories are processed recursively and
    every other entry is passed to ``self.update_corpus_text_file``.

    Args:
        path: Input directory.
        outfile: Output directory that receives the mirrored entries.
        name: Name of the current entry; not used by this method itself.
    """
    if variables.noloaddir(path):
        return

    # makedirs rather than mkdir: the first call may target an output
    # directory whose parent has not been created yet.
    if not os.path.exists(outfile):
        os.makedirs(outfile)

    for fname in sorted(os.listdir(path)):
        ipath = os.path.join(path, fname)
        ofile = os.path.join(outfile, fname)
        if os.path.isdir(ipath):
            self.update_corpus_folder(ipath, ofile, fname)
        else:
            self.update_corpus_text_file(ipath, ofile, fname)
def load_corpus_directories(self, path, name, level, encoding='utf-8'):
    """Load every file below ``path``, descending into sub-directories.

    Non-directory entries are passed to ``self.load_corpus_text_file``
    together with the name and level of the directory that contains them.
    Sub-directories whose name is rejected by ``variables.noloaddir`` are
    skipped; the others are loaded recursively with ``level + 1``.

    Args:
        path: Directory to load.
        name: Name passed along with each file found directly in ``path``.
        level: Depth value passed along with each file; incremented for
            nested directories.
        encoding: Forwarded unchanged to the recursive calls so that
            nested directories see the same value as the top-level call.
            It is not passed to ``self.load_corpus_text_file`` here.
    """
    for fname in sorted(os.listdir(path)):
        fpath = os.path.join(path, fname)
        if not os.path.isdir(fpath):
            self.load_corpus_text_file(path, fname, name, level)
        elif not variables.noloaddir(fname):
            self.load_corpus_directories(fpath, fname, level + 1, encoding)
def create_ner_in_directory(self, path):
    """Recursively pass every ``.txt`` file below ``path`` to ``self.file_named_tag``.

    Entries are visited in sorted order.  Any entry (file or directory)
    rejected by ``variables.noloaddir`` is skipped; each visited directory
    is announced on stdout before being descended into.
    """
    for entry in sorted(os.listdir(path)):
        target = os.path.join(path, entry)
        if variables.noloaddir(entry):
            continue
        if not os.path.isdir(target):
            if entry.endswith('.txt'):
                self.file_named_tag(target)
            continue
        print("training => " + entry)
        self.create_ner_in_directory(target)
def folder_tagging(self, path, categorie, output_data):
    """Tag every file below ``path`` under ``categorie``.

    ``self.tagging.set_categorie`` is called first, then the entries of
    ``path`` are visited in sorted order: files go to
    ``self.tagging.tagging`` together with ``output_data``, and directories
    not rejected by ``variables.noloaddir`` are processed recursively with
    the same ``categorie``.
    """
    self.tagging.set_categorie(categorie)
    for entry in sorted(os.listdir(path)):
        child = os.path.join(path, entry)
        if not os.path.isdir(child):
            self.tagging.tagging(child, output_data)
        elif not variables.noloaddir(child):
            self.folder_tagging(child, categorie, output_data)
def create_crf_model(self, path=None, subset=True):
    """Build a model for each loadable sub-directory of ``path``.

    Args:
        path: Directory to scan.  When ``None``, the ``TEXT`` folder under
            ``config.TEMPLATE_DIR`` is used.
        subset: When true each sub-directory is handed to
            ``self.create_subset_model``; otherwise it goes directly to
            ``self.create_categorie_tagging``.

    Entries rejected by ``variables.noloaddir`` and entries that are not
    directories are ignored.
    """
    if path is None:
        path = config.TEMPLATE_DIR + "/TEXT"

    for fname in sorted(os.listdir(path)):
        fpath = os.path.join(path, fname)
        # ``noloaddir`` is evaluated first, on the joined path.
        if variables.noloaddir(fpath) or not os.path.isdir(fpath):
            continue
        if subset:
            self.create_subset_model(fpath, fname)
        else:
            self.create_categorie_tagging(fpath, fname)
def create_corpus_documents(self, text_path):
    """Load the corpus rooted at ``text_path``.

    ``self.folder_structure`` is first set from
    ``self.load_folder_structure(text_path)``.  The top-level entries are
    then visited in sorted order: files go to
    ``self.load_corpus_text_file`` with level 1 and their own name as the
    folder name, and directories not rejected by ``variables.noloaddir``
    go to ``self.load_corpus_directories`` with level 1.
    """
    self.folder_structure = self.load_folder_structure(text_path)
    for entry in sorted(os.listdir(text_path)):
        child = os.path.join(text_path, entry)
        if not os.path.isdir(child):
            self.load_corpus_text_file(text_path, entry, entry, 1)
        elif not variables.noloaddir(entry):
            self.load_corpus_directories(child, entry, 1)
def load_directory(self, path):
    """Recursively pass every ``.txt`` file below ``path`` to ``self.load_file``.

    Sub-directories whose name is rejected by ``variables.noloaddir`` are
    not descended into; files with another extension are ignored.  The
    value returned by ``self.load_file`` is discarded.
    """
    for entry in os.listdir(path):
        target = os.path.join(path, entry)
        if not os.path.isdir(target):
            if entry.endswith(".txt"):
                self.load_file(target)
            continue
        if not variables.noloaddir(entry):
            self.load_directory(target)
def load_directory(self, path):
    """Load all ``.txt`` files found under ``path``, in sorted order.

    Directories are descended into unless ``variables.noloaddir`` rejects
    their name; each ``.txt`` file is given to ``self.load_file`` and any
    other file is skipped.
    """
    for entry in sorted(os.listdir(path)):
        target = os.path.join(path, entry)
        is_folder = os.path.isdir(target)
        if is_folder and not variables.noloaddir(entry):
            self.load_directory(target)
        elif not is_folder and entry.endswith(".txt"):
            self.load_file(target)
        # Anything else is a skipped folder or a non-text file.
def files_to_text(self, fpath, ofile):
    """Convert a file, or every file of a directory tree, to text.

    Args:
        fpath: Input file or directory.
        ofile: Output text file (when ``fpath`` is a file) or output
            directory (when ``fpath`` is a directory).

    For a directory, ``ofile`` is created when missing, entries rejected by
    ``variables.noloaddir`` are skipped, sub-directories are mirrored
    recursively, and each file is converted with ``self.file_to_text`` into
    a file of the same name, with ``.txt`` appended unless the input name
    already ends with it (case-insensitively).  ``self._total`` is
    incremented once per converted file.
    """
    if os.path.isfile(fpath):
        # Single-file input, as allowed by the documented contract;
        # os.listdir would raise on it.
        self._total += 1
        self.file_to_text(fpath, ofile)
        return

    if not os.path.exists(ofile):
        os.makedirs(ofile)

    for name in os.listdir(fpath):
        if variables.noloaddir(name):
            continue
        path = os.path.join(fpath, name)
        o_file = os.path.join(ofile, name)
        if os.path.isdir(path):
            self.files_to_text(path, o_file)
        else:
            self._total += 1
            if not path.lower().endswith(".txt"):
                o_file += ".txt"
            self.file_to_text(path, o_file)