def extraction(self, filename, ftype, delete_file=True):
    pprocess = False
    output_file = None
    test_file = self.create_test_tagging(filename, ftype)
    results = self.dtn_doc._results
    # pick the CRF model for this category, falling back to the file type
    model_name = crf_utils.get_crf_model_name(self.categorie)
    if not model_name and self.ftype:
        model_name = crf_utils.get_crf_model_name(self.ftype)
    if model_name:
        out_path = crf_utils.get_crf_training_directory()
        model = os.path.join(out_path, model_name + crf_utils.CRF_MODEL_EXT)
        if pprocess:
            command_line = crf_utils.CRF_TEST + ' -m "%s" "%s"' % (model, test_file)
        else:
            output_file = test_file + ".out"
            command_line = crf_utils.CRF_TEST + ' -m "%s" -o "%s" "%s"' % (
                model, output_file, test_file)
        command_line = command_line.replace("\\", "/")
        dtn_logger.logger_info("CRF", command_line)
        # run CRF++
        if pprocess:
            # shell=False: args is already tokenized by shlex.split(); passing
            # a list together with shell=True would hand only the first token
            # to the shell on POSIX systems
            args = shlex.split(command_line)
            p = Popen(args, shell=False, bufsize=4096,
                      stdin=PIPE, stdout=PIPE, stderr=PIPE)
            output = p.communicate()
            # 255 is also treated as a non-fatal exit code here
            if p.returncode == 0 or p.returncode == 255:
                self._parse_buffer(output, results)
                dtn_logger.logger_info(
                    "CRF", "crf test exit code : %d" % p.returncode)
            else:
                dtn_logger.logger_error(
                    "CRF", "crf test exit code : %d" % p.returncode)
        else:
            os.system(command_line)
            self._parse_file(output_file, results)
    else:
        dtn_logger.logger_error(
            "CRF", "crf model file is not found (%s)" % ftype)
    # remove temporary tagging and output files
    if delete_file:
        os.remove(test_file)
        if output_file:
            os.remove(output_file)
    return results
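# Usage sketch (hedged): how extraction() is typically driven. The class name
# CRFExtraction, the sample file name, and the category value are assumptions
# for illustration, not part of the original code.
def _example_extraction_usage():
    extractor = CRFExtraction()  # hypothetical constructor
    # tag the document, run crf_test against the trained model, and collect
    # the parsed results; temporary files are removed afterwards
    results = extractor.extraction("sample_contract.txt", "LEASE",
                                   delete_file=True)
    return results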
def create_embedding_words(self, path, created=True):
    fname = md.get_model_embedded_file(self.MODEL_NAME)
    tw = Text4Words()
    if created:
        # build new word embeddings from the documents in `path`
        dtn_logger.logger_info("TRAINING", "Create Embedded Words " + fname)
        tw.load_directory(path)
        tw.train_word_vector_embedding(vectfname=fname)
    else:
        # reuse the previously saved embedding file
        tw.load_word_vector_embedding(vectfname=fname)
def load_word_vector_embedding(self, vectfname=None):
    # word-to-vector file name
    if vectfname is None:
        vectfname = util.get_default_wordvect_file()
    self._model = KeyedVectors.load_word2vec_format(vectfname, binary=False)
    # summarize vocabulary size in the model
    words = list(self._model.wv.vocab)
    logging.logger_info(
        "Embedded", 'load vocabulary : %d (%s)' % (len(words), vectfname))
def load_embedding_words(self):
    embeddings_index = {}
    fname = md.get_model_embedded_file(self.MODEL_NAME)
    dtn_logger.logger_info("TRAINING", "Load Embedded Words " + fname)
    f = codecs.open(fname, 'r', 'utf-8')
    for line in f:
        # each line: <word> <coef_1> ... <coef_n>
        values = line.split()
        if len(values) > 2:
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
    f.close()
    return embeddings_index
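# Companion sketch (hedged): the dict returned by load_embedding_words() is
# usually turned into a Keras embedding matrix. word_index and embedding_dim
# are assumptions here; word_index would come from a fitted Tokenizer.
def _example_build_embedding_matrix(embeddings_index, word_index,
                                    embedding_dim=100):
    # row i holds the pretrained vector of the word with tokenizer index i;
    # words without a pretrained vector keep an all-zero row
    embedding_matrix = np.zeros((len(word_index) + 1, embedding_dim))
    for word, i in word_index.items():
        vector = embeddings_index.get(word)
        if vector is not None:
            embedding_matrix[i] = vector
    return embedding_matrix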
def add_clause(self, clausetype, name, value):
    filename = self._get_clause_file_name(clausetype)
    # append to an existing clause file, otherwise create it
    if os.path.exists(filename):
        f = codecs.open(filename, 'a+', 'utf-8')
    else:
        f = codecs.open(filename, 'w', 'utf-8')
    dtn_logger.logger_info("Clause", "add clause %s : %s" % (clausetype, name))
    f.write("[[" + name + "\n")
    f.write(value + "\n]]\n\n")
    f.close()
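# For reference, add_clause() appends records in this plain-text layout
# (reconstructed from the write calls above):
#
#   [[<name>
#   <value>
#   ]]
#
# so a clause file is a sequence of [[ ... ]] blocks separated by blank lines.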
def _init_terms_table(self, filename, termtype):
    self.categorie = termtype
    # get file name
    self.fullname = filename
    self.filename = os.path.basename(filename).split('.')[0]
    # get file creation date
    self._filetime = util.get_creation_file_date(filename)
    # init verifying table
    self.verified_terms = {}
    self.keywords = dtn_sentence.get_document_categorie(termtype)
    for key in self.keywords:
        self.verified_terms[key] = ExtractData(key, termtype)
    dtn_logger.logger_info("VERIFY", "%s (%s)" % (filename, termtype))
def _train(self, trainf, model, threads=8, cost=16.0):
    '''
    crf_learn options:
    -a CRF-L2 or CRF-L1 : defaults to CRF-L2. In general L2 performs slightly
        better than L1, although L1 yields drastically fewer non-zero features.
    -c float : sets the CRF hyper-parameter; the larger c is, the more closely
        the CRF fits the training data.
    -f NUM : sets the feature cut-off threshold; CRF++ only uses features that
        occur at least NUM times in the training data (default 1).
    -p NUM : number of threads; with multiple CPUs, training can be sped up
        with multithreading.
    '''
    # run CRF++ learner
    templatef = crf_utils.get_crf_template_file()
    commd = crf_utils.CRF_LEARN + " -p %d -c %f %s %s %s" % (
        threads, cost, templatef, trainf, model)
    dtn_logger.logger_info("CRF Training", commd)
    os.system(commd)
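# For reference, _train() shells out to a command of this shape (the file
# names are illustrative):
#
#   crf_learn -p 8 -c 16.000000 template train.data model.crf
#
# where `template` is the CRF++ feature template, `train.data` the tagged
# training file, and `model.crf` the model written out by crf_learn.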
def _create_training_data(self):
    labels = to_categorical(np.asarray(self.labels))
    # create data
    data = pad_sequences(self.sequences, maxlen=self.MAX_SEQUENCE_LENGTH)
    dtn_logger.logger_info("TRAINING", "loading data " + str(data.shape))
    # shuffle samples and labels together
    indices = np.arange(data.shape[0])
    np.random.shuffle(indices)
    data = data[indices]
    labels = labels[indices]
    nb_validation_samples = int(self.VALIDATION_SPLIT * data.shape[0])
    # split into training and validation sets
    x_train = data[:-nb_validation_samples]
    y_train = labels[:-nb_validation_samples]
    x_val = data[-nb_validation_samples:]
    y_val = labels[-nb_validation_samples:]
    return x_train, y_train, x_val, y_val
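# Usage sketch (hedged): the split returned by _create_training_data() is
# meant to feed a Keras fit() call. The trainer/model arguments and the
# epoch and batch-size values below are assumptions for illustration.
def _example_fit(trainer, model):
    x_train, y_train, x_val, y_val = trainer._create_training_data()
    # train on the shuffled split and validate on the held-out tail
    model.fit(x_train, y_train,
              validation_data=(x_val, y_val),
              epochs=10, batch_size=128)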
def create_categorie_tagging(self, fpath, categorie):
    dataname = crf_utils.add_crf_model_name(categorie)
    dtn_logger.logger_info("CRF Training", "%s => %s" % (categorie, dataname))
    out_path = crf_utils.get_crf_training_directory()
    # write the tagged training data for this category
    output_file = os.path.join(out_path, dataname + crf_utils.CRF_FILE_TAG_EXT)
    output_data = codecs.open(output_file, 'w', 'utf-8')
    self.folder_tagging(fpath, categorie, output_data)
    output_data.close()
    # train the CRF model on the tagged data
    model = os.path.join(out_path, dataname + crf_utils.CRF_MODEL_EXT)
    self._train(output_file, model)
def train_word_vector_embedding(self, min_count=2, sg=0, vectfname=None):
    # word-to-vector file name
    if vectfname is None:
        vectfname = util.get_default_wordvect_file()
    # train word2vec on the loaded sentences
    self._model = word2vec.Word2Vec(self.sentences, min_count=min_count,
                                    sg=sg, workers=4,
                                    size=self.VECTOR_LENGTH, window=5)
    # save word-to-vector model
    self._model.wv.save_word2vec_format(vectfname, binary=False)
    # summarize vocabulary size in the model
    words = list(self._model.wv.vocab)
    logging.logger_info(
        "Embedded", 'create vocabulary : %d (%s)' % (len(words), vectfname))
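# Usage sketch (hedged): once train_word_vector_embedding() has run, the
# trained gensim model can be queried directly; the probe word "contract"
# is an assumption for illustration.
def _example_query_vectors(tw):
    # nearest neighbours in the embedding space (gensim 3.x API, matching
    # the model.wv usage above)
    return tw._model.wv.most_similar("contract", topn=5)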
def close_output(self):
    if self._verbose > 0:
        dtn_logger.logger_info(
            "CONVERT", "Total Files : %d, Error Files : %d \n\n" %
            (self._total, self._error))
        dtn_logger.logger_info(
            "CONVERT",
            "DOC(%d), DOCX(%d), PDF(%d), IMG(%d), XLSX(%d), TXT(%d), MSG(%d)" %
            (self._ftypes['DOC'], self._ftypes['DOCX'], self._ftypes['PDF'],
             self._ftypes['IMG'], self._ftypes['XLSX'], self._ftypes['TXT'],
             self._ftypes['MSG']))
    self.outout_result.close()
    if self._verbose > 1:
        print("\n\nTotal Files : %d, Error Files : %d \n\n" %
              (self._total, self._error))
        print("\n\nDOC(%d), DOCX(%d), PDF(%d), IMG(%d), XLSX(%d), TXT(%d), MSG(%d) \n\n" %
              (self._ftypes['DOC'], self._ftypes['DOCX'], self._ftypes['PDF'],
               self._ftypes['IMG'], self._ftypes['XLSX'], self._ftypes['TXT'],
               self._ftypes['MSG']))
def write_text_file(self, path, outfile, document, f):
    # a document counts as valid when it holds more than 100 characters
    #isok = f.test_document(document)
    isok = False
    if document and len(document) > 100:
        isok = True
    if isok:
        # write text file
        fp = codecs.open(outfile, 'w', self._encoding)
        fp.write(document)
        fp.close()
        dtn_logger.logger_info("CONVERT", "%s => %s" % (path, outfile))
        if self._verbose > 0:
            self.outout_result.write(path + " => OK \n")
        if self._verbose > 1:
            print("%d write text to %s " % (self._total, outfile))
    else:
        dtn_logger.logger_error("CONVERT", "%s => %s" % (path, outfile))
        self._error += 1
        if self._verbose > 0:
            self.outout_result.write(path + " => ERROR\n")
        if self._verbose > 1:
            print("%d. convert error : %s " % (self._error, path))
def predict(self, filename):
    texts = self.load_predict_document(filename)
    # tokenize the prediction texts with the same vocabulary limit as training
    tokenizer = Tokenizer(num_words=self.clause_model.MAX_NB_WORDS)
    tokenizer.fit_on_texts(texts)
    sequences = tokenizer.texts_to_sequences(texts)
    # create data
    data = pad_sequences(sequences,
                         maxlen=self.clause_model.MAX_SEQUENCE_LENGTH)
    dtn_logger.logger_info("PREDICT", "Verification document : " + filename)
    dtn_logger.logger_info("PREDICT", "Predict Data : " + str(data.shape))
    model = md.load_json_model(self.clause_model.MODEL_NAME)
    model.compile(loss='binary_crossentropy', optimizer=md.OPTIMIZER_ADAM,
                  metrics=['accuracy'])
    for i, s in enumerate(data):
        # predict one sample at a time, sampling a label at three temperatures
        s = data[np.array([i])]
        preds = model.predict(s)
        n = self.sample(preds[0])
        print("*** " + self.clause_model.label_name[n] + "***")
        n = self.sample(preds[0], 0.8)
        print("*** " + self.clause_model.label_name[n] + "***")
        n = self.sample(preds[0], 0.2)
        print("*** " + self.clause_model.label_name[n] + "***")
        print(texts[i])
        if i > 5:
            break
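# The sample() helper called above is defined elsewhere. A common
# temperature-based implementation looks like this (hedged sketch, assuming
# preds is a 1-D probability vector; the epsilon guard against log(0) is an
# addition of this sketch):
def sample(self, preds, temperature=1.0):
    # rescale the distribution by temperature, then draw one class index;
    # low temperature sharpens the distribution, high temperature flattens it
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds + 1e-8) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)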
def create_dataset(self, text_path, min_count=2, sg=0, workers=1,
                   size=256, window=5):
    """
    min_count : ignore all words with total frequency lower than this.
    sg : sg=0 CBOW, sg=1 skip-gram.
    workers : number of worker threads.
    size : dimensionality of the feature vectors.
    window : maximum distance between the current and predicted word
             within a sentence.
    """
    dtn_logger.logger_info("DATASET", "create dataset " + text_path)
    lists, doclists = self.load_documents(text_path)
    dictionary = corpora.Dictionary(lists)
    corpus = [dictionary.doc2bow(text) for text in lists]
    # save corpus
    corpusfname = self.get_mm_file_name()
    corpora.MmCorpus.serialize(corpusfname, corpus)
    # save dictionary
    dictfname = self.get_dict_file_name()
    dictionary.save(dictfname)
    # initialize a TF-IDF model and use it to transform the corpus
    tfidf = models.TfidfModel(corpus, normalize=True)
    corpus_tfidf = tfidf[corpus]
    # initialize an LSI transformation (300-topic space) and save it
    lsifname = self.get_model_list_name()
    lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=300)
    lsi.save(lsifname)
    # train doc2vec and save the document vectors
    model = self.TrainingDoc2Vec(doclists, size=size, window=window,
                                 nb_loop=32)
    vectfname = self.get_model_file_name()
    model.save(vectfname)
    # train word2vec and save the word vectors
    model = word2vec.Word2Vec(lists, min_count=min_count, sg=sg,
                              workers=workers, size=size, window=window)
    vectfname = self.get_word_model_name()
    model.wv.save_word2vec_format(vectfname, binary=False)
    # save file, document, vector, and classifier labels
    self.save_filelabel()
    self.save_doclabel()
    self.save_labelset()
    self.save_classifierlabel()
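# Usage sketch (hedged): reloading the artifacts that create_dataset() saves,
# using the matching gensim APIs. `ds` is an instance of this class, and the
# sketch assumes `from gensim.models import KeyedVectors` is available, as in
# load_word_vector_embedding() above.
def _example_reload_dataset(ds):
    corpus = corpora.MmCorpus(ds.get_mm_file_name())
    dictionary = corpora.Dictionary.load(ds.get_dict_file_name())
    lsi = models.LsiModel.load(ds.get_model_list_name())
    wv = KeyedVectors.load_word2vec_format(ds.get_word_model_name(),
                                           binary=False)
    return corpus, dictionary, lsi, wv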
o_file = options.output_dir
if options.input_dir is None:
    infile = variables.CORPUS_DIR
    if options.action != 'convert':
        infile = os.path.join(variables.CORPUS_DIR, 'TEXT')
else:
    # use the directory given on the command line
    # (otherwise `infile` would be unbound below)
    infile = options.input_dir

# rebuild the effective command line for logging
commd = "-a " + options.action
if infile:
    commd += " -i " + infile
if o_file:
    commd += " -o " + o_file
if options.doc_type:
    commd += " -t " + options.doc_type
dtn_logger.logger_info("MAIN", commd)

if options.action == 'convert':
    '''
    conv = Convert(verbose=verbose, restart=options.restart)
    o_file = conv.open_output(infile, o_file)
    conv.files_to_text(infile, o_file)
    conv.close_output()
    '''
    ofile = docutonelocate.convert_file(infile, True)
elif options.action == 'testfile':
    conv = Convert(verbose=verbose, restart=options.restart)
    conv.test_files_in_directory(infile, o_file)