def getWordVectorsSum(self):
    """Build train/val/test arrays by converting each document's words via getDocsArray.

    Label rows are reshaped to (1, n_categories) and concatenated.
    Side effects: populates self.model.trainArrays/trainLabels, optionally
    valArrays/valLabels (tail valSize fraction of the training data), and
    testArrays/testLabels; resets self.nfWords (words missing from the
    w2v vocabulary), self.sdict (unique words seen) and self.tmpCount.
    In cross-validation mode the final reporting is skipped.
    """
    self.nfWords = 0
    self.sdict = dict()
    self.tmpCount = 0
    if self.model.Config["runfor"] != "test":
        ds = datetime.datetime.now()
        self.model.trainArrays = numpy.concatenate([self.getDocsArray(x.words, 'Train') for x in self.model.Config[self.keyTrain]])
        self.model.trainLabels = numpy.concatenate([numpy.array(x.labels).reshape(1, len(self.model.Config["cats"])) for x in self.model.Config[self.keyTrain]])
        if self.addValSet:
            # Tail valSize fraction of the training data becomes the validation split.
            ind = int(len(self.model.trainArrays) * (1 - self.valSize))
            self.model.valArrays = self.model.trainArrays[ind:]
            self.model.valLabels = self.model.trainLabels[ind:]
            self.model.trainArrays = self.model.trainArrays[:ind]
            self.model.trainLabels = self.model.trainLabels[:ind]
            de = datetime.datetime.now()
            print("Prepare train and validation data in %s" % (showTime(ds, de)))
        else:
            de = datetime.datetime.now()
            print("Prepare train data in %s" % (showTime(ds, de)))
    self.tmpCount = 0
    ds = datetime.datetime.now()
    self.model.testArrays = numpy.concatenate([self.getDocsArray(x.words, "Test") for x in self.model.Config[self.keyTest]])
    self.model.testLabels = numpy.concatenate([numpy.array(x.labels).reshape(1, len(self.model.Config["cats"])) for x in self.model.Config[self.keyTest]])
    if self.model.isCV:
        # Cross-validation: metrics reporting below is not needed per fold.
        return
    de = datetime.datetime.now()
    print("Prepare test data in %s" % (showTime(ds, de)))
    print("Unique words in all documents: %d" % (len(self.sdict)))
    print("Words not found in the w2v vocabulary: %d" % (self.nfWords))
def tokenizeFile(self, parser, inPath, outPath):
    """Tokenize one file line by line with the given POS tagger.

    Lines that are empty after stripping CR/LF, or shorter than 3 tokens,
    are skipped. A '\n' separator is emitted for every input line after
    the first (even skipped ones — preserved original behavior). Prints
    per-file line/token statistics when done.

    Args:
        parser: tagger exposing tag(list_of_tokens) -> tagged sequence.
        inPath: path of the UTF-8 input file.
        outPath: path of the UTF-8 output file (overwritten).
    """
    ds = datetime.datetime.now()
    q = 0   # input lines seen
    qt = 0  # tokens tagged
    # 'with' guarantees both handles are closed even if tagging raises;
    # the original opened outPath without a context manager (leak on error)
    # and called f.close() redundantly after its own 'with' block.
    with open(outPath, 'w', encoding='UTF-8') as outFile, \
            open(inPath, 'r', encoding='UTF-8') as f:
        for line in f:
            q += 1
            result = '\n' if q > 1 else ''
            line = line.replace('\r', '').replace('\n', '')
            if len(line) == 0:
                continue
            toks = line.split()
            if len(toks) < 3:
                continue
            qt += len(toks)
            tArr = parser.tag(toks)  # reuse toks instead of re-splitting the line
            result += joinTokens(tArr, self.Config).strip()
            outFile.write(result)
    de = datetime.datetime.now()
    print("File %s (%d lines, %d tokens): in %s" % (outPath, q, qt, showTime(ds, de)))
def loadData(self):
    """Load (and optionally preprocess) train/test documents into Config.

    Reads categories and training docs from "trainpath"; either loads a
    separate test set from "testpath" or, when self.splitTrain is set,
    carves the last self.sz fraction off a shuffled copy of the training
    docs. Both sets are stored re-shuffled under Config["traindocs"] /
    Config["testdocs"]. Also computes max sequence/char lengths and, when
    tokenization via the external tagger is active, sends the tagger
    process its stop command.
    """
    if self.Config["datatoks"] == "yes":
        print("Start loading and preprocessing of data...")
    else:
        print("Start loading data...")
    ds = datetime.datetime.now()
    self.Config["cats"] = self.getCategories(fullPath(self.Config, "trainpath"))
    traindocs = self.getDataDocs(fullPath(self.Config, "trainpath"))
    if not self.splitTrain:
        testdocs = self.getDataDocs(fullPath(self.Config, "testpath"))
    else:
        # Hold out the tail self.sz fraction of shuffled training docs as the test set.
        ind = int(len(traindocs) * (1 - self.sz))
        random.shuffle(traindocs)
        testdocs = traindocs[ind:]
        traindocs = traindocs[:ind]
    de = datetime.datetime.now()
    # random.sample over the full length yields a shuffled copy.
    self.Config["traindocs"] = random.sample(traindocs, len(traindocs))
    self.Config["testdocs"] = random.sample(testdocs, len(testdocs))
    self.getMaxSeqLen()
    self.getMaxCharsLength()
    if self.Config["datatoks"] == "yes" and self.Config["actualtoks"] == "yes":
        # Signal the external tagger process (self.jar) to shut down.
        self.jar.stdin.write('!!! STOP !!!\n')
        self.jar.stdin.flush()
    print("Input data loaded in %s" % (showTime(ds, de)))
    print("Training set contains %d documents." % (len(self.Config["traindocs"])))
    print("Testing set contains %d documents." % (len(self.Config["testdocs"])))
    print("Documents belong to %d categories." % (len(self.Config["cats"])))
def getDataForSklearnClassifiers(self):
    """Vectorize documents (TF-IDF) and binarize labels for sklearn models.

    Training mode ("runfor" != "test"): fits a TfidfVectorizer and a
    MultiLabelBinarizer on the training set, stores transformed train
    arrays/labels, and (outside cross-validation) pickles both
    transformers. Test-only mode: loads the pickled transformers.
    In both modes the test set is transformed with the fitted objects.
    """
    mlb = None
    ds = datetime.datetime.now()
    if self.model.Config["runfor"] != "test":
        # Category names ordered by category index so the binarizer's
        # column order matches Config["cats"].
        nmCats = [""] * len(self.model.Config["cats"])
        cKeys = list(self.model.Config["cats"].keys())
        for i in range(len(cKeys)):
            nmCats[self.model.Config["cats"][cKeys[i]]] = cKeys[i]
        mlb = MultiLabelBinarizer(classes=nmCats)
        wev = TfidfVectorizer(ngram_range=(1, 3), max_df=0.50).fit(
            [x.lines for x in self.model.Config[self.keyTrain]],
            [x.nlabs for x in self.model.Config[self.keyTrain]])
        self.model.trainArrays = wev.transform([x.lines for x in self.model.Config[self.keyTrain]])
        self.model.trainLabels = mlb.fit_transform([x.nlabs for x in self.model.Config[self.keyTrain]])
        if not self.model.isCV:
            # 'with' closes the handles; the original's explicit close()
            # calls inside the 'with' blocks were redundant.
            with open(fullPath(self.model.Config, "binarizerpath"), 'wb') as handle:
                pickle.dump(mlb, handle, protocol=pickle.HIGHEST_PROTOCOL)
            with open(fullPath(self.model.Config, "vectorizerpath"), 'wb') as handle:
                pickle.dump(wev, handle, protocol=pickle.HIGHEST_PROTOCOL)
    if mlb is None:  # was '== None'; identity comparison is the correct idiom
        with open(fullPath(self.model.Config, "binarizerpath"), 'rb') as handle:
            mlb = pickle.load(handle)
        with open(fullPath(self.model.Config, "vectorizerpath"), 'rb') as handle:
            wev = pickle.load(handle)
    self.model.testArrays = wev.transform([x.lines for x in self.model.Config[self.keyTest]])
    # transform, not fit_transform: the binarizer must not be refitted on
    # test labels (with fixed 'classes' the output is identical, but
    # transform states the intent and cannot silently drift).
    self.model.testLabels = mlb.transform([x.nlabs for x in self.model.Config[self.keyTest]])
    de = datetime.datetime.now()
    print("Prepare all data in %s" % (showTime(ds, de)))
def getCharVectors(self):
    """Build character-level index arrays for train/val/test documents.

    Joins each document's words into one string, maps it to character
    indexes via stringToIndexes(), and reshapes label vectors to
    (1, n_categories) rows. Populates self.model.trainArrays/trainLabels,
    optionally valArrays/valLabels (tail valSize fraction), and
    testArrays/testLabels. Skips final timing report in CV mode.
    """
    ds = datetime.datetime.now()
    # Disabled truncation warning kept for reference (inert string literal).
    """ if self.model.Config["maxcharsdoclen"] > self.model.Config["maxcharsseqlen"]: print( "Most of documents from training set have less then %d characters. Longer documents will be truncated." % ( self.model.Config["maxcharsseqlen"])) """
    if self.model.Config["runfor"] != "test":
        self.model.trainArrays = numpy.concatenate([self.stringToIndexes(" ".join(x.words)) for x in self.model.Config[self.keyTrain]])
        self.model.trainLabels = numpy.concatenate([numpy.array(x.labels).reshape(1, len(self.model.Config["cats"])) for x in self.model.Config[self.keyTrain]])
        if self.addValSet:
            # Tail valSize fraction of the training data becomes the validation split.
            ind = int(len(self.model.trainArrays) * (1 - self.valSize))
            self.model.valArrays = self.model.trainArrays[ind:]
            self.model.valLabels = self.model.trainLabels[ind:]
            self.model.trainArrays = self.model.trainArrays[:ind]
            self.model.trainLabels = self.model.trainLabels[:ind]
    self.model.testArrays = numpy.concatenate([self.stringToIndexes(" ".join(x.words)) for x in self.model.Config[self.keyTest]])
    self.model.testLabels = numpy.concatenate([numpy.array(x.labels).reshape(1, len(self.model.Config["cats"])) for x in self.model.Config[self.keyTest]])
    if self.model.isCV:
        return
    de = datetime.datetime.now()
    print("Prepare all data in %s" % (showTime(ds, de)))
def tokenize(self, Config):
    """Run the external Java tagger/tokenizer over the source corpus.

    Validates "taggerpath", builds the java command line (input/output
    paths, POS filter, optional Arabic stop words, extra words,
    normalization and actual-tokenization flags) and blocks until the
    subprocess completes. Sets Config["error"] and returns early when
    the tagger jar is missing.
    """
    taggerPath = fullPath(Config, "taggerpath")
    if len(taggerPath) == 0 or not os.path.exists(taggerPath):
        print("Wrong path to the tagger's jar. Tokenization can't be done")
        Config["error"] = True
        return
    inPath = Config["home"] + "/" + Config["sourcepath"]
    outPath = Config["home"] + "/" + Config["targetpath"]
    stopWords = ""
    if Config["stopwords"] == "yes":
        # join() replaces the original quadratic '+=' concatenation loop.
        stopWords = ",".join(stopwords.words('arabic'))
    ds = datetime.datetime.now()
    # SECURITY NOTE: command is built by string concatenation and run with
    # shell=True; config values containing quotes would break or inject
    # into the shell command. Kept as-is for compatibility — a list-based
    # shell=False invocation would be safer.
    srv = subprocess.Popen('java -Xmx2g -jar ' + taggerPath + ' "' + inPath + '" "'
                           + outPath + '" "' + Config["expos"] + '" "' + stopWords + '" "'
                           + Config["extrawords"] + '" "' + Config["normalization"] + '" "'
                           + Config["actualtoks"] + '"',
                           stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True)
    srv.wait()
    reply = srv.communicate()
    de = datetime.datetime.now()
    print(reply[0].decode())
    print("All process is done in %s" % (showTime(ds, de)))
def trainSKLModel(self):
    """Fit the sklearn model, persist it, and report test-set accuracy.

    In cross-validation mode only training is performed; otherwise the
    model is dumped via joblib and evaluated on the held-out test set.
    """
    # NOTE: the original named the start time 'de' and the end 'ds';
    # renamed here for readability — timing behavior is unchanged.
    trainStart = datetime.datetime.now()
    print("Start training...")
    self.model.fit(self.trainArrays, self.trainLabels)
    trainEnd = datetime.datetime.now()
    print("Model is trained in %s" % (showTime(trainStart, trainEnd)))
    if self.isCV:
        return
    modelFile = fullPath(self.Config, "modelpath", opt="name")
    joblib.dump(self.model, modelFile)
    print("Model is saved in %s" % (modelFile))
    print("Model evaluation...")
    prediction = self.model.predict(self.testArrays)
    print('Final accuracy is %.2f' % (accuracy_score(self.testLabels, prediction)))
    evalEnd = datetime.datetime.now()
    # Evaluation window starts at end-of-training (includes dump + predict).
    print("Evaluated in %s" % (showTime(trainEnd, evalEnd)))
def getWordVectorsMatrix(self):
    """Prepare padded token-index arrays and the word-embedding matrix.

    Training mode: fits a Keras Tokenizer on the training texts (pickled
    to "indexerpath" outside cross-validation), pads train sequences to
    "maxseqlen" and optionally carves off a validation split. Test-only
    mode: restores the pickled tokenizer. Always pads the test sequences,
    then builds a (maxWords x ndim) embedding matrix from
    self.model.w2vModel, counting tokens missing from the vocabulary.

    Returns:
        (embedding_matrix, maxWords) — except in CV mode, where it
        returns None after storing both on self.model.
    """
    tokenizer = None
    ds = datetime.datetime.now()
    if self.model.Config["runfor"] != "test":
        tokenizer = Tokenizer(num_words=self.maxWords)
        trainTexts = []
        for i in range(len(self.model.Config[self.keyTrain])):
            trainTexts.append(self.model.Config[self.keyTrain][i].lines)
        tokenizer.fit_on_texts(trainTexts)
        if not self.model.isCV:
            with open(fullPath(self.model.Config, "indexerpath"), 'wb') as handle:
                pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)
            handle.close()
        if self.model.Config["maxdoclen"] > self.model.Config["maxseqlen"]:
            print("Most of documents from training set have less then %d tokens. Longer documents will be truncated."%(
                self.model.Config["maxseqlen"]))
        self.model.trainArrays = pad_sequences(tokenizer.texts_to_sequences(trainTexts), maxlen=self.model.Config["maxseqlen"])
        self.model.trainLabels = numpy.concatenate([numpy.array(x.labels).reshape(1, len(self.model.Config["cats"])) for x in self.model.Config[self.keyTrain]])
        if self.addValSet:
            # Tail valSize fraction becomes the validation split.
            ind = int(len(self.model.trainArrays) * (1 - self.valSize))
            self.model.valArrays = self.model.trainArrays[ind:]
            self.model.valLabels = self.model.trainLabels[ind:]
            self.model.trainArrays = self.model.trainArrays[:ind]
            self.model.trainLabels = self.model.trainLabels[:ind]
    if tokenizer == None:
        # Test-only run: restore the tokenizer fitted during training.
        with open(fullPath(self.model.Config, "indexerpath"), 'rb') as handle:
            tokenizer = pickle.load(handle)
        handle.close()
    testTexts = []
    for i in range(len(self.model.Config[self.keyTest])):
        testTexts.append(self.model.Config[self.keyTest][i].lines)
    self.model.testArrays = pad_sequences(tokenizer.texts_to_sequences(testTexts), maxlen=self.model.Config["maxseqlen"])
    self.model.testLabels = numpy.concatenate([numpy.array(x.labels).reshape(1, len(self.model.Config["cats"])) for x in self.model.Config[self.keyTest]])
    embedding_matrix = numpy.zeros((self.maxWords, self.ndim))
    word_index = tokenizer.word_index
    nf = 0  # tokens absent from the w2v vocabulary
    for word, i in word_index.items():
        if i < self.maxWords:
            try:
                embedding_vector = self.model.w2vModel[word]
            except KeyError:
                nf += 1
                continue
            if embedding_vector is not None:
                embedding_matrix[i] = embedding_vector
    self.model.embMatrix = embedding_matrix
    self.model.maxWords = self.maxWords
    if self.model.isCV:
        return
    de = datetime.datetime.now()
    print('Found %s unique tokens.' % len(tokenizer.word_index))
    print ('Tokens not found in W2V vocabulary: %d'%nf)
    print("All data prepared and embedding matrix built in %s"%(showTime(ds, de)))
    return embedding_matrix, self.maxWords
def createW2VModel(self):
    """Train a Word2Vec model on the corpus file and save it in text format.

    Reads non-empty lines from "w2vcorpuspath" as pre-tokenized sentences,
    shuffles them, builds the vocabulary, trains for "w2vepochs" epochs
    (progress reported via EpochLogger), and saves the vectors in
    word2vec text format — with a timestamp inserted into the file name
    when "w2vtimeinname" is set.
    """
    sentences = []
    count = 0
    print("Start to create W2V model...")
    print("Get input data...")
    ds = datetime.datetime.now()
    # 'with' closes the file; the original's trailing f.close() was redundant.
    with open(fullPath(self.Config, "w2vcorpuspath"), 'r', encoding='UTF-8') as f:
        for line in f:
            if len(line.strip()) == 0:
                continue
            count += 1
            sentences.append(line.strip().split())  # was a no-op list comprehension
    de = datetime.datetime.now()
    print("Got %d lines from file %s in %s" % (count, fullPath(self.Config, "w2vcorpuspath"), showTime(ds, de)))
    numpy.random.shuffle(sentences)
    logger = EpochLogger(self.epochs)
    # NOTE(review): 'size=' is the gensim<4 parameter (gensim>=4 renamed it
    # to 'vector_size='); kept to match the installed dependency.
    w2v = Word2Vec(size=self.ndim, window=10, min_count=3, workers=10)
    ds = datetime.datetime.now()
    print("Build vocabulary...")
    w2v.build_vocab(sentences)
    de = datetime.datetime.now()
    print("Vocabulary is built in %s" % (showTime(ds, de)))
    print("Train model...")
    ds = datetime.datetime.now()
    w2v.train(sentences, epochs=int(self.Config["w2vepochs"]), total_examples=len(sentences), callbacks=[logger])
    de = datetime.datetime.now()
    print("W2V model is completed in %s" % (showTime(ds, de)))
    modelPath = fullPath(self.Config, "w2vmodelpath")
    # BUG FIX: modelName was only assigned inside the w2vtimeinname branch
    # but used unconditionally to build finalPath, raising NameError when
    # the option was off. Hoist the base name out of the branch.
    modelName = os.path.basename(modelPath)
    if self.Config["w2vtimeinname"]:
        # Insert "-YYYY-Mon-DD-HHMMSS" before the extension (or append it).
        dt = "-" + datetime.datetime.now().strftime("%Y-%b-%d-%H%M%S")
        pInd = modelName.rfind(".")
        if pInd > 0:
            modelName = modelName[:pInd] + dt + modelName[pInd:]
        else:
            modelName += dt
    finalPath = os.path.dirname(modelPath) + "/" + modelName
    ds = datetime.datetime.now()
    w2v.wv.save_word2vec_format(finalPath, binary=False)
    de = datetime.datetime.now()
    print("W2V model %s is saved in the text format in %s\n" % (finalPath, showTime(ds, de)))
def loadW2VModel(self):
    """Load the word2vec vectors from disk into Config["w2vmodel"]."""
    print("Load W2V model...")
    started = datetime.datetime.now()
    self.Config["w2vmodel"] = gensim.models.KeyedVectors.load_word2vec_format(
        fullPath(self.Config, "w2vmodelpath"))
    finished = datetime.datetime.now()
    print("Load W2V model (%s) in %s" % (fullPath(self.Config, "w2vmodelpath"),
                                         showTime(started, finished)))
def tokenize(self):
    """Tokenize the source corpus via a local CoreNLP server and report timing."""
    posTagger = CoreNLPParser(url='http://localhost:' + self.Config["servport"],
                              tagtype='pos')
    sourceFile = self.Config["home"] + "/" + self.Config["sourcepath"]
    targetFile = self.Config["home"] + "/" + self.Config["targetpath"]
    started = datetime.datetime.now()
    self.tokenizeData(posTagger, sourceFile, targetFile)
    finished = datetime.datetime.now()
    print("Tokenization complited in %s" % (showTime(started, finished)))
def testNNModel(self):
    """Predict on the test set with the Keras model; save metrics/results.

    In cross-validation mode only the predictions are produced.
    """
    print("Start testing...")
    print("Rank threshold: %.2f" % (self.rankThreshold))
    started = datetime.datetime.now()
    self.predictions = self.model.predict(self.testArrays)
    finished = datetime.datetime.now()
    print("Test dataset containing %d documents predicted in %s\n"
          % (len(self.testArrays), showTime(started, finished)))
    if self.isCV:
        return
    self.saveResources("keras")
    self.getMetrics()
    self.saveResults()
def loadW2VModel(self):
    """Load (or reuse an already-loaded) W2V model into self.w2vModel.

    When Config["w2vmodel"] is already populated the cached model is
    reused; otherwise the vectors are read from "w2vmodelpath" and the
    resource registry (model path and dimensionality) is updated.
    """
    if self.Config["w2vmodel"] is not None:  # was '!= None'; identity check is idiomatic
        print("W2V model is already loaded...")
        self.w2vModel = self.Config["w2vmodel"]
        return
    print("Load W2V model... ")
    ds = datetime.datetime.now()
    self.w2vModel = gensim.models.KeyedVectors.load_word2vec_format(
        fullPath(self.Config, "w2vmodelpath"))
    de = datetime.datetime.now()
    print("Load W2V model (%s) in %s" % (fullPath(self.Config, "w2vmodelpath"), showTime(ds, de)))
    self.Config["resources"]["w2v"]["modelPath"] = fullPath(self.Config, "w2vmodelpath")
    self.Config["resources"]["w2v"]["ndim"] = self.ndim
def trainNNModel(self):
    """Train the Keras model, optionally keeping the best-epoch checkpoint.

    Trains with the validation split; outside cross-validation, saves the
    final model, evaluates it on the test set, and — when checkpointing
    (self.tempSave) is enabled — compares it against the best
    checkpointed epoch and persists whichever scores higher.
    """
    checkpoints = []
    if self.tempSave and not self.isCV:
        # Keep the epoch with the best validation accuracy on disk.
        checkpoint = ModelCheckpoint(fullPath(self.Config, "temppath") + "/tempModel.hdf5",
                                     monitor='val_acc', verbose=self.verbose,
                                     save_best_only=True, mode='auto')
        checkpoints.append(checkpoint)
    print("Start training... ")
    ds = datetime.datetime.now()
    self.model.fit(self.trainArrays, self.trainLabels, epochs=self.epochs,
                   validation_data=(self.valArrays, self.valLabels),
                   batch_size=self.trainBatch, verbose=self.verbose,
                   callbacks=checkpoints, shuffle=False)
    de = datetime.datetime.now()
    print("Model is trained in %s" % (showTime(ds, de)))
    if self.isCV:
        return
    self.model.save(fullPath(self.Config, "modelpath", opt="name"))
    print("Model evaluation...")
    scores1 = self.model.evaluate(self.testArrays, self.testLabels, verbose=self.verbose)
    print("Final model accuracy: %.2f%%" % (scores1[1] * 100))
    if self.tempSave:
        model1 = load_model(fullPath(self.Config, "temppath") + "/tempModel.hdf5")
        scores2 = model1.evaluate(self.testArrays, self.testLabels, verbose=self.verbose)
        print("Last saved model accuracy: %.2f%%" % (scores2[1] * 100))
        if scores1[1] < scores2[1]:
            # BUG FIX: the original assigned the better checkpointed model
            # to a dead local ('model = model1') and then saved self.model
            # anyway — persisting the weaker model while claiming the best
            # one was saved. Adopt the checkpointed model instead.
            self.model = model1
            pref = "The best model "
        else:
            pref = "Model "
        self.model.save(fullPath(self.Config, "modelpath", opt="name"))
        print(pref + "is saved in %s" % (fullPath(self.Config, "modelpath", opt="name")))
def testSKLModel(self):
    """Predict on the test set with the sklearn model; save metrics/results.

    Uses predict_proba when the model supports probabilities (and a rank
    threshold applies), plain predict otherwise. In cross-validation
    mode only the predictions are produced.
    """
    print("Start testing...")
    if self.useProbabilities:
        print("Rank threshold: %.2f" % (self.rankThreshold))
    else:
        print("Model doesn't evaluate probabilities.")
    started = datetime.datetime.now()
    predictor = self.model.predict_proba if self.useProbabilities else self.model.predict
    self.predictions = predictor(self.testArrays)
    finished = datetime.datetime.now()
    print("Test dataset containing %d documents predicted in %s"
          % (self.testArrays.shape[0], showTime(started, finished)))
    if self.isCV:
        return
    self.saveResources("skl")
    self.getMetrics()
    self.saveResults()
def launchCrossValidation(self):
    """Run k-fold cross-validation over the combined train+test corpus.

    Shuffles all documents, then for each of self.kfold cycles assigns a
    contiguous slice as the test fold ("cvtestdocs") and the remainder as
    training ("cvtraindocs"), re-runs prepare/create/train/test, and
    collects per-cycle metrics. The data sets of the best-F1 cycle are
    saved when Config["cvsave"] is set; averaged metrics and the best F1
    are printed at the end.
    """
    print("Start cross-validation...")
    ds = datetime.datetime.now()
    self.cvDocs = self.Config["traindocs"] + self.Config["testdocs"]
    random.shuffle(self.cvDocs)
    self.keyTrain = "cvtraindocs"
    self.keyTest = "cvtestdocs"
    pSize = len(self.cvDocs) // self.kfold  # fold size (integer division)
    ind = 0   # start offset of the current test fold
    f1 = 0    # best F1 seen so far
    arrMetrics = []
    for i in range(self.kfold):
        print("Cross-validation, cycle %d from %d..." % ((i + 1), self.kfold))
        if i == 0:
            self.Config["cvtraindocs"] = self.cvDocs[pSize:]
            self.Config["cvtestdocs"] = self.cvDocs[:pSize]
        elif i == self.kfold - 1:
            # Last fold absorbs any remainder left by integer division.
            self.Config["cvtraindocs"] = self.cvDocs[:ind]
            self.Config["cvtestdocs"] = self.cvDocs[ind:]
        else:
            self.Config["cvtraindocs"] = self.cvDocs[:ind] + self.cvDocs[ind + pSize:]
            self.Config["cvtestdocs"] = self.cvDocs[ind:ind + pSize]
        ind += pSize
        self.prepareData()
        self.model = self.createModel()
        self.trainModel()
        self.testModel()
        arrMetrics.append(self.metrics)
        cycleF1 = self.metrics["all"]["f1"]
        print("Resulting F1-Measure: %f\n" % (cycleF1))
        if cycleF1 > f1:
            if self.Config["cvsave"]:
                self.saveDataSets()
            f1 = cycleF1
    de = datetime.datetime.now()
    print("Cross-validation is done in %s" % (showTime(ds, de)))
    printAveragedMetrics(arrMetrics, self.Config)
    print("The best result is %f" % (f1))
    print("Corresponding data sets are saved in the folder %s" % (fullPath(self.Config, "cvpath")))
def testModel(self):
    """Evaluate the fine-tuned BERT model on the dev/test examples.

    When no freshly trained model is in memory (self.model_to_save is
    None), reloads the saved state dict from "bertoutpath" into
    self.model. Runs batched inference, collecting sigmoid scores into
    self.predictions and gold labels into self.testLabels, then (outside
    cross-validation) saves resources, metrics and results.
    """
    print("Start testing...")
    print("Rank threshold: %.2f" % (self.rankThreshold))
    ds = datetime.datetime.now()
    if self.model_to_save is None:  # was '== None'
        output_model_file = fullPath(self.Config, "bertoutpath", opt="name")
        model_state_dict = torch.load(output_model_file)
        model = BertForMultiLabelSequenceClassification.from_pretrained(
            self.args.bert_model, state_dict=model_state_dict, num_labels=self.num_labels)
        model.to(self.device)
        # BUG FIX: the original loaded the saved weights into a local that
        # was never used and kept evaluating whatever self.model held.
        self.model = model
    eval_examples = self.processor.get_dev_examples(self.args.data_dir)
    eval_features = convert_examples_to_features(eval_examples, self.label_list,
                                                 self.max_seq_length, self.tokenizer)
    all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
    all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.float)
    eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
    # Run prediction for full data, in original order.
    eval_sampler = SequentialSampler(eval_data)
    eval_dataloader = BertDataLoader(eval_data, sampler=eval_sampler,
                                     batch_size=self.eval_batch_size)
    self.model.eval()
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0
    allLabs = None
    res = None
    initRes = True
    for input_ids, input_mask, segment_ids, label_ids in tqdm(eval_dataloader, desc="Evaluating"):
        input_ids = input_ids.to(self.device)
        input_mask = input_mask.to(self.device)
        segment_ids = segment_ids.to(self.device)
        label_ids = label_ids.to(self.device)
        with torch.no_grad():
            # With labels the model returns the loss; without, the logits.
            tmp_eval_loss = self.model(input_ids, segment_ids, input_mask, label_ids)
            logits = self.model(input_ids, segment_ids, input_mask)
        preds = logits.sigmoid().to('cpu').numpy()
        labs = label_ids.to('cpu').numpy()
        if initRes:
            res = preds
            allLabs = labs
            initRes = False
        else:
            res = numpy.concatenate((res, preds))
            allLabs = numpy.concatenate((allLabs, labs))
        tmp_eval_accuracy = accuracy(logits, label_ids, self.rankThreshold)
        eval_loss += tmp_eval_loss.mean().item()
        eval_accuracy += tmp_eval_accuracy
        nb_eval_examples += input_ids.size(0)
        nb_eval_steps += 1
    self.predictions = res
    self.testLabels = allLabs
    de = datetime.datetime.now()
    print("Test dataset containing %d documents predicted in %s\n"
          % (len(eval_examples), showTime(ds, de)))
    if self.Config["runfor"] != "crossvalidation":
        self.saveResources("torch")
        self.getMetrics()
        self.saveResults()
def trainModel(self):
    """Fine-tune the BERT multi-label model; evaluate and save it afterwards.

    Builds a BertAdam optimizer with weight decay applied to every
    parameter except biases and LayerNorm weights, converts the training
    examples to features, and trains for num_train_epochs with gradient
    accumulation. In cross-validation mode training is all that happens;
    otherwise the model is evaluated on the dev set and its state dict is
    saved to "bertoutpath" (self.model_to_save is set as a side effect).
    """
    print("Start training..")
    ds = datetime.datetime.now()
    param_optimizer = [p for p in self.model.named_parameters()]
    # Standard BERT fine-tuning recipe: no weight decay for biases and
    # LayerNorm parameters.
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay': 0.01
    }, {
        'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay': 0.0
    }]
    optimizer = BertAdam(optimizer_grouped_parameters, lr=self.learning_rate,
                         warmup=self.warmup_proportion,
                         t_total=self.num_train_optimization_steps)
    global_step = 0
    nb_tr_steps = 0
    tr_loss = 0
    train_features = convert_examples_to_features(self.train_examples, self.label_list,
                                                  self.max_seq_length, self.tokenizer)
    logger = getLogger()
    logger.info("***** Running training *****")
    logger.info(" Num examples = %d", len(self.train_examples))
    logger.info(" Batch size = %d", self.train_batch_size)
    logger.info(" Num steps = %d", self.num_train_optimization_steps)
    all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
    all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.float)
    train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
    train_sampler = RandomSampler(train_data)
    train_dataloader = BertDataLoader(train_data, sampler=train_sampler,
                                      batch_size=self.train_batch_size)
    self.model.train()
    for _ in trange(int(self.num_train_epochs), desc="Epoch"):
        tr_loss = 0
        nb_tr_examples, nb_tr_steps = 0, 0
        for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
            batch = tuple(t.to(self.device) for t in batch)
            input_ids, input_mask, segment_ids, label_ids = batch
            loss = self.model(input_ids, segment_ids, input_mask, label_ids)
            if self.n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu.
            if self.gradient_accumulation_steps > 1:
                loss = loss / self.gradient_accumulation_steps
            loss.backward()
            tr_loss += loss.item()
            nb_tr_examples += input_ids.size(0)
            nb_tr_steps += 1
            # Apply the optimizer only every gradient_accumulation_steps batches.
            if (step + 1) % self.gradient_accumulation_steps == 0:
                optimizer.step()
                optimizer.zero_grad()
                global_step += 1
    de = datetime.datetime.now()
    print("Model is trained in %s" % (showTime(ds, de)))
    if self.Config["runfor"] == "crossvalidation":
        return
    print("Model evaluation...")
    eval_examples = self.processor.get_dev_examples(self.args.data_dir)
    eval_features = convert_examples_to_features(eval_examples, self.label_list,
                                                 self.max_seq_length, self.tokenizer)
    all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
    all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.float)
    eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
    eval_sampler = SequentialSampler(eval_data)
    eval_dataloader = BertDataLoader(eval_data, sampler=eval_sampler,
                                     batch_size=self.eval_batch_size)
    self.model.eval()
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0
    allLabs = None
    res = None
    initRes = True
    for input_ids, input_mask, segment_ids, label_ids in tqdm(eval_dataloader, desc="Evaluating"):
        input_ids = input_ids.to(self.device)
        input_mask = input_mask.to(self.device)
        segment_ids = segment_ids.to(self.device)
        label_ids = label_ids.to(self.device)
        with torch.no_grad():
            # With labels the model returns the loss; without, the logits.
            tmp_eval_loss = self.model(input_ids, segment_ids, input_mask, label_ids)
            logits = self.model(input_ids, segment_ids, input_mask)
        preds = logits.sigmoid().to('cpu').numpy()
        labs = label_ids.to('cpu').numpy()
        if initRes == True:
            res = preds
            allLabs = labs
            initRes = False
        else:
            res = numpy.concatenate((res, preds))
            allLabs = numpy.concatenate((allLabs, labs))
        tmp_eval_accuracy = accuracy(logits, label_ids, self.rankThreshold)
        eval_loss += tmp_eval_loss.mean().item()
        eval_accuracy += tmp_eval_accuracy
        nb_eval_examples += input_ids.size(0)
        nb_eval_steps += 1
    eval_accuracy = eval_accuracy / nb_eval_examples
    print("Model accuracy: %.2f" % (eval_accuracy))
    # Save a trained model (unwrap DataParallel if present).
    self.model_to_save = self.model.module if hasattr(self.model, 'module') else self.model
    output_model_file = fullPath(self.Config, "bertoutpath", opt="name")
    torch.save(self.model_to_save.state_dict(), output_model_file)
    print("Model is saved in %s" % (output_model_file))
def saveResources(self):
    """Copy all runtime artifacts into the resources folder; write config.json.

    Records tokenization options, copies model files, optionally converts
    the text-format w2v model into a pickled word->vector dict, copies
    indexer/vectorizer/BERT artifacts, writes the ordered label list to
    labels.txt and dumps the resources manifest to config.json.
    """
    tokOpts = ["actualtoks", "normalization", "stopwords", "expos",
               "extrawords", "maxseqlen", "maxcharsseqlen", "rttaggerpath"]
    self.Config["resources"]["tokenization"] = {}
    ds = datetime.datetime.now()
    self.outDir = fullPath(self.Config, "resourcespath") + "/"
    # Direct iteration replaces the original range(len(...)) index loop.
    for opt in tokOpts:
        if opt != "rttaggerpath":
            self.Config["resources"]["tokenization"][opt] = self.Config[opt]
        elif self.Config["actualtoks"] == "yes":
            # The tagger jar is only an artifact when tokenization is actually used.
            self.Config["resources"]["tokenization"]["rttaggerpath"] = \
                self.copyFile(fullPath(self.Config, "rttaggerpath"))
    isW2VNeeded = False
    for key, val in self.Config["resources"]["models"].items():
        val["modelPath"] = self.copyFile(val["modelPath"])
        if "w2v" in val and val["w2v"] == "yes":
            isW2VNeeded = True
    if not isW2VNeeded and "w2v" in self.Config["resources"]:
        self.Config["resources"].pop("w2v", None)
    if "w2v" in self.Config["resources"]:
        # Convert the text-format w2v model into a pickled dict; the first
        # line (word2vec header with counts/dimensions) is skipped.
        w2vDict = {}
        isFirstLine = True
        # 'with' closes the file even on parse errors; the original used a
        # bare open()/close() pair and redundant close() calls inside 'with'.
        with open(self.Config["resources"]["w2v"]["modelPath"], encoding="utf-8") as fEmbeddings:
            for line in fEmbeddings:
                if isFirstLine:
                    isFirstLine = False
                    continue
                split = line.strip().split(" ")
                word = split[0]
                vector = numpy.array([float(num) for num in split[1:]])
                w2vDict[word] = vector
        with open(self.Config["resources"]["w2v"]["modelPath"] + '.pkl', 'wb') as file:
            pickle.dump(w2vDict, file, pickle.HIGHEST_PROTOCOL)
        self.Config["resources"]["w2v"]["modelPath"] = self.copyFile(
            self.Config["resources"]["w2v"]["modelPath"] + '.pkl')
    if "indexer" in self.Config["resources"]:
        self.Config["resources"]["indexer"] = self.copyFile(
            self.Config["resources"]["indexer"])
    if "vectorizer" in self.Config["resources"]:
        self.Config["resources"]["vectorizer"] = self.copyFile(
            self.Config["resources"]["vectorizer"])
    if "ptBertModel" in self.Config["resources"]:
        # The vocabulary file travels with the pretrained BERT model.
        self.Config["resources"]["ptBertModel"] = self.copyFile(
            self.Config["resources"]["ptBertModel"])
        self.Config["resources"]["vocabPath"] = self.copyFile(
            self.Config["resources"]["vocabPath"])
    # Ordered category names (index -> name) written as labels.txt.
    cNames = [''] * len(self.Config["cats"])
    for k, v in self.Config["cats"].items():
        cNames[v] = k
    with open(self.outDir + 'labels.txt', 'w', encoding="utf-8") as file:
        file.write(",".join(cNames))
    self.Config["resources"]["labels"] = "labels.txt"
    self.Config["resources"]["consolidatedRank"] = self.rankThreshold
    with open(self.outDir + 'config.json', 'w', encoding="utf-8") as file:
        json.dump(self.Config["resources"], file, indent=4)
    de = datetime.datetime.now()
    print("\nArtifacts are copied into the folder %s in %s"
          % (fullPath(self.Config, "resourcespath"), showTime(ds, de)))