Example #1
    def getWordVectorsSum(self):
        self.nfWords = 0
        self.sdict = dict()
        self.tmpCount = 0
        if self.model.Config["runfor"] != "test":
            ds = datetime.datetime.now()
            self.model.trainArrays = numpy.concatenate([self.getDocsArray(x.words, 'Train') for x in self.model.Config[self.keyTrain]])
            self.model.trainLabels = numpy.concatenate([numpy.array(x.labels).
                                reshape(1, len(self.model.Config["cats"])) for x in self.model.Config[self.keyTrain]])

            if self.addValSet:
                ind = int(len(self.model.trainArrays) * (1 - self.valSize))
                self.model.valArrays = self.model.trainArrays[ind:]
                self.model.valLabels = self.model.trainLabels[ind:]
                self.model.trainArrays = self.model.trainArrays[:ind]
                self.model.trainLabels = self.model.trainLabels[:ind]

                de = datetime.datetime.now()
                print("Prepare train and validation data in %s" % (showTime(ds, de)))
            else:
                de = datetime.datetime.now()
                print("Prepare train data in %s" % (showTime(ds, de)))

        self.tmpCount = 0
        ds = datetime.datetime.now()
        self.model.testArrays = numpy.concatenate([self.getDocsArray(x.words, "Test") for x in self.model.Config[self.keyTest]])
        self.model.testLabels = numpy.concatenate([numpy.array(x.labels).
                                reshape(1, len(self.model.Config["cats"])) for x in self.model.Config[self.keyTest]])
        if self.model.isCV:
            return
        de = datetime.datetime.now()
        print("Prepare test data in %s" % (showTime(ds, de)))
        print("Unique words in all documents: %d" % (len(self.sdict)))
        print("Words not found in the w2v vocabulary: %d" % (self.nfWords))
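Every example on this page measures elapsed time with datetime.datetime.now() and reports it through a showTime(start, end) helper that is not part of this listing. A minimal sketch of such a helper, assuming it does nothing more than format the timedelta between the two timestamps (only the name and signature follow the calls above; the formatting itself is an assumption, not the project's actual implementation):

import datetime

def showTime(start, end):
    # Hypothetical helper: format the interval between two datetime
    # values as H:MM:SS.mmm.
    delta = end - start
    total_seconds = int(delta.total_seconds())
    hours, remainder = divmod(total_seconds, 3600)
    minutes, seconds = divmod(remainder, 60)
    millis = delta.microseconds // 1000
    return "%d:%02d:%02d.%03d" % (hours, minutes, seconds, millis)

With such a helper, a line like print("Prepare test data in %s" % (showTime(ds, de))) would print, for example, "Prepare test data in 0:00:12.345".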
Example #2
 def tokenizeFile(self, parser, inPath, outPath):
     outFile = open(outPath, 'w', encoding='UTF-8')
     ds = datetime.datetime.now()
     q = 0
     qt = 0
     with open(inPath, 'r', encoding='UTF-8') as f:
         for line in f:
             q += 1
             if q > 1:
                 result = '\n'
             else:
                 result = ''
             line = line.replace('\r', '').replace('\n', '')
             if len(line) == 0:
                 continue
             toks = line.split()
             if len(toks) < 3:
                 continue
             qt += len(toks)
             tArr = parser.tag(line.split())
             result += joinTokens(tArr, self.Config).strip()
             outFile.write(result)
     de = datetime.datetime.now()
     print("File %s (%d lines, %d tokens): in %s" %
           (outPath, q, qt, showTime(ds, de)))
     f.close()
     outFile.close()
Example #3
 def loadData(self):
     if self.Config["datatoks"] == "yes":
         print("Start loading and preprocessing of data...")
     else:
         print("Start loading data...")
     ds = datetime.datetime.now()
     self.Config["cats"] = self.getCategories(
         fullPath(self.Config, "trainpath"))
     traindocs = self.getDataDocs(fullPath(self.Config, "trainpath"))
     if not self.splitTrain:
         testdocs = self.getDataDocs(fullPath(self.Config, "testpath"))
     else:
         ind = int(len(traindocs) * (1 - self.sz))
         random.shuffle(traindocs)
         testdocs = traindocs[ind:]
         traindocs = traindocs[:ind]
     de = datetime.datetime.now()
     self.Config["traindocs"] = random.sample(traindocs, len(traindocs))
     self.Config["testdocs"] = random.sample(testdocs, len(testdocs))
     self.getMaxSeqLen()
     self.getMaxCharsLength()
     if self.Config["datatoks"] == "yes" and self.Config[
             "actualtoks"] == "yes":
         self.jar.stdin.write('!!! STOP !!!\n')
         self.jar.stdin.flush()
     print("Input data loaded in %s" % (showTime(ds, de)))
     print("Training set contains %d documents." %
           (len(self.Config["traindocs"])))
     print("Testing set contains %d documents." %
           (len(self.Config["testdocs"])))
     print("Documents belong to %d categories." %
           (len(self.Config["cats"])))
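The examples resolve configured locations such as "trainpath" and "testpath" through a fullPath(Config, key) helper that is likewise not shown here. A minimal sketch, assuming it simply joins Config["home"] with the relative path stored under the given key (the opt keyword seen in other examples is accepted but not interpreted in this sketch):

import os

def fullPath(Config, key, opt=None):
    # Hypothetical stand-in: resolve a configured relative path against
    # the project home directory. The real helper may treat the opt
    # argument differently; this sketch ignores it.
    return os.path.join(Config["home"], Config[key])

# Example: fullPath({"home": "/data/project", "trainpath": "corpus/train"},
#                   "trainpath") -> "/data/project/corpus/train"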
Example #4
 def getDataForSklearnClassifiers(self):
     mlb = None
     ds = datetime.datetime.now()
     if self.model.Config["runfor"] != "test":
         nmCats = [""] * len(self.model.Config["cats"])
         cKeys = list(self.model.Config["cats"].keys())
         for i in range(len(cKeys)):
             nmCats[self.model.Config["cats"][cKeys[i]]] = cKeys[i]
         mlb = MultiLabelBinarizer(classes=nmCats)
         wev = TfidfVectorizer(ngram_range=(1, 3), max_df=0.50).fit([x.lines for x in self.model.Config[self.keyTrain]],
                                                                [x.nlabs for x in self.model.Config[self.keyTrain]])
         self.model.trainArrays = wev.transform([x.lines for x in self.model.Config[self.keyTrain]])
         self.model.trainLabels = mlb.fit_transform([x.nlabs for x in self.model.Config[self.keyTrain]])
         if not self.model.isCV:
             with open(fullPath(self.model.Config, "binarizerpath"), 'wb') as handle:
                 pickle.dump(mlb, handle, protocol=pickle.HIGHEST_PROTOCOL)
             handle.close()
             with open(fullPath(self.model.Config, "vectorizerpath"), 'wb') as handle:
                 pickle.dump(wev, handle, protocol=pickle.HIGHEST_PROTOCOL)
             handle.close()
     if mlb is None:
         with open(fullPath(self.model.Config, "binarizerpath"), 'rb') as handle:
             mlb = pickle.load(handle)
         handle.close()
         with open(fullPath(self.model.Config, "vectorizerpath"), 'rb') as handle:
             wev = pickle.load(handle)
         handle.close()
     self.model.testArrays = wev.transform([x.lines for x in self.model.Config[self.keyTest]])
     self.model.testLabels = mlb.fit_transform([x.nlabs for x in self.model.Config[self.keyTest]])
     de = datetime.datetime.now()
     print("Prepare all data in %s" % (showTime(ds, de)))
Example #5
 def getCharVectors(self):
     ds = datetime.datetime.now()
     """
     if self.model.Config["maxcharsdoclen"] > self.model.Config["maxcharsseqlen"]:
         print(
             "Most of documents from training set have less then %d characters. Longer documents will be truncated." % (
                 self.model.Config["maxcharsseqlen"]))
     """
     if self.model.Config["runfor"] != "test":
         self.model.trainArrays = numpy.concatenate([self.stringToIndexes(" ".join(x.words))
                                         for x in self.model.Config[self.keyTrain]])
         self.model.trainLabels = numpy.concatenate([numpy.array(x.labels).
                             reshape(1, len(self.model.Config["cats"])) for x in self.model.Config[self.keyTrain]])
         if self.addValSet:
             ind = int(len(self.model.trainArrays) * (1 - self.valSize))
             self.model.valArrays = self.model.trainArrays[ind:]
             self.model.valLabels = self.model.trainLabels[ind:]
             self.model.trainArrays = self.model.trainArrays[:ind]
             self.model.trainLabels = self.model.trainLabels[:ind]
     self.model.testArrays = numpy.concatenate([self.stringToIndexes(" ".join(x.words))
                                         for x in self.model.Config[self.keyTest]])
     self.model.testLabels = numpy.concatenate([numpy.array(x.labels).
                             reshape(1, len(self.model.Config["cats"])) for x in self.model.Config[self.keyTest]])
     if self.model.isCV:
         return
     de = datetime.datetime.now()
     print("Prepare all data in %s" % (showTime(ds, de)))
Example #6
 def tokenize(self, Config):
     taggerPath = fullPath(Config, "taggerpath")
     if (len(taggerPath) == 0 or not os.path.exists(taggerPath)):
         print ("Wrong path to the tagger's jar. Tokenization can't be done")
         Config["error"] = True
         return
     inPath = Config["home"] + "/" + Config["sourcepath"]
     outPath = Config["home"] + "/" + Config["targetpath"]
     stopWords = ""
     if Config["stopwords"] == "yes":
         sWords = list(stopwords.words('arabic'))
         for i in range(len(sWords)):
             if i > 0:
                 stopWords += ","
             stopWords += sWords[i]
     ds = datetime.datetime.now()
     srv = subprocess.Popen('java -Xmx2g -jar ' + taggerPath + ' "' + inPath +  '" "'  +
                            outPath + '" "' + Config["expos"] + '" "'+ stopWords + '" "' +
                            Config["extrawords"] + '" "' + Config["normalization"] + '" "' +
                            Config["actualtoks"] + '"',
                            stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True)
     srv.wait()
     reply = srv.communicate()
     de = datetime.datetime.now()
     print(reply[0].decode())
     print("Whole process is done in %s" % (showTime(ds, de)))
Example #7
 def trainSKLModel(self):
     de = datetime.datetime.now()
     print("Start training...")
     self.model.fit(self.trainArrays, self.trainLabels)
     ds = datetime.datetime.now()
     print("Model is trained in %s" % (showTime(de, ds)))
     if self.isCV:
         return
     joblib.dump(self.model, fullPath(self.Config, "modelpath", opt="name"))
     print("Model is saved in %s" %
           (fullPath(self.Config, "modelpath", opt="name")))
     print("Model evaluation...")
     prediction = self.model.predict(self.testArrays)
     print('Final accuracy is %.2f' %
           (accuracy_score(self.testLabels, prediction)))
     de = datetime.datetime.now()
     print("Evaluated in %s" % (showTime(ds, de)))
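The scikit-learn model persisted above with joblib.dump can later be restored for inference with joblib.load; a short, hypothetical sketch (the file name is a placeholder for fullPath(self.Config, "modelpath", opt="name"), and testArrays stands for features prepared as in the examples above):

import joblib

# Hypothetical reload of a model saved by trainSKLModel(); the path is a
# placeholder.
restored = joblib.load("saved_sklearn_model.pkl")
# predictions = restored.predict(testArrays)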
Example #8
 def getWordVectorsMatrix(self):
     tokenizer = None
     ds = datetime.datetime.now()
     if self.model.Config["runfor"] != "test":
         tokenizer = Tokenizer(num_words=self.maxWords)
         trainTexts = []
         for i in range(len(self.model.Config[self.keyTrain])):
             trainTexts.append(self.model.Config[self.keyTrain][i].lines)
         tokenizer.fit_on_texts(trainTexts)
         if not self.model.isCV:
             with open(fullPath(self.model.Config, "indexerpath"), 'wb') as handle:
                 pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)
             handle.close()
             if self.model.Config["maxdoclen"] > self.model.Config["maxseqlen"]:
                 print("Most documents in the training set have less than %d tokens. Longer documents will be truncated."%(
                     self.model.Config["maxseqlen"]))
         self.model.trainArrays = pad_sequences(tokenizer.texts_to_sequences(trainTexts),
                                                 maxlen=self.model.Config["maxseqlen"])
         self.model.trainLabels = numpy.concatenate([numpy.array(x.labels).
                         reshape(1, len(self.model.Config["cats"])) for x in self.model.Config[self.keyTrain]])
         if self.addValSet:
             ind = int(len(self.model.trainArrays) * (1 - self.valSize))
             self.model.valArrays = self.model.trainArrays[ind:]
             self.model.valLabels = self.model.trainLabels[ind:]
             self.model.trainArrays = self.model.trainArrays[:ind]
             self.model.trainLabels = self.model.trainLabels[:ind]
     if tokenizer is None:
         with open(fullPath(self.model.Config, "indexerpath"), 'rb') as handle:
             tokenizer = pickle.load(handle)
         handle.close()
     testTexts = []
     for i in range(len(self.model.Config[self.keyTest])):
         testTexts.append(self.model.Config[self.keyTest][i].lines)
     self.model.testArrays = pad_sequences(tokenizer.texts_to_sequences(testTexts),
                                           maxlen=self.model.Config["maxseqlen"])
     self.model.testLabels = numpy.concatenate([numpy.array(x.labels).
                         reshape(1, len(self.model.Config["cats"])) for x in self.model.Config[self.keyTest]])
     embedding_matrix = numpy.zeros((self.maxWords, self.ndim))
     word_index = tokenizer.word_index
     nf = 0
     for word, i in word_index.items():
         if i < self.maxWords:
             try:
                 embedding_vector = self.model.w2vModel[word]
             except KeyError:
                 nf += 1
                 continue
             if embedding_vector is not None:
                 embedding_matrix[i] = embedding_vector
     self.model.embMatrix =  embedding_matrix
     self.model.maxWords = self.maxWords
     if self.model.isCV:
         return
     de = datetime.datetime.now()
     print('Found %s unique tokens.' % len(tokenizer.word_index))
     print ('Tokens not found in W2V vocabulary: %d'%nf)
     print("All data prepared and embedding matrix built in %s"%(showTime(ds, de)))
     return embedding_matrix, self.maxWords
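The embedding_matrix and maxWords returned by getWordVectorsMatrix are typically used to initialize a frozen Keras Embedding layer. A self-contained, hypothetical sketch of that wiring (the sizes are placeholders; in the code above they correspond to self.maxWords, self.ndim and Config["maxseqlen"]):

import numpy
from keras.layers import Embedding

# Placeholder sizes; a zero matrix stands in for the w2v-based matrix
# built in getWordVectorsMatrix().
max_words, ndim, max_seq_len = 10000, 100, 500
embedding_matrix = numpy.zeros((max_words, ndim))

embedding_layer = Embedding(input_dim=max_words,
                            output_dim=ndim,
                            weights=[embedding_matrix],   # pre-trained vectors
                            input_length=max_seq_len,
                            trainable=False)              # keep vectors frozen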
Example #9
    def createW2VModel(self):
        sentences = []
        count = 0
        print ("Start to create W2V model...")
        print ("Get input data...")
        ds = datetime.datetime.now()
        with open(fullPath(self.Config, "w2vcorpuspath"), 'r', encoding='UTF-8') as f:
            for line in f:
                if len(line.strip()) == 0:
                    continue
                count += 1
                words = [w for w in line.strip().split()]
                sentences.append(words)
        f.close()
        de = datetime.datetime.now()
        print("Got %d lines from file %s in %s"% (count, fullPath(self.Config, "w2vcorpuspath"), showTime(ds, de)))
        numpy.random.shuffle(sentences)

        logger = EpochLogger(self.epochs)
        w2v = Word2Vec(size=self.ndim, window=10, min_count=3, workers=10)
        ds = datetime.datetime.now()
        print("Build vocabulary...")
        w2v.build_vocab(sentences)
        de = datetime.datetime.now()
        print("Vocabulary is built in %s" % (showTime(ds, de)))
        print("Train model...")
        ds = datetime.datetime.now()
        w2v.train(sentences, epochs=int(self.Config["w2vepochs"]), total_examples=len(sentences), callbacks=[logger])
        de = datetime.datetime.now()
        print("W2V model is completed in %s" % (showTime(ds, de)))

        modelPath = fullPath(self.Config, "w2vmodelpath")
        modelName = os.path.basename(modelPath)
        if self.Config["w2vtimeinname"]:
            dt = "-" + datetime.datetime.now().strftime("%Y-%b-%d-%H%M%S")
            pInd = modelName.rfind(".")
            if pInd > 0:
                modelName = modelName[:pInd] + dt + modelName[pInd:]
            else:
                modelName += dt
        finalPath = os.path.dirname(modelPath) + "/" + modelName
        ds = datetime.datetime.now()
        w2v.wv.save_word2vec_format(finalPath, binary=False)
        de = datetime.datetime.now()
        print("W2V model %s is saved in the text format in %s\n" % (finalPath, showTime(ds, de)))
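createW2VModel passes an EpochLogger callback to Word2Vec.train() that is not defined in this listing. A minimal sketch, assuming it is a gensim CallbackAny2Vec subclass that only reports per-epoch progress (the constructor argument mirrors the EpochLogger(self.epochs) call above):

from gensim.models.callbacks import CallbackAny2Vec

class EpochLogger(CallbackAny2Vec):
    # Hypothetical progress callback: counts epochs and prints a line
    # each time one finishes.
    def __init__(self, total_epochs):
        self.total_epochs = total_epochs
        self.epoch = 0

    def on_epoch_end(self, model):
        self.epoch += 1
        print("Epoch %d of %d finished" % (self.epoch, self.total_epochs))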
Example #10
 def loadW2VModel(self):
     print("Load W2V model...")
     ds = datetime.datetime.now()
     self.Config[
         "w2vmodel"] = gensim.models.KeyedVectors.load_word2vec_format(
             fullPath(self.Config, "w2vmodelpath"))
     de = datetime.datetime.now()
     print("Load W2V model (%s) in %s" %
           (fullPath(self.Config, "w2vmodelpath"), showTime(ds, de)))
Example #11
    def tokenize(self):
        parser = CoreNLPParser(url='http://localhost:' +
                               self.Config["servport"],
                               tagtype='pos')
        inPath = self.Config["home"] + "/" + self.Config["sourcepath"]
        outPath = self.Config["home"] + "/" + self.Config["targetpath"]

        fds = datetime.datetime.now()
        self.tokenizeData(parser, inPath, outPath)
        fde = datetime.datetime.now()
        print("Tokenization completed in %s" % (showTime(fds, fde)))
Example #12
 def testNNModel(self):
     print("Start testing...")
     print("Rank threshold: %.2f" % (self.rankThreshold))
     ds = datetime.datetime.now()
     self.predictions = self.model.predict(self.testArrays)
     de = datetime.datetime.now()
     print("Test dataset containing %d documents predicted in %s\n" %
           (len(self.testArrays), showTime(ds, de)))
     if self.isCV:
         return
     self.saveResources("keras")
     self.getMetrics()
     self.saveResults()
Example #13
 def loadW2VModel(self):
     if self.Config["w2vmodel"] is not None:
         print("W2V model is already loaded...")
         self.w2vModel = self.Config["w2vmodel"]
         return
     print("Load W2V model... ")
     ds = datetime.datetime.now()
     self.w2vModel = gensim.models.KeyedVectors.load_word2vec_format(
         fullPath(self.Config, "w2vmodelpath"))
     de = datetime.datetime.now()
     print("Load W2V model (%s) in %s" %
           (fullPath(self.Config, "w2vmodelpath"), showTime(ds, de)))
     self.Config["resources"]["w2v"]["modelPath"] = fullPath(
         self.Config, "w2vmodelpath")
     self.Config["resources"]["w2v"]["ndim"] = self.ndim
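Once loaded, the KeyedVectors object acts as a read-only word-to-vector mapping, which is how Examples #1 and #8 consume it. A tiny illustrative sketch (the model path and lookup word are placeholders):

import gensim

# Hypothetical lookup against a loaded model; the path is a placeholder
# for fullPath(self.Config, "w2vmodelpath").
w2v = gensim.models.KeyedVectors.load_word2vec_format("w2v_model.txt")
try:
    vector = w2v["example"]   # a numpy array of length ndim
    print(vector.shape)
except KeyError:
    print("Word not found in the w2v vocabulary")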
Example #14
 def trainNNModel(self):
     checkpoints = []
     if self.tempSave and not self.isCV:
         checkpoint = ModelCheckpoint(fullPath(self.Config, "temppath") +
                                      "/tempModel.hdf5",
                                      monitor='val_acc',
                                      verbose=self.verbose,
                                      save_best_only=True,
                                      mode='auto')
         checkpoints.append(checkpoint)
     print("Start training...              ")
     ds = datetime.datetime.now()
     self.model.fit(self.trainArrays,
                    self.trainLabels,
                    epochs=self.epochs,
                    validation_data=(self.valArrays, self.valLabels),
                    batch_size=self.trainBatch,
                    verbose=self.verbose,
                    callbacks=checkpoints,
                    shuffle=False)
     de = datetime.datetime.now()
     print("Model is trained in %s" % (showTime(ds, de)))
     if self.isCV:
         return
     self.model.save(fullPath(self.Config, "modelpath", opt="name"))
     print("Model evaluation...")
     scores1 = self.model.evaluate(self.testArrays,
                                   self.testLabels,
                                   verbose=self.verbose)
     print("Final model accuracy: %.2f%%" % (scores1[1] * 100))
     if self.tempSave:
         model1 = load_model(
             fullPath(self.Config, "temppath") + "/tempModel.hdf5")
         scores2 = model1.evaluate(self.testArrays,
                                   self.testLabels,
                                   verbose=self.verbose)
         print("Last saved model accuracy: %.2f%%" % (scores2[1] * 100))
         if scores1[1] < scores2[1]:
             self.model = model1  # keep the better of the two models for saving
         pref = "The best model "
     else:
         pref = "Model "
     self.model.save(fullPath(self.Config, "modelpath", opt="name"))
     print(pref + "is saved in %s" %
           (fullPath(self.Config, "modelpath", opt="name")))
Example #15
 def testSKLModel(self):
     print("Start testing...")
     if self.useProbabilities:
         print("Rank threshold: %.2f" % (self.rankThreshold))
     else:
         print("Model doesn't evaluate probabilities.")
     ds = datetime.datetime.now()
     if not self.useProbabilities:
         self.predictions = self.model.predict(self.testArrays)
     else:
         self.predictions = self.model.predict_proba(self.testArrays)
     de = datetime.datetime.now()
     print("Test dataset containing %d documents predicted in %s" %
           (self.testArrays.shape[0], showTime(ds, de)))
     if self.isCV:
         return
     self.saveResources("skl")
     self.getMetrics()
     self.saveResults()
Example #16
 def launchCrossValidation(self):
     print("Start cross-validation...")
     ds = datetime.datetime.now()
     self.cvDocs = self.Config["traindocs"] + self.Config["testdocs"]
     random.shuffle(self.cvDocs)
     self.keyTrain = "cvtraindocs"
     self.keyTest = "cvtestdocs"
     pSize = len(self.cvDocs) // self.kfold
     ind = 0
     f1 = 0
     arrMetrics = []
     for i in range(self.kfold):
         print("Cross-validation, cycle %d of %d..." %
               ((i + 1), self.kfold))
         if i == 0:
             self.Config["cvtraindocs"] = self.cvDocs[pSize:]
             self.Config["cvtestdocs"] = self.cvDocs[:pSize]
         elif i == self.kfold - 1:
             self.Config["cvtraindocs"] = self.cvDocs[:ind]
             self.Config["cvtestdocs"] = self.cvDocs[ind:]
         else:
             self.Config["cvtraindocs"] = self.cvDocs[:ind] + self.cvDocs[
                 ind + pSize:]
             self.Config["cvtestdocs"] = self.cvDocs[ind:ind + pSize]
         ind += pSize
         self.prepareData()
         self.model = self.createModel()
         self.trainModel()
         self.testModel()
         arrMetrics.append(self.metrics)
         cycleF1 = self.metrics["all"]["f1"]
         print("Resulting F1-Measure: %f\n" % (cycleF1))
         if cycleF1 > f1:
             if self.Config["cvsave"]:
                 self.saveDataSets()
             f1 = cycleF1
     de = datetime.datetime.now()
     print("Cross-validation is done in %s" % (showTime(ds, de)))
     printAveragedMetrics(arrMetrics, self.Config)
     print("The best result is %f" % (f1))
     print("Corresponding data sets are saved in the folder %s" %
           (fullPath(self.Config, "cvpath")))
Example #17
 def testModel(self):
     print("Start testing...")
     print("Rank threshold: %.2f" % (self.rankThreshold))
     ds = datetime.datetime.now()
     if self.model_to_save is None:
         output_model_file = fullPath(self.Config,
                                      "bertoutpath",
                                      opt="name")
         model_state_dict = torch.load(output_model_file)
         model = BertForMultiLabelSequenceClassification.from_pretrained(
             self.args.bert_model,
             state_dict=model_state_dict,
             num_labels=self.num_labels)
         model.to(self.device)
         self.model = model  # evaluate the freshly loaded weights below
     eval_examples = self.processor.get_dev_examples(self.args.data_dir)
     eval_features = convert_examples_to_features(eval_examples,
                                                  self.label_list,
                                                  self.max_seq_length,
                                                  self.tokenizer)
     #self.logger.info("  Num examples = %d", len(eval_examples))
     #self.logger.info("  Batch size = %d", self.eval_batch_size)
     all_input_ids = torch.tensor([f.input_ids for f in eval_features],
                                  dtype=torch.long)
     all_input_mask = torch.tensor([f.input_mask for f in eval_features],
                                   dtype=torch.long)
     all_segment_ids = torch.tensor([f.segment_ids for f in eval_features],
                                    dtype=torch.long)
     all_label_ids = torch.tensor([f.label_id for f in eval_features],
                                  dtype=torch.float)
     eval_data = TensorDataset(all_input_ids, all_input_mask,
                               all_segment_ids, all_label_ids)
     # Run prediction for full data
     eval_sampler = SequentialSampler(eval_data)
     eval_dataloader = BertDataLoader(eval_data,
                                      sampler=eval_sampler,
                                      batch_size=self.eval_batch_size)
     self.model.eval()
     eval_loss, eval_accuracy = 0, 0
     nb_eval_steps, nb_eval_examples = 0, 0
     allLabs = None
     res = None
     initRes = True
     for input_ids, input_mask, segment_ids, label_ids in tqdm(
             eval_dataloader, desc="Evaluating"):
         input_ids = input_ids.to(self.device)
         input_mask = input_mask.to(self.device)
         segment_ids = segment_ids.to(self.device)
         label_ids = label_ids.to(self.device)
         with torch.no_grad():
             tmp_eval_loss = self.model(input_ids, segment_ids, input_mask,
                                        label_ids)
             logits = self.model(input_ids, segment_ids, input_mask)
         preds = logits.sigmoid().to('cpu').numpy()
         labs = label_ids.to('cpu').numpy()
         if initRes:
             res = preds
             allLabs = labs
             initRes = False
         else:
             res = numpy.concatenate((res, preds))
             allLabs = numpy.concatenate((allLabs, labs))
         tmp_eval_accuracy = accuracy(logits, label_ids, self.rankThreshold)
         eval_loss += tmp_eval_loss.mean().item()
         eval_accuracy += tmp_eval_accuracy
         nb_eval_examples += input_ids.size(0)
         nb_eval_steps += 1
     self.predictions = res
     self.testLabels = allLabs
     de = datetime.datetime.now()
     print("Test dataset containing %d documents predicted in %s\n" %
           (len(eval_examples), showTime(ds, de)))
     if self.Config["runfor"] != "crossvalidation":
         self.saveResources("torch")
     self.getMetrics()
     self.saveResults()
Example #18
 def trainModel(self):
     print("Start training...")
     ds = datetime.datetime.now()
     param_optimizer = [p for p in self.model.named_parameters()]
     no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
     optimizer_grouped_parameters = [{
         'params': [
             p for n, p in param_optimizer
             if not any(nd in n for nd in no_decay)
         ],
         'weight_decay':
         0.01
     }, {
         'params':
         [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay':
         0.0
     }]
     optimizer = BertAdam(optimizer_grouped_parameters,
                          lr=self.learning_rate,
                          warmup=self.warmup_proportion,
                          t_total=self.num_train_optimization_steps)
     global_step = 0
     nb_tr_steps = 0
     tr_loss = 0
     train_features = convert_examples_to_features(self.train_examples,
                                                   self.label_list,
                                                   self.max_seq_length,
                                                   self.tokenizer)
     logger = getLogger()
     logger.info("***** Running training *****")
     logger.info("  Num examples = %d", len(self.train_examples))
     logger.info("  Batch size = %d", self.train_batch_size)
     logger.info("  Num steps = %d", self.num_train_optimization_steps)
     all_input_ids = torch.tensor([f.input_ids for f in train_features],
                                  dtype=torch.long)
     all_input_mask = torch.tensor([f.input_mask for f in train_features],
                                   dtype=torch.long)
     all_segment_ids = torch.tensor([f.segment_ids for f in train_features],
                                    dtype=torch.long)
     all_label_ids = torch.tensor([f.label_id for f in train_features],
                                  dtype=torch.float)
     train_data = TensorDataset(all_input_ids, all_input_mask,
                                all_segment_ids, all_label_ids)
     train_sampler = RandomSampler(train_data)
     train_dataloader = BertDataLoader(train_data,
                                       sampler=train_sampler,
                                       batch_size=self.train_batch_size)
     self.model.train()
     for _ in trange(int(self.num_train_epochs), desc="Epoch"):
         tr_loss = 0
         nb_tr_examples, nb_tr_steps = 0, 0
         for step, batch in enumerate(
                 tqdm(train_dataloader, desc="Iteration")):
             batch = tuple(t.to(self.device) for t in batch)
             input_ids, input_mask, segment_ids, label_ids = batch
             loss = self.model(input_ids, segment_ids, input_mask,
                               label_ids)
             if self.n_gpu > 1:
                 loss = loss.mean()  # mean() to average on multi-gpu.
             if self.gradient_accumulation_steps > 1:
                 loss = loss / self.gradient_accumulation_steps
             loss.backward()
             tr_loss += loss.item()
             nb_tr_examples += input_ids.size(0)
             nb_tr_steps += 1
             if (step + 1) % self.gradient_accumulation_steps == 0:
                 optimizer.step()
                 optimizer.zero_grad()
                 global_step += 1
     de = datetime.datetime.now()
     print("Model is trained in %s" % (showTime(ds, de)))
     if self.Config["runfor"] == "crossvalidation":
         return
     print("Model evaluation...")
     eval_examples = self.processor.get_dev_examples(self.args.data_dir)
     eval_features = convert_examples_to_features(eval_examples,
                                                  self.label_list,
                                                  self.max_seq_length,
                                                  self.tokenizer)
     all_input_ids = torch.tensor([f.input_ids for f in eval_features],
                                  dtype=torch.long)
     all_input_mask = torch.tensor([f.input_mask for f in eval_features],
                                   dtype=torch.long)
     all_segment_ids = torch.tensor([f.segment_ids for f in eval_features],
                                    dtype=torch.long)
     all_label_ids = torch.tensor([f.label_id for f in eval_features],
                                  dtype=torch.float)
     eval_data = TensorDataset(all_input_ids, all_input_mask,
                               all_segment_ids, all_label_ids)
     eval_sampler = SequentialSampler(eval_data)
     eval_dataloader = BertDataLoader(eval_data,
                                      sampler=eval_sampler,
                                      batch_size=self.eval_batch_size)
     self.model.eval()
     eval_loss, eval_accuracy = 0, 0
     nb_eval_steps, nb_eval_examples = 0, 0
     allLabs = None
     res = None
     initRes = True
     for input_ids, input_mask, segment_ids, label_ids in tqdm(
             eval_dataloader, desc="Evaluating"):
         input_ids = input_ids.to(self.device)
         input_mask = input_mask.to(self.device)
         segment_ids = segment_ids.to(self.device)
         label_ids = label_ids.to(self.device)
         with torch.no_grad():
             tmp_eval_loss = self.model(input_ids, segment_ids, input_mask,
                                        label_ids)
             logits = self.model(input_ids, segment_ids, input_mask)
         preds = logits.sigmoid().to('cpu').numpy()
         labs = label_ids.to('cpu').numpy()
         if initRes:
             res = preds
             allLabs = labs
             initRes = False
         else:
             res = numpy.concatenate((res, preds))
             allLabs = numpy.concatenate((allLabs, labs))
         tmp_eval_accuracy = accuracy(logits, label_ids, self.rankThreshold)
         eval_loss += tmp_eval_loss.mean().item()
         eval_accuracy += tmp_eval_accuracy
         nb_eval_examples += input_ids.size(0)
         nb_eval_steps += 1
     eval_accuracy = eval_accuracy / nb_eval_examples
     print("Model accuracy: %.2f" % (eval_accuracy))
     # Save a trained model
     self.model_to_save = self.model.module if hasattr(
         self.model, 'module') else self.model
     output_model_file = fullPath(self.Config, "bertoutpath", opt="name")
     torch.save(self.model_to_save.state_dict(), output_model_file)
     print("Model is saved in %s" % (output_model_file))
Example #19
 def saveResources(self):
     tokOpts = [
         "actualtoks", "normalization", "stopwords", "expos", "extrawords",
         "maxseqlen", "maxcharsseqlen", "rttaggerpath"
     ]
     self.Config["resources"]["tokenization"] = {}
     ds = datetime.datetime.now()
     self.outDir = fullPath(self.Config, "resourcespath") + "/"
     for i in range(len(tokOpts)):
         if tokOpts[i] != "rttaggerpath":
             self.Config["resources"]["tokenization"][
                 tokOpts[i]] = self.Config[tokOpts[i]]
         elif self.Config["actualtoks"] == "yes":
             self.Config["resources"]["tokenization"]["rttaggerpath"] = \
                 self.copyFile(fullPath(self.Config, "rttaggerpath"))
     isW2VNeeded = False
     for key, val in self.Config["resources"]["models"].items():
         val["modelPath"] = self.copyFile(val["modelPath"])
         if "w2v" in val and val["w2v"] == "yes":
             isW2VNeeded = True
     if not isW2VNeeded and "w2v" in self.Config["resources"]:
         self.Config["resources"].pop("w2v", None)
     if "w2v" in self.Config["resources"]:
         w2vDict = {}
         isFirstLine = True
         fEmbeddings = open(self.Config["resources"]["w2v"]["modelPath"],
                            encoding="utf-8")
         for line in fEmbeddings:
             if isFirstLine:
                 isFirstLine = False
                 continue
             split = line.strip().split(" ")
             word = split[0]
             vector = numpy.array([float(num) for num in split[1:]])
             w2vDict[word] = vector
         fEmbeddings.close()
         with open(self.Config["resources"]["w2v"]["modelPath"] + '.pkl',
                   'wb') as file:
             pickle.dump(w2vDict, file, pickle.HIGHEST_PROTOCOL)
         file.close()
         self.Config["resources"]["w2v"]["modelPath"] = self.copyFile(
             self.Config["resources"]["w2v"]["modelPath"] + '.pkl')
     if "indexer" in self.Config["resources"]:
         self.Config["resources"]["indexer"] = self.copyFile(
             self.Config["resources"]["indexer"])
     if "vectorizer" in self.Config["resources"]:
         self.Config["resources"]["vectorizer"] = self.copyFile(
             self.Config["resources"]["vectorizer"])
     if "ptBertModel" in self.Config["resources"]:
         self.Config["resources"]["ptBertModel"] = self.copyFile(
             self.Config["resources"]["ptBertModel"])
         self.Config["resources"]["vocabPath"] = self.copyFile(
             self.Config["resources"]["vocabPath"])
     cNames = [''] * len(self.Config["cats"])
     for k, v in self.Config["cats"].items():
         cNames[v] = k
     with open(self.outDir + 'labels.txt', 'w', encoding="utf-8") as file:
         file.write(",".join(cNames))
     file.close()
     self.Config["resources"]["labels"] = "labels.txt"
     self.Config["resources"]["consolidatedRank"] = self.rankThreshold
     with open(self.outDir + 'config.json', 'w', encoding="utf-8") as file:
         json.dump(self.Config["resources"], file, indent=4)
     file.close()
     de = datetime.datetime.now()
     print("\nArtifacts are copied into the folder %s in %s" %
           (fullPath(self.Config, "resourcespath"), showTime(ds, de)))