Beispiel #1
0
 def load_w2v_model(self):
     """Load word2vec vectors from the configured "model_path" into Config.

     The object returned by ``load_word2vec_format`` is stored under
     ``Config["w2vmodel"]``; the time spent loading is printed.
     """
     print ("Load W2V model...")
     started = datetime.datetime.now()
     vectors = gensim.models.KeyedVectors.load_word2vec_format(
         get_abs_path(self.Config, "model_path"))
     self.Config["w2vmodel"] = vectors
     finished = datetime.datetime.now()
     model_file = get_abs_path(self.Config, "model_path")
     elapsed = get_formatted_date(started, finished)
     print("Load W2V model (%s) in %s" % (model_file, elapsed))
Beispiel #2
0
 def load_data(self):
     """Load training and testing documents into Config and print a summary.

     Categories and training documents are read from "train_data_path".
     Testing documents come either from "test_data_path" or, when
     ``self.splitTrain`` is set, from the tail of the shuffled training
     documents (``self.sz`` is the fraction kept for testing).  Both sets
     are stored in Config in a freshly randomized order.
     """
     if self.Config["enable_tokenization"] == "True":
         print("Start loading and preprocessing of data...")
     else:
         print ("Start loading data...")
     started = datetime.datetime.now()
     train_root = get_abs_path(self.Config, "train_data_path")
     self.Config["predefined_categories"] = self.get_categories(train_root)
     train_docs = self.get_data_docs(get_abs_path(self.Config, "train_data_path"))
     if self.splitTrain:
         # Carve the testing set out of the shuffled training documents.
         cut = int(len(train_docs) * (1 - self.sz))
         random.shuffle(train_docs)
         train_docs, test_docs = train_docs[:cut], train_docs[cut:]
     else:
         test_docs = self.get_data_docs(get_abs_path(self.Config, "test_data_path"))
     finished = datetime.datetime.now()
     # random.sample over the full length yields a shuffled copy.
     self.Config["train_docs"] = random.sample(train_docs, len(train_docs))
     self.Config["test_docs"] = random.sample(test_docs, len(test_docs))
     self.get_max_seq_len()
     self.get_max_chars_length()
     java_flags = ("enable_tokenization", "language_tokenization", "use_java")
     if all(self.Config[flag] == "True" for flag in java_flags):
         # Tell the external tokenizer process that no more input follows.
         self.jar.stdin.write('!!! STOP !!!\n')
         self.jar.stdin.flush()
     print ("Input data loaded in %s"%(get_formatted_date(started, finished)))
     for message, key in (
             ("Training set contains %d documents.", "train_docs"),
             ("Testing set contains %d documents.", "test_docs"),
             ("Documents belong to %d categories.", "predefined_categories")):
         print (message%(len(self.Config[key])))
Beispiel #3
0
    def run(self):
        """Create a word2vec model from the corpus file and save it as text.

        Every non-blank line of the file configured as "data_corpus_path" is
        split on whitespace and used as one training sentence.  The model is
        trained for ``Config["epochs_total"]`` epochs and its vectors are
        written into the directory of the configured "model_path".  When
        ``Config["include_current_time_in_model_name"]`` is truthy, a
        timestamp is inserted before the file extension (or appended when
        the name has no extension).
        """
        print("Start to create W2V model...")
        print("Get input data...")
        corpus_path = get_abs_path(self.Config, "data_corpus_path")
        sentences = []
        ds = datetime.datetime.now()
        with open(corpus_path, 'r', encoding='UTF-8') as f:
            for line in f:
                words = line.split()
                if words:  # skip blank / whitespace-only lines
                    sentences.append(words)
        de = datetime.datetime.now()
        print("Got %d lines from file %s in %s" %
              (len(sentences), corpus_path, get_formatted_date(ds, de)))
        numpy.random.shuffle(sentences)

        logger = EpochLogger(self.epochs)
        w2v = Word2Vec(size=self.ndim, window=10, min_count=3, workers=10)
        ds = datetime.datetime.now()
        print("Build vocabulary...")
        w2v.build_vocab(sentences)
        de = datetime.datetime.now()
        print("Vocabulary is built in %s" % (get_formatted_date(ds, de)))
        print("Train model...")
        ds = datetime.datetime.now()
        w2v.train(sentences,
                  epochs=int(self.Config["epochs_total"]),
                  total_examples=len(sentences),
                  callbacks=[logger])
        de = datetime.datetime.now()
        print("W2V model is completed in %s" % (get_formatted_date(ds, de)))

        created_model_path = get_abs_path(self.Config, "model_path")
        # Always start from the configured file name: previously the name was
        # only assigned inside the timestamp branch, so saving without a
        # timestamp raised NameError.
        model_name = os.path.basename(created_model_path)
        if self.Config["include_current_time_in_model_name"]:
            stamp = "-" + datetime.datetime.now().strftime("%Y-%b-%d-%H%M%S")
            dot = model_name.rfind(".")
            if dot > 0:
                model_name = model_name[:dot] + stamp + model_name[dot:]
            else:
                model_name += stamp
        final_path = os.path.join(os.path.dirname(created_model_path),
                                  model_name)
        ds = datetime.datetime.now()
        w2v.wv.save_word2vec_format(final_path, binary=False)
        de = datetime.datetime.now()
        print("W2V model %s is saved in the text format in %s\n" %
              (final_path, get_formatted_date(ds, de)))
Beispiel #4
0
 def trainNNModel(self):
     """Train the neural model, evaluate it and save the best version.

     During cross-validation (``self.isCV``) the method only trains and
     returns.  Otherwise the trained model is saved, evaluated on the test
     arrays and, when intermediate results are enabled, compared with the
     checkpoint written by ``ModelCheckpoint``; the more accurate of the two
     becomes ``self.model`` and is what ends up saved at the configured
     "created_model_path".
     """
     checkpoints = []
     if self.save_intermediate_results and not self.isCV:
         checkpoints.append(ModelCheckpoint(
             get_abs_path(self.Config, "intermediate_results_path") +
             "/tempModel.hdf5",
             monitor='val_acc',
             verbose=self.verbose,
             save_best_only=True,
             mode='auto'))
     print("Start training...              ")
     ds = datetime.datetime.now()
     self.model.fit(self.trainArrays,
                    self.trainLabels,
                    epochs=self.epochs,
                    validation_data=(self.valArrays, self.valLabels),
                    batch_size=self.train_batch,
                    verbose=self.verbose,
                    callbacks=checkpoints,
                    shuffle=False)
     de = datetime.datetime.now()
     print("Model is trained in %s" % (get_formatted_date(ds, de)))
     if self.isCV:
         return
     model_path = get_abs_path(self.Config, "created_model_path", opt="name")
     # Save right away so the trained model survives a failing evaluation.
     self.model.save(model_path)
     print("Model evaluation...")
     scores1 = self.model.evaluate(self.testArrays,
                                   self.testLabels,
                                   verbose=self.verbose)
     print("Final model accuracy: %.2f%%" % (scores1[1] * 100))
     if self.save_intermediate_results:
         model1 = load_model(
             get_abs_path(self.Config, "intermediate_results_path") +
             "/tempModel.hdf5")
         scores2 = model1.evaluate(self.testArrays,
                                   self.testLabels,
                                   verbose=self.verbose)
         print("Last saved model accuracy: %.2f%%" % (scores2[1] * 100))
         if scores1[1] < scores2[1]:
             # The checkpoint beats the final epoch: make it the model that
             # is saved below.  (Previously it was assigned to an unused
             # local, so the final model was always the one saved.)
             self.model = model1
         pref = "The best model "
     else:
         pref = "Model "
     self.model.save(model_path)
     print(pref + "is saved in %s" % model_path)
Beispiel #5
0
 def load_w2v_model(self):
     """Make word2vec vectors available as ``self.w2vModel``.

     Reuses the object already stored under ``Config["w2vmodel"]`` when there
     is one; otherwise loads the vectors from the configured "model_path" and
     records that path and ``self.ndim`` under ``Config["resources"]["w2v"]``.
     """
     # Identity test: "!= None" would invoke the loaded object's own
     # comparison operators, which need not return a plain bool.
     if self.Config["w2vmodel"] is not None:
         print("W2V model is already loaded...")
         self.w2vModel = self.Config["w2vmodel"]
         return
     print("Load W2V model... ")
     ds = datetime.datetime.now()
     self.w2vModel = gensim.models.KeyedVectors.load_word2vec_format(
         get_abs_path(self.Config, "model_path"))
     de = datetime.datetime.now()
     print("Load W2V model (%s) in %s" % (get_abs_path(
         self.Config, "model_path"), get_formatted_date(ds, de)))
     self.Config["resources"]["w2v"]["created_model_path"] = get_abs_path(
         self.Config, "model_path")
     self.Config["resources"]["w2v"]["ndim"] = self.ndim
Beispiel #6
0
def start_server(Config):
    """Start the Stanford CoreNLP server in a background thread.

    The process changes into the folder configured as "servsource" (the
    server is launched with a ``*`` classpath relative to it), starts the
    server on ``Config["servport"]`` and waits 10 seconds before reporting it
    as running.  When the server process ends, the working directory that was
    current at call time is restored.
    """
    stanford_path = get_abs_path(Config, "servsource") + "/"
    # Capture the directory to return to *before* changing it; the visible
    # code never assigned initial_dir, so restoring relied on an outside name.
    initial_dir = os.getcwd()
    os.chdir(stanford_path)
    os.environ["CLASSPATH"] = "*"
    port = str(Config["servport"])
    command = (
        'java -Xmx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -serverProperties '
        + stanford_path +
        'StanfordCoreNLP-arabic.properties -preload tokenize,ssplit,pos '
        + '-status_port ' + port + ' -port ' + port + ' -timeout 20000')

    def restore_initial_dir():
        os.chdir(initial_dir)
        print("Server is down")

    def serve():
        # Blocks until the server exits; always restore the directory,
        # even when the process could not be started.
        try:
            srv = Popen(command, shell=True)
            srv.wait()
        finally:
            restore_initial_dir()

    threading.Thread(target=serve).start()
    time.sleep(10)
    print("Server is running")
Beispiel #7
0
 def run(self):
     """Tokenize a set of documents by running the external tagger jar.

     Raises:
         ValueError: if the jar path is missing or does not exist, or if the
             source path is empty or equal to the target path.
     """
     lib_path = get_abs_path(self.Config,
                             "set_of_docs_lang_tokenization_lib_path")
     print("GRISHA use set_of_docs_lang_tokenization")
     if not lib_path or not os.path.exists(lib_path):
         raise ValueError(
             "Wrong path to the tagger's jar. Tokenization can't be done")
     # Validate before concatenating, so an empty/None source path produces
     # the intended ValueError rather than a TypeError.
     if not self.Config["source_path"] or self.Config[
             "source_path"] == self.Config["target_path"]:
         raise ValueError(
             "Wrong source/target path(s). Tokenization can't be done.")
     in_path = self.Config["home"] + "/" + self.Config["source_path"]
     out_path = self.Config["home"] + "/" + self.Config["target_path"]
     stop_words = ",".join(stopwords.words(
         'arabic')) if self.Config["stop_words"] == "True" else ""
     ds = datetime.datetime.now()
     # NOTE(review): the command is a shell string built from configuration
     # values; keep those values trusted.
     srv = subprocess.Popen(
         'java -Xmx2g -jar ' + lib_path + ' "' + in_path + '" "' +
         out_path + '" "' + self.Config["exclude_positions"] + '" "' +
         stop_words + '" "' + self.Config["extra_words"] + '" "' +
         self.Config["normalization"] + '" "' +
         self.Config["language_tokenization"] + '"',
         stdout=subprocess.PIPE,
         stderr=subprocess.PIPE,
         shell=True)
     # communicate() drains both pipes while waiting for the process;
     # calling wait() first can deadlock once a pipe buffer fills up.
     reply = srv.communicate()
     de = datetime.datetime.now()
     print(reply[0].decode())
     print("All process is done in %s" % (get_formatted_date(ds, de)))
Beispiel #8
0
 def save_additions(self):
     """Mark this model's resources as needing w2v vectors and the indexer.

     Registers the indexer path in the shared ``Config["resources"]`` when
     it is not there yet, and sets the handle type to "wordVectorsMatrix".
     """
     self.resources["w2v"] = "True"
     shared = self.Config["resources"]
     if "indexer" not in shared:
         shared["indexer"] = get_abs_path(self.Config, "indexer_path")
     for name, value in (("indexer", "True"),
                         ("handleType", "wordVectorsMatrix")):
         self.resources[name] = value
Beispiel #9
0
 def save_data_sets(self):
     """Write the current cross-validation train/test documents to disk.

     The folder configured as "cross_validations_datasets_path" is emptied
     (or created) and filled with ``train/<label>/<doc name>`` and
     ``test/<label>/<doc name>`` files; a document with several labels is
     written once per label.
     """
     root = get_abs_path(self.Config, "cross_validations_datasets_path")
     # The folder may not exist yet on the first save.
     if os.path.isdir(root):
         shutil.rmtree(root)
     os.makedirs(root)
     subsets = (("train", "cross_validations_train_docs"),
                ("test", "cross_validations_test_docs"))
     for subset, docs_key in subsets:
         subset_path = root + "/" + subset
         os.mkdir(subset_path)
         for doc in self.Config[docs_key]:
             for nlab in doc.nlabs:
                 fold_path = subset_path + "/" + nlab
                 os.makedirs(fold_path, exist_ok=True)
                 with open(fold_path + '/' + doc.name, 'w',
                           encoding="utf-8") as out:
                     out.write(doc.lines)
Beispiel #10
0
def tokens_from_tagger(Config):
    """Tokenize the configured source documents with the external tagger jar.

    Builds a ``java -jar`` command from the configuration, runs it, prints
    the process' standard output and the elapsed time.
    """
    print("GRISHA tokens_from_tagger()")
    test_path(Config, "set_of_docs_lang_tokenization_lib_path",
              "Wrong path to the tagger's jar. Tokenization can't be done")
    tagger_path = get_abs_path(Config,
                               "set_of_docs_lang_tokenization_lib_path")
    source_path = Config["home"] + "/" + Config["source_path"]
    target_path = Config["home"] + "/" + Config["target_path"]
    stop_words = ",".join(
        stopwords.words('arabic')) if Config["stop_words"] == "True" else ""
    ds = datetime.datetime.now()
    # NOTE(review): the command is a shell string built from configuration
    # values; keep those values trusted.
    srv = subprocess.Popen(
        'java -Xmx2g -jar ' + tagger_path + ' "' + source_path + '" "' +
        target_path + '" "' + Config["exclude_positions"] + '" "' +
        stop_words + '" "' + Config["extra_words"] + '" "' +
        Config["normalization"] + '" "' + Config["language_tokenization"] +
        '"',
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        shell=True)
    # communicate() reads both pipes until EOF and then waits; a preceding
    # wait() can deadlock when the child fills a pipe buffer.
    reply = srv.communicate()
    de = datetime.datetime.now()
    print(reply[0].decode())
    print("All process is done in %s" % (get_formatted_date(ds, de)))
Beispiel #11
0
    def run(self):
        """Validate data-loading settings, prepare tokenization tools, load data.

        Steps, in order: check the train/test paths (or the test split size
        when no test path is configured), set up the tokenizer, stop words
        and normalizer when tokenization is enabled, optionally load the W2V
        model, load the data and optionally run the analysis.

        Raises:
            ValueError: for a wrong test-set size, a wrong W2V model path or
                a non-integer vectors dimension.
        """
        cfg = self.Config
        test_path(cfg, "train_data_path", "Wrong path to training set. Data can't be loaded.")
        if cfg["test_data_path"]:
            test_path(cfg, "test_data_path", "Wrong path to testing set. Data can't be loaded.")
        else:
            # No testing set given: it will be split off the training set.
            self.splitTrain = True
            try:
                self.sz = float(cfg["test_data_size"])
            except ValueError:
                self.sz = 0
            if not cfg["test_data_path"] and (self.sz <= 0 or self.sz >= 1):
                raise ValueError("Wrong size of testing set. Data can't be loaded.")

        if cfg["enable_tokenization"] == "True":
            if cfg["language_tokenization"] == "True":
                print("GRISHA use single_doc_lang_tokenization")
                if cfg["use_java"] != "True":
                    self.nlp_tokenizer = stanfordnlp.Pipeline(lang="ar", processors='tokenize,mwt', use_gpu=True)
                else:
                    test_path(cfg, 'single_doc_lang_tokenization_lib_path',
                              "Wrong path to the tagger's jar. Preprocessing can't be done.")
                    jar_path = get_abs_path(cfg, 'single_doc_lang_tokenization_lib_path')
                    launch = 'java -Xmx2g -jar ' + jar_path + ' "' + cfg["exclude_positions"] + '"'
                    self.jar = subprocess.Popen(
                        launch,
                        stdin=subprocess.PIPE,
                        stdout=subprocess.PIPE,
                        stderr=subprocess.PIPE,
                        shell=True,
                        encoding="utf-8")
            if cfg["stop_words"] == "True":
                self.stop_words = set(nltk.corpus.stopwords.words('arabic'))
            else:
                self.stop_words = set()
            if cfg["normalization"] == "True":
                self.normalizer = ArabicNormalizer()

        if cfg["load_w2v_model"] != "True":
            cfg["w2vmodel"] = None
        else:
            if not (cfg["model_path"] and os.path.isfile(get_abs_path(cfg, "model_path"))):
                raise ValueError("Wrong path to W2V model. Stop.")
            try:
                self.ndim = int(cfg["vectors_dimension"])
            except ValueError:
                raise ValueError("Wrong size of vectors' dimentions. Stop.")
            w2v_resources = cfg["resources"]["w2v"]
            w2v_resources["created_model_path"] = get_abs_path(cfg, "model_path")
            w2v_resources["ndim"] = self.ndim
            self.load_w2v_model()

        self.load_data()
        if cfg["analysis"] == "True":
            self.analysis()
Beispiel #12
0
 def trainSKLModel(self):
     """Fit the scikit-learn model, then (outside cross-validation) save and score it.

     During cross-validation (``self.isCV``) only the fit is performed.
     The "Evaluated in" time is measured from the end of training, so it
     also covers saving the model.
     """
     fit_started = datetime.datetime.now()
     print("Start training...")
     self.model.fit(self.trainArrays, self.trainLabels)
     fit_finished = datetime.datetime.now()
     print("Model is trained in %s" % (get_formatted_date(fit_started, fit_finished)))
     if self.isCV:
         return
     joblib.dump(
         self.model,
         get_abs_path(self.Config, "created_model_path", opt="name"))
     saved_to = get_abs_path(self.Config, "created_model_path", opt="name")
     print("Model is saved in %s" % saved_to)
     print("Model evaluation...")
     predicted = self.model.predict(self.testArrays)
     accuracy = accuracy_score(self.testLabels, predicted)
     print('Final accuracy is %.2f' % accuracy)
     eval_finished = datetime.datetime.now()
     print("Evaluated in %s" % get_formatted_date(fit_finished, eval_finished))
Beispiel #13
0
 def run(self):
     """Consolidate model results, optionally save reports and runtime resources.

     Returns early when ``Config["results"]`` is empty.  Reports are saved
     only when "save_reports" is "True" and "reports_path" is an existing
     folder; runtime resources are collected only when
     "prepare_resources_for_runtime" is "True" and "saved_resources_path" is
     an existing folder (its content is deleted first).
     """
     print("\nCalculate consolidated metrics...")
     cfg = self.Config
     if not cfg["results"]:
         print(
             "No results to consolidate them. Consolidation can not be performed."
         )
         return

     if cfg["save_reports"] == "True":
         if cfg["reports_path"] and os.path.isdir(
                 get_abs_path(cfg, "reports_path")):
             self.save_reports = True
         else:
             print(
                 "Wrong path to the folder, containing reports. Reports can not be created."
             )

     if cfg["prepare_resources_for_runtime"] == "True":
         if cfg["saved_resources_path"] and os.path.isdir(
                 get_abs_path(cfg, "saved_resources_path")):
             self.runtime = True
         else:
             print(
                 "Wrong path to the folder, containing resources for runtime. Resources can not be saved."
             )

     print("Rank threshold for consolidated results: %.2f" %
           (self.rank_threshold))
     if self.save_reports or cfg["show_consolidated_results"] == "True":
         self.getConsolidatedResults()
         self.get_metrics()
         if self.save_reports:
             self.saveReports()

     if not self.runtime:
         return
     target_dir = get_abs_path(cfg, "saved_resources_path")
     if os.listdir(target_dir):
         print(
             "Warning: folder %s is not empty. All its content will be deleted."
             % target_dir)
         # Recreate the folder empty.
         shutil.rmtree(target_dir)
         os.makedirs(target_dir, exist_ok=True)
     print("\nCollect arfifacts for runtime...")
     self.prepare_resources_for_runtime()
Beispiel #14
0
    def saveReports(self):
        """Build a ``Report`` from Config and write it as ``<reqid>.json``.

        The report collects the preprocessing options, the actual labels of
        every test document, the labels each model assigned (those whose
        score reaches that model's rank in ``Config["ranks"]``), the models'
        metrics and ranks and, when more than one model produced results,
        the consolidated labels from ``self.predictions``.
        """
        print("Save report...")
        cfg = self.Config
        report = Report()
        report.requestId = cfg["reqid"]
        report.sourcesPath = cfg["actual_path"]
        report.datasetPath = cfg["test_data_path"]

        for option in ("language_tokenization", "normalization", "stop_words",
                       "exclude_positions", "extra_words",
                       "exclude_categories"):
            report.preprocess[option] = cfg[option]
        for doc in cfg["test_docs"]:
            report.docs[doc.name] = {}
            report.docs[doc.name]["actual"] = ",".join(doc.nlabs)

        if cfg["exclude_categories"]:
            exclude_categories = cfg["exclude_categories"].split(",")
        else:
            exclude_categories = []
        # Category names ordered by their numeric index.
        cNames = [''] * (len(cfg["predefined_categories"]) -
                         len(exclude_categories))
        for category, index in cfg["predefined_categories"].items():
            if category not in exclude_categories:
                cNames[index] = category
        report.categories = cNames

        for model_key, scores in cfg["results"].items():
            for i, row in enumerate(scores):
                labs = [
                    "%s[%.2f]" % (cNames[j], row[j])
                    for j in range(self.qLabs)
                    if row[j] >= cfg["ranks"][model_key]
                ]
                doc_name = cfg["test_docs"][i].name
                report.docs[doc_name][model_key] = ",".join(labs)
        for model_key, metrics in cfg["metrics"].items():
            report.models[model_key] = metrics
        for model_key, rank in cfg["ranks"].items():
            report.ranks[model_key] = rank

        if len(cfg["results"]) > 1:
            for i, row in enumerate(self.predictions):
                labs = [cNames[j] for j in range(self.qLabs) if row[j] == 1]
                report.docs[cfg["test_docs"][i].name]["consolidated"] = \
                    ",".join(labs)
            report.models["consolidated"] = self.rank_threshold

        rPath = get_abs_path(cfg, "reports_path") + "/" + cfg["reqid"] + ".json"
        with open(rPath, 'w', encoding="utf-8") as out:
            json.dump(report.toJSON(), out, indent=4)
Beispiel #15
0
 def prepare_resources_for_runtime(self, type):
     """Describe this model in ``self.resources`` and register it in Config.

     Args:
         type: model type label stored as "modelType"; the value "skl"
             switches the handle type to "vectorize".
     """
     res = self.resources
     res["created_model_path"] = get_abs_path(
         self.Config, "created_model_path", opt="name")
     res["modelType"] = type
     # Without probabilities the threshold is fixed at 1.0.
     res["rank_threshold"] = (
         self.rank_threshold if self.useProbabilities else 1.0)
     self.save_additions()
     if type == "skl":
         self.resources["handleType"] = "vectorize"
     model_key = "Model" + str(self.Config["modelid"])
     self.Config["resources"]["models"][model_key] = self.resources
 def getDataForSklearnClassifiers(self):
     """Vectorize documents and binarize labels for scikit-learn classifiers.

     Unless the execution type is "test", a TF-IDF vectorizer and a
     multi-label binarizer are fitted on the training documents and, outside
     cross-validation, pickled to "vectorizer_path" / "binarizer_path".  In
     "test" mode both are loaded from those files.  The testing documents
     are then transformed with the already fitted objects.
     """
     cfg = self.model.Config
     mlb = None
     ds = datetime.datetime.now()
     if cfg["type_of_execution"] != "test":
         # Category names ordered by their numeric index.
         nmCats = [""] * len(cfg["predefined_categories"])
         for name, index in cfg["predefined_categories"].items():
             nmCats[index] = name
         mlb = MultiLabelBinarizer(classes=nmCats)
         train_texts = [x.lines for x in cfg[self.keyTrain]]
         train_labels = [x.nlabs for x in cfg[self.keyTrain]]
         wev = TfidfVectorizer(ngram_range=(1, 3), max_df=0.50).fit(
             train_texts, train_labels)
         self.model.trainArrays = wev.transform(train_texts)
         self.model.trainLabels = mlb.fit_transform(train_labels)
         if not self.model.isCV:
             with open(get_abs_path(cfg, "binarizer_path"), 'wb') as handle:
                 pickle.dump(mlb, handle, protocol=pickle.HIGHEST_PROTOCOL)
             with open(get_abs_path(cfg, "vectorizer_path"), 'wb') as handle:
                 pickle.dump(wev, handle, protocol=pickle.HIGHEST_PROTOCOL)
     if mlb is None:
         # NOTE(review): pickle.load executes code embedded in the file;
         # these paths must only point at files written by this pipeline.
         with open(get_abs_path(cfg, "binarizer_path"), 'rb') as handle:
             mlb = pickle.load(handle)
         with open(get_abs_path(cfg, "vectorizer_path"), 'rb') as handle:
             wev = pickle.load(handle)
     test_docs = cfg[self.keyTest]
     self.model.testArrays = wev.transform([x.lines for x in test_docs])
     # The binarizer is already fitted (above, or before it was pickled):
     # only transform the test labels instead of re-fitting on test data.
     self.model.testLabels = mlb.transform([x.nlabs for x in test_docs])
     de = datetime.datetime.now()
     print("Prepare all data in %s" % (get_formatted_date(ds, de)))
Beispiel #17
0
 def launch_crossvalidation(self):
     """Run all cross-validation cycles and print the averaged metrics.

     ``self.cvDocs`` is cut into ``self.cross_validations_total`` consecutive
     folds of equal size (the last fold also takes the remainder).  In each
     cycle one fold is the testing set and the rest is the training set; a
     fresh model is created, trained and tested.  Whenever a cycle improves
     the best F1 and "save_cross_validations_datasets" is truthy, that
     cycle's data sets are saved.
     """
     print("Start cross-validation...")
     started = datetime.datetime.now()
     dp = DataPreparation(self, self.addValSet)
     fold_size = len(self.cvDocs) // self.cross_validations_total
     best_f1 = 0
     attr_metrics = []
     for cycle in range(self.cross_validations_total):
         print("Cross-validation, cycle %d from %d..." %
               ((cycle + 1), self.cross_validations_total))
         fold_start = cycle * fold_size
         if cycle == self.cross_validations_total - 1:
             # The last fold runs to the end of the documents.
             fold_end = len(self.cvDocs)
         else:
             fold_end = fold_start + fold_size
         self.Config["cross_validations_train_docs"] = (
             self.cvDocs[:fold_start] + self.cvDocs[fold_end:])
         self.Config["cross_validations_test_docs"] = (
             self.cvDocs[fold_start:fold_end])
         dp.getVectors(self.handleType)
         self.model = self.create_model()
         self.train_model()
         self.test_model()
         ModelMetrics(self)
         attr_metrics.append(self.metrics)
         cycleF1 = self.metrics["all"]["f1"]
         print("Resulting F1-Measure: %f\n" % cycleF1)
         if cycleF1 > best_f1:
             if self.Config["save_cross_validations_datasets"]:
                 self.save_data_sets()
             best_f1 = cycleF1
     finished = datetime.datetime.now()
     print("Cross-validation is done in %s" %
           get_formatted_date(started, finished))
     print_averaged_metrics(attr_metrics, self.Config)
     print("The best result is %f" % (best_f1))
     print("Corresponding data sets are saved in the folder %s" %
           get_abs_path(self.Config, "cross_validations_datasets_path"))
Beispiel #18
0
def compose_tsv(model, type):
    """Write the documents of one data set as a TSV file for BERT.

    Each document becomes one line ``<labels joined by ','>\\t<text>``, with
    carriage returns removed from the text and newlines replaced by '.'.
    Lines are separated by a newline, with none after the last line.

    Args:
        model: object exposing ``Config``, ``keyTrain`` and ``keyTest``.
        type: "train" writes the training documents to ``train.tsv``; any
            other value writes the testing documents to ``dev.tsv``.
    """
    is_train = type == "train"
    tsv_path = get_abs_path(model.Config, "resulting_bert_files_path",
                            opt=("/train.tsv" if is_train else "/dev.tsv"))
    # train.tsv must hold the training documents; previously the testing
    # documents were written for both types.
    data = model.Config[model.keyTrain if is_train else model.keyTest]
    rows = (",".join(doc.nlabs) + "\t" +
            doc.lines.replace('\r', '').replace('\n', '.')
            for doc in data)
    with open(tsv_path, "w", encoding="utf-8") as target:
        target.write("\n".join(rows))
Beispiel #19
0
 def __init__(self, Config):
     """Collect the JSON reports from "reports_path" and build the HTML info.

     Only files whose name (without its last extension) is not smaller than
     the start id are read; the start id is today's date, or the date the
     number of days given in ``Config["info_from"]`` earlier, followed by
     "000000".  The working directory is changed to the reports folder.
     Returns early, before the HTML is created, when no report qualifies.
     """
     print ("Start to create info...")
     self.Config = Config
     self.curDir = os.path.dirname(__file__)
     self.info = {}
     first_day = date.today()
     if self.Config["info_from"] != "today":
         # The first whitespace-separated token is the number of days back.
         days_back = int(self.Config["info_from"].split()[0])
         first_day = date.today() - timedelta(days=days_back)
     self.startId = "%d%0.2d%0.2d000000" % (first_day.year, first_day.month, first_day.day)
     self.path = get_abs_path(Config, "reports_path")
     os.chdir(self.path)
     for file_name in glob.glob("*"):
         dot = file_name.rfind(".")
         key = file_name if dot < 0 else file_name[:dot]
         if key < self.startId:
             continue
         resPath = self.path + "/" + file_name
         with open(resPath, 'r', encoding='utf-8') as json_file:
             try:
                 self.info[key] = json.load(json_file)
             except json.JSONDecodeError:
                 print ("Warning: file %s doesn't have json format. Skipped." % resPath)
     if not self.info:
         print ("Folder %s doesn't contain reports, created in required diapason of dates. Exit." % self.path)
         return
     self.html = ""
     self.qReqs = 0
     self.footer = "</table></body></html>"
     self.docsDict = self.getDocsDictionary()
     self.createHtml()
Beispiel #20
0
 def prepare_resources_for_runtime(self):
     """Copy every artifact needed at runtime into "saved_resources_path".

     Fills ``Config["resources"]`` with the tokenization options and with the
     paths returned by ``self.copyFile`` for the copied artifacts (models,
     tokenizer library, W2V vectors converted to a pickled dictionary,
     indexer, vectorizer, BERT files), then writes ``labels.txt`` and
     ``config.json`` into the output folder.
     """
     tokenization_options = [
         "language_tokenization", "normalization", "stop_words",
         "exclude_positions", "extra_words", "max_seq_len",
         "max_chars_seq_len", "single_doc_lang_tokenization_lib_path"
     ]
     self.Config["resources"]["tokenization"] = {}
     ds = datetime.datetime.now()
     self.outDir = get_abs_path(self.Config, "saved_resources_path") + "/"
     for t in tokenization_options:
         if t != "single_doc_lang_tokenization_lib_path":
             self.Config["resources"]["tokenization"][t] = self.Config[t]
         elif self.Config["language_tokenization"] == "True":
             # The library itself is copied only when it will be used.
             self.Config["resources"]["tokenization"]["single_doc_lang_tokenization_lib_path"] = \
                 self.copyFile(get_abs_path(self.Config, "single_doc_lang_tokenization_lib_path"))
     isW2VNeeded = False
     for val in self.Config["resources"]["models"].values():
         val["created_model_path"] = self.copyFile(
             val["created_model_path"])
         if "w2v" in val and val["w2v"] == "True":
             isW2VNeeded = True
     if not isW2VNeeded:
         self.Config["resources"].pop("w2v", None)
     if "w2v" in self.Config["resources"]:
         w2v_text_path = self.Config["resources"]["w2v"]["created_model_path"]
         w2vDict = {}
         # The context manager closes the file even if a line fails to parse.
         with open(w2v_text_path, encoding="utf-8") as fEmbeddings:
             next(fEmbeddings, None)  # the first line is skipped, not parsed
             for line in fEmbeddings:
                 parts = line.strip().split(" ")
                 w2vDict[parts[0]] = numpy.array(
                     [float(num) for num in parts[1:]])
         with open(w2v_text_path + '.pkl', 'wb') as file:
             pickle.dump(w2vDict, file, pickle.HIGHEST_PROTOCOL)
         self.Config["resources"]["w2v"]["created_model_path"] = \
             self.copyFile(w2v_text_path + '.pkl')
     if "indexer" in self.Config["resources"]:
         self.Config["resources"]["indexer"] = self.copyFile(
             self.Config["resources"]["indexer"])
     if "vectorizer" in self.Config["resources"]:
         self.Config["resources"]["vectorizer"] = self.copyFile(
             self.Config["resources"]["vectorizer"])
     if "ptBertModel" in self.Config["resources"]:
         self.Config["resources"]["ptBertModel"] = self.copyFile(
             self.Config["resources"]["ptBertModel"])
         self.Config["resources"]["vocabPath"] = self.copyFile(
             self.Config["resources"]["vocabPath"])
     # Category names ordered by their numeric index.
     cNames = [''] * len(self.Config["predefined_categories"])
     for k, v in self.Config["predefined_categories"].items():
         cNames[v] = k
     with open(self.outDir + 'labels.txt', 'w', encoding="utf-8") as file:
         file.write(",".join(cNames))
     self.Config["resources"]["labels"] = "labels.txt"
     self.Config["resources"]["consolidatedRank"] = self.rank_threshold
     with open(self.outDir + 'config.json', 'w', encoding="utf-8") as file:
         json.dump(self.Config["resources"], file, indent=4)
     de = datetime.datetime.now()
     print("\nArtifacts are copied into the folder %s in %s" %
           (get_abs_path(self.Config, "saved_resources_path"),
            get_formatted_date(ds, de)))
Beispiel #21
0
 def loadSKLModel(self):
     """Load with joblib and return the model stored at the configured model path."""
     model_file = get_abs_path(self.Config, "created_model_path", opt="name")
     return joblib.load(model_file)
Beispiel #22
0
 def loadNNModel(self):
     """Return the model that ``load_model`` reads from the configured model path."""
     model_file = get_abs_path(self.Config, "created_model_path", opt="name")
     return load_model(model_file)
Beispiel #23
0
 def save_additions(self):
     """Mark this model's resources as needing the vectorizer.

     Registers the vectorizer path in the shared ``Config["resources"]``
     when it is not there yet.
     """
     shared = self.Config["resources"]
     if "vectorizer" not in shared:
         shared["vectorizer"] = get_abs_path(self.Config, "vectorizer_path")
     self.resources["vectorizer"] = "True"
Beispiel #24
0
 def run(self):
     """Validate the run configuration and construct the requested model.

     Reads settings from ``self.Config``, checks the data paths, the models'
     folder, the training parameters, the custom rank threshold and the
     cross-validation settings, and stores the parsed values on ``self``
     (``test_data_size``, ``epochs``, ``train_batch``, ``verbose``,
     ``rank_threshold``). If ``Config["name"]`` is empty it is set to
     ``Config["type"] + str(Config["modelid"])``.

     Finally the model class selected by ``Config["type"]`` (matched
     case-insensitively) is instantiated with the configuration; the
     created object is neither stored nor returned here. If the type
     matches none of the known identifiers, nothing is constructed.

     Raises:
         ValueError: if one of the checked settings is invalid.
     """
     try:
         self.test_data_size = float(self.Config["test_data_size"])
     except ValueError:
         # -1 lies outside (0, 1), so the "split test set from training
         # data" exemption below cannot apply.
         self.test_data_size = -1
     execution = self.Config["type_of_execution"]

     # A missing training folder is tolerated only when testing with an
     # explicit test path.
     if not correct_path(self.Config, "train_data_path"):
         if execution != "test" or not self.Config["test_data_path"]:
             raise ValueError(
                 "Wrong path to the training set: folder %s doesn't exist."
                 % get_abs_path(self.Config, "train_data_path"))

     # A missing test folder is tolerated only when no test path is given
     # and test_data_size is a ratio strictly between 0 and 1.
     if not correct_path(self.Config, "test_data_path"):
         split_from_train = (not self.Config["test_data_path"]
                             and 0 < self.test_data_size < 1)
         if not split_from_train:
             # The path is a string: "%d" here raised TypeError instead of
             # the intended ValueError.
             raise ValueError(
                 "Wrong path to the testing set: folder %s doesn't exist." %
                 get_abs_path(self.Config, "test_data_path"))

     test_path(self.Config, "created_model_path",
               "Wrong path to the models' folder.")
     if not self.Config["name"]:
         self.Config["name"] = self.Config["type"] + str(
             self.Config["modelid"])
     mPath = get_abs_path(self.Config, "created_model_path", opt="name")
     if execution == "test" and not os.path.isfile(mPath):
         raise ValueError("Wrong path to the tested model.")

     if execution != "test":
         try:
             self.epochs = int(self.Config["epochs"])
         except ValueError as err:
             raise ValueError(
                 "Wrong quantity of epochs for training.") from err
         try:
             self.train_batch = int(self.Config["train_batch"])
         except ValueError as err:
             raise ValueError("Wrong batch size for training.") from err
         try:
             self.verbose = int(self.Config["verbose"])
         except ValueError as err:
             raise ValueError(
                 "Wrong value of 'verbose' flag for training.") from err
         if self.Config["save_intermediate_results"] == "True":
             if not self.Config["intermediate_results_path"] or \
                     not os.path.isdir(get_abs_path(self.Config, "intermediate_results_path")):
                 raise ValueError(
                     "Wrong path to folder with intermediate results.")

     # NOTE: a check of the "infopath" folder for "modelinfo" existed here
     # only as disabled code; it is not performed.

     if execution != "train" and self.Config["customrank"] == "True":
         try:
             self.rank_threshold = float(self.Config["rank_threshold"])
         except ValueError as err:
             raise ValueError("Wrong custom rank threshold.") from err

     if execution == "crossvalidation":
         if self.Config["save_cross_validations_datasets"] == "True":
             test_path(
                 self.Config, "cross_validations_datasets_path",
                 "Wrong path to the cross-validation's resulting folder.")
         try:
             # Parsed only to validate the value; the result is not used.
             int(self.Config["cross_validations_total"])
         except ValueError as err:
             raise ValueError("Wrong k-fold value.") from err

     # Compare case-insensitively for every model type. An if/elif chain is
     # kept so that only the selected class name is looked up.
     model_type = self.Config["type"].lower()
     if model_type == "snn":
         SnnModel(self.Config)
     elif model_type == "ltsm":
         # "ltsm" (sic) is the identifier used here, matching LTSMModel.
         LTSMModel(self.Config)
     elif model_type == "cnn":
         CNNModel(self.Config)
     elif model_type == "pac":
         PacModel(self.Config)
     elif model_type == "ridge":
         RidgeModel(self.Config)
     elif model_type == "svc":
         SVCModel(self.Config)
     elif model_type == "perceptron":
         PerceptronModel(self.Config)
     elif model_type == "sgd":
         SGDModel(self.Config)
     elif model_type == "bert":
         BertModel(self.Config)
 def getWordVectorsMatrix(self):
     """Turn documents into padded index sequences and build the embedding matrix.

     Unless the execution type is "test", a new ``Tokenizer`` is fitted on
     the training texts (and, outside cross-validation, pickled to
     "indexer_path"); the training arrays and labels are stored on
     ``self.model`` and, when ``self.addValSet`` is set, their tail is split
     off as validation data. In "test" execution the pickled tokenizer is
     loaded instead. Test arrays/labels, ``embMatrix`` and ``maxWords`` are
     always stored on ``self.model``.

     Returns:
         ``(embedding_matrix, self.maxWords)``, or ``None`` when
         ``self.model.isCV`` is set.
     """
     started = datetime.datetime.now()
     model = self.model
     cfg = model.Config

     def stack_labels(docs):
         # One row per document, one column per predefined category.
         return numpy.concatenate([
             numpy.array(doc.labels).reshape(
                 1, len(cfg["predefined_categories"])) for doc in docs
         ])

     if cfg["type_of_execution"] != "test":
         tokenizer = Tokenizer(num_words=self.maxWords)
         train_docs = cfg[self.keyTrain]
         train_texts = [doc.lines for doc in train_docs]
         tokenizer.fit_on_texts(train_texts)
         if not model.isCV:
             # Persist the fitted tokenizer so a later "test" run can load it.
             with open(get_abs_path(cfg, "indexer_path"), 'wb') as out:
                 pickle.dump(tokenizer,
                             out,
                             protocol=pickle.HIGHEST_PROTOCOL)
             if cfg["max_doc_len"] > cfg["max_seq_len"]:
                 print(
                     "Most of documents from training set have less then %d tokens. Longer documents will be truncated."
                     % (cfg["max_seq_len"]))
         model.trainArrays = pad_sequences(
             tokenizer.texts_to_sequences(train_texts),
             maxlen=cfg["max_seq_len"])
         model.trainLabels = stack_labels(train_docs)
         if self.addValSet:
             # The last validation_data_size share becomes the validation set.
             cut = int(
                 len(model.trainArrays) * (1 - self.validation_data_size))
             model.valArrays = model.trainArrays[cut:]
             model.valLabels = model.trainLabels[cut:]
             model.trainArrays = model.trainArrays[:cut]
             model.trainLabels = model.trainLabels[:cut]
     else:
         # NOTE(review): pickle.load can execute arbitrary code; the indexer
         # file should only ever come from a trusted training run.
         with open(get_abs_path(cfg, "indexer_path"), 'rb') as src:
             tokenizer = pickle.load(src)

     test_docs = cfg[self.keyTest]
     test_texts = [doc.lines for doc in test_docs]
     model.testArrays = pad_sequences(
         tokenizer.texts_to_sequences(test_texts),
         maxlen=cfg["max_seq_len"])
     model.testLabels = stack_labels(test_docs)

     embedding_matrix = numpy.zeros((self.maxWords, self.ndim))
     missing = 0
     for word, idx in tokenizer.word_index.items():
         if idx >= self.maxWords:
             # Indices beyond the vocabulary limit have no row in the matrix.
             continue
         try:
             vector = model.w2vModel[word]
         except KeyError:
             # Word unknown to the W2V model: its row stays all zeros.
             missing += 1
             continue
         if vector is not None:
             embedding_matrix[idx] = vector
     model.embMatrix = embedding_matrix
     model.maxWords = self.maxWords
     if model.isCV:
         return
     finished = datetime.datetime.now()
     print('Found %s unique tokens.' % len(tokenizer.word_index))
     print('Tokens not found in W2V vocabulary: %d' % missing)
     print("All data prepared and embedding matrix built in %s" %
           (get_formatted_date(started, finished)))
     return embedding_matrix, self.maxWords