def tokens_from_tagger(Config):
    """Tokenize a set of documents by running the external Java tagger.

    Reads the source/target folders and tokenization options from ``Config``,
    launches the tagger jar as a subprocess, prints the tagger's stdout and
    the elapsed wall-clock time.

    :param Config: configuration dict with paths and option flags.
    """
    print("GRISHA tokens_from_tagger()")
    test_path(Config, "set_of_docs_lang_tokenization_lib_path",
              "Wrong path to the tagger's jar. Tokenization can't be done")
    tagger_path = get_abs_path(Config, "set_of_docs_lang_tokenization_lib_path")
    source_path = Config["home"] + "/" + Config["source_path"]
    target_path = Config["home"] + "/" + Config["target_path"]
    # Optional Arabic stop-word list, passed to the jar as one comma-joined argument.
    stop_words = ",".join(stopwords.words('arabic')) if Config["stop_words"] == "True" else ""
    ds = datetime.datetime.now()
    # Pass arguments as a list with shell=False: the previous string-built
    # shell command relied on fragile manual quoting and broke on paths
    # containing quotes or shell metacharacters.
    srv = subprocess.Popen(
        ['java', '-Xmx2g', '-jar', tagger_path,
         source_path,
         target_path,
         Config["exclude_positions"],
         stop_words,
         Config["extra_words"],
         Config["normalization"],
         Config["language_tokenization"]],
        stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=False)
    # communicate() both drains the pipes and waits for exit.  The original
    # wait()-before-communicate() order can deadlock once the child fills
    # its stdout pipe buffer (documented subprocess pitfall).
    reply = srv.communicate()
    de = datetime.datetime.now()
    print(reply[0].decode())
    print("All process is done in %s" % (get_formatted_date(ds, de)))
def parse_config_info(path):
    """Load an info-report configuration file, validate it, and launch InfoCreator.

    Populates the module-level ``Config`` dict from the INI file at ``path``
    (via the module-level ``parser``), applies defaults for ``home`` and
    ``info_from``, validates the ``info_from``/``reports_path``/``actual_path``
    options, then hands ``Config`` to ``InfoCreator``.

    :param path: path to an INI-style configuration file.
    """
    # Use a context manager so the config file handle is always closed
    # (the original open() leaked the descriptor).
    with open(path) as config_file:
        parser.read_file(config_file)
    for s in parser.sections():
        for opt in parser.items(s):
            Config[opt[0]] = opt[1]
    if not Config["home"]:
        Config["home"] = str(Path.home())
    if not Config["info_from"]:
        Config["info_from"] = "today"
    if Config["info_from"] != "today":
        chk = Config["info_from"].split()
        # The option must look like "<N> day(s)".  The original used `and`,
        # which raised IndexError on a one-word value and accepted malformed
        # two-word values; `or` is the intended validation.
        if len(chk) != 2 or not chk[1].startswith("day"):
            print("Wrong value of 'info_from' option. Exit.")
            return
        try:
            int(chk[0])  # value only validated here; consumed downstream
        except ValueError:
            print("Wrong value of 'info_from' option. Exit.")
            return
    test_path(Config, "reports_path", "Wrong path to the folder, containing reports. Exit.")
    test_path(
        Config, "actual_path",
        "Warning: wrong path to the folder containing original documents. It will not be possible to view them."
    )
    InfoCreator(Config)
def is_correct_path(self, Config):
    """Check paths this component depends on.

    Validates the W2V model path when no model object is already loaded,
    and the indexer path when running in "test" mode.

    :param Config: configuration dict with paths and option flags.
    :return: True when all required paths are usable, False otherwise.
    """
    # `is None` is the correct identity test; `== None` needlessly
    # invokes __eq__ and is flagged by every Python linter (PEP 8).
    if self.Config["w2vmodel"] is None:
        test_path(Config, "model_path", "Wrong path to W2V model. Stop.")
    if not correct_path(Config, "indexer_path"):
        # A missing indexer only blocks the "test" execution mode.
        if Config["type_of_execution"] == "test":
            print("Wrong path to indexer. Stop.")
            return False
    return True
def __init__(self, Config, DefConfig, kwargs):
    """Set up the Word Embedding step.

    Merges user parameters into the configuration, keeps both configs on
    the instance, and — only when a new model must be created — validates
    the corpus path and the numeric training options.
    """
    print("=== Word Embedding ===")
    updateParams(Config, DefConfig, kwargs)
    self.Config, self.DefConfig = Config, DefConfig
    test_path(Config, "model_path", "Wrong path to W2V model. Word Embedding can't be done.")
    # Nothing further to validate unless a new model will be built.
    if Config["need_create_model"] != "True":
        return
    test_path(Config, "data_corpus_path", "Wrong corpus path. W2V model can't be created.")

    def _int_option(option, message):
        # Convert one config option to int, re-raising with a task-specific message.
        try:
            return int(self.Config[option])
        except ValueError:
            raise ValueError(message)

    self.epochs = _int_option(
        "epochs_total",
        "Wrong quantity of epochs for training. W2V model can't be created.")
    self.ndim = _int_option(
        "vectors_dimension",
        "Wrong size of resulting vectors. W2V model can't be created.")
def __init__(self, Config):
    """Initialize the word-vectors-sum handler.

    Validates the W2V model path (when no model is pre-loaded), parses the
    validation-set size and vector dimension from the configuration, loads
    the W2V model, and — except in cross-validation mode — prepares the
    data and launches processing.

    :param Config: configuration dict with paths and option flags.
    :raises ValueError: on an out-of-range validation size or a
        non-numeric vector dimension.
    """
    super().__init__(Config)
    # Fix: identity comparison with None (`== None` invokes __eq__).
    if self.Config["w2vmodel"] is None:
        test_path(Config, "model_path", "Wrong path to W2V model. Stop.")
    try:
        self.validation_data_size = float(Config["validation_data_size"])
    except ValueError:
        self.validation_data_size = 0  # forces the range error below
    if self.validation_data_size <= 0 or self.validation_data_size >= 1:
        raise ValueError("Wrong size of validation data set. Stop.")
    try:
        self.ndim = int(self.Config["vectors_dimension"])
    except ValueError:
        # Message spelling fixed ("dimentions" -> "dimensions").
        raise ValueError("Wrong size of vectors' dimensions. Stop.")
    self.addValSet = True
    self.handleType = "wordVectorsSum"
    self.save_intermediate_results = Config["save_intermediate_results"] == "True"
    self.useProbabilities = True
    self.w2vModel = None
    self.load_w2v_model()
    if Config["type_of_execution"] != "crossvalidation":
        self.prepareData()
        self.launch_process()
def run(self):
    """Load and preprocess the data sets.

    Validates train/test paths (splitting the test set off the training
    data when no explicit test path is given), sets up the requested
    tokenization pipeline (Java tagger subprocess or stanfordnlp), the
    Arabic stop-word set and normalizer, optionally loads a W2V model,
    then loads the data and runs analysis if requested.

    :raises ValueError: on a bad test-set size or a bad W2V model path
        or vector dimension.
    """
    test_path(self.Config, "train_data_path", "Wrong path to training set. Data can't be loaded.")
    if self.Config["test_data_path"]:
        test_path(self.Config, "test_data_path", "Wrong path to testing set. Data can't be loaded.")
    else:
        # No explicit test set: split one off the training data.
        self.splitTrain = True
        try:
            self.sz = float(self.Config["test_data_size"])
        except ValueError:
            self.sz = 0  # forces the range error below
    if not self.Config["test_data_path"] and (self.sz <= 0 or self.sz >= 1):
        raise ValueError("Wrong size of testing set. Data can't be loaded.")
    if self.Config["enable_tokenization"] == "True":
        if self.Config["language_tokenization"] == "True":
            print("GRISHA use single_doc_lang_tokenization")
            if self.Config["use_java"] == "True":
                test_path(self.Config, 'single_doc_lang_tokenization_lib_path',
                          "Wrong path to the tagger's jar. Preprocessing can't be done.")
                lib_path = get_abs_path(self.Config, 'single_doc_lang_tokenization_lib_path')
                # Long-lived tagger process; documents are streamed via stdin/stdout.
                command_line = 'java -Xmx2g -jar ' + lib_path + ' "' + self.Config["exclude_positions"] + '"'
                self.jar = subprocess.Popen(command_line, stdin=subprocess.PIPE,
                                            stdout=subprocess.PIPE, stderr=subprocess.PIPE,
                                            shell=True, encoding="utf-8")
            else:
                self.nlp_tokenizer = stanfordnlp.Pipeline(lang="ar", processors='tokenize,mwt',
                                                          use_gpu=True)
        if self.Config["stop_words"] == "True":
            self.stop_words = set(nltk.corpus.stopwords.words('arabic'))
        else:
            self.stop_words = set()
        if self.Config["normalization"] == "True":
            self.normalizer = ArabicNormalizer()
    if self.Config["load_w2v_model"] == "True":
        if not self.Config["model_path"] or not os.path.isfile(get_abs_path(self.Config, "model_path")):
            raise ValueError("Wrong path to W2V model. Stop.")
        try:
            self.ndim = int(self.Config["vectors_dimension"])
        except ValueError:
            # Fix: the original literal was broken across physical lines
            # (embedded newline) and misspelled "dimentions".
            raise ValueError("Wrong size of vectors' dimensions. Stop.")
        self.Config["resources"]["w2v"]["created_model_path"] = get_abs_path(self.Config, "model_path")
        self.Config["resources"]["w2v"]["ndim"] = self.ndim
        self.load_w2v_model()
    else:
        self.Config["w2vmodel"] = None
    self.load_data()
    if self.Config["analysis"] == "True":
        self.analysis()
def run(self):
    """Validate the model configuration and dispatch to the chosen model.

    Checks train/test paths, the models folder, numeric training options
    (epochs, batch size, verbosity), intermediate-results and
    cross-validation settings, then instantiates the model class selected
    by ``Config["type"]``.

    :raises ValueError: on any invalid path or non-numeric option.
    """
    try:
        self.test_data_size = float(self.Config["test_data_size"])
    except ValueError:
        self.test_data_size = -1  # sentinel: size not usable for splitting
    if not correct_path(self.Config, "train_data_path"):
        # The training set may be absent only in pure test mode with an
        # explicit test set.
        if self.Config["type_of_execution"] != "test" or not self.Config["test_data_path"]:
            raise ValueError(
                "Wrong path to the training set: folder %s doesn't exist."
                % get_abs_path(self.Config, "train_data_path"))
    if not correct_path(self.Config, "test_data_path"):
        if not (len(self.Config["test_data_path"]) == 0
                and self.test_data_size > 0 and self.test_data_size < 1):
            # Fix: '%d' on a path string raised TypeError instead of
            # producing the intended error message; use '%s'.
            raise ValueError(
                "Wrong path to the testing set: folder %s doesn't exist."
                % get_abs_path(self.Config, "test_data_path"))
    test_path(self.Config, "created_model_path", "Wrong path to the models' folder.")
    if not self.Config["name"]:
        self.Config["name"] = self.Config["type"] + str(self.Config["modelid"])
    mPath = get_abs_path(self.Config, "created_model_path", opt="name")
    if self.Config["type_of_execution"] == "test" and not os.path.isfile(mPath):
        raise ValueError("Wrong path to the tested model.")
    if self.Config["type_of_execution"] != "test":
        try:
            self.epochs = int(self.Config["epochs"])
        except ValueError:
            raise ValueError("Wrong quantity of epochs for training.")
        try:
            self.train_batch = int(self.Config["train_batch"])
        except ValueError:
            raise ValueError("Wrong batch size for training.")
        try:
            self.verbose = int(self.Config["verbose"])
        except ValueError:
            raise ValueError("Wrong value of 'verbose' flag for training.")
        if self.Config["save_intermediate_results"] == "True":
            if not self.Config["intermediate_results_path"] or \
                    not os.path.isdir(get_abs_path(self.Config, "intermediate_results_path")):
                raise ValueError("Wrong path to folder with intermediate results.")
    if self.Config["type_of_execution"] != "train" and self.Config["customrank"] == "True":
        try:
            self.rank_threshold = float(self.Config["rank_threshold"])
        except ValueError:
            raise ValueError("Wrong custom rank threshold.")
    if self.Config["type_of_execution"] == "crossvalidation":
        if self.Config["save_cross_validations_datasets"] == "True":
            test_path(self.Config, "cross_validations_datasets_path",
                      "Wrong path to the cross-validation's resulting folder.")
        try:
            int(self.Config["cross_validations_total"])  # value only validated here
        except ValueError:
            raise ValueError("Wrong k-fold value.")
    # Fix: the original compared "perceptron"/"sgd"/"bert" without
    # .lower(), so e.g. type "Bert" silently dispatched nothing.
    # Normalize once and compare consistently.
    model_type = self.Config["type"].lower()
    if model_type == "snn":
        SnnModel(self.Config)
    elif model_type == "ltsm":
        LTSMModel(self.Config)
    elif model_type == "cnn":
        CNNModel(self.Config)
    elif model_type == "pac":
        PacModel(self.Config)
    elif model_type == "ridge":
        RidgeModel(self.Config)
    elif model_type == "svc":
        SVCModel(self.Config)
    elif model_type == "perceptron":
        PerceptronModel(self.Config)
    elif model_type == "sgd":
        SGDModel(self.Config)
    elif model_type == "bert":
        BertModel(self.Config)