Exemple #1
0
def tokens_from_tagger(Config):
    """Tokenize a set of documents by running the external Java tagger.

    Reads the source/target folders and tokenization options from *Config*,
    builds the java command line, runs it to completion, then prints the
    tagger's stdout and the total elapsed time.

    Args:
        Config: configuration mapping with the tagger jar path, "home",
            "source_path", "target_path" and tokenization option flags.
    """
    print("GRISHA tokens_from_tagger()")
    test_path(Config, "set_of_docs_lang_tokenization_lib_path",
              "Wrong path to the tagger's jar. Tokenization can't be done")
    tagger_path = get_abs_path(Config,
                               "set_of_docs_lang_tokenization_lib_path")
    source_path = Config["home"] + "/" + Config["source_path"]
    target_path = Config["home"] + "/" + Config["target_path"]
    stop_words = ",".join(list(
        stopwords.words('arabic'))) if Config["stop_words"] == "True" else ""
    ds = datetime.datetime.now()
    # NOTE(review): command is assembled as a shell string from config
    # values — config paths containing quotes would break it; consider an
    # argument list with shell=False if the config is not trusted.
    srv = subprocess.Popen(
        'java -Xmx2g -jar ' + tagger_path + ' "' + source_path + '" "' +
        target_path + '" "' + Config["exclude_positions"] + '" "' +
        stop_words + '" "' + Config["extra_words"] + '" "' +
        Config["normalization"] + '" "' + Config["language_tokenization"] +
        '"',
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        shell=True)
    # communicate() both drains the pipes and waits for exit; calling
    # wait() first (as before) can deadlock once the child fills a PIPE
    # buffer, so the explicit wait() was removed.
    reply = srv.communicate()
    de = datetime.datetime.now()
    print(reply[0].decode())
    print("All process is done in %s" % (get_formatted_date(ds, de)))
Exemple #2
0
def parse_config_info(path):
    """Load the info-report configuration from *path* and launch InfoCreator.

    Copies every option from the ini file into the module-level Config dict,
    applies defaults for "home" and "info_from", validates the "info_from"
    value ("today" or "<N> day(s)") and the report paths, then hands Config
    to InfoCreator. Prints a message and returns early on invalid input.
    """
    # 'with' closes the file even if the parser raises (the original
    # bare open() leaked the handle).
    with open(path) as config_file:
        parser.read_file(config_file)
    for s in parser.sections():
        for opt in parser.items(s):
            Config[opt[0]] = opt[1]
    if not Config["home"]:
        Config["home"] = str(Path.home())
    if not Config["info_from"]:
        Config["info_from"] = "today"
    if Config["info_from"] != "today":
        chk = Config["info_from"].split()
        # Expect exactly two tokens "<N> day(s)". The original used 'and',
        # which accepted malformed values and could IndexError on a
        # single-token string; 'or' (short-circuiting on the length check)
        # rejects both cases safely.
        if len(chk) != 2 or not chk[1].startswith("day"):
            print("Wrong value of 'info_from' option. Exit.")
            return
        try:
            int(chk[0])  # validate only; the parsed value was never used
        except ValueError:
            print("Wrong value of 'info_from' option. Exit.")
            return
    test_path(Config, "reports_path",
              "Wrong path to the folder, containing reports. Exit.")
    test_path(
        Config, "actual_path",
        "Warning: wrong path to the folder containing original documents. It will not be possible to view them."
    )
    InfoCreator(Config)
Exemple #3
0
 def is_correct_path(self, Config):
     """Validate the model and indexer paths for this run.

     Returns False only when the indexer path is wrong AND execution type
     is "test"; otherwise returns True (test_path itself reports/handles a
     bad model path).

     NOTE(review): reads self.Config for the w2v check but the *Config*
     argument everywhere else — presumably the same mapping; confirm.
     """
     # 'is None' is the correct identity test for the None singleton.
     if self.Config["w2vmodel"] is None:
         test_path(Config, "model_path", "Wrong path to W2V model. Stop.")
     if not correct_path(Config, "indexer_path"):
         if Config["type_of_execution"] == "test":
             print("Wrong path to indexer. Stop.")
             return False
     return True
Exemple #4
0
 def __init__(self, Config, DefConfig, kwargs):
     """Prepare the Word Embedding step.

     Merges kwargs over the default configuration, validates the model
     path, and — when a model must be created — validates the corpus path
     and parses the numeric training options.

     Raises:
         ValueError: if "epochs_total" or "vectors_dimension" is not an
             integer (only checked when "need_create_model" is "True").
     """
     print("=== Word Embedding ===")
     updateParams(Config, DefConfig, kwargs)
     self.Config = Config
     self.DefConfig = DefConfig
     test_path(Config, "model_path",
               "Wrong path to W2V model. Word Embedding can't be done.")
     # No model creation requested: path validation above is all we need.
     if Config["need_create_model"] != "True":
         return
     test_path(Config, "data_corpus_path",
               "Wrong corpus path. W2V model can't be created.")
     try:
         self.epochs = int(self.Config["epochs_total"])
     except ValueError as err:
         # Chain the cause so the bad config value stays visible.
         raise ValueError(
             "Wrong quantity of epochs for training. W2V model can't be created."
         ) from err
     try:
         self.ndim = int(self.Config["vectors_dimension"])
     except ValueError as err:
         raise ValueError(
             "Wrong size of resulting vectors. W2V model can't be created."
         ) from err
Exemple #5
0
 def __init__(self, Config):
     """Set up a word-vectors-sum model run.

     Validates the W2V model path and the numeric options, configures the
     run flags, loads the W2V model, prepares the data (unless doing
     cross-validation, which prepares per fold) and launches the process.

     Raises:
         ValueError: if "validation_data_size" is not in (0, 1), or
             "vectors_dimension" is not an integer.
     """
     super().__init__(Config)
     # 'is None' is the correct identity test for the None singleton.
     if self.Config["w2vmodel"] is None:
         test_path(Config, "model_path", "Wrong path to W2V model. Stop.")
     try:
         self.validation_data_size = float(Config["validation_data_size"])
     except ValueError:
         # Non-numeric value falls through to the range check below.
         self.validation_data_size = 0
     if self.validation_data_size <= 0 or self.validation_data_size >= 1:
         raise ValueError("Wrong size of validation data set. Stop.")
     try:
         self.ndim = int(self.Config["vectors_dimension"])
     except ValueError as err:
         raise ValueError("Wrong size of vectors' dimentions. Stop.") from err
     self.addValSet = True
     self.handleType = "wordVectorsSum"
     self.save_intermediate_results = Config[
         "save_intermediate_results"] == "True"
     self.useProbabilities = True
     self.w2vModel = None
     self.load_w2v_model()
     if Config["type_of_execution"] != "crossvalidation":
         self.prepareData()
     self.launch_process()
Exemple #6
0
    def run(self):
        """Validate data paths/options, set up tokenization resources and the
        optional W2V model, then load (and optionally analyze) the data.

        Raises:
            ValueError: on a bad test-set size, a bad W2V model path, or a
                non-integer "vectors_dimension".
        """
        test_path(self.Config, "train_data_path", "Wrong path to training set. Data can't be loaded.")
        if self.Config["test_data_path"]:
            test_path(self.Config, "test_data_path", "Wrong path to testing set. Data can't be loaded.")
        else:
            # No explicit test set: split one off the training data.
            self.splitTrain = True
            try:
                self.sz = float(self.Config["test_data_size"])
            except ValueError:
                self.sz = 0
            # (The original re-checked test_data_path here, but it is
            # always empty in this branch.)
            if self.sz <= 0 or self.sz >= 1:
                raise ValueError("Wrong size of testing set. Data can't be loaded.")
        if self.Config["enable_tokenization"] == "True":
            if self.Config["language_tokenization"] == "True":
                print("GRISHA use single_doc_lang_tokenization")
                if self.Config["use_java"] == "True":
                    test_path(self.Config, 'single_doc_lang_tokenization_lib_path',
                              "Wrong path to the tagger's jar. Preprocessing can't be done.")
                    lib_path = get_abs_path(self.Config, 'single_doc_lang_tokenization_lib_path')
                    command_line = 'java -Xmx2g -jar ' + lib_path + ' "' + self.Config["exclude_positions"] + '"'
                    # Long-lived tagger process; documents are streamed
                    # through its stdin/stdout pipes.
                    self.jar = subprocess.Popen(command_line, stdin=subprocess.PIPE, stdout=subprocess.PIPE,
                                                stderr=subprocess.PIPE, shell=True, encoding="utf-8")
                else:
                    self.nlp_tokenizer = stanfordnlp.Pipeline(lang="ar", processors='tokenize,mwt', use_gpu=True)
            if self.Config["stop_words"] == "True":
                self.stop_words = set(nltk.corpus.stopwords.words('arabic'))
            else:
                self.stop_words = set()
            if self.Config["normalization"] == "True":
                self.normalizer = ArabicNormalizer()
        if self.Config["load_w2v_model"] == "True":
            if not self.Config["model_path"] or not os.path.isfile(get_abs_path(self.Config, "model_path")):
                raise ValueError("Wrong path to W2V model. Stop.")
            try:
                self.ndim = int(self.Config["vectors_dimension"])
            except ValueError as err:
                raise ValueError("Wrong size of vectors' dimentions. Stop.") from err
            self.Config["resources"]["w2v"]["created_model_path"] = get_abs_path(self.Config, "model_path")
            self.Config["resources"]["w2v"]["ndim"] = self.ndim
            self.load_w2v_model()
        else:
            self.Config["w2vmodel"] = None

        self.load_data()
        if self.Config["analysis"] == "True":
            self.analysis()
Exemple #7
0
 def run(self):
     """Validate the model configuration and dispatch to the model class.

     Checks the dataset paths, the model path, and every numeric training
     option for the requested "type_of_execution", then instantiates the
     model implementation named by Config["type"].

     Raises:
         ValueError: on any invalid path or non-numeric option value.
     """
     try:
         self.test_data_size = float(self.Config["test_data_size"])
     except ValueError:
         # Sentinel outside (0, 1); only matters if test_data_path is empty.
         self.test_data_size = -1
     if not correct_path(self.Config, "train_data_path"):
         if self.Config["type_of_execution"] != "test" or not self.Config[
                 "test_data_path"]:
             raise ValueError(
                 "Wrong path to the training set: folder %s doesn't exist."
                 % get_abs_path(self.Config, "train_data_path"))
     if not correct_path(self.Config, "test_data_path"):
         if not (len(self.Config["test_data_path"]) == 0
                 and self.test_data_size > 0 and self.test_data_size < 1):
             # BUG FIX: the original used "%d" on a string path, which
             # raised TypeError instead of this ValueError.
             raise ValueError(
                 "Wrong path to the testing set: folder %s doesn't exist." %
                 get_abs_path(self.Config, "test_data_path"))
     test_path(self.Config, "created_model_path",
               "Wrong path to the models' folder.")
     if not self.Config["name"]:
         self.Config["name"] = self.Config["type"] + str(
             self.Config["modelid"])
     mPath = get_abs_path(self.Config, "created_model_path", opt="name")
     if self.Config["type_of_execution"] == "test" and not os.path.isfile(
             mPath):
         raise ValueError("Wrong path to the tested model.")
     if self.Config["type_of_execution"] != "test":
         try:
             self.epochs = int(self.Config["epochs"])
         except ValueError as err:
             raise ValueError(
                 "Wrong quantity of epochs for training.") from err
         try:
             self.train_batch = int(self.Config["train_batch"])
         except ValueError as err:
             raise ValueError("Wrong batch size for training.") from err
         try:
             self.verbose = int(self.Config["verbose"])
         except ValueError as err:
             raise ValueError(
                 "Wrong value of 'verbose' flag for training.") from err
         if self.Config["save_intermediate_results"] == "True":
             if not self.Config["intermediate_results_path"] or \
                     not os.path.isdir(get_abs_path(self.Config, "intermediate_results_path")):
                 raise ValueError(
                     "Wrong path to folder with intermediate results.")
     if self.Config["type_of_execution"] != "train" and self.Config[
             "customrank"] == "True":
         try:
             self.rank_threshold = float(self.Config["rank_threshold"])
         except ValueError as err:
             raise ValueError("Wrong custom rank threshold.") from err
     if self.Config["type_of_execution"] == "crossvalidation":
         if self.Config["save_cross_validations_datasets"] == "True":
             test_path(
                 self.Config, "cross_validations_datasets_path",
                 "Wrong path to the cross-validation's resulting folder.")
         try:
             int(self.Config["cross_validations_total"])  # validate only
         except ValueError as err:
             raise ValueError("Wrong k-fold value.") from err
     # Dispatch on the model type. NOTE(review): the first six types are
     # matched case-insensitively but "perceptron"/"sgd"/"bert" are exact
     # matches, as in the original — presumably intentional; confirm.
     model_type = self.Config["type"].lower()
     if model_type == "snn":
         SnnModel(self.Config)
     elif model_type == "ltsm":
         LTSMModel(self.Config)
     elif model_type == "cnn":
         CNNModel(self.Config)
     elif model_type == "pac":
         PacModel(self.Config)
     elif model_type == "ridge":
         RidgeModel(self.Config)
     elif model_type == "svc":
         SVCModel(self.Config)
     elif self.Config["type"] == "perceptron":
         PerceptronModel(self.Config)
     elif self.Config["type"] == "sgd":
         SGDModel(self.Config)
     elif self.Config["type"] == "bert":
         BertModel(self.Config)