Example 1
    def load(self, path="data/models/solver27.pkl"):
        # path2config = sber_path('/var/sberbank/models/')
        # with open(os.path.join(path2config,'path2config.json'), "r", encoding="utf-8") as file:
        #     config = json.load(file)
        config = {
            "model_name_to_save": "lm_5_ep_lr2-3_5_stlr",
            "dict_name_to_save": "itos",
            "tf_vectorizer_path": sber_path("/var/sberbank/tfvect.joblib"),
            "lda_path": sber_path("/var/sberbank/lda.joblib"),
            "topics_path": sber_path("/var/sberbank/topics.csv"),
        }

        bs = 16
        config['n_hid'] = 1150
        self.init_args(**config)
        self.tf_vectorizer = joblib.load(self.tf_vectorizer_path)
        self.lda = joblib.load(self.lda_path)
        self.topics = pd.read_csv(self.topics_path, sep="\t")
        self.topic_dic = {
            int(i): self.topics.iloc[i]['Topic']
            for i in range(len(self.topics))
        }

        # Build a throwaway DataBunch from two placeholder rows so the learner
        # can be constructed; the vocab comes from the saved itos pickle.
        vocab = Vocab.load(
            sber_path("/var/sberbank/models/{}.pkl").format(self.dict_name_to_save))
        self.data = TextList.from_df(
            pd.DataFrame(["tmp", "tmp"]),
            processor=[
                TokenizeProcessor(tokenizer=Tokenizer(lang="xx")),
                NumericalizeProcessor(vocab=vocab),
            ]) \
            .random_split_by_pct(.1) \
            .label_for_lm() \
            .databunch(bs=bs)

        #self.dict_name_to_save = os.path.join('data/models',self.dict_name_to_save )
        #self.model_name_to_save = os.path.join('data/models',self.model_name_to_save )
        conf = awd_lstm_lm_config.copy()
        conf['n_hid'] = 1150
        self.learn = language_model_learner(self.data,
                                            AWD_LSTM,
                                            pretrained=False,
                                            config=conf,
                                            drop_mult=0.7,
                                            pretrained_fnames=[
                                                self.model_name_to_save,
                                                self.dict_name_to_save
                                            ],
                                            path=sber_path('/var/sberbank'))
        return self
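Every one of these examples resolves its data files through a `sber_path` helper that is never shown. A minimal sketch of what it presumably does, assuming the deployment prefix can be remapped through an environment variable (`SBER_ROOT` and its fallback are assumptions):

import os

SBER_ROOT = os.environ.get('SBER_ROOT', '/')  # assumed configurable data root

def sber_path(path):
    # Hypothetical: resolve a deployment path such as '/var/sberbank/...'
    # against SBER_ROOT, so the solvers can also run outside the container.
    return os.path.join(SBER_ROOT, path.lstrip('/'))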
Example 2
 def __init__(self, seed=42):
     self.has_model = True
     self.siamese = SiameseBiLSTM()
     self.best_model_path = sber_path(
         '/var/sberbank/models/siameise_model.h5')
     self.seed = seed
     self.init_seed()
Example 3
 def __init__(self, seed=42):
     super(Solver, self).__init__()
     self.seed = seed
     self.init_seed()
     self.representatives = {}
     self.rnc_path = sber_path('/var/sberbank/1grams-3.txt')
     self.rnc_unigrams = self.lazy_unigrams(self.rnc_path)
Example 4
 def __init__(self, seed=42):
     data_path = sber_path('/var/sberbank/')
     self.is_train_task = False
     self.morph = pymorphy2.MorphAnalyzer()
     self.toktok = ToktokTokenizer()
     self.seed = seed
     self.init_seed()
     with open(os.path.join(data_path, 'synonyms.txt'), 'r',
               encoding='utf8') as f:
         self.synonyms = f.readlines()
     self.synonyms = [
         re.sub(r'\.', '', t.lower().strip('\n')).split(' ')
         for t in self.synonyms
     ]
     self.synonyms = [[t for t in l if t] for l in self.synonyms]
     with open(os.path.join(data_path, 'antonyms.txt'), 'r',
               encoding='utf8') as f:
         self.antonyms = [t.strip(' \n').split(' - ') for t in f]
     with open(os.path.join(data_path, 'phraseologs.txt'), 'r',
               encoding='utf8') as f:
         self.phraseology = f.readlines()
     self.phraseology = [[
         l for l in self.lemmatize(l) if l not in
         ['\n', ' ', '...', '', ',', '-', '.', '?', ' (', '/']
     ] for l in self.phraseology]
Example 5
 def load(self, path=''):
     print("Hi!, It's load")
     self.best_model_path = sber_path(
         '/var/sberbank/models/siameise_model.h5')
     self.siamese_model_loaded = self.get_model()
     print("Siamese model is loaded")
     print(self.siamese.embedding_matrix)
     return self.siamese_model_loaded
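`get_model` is not shown in this example. One plausible sketch, assuming the `.h5` checkpoint holds weights for the `SiameseBiLSTM` architecture and that the class exposes a builder method (`build_model` is an assumption; `load_weights` is the standard Keras call):

 def get_model(self):
     # Hypothetical: rebuild the siamese network and restore the trained
     # weights from the checkpoint saved at best_model_path.
     model = self.siamese.build_model()        # assumed builder method
     model.load_weights(self.best_model_path)  # standard Keras API
     return model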
Example 6
 def save(self, path=''):
     path = sber_path('/var/sberbank/models')
     joblib.dump(self.clf_gen, os.path.join(path, 'clf_gen.joblib'))
     joblib.dump(self.clf_nar, os.path.join(path, 'clf_nar.joblib'))
     joblib.dump(self.clf_desc, os.path.join(path, 'clf_desc.joblib'))
     joblib.dump(self.clf_discource,
                 os.path.join(path, 'clf_discource.joblib'))
     joblib.dump(self.clf_cause, os.path.join(path, 'clf_cause.joblib'))
Example 7
 def load(self, path=''):
     path = sber_path('/var/sberbank/models')
     self.clf_gen = joblib.load(os.path.join(path, 'clf_gen.joblib'))
     self.clf_nar = joblib.load(os.path.join(path, 'clf_nar.joblib'))
     self.clf_desc = joblib.load(os.path.join(path, 'clf_desc.joblib'))
     self.clf_discource = joblib.load(
         os.path.join(path, 'clf_discource.joblib'))
     self.clf_cause = joblib.load(os.path.join(path, 'clf_cause.joblib'))
Example 8
 def get_paronyms(self):
     paronyms = []
     with open(sber_path('/var/sberbank/paronyms.csv'),
               'r',
               encoding='utf-8') as in_file:
         for line in in_file:
             # strip the newline and stray punctuation before splitting the pair
             pair = line.strip(punctuation + ' \n').split('\t')
             paronyms.append(pair)
     return paronyms
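A pair list like this is often easier to query as a symmetric lookup; a minimal sketch, assuming each line holds exactly two tab-separated word forms (`build_paronym_map` is hypothetical):

def build_paronym_map(pairs):
    # Hypothetical helper: index each word to its paronymic counterpart
    # in both directions, so either member of a pair can be looked up.
    paronym_map = {}
    for first, second in pairs:
        paronym_map[first] = second
        paronym_map[second] = first
    return paronym_map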
Example 9
    def __init__(self, speller: Optional[Speller] = None):
        self.udpipe_model = Model.load(
            sber_encode(sber_path('/var/sberbank/udpipe_syntagrus.model')))
        self.process_pipeline = Pipeline(self.udpipe_model,
                                         sber_encode('tokenize'),
                                         Pipeline.DEFAULT, Pipeline.DEFAULT,
                                         sber_encode('conllu'))

        if speller is None:
            speller = Speller()
        self.speller: Speller = speller
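Examples 9, 11, and 12 build the same UDPipe pipeline; `sber_encode` presumably converts strings to the encoding the bindings expect. With the standard `ufal.udpipe` API, running the pipeline looks roughly like this (the `parse` wrapper is a sketch):

from ufal.udpipe import ProcessingError

def parse(pipeline, text):
    # Tokenize, tag, and parse `text`, returning the pipeline's
    # CoNLL-U output as a string.
    error = ProcessingError()
    conllu = pipeline.process(text, error)
    if error.occurred():
        raise RuntimeError(error.message)
    return conllu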
Example 10
 def __init__(self, seed=42):
     super(Solver, self).__init__()
     self.seed = seed
     self.init_seed()
     self.model_config = sber_path('/var/sberbank/models/model_26.json')
     self.config = read_config(self.model_config)
     self.unified_substrings = self.config["unified_substrings"]
     self.replacements = self.config["replacements"]
     self.duplicates = self.config["duplicates"]
     self.classifier = LogisticRegression(verbose=10)
     self.label_encoder = LabelEncoder()
Example 11
    def __init__(self, seed=42):
        self.morph = pymorphy2.MorphAnalyzer()
        self.model = Model.load(
            sber_encode(sber_path('/var/sberbank/udpipe_syntagrus.model')))
        self.process_pipeline = Pipeline(self.model, sber_encode('tokenize'),
                                         Pipeline.DEFAULT, Pipeline.DEFAULT,
                                         sber_encode('conllu'))
        self.seed = seed
        self.init_seed()
        self.paronyms = self.get_paronyms()
        self.freq_bigrams = self.open_freq_grams()
Example 12
 def __init__(self, seed=42):
     self.morph = pymorphy2.MorphAnalyzer()
     self.categories = set()
     self.has_model = True
     self.model = Model.load(
         sber_encode(sber_path('/var/sberbank/udpipe_syntagrus.model')))
     self.process_pipeline = Pipeline(self.model, sber_encode('tokenize'),
                                      Pipeline.DEFAULT, Pipeline.DEFAULT,
                                      sber_encode('conllu'))
     self.seed = seed
     # lemmatized Russian error-category labels -> handler method names
     self.label_dict = {
         'деепричастный оборот': "get_gerund",
         'косвенный речь': "get_indirect_speech",
         'несогласованный приложение': "get_app",
         'однородный член': "get_homogeneous",
         'причастный оборот': "get_participle",
         'связь подлежащее сказуемое': "get_predicates",
         'сложноподчинённый': "get_clause",
         'сложный': "get_clause",
         'соотнесённость глагольный форма': "get_verbs",
         'форма существительное': "get_nouns",
         'числительное': "get_numerals"
     }
     self.init_seed()
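`label_dict` maps lemmatized Russian error-category labels (e.g. 'причастный оборот', a participial phrase) to handler method names, which suggests dispatch via `getattr`; a minimal sketch (the `dispatch` method itself is an assumption):

 def dispatch(self, label, sentence):
     # Hypothetical: route a predicted error category to its handler,
     # e.g. 'причастный оборот' -> self.get_participle(sentence).
     return getattr(self, self.label_dict[label])(sentence)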
Example 13
 def __init__(self, seed=42):
     self.is_train_task = False
     self.seed = seed
     self.init_seed()
     with open(os.path.join(sber_path('/var/sberbank'), 'agi_stress.txt'),
               'r', encoding='utf8') as f:
         self.stress = f.read().split('\n')[:-1]
Example 14
 def __init__(self):
     self.model_file = sber_path(
         '/var/sberbank/bert-base-multilingual-cased.tar.gz')
     self.vocab_file = sber_path(
         '/var/sberbank/bert-base-multilingual-cased-vocab.txt')
     self.model = self.bert_model()
     self.tokenizer = self.bert_tokenizer()
     self.embedding_matrix = self.get_bert_embed_matrix()
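The `bert_model` and `bert_tokenizer` helpers are not shown. With the legacy `pytorch_pretrained_bert` package, whose `from_pretrained` accepts a local `.tar.gz` archive and whose tokenizer can be built straight from a vocab file, they would look roughly like this (a sketch, not the authors' code):

from pytorch_pretrained_bert import BertModel, BertTokenizer

 def bert_model(self):
     # Load the multilingual BERT weights from the local archive.
     model = BertModel.from_pretrained(self.model_file)
     model.eval()  # inference mode: embeddings only, no training
     return model

 def bert_tokenizer(self):
     # Build the WordPiece tokenizer from the bundled vocabulary file.
     return BertTokenizer(self.vocab_file)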
Example 15
 def open_freq_grams(self):
     with open(sber_path('/var/sberbank/bigrams_lemmas.pickle'),
               'rb') as inputfile:
         counts = pickle.load(inputfile)
     return counts