def load(self, path="data/models/solver27.pkl"): # path2config = sber_path('/var/sberbank/models/') # with open(os.path.join(path2config,'path2config.json'), "r", encoding="utf-8") as file: # config = json.load(file) config = { "model_name_to_save": "lm_5_ep_lr2-3_5_stlr", "dict_name_to_save": "itos", "tf_vectorizer_path": sber_path("/var/sberbank/tfvect.joblib"), "lda_path": sber_path("/var/sberbank/lda.joblib"), "topics_path": sber_path("/var/sberbank/topics.csv"), } bs = 16 config['n_hid'] = 1150 self.init_args(**config) self.tf_vectorizer = joblib.load(self.tf_vectorizer_path) self.lda = joblib.load(self.lda_path) self.topics = pd.read_csv(self.topics_path, sep="\t") self.topic_dic = { int(i): self.topics.iloc[i]['Topic'] for i in range(len(self.topics)) } self.data = TextList.from_df(pd.DataFrame(["tmp", "tmp"]), processor=[TokenizeProcessor(tokenizer=Tokenizer(lang="xx")), NumericalizeProcessor( vocab=Vocab.load(sber_path("/var/sberbank/models/itos.pkl").format(self.dict_name_to_save)))]).\ random_split_by_pct(.1).\ label_for_lm().\ databunch(bs=bs) #self.dict_name_to_save = os.path.join('data/models',self.dict_name_to_save ) #self.model_name_to_save = os.path.join('data/models',self.model_name_to_save ) conf = awd_lstm_lm_config.copy() conf['n_hid'] = 1150 self.learn = language_model_learner(self.data, AWD_LSTM, pretrained=False, config=conf, drop_mult=0.7, pretrained_fnames=[ self.model_name_to_save, self.dict_name_to_save ], path=sber_path('/var/sberbank')) #self.learn = language_model_learner(self.data, AWD_LSTM, pretrained=False, drop_mult=0.7, #pretrained_fnames=[self.model_name_to_save, self.dict_name_to_save]) return self
def __init__(self, seed=42):
    self.has_model = True
    self.siamese = SiameseBiLSTM()
    # file name spelling ("siameise") matches the shipped model artifact
    self.best_model_path = sber_path('/var/sberbank/models/siameise_model.h5')
    self.seed = seed
    self.init_seed()
def __init__(self, seed=42):
    super(Solver, self).__init__()
    self.seed = seed
    self.init_seed()
    self.representatives = {}
    self.rnc_path = sber_path('/var/sberbank/1grams-3.txt')
    self.rnc_unigrams = self.lazy_unigrams(self.rnc_path)
def __init__(self, seed=42):
    data_path = sber_path('/var/sberbank/')
    self.is_train_task = False
    self.morph = pymorphy2.MorphAnalyzer()
    self.toktok = ToktokTokenizer()
    self.seed = seed
    self.init_seed()
    # synonyms.txt: one group of space-separated synonyms per line
    with open(os.path.join(data_path, 'synonyms.txt'), 'r', encoding='utf8') as f:
        self.synonyms = f.readlines()
    self.synonyms = [
        re.sub(r'\.', '', t.lower().strip('\n')).split(' ') for t in self.synonyms
    ]
    self.synonyms = [[t for t in l if t] for l in self.synonyms]
    # antonyms.txt: one "word - antonym" pair per line
    with open(os.path.join(data_path, 'antonyms.txt'), 'r', encoding='utf8') as f:
        self.antonyms = f.readlines()
    self.antonyms = [t.strip(' \n').split(' - ') for t in self.antonyms]
    # phraseologs.txt: phraseological units, lemmatized with filler tokens dropped
    with open(os.path.join(data_path, 'phraseologs.txt'), 'r', encoding='utf8') as f:
        self.phraseology = f.readlines()
    self.phraseology = [
        [l for l in self.lemmatize(l)
         if l not in ['\n', ' ', '...', '', ',', '-', '.', '?', ' (', '/']]
        for l in self.phraseology
    ]
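# Format sketch (assumed from the parsing above; the example lines in the
# comments are hypothetical): a standalone parse of one line from each
# dictionary file.
import re

def parse_synonym_line(line):
    # e.g. "большой. огромный крупный" -> ['большой', 'огромный', 'крупный']
    return [w for w in re.sub(r'\.', '', line.lower().strip('\n')).split(' ') if w]

def parse_antonym_line(line):
    # e.g. "день - ночь" -> ['день', 'ночь']
    return line.strip(' \n').split(' - ')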
def load(self, path=''): print("Hi!, It's load") self.best_model_path = sber_path( '/var/sberbank/models/siameise_model.h5') self.siamese_model_loaded = self.get_model() print("Siamese model is loaded") print(self.siamese.embedding_matrix) return self.siamese_model_loaded
def save(self, path=''):
    # models always go to the shared directory; the path argument is ignored
    path = sber_path('/var/sberbank/models')
    joblib.dump(self.clf_gen, os.path.join(path, 'clf_gen.joblib'))
    joblib.dump(self.clf_nar, os.path.join(path, 'clf_nar.joblib'))
    joblib.dump(self.clf_desc, os.path.join(path, 'clf_desc.joblib'))
    joblib.dump(self.clf_discource, os.path.join(path, 'clf_discource.joblib'))
    joblib.dump(self.clf_cause, os.path.join(path, 'clf_cause.joblib'))
def load(self, path=''):
    path = sber_path('/var/sberbank/models')
    self.clf_gen = joblib.load(os.path.join(path, 'clf_gen.joblib'))
    self.clf_nar = joblib.load(os.path.join(path, 'clf_nar.joblib'))
    self.clf_desc = joblib.load(os.path.join(path, 'clf_desc.joblib'))
    self.clf_discource = joblib.load(os.path.join(path, 'clf_discource.joblib'))
    self.clf_cause = joblib.load(os.path.join(path, 'clf_cause.joblib'))
def get_paronyms(self):
    paronyms = []
    with open(sber_path('/var/sberbank/paronyms.csv'), 'r', encoding='utf-8') as in_file:
        for line in in_file:
            # strip surrounding punctuation and the trailing newline
            # (string.punctuation alone would leave '\n' on the last field)
            pair = line.strip(punctuation + '\n').split('\t')
            paronyms.append(pair)
    return paronyms
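# Format sketch: paronyms.csv is consumed as tab-separated word pairs, one
# per line; the pair below is a hypothetical example.
from string import punctuation

line = "одеть\tнадеть\n"
pair = line.strip(punctuation + '\n').split('\t')
assert pair == ["одеть", "надеть"]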
def __init__(self, speller: Optional[Speller] = None):
    self.udpipe_model = Model.load(
        sber_encode(sber_path('/var/sberbank/udpipe_syntagrus.model')))
    self.process_pipeline = Pipeline(
        self.udpipe_model,
        sber_encode('tokenize'),
        Pipeline.DEFAULT,
        Pipeline.DEFAULT,
        sber_encode('conllu'))
    if speller is None:
        speller = Speller()
    self.speller: Speller = speller
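# Usage sketch (assuming sber_encode matches the str/bytes convention the
# UDPipe bindings expect here, as in the constructor above): the pipeline
# tokenizes raw text and returns a CoNLL-U parse as one string.
def parse_to_conllu(solver, text):
    return solver.process_pipeline.process(sber_encode(text))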
def __init__(self, seed=42):
    super(Solver, self).__init__()
    self.seed = seed
    self.init_seed()
    self.model_config = sber_path('/var/sberbank/models/model_26.json')
    self.config = read_config(self.model_config)
    self.unified_substrings = self.config["unified_substrings"]
    self.replacements = self.config["replacements"]
    self.duplicates = self.config["duplicates"]
    self.classifier = LogisticRegression(verbose=10)
    self.label_encoder = LabelEncoder()
def __init__(self, seed=42):
    self.morph = pymorphy2.MorphAnalyzer()
    self.model = Model.load(
        sber_encode(sber_path('/var/sberbank/udpipe_syntagrus.model')))
    self.process_pipeline = Pipeline(
        self.model,
        sber_encode('tokenize'),
        Pipeline.DEFAULT,
        Pipeline.DEFAULT,
        sber_encode('conllu'))
    self.seed = seed
    self.init_seed()
    self.paronyms = self.get_paronyms()
    self.freq_bigrams = self.open_freq_grams()
def __init__(self, seed=42):
    self.morph = pymorphy2.MorphAnalyzer()
    self.categories = set()
    self.has_model = True
    self.model = Model.load(
        sber_encode(sber_path('/var/sberbank/udpipe_syntagrus.model')))
    self.process_pipeline = Pipeline(
        self.model,
        sber_encode('tokenize'),
        Pipeline.DEFAULT,
        Pipeline.DEFAULT,
        sber_encode('conllu'))
    self.seed = seed
    # lemmatized error-category names (Russian, matched against input text)
    # mapped to the names of their handler methods
    self.label_dict = {
        'деепричастный оборот': "get_gerund",
        'косвенный речь': "get_indirect_speech",
        'несогласованный приложение': "get_app",
        'однородный член': "get_homogeneous",
        'причастный оборот': "get_participle",
        'связь подлежащее сказуемое': "get_predicates",
        'сложноподчинённый': "get_clause",
        'сложный': "get_clause",
        'соотнесённость глагольный форма': "get_verbs",
        'форма существительное': "get_nouns",
        'числительное': "get_numerals",
    }
    self.init_seed()
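# Dispatch sketch (illustrative): label_dict maps a lemmatized category name
# to the name of a handler method, so resolution is a plain getattr lookup.
# `category` and `sentence` are hypothetical inputs.
def dispatch(solver, category, sentence):
    handler = getattr(solver, solver.label_dict[category])
    return handler(sentence)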
def __init__(self, seed=42):
    self.is_train_task = False
    self.seed = seed
    self.init_seed()
    with open(os.path.join(sber_path('/var/sberbank'), 'agi_stress.txt'),
              'r', encoding='utf8') as f:
        # [:-1] drops the empty string left after the trailing newline
        self.stress = f.read().split('\n')[:-1]
def __init__(self):
    self.model_file = sber_path('/var/sberbank/bert-base-multilingual-cased.tar.gz')
    self.vocab_file = sber_path('/var/sberbank/bert-base-multilingual-cased-vocab.txt')
    self.model = self.bert_model()
    self.tokenizer = self.bert_tokenizer()
    self.embedding_matrix = self.get_bert_embed_matrix()
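# Usage sketch (hypothetical token; assumes the tokenizer is a BERT WordPiece
# tokenizer exposing convert_tokens_to_ids): a token's static vector is a row
# of the extracted embedding matrix.
def token_vector(bert, token="мир"):
    token_id = bert.tokenizer.convert_tokens_to_ids([token])[0]
    return bert.embedding_matrix[token_id]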
def open_freq_grams(self):
    with open(sber_path('/var/sberbank/bigrams_lemmas.pickle'), 'rb') as inputfile:
        counts = pickle.load(inputfile)
    return counts