def generate(self):
    """Yield hyperparameter combinations for the grid search."""
    for padding in ['left']:
        params = dict()
        params['padding'] = padding
        samples, computed_params = load_dataset(params)

        embeddings = WordEmbeddings.load_word_vectors(wordchar2vector_path, word2vector_path)
        word_dims = embeddings.vector_size
        computed_params['embeddings'] = embeddings
        computed_params['word_dims'] = word_dims

        for net_arch in ['rnn(cnn)']:  # 'rnn', 'cnn'
            params['net_arch'] = net_arch

            if net_arch == 'rnn':
                for rnn_size in [150, 200, 256]:
                    params['rnn_size'] = rnn_size
                    for units1 in [16]:
                        params['units1'] = units1
                        for activation1 in ['relu']:
                            params['activation1'] = activation1
                            for optimizer in ['nadam']:
                                params['optimizer'] = optimizer
                                for batch_size in [150]:
                                    params['batch_size'] = batch_size
                                    yield params, computed_params, samples

            if net_arch == 'rnn(cnn)':
                for rnn_size in [450, 500, 550]:
                    params['rnn_size'] = rnn_size
                    for nb_filters in [130, 140, 150]:
                        params['nb_filters'] = nb_filters
                        for min_kernel_size in [1]:
                            params['min_kernel_size'] = min_kernel_size
                            for max_kernel_size in [2]:
                                params['max_kernel_size'] = max_kernel_size
                                for pooling in ['max']:
                                    params['pooling'] = pooling
                                    for units1 in [15, 20, 25]:
                                        params['units1'] = units1
                                        for activation1 in ['relu']:
                                            params['activation1'] = activation1
                                            for optimizer in ['nadam']:
                                                params['optimizer'] = optimizer
                                                for batch_size in [150]:
                                                    params['batch_size'] = batch_size
                                                    yield params, computed_params, samples

            if net_arch == 'cnn':
                for nb_filters in [100]:
                    params['nb_filters'] = nb_filters
                    for min_kernel_size in [1]:
                        params['min_kernel_size'] = min_kernel_size
                        for max_kernel_size in [2]:
                            params['max_kernel_size'] = max_kernel_size
                            for pooling in ['max']:
                                params['pooling'] = pooling
                                for units1 in [16]:
                                    params['units1'] = units1
                                    for activation1 in ['relu']:
                                        params['activation1'] = activation1
                                        for optimizer in ['nadam']:
                                            params['optimizer'] = optimizer
                                            for batch_size in [150]:
                                                params['batch_size'] = batch_size
                                                yield params, computed_params, samples
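# A sketch (assumption, not from the original listing) of how the grid-search
# driver is expected to consume generate(): each yielded combination is trained
# and scored by some cross-validation routine (compute_cv_score() is a
# hypothetical stand-in, as is the `grid` object holding generate()), and the
# running best score/params are written to best_score_wrt:
#
#   best_score, best_params = 0.0, None
#   for params, computed_params, samples in grid.generate():
#       cur_score = compute_cv_score(params, computed_params, samples)
#       if cur_score > best_score:
#           best_score, best_params = cur_score, dict(params)
#           best_score_wrt.write(u'score={} params={}\n'.format(best_score, get_params_str(best_params)))
#           best_score_wrt.flush()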
    best_score_wrt.flush()
    logging.info('Grid search complete, best_score=%f best_params=%s', best_score, get_params_str(best_params))
    best_score_wrt.close()

if run_mode == 'train':
    logging.info('Start with run_mode==train')

    params = dict()
    params['padding'] = 'left'
    samples, computed_params = load_dataset(params)

    embeddings = WordEmbeddings.load_word_vectors(wordchar2vector_path, word2vector_path)
    word_dims = embeddings.vector_size
    computed_params['embeddings'] = embeddings
    computed_params['word_dims'] = word_dims

    params['net_arch'] = 'rnn(cnn)'
    params['rnn_size'] = 200  # 500
    params['units1'] = 15
    params['activation1'] = 'relu'
    params['nb_filters'] = 150
    params['min_kernel_size'] = 1
    params['max_kernel_size'] = 2
    params['pooling'] = 'max'
    params['optimizer'] = 'nadam'
    params['batch_size'] = 250  # 150
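    # (Assumption, not shown in this excerpt: with the hyperparameters fixed above,
    # the 'train' branch presumably goes on to build the network from
    # params/computed_params, fit it on `samples`, and save the resulting weights,
    # architecture and model config for later use.)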
def prepare_data(input_path, params, max_samples):
    logging.info('prepare_data for "%s"', get_params_str(params))

    samples3 = []
    df = pd.read_csv(input_path, encoding='utf-8', delimiter='\t', quoting=3)
    for anchor, positive, negative in zip(df['anchor'].values, df['positive'].values, df['negative'].values):
        samples3.append(Sample3(anchor, positive, negative))

    if len(samples3) > max_samples:
        samples3 = random.sample(samples3, max_samples)

    computed_params = dict()

    if params['repres'] == 'words':
        embeddings = WordEmbeddings.load_word_vectors(params['wordchar2vector_path'], params['word2vector_path'])
        computed_params['embeddings'] = embeddings
        computed_params['word_dims'] = embeddings.vector_size

        tokenizer = Tokenizer()
        tokenizer.load()
        computed_params['tokenizer'] = tokenizer

        max_wordseq_len = 0
        for sample in samples3:
            for phrase in [sample.anchor, sample.positive, sample.negative]:
                words = tokenizer.tokenize(phrase)
                max_wordseq_len = max(max_wordseq_len, len(words))

        logging.info('max_wordseq_len={}'.format(max_wordseq_len))
        computed_params['max_wordseq_len'] = max_wordseq_len

        # Pad all phrases to the same length
        pad_func = lpad_wordseq if params['padding'] == 'left' else rpad_wordseq
        computed_params['pad_func'] = pad_func
        for sample in samples3:
            sample.anchor_words = pad_func(tokenizer.tokenize(sample.anchor), max_wordseq_len)
            sample.positive_words = pad_func(tokenizer.tokenize(sample.positive), max_wordseq_len)
            sample.negative_words = pad_func(tokenizer.tokenize(sample.negative), max_wordseq_len)
    elif params['repres'] == 'pieces':
        spm_name = 'spm_synonymy({})'.format(params['spm_items'])
        computed_params['spm_name'] = spm_name

        if not os.path.exists(os.path.join(tmp_folder, spm_name + '.model')):
            # Training the SentencePiece model requires a text corpus; build it
            # from the sentence variants available in the training set.
            all_texts = set()
            for sample in samples3:
                all_texts.add(sample.anchor)
                all_texts.add(sample.positive)
                all_texts.add(sample.negative)

            sentencepiece_corpus = os.path.join(tmp_folder, 'sentencepiece_corpus.txt')
            with io.open(sentencepiece_corpus, 'w', encoding='utf-8') as wrt:
                for text in all_texts:
                    wrt.write(text)
                    wrt.write(u'\n')

            # The corpus is ready, train the segmenter.
            logging.info('Train SentencePiece model on {}...'.format(sentencepiece_corpus))
            spm.SentencePieceTrainer.Train(
                '--input={} --model_prefix={} --vocab_size={} --character_coverage=1.0 --model_type=bpe --input_sentence_size=10000000'.format(
                    sentencepiece_corpus, spm_name, params['spm_items']))
            os.rename(spm_name + '.vocab', os.path.join(tmp_folder, spm_name + '.vocab'))
            os.rename(spm_name + '.model', os.path.join(tmp_folder, spm_name + '.model'))

        splitter = spm.SentencePieceProcessor()
        splitter.Load(os.path.join(tmp_folder, spm_name + '.model'))
        computed_params['splitter'] = splitter

        max_wordseq_len = 0
        all_tokens = set([PAD_TOKEN])
        for sample in samples3:
            for phrase in [sample.anchor, sample.positive, sample.negative]:
                tokens = splitter.EncodeAsPieces(phrase)
                max_wordseq_len = max(max_wordseq_len, len(tokens))
                all_tokens.update(tokens)

        logging.info('max_wordseq_len={}'.format(max_wordseq_len))
        computed_params['max_wordseq_len'] = max_wordseq_len

        token2index = {PAD_TOKEN: 0}
        for token in all_tokens:
            if token != PAD_TOKEN:
                token2index[token] = len(token2index)

        computed_params['token2index'] = token2index

        for sample in samples3:
            sample.anchor_words = spm2tokens(splitter, sample.anchor, max_wordseq_len, token2index)
            sample.positive_words = spm2tokens(splitter, sample.positive, max_wordseq_len, token2index)
            sample.negative_words = spm2tokens(splitter, sample.negative, max_wordseq_len, token2index)
    else:
        raise NotImplementedError()

    return samples3, computed_params
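# --- Assumed helpers (sketches, not from the original listing) ---
# prepare_data() relies on lpad_wordseq(), rpad_wordseq() and spm2tokens(), which
# are defined elsewhere in the project. The versions below only illustrate the
# behaviour the code above appears to rely on and may differ from the real
# implementations; in particular, the padding symbol and the exact return type of
# spm2tokens() are not visible in this excerpt.

def lpad_wordseq(words, n):
    # Left-pad the token list up to length n (hypothetical sketch).
    return [PAD_TOKEN] * max(0, n - len(words)) + list(words)


def rpad_wordseq(words, n):
    # Right-pad the token list up to length n (hypothetical sketch).
    return list(words) + [PAD_TOKEN] * max(0, n - len(words))


def spm2tokens(splitter, phrase, max_len, token2index):
    # Segment a phrase into SentencePiece pieces, pad to max_len and map the
    # pieces to integer indices (hypothetical sketch; 0 is the PAD_TOKEN index).
    tokens = rpad_wordseq(splitter.EncodeAsPieces(phrase), max_len)
    return [token2index.get(t, 0) for t in tokens]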
premises_train, premises_test, questions_train, questions_test, \
    answers_train, answers_test = train_test_split(premises, questions, answers,
                                                   test_size=TEST_SHARE,
                                                   random_state=SEED)

print('Generating training samples...')
train_inputs, train_targets = generate_samples(premises_train, questions_train, answers_train, max_answer_len)
nb_train = len(train_inputs)
print('nb_train={}'.format(nb_train))

print('Generating test samples...')
test_inputs, test_targets = generate_samples(premises_test, questions_test, answers_test, max_answer_len)
nb_test = len(test_inputs)
print('nb_test={}'.format(nb_test))

wc2v_path = os.path.join(data_folder, 'wordchar2vector.dat')
word2vec = WordEmbeddings.load_word_vectors(wc2v_path, w2v_path)
word_dims = word2vec.vector_size
print('word_dims={0}'.format(word_dims))

model_config = {
    'engine': 'nn',
    'max_inputseq_len': max_phrase_len,
    'max_outseq_len': max_answer_len,
    'w2v_path': w2v_path,
    'wordchar2vector_path': wc2v_path,
    'PAD_WORD': PAD_WORD,
    'model_folder': tmp_folder,
    'word_dims': word_dims,
    'char2index': char2index,
    'arch_filepath': arch_filepath,
    'weights_filepath': weights_path
# Load the model configuration, weights, etc.
with open(config_path, 'r') as f:
    model_config = json.load(f)

repres = model_config['repres']
max_wordseq_len = model_config['max_wordseq_len']
net_arch = model_config['net_arch']
padding = model_config['padding']

with open(arch_filepath, 'r') as f:
    model = model_from_json(f.read())

model.load_weights(weights_path)

if repres == 'words':
    embeddings = WordEmbeddings.load_word_vectors(model_config['wordchar2vector_path'], model_config['w2v_path'])
    word_dims = embeddings.vector_size
    pad_func = lpad_wordseq if padding == 'left' else rpad_wordseq
    tokenizer = Tokenizer()
    tokenizer.load()
elif repres == 'pieces':
    splitter = spm.SentencePieceProcessor()
    splitter.Load(os.path.join(tmp_folder, model_config['spm_name'] + '.model'))

# Load the reference sentences; the sentence entered in the console will be
# compared against them for similarity.
phrases2 = set()
if True:
    for phrase in load_strings_from_yaml(os.path.join(data_folder, 'rules.yaml')):