def train(init_path=None, is_reverse_model=False):
    """Load the indices and w2v matrix, build the NN model and train it.

    :param init_path: forwarded to ``get_nn_model`` as ``model_init_path``.
    :param is_reverse_model: forwarded to ``get_nn_model``.
    """
    train_corpus_path = get_processed_corpus_path(TRAIN_CORPUS_NAME)
    val_corpus_path = get_processed_corpus_path(CONTEXT_SENSITIVE_VAL_CORPUS_NAME)
    tokens_index_path = get_index_to_token_path(BASE_CORPUS_NAME)
    conditions_index_path = get_index_to_condition_path(BASE_CORPUS_NAME)

    # Check that the required files exist before compiling the model.
    # Note: the conditions index path is not part of this check.
    required_files = [train_corpus_path, val_corpus_path, tokens_index_path]
    _look_for_saved_files(files_paths=required_files)

    index_to_token = load_index_to_item(tokens_index_path)
    index_to_condition = load_index_to_item(conditions_index_path)
    w2v_matrix = _get_w2v_embedding_matrix_by_corpus_path(train_corpus_path, index_to_token)

    # Build the model (the second element of the returned pair is ignored) and train it.
    resolver_factory = S3FileResolver.init_resolver(
        bucket_name=S3_MODELS_BUCKET_NAME,
        remote_dir=S3_NN_MODEL_REMOTE_DIR,
    )
    model_kwargs = dict(
        model_init_path=init_path,
        w2v_matrix=w2v_matrix,
        resolver_factory=resolver_factory,
        is_reverse_model=is_reverse_model,
    )
    nn_model, _ = get_nn_model(index_to_token, index_to_condition, **model_kwargs)
    train_model(nn_model)
def _calculate_tfidf_vectorizer(base_corpus_name=BASE_CORPUS_NAME):
    """Fit and return a ``TfidfVectorizer`` over the train lines.

    The vectorizer vocabulary is the token -> index mapping obtained by inverting
    the index -> token mapping loaded for ``base_corpus_name``.
    """
    tokens_index_path = get_index_to_token_path(base_corpus_name)
    index_to_token = load_index_to_item(tokens_index_path)

    # Invert index -> token into token -> index.
    token_to_index = {}
    for idx, token in index_to_token.items():
        token_to_index[token] = idx

    lines = _load_train_lines()
    vectorizer = TfidfVectorizer(tokenizer=get_tokens_sequence, vocabulary=token_to_index)
    vectorizer.fit(lines)
    return vectorizer
def _get_index_to_token(fetch_from_s3):
    """Load the index -> token mapping for ``BASE_CORPUS_NAME``.

    :param fetch_from_s3: when truthy, the file is resolved through ``S3FileResolver``;
        otherwise it must already exist on the local filesystem.
    :raises Exception: if the file could not be resolved / does not exist.
    """
    path = get_index_to_token_path(BASE_CORPUS_NAME)

    if fetch_from_s3:
        resolver = S3FileResolver(path, S3_MODELS_BUCKET_NAME, S3_TOKENS_IDX_REMOTE_DIR)
        is_available = resolver.resolve()
        failure_message = 'Can\'t get index_to_token because file does not exist at S3'
    else:
        is_available = os.path.exists(path)
        failure_message = ('Can\'t get index_to_token because file does not exist. '
                           'Run tools/download_model.py first to get all required files or construct it by yourself.')

    if not is_available:
        raise Exception(failure_message)

    return load_index_to_item(path)
def load_model():
    """Load the NN model stored at the path returned by ``get_model_full_path()``.

    :return: the model returned by ``get_nn_model``.
    :raises FileNotFoundException: if ``get_nn_model`` reports that the model does not exist.
    """
    tokens_index_path = get_index_to_token_path(BASE_CORPUS_NAME)
    conditions_index_path = get_index_to_condition_path(BASE_CORPUS_NAME)
    model_path = get_model_full_path()

    tokens_index = load_index_to_item(tokens_index_path)
    conditions_index = load_index_to_item(conditions_index_path)

    nn_model, model_exists = get_nn_model(tokens_index, conditions_index, nn_model_path=model_path)
    if model_exists:
        return nn_model

    message = 'Couldn\'t find model:\n"{}". \nExiting...'.format(model_path)
    raise FileNotFoundException(message)
def load_model(model_path, tokens_index_path=None, conditions_index_path=None):
    """Load the NN model located at ``model_path``.

    :param model_path: passed to ``get_nn_model`` as the third positional argument.
    :param tokens_index_path: index -> token file; when ``None`` the path for
        ``BASE_CORPUS_NAME`` is used.
    :param conditions_index_path: index -> condition file; when ``None`` the path for
        ``BASE_CORPUS_NAME`` is used.
    :return: the model returned by ``get_nn_model``.
    :raises ValueError: if ``get_nn_model`` reports that the model does not exist.
    """
    tokens_path = tokens_index_path
    if tokens_path is None:
        tokens_path = get_index_to_token_path(BASE_CORPUS_NAME)

    conditions_path = conditions_index_path
    if conditions_path is None:
        conditions_path = get_index_to_condition_path(BASE_CORPUS_NAME)

    tokens_index = load_index_to_item(tokens_path)
    conditions_index = load_index_to_item(conditions_path)

    nn_model, model_exists = get_nn_model(tokens_index, conditions_index, model_path)
    if model_exists:
        return nn_model

    raise ValueError('Couldn\'t find model: "{}".'.format(model_path))
def load_model(model_path=None, tokens_index_path=None, conditions_index_path=None):
    """Load the NN model, falling back to default paths for every omitted argument.

    :param model_path: model location; when ``None`` the result of
        ``get_model_full_path()`` is used.
    :param tokens_index_path: index -> token file; when ``None`` the path for
        ``BASE_CORPUS_NAME`` is used.
    :param conditions_index_path: index -> condition file; when ``None`` the path for
        ``BASE_CORPUS_NAME`` is used.
    :return: the model returned by ``get_nn_model``.
    :raises ValueError: if ``get_nn_model`` reports that the model does not exist.
    """
    resolved_model_path = get_model_full_path() if model_path is None else model_path
    tokens_path = (get_index_to_token_path(BASE_CORPUS_NAME)
                   if tokens_index_path is None else tokens_index_path)
    conditions_path = (get_index_to_condition_path(BASE_CORPUS_NAME)
                       if conditions_index_path is None else conditions_index_path)

    tokens_index = load_index_to_item(tokens_path)
    conditions_index = load_index_to_item(conditions_path)

    nn_model, model_exists = get_nn_model(tokens_index, conditions_index, nn_model_path=resolved_model_path)
    if model_exists:
        return nn_model

    raise ValueError('Couldn\'t find model: "{}".'.format(resolved_model_path))
def _get_index_to_token(fetch_from_s3):
    """Return the index -> token mapping for ``BASE_CORPUS_NAME``.

    :param fetch_from_s3: when truthy, resolve the file through ``S3FileResolver``;
        otherwise require that it already exists locally.
    :raises Exception: if the file could not be resolved / does not exist.
    """
    tokens_index_path = get_index_to_token_path(BASE_CORPUS_NAME)

    if fetch_from_s3:
        resolved = S3FileResolver(tokens_index_path, S3_MODELS_BUCKET_NAME, S3_TOKENS_IDX_REMOTE_DIR).resolve()
        if not resolved:
            raise Exception('Can\'t get index_to_token because file does not exist at S3')
    elif not os.path.exists(tokens_index_path):
        raise Exception(
            'Can\'t get index_to_token because file does not exist. '
            'Run tools/download_model.py first to get all required files or construct it by yourself.')

    return load_index_to_item(tokens_index_path)
def load_model():
    """Build the NN model from the default model path and the ``BASE_CORPUS_NAME`` indices.

    :return: the model returned by ``get_nn_model``.
    :raises FileNotFoundException: if ``get_nn_model`` reports that the model does not exist.
    """
    tokens_path = get_index_to_token_path(BASE_CORPUS_NAME)
    conditions_path = get_index_to_condition_path(BASE_CORPUS_NAME)
    full_model_path = get_model_full_path()

    indices = (load_index_to_item(tokens_path), load_index_to_item(conditions_path))
    nn_model, model_exists = get_nn_model(indices[0], indices[1], nn_model_path=full_model_path)

    if not model_exists:
        error_text = 'Couldn\'t find model:\n"{}". \nExiting...'.format(full_model_path)
        raise FileNotFoundException(error_text)
    return nn_model
def _get_index_to_token(fetch_from_s3):
    """Load the index -> token mapping for ``BASE_CORPUS_NAME``.

    :param fetch_from_s3: when truthy, the file is resolved through ``S3FileResolver``;
        otherwise it must already exist on the local filesystem.
    :return: the mapping returned by ``load_index_to_item``.
    :raises FileNotFoundException: if the file could not be resolved on S3 or
        does not exist locally.
    """
    index_to_token_path = get_index_to_token_path(BASE_CORPUS_NAME)
    # Only the base name is reported in error messages.
    file_name = os.path.basename(index_to_token_path)

    if fetch_from_s3:
        tokens_idx_resolver = S3FileResolver(index_to_token_path, S3_MODELS_BUCKET_NAME, S3_TOKENS_IDX_REMOTE_DIR)
        if not tokens_idx_resolver.resolve():
            raise FileNotFoundException('No such file on S3: {}'.format(file_name))
    elif not os.path.exists(index_to_token_path):
        # The two sentences used to be concatenated without a separator, which glued
        # the file name to the hint ("...file.jsonRun ..."); keep them in one template.
        raise FileNotFoundException(
            'No such file: {}. Run "python tools/fetch.py" first to get all necessary files.'.format(file_name))

    return load_index_to_item(index_to_token_path)
def train(is_reverse_model=False):
    """Check the required files, build the NN model and train it.

    :param is_reverse_model: used to compute the model path and forwarded to ``train_model``.
    """
    train_corpus_path = get_processed_corpus_path(TRAIN_CORPUS_NAME)
    val_corpus_path = get_processed_corpus_path(CONTEXT_SENSITIVE_VAL_CORPUS_NAME)
    tokens_index_path = get_index_to_token_path(BASE_CORPUS_NAME)
    conditions_index_path = get_index_to_condition_path(BASE_CORPUS_NAME)
    saved_model_path = get_model_full_path(is_reverse_model)

    # Run the file and saved-model checks before compiling the model.
    # Note: the conditions index path is not part of the file check.
    required_files = [train_corpus_path, val_corpus_path, tokens_index_path]
    _look_for_saved_files(files_paths=required_files)
    _look_for_saved_model(saved_model_path)

    tokens_index = load_index_to_item(tokens_index_path)
    conditions_index = load_index_to_item(conditions_index_path)
    embedding_matrix = _get_w2v_embedding_matrix_by_corpus_path(train_corpus_path, tokens_index)

    # Build the model (the second element of the returned pair is ignored) and train it.
    nn_model, _ = get_nn_model(tokens_index, conditions_index, embedding_matrix)
    train_model(nn_model, is_reverse_model=is_reverse_model)
def get_model_vocab_size():
    """Return the number of entries in the index -> token mapping of ``BASE_CORPUS_NAME``."""
    return len(load_index_to_item(get_index_to_token_path(BASE_CORPUS_NAME)))
conditions = [ condition for condition, _ in conditions_counter.most_common(max_conditions_num) ] # Validate the condition list if DEFAULT_CONDITION not in conditions: raise Exception( 'No default condition "%s" found in the dataset condition list.' % DEFAULT_CONDITION) # Return index_to_token and index_to_condition mappings return dict(enumerate(vocab)), dict(enumerate(conditions)) def dump_index_to_item(index_to_item, path): ensure_dir(os.path.dirname(path)) with codecs.open(path, 'w', 'utf-8') as fh: json.dump(index_to_item, fh, ensure_ascii=False) if __name__ == '__main__': processed_train_corpus_path = get_processed_corpus_path(TRAIN_CORPUS_NAME) index_to_token_path = get_index_to_token_path(BASE_CORPUS_NAME) index_to_condition_path = get_index_to_condition_path(BASE_CORPUS_NAME) index_to_token, index_to_condition = build_index_mappings( processed_train_corpus_path) dump_index_to_item(index_to_token, index_to_token_path) dump_index_to_item(index_to_condition, index_to_condition_path)
def predict(model_path,
            tokens_index_path=None,
            conditions_index_path=None,
            default_predictions_path=None,
            reverse_model_weights=None,
            temperatures=None,
            prediction_mode=None):
    """Generate predictions for the questions corpus, one output file per parameter set.

    :param model_path: forwarded to ``get_nn_model`` as ``model_init_path``.
    :param tokens_index_path: index -> token file; defaults to the path for ``BASE_CORPUS_NAME``.
    :param conditions_index_path: index -> condition file; defaults to the path for
        ``BASE_CORPUS_NAME``.
    :param default_predictions_path: base path (without extension) of the output files;
        defaults to ``DATA_DIR/results/predictions_<model_name>``.
    :param reverse_model_weights: optional iterable of values for the
        ``mmi_reverse_model_score_weight`` prediction parameter.
    :param temperatures: optional iterable of values for the ``temperature`` prediction
        parameter; defaults to ``[DEFAULT_TEMPERATURE]``.
    :param prediction_mode: defaults to ``PREDICTION_MODE_FOR_TESTS``.
    :return: ``None``; returns early (after logging a warning) when the tokens index
        file is missing or empty.
    """
    if not tokens_index_path:
        tokens_index_path = get_index_to_token_path(BASE_CORPUS_NAME)
    if not conditions_index_path:
        conditions_index_path = get_index_to_condition_path(BASE_CORPUS_NAME)
    if not temperatures:
        temperatures = [DEFAULT_TEMPERATURE]
    if not prediction_mode:
        prediction_mode = PREDICTION_MODE_FOR_TESTS

    # Construct list of parameters values for all possible combinations of passed parameters
    prediction_params = [dict()]
    if reverse_model_weights:
        prediction_params = [
            dict(params, mmi_reverse_model_score_weight=w)
            for params in prediction_params
            for w in reverse_model_weights
        ]
    if temperatures:
        prediction_params = [
            dict(params, temperature=t)
            for params in prediction_params
            for t in temperatures
        ]

    if not is_non_empty_file(tokens_index_path):
        _logger.warning('Couldn\'t find tokens_index file:\n{}. \nExiting...'.format(tokens_index_path))
        return

    index_to_token = load_index_to_item(tokens_index_path)
    index_to_condition = load_index_to_item(conditions_index_path)
    nn_model, _ = get_nn_model(index_to_token, index_to_condition, model_init_path=model_path)

    if not default_predictions_path:
        default_predictions_path = os.path.join(DATA_DIR, 'results', 'predictions_' + nn_model.model_name)

    # Get path for each combination of parameters
    # Add suffix to the filename only for parameters that have a specific value passed as an argument
    # If no parameters were specified, no suffix is added
    if len(prediction_params) > 1:
        predictions_paths = [
            '{base_path}_{params_str}.tsv'.format(
                base_path=default_predictions_path,
                params_str='_'.join(['{}_{}'.format(k, v) for k, v in cur_params.items()]))
            for cur_params in prediction_params
        ]
    else:
        predictions_paths = [default_predictions_path + '.tsv']

    _logger.info('Model for prediction:\n{}'.format(nn_model.model_load_path))
    _logger.info('Tokens index:\n{}'.format(tokens_index_path))
    _logger.info('File with questions:\n{}'.format(QUESTIONS_CORPUS_NAME))
    _logger.info('Files to dump responses:\n{}'.format('\n'.join(predictions_paths)))
    _logger.info('Prediction parameters\n{}'.format('\n'.join([str(x) for x in prediction_params])))

    processed_test_set = get_tokenized_test_lines(QUESTIONS_CORPUS_NAME, set(index_to_token.values()))
    processed_test_set = list(processed_test_set)

    for cur_params, cur_path in zip(prediction_params, predictions_paths):
        _logger.info('Predicting with the following params: {}'.format(cur_params))
        # Forward the current parameter combination; without it every output file
        # would be produced with the same settings despite its distinct name.
        # NOTE(review): assumes _save_test_results accepts these keyword arguments -- confirm.
        _save_test_results(processed_test_set,
                           cur_path,
                           nn_model,
                           prediction_modes=[prediction_mode],
                           **cur_params)
def train(model_init_path=None,
          is_reverse_model=False,
          train_subset_size=None,
          use_pretrained_w2v=USE_PRETRAINED_W2V_EMBEDDINGS_LAYER,
          train_corpus_name=TRAIN_CORPUS_NAME,
          context_sensitive_val_corpus_name=CONTEXT_SENSITIVE_VAL_CORPUS_NAME,
          base_corpus_name=BASE_CORPUS_NAME,
          s3_models_bucket_name=S3_MODELS_BUCKET_NAME,
          s3_nn_model_remote_dir=S3_NN_MODEL_REMOTE_DIR,
          prediction_mode_for_tests=PREDICTION_MODE_FOR_TESTS):
    """Assemble all inputs of ``CakeChatModel`` and run its training.

    :param model_init_path: forwarded to ``CakeChatModel``.
    :param is_reverse_model: forwarded to the dataset getters and to ``CakeChatModel``;
        when truthy no reverse model is loaded.
    :param train_subset_size: forwarded to ``get_training_dataset``.
    :param use_pretrained_w2v: when truthy the w2v model is loaded via ``get_w2v_model()``;
        otherwise an empty ``ModelParam`` is used.
    :param train_corpus_name: name of the training corpus.
    :param context_sensitive_val_corpus_name: name of the validation corpus whose
        processed file must exist.
    :param base_corpus_name: corpus name used to locate both index files.
    :param s3_models_bucket_name: bucket name passed to ``S3FileResolver.init_resolver``.
    :param s3_nn_model_remote_dir: remote dir passed to ``S3FileResolver.init_resolver``.
    :param prediction_mode_for_tests: passed to ``get_reverse_model``.
    """
    train_corpus_path = get_processed_corpus_path(train_corpus_name)
    val_corpus_path = get_processed_corpus_path(context_sensitive_val_corpus_name)
    tokens_index_path = get_index_to_token_path(base_corpus_name)
    conditions_index_path = get_index_to_condition_path(base_corpus_name)

    # check the existence of all necessary files before compiling the model
    _look_for_saved_files(files_paths=[train_corpus_path, val_corpus_path, tokens_index_path])

    # load essentials for building model and training
    index_to_token = load_index_to_item(tokens_index_path)
    index_to_condition = load_index_to_item(conditions_index_path)
    token_to_index = dict((token, idx) for idx, token in index_to_token.items())
    condition_to_index = dict((condition, idx) for idx, condition in index_to_condition.items())

    training_dataset = get_training_dataset(train_corpus_name, token_to_index, condition_to_index,
                                            is_reverse_model, train_subset_size)
    training_data_param = ModelParam(value=training_dataset, id=train_corpus_name)

    val_sets_names = get_validation_sets_names()
    validation_data = get_validation_dataset_name_to_data(val_sets_names, token_to_index, condition_to_index,
                                                          is_reverse_model)
    validation_data_param = ModelParam(value=validation_data, id=get_validation_data_id(val_sets_names))

    if use_pretrained_w2v:
        w2v_model_param = ModelParam(value=get_w2v_model(), id=get_w2v_model_id())
    else:
        w2v_model_param = ModelParam(value=None, id=None)

    model_resolver_factory = S3FileResolver.init_resolver(bucket_name=s3_models_bucket_name,
                                                          remote_dir=s3_nn_model_remote_dir)

    # A reverse model is only fetched when the model being trained is not itself one.
    if is_reverse_model:
        reverse_model = None
    else:
        reverse_model = get_reverse_model(prediction_mode_for_tests)

    # build CakeChatModel
    cakechat_model = CakeChatModel(
        index_to_token,
        index_to_condition,
        training_data_param=training_data_param,
        validation_data_param=validation_data_param,
        w2v_model_param=w2v_model_param,
        model_init_path=model_init_path,
        model_resolver=model_resolver_factory,
        is_reverse_model=is_reverse_model,
        reverse_model=reverse_model,
        horovod=hvd,
    )

    # train model
    cakechat_model.train_model()
# Build the tokens list vocab = list(SPECIAL_TOKENS) + \ [token for token, _ in tokens_counter.most_common(max_tokens_num - len(SPECIAL_TOKENS))] # Build the conditions list conditions = [condition for condition, _ in conditions_counter.most_common(max_conditions_num)] # Validate the condition list if DEFAULT_CONDITION not in conditions: raise Exception('No default condition "%s" found in the dataset condition list.' % DEFAULT_CONDITION) # Return index_to_token and index_to_condition mappings return dict(enumerate(vocab)), dict(enumerate(conditions)) def dump_index_to_item(index_to_item, path): ensure_dir(os.path.dirname(path)) with codecs.open(path, 'w', 'utf-8') as fh: json.dump(index_to_item, fh, ensure_ascii=False) if __name__ == '__main__': processed_train_corpus_path = get_processed_corpus_path(TRAIN_CORPUS_NAME) index_to_token_path = get_index_to_token_path(BASE_CORPUS_NAME) index_to_condition_path = get_index_to_condition_path(BASE_CORPUS_NAME) index_to_token, index_to_condition = build_index_mappings(processed_train_corpus_path) dump_index_to_item(index_to_token, index_to_token_path) dump_index_to_item(index_to_condition, index_to_condition_path)
def predict(model_path=None,
            tokens_index_path=None,
            conditions_index_path=None,
            default_predictions_path=None,
            reverse_model_weights=None,
            temperatures=None,
            prediction_mode=PREDICTION_MODE_FOR_TESTS):
    """Generate predictions for the questions corpus, one output file per parameter set.

    :param model_path: model file; defaults to ``get_model_full_path()``.
    :param tokens_index_path: index -> token file; defaults to the path for ``BASE_CORPUS_NAME``.
    :param conditions_index_path: index -> condition file; defaults to the path for
        ``BASE_CORPUS_NAME``.
    :param default_predictions_path: base path (without extension) of the output files;
        defaults to ``DATA_DIR/results/predictions_<model params string>``.
    :param reverse_model_weights: optional iterable of values for the
        ``mmi_reverse_model_score_weight`` prediction parameter.
    :param temperatures: optional iterable of values for the ``temperature`` prediction parameter.
    :param prediction_mode: passed positionally to ``_save_test_results``.
    :return: ``None``; returns early (after logging a warning) when the model file or
        the tokens index file is missing or empty.
    """
    if not model_path:
        model_path = get_model_full_path()
    if not tokens_index_path:
        tokens_index_path = get_index_to_token_path(BASE_CORPUS_NAME)
    if not conditions_index_path:
        conditions_index_path = get_index_to_condition_path(BASE_CORPUS_NAME)
    if not default_predictions_path:
        default_predictions_path = os.path.join(DATA_DIR, 'results', 'predictions_' + get_model_full_params_str())

    # Construct list of parameters values for all possible combinations of passed parameters
    prediction_params = [dict()]
    if reverse_model_weights:
        prediction_params = [
            dict(params, mmi_reverse_model_score_weight=w)
            for params in prediction_params
            for w in reverse_model_weights
        ]
    if temperatures:
        prediction_params = [dict(params, temperature=t) for params in prediction_params for t in temperatures]

    # Get path for each combination of parameters
    # Add suffix to the filename only for parameters that have a specific value passed as an argument
    # If no parameters were specified, no suffix is added
    if len(prediction_params) > 1:
        predictions_paths = [
            '{base_path}_{params_str}.tsv'.format(
                base_path=default_predictions_path,
                params_str='_'.join(['{}_{}'.format(k, v) for k, v in cur_params.items()]))
            for cur_params in prediction_params
        ]
    else:
        predictions_paths = [default_predictions_path + '.tsv']

    # `Logger.warn` is a deprecated alias; `warning` is the supported spelling.
    if not is_non_empty_file(model_path):
        _logger.warning('Couldn\'t find model:\n"{}". \nExiting...'.format(model_path))
        return

    if not is_non_empty_file(tokens_index_path):
        _logger.warning('Couldn\'t find tokens_index file:\n"{}". \nExiting...'.format(tokens_index_path))
        return

    _logger.info('Model for prediction:\n{}'.format(model_path))
    _logger.info('Tokens index:\n{}'.format(tokens_index_path))
    _logger.info('File with questions:\n{}'.format(QUESTIONS_CORPUS_NAME))
    _logger.info('Files to dump responses:\n{}'.format('\n'.join(predictions_paths)))
    _logger.info('Prediction parameters\n{}'.format('\n'.join([str(x) for x in prediction_params])))

    index_to_token = load_index_to_item(tokens_index_path)
    index_to_condition = load_index_to_item(conditions_index_path)

    processed_test_set = get_tokenized_test_lines(QUESTIONS_CORPUS_NAME, set(index_to_token.values()))
    processed_test_set = list(processed_test_set)

    nn_model, _ = get_nn_model(index_to_token, index_to_condition, nn_model_path=model_path)

    for cur_params, cur_path in zip(prediction_params, predictions_paths):
        _logger.info('Predicting with the following params: {}'.format(cur_params))
        _save_test_results(processed_test_set, cur_path, nn_model, prediction_mode, **cur_params)