Example #1
def train(init_path=None, is_reverse_model=False):
    processed_train_corpus_path = get_processed_corpus_path(TRAIN_CORPUS_NAME)
    processed_val_corpus_path = get_processed_corpus_path(
        CONTEXT_SENSITIVE_VAL_CORPUS_NAME)
    index_to_token_path = get_index_to_token_path(BASE_CORPUS_NAME)
    index_to_condition_path = get_index_to_condition_path(BASE_CORPUS_NAME)

    # check the existence of all necessary files before compiling the model
    _look_for_saved_files(files_paths=[
        processed_train_corpus_path, processed_val_corpus_path,
        index_to_token_path
    ])

    index_to_token = load_index_to_item(index_to_token_path)
    index_to_condition = load_index_to_item(index_to_condition_path)

    w2v_matrix = _get_w2v_embedding_matrix_by_corpus_path(
        processed_train_corpus_path, index_to_token)

    # get nn_model and train it
    nn_model_resolver_factory = S3FileResolver.init_resolver(
        bucket_name=S3_MODELS_BUCKET_NAME, remote_dir=S3_NN_MODEL_REMOTE_DIR)

    nn_model, _ = get_nn_model(index_to_token,
                               index_to_condition,
                               model_init_path=init_path,
                               w2v_matrix=w2v_matrix,
                               resolver_factory=nn_model_resolver_factory,
                               is_reverse_model=is_reverse_model)

    train_model(nn_model)
Example #2
def _calculate_tfidf_vectorizer(base_corpus_name=BASE_CORPUS_NAME):
    index_to_token = load_index_to_item(get_index_to_token_path(base_corpus_name))
    token_to_index = {v: k for k, v in index_to_token.items()}
    train_lines = _load_train_lines()
    tfidf_vectorizer = TfidfVectorizer(tokenizer=get_tokens_sequence, vocabulary=token_to_index)
    tfidf_vectorizer.fit(train_lines)
    return tfidf_vectorizer
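
A minimal usage sketch for the helper above (the query line is hypothetical): the fitted vectorizer maps raw lines onto tf-idf weights over the fixed model vocabulary.

tfidf_vectorizer = _calculate_tfidf_vectorizer()
# transform() accepts raw strings, since the vectorizer tokenizes with get_tokens_sequence;
# the result is a sparse matrix of shape (n_lines, vocabulary_size)
query_vectors = tfidf_vectorizer.transform(['how are you doing'])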
Example #3
def _get_index_to_token(fetch_from_s3):
    index_to_token_path = get_index_to_token_path(BASE_CORPUS_NAME)
    if fetch_from_s3:
        tokens_idx_resolver = S3FileResolver(index_to_token_path, S3_MODELS_BUCKET_NAME, S3_TOKENS_IDX_REMOTE_DIR)
        if not tokens_idx_resolver.resolve():
            raise Exception('Can\'t get index_to_token because file does not exist at S3')
    else:
        if not os.path.exists(index_to_token_path):
            raise Exception('Can\'t get index_to_token because file does not exist. '
                            'Run tools/download_model.py first to get all required files or construct it by yourself.')

    return load_index_to_item(index_to_token_path)
Example #4
def load_model():
    index_to_token_path = get_index_to_token_path(BASE_CORPUS_NAME)
    index_to_condition_path = get_index_to_condition_path(BASE_CORPUS_NAME)

    model_path = get_model_full_path()
    index_to_token = load_index_to_item(index_to_token_path)
    index_to_condition = load_index_to_item(index_to_condition_path)

    nn_model, model_exists = get_nn_model(index_to_token, index_to_condition, nn_model_path=model_path)

    if not model_exists:
        raise FileNotFoundException('Couldn\'t find model:\n"{}". \nExiting...'.format(model_path))

    return nn_model
Example #5
def load_model(model_path, tokens_index_path=None, conditions_index_path=None):
    if tokens_index_path is None:
        tokens_index_path = get_index_to_token_path(BASE_CORPUS_NAME)
    if conditions_index_path is None:
        conditions_index_path = get_index_to_condition_path(BASE_CORPUS_NAME)

    index_to_token = load_index_to_item(tokens_index_path)
    index_to_condition = load_index_to_item(conditions_index_path)
    nn_model, model_exists = get_nn_model(index_to_token, index_to_condition, model_path)

    if not model_exists:
        raise ValueError('Couldn\'t find model: "{}".'.format(model_path))

    return nn_model
Example #6
def load_model(model_path=None, tokens_index_path=None, conditions_index_path=None):
    if model_path is None:
        model_path = get_model_full_path()
    if tokens_index_path is None:
        tokens_index_path = get_index_to_token_path(BASE_CORPUS_NAME)
    if conditions_index_path is None:
        conditions_index_path = get_index_to_condition_path(BASE_CORPUS_NAME)

    index_to_token = load_index_to_item(tokens_index_path)
    index_to_condition = load_index_to_item(conditions_index_path)
    nn_model, model_exists = get_nn_model(index_to_token, index_to_condition, nn_model_path=model_path)

    if not model_exists:
        raise ValueError('Couldn\'t find model: "{}".'.format(model_path))

    return nn_model
Example #7
def _get_index_to_token(fetch_from_s3):
    index_to_token_path = get_index_to_token_path(BASE_CORPUS_NAME)
    if fetch_from_s3:
        tokens_idx_resolver = S3FileResolver(index_to_token_path,
                                             S3_MODELS_BUCKET_NAME,
                                             S3_TOKENS_IDX_REMOTE_DIR)
        if not tokens_idx_resolver.resolve():
            raise Exception(
                'Can\'t get index_to_token because file does not exist at S3')
    else:
        if not os.path.exists(index_to_token_path):
            raise Exception(
                'Can\'t get index_to_token because file does not exist. '
                'Run tools/download_model.py first to get all required files or construct it by yourself.'
            )

    return load_index_to_item(index_to_token_path)
Example #8
def load_model():
    index_to_token_path = get_index_to_token_path(BASE_CORPUS_NAME)
    index_to_condition_path = get_index_to_condition_path(BASE_CORPUS_NAME)

    model_path = get_model_full_path()
    index_to_token = load_index_to_item(index_to_token_path)
    index_to_condition = load_index_to_item(index_to_condition_path)

    nn_model, model_exists = get_nn_model(index_to_token,
                                          index_to_condition,
                                          nn_model_path=model_path)

    if not model_exists:
        raise FileNotFoundException(
            'Couldn\'t find model:\n"{}". \nExiting...'.format(model_path))

    return nn_model
Example #9
def _get_index_to_token(fetch_from_s3):
    index_to_token_path = get_index_to_token_path(BASE_CORPUS_NAME)
    file_name = os.path.basename(index_to_token_path)
    if fetch_from_s3:
        tokens_idx_resolver = S3FileResolver(index_to_token_path,
                                             S3_MODELS_BUCKET_NAME,
                                             S3_TOKENS_IDX_REMOTE_DIR)
        if not tokens_idx_resolver.resolve():
            raise FileNotFoundException(
                'No such file on S3: {}'.format(file_name))
    else:
        if not os.path.exists(index_to_token_path):
            raise FileNotFoundException(
                'No such file: {}. '.format(file_name) +
                'Run "python tools/fetch.py" first to get all necessary files.'
            )

    return load_index_to_item(index_to_token_path)
Example #10
def train(is_reverse_model=False):
    processed_train_corpus_path = get_processed_corpus_path(TRAIN_CORPUS_NAME)
    processed_val_corpus_path = get_processed_corpus_path(CONTEXT_SENSITIVE_VAL_CORPUS_NAME)
    index_to_token_path = get_index_to_token_path(BASE_CORPUS_NAME)
    index_to_condition_path = get_index_to_condition_path(BASE_CORPUS_NAME)

    model_path = get_model_full_path(is_reverse_model)

    # check the existence of all necessary files before compiling the model
    _look_for_saved_files(files_paths=[processed_train_corpus_path, processed_val_corpus_path, index_to_token_path])
    _look_for_saved_model(model_path)

    index_to_token = load_index_to_item(index_to_token_path)
    index_to_condition = load_index_to_item(index_to_condition_path)

    w2v_matrix = _get_w2v_embedding_matrix_by_corpus_path(processed_train_corpus_path, index_to_token)

    # get nn_model and train it
    nn_model, _ = get_nn_model(index_to_token, index_to_condition, w2v_matrix)
    train_model(nn_model, is_reverse_model=is_reverse_model)
Example #11
def get_model_vocab_size():
    index_to_token_path = get_index_to_token_path(BASE_CORPUS_NAME)
    index_to_token = load_index_to_item(index_to_token_path)
    return len(index_to_token)
Example #12
    conditions = [
        condition
        for condition, _ in conditions_counter.most_common(max_conditions_num)
    ]

    # Validate the condition list
    if DEFAULT_CONDITION not in conditions:
        raise Exception(
            'No default condition "%s" found in the dataset condition list.' %
            DEFAULT_CONDITION)

    # Return index_to_token and index_to_condition mappings
    return dict(enumerate(vocab)), dict(enumerate(conditions))


def dump_index_to_item(index_to_item, path):
    ensure_dir(os.path.dirname(path))
    with codecs.open(path, 'w', 'utf-8') as fh:
        json.dump(index_to_item, fh, ensure_ascii=False)
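
The mappings are dumped as JSON, where the integer keys are serialized as strings. A plausible counterpart for the load_index_to_item helper used throughout these examples (a sketch under that assumption, not necessarily the project's actual implementation) restores the integer indices on load, reusing the codecs and json imports from this module:

def load_index_to_item(path):
    with codecs.open(path, 'r', 'utf-8') as fh:
        # JSON object keys are always strings, so convert them back to ints
        return {int(index): item for index, item in json.load(fh).items()}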


if __name__ == '__main__':
    processed_train_corpus_path = get_processed_corpus_path(TRAIN_CORPUS_NAME)
    index_to_token_path = get_index_to_token_path(BASE_CORPUS_NAME)
    index_to_condition_path = get_index_to_condition_path(BASE_CORPUS_NAME)

    index_to_token, index_to_condition = build_index_mappings(
        processed_train_corpus_path)
    dump_index_to_item(index_to_token, index_to_token_path)
    dump_index_to_item(index_to_condition, index_to_condition_path)
Example #13
def predict(model_path,
            tokens_index_path=None,
            conditions_index_path=None,
            default_predictions_path=None,
            reverse_model_weights=None,
            temperatures=None,
            prediction_mode=None):

    if not tokens_index_path:
        tokens_index_path = get_index_to_token_path(BASE_CORPUS_NAME)
    if not conditions_index_path:
        conditions_index_path = get_index_to_condition_path(BASE_CORPUS_NAME)
    if not temperatures:
        temperatures = [DEFAULT_TEMPERATURE]
    if not prediction_mode:
        prediction_mode = PREDICTION_MODE_FOR_TESTS

    # Construct a list of parameter values for all possible combinations of the passed parameters
    prediction_params = [dict()]
    if reverse_model_weights:
        prediction_params = [
            dict(params, mmi_reverse_model_score_weight=w)
            for params in prediction_params for w in reverse_model_weights
        ]
    if temperatures:
        prediction_params = [
            dict(params, temperature=t) for params in prediction_params
            for t in temperatures
        ]
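    # Illustration with hypothetical arguments: reverse_model_weights=[0.5] and
    # temperatures=[0.5, 1.0] expand prediction_params to
    # [{'mmi_reverse_model_score_weight': 0.5, 'temperature': 0.5},
    #  {'mmi_reverse_model_score_weight': 0.5, 'temperature': 1.0}]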

    if not is_non_empty_file(tokens_index_path):
        _logger.warning(
            'Couldn\'t find tokens_index file:\n{}. \nExiting...'.format(
                tokens_index_path))
        return

    index_to_token = load_index_to_item(tokens_index_path)
    index_to_condition = load_index_to_item(conditions_index_path)

    nn_model, _ = get_nn_model(index_to_token,
                               index_to_condition,
                               model_init_path=model_path)

    if not default_predictions_path:
        default_predictions_path = os.path.join(
            DATA_DIR, 'results', 'predictions_' + nn_model.model_name)

    # Get path for each combination of parameters
    predictions_paths = []
    # Add a suffix to the filename only for parameters that have a specific value passed as an argument
    # If no parameters were specified, no suffix is added
    if len(prediction_params) > 1:
        for cur_params in prediction_params:
            cur_path = '{base_path}_{params_str}.tsv'.format(
                base_path=default_predictions_path,
                params_str='_'.join(
                    ['{}_{}'.format(k, v) for k, v in cur_params.items()]))
            predictions_paths.append(cur_path)
    else:
        predictions_paths = [default_predictions_path + '.tsv']

    _logger.info('Model for prediction:\n{}'.format(nn_model.model_load_path))
    _logger.info('Tokens index:\n{}'.format(tokens_index_path))
    _logger.info('File with questions:\n{}'.format(QUESTIONS_CORPUS_NAME))
    _logger.info('Files to dump responses:\n{}'.format(
        '\n'.join(predictions_paths)))
    _logger.info('Prediction parameters:\n{}'.format('\n'.join(
        [str(x) for x in prediction_params])))

    processed_test_set = get_tokenized_test_lines(QUESTIONS_CORPUS_NAME,
                                                  set(index_to_token.values()))
    processed_test_set = list(processed_test_set)

    for cur_params, cur_path in zip(prediction_params, predictions_paths):
        _logger.info(
            'Predicting with the following params: {}'.format(cur_params))
        _save_test_results(processed_test_set,
                           cur_path,
                           nn_model,
                           prediction_modes=[prediction_mode])
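
A hedged invocation sketch for the function above (the model path and temperature values are hypothetical): with the optional index paths left as None, the defaults resolve from BASE_CORPUS_NAME, and one .tsv file is written per parameter combination.

predict('data/nn_models/my_trained_model', temperatures=[0.5, 1.0])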
Example #14
def train(model_init_path=None,
          is_reverse_model=False,
          train_subset_size=None,
          use_pretrained_w2v=USE_PRETRAINED_W2V_EMBEDDINGS_LAYER,
          train_corpus_name=TRAIN_CORPUS_NAME,
          context_sensitive_val_corpus_name=CONTEXT_SENSITIVE_VAL_CORPUS_NAME,
          base_corpus_name=BASE_CORPUS_NAME,
          s3_models_bucket_name=S3_MODELS_BUCKET_NAME,
          s3_nn_model_remote_dir=S3_NN_MODEL_REMOTE_DIR,
          prediction_mode_for_tests=PREDICTION_MODE_FOR_TESTS):
    processed_train_corpus_path = get_processed_corpus_path(train_corpus_name)
    processed_val_corpus_path = get_processed_corpus_path(
        context_sensitive_val_corpus_name)
    index_to_token_path = get_index_to_token_path(base_corpus_name)
    index_to_condition_path = get_index_to_condition_path(base_corpus_name)

    # check the existence of all necessary files before compiling the model
    _look_for_saved_files(files_paths=[
        processed_train_corpus_path, processed_val_corpus_path,
        index_to_token_path
    ])

    # load essentials for building model and training
    index_to_token = load_index_to_item(index_to_token_path)
    index_to_condition = load_index_to_item(index_to_condition_path)
    token_to_index = {v: k for k, v in index_to_token.items()}
    condition_to_index = {v: k for k, v in index_to_condition.items()}
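    # e.g. (hypothetical entries) index_to_token == {0: '_unk_', 1: 'hello'}
    # inverts to token_to_index == {'_unk_': 0, 'hello': 1}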

    training_data_param = ModelParam(value=get_training_dataset(
        train_corpus_name, token_to_index, condition_to_index,
        is_reverse_model, train_subset_size),
                                     id=train_corpus_name)

    val_sets_names = get_validation_sets_names()
    validation_data_param = ModelParam(
        value=get_validation_dataset_name_to_data(val_sets_names,
                                                  token_to_index,
                                                  condition_to_index,
                                                  is_reverse_model),
        id=get_validation_data_id(val_sets_names))

    w2v_model_param = ModelParam(value=get_w2v_model(), id=get_w2v_model_id()) if use_pretrained_w2v \
        else ModelParam(value=None, id=None)

    model_resolver_factory = S3FileResolver.init_resolver(
        bucket_name=s3_models_bucket_name, remote_dir=s3_nn_model_remote_dir)

    reverse_model = get_reverse_model(
        prediction_mode_for_tests) if not is_reverse_model else None

    # build CakeChatModel
    cakechat_model = CakeChatModel(index_to_token,
                                   index_to_condition,
                                   training_data_param=training_data_param,
                                   validation_data_param=validation_data_param,
                                   w2v_model_param=w2v_model_param,
                                   model_init_path=model_init_path,
                                   model_resolver=model_resolver_factory,
                                   is_reverse_model=is_reverse_model,
                                   reverse_model=reverse_model,
                                   horovod=hvd)

    # train model
    cakechat_model.train_model()
Example #15
    # Build the tokens list
    vocab = list(SPECIAL_TOKENS) + \
            [token for token, _ in tokens_counter.most_common(max_tokens_num - len(SPECIAL_TOKENS))]

    # Build the conditions list
    conditions = [condition for condition, _ in conditions_counter.most_common(max_conditions_num)]

    # Validate the condition list
    if DEFAULT_CONDITION not in conditions:
        raise Exception('No default condition "%s" found in the dataset condition list.' % DEFAULT_CONDITION)

    # Return index_to_token and index_to_condition mappings
    return dict(enumerate(vocab)), dict(enumerate(conditions))


def dump_index_to_item(index_to_item, path):
    ensure_dir(os.path.dirname(path))
    with codecs.open(path, 'w', 'utf-8') as fh:
        json.dump(index_to_item, fh, ensure_ascii=False)


if __name__ == '__main__':
    processed_train_corpus_path = get_processed_corpus_path(TRAIN_CORPUS_NAME)
    index_to_token_path = get_index_to_token_path(BASE_CORPUS_NAME)
    index_to_condition_path = get_index_to_condition_path(BASE_CORPUS_NAME)

    index_to_token, index_to_condition = build_index_mappings(processed_train_corpus_path)
    dump_index_to_item(index_to_token, index_to_token_path)
    dump_index_to_item(index_to_condition, index_to_condition_path)
Example #16
def predict(model_path=None,
            tokens_index_path=None,
            conditions_index_path=None,
            default_predictions_path=None,
            reverse_model_weights=None,
            temperatures=None,
            prediction_mode=PREDICTION_MODE_FOR_TESTS):
    if not model_path:
        model_path = get_model_full_path()
    if not tokens_index_path:
        tokens_index_path = get_index_to_token_path(BASE_CORPUS_NAME)
    if not conditions_index_path:
        conditions_index_path = get_index_to_condition_path(BASE_CORPUS_NAME)
    if not default_predictions_path:
        default_predictions_path = os.path.join(DATA_DIR, 'results', 'predictions_' + get_model_full_params_str())

    # Construct a list of parameter values for all possible combinations of the passed parameters
    prediction_params = [dict()]
    if reverse_model_weights:
        prediction_params = [
            dict(params, mmi_reverse_model_score_weight=w)
            for params in prediction_params
            for w in reverse_model_weights
        ]
    if temperatures:
        prediction_params = [dict(params, temperature=t) for params in prediction_params for t in temperatures]

    # Get path for each combination of parameters
    predictions_paths = []
    # Add a suffix to the filename only for parameters that have a specific value passed as an argument
    # If no parameters were specified, no suffix is added
    if len(prediction_params) > 1:
        for cur_params in prediction_params:
            cur_path = '{base_path}_{params_str}.tsv'.format(
                base_path=default_predictions_path,
                params_str='_'.join(['{}_{}'.format(k, v) for k, v in cur_params.items()]))
            predictions_paths.append(cur_path)
    else:
        predictions_paths = [default_predictions_path + '.tsv']

    if not is_non_empty_file(model_path):
        _logger.warning('Couldn\'t find model:\n"{}". \nExiting...'.format(model_path))
        return

    if not is_non_empty_file(tokens_index_path):
        _logger.warning('Couldn\'t find tokens_index file:\n"{}". \nExiting...'.format(tokens_index_path))
        return

    _logger.info('Model for prediction:\n{}'.format(model_path))
    _logger.info('Tokens index:\n{}'.format(tokens_index_path))
    _logger.info('File with questions:\n{}'.format(QUESTIONS_CORPUS_NAME))
    _logger.info('Files to dump responses:\n{}'.format('\n'.join(predictions_paths)))
    _logger.info('Prediction parameters:\n{}'.format('\n'.join([str(x) for x in prediction_params])))

    index_to_token = load_index_to_item(tokens_index_path)
    index_to_condition = load_index_to_item(conditions_index_path)

    processed_test_set = get_tokenized_test_lines(QUESTIONS_CORPUS_NAME, set(index_to_token.values()))
    processed_test_set = list(processed_test_set)

    nn_model, _ = get_nn_model(index_to_token, index_to_condition, nn_model_path=model_path)

    for cur_params, cur_path in zip(prediction_params, predictions_paths):
        _logger.info('Predicting with the following params: {}'.format(cur_params))
        _save_test_results(processed_test_set, cur_path, nn_model, prediction_mode, **cur_params)