Beispiel #1
0
def parse_args():
    """Parse the command-line options of the script.

    The only option is the boolean flag ``-r`` / ``--reverse``; its help text
    embeds the path returned by ``get_model_full_path(is_reverse_model=True)``.

    :return: the namespace produced by ``ArgumentParser.parse_args()``
    """
    parser = argparse.ArgumentParser()

    # Adjacent literals are concatenated first, so format() fills the trailing placeholder.
    reverse_help = ('Pass this flag if you want to train reverse model. '
                    'The model will be stored at {}'.format(get_model_full_path(is_reverse_model=True)))
    parser.add_argument('-r', '--reverse', action='store_true', help=reverse_help)

    return parser.parse_args()
Beispiel #2
0
def _log_sample_answers(x_test, nn_model, mode, is_reverse_model):
    """Log candidate responses of the model for every sample in x_test.

    :param x_test: batch of context token ids; its ``shape[0]`` is reported as the number of samples
    :param nn_model: model passed to get_nn_responses; its ``index_to_token`` is used to decode contexts
    :param mode: prediction mode forwarded to get_nn_responses
    :param is_reverse_model: only used to report the model path in the log
    """
    _logger.info('Model: {}'.format(get_model_full_path(is_reverse_model)))

    start_message = 'Start predicting responses of length {out_len} for {n_samples} samples with mode {mode}'.format(
        out_len=MAX_PREDICTIONS_LENGTH, n_samples=x_test.shape[0], mode=mode)
    _logger.info(start_message)

    contexts = transform_context_token_ids_to_sentences(x_test, nn_model.index_to_token)
    candidates = get_nn_responses(x_test, nn_model, mode, output_candidates_num=LOG_CANDIDATES_NUM)
    _logger.info('Finished predicting! Logging...')

    # x_test stays in the zip so iteration stops at the shorter of the two sequences.
    for sample_idx, (_, context) in enumerate(zip(x_test, contexts)):
        laconic_logger.info('')  # blank line separates the samples in the log
        for rank, candidate in enumerate(candidates[sample_idx], 1):
            laconic_logger.info('%-35s\t --#=%02d--> \t%s' % (context, rank, candidate))
def load_model():
    """Load the saved model from the default path using the base-corpus indices.

    :return: the model object returned by get_nn_model
    :raises FileNotFoundException: when get_nn_model reports that no saved model exists
    """
    token_index_file = get_index_to_token_path(BASE_CORPUS_NAME)
    condition_index_file = get_index_to_condition_path(BASE_CORPUS_NAME)
    weights_path = get_model_full_path()

    tokens_by_index = load_index_to_item(token_index_file)
    conditions_by_index = load_index_to_item(condition_index_file)

    model, found = get_nn_model(tokens_by_index, conditions_by_index, nn_model_path=weights_path)
    if found:
        return model

    raise FileNotFoundException('Couldn\'t find model:\n"{}". \nExiting...'.format(weights_path))
Beispiel #4
0
def _update_saved_nn_model(nn_model, cur_perplexities, best_perplexities, is_reverse_model=False):
    """Save the model and keep track of the best perplexities seen so far.

    When every current perplexity is strictly lower than its best counterpart,
    the model is saved under a path suffixed with the new perplexities and the
    path suffixed with the previous best ones is passed to _delete_model
    (unless both suffixes format to the same string). Otherwise the model is
    saved under the plain model path.

    :param nn_model: model handed to _save_model
    :param cur_perplexities: pair (context-free, context-sensitive) of current perplexities
    :param best_perplexities: pair of the best perplexities so far
    :param is_reverse_model: forwarded to get_model_full_path
    :return: cur_perplexities if they improved on all positions, else best_perplexities
    """
    base_path = get_model_full_path(is_reverse_model)
    improved_everywhere = all(cur < best for cur, best in zip(cur_perplexities, best_perplexities))

    if not improved_everywhere:
        _save_model(nn_model, base_path)
        return best_perplexities

    suffix_template = '_pp_free{0:.2f}_sensitive{1:.2f}'
    previous_suffix = suffix_template.format(*best_perplexities)
    current_suffix = suffix_template.format(*cur_perplexities)

    _save_model(nn_model, base_path + current_suffix)
    # Equal suffixes mean the new file replaced the old one, so there is nothing to delete.
    if current_suffix != previous_suffix:
        _delete_model(base_path + previous_suffix)

    return cur_perplexities
def load_model(model_path=None, tokens_index_path=None, conditions_index_path=None):
    """Load a saved model, using default locations for every path left as None.

    :param model_path: path passed to get_nn_model; defaults to get_model_full_path()
    :param tokens_index_path: defaults to get_index_to_token_path(BASE_CORPUS_NAME)
    :param conditions_index_path: defaults to get_index_to_condition_path(BASE_CORPUS_NAME)
    :return: the model object returned by get_nn_model
    :raises ValueError: when get_nn_model reports that no saved model exists
    """
    model_path = get_model_full_path() if model_path is None else model_path
    tokens_index_path = (get_index_to_token_path(BASE_CORPUS_NAME)
                         if tokens_index_path is None else tokens_index_path)
    conditions_index_path = (get_index_to_condition_path(BASE_CORPUS_NAME)
                             if conditions_index_path is None else conditions_index_path)

    tokens_by_index = load_index_to_item(tokens_index_path)
    conditions_by_index = load_index_to_item(conditions_index_path)

    model, found = get_nn_model(tokens_by_index, conditions_by_index, nn_model_path=model_path)
    if found:
        return model

    raise ValueError('Couldn\'t find model: "{}".'.format(model_path))
def load_model():
    """Return the model stored at the default model path.

    Both index files are looked up for BASE_CORPUS_NAME.

    :return: the model object returned by get_nn_model
    :raises FileNotFoundException: when get_nn_model reports that no saved model exists
    """
    tokens_file = get_index_to_token_path(BASE_CORPUS_NAME)
    conditions_file = get_index_to_condition_path(BASE_CORPUS_NAME)

    saved_model_path = get_model_full_path()
    id_to_token = load_index_to_item(tokens_file)
    id_to_condition = load_index_to_item(conditions_file)

    loaded = get_nn_model(id_to_token, id_to_condition, nn_model_path=saved_model_path)
    model, is_present = loaded

    if is_present:
        return model

    message = 'Couldn\'t find model:\n"{}". \nExiting...'.format(saved_model_path)
    raise FileNotFoundException(message)
Beispiel #7
0
def get_nn_model(index_to_token,
                 index_to_condition,
                 w2v_matrix=None,
                 resolver_factory=None,
                 nn_model_path=None,
                 is_reverse_model=False):
    """Build a CakeChatModel and load saved weights into it when they can be resolved.

    :param index_to_token: token index; its length is logged as the output dimension
    :param index_to_condition: condition index passed to the model constructor
    :param w2v_matrix: optional initial embedding matrix, cast to theano.config.floatX
    :param resolver_factory: optional callable building a resolver from the model path;
        DummyFileResolver is used when it is not given
    :param nn_model_path: path of the saved weights; defaults to
        get_model_full_path(is_reverse_model) when empty
    :param is_reverse_model: only used to pick the default model path
    :return: tuple (model, model_exists), where model_exists is the result of resolver.resolve()
    """
    _logger.info('Initializing NN model with the following params:')
    input_dimension = WORD_EMBEDDING_DIMENSION + CONDITION_EMBEDDING_DIMENSION
    _logger.info('NN input dimension: {} (token vector size)'.format(input_dimension))
    _logger.info('NN hidden dimension: {}'.format(HIDDEN_LAYER_DIMENSION))
    dict_size = len(index_to_token)
    _logger.info('NN output dimension: {} (dict size)'.format(dict_size))

    if w2v_matrix is not None:
        w2v_matrix = w2v_matrix.astype(theano.config.floatX)

    model = CakeChatModel(index_to_token, index_to_condition, init_embedding=w2v_matrix)

    if not nn_model_path:
        nn_model_path = get_model_full_path(is_reverse_model)

    if resolver_factory:
        resolver = resolver_factory(nn_model_path)
    else:
        resolver = DummyFileResolver(nn_model_path)
    model_exists = resolver.resolve()

    if not model_exists:
        _logger.info("Can't find previously calculated model, so will use a fresh one")
    else:
        _logger.info('Loading previously calculated weights from {}...'.format(nn_model_path))
        model.load_weights(nn_model_path)

    _logger.info('Model is built\n')
    model.print_layer_shapes()
    model.print_matrices_weights()

    _logger.info('Model path is {}'.format(nn_model_path))

    return model, model_exists
Beispiel #8
0
def train(is_reverse_model=False):
    """Prepare the data and the model, then run training.

    Resolves the corpus, index and model paths, checks the saved files, loads
    the token/condition indices, builds the w2v embedding matrix for the train
    corpus and finally calls train_model.

    :param is_reverse_model: selects the model path (get_model_full_path) and is
        forwarded to train_model
    """
    processed_train_corpus_path = get_processed_corpus_path(TRAIN_CORPUS_NAME)
    processed_val_corpus_path = get_processed_corpus_path(CONTEXT_SENSITIVE_VAL_CORPUS_NAME)
    index_to_token_path = get_index_to_token_path(BASE_CORPUS_NAME)
    index_to_condition_path = get_index_to_condition_path(BASE_CORPUS_NAME)

    model_path = get_model_full_path(is_reverse_model)

    # check the existence of all necessary files before compiling the model
    # NOTE(review): index_to_condition_path is loaded below but is not part of this check.
    _look_for_saved_files(files_paths=[processed_train_corpus_path, processed_val_corpus_path, index_to_token_path])
    _look_for_saved_model(model_path)

    index_to_token = load_index_to_item(index_to_token_path)
    index_to_condition = load_index_to_item(index_to_condition_path)

    w2v_matrix = _get_w2v_embedding_matrix_by_corpus_path(processed_train_corpus_path, index_to_token)

    # get nn_model and train it
    # Pass the resolved path explicitly: without it get_nn_model falls back to the
    # default (non-reverse) model path, so a reverse-model run would pick up the
    # forward model's weights instead of the ones checked above.
    nn_model, _ = get_nn_model(index_to_token, index_to_condition, w2v_matrix, nn_model_path=model_path)
    train_model(nn_model, is_reverse_model=is_reverse_model)
Beispiel #9
0
def load_model(model_path=None,
               tokens_index_path=None,
               conditions_index_path=None):
    """Load a saved model together with its token and condition indices.

    Any argument left as None is replaced by the default location:
    get_model_full_path() for the model and the BASE_CORPUS_NAME index paths
    for the two indices.

    :return: the model object returned by get_nn_model
    :raises ValueError: when get_nn_model reports that no saved model exists
    """
    if model_path is None:
        model_path = get_model_full_path()

    if tokens_index_path is None:
        tokens_index_path = get_index_to_token_path(BASE_CORPUS_NAME)

    if conditions_index_path is None:
        conditions_index_path = get_index_to_condition_path(BASE_CORPUS_NAME)

    id_to_token = load_index_to_item(tokens_index_path)
    id_to_condition = load_index_to_item(conditions_index_path)

    build_result = get_nn_model(id_to_token, id_to_condition, nn_model_path=model_path)
    model, is_present = build_result

    if is_present:
        return model

    error_text = 'Couldn\'t find model: "{}".'.format(model_path)
    raise ValueError(error_text)
Beispiel #10
0
def train_model(nn_model, is_reverse_model=False):
    """
    Main function for training. Refactoring anticipated.

    Loads the train and validation sets (reversing them when is_reverse_model is set),
    then trains nn_model batch by batch for EPOCHES_NUM epochs. Sample answers are logged
    every SCREEN_LOG_FREQUENCY_PER_BATCHES batches; every LOG_FREQUENCY_PER_BATCHES batches
    the metrics are calculated and saved and the saved model is updated. On KeyboardInterrupt
    or SystemExit the model and the latest validation results are saved with a '_final' suffix.

    :param nn_model: model to train; must provide token_to_index, condition_to_index and train()
    :param is_reverse_model: selects the validation prediction mode, the data reversal
        and the path the model is saved to
    """
    validation_prediction_mode = PREDICTION_MODES.sampling if is_reverse_model else PREDICTION_MODE_FOR_TESTS

    train = load_conditioned_train_set(nn_model.token_to_index,
                                       nn_model.condition_to_index)

    context_free_val = load_context_free_val(nn_model.token_to_index)

    context_sensitive_val = load_context_sensitive_val(
        nn_model.token_to_index, nn_model.condition_to_index)
    if is_reverse_model:
        service_tokens = ServiceTokensIDs(nn_model.token_to_index)
        train = reverse_nn_input(train, service_tokens)
        context_free_val = reverse_nn_input(context_free_val, service_tokens)
        context_sensitive_val = reverse_nn_input(context_sensitive_val,
                                                 service_tokens)

    # Train subset of same size as a context-free val for metrics calculation
    train_subset = generate_subset(train, VAL_SUBSET_SIZE)

    # Context-sensitive val subset of same size as a context-free val for metrics calculation
    context_sensitive_val_subset = generate_subset(context_sensitive_val,
                                                   VAL_SUBSET_SIZE)

    _logger.info('Finished preprocessing! Start training')

    batch_id = 0
    avg_loss = 0
    total_training_time = 0
    best_val_perplexities = (float('inf'), float('inf'))
    # Number of batches per epoch, rounded up; relies on integer division
    # (Python 2 semantics, consistent with the xrange call below).
    batches_num = (train.x.shape[0] - 1) / BATCH_SIZE + 1
    start_time = time.time()
    # Stays None until the first validation run completes.
    cur_val_metrics = None

    try:
        for epoches_counter in xrange(1, EPOCHES_NUM + 1):
            _logger.info(
                'Starting epoch #%d; time = %0.2f s(training of it = %0.2f s)'
                % (epoches_counter, time.time() - start_time,
                   total_training_time))

            for train_batch in get_training_batch(
                [train.x, train.y, train.condition_ids],
                    BATCH_SIZE,
                    random_permute=SHUFFLE_TRAINING_BATCHES):
                x_train_batch, y_train_batch, condition_ids_train_batch = train_batch

                # batch_id counts batches across all epochs; it is never reset.
                batch_id += 1
                prev_time = time.time()
                loss = nn_model.train(x_train_batch, y_train_batch,
                                      condition_ids_train_batch)

                cur_time = time.time()
                # total_training_time covers only nn_model.train calls;
                # total_time is wall-clock time since start, including logging and validation.
                total_training_time += cur_time - prev_time
                total_time = cur_time - start_time
                # Exponentially decayed running loss; the very first batch seeds it with the raw loss.
                avg_loss = LOG_LOSS_DECAY * avg_loss + (
                    1 - LOG_LOSS_DECAY) * loss if batch_id > 1 else loss

                # NOTE(review): batch_id is global while batches_num is per epoch,
                # so this percentage grows past 100 after the first epoch.
                progress = 100 * float(batch_id) / batches_num
                avr_time_per_sample = total_time / batch_id
                expected_time_per_epoch = avr_time_per_sample * batches_num

                # per-batch progress line: loss and timing statistics
                _logger.info('batch %s / %s (%d%%) \t'
                             'loss: %.2f \t '
                             'time: epoch %.1f h | '
                             'total %0.1f h | '
                             'train %0.1f h (%.1f%%)' %
                             (batch_id, batches_num, progress, avg_loss,
                              expected_time_per_epoch / 3600,
                              total_time / 3600, total_training_time / 3600,
                              100 * total_training_time / total_time))

                if batch_id % SCREEN_LOG_FREQUENCY_PER_BATCHES == 0:
                    _log_sample_answers(
                        context_free_val.x[:SCREEN_LOG_NUM_TEST_LINES],
                        nn_model, validation_prediction_mode, is_reverse_model)

                if batch_id % LOG_FREQUENCY_PER_BATCHES == 0:
                    _calc_and_save_train_metrics(nn_model, train_subset,
                                                 avg_loss)

                    val_metrics = _calc_and_save_val_metrics(
                        nn_model,
                        context_sensitive_val_subset,
                        context_free_val,
                        prediction_mode=validation_prediction_mode)
                    _save_val_results(
                        nn_model,
                        context_free_val.x,
                        context_sensitive_val_subset.x,
                        val_metrics,
                        train_info=(start_time, batch_id, batches_num),
                        prediction_mode=validation_prediction_mode)
                    cur_val_metrics = val_metrics

                    # Returns the (possibly updated) best perplexities for the next comparison.
                    best_val_perplexities = \
                        _update_saved_nn_model(nn_model,
                                               (val_metrics['context_free_perplexity'],
                                                val_metrics['context_sensitive_perplexity']),
                                               best_val_perplexities,
                                               is_reverse_model=is_reverse_model)

    except (KeyboardInterrupt, SystemExit):
        _logger.info('Training cycle is stopped manually')
        _save_model(nn_model, get_model_full_path(is_reverse_model) + '_final')
        # NOTE(review): cur_val_metrics is still None if training is interrupted
        # before the first validation run - confirm _save_val_results accepts that.
        _save_val_results(nn_model,
                          context_free_val.x,
                          context_sensitive_val_subset.x,
                          cur_val_metrics,
                          train_info=(start_time, batch_id, batches_num),
                          suffix='_final',
                          prediction_mode=validation_prediction_mode)
Beispiel #11
0
def predict(model_path=None,
            tokens_index_path=None,
            conditions_index_path=None,
            default_predictions_path=None,
            reverse_model_weights=None,
            temperatures=None,
            prediction_mode=PREDICTION_MODE_FOR_TESTS):
    """Predict responses for the questions corpus and save them, one file per parameter set.

    Every falsy path argument is replaced by its default location. A list of
    keyword-argument dicts is built as the product of reverse_model_weights and
    temperatures, and _save_test_results is called once per dict.

    :param model_path: saved model path; defaults to get_model_full_path()
    :param tokens_index_path: defaults to get_index_to_token_path(BASE_CORPUS_NAME)
    :param conditions_index_path: defaults to get_index_to_condition_path(BASE_CORPUS_NAME)
    :param default_predictions_path: output path without extension; defaults to
        DATA_DIR/results/predictions_<get_model_full_params_str()>
    :param reverse_model_weights: optional iterable of values passed on as
        mmi_reverse_model_score_weight
    :param temperatures: optional iterable of values passed on as temperature
    :param prediction_mode: forwarded to _save_test_results
    :return: None; returns early (after a warning) when the model file or the
        tokens index file is missing or empty
    """
    if not model_path:
        model_path = get_model_full_path()
    if not tokens_index_path:
        tokens_index_path = get_index_to_token_path(BASE_CORPUS_NAME)
    if not conditions_index_path:
        conditions_index_path = get_index_to_condition_path(BASE_CORPUS_NAME)
    if not default_predictions_path:
        default_predictions_path = os.path.join(
            DATA_DIR, 'results', 'predictions_' + get_model_full_params_str())

    # Construct list of parameters values for all possible combinations of passed parameters
    prediction_params = [dict()]
    if reverse_model_weights:
        prediction_params = [
            dict(params, mmi_reverse_model_score_weight=w)
            for params in prediction_params for w in reverse_model_weights
        ]
    if temperatures:
        prediction_params = [
            dict(params, temperature=t) for params in prediction_params
            for t in temperatures
        ]

    # Get path for each combination of parameters
    predictions_paths = []
    # A parameter suffix is added to the filename only when there is more than one
    # combination; a single combination (or none) is written to the plain default path.
    if len(prediction_params) > 1:
        for cur_params in prediction_params:
            cur_path = '{base_path}_{params_str}.tsv'.format(
                base_path=default_predictions_path,
                params_str='_'.join(
                    ['{}_{}'.format(k, v) for k, v in cur_params.items()]))
            predictions_paths.append(cur_path)
    else:
        predictions_paths = [default_predictions_path + '.tsv']

    # Logger.warn is a deprecated alias; Logger.warning is the supported spelling.
    if not is_non_empty_file(model_path):
        _logger.warning(
            'Couldn\'t find model:\n"{}". \nExiting...'.format(model_path))
        return

    if not is_non_empty_file(tokens_index_path):
        _logger.warning(
            'Couldn\'t find tokens_index file:\n"{}". \nExiting...'.format(
                tokens_index_path))
        return

    _logger.info('Model for prediction:\n{}'.format(model_path))
    _logger.info('Tokens index:\n{}'.format(tokens_index_path))
    _logger.info('File with questions:\n{}'.format(QUESTIONS_CORPUS_NAME))
    _logger.info('Files to dump responses:\n{}'.format(
        '\n'.join(predictions_paths)))
    _logger.info('Prediction parameters\n{}'.format('\n'.join(
        [str(x) for x in prediction_params])))

    index_to_token = load_index_to_item(tokens_index_path)
    index_to_condition = load_index_to_item(conditions_index_path)

    processed_test_set = get_tokenized_test_lines(QUESTIONS_CORPUS_NAME,
                                                  set(index_to_token.values()))
    # Materialized as a list because it is passed to _save_test_results once per parameter set.
    processed_test_set = list(processed_test_set)

    nn_model, _ = get_nn_model(index_to_token,
                               index_to_condition,
                               nn_model_path=model_path)

    for cur_params, cur_path in zip(prediction_params, predictions_paths):
        _logger.info(
            'Predicting with the following params: {}'.format(cur_params))
        _save_test_results(processed_test_set, cur_path, nn_model,
                           prediction_mode, **cur_params)
def predict(model_path=None,
            tokens_index_path=None,
            conditions_index_path=None,
            default_predictions_path=None,
            reverse_model_weights=None,
            temperatures=None,
            prediction_mode=PREDICTION_MODE_FOR_TESTS):
    """Generate responses for QUESTIONS_CORPUS_NAME and dump them to .tsv files.

    Falsy path arguments fall back to their default locations. One output file
    is produced per combination of reverse_model_weights and temperatures.

    :param model_path: saved model path; defaults to get_model_full_path()
    :param tokens_index_path: defaults to get_index_to_token_path(BASE_CORPUS_NAME)
    :param conditions_index_path: defaults to get_index_to_condition_path(BASE_CORPUS_NAME)
    :param default_predictions_path: output path without extension; defaults to
        DATA_DIR/results/predictions_<get_model_full_params_str()>
    :param reverse_model_weights: optional iterable of values passed on as
        mmi_reverse_model_score_weight
    :param temperatures: optional iterable of values passed on as temperature
    :param prediction_mode: forwarded to _save_test_results
    :return: None; returns early (after a warning) when the model file or the
        tokens index file is missing or empty
    """
    if not model_path:
        model_path = get_model_full_path()
    if not tokens_index_path:
        tokens_index_path = get_index_to_token_path(BASE_CORPUS_NAME)
    if not conditions_index_path:
        conditions_index_path = get_index_to_condition_path(BASE_CORPUS_NAME)
    if not default_predictions_path:
        default_predictions_path = os.path.join(DATA_DIR, 'results', 'predictions_' + get_model_full_params_str())

    # Construct list of parameters values for all possible combinations of passed parameters
    prediction_params = [dict()]
    if reverse_model_weights:
        prediction_params = [
            dict(params, mmi_reverse_model_score_weight=w)
            for params in prediction_params
            for w in reverse_model_weights
        ]
    if temperatures:
        prediction_params = [dict(params, temperature=t) for params in prediction_params for t in temperatures]

    # Get path for each combination of parameters
    predictions_paths = []
    # The filename gets a parameter suffix only when several combinations exist;
    # otherwise the plain default path is used.
    if len(prediction_params) > 1:
        for cur_params in prediction_params:
            cur_path = '{base_path}_{params_str}.tsv'.format(
                base_path=default_predictions_path,
                params_str='_'.join(['{}_{}'.format(k, v) for k, v in cur_params.items()]))
            predictions_paths.append(cur_path)
    else:
        predictions_paths = [default_predictions_path + '.tsv']

    # Use Logger.warning: Logger.warn is a deprecated alias of it.
    if not is_non_empty_file(model_path):
        _logger.warning('Couldn\'t find model:\n"{}". \nExiting...'.format(model_path))
        return

    if not is_non_empty_file(tokens_index_path):
        _logger.warning('Couldn\'t find tokens_index file:\n"{}". \nExiting...'.format(tokens_index_path))
        return

    _logger.info('Model for prediction:\n{}'.format(model_path))
    _logger.info('Tokens index:\n{}'.format(tokens_index_path))
    _logger.info('File with questions:\n{}'.format(QUESTIONS_CORPUS_NAME))
    _logger.info('Files to dump responses:\n{}'.format('\n'.join(predictions_paths)))
    _logger.info('Prediction parameters\n{}'.format('\n'.join([str(x) for x in prediction_params])))

    index_to_token = load_index_to_item(tokens_index_path)
    index_to_condition = load_index_to_item(conditions_index_path)

    processed_test_set = get_tokenized_test_lines(QUESTIONS_CORPUS_NAME, set(index_to_token.values()))
    # Turned into a list so it can be reused for every parameter combination below.
    processed_test_set = list(processed_test_set)

    nn_model, _ = get_nn_model(index_to_token, index_to_condition, nn_model_path=model_path)

    for cur_params, cur_path in zip(prediction_params, predictions_paths):
        _logger.info('Predicting with the following params: {}'.format(cur_params))
        _save_test_results(processed_test_set, cur_path, nn_model, prediction_mode, **cur_params)
Beispiel #13
0
def train_model(nn_model, is_reverse_model=False):
    """
    Main function for training. Refactoring anticipated.

    Loads the train and validation sets (reversing them when is_reverse_model is set) and
    trains nn_model batch by batch for EPOCHES_NUM epochs. Sample answers are logged every
    SCREEN_LOG_FREQUENCY_PER_BATCHES batches; every LOG_FREQUENCY_PER_BATCHES batches the
    metrics are calculated and saved and the saved model is updated. When interrupted by
    KeyboardInterrupt or SystemExit, the model and the latest validation results are saved
    with a '_final' suffix.

    :param nn_model: model to train; must provide token_to_index, condition_to_index and train()
    :param is_reverse_model: selects the validation prediction mode, the data reversal
        and the path the model is saved to
    """
    validation_prediction_mode = PREDICTION_MODES.sampling if is_reverse_model else PREDICTION_MODE_FOR_TESTS

    train = load_conditioned_train_set(nn_model.token_to_index, nn_model.condition_to_index)

    context_free_val = load_context_free_val(nn_model.token_to_index)

    context_sensitive_val = load_context_sensitive_val(nn_model.token_to_index, nn_model.condition_to_index)
    if is_reverse_model:
        service_tokens = ServiceTokensIDs(nn_model.token_to_index)
        train = reverse_nn_input(train, service_tokens)
        context_free_val = reverse_nn_input(context_free_val, service_tokens)
        context_sensitive_val = reverse_nn_input(context_sensitive_val, service_tokens)

    # Train subset of same size as a context-free val for metrics calculation
    train_subset = generate_subset(train, VAL_SUBSET_SIZE)

    # Context-sensitive val subset of same size as a context-free val for metrics calculation
    context_sensitive_val_subset = generate_subset(context_sensitive_val, VAL_SUBSET_SIZE)

    _logger.info('Finished preprocessing! Start training')

    batch_id = 0
    avg_loss = 0
    total_training_time = 0
    best_val_perplexities = (float('inf'), float('inf'))
    # Batches per epoch, rounded up; relies on integer division
    # (Python 2 semantics, consistent with the xrange call below).
    batches_num = (train.x.shape[0] - 1) / BATCH_SIZE + 1
    start_time = time.time()
    # Remains None until the first validation run has finished.
    cur_val_metrics = None

    try:
        for epoches_counter in xrange(1, EPOCHES_NUM + 1):
            _logger.info('Starting epoch #%d; time = %0.2f s(training of it = %0.2f s)' %
                         (epoches_counter, time.time() - start_time, total_training_time))

            for train_batch in get_training_batch(
                [train.x, train.y, train.condition_ids], BATCH_SIZE, random_permute=SHUFFLE_TRAINING_BATCHES):
                x_train_batch, y_train_batch, condition_ids_train_batch = train_batch

                # batch_id keeps counting across epochs; it is never reset.
                batch_id += 1
                prev_time = time.time()
                loss = nn_model.train(x_train_batch, y_train_batch, condition_ids_train_batch)

                cur_time = time.time()
                # total_training_time sums only the nn_model.train calls;
                # total_time is wall-clock time since start, including logging and validation.
                total_training_time += cur_time - prev_time
                total_time = cur_time - start_time
                # Exponentially decayed running loss; the first batch seeds it with the raw loss.
                avg_loss = LOG_LOSS_DECAY * avg_loss + (1 - LOG_LOSS_DECAY) * loss if batch_id > 1 else loss

                # NOTE(review): batch_id is global while batches_num is per epoch,
                # so this percentage exceeds 100 after the first epoch.
                progress = 100 * float(batch_id) / batches_num
                avr_time_per_sample = total_time / batch_id
                expected_time_per_epoch = avr_time_per_sample * batches_num

                # per-batch progress line: loss and timing statistics
                _logger.info('batch %s / %s (%d%%) \t'
                             'loss: %.2f \t '
                             'time: epoch %.1f h | '
                             'total %0.1f h | '
                             'train %0.1f h (%.1f%%)' %
                             (batch_id, batches_num, progress, avg_loss, expected_time_per_epoch / 3600,
                              total_time / 3600, total_training_time / 3600, 100 * total_training_time / total_time))

                if batch_id % SCREEN_LOG_FREQUENCY_PER_BATCHES == 0:
                    _log_sample_answers(context_free_val.x[:SCREEN_LOG_NUM_TEST_LINES], nn_model,
                                        validation_prediction_mode, is_reverse_model)

                if batch_id % LOG_FREQUENCY_PER_BATCHES == 0:
                    _calc_and_save_train_metrics(nn_model, train_subset, avg_loss)

                    val_metrics = _calc_and_save_val_metrics(
                        nn_model,
                        context_sensitive_val_subset,
                        context_free_val,
                        prediction_mode=validation_prediction_mode)
                    _save_val_results(
                        nn_model,
                        context_free_val.x,
                        context_sensitive_val_subset.x,
                        val_metrics,
                        train_info=(start_time, batch_id, batches_num),
                        prediction_mode=validation_prediction_mode)
                    cur_val_metrics = val_metrics

                    # The returned best perplexities are used for the next comparison.
                    best_val_perplexities = \
                        _update_saved_nn_model(nn_model,
                                               (val_metrics['context_free_perplexity'],
                                                val_metrics['context_sensitive_perplexity']),
                                               best_val_perplexities,
                                               is_reverse_model=is_reverse_model)

    except (KeyboardInterrupt, SystemExit):
        _logger.info('Training cycle is stopped manually')
        _save_model(nn_model, get_model_full_path(is_reverse_model) + '_final')
        # NOTE(review): cur_val_metrics is still None when the interruption happens
        # before the first validation run - confirm _save_val_results accepts that.
        _save_val_results(
            nn_model,
            context_free_val.x,
            context_sensitive_val_subset.x,
            cur_val_metrics,
            train_info=(start_time, batch_id, batches_num),
            suffix='_final',
            prediction_mode=validation_prediction_mode)