Example No. 1
    def test_one_step_decoder(self):
        nn_model = get_trained_model()

        _EPS = 1e-6
        batch_size = 1
        context_size = 3
        input_seq_len = 10
        output_seq_len = 9

        x = np.random.randint(0,
                              nn_model.vocab_size,
                              size=(batch_size, context_size, input_seq_len),
                              dtype=np.int32)
        y = np.random.randint(0,
                              nn_model.vocab_size,
                              size=(batch_size, output_seq_len),
                              dtype=np.int32)

        ground_truth_log_probabilities = get_sequence_log_probs(
            nn_model, x, y, condition_ids=None)
        one_step_log_probabilities = self._predict_log_probabilities_one_step(
            nn_model, x, y)
        mae = np.abs(one_step_log_probabilities -
                     ground_truth_log_probabilities).mean()

        self.assertTrue(mae < _EPS)
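This test validates one-step (incremental) decoding against full-sequence scoring by bounding the mean absolute error of the predicted log-probabilities. As a side note, the final assertion could also be written with numpy's testing helper, which reports the mismatching elements on failure (a sketch using the same arrays and tolerance):

    np.testing.assert_allclose(one_step_log_probabilities,
                               ground_truth_log_probabilities,
                               atol=_EPS)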
Example No. 2
    def test_one_step_decoder(self):
        nn_model = get_trained_model()

        _EPS = 1e-5
        batch_size = 1
        # Input batch shapes must correspond to the shapes of the trained model's layers
        context_size = INPUT_CONTEXT_SIZE
        input_seq_len = INPUT_SEQUENCE_LENGTH
        output_seq_len = OUTPUT_SEQUENCE_LENGTH

        x = np.random.randint(0,
                              nn_model.vocab_size,
                              size=(batch_size, context_size, input_seq_len),
                              dtype=INTX)
        y = np.random.randint(0,
                              nn_model.vocab_size,
                              size=(batch_size, output_seq_len),
                              dtype=INTX)

        ground_truth_log_probabilities = get_sequence_log_probs(
            nn_model, x, y, condition_ids=None)
        one_step_log_probabilities = self._predict_log_probabilities_one_step(
            nn_model, x, y)
        mae = np.abs(one_step_log_probabilities -
                     ground_truth_log_probabilities).mean()

        self.assertTrue(mae < _EPS)
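Example No. 2 is the same test with the shape constants taken from the project's configuration instead of being hard-coded. The names suggest an import along these lines (INPUT_CONTEXT_SIZE and INPUT_SEQUENCE_LENGTH appear in cakechat.config imports elsewhere in this listing; OUTPUT_SEQUENCE_LENGTH and INTX are assumed to live beside them):

    from cakechat.config import INPUT_CONTEXT_SIZE, INPUT_SEQUENCE_LENGTH, \
        OUTPUT_SEQUENCE_LENGTH, INTX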
Example No. 3
def _get_reverse_model():
    if not hasattr(_get_reverse_model, 'reverse_model'):
        try:
            reverse_model = get_trained_model(reverse=True)
        except Exception:
            raise ValueError('Can\'t get reverse nn model for prediction. '
                             'Try to run \'python tools/train.py --reverse\' or switch prediction mode to sampling.')
        _get_reverse_model.reverse_model = reverse_model
    return _get_reverse_model.reverse_model
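_get_reverse_model caches the loaded model as an attribute on the function object, so the expensive load happens only once per process. A minimal sketch of the same memoization with functools.lru_cache (assuming Python 3.2+; not how the repo itself does it):

    from functools import lru_cache

    @lru_cache(maxsize=1)
    def _get_reverse_model():
        # Loaded on the first call; every later call returns the cached model.
        return get_trained_model(reverse=True)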
Example No. 4
def evaluate_distinctness(args):
    if args.sample_size > 1 and PREDICTION_MODE_FOR_TESTS == PREDICTION_MODES.beamsearch:
        _logger.warning(
            'Using sample_size > 1 is meaningless with prediction_mode=\'beamsearch\' because there\'s no '
            'randomness in the prediction. Use sample_size=1 instead.')

    nn_model = get_trained_model()

    if args.validation_only:
        validation = load_questions_set(nn_model.token_to_index)
        validation_set_name = 'context-free questions'
    else:
        eval_datasets = load_datasets(nn_model.token_to_index,
                                      nn_model.condition_to_index)
        validation = eval_datasets.cf_validation
        cs_test = eval_datasets.cs_test
        cs_test_one_condition = eval_datasets.cs_test_one_condition

        validation_set_name = 'validation set without conditions'

        _logger.info(
            'Evaluating distinctness for context-sensitive test set without conditions'
        )
        log_distinct_metrics(nn_model, cs_test.x, samples_num=args.sample_size)

        _logger.info(
            'Evaluating distinctness for context-sensitive test set with conditions'
        )
        log_distinct_metrics(nn_model,
                             cs_test.x,
                             cs_test.condition_ids,
                             samples_num=args.sample_size)

        _logger.info(
            'Evaluating distinctness for defined-conditions-subset without conditions'
        )
        log_distinct_metrics(nn_model,
                             cs_test_one_condition.x,
                             samples_num=args.sample_size)

        _logger.info(
            'Evaluating distinctness for defined-conditions-subset with conditions'
        )
        log_distinct_metrics(nn_model,
                             cs_test_one_condition.x,
                             cs_test_one_condition.condition_ids,
                             samples_num=args.sample_size)

    _logger.info('Evaluating distinctness for {}'.format(validation_set_name))
    log_distinct_metrics(nn_model, validation.x, samples_num=args.sample_size)
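evaluate_distinctness only reads args.sample_size and args.validation_only, so a hypothetical argparse wiring for it (argument names inferred from the attribute accesses above, defaults illustrative) could look like:

    import argparse

    def parse_args():
        argparser = argparse.ArgumentParser()
        argparser.add_argument('-s', '--sample_size', type=int, default=1,
                               help='Number of responses to sample per context')
        argparser.add_argument('--validation_only', action='store_true',
                               help='Evaluate on the context-free questions set only')
        return argparser.parse_args()

    if __name__ == '__main__':
        evaluate_distinctness(parse_args())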
Example No. 6
def predictor_factory(nn_model, mode, config):
    """

    :param nn_model: Model used for predicting
    :param mode: Prediction mode: 'sampling', 'sampling-reranking' or 'candidates'
    :param config: All additional prediction parameters. See PredictionConfig for the details.
    :return: BasePredictor descendant with predict_response() method implemented.
    """
    if mode not in PREDICTION_MODES:
        raise ValueError(
            'Unknown prediction mode {}. Use one of the following: {}.'.format(
                mode, list(PREDICTION_MODES)))

    if mode in [
            PREDICTION_MODES.beamsearch, PREDICTION_MODES.beamsearch_reranking
    ]:
        candidates_generator = BeamsearchCandidatesGenerator(
            nn_model, config['beam_size'],
            config['repetition_penalization_coefficient'])
    else:
        candidates_generator = SamplingCandidatesGenerator(
            nn_model, config['temperature'], config['samples_num'],
            config['repetition_penalization_coefficient'])

    if mode in [
            PREDICTION_MODES.beamsearch_reranking,
            PREDICTION_MODES.sampling_reranking
    ]:
        if config['mmi_reverse_model_score_weight'] <= 0:
            raise ValueError(
                'mmi_reverse_model_score_weight should be > 0 for reranking mode'
            )

        reverse_model = get_trained_model(reverse=True)
        reranker = MMIReranker(nn_model, reverse_model,
                               config['mmi_reverse_model_score_weight'],
                               config['repetition_penalization_coefficient'])
    else:
        reranker = DummyReranker()

    return Predictor(nn_model, candidates_generator, reranker)
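A hedged usage sketch for predictor_factory in sampling mode; the config keys are exactly those read above, and the values here are illustrative rather than the project's defaults:

    nn_model = get_trained_model()
    config = {
        'temperature': 0.5,
        'samples_num': 5,
        'repetition_penalization_coefficient': 1.0,
    }
    predictor = predictor_factory(nn_model, PREDICTION_MODES.sampling, config)
    # predictor.predict_response(...) can then be called on tokenized contexts.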
Example No. 7
    argparser.add_argument('-t',
                           '--text',
                           action='store',
                           help='Context message that is fed to the model',
                           default=None)
    argparser.add_argument('-c',
                           '--condition',
                           action='store',
                           help='Condition',
                           default=DEFAULT_CONDITION)

    return argparser.parse_args()


if __name__ == '__main__':
    args = parse_args()
    nn_model = get_trained_model()

    if args.text:
        tokenized_lines = process_text(nn_model, args.text.decode('utf8'))
    else:
        tokenized_lines = load_corpus(nn_model, args.data)

    contexts_token_ids = transform_lines_to_contexts_token_ids(
        tokenized_lines, nn_model)

    print_predictions(nn_model,
                      contexts_token_ids,
                      args.condition,
                      prediction_mode=args.prediction_mode)
Example No. 8
from cakechat.utils.w2v.model import get_w2v_model

_logger = get_tools_logger(__file__)


def parse_args():
    argparser = argparse.ArgumentParser()
    argparser.add_argument(
        '-m',
        '--model',
        action='store',
        choices=['default', 'reverse', 'w2v', 'all'],
        help='Fetch models from s3 to disk',
        default='all')
    args = argparser.parse_args()

    return args


if __name__ == '__main__':
    args = parse_args()

    if args.model in {'default', 'all'}:
        get_trained_model(fetch_from_s3=True)

    if args.model in {'reverse', 'all'}:
        get_trained_model(fetch_from_s3=True, is_reverse_model=True)

    if args.model in {'w2v', 'all'}:
        get_w2v_model(fetch_from_s3=True)
Example No. 9
import random

from cakechat.api.config import PREDICTION_MODE, NUM_BEST_CANDIDATES_TO_PICK_FROM, SAMPLING_ATTEMPTS_NUM, \
    DEFAULT_RESPONSE
from cakechat.config import INPUT_CONTEXT_SIZE, INPUT_SEQUENCE_LENGTH, PREDICTION_MODES
from cakechat.dialog_model.factory import get_trained_model
from cakechat.dialog_model.inference import get_nn_responses, warmup_predictor
from cakechat.dialog_model.model_utils import transform_contexts_to_token_ids, transform_conditions_to_ids
from cakechat.utils.offense_detector.config import OFFENSIVE_PHRASES_PATH
from cakechat.utils.offense_detector import OffenseDetector
from cakechat.utils.text_processing import get_tokens_sequence, get_pretty_str_from_tokens_sequence

_offense_detector = OffenseDetector(OFFENSIVE_PHRASES_PATH)
_cakechat_model = get_trained_model(fetch_from_s3=False)
warmup_predictor(_cakechat_model, PREDICTION_MODE)


def _get_non_offensive_response_using_fast_sampling(context_tokens_ids,
                                                    condition_id):
    for _ in xrange(SAMPLING_ATTEMPTS_NUM):
        response = get_nn_responses(context_tokens_ids,
                                    _cakechat_model,
                                    PREDICTION_MODES.sampling,
                                    condition_ids=condition_id)[0][0]

        tokenized_response = get_tokens_sequence(response)
        if not _offense_detector.has_offensive_ngrams(tokenized_response):
            return get_pretty_str_from_tokens_sequence(tokenized_response)

    return DEFAULT_RESPONSE
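_get_non_offensive_response_using_fast_sampling is a retry-with-filter loop: sample up to SAMPLING_ATTEMPTS_NUM candidates, return the first inoffensive one, and fall back to DEFAULT_RESPONSE. The pattern in isolation, with stand-in (hypothetical) callables:

    def first_acceptable(generate, is_acceptable, attempts, default):
        # Try up to `attempts` candidates and return the first acceptable one.
        for _ in range(attempts):
            candidate = generate()
            if is_acceptable(candidate):
                return candidate
        return default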
Example No. 10
            continue

        responses_ground_truth = transform_token_ids_to_sentences(responses_token_ids_ground_truth,
                                                                  nn_model.index_to_token)
        responses = predict_for_condition_id(nn_model, questions.x, condition_id)

        lex_sim_conditioned_vs_non_conditioned = calculate_lexical_similarity(responses, responses_baseline,
                                                                              tfidf_vectorizer)
        lex_sim_conditioned_vs_groundtruth = calculate_lexical_similarity(responses, responses_ground_truth,
                                                                          tfidf_vectorizer)

        yield condition, (lex_sim_conditioned_vs_non_conditioned, lex_sim_conditioned_vs_groundtruth)


if __name__ == '__main__':
    nn_model = get_trained_model()
    train, questions, validation, train_subset, conditioned_subset = load_datasets(nn_model.token_to_index,
                                                                                   nn_model.condition_to_index)
    tfidf_vectorizer = get_tfidf_vectorizer()

    for metric, perplexity in calc_perplexity_metrics(nn_model, train_subset, conditioned_subset,
                                                      validation).iteritems():
        _logger.info('Metric: {}, perplexity: {}'.format(metric, perplexity))

    for condition, (ppl_non_conditioned, ppl_conditioned) in calc_perplexity_by_condition_metrics(nn_model, train):
        _logger.info('Condition: {}, non-conditioned perplexity: {}, conditioned perplexity: {}'.format(
            condition, ppl_non_conditioned, ppl_conditioned))

    for condition, (lex_sim_conditioned_vs_non_conditioned, lex_sim_conditioned_vs_groundtruth) in \
            calc_lexical_similarity_metrics(nn_model, train, questions, tfidf_vectorizer):
        _logger.info('Condition: {}, conditioned vs non-conditioned lexical similarity: {}'.format(
            condition, lex_sim_conditioned_vs_non_conditioned))
Example No. 11
#!/usr/bin/env python
"""
Gets the trained model and warms it up (i.e. compiles and dumps the corresponding prediction functions)
"""

import os
import sys

sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from cakechat.utils.env import init_theano_env

init_theano_env()

from cakechat.dialog_model.factory import get_trained_model
from cakechat.utils.logger import get_tools_logger

_logger = get_tools_logger(__file__)

if __name__ == '__main__':
    _logger.info('Fetching and pre-compiling pre-trained model...')
    get_trained_model(fetch_from_s3=True)
    _logger.info('Successfully resolved and compiled model.')
    _logger.info('Fetching and pre-compiling additional reverse-model for MMI reranking...')
    get_trained_model(fetch_from_s3=True, reverse=True)
    _logger.info('Successfully resolved and compiled reverse-model.')
Example No. 12
import random

from cakechat.api.config import PREDICTION_MODE, NUM_BEST_CANDIDATES_TO_PICK_FROM, SAMPLING_ATTEMPTS_NUM, \
    DEFAULT_RESPONSE
from cakechat.config import INPUT_CONTEXT_SIZE, INPUT_SEQUENCE_LENGTH, PREDICTION_MODES
from cakechat.dialog_model.factory import get_trained_model, get_reverse_model
from cakechat.dialog_model.inference import get_nn_responses, warmup_predictor
from cakechat.dialog_model.model_utils import transform_contexts_to_token_ids, transform_conditions_to_ids
from cakechat.utils.offense_detector import OffenseDetector
from cakechat.utils.offense_detector.config import OFFENSIVE_PHRASES_PATH
from cakechat.utils.text_processing import get_tokens_sequence, prettify_response

_offense_detector = OffenseDetector(OFFENSIVE_PHRASES_PATH)
_cakechat_model = get_trained_model(
    reverse_model=get_reverse_model(PREDICTION_MODE))
warmup_predictor(_cakechat_model, PREDICTION_MODE)


def _is_appropriate_response(response):
    return response != '' and not _offense_detector.has_offensive_ngrams(
        response)


def _get_non_offensive_response_using_fast_sampling(context_tokens_ids,
                                                    condition_id):
    for _ in range(SAMPLING_ATTEMPTS_NUM):
        response = get_nn_responses(context_tokens_ids,
                                    _cakechat_model,
                                    PREDICTION_MODES.sampling,
                                    condition_ids=condition_id)[0][0]

        if _is_appropriate_response(response):
            return prettify_response(response)

    return DEFAULT_RESPONSE