def load_datasets(token_to_index,
                  condition_to_index,
                  test_corpus_name=CONTEXT_SENSITIVE_TEST_CORPUS_NAME):
    # load context_sensitive_test dataset
    cs_test = load_conditioned_dataset(test_corpus_name, token_to_index,
                                       condition_to_index)
    # load context_free_validation dataset
    cf_validation = load_context_free_val(token_to_index)

    # select the context-sensitive test samples that carry a non-default condition
    condition_mask = cs_test.condition_ids != condition_to_index[
        DEFAULT_CONDITION]
    conditioned_test = Dataset(
        x=cs_test.x[condition_mask],
        y=cs_test.y[condition_mask],
        condition_ids=cs_test.condition_ids[condition_mask])

    # get a subset of conditioned_test of the same size as cf_validation;
    # if conditioned_test has fewer samples than that, use all of its available samples
    cs_test_one_condition = \
        generate_subset(conditioned_test, subset_size=min(cf_validation.x.shape[0], conditioned_test.x.shape[0]))

    return create_namedtuple_instance(
        'EvalMetricsDatasets',
        cf_validation=cf_validation,
        cs_test=cs_test,
        cs_test_one_condition=cs_test_one_condition)
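# --- Editor's note: a minimal usage sketch, not part of the original snippet. It assumes the
# quality corpora are present on disk and that `token_to_index` / `condition_to_index` were
# built by the project's own vocabulary loaders:
#
#   datasets = load_datasets(token_to_index, condition_to_index)
#   datasets.cf_validation           # context-free validation set
#   datasets.cs_test                 # full context-sensitive test set
#   datasets.cs_test_one_condition   # non-default-condition subset, capped at the validation-set size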
Example 2
TRAIN_WORD_EMBEDDINGS_LAYER = True  # Allow fine-tuning of the word embedding layer during the model training
W2V_MODEL_DIR = os.path.join(
    DATA_DIR, 'w2v_models')  # Path to store & load trained word2vec models
WORD_EMBEDDING_DIMENSION = 128  # word2vec embedding dimension
W2V_WINDOW_SIZE = 10  # word2vec window size, used during the w2v pre-training
USE_SKIP_GRAM = True  # Use skip-gram word2vec mode. When False, CBOW is used
MIN_WORD_FREQ = 1  # Minimum frequency of a word to be used in word2vec pre-calculation
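
# --- Editor's note: hedged illustration (not from the original source) of how the word2vec
# params above could be fed to gensim 3.x (which uses `size=`); cakechat's own trainer may differ.
from gensim.models import Word2Vec

def train_w2v_sketch(tokenized_lines):
    """tokenized_lines: iterable of token lists, e.g. [['hi', 'there'], ['bye']]."""
    return Word2Vec(
        sentences=tokenized_lines,
        size=WORD_EMBEDDING_DIMENSION,  # 128-dimensional vectors
        window=W2V_WINDOW_SIZE,         # 10-token context window
        sg=1 if USE_SKIP_GRAM else 0,   # skip-gram when True, CBOW otherwise
        min_count=MIN_WORD_FREQ)        # keep every word that occurs at least once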

# condition inputs. We use five major emotions to condition our model's predictions
# original emotions
#EMOTIONS_TYPES = create_namedtuple_instance(
#    'EMOTIONS_TYPES', neutral='neutral', anger='anger', joy='joy', fear='fear', disgust='disgust')
# TODO we have emotions {"0": "neutral", "1": "anger", "2": "joy", "3": "laugh", "4": "disgust"}
EMOTIONS_TYPES = create_namedtuple_instance('EMOTIONS_TYPES',
                                            neutral='neutral',
                                            anger='anger',
                                            joy='joy',
                                            laugh='laugh',
                                            disgust='disgust')
DEFAULT_CONDITION = EMOTIONS_TYPES.joy  # Default condition used during prediction (if none is specified)
CONDITION_EMBEDDING_DIMENSION = 128  # Dimension of the trainable condition embedding layer
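
# --- Editor's note: hedged illustration (not in the original) of how a condition_to_index
# mapping like the one consumed by the load_datasets example above could be derived here:
condition_to_index_sketch = {cond: idx for idx, cond in enumerate(EMOTIONS_TYPES)}
# -> {'neutral': 0, 'anger': 1, 'joy': 2, 'laugh': 3, 'disgust': 4}
default_condition_id = condition_to_index_sketch[DEFAULT_CONDITION]  # 2, since DEFAULT_CONDITION is 'joy'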

# NN architecture params
ENCODER_DEPTH = 2  # Number of recurrent (GRU) layers for the encoder
DECODER_DEPTH = 2  # Number of recurrent (GRU) layers for the decoder
HIDDEN_LAYER_DIMENSION = 512  # Dimension for the recurrent layer
DENSE_DROPOUT_RATIO = 0.2  # Use dropout with the given ratio before decoder's output

# training params
INPUT_SEQUENCE_LENGTH = 30  # Input sequence length used by the model during training
INPUT_CONTEXT_SIZE = 3  # Maximum depth of the conversational history to be used in encoder (at least 1)
OUTPUT_SEQUENCE_LENGTH = 32  # Output sequence length. Better to keep as INPUT_SEQUENCE_LENGTH+2 for start/end tokens
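
# --- Editor's note: hedged illustration (not in the original) of why OUTPUT_SEQUENCE_LENGTH is
# kept at INPUT_SEQUENCE_LENGTH + 2: a maximal-length response still fits once the start/end
# markers (cf. SPECIAL_TOKENS elsewhere in the project) are wrapped around it.
response_tokens = ['tok'] * INPUT_SEQUENCE_LENGTH           # 30 content tokens
decoder_target = ['_start_'] + response_tokens + ['_end_']  # plus the two special markers
assert len(decoder_target) == OUTPUT_SEQUENCE_LENGTH        # 32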
Example 3
QUESTIONS_CORPUS_NAME = 'context_free_questions'  # Context-free questions only path

# word embeddings params
USE_PRETRAINED_W2V_EMBEDDINGS_LAYER = True  # Whether to use word2vec to pre-train weights for the embedding layer
TRAIN_WORD_EMBEDDINGS_LAYER = True  # Allow fine-tuning of the word embedding layer during the model training
W2V_MODEL_DIR = os.path.join(
    DATA_DIR, 'w2v_models')  # Path to store & load trained word2vec models
WORD_EMBEDDING_DIMENSION = 128  # word2vec embedding dimension
W2V_WINDOW_SIZE = 10  # word2vec window size, used during the w2v pre-training
USE_SKIP_GRAM = True  # Use skip-gram word2vec mode. When False, CBOW is used
MIN_WORD_FREQ = 1  # Minimum frequency of a word to be used in word2vec pre-calculation

# condition inputs. We use five major emotions to condition our model's predictions
EMOTIONS_TYPES = create_namedtuple_instance('EMOTIONS_TYPES',
                                            neutral='neutral',
                                            anger='anger',
                                            joy='joy',
                                            fear='fear',
                                            sadness='sadness')
DEFAULT_CONDITION = EMOTIONS_TYPES.neutral  # Default condition used during prediction (if none is specified)
CONDITION_EMBEDDING_DIMENSION = 128  # Dimension of the trainable condition embedding layer

# NN architecture params
ENCODER_DEPTH = 2  # Number of recurrent (GRU) layers for the encoder
DECODER_DEPTH = 2  # Number of recurrent (GRU) layers for the decoder
HIDDEN_LAYER_DIMENSION = 512  # Dimension for the recurrent layer
DENSE_DROPOUT_RATIO = 0.2  # Use dropout with the given ratio before decoder's output

# training params
INPUT_SEQUENCE_LENGTH = 30  # Input sequence length used by the model during training
INPUT_CONTEXT_SIZE = 3  # Maximum depth of the conversational history to be used in encoder (at least 1)
OUTPUT_SEQUENCE_LENGTH = 32  # Output sequence length. Better to keep as INPUT_SEQUENCE_LENGTH+2 for start/end tokens
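
# --- Editor's note: hedged sketch (not in the original; data format assumed) of how a dialog
# history could be shaped for the encoder under these settings: keep the last INPUT_CONTEXT_SIZE
# utterances and pad/truncate each of them to INPUT_SEQUENCE_LENGTH token ids.
def shape_context_sketch(utterance_token_ids, pad_id=0):
    """utterance_token_ids: list of token-id lists, oldest turn first (assumed format)."""
    turns = utterance_token_ids[-INPUT_CONTEXT_SIZE:]         # at most the last 3 turns
    turns = [[]] * (INPUT_CONTEXT_SIZE - len(turns)) + turns  # left-pad missing turns with empty ones
    return [(t + [pad_id] * INPUT_SEQUENCE_LENGTH)[:INPUT_SEQUENCE_LENGTH] for t in turns]  # 3 rows of 30 ids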
Example 4
from cakechat.utils.data_structures import create_namedtuple_instance

SPECIAL_TOKENS = create_namedtuple_instance(
    'SPECIAL_TOKENS', PAD_TOKEN=u'_pad_', UNKNOWN_TOKEN=u'_unk_', START_TOKEN=u'_start_', EOS_TOKEN=u'_end_')
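
# --- Editor's note: create_namedtuple_instance is a project helper; the sketch below is a
# minimal re-implementation (an assumption, not the original code) consistent with how it is
# called above.
from collections import namedtuple

def _create_namedtuple_instance_sketch(name, **kwargs):
    # Build an ad-hoc namedtuple class and instantiate it in one go, so that
    # SPECIAL_TOKENS.PAD_TOKEN == '_pad_', SPECIAL_TOKENS.EOS_TOKEN == '_end_', etc.
    return namedtuple(name, kwargs.keys())(**kwargs)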

DIALOG_TEXT_FIELD = 'text'
DIALOG_CONDITION_FIELD = 'condition'
Example 5
    # __init__ of CakeChatModel (the enclosing class statement is not part of this excerpt)
    def __init__(self,
                 index_to_token,
                 index_to_condition,
                 training_data_param,
                 validation_data_param,
                 w2v_model_param,
                 model_init_path=None,
                 model_resolver=None,
                 model_name=MODEL_NAME,
                 corpus_name=BASE_CORPUS_NAME,
                 skip_token=SPECIAL_TOKENS.PAD_TOKEN,
                 token_embedding_dim=WORD_EMBEDDING_DIMENSION,
                 train_token_embedding=TRAIN_WORD_EMBEDDINGS_LAYER,
                 condition_embedding_dim=CONDITION_EMBEDDING_DIMENSION,
                 input_seq_len=INPUT_SEQUENCE_LENGTH,
                 input_context_size=INPUT_CONTEXT_SIZE,
                 output_seq_len=OUTPUT_SEQUENCE_LENGTH,
                 hidden_layer_dim=HIDDEN_LAYER_DIMENSION,
                 use_cudnn=USE_CUDNN,
                 dense_dropout_ratio=DENSE_DROPOUT_RATIO,
                 is_reverse_model=False,
                 reverse_model=None,
                 learning_rate=LEARNING_RATE,
                 grad_clip=GRAD_CLIP,
                 batch_size=BATCH_SIZE,
                 epochs_num=EPOCHS_NUM,
                 horovod=None,
                 tensorboard_log_dir=TENSORBOARD_LOG_DIR,
                 log_run_metadata=LOG_RUN_METADATA):
        """
        :param index_to_token: Dict with mapping: tokens indices to tokens
        :param index_to_condition: Dict with mapping: condition indices to condition values
        :param training_data_param: Instance of ModelParam, tuple (value, id) where value is a dataset used for training
        and id is the name of this dataset
        :param validation_data_param: Instance of ModelParam, tuple (value, id) where value is a dataset used for
        metrics calculation and id is a concatenation of these datasets' names
        :param w2v_model_param: Instance of ModelParam, tuple (value, id) where value is a word2vec matrix of shape
        (vocab_size, token_embedding_dim) with float values, used for initializing token embedding layers, and id is
        the name of word2vec model
        :param model_init_path: Path to a file with the model's saved weights, used for layer initialization
        :param model_resolver: Factory that takes model path and returns a file resolver object
        :param model_name: String prefix prepended to the automatically generated model name. The prefix helps
        distinguish the current experiment from other experiments with similar params.
        :param corpus_name: File name of the training dataset (included into automatically generated model's name)
        :param skip_token: Token to be masked out, usually the _pad_ token; its id is inferred from the
        index_to_token dictionary
        :param token_embedding_dim: Dimensionality of the token embedding vectors
        :param train_token_embedding: Bool value indicating whether to train token embeddings along with the other
        model weights or keep them frozen during training
        :param condition_embedding_dim: Dimensionality of the condition embedding vectors
        :param input_seq_len: Max number of tokens in the context sentences
        :param input_context_size: Max number of sentences in the context
        :param output_seq_len: Max number of tokens in the output sentences
        :param hidden_layer_dim: Dimensionality of the hidden GRU and Dense layers
        :param dense_dropout_ratio: Float value between 0 and 1, indicating the ratio of neurons that will be randomly
        deactivated during training to prevent model's overfitting
        :param is_reverse_model: Bool value indicating the type of model:
        False (regular model) - predicts a response for the given context
        True (reverse model) - predicts a context for the given response (actually, predicts the last context sentence
        given the response and the beginning of the context) - used for calculating the Maximum Mutual Information metric
        :param reverse_model: Trained reverse model used to generate predictions in *_reranking modes
        :param learning_rate: Learning rate of the optimization algorithm
        :param grad_clip: Clipping parameter of the optimization algorithm, used to prevent gradient explosion
        :param batch_size: Number of samples to be used for gradient estimation on each train step
        :param epochs_num: Number of full dataset passes during train
        :param horovod: Initialized horovod module used for multi-GPU training. Trains on a single GPU if horovod=None
        :param tensorboard_log_dir: Path to tensorboard logs directory
        :param log_run_metadata: Set to True to profile memory consumption and computation time in TensorBoard
        """
        # Calculate batches number in each epoch.
        # The last batch which may be smaller than batch size is included in this number
        batches_num_per_epoch = math.ceil(training_data_param.value.x.shape[0] / batch_size) \
            if training_data_param.value else None
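        # e.g. with 1000 training samples and batch_size=192, math.ceil(1000 / 192) == 6
        # batches per epoch; the last batch holds only the remaining 40 samples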

        # Create callbacks
        callbacks = self._create_essential_callbacks(self, horovod)
        callbacks.extend([
            # Custom callback for metrics calculation
            CakeChatEvaluatorCallback(self, index_to_token, batch_size,
                                      batches_num_per_epoch)
        ])

        super(CakeChatModel, self).__init__(
            model_resolver_factory=model_resolver,
            metrics_plotter=TensorboardMetricsPlotter(tensorboard_log_dir),
            horovod=horovod,
            training_callbacks=callbacks)
        WithLogger.__init__(self)

        self._model_name = 'reverse_{}'.format(
            model_name) if is_reverse_model else model_name
        self._rnn_class = CuDNNGRU if use_cudnn else partial(GRU,
                                                             reset_after=True)

        # tokens params
        self._index_to_token = index_to_token
        self._token_to_index = {v: k for k, v in index_to_token.items()}
        self._vocab_size = len(self._index_to_token)
        self._skip_token_id = self._token_to_index[skip_token]

        self._token_embedding_dim = token_embedding_dim
        self._train_token_embedding = train_token_embedding
        self._W_init_embedding = \
            self._build_embedding_matrix(self._token_to_index, w2v_model_param.value, token_embedding_dim) \
                if w2v_model_param.value else None

        # condition params
        self._index_to_condition = index_to_condition
        self._condition_to_index = {
            v: k
            for k, v in index_to_condition.items()
        }
        self._condition_embedding_dim = condition_embedding_dim

        # data params
        self._training_data = training_data_param.value
        self._validation_data = validation_data_param.value

        # train params
        self._batches_num_per_epoch = batches_num_per_epoch
        self._model_init_path = model_init_path
        self._horovod = horovod

        self._optimizer = optimizers.Adadelta(lr=learning_rate,
                                              clipvalue=grad_clip)
        if self._horovod:
            self._optimizer = horovod.DistributedOptimizer(self._optimizer)

        # gather model's params that define the experiment setting
        self._params = create_namedtuple_instance(
            name='Params',
            corpus_name=corpus_name,
            input_context_size=input_context_size,
            input_seq_len=input_seq_len,
            output_seq_len=output_seq_len,
            token_embedding_dim=token_embedding_dim,
            train_batch_size=batch_size,
            hidden_layer_dim=hidden_layer_dim,
            w2v_model=w2v_model_param.id,
            is_reverse_model=is_reverse_model,
            dense_dropout_ratio=dense_dropout_ratio,
            voc_size=len(self._token_to_index),
            training_data=training_data_param.id,
            validation_data=validation_data_param.id,
            epochs_num=epochs_num,
            optimizer=self._optimizer.get_config())

        # profiling params
        self._run_options = tf.RunOptions(
            trace_level=tf.RunOptions.FULL_TRACE) if log_run_metadata else None
        self._run_metadata = tf.RunMetadata() if log_run_metadata else None

        # parts of computational graph
        self._models = None

        # get trained reverse model used for inference
        self._reverse_model = reverse_model
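
# --- Editor's note: self-contained sketch (not from the original source) of two patterns used
# in this constructor: the (value, id) param tuples described in the docstring, and the index
# inversion used to build _token_to_index / _skip_token_id. `ModelParamSketch` is a hypothetical
# stand-in for the project's ModelParam.
from collections import namedtuple

ModelParamSketch = namedtuple('ModelParamSketch', ['value', 'id'])
w2v_model_param = ModelParamSketch(value=None, id='no_w2v')  # value=None skips embedding-matrix init

index_to_token = {0: '_pad_', 1: '_unk_', 2: '_start_', 3: '_end_', 4: 'hello'}
token_to_index = {token: idx for idx, token in index_to_token.items()}
skip_token_id = token_to_index['_pad_']  # the id that gets masked out during training, here 0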
Example 6
TEST_DATA_DIR = os.path.join(DATA_DIR, 'quality')  # Path to datasets for quality metrics calculation
CONTEXT_FREE_VAL_CORPUS_NAME = 'context_free_validation_set'  # Context-free validation set path
TEST_CORPUS_NAME = 'context_free_test_set'  # Context-free test set path
QUESTIONS_CORPUS_NAME = 'context_free_questions'  # Context-free questions only path

# word embeddings params
USE_PRETRAINED_W2V_EMBEDDINGS_LAYER = True  # Whether to use word2vec to pre-train weights for the embedding layer
TRAIN_WORD_EMBEDDINGS_LAYER = True  # Allow fine-tuning of the word embedding layer during the model training
W2V_MODEL_DIR = os.path.join(DATA_DIR, 'w2v_models')  # Path to store & load trained word2vec models
WORD_EMBEDDING_DIMENSION = 128  # word2vec embedding dimension
W2V_WINDOW_SIZE = 10  # word2vec window size, used during the w2v pre-training
USE_SKIP_GRAM = True  # Use skip-gram word2vec mode. When False, CBOW is used
MIN_WORD_FREQ = 1  # Minimum frequency of a word to be used in word2vec pre-calculation

# condition inputs. We use five major emotions to condition our model's predictions
EMOTIONS_TYPES = create_namedtuple_instance(
    'EMOTIONS_TYPES', neutral='neutral', anger='anger', joy='joy', fear='fear', sadness='sadness')
DEFAULT_CONDITION = EMOTIONS_TYPES.neutral  # Default condition used during prediction (if none is specified)
CONDITION_EMBEDDING_DIMENSION = 128  # Dimension of the trainable condition embedding layer

# NN architecture params
ENCODER_DEPTH = 2  # Number of recurrent (GRU) layers for the encoder
DECODER_DEPTH = 2  # Number of recurrent (GRU) layers for the decoder
HIDDEN_LAYER_DIMENSION = 512  # Dimension for the recurrent layer
DENSE_DROPOUT_RATIO = 0.2  # Use dropout with the given ratio before decoder's output

# training params
INPUT_SEQUENCE_LENGTH = 30  # Input sequence length used by the model during training
INPUT_CONTEXT_SIZE = 3  # Maximum depth of the conversational history to be used in encoder (at least 1)
OUTPUT_SEQUENCE_LENGTH = 32  # Output sequence length. Better to keep as INPUT_SEQUENCE_LENGTH+2 for start/end tokens
BATCH_SIZE = 192  # Default batch size which fits into 8GB of GPU memory
SHUFFLE_TRAINING_BATCHES = True  # Shuffle training batches in the dataset each epoch
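
# --- Editor's note: hedged sketch (not in the original) of the tensor shapes these settings
# imply for one training batch of this context-conditioned seq2seq model:
import numpy as np

x_batch = np.zeros((BATCH_SIZE, INPUT_CONTEXT_SIZE, INPUT_SEQUENCE_LENGTH), dtype=np.int32)  # (192, 3, 30) token ids
y_batch = np.zeros((BATCH_SIZE, OUTPUT_SEQUENCE_LENGTH), dtype=np.int32)                     # (192, 32) token ids
condition_ids_batch = np.zeros(BATCH_SIZE, dtype=np.int32)                                   # one condition id per sample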
Example 7
TEST_DATA_DIR = os.path.join(DATA_DIR, 'quality')  # Path to datasets for quality metrics calculation
CONTEXT_FREE_VAL_CORPUS_NAME = 'context_free_validation_set'  # Context-free validation set path
TEST_CORPUS_NAME = 'context_free_test_set'  # Context-free test set path
QUESTIONS_CORPUS_NAME = 'context_free_questions'  # Context-free questions only path

# word embeddings params
USE_PRETRAINED_W2V_EMBEDDINGS_LAYER = True  # Whether to use word2vec to pre-train weights for the embedding layer
TRAIN_WORD_EMBEDDINGS_LAYER = True  # Allow fine-tuning of the word embedding layer during the model training
W2V_MODEL_DIR = os.path.join(DATA_DIR, 'w2v_models')  # Path to store & load trained word2vec models
WORD_EMBEDDING_DIMENSION = 128  # word2vec embedding dimension
W2V_WINDOW_SIZE = 10  # word2vec window size, used during the w2v pre-training
USE_SKIP_GRAM = True  # Use skip-gram word2vec mode. When False, CBOW is used
MIN_WORD_FREQ = 1  # Minimum frequency of a word to be used in word2vec pre-calculation

# condition inputs. This variant conditions the model's predictions on speaker roles (therapist/client)
EMOTIONS_TYPES = create_namedtuple_instance(
    'EMOTIONS_TYPES', therapist='thera', client='client')
DEFAULT_CONDITION = EMOTIONS_TYPES.therapist  # Default condition used during prediction (if none is specified)
CONDITION_EMBEDDING_DIMENSION = 128  # Dimension of the trainable condition embedding layer
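
# --- Editor's note (not in the original): the field is named `therapist` but its value is the
# string 'thera', so 'thera' is the label that would end up in condition_to_index and is also
# the DEFAULT_CONDITION here.
assert EMOTIONS_TYPES.therapist == 'thera' and DEFAULT_CONDITION == 'thera'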

# NN architecture params
ENCODER_DEPTH = 2  # Number of recurrent (GRU) layers for the encoder
DECODER_DEPTH = 2  # Number of recurrent (GRU) layers for the decoder
HIDDEN_LAYER_DIMENSION = 512  # Dimension for the recurrent layer
DENSE_DROPOUT_RATIO = 0.2  # Use dropout with the given ratio before decoder's output

# training params
INPUT_SEQUENCE_LENGTH = 30  # Input sequence length used by the model during training
INPUT_CONTEXT_SIZE = 3  # Maximum depth of the conversational history to be used in encoder (at least 1)
OUTPUT_SEQUENCE_LENGTH = 32  # Output sequence length. Better to keep as INPUT_SEQUENCE_LENGTH+2 for start/end tokens
BATCH_SIZE = 192  # Default batch size which fits into 8GB of GPU memory
SHUFFLE_TRAINING_BATCHES = True  # Shuffle training batches in the dataset each epoch