Ejemplo n.º 1
0
def build_index_mappings(corpus_path, max_tokens_num=MAX_TOKENS_NUM, max_conditions_num=MAX_CONDITIONS_NUM):
    """Build index-to-token and index-to-condition mappings from a dialogs corpus.

    Every utterance of every dialog is tokenized with ``get_tokens_sequence``;
    token and condition frequencies are accumulated over the whole corpus.

    :param corpus_path: path of the corpus file to read
    :param max_tokens_num: size limit of the vocabulary; SPECIAL_TOKENS come
        first and the remaining ``max_tokens_num - len(SPECIAL_TOKENS)`` slots
        are filled with the most frequent corpus tokens
    :param max_conditions_num: number of most frequent conditions to keep
    :return: tuple ``(index_to_token, index_to_condition)``, both dicts keyed
        by consecutive integers starting at 0
    :raises ValueError: if ``is_non_empty_file`` rejects ``corpus_path``
    :raises Exception: if DEFAULT_CONDITION is not among the kept conditions
    """
    if not is_non_empty_file(corpus_path):
        raise ValueError('Test corpus file doesn\'t exist: {}'.format(corpus_path))

    dialogs = load_processed_dialogs_from_json(
        FileTextLinesIterator(corpus_path), text_field_name=TEXT_FIELD_NAME, condition_field_name=CONDITION_FIELD_NAME)

    tokens_counter = Counter()
    conditions_counter = Counter()

    for dialog in dialogs:
        for utterance in dialog:
            # Tokenize dialog utterance text and update tokens count.
            # Counter.update only touches the tokens of this utterance, whereas
            # `counter += Counter(tokens)` re-scans the whole accumulated counter
            # on every call, which gets slower as the vocabulary grows.
            tokens = get_tokens_sequence(utterance[TEXT_FIELD_NAME])
            tokens_counter.update(tokens)
            # Update conditions count
            conditions_counter[utterance[CONDITION_FIELD_NAME]] += 1

    # Build the tokens list: special tokens first, then the most frequent corpus tokens
    vocab = list(SPECIAL_TOKENS) + \
            [token for token, _ in tokens_counter.most_common(max_tokens_num - len(SPECIAL_TOKENS))]

    # Build the conditions list
    conditions = [condition for condition, _ in conditions_counter.most_common(max_conditions_num)]

    # Validate the condition list
    if DEFAULT_CONDITION not in conditions:
        raise Exception('No default condition "%s" found in the dataset condition list.' % DEFAULT_CONDITION)

    # Return index_to_token and index_to_condition mappings
    return dict(enumerate(vocab)), dict(enumerate(conditions))
def build_index_mappings(corpus_path, max_tokens_num=VOCABULARY_MAX_SIZE, max_conditions_num=MAX_CONDITIONS_NUM,
                         simple_tokenize=SIMPLE_TOKENIZE):
    """Build index-to-token and index-to-condition mappings from a dialogs corpus.

    :param corpus_path: path of the corpus file to read
    :param max_tokens_num: size limit of the vocabulary, SPECIAL_TOKENS included
    :param max_conditions_num: number of most frequent conditions to keep
    :param simple_tokenize: when truthy, utterances are split on whitespace
        with ``str.split()``; otherwise ``get_tokens_sequence`` is used
    :return: tuple ``(index_to_token, index_to_condition)`` of dicts keyed by
        consecutive integers starting at 0
    :raises ValueError: if ``is_non_empty_file`` rejects ``corpus_path``
    :raises Exception: if DEFAULT_CONDITION is not among the kept conditions
    """
    if not is_non_empty_file(corpus_path):
        raise ValueError('Test corpus file doesn\'t exist: {}'.format(corpus_path))

    corpus_lines = FileTextLinesIterator(corpus_path)
    dialogs = load_processed_dialogs_from_json(
        corpus_lines, text_field_name=TEXT_FIELD_NAME, condition_field_name=CONDITION_FIELD_NAME)

    # Frequencies accumulated over every utterance of every dialog
    token_freqs = Counter()
    condition_freqs = Counter()

    for dialog in tqdm(dialogs):
        for utterance in dialog:
            text = utterance[TEXT_FIELD_NAME]
            if simple_tokenize:
                tokens = text.split()
            else:
                tokens = get_tokens_sequence(text)

            token_freqs.update(tokens)
            condition_freqs[utterance[CONDITION_FIELD_NAME]] += 1

    # Special tokens take the first indices; the rest of the budget goes to
    # the most frequent corpus tokens
    vocab = list(SPECIAL_TOKENS)
    top_tokens = token_freqs.most_common(max_tokens_num - len(SPECIAL_TOKENS))
    vocab.extend(token for token, _ in top_tokens)

    # Keep only the most frequent conditions
    top_conditions = condition_freqs.most_common(max_conditions_num)
    conditions = [pair[0] for pair in top_conditions]

    # The default condition has to survive the frequency cut-off
    if DEFAULT_CONDITION not in conditions:
        raise Exception('No default condition "{}" found in the dataset condition list.'.format(DEFAULT_CONDITION))

    index_to_token = dict(enumerate(vocab))
    index_to_condition = dict(enumerate(conditions))
    return index_to_token, index_to_condition
Ejemplo n.º 3
0
    def _load_model_if_exists(self):
        """Restore the model and its metrics from the saved training progress, if any.

        When ``is_non_empty_file`` rejects the progress resource path, nothing
        is loaded and an info message is logged instead.
        """
        if not is_non_empty_file(self._model_progress_resource_path):
            # No usable checkpoint: leave the model untouched and just report it
            self._logger.info(
                'Could not find saved model at {}\nModel will be trained from scratch.\n'
                .format(self._model_progress_resource_path))
            return

        self._model = self._load_model(self._model, self._model_progress_resource_path)
        self._metrics = self._metrics_serializer.load_metrics(self._metrics_resource_path)
Ejemplo n.º 4
0
def get_tokenized_test_lines(corpus_name, tokens_voc):
    """Load a test corpus and return its lines as token sequences.

    The corpus is read from ``<TEST_DATA_DIR>/<corpus_name>.txt``. Each line is
    tokenized with ``get_tokens_sequence`` and then passed through
    ``replace_out_of_voc_tokens`` together with ``tokens_voc``.

    :param corpus_name: corpus file name without the ``.txt`` extension
    :param tokens_voc: vocabulary handed to ``replace_out_of_voc_tokens``
    :return: list with one processed token sequence per corpus line
    :raises ValueError: if ``is_non_empty_file`` rejects the corpus path
    """
    corpus_path = os.path.join(TEST_DATA_DIR, '%s.txt' % corpus_name)
    if not is_non_empty_file(corpus_path):
        raise ValueError('Test corpus file doesn\'t exist: {}'.format(corpus_path))

    return [
        replace_out_of_voc_tokens(get_tokens_sequence(raw_line), tokens_voc)
        for raw_line in load_file(corpus_path)
    ]
Ejemplo n.º 5
0
def build_index_mappings(corpus_path,
                         max_tokens_num=MAX_TOKENS_NUM,
                         max_conditions_num=MAX_CONDITIONS_NUM):
    """Build index-to-token and index-to-condition mappings from a dialogs corpus.

    Every utterance of every dialog is tokenized with ``get_tokens_sequence``;
    token and condition frequencies are accumulated over the whole corpus.

    :param corpus_path: path of the corpus file to read
    :param max_tokens_num: size limit of the vocabulary; SPECIAL_TOKENS come
        first and the remaining ``max_tokens_num - len(SPECIAL_TOKENS)`` slots
        are filled with the most frequent corpus tokens
    :param max_conditions_num: number of most frequent conditions to keep
    :return: tuple ``(index_to_token, index_to_condition)``, both dicts keyed
        by consecutive integers starting at 0
    :raises ValueError: if ``is_non_empty_file`` rejects ``corpus_path``
    :raises Exception: if DEFAULT_CONDITION is not among the kept conditions
    """
    if not is_non_empty_file(corpus_path):
        raise ValueError(
            'Test corpus file doesn\'t exist: {}'.format(corpus_path))

    dialogs = load_processed_dialogs_from_json(
        FileTextLinesIterator(corpus_path),
        text_field_name=TEXT_FIELD_NAME,
        condition_field_name=CONDITION_FIELD_NAME)

    tokens_counter = Counter()
    conditions_counter = Counter()

    for dialog in dialogs:
        for utterance in dialog:
            # Tokenize dialog utterance text and update tokens count.
            # Counter.update only touches the tokens of this utterance, whereas
            # `counter += Counter(tokens)` re-scans the whole accumulated counter
            # on every call, which gets slower as the vocabulary grows.
            tokens = get_tokens_sequence(utterance[TEXT_FIELD_NAME])
            tokens_counter.update(tokens)
            # Update conditions count
            conditions_counter[utterance[CONDITION_FIELD_NAME]] += 1

    # Build the tokens list: special tokens first, then the most frequent corpus tokens
    vocab = list(SPECIAL_TOKENS) + \
            [token for token, _ in tokens_counter.most_common(max_tokens_num - len(SPECIAL_TOKENS))]

    # Build the conditions list
    conditions = [
        condition
        for condition, _ in conditions_counter.most_common(max_conditions_num)
    ]

    # Validate the condition list
    if DEFAULT_CONDITION not in conditions:
        raise Exception(
            'No default condition "%s" found in the dataset condition list.' %
            DEFAULT_CONDITION)

    # Return index_to_token and index_to_condition mappings
    return dict(enumerate(vocab)), dict(enumerate(conditions))
Ejemplo n.º 6
0
def predict(model_path,
            tokens_index_path=None,
            conditions_index_path=None,
            default_predictions_path=None,
            reverse_model_weights=None,
            temperatures=None,
            prediction_mode=None):
    """Generate responses for the questions corpus and dump them to .tsv files.

    One output file is produced per combination of the supplied
    ``reverse_model_weights`` and ``temperatures`` values.

    :param model_path: passed to ``get_nn_model`` as ``model_init_path``
    :param tokens_index_path: tokens index file; defaults to
        ``get_index_to_token_path(BASE_CORPUS_NAME)``
    :param conditions_index_path: conditions index file; defaults to
        ``get_index_to_condition_path(BASE_CORPUS_NAME)``
    :param default_predictions_path: output path without extension; defaults to
        ``<DATA_DIR>/results/predictions_<model name>``
    :param reverse_model_weights: optional iterable of values for the
        ``mmi_reverse_model_score_weight`` parameter
    :param temperatures: iterable of temperature values; defaults to
        ``[DEFAULT_TEMPERATURE]``
    :param prediction_mode: defaults to PREDICTION_MODE_FOR_TESTS
    :return: None; returns early (after a warning) when the tokens index file
        is missing or empty
    """
    # Fall back to the defaults for every argument that was not supplied
    tokens_index_path = tokens_index_path or get_index_to_token_path(BASE_CORPUS_NAME)
    conditions_index_path = conditions_index_path or get_index_to_condition_path(BASE_CORPUS_NAME)
    temperatures = temperatures or [DEFAULT_TEMPERATURE]
    prediction_mode = prediction_mode or PREDICTION_MODE_FOR_TESTS

    # Cartesian product of all supplied parameter values, grown one parameter
    # at a time. `temperatures` is never empty here because of the default above.
    param_grid = [{}]
    for param_name, param_values in (('mmi_reverse_model_score_weight', reverse_model_weights),
                                     ('temperature', temperatures)):
        if not param_values:
            continue
        param_grid = [dict(base, **{param_name: value}) for base in param_grid for value in param_values]

    if not is_non_empty_file(tokens_index_path):
        _logger.warning('Couldn\'t find tokens_index file:\n{}. \nExiting...'.format(tokens_index_path))
        return

    index_to_token = load_index_to_item(tokens_index_path)
    index_to_condition = load_index_to_item(conditions_index_path)

    nn_model, _ = get_nn_model(index_to_token, index_to_condition, model_init_path=model_path)

    # The default output location depends on the model name, so it can only be
    # resolved once the model is available
    if not default_predictions_path:
        results_file_stem = 'predictions_' + nn_model.model_name
        default_predictions_path = os.path.join(DATA_DIR, 'results', results_file_stem)

    # A parameters suffix is added to the file name only when there are several
    # combinations to tell apart; a single combination gets the plain name
    if len(param_grid) > 1:
        output_paths = [
            '{base_path}_{params_str}.tsv'.format(
                base_path=default_predictions_path,
                params_str='_'.join('{}_{}'.format(name, value) for name, value in params.items()))
            for params in param_grid
        ]
    else:
        output_paths = [default_predictions_path + '.tsv']

    output_paths_report = '\n'.join(output_paths)
    params_report = '\n'.join(str(params) for params in param_grid)
    _logger.info('Model for prediction:\n{}'.format(nn_model.model_load_path))
    _logger.info('Tokens index:\n{}'.format(tokens_index_path))
    _logger.info('File with questions:\n{}'.format(QUESTIONS_CORPUS_NAME))
    _logger.info('Files to dump responses:\n{}'.format(output_paths_report))
    _logger.info('Prediction parameters\n{}'.format(params_report))

    known_tokens = set(index_to_token.values())
    processed_test_set = list(get_tokenized_test_lines(QUESTIONS_CORPUS_NAME, known_tokens))

    # NOTE(review): cur_params is only logged below and is not forwarded to
    # _save_test_results - confirm this is intended.
    for cur_params, cur_path in zip(param_grid, output_paths):
        _logger.info('Predicting with the following params: {}'.format(cur_params))
        _save_test_results(processed_test_set, cur_path, nn_model, prediction_modes=[prediction_mode])
Ejemplo n.º 7
0
def _look_for_saved_files(files_paths):
    """Check that every given path passes ``is_non_empty_file``.

    :param files_paths: iterable of file paths to check, in order
    :raises Exception: for the first path that fails the check
    """
    for candidate_path in files_paths:
        if is_non_empty_file(candidate_path):
            continue
        raise Exception('\nCould not find the following file or it\'s empty: {0}'.format(candidate_path))
Ejemplo n.º 8
0
def predict(model_path=None,
            tokens_index_path=None,
            conditions_index_path=None,
            default_predictions_path=None,
            reverse_model_weights=None,
            temperatures=None,
            prediction_mode=PREDICTION_MODE_FOR_TESTS):
    """Generate responses for the questions corpus and dump them to .tsv files.

    One output file is produced per combination of the supplied
    ``reverse_model_weights`` and ``temperatures`` values; each combination is
    forwarded to ``_save_test_results`` as keyword arguments.

    :param model_path: model file; defaults to ``get_model_full_path()``
    :param tokens_index_path: tokens index file; defaults to
        ``get_index_to_token_path(BASE_CORPUS_NAME)``
    :param conditions_index_path: conditions index file; defaults to
        ``get_index_to_condition_path(BASE_CORPUS_NAME)``
    :param default_predictions_path: output path without extension; defaults to
        ``<DATA_DIR>/results/predictions_<get_model_full_params_str()>``
    :param reverse_model_weights: optional iterable of values for the
        ``mmi_reverse_model_score_weight`` parameter
    :param temperatures: optional iterable of values for the ``temperature``
        parameter
    :param prediction_mode: passed through to ``_save_test_results``
    :return: None; returns early (after a warning) when the model file or the
        tokens index file is missing or empty
    """
    if not model_path:
        model_path = get_model_full_path()
    if not tokens_index_path:
        tokens_index_path = get_index_to_token_path(BASE_CORPUS_NAME)
    if not conditions_index_path:
        conditions_index_path = get_index_to_condition_path(BASE_CORPUS_NAME)
    if not default_predictions_path:
        default_predictions_path = os.path.join(DATA_DIR, 'results', 'predictions_' + get_model_full_params_str())

    # Construct list of parameters values for all possible combinations of passed parameters
    prediction_params = [dict()]
    if reverse_model_weights:
        prediction_params = [
            dict(params, mmi_reverse_model_score_weight=w)
            for params in prediction_params
            for w in reverse_model_weights
        ]
    if temperatures:
        prediction_params = [dict(params, temperature=t) for params in prediction_params for t in temperatures]

    # Get path for each combination of parameters
    predictions_paths = []
    # Add suffix to the filename only for parameters that have a specific value passed as an argument
    # If no parameters were specified, no suffix is added
    if len(prediction_params) > 1:
        for cur_params in prediction_params:
            cur_path = '{base_path}_{params_str}.tsv'.format(
                base_path=default_predictions_path,
                params_str='_'.join(['{}_{}'.format(k, v) for k, v in cur_params.items()]))
            predictions_paths.append(cur_path)
    else:
        predictions_paths = [default_predictions_path + '.tsv']

    # Logger.warn is a deprecated alias of Logger.warning, so the supported
    # spelling is used for both early-exit messages.
    if not is_non_empty_file(model_path):
        _logger.warning('Couldn\'t find model:\n"{}". \nExiting...'.format(model_path))
        return

    if not is_non_empty_file(tokens_index_path):
        _logger.warning('Couldn\'t find tokens_index file:\n"{}". \nExiting...'.format(tokens_index_path))
        return

    _logger.info('Model for prediction:\n{}'.format(model_path))
    _logger.info('Tokens index:\n{}'.format(tokens_index_path))
    _logger.info('File with questions:\n{}'.format(QUESTIONS_CORPUS_NAME))
    _logger.info('Files to dump responses:\n{}'.format('\n'.join(predictions_paths)))
    _logger.info('Prediction parameters\n{}'.format('\n'.join([str(x) for x in prediction_params])))

    index_to_token = load_index_to_item(tokens_index_path)
    index_to_condition = load_index_to_item(conditions_index_path)

    processed_test_set = get_tokenized_test_lines(QUESTIONS_CORPUS_NAME, set(index_to_token.values()))
    processed_test_set = list(processed_test_set)

    nn_model, _ = get_nn_model(index_to_token, index_to_condition, nn_model_path=model_path)

    # Each parameters combination is written to its own predictions file
    for cur_params, cur_path in zip(prediction_params, predictions_paths):
        _logger.info('Predicting with the following params: {}'.format(cur_params))
        _save_test_results(processed_test_set, cur_path, nn_model, prediction_mode, **cur_params)