def build_index_mappings(corpus_path, max_tokens_num=MAX_TOKENS_NUM, max_conditions_num=MAX_CONDITIONS_NUM):
    """Build index-to-token and index-to-condition mappings from a processed dialogs corpus.

    Args:
        corpus_path: Path to the processed-dialogs corpus file (JSON lines).
        max_tokens_num: Total vocabulary budget, including SPECIAL_TOKENS.
        max_conditions_num: Maximum number of conditions to keep.

    Returns:
        A tuple ``(index_to_token, index_to_condition)`` of dicts mapping
        consecutive integer indices to the most frequent tokens/conditions.

    Raises:
        ValueError: If the corpus file is missing or empty.
        Exception: If DEFAULT_CONDITION does not occur in the corpus.
    """
    if not is_non_empty_file(corpus_path):
        raise ValueError('Test corpus file doesn\'t exist: {}'.format(corpus_path))

    dialogs = load_processed_dialogs_from_json(
        FileTextLinesIterator(corpus_path),
        text_field_name=TEXT_FIELD_NAME,
        condition_field_name=CONDITION_FIELD_NAME)

    tokens_counter = Counter()
    conditions_counter = Counter()

    for dialog in dialogs:
        for utterance in dialog:
            # Tokenize dialog utterance text and update tokens count.
            tokens = get_tokens_sequence(utterance[TEXT_FIELD_NAME])
            # FIX: the original `tokens_counter += Counter(tokens)` built a
            # brand-new Counter over the whole accumulated vocabulary on every
            # utterance (quadratic in corpus size); update() is O(len(tokens)).
            tokens_counter.update(tokens)
            # Update conditions count
            conditions_counter[utterance[CONDITION_FIELD_NAME]] += 1

    # Build the tokens list: special tokens first, remaining budget goes to
    # the most frequent corpus tokens.
    vocab = list(SPECIAL_TOKENS) + \
        [token for token, _ in tokens_counter.most_common(max_tokens_num - len(SPECIAL_TOKENS))]

    # Build the conditions list
    conditions = [condition for condition, _ in conditions_counter.most_common(max_conditions_num)]

    # Validate the condition list
    if DEFAULT_CONDITION not in conditions:
        raise Exception('No default condition "%s" found in the dataset condition list.' % DEFAULT_CONDITION)

    # Return index_to_token and index_to_condition mappings
    return dict(enumerate(vocab)), dict(enumerate(conditions))
def build_index_mappings(corpus_path, max_tokens_num=VOCABULARY_MAX_SIZE, max_conditions_num=MAX_CONDITIONS_NUM, simple_tokenize=SIMPLE_TOKENIZE):
    """Build index-to-token and index-to-condition mappings for a dialogs corpus.

    Args:
        corpus_path: Path to the processed-dialogs corpus file (JSON lines).
        max_tokens_num: Total vocabulary budget, including SPECIAL_TOKENS.
        max_conditions_num: Maximum number of conditions to keep.
        simple_tokenize: When truthy, tokenize by whitespace split instead of
            get_tokens_sequence().

    Returns:
        A tuple ``(index_to_token, index_to_condition)`` of dicts.

    Raises:
        ValueError: If the corpus file is missing or empty.
        Exception: If DEFAULT_CONDITION does not occur in the corpus.
    """
    if not is_non_empty_file(corpus_path):
        raise ValueError('Test corpus file doesn\'t exist: {}'.format(corpus_path))

    dialogs = load_processed_dialogs_from_json(
        FileTextLinesIterator(corpus_path),
        text_field_name=TEXT_FIELD_NAME,
        condition_field_name=CONDITION_FIELD_NAME)

    token_counts = Counter()
    condition_counts = Counter()

    for dialog in tqdm(dialogs):
        for utterance in dialog:
            text = utterance[TEXT_FIELD_NAME]
            if simple_tokenize:
                tokens = text.split()
            else:
                tokens = get_tokens_sequence(text)
            token_counts.update(tokens)
            condition_counts[utterance[CONDITION_FIELD_NAME]] += 1

    # Special tokens come first; the rest of the budget is filled with the
    # most frequent corpus tokens.
    vocab = list(SPECIAL_TOKENS)
    vocab.extend(token for token, _ in token_counts.most_common(max_tokens_num - len(SPECIAL_TOKENS)))

    conditions = [condition for condition, _ in condition_counts.most_common(max_conditions_num)]

    # The default condition must always be representable.
    if DEFAULT_CONDITION not in conditions:
        raise Exception('No default condition "{}" found in the dataset condition list.'.format(DEFAULT_CONDITION))

    return dict(enumerate(vocab)), dict(enumerate(conditions))
def _load_model_if_exists(self):
    """Restore model weights and metrics from disk when a saved snapshot exists.

    If no non-empty snapshot file is found, logs an informational message and
    leaves the current model untouched (training starts from scratch).
    """
    if not is_non_empty_file(self._model_progress_resource_path):
        self._logger.info(
            'Could not find saved model at {}\nModel will be trained from scratch.\n'
            .format(self._model_progress_resource_path))
        return

    self._model = self._load_model(self._model, self._model_progress_resource_path)
    self._metrics = self._metrics_serializer.load_metrics(self._metrics_resource_path)
def get_tokenized_test_lines(corpus_name, tokens_voc):
    """Load a test corpus and return its lines tokenized, with OOV tokens replaced.

    Args:
        corpus_name: Base name of the corpus file under TEST_DATA_DIR
            (``<corpus_name>.txt``).
        tokens_voc: Vocabulary used by replace_out_of_voc_tokens().

    Returns:
        List of token sequences, one per corpus line.

    Raises:
        ValueError: If the corpus file is missing or empty.
    """
    corpus_path = os.path.join(TEST_DATA_DIR, '%s.txt' % corpus_name)
    if not is_non_empty_file(corpus_path):
        raise ValueError('Test corpus file doesn\'t exist: {}'.format(corpus_path))

    return [
        replace_out_of_voc_tokens(get_tokens_sequence(line), tokens_voc)
        for line in load_file(corpus_path)
    ]
def build_index_mappings(corpus_path, max_tokens_num=MAX_TOKENS_NUM, max_conditions_num=MAX_CONDITIONS_NUM):
    """Build index-to-token and index-to-condition mappings from a processed dialogs corpus.

    Args:
        corpus_path: Path to the processed-dialogs corpus file (JSON lines).
        max_tokens_num: Total vocabulary budget, including SPECIAL_TOKENS.
        max_conditions_num: Maximum number of conditions to keep.

    Returns:
        A tuple ``(index_to_token, index_to_condition)`` of dicts.

    Raises:
        ValueError: If the corpus file is missing or empty.
        Exception: If DEFAULT_CONDITION does not occur in the corpus.
    """
    if not is_non_empty_file(corpus_path):
        raise ValueError(
            'Test corpus file doesn\'t exist: {}'.format(corpus_path))

    dialogs = load_processed_dialogs_from_json(
        FileTextLinesIterator(corpus_path),
        text_field_name=TEXT_FIELD_NAME,
        condition_field_name=CONDITION_FIELD_NAME)

    tokens_counter = Counter()
    conditions_counter = Counter()

    for dialog in dialogs:
        for utterance in dialog:
            # Tokenize dialog utterance text and update tokens count.
            tokens = get_tokens_sequence(utterance[TEXT_FIELD_NAME])
            # FIX: `tokens_counter += Counter(tokens)` created a new Counter
            # over the entire accumulated vocabulary on every utterance
            # (quadratic overall); update() is O(len(tokens)).
            tokens_counter.update(tokens)
            # Update conditions count
            conditions_counter[utterance[CONDITION_FIELD_NAME]] += 1

    # Build the tokens list
    vocab = list(SPECIAL_TOKENS) + \
        [token for token, _ in tokens_counter.most_common(max_tokens_num - len(SPECIAL_TOKENS))]

    # Build the conditions list
    conditions = [
        condition
        for condition, _ in conditions_counter.most_common(max_conditions_num)
    ]

    # Validate the condition list
    if DEFAULT_CONDITION not in conditions:
        raise Exception(
            'No default condition "%s" found in the dataset condition list.' % DEFAULT_CONDITION)

    # Return index_to_token and index_to_condition mappings
    return dict(enumerate(vocab)), dict(enumerate(conditions))
def predict(model_path,
            tokens_index_path=None,
            conditions_index_path=None,
            default_predictions_path=None,
            reverse_model_weights=None,
            temperatures=None,
            prediction_mode=None):
    """Generate model responses for the test questions corpus and dump them to TSV files.

    One output file is produced per combination of sampling parameters
    (reverse-model weight x temperature).

    Args:
        model_path: Path used to initialize the NN model weights.
        tokens_index_path: Path to the index-to-token file; defaults to the
            base corpus index.
        conditions_index_path: Path to the index-to-condition file; defaults
            to the base corpus index.
        default_predictions_path: Base path for the output files; defaults to
            DATA_DIR/results/predictions_<model_name>.
        reverse_model_weights: Optional list of MMI reverse-model score weights.
        temperatures: Optional list of sampling temperatures; defaults to
            [DEFAULT_TEMPERATURE].
        prediction_mode: Prediction mode; defaults to PREDICTION_MODE_FOR_TESTS.
    """
    if not tokens_index_path:
        tokens_index_path = get_index_to_token_path(BASE_CORPUS_NAME)
    if not conditions_index_path:
        conditions_index_path = get_index_to_condition_path(BASE_CORPUS_NAME)
    if not temperatures:
        temperatures = [DEFAULT_TEMPERATURE]
    if not prediction_mode:
        prediction_mode = PREDICTION_MODE_FOR_TESTS

    # Construct list of parameters values for all possible combinations of passed parameters
    prediction_params = [dict()]
    if reverse_model_weights:
        prediction_params = [
            dict(params, mmi_reverse_model_score_weight=w)
            for params in prediction_params for w in reverse_model_weights
        ]
    if temperatures:
        prediction_params = [
            dict(params, temperature=t)
            for params in prediction_params for t in temperatures
        ]

    if not is_non_empty_file(tokens_index_path):
        _logger.warning(
            'Couldn\'t find tokens_index file:\n{}. \nExiting...'.format(
                tokens_index_path))
        return

    index_to_token = load_index_to_item(tokens_index_path)
    index_to_condition = load_index_to_item(conditions_index_path)

    nn_model, _ = get_nn_model(index_to_token, index_to_condition, model_init_path=model_path)

    if not default_predictions_path:
        default_predictions_path = os.path.join(
            DATA_DIR, 'results', 'predictions_' + nn_model.model_name)

    # Get path for each combination of parameters
    predictions_paths = []
    # Add suffix to the filename only for parameters that have a specific value passed as an argument
    # If no parameters were specified, no suffix is added
    if len(prediction_params) > 1:
        for cur_params in prediction_params:
            cur_path = '{base_path}_{params_str}.tsv'.format(
                base_path=default_predictions_path,
                params_str='_'.join(
                    ['{}_{}'.format(k, v) for k, v in cur_params.items()]))
            predictions_paths.append(cur_path)
    else:
        predictions_paths = [default_predictions_path + '.tsv']

    _logger.info('Model for prediction:\n{}'.format(nn_model.model_load_path))
    _logger.info('Tokens index:\n{}'.format(tokens_index_path))
    _logger.info('File with questions:\n{}'.format(QUESTIONS_CORPUS_NAME))
    _logger.info('Files to dump responses:\n{}'.format('\n'.join(predictions_paths)))
    _logger.info('Prediction parameters\n{}'.format('\n'.join(
        [str(x) for x in prediction_params])))

    processed_test_set = get_tokenized_test_lines(QUESTIONS_CORPUS_NAME,
                                                  set(index_to_token.values()))
    processed_test_set = list(processed_test_set)

    for cur_params, cur_path in zip(prediction_params, predictions_paths):
        _logger.info('Predicting with the following params: {}'.format(cur_params))
        # FIX: cur_params were logged but never forwarded, so every
        # parameter-specific output file received identical predictions.
        # Pass them through like the sibling predict() variant does.
        _save_test_results(processed_test_set, cur_path, nn_model,
                           prediction_modes=[prediction_mode], **cur_params)
def _look_for_saved_files(files_paths):
    """Validate that every path in *files_paths* points to a non-empty file.

    Raises:
        Exception: On the first path that is missing or empty.
    """
    for path in files_paths:
        if is_non_empty_file(path):
            continue
        raise Exception('\nCould not find the following file or it\'s empty: {0}'.format(path))
def predict(model_path=None,
            tokens_index_path=None,
            conditions_index_path=None,
            default_predictions_path=None,
            reverse_model_weights=None,
            temperatures=None,
            prediction_mode=PREDICTION_MODE_FOR_TESTS):
    """Generate model responses for the test questions corpus and dump them to TSV files.

    One output file is produced per combination of sampling parameters
    (reverse-model weight x temperature), with the parameter values encoded
    in the file name when more than one combination is requested.

    Args:
        model_path: Path to the saved model; defaults to get_model_full_path().
        tokens_index_path: Path to the index-to-token file; defaults to the
            base corpus index.
        conditions_index_path: Path to the index-to-condition file; defaults
            to the base corpus index.
        default_predictions_path: Base path for the output files; defaults to
            DATA_DIR/results/predictions_<model_params>.
        reverse_model_weights: Optional list of MMI reverse-model score weights.
        temperatures: Optional list of sampling temperatures.
        prediction_mode: Prediction mode passed to _save_test_results().
    """
    if not model_path:
        model_path = get_model_full_path()
    if not tokens_index_path:
        tokens_index_path = get_index_to_token_path(BASE_CORPUS_NAME)
    if not conditions_index_path:
        conditions_index_path = get_index_to_condition_path(BASE_CORPUS_NAME)
    if not default_predictions_path:
        default_predictions_path = os.path.join(
            DATA_DIR, 'results', 'predictions_' + get_model_full_params_str())

    # Construct list of parameters values for all possible combinations of passed parameters
    prediction_params = [dict()]
    if reverse_model_weights:
        prediction_params = [
            dict(params, mmi_reverse_model_score_weight=w)
            for params in prediction_params for w in reverse_model_weights
        ]
    if temperatures:
        prediction_params = [dict(params, temperature=t) for params in prediction_params for t in temperatures]

    # Get path for each combination of parameters
    predictions_paths = []
    # Add suffix to the filename only for parameters that have a specific value passed as an argument
    # If no parameters were specified, no suffix is added
    if len(prediction_params) > 1:
        for cur_params in prediction_params:
            cur_path = '{base_path}_{params_str}.tsv'.format(
                base_path=default_predictions_path,
                params_str='_'.join(['{}_{}'.format(k, v) for k, v in cur_params.items()]))
            predictions_paths.append(cur_path)
    else:
        predictions_paths = [default_predictions_path + '.tsv']

    # FIX: Logger.warn() is a deprecated alias of warning() — use warning().
    if not is_non_empty_file(model_path):
        _logger.warning('Couldn\'t find model:\n"{}". \nExiting...'.format(model_path))
        return
    if not is_non_empty_file(tokens_index_path):
        _logger.warning('Couldn\'t find tokens_index file:\n"{}". \nExiting...'.format(tokens_index_path))
        return

    _logger.info('Model for prediction:\n{}'.format(model_path))
    _logger.info('Tokens index:\n{}'.format(tokens_index_path))
    _logger.info('File with questions:\n{}'.format(QUESTIONS_CORPUS_NAME))
    _logger.info('Files to dump responses:\n{}'.format('\n'.join(predictions_paths)))
    _logger.info('Prediction parameters\n{}'.format('\n'.join([str(x) for x in prediction_params])))

    index_to_token = load_index_to_item(tokens_index_path)
    index_to_condition = load_index_to_item(conditions_index_path)

    processed_test_set = get_tokenized_test_lines(QUESTIONS_CORPUS_NAME, set(index_to_token.values()))
    processed_test_set = list(processed_test_set)

    nn_model, _ = get_nn_model(index_to_token, index_to_condition, nn_model_path=model_path)

    for cur_params, cur_path in zip(prediction_params, predictions_paths):
        _logger.info('Predicting with the following params: {}'.format(cur_params))
        _save_test_results(processed_test_set, cur_path, nn_model, prediction_mode, **cur_params)