def _build_offensive_ngrams(offensive_phrases_path):
    offensive_phrases = load_file(offensive_phrases_path)
    # Tokenize each phrase and store it as a tuple: tuples are hashable, so they can live in a set
    offensive_ngrams = [
        tuple(get_tokens_sequence(offensive_phrase)) for offensive_phrase in offensive_phrases
    ]
    return set(offensive_ngrams)
def build_index_mappings(corpus_path, max_tokens_num=MAX_TOKENS_NUM, max_conditions_num=MAX_CONDITIONS_NUM):
    if not is_non_empty_file(corpus_path):
        raise ValueError('Test corpus file doesn\'t exist: {}'.format(corpus_path))

    dialogs = load_processed_dialogs_from_json(
        FileTextLinesIterator(corpus_path),
        text_field_name=TEXT_FIELD_NAME,
        condition_field_name=CONDITION_FIELD_NAME)

    tokens_counter = Counter()
    conditions_counter = Counter()

    for dialog in dialogs:
        for utterance in dialog:
            # Tokenize dialog utterance text and update tokens count
            tokens = get_tokens_sequence(utterance[TEXT_FIELD_NAME])
            tokens_counter += Counter(tokens)
            # Update conditions count
            conditions_counter[utterance[CONDITION_FIELD_NAME]] += 1

    # Build the tokens list
    vocab = list(SPECIAL_TOKENS) + \
        [token for token, _ in tokens_counter.most_common(max_tokens_num - len(SPECIAL_TOKENS))]

    # Build the conditions list
    conditions = [condition for condition, _ in conditions_counter.most_common(max_conditions_num)]

    # Validate the condition list
    if DEFAULT_CONDITION not in conditions:
        raise Exception('No default condition "%s" found in the dataset condition list.' % DEFAULT_CONDITION)

    # Return index_to_token and index_to_condition mappings
    return dict(enumerate(vocab)), dict(enumerate(conditions))
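The next variant replaces tokens_counter += Counter(tokens) with tokens_counter.update(tokens). The += form builds a throwaway Counter per utterance, and under Python 2 (whose Counter has no __iadd__) it also copies the whole accumulator on every iteration; update counts directly into the existing object. A minimal, self-contained comparison:

from collections import Counter

tokens_counter = Counter()
tokens_counter += Counter(['a', 'a', 'b'])  # builds a temporary Counter from the tokens first
tokens_counter.update(['a', 'b'])           # counts directly into the accumulator, no temporary
print(tokens_counter)                       # Counter({'a': 3, 'b': 2})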
def build_index_mappings(corpus_path,
                         max_tokens_num=VOCABULARY_MAX_SIZE,
                         max_conditions_num=MAX_CONDITIONS_NUM,
                         simple_tokenize=SIMPLE_TOKENIZE):
    if not is_non_empty_file(corpus_path):
        raise ValueError('Test corpus file doesn\'t exist: {}'.format(corpus_path))

    dialogs = load_processed_dialogs_from_json(
        FileTextLinesIterator(corpus_path),
        text_field_name=TEXT_FIELD_NAME,
        condition_field_name=CONDITION_FIELD_NAME)

    tokens_counter = Counter()
    conditions_counter = Counter()

    for dialog in tqdm(dialogs):
        for utterance in dialog:
            tokens = utterance[TEXT_FIELD_NAME].split() if simple_tokenize else \
                get_tokens_sequence(utterance[TEXT_FIELD_NAME])
            tokens_counter.update(tokens)
            conditions_counter[utterance[CONDITION_FIELD_NAME]] += 1

    # Build the tokens list
    vocab = list(SPECIAL_TOKENS) + \
        [token for token, _ in tokens_counter.most_common(max_tokens_num - len(SPECIAL_TOKENS))]

    # Build the conditions list
    conditions = [condition for condition, _ in conditions_counter.most_common(max_conditions_num)]

    # Validate the condition list
    if DEFAULT_CONDITION not in conditions:
        raise Exception('No default condition "{}" found in the dataset condition list.'.format(DEFAULT_CONDITION))

    # Return index_to_token and index_to_condition mappings
    return dict(enumerate(vocab)), dict(enumerate(conditions))
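For reference, here is the vocabulary-building step in isolation as a minimal, self-contained sketch; the SPECIAL_TOKENS values and the size cap are placeholders, not the project's actual settings:

from collections import Counter

SPECIAL_TOKENS = ['_pad_', '_start_', '_end_', '_unk_']  # placeholder values
max_tokens_num = 6

tokens_counter = Counter()
for line in ['hello world', 'hello there']:
    tokens_counter.update(line.split())

# Reserve slots for the special tokens, then take the most frequent corpus tokens
vocab = list(SPECIAL_TOKENS) + \
    [token for token, _ in tokens_counter.most_common(max_tokens_num - len(SPECIAL_TOKENS))]
index_to_token = dict(enumerate(vocab))
# {0: '_pad_', 1: '_start_', 2: '_end_', 3: '_unk_', 4: 'hello', 5: 'world'}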
def has_offensive_ngrams(self, text):
    if not isinstance(text, str):
        raise TypeError('"text" variable must be a string')

    tokenized_text = get_tokens_sequence(text)
    text_ngrams = self._get_ngrams(tokenized_text)
    return bool(text_ngrams & self._offensive_ngrams)
def has_offensive_ngrams(self, text_or_tokenized_text):
    # string_types is presumably six's compatibility alias: (str,) on Python 3, (basestring,) on Python 2
    if isinstance(text_or_tokenized_text, string_types):
        tokenized_text = get_tokens_sequence(text_or_tokenized_text)
    elif isinstance(text_or_tokenized_text, list):
        tokenized_text = text_or_tokenized_text
    else:
        raise TypeError('text_or_tokenized_text must be string or list')

    text_ngrams = self._get_ngrams(tokenized_text)
    return bool(text_ngrams & self._offensive_ngrams)
def has_offensive_ngrams(self, text_or_tokenized_text):
    # basestring exists on Python 2 only; on Python 3 this raises NameError (see the six-based variant above)
    if isinstance(text_or_tokenized_text, basestring):
        tokenized_text = get_tokens_sequence(text_or_tokenized_text)
    elif isinstance(text_or_tokenized_text, list):
        tokenized_text = text_or_tokenized_text
    else:
        raise TypeError('text_or_tokenized_text must be string or list')

    text_ngrams = self._get_ngrams(tokenized_text)
    return bool(text_ngrams & self._offensive_ngrams)
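All three variants reduce the check to a set intersection, which works because the n-grams are stored as tuples (hashable) rather than lists. A self-contained illustration of the idea, with a hypothetical stand-in for _get_ngrams:

def get_ngrams(tokens, max_ngram_len=3):
    # All n-grams of length 1..max_ngram_len, as hashable tuples
    return {tuple(tokens[i:i + n])
            for n in range(1, max_ngram_len + 1)
            for i in range(len(tokens) - n + 1)}

offensive_ngrams = {('bad', 'word'), ('slur',)}
tokens = 'this is a bad word'.split()
print(bool(get_ngrams(tokens) & offensive_ngrams))  # True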
def get_tokenized_test_lines(corpus_name, tokens_voc):
    corpus_path = os.path.join(TEST_DATA_DIR, '%s.txt' % corpus_name)
    if not is_non_empty_file(corpus_path):
        raise ValueError('Test corpus file doesn\'t exist: {}'.format(corpus_path))

    test_lines = load_file(corpus_path)
    result = []
    for line in test_lines:
        tokenized_line = get_tokens_sequence(line)
        tokenized_line = replace_out_of_voc_tokens(tokenized_line, tokens_voc)
        result.append(tokenized_line)

    return result
def _get_non_offensive_response_using_fast_sampling(context_tokens_ids, condition_id):
    # xrange is Python 2 only; the Python 3 equivalent is range
    for _ in xrange(SAMPLING_ATTEMPTS_NUM):
        response = get_nn_responses(
            context_tokens_ids, _cakechat_model, PREDICTION_MODES.sampling, condition_ids=condition_id)[0][0]
        tokenized_response = get_tokens_sequence(response)

        if not _offense_detector.has_offensive_ngrams(tokenized_response):
            return get_pretty_str_from_tokens_sequence(tokenized_response)

    return DEFAULT_RESPONSE
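The control flow here is a generic retry-then-fallback pattern: sample up to SAMPLING_ATTEMPTS_NUM candidates, return the first one that passes the filter, and fall back to a canned response otherwise. A self-contained sketch of the pattern (the sampler and filter below are stand-ins, not the project's functions):

import random

def sample_until_acceptable(sample_fn, is_acceptable_fn, attempts=10, fallback='<default response>'):
    # Try up to `attempts` samples; return the first acceptable one, else the fallback
    for _ in range(attempts):
        candidate = sample_fn()
        if is_acceptable_fn(candidate):
            return candidate
    return fallback

print(sample_until_acceptable(
    sample_fn=lambda: random.choice(['fine response', 'bad word']),
    is_acceptable_fn=lambda text: 'bad' not in text))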
def _get_context_to_weighted_responses(nn_model, testset, all_utterances):
    token_to_index = nn_model.token_to_index
    # Bare map() returns a list on Python 2 but a lazy iterator on Python 3; the later variant wraps it in list()
    all_utterances_ids = transform_lines_to_token_ids(
        map(get_tokens_sequence, all_utterances), token_to_index, OUTPUT_SEQUENCE_LENGTH, add_start_end=True)

    context_to_weighted_responses = {}
    for context in testset:
        context_tokenized = get_tokens_sequence(context)
        repeated_context_ids = transform_contexts_to_token_ids(
            [[context_tokenized]] * len(all_utterances), token_to_index, INPUT_SEQUENCE_LENGTH, INPUT_CONTEXT_SIZE)

        scores = get_sequence_score(nn_model, repeated_context_ids, all_utterances_ids)
        context_to_weighted_responses[context] = dict(zip(all_utterances, scores))

    return context_to_weighted_responses
def _get_non_offensive_response(context_tokens_ids, condition_id):
    responses = get_nn_responses(
        context_tokens_ids,
        _cakechat_model,
        PREDICTION_MODE,
        output_candidates_num=NUM_BEST_CANDIDATES_TO_PICK_FROM,
        condition_ids=condition_id)[0]

    tokenized_responses = [get_tokens_sequence(response) for response in responses]
    non_offensive_tokenized_responses = [
        r for r in tokenized_responses if not _offense_detector.has_offensive_ngrams(r)
    ]

    if non_offensive_tokenized_responses:
        tokenized_response = random.choice(non_offensive_tokenized_responses)
        return get_pretty_str_from_tokens_sequence(tokenized_response)

    return DEFAULT_RESPONSE
def _get_context_to_weighted_responses(nn_model, testset, all_utterances):
    token_to_index = nn_model.token_to_index
    all_utterances_ids = transform_lines_to_token_ids(
        list(map(get_tokens_sequence, all_utterances)), token_to_index, OUTPUT_SEQUENCE_LENGTH, add_start_end=True)

    context_to_weighted_responses = {}
    for context in testset:
        context_tokenized = get_tokens_sequence(context)
        repeated_context_ids = transform_contexts_to_token_ids(
            [[context_tokenized]] * len(all_utterances), token_to_index, INPUT_SEQUENCE_LENGTH, INPUT_CONTEXT_SIZE)

        scores = get_sequence_score(nn_model, repeated_context_ids, all_utterances_ids)
        context_to_weighted_responses[context] = dict(zip(all_utterances, scores))

    return context_to_weighted_responses
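The only substantive change from the earlier variant is wrapping map in list(): on Python 3, map returns a lazy iterator, which breaks callers that need a sequence (len(), indexing, or more than one pass). For example:

tokenized = map(str.split, ['a b', 'c d'])
# len(tokenized)  -> TypeError on Python 3: object of type 'map' has no len()
tokenized = list(map(str.split, ['a b', 'c d']))
print(len(tokenized))  # 2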
def process_text(nn_model, text):
    tokenized_line = get_tokens_sequence(text)
    return [replace_out_of_voc_tokens(tokenized_line, nn_model.token_to_index)]
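replace_out_of_voc_tokens presumably swaps any token missing from token_to_index for the model's unknown-token marker. A hypothetical, stdlib-only equivalent for illustration (the function name, the marker value, and the exact behavior are assumptions, not the project's actual implementation):

UNK_TOKEN = '_unk_'  # placeholder marker, not the project's actual constant

def replace_oov_tokens(tokens, token_to_index, unk=UNK_TOKEN):
    # Replace every token absent from the vocabulary with the unknown marker
    return [token if token in token_to_index else unk for token in tokens]

vocab = {'hello': 0, 'world': 1, UNK_TOKEN: 2}
print(replace_oov_tokens('hello strange world'.split(), vocab))
# ['hello', '_unk_', 'world']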