Code example #1
File: detector.py Project: zhengjunzhao1991/cakechat
def _build_offensive_ngrams(offensive_phrases_path):
    # Tokenize every offensive phrase and keep it as an n-gram tuple,
    # so later checks reduce to simple set intersections.
    offensive_phrases = load_file(offensive_phrases_path)
    offensive_ngrams = [
        tuple(get_tokens_sequence(offensive_phrase))
        for offensive_phrase in offensive_phrases
    ]
    return set(offensive_ngrams)
Code example #2
def build_index_mappings(corpus_path, max_tokens_num=MAX_TOKENS_NUM, max_conditions_num=MAX_CONDITIONS_NUM):
    if not is_non_empty_file(corpus_path):
        raise ValueError('Test corpus file doesn\'t exist: {}'.format(corpus_path))

    dialogs = load_processed_dialogs_from_json(
        FileTextLinesIterator(corpus_path), text_field_name=TEXT_FIELD_NAME, condition_field_name=CONDITION_FIELD_NAME)

    tokens_counter = Counter()
    conditions_counter = Counter()

    for dialog in dialogs:
        for utterance in dialog:
            # Tokenize dialog utterance text and update tokens count
            tokens = get_tokens_sequence(utterance[TEXT_FIELD_NAME])
            tokens_counter += Counter(tokens)
            # Update conditions count
            conditions_counter[utterance[CONDITION_FIELD_NAME]] += 1

    # Build the tokens list
    vocab = list(SPECIAL_TOKENS) + \
            [token for token, _ in tokens_counter.most_common(max_tokens_num - len(SPECIAL_TOKENS))]

    # Build the conditions list
    conditions = [condition for condition, _ in conditions_counter.most_common(max_conditions_num)]

    # Validate the condition list
    if DEFAULT_CONDITION not in conditions:
        raise Exception('No default condition "%s" found in the dataset condition list.' % DEFAULT_CONDITION)

    # Return index_to_token and index_to_condition mappings
    return dict(enumerate(vocab)), dict(enumerate(conditions))
Code example #3
def build_index_mappings(corpus_path, max_tokens_num=VOCABULARY_MAX_SIZE, max_conditions_num=MAX_CONDITIONS_NUM,
                         simple_tokenize=SIMPLE_TOKENIZE):
    if not is_non_empty_file(corpus_path):
        raise ValueError('Test corpus file doesn\'t exist: {}'.format(corpus_path))

    dialogs = load_processed_dialogs_from_json(
        FileTextLinesIterator(corpus_path), text_field_name=TEXT_FIELD_NAME, condition_field_name=CONDITION_FIELD_NAME)

    tokens_counter = Counter()
    conditions_counter = Counter()

    for dialog in tqdm(dialogs):
        for utterance in dialog:
            tokens = utterance[TEXT_FIELD_NAME].split() if simple_tokenize else \
                get_tokens_sequence(utterance[TEXT_FIELD_NAME])

            tokens_counter.update(tokens)
            conditions_counter[utterance[CONDITION_FIELD_NAME]] += 1

    # Build the tokens list
    vocab = list(SPECIAL_TOKENS) + \
            [token for token, _ in tokens_counter.most_common(max_tokens_num - len(SPECIAL_TOKENS))]

    # Build the conditions list
    conditions = [condition for condition, _ in conditions_counter.most_common(max_conditions_num)]

    # Validate the condition list
    if DEFAULT_CONDITION not in conditions:
        raise Exception('No default condition "{}" found in the dataset condition list.'.format(DEFAULT_CONDITION))

    # Return index_to_token and index_to_condition mappings
    return dict(enumerate(vocab)), dict(enumerate(conditions))
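The vocabulary construction in the two variants above boils down to counting tokens and keeping the most frequent ones after the reserved special tokens. A minimal, self-contained sketch of that idea; the SPECIAL_TOKENS values and the size limit below are placeholders, not the project's actual configuration:

from collections import Counter

# Placeholder values -- the real SPECIAL_TOKENS and MAX_TOKENS_NUM come from
# the project's configuration, not from this sketch.
SPECIAL_TOKENS = ('_pad_', '_unk_', '_start_', '_end_')
MAX_TOKENS_NUM = 6

tokens_counter = Counter('the cat sat on the mat and the cat slept'.split())

# Reserve slots for the special tokens, then keep the most frequent tokens.
vocab = list(SPECIAL_TOKENS) + \
    [token for token, _ in tokens_counter.most_common(MAX_TOKENS_NUM - len(SPECIAL_TOKENS))]

index_to_token = dict(enumerate(vocab))
print(index_to_token)
# {0: '_pad_', 1: '_unk_', 2: '_start_', 3: '_end_', 4: 'the', 5: 'cat'}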
Code example #4
    def has_offensive_ngrams(self, text):
        if not isinstance(text, str):
            raise TypeError('"text" variable must be a string')
        tokenized_text = get_tokens_sequence(text)
        text_ngrams = self._get_ngrams(tokenized_text)

        return bool(text_ngrams & self._offensive_ngrams)
Code example #5
    def has_offensive_ngrams(self, text_or_tokenized_text):
        if isinstance(text_or_tokenized_text, string_types):
            tokenized_text = get_tokens_sequence(text_or_tokenized_text)
        elif isinstance(text_or_tokenized_text, list):
            tokenized_text = text_or_tokenized_text
        else:
            raise TypeError('text_or_tokenized_text must be string or list')

        text_ngrams = self._get_ngrams(tokenized_text)
        return bool(text_ngrams & self._offensive_ngrams)
Code example #6
File: detector.py Project: Allensmile/cakechat
    def has_offensive_ngrams(self, text_or_tokenized_text):
        if isinstance(text_or_tokenized_text, basestring):
            tokenized_text = get_tokens_sequence(text_or_tokenized_text)
        elif isinstance(text_or_tokenized_text, list):
            tokenized_text = text_or_tokenized_text
        else:
            raise TypeError('text_or_tokenized_text must be string or list')

        text_ngrams = self._get_ngrams(tokenized_text)
        return bool(text_ngrams & self._offensive_ngrams)
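The three detector variants above all perform the same check: build the set of n-grams of the tokenized text and intersect it with a precomputed set of offensive n-grams. A standalone sketch of that check, assuming a plain whitespace tokenizer and a hypothetical get_ngrams helper (the project's _get_ngrams is not shown on this page):

def get_ngrams(tokens, max_order=3):
    # All contiguous n-grams of the token list up to max_order, as tuples.
    return {tuple(tokens[i:i + n])
            for n in range(1, max_order + 1)
            for i in range(len(tokens) - n + 1)}

# Offensive phrases are tokenized once and stored as tuples, mirroring
# _build_offensive_ngrams in code examples #1 and #14.
offensive_ngrams = {tuple(phrase.split()) for phrase in ('really bad phrase', 'swearword')}

tokens = 'that was a really bad phrase to use'.split()
print(bool(get_ngrams(tokens) & offensive_ngrams))  # True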
Code example #7
File: dataset_loader.py Project: Mewtwonite7/Name
def get_tokenized_test_lines(corpus_name, tokens_voc):
    corpus_path = os.path.join(TEST_DATA_DIR, '%s.txt' % corpus_name)
    if not is_non_empty_file(corpus_path):
        raise ValueError('Test corpus file doesn\'t exist: {}'.format(corpus_path))
    test_lines = load_file(corpus_path)
    result = []
    for line in test_lines:
        tokenized_line = get_tokens_sequence(line)
        tokenized_line = replace_out_of_voc_tokens(tokenized_line, tokens_voc)
        result.append(tokenized_line)

    return result
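get_tokenized_test_lines tokenizes each test line and then maps out-of-vocabulary tokens onto a placeholder. A minimal sketch of that replacement step; the function body and the '_unk_' token name are assumptions, since replace_out_of_voc_tokens itself is not shown on this page:

def replace_out_of_voc_tokens(tokens, tokens_voc, unk_token='_unk_'):
    # Keep known tokens, replace everything else with the placeholder token.
    return [token if token in tokens_voc else unk_token for token in tokens]

tokens_voc = {'hello': 0, 'world': 1, '_unk_': 2}
print(replace_out_of_voc_tokens('hello strange world'.split(), tokens_voc))
# ['hello', '_unk_', 'world']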
Code example #8
File: response.py Project: zhengjunzhao1991/cakechat
def _get_non_offensive_response_using_fast_sampling(context_tokens_ids,
                                                    condition_id):
    for _ in xrange(SAMPLING_ATTEMPTS_NUM):
        response = get_nn_responses(context_tokens_ids,
                                    _cakechat_model,
                                    PREDICTION_MODES.sampling,
                                    condition_ids=condition_id)[0][0]

        tokenized_response = get_tokens_sequence(response)
        if not _offense_detector.has_offensive_ngrams(tokenized_response):
            return get_pretty_str_from_tokens_sequence(tokenized_response)

    return DEFAULT_RESPONSE
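The fast-sampling helper above follows a retry-and-fallback pattern: sample a bounded number of candidates, return the first one that passes the offense check, and fall back to a canned response otherwise. A self-contained sketch of that pattern, with stand-ins for the model and the detector:

import random

SAMPLING_ATTEMPTS_NUM = 10   # stand-in for the project's constant
DEFAULT_RESPONSE = 'i see'   # stand-in for the project's default response

def get_non_offensive_response(sample_response, is_offensive):
    # sample_response() and is_offensive() are stand-ins for get_nn_responses
    # and the offense detector's has_offensive_ngrams, respectively.
    for _ in range(SAMPLING_ATTEMPTS_NUM):
        response = sample_response()
        if not is_offensive(response):
            return response
    return DEFAULT_RESPONSE

print(get_non_offensive_response(
    lambda: random.choice(['nice to meet you', 'really bad phrase']),
    lambda text: 'bad' in text))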
Code example #9
def _get_context_to_weighted_responses(nn_model, testset, all_utterances):
    token_to_index = nn_model.token_to_index

    all_utterances_ids = transform_lines_to_token_ids(
        map(get_tokens_sequence, all_utterances), token_to_index, OUTPUT_SEQUENCE_LENGTH, add_start_end=True)

    context_to_weighted_responses = {}

    for context in testset:
        context_tokenized = get_tokens_sequence(context)
        repeated_context_ids = transform_contexts_to_token_ids(
            [[context_tokenized]] * len(all_utterances), token_to_index, INPUT_SEQUENCE_LENGTH, INPUT_CONTEXT_SIZE)

        scores = get_sequence_score(nn_model, repeated_context_ids, all_utterances_ids)

        context_to_weighted_responses[context] = dict(zip(all_utterances, scores))

    return context_to_weighted_responses
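The function above produces a nested mapping from every test context to every candidate utterance together with its model score. A toy sketch of that structure, using shared-token overlap as a stand-in for get_sequence_score:

def sequence_score(context, utterance):
    # Stand-in for get_sequence_score: shared-token count instead of a model score.
    return len(set(context.split()) & set(utterance.split()))

testset = ['how are you', 'what is your name']
all_utterances = ['i am fine, how are you', 'my name is bot']

context_to_weighted_responses = {
    context: {utterance: sequence_score(context, utterance) for utterance in all_utterances}
    for context in testset
}
print(context_to_weighted_responses)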
Code example #10
def build_index_mappings(corpus_path,
                         max_tokens_num=MAX_TOKENS_NUM,
                         max_conditions_num=MAX_CONDITIONS_NUM):
    if not is_non_empty_file(corpus_path):
        raise ValueError(
            'Test corpus file doesn\'t exist: {}'.format(corpus_path))

    dialogs = load_processed_dialogs_from_json(
        FileTextLinesIterator(corpus_path),
        text_field_name=TEXT_FIELD_NAME,
        condition_field_name=CONDITION_FIELD_NAME)

    tokens_counter = Counter()
    conditions_counter = Counter()

    for dialog in dialogs:
        for utterance in dialog:
            # Tokenize dialog utterance text and update tokens count
            tokens = get_tokens_sequence(utterance[TEXT_FIELD_NAME])
            tokens_counter += Counter(tokens)
            # Update conditions count
            conditions_counter[utterance[CONDITION_FIELD_NAME]] += 1

    # Build the tokens list
    vocab = list(SPECIAL_TOKENS) + \
            [token for token, _ in tokens_counter.most_common(max_tokens_num - len(SPECIAL_TOKENS))]

    # Build the conditions list
    conditions = [
        condition
        for condition, _ in conditions_counter.most_common(max_conditions_num)
    ]

    # Validate the condition list
    if DEFAULT_CONDITION not in conditions:
        raise Exception(
            'No default condition "%s" found in the dataset condition list.' %
            DEFAULT_CONDITION)

    # Return index_to_token and index_to_condition mappings
    return dict(enumerate(vocab)), dict(enumerate(conditions))
Code example #11
File: response.py Project: zhengjunzhao1991/cakechat
def _get_non_offensive_response(context_tokens_ids, condition_id):
    responses = get_nn_responses(
        context_tokens_ids,
        _cakechat_model,
        PREDICTION_MODE,
        output_candidates_num=NUM_BEST_CANDIDATES_TO_PICK_FROM,
        condition_ids=condition_id)[0]

    tokenized_responses = [
        get_tokens_sequence(response) for response in responses
    ]
    non_offensive_tokenized_responses = [
        r for r in tokenized_responses
        if not _offense_detector.has_offensive_ngrams(r)
    ]

    if non_offensive_tokenized_responses:
        tokenized_response = random.choice(non_offensive_tokenized_responses)
        return get_pretty_str_from_tokens_sequence(tokenized_response)

    return DEFAULT_RESPONSE
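Unlike the fast-sampling variant in code example #8, this helper generates several candidates at once, drops the offensive ones and picks randomly among the rest, falling back to the default response only if nothing survives the filter. A minimal sketch of that selection step with placeholder data:

import random

DEFAULT_RESPONSE = 'i see'  # stand-in for the project's default response

def pick_non_offensive(candidates, is_offensive):
    non_offensive = [c for c in candidates if not is_offensive(c)]
    if non_offensive:
        return random.choice(non_offensive)
    return DEFAULT_RESPONSE

candidates = ['nice to meet you', 'really bad phrase', 'have a good day']
print(pick_non_offensive(candidates, lambda text: 'bad' in text))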
Code example #12
def _get_context_to_weighted_responses(nn_model, testset, all_utterances):
    token_to_index = nn_model.token_to_index

    all_utterances_ids = transform_lines_to_token_ids(list(
        map(get_tokens_sequence, all_utterances)),
                                                      token_to_index,
                                                      OUTPUT_SEQUENCE_LENGTH,
                                                      add_start_end=True)

    context_to_weighted_responses = {}

    for context in testset:
        context_tokenized = get_tokens_sequence(context)
        repeated_context_ids = transform_contexts_to_token_ids(
            [[context_tokenized]] * len(all_utterances), token_to_index,
            INPUT_SEQUENCE_LENGTH, INPUT_CONTEXT_SIZE)

        scores = get_sequence_score(nn_model, repeated_context_ids,
                                    all_utterances_ids)

        context_to_weighted_responses[context] = dict(
            zip(all_utterances, scores))

    return context_to_weighted_responses
Code example #13
def process_text(nn_model, text):
    tokenized_line = get_tokens_sequence(text)
    return [replace_out_of_voc_tokens(tokenized_line, nn_model.token_to_index)]
Code example #14
File: detector.py Project: Allensmile/cakechat
def _build_offensive_ngrams(offensive_phrases_path):
    offensive_phrases = load_file(offensive_phrases_path)
    offensive_ngrams = [tuple(get_tokens_sequence(offensive_phrase)) for offensive_phrase in offensive_phrases]
    return set(offensive_ngrams)
Code example #15
def process_text(nn_model, text):
    tokenized_line = get_tokens_sequence(text)
    return [replace_out_of_voc_tokens(tokenized_line, nn_model.token_to_index)]