Beispiel #1
0
def predict_babi_file(in_model,
                      vocabs_for_tasks,
                      dataset,
                      in_config,
                      in_session,
                      target_file_path=None):
    """Predict tags for every utterance in `dataset` and optionally write them.

    Predictions come back as one flat sequence covering all words of all
    utterances. They are cut back into per-utterance slices and converted
    from RNN tags to eval tags with
    `convert_from_inc_disfluency_tags_to_eval_tags`. When the conversion of
    an utterance raises, its unconverted tags are kept and the utterance is
    counted as "broken"; the count is printed.

    If `target_file_path` is given, one "Speaker: <row position>" section is
    written per dataset row, and for every word a "Time: <end>" line followed
    by a tab-separated line `start, end, word, POS, predicted tag`. Timings
    are produced by `create_fake_timings`.

    :param in_model: model object handed to `predict`
    :param vocabs_for_tasks: only `vocabs_for_tasks[0]` is read; its items
        0 and 1 go to `make_multitask_dataset`, its last item to `predict`
    :param dataset: frame with 'utterance' and 'pos' columns, one utterance
        per row (accessed via `[...]`, `.shape` and `.iterrows()`)
    :param in_config: configuration handed to `make_multitask_dataset`
    :param in_session: session handed to `predict`
    :param target_file_path: str or None, output path; when falsy nothing
        is written
    :return: None
    """
    # eval tags --> RNN tags
    X, ys_for_tasks = make_multitask_dataset(dataset, vocabs_for_tasks[0][0],
                                             vocabs_for_tasks[0][1], in_config)
    predictions = predict(in_model, (X, ys_for_tasks),
                          [vocabs_for_tasks[0][-1]], in_session)

    # RNN tags --> eval tags, one utterance at a time
    predictions_eval = []
    global_word_index = 0
    broken_sequences_number = 0
    for utterance in dataset['utterance']:
        utterance_end = global_word_index + len(utterance)
        current_tags = predictions[global_word_index:utterance_end]
        try:
            current_tags_eval = convert_from_inc_disfluency_tags_to_eval_tags(
                current_tags, utterance, representation="disf1")
        except Exception:
            # Deliberate best effort: keep the raw tags for sequences the
            # converter rejects, but let KeyboardInterrupt/SystemExit through.
            current_tags_eval = current_tags
            broken_sequences_number += 1
        predictions_eval += current_tags_eval
        global_word_index = utterance_end
    print('#broken sequences after RNN --> eval conversion: {} out of {}'.format(
        broken_sequences_number, dataset.shape[0]))

    if not target_file_path:
        # Nothing to write; the remaining work is output formatting only.
        return

    predictions_eval_iter = iter(predictions_eval)
    # The context manager guarantees the file is flushed and closed even if
    # writing fails half-way.
    with open(target_file_path, "w") as target_file:
        for speaker, (_, speaker_data) in enumerate(dataset.iterrows()):
            target_file.write("Speaker: " + str(speaker) + "\n\n")
            lex_data = speaker_data['utterance']
            pos_data = speaker_data['pos']
            timing_data = create_fake_timings(len(lex_data))

            for i, (start, end) in enumerate(timing_data):
                # Exactly one predicted tag is consumed per word, in order.
                tag = next(predictions_eval_iter)
                target_file.write("Time: " + str(end) + "\n")
                target_file.write("\t".join(
                    [str(start), str(end), lex_data[i], pos_data[i], tag]))
                target_file.write("\n\n")
            target_file.write("\n")
Beispiel #2
0
def eval_babi(in_model,
              in_vocabs_for_tasks,
              source_file_path,
              in_config,
              in_session,
              verbose=True):
    """Tag a JSON dataset with the model and score the result.

    Steps, in order:
      1. read `source_file_path` with `pd.read_json`;
      2. write predictions to the increco file via `predict_babi_file`;
      3. run `incremental_output_disfluency_eval_from_file` on that file
         against gold eval tags, passing the "_final" file name as
         `outputfilename` (presumably where the final output is written --
         confirm against that function);
      4. run `final_output_disfluency_eval_from_file` on the "_final" file
         against gold eval tags whose repairs were renamed with
         `rename_all_repairs_in_line_with_index`.

    :param in_model: model forwarded to `predict_babi_file`
    :param in_vocabs_for_tasks: vocabularies forwarded to `predict_babi_file`
    :param source_file_path: str, path of the JSON dataset; the frame must
        provide 'utterance', 'pos' and 'tags' columns
    :param in_config: configuration forwarded to `predict_babi_file`
    :param in_session: session forwarded to `predict_babi_file`
    :param verbose: bool, print every key/value pair of the results
    :return: dict with the 'f1_<rm_word', 'f1_<rps_word' and 'f1_<e_word'
        entries of the final evaluation results
    """
    increco_file = 'swbd_disf_heldout_data_output_increco.text'
    dataset = pd.read_json(source_file_path)
    predict_babi_file(in_model,
                      in_vocabs_for_tasks,
                      dataset,
                      in_config,
                      in_session,
                      target_file_path=increco_file)

    def build_gold_data(rename_repairs):
        # Map from the dialogue name (the row position as a string) to
        # (fake timings, words, POS tags, eval tags).
        dialogue_ids = [str(row) for row in range(dataset.shape[0])]
        gold = {}
        rows = zip(dialogue_ids, dataset['utterance'], dataset['pos'],
                   dataset['tags'])
        for dialogue, utt_words, utt_pos, utt_tags in rows:
            eval_tags = convert_from_inc_disfluency_tags_to_eval_tags(
                utt_tags, utt_words, representation="disf1")
            if rename_repairs:
                eval_tags = rename_all_repairs_in_line_with_index(eval_tags)
            gold[dialogue] = (create_fake_timings(len(utt_words)), utt_words,
                              utt_pos, eval_tags)
        return gold

    # Incremental evaluation; it is handed the name of the final output file.
    final_output_name = increco_file.replace("_increco", "_final")
    incremental_output_disfluency_eval_from_file(
        increco_file,
        build_gold_data(rename_repairs=False),
        utt_eval=True,
        error_analysis=False,
        word=True,
        interval=False,
        outputfilename=final_output_name)

    # Final output evaluation only, assuming a final output file exists.
    hyp_file = "swbd_disf_heldout_data_output_final.text"
    results, _, _ = final_output_disfluency_eval_from_file(
        hyp_file,
        build_gold_data(rename_repairs=True),
        utt_eval=False,
        error_analysis=True,  # get an error analysis
        word=True,  # word-level analyses
        interval=False,
        outputfilename=None)

    if verbose:
        for metric, score in results.items():
            print("%s %s" % (metric, score))
    all_results = deepcopy(results)

    reported_keys = ('f1_<rm_word', 'f1_<rps_word', 'f1_<e_word')
    return {key: all_results[key] for key in reported_keys}
Beispiel #3
0
def predict_increco_file(in_model,
                         vocabs_for_tasks,
                         source_file_path,
                         in_config,
                         in_session,
                         target_file_path=None,
                         is_asr_results_file=False):
    """Write the model's output in an increco style
    given the incoming words + POS. E.g.:

    Speaker: KB3_1

    Time: 1.50
    KB3_1:1    0.00    1.12    $unc$yes    NNP    <f/><tc/>

    Time: 2.10
    KB3_1:1    0.00    1.12    $unc$yes    NNP    <rms id="1"/><tc/>
    KB3_1:2    1.12    2.00     because    IN    <rps id="1"/><cc/>

    Time: 2.5
    KB3_1:2    1.12    2.00     because    IN    <rps id="1"/><rpndel id="1"/><cc/>

    from an ASR increco style input without the POStags:

    or a normal style disfluency detection ground truth corpus:

    Speaker: KB3_1
    KB3_1:1    0.00    1.12    $unc$yes    NNP    <rms id="1"/><tc/>
    KB3_1:2    1.12    2.00     $because    IN    <rps id="1"/><cc/>
    KB3_1:3    2.00    3.00    because    IN    <f/><cc/>
    KB3_1:4    3.00    4.00    theres    EXVBZ    <f/><cc/>
    KB3_1:6    4.00    5.00    a    DT    <f/><cc/>
    KB3_1:7    6.00    7.10    pause    NN    <f/><cc/>

    Note: the lines this function writes are tab-separated
    `start, end, word, POS, predicted tag` (no word ID column), with exactly
    one such line under each "Time:" line.

    Words are grouped into utterances by starting a new utterance at every
    label containing "<t". All utterances are predicted in one go, converted
    back from RNN tags to eval tags per utterance (utterances whose
    conversion raises keep their raw tags and are counted as "broken"), and
    then written out speaker by speaker.

    :param in_model: model object handed to `predict`
    :param vocabs_for_tasks: only `vocabs_for_tasks[0]` is read; its items
        0 and 1 go to `make_multitask_dataset`, its last item to `predict`
    :param source_file_path: str, file path to the input file; must contain
        the substring 'timings'
    :param in_config: configuration handed to `make_multitask_dataset`
    :param in_session: session handed to `predict`
    :param target_file_path: str, file path to output in the above format;
        when falsy nothing is written
    :param is_asr_results_file: bool, whether the input is increco style
        (not supported)
    :raises NotImplementedError: if the path does not contain 'timings', or
        if `is_asr_results_file` is true
    :return: None
    """
    if 'timings' not in source_file_path:
        print("no timings in input file, creating fake timings")
        raise NotImplementedError
    print("input file has timings")
    if is_asr_results_file:
        # No reader exists for this input style; fail with a clear error
        # instead of a NameError on the never-assigned dialogue list.
        raise NotImplementedError(
            "reading ASR increco style input is not implemented")

    IDs, timings, words, pos_tags, labels = \
        get_tag_data_from_corpus_file(source_file_path)
    dialogues = [(dialogue, (a, b, c, d))
                 for dialogue, a, b, c, d in zip(IDs, timings, words,
                                                 pos_tags, labels)]

    # collecting a single dataset for the model to predict in batches
    utterances, tags, pos = [], [], []
    for _, (timing_data, lex_data, pos_data, speaker_labels) in dialogues:
        for i in range(len(timing_data)):
            if "<t" in speaker_labels[i]:
                # this label opens a new utterance
                utterances.append([])
                tags.append([])
                pos.append([])
            utterances[-1].append(lex_data[i])
            tags[-1].append(speaker_labels[i])
            pos[-1].append(pos_data[i])

    # eval tags --> RNN tags
    dataset = pd.DataFrame({
        'utterance':
        utterances,
        'tags': [
            convert_from_eval_tags_to_inc_disfluency_tags(
                tags_i, words_i, representation="disf1")
            for tags_i, words_i in zip(tags, utterances)
        ],
        'pos':
        pos
    })
    X, ys_for_tasks = make_multitask_dataset(dataset, vocabs_for_tasks[0][0],
                                             vocabs_for_tasks[0][1], in_config)
    predictions = predict(in_model, (X, ys_for_tasks),
                          [vocabs_for_tasks[0][-1]], in_session)

    # RNN tags --> eval tags, one utterance at a time
    predictions_eval = []
    global_word_index = 0
    broken_sequences_number = 0
    for utterance in utterances:
        utterance_end = global_word_index + len(utterance)
        current_tags = predictions[global_word_index:utterance_end]
        try:
            current_tags_eval = convert_from_inc_disfluency_tags_to_eval_tags(
                current_tags, utterance, representation="disf1")
        except Exception:
            # Deliberate best effort: keep the raw tags for sequences the
            # converter rejects, but let KeyboardInterrupt/SystemExit through.
            current_tags_eval = current_tags
            broken_sequences_number += 1
        predictions_eval += current_tags_eval
        global_word_index = utterance_end
    print('#broken sequences after RNN --> eval conversion: {} out of {}'.format(
        broken_sequences_number, len(utterances)))

    if not target_file_path:
        # Nothing to write; the remaining work is output formatting only.
        return

    predictions_eval_iter = iter(predictions_eval)
    # The context manager guarantees the file is flushed and closed even if
    # writing fails half-way.
    with open(target_file_path, "w") as target_file:
        for speaker, (timing_data, lex_data, pos_data, _) in dialogues:
            target_file.write("Speaker: " + str(speaker) + "\n\n")
            for i, (start, end) in enumerate(timing_data):
                # Exactly one predicted tag is consumed per word, in order.
                tag = next(predictions_eval_iter)
                target_file.write("Time: " + str(end) + "\n")
                target_file.write("\t".join(
                    [str(start), str(end), lex_data[i], pos_data[i], tag]))
                target_file.write("\n\n")
            target_file.write("\n")
Beispiel #4
0
    def tag_new_word(self,
                     word,
                     pos=None,
                     timing=None,
                     extra=None,
                     diff_only=True,
                     rollback=0):
        """Tag new incoming word and update the word and tag graphs.

        The word is appended to `self.word_graph`, the saved recurrent state
        is loaded into the model, a softmax row is computed for the new
        word window, the decoder is run incrementally over the accumulated
        softmax rows and `self.output_tags` is updated with the new tags.

        :param word: the word to consume/tag
        :param pos: the POS tag to consume/tag (optional); when None and
        `self.args.pos` is truthy, it is obtained from `self.pos_tagger`
        :param timing: the duration of the word (optional); stored in the
        word graph next to the word and POS tag
        :param extra: not used in this method
        :param diff_only: whether to output only the diffed suffix,
        if False, outputs entire output tags
        :param rollback: the number of words to rollback
        in the case of changed word hypotheses from an ASR
        :return: a list of tags. With `diff_only`, the suffix of
        `self.output_tags` starting at the first position whose tag differs
        from the tags held before this call, or, if no shared position
        differs, the tags beyond the previous length. Otherwise all of
        `self.output_tags`.
        :raises NotImplementedError: if `self.model_type` is neither "lstm"
        nor "elman"
        """
        # Undo the requested number of previous words before consuming this one.
        self.rollback(rollback)
        if pos is None and self.args.pos:
            # if no pos tag provided but there is a pos-tagger, tag word
            # The tagger is given the last words from the word graph (n=4)
            # plus the lower-cased new word; the tag of the final item is used.
            test_words = [
                unicode(x) for x in get_last_n_features(
                    "words", self.word_graph, len(self.word_graph) - 1, n=4)
            ] + [unicode(word.lower())]
            pos = self.pos_tagger.tag(test_words)[-1][1]
            # print "tagging", word, "as", pos
        # 0. Add new word to word graph
        word, pos = self.standardize_word_and_pos(word, pos)
        # print "New word:", word, pos
        self.word_graph.append((word, pos, timing))
        # 1. load the saved internal rnn state
        # TODO these nets aren't (necessarily) trained statefully
        # The internal state in training self.args.bs words back
        # are the inital ones in training, however here
        # They are the actual state reached.
        if self.state_history == []:
            # No word processed yet (or history emptied): use initial states.
            c0_state = self.initial_c0_state
            h0_state = self.initial_h0_state
        else:
            # Otherwise continue from the most recently saved state.
            # History entries are (c_t, h_t) for "lstm" and h_t for "elman"
            # (see where they are appended in step 2); the last element of
            # each saved state is taken.
            if self.model_type == "lstm":
                c0_state = self.state_history[-1][0][-1]
                h0_state = self.state_history[-1][1][-1]
            elif self.model_type == "elman":
                h0_state = self.state_history[-1][-1]

        if self.model_type == "lstm":
            self.model.load_weights(c0=c0_state, h0=h0_state)
        elif self.model_type == "elman":
            self.model.load_weights(h0=h0_state)
        else:
            raise NotImplementedError("no history loading for\
                             {0} model".format(self.model_type))

        # 2. do the softmax output with converted inputs
        # Map the last `self.window_size` words / POS tags to their indices.
        word_window = [
            self.word_to_index_map[x]
            for x in get_last_n_features("words",
                                         self.word_graph,
                                         len(self.word_graph) - 1,
                                         n=self.window_size)
        ]
        pos_window = [
            self.pos_to_index_map[x]
            for x in get_last_n_features("POS",
                                         self.word_graph,
                                         len(self.word_graph) - 1,
                                         n=self.window_size)
        ]
        # print "word_window, pos_window", word_window, pos_window
        if self.model_type == "lstm":
            h_t, c_t, s_t = self.model.\
                soft_max_return_hidden_layer([word_window], [pos_window])
            self.softmax_history.append(s_t)
            # Keep at most 20 saved states: drop the oldest before appending.
            if len(self.state_history) == 20:  # just saving history
                self.state_history.pop(0)  # pop first one
            self.state_history.append((c_t, h_t))
        elif self.model_type == "elman":
            h_t, s_t = self.model.soft_max_return_hidden_layer([word_window],
                                                               [pos_window])
            self.softmax_history.append(s_t)
            # Same cap of 20 saved states as in the lstm branch.
            if len(self.state_history) == 20:
                self.state_history.pop(0)  # pop first one
            self.state_history.append(h_t)
        else:
            raise NotImplementedError("no softmax implemented for\
                                 {0} model".format(self.model_type))
        # Join every softmax output stored so far into one array.
        softmax = np.concatenate(self.softmax_history)

        # 3. do the decoding on the softmax
        if "disf" in self.args.tags:
            # Append a copy of the edit-tag column as an extra last column.
            # NOTE(review): "<e/><cc>" has no closing "/" on the second tag;
            # confirm it matches the key used in self.tag_to_index_map.
            edit_tag = "<e/><cc>" if "uttseg" in self.args.tags else "<e/>"
            # print self.tag_to_index_map[edit_tag]
            adjustsoftmax = np.concatenate(
                (softmax, softmax[:, self.tag_to_index_map[edit_tag]].reshape(
                    softmax.shape[0], 1)), 1)
        else:
            adjustsoftmax = softmax
        # Timings (n=3) are only given to the decoder when timing data is
        # enabled in the args and a truthy timing came with this word.
        last_n_timings = None if ((not self.args.use_timing_data) or
                                  not timing) \
            else get_last_n_features("timings", self.word_graph,
                                     len(self.word_graph)-1,
                                     n=3)
        # Decode only the newest softmax row; changed_suffix_only=True is
        # passed regardless of `diff_only`.
        new_tags = self.decoder.viterbi_incremental(
            adjustsoftmax,
            a_range=(len(adjustsoftmax) - 1, len(adjustsoftmax)),
            changed_suffix_only=True,
            timing_data=last_n_timings,
            words=[word])
        # print "new tags", new_tags
        # Snapshot of the old tags, used for the diff at the end.
        prev_output_tags = deepcopy(self.output_tags)
        # Replace the last len(new_tags) - 1 old tags and append, so the
        # tag list grows by one entry.
        self.output_tags = self.output_tags[:len(self.output_tags) -
                                            (len(new_tags) - 1)] + new_tags

        # 4. convert to standardized output format
        if "simple" in self.args.tags:
            # Rewrite 'rm-0' as 'rps id="<position>"' over the new tags plus
            # one earlier position, and move "<e/>" in front of "<i" where
            # an "<i" tag is present.
            # NOTE(review): the range start is negative when output_tags is
            # no longer than len(new_tags) (e.g. the first word); p then
            # indexes from the end and the id becomes negative -- confirm
            # this is intended.
            for p in range(
                    len(self.output_tags) - (len(new_tags) + 1),
                    len(self.output_tags)):
                rps = self.output_tags[p]
                self.output_tags[p] = rps.replace('rm-0',
                                                  'rps id="{}"'.format(p))
                if "<i" in self.output_tags[p]:
                    self.output_tags[p] = self.output_tags[p].\
                        replace("<e/>", "").replace("<i", "<e/><i")
        else:
            # new_words = [word]
            # Fetch the words needed by the converter; the count excludes
            # window_size - 1 entries (presumably padding at the start of
            # the word graph -- confirm).
            words = get_last_n_features("words",
                                        self.word_graph,
                                        len(self.word_graph) - 1,
                                        n=len(self.word_graph) -
                                        (self.window_size - 1))
            # Convert to eval tags, starting at the first newly decoded tag.
            self.output_tags = convert_from_inc_disfluency_tags_to_eval_tags(
                self.output_tags,
                words,
                start=len(self.output_tags) - (len(new_tags)),
                representation=self.args.tags)
        if diff_only:
            # Return from the first position where old and new tags differ.
            for i, old_new in enumerate(zip(prev_output_tags,
                                            self.output_tags)):
                old, new = old_new
                if old != new:
                    return self.output_tags[i:]
            # No shared position changed: return only the tags added beyond
            # the previous length.
            return self.output_tags[len(prev_output_tags):]
        return self.output_tags