Ejemplo n.º 1
0
 def send(self):
     '''
     Produce the agent's next outgoing event, or None to stay silent.

     Returns None when the previous utterance carried an entity and the
     environment disallows consecutive entity utterances, or when decoding
     yields nothing. Returns a select event when an item has already been
     matched or when the decoded tokens are a selection of an 'item-<id>';
     otherwise returns a message event built from the decoded tokens.
     '''
     # Don't send consecutive utterances with entities
     if self.sent_entity and not self.env.consecutive_entity:
         return None
     if self.matched_item is not None:
         return self.select(self.matched_item)

     # Single decoding attempt.
     tokens = self.decode()
     if tokens is None:
         return None

     # Remember whether this utterance carries an entity (read by the guard above).
     self.sent_entity = bool(self._has_entity(tokens))
     for tok in tokens:
         if is_entity(tok):
             self.mentioned_entities.add(tok[1][0])

     # Turn entity tuples back into plain strings.
     realizer = self.env.realizer
     if realizer is None:
         tokens = [tok[0] if is_entity(tok) else tok for tok in tokens]
     else:
         tokens = realizer.realize_entity(tokens)

     is_selection = (len(tokens) > 1 and tokens[0] == markers.SELECT
                     and tokens[1].startswith('item-'))
     if is_selection:
         item_id = int(tokens[1].split('-')[1])
         self.selected_items.add(item_id)
         return self.select(self.kb.items[item_id])

     words = self.naturalize(tokens)
     return self.message(self.attach_punct(' '.join(words)))
Ejemplo n.º 2
0
 def process_utterance(self, utterance, stage=None):
     '''
     Return a new list in which every entity of utterance is replaced by one
     of its forms; non-entity tokens pass through unchanged.

     stage: when None, entities take their 'canonical' form; otherwise the
     form named by self.entity_forms[stage] is used.
     '''
     def _render(token):
         if not is_entity(token):
             return token
         # Resolved per entity rather than up front, so an utterance without
         # entities never touches self.entity_forms.
         form = 'canonical' if stage is None else self.entity_forms[stage]
         return self.get_entity_form(token, form)

     return [_render(token) for token in utterance]
Ejemplo n.º 3
0
 def read_utterance(self, tokens, stage=None):
     '''
     Record the node ids of the entities mentioned in one utterance.

     tokens: from batch['encoder/decoder_tokens']; entities are represented
     as (surface_form, (canonical_form, type)), i.e. output of entitylink.
     stage: accepted but not used here.

     Entities without a node yet are added first; the list of node ids (one
     per entity, in utterance order) is then appended to self.entities.
     '''
     mentioned = [tok[1] for tok in tokens if is_entity(tok)]

     unseen = {entity for entity in mentioned if not self.nodes.has(entity)}
     if unseen:
         self.add_entity_nodes(unseen)

     node_ids = [self.nodes.to_ind(entity) for entity in mentioned]
     self.entities.append(node_ids)
Ejemplo n.º 4
0
def build_vocab(dialogues, special_symbols=[], entity_forms=[]):
    vocab = Vocabulary(offset=0, unk=True)

    def _add_entity(entity):
        for entity_form in entity_forms:
            # If copy entity embedding from the graph embedding, don't need entity in vocab
            if entity_form != 'graph':
                word = Preprocessor.get_entity_form(entity, entity_form)
                vocab.add_word(word)

    # Add words
    for dialogue in dialogues:
        assert dialogue.is_int is False
        for turns in dialogue.token_turns:
            for turn in turns:
                for token in chain.from_iterable(turn):
                    if is_entity(token):
                        _add_entity(token)
                    else:
                        vocab.add_word(token)

    # Add special symbols
    vocab.add_words(special_symbols)
    print 'Vocabulary size:', vocab.size
    return vocab
Ejemplo n.º 5
0
    def receive(self, event):
        """Consume an event from the partner and feed it to the encoder.

        A 'select' event that matches one of our items only records the match
        in ``self.matched_item``. An unmatched 'select' is encoded as the
        SELECT marker followed by the item's entities. A 'message' event is
        preprocessed and its encoding token sequence is encoded; an empty
        message is ignored. Entities seen are added to
        ``self.mentioned_entities`` and an EOS marker is appended before
        encoding.

        Raises:
            ValueError: if ``event.action`` is neither 'select' nor 'message'.
        """
        # Reset status
        self.sent_entity = False
        # Parse utterance
        if event.action == 'select':
            self.matched_item = self._match(event.data)
            if self.matched_item is not None:
                # Got a match; we're done.
                return
            entity_tokens = [markers.SELECT] + \
                self.env.preprocessor.item_to_entities(
                    event.data, self.kb.attributes)
        elif event.action == 'message':
            processed = self.env.preprocessor.process_event(
                event,
                self.kb,
                mentioned_entities=self.mentioned_entities,
                known_kb=False)
            # Empty message: test for None *before* indexing the result,
            # otherwise an empty message raises TypeError.
            if processed is None:
                return
            # Take the encoding version of sequence
            entity_tokens = processed[0]
        else:
            raise ValueError('Unknown event action %s.' % event.action)

        for token in entity_tokens:
            if is_entity(token):
                self.mentioned_entities.add(token[1][0])
        entity_tokens += [markers.EOS]

        self.encode(entity_tokens)
Ejemplo n.º 6
0
 def process_event(self, e, kb, mentioned_entities=None, known_kb=True):
     '''
     Convert an event into a pair of token lists (encoding, decoding).

     For a 'message' the text is tokenized, entity-linked and has its
     non-entity tokens number-normalized; None is returned when no tokens
     remain. For a 'select' the encoding side is the SELECT marker plus the
     item's entities and the decoding side is the SELECT marker plus the
     item-id entity. Any other action raises ValueError.
     '''
     action = e.action

     if action == 'select':
         # Convert an item to item-id (wrt to the speaker)
         item_id = self.get_item_id(kb, e.data)
         # Entities represent the item during encoding, item-id during decoding
         encoding = [markers.SELECT] + self.item_to_entities(
             e.data, kb.attributes)
         decoding = [markers.SELECT, item_to_entity(item_id)]
         return (encoding, decoding)

     if action != 'message':
         raise ValueError('Unknown event action.')

     # Lower, tokenize, link entity
     linked = self.lexicon.link_entity(
         tokenize(e.data),
         kb=kb,
         mentioned_entities=mentioned_entities,
         known_kb=known_kb)
     entity_tokens = [
         tok if is_entity(tok) else normalize_number(tok) for tok in linked
     ]
     if not entity_tokens:
         return None
     # NOTE: two copies because either may be changed for decoding/encoding
     return (entity_tokens, copy.copy(entity_tokens))
Ejemplo n.º 7
0
 def text_to_int(self, utterance, stage=None):
     '''
     Convert an utterance to a list of integer ids.

     With a stage, self.setting[stage] decides whether entities go through
     the entity map; without one they always do. Ids taken from the entity
     map are shifted by the vocabulary size; every other token is looked up
     in the vocabulary.
     '''
     if stage is None:
         tokens = self.preprocessor.process_utterance(utterance)
         use_entity_map = True
     else:
         use_entity_map = self.setting[stage]
         tokens = self.preprocessor.process_utterance(utterance, stage)

     if not use_entity_map:
         return [self.vocab.to_ind(tok) for tok in tokens]

     offset = self.vocab.size
     return [
         self.entity_map.to_ind(tok) + offset
         if is_entity(tok) else self.vocab.to_ind(tok) for tok in tokens
     ]
Ejemplo n.º 8
0
 def _process_target_tokens(self, tokens):
     '''
     TODO: for now evaluate against canonical entities. In future, evaluate against
     actual utterances.
     '''
     targets = [token[1] if is_entity(token) else token for token in tokens]
     targets = [x for x in targets if x not in (markers.PAD,)]
     return targets
Ejemplo n.º 9
0
 def _pred_to_token(self, preds):
     '''
     Convert predictions into token lists, wrapping every entity x as the
     pair (x[0], x).

     When self.env.copy is set, preds first go through
     self.graph.copy_preds together with the vocabulary size.
     '''
     env = self.env
     if env.copy:
         preds = self.graph.copy_preds(preds, env.vocab.size)

     batch_tokens, _ = pred_to_token(preds, env.stop_symbol,
                                     env.remove_symbols, env.textint_map)

     wrapped = []
     for toks in batch_tokens:
         wrapped.append(
             [(tok[0], tok) if is_entity(tok) else tok for tok in toks])
     return wrapped
Ejemplo n.º 10
0
def update_ngram_counts(counts, utterance):
    """Add the unigram, bigram and trigram counts of utterance to counts.

    Entities are replaced by a '<type>' placeholder before counting.
    counts[n] is indexed by n-tuples of tokens; counts is updated in place
    and also returned.
    """
    tokens = ['<%s>' % x[1][1] if is_entity(x) else x for x in utterance]
    for order in (1, 2, 3):
        # Zip the sequence against itself shifted by 0..order-1 positions;
        # each zipped tuple is one n-gram of length `order`.
        shifted = [tokens[start:] for start in range(order)]
        for gram in izip(*shifted):
            counts[order][gram] += 1
    return counts
Ejemplo n.º 11
0
 def eval(self, kb, utterance):
     '''
     Scan an utterance for entity mentions and tally/evaluate the facts
     they express.

     kb: passed through unchanged to eval_single / eval_joint.
     utterance: a list of tokens and entities represented as a tuple (surface_form, (canonical_form, type))

     Every entity whose type is not 'item' increments the fact counter and is
     then handled as one of:
       - joint fact   "<tok> ent1 and ent2" -> eval_joint on those 4 tokens
       - single fact  "<tok> ent"           -> eval_single on those 2 tokens
       - coreference  (fact followed by 'in those') -> inc_coref
       - undecided    (not enough context)  -> inc_undecided
     Nothing is returned; results are recorded through the inc_*/eval_*
     methods.
     '''
     #print 'eval:', utterance
     N = len(utterance)
     i = 0
     while i < N:
         token = utterance[i]
         if is_entity(token) and token[1][1] != 'item':
             self.inc_fact()
             if i + 1 < N and utterance[i + 1] == 'and':
                 # number ent1 and ent2
                 # Needs one token before the entity and two after it.
                 if i - 1 < 0 or i + 3 > N:
                     self.inc_undecided()
                     i += 1
                 else:
                     # Window covers [preceding token, ent1, 'and', ent2].
                     start, end = i - 1, i + 3
                     if not is_entity(utterance[i + 2]):
                         self.inc_undecided()
                     else:
                         if end + 1 < N and utterance[end:end +
                                                      2] == ['in', 'those']:
                             self.inc_coref()
                             i = end + 2
                         else:
                             self.eval_joint(kb, utterance[start:end])
                     # NOTE(review): this runs for every path of this else
                     # block, so it overwrites the `i = end + 2` set in the
                     # coref case just above; 'in those' is then rescanned.
                     i = end
             elif i - 1 > 0:
                 # number ent
                 # NOTE(review): `i - 1 > 0` requires i >= 2, so an entity at
                 # index 1 falls through to "undecided", whereas the joint
                 # branch above accepts i >= 1 -- confirm this is intended.
                 start, end = i - 1, i + 1
                 if end + 1 < N and utterance[end:end +
                                              2] == ['in', 'those']:
                     self.inc_coref()
                     i = end + 2
                 else:
                     self.eval_single(kb, utterance[start:end])
                     i = end
             else:
                 self.inc_undecided()
                 i += 1
         else:
             # Plain tokens and 'item' entities are skipped.
             i += 1
Ejemplo n.º 12
0
    def receive(self, event):
        """Update the agent's state from a partner event.

        'select': remembers the kb item equal to the selected one in
        ``self.matched_item``.
        'message': translates the text to English, links entities, records
        the mentioned entities, remembers the entities of a question in
        ``self.asked_entities`` and re-weights items and entities (negatively
        when the message contains a negation word).
        Any other action only resets ``self.sent_entity``.
        """
        self.sent_entity = False
        action = event.action

        if action == 'select':
            for candidate in self.kb.items:
                if candidate == event.data:
                    self.matched_item = candidate
            return

        if action != 'message':
            return

        # handle bilingual. translate into English no matter what source lang is
        translation = translate_client.translate(
            event.data, target_language='en')
        english_text = translation['translatedText']
        entity_tokens = self.lexicon.link_entity(
            tokenize(english_text),
            kb=self.kb,
            mentioned_entities=self.mentioned_entities,
            known_kb=False)

        entities = []
        for tok in entity_tokens:
            if is_entity(tok):
                self.mentioned_entities.add(tok[1][0])
                entities.append(tok[1])

        if self.is_question(entity_tokens):
            self.asked_entities = entities

        # Update item weights
        if not entities:
            return

        negation_words = ('no', 'none', "don't", 'zero')
        negative = any(tok in negation_words for tok in entity_tokens)
        if negative:
            direct_weight, row_weight, col_weight = -10., -1., 1.
        else:
            direct_weight, row_weight, col_weight = 1., 2., 0.5

        self.update_item_weights(entities, direct_weight)

        row_entities, col_entities = self.get_related_entity(entities)
        self.update_entity_weights(entities, direct_weight)
        self.update_entity_weights(row_entities, row_weight)
        self.update_entity_weights(col_entities, col_weight)
Ejemplo n.º 13
0
 def add_utterance(self, agent, utterances, prepend):
     '''
     Append a pair of utterances (indexed 0 and 1) to the dialogue.

     prepend: when true, the entities of the decoder utterance followed by
     an EOE marker are inserted in place at the front of that utterance.

     Utterances from the agent that spoke last extend the current turn;
     otherwise a new turn is opened and the agent is recorded.
     '''
     # Prepend entities to decoder utterances
     if prepend:
         decoder_utterance = utterances[self.DEC]
         prefix = [tok for tok in decoder_utterance if is_entity(tok)]
         prefix.append(markers.EOE)
         decoder_utterance[:0] = prefix

     same_speaker = len(self.agents) > 0 and agent == self.agents[-1]
     if not same_speaker:
         self.agents.append(agent)

     for side in (0, 1):
         if same_speaker:
             # Same agent talking
             self.token_turns[side][-1].append(utterances[side])
         else:
             self.token_turns[side].append([utterances[side]])
Ejemplo n.º 14
0
    def _process_example(self, ex):
        """Convert an example into a turn-based Dialogue.

        Each event is processed against the speaking agent's kb; events that
        yield no utterances are skipped. Entities from the first utterance
        sequence of every kept event are accumulated and passed on to the
        processing of later events.
        """
        kbs = ex.scenario.kbs
        dialogue = Dialogue(kbs, ex.uuid)

        mentioned_entities = set()
        for event in ex.events:
            utterances = self.process_event(event, kbs[event.agent],
                                            mentioned_entities)
            if not utterances:
                continue
            dialogue.add_utterance(event.agent, utterances)
            mentioned_entities.update(
                tok[1][0] for tok in utterances[0] if is_entity(tok))
        return dialogue
Ejemplo n.º 15
0
def get_linguistic_template(template_summary_map, utterance):
    """Count the linguistic template of an utterance in template_summary_map.

    The template is the utterance as a tuple with every entity replaced by
    '<entity_type>'. Both the template's own counter and the 'total' counter
    are incremented by one. An empty utterance is ignored.
    """
    if not len(utterance):
        return

    template = tuple(
        '<%s>' % get_entity_type(tok) if is_entity(tok) else tok
        for tok in utterance)

    # First sighting of a template starts its counter at zero.
    template_summary_map.setdefault(template, 0.)
    template_summary_map['total'] += 1.
    template_summary_map[template] += 1.
Ejemplo n.º 16
0
    def _process_example(self, ex,wizardkb):
        """Convert an example into a turn-based Dialogue, with a wizard kb.

        A KB built from the scenario attributes and ``wizardkb`` is appended
        to the scenario's kb list (in place) before the dialogue is created.
        Events are then processed as usual: events without utterances are
        skipped and entities from the first utterance sequence are
        accumulated for later events.
        """
        scenario = ex.scenario
        kbs = scenario.kbs
        attributes = scenario.attributes
        kbs.append(KB.from_dict(attributes, wizardkb))  # append wizard kb
        dialogue = Dialogue(kbs, ex.uuid)

        mentioned_entities = set()
        for event in ex.events:
            utterances = self.process_event(event, kbs[event.agent],
                                            mentioned_entities)
            if not utterances:
                continue
            dialogue.add_utterance(event.agent, utterances)
            mentioned_entities.update(
                tok[1][0] for tok in utterances[0] if is_entity(tok))
        return dialogue
Ejemplo n.º 17
0
def get_stats(chat, agent_id, preprocessor):
    """Collect per-agent statistics for one chat.

    Only events of ``agent_id`` are considered. Returns a dict with the
    summed counters 'num_select', 'num_utterance', 'num_entity' and
    'SA_<act>' (for inform/ask/answer), the mean 'utterance_len', and
    'vocab_size' (number of distinct non-entity tokens). Counters that were
    never updated are absent, except 'vocab_size'.
    """
    ex = Example.from_dict(None, chat)
    kbs = ex.scenario.kbs
    mentioned_entities = set()
    stats = {}
    vocab = set()

    def _record(name, value=1):
        logstats.update_summary_map(stats, {name: value})

    for event in ex.events:
        if agent_id != event.agent:
            continue
        if event.action == 'select':
            utterance = []
            _record('num_select')
        elif event.action == 'message':
            utterance = preprocessor.process_event(event, kbs[event.agent],
                                                   mentioned_entities)
            # Skip empty utterances
            if not utterance:
                continue
            utterance = utterance[0]
            for tok in utterance:
                if is_entity(tok):
                    _record('num_entity')
                    mentioned_entities.add(tok[1][0])
                else:
                    vocab.add(tok)
            _record('utterance_len', len(utterance))
        speech_act = get_speech_act(defaultdict(int), event, utterance)
        if speech_act[0] in ('inform', 'ask', 'answer'):
            _record('SA_' + speech_act[0])
        _record('num_utterance')

    summed_keys = ('num_select', 'num_utterance', 'num_entity')
    new_stats = {}
    for key, summary in stats.items():
        if key in summed_keys:
            new_stats[key] = summary['sum']
        elif key in ('utterance_len', ):
            new_stats[key] = summary['mean']
        elif key.startswith('SA_'):
            new_stats[key] = summary['sum']
    new_stats['vocab_size'] = len(vocab)
    return new_stats
Ejemplo n.º 18
0
def check_fact(summary_map, tokens, kb):
    """Simple fact checker.

    Each utterance is converted to a list of numbers and entities, and a
    number is assumed to describe the entities that follow it (which will
    cause some false negatives). For every number followed by at least one
    entity, a 'correct' value of 1 or 0 is recorded in summary_map depending
    on whether the number equals count_kb_entity(kb, entities).
    """
    num_items = len(kb.items)

    # List of (number, [canonical entities following that number]).
    hypotheses = []
    for tok in tokens:
        if not is_entity(tok):
            number = to_number(tok, num_items)
            if number:
                hypotheses.append((number, []))
        elif len(hypotheses) > 0:
            # Represent entity as its canonical form
            hypotheses[-1][1].append(tok[1][0])

    for number, entities in hypotheses:
        if len(entities) == 0:
            continue
        correct = 1 if number == count_kb_entity(kb, entities) else 0
        logstats.update_summary_map(summary_map, {'correct': correct})
Ejemplo n.º 19
0
                    return_entities=True,
                    agent=agent,
                    uuid=scenario_uuid)

                for c in candidate_annotation:
                    # Entity, Span, Type
                    fout.write(c[1][0] + "\t" + c[0] + "\t" + c[1][1] + "\n")

    preprocessor = Preprocessor(schema, lexicon, 'canonical', 'canonical',
                                'canonical')
    for raw in examples:
        ex = Example.from_dict(None, raw)
        kbs = ex.scenario.kbs
        mentioned_entities = set()
        for i, event in enumerate(ex.events):
            if event.action == 'message':
                utterance = preprocessor.process_event(event, kbs[event.agent],
                                                       mentioned_entities)
                # Skip empty utterances
                if utterance:
                    utterance = utterance[0]
                    for token in utterance:
                        if is_entity(token):
                            span, entity = token
                            entity, type_ = entity
                            # Entity, Span, Type
                            fout.write(entity + "\t" + span + "\t" + type_ +
                                       "\n")

    fout.close()
Ejemplo n.º 20
0
def is_inform(tokens):
    """Return True if at least one token is an entity, False otherwise."""
    return any(is_entity(tok) for tok in tokens)
Ejemplo n.º 21
0
def analyze_strategy(all_chats, scenario_db, preprocessor, text_output, lm):
    """Aggregate dialogue-strategy statistics over a collection of chats.

    Args:
        all_chats: iterable of raw chats, each turned into an Example via
            Example.from_dict(scenario_db, raw).
        scenario_db: passed through to Example.from_dict.
        preprocessor: object whose process_event() turns a message event
            into token sequences.
        text_output: path of a file that receives one entity-typed utterance
            per line, or None to write nothing.
        lm: language model with a score(text) method, or a falsy value to
            skip scoring (the returned 'lm_score' is then -1).

    Returns:
        A dict of summaries: speech-act frequencies, kb strategies, dialogue
        stats, language-model score, utterance/n-gram counts, linguistic
        templates, speech-act sequences, fact correctness, entity mentions,
        multi-speech-act ratio, alpha stats and item-count stats.

    Raises:
        ValueError: if an event's action is neither 'select' nor 'message'.
    """
    # NOTE(review): fout is closed manually near the end of the function, so
    # it stays open if anything in between raises.
    fout = open(text_output, 'w') if text_output is not None else None
    speech_act_summary_map = defaultdict(int)
    kb_strategy_summary_map = {}
    dialog_summary_map = {}
    fact_summary_map = {}
    utterance_counts = defaultdict(lambda: defaultdict(int))
    ngram_counts = defaultdict(lambda: defaultdict(int))
    template_summary_map = {'total': 0.}
    speech_act_sequence_summary_map = {'total': 0.}
    alpha_stats = []
    num_items_stats = []
    # NOTE(review): num_attrs_mentioned, most_mentioned_attrs and
    # total_dialogues are accumulated below but are not part of the returned
    # dict.
    num_attrs_mentioned = 0.
    most_mentioned_attrs = 0.
    entity_mention_summary_map = {}

    total_events = 0
    total_dialogues = 0.

    lm_summary_map = {}
    for raw in all_chats:
        ex = Example.from_dict(scenario_db, raw)
        kbs = ex.scenario.kbs
        if ex.outcome is None or ex.outcome["reward"] == 0:
            continue  # skip incomplete dialogues
        total_dialogues += 1.
        # Per-dialogue list of (agent, speech_act, entities, utterance).
        dialog = []
        mentioned_entities = set()
        for i, event in enumerate(ex.events):
            if event.action == 'select':
                # A selection carries no utterance tokens.
                utterance = []
            elif event.action == 'message':
                utterance = preprocessor.process_event(event, kbs[event.agent],
                                                       mentioned_entities)
                # Skip empty utterances
                if not utterance:
                    continue
                else:
                    # Keep the first of the sequences returned by process_event.
                    utterance = utterance[0]
                    for token in utterance:
                        if is_entity(token):
                            mentioned_entities.add(token[1][0])
                    logstats.update_summary_map(
                        dialog_summary_map,
                        {'utterance_length': len(utterance)})
                    check_fact(fact_summary_map, utterance, kbs[event.agent])
                    if lm:
                        logstats.update_summary_map(lm_summary_map, {
                            'score':
                            lm.score(' '.join(entity_to_type(utterance)))
                        })
                    update_ngram_counts(ngram_counts, utterance)
                    if fout:
                        fout.write('%s\n' %
                                   (' '.join(entity_to_type(utterance))))
            else:
                raise ValueError('Unknown event action %s.' % event.action)

            # Only events that were not skipped above are counted.
            total_events += 1

            speech_act = get_speech_act(speech_act_summary_map, event,
                                        utterance)
            get_linguistic_template(template_summary_map, utterance)
            entities = [x[1] for x in utterance if is_entity(x)]
            dialog.append((event.agent, speech_act, entities, utterance))

        get_dialog_stats(dialog_summary_map, utterance_counts, dialog)
        get_speech_act_histograms(speech_act_sequence_summary_map, dialog)
        get_entity_mention(entity_mention_summary_map, dialog, kbs)

        orders, mentioned_attrs, most_mentioned_label = get_kb_strategy(
            kbs, dialog)
        orders = tuple(orders)
        most_mentioned_attrs += alpha_labels_to_values[most_mentioned_label]

        # kb_strategy_summary_map: len(orders) -> {orders tuple -> count}.
        if len(orders) not in kb_strategy_summary_map.keys():
            kb_strategy_summary_map[len(orders)] = {}

        if orders not in kb_strategy_summary_map[len(orders)].keys():
            kb_strategy_summary_map[len(orders)][orders] = 0.0

        kb_strategy_summary_map[len(orders)][tuple(orders)] += 1.0
        alphas = ex.scenario.alphas

        # NOTE(review): both operands are ints, so this is integer division
        # under Python 2 unless true division is enabled at file level.
        num_attrs_mentioned += len(orders) / len(alphas)

        first_mentioned_label = NO_ALPHA_MENTION
        if len(orders) > 0:
            first_mentioned_label = orders[0]

        if len(mentioned_attrs) > 0:
            first_mentioned_type, first_mentioned_attr, first_agent = mentioned_attrs[
                0]
            update_item_stats(num_items_stats, first_mentioned_type,
                              first_mentioned_attr, kbs[first_agent])

            if first_mentioned_label != NO_ALPHA_MENTION:
                update_alpha_stats(alpha_stats, kbs[first_agent],
                                   first_mentioned_label)
                # print "First mentioned attribute alpha:", first_mentioned, alpha_labels_to_values[first_mentioned]

    if fout:
        fout.close()
    # Summarize stats
    # NOTE(review): total is 0.0 when no event was counted; the divisions by
    # total below would then raise ZeroDivisionError if any speech act was
    # recorded.
    total = float(total_events)
    kb_strategy_totals = {
        k1: sum(v2 for v2 in v1.values())
        for k1, v1 in kb_strategy_summary_map.items()
    }
    dialog_stats = {
        k: dialog_summary_map[k]['mean']
        for k in dialog_summary_map
    }
    # NOTE(review): these two keys are not written in this function;
    # presumably get_dialog_stats fills them -- verify.
    dialog_stats['entity_type_token_ratio'] = dialog_summary_map[
        'num_entity_type_per_dialog']['sum'] / float(
            dialog_summary_map['num_entity_per_dialog']['sum'])

    # ngram_counts[1] is keyed by 1-tuples; unwrap them to plain tokens.
    unigram_counts = {k[0]: v for k, v in ngram_counts[1].iteritems()}
    dialog_stats['vocab_size'] = len(unigram_counts)
    dialog_stats['unigram_entropy'] = count_to_entropy(unigram_counts)
    # Share of events whose speech-act key has more than one element.
    multi_speech_act = sum([
        speech_act_summary_map[k] for k in speech_act_summary_map if len(k) > 1
    ]) / total

    return {
        'speech_act': {
            k: speech_act_summary_map[k] / total
            for k in speech_act_summary_map.keys()
        },
        # Each strategy count is normalised within its len(orders) group.
        'kb_strategy': {
            k1: {
                ", ".join(k2): v2 / kb_strategy_totals[k1]
                for k2, v2 in v1.items()
            }
            for k1, v1 in kb_strategy_summary_map.items()
        },
        'dialog_stats': dialog_stats,
        'lm_score': -1 if not lm else lm_summary_map['score']['mean'],
        'utterance_counts': utterance_counts,
        'ngram_counts': ngram_counts,
        'linguistic_templates': template_summary_map,
        'speech_act_sequences': speech_act_sequence_summary_map,
        # NOTE(review): raises KeyError if check_fact never recorded a
        # 'correct' value (e.g. no message contained a number plus entity).
        'correct': fact_summary_map['correct']['mean'],
        'entity_mention': {
            k: np.mean(v)
            for k, v in entity_mention_summary_map['first'].iteritems()
        },
        'multi_speech_act': multi_speech_act,
        'alpha_stats': alpha_stats,
        'num_items_stats': num_items_stats
    }
Ejemplo n.º 22
0
 def get_entity(x):
     '''Return the entities contained in x, in their original order.'''
     found = []
     for candidate in x:
         if is_entity(candidate):
             found.append(candidate)
     return found
Ejemplo n.º 23
0
def entity_to_type(tokens):
    """Return tokens with every entity replaced by the string '<type>'."""
    typed = []
    for tok in tokens:
        typed.append('<%s>' % tok[1][1] if is_entity(tok) else tok)
    return typed
Ejemplo n.º 24
0
def get_entity_type(entity):
    """Return the type of an entity, or None if the argument is not an entity.

    An entity is unpacked as (surface, (canonical, type)).
    """
    if is_entity(entity):
        _surface, (_canonical, entity_type) = entity
        return entity_type
    return None
Ejemplo n.º 25
0
 def get_entities(self, utterance):
     '''
     Map each token of utterance to an entity id.

     Returns a list aligned with utterance: the entity-map index for
     entities and -1 for every other token.
     '''
     # Iterate the `utterance` parameter; the previous body referenced an
     # undefined name `tokens` and raised NameError on every call.
     return [
         self.entity_map.to_ind(token) if is_entity(token) else -1
         for token in utterance
     ]
Ejemplo n.º 26
0
 def _has_entity(self, tokens):
     for token in tokens:
         if is_entity(token):
             return True
     return False
Ejemplo n.º 27
0
 def realize_entity(self, entity_tokens):
     '''
     Return a new list in which every entity is replaced by
     self._realize_entity applied to its second element; other tokens are
     kept as they are.
     '''
     realized = []
     for token in entity_tokens:
         if is_entity(token):
             token = self._realize_entity(token[1])
         realized.append(token)
     return realized