Ejemplo n.º 1
0
def build_utterance_vocab(dialogues,
                          special_symbols=[],
                          entity_forms=[],
                          except_words=[]):
    """Build a vocabulary over the tokens of every dialogue turn.

    Tokens for which ``is_entity`` is true are not added verbatim: one word
    per entry of ``entity_forms`` is added instead, each obtained from
    ``get_entity_form``. All other tokens are added as-is.

    Args:
        dialogues: Iterable of dialogue objects exposing ``is_int`` and
            ``token_turns`` (an iterable of token sequences).
        special_symbols: Words added with ``special=True`` once all tokens
            have been collected.
        entity_forms: Forms passed to ``get_entity_form`` for each entity
            token.
        except_words: Forwarded unchanged to the ``Vocabulary`` constructor.

    Returns:
        The ``Vocabulary`` after ``finish(size_threshold=10000)``.
    """
    utterance_vocab = Vocabulary(offset=0, unk=True, except_words=except_words)

    # Collect words from every turn of every dialogue.
    for dlg in dialogues:
        # NOTE(review): presumably guards against dialogues whose tokens were
        # already mapped to integer ids -- confirm against the Dialogue class.
        assert dlg.is_int is False
        for utterance in dlg.token_turns:
            for tok in utterance:
                if not is_entity(tok):
                    utterance_vocab.add_word(tok)
                    continue
                # Entity token: register each requested surface form.
                for form in entity_forms:
                    utterance_vocab.add_word(get_entity_form(tok, form))

    # Special symbols go in last, flagged as special.
    utterance_vocab.add_words(special_symbols, special=True)
    utterance_vocab.finish(size_threshold=10000)
    print('Utterance vocab size:', utterance_vocab.size)
    return utterance_vocab
Ejemplo n.º 2
0
def build_lf_vocab(dialogues):
    """Build a vocabulary over the logical forms of all dialogues.

    Every element of ``dialogue.lfs`` is passed to ``Vocabulary.add_words``;
    afterwards ``sequence_markers`` (defined elsewhere in the module) are
    added with ``special=True`` and the vocabulary is finished.

    Args:
        dialogues: Iterable of dialogue objects exposing ``is_int`` and
            ``lfs``.

    Returns:
        The finished ``Vocabulary``.
    """
    vocab = Vocabulary(offset=0, unk=True)
    for dialogue in dialogues:
        # NOTE(review): presumably guards against dialogues whose tokens were
        # already mapped to integer ids -- confirm against the Dialogue class.
        assert dialogue.is_int is False
        for lf in dialogue.lfs:
            vocab.add_words(lf)
    vocab.add_words(sequence_markers, special=True)
    vocab.finish()
    # A single parenthesized argument prints the same text under both the
    # Python 2 print statement and the Python 3 print function.
    print('LF vocabulary size: {}'.format(vocab.size))
    return vocab
Ejemplo n.º 3
0
def build_kb_vocab(dialogues, special_symbols=[]):
    """Build a vocabulary from the scenario of every dialogue.

    The vocabulary is created with ``unk=False``. Each ``dialogue.scenario``
    is passed to ``Vocabulary.add_words``.

    Args:
        dialogues: Iterable of dialogue objects exposing ``is_int`` and
            ``scenario``.
        special_symbols: Words added with ``special=True`` after all
            scenarios have been processed.

    Returns:
        The finished ``Vocabulary``.
    """
    kb_vocab = Vocabulary(offset=0, unk=False)
    for dialogue in dialogues:
        # NOTE(review): presumably guards against dialogues whose tokens were
        # already mapped to integer ids -- confirm against the Dialogue class.
        assert dialogue.is_int is False
        kb_vocab.add_words(dialogue.scenario)

    kb_vocab.add_words(special_symbols, special=True)
    kb_vocab.finish()

    # A single parenthesized argument prints the same text under both the
    # Python 2 print statement and the Python 3 print function.
    print('KB vocab size: {}'.format(kb_vocab.size))
    return kb_vocab
Ejemplo n.º 4
0
def build_vocab(dialogues, special_symbols=[], entity_forms=[]):
    """Build a vocabulary from nested dialogue token turns.

    ``dialogue.token_turns`` is walked three levels deep (group of turns,
    turn, segment within a turn) and every token found is either added
    directly or, when ``is_entity`` is true, expanded into one word per
    non-``'graph'`` entry of ``entity_forms``.

    Args:
        dialogues: Iterable of dialogue objects exposing ``is_int`` and
            ``token_turns``.
        special_symbols: Words added after all tokens have been collected.
        entity_forms: Forms passed to ``Preprocessor.get_entity_form`` for
            each entity token; the form ``'graph'`` is skipped.

    Returns:
        The populated ``Vocabulary`` (``finish`` is not called here).
    """
    word_vocab = Vocabulary(offset=0, unk=True)

    # Collect words from every dialogue.
    for dlg in dialogues:
        assert dlg.is_int is False
        for turn_group in dlg.token_turns:
            for single_turn in turn_group:
                # Each turn is itself an iterable of token sequences.
                for segment in single_turn:
                    for tok in segment:
                        if not is_entity(tok):
                            word_vocab.add_word(tok)
                            continue
                        for form in entity_forms:
                            # When entity embeddings are copied from the graph
                            # embedding, the entity is not needed in the vocab.
                            if form == 'graph':
                                continue
                            word_vocab.add_word(
                                Preprocessor.get_entity_form(tok, form))

    # Special symbols go in last.
    word_vocab.add_words(special_symbols)
    print('Vocabulary size:', word_vocab.size)
    return word_vocab
Ejemplo n.º 5
0
def build_lf_vocab_simple(dialogues):
    """Build a vocabulary from the logical-form tokens of all dialogues.

    Each element of ``dialogue.lf_tokens`` is passed to
    ``Vocabulary.add_words``; ``sequence_markers`` (defined elsewhere in the
    module) are then added with ``special=True``.

    Args:
        dialogues: Iterable of dialogue objects exposing ``is_int`` and
            ``lf_tokens``.

    Returns:
        The populated ``Vocabulary`` (``finish`` is not called here).
    """
    lf_vocab = Vocabulary(offset=0, unk=True)

    for dlg in dialogues:
        assert dlg.is_int is False
        for lf_sequence in dlg.lf_tokens:
            lf_vocab.add_words(lf_sequence)

    lf_vocab.add_words(sequence_markers, special=True)
    print('LF vocabulary size:', lf_vocab.size)
    return lf_vocab
Ejemplo n.º 6
0
def build_lf_vocab(dialogues, special_symbols=[], entity_forms=[]):
    """Build a vocabulary over logical-form tokens, expanding entities.

    Tokens for which ``is_entity`` is true are replaced by one word per entry
    of ``entity_forms`` (via ``get_entity_form``); all other tokens are added
    as-is. ``sequence_markers`` (defined elsewhere in the module) are added
    with ``special=True`` before the vocabulary is finished.

    Args:
        dialogues: Iterable of dialogue objects exposing ``is_int`` and
            ``lfs`` (an iterable of token sequences).
        special_symbols: Accepted for signature compatibility; not used by
            this function.
        entity_forms: Forms passed to ``get_entity_form`` for each entity
            token.

    Returns:
        The ``Vocabulary`` after ``finish(size_threshold=10000)``.
    """
    lf_vocab = Vocabulary(offset=0, unk=True)

    for dlg in dialogues:
        assert dlg.is_int is False
        for logical_form in dlg.lfs:
            for tok in logical_form:
                if not is_entity(tok):
                    lf_vocab.add_word(tok)
                    continue
                # Entity token: register each requested surface form.
                for form in entity_forms:
                    lf_vocab.add_word(get_entity_form(tok, form))

    lf_vocab.add_words(sequence_markers, special=True)
    lf_vocab.finish(size_threshold=10000)
    print('LF vocabulary size:', lf_vocab.size)
    return lf_vocab
Ejemplo n.º 7
0
def build_schema_mappings(schema, num_items):
    """Build the entity and relation vocabularies for a schema.

    The entity map receives a ``(lowercased value, type)`` pair for every
    value listed in ``schema.values``, followed by one entry per item index
    in ``range(num_items)``. The relation map receives every lowercased
    attribute name from ``schema.get_attributes()``, the relation ``'has'``,
    and the ``inv_rel`` counterpart of each of those.

    Args:
        schema: Object exposing ``values`` (a mapping from type to an
            iterable of string values) and ``get_attributes()``.
        num_items: Number of item nodes to add to the entity map.

    Returns:
        A ``(entity_map, relation_map)`` tuple of ``Vocabulary`` objects.
    """
    entity_map = Vocabulary(unk=True)
    # ``items()`` exists on both Python 2 and Python 3 mappings, whereas
    # ``iteritems()`` is Python 2 only.
    for type_, values in schema.values.items():
        entity_map.add_words(((value.lower(), type_) for value in values))
    # Add item nodes
    for i in range(num_items):
        # Only the second element of what item_to_entity returns is stored.
        entity_map.add_word(item_to_entity(i)[1])
    # Add attr nodes
    #for attr in schema.attributes:
    #    entity_map.add_word((attr.name.lower(), 'attr'))

    relation_map = Vocabulary(unk=False)
    attribute_types = schema.get_attributes()  # {attribute_name: value_type}
    relation_map.add_words((a.lower() for a in attribute_types.keys()))
    relation_map.add_word('has')
    # Inverse relation. The list is fully built before add_words runs, so
    # word_to_ind is not modified while it is being iterated.
    relation_map.add_words([inv_rel(r) for r in relation_map.word_to_ind])

    return entity_map, relation_map
Ejemplo n.º 8
0
def build_kb_vocab(dialogues, special_symbols=[]):
    """Build the KB word vocabulary and the category vocabulary.

    The KB vocabulary collects the ``title`` and ``description`` of every
    dialogue; the category vocabulary collects each dialogue's ``category``
    and is then extended with a fixed list of category names.

    Args:
        dialogues: Iterable of dialogue objects exposing ``is_int``,
            ``title``, ``description`` and ``category``.
        special_symbols: Words added to the KB vocabulary with
            ``special=True``.

    Returns:
        A ``(kb_vocab, cat_vocab)`` tuple; the KB vocabulary is finished with
        ``freq_threshold=5`` and the category vocabulary with no arguments.
    """
    word_vocab = Vocabulary(offset=0, unk=True)
    category_vocab = Vocabulary(offset=0, unk=False)

    for dlg in dialogues:
        assert dlg.is_int is False
        word_vocab.add_words(dlg.title)
        word_vocab.add_words(dlg.description)
        category_vocab.add_word(dlg.category)

    word_vocab.add_words(special_symbols, special=True)
    word_vocab.finish(freq_threshold=5)

    # Fixed category names, added as special entries regardless of the data.
    known_categories = [
        'bike', 'car', 'electronics', 'furniture', 'housing', 'phone']
    category_vocab.add_words(known_categories, special=True)
    category_vocab.finish()

    print('KB vocab size:', word_vocab.size)
    print('Category vocab size:', category_vocab.size)
    return word_vocab, category_vocab