Example #1
def query_to_encoder_features(sentence, vocabs, FLAGS):
    """
    Convert a natural language query into feature vectors used by the encoder.
    """
    # Pick the tokenization granularity according to the configured input channel.
    if FLAGS.channel == 'char':
        tokens = data_utils.nl_to_characters(sentence)
        init_vocab = data_utils.CHAR_INIT_VOCAB
    elif FLAGS.channel == 'partial.token':
        tokens = data_utils.nl_to_partial_tokens(sentence, tokenizer.basic_tokenizer)
        init_vocab = data_utils.TOKEN_INIT_VOCAB
    else:
        # Full-token channel: optionally use the NER-based tokenizer to normalize entities.
        if FLAGS.normalized:
            tokens = data_utils.nl_to_tokens(sentence, tokenizer.ner_tokenizer)
        else:
            tokens = data_utils.nl_to_tokens(sentence, tokenizer.basic_tokenizer)
        init_vocab = data_utils.TOKEN_INIT_VOCAB
    sc_ids = data_utils.tokens_to_ids(tokens, vocabs.sc_vocab)
    encoder_features = [[sc_ids]]
    if FLAGS.use_copy and FLAGS.copy_fun == 'copynet':
        # CopyNet copy-source ids: a token already present in the target
        # vocabulary reuses its target-vocabulary id; any other token is
        # mapped to an out-of-vocabulary slot indexed by its position.
        csc_ids = []
        for i, t in enumerate(tokens):
            if t not in init_vocab and t in vocabs.tg_vocab:
                csc_ids.append(vocabs.tg_vocab[t])
            else:
                csc_ids.append(len(vocabs.tg_vocab) + i)
        encoder_features.append([csc_ids])
    return encoder_features
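A minimal call sketch (not part of the original example): it assumes the project's data_utils and tokenizer modules are importable, a FLAGS namespace carrying the channel, normalized, use_copy and copy_fun options read above, and a vocabs object exposing sc_vocab/tg_vocab token-to-id dictionaries. All names and values below are illustrative stand-ins.

# Hypothetical usage; FLAGS/vocabs stand-ins built with argparse.Namespace.
from argparse import Namespace

flags = Namespace(channel='token', normalized=False, use_copy=True, copy_fun='copynet')
vocabs = Namespace(sc_vocab={'list': 4, 'files': 5}, tg_vocab={'find': 3, 'ls': 7})

features = query_to_encoder_features('list all files', vocabs, flags)
# features[0][0] -> source-vocabulary ids of the query tokens
# features[1][0] -> CopyNet copy-source ids (present because copy_fun == 'copynet')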
Example #2
def query_to_copy_tokens(sentence, FLAGS):
    """
    Tokenize a natural language query into the raw tokens (original casing,
    no lemmatization) that the copy mechanism may copy into the output.
    """
    if FLAGS.channel == 'char':
        tokens = data_utils.nl_to_characters(sentence)
    elif FLAGS.channel == 'partial.token':
        tokens = data_utils.nl_to_partial_tokens(
            sentence, tokenizer.basic_tokenizer, to_lower_case=False,
            lemmatization=False)
    else:
        tokens = data_utils.nl_to_tokens(
            sentence, tokenizer.basic_tokenizer, to_lower_case=False,
            lemmatization=False)
    return tokens
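A similarly hedged usage sketch: it assumes the same data_utils/tokenizer modules are available and that FLAGS only needs its channel attribute here.

# Hypothetical usage with a minimal FLAGS stand-in.
from argparse import Namespace

flags = Namespace(channel='token')
copy_tokens = query_to_copy_tokens('Find files larger than 10 MB', flags)
# Tokens keep their original casing and inflection so they can be copied
# verbatim into the generated command.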