Example #1
def parse_query(q, data):
    """
    Description: Get the knowledge base key from a query with a stock ticker
    Parameters: (String) Query as entered by the user, (Dict) data from load_data() output
    Output: (Array) Array containing inputs to model.predict, (Array) Labels for potential answers
    """

    # Remove punctuation, lowercase, split on whitespace
    translator = str.maketrans(string.punctuation,
                               ' ' * len(string.punctuation))
    raw_query = q.translate(translator).lower().split()

    # Get kb_key
    kb_key = get_kb_key(q)

    # Ensure raw_query is at most 13 words
    if len(raw_query) > 13:
        raw_query = raw_query[:13]

    # Tokenize query: known words -> vocab id, OOV -> 1, zero-pad to 13
    query = [data['vocab2id'].get(word, 1) for word in raw_query]
    query.extend([0] * (13 - len(raw_query)))
    query = [query]

    # Memory & Candidate Answer Labels
    ans_cands = build_ans_cands(data['kb'][kb_key],
                                data['entity2id'],
                                data['entityType2id'],
                                data['relation2id'],
                                data['vocab2id'])
    memory = [ans_cands[:-1]]
    cand_labels = [ans_cands[-1]]

    # Vectorize Data
    query, query_words, _, memory = vectorize_data(
        query, [[]], memory,
        max_query_size=data['opt']['query_size'],
        max_query_markup_size=data['opt']['query_markup_size'],
        max_ans_bow_size=data['opt']['ans_bow_size'],
        vocab2id=data['vocab2id'])

    return ([memory, query, query_words, [raw_query], [[]],
             [len(raw_query)]], cand_labels)
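The tokenization step above follows a fixed-width scheme: punctuation is replaced by spaces, each known word maps to its vocab2id index, out-of-vocabulary words map to 1, and the result is zero-padded to 13 tokens. Here is a minimal self-contained sketch of that scheme; the toy vocabulary and query are made up for illustration:

import string

# Toy vocabulary; the real one is data['vocab2id'] from load_data()
vocab2id = {'price': 7, 'of': 3, 'aapl': 42}

def tokenize_fixed_width(q, vocab2id, max_len=13):
    # Replace punctuation with spaces, lowercase, split on whitespace
    translator = str.maketrans(string.punctuation,
                               ' ' * len(string.punctuation))
    words = q.translate(translator).lower().split()[:max_len]
    # Known words -> vocab index, OOV -> 1, then zero-pad to max_len
    ids = [vocab2id.get(w, 1) for w in words]
    ids.extend([0] * (max_len - len(words)))
    return ids

print(tokenize_fixed_width('Price of AAPL?', vocab2id))
# [7, 3, 42, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]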
Example #2
    ctx_stopwords = {
        'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you',
        "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself',
        'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her',
        'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them',
        'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom',
        'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was',
        'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do',
        'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or',
        'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with',
        'about', 'against', 'between', 'into', 'through', 'during', 'before',
        'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out',
        'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once',
        'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both',
        'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor',
        'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't',
        'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now',
        'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren', "aren't",
        'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn',
        "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma',
        'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan',
        "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't",
        'won', "won't", 'wouldn', "wouldn't"
    }

    train_queries, train_raw_queries, train_query_mentions, train_memories, _, train_gold_ans_inds, _ = train_vec
    train_queries, train_query_words, train_query_lengths, train_memories = vectorize_data(
        train_queries, train_query_mentions, train_memories,
        max_query_size=opt['query_size'],
        max_query_markup_size=opt['query_markup_size'],
        max_mem_size=opt['mem_size'],
        max_ans_bow_size=opt['ans_bow_size'],
        max_ans_path_bow_size=opt['ans_path_bow_size'],
        vocab2id=vocab2id)

    valid_queries, valid_raw_queries, valid_query_mentions, valid_memories, valid_cand_labels, valid_gold_ans_inds, valid_gold_ans_labels = valid_vec
    valid_queries, valid_query_words, valid_query_lengths, valid_memories = vectorize_data(
        valid_queries, valid_query_mentions, valid_memories,
        max_query_size=opt['query_size'],
        max_query_markup_size=opt['query_markup_size'],
        max_mem_size=opt['mem_size'],
        max_ans_bow_size=opt['ans_bow_size'],
        max_ans_path_bow_size=opt['ans_path_bow_size'],
        vocab2id=vocab2id)

    start = timeit.default_timer()
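Every vectorize_data call in these examples reads its size limits from an opt dict loaded from a YAML config. For reference, here is a hypothetical opt containing just the keys the calls above actually touch; the values are illustrative, not the project's real configuration:

# Hypothetical stand-in for the YAML config; values are illustrative only
opt = {
    'query_size': 13,          # -> max_query_size
    'query_markup_size': 13,   # -> max_query_markup_size
    'mem_size': 100,           # -> max_mem_size
    'ans_bow_size': 10,        # -> max_ans_bow_size
    'ans_path_bow_size': 10,   # -> max_ans_path_bow_size
}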
Example #3
    freebase = load_ndjson(os.path.join(cfg['raw_data_dir'],
                                        'freebase_full.json'),
                           return_type='dict')
    test_data = load_ndjson(
        os.path.join(cfg['raw_data_dir'], 'test_seed_2_smart.json'))
    data_vec = build_data(test_data,
                          freebase,
                          entity2id,
                          entityType2id,
                          relation2id,
                          vocab2id,
                          pred_seed_ents=pred_seed_ents)

    queries, raw_queries, query_mentions, memories, cand_labels, _, gold_ans_labels = data_vec
    queries, query_words, query_lengths, memories_vec = vectorize_data(
        queries, query_mentions, memories,
        max_query_size=bamnet_opt['query_size'],
        max_query_markup_size=bamnet_opt['query_markup_size'],
        max_ans_bow_size=bamnet_opt['ans_bow_size'],
        vocab2id=vocab2id)

    model = BAMnetAgent(bamnet_opt, ctx_stopwords, vocab2id)
    pred = model.predict(
        [memories_vec, queries, query_words, raw_queries, query_mentions,
         query_lengths],
        cand_labels,
        batch_size=bamnet_opt['test_batch_size'],
        margin=2)

    print('\nPredictions')
    for margin in bamnet_opt['test_margin']:
        print('\nMargin: {}'.format(margin))
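The loop over bamnet_opt['test_margin'] suggests predictions are reported at several margins, while the predict call itself fixes margin=2. BAMnet's actual margin logic is not shown in this snippet, but the general pattern of keeping candidates whose score falls within a margin of the best one looks like this hypothetical sketch (candidates, scores, and margins are all made up):

# Hypothetical (candidate, score) pairs for one query, best first
scored = [('AAPL', 0.9), ('MSFT', 0.4), ('GOOG', 0.1)]

for margin in [0.3, 0.6]:  # stand-in for bamnet_opt['test_margin']
    top_score = scored[0][1]
    # Keep candidates whose score is within `margin` of the best score
    kept = [cand for cand, score in scored if top_score - score <= margin]
    print('Margin {}: {}'.format(margin, kept))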
Example #4
def train_model(config_path='question-answering/config/bamnet_webq.yml'):
    """
    Description: Train a BAMnet model with knowledge base and questions in /data
    Parameters: (String) Relative path to config file
    Output: (1 .md File) BAMnet model weights. Use for question_answering.py
    """

    with open(config_path, "r") as setting:
        opt = yaml.safe_load(setting)

    # Load data
    train_vec = load_json(os.path.join(opt['data_dir'], opt['train_data']))
    valid_vec = load_json(os.path.join(opt['data_dir'], opt['valid_data']))

    vocab2id = load_json(os.path.join(opt['data_dir'], 'vocab2id.json'))

    ctx_stopwords = {
        'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you',
        "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself',
        'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her',
        'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them',
        'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom',
        'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was',
        'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do',
        'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or',
        'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with',
        'about', 'against', 'between', 'into', 'through', 'during', 'before',
        'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out',
        'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once',
        'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both',
        'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor',
        'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't',
        'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now',
        'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren', "aren't",
        'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn',
        "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma',
        'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan',
        "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't",
        'won', "won't", 'wouldn', "wouldn't"
    }

    # Vectorize data
    train_queries, train_raw_queries, train_query_mentions, train_memories, _, train_gold_ans_inds, _ = train_vec
    train_queries, train_query_words, train_query_lengths, train_memories = build_utils.vectorize_data(
        train_queries, train_query_mentions, train_memories,
        max_query_size=opt['query_size'],
        max_query_markup_size=opt['query_markup_size'],
        max_mem_size=opt['mem_size'],
        max_ans_bow_size=opt['ans_bow_size'],
        max_ans_path_bow_size=opt['ans_path_bow_size'],
        vocab2id=vocab2id)

    valid_queries, valid_raw_queries, valid_query_mentions, valid_memories, valid_cand_labels, valid_gold_ans_inds, valid_gold_ans_labels = valid_vec
    valid_queries, valid_query_words, valid_query_lengths, valid_memories = build_utils.vectorize_data(
        valid_queries, valid_query_mentions, valid_memories,
        max_query_size=opt['query_size'],
        max_query_markup_size=opt['query_markup_size'],
        max_mem_size=opt['mem_size'],
        max_ans_bow_size=opt['ans_bow_size'],
        max_ans_path_bow_size=opt['ans_path_bow_size'],
        vocab2id=vocab2id)

    start = timeit.default_timer()

    model = BAMnetAgent(opt, ctx_stopwords, vocab2id)
    model.train(
        [train_memories, train_queries, train_query_words, train_raw_queries,
         train_query_mentions, train_query_lengths],
        train_gold_ans_inds,
        [valid_memories, valid_queries, valid_query_words, valid_raw_queries,
         valid_query_mentions, valid_query_lengths],
        valid_gold_ans_inds, valid_cand_labels, valid_gold_ans_labels)

    print('Runtime: %ss' % (timeit.default_timer() - start))
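
The scaffolding around training, loading the YAML config and timing the run with timeit.default_timer, can be tried in isolation. A minimal sketch, with an inline YAML string standing in for bamnet_webq.yml and a dummy workload standing in for model.train:

import timeit
import yaml

# Inline stand-in for question-answering/config/bamnet_webq.yml
raw_cfg = """
data_dir: data
query_size: 13
"""
opt = yaml.safe_load(raw_cfg)
print(opt['query_size'])  # 13

start = timeit.default_timer()
_ = sum(i * i for i in range(10 ** 6))  # dummy workload in place of model.train(...)
print('Runtime: %ss' % (timeit.default_timer() - start))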