Example #1
def preprocess(self, data, test_time=False):
    # collect the raw question/candidate/answer strings per example
    corpus = {"question": [], "candidates": [], "answers": []}
    for x, y in data:
        corpus["question"].append(x.question)
        corpus["candidates"].append(x.atomic_candidates)
        assert len(y) == 1
        corpus["answers"].append(y[0].text)
    # keep questions untokenized, then map every field to vocab ids
    corpus = deep_map(corpus, notokenize, ['question'])
    corpus = deep_map(corpus, self.vocab, ['question'])
    corpus = deep_map(corpus, self.vocab, ['candidates'], cache_fun=True)
    corpus = deep_map(corpus, self.vocab, ['answers'])
    # record which answer ids were observed for each question id
    qanswers = {}
    for i, q in enumerate(corpus['question']):
        q0 = q[0]
        if q0 not in qanswers:
            qanswers[q0] = set()
        a = corpus["answers"][i]
        qanswers[q0].add(a)
    if not test_time:
        # at training time, resample the candidate set (positive/negative sampling)
        sl = ShuffleList(corpus["candidates"][0], qanswers)
        corpus = posnegsample(corpus, 'question', 'answers', 'candidates',
                              sl)
        #corpus = dynamic_subsample(corpus, 'candidates', 'answers', how_many=1)
    return corpus
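
For orientation, the snippet below sketches the corpus structure this preprocess builds before the deep_map calls. The QAItem and Answer namedtuples are hypothetical stand-ins for the reader's actual data types (anything exposing question, atomic_candidates and an answer with .text would do); they are not part of the library.

from collections import namedtuple

# Hypothetical stand-ins for the real data classes used by preprocess above.
QAItem = namedtuple("QAItem", ["question", "atomic_candidates"])
Answer = namedtuple("Answer", ["text"])

data = [
    (QAItem("who wrote hamlet", ["shakespeare", "tolstoy"]),
     [Answer("shakespeare")]),
    (QAItem("who wrote war and peace", ["shakespeare", "tolstoy"]),
     [Answer("tolstoy")]),
]

corpus = {"question": [], "candidates": [], "answers": []}
for x, y in data:
    corpus["question"].append(x.question)
    corpus["candidates"].append(x.atomic_candidates)
    corpus["answers"].append(y[0].text)

# corpus now holds raw strings; the deep_map calls with the vocab then
# replace them with integer ids, e.g. {"question": [[5], [6]], ...}.
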
Example #2
def pipeline(corpus,
             vocab=None,
             target_vocab=None,
             emb=None,
             freeze=False,
             normalize=False):
    vocab = vocab or Vocab(emb=emb)
    target_vocab = target_vocab or Vocab(unk=None)
    if freeze:
        vocab.freeze()
        target_vocab.freeze()

    corpus_tokenized = deep_map(corpus, tokenize, ['sentence1', 'sentence2'])
    corpus_lower = deep_seq_map(corpus_tokenized, lower,
                                ['sentence1', 'sentence2'])
    corpus_os = deep_seq_map(corpus_lower,
                             lambda xs: ["<SOS>"] + xs + ["<EOS>"],
                             ['sentence1', 'sentence2'])
    corpus_ids = deep_map(corpus_os, vocab, ['sentence1', 'sentence2'])
    corpus_ids = deep_map(corpus_ids, target_vocab, ['targets'])
    corpus_ids = deep_seq_map(corpus_ids,
                              lambda xs: len(xs),
                              keys=['sentence1', 'sentence2'],
                              fun_name='lengths',
                              expand=True)
    if normalize:
        corpus_ids = deep_map(corpus_ids,
                              vocab._normalize,
                              keys=['sentence1', 'sentence2'])
    return corpus_ids, vocab, target_vocab
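
As a rough illustration of what this pipeline produces, the toy walkthrough below replays its steps (tokenize, lowercase, add <SOS>/<EOS>, map to ids, attach lengths) with plain Python instead of the library's Vocab/deep_map helpers; the dict-based vocabulary and the *_lengths key naming are assumptions made for the sketch.

# Toy replay of the pipeline steps above, without the library helpers.
corpus = {"sentence1": ["A dog barks ."],
          "sentence2": ["An animal makes noise ."],
          "targets": ["entailment"]}

toy_vocab = {}
def to_id(token):
    # assign ids on first sight, like an unfrozen Vocab would
    return toy_vocab.setdefault(token, len(toy_vocab))

out = {"targets": corpus["targets"]}
for key in ("sentence1", "sentence2"):
    tokens = ["<SOS>"] + corpus[key][0].lower().split() + ["<EOS>"]
    out[key] = [[to_id(t) for t in tokens]]
    # deep_seq_map(..., fun_name='lengths', expand=True) adds a derived key
    out[key + "_lengths"] = [len(tokens)]

print(out["sentence1"], out["sentence1_lengths"])
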
Example #3
def test_deep_map():
    a_lst = [[1, 2, 3], [4, 5, 6]]
    a_lst_map = map.deep_map(a_lst, lambda x: '_{}'.format(x))
    assert a_lst_map == [['_1', '_2', '_3'], ['_4', '_5', '_6']]

    a_dict = [{1: 0, 2: 1, 3: 0}, {4: 1, 5: 0, 6: 1}]
    a_dict_map = map.deep_map(a_dict, lambda x: '_{}'.format(x))
    assert a_dict_map == [{1: '_0', 2: '_1', 3: '_0'}, {4: '_1', 5: '_0', 6: '_1'}]
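
The assertions above pin down deep_map's basic behaviour: it applies the function to every leaf value while preserving list nesting and dict keys. A minimal re-implementation sketch that satisfies exactly these assertions (the real library version additionally supports keys=..., fun_name=..., expand=... and caching) might look like:

def deep_map_sketch(xs, fun):
    # Minimal sketch of deep_map's leaf mapping; not the library's actual code.
    if isinstance(xs, dict):
        return {k: deep_map_sketch(v, fun) for k, v in xs.items()}
    if isinstance(xs, (list, tuple)):
        return [deep_map_sketch(x, fun) for x in xs]
    return fun(xs)

assert deep_map_sketch([[1, 2, 3], [4, 5, 6]], lambda x: '_{}'.format(x)) == \
    [['_1', '_2', '_3'], ['_4', '_5', '_6']]
assert deep_map_sketch([{1: 0, 2: 1}], lambda x: '_{}'.format(x)) == [{1: '_0', 2: '_1'}]
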
def pipeline(corpus,
             vocab=None,
             target_vocab=None,
             emb=None,
             freeze=False,
             concat_seq=True,
             use_permutation_index=True):
    vocab = vocab or Vocab(emb=emb)
    target_vocab = target_vocab or Vocab(unk=None)
    if freeze:
        vocab.freeze()
        target_vocab.freeze()

    corpus_tokenized = deep_map(corpus, tokenize, ["story"])
    # corpus_lower = deep_seq_map(corpus_tokenized, lower, ["story"])
    corpus_os = deep_seq_map(corpus_tokenized,
                             lambda xs: ["<SOS>"] + xs + ["<EOS>"], ["story"])

    corpus_ids = deep_map(corpus_os, vocab, ["story"])
    corpus_ids = deep_map(corpus_ids, target_vocab, ["order"])
    if concat_seq:
        for i in range(len(corpus_ids["story"])):
            corpus_ids["story"][i] = [
                x for xs in corpus_ids["story"][i] for x in xs
            ]
            #corpus_ids = \
            #    deep_seq_map(corpus_tokenized,
            #                 lambda xs: ["<SOS>"] + xs + ["<EOS>"], ["story"])

    seq_keys = ["story"]
    if not use_permutation_index:
        seq_keys += ["order"]

    corpus_ids = deep_seq_map(corpus_ids,
                              lambda xs: len(xs),
                              keys=seq_keys,
                              fun_name='length',
                              expand=True)
    return corpus_ids, vocab, target_vocab
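
The concat_seq branch above flattens each story from a list of sentences (each already a list of token ids) into one flat id sequence, so that the later 'length' feature counts tokens rather than sentences; a toy illustration with made-up ids:

# Toy illustration of the concat_seq flattening; the id values are made up.
story_ids = [[[1, 2, 3], [4, 5]],     # story 0: two sentences
             [[6], [7, 8], [9]]]      # story 1: three sentences
for i in range(len(story_ids)):
    story_ids[i] = [x for xs in story_ids[i] for x in xs]
print(story_ids)                      # [[1, 2, 3, 4, 5], [6, 7, 8, 9]]
print([len(s) for s in story_ids])    # the 'length' feature: [5, 4]
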
def prepare_data(dataset,
                 vocab,
                 lowercase=False,
                 with_answers=False,
                 wiq_contentword=False,
                 with_spacy=False,
                 max_support_length=-1):
    #sentences = LineSentence('/home/jemitche/Desktop/wiki/text8')
    #model = Word2Vec(sentences, size=300, window=5, min_count=5, workers=4)
    default_vec = np.zeros([vocab.emb_length])
    if with_spacy:
        import spacy
        nlp = spacy.load("en", parser=False)
        thistokenize = lambda t: nlp(t)
    else:
        thistokenize = tokenize

    corpus = {"support": [], "question": [], "id": [], "slot": []}
    for d in dataset:
        if isinstance(d, QASetting):
            qa_setting = d
        else:
            qa_setting, answer = d

        if lowercase:
            corpus["support"].append(" ".join(qa_setting.support).lower())
            question, slot = qa_setting.question.lower().split('\t')
        else:
            corpus["support"].append(" ".join(qa_setting.support))
            question, slot = qa_setting.question.split('\t')
        corpus["question"].append(question)
        corpus["slot"].append(int(slot))
        corpus["id"].append(qa_setting.id)
        assert qa_setting.id is not None

    corpus_tokenized = deep_map(corpus, thistokenize, ['question', 'support'])

    word_in_question = []
    question_lengths = []
    support_lengths = []
    token_offsets = []
    answer_spans = []

    rng = random.Random(12345)

    for i, (q, s) in enumerate(
            zip(corpus_tokenized["question"], corpus_tokenized["support"])):
        # word in question feature
        wiq = []
        for token in s:
            if with_spacy:
                wiq.append(
                    float(
                        any(token.lemma == t2.lemma for t2 in q)
                        and (not wiq_contentword or
                             (token.orth_.isalnum() and not token.is_stop))))
            else:
                # try:
                #     max_sim = max_similarity(vocab.emb.lookup, token, q, default_vec)
                #     wiq.append(max_sim)
                # except:
                #     wiq.append(0)
                wiq.append(
                    float(token in q
                          and (not wiq_contentword or token.isalnum())))
        word_in_question.append(wiq)

        if with_spacy:
            offsets = [t.idx for t in s]
            s = [t.orth_ for t in s]
            q = [t.orth_ for t in q]
            corpus_tokenized["question"][i] = q
            corpus_tokenized["support"][i] = s
        else:
            # char to token offsets
            support = corpus["support"][i]
            offsets = token_to_char_offsets(support, s)

        token_offsets.append(offsets)
        question_lengths.append(len(q))

        min_answer = len(s)
        max_answer = 0

        spans = []
        if with_answers:
            answers = dataset[i][1]
            for a in answers:
                start = 0
                while start < len(offsets) and offsets[start] < a.span[0]:
                    start += 1

                if start == len(offsets):
                    continue

                end = start
                while end + 1 < len(offsets) and offsets[end + 1] < a.span[1]:
                    end += 1
                if (start, end) not in spans:
                    spans.append((start, end))
                    min_answer = min(min_answer, start)
                    max_answer = max(max_answer, end)

        # truncate the support if a maximum length is set and recompute the answer spans
        if max_support_length is not None and max_support_length > 0 and len(
                s) > max_support_length:
            if max_answer < max_support_length:
                s = s[:max_support_length]
                word_in_question[-1] = word_in_question[
                    -1][:max_support_length]
            else:
                offset = rng.randint(1, 11)
                new_end = max_answer + offset
                new_start = max(
                    0, min(min_answer - offset, new_end - max_support_length))
                while new_end - new_start > max_support_length:
                    spans = [(s, e) for s, e in spans
                             if e < (new_end - offset)]
                    new_end = max(spans, key=lambda span: span[1])[1] + offset
                    new_start = max(
                        0,
                        min(min_answer - offset, new_end - max_support_length))
                s = s[new_start:new_end]
                spans = [(s - new_start, e - new_start) for s, e in spans]
                word_in_question[-1] = word_in_question[-1][new_start:new_end]

            corpus_tokenized["support"][i] = s
            answer_spans.append(spans)

        else:
            answer_spans.append(spans)

        support_lengths.append(len(s))

    corpus_ids = deep_map(corpus_tokenized, vocab, ['question', 'support'])

    return corpus_tokenized["question"], corpus_ids["question"], question_lengths, \
           corpus_tokenized["support"], corpus_ids["support"], support_lengths, \
           word_in_question, token_offsets, answer_spans, corpus["id"], corpus["slot"]
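
The answer-span handling above turns character-level answer spans into token indices by scanning the per-token character offsets. A self-contained sketch of that scan on made-up data (the sentence and span below are purely illustrative, and whitespace splitting stands in for the real tokenizer):

# Self-contained sketch of the character-span -> token-span scan used above.
support = "the quick brown fox"
tokens = support.split()
offsets = [support.index(t) for t in tokens]   # token start offsets: [0, 4, 10, 16]

answer_char_span = (10, 19)   # characters covering "brown fox"

start = 0
while start < len(offsets) and offsets[start] < answer_char_span[0]:
    start += 1
end = start
while end + 1 < len(offsets) and offsets[end + 1] < answer_char_span[1]:
    end += 1

print((start, end), tokens[start:end + 1])     # (2, 3) ['brown', 'fox']
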