# Example No. 1
# (score: 0)
def prepro_each(args, data_type, start_ratio=0.0, stop_ratio=1.0):
    """Preprocess one dataset split from the augmented JSON file.

    Loads ``{data_type}-v1.0-aug.json`` from ``args.source_dir`` and, for the
    slice of articles selected by ``start_ratio``/``stop_ratio``, builds
    tokenized contexts and questions, their per-character decompositions,
    answer word-span labels, and word/char/POS frequency counters.

    Args:
        args: options object; must provide ``source_dir`` and ``debug``.
        data_type: split name (e.g. ``"train"``) used to build the file name.
        start_ratio: fraction of the article list at which to start.
        stop_ratio: fraction of the article list at which to stop.

    Returns:
        Tuple ``(data, shared)`` of dicts: per-question fields (``q``, ``cq``,
        ``y``, reference indices, ids, answer texts) and shared per-article
        fields (tokens, chars, parse trees, counters, word2vec dicts).
    """
    source_path = os.path.join(args.source_dir, "{}-v1.0-aug.json".format(data_type))
    # Close the file deterministically; the original `json.load(open(...))`
    # left the handle open until garbage collection.
    with open(source_path, 'r') as f:
        source_data = json.load(f)

    # Per-question accumulators; rx/rcx hold [article, paragraph] references.
    q, cq, y, rx, rcx, ids, idxs = [], [], [], [], [], [], []
    # Per-article accumulators: tokens, chars, const parses, tree strings.
    x, cx, tx, stx = [], [], [], []
    answerss = []
    word_counter, char_counter, lower_word_counter = Counter(), Counter(), Counter()
    pos_counter = Counter()
    # Select the [start_ratio, stop_ratio) slice of articles.
    start_ai = int(round(len(source_data['data']) * start_ratio))
    stop_ai = int(round(len(source_data['data']) * stop_ratio))
    for ai, article in enumerate(tqdm(source_data['data'][start_ai:stop_ai])):
        xp, cxp, txp, stxp = [], [], [], []
        x.append(xp)
        cx.append(cxp)
        tx.append(txp)
        stx.append(stxp)
        for pi, para in enumerate(article['paragraphs']):
            # Tokens per sentence, extracted from the dependency parses.
            # NOTE(review): assumes dep[0] is a list of (word, ...) nodes —
            # confirm against the augmentation script that wrote the JSON.
            xi = []
            for dep in para['deps']:
                if dep is None:
                    xi.append([])  # unparsable sentence -> no tokens
                else:
                    xi.append([node[0] for node in dep[0]])
            # Character-level decomposition of every token.
            cxi = [[list(xijk) for xijk in xij] for xij in xi]
            xp.append(xi)
            cxp.append(cxi)
            txp.append(para['consts'])
            stxp.append([str(load_compressed_tree(s)) for s in para['consts']])
            # Count constituency labels for the POS-tag vocabulary.
            trees = map(nltk.tree.Tree.fromstring, para['consts'])
            for tree in trees:
                for subtree in tree.subtrees():
                    pos_counter[subtree.label()] += 1

            # Weight context-token counts by the number of questions that
            # share this paragraph.
            for xij in xi:
                for xijk in xij:
                    word_counter[xijk] += len(para['qas'])
                    lower_word_counter[xijk.lower()] += len(para['qas'])
                    for xijkl in xijk:  # characters of the token
                        char_counter[xijkl] += len(para['qas'])

            rxi = [ai, pi]  # reference into the shared x/cx structures
            assert len(x) - 1 == ai
            assert len(x[ai]) - 1 == pi
            for qa in para['qas']:
                # Question tokens from its dependency parse (may be absent).
                dep = qa['dep']
                qi = [] if dep is None else [node[0] for node in dep[0]]
                cqi = [list(qij) for qij in qi]
                yi = []
                answers = []
                for answer in qa['answers']:
                    answers.append(answer['text'])
                    # Fall back to a dummy one-word span when the word-level
                    # span is missing/falsy.
                    yi0 = answer['answer_word_start'] or [0, 0]
                    yi1 = answer['answer_word_stop'] or [0, 1]
                    assert len(xi[yi0[0]]) > yi0[1]
                    assert len(xi[yi1[0]]) >= yi1[1]
                    yi.append([yi0, yi1])

                for qij in qi:
                    word_counter[qij] += 1
                    lower_word_counter[qij.lower()] += 1
                    for qijk in qij:  # characters of the token
                        char_counter[qijk] += 1

                q.append(qi)
                cq.append(cqi)
                y.append(yi)
                rx.append(rxi)
                rcx.append(rxi)
                ids.append(qa['id'])
                idxs.append(len(idxs))
                answerss.append(answers)

            if args.debug:
                break

    word2vec_dict = get_word2vec(args, word_counter)
    lower_word2vec_dict = get_word2vec(args, lower_word_counter)

    # '*'-prefixed keys are indirect references into `shared`; tx/stx reuse
    # the same [ai, pi] reference list as x.
    data = {'q': q, 'cq': cq, 'y': y, '*x': rx, '*cx': rcx, '*tx': rx, '*stx': rx,
            'idxs': idxs, 'ids': ids, 'answerss': answerss}
    shared = {'x': x, 'cx': cx, 'tx': tx, 'stx': stx,
              'word_counter': word_counter, 'char_counter': char_counter, 'lower_word_counter': lower_word_counter,
              'word2vec': word2vec_dict, 'lower_word2vec': lower_word2vec_dict, 'pos_counter': pos_counter}

    return data, shared
# Example No. 2
# (score: 0)
def prepro_each(args, data_type, start_ratio=0.0, stop_ratio=1.0):
    """Preprocess one dataset split from the augmented JSON file.

    Loads ``{data_type}-v1.0-aug.json`` from ``args.source_dir`` and, for the
    slice of articles selected by ``start_ratio``/``stop_ratio``, builds
    tokenized contexts and questions, their per-character decompositions,
    answer word-span labels, and word/char/POS frequency counters.

    Args:
        args: options object; must provide ``source_dir`` and ``debug``.
        data_type: split name (e.g. ``"train"``) used to build the file name.
        start_ratio: fraction of the article list at which to start.
        stop_ratio: fraction of the article list at which to stop.

    Returns:
        Tuple ``(data, shared)`` of dicts: per-question fields (``q``, ``cq``,
        ``y``, reference indices, ids, answer texts) and shared per-article
        fields (tokens, chars, parse trees, counters, word2vec dicts).
    """
    source_path = os.path.join(args.source_dir, "{}-v1.0-aug.json".format(data_type))
    # Close the file deterministically; the original `json.load(open(...))`
    # left the handle open until garbage collection.
    with open(source_path, 'r') as f:
        source_data = json.load(f)

    # Per-question accumulators; rx/rcx hold [article, paragraph] references.
    q, cq, y, rx, rcx, ids, idxs = [], [], [], [], [], [], []
    # Per-article accumulators: tokens, chars, const parses, tree strings.
    x, cx, tx, stx = [], [], [], []
    answerss = []
    word_counter, char_counter, lower_word_counter = Counter(), Counter(), Counter()
    pos_counter = Counter()
    # Select the [start_ratio, stop_ratio) slice of articles.
    start_ai = int(round(len(source_data['data']) * start_ratio))
    stop_ai = int(round(len(source_data['data']) * stop_ratio))
    for ai, article in enumerate(tqdm(source_data['data'][start_ai:stop_ai])):
        xp, cxp, txp, stxp = [], [], [], []
        x.append(xp)
        cx.append(cxp)
        tx.append(txp)
        stx.append(stxp)
        for pi, para in enumerate(article['paragraphs']):
            # Tokens per sentence, extracted from the dependency parses.
            # NOTE(review): assumes dep[0] is a list of (word, ...) nodes —
            # confirm against the augmentation script that wrote the JSON.
            xi = []
            for dep in para['deps']:
                if dep is None:
                    xi.append([])  # unparsable sentence -> no tokens
                else:
                    xi.append([node[0] for node in dep[0]])
            # Character-level decomposition of every token.
            cxi = [[list(xijk) for xijk in xij] for xij in xi]
            xp.append(xi)
            cxp.append(cxi)
            txp.append(para['consts'])
            stxp.append([str(load_compressed_tree(s)) for s in para['consts']])
            # Count constituency labels for the POS-tag vocabulary.
            trees = map(nltk.tree.Tree.fromstring, para['consts'])
            for tree in trees:
                for subtree in tree.subtrees():
                    pos_counter[subtree.label()] += 1

            # Weight context-token counts by the number of questions that
            # share this paragraph.
            for xij in xi:
                for xijk in xij:
                    word_counter[xijk] += len(para['qas'])
                    lower_word_counter[xijk.lower()] += len(para['qas'])
                    for xijkl in xijk:  # characters of the token
                        char_counter[xijkl] += len(para['qas'])

            rxi = [ai, pi]  # reference into the shared x/cx structures
            assert len(x) - 1 == ai
            assert len(x[ai]) - 1 == pi
            for qa in para['qas']:
                # Question tokens from its dependency parse (may be absent).
                dep = qa['dep']
                qi = [] if dep is None else [node[0] for node in dep[0]]
                cqi = [list(qij) for qij in qi]
                yi = []
                answers = []
                for answer in qa['answers']:
                    answers.append(answer['text'])
                    # Fall back to a dummy one-word span when the word-level
                    # span is missing/falsy.
                    yi0 = answer['answer_word_start'] or [0, 0]
                    yi1 = answer['answer_word_stop'] or [0, 1]
                    assert len(xi[yi0[0]]) > yi0[1]
                    assert len(xi[yi1[0]]) >= yi1[1]
                    yi.append([yi0, yi1])

                for qij in qi:
                    word_counter[qij] += 1
                    lower_word_counter[qij.lower()] += 1
                    for qijk in qij:  # characters of the token
                        char_counter[qijk] += 1

                q.append(qi)
                cq.append(cqi)
                y.append(yi)
                rx.append(rxi)
                rcx.append(rxi)
                ids.append(qa['id'])
                idxs.append(len(idxs))
                answerss.append(answers)

            if args.debug:
                break

    word2vec_dict = get_word2vec(args, word_counter)
    lower_word2vec_dict = get_word2vec(args, lower_word_counter)

    # '*'-prefixed keys are indirect references into `shared`; tx/stx reuse
    # the same [ai, pi] reference list as x.
    data = {'q': q, 'cq': cq, 'y': y, '*x': rx, '*cx': rcx, '*tx': rx, '*stx': rx,
            'idxs': idxs, 'ids': ids, 'answerss': answerss}
    shared = {'x': x, 'cx': cx, 'tx': tx, 'stx': stx,
              'word_counter': word_counter, 'char_counter': char_counter, 'lower_word_counter': lower_word_counter,
              'word2vec': word2vec_dict, 'lower_word2vec': lower_word2vec_dict, 'pos_counter': pos_counter}

    return data, shared