Example #1
def modify_answer_spans(qas, summary):
    # converts a span from (sent_num, word_num) to (0, flat word offset within the paragraph)
    import nltk
    summary_tokenized = list(
        map(nltk.word_tokenize, nltk.sent_tokenize(summary)))
    summary_tokenized = [
        process_tokens(tokens) for tokens in summary_tokenized
    ]

    for index, qa in qas.iterrows():
        answer1_span_start_idx = qa['start_index'].split(', ')
        answer1_span_end_idx = qa['end_index'].split(', ')
        answer1_span_start_idx = list(map(int, answer1_span_start_idx))
        answer1_span_end_idx = list(map(int, answer1_span_end_idx))
        index_mod = sum(
            map(len, summary_tokenized[0:answer1_span_start_idx[0]]))
        qas.at[index,
               'start_index'] = [0, index_mod + answer1_span_start_idx[1]]
        index_mod = sum(map(len, summary_tokenized[0:answer1_span_end_idx[0]]))
        qas.at[index, 'end_index'] = [0, index_mod + answer1_span_end_idx[1]]
    return qas
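
A minimal standalone sketch of the same offset arithmetic, using plain lists instead of a pandas DataFrame (the helper name here is ours, not the original's):

def flatten_span(summary_tokenized, span):
    # (sent_idx, word_idx) -> (0, flat word offset over all sentences)
    sent_idx, word_idx = span
    offset = sum(len(sent) for sent in summary_tokenized[:sent_idx])
    return (0, offset + word_idx)

# with sentences of 5 and 3 tokens, (1, 2) becomes (0, 7)
assert flatten_span([["a"] * 5, ["b"] * 3], (1, 2)) == (0, 7)
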
Example #2
def evaluate_bleu_scores(data_type):
    bleu_scores = []
    bleu_4_scores = []
    for index_summ, row in tqdm(source_summaries.iterrows(), total=1572):
        if data_type == row['set']:
            references = []
            references1 = []
            spans = []
            #summary = row['processed_summary'].replace(".",". ")
            #summ = list(map(nltk.word_tokenize, nltk.sent_tokenize(row['processed_summary_wo'])))
            #summ = [process_tokens(tokens) for tokens in summ]
            summary_tokenized = nltk.word_tokenize(row['summary_tokenized'])
            summary_tokenized = list(
                map(str.lower, process_tokens(summary_tokenized)))
            qas = source_qas[source_qas['document_id'].isin(
                [row['document_id']])]
            qas = qas.reset_index(drop=True)
            qas = modify_answer_spans(qas, row['summary_tokenized'])
            for qid, ques_row in qas.iterrows():
                sent = list(
                    map(
                        str.lower,
                        nltk.word_tokenize(
                            ques_row['answer1_tokenized'].replace(".", ""))))
                #print ("Question",qid,ques_row['processed_question_wo'])
                #print ("Answer:",sent)
                #print("indices",ques_row['start_index'],ques_row['end_index'])
                predicted_rouge_span = summary_tokenized[
                    ques_row['start_index'][1]:ques_row['end_index'][1] + 1]
                #print ("Rouge Span:",predicted_rouge_span)
                references.append([sent])
                #references1.append([predicted_rouge_span])
                spans.append(predicted_rouge_span)
            bleu_scores.append(
                corpus_bleu(references, spans, weights=(1, 0, 0, 0)))
            bleu_4_scores.append(corpus_bleu(references, spans))
    print("Average score bleu_1 for", data_type,
          sum(bleu_scores) / len(bleu_scores))
    print("Average score bleu_4 for", data_type,
          sum(bleu_4_scores) / len(bleu_4_scores))
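
For reference, a minimal sketch of the corpus_bleu call shape used above (assuming NLTK is installed): each hypothesis is a token list, and each entry in the reference list is itself a list of acceptable reference token lists.

from nltk.translate.bleu_score import corpus_bleu

references = [[["the", "cat", "sat", "on", "the", "mat"]]]
hypotheses = [["the", "cat", "sat", "on", "a", "mat"]]
print(corpus_bleu(references, hypotheses, weights=(1, 0, 0, 0)))  # BLEU-1
print(corpus_bleu(references, hypotheses))                        # BLEU-4 (default weights)
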
Example #3
def _tokenize(c):
    c = c.replace("''", '" ')
    c = c.replace("``", '" ')
    cl = list(map(word_tokenize, sent_tokenize(c)))
    cl = [process_tokens(tokens) for tokens in cl]  # process tokens
    return cl[0]
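
A self-contained illustration of the quote normalization above: the two-character PTB quote markers are replaced by a double quote plus a space, so the string length (and any character offsets into it) is preserved.

c = "She said ``hello'' twice."
normalized = c.replace("``", '" ').replace("''", '" ')
assert len(normalized) == len(c)
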
Example #4
def prepro_each(args,
                data_type,
                start_ratio=0.0,
                stop_ratio=1.0,
                out_name="default",
                in_path=None):
    totalnum0 = 0
    falsenum0 = 0
    falsenum1 = 0
    truenum0 = 0
    truenum1 = 0
    outlist = []
    if args.tokenizer == "PTB":
        import nltk
        sent_tokenize = nltk.sent_tokenize

        def word_tokenize(tokens):
            return [
                token.replace("''", '"').replace("``", '"')
                for token in nltk.word_tokenize(tokens)
            ]
    elif args.tokenizer == 'Stanford':
        from my.corenlp_interface import CoreNLPInterface
        interface = CoreNLPInterface(args.url, args.port)
        sent_tokenize = interface.split_doc
        word_tokenize = interface.split_sent
    else:
        raise Exception()

    if not args.split:
        sent_tokenize = lambda para: [para]

    source_path = in_path or os.path.join(args.source_dir,
                                          "{}.seq.json".format(data_type))
    source_data = json.load(open(source_path, 'r'))

    q, cq, y, rx, rcx, ids, idxs = [], [], [], [], [], [], []
    x, cx = [], []
    answerss = []
    p = []
    word_counter, char_counter, lower_word_counter = Counter(), Counter(
    ), Counter()
    start_ai = int(round(len(source_data['data']) * start_ratio))
    stop_ai = int(round(len(source_data['data']) * stop_ratio))
    for ai, article in enumerate(tqdm(source_data['data'][start_ai:stop_ai])):
        xp, cxp = [], []
        pp = []
        x.append(xp)
        cx.append(cxp)
        p.append(pp)
        for pi, para in enumerate(article['paragraphs']):
            # wordss
            ctx = para['context']
            if (len(ctx.split()) <= 800):
                cut = -1
            else:
                cut = sum(map(len, ctx.split()[:800])) + 800
            context = para['context']
            context = context.replace("''", '" ')
            context = context.replace("``", '" ')
            xi = list(map(word_tokenize, sent_tokenize(context)))
            xi = [process_tokens(tokens) for tokens in xi]  # process tokens
            xi = [[xijk for xijk in xij if xijk != ''] for xij in xi]
            # given xi, add chars
            cxi = [[list(xijk) for xijk in xij] for xij in xi]
            xp.append(xi)
            cxp.append(cxi)
            pp.append(context)

            for xij in xi:
                for xijk in xij:
                    word_counter[xijk] += len(para['qas'])
                    lower_word_counter[xijk.lower()] += len(para['qas'])
                    for xijkl in xijk:
                        char_counter[xijkl] += len(para['qas'])

            rxi = [ai, pi]
            assert len(x) - 1 == ai
            assert len(x[ai]) - 1 == pi
            for qa in para['qas']:
                totalnum0 += 1
                # get words
                qi = word_tokenize(qa['question'])
                cqi = [list(qij) for qij in qi]
                yi = []
                answers = []
                flag = False
                Flag = True
                if (cut > -1):
                    for ans in qa['answers']:
                        if (max(ans['answer_location'])) < cut:
                            Flag = False
                            break
                else:
                    Flag = False
                if (Flag):
                    falsenum1 += 1

                for answer in qa['answers']:
                    flag1 = True
                    answer_text = answer['text']
                    answers.append(answer_text)
                    ansi = word_tokenize(answer_text)
                    answer_location = answer['answer_location']
                    yii = []
                    for ans_idx, answer_start in enumerate(answer_location):
                        answer_stop = answer_start + len(ansi[ans_idx])
                        yi0, _ = get_word_span(context, xi, answer_start,
                                               answer_stop)
                        if (yi0[1] >= 800):
                            flag1 = False
                        assert len(xi[yi0[0]]) > yi0[1]
                        w0 = xi[yi0[0]][yi0[1]]
                        assert ansi[ans_idx] == w0, (ansi[ans_idx], w0)
                        yii.append(yi0)

                    if (flag1):
                        flag = True

                    yi.append(yii)
                if (flag):
                    truenum0 += 1

                if (flag == Flag):
                    print(ctx, qa, yi, cut)
                    outlist.append([ctx, qa])

                    # answer_start = answer['answer_start']
                    # answer_stop = answer_start + len(answer_text)
                    # yi0, yi1 = get_word_span(context, xi, answer_start, answer_stop)
                    # # yi0 = answer['answer_word_start'] or [0, 0]
                    # # yi1 = answer['answer_word_stop'] or [0, 1]
                    # assert len(xi[yi0[0]]) > yi0[1]
                    # assert len(xi[yi1[0]]) >= yi1[1]
                    # w0 = xi[yi0[0]][yi0[1]]
                    # w1 = xi[yi1[0]][yi1[1]-1]
                    # i0 = get_word_idx(context, xi, yi0)
                    # i1 = get_word_idx(context, xi, (yi1[0], yi1[1]-1))
                    # cyi0 = answer_start - i0
                    # cyi1 = answer_stop - i1 - 1
                    # # print(answer_text, w0[cyi0:], w1[:cyi1+1])
                    # assert answer_text[0] == w0[cyi0], (answer_text, w0, cyi0)
                    # assert answer_text[-1] == w1[cyi1]
                    # assert cyi0 < 32, (answer_text, w0)
                    # assert cyi1 < 32, (answer_text, w1)
                    # yi.append([yi0, yi1])

                for qij in qi:
                    word_counter[qij] += 1
                    lower_word_counter[qij.lower()] += 1
                    for qijk in qij:
                        char_counter[qijk] += 1

                q.append(qi)
                cq.append(cqi)
                y.append(yi)
                rx.append(rxi)
                rcx.append(rxi)
                ids.append(qa['id'])
                idxs.append(len(idxs))
                answerss.append(answers)

            if args.debug:
                break

    print(truenum0, totalnum0, float(truenum0) / totalnum0)
    print(falsenum1, totalnum0, 1 - float(falsenum1) / totalnum0)
    with open('debugcnt.json', 'w') as f:
        json.dump(outlist, f)
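
A small standalone check of the character-offset estimate used for `cut` above, assuming single-space word separators (the `+ 800` term accounts for one separator per word):

ctx = " ".join(["tok"] * 1000)
cut = sum(map(len, ctx.split()[:800])) + 800
assert ctx[:cut].split() == ["tok"] * 800
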
Example #5
def prepro_each(args,
                data_type,
                start_ratio=0.0,
                stop_ratio=1.0,
                out_name="default",
                in_path=None):
    if args.tokenizer == "PTB":
        import nltk
        sent_tokenize = nltk.sent_tokenize

        def word_tokenize(tokens):
            return [
                token.replace("''", '"').replace("``", '"')
                for token in nltk.word_tokenize(tokens)
            ]
    elif args.tokenizer == 'Stanford':
        from my.corenlp_interface import CoreNLPInterface
        interface = CoreNLPInterface(args.url, args.port)
        sent_tokenize = interface.split_doc
        word_tokenize = interface.split_sent
    else:
        raise Exception()

    if not args.split:
        sent_tokenize = lambda para: [para]

    source_path = in_path or os.path.join(args.source_dir,
                                          "{}-v1.1.json".format(data_type))
    source_data = json.load(open(source_path, 'r'))

    q, cq, y, rx, rcx, ids, idxs = [], [], [], [], [], [], []
    cy = []
    x, cx = [], []
    answerss = []
    p = []
    word_counter, char_counter, lower_word_counter = Counter(), Counter(
    ), Counter()
    start_ai = int(round(len(source_data['data']) * start_ratio))
    stop_ai = int(round(len(source_data['data']) * stop_ratio))
    for ai, article in enumerate(tqdm(source_data['data'][start_ai:stop_ai])):
        xp, cxp = [], []
        pp = []
        x.append(xp)
        cx.append(cxp)
        p.append(pp)
        for pi, para in enumerate(article['paragraphs']):
            # wordss
            context = para['context']
            context = context.replace("''", '" ')
            context = context.replace("``", '" ')
            xi = list(map(word_tokenize, sent_tokenize(context)))
            xi = [process_tokens(tokens) for tokens in xi]  # process tokens
            # given xi, add chars
            cxi = [[list(xijk) for xijk in xij] for xij in xi]
            xp.append(xi)
            cxp.append(cxi)
            pp.append(context)

            for xij in xi:
                for xijk in xij:
                    word_counter[xijk] += len(para['qas'])
                    lower_word_counter[xijk.lower()] += len(para['qas'])
                    for xijkl in xijk:
                        char_counter[xijkl] += len(para['qas'])

            rxi = [ai, pi]
            assert len(x) - 1 == ai
            assert len(x[ai]) - 1 == pi
            for qa in para['qas']:
                # get words
                qi = word_tokenize(qa['question'])
                cqi = [list(qij) for qij in qi]
                yi = []
                cyi = []
                answers = []
                answer = qa['answer']

                yi.append(answer)

                for qij in qi:
                    word_counter[qij] += 1
                    lower_word_counter[qij.lower()] += 1
                    for qijk in qij:
                        char_counter[qijk] += 1

                def put():
                    q.append(qi)
                    cq.append(cqi)
                    y.append(yi)
                    rx.append(rxi)
                    rcx.append(rxi)
                    ids.append(qa['id'])
                    idxs.append(len(idxs))
                    answerss.append(answers)

                put()
                if data_type == 'train' and answer:
                    for i in range(3):
                        put()

    word2vec_dict = get_word2vec(args, word_counter)
    lower_word2vec_dict = get_word2vec(args, lower_word_counter)

    data = {
        'q': q,
        'cq': cq,
        'y': y,
        '*x': rx,
        '*cx': rcx,
        'idxs': idxs,
        'ids': ids,
        'answerss': answerss,
        '*p': rx
    }
    shared = {
        'x': x,
        'cx': cx,
        'p': p,
        'word_counter': word_counter,
        'char_counter': char_counter,
        'lower_word_counter': lower_word_counter,
        'word2vec': word2vec_dict,
        'lower_word2vec': lower_word2vec_dict
    }

    print("saving ...")
    save(args, data, shared, out_name)
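
The put() call plus three extra calls above effectively oversamples answered training questions four times; a standalone toy illustration of the same pattern:

rows = []

def put(item):
    rows.append(item)

for qa in [{"id": 1, "answer": "yes"}, {"id": 2, "answer": None}]:
    put(qa)
    if qa["answer"]:        # answered questions are appended three more times
        for _ in range(3):
            put(qa)

assert len(rows) == 5       # 4 copies of the answered qa + 1 unanswered
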

def get_2d_span(summary, index):
    # map a flat word index back to a (sent_num, word_num) pair
    offset = 0
    for sent_num, sent in enumerate(summary):
        if index < (offset + len(sent)):
            return (sent_num, index - offset)
        offset += len(sent)
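
get_2d_span is the inverse of the flattening done in modify_answer_spans above: with sentences of 5 and 3 tokens, flat index 6 falls in the second sentence at word offset 1.

summary = [["a"] * 5, ["b"] * 3]      # two tokenized sentences
assert get_2d_span(summary, 6) == (1, 1)
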


for index_summ, row in tqdm(source_summaries.iterrows(), total=1572):
    #summary = row['processed_summary'].replace(".",". ")
    summ = list(
        map(nltk.word_tokenize,
            nltk.sent_tokenize(row['processed_summary_wo'])))
    summ = [process_tokens(tokens) for tokens in summ]
    summary_tokenized = nltk.word_tokenize(row['processed_summary_wo'])
    summary_tokenized = list(map(str.lower, process_tokens(summary_tokenized)))
    all_substrings = get_all_substrings(summary_tokenized)
    #print (compute_bleu(get_all_substrings(['Dhruv','is','a','good','scientist','.']),['Dhruv']))
    qas = source_qas[source_qas['document_id'].isin([row['document_id']])]
    qas = qas.reset_index(drop=True)
    print("Summaries", summ)
    for qid, ques_row in qas.iterrows():
        sent = list(
            map(
                str.lower,
                nltk.word_tokenize(ques_row['processed_answer_wo'].replace(
                    ".", ""))))
        #ans_span=compute_bleu(all_substrings,sent)
        ans_span = compute_rouge(all_substrings, sent)
def prepro_each(args, data_type, start_ratio=0.0, stop_ratio=1.0, out_name="default", in_path=None):
    if args.tokenizer == "PTB":
        import nltk
        sent_tokenize = nltk.sent_tokenize
        def word_tokenize(tokens):
            return [token.replace("''", '"').replace("``", '"') for token in nltk.word_tokenize(tokens)]
    elif args.tokenizer == 'Stanford':
        from my.corenlp_interface import CoreNLPInterface
        interface = CoreNLPInterface(args.url, args.port)
        sent_tokenize = interface.split_doc
        word_tokenize = interface.split_sent
    else:
        raise Exception()

    if not args.split:
        sent_tokenize = lambda para: [para]

    source_path = in_path or os.path.join(args.source_dir, "{}-v1.1.json".format(data_type))
    source_data = json.load(open(source_path, 'r'))

    q, cq, y, rx, rcx, ids, idxs = [], [], [], [], [], [], []
    contextss = []
    context_questions = []
    titles = []
    cy = []
    x, cx = [], []
    answerss = []
    p = []
    word_counter, char_counter, lower_word_counter = Counter(), Counter(), Counter()
    start_ai = int(round(len(source_data['data']) * start_ratio))
    stop_ai = int(round(len(source_data['data']) * stop_ratio))
    for ai, article in enumerate(tqdm(source_data['data'][start_ai:stop_ai])):
        pp = []
        p.append(pp)
        xp, cxp, contexts, c_questions = [], [], [], []
        x.append(xp)
        cx.append(cxp)
        contextss.append(contexts)
        context_questions.append(c_questions)
        title = "[" + str(ai).zfill(2) + "] " + article['title'].replace('_', ' ')
        titles.append(title)
        for pi, para in enumerate(article['paragraphs']):
            # wordss
            context = para['context']
            context = context.replace("''", '" ')
            context = context.replace("``", '" ') #Sentences of priginal Paragraph
            contexts.append(context)
            xi = list(map(word_tokenize, sent_tokenize(context)))
            xi = [process_tokens(tokens) for tokens in xi]  # process tokens
            # given xi, add chars
            cxi = [[list(xijk) for xijk in xij] for xij in xi]
            xp.append(xi)
            cxp.append(cxi)
            pp.append(context)

            for xij in xi:
                for xijk in xij:
                    word_counter[xijk] += len(para['qas'])
                    lower_word_counter[xijk.lower()] += len(para['qas'])
                    for xijkl in xijk:
                        char_counter[xijkl] += len(para['qas'])

            rxi = [ai, pi]
            assert len(x) - 1 == ai
            assert len(x[ai]) - 1 == pi
            if ai==0: c_questions.append(para['qas'][3]['question'])
            else: c_questions.append(para['qas'][0]['question'])
            """
            for qa in para['qas']:
                # get words
                c_questions.append(qa['question'])
                break
                qi = word_tokenize(qa['question']) # qa['question'] : original question
                cqi = [list(qij) for qij in qi]
                yi = []
                cyi = []
                answers = []
                
                for answer in qa['answers']:
                    answer_text = answer['text']
                    answers.append(answer_text)
                    answer_start = answer['answer_start']
                    answer_stop = answer_start + len(answer_text)
                    # TODO : put some function that gives word_start, word_stop here
                    yi0, yi1 = get_word_span(context, xi, answer_start, answer_stop)
                    # yi0 = answer['answer_word_start'] or [0, 0]
                    # yi1 = answer['answer_word_stop'] or [0, 1]
                    assert len(xi[yi0[0]]) > yi0[1]
                    assert len(xi[yi1[0]]) >= yi1[1]
                    w0 = xi[yi0[0]][yi0[1]]
                    w1 = xi[yi1[0]][yi1[1]-1]
                    i0 = get_word_idx(context, xi, yi0)
                    i1 = get_word_idx(context, xi, (yi1[0], yi1[1]-1))
                    cyi0 = answer_start - i0
                    cyi1 = answer_stop - i1 - 1
                    # print(answer_text, w0[cyi0:], w1[:cyi1+1])
                    assert answer_text[0] == w0[cyi0], (answer_text, w0, cyi0)
                    assert answer_text[-1] == w1[cyi1]
                    assert cyi0 < 32, (answer_text, w0)
                    assert cyi1 < 32, (answer_text, w1)

                    yi.append([yi0, yi1])
                    cyi.append([cyi0, cyi1])

                for qij in qi:
                    word_counter[qij] += 1
                    lower_word_counter[qij.lower()] += 1
                    for qijk in qij:
                        char_counter[qijk] += 1

                q.append(qi)
                cq.append(cqi)
                y.append(yi)
                cy.append(cyi)
                rx.append(rxi)
                rcx.append(rxi)
                ids.append(qa['id'])
                idxs.append(len(idxs))
                answerss.append(answers)
            """
            if args.debug:
                break
    word2vec_dict = get_word2vec(args, word_counter)
    lower_word2vec_dict = get_word2vec(args, lower_word_counter)

    # add context here
    data = {'q': q, 'cq': cq, 'y': y, '*x': rx, '*cx': rcx, 'cy': cy,
            'idxs': idxs, 'ids': ids, 'answerss': answerss, '*p' : rx}
    shared = {'x': x, 'cx': cx, 'p' : p,
              'contextss' : contextss, 'context_questions' : context_questions,
              'titles' : titles,
              'word_counter': word_counter, 'char_counter': char_counter, 'lower_word_counter': lower_word_counter,
              'word2vec': word2vec_dict, 'lower_word2vec': lower_word2vec_dict}
    print("saving ...")
    save(args, data, shared, out_name)
Example #8
def prepro_each(args,
                data_type,
                start_ratio=0.0,
                stop_ratio=1.0,
                out_name="default",
                in_path=None):
    parser = StanfordParser(model_path=os.getenv("StanfordParser_model_path"))
    if args.tokenizer == "PTB":
        import nltk
        sent_tokenize = nltk.sent_tokenize

        def word_tokenize(tokens):
            return [
                token.replace("''", '"').replace("``", '"')
                for token in nltk.word_tokenize(tokens)
            ]
    elif args.tokenizer == 'Stanford':
        from my.corenlp_interface import CoreNLPInterface
        interface = CoreNLPInterface(args.url, args.port)
        sent_tokenize = interface.split_doc
        word_tokenize = interface.split_sent
    else:
        raise Exception()

    # if not args.split:
    #     sent_tokenize = lambda para: [para]

    source_path = in_path or os.path.join(
        args.source_dir, "{}-{}v1.1.json".format(data_type, args.suffix))
    source_data = json.load(open(source_path, 'r'))

    q, cq, y, rx, rcx, ids, idxs = [], [], [], [], [], [], []
    q_syn_seq = []
    na = []
    cy = []
    x, cx = [], []
    syn_seq = []
    rsyn_seq = []
    answerss = []
    p = []
    word_counter, char_counter, lower_word_counter = Counter(), Counter(
    ), Counter()
    syn_counter = Counter()
    start_ai = int(round(len(source_data['data']) * start_ratio))
    stop_ai = int(round(len(source_data['data']) * stop_ratio))
    abandon = 0
    for ai, article in enumerate(tqdm(source_data['data'][221:])):
        xp, cxp = [], []
        syn_seqp = []
        pp = []
        x.append(xp)
        cx.append(cxp)
        syn_seq.append(syn_seqp)
        p.append(pp)
        p_i = -1
        for pi, para in enumerate(article['paragraphs']):
            # wordss
            context = para['context']
            context = context.replace("''", '" ').replace("``", '" ').replace(
                'e.g.', 'e-g,')
            xi = list(map(word_tokenize, sent_tokenize(context)))
            xi = [process_tokens(tokens) for tokens in xi]  # process tokens
            try:
                syn_seqpp = sents_parser(xi, parser)
                p_i += 1
            except:
                abandon += 1
                continue
            for sent in syn_seqpp:
                for word in sent:
                    for syn in word:
                        syn_counter[syn] += 1

            syn_seqp.append(syn_seqpp)
            # given xi, add chars
            cxi = [[list(xijk) for xijk in xij] for xij in xi]
            xp.append(xi)
            cxp.append(cxi)
            pp.append(context)

            for xij in xi:
                for xijk in xij:
                    word_counter[xijk] += len(para['qas'])
                    lower_word_counter[xijk.lower()] += len(para['qas'])
                    for xijkl in xijk:
                        char_counter[xijkl] += len(para['qas'])

            rxi = [ai + 221, p_i]
            assert len(x) - 1 == ai
            assert len(x[ai]) - 1 == p_i
            for qa in para['qas']:
                # get words
                qi = word_tokenize(qa['question'])
                qi = process_tokens(qi)
                try:
                    q_syn_seqq = sent_parser(qi, parser)
                except:
                    continue
                cqi = [list(qij) for qij in qi]
                yi = []
                cyi = []
                answers = []
                for answer in qa['answers']:
                    answer_text = answer['text']
                    answers.append(answer_text)
                    answer_start = answer['answer_start']
                    answer_stop = answer_start + len(answer_text)
                    # TODO : put some function that gives word_start, word_stop here
                    yi0, yi1 = get_word_span(context, xi, answer_start,
                                             answer_stop)
                    # yi0 = answer['answer_word_start'] or [0, 0]
                    # yi1 = answer['answer_word_stop'] or [0, 1]
                    assert len(xi[yi0[0]]) > yi0[1]
                    assert len(xi[yi1[0]]) >= yi1[1]
                    w0 = xi[yi0[0]][yi0[1]]
                    w1 = xi[yi1[0]][yi1[1] - 1]
                    i0 = get_word_idx(context, xi, yi0)
                    i1 = get_word_idx(context, xi, (yi1[0], yi1[1] - 1))
                    cyi0 = answer_start - i0
                    cyi1 = answer_stop - i1 - 1
                    # print(answer_text, w0[cyi0:], w1[:cyi1+1])
                    assert answer_text[0] == w0[cyi0], (answer_text, w0, cyi0)
                    assert answer_text[-1] == w1[cyi1]
                    assert cyi0 < 32, (answer_text, w0)
                    assert cyi1 < 32, (answer_text, w1)

                    yi.append([yi0, yi1])
                    cyi.append([cyi0, cyi1])

                if len(qa['answers']) == 0:
                    yi.append([(0, 0), (0, 1)])
                    cyi.append([0, 1])
                    na.append(True)
                else:
                    na.append(False)

                for qij in qi:
                    word_counter[qij] += 1
                    lower_word_counter[qij.lower()] += 1
                    for qijk in qij:
                        char_counter[qijk] += 1

                q.append(qi)
                cq.append(cqi)
                q_syn_seq.append(q_syn_seqq)
                y.append(yi)
                cy.append(cyi)
                rx.append(rxi)
                rcx.append(rxi)
                rsyn_seq.append(rxi)
                ids.append(qa['id'])
                idxs.append(len(idxs))
                answerss.append(answers)
        print('abandoned {} paragraphs'.format(abandon))
        if args.debug:
            break
    for sent in q_syn_seq:
        for word in sent:
            for syn in word:
                syn_counter[syn] += 1
    word2vec_dict = get_word2vec(args, word_counter)
    lower_word2vec_dict = get_word2vec(args, lower_word_counter)

    # add context here
    data = {
        'q': q,
        'cq': cq,
        'y': y,
        '*x': rx,
        '*cx': rcx,
        '*syn_seq': rsyn_seq,
        'cy': cy,
        'q_syn_seq': q_syn_seq,
        'idxs': idxs,
        'ids': ids,
        'answerss': answerss,
        '*p': rx,
        'na': na
    }
    shared = {
        'x': x,
        'cx': cx,
        'p': p,
        'syn_seq': syn_seq,
        'syn_counter': syn_counter,
        'word_counter': word_counter,
        'char_counter': char_counter,
        'lower_word_counter': lower_word_counter,
        'word2vec': word2vec_dict,
        'lower_word2vec': lower_word2vec_dict
    }
    print("saving ...")
    save(args, data, shared, out_name)
Example #9
def prepro_each(args,
                data_type,
                start_ratio=0.0,
                stop_ratio=1.0,
                out_name="default",
                in_path=None):
    if args.tokenizer == "PTB":
        import nltk.tokenize as nltk
        sent_tokenize = nltk.sent_tokenize

        def word_tokenize(tokens):
            return [
                token.replace("''", '"').replace("``", '"')
                for token in nltk.word_tokenize(tokens)
            ]
    elif args.tokenizer == 'Stanford':
        from corenlp import CoreNLPClient
        interface = CoreNLPClient(annotators="tokenize ssplit".split())
    else:
        raise Exception()
    """
    if not args.split:
        sent_tokenize = lambda para: [para]
    """
    source_path = in_path or os.path.join(
        args.source_dir, "{}-{}v1.1.json".format(data_type, args.suffix))
    source_data = json.load(open(source_path, 'r'))

    q, cq, y, rx, rcx, ids, idxs = [], [], [], [], [], [], []
    na = []
    cy = []
    x, cx = [], []
    answerss = []
    p = []
    word_counter, char_counter, lower_word_counter = Counter(), Counter(
    ), Counter()
    start_ai = int(round(len(source_data['data']) * start_ratio))
    stop_ai = int(round(len(source_data['data']) * stop_ratio))
    for ai, article in enumerate(tqdm(source_data['data'][start_ai:stop_ai])):
        xp, cxp = [], []
        pp = []
        x.append(xp)
        cx.append(cxp)
        p.append(pp)
        for pi, para in enumerate(article['paragraphs']):
            # wordss
            context = para['context']
            context = context.replace("''", '" ')
            context = context.replace("``", '" ')
            while True:
                try:
                    temp = interface.annotate(context)
                    break
                except Exception as e:
                    time.sleep(0.2)
            context_s = []
            for sent in temp.sentence:
                sent = [word.originalText for word in sent.token]
                # manual workaround for \xa0, a character CoreNLP cannot handle: split tokens containing it
                for wi in range(len(sent)):
                    if "\xa0" in sent[wi]:
                        sent = sent[:wi] + sent[wi].split("\xa0") + sent[wi +
                                                                         1:]
                        wi = 0
                context_s.append(sent)
            xi = context_s
            xi = [process_tokens(tokens) for tokens in xi]  # process tokens
            # given xi, add chars
            cxi = [[list(xijk) for xijk in xij] for xij in xi]
            xp.append(xi)
            cxp.append(cxi)
            pp.append(context)

            for xij in xi:
                for xijk in xij:
                    word_counter[xijk] += len(para['qas'])
                    lower_word_counter[xijk.lower()] += len(para['qas'])
                    for xijkl in xijk:
                        char_counter[xijkl] += len(para['qas'])

            rxi = [ai, pi]
            assert len(x) - 1 == ai
            assert len(x[ai]) - 1 == pi
            for qa in para['qas']:
                # get words
                while True:
                    try:
                        temp = interface.annotate(qa['question']).sentence[0]
                        break
                    except Exception as e:
                        time.sleep(0.2)
                #print(temp.token[0])
                #exit(-1)
                qi = [t_s.originalText for t_s in temp.token]
                print(qi)
                exit(-1)
                qi = process_tokens(qi)
                cqi = [list(qij) for qij in qi]
                yi = []
                cyi = []
                answers = []
                for answer in qa['answers']:
                    answer_text = answer['text']
                    answers.append(answer_text)
                    answer_start = answer['answer_start']
                    answer_stop = answer_start + len(answer_text)
                    # TODO : put some function that gives word_start, word_stop here
                    yi0, yi1 = get_word_span(context, xi, answer_start,
                                             answer_stop)
                    # yi0 = answer['answer_word_start'] or [0, 0]
                    # yi1 = answer['answer_word_stop'] or [0, 1]
                    assert len(xi[yi0[0]]) > yi0[1]
                    assert len(xi[yi1[0]]) >= yi1[1]
                    w0 = xi[yi0[0]][yi0[1]]
                    w1 = xi[yi1[0]][yi1[1] - 1]
                    i0 = get_word_idx(context, xi, yi0)
                    i1 = get_word_idx(context, xi, (yi1[0], yi1[1] - 1))
                    cyi0 = answer_start - i0
                    cyi1 = answer_stop - i1 - 1
                    # print(answer_text, w0[cyi0:], w1[:cyi1+1])
                    assert answer_text[0] == w0[cyi0], (answer_text, w0, cyi0)
                    assert answer_text[-1] == w1[cyi1]
                    assert cyi0 < 32, (answer_text, w0)
                    assert cyi1 < 32, (answer_text, w1)
                    yi.append([yi0, yi1])
                    cyi.append([cyi0, cyi1])

                if len(qa['answers']) == 0:
                    yi.append([(0, 0), (0, 1)])
                    cyi.append([0, 1])
                    na.append(True)
                else:
                    na.append(False)

                for qij in qi:
                    word_counter[qij] += 1
                    lower_word_counter[qij.lower()] += 1
                    for qijk in qij:
                        char_counter[qijk] += 1

                q.append(qi)
                cq.append(cqi)
                y.append(yi)
                cy.append(cyi)
                rx.append(rxi)
                rcx.append(rxi)
                ids.append(qa['id'])
                idxs.append(len(idxs))
                answerss.append(answers)

        if args.debug:
            break

    word2vec_dict = get_word2vec(args, word_counter)
    lower_word2vec_dict = get_word2vec(args, lower_word_counter)

    # add context here
    data = {
        'q': q,
        'cq': cq,
        'y': y,
        '*x': rx,
        '*cx': rcx,
        'cy': cy,
        'idxs': idxs,
        'ids': ids,
        'answerss': answerss,
        '*p': rx,
        'na': na
    }
    shared = {
        'x': x,
        'cx': cx,
        'p': p,
        'word_counter': word_counter,
        'char_counter': char_counter,
        'lower_word_counter': lower_word_counter,
        'word2vec': word2vec_dict,
        'lower_word2vec': lower_word2vec_dict
    }

    print("saving ...")
    save(args, data, shared, out_name)
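
The annotate calls above are wrapped in bare retry loops; the same pattern as a small reusable helper (the name and signature here are ours, not the project's):

import time

def annotate_with_retry(annotate, text, delay=0.2):
    # keep retrying until the CoreNLP server responds
    while True:
        try:
            return annotate(text)
        except Exception:
            time.sleep(delay)
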
Example #10
def prepro_each(args, data_type, start_ratio=0.0, stop_ratio=1.0, out_name="default", in_path=None):
    """


    :param args:            arguments
    :param data_type:       train, dev or all
    :param start_ratio:     default is 0.0
    :param stop_ratio:      default is 1.0
    :param out_name:        train, dev or test
    :param in_path:         default is None; optional explicit path to the source file (overrides source_dir when given)
    :return:
    """

    # 1. tokenize and sent tokenize

    if args.tokenizer == "PTB":
        import nltk
        sent_tokenize = nltk.sent_tokenize

        def word_tokenize(tokens):
            """
            firstly word_tokenize the tokens and replace some
            chars, and return a list
            :param tokens:
            :return:
            """
            return [token.replace("''", '"').replace("``", '"') for token in nltk.word_tokenize(tokens)]
    elif args.tokenizer == 'Stanford':
        from my.corenlp_interface import CoreNLPInterface
        interface = CoreNLPInterface(args.url, args.port)
        sent_tokenize = interface.split_doc
        word_tokenize = interface.split_sent
    else:
        raise Exception()

    if not args.split:
        sent_tokenize = lambda para: [para]  # input is para, turn it to a list

    # 2. load data from disk
    source_path = in_path or os.path.join(args.source_dir,
                                          "{}-v{}.json".format(data_type, args.version))
    source_data = json.load(open(file=source_path, mode='r'))

    # 3. initiate some counter and some lists
    q, cq, rx, rcx = [], [], [], []
    # question, char_question, context, char_context
    y, cy, ids, idxs = [], [], [], []
    x, cx = [], []
    answerss, p = [], []
    word_counter, char_counter, lower_word_counter = Counter(), Counter(), Counter()
    start_at_index = int(round(len(source_data['data']) * start_ratio))
    stop_at_index = int(round(len(source_data['data']) * stop_ratio))

    # 4. iterate the dataset
    max_ques_size = 0
    max_context_size = 0
    max_word_size = 0

    for article_index, article in enumerate(tqdm(source_data['data'][start_at_index:stop_at_index])):
        xp, cxp, pp = [], [], []
        x.append(xp)
        cx.append(cxp)
        p.append(pp)

        for paragraph_index, paragraph in enumerate(article['paragraphs']):
            context = paragraph['context']
            context = context.replace("''", '" ')
            # note the trailing space: the replacement keeps the context length unchanged
            context = context.replace("``", '" ')

            # context is a str here
            list_of_wordlist = list(map(word_tokenize, sent_tokenize(context)))
            # sent_tokenize splits the context into a list of sentences (just one
            # sentence when args.split is False); mapping word_tokenize over them
            # yields a list of word lists:
            # [[words of sentence 1], [words of sentence 2], ...]

            list_of_wordlist = [process_tokens(tokens) for tokens in list_of_wordlist]
            # list_of_wordlist is 2-d: sentences x words
            for wordlist in list_of_wordlist:
                max_context_size = max(max_context_size, len(wordlist))

            list_of_charlist = [[list(word) for word in wordlist] for wordlist in list_of_wordlist]
            # list_of_charlist is 3-d: sentence, word, char

            xp.append(list_of_wordlist)
            # 3d, paragraph, sentence, words
            cxp.append(list_of_charlist)
            # 4d, paragraph, sentence, words, chars
            pp.append(context)
            # 2d, paragraph, context

            ## update counters
            num_qas = len(paragraph['qas'])
            for wordlist in list_of_wordlist:
                for word in wordlist:
                    word_counter[word] += num_qas
                    lower_word_counter[word.lower()] += num_qas
                    for char in word:
                        char_counter[char] += num_qas

            rxi = [article_index, paragraph_index]
            assert len(x) - 1 == article_index
            # x stores xp; xp is 3-d: paragraph, sentence, words
            assert len(x[article_index]) - 1 == paragraph_index

            for question in paragraph['qas']:
                question_wordslist = word_tokenize(question['question'])
                max_ques_size = max(max_ques_size, len(question_wordslist))
                # it's a list of words
                question_charslist = [list(word) for word in question_wordslist]
                # it's a list of charlist
                yi = []
                cyi = []
                answers = []  # the text of each answer

                for answer in question['answers']:
                    answer_text = answer['text']
                    answers.append(answer_text)
                    answer_start_index = answer['answer_start']
                    answer_end_index = answer_start_index + len(answer_text)
                    yi0, yi1 = get_word_span(context,
                                             list_of_wordlist,  # 2-d: sentences, words
                                             answer_start_index,
                                             answer_end_index)
                    # yi0 is e.g. (0, 108): 0 is the sentence index, 108 the start word index
                    # yi1 is e.g. (0, 111): 111 is the (exclusive) end word index

                    assert len(list_of_wordlist[yi0[0]]) > yi0[1]
                    # the length of the first sentence is larger than 108
                    assert len(list_of_wordlist[yi1[0]]) >= yi1[1]
                    # the length of the first sentence is larger than or equal to 111

                    w0 = list_of_wordlist[yi0[0]][yi0[1]]  # the start words of the answer
                    w1 = list_of_wordlist[yi1[0]][yi1[1] - 1]  # the last word of the answer

                    i0 = get_word_idx(context, list_of_wordlist, yi0)
                    i1 = get_word_idx(context, list_of_wordlist, (yi1[0], yi1[1] - 1))
                    # i0 is the char index of the answer's first word (515 in this example),
                    # i1 is the char index of the answer's last word; for
                    # 'Saint Bernadette Soubirous', i1 points at the 'S' of 'Soubirous'
                    cyi0 = answer_start_index - i0
                    # 0 here, since the answer starts exactly at w0 (both offsets are 515)
                    cyi1 = answer_end_index - i1 - 1
                    # offset of the answer's last char within the last word (length of the last word - 1)

                    assert answer_text[0] == w0[cyi0], (answer_text, w0, cyi0)
                    assert answer_text[-1] == w1[cyi1]
                    # make sure the answer's first and last chars match the first char of w0 and the last char of w1
                    assert cyi0 < 32, (answer_text, w0)
                    assert cyi1 < 32, (answer_text, w1)
                    #
                    yi.append([yi0, yi1])  # index of words
                    cyi.append([cyi0, cyi1])
                    # index of shifts from the first char and last char of the answer in context

                # update counters
                for word in question_wordslist:
                    word_counter[word] += 1
                    lower_word_counter[word.lower()] += 1
                    for char in word:
                        char_counter[char] += 1

                q.append(question_wordslist)  # 2-d list of wordlist for each question
                cq.append(question_charslist)  # 3-d, question-word-char
                y.append(yi)  # question-startendpair
                cy.append(cyi)  # question-startend char pair
                rx.append(rxi)  # list of article_id-paragraph_id pair
                rcx.append(rxi)
                ids.append(question['id'])  # ids for each question
                idxs.append(len(idxs))  # index for each question
                answerss.append(answers)  # list of answer in string

    word2vec_dict = get_word2vec(args, word_counter)
    lower_word2vec_dict = get_word2vec(args, lower_word_counter)

    for word in word_counter:
        max_word_size = max(max_word_size, len(word))


    # add context here
    data = {
        'q': q,  # list of word list of each questions, [['who','are', 'you'], ... ]
        'cq': cq,
        # [<class 'list'>: [['T', 'o'], ['w', 'h', 'o', 'm'], ['d', 'i', 'd'], ['t', 'h', 'e'], ['V', 'i', 'r', 'g', 'i', 'n'], ['M', 'a', 'r', 'y'], ['a', 'l', 'l', 'e', 'g', 'e', 'd', 'l', 'y'], ['a', 'p', 'p', 'e', 'a', 'r'], ['i', 'n'], ['1', '8', '5', '8'], ['i', 'n'], ['L', 'o', 'u', 'r', 'd', 'e', 's'], ['F', 'r', 'a', 'n', 'c', 'e'], ['?']] , ...]
        'y': y,  # list of <class 'list'>: [[(0, 108), (0, 111)]]
        '*x': rx,  # list of <class 'list'>: [0, 21], 0 is the article index, 21 the paragraph index
        '*cx': rcx,  # same as rx but for characters (the values are identical)
        'cy': cy,  #
        'idxs': idxs,  # just those ids
        'ids': ids,  # the id of each question, sth like uuid
        'answerss': answerss,  # the content of the answer
        '*p': rx  #
    }
    shared = {
        'x': x,  # words of each paragraph
        'cx': cx,  # characters of each
        'p': p,  # the content of each paragraph
        'word_counter': word_counter,
        'char_counter': char_counter,
        'lower_word_counter': lower_word_counter,
        'word2vec': word2vec_dict,
        'lower_word2vec': lower_word2vec_dict
    }

    print("saving ...")
def prepro_each(args, data_type, start_ratio=0.0, stop_ratio=1.0, out_name="default", in_path=None):
    if args.tokenizer == "PTB":
        import nltk
        sent_tokenize = nltk.sent_tokenize
        def word_tokenize(tokens):
            return [token.replace("''", '"').replace("``", '"') for token in nltk.word_tokenize(tokens)]
    elif args.tokenizer == 'Stanford':
        from my.corenlp_interface import CoreNLPInterface
        interface = CoreNLPInterface(args.url, args.port)
        sent_tokenize = interface.split_doc
        word_tokenize = interface.split_sent
    else:
        raise Exception()

    if not args.split:
        sent_tokenize = lambda para: [para]

    # source_path = in_path or os.path.join(args.source_dir, "{}-v1.1.json".format(data_type))
    source_path = in_path or os.path.join(args.source_dir, "{}.json".format(data_type))
    source_data = json.load(open(source_path, 'r'))

    q, cq, y, rx, rcx, ids, idxs = [], [], [], [], [], [], []
    cy = []
    x, cx = [], []
    answerss = []
    p = []
    word_counter, char_counter, lower_word_counter = Counter(), Counter(), Counter()
    # start_ai = int(round(len(source_data['data']) * start_ratio))
    # stop_ai = int(round(len(source_data['data']) * stop_ratio))
    start_ai = int(round(len(source_data) * start_ratio))
    stop_ai = int(round(len(source_data) * stop_ratio))

    # for ai, article in enumerate(tqdm(source_data['data'][start_ai:stop_ai])):
    for ai, article in enumerate(tqdm(source_data[start_ai:stop_ai])):
        xp, cxp = [], []
        pp = []
        x.append(xp)
        cx.append(cxp)
        p.append(pp)
        # print(article.keys()) # paragraph, title
        # raise
        # print(article) # {'question', 'answer', 'context', 'answer_list'}
        # raise
        # for pi, para in enumerate(article['paragraphs']):
        for pi, para in enumerate([article]):
            # print(para.keys()) # qas, context
            # raise
            # wordss
            context = para['context']
            context = context.replace("''", '" ')
            context = context.replace("``", '" ')
            xi = list(map(word_tokenize, sent_tokenize(context)))
            xi = [process_tokens(tokens) for tokens in xi]  # process tokens
            # print(xi) # [['archi', ',', 'the', 'school']]
            # raise
            # given xi, add chars
            cxi = [[list(xijk) for xijk in xij] for xij in xi]
            xp.append(xi)
            cxp.append(cxi)
            pp.append(context)

            # print(len(para['qas'])) # 5
            # print(para['qas']) # [{'answers': [{'text', 'answer_start'}], 'id', 'question'}]
            # raise
            for xij in xi:
                for xijk in xij:
                    # word_counter[xijk] += len(para['qas'])
                    # lower_word_counter[xijk.lower()] += len(para['qas'])
                    word_counter[xijk] += 1
                    lower_word_counter[xijk.lower()] += 1
                    for xijkl in xijk:
                        # char_counter[xijkl] += len(para['qas'])
                        char_counter[xijkl] += 1

            rxi = [ai, pi]
            assert len(x) - 1 == ai
            assert len(x[ai]) - 1 == pi
            # for qa in para['qas']:
            for qa in [article]:
                # get words
                # qi = word_tokenize(qa['question'])
                qi = word_tokenize(qa['question'])
                cqi = [list(qij) for qij in qi]
                yi = []
                cyi = []
                answers = []
                # for answer in qa['answers']:
                try:
                    answer = qa['answer']
                    answer_text = qa['answer_list'][answer-1]
                except KeyError:
                    answer_text = ' '.join(qa['answer_list'])
                for _ in [answer_text]:
                    # answer_text = answer['text']
                    answers.append(answer_text)
                    # answer_start = answer['answer_start']
                    try:
                        answer_start = context.index(answer_text)
                        answer_stop = answer_start + len(answer_text)
                    except ValueError:
                        answer_start = 0
                        answer_stop = len(context)
                    # TODO : put some function that gives word_start, word_stop here
                    yi0, yi1 = get_word_span(context, xi, answer_start, answer_stop)
                    # yi0 = answer['answer_word_start'] or [0, 0]
                    # yi1 = answer['answer_word_stop'] or [0, 1]
                    assert len(xi[yi0[0]]) > yi0[1]
                    assert len(xi[yi1[0]]) >= yi1[1]
                    w0 = xi[yi0[0]][yi0[1]]
                    w1 = xi[yi1[0]][yi1[1]-1]
                    # context: str
                    # xi: [[word, word, word, ...]]
                    i0 = get_word_idx(context, xi, yi0)
                    i1 = get_word_idx(context, xi, (yi1[0], yi1[1]-1))
                    cyi0 = answer_start - i0
                    cyi1 = answer_stop - i1 - 1
                    # print(answer_text, '|', w0[cyi0:], '|', w1[:cyi1+1])
                    # raise
                    #assert answer_text[0] == w0[cyi0], (answer_text, w0, cyi0)
                    #assert answer_text[-1] == w1[cyi1]
                    assert cyi0 < 32, (answer_text, w0)
                    assert cyi1 < 32, (answer_text, w1)

                    yi.append([yi0, yi1])
                    cyi.append([cyi0, cyi1])

                for qij in qi:
                    word_counter[qij] += 1
                    lower_word_counter[qij.lower()] += 1
                    for qijk in qij:
                        char_counter[qijk] += 1

                q.append(qi)
                cq.append(cqi)
                y.append(yi)
                cy.append(cyi)
                rx.append(rxi)
                rcx.append(rxi)
                # ids.append(qa['id'])
                ids.append(qa['question'])
                idxs.append(len(idxs))
                answerss.append(answers)

            if args.debug:
                break

    word2vec_dict = get_word2vec(args, word_counter)
    lower_word2vec_dict = get_word2vec(args, lower_word_counter)

    # add context here
    data = {'q': q, 'cq': cq, 'y': y, '*x': rx, '*cx': rcx, 'cy': cy,
            'idxs': idxs, 'ids': ids, 'answerss': answerss, '*p': rx}
    shared = {'x': x, 'cx': cx, 'p': p,
              'word_counter': word_counter, 'char_counter': char_counter, 'lower_word_counter': lower_word_counter,
              'word2vec': word2vec_dict, 'lower_word2vec': lower_word2vec_dict}

    print("saving ...")
    save(args, data, shared, out_name)
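
A standalone sketch of the answer-selection fallback above: 'answer' is treated as a 1-based index into 'answer_list', and when it is missing all options are joined (field names follow the example; the values are made up):

qa = {"answer": 2, "answer_list": ["Paris", "London", "Rome"]}
try:
    answer_text = qa["answer_list"][qa["answer"] - 1]
except KeyError:
    answer_text = " ".join(qa["answer_list"])
assert answer_text == "London"
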
Example #12
def prepro_each(args,
                data_type,
                start_ratio=0.0,
                stop_ratio=1.0,
                out_name="default",
                in_path=None):
    """

    :param args: configurations
    :param data_type: train or dev
    :param start_ratio:
    :param stop_ratio:
    :param out_name: train, dev, test
    :param in_path:
    :return:
    """
    if args.tokenizer == "PTB":
        import nltk
        sent_tokenize = nltk.sent_tokenize

        def word_tokenize(tokens):
            return [
                token.replace("''", '"').replace("``", '"')
                for token in nltk.word_tokenize(tokens)
            ]
    elif args.tokenizer == 'Stanford':
        from my.corenlp_interface import CoreNLPInterface
        interface = CoreNLPInterface(args.url, args.port)
        sent_tokenize = interface.split_doc
        word_tokenize = interface.split_sent
    else:
        raise Exception()

    if not args.split:
        sent_tokenize = lambda para: [para]

    # 1. load data
    source_path = in_path or os.path.join(
        args.source_dir, "{}-v{}.json".format(data_type, args.version))
    source_data = json.load(open(source_path, 'r'))
    # load the train  data or dev 1.1 dataset

    q, cq, y, rx, rcx, ids, idxs = [], [], [], [], [], [], []
    cy = []
    x, cx = [], []
    answerss = []
    p = []
    word_counter, char_counter, lower_word_counter = Counter(), Counter(
    ), Counter()
    start_at_index = int(round(len(source_data['data']) * start_ratio))
    stop_at_index = int(round(len(source_data['data']) * stop_ratio))

    # for each article
    for article_index, article in enumerate(
            tqdm(source_data['data'][start_at_index:stop_at_index])):
        xp, cxp = [], []
        pp = []
        x.append(xp)  # article_paragraph_sentence_wordlist
        cx.append(cxp)  # article_paragraph_sentence_word_charlist
        p.append(pp)  # article_contextlist

        # for each paragrph of the article
        for paragraph_index, paragraph in enumerate(article['paragraphs']):
            # wordss
            context = paragraph['context']
            context = context.replace("''", '" ')
            context = context.replace("``", '" ')
            list_of_wordlist = list(map(word_tokenize, sent_tokenize(context)))
            list_of_wordlist = [
                process_tokens(tokens) for tokens in list_of_wordlist
            ]  # process tokens
            # xi are words
            # given xi, add chars
            list_of_charlist = [[list(word) for word in word_list]
                                for word_list in list_of_wordlist]
            # cxi are characters for each words
            xp.append(list_of_wordlist)  # paragraph_sentence_wordlist
            cxp.append(list_of_charlist)  # paragraph_sentence_word_charlist
            pp.append(context)  # contextlist

            # update the counter to plus the number of questions
            for wordlist in list_of_wordlist:
                for word in wordlist:
                    word_counter[word] += len(paragraph['qas'])
                    lower_word_counter[word.lower()] += len(paragraph['qas'])
                    for char in word:
                        char_counter[char] += len(paragraph['qas'])

            rxi = [article_index, paragraph_index]
            assert len(x) - 1 == article_index
            assert len(x[article_index]) - 1 == paragraph_index
            for question in paragraph['qas']:
                # get words
                question_wordslist = word_tokenize(question['question'])
                question_charslist = [list(qij) for qij in question_wordslist]
                yi = []
                cyi = []
                answers = []
                for answer in question['answers']:
                    answer_text = answer['text']
                    answers.append(answer_text)
                    answer_start = answer['answer_start']
                    answer_stop = answer_start + len(answer_text)
                    # TODO : put some function that gives word_start, word_stop here
                    yi0, yi1 = get_word_span(context, list_of_wordlist,
                                             answer_start, answer_stop)
                    # yi0 = answer['answer_word_start'] or [0, 0]
                    # yi1 = answer['answer_word_stop'] or [0, 1]
                    assert len(list_of_wordlist[yi0[0]]) > yi0[1]
                    assert len(list_of_wordlist[yi1[0]]) >= yi1[1]
                    w0 = list_of_wordlist[yi0[0]][yi0[1]]
                    w1 = list_of_wordlist[yi1[0]][yi1[1] - 1]
                    i0 = get_word_idx(context, list_of_wordlist, yi0)
                    i1 = get_word_idx(context, list_of_wordlist,
                                      (yi1[0], yi1[1] - 1))
                    cyi0 = answer_start - i0
                    cyi1 = answer_stop - i1 - 1
                    # print(answer_text, w0[cyi0:], w1[:cyi1+1])
                    assert answer_text[0] == w0[cyi0], (answer_text, w0, cyi0)
                    assert answer_text[-1] == w1[cyi1]
                    assert cyi0 < 32, (answer_text, w0)
                    assert cyi1 < 32, (answer_text, w1)

                    yi.append([yi0, yi1])
                    cyi.append([cyi0, cyi1])

                for qij in question_wordslist:
                    word_counter[qij] += 1
                    lower_word_counter[qij.lower()] += 1
                    for qijk in qij:
                        char_counter[qijk] += 1

                q.append(question_wordslist)  # question_wordlist,
                cq.append(question_charslist)  # question_word_charlist
                y.append(yi)  #
                cy.append(cyi)
                rx.append(rxi)
                rcx.append(rxi)
                ids.append(question['id'])
                idxs.append(len(idxs))
                answerss.append(answers)

            if args.debug:
                break

    word2vec_dict = get_word2vec(args, word_counter)
    lower_word2vec_dict = get_word2vec(args, lower_word_counter)

    # add context here
    data = {
        'q': q,  # word list of each question, e.g. [['who', 'are', 'you'], ...]
        'cq': cq,  # character lists of each question word, e.g. [[['w', 'h', 'o'], ...], ...]
        'y': y,  # answer spans per question as (sentence, word) start/stop pairs, e.g. [[(0, 108), (0, 111)]]
        '*x': rx,  # [article_index, paragraph_index] reference of each question, e.g. [0, 21]
        '*cx': rcx,  # same references as rx, used for the character tensors
        'cy': cy,  # character offsets of the answer inside its first and last word
        'idxs': idxs,  # running index of each question
        'ids': ids,  # the id of each question (a uuid-like string)
        'answerss': answerss,  # the raw answer texts
        '*p': rx  # same paragraph references
    }
    # the following variables are shared across questions
    shared = {
        'x': x,  # words of each paragraph
        'cx': cx,  # characters of each
        'p': p,  # the content of each paragraph
        'word_counter': word_counter,
        'char_counter': char_counter,
        'lower_word_counter': lower_word_counter,
        'word2vec': word2vec_dict,
        'lower_word2vec': lower_word2vec_dict
    }

    print("saving ...")
Exemple #13
0
def prepro_each(args, data_type, start_ratio=0.0, stop_ratio=1.0, out_name="default", in_path=None):
    if args.tokenizer == "PTB":
        import nltk
        sent_tokenize = nltk.sent_tokenize
        def word_tokenize(tokens):
 #           return [token.replace("''", '"').replace("``", '"') for token in nltk.word_tokenize(tokens)]
            return [token.replace("''", '"').replace("``", '"') for token in jieba.lcut(tokens, cut_all=False)]
    elif args.tokenizer == 'Stanford':
        from my.corenlp_interface import CoreNLPInterface
        interface = CoreNLPInterface(args.url, args.port)
        sent_tokenize = interface.split_doc
        word_tokenize = interface.split_sent
    else:
        raise Exception()

#    if not args.split:
#        sent_tokenize = lambda para: [para]

    source_path = in_path or os.path.join(args.source_dir, "{}-v1.1.json".format(data_type))
    source_data = json.load(open(source_path, 'r', encoding='utf-8'))

    q, cq, y, rx, rcx, ids, idxs = [], [], [], [], [], [], []
    cy = []
    x, cx = [], []
    answerss = []
    p = []
    word_counter, char_counter, lower_word_counter = Counter(), Counter(), Counter()
    start_ai = int(round(len(source_data['data']) * start_ratio))
    stop_ai = int(round(len(source_data['data']) * stop_ratio))
    for ai, article in enumerate(tqdm(source_data['data'][start_ai:stop_ai])):
        xp, cxp = [], []
        pp = []
        x.append(xp)
        cx.append(cxp)
        p.append(pp)
        for pi, para in enumerate(article['paragraphs']):
            # wordss
            context = para['context']
            context = context.replace("''", '" ')
            context = context.replace("``", '" ')
            
            ################### add by zhijing
            table = {ord(f):ord(t) for f,t in zip(
            	',。!?【】()%#@&1234567890',
            	',.!?[]()%#@&1234567890')}
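            # translate() maps full-width punctuation and digits (',。!?' ...) to their
            # ASCII equivalents (',.!?' ...) so the jieba-tokenised Chinese context lines
            # up with the character-offset arithmetic performed further down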
            context = context.translate(table)
            ################### add by zhijing
            print(context)
            print(len(sent_tokenize(context)))
            xi = list(map(word_tokenize, sent_tokenize(context)))
            xi = [process_tokens(tokens) for tokens in xi]  # process tokens
            print('xi')
            print(xi)
            # given xi, add chars
            cxi = [[list(xijk) for xijk in xij] for xij in xi]
            xp.append(xi)
            cxp.append(cxi)
            pp.append(context)

            for xij in xi:
                for xijk in xij:
                    word_counter[xijk] += len(para['qas'])
                    lower_word_counter[xijk.lower()] += len(para['qas'])
                    for xijkl in xijk:
                        char_counter[xijkl] += len(para['qas'])

            rxi = [ai, pi]
            assert len(x) - 1 == ai
            assert len(x[ai]) - 1 == pi
            for qa in para['qas']:
                # get words
                qi = word_tokenize(qa['question'])
                cqi = [list(qij) for qij in qi]
                yi = []
                cyi = []
                answers = []
                for answer in qa['answers']:
                    answer_text = answer['text']
                    answers.append(answer_text)
                    answer_start = answer['answer_start']
                    answer_stop = answer_start + len(answer_text)
                    # TODO : put some function that gives word_start, word_stop here
                    yi0, yi1 = get_word_span(context, xi, answer_start, answer_stop)
                    # yi0 = answer['answer_word_start'] or [0, 0]
                    # yi1 = answer['answer_word_stop'] or [0, 1]
                    assert len(xi[yi0[0]]) > yi0[1]
                    assert len(xi[yi1[0]]) >= yi1[1]
                    w0 = xi[yi0[0]][yi0[1]]
                    w1 = xi[yi1[0]][yi1[1]-1]
                    i0 = get_word_idx(context, xi, yi0)
                    i1 = get_word_idx(context, xi, (yi1[0], yi1[1]-1))
                    cyi0 = answer_start - i0
                    cyi1 = answer_stop - i1 - 1
                    # print(answer_text, w0[cyi0:], w1[:cyi1+1])
                    assert answer_text[0] == w0[cyi0], (answer_text, w0, cyi0)
                    assert answer_text[-1] == w1[cyi1]
                    assert cyi0 < 32, (answer_text, w0)
                    assert cyi1 < 32, (answer_text, w1)

                    yi.append([yi0, yi1])
                    cyi.append([cyi0, cyi1])

                for qij in qi:
                    word_counter[qij] += 1
                    lower_word_counter[qij.lower()] += 1
                    for qijk in qij:
                        char_counter[qijk] += 1

                q.append(qi)
                cq.append(cqi)
                y.append(yi)
                cy.append(cyi)
                rx.append(rxi)
                rcx.append(rxi)
                ids.append(qa['id'])
                idxs.append(len(idxs))
                answerss.append(answers)

            if args.debug:
                break

    word2vec_dict = get_word2vec(args, word_counter)
    lower_word2vec_dict = get_word2vec(args, lower_word_counter)

    # add context here
    data = {'q': q, 'cq': cq, 'y': y, '*x': rx, '*cx': rcx, 'cy': cy,
            'idxs': idxs, 'ids': ids, 'answerss': answerss, '*p': rx}
    shared = {'x': x, 'cx': cx, 'p': p,
              'word_counter': word_counter, 'char_counter': char_counter, 'lower_word_counter': lower_word_counter,
              'word2vec': word2vec_dict, 'lower_word2vec': lower_word2vec_dict}

    print("saving ...")
    save(args, data, shared, out_name)
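
A minimal driver for these preprocessing functions might look like the sketch below. This is an assumption inferred from the call signatures (an args namespace carrying tokenizer, source_dir, target_dir, split, debug and the word2vec settings), not the project's actual prepro() entry point.

import os

def prepro(args):
    # hypothetical entry point: preprocess each split and save it under args.target_dir
    if not os.path.exists(args.target_dir):
        os.makedirs(args.target_dir)
    prepro_each(args, 'train', out_name='train')
    prepro_each(args, 'dev', out_name='dev')
    prepro_each(args, 'dev', out_name='test')  # dev data reused as the test split here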
Exemple #14
0
def prepro_each(args, data_type, start_ratio=0.0, stop_ratio=1.0, out_name="default", in_path=None):
    if args.tokenizer == "PTB":
        import nltk
        sent_tokenize = nltk.sent_tokenize
        def word_tokenize(tokens):
            return [token.replace("''", '"').replace("``", '"') for token in nltk.word_tokenize(tokens)]
    elif args.tokenizer == 'Stanford':
        from my.corenlp_interface import CoreNLPInterface
        interface = CoreNLPInterface(args.url, args.port)
        sent_tokenize = interface.split_doc
        word_tokenize = interface.split_sent
    else:
        raise Exception()

    if not args.split:
        sent_tokenize = lambda para: [para]

    source_path = in_path or os.path.join(args.source_dir, "{}-{}v1.1.json".format(data_type, args.suffix))
    source_data = json.load(open(source_path, 'r', encoding="utf-8"))

    q, cq, y, rx, rcx, ids, idxs = [], [], [], [], [], [], []
    na = []
    cy = []
    x, cx = [], []
    answerss = []
    p = []
    word_counter, char_counter, lower_word_counter = Counter(), Counter(), Counter()
    start_ai = int(round(len(source_data['data']) * start_ratio))
    stop_ai = int(round(len(source_data['data']) * stop_ratio))
    for ai, article in enumerate(tqdm(source_data['data'][start_ai:stop_ai])):
        xp, cxp = [], []
        pp = []
        x.append(xp)
        cx.append(cxp)
        p.append(pp)
        for pi, para in enumerate(article['paragraphs']):
            # wordss
            context = para['context']
            context = context.replace("''", '" ')
            context = context.replace("``", '" ')
            xi = list(map(word_tokenize, sent_tokenize(context)))
            xi = [process_tokens(tokens) for tokens in xi]  # process tokens
            # given xi, add chars
            cxi = [[list(xijk) for xijk in xij] for xij in xi]
            xp.append(xi)
            cxp.append(cxi)
            pp.append(context)

            for xij in xi:
                for xijk in xij:
                    word_counter[xijk] += len(para['qas'])
                    lower_word_counter[xijk.lower()] += len(para['qas'])
                    for xijkl in xijk:
                        char_counter[xijkl] += len(para['qas'])

            rxi = [ai, pi]
            assert len(x) - 1 == ai
            assert len(x[ai]) - 1 == pi
            for qa in para['qas']:
                # get words
                qi = word_tokenize(qa['question'])
                qi = process_tokens(qi)
                cqi = [list(qij) for qij in qi]
                yi = []
                cyi = []
                answers = []
                for answer in qa['answers']:
                    answer_text = answer['text']
                    answers.append(answer_text)
                    answer_start = answer['answer_start']
                    answer_stop = answer_start + len(answer_text)
                    # TODO : put some function that gives word_start, word_stop here
                    yi0, yi1 = get_word_span(context, xi, answer_start, answer_stop)
                    # yi0 = answer['answer_word_start'] or [0, 0]
                    # yi1 = answer['answer_word_stop'] or [0, 1]
                    assert len(xi[yi0[0]]) > yi0[1]
                    assert len(xi[yi1[0]]) >= yi1[1]
                    w0 = xi[yi0[0]][yi0[1]]
                    w1 = xi[yi1[0]][yi1[1]-1]
                    i0 = get_word_idx(context, xi, yi0)
                    i1 = get_word_idx(context, xi, (yi1[0], yi1[1]-1))
                    cyi0 = answer_start - i0
                    cyi1 = answer_stop - i1 - 1
                    #add
		    
                    #print("i0 :",i0, "i1 :",i1, "cyi0 :", cyi0, "w0 :",w0 )
                    #print("xi :", xi)
                    #print( "yi0",yi0, "(yi1[0], yi1[1]-1) :",(yi1[0], yi1[1]-1) )
                    #print("answer_text",answer_text)
                    #print("cyi1:",cyi1)
                    #print("answer_text[0] :",answer_text[0])
                    #print("answer_text[-1] :",answer_text[-1])
                    #print("w0 :",w0)
                    #print("w1 :",w1)
                    #so far

                    #print(":):):)")
                    #print("answer_text:",answer_text,"\nstart:", w0[cyi0:],"\nend:", w1[:cyi1+1])
                    assert answer_text[0] == w0[cyi0], (answer_text, w0, cyi0)
                    assert answer_text[-1] == w1[cyi1]
                    assert cyi0 < 32, (answer_text, w0)
                    assert cyi1 < 32, (answer_text, w1)

                    yi.append([yi0, yi1])
                    cyi.append([cyi0, cyi1])

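                # unanswerable question: store a dummy (0, 0)-(0, 1) span and flag it via na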
                if len(qa['answers']) == 0:
                    yi.append([(0, 0), (0, 1)])
                    cyi.append([0, 1])
                    na.append(True)
                else:
                    na.append(False)

                for qij in qi:
                    word_counter[qij] += 1
                    lower_word_counter[qij.lower()] += 1
                    for qijk in qij:
                        char_counter[qijk] += 1

                q.append(qi)
                cq.append(cqi)
                y.append(yi)
                cy.append(cyi)
                rx.append(rxi)
                rcx.append(rxi)
                ids.append(qa['id'])
                idxs.append(len(idxs))
                answerss.append(answers)

        if args.debug:
            break

    word2vec_dict = get_word2vec(args, word_counter)
    lower_word2vec_dict = get_word2vec(args, lower_word_counter)

    # add context here
    data = {'q': q, 'cq': cq, 'y': y, '*x': rx, '*cx': rcx, 'cy': cy,
            'idxs': idxs, 'ids': ids, 'answerss': answerss, '*p': rx, 'na': na}
    shared = {'x': x, 'cx': cx, 'p': p,
              'word_counter': word_counter, 'char_counter': char_counter, 'lower_word_counter': lower_word_counter,
              'word2vec': word2vec_dict, 'lower_word2vec': lower_word2vec_dict}

    print("saving ...")
    save(args, data, shared, out_name)
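
process_tokens is applied right after tokenisation in every variant but is never defined in these snippets. A plausible minimal version is sketched below (an assumption, not the project's exact code): it splits tokens further on dashes, slashes and typographic quotes so that token boundaries line up with the raw character offsets consumed by get_word_span. Note that re.split with a capturing group can emit empty strings, which is consistent with the empty-token guards seen in some of the variants that follow.

import re

def process_tokens(temp_tokens):
    # split each token on dash/slash/quote-like characters (illustrative sketch)
    seps = ("-", "\u2212", "\u2014", "\u2013", "/", "~", '"', "'",
            "\u201C", "\u2019", "\u201D", "\u2018", "\u00B0")
    tokens = []
    for token in temp_tokens:
        tokens.extend(re.split("([{}])".format("".join(seps)), token))
    return tokens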
Exemple #15
0
def prepro_each(args, data_type, start_ratio=0.0, stop_ratio=1.0, out_name="default", in_path=None):
    if args.tokenizer == "PTB":
        import nltk
        sent_tokenize = nltk.sent_tokenize
        def word_tokenize(tokens):
            return [token.replace("''", '"').replace("``", '"') for token in nltk.word_tokenize(tokens)]
    elif args.tokenizer == 'Stanford':
        from my.corenlp_interface import CoreNLPInterface
        interface = CoreNLPInterface(args.url, args.port)
        sent_tokenize = interface.split_doc
        word_tokenize = interface.split_sent
    else:
        raise Exception()

    if not args.split:
        sent_tokenize = lambda para: [para]

    source_path = in_path or os.path.join(args.source_dir, "{}-qar_squad_all.jsonl".format(data_type))
    rfp = open(source_path, 'r')

    q, cq, y, rx, rcx, ids, idxs = [], [], [], [], [], [], []
    cy = []
    x, cx = [], []
    answerss = []
    p = []
    word_counter, char_counter, lower_word_counter = Counter(), Counter(), Counter()
    #start_ai = int(round(len(source_data['data']) * start_ratio))
    #stop_ai = int(round(len(source_data['data']) * stop_ratio))
    pi = 0
    ai = 0
    xp, cxp = [], []
    pp = []
    x.append(xp)
    cx.append(cxp)
    p.append(pp)

    for line in tqdm(rfp):
        para = json.loads(line)
        context = para['context']
        context = context.replace("''", '" ')
        context = context.replace("``", '" ')
        xi = list(map(word_tokenize, sent_tokenize(context)))
        # xi = context.split()
        xi = [process_tokens(tokens) for tokens in xi]  # process tokens
        # given xi, add chars
        cxi = [[list(xijk) for xijk in xij] for xij in xi]
        xp.append(xi)
        cxp.append(cxi)
        pp.append(context)

        for xij in xi:
            for xijk in xij:
                word_counter[xijk] += len(para['qas'])
                lower_word_counter[xijk.lower()] += len(para['qas'])
                for xijkl in xijk:
                    char_counter[xijkl] += len(para['qas'])

        rxi = [ai, pi]
        assert len(x) - 1 == ai
        assert len(x[ai]) - 1 == pi
        for qa in para['qas']:
            # get words
            qa_text = qa['question']

            qa_text = qa_text.replace("''", '" ')
            qa_text = qa_text.replace("``", '" ')

            qi = word_tokenize(qa_text)

            # qi = qa['question'].split()
            cqi = [list(qij) for qij in qi]
            yi = []
            cyi = []
            answers = []
            for answer in qa['answers']:
                flag = False
                answer_text = answer['text']

                answer_text = answer_text.replace("''", '" ')
                answer_text = answer_text.replace("``", '" ')

                answers.append(answer_text)
                answer_start = answer['answer_start']
                answer_stop = answer_start + len(answer_text)
                # TODO : put some function that gives word_start, word_stop here
                yi0, yi1 = get_word_span(context, xi, answer_start, answer_stop)
                # yi0 = answer['answer_word_start'] or [0, 0]
                # yi1 = answer['answer_word_stop'] or [0, 1]
                assert len(xi[yi0[0]]) > yi0[1]
                assert len(xi[yi1[0]]) >= yi1[1]
                w0 = xi[yi0[0]][yi0[1]]
                w1 = xi[yi1[0]][yi1[1]-1]

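                # workaround for empty tokens produced by tokenisation: fall back to the
                # previous word and later compare against the answer's second-to-last character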
                if len(w1) == 0 and len(xi[yi1[0]][yi1[1]-2]) != 0:
                    flag = True
                    w1 = xi[yi1[0]][yi1[1]-2]

                i0 = get_word_idx(context, xi, yi0)
                i1 = get_word_idx(context, xi, (yi1[0], yi1[1]-1))
                cyi0 = answer_start - i0
                cyi1 = answer_stop - i1 - 1
                # print(answer_text, w0[cyi0:], w1[:cyi1+1])
                assert answer_text[0] == w0[cyi0], (answer_text, w0, cyi0)

                if flag:
                    assert answer_text[-2] == w1[cyi1], (answer_text, w1, cyi1)
                else:
                    assert answer_text[-1] == w1[cyi1], (answer_text, w1, cyi1)

                assert cyi0 < 32, (answer_text, w0)
                assert cyi1 < 32, (answer_text, w1)

                yi.append([yi0, yi1])
                cyi.append([cyi0, cyi1])

            for qij in qi:
                word_counter[qij] += 1
                lower_word_counter[qij.lower()] += 1
                for qijk in qij:
                    char_counter[qijk] += 1

            q.append(qi)
            cq.append(cqi)
            y.append(yi)
            cy.append(cyi)
            rx.append(rxi)
            rcx.append(rxi)
            ids.append(qa['id'])
            idxs.append(len(idxs))
            answerss.append(answers)

        if args.debug:
            break

        pi += 1


    word2vec_dict = get_word2vec(args, word_counter)
    lower_word2vec_dict = get_word2vec(args, lower_word_counter)

    # add context here
    data = {'q': q, 'cq': cq, 'y': y, '*x': rx, '*cx': rcx, 'cy': cy,
            'idxs': idxs, 'ids': ids, 'answerss': answerss, '*p': rx}
    shared = {'x': x, 'cx': cx, 'p': p,
              'word_counter': word_counter, 'char_counter': char_counter, 'lower_word_counter': lower_word_counter,
              'word2vec': word2vec_dict, 'lower_word2vec': lower_word2vec_dict}

    print("Saving ...")
    save(args, data, shared, out_name)
    print("Saving complete!")
Exemple #16
0
def prepro_each(args,
                data_type,
                start_ratio=0.0,
                stop_ratio=1.0,
                out_name="default",
                in_path=None):
    if args.tokenizer == "PTB":
        import nltk
        sent_tokenize = nltk.sent_tokenize

        def word_tokenize(tokens):
            return [
                token.replace("''", '"').replace("``", '"')
                for token in nltk.word_tokenize(tokens)
            ]
    elif args.tokenizer == 'Stanford':
        from my.corenlp_interface import CoreNLPInterface
        interface = CoreNLPInterface(args.url, args.port)
        sent_tokenize = interface.split_doc
        word_tokenize = interface.split_sent
    else:
        raise Exception()

    if not args.split:
        sent_tokenize = lambda para: [para]

    source_path = in_path or os.path.join(args.source_dir,
                                          "{}-v1.1.json".format(data_type))
    source_data = json.load(open(source_path, 'r'))

    q, cq, y, rx, rcx, ids, idxs = [], [], [], [], [], [], []
    contextss = []
    context_questions = []
    titles = []
    cy = []
    x, cx = [], []
    answerss = []
    p = []
    word_counter, char_counter, lower_word_counter = Counter(), Counter(
    ), Counter()
    start_ai = int(round(len(source_data['data']) * start_ratio))
    stop_ai = int(round(len(source_data['data']) * stop_ratio))
    for ai, article in enumerate(tqdm(source_data['data'][start_ai:stop_ai])):
        pp = []
        p.append(pp)
        xp, cxp, contexts, c_questions = [], [], [], []
        x.append(xp)
        cx.append(cxp)
        contextss.append(contexts)
        context_questions.append(c_questions)
        title = "[" + str(ai).zfill(2) + "] " + article['title'].replace(
            '_', ' ')
        titles.append(title)
        for pi, para in enumerate(article['paragraphs']):
            # wordss
            context = para['context']
            context = context.replace("''", '" ')
            context = context.replace("``",
                                      '" ')  #Sentences of priginal Paragraph
            contexts.append(context)
            xi = list(map(word_tokenize, sent_tokenize(context)))
            xi = [process_tokens(tokens) for tokens in xi]  # process tokens
            # given xi, add chars
            cxi = [[list(xijk) for xijk in xij] for xij in xi]
            xp.append(xi)
            cxp.append(cxi)
            pp.append(context)

            for xij in xi:
                for xijk in xij:
                    word_counter[xijk] += len(para['qas'])
                    lower_word_counter[xijk.lower()] += len(para['qas'])
                    for xijkl in xijk:
                        char_counter[xijkl] += len(para['qas'])

            rxi = [ai, pi]
            assert len(x) - 1 == ai
            assert len(x[ai]) - 1 == pi
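            # keep a single representative question per paragraph for this variant
            # (the 4th question for the first article, the 1st question otherwise)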
            if ai == 0: c_questions.append(para['qas'][3]['question'])
            else: c_questions.append(para['qas'][0]['question'])
            """
            for qa in para['qas']:
                # get words
                c_questions.append(qa['question'])
                break
                qi = word_tokenize(qa['question']) # qa['question'] : original question
                cqi = [list(qij) for qij in qi]
                yi = []
                cyi = []
                answers = []
                
                for answer in qa['answers']:
                    answer_text = answer['text']
                    answers.append(answer_text)
                    answer_start = answer['answer_start']
                    answer_stop = answer_start + len(answer_text)
                    # TODO : put some function that gives word_start, word_stop here
                    yi0, yi1 = get_word_span(context, xi, answer_start, answer_stop)
                    # yi0 = answer['answer_word_start'] or [0, 0]
                    # yi1 = answer['answer_word_stop'] or [0, 1]
                    assert len(xi[yi0[0]]) > yi0[1]
                    assert len(xi[yi1[0]]) >= yi1[1]
                    w0 = xi[yi0[0]][yi0[1]]
                    w1 = xi[yi1[0]][yi1[1]-1]
                    i0 = get_word_idx(context, xi, yi0)
                    i1 = get_word_idx(context, xi, (yi1[0], yi1[1]-1))
                    cyi0 = answer_start - i0
                    cyi1 = answer_stop - i1 - 1
                    # print(answer_text, w0[cyi0:], w1[:cyi1+1])
                    assert answer_text[0] == w0[cyi0], (answer_text, w0, cyi0)
                    assert answer_text[-1] == w1[cyi1]
                    assert cyi0 < 32, (answer_text, w0)
                    assert cyi1 < 32, (answer_text, w1)

                    yi.append([yi0, yi1])
                    cyi.append([cyi0, cyi1])

                for qij in qi:
                    word_counter[qij] += 1
                    lower_word_counter[qij.lower()] += 1
                    for qijk in qij:
                        char_counter[qijk] += 1

                q.append(qi)
                cq.append(cqi)
                y.append(yi)
                cy.append(cyi)
                rx.append(rxi)
                rcx.append(rxi)
                ids.append(qa['id'])
                idxs.append(len(idxs))
                answerss.append(answers)
            """
            if args.debug:
                break
    word2vec_dict = get_word2vec(args, word_counter)
    lower_word2vec_dict = get_word2vec(args, lower_word_counter)

    # add context here
    data = {
        'q': q,
        'cq': cq,
        'y': y,
        '*x': rx,
        '*cx': rcx,
        'cy': cy,
        'idxs': idxs,
        'ids': ids,
        'answerss': answerss,
        '*p': rx
    }
    shared = {
        'x': x,
        'cx': cx,
        'p': p,
        'contextss': contextss,
        'context_questions': context_questions,
        'titles': titles,
        'word_counter': word_counter,
        'char_counter': char_counter,
        'lower_word_counter': lower_word_counter,
        'word2vec': word2vec_dict,
        'lower_word2vec': lower_word2vec_dict
    }
    print("saving ...")
    save(args, data, shared, out_name)
Exemple #17
0
def prepro_each(args,
                data_type,
                start_ratio=0.0,
                stop_ratio=1.0,
                out_name="default",
                in_path=None):
    if args.tokenizer == "PTB":
        import nltk
        sent_tokenize = nltk.sent_tokenize

        def word_tokenize(tokens):
            return [
                token.replace("''", '"').replace("``", '"')
                for token in nltk.word_tokenize(tokens)
            ]
    elif args.tokenizer == 'Stanford':
        from my.corenlp_interface import CoreNLPInterface
        interface = CoreNLPInterface(args.url, args.port)
        sent_tokenize = interface.split_doc
        word_tokenize = interface.split_sent
    else:
        raise Exception()

    if not args.split:
        sent_tokenize = lambda para: [para]

    source_path = in_path or os.path.join(args.source_dir,
                                          "{}.seq.json".format(data_type))
    source_data = json.load(open(source_path, 'r'))

    q, cq, y, rx, rcx, ids, idxs = [], [], [], [], [], [], []
    x, cx = [], []
    answerss = []
    p = []
    word_counter, char_counter, lower_word_counter = Counter(), Counter(
    ), Counter()
    start_ai = int(round(len(source_data['data']) * start_ratio))
    stop_ai = int(round(len(source_data['data']) * stop_ratio))
    for ai, article in enumerate(tqdm(source_data['data'][start_ai:stop_ai])):
        xp, cxp = [], []
        pp = []
        x.append(xp)
        cx.append(cxp)
        p.append(pp)
        for pi, para in enumerate(article['paragraphs']):
            # wordss
            context = para['context']
            context = context.replace("''", '" ')
            context = context.replace("``", '" ')
            xi = list(map(word_tokenize, sent_tokenize(context)))
            xi = [process_tokens(tokens) for tokens in xi]  # process tokens
            xi = [[xijk for xijk in xij if xijk != ''] for xij in xi]
            # given xi, add chars
            cxi = [[list(xijk) for xijk in xij] for xij in xi]
            xp.append(xi)
            cxp.append(cxi)
            pp.append(context)

            for xij in xi:
                for xijk in xij:
                    word_counter[xijk] += len(para['qas'])
                    lower_word_counter[xijk.lower()] += len(para['qas'])
                    for xijkl in xijk:
                        char_counter[xijkl] += len(para['qas'])

            rxi = [ai, pi]
            assert len(x) - 1 == ai
            assert len(x[ai]) - 1 == pi
            for qa in para['qas']:
                # get words
                qi = word_tokenize(qa['question'])
                cqi = [list(qij) for qij in qi]
                yi = []
                answers = []
                for answer in qa['answers']:
                    answer_text = answer['text']
                    answers.append(answer_text)
                    ansi = word_tokenize(answer_text)
                    answer_location = answer['answer_location']
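                    # answer_location holds one character offset per answer token;
                    # each token is mapped to its own (sentence, word) index below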
                    yii = []
                    for ans_idx, answer_start in enumerate(answer_location):
                        answer_stop = answer_start + len(ansi[ans_idx])
                        yi0, _ = get_word_span(context, xi, answer_start,
                                               answer_stop)
                        assert len(xi[yi0[0]]) > yi0[1]
                        w0 = xi[yi0[0]][yi0[1]]
                        assert ansi[ans_idx] == w0, (ansi[ans_idx], w0)
                        yii.append(yi0)

                    yi.append(yii)

                    # answer_start = answer['answer_start']
                    # answer_stop = answer_start + len(answer_text)
                    # yi0, yi1 = get_word_span(context, xi, answer_start, answer_stop)
                    # # yi0 = answer['answer_word_start'] or [0, 0]
                    # # yi1 = answer['answer_word_stop'] or [0, 1]
                    # assert len(xi[yi0[0]]) > yi0[1]
                    # assert len(xi[yi1[0]]) >= yi1[1]
                    # w0 = xi[yi0[0]][yi0[1]]
                    # w1 = xi[yi1[0]][yi1[1]-1]
                    # i0 = get_word_idx(context, xi, yi0)
                    # i1 = get_word_idx(context, xi, (yi1[0], yi1[1]-1))
                    # cyi0 = answer_start - i0
                    # cyi1 = answer_stop - i1 - 1
                    # # print(answer_text, w0[cyi0:], w1[:cyi1+1])
                    # assert answer_text[0] == w0[cyi0], (answer_text, w0, cyi0)
                    # assert answer_text[-1] == w1[cyi1]
                    # assert cyi0 < 32, (answer_text, w0)
                    # assert cyi1 < 32, (answer_text, w1)
                    # yi.append([yi0, yi1])

                for qij in qi:
                    word_counter[qij] += 1
                    lower_word_counter[qij.lower()] += 1
                    for qijk in qij:
                        char_counter[qijk] += 1

                q.append(qi)
                cq.append(cqi)
                y.append(yi)
                rx.append(rxi)
                rcx.append(rxi)
                ids.append(qa['id'])
                idxs.append(len(idxs))
                answerss.append(answers)

            if args.debug:
                break

    word2vec_dict = get_word2vec(args, word_counter)
    lower_word2vec_dict = get_word2vec(args, lower_word_counter)

    # add context here
    data = {
        'q': q,
        'cq': cq,
        'y': y,
        '*x': rx,
        '*cx': rcx,
        'idxs': idxs,
        'ids': ids,
        'answerss': answerss,
        '*p': rx
    }
    shared = {
        'x': x,
        'cx': cx,
        'p': p,
        'word_counter': word_counter,
        'char_counter': char_counter,
        'lower_word_counter': lower_word_counter,
        'word2vec': word2vec_dict,
        'lower_word2vec': lower_word2vec_dict
    }

    print("saving ...")
    save(args, data, shared, out_name)
Exemple #18
0
def prepro_each(args,
                data_type,
                start_ratio=0.0,
                stop_ratio=1.0,
                out_name="default",
                in_path=None):
    if args.tokenizer == "PTB":
        import nltk
        sent_tokenize = nltk.sent_tokenize

        def word_tokenize(tokens):
            return [
                token.replace("''", '"').replace("``", '"')
                for token in nltk.word_tokenize(tokens)
            ]
    elif args.tokenizer == 'Stanford':
        from my.corenlp_interface import CoreNLPInterface
        interface = CoreNLPInterface(args.url, args.port)
        sent_tokenize = interface.split_doc
        word_tokenize = interface.split_sent
    else:
        raise Exception()

    #if not args.split:
    #    sent_tokenize = lambda para: [para]

    source_path = in_path or os.path.join(args.source_dir, "third_party",
                                          "wikipedia")
    source_summaries = pd.read_csv(source_path + '/summaries.csv')
    source_qas = pd.read_csv(args.source_dir + '/qaps.csv')

    summaries = []
    summaries_char_list = []
    ques_answers = []
    questions = []
    questions_char_list = []
    document_ids = []
    word_counter, char_counter, lower_word_counter = Counter(), Counter(
    ), Counter()
    summary_index = -1
    for index_summ, row in tqdm(source_summaries.iterrows()):
        if data_type == row['set']:
            summary_tokenized_paras = []
            summary_char_para = []
            summary_tokenized = list(
                map(word_tokenize, sent_tokenize(row['summary_tokenized'])))
            summary_tokenized = [
                process_tokens(tokens) for tokens in summary_tokenized
            ]
            char_list = [[list(word) for word in sent]
                         for sent in summary_tokenized]
            #print ("summ",summary_tokenized)
            #print (char_list)
            summary_tokenized_paras.append(
                summary_tokenized)  # TODO:each summary has only one paragraph
            summaries.append(summary_tokenized_paras)
            summary_char_para.append(
                char_list)  # TODO:each summary has only one paragraph
            summaries_char_list.append(summary_char_para)
            # because train/valid/test rows all live in one file, index_summ cannot be used as the index
            summary_index = summary_index + 1

            qas = source_qas[source_qas['document_id'].isin(
                [row['document_id']])]

            for sent in summary_tokenized:
                for word in sent:
                    word_counter[word] += len(qas)
                    lower_word_counter[word.lower()] += len(qas)
                    for char in word:
                        char_counter[char] += len(qas)

            for index, qa in qas.iterrows():
                # a question spanning multiple sentences is not handled here
                # (most likely not required)
                question_tokenized = word_tokenize(qa['question'])
                question_tokenized = process_tokens(question_tokenized)
                #print (question_tokenized)
                question_char_list = [
                    list(word) for word in question_tokenized
                ]

                answer1_tokenized = list(
                    map(word_tokenize, sent_tokenize(qa['answer1'])))
                answer1_tokenized = [
                    process_tokens(tokens) for tokens in answer1_tokenized
                ]
                #print(answer1_tokenized)

                answer2_tokenized = list(
                    map(word_tokenize, sent_tokenize(qa['answer2'])))
                answer2_tokenized = [
                    process_tokens(tokens) for tokens in answer2_tokenized
                ]
                #print(answer2_tokenized)

                ques_answers.append([answer1_tokenized, answer2_tokenized])
                #print(ques_answers)

                questions.append(question_tokenized)
                questions_char_list.append(question_char_list)
                document_ids.append([summary_index, row['document_id']])

                for word in question_tokenized:
                    word_counter[word] += 1
                    lower_word_counter[word.lower()] += 1
                    for char in word:
                        char_counter[char] += 1

    word2vec_dict = get_word2vec(args, word_counter)
    lower_word2vec_dict = get_word2vec(args, lower_word_counter)

    data = {
        'q': questions,
        'cq': questions_char_list,
        '*x': document_ids,
        'answerss': ques_answers,
        '*cx': document_ids
    }
    shared = {
        'x': summaries,
        'cx': summaries_char_list,
        'word_counter': word_counter,
        'char_counter': char_counter,
        'lower_word_counter': lower_word_counter,
        'word2vec': word2vec_dict,
        'lower_word2vec': lower_word2vec_dict
    }

    print("saving ...")
    save(args, data, shared, out_name)
Exemple #19
0
def prepro_each(args,
                data_type,
                start_ratio=0.0,
                stop_ratio=1.0,
                out_name="default"):
    word_tokenize, sent_tokenize = get_sent_tokenize()

    source_data = []
    f = open(os.path.join(args.source_dir, 'WikiQA-%s.txt' % data_type), 'r')

    lines = (f.read()).rsplit('\n')
    for i, line in enumerate(lines):
        if line == '': continue
        t = tuple(line.rsplit('\t'))
        assert len(t) == 3, t
        question, sentence, correct = t
        curr_question = question
        if not sentence.endswith('.'):
            sentence += '.'

        _id = len(source_data)
        qas = [{'answer': correct, 'id': _id, 'question': question}]
        dic = {'context': sentence, 'qas': qas}
        source_data.append({'paragraphs': [dic]})

    json.dump({'data': source_data},
              open(os.path.join(args.source_dir, '%s-class.json' % data_type),
                   'w'))

    q, cq, y, rx, rcx, ids, idxs = [], [], [], [], [], [], []
    cy = []
    x, cx = [], []
    answerss = []
    p = []
    word_counter, char_counter, lower_word_counter = Counter(), Counter(
    ), Counter()
    start_ai = int(round(len(source_data) * start_ratio))
    stop_ai = int(round(len(source_data) * stop_ratio))

    answer_counter = Counter()
    N = 0
    for ai, article in enumerate(tqdm(source_data[start_ai:stop_ai])):
        xp, cxp = [], []
        pp = []
        x.append(xp)
        cx.append(cxp)
        p.append(pp)
        for pi, para in enumerate(article['paragraphs']):
            context = para['context']
            context = context.replace("''", '" ')
            context = context.replace("``", '" ')
            xi = list(map(word_tokenize, sent_tokenize(context)))
            xi = [process_tokens(tokens) for tokens in xi]  # process tokens
            cxi = [[list(xijk) for xijk in xij] for xij in xi]
            xp.append(xi)
            cxp.append(cxi)
            pp.append(context)

            for xij in xi:
                for xijk in xij:
                    word_counter[xijk] += len(para['qas'])
                    lower_word_counter[xijk.lower()] += len(para['qas'])
                    for xijkl in xijk:
                        char_counter[xijkl] += len(para['qas'])

            rxi = [ai, pi]
            assert len(x) - 1 == ai
            assert len(x[ai]) - 1 == pi
            for qa in para['qas']:
                N += 1
                qi = word_tokenize(qa['question'])
                cqi = [list(qij) for qij in qi]
                yi = []
                answers = []
                answer = qa['answer'] == '1'
                answer_counter[answer] += 1
                yi.append(answer)

                for qij in qi:
                    word_counter[qij] += 1
                    lower_word_counter[qij.lower()] += 1
                    for qijk in qij:
                        char_counter[qijk] += 1

                def put():
                    q.append(qi)
                    cq.append(cqi)
                    y.append(yi)
                    rx.append(rxi)
                    rcx.append(rxi)
                    ids.append(qa['id'])
                    idxs.append(len(idxs))
                    answerss.append(answers)

                put()
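                # repeat positive (correct-sentence) examples 18x in training (1 + 17),
                # presumably to offset WikiQA's heavy class imbalance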
                if data_type == 'train' and answer:
                    for i in range(17):
                        put()

            if args.debug:
                break
    word2vec_dict = get_word2vec(args, word_counter)
    lower_word2vec_dict = get_word2vec(args, lower_word_counter)

    # add context here
    data = {
        'q': q,
        'cq': cq,
        'y': y,
        '*x': rx,
        '*cx': rcx,
        'idxs': idxs,
        'ids': ids,
        'answerss': answerss,
        '*p': rx
    }
    shared = {
        'x': x,
        'cx': cx,
        'p': p,
        'word_counter': word_counter,
        'char_counter': char_counter,
        'lower_word_counter': lower_word_counter,
        'word2vec': word2vec_dict,
        'lower_word2vec': lower_word2vec_dict
    }

    print("saving ...")
    save(args, data, shared, out_name)
Exemple #20
0
def prepro_each(args,
                data_type,
                start_ratio=0.0,
                stop_ratio=1.0,
                out_name="default",
                in_path=None):
    print("Preprocessing data type %s" % data_type)
    if args.tokenizer == "PTB":
        import nltk
        sent_tokenize = nltk.sent_tokenize

        def word_tokenize(tokens):
            return [
                token.replace("''", '"').replace("``", '"')
                for token in nltk.word_tokenize(tokens)
            ]
    elif args.tokenizer == 'Stanford':
        from my.corenlp_interface import CoreNLPInterface
        interface = CoreNLPInterface(args.url, args.port)
        sent_tokenize = interface.split_doc
        word_tokenize = interface.split_sent
    else:
        raise Exception()

    if not args.split:
        sent_tokenize = lambda para: [para]

    source_path = in_path or os.path.join(args.source_dir,
                                          "{}.csv".format(data_type))
    print("Reading data from source path %s" % source_path)
    source_data = pd.read_csv(source_path,
                              encoding='utf-8',
                              dtype=dict(is_answer_absent=float),
                              na_values=dict(question=[],
                                             story_text=[],
                                             validated_answers=[]),
                              keep_default_na=False)

    q, cq, y, rx, rcx, ids, idxs = [], [], [], [], [], [], []
    cy = []
    x, cx = [], []
    answerss = []  # Gold standard answers
    span_answerss = []  # Answers from our spans
    p = []
    word_counter, char_counter, lower_word_counter = Counter(), Counter(
    ), Counter()
    start_ai = int(round(len(source_data) * start_ratio))
    stop_ai = int(round(len(source_data) * stop_ratio))
    data_rows = source_data.iterrows()
    story_ids_to_idx = {}
    idx_to_story_ids = {}

    for ai, data_point in enumerate(tqdm(data_rows)):
        question_index, question_info = data_point[0], data_point[1]
        story_id = question_info['story_id']
        context = question_info['story_text']
        context = context.replace("''", '" ')
        context = context.replace("``", '" ')
        question = question_info['question']
        question_id = ai
        answer_char_ranges = question_info['answer_char_ranges']

        # Adapted from the answer-extraction script of the NewsQA dataset
        baseline_answers = []
        # Prefer validated answers.
        # If there are no validated answers, use the ones that are provided.
        if ('validated_answers' not in question_info
                or not question_info['validated_answers']):
            # Ignore per selection splits.
            char_ranges = question_info['answer_char_ranges'].replace(
                '|', ',').split(',')
        else:
            validated_answers_dict = json.loads(
                question_info['validated_answers'])
            char_ranges = []
            for k, v in validated_answers_dict.items():
                char_ranges += v * [k]

        for char_range in char_ranges:
            if char_range.lower() == 'none':
                baseline_answers.append('NONE')
            elif ':' in char_range:
                start, end = map(int, char_range.split(':'))
                answer = question_info['story_text'][start:end]
                baseline_answers.append(answer)
        paragraph_ptr = -1
        pi = 0
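        # each story is stored as a single paragraph, so the within-story paragraph index stays 0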
        if story_id not in story_ids_to_idx:
            paragraph_ptr = len(story_ids_to_idx)
            story_ids_to_idx[story_id] = paragraph_ptr
            idx_to_story_ids[paragraph_ptr] = story_id
            xp, cxp = [], []
            pp = []
            x.append(xp)
            cx.append(cxp)
            p.append(pp)

            xi = list(map(word_tokenize, sent_tokenize(context)))
            xi = [process_tokens(tokens) for tokens in xi]  # process tokens

            # given xi, add chars
            cxi = [[list(xijk) for xijk in xij] for xij in xi]
            xp.append(xi)
            cxp.append(cxi)
            pp.append(context)

            for xij in xi:
                for xijk in xij:
                    word_counter[xijk] += 1
                    lower_word_counter[xijk.lower()] += 1
                    for xijkl in xijk:
                        char_counter[xijkl] += 1

        else:
            paragraph_ptr = story_ids_to_idx[story_id]
        rxi = [paragraph_ptr, pi]
        """
        print("TEST")
        print("TEST")
        print(story_ids_to_idx)
        print(len(xp))
        print(paragraph_ptr)
        """
        xi = x[paragraph_ptr][pi]

        qi = word_tokenize(question)
        cqi = [list(qij) for qij in qi]
        yi = []
        cyi = []
        answers = []
        answer_char_ranges_split = answer_char_ranges.split("|")
        for answer in answer_char_ranges_split:
            if answer == 'None':
                continue
            answer_char_range = answer.split(",")[0].split(":")
            answer_start = int(answer_char_range[0])
            answer_stop = int(answer_char_range[-1])
            answer_text = context[answer_start:answer_stop].strip()

            if answer_text == "":
                print("BAD ANSWER GIVEN %s" % answer_char_range)
                continue

            answers.append(answer_text)

            # TODO : put some function that gives word_start, word_stop here
            yi0, yi1 = get_word_span(context, xi, answer_start, answer_stop)
            # yi0 = answer['answer_word_start'] or [0, 0]
            # yi1 = answer['answer_word_stop'] or [0, 1]

            assert len(xi[yi0[0]]) > yi0[1]
            assert len(xi[yi1[0]]) >= yi1[1]
            w0 = xi[yi0[0]][yi0[1]]
            w1 = xi[yi1[0]][yi1[1] - 1]

            i0 = get_word_idx(context, xi, yi0)
            i1 = get_word_idx(context, xi, (yi1[0], yi1[1] - 1))
            cyi0 = answer_start - i0
            cyi1 = answer_stop - i1 - 1

            #print(question, answer_text, w0[cyi0:], w1[:cyi1+1])
            #assert answer_text[0] == w0[cyi0], (answer_text, w0, cyi0)
            #assert answer_text[-1] == w1[-1]
            assert cyi0 < 32, (answer_text, w0)
            assert cyi1 < 32, (answer_text, w1)

            yi.append([yi0, yi1])
            cyi.append([cyi0, cyi1])

        for qij in qi:
            word_counter[qij] += 1
            lower_word_counter[qij.lower()] += 1
            for qijk in qij:
                char_counter[qijk] += 1

        q.append(qi)
        cq.append(cqi)
        y.append(yi)
        cy.append(cyi)
        rx.append(rxi)
        rcx.append(rxi)
        ids.append(question_id)
        idxs.append(len(idxs))
        answerss.append(baseline_answers)
        span_answerss.append(answers)
        if args.debug:
            break

    word2vec_dict = get_word2vec(args, word_counter)
    lower_word2vec_dict = get_word2vec(args, lower_word_counter)

    # add context here
    data = {
        'q': q,
        'cq': cq,
        'y': y,
        '*x': rx,
        '*cx': rcx,
        'cy': cy,
        'idxs': idxs,
        'ids': ids,
        'answerss': answerss,
        'span_answerss': span_answerss,
        '*p': rx
    }
    shared = {
        'x': x,
        'cx': cx,
        'p': p,
        'story_ids_to_idx': story_ids_to_idx,
        'idx_to_story_ids': idx_to_story_ids,
        'word_counter': word_counter,
        'char_counter': char_counter,
        'lower_word_counter': lower_word_counter,
        'word2vec': word2vec_dict,
        'lower_word2vec': lower_word2vec_dict
    }

    print("saving ...")
    save(args, data, shared, out_name)
Exemple #21
0
def prepro_each(args, data_type, start_ratio=0.0, stop_ratio=1.0, out_name="default", in_path=None):

    word_tokenize, sent_tokenize = get_sent_tokenize()

    source_data = []
    f = open(os.path.join(args.source_dir, 'WikiQA-%s.txt' % data_type), 'r', encoding='utf-8')
    curr_question = None
    lines = (f.read()).rsplit('\n')
    for i, line in enumerate(lines):
        if line == '' : continue
        t = tuple(line.rsplit('\t'))
        assert len(t)==3, t
        question, sentence, correct = t
        if not curr_question == question:
            if not (curr_question is None or answer_list == []):
                context = ' '.join(context_list)
                context = context.replace(' .', '.')
                answers = [{'answer_start':0, 'text':answer_list}]
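                # note: 'text' holds the list of correct answer sentences (not a single string),
                # so the span arithmetic in the main loop below is only approximate, which is
                # why its asserts are commented out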
                _id = len(source_data)
                qas = [{'answers':answers, 'id':_id, 'question':curr_question}]
                dic = {'context' : context, 'qas' : qas}
                source_data.append({'paragraphs' : [dic]})
            context_list = []
            answer_list = []
        curr_question = question
        if not sentence.endswith('.'):
            sentence += '.'
        context_list.append(sentence)
        if correct == '1':
            answer_list.append(sentence)

    json.dump({'data' : source_data}, open(os.path.join(args.source_dir, '%s.json' % data_type), 'w'))

    q, cq, y, rx, rcx, ids, idxs = [], [], [], [], [], [], []
    cy = []
    x, cx = [], []
    answerss = []
    p = []
    word_counter, char_counter, lower_word_counter = Counter(), Counter(), Counter()
    start_ai = int(round(len(source_data) * start_ratio))
    stop_ai = int(round(len(source_data) * stop_ratio))
    for ai, article in enumerate(tqdm(source_data[start_ai:stop_ai])):
        xp, cxp = [], []
        pp = []
        x.append(xp)
        cx.append(cxp)
        p.append(pp)
        for pi, para in enumerate(article['paragraphs']):
            # wordss
            context = para['context']
            context = context.replace("''", '" ')
            context = context.replace("``", '" ')
            xi = list(map(word_tokenize, sent_tokenize(context)))
            xi = [process_tokens(tokens) for tokens in xi]  # process tokens
            # given xi, add chars
            cxi = [[list(xijk) for xijk in xij] for xij in xi]
            xp.append(xi)
            cxp.append(cxi)
            pp.append(context)

            for xij in xi:
                for xijk in xij:
                    word_counter[xijk] += len(para['qas'])
                    lower_word_counter[xijk.lower()] += len(para['qas'])
                    for xijkl in xijk:
                        char_counter[xijkl] += len(para['qas'])

            rxi = [ai, pi]
            assert len(x) - 1 == ai
            assert len(x[ai]) - 1 == pi
            for qa in para['qas']:
                # get words
                qi = word_tokenize(qa['question'])
                cqi = [list(qij) for qij in qi]
                yi = []
                cyi = []
                answers = []
                for answer in qa['answers']:
                    answer_text = answer['text']
                    answers.append(answer_text)
                    answer_start = answer['answer_start']
                    answer_stop = answer_start + len(answer_text)
                    # TODO : put some function that gives word_start, word_stop here
                    yi0, yi1 = get_word_span(context, xi, answer_start, answer_stop)
                    # yi0 = answer['answer_word_start'] or [0, 0]
                    # yi1 = answer['answer_word_stop'] or [0, 1]
                    assert len(xi[yi0[0]]) > yi0[1]
                    assert len(xi[yi1[0]]) >= yi1[1]
                    w0 = xi[yi0[0]][yi0[1]]
                    w1 = xi[yi1[0]][yi1[1]-1]
                    i0 = get_word_idx(context, xi, yi0)
                    i1 = get_word_idx(context, xi, (yi1[0], yi1[1]-1))
                    cyi0 = answer_start - i0
                    cyi1 = answer_stop - i1 - 1
                    # print(answer_text, w0[cyi0:], w1[:cyi1+1])
                    #assert answer_text[0] == w0[cyi0], (answer_text, w0, cyi0)
                    #assert answer_text[-1] == w1[cyi1]
                    #assert cyi0 < 32, (answer_text, w0)
                    #assert cyi1 < 32, (answer_text, w1)

                    yi.append([yi0, yi1])
                    cyi.append([cyi0, cyi1])

                for qij in qi:
                    word_counter[qij] += 1
                    lower_word_counter[qij.lower()] += 1
                    for qijk in qij:
                        char_counter[qijk] += 1

                q.append(qi)
                cq.append(cqi)
                y.append(yi)
                cy.append(cyi)
                rx.append(rxi)
                rcx.append(rxi)
                ids.append(qa['id'])
                idxs.append(len(idxs))
                answerss.append(answers)

            if args.debug:
                break

    word2vec_dict = get_word2vec(args, word_counter)
    lower_word2vec_dict = get_word2vec(args, lower_word_counter)

    # add context here
    data = {'q': q, 'cq': cq, 'y': y, '*x': rx, '*cx': rcx, 'cy': cy,
            'idxs': idxs, 'ids': ids, 'answerss': answerss, '*p': rx}
    shared = {'x': x, 'cx': cx, 'p': p,
              'word_counter': word_counter, 'char_counter': char_counter, 'lower_word_counter': lower_word_counter,
              'word2vec': word2vec_dict, 'lower_word2vec': lower_word2vec_dict}

    print("saving ...")
    save(args, data, shared, out_name)