def prepro_each(args, data_type, start_ratio=0.0, stop_ratio=1.0, out_name="default", in_path=None): if args.tokenizer == "PTB": import nltk sent_tokenize = nltk.sent_tokenize def word_tokenize(tokens): return [ token.replace("''", '"').replace("``", '"') for token in nltk.word_tokenize(tokens) ] elif args.tokenizer == 'Stanford': from my.corenlp_interface import CoreNLPInterface interface = CoreNLPInterface(args.url, args.port) sent_tokenize = interface.split_doc word_tokenize = interface.split_sent else: raise Exception() #if not args.split: # sent_tokenize = lambda para: [para] source_path = in_path or os.path.join(args.source_dir, "third_party", "wikipedia") source_summaries = pd.read_csv(source_path + '/summaries.csv') source_qas = pd.read_csv(args.source_dir + '/qaps.csv') summaries = [] summaries_char_list = [] ques_answers = [] questions = [] questions_char_list = [] document_ids = [] word_counter, char_counter, lower_word_counter = Counter(), Counter( ), Counter() summary_index = -1 for index_summ, row in tqdm(source_summaries.iterrows()): if data_type == row['set']: summary_tokenized_paras = [] summary_char_para = [] summary_tokenized = list( map(word_tokenize, sent_tokenize(row['summary_tokenized']))) summary_tokenized = [ process_tokens(tokens) for tokens in summary_tokenized ] char_list = [[list(word) for word in sent] for sent in summary_tokenized] #print ("summ",summary_tokenized) #print (char_list) summary_tokenized_paras.append( summary_tokenized) # TODO:each summary has only one paragraph summaries.append(summary_tokenized_paras) summary_char_para.append( char_list) # TODO:each summary has only one paragraph summaries_char_list.append(summary_char_para) #coz train/test/valid all are in one file, index_summ cannot be used summary_index = summary_index + 1 qas = source_qas[source_qas['document_id'].isin( [row['document_id']])] for sent in summary_tokenized: for word in sent: word_counter[word] += len(qas) lower_word_counter[word.lower()] += len(qas) for char in word: char_counter[char] += len(qas) for index, qa in qas.iterrows(): #if question is of multiple sentences, not handling that case also #Not req most probably question_tokenized = word_tokenize(qa['question']) question_tokenized = process_tokens(question_tokenized) #print (question_tokenized) question_char_list = [ list(word) for word in question_tokenized ] answer1_tokenized = list( map(word_tokenize, sent_tokenize(qa['answer1']))) answer1_tokenized = [ process_tokens(tokens) for tokens in answer1_tokenized ] #print(answer1_tokenized) answer2_tokenized = list( map(word_tokenize, sent_tokenize(qa['answer2']))) answer2_tokenized = [ process_tokens(tokens) for tokens in answer2_tokenized ] #print(answer2_tokenized) ques_answers.append([answer1_tokenized, answer2_tokenized]) #print(ques_answers) questions.append(question_tokenized) questions_char_list.append(question_char_list) document_ids.append([summary_index, row['document_id']]) for sent in question_tokenized: for word in sent: word_counter[word] += 1 lower_word_counter[word.lower()] += 1 for char in word: char_counter[char] += 1 word2vec_dict = get_word2vec(args, word_counter) lower_word2vec_dict = get_word2vec(args, lower_word_counter) data = { 'q': questions, 'cq': questions_char_list, '*x': document_ids, 'answerss': ques_answers, '*cx': document_ids } shared = { 'x': summaries, 'cx': summaries_char_list, 'word_counter': word_counter, 'char_counter': char_counter, 'lower_word_counter': lower_word_counter, 'word2vec': word2vec_dict, 'lower_word2vec': 
lower_word2vec_dict } print("saving ...") save(args, data, shared, out_name)
def prepro_each(args, data_type, start_ratio=0.0, stop_ratio=1.0, out_name="default", in_path=None): if args.tokenizer == "PTB": import nltk sent_tokenize = nltk.sent_tokenize def word_tokenize(tokens): return [ token.replace("''", '"').replace("``", '"') for token in nltk.word_tokenize(tokens) ] elif args.tokenizer == 'Stanford': from my.corenlp_interface import CoreNLPInterface interface = CoreNLPInterface(args.url, args.port) sent_tokenize = interface.split_doc word_tokenize = interface.split_sent else: raise Exception() if not args.split: sent_tokenize = lambda para: [para] source_path = in_path or os.path.join(args.source_dir, "{}-v1.1.json".format(data_type)) source_data = json.load(open(source_path, 'r')) q, cq, y, rx, rcx, ids, idxs = [], [], [], [], [], [], [] contextss = [] context_questions = [] titles = [] cy = [] x, cx = [], [] answerss = [] p = [] word_counter, char_counter, lower_word_counter = Counter(), Counter( ), Counter() start_ai = int(round(len(source_data['data']) * start_ratio)) stop_ai = int(round(len(source_data['data']) * stop_ratio)) for ai, article in enumerate(tqdm(source_data['data'][start_ai:stop_ai])): pp = [] p.append(pp) xp, cxp, contexts, c_questions = [], [], [], [] x.append(xp) cx.append(cxp) contextss.append(contexts) context_questions.append(c_questions) title = "[" + str(ai).zfill(2) + "] " + article['title'].replace( '_', ' ') titles.append(title) for pi, para in enumerate(article['paragraphs']): # wordss context = para['context'] context = context.replace("''", '" ') context = context.replace("``", '" ') #Sentences of priginal Paragraph contexts.append(context) xi = list(map(word_tokenize, sent_tokenize(context))) xi = [process_tokens(tokens) for tokens in xi] # process tokens # given xi, add chars cxi = [[list(xijk) for xijk in xij] for xij in xi] xp.append(xi) cxp.append(cxi) pp.append(context) for xij in xi: for xijk in xij: word_counter[xijk] += len(para['qas']) lower_word_counter[xijk.lower()] += len(para['qas']) for xijkl in xijk: char_counter[xijkl] += len(para['qas']) rxi = [ai, pi] assert len(x) - 1 == ai assert len(x[ai]) - 1 == pi if ai == 0: c_questions.append(para['qas'][3]['question']) else: c_questions.append(para['qas'][0]['question']) """ for qa in para['qas']: # get words c_questions.append(qa['question']) break qi = word_tokenize(qa['question']) # qa['question'] : original question cqi = [list(qij) for qij in qi] yi = [] cyi = [] answers = [] for answer in qa['answers']: answer_text = answer['text'] answers.append(answer_text) answer_start = answer['answer_start'] answer_stop = answer_start + len(answer_text) # TODO : put some function that gives word_start, word_stop here yi0, yi1 = get_word_span(context, xi, answer_start, answer_stop) # yi0 = answer['answer_word_start'] or [0, 0] # yi1 = answer['answer_word_stop'] or [0, 1] assert len(xi[yi0[0]]) > yi0[1] assert len(xi[yi1[0]]) >= yi1[1] w0 = xi[yi0[0]][yi0[1]] w1 = xi[yi1[0]][yi1[1]-1] i0 = get_word_idx(context, xi, yi0) i1 = get_word_idx(context, xi, (yi1[0], yi1[1]-1)) cyi0 = answer_start - i0 cyi1 = answer_stop - i1 - 1 # print(answer_text, w0[cyi0:], w1[:cyi1+1]) assert answer_text[0] == w0[cyi0], (answer_text, w0, cyi0) assert answer_text[-1] == w1[cyi1] assert cyi0 < 32, (answer_text, w0) assert cyi1 < 32, (answer_text, w1) yi.append([yi0, yi1]) cyi.append([cyi0, cyi1]) for qij in qi: word_counter[qij] += 1 lower_word_counter[qij.lower()] += 1 for qijk in qij: char_counter[qijk] += 1 q.append(qi) cq.append(cqi) y.append(yi) cy.append(cyi) rx.append(rxi) rcx.append(rxi) 
ids.append(qa['id']) idxs.append(len(idxs)) answerss.append(answers) """ if args.debug: break word2vec_dict = get_word2vec(args, word_counter) lower_word2vec_dict = get_word2vec(args, lower_word_counter) # add context here data = { 'q': q, 'cq': cq, 'y': y, '*x': rx, '*cx': rcx, 'cy': cy, 'idxs': idxs, 'ids': ids, 'answerss': answerss, '*p': rx } shared = { 'x': x, 'cx': cx, 'p': p, 'contextss': contextss, 'context_questions': context_questions, 'titles': titles, 'word_counter': word_counter, 'char_counter': char_counter, 'lower_word_counter': lower_word_counter, 'word2vec': word2vec_dict, 'lower_word2vec': lower_word2vec_dict } print("saving ...") save(args, data, shared, out_name)
def prepro_each(args, data_type, start_ratio=0.0, stop_ratio=1.0, out_name="default", in_path=None): parser = StanfordParser(model_path=os.getenv("StanfordParser_model_path")) if args.tokenizer == "PTB": import nltk sent_tokenize = nltk.sent_tokenize def word_tokenize(tokens): return [ token.replace("''", '"').replace("``", '"') for token in nltk.word_tokenize(tokens) ] elif args.tokenizer == 'Stanford': from my.corenlp_interface import CoreNLPInterface interface = CoreNLPInterface(args.url, args.port) sent_tokenize = interface.split_doc word_tokenize = interface.split_sent else: raise Exception() # if not args.split: # sent_tokenize = lambda para: [para] source_path = in_path or os.path.join( args.source_dir, "{}-{}v1.1.json".format(data_type, args.suffix)) source_data = json.load(open(source_path, 'r')) q, cq, y, rx, rcx, ids, idxs = [], [], [], [], [], [], [] q_syn_seq = [] na = [] cy = [] x, cx = [], [] syn_seq = [] rsyn_seq = [] answerss = [] p = [] word_counter, char_counter, lower_word_counter = Counter(), Counter( ), Counter() syn_counter = Counter() start_ai = int(round(len(source_data['data']) * start_ratio)) stop_ai = int(round(len(source_data['data']) * stop_ratio)) abandon = 0 for ai, article in enumerate(tqdm(source_data['data'][221:])): xp, cxp = [], [] syn_seqp = [] pp = [] x.append(xp) cx.append(cxp) syn_seq.append(syn_seqp) p.append(pp) p_i = -1 for pi, para in enumerate(article['paragraphs']): # wordss context = para['context'] context = context.replace("''", '" ').replace("``", '" ').replace( 'e.g.', 'e-g,') xi = list(map(word_tokenize, sent_tokenize(context))) xi = [process_tokens(tokens) for tokens in xi] # process tokens try: syn_seqpp = sents_parser(xi, parser) p_i += 1 except: abandon += 1 continue for sent in syn_seqpp: for word in sent: for syn in word: syn_counter[syn] += 1 syn_seqp.append(syn_seqpp) # given xi, add chars cxi = [[list(xijk) for xijk in xij] for xij in xi] xp.append(xi) cxp.append(cxi) pp.append(context) for xij in xi: for xijk in xij: word_counter[xijk] += len(para['qas']) lower_word_counter[xijk.lower()] += len(para['qas']) for xijkl in xijk: char_counter[xijkl] += len(para['qas']) rxi = [ai + 221, p_i] assert len(x) - 1 == ai assert len(x[ai]) - 1 == p_i for qa in para['qas']: # get words qi = word_tokenize(qa['question']) qi = process_tokens(qi) try: q_syn_seqq = sent_parser(qi, parser) except: continue cqi = [list(qij) for qij in qi] yi = [] cyi = [] answers = [] for answer in qa['answers']: answer_text = answer['text'] answers.append(answer_text) answer_start = answer['answer_start'] answer_stop = answer_start + len(answer_text) # TODO : put some function that gives word_start, word_stop here yi0, yi1 = get_word_span(context, xi, answer_start, answer_stop) # yi0 = answer['answer_word_start'] or [0, 0] # yi1 = answer['answer_word_stop'] or [0, 1] assert len(xi[yi0[0]]) > yi0[1] assert len(xi[yi1[0]]) >= yi1[1] w0 = xi[yi0[0]][yi0[1]] w1 = xi[yi1[0]][yi1[1] - 1] i0 = get_word_idx(context, xi, yi0) i1 = get_word_idx(context, xi, (yi1[0], yi1[1] - 1)) cyi0 = answer_start - i0 cyi1 = answer_stop - i1 - 1 # print(answer_text, w0[cyi0:], w1[:cyi1+1]) assert answer_text[0] == w0[cyi0], (answer_text, w0, cyi0) assert answer_text[-1] == w1[cyi1] assert cyi0 < 32, (answer_text, w0) assert cyi1 < 32, (answer_text, w1) yi.append([yi0, yi1]) cyi.append([cyi0, cyi1]) if len(qa['answers']) == 0: yi.append([(0, 0), (0, 1)]) cyi.append([0, 1]) na.append(True) else: na.append(False) for qij in qi: word_counter[qij] += 1 lower_word_counter[qij.lower()] += 1 
for qijk in qij: char_counter[qijk] += 1 q.append(qi) cq.append(cqi) q_syn_seq.append(q_syn_seqq) y.append(yi) cy.append(cyi) rx.append(rxi) rcx.append(rxi) rsyn_seq.append(rxi) ids.append(qa['id']) idxs.append(len(idxs)) answerss.append(answers) print('abandon {} paragraph'.format(abandon)) if args.debug: break for sent in q_syn_seq: for word in sent: for syn in word: syn_counter[syn] += 1 word2vec_dict = get_word2vec(args, word_counter) lower_word2vec_dict = get_word2vec(args, lower_word_counter) # add context here data = { 'q': q, 'cq': cq, 'y': y, '*x': rx, '*cx': rcx, '*syn_seq': rsyn_seq, 'cy': cy, 'q_syn_seq': q_syn_seq, 'idxs': idxs, 'ids': ids, 'answerss': answerss, '*p': rx, 'na': na } shared = { 'x': x, 'cx': cx, 'p': p, 'syn_seq': syn_seq, 'syn_counter': syn_counter, 'word_counter': word_counter, 'char_counter': char_counter, 'lower_word_counter': lower_word_counter, 'word2vec': word2vec_dict, 'lower_word2vec': lower_word2vec_dict } print("saving ...") save(args, data, shared, out_name)
def prepro_each(args, data_type, start_ratio=0.0, stop_ratio=1.0, out_name="default", in_path=None): totalnum0 = 0 falsenum0 = 0 falsenum1 = 0 truenum0 = 0 truenum1 = 0 outlist = [] if args.tokenizer == "PTB": import nltk sent_tokenize = nltk.sent_tokenize def word_tokenize(tokens): return [ token.replace("''", '"').replace("``", '"') for token in nltk.word_tokenize(tokens) ] elif args.tokenizer == 'Stanford': from my.corenlp_interface import CoreNLPInterface interface = CoreNLPInterface(args.url, args.port) sent_tokenize = interface.split_doc word_tokenize = interface.split_sent else: raise Exception() if not args.split: sent_tokenize = lambda para: [para] source_path = in_path or os.path.join(args.source_dir, "{}.seq.json".format(data_type)) source_data = json.load(open(source_path, 'r')) q, cq, y, rx, rcx, ids, idxs = [], [], [], [], [], [], [] x, cx = [], [] answerss = [] p = [] word_counter, char_counter, lower_word_counter = Counter(), Counter( ), Counter() start_ai = int(round(len(source_data['data']) * start_ratio)) stop_ai = int(round(len(source_data['data']) * stop_ratio)) for ai, article in enumerate(tqdm(source_data['data'][start_ai:stop_ai])): xp, cxp = [], [] pp = [] x.append(xp) cx.append(cxp) p.append(pp) for pi, para in enumerate(article['paragraphs']): # wordss ctx = para['context'] if (len(ctx.split()) <= 800): cut = -1 else: cut = sum(map(len, ctx.split()[:800])) + 800 context = para['context'] context = context.replace("''", '" ') context = context.replace("``", '" ') xi = list(map(word_tokenize, sent_tokenize(context))) xi = [process_tokens(tokens) for tokens in xi] # process tokens xi = [[xijk for xijk in xij if xijk != ''] for xij in xi] # given xi, add chars cxi = [[list(xijk) for xijk in xij] for xij in xi] xp.append(xi) cxp.append(cxi) pp.append(context) for xij in xi: for xijk in xij: word_counter[xijk] += len(para['qas']) lower_word_counter[xijk.lower()] += len(para['qas']) for xijkl in xijk: char_counter[xijkl] += len(para['qas']) rxi = [ai, pi] assert len(x) - 1 == ai assert len(x[ai]) - 1 == pi for qa in para['qas']: totalnum0 += 1 # get words qi = word_tokenize(qa['question']) cqi = [list(qij) for qij in qi] yi = [] answers = [] flag = False Flag = True if (cut > -1): for ans in qa['answers']: if (max(ans['answer_location'])) < cut: Flag = False break else: Flag = False if (Flag): falsenum1 += 1 for answer in qa['answers']: flag1 = True answer_text = answer['text'] answers.append(answer_text) ansi = word_tokenize(answer_text) answer_location = answer['answer_location'] yii = [] for ans_idx, answer_start in enumerate(answer_location): answer_stop = answer_start + len(ansi[ans_idx]) yi0, _ = get_word_span(context, xi, answer_start, answer_stop) if (yi0[1] >= 800): flag1 = False assert len(xi[yi0[0]]) > yi0[1] w0 = xi[yi0[0]][yi0[1]] assert ansi[ans_idx] == w0, (ansi[ans_idx], w0) yii.append(yi0) if (flag1): flag = True yi.append(yii) if (flag): truenum0 += 1 if (flag == Flag): print(ctx, qa, yi, cut) outlist.append([ctx, qa]) # answer_start = answer['answer_start'] # answer_stop = answer_start + len(answer_text) # yi0, yi1 = get_word_span(context, xi, answer_start, answer_stop) # # yi0 = answer['answer_word_start'] or [0, 0] # # yi1 = answer['answer_word_stop'] or [0, 1] # assert len(xi[yi0[0]]) > yi0[1] # assert len(xi[yi1[0]]) >= yi1[1] # w0 = xi[yi0[0]][yi0[1]] # w1 = xi[yi1[0]][yi1[1]-1] # i0 = get_word_idx(context, xi, yi0) # i1 = get_word_idx(context, xi, (yi1[0], yi1[1]-1)) # cyi0 = answer_start - i0 # cyi1 = answer_stop - i1 - 1 # # print(answer_text, 
w0[cyi0:], w1[:cyi1+1]) # assert answer_text[0] == w0[cyi0], (answer_text, w0, cyi0) # assert answer_text[-1] == w1[cyi1] # assert cyi0 < 32, (answer_text, w0) # assert cyi1 < 32, (answer_text, w1) # yi.append([yi0, yi1]) for qij in qi: word_counter[qij] += 1 lower_word_counter[qij.lower()] += 1 for qijk in qij: char_counter[qijk] += 1 q.append(qi) cq.append(cqi) y.append(yi) rx.append(rxi) rcx.append(rxi) ids.append(qa['id']) idxs.append(len(idxs)) answerss.append(answers) if args.debug: break print(truenum0, totalnum0, float(truenum0) / totalnum0) print(falsenum1, totalnum0, 1 - float(falsenum1) / totalnum0) with open('debugcnt.json', 'w') as f: json.dump(outlist, f)
def prepro_each(args, data_type, start_ratio=0.0, stop_ratio=1.0, out_name="default", in_path=None): if args.tokenizer == "PTB": import nltk sent_tokenize = nltk.sent_tokenize def word_tokenize(tokens): return [token.replace("''", '"').replace("``", '"') for token in nltk.word_tokenize(tokens)] elif args.tokenizer == 'Stanford': from my.corenlp_interface import CoreNLPInterface interface = CoreNLPInterface(args.url, args.port) sent_tokenize = interface.split_doc word_tokenize = interface.split_sent else: raise Exception() if not args.split: sent_tokenize = lambda para: [para] source_path = in_path or os.path.join(args.source_dir, "{}-{}v1.1.json".format(data_type, args.suffix)) source_data = json.load(open(source_path, 'r', encoding="utf-8")) q, cq, y, rx, rcx, ids, idxs = [], [], [], [], [], [], [] na = [] cy = [] x, cx = [], [] answerss = [] p = [] word_counter, char_counter, lower_word_counter = Counter(), Counter(), Counter() start_ai = int(round(len(source_data['data']) * start_ratio)) stop_ai = int(round(len(source_data['data']) * stop_ratio)) for ai, article in enumerate(tqdm(source_data['data'][start_ai:stop_ai])): xp, cxp = [], [] pp = [] x.append(xp) cx.append(cxp) p.append(pp) for pi, para in enumerate(article['paragraphs']): # wordss context = para['context'] context = context.replace("''", '" ') context = context.replace("``", '" ') xi = list(map(word_tokenize, sent_tokenize(context))) xi = [process_tokens(tokens) for tokens in xi] # process tokens # given xi, add chars cxi = [[list(xijk) for xijk in xij] for xij in xi] xp.append(xi) cxp.append(cxi) pp.append(context) for xij in xi: for xijk in xij: word_counter[xijk] += len(para['qas']) lower_word_counter[xijk.lower()] += len(para['qas']) for xijkl in xijk: char_counter[xijkl] += len(para['qas']) rxi = [ai, pi] assert len(x) - 1 == ai assert len(x[ai]) - 1 == pi for qa in para['qas']: # get words qi = word_tokenize(qa['question']) qi = process_tokens(qi) cqi = [list(qij) for qij in qi] yi = [] cyi = [] answers = [] for answer in qa['answers']: answer_text = answer['text'] answers.append(answer_text) answer_start = answer['answer_start'] answer_stop = answer_start + len(answer_text) # TODO : put some function that gives word_start, word_stop here yi0, yi1 = get_word_span(context, xi, answer_start, answer_stop) # yi0 = answer['answer_word_start'] or [0, 0] # yi1 = answer['answer_word_stop'] or [0, 1] assert len(xi[yi0[0]]) > yi0[1] assert len(xi[yi1[0]]) >= yi1[1] w0 = xi[yi0[0]][yi0[1]] w1 = xi[yi1[0]][yi1[1]-1] i0 = get_word_idx(context, xi, yi0) i1 = get_word_idx(context, xi, (yi1[0], yi1[1]-1)) cyi0 = answer_start - i0 cyi1 = answer_stop - i1 - 1 #add #print("i0 :",i0, "i1 :",i1, "cyi0 :", cyi0, "w0 :",w0 ) #print("xi :", xi) #print( "yi0",yi0, "(yi1[0], yi1[1]-1) :",(yi1[0], yi1[1]-1) ) #print("answer_text",answer_text) #print("cyi1:",cyi1) #print("answer_text[0] :",answer_text[0]) #print("answer_text[-1] :",answer_text[-1]) #print("w0 :",w0) #print("w1 :",w1) #so far #print(":):):)") #print("answer_text:",answer_text,"\nstart:", w0[cyi0:],"\nend:", w1[:cyi1+1]) assert answer_text[0] == w0[cyi0], (answer_text, w0, cyi0) assert answer_text[-1] == w1[cyi1] assert cyi0 < 32, (answer_text, w0) assert cyi1 < 32, (answer_text, w1) yi.append([yi0, yi1]) cyi.append([cyi0, cyi1]) if len(qa['answers']) == 0: yi.append([(0, 0), (0, 1)]) cyi.append([0, 1]) na.append(True) else: na.append(False) for qij in qi: word_counter[qij] += 1 lower_word_counter[qij.lower()] += 1 for qijk in qij: char_counter[qijk] += 1 q.append(qi) 
cq.append(cqi) y.append(yi) cy.append(cyi) rx.append(rxi) rcx.append(rxi) ids.append(qa['id']) idxs.append(len(idxs)) answerss.append(answers) if args.debug: break word2vec_dict = get_word2vec(args, word_counter) lower_word2vec_dict = get_word2vec(args, lower_word_counter) # add context here data = {'q': q, 'cq': cq, 'y': y, '*x': rx, '*cx': rcx, 'cy': cy, 'idxs': idxs, 'ids': ids, 'answerss': answerss, '*p': rx, 'na': na} shared = {'x': x, 'cx': cx, 'p': p, 'word_counter': word_counter, 'char_counter': char_counter, 'lower_word_counter': lower_word_counter, 'word2vec': word2vec_dict, 'lower_word2vec': lower_word2vec_dict} print("saving ...") save(args, data, shared, out_name)
def prepro_each(args, data_type, start_ratio=0.0, stop_ratio=1.0, out_name="default", in_path=None): if args.tokenizer == "PTB": import nltk sent_tokenize = nltk.sent_tokenize def word_tokenize(tokens): return [ token.replace("''", '"').replace("``", '"') for token in nltk.word_tokenize(tokens) ] elif args.tokenizer == 'Stanford': from my.corenlp_interface import CoreNLPInterface interface = CoreNLPInterface(args.url, args.port) sent_tokenize = interface.split_doc word_tokenize = interface.split_sent else: raise Exception() if not args.split: sent_tokenize = lambda para: [para] source_path = in_path or os.path.join(args.source_dir, "third_party", "wikipedia") #source_summaries = pd.read_csv(source_path + '/summaries.csv') source_summaries = pd.read_csv( '/Users/dhruv100691/Documents/cs546/CS-546--Narrative-QA/bi-att-flow-dev/processed_summaries_new_method.csv' ) #source_qas = pd.read_csv(args.source_dir + '/qaps.csv') source_qas = pd.read_csv( '/Users/dhruv100691/Documents/cs546/CS-546--Narrative-QA/bi-att-flow-dev/processed_answer_spans_rogue_new_method.csv' ) #could not find spans for some answers, so dropping those qa pairs source_qas['start_index'] = source_qas['start_index'].str.replace( '(', '').str.replace(')', '') source_qas['end_index'] = source_qas['end_index'].str.replace( '(', '').str.replace(')', '') source_qas.dropna(subset=['start_index', 'end_index'], inplace=True) summaries = [] summaries_char_list = [] ques_answers = [] questions = [] questions_char_list = [] ques_answer_lengths = [] ques_answer_spans = [] document_ids = [] word_counter, char_counter, lower_word_counter = Counter(), Counter( ), Counter() summary_index = -1 len_summ = 0 num_summ = 0 avg_len_sent = 0 num_words = 0 bleu_scores = [] bleu_4_scores = [] for index_summ, row in tqdm(source_summaries.iterrows(), total=1572): if data_type == row['set']: len_sent = 0 spans = [] references = [] summary_tokenized_paras = [] summary_char_para = [] #row['processed_summary'] = row['processed_summary'].replace(".",". 
") summary_tokenized = list( map(word_tokenize, sent_tokenize(row['processed_summary']))) #summary_tokenized = [process_tokens(tokens) for tokens in summary_tokenized] char_list = [[list(word) for word in sent] for sent in summary_tokenized] summary_tokenized_paras.append(summary_tokenized) #summaries.append(list(break_summary_to_paras(summary_tokenized))) summaries.append(summary_tokenized_paras) num_summ += 1 summary_char_para.append( char_list) # TODO:each summary has only one paragraph summaries_char_list.append(summary_char_para) #coz train/test/valid all are in one file, index_summ cannot be used summary_index = summary_index + 1 len_summ += len(summary_tokenized) qas = source_qas[source_qas['document_id'].isin( [row['document_id']])] qas = modify_answer_spans(qas, row['processed_summary']) for sent in summary_tokenized: len_sent += len(sent) num_words += len(sent) for word in sent: word_counter[word] += len(qas) lower_word_counter[word.lower()] += len(qas) for char in word: char_counter[char] += len(qas) avg_len_sent += (len_sent / float(len(summary_tokenized))) for index, qa in qas.iterrows(): #if question is of multiple sentences, not handling that case also #Not req most probably question_tokenized = word_tokenize(qa['processed_question']) #question_tokenized = process_tokens(question_tokenized) #print (question_tokenized) question_char_list = [ list(word) for word in question_tokenized ] answer1_tokenized = list( map(word_tokenize, sent_tokenize(qa['processed_answer'].replace( ".", "")))) ##TODO #answer1_tokenized = [process_tokens(tokens) for tokens in answer1_tokenized] answer1_eos = answer1_tokenized[len(answer1_tokenized) - 1] + [ '</s>' ] #appending end token answer1_sos = ['--SOS--'] + answer1_tokenized[0] target_length = len(answer1_eos) answer1_span_start_idx = qa['start_index'] answer1_span_end_idx = qa['end_index'] #answer2_tokenized = list(map(word_tokenize, sent_tokenize(qa['answer2']))) #answer2_tokenized = [process_tokens(tokens) for tokens in answer2_tokenized] #answer2_eos = answer2_tokenized[len(answer2_tokenized) - 1] + ['</s>'] # appending end token #answer2_sos = ['--SOS--'] + answer2_tokenized[0] #print(answer2_tokenized) predicted_rouge_span = summary_tokenized[0][ answer1_span_start_idx[1]:answer1_span_end_idx[1] + 1] references.append([list(map(str.lower, answer1_tokenized[0]))]) spans.append(list(map(str.lower, predicted_rouge_span))) ques_answers.append([answer1_sos, answer1_eos]) ques_answer_spans.append( [answer1_span_start_idx, answer1_span_end_idx]) ques_answer_lengths.append(target_length) questions.append(question_tokenized) questions_char_list.append(question_char_list) document_ids.append([summary_index, row['document_id']]) for sent in question_tokenized: for word in sent: word_counter[word] += 1 lower_word_counter[word.lower()] += 1 for char in word: char_counter[char] += 1 bleu_scores.append( corpus_bleu(references, spans, weights=(1, 0, 0, 0))) bleu_4_scores.append(corpus_bleu(references, spans)) print("Average score bleu_1 for", data_type, sum(bleu_scores) / len(bleu_scores)) print("Average score bleu_4 for", data_type, sum(bleu_4_scores) / len(bleu_4_scores)) word2vec_dict = get_word2vec(args, word_counter) lower_word2vec_dict = get_word2vec(args, lower_word_counter) data = { 'q': questions, 'cq': questions_char_list, '*x': document_ids, 'answerss': ques_answers, '*cx': document_ids, 'ans_len': ques_answer_lengths, 'spans': ques_answer_spans } shared = { 'x': summaries, 'cx': summaries_char_list, 'word_counter': word_counter, 
'char_counter': char_counter, 'lower_word_counter': lower_word_counter, 'word2vec': word2vec_dict, 'lower_word2vec': lower_word2vec_dict } print("saving ...") save(args, data, shared, out_name) print("{} statistics".format(data_type)) print(" Number of summaries :", num_summ) print(" Average summary length : ", len_summ / float(num_summ)) print(" Average sentence lengths :", avg_len_sent / float(num_summ)) print(" Average number of words :", num_words / float(num_summ))
def prepro_each(args, data_type, start_ratio=0.0, stop_ratio=1.0, out_name="default", in_path=None): if args.tokenizer == "PTB": import nltk sent_tokenize = nltk.sent_tokenize def word_tokenize(tokens): return [ token.replace("''", '"').replace("``", '"') for token in nltk.word_tokenize(tokens) ] elif args.tokenizer == 'Stanford': from my.corenlp_interface import CoreNLPInterface interface = CoreNLPInterface(args.url, args.port) sent_tokenize = interface.split_doc word_tokenize = interface.split_sent else: raise Exception() if not args.split: sent_tokenize = lambda para: [para] source_path = in_path or os.path.join(args.source_dir, "{}.seq.json".format(data_type)) source_data = json.load(open(source_path, 'r')) q, cq, y, rx, rcx, ids, idxs = [], [], [], [], [], [], [] x, cx = [], [] answerss = [] p = [] word_counter, char_counter, lower_word_counter = Counter(), Counter( ), Counter() start_ai = int(round(len(source_data['data']) * start_ratio)) stop_ai = int(round(len(source_data['data']) * stop_ratio)) for ai, article in enumerate(tqdm(source_data['data'][start_ai:stop_ai])): xp, cxp = [], [] pp = [] x.append(xp) cx.append(cxp) p.append(pp) for pi, para in enumerate(article['paragraphs']): # wordss context = para['context'] context = context.replace("''", '" ') context = context.replace("``", '" ') xi = list(map(word_tokenize, sent_tokenize(context))) xi = [process_tokens(tokens) for tokens in xi] # process tokens xi = [[xijk for xijk in xij if xijk != ''] for xij in xi] # given xi, add chars cxi = [[list(xijk) for xijk in xij] for xij in xi] xp.append(xi) cxp.append(cxi) pp.append(context) for xij in xi: for xijk in xij: word_counter[xijk] += len(para['qas']) lower_word_counter[xijk.lower()] += len(para['qas']) for xijkl in xijk: char_counter[xijkl] += len(para['qas']) rxi = [ai, pi] assert len(x) - 1 == ai assert len(x[ai]) - 1 == pi for qa in para['qas']: # get words qi = word_tokenize(qa['question']) cqi = [list(qij) for qij in qi] yi = [] answers = [] for answer in qa['answers']: answer_text = answer['text'] answers.append(answer_text) ansi = word_tokenize(answer_text) answer_location = answer['answer_location'] yii = [] for ans_idx, answer_start in enumerate(answer_location): answer_stop = answer_start + len(ansi[ans_idx]) yi0, _ = get_word_span(context, xi, answer_start, answer_stop) assert len(xi[yi0[0]]) > yi0[1] w0 = xi[yi0[0]][yi0[1]] assert ansi[ans_idx] == w0, (ansi[ans_idx], w0) yii.append(yi0) yi.append(yii) # answer_start = answer['answer_start'] # answer_stop = answer_start + len(answer_text) # yi0, yi1 = get_word_span(context, xi, answer_start, answer_stop) # # yi0 = answer['answer_word_start'] or [0, 0] # # yi1 = answer['answer_word_stop'] or [0, 1] # assert len(xi[yi0[0]]) > yi0[1] # assert len(xi[yi1[0]]) >= yi1[1] # w0 = xi[yi0[0]][yi0[1]] # w1 = xi[yi1[0]][yi1[1]-1] # i0 = get_word_idx(context, xi, yi0) # i1 = get_word_idx(context, xi, (yi1[0], yi1[1]-1)) # cyi0 = answer_start - i0 # cyi1 = answer_stop - i1 - 1 # # print(answer_text, w0[cyi0:], w1[:cyi1+1]) # assert answer_text[0] == w0[cyi0], (answer_text, w0, cyi0) # assert answer_text[-1] == w1[cyi1] # assert cyi0 < 32, (answer_text, w0) # assert cyi1 < 32, (answer_text, w1) # yi.append([yi0, yi1]) for qij in qi: word_counter[qij] += 1 lower_word_counter[qij.lower()] += 1 for qijk in qij: char_counter[qijk] += 1 q.append(qi) cq.append(cqi) y.append(yi) rx.append(rxi) rcx.append(rxi) ids.append(qa['id']) idxs.append(len(idxs)) answerss.append(answers) if args.debug: break word2vec_dict = get_word2vec(args, 
word_counter) lower_word2vec_dict = get_word2vec(args, lower_word_counter) # add context here data = { 'q': q, 'cq': cq, 'y': y, '*x': rx, '*cx': rcx, 'idxs': idxs, 'ids': ids, 'answerss': answerss, '*p': rx } shared = { 'x': x, 'cx': cx, 'p': p, 'word_counter': word_counter, 'char_counter': char_counter, 'lower_word_counter': lower_word_counter, 'word2vec': word2vec_dict, 'lower_word2vec': lower_word2vec_dict } print("saving ...") save(args, data, shared, out_name)
def prepro_each(args, data_type, start_ratio=0.0, stop_ratio=1.0, out_name="default", in_path=None, save_json=True, pre_answer_doc_id=None): """ data: q/cq: query y: answer rx/rcx: index pairs: (article_no, paragraph_no) cy: idxs: ids: answerss: na: shared: x/cx: tokenized paragraphs (words and chars) p: untokenized paragraphs """ if args.tokenizer == "PTB": import nltk sent_tokenize = nltk.sent_tokenize def word_tokenize(tokens): return [token.replace("''", '"').replace("``", '"') for token in nltk.word_tokenize(tokens)] elif args.tokenizer == 'Stanford': from my.corenlp_interface import CoreNLPInterface interface = CoreNLPInterface(args.url, args.port) sent_tokenize = interface.split_doc word_tokenize = interface.split_sent else: raise Exception() if args.medhop: from qangaroo.utils import process_tokens_medhop as process_tokens else: from qangaroo.utils import process_tokens if not args.split: sent_tokenize = lambda para: [para] source_path = in_path or os.path.join(args.source_dir, "{}.json".format(data_type)) source_data = json.load(open(source_path, 'r')) q, cq, y, rx, rcx, ids, idxs = [], [], [], [], [], [], [] na = [] # no answer cy = [] x, cx = [], [] x2 = [] answers = [] p, p2 = [], [] q2, cq2 = [], [] cand_span, ccand_span, cand_span_y = [], [], [] cand_word, cand_word_y, cand_word_found, real_cand_word_found = [], [], [], [] all_cand_spans, A1s, A2s, all_cand_doc_ids, all_cand_ids, all_cand_num_spans_found, real_cand_count = [], [], [], [], [], [], [] # To store all candidate spans, adjacency matrices, candidate's doc ids, candidate's ids answer_doc_ids, answer_ids_in_doc = [], [] topk_2layer_tfidf_docs = [] first_doc_ids = [] word_counter, char_counter, lower_word_counter = Counter(), Counter(), Counter() start_ai = int(round(len(source_data) * start_ratio)) stop_ai = int(round(len(source_data) * stop_ratio)) mis_cand = 0 found_answer_in_first_n = 0 tfidf = TfidfVectorizer(strip_accents='unicode') bi = 0 if args.randomize_examples: random.shuffle(source_data) for ai, article in enumerate(tqdm(source_data[start_ai:stop_ai])): candidates = article['candidates'] query_sub = ' '.join(article['query'].split()[1:]) query = article['query'].replace('_', ' ') # get rid of '_' in, e.g., 'record_label' supports = article['supports'] answer = article['answer'] if args.truncate_at > 0: for si, support in enumerate(supports): support_split = support.split(' ')[:args.truncate_at] if support_split[-1] != '.': support_split += '.' 
supports[si] = ' '.join(support_split) if args.randomize_docs: random.shuffle(supports) if args.filter_by_annotations is not None: annotations = article['annotations'] not_follow = 0 likely = 0 follow = 0 multiple = 0 single = 0 follow_and_multiple = 0 for anno in annotations: if anno[0] == 'follows' and anno[1] == 'multiple': follow_and_multiple += 1 if anno[0] == 'follows': follow += 1 if anno[0] == 'not_follow': not_follow += 1 if anno[0] == 'likely': likely += 1 if anno[1] == 'multiple': multiple += 1 if anno[1] == 'single': single += 1 if args.filter_by_annotations == 'follow' and follow < 2: continue elif args.filter_by_annotations == 'multiple' and (follow < 2 or multiple < 2): continue elif args.filter_by_annotations == 'single' and (follow < 2 or single < 2): continue xp, cxp = [], [] xp2 = [] pp, pp2 = [], [] x.append(xp) cx.append(cxp) p.append(pp) x2.append(xp2) p2.append(pp2) para_features = tfidf.fit_transform(supports) q_features = tfidf.transform([query_sub]) dists = pairwise_distances(q_features, para_features, "cosine").ravel() sorted_ix = np.lexsort((supports, dists)) first_doc_ids.append(np.asscalar(sorted_ix[0])) assert first_doc_ids[-1] < len(supports), (first_doc_ids[-1], len(supports)) if args.rank_by_tfidf and save_json: first_doc_ids[-1] = 0 para_features = tfidf.fit_transform(supports) q_features = tfidf.transform([query_sub]) dists = pairwise_distances(q_features, para_features, "cosine").ravel() if pre_answer_doc_id is not None: dists[pre_answer_doc_id[bi]] = 0 sorted_ix = np.lexsort((supports, dists)) sorted_supports = [supports[idx] for idx in sorted_ix] if args.tfidf_layer == 1: if args.back_tfidf: para_features = tfidf.fit_transform(sorted_supports[2:]) q_features = tfidf.transform([sorted_supports[1] + ' ' + sorted_supports[0]]) dists = pairwise_distances(q_features, para_features, "cosine").ravel() sorted_ix = np.lexsort((sorted_supports[2:], dists)) supports = [sorted_supports[idx + 2] for idx in sorted_ix] assert len(sorted_supports) == len(supports) + 2 supports.insert(0, sorted_supports[1]) supports.insert(2, sorted_supports[0]) else: supports = sorted_supports elif args.tfidf_layer == 2: if args.mode == 'double': para_features = tfidf.fit_transform(sorted_supports[2:]) q_features = tfidf.transform([sorted_supports[1]]) dists = pairwise_distances(q_features, para_features, "cosine").ravel() sorted_ix = np.lexsort((sorted_supports[2:], dists)) supports = [sorted_supports[idx + 2] for idx in sorted_ix] assert len(sorted_supports) == len(supports) + 2 supports.insert(0, sorted_supports[1]) supports.insert(2, sorted_supports[0]) else: para_features = tfidf.fit_transform(sorted_supports[1:]) q_features = tfidf.transform([sorted_supports[0]]) dists = pairwise_distances(q_features, para_features, "cosine").ravel() sorted_ix = np.lexsort((sorted_supports[1:], dists)) supports = [sorted_supports[idx + 1] for idx in sorted_ix] assert len(sorted_supports) == len(supports) + 1 supports.insert(0, sorted_supports[0]) else: raise NotImplementedError if args.keep_topk_docs_only > 0: supports = supports[:args.keep_topk_docs_only] else: sorted_supports = [supports[idx] for idx in sorted_ix] para_features = tfidf.fit_transform(supports) q_features = tfidf.transform([sorted_supports[0]]) dists = pairwise_distances(q_features, para_features, "cosine").ravel() dists[sorted_ix[0]] = 1e30 sorted_ix = np.lexsort((supports, dists)) topk_2layer_tfidf_docs.append([]) for kk in range(min(7, len(sorted_ix))): topk_2layer_tfidf_docs[-1].append(np.asscalar(sorted_ix[kk])) context = 
'' if args.split_supports is True: xi, cxi = [[]], [[]] xi_len = [] for pi, _context in enumerate(supports): _context += ' ' _context = _context.replace("''", '" ') _context = _context.replace("``", '" ') _context = _context.replace(' ', ' ').replace(' ', ' ') context += _context _xi = list(map(word_tokenize, sent_tokenize(_context))) _xi = [process_tokens(tokens) for tokens in _xi] # xi = [["blahblah"]] _cxi = [[list(xijk) for xijk in xij] for xij in _xi] xi[0] += _xi[0] xi_len.append(len(_xi[0])) xp.append(_xi[0]) cxp.append(_cxi[0]) pp.append(_context) xp2.append(xi[0]) pp2.append(context) assert sum(map(len,xp)) == np.array(xp2).shape[-1], (sum(map(len,xp)), np.array(xp2).shape[-1]) else: for pi, _context in enumerate(supports): _context += ' ' _context = _context.replace("''", '" ') _context = _context.replace("``", '" ') _context = _context.replace(' ', ' ').replace(' ', ' ') context += _context xi = list(map(word_tokenize, sent_tokenize(context))) xi = [process_tokens(tokens) for tokens in xi] # xi = [["blahblah"]] cxi = [[list(xijk) for xijk in xij] for xij in xi] xp.append(xi[0]) cxp.append(cxi[0]) pp.append(context) # Only "+= 1" because every sets of support_docs corresponds to only 1 question. # In SQuAD, every paragraph can have multiple (len(para['qas'])) questions. for xij in xi: # for sentence in context for xijk in xij: # for word in sentence # if xijk == '.': # print(xijk) word_counter[xijk] += 1 lower_word_counter[xijk.lower()] += 1 for xijkl in xijk: char_counter[xijkl] += 1 # query # get words qi = word_tokenize(query) qi = process_tokens(qi) cqi = [list(qij) for qij in qi] q2i = word_tokenize(query_sub) q2i = process_tokens(q2i) cq2i = [list(q2ij) for q2ij in q2i] # answer yi = [] cyi = [] candi, ccandi, candi_y = [], [], [] candi_word_y = [] candi_word = candidates cand_span.append(candi) ccand_span.append(ccandi) cand_span_y.append(candi_y) cand_word.append(candi_word) cand_word_y.append(candi_word_y) answer_text = answer tokenized_context = ' '.join(xp2[-1]) if args.find_candidates: assert answer in candidates, (answer, candidates) candi_word_y.append(candidates.index(answer)) candidates_spans, not_found, candidates_found, real_candidates_found = compute_candidate_spans(tokenized_context, candidates) cand_word_found.append(candidates_found) real_cand_word_found.append(real_candidates_found) mis_cand += (not_found > 0) for (start, stop) in candidates_spans: yi0, yi1 = get_word_span(tokenized_context, xi, start, stop) assert len(xi[yi0[0]]) > yi0[1] assert len(xi[yi1[0]]) >= yi1[1] w0 = xi[yi0[0]][yi0[1]] w1 = xi[yi1[0]][yi1[1]-1] i0 = get_word_idx(tokenized_context, xi, yi0) i1 = get_word_idx(tokenized_context, xi, (yi1[0], yi1[1]-1)) cyi0 = start - i0 cyi1 = stop - i1 - 1 candi.append([yi0, yi1]) ccandi.append([cyi0, cyi1]) if answer == '': raise Exception("Answer is empty.") else: answer_start, answer_stop = compute_answer_span(tokenized_context, answer) # Find first matching span if answer_start is None: yi.append([(0, 0), (0, 1)]) cyi.append([0, 1]) na.append(True) answer_doc_ids.append([0]) answer_ids_in_doc.append([0]) else: if args.find_candidates: # If we found the answer span, then we must have found the same span in candidates assert (answer_start, answer_stop) in \ (candidates_spans), (answer, candidates, answer_start, answer_stop, candidates_spans) ans_idx = candidates_spans.index((answer_start, answer_stop)) candi_y.append(ans_idx) na.append(False) yi0, yi1 = get_word_span(tokenized_context, xi, answer_start, answer_stop) answer_doc_id, answer_id_in_doc 
= find_doc_with_answer(yi0[1], xi_len) if pre_answer_doc_id is not None: assert answer_doc_id < 3, (answer_doc_id) answer_doc_ids.append([answer_doc_id]) answer_ids_in_doc.append([answer_id_in_doc]) answer_spans = [] answer_spans.append((answer_start, answer_stop)) next_answer_start = answer_start next_answer_stop = answer_stop next_context = tokenized_context[answer_stop:] while True: next_answer_start, next_answer_stop = compute_answer_span(next_context, answer) next_context = next_context[next_answer_stop:] if next_answer_start is not None: answer_spans.append((next_answer_start + answer_spans[-1][1], next_answer_stop + answer_spans[-1][1])) else: break next_yi0, next_yi1 = get_word_span(tokenized_context, xi, next_answer_start + answer_spans[-2][1], next_answer_stop + answer_spans[-2][1]) next_answer_doc_id, next_answer_id_in_doc = find_doc_with_answer(next_yi0[1], xi_len) answer_doc_ids[-1].append(next_answer_doc_id) answer_ids_in_doc[-1].append(next_answer_id_in_doc) assert len(xi[yi0[0]]) > yi0[1] assert len(xi[yi1[0]]) >= yi1[1] w0 = xi[yi0[0]][yi0[1]] w1 = xi[yi1[0]][yi1[1]-1] i0 = get_word_idx(tokenized_context, xi, yi0) i1 = get_word_idx(tokenized_context, xi, (yi1[0], yi1[1]-1)) cyi0 = answer_start - i0 cyi1 = answer_stop - i1 - 1 if args.medhop: assert answer_text[0] == w0[cyi0], (answer_text[0], w0[cyi0].lower(), answer_text, w0, cyi0) else: assert answer_text[0] == w0[cyi0].lower(), (answer_text[0], w0[cyi0].lower(), answer_text, w0, cyi0) assert answer_text[-1] == w1[cyi1].lower() assert cyi0 < 32, (answer_text, w0) assert cyi1 < 32, (answer_text, w1) yi.append([yi0, yi1]) cyi.append([cyi0, cyi1]) for qij in qi: word_counter[qij] += 1 lower_word_counter[qij.lower()] += 1 for qijk in qij: char_counter[qijk] += 1 q.append(qi) q2.append(q2i) cq.append(cqi) cq2.append(cq2i) y.append(yi) cy.append(cyi) ids.append(article['id']) answers.append(answer) bi += 1 assert len(q) == len(na), (len(qa), len(na)) assert len(q) == len(y), (len(q), len(y)) assert len(q) == len(x), (len(q), len(x)) assert len(q) == len(first_doc_ids), (len(q), len(first_doc_ids)) assert len(q) == len(answer_doc_ids), (len(q), len(answer_doc_ids)) # Get embedding map according to word_counter. 
word2vec_dict = get_word2vec(args, word_counter) lower_word2vec_dict = get_word2vec(args, lower_word_counter) # add context here """ q/cq: query y: answer rx/rcx: index pairs: (article_no, paragraph_no) cy: idxs: ids: answerss: na: """ if args.split_supports: if args.find_candidates: data = {'q': q, 'cq': cq, 'y': y, 'cy': cy, 'ids': ids, 'answers': answers, 'na': na, 'x': x, 'cx': cx, 'p': p, 'x2': x2, 'p2': p2, 'q2': q2, 'cq2': cq2, \ 'cand_span': cand_span, 'ccand_span': ccand_span, 'cand_span_y': cand_span_y, 'cand_word': cand_word, 'cand_word_y': cand_word_y, \ 'cand_word_found': cand_word_found, 'real_cand_word_found': real_cand_word_found, 'answer_doc_ids': answer_doc_ids, 'answer_ids_in_doc': answer_ids_in_doc, 'first_doc_ids': first_doc_ids} if args.rank_by_tfidf is False: assert len(topk_2layer_tfidf_docs) > 0 data.update({'topk_2layer_tfidf_docs': topk_2layer_tfidf_docs}) else: data = {'q': q, 'cq': cq, 'y': y, 'cy': cy, 'ids': ids, 'answers': answers, \ 'na': na, 'x': x, 'cx': cx, 'p': p, 'x2': x2, 'p2': p2, 'answer_doc_ids': answer_doc_ids, \ 'answer_ids_in_doc': answer_ids_in_doc, 'first_doc_ids': first_doc_ids} else: data = {'q': q, 'cq': cq, 'y': y, 'cy': cy, 'ids': ids, 'answers': answers, \ 'na': na, 'x': x, 'cx': cx, 'p': p} """ x/cx: tokenized paragraphs (words and chars) p: untokenized paragraphs """ shared = {'word_counter': word_counter, 'char_counter': char_counter, \ 'lower_word_counter': lower_word_counter, 'word2vec': word2vec_dict, \ 'lower_word2vec': lower_word2vec_dict} print("saving ...") print("no answer: %d" %sum(na)) print("missing candidates: %d" %mis_cand) if save_json: save(args, data, shared, out_name) else: prepro_each(args, data_type, out_name=out_name, pre_answer_doc_id=answer_doc_ids)
def prepro_each(args, data_type, start_ratio=0.0, stop_ratio=1.0, out_name="default", in_path=None): """ :param args: arguments :param data_type: train, dev or all :param start_ratio: default is 0.0 :param stop_ratio: default is 1.0 :param out_name: train, dev or test :param in_path: default is None, not sure about what is this :return: """ # 1. tokenize and sent tokenize if args.tokenizer == "PTB": import nltk sent_tokenize = nltk.sent_tokenize def word_tokenize(tokens): """ firstly word_tokenize the tokens and replace some chars, and return a list :param tokens: :return: """ return [token.replace("''", '"').replace("``", '"') for token in nltk.word_tokenize(tokens)] elif args.tokenizer == 'Stanford': from my.corenlp_interface import CoreNLPInterface interface = CoreNLPInterface(args.url, args.port) sent_tokenize = interface.split_doc word_tokenize = interface.split_sent else: raise Exception() if not args.split: sent_tokenize = lambda para: [para] # input is para, turn it to a list # 2. load data from disk source_path = in_path or os.path.join(args.source_dir, "{}-v{}.json".format(data_type, args.version)) source_data = json.load(open(file=source_path, mode='r')) # 3. initiate some counter and some lists q, cq, rx, rcx = [], [], [], [] # question, char_question, context, char_context y, cy, ids, idxs = [], [], [], [] x, cx = [], [] answerss, p = [], [] word_counter, char_counter, lower_word_counter = Counter(), Counter(), Counter() start_at_index = int(round(len(source_data['data']) * start_ratio)) stop_at_index = int(round(len(source_data['data']) * stop_ratio)) # 4. iterate the dataset max_ques_size = 0 max_context_size = 0 max_word_size = 0 for article_index, article in enumerate(tqdm(source_data['data'][start_at_index:stop_at_index])): xp, cxp, pp = [], [], [] x.append(xp) cx.append(cxp) p.append(pp) for paragraph_index, paragraph in enumerate(article['paragraphs']): context = paragraph['context'] context = context.replace("''", '" ') # notice this space, so the length of the context will not change when replace context = context.replace("``", '" ') # context is a str here list_of_wordlist = list(map(word_tokenize, sent_tokenize(context))) # after sent_tokenizer, it will be a list of sentence, here just one sentence, # a list of sentence # then the map, will apply the word_tokenize func to each sentence # a list of lists of words # [[words for sentence1], [words for sentence2]] list_of_wordlist = [process_tokens(tokens) for tokens in list_of_wordlist] # list_of_wordlist is a 2d stuff for wordlist in list_of_wordlist: max_context_size = max(max_context_size, len(wordlist)) list_of_charlist = [[list(word) for word in wordlist] for wordlist in list_of_wordlist] # list of charlist is a 3d, sentence-dim, word-dim, char-dim xp.append(list_of_wordlist) # 3d, paragraph, sentence, words cxp.append(list_of_charlist) # 4d, paragraph, sentence, words, chars pp.append(context) # 2d, paragraph, context ## update counters num_qas = len(paragraph['qas']) for wordlist in list_of_wordlist: for word in wordlist: word_counter[word] += num_qas lower_word_counter[word.lower()] += num_qas for char in word: char_counter[char] += num_qas rxi = [article_index, paragraph_index] assert len(x) - 1 == article_index # x stores xp, xp is 3d, paragraph, sentece, and words assert len(x[article_index]) - 1 == paragraph_index for question in paragraph['qas']: question_wordslist = word_tokenize(question['question']) max_ques_size = max(max_ques_size, len(question_wordslist)) # it's a list of words question_charslist = 
[list(word) for word in question_wordslist] # it's a list of charlist yi = [] cyi = [] answers = [] # the content of each answers for answer in question['answers']: answer_text = answer['text'] answers.append(answer_text) answer_start_index = answer['answer_start'] answer_end_index = answer_start_index + len(answer_text) yi0, yi1 = get_word_span(context, list_of_wordlist, # 2-d: sentences, words answer_start_index, answer_end_index) # yi0 (0, 108), 0 is the index of sentence # yi1 (0, 111). 108 and 111 is the start and end of word index assert len(list_of_wordlist[yi0[0]]) > yi0[1] # the length of the first sentence is larger than 108 assert len(list_of_wordlist[yi1[0]]) >= yi1[1] # the length of the first sentence is larger or equla to 111 w0 = list_of_wordlist[yi0[0]][yi0[1]] # the start words of the answer w1 = list_of_wordlist[yi1[0]][yi1[1] - 1] # the last word of the answer i0 = get_word_idx(context, list_of_wordlist, yi0) i1 = get_word_idx(context, list_of_wordlist, (yi1[0], yi1[1] - 1)) # i0 is 515, which is the char index of the answer, # i1 is start index of the final word in terms of chars # 'Saint Bernadette Soubirous', i1 is the index of S in Soubirous cyi0 = answer_start_index - i0 # it should be 0 here since start index is 515, and i0 should also be 515 cyi1 = answer_end_index - i1 - 1 # cyi1 seems to be the length of last word -1, or because some other issues assert answer_text[0] == w0[cyi0], (answer_text, w0, cyi0) assert answer_text[-1] == w1[cyi1] # be sure the first char and last char are same with the first word's first char and last word's last char assert cyi0 < 32, (answer_text, w0) assert cyi1 < 32, (answer_text, w1) # yi.append([yi0, yi1]) # index of words cyi.append([cyi0, cyi1]) # index of shifts from the first char and last char of the answer in context # update counters for word in question_wordslist: word_counter[word] += 1 lower_word_counter[word.lower()] += 1 for char in word: char_counter[char] += 1 q.append(question_wordslist) # 2-d list of wordlist for each question cq.append(question_charslist) # 3-d, question-word-char y.append(yi) # question-startendpair cy.append(cyi) # question-startend char pair rx.append(rxi) # list of article_id-paragraph_id pair rcx.append(rxi) ids.append(question['id']) # ids for each question idxs.append(len(idxs)) # index for each question answerss.append(answers) # list of answer in string word2vec_dict = get_word2vec(args, word_counter) lower_word2vec_dict = get_word2vec(args, lower_word_counter) for word in word_counter: max_word_size = max(max_word_size, len(word)) # add context here data = { 'q': q, # list of word list of each questions, [['who','are', 'you'], ... ] 'cq': cq, # [<class 'list'>: [['T', 'o'], ['w', 'h', 'o', 'm'], ['d', 'i', 'd'], ['t', 'h', 'e'], ['V', 'i', 'r', 'g', 'i', 'n'], ['M', 'a', 'r', 'y'], ['a', 'l', 'l', 'e', 'g', 'e', 'd', 'l', 'y'], ['a', 'p', 'p', 'e', 'a', 'r'], ['i', 'n'], ['1', '8', '5', '8'], ['i', 'n'], ['L', 'o', 'u', 'r', 'd', 'e', 's'], ['F', 'r', 'a', 'n', 'c', 'e'], ['?']] , ...] 
'y': y, # list of <class 'list'>: [[(0, 108), (0, 111)]] '*x': rx, # list of <class 'list'>: [0, 21], 0 means the number of article, 21 means the 21st paragraph '*cx': rcx, # same with rx but for characters, i guess the values are same as well 'cy': cy, # 'idxs': idxs, # just those ids 'ids': ids, # the id of each question, sth like uuid 'answerss': answerss, # the content of the answer '*p': rx # } shared = { 'x': x, # words of each paragraph 'cx': cx, # characters of each 'p': p, # the content of each paragraph 'word_counter': word_counter, 'char_counter': char_counter, 'lower_word_counter': lower_word_counter, 'word2vec': word2vec_dict, 'lower_word2vec': lower_word2vec_dict } print("saving ...")
def prepro_each(args, data_type, start_ratio=0.0, stop_ratio=1.0, out_name="default", in_path=None): if args.tokenizer == "PTB": import nltk sent_tokenize = nltk.sent_tokenize def word_tokenize(tokens): return [token.replace("''", '"').replace("``", '"') for token in nltk.word_tokenize(tokens)] elif args.tokenizer == 'Stanford': from my.corenlp_interface import CoreNLPInterface interface = CoreNLPInterface(args.url, args.port) sent_tokenize = interface.split_doc word_tokenize = interface.split_sent else: raise Exception() if not args.split: sent_tokenize = lambda para: [para] # source_path = in_path or os.path.join(args.source_dir, "{}-v1.1.json".format(data_type)) source_path = in_path or os.path.join(args.source_dir, "{}.json".format(data_type)) source_data = json.load(open(source_path, 'r')) q, cq, y, rx, rcx, ids, idxs = [], [], [], [], [], [], [] cy = [] x, cx = [], [] answerss = [] p = [] word_counter, char_counter, lower_word_counter = Counter(), Counter(), Counter() # start_ai = int(round(len(source_data['data']) * start_ratio)) # stop_ai = int(round(len(source_data['data']) * stop_ratio)) start_ai = int(round(len(source_data) * start_ratio)) stop_ai = int(round(len(source_data) * stop_ratio)) # for ai, article in enumerate(tqdm(source_data['data'][start_ai:stop_ai])): for ai, article in enumerate(tqdm(source_data[start_ai:stop_ai])): xp, cxp = [], [] pp = [] x.append(xp) cx.append(cxp) p.append(pp) # print(article.keys()) # paragraph, title # raise # print(article) # {'question', 'answer', 'context', 'answer_list'} # raise # for pi, para in enumerate(article['paragraphs']): for pi, para in enumerate([article]): # print(para.keys()) # qas, context # raise # wordss context = para['context'] context = context.replace("''", '" ') context = context.replace("``", '" ') xi = list(map(word_tokenize, sent_tokenize(context))) xi = [process_tokens(tokens) for tokens in xi] # process tokens # print(xi) # [['archi', ',', 'the', 'school']] # raise # given xi, add chars cxi = [[list(xijk) for xijk in xij] for xij in xi] xp.append(xi) cxp.append(cxi) pp.append(context) # print(len(para['qas'])) # 5 # print(para['qas']) # [{'answers': [{'text', 'answer_start'}], 'id', 'question'}] # raise for xij in xi: for xijk in xij: # word_counter[xijk] += len(para['qas']) # lower_word_counter[xijk.lower()] += len(para['qas']) word_counter[xijk] += 1 lower_word_counter[xijk.lower()] += 1 for xijkl in xijk: # char_counter[xijkl] += len(para['qas']) char_counter[xijkl] += 1 rxi = [ai, pi] assert len(x) - 1 == ai assert len(x[ai]) - 1 == pi # for qa in para['qas']: for qa in [article]: # get words # qi = word_tokenize(qa['question']) qi = word_tokenize(qa['question']) cqi = [list(qij) for qij in qi] yi = [] cyi = [] answers = [] # for answer in qa['answers']: try: answer = qa['answer'] answer_text = qa['answer_list'][answer-1] except KeyError: answer_text = ' '.join(qa['answer_list']) for _ in [answer_text]: # answer_text = answer['text'] answers.append(answer_text) # answer_start = answer['answer_start'] try: answer_start = context.index(answer_text) answer_stop = answer_start + len(answer_text) except ValueError: answer_start = 0 answer_stop = len(context) # TODO : put some function that gives word_start, word_stop here yi0, yi1 = get_word_span(context, xi, answer_start, answer_stop) # yi0 = answer['answer_word_start'] or [0, 0] # yi1 = answer['answer_word_stop'] or [0, 1] assert len(xi[yi0[0]]) > yi0[1] assert len(xi[yi1[0]]) >= yi1[1] w0 = xi[yi0[0]][yi0[1]] w1 = xi[yi1[0]][yi1[1]-1] # context: str # xi: 
[[word, word, word, ...]] i0 = get_word_idx(context, xi, yi0) i1 = get_word_idx(context, xi, (yi1[0], yi1[1]-1)) cyi0 = answer_start - i0 cyi1 = answer_stop - i1 - 1 # print(answer_text, '|', w0[cyi0:], '|', w1[:cyi1+1]) # raise #assert answer_text[0] == w0[cyi0], (answer_text, w0, cyi0) #assert answer_text[-1] == w1[cyi1] assert cyi0 < 32, (answer_text, w0) assert cyi1 < 32, (answer_text, w1) yi.append([yi0, yi1]) cyi.append([cyi0, cyi1]) for qij in qi: word_counter[qij] += 1 lower_word_counter[qij.lower()] += 1 for qijk in qij: char_counter[qijk] += 1 q.append(qi) cq.append(cqi) y.append(yi) cy.append(cyi) rx.append(rxi) rcx.append(rxi) # ids.append(qa['id']) ids.append(qa['question']) idxs.append(len(idxs)) answerss.append(answers) if args.debug: break word2vec_dict = get_word2vec(args, word_counter) lower_word2vec_dict = get_word2vec(args, lower_word_counter) # add context here data = {'q': q, 'cq': cq, 'y': y, '*x': rx, '*cx': rcx, 'cy': cy, 'idxs': idxs, 'ids': ids, 'answerss': answerss, '*p': rx} shared = {'x': x, 'cx': cx, 'p': p, 'word_counter': word_counter, 'char_counter': char_counter, 'lower_word_counter': lower_word_counter, 'word2vec': word2vec_dict, 'lower_word2vec': lower_word2vec_dict} print("saving ...") save(args, data, shared, out_name)
import json import sys from tqdm import tqdm from my.corenlp_interface import CoreNLPInterface in_path = sys.argv[1] out_path = sys.argv[2] url = sys.argv[3] port = int(sys.argv[4]) data = json.load(open(in_path, 'r')) h = CoreNLPInterface(url, port) def find_all(a_str, sub): start = 0 while True: start = a_str.find(sub, start) if start == -1: return yield start start += len(sub) # use start += 1 to find overlapping matches def to_hex(s): return " ".join(map(hex, map(ord, s))) def handle_nobreak(cand, text): if cand == text:
def prepro_each(args, data_type, start_ratio=0.0, stop_ratio=1.0, out_name="default", in_path=None): if args.tokenizer == "PTB": import nltk sent_tokenize = nltk.sent_tokenize def word_tokenize(tokens): return [token.replace("''", '"').replace("``", '"') for token in nltk.word_tokenize(tokens)] elif args.tokenizer == 'Stanford': from my.corenlp_interface import CoreNLPInterface interface = CoreNLPInterface(args.url, args.port) sent_tokenize = interface.split_doc word_tokenize = interface.split_sent else: raise Exception() # source_path = in_path or os.path.join(args.source_dir, "{}.seq.json".format(data_type)) # source_data = json.load(open(source_path, 'r')) q, cq = [], [] sents, csents = [], [] rsents, rcsents = [], [] sentslen = [] labels = [] ids = [] word_counter, char_counter, lower_word_counter = Counter(), Counter(), Counter() outfile = open('noise.txt', 'w') enum = 0 total = 0 overlap = 0 if(args.mode=="squad"): source_path = os.path.join(args.source_dir, "{}.json".format(data_type)) source_data = json.load(open(source_path, 'r')) start_ai = int(round(len(source_data['data']) * start_ratio)) stop_ai = int(round(len(source_data['data']) * stop_ratio)) for ai, article in enumerate(tqdm(source_data['data'][start_ai:stop_ai])): for pi, para in enumerate(article['paragraphs']): # wordss context = para['context'] context = context.replace("''", '" ') context = context.replace("``", '" ') xi = list(map(word_tokenize, sent_tokenize(context))) xi = [process_tokens(tokens) for tokens in xi] # process tokens xi = [[xijk for xijk in xij if xijk != ''] for xij in xi] # context in sent-level contexti = sent_tokenize(context) context_sent_len = [] len_cur = 0 for cidx, c in enumerate(contexti): len_cur += len(c) + 1 context_sent_len.append(len_cur) #assert len_cur-1 == len(context), (len_cur, len(context)) # sentences in word-level sentsi = xi # sentences in char-level csentsi = [[list(xijk) for xijk in xij] for xij in xi] if args.debug: print(sentsi) for xij in xi: for xijk in xij: word_counter[xijk] += len(para['qas']) lower_word_counter[xijk.lower()] += len(para['qas']) for xijkl in xijk: char_counter[xijkl] += len(para['qas']) for qa in para['qas']: # get question words qaid = qa["id"] qi = word_tokenize(qa['question']) cqi = [list(qij) for qij in qi] answer_loc_list = [] # if(len(qa['answers'])>1): # continue answer = qa['answers'][0] # for answer in qa['answers']: answer_text = answer['text'] ansi = word_tokenize(answer_text) answer_location = answer['answer_start'] answer_start = answer['answer_start'] answer_stop = answer_start + len(answer_text) answer_loc_list.append(get_sent_loc_idx(context_sent_len, answer_start, answer_stop)) score = get_score(answer_loc_list, len(sentsi), args.kernel_size) label = get_label(answer_loc_list, len(sentsi)) for si in range(len(sentsi)): if(len(sentsi[si]) > 60 or noise_flag(sentsi[si])): outfile.write(' '.join(sentsi[si])+'\n') enum+=1 continue sents.append([sentsi[si]]) sentslen.append(len(sentsi[si])) csents.append([csentsi[si]]) q.append(qi) cq.append(cqi) scores.append(score[si]) labels.append(label[si]) ids.append(qaid) for qij in qi: word_counter[qij] += 1 lower_word_counter[qij.lower()] += 1 for qijk in qij: char_counter[qijk] += 1 else: fi = 0 qlen = [] slen = [] for file_size in ['0-400', '400-700', '700-']: source_path = os.path.join(args.source_dir, "{0}/{1}.seq.json".format(file_size, data_type)) source_data = json.load(open(source_path, 'r')) start_ai = int(round(len(source_data['data']) * start_ratio)) stop_ai = 
int(round(len(source_data['data']) * stop_ratio)) for ai, article in enumerate(tqdm(source_data['data'][start_ai:stop_ai])): xp, cxp = [], [] sents.append(xp) csents.append(cxp) for pi, para in enumerate(article['paragraphs']): # wordss context = para['context'] context = context.replace("''", '" ') context = context.replace("``", '" ') xi = list(map(word_tokenize, sent_tokenize(context))) xi = [process_tokens(tokens) for tokens in xi] # process tokens xi = [[xijk for xijk in xij if xijk != ''] for xij in xi] # context in sent-level contexti = sent_tokenize(context) context_sent_len = [] len_cur = 0 for cidx, c in enumerate(contexti): len_cur += len(c) + 1 if(len(xi[cidx]) < 200): slen.append(len(xi[cidx])) context_sent_len.append(len_cur) assert len_cur-1 == len(context), (len_cur, len(context)) # sentences in word-level sentsi = xi # sentences in char-level csentsi = [[list(xijk) for xijk in xij] for xij in xi] xp.append([[sent] for sent in sentsi]) cxp.append([[csent] for csent in csentsi]) if args.debug: print(sentsi) for xij in xi: for xijk in xij: word_counter[xijk] += len(para['qas']) lower_word_counter[xijk.lower()] += len(para['qas']) for xijkl in xijk: char_counter[xijkl] += len(para['qas']) for qa in para['qas']: # get question words total += 1 qaid = qa["id"] qi = word_tokenize(qa['question']) for qw in qi: oflag = False for xs in xi[0]: if qw not in STOPWORDS and qw in xs: overlap += 1 oflag = True break if(oflag): break qlen.append(len(qi)) cqi = [list(qij) for qij in qi] answer_loc_list = [] # if(len(qa['answers'])>1): # continue answer = qa['answers'][0] # for answer in qa['answers']: answer_text = answer['text'] ansi = word_tokenize(answer_text) answer_location = answer['answer_location'] api = [] for ans_idx, answer_start in enumerate(answer_location): answer_stop = answer_start + len(ansi[ans_idx]) answer_loc_senti = get_sent_loc_idx(context_sent_len, answer_start, answer_stop) answer_loc_list.append(answer_loc_senti) label = get_label(answer_loc_list, len(sentsi)) for si in range(len(sentsi)): if(len(sentsi[si]) > 60 or noise_flag(sentsi[si])): outfile.write(' '.join(sentsi[si])+'\n') enum+=1 continue rsentsi = [ai+fi, pi, si] rx = rsentsi assert(sentsi[si] == sents[rx[0]][rx[1]][rx[2]][0]) #sents.append([sentsi[si]]) sentslen.append(len(sentsi[si])) #csents.append([csentsi[si]]) q.append(qi) cq.append(cqi) labels.append(label[si]) ids.append(qaid) rsents.append(rsentsi) rcsents.append(rsentsi) for qij in qi: word_counter[qij] += 1 lower_word_counter[qij.lower()] += 1 for qijk in qij: char_counter[qijk] += 1 if args.debug: break fi += stop_ai-start_ai word2vec_dict = get_word2vec(args, word_counter) lower_word2vec_dict = get_word2vec(args, lower_word_counter) # add context here print(len(q), len(cq), len(labels)) print(float(overlap)/total) print(enum) data = {'q': q, 'cq': cq, '*sents': rsents, '*csents': rcsents, 'label': labels, "id": ids, "sentslen": sentslen} shared = {'sents': sents, 'csents': csents, 'word_counter': word_counter, 'char_counter': char_counter, 'lower_word_counter': lower_word_counter, 'word2vec': word2vec_dict, 'lower_word2vec': lower_word2vec_dict} # print("saving ...") # save(args, data, shared, out_name) plt.figure() sns.set( palette="muted", color_codes=True) sns.distplot(qlen, kde_kws={"label":"Question Length Distribution"}) plt.savefig("qld") plt.figure() sns.distplot(slen, kde_kws={"label":"Sentence Length Distribution"}) plt.savefig("sld") plt.show()
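# The sentence-selection variant above calls helpers that are not defined in
# this file (get_sent_loc_idx, get_label, get_score, noise_flag). The two
# sketches below are inferred only from their call sites and are assumptions,
# not the original code: context_sent_len holds cumulative character offsets
# of sentence ends, and the label is 1 for sentences containing an answer.
def sketch_get_sent_loc_idx(context_sent_len, answer_start, answer_stop):
    # index of the first sentence whose end offset lies past the answer start;
    # the real helper may also use answer_stop for answers crossing sentences
    for sent_idx, end_offset in enumerate(context_sent_len):
        if answer_start < end_offset:
            return sent_idx
    return len(context_sent_len) - 1


def sketch_get_label(answer_loc_list, num_sents):
    # one binary label per sentence: 1 if any answer falls in that sentence
    return [1 if si in answer_loc_list else 0 for si in range(num_sents)]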
def prepro_each(args, data_type, start_ratio=0.0, stop_ratio=1.0, out_name="default", in_path=None): """ :param args: configurations :param data_type: train or dev :param start_ratio: :param stop_ratio: :param out_name: train, dev, test :param in_path: :return: """ if args.tokenizer == "PTB": import nltk sent_tokenize = nltk.sent_tokenize def word_tokenize(tokens): return [ token.replace("''", '"').replace("``", '"') for token in nltk.word_tokenize(tokens) ] elif args.tokenizer == 'Stanford': from my.corenlp_interface import CoreNLPInterface interface = CoreNLPInterface(args.url, args.port) sent_tokenize = interface.split_doc word_tokenize = interface.split_sent else: raise Exception() if not args.split: sent_tokenize = lambda para: [para] # 1. load data source_path = in_path or os.path.join( args.source_dir, "{}-v{}.json".format(data_type, args.version)) source_data = json.load(open(source_path, 'r')) # load the train data or dev 1.1 dataset q, cq, y, rx, rcx, ids, idxs = [], [], [], [], [], [], [] cy = [] x, cx = [], [] answerss = [] p = [] word_counter, char_counter, lower_word_counter = Counter(), Counter( ), Counter() start_at_index = int(round(len(source_data['data']) * start_ratio)) stop_at_index = int(round(len(source_data['data']) * stop_ratio)) # for each article for article_index, article in enumerate( tqdm(source_data['data'][start_at_index:stop_at_index])): xp, cxp = [], [] pp = [] x.append(xp) # article_paragraph_sentence_wordlist cx.append(cxp) # article_paragraph_sentence_word_charlist p.append(pp) # article_contextlist # for each paragrph of the article for paragraph_index, paragraph in enumerate(article['paragraphs']): # wordss context = paragraph['context'] context = context.replace("''", '" ') context = context.replace("``", '" ') list_of_wordlist = list(map(word_tokenize, sent_tokenize(context))) list_of_wordlist = [ process_tokens(tokens) for tokens in list_of_wordlist ] # process tokens # xi are words # given xi, add chars list_of_charlist = [[list(word) for word in word_list] for word_list in list_of_wordlist] # cxi are characters for each words xp.append(list_of_wordlist) # paragraph_sentence_wordlist cxp.append(list_of_charlist) # paragraph_sentence_word_charlist pp.append(context) # contextlist # update the counter to plus the number of questions for wordlist in list_of_wordlist: for word in wordlist: word_counter[word] += len(paragraph['qas']) lower_word_counter[word.lower()] += len(paragraph['qas']) for char in word: char_counter[char] += len(paragraph['qas']) rxi = [article_index, paragraph_index] assert len(x) - 1 == article_index assert len(x[article_index]) - 1 == paragraph_index for question in paragraph['qas']: # get words question_wordslist = word_tokenize(question['question']) question_charslist = [list(qij) for qij in question_wordslist] yi = [] cyi = [] answers = [] for answer in question['answers']: answer_text = answer['text'] answers.append(answer_text) answer_start = answer['answer_start'] answer_stop = answer_start + len(answer_text) # TODO : put some function that gives word_start, word_stop here yi0, yi1 = get_word_span(context, list_of_wordlist, answer_start, answer_stop) # yi0 = answer['answer_word_start'] or [0, 0] # yi1 = answer['answer_word_stop'] or [0, 1] assert len(list_of_wordlist[yi0[0]]) > yi0[1] assert len(list_of_wordlist[yi1[0]]) >= yi1[1] w0 = list_of_wordlist[yi0[0]][yi0[1]] w1 = list_of_wordlist[yi1[0]][yi1[1] - 1] i0 = get_word_idx(context, list_of_wordlist, yi0) i1 = get_word_idx(context, list_of_wordlist, (yi1[0], yi1[1] - 
1)) cyi0 = answer_start - i0 cyi1 = answer_stop - i1 - 1 # print(answer_text, w0[cyi0:], w1[:cyi1+1]) assert answer_text[0] == w0[cyi0], (answer_text, w0, cyi0) assert answer_text[-1] == w1[cyi1] assert cyi0 < 32, (answer_text, w0) assert cyi1 < 32, (answer_text, w1) yi.append([yi0, yi1]) cyi.append([cyi0, cyi1]) for qij in question_wordslist: word_counter[qij] += 1 lower_word_counter[qij.lower()] += 1 for qijk in qij: char_counter[qijk] += 1 q.append(question_wordslist) # question_wordlist, cq.append(question_charslist) # qeustion_word_charlist y.append(yi) # cy.append(cyi) rx.append(rxi) rcx.append(rxi) ids.append(question['id']) idxs.append(len(idxs)) answerss.append(answers) if args.debug: break word2vec_dict = get_word2vec(args, word_counter) lower_word2vec_dict = get_word2vec(args, lower_word_counter) # add context here data = { 'q': q, # list of word list of each questions, [['who','are', 'you'], ... ] 'cq': cq, # [<class 'list'>: [['T', 'o'], ['w', 'h', 'o', 'm'], ['d', 'i', 'd'], ['t', 'h', 'e'], ['V', 'i', 'r', 'g', 'i', 'n'], ['M', 'a', 'r', 'y'], ['a', 'l', 'l', 'e', 'g', 'e', 'd', 'l', 'y'], ['a', 'p', 'p', 'e', 'a', 'r'], ['i', 'n'], ['1', '8', '5', '8'], ['i', 'n'], ['L', 'o', 'u', 'r', 'd', 'e', 's'], ['F', 'r', 'a', 'n', 'c', 'e'], ['?']] , ...] 'y': y, # list of <class 'list'>: [[(0, 108), (0, 111)]] '*x': rx, # list of <class 'list'>: [0, 21], 0 means the number of article, 21 means the 21st paragraph '*cx': rcx, # same with rx but for characters, i guess the values are same as well 'cy': cy, # 'idxs': idxs, # just those ids 'ids': ids, # the id of each question, sth like uuid 'answerss': answerss, # the content of the answer '*p': rx # } # the following variables are shared by several question, shared = { 'x': x, # words of each paragraph 'cx': cx, # characters of each 'p': p, # the content of each paragraph 'word_counter': word_counter, 'char_counter': char_counter, 'lower_word_counter': lower_word_counter, 'word2vec': word2vec_dict, 'lower_word2vec': lower_word2vec_dict } print("saving ...")
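# For reference, the nesting walked by the annotated loader above is the
# standard SQuAD v1.1 layout. The record below is illustrative (title and
# question echo the example in the comments above; the id is made up and the
# answer_start matches only this toy context), assembled just to show the
# fields that are accessed.
example_squad_record = {
    "data": [
        {
            "title": "University_of_Notre_Dame",
            "paragraphs": [
                {
                    "context": "... Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858 ...",
                    "qas": [
                        {
                            "id": "0000000000000000000000a1",
                            "question": "To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?",
                            "answers": [
                                {"text": "Saint Bernadette Soubirous", "answer_start": 38}
                            ]
                        }
                    ]
                }
            ]
        }
    ],
    "version": "1.1"
}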
def prepro_each(args, data_type, start_ratio=0.0, stop_ratio=1.0, out_name="default", in_path=None):
    if args.tokenizer == "PTB":
        import nltk
        sent_tokenize = nltk.sent_tokenize

        def word_tokenize(tokens):
            # return [token.replace("''", '"').replace("``", '"') for token in nltk.word_tokenize(tokens)]
            return [token.replace("''", '"').replace("``", '"')
                    for token in jieba.lcut(tokens, cut_all=False)]
    elif args.tokenizer == 'Stanford':
        from my.corenlp_interface import CoreNLPInterface
        interface = CoreNLPInterface(args.url, args.port)
        sent_tokenize = interface.split_doc
        word_tokenize = interface.split_sent
    else:
        raise Exception()

    # if not args.split:
    #     sent_tokenize = lambda para: [para]

    source_path = in_path or os.path.join(args.source_dir, "{}-v1.1.json".format(data_type))
    source_data = json.load(open(source_path, 'r', encoding='utf-8'))

    q, cq, y, rx, rcx, ids, idxs = [], [], [], [], [], [], []
    cy = []
    x, cx = [], []
    answerss = []
    p = []
    word_counter, char_counter, lower_word_counter = Counter(), Counter(), Counter()
    start_ai = int(round(len(source_data['data']) * start_ratio))
    stop_ai = int(round(len(source_data['data']) * stop_ratio))
    for ai, article in enumerate(tqdm(source_data['data'][start_ai:stop_ai])):
        xp, cxp = [], []
        pp = []
        x.append(xp)
        cx.append(cxp)
        p.append(pp)
        for pi, para in enumerate(article['paragraphs']):
            # wordss
            context = para['context']
            context = context.replace("''", '" ')
            context = context.replace("``", '" ')
            ################### add by zhijing
            table = {ord(f): ord(t) for f, t in zip(
                ',。!?【】()%#@&1234567890',
                ',.!?[]()%#@&1234567890')}
            context = context.translate(table)
            ################### add by zhijing
            print(context)
            print(len(sent_tokenize(context)))
            xi = list(map(word_tokenize, sent_tokenize(context)))
            xi = [process_tokens(tokens) for tokens in xi]  # process tokens
            print('xi')
            print(xi)
            # given xi, add chars
            cxi = [[list(xijk) for xijk in xij] for xij in xi]
            xp.append(xi)
            cxp.append(cxi)
            pp.append(context)
            for xij in xi:
                for xijk in xij:
                    word_counter[xijk] += len(para['qas'])
                    lower_word_counter[xijk.lower()] += len(para['qas'])
                    for xijkl in xijk:
                        char_counter[xijkl] += len(para['qas'])
            rxi = [ai, pi]
            assert len(x) - 1 == ai
            assert len(x[ai]) - 1 == pi
            for qa in para['qas']:
                # get words
                qi = word_tokenize(qa['question'])
                cqi = [list(qij) for qij in qi]
                yi = []
                cyi = []
                answers = []
                for answer in qa['answers']:
                    answer_text = answer['text']
                    answers.append(answer_text)
                    answer_start = answer['answer_start']
                    answer_stop = answer_start + len(answer_text)
                    # TODO : put some function that gives word_start, word_stop here
                    yi0, yi1 = get_word_span(context, xi, answer_start, answer_stop)
                    # yi0 = answer['answer_word_start'] or [0, 0]
                    # yi1 = answer['answer_word_stop'] or [0, 1]
                    assert len(xi[yi0[0]]) > yi0[1]
                    assert len(xi[yi1[0]]) >= yi1[1]
                    w0 = xi[yi0[0]][yi0[1]]
                    w1 = xi[yi1[0]][yi1[1] - 1]
                    i0 = get_word_idx(context, xi, yi0)
                    i1 = get_word_idx(context, xi, (yi1[0], yi1[1] - 1))
                    cyi0 = answer_start - i0
                    cyi1 = answer_stop - i1 - 1
                    # print(answer_text, w0[cyi0:], w1[:cyi1+1])
                    assert answer_text[0] == w0[cyi0], (answer_text, w0, cyi0)
                    assert answer_text[-1] == w1[cyi1]
                    assert cyi0 < 32, (answer_text, w0)
                    assert cyi1 < 32, (answer_text, w1)
                    yi.append([yi0, yi1])
                    cyi.append([cyi0, cyi1])
                for qij in qi:
                    word_counter[qij] += 1
                    lower_word_counter[qij.lower()] += 1
                    for qijk in qij:
                        char_counter[qijk] += 1
                q.append(qi)
                cq.append(cqi)
                y.append(yi)
                cy.append(cyi)
                rx.append(rxi)
                rcx.append(rxi)
                ids.append(qa['id'])
                idxs.append(len(idxs))
                answerss.append(answers)
        if args.debug:
            break

    word2vec_dict = get_word2vec(args, word_counter)
    lower_word2vec_dict = get_word2vec(args, lower_word_counter)
    # add context here
    data = {'q': q, 'cq': cq, 'y': y, '*x': rx, '*cx': rcx, 'cy': cy,
            'idxs': idxs, 'ids': ids, 'answerss': answerss, '*p': rx}
    shared = {'x': x, 'cx': cx, 'p': p,
              'word_counter': word_counter, 'char_counter': char_counter,
              'lower_word_counter': lower_word_counter,
              'word2vec': word2vec_dict, 'lower_word2vec': lower_word2vec_dict}
    print("saving ...")
    save(args, data, shared, out_name)
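# Standalone view of the normalization step marked "add by zhijing" above: the
# translate table maps full-width Chinese punctuation (and, in the original,
# full-width digits) to ASCII so that sentence splitting sees familiar
# delimiters. Minimal demo with an unambiguous subset of that mapping.
_zh_table = {ord(f): ord(t) for f, t in zip(',。!?', ',.!?')}
assert '你好,世界。'.translate(_zh_table) == '你好,世界.'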
def prepro_each(args, data_type, start_ratio=0.0, stop_ratio=1.0, out_name="default", in_path=None):
    if args.tokenizer == "PTB":
        import nltk
        sent_tokenize = nltk.sent_tokenize

        def word_tokenize(tokens):
            return [token.replace("''", '"').replace("``", '"')
                    for token in nltk.word_tokenize(tokens)]
    elif args.tokenizer == 'Stanford':
        from my.corenlp_interface import CoreNLPInterface
        interface = CoreNLPInterface(args.url, args.port)
        sent_tokenize = interface.split_doc
        word_tokenize = interface.split_sent
    else:
        raise Exception()

    if not args.split:
        sent_tokenize = lambda para: [para]

    source_path = in_path or os.path.join(args.source_dir, "{}-v1.1.json".format(data_type))
    source_data = json.load(open(source_path, 'r'))

    q, cq, y, rx, rcx, ids, idxs = [], [], [], [], [], [], []
    cy = []
    x, cx = [], []
    answerss = []
    p = []
    word_counter, char_counter, lower_word_counter = Counter(), Counter(), Counter()
    start_ai = int(round(len(source_data['data']) * start_ratio))
    stop_ai = int(round(len(source_data['data']) * stop_ratio))
    for ai, article in enumerate(tqdm(source_data['data'][start_ai:stop_ai])):
        xp, cxp = [], []
        pp = []
        x.append(xp)
        cx.append(cxp)
        p.append(pp)
        for pi, para in enumerate(article['paragraphs']):
            # wordss
            context = para['context']
            context = context.replace("''", '" ')
            context = context.replace("``", '" ')
            xi = list(map(word_tokenize, sent_tokenize(context)))
            xi = [process_tokens(tokens) for tokens in xi]  # process tokens
            # given xi, add chars
            cxi = [[list(xijk) for xijk in xij] for xij in xi]
            xp.append(xi)
            cxp.append(cxi)
            pp.append(context)
            for xij in xi:
                for xijk in xij:
                    word_counter[xijk] += len(para['qas'])
                    lower_word_counter[xijk.lower()] += len(para['qas'])
                    for xijkl in xijk:
                        char_counter[xijkl] += len(para['qas'])
            rxi = [ai, pi]
            assert len(x) - 1 == ai
            assert len(x[ai]) - 1 == pi
            for qa in para['qas']:
                # get words
                qi = word_tokenize(qa['question'])
                cqi = [list(qij) for qij in qi]
                yi = []
                cyi = []
                answers = []
                answer = qa['answer']
                yi.append(answer)
                for qij in qi:
                    word_counter[qij] += 1
                    lower_word_counter[qij.lower()] += 1
                    for qijk in qij:
                        char_counter[qijk] += 1

                def put():
                    q.append(qi)
                    cq.append(cqi)
                    y.append(yi)
                    rx.append(rxi)
                    rcx.append(rxi)
                    ids.append(qa['id'])
                    idxs.append(len(idxs))
                    answerss.append(answers)

                put()
                if data_type == 'train' and answer:
                    for i in range(3):
                        put()

    word2vec_dict = get_word2vec(args, word_counter)
    lower_word2vec_dict = get_word2vec(args, lower_word_counter)
    data = {'q': q, 'cq': cq, 'y': y, '*x': rx, '*cx': rcx,
            'idxs': idxs, 'ids': ids, 'answerss': answerss, '*p': rx}
    shared = {'x': x, 'cx': cx, 'p': p,
              'word_counter': word_counter, 'char_counter': char_counter,
              'lower_word_counter': lower_word_counter,
              'word2vec': word2vec_dict, 'lower_word2vec': lower_word2vec_dict}
    print("saving ...")
    save(args, data, shared, out_name)
def prepro_each(args, data_type, start_ratio=0.0, stop_ratio=1.0, out_name="default", in_path=None):
    if args.tokenizer == "PTB":
        import nltk
        sent_tokenize = nltk.sent_tokenize

        def word_tokenize(tokens):
            return [token.replace("''", '"').replace("``", '"')
                    for token in nltk.word_tokenize(tokens)]
    elif args.tokenizer == 'Stanford':
        from my.corenlp_interface import CoreNLPInterface
        interface = CoreNLPInterface(args.url, args.port)
        sent_tokenize = interface.split_doc
        word_tokenize = interface.split_sent
    else:
        raise Exception()

    if not args.split:
        sent_tokenize = lambda para: [para]

    source_path = in_path or os.path.join(args.source_dir, "{}-qar_squad_all.jsonl".format(data_type))
    rfp = open(source_path, 'r')

    q, cq, y, rx, rcx, ids, idxs = [], [], [], [], [], [], []
    cy = []
    x, cx = [], []
    answerss = []
    p = []
    word_counter, char_counter, lower_word_counter = Counter(), Counter(), Counter()
    # start_ai = int(round(len(source_data['data']) * start_ratio))
    # stop_ai = int(round(len(source_data['data']) * stop_ratio))
    pi = 0
    ai = 0
    xp, cxp = [], []
    pp = []
    x.append(xp)
    cx.append(cxp)
    p.append(pp)
    for line in tqdm(rfp):
        para = json.loads(line)
        context = para['context']
        context = context.replace("''", '" ')
        context = context.replace("``", '" ')
        xi = list(map(word_tokenize, sent_tokenize(context)))
        # xi = context.split()
        xi = [process_tokens(tokens) for tokens in xi]  # process tokens
        # given xi, add chars
        cxi = [[list(xijk) for xijk in xij] for xij in xi]
        xp.append(xi)
        cxp.append(cxi)
        pp.append(context)
        for xij in xi:
            for xijk in xij:
                word_counter[xijk] += len(para['qas'])
                lower_word_counter[xijk.lower()] += len(para['qas'])
                for xijkl in xijk:
                    char_counter[xijkl] += len(para['qas'])
        rxi = [ai, pi]
        assert len(x) - 1 == ai
        assert len(x[ai]) - 1 == pi
        for qa in para['qas']:
            # get words
            qa_text = qa['question']
            qa_text = qa_text.replace("''", '" ')
            qa_text = qa_text.replace("``", '" ')
            qi = word_tokenize(qa_text)
            # qi = qa['question'].split()
            cqi = [list(qij) for qij in qi]
            yi = []
            cyi = []
            answers = []
            for answer in qa['answers']:
                flag = False
                answer_text = answer['text']
                answer_text = answer_text.replace("''", '" ')
                answer_text = answer_text.replace("``", '" ')
                answers.append(answer_text)
                answer_start = answer['answer_start']
                answer_stop = answer_start + len(answer_text)
                # TODO : put some function that gives word_start, word_stop here
                yi0, yi1 = get_word_span(context, xi, answer_start, answer_stop)
                # yi0 = answer['answer_word_start'] or [0, 0]
                # yi1 = answer['answer_word_stop'] or [0, 1]
                assert len(xi[yi0[0]]) > yi0[1]
                assert len(xi[yi1[0]]) >= yi1[1]
                w0 = xi[yi0[0]][yi0[1]]
                w1 = xi[yi1[0]][yi1[1] - 1]
                if len(w1) == 0 and len(xi[yi1[0]][yi1[1] - 2]) != 0:
                    flag = True
                    w1 = xi[yi1[0]][yi1[1] - 2]
                i0 = get_word_idx(context, xi, yi0)
                i1 = get_word_idx(context, xi, (yi1[0], yi1[1] - 1))
                cyi0 = answer_start - i0
                cyi1 = answer_stop - i1 - 1
                # print(answer_text, w0[cyi0:], w1[:cyi1+1])
                assert answer_text[0] == w0[cyi0], (answer_text, w0, cyi0)
                if flag:
                    assert answer_text[-2] == w1[cyi1], (answer_text, w1, cyi1)
                else:
                    assert answer_text[-1] == w1[cyi1], (answer_text, w1, cyi1)
                assert cyi0 < 32, (answer_text, w0)
                assert cyi1 < 32, (answer_text, w1)
                yi.append([yi0, yi1])
                cyi.append([cyi0, cyi1])
            for qij in qi:
                word_counter[qij] += 1
                lower_word_counter[qij.lower()] += 1
                for qijk in qij:
                    char_counter[qijk] += 1
            q.append(qi)
            cq.append(cqi)
            y.append(yi)
            cy.append(cyi)
            rx.append(rxi)
            rcx.append(rxi)
            ids.append(qa['id'])
            idxs.append(len(idxs))
            answerss.append(answers)
        if args.debug:
            break
        pi += 1

    word2vec_dict = get_word2vec(args, word_counter)
    lower_word2vec_dict = get_word2vec(args, lower_word_counter)
    # add context here
    data = {'q': q, 'cq': cq, 'y': y, '*x': rx, '*cx': rcx, 'cy': cy,
            'idxs': idxs, 'ids': ids, 'answerss': answerss, '*p': rx}
    shared = {'x': x, 'cx': cx, 'p': p,
              'word_counter': word_counter, 'char_counter': char_counter,
              'lower_word_counter': lower_word_counter,
              'word2vec': word2vec_dict, 'lower_word2vec': lower_word2vec_dict}
    print("Saving ...")
    save(args, data, shared, out_name)
    print("Saving complete!")
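# The "{}-qar_squad_all.jsonl" file read above holds one JSON object per line.
# The shape below is inferred from the field accesses in the loop; the names
# and values are illustrative, not taken from the real data.
example_jsonl_record = {
    "context": "Tokenized paragraph text ...",
    "qas": [
        {
            "id": "q-0001",
            "question": "Which text is this ?",
            "answers": [{"text": "paragraph text", "answer_start": 10}]
        }
    ]
}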
def prepro_each(args, data_type, start_ratio=0.0, stop_ratio=1.0, out_name="default", in_path=None): print("Preprocessing data type %s" % data_type) if args.tokenizer == "PTB": import nltk sent_tokenize = nltk.sent_tokenize def word_tokenize(tokens): return [ token.replace("''", '"').replace("``", '"') for token in nltk.word_tokenize(tokens) ] elif args.tokenizer == 'Stanford': from my.corenlp_interface import CoreNLPInterface interface = CoreNLPInterface(args.url, args.port) sent_tokenize = interface.split_doc word_tokenize = interface.split_sent else: raise Exception() if not args.split: sent_tokenize = lambda para: [para] source_path = in_path or os.path.join(args.source_dir, "{}.csv".format(data_type)) print("Reading data from source path %s" % source_path) source_data = pd.read_csv(source_path, encoding='utf-8', dtype=dict(is_answer_absent=float), na_values=dict(question=[], story_text=[], validated_answers=[]), keep_default_na=False) q, cq, y, rx, rcx, ids, idxs = [], [], [], [], [], [], [] cy = [] x, cx = [], [] answerss = [] # Gold standard answers span_answerss = [] # Answers from our spans p = [] word_counter, char_counter, lower_word_counter = Counter(), Counter( ), Counter() start_ai = int(round(len(source_data) * start_ratio)) stop_ai = int(round(len(source_data) * stop_ratio)) data_rows = source_data.iterrows() story_ids_to_idx = {} idx_to_story_ids = {} for ai, data_point in enumerate(tqdm(data_rows)): question_index, question_info = data_point[0], data_point[1] story_id = question_info['story_id'] context = question_info['story_text'] context = context.replace("''", '" ') context = context.replace("``", '" ') question = question_info['question'] question_id = ai answer_char_ranges = question_info['answer_char_ranges'] # Copy get answer script from the newsqa dataset baseline_answers = [] # Prefer validated answers. # If there are no validated answers, use the ones that are provided. if not 'validated_answers' in question_info or not question_info[ 'validated_answers']: # Ignore per selection splits. 
char_ranges = question_info['answer_char_ranges'].replace( '|', ',').split(',') else: validated_answers_dict = json.loads( question_info['validated_answers']) char_ranges = [] for k, v in validated_answers_dict.items(): char_ranges += v * [k] for char_range in char_ranges: if char_range.lower() == 'none': baseline_answers.append('NONE') elif ':' in char_range: start, end = map(int, char_range.split(':')) answer = question_info['story_text'][start:end] baseline_answers.append(answer) paragraph_ptr = -1 pi = 0 if story_id not in story_ids_to_idx: paragraph_ptr = len(story_ids_to_idx) story_ids_to_idx[story_id] = paragraph_ptr idx_to_story_ids[paragraph_ptr] = story_id xp, cxp = [], [] pp = [] x.append(xp) cx.append(cxp) p.append(pp) xi = list(map(word_tokenize, sent_tokenize(context))) xi = [process_tokens(tokens) for tokens in xi] # process tokens # given xi, add chars cxi = [[list(xijk) for xijk in xij] for xij in xi] xp.append(xi) cxp.append(cxi) pp.append(context) for xij in xi: for xijk in xij: word_counter[xijk] += 1 lower_word_counter[xijk.lower()] += 1 for xijkl in xijk: char_counter[xijkl] += 1 else: paragraph_ptr = story_ids_to_idx[story_id] rxi = [paragraph_ptr, pi] """ print("TEST") print("TEST") print(story_ids_to_idx) print(len(xp)) print(paragraph_ptr) """ xi = x[paragraph_ptr][pi] qi = word_tokenize(question) cqi = [list(qij) for qij in qi] yi = [] cyi = [] answers = [] answer_char_ranges_split = answer_char_ranges.split("|") for answer in answer_char_ranges_split: if answer == 'None': continue answer_char_range = answer.split(",")[0].split(":") answer_start = int(answer_char_range[0]) answer_stop = int(answer_char_range[-1]) answer_text = context[answer_start:answer_stop].strip() if answer_text == "": print("BAD ANSWER GIVEN %s" % answer_char_range) continue answers.append(answer_text) # TODO : put some function that gives word_start, word_stop here yi0, yi1 = get_word_span(context, xi, answer_start, answer_stop) # yi0 = answer['answer_word_start'] or [0, 0] # yi1 = answer['answer_word_stop'] or [0, 1] assert len(xi[yi0[0]]) > yi0[1] assert len(xi[yi1[0]]) >= yi1[1] w0 = xi[yi0[0]][yi0[1]] w1 = xi[yi1[0]][yi1[1] - 1] i0 = get_word_idx(context, xi, yi0) i1 = get_word_idx(context, xi, (yi1[0], yi1[1] - 1)) cyi0 = answer_start - i0 cyi1 = answer_stop - i1 - 1 #print(question, answer_text, w0[cyi0:], w1[:cyi1+1]) #assert answer_text[0] == w0[cyi0], (answer_text, w0, cyi0) #assert answer_text[-1] == w1[-1] assert cyi0 < 32, (answer_text, w0) assert cyi1 < 32, (answer_text, w1) yi.append([yi0, yi1]) cyi.append([cyi0, cyi1]) for qij in qi: word_counter[qij] += 1 lower_word_counter[qij.lower()] += 1 for qijk in qij: char_counter[qijk] += 1 q.append(qi) cq.append(cqi) y.append(yi) cy.append(cyi) rx.append(rxi) rcx.append(rxi) ids.append(question_id) idxs.append(len(idxs)) answerss.append(baseline_answers) span_answerss.append(answers) if args.debug: break word2vec_dict = get_word2vec(args, word_counter) lower_word2vec_dict = get_word2vec(args, lower_word_counter) # add context here data = { 'q': q, 'cq': cq, 'y': y, '*x': rx, '*cx': rcx, 'cy': cy, 'idxs': idxs, 'ids': ids, 'answerss': answerss, 'span_answerss': span_answerss, '*p': rx } shared = { 'x': x, 'cx': cx, 'p': p, 'story_ids_to_idx': story_ids_to_idx, 'idx_to_story_ids': idx_to_story_ids, 'word_counter': word_counter, 'char_counter': char_counter, 'lower_word_counter': lower_word_counter, 'word2vec': word2vec_dict, 'lower_word2vec': lower_word2vec_dict } print("saving ...") save(args, data, shared, out_name)
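# The NewsQA-style answer_char_ranges field parsed above packs one entry per
# annotator, '|'-separated; each entry is either 'None' or ','-separated
# 'start:end' character ranges, of which only the first is used (validated
# answers are preferred when present). A standalone version of that parsing;
# the sample call is illustrative only.
def sketch_parse_char_ranges(answer_char_ranges, story_text):
    answers = []
    for annotator in answer_char_ranges.split("|"):
        if annotator == 'None':
            continue
        start, stop = map(int, annotator.split(",")[0].split(":"))
        answers.append(story_text[start:stop].strip())
    return answers

# e.g. sketch_parse_char_ranges("0:5|None|0:5,8:12", "Hello, world") -> ['Hello', 'Hello']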
def prepro_each(args, data_type, start_ratio=0.0, stop_ratio=1.0, out_name="default", in_path=None): if args.tokenizer == "PTB": import nltk sent_tokenize = nltk.sent_tokenize def word_tokenize(tokens): return [ token.replace("''", '"').replace("``", '"') for token in nltk.word_tokenize(tokens) ] elif args.tokenizer == 'Stanford': from my.corenlp_interface import CoreNLPInterface interface = CoreNLPInterface(args.url, args.port) sent_tokenize = interface.split_doc word_tokenize = interface.split_sent else: raise Exception() sent_tokenize0 = lambda para: [para] # source_path = in_path or os.path.join(args.source_dir, "{}.seq.json".format(data_type)) # source_data = json.load(open(source_path, 'r')) total = 0 debug_out = [] debug_q = Counter() false_num = 0 fnum = 0 q, cq = [], [] y = [] sents, csents = [], [] rsents, rcsents = [], [] ids = [] answerss = [] q_counter, q_counter0 = {}, {} word_counter, char_counter, lower_word_counter = Counter(), Counter( ), Counter() source_path = os.path.join(args.source_dir, "{}.seq.json".format(data_type)) source_data = json.load(open(source_path, 'r')) filter_dict = json.load(open(args.filter_file, 'r')) start_ai = int(round(len(source_data['data']) * start_ratio)) stop_ai = int(round(len(source_data['data']) * stop_ratio)) for ai, article in enumerate(tqdm(source_data['data'][start_ai:stop_ai])): xp, cxp = [], [] sents.append(xp) csents.append(cxp) for pi, para in enumerate(article['paragraphs']): # wordss context = para['context'] context = context.replace("''", '" ') context = context.replace("``", '" ') xi = list(map(word_tokenize, sent_tokenize(context))) xi = [process_tokens(tokens) for tokens in xi] # process tokens xi = [[xijk for xijk in xij if xijk != ''] for xij in xi] # context in sent-level contexti = sent_tokenize(context) context_sent_len = [] len_cur = 0 for cidx, c in enumerate(contexti): len_cur += len(c) + 1 context_sent_len.append(len_cur) assert len_cur - 1 == len(context), (len_cur, len(context)) # sentences in word-level sentsi = xi # sentences in char-level csentsi = [[list(xijk) for xijk in xij] for xij in xi] xp.append(sentsi) cxp.append(csentsi) for qa in para['qas']: # get question words qaid = qa["id"] q_counter[qa['question']] = q_counter.get(qa['question'], 0) + 1 total += 1 if (qaid in filter_dict): valid_sentid = sorted(filter_dict[qaid]) inv_sentid = {k: v for v, k in enumerate(valid_sentid)} rsentsi = [ai, pi, valid_sentid] qi = word_tokenize(qa['question']) cqi = [list(qij) for qij in qi] # xi = list(map(word_tokenize, sent_tokenize(context))) newxi = [xi[sentid] for sentid in valid_sentid] word_num = list(map(len, newxi)) newxi = [[x for s in newxi for x in s]] cnewxi = [[list(xijk) for xijk in xij] for xij in newxi] yi = [] answers = [] for answer in qa['answers']: yii = [] answer_text = answer['text'] ansi = word_tokenize(answer_text) answer_location = answer['answer_location'] not_complete = False for ans_idx, answer_start in enumerate( answer_location): answer_stop = answer_start + len(ansi[ans_idx]) answer_loc_senti = get_sent_loc_idx( context_sent_len, answer_start, answer_stop) if (answer_loc_senti not in valid_sentid): not_complete = True break start = sum( word_num[:inv_sentid[answer_loc_senti]]) end = sum(word_num[:inv_sentid[answer_loc_senti] + 1]) try: pos = newxi[0].index(ansi[ans_idx], start, end) except: not_complete = True false_num += 1 print(xi[answer_loc_senti], newxi[0][start - 5:end + 5], word_num, start, end, newxi[start:end], ansi) break yii.append(pos) if (not_complete): continue 
yi.append(yii) answers.append(answer_text) if (len(yi) == 0): fnum += 1 q_counter0[qa['question']] = q_counter0.get( qa['question'], 0) + 1 continue for xij in newxi: for xijk in xij: word_counter[xijk] += 1 lower_word_counter[xijk.lower()] += 1 for xijkl in xijk: char_counter[xijkl] += 1 for qij in qi: word_counter[qij] += 1 lower_word_counter[qij.lower()] += 1 for qijk in qij: char_counter[qijk] += 1 q.append(qi) cq.append(cqi) y.append(yi) ids.append(qa['id']) rsents.append(rsentsi) rcsents.append(rsentsi) answerss.append(answers) if (qaid not in filter_dict): continue word2vec_dict = get_word2vec(args, word_counter) lower_word2vec_dict = get_word2vec(args, lower_word_counter) # add context here qx, qy = [], [] print("{0}/{1}".format(len(q), total)) for k in q_counter0.keys(): if (float(q_counter0[k]) / q_counter[k] > 0.05): qx.append(q_counter0[k]) qy.append(q_counter[k]) print(k, "{}/{}".format(q_counter0[k], q_counter[k])) xaxis = list(range(len(qx))) plt.bar(xaxis, qx, width=0.5) plt.bar(xaxis, qy, width=0.2) plt.show() data = { 'q': q, 'cq': cq, '*x': rsents, '*cx': rcsents, 'y': y, "id": ids, "answer": answerss } shared = { 'x': sents, 'cx': csents, 'word_counter': word_counter, 'char_counter': char_counter, 'lower_word_counter': lower_word_counter, 'word2vec': word2vec_dict, 'lower_word2vec': lower_word2vec_dict } # print(debug_q) # print("saving ...") # with open("debug_out.json", "w") as fh: # json.dump(debug_out, fh) save(args, data, shared, out_name)
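# The filter_file loaded above is not included here; judging from the way
# filter_dict[qaid] is sorted into valid_sentid, it appears to map each
# question id to the list of sentence indices kept for that question. A
# hypothetical entry, for illustration only:
example_filter_entry = {"0000000000000000000000a1": [0, 2, 5]}  # qaid -> kept sentence indices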