def modify_answer_spans(qas, summary):
    # converts a span from (sent_num, word_num) -> (0, total_word_num in para)
    import nltk
    summary_tokenized = list(map(nltk.word_tokenize, nltk.sent_tokenize(summary)))
    summary_tokenized = [process_tokens(tokens) for tokens in summary_tokenized]
    for index, qa in qas.iterrows():
        answer1_span_start_idx = qa['start_index'].split(', ')
        answer1_span_end_idx = qa['end_index'].split(', ')
        answer1_span_start_idx = list(map(int, answer1_span_start_idx))
        answer1_span_end_idx = list(map(int, answer1_span_end_idx))
        index_mod = sum(map(len, summary_tokenized[0:answer1_span_start_idx[0]]))
        qas.at[index, 'start_index'] = [0, index_mod + answer1_span_start_idx[1]]
        index_mod = sum(map(len, summary_tokenized[0:answer1_span_end_idx[0]]))
        qas.at[index, 'end_index'] = [0, index_mod + answer1_span_end_idx[1]]
    return qas
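# A minimal, hypothetical check of the flattening arithmetic used in
# modify_answer_spans above; the toy summary and indices are made up, not project data.
# A span at (sent_num=1, word_num=2) in a summary whose first sentence has
# 5 tokens maps to the paragraph-level index 5 + 2 = 7.
summary_tokenized = [["the", "dog", "barked", "all", "night"],
                     ["nobody", "could", "sleep", "."]]
sent_num, word_num = 1, 2
flat = sum(map(len, summary_tokenized[:sent_num])) + word_num
assert flat == 7  # points at "sleep" in the flattened token list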
def evaluate_bleu_scores(data_type):
    bleu_scores = []
    bleu_4_scores = []
    for index_summ, row in tqdm(source_summaries.iterrows(), total=1572):
        if data_type == row['set']:
            references = []
            references1 = []
            spans = []
            #summary = row['processed_summary'].replace(".",". ")
            #summ = list(map(nltk.word_tokenize, nltk.sent_tokenize(row['processed_summary_wo'])))
            #summ = [process_tokens(tokens) for tokens in summ]
            summary_tokenized = nltk.word_tokenize(row['summary_tokenized'])
            summary_tokenized = list(map(str.lower, process_tokens(summary_tokenized)))
            qas = source_qas[source_qas['document_id'].isin([row['document_id']])]
            qas = qas.reset_index(drop=True)
            qas = modify_answer_spans(qas, row['summary_tokenized'])
            for qid, ques_row in qas.iterrows():
                sent = list(map(str.lower,
                                nltk.word_tokenize(ques_row['answer1_tokenized'].replace(".", ""))))
                #print("Question", qid, ques_row['processed_question_wo'])
                #print("Answer:", sent)
                #print("indices", ques_row['start_index'], ques_row['end_index'])
                predicted_rouge_span = summary_tokenized[
                    ques_row['start_index'][1]:ques_row['end_index'][1] + 1]
                #print("Rouge Span:", predicted_rouge_span)
                references.append([sent])
                #references1.append([predicted_rouge_span])
                spans.append(predicted_rouge_span)
            bleu_scores.append(corpus_bleu(references, spans, weights=(1, 0, 0, 0)))
            bleu_4_scores.append(corpus_bleu(references, spans))
    print("Average score bleu_1 for", data_type, sum(bleu_scores) / len(bleu_scores))
    print("Average score bleu_4 for", data_type, sum(bleu_4_scores) / len(bleu_4_scores))
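# For reference, the shapes NLTK's corpus_bleu expects, as it is called above;
# the token lists here are illustrative values only, not project data.
from nltk.translate.bleu_score import corpus_bleu

# One list of reference token lists per hypothesis, one hypothesis token list per item.
references = [[["the", "cat", "sat"]], [["dogs", "bark"]]]
hypotheses = [["the", "cat", "sat"], ["dogs", "bark", "loudly"]]
bleu_1 = corpus_bleu(references, hypotheses, weights=(1, 0, 0, 0))  # unigram precision only
bleu_4 = corpus_bleu(references, hypotheses)                        # default 4-gram weights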
def _tokenize(c):
    c = c.replace("''", '" ')
    c = c.replace("``", '" ')
    cl = list(map(word_tokenize, sent_tokenize(c)))
    cl = [process_tokens(tokens) for tokens in cl]  # process tokens
    return cl[0]
def prepro_each(args, data_type, start_ratio=0.0, stop_ratio=1.0, out_name="default", in_path=None): totalnum0 = 0 falsenum0 = 0 falsenum1 = 0 truenum0 = 0 truenum1 = 0 outlist = [] if args.tokenizer == "PTB": import nltk sent_tokenize = nltk.sent_tokenize def word_tokenize(tokens): return [ token.replace("''", '"').replace("``", '"') for token in nltk.word_tokenize(tokens) ] elif args.tokenizer == 'Stanford': from my.corenlp_interface import CoreNLPInterface interface = CoreNLPInterface(args.url, args.port) sent_tokenize = interface.split_doc word_tokenize = interface.split_sent else: raise Exception() if not args.split: sent_tokenize = lambda para: [para] source_path = in_path or os.path.join(args.source_dir, "{}.seq.json".format(data_type)) source_data = json.load(open(source_path, 'r')) q, cq, y, rx, rcx, ids, idxs = [], [], [], [], [], [], [] x, cx = [], [] answerss = [] p = [] word_counter, char_counter, lower_word_counter = Counter(), Counter( ), Counter() start_ai = int(round(len(source_data['data']) * start_ratio)) stop_ai = int(round(len(source_data['data']) * stop_ratio)) for ai, article in enumerate(tqdm(source_data['data'][start_ai:stop_ai])): xp, cxp = [], [] pp = [] x.append(xp) cx.append(cxp) p.append(pp) for pi, para in enumerate(article['paragraphs']): # wordss ctx = para['context'] if (len(ctx.split()) <= 800): cut = -1 else: cut = sum(map(len, ctx.split()[:800])) + 800 context = para['context'] context = context.replace("''", '" ') context = context.replace("``", '" ') xi = list(map(word_tokenize, sent_tokenize(context))) xi = [process_tokens(tokens) for tokens in xi] # process tokens xi = [[xijk for xijk in xij if xijk != ''] for xij in xi] # given xi, add chars cxi = [[list(xijk) for xijk in xij] for xij in xi] xp.append(xi) cxp.append(cxi) pp.append(context) for xij in xi: for xijk in xij: word_counter[xijk] += len(para['qas']) lower_word_counter[xijk.lower()] += len(para['qas']) for xijkl in xijk: char_counter[xijkl] += len(para['qas']) rxi = [ai, pi] assert len(x) - 1 == ai assert len(x[ai]) - 1 == pi for qa in para['qas']: totalnum0 += 1 # get words qi = word_tokenize(qa['question']) cqi = [list(qij) for qij in qi] yi = [] answers = [] flag = False Flag = True if (cut > -1): for ans in qa['answers']: if (max(ans['answer_location'])) < cut: Flag = False break else: Flag = False if (Flag): falsenum1 += 1 for answer in qa['answers']: flag1 = True answer_text = answer['text'] answers.append(answer_text) ansi = word_tokenize(answer_text) answer_location = answer['answer_location'] yii = [] for ans_idx, answer_start in enumerate(answer_location): answer_stop = answer_start + len(ansi[ans_idx]) yi0, _ = get_word_span(context, xi, answer_start, answer_stop) if (yi0[1] >= 800): flag1 = False assert len(xi[yi0[0]]) > yi0[1] w0 = xi[yi0[0]][yi0[1]] assert ansi[ans_idx] == w0, (ansi[ans_idx], w0) yii.append(yi0) if (flag1): flag = True yi.append(yii) if (flag): truenum0 += 1 if (flag == Flag): print(ctx, qa, yi, cut) outlist.append([ctx, qa]) # answer_start = answer['answer_start'] # answer_stop = answer_start + len(answer_text) # yi0, yi1 = get_word_span(context, xi, answer_start, answer_stop) # # yi0 = answer['answer_word_start'] or [0, 0] # # yi1 = answer['answer_word_stop'] or [0, 1] # assert len(xi[yi0[0]]) > yi0[1] # assert len(xi[yi1[0]]) >= yi1[1] # w0 = xi[yi0[0]][yi0[1]] # w1 = xi[yi1[0]][yi1[1]-1] # i0 = get_word_idx(context, xi, yi0) # i1 = get_word_idx(context, xi, (yi1[0], yi1[1]-1)) # cyi0 = answer_start - i0 # cyi1 = answer_stop - i1 - 1 # # print(answer_text, 
w0[cyi0:], w1[:cyi1+1]) # assert answer_text[0] == w0[cyi0], (answer_text, w0, cyi0) # assert answer_text[-1] == w1[cyi1] # assert cyi0 < 32, (answer_text, w0) # assert cyi1 < 32, (answer_text, w1) # yi.append([yi0, yi1]) for qij in qi: word_counter[qij] += 1 lower_word_counter[qij.lower()] += 1 for qijk in qij: char_counter[qijk] += 1 q.append(qi) cq.append(cqi) y.append(yi) rx.append(rxi) rcx.append(rxi) ids.append(qa['id']) idxs.append(len(idxs)) answerss.append(answers) if args.debug: break print(truenum0, totalnum0, float(truenum0) / totalnum0) print(falsenum1, totalnum0, 1 - float(falsenum1) / totalnum0) with open('debugcnt.json', 'w') as f: json.dump(outlist, f)
def prepro_each(args, data_type, start_ratio=0.0, stop_ratio=1.0, out_name="default", in_path=None): if args.tokenizer == "PTB": import nltk sent_tokenize = nltk.sent_tokenize def word_tokenize(tokens): return [ token.replace("''", '"').replace("``", '"') for token in nltk.word_tokenize(tokens) ] elif args.tokenizer == 'Stanford': from my.corenlp_interface import CoreNLPInterface interface = CoreNLPInterface(args.url, args.port) sent_tokenize = interface.split_doc word_tokenize = interface.split_sent else: raise Exception() if not args.split: sent_tokenize = lambda para: [para] source_path = in_path or os.path.join(args.source_dir, "{}-v1.1.json".format(data_type)) source_data = json.load(open(source_path, 'r')) q, cq, y, rx, rcx, ids, idxs = [], [], [], [], [], [], [] cy = [] x, cx = [], [] answerss = [] p = [] word_counter, char_counter, lower_word_counter = Counter(), Counter( ), Counter() start_ai = int(round(len(source_data['data']) * start_ratio)) stop_ai = int(round(len(source_data['data']) * stop_ratio)) for ai, article in enumerate(tqdm(source_data['data'][start_ai:stop_ai])): xp, cxp = [], [] pp = [] x.append(xp) cx.append(cxp) p.append(pp) for pi, para in enumerate(article['paragraphs']): # wordss context = para['context'] context = context.replace("''", '" ') context = context.replace("``", '" ') xi = list(map(word_tokenize, sent_tokenize(context))) xi = [process_tokens(tokens) for tokens in xi] # process tokens # given xi, add chars cxi = [[list(xijk) for xijk in xij] for xij in xi] xp.append(xi) cxp.append(cxi) pp.append(context) for xij in xi: for xijk in xij: word_counter[xijk] += len(para['qas']) lower_word_counter[xijk.lower()] += len(para['qas']) for xijkl in xijk: char_counter[xijkl] += len(para['qas']) rxi = [ai, pi] assert len(x) - 1 == ai assert len(x[ai]) - 1 == pi for qa in para['qas']: # get words qi = word_tokenize(qa['question']) cqi = [list(qij) for qij in qi] yi = [] cyi = [] answers = [] answer = qa['answer'] yi.append(answer) for qij in qi: word_counter[qij] += 1 lower_word_counter[qij.lower()] += 1 for qijk in qij: char_counter[qijk] += 1 def put(): q.append(qi) cq.append(cqi) y.append(yi) rx.append(rxi) rcx.append(rxi) ids.append(qa['id']) idxs.append(len(idxs)) answerss.append(answers) put() if data_type == 'train' and answer: for i in range(3): put() word2vec_dict = get_word2vec(args, word_counter) lower_word2vec_dict = get_word2vec(args, lower_word_counter) data = { 'q': q, 'cq': cq, 'y': y, '*x': rx, '*cx': rcx, 'idxs': idxs, 'ids': ids, 'answerss': answerss, '*p': rx } shared = { 'x': x, 'cx': cx, 'p': p, 'word_counter': word_counter, 'char_counter': char_counter, 'lower_word_counter': lower_word_counter, 'word2vec': word2vec_dict, 'lower_word2vec': lower_word2vec_dict } print("saving ...") save(args, data, shared, out_name)
def get_2d_span(summary, index):
    sum = 0
    for sent_num, sent in enumerate(summary):
        if index < (sum + len(sent)):
            return (sent_num, index - sum)
        sum += len(sent)


for index_summ, row in tqdm(source_summaries.iterrows(), total=1572):
    #summary = row['processed_summary'].replace(".",". ")
    summ = list(map(nltk.word_tokenize, nltk.sent_tokenize(row['processed_summary_wo'])))
    summ = [process_tokens(tokens) for tokens in summ]
    summary_tokenized = nltk.word_tokenize(row['processed_summary_wo'])
    summary_tokenized = list(map(str.lower, process_tokens(summary_tokenized)))
    all_substrings = get_all_substrings(summary_tokenized)
    #print(compute_bleu(get_all_substrings(['Dhruv','is','a','good','scientist','.']), ['Dhruv']))
    qas = source_qas[source_qas['document_id'].isin([row['document_id']])]
    qas = qas.reset_index(drop=True)
    print("Summaries", summ)
    for qid, ques_row in qas.iterrows():
        sent = list(map(str.lower,
                        nltk.word_tokenize(ques_row['processed_answer_wo'].replace(".", ""))))
        #ans_span = compute_bleu(all_substrings, sent)
        ans_span = compute_rouge(all_substrings, sent)
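# A small, hypothetical sanity check of get_2d_span, which inverts the flattening
# performed by modify_answer_spans; toy data only, not from the dataset.
summ = [["the", "dog", "barked", "all", "night"],
        ["nobody", "could", "sleep", "."]]
assert get_2d_span(summ, 7) == (1, 2)  # "sleep": 3rd token of the 2nd sentence
assert get_2d_span(summ, 4) == (0, 4)  # "night": last token of the 1st sentence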
def prepro_each(args, data_type, start_ratio=0.0, stop_ratio=1.0, out_name="default", in_path=None): if args.tokenizer == "PTB": import nltk sent_tokenize = nltk.sent_tokenize def word_tokenize(tokens): return [token.replace("''", '"').replace("``", '"') for token in nltk.word_tokenize(tokens)] elif args.tokenizer == 'Stanford': from my.corenlp_interface import CoreNLPInterface interface = CoreNLPInterface(args.url, args.port) sent_tokenize = interface.split_doc word_tokenize = interface.split_sent else: raise Exception() if not args.split: sent_tokenize = lambda para: [para] source_path = in_path or os.path.join(args.source_dir, "{}-v1.1.json".format(data_type)) source_data = json.load(open(source_path, 'r')) q, cq, y, rx, rcx, ids, idxs = [], [], [], [], [], [], [] contextss = [] context_questions = [] titles = [] cy = [] x, cx = [], [] answerss = [] p = [] word_counter, char_counter, lower_word_counter = Counter(), Counter(), Counter() start_ai = int(round(len(source_data['data']) * start_ratio)) stop_ai = int(round(len(source_data['data']) * stop_ratio)) for ai, article in enumerate(tqdm(source_data['data'][start_ai:stop_ai])): pp = [] p.append(pp) xp, cxp, contexts, c_questions = [], [], [], [] x.append(xp) cx.append(cxp) contextss.append(contexts) context_questions.append(c_questions) title = "[" + str(ai).zfill(2) + "] " + article['title'].replace('_', ' ') titles.append(title) for pi, para in enumerate(article['paragraphs']): # wordss context = para['context'] context = context.replace("''", '" ') context = context.replace("``", '" ') #Sentences of priginal Paragraph contexts.append(context) xi = list(map(word_tokenize, sent_tokenize(context))) xi = [process_tokens(tokens) for tokens in xi] # process tokens # given xi, add chars cxi = [[list(xijk) for xijk in xij] for xij in xi] xp.append(xi) cxp.append(cxi) pp.append(context) for xij in xi: for xijk in xij: word_counter[xijk] += len(para['qas']) lower_word_counter[xijk.lower()] += len(para['qas']) for xijkl in xijk: char_counter[xijkl] += len(para['qas']) rxi = [ai, pi] assert len(x) - 1 == ai assert len(x[ai]) - 1 == pi if ai==0: c_questions.append(para['qas'][3]['question']) else: c_questions.append(para['qas'][0]['question']) """ for qa in para['qas']: # get words c_questions.append(qa['question']) break qi = word_tokenize(qa['question']) # qa['question'] : original question cqi = [list(qij) for qij in qi] yi = [] cyi = [] answers = [] for answer in qa['answers']: answer_text = answer['text'] answers.append(answer_text) answer_start = answer['answer_start'] answer_stop = answer_start + len(answer_text) # TODO : put some function that gives word_start, word_stop here yi0, yi1 = get_word_span(context, xi, answer_start, answer_stop) # yi0 = answer['answer_word_start'] or [0, 0] # yi1 = answer['answer_word_stop'] or [0, 1] assert len(xi[yi0[0]]) > yi0[1] assert len(xi[yi1[0]]) >= yi1[1] w0 = xi[yi0[0]][yi0[1]] w1 = xi[yi1[0]][yi1[1]-1] i0 = get_word_idx(context, xi, yi0) i1 = get_word_idx(context, xi, (yi1[0], yi1[1]-1)) cyi0 = answer_start - i0 cyi1 = answer_stop - i1 - 1 # print(answer_text, w0[cyi0:], w1[:cyi1+1]) assert answer_text[0] == w0[cyi0], (answer_text, w0, cyi0) assert answer_text[-1] == w1[cyi1] assert cyi0 < 32, (answer_text, w0) assert cyi1 < 32, (answer_text, w1) yi.append([yi0, yi1]) cyi.append([cyi0, cyi1]) for qij in qi: word_counter[qij] += 1 lower_word_counter[qij.lower()] += 1 for qijk in qij: char_counter[qijk] += 1 q.append(qi) cq.append(cqi) y.append(yi) cy.append(cyi) rx.append(rxi) rcx.append(rxi) 
ids.append(qa['id']) idxs.append(len(idxs)) answerss.append(answers) """ if args.debug: break word2vec_dict = get_word2vec(args, word_counter) lower_word2vec_dict = get_word2vec(args, lower_word_counter) # add context here data = {'q': q, 'cq': cq, 'y': y, '*x': rx, '*cx': rcx, 'cy': cy, 'idxs': idxs, 'ids': ids, 'answerss': answerss, '*p' : rx} shared = {'x': x, 'cx': cx, 'p' : p, 'contextss' : contextss, 'context_questions' : context_questions, 'titles' : titles, 'word_counter': word_counter, 'char_counter': char_counter, 'lower_word_counter': lower_word_counter, 'word2vec': word2vec_dict, 'lower_word2vec': lower_word2vec_dict} print("saving ...") save(args, data, shared, out_name)
def prepro_each(args, data_type, start_ratio=0.0, stop_ratio=1.0, out_name="default", in_path=None): parser = StanfordParser(model_path=os.getenv("StanfordParser_model_path")) if args.tokenizer == "PTB": import nltk sent_tokenize = nltk.sent_tokenize def word_tokenize(tokens): return [ token.replace("''", '"').replace("``", '"') for token in nltk.word_tokenize(tokens) ] elif args.tokenizer == 'Stanford': from my.corenlp_interface import CoreNLPInterface interface = CoreNLPInterface(args.url, args.port) sent_tokenize = interface.split_doc word_tokenize = interface.split_sent else: raise Exception() # if not args.split: # sent_tokenize = lambda para: [para] source_path = in_path or os.path.join( args.source_dir, "{}-{}v1.1.json".format(data_type, args.suffix)) source_data = json.load(open(source_path, 'r')) q, cq, y, rx, rcx, ids, idxs = [], [], [], [], [], [], [] q_syn_seq = [] na = [] cy = [] x, cx = [], [] syn_seq = [] rsyn_seq = [] answerss = [] p = [] word_counter, char_counter, lower_word_counter = Counter(), Counter( ), Counter() syn_counter = Counter() start_ai = int(round(len(source_data['data']) * start_ratio)) stop_ai = int(round(len(source_data['data']) * stop_ratio)) abandon = 0 for ai, article in enumerate(tqdm(source_data['data'][221:])): xp, cxp = [], [] syn_seqp = [] pp = [] x.append(xp) cx.append(cxp) syn_seq.append(syn_seqp) p.append(pp) p_i = -1 for pi, para in enumerate(article['paragraphs']): # wordss context = para['context'] context = context.replace("''", '" ').replace("``", '" ').replace( 'e.g.', 'e-g,') xi = list(map(word_tokenize, sent_tokenize(context))) xi = [process_tokens(tokens) for tokens in xi] # process tokens try: syn_seqpp = sents_parser(xi, parser) p_i += 1 except: abandon += 1 continue for sent in syn_seqpp: for word in sent: for syn in word: syn_counter[syn] += 1 syn_seqp.append(syn_seqpp) # given xi, add chars cxi = [[list(xijk) for xijk in xij] for xij in xi] xp.append(xi) cxp.append(cxi) pp.append(context) for xij in xi: for xijk in xij: word_counter[xijk] += len(para['qas']) lower_word_counter[xijk.lower()] += len(para['qas']) for xijkl in xijk: char_counter[xijkl] += len(para['qas']) rxi = [ai + 221, p_i] assert len(x) - 1 == ai assert len(x[ai]) - 1 == p_i for qa in para['qas']: # get words qi = word_tokenize(qa['question']) qi = process_tokens(qi) try: q_syn_seqq = sent_parser(qi, parser) except: continue cqi = [list(qij) for qij in qi] yi = [] cyi = [] answers = [] for answer in qa['answers']: answer_text = answer['text'] answers.append(answer_text) answer_start = answer['answer_start'] answer_stop = answer_start + len(answer_text) # TODO : put some function that gives word_start, word_stop here yi0, yi1 = get_word_span(context, xi, answer_start, answer_stop) # yi0 = answer['answer_word_start'] or [0, 0] # yi1 = answer['answer_word_stop'] or [0, 1] assert len(xi[yi0[0]]) > yi0[1] assert len(xi[yi1[0]]) >= yi1[1] w0 = xi[yi0[0]][yi0[1]] w1 = xi[yi1[0]][yi1[1] - 1] i0 = get_word_idx(context, xi, yi0) i1 = get_word_idx(context, xi, (yi1[0], yi1[1] - 1)) cyi0 = answer_start - i0 cyi1 = answer_stop - i1 - 1 # print(answer_text, w0[cyi0:], w1[:cyi1+1]) assert answer_text[0] == w0[cyi0], (answer_text, w0, cyi0) assert answer_text[-1] == w1[cyi1] assert cyi0 < 32, (answer_text, w0) assert cyi1 < 32, (answer_text, w1) yi.append([yi0, yi1]) cyi.append([cyi0, cyi1]) if len(qa['answers']) == 0: yi.append([(0, 0), (0, 1)]) cyi.append([0, 1]) na.append(True) else: na.append(False) for qij in qi: word_counter[qij] += 1 lower_word_counter[qij.lower()] += 1 
for qijk in qij: char_counter[qijk] += 1 q.append(qi) cq.append(cqi) q_syn_seq.append(q_syn_seqq) y.append(yi) cy.append(cyi) rx.append(rxi) rcx.append(rxi) rsyn_seq.append(rxi) ids.append(qa['id']) idxs.append(len(idxs)) answerss.append(answers) print('abandon {} paragraph'.format(abandon)) if args.debug: break for sent in q_syn_seq: for word in sent: for syn in word: syn_counter[syn] += 1 word2vec_dict = get_word2vec(args, word_counter) lower_word2vec_dict = get_word2vec(args, lower_word_counter) # add context here data = { 'q': q, 'cq': cq, 'y': y, '*x': rx, '*cx': rcx, '*syn_seq': rsyn_seq, 'cy': cy, 'q_syn_seq': q_syn_seq, 'idxs': idxs, 'ids': ids, 'answerss': answerss, '*p': rx, 'na': na } shared = { 'x': x, 'cx': cx, 'p': p, 'syn_seq': syn_seq, 'syn_counter': syn_counter, 'word_counter': word_counter, 'char_counter': char_counter, 'lower_word_counter': lower_word_counter, 'word2vec': word2vec_dict, 'lower_word2vec': lower_word2vec_dict } print("saving ...") save(args, data, shared, out_name)
def prepro_each(args, data_type, start_ratio=0.0, stop_ratio=1.0, out_name="default", in_path=None):
    if args.tokenizer == "PTB":
        import nltk.tokenize as nltk
        sent_tokenize = nltk.sent_tokenize

        def word_tokenize(tokens):
            return [token.replace("''", '"').replace("``", '"')
                    for token in nltk.word_tokenize(tokens)]
    elif args.tokenizer == 'Stanford':
        from corenlp import CoreNLPClient
        interface = CoreNLPClient(annotators="tokenize ssplit".split())
    else:
        raise Exception()
    """
    if not args.split:
        sent_tokenize = lambda para: [para]
    """
    source_path = in_path or os.path.join(
        args.source_dir, "{}-{}v1.1.json".format(data_type, args.suffix))
    source_data = json.load(open(source_path, 'r'))

    q, cq, y, rx, rcx, ids, idxs = [], [], [], [], [], [], []
    na = []
    cy = []
    x, cx = [], []
    answerss = []
    p = []
    word_counter, char_counter, lower_word_counter = Counter(), Counter(), Counter()
    start_ai = int(round(len(source_data['data']) * start_ratio))
    stop_ai = int(round(len(source_data['data']) * stop_ratio))
    for ai, article in enumerate(tqdm(source_data['data'][start_ai:stop_ai])):
        xp, cxp = [], []
        pp = []
        x.append(xp)
        cx.append(cxp)
        p.append(pp)
        for pi, para in enumerate(article['paragraphs']):
            # wordss
            context = para['context']
            context = context.replace("''", '" ')
            context = context.replace("``", '" ')
            while True:
                try:
                    temp = interface.annotate(context)
                    break
                except Exception as e:
                    time.sleep(0.2)
            context_s = []
            for sent in temp.sentence:
                sent = [word.originalText for word in sent.token]
                # Manual workaround for "\xa0", a character CoreNLP cannot handle...
                for wi in range(len(sent)):
                    if "\xa0" in sent[wi]:
                        sent = sent[:wi] + sent[wi].split("\xa0") + sent[wi + 1:]
                        wi = 0
                context_s.append(sent)
            xi = context_s
            xi = [process_tokens(tokens) for tokens in xi]  # process tokens
            # given xi, add chars
            cxi = [[list(xijk) for xijk in xij] for xij in xi]
            xp.append(xi)
            cxp.append(cxi)
            pp.append(context)

            for xij in xi:
                for xijk in xij:
                    word_counter[xijk] += len(para['qas'])
                    lower_word_counter[xijk.lower()] += len(para['qas'])
                    for xijkl in xijk:
                        char_counter[xijkl] += len(para['qas'])

            rxi = [ai, pi]
            assert len(x) - 1 == ai
            assert len(x[ai]) - 1 == pi
            for qa in para['qas']:
                # get words
                while True:
                    try:
                        temp = interface.annotate(qa['question']).sentence[0]
                        break
                    except Exception as e:
                        time.sleep(0.2)
                #print(temp.token[0])
                #exit(-1)
                qi = [t_s.originalText for t_s in temp.token]
                qi = process_tokens(qi)
                cqi = [list(qij) for qij in qi]
                yi = []
                cyi = []
                answers = []
                for answer in qa['answers']:
                    answer_text = answer['text']
                    answers.append(answer_text)
                    answer_start = answer['answer_start']
                    answer_stop = answer_start + len(answer_text)
                    # TODO : put some function that gives word_start, word_stop here
                    yi0, yi1 = get_word_span(context, xi, answer_start, answer_stop)
                    # yi0 = answer['answer_word_start'] or [0, 0]
                    # yi1 = answer['answer_word_stop'] or [0, 1]
                    assert len(xi[yi0[0]]) > yi0[1]
                    assert len(xi[yi1[0]]) >= yi1[1]
                    w0 = xi[yi0[0]][yi0[1]]
                    w1 = xi[yi1[0]][yi1[1] - 1]
                    i0 = get_word_idx(context, xi, yi0)
                    i1 = get_word_idx(context, xi, (yi1[0], yi1[1] - 1))
                    cyi0 = answer_start - i0
                    cyi1 = answer_stop - i1 - 1
                    # print(answer_text, w0[cyi0:], w1[:cyi1+1])
                    assert answer_text[0] == w0[cyi0], (answer_text, w0, cyi0)
                    assert answer_text[-1] == w1[cyi1]
                    assert cyi0 < 32, (answer_text, w0)
                    assert cyi1 < 32, (answer_text, w1)
                    yi.append([yi0, yi1])
                    cyi.append([cyi0, cyi1])

                if len(qa['answers']) == 0:
                    yi.append([(0, 0), (0, 1)])
                    cyi.append([0, 1])
                    na.append(True)
                else:
                    na.append(False)

                for qij in qi:
                    word_counter[qij] += 1
                    lower_word_counter[qij.lower()] += 1
                    for qijk in qij:
                        char_counter[qijk] += 1

                q.append(qi)
                cq.append(cqi)
                y.append(yi)
                cy.append(cyi)
                rx.append(rxi)
                rcx.append(rxi)
                ids.append(qa['id'])
                idxs.append(len(idxs))
                answerss.append(answers)

        if args.debug:
            break

    word2vec_dict = get_word2vec(args, word_counter)
    lower_word2vec_dict = get_word2vec(args, lower_word_counter)

    # add context here
    data = {
        'q': q,
        'cq': cq,
        'y': y,
        '*x': rx,
        '*cx': rcx,
        'cy': cy,
        'idxs': idxs,
        'ids': ids,
        'answerss': answerss,
        '*p': rx,
        'na': na
    }
    shared = {
        'x': x,
        'cx': cx,
        'p': p,
        'word_counter': word_counter,
        'char_counter': char_counter,
        'lower_word_counter': lower_word_counter,
        'word2vec': word2vec_dict,
        'lower_word2vec': lower_word2vec_dict
    }

    print("saving ...")
    save(args, data, shared, out_name)
def prepro_each(args, data_type, start_ratio=0.0, stop_ratio=1.0, out_name="default", in_path=None): """ :param args: arguments :param data_type: train, dev or all :param start_ratio: default is 0.0 :param stop_ratio: default is 1.0 :param out_name: train, dev or test :param in_path: default is None, not sure about what is this :return: """ # 1. tokenize and sent tokenize if args.tokenizer == "PTB": import nltk sent_tokenize = nltk.sent_tokenize def word_tokenize(tokens): """ firstly word_tokenize the tokens and replace some chars, and return a list :param tokens: :return: """ return [token.replace("''", '"').replace("``", '"') for token in nltk.word_tokenize(tokens)] elif args.tokenizer == 'Stanford': from my.corenlp_interface import CoreNLPInterface interface = CoreNLPInterface(args.url, args.port) sent_tokenize = interface.split_doc word_tokenize = interface.split_sent else: raise Exception() if not args.split: sent_tokenize = lambda para: [para] # input is para, turn it to a list # 2. load data from disk source_path = in_path or os.path.join(args.source_dir, "{}-v{}.json".format(data_type, args.version)) source_data = json.load(open(file=source_path, mode='r')) # 3. initiate some counter and some lists q, cq, rx, rcx = [], [], [], [] # question, char_question, context, char_context y, cy, ids, idxs = [], [], [], [] x, cx = [], [] answerss, p = [], [] word_counter, char_counter, lower_word_counter = Counter(), Counter(), Counter() start_at_index = int(round(len(source_data['data']) * start_ratio)) stop_at_index = int(round(len(source_data['data']) * stop_ratio)) # 4. iterate the dataset max_ques_size = 0 max_context_size = 0 max_word_size = 0 for article_index, article in enumerate(tqdm(source_data['data'][start_at_index:stop_at_index])): xp, cxp, pp = [], [], [] x.append(xp) cx.append(cxp) p.append(pp) for paragraph_index, paragraph in enumerate(article['paragraphs']): context = paragraph['context'] context = context.replace("''", '" ') # notice this space, so the length of the context will not change when replace context = context.replace("``", '" ') # context is a str here list_of_wordlist = list(map(word_tokenize, sent_tokenize(context))) # after sent_tokenizer, it will be a list of sentence, here just one sentence, # a list of sentence # then the map, will apply the word_tokenize func to each sentence # a list of lists of words # [[words for sentence1], [words for sentence2]] list_of_wordlist = [process_tokens(tokens) for tokens in list_of_wordlist] # list_of_wordlist is a 2d stuff for wordlist in list_of_wordlist: max_context_size = max(max_context_size, len(wordlist)) list_of_charlist = [[list(word) for word in wordlist] for wordlist in list_of_wordlist] # list of charlist is a 3d, sentence-dim, word-dim, char-dim xp.append(list_of_wordlist) # 3d, paragraph, sentence, words cxp.append(list_of_charlist) # 4d, paragraph, sentence, words, chars pp.append(context) # 2d, paragraph, context ## update counters num_qas = len(paragraph['qas']) for wordlist in list_of_wordlist: for word in wordlist: word_counter[word] += num_qas lower_word_counter[word.lower()] += num_qas for char in word: char_counter[char] += num_qas rxi = [article_index, paragraph_index] assert len(x) - 1 == article_index # x stores xp, xp is 3d, paragraph, sentece, and words assert len(x[article_index]) - 1 == paragraph_index for question in paragraph['qas']: question_wordslist = word_tokenize(question['question']) max_ques_size = max(max_ques_size, len(question_wordslist)) # it's a list of words question_charslist = 
[list(word) for word in question_wordslist] # it's a list of charlist yi = [] cyi = [] answers = [] # the content of each answers for answer in question['answers']: answer_text = answer['text'] answers.append(answer_text) answer_start_index = answer['answer_start'] answer_end_index = answer_start_index + len(answer_text) yi0, yi1 = get_word_span(context, list_of_wordlist, # 2-d: sentences, words answer_start_index, answer_end_index) # yi0 (0, 108), 0 is the index of sentence # yi1 (0, 111). 108 and 111 is the start and end of word index assert len(list_of_wordlist[yi0[0]]) > yi0[1] # the length of the first sentence is larger than 108 assert len(list_of_wordlist[yi1[0]]) >= yi1[1] # the length of the first sentence is larger or equla to 111 w0 = list_of_wordlist[yi0[0]][yi0[1]] # the start words of the answer w1 = list_of_wordlist[yi1[0]][yi1[1] - 1] # the last word of the answer i0 = get_word_idx(context, list_of_wordlist, yi0) i1 = get_word_idx(context, list_of_wordlist, (yi1[0], yi1[1] - 1)) # i0 is 515, which is the char index of the answer, # i1 is start index of the final word in terms of chars # 'Saint Bernadette Soubirous', i1 is the index of S in Soubirous cyi0 = answer_start_index - i0 # it should be 0 here since start index is 515, and i0 should also be 515 cyi1 = answer_end_index - i1 - 1 # cyi1 seems to be the length of last word -1, or because some other issues assert answer_text[0] == w0[cyi0], (answer_text, w0, cyi0) assert answer_text[-1] == w1[cyi1] # be sure the first char and last char are same with the first word's first char and last word's last char assert cyi0 < 32, (answer_text, w0) assert cyi1 < 32, (answer_text, w1) # yi.append([yi0, yi1]) # index of words cyi.append([cyi0, cyi1]) # index of shifts from the first char and last char of the answer in context # update counters for word in question_wordslist: word_counter[word] += 1 lower_word_counter[word.lower()] += 1 for char in word: char_counter[char] += 1 q.append(question_wordslist) # 2-d list of wordlist for each question cq.append(question_charslist) # 3-d, question-word-char y.append(yi) # question-startendpair cy.append(cyi) # question-startend char pair rx.append(rxi) # list of article_id-paragraph_id pair rcx.append(rxi) ids.append(question['id']) # ids for each question idxs.append(len(idxs)) # index for each question answerss.append(answers) # list of answer in string word2vec_dict = get_word2vec(args, word_counter) lower_word2vec_dict = get_word2vec(args, lower_word_counter) for word in word_counter: max_word_size = max(max_word_size, len(word)) # add context here data = { 'q': q, # list of word list of each questions, [['who','are', 'you'], ... ] 'cq': cq, # [<class 'list'>: [['T', 'o'], ['w', 'h', 'o', 'm'], ['d', 'i', 'd'], ['t', 'h', 'e'], ['V', 'i', 'r', 'g', 'i', 'n'], ['M', 'a', 'r', 'y'], ['a', 'l', 'l', 'e', 'g', 'e', 'd', 'l', 'y'], ['a', 'p', 'p', 'e', 'a', 'r'], ['i', 'n'], ['1', '8', '5', '8'], ['i', 'n'], ['L', 'o', 'u', 'r', 'd', 'e', 's'], ['F', 'r', 'a', 'n', 'c', 'e'], ['?']] , ...] 
'y': y, # list of <class 'list'>: [[(0, 108), (0, 111)]] '*x': rx, # list of <class 'list'>: [0, 21], 0 means the number of article, 21 means the 21st paragraph '*cx': rcx, # same with rx but for characters, i guess the values are same as well 'cy': cy, # 'idxs': idxs, # just those ids 'ids': ids, # the id of each question, sth like uuid 'answerss': answerss, # the content of the answer '*p': rx # } shared = { 'x': x, # words of each paragraph 'cx': cx, # characters of each 'p': p, # the content of each paragraph 'word_counter': word_counter, 'char_counter': char_counter, 'lower_word_counter': lower_word_counter, 'word2vec': word2vec_dict, 'lower_word2vec': lower_word2vec_dict } print("saving ...")
def prepro_each(args, data_type, start_ratio=0.0, stop_ratio=1.0, out_name="default", in_path=None): if args.tokenizer == "PTB": import nltk sent_tokenize = nltk.sent_tokenize def word_tokenize(tokens): return [token.replace("''", '"').replace("``", '"') for token in nltk.word_tokenize(tokens)] elif args.tokenizer == 'Stanford': from my.corenlp_interface import CoreNLPInterface interface = CoreNLPInterface(args.url, args.port) sent_tokenize = interface.split_doc word_tokenize = interface.split_sent else: raise Exception() if not args.split: sent_tokenize = lambda para: [para] # source_path = in_path or os.path.join(args.source_dir, "{}-v1.1.json".format(data_type)) source_path = in_path or os.path.join(args.source_dir, "{}.json".format(data_type)) source_data = json.load(open(source_path, 'r')) q, cq, y, rx, rcx, ids, idxs = [], [], [], [], [], [], [] cy = [] x, cx = [], [] answerss = [] p = [] word_counter, char_counter, lower_word_counter = Counter(), Counter(), Counter() # start_ai = int(round(len(source_data['data']) * start_ratio)) # stop_ai = int(round(len(source_data['data']) * stop_ratio)) start_ai = int(round(len(source_data) * start_ratio)) stop_ai = int(round(len(source_data) * stop_ratio)) # for ai, article in enumerate(tqdm(source_data['data'][start_ai:stop_ai])): for ai, article in enumerate(tqdm(source_data[start_ai:stop_ai])): xp, cxp = [], [] pp = [] x.append(xp) cx.append(cxp) p.append(pp) # print(article.keys()) # paragraph, title # raise # print(article) # {'question', 'answer', 'context', 'answer_list'} # raise # for pi, para in enumerate(article['paragraphs']): for pi, para in enumerate([article]): # print(para.keys()) # qas, context # raise # wordss context = para['context'] context = context.replace("''", '" ') context = context.replace("``", '" ') xi = list(map(word_tokenize, sent_tokenize(context))) xi = [process_tokens(tokens) for tokens in xi] # process tokens # print(xi) # [['archi', ',', 'the', 'school']] # raise # given xi, add chars cxi = [[list(xijk) for xijk in xij] for xij in xi] xp.append(xi) cxp.append(cxi) pp.append(context) # print(len(para['qas'])) # 5 # print(para['qas']) # [{'answers': [{'text', 'answer_start'}], 'id', 'question'}] # raise for xij in xi: for xijk in xij: # word_counter[xijk] += len(para['qas']) # lower_word_counter[xijk.lower()] += len(para['qas']) word_counter[xijk] += 1 lower_word_counter[xijk.lower()] += 1 for xijkl in xijk: # char_counter[xijkl] += len(para['qas']) char_counter[xijkl] += 1 rxi = [ai, pi] assert len(x) - 1 == ai assert len(x[ai]) - 1 == pi # for qa in para['qas']: for qa in [article]: # get words # qi = word_tokenize(qa['question']) qi = word_tokenize(qa['question']) cqi = [list(qij) for qij in qi] yi = [] cyi = [] answers = [] # for answer in qa['answers']: try: answer = qa['answer'] answer_text = qa['answer_list'][answer-1] except KeyError: answer_text = ' '.join(qa['answer_list']) for _ in [answer_text]: # answer_text = answer['text'] answers.append(answer_text) # answer_start = answer['answer_start'] try: answer_start = context.index(answer_text) answer_stop = answer_start + len(answer_text) except ValueError: answer_start = 0 answer_stop = len(context) # TODO : put some function that gives word_start, word_stop here yi0, yi1 = get_word_span(context, xi, answer_start, answer_stop) # yi0 = answer['answer_word_start'] or [0, 0] # yi1 = answer['answer_word_stop'] or [0, 1] assert len(xi[yi0[0]]) > yi0[1] assert len(xi[yi1[0]]) >= yi1[1] w0 = xi[yi0[0]][yi0[1]] w1 = xi[yi1[0]][yi1[1]-1] # context: str # xi: 
[[word, word, word, ...]] i0 = get_word_idx(context, xi, yi0) i1 = get_word_idx(context, xi, (yi1[0], yi1[1]-1)) cyi0 = answer_start - i0 cyi1 = answer_stop - i1 - 1 # print(answer_text, '|', w0[cyi0:], '|', w1[:cyi1+1]) # raise #assert answer_text[0] == w0[cyi0], (answer_text, w0, cyi0) #assert answer_text[-1] == w1[cyi1] assert cyi0 < 32, (answer_text, w0) assert cyi1 < 32, (answer_text, w1) yi.append([yi0, yi1]) cyi.append([cyi0, cyi1]) for qij in qi: word_counter[qij] += 1 lower_word_counter[qij.lower()] += 1 for qijk in qij: char_counter[qijk] += 1 q.append(qi) cq.append(cqi) y.append(yi) cy.append(cyi) rx.append(rxi) rcx.append(rxi) # ids.append(qa['id']) ids.append(qa['question']) idxs.append(len(idxs)) answerss.append(answers) if args.debug: break word2vec_dict = get_word2vec(args, word_counter) lower_word2vec_dict = get_word2vec(args, lower_word_counter) # add context here data = {'q': q, 'cq': cq, 'y': y, '*x': rx, '*cx': rcx, 'cy': cy, 'idxs': idxs, 'ids': ids, 'answerss': answerss, '*p': rx} shared = {'x': x, 'cx': cx, 'p': p, 'word_counter': word_counter, 'char_counter': char_counter, 'lower_word_counter': lower_word_counter, 'word2vec': word2vec_dict, 'lower_word2vec': lower_word2vec_dict} print("saving ...") save(args, data, shared, out_name)
def prepro_each(args, data_type, start_ratio=0.0, stop_ratio=1.0, out_name="default", in_path=None): """ :param args: configurations :param data_type: train or dev :param start_ratio: :param stop_ratio: :param out_name: train, dev, test :param in_path: :return: """ if args.tokenizer == "PTB": import nltk sent_tokenize = nltk.sent_tokenize def word_tokenize(tokens): return [ token.replace("''", '"').replace("``", '"') for token in nltk.word_tokenize(tokens) ] elif args.tokenizer == 'Stanford': from my.corenlp_interface import CoreNLPInterface interface = CoreNLPInterface(args.url, args.port) sent_tokenize = interface.split_doc word_tokenize = interface.split_sent else: raise Exception() if not args.split: sent_tokenize = lambda para: [para] # 1. load data source_path = in_path or os.path.join( args.source_dir, "{}-v{}.json".format(data_type, args.version)) source_data = json.load(open(source_path, 'r')) # load the train data or dev 1.1 dataset q, cq, y, rx, rcx, ids, idxs = [], [], [], [], [], [], [] cy = [] x, cx = [], [] answerss = [] p = [] word_counter, char_counter, lower_word_counter = Counter(), Counter( ), Counter() start_at_index = int(round(len(source_data['data']) * start_ratio)) stop_at_index = int(round(len(source_data['data']) * stop_ratio)) # for each article for article_index, article in enumerate( tqdm(source_data['data'][start_at_index:stop_at_index])): xp, cxp = [], [] pp = [] x.append(xp) # article_paragraph_sentence_wordlist cx.append(cxp) # article_paragraph_sentence_word_charlist p.append(pp) # article_contextlist # for each paragrph of the article for paragraph_index, paragraph in enumerate(article['paragraphs']): # wordss context = paragraph['context'] context = context.replace("''", '" ') context = context.replace("``", '" ') list_of_wordlist = list(map(word_tokenize, sent_tokenize(context))) list_of_wordlist = [ process_tokens(tokens) for tokens in list_of_wordlist ] # process tokens # xi are words # given xi, add chars list_of_charlist = [[list(word) for word in word_list] for word_list in list_of_wordlist] # cxi are characters for each words xp.append(list_of_wordlist) # paragraph_sentence_wordlist cxp.append(list_of_charlist) # paragraph_sentence_word_charlist pp.append(context) # contextlist # update the counter to plus the number of questions for wordlist in list_of_wordlist: for word in wordlist: word_counter[word] += len(paragraph['qas']) lower_word_counter[word.lower()] += len(paragraph['qas']) for char in word: char_counter[char] += len(paragraph['qas']) rxi = [article_index, paragraph_index] assert len(x) - 1 == article_index assert len(x[article_index]) - 1 == paragraph_index for question in paragraph['qas']: # get words question_wordslist = word_tokenize(question['question']) question_charslist = [list(qij) for qij in question_wordslist] yi = [] cyi = [] answers = [] for answer in question['answers']: answer_text = answer['text'] answers.append(answer_text) answer_start = answer['answer_start'] answer_stop = answer_start + len(answer_text) # TODO : put some function that gives word_start, word_stop here yi0, yi1 = get_word_span(context, list_of_wordlist, answer_start, answer_stop) # yi0 = answer['answer_word_start'] or [0, 0] # yi1 = answer['answer_word_stop'] or [0, 1] assert len(list_of_wordlist[yi0[0]]) > yi0[1] assert len(list_of_wordlist[yi1[0]]) >= yi1[1] w0 = list_of_wordlist[yi0[0]][yi0[1]] w1 = list_of_wordlist[yi1[0]][yi1[1] - 1] i0 = get_word_idx(context, list_of_wordlist, yi0) i1 = get_word_idx(context, list_of_wordlist, (yi1[0], yi1[1] - 
1)) cyi0 = answer_start - i0 cyi1 = answer_stop - i1 - 1 # print(answer_text, w0[cyi0:], w1[:cyi1+1]) assert answer_text[0] == w0[cyi0], (answer_text, w0, cyi0) assert answer_text[-1] == w1[cyi1] assert cyi0 < 32, (answer_text, w0) assert cyi1 < 32, (answer_text, w1) yi.append([yi0, yi1]) cyi.append([cyi0, cyi1]) for qij in question_wordslist: word_counter[qij] += 1 lower_word_counter[qij.lower()] += 1 for qijk in qij: char_counter[qijk] += 1 q.append(question_wordslist) # question_wordlist, cq.append(question_charslist) # qeustion_word_charlist y.append(yi) # cy.append(cyi) rx.append(rxi) rcx.append(rxi) ids.append(question['id']) idxs.append(len(idxs)) answerss.append(answers) if args.debug: break word2vec_dict = get_word2vec(args, word_counter) lower_word2vec_dict = get_word2vec(args, lower_word_counter) # add context here data = { 'q': q, # list of word list of each questions, [['who','are', 'you'], ... ] 'cq': cq, # [<class 'list'>: [['T', 'o'], ['w', 'h', 'o', 'm'], ['d', 'i', 'd'], ['t', 'h', 'e'], ['V', 'i', 'r', 'g', 'i', 'n'], ['M', 'a', 'r', 'y'], ['a', 'l', 'l', 'e', 'g', 'e', 'd', 'l', 'y'], ['a', 'p', 'p', 'e', 'a', 'r'], ['i', 'n'], ['1', '8', '5', '8'], ['i', 'n'], ['L', 'o', 'u', 'r', 'd', 'e', 's'], ['F', 'r', 'a', 'n', 'c', 'e'], ['?']] , ...] 'y': y, # list of <class 'list'>: [[(0, 108), (0, 111)]] '*x': rx, # list of <class 'list'>: [0, 21], 0 means the number of article, 21 means the 21st paragraph '*cx': rcx, # same with rx but for characters, i guess the values are same as well 'cy': cy, # 'idxs': idxs, # just those ids 'ids': ids, # the id of each question, sth like uuid 'answerss': answerss, # the content of the answer '*p': rx # } # the following variables are shared by several question, shared = { 'x': x, # words of each paragraph 'cx': cx, # characters of each 'p': p, # the content of each paragraph 'word_counter': word_counter, 'char_counter': char_counter, 'lower_word_counter': lower_word_counter, 'word2vec': word2vec_dict, 'lower_word2vec': lower_word2vec_dict } print("saving ...")
def prepro_each(args, data_type, start_ratio=0.0, stop_ratio=1.0, out_name="default", in_path=None): if args.tokenizer == "PTB": import nltk sent_tokenize = nltk.sent_tokenize def word_tokenize(tokens): # return [token.replace("''", '"').replace("``", '"') for token in nltk.word_tokenize(tokens)] return [token.replace("''", '"').replace("``", '"') for token in jieba.lcut(tokens, cut_all=False)] elif args.tokenizer == 'Stanford': from my.corenlp_interface import CoreNLPInterface interface = CoreNLPInterface(args.url, args.port) sent_tokenize = interface.split_doc word_tokenize = interface.split_sent else: raise Exception() # if not args.split: # sent_tokenize = lambda para: [para] source_path = in_path or os.path.join(args.source_dir, "{}-v1.1.json".format(data_type)) source_data = json.load(open(source_path, 'r', encoding='utf-8')) q, cq, y, rx, rcx, ids, idxs = [], [], [], [], [], [], [] cy = [] x, cx = [], [] answerss = [] p = [] word_counter, char_counter, lower_word_counter = Counter(), Counter(), Counter() start_ai = int(round(len(source_data['data']) * start_ratio)) stop_ai = int(round(len(source_data['data']) * stop_ratio)) for ai, article in enumerate(tqdm(source_data['data'][start_ai:stop_ai])): xp, cxp = [], [] pp = [] x.append(xp) cx.append(cxp) p.append(pp) for pi, para in enumerate(article['paragraphs']): # wordss context = para['context'] context = context.replace("''", '" ') context = context.replace("``", '" ') ################### add by zhijing table = {ord(f):ord(t) for f,t in zip( ',。!?【】()%#@&1234567890', ',.!?[]()%#@&1234567890')} context = context.translate(table) ################### add by zhijing print(context) print(len(sent_tokenize(context))) xi = list(map(word_tokenize, sent_tokenize(context))) xi = [process_tokens(tokens) for tokens in xi] # process tokens print('xi') print(xi) # given xi, add chars cxi = [[list(xijk) for xijk in xij] for xij in xi] xp.append(xi) cxp.append(cxi) pp.append(context) for xij in xi: for xijk in xij: word_counter[xijk] += len(para['qas']) lower_word_counter[xijk.lower()] += len(para['qas']) for xijkl in xijk: char_counter[xijkl] += len(para['qas']) rxi = [ai, pi] assert len(x) - 1 == ai assert len(x[ai]) - 1 == pi for qa in para['qas']: # get words qi = word_tokenize(qa['question']) cqi = [list(qij) for qij in qi] yi = [] cyi = [] answers = [] for answer in qa['answers']: answer_text = answer['text'] answers.append(answer_text) answer_start = answer['answer_start'] answer_stop = answer_start + len(answer_text) # TODO : put some function that gives word_start, word_stop here yi0, yi1 = get_word_span(context, xi, answer_start, answer_stop) # yi0 = answer['answer_word_start'] or [0, 0] # yi1 = answer['answer_word_stop'] or [0, 1] assert len(xi[yi0[0]]) > yi0[1] assert len(xi[yi1[0]]) >= yi1[1] w0 = xi[yi0[0]][yi0[1]] w1 = xi[yi1[0]][yi1[1]-1] i0 = get_word_idx(context, xi, yi0) i1 = get_word_idx(context, xi, (yi1[0], yi1[1]-1)) cyi0 = answer_start - i0 cyi1 = answer_stop - i1 - 1 # print(answer_text, w0[cyi0:], w1[:cyi1+1]) assert answer_text[0] == w0[cyi0], (answer_text, w0, cyi0) assert answer_text[-1] == w1[cyi1] assert cyi0 < 32, (answer_text, w0) assert cyi1 < 32, (answer_text, w1) yi.append([yi0, yi1]) cyi.append([cyi0, cyi1]) for qij in qi: word_counter[qij] += 1 lower_word_counter[qij.lower()] += 1 for qijk in qij: char_counter[qijk] += 1 q.append(qi) cq.append(cqi) y.append(yi) cy.append(cyi) rx.append(rxi) rcx.append(rxi) ids.append(qa['id']) idxs.append(len(idxs)) answerss.append(answers) if args.debug: break word2vec_dict = 
get_word2vec(args, word_counter) lower_word2vec_dict = get_word2vec(args, lower_word_counter) # add context here data = {'q': q, 'cq': cq, 'y': y, '*x': rx, '*cx': rcx, 'cy': cy, 'idxs': idxs, 'ids': ids, 'answerss': answerss, '*p': rx} shared = {'x': x, 'cx': cx, 'p': p, 'word_counter': word_counter, 'char_counter': char_counter, 'lower_word_counter': lower_word_counter, 'word2vec': word2vec_dict, 'lower_word2vec': lower_word2vec_dict} print("saving ...") save(args, data, shared, out_name)
def prepro_each(args, data_type, start_ratio=0.0, stop_ratio=1.0, out_name="default", in_path=None): if args.tokenizer == "PTB": import nltk sent_tokenize = nltk.sent_tokenize def word_tokenize(tokens): return [token.replace("''", '"').replace("``", '"') for token in nltk.word_tokenize(tokens)] elif args.tokenizer == 'Stanford': from my.corenlp_interface import CoreNLPInterface interface = CoreNLPInterface(args.url, args.port) sent_tokenize = interface.split_doc word_tokenize = interface.split_sent else: raise Exception() if not args.split: sent_tokenize = lambda para: [para] source_path = in_path or os.path.join(args.source_dir, "{}-{}v1.1.json".format(data_type, args.suffix)) source_data = json.load(open(source_path, 'r', encoding="utf-8")) q, cq, y, rx, rcx, ids, idxs = [], [], [], [], [], [], [] na = [] cy = [] x, cx = [], [] answerss = [] p = [] word_counter, char_counter, lower_word_counter = Counter(), Counter(), Counter() start_ai = int(round(len(source_data['data']) * start_ratio)) stop_ai = int(round(len(source_data['data']) * stop_ratio)) for ai, article in enumerate(tqdm(source_data['data'][start_ai:stop_ai])): xp, cxp = [], [] pp = [] x.append(xp) cx.append(cxp) p.append(pp) for pi, para in enumerate(article['paragraphs']): # wordss context = para['context'] context = context.replace("''", '" ') context = context.replace("``", '" ') xi = list(map(word_tokenize, sent_tokenize(context))) xi = [process_tokens(tokens) for tokens in xi] # process tokens # given xi, add chars cxi = [[list(xijk) for xijk in xij] for xij in xi] xp.append(xi) cxp.append(cxi) pp.append(context) for xij in xi: for xijk in xij: word_counter[xijk] += len(para['qas']) lower_word_counter[xijk.lower()] += len(para['qas']) for xijkl in xijk: char_counter[xijkl] += len(para['qas']) rxi = [ai, pi] assert len(x) - 1 == ai assert len(x[ai]) - 1 == pi for qa in para['qas']: # get words qi = word_tokenize(qa['question']) qi = process_tokens(qi) cqi = [list(qij) for qij in qi] yi = [] cyi = [] answers = [] for answer in qa['answers']: answer_text = answer['text'] answers.append(answer_text) answer_start = answer['answer_start'] answer_stop = answer_start + len(answer_text) # TODO : put some function that gives word_start, word_stop here yi0, yi1 = get_word_span(context, xi, answer_start, answer_stop) # yi0 = answer['answer_word_start'] or [0, 0] # yi1 = answer['answer_word_stop'] or [0, 1] assert len(xi[yi0[0]]) > yi0[1] assert len(xi[yi1[0]]) >= yi1[1] w0 = xi[yi0[0]][yi0[1]] w1 = xi[yi1[0]][yi1[1]-1] i0 = get_word_idx(context, xi, yi0) i1 = get_word_idx(context, xi, (yi1[0], yi1[1]-1)) cyi0 = answer_start - i0 cyi1 = answer_stop - i1 - 1 #add #print("i0 :",i0, "i1 :",i1, "cyi0 :", cyi0, "w0 :",w0 ) #print("xi :", xi) #print( "yi0",yi0, "(yi1[0], yi1[1]-1) :",(yi1[0], yi1[1]-1) ) #print("answer_text",answer_text) #print("cyi1:",cyi1) #print("answer_text[0] :",answer_text[0]) #print("answer_text[-1] :",answer_text[-1]) #print("w0 :",w0) #print("w1 :",w1) #so far #print(":):):)") #print("answer_text:",answer_text,"\nstart:", w0[cyi0:],"\nend:", w1[:cyi1+1]) assert answer_text[0] == w0[cyi0], (answer_text, w0, cyi0) assert answer_text[-1] == w1[cyi1] assert cyi0 < 32, (answer_text, w0) assert cyi1 < 32, (answer_text, w1) yi.append([yi0, yi1]) cyi.append([cyi0, cyi1]) if len(qa['answers']) == 0: yi.append([(0, 0), (0, 1)]) cyi.append([0, 1]) na.append(True) else: na.append(False) for qij in qi: word_counter[qij] += 1 lower_word_counter[qij.lower()] += 1 for qijk in qij: char_counter[qijk] += 1 q.append(qi) 
cq.append(cqi) y.append(yi) cy.append(cyi) rx.append(rxi) rcx.append(rxi) ids.append(qa['id']) idxs.append(len(idxs)) answerss.append(answers) if args.debug: break word2vec_dict = get_word2vec(args, word_counter) lower_word2vec_dict = get_word2vec(args, lower_word_counter) # add context here data = {'q': q, 'cq': cq, 'y': y, '*x': rx, '*cx': rcx, 'cy': cy, 'idxs': idxs, 'ids': ids, 'answerss': answerss, '*p': rx, 'na': na} shared = {'x': x, 'cx': cx, 'p': p, 'word_counter': word_counter, 'char_counter': char_counter, 'lower_word_counter': lower_word_counter, 'word2vec': word2vec_dict, 'lower_word2vec': lower_word2vec_dict} print("saving ...") save(args, data, shared, out_name)
def prepro_each(args, data_type, start_ratio=0.0, stop_ratio=1.0, out_name="default", in_path=None): if args.tokenizer == "PTB": import nltk sent_tokenize = nltk.sent_tokenize def word_tokenize(tokens): return [token.replace("''", '"').replace("``", '"') for token in nltk.word_tokenize(tokens)] elif args.tokenizer == 'Stanford': from my.corenlp_interface import CoreNLPInterface interface = CoreNLPInterface(args.url, args.port) sent_tokenize = interface.split_doc word_tokenize = interface.split_sent else: raise Exception() if not args.split: sent_tokenize = lambda para: [para] source_path = in_path or os.path.join(args.source_dir, "{}-qar_squad_all.jsonl".format(data_type)) rfp = open(source_path, 'r') q, cq, y, rx, rcx, ids, idxs = [], [], [], [], [], [], [] cy = [] x, cx = [], [] answerss = [] p = [] word_counter, char_counter, lower_word_counter = Counter(), Counter(), Counter() #start_ai = int(round(len(source_data['data']) * start_ratio)) #stop_ai = int(round(len(source_data['data']) * stop_ratio)) pi = 0 ai = 0 xp, cxp = [], [] pp = [] x.append(xp) cx.append(cxp) p.append(pp) for line in tqdm(rfp): para = json.loads(line) context = para['context'] context = context.replace("''", '" ') context = context.replace("``", '" ') xi = list(map(word_tokenize, sent_tokenize(context))) # xi = context.split() xi = [process_tokens(tokens) for tokens in xi] # process tokens # given xi, add chars cxi = [[list(xijk) for xijk in xij] for xij in xi] xp.append(xi) cxp.append(cxi) pp.append(context) for xij in xi: for xijk in xij: word_counter[xijk] += len(para['qas']) lower_word_counter[xijk.lower()] += len(para['qas']) for xijkl in xijk: char_counter[xijkl] += len(para['qas']) rxi = [ai, pi] assert len(x) - 1 == ai assert len(x[ai]) - 1 == pi for qa in para['qas']: # get words qa_text = qa['question'] qa_text = qa_text.replace("''", '" ') qa_text = qa_text.replace("``", '" ') qi = word_tokenize(qa_text) # qi = qa['question'].split() cqi = [list(qij) for qij in qi] yi = [] cyi = [] answers = [] for answer in qa['answers']: flag = False answer_text = answer['text'] answer_text = answer_text.replace("''", '" ') answer_text = answer_text.replace("``", '" ') answers.append(answer_text) answer_start = answer['answer_start'] answer_stop = answer_start + len(answer_text) # TODO : put some function that gives word_start, word_stop here yi0, yi1 = get_word_span(context, xi, answer_start, answer_stop) # yi0 = answer['answer_word_start'] or [0, 0] # yi1 = answer['answer_word_stop'] or [0, 1] assert len(xi[yi0[0]]) > yi0[1] assert len(xi[yi1[0]]) >= yi1[1] w0 = xi[yi0[0]][yi0[1]] w1 = xi[yi1[0]][yi1[1]-1] if len(w1) == 0 and len(xi[yi1[0]][yi1[1]-2]) != 0: flag = True w1 = xi[yi1[0]][yi1[1]-2] i0 = get_word_idx(context, xi, yi0) i1 = get_word_idx(context, xi, (yi1[0], yi1[1]-1)) cyi0 = answer_start - i0 cyi1 = answer_stop - i1 - 1 # print(answer_text, w0[cyi0:], w1[:cyi1+1]) assert answer_text[0] == w0[cyi0], (answer_text, w0, cyi0) if flag: assert answer_text[-2] == w1[cyi1], (answer_text, w1, cyi1) else: assert answer_text[-1] == w1[cyi1], (answer_text, w1, cyi1) assert cyi0 < 32, (answer_text, w0) assert cyi1 < 32, (answer_text, w1) yi.append([yi0, yi1]) cyi.append([cyi0, cyi1]) for qij in qi: word_counter[qij] += 1 lower_word_counter[qij.lower()] += 1 for qijk in qij: char_counter[qijk] += 1 q.append(qi) cq.append(cqi) y.append(yi) cy.append(cyi) rx.append(rxi) rcx.append(rxi) ids.append(qa['id']) idxs.append(len(idxs)) answerss.append(answers) if args.debug: break pi += 1 word2vec_dict = get_word2vec(args, 
word_counter) lower_word2vec_dict = get_word2vec(args, lower_word_counter) # add context here data = {'q': q, 'cq': cq, 'y': y, '*x': rx, '*cx': rcx, 'cy': cy, 'idxs': idxs, 'ids': ids, 'answerss': answerss, '*p': rx} shared = {'x': x, 'cx': cx, 'p': p, 'word_counter': word_counter, 'char_counter': char_counter, 'lower_word_counter': lower_word_counter, 'word2vec': word2vec_dict, 'lower_word2vec': lower_word2vec_dict} print("Saving ...") save(args, data, shared, out_name) print("Saving complete!")
def prepro_each(args, data_type, start_ratio=0.0, stop_ratio=1.0,
                out_name="default", in_path=None):
    if args.tokenizer == "PTB":
        import nltk
        sent_tokenize = nltk.sent_tokenize

        def word_tokenize(tokens):
            return [token.replace("''", '"').replace("``", '"')
                    for token in nltk.word_tokenize(tokens)]
    elif args.tokenizer == 'Stanford':
        from my.corenlp_interface import CoreNLPInterface
        interface = CoreNLPInterface(args.url, args.port)
        sent_tokenize = interface.split_doc
        word_tokenize = interface.split_sent
    else:
        raise Exception()

    if not args.split:
        sent_tokenize = lambda para: [para]

    source_path = in_path or os.path.join(args.source_dir,
                                          "{}-v1.1.json".format(data_type))
    source_data = json.load(open(source_path, 'r'))

    q, cq, y, rx, rcx, ids, idxs = [], [], [], [], [], [], []
    contextss = []
    context_questions = []
    titles = []
    cy = []
    x, cx = [], []
    answerss = []
    p = []
    word_counter, char_counter, lower_word_counter = Counter(), Counter(), Counter()
    start_ai = int(round(len(source_data['data']) * start_ratio))
    stop_ai = int(round(len(source_data['data']) * stop_ratio))

    for ai, article in enumerate(tqdm(source_data['data'][start_ai:stop_ai])):
        pp = []
        p.append(pp)
        xp, cxp, contexts, c_questions = [], [], [], []
        x.append(xp)
        cx.append(cxp)
        contextss.append(contexts)
        context_questions.append(c_questions)
        title = "[" + str(ai).zfill(2) + "] " + article['title'].replace('_', ' ')
        titles.append(title)
        for pi, para in enumerate(article['paragraphs']):
            # wordss
            context = para['context']
            context = context.replace("''", '" ')
            context = context.replace("``", '" ')
            # sentences of the original paragraph
            contexts.append(context)
            xi = list(map(word_tokenize, sent_tokenize(context)))
            xi = [process_tokens(tokens) for tokens in xi]  # process tokens
            # given xi, add chars
            cxi = [[list(xijk) for xijk in xij] for xij in xi]
            xp.append(xi)
            cxp.append(cxi)
            pp.append(context)

            for xij in xi:
                for xijk in xij:
                    word_counter[xijk] += len(para['qas'])
                    lower_word_counter[xijk.lower()] += len(para['qas'])
                    for xijkl in xijk:
                        char_counter[xijkl] += len(para['qas'])

            rxi = [ai, pi]
            assert len(x) - 1 == ai
            assert len(x[ai]) - 1 == pi
            if ai == 0:
                c_questions.append(para['qas'][3]['question'])
            else:
                c_questions.append(para['qas'][0]['question'])
            # The per-question span extraction below is kept for reference but
            # disabled in this variant.
            """
            for qa in para['qas']:
                # get words
                c_questions.append(qa['question'])
                break
                qi = word_tokenize(qa['question'])  # qa['question'] : original question
                cqi = [list(qij) for qij in qi]
                yi = []
                cyi = []
                answers = []
                for answer in qa['answers']:
                    answer_text = answer['text']
                    answers.append(answer_text)
                    answer_start = answer['answer_start']
                    answer_stop = answer_start + len(answer_text)
                    # TODO : put some function that gives word_start, word_stop here
                    yi0, yi1 = get_word_span(context, xi, answer_start, answer_stop)
                    # yi0 = answer['answer_word_start'] or [0, 0]
                    # yi1 = answer['answer_word_stop'] or [0, 1]
                    assert len(xi[yi0[0]]) > yi0[1]
                    assert len(xi[yi1[0]]) >= yi1[1]
                    w0 = xi[yi0[0]][yi0[1]]
                    w1 = xi[yi1[0]][yi1[1]-1]
                    i0 = get_word_idx(context, xi, yi0)
                    i1 = get_word_idx(context, xi, (yi1[0], yi1[1]-1))
                    cyi0 = answer_start - i0
                    cyi1 = answer_stop - i1 - 1
                    # print(answer_text, w0[cyi0:], w1[:cyi1+1])
                    assert answer_text[0] == w0[cyi0], (answer_text, w0, cyi0)
                    assert answer_text[-1] == w1[cyi1]
                    assert cyi0 < 32, (answer_text, w0)
                    assert cyi1 < 32, (answer_text, w1)
                    yi.append([yi0, yi1])
                    cyi.append([cyi0, cyi1])

                for qij in qi:
                    word_counter[qij] += 1
                    lower_word_counter[qij.lower()] += 1
                    for qijk in qij:
                        char_counter[qijk] += 1

                q.append(qi)
                cq.append(cqi)
                y.append(yi)
                cy.append(cyi)
                rx.append(rxi)
                rcx.append(rxi)
                ids.append(qa['id'])
                idxs.append(len(idxs))
                answerss.append(answers)
            """
        if args.debug:
            break

    word2vec_dict = get_word2vec(args, word_counter)
    lower_word2vec_dict = get_word2vec(args, lower_word_counter)

    # add context here
    data = {
        'q': q, 'cq': cq, 'y': y, '*x': rx, '*cx': rcx, 'cy': cy,
        'idxs': idxs, 'ids': ids, 'answerss': answerss, '*p': rx
    }
    shared = {
        'x': x, 'cx': cx, 'p': p,
        'contextss': contextss, 'context_questions': context_questions, 'titles': titles,
        'word_counter': word_counter, 'char_counter': char_counter,
        'lower_word_counter': lower_word_counter,
        'word2vec': word2vec_dict, 'lower_word2vec': lower_word2vec_dict
    }

    print("saving ...")
    save(args, data, shared, out_name)
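# A minimal driver sketch (not part of the original script): it shows how this
# prepro_each variant might be invoked. The Namespace fields below are inferred
# from the attributes the function reads (tokenizer, split, source_dir, debug,
# plus url/port for the Stanford branch); any extra fields consumed by
# get_word2vec() and save() in a given setup would have to be added as well.
# All names and paths here are hypothetical.
from types import SimpleNamespace

def _demo_run_squad_prepro():
    demo_args = SimpleNamespace(
        tokenizer="PTB",           # use the NLTK/PTB branch above
        split=True,                # keep sentence splitting enabled
        source_dir="data/squad",   # hypothetical directory holding {train,dev}-v1.1.json
        url=None, port=None,       # only needed by the Stanford tokenizer branch
        debug=False,
    )
    # Produces the data/shared dictionaries and writes them via save().
    prepro_each(demo_args, "train", out_name="train")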
def prepro_each(args, data_type, start_ratio=0.0, stop_ratio=1.0,
                out_name="default", in_path=None):
    if args.tokenizer == "PTB":
        import nltk
        sent_tokenize = nltk.sent_tokenize

        def word_tokenize(tokens):
            return [token.replace("''", '"').replace("``", '"')
                    for token in nltk.word_tokenize(tokens)]
    elif args.tokenizer == 'Stanford':
        from my.corenlp_interface import CoreNLPInterface
        interface = CoreNLPInterface(args.url, args.port)
        sent_tokenize = interface.split_doc
        word_tokenize = interface.split_sent
    else:
        raise Exception()

    if not args.split:
        sent_tokenize = lambda para: [para]

    source_path = in_path or os.path.join(args.source_dir,
                                          "{}.seq.json".format(data_type))
    source_data = json.load(open(source_path, 'r'))

    q, cq, y, rx, rcx, ids, idxs = [], [], [], [], [], [], []
    x, cx = [], []
    answerss = []
    p = []
    word_counter, char_counter, lower_word_counter = Counter(), Counter(), Counter()
    start_ai = int(round(len(source_data['data']) * start_ratio))
    stop_ai = int(round(len(source_data['data']) * stop_ratio))

    for ai, article in enumerate(tqdm(source_data['data'][start_ai:stop_ai])):
        xp, cxp = [], []
        pp = []
        x.append(xp)
        cx.append(cxp)
        p.append(pp)
        for pi, para in enumerate(article['paragraphs']):
            # wordss
            context = para['context']
            context = context.replace("''", '" ')
            context = context.replace("``", '" ')
            xi = list(map(word_tokenize, sent_tokenize(context)))
            xi = [process_tokens(tokens) for tokens in xi]  # process tokens
            xi = [[xijk for xijk in xij if xijk != ''] for xij in xi]
            # given xi, add chars
            cxi = [[list(xijk) for xijk in xij] for xij in xi]
            xp.append(xi)
            cxp.append(cxi)
            pp.append(context)

            for xij in xi:
                for xijk in xij:
                    word_counter[xijk] += len(para['qas'])
                    lower_word_counter[xijk.lower()] += len(para['qas'])
                    for xijkl in xijk:
                        char_counter[xijkl] += len(para['qas'])

            rxi = [ai, pi]
            assert len(x) - 1 == ai
            assert len(x[ai]) - 1 == pi
            for qa in para['qas']:
                # get words
                qi = word_tokenize(qa['question'])
                cqi = [list(qij) for qij in qi]
                yi = []
                answers = []
                for answer in qa['answers']:
                    answer_text = answer['text']
                    answers.append(answer_text)
                    ansi = word_tokenize(answer_text)
                    answer_location = answer['answer_location']
                    yii = []
                    for ans_idx, answer_start in enumerate(answer_location):
                        answer_stop = answer_start + len(ansi[ans_idx])
                        yi0, _ = get_word_span(context, xi, answer_start, answer_stop)
                        assert len(xi[yi0[0]]) > yi0[1]
                        w0 = xi[yi0[0]][yi0[1]]
                        assert ansi[ans_idx] == w0, (ansi[ans_idx], w0)
                        yii.append(yi0)
                    yi.append(yii)
                    # answer_start = answer['answer_start']
                    # answer_stop = answer_start + len(answer_text)
                    # yi0, yi1 = get_word_span(context, xi, answer_start, answer_stop)
                    # # yi0 = answer['answer_word_start'] or [0, 0]
                    # # yi1 = answer['answer_word_stop'] or [0, 1]
                    # assert len(xi[yi0[0]]) > yi0[1]
                    # assert len(xi[yi1[0]]) >= yi1[1]
                    # w0 = xi[yi0[0]][yi0[1]]
                    # w1 = xi[yi1[0]][yi1[1]-1]
                    # i0 = get_word_idx(context, xi, yi0)
                    # i1 = get_word_idx(context, xi, (yi1[0], yi1[1]-1))
                    # cyi0 = answer_start - i0
                    # cyi1 = answer_stop - i1 - 1
                    # # print(answer_text, w0[cyi0:], w1[:cyi1+1])
                    # assert answer_text[0] == w0[cyi0], (answer_text, w0, cyi0)
                    # assert answer_text[-1] == w1[cyi1]
                    # assert cyi0 < 32, (answer_text, w0)
                    # assert cyi1 < 32, (answer_text, w1)
                    # yi.append([yi0, yi1])

                for qij in qi:
                    word_counter[qij] += 1
                    lower_word_counter[qij.lower()] += 1
                    for qijk in qij:
                        char_counter[qijk] += 1

                q.append(qi)
                cq.append(cqi)
                y.append(yi)
                rx.append(rxi)
                rcx.append(rxi)
                ids.append(qa['id'])
                idxs.append(len(idxs))
                answerss.append(answers)
        if args.debug:
            break

    word2vec_dict = get_word2vec(args, word_counter)
    lower_word2vec_dict = get_word2vec(args, lower_word_counter)

    # add context here
    data = {
        'q': q, 'cq': cq, 'y': y, '*x': rx, '*cx': rcx,
        'idxs': idxs, 'ids': ids, 'answerss': answerss, '*p': rx
    }
    shared = {
        'x': x, 'cx': cx, 'p': p,
        'word_counter': word_counter, 'char_counter': char_counter,
        'lower_word_counter': lower_word_counter,
        'word2vec': word2vec_dict, 'lower_word2vec': lower_word2vec_dict
    }

    print("saving ...")
    save(args, data, shared, out_name)
def prepro_each(args, data_type, start_ratio=0.0, stop_ratio=1.0,
                out_name="default", in_path=None):
    if args.tokenizer == "PTB":
        import nltk
        sent_tokenize = nltk.sent_tokenize

        def word_tokenize(tokens):
            return [token.replace("''", '"').replace("``", '"')
                    for token in nltk.word_tokenize(tokens)]
    elif args.tokenizer == 'Stanford':
        from my.corenlp_interface import CoreNLPInterface
        interface = CoreNLPInterface(args.url, args.port)
        sent_tokenize = interface.split_doc
        word_tokenize = interface.split_sent
    else:
        raise Exception()

    # if not args.split:
    #     sent_tokenize = lambda para: [para]

    source_path = in_path or os.path.join(args.source_dir, "third_party", "wikipedia")
    source_summaries = pd.read_csv(source_path + '/summaries.csv')
    source_qas = pd.read_csv(args.source_dir + '/qaps.csv')

    summaries = []
    summaries_char_list = []
    ques_answers = []
    questions = []
    questions_char_list = []
    document_ids = []
    word_counter, char_counter, lower_word_counter = Counter(), Counter(), Counter()

    summary_index = -1
    for index_summ, row in tqdm(source_summaries.iterrows()):
        if data_type == row['set']:
            summary_tokenized_paras = []
            summary_char_para = []
            summary_tokenized = list(
                map(word_tokenize, sent_tokenize(row['summary_tokenized'])))
            summary_tokenized = [process_tokens(tokens) for tokens in summary_tokenized]
            char_list = [[list(word) for word in sent] for sent in summary_tokenized]
            # print("summ", summary_tokenized)
            # print(char_list)
            summary_tokenized_paras.append(summary_tokenized)  # TODO: each summary has only one paragraph
            summaries.append(summary_tokenized_paras)
            summary_char_para.append(char_list)  # TODO: each summary has only one paragraph
            summaries_char_list.append(summary_char_para)

            # Because train/test/valid all live in one file, index_summ cannot
            # be used as the running index.
            summary_index = summary_index + 1

            qas = source_qas[source_qas['document_id'].isin([row['document_id']])]
            for sent in summary_tokenized:
                for word in sent:
                    word_counter[word] += len(qas)
                    lower_word_counter[word.lower()] += len(qas)
                    for char in word:
                        char_counter[char] += len(qas)

            for index, qa in qas.iterrows():
                # If a question spans multiple sentences, that case is not
                # handled (most probably not required).
                question_tokenized = word_tokenize(qa['question'])
                question_tokenized = process_tokens(question_tokenized)
                # print(question_tokenized)
                question_char_list = [list(word) for word in question_tokenized]
                answer1_tokenized = list(
                    map(word_tokenize, sent_tokenize(qa['answer1'])))
                answer1_tokenized = [process_tokens(tokens) for tokens in answer1_tokenized]
                # print(answer1_tokenized)
                answer2_tokenized = list(
                    map(word_tokenize, sent_tokenize(qa['answer2'])))
                answer2_tokenized = [process_tokens(tokens) for tokens in answer2_tokenized]
                # print(answer2_tokenized)
                ques_answers.append([answer1_tokenized, answer2_tokenized])
                # print(ques_answers)
                questions.append(question_tokenized)
                questions_char_list.append(question_char_list)
                document_ids.append([summary_index, row['document_id']])

                # question_tokenized is a flat token list, so count its tokens
                # (and their characters) directly.
                for word in question_tokenized:
                    word_counter[word] += 1
                    lower_word_counter[word.lower()] += 1
                    for char in word:
                        char_counter[char] += 1

    word2vec_dict = get_word2vec(args, word_counter)
    lower_word2vec_dict = get_word2vec(args, lower_word_counter)

    data = {
        'q': questions, 'cq': questions_char_list, '*x': document_ids,
        'answerss': ques_answers, '*cx': document_ids
    }
    shared = {
        'x': summaries, 'cx': summaries_char_list,
        'word_counter': word_counter, 'char_counter': char_counter,
        'lower_word_counter': lower_word_counter,
        'word2vec': word2vec_dict, 'lower_word2vec': lower_word2vec_dict
    }

    print("saving ...")
    save(args, data, shared, out_name)
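# A small fixture sketch (not from the original repo): it writes toy
# summaries.csv / qaps.csv files containing just the columns the function above
# actually reads ('document_id', 'set', 'summary_tokenized' on the summary side;
# 'document_id', 'question', 'answer1', 'answer2' on the QA side). The real
# NarrativeQA CSVs carry additional columns, which this variant ignores.
# The directory layout and all example values are hypothetical.
import os
import pandas as pd

def _write_toy_narrativeqa(root="toy_narrativeqa"):
    os.makedirs(os.path.join(root, "third_party", "wikipedia"), exist_ok=True)
    summaries = pd.DataFrame([{
        "document_id": "doc0",
        "set": "train",
        "summary_tokenized": "Alice meets Bob . They travel to Paris .",
    }])
    qaps = pd.DataFrame([{
        "document_id": "doc0",
        "question": "Where do they travel ?",
        "answer1": "Paris .",
        "answer2": "To Paris .",
    }])
    summaries.to_csv(os.path.join(root, "third_party", "wikipedia", "summaries.csv"),
                     index=False)
    qaps.to_csv(os.path.join(root, "qaps.csv"), index=False)
    return root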
def prepro_each(args, data_type, start_ratio=0.0, stop_ratio=1.0, out_name="default"):
    word_tokenize, sent_tokenize = get_sent_tokenize()

    source_data = []
    f = open(os.path.join(args.source_dir, 'WikiQA-%s.txt' % data_type), 'r')
    lines = (f.read()).rsplit('\n')
    for i, line in enumerate(lines):
        if line == '':
            continue
        t = tuple(line.rsplit('\t'))
        assert len(t) == 3, t
        question, sentence, correct = t
        curr_question = question
        if not sentence.endswith('.'):
            sentence += '.'
        _id = len(source_data)
        qas = [{'answer': correct, 'id': _id, 'question': question}]
        dic = {'context': sentence, 'qas': qas}
        source_data.append({'paragraphs': [dic]})
    json.dump({'data': source_data},
              open(os.path.join(args.source_dir, '%s-class.json' % data_type), 'w'))

    q, cq, y, rx, rcx, ids, idxs = [], [], [], [], [], [], []
    cy = []
    x, cx = [], []
    answerss = []
    p = []
    word_counter, char_counter, lower_word_counter = Counter(), Counter(), Counter()
    start_ai = int(round(len(source_data) * start_ratio))
    stop_ai = int(round(len(source_data) * stop_ratio))
    answer_counter = Counter()
    N = 0
    for ai, article in enumerate(tqdm(source_data[start_ai:stop_ai])):
        xp, cxp = [], []
        pp = []
        x.append(xp)
        cx.append(cxp)
        p.append(pp)
        for pi, para in enumerate(article['paragraphs']):
            context = para['context']
            context = context.replace("''", '" ')
            context = context.replace("``", '" ')
            xi = list(map(word_tokenize, sent_tokenize(context)))
            xi = [process_tokens(tokens) for tokens in xi]  # process tokens
            cxi = [[list(xijk) for xijk in xij] for xij in xi]
            xp.append(xi)
            cxp.append(cxi)
            pp.append(context)

            for xij in xi:
                for xijk in xij:
                    word_counter[xijk] += len(para['qas'])
                    lower_word_counter[xijk.lower()] += len(para['qas'])
                    for xijkl in xijk:
                        char_counter[xijkl] += len(para['qas'])

            rxi = [ai, pi]
            assert len(x) - 1 == ai
            assert len(x[ai]) - 1 == pi
            for qa in para['qas']:
                N += 1
                qi = word_tokenize(qa['question'])
                cqi = [list(qij) for qij in qi]
                yi = []
                answers = []
                answer = qa['answer'] == '1'
                answer_counter[answer] += 1
                yi.append(answer)

                for qij in qi:
                    word_counter[qij] += 1
                    lower_word_counter[qij.lower()] += 1
                    for qijk in qij:
                        char_counter[qijk] += 1

                def put():
                    q.append(qi)
                    cq.append(cqi)
                    y.append(yi)
                    rx.append(rxi)
                    rcx.append(rxi)
                    ids.append(qa['id'])
                    idxs.append(len(idxs))
                    answerss.append(answers)

                put()
                if data_type == 'train' and answer:
                    # oversample positive training examples
                    for i in range(17):
                        put()
        if args.debug:
            break

    word2vec_dict = get_word2vec(args, word_counter)
    lower_word2vec_dict = get_word2vec(args, lower_word_counter)

    # add context here
    data = {
        'q': q, 'cq': cq, 'y': y, '*x': rx, '*cx': rcx,
        'idxs': idxs, 'ids': ids, 'answerss': answerss, '*p': rx
    }
    shared = {
        'x': x, 'cx': cx, 'p': p,
        'word_counter': word_counter, 'char_counter': char_counter,
        'lower_word_counter': lower_word_counter,
        'word2vec': word2vec_dict, 'lower_word2vec': lower_word2vec_dict
    }

    print("saving ...")
    save(args, data, shared, out_name)
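# A toy-input sketch (an assumption about the expected layout, not a file
# shipped with WikiQA): the parsing loop above expects every non-empty line of
# WikiQA-<split>.txt to be "question<TAB>candidate sentence<TAB>label", where
# label '1' marks a correct sentence. This hypothetical helper writes such a
# file for a quick smoke test.
import os

def _write_toy_wikiqa(source_dir="toy_wikiqa", split="train"):
    os.makedirs(source_dir, exist_ok=True)
    rows = [
        ("who wrote hamlet", "Hamlet is a tragedy by William Shakespeare", "1"),
        ("who wrote hamlet", "It was written around 1600", "0"),
    ]
    with open(os.path.join(source_dir, "WikiQA-%s.txt" % split), "w") as f:
        for question, sentence, label in rows:
            f.write("\t".join([question, sentence, label]) + "\n")
    return source_dir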
def prepro_each(args, data_type, start_ratio=0.0, stop_ratio=1.0,
                out_name="default", in_path=None):
    print("Preprocessing data type %s" % data_type)
    if args.tokenizer == "PTB":
        import nltk
        sent_tokenize = nltk.sent_tokenize

        def word_tokenize(tokens):
            return [token.replace("''", '"').replace("``", '"')
                    for token in nltk.word_tokenize(tokens)]
    elif args.tokenizer == 'Stanford':
        from my.corenlp_interface import CoreNLPInterface
        interface = CoreNLPInterface(args.url, args.port)
        sent_tokenize = interface.split_doc
        word_tokenize = interface.split_sent
    else:
        raise Exception()

    if not args.split:
        sent_tokenize = lambda para: [para]

    source_path = in_path or os.path.join(args.source_dir, "{}.csv".format(data_type))
    print("Reading data from source path %s" % source_path)
    source_data = pd.read_csv(source_path,
                              encoding='utf-8',
                              dtype=dict(is_answer_absent=float),
                              na_values=dict(question=[], story_text=[], validated_answers=[]),
                              keep_default_na=False)

    q, cq, y, rx, rcx, ids, idxs = [], [], [], [], [], [], []
    cy = []
    x, cx = [], []
    answerss = []       # gold-standard answers
    span_answerss = []  # answers recovered from our spans
    p = []
    word_counter, char_counter, lower_word_counter = Counter(), Counter(), Counter()
    start_ai = int(round(len(source_data) * start_ratio))
    stop_ai = int(round(len(source_data) * stop_ratio))
    data_rows = source_data.iterrows()
    story_ids_to_idx = {}
    idx_to_story_ids = {}

    for ai, data_point in enumerate(tqdm(data_rows)):
        question_index, question_info = data_point[0], data_point[1]
        story_id = question_info['story_id']
        context = question_info['story_text']
        context = context.replace("''", '" ')
        context = context.replace("``", '" ')
        question = question_info['question']
        question_id = ai
        answer_char_ranges = question_info['answer_char_ranges']

        # Copy of the get-answer logic from the NewsQA dataset baseline.
        baseline_answers = []
        # Prefer validated answers.
        # If there are no validated answers, use the ones that are provided.
        if not 'validated_answers' in question_info or not question_info['validated_answers']:
            # Ignore per-selection splits.
            char_ranges = question_info['answer_char_ranges'].replace('|', ',').split(',')
        else:
            validated_answers_dict = json.loads(question_info['validated_answers'])
            char_ranges = []
            for k, v in validated_answers_dict.items():
                char_ranges += v * [k]
        for char_range in char_ranges:
            if char_range.lower() == 'none':
                baseline_answers.append('NONE')
            elif ':' in char_range:
                start, end = map(int, char_range.split(':'))
                answer = question_info['story_text'][start:end]
                baseline_answers.append(answer)

        paragraph_ptr = -1
        pi = 0
        if story_id not in story_ids_to_idx:
            paragraph_ptr = len(story_ids_to_idx)
            story_ids_to_idx[story_id] = paragraph_ptr
            idx_to_story_ids[paragraph_ptr] = story_id
            xp, cxp = [], []
            pp = []
            x.append(xp)
            cx.append(cxp)
            p.append(pp)
            xi = list(map(word_tokenize, sent_tokenize(context)))
            xi = [process_tokens(tokens) for tokens in xi]  # process tokens
            # given xi, add chars
            cxi = [[list(xijk) for xijk in xij] for xij in xi]
            xp.append(xi)
            cxp.append(cxi)
            pp.append(context)
            for xij in xi:
                for xijk in xij:
                    word_counter[xijk] += 1
                    lower_word_counter[xijk.lower()] += 1
                    for xijkl in xijk:
                        char_counter[xijkl] += 1
        else:
            paragraph_ptr = story_ids_to_idx[story_id]

        rxi = [paragraph_ptr, pi]
        """
        print("TEST")
        print("TEST")
        print(story_ids_to_idx)
        print(len(xp))
        print(paragraph_ptr)
        """
        xi = x[paragraph_ptr][pi]
        qi = word_tokenize(question)
        cqi = [list(qij) for qij in qi]
        yi = []
        cyi = []
        answers = []
        answer_char_ranges_split = answer_char_ranges.split("|")
        for answer in answer_char_ranges_split:
            if answer == 'None':
                continue
            answer_char_range = answer.split(",")[0].split(":")
            answer_start = int(answer_char_range[0])
            answer_stop = int(answer_char_range[-1])
            answer_text = context[answer_start:answer_stop].strip()
            if answer_text == "":
                print("BAD ANSWER GIVEN %s" % answer_char_range)
                continue
            answers.append(answer_text)
            # TODO : put some function that gives word_start, word_stop here
            yi0, yi1 = get_word_span(context, xi, answer_start, answer_stop)
            # yi0 = answer['answer_word_start'] or [0, 0]
            # yi1 = answer['answer_word_stop'] or [0, 1]
            assert len(xi[yi0[0]]) > yi0[1]
            assert len(xi[yi1[0]]) >= yi1[1]
            w0 = xi[yi0[0]][yi0[1]]
            w1 = xi[yi1[0]][yi1[1] - 1]
            i0 = get_word_idx(context, xi, yi0)
            i1 = get_word_idx(context, xi, (yi1[0], yi1[1] - 1))
            cyi0 = answer_start - i0
            cyi1 = answer_stop - i1 - 1
            # print(question, answer_text, w0[cyi0:], w1[:cyi1+1])
            # assert answer_text[0] == w0[cyi0], (answer_text, w0, cyi0)
            # assert answer_text[-1] == w1[-1]
            assert cyi0 < 32, (answer_text, w0)
            assert cyi1 < 32, (answer_text, w1)
            yi.append([yi0, yi1])
            cyi.append([cyi0, cyi1])

        for qij in qi:
            word_counter[qij] += 1
            lower_word_counter[qij.lower()] += 1
            for qijk in qij:
                char_counter[qijk] += 1

        q.append(qi)
        cq.append(cqi)
        y.append(yi)
        cy.append(cyi)
        rx.append(rxi)
        rcx.append(rxi)
        ids.append(question_id)
        idxs.append(len(idxs))
        answerss.append(baseline_answers)
        span_answerss.append(answers)
        if args.debug:
            break

    word2vec_dict = get_word2vec(args, word_counter)
    lower_word2vec_dict = get_word2vec(args, lower_word_counter)

    # add context here
    data = {
        'q': q, 'cq': cq, 'y': y, '*x': rx, '*cx': rcx, 'cy': cy,
        'idxs': idxs, 'ids': ids, 'answerss': answerss,
        'span_answerss': span_answerss, '*p': rx
    }
    shared = {
        'x': x, 'cx': cx, 'p': p,
        'story_ids_to_idx': story_ids_to_idx, 'idx_to_story_ids': idx_to_story_ids,
        'word_counter': word_counter, 'char_counter': char_counter,
        'lower_word_counter': lower_word_counter,
        'word2vec': word2vec_dict, 'lower_word2vec': lower_word2vec_dict
    }

    print("saving ...")
    save(args, data, shared, out_name)
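# A standalone illustration (it mirrors, but does not replace, the span parsing
# above): an 'answer_char_ranges' cell appears to pack one entry per annotator,
# separated by '|'; each entry is either 'None' or comma-separated 'start:end'
# character ranges. The loop above keeps only the first range of each
# non-'None' entry, which this hypothetical helper reproduces.
def _demo_parse_char_ranges(story_text, answer_char_ranges):
    spans = []
    for entry in answer_char_ranges.split("|"):
        if entry == "None":
            continue
        start, end = map(int, entry.split(",")[0].split(":"))
        spans.append(story_text[start:end].strip())
    return spans

# e.g. _demo_parse_char_ranges("The quick brown fox jumps.", "4:9|None|4:9,10:15")
# returns ['quick', 'quick'].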
def prepro_each(args, data_type, start_ratio=0.0, stop_ratio=1.0,
                out_name="default", in_path=None):
    word_tokenize, sent_tokenize = get_sent_tokenize()

    source_data = []
    f = open(os.path.join(args.source_dir, 'WikiQA-%s.txt' % data_type), 'r',
             encoding='utf-8')
    curr_question = None
    lines = (f.read()).rsplit('\n')
    for i, line in enumerate(lines):
        if line == '':
            continue
        t = tuple(line.rsplit('\t'))
        assert len(t) == 3, t
        question, sentence, correct = t
        if not curr_question == question:
            if not (curr_question is None or answer_list == []):
                context = ' '.join(context_list)
                context = context.replace(' .', '.')
                answers = [{'answer_start': 0, 'text': answer_list}]
                _id = len(source_data)
                qas = [{'answers': answers, 'id': _id, 'question': curr_question}]
                dic = {'context': context, 'qas': qas}
                source_data.append({'paragraphs': [dic]})
            context_list = []
            answer_list = []
            curr_question = question
        if not sentence.endswith('.'):
            sentence += '.'
        context_list.append(sentence)
        if correct == '1':
            answer_list.append(sentence)
    json.dump({'data': source_data},
              open(os.path.join(args.source_dir, '%s.json' % data_type), 'w'))

    q, cq, y, rx, rcx, ids, idxs = [], [], [], [], [], [], []
    cy = []
    x, cx = [], []
    answerss = []
    p = []
    word_counter, char_counter, lower_word_counter = Counter(), Counter(), Counter()
    start_ai = int(round(len(source_data) * start_ratio))
    stop_ai = int(round(len(source_data) * stop_ratio))
    for ai, article in enumerate(tqdm(source_data[start_ai:stop_ai])):
        xp, cxp = [], []
        pp = []
        x.append(xp)
        cx.append(cxp)
        p.append(pp)
        for pi, para in enumerate(article['paragraphs']):
            # wordss
            context = para['context']
            context = context.replace("''", '" ')
            context = context.replace("``", '" ')
            xi = list(map(word_tokenize, sent_tokenize(context)))
            xi = [process_tokens(tokens) for tokens in xi]  # process tokens
            # given xi, add chars
            cxi = [[list(xijk) for xijk in xij] for xij in xi]
            xp.append(xi)
            cxp.append(cxi)
            pp.append(context)

            for xij in xi:
                for xijk in xij:
                    word_counter[xijk] += len(para['qas'])
                    lower_word_counter[xijk.lower()] += len(para['qas'])
                    for xijkl in xijk:
                        char_counter[xijkl] += len(para['qas'])

            rxi = [ai, pi]
            assert len(x) - 1 == ai
            assert len(x[ai]) - 1 == pi
            for qa in para['qas']:
                # get words
                qi = word_tokenize(qa['question'])
                cqi = [list(qij) for qij in qi]
                yi = []
                cyi = []
                answers = []
                for answer in qa['answers']:
                    answer_text = answer['text']
                    answers.append(answer_text)
                    answer_start = answer['answer_start']
                    answer_stop = answer_start + len(answer_text)
                    # TODO : put some function that gives word_start, word_stop here
                    yi0, yi1 = get_word_span(context, xi, answer_start, answer_stop)
                    # yi0 = answer['answer_word_start'] or [0, 0]
                    # yi1 = answer['answer_word_stop'] or [0, 1]
                    assert len(xi[yi0[0]]) > yi0[1]
                    assert len(xi[yi1[0]]) >= yi1[1]
                    w0 = xi[yi0[0]][yi0[1]]
                    w1 = xi[yi1[0]][yi1[1] - 1]
                    i0 = get_word_idx(context, xi, yi0)
                    i1 = get_word_idx(context, xi, (yi1[0], yi1[1] - 1))
                    cyi0 = answer_start - i0
                    cyi1 = answer_stop - i1 - 1
                    # print(answer_text, w0[cyi0:], w1[:cyi1+1])
                    # assert answer_text[0] == w0[cyi0], (answer_text, w0, cyi0)
                    # assert answer_text[-1] == w1[cyi1]
                    # assert cyi0 < 32, (answer_text, w0)
                    # assert cyi1 < 32, (answer_text, w1)
                    yi.append([yi0, yi1])
                    cyi.append([cyi0, cyi1])

                for qij in qi:
                    word_counter[qij] += 1
                    lower_word_counter[qij.lower()] += 1
                    for qijk in qij:
                        char_counter[qijk] += 1

                q.append(qi)
                cq.append(cqi)
                y.append(yi)
                cy.append(cyi)
                rx.append(rxi)
                rcx.append(rxi)
                ids.append(qa['id'])
                idxs.append(len(idxs))
                answerss.append(answers)
        if args.debug:
            break

    word2vec_dict = get_word2vec(args, word_counter)
    lower_word2vec_dict = get_word2vec(args, lower_word_counter)

    # add context here
    data = {
        'q': q, 'cq': cq, 'y': y, '*x': rx, '*cx': rcx, 'cy': cy,
        'idxs': idxs, 'ids': ids, 'answerss': answerss, '*p': rx
    }
    shared = {
        'x': x, 'cx': cx, 'p': p,
        'word_counter': word_counter, 'char_counter': char_counter,
        'lower_word_counter': lower_word_counter,
        'word2vec': word2vec_dict, 'lower_word2vec': lower_word2vec_dict
    }

    print("saving ...")
    save(args, data, shared, out_name)
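# A minimal driver sketch for this WikiQA span variant (an assumption, not the
# original entry point): the function only reads args.source_dir, which must
# contain WikiQA-<split>.txt, and args.debug; any fields consumed by
# get_word2vec() and save() in a given setup would have to be added too.
# All names and paths here are hypothetical.
from types import SimpleNamespace

def _demo_run_wikiqa_span_prepro():
    demo_args = SimpleNamespace(
        source_dir="data/wikiqa",  # hypothetical location of WikiQA-train.txt etc.
        debug=False,
    )
    prepro_each(demo_args, "train", out_name="train")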