def complete_wiki_processing(data, utility, train=True):
  """Turn parsed wiki examples into padded, integer-encoded model inputs.

  Every example whose ``is_bad_example`` flag is false is mutated in place:
  entry/column match features are computed, numbers and dates are pulled out
  of the question, the question is mapped to word ids and left-padded to
  ``FLAGS.question_length``, and all per-column structures are padded to
  ``FLAGS.max_elements`` rows and ``FLAGS.max_number_cols`` /
  ``FLAGS.max_word_cols`` columns.

  Args:
    data: iterable of example objects. Bad examples are skipped untouched.
    utility: object exposing ``FLAGS``, ``word_ids`` and the special tokens
      (``dummy_token``, ``unk_token``, ``entry_match_token``,
      ``column_match_token``). It is not accessed when ``data`` holds no
      good example.
    train: not used by this function; kept so existing callers keep working.

  Returns:
    A list of the processed examples in input order. Examples whose first
    column has more than ``FLAGS.max_elements`` entries are left out of the
    list (they have nevertheless been mutated by then).
  """
  # NOTE(review): this file holds another definition of
  # `complete_wiki_processing`; the later definition shadows the earlier one.
  processed_data = []
  for example in data:
    if example.is_bad_example:
      continue

    # Read per example (not before the loop) so that an empty or all-bad
    # `data` never dereferences `utility`.
    flags = utility.FLAGS
    max_elements = flags.max_elements
    pad_int = flags.pad_int

    number_found = 0
    example.string_question = example.question[:]

    # --- entry match ---
    example.processed_number_columns = example.processed_number_columns[:]
    example.processed_word_columns = example.processed_word_columns[:]
    example.word_exact_match, word_match, matched_indices = exact_match(
        example.string_question, example.original_wc, number=False)
    example.number_exact_match, number_match, _ = exact_match(
        example.string_question, example.original_nc, number=True)
    if (not pick_one(example.word_exact_match) and
        not pick_one(example.number_exact_match)):
      # No exact hit anywhere: fall back to partial matching on word columns.
      assert len(word_match) == 0
      assert len(number_match) == 0
      example.word_exact_match, word_match = partial_match(
          example.string_question, example.original_wc, number=False)

    # --- group by max ---
    example.word_group_by_max = group_by_max(example.original_wc, False)
    example.number_group_by_max = group_by_max(example.original_nc, True)

    # --- column name match ---
    # `number=False` is passed for the number-column names as well.
    example.word_column_exact_match, _ = exact_column_match(
        example.string_question, example.original_wc_names, number=False)
    example.number_column_exact_match, _ = exact_column_match(
        example.string_question, example.original_nc_names, number=False)
    if (1.0 not in example.word_column_exact_match and
        1.0 not in example.number_column_exact_match):
      example.word_column_exact_match = partial_column_match(
          example.string_question, example.original_wc_names, number=False)
      example.number_column_exact_match = partial_column_match(
          example.string_question, example.original_nc_names, number=False)

    # Marker tokens are appended to the question itself (mutating the list the
    # example already holds) before it is snapshotted again.
    if len(word_match) > 0 or len(number_match) > 0:
      example.question.append(utility.entry_match_token)
    if (1.0 in example.word_column_exact_match or
        1.0 in example.number_column_exact_match):
      example.question.append(utility.column_match_token)
    example.string_question = example.question[:]

    example.number_lookup_matrix = np.transpose(
        example.number_lookup_matrix)[:]
    example.word_lookup_matrix = np.transpose(example.word_lookup_matrix)[:]

    # Shallow copies so the per-column padding below replaces entries of the
    # copies rather than of the lists the example arrived with.
    example.columns = example.number_columns[:]
    example.word_columns = example.word_columns[:]
    example.len_total_cols = len(example.word_column_names) + len(
        example.number_column_names)
    example.column_names = example.number_column_names[:]
    example.word_column_names = example.word_column_names[:]
    example.string_column_names = example.number_column_names[:]
    example.string_word_column_names = example.word_column_names[:]

    example.sorted_number_index = []
    example.sorted_word_index = []
    example.column_mask = []
    example.word_column_mask = []
    example.processed_column_mask = []
    example.processed_word_column_mask = []
    example.word_column_entry_mask = []
    example.question_attention_mask = []
    example.question_number = example.question_number_1 = -1
    example.ordinal_question = []
    example.ordinal_question_one = []
    new_question = []

    if len(example.number_columns) > 0:
      example.len_col = len(example.number_columns[0])
    else:
      example.len_col = len(example.word_columns[0])

    # Question spans that matched a table entry are replaced by the unknown
    # token.
    for (start, length) in matched_indices:
      for j in range(length):
        example.question[start + j] = utility.unk_token

    # Split the question into plain words and (at most two recorded) numbers.
    # A number flags the word position just before it in the ordinal vectors.
    for word in example.question:
      if isinstance(word, numbers.Number) or wiki_data.is_date(word):
        if not isinstance(word, numbers.Number) and wiki_data.is_date(word):
          word = word.replace("X", "").replace("-", "")
        number_found += 1
        if number_found == 1:
          example.question_number = word
          if len(example.ordinal_question) > 0:
            example.ordinal_question[-1] = 1.0
          else:
            example.ordinal_question.append(1.0)
        elif number_found == 2:
          example.question_number_1 = word
          if len(example.ordinal_question_one) > 0:
            example.ordinal_question_one[-1] = 1.0
          else:
            example.ordinal_question_one.append(1.0)
      else:
        new_question.append(word)
        example.ordinal_question.append(0.0)
        example.ordinal_question_one.append(0.0)

    example.question = [
        utility.word_ids[word_lookup(w, utility)] for w in new_question
    ]
    example.question_attention_mask = [0.0] * len(example.question)
    # When the first question number occurs before any word the ordinal
    # vectors are one entry too long; trim them to the question length.
    example.ordinal_question = example.ordinal_question[
        0:len(example.question)]
    example.ordinal_question_one = example.ordinal_question_one[
        0:len(example.question)]

    # --- question padding (on the left) ---
    dummy_id = utility.word_ids[utility.dummy_token]
    question_length = flags.question_length
    example.question = [dummy_id] * (
        question_length - len(example.question)) + example.question
    example.question_attention_mask = [-10000.0] * (
        question_length - len(example.question_attention_mask)
    ) + example.question_attention_mask
    example.ordinal_question = [0.0] * (
        question_length - len(example.ordinal_question)
    ) + example.ordinal_question
    example.ordinal_question_one = [0.0] * (
        question_length - len(example.ordinal_question_one)
    ) + example.ordinal_question_one

    # --- number columns and related padding ---
    num_cols = len(example.columns)
    for start, column in enumerate(example.number_columns):
      processed = example.processed_number_columns[start]
      # NOTE(review): nothing is appended when the check fails, so this mask
      # can end up shorter than the column list -- confirm that is intended.
      if check_processed_cols(processed, utility):
        example.processed_column_mask.append(0.0)
      sorted_index = sorted(
          range(len(processed)), key=lambda k: processed[k], reverse=True)
      sorted_index = sorted_index + [pad_int] * (
          max_elements - len(sorted_index))
      example.sorted_number_index.append(sorted_index)
      example.columns[start] = column + [pad_int] * (
          max_elements - len(column))
      example.processed_number_columns[start] += [pad_int] * (
          max_elements - len(example.processed_number_columns[start]))
      example.column_mask.append(0.0)
    for _ in range(num_cols, flags.max_number_cols):
      example.sorted_number_index.append([pad_int] * max_elements)
      example.columns.append([pad_int] * max_elements)
      example.processed_number_columns.append([pad_int] * max_elements)
      example.number_exact_match.append([0.0] * max_elements)
      example.number_group_by_max.append([0.0] * max_elements)
      example.column_mask.append(-100000000.0)
      example.processed_column_mask.append(-100000000.0)
      example.number_column_exact_match.append(0.0)
      example.column_names.append([utility.dummy_token])

    # --- word columns and related padding ---
    word_num_cols = len(example.word_columns)
    for start, column in enumerate(example.word_columns):
      processed = example.processed_word_columns[start]
      # NOTE(review): same missing-else behaviour as for number columns.
      if check_processed_cols(processed, utility):
        example.processed_word_column_mask.append(0.0)
      sorted_index = sorted(
          range(len(processed)), key=lambda k: processed[k], reverse=True)
      sorted_index = sorted_index + [pad_int] * (
          max_elements - len(sorted_index))
      example.sorted_word_index.append(sorted_index)
      column = convert_to_int_2d_and_pad(column, utility)
      # One fresh list per padded row: `[[...]] * n` would make every padded
      # row the same object.
      example.word_columns[start] = column + [
          [dummy_id] * flags.max_entry_length
          for _row in range(max_elements - len(column))
      ]
      example.processed_word_columns[start] += [pad_int] * (
          max_elements - len(example.processed_word_columns[start]))
      example.word_column_entry_mask.append(
          [0] * len(column) + [dummy_id] * (max_elements - len(column)))
      example.word_column_mask.append(0.0)
    for _ in range(word_num_cols, flags.max_word_cols):
      example.sorted_word_index.append([pad_int] * max_elements)
      example.word_columns.append([
          [dummy_id] * flags.max_entry_length for _row in range(max_elements)
      ])
      example.word_column_entry_mask.append([dummy_id] * max_elements)
      example.word_exact_match.append([0.0] * max_elements)
      example.word_group_by_max.append([0.0] * max_elements)
      example.processed_word_columns.append([pad_int] * max_elements)
      example.word_column_mask.append(-100000000.0)
      example.processed_word_column_mask.append(-100000000.0)
      example.word_column_exact_match.append(0.0)
      example.word_column_names.append(
          [utility.dummy_token] * flags.max_entry_length)
    # NOTE(review): `seen_tables` is not defined in this function; it has to
    # be a module-level mapping -- verify it exists.
    seen_tables[example.table_key] = 1

    # --- convert column and word column names to integers ---
    example.column_ids = convert_to_int_2d_and_pad(example.column_names,
                                                   utility)
    example.word_column_ids = convert_to_int_2d_and_pad(
        example.word_column_names, utility)

    # Pad every match / group-by row to max_elements.
    for i_em, row in enumerate(example.number_exact_match):
      example.number_exact_match[i_em] = row + [0.0] * (
          max_elements - len(row))
      group_row = example.number_group_by_max[i_em]
      example.number_group_by_max[i_em] = group_row + [0.0] * (
          max_elements - len(group_row))
    for i_em, row in enumerate(example.word_exact_match):
      example.word_exact_match[i_em] = row + [0.0] * (max_elements - len(row))
      group_row = example.word_group_by_max[i_em]
      example.word_group_by_max[i_em] = group_row + [0.0] * (
          max_elements - len(group_row))
    example.exact_match = (
        example.number_exact_match + example.word_exact_match)
    example.group_by_max = (
        example.number_group_by_max + example.word_group_by_max)
    example.exact_column_match = (
        example.number_column_exact_match + example.word_column_exact_match)

    # --- answer and related mask, padding ---
    example.answer = example.calc_answer
    if example.is_lookup:
      example.number_print_answer = example.number_lookup_matrix.tolist()
      example.word_print_answer = example.word_lookup_matrix.tolist()
      for i_answer, row in enumerate(example.number_print_answer):
        example.number_print_answer[i_answer] = row + [0.0] * (
            max_elements - len(row))
      for i_answer, row in enumerate(example.word_print_answer):
        example.word_print_answer[i_answer] = row + [0.0] * (
            max_elements - len(row))
      example.number_lookup_matrix = convert_to_bool_and_pad(
          example.number_lookup_matrix, utility)
      example.word_lookup_matrix = convert_to_bool_and_pad(
          example.word_lookup_matrix, utility)
      for _ in range(num_cols, flags.max_number_cols):
        example.number_lookup_matrix.append([False] * max_elements)
        example.number_print_answer.append([0.0] * max_elements)
      for _ in range(word_num_cols, flags.max_word_cols):
        example.word_lookup_matrix.append([False] * max_elements)
        example.word_print_answer.append([0.0] * max_elements)
      example.print_answer = (
          example.number_print_answer + example.word_print_answer)
    else:
      # Independent rows instead of `[[0.0] * n] * m`, which aliases one row
      # m times.
      example.print_answer = [
          [0.0] * max_elements
          for _row in range(flags.max_number_cols + flags.max_word_cols)
      ]

    # --- question_number masks ---
    if example.question_number == -1:
      example.question_number_mask = np.zeros([max_elements])
    else:
      example.question_number_mask = np.ones([max_elements])
    if example.question_number_1 == -1:
      example.question_number_one_mask = -10000.0
    else:
      example.question_number_one_mask = np.float64(0.0)

    # Tables with too many rows are dropped only here, after the example has
    # already been mutated above.
    if example.len_col > max_elements:
      continue
    processed_data.append(example)
  return processed_data
def complete_wiki_processing(data, utility, train=True):
  """Turn parsed wiki examples into padded, integer-encoded model inputs.

  Every example whose ``is_bad_example`` flag is false is mutated in place:
  entry/column match features are computed, numbers and dates are pulled out
  of the question, the question is mapped to word ids and left-padded to
  ``FLAGS.question_length``, and all per-column structures are padded to
  ``FLAGS.max_elements`` rows and ``FLAGS.max_number_cols`` /
  ``FLAGS.max_word_cols`` columns.

  Args:
    data: iterable of example objects. Bad examples are skipped untouched.
    utility: object exposing ``FLAGS``, ``word_ids`` and the special tokens
      (``dummy_token``, ``unk_token``, ``entry_match_token``,
      ``column_match_token``). It is not accessed when ``data`` holds no
      good example.
    train: not used by this function; kept so existing callers keep working.

  Returns:
    A list of the processed examples in input order. Examples whose first
    column has more than ``FLAGS.max_elements`` entries are left out of the
    list (they have nevertheless been mutated by then).
  """
  # NOTE(review): this file holds another definition of
  # `complete_wiki_processing`; the later definition shadows the earlier one.
  processed_data = []
  for example in data:
    if example.is_bad_example:
      continue

    # Read per example (not before the loop) so that an empty or all-bad
    # `data` never dereferences `utility`.
    flags = utility.FLAGS
    max_elements = flags.max_elements
    pad_int = flags.pad_int

    number_found = 0
    example.string_question = example.question[:]

    # --- entry match ---
    example.processed_number_columns = example.processed_number_columns[:]
    example.processed_word_columns = example.processed_word_columns[:]
    example.word_exact_match, word_match, matched_indices = exact_match(
        example.string_question, example.original_wc, number=False)
    example.number_exact_match, number_match, _ = exact_match(
        example.string_question, example.original_nc, number=True)
    if (not pick_one(example.word_exact_match) and
        not pick_one(example.number_exact_match)):
      # No exact hit anywhere: fall back to partial matching on word columns.
      assert len(word_match) == 0
      assert len(number_match) == 0
      example.word_exact_match, word_match = partial_match(
          example.string_question, example.original_wc, number=False)

    # --- group by max ---
    example.word_group_by_max = group_by_max(example.original_wc, False)
    example.number_group_by_max = group_by_max(example.original_nc, True)

    # --- column name match ---
    # `number=False` is passed for the number-column names as well.
    example.word_column_exact_match, _ = exact_column_match(
        example.string_question, example.original_wc_names, number=False)
    example.number_column_exact_match, _ = exact_column_match(
        example.string_question, example.original_nc_names, number=False)
    if (1.0 not in example.word_column_exact_match and
        1.0 not in example.number_column_exact_match):
      example.word_column_exact_match = partial_column_match(
          example.string_question, example.original_wc_names, number=False)
      example.number_column_exact_match = partial_column_match(
          example.string_question, example.original_nc_names, number=False)

    # Marker tokens are appended to the question itself (mutating the list the
    # example already holds) before it is snapshotted again.
    if len(word_match) > 0 or len(number_match) > 0:
      example.question.append(utility.entry_match_token)
    if (1.0 in example.word_column_exact_match or
        1.0 in example.number_column_exact_match):
      example.question.append(utility.column_match_token)
    example.string_question = example.question[:]

    example.number_lookup_matrix = np.transpose(
        example.number_lookup_matrix)[:]
    example.word_lookup_matrix = np.transpose(example.word_lookup_matrix)[:]

    # Shallow copies so the per-column padding below replaces entries of the
    # copies rather than of the lists the example arrived with.
    example.columns = example.number_columns[:]
    example.word_columns = example.word_columns[:]
    example.len_total_cols = len(example.word_column_names) + len(
        example.number_column_names)
    example.column_names = example.number_column_names[:]
    example.word_column_names = example.word_column_names[:]
    example.string_column_names = example.number_column_names[:]
    example.string_word_column_names = example.word_column_names[:]

    example.sorted_number_index = []
    example.sorted_word_index = []
    example.column_mask = []
    example.word_column_mask = []
    example.processed_column_mask = []
    example.processed_word_column_mask = []
    example.word_column_entry_mask = []
    example.question_attention_mask = []
    example.question_number = example.question_number_1 = -1
    example.ordinal_question = []
    example.ordinal_question_one = []
    new_question = []

    if len(example.number_columns) > 0:
      example.len_col = len(example.number_columns[0])
    else:
      example.len_col = len(example.word_columns[0])

    # Question spans that matched a table entry are replaced by the unknown
    # token.
    for (start, length) in matched_indices:
      for j in range(length):
        example.question[start + j] = utility.unk_token

    # Split the question into plain words and (at most two recorded) numbers.
    # A number flags the word position just before it in the ordinal vectors.
    for word in example.question:
      if isinstance(word, numbers.Number) or wiki_data.is_date(word):
        if not isinstance(word, numbers.Number) and wiki_data.is_date(word):
          word = word.replace("X", "").replace("-", "")
        number_found += 1
        if number_found == 1:
          example.question_number = word
          if len(example.ordinal_question) > 0:
            example.ordinal_question[-1] = 1.0
          else:
            example.ordinal_question.append(1.0)
        elif number_found == 2:
          example.question_number_1 = word
          if len(example.ordinal_question_one) > 0:
            example.ordinal_question_one[-1] = 1.0
          else:
            example.ordinal_question_one.append(1.0)
      else:
        new_question.append(word)
        example.ordinal_question.append(0.0)
        example.ordinal_question_one.append(0.0)

    example.question = [
        utility.word_ids[word_lookup(w, utility)] for w in new_question
    ]
    example.question_attention_mask = [0.0] * len(example.question)
    # When the first question number occurs before any word the ordinal
    # vectors are one entry too long; trim them to the question length.
    example.ordinal_question = example.ordinal_question[
        0:len(example.question)]
    example.ordinal_question_one = example.ordinal_question_one[
        0:len(example.question)]

    # --- question padding (on the left) ---
    dummy_id = utility.word_ids[utility.dummy_token]
    question_length = flags.question_length
    example.question = [dummy_id] * (
        question_length - len(example.question)) + example.question
    example.question_attention_mask = [-10000.0] * (
        question_length - len(example.question_attention_mask)
    ) + example.question_attention_mask
    example.ordinal_question = [0.0] * (
        question_length - len(example.ordinal_question)
    ) + example.ordinal_question
    example.ordinal_question_one = [0.0] * (
        question_length - len(example.ordinal_question_one)
    ) + example.ordinal_question_one

    # --- number columns and related padding ---
    num_cols = len(example.columns)
    for start, column in enumerate(example.number_columns):
      processed = example.processed_number_columns[start]
      # NOTE(review): nothing is appended when the check fails, so this mask
      # can end up shorter than the column list -- confirm that is intended.
      if check_processed_cols(processed, utility):
        example.processed_column_mask.append(0.0)
      sorted_index = sorted(
          range(len(processed)), key=lambda k: processed[k], reverse=True)
      sorted_index = sorted_index + [pad_int] * (
          max_elements - len(sorted_index))
      example.sorted_number_index.append(sorted_index)
      example.columns[start] = column + [pad_int] * (
          max_elements - len(column))
      example.processed_number_columns[start] += [pad_int] * (
          max_elements - len(example.processed_number_columns[start]))
      example.column_mask.append(0.0)
    for _ in range(num_cols, flags.max_number_cols):
      example.sorted_number_index.append([pad_int] * max_elements)
      example.columns.append([pad_int] * max_elements)
      example.processed_number_columns.append([pad_int] * max_elements)
      example.number_exact_match.append([0.0] * max_elements)
      example.number_group_by_max.append([0.0] * max_elements)
      example.column_mask.append(-100000000.0)
      example.processed_column_mask.append(-100000000.0)
      example.number_column_exact_match.append(0.0)
      example.column_names.append([utility.dummy_token])

    # --- word columns and related padding ---
    word_num_cols = len(example.word_columns)
    for start, column in enumerate(example.word_columns):
      processed = example.processed_word_columns[start]
      # NOTE(review): same missing-else behaviour as for number columns.
      if check_processed_cols(processed, utility):
        example.processed_word_column_mask.append(0.0)
      sorted_index = sorted(
          range(len(processed)), key=lambda k: processed[k], reverse=True)
      sorted_index = sorted_index + [pad_int] * (
          max_elements - len(sorted_index))
      example.sorted_word_index.append(sorted_index)
      column = convert_to_int_2d_and_pad(column, utility)
      # One fresh list per padded row: `[[...]] * n` would make every padded
      # row the same object.
      example.word_columns[start] = column + [
          [dummy_id] * flags.max_entry_length
          for _row in range(max_elements - len(column))
      ]
      example.processed_word_columns[start] += [pad_int] * (
          max_elements - len(example.processed_word_columns[start]))
      example.word_column_entry_mask.append(
          [0] * len(column) + [dummy_id] * (max_elements - len(column)))
      example.word_column_mask.append(0.0)
    for _ in range(word_num_cols, flags.max_word_cols):
      example.sorted_word_index.append([pad_int] * max_elements)
      example.word_columns.append([
          [dummy_id] * flags.max_entry_length for _row in range(max_elements)
      ])
      example.word_column_entry_mask.append([dummy_id] * max_elements)
      example.word_exact_match.append([0.0] * max_elements)
      example.word_group_by_max.append([0.0] * max_elements)
      example.processed_word_columns.append([pad_int] * max_elements)
      example.word_column_mask.append(-100000000.0)
      example.processed_word_column_mask.append(-100000000.0)
      example.word_column_exact_match.append(0.0)
      example.word_column_names.append(
          [utility.dummy_token] * flags.max_entry_length)
    # NOTE(review): `seen_tables` is not defined in this function; it has to
    # be a module-level mapping -- verify it exists.
    seen_tables[example.table_key] = 1

    # --- convert column and word column names to integers ---
    example.column_ids = convert_to_int_2d_and_pad(example.column_names,
                                                   utility)
    example.word_column_ids = convert_to_int_2d_and_pad(
        example.word_column_names, utility)

    # Pad every match / group-by row to max_elements.
    for i_em, row in enumerate(example.number_exact_match):
      example.number_exact_match[i_em] = row + [0.0] * (
          max_elements - len(row))
      group_row = example.number_group_by_max[i_em]
      example.number_group_by_max[i_em] = group_row + [0.0] * (
          max_elements - len(group_row))
    for i_em, row in enumerate(example.word_exact_match):
      example.word_exact_match[i_em] = row + [0.0] * (max_elements - len(row))
      group_row = example.word_group_by_max[i_em]
      example.word_group_by_max[i_em] = group_row + [0.0] * (
          max_elements - len(group_row))
    example.exact_match = (
        example.number_exact_match + example.word_exact_match)
    example.group_by_max = (
        example.number_group_by_max + example.word_group_by_max)
    example.exact_column_match = (
        example.number_column_exact_match + example.word_column_exact_match)

    # --- answer and related mask, padding ---
    example.answer = example.calc_answer
    if example.is_lookup:
      example.number_print_answer = example.number_lookup_matrix.tolist()
      example.word_print_answer = example.word_lookup_matrix.tolist()
      for i_answer, row in enumerate(example.number_print_answer):
        example.number_print_answer[i_answer] = row + [0.0] * (
            max_elements - len(row))
      for i_answer, row in enumerate(example.word_print_answer):
        example.word_print_answer[i_answer] = row + [0.0] * (
            max_elements - len(row))
      example.number_lookup_matrix = convert_to_bool_and_pad(
          example.number_lookup_matrix, utility)
      example.word_lookup_matrix = convert_to_bool_and_pad(
          example.word_lookup_matrix, utility)
      for _ in range(num_cols, flags.max_number_cols):
        example.number_lookup_matrix.append([False] * max_elements)
        example.number_print_answer.append([0.0] * max_elements)
      for _ in range(word_num_cols, flags.max_word_cols):
        example.word_lookup_matrix.append([False] * max_elements)
        example.word_print_answer.append([0.0] * max_elements)
      example.print_answer = (
          example.number_print_answer + example.word_print_answer)
    else:
      # Independent rows instead of `[[0.0] * n] * m`, which aliases one row
      # m times.
      example.print_answer = [
          [0.0] * max_elements
          for _row in range(flags.max_number_cols + flags.max_word_cols)
      ]

    # --- question_number masks ---
    if example.question_number == -1:
      example.question_number_mask = np.zeros([max_elements])
    else:
      example.question_number_mask = np.ones([max_elements])
    if example.question_number_1 == -1:
      example.question_number_one_mask = -10000.0
    else:
      example.question_number_one_mask = np.float64(0.0)

    # Tables with too many rows are dropped only here, after the example has
    # already been mutated above.
    if example.len_col > max_elements:
      continue
    processed_data.append(example)
  return processed_data