import random

def split_dataset_category(input_file):
    input_data = read_csv(input_file, True)
    # Dict where key = category, value = [list of real questions in that
    # category, list of generated questions in that category]
    category_to_questions = {}
    # Output dictionaries where key is question, value is category
    train_dict = {}
    testA_dict = {}
    testB_dict = {}
    # Add questions to categories
    for row in input_data:
        category = row[0].split(' - ')[0]
        question = row[2]
        source = row[3]
        if category not in category_to_questions:
            category_to_questions[category] = [[], []]
        if source == 'Author Generated':
            category_to_questions[category][1].append(question)
        else:
            category_to_questions[category][0].append(question)
    for category in category_to_questions:
        if category == '' or category == 'Other':
            continue
        real_questions = category_to_questions[category][0]
        generated_questions = category_to_questions[category][1]
        print(category, len(real_questions))
        random.shuffle(real_questions)
        random.shuffle(generated_questions)
        # All author-generated questions go to test set B
        for question in generated_questions:
            testB_dict[question] = category
        # Up to 20 distinct real questions per category go to the train set;
        # the rest go to test set A
        counter = 0
        for question in real_questions:
            if counter < 20:
                if question not in train_dict:
                    counter += 1
                    train_dict[question] = category
            else:
                if question not in testA_dict:
                    counter += 1
                    testA_dict[question] = category
    return [train_dict, testA_dict, testB_dict]
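# read_csv is used throughout but not defined in this section. A minimal
# sketch of what it is assumed to look like: rows come back as lists of
# strings, and skip_header drops the CSV's first row.
import csv

def read_csv(input_file, skip_header=False):
    with open(input_file, newline='') as f:
        rows = list(csv.reader(f))
    return rows[1:] if skip_header else rows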
def get_questions_to_id(input_file):
    result = {}  # Dictionary where key is question, value is ID
    input_data = read_csv(input_file, True)
    for row in input_data:
        result[row[0]] = int(row[1])
    return result
def get_questions_to_category(input_path):
    result = {}  # Dictionary where key is question, value is category
    input_data = read_csv(input_path, skip_header=False)
    for row in input_data:
        result[row[0]] = row[1]
    return result
def get_id_to_questions(input_file):
    result = {}  # Dictionary where key is ID, value is list of questions
    # 2D array where first column is question, second column is question ID
    input_data = read_csv(input_file, True)
    for row in input_data:
        current_id = int(row[1])
        if current_id not in result:
            result[current_id] = []
        result[current_id].append(row[0])
    return result
def get_all_embeddings(input_csv, tokenizer, model):
    result = {}  # Dictionary where key = question and value = BERT embedding for that question
    reader = read_csv(input_csv, True)
    for row in reader:
        question = row[2]
        embedding = get_embedding(question, tokenizer, model)
        # Replace non-ASCII characters with spaces before using the question as a key
        question = ''.join([i if ord(i) < 128 else ' ' for i in question])
        result[question] = embedding
    return result
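# get_embedding is not defined in this section. A minimal sketch, assuming a
# Hugging Face transformers tokenizer/model pair and mean-pooling over the
# last hidden state (the actual pooling strategy may differ):
import torch

def get_embedding(question, tokenizer, model):
    inputs = tokenizer(question, return_tensors='pt', truncation=True)
    with torch.no_grad():
        outputs = model(**inputs)
    # Mean-pool the token embeddings into a single fixed-size vector
    return outputs.last_hidden_state.mean(dim=1).squeeze().numpy()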
def combine_with_augmented_dataset(original_pickle_path, new_pickle_path, augmented_dataset_path, tokenizer, model):
    original_data = read_pickle(original_pickle_path)
    augmented_data = read_csv(augmented_dataset_path, skip_header=False)
    for row in augmented_data:
        question = row[0]
        embedding = get_embedding(question, tokenizer, model)
        # Replace non-ASCII characters with spaces before using the question as a key
        question = ''.join([i if ord(i) < 128 else ' ' for i in question])
        original_data[question] = embedding
    save_to_pickle(original_data, new_pickle_path)
    return original_data
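# read_pickle and save_to_pickle are assumed to be thin wrappers around the
# standard pickle module, roughly:
import pickle

def read_pickle(path):
    with open(path, 'rb') as f:
        return pickle.load(f)

def save_to_pickle(data, path):
    with open(path, 'wb') as f:
        pickle.dump(data, f)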
import numpy as np

def get_embedding_data(input_file, category_to_num, question_to_embedding):
    input_data = read_csv(input_file, False)
    embeddings = []
    labels = []
    for row in input_data:
        if row[1] == '':  # Skip rows without a category label
            continue
        embedding = question_to_embedding[row[0]]
        category_number = category_to_num[row[1]]
        embeddings.append(embedding)
        labels.append(category_number)
    return np.asarray(embeddings), np.asarray(labels)
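# Hypothetical usage of get_embedding_data. The pickle path is illustrative;
# category_to_num maps each category name to an integer class label.
question_to_embedding = read_pickle('embeddings.pickle')  # assumed output of get_all_embeddings
categories = sorted(set(get_questions_to_category('dataset_categories/train20.csv').values()) - {''})
category_to_num = {c: i for i, c in enumerate(categories)}
X_train, y_train = get_embedding_data('dataset_categories/train20.csv', category_to_num, question_to_embedding)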
    # Trim so that we have the desired number of augmented sentences
    if num_aug >= 1:
        augmented_sentences = augmented_sentences[:num_aug]
    else:
        keep_prob = num_aug / len(augmented_sentences)
        augmented_sentences = [s for s in augmented_sentences if random.uniform(0, 1) < keep_prob]
    # Append the original sentence
    augmented_sentences.append(sentence)
    return augmented_sentences

# MAIN #
input_data = read_csv('dataset_categories/train20.csv', skip_header=False)
new_result = {}
for each in input_data:
    question = each[0]
    label = each[1]
    augmented_questions = eda(question, num_aug=16)
    for augmented_question in augmented_questions:
        new_result[augmented_question] = label
    new_result[question] = label
write_dict_to_csv('train20_augmented.csv', new_result)
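# write_dict_to_csv is not shown in this section. It is assumed to write one
# (key, value) pair per CSV row, roughly:
import csv

def write_dict_to_csv(output_path, data):
    with open(output_path, 'w', newline='') as f:
        writer = csv.writer(f)
        for key, value in data.items():
            writer.writerow([key, value])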
def do_readcsv(self, _arg):
    try:
        methods.read_csv()
    except FileNotFoundError:
        print('File not found. Try fixing the CSV path.')
    for current_id in id_to_questions:
        real_questions = id_to_questions[current_id][0]
        generated_questions = id_to_questions[current_id][1]
        if len(real_questions) >= 4:  # If there are at least 4 real questions
            random.shuffle(real_questions)
            # First 3 real questions go to the train set, the rest to test set A
            for question in real_questions[:3]:
                train_dict[question] = current_id
            for question in real_questions[3:]:
                testA_dict[question] = current_id
            # All generated questions go to test set B
            for question in generated_questions:
                testB_dict[question] = current_id
            counter += 1
    print(f"{counter} distinct question IDs")
    return [train_dict, testA_dict, testB_dict]

# MAIN #
master_data = read_csv('data/final_master_dataset.csv', True)
output_names = ('train3.csv', 'testA.csv', 'testB.csv')
sub_datasets = split_dataset(master_data)
for name, sub_dataset in zip(output_names, sub_datasets):
    write_dict_to_csv(name, sub_dataset)