# Balance a binary-labeled CSV by downsampling the majority (negative) class.
# Assumes the project-local helpers csv_handler, transform, and Sampler, and
# rows of the form (id, text, label) with the label in column 2 as '0'/'1'.
def max_balancer(input_csv_path, output_csv_path='./output.csv'):
    dataset = csv_handler.csv_readlines(input_csv_path)
    pos_dataset = transform.filter_func(dataset, lambda row: row[2] == '1')
    neg_dataset = transform.filter_func(dataset, lambda row: row[2] == '0')
    # positives must be the minority class for downsampling to make sense
    assert len(pos_dataset) <= len(neg_dataset)
    sampler = Sampler()
    neg_dataset = sampler.sample_rows(neg_dataset, len(pos_dataset))
    # filter the full dataset on the selected ids so the original row order is kept
    pos_ids = transform.map_func(pos_dataset, lambda row: row[0])
    neg_ids = transform.map_func(neg_dataset, lambda row: row[0])
    select_id_set = set(pos_ids + neg_ids)
    final = transform.filter_func(dataset, lambda row: row[0] in select_id_set)
    csv_handler.csv_writelines(output_csv_path, final)
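# Example usage (a minimal sketch; the paths are hypothetical and the input is
# assumed to be an id/text/label CSV as described above):
#
#   max_balancer('./labeled.csv', './balanced.csv')
#
# The output keeps every positive row plus an equal-sized random sample of
# negative rows.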
# Randomly split self.dataset into two CSV files. `percentage` is the fraction
# of rows routed to the first file; the rest go to the second. Assumes
# csv_writelines is imported from csv_handler in this module.
def csv_split(self, percentage, first_output_path, second_output_path):
    assert percentage > 0
    assert percentage < 1
    # __sampled_idx expects a row count, so truncate the fraction to an int
    first_size = int(percentage * len(self.dataset))
    first_idx = self.__sampled_idx(first_size)
    # first output
    first_dataset = transformer.map_func(first_idx, lambda idx: self.dataset[idx])
    csv_writelines(first_output_path, first_dataset)
    # second output: every index not sampled into the first split
    first_idx_set = set(first_idx)
    second_idx = transformer.filter_func(
        range(len(self.dataset)), lambda idx: idx not in first_idx_set)
    second_dataset = transformer.map_func(second_idx, lambda idx: self.dataset[idx])
    csv_writelines(second_output_path, second_dataset)
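# Example usage (a sketch; only this method is shown here, so the enclosing
# class and its constructor below are assumptions):
#
#   handler = CsvHandler('./dataset.csv')          # hypothetical constructor
#   handler.csv_split(0.8, './train.csv', './dev.csv')
#
# 80% of the rows land in train.csv and the remaining 20% in dev.csv, with no
# row appearing in both files.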
# Keep the rows whose split tag (column 3) matches `split`, then drop the tag,
# leaving (id, text, label) triplets.
def select(split, dataset):
    final = transformer.filter_func(dataset, lambda row: row[3] == split)
    final = transformer.map_func(final, lambda row: (row[0], row[1], row[2]))
    return final
import csv_handler
from os import listdir

# data_dir is assumed to end with a path separator
train_id_path = data_dir + "train_ids.txt"
test_id_path = data_dir + "test_ids.txt"
anno_dir = data_dir + '/iclr_anno_final/'
train_output_path = data_dir + "./trainset.csv"
test_output_path = data_dir + "./devset.csv"

train_id_dataset = open(train_id_path, "r").read().splitlines()
test_id_dataset = open(test_id_path, "r").read().splitlines()

files = listdir(anno_dir)
files = transformer.filter_func(files, lambda name: 'rating' in name)

# strip the 'rating...' suffix (and the separator before it) to recover the id
def get_id(file_name):
    idx = file_name.index("rating")
    return file_name[:idx - 1]

# check completeness: the annotated ids must exactly match the train/test id lists
anno_ids = transformer.map_func(files, lambda file_name: get_id(file_name))
anno_ids.sort()
train_test_ids = train_id_dataset + test_id_dataset
train_test_ids.sort()
assert anno_ids == train_test_ids
# get idx for the review_id, text, and funny_count columns from the CSV header
def get_triplet_idx(header):
    idx_id = header.index('review_id')
    idx_text = header.index('text')
    idx_count = header.index('funny')
    return (idx_id, idx_text, idx_count)

# label a row: >= 5 funny votes -> 1, exactly 0 -> 0; anything in between
# stays None and is filtered out below
def selector(row, idx_id, idx_text, idx_count):
    r_id = row[idx_id]
    text = row[idx_text]
    funny_count = int(row[idx_count])
    is_funny = None
    if funny_count >= 5:
        is_funny = 1
    elif funny_count == 0:
        is_funny = 0
    return (r_id, text, is_funny)

(idx_id, idx_text, idx_count) = get_triplet_idx(dataset[0])
selected_datasets = transformer.map_func(
    dataset[1:], lambda line: selector(line, idx_id, idx_text, idx_count))
final_datasets = transformer.filter_func(selected_datasets,
                                         lambda row: row[2] is not None)
csv_handler.csv_writelines(review_output_path, final_datasets)
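# The labeling rule in isolation (made-up rows; column order matches the
# header expected by get_triplet_idx):
#
#   header = ['review_id', 'text', 'funny']
#   selector(['r1', 'great', '7'], 0, 1, 2)   # -> ('r1', 'great', 1)
#   selector(['r2', 'meh', '0'], 0, 1, 2)     # -> ('r2', 'meh', 0)
#   selector(['r3', 'so-so', '3'], 0, 1, 2)   # -> ('r3', 'so-so', None), dropped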
# Read a tab-separated annotation file and reduce each well-formed row to a
# [token, label] pair; the label key is the first character of column index 4
# (e.g. the B/I/O prefix of a tag).
def distill(input_path, label_map):
    dataset = csv_handler.csv_readlines(input_path, delimit='\t')
    # drop malformed rows that are missing the annotation columns
    dataset = transform.filter_func(dataset, lambda row: len(row) >= 5)
    token_label = transform.map_func(
        dataset, lambda row: [row[0].strip(), label_map[row[4].strip()[0]]])
    return token_label
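# Example usage (a sketch; the path and the tag-to-id mapping are assumptions,
# keyed on the first character of the tag column as distill expects):
#
#   label_map = {'B': 1, 'I': 1, 'O': 0}
#   pairs = distill('./annotations.tsv', label_map)
#   # -> [['token', 1], ['another', 0], ...]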