def generate_single_file(sources, savefile, ifreader, statuses, cell_lines,
                         versions, locations, multi_label, indices,
                         verbose=False):
    savefile = open(savefile, 'w')
    for (x, y, plate_names) in ifreader.data_generator(
            sources, indices, statuses, cell_lines, versions, locations,
            multi_label, verbose=verbose):
        # shuffle the parallel lists together so rows stay aligned
        (x, y, plate_names) = utils.shuffle_lists([x, y, plate_names])
        for lx, ly in zip(x, y):
            # comma-terminated features, a space, then comma-terminated labels
            for l in lx:
                savefile.write(str(l))
                savefile.write(',')
            savefile.write(' ')
            for l in ly:
                savefile.write(str(l))
                savefile.write(',')
            savefile.write('\n')
    savefile.close()
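# Each line written above has the form "f1,f2,...,<space>l1,l2,...," with
# trailing commas. A minimal read-back sketch for that layout, assuming
# numeric features and labels (`read_single_file` is a hypothetical helper,
# not part of this module):
def read_single_file(path):
    """Parse lines written by generate_single_file into (x, y) pairs."""
    xs, ys = [], []
    with open(path) as f:
        for line in f:
            feat_part, label_part = line.rstrip('\n').split(' ', 1)
            # drop the empty string left by each trailing comma
            xs.append([float(v) for v in feat_part.split(',') if v])
            ys.append([float(v) for v in label_part.split(',') if v])
    return xs, ys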
def shuffle(self):
    '''Shuffle the order of the dataset.'''
    dataset = self._dataset
    names = dataset['name']
    features = dataset['feature']
    labels = dataset['label']
    names, features, labels = utils.shuffle_lists(names, features, labels)
    self._dataset.update({
        'name': names,
        'feature': features,
        'label': labels
    })
    return self
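# `shuffle_lists` comes from each project's own utils module, and its call
# signature varies across these snippets (separate arguments here, a single
# list of lists elsewhere). A minimal sketch of the in-unison variant assumed
# by this method, where parallel lists of equal length keep their row
# alignment (the real utils implementation may differ):
import random

def shuffle_lists(*lists):
    """Shuffle parallel lists in unison with one shared permutation."""
    order = list(range(len(lists[0])))
    random.shuffle(order)
    return tuple([lst[i] for i in order] for lst in lists)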
def write_synthetic_data(args):
    '''
    Make data where data points are (y, s, e):
    - s : sequence of integers
    - idx : index which will be s[0]. Depending on the condition, there will
      exist a map f : s_idx --> e_idx that allows for retrieval-based approaches
    - m, n are ints
    - r, d are ints
    - y : labels given as 1[#m > #n] in the vanilla setup. In one condition,
      this holds if s[1] == 1, else y = 1[#r > #d]
    - e : an "explanation" of the data point, which gives information about
      the features that cause the label
    NUMBERS START AT 1. No zeros appear in s.
    '''
    print("Writing data...")
    # determine num_relevant_points here if not provided
    if args.num_relevant_points < 0 and args.num_tasks < 0:
        # neither parameter provided
        args.num_relevant_points = args.context_size + 1
    if args.num_relevant_points < 0 and args.num_tasks > 0:
        assert args.num_train_synthetic % args.num_tasks == 0, \
            "please make n_train divisible by num_tasks"
        args.num_relevant_points = args.num_train_synthetic // args.num_tasks
    n_train = args.num_train_synthetic
    # NOTE: n_dev and n_test may be slightly modified below to allow for even
    # group sizes wrt k
    n_dev = 10000
    n_test = 50000
    if args.small_data:
        n_train, n_dev, n_test = [int(args.small_size) for n in range(3)]
    # slightly round n_train up if needed so it divides evenly
    if n_train % args.num_relevant_points != 0:
        n_train = (n_train // args.num_relevant_points + 1) * args.num_relevant_points
    assert n_train % args.num_relevant_points == 0, \
        "please make n_train divisible by num_relevant_points"
    if args.num_relevant_points > 1:
        if not args.num_relevant_points % 2 == 0:
            print("\n Note that num_relevant_points is odd! "
                  "Hence balancing is not precisely 50/50 \n")
    # get train_use_idx
    num_per_train_idx = args.num_relevant_points
    n_train_idx = n_train // num_per_train_idx
    max_idx = args.max_int**2 if args.max_idx < 0 else args.max_idx
    assert n_train_idx <= max_idx, (
        "need to decrease num_relevant_points to increase the number of tasks, "
        "or increase the number of possible tasks by increasing args.max_int")
    train_use_idx = np.random.choice(np.arange(1, max_idx + 1),
                                     size=n_train_idx,
                                     replace=False)
    # test-time idx are seen in training by default, or can be flagged to be new
    if not args.disjoint_test_idx:
        dev_use_idx = train_use_idx
        test_use_idx = train_use_idx
    elif args.disjoint_test_idx:
        eligible_idx = np.setdiff1d(np.arange(1, max_idx + 1), train_use_idx)
        # replace=True is needed when n_test > max_idx * num_per_idx
        dev_use_idx = np.random.choice(eligible_idx,
                                       size=n_train_idx,
                                       replace=True)
        test_use_idx = dev_use_idx
    num_per_dev_idx = round(n_dev / len(dev_use_idx))
    num_per_test_idx = round(n_test / len(test_use_idx))
    # modify n_dev and n_test if needed
    n_dev += (len(dev_use_idx) * num_per_dev_idx - n_dev)
    n_test += (len(test_use_idx) * num_per_test_idx - n_test)
    # make labels in advance
    labels_list = [
        utils.balanced_array(size=n, prop=.5) for n in [n_train, n_dev, n_test]
    ]
    utils.shuffle_lists(labels_list)
    train_labels, dev_labels, test_labels = labels_list
    # make mn and rds; make idx-to-z dict
    max_mn = int(np.sqrt(max_idx))
    mn_and_rds, collected = [], []
    # here is the normal procedure (see below for the special case)
    order_counter = 123  # unique ints; counter '0123' -> mnrd 1234 given the +1 below
    unique_idx = set(np.concatenate(
        [train_use_idx, dev_use_idx]))  # by default this is just train_use_idx
    if not (args.use_mn_only and args.ordered_mnrd):
        while len(mn_and_rds) < len(unique_idx):
            # in this condition, randomly sample mnrd and simply avoid repeats
            if not args.ordered_mnrd:
                proposal = np.random.choice(np.arange(1, max_mn + 1),
                                            size=4,
                                            replace=False)
                # compare stringified proposals: checking `proposal in
                # mn_and_rds` directly raises numpy's ambiguous-truth-value error
                if str(proposal) not in collected:
                    mn_and_rds.append(proposal)
                    collected.append(str(proposal))
            # in this condition, gradually increment the values of m/n/r/d so
            # that the task information is dense in integer space
            if args.ordered_mnrd:
                assert max_idx <= 10000, \
                    "right now ordered_mnrd without use_mn_only has a max_idx of 10k"
                str_mnrd = '%04d' % order_counter
                # skip counters whose digits are not all unique
                while len(set(str_mnrd)) != len(list(str_mnrd)):
                    order_counter += 1
                    str_mnrd = '%04d' % order_counter
                mnrd = np.array([int(_int) + 1 for _int in list(str_mnrd)])
                mn_and_rds.append(mnrd)
                order_counter += 1
    # this is a special condition where we need to order based on mn only,
    # so we overwrite the above
    if args.use_mn_only and args.ordered_mnrd:
        mn_and_rds, collected = [], []
        integers = np.array([1, 1])
        while len(mn_and_rds) < len(unique_idx):
            proposal = integers
            # increment until all integers are unique
            while len(set(proposal)) != len(proposal):
                last_idx_where_valid = [
                    idx for idx in range(len(integers)) if integers[idx] < 100
                ][-1]
                integers[last_idx_where_valid] += 1
            mn = np.array([int(_int) for _int in integers])
            distractors = np.random.choice(np.setdiff1d(np.arange(1, 101), mn),
                                           size=2,
                                           replace=False)
            mnrd = np.concatenate([mn, distractors])
            mn_and_rds.append(mnrd)
            # increment
            last_idx_where_valid = [
                idx for idx in range(len(integers)) if integers[idx] < 100
            ][-1]
            if last_idx_where_valid != 1:
                integers[-1] = 1
            integers[last_idx_where_valid] += 1
    # order things if doing smooth_idx_to_z
    if args.smooth_idx_to_z:
        train_use_idx = np.sort(train_use_idx)
        # this takes advantage of the known scale of num_tasks to break ties
        # by each next element of the mnrd array
        mn_and_rds = sorted(
            mn_and_rds,
            key=lambda x: x[0] + 1e-3 * x[1] + 1e-6 * x[2] + 1e-9 * x[3])
    idx_to_z_dict = {idx: mn_and_rds[i] for i, idx in enumerate(unique_idx)}
    '''
    Now we want a few other properties, per idx per dataset:
    - mn or rd balance: use_mn_or_rd within each idx
    - #counts balance: want mn/rd #-counts to swap half the time, so there is
      no bias in size
    - distractor feature: want the non-causal feature (mn or rd, depending on
      the indicator above) to correlate with the causal one 50% of the time
    '''
    train_idx_to_info = {
        idx: {
            # pick whether to use mn or rd
            'use_mn_or_rd': utils.balanced_array(size=num_per_train_idx, prop=.5),
            # set mnrd = (count1,2,3,4) or mnrd = (count3,4,1,2) based on this
            # (whether to swap counts 1,2 and 3,4)
            'swap_samples': utils.balanced_array(size=num_per_train_idx, prop=.5),
            # whether to have the non-causal feature (mn or rd) correlate
            # with the causal one
            'distractor_correlates': utils.balanced_array(
                size=num_per_train_idx, prop=args.weak_feature_correlation),
            'mnrd': idx_to_z_dict[idx]
        }
        for idx in train_use_idx
    }
    dev_idx_to_info = {
        idx: {
            # pick whether to use mn or rd
            'use_mn_or_rd': utils.balanced_array(size=num_per_dev_idx, prop=.5),
            # whether to swap counts 1,2 and 3,4
            'swap_samples': utils.balanced_array(size=num_per_dev_idx, prop=.5),
            # whether to have the non-causal feature correlate with the causal one
            'distractor_correlates': utils.balanced_array(size=num_per_dev_idx,
                                                          prop=.5),
            'mnrd': idx_to_z_dict[idx]
        }
        for idx in dev_use_idx
    }
    test_idx_to_info = {
        idx: {
            'use_mn_or_rd': utils.balanced_array(size=num_per_test_idx, prop=.5),
            'swap_samples': utils.balanced_array(size=num_per_test_idx, prop=.5),
            'distractor_correlates': utils.balanced_array(size=num_per_test_idx,
                                                          prop=.5),
            'mnrd': idx_to_z_dict[idx]
        }
        for idx in test_use_idx
    }
    # make splits
    train_s_list, train_e_list = make_split(args, train_labels, train_use_idx,
                                            num_per_train_idx,
                                            train_idx_to_info,
                                            ignore_list=None)
    dev_s_list, dev_e_list = make_split(args, dev_labels, dev_use_idx,
                                        num_per_dev_idx, dev_idx_to_info,
                                        ignore_list=train_s_list)
    test_s_list, test_e_list = make_split(args, test_labels, test_use_idx,
                                          num_per_test_idx, test_idx_to_info,
                                          ignore_list=train_s_list)
    assert len(train_s_list) == n_train
    assert len(dev_s_list) == n_dev
    # make dfs and write splits
    train_df = pd.DataFrame({
        'unique_id': i,
        's': train_s_list[i],
        'e': train_e_list[i],
        'label': train_labels[i]
    } for i in range(n_train))
    dev_df = pd.DataFrame({
        'unique_id': i + n_train,
        's': dev_s_list[i],
        'e': dev_e_list[i],
        'label': dev_labels[i]
    } for i in range(n_dev))
    test_df = pd.DataFrame({
        'unique_id': i + n_train + n_dev,
        's': test_s_list[i],
        'e': test_e_list[i],
        'label': test_labels[i]
    } for i in range(n_test))
    folder = args.data_dir + '_' + args.experiment_name
    if not os.path.exists(folder):
        os.mkdir(folder)
    paths = [
        os.path.join(folder, split_name) + '.csv'
        for split_name in ['train', 'dev', 'test']
    ]
    train_df.to_csv(paths[0], index=False)
    dev_df.to_csv(paths[1], index=False)
    test_df.to_csv(paths[2], index=False)
    print("\nData statistics:")
    print(f"\t Num train idx / tasks: {len(train_use_idx)} | "
          f"Num per train idx: {num_per_train_idx}")
    print(f"\t Num dev idx / tasks: {len(dev_use_idx)} | "
          f"Num per dev idx: {num_per_dev_idx}")
    print(f"\t Num test idx / tasks: {len(test_use_idx)} | "
          f"Num per test idx: {num_per_test_idx}")
    return train_use_idx
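# `utils.balanced_array` is assumed to return a randomly ordered 0/1 array in
# which roughly `prop` of the entries are 1; this matches the odd-size caveat
# printed above (an odd size cannot be split exactly 50/50). A minimal sketch
# of that assumed behavior (the real utils implementation may differ):
import numpy as np

def balanced_array(size, prop=.5):
    """Return a shuffled 0/1 array of length `size` with ~`prop` ones."""
    num_ones = int(round(prop * size))
    arr = np.array([1] * num_ones + [0] * (size - num_ones))
    np.random.shuffle(arr)
    return arr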
import glob
import os

from dataloader import Gleason2019SaveDISK
from model import Unet
from utils import shuffle_lists

# Data preparation
generate_sub_images = True
root_path = './MICCAI_2019_pathology_challenge/'
folder_to_save_train_samples = './train_samples'
folder_to_save_val_samples = './val_samples'

train_imgs = sorted(
    glob.glob(os.path.join(root_path, 'Train Imgs/Train Imgs/*.jpg')))
# You have to generate the labels first by running the script
labels_final = sorted(glob.glob('./labels/*.png'))
assert len(labels_final) == len(train_imgs)
train_imgs, labels_final = shuffle_lists(train_imgs, labels_final)

val_loader = Gleason2019SaveDISK('val', train_imgs, labels_final, (0.8, 0.2),
                                 (512, 512), samples=10)
train_loader = Gleason2019SaveDISK('train', train_imgs, labels_final,
                                   (0.8, 0.2), (512, 512), samples=40)

if generate_sub_images:
    val_loader.generate_data(folder_to_save_val_samples)
    train_loader.generate_data(folder_to_save_train_samples)
else:
    # the image samples only need to be generated once; afterwards the saved
    # paths can simply be reloaded
    print('You have to generate the image samples once')
    train_loader.load_paths()
    val_loader.load_paths()
def generate_data_files(base, ifreader, statuses, source, cell_lines, versions,
                        locations, multi_label, indices, num_files=5,
                        verbose=False):
    """
    Generates a number of data files in the format
    features[1]\none-hot-classes[1]\n...
    Will generate and save the data files without checking whether they
    already exist, and as such may overwrite existing data files with the
    same names.

    Parameters:
    base       : The base name for the data files. The files will be named
                 base-0, base-1, etc.
    ifreader   : The ifreader which has read the IF_images file that should
                 be used for this. See the if_reader.IFReader class for more
                 information.
    statuses   : Which statuses should be included from the original data.
                 Should be a list of integers as strings. If None, all
                 statuses are included.
    source     : The original data files. Must be a list of filenames that
                 contain the protein data in a comma-delimited CSV format.
    cell_lines : Which cell lines should be included from the original data.
                 Should be a list of strings. If None, all cell lines are
                 included.
    versions   : Which versions should be included from the original data.
                 Should be a list of integers as strings. If None, all
                 versions are included.
    locations  : Which locations should be included from the original data.
                 Should be a list of strings. If None, all locations are
                 included.
    multi_label: True if the data files should include multi-label instances.
    indices    : A list of integers corresponding to the indices of the
                 features that should be included in the data.
    num_files  : The number of data files to generate. Defaults to 5.
    verbose    : True if verbose output should be printed. Defaults to False.
    """
    if verbose:
        print('Generating data files')
    files = []
    plate_name_files = []
    for i in range(num_files):
        f = open(base + '-' + str(i), 'w')
        pnf = open(base + '-' + str(i) + '-platenames', 'w')
        files.append(f)
        plate_name_files.append(pnf)
    curr_index = 0
    for (x, y, plate_names) in ifreader.data_generator(
            source, indices, statuses, cell_lines, versions, locations,
            multi_label, verbose=verbose):
        # shuffle the parallel lists together so rows stay aligned
        (x, y, plate_names) = utils.shuffle_lists([x, y, plate_names])
        for (lx, ly, plate_name) in zip(x, y, plate_names):
            # record the plate name with the file index and the byte offset
            # of the instance within that file
            plate_name_files[curr_index].write(plate_name)
            plate_name_files[curr_index].write(
                ' %d %d' % (curr_index, files[curr_index].tell()))
            plate_name_files[curr_index].write('\n')
            for l in lx:
                files[curr_index].write(str(l))
                files[curr_index].write(' ')
            files[curr_index].write('\n')
            for l in ly:
                files[curr_index].write(str(l))
                files[curr_index].write(' ')
            files[curr_index].write('\n')
            # round-robin across the output files
            curr_index += 1
            curr_index %= num_files
    for (f, pnf) in zip(files, plate_name_files):
        f.close()
        pnf.close()
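# Hypothetical usage sketch: the IFReader constructor arguments and the filter
# values below are illustrative assumptions, not part of this module.
if __name__ == '__main__':
    import if_reader  # module assumed from the docstring reference above
    reader = if_reader.IFReader('IF_images.csv')  # assumed constructor signature
    generate_data_files(base='data/shard',
                        ifreader=reader,
                        statuses=None,      # None keeps all statuses
                        source=['proteins.csv'],
                        cell_lines=None,
                        versions=None,
                        locations=None,
                        multi_label=False,
                        indices=list(range(100)),
                        num_files=5,
                        verbose=True)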