def main(args, csv_path, img_path, out_path):
    # Initialize data loader
    data_loader = DataLoader(img_path, csv_path, file_type="dicom", test=args.test)

    # Flag to run both classifiers
    both = args.sbd_only is False and args.cnn_only is False
    if args.cnn_only and args.sbd_only:
        raise Exception("Use only one of '--sbd_only' or '--cnn_only'")

    if args.sbd_only or both:
        logging.info("Running SBD")
        # ### SINOGRAM-BASED DETECTION ###
        sbd_pool, sbd_tasks, sbd_classifier = setup_SBD(args, data_loader)
        run_SBD(sbd_pool, sbd_tasks, sbd_classifier)
        # ### ------------------------ ###

    if args.cnn_only or both:
        logging.info("Running CNN")
        # ### -- CNN-BASED DETECTION - ###
        network, on_gpu = setup_CNN(args, data_loader)
        run_cnn(network, on_gpu, data_loader, out_path)
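# A minimal sketch of wiring main() up as a CLI entry point. The flags mirror the
# attributes main() reads (--sbd_only, --cnn_only, --test), but the path arguments and
# their defaults are illustrative assumptions, not the project's actual parser.
if __name__ == "__main__":
    import argparse
    import logging

    logging.basicConfig(level=logging.INFO)

    parser = argparse.ArgumentParser(description="Run artifact detection.")
    parser.add_argument("--sbd_only", action="store_true",
                        help="Run only the sinogram-based detector.")
    parser.add_argument("--cnn_only", action="store_true",
                        help="Run only the CNN-based detector.")
    parser.add_argument("--test", action="store_true",
                        help="Process only a few images.")
    parser.add_argument("--csv_path", type=str, default="labels.csv")   # hypothetical default
    parser.add_argument("--img_path", type=str, default="images/")      # hypothetical default
    parser.add_argument("--out_path", type=str, default="results/")     # hypothetical default
    cli_args = parser.parse_args()

    main(cli_args, cli_args.csv_path, cli_args.img_path, cli_args.out_path)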
def test_loaded_polyglot_embeddings(self):
    data = pickle_call('data/embeddings/polyglot-en.pkl')
    dl = DataLoader(embeddings_initial='Polyglot', embedding_loading='in_dict')
    dl.load('data/pickles/')
    dl.get_all_and_dump('data/pickles/')

    all_true = None
    for i in range(len(data[0])):
        term = data[0][i]
        embedding = data[1][i]
        if term in dl.embedding.vocab_dict:
            position = dl.embedding.vocab_dict[term]
            stored_embedding = dl.embedding.embeddings.weight[position].data.numpy()
            if all_true is None:
                all_true = np.array_equal(embedding, stored_embedding)
            else:
                all_true = all_true and np.array_equal(embedding, stored_embedding)
    self.assertTrue(all_true)
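# The test above assumes the raw Polyglot pickle unpacks into a (terms, vectors) pair,
# i.e. data[0][i] is a token and data[1][i] its embedding vector. A small sketch of
# inspecting the file under that assumption, using only the standard library (the
# layout is inferred from the indexing in the test, not verified):
import pickle

with open('data/embeddings/polyglot-en.pkl', 'rb') as f:
    terms, vectors = pickle.load(f)

print(len(terms), len(vectors))     # one vector per term
print(terms[0], len(vectors[0]))    # a sample token and its embedding dimensionality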
def test_generator_data_length_fast_text_SNLI_in_dict(self):
    dl = DataLoader(embeddings_initial='FastText-Crawl', embedding_loading='in_dict')
    # dl = DataLoader(embeddings_initial='FastText', embedding_loading='load_dict',
    #                 embedding_params={'first_time_emb_load': False})
    dl.load('data/pickles/')
    dl.get_all_and_dump('data/pickles/')

    gen = dl.get_generator(drop_last=False, initialize=True)
    tr = dl.get_train_data()
    nr_data_points = 0

    # Short check that the number of data points yielded by the generator equals the
    # total number of data points in the training set. TL;DR: yes, it does.
    while True:
        data, batch = gen.next()
        if data is None:
            break
        nr_data_points += len(data)
    self.assertEqual(nr_data_points, len(tr))
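# The loop above relies on the loader's convention of signalling exhaustion by returning
# (None, ...) instead of raising StopIteration. A small sketch of wrapping that convention
# into a standard Python iterable; gen and its .next() method are the project's own API,
# everything else here is illustrative:
def iterate_batches(gen):
    """Yield (data, batch) pairs until the generator signals exhaustion with None."""
    while True:
        data, batch = gen.next()
        if data is None:
            return
        yield data, batch

# Usage: total = sum(len(data) for data, batch in iterate_batches(gen))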
                    help='Whether or not to calculate the accuracy of predictions, based on image labels.')
parser.add_argument("--label_dir", default=label_path, type=str,
                    help='Path to a CSV containing image labels.')
parser.add_argument("--logging", action='store_true',
                    help='Whether or not to save results.')
parser.add_argument("--logdir", default=log_dir, type=str,
                    help='Where to save results.')
parser.add_argument("--test", action='store_true',
                    help="If the test option is given, only a few images are processed.")
parser.add_argument("--ncpu", default=None, type=int,
                    help="Number of CPUs to use.")
args, unparsed = parser.parse_known_args()

# Initialize data loader
img_dir = "/cluster/projects/radiomics/RADCURE-images"
label_path = "/cluster/home/carrowsm/data/radcure_DA_labels.csv"
img_suffix = ""
file_type = "dicom"
dl = DataLoader(img_dir, label_path, img_suffix, file_type="dicom", test=args.test)
# dl = DataLoader(args)

# Ordered lists of patient IDs and their labels
p_list, l_list = dl.patient_list, dl.label_list

if args.test:
    # If in test mode, restrict data set size
    p_list = p_list[0:45]
    l_list = l_list[0:45]

num_cpus = args.ncpu

# Initialize classifier
classifier = Classifier(args, dl)

# Set up parallel computing
pool, tasks = parallel_setup(num_cpus, p_list)
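# The script relies on a project-level parallel_setup() helper to produce the worker pool
# and the per-worker task chunks. A minimal sketch of what such a helper could look like,
# using the standard multiprocessing module; this is an illustrative assumption, not the
# project's actual implementation:
import multiprocessing as mp

def parallel_setup_sketch(num_cpus, patient_list):
    """Create a worker pool and split the patient list into per-worker task chunks."""
    n_workers = num_cpus or mp.cpu_count()
    pool = mp.Pool(processes=n_workers)
    chunk = max(1, len(patient_list) // n_workers)
    tasks = [patient_list[i:i + chunk] for i in range(0, len(patient_list), chunk)]
    return pool, tasks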
def define_hyperparams_and_load_data(best_params=None, dl=None, data_set_name='Language_Text_100',
                                     embedding_name='character_100', batch_size=32, hidden_size=32,
                                     n_layers=1, dropout=0.0, lr=0.001, optimizer_type='adam',
                                     best_acc=0.0):
    """
    Define the hyperparameters based on either the given settings or the stored best-params JSON log.
    If dl is not given, it is loaded from file.
    :param best_params: json file of the best hyperparameters
    :param dl: data_loading object
    :param data_set_name: which data set is to be loaded
    :param embedding_name: which embedding data set is to be loaded
    :param batch_size:
    :param hidden_size:
    :param n_layers:
    :param dropout:
    :param lr:
    :param optimizer_type:
    :param best_acc:
    :return:
    """
    # if we have passed the dictionary of best_params, extract that information and set the
    # hyperparameters accordingly
    if best_params is not None:
        if 'data_set_name' in best_params:
            data_set_name = best_params['data_set_name']
            if data_set_name.endswith("10000"):
                embedding_name = 'character_10000'
            elif data_set_name.endswith("1000"):
                embedding_name = 'character_1000'
            else:
                embedding_name = 'character_100'
        if 'batch_size' in best_params:
            batch_size = best_params['batch_size']
        if 'hidden_size' in best_params:
            hidden_size = best_params['hidden_size']
        if 'n_layers' in best_params:
            n_layers = best_params['n_layers']
        if 'dropout' in best_params:
            dropout = best_params['dropout']
        if 'lr' in best_params:
            lr = best_params['lr']
        if 'optimizer_type' in best_params:
            optimizer_type = best_params['optimizer_type']
        if 'best_acc' in best_params:
            best_acc = best_params['best_acc']

    # if we have not passed a data_loader object, we load it from file
    if dl is None:
        class_params = {'name': data_set_name}
        dl = DataLoader(data_set=data_set_name,
                        embeddings_initial=embedding_name,
                        embedding_loading='top_k',
                        K_embeddings=float('inf'),
                        param_dict=class_params)
        dl.load('data/pickles/')
        dl.get_all_and_dump('data/pickles/')

    # define all the hyperparameters
    hyperparameters = {}
    hyperparameters['optimizer_type'] = optimizer_type
    hyperparameters['lr'] = lr
    hyperparameters['hidden_size'] = hidden_size
    hyperparameters['batch_size'] = batch_size
    hyperparameters['vocab_size'] = dl.embedding.get_vocab_size()
    hyperparameters['n_layers'] = n_layers
    hyperparameters['dropout'] = dropout
    hyperparameters['padding_idx'] = dl.embedding.get_pad_pos()
    hyperparameters['num_classes'] = len(dl.labels)
    hyperparameters['embedding'] = dl.embedding.get_embeddings()
    hyperparameters['embedding_size'] = dl.embedding.embedding_size
    hyperparameters['data_set_name'] = data_set_name
    hyperparameters['best_acc'] = best_acc

    return hyperparameters, dl
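# Sketch of typical usage: rebuild the hyperparameter dictionary from a stored best-params
# log and reuse the returned DataLoader. 'logs/best_params.json' is a hypothetical path;
# load_params() is the project's own JSON loader (see random_walk below).
best_params = load_params('logs/best_params.json')
hyperparameters, dl = define_hyperparams_and_load_data(best_params=best_params)
print(hyperparameters['data_set_name'], hyperparameters['best_acc'])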
def random_walk(best_params_file, epochs=100, nr_samples=1, data_set_name='Language_Text_10000',
                embedding_name='character_10000', lrs=[0.001], batch_sizes=[32], hidden_sizes=[32],
                n_layers_list=[1], dropouts=[0.0], optimizer_types=['adam']):
    """
    Randomly sample each hyperparameter and train the model until early stopping.
    :param best_params_file: location of the log file for the best model
    :param epochs: max number of epochs
    :param nr_samples: how many randomly sampled models should be trained
    :param data_set_name: which data set is to be chosen: 'Language_Text_100', 'Language_Text_1000', or 'Language_Text_10000'
    :param embedding_name: embeddings data set: 'character_100', 'character_1000', or 'character_10000'
    :param lrs: possible learning rates
    :param batch_sizes: different batch sizes to be tested
    :param hidden_sizes: LSTM hidden sizes
    :param n_layers_list: number of layers for each LSTM timestep
    :param dropouts: dropout probability between LSTM layers
    :param optimizer_types: optimizers, e.g. 'adam', 'sgd'
    :return: the overall best accuracy
    """
    # load the data set into the data loader; this can be shared across models because nothing changes
    class_params = {'name': data_set_name}
    dl = DataLoader(data_set=data_set_name,
                    embeddings_initial=embedding_name,
                    embedding_loading='top_k',
                    K_embeddings=float('inf'),
                    param_dict=class_params)
    dl.load('data/pickles/')
    dl.get_all_and_dump('data/pickles/')

    # if models have already been trained, get the best accuracy so that we don't
    # overwrite our current best model
    try:
        best_params = load_params(best_params_file)
        best_acc = best_params['best_acc']
    # if not, start from the smallest possible accuracy
    except:
        best_acc = -float('inf')

    # randomly sample nr_samples models and train them until early stopping
    for i in range(nr_samples):
        print("\nCurrent best accuracy = " + str(best_acc) + '\n')

        # randomly sample the hyperparameters
        lr = np.random.choice(lrs)
        batch_size = np.random.choice(batch_sizes)
        hidden_size = np.random.choice(hidden_sizes)
        n_layers = np.random.choice(n_layers_list)
        optimizer_type = np.random.choice(optimizer_types)
        # if we only have one layer, there's no need for dropout (at least for this kind)
        if n_layers == 1:
            dropout = 0.0
        else:
            dropout = np.random.choice(dropouts)

        # batch_size = np.random.choice(batch_sizes)
        # hidden_size = np.random.choice([8, 16, 32, 64, 128, 256])
        # n_layers = np.random.choice([1, 2])
        # if n_layers == 1:
        #     dropout = 0.0
        # else:
        #     dropout = np.random.choice([0.0, 0.3, 0.6])

        # build the hyperparameter dictionary, retrieving some parameters from the data loader
        hyperparameters, dl = define_hyperparams_and_load_data(
            dl=dl, data_set_name=data_set_name, embedding_name=embedding_name,
            batch_size=batch_size, hidden_size=hidden_size, n_layers=n_layers,
            dropout=dropout, lr=lr, optimizer_type=optimizer_type, best_acc=best_acc)

        print("Training with the following hyperparameters:")
        print(hyperparameters)

        # train until early stopping
        best_acc = train(hyperparameters, dl, epochs=epochs, batch_size=batch_size)

    return best_acc
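# Sketch of launching a small random search with random_walk(). The value ranges and the
# 'logs/best_params.json' path are illustrative choices, not taken from the project.
if __name__ == '__main__':
    best = random_walk('logs/best_params.json',
                       epochs=100,
                       nr_samples=10,
                       data_set_name='Language_Text_1000',
                       embedding_name='character_1000',
                       lrs=[0.01, 0.001, 0.0001],
                       batch_sizes=[16, 32, 64],
                       hidden_sizes=[32, 64, 128],
                       n_layers_list=[1, 2],
                       dropouts=[0.0, 0.3, 0.6],
                       optimizer_types=['adam', 'sgd'])
    print("Best accuracy over all sampled models: " + str(best))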