Example #1
def main(args, csv_path, img_path, out_path):
    # Initialize data loader
    data_loader = DataLoader(img_path,
                             csv_path,
                             file_type="dicom",
                             test=args.test)

    # Flag to run both classifiers when neither exclusive option is given
    both = not args.sbd_only and not args.cnn_only

    if args.cnn_only and args.sbd_only:
        raise ValueError("Use only one of '--sbd_only' or '--cnn_only'")

    if args.sbd_only or both:
        logging.info("Running SBD")
        # ### SINOGRAM-BASED DETECTION ###
        sbd_pool, sbd_tasks, sbd_classifier = setup_SBD(args, data_loader)
        run_SBD(sbd_pool, sbd_tasks, sbd_classifier)
        # ### ------------------------ ###

    if args.cnn_only or both:
        logging.info("Running CNN")
        # ### -- CNN-BASED DETECTION - ###
        network, on_gpu = setup_CNN(args, data_loader)
        run_cnn(network, on_gpu, data_loader, out_path)
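A minimal sketch of how main() above might be invoked from the command line. The flag names mirror the ones main() checks; the parser itself, the placeholder paths, and the entry-point wiring are assumptions added for illustration, not part of the original example.

import argparse
import logging

def parse_args():
    # Hypothetical parser whose flags match the attributes main() reads from args.
    parser = argparse.ArgumentParser(description="Run artifact detection on DICOM images.")
    parser.add_argument("--sbd_only", action="store_true", help="Run only the sinogram-based detector.")
    parser.add_argument("--cnn_only", action="store_true", help="Run only the CNN-based detector.")
    parser.add_argument("--test", action="store_true", help="Process only a small subset of images.")
    return parser.parse_args()

if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    args = parse_args()
    # Placeholder paths; point these at the real label CSV, image directory, and output directory.
    main(args, csv_path="labels.csv", img_path="images/", out_path="results/")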
    def test_loaded_polyglot_embeddings(self):

        data = pickle_call('data/embeddings/polyglot-en.pkl')
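        # data[0] holds the vocabulary terms and data[1] the corresponding embedding vectors.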

        dl = DataLoader(embeddings_initial='Polyglot',
                        embedding_loading='in_dict')
        dl.load('data/pickles/')
        dl.get_all_and_dump('data/pickles/')

        all_true = None

        for i in range(len(data[0])):

            term = data[0][i]
            embedding = data[1][i]

            if term in dl.embedding.vocab_dict:
                position = dl.embedding.vocab_dict[term]
                stored_embedding = dl.embedding.embeddings.weight[
                    position].data.numpy()

                if all_true is None:
                    all_true = np.array_equal(embedding, stored_embedding)
                else:
                    all_true = all_true and np.array_equal(
                        embedding, stored_embedding)

        self.assertTrue(all_true)
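The test above reads the raw Polyglot embeddings through a pickle_call() helper. A minimal sketch of what such a helper might look like, assuming it simply deserializes a pickle file (the project's actual implementation may differ):

import pickle

def pickle_call(path):
    # Hypothetical loader: return whatever object is stored in the pickle at `path`.
    # Pickles written under Python 2 may additionally need encoding='latin1'.
    with open(path, 'rb') as f:
        return pickle.load(f)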
    def test_generator_data_length_fast_text_SNLI_in_dict(self):
        dl = DataLoader(embeddings_initial='FastText-Crawl',
                        embedding_loading='in_dict')
        # dl = DataLoader(embeddings_initial='FastText', embedding_loading='load_dict',
        #                 embedding_params={'first_time_emb_load': False})
        dl.load('data/pickles/')
        dl.get_all_and_dump('data/pickles/')

        gen = dl.get_generator(drop_last=False, initialize=True)
        tr = dl.get_train_data()
        nr_data_points = 0

        # Short check whether the number of data points yielded by the generator
        # equals the total number of data points in the training set. TL;DR: yes, it does.
        while True:
            data, batch = gen.next()
            if data is None:
                break
            nr_data_points += len(data)

        self.assertEqual(nr_data_points, len(tr))
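The loop above relies on a convention in which the generator returns a (data, batch) pair and signals exhaustion by returning None for data. A minimal sketch of a plain Python generator following that convention; the name and the sentinel are assumptions for illustration, not the project's actual get_generator() API:

def batch_generator(samples, batch_size=32):
    # Yield (data, batch_index) pairs, then a final (None, None) sentinel so the
    # caller can stop as soon as `data is None`, as the test loop above does.
    batch_index = 0
    for start in range(0, len(samples), batch_size):
        yield samples[start:start + batch_size], batch_index
        batch_index += 1
    yield None, None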
Example #4
                        help='Whether or not to calculate the accuracy of predictions, based on image labels.')
    parser.add_argument("--label_dir", default=label_path, type=str, help='Path to a CSV containing image labels.')
    parser.add_argument("--logging", action='store_true', help='Whether or not to save results.')
    parser.add_argument("--logdir", default=log_dir, type=str, help='Where to save results.')

    parser.add_argument("--test", action='store_true', help="If the test option is given, code will only process a few images.")

    parser.add_argument("--ncpu", default=None, type=int, help="Number of CPUs to use.")
    args, unparsed = parser.parse_known_args()

    # Initialize data loader
    img_dir = "/cluster/projects/radiomics/RADCURE-images"
    label_path = "/cluster/home/carrowsm/data/radcure_DA_labels.csv"
    img_suffix = ""
    file_type = "dicom"
    dl = DataLoader(img_dir, label_path, img_suffix, file_type="dicom", test=args.test)
    # dl = DataLoader(args)
    p_list = dl.patient_list   # Ordered list of patient IDs

    if args.test:
        # If in test mode, restrict the data set size
        p_list = p_list[0:45]

    num_cpus = args.ncpu

    # Initialize classifier
    classifier = Classifier(args, dl)

    # Setup Parallel computing
    pool, tasks = parallel_setup(num_cpus, p_list)
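The snippet ends by calling a parallel_setup() helper that builds a worker pool and a task list from the patient IDs. A minimal sketch of such a helper, assuming it wraps multiprocessing.Pool; the actual project implementation may differ:

import multiprocessing

def parallel_setup(num_cpus, patient_list):
    # Hypothetical helper: fall back to all available CPUs when no count is given.
    if num_cpus is None:
        num_cpus = multiprocessing.cpu_count()
    pool = multiprocessing.Pool(processes=num_cpus)
    # One task per patient ID; each task can later be mapped onto the classifier.
    tasks = [(patient_id,) for patient_id in patient_list]
    return pool, tasks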
def define_hyperparams_and_load_data(best_params=None,
                                     dl=None,
                                     data_set_name='Language_Text_100',
                                     embedding_name='character_100',
                                     batch_size=32,
                                     hidden_size=32,
                                     n_layers=1,
                                     dropout=0.0,
                                     lr=0.001,
                                     optimizer_type='adam',
                                     best_acc=0.0):
    """
    Define the hyperparameters either from the explicitly passed settings or from the stored JSON log of the best model.
    If dl is not passed, the data loader is loaded from file.
    :param best_params: dictionary of the best hyperparameters (loaded from the JSON log)
    :param dl: data_loading object
    :param data_set_name: which data set is to be loaded
    :param embedding_name: which embedding data set is to be loaded
    :param batch_size:
    :param hidden_size:
    :param n_layers:
    :param dropout:
    :param lr:
    :param optimizer_type:
    :param best_acc:
    :return:
    """

    # If a dictionary of best params was passed, extract its values and set the hyperparameters accordingly
    if best_params is not None:
        if 'data_set_name' in best_params:
            data_set_name = best_params['data_set_name']
            if data_set_name.endswith("10000"):
                embedding_name = 'character_10000'
            elif data_set_name.endswith("1000"):
                embedding_name = 'character_1000'
            else:
                embedding_name = 'character_100'
        if 'batch_size' in best_params:
            batch_size = best_params['batch_size']
        if 'hidden_size' in best_params:
            hidden_size = best_params['hidden_size']
        if 'n_layers' in best_params:
            n_layers = best_params['n_layers']
        if 'dropout' in best_params:
            dropout = best_params['dropout']
        if 'lr' in best_params:
            lr = best_params['lr']
        if 'optimizer_type' in best_params:
            optimizer_type = best_params['optimizer_type']
        if 'best_acc' in best_params:
            best_acc = best_params['best_acc']

    # If no data_loader object was passed, load it from file
    if dl is None:
        class_params = {'name': data_set_name}
        dl = DataLoader(data_set=data_set_name,
                        embeddings_initial=embedding_name,
                        embedding_loading='top_k',
                        K_embeddings=float('inf'),
                        param_dict=class_params)
        dl.load('data/pickles/')
        dl.get_all_and_dump('data/pickles/')

    # define all the hyperparameters
    hyperparameters = {
        'optimizer_type': optimizer_type,
        'lr': lr,
        'hidden_size': hidden_size,
        'batch_size': batch_size,
        'vocab_size': dl.embedding.get_vocab_size(),
        'n_layers': n_layers,
        'dropout': dropout,
        'padding_idx': dl.embedding.get_pad_pos(),
        'num_classes': len(dl.labels),
        'embedding': dl.embedding.get_embeddings(),
        'embedding_size': dl.embedding.embedding_size,
        'data_set_name': data_set_name,
        'best_acc': best_acc,
    }

    return hyperparameters, dl
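A short usage sketch for define_hyperparams_and_load_data(). The dictionary keys mirror the ones the function checks for; the wrapper function name and the concrete values are placeholders, not part of the original example:

def _example_best_params_usage():
    # Illustrative call only; the values below are placeholders.
    best_params = {'data_set_name': 'Language_Text_1000',
                   'batch_size': 64,
                   'hidden_size': 128,
                   'n_layers': 2,
                   'dropout': 0.3,
                   'lr': 0.0005,
                   'optimizer_type': 'adam',
                   'best_acc': 0.0}
    # dl is None here, so the data set and embeddings are loaded from 'data/pickles/'.
    hyperparameters, dl = define_hyperparams_and_load_data(best_params=best_params)
    return hyperparameters, dl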
def random_walk(best_params_file,
                epochs=100,
                nr_samples=1,
                data_set_name='Language_Text_10000',
                embedding_name='character_10000',
                lrs=[0.001],
                batch_sizes=[32],
                hidden_sizes=[32],
                n_layers_list=[1],
                dropouts=[0.0],
                optimizer_types=['adam']):
    """
    This function randomly samples each hyperparameter and trains the model until early stopping
    :param best_params_file: location of the log file for the best model
    :param epochs: max number of epochs
    :param nr_samples: how many randomly sampled models should be trained
    :param data_set_name: which data set is to be chosen: 'Language_Text_100', 'Language_Text_1000', or 'Language_Text_10000'
    :param embedding_name: embedding data set: 'character_100', 'character_1000', or 'character_10000'
    :param lrs: possible learning rates
    :param batch_sizes: different batch sizes to be tested
    :param hidden_sizes: LSTM hidden sizes
    :param n_layers_list: number of layers for each LSTM timestep
    :param dropouts: dropout probability between LSTM layers
    :param optimizer_types: optimizer e.g. 'adam', 'sgd'
    :return: returns the overall best accuracy
    """

    # Load the data set into the data loader; it can be shared across all models because the data itself never changes
    class_params = {'name': data_set_name}
    dl = DataLoader(data_set=data_set_name,
                    embeddings_initial=embedding_name,
                    embedding_loading='top_k',
                    K_embeddings=float('inf'),
                    param_dict=class_params)
    dl.load('data/pickles/')
    dl.get_all_and_dump('data/pickles/')

    # if models have already been trained, get the best accuracy so that we don't overwrite our current best model
    try:
        best_params = load_params(best_params_file)
        best_acc = best_params['best_acc']

    # If not, initialize the best accuracy to the smallest possible value
    except Exception:
        best_acc = -float('inf')

    # Randomly sample nr_samples models and train each until early stopping
    for i in range(nr_samples):

        print("\nCurrent best accuracy = " + str(best_acc) + "\n")

        # randomly sample the hyperparams
        lr = np.random.choice(lrs)
        batch_size = np.random.choice(batch_sizes)
        hidden_size = np.random.choice(hidden_sizes)
        n_layers = np.random.choice(n_layers_list)
        optimizer_type = np.random.choice(optimizer_types)

        # If we only have one layer, there's no need for dropout (at least of this kind)
        if n_layers == 1:
            dropout = 0.0
        else:
            dropout = np.random.choice(dropouts)

        # batch_size = np.random.choice(batch_sizes)
        # hidden_size = np.random.choice([8,16,32,64,128,256])
        # n_layers = np.random.choice([1,2])
        # if n_layers == 1:
        #     dropout = 0.0
        # else:
        #     dropout = np.random.choice([0.0,0.3,0.6])

        # get the hyperparameters by initializing them and retrieving some parameters from data_loader
        hyperparameters, dl = define_hyperparams_and_load_data(
            dl=dl,
            data_set_name=data_set_name,
            embedding_name=embedding_name,
            batch_size=batch_size,
            hidden_size=hidden_size,
            n_layers=n_layers,
            dropout=dropout,
            lr=lr,
            optimizer_type=optimizer_type,
            best_acc=best_acc)

        print("Training with the following hyperparameters:")
        print hyperparameters

        # train until early stopping
        best_acc = train(hyperparameters,
                         dl,
                         epochs=epochs,
                         batch_size=batch_size)
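Finally, a hedged sketch of how random_walk() might be invoked. The search grids and the log-file path are placeholders chosen for illustration; the data set and embedding names follow the options listed in the docstring above:

if __name__ == '__main__':
    random_walk('logs/best_params.json',
                epochs=100,
                nr_samples=10,
                data_set_name='Language_Text_10000',
                embedding_name='character_10000',
                lrs=[0.01, 0.001, 0.0001],
                batch_sizes=[16, 32, 64],
                hidden_sizes=[32, 64, 128],
                n_layers_list=[1, 2],
                dropouts=[0.0, 0.3, 0.6],
                optimizer_types=['adam', 'sgd'])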