def create_tfidf_features():
    """Build TF-IDF derived features for the train and test sets.

    A TfidfVectorizer (english stop words, 1-2 grams) is fitted on every
    question from both sets, then `tfidf_features` derives the per-row
    features (sum, mean and length of the TF-IDF for both questions).

    :return: tuple (X_train, X_test) of pandas dataframes with the
        bookkeeping columns (ids, raw questions, and `is_duplicate` for
        train) removed
    """
    train_df, test_df = load_dataset()

    vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1, 2))

    # Fit vocabulary and IDF weights on the full question corpus.
    corpus = (train_df['question1'].tolist()
              + train_df['question2'].tolist()
              + test_df['question1'].tolist()
              + test_df['question2'].tolist())
    vectorizer.fit_transform(pd.Series(corpus).astype(str))

    train_feats = tfidf_features(train_df, vectorizer)
    test_feats = tfidf_features(test_df, vectorizer)

    # Drop columns that are not model features.
    drop_cols = ['id', 'id1', 'id2', 'question1', 'question2']
    X_train = train_feats.drop(drop_cols + ['is_duplicate'], axis=1)
    X_test = test_feats.drop(drop_cols, axis=1)
    return X_train, X_test
def create_pagerank_features():
    """Build features derived from the PageRank algorithm.

    Each question is hashed (md5 of its text) and an undirected edge is
    added between the two questions of every row; PageRank is computed on
    that graph and mapped back onto each dataframe row.

    :return: tuple (X_train, X_test) of pandas dataframes
    """
    df_train, df_test = load_dataset()

    qid_graph = {}

    def _add_edges(row):
        """Append each question's md5 key to the other's adjacency list."""
        key_a = hashlib.md5(row["question1"].encode('utf-8')).hexdigest()
        key_b = hashlib.md5(row["question2"].encode('utf-8')).hexdigest()
        qid_graph.setdefault(key_a, []).append(key_b)
        qid_graph.setdefault(key_b, []).append(key_a)

    df_train.apply(_add_edges, axis=1)
    df_test.apply(_add_edges, axis=1)

    pagerank_dict = get_pagerank(qid_graph)

    X_train = df_train.apply(lambda r: get_pagerank_value(r, pagerank_dict),
                             axis=1)
    # Free the train dataframe before computing the test features.
    del df_train
    gc.collect()
    X_test = df_test.apply(lambda r: get_pagerank_value(r, pagerank_dict),
                           axis=1)
    return X_train, X_test
def calculate_metric(dataset_name):
    """Compute quality metrics for every pre-calculated embedding of
    `dataset_name`, one parallel job per embedding file.

    Scans `{output_folder}/{dataset_name}` for `.z` embedding files and
    dispatches `_calculate_metrics` over them with joblib.
    """
    db_name = 'DB_{}'.format(dataset_name)

    # Names of the metrics to evaluate on each embedding.
    metric_names = [
        'auc_rnx', 'pearsonr', 'mds_isotonic', 'cca_stress', 'sammon_nlm'
    ]

    # Original (high-dimensional) data; labels are not needed here.
    X_original, _, _ = load_dataset(dataset_name)

    embedding_dir = '{}/{}'.format(output_folder, dataset_name)
    to_process_files = [
        os.path.join(embedding_dir, fname)
        for fname in os.listdir(embedding_dir)
        if fname.endswith('.z')
    ]
    print('{} files to process'.format(len(to_process_files)))

    Parallel(n_jobs=n_cpus_using)(
        delayed(_calculate_metrics)(db_name, X_original, tsne_file,
                                    metric_names)
        for tsne_file in to_process_files)
def pre_calculate(dataset_name, num_constraints=10, metrics=None, manual=False):
    """Augment the pre-calculated t-SNE pickle for `dataset_name` with
    constraint information and (optionally) extra metrics.

    Generates must-link / cannot-link constraints (manual or automatic),
    computes the negative log-likelihood of the constrained points for
    every stored result, and rewrites the pickle in place.

    :param dataset_name: dataset identifier understood by load_dataset
    :param num_constraints: number of constraint pairs to take
    :param metrics: optional list of metric names to compute per result
        (default None — no extra metrics)
    :param manual: if True use hard-coded constraints, else auto-generate
        them from the target labels
    """
    # Was a mutable default ([]); use the None sentinel instead.
    if metrics is None:
        metrics = []

    # prepare original dataset
    X_original, y_original, labels_original = load_dataset(dataset_name)
    print(X_original.shape, y_original.shape, len(labels_original))

    # get pre-calculated tsne results (close the file handle deterministically)
    pkl_name = '{}/tsne_{}.pkl'.format(output_folder, dataset_name)
    with open(pkl_name, 'rb') as f:
        pkl_data = pickle.load(f)

    if manual is True:  # use hard-coded constraints
        print("Using manual constraints")
        mustlinks, cannotlinks = manual_constraints(
            dataset_name=dataset_name, n_take=num_constraints)
    else:  # use generated constraints
        print("Using auto constraints")
        mustlinks, cannotlinks = auto_constraints(
            target_labels=y_original, n_take=num_constraints)

    # calculate neg. log. likelihood for constrainted points
    for item in pkl_data['results']:
        calculate_nll(X_original, item, mls=mustlinks, cls=cannotlinks)

    # add constraints into pickle object
    pkl_data['mustlinks'] = mustlinks
    # NOTE: key intentionally kept misspelled ("my typo" in the original)
    # because existing consumers read it by this name.
    pkl_data['cannotlinnks'] = cannotlinks

    if metrics:  # calculate the named-metric in `metrics`
        for item in pkl_data['results']:
            calculate_metrics(X_original, item, metrics)

    # save pickle data for reuse (update existed file)
    with open(pkl_name, 'wb') as f:
        pickle.dump(pkl_data, f)
def _construct_data_dicts(self):
    """Collect (image, annotation) samples per dataset tag and populate
    `self.data_dicts` / `self.iters_cnt`.

    Reads `self.config['dataset_tags']` (mapping dataset name -> tag),
    loads the project structure from disk, groups samples by tag, and for
    each named dataset stores image paths, labels, box counts and the
    number of iterations one epoch needs.

    Raises:
        RuntimeError: if any tagged dataset contains no samples.
    """
    logger.info('Will collect samples (img/ann pairs).')
    name_to_tag = self.config['dataset_tags']
    project_fs = sly.ProjectFS.from_disk_dir_project(self.helper.paths.project_dir)
    logger.info('Project structure has been read. Samples: {}.'.format(
        project_fs.pr_structure.image_cnt))
    samples_dct = sly.samples_by_tags(
        tags=list(name_to_tag.values()), project_fs=project_fs,
        project_meta=self.helper.in_project_meta
    )
    self.data_dicts = {}
    self.iters_cnt = {}
    for the_name, the_tag in name_to_tag.items():
        samples_lst = samples_dct[the_tag]
        if len(samples_lst) < 1:
            raise RuntimeError('Dataset %s should contain at least 1 element.' % the_name)
        # load_dataset returns parallel lists describing each sample.
        img_paths, labels, num_boxes = load_dataset(samples_lst,
                                                    self.class_title_to_idx,
                                                    self.helper.in_project_meta)
        dataset_dict = {
            'img_paths': img_paths,
            'labels': labels,
            'num_boxes': num_boxes,
            'sample_cnt': len(samples_lst)
        }
        self.data_dicts[the_name] = dataset_dict
        # Iterations per epoch: samples / (batch_size * #GPUs), rounded up.
        self.iters_cnt[the_name] = np.ceil(
            float(len(samples_lst)) /
            (self.config['batch_size'][the_name] *
             len(self.config['gpu_devices']))).astype('int')
        logger.info('Prepared dataset.', extra={
            'dataset_purpose': the_name,
            'dataset_tag': the_tag,
            'sample_cnt': len(samples_lst)
        })
def _construct_data_loaders(self):
    """Populate `self.data_dicts` / `self.iters_cnt` from pre-grouped samples.

    For each (dataset name, tag) in `self.name_to_tag`, takes the samples
    from `self._deprecated_samples_by_tag`, validates they are non-empty,
    loads image paths / labels / box counts, and records how many
    iterations one epoch needs given batch size and GPU count.
    """
    self.data_dicts = {}
    self.iters_cnt = {}
    for the_name, the_tag in self.name_to_tag.items():
        samples_lst = self._deprecated_samples_by_tag[the_tag]
        # Raises if the tagged dataset is empty.
        supervisely_lib.nn.dataset.ensure_samples_nonempty(
            samples_lst, the_tag, self.project.meta)
        img_paths, labels, num_boxes = load_dataset(
            samples_lst, self.class_title_to_idx, self.project.meta)
        dataset_dict = {
            'img_paths': img_paths,
            'labels': labels,
            'num_boxes': num_boxes,
            'sample_cnt': len(samples_lst)
        }
        self.data_dicts[the_name] = dataset_dict
        # Iterations per epoch: samples / (batch_size * #GPUs), rounded up.
        samples_per_iter = self.config['batch_size'][the_name] * len(
            self.config['gpu_devices'])
        self.iters_cnt[the_name] = math.ceil(
            float(len(samples_lst)) / samples_per_iter)
        logger.info('Prepared dataset.', extra={
            'dataset_purpose': the_name,
            'dataset_tag': the_tag,
            'sample_cnt': len(samples_lst)
        })
def combined_generator(mnist_data, dir_path, mini_batch_size=128):
    """Yield batches over the union of a directory dataset and MNIST.

    MNIST images are rescaled to 45x45 and flattened so they stack with
    the directory dataset; labels are reshaped to (N, 1, 1, 14) batches.

    :param mnist_data: (images, labels) tuple of MNIST samples
    :param dir_path: path understood by dataset_utils.load_dataset
    :param mini_batch_size: samples per yielded batch
    :return: (generator, total_sample_count)
    """
    from dataset_utils import load_dataset, dataset_generator
    from skimage.transform import resize
    from skimage import img_as_ubyte
    import numpy as np

    x, y = load_dataset(dir_path)
    x_mnist, y_mnist = mnist_data

    # Rescale every MNIST image to 45x45 and flatten to match `x`'s layout.
    target_size = (45, 45)
    x_mnist_scaled = np.zeros((len(x_mnist), 45 * 45), dtype=np.uint8)
    for idx, img in enumerate(x_mnist):
        scaled = img_as_ubyte(resize(img, target_size, anti_aliasing=True))
        x_mnist_scaled[idx] = scaled.reshape(45 * 45)

    x = np.vstack((x, x_mnist_scaled))
    y = np.hstack((y, y_mnist))
    total = len(y)

    inner = dataset_generator(x, y, mini_batch_size=mini_batch_size)

    def wrapped():
        # Reshape labels into the (N, 1, 1, 14) layout the consumer expects.
        for x_batch, y_batch in inner:
            yield x_batch, y_batch.reshape((-1, 1, 1, 14))

    return wrapped(), total
def create_text_and_graph_features():
    """
    Using the function in feature_utils.py, we create a dataframe with
    text mining features (interrogative, caps, grammatical, leaky features)
    :return: pandas dataframe for train and test set
    """
    ####################################################
    ### Load dataset
    ####################################################
    df_train, df_test = load_dataset()
    ####################################################
    ### Add graph features
    ####################################################
    df_train, df_test = get_graph_features(df_train, df_test)
    # Keep untouched copies: nouns_features below needs the raw question
    # text, while df_train/df_test get tokenized in place further down.
    df_train_copy, df_test_copy = df_train.copy(), df_test.copy()
    # stopwords
    stops = set(stopwords.words("english"))
    # Add custom features to train/test
    df_train = count_features(df_train)
    df_test = count_features(df_test)
    # questions columns are now list of words for train / test
    df_train['question1'] = df_train['question1'].map(
        lambda x: clean_text(str(x)).split())
    df_train['question2'] = df_train['question2'].map(
        lambda x: clean_text(str(x)).split())
    df_test['question1'] = df_test['question1'].map(
        lambda x: clean_text(str(x)).split())
    df_test['question2'] = df_test['question2'].map(
        lambda x: clean_text(str(x)).split())
    # List of splitted questions
    train_qs = pd.Series(df_train['question1'].tolist() +
                         df_train['question2'].tolist() +
                         df_test['question1'].tolist() +
                         df_test['question2'].tolist())
    # Word -> weight map over the whole corpus (weight from get_weight).
    words = [x for y in train_qs for x in y]
    counts = Counter(words)
    weights = {word: get_weight(count) for word, count in counts.items()}
    ####################################################
    ### Add word features
    ####################################################
    word_features_train = word_features(df_train, stops, weights)
    df_train = pd.concat((df_train, word_features_train), axis=1)
    word_features_test = word_features(df_test, stops, weights)
    df_test = pd.concat((df_test, word_features_test), axis=1)
    # Find nouns (on the untokenized copies saved above)
    train_noun_features = nouns_features(df_train_copy)
    test_noun_features = nouns_features(df_test_copy)
    X_train = pd.concat((df_train, train_noun_features['noun_match']), axis=1)
    X_test = pd.concat((df_test, test_noun_features['noun_match']), axis=1)
    return X_train, X_test
def run(config_file, fold=0):
    """Run validation ('val' mode) for one cross-validation fold.

    Loads the experiment config from `config_file`, restores the trained
    network parameters of the hard-coded best epoch, compiles the Theano
    prediction function, and calls `run_validation` on the fold's
    validation patients.  (Python 2 / Theano / Lasagne.)

    :param config_file: path to a python config module (loaded via imp)
    :param fold: cross-validation fold index (drives the seeded split)
    """
    cf = imp.load_source('cf', config_file)
    print fold
    dataset_root = cf.dataset_root
    # this is seeded, will be identical each time
    train_keys, test_keys = get_split(fold)
    val_data = load_dataset(test_keys, root_dir=dataset_root)
    use_patients = val_data
    experiment_name = cf.EXPERIMENT_NAME
    results_folder = os.path.join(cf.results_dir, "fold%d/" % fold)
    mode = 'val'
    BATCH_SIZE = cf.BATCH_SIZE
    n_repeats = cf.val_num_repeats
    x_sym = T.tensor4()
    nt, net, seg_layer = cf.nt, cf.net, cf.seg_layer
    output_layer = seg_layer
    # NOTE(review): the restored epoch is hard-coded; confirm 299 is the
    # intended checkpoint for every experiment using this script.
    best_epoch = 299
    results_out_folder = results_folder + "ep%03.0d_MA" % (best_epoch)
    if not os.path.isdir(results_out_folder):
        os.mkdir(results_out_folder)
    results_out_folder += "/%s_mirror" % mode
    if not os.path.isdir(results_out_folder):
        os.mkdir(results_out_folder)
    with open(
            os.path.join(results_folder,
                         "%s_Params.pkl" % (experiment_name)), 'r') as f:
        params = cPickle.load(f)
    lasagne.layers.set_all_param_values(output_layer, params)
    print "compiling theano functions"
    # Deterministic prediction unless bayesian (dropout) prediction is on.
    output = softmax_helper(
        lasagne.layers.get_output(output_layer, x_sym,
                                  deterministic=not cf.val_bayesian_prediction,
                                  batch_norm_update_averages=False,
                                  batch_norm_use_averages=False))
    pred_fn = theano.function([x_sym], output)
    # Warm-up call to trigger compilation before the real validation run.
    _ = pred_fn(np.random.random((BATCH_SIZE, 1, 384, 352)).astype(np.float32))
    run_validation(
        pred_fn, results_out_folder, use_patients, BATCH_SIZE=BATCH_SIZE,
        n_repeats=n_repeats, save_segmentation=cf.val_save_segmentation,
        plot_segmentation=cf.val_plot_segmentation, min_size=cf.val_min_size,
        do_mirroring=cf.val_do_mirroring,
        input_img_must_be_divisible_by=cf.val_input_img_must_be_divisible_by,
        preprocess_fn=cf.val_preprocess_fn)
def run_embedding(dataset_name):
    """Run t-SNE embeddings for `dataset_name` over a log-spaced range of
    perplexities, one parallel job per perplexity value."""
    X, y, _ = load_dataset(dataset_name)
    n_samples = X.shape[0]

    # Number of perplexities: N // 2 capped at 1000, but never below 500.
    n_perps = max(500, min(1000, n_samples // 2))
    perps = gen_log_space_float(limit=n_samples, n=n_perps)
    print('Number of perps: ', len(perps))

    Parallel(n_jobs=n_cpus_using)(
        delayed(_run_tsne)(X, perp, i) for i, perp in enumerate(perps))
def main(args):
    ''' main method '''
    # Select GPU only when available and requested via args.cuda.
    device = 'cuda' if torch.cuda.is_available() and args.cuda else 'cpu'
    print("using device {} ...".format(device))
    out = dataset_utils.load_dataset(args)
    if args.load or args.save:
        train_dataset, valid_dataset, train_vocab, output_categories = out
        # train_vocab = vocab.unk_vocab(train_vocab)
    elif args.clean:
        return
    # NOTE(review): if neither load/save nor clean is set, the dataset
    # variables above are never bound and args.train below would raise
    # NameError — presumably the CLI guarantees one of these flags; confirm.
    if args.binary_classifier:
        b_class = args.binary_classifier
        print('converting to a binary problem for class: {}'.format(b_class))
        output_categories = vocab.BinaryVocab(output_categories,
                                              select_class=b_class)
    # Map mutually-exclusive CLI flags onto the model type string.
    model_type = None
    if args.train_bi_lstm:
        model_type = "bilstm_crf"
    elif args.train_elmo_bi_lstm:
        model_type = "elmo_bilstm_crf"
    elif args.train_dictionary:
        model_type = "dictionary"
    if args.train:
        train_bilstm_crf(
            train_dataset=train_dataset,
            test_dataset=valid_dataset,
            vocab=train_vocab,
            tag_vocab=output_categories,
            batch_size=args.batch_size,
            shuffle=args.shuffle,
            num_workers=args.num_workers,
            num_epochs=args.num_epochs,
            embedding_dim=args.embedding_dim,
            hidden_dim=args.hidden_dim,
            learning_rate=args.learning_rate,
            weight_decay=args.weight_decay,
            momentum=args.momentum,
            optimizer_type=args.optimizer_type,
            log_dir=args.log_dir,
            save_dir=args.save_dir,
            model_name=args.model_name,
            model_path=args.model_path,
            sample=args.sample,
            summary_file=args.summary_file,
            model_type=model_type,
            device=device,
            training_threshold=args.training_threshold,
        )
def create_distance_features():
    """Compute common distance features for train and test.

    Runs `extend_with_features` on both dataframes in a two-worker
    process pool, then drops the bookkeeping columns.

    :return: tuple (X_train, X_test) of pandas dataframes
    """
    df_train, df_test = load_dataset()

    # The original leaked the Pool (never closed/joined); the context
    # manager guarantees the worker processes are cleaned up.
    with Pool(2) as p:
        df_train, df_test = p.map(extend_with_features, [df_train, df_test])

    X_train = df_train.drop(
        ['id', 'id1', 'id2', "question1", "question2", 'is_duplicate'],
        axis=1)
    X_test = df_test.drop(['id', 'id1', 'id2', "question1", "question2"],
                          axis=1)
    return X_train, X_test
def update_dataset(name):
    """Switch the active dataset stored in module-level globals.

    Resets the previous state, loads dataset `name`, recomputes the
    pairwise distance matrix, and returns a short "N, D" summary string
    (or a prompt message when no name was given).
    """
    if not name:
        return 'Please select a dataset!'

    _reset()

    global dataset_name, dataX, target_labels, target_names, dists
    dataset_name = name
    dataX, target_labels, target_names = load_dataset(dataset_name)
    # Square matrix of pairwise distances between all samples.
    dists = squareform(pdist(dataX))

    return "N = {}, D = {}".format(*dataX.shape)
def generate_dataset(path, saving_dir):
    """Generate an eye-swapped face dataset.

    Pairs each face image with a shuffled partner, swaps the eyes of each
    pair via extract_eyes / exchange_eye, and saves the modified first
    image. Pairs where either face does not yield exactly two detected
    eyes are skipped.

    :param path: input path understood by load_dataset
    :param saving_dir: output directory for the generated images
    """
    # NOTE(review): [2494:] and fileName = 2494 presumably resume a
    # previously interrupted run at image 2494 — confirm before reusing.
    images = load_dataset(path).astype(np.uint8)[2494:]
    fileName = 2494
    for (first_image, second_image) in zip(images, shuffleArr(images)):
        print("next image")
        first_eyes, second_eyes = extract_eyes(first_image), extract_eyes(
            second_image)
        # Require exactly two eyes detected on both faces.
        if len(first_eyes) != 2 or len(second_eyes) != 2:
            continue
        for (eye_first_person, eye_second_person) in zip(first_eyes,
                                                         second_eyes):
            exchange_eye(first_image, second_image, eye_first_person,
                         eye_second_person)
        save_image(first_image, saving_dir, fileName)
        fileName += 1
def calculate_constraint_score(dataset_name, auto_constraint=True):
    """Score every pre-calculated embedding of `dataset_name` against
    constraints, one parallel job per `.z` embedding file.

    :param dataset_name: dataset identifier understood by load_dataset
    :param auto_constraint: if True score with auto-generated constraints
        (needs the dataset labels), else with the manual ones
    """
    if auto_constraint:
        calculation_function = _calculate_constraint_score
        _, labels, _ = load_dataset(dataset_name)
    else:
        calculation_function = _calculate_manual_constraint
        labels = []

    # prepare a list of pre-calculated tsne object files
    embedding_dir = '{}/{}'.format(output_folder, dataset_name)
    to_process_files = [
        os.path.join(embedding_dir, fname)
        for fname in os.listdir(embedding_dir)
        if fname.endswith('.z')
    ]
    print('{} files to process'.format(len(to_process_files)))

    # setup to run calculation in parallel
    db_name = 'DB_{}'.format(dataset_name)
    Parallel(n_jobs=n_cpus_using)(
        delayed(calculation_function)(db_name, labels, tsne_file)
        for tsne_file in to_process_files)
def update_dataset(name):
    """Load dataset `name` into the module-level globals and return a
    human-readable summary of the data and its pairwise distances.

    :param name: dataset identifier; falsy values return a prompt string
    """
    if not name:
        return 'Please select a dataset!'
    global dataset_name
    global dataX
    global target_labels
    global target_names
    global dists
    dataset_name = name
    dataX, target_labels, target_names = load_dataset(dataset_name)
    # debug the number of classes
    print('Number of class: ', len(np.unique(target_labels)))
    # print(set(target_names))
    # Square matrix of pairwise distances between all samples.
    dists = squareform(pdist(dataX))
    dataset_info = """
    dataX: shape={}, mean={:.3f}, std={:.3f},
    min={:.3f}, max={:.3f},
    min_dist={:.3f}, max_dist={:.3f}
    """.format(dataX.shape, np.mean(dataX), np.std(dataX), np.min(dataX),
               np.max(dataX), np.min(dists), np.max(dists))
    return dataset_info
data_gen_validation = BatchGenerator(val_data, BATCH_SIZE, num_batches=None, seed=False, PATCH_SIZE=INPUT_PATCH_SIZE) val_transforms = [] val_transforms.append( ConvertSegToOnehotTransform(range(4), 0, 'seg_onehot')) data_gen_validation = MultiThreadedAugmenter(data_gen_validation, Compose(val_transforms), 1, 2, [0]) return data_gen_train, data_gen_validation dataset = load_dataset(root_dir=os.environ['PATH_ACDC_3D']) split_seed = 12345 np.random.seed(65432) lasagne.random.set_rng(np.random.RandomState(98765)) sys.setrecursionlimit(2000) BATCH_SIZE = 4 INPUT_PATCH_SIZE = (10, 224, 224) num_classes = 4 num_input_channels = 1 EXPERIMENT_NAME = "UNet3D_final" if not os.path.isdir(os.path.join(os.environ['RESULTS_FOLDER'], "ACDC_lasagne")): os.mkdir(os.path.join(os.environ['RESULTS_FOLDER'], "ACDC_lasagne")) results_dir = os.path.join(os.environ['RESULTS_FOLDER'], "ACDC_lasagne", EXPERIMENT_NAME)
def main(args):
    '''Compute corpus statistics for one entity class.

    Loads the dataset, optionally narrows the tag vocabulary to a binary
    problem for args.binary_classifier, then prints: the fraction of
    sentences containing the class, the entity count, the average entity
    length, and the positive-token rate.
    '''
    device = 'cuda' if torch.cuda.is_available() and args.cuda else 'cpu'
    print("using device {} ...".format(device))
    out = dataset_utils.load_dataset(args)
    if args.load or args.save:
        train_dataset, valid_dataset, train_vocab, output_categories = out
        # train_vocab = vocab.unk_vocab(train_vocab)
    elif args.clean:
        return
    if args.binary_classifier:
        b_class = args.binary_classifier
        print('converting to a binary problem for class: {}'.format(b_class))
        output_categories = vocab.BinaryVocab(output_categories,
                                              select_class=b_class)
    total_tokens = 0
    total_class_tokens = 0

    def explain_labels(
        example: List[str],
        seq_label: List[str],
    ) -> Tuple[List[Tuple[int, int]], List[str]]:
        '''
        Convert a label to a list of word ranges and entities
        word_range[i] = (start: int, end: int) with end exclusive
        entities[i] = str, the entity corresponding to word_range[i]
        '''
        ranges: list = []
        entities: list = []
        range_start: int = None  # None while not inside an entity span
        seq_label = [] if seq_label is None else seq_label
        for i, label in enumerate(seq_label):
            # NOTE(review): when an entity extends through the final token,
            # the closing slice example[range_start:i] excludes that last
            # token (end is exclusive) — looks like an off-by-one; confirm.
            if (label == 'O' or i == len(seq_label) - 1) and range_start is not None:
                ranges.append((
                    range_start,
                    i,
                ))
                entities.append((example[range_start:i]))
                range_start = None
            elif label.startswith('B'):
                # A new B- tag closes any open span and starts a new one.
                if range_start is not None:
                    ranges.append((
                        range_start,
                        i,
                    ))
                    entities.append((example[range_start:i]))
                range_start = i
        return ranges, entities

    # for item in train_dataset.data:
    #     sent = [inp['word'] for inp in item ]
    #     out = item['output']
    #     total_tokens += len(out)
    #     num_pos = 0
    #     for out_i in out:
    #         if len(out_i) > 0 and out_i[2:] == b_class:
    #             num_pos += 1
    #     total_class_tokens += num_pos
    all_ents = []
    has_ents = []
    for item in train_dataset.data:
        sent = [inp['word'] for inp in item['input']]
        out = item['output']
        total_tokens += len(out)
        num_pos = 0
        # Project the label sequence onto the selected class: keep labels
        # whose suffix (after the "B-"/"I-" prefix) equals b_class, map
        # everything else to 'O'.
        act_out = []
        for out_i in out:
            if len(out_i) > 0 and out_i[2:] == b_class:
                num_pos += 1
                act_out.append(out_i)
            else:
                act_out.append('O')
        r, e = explain_labels(sent, act_out)
        has_ents.append(len(e) > 0)
        all_ents.extend(e)
        total_class_tokens += num_pos
    per = sum(has_ents) / len(has_ents)
    print(f'has_ents: {len(has_ents)} ents: {sum(has_ents)} per: {per}')
    print(f'Num Ents: {len(all_ents)}')
    lens = [len(ent) for ent in all_ents]
    avg_size = sum(lens) / len(lens)
    print(f'Average Size: {avg_size}')
    print(
        f'Positive Tokens: {total_class_tokens} | Total: {total_tokens} | Percent: {total_class_tokens / total_tokens}'
    )
import sys
from os import path

import numpy as np

# Make the repository root importable when running this file directly.
# (Was `from os import sys, path` — importing sys through os is accidental.)
sys.path.append(path.dirname(path.dirname(path.abspath(__file__))))
import dataset_utils as du
import pytorch_kit.models as tm
import image_utils as iu

if __name__ == "__main__":
    np.random.seed(0)
    X, y = du.load_dataset("boat_images", as_image=False)

    # TRAIN NETWORK
    model = tm.AttentionModel(n_channels=3, n_outputs=1)
    model.fit(X, y, batch_size=23, epochs=150)

    # Visualize the attention heatmap for one sample.
    show = lambda m, i: iu.show(m.get_heatmap(X)[i], X[i])
    show(model, 1)
    # Removed leftover debugger breakpoint (import pdb; pdb.set_trace()).
def run(config_file, fold=0):
    """Train a 2D segmentation network for one cross-validation fold.

    Loads the experiment config, builds Theano/Lasagne train/val functions
    with a soft-dice loss plus L2 weight decay, and runs the epoch loop
    with on-the-fly augmentation, periodic progress plots, and per-epoch
    checkpointing of parameters and metric histories.
    (Python 2 / Theano / Lasagne.)

    :param config_file: path to a python config module (loaded via imp)
    :param fold: cross-validation fold index (drives the seeded split)
    """
    cf = imp.load_source('cf', config_file)
    print fold
    dataset_root = cf.dataset_root
    # ==================================================================================================================
    BATCH_SIZE = cf.BATCH_SIZE
    INPUT_PATCH_SIZE = cf.INPUT_PATCH_SIZE
    num_classes = cf.num_classes
    EXPERIMENT_NAME = cf.EXPERIMENT_NAME
    results_dir = os.path.join(cf.results_dir, "fold%d/" % fold)
    if not os.path.isdir(results_dir):
        os.mkdir(results_dir)
    n_epochs = cf.n_epochs
    lr_decay = cf.lr_decay
    base_lr = cf.base_lr
    n_batches_per_epoch = cf.n_batches_per_epoch
    n_test_batches = cf.n_test_batches
    n_feedbacks_per_epoch = cf.n_feedbacks_per_epoch
    num_workers = cf.num_workers
    workers_seeds = cf.workers_seeds
    # ==================================================================================================================
    # this is seeded, will be identical each time
    train_keys, test_keys = get_split(fold)
    train_data = load_dataset(train_keys, root_dir=dataset_root)
    val_data = load_dataset(test_keys, root_dir=dataset_root)
    x_sym = cf.x_sym
    seg_sym = cf.seg_sym
    nt, net, seg_layer = cf.nt, cf.net, cf.seg_layer
    output_layer_for_loss = net
    #draw_to_file(lasagne.layers.get_all_layers(net), os.path.join(results_dir, 'network.png'))
    data_gen_validation = BatchGenerator_2D(val_data, BATCH_SIZE,
                                            num_batches=None, seed=False,
                                            PATCH_SIZE=INPUT_PATCH_SIZE)
    data_gen_validation = MultiThreadedAugmenter(
        data_gen_validation,
        ConvertSegToOnehotTransform(range(num_classes), 0, "seg_onehot"),
        1, 2, [0])
    # add some weight decay
    l2_loss = lasagne.regularization.regularize_network_params(
        output_layer_for_loss, lasagne.regularization.l2) * cf.weight_decay
    # the distinction between prediction_train and test is important only if we enable dropout
    prediction_train = lasagne.layers.get_output(
        output_layer_for_loss, x_sym, deterministic=False,
        batch_norm_update_averages=False, batch_norm_use_averages=False)
    loss_vec = -soft_dice(prediction_train, seg_sym)
    loss = loss_vec.mean()
    loss += l2_loss
    acc_train = T.mean(T.eq(T.argmax(prediction_train, axis=1),
                            seg_sym.argmax(-1)), dtype=theano.config.floatX)
    prediction_test = lasagne.layers.get_output(
        output_layer_for_loss, x_sym, deterministic=True,
        batch_norm_update_averages=False, batch_norm_use_averages=False)
    loss_val = -soft_dice(prediction_test, seg_sym)
    loss_val = loss_val.mean()
    loss_val += l2_loss
    acc = T.mean(T.eq(T.argmax(prediction_test, axis=1), seg_sym.argmax(-1)),
                 dtype=theano.config.floatX)
    # learning rate has to be a shared variable because we decrease it with every epoch
    params = lasagne.layers.get_all_params(output_layer_for_loss,
                                           trainable=True)
    learning_rate = theano.shared(base_lr)
    updates = lasagne.updates.adam(T.grad(loss, params), params,
                                   learning_rate=learning_rate, beta1=0.9,
                                   beta2=0.999)
    dc = hard_dice(prediction_test, seg_sym.argmax(1), num_classes)
    train_fn = theano.function([x_sym, seg_sym], [loss, acc_train, loss_vec],
                               updates=updates)
    val_fn = theano.function([x_sym, seg_sym], [loss_val, acc, dc])
    dice_scores = None
    data_gen_train = create_data_gen_train(
        train_data, BATCH_SIZE, num_classes, num_workers=num_workers,
        do_elastic_transform=True, alpha=(100., 350.), sigma=(14., 17.),
        do_rotation=True, a_x=(0, 2. * np.pi), a_y=(-0.000001, 0.00001),
        a_z=(-0.000001, 0.00001), do_scale=True, scale_range=(0.7, 1.3),
        seeds=workers_seeds)  # new se has no brain mask
    all_training_losses = []
    all_validation_losses = []
    all_validation_accuracies = []
    all_training_accuracies = []
    all_val_dice_scores = []
    epoch = 0
    while epoch < n_epochs:
        # Augmentation strength is reduced in two steps (epochs 100, 125).
        if epoch == 100:
            data_gen_train = create_data_gen_train(
                train_data, BATCH_SIZE, num_classes, num_workers=num_workers,
                do_elastic_transform=True, alpha=(0., 250.),
                sigma=(14., 17.), do_rotation=True,
                a_x=(-2 * np.pi, 2 * np.pi), a_y=(-0.000001, 0.00001),
                a_z=(-0.000001, 0.00001), do_scale=True,
                scale_range=(0.75, 1.25),
                seeds=workers_seeds)  # new se has no brain mask
        if epoch == 125:
            data_gen_train = create_data_gen_train(
                train_data, BATCH_SIZE, num_classes, num_workers=num_workers,
                do_elastic_transform=True, alpha=(0., 150.),
                sigma=(14., 17.), do_rotation=True,
                a_x=(-2 * np.pi, 2 * np.pi), a_y=(-0.000001, 0.00001),
                a_z=(-0.000001, 0.00001), do_scale=True,
                scale_range=(0.8, 1.2),
                seeds=workers_seeds)  # new se has no brain mask
        epoch_start_time = time.time()
        # exponential learning rate decay, re-applied every epoch
        learning_rate.set_value(np.float32(base_lr * lr_decay**epoch))
        print "epoch: ", epoch, " learning rate: ", learning_rate.get_value()
        train_loss = 0
        train_acc_tmp = 0
        train_loss_tmp = 0
        batch_ctr = 0
        for data_dict in data_gen_train:
            data = data_dict["data"].astype(np.float32)
            # one-hot seg maps flattened to (voxels, num_classes)
            seg = data_dict["seg_onehot"].astype(np.float32).transpose(
                0, 2, 3, 1).reshape((-1, num_classes))
            # periodic in-epoch progress feedback and plotting
            if batch_ctr != 0 and batch_ctr % int(
                    np.floor(
                        n_batches_per_epoch / n_feedbacks_per_epoch)) == 0:
                print "number of batches: ", batch_ctr, "/", n_batches_per_epoch
                print "training_loss since last update: ", \
                    train_loss_tmp/np.floor(n_batches_per_epoch/(n_feedbacks_per_epoch-1)), " train accuracy: ", \
                    train_acc_tmp/np.floor(n_batches_per_epoch/n_feedbacks_per_epoch)
                all_training_losses.append(
                    train_loss_tmp /
                    np.floor(n_batches_per_epoch /
                             (n_feedbacks_per_epoch - 1)))
                all_training_accuracies.append(
                    train_acc_tmp /
                    np.floor(n_batches_per_epoch /
                             (n_feedbacks_per_epoch - 1)))
                train_loss_tmp = 0
                train_acc_tmp = 0
                if len(all_val_dice_scores) > 0:
                    dice_scores = np.concatenate(
                        all_val_dice_scores,
                        axis=0).reshape((-1, num_classes))
                plotProgress(all_training_losses, all_training_accuracies,
                             all_validation_losses,
                             all_validation_accuracies,
                             os.path.join(results_dir,
                                          "%s.png" % EXPERIMENT_NAME),
                             n_feedbacks_per_epoch,
                             val_dice_scores=dice_scores,
                             dice_labels=["brain", "1", "2", "3", "4", "5"])
            loss_vec, acc, l = train_fn(data, seg)
            loss = loss_vec.mean()
            train_loss += loss
            train_loss_tmp += loss
            train_acc_tmp += acc
            batch_ctr += 1
            if batch_ctr > (n_batches_per_epoch - 1):
                break
        train_loss /= n_batches_per_epoch
        print "training loss average on epoch: ", train_loss
        # ---------------- validation pass ----------------
        val_loss = 0
        accuracies = []
        valid_batch_ctr = 0
        all_dice = []
        for data_dict in data_gen_validation:
            data = data_dict["data"].astype(np.float32)
            seg = data_dict["seg_onehot"].astype(np.float32).transpose(
                0, 2, 3, 1).reshape((-1, num_classes))
            # classes absent from this batch get the sentinel dice value 2
            # and are excluded from the per-class means below
            w = np.zeros(num_classes, dtype=np.float32)
            w[np.unique(seg.argmax(-1))] = 1
            loss, acc, dice = val_fn(data, seg)
            dice[w == 0] = 2
            all_dice.append(dice)
            val_loss += loss
            accuracies.append(acc)
            valid_batch_ctr += 1
            if valid_batch_ctr > (n_test_batches - 1):
                break
        all_dice = np.vstack(all_dice)
        dice_means = np.zeros(num_classes)
        for i in range(num_classes):
            dice_means[i] = all_dice[all_dice[:, i] != 2, i].mean()
        val_loss /= n_test_batches
        print "val loss: ", val_loss
        print "val acc: ", np.mean(accuracies), "\n"
        print "val dice: ", dice_means
        print "This epoch took %f sec" % (time.time() - epoch_start_time)
        all_val_dice_scores.append(dice_means)
        all_validation_losses.append(val_loss)
        all_validation_accuracies.append(np.mean(accuracies))
        dice_scores = np.concatenate(all_val_dice_scores, axis=0).reshape(
            (-1, num_classes))
        plotProgress(all_training_losses, all_training_accuracies,
                     all_validation_losses, all_validation_accuracies,
                     os.path.join(results_dir, "%s.png" % EXPERIMENT_NAME),
                     n_feedbacks_per_epoch, val_dice_scores=dice_scores,
                     dice_labels=["brain", "1", "2", "3", "4", "5"])
        # checkpoint parameters and metric histories every epoch
        with open(
                os.path.join(results_dir,
                             "%s_Params.pkl" % (EXPERIMENT_NAME)), 'w') as f:
            cPickle.dump(
                lasagne.layers.get_all_param_values(output_layer_for_loss),
                f)
        with open(
                os.path.join(results_dir, "%s_allLossesNAccur.pkl" %
                             (EXPERIMENT_NAME)), 'w') as f:
            cPickle.dump([
                all_training_losses, all_training_accuracies,
                all_validation_losses, all_validation_accuracies,
                all_val_dice_scores
            ], f)
        epoch += 1
def load_dataset(self, trainset, partitions=('train', 'validation')):
    """
    Load the files within the dataset in batches and preprocess them.
    Return generators of the preprocessed batches.
    :param trainset: path of a dataset directory on disk; anything that is
        not a directory falls back to the tfds `speech_commands` download
    :param partitions: which of ('train', 'validation', 'test') generators
        to return, always in that order
    :return: list of batch generators, one per requested partition
    """
    if isdir(trainset):  # if the dataset is saved in the file system
        init = time.time()
        x_train, y_train, x_val, y_val, x_test, y_test, info = \
            dataset_utils.load_dataset(trainset,
                                       val_percentage=self.val_percentage,
                                       test_percentage=self.test_percentage)
        self.load_dataset_time = time.time() - init
        # define max_batch_size, test_batch_size, test_steps, t_batch_size, steps_per_epoch, epochs, v_batch_size,
        # validation_steps, depending on the loaded dataset
        # max_batch_size: set a limit of the dimension of the batch size, considering the machine and the set size
        if self.machine == "blade":
            max_batch_size = 10000
        else:
            max_batch_size = 2500
        # test_batch_size: the size of the test
        # test_steps: how many batch used for test, note that if len(x_test) % self.test_batch_size == 0,
        # than the last iteration has no sample and can be avoided.
        self.test_batch_size = min(max_batch_size, int(len(x_test) / 2),
                                   self.test_batch_size)
        self.test_steps = int(len(x_test) / self.test_batch_size) \
            if len(x_test) % self.test_batch_size != 0 \
            else int(len(x_test) / self.test_batch_size) - 1
        # the same as test_batch_size and test_steps
        self.t_batch_size = min(max_batch_size, int(len(x_train) / 2),
                                self.t_batch_size)
        self.steps_per_epoch = int(len(x_train) / self.t_batch_size) \
            if len(x_train) % self.t_batch_size != 0 \
            else int(len(x_train) / self.t_batch_size) - 1
        self.steps_per_epoch = int(
            self.steps_per_epoch /
            10.0)  # is preferred to do more epochs with less steps
        self.epochs *= 10  # to evaluate the validation more frequently
        # the same as test_batch_size and test_steps
        self.v_batch_size = min(max_batch_size, int(len(x_val) / 2),
                                self.v_batch_size)
        self.validation_steps = int(len(x_val) / self.v_batch_size) \
            if len(x_val) % self.v_batch_size != 0 \
            else int(len(x_val) / self.v_batch_size) - 1
        # with the calculated batch_size and steps, the selected wanted_words and the info,
        # create the generator of the dataset, which read the audio and resp. label
        g_train = dataset_utils.dataset_generator(
            x_train, y_train, self.info, self.wanted_words,
            batch_size=self.t_batch_size, tot_size=-1,
            unknown_percentage=self.unknown_percentage)
        g_val = dataset_utils.dataset_generator(
            x_val, y_val, self.info, self.wanted_words,
            batch_size=self.v_batch_size, tot_size=-1,
            unknown_percentage=self.unknown_percentage)
        g_test = dataset_utils.dataset_generator(
            x_test, y_test, self.info, self.wanted_words,
            batch_size=self.test_batch_size, tot_size=-1,
            unknown_percentage=self.unknown_percentage)
        self.info.update(info)
    else:  # if the dataset is downloaded from tfds
        import tensorflow_datasets as tfds
        ds_train, info_train = tfds.load('speech_commands',
                                         split=tfds.Split.TRAIN,
                                         batch_size=self.t_batch_size,
                                         with_info=True)
        ds_val, info_val = tfds.load('speech_commands',
                                     split=tfds.Split.VALIDATION,
                                     batch_size=self.v_batch_size,
                                     with_info=True)
        ds_test, info_test = tfds.load('speech_commands',
                                       split=tfds.Split.TEST,
                                       batch_size=self.test_batch_size,
                                       with_info=True)
        self.info.update(info_train)
        self.info.update(info_val)
        self.info.update(info_test)
        g_train = tfds.as_numpy(ds_train)
        g_val = tfds.as_numpy(ds_val)
        g_test = tfds.as_numpy(ds_test)
    # for each batch of loaded audio and label, yield a batch of preprocessed audio with resp. label
    # used to isolate the preprocessing phase from the load of the audio
    # TODO: len(self..wanted_words) is not supported for tfds in batch_preprocessing_gen
    xy_train = self.batch_preprocessing_gen(g_train, ('audio', 'label'),
                                            len(self.wanted_words))
    xy_val = self.batch_preprocessing_gen(g_val, ('audio', 'label'),
                                          len(self.wanted_words))
    xy_test = self.batch_preprocessing_gen(g_test, ('audio', 'label'),
                                           len(self.wanted_words))
    # Return only the requested partitions, in train/validation/test order.
    out = []
    if 'train' in partitions:
        out.append(xy_train)
    if 'validation' in partitions:
        out.append(xy_val)
    if 'test' in partitions:
        out.append(xy_test)
    return out
exp_dict = json.loads(data_file.read()) info = exp_dict[exp] selection_rules = info["s_rules"] update_rules = info["u_rules"] dataset_name = info["dataset_name"] objective = info["objective"] n_iters = info["n_iters"] + 1 L1 = info["L1"] L2 = info["L2"] title = info["Title"] ylabel = info["ylabel"] xlabel = info["xlabel"] # 1. Load Dataset dataset = du.load_dataset(dataset_name) A = dataset["A"] b = dataset["b"] n_uRules = len(update_rules) n_sRules = len(selection_rules) results = pd.DataFrame() timeResults = {} for s_rule, u_rule in product(selection_rules, update_rules): np.random.seed(1) clf = CDPredictor(selection_rule=s_rule, update_rule=u_rule, L2=L2,
# import os # os.environ["CUDA_VISIBLE_DEVICES"]="-1" from autoencoder import Autoencoder from autogan import AutoGAN import numpy as np from dataset_utils import load_dataset, normalize_tanh import tensorflow as tf #positive = load_dataset("../data/faces_dummy_np")[:600] #negative = load_dataset("../data/cut_n_paste_dummy")[:600] positive = load_dataset("../data/faces_np")[:15000] negative = load_dataset("../data/cut_n_paste_np")[:15000] normalize_tanh(positive) normalize_tanh(negative) print("normalized range", np.min(positive), np.max(positive)) train = (positive, negative) autogan = AutoGAN(128, 128, train, training=True) autogan.create_model() autogan_model = autogan.get_autogan_model() autogan.load_trained("../checkpoints_autogan/epoch20") autogan.train(150, 80)
def cluster_pipeline(dataset, validate, config, dataset_to_predict=None): assert dataset in [dataset_utils.DATASET_ORG, dataset_utils.DATASET_UTIL] assert dataset_to_predict in [ None, dataset_utils.DATASET_VALIDATION, dataset_utils.DATASET_18M ] assert isinstance(validate, bool) print "Loading data..." all_measures, all_labels = dataset_utils.load_dataset(dataset) if not dataset_to_predict == None: all_measures_predict, _ = dataset_utils.load_dataset( dataset_to_predict) print "Finished loading features." result_map = {} measure_names = all_measures.index.levels[1].unique( ) # level[1] is the 'measure' column for measure in measure_names: accuracy_frame = clustering.create_accuracy_frame() threshold = config[dataset][measure] assert not threshold == None features = all_measures.xs(measure, level='measure') labels = all_labels.xs(measure, level='measure') model, scaler, dropped_features, training_accuracy, label_converter = cluster_train( features, labels, threshold) clustering.append_to_accuracy_frame(frame=accuracy_frame, accuracy=training_accuracy, measure=measure, dataset=dataset, context="training", corr_threshold=threshold, dropped=len(dropped_features)) write_low_level_data( features, training_accuracy, "./ll_data/training_%s_%s.csv" % (dataset, measure)) if validate: all_measures_validation, all_labels_validation = dataset_utils.load_dataset( dataset_utils.DATASET_VALIDATION) features = all_measures_validation.xs(measure, level='measure') labels = all_labels_validation.xs(measure, level='measure') validation_accuracy = cluster_validate(features, labels, dropped_features, scaler, model, label_converter) clustering.append_to_accuracy_frame(frame=accuracy_frame, accuracy=validation_accuracy, measure=measure, dataset=dataset, context="validation", corr_threshold=threshold, dropped=len(dropped_features)) write_low_level_data( features, validation_accuracy, "./ll_data/validation_%s_%s.csv" % (dataset, measure)) predicted_series = None if not dataset_to_predict == 
None: features_predict = all_measures_predict.xs(measure, level='measure') predicted_labels = cluster_predict(features=features_predict, to_drop=dropped_features, scaler=scaler, model=model, label_converter=label_converter) predicted_series = pd.Series(data=predicted_labels.values, index=features_predict.index, name=measure) pipeline_output = ClusterPipelineOutput(model, scaler, dropped_features, accuracy_frame, label_converter, predicted_series) result_map[measure] = pipeline_output return result_map
# NOTE(review): the statements below read like the body of a
# `test_discriminator(model, negative, positive)` helper whose `def` header
# was lost when this chunk was extracted -- `model` and `negative` are
# otherwise undefined here. Indentation is reconstructed; confirm against
# the original source.
autogan_model = model
ae_model = autogan_model.get_layer("autoencoder")
# Push a small slice of negatives through the autoencoder, re-normalize the
# reconstructions, then ask the discriminator to score them.
batch = negative[50:66]
ae_out = ae_model.predict(batch)
normalize_tanh(ae_out)
disc_model = autogan_model.get_layer("discriminator")
disc_out = disc_model.predict(ae_out)
print("discriminator output ", disc_out)


if __name__ == '__main__':
    # Load small held-out test sets and normalize into the model's range.
    positive = load_dataset("../data/test_np")[:100]
    negative = load_dataset("../data/test_cut_np")[:100]
    normalize_tanh(positive)
    normalize_tanh(negative)
    # Restore the trained model from its epoch-56 checkpoint.
    filename = "../checkpoints_autogan/epoch56"
    autogan_model = load_model(filename)
    # test the full autogan pipeline
    test_autogan(autogan_model, negative)
    # test autoencoder reconstructions
    test_autoencoder_output(autogan_model, negative)
    # test discriminator scores
    test_discriminator(autogan_model, negative, positive)
exp_dict = json.loads(data_file.read()) info = exp_dict[exp] selection_rules = info["s_rules"] update_rules = info["u_rules"] dataset_name = info["dataset_name"] objective = info["objective"] n_iters = info["n_iters"] + 1 L1 = info["L1"] L2 = info["L2"] title = info["Title"] ylabel = info["ylabel"] xlabel =info["xlabel"] # 1. Load Dataset dataset = du.load_dataset(dataset_name) A = dataset["A"] b = dataset["b"] n_uRules = len(update_rules) n_sRules = len(selection_rules) results = pd.DataFrame() timeResults = {} for s_rule, u_rule in product(selection_rules, update_rules): np.random.seed(1) clf = CDPredictor(selection_rule=s_rule, update_rule=u_rule, L2=L2, L1=L1, objective=objective,
# NOTE(review): the indented lines below are the tail of `gen_figure_data`
# (called by gen_svg_stack further down); its `def` header and the opening
# `if for_COIL:` branch were lost when this chunk was extracted.
# Indentation is reconstructed -- confirm against the original file.
        plt.imsave(figFile, data.reshape(img_size, img_size).T, cmap='gray')
    else:
        # Non-COIL images are not transposed and use a per-class colormap.
        plt.imsave(figFile,
                   data.reshape(img_size, img_size),
                   cmap=cmaps[classId])
    plt.gcf().clear()
    # Rewind the in-memory figure buffer and return it base64-encoded so it
    # can be embedded directly into the SVG output.
    figFile.seek(0)
    return base64.b64encode(figFile.getvalue()).decode('utf-8')


def gen_svg_stack(dataset_name, X, classIds, n, img_size, for_COIL=False):
    """Write ./plots/<dataset_name>.svg embedding the first n images of X.

    One <image> tag per sample; classIds supplies the per-image class used
    to pick a colormap. svgMetaData / svgImgTag are module-level templates
    defined elsewhere in this file.
    """
    outfile = './plots/{}.svg'.format(dataset_name)
    with open(outfile, "w") as svgFile:
        svgFile.write(svgMetaData)
        for i in range(n):
            figData = gen_figure_data(X[i], classIds[i], img_size, for_COIL)
            svgFile.write(svgImgTag.format(i, i, figData))
        svgFile.write("</svg>")


if __name__ == '__main__':
    # Dataset name -> square image side length in pixels.
    datasets = {'MNIST': 28, 'MNIST-SMALL': 8, 'COIL20': 32}
    for dataset_name, img_size in datasets.items():
        X, y, labels = load_dataset(dataset_name)
        # COIL20 images take the transposed/grayscale branch above.
        gen_svg_stack(dataset_name,
                      X,
                      y,
                      len(y),
                      img_size,
                      for_COIL=(dataset_name == 'COIL20'))
from Autoencoder import Autoencoder
import numpy as np
from dataset_utils import load_dataset

# Minimal training script for the Autoencoder model.
# NOTE(review): `np` is imported but unused in this visible chunk.

# Alternative (smaller) dataset, kept for quick experiments:
#x_train = load_dataset("../data/data_np50/1")
x_train = load_dataset("../data/data100_np/small")
print(x_train.shape)

# 100x100 inputs; an autoencoder reconstructs its own input, so the same
# array serves as both source and target.
ae = Autoencoder(100, 100, x_train, x_train)
ae.create_model()
# Optionally resume from a saved checkpoint:
#ae.load_model("hehehe.h5")
# Arguments are presumably (batch_size, epochs, verbose) -- TODO confirm
# against Autoencoder.train_model.
ae.train_model(32, 20, 1)
import sampling_rules as sr
import argparse
import sys
import helpers as hp

# Demo driver for exact decoding / inference / sampling on small graphical
# models (Python 2: uses print statements).
# NOTE(review): `du`, `dr` and `ir` are used below but their imports are not
# visible in this chunk -- presumably lost in extraction.

if __name__ == "__main__":
    argv = sys.argv[1:]
    parser = argparse.ArgumentParser()
    parser.add_argument('-d', '--demo', required=True)
    io_args = parser.parse_args()
    demo = io_args.demo

    if demo == "small":
        # Node potentials, edge potentials, vertex and edge lists.
        n_pot, e_pot, V, E = du.load_dataset("simple_studentScores")
        # Decode: Compute optimal decoding (most likely configuration of the states)
        print dr.decode(n_pot, e_pot, V, E, rule="exact")
        # Infer: Compute Vertex and Edge marginals and Normalizing Constant
        print ir.infer(n_pot, e_pot, V, E, rule="exact")
        # Sample: draw 100 joint samples (transposed to one sample per row).
        dep_samples = sr.sample(100, n_pot, e_pot, V, E).T
        # Display New sampling results
        #ind_samples = sample_independent_potentials(node_pot)
        hp.plot_samples(dep_samples)
    # NOTE(review): the body of this branch was truncated in extraction.
    elif demo == "chain":
def main():
    """Entry point: configure and launch an active-learning NER run from CLI args."""
    args = active_args.get_arg_parser().parse_args()

    # Pick the compute device; fall back to CPU when CUDA is absent or disabled.
    if torch.cuda.is_available() and args.cuda:
        device = 'cuda'
    else:
        device = 'cpu'
    print("using device {} ...".format(device))

    # Resolve the model type from the mutually overriding CLI flags.
    # Priority (highest first): phrase_dictionary > cached > dictionary >
    # bilstm_crf, defaulting to elmo_bilstm_crf.
    if args.train_phrase_dictionary:
        model_type = 'phrase_dictionary'
    elif args.train_cached:
        model_type = 'cached'
    elif args.train_dictionary:
        model_type = 'dictionary'
    elif args.train_bi_lstm:
        model_type = 'bilstm_crf'
    else:
        model_type = 'elmo_bilstm_crf'

    train_dataset, valid_dataset, train_vocab, output_categories = (
        dataset_utils.load_dataset(args, force_load=True))

    # Optionally collapse the tag set to a one-vs-rest binary problem.
    if args.binary_classifier:
        selected_class = args.binary_classifier
        print('converting to a binary problem for class: {}'.format(selected_class))
        output_categories = BinaryVocab(output_categories,
                                        select_class=selected_class)

    # Recorded results from earlier runs (kept for reference):
    # phrase: 69 F1 Drug 791 examples
    # phrase: 58 F1 ADR 791 examples
    # word: 69 F1 Drug 791 examples
    # word: 59 F1 ADR 791 examples

    # Unlabeled pool that the active-learning loop samples from.
    unlabeled_corpus = conlldataloader.ConllDataSetUnlabeled(train_dataset)

    model = utils.build_model(
        model_type=model_type,
        embedding_dim=args.embedding_dim,
        hidden_dim=args.hidden_dim,
        batch_size=args.batch_size,
        vocab=train_vocab,
        tag_vocab=output_categories,
    ).to(device)

    # Cached embedders precompute embeddings for the whole unlabeled pool.
    if model_type == 'cached':
        model.embedder.cache_dataset(unlabeled_corpus, verbose=True,
                                     device=device)

    # Simulated oracle that answers label queries from the ground truth.
    sim_oracle = oracle.SimulatedOracle(train_dataset)

    # Instantiate the requested acquisition heuristic.
    if args.heuristic == constants.ACTIVE_LEARNING_RANDOM_H:
        heuristic = active_heuristic.Random(train_vocab, output_categories)
    elif args.heuristic == constants.ACTIVE_LEARNING_UNCERTAINTY_H:
        heuristic = active_heuristic.Uncertantiy(train_vocab,
                                                 output_categories)
    elif args.heuristic == constants.ACTIVE_LEARNING_KNN:
        heuristic = active_heuristic.KNNEmbeddings(train_vocab,
                                                   output_categories)
        # KNN needs embeddings of the pool before it can rank candidates.
        heuristic.prepare(
            model=model,
            dataset=unlabeled_corpus,
            device=device,
        )
    else:
        raise Exception("Unknown heurisitc: {}".format(args.heuristic))

    active_train(
        log_dir=args.log_dir,
        model=model,
        model_path=args.model_path,
        unlabeled_dataset=unlabeled_corpus,
        test_dataset=valid_dataset,

        # active learning parameters
        iterations=args.iterations,
        heuritic=heuristic,
        oracle=sim_oracle,
        sample_size=args.sample_size,
        sampling_strategy=args.sampling_strategy,

        # train parameters
        vocab=train_vocab,
        tag_vocab=output_categories,
        batch_size=args.batch_size,
        shuffle=args.shuffle,
        num_workers=args.num_workers,
        num_epochs=args.num_epochs,
        learning_rate=args.learning_rate,
        weight_decay=args.weight_decay,
        momentum=args.momentum,
        optimizer_type=args.optimizer_type,

        # Other parameters
        device=device,
        summary_file=args.summary_file,
    )
data_gen_validation = BatchGenerator(val_data, BATCH_SIZE, num_batches=None, seed=False, PATCH_SIZE=INPUT_PATCH_SIZE) val_transforms = [] val_transforms.append( ConvertSegToOnehotTransform(range(4), 0, 'seg_onehot')) data_gen_validation = MultiThreadedAugmenter(data_gen_validation, Compose(val_transforms), 1, 2, [0]) return data_gen_train, data_gen_validation dataset = load_dataset(root_dir=path_acdc_3d) split_seed = 12345 np.random.seed(65432) lasagne.random.set_rng(np.random.RandomState(98765)) sys.setrecursionlimit(2000) BATCH_SIZE = 4 INPUT_PATCH_SIZE = (10, 224, 224) num_classes = 4 num_input_channels = 1 EXPERIMENT_NAME = "UNet3D_final" if not os.path.isdir(os.path.join(results_folder, "ACDC_lasagne")): os.mkdir(os.path.join(results_folder, "ACDC_lasagne")) results_dir = os.path.join(results_folder, "ACDC_lasagne", EXPERIMENT_NAME) if not os.path.isdir(results_dir):