def main():
    """Train the PICO HANN model, then evaluate it on the test split."""
    # NOTE(review): `parser` is not defined inside this function — it is
    # presumably a module-level argparse parser; confirm before running.
    config = Config(parser)
    assert config.data_keyname == 'pico'

    # build the model graph
    model = HANNModel(config)
    model.build()

    # datasets: train/dev drive fitting, test is held out for evaluation
    train = Dataset(config.filename_train, config.processing_word,
                    config.processing_tag)
    dev = Dataset(config.filename_dev, config.processing_word,
                  config.processing_tag)
    test = Dataset(config.filename_test, config.processing_word,
                   config.processing_tag)

    # optional augmentation data, capped at num_augmentation examples
    data_aug = (Dataset(config.filename_aug, config.processing_word,
                        max_iter=config.num_augmentation)
                if config.num_augmentation else None)

    # fit, then restore the saved weights and score the test set
    model.train(train, dev, data_aug)
    model.restore_session(config.dir_model)
    model.evaluate(test)
def main():
    """Procedure to build data.

    You MUST RUN this procedure before training. It iterates over the whole
    dataset (train, dev, test, plus the augmentation set) and extracts the
    vocabularies in terms of words and tags. Having built the vocabularies
    it writes them to files; writing assigns each word an id (its line
    number). It then extracts the relevant pre-trained word vectors and
    stores them in a numpy array such that the i-th entry corresponds to
    the i-th word in the vocabulary.
    """
    # get config and processing of words; load=False because the vocab
    # files this procedure produces do not exist yet
    config = Config(load=False)
    processing_word = get_processing_word(lowercase=True)

    # Generators over the raw data files
    dev = Dataset(config.filename_dev, processing_word)
    test = Dataset(config.filename_test, processing_word)
    train = Dataset(config.filename_train, processing_word)

    # add data augmentation dataset
    data_aug = Dataset(config.filename_aug, processing_word)

    # Build word (with frequency) and tag vocabularies
    vocab_words_freq, vocab_tags = get_vocabs([train, dev, test, data_aug])
    # keep only words whose frequency exceeds the configured minimum
    vocab_words_freq_ = {word: freq
                         for word, freq in vocab_words_freq.items()
                         if freq > config.min_freq}
    # discard() instead of remove(): no KeyError when 'None' is absent
    vocab_tags.discard('None')
    # special tokens always get a vocabulary slot
    vocab_words_freq_.update({UNK: 1, WORD_PAD: 1, NUM: 1})

    # Save vocab
    write_vocab(vocab_words_freq_, config.filename_words)
    write_vocab(vocab_tags, config.filename_tags)

    # Trim the pre-trained vectors down to the saved vocabulary
    vocab, _ = load_vocab(config.filename_words)
    export_trimmed_wordvec_vectors(vocab, config.filename_wordvec,
                                   config.filename_wordvec_trimmed)
log.setLevel(logging.WARNING) # get file paths to DICOM MRI scans and segmentation images # note that leaderboard samples can be used for training train_scan_files = glob('./data/raw/train/**/*.dcm', recursive=True) train_scan_files += glob('../data/raw/leaderboard/**/*.dcm', recursive=True) test_scan_files = glob('../data/raw/test/**/*.dcm', recursive=True) # ProstateDx-01-0006_corrected_label.nrrd was renamed to ProstateDx-01-0006.nrrd # In the leaderboard and test folders the _truth postfix have been removed from all nrrd files train_seg_files = glob('../data/raw/train/**/*.nrrd', recursive=True) train_seg_files += glob('../data/raw/leaderboard/**/*.nrrd', recursive=True) test_seg_files = glob('../data/raw/test/**/*.nrrd', recursive=True) # build datasets from file paths train_dataset = Dataset(scan_files=train_scan_files, seg_files=train_seg_files) test_dataset = Dataset(scan_files=test_scan_files, seg_files=test_seg_files) train_n = len(train_dataset.patient_ids) test_n = len(test_dataset.patient_ids) train_scan_nums = [p.scans.shape[0] for p in train_dataset.patients.values()] test_scan_nums = [p.scans.shape[0] for p in test_dataset.patients.values()] print('Number of patients in train4 dataset: %d' % train_n) print('Number of patients in test dataset: %d' % test_n) print('Number of scans in train dataset: %d' % sum(train_scan_nums)) print('Number of scans in test dataset: %d' % sum(test_scan_nums)) # extract manufacturer and thickness sets from each patient train_manufacturers = [ p.manufacturers for p in train_dataset.patients.values()
def main():
    """Run 10-fold cross-validation for the PICO HANN model.

    Each fold trains a fresh model, evaluates it on the fold's test split,
    and accumulates per-tag precision/recall/F1. After all folds, per-tag
    averages are printed and appended to the results file.
    """
    # create instance of config
    config = Config()
    assert config.data_keyname == 'pico'
    config.num_augmentation = 0
    config.batch_size = 20
    config.batch_size_aug = 20
    config.dir_output = 'test-num_augmentation-{}'.format(
        config.num_augmentation)
    config.dir_model = os.path.join(config.dir_output, "model.weights")
    config.data_root = '../data/{}/10_folds'.format(config.data_keyname)
    result_file_path = os.path.join(config.dir_output,
                                    'cross_validate_results')

    tags = ['P', 'I', 'O']
    precisions = {tag: [] for tag in tags}
    recalls = {tag: [] for tag in tags}
    f1s = {tag: [] for tag in tags}

    def _report(msg):
        # Echo to stdout and append to the cross-validation results file.
        print(msg)
        with open(result_file_path, 'a') as ofile:
            ofile.write(msg)

    for fold in range(1, 11):
        print('Fold {}'.format(fold))

        # build a fresh model for every fold
        model = HANNModel(config)
        model.build()

        # create this fold's datasets
        train = Dataset(os.path.join(config.data_root, str(fold), 'train.txt'),
                        config.processing_word, config.processing_tag)
        dev = Dataset(os.path.join(config.data_root, str(fold), 'dev.txt'),
                      config.processing_word, config.processing_tag)
        test = Dataset(os.path.join(config.data_root, str(fold), 'test.txt'),
                       config.processing_word, config.processing_tag)
        if config.num_augmentation:
            data_aug = Dataset(config.filename_aug, config.processing_word,
                               max_iter=config.num_augmentation)
        else:
            data_aug = None

        # train model
        model.train(train, dev, data_aug)

        # evaluate model on the held-out fold
        model.restore_session(config.dir_model)
        metrics = model.evaluate(test)

        # accumulate per-tag scores (plain loops replace the original
        # side-effect list comprehensions)
        for tag in tags:
            precisions[tag].append(metrics['precision'][tag])
            recalls[tag].append(metrics['recall'][tag])
            f1s[tag].append(metrics['f1'][tag])

        _report('fold: {}\tprecision: {}\trecall: {}\tf1: {}\n'.format(
            fold, metrics['precision'], metrics['recall'], metrics['f1']))

    # cross-fold summary
    _report('Average Precision: P: {}\tI: {}\tO: {}\n'.format(
        np.mean(precisions['P']), np.mean(precisions['I']),
        np.mean(precisions['O'])))
    _report('Average Recall: P: {}\tI: {}\tO: {}\n'.format(
        np.mean(recalls['P']), np.mean(recalls['I']),
        np.mean(recalls['O'])))
    res = np.mean([np.mean(values) for values in f1s.values()])
    _report('Average F1: P: {}\tI: {}\tO: {}\tall: {}\n'.format(
        np.mean(f1s['P']), np.mean(f1s['I']), np.mean(f1s['O']), res))
    with open(result_file_path, 'a') as ofile:
        ofile.write('\n\n\n')
def input_fn(training, params):
    """Input pipeline for the 3D U-Net estimator.

    Builds a ``tf.data`` pipeline for either training or testing and wires
    it into an iterator whose init op is registered under
    ``tf.GraphKeys.TABLE_INITIALIZERS`` so it runs at session start.

    Args:
        training (bool): Whether we are training or testing.
        params (dict): Params for setting up the data. Expected keys are:
            max_scans (int): Maximum number of scans we see in any patient.
            train_img_size (int): Width and height of resized training
                images.
            batch_size (int): Number of patients in each training batch.
            num_classes (int): Number of mutually exclusive output classes.
            train_dataset_path (str): Path to pickled
                :class:`src.data_utils.Dataset` object.
            test_dataset_path (str): Path to pickled
                :class:`src.data_utils.Dataset` object.

    Returns:
        tuple: ``(features, labels)`` where ``features`` holds both the
        inputs ``'x'`` and the labels ``'y'`` — the labels are duplicated
        into the features as a workaround for tf.estimator not returning
        labels at prediction time
        (https://github.com/tensorflow/tensorflow/issues/17824).
    """
    root_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))

    # every 3D scan is padded to this many slices; width/height were
    # already resized to a fixed square in preprocessing
    depth = params['max_scans']
    side = params['train_img_size']

    if training:
        path = os.path.join(root_dir, params['train_dataset_path'])
        ds = Dataset.load_dataset(path).create_tf_dataset()
        # buffer of 70 covers the whole training set -> true shuffle
        ds = ds.shuffle(buffer_size=70).repeat()
        ds = ds.padded_batch(
            batch_size=params['batch_size'],
            padded_shapes=([depth, side, side, 1],
                           [depth, side, side, params['num_classes']]))
    else:
        # a "resized" test set has uniform width/height; otherwise scans
        # keep their original dims, hence batch size 1 with None spatial
        # dims — only the depth dimension is padded to a fixed size
        resized = 'resized' in params['test_dataset_path']
        path = os.path.join(root_dir, params['test_dataset_path'])
        ds = Dataset.load_dataset(path).create_tf_dataset(resized=resized)
        ds = ds.padded_batch(
            batch_size=1,
            padded_shapes=([depth, None, None, 1],
                           [depth, None, None, params['num_classes']]))

    iterator = tf.data.Iterator.from_structure(ds.output_types,
                                               ds.output_shapes)
    init_op = iterator.make_initializer(ds)
    # piggy-back on the table-initializers collection so the estimator
    # initializes the dataset when the session is created
    tf.add_to_collection(tf.GraphKeys.TABLE_INITIALIZERS, init_op)

    scans, labels = iterator.get_next()
    # extremely hacky way of getting tf.estimator to return labels at
    # prediction time: stash them in the features dict too
    features = {'x': scans, 'y': labels}
    return features, labels
def main():
    """Run 10-fold cross-validation for the HANN model on NICTA-PIBOSO.

    Each fold trains a fresh model (with adversarial/VA regularization and
    data augmentation per the config below), evaluates it, and accumulates
    per-tag precision/recall/F1. After all folds, per-tag averages are
    printed and appended to the results file.
    """
    # create instance of config
    config = Config()
    assert config.data_keyname == 'nicta'
    config.num_augmentation = 200000
    config.batch_size = 20
    config.batch_size_aug = 20
    config.attention_size = 50
    config.hidden_size_lstm_document = 200
    config.dropout = 0.8
    config.cnn_filter_num = 150
    config.adv_perturb_norm_length = 4
    config.va_perturb_norm_length = 4
    config.adv_reg_coeff = 0.3
    config.va_reg_coeff = 0.3
    config.data_root = '../data/nicta_piboso/10_folds'
    config.dir_output = 'results/nicta/test-num_augmentation-{}-va_coeff-{}-adv-coeff-{}'.format(
        config.num_augmentation, config.va_reg_coeff, config.adv_reg_coeff)
    config.dir_model = os.path.join(config.dir_output, "model.weights")
    result_file_path = os.path.join(config.dir_output,
                                    'cross_validate_results')

    precisions = defaultdict(list)
    recalls = defaultdict(list)
    f1s = defaultdict(list)
    tag_ls = ['P', 'I', 'O', 'S', 'B', 'OT']

    def _report(msg):
        # Echo to stdout and append to the cross-validation results file.
        print(msg)
        with open(result_file_path, 'a') as ofile:
            ofile.write(msg)

    for fold in range(1, 11):
        print('Fold {}'.format(fold))

        # build a fresh model for every fold
        model = HANNModel(config)
        model.build()

        # create this fold's datasets
        train = Dataset(os.path.join(config.data_root, str(fold), 'train.txt'),
                        config.processing_word, config.processing_tag)
        # NOTE(review): dev is loaded from 'test.txt', so model selection
        # happens on the test split — confirm this is intentional.
        dev = Dataset(os.path.join(config.data_root, str(fold), 'test.txt'),
                      config.processing_word, config.processing_tag)
        test = Dataset(os.path.join(config.data_root, str(fold), 'test.txt'),
                       config.processing_word, config.processing_tag)
        if config.num_augmentation:
            data_aug = Dataset(config.filename_aug, config.processing_word,
                               max_iter=config.num_augmentation)
        else:
            data_aug = None

        # train model
        model.train(train, dev, data_aug)

        # evaluate model
        model.restore_session(config.dir_model)
        metrics = model.evaluate(test)

        # accumulate per-tag scores (plain loops replace the original
        # side-effect list comprehensions)
        for tag in tag_ls:
            precisions[tag].append(metrics['precision_all'][tag])
            recalls[tag].append(metrics['recall_all'][tag])
            f1s[tag].append(metrics['f1_all'][tag])

        _report('fold: {}\tprecision: {}\trecall: {}\tf1: {}\n'.format(
            fold, metrics['precision_all'], metrics['recall_all'],
            metrics['f1_all']))

    # cross-fold summary; trailing '\n' added to each line (it was missing,
    # running lines together in the results file — the PICO variant of this
    # script writes one). The unused overall-F1 mean the original computed
    # but never reported has been dropped.
    _report('Average Precision: {}\n'.format('\t'.join(
        '{}: {}'.format(tag, np.mean(precisions[tag])) for tag in tag_ls)))
    _report('Average Recall: {}\n'.format('\t'.join(
        '{}: {}'.format(tag, np.mean(recalls[tag])) for tag in tag_ls)))
    _report('Average F1: {}\n'.format('\t'.join(
        '{}: {}'.format(tag, np.mean(f1s[tag])) for tag in tag_ls)))
    with open(result_file_path, 'a') as ofile:
        ofile.write('\n\n\n')