def clustering():
    """Cluster all unlabeled texts with KMeans for k = 2..20.

    For each k, fits a k-means++ model on the TF-IDF matrix, records the
    inertia (SSE) and silhouette score, mirrors progress to a timestamped
    log file, and finally pickles all per-k results for later inspection
    by `text_classification`.
    """
    data = prepare_tfidf_data()
    SSE = []
    SilhouetteScores = []
    Labels = []
    with open('data/online_test/clustering_result_%s.txt' % get_time(), 'w') as f:
        for k in range(2, 21):
            print('\nClustering, n_cluster = %d' % k)
            f.write('\nClustering, n_cluster = %d\n' % k)
            # NOTE(review): `n_jobs` was deprecated in scikit-learn 0.23 and
            # removed in 1.0 — keep only if the pinned sklearn still accepts it.
            model = KMeans(n_clusters=k, init='k-means++', n_jobs=-1)
            labels = model.fit(data).labels_
            print(' model is trained, trying to compute silhouette score')
            # Silhouette is O(n^2) in samples; this is the slow step per k.
            score = metrics.silhouette_score(data, labels)
            Labels.append(labels)
            SilhouetteScores.append(score)
            SSE.append(model.inertia_)
            print('SSE: {}\n'.format(SSE))
            print('SilhouetteScores: {}\n'.format(SilhouetteScores))
            f.write(' SSE: {}\n SilhouetteScores: {}\n'.format(
                SSE, SilhouetteScores))
    all_records = {'SSE': SSE, 'scores': SilhouetteScores, 'labels': Labels}
    save_pkl(all_records, 'data/online_test/clustering_records.pkl')
    print('Success!')
# NOTE(review): `cls` as first parameter suggests this is decorated with
# @classmethod in the enclosing class — confirm against the class body.
def from_config(cls, validation_ratio, validation_seed, dir_path, data_sampling_freq,
                start_sampling_freq, end_sampling_freq, start_seq_len, num_channels,
                return_long):
    """Build a (train, validation) dataset pair, using an on-disk cache.

    If a pre-built .npz/.pkl pair matching the configuration exists under
    `dir_path`, it is loaded; otherwise the dataset is constructed from
    scratch and cached. Two dataset instances are then created over the
    shared data and their `data_pointers` are split into train/validation
    shares after a seeded shuffle.

    :param validation_ratio: fraction of pointers assigned to validation.
    :param validation_seed: seed for the numpy shuffle, making the split
        reproducible.
    :returns: (train_dataset, validation_dataset) tuple.
    """
    # Downsampling range must fit inside the raw recording frequency.
    assert end_sampling_freq <= data_sampling_freq
    # Cache filename encodes every parameter that affects the stored arrays.
    target_location = os.path.join(dir_path, '{}c_{}v_{}ss_{}es_{}l.npz'.format(
        num_channels, DATASET_VERSION, start_sampling_freq, end_sampling_freq, start_seq_len))
    if os.path.exists(target_location):
        print('loading dataset from file: {}'.format(target_location))
        given_data = np.load(target_location)
        # given_data becomes [data_pointers, [arr_0, arr_1, ...]] — the pkl
        # holds the pointers, the npz holds the arrays in arr_i order.
        given_data = [load_pkl(target_location + '.pkl'),
                      [given_data['arr_{}'.format(i)] for i in range(len(given_data.keys()))]]
    else:
        print('creating dataset from scratch')
        # First positional arg None signals "build from raw files" — presumably
        # handled inside __init__; verify against the class constructor.
        dataset = cls(None, dir_path, data_sampling_freq, start_sampling_freq,
                      end_sampling_freq, start_seq_len, num_channels, return_long)
        np.savez_compressed(target_location, *dataset.datas)
        save_pkl(target_location + '.pkl', dataset.data_pointers)
        given_data = [dataset.data_pointers, dataset.datas]
    # Two instances share the same backing data; only their pointer lists
    # will differ after the split below.
    return_datasets = []
    for i in range(2):
        return_datasets.append(cls(given_data, dir_path, data_sampling_freq,
                                   start_sampling_freq, end_sampling_freq,
                                   start_seq_len, num_channels, return_long))
    # Copy before shuffling so the second instance's list is untouched.
    data_pointers = [x for x in return_datasets[0].data_pointers]
    np.random.seed(validation_seed)
    np.random.shuffle(data_pointers)
    return_datasets[0].data_pointers = data_pointers[:int((1 - validation_ratio) * len(data_pointers))]
    return_datasets[1].data_pointers = data_pointers[int((1 - validation_ratio) * len(data_pointers)):]
    return return_datasets[0], return_datasets[1]
def save_weight_to_pkl(self):
    """Persist every tensor in ``self.w`` as ``<weight_dir>/<name>.pkl``.

    Creates the weight directory on first use; each variable is evaluated
    (TF-style ``.eval()``) before pickling.
    """
    if not os.path.exists(self.weight_dir):
        os.makedirs(self.weight_dir)
    for name, var in self.w.items():
        save_pkl(var.eval(), os.path.join(self.weight_dir, "%s.pkl" % name))
def getPersons(self):
    """Track annotated persons through every video and pickle the joints.

    Iterates all sequences ('seq01', 'seq02', ...), parses each annotation
    line into (frame_id, rects), builds a 10-frame clip centred on the
    frame (5 backward frames reversed + 5 forward frames), runs the tracker
    on it, and finally saves the accumulated ``joints_dict`` per video.
    """
    for video_id in range(1, self.num_videos + 1):
        # Fresh joint accumulator per video; filled by self.track().
        self.joints_dict = {}
        video_folder = os.path.join(self.dataset_folder, 'seq' + '%02d' % video_id)
        video_id = str(video_id)
        annotation_file = os.path.join(video_folder, 'annotations_new.txt')
        lines = open(annotation_file).readlines()
        img_list = sorted(glob.glob(os.path.join(video_folder, "*.jpg")))
        for line in lines:
            frame_id, rects = self.annotation_parse(line)
            imgs = {}
            # Only frames with 5 predecessors and 4 successors get a clip —
            # assumes frame_id indexes img_list 1-based; TODO confirm.
            if int(frame_id) + 4 <= len(
                    img_list) and int(frame_id) - 5 > 0:
                print video_id, frame_id
                clip_list = img_list[int(frame_id) - 6:int(frame_id) + 4]
                # 'pre' walks backwards in time from the centre frame.
                imgs['pre'] = clip_list[:5][::-1]
                imgs['back'] = clip_list[4:]
                save_path = os.path.join(self.save_folder, video_id, frame_id)
                if not (os.path.exists(save_path)):
                    os.makedirs(save_path)
                # We will track the frames as we load them off of disk
                self.track(rects, imgs, self.tracker, save_path)
        utils.save_pkl(self.joints_dict, os.path.join(self.save_folder,
                                                      'pose_' + video_id))
def _compute_result(self, model_name, data_loader, target, class_weight,
                    inference_fn, save_name, conditional=False):
    """Load model and compute performance with given inference method"""
    # Restore the requested checkpoint into the live network.
    checkpoint = torch.load(os.path.join(self.save_path, model_name))
    self.network.load_state_dict(checkpoint['model'])
    loss, output, feature = self._test(data_loader)
    # Conditional inference functions also need the ground-truth targets.
    predict = inference_fn(output, target) if conditional else inference_fn(output)
    per_class_AP = utils.compute_weighted_AP(target, predict, class_weight)
    mAP = utils.compute_mAP(per_class_AP, self.subclass_idx)
    # Persist raw outputs and metrics for offline analysis.
    utils.save_pkl(
        {
            'output': output.cpu().numpy(),
            'feature': feature.cpu().numpy(),
            'per_class_AP': per_class_AP,
            'mAP': mAP,
        },
        os.path.join(self.save_path, save_name))
    return mAP
def savePointsDict(path):
    """For each camera, pickle a {image name: dot coordinates} mapping.

    Reads the camera's JSON annotation file and writes the mapping next to
    it with the same basename and a ``.pkl`` extension.
    """
    for camera in camera_list:
        ann = ut.load_json(path + "CompleteAnnotations_2016-07-11/%s.json" % camera)
        points = {}
        for dots in ann["dots"]:
            points[dots["imName"]] = dots["xy"]
        ut.save_pkl(path + "CompleteAnnotations_2016-07-11/%s.pkl" % camera,
                    points)
def train(args):
    """End-to-end training entry point driven by parsed CLI ``args``.

    Seeds RNGs if requested, builds the dataloader, pickles the vocabulary
    to ``save_dir`` for later inference, constructs the model and Trainer,
    and runs the training loop.
    """
    devices = utils.get_devices(args.gpu)
    if args.seed is not None:
        utils.manual_seed(args.seed)
    logging.info("Loading data...")
    dataloader = create_dataloader(args)
    vocab = dataloader.dataset.vocab
    # Persist the vocab so inference can reconstruct the same index mapping.
    utils.save_pkl(vocab, os.path.join(args.save_dir, "vocab.pkl"))
    logging.info("Initializing training environment...")
    mdl = prepare_model(args, dataloader)
    optimizer_cls = get_optimizer_cls(args)
    trainer = Trainer(model=utils.to_device(mdl, devices),
                      device=devices[0],
                      vocab=vocab,
                      epochs=args.epochs,
                      save_dir=args.save_dir,
                      save_period=args.save_period,
                      optimizer_cls=optimizer_cls,
                      tensor_key="tensor",
                      samples=args.samples,
                      show_progress=args.show_progress,
                      kld_annealing=args.kld_annealing,
                      # Dynamic batching is needed if either RNN side is dynamic.
                      dynamic_rnn=mdl.encoder.rnn.dynamic or mdl.decoder.rnn.dynamic)
    report_model(trainer)
    # FIX: corrected log-message typo ("Commecing" -> "Commencing").
    logging.info("Commencing training...")
    trainer.train(dataloader)
    logging.info("Done!")
def evaluate(self):
    """Run the network over the test split, collecting activity predictions
    and person-interaction features.

    Accumulates per-clip predictions/targets for ``self.show()`` and dumps
    the first ~100 clips' HPIM features to ``HPIM_f_dict`` via
    ``utils.save_pkl`` before stopping early.
    """
    self.preds = []
    self.targets = []
    self.T = self.num_frames['test']
    HPIM_f_dict = {}
    with torch.no_grad():
        for i, data in enumerate(self.data_loaders['test']):
            inputs = data[0].cuda()
            labels = data[-2]
            labels = labels.view(-1, *labels.size()[2:])
            labels = torch.split(labels, 1, dim=-1)
            # Second label column holds the activity target for the clip.
            target = labels[1].squeeze()[0]
            img_paths = data[-1]
            # compute output
            # BUG FIX: the feature dict below previously stored `HPIM_f`,
            # a name that was never bound (the unpack used `PIM_f`), which
            # raised NameError at runtime. Bind the person-interaction
            # feature under the name actually used.
            _, _, activity_scores, BIM_f, HPIM_f = self.net(inputs, 'test')
            # Average frame-level scores over the T test frames.
            activity_scores = activity_scores.view(self.T, -1).mean(dim=0)
            pred = torch.argmax(activity_scores, -1)
            self.preds.append(pred.cpu())
            self.targets.append(target)
            # clip_id = "<video>/<clip>" extracted from the image path.
            clip_id = '/'.join(img_paths[0][0].split('/')[4:6])
            HPIM_f_dict[clip_id] = HPIM_f.cpu()
            # Early stop: dump features for the first 100 clips only.
            if i == 100:
                utils.save_pkl(HPIM_f_dict, 'HPIM_f_dict')
                HPIM_f_dict = {}
                break
    self.show()
def extract_candidates(predictions_scan, annotations, tf_matrix, pid, outputs_path):
    """Detect nodule-candidate blobs and label them against annotations.

    Runs difference-of-Gaussian blob detection on the prediction volume,
    marks each blob as a true detection if it is the nearest blob within
    an annotated nodule's radius, maps all blobs back to the original
    voxel space via ``tf_matrix``, and pickles the result per patient.
    """
    print 'computing blobs'
    start_time = time.time()
    blobs = blobs_detection.blob_dog(predictions_scan[0, 0], min_sigma=1,
                                     max_sigma=15, threshold=0.1)
    print 'blobs computation time:', (time.time() - start_time) / 60.
    print 'n_blobs detected', len(blobs)
    correct_blobs_idxs = []
    for zyxd in annotations:
        # Annotation is (z, y, x, diameter); compare against squared radius.
        r = zyxd[-1] / 2.
        distance2 = ((zyxd[0] - blobs[:, 0]) ** 2 +
                     (zyxd[1] - blobs[:, 1]) ** 2 +
                     (zyxd[2] - blobs[:, 2]) ** 2)
        blob_idx = np.argmin(distance2)
        print 'node', zyxd
        print 'closest blob', blobs[blob_idx]
        if distance2[blob_idx] <= r ** 2:
            correct_blobs_idxs.append(blob_idx)
        else:
            print 'not detected !!!'
    # we will save blobs in the voxel space of the original image
    # blobs that are true detections will have blobs[-1] = 1 else 0
    blobs_original_voxel_coords = []
    for j in xrange(blobs.shape[0]):
        # Homogeneous coordinate for the affine transform back to原 space,
        # then the 4th component is reused as the true-detection flag.
        blob_j = np.append(blobs[j, :3], [1])
        blob_j_original = tf_matrix.dot(blob_j)
        blob_j_original[-1] = 1 if j in correct_blobs_idxs else 0
        if j in correct_blobs_idxs:
            print 'blob in original', blob_j_original
        blobs_original_voxel_coords.append(blob_j_original)
    blobs = np.asarray(blobs_original_voxel_coords)
    utils.save_pkl(blobs, outputs_path + '/%s.pkl' % pid)
def text_to_word(text, save_dir):
    """Tokenise reviews into words, build a 40k-word dictionary, and save
    the encoded arrays plus the label->word dictionary.

    :param text: iterable of raw review strings.
    :param save_dir: output root; arrays go to ``save_dir/npz`` and the
        dictionary to ``save_dir/dictionary``.
    """
    logger = set_logger('word-process')
    sp_text = [line.split() for line in text]
    logger.info('Set Dictionary.')
    # FIX: the original `if word not in unq_word` membership test on a list
    # made this O(n^2) over the corpus; dict.fromkeys deduplicates with O(1)
    # lookups while preserving first-seen order, so the result is unchanged.
    unq_word = list(dict.fromkeys(
        word for line in tqdm(sp_text) for word in line))
    logger.info('# of unique Word : {}\texample : {}'.format(
        len(unq_word), random.sample(unq_word, 50)))
    # Strip punctuation/digits; drop words that become empty.
    hangul = re.compile('[-=.#/?:^~!$}0-9]')
    all_words = []
    for line in tqdm(sp_text):
        for word in line:
            word = hangul.sub('', word)
            if word:
                all_words.append(word)
    word_count = {}
    for word in all_words:
        if word in word_count:
            word_count[word] += 1
        else:
            word_count[word] = 1
    # Keep the 40k most frequent words; labels start at 1 (0 is implicit pad).
    sorted_words = sorted(word_count.items(),
                          key=lambda kv: -kv[1])[:40000]
    label_word = {i + 1: ch[0] for i, ch in enumerate(sorted_words)}
    word_label = {y: x for x, y in label_word.items()}
    x = np.asarray([[word_label[w] for w in sent if w in word_label]
                    for sent in sp_text])
    # NOTE(review): assumes exactly 45k negative followed by 45k positive
    # reviews in `text` — confirm against the upstream corpus loader.
    y_neg = [[1, 0] for _ in range(45000)]
    y_pos = [[0, 1] for _ in range(45000)]
    y = np.asarray(y_neg + y_pos)
    # Identical seed before each shuffle keeps x and y aligned.
    np.random.seed(618); np.random.shuffle(x)
    np.random.seed(618); np.random.shuffle(y)
    # Check Folder
    folder_check(dir_path=save_dir, dir_name='npz')
    folder_check(dir_path=save_dir, dir_name='dictionary')
    # Save Array & Dictionary
    save_npz(npz_path=save_dir + '/npz', npz_name='x_word.npz', arr=x)
    save_npz(npz_path=save_dir + '/npz', npz_name='y_word.npz', arr=y)
    save_pkl(pkl_path=save_dir + '/dictionary', pkl_name='dictionary_word.pkl',
             save_object=label_word)
    return None
def make_luna_validation_split():
    """Split the LUNA patient ids into train/valid sets and pickle them.

    The RNG is seeded with a fixed constant so the split is reproducible.
    """
    luna_path = pathfinder.LUNA_DATA_PATH
    file_list = sorted(glob.glob(luna_path + "/*.mhd"))
    random.seed(317070)
    all_pids = [utils_lung.extract_pid_filename(f) for f in file_list]
    n_valid = int(VALIDATION_SET_SIZE * len(file_list))
    validation_pids = random.sample(all_pids, n_valid)
    train_pids = list(set(all_pids) - set(validation_pids))
    split = {'valid': validation_pids, 'train': train_pids}
    utils.save_pkl(split, pathfinder.LUNA_VALIDATION_SPLIT_PATH)
def save_weights(self, weight_dir):
    """Write each weight in ``self.w`` to ``<weight_dir>/<name>.pkl``.

    Creates ``weight_dir`` if it does not exist yet.
    """
    print('Saving weights to %s ...' % weight_dir)
    if not os.path.exists(weight_dir):
        os.makedirs(weight_dir)
    for name, var in self.w.items():
        save_pkl(var.eval(), os.path.join(weight_dir, "%s.pkl" % name))
def get_svm_model(X_train, y_train, retrain=False):
    """Return a fitted SVC, reusing the cached pickle unless retrain is set."""
    from sklearn.svm import SVC
    if not retrain:
        cached = load_pkl(SVM_MODEL_PATH)
        if cached is not None:
            return cached
    print("start training SVM model...")
    model = SVC()
    model.fit(X_train, y_train)
    save_pkl(SVM_MODEL_PATH, model)
    return model
def get_mnb_model(X_train, y_train, retrain=False):
    """Return a fitted MultinomialNB, reusing the cached pickle unless retrain is set."""
    from sklearn.naive_bayes import MultinomialNB
    if not retrain:
        cached = load_pkl(MNB_MODEL_PATH)
        if cached is not None:
            return cached
    print("start training MNB model...")
    model = MultinomialNB(alpha=1, fit_prior=True)
    model.fit(X_train, y_train)
    save_pkl(MNB_MODEL_PATH, model)
    return model
def get_lr_model(X_train, y_train, retrain=False):
    """Return a fitted LogisticRegression, reusing the cached pickle unless retrain is set."""
    from sklearn.linear_model import LogisticRegression
    if not retrain:
        cached = load_pkl(LR_MODEL_PATH)
        if cached is not None:
            return cached
    print("start training LR model...")
    model = LogisticRegression(max_iter=200, n_jobs=-1)
    model.fit(X_train, y_train)
    save_pkl(LR_MODEL_PATH, model)
    return model
def dump_features_all_item(self, name):
    """Dump per-user fusion features (candidate items + logits) for a split.

    Unions the candidate item sets of every base model listed in
    ``run_for_fuse.all_res``, scores those candidates with this model,
    and pickles ``[users, items, logits]`` for downstream ensembling.

    :param name: split name, one of 'vali' | 'test' | 'train'.
    :returns: path of the dumped pickle file.
    """
    if name == 'vali':
        data = self.data.vali_batch
    elif name == 'test':
        data = self.data.test_batch
    elif name == 'train':
        data = self.data.train_batch
    else:
        raise Exception(f'unknown name: {name}')
    users = []
    items = []
    logits = []
    # Deferred import: run_for_fuse lists the base models to fuse over.
    from run_for_fuse import all_res
    fn_list = all_res.keys()
    user2items = None
    # Union each base model's candidate items per user; all models must
    # cover exactly the same user set (asserted below).
    for fn in fn_list:
        _users, _items_list, _ = utils.load_pkl(
            f'{utils.for_fuse_dir}/{fn}_{name}')
        if user2items is None:
            user2items = {}
            for _u, _items in zip(_users, _items_list):
                user2items[_u] = set(_items)
        else:
            assert set(user2items.keys()) == set(_users)
            for _u, _items in zip(_users, _items_list):
                user2items[_u] |= set(_items)
    pbar = tqdm(desc=f'dump {name}, predicting...', leave=False)
    for pv in self.model.predict(data):
        pbar.update(1)
        users.extend(pv.user.tolist())
        for i in range(len(pv.user)):
            user = pv.user[i]
            # Sorted for a deterministic item order per user.
            _items_i = sorted(user2items[user])
            items.append(_items_i)
            # Gather this model's scores for exactly the unioned candidates.
            logits.append(pv.all_scores[i, _items_i].tolist())
    pbar.close()
    feat = [users, items, logits]
    fn = f'{utils.for_fuse_dir}/union_{args.msg}_{name}'
    print(f'{utils.get_time_str()} dump file {fn}')
    utils.save_pkl(feat, fn)
    print(f'{utils.get_time_str()} dump file {fn} over')
    return fn
def OnSave(self, event):
    """Persist the currently labeled text, replacing any earlier copy.

    New texts are appended to the annotation history; an already-saved
    text overwrites its previous entry in place. Nothing happens when no
    sentences have been labeled yet.
    """
    sents = self.cur_text['sents']
    if not sents:
        return
    text_ind = self.cur_text['text_ind']
    history = self.agent.history_texts
    if text_ind not in history:
        self.data.append(sents)
        history.append(text_ind)
    else:
        print('\nText%d exists and it is replaced now.\n' % text_ind)
        self.data[history.index(text_ind)] = sents
    print('Saving data to %s' % self.output_file_name)
    save_pkl([history, self.data], self.output_file_name)
def save_train_validation_ids(filename, data_path):
    """Split patient study directories into train/valid id lists and pickle them.

    Patients are ordered by the numeric id embedded in their directory
    path; the validation fold comes from ``get_cross_validation_indices``
    with fold index 0, and the remainder is the training set.

    :returns: the ``{'train': [...], 'valid': [...]}`` dict that was saved.
    """
    # Sort by the numeric patient id in the path, not lexicographically.
    patient_dirs = sorted(glob.glob(data_path + "/*/study/"),
                          key=lambda folder: int(re.search(r'/(\d+)/', folder).group(1)))
    dirs_indices = range(0, len(patient_dirs))
    valid_dirs_indices = get_cross_validation_indices(indices=dirs_indices,
                                                      validation_index=0)
    # Training indices are everything not in the validation fold; note the
    # set difference does not guarantee a particular ordering.
    train_patient_indices = list(set(dirs_indices) - set(valid_dirs_indices))
    train_patient_dirs = [utils.get_patient_id(patient_dirs[idx])
                          for idx in train_patient_indices]
    validation_patient_dirs = [utils.get_patient_id(patient_dirs[idx])
                               for idx in valid_dirs_indices]
    d = {'train': train_patient_dirs, 'valid': validation_patient_dirs}
    utils.save_pkl(d, filename)
    print 'train-valid patients split saved to', filename
    return d
def OnQuit(self, event):
    """Prompt for a filename, optionally save the annotations, and exit.

    Shows a text-entry dialog pre-filled with the current output file
    name. On OK with a non-empty name the annotation history and data are
    pickled; on Cancel (or empty name) nothing is saved. The app exits
    either way.
    """
    dlg = wx.TextEntryDialog(
        None,
        "Enter a name for saving the annotation data and press 'OK' ('Cancel' for no saving)!",
        'Message Window', self.output_file_name, wx.OK | wx.CANCEL)
    if dlg.ShowModal() == wx.ID_OK:
        filename = dlg.GetValue()
        if filename:
            save_pkl([self.agent.history_texts, self.data], filename)
    wx.Exit()
def extract_candidates(predictions_scan, tf_matrix, pid, outputs_path):
    """Detect candidate blobs and save them in original voxel coordinates.

    Unlabeled variant of the candidate extractor: runs difference-of-
    Gaussian blob detection on the prediction volume, maps each blob
    centre back through the affine ``tf_matrix``, and pickles the result
    per patient. No annotation matching is performed here.
    """
    print 'computing blobs'
    start_time = time.time()
    blobs = blobs_detection.blob_dog(predictions_scan[0, 0], min_sigma=1,
                                     max_sigma=15, threshold=0.1)
    print 'blobs computation time:', (time.time() - start_time) / 60.
    print 'n blobs detected:', blobs.shape[0]
    blobs_original_voxel_coords = []
    for j in xrange(blobs.shape[0]):
        # Append 1 to make a homogeneous coordinate for the affine map.
        blob_j = np.append(blobs[j, :3], [1])
        blob_j_original = tf_matrix.dot(blob_j)
        blobs_original_voxel_coords.append(blob_j_original)
    blobs = np.asarray(blobs_original_voxel_coords)
    print blobs.shape
    utils.save_pkl(blobs, outputs_path + '/%s.pkl' % pid)
def test(self):
    """Evaluate on the color and gray test sets, pickle both result dicts,
    and write the accuracies to ``test_result.txt``."""
    color_result = self._test(self.test_color_loader)
    gray_result = self._test(self.test_gray_loader)
    # Persist full result dicts for offline inspection.
    utils.save_pkl(color_result,
                   os.path.join(self.save_path, 'test_color_result.pkl'))
    utils.save_pkl(gray_result,
                   os.path.join(self.save_path, 'test_gray_result.pkl'))
    # Summarize class/domain accuracy for both domains.
    info = ('Test on color images accuracy: {}, domain accuracy; {}\n'
            'Test on gray images accuracy: {}, domain accuracy: {}'.format(
                color_result['class_accuracy'],
                color_result['domain_accuracy'],
                gray_result['class_accuracy'],
                gray_result['domain_accuracy']))
    utils.write_info(os.path.join(self.save_path, 'test_result.txt'), info)
def _load_json_emb(self):
    """Load the graph2vec embedding dict, caching it as a pickle.

    On a cache hit the pickle under the save path is returned; otherwise
    the embeddings text file (JSON) is parsed and pickled for next time.
    """
    fn = get_save_path() + '/{}_graph2vec_json_dict.pkl'.format(
        self.dataset)
    if isfile(fn):
        with open(fn, 'rb') as handle:
            # NOTE(review): load_pkl/save_pkl here take an open file handle,
            # unlike the path-based save_pkl used elsewhere — confirm the
            # project util's signature.
            d = load_pkl(handle)
            print('Loaded json dict from {}'.format(fn))
            return d
    dfn = get_model_path(
    ) + '/graph2vec_tf/embeddings/{}_train_test_dims_{}_epochs_1000_lr_0.3_embeddings.txt'.format(
        self.dataset, self.dim)
    with open(dfn) as json_data:
        d = json.load(json_data)
    # Cache the parsed dict so subsequent calls skip the JSON parse.
    with open(fn, 'wb') as handle:
        save_pkl(d, handle)
    print('Loaded json dict from {}\nSaved to {}'.format(dfn, fn))
    return d
def _load_sim_mat(self):
    """Load or compute the test-vs-train similarity matrix, with caching.

    Currently only dot-product similarity is supported; any other
    ``self.sim`` raises RuntimeError.
    """
    # NOTE(review): the cache file carries a .npy extension but is read and
    # written via load_pkl/save_pkl on file handles (pickle), not np.load —
    # confirm this matches the project's pkl helpers.
    fn = get_result_path() + '/{}/sim/{}_graph2vec_dim_{}_sim_{}.npy'.format( \
        self.dataset, self.dataset, self.dim, self.sim)
    if isfile(fn):
        with open(fn, 'rb') as handle:
            sim_mat = load_pkl(handle)
            print('Loaded sim mat from {}'.format(fn))
            return sim_mat
    train_emb = self._load_emb(True)
    test_emb = self._load_emb(False)
    if self.sim == 'dot':
        # Rows: test graphs; columns: train graphs.
        sim_mat = test_emb.dot(train_emb.T)
    else:
        raise RuntimeError('Unknown sim {}'.format(self.sim))
    with open(fn, 'wb') as handle:
        save_pkl(sim_mat, handle)
    print('Saved sim mat {} to {}'.format(sim_mat.shape, fn))
    return sim_mat
def prepare_tfidf_data():
    """Return the TF-IDF matrix for all texts, building and caching it on
    first use (max 20k features, English stop words removed)."""
    filename = 'data/online_test/tf_idf_data_max_feature20000.pkl'
    if os.path.exists(filename):
        print('Loading data from %s' % filename)
        return load_pkl(filename)
    texts = load_pkl('data/home_and_garden_500_words_with_title.pkl')
    print('Vectorizing texts ...')
    vectorizer = TfidfVectorizer(decode_error='ignore',
                                 stop_words='english',
                                 max_features=20000)
    # Each document is its pre-tokenized sentence list joined with spaces.
    tf_idf = vectorizer.fit_transform([' '.join(t['sent']) for t in texts])
    print('n_samples: %d n_features: %d' % tf_idf.shape)
    save_pkl(tf_idf, filename)
    return tf_idf
def test2():
    """Collect each LUNA scan's physical size (shape * spacing, in mm) and
    pickle the pid -> mm-shape mapping, checkpointing every 50 scans."""
    image_dir = utils.get_dir_path('analysis', pathfinder.METADATA_PATH)
    luna_data_paths = utils_lung.get_patient_data_paths(pathfinder.LUNA_DATA_PATH)
    luna_data_paths = [p for p in luna_data_paths if '.mhd' in p]
    print len(luna_data_paths)
    pid2mm_shape = {}
    for k, p in enumerate(luna_data_paths):
        img, origin, spacing = utils_lung.read_mhd(p)
        id = os.path.basename(p).replace('.mhd', '')
        # Physical extent in millimetres along each axis.
        mm_shape = img.shape * spacing
        pid2mm_shape[id] = mm_shape
        print k, id, mm_shape
        # Periodic checkpoint so a crash loses at most 50 scans of work.
        if k % 50 == 0:
            print 'Saved'
            utils.save_pkl(pid2mm_shape, image_dir + '/pid2mm.pkl')
    utils.save_pkl(pid2mm_shape, image_dir + '/pid2mm.pkl')
def test(self):
    """Evaluate the best checkpoint on the dev and test splits.

    Loads ``best.pth``, runs the identical evaluation pipeline on both
    splits (previously two copy-pasted blocks, now one helper), pickles
    each split's result dict, and writes both mAPs to ``result.txt``.
    """
    state_dict = torch.load(os.path.join(self.save_path, 'best.pth'))
    self.load_state_dict(state_dict)
    dev_mAP = self._eval_split(self.dev_loader, self.dev_target,
                               self.dev_class_weight, 'dev_result.pkl')
    test_mAP = self._eval_split(self.test_loader, self.test_target,
                                self.test_class_weight, 'test_result.pkl')
    # Output the mean AP for the best model on dev and test set
    info = ('Dev mAP: {}\n'
            'Test mAP: {}'.format(dev_mAP, test_mAP))
    utils.write_info(os.path.join(self.save_path, 'result.txt'), info)

def _eval_split(self, loader, target, class_weight, save_name):
    """Run ``_test`` on one loader, compute AP metrics, pickle the full
    result dict under ``save_name``, and return the mAP."""
    class_loss, domain_loss, class_output, domain_output, \
        feature, domain_accuracy = self._test(loader)
    predict_prob = self.inference(class_output)
    per_class_AP = utils.compute_weighted_AP(target, predict_prob,
                                             class_weight)
    mAP = utils.compute_mAP(per_class_AP, self.subclass_idx)
    result = {
        'output': class_output.cpu().numpy(),
        'feature': feature.cpu().numpy(),
        'per_class_AP': per_class_AP,
        'mAP': mAP,
        'domain_output': domain_output.cpu().numpy(),
        'domain_accuracy': domain_accuracy
    }
    utils.save_pkl(result, os.path.join(self.save_path, save_name))
    return mAP
def test2():
    """Duplicate of the pid->mm-shape collector above: computes each LUNA
    scan's physical size (shape * spacing, mm) and pickles the mapping,
    checkpointing every 50 scans."""
    image_dir = utils.get_dir_path('analysis', pathfinder.METADATA_PATH)
    luna_data_paths = utils_lung.get_patient_data_paths(
        pathfinder.LUNA_DATA_PATH)
    luna_data_paths = [p for p in luna_data_paths if '.mhd' in p]
    print len(luna_data_paths)
    pid2mm_shape = {}
    for k, p in enumerate(luna_data_paths):
        img, origin, spacing = utils_lung.read_mhd(p)
        id = os.path.basename(p).replace('.mhd', '')
        # Physical extent in millimetres along each axis.
        mm_shape = img.shape * spacing
        pid2mm_shape[id] = mm_shape
        print k, id, mm_shape
        # Periodic checkpoint so a crash loses at most 50 scans of work.
        if k % 50 == 0:
            print 'Saved'
            utils.save_pkl(pid2mm_shape, image_dir + '/pid2mm.pkl')
    utils.save_pkl(pid2mm_shape, image_dir + '/pid2mm.pkl')
def text_to_phoneme(text, save_dir):
    """Decompose reviews into Hangul phonemes, build a phoneme dictionary,
    and save the encoded arrays plus the label->phoneme dictionary.

    :param text: iterable of raw review strings.
    :param save_dir: output root; arrays go to ``save_dir/npz`` and the
        dictionary to ``save_dir/dictionary``.
    """
    logger = set_logger('phoneme-process')
    sp_text = []
    # Keep only Hangul jamo/syllable characters after syllable splitting.
    hangul = re.compile('[^\u3131-\u3163\uac00-\ud7a3]+')
    for split in text:
        review = hangul.sub(' ', split_syllables(split))
        if len(review) != 0:
            sp_text.append(review)
    logger.info('Set Dictionary.')
    # FIX: the original `if phoneme not in unq_phoneme` membership test on a
    # list made this O(n^2) over the corpus; dict.fromkeys deduplicates with
    # O(1) lookups while preserving first-seen order, so the resulting
    # dictionaries are unchanged.
    unq_phoneme = list(dict.fromkeys(
        ph for line in tqdm(sp_text) for ph in line))
    logger.info('# of unique Phoneme : {}\nexample : {}'.format(
        len(unq_phoneme), unq_phoneme[:50]))
    # Labels start at 1 (0 is implicit pad).
    phoneme_label = {ch: i + 1 for i, ch in enumerate(unq_phoneme)}
    label_phoneme = {i + 1: ch for i, ch in enumerate(unq_phoneme)}
    x = np.asarray([[phoneme_label[w] for w in sent if w in phoneme_label]
                    for sent in sp_text])
    # NOTE(review): assumes exactly 45k negative followed by 45k positive
    # reviews in `text` — confirm against the upstream corpus loader.
    y_neg = [[1, 0] for _ in range(45000)]
    y_pos = [[0, 1] for _ in range(45000)]
    y = np.asarray(y_neg + y_pos)
    # Identical seed before each shuffle keeps x and y aligned.
    np.random.seed(618); np.random.shuffle(x)
    np.random.seed(618); np.random.shuffle(y)
    # Check Folder
    folder_check(dir_path=save_dir, dir_name='npz')
    folder_check(dir_path=save_dir, dir_name='dictionary')
    # Save Array & Dictionary
    save_npz(npz_path=save_dir + '/npz', npz_name='x_phoneme.npz', arr=x)
    save_npz(npz_path=save_dir + '/npz', npz_name='y_phoneme.npz', arr=y)
    save_pkl(pkl_path=save_dir + '/dictionary',
             pkl_name='dictionary_phoneme.pkl', save_object=label_phoneme)
    return None
def train_and_predict(self, features, target='label', n_folds=5, save=True, seed=2019):
    """Train LightGBM with stratified K-fold CV and return the averaged
    test-set predictions.

    Each fold's model may be pickled under the configured model dir; test
    predictions are accumulated as the mean over folds into ``self.pred``.
    """
    self.fe = features
    X_train = self.train_[features].values
    y_train = self.train_[target].values.astype('uint8')
    X_test = self.test_[self.fe].values
    skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=seed)
    self.pred = np.zeros(len(X_test))
    # Run the K-fold training loop.
    for fold_, (train_idx, val_idx) in enumerate(skf.split(X_train, y_train)):
        lgb_train = lgb.Dataset(X_train[train_idx], y_train[train_idx])
        lgb_eval = lgb.Dataset(X_train[val_idx], y_train[val_idx])
        print('\nFold: ', fold_)
        model = lgb.train(self.params,
                          lgb_train,
                          num_boost_round=7000,
                          valid_sets=[lgb_train, lgb_eval],
                          valid_names=['train', 'eval'],
                          early_stopping_rounds=200,
                          verbose_eval=1000)
        if save:
            save_pkl(model,
                     os.path.join(self.opt['model_train'],
                                  f'lgb_fold_{fold_}.pkl'))
        # Average the fold predictions on the test set.
        self.pred += model.predict(X_test) / n_folds
    return self.pred
def text_classification():
    """Group all unlabeled texts by their best clustering's labels.

    Picks the k (among k = 2..11) with the highest silhouette score from
    the saved clustering records, builds a label -> text-index mapping,
    caches it as a pickle, and prints per-label sample counts.
    """
    filename = 'data/online_test/label2text.pkl'
    if os.path.exists(filename):
        label2text = load_pkl(filename)
    else:
        records = load_pkl('data/online_test/clustering_records.pkl')
        scores = records['scores']
        best_i = scores.index(max(scores[:10]))  # maximum categories = 11
        labels = records['labels'][best_i]
        best_k = best_i + 2  # k in [2, 20], i in [0, 18]
        # Pre-create every label bucket so empty clusters still appear.
        label2text = {c: [] for c in range(best_k)}
        for idx, label in enumerate(labels):
            label2text[label].append(idx)
        save_pkl(label2text, filename)
    for label, text_inds in label2text.items():
        print('label: {} n_samples: {}'.format(label, len(text_inds)))
    return label2text
def __init__(self, root="", split=None, transform_function=None, sigma=8.0,
             version=None, ratio=0.3):
    """Penguins counting dataset.

    Loads the image-name list for the requested split/version, builds (and
    caches) the per-image points dictionary on first use, and loads the
    region-of-interest dictionary.

    :param split: dataset split name used to locate the split pickle.
    :param transform_function: factory returning a transform — it is
        CALLED below, so passing an already-built transform would fail.
    :param sigma: Gaussian spread used for density targets (presumably —
        not used directly here; confirm in the consumer code).
    """
    self.split = split
    self.sigma = sigma
    self.name = "Penguins"
    self.path = "Penguins/"
    imgNames = ut.load_pkl(self.path + "/Splits_2016_07_11/%s_%s.pkl" %
                           (split, version))
    self.imgNames = imgNames
    path_points = (self.path + "/Splits_2016_07_11/points_%s_%s.pkl" %
                   (split, version))
    # Build and cache the points dict the first time this split is used.
    if not os.path.exists(path_points):
        allPointDict = save_pointsDict(self.path, self.imgNames)
        ut.save_pkl(path_points, allPointDict)
    self.pointsDict = ut.load_pkl(path_points)
    # Every image must have a points entry.
    assert len(self.pointsDict) == len(self)
    self.split = split
    self.ratio = ratio
    self.n_classes = 2
    self.n_channels = 3
    # NOTE(review): transform_function is invoked — a factory/class is
    # expected here, not an instance; confirm against callers.
    self.transform_function = transform_function()
    self.version = version
    roi_name = self.path + "CompleteAnnotations_2016-07-11/roi.pkl"
    self.roiDict = ut.load_pkl(roi_name)
    self.longest = True
def save_train_validation_ids(filename, data_path):
    """Split patient study directories into train/valid id lists, pickle
    the ``{'train': ..., 'valid': ...}`` dict to ``filename``, and return it.

    Patients are ordered by the numeric id embedded in their path; the
    validation fold comes from ``get_cross_validation_indices`` (fold 0).
    """
    # Sort by the numeric patient id in the path, not lexicographically.
    patient_dirs = sorted(
        glob.glob(data_path + "/*/study/"),
        key=lambda folder: int(re.search(r'/(\d+)/', folder).group(1)))
    dirs_indices = list(range(0, len(patient_dirs)))
    valid_dirs_indices = get_cross_validation_indices(indices=dirs_indices,
                                                      validation_index=0)
    # Training set = everything outside the validation fold (kept as the
    # original unordered set difference).
    train_patient_indices = list(set(dirs_indices) - set(valid_dirs_indices))
    train_patient_dirs = [utils.get_patient_id(patient_dirs[idx])
                          for idx in train_patient_indices]
    validation_patient_dirs = [utils.get_patient_id(patient_dirs[idx])
                               for idx in valid_dirs_indices]
    d = {'train': train_patient_dirs, 'valid': validation_patient_dirs}
    utils.save_pkl(d, filename)
    print('train-valid patients split saved to', filename)
    return d
assert patient_data["patient"] == patient_id patient_data["systole"] = np.concatenate((patient_data["systole"], systole_prediction[None, :]), axis=0) patient_data["diastole"] = np.concatenate((patient_data["diastole"], diastole_prediction[None, :]), axis=0) avg_patient_predictions = config().get_avg_patient_predictions(batch_predictions, batch_ids, mean=mean) patient_targets = utils_heart.get_patient_average_heaviside_predictions(batch_targets, batch_ids, mean=mean) assert avg_patient_predictions.viewkeys() == patient_targets.viewkeys() crpss_sys, crpss_dst = [], [] for id in avg_patient_predictions.iterkeys(): crpss_sys.append(utils_heart.crps(avg_patient_predictions[id][0], patient_targets[id][0])) crpss_dst.append(utils_heart.crps(avg_patient_predictions[id][1], patient_targets[id][1])) crps0, crps1 = np.mean(crpss_sys), np.mean(crpss_dst) print "\nValidation CRPS: ", crps0, crps1, 0.5 * (crps0 + crps1) utils.save_pkl(avg_patient_predictions, valid_prediction_path) print " validation predictions saved to %s" % valid_prediction_path print # test test_data_iterator = config().test_data_iterator if n_tta_iterations == 1: test_data_iterator.transformation_params = config().valid_transformation_params else: # just to be sure test_data_iterator.transformation_params["zoom_range"] = (1.0, 1.0) print "test transformation params" print test_data_iterator.transformation_params print "n test: %d" % test_data_iterator.nsamples
# Top-level script: collect per-patient pixel spacings for the DSB data
# and report the observed min/max spacing along each axis.
# Candidate-extraction configuration whose predictions feed the iterator.
candidates_config = 'dsb_c3_s5_p8a1'
predictions_dir = utils.get_dir_path('model-predictions', pathfinder.METADATA_PATH)
candidates_path = predictions_dir + '/%s' % candidates_config
id2candidates_path = utils_lung.get_candidates_paths(candidates_path)
# Train/valid/test pid split — all three partitions are processed together.
train_valid_ids = utils.load_pkl(pathfinder.VALIDATION_SPLIT_PATH)
train_pids, valid_pids, test_pids = train_valid_ids['training'], train_valid_ids['validation'], train_valid_ids['test']
all_pids = train_pids + valid_pids + test_pids
data_iterator = data_iterators.DSBPixelSpacingsGenerator(pathfinder.DATA_PATH, id2candidates_path, all_pids)
# Per-axis spacing samples (z, y, x) and the pid -> spacing mapping.
z = []
y = []
x = []
pixel_spacings = {}
# use buffering.buffered_gen_threaded()
for idx, (pid, pixel_spacing) in enumerate(data_iterator.generate()):
    print idx, pid, pixel_spacing
    z.append(pixel_spacing[0])
    y.append(pixel_spacing[1])
    x.append(pixel_spacing[2])
    pixel_spacings[pid] = pixel_spacing
utils.save_pkl(pixel_spacings, 'pixel_spacings_dsb.pkl')
# Report the observed spacing ranges per axis (presumably millimetres —
# confirm against the DICOM metadata reader).
print 'z', min(z), max(z)
print 'y', min(y), max(y)
print 'x', min(x), max(x)
def get_slice2roi(data_path, out_filename, num_circles=None, plot=False):
    """Locate a region of interest (ROI) on every cardiac MRI slice of every patient.

    For each patient under ``data_path``: extract an ROI (center + radii) per
    group of SAX slices, then project the SAX ROI centers onto the 4-chamber
    and 2-chamber slices to derive an ROI for those views as well.

    The result is saved with ``utils.save_pkl`` to ``out_filename``.

    Parameters:
        data_path: root directory containing ``<patient>/study`` folders.
        out_filename: path of the output pickle.
        num_circles: optional override forwarded to ``data.extract_roi``.
        plot: when True, visualize each ROI with ``plot_roi``.

    Returns:
        dict mapping patient_id -> {slice_id -> {'roi_center', 'roi_radii'}}.
    """
    patient_paths = sorted(glob.glob(data_path + '/*/study'))
    slice2roi = {}
    for p in patient_paths:
        patient_data = get_patient_data(p)

        # sax slices: bucket slices by view; keep the single 4ch/2ch slices aside.
        sax_slice_stack = []
        ch4, ch2 = None, None
        for s in patient_data:
            if 'sax' in s['slice_id']:
                sax_slice_stack.append(s)
            elif '4ch' in s['slice_id']:
                ch4 = s
            elif '2ch' in s['slice_id']:
                ch2 = s

        sorted_sax_slices = sort_sax_slices(sax_slice_stack)
        grouped_sax_slices = group_sax_slices(sorted_sax_slices)
        # init patient dict
        pid = sorted_sax_slices[0]['patient_id']
        # print pid
        slice2roi[pid] = {}

        # One ROI per SAX slice group, shared by every slice in that group.
        for slice_group in grouped_sax_slices:
            # pixel spacing changes within one patient but not too much
            pixel_spacing = slice_group[0]['metadata']['PixelSpacing'][0]
            if num_circles:
                roi_center, roi_radii = data.extract_roi(slice_group, pixel_spacing, num_circles=num_circles)
            else:
                roi_center, roi_radii = data.extract_roi(slice_group, pixel_spacing)
            if plot:
                plot_roi(slice_group, roi_center, roi_radii)
            for s in slice_group:
                sid = s['slice_id']
                slice2roi[pid][sid] = {'roi_center': roi_center, 'roi_radii': roi_radii}

        # project found roi_centers on the 4ch and 2ch slice
        # NOTE(review): the loop variable `slice` shadows the builtin; harmless
        # here but worth renaming in a follow-up.
        ch4_centers = []
        ch2_centers = []
        for slice in sorted_sax_slices:
            sid = slice['slice_id']
            roi_center = slice2roi[pid][sid]['roi_center']
            metadata_source = slice['metadata']
            # Normalize the ROI center to [0, 1] fractions of the source image.
            hough_roi_center = (float(roi_center[0]) / metadata_source['Rows'],
                                float(roi_center[1]) / metadata_source['Columns'])
            if ch4 is not None:
                metadata_target = ch4['metadata']
                result = orthogonal_projection_on_slice(hough_roi_center, metadata_source, metadata_target)
                # Rescale the projected fractional center to target pixel coordinates.
                ch_roi_center = [float(result[0]) * metadata_target['Rows'],
                                 float(result[1]) * metadata_target['Columns']]
                ch4_centers.append(ch_roi_center)
            if ch2 is not None:
                metadata_target = ch2['metadata']
                result = orthogonal_projection_on_slice(hough_roi_center, metadata_source, metadata_target)
                ch_roi_center = [float(result[0]) * metadata_target['Rows'],
                                 float(result[1]) * metadata_target['Columns']]
                ch2_centers.append(ch_roi_center)

        if ch4 is not None:
            centers = np.array(ch4_centers)
            # ROI center on the 4ch view = mean of all projected SAX centers.
            ch4_result_center = tuple(np.mean(centers, axis=0))
            # NOTE(review): this is the max absolute per-coordinate deviation,
            # not a Euclidean distance (no sum over the coordinate axis before
            # the sqrt) -- confirm this is the intended radius definition.
            ch4_result_radius = np.max(np.sqrt((centers - ch4_result_center) ** 2))
            ch4_result_radius = (ch4_result_radius, ch4_result_radius)
            sid = ch4['slice_id']
            slice2roi[pid][sid] = {'roi_center': ch4_result_center, 'roi_radii': ch4_result_radius}
            if plot:
                plot_roi([ch4], ch4_result_center, ch4_result_radius)

        if ch2 is not None:
            centers = np.array(ch2_centers)
            ch2_result_center = tuple(np.mean(centers, axis=0))
            # Same radius definition as for the 4ch view above.
            ch2_result_radius = np.max(np.sqrt((centers - ch2_result_center) ** 2))
            ch2_result_radius = (ch2_result_radius, ch2_result_radius)
            sid = ch2['slice_id']
            slice2roi[pid][sid] = {'roi_center': ch2_result_center, 'roi_radii': ch2_result_radius}
            if plot:
                plot_roi([ch2], ch2_result_center, ch2_result_radius)

    utils.save_pkl(slice2roi, out_filename)
    print 'saved to ', out_filename
    return slice2roi
# --- Build and persist the final train/test patient split (Python 2 script fragment) ---
# Every pid not already chosen for `final_test` goes to the train set; labels are
# expected to be strictly 0/1. NOTE(review): `all_pids`, `final_test` and
# `id2label` are defined earlier in the original file -- indentation reconstructed.

final_train = []
final_pos_train = []  # positive (label 1) training pids
final_neg_train = []  # negative (label 0) training pids
for pid in all_pids:
    if pid not in final_test:
        final_train.append(pid)
        if id2label[pid] == 1:
            final_pos_train.append(pid)
        elif id2label[pid] == 0:
            final_neg_train.append(pid)
        else:
            # Any label other than 0/1 indicates corrupted input data.
            raise ValueError("weird shit is going down")

# `1.*` forces float division under Python 2.
print 'pos id ratio final train set', 1.*len(final_pos_train) / (len(final_train))
print 'final test/(train+test):', 1.*len(final_test) / (len(final_train) + len(final_test))

# Fingerprint of the test split so it can be compared across runs/machines.
concat_str = ''.join(final_test)
print 'md5 of concatenated pids:', hashlib.md5(concat_str).hexdigest()

output = {'train':final_train, 'test':final_test}
output_name = pathfinder.METADATA_PATH+'final_split.pkl'
utils.save_pkl(output, output_name)
print 'final split saved at ', output_name
# --- Predict per-candidate scores and save them grouped by patient (Python 2 fragment) ---
# The iterator yields one candidate patch at a time, ordered by patient; a change
# of pid flushes the previous patient's accumulated candidates to a pickle.
# NOTE(review): within this chunk the LAST patient's candidates are never
# flushed -- presumably handled right after this fragment; confirm upstream.
# NOTE(review): `max_malignancy` is initialized but unused in this chunk.

print
print 'Data'
print 'n samples: %d' % data_iterator.nsamples

prev_pid = None
candidates = []       # accumulated [z, y, x, d, prediction] rows for the current patient
patients_count = 0
max_malignancy = 0.
for n, (x, candidate_zyxd, id) in enumerate(data_iterator.generate()):
    pid = id[0]
    # New patient encountered: save everything collected for the previous one.
    if pid != prev_pid and prev_pid is not None:
        print patients_count, prev_pid, len(candidates)
        candidates = np.asarray(candidates)
        utils.save_pkl(candidates, outputs_path + '/%s.pkl' % prev_pid)
        patients_count += 1
        candidates = []

    #print 'x.shape', x.shape
    # Push the patch into the shared variable and run the compiled predictor.
    x_shared.set_value(x)
    predictions = get_predictions_patch()
    #print 'predictions.shape', predictions.shape
    #print 'candidate_zyxd', candidate_zyxd.shape
    # Append the prediction(s) after the candidate's (z, y, x, d) coordinates.
    candidate_zyxd_pred = np.append(candidate_zyxd, [predictions])
    #print 'candidate_zyxd_pred', candidate_zyxd_pred
    candidates.append(candidate_zyxd_pred)
    prev_pid = pid
# --- Periodic evaluation + checkpointing inside the training loop (Python 2 fragment) ---
# NOTE(review): indentation reconstructed; this chunk sits inside a training
# loop whose header precedes it (niter, epoch, losses_* defined earlier).

print '%d/%d (epoch %.3f) time/nvalid_iter=%.2fs' % (
    niter, max_niter, niter / float(train_batches_per_epoch), current_time - prev_time)
prev_time = current_time

valid_loss = eval_valid()
losses_eval_valid.append(valid_loss)

# NOTE(review): the "train" evaluation also calls eval_valid(); this looks like
# it should call an eval_train() function instead -- confirm intent upstream.
eval_train_loss = eval_valid()
losses_eval_train.append(eval_train_loss)

# VAE configs return (total, KL, CE) tuples; others return a 1-element loss.
if 'vae' in config.__name__:
    print 'validation loss: %0.4f KL: %0.4f CE: %0.4f' % (valid_loss[0], -valid_loss[1], valid_loss[2])
    print 'train loss: %0.4f KL: %0.4f CE: %0.4f' % (
        eval_train_loss[0], -eval_train_loss[1], eval_train_loss[2])
else:
    print 'validation loss: %0.4f' % valid_loss[0]
    print 'train loss: %0.4f' % eval_train_loss[0]
print

# Checkpoint model parameters and loss history every `save_every` epochs.
if (epoch + 1) % config.save_every == 0:
    d = {
        'configuration': config_name,
        'experiment_id': expid,
        'epochs_since_start': epoch,
        'losses_train': losses_train,
        'losses_eval_valid': losses_eval_valid,
        'losses_eval_train': losses_eval_train,
        'param_values': lasagne.layers.get_all_param_values(model.l_out)
    }
    utils.save_pkl(d, metadata_target_path)
    print " saved to %s" % metadata_target_path
    print
# --- Train-set CRPS evaluation, then set up validation TTA (Python 2 fragment) ---
# NOTE(review): unlike the validation variant of this code, the heaviside
# targets here are built WITHOUT the `mean=` argument -- confirm whether that
# asymmetry is intentional.

avg_patient_predictions = config().get_avg_patient_predictions(batch_predictions, batch_ids, mean=mean)
patient_targets = utils_heart.get_patient_average_heaviside_predictions(batch_targets, batch_ids)
# Both dicts must cover exactly the same patient ids (Python 2 dict views).
assert avg_patient_predictions.viewkeys() == patient_targets.viewkeys()

# Per-patient CRPS for systole (index 0) and diastole (index 1).
crpss_sys, crpss_dst = [], []
for id in avg_patient_predictions.iterkeys():
    crpss_sys.append(utils_heart.crps(avg_patient_predictions[id][0], patient_targets[id][0]))
    crpss_dst.append(utils_heart.crps(avg_patient_predictions[id][1], patient_targets[id][1]))
    print id, 0.5 * (crpss_sys[-1] + crpss_dst[-1]), crpss_sys[-1], crpss_dst[-1]

crps0, crps1 = np.mean(crpss_sys), np.mean(crpss_dst)
print '\n Train CRPS:', config().get_mean_crps_loss(batch_predictions, batch_targets, batch_ids)
print 'Train CRPS', 0.5 * (crps0 + crps1)

utils.save_pkl(avg_patient_predictions, prediction_path)
print ' predictions saved to %s' % prediction_path
print

if set == 'valid':
    valid_data_iterator = config().valid_data_iterator
    # For TTA use the augmenting train transforms, but pin zoom to identity.
    if n_tta_iterations > 1:
        valid_data_iterator.transformation_params = config().train_transformation_params
        valid_data_iterator.transformation_params['zoom_range'] = (1., 1.)

    print
    print 'n valid: %d' % valid_data_iterator.nsamples

    # Trailing comma: Python 2 `print` without a newline.
    print 'tta iteration:',
    batch_predictions, batch_targets, batch_ids = [], [], []
    # NOTE(review): the body of this TTA loop continues beyond this chunk.
    for i in xrange(n_tta_iterations):
# --- Score candidates, rank by total malignancy, save per patient (Python 2 fragment) ---
# Same pattern as the plain candidate loop, but each candidate additionally gets
# a scalar `total_malignancy` appended, and the saved array is sorted by it
# (descending). NOTE(review): the LAST patient is not flushed within this
# chunk -- presumably handled right after; confirm upstream.

print 'Data'
print 'n samples: %d' % data_iterator.nsamples

prev_pid = None
candidates = []       # rows of [z, y, x, d, predictions..., total_malignancy]
patients_count = 0
max_malignancy = 0.   # NOTE(review): unused within this chunk
for n, (x, candidate_zyxd, id) in enumerate(data_iterator.generate()):
    pid = id[0]
    # New patient: sort the previous patient's candidates by malignancy and save.
    if pid != prev_pid and prev_pid is not None:
        print patients_count, prev_pid, len(candidates)
        candidates = np.asarray(candidates)
        # NOTE(review): the lambda parameter `x` shadows the loop's patch `x`;
        # harmless (last column = total_malignancy) but worth renaming.
        a = np.asarray(sorted(candidates, key=lambda x: x[-1], reverse=True))
        print 'max malignancies', a[:10,-1]
        utils.save_pkl(a, outputs_path + '/%s.pkl' % prev_pid)
        print 'saved predictions'
        patients_count += 1
        candidates = []

    x_shared.set_value(x)
    predictions = get_predictions_patch()
    #print 'predictions.shape', predictions.shape
    # Collapse the per-class predictions into one weighted malignancy scalar.
    total_malignancy = np.sum(config().malignancy_weights*predictions)
    #print 'total_malignancy', total_malignancy
    #print 'candidate_zyxd', candidate_zyxd
    candidate_zyxd_pred = np.append(candidate_zyxd, [predictions])
    candidate_zyxd_pred_mal = np.append(candidate_zyxd_pred, [[total_malignancy]])
    candidates.append(candidate_zyxd_pred_mal)
    prev_pid = pid
print 'n samples: %d' % data_iterator.nsamples prev_pid = None candidates = [] patients_count = 0 patch_size = 48 stride = 16 for n, (x, id) in enumerate(data_iterator.generate()): pid = id print(pid) print model.l_out.output_shape predictions = np.empty(((x.shape[2]-patch_size+1)//stride, (x.shape[3]-patch_size+1)//stride, (x.shape[4]-patch_size+1)//stride,) + (model.l_out.output_shape[1],)) print predictions.shape print 'x.shape', x.shape for idxi, i in enumerate(np.arange(0,x.shape[2]-patch_size,stride)): print 'slice idxi', idxi for idxj, j in enumerate(np.arange(0,x.shape[3]-patch_size,stride)): for idxk, k in enumerate(np.arange(0,x.shape[4]-patch_size,stride)): #print i, j, k, '|', idxi, idxj, idxk, x.shape[4], x.shape[4]-patch_size+1 x_in = x[0,0,i:i+patch_size,j:j+patch_size,k:k+patch_size] #print x_in.shape x_shared.set_value(x_in[None,:,:,:]) fm = get_featuremap() #print fm.shape predictions[idxi,idxj,idxk] = fm[0] result = np.concatenate(predictions,axis=0) utils.save_pkl(result, outputs_path + '/%s.pkl' % pid)
# --- Predict on the test set, save pkl/csv, score the submission (Python 2 fragment) ---
# NOTE(review): the dangling `elif set == 'valid':` at the bottom continues an
# `if set == 'test':`-style chain whose header precedes this chunk; the code
# above it belongs to that earlier branch. Indentation reconstructed.

data_iterator = config().test_data_iterator

print
print 'Data'
print 'n test: %d' % data_iterator.nsamples

pid2prediction = {}
for i, (x_test, _, id_test) in enumerate(buffering.buffered_gen_threaded(
        data_iterator.generate())):
    predictions = iter_test(x_test)
    pid = id_test[0]
    print predictions
    # Binary softmax output: keep P(class 1); otherwise take the lone scalar.
    pid2prediction[pid] = predictions[1] if predictions.shape[-1] == 2 else predictions[0]
    print i, pid, predictions, pid2label[pid]

utils.save_pkl(pid2prediction, output_pkl_file)
print 'Saved validation predictions into pkl', os.path.basename(output_pkl_file)

test_loss = utils_lung.evaluate_log_loss(pid2prediction, pid2label)
print 'Test loss', test_loss

utils_lung.write_submission(pid2prediction, output_csv_file)
print 'Saved predictions into csv'
loss = evaluate_submission.leaderboard_performance(output_csv_file)
print loss

# (continuation of an if/elif chain started before this chunk)
elif set == 'valid':
    data_iterator = config().valid_data_iterator
    print
    print 'Data'
# --- Sliding-offset sequence predictions, mask-trimmed, then csv prep (Python 2 fragment) ---
# NOTE(review): the first part of this chunk sits inside an `if` (and likely a
# per-batch loop over `b`) whose headers precede the chunk; the bare `else:`
# below pairs with that unseen `if`. Indentation reconstructed best-effort.

# One forward pass per temporal offset of the evaluation window.
n_offsets = val_seq_len - cfg.seq_length
pred_batch = []
for ofs in range(n_offsets):
    pred = compute_output(ofs)
    pred_batch.append(pred)
# (offset, sample, ...) -> (sample, offset, ...)
pred_batch = np.array(pred_batch).swapaxes(0,1)

msk = m_shared.get_value(borrow=True)
new_preds_batch = []
for i in range(len(pred_batch)):
    # l = number of zero (padding) entries in this sample's mask.
    l = len(msk[i][msk[i]==0.])
    # NOTE(review): if a mask row has NO zeros, l == 0 and `[:-0]` yields an
    # EMPTY array, silently dropping the whole prediction -- confirm that
    # every mask row is guaranteed to contain padding.
    new_preds_batch.append(pred_batch[i][:-l])
preds.extend(new_preds_batch)

# Progress indicator in percent of processed batches.
print "%i%%"%int(np.round((b+1)/float(n_batches)*100.))

utils.save_pkl(preds, target_path)
# (pairs with an `if` whose header precedes this chunk)
else:
    preds = utils.load_pkl(target_path)

assert len(preds) == len(split.test_idxs)

print "Preparing csv files"
pred_files = glob(csv_path+"Sample*")
# Only regenerate csv files when some are missing.
if len(pred_files) != len(split.test_idxs):
    for i, pred in enumerate(preds):
        pred = np.argmax(pred, axis=-1)
        # window = 10
        # new_pred = pred.copy()
        # for j in range(len(pred)-window):
# --- Test-set TTA predictions plus per-patient (mu, sigma) estimates (Python 2 fragment) ---
# Gathers deterministic predictions and the network's mu/sigma outputs per
# batch, averages them per patient, and pickles both dictionaries.
# NOTE(review): `train_*` / `valid_*` aggregates referenced at the bottom are
# built earlier in the original file; indentation reconstructed -- in
# particular the append calls are assumed to sit in the OUTER batch loop, not
# the inner zip loop; confirm against the original.

test_data_iterator.transformation_params = config().valid_transformation_params

batch_predictions, batch_ids = [], []
mu_predictions, sigma_predictions = [], []
for xs_batch_test, _, ids_batch in buffering.buffered_gen_threaded(test_data_iterator.generate()):
    # Load each input tensor into its Theano shared variable.
    for x_shared, x in zip(xs_shared, xs_batch_test):
        x_shared.set_value(x)
    batch_predictions.append(iter_test_det())
    batch_ids.append(ids_batch)
    mu_predictions.append(iter_mu())
    sigma_predictions.append(iter_sigma())

test_avg_patient_predictions = config().get_avg_patient_predictions(batch_predictions, batch_ids, mean='geometric')
pid2mu = utils_heart.get_patient_normparam_prediction(mu_predictions, batch_ids)
pid2sigma = utils_heart.get_patient_normparam_prediction(sigma_predictions, batch_ids)
test_pid2musigma = {}
for pid in pid2mu.iterkeys():
    test_pid2musigma[pid] = {'mu': pid2mu[pid], 'sigma': pid2sigma[pid]}

# Bundle train/valid/test results into single pickles.
predictions = {'train': train_avg_patient_predictions,
               'valid': valid_avg_patient_predictions,
               'test': test_avg_patient_predictions}
predictions_mu_std = {'train': train_pid2musigma,
                      'valid': valid_pid2musigma,
                      'test': test_pid2musigma}

utils.save_pkl(predictions, prediction_path)
utils.save_pkl(predictions_mu_std, prediction_mu_std_path)
print ' predictions saved to %s' % prediction_path