def download(parquet_embeddings_path: str, dest_path: str, n_cores: int = 32, verbose: bool = True) -> bool:
    """
    Download .parquet files from HDFS at maximum speed.
    Parallelisation is essential to use the full bandwidth.
    """
    filenames = read_filenames(parquet_embeddings_path)
    nb_files = len(filenames)
    os.makedirs(dest_path, exist_ok=True)
    src_dest_paths = zip(filenames, repeat(dest_path))
    if n_cores == 1:
        if verbose:
            src_dest_paths = tq(list(src_dest_paths))
        for src_dest_path in src_dest_paths:
            download_one(src_dest_path)
    else:
        with tq(total=nb_files) as pbar:
            with Pool(processes=n_cores) as pool:
                for _ in pool.imap_unordered(download_one, src_dest_paths):
                    pbar.update(1)
    return True
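# Hedged usage sketch for download(): the HDFS source and local destination paths
# below are hypothetical placeholders, and read_filenames/download_one are the
# helpers assumed by the function above.
def _example_download():
    ok = download(
        parquet_embeddings_path="hdfs://namenode/user/me/embeddings_parquet",
        dest_path="/tmp/embeddings_parquet",
        n_cores=16,
    )
    print("download finished:", ok)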
def valid(self):
    self.model.eval()
    loss_valid_r = 0
    valid_batches = 0  # counter for valid batches
    out_gt = torch.FloatTensor().to(self.device)
    out_pred = torch.FloatTensor().to(self.device)
    with torch.no_grad():
        for (var_input, var_target) in tq(self.data_loader_valid):
            var_target = var_target.to(self.device)
            out_gt = torch.cat((out_gt, var_target), 0).to(self.device)
            _, c, h, w = var_input.size()
            var_input = var_input.view(-1, c, h, w)
            var_output = self.model(var_input.to(self.device))
            out_pred = torch.cat((out_pred, var_output), 0)
            lossvalue = self.loss_fn(
                var_output,
                tfunc.one_hot(var_target.squeeze(1).long(),
                              num_classes=self.class_count).float())
            loss_valid_r += lossvalue.item()
            valid_batches += 1
    valid_loss = loss_valid_r / valid_batches
    auroc_individual = compute_auroc(
        tfunc.one_hot(out_gt.squeeze(1).long()).float(), out_pred,
        self.class_count)
    print(len(auroc_individual))
    auroc_mean = np.array(auroc_individual).mean()
    return valid_loss, auroc_mean
def _popularity_biased_sampling(self):
    """
    Use a sampling method to modify each item in self.data_dicts,
    adding an item_query to every item.
    """
    assert hasattr(
        self, "dataloader"), "AmazonDataset does not have self.dataloader"
    dataloader = self.dataloader
    train_data = dataloader.train_batch_data.values.tolist()
    items_pop = {}
    user2items = {}
    for single_item in train_data:
        user, item, f, s = int(single_item[0]), int(single_item[1]), int(
            single_item[2]), int(single_item[3])
        items_pop[item] = items_pop.get(item, 0) + 1
        user2items[user] = user2items.get(user, []) + [item]
    user2items = {k: set(v) for k, v in user2items.items()}
    import math
    # Smooth the popularity distribution (3/4 power, as in word2vec negative sampling).
    items_pop = {k: math.pow(v, 0.75) for k, v in items_pop.items()}
    self.items_pop = items_pop
    self.user2items = user2items
    if not self._lazy:
        ret = []
        print("Substitution Sampling Process: ")
        for item in tq(self.dataset_dicts):
            ret.append(foreach_sample(item))
        self.dataset_dicts = ret
def convert_all_parquet_to_numpy(
    parquet_folder: str,
    embeddings_folder: str,
    n_cores: int = 32,
    delete: bool = False,
    embedding_column_name: str = "embedding",
) -> None:
    """Convert embedding parquet files to embedding numpy files."""
    assert n_cores > 0
    os.makedirs(embeddings_folder, exist_ok=True)
    parquet_files = [
        f"{parquet_folder}/{x}" for x in os.listdir(parquet_folder)
        if x.endswith(".parquet")
    ]
    parquet_files.sort()
    nb_files = len(parquet_files)
    func = partial(run_one,
                   embeddings_folder=embeddings_folder,
                   delete=delete,
                   embedding_column_name=embedding_column_name)
    with tq(total=nb_files) as pbar:
        with Pool(processes=n_cores) as pool:
            for _ in pool.imap_unordered(func, parquet_files):
                pbar.update(1)
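# Hedged usage sketch for convert_all_parquet_to_numpy(): the folder names are
# hypothetical; run_one (bound via functools.partial above) is assumed to be
# defined elsewhere in this module.
def _example_convert_parquet():
    convert_all_parquet_to_numpy(
        parquet_folder="/tmp/embeddings_parquet",
        embeddings_folder="/tmp/embeddings_numpy",
        n_cores=8,
        delete=False,
        embedding_column_name="embedding",
    )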
def plot_delay(self):
    pt.figure()
    mean1 = np.zeros(self.steps)
    mean2 = np.zeros(self.steps)
    for i in tq(range(0, self.runs)):
        np.random.seed(i)
        theta_star = self.env.set_theta()
        r1 = LinUCB(self.env).regret_delay_t(self.steps, theta_star)
        r2 = LinUCB(self.env).regret_t(self.steps, theta_star)
        mean1 += r1[0]
        mean2 += r2[0]
    # Average the accumulated regrets over the runs.
    mean1 = [i / self.runs for i in mean1]
    mean2 = [i / self.runs for i in mean2]
    pt.plot(self.step_list, mean1, label="delay")
    pt.plot(self.step_list, mean2, label="non_delay")
    pt.legend()
    pt.show()
def sa(dataset_path: Path, model_path: Path, output_path: Path):
    check_path(output_path)
    nx_graphs, labels = read_graphs(dataset_path)
    model = load_model(model_path)

    def explain(graph_num):
        g = nx_graphs[graph_num]
        node_count = len(g.nodes)
        adj = np.zeros((1, 100, 100))
        adj[0, :node_count, :node_count] = nx.to_numpy_matrix(g)
        adj = torch.tensor(adj, dtype=torch.float)
        x = torch.ones((1, 100, 10), requires_grad=True, dtype=torch.float)
        ypred, _ = model(x, adj)
        loss = model.loss(ypred, torch.LongTensor([labels[graph_num]]))
        loss.backward()
        # Node importance is the squared gradient magnitude per node.
        node_importance = x.grad.detach().numpy()[0][:node_count]
        node_importance = (node_importance ** 2).sum(axis=1)
        N = nx_graphs[graph_num].number_of_nodes()
        masked_adj = np.zeros((N, N))
        for u, v in nx_graphs[graph_num].edges():
            u = int(u)
            v = int(v)
            masked_adj[u, v] = masked_adj[v, u] = node_importance[u] + node_importance[v]
        return masked_adj

    for gid in tq(nx_graphs):
        masked_adj = explain(gid)
        np.save(output_path / ('%s.npy' % gid), masked_adj)
def CompanyNames(self):
    # print(self.CompaniesNames)
    for i in tq(self.CompaniesNames):
        self.CompanyWiseData[i] = self.data[self.data['Name'] == i]
    self.TopAndBottom()
def TopAndBottom(self):
    for i in tq(self.CompaniesNames):
        self.ATRWEEKLY[i] = average_true_range(
            self.CompanyWiseData[i]['high'],
            self.CompanyWiseData[i]['low'],
            self.CompanyWiseData[i]['close'],
            n=7).mean()
        self.ATRANNUALLY[i] = average_true_range(
            self.CompanyWiseData[i]['high'],
            self.CompanyWiseData[i]['low'],
            self.CompanyWiseData[i]['close'],
            n=265).mean()
    # Sort companies by ATR, ascending: least volatile first, most volatile last.
    self.WeeklySorted = sorted(self.ATRWEEKLY.items(), key=lambda x: x[1])
    self.AnnualySorted = sorted(self.ATRANNUALLY.items(), key=lambda x: x[1])
    topwL_name, topwL_value = self.namesAndValue(self.WeeklySorted[:10])
    topaL_name, topaL_value = self.namesAndValue(self.AnnualySorted[:10])
    topw_name, topw_value = self.namesAndValue(self.WeeklySorted[-10:])
    topa_name, topa_value = self.namesAndValue(self.AnnualySorted[-10:])
    self.plotting([topa_name, topa_value, topw_name, topw_value,
                   topaL_name, topaL_value, topwL_name, topwL_value])
    print(topa_name)
    self.p.plotting(self.CompanyWiseData[topa_name[0]])
    self.p.plotting(self.CompanyWiseData[topaL_name[0]])
def __call__(self, data):
    if isinstance(data, list):
        data = [self._process(d) for d in tq(data)]
        data = list(itertools.chain(*data))  # the 2-D list needs to be flattened
    else:
        data = self._process(data)
    return data
def _valid(self):
    self.model.eval()
    avg_loss = 0.0
    avg_acc = 0.0
    n_samples = 0
    progress_bar = tq(self.data_loader_valid)
    progress_bar.set_description("Validation")
    for batch_idx, (data, target) in enumerate(progress_bar):
        if self.cuda_available:
            data = data.cuda(non_blocking=True)
            target = target.cuda(non_blocking=True)
        output = self.model(data)
        loss = F.cross_entropy(output, target)
        avg_loss += loss.item()
        y_hat = output.argmax(dim=1)
        avg_acc += (target == y_hat).sum().item()
        n_samples += len(target)
        if batch_idx % self.args.logFrequency == 0:
            acc = avg_acc / n_samples
            metrics = {
                'loss': '{:.3f}'.format(avg_loss / (batch_idx + 1)),
                'acc': '{:.2f}%'.format(acc * 100)
            }
            progress_bar.set_postfix(metrics)
    loss = avg_loss / len(self.data_loader_valid)
    acc = avg_acc / n_samples
    torch.cuda.empty_cache()
    return {"loss": loss, "acc": acc}
def _train_iter(self):
    j = 1
    self.model.train()
    self.optimizer.zero_grad()
    progress_bar = tq(self.data_loader_train)
    progress_bar.set_description("Training")
    avg_loss = 0.0
    for batch_idx, (data, target) in enumerate(progress_bar):
        if self.cuda_available:
            data = data.cuda(non_blocking=True)
            target = target.cuda(non_blocking=True)
        output = self.model(data)
        loss = F.cross_entropy(output, target)
        loss.backward()
        avg_loss += loss.item()
        # Step the optimizer only every batch_accumulation batches (gradient accumulation).
        if j % self.batch_accumulation == 0:
            j = 1
            self.optimizer.step()
            self.optimizer.zero_grad()
        else:
            j += 1
        if batch_idx % self.args.logFrequency == 0:
            progress_bar.set_postfix(
                {'Loss': '{:.3f}'.format(avg_loss / (batch_idx + 1))})
    torch.cuda.empty_cache()
def epoch_train(self):
    loss_train_list = []
    loss_valid_list = []
    self.model.train()
    scheduler = StepLR(self.optimizer, step_size=6, gamma=0.002)
    for batch_id, (var_input, var_target) in tq(enumerate(self.data_loader_train)):
        var_target = var_target.to(self.device)
        var_input = var_input.to(self.device)
        var_output = self.model(var_input)
        trainloss_value = self.loss_fn(
            var_output,
            tfunc.one_hot(var_target.squeeze(1).long(),
                          num_classes=self.class_count).float())
        self.optimizer.zero_grad()
        trainloss_value.backward()
        self.optimizer.step()
        train_loss_value = trainloss_value.item()
        loss_train_list.append(train_loss_value)
        # Run validation once per epoch, triggered on the last batch.
        if batch_id % (len(self.data_loader_train) - 1) == 0 and batch_id != 0:
            validloss_value, auroc_mean = self.valid()
            loss_valid_list.append(validloss_value)
            if auroc_mean > self.auroc_max:
                print('Better auroc obtained')
                self.auroc_max = auroc_mean
    scheduler.step()
    train_loss_mean = np.mean(loss_train_list)
    valid_loss_mean = np.mean(loss_valid_list)
    return train_loss_mean, valid_loss_mean, auroc_mean
def extract_data_from_html_file(self, html_file):
    print('[INFO]: Extracting data from {}'.format(html_file))
    soup = BeautifulSoup(open(html_file, encoding='utf8'), 'lxml')
    users, msgs, dates, times = [], [], [], []
    # Find 'thread' tags
    for thread in soup.find_all(class_='thread'):
        # Find 'message' tags
        for chat in tq(thread.find_all(class_='message'), desc='Chats'):
            # Extract sender and message
            user = str(chat.find(class_='user').string)
            msg = str(chat.next_sibling.string)
            # Extract date and time
            full_date = dt.strptime(
                chat.find(class_='meta').string.replace("+01", ""),
                self.full_date_format)
            date = str(full_date.strftime(self.date_format))
            time = str(full_date.strftime(self.time_format))
            # Ignore 'pictures'
            if msg != 'None':
                users.append(user)
                msgs.append(msg)
                dates.append(date)
                times.append(time)
    print('[INFO]: Data extracted from {}'.format(html_file))
    return [users, msgs, dates, times]
def plot_profiles_to_file(annot, pntr, ups=200, smooth_param=50):
    pp = PdfPages(options.save_path + 'Figures/individual_signals.pdf')
    clrs_ = ['red', 'blue', 'black', 'orange', 'magenta', 'cyan']
    vec_sense = {}
    vec_antisense = {}
    # for qq in tq(range(annot.shape[0])):
    for qq in tq(range(100)):
        chname = annot['chr'].iloc[qq]
        if annot['strand'].iloc[qq] == '+':
            start = annot['start'].iloc[qq] - ups
            stop = annot['end'].iloc[qq]
            for key in pntr.keys():
                vec_sense[key] = pntr[key][0].get_nparray(
                    chname, start, stop - 1)
                vec_antisense[key] = pntr[key][1].get_nparray(
                    chname, start, stop - 1)
            xran = np.arange(start, stop)
        else:
            start = annot['start'].iloc[qq]
            stop = annot['end'].iloc[qq] + ups
            for key in pntr.keys():
                vec_sense[key] = np.flipud(pntr[key][1].get_nparray(
                    chname, start, stop))
                vec_antisense[key] = np.flipud(pntr[key][0].get_nparray(
                    chname, start, stop))
            xran = np.arange(stop, start, -1)
        ax = {}
        fig = pl.figure()
        pl.title(annot['name'].iloc[qq])
        for i, key in enumerate(pntr.keys()):
            sm_vec_se = sm.smooth(vec_sense[key],
                                  smooth_param)[(smooth_param - 1):-(smooth_param - 1)]
            sm_vec_as = sm.smooth(vec_antisense[key],
                                  smooth_param)[(smooth_param - 1):-(smooth_param - 1)]
            ax[key] = pl.subplot(len(pntr), 1, i + 1)
            ax[key].plot(xran, vec_sense[key], label=key, color=clrs_[i], alpha=0.5)
            ax[key].plot(xran, -vec_antisense[key], color=clrs_[i], alpha=0.5)
            ax[key].plot(xran, sm_vec_se, color=clrs_[i], linewidth=2)
            ax[key].plot(xran, -sm_vec_as, color=clrs_[i], linewidth=2)
            ax[key].legend(loc='upper center', bbox_to_anchor=(0.5, 1.05),
                           fontsize=6, ncol=1)
        pp.savefig()
        pl.close()
    pp.close()
    for pn in pntr.values():
        pn[0].close()
        pn[1].close()
def process(self):
    train_areas = [f for f in self.folders if str(self.test_area) not in f]
    test_areas = [f for f in self.folders if str(self.test_area) in f]

    train_files = [(f, room_name, osp.join(self.raw_dir, f, room_name))
                   for f in train_areas
                   for room_name in os.listdir(osp.join(self.raw_dir, f))
                   if ".DS_Store" != room_name]
    test_files = [(f, room_name, osp.join(self.raw_dir, f, room_name))
                  for f in test_areas
                  for room_name in os.listdir(osp.join(self.raw_dir, f))
                  if ".DS_Store" != room_name]

    train_data_list, test_data_list = [], []
    for (area, room_name, file_path) in tq(train_files + test_files):
        if self.debug:
            read_s3dis_format(file_path, room_name, label_out=True,
                              verbose=self.verbose, debug=self.debug)
        else:
            xyz, rgb, room_labels, room_object_indices = read_s3dis_format(
                file_path, room_name, label_out=True,
                verbose=self.verbose, debug=self.debug)
            data = Data(pos=xyz, x=rgb.float(), y=room_labels)
            if self.keep_instance:
                data.room_object_indices = room_object_indices
            if self.pre_filter is not None and not self.pre_filter(data):
                continue
            if self.pre_transform is not None:
                data = self.pre_transform(data)
            if (area, room_name, file_path) in train_files:
                train_data_list.append(data)
            else:
                test_data_list.append(data)

    if self.pre_collate_transform:
        train_data_list = self.pre_collate_transform.fit_transform(train_data_list)
        test_data_list = self.pre_collate_transform.transform(test_data_list)

    torch.save(self.collate(train_data_list), self.processed_paths[0])
    torch.save(self.collate(test_data_list), self.processed_paths[1])
def evaluate_model(model, statedict_PATH, num_images, img_size=(56, 56)):
    """Evaluate the model on the test data and write a submission file."""
    # Load the weights from the provided state-dict path.
    model.load_state_dict(torch.load(statedict_PATH))
    model.eval()
    batch_size = 256
    test_augmentations = Compose([
        Resize(*img_size),
        ToFloat(max_value=255),
        ToTensor()], p=1)
    test_df = pd.read_csv("data/test.csv")
    test_df = test_df.reset_index()
    test_dataset = test_digitdataset(data=test_df, transform=test_augmentations)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
    test_tq = tq(test_loader, total=int(len(test_loader)))
    preds, labels = [], []
    with torch.no_grad():
        for (images, label) in test_tq:
            images = images["image"].to(device, dtype=torch.float)
            outputs = model(images)
            preds.extend(outputs.cpu().numpy())
            labels.extend(label.cpu().numpy() + 1)
    preds = np.argmax(np.array(preds), axis=1).reshape(-1)

    fig, axes = plt.subplots(nrows=num_images // 4 + 1, ncols=4,
                             figsize=(64, 64), sharex=True, sharey=True)
    counter = 0
    for row in axes:
        for col in row:
            col.imshow(images[counter].squeeze().detach().permute(1, 2, 0).cpu().numpy())
            col.set_title(f"pred = {preds[counter]}")
            counter += 1

    test_preds = pd.read_csv("data/sample_submission.csv")
    test_preds.ImageId = labels
    test_preds.Label = preds
    save_file = "data/sample_submission_temp.csv"
    if os.path.exists(save_file):
        os.remove(save_file)
    test_preds.to_csv(save_file, index=False)
    print("Submission file created successfully")
def multi_crop_img_lst(ROIs, out_paths, in_path, pic_lst):
    """
    Crop all ROIs out of the entire list of pictures in the in_path folder
    and save them to the out_paths list.
    """
    ## Loop over all the pics in pic_lst
    for full_pic_path in tq(pic_lst):
        ## Create a new file name for the cropped img
        pic_name = full_pic_path.rsplit("\\", 1)[-1]
        date = in_path.split("\\")[-3].split("_")[0]
        orient = in_path.split("\\")[-2].split("_")[-1]
        # new_pic_name = pic_name.strip(".JPG").strip("DSC_")+"_CROPED_"+date+"_"+orient+".jpg"
        new_pic_name = pic_name.strip(".JPG").strip("DSC_") + "_CROPED.jpg"
        ## Load img
        img = cv2.imread(full_pic_path)
        # ===== if anything should be done before cropping the imgs - add code here ====
        # size = (3000,2000)
        # img = cv2.resize(img, size)
        # ==============================================================================
        ## Loop over selected ROIs
        for j, ROI in enumerate(ROIs):
            ## Crop the img
            x, y, w, h = ROI[0], ROI[1], ROI[2], ROI[3]
            croped_img = img[y:y + h, x:x + w]
            # ===== if anything should be done with the **cropped** imgs add code here =====
            # rtd = img_procesing.rotate_img(croped_img,180)
            # ==============================================================================
            ## Create a window for every ROI
            cv2.namedWindow("croping_" + str(ROI), cv2.WINDOW_NORMAL)
            cv2.imshow("croping_" + str(ROI), croped_img)
            ## Press Esc OR q key to stop
            k = cv2.waitKey(1) & 0xff
            if k == 27 or k == ord('q'):
                break
            ## Save the img to file
            out_path = out_paths[j] + "\\" + out_paths[j][-1] + "_" + new_pic_name
            cv2.imwrite(out_path, croped_img)
            piexif.transplant(full_pic_path, out_path)
        ## If we broke off, stop this loop as well
        if k == 27 or k == ord('q'):
            print("\n\n!!! You Stopped !!!")
            break
def setData(self):
    main_df = pd.DataFrame()
    for i in tq(self.data['Name'].unique()):
        df = self.data[self.data['Name'] == i]
        df.rename(columns={'close': i}, inplace=True)
        df.drop(['open', 'high', 'Name', 'volume', 'low'], axis=1, inplace=True)
        if main_df.empty:
            main_df = df
        else:
            main_df = main_df.join(df, how='outer')
    return main_df
def getTFdensity(annot, direction='sense', yeastract=None):
    tln = 0
    tlnDws = 0
    ttf = pd.Series(0, index=yeastract['TFlist'].unique())
    ttfDws = pd.Series(0, index=yeastract['TFlist'].unique())
    for i in tq(range(len(annot))):
        strn = annot['strand'].iloc[i]
        ch = annot['chr'].iloc[i]
        # 200 bp window upstream of the peak (strand-aware)
        st = annot['peak_position'].iloc[i] - 200 if strn == "+" else annot['peak_position'].iloc[i]
        en = annot['peak_position'].iloc[i] if strn == "+" else annot['peak_position'].iloc[i] + 200
        tf1 = yeastract['start'] >= st
        tf2 = yeastract['stop'] <= en
        tf3 = yeastract['sequence name'] == ch
        if direction == 'sense':
            tfa = yeastract['strand'] == strn
        elif direction == 'antisense':
            tfa = yeastract['strand'] == '-' if strn == '+' else yeastract['strand'] == '+'
        elif direction == 'both':
            tfa = np.ones((len(yeastract['strand'])), dtype=bool)
        df = yeastract[tf1 & tf2 & tf3 & tfa]
        ttf = ttf.add(df['TFlist'].value_counts(), fill_value=0)
        tln += en - st
        # 200 bp window downstream of the peak (strand-aware)
        st = annot['peak_position'].iloc[i] if strn == "+" else annot['peak_position'].iloc[i] - 200
        en = annot['peak_position'].iloc[i] + 200 if strn == "+" else annot['peak_position'].iloc[i]
        tf1 = yeastract['start'] >= st
        tf2 = yeastract['stop'] <= en
        df = yeastract[tf1 & tf2 & tf3 & tfa]
        if any(df['TFlist'].value_counts()):
            if (en < st) | any(df['TFlist'].value_counts() < 0):
                print('Alert!!! something is wrong!')
            ttfDws = ttfDws.add(df['TFlist'].value_counts(), fill_value=0)
            tlnDws += en - st
    return ttf, tln, ttfDws, tlnDws
def most_freq_kmers(S, d):
    '''
    Return most frequent kmers for all k; 3 <= k <= len(S) - 2
    '''
    from tqdm import tqdm as tq

    len_S = len(S)
    most_freq_kmers_dict = {}
    for i in tq(range(9, 10)):  # TODO: fix later, only computing for 9-mers
        kmer_dict = frequent_word(S, i, d)
        most_freq_kmers_dict[str(i) + '-mer'] = kmer_dict
    return most_freq_kmers_dict
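# Hedged usage sketch for most_freq_kmers(): a short toy sequence; frequent_word
# is the helper assumed above, and per the TODO only 9-mers are computed, so the
# returned dict currently has a single key.
def _example_most_freq_kmers():
    sequence = "ACGTTGCATGTCGCATGATGCATGAGAGCT"
    result = most_freq_kmers(sequence, 1)
    print(result.keys())  # dict_keys(['9-mer'])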
def plot_regret_t(self):
    pt.figure()
    mean = np.zeros(self.steps)
    for i in tq(range(0, self.runs)):
        np.random.seed(i)
        theta_star = self.env.set_theta()
        r = LinUCB(self.env).regret_t(self.steps, theta_star)
        pt.subplot(211)
        pt.plot(self.step_list, r[0])
        mean += r[0]
    pt.subplot(212)
    mean = [i / self.runs for i in mean]
    pt.plot(self.step_list, mean)
    pt.show()
def read_arrays_local(
    local_path: str, reg_exp_pattern: str = r".+\.npy", stack_input: int = 1, verbose=True
) -> Iterator[np.ndarray]:
    """
    Iterate over numpy array files that match the regex pattern and yield their content.
    It is possible to iterate over the stacked content of several arrays.

    Parameters
    ----------
    local_path : str
        Path on local disk of arrays in numpy format.
    reg_exp_pattern : str (default ".+\\.npy")
        Regular expression that the file names must match.
    stack_input : int (default 1)
        Number of arrays that should be stacked at each iteration.
        This parameter is useful when working with many small files.
    verbose : bool
        Print detailed information if set to True.

    Returns
    -------
    arrays_iterator : Iterator[np.ndarray]
        An iterator over batches of stacked arrays.
    """
    assert stack_input > 0

    reg_exp = re.compile(reg_exp_pattern)
    filenames = next(os.walk(local_path))[2]
    filenames = [filename for filename in filenames if reg_exp.match(filename)]
    filenames.sort()
    embeddings_stack: List[np.ndarray] = []
    iterator = enumerate(filenames)
    if verbose:
        iterator = tq(list(iterator))
    for file_number, file_name in iterator:
        if embeddings_stack and (file_number % stack_input == 0):
            yield np.concatenate(embeddings_stack)
            embeddings_stack = []
        try:
            embeddings_stack.append(np.load(f"{local_path}/{file_name}"))
        except Exception as e:  # pylint: disable=broad-except
            print(e)
    if embeddings_stack:
        yield np.concatenate(embeddings_stack).astype(np.float32)
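# Hedged usage sketch for read_arrays_local(): the directory is hypothetical;
# each yielded batch is the concatenation of up to `stack_input` .npy files.
def _example_read_arrays_local():
    for batch in read_arrays_local("/tmp/embeddings_numpy", stack_input=4):
        print(batch.shape, batch.dtype)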
def mean_sd_calculator(args):
    dirpath = os.path.join(args.dpath, 'original/')
    file_list = os.listdir(dirpath)
    mean_list = []
    sd_list = []
    for fpath in tq(file_list):
        ds = dcmread(dirpath + fpath)
        np_array = ds.pixel_array
        mean = np.mean(np_array)
        mean_list.append(mean)
        sd = np.std(np_array)
        sd_list.append(sd)
    print(f'Mean: {np.mean(mean_list)}')
    print(f'Standard Deviation: {np.std(sd_list)}')
def dcm_to_jpg(args):
    dirpath = os.path.join(args.dpath, 'original/')
    savepath = os.path.join(args.dpath, 'processed_data/')
    if not os.path.isdir(savepath):
        os.makedirs(savepath)
    file_list = os.listdir(dirpath)
    for fpath in tq(file_list):
        ds = dcmread(os.path.join(dirpath, fpath))
        np_array = ds.pixel_array
        im = Image.fromarray(np_array)
        name = fpath.split('.dcm')[0]
        im.save(os.path.join(savepath, name + ".jpg"))
    print('Completed')
def read_embeddings_remote(
    embeddings_path: str, column_label: str = "embedding", stack_input: int = 1, verbose=True
) -> Iterator[np.ndarray]:
    """
    Return an iterator over embeddings from a parquet folder.

    Parameters
    ----------
    embeddings_path : str
        Path on the hdfs of the embeddings in parquet format.
    column_label : str (default "embedding")
        Name of the column in which the embeddings are stored.
    stack_input : int (default 1)
        Number of arrays that should be stacked at each iteration.
        This parameter is useful when working with many small files.
    verbose : bool
        Print detailed information if set to True.

    Returns
    -------
    embeddings_iterator : Iterator[np.ndarray]
        An iterator over batches of embedding arrays.
    """
    assert stack_input > 0
    filenames = read_filenames(embeddings_path)
    embeddings_stack: List[np.ndarray] = []
    iterator = list(enumerate(filenames))
    if verbose:
        iterator = tq(iterator)
    for file_number, file_name in iterator:
        if embeddings_stack and (file_number % stack_input == 0):
            yield np.concatenate(embeddings_stack)
            embeddings_stack = []
        small_table = pq.read_table(file_name)
        pandas_df = small_table[column_label].to_pandas()
        embeddings_stack.append(np.stack(pandas_df).astype("float32"))
    if embeddings_stack:
        yield np.concatenate(embeddings_stack)
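# Hedged usage sketch for read_embeddings_remote(): the parquet folder path is
# hypothetical; read_filenames is the same helper assumed by download() above.
def _example_read_embeddings_remote():
    path = "hdfs://namenode/user/me/embeddings_parquet"
    for batch in read_embeddings_remote(path, column_label="embedding", stack_input=2):
        print(batch.shape)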
def create_annotation(args):
    file_list = os.listdir(os.path.join(args.dpath, 'processed_data/'))
    df_class = pd.read_csv(
        os.path.join(args.dpath, 'stage_2_detailed_class_info.csv'))
    labels = ["Lung Opacity", "Normal", "No Lung Opacity / Not Normal"]
    dict_annotation = {}
    for file in tq(file_list):
        patient_id = file.split('.jpg')[0]
        tmp = df_class[df_class["patientId"] == patient_id]["class"].values[0]
        idx = labels.index(tmp)
        dict_annotation[str(file)] = idx
    with open(os.path.join(args.dpath, 'rsna_annotation.json'), 'w') as f:
        json.dump(dict_annotation, f)
    print('Created and saved rsna_annotation.json file.')
def main():
    t_c_dic = defaultdict(int)
    t_dic = defaultdict(int)
    c_dic = defaultdict(int)
    N = 0
    for line in tq(open('knock82_out', encoding='utf8')):
        t, cs = line.split('\t')
        t_dic[t] += 1
        for c in cs.split():
            t_c_dic[f'{t} {c}'] += 1
            c_dic[c] += 1
        N += 1
    with open('counts.pickle', mode='wb') as f:
        pickle.dump(t_c_dic, f)
        pickle.dump(t_dic, f)
        pickle.dump(c_dic, f)
        pickle.dump(N, f)
def main():
    client = pymongo.MongoClient()
    db = client.db_knock64
    collection = db.collection_knock64
    batch = []
    for i, line in tq(enumerate(gzip.open('artist.json.gz', 'rt', encoding='utf8'))):
        jdata = json.loads(line)
        batch.append(jdata)
        if not i % 10000 and batch:
            collection.insert_many(batch)
            batch = []
    # Insert any records left over after the last full batch.
    if batch:
        collection.insert_many(batch)
    collection.create_index([('name', pymongo.ASCENDING)])
    collection.create_index([('aliases.name', pymongo.ASCENDING)])
    collection.create_index([('tags.value', pymongo.ASCENDING)])
    collection.create_index([('rating.value', pymongo.ASCENDING)])
def occlusion(dataset_path: Path, model_path: Path, output_path: Path):
    check_path(output_path)
    nx_graphs, labels = read_graphs(dataset_path)
    model = load_model(model_path)

    def prepare_input(g):
        node_count = len(g.nodes)
        adj = np.zeros((1, 100, 100))
        adj[0, :node_count, :node_count] = nx.to_numpy_matrix(g)
        adj = torch.tensor(adj, dtype=torch.float)
        x = torch.ones((1, 100, 10), requires_grad=False, dtype=torch.float)
        return x, adj

    def explain(graph_num):
        model.eval()
        g = nx_graphs[graph_num]
        x, adj = prepare_input(g)
        ypred, _ = model(x, adj)
        true_label = labels[graph_num]
        before_occlusion = ypred[0].softmax(0)
        node_importance = {}
        # Importance of a node = change in the true-class probability when it is removed.
        for removed_node in g.nodes():
            g2 = g.copy()
            g2.remove_node(removed_node)
            x, adj = prepare_input(g2)
            ypred, _ = model(x, adj)
            after_occlusion = ypred[0].softmax(0)
            importance = abs(after_occlusion[true_label] - before_occlusion[true_label])
            node_importance[int(removed_node)] = importance.item()
        N = nx_graphs[graph_num].number_of_nodes()
        masked_adj = np.zeros((N, N))
        for u, v in nx_graphs[graph_num].edges():
            u = int(u)
            v = int(v)
            masked_adj[u, v] = masked_adj[v, u] = node_importance[u] + node_importance[v]
        return masked_adj

    for gid in tq(nx_graphs):
        masked_adj = explain(gid)
        np.save(output_path / ('%s.npy' % gid), masked_adj)
def write_adjacency(output_path: Path, dataset: Dataset, graphs):
    relabled_gs = []
    first_label = 1
    graph_indicator = []
    for g, label in tq(graphs):
        relabled_gs.append(
            nx.convert_node_labels_to_integers(g, first_label=first_label))
        N = len(g.nodes())
        first_label += N
        graph_indicator.extend([g.graph['graph_num']] * N)

    with open(output_path / ('%s_A.txt' % dataset.value), 'w') as f:
        for g in relabled_gs:
            for u, v in g.edges():
                f.write(f'{u}, {v}\n{v}, {u}\n')

    with open(output_path / ('%s_graph_indicator.txt' % dataset.value), 'w') as f:
        f.write('\n'.join(map(str, graph_indicator)))

    with open(output_path / ('%s_graph_labels.txt' % dataset.value), 'w') as f:
        f.write('\n'.join([str(label) for g, label in graphs]))