Example #1
def download(parquet_embeddings_path: str,
             dest_path: str,
             n_cores: int = 32,
             verbose: bool = True) -> bool:
    """
    Download .parquet files from hdfs at max speed
    Parallelisation is essential to use the full bandwidth.
    """

    filenames = read_filenames(parquet_embeddings_path)

    nb_files = len(filenames)

    os.makedirs(dest_path, exist_ok=True)

    src_dest_paths = zip(filenames, repeat(dest_path))

    if n_cores == 1:

        if verbose:
            src_dest_paths = tq(list(src_dest_paths))

        for src_dest_path in src_dest_paths:
            download_one(src_dest_path)

    else:

        with tq(total=nb_files) as pbar:
            with Pool(processes=n_cores) as pool:
                for _ in pool.imap_unordered(download_one, src_dest_paths):
                    pbar.update(1)

    return True
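The example depends on two helpers that are not shown, read_filenames and download_one. The following is only a hedged sketch of what they might look like, using the standard hdfs CLI; the real project may implement them differently.

import os
import subprocess
from typing import List, Tuple


def read_filenames(parquet_path: str) -> List[str]:
    # list the .parquet files under an hdfs directory (paths only, via -C)
    out = subprocess.check_output(["hdfs", "dfs", "-ls", "-C", parquet_path], text=True)
    return [line for line in out.splitlines() if line.endswith(".parquet")]


def download_one(src_dest_path: Tuple[str, str]) -> None:
    # copy a single file from hdfs into the local destination folder
    src, dest = src_dest_path
    subprocess.check_call(
        ["hdfs", "dfs", "-get", src, os.path.join(dest, os.path.basename(src))])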
Example #2
    def valid(self):
        self.model.eval()
        loss_valid_r = 0
        valid_batches = 0      # Counter for valid batches
        out_gt = torch.FloatTensor().to(self.device)
        out_pred = torch.FloatTensor().to(self.device)
        with torch.no_grad():
            for (var_input, var_target) in tq(self.data_loader_valid):
                var_target = var_target.to(self.device)
                out_gt = torch.cat((out_gt, var_target), 0).to(self.device)

                _, c, h, w = var_input.size()
                var_input = var_input.view(-1, c, h, w)

                var_output = self.model(var_input.to(self.device))
                out_pred = torch.cat((out_pred, var_output), 0)

                lossvalue = self.loss_fn(
                    var_output, tfunc.one_hot(var_target.squeeze(1).long(), num_classes=self.class_count).float())

                loss_valid_r += lossvalue.item()
                valid_batches += 1

            valid_loss = loss_valid_r / valid_batches

            auroc_individual = compute_auroc(
                tfunc.one_hot(out_gt.squeeze(1).long()).float(),
                out_pred, self.class_count)
            print(len(auroc_individual))
            auroc_mean = np.array(auroc_individual).mean()
        return valid_loss, auroc_mean
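compute_auroc is not defined in this example. A minimal per-class sketch using scikit-learn's roc_auc_score, written only as an assumption about its intended behaviour (one-hot ground truth, raw model scores, one AUROC per class):

import numpy as np
from sklearn.metrics import roc_auc_score


def compute_auroc(ground_truth, predictions, class_count):
    # per-class AUROC over one-hot ground truth and model scores
    gt = ground_truth.cpu().numpy()
    pred = predictions.cpu().numpy()
    aurocs = []
    for i in range(class_count):
        try:
            aurocs.append(roc_auc_score(gt[:, i], pred[:, i]))
        except ValueError:
            # a class can be entirely absent from the validation split
            continue
    return aurocs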
Example #3
    def _popularity_biased_sampling(self):
        """ 使用采样方法,改变self.data_dicts的每个item,将每个item添加 item_query
        """
        assert hasattr(
            self, "dataloader"), "AmazonDataset don't have self.dataloader"
        dataloader = self.dataloader

        train_data = dataloader.train_batch_data.values.tolist()
        items_pop = {}
        user2items = {}
        for single_item in train_data:
            user, item, f, s = int(single_item[0]), int(single_item[1]), int(
                single_item[2]), int(single_item[3])
            items_pop[item] = items_pop.get(item, 0) + 1
            user2items[user] = user2items.get(user, []) + [item]
        user2items = {k: set(v) for k, v in user2items.items()}
        import math
        import numpy as np
        items_pop = {k: math.pow(v, 0.75) for k, v in items_pop.items()}

        self.items_pop = items_pop
        self.user2items = user2items

        if not self._lazy:
            ret = []
            print("Substitution Sampling Process: ")
            for item in tq(self.dataset_dicts):
                ret.append(foreach_sample(item))
            #self.dataset_dicts = [ foreach_sample(item) for item in self.dataset_dicts ]
            self.dataset_dicts = ret
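The items_pop dictionary above stores each item's popularity raised to the 0.75 power, the smoothing exponent popularised by word2vec-style negative sampling. foreach_sample itself is not shown in the example, so the following is only a hedged sketch of how such a popularity-biased sampler might draw a substitute item.

import numpy as np


def sample_negative(items_pop, user_items, rng=None):
    # draw an item with probability proportional to popularity**0.75,
    # rejecting items the user has already interacted with
    rng = rng or np.random.default_rng()
    items = np.array(list(items_pop.keys()))
    weights = np.array(list(items_pop.values()), dtype=float)
    probs = weights / weights.sum()
    while True:
        item = int(rng.choice(items, p=probs))
        if item not in user_items:
            return item

For a given user, user_items would be the corresponding entry of the user2items dict built above.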
Example #4
def convert_all_parquet_to_numpy(
    parquet_folder: str,
    embeddings_folder: str,
    n_cores: int = 32,
    delete: bool = False,
    embedding_column_name: str = "embedding",
) -> None:
    """ Convert embedding parquet files to an embedding numpy files """

    assert n_cores > 0
    os.makedirs(embeddings_folder, exist_ok=True)

    parquet_files = [
        f"{parquet_folder}/{x}" for x in os.listdir(parquet_folder)
        if x.endswith(".parquet")
    ]
    parquet_files.sort()

    nb_files = len(parquet_files)

    func = partial(run_one,
                   embeddings_folder=embeddings_folder,
                   delete=delete,
                   embedding_column_name=embedding_column_name)

    with tq(total=nb_files) as pbar:
        with Pool(processes=n_cores) as pool:
            for _ in pool.imap_unordered(func, parquet_files):
                pbar.update(1)
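run_one is not shown in the example. This is a hedged sketch of what the per-file worker might do, mirroring the parquet-to-numpy conversion used elsewhere in this collection; the real helper may differ.

import os

import numpy as np
import pyarrow.parquet as pq


def run_one(parquet_file: str, embeddings_folder: str, delete: bool, embedding_column_name: str) -> None:
    # read one parquet file, stack its embedding column, and save it as .npy
    table = pq.read_table(parquet_file)
    embeddings = np.stack(table[embedding_column_name].to_pandas()).astype("float32")
    out_name = os.path.splitext(os.path.basename(parquet_file))[0] + ".npy"
    np.save(os.path.join(embeddings_folder, out_name), embeddings)
    if delete:
        os.remove(parquet_file)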
Example #5
    def plot_delay(self):
        pt.figure()
        mean1 = np.zeros(self.steps)
        mean2 = np.zeros(self.steps)
        for i in tq(range(0, self.runs)):
            np.random.seed(i)
            theta_star = self.env.set_theta()
            r1 = LinUCB(self.env).regret_delay_t(self.steps, theta_star)
            r2 = LinUCB(self.env).regret_t(self.steps, theta_star)
            # pt.subplot(221)
            # pt.plot(self.step_list, r[0])
            mean1 += r1[0]
            mean2 += r2[0]
        # pt.subplot(222)
        mean1 = [i / self.runs for i in mean1]
        mean2 = [i / self.runs for i in mean2]

        # for i in tq(range(0, self.runs)):
        #     np.random.seed(i)
        #     theta_star = self.env.set_theta()
        #     # r1 = LinUCB(self.env).regret_delay_t(self.steps, theta_star)
        #     r2 = LinUCB(self.env).regret_t(self.steps, theta_star)
        #     # pt.subplot(221)
        #     # pt.plot(self.step_list, r[0])
        #     # mean1 += r1[0]
        #     mean2 += r2[0]
        # # pt.subplot(222)
        # # mean1 = [i/self.runs for i in mean1]
        # mean2 = [i/self.runs for i in mean2]

        pt.plot(self.step_list, mean1, label="delay")
        pt.plot(self.step_list, mean2, label="non_delay")
        pt.legend()
        pt.show()
Example #6
def sa(dataset_path: Path, model_path: Path, output_path: Path):
    check_path(output_path)
    nx_graphs, labels = read_graphs(dataset_path)
    model = load_model(model_path)

    def explain(graph_num):
        g = nx_graphs[graph_num]
        node_count = len(g.nodes)

        adj = np.zeros((1, 100, 100))
        adj[0, :node_count, :node_count] = nx.to_numpy_matrix(g)
        adj = torch.tensor(adj, dtype=torch.float)
        x = torch.ones((1, 100, 10), requires_grad=True, dtype=torch.float)

        ypred, _ = model(x, adj)

        loss = model.loss(ypred, torch.LongTensor([labels[graph_num]]))
        loss.backward()
        node_importance = x.grad.detach().numpy()[0][:node_count]
        node_importance = (node_importance ** 2).sum(axis=1)
        N = nx_graphs[graph_num].number_of_nodes()
        masked_adj = np.zeros((N, N))
        for u, v in nx_graphs[graph_num].edges():
            u = int(u)
            v = int(v)
            masked_adj[u, v] = masked_adj[v, u] = node_importance[u] + node_importance[v]
        return masked_adj

    for gid in tq(nx_graphs):
        masked_adj = explain(gid)
        np.save(output_path / ('%s.npy' % gid), masked_adj)
    def CompanyNames(self):
        # print(self.CompaniesNames)
        for i in tq(self.CompaniesNames):
            self.CompanyWiseData[i] = self.data[self.data['Name'] == i]
        # self.FindTop()
        # self.FindBottomTen()
        self.TopAndBottom()
    def TopAndBottom(self):
        for i in tq(self.CompaniesNames):
            self.ATRWEEKLY[i] = average_true_range(
                self.CompanyWiseData[i]['high'],
                self.CompanyWiseData[i]['low'],
                self.CompanyWiseData[i]['close'],
                n=7
            ).mean()
            self.ATRANNUALLY[i] = average_true_range(
                self.CompanyWiseData[i]['high'],
                self.CompanyWiseData[i]['low'],
                self.CompanyWiseData[i]['close'],
                n=265
            ).mean()
        self.WeeklySorted = sorted(self.ATRWEEKLY.items(),
                                    key = lambda x:x[1])
        self.AnnualySorted = sorted(self.ATRANNUALLY.items(),
                                    key = lambda x:x[1])
        # print(f"top ten Weekly volatile companies are {self.WeeklySorted[:10] }")
        # print(f"top ten Weekly least volatile companies are {self.WeeklySorted[-10:] }")
        # print(f"top ten Anually volatile companies are {self.AnnualySorted[:10] }")
        # print(f"top ten Anually least volatile companies are {self.AnnualySorted[-10:] }")
        topwL_name,topwL_value = self.namesAndValue(self.WeeklySorted[:10])
        topaL_name,topaL_value = self.namesAndValue(self.AnnualySorted[:10])
        topw_name,topw_value = self.namesAndValue(self.WeeklySorted[-10:])
        topa_name,topa_value = self.namesAndValue(self.AnnualySorted[-10:])
        

        self.plotting([topa_name, topa_value,
                        topw_name, topw_value,
                        topaL_name, topaL_value,
                        topwL_name, topwL_value])
        print(topa_name)
        self.p.plotting(self.CompanyWiseData[topa_name[0]])
        self.p.plotting(self.CompanyWiseData[topaL_name[0]])
    def __call__(self, data):
        if isinstance(data, list):
            data = [self._process(d) for d in tq(data)]
            data = list(itertools.chain(*data))  # a 2d list needs to be flattened
        else:
            data = self._process(data)
        return data
    def _valid(self):
        self.model.eval()
        avg_loss = 0.0
        avg_acc = 0.0
        n_samples = 0
        progress_bar = tq(self.data_loader_valid)
        progress_bar.set_description("Validation")
        for batch_idx, (data, target) in enumerate(progress_bar):
            if self.cuda_available:
                data = data.cuda(non_blocking=True)
                target = target.cuda(non_blocking=True)
            output = self.model(data)
            loss = F.cross_entropy(output, target)
            avg_loss += loss.item()
            y_hat = output.argmax(dim=1)
            avg_acc += (target == y_hat).sum().item()
            n_samples += len(target)
            if batch_idx % self.args.logFrequency == 0:
                acc = avg_acc / n_samples
                metrics = {
                    'loss': '{:.3f}'.format(avg_loss / (batch_idx + 1)),
                    'acc': '{:.2f}%'.format(acc * 100)
                }
                progress_bar.set_postfix(metrics)
        loss = avg_loss / len(self.data_loader_valid)
        acc = avg_acc / n_samples
        torch.cuda.empty_cache()
        return {"loss": loss, "acc": acc}
    def _train_iter(self):
        j = 1
        self.model.train()
        self.optimizer.zero_grad()
        progress_bar = tq(self.data_loader_train)
        progress_bar.set_description("Training")
        avg_loss = 0.0
        for batch_idx, (data, target) in enumerate(progress_bar):
            if self.cuda_available:
                data = data.cuda(non_blocking=True)
                target = target.cuda(non_blocking=True)
            output = self.model(data)
            loss = F.cross_entropy(output, target)
            loss.backward()
            avg_loss += loss.item()
            # gradient accumulation: step the optimizer only every
            # batch_accumulation mini-batches
            if j % self.batch_accumulation == 0:
                j = 1
                self.optimizer.step()
                self.optimizer.zero_grad()
            else:
                j += 1
            if batch_idx % self.args.logFrequency == 0:
                progress_bar.set_postfix(
                    {'Loss': '{:.3f}'.format(avg_loss / (batch_idx + 1))})
        torch.cuda.empty_cache()
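The j % self.batch_accumulation logic above implements gradient accumulation: gradients from several mini-batches are summed before a single optimizer step, which simulates a larger effective batch size. A minimal standalone sketch of the same idea follows; model, loader and optimizer are placeholders, and unlike the method above the loss is divided by the number of accumulation steps so the summed gradient matches one large batch.

import torch.nn.functional as F


def train_with_accumulation(model, loader, optimizer, accum_steps=4, device="cpu"):
    model.train()
    optimizer.zero_grad()
    for step, (data, target) in enumerate(loader, start=1):
        data, target = data.to(device), target.to(device)
        loss = F.cross_entropy(model(data), target)
        (loss / accum_steps).backward()  # gradients accumulate across mini-batches
        if step % accum_steps == 0:      # step only every accum_steps mini-batches
            optimizer.step()
            optimizer.zero_grad()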
Example #12
    def epoch_train(self):
        loss_train_list = []
        loss_valid_list = []
        self.model.train()
        scheduler = StepLR(self.optimizer, step_size=6, gamma=0.002)

        for batch_id, (var_input, var_target) in tq(enumerate(self.data_loader_train)):
            var_target = var_target.to(self.device)
            var_input = var_input.to(self.device)
            var_output = self.model(var_input)
            trainloss_value = self.loss_fn(
                var_output,
                tfunc.one_hot(var_target.squeeze(1).long(), num_classes=self.class_count).float())

            self.optimizer.zero_grad()
            trainloss_value.backward()
            self.optimizer.step()
            train_loss_value = trainloss_value.item()
            loss_train_list.append(train_loss_value)

            if batch_id % (len(self.data_loader_train)-1) == 0 and batch_id != 0:
                validloss_value, auroc_mean = self.valid()
                loss_valid_list.append(validloss_value)
                if auroc_mean > self.auroc_max:
                    print('Better auroc obtained')
                    self.auroc_max = auroc_mean

                scheduler.step()

        train_loss_mean = np.mean(loss_train_list)
        valid_loss_mean = np.mean(loss_valid_list)
        return train_loss_mean, valid_loss_mean, auroc_mean
Example #13
    def extract_data_from_html_file(self, html_file):
        print('[INFO]: Extracting data from {}'.format(html_file))
        soup = BeautifulSoup(open(html_file, encoding='utf8'), 'lxml')

        users, msgs, dates, times = [], [], [], []

        # Find 'thread' tags
        for thread in soup.find_all(class_='thread'):
            # Find 'message' tags
            for chat in tq(thread.find_all(class_='message'), desc='Chats'):
                # Extract sender and message
                user = str(chat.find(class_='user').string)
                msg = str(chat.next_sibling.string)

                # Extract date and time
                full_date = dt.strptime(
                    chat.find(class_='meta').string.replace("+01", ""),
                    self.full_date_format)
                date = str(full_date.strftime(self.date_format))
                time = str(full_date.strftime(self.time_format))

                # Ignore 'pictures'
                if msg != 'None':
                    users.append(user)
                    msgs.append(msg)
                    dates.append(date)
                    times.append(time)

        print('[INFO]: Data extracted from {}'.format(html_file))
        return [users, msgs, dates, times]
Example #14
def plot_profiles_to_file(annot, pntr, ups=200, smooth_param=50):
    pp = PdfPages(options.save_path + 'Figures/individual_signals.pdf')
    clrs_ = ['red', 'blue', 'black', 'orange', 'magenta', 'cyan']
    vec_sense = {}
    vec_antisense = {}
    # for qq in tq(range(annot.shape[0])):
    for qq in tq(range(100)):

        chname = annot['chr'].iloc[qq]

        if annot['strand'].iloc[qq] == '+':
            start = annot['start'].iloc[qq] - ups
            stop = annot['end'].iloc[qq]
            for key in pntr.keys():
                vec_sense[key] = pntr[key][0].get_nparray(
                    chname, start, stop - 1)
                vec_antisense[key] = pntr[key][1].get_nparray(
                    chname, start, stop - 1)
            xran = np.arange(start, stop)
        else:
            start = annot['start'].iloc[qq]
            stop = annot['end'].iloc[qq] + ups
            for key in pntr.keys():
                vec_sense[key] = np.flipud(pntr[key][1].get_nparray(
                    chname, start, stop))
                vec_antisense[key] = np.flipud(pntr[key][0].get_nparray(
                    chname, start, stop))
            xran = np.arange(stop, start, -1)

        ax = {}
        fig = pl.figure()
        pl.title(annot['name'].iloc[qq])
        for i, key in enumerate(pntr.keys()):
            sm_vec_se = sm.smooth(vec_sense[key],
                                  smooth_param)[(smooth_param -
                                                 1):-(smooth_param - 1)]
            sm_vec_as = sm.smooth(vec_antisense[key],
                                  smooth_param)[(smooth_param -
                                                 1):-(smooth_param - 1)]
            ax[key] = pl.subplot(len(pntr), 1, i + 1)
            ax[key].plot(xran,
                         vec_sense[key],
                         label=key,
                         color=clrs_[i],
                         alpha=0.5)
            ax[key].plot(xran, -vec_antisense[key], color=clrs_[i], alpha=0.5)
            ax[key].plot(xran, sm_vec_se, color=clrs_[i], linewidth=2)
            ax[key].plot(xran, -sm_vec_as, color=clrs_[i], linewidth=2)
            ax[key].legend(loc='upper center',
                           bbox_to_anchor=(0.5, 1.05),
                           fontsize=6,
                           ncol=1)
        pp.savefig()

        pl.close()
    pp.close()
    for pn in pntr.values():
        pn[0].close()
        pn[1].close()
    def process(self):

        train_areas = [f for f in self.folders if str(self.test_area) not in f]
        test_areas = [f for f in self.folders if str(self.test_area) in f]

        train_files = [(f, room_name, osp.join(self.raw_dir, f, room_name))
                       for f in train_areas
                       for room_name in os.listdir(osp.join(self.raw_dir, f))
                       if ".DS_Store" != room_name]

        test_files = [(f, room_name, osp.join(self.raw_dir, f, room_name))
                      for f in test_areas
                      for room_name in os.listdir(osp.join(self.raw_dir, f))
                      if ".DS_Store" != room_name]

        train_data_list, test_data_list = [], []

        for (area, room_name, file_path) in tq(train_files + test_files):

            if self.debug:
                read_s3dis_format(file_path,
                                  room_name,
                                  label_out=True,
                                  verbose=self.verbose,
                                  debug=self.debug)
            else:
                xyz, rgb, room_labels, room_object_indices = read_s3dis_format(
                    file_path,
                    room_name,
                    label_out=True,
                    verbose=self.verbose,
                    debug=self.debug)

                data = Data(pos=xyz, x=rgb.float(), y=room_labels)

                if self.keep_instance:
                    data.room_object_indices = room_object_indices

                if self.pre_filter is not None and not self.pre_filter(data):
                    continue

                if self.pre_transform is not None:
                    data = self.pre_transform(data)

                if (area, room_name, file_path) in train_files:
                    train_data_list.append(data)
                else:
                    test_data_list.append(data)

        if self.pre_collate_transform:
            train_data_list = self.pre_collate_transform.fit_transform(
                train_data_list)
            test_data_list = self.pre_collate_transform.transform(
                test_data_list)

        torch.save(self.collate(train_data_list), self.processed_paths[0])
        torch.save(self.collate(test_data_list), self.processed_paths[1])
Example #16
def evaluate_model(model, statedict_PATH, num_images, img_size=(56, 56)):

    """Evaluate model using some data """

    model.load_state_dict(torch.load(statedict_PATH))
    model.eval()

    batch_size = 256
    test_augmentations = Compose([
        Resize(*img_size),
        ToFloat(max_value=255),
        ToTensor()], p=1)

    test_df = pd.read_csv("data/test.csv")
    test_df = test_df.reset_index()
    test_dataset = test_digitdataset(data=test_df, transform=test_augmentations)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    test_tq = tq(test_loader, total=int(len(test_loader)))

    preds, labels = [], []

    with torch.no_grad():
        for (images, label) in test_tq:
            images = images["image"].to(device, dtype=torch.float)
            outputs = model(images)
            preds.extend(outputs.cpu().numpy())
            labels.extend(label.cpu().numpy() + 1)

    preds = np.argmax(np.array(preds), axis=1).reshape(-1)

    fig, axes = plt.subplots(nrows=num_images // 4 + 1, ncols=4,
                             figsize=(64, 64), sharex=True, sharey=True)

    # plot the images from the last batch with their predicted labels
    counter = 0
    for row in axes:
        for col in row:
            col.imshow(images[counter].squeeze().detach().permute(1, 2, 0).cpu().numpy())
            col.set_title(f"pred = {preds[counter]}")
            counter += 1

    test_preds = pd.read_csv("data/sample_submission.csv")
    test_preds.ImageId = labels
    test_preds.Label = preds

    save_file = "data/sample_submission_temp.csv"
    if os.path.exists(save_file):
        os.remove(save_file)
    test_preds.to_csv(save_file, index=False)
    print("Submission file created successfully")
Example #17
def multi_crop_img_lst(ROIs, out_paths, in_path, pic_lst):
    """
    crop all ROIs out of entire list of pictures in the in_path folder
    save them to the out_paths list
    """
    ## get a list of all the pics in the in_path folder

    for full_pic_path in tq(pic_lst):
        ## Create a new file name for the cropped img
        pic_name = full_pic_path.rsplit("\\", 1)[-1]
        date = in_path.split("\\")[-3].split("_")[0]
        orient = in_path.split("\\")[-2].split("_")[-1]
        #        new_pic_name = pic_name.strip(".JPG").strip("DSC_")+"_CROPED_"+date+"_"+orient+".jpg"
        new_pic_name = pic_name.strip(".JPG").strip("DSC_") + "_CROPED.jpg"

        ## Load img
        img = cv2.imread(full_pic_path)

        # ===== if anything should be done before cropping the imgs - add code here ====
        #        size = (3000,2000)
        #        img = cv2.resize(img, size)
        # =============================================================================

        ## Loop over selected ROIs
        for j, ROI in enumerate(ROIs):
            ## Crop the img
            x, y, w, h = ROI[0], ROI[1], ROI[2], ROI[3]
            croped_img = img[y:y + h, x:x + w]

            # ===== if anything should be done with the **cropped** imgs add code here =====
            #            rtd = img_procesing.rotate_img(croped_img,180)
            # =============================================================================

            ## create window for every ROI
            cv2.namedWindow("croping_" + str(ROI), cv2.WINDOW_NORMAL)
            cv2.imshow("croping_" + str(ROI), croped_img)

            ## Press Esc OR q key to stop
            k = cv2.waitKey(1) & 0xff
            if k == 27 or k == ord('q'):
                break

            ## Save the img to file

            out_path = out_paths[j] + "\\" + out_paths[j][
                -1] + "_" + new_pic_name
            cv2.imwrite(out_path, croped_img)
            piexif.transplant(full_pic_path, out_path)

        ## If we broke off we should stop this loop as well
        if k == 27 or k == ord('q'):
            print("\n\n!!! You Stoped !!!")
            break
Example #18
    def setData(self):
        main_df = pd.DataFrame()
        for i in tq(self.data['Name'].unique()):
            df = self.data[self.data['Name'] == i]
            df.rename(columns={'close': i}, inplace=True)
            df.drop(columns=['open', 'high', 'Name', 'volume', 'low'], inplace=True)

            if main_df.empty:
                main_df = df
            else:
                main_df = main_df.join(df, how='outer')
        return main_df
Example #19
def getTFdensity(annot, direction='sense', yeastract=None):

    tln = 0
    tlnDws = 0
    ttf = pd.Series(0, index=yeastract['TFlist'].unique())
    ttfDws = pd.Series(0, index=yeastract['TFlist'].unique())

    for i in tq(range(len(annot))):
        strn = annot['strand'].iloc[i]
        ch = annot['chr'].iloc[i]

        st = annot['peak_position'].iloc[i] - 200 if strn == "+" else annot[
            'peak_position'].iloc[i]
        en = annot['peak_position'].iloc[
            i] if strn == "+" else annot['peak_position'].iloc[i] + 200

        tf1 = yeastract['start'] >= st
        tf2 = yeastract['stop'] <= en
        tf3 = yeastract['sequence name'] == ch

        if direction == 'sense':
            tfa = yeastract['strand'] == strn
        elif direction == 'antisense':
            tfa = yeastract['strand'] == '-' if strn == '+' else yeastract[
                'strand'] == '+'
        elif direction == 'both':
            tfa = np.ones((len(yeastract['strand'])), dtype=bool)

        df = yeastract[tf1 & tf2 & tf3 & tfa]

        ttf = ttf.add(df['TFlist'].value_counts(), fill_value=0)
        tln += en - st

        st = annot['peak_position'].iloc[
            i] if strn == "+" else annot['peak_position'].iloc[i] - 200
        en = annot['peak_position'].iloc[i] + 200 if strn == "+" else annot[
            'peak_position'].iloc[i]

        tf1 = yeastract['start'] >= st
        tf2 = yeastract['stop'] <= en

        df = yeastract[tf1 & tf2 & tf3 & tfa]

        if any(df['TFlist'].value_counts()):
            if (en < st) | any(df['TFlist'].value_counts() < 0):
                print('Alert!!! something is wrong!')
            ttfDws = ttfDws.add(df['TFlist'].value_counts(), fill_value=0)
            tlnDws += en - st
        # else:
        #     print df['TFlist'].value_counts()

    return ttf, tln, ttfDws, tlnDws
Example #20
def most_freq_kmers(S, d):
    '''
    Return most frequent kmers
    for all k; 3 <= k <= len(S) - 2
    '''
    from tqdm import tqdm as tq
    len_S = len(S)
    most_freq_kmers_dict = {}
    for i in tq(range(9, 10)):  # TODO: fix later, only computing for 9-mers
        kmer_dict = frequent_word(S, i, d)
        kmer_dict_freq_values = list(kmer_dict.values())
        most_freq_kmers_dict[str(i) + '-mer'] = kmer_dict

    return most_freq_kmers_dict
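frequent_word is not shown in the example. Judging from its signature (sequence S, a k-mer length, and what looks like a mismatch budget d), one plausible brute-force reading, given here only as an assumption, counts for every k-mer occurring in S how many windows of S lie within Hamming distance d of it:

def hamming(a, b):
    # number of positions at which two equal-length strings differ
    return sum(x != y for x, y in zip(a, b))


def frequent_word(S, k, d):
    counts = {}
    kmers = {S[i:i + k] for i in range(len(S) - k + 1)}
    for i in range(len(S) - k + 1):
        window = S[i:i + k]
        for kmer in kmers:
            if hamming(window, kmer) <= d:
                counts[kmer] = counts.get(kmer, 0) + 1
    return counts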
Example #21
    def plot_regret_t(self):
        pt.figure()
        mean = np.zeros(self.steps)
        for i in tq(range(0, self.runs)):
            np.random.seed(i)
            theta_star = self.env.set_theta()
            r = LinUCB(self.env).regret_t(self.steps, theta_star)
            pt.subplot(211)
            pt.plot(self.step_list, r[0])
            mean += r[0]
        pt.subplot(212)
        mean = [i / self.runs for i in mean]
        pt.plot(self.step_list, mean)
        pt.show()
Example #22
def read_arrays_local(
    local_path: str, reg_exp_pattern: str = r".+\.npy", stack_input: int = 1, verbose=True
) -> Iterator[np.ndarray]:
    """
    Iterate over numpy array files that match the reg ex pattern and yield their content.
    It is possible to iterate over the stacked content of several arrays.

    Parameters
    ----------
    local_embeddings_path : str
        Path on local disk of arrays in numpy format.
    stack_input : int (default 1)
        Number of arrays that should be stacked at each iterations.
        This parameter is useful when working with many small files.
    verbose : bool
        Print detailed informations if set to True

    Returns
    -------
    arrays_iterator : Iterator[np.ndarray]
        An iterator over batchs of stacked arrays.
    """

    assert stack_input > 0

    reg_exp = re.compile(reg_exp_pattern)

    filenames = os.walk(local_path).__next__()[2]
    filenames = [filename for filename in filenames if reg_exp.match(filename)]
    filenames.sort()
    embeddings_stack: List[np.ndarray] = []

    iterator = enumerate(filenames)
    if verbose:
        iterator = tq(list(iterator))

    for file_number, file_name in iterator:

        if embeddings_stack and (file_number % stack_input == 0):
            yield np.concatenate(embeddings_stack)
            embeddings_stack = []

        try:
            embeddings_stack.append(np.load(f"{local_path}/{file_name}"))
        except Exception as e:  # pylint: disable=broad-except
            print(e)

    if embeddings_stack:
        yield np.concatenate(embeddings_stack).astype(np.float32)
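A short usage sketch for read_arrays_local; the directory and the running-mean computation are placeholders, not part of the example:

import numpy as np

total, count = None, 0
for batch in read_arrays_local("/tmp/embeddings", stack_input=10, verbose=False):
    # each batch is the concatenation of up to stack_input .npy files
    total = batch.sum(axis=0) if total is None else total + batch.sum(axis=0)
    count += batch.shape[0]
mean_embedding = total / count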
Example #23
def mean_sd_calculator(args):
    dirpath = os.path.join(args.dpath, 'original/')
    file_list = os.listdir(dirpath)
    mean_list = []
    sd_list = []
    for fpath in tq(file_list):
        ds = dcmread(dirpath + fpath)
        np_array = ds.pixel_array
        mean = np.mean(np_array)
        mean_list.append(mean)
        sd = np.std(np_array)
        sd_list.append(sd)

    print(f'Mean: {np.mean(mean_list)}')
    print(f'Standard Deviation: {np.std(sd_list)}')
Example #24
def dcm_to_jpg(args):

    dirpath = os.path.join(args.dpath, 'original/')
    savepath = os.path.join(args.dpath, 'processed_data/')
    if not os.path.isdir(savepath):
        os.makedirs(savepath)
    file_list = os.listdir(dirpath)
    for fpath in tq(file_list):
        ds = dcmread(os.path.join(dirpath, fpath))
        np_array = ds.pixel_array
        im = Image.fromarray(np_array)
        name = fpath.split('.dcm')[0]
        im.save(os.path.join(savepath, name + ".jpg"))

    print('Completed')
Example #25
def read_embeddings_remote(
    embeddings_path: str, column_label: str = "embedding", stack_input: int = 1, verbose=True
) -> Iterator[np.ndarray]:
    """
    Return an iterator over embeddings from a parquet folder

    Parameters
    ----------
    embeddings_path : str
        Path on the hdfs of the embedding in parquet format.
    column_label : str (default "embeddings")
        Name of the column in which the embeddings are stored.
    stack_input : int (default 1)
        Number of arrays that should be stacked at each iterations.
        This parameter is useful when working with many small files.
    verbose : bool
        Print detailed informations if set to True

    Returns
    -------
    embeddings_iterator : Iterator[np.ndarray]
        An iterator over batchs of embedding arrays.
    """

    assert stack_input > 0

    filenames = read_filenames(embeddings_path)

    embeddings_stack: List[np.ndarray] = []

    iterator = list(enumerate(filenames))
    if verbose:
        iterator = tq(iterator)

    for file_number, file_name in iterator:

        if embeddings_stack and (file_number % stack_input == 0):
            yield np.concatenate(embeddings_stack)
            embeddings_stack = []

        small_table = pq.read_table(file_name)
        pandas_df = small_table[column_label].to_pandas()
        embeddings_stack.append(np.stack(pandas_df).astype("float32"))

    if embeddings_stack:
        yield np.concatenate(embeddings_stack)
Example #26
def create_annotation(args):

    file_list = os.listdir(os.path.join(args.dpath, 'processed_data/'))
    df_class = pd.read_csv(
        os.path.join(args.dpath, 'stage_2_detailed_class_info.csv'))
    labels = ["Lung Opacity", "Normal", "No Lung Opacity / Not Normal"]
    dict_annotation = {}
    for file in tq(file_list):
        patient_id = file.split('.jpg')[0]
        tmp = df_class[df_class["patientId"] == patient_id]["class"].values[0]
        idx = labels.index(tmp)
        dict_annotation[str(file)] = idx

    with open(os.path.join(args.dpath, 'rsna_annotation.json'), 'w') as f:
        json.dump(dict_annotation, f)

    print('Created and saved rsna_annotation.json file.')
Example #27
def main():
    t_c_dic = defaultdict(int)
    t_dic = defaultdict(int)
    c_dic = defaultdict(int)
    N = 0
    for line in tq(open('knock82_out', encoding='utf8')):
        t, cs = line.split('\t')
        t_dic[t] += 1
        for c in cs.split():
            t_c_dic[f'{t} {c}'] += 1
            c_dic[c] += 1
            N += 1
    with open('counts.pickle', mode='wb') as f:
        pickle.dump(t_c_dic, f)
        pickle.dump(t_dic, f)
        pickle.dump(c_dic, f)
        pickle.dump(N, f)
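The counts pickled above (pair counts, target counts, context counts, and the total N) are exactly what is needed for positive pointwise mutual information. The following is only a hedged sketch of that follow-up step, reusing the file layout from the example:

import math
import pickle

with open('counts.pickle', mode='rb') as f:
    t_c_dic = pickle.load(f)
    t_dic = pickle.load(f)
    c_dic = pickle.load(f)
    N = pickle.load(f)


def ppmi(t, c):
    # PPMI(t, c) = max(log2(N * f(t, c) / (f(t) * f(c))), 0)
    f_tc = t_c_dic.get(f'{t} {c}', 0)
    if f_tc == 0:
        return 0.0
    return max(math.log2(N * f_tc / (t_dic[t] * c_dic[c])), 0.0)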
Example #28
def main():
    client = pymongo.MongoClient()
    db = client.db_knock64
    collection = db.collection_knock64

    batch = []
    for i, line in tq(enumerate(gzip.open('artist.json.gz', 'rt', encoding='utf8'))):
        jdata = json.loads(line)
        batch.append(jdata)
        if not i % 10000 and batch:
            collection.insert_many(batch)
            batch = []

    # insert any documents left over after the last full batch
    if batch:
        collection.insert_many(batch)

    collection.create_index([('name', pymongo.ASCENDING)])
    collection.create_index([('aliases.name', pymongo.ASCENDING)])
    collection.create_index([('tags.value', pymongo.ASCENDING)])
    collection.create_index([('rating.value', pymongo.ASCENDING)])
Example #29
def occlusion(dataset_path: Path, model_path: Path, output_path: Path):
    check_path(output_path)
    nx_graphs, labels = read_graphs(dataset_path)
    model = load_model(model_path)

    def prepare_input(g):
        node_count = len(g.nodes)
        adj = np.zeros((1, 100, 100))
        adj[0, :node_count, :node_count] = nx.to_numpy_matrix(g)
        adj = torch.tensor(adj, dtype=torch.float)
        x = torch.ones((1, 100, 10), requires_grad=False, dtype=torch.float)
        return x, adj

    def explain(graph_num):
        model.eval()
        g = nx_graphs[graph_num]
        x, adj = prepare_input(g)

        ypred, _ = model(x, adj)
        true_label = labels[graph_num]
        before_occlusion = ypred[0].softmax(0)
        node_importance = {}

        for removed_node in g.nodes():
            g2 = g.copy()
            g2.remove_node(removed_node)
            x, adj = prepare_input(g2)
            ypred, _ = model(x, adj)
            after_occlusion = ypred[0].softmax(0)
            importance = abs(after_occlusion[true_label] - before_occlusion[true_label])
            node_importance[int(removed_node)] = importance.item()

        N = nx_graphs[graph_num].number_of_nodes()
        masked_adj = np.zeros((N, N))
        for u, v in nx_graphs[graph_num].edges():
            u = int(u)
            v = int(v)
            masked_adj[u, v] = masked_adj[v, u] = node_importance[u] + node_importance[v]
        return masked_adj

    for gid in tq(nx_graphs):
        masked_adj = explain(gid)
        np.save(output_path / ('%s.npy' % gid), masked_adj)
Example #30
def write_adjacency(output_path: Path, dataset: Dataset, graphs):
    relabled_gs = []
    first_label = 1
    graph_indicator = []
    for g, label in tq(graphs):
        relabled_gs.append(
            nx.convert_node_labels_to_integers(g, first_label=first_label))
        N = len(g.nodes())
        first_label += N
        graph_indicator.extend([g.graph['graph_num']] * N)
    with open(output_path / ('%s_A.txt' % dataset.value), 'w') as f:
        for g in relabled_gs:
            for u, v in g.edges():
                f.write(f'{u}, {v}\n{v}, {u}\n')
    with open(output_path / ('%s_graph_indicator.txt' % dataset.value),
              'w') as f:
        f.write('\n'.join(map(str, graph_indicator)))
    with open(output_path / ('%s_graph_labels.txt' % dataset.value), 'w') as f:
        f.write('\n'.join([str(label) for g, label in graphs]))