Example #1
    def scrublet_predictions(self, vlm, input_dir, doublet_rate=0.06):
        import sys
        import numpy as np
        import scrublet as scr
        import scipy.io
        print('Loading counts matrix {}/matrix.mtx'.format(input_dir),
              file=sys.stderr)
        counts_matrix = scipy.io.mmread(input_dir + '/matrix.mtx').T.tocsc()
        print("Loading barcodes {}/barcodes.tsv".format(input_dir),
              file=sys.stderr)
        barcodes = np.array(
            scr.load_genes(input_dir + '/barcodes.tsv', delimiter='\t',
                           column=0))

        print("Initializing scrublet object", file=sys.stderr)
        scrub = scr.Scrublet(
            counts_matrix,
            expected_doublet_rate=doublet_rate)  #whole counts matrix
        print("Computing doublet predictions", file=sys.stderr)
        doublet_scores, predicted_doublets = scrub.scrub_doublets(
            min_counts=2,
            min_cells=3,
            min_gene_variability_pctl=85,
            n_prin_comps=30)
        #collapse barcodes, scores, and predictions into a dict
        doublet_dict = {
            barcode: [doublet_scores[i], predicted_doublets[i]]
            for i, barcode in enumerate(barcodes)
        }

        #add doublet score and doublet prediction as column attributes:
        vlm.ca["doublet_scores"] = np.array(
            [doublet_dict[barcode][0] for barcode in vlm.ca['CellID']])
        vlm.ca["doublet_predictions"] = np.array(
            [doublet_dict[barcode][1] for barcode in vlm.ca['CellID']])
        return vlm
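Note on Example #1: load_genes() is reused above simply to read a one-column barcode file. An equivalent read with pandas may be clearer (a sketch, assuming a headerless, tab-separated barcodes.tsv):

import pandas as pd
barcodes = pd.read_csv(input_dir + '/barcodes.tsv', header=None, sep='\t')[0].to_numpy()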
Example #2
def run_scrublet(tenx_h5, doublet_rate=0.06, npca=40, save_to=None):
    if not save_to:
        raise ValueError(
            "Please, specify prefix path where to save results to")
    if tenx_h5.endswith(".h5"):
        ds = sc.read_10x_h5(tenx_h5)
        counts_matrix = ds.X.tocsc().astype(np.longlong)
        obs = ds.obs.reset_index()
        obs.columns = ["0"]
    else:
        counts_matrix = scipy.io.mmread(gzip.open(tenx_h5 +
                                                  '/matrix.mtx.gz')).T.tocsc()
        obs = pd.read_table(gzip.open(tenx_h5 + '/barcodes.tsv.gz'),
                            header=None)
    #features = pd.read_table(gzip.open(input_dir + '/features.tsv.gz'), header=None)
    #genes = scr.make_genes_unique(features[1])
    scrub = scr.Scrublet(counts_matrix, expected_doublet_rate=doublet_rate)
    doublet_scores, doublets = scrub.scrub_doublets(
        min_counts=2,
        min_cells=3,
        min_gene_variability_pctl=85,
        n_prin_comps=npca)
    save_dir = os.path.dirname(save_to)
    if save_dir and not os.path.exists(save_dir):
        os.makedirs(save_dir)
    obs['doublet'] = doublet_scores
    obs.to_csv(save_to + 'doublets.csv')
    scrub.plot_histogram()
    plt.savefig(save_to + 'doublet_hist.pdf')
    if not os.path.exists(save_to + 'threshold.txt'):
        with open(save_to + 'threshold.txt', 'w') as f:
            f.write(str(scrub.threshold_))
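A usage sketch for the function above (paths are hypothetical; save_to acts as a filename prefix for the three outputs):

run_scrublet('sample/filtered_feature_bc_matrix', doublet_rate=0.06,
             save_to='results/sampleA_')
# writes results/sampleA_doublets.csv, results/sampleA_doublet_hist.pdf,
# and results/sampleA_threshold.txt (the last only if not already present)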
Example #3
def run_scrublet(adata, neotic_ratio=.5):
    '''Run Scrublet on an AnnData object and annotate doublet scores, calls, and score quantiles.'''
    import scrublet as scr
    from scipy.stats import rankdata

    expected_doublet_th = adata.shape[0] / 1000 * .01 * neotic_ratio
    adata_raw = adata.raw.copy()
    adata_raw = adata_raw[:, adata_raw.var.index.isin(
        adata.var_names.tolist())]
    counts_matrix = adata_raw.X.expm1()
    del adata_raw
    scrub = scr.Scrublet(
        counts_matrix, expected_doublet_rate=expected_doublet_th)
    doublet_scores, predicted_doublets = scrub.scrub_doublets(
        distance_metric='cosine',
        mean_center=False,
        n_prin_comps=50,
        log_transform=True,
        min_gene_variability_pctl=0)
    scrub.plot_histogram()
    predicted_doublets = scrub.call_doublets(threshold=np.quantile(
        doublet_scores, 1 - expected_doublet_th))  # call doublets directly at this score quantile
    print('total predicted doublets:', sum(predicted_doublets))
    print('predicted doublets ratio:', sum(
        predicted_doublets) / len(predicted_doublets))
    adata.obs['doublet_score'] = doublet_scores
    adata.obs['doublet'] = predicted_doublets
    adata.obs['doublet_quantile'] = (
        rankdata(doublet_scores) / len(doublet_scores))
    return adata
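The distinctive step in Example #3 is bypassing Scrublet's automatic threshold and calling doublets at a fixed score quantile. A standalone sketch of that step on synthetic scores (values are illustrative):

import numpy as np

rng = np.random.default_rng(0)
scores = rng.beta(2, 20, size=5000)   # synthetic doublet scores
expected_rate = 0.05                  # illustrative expected doublet rate
threshold = np.quantile(scores, 1 - expected_rate)
calls = scores >= threshold
print(calls.sum(), 'cells called as doublets')  # ~5% of cells, by construction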
Example #4
def annotate_doublets(mtx_fpath, feature_fpath, expected_doublet_rate2=0.06):
    if False:
        plt.rcParams['font.family'] = 'sans-serif'
        plt.rcParams['font.sans-serif'] = 'Arial'
        plt.rc('font', size=14)
        plt.rcParams['pdf.fonttype'] = 42

    counts_matrix = scipy.io.mmread(mtx_fpath).T.tocsc()
    genes = np.array(scr.load_genes(feature_fpath, delimiter='\t', column=1))

    print('Counts matrix shape: {} rows, {} columns'.format(
        counts_matrix.shape[0], counts_matrix.shape[1]))
    print('Number of genes in gene list: {}'.format(len(genes)))

    scrub = scr.Scrublet(counts_matrix,
                         expected_doublet_rate=expected_doublet_rate2)

    doublet_scores, predicted_doublets = scrub.scrub_doublets(
        min_counts=2,
        min_cells=3,
        min_gene_variability_pctl=85,
        n_prin_comps=30)

    if False:
        scrub.plot_histogram()

        print('Running UMAP...')
        scrub.set_embedding(
            'UMAP', scr.get_umap(scrub.manifold_obs_, 10, min_dist=0.3))
        print('Done.')

        scrub.plot_embedding('UMAP', order_points=True)

    return [doublet_scores, predicted_doublets]
Example #5
def Bertie_preclustered(adata, batch_key='batch', cluster_key='louvain'):
    import scrublet as scr
    scorenames = ['scrublet_score', 'scrublet_cluster_score', 'bh_pval']
    adata.obs['doublet_scores'] = 0
    def bh(pvalues):
        '''
        Computes the Benjamini-Hochberg FDR correction.

        Input:
            * pvalues - vector of p-values to correct
        '''
        n = int(pvalues.shape[0])
        new_pvalues = np.empty(n)
        values = [ (pvalue, i) for i, pvalue in enumerate(pvalues) ]
        values.sort()
        values.reverse()
        new_values = []
        for i, vals in enumerate(values):
            rank = n - i
            pvalue, index = vals
            new_values.append((n/rank) * pvalue)
        for i in range(0, int(n)-1):
            if new_values[i] < new_values[i+1]:
                new_values[i+1] = new_values[i]
        for i, vals in enumerate(values):
            pvalue, index = vals
            new_pvalues[index] = new_values[i]
        return new_pvalues

    for i in np.unique(adata.obs[batch_key]):
        # copy before assigning to .obs so we don't write to an AnnData view
        adata_sample = adata[adata.obs[batch_key] == i, :].copy()
        scrub = scr.Scrublet(adata_sample.X)
        doublet_scores, predicted_doublets = scrub.scrub_doublets(verbose=False)
        adata_sample.obs['scrublet_score'] = doublet_scores

        for clus in np.unique(adata_sample.obs[cluster_key]):
            adata_sample.obs.loc[adata_sample.obs[cluster_key]==clus, 'scrublet_cluster_score'] = \
                np.median(adata_sample.obs.loc[adata_sample.obs[cluster_key]==clus, 'scrublet_score'])

        med = np.median(adata_sample.obs['scrublet_cluster_score'])
        mask = adata_sample.obs['scrublet_cluster_score'] > med
        mad = np.median(adata_sample.obs['scrublet_cluster_score'][mask] - med)
        # let's do a one-sided test; the Bertie write-up does not address this, but it makes sense
        pvals = 1 - scipy.stats.norm.cdf(adata_sample.obs['scrublet_cluster_score'], loc=med, scale=1.4826 * mad)
        adata_sample.obs['bh_pval'] = bh(pvals)
        #create results data frame for single sample and copy stuff over from the adata object
        scrublet_sample = pd.DataFrame(0, index=adata_sample.obs_names, columns=scorenames)
        for meta in scorenames:
            scrublet_sample[meta] = adata_sample.obs[meta]
        #write out complete sample scores
        #scrublet_sample.to_csv('scrublet-scores/'+i+'.csv')

        scrub.plot_histogram()
        #plt.savefig('limb/sample_'+i+'_doulet_histogram.pdf')
        adata.obs.loc[adata.obs[batch_key] == i, 'doublet_scores'] = doublet_scores
        adata.obs.loc[adata.obs[batch_key] == i, 'bh_pval'] = bh(pvals)
        del adata_sample
    return adata
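The bh() helper above reimplements the Benjamini-Hochberg correction by hand; as a sanity check, it can be compared against statsmodels (a sketch, assuming statsmodels is installed):

import numpy as np
from statsmodels.stats.multitest import multipletests

pvals = np.array([0.001, 0.008, 0.039, 0.041, 0.30, 0.95])
_, bh_ref, _, _ = multipletests(pvals, method='fdr_bh')
# bh(pvals) from Example #5 should match bh_ref up to floating-point tolerance
print(bh_ref)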
Example #6
def run_scrublet(adata, resolution_function=None):
    old_verbosity = sc.settings.verbosity
    sc.settings.verbosity = 1
    if resolution_function is None:
        resolution_function = lambda x: np.maximum(
            np.maximum(np.log10(x) - 1, 0)**2, 0.1)
    scrub = scr.Scrublet(adata.X)
    #this has the potential to brick for poor quality data
    #if so, abort it and everything downstream
    try:
        # name the outputs explicitly so pandas' `pd` alias isn't shadowed
        doublet_scores, predicted_doublets = scrub.scrub_doublets(verbose=False)
    except Exception:
        return
    adata.obs['scrublet_score'] = doublet_scores

    adata_copy = adata.copy()
    sc.pp.filter_genes(adata_copy, min_cells=3)
    sc.pp.normalize_total(adata_copy, target_sum=1e4)
    sc.pp.log1p(adata_copy)
    sc.pp.highly_variable_genes(adata_copy,
                                min_mean=0.0125,
                                max_mean=3,
                                min_disp=0.5,
                                subset=True)
    sc.pp.scale(adata_copy, zero_center=False)
    sc.pp.pca(adata_copy, svd_solver='arpack', zero_center=False)
    sc.pp.neighbors(adata_copy, n_pcs=30)
    sc.tl.umap(adata_copy)
    sc.tl.leiden(adata_copy, resolution=1)
    for clst in np.unique(adata_copy.obs['leiden']):
        clst_size = sum(adata_copy.obs['leiden'] == clst)
        sc.tl.leiden(adata_copy,
                     restrict_to=('leiden', [clst]),
                     resolution=resolution_function(clst_size),
                     key_added='leiden_R')
        adata_copy.obs['leiden'] = adata_copy.obs['leiden_R']
    clst_meds = []
    for clst in np.unique(adata_copy.obs['leiden']):
        k = adata_copy.obs['leiden'] == clst
        clst_med = np.median(adata_copy.obs.loc[k, 'scrublet_score'])
        adata_copy.obs.loc[k, 'cluster_scrublet_score'] = clst_med
        clst_meds.append(clst_med)
    clst_meds = np.array(clst_meds)
    pvals, bh_pvals = test_outlier(clst_meds)
    for i, clst in enumerate(np.unique(adata_copy.obs['leiden'])):
        k = adata_copy.obs['leiden'] == clst
        adata_copy.obs.loc[k, 'pval'] = pvals[i]
        adata_copy.obs.loc[k, 'bh_pval'] = bh_pvals[i]
    sc.settings.verbosity = old_verbosity
    #need to also export the clustering, for soupx purposes
    adata.obs['scrublet_leiden'] = adata_copy.obs['leiden']
    adata.obs['scrublet_score'] = adata_copy.obs['scrublet_score']
    adata.obs['cluster_scrublet_score'] = adata_copy.obs[
        'cluster_scrublet_score']
    adata.obs['doublet_pval'] = adata_copy.obs['pval']
    adata.obs['doublet_bh_pval'] = adata_copy.obs['bh_pval']
    del adata_copy
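test_outlier() is not defined in this snippet. One plausible implementation, mirroring the median/MAD one-sided test plus BH correction used in Example #5 (hypothetical; the original helper may differ):

import numpy as np
import scipy.stats

def test_outlier(x):
    # upper-tail test against a normal centred on the median, with a
    # MAD-derived scale, followed by a Benjamini-Hochberg correction
    x = np.asarray(x, dtype=float)
    med = np.median(x)
    mad = np.median(np.abs(x[x > med] - med))
    pvals = 1 - scipy.stats.norm.cdf(x, loc=med, scale=1.4826 * mad)
    n = len(pvals)
    order = np.argsort(pvals)
    ranked = pvals[order] * n / np.arange(1, n + 1)
    bh = np.empty(n)
    bh[order] = np.minimum.accumulate(ranked[::-1])[::-1]
    return pvals, np.clip(bh, 0, 1)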
Example #7
def main():
    # parse command line options
    parser = OptionParser()
    parser.add_option("--inputDir", "-i", dest="input_dir", default=None,
        help=("Directory of input matrix in 10x cellranger format"))
    parser.add_option("--outFile", "-o", dest="out_file", default=None,
        help=("Path for output file [default: $i/scrublet_table.tsv]"))
    parser.add_option("--cellranger2", "-2", dest="cellranger2", 
        action="store_true", default=False, 
        help="Use it for cellranger v2 instead of v3")
    parser.add_option("--expected_rate", "-r", dest="expected_rate", 
        default=None, help="Expected doublet rate: [default: n_cell/100K].")
    parser.add_option("--homotypicP", dest="homotypic_prop", default=0.15, 
        type=float, help="Proportion of homotypic doublets: [default: %default].")
    
    
    (options, args) = parser.parse_args()
    
    dat_path = os.path.abspath(options.input_dir)
    version3 = not options.cellranger2
    mat_dat, gene_ids, cell_ids = load_10X(dat_path, min_counts=None, 
                                           min_cells=None, version3=version3)
    
    n_cell = mat_dat.shape[1]
    if options.expected_rate is None:
        expected_rate = n_cell / 100000.0
    else:
        expected_rate = float(options.expected_rate)
    expected_rate = min(expected_rate, 0.5)
    homotypic_prop = min(max(options.homotypic_prop, 0.01), 0.99)
    
    print("Files loaded: %d cells." %(n_cell))
    print("Expected doublet rate: %.3f" %(expected_rate))
        
    scrub = scr.Scrublet(mat_dat.transpose(), 
                         expected_doublet_rate=expected_rate)
    raw_scores, raw_doublet = scrub.scrub_doublets(n_prin_comps=30)
    simu_scores = scrub.doublet_scores_sim_
    # when there is no suggested threshold
    if raw_doublet is None: 
        raw_doublet = np.array([None] * len(raw_scores))
    
    _cutoff = np.quantile(raw_scores, 1 - (1 - homotypic_prop) * expected_rate)
    label_frac = raw_scores >= _cutoff
    
    if options.out_file is None:
        out_file = dat_path + "/scrublet_table.tsv"
    else:
        out_file = options.out_file
    fid = open(out_file, "w")
    fid.writelines("cellID\tscore\tlabel_raw\tlabel_frac\n")
    for i in range(len(cell_ids)):
        out_list = [cell_ids[i], "%.3f" %raw_scores[i], 
                    str(raw_doublet[i]), str(label_frac[i])]
        fid.writelines("\t".join(out_list) + "\n")
    fid.close()
Example #8
def run_scrublet_atac(input_dir):
    counts_matrix = scipy.io.mmread(input_dir + 'matrix.mtx').T.tocsc()
    print('Counts matrix shape: {} rows, {} columns'.format(counts_matrix.shape[0], counts_matrix.shape[1]))
    scrub = scr.Scrublet(counts_matrix, expected_doublet_rate=0.05)
    doublet_scores, predicted_doublets = scrub.scrub_doublets(min_counts=2,
                                                              min_cells=3,
                                                              min_gene_variability_pctl=85,
                                                              n_prin_comps=30)
    np.savetxt(input_dir + 'predicted_doublet_mask.txt', scrub.predicted_doublets_, fmt='%s')
    np.savetxt(input_dir + 'doublet_scores.txt', scrub.doublet_scores_obs_, fmt='%.4f')
Example #9
def run_scrublet_rna(input_dir):
    counts_matrix = scipy.io.mmread(input_dir + 'matrix.mtx').T.tocsc()
    genes = np.array(scr.load_genes(input_dir + 'features.tsv', delimiter='\t', column=1))
    print('Counts matrix shape: {} rows, {} columns'.format(counts_matrix.shape[0], counts_matrix.shape[1]))
    print('Number of genes in gene list: {}'.format(len(genes)))
    scrub = scr.Scrublet(counts_matrix, expected_doublet_rate=0.05)
    doublet_scores, predicted_doublets = scrub.scrub_doublets(min_counts=2,
                                                              min_cells=3,
                                                              min_gene_variability_pctl=85,
                                                              n_prin_comps=30)
    np.savetxt(input_dir + 'predicted_doublet_mask.txt', scrub.predicted_doublets_, fmt='%s')
    np.savetxt(input_dir + 'doublet_scores.txt', scrub.doublet_scores_obs_, fmt='%.4f')
Example #10
def doublet(adata, key='Sample'):
    '''Detect doublets with Scrublet, run separately for each value of `key`.'''
    doublet = []
    for filename in set(adata.obs[key]):
        print(filename)
        sdata = adata[adata.obs[key] == filename].copy()
        scrub = scr.Scrublet(sdata.X)
        doublet_scores, predicted_doublets = scrub.scrub_doublets(verbose=False)
        doublet.extend(zip(sdata.obs_names, doublet_scores, predicted_doublets))
    doublet_score = {x: y for (x, y, z) in doublet}
    doublet_predict = {x: z for (x, y, z) in doublet}
    adata.obs['doublet_score'] = [doublet_score[obs_name] for obs_name in adata.obs_names]
    adata.obs['doublet_predict'] = [doublet_predict[obs_name] for obs_name in adata.obs_names]
Example #11
def run_scrublet(sample_name, counts_matrix):
    print('run_scrublet.py: run_scrublet: begin')
    warnings.showwarning = handle_warning

    if(numpy.size(counts_matrix, 0) == 0 or numpy.size(counts_matrix, 1) == 0):
        filename = args.sample_name + "-scrublet_hist.png"
        image = Image.new(mode="RGB", size=(800, 600), color="white")
        draw = ImageDraw.Draw(image)
        draw.text((50, 50), "Scrublet failed. This is generally because there aren't enough cells with sufficient reads.\n", fill="black")
        image.save(filename)  # the image was previously created but never written
        return -1

    if(not scipy.sparse.isspmatrix_csc(counts_matrix)):
        counts_matrix = counts_matrix.T.tocsc()
    else:
        counts_matrix = counts_matrix.T

    # count_matrix
    #   rows: cells
    #   cols: genes
    scrub = scr.Scrublet(counts_matrix)

    try:
        doublet_scores, predicted_doublets = scrub.scrub_doublets()
        scrub.plot_histogram()[0].savefig(args.sample_name + "-scrublet_hist.png")
        all_scores = numpy.vstack((doublet_scores, predicted_doublets))
        all_scores = numpy.transpose(all_scores)
        numpy.savetxt(args.sample_name + "-scrublet_table.csv", all_scores, delimiter=",", fmt='%.8e,%d')
    except (ZeroDivisionError, FloatingPointError, ValueError) as eobj:
        tb_str = traceback.format_exc()
        print('%s' % ( tb_str ), file=sys.stderr)
        temp = numpy.array(["NA"] * numpy.size(counts_matrix, 0))
        all_scores = numpy.vstack((temp, temp))
        all_scores = numpy.transpose(all_scores)
        filename = args.sample_name + "-scrublet_hist.png"
        image = Image.new(mode="RGB", size=(800, 600), color="white")
        draw = ImageDraw.Draw(image)
        draw.text((50, 50), "Scrublet failed. This is generally because there aren't enough cells with sufficient reads.\n\nFailure message:\n\n" + tb_str, fill="black")
        image.save(filename)
        numpy.savetxt(args.sample_name + "-scrublet_table.csv", all_scores, fmt="%s", delimiter=",")
    except (AttributeError) as eobj:
        tb_str = traceback.format_exc()
        print('%s' % ( tb_str ), file=sys.stderr)
        predicted_doublets = scrub.call_doublets(threshold=0.15)
        scrub.plot_histogram()[0].savefig(args.sample_name + "-scrublet_hist.png")
        all_scores = numpy.vstack((doublet_scores, predicted_doublets))
        all_scores = numpy.transpose(all_scores)
        numpy.savetxt(args.sample_name + "-scrublet_table.csv", all_scores, delimiter=",", header='doublet_score,doublet')
    print('run_scrublet.py: run_scrublet: end')
    return 0
Example #12
def score_doublets(mtx, doublet_rate):
    scrub = scr.Scrublet(mtx.T, expected_doublet_rate=doublet_rate)
    doublet_scores, predicted_doublets = scrub.scrub_doublets()

    def manual_threshold(scores):
        top_n = int(doublet_rate * len(scores))
        sorted_scores = np.sort(scores)
        threshold = sorted_scores[len(scores) - top_n:].min()
        return threshold

    # scrublet can be conservative -- making sure I get most doublets
    if scrub.threshold_ > 0.3 and sum(
            predicted_doublets) < (doublet_rate * len(doublet_scores)) / 2:
        threshold = manual_threshold(doublet_scores)
        doublets = scrub.call_doublets(threshold=threshold)
    else:
        doublets = scrub.call_doublets()
    return doublets
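The manual_threshold() fallback above is essentially a top-k cutoff; a quick sketch showing the near-equivalent np.quantile formulation (synthetic scores, illustrative rate):

import numpy as np

rng = np.random.default_rng(1)
scores = rng.random(1000)
doublet_rate = 0.06
top_n = int(doublet_rate * len(scores))
manual = np.sort(scores)[len(scores) - top_n:].min()
# roughly the same cutoff, interpolation details aside:
quantile = np.quantile(scores, 1 - doublet_rate)
print(manual, quantile)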
Example #13
def identify_doublets(data, **kw):
    """Detect doublets in single-cell RNA-seq data

    https://github.com/AllonKleinLab/scrublet
    """
    import scrublet as scr
    adata = data.copy()
    col_sum = adata.X.sum(0)
    if hasattr(col_sum, 'A'):
        col_sum = col_sum.A.squeeze()
    keep = col_sum > 3
    adata = adata[:, keep]
    scrub = scr.Scrublet(adata.X, **kw)
    doublet_score, predicted_doublets = scrub.scrub_doublets()
    if predicted_doublets is None:
        predicted_doublets = scrub.call_doublets(threshold=0.34)
    data.obs['doublet_score'] = doublet_score
    data.obs['predicted_doublets'] = predicted_doublets
    return data
Example #14
def detectDoublet(args):
    counts_matrix = readMatrix(args.input, binary=False)
    scrub = scr.Scrublet(counts_matrix,
                         expected_doublet_rate=0.06,
                         sim_doublet_ratio=3,
                         n_neighbors=25)
    doublet_scores, _ = scrub.scrub_doublets(
        min_counts=1,
        min_cells=3,
        min_gene_variability_pctl=85,
        mean_center=True,
        normalize_variance=True,
        n_prin_comps=min(30, counts_matrix.shape[0] // 10))

    # Fit a Gaussian mixture model
    X = scrub.doublet_scores_sim_
    X = np.array([X]).T
    gmm = BayesianGaussianMixture(n_components=2,
                                  max_iter=1000,
                                  random_state=2394).fit(X)
    i = np.argmax(gmm.means_)

    probs_sim = gmm.predict_proba(X)[:, i]
    vals = X[np.argwhere(probs_sim > 0.5)].flatten()
    if vals.size == 0:
        threshold = np.amax(X.flatten())
    else:
        threshold = min(vals)

    X = np.array([doublet_scores]).T
    probs = gmm.predict_proba(X)[:, i].tolist()

    with open(args.output, 'w') as fl:
        fl.write('\t'.join(map(str, probs)))
        fl.write("\n")

        fl.write(str(threshold))
        fl.write("\n")
        fl.write('\t'.join(map(str, (doublet_scores.tolist()))))
        fl.write("\n")
        fl.write('\t'.join(map(str, scrub.doublet_scores_sim_)))
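The GMM-based thresholding above can be exercised in isolation; a sketch fitting BayesianGaussianMixture to synthetic, bimodal simulated-doublet scores (all values illustrative):

import numpy as np
from sklearn.mixture import BayesianGaussianMixture

rng = np.random.default_rng(2394)
sim_scores = np.concatenate([rng.normal(0.10, 0.03, 800),   # singlet-like mode
                             rng.normal(0.50, 0.08, 200)])  # doublet-like mode
X = sim_scores.reshape(-1, 1)
gmm = BayesianGaussianMixture(n_components=2, max_iter=1000,
                              random_state=2394).fit(X)
i = np.argmax(gmm.means_)              # component with the higher mean
probs = gmm.predict_proba(X)[:, i]
vals = X[probs > 0.5]
threshold = vals.min() if vals.size else X.max()
print('threshold:', float(threshold))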
Example #15
def scrublet(adata, expected_rate=0.06, doublet_score=None):
    import scrublet as scr
    import numpy as np
    scrub = scr.Scrublet(adata.X, expected_doublet_rate=expected_rate)

    doublet_scores, predicted_doublets = scrub.scrub_doublets(
        min_counts=2,
        min_cells=3,
        min_gene_variability_pctl=85,
        n_prin_comps=30)

    adata.obs['doublet_scores'] = doublet_scores
    adata.obs['predicted_doublets'] = predicted_doublets

    # only override Scrublet's automatic call when a manual cutoff is given;
    # comparing scores against the default None would raise a TypeError
    if doublet_score is not None:
        adata.obs['predicted_doublets'] = adata.obs['doublet_scores'] > doublet_score
    print('Nr of predicted doublets ', np.sum(adata.obs['predicted_doublets']))
    print('Doublets indices saved in adata.obs["predicted_doublets"]')

    scrub.plot_histogram()
Example #16
def scrublet_py(i, j, val, dim, expected_doublet_rate, min_counts, min_cells,
                min_gene_variability_pctl, n_prin_comps, sim_doublet_ratio,
                n_neighbors):
    import matplotlib
    matplotlib.use('agg')
    import scrublet as scr
    import scipy.io
    import numpy as np
    import os
    from scipy.sparse import csc_matrix
    data = csc_matrix((val, (i, j)), shape=dim)
    scrub = scr.Scrublet(data,
                         expected_doublet_rate=expected_doublet_rate,
                         sim_doublet_ratio=int(sim_doublet_ratio),
                         n_neighbors=int(n_neighbors))
    doublet_scores, predicted_doublets = scrub.scrub_doublets(
        min_counts=int(min_counts),
        min_cells=int(min_cells),
        min_gene_variability_pctl=min_gene_variability_pctl,
        n_prin_comps=int(n_prin_comps))
    return (doublet_scores, predicted_doublets)
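scrublet_py() takes the matrix decomposed into COO triplets (row indices i, column indices j, values val, plus the shape), presumably so it can be driven from another environment such as R via reticulate. A sketch building those arguments from a scipy sparse matrix (synthetic counts; parameter values illustrative):

import numpy as np
from scipy.sparse import random as sparse_random

counts = sparse_random(500, 2000, density=0.05, format='coo', random_state=0,
                       data_rvs=lambda n: np.random.poisson(2, n) + 1)
scores, preds = scrublet_py(counts.row, counts.col, counts.data, counts.shape,
                            expected_doublet_rate=0.06, min_counts=2,
                            min_cells=3, min_gene_variability_pctl=85,
                            n_prin_comps=30, sim_doublet_ratio=2,
                            n_neighbors=15)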
Example #17
def dedoublets(adata,
               edr=0.1,
               npc=30,
               pctl=85,
               pl=False,
               f_out_fig=None,
               dpi=300):
    scrub = scr.Scrublet(adata.X, expected_doublet_rate=edr)
    doublet_scores, predicted_doublets = scrub.scrub_doublets(
        min_gene_variability_pctl=pctl, n_prin_comps=npc)
    #1. remove doublets
    adata.obs['doublets'] = predicted_doublets
    adata = adata[adata.obs['doublets'] == False, :].copy()
    #2. drop doublets column in obs
    adata.obs = adata.obs.drop('doublets', axis=1)
    #3. plot
    if pl:
        scrub.plot_histogram()
        plt.savefig(f_out_fig, dpi=dpi)
        plt.close()
    return adata
Example #18
def anndata_from_mtx(outpath, name):
    import os
    import numpy as np
    import pandas as pd
    import scanpy as sc  # scanpy.api has been removed from current scanpy releases
    import scrublet as scr
    from scipy import sparse
    from skimage.filters import threshold_minimum

    sc.settings.verbosity = 3  # verbosity: errors (0), warnings (1), info (2), hints (3)
    sc.logging.print_versions()
    if not name:
        name = 'scanpy'
    results_file = os.path.join(outpath, name + '_raw.h5ad')
    sc.settings.set_figure_params(dpi=80)

    adata = sc.read(os.path.join(outpath, 'matrix.mtx'),
                    cache=False).T  # transpose the data
    adata.var_names = pd.read_csv(os.path.join(outpath, 'genes.tsv'),
                                  header=None,
                                  sep='\t')[0]
    adata.obs_names = pd.read_csv(os.path.join(outpath, 'barcodes.tsv'),
                                  header=None,
                                  sep='\t')[0]
    adata.var_names_make_unique()
    counts_matrix = sparse.csc_matrix(adata.X)

    scrub = scr.Scrublet(counts_matrix,
                         expected_doublet_rate=round(
                             counts_matrix.shape[0] / 125000, 4))
    doublet_scores, predicted_doublets = scrub.scrub_doublets(
        min_counts=2,
        min_cells=3,
        min_gene_variability_pctl=85,
        n_prin_comps=30)
    threshold = threshold_minimum(scrub.doublet_scores_sim_)
    adata.obs['doublet_score'] = scrub.doublet_scores_obs_
    adata.uns['doublet_threshold'] = threshold
    adata.write_h5ad(results_file)
    return adata
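skimage's threshold_minimum, used above, locates the valley between the two modes of a histogram; a small standalone sketch on synthetic simulated-doublet scores (values illustrative):

import numpy as np
from skimage.filters import threshold_minimum

rng = np.random.default_rng(0)
sim_scores = np.concatenate([rng.normal(0.08, 0.02, 2000),
                             rng.normal(0.45, 0.10, 500)])
print('valley threshold:', threshold_minimum(sim_scores))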
Example #19
def scrublet_c(sample, inDir, outDir, expected_doublet_rate, sim_doublet_ratio,
               ratio_df, out_df):
    print(sample, "start scrublet")
    counts_matrix = scipy.io.mmread(os.path.join(inDir,
                                                 'matrix.mtx')).T.tocsc()
    genes = np.array(
        scr.load_genes(os.path.join(inDir, 'genes.tsv'),
                       delimiter='\t',
                       column=1))

    scrub = scr.Scrublet(counts_matrix,
                         expected_doublet_rate=expected_doublet_rate,
                         sim_doublet_ratio=sim_doublet_ratio)
    doublet_scores, predicted_doublets = scrub.scrub_doublets(
        min_counts=2,
        min_cells=3,
        min_gene_variability_pctl=85,
        n_prin_comps=30)

    scrub.plot_histogram()
    plt.savefig(
        os.path.join(
            outDir, "{0}_scrublet_doublet_score_histogram.pdf".format(sample)))
    print(sample, 'Running scrublet UMAP...')
    scrub.set_embedding('UMAP',
                        scr.get_umap(scrub.manifold_obs_, 10, min_dist=0.3))
    print(sample, 'scrublet Done.')

    scrub.plot_embedding('UMAP', order_points=True)
    plt.savefig(os.path.join(outDir, "{0}_scrublet_UMAP.pdf".format(sample)))
    print(sample, "Done scrublet")

    ratio_df.loc['scrublet', sample] = scrub.detected_doublet_rate_
    out_df['scrublet_doublet_scores'] = doublet_scores
    out_df['scrublet_doublets'] = predicted_doublets

    return ratio_df, out_df
Example #20
    def detect(self):
        try:
            self.adata = sc.read_h5ad(self._adata)
        except Exception:
            self.adata = sc.read_10x_mtx(self._adata)

        print("### Initializing Scrublet")
        counts_matrix = self.adata.raw.X
        scrub = scr.Scrublet(counts_matrix,
                             expected_doublet_rate=self._expected_doublet_rate,
                             sim_doublet_ratio=1.0)

        print("### Detect, Normalize, PCA")
        doublet_scores, predicted_doublets = scrub.scrub_doublets(
            min_counts=self._min_counts,
            min_cells=self._min_cells,
            min_gene_variability_pctl=85,
            n_prin_comps=self._n_prin_comps)
        self.adata.obs["doublet_scores_obs"] = doublet_scores
        self.adata.obs["doublet_errors_obs"] = scrub.doublet_errors_obs_
        self.adata.obs["doublet_errors_sim"] = scrub.doublet_errors_sim_
        self.adata.obs["doublet_scores_sim"] = scrub.doublet_scores_sim_

        self.scrub = scrub
Example #21
plt.rc('font', size=14)
plt.rcParams['pdf.fonttype'] = 42

#filtered
#input_dir = '/share/ScratchGeneral/briglo/scRNA/venchi/data/hg19/VENCHI_SampleBCITE/outs/filtered_feature_bc_matrix/'
input_dir = '/share/ScratchGeneral/briglo/scRNA/venchi/outputs/seurat/'
counts_matrix = scipy.io.mmread(input_dir + 'combined.human.mtx').T.tocsc()
genes = np.array(scr.load_genes(input_dir + 'genes.tsv', delimiter='\t', column=0))

print('Counts matrix shape: {} rows, {} columns'.format(counts_matrix.shape[0], counts_matrix.shape[1]))
print('Number of genes in gene list: {}'.format(len(genes)))

#Counts matrix shape: 12865 rows, 32738 columns
#Number of genes in gene list: 32738

scrub = scr.Scrublet(counts_matrix, expected_doublet_rate=0.06)

doublet_scores, predicted_doublets = scrub.scrub_doublets(min_counts=2,
                                                          min_cells=3,
                                                          min_gene_variability_pctl=85,
                                                          n_prin_comps=30,
                                                          get_doublet_neighbor_parents=True)

scrub.plot_histogram()
plt.savefig('/share/ScratchGeneral/briglo/scRNA/venchi/plt.png')
scrub.call_doublets(threshold=0.24)
scrub.plot_histogram()
plt.savefig('/share/ScratchGeneral/briglo/scRNA/venchi/plt.png')

print('Running UMAP...')
# scrub.set_embedding('UMAP', scr.get_umap(scrub.manifold_obs_, 10, min_dist=0.3))
Example #22
import scrublet as rc
import matplotlib.pyplot as plt
import scipy.io
from scipy.sparse import csc_matrix
import numpy as np

wd = "/restricted/projectnb/camplab/home/syyang/contamination/data/pbmc/4k/"

counts = scipy.io.mmread(wd + 'data/matrix.mtx')

# keep genes with a count above 2 in more than 2 cells
# (mmread returns a COO matrix, so convert before comparing)
geneIndex = ((counts.tocsr() > 2).sum(axis=1) > 2)
counts_filter = counts.toarray()[np.array(geneIndex).reshape(-1), :]

print(counts_filter.shape)

counts_csc = csc_matrix(counts_filter.T)
genes = np.array(rc.load_genes(wd + 'data/genes.tsv', delimiter='\t', column=1))[np.array(geneIndex).reshape(-1)]

scrub = rc.Scrublet(counts_csc, expected_doublet_rate=0.06)

doublet_scores, predicted_doublets = scrub.scrub_doublets(min_counts=2, min_cells=3, min_gene_variability_pctl=85, n_prin_comps=30)

np.save("doublet_scores.npy", doublet_scores)
np.save("predicted_doublets.npy", predicted_doublets * 1)
Example #23
data = libraries['SIGAG4'].concatenate([libraries['SIGAH4']])

data.obs['organ'] = 'PB'

# Predict and remove putative doublets

scrub = {}
doublet_scores = {}
predicted_doublets = {}

# expected multiplet rate
emr = {'SIGAG4': 0.076, 'SIGAH4': 0.076}

for sample in samples:
    print(sample)
    scrub[sample] = scr.Scrublet(
        data[np.array(data.obs['library'] == sample), :].X)
    doublet_scores[sample], predicted_doublets[sample] = scrub[
        sample].scrub_doublets()
    print('\n\n')

for sample in samples:
    print(sample, ':', sum(predicted_doublets[sample]))

sample = 'SIGAG4'
scrub[sample].plot_histogram()

sample = 'SIGAH4'
scrub[sample].plot_histogram()

data_doublets = os.path.join(sc.settings.writedir, '..', 'doublets')
if not os.path.exists(data_doublets):
Example #24
		raise ArgumentError("You need to supply a working directory, a sample metadata file and a genome build!")

	# return argument values
	return wd, sampleID, genome_builds

	
working_dir, sampleID, genomes = parse_arguments(sys.argv)
		
## Perform doublet detection for each sample sequentially
for genome in genomes.split(','):
	if os.path.isfile(working_dir+'/count/'+sampleID+'/outs/filtered_feature_bc_matrix/matrix.mtx.gz'):
		matrix_path = working_dir+'/count/'+sampleID+'/outs/filtered_feature_bc_matrix/matrix.mtx.gz'
		
		raw_counts = mmread(matrix_path).T.tocsc()

		scrub = scr.Scrublet(raw_counts, expected_doublet_rate=0.06)
		doublet_scores, predicted_doublets = scrub.scrub_doublets()
			
		output_dir=working_dir+'/count/'+sampleID+'/outs/analysis/doubletdetection'
		if not os.path.isdir(output_dir):
			os.makedirs(output_dir)

		output_doublets = open(output_dir+'/'+sampleID+'_'+genome+'_scrublet_doublets.txt','w')
			
		if os.path.isfile(working_dir+'/count/'+sampleID+'/outs/filtered_feature_bc_matrix/barcodes.tsv.gz'):
			barcode_path = working_dir+'/count/'+sampleID+'/outs/filtered_feature_bc_matrix/barcodes.tsv.gz'
				
			barcodesList=list()
			for line in gzip.open(barcode_path, 'rb'):
				barcodesList.append(line.rstrip())
					
Example #25
def scrublet_simulate_doublets(
    adata: AnnData,
    layer=None,
    sim_doublet_ratio: float = 2.0,
    synthetic_doublet_umi_subsampling: float = 1.0,
    random_seed: int = 0,
) -> AnnData:
    """\
    Simulate doublets by adding the counts of random observed transcriptome pairs.

    Parameters
    ----------
    adata
        The annotated data matrix of shape ``n_obs`` × ``n_vars``. Rows
        correspond to cells and columns to genes. Genes should have been
        filtered for expression and variability, and the object should contain
        raw expression of the same dimensions.
    layer
        Layer of adata where raw values are stored, or 'X' if values are in .X.
    sim_doublet_ratio
        Number of doublets to simulate relative to the number of observed
        transcriptomes. If `None`, self.sim_doublet_ratio is used.
    synthetic_doublet_umi_subsampling
        Rate for sampling UMIs when creating synthetic doublets. If 1.0,
        each doublet is created by simply adding the UMIs from two randomly
        sampled observed transcriptomes. For values less than 1, the
        UMI counts are added and then randomly sampled at the specified
        rate.

    Returns
    -------
    adata : anndata.AnnData with simulated doublets in .X
        Adds fields to ``adata``:

        ``.obsm['scrublet']['doublet_parents']``
            Pairs of ``.obs_names`` used to generate each simulated doublet transcriptome

        ``.uns['scrublet']['parameters']``
            Dictionary of Scrublet parameters

    See also
    --------
    :func:`~scanpy.external.pp.scrublet`: Main way of running Scrublet, runs
        preprocessing, doublet simulation (this function) and calling.
    :func:`~scanpy.external.pl.scrublet_score_distribution`: Plot histogram of doublet
        scores for observed transcriptomes and simulated doublets.
    """
    try:
        import scrublet as sl
    except ImportError:
        raise ImportError(
            'Please install scrublet: `pip install scrublet` or `conda install scrublet`.'
        )

    X = _get_obs_rep(adata, layer=layer)
    scrub = sl.Scrublet(X)

    scrub.simulate_doublets(
        sim_doublet_ratio=sim_doublet_ratio,
        synthetic_doublet_umi_subsampling=synthetic_doublet_umi_subsampling,
    )

    adata_sim = AnnData(scrub._E_sim)
    adata_sim.obs['n_counts'] = scrub._total_counts_sim
    adata_sim.obsm['doublet_parents'] = scrub.doublet_parents_
    adata_sim.uns['scrublet'] = {
        'parameters': {
            'sim_doublet_ratio': sim_doublet_ratio
        }
    }
    return adata_sim
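As the "See also" section notes, this simulation step is normally driven through the public wrapper. A minimal end-to-end sketch using that wrapper (assumes a scanpy version shipping the external Scrublet interface; pbmc3k() downloads a public dataset on first use):

import scanpy as sc

adata = sc.datasets.pbmc3k()            # raw counts in .X
sc.external.pp.scrublet(adata, expected_doublet_rate=0.06)
print(adata.obs['predicted_doublet'].sum(), 'predicted doublets')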
Example #26
def _scrublet_call_doublets(
    adata_obs: AnnData,
    adata_sim: AnnData,
    n_neighbors: Optional[int] = None,
    expected_doublet_rate: float = 0.05,
    stdev_doublet_rate: float = 0.02,
    mean_center: bool = True,
    normalize_variance: bool = True,
    n_prin_comps: int = 30,
    use_approx_neighbors: bool = True,
    knn_dist_metric: str = 'euclidean',
    get_doublet_neighbor_parents: bool = False,
    threshold: Optional[float] = None,
    random_state: int = 0,
    verbose: bool = True,
) -> AnnData:
    """\
    Core function for predicting doublets using Scrublet [Wolock19]_.

    Predict cell doublets using a nearest-neighbor classifier of observed
    transcriptomes and simulated doublets. This is a wrapper around the core
    functions of `Scrublet <https://github.com/swolock/scrublet>`__ to allow
    for flexibility in applying Scanpy filtering operations upstream. Unless
    you know what you're doing you should use the main scrublet() function.

    .. note::
        More information and bug reports `here
        <https://github.com/swolock/scrublet>`__.

    Parameters
    ----------
    adata_obs
        The annotated data matrix of shape ``n_obs`` × ``n_vars``. Rows
        correspond to cells and columns to genes. Should be normalised with
        scanpy.pp.normalize_total() and filtered to include only highly
        variable genes.
    adata_sim
        Anndata object generated by
        sc.external.pp.scrublet_simulate_doublets(), with same number of vars
        as adata_obs. This should have been built from adata_obs after
        filtering genes and cells and selecting highly-variable genes.
    n_neighbors
        Number of neighbors used to construct the KNN graph of observed
        transcriptomes and simulated doublets. If ``None``, this is
        automatically set to ``np.round(0.5 * np.sqrt(n_obs))``.
    expected_doublet_rate
        The estimated doublet rate for the experiment.
    stdev_doublet_rate
        Uncertainty in the expected doublet rate.
    mean_center
        If True, center the data such that each gene has a mean of 0.
        `sklearn.decomposition.PCA` will be used for dimensionality
        reduction.
    normalize_variance
        If True, normalize the data such that each gene has a variance of 1.
        `sklearn.decomposition.TruncatedSVD` will be used for dimensionality
        reduction, unless `mean_center` is True.
    n_prin_comps
        Number of principal components used to embed the transcriptomes prior
        to k-nearest-neighbor graph construction.
    use_approx_neighbors
        Use approximate nearest neighbor method (annoy) for the KNN
        classifier.
    knn_dist_metric
        Distance metric used when finding nearest neighbors. For list of
        valid values, see the documentation for annoy (if `use_approx_neighbors`
        is True) or sklearn.neighbors.NearestNeighbors (if `use_approx_neighbors`
        is False).
    get_doublet_neighbor_parents
        If True, return the parent transcriptomes that generated the
        doublet neighbors of each observed transcriptome. This information can
        be used to infer the cell states that generated a given
        doublet state.
    threshold
        Doublet score threshold for calling a transcriptome a doublet. If
        `None`, this is set automatically by looking for the minimum between
        the two modes of the `doublet_scores_sim_` histogram. It is best
        practice to check the threshold visually using the
        `doublet_scores_sim_` histogram and/or based on co-localization of
        predicted doublets in a 2-D embedding.
    random_state
        Initial state for doublet simulation and nearest neighbors.
    verbose
        If True, print progress updates.

    Returns
    -------
    adata : anndata.AnnData
        The input ``adata_obs`` with fields added:

        ``.obs['doublet_score']``
            Doublet scores for each observed transcriptome

        ``.obs['predicted_doublets']``
            Boolean indicating predicted doublet status

        ``.uns['scrublet']['doublet_scores_sim']``
            Doublet scores for each simulated doublet transcriptome

        ``.uns['scrublet']['doublet_parents']``
            Pairs of ``.obs_names`` used to generate each simulated doublet transcriptome

        ``.uns['scrublet']['parameters']``
            Dictionary of Scrublet parameters
    """
    try:
        import scrublet as sl
    except ImportError:
        raise ImportError(
            'Please install scrublet: `pip install scrublet` or `conda install scrublet`.'
        )

    # Estimate n_neighbors if not provided, and create scrublet object.

    if n_neighbors is None:
        n_neighbors = int(round(0.5 * np.sqrt(adata_obs.shape[0])))

    # Note: Scrublet() will sparsify adata_obs.X if it isn't sparse already,
    # but that matrix won't get used if we pre-set the normalised slots.

    scrub = sl.Scrublet(
        adata_obs.X,
        n_neighbors=n_neighbors,
        expected_doublet_rate=expected_doublet_rate,
        stdev_doublet_rate=stdev_doublet_rate,
        random_state=random_state,
    )

    # Ensure normalised matrix sparseness as Scrublet does
    # https://github.com/swolock/scrublet/blob/67f8ecbad14e8e1aa9c89b43dac6638cebe38640/src/scrublet/scrublet.py#L100

    scrub._E_obs_norm = sparse.csc_matrix(adata_obs.X)
    scrub._E_sim_norm = sparse.csc_matrix(adata_sim.X)

    scrub.doublet_parents_ = adata_sim.obsm['doublet_parents']

    # Call scrublet-specific preprocessing where specified

    if mean_center and normalize_variance:
        sl.pipeline_zscore(scrub)
    elif mean_center:
        sl.pipeline_mean_center(scrub)
    elif normalize_variance:
        sl.pipeline_normalize_variance(scrub)

    # Do PCA. Scrublet fits to the observed matrix and decomposes both observed
    # and simulated based on that fit, so we'll just let it do its thing rather
    # than trying to use Scanpy's PCA wrapper of the same functions.

    if mean_center:
        logg.info('Embedding transcriptomes using PCA...')
        sl.pipeline_pca(scrub,
                        n_prin_comps=n_prin_comps,
                        random_state=scrub.random_state)
    else:
        logg.info('Embedding transcriptomes using Truncated SVD...')
        sl.pipeline_truncated_svd(scrub,
                                  n_prin_comps=n_prin_comps,
                                  random_state=scrub.random_state)

    # Score the doublets

    scrub.calculate_doublet_scores(
        use_approx_neighbors=use_approx_neighbors,
        distance_metric=knn_dist_metric,
        get_doublet_neighbor_parents=get_doublet_neighbor_parents,
    )

    # Actually call doublets

    scrub.call_doublets(threshold=threshold, verbose=verbose)

    # Store results in AnnData for return

    adata_obs.obs['doublet_score'] = scrub.doublet_scores_obs_

    # Store doublet Scrublet metadata

    adata_obs.uns['scrublet'] = {
        'doublet_scores_sim': scrub.doublet_scores_sim_,
        'doublet_parents': adata_sim.obsm['doublet_parents'],
        'parameters': {
            'expected_doublet_rate':
            expected_doublet_rate,
            'sim_doublet_ratio':
            (adata_sim.uns.get('scrublet',
                               {}).get('parameters',
                                       {}).get('sim_doublet_ratio', None)),
            'n_neighbors':
            n_neighbors,
            'random_state':
            random_state,
        },
    }

    # If threshold hasn't been located successfully then we couldn't make any
    # predictions. The user will get a warning from Scrublet, but we need to
    # set the boolean so that any downstream filtering on
    # predicted_doublet=False doesn't incorrectly filter cells. The user can
    # still use this object to generate the plot and derive a threshold
    # manually.

    if hasattr(scrub, 'threshold_'):
        adata_obs.uns['scrublet']['threshold'] = scrub.threshold_
        adata_obs.obs['predicted_doublet'] = scrub.predicted_doublets_
    else:
        adata_obs.obs['predicted_doublet'] = False

    if get_doublet_neighbor_parents:
        adata_obs.uns['scrublet'][
            'doublet_neighbor_parents'] = scrub.doublet_neighbor_parents_

    return adata_obs
Example #27
parser.add_argument('-i', '--input', help='raw 10X file directory for input', type=str)
parser.add_argument('-o', '--output', help='output directory', type=str, default="./")
parser.add_argument('-n', '--name', help='name of output files', type=str, default="name")
parser.add_argument('-r', '--doublet', help='expected doublet rate, default=0.06', type=float, default=0.06)
parser.add_argument('-e', '--embed', help='plot UMAP and TSNE if set', action='store_true', default=False)
args = parser.parse_args()

#load counts matrix, genes, barcodes
print("Loading counts matrix %s" % args.input + '/matrix.mtx', file=sys.stderr)
counts_matrix = scipy.io.mmread(args.input + '/matrix.mtx').T.tocsc()
print("Loading barcodes %s" % args.input + '/barcodes.tsv', file=sys.stderr)
barcodes = np.array(scr.load_genes(args.input + '/barcodes.tsv', delimiter='\t', column=0))

#initialize scrublet object
print("Initializing scrublet object", file=sys.stderr)
scrub = scr.Scrublet(counts_matrix, expected_doublet_rate=args.doublet) #whole counts matrix

print("Computing doublet predictions", file=sys.stderr)
doublet_scores, predicted_doublets = scrub.scrub_doublets(min_counts=2, 
                                                          min_cells=3, 
                                                          min_gene_variability_pctl=85, 
                                                          n_prin_comps=30)

#write scrublet output to file:
print("Writing doublet predictions to %s" % args.output + "/" + args.name + "_predicted_doublets.tsv", file=sys.stderr)
with open(args.output + "/" + args.name + "_predicted_doublets.tsv", 'w') as outfile:
	outfile.write("\t".join(["barcode", "doublet_score", "doublet_prediction"])+"\n")
	for barcode, score, prediction in zip(barcodes, doublet_scores, predicted_doublets):
		if prediction == False:
			doublet = "0"
		else:
			doublet = "1"
		outfile.write("\t".join([barcode, str(score), doublet]) + "\n")
Example #28
data.obs['organ'] = 'PB'

# Predict and remove putative doublets

scrub = {}
doublet_scores = {}
predicted_doublets = {}

# expected multiplet rate
emr = {'SIGAE2': 0.069, 'SIGAF2': 0.076, 'SIGAG2': 0.076}

for sample in samples:
    print(sample)
    scrub[sample] = scr.Scrublet(
        data[np.array(data.obs['library'] == sample), :].X,
        expected_doublet_rate=emr[sample])
    doublet_scores[sample], predicted_doublets[sample] = scrub[
        sample].scrub_doublets()
    print('\n\n')

for sample in samples:
    print(sample, ':', sum(predicted_doublets[sample]))

sample = 'SIGAE2'
scrub[sample].plot_histogram()
predicted_doublets[sample] = scrub[sample].call_doublets(threshold=0.3)
sum(predicted_doublets[sample])

sample = 'SIGAF2'
scrub[sample].plot_histogram()
Example #29
################################################################################
# Processing...

if args.use_variable_features:
    print("Subsetting the variable features from the counts matrix...")
    if args.h5ad_with_variable_features_info is None:
        raise Exception("VSN ERROR: Expecting --h5ad-with-variable-features-info argument to be set since --use-variable-features argument is set to True.")

    FILE_PATH_H5AD_WITH_HVG_INFO = args.h5ad_with_variable_features_info
    adata_hvg = sc.read_h5ad(filename=FILE_PATH_H5AD_WITH_HVG_INFO.name)
    counts_matrix = adata_raw.X[:, np.array(adata_hvg.var['highly_variable'])]
else:
    counts_matrix = adata_raw.X

scrub = scr.Scrublet(counts_matrix)
adata_raw.obs['doublet_scores'], adata_raw.obs['predicted_doublets'] = scrub.scrub_doublets(
    synthetic_doublet_umi_subsampling=args.synthetic_doublet_umi_subsampling,
    use_approx_neighbors=True,
    distance_metric='euclidean',
    get_doublet_neighbor_parents=False,
    min_counts=args.min_counts,
    min_cells=args.min_cells,
    min_gene_variability_pctl=args.min_gene_variability_pctl,
    log_transform=args.log_transform,
    mean_center=args.mean_center,
    normalize_variance=args.normalize_variance,
    n_prin_comps=args.n_prin_comps,
    verbose=True
)
# Rename the columns
Example #30
plt.rcParams['font.family'] = 'sans-serif'
plt.rcParams['font.sans-serif'] = 'Arial'
plt.rc('font', size=14)
plt.rcParams['pdf.fonttype'] = 42

## Basic run with scrublet
input_dir = os.path.join(sys.argv[1])
counts_matrix = scipy.io.mmread(input_dir + 'matrix.mtx').T.tocsc()
genes = np.array(
    scr.load_genes(input_dir + 'genes.tsv', delimiter='\t',
                   column=1))  # Use with the raw data
print('Counts matrix shape: {} rows, {} columns'.format(
    counts_matrix.shape[0], counts_matrix.shape[1]))
print('Number of genes in gene list: {}'.format(len(genes)))
scrub = scr.Scrublet(counts_matrix,
                     expected_doublet_rate=0.15,
                     sim_doublet_ratio=2)
doublet_scores, predicted_doublets = scrub.scrub_doublets(
    min_counts=2,
    min_cells=150,
    min_gene_variability_pctl=var_number,
    n_prin_comps=30)

scrub.call_doublets(threshold=0.40)

outdir = sys.argv[2]
scrub.plot_histogram()
plt.savefig(os.path.join(outdir, 'figure1.png'))
print('Running UMAP...')
scrub.set_embedding('UMAP', scr.get_umap(scrub.manifold_obs_, 10,
                                         min_dist=0.3))