コード例 #1
0
ファイル: hdt.py プロジェクト: Xiaojieqiu/FastProject
def HDT_Sig_batch(xpdf_matrix, nboot, progressbar=True):
    """
    Run the dip test (``DipTest``) on every row of a matrix.

    Saves time when calculating the dip test on many samples of the same
    size: the background (null) distribution used for significance is only
    generated once, from ``nboot`` sorted uniform random samples whose
    length equals the number of columns of ``xpdf_matrix``.

    :param xpdf_matrix: 2-dimensional numpy ndarray, one sample per row
    :param nboot: number of times to generate a random test statistic for p-value calculation
    :param progressbar: if True, report progress through a ``ProgressBar``
    :return: dips: dip statistic for each row in xpdf_matrix, as an (N, 1) column
                ps: p-value for each row in xpdf_matrix (fraction of the
                    background dips strictly greater than the row's dip)
                xlows: xlow for each row in xpdf_matrix
                xups: xup for each row in xpdf_matrix
    """

    n_rows = xpdf_matrix.shape[0]
    n_cols = xpdf_matrix.shape[1]

    dips = np.zeros(n_rows)
    xlows = np.zeros(n_rows)
    xups = np.zeros(n_rows)

    # One tick per tested row plus one per background draw.
    pbar = ProgressBar(n_rows + nboot) if progressbar else None

    # `range` (not `xrange`) so the loop runs on both Python 2 and Python 3.
    for i in range(n_rows):
        # DipTest returns more values; only the first three are used here.
        dips[i], xlows[i], xups[i] = DipTest(xpdf_matrix[i, :])[:3]
        if pbar is not None:
            pbar.update()

    # Background distribution of the dip statistic under uniform samples.
    bootDip = np.zeros(nboot)
    for i in range(nboot):
        unifpdf = np.sort(np.random.rand(n_cols))
        bootDip[i] = DipTest(unifpdf)[0]
        if pbar is not None:
            pbar.update()

    dips = np.expand_dims(dips, axis=1)        # Make dips Nx1
    bootDip = np.expand_dims(bootDip, axis=0)  # Make bootDip 1xnboot

    # Broadcasting compares every row's dip against every background dip.
    ps = np.sum(dips < bootDip, axis=1) / float(nboot)

    if pbar is not None:
        pbar.complete()

    return (dips, ps, xlows, xups)
コード例 #2
0
ファイル: em.py プロジェクト: Xiaojieqiu/FastProject
def em_exp_norm_mixture(zmat, cutoff, progressbar=True):
    """
    Fit a two-component mixture to each row of ``zmat`` with EM.

    The "low" component is an exponential density with mean ``mu_l`` and the
    "high" component is a normal density with mean ``mu_h`` and standard
    deviation ``st_h``.  Responsibilities are initialised by thresholding
    each row at its ``cutoff``.  At most 150 iterations are run; convergence
    (largest change in ``gamma`` below 0.01) is checked every 10th iteration.

    :param zmat: 1-d (single gene) or 2-d array of values, one gene per row
    :param cutoff: initial threshold per row (a scalar when ``zmat`` is 1-d,
                   otherwise an array with one entry per row)
    :param progressbar: if True, report progress through a ``ProgressBar``
    :return: tuple ``(gamma, mu_l, mu_h, st_l, st_h, Pi, L)`` where
             gamma is the posterior weight of the high component for every
             entry of ``zmat``; mu_l / mu_h / st_l / st_h are the outputs of
             ``weighted_mean`` / ``weighted_std`` for the two components;
             Pi is the per-row mean of gamma, shaped (rows, 1); and L is the
             per-row sum of ``gamma*log(p_high) + (1-gamma)*log(p_low)``
             using the (floored) densities of the last E step.
    """
    # nans and infs are handled explicitly, don't need warning
    with np.errstate(divide="ignore", invalid="ignore"):
        max_iter = 150
        density_floor = 1e-5

        pbar = ProgressBar(max_iter) if progressbar else None

        # promote to 2d if single gene given
        if zmat.ndim == 1:
            zmat = zmat.reshape((1, zmat.shape[0]))
            cutoff = np.array([cutoff])

        n_rows, n_cols = zmat.shape[0], zmat.shape[1]

        # Column vector of cutoffs; broadcasting compares it with every column.
        cutoff = cutoff.reshape((cutoff.shape[0], 1))
        gamma = zmat > cutoff
        Pi = np.mean(gamma, axis=1).reshape((n_rows, 1))
        # Float division: under Python 2, `1 / n_cols` is integer division
        # and would leave the zero priors at zero.
        Pi[Pi == 0] = 1.0 / n_cols

        mu_l = weighted_mean(zmat, 1 - gamma)
        mu_h = weighted_mean(zmat, gamma)
        st_h = weighted_std(zmat, gamma, mu_h)

        # `range` (not `xrange`) so the loop runs on both Python 2 and 3.
        for niter in range(max_iter):
            # E step: component densities, then posterior of the high component
            prev_gamma = gamma
            p_low = np.exp(-1 * zmat / mu_l) / mu_l

            p_high = np.exp(-1 * (zmat - mu_h) ** 2 / (2 * st_h ** 2)) / st_h / np.sqrt(2 * np.pi)

            # Replace non-finite values and floor both densities so the
            # posterior and the log-likelihood below stay finite.
            p_low[~np.isfinite(p_low)] = density_floor
            p_low[p_low < density_floor] = density_floor

            p_high[~np.isfinite(p_high)] = density_floor
            p_high[p_high < density_floor] = density_floor

            gamma = (Pi * p_high) / (((1 - Pi) * p_low) + (Pi * p_high))

            # M step: re-estimate mixing proportion and component parameters
            Pi = np.mean(gamma, axis=1).reshape((n_rows, 1))
            mu_l = weighted_mean(zmat, 1 - gamma)
            mu_h = weighted_mean(zmat, gamma)
            st_h = weighted_std(zmat, gamma, mu_h)

            if pbar is not None:
                pbar.update()

            # Only test for convergence every 10th iteration.
            if niter % 10 == 0:
                biggest_change = np.max(np.abs(gamma - prev_gamma))
                if biggest_change < 0.01:
                    break

        if pbar is not None:
            pbar.complete()

        L = np.sum(gamma * np.log(p_high) + (1 - gamma) * np.log(p_low), axis=1)
        st_l = weighted_std(zmat, 1 - gamma, mu_l)
        return (gamma, mu_l, mu_h, st_l, st_h, Pi, L)
コード例 #3
0
ファイル: Pipelines.py プロジェクト: Xiaojieqiu/FastProject
def _ensure_directory(path):
    """Create `path` (with parents) unless it already exists as a directory."""
    try:
        os.makedirs(path)
    except OSError:
        # Only swallow the error when the directory is already there;
        # anything else (e.g. permissions) is a real failure.
        if not os.path.isdir(path):
            raise


def _project_and_save(source, filter_name, genes, sig_scores, subsample_size,
                      filter_dir, file_suffix, is_pca):
    """Project `source`, cluster, score signatures, write result files; return (JS record, pcdata)."""
    #%% Dimensional Reduction procedures
    print()
    print("Projecting data into 2 dimensions")
    projections, pcdata = Projections.generate_projections(source, filter_name, subsample_size)

    #Evaluate Clusters
    print("Evaluating Clusters...")
    clusters = Projections.define_clusters(projections)

    #%% Evaluating signatures against projections
    sp_row_labels, sp_col_labels, sig_proj_matrix, sig_proj_matrix_p = \
        Signatures.sigs_vs_projections_v2(projections, sig_scores, subsample_size=subsample_size)

    #Save Projections
    FileIO.write_projection_file(
        os.path.join(filter_dir, 'Projections' + file_suffix + '.txt'),
        source.col_labels, projections)

    #Save Clusters
    FileIO.write_cluster_file(
        os.path.join(filter_dir, 'Clusters' + file_suffix + '.txt'),
        source.col_labels, clusters)

    #Output matrix of p-values for conformity scores
    FileIO.write_matrix(
        os.path.join(filter_dir, "DissimilarityMatrix" + file_suffix + ".txt"),
        sig_proj_matrix, sp_row_labels, sp_col_labels)
    FileIO.write_matrix(
        os.path.join(filter_dir, "PMatrix" + file_suffix + ".txt"),
        sig_proj_matrix_p, sp_row_labels, sp_col_labels)

    #Record handed to the JS viewer for this (filter, pca) combination
    js_filt_dict = dict()
    js_filt_dict.update({'filter': filter_name})
    js_filt_dict.update({'genes': genes})
    js_filt_dict.update({'pca': is_pca})
    js_filt_dict.update({'projections': projections})
    js_filt_dict.update({'sigProjMatrix': sig_proj_matrix})
    js_filt_dict.update({'sigProjMatrix_p': sig_proj_matrix_p})
    js_filt_dict.update({'projectionKeys': sp_col_labels})
    js_filt_dict.update({'signatureKeys': sp_row_labels})
    js_filt_dict.update({'clusters': clusters})

    return js_filt_dict, pcdata


def FullOutput(options, args):
    """
    Run the complete analysis pipeline and write all outputs to a directory.

    Steps, in order: read the expression matrix, load signatures, filter
    genes, fit the probability model and correct for false negatives,
    optionally drop samples failing quality control, then for each model
    ('Expression', 'Probability') and each gene filter generate projections
    (on the data and on its filtered principal components), clusters and
    signature-vs-projection matrices.  Results are written as text files
    per model/filter and collected in ``FP_data.jsdata`` for the HTML viewer.

    :param options: parsed options object; the attributes read here are
                    ``housekeeping``, ``signatures``, ``precomputed``,
                    ``output``, ``nofilter``, ``qc``, ``subsample_size``
                    and ``pca_filter``.  ``subsample_size`` is reset to
                    None on this object when it exceeds the sample count.
    :param args: positional arguments; ``args[0]`` is the data file path
    :raises ValueError: if the data file is missing or not given, if a
                        signature / precomputed file does not exist, or if
                        neither kind of signature file was specified
    """
    start_time = time.time()

    #If no housekeeping file is specified, '' is passed on (default list)
    housekeeping_filename = options.housekeeping if options.housekeeping else ''

    #%% Read expression data from file
    if len(args) > 0:
        filename = args[0]
    else:
        raise ValueError("Argument Error:  data_file not specified.\nExiting...")

    if not os.path.isfile(filename):
        raise ValueError("Argument Error: data file not found.\nExiting...")

    (edata, genes, cells) = FileIO.read_matrix(filename)

    print("Imported ", edata.shape[0], " genes across ", edata.shape[1], " samples")

    #%% Load Signature file
    sigs = []
    if options.signatures:
        sig_file = options.signatures
        if not os.path.isfile(sig_file):
            raise ValueError("Option Error: signature file " + sig_file + " not found.\nExiting...")

        sigs = Signatures.read_signatures(sig_file)
    if options.precomputed:
        precomputed_sig_file = options.precomputed
        if not os.path.isfile(precomputed_sig_file):
            raise ValueError("Option Error: precomputed signature file " + precomputed_sig_file + " not found.\nExiting...")

    if not options.signatures and not options.precomputed:  #Need one or the other here
        raise ValueError("Option Error: Must specify either a signature file or a pre-computed signature file.\nExiting...")

    #Wrap data in ExpressionData object
    edata = ExpressionData(edata, genes, cells)

    #Hold on to originals so we don't lose data after filtering in case it's needed later
    original_data = edata.copy()

    #Create directory for all outputs
    if options.output:
        dir_name = options.output
    else:
        #Use the default name, appending 1, 2, ... until an unused one is found
        default_dir_name = 'FastProject_Output'
        dir_name = default_dir_name
        i = 1
        while os.path.isdir(dir_name):
            dir_name = default_dir_name + str(i)
            i = i + 1

    if not os.path.isdir(dir_name):
        os.makedirs(dir_name)

    #Filtering
    filter_dict = {}
    if options.nofilter:
        edata = Filters.filter_genes_novar(edata)

        filter_dict.update({'None': set()})
    else:
        edata = Filters.filter_genes_threshold(edata, 0.2)

        #HDT Filtering
        print("Removing genes with unimodal distribution across samples using Hartigans DT...")
        hdt_mask = Filters.filter_genes_hdt(edata, 0.05)
        #Fano Filtering
        print("Applying Fano-Filtering...")
        fano_mask = Filters.filter_genes_fano(edata, 2)

        filter_dict.update({
            'None': set(edata.row_labels),  #None means 'use all genes'. This set only used when outputting filter
            'HDT': set([edata.row_labels[i] for i, x in enumerate(hdt_mask) if x]),
            'Fano': set([edata.row_labels[i] for i, x in enumerate(fano_mask) if x])
        })

    edata.filters = filter_dict

    #%% Probability transform
    print()
    print('Fitting expression data to exp/norm mixture model')
    (pdata, mu_h) = Transforms.probability_of_expression(edata)

    print()
    print('Correcting for false-negatives using housekeeping gene levels')
    (fit_func, params) = Transforms.create_false_neg_map(original_data, housekeeping_filename)
    (pdata, fn_prob) = Transforms.correct_for_fn(pdata, mu_h, fit_func, params)
    fn_prob[edata > 0] = 0

    pdata = ProbabilityData(pdata, edata)

    edata.weights = 1 - fn_prob
    pdata.weights = 1 - fn_prob

    sample_passes, sample_scores = Transforms.quality_check(params)

    if options.qc:
        pdata = pdata.subset_samples(sample_passes)
        edata = edata.subset_samples(sample_passes)
        sample_scores = sample_scores[sample_passes]

    #Guard against None: comparing None with an int raises on Python 3
    if options.subsample_size is not None and options.subsample_size > edata.shape[1]:
        options.subsample_size = None

    Transforms.z_normalize(edata)

    model_names = ['Expression', 'Probability']
    model_data = [edata, pdata]

    js_models = []

    for name, data in zip(model_names, model_data):
        model_dir = os.path.join(dir_name, name)
        _ensure_directory(model_dir)

        print()
        print('Model: ', name)

        #Evaluate Signatures
        print()
        print("Evaluating signature scores on samples...")

        sig_scores = dict()

        pbar = ProgressBar(len(sigs))
        for sig in sigs:
            try:
                sig_scores[sig.name] = data.eval_signature(sig)
            except ValueError:  #Only thrown when the signature has no genes in the data
                pass  #Just discard the signature then
            pbar.update()
        pbar.complete()

        if options.precomputed:
            precomputed_sig_scores = Signatures.load_precomputed(options.precomputed, data.col_labels)
            sig_scores.update(precomputed_sig_scores)

        #Save signature scores
        out_file = 'SignatureScores.txt'
        FileIO.write_signature_scores(os.path.join(model_dir, out_file), sig_scores, data.col_labels)

        #Save data to js model as well
        js_model_dict = {'model': name}
        js_model_dict.update({'signatureScores': sig_scores})
        js_model_dict.update({'sampleLabels': data.col_labels})
        js_model_dict.update({'projectionData': []})
        js_models.append(js_model_dict)

        for filter_name in filter_dict:
            if filter_name == "None":
                filter_dir = os.path.join(model_dir, "No_Filter")
            else:
                filter_dir = os.path.join(model_dir, filter_name + "_Filter")
            _ensure_directory(filter_dir)

            print()
            print("Filter-Level:", filter_name)

            #First pass: project the data itself
            js_filt_dict, pcdata = _project_and_save(
                data, filter_name, data.filters[filter_name], sig_scores,
                options.subsample_size, filter_dir, '', False)

            #Output genes used in filter
            FileIO.write_filter_file(os.path.join(filter_dir, 'ProjectedGenes.txt'), data.filters[filter_name])

            js_model_dict['projectionData'].append(js_filt_dict)

            #Now do it all again using the principal component data
            if options.pca_filter:
                pcdata = Projections.filter_PCA(pcdata, scores=sample_scores, variance_proportion=0.25)
            else:
                pcdata = Projections.filter_PCA(pcdata, variance_proportion=0.25, min_components=30)

            js_filt_dict, _ = _project_and_save(
                pcdata, filter_name, data.filters[filter_name], sig_scores,
                options.subsample_size, filter_dir, '-PC', True)

            js_model_dict['projectionData'].append(js_filt_dict)

    #Assemble signatures into an object for the JS file
    sig_dict = {}
    for sig in sigs:
        #list(): dict views cannot be indexed (or turned into a 1-d array) on Python 3
        sig_genes = list(sig.sig_dict.keys())
        sig_values = list(sig.sig_dict.values())
        sort_i = np.array(sig_values).argsort()[::-1]  #Put positive signatures first
        sig_genes = [sig_genes[i] for i in sort_i]
        sig_values = [sig_values[i] for i in sort_i]
        sig_dict.update({sig.name: {'Genes': sig_genes, 'Signs': sig_values}})

    #Cluster genes so the expression matrix is written in clustered row order
    from scipy.cluster.hierarchy import leaves_list, linkage
    linkage_matrix = linkage(edata)
    leaves_i = leaves_list(linkage_matrix)
    edata_clustered = edata[leaves_i, :]
    edata_clustered.row_labels = [edata.row_labels[i] for i in leaves_i]

    data_json = dict({
        'data': edata_clustered,
        'gene_labels': edata_clustered.row_labels,
        'sample_labels': edata_clustered.col_labels,
    })

    #Write everything for the viewer; `with` closes the file even on error
    with open(dir_name + os.sep + "FP_data.jsdata", 'w') as fout_js:
        fout_js.write(HtmlViewer.toJS_variable("FP_Models", js_models))
        fout_js.write(HtmlViewer.toJS_variable("FP_Signatures", sig_dict))
        fout_js.write(HtmlViewer.toJS_variable("FP_ExpressionMatrix", data_json))

    HtmlViewer.copy_html_files(dir_name)
    print()
    print("FastProject Analysis Complete")
    elapsed_time = time.time() - start_time
    print("Elapsed Time {:.2f} seconds".format(elapsed_time))