def HDT_Sig_batch(xpdf_matrix, nboot, progressbar=True):
    """
    Run the dip test on every row of a matrix, sharing one null distribution.

    Saves time when calculating the dip test on many samples of the same
    size: the background distribution used for significance is generated
    only once and reused for every row.

    :param xpdf_matrix: 2-dimensional numpy ndarray; DipTest is applied to
        each row as given (rows are not sorted here)
    :param nboot: number of times to generate a random test statistic for
        p-value calculation
    :param progressbar: if True, report progress through a ProgressBar sized
        to (number of rows + nboot)
    :return:
        dips: dip statistic for each row in xpdf_matrix, as an N x 1 array
        ps: p-value for each row in xpdf_matrix (fraction of bootstrap dips
            strictly greater than the row's dip)
        xlows: xlow for each row in xpdf_matrix
        xups: xup for each row in xpdf_matrix
    """
    n_rows = xpdf_matrix.shape[0]

    dips = np.zeros(n_rows)
    xlows = np.zeros(n_rows)
    xups = np.zeros(n_rows)

    pbar = ProgressBar(n_rows + nboot) if progressbar else None

    # `range` (not the Python 2-only `xrange`) keeps this working on Python 3.
    for i in range(n_rows):
        # DipTest yields (dip, xlow, xup, ifault, gcm, lcm, mn, mj);
        # only the first three are needed here.
        dip, xlow, xup = DipTest(xpdf_matrix[i, :])[:3]
        dips[i] = dip
        xlows[i] = xlow
        xups[i] = xup
        if pbar is not None:
            pbar.update()

    # Background distribution: dip statistics of sorted uniform samples that
    # have the same length as one row of the input.
    sample_size = xpdf_matrix.shape[1]
    bootDip = np.zeros(nboot)
    for i in range(nboot):
        unifpdf = np.sort(np.random.rand(sample_size))
        bootDip[i] = DipTest(unifpdf)[0]
        if pbar is not None:
            pbar.update()

    dips = np.expand_dims(dips, axis=1)        # Make dips Nx1
    bootDip = np.expand_dims(bootDip, axis=0)  # Make bootDip 1xnboot

    # Broadcasting compares every row's dip against every bootstrap dip.
    ps = np.sum(dips < bootDip, axis=1) / float(nboot)

    if pbar is not None:
        pbar.complete()

    return (dips, ps, xlows, xups)
def em_exp_norm_mixture(zmat, cutoff, progressbar=True):
    """
    Fit, per row of zmat, a two-component mixture by expectation-maximization.

    The "low" component has density exp(-z / mu_l) / mu_l (exponential) and
    the "high" component is a normal density with mean mu_h and standard
    deviation st_h.  Membership in the high component is initialised as
    ``zmat > cutoff`` and then refined for at most 150 iterations;
    convergence is checked every 10th iteration and the loop stops once no
    membership value has moved by 0.01 or more since the previous iteration.

    :param zmat: 1-D or 2-D numpy array.  A 1-D array is promoted to a
        single-row matrix (and cutoff is wrapped in an array to match).
    :param cutoff: initial threshold for each row; reshaped to a column
        vector.  NOTE(review): for 2-D zmat this must already be an array
        with one entry per row - confirm against callers.
    :param progressbar: if True, report progress through a ProgressBar
    :return: tuple (gamma, mu_l, mu_h, st_l, st_h, Pi, L) where
        gamma: weight of the high component for every entry of zmat
        mu_l, mu_h: weighted_mean of zmat under (1 - gamma) and gamma
        st_l, st_h: weighted_std of zmat under (1 - gamma) and gamma
        Pi: row-wise mean of gamma, shape (rows, 1)
        L: per-row sum of gamma*log(p_high) + (1-gamma)*log(p_low), using
           the densities evaluated in the last E-step
    """
    # nans and infs are handled explicitly, don't need warning
    with np.errstate(divide="ignore", invalid="ignore"):
        max_iter = 150
        min_density = 1e-5  # floor applied to both component densities

        pbar = ProgressBar(max_iter) if progressbar else None

        # promote to 2d if single gene given
        if zmat.ndim == 1:
            zmat = zmat.reshape((1, zmat.shape[0]))
            cutoff = np.array([cutoff])

        n_rows, n_cols = zmat.shape[0], zmat.shape[1]

        # Column vector so the comparison broadcasts across each row.
        cutoff = cutoff.reshape((cutoff.shape[0], 1))
        gamma = zmat > cutoff

        Pi = np.mean(gamma, axis=1).reshape((n_rows, 1))
        # Float division on purpose: with Python 2 integer division 1 / n
        # is 0, which would leave the zero mixing weight in place.
        Pi[Pi == 0] = 1.0 / n_cols

        mu_l = weighted_mean(zmat, 1 - gamma)
        mu_h = weighted_mean(zmat, gamma)
        st_h = weighted_std(zmat, gamma, mu_h)

        # `range` (not the Python 2-only `xrange`) keeps this Python 3 safe.
        for niter in range(max_iter):
            # E
            prev_gamma = gamma
            p_low = np.exp(-1 * zmat / mu_l) / mu_l
            p_high = (np.exp(-1 * (zmat - mu_h) ** 2 / (2 * st_h ** 2))
                      / st_h / np.sqrt(2 * np.pi))

            # Replace non-finite values first so the comparison below never
            # sees a nan, then floor tiny densities.
            p_low[~np.isfinite(p_low)] = min_density
            p_low[p_low < min_density] = min_density
            p_high[~np.isfinite(p_high)] = min_density
            p_high[p_high < min_density] = min_density

            weighted_high = Pi * p_high
            gamma = weighted_high / (((1 - Pi) * p_low) + weighted_high)

            # M
            Pi = np.mean(gamma, axis=1).reshape((n_rows, 1))
            mu_l = weighted_mean(zmat, 1 - gamma)
            mu_h = weighted_mean(zmat, gamma)
            st_h = weighted_std(zmat, gamma, mu_h)

            if pbar is not None:
                pbar.update()

            # Convergence is only tested every 10th iteration.
            if niter % 10 == 0:
                biggest_change = np.max(np.abs(gamma - prev_gamma))
                if biggest_change < 0.01:
                    break

        if pbar is not None:
            pbar.complete()

        L = np.sum(gamma * np.log(p_high) + (1 - gamma) * np.log(p_low), axis=1)
        st_l = weighted_std(zmat, 1 - gamma, mu_l)

        return (gamma, mu_l, mu_h, st_l, st_h, Pi, L)
def _ensure_dir(path):
    """Create `path` (and missing parents) unless it already is a directory."""
    try:
        os.makedirs(path)
    except OSError:
        # An already-existing directory is fine; anything else (permissions,
        # a regular file in the way, ...) is a real failure and must surface.
        if not os.path.isdir(path):
            raise


def _signature_genes_by_sign(sig):
    """Return (genes, values) of `sig.sig_dict`, ordered by descending value."""
    # list() so that the dict views returned on Python 3 can be indexed.
    genes = list(sig.sig_dict.keys())
    values = list(sig.sig_dict.values())
    order = np.array(values).argsort()[::-1]  # Put positive signatures first
    return [genes[i] for i in order], [values[i] for i in order]


def _run_projection_pass(source, data, filter_name, filter_dir, sig_scores,
                         subsample_size, pca):
    """
    Project `source`, score signatures against the projections, save results.

    Writes the Projections / Clusters / DissimilarityMatrix / PMatrix text
    files into `filter_dir` (with a "-PC" suffix when `pca` is true).

    :return: (js_filt_dict, pcdata) - the dictionary describing this pass for
        the JS output, and the second value returned by
        Projections.generate_projections.
    """
    suffix = "-PC" if pca else ""

    #%% Dimensional Reduction procedures
    print()
    print("Projecting data into 2 dimensions")
    projections, pcdata = Projections.generate_projections(source, filter_name, subsample_size)

    #Evaluate Clusters
    print("Evaluating Clusters...")
    clusters = Projections.define_clusters(projections)

    #%% Evaluating signatures against projections
    sp_row_labels, sp_col_labels, sig_proj_matrix, sig_proj_matrix_p = \
        Signatures.sigs_vs_projections_v2(projections, sig_scores, subsample_size=subsample_size)

    #Save Projections
    FileIO.write_projection_file(os.path.join(filter_dir, 'Projections' + suffix + '.txt'), source.col_labels, projections)

    #Save Clusters
    FileIO.write_cluster_file(os.path.join(filter_dir, 'Clusters' + suffix + '.txt'), source.col_labels, clusters)

    #Output matrix of p-values for conformity scores
    FileIO.write_matrix(os.path.join(filter_dir, "DissimilarityMatrix" + suffix + ".txt"), sig_proj_matrix, sp_row_labels, sp_col_labels)
    FileIO.write_matrix(os.path.join(filter_dir, "PMatrix" + suffix + ".txt"), sig_proj_matrix_p, sp_row_labels, sp_col_labels)

    #Output JS
    js_filt_dict = {
        'filter': filter_name,
        'genes': data.filters[filter_name],
        'pca': pca,
        'projections': projections,
        'sigProjMatrix': sig_proj_matrix,
        'sigProjMatrix_p': sig_proj_matrix_p,
        'projectionKeys': sp_col_labels,
        'signatureKeys': sp_row_labels,
        'clusters': clusters,
    }
    return js_filt_dict, pcdata


def FullOutput(options, args):
    """
    Run the complete analysis on one expression matrix and write all outputs.

    Steps, in order: read the data file, load signatures, pick/create the
    output directory, filter genes, fit the probability model and correct for
    false negatives, optionally drop samples failing quality control,
    z-normalize, then for each model ('Expression', 'Probability') and each
    gene filter generate projections (on the data and again on its filtered
    principal components), score signatures against them and save the
    results as text files plus a combined "FP_data.jsdata" file.

    :param options: object with the attributes read here: housekeeping,
        signatures, precomputed, output, nofilter, qc, subsample_size and
        pca_filter.  options.subsample_size is reset to None when it exceeds
        the number of samples.
    :param args: sequence whose first element is the path of the data file
    :raises ValueError: if no data file is given, the data file or a
        signature file does not exist, or neither a signature file nor a
        pre-computed signature file was specified
    """
    start_time = time.time()

    # An empty name is handed to Transforms.create_false_neg_map when no
    # housekeeping file was given - presumably that selects the default
    # housekeeping list; confirm in Transforms.
    housekeeping_filename = options.housekeeping if options.housekeeping else ''

    #%% Read expression data from file
    if len(args) == 0:
        raise ValueError("Argument Error: data_file not specified.\nExiting...")
    filename = args[0]

    if not os.path.isfile(filename):
        raise ValueError("Argument Error: data file not found.\nExiting...")

    (edata, genes, cells) = FileIO.read_matrix(filename)

    print("Imported ", edata.shape[0], " genes across ", edata.shape[1], " samples")

    #%% Load Signature file
    sigs = []
    if options.signatures:
        sig_file = options.signatures
        if not os.path.isfile(sig_file):
            raise ValueError("Option Error: signature file " + sig_file + " not found.\nExiting...")
        sigs = Signatures.read_signatures(sig_file)

    if options.precomputed:
        precomputed_sig_file = options.precomputed
        if not os.path.isfile(precomputed_sig_file):
            raise ValueError("Option Error: precomputed signature file " + precomputed_sig_file + " not found.\nExiting...")

    if not options.signatures and not options.precomputed:
        #Need one or the other here
        raise ValueError("Option Error: Must specify either a signature file or a pre-computed signature file.\nExiting...")

    #Wrap data in ExpressionData object
    edata = ExpressionData(edata, genes, cells)

    #Hold on to originals so we don't lose data after filtering in case it's needed later
    original_data = edata.copy()

    #Create directory for all outputs
    if options.output:
        dir_name = options.output
    else:
        #First unused name among FastProject_Output, FastProject_Output1, FastProject_Output2, ...
        default_dir_name = 'FastProject_Output'
        dir_name = default_dir_name
        i = 1
        while os.path.isdir(dir_name):
            dir_name = default_dir_name + str(i)
            i = i + 1

    _ensure_dir(dir_name)

    #Filtering
    filter_dict = {}
    if options.nofilter:
        edata = Filters.filter_genes_novar(edata)
        filter_dict.update({'None': set()})
    else:
        edata = Filters.filter_genes_threshold(edata, 0.2)

        #HDT Filtering
        print("Removing genes with unimodal distribution across samples using Hartigans DT...")
        hdt_mask = Filters.filter_genes_hdt(edata, 0.05)

        #Fano Filtering
        print("Applying Fano-Filtering...")
        fano_mask = Filters.filter_genes_fano(edata, 2)

        filter_dict.update({
            'None': set(edata.row_labels),  #None means 'use all genes'. This set only used when outputting filter
            'HDT': set([edata.row_labels[i] for i, x in enumerate(hdt_mask) if x]),
            'Fano': set([edata.row_labels[i] for i, x in enumerate(fano_mask) if x]),
        })

    edata.filters = filter_dict

    #%% Probability transform
    print()
    print('Fitting expression data to exp/norm mixture model')
    (pdata, mu_h) = Transforms.probability_of_expression(edata)

    print()
    print('Correcting for false-negatives using housekeeping gene levels')
    (fit_func, params) = Transforms.create_false_neg_map(original_data, housekeeping_filename)
    (pdata, fn_prob) = Transforms.correct_for_fn(pdata, mu_h, fit_func, params)
    #A detected (non-zero) value gets false-negative probability 0
    fn_prob[edata > 0] = 0

    pdata = ProbabilityData(pdata, edata)

    edata.weights = 1 - fn_prob
    pdata.weights = 1 - fn_prob

    sample_passes, sample_scores = Transforms.quality_check(params)

    if options.qc:
        pdata = pdata.subset_samples(sample_passes)
        edata = edata.subset_samples(sample_passes)
        sample_scores = sample_scores[sample_passes]

    #Explicit None check: comparing None with an int raises TypeError on Python 3
    if options.subsample_size is not None and options.subsample_size > edata.shape[1]:
        options.subsample_size = None

    Transforms.z_normalize(edata)

    model_names = ['Expression', 'Probability']
    model_data = [edata, pdata]

    js_models = []

    #The context manager closes the JS data file even if a later step raises
    with open(dir_name + os.sep + "FP_data.jsdata", 'w') as fout_js:
        for name, data in zip(model_names, model_data):
            model_dir = os.path.join(dir_name, name)
            _ensure_dir(model_dir)

            print()
            print('Model: ', name)

            #Evaluate Signatures
            print()
            print("Evaluating signature scores on samples...")

            sig_scores = dict()

            pbar = ProgressBar(len(sigs))
            for sig in sigs:
                try:
                    sig_scores[sig.name] = data.eval_signature(sig)
                except ValueError:  #Only thrown when the signature has no genes in the data
                    pass  #Just discard the signature then
                pbar.update()
            pbar.complete()

            if options.precomputed:
                precomputed_sig_scores = Signatures.load_precomputed(options.precomputed, data.col_labels)
                sig_scores.update(precomputed_sig_scores)

            #Save signature scores
            out_file = 'SignatureScores.txt'
            FileIO.write_signature_scores(os.path.join(model_dir, out_file), sig_scores, data.col_labels)

            #Save data to js model as well
            js_model_dict = {
                'model': name,
                'signatureScores': sig_scores,
                'sampleLabels': data.col_labels,
                'projectionData': [],
            }
            js_models.append(js_model_dict)

            for filter_name in filter_dict:
                if filter_name == "None":
                    filter_dir = os.path.join(model_dir, "No_Filter")
                else:
                    filter_dir = os.path.join(model_dir, filter_name + "_Filter")
                _ensure_dir(filter_dir)

                print()
                print("Filter-Level:", filter_name)

                #First pass: project the model data itself
                js_filt_dict, pcdata = _run_projection_pass(
                    data, data, filter_name, filter_dir, sig_scores,
                    options.subsample_size, False)

                #Output genes used in filter
                FileIO.write_filter_file(os.path.join(filter_dir, 'ProjectedGenes.txt'), data.filters[filter_name])

                js_model_dict['projectionData'].append(js_filt_dict)

                #Now do it all again using the principal component data
                if options.pca_filter:
                    pcdata = Projections.filter_PCA(pcdata, scores=sample_scores, variance_proportion=0.25)
                else:
                    pcdata = Projections.filter_PCA(pcdata, variance_proportion=0.25, min_components=30)

                js_filt_dict, _ = _run_projection_pass(
                    pcdata, data, filter_name, filter_dir, sig_scores,
                    options.subsample_size, True)
                js_model_dict['projectionData'].append(js_filt_dict)

        fout_js.write(HtmlViewer.toJS_variable("FP_Models", js_models))

        #Write signatures to file
        #Assemble signatures into an object, then convert to JSON variable and write
        sig_dict = {}
        for sig in sigs:
            sig_genes, sig_values = _signature_genes_by_sign(sig)
            sig_dict.update({sig.name: {'Genes': sig_genes, 'Signs': sig_values}})

        fout_js.write(HtmlViewer.toJS_variable("FP_Signatures", sig_dict))

        #Write the original data matrix to the javascript file.
        #First, cluster genes
        from scipy.cluster.hierarchy import leaves_list, linkage
        linkage_matrix = linkage(edata)
        leaves_i = leaves_list(linkage_matrix)
        edata_clustered = edata[leaves_i, :]
        edata_clustered.row_labels = [edata.row_labels[i] for i in leaves_i]

        data_json = {
            'data': edata_clustered,
            'gene_labels': edata_clustered.row_labels,
            'sample_labels': edata_clustered.col_labels,
        }
        fout_js.write(HtmlViewer.toJS_variable("FP_ExpressionMatrix", data_json))

    HtmlViewer.copy_html_files(dir_name)
    print()
    print("FastProject Analysis Complete")
    elapsed_time = time.time() - start_time
    print("Elapsed Time {:.2f} seconds".format(elapsed_time))