def test_2():
    """Check the E-step statistics of GaussianHMMCPUImpl against GaussianHMM.

    Generator-style test: for each state count a random diagonal-covariance
    model and a random observation sequence are drawn, the E-step is run
    through ``GaussianHMMCPUImpl.do_estep`` (parameters cast to float32) and
    through the private forward/backward helpers of ``GaussianHMM``, and one
    zero-argument callable is yielded per sufficient statistic
    (``'trans'``, ``'post'``, ``'obs'``, ``'obs**2'``).  Each callable asserts
    agreement to 3 decimals when invoked.
    """
    n_features = 3
    length = 32

    def almost_equal_check(expected, actual):
        # Bind the two arrays *now*.  A bare ``lambda`` closing over the loop
        # variables would see whatever they hold when the runner finally
        # calls it, which is wrong once more than one n_states is tested or
        # when the runner collects every case before running any.
        return lambda: np.testing.assert_array_almost_equal(
            expected, actual, decimal=3)

    for n_states in [4]:
        t1 = np.random.randn(length, n_features)
        means = np.random.randn(n_states, n_features)
        variances = np.random.rand(n_states, n_features)
        transmat = np.random.rand(n_states, n_states)
        # Normalise each row so it sums to one.
        transmat = transmat / np.sum(transmat, axis=1)[:, None]
        startprob = np.random.rand(n_states)
        startprob = startprob / np.sum(startprob)

        # Implementation under test.
        chmm = GaussianHMMCPUImpl(n_states, n_features)
        chmm._sequences = [t1]
        chmm.means_ = means.astype(np.float32)
        chmm.vars_ = variances.astype(np.float32)
        chmm.transmat_ = transmat.astype(np.float32)
        chmm.startprob_ = startprob.astype(np.float32)
        _, cstats = chmm.do_estep()

        # Reference implementation, with fitting/initialisation disabled.
        pyhmm = GaussianHMM(n_components=n_states, init_params='', params='',
                            covariance_type='diag')
        pyhmm.means_ = means
        pyhmm.covars_ = variances
        pyhmm.transmat_ = transmat
        pyhmm.startprob_ = startprob

        framelogprob = pyhmm._compute_log_likelihood(t1)
        fwdlattice = pyhmm._do_forward_pass(framelogprob)[1]
        bwdlattice = pyhmm._do_backward_pass(framelogprob)
        gamma = fwdlattice + bwdlattice
        posteriors = np.exp(gamma.T - logsumexp(gamma, axis=1)).T
        stats = pyhmm._initialize_sufficient_statistics()
        pyhmm._accumulate_sufficient_statistics(
            stats, t1, framelogprob, posteriors, fwdlattice, bwdlattice,
            'stmc')

        for key in ('trans', 'post', 'obs', 'obs**2'):
            yield almost_equal_check(stats[key], cstats[key])
def get_trained_model(rootpath, condition, n_states, n_iterations, feature,
                      cov_type):
    """Build a GaussianHMM from parameter files written by a training run.

    Three text files are read from the directory ``rootpath + condition + '/'``
    (``rootpath`` is concatenated as-is, so it must already end with a path
    separator).  Their names share the stem
    ``<condition>-cond-<feature>-feat-<n_states>-states-<n_iterations>-iter-``
    followed by ``mean.txt``, ``cov.txt`` or ``transtion.txt`` (the misspelling
    is part of the on-disk name and is kept).

    Parameters
    ----------
    rootpath : str
        Directory prefix, including its trailing separator.
    condition : str
        Sub-directory name; also the first component of each file name.
    n_states : int
        Number of states encoded in the file names.
    n_iterations : int
        Iteration count encoded in the file names.
    feature : str
        Feature name encoded in the file names.
    cov_type : str
        Covariance type passed to ``GaussianHMM``.  ``'full'`` loads the
        covariances through ``load_full``; anything else uses ``np.loadtxt``.

    Returns
    -------
    GaussianHMM
        Model with ``means_`` and ``covars_`` set from the files, a
        row-normalised transition matrix, and a start distribution that puts
        all mass on state 0.
    """
    directory = rootpath + condition + '/'
    stem = (condition + '-cond-' + feature + '-feat-' + str(n_states) +
            '-states-' + str(n_iterations) + '-iter-')

    mean = np.loadtxt(directory + stem + 'mean.txt')
    # Report states whose mean has no component above 0.01.  Pruning of such
    # states is disabled, so nothing is removed here; the states are only
    # listed, highest index first.
    for i in reversed(range(n_states)):
        if not np.any(mean[i] > 0.01):
            print('skipping deleting ith mean: %s %s' % (i, mean[i]))

    cov_path = directory + stem + 'cov.txt'
    if cov_type == 'full':
        # NOTE(review): the meaning of the literal 10 is defined by load_full
        # (presumably the feature dimension) -- confirm against that helper.
        cov = load_full(cov_path, n_states, 10)
    else:
        cov = np.loadtxt(cov_path)

    tmat = np.loadtxt(directory + stem + 'transtion.txt')

    # Start distribution: always begin in state 0.
    smat = np.zeros(tmat.shape[0])
    smat[0] = 1.0

    # Re-normalise every row of the transition matrix to sum to one
    # (multiplying by the reciprocal of the row sum, as before).
    tmat = tmat * (1.0 / np.sum(tmat, axis=1))[:, None]

    if n_states != tmat.shape[0]:
        # The double space after the colon reproduces the historical output.
        print('removed some states, n_states now corrected to:  %s was originaly %s'
              % (tmat.shape[0], n_states))
        n_states = tmat.shape[0]

    model = GaussianHMM(n_components=n_states, covariance_type=cov_type,
                        startprob=smat, transmat=tmat, n_iter=0,
                        init_params='mc')
    model.means_ = mean
    model.covars_ = cov
    return model
def test_2():
    """Check the E-step statistics of GaussianHMMCPUImpl against GaussianHMM.

    Generator-style test: for each state count a random diagonal-covariance
    model and a random observation sequence are drawn, the E-step is run
    through ``GaussianHMMCPUImpl.do_estep`` (parameters cast to float32) and
    through the private forward/backward helpers of ``GaussianHMM``, and one
    zero-argument callable is yielded per sufficient statistic
    (``'trans'``, ``'post'``, ``'obs'``, ``'obs**2'``).  Each callable asserts
    agreement to 3 decimals when invoked.
    """
    n_features = 3
    length = 32

    def almost_equal_check(expected, actual):
        # Bind the two arrays *now*.  A bare ``lambda`` closing over the loop
        # variables would see whatever they hold when the runner finally
        # calls it, which is wrong once more than one n_states is tested or
        # when the runner collects every case before running any.
        return lambda: np.testing.assert_array_almost_equal(
            expected, actual, decimal=3)

    for n_states in [4]:
        t1 = np.random.randn(length, n_features)
        means = np.random.randn(n_states, n_features)
        variances = np.random.rand(n_states, n_features)
        transmat = np.random.rand(n_states, n_states)
        # Normalise each row so it sums to one.
        transmat = transmat / np.sum(transmat, axis=1)[:, None]
        startprob = np.random.rand(n_states)
        startprob = startprob / np.sum(startprob)

        # Implementation under test.
        chmm = GaussianHMMCPUImpl(n_states, n_features)
        chmm._sequences = [t1]
        chmm.means_ = means.astype(np.float32)
        chmm.vars_ = variances.astype(np.float32)
        chmm.transmat_ = transmat.astype(np.float32)
        chmm.startprob_ = startprob.astype(np.float32)
        _, cstats = chmm.do_estep()

        # Reference implementation, with fitting/initialisation disabled.
        pyhmm = GaussianHMM(n_components=n_states, init_params='', params='',
                            covariance_type='diag')
        pyhmm.means_ = means
        pyhmm.covars_ = variances
        pyhmm.transmat_ = transmat
        pyhmm.startprob_ = startprob

        framelogprob = pyhmm._compute_log_likelihood(t1)
        fwdlattice = pyhmm._do_forward_pass(framelogprob)[1]
        bwdlattice = pyhmm._do_backward_pass(framelogprob)
        gamma = fwdlattice + bwdlattice
        posteriors = np.exp(gamma.T - logsumexp(gamma, axis=1)).T
        stats = pyhmm._initialize_sufficient_statistics()
        pyhmm._accumulate_sufficient_statistics(
            stats, t1, framelogprob, posteriors, fwdlattice, bwdlattice,
            'stmc')

        for key in ('trans', 'post', 'obs', 'obs**2'):
            yield almost_equal_check(stats[key], cstats[key])
def predict(self, obs):
    """Find most likely state sequence corresponding to `obs`.

    A temporary full-covariance ``GaussianHMM`` is populated with this
    model's ``transmat_``, ``means_`` and ``covars_`` and asked to predict.
    Its start distribution is the stationary distribution of
    ``self.transmat_``, i.e. the normalised left eigenvector belonging to the
    dominant eigenvalue.

    Parameters
    ----------
    obs : np.ndarray, shape=(n_samples, n_features)
        Sequence of n_features-dimensional data points. Each row
        corresponds to a single point in the sequence.

    Returns
    -------
    hidden_states : np.ndarray
        Index of the most likely state for each observation, as returned by
        ``GaussianHMM.predict`` (presumably shape=(n_samples,)).
    """
    eigenvalues, left_vectors = scipy.linalg.eig(
        self.transmat_, left=True, right=False)
    # eig() returns its eigenvalues in no particular order, so the stationary
    # eigenvector is not necessarily column 0.  For a row-stochastic matrix
    # the eigenvalue with the largest real part is 1; use its eigenvector.
    dominant = np.argmax(np.real(eigenvalues))
    # The decomposition may be complex-typed even though this eigenvector is
    # real; drop the (zero) imaginary part so startprob_ is a real array.
    stationary = np.real(left_vectors[:, dominant])
    startprob = stationary / np.sum(stationary)

    model = GaussianHMM(n_components=self.n_states, covariance_type='full')
    model.startprob_ = startprob
    model.transmat_ = self.transmat_
    model.means_ = self.means_
    model.covars_ = self.covars_
    return model.predict(obs)
def predict(self, obs):
    """Find most likely state sequence corresponding to `obs`.

    A temporary full-covariance ``GaussianHMM`` is populated with this
    model's ``transmat_``, ``means_`` and ``covars_`` and asked to predict.
    Its start distribution is the stationary distribution of
    ``self.transmat_``, i.e. the normalised left eigenvector belonging to the
    dominant eigenvalue.

    Parameters
    ----------
    obs : np.ndarray, shape=(n_samples, n_features)
        Sequence of n_features-dimensional data points. Each row
        corresponds to a single point in the sequence.

    Returns
    -------
    hidden_states : np.ndarray
        Index of the most likely state for each observation, as returned by
        ``GaussianHMM.predict`` (presumably shape=(n_samples,)).
    """
    eigenvalues, left_vectors = scipy.linalg.eig(
        self.transmat_, left=True, right=False)
    # eig() returns its eigenvalues in no particular order, so the stationary
    # eigenvector is not necessarily column 0.  For a row-stochastic matrix
    # the eigenvalue with the largest real part is 1; use its eigenvector.
    dominant = np.argmax(np.real(eigenvalues))
    # The decomposition may be complex-typed even though this eigenvector is
    # real; drop the (zero) imaginary part so startprob_ is a real array.
    stationary = np.real(left_vectors[:, dominant])
    startprob = stationary / np.sum(stationary)

    model = GaussianHMM(n_components=self.n_states, covariance_type='full')
    model.startprob_ = startprob
    model.transmat_ = self.transmat_
    model.means_ = self.means_
    model.covars_ = self.covars_
    return model.predict(obs)
def test_2():
    """Check every E-step stage of GaussianHMMCUDAImpl against GaussianHMM.

    Generator-style test with a fixed NumPy seed (42).  For each state count
    a random diagonal-covariance model and observation sequence are drawn,
    the E-step is run once through ``GaussianHMMCUDAImpl.do_estep`` and step
    by step through the private helpers of ``GaussianHMM``.  A zero-argument
    callable is yielded for the frame log-probabilities, the forward and
    backward lattices, the posteriors, and then each sufficient statistic
    (``'trans'``, ``'post'``, ``'obs'``, ``'obs**2'``); each asserts agreement
    to 3 decimals when invoked.
    """
    np.random.seed(42)
    n_features = 32
    length = 20

    def almost_equal_check(expected, actual):
        # Bind the two arrays *now*.  A bare ``lambda`` closing over the loop
        # variables would see whatever they hold when the runner finally
        # calls it, which is wrong once more than one n_states is tested or
        # when the runner collects every case before running any.
        return lambda: np.testing.assert_array_almost_equal(
            expected, actual, decimal=3)

    #for n_states in [3, 4, 5, 7, 8, 9, 15, 16, 17, 31, 32]:
    for n_states in [8]:
        t1 = np.random.randn(length, n_features)
        means = np.random.randn(n_states, n_features)
        variances = np.random.rand(n_states, n_features)
        transmat = np.random.rand(n_states, n_states)
        # Normalise each row so it sums to one.
        transmat = transmat / np.sum(transmat, axis=1)[:, None]
        startprob = np.random.rand(n_states)
        startprob = startprob / np.sum(startprob)

        # Implementation under test.
        cuhmm = GaussianHMMCUDAImpl(n_states, n_features)
        cuhmm._sequences = [t1]
        pyhmm = GaussianHMM(n_components=n_states, init_params='', params='',
                            covariance_type='diag')
        cuhmm.means_ = means
        cuhmm.vars_ = variances
        cuhmm.transmat_ = transmat
        cuhmm.startprob_ = startprob
        _, custats = cuhmm.do_estep()

        # Reference implementation, with fitting/initialisation disabled.
        pyhmm.means_ = means
        pyhmm.covars_ = variances
        pyhmm.transmat_ = transmat
        pyhmm.startprob_ = startprob
        # Kept from the original test; the returned dict is discarded here
        # and a fresh one is created further down.
        pyhmm._initialize_sufficient_statistics()

        framelogprob = pyhmm._compute_log_likelihood(t1)
        cuframelogprob = cuhmm._get_framelogprob()
        yield almost_equal_check(framelogprob, cuframelogprob)

        fwdlattice = pyhmm._do_forward_pass(framelogprob)[1]
        cufwdlattice = cuhmm._get_fwdlattice()
        yield almost_equal_check(fwdlattice, cufwdlattice)

        bwdlattice = pyhmm._do_backward_pass(framelogprob)
        cubwdlattice = cuhmm._get_bwdlattice()
        yield almost_equal_check(bwdlattice, cubwdlattice)

        gamma = fwdlattice + bwdlattice
        posteriors = np.exp(gamma.T - logsumexp(gamma, axis=1)).T
        cuposteriors = cuhmm._get_posteriors()
        yield almost_equal_check(posteriors, cuposteriors)

        stats = pyhmm._initialize_sufficient_statistics()
        pyhmm._accumulate_sufficient_statistics(
            stats, t1, framelogprob, posteriors, fwdlattice, bwdlattice,
            'stmc')

        # Debug output.  Single-argument print(...) emits the same text under
        # Python 2 (parenthesised expression) and Python 3 (function call).
        print('ref transcounts')
        print(transitioncounts(cufwdlattice, cubwdlattice, cuframelogprob,
                               np.log(transmat)))
        print('cutranscounts')
        print(custats['trans'])

        for key in ('trans', 'post', 'obs', 'obs**2'):
            yield almost_equal_check(stats[key], custats[key])
list_of_patient_feats, start_stop_idx, list_of_patient_file_paths = string_patient_feats(train_map, condition, overlap, window) #sirs_feats_stacked = stack_patient_feats(list_of_sirs_patients) feats_as_list = list_patient_feats(list_of_patient_feats) #print np.shape(sirs_feats_stacked) means, covs = get_initial_states(pre_states, condition, feature, end=False, start=False, cov_type=cov_type) print means print covs if cov_type == 'full': for i in range(n_states): print 'checking if initial covs are pos-definite' np.linalg.cholesky(covs[i]) print np.linalg.eigvals(covs[i]) tmat, smat = get_tmat_and_smat(pre_states, end=False, start=False) print tmat, smat model = GaussianHMM(n_components=n_states, n_iter=n_iter, covariance_type=cov_type, startprob=smat, transmat=tmat, init_params='mc') model.means_ = means model.covars_ = covs sum_inital_ll = 0.0 sum_initial_score = 0.0 sum_initial_map = 0.0 remove_idx = [] for idx, feat_from_list in enumerate(feats_as_list): if np.shape(feat_from_list)[0] > n_states: initial_ll, initial_best_seq = model.decode(feat_from_list) initial_map, initial_best_sep_map = model.decode(feat_from_list, algorithm='map') sum_initial_score += model.score(feat_from_list) sum_inital_ll += initial_ll sum_initial_map += initial_map else: remove_idx.append(idx) print 'too few samples in file', list_of_patient_file_paths[idx], np.shape(feat_from_list)
d = 0.050 ##EX transitions_prob = np.mat([row0 = [a,c,d,c,d], row1 = [ e,a,b,e,e], row2 = [c,d,a,c,d] , row3 = [d,c,c,a,d] , row4 [d,c,d,c ,a]]) transitions_prob = np.mat([[a, c, d, c, d], [e, a, b, e, e], [c, d, a, c, d], [d, c, c, a, d], [d, c, d, c, a]]) HMM = GaussianHMM(n_components=5, covariance_type="diag", transmat=transitions_prob) # # Must always fit the obs data before change means and covars # HMM.fit([Resul]) HMM.means_ = np.identity(5) HMM.covars_ = 0.2 * np.ones((5, 5)) # Use of LR probability to predict the states. HResul = HMM.predict(Resul) # Get the probability of success HMM Hscore = comp(HResul, target) # print HResul print "HMM = " print Hscore
def main():
    """
    Main function that performs footprint analysis.

    Parses the command line, reads the experimental matrix (one region set,
    one DNase BAM file and the histone BAM files), loads either a single HMM
    shared by all histone signals or one HMM per histone signal, runs the
    HMM over every extended region, and collects the stretches labelled with
    the footprint state.  The footprints are merged, intersected with the
    original regions, and written as a BED file (optionally converted to
    bigBed with the external ``bedToBigBed`` program).

    Keyword arguments: None

    Return: None
    """
    # Imported locally so that building the output paths does not depend on
    # which names the module imports from ``os``.
    import os.path

    ###################################################################################################
    # Processing Input Arguments
    ###################################################################################################

    # Initializing ErrorHandler
    error_handler = ErrorHandler()

    # Parameters
    current_version = "0.0.1"
    usage_message = (
        "\n--------------------------------------------------\n"
        "The 'hint' program predicts TFBSs given open chromatin data.\n"
        "In order to use this tools, please type: \n\n"
        "%prog [options] <experiment_matrix>\n\n"
        "The <experiment matrix> should contain:\n"
        "- One region file representing the regions in which the HMM\n"
        " will be applied. It should contain 'regions' in the type field\n"
        "- One DNase aligned reads file (bam) file with 'DNASE' in the name field.\n"
        "- One to Three histone modification aligned reads file (bam).\n\n"
        "For more information, please refer to:\n"
        "http://www.regulatory-genomics.org/dnasefootprints/\n"
        "--------------------------------------------------")
    version_message = ("HINT - Regulatory Analysis Toolbox (RGT). Version: " +
                       str(current_version))

    # Initializing Option Parser
    parser = PassThroughOptionParser(usage=usage_message,
                                     version=version_message)

    # Optional Input Options
    # (each fragment of the help text ends with a space so that the
    # concatenated sentence does not run words together)
    parser.add_option(
        "--hmm-file", dest="hmm_file", type="string",
        metavar="FILE_1[,FILE_2,...,FILE_N]", default=None,
        help=("List of HMM files separated by comma. If one file only, then this HMM will be "
              "applied for all histone signals, otherwise, the list must have the same number "
              "of histone files given. The order of the list should be the order of the "
              "histones in the input_matrix file. If the argument is not given, then an HMM "
              "trained with H3K4me3 in K562 will be used."))

    # Parameters Options
    parser.add_option(
        "--organism", dest="organism", type="string", metavar="STRING",
        default="hg19",
        help=("Organism considered on the analysis. Check our full documentation for all available "
              "options. All default files such as genomes will be based on the chosen organism "
              "and the data.config file. This option is used only if a bigbed output is asked."))

    # Output Options
    parser.add_option(
        "--output-location", dest="output_location", type="string",
        metavar="PATH", default=getcwd(),
        help=("Path where the output files will be written."))
    parser.add_option(
        "--footprint-name", dest="footprint_name", type="string",
        metavar="STRING", default="footprints",
        help=("Name of the footprint file (without extension)."))
    parser.add_option(
        "--print-bb", dest="print_bb", action="store_true", default=False,
        help=("If used, the output will be a bigbed (.bb) file."))

    # Processing Options: exactly one positional argument is accepted.
    options, arguments = parser.parse_args()
    if not arguments or len(arguments) > 1:
        error_handler.throw_error("FP_WRONG_ARGUMENT")

    # Fixed Parameters
    region_total_ext = 10000  # total extension applied to each region (half per side)
    fp_state_nb = 7  # HMM state label that is treated as a footprint
    fp_limit_size = 50  # footprints closed inside a region must be shorter than this
    # DNase signal processing
    dnase_initial_clip = 1000
    dnase_sg_window_size = 9
    dnase_norm_per = 98
    dnase_slope_per = 98
    dnase_frag_ext = 1
    # Histone signal processing
    histone_initial_clip = 1000
    histone_sg_window_size = 201
    histone_norm_per = 98
    histone_slope_per = 98
    histone_frag_ext = 200

    ###################################################################################################
    # Reading Input Matrix
    ###################################################################################################

    # Reading input argument
    input_matrix = arguments[0]

    # Create experimental matrix
    try:
        exp_matrix = ExperimentalMatrix()
        exp_matrix.read(input_matrix)
    except Exception:
        error_handler.throw_error("FP_WRONG_EXPMAT")

    ###################################################################################################
    # Reading Regions
    ###################################################################################################

    # Fetching region file: none is an error, more than one only a warning
    # (the first region set is the one used).
    region_set_list = exp_matrix.get_regionsets()
    if len(region_set_list) == 0:
        error_handler.throw_error("FP_ONE_REGION")
    elif len(region_set_list) > 1:
        error_handler.throw_warning("FP_ONE_REGION")
    regions = region_set_list[0]

    # Extending + Sorting + Merging / keeping an original copy
    original_regions = deepcopy(regions)
    half_ext = int(region_total_ext / 2)
    regions.extend(half_ext, half_ext)  # Extending
    regions.merge()  # Sort & Merge

    ###################################################################################################
    # Reading Signals
    ###################################################################################################

    # Initialization
    name_list = exp_matrix.names
    type_list = exp_matrix.types
    file_dict = exp_matrix.files
    dnase_label = "DNASE"

    # Fetching signal files
    dnase_file = None
    histone_file_list = []
    for name, signal_type in zip(name_list, type_list):
        if signal_type == "regions":
            continue
        if name.upper() == dnase_label:  # DNase signal
            if not dnase_file:
                dnase_file = BamFile(file_dict[name])
                dnase_file.load_sg_coefs(dnase_sg_window_size)
            else:
                # Only the first DNase entry is used.
                error_handler.throw_warning("FP_MANY_DNASE")
        else:  # Histone signal
            histone_file = BamFile(file_dict[name])
            histone_file.load_sg_coefs(histone_sg_window_size)
            histone_file_list.append(histone_file)

    # Handling errors
    if not dnase_file:
        error_handler.throw_error("FP_NO_DNASE")
    if len(histone_file_list) == 0:
        error_handler.throw_error("FP_NO_HISTONE")
    elif len(histone_file_list) > 3:
        error_handler.throw_warning("FP_MANY_HISTONE")

    ###################################################################################################
    # Creating HMM list
    ###################################################################################################

    # Fetching HMM input
    flag_multiple_hmms = False
    if options.hmm_file:  # Argument is passed
        # Fetching list of HMM files
        hmm_file_list = options.hmm_file.split(",")

        # Verifying HMM application mode (one HMM or multiple HMM files)
        if len(hmm_file_list) == 1:
            flag_multiple_hmms = False  # One HMM file only
        elif len(hmm_file_list) == len(histone_file_list):
            # One HMM file for each histone.  The list compared against is
            # histone_file_list, the only histone list this function builds.
            flag_multiple_hmms = True
        else:
            error_handler.throw_error("FP_NB_HMMS")
    else:  # Argument was not passed
        flag_multiple_hmms = False
        hmm_data = HmmData()
        hmm_file_list = [hmm_data.get_default_hmm()]

    # Creating scikit HMM list
    hmm_list = []
    for hmm_file_name in hmm_file_list:
        try:
            hmm_scaffold = HMM()
            hmm_scaffold.load_hmm(hmm_file_name)
            scikit_hmm = GaussianHMM(n_components=hmm_scaffold.states,
                                     covariance_type="full",
                                     transmat=array(hmm_scaffold.A),
                                     startprob=array(hmm_scaffold.pi))
            scikit_hmm.means_ = array(hmm_scaffold.means)
            scikit_hmm.covars_ = array(hmm_scaffold.covs)
        except Exception:
            error_handler.throw_error("FP_HMM_FILES")
        hmm_list.append(scikit_hmm)

    ###################################################################################################
    # Main Pipeline
    ###################################################################################################

    # Initializing result set
    footprints = GenomicRegionSet("footprints")

    # Iterating over regions
    for r in regions.sequences:

        # Text used in every warning raised for this region.
        region_label = ",".join([r.chrom, str(r.initial), str(r.final)])

        # Fetching DNase signal
        try:
            dnase_norm, dnase_slope = dnase_file.get_signal(
                r.chrom, r.initial, r.final, dnase_frag_ext,
                dnase_initial_clip, dnase_norm_per, dnase_slope_per)
        except Exception:
            error_handler.throw_warning(
                "FP_DNASE_PROC",
                add_msg="for region (" + region_label +
                "). This iteration will be skipped.")
            continue

        # Iterating over histone modifications
        for i, histone_file in enumerate(histone_file_list):

            # Fetching histone signal
            try:
                histone_norm, histone_slope = histone_file.get_signal(
                    r.chrom, r.initial, r.final, histone_frag_ext,
                    histone_initial_clip, histone_norm_per, histone_slope_per)
            except Exception:
                error_handler.throw_warning(
                    "FP_HISTONE_PROC",
                    add_msg="for region (" + region_label +
                    ") and histone modification " + histone_file.file_name +
                    ". This iteration will be skipped for this histone.")
                continue

            # Formatting sequence: one row per position, four signal columns.
            try:
                input_sequence = array([dnase_norm, dnase_slope,
                                        histone_norm, histone_slope]).T
            except Exception:
                error_handler.throw_warning(
                    "FP_SEQ_FORMAT",
                    add_msg="for region (" + region_label +
                    ") and histone modification " + histone_file.file_name +
                    ". This iteration will be skipped.")
                continue

            # Applying HMM
            current_hmm = hmm_list[i] if flag_multiple_hmms else hmm_list[0]
            try:
                posterior_list = current_hmm.predict(input_sequence)
            except Exception:
                error_handler.throw_warning(
                    "FP_HMM_APPLIC",
                    add_msg="in region (" + region_label +
                    ") and histone modification " + histone_file.file_name +
                    ". This iteration will be skipped.")
                continue

            # Writing results: scan the state path and emit every run of the
            # footprint state as a region.
            start_pos = 0
            flag_start = False
            for k in range(r.initial, r.final):
                curr_index = k - r.initial
                if flag_start:
                    if posterior_list[curr_index] != fp_state_nb:
                        # Run ended; keep it only if shorter than the limit.
                        if k - start_pos < fp_limit_size:
                            fp = GenomicRegion(r.chrom, start_pos, k)
                            footprints.add(fp)
                        flag_start = False
                else:
                    if posterior_list[curr_index] == fp_state_nb:
                        flag_start = True
                        start_pos = k
            if flag_start:
                # A run still open at the end of the region is closed at the
                # region end (no size limit is applied here).
                fp = GenomicRegion(r.chrom, start_pos, r.final)
                footprints.add(fp)

    # Sorting and Merging
    footprints.merge()

    # Overlapping results with original regions
    footprints = footprints.intersect(original_regions,
                                      mode=OverlapType.ORIGINAL)

    ###################################################################################################
    # Writing output
    ###################################################################################################

    # Creating output file.  os.path.join inserts the separator that plain
    # concatenation omitted when the location (e.g. the getcwd() default)
    # has no trailing slash.
    output_file_name = os.path.join(options.output_location,
                                    options.footprint_name + ".bed")
    footprints.write_bed(output_file_name)

    # Verifying condition to write bb
    if options.print_bb:

        # Fetching file with chromosome sizes
        genome_data = GenomeData(options.organism)
        chrom_sizes_file = genome_data.get_chromosome_sizes()

        # Converting to big bed
        output_bb_name = os.path.join(options.output_location,
                                      options.footprint_name + ".bb")
        # NOTE(review): the command is a space-joined shell string, so paths
        # containing spaces or shell metacharacters will break it; a
        # subprocess call with an argument list would be safer.
        try:
            exit_status = system(" ".join([
                "bedToBigBed", output_file_name, chrom_sizes_file,
                output_bb_name
            ]))
            #remove(output_file_name)
        except Exception:
            error_handler.throw_error("FP_BB_CREATION")
        else:
            # A failing command is reported through its exit status rather
            # than an exception, so check that as well.
            if exit_status:
                error_handler.throw_error("FP_BB_CREATION")
def main():
    """
    Main function that performs footprint analysis.

    Parses the command line, reads the experimental matrix (one region set,
    one DNase BAM file and the histone BAM files), loads either a single HMM
    shared by all histone signals or one HMM per histone signal, runs the
    HMM over every extended region, and collects the stretches labelled with
    the footprint state.  The footprints are merged, intersected with the
    original regions, and written as a BED file (optionally converted to
    bigBed with the external ``bedToBigBed`` program).

    Keyword arguments: None

    Return: None
    """
    # Imported locally so that building the output paths does not depend on
    # which names the module imports from ``os``.
    import os.path

    ###################################################################################################
    # Processing Input Arguments
    ###################################################################################################

    # Initializing ErrorHandler
    error_handler = ErrorHandler()

    # Parameters
    current_version = "0.0.1"
    usage_message = (
        "\n--------------------------------------------------\n"
        "The 'hint' program predicts TFBSs given open chromatin data.\n"
        "In order to use this tools, please type: \n\n"
        "%prog [options] <experiment_matrix>\n\n"
        "The <experiment matrix> should contain:\n"
        "- One region file representing the regions in which the HMM\n"
        " will be applied. It should contain 'regions' in the type field\n"
        "- One DNase aligned reads file (bam) file with 'DNASE' in the name field.\n"
        "- One to Three histone modification aligned reads file (bam).\n\n"
        "For more information, please refer to:\n"
        "http://www.regulatory-genomics.org/dnasefootprints/\n"
        "--------------------------------------------------")
    version_message = ("HINT - Regulatory Analysis Toolbox (RGT). Version: " +
                       str(current_version))

    # Initializing Option Parser
    parser = PassThroughOptionParser(usage=usage_message,
                                     version=version_message)

    # Optional Input Options
    # (each fragment of the help text ends with a space so that the
    # concatenated sentence does not run words together)
    parser.add_option(
        "--hmm-file", dest="hmm_file", type="string",
        metavar="FILE_1[,FILE_2,...,FILE_N]", default=None,
        help=("List of HMM files separated by comma. If one file only, then this HMM will be "
              "applied for all histone signals, otherwise, the list must have the same number "
              "of histone files given. The order of the list should be the order of the "
              "histones in the input_matrix file. If the argument is not given, then an HMM "
              "trained with H3K4me3 in K562 will be used."))

    # Parameters Options
    parser.add_option(
        "--organism", dest="organism", type="string", metavar="STRING",
        default="hg19",
        help=("Organism considered on the analysis. Check our full documentation for all available "
              "options. All default files such as genomes will be based on the chosen organism "
              "and the data.config file. This option is used only if a bigbed output is asked."))

    # Output Options
    parser.add_option(
        "--output-location", dest="output_location", type="string",
        metavar="PATH", default=getcwd(),
        help=("Path where the output files will be written."))
    parser.add_option(
        "--footprint-name", dest="footprint_name", type="string",
        metavar="STRING", default="footprints",
        help=("Name of the footprint file (without extension)."))
    parser.add_option(
        "--print-bb", dest="print_bb", action="store_true", default=False,
        help=("If used, the output will be a bigbed (.bb) file."))

    # Processing Options: exactly one positional argument is accepted.
    options, arguments = parser.parse_args()
    if not arguments or len(arguments) > 1:
        error_handler.throw_error("FP_WRONG_ARGUMENT")

    # Fixed Parameters
    region_total_ext = 10000  # total extension applied to each region (half per side)
    fp_state_nb = 7  # HMM state label that is treated as a footprint
    fp_limit_size = 50  # footprints closed inside a region must be shorter than this
    # DNase signal processing
    dnase_initial_clip = 1000
    dnase_sg_window_size = 9
    dnase_norm_per = 98
    dnase_slope_per = 98
    dnase_frag_ext = 1
    # Histone signal processing
    histone_initial_clip = 1000
    histone_sg_window_size = 201
    histone_norm_per = 98
    histone_slope_per = 98
    histone_frag_ext = 200

    ###################################################################################################
    # Reading Input Matrix
    ###################################################################################################

    # Reading input argument
    input_matrix = arguments[0]

    # Create experimental matrix
    try:
        exp_matrix = ExperimentalMatrix()
        exp_matrix.read(input_matrix)
    except Exception:
        error_handler.throw_error("FP_WRONG_EXPMAT")

    ###################################################################################################
    # Reading Regions
    ###################################################################################################

    # Fetching region file: none is an error, more than one only a warning
    # (the first region set is the one used).
    region_set_list = exp_matrix.get_regionsets()
    if len(region_set_list) == 0:
        error_handler.throw_error("FP_ONE_REGION")
    elif len(region_set_list) > 1:
        error_handler.throw_warning("FP_ONE_REGION")
    regions = region_set_list[0]

    # Extending + Sorting + Merging / keeping an original copy
    original_regions = deepcopy(regions)
    half_ext = int(region_total_ext / 2)
    regions.extend(half_ext, half_ext)  # Extending
    regions.merge()  # Sort & Merge

    ###################################################################################################
    # Reading Signals
    ###################################################################################################

    # Initialization
    name_list = exp_matrix.names
    type_list = exp_matrix.types
    file_dict = exp_matrix.files
    dnase_label = "DNASE"

    # Fetching signal files
    dnase_file = None
    histone_file_list = []
    for name, signal_type in zip(name_list, type_list):
        if signal_type == "regions":
            continue
        if name.upper() == dnase_label:  # DNase signal
            if not dnase_file:
                dnase_file = BamFile(file_dict[name])
                dnase_file.load_sg_coefs(dnase_sg_window_size)
            else:
                # Only the first DNase entry is used.
                error_handler.throw_warning("FP_MANY_DNASE")
        else:  # Histone signal
            histone_file = BamFile(file_dict[name])
            histone_file.load_sg_coefs(histone_sg_window_size)
            histone_file_list.append(histone_file)

    # Handling errors
    if not dnase_file:
        error_handler.throw_error("FP_NO_DNASE")
    if len(histone_file_list) == 0:
        error_handler.throw_error("FP_NO_HISTONE")
    elif len(histone_file_list) > 3:
        error_handler.throw_warning("FP_MANY_HISTONE")

    ###################################################################################################
    # Creating HMM list
    ###################################################################################################

    # Fetching HMM input
    flag_multiple_hmms = False
    if options.hmm_file:  # Argument is passed
        # Fetching list of HMM files
        hmm_file_list = options.hmm_file.split(",")

        # Verifying HMM application mode (one HMM or multiple HMM files)
        if len(hmm_file_list) == 1:
            flag_multiple_hmms = False  # One HMM file only
        elif len(hmm_file_list) == len(histone_file_list):
            # One HMM file for each histone.  The list compared against is
            # histone_file_list, the only histone list this function builds.
            flag_multiple_hmms = True
        else:
            error_handler.throw_error("FP_NB_HMMS")
    else:  # Argument was not passed
        flag_multiple_hmms = False
        hmm_data = HmmData()
        hmm_file_list = [hmm_data.get_default_hmm()]

    # Creating scikit HMM list
    hmm_list = []
    for hmm_file_name in hmm_file_list:
        try:
            hmm_scaffold = HMM()
            hmm_scaffold.load_hmm(hmm_file_name)
            scikit_hmm = GaussianHMM(n_components=hmm_scaffold.states,
                                     covariance_type="full",
                                     transmat=array(hmm_scaffold.A),
                                     startprob=array(hmm_scaffold.pi))
            scikit_hmm.means_ = array(hmm_scaffold.means)
            scikit_hmm.covars_ = array(hmm_scaffold.covs)
        except Exception:
            error_handler.throw_error("FP_HMM_FILES")
        hmm_list.append(scikit_hmm)

    ###################################################################################################
    # Main Pipeline
    ###################################################################################################

    # Initializing result set
    footprints = GenomicRegionSet("footprints")

    # Iterating over regions
    for r in regions.sequences:

        # Text used in every warning raised for this region.
        region_label = ",".join([r.chrom, str(r.initial), str(r.final)])

        # Fetching DNase signal
        try:
            dnase_norm, dnase_slope = dnase_file.get_signal(
                r.chrom, r.initial, r.final, dnase_frag_ext,
                dnase_initial_clip, dnase_norm_per, dnase_slope_per)
        except Exception:
            error_handler.throw_warning(
                "FP_DNASE_PROC",
                add_msg="for region (" + region_label +
                "). This iteration will be skipped.")
            continue

        # Iterating over histone modifications
        for i, histone_file in enumerate(histone_file_list):

            # Fetching histone signal
            try:
                histone_norm, histone_slope = histone_file.get_signal(
                    r.chrom, r.initial, r.final, histone_frag_ext,
                    histone_initial_clip, histone_norm_per, histone_slope_per)
            except Exception:
                error_handler.throw_warning(
                    "FP_HISTONE_PROC",
                    add_msg="for region (" + region_label +
                    ") and histone modification " + histone_file.file_name +
                    ". This iteration will be skipped for this histone.")
                continue

            # Formatting sequence: one row per position, four signal columns.
            try:
                input_sequence = array([dnase_norm, dnase_slope,
                                        histone_norm, histone_slope]).T
            except Exception:
                error_handler.throw_warning(
                    "FP_SEQ_FORMAT",
                    add_msg="for region (" + region_label +
                    ") and histone modification " + histone_file.file_name +
                    ". This iteration will be skipped.")
                continue

            # Applying HMM
            current_hmm = hmm_list[i] if flag_multiple_hmms else hmm_list[0]
            try:
                posterior_list = current_hmm.predict(input_sequence)
            except Exception:
                error_handler.throw_warning(
                    "FP_HMM_APPLIC",
                    add_msg="in region (" + region_label +
                    ") and histone modification " + histone_file.file_name +
                    ". This iteration will be skipped.")
                continue

            # Writing results: scan the state path and emit every run of the
            # footprint state as a region.
            start_pos = 0
            flag_start = False
            for k in range(r.initial, r.final):
                curr_index = k - r.initial
                if flag_start:
                    if posterior_list[curr_index] != fp_state_nb:
                        # Run ended; keep it only if shorter than the limit.
                        if k - start_pos < fp_limit_size:
                            fp = GenomicRegion(r.chrom, start_pos, k)
                            footprints.add(fp)
                        flag_start = False
                else:
                    if posterior_list[curr_index] == fp_state_nb:
                        flag_start = True
                        start_pos = k
            if flag_start:
                # A run still open at the end of the region is closed at the
                # region end (no size limit is applied here).
                fp = GenomicRegion(r.chrom, start_pos, r.final)
                footprints.add(fp)

    # Sorting and Merging
    footprints.merge()

    # Overlapping results with original regions
    footprints = footprints.intersect(original_regions,
                                      mode=OverlapType.ORIGINAL)

    ###################################################################################################
    # Writing output
    ###################################################################################################

    # Creating output file.  os.path.join inserts the separator that plain
    # concatenation omitted when the location (e.g. the getcwd() default)
    # has no trailing slash.
    output_file_name = os.path.join(options.output_location,
                                    options.footprint_name + ".bed")
    footprints.write_bed(output_file_name)

    # Verifying condition to write bb
    if options.print_bb:

        # Fetching file with chromosome sizes
        genome_data = GenomeData(options.organism)
        chrom_sizes_file = genome_data.get_chromosome_sizes()

        # Converting to big bed
        output_bb_name = os.path.join(options.output_location,
                                      options.footprint_name + ".bb")
        # NOTE(review): the command is a space-joined shell string, so paths
        # containing spaces or shell metacharacters will break it; a
        # subprocess call with an argument list would be safer.
        try:
            exit_status = system(" ".join([
                "bedToBigBed", output_file_name, chrom_sizes_file,
                output_bb_name
            ]))
            #remove(output_file_name)
        except Exception:
            error_handler.throw_error("FP_BB_CREATION")
        else:
            # A failing command is reported through its exit status rather
            # than an exception, so check that as well.
            if exit_status:
                error_handler.throw_error("FP_BB_CREATION")
def test_2():
    """Check every E-step stage of GaussianHMMCUDAImpl against GaussianHMM.

    Generator-style test with a fixed NumPy seed (42).  For each state count
    a random diagonal-covariance model and observation sequence are drawn,
    the E-step is run once through ``GaussianHMMCUDAImpl.do_estep`` and step
    by step through the private helpers of ``GaussianHMM``.  A zero-argument
    callable is yielded for the frame log-probabilities, the forward and
    backward lattices, the posteriors, and then each sufficient statistic
    (``'trans'``, ``'post'``, ``'obs'``, ``'obs**2'``); each asserts agreement
    to 3 decimals when invoked.
    """
    np.random.seed(42)
    n_features = 32
    length = 20

    def almost_equal_check(expected, actual):
        # Bind the two arrays *now*.  A bare ``lambda`` closing over the loop
        # variables would see whatever they hold when the runner finally
        # calls it, which is wrong once more than one n_states is tested or
        # when the runner collects every case before running any.
        return lambda: np.testing.assert_array_almost_equal(
            expected, actual, decimal=3)

    #for n_states in [3, 4, 5, 7, 8, 9, 15, 16, 17, 31, 32]:
    for n_states in [8]:
        t1 = np.random.randn(length, n_features)
        means = np.random.randn(n_states, n_features)
        variances = np.random.rand(n_states, n_features)
        transmat = np.random.rand(n_states, n_states)
        # Normalise each row so it sums to one.
        transmat = transmat / np.sum(transmat, axis=1)[:, None]
        startprob = np.random.rand(n_states)
        startprob = startprob / np.sum(startprob)

        # Implementation under test.
        cuhmm = GaussianHMMCUDAImpl(n_states, n_features)
        cuhmm._sequences = [t1]
        pyhmm = GaussianHMM(n_components=n_states, init_params='', params='',
                            covariance_type='diag')
        cuhmm.means_ = means
        cuhmm.vars_ = variances
        cuhmm.transmat_ = transmat
        cuhmm.startprob_ = startprob
        _, custats = cuhmm.do_estep()

        # Reference implementation, with fitting/initialisation disabled.
        pyhmm.means_ = means
        pyhmm.covars_ = variances
        pyhmm.transmat_ = transmat
        pyhmm.startprob_ = startprob
        # Kept from the original test; the returned dict is discarded here
        # and a fresh one is created further down.
        pyhmm._initialize_sufficient_statistics()

        framelogprob = pyhmm._compute_log_likelihood(t1)
        cuframelogprob = cuhmm._get_framelogprob()
        yield almost_equal_check(framelogprob, cuframelogprob)

        fwdlattice = pyhmm._do_forward_pass(framelogprob)[1]
        cufwdlattice = cuhmm._get_fwdlattice()
        yield almost_equal_check(fwdlattice, cufwdlattice)

        bwdlattice = pyhmm._do_backward_pass(framelogprob)
        cubwdlattice = cuhmm._get_bwdlattice()
        yield almost_equal_check(bwdlattice, cubwdlattice)

        gamma = fwdlattice + bwdlattice
        posteriors = np.exp(gamma.T - logsumexp(gamma, axis=1)).T
        cuposteriors = cuhmm._get_posteriors()
        yield almost_equal_check(posteriors, cuposteriors)

        stats = pyhmm._initialize_sufficient_statistics()
        pyhmm._accumulate_sufficient_statistics(
            stats, t1, framelogprob, posteriors, fwdlattice, bwdlattice,
            'stmc')

        # Debug output.  Single-argument print(...) emits the same text under
        # Python 2 (parenthesised expression) and Python 3 (function call).
        print('ref transcounts')
        print(transitioncounts(cufwdlattice, cubwdlattice, cuframelogprob,
                               np.log(transmat)))
        print('cutranscounts')
        print(custats['trans'])

        for key in ('trans', 'post', 'obs', 'obs**2'):
            yield almost_equal_check(stats[key], custats[key])