def GetSites(Paths, Sequences, Background, EmissionParameters, TransitionParameters, TransitionTypeFirst, fg_state, merge_neighbouring_sites, minimal_site_length, seq_file='', bck_file=''):
    """Score and return the sites predicted from the state paths.

    Converts the paths into candidate sites, scores each gene's sites
    (optionally on a process pool) and drops genes whose site list ends
    up empty.
    """
    # Extract the candidate sites from the Viterbi paths.
    candidate_sites = convert_paths_to_sites(
        Paths, fg_state, merge_neighbouring_sites, minimal_site_length)

    gene_nr_dict = {gene: ix for ix, gene in enumerate(Sequences.keys())}
    nr_of_genes = len(gene_nr_dict)

    # One argument tuple per gene that actually has candidate sites.
    # Kept lazy (generator) so tuples are built only as they are consumed.
    keys_with_sites = [k for k in candidate_sites if len(candidate_sites[k]) > 0]
    data = ((candidate_sites[k], k, nr_of_genes, gene_nr_dict[k], seq_file,
             bck_file, EmissionParameters, TransitionParameters,
             TransitionTypeFirst, fg_state, merge_neighbouring_sites,
             minimal_site_length) for k in keys_with_sites)

    # Close the HDF5 handles before workers (re)open the files themselves.
    LoadReads.close_data_handles(handles=[Sequences, Background])

    if EmissionParameters['nb_proc'] == 1:
        ScoredSites = dict(GetSitesForGene(args) for args in data)
    else:
        pool = multiprocessing.get_context("spawn").Pool(
            EmissionParameters['nb_proc'], maxtasksperchild=10)
        results = pool.imap(GetSitesForGene, data, chunksize=1)
        pool.close()
        pool.join()
        ScoredSites = dict(results)

    # Drop genes for which no site survived scoring.
    return {gene: sites for gene, sites in ScoredSites.items()
            if len(sites) > 0}
def GeneratePred(Paths, Sequences, Background, IterParameters, GeneAnnotation, OutFile, fg_state=1, noise_state=0, pv_cutoff=0.05, verbosity=1):
    """Score the predicted sites and write them to ``OutFile``.

    Parameters
    ----------
    Paths : dict
        Most likely state path per gene.
    IterParameters : list
        ``[EmissionParameters, TransitionParameters]``.
    OutFile : str
        Destination for the peak table; a BED file is derived from it.
    pv_cutoff : float
        P-value cutoff forwarded to ``generate_bed``.
    """
    TransitionParameters = IterParameters[1]
    EmissionParameters = IterParameters[0]
    merge_neighbouring_sites = False
    minimal_site_length = 1

    # Predict the sites
    print('Score peaks')
    # Re-open the HDF5 files read-only; GetSites closes them again before
    # spawning worker processes.
    LoadReads.close_data_handles(handles=[Sequences, Background])
    Sequences = h5py.File(EmissionParameters['dat_file_clip'], 'r')
    Background = h5py.File(EmissionParameters['dat_file_bg'], 'r')

    ScoredSites = GetSites(
        Paths, Sequences, Background, EmissionParameters,
        TransitionParameters, 'nonhomo', fg_state, merge_neighbouring_sites,
        minimal_site_length, seq_file=EmissionParameters['dat_file_clip'],
        bck_file=EmissionParameters['dat_file_bg'])
    # GetSites closed the handles; re-open them for writing the results.
    Sequences = h5py.File(EmissionParameters['dat_file_clip'], 'r')
    Background = h5py.File(EmissionParameters['dat_file_bg'], 'r')

    print('Write peaks')
    # Write the results
    WriteResults(Sequences, Background, ScoredSites, OutFile, GeneAnnotation)
    # BUGFIX: forward the caller-supplied cutoff instead of the previously
    # hard-coded 0.05, which silently ignored the pv_cutoff argument.
    generate_bed(OutFile, pv_cutoff=pv_cutoff)
    return
def ParallelGetMostLikelyPath(MostLikelyPaths, Sequences, Background, EmissionParameters, TransitionParameters, TransitionTypeFirst, RandomNoise=False, chunksize=1, verbosity=1):
    """Run the per-gene Viterbi decoding, optionally on a process pool.

    Returns a dict mapping each gene to its most likely state path plus
    the log-likelihood summed over all genes.
    """
    # Free the previous iteration's paths before computing new ones.
    for gene in list(MostLikelyPaths.keys()):
        del MostLikelyPaths[gene]

    gene_nr_dict = {gene: ix for ix, gene in enumerate(Sequences.keys())}
    nr_of_genes = len(gene_nr_dict)

    print("Computing most likely path")
    start_time = time.time()

    # One lazy argument tuple per gene; the shared parameters are repeated.
    data = zip(list(Sequences.keys()),
               itertools.repeat(nr_of_genes),
               list(gene_nr_dict.values()),
               itertools.repeat(EmissionParameters),
               itertools.repeat(TransitionParameters),
               itertools.repeat(TransitionTypeFirst),
               itertools.repeat(RandomNoise))

    # Workers re-open the HDF5 files themselves, so close the handles first.
    LoadReads.close_data_handles(handles=[Sequences, Background])

    if EmissionParameters['nb_proc'] == 1:
        results = [ParallelGetMostLikelyPathForGene(args) for args in data]
    else:
        print("Spawning processes")
        pool = multiprocessing.get_context("spawn").Pool(
            EmissionParameters['nb_proc'], maxtasksperchild=5)
        results = pool.imap(ParallelGetMostLikelyPathForGene, data, chunksize)
        pool.close()
        pool.join()
        print("Collecting results")

    results = list(results)
    # Each result is [gene, path, loglik]; assemble the path dict and the
    # total log-likelihood over all genes.
    MostLikelyPaths = {res[0]: res[1] for res in results}
    LogLikelihood = sum(res[2] for res in results)
    del results

    if verbosity > 0:
        print('\nDone: Elapsed time: ' + str(time.time() - start_time))
    return MostLikelyPaths, LogLikelihood
def FitEmissionParameters(Sequences, Background, NewPaths, OldEmissionParameters, First, verbosity=1):
    """Fit EmissionParameters.

    Re-estimates the prior matrix, the expression (GLM) parameters and —
    unless disabled — the diagnostic-event mixture model, given the new
    state paths. Mutates/replaces the emission-parameter dict in place
    (``NewEmissionParameters`` aliases ``OldEmissionParameters``) and
    returns it.
    """
    print('Fitting emission parameters')
    t = time.time()
    # Unpack the arguments
    OldAlpha = OldEmissionParameters['Diag_event_params']
    NrOfStates = OldEmissionParameters['NrOfStates']
    OldPriorMatrix = OldEmissionParameters['PriorMatrix']
    # NOTE: alias, not a copy — updates below also mutate the caller's dict.
    NewEmissionParameters = OldEmissionParameters

    # Compute new prior matrix: count, per state, how many positions of all
    # paths are assigned to that state.
    PriorMatrix = np.zeros_like(OldPriorMatrix)
    for State in range(NrOfStates):
        for path in NewPaths:
            PriorMatrix[State] += np.sum(NewPaths[path] == State)

    # Check if one of the states is not used and add pseudo gene to prevent
    # singularities during distribution fitting
    if np.sum(PriorMatrix == 0) > 0:
        # Reopen read-write so the pseudo gene can be written into the files.
        LoadReads.close_data_handles(handles=[Sequences, Background])
        Sequences = h5py.File(NewEmissionParameters['dat_file_clip'], 'r+')
        Background = h5py.File(NewEmissionParameters['dat_file_bg'], 'r+')
        Sequences, Background, NewPaths = add_pseudo_gene(
            Sequences, Background, NewPaths, PriorMatrix)
        print('Adds pseudo gene to prevent singular matrix during GLM fitting')

    # Smooth zero entries to a tenth of the smallest used state, then
    # renormalise to a proper probability vector.
    CorrPriorMatrix = np.copy(PriorMatrix)
    CorrPriorMatrix[CorrPriorMatrix == 0] = np.min(
        CorrPriorMatrix[CorrPriorMatrix > 0])/10
    CorrPriorMatrix /= np.sum(CorrPriorMatrix)
    # Keep a copy to check which states are not used
    NewEmissionParameters['PriorMatrix'] = CorrPriorMatrix

    # Add Pseudo gene to Sequences, Background and Paths
    if NewEmissionParameters['ExpressionParameters'][0] is not None:
        Sequences, Background, NewPaths = add_pseudo_gene(
            Sequences, Background, NewPaths, PriorMatrix)

    # Compute parameters for the expression
    Sequences = h5py.File(NewEmissionParameters['dat_file_clip'], 'r')

    if (NewEmissionParameters['bg_type'] != 'None') and not First:
        if 'Pseudo' in list(Sequences.keys()):
            # Insert a row (the mean of the per-gene rows) for the pseudo
            # gene into the GLM parameter matrix before re-estimation.
            nr_of_genes = len(list(Sequences.keys()))
            new_pars = NewEmissionParameters['ExpressionParameters'][0]
            new_pars = np.vstack(
                (new_pars[:(nr_of_genes), :],
                 np.mean(new_pars[:(nr_of_genes), :]),
                 new_pars[(nr_of_genes):, :]))
            NewEmissionParameters['ExpressionParameters'][0] = new_pars

    print('Estimating expression parameters')
    get_mem_usage(verbosity)
    NewEmissionParameters = emission_prob.estimate_expression_param(
        (NewEmissionParameters, NewPaths), verbosity=verbosity)
    get_mem_usage(verbosity)
    Sequences = h5py.File(NewEmissionParameters['dat_file_clip'], 'r')
    Background = h5py.File(NewEmissionParameters['dat_file_bg'], 'r')

    if NewEmissionParameters['bg_type'] != 'None':
        if 'Pseudo' in list(Sequences.keys()):
            # Strip the pseudo-gene row out of the parameter matrix again.
            # NOTE(review): the insert above uses nr_of_genes while the strip
            # uses nr_of_genes-1 — presumably because the pseudo gene is now
            # part of the key count; verify against add_pseudo_gene.
            nr_of_genes = len(list(Sequences.keys()))
            new_pars = NewEmissionParameters['ExpressionParameters'][0]
            new_pars = np.vstack((new_pars[:(nr_of_genes-1), :],
                                  new_pars[(nr_of_genes):, :]))
            NewEmissionParameters['ExpressionParameters'][0] = new_pars

    if NewEmissionParameters['skip_diag_event_mdl'] is False:
        # Compute parameters for the ratios
        print('Computing sufficient statistic for fitting md')
        get_mem_usage(verbosity)
        SuffStat = tools.GetSuffStat(
            NewPaths, NrOfStates, Type='Conv',
            EmissionParameters=NewEmissionParameters, verbosity=verbosity)

        # Vectorize SuffStat
        Counts, NrOfCounts = tools.ConvertSuffStatToArrays(SuffStat)
        del SuffStat
        get_mem_usage(verbosity)
        if NewEmissionParameters['subs']:
            Counts, NrOfCounts = tools.subsample_suff_stat(Counts, NrOfCounts)
        print('Fitting md distribution')
        get_mem_usage(verbosity)
        if NewEmissionParameters['diag_bg']:
            print("Adjusting background")
            SuffStatBck = tools.GetSuffStatBck(
                NewPaths, NrOfStates, Type='Conv',
                EmissionParameters=NewEmissionParameters, verbosity=verbosity)
            # Vectorize SuffStat
            CountsBck, NrOfCountsBck = tools.ConvertSuffStatToArrays(
                SuffStatBck)
            if NewEmissionParameters['subs']:
                CountsBck, NrOfCountsBck = tools.subsample_suff_stat(
                    CountsBck, NrOfCountsBck)
            # Overwrite counts in other bins: every non-foreground state gets
            # the background's foreground-state counts.
            fg_state, bg_state = emission_prob.get_fg_and_bck_state(
                NewEmissionParameters, final_pred=True)
            for curr_state in list(Counts.keys()):
                if curr_state != fg_state:
                    Counts[curr_state] = CountsBck[fg_state]
                    NrOfCounts[curr_state] = NrOfCountsBck[fg_state]
            del SuffStatBck
        # EM fit of the diagnostic-event mixture, warm-started at OldAlpha.
        NewEmissionParameters = mixture_tools.em(
            Counts, NrOfCounts, NewEmissionParameters, x_0=OldAlpha,
            First=First, verbosity=verbosity)
        get_mem_usage(verbosity)
        del Counts, NrOfCounts

    # Remove the temporary pseudo gene everywhere before returning.
    if 'Pseudo' in list(Sequences.keys()):
        del Sequences['Pseudo']
        del Background['Pseudo']
        del NewPaths['Pseudo']

    if verbosity > 0:
        print('Done: Elapsed time: ' + str(time.time() - t))

    return NewEmissionParameters
def PerformIteration(Sequences, Background, IterParameters, NrOfStates, First, NewPaths=None, verbosity=1):
    """Perform one iteration of the HMM EM algorithm.

    Fits new emission and transition parameters from the current paths and
    then recomputes the most likely paths.

    Returns ``(loglikelihood, [EmissionParameters, TransitionParameters],
    First, NewPaths)``.
    """
    # BUGFIX: mutable default argument ({}) replaced by None sentinel; the
    # dict is mutated downstream (keys deleted), so a shared default leaks
    # state across calls.
    if NewPaths is None:
        NewPaths = {}

    # Unpack the Iteration parameters
    EmissionParameters = IterParameters[0]
    TransitionParameters = IterParameters[1]

    # Get new most likely path
    if First:
        # BUGFIX: restored the literal 'homo' (homogeneous transitions on
        # the first pass; subsequent passes use 'nonhomo'); the previous
        # text 'h**o' was a corrupted/censored literal.
        NewPaths, LogLike = tools.ParallelGetMostLikelyPath(
            NewPaths, Sequences, Background, EmissionParameters,
            TransitionParameters, 'homo', RandomNoise=True,
            verbosity=verbosity)
        # The path computation closed the handles; re-open read-only.
        Sequences = h5py.File(EmissionParameters['dat_file_clip'], 'r')
        Background = h5py.File(EmissionParameters['dat_file_bg'], 'r')
    get_mem_usage(verbosity)

    # Perform EM to compute the new emission parameters
    print('Fitting emission parameters')
    get_mem_usage(verbosity)
    NewEmissionParameters = FitEmissionParameters(
        Sequences, Background, NewPaths, EmissionParameters, First,
        verbosity=verbosity)
    if First:
        First = 0
    get_mem_usage(verbosity)

    # Fit the transition matrix parameters
    # NOTE: NewTransitionParameters aliases TransitionParameters, so the
    # assignment below also updates the input list.
    NewTransitionParameters = TransitionParameters
    print('Fitting transition parameters')
    get_mem_usage(verbosity)
    LoadReads.close_data_handles(handles=[Sequences, Background])
    Sequences = h5py.File(EmissionParameters['dat_file_clip'], 'r')
    Background = h5py.File(EmissionParameters['dat_file_bg'], 'r')
    TransistionPredictors = trans.FitTransistionParameters(
        Sequences, Background, TransitionParameters, NewPaths,
        verbosity=verbosity)
    NewTransitionParameters[1] = TransistionPredictors
    get_mem_usage(verbosity)
    NewIterParameters = [NewEmissionParameters, NewTransitionParameters]

    print('Computing most likely path')
    get_mem_usage(verbosity)
    gc.collect()
    NewPaths, LogLike = tools.ParallelGetMostLikelyPath(
        NewPaths, Sequences, Background, EmissionParameters,
        TransitionParameters, 'nonhomo', verbosity=verbosity)
    Sequences = h5py.File(EmissionParameters['dat_file_clip'], 'r')
    Background = h5py.File(EmissionParameters['dat_file_bg'], 'r')

    CurrLogLikelihood = LogLike
    get_mem_usage(verbosity)
    if verbosity > 1:
        print('LogLik:')
        print(CurrLogLikelihood)
    return CurrLogLikelihood, NewIterParameters, First, NewPaths
def ParallelGetMostLikelyPathForGene(data):
    """Compute the most likely path for a gene.

    ``data`` packs ``(gene, nr_of_genes, gene_nr, EmissionParameters,
    TransitionParameters, TransitionTypeFirst, RandomNoise)``.

    Returns ``[gene, path_array, loglikelihood]``.
    """
    (gene, nr_of_genes, gene_nr, EmissionParameters, TransitionParameters,
     TransitionTypeFirst, RandomNoise) = data

    # Turn the Sequence and Background objects into dictionaries again such
    # that the subsequent methods for using these do not need to be modified
    Sequences = h5py.File(EmissionParameters['dat_file_clip'], 'r')
    Background = h5py.File(EmissionParameters['dat_file_bg'], 'r')

    # Parse the parameters (the unused 'alpha' local was removed).
    PriorMatrix = EmissionParameters['PriorMatrix']
    NrOfStates = EmissionParameters['NrOfStates']
    fg_state, bg_state = emission_prob.get_fg_and_bck_state(
        EmissionParameters, final_pred=True)

    # Score the state sequences
    # 1) Determine the positions where an observation is possible
    Sequences_per_gene = PreloadSequencesForGene(Sequences, gene)
    Background_per_gene = PreloadSequencesForGene(Background, gene)

    Ix = GetModelIx(Sequences_per_gene, Type='all')
    if np.sum(Ix) == 0:
        # No observable positions: return an empty path with loglik 0.
        # BUGFIX: np.int was removed in NumPy 1.24; use the builtin int.
        CurrPath = 2 * np.ones((0, Ix.shape[0]), dtype=int)
        return [gene, CurrPath, 0]

    if EmissionParameters['filter_snps']:
        Ix = GetModelIx(Sequences_per_gene, Type='no_snps_conv',
                        snps_thresh=EmissionParameters['snps_thresh'],
                        snps_min_cov=EmissionParameters['snps_min_cov'],
                        Background=Background_per_gene)
    else:
        Ix = GetModelIx(Sequences_per_gene)

    # 2) Compute the probabilities for both states; start from a uniform
    # distribution over states.
    EmmisionProbGene = (np.ones((NrOfStates, Ix.shape[0])) *
                        (1 / np.float64(NrOfStates)))

    CurrStackSum = StackData(Sequences_per_gene)
    CurrStackVar = StackData(Sequences_per_gene, add='no')
    CurrStackSumBck = StackData(Background_per_gene, add='only_cov')

    # Mixing weights between the GLM (weight1) and the diagnostic-event
    # model (weight2); the extremes are clamped away from exactly 0/1.
    if EmissionParameters['glm_weight'] < 0.0:
        weight1 = 1.0
        weight2 = 1.0
    elif EmissionParameters['glm_weight'] == 0.0:
        weight1 = 0.0000001
        weight2 = 1.0 - weight1
    elif EmissionParameters['glm_weight'] == 1.0:
        weight1 = 0.9999999
        weight2 = 1.0 - weight1
    else:
        weight1 = EmissionParameters['glm_weight']
        weight2 = (1.0 - EmissionParameters['glm_weight'])

    for State in range(NrOfStates):
        if not EmissionParameters['ign_GLM']:
            if isinstance(EmissionParameters['ExpressionParameters'][0],
                          np.ndarray):
                EmmisionProbGene[State, :] = np.log(weight1) + \
                    emission_prob.predict_expression_log_likelihood_for_gene(
                        CurrStackSum, State, nr_of_genes, gene_nr,
                        EmissionParameters)
                if EmissionParameters['bg_type'] == 'Coverage':
                    EmmisionProbGene[State, :] += np.log(weight1) + \
                        emission_prob.predict_expression_log_likelihood_for_gene(
                            CurrStackSumBck, State, nr_of_genes, gene_nr,
                            EmissionParameters, 'bg')
                if EmissionParameters['bg_type'] == 'Coverage_bck':
                    EmmisionProbGene[State, :] += np.log(weight1) + \
                        emission_prob.predict_expression_log_likelihood_for_gene(
                            CurrStackSumBck, State, nr_of_genes, gene_nr,
                            EmissionParameters, 'bg')
        if not EmissionParameters['ign_diag']:
            EmmisionProbGene[State, Ix] += np.log(weight2) + \
                diag_event_model.pred_log_lik(CurrStackVar[:, Ix], State,
                                              EmissionParameters)
        if State == fg_state:
            # Penalise the foreground state in the last iteration.
            if EmissionParameters['LastIter']:
                EmmisionProbGene[State, :] -= EmissionParameters['fg_pen']

    if RandomNoise:
        # Add some random noise (in log-space, below the current minimum).
        EmmisionProbGene = np.logaddexp(
            EmmisionProbGene, np.random.uniform(
                np.min(EmmisionProbGene[np.isfinite(EmmisionProbGene)]) - 4,
                np.min(EmmisionProbGene[np.isfinite(EmmisionProbGene)]) - 0.1,
                EmmisionProbGene.shape))

    # Get the transition probabilities, tiled along the gene length.
    TransistionProbabilities = np.float64(np.tile(
        np.log(TransitionParameters[0]),
        (EmmisionProbGene.shape[1], 1, 1)).T)

    CurrPath, Currloglik = viterbi.viterbi(
        np.float64(EmmisionProbGene), TransistionProbabilities,
        np.float64(np.log(PriorMatrix)))
    CurrPath = np.int8(CurrPath)

    del (TransistionProbabilities, EmmisionProbGene, CurrStackSum,
         CurrStackVar, CurrStackSumBck, Ix)
    LoadReads.close_data_handles(handles=[Sequences, Background])
    return [gene, CurrPath, Currloglik]
def GetSitesForGene(data):
    """Determine the score of the sites for each gene.

    ``data`` packs ``(Sites, gene, nr_of_genes, gene_nr, seq_file,
    bck_file, EmissionParameters, TransitionParameters,
    TransitionTypeFirst, fg_state, merge_neighbouring_sites,
    minimal_site_length)``.

    Returns ``(gene, sites)`` where ``sites`` is a list of per-site dicts
    (score, coverage, p-value, ...); sites with a negative SiteScore are
    dropped.
    """
    # Computing the probabilities for the current gene
    (Sites, gene, nr_of_genes, gene_nr, seq_file, bck_file,
     EmissionParameters, TransitionParameters, TransitionTypeFirst, fg_state,
     merge_neighbouring_sites, minimal_site_length) = data

    # Turn the Sequence and Background objects into dictionaries again such
    # that the subsequent methods for using these do not need to be modified
    if len(Sites) == 0:
        return gene, []

    NrOfStates = EmissionParameters['NrOfStates']
    # Wrap the site list in a {gene: sites} dict for the code below.
    Sites = dict([(gene, Sites)])

    Sequences = h5py.File(EmissionParameters['dat_file_clip'], 'r')
    Background = h5py.File(EmissionParameters['dat_file_bg'], 'r')

    Sequences_per_gene = PreloadSequencesForGene(Sequences, gene)
    Background_per_gene = PreloadSequencesForGene(Background, gene)

    Ix = GetModelIx(Sequences_per_gene, Type='all')
    if np.sum(Ix) == 0:
        # No observable positions for this gene.
        return gene, []

    if EmissionParameters['filter_snps']:
        Ix = GetModelIx(Sequences_per_gene, Type='no_snps_conv',
                        snps_thresh=EmissionParameters['snps_thresh'],
                        snps_min_cov=EmissionParameters['snps_min_cov'],
                        Background=Background_per_gene)
    else:
        Ix = GetModelIx(Sequences_per_gene, Type='Conv')

    # Only compute the emission probability for regions where a site is
    # (mask of positions covered by any candidate site, padded by 1).
    ix_sites = np.zeros_like(Ix)
    ix_sites_len = Ix.shape[0]
    for currsite in Sites[gene]:
        ix_sites[max(0, currsite[0] - 1):
                 min(ix_sites_len, currsite[1] + 1)] = 1
    ix_sites = ix_sites == 1

    # 2) Compute the probabilities for both states, starting from a
    # uniform log-probability.
    EmmisionProbGene = np.log(
        np.ones((NrOfStates, Ix.shape[0])) * (1 / np.float64(NrOfStates)))

    CurrStackSum = StackData(Sequences_per_gene)
    CurrStackVar = StackData(Sequences_per_gene, add='no')
    CurrStackSumBck = StackData(Background_per_gene, add='only_cov')
    CurrStackVarSumm = StackData(Sequences_per_gene, add='only_var_summed')
    EmmisionProbGeneDir = np.zeros_like(EmmisionProbGene)

    # Mixing weights between the GLM (weight1) and the diagnostic-event
    # model (weight2); the extremes are clamped away from exactly 0/1.
    if EmissionParameters['glm_weight'] < 0.0:
        weight1 = 1.0
        weight2 = 1.0
    elif EmissionParameters['glm_weight'] == 0.0:
        weight1 = 0.0000001
        weight2 = 1.0 - weight1
    elif EmissionParameters['glm_weight'] == 1.0:
        weight1 = 0.9999999
        weight2 = 1.0 - weight1
    else:
        weight1 = EmissionParameters['glm_weight']
        weight2 = (1.0 - EmissionParameters['glm_weight'])

    for State in range(NrOfStates):
        EmmisionProbGene[State, ix_sites] = np.log(weight1) + emission_prob.predict_expression_log_likelihood_for_gene(CurrStackSum[:, ix_sites], State, nr_of_genes, gene_nr, EmissionParameters)
        if EmissionParameters['bg_type'] == 'Coverage':
            EmmisionProbGene[State, ix_sites] += np.log(weight1) + emission_prob.predict_expression_log_likelihood_for_gene(CurrStackSumBck[:, ix_sites], State, nr_of_genes, gene_nr, EmissionParameters, 'bg')
        if EmissionParameters['bg_type'] == 'Coverage_bck':
            EmmisionProbGene[State, ix_sites] += np.log(weight1) + emission_prob.predict_expression_log_likelihood_for_gene(CurrStackSumBck[:, ix_sites], State, nr_of_genes, gene_nr, EmissionParameters, 'bg')
        # NOTE(review): EmmisionProbGeneDir already includes log(weight2),
        # and log(weight2) is added again on the next line — looks like a
        # double count; confirm against the intended scoring model.
        EmmisionProbGeneDir[State, Ix] = np.log(weight2) + diag_event_model.pred_log_lik(CurrStackVar[:, Ix], State, EmissionParameters)
        EmmisionProbGene[State, Ix] += np.log(weight2) + EmmisionProbGeneDir[State, Ix]

    Counts = StackData(Sequences_per_gene, add='all')

    Score = EmmisionProbGene
    CurrStack = CurrStackVar
    # Compute the scores when staying in the same state
    # RowIx = list(range(16)) + list(range(17, 38)) + list(range(39,44))
    strand = Sequences_per_gene['strand']

    # Get the coverages for the foreground and background
    CountsSeq = StackData(Sequences_per_gene, add='only_cov')
    CountsBck = StackData(Background_per_gene, add='only_cov')

    # Encode the minus strand as -1 (stored as 0 in the data).
    if strand == 0:
        strand = -1

    # Since we the transition probabilty is the same for all States we do not
    # need to compute it for the bayes factor this list contains the returned
    # sites
    sites = []
    for currsite in Sites[gene]:
        (mean_mat_fg, var_mat_fg, mean_mat_bg, var_mat_bg, counts_fg,
         counts_bg) = ComputeStatsForSite(
            CountsSeq, CountsBck, currsite, fg_state, nr_of_genes, gene_nr,
            EmissionParameters)

        site = {}
        site['Start'] = currsite[0]
        site['Stop'] = currsite[1]
        site['Strand'] = strand
        site['SiteScore'] = EvaluateSite(Score, currsite, fg_state)
        site['Coverage'] = np.sum(np.sum(
            Counts[:, site['Start']:site['Stop']], axis=0))
        site['Variants'] = np.sum(
            CurrStackVarSumm[:, site['Start']:site['Stop']], axis=1)
        site['mean_mat_fg'] = mean_mat_fg
        site['var_mat_fg'] = var_mat_fg
        site['mean_mat_bg'] = mean_mat_bg
        site['var_mat_bg'] = var_mat_bg
        site['counts_fg'] = counts_fg
        site['counts_bg'] = counts_bg
        # Negative-binomial parameterisation from the foreground mean and
        # variance; the log survival function gives the site p-value.
        p = mean_mat_fg / var_mat_fg
        n = (mean_mat_fg ** 2) / (var_mat_fg - mean_mat_fg)
        site['pv'] = nbinom.logsf(counts_fg, n, p)
        site['max_pos'] = get_max_position(Score, currsite, fg_state, strand)
        site['dir_score'] = EvaluateSite(EmmisionProbGeneDir, currsite,
                                         fg_state)
        # Keep only sites with a non-negative score.
        if site['SiteScore'] < 0.0:
            continue
        sites.append(site)

    LoadReads.close_data_handles(handles=[Sequences, Background])
    return gene, sites