def em(counts, nr_of_counts, EmissionParameters, x_0=None, First=False, max_nr_iter=15, tol=0.0001, rand_sample_size=10, verbosity=1):
    '''
    Update the diagnostic-event mixture parameters for every state in counts.

    For each key of counts the parameters 'alpha' and 'mix_comp' stored under
    EmissionParameters['Diag_event_params'] are replaced. They are either
    estimated with Parallel_estimate_mixture_params or, when
    EmissionParameters['diag_bg'] is set, copied from the first
    non-foreground state that was estimated (the "template" state).

    x_0, First and tol are accepted but not used in this function.

    Returns the (in place modified) EmissionParameters.
    '''

    fg_state, bg_state = emission_prob.get_fg_and_bck_state(EmissionParameters, final_pred=True)
    # Estimation always starts from the parameters as they were on entry.
    OldEmissionParameters = deepcopy(EmissionParameters)

    template_state = 3
    template_ready = False

    for curr_state in list(counts.keys()):
        shares_template = EmissionParameters['diag_bg'] and curr_state != fg_state

        if shares_template and template_ready:
            # A template was already fitted: reuse it instead of refitting.
            print('Using template state ' + str(curr_state))
            diag_params = EmissionParameters['Diag_event_params']
            diag_params['mix_comp'][curr_state] = deepcopy(diag_params['mix_comp'][template_state])
            diag_params['alpha'][curr_state] = deepcopy(diag_params['alpha'][template_state])
            continue

        if shares_template:
            # First non-foreground state: fit it below and remember it.
            print('setting template state ' + str(curr_state))
            template_ready = True
            template_state = curr_state

        print('Estimating state ' + str(curr_state))
        alpha, mixtures = Parallel_estimate_mixture_params(
            OldEmissionParameters,
            counts[curr_state],
            nr_of_counts[curr_state],
            curr_state,
            rand_sample_size,
            max_nr_iter,
            nr_of_iter=20,
            stop_crit=1.0,
            nr_of_init=10,
            verbosity=verbosity)

        EmissionParameters['Diag_event_params']['alpha'][curr_state] = alpha
        EmissionParameters['Diag_event_params']['mix_comp'][curr_state] = mixtures

    return EmissionParameters
def FitEmissionParameters(Sequences, Background, NewPaths, OldEmissionParameters, First, verbosity=1):
    '''
    Re-fit the emission parameters given the current state paths.

    The steps are, in order:
      1) recompute the state prior from how often each state occurs in
         NewPaths and store the normalised result under 'PriorMatrix';
      2) add a pseudo gene (add_pseudo_gene) if a state is unused, and again
         if expression parameters already exist;
      3) estimate the expression parameters
         (emission_prob.estimate_expression_param);
      4) unless the diagnostic event model is skipped, gather sufficient
         statistics and fit the mixture model (mixture_tools.em);
      5) delete the 'Pseudo' entries from Sequences, Background and NewPaths.

    OldEmissionParameters is modified in place: the working dictionary starts
    out as an alias of it.

    Returns the new emission parameter dictionary.
    '''

    def report_memory():
        # Peak resident set size of this process, printed only when verbose.
        if verbosity > 0:
            print('Memory usage: %s (kb)' % resource.getrusage(resource.RUSAGE_SELF).ru_maxrss)

    print('Fitting emission parameters')
    t = time.time()
    #Unpack the arguments
    OldAlpha = OldEmissionParameters['Diag_event_params']
    NrOfStates = OldEmissionParameters['NrOfStates']
    OldPriorMatrix = OldEmissionParameters['PriorMatrix']
    NewEmissionParameters = OldEmissionParameters

    #Compute new prior matrix
    PriorMatrix = np.zeros_like(OldPriorMatrix)
    for State in range(NrOfStates):
        for path in NewPaths:
            PriorMatrix[State] += np.sum(NewPaths[path] == State)

    #Check if one of the states is not used and add pseudo gene to prevent singularities during distribution fitting
    if np.sum(PriorMatrix == 0) > 0:
        # Reopen the read files writable so that the pseudo gene can be added.
        Sequences.close()
        Background.close()
        Sequences = h5py.File(NewEmissionParameters['DataOutFile_seq'], 'r+')
        Background = h5py.File(NewEmissionParameters['DataOutFile_bck'], 'r+')
        Sequences, Background, NewPaths, pseudo_gene_names = add_pseudo_gene(Sequences, Background, NewPaths, PriorMatrix)
        Sequences.close()
        Background.close()
        print('Addes pseudo gene to prevent singular matrix during GLM fitting')

    # Unused states get a tenth of the smallest observed count before normalising.
    CorrectedPriorMatrix = np.copy(PriorMatrix)
    CorrectedPriorMatrix[CorrectedPriorMatrix == 0] = np.min(CorrectedPriorMatrix[CorrectedPriorMatrix > 0])/10
    CorrectedPriorMatrix /= np.sum(CorrectedPriorMatrix)
    #Keep a copy to check which states are not used
    NewEmissionParameters['PriorMatrix'] = CorrectedPriorMatrix

    #Add Pseudo gene to Sequences, Background and Paths
    if NewEmissionParameters['ExpressionParameters'][0] is not None:
        Sequences, Background, NewPaths, pseudo_gene_names = add_pseudo_gene(Sequences, Background, NewPaths, PriorMatrix)

    #Compute parameters for the expression
    sample_size = 10000

    if NewEmissionParameters['BckType'] != 'None':
        if 'Pseudo' in Sequences:
            # Insert a row for the pseudo gene into the expression parameters.
            # NOTE(review): np.mean is called without an axis, so a single
            # scalar is stacked here - confirm this is the intended row.
            nr_of_genes = len(list(Sequences.keys()))
            new_pars = NewEmissionParameters['ExpressionParameters'][0]
            new_pars = np.vstack((new_pars[:(nr_of_genes), :], np.mean(new_pars[:(nr_of_genes), :]), new_pars[(nr_of_genes):, :]))
            NewEmissionParameters['ExpressionParameters'][0] = new_pars

    print('Estimating expression parameters')
    report_memory()

    bg_type = NewEmissionParameters['BckType']
    expr_data = (NewEmissionParameters, Sequences, Background, NewPaths, sample_size, bg_type)
    NewEmissionParameters = emission_prob.estimate_expression_param(expr_data, verbosity=verbosity)

    report_memory()

    if NewEmissionParameters['BckType'] != 'None':
        if 'Pseudo' in Sequences:
            # Drop the pseudo gene row again.
            nr_of_genes = len(list(Sequences.keys()))
            new_pars = NewEmissionParameters['ExpressionParameters'][0]
            new_pars = np.vstack((new_pars[:(nr_of_genes-1), :], new_pars[(nr_of_genes):, :]))
            NewEmissionParameters['ExpressionParameters'][0] = new_pars

    # The second operand used to read a name `EmissionParameters` that is not
    # bound in this function; it now reads the working dictionary.
    if (NewEmissionParameters['skip_diag_event_mdl'] == False) or (not (NewEmissionParameters['use_precomp_diagmod'] is None)):
        #Compute parameters for the ratios
        print('Computing sufficient statistic for fitting md')
        report_memory()
        SuffStat = tools.GetSuffStat(Sequences, Background, NewPaths, NrOfStates, Type='Conv', EmissionParameters=NewEmissionParameters, verbosity=verbosity)
        #Vectorize SuffStat
        Counts, NrOfCounts = tools.ConvertSuffStatToArrays(SuffStat)
        del SuffStat
        report_memory()
        if NewEmissionParameters['Subsample']:
            Counts, NrOfCounts = tools.subsample_suff_stat(Counts, NrOfCounts)

        print('Fitting md distribution')
        report_memory()

        if NewEmissionParameters['diag_bg']:
            print("Adjusting background")
            SuffStatBck = tools.GetSuffStatBck(Sequences, Background, NewPaths, NrOfStates, Type='Conv', EmissionParameters=NewEmissionParameters, verbosity=verbosity)
            #Vectorize SuffStat
            CountsBck, NrOfCountsBck = tools.ConvertSuffStatToArrays(SuffStatBck)

            if NewEmissionParameters['Subsample']:
                CountsBck, NrOfCountsBck = tools.subsample_suff_stat(CountsBck, NrOfCountsBck)

            #Overwrite counts in other bins
            fg_state, bg_state = emission_prob.get_fg_and_bck_state(NewEmissionParameters, final_pred=True)
            for curr_state in list(Counts.keys()):
                if curr_state != fg_state:
                    Counts[curr_state] = CountsBck[fg_state]
                    NrOfCounts[curr_state] = NrOfCountsBck[fg_state]

            del SuffStatBck

        NewEmissionParameters = mixture_tools.em(Counts, NrOfCounts, NewEmissionParameters, x_0=OldAlpha, First=First, verbosity=verbosity)
        report_memory()
        del Counts, NrOfCounts

    if 'Pseudo' in Sequences:
        del Sequences['Pseudo']
        del Background['Pseudo']
        del NewPaths['Pseudo']

    if verbosity > 0:
        print('Done: Elapsed time: ' + str(time.time() - t))
    return NewEmissionParameters
def pred_sites(args, verbosity=1):
    '''
    Load reads and previously saved iteration parameters and generate the
    site predictions.

    The args parameter is immediately replaced by parser.parse_args().

    NOTE(review): as written this function does not appear to be runnable:
      * EmissionParameters is read in the LoadReads.load_data calls before
        it is assigned further down in this same function;
      * Paths, TransitionParameters and OutFile are never assigned in this
        function - they would have to exist as module globals;
      * tools.GeneratePred is called without Paths as first argument, unlike
        the call in run_omniCLIP - confirm against tools.GeneratePred.
    Confirm whether this function is still used before relying on it.
    '''
    # Get the args
    args = parser.parse_args()
    print(args)

    #Check parameters
    if len(args.fg_libs) == 0:
        raise sys.exit('No CLIP-libraries given')

    # Without background libraries the background type is the string 'None'.
    if len(args.bg_libs) == 0:
        bg_type = 'None'
    else:
        bg_type = args.bg_type

    # Default to the current working directory for all output files.
    if args.out_dir == None:
        out_path = os.getcwd()
    else:
        out_path = args.out_dir

    MaxIter = args.max_it
    # process the parameters
    if not (bg_type == 'Coverage' or bg_type == 'Coverage_bck'):
        print('Bg-type: ' + bg_type + ' has not been implemented yet')
        return

    #Load the gene annotation
    print('Loading gene annotation')
    GeneAnnotation = gffutils.FeatureDB(args.gene_anno_file, keep_order=True)
    GenomeDir = args.genome_dir

    #Load the reads
    t = time.time()
    print('Loading reads')
    DataOutFile = os.path.join(out_path, 'fg_reads.dat')
    # NOTE(review): EmissionParameters is not yet bound at this point (see docstring).
    Sequences = LoadReads.load_data(args.fg_libs, GenomeDir, GeneAnnotation, DataOutFile, load_from_file = True, save_results = False, Collapse = args.fg_collapsed, ign_out_rds=EmissionParameters['ign_out_rds'], rev_strand=EmissionParameters['rev_strand'])
    DataOutFile = os.path.join(out_path, 'bg_reads.dat')
    Background = LoadReads.load_data(args.bg_libs, GenomeDir, GeneAnnotation, DataOutFile, load_from_file = True, save_results = False, Collapse = args.bg_collapsed, OnlyCoverage = True, ign_out_rds=EmissionParameters['ign_out_rds'], rev_strand=EmissionParameters['rev_strand'])

    #Removing genes without any reads in the CLIP data
    genes_to_keep = []
    all_genes = list(Sequences.keys())
    for i, gene in enumerate(Sequences.keys()):
        # Total coverage of the gene summed over all replicates.
        curr_cov = np.sum(np.array([np.sum(Sequences[gene]['Coverage'][rep].toarray()) for rep in list(Sequences[gene]['Coverage'].keys())]))

        if curr_cov < 100:
            continue

        genes_to_keep.append(gene)
        # Stop scanning once the gene index exceeds args.gene_sample; genes
        # not visited are deleted below as well.
        if i > args.gene_sample:
            break

    genes_to_del = list(set(all_genes).difference(set(genes_to_keep)))

    for gene in genes_to_del:
        del Sequences[gene]
        del Background[gene]

    del all_genes, genes_to_del, genes_to_keep
    if verbosity > 0: print('Done: Elapsed time: ' + str(time.time() - t))

    #Load data
    # The save file holds [IterParameters, args]; args is replaced by the stored one.
    tmp_file = pickle.load(open(os.path.join(out_path, 'IterSaveFile.dat'), 'rb'))
    IterParameters = tmp_file[0]
    args = tmp_file[1]
    EmissionParameters = IterParameters[0]
    fg_state, bg_state = emission_prob.get_fg_and_bck_state(EmissionParameters, final_pred=True)
    if EmissionParameters['fg_pen'] > 0.0:
        print('Recomputing paths')
        EmissionParameters['LastIter'] = True
        Sequences = h5py.File(EmissionParameters['DataOutFile_seq'], 'r')
        Background = h5py.File(EmissionParameters['DataOutFile_bck'], 'r')
        # NOTE(review): Paths and TransitionParameters are not defined in this function.
        Paths, LogLike = tools.ParallelGetMostLikelyPath(Paths, Sequences, Background, EmissionParameters, TransitionParameters, 'nonhomo', verbosity=EmissionParameters['Verbosity'])
        # presumably the handles are reopened because the call above closes them - TODO confirm
        Sequences = h5py.File(EmissionParameters['DataOutFile_seq'], 'r')
        Background = h5py.File(EmissionParameters['DataOutFile_bck'], 'r')

    # NOTE(review): OutFile is not defined in this function.
    tools.GeneratePred(Sequences, Background, IterParameters, GeneAnnotation, OutFile, fg_state, bg_state, verbosity=EmissionParameters['Verbosity'])

    print('Done')
def run_omniCLIP(args):
    '''
    Run the complete pipeline: load annotation and reads, initialise the
    model, iterate the parameter fitting and write the predictions.

    The args parameter is immediately replaced by parser.parse_args().

    Outline:
      1) validate the options and open (or create) the gffutils database;
      2) load foreground/background reads and copy them to temporary files
         that are then masked and filtered;
      3) initialise emission and transition parameters, or load them from
         IterSaveFile.dat when only_pred / restart_from_file is set;
      4) call PerformIteration until the stopping condition holds, saving
         the parameters after every iteration;
      5) optionally recompute the paths with the foreground penalty, then
         call tools.GeneratePred and remove the temporary read files.

    Returns None. Exits through sys.exit when no CLIP library is given.
    '''
    # Get the args
    args = parser.parse_args()

    verbosity = args.verbosity

    if verbosity > 1:
        print(args)

    #Check parameters
    if len(args.fg_libs) == 0:
        sys.exit('No CLIP-libraries given')

    if len(args.bg_libs) == 0:
        bg_type = 'None'
    else:
        bg_type = args.bg_type

    if args.out_dir is None:
        out_path = os.getcwd()
    else:
        out_path = args.out_dir

    MaxIter = args.max_it
    # process the parameters
    if not (bg_type == 'Coverage' or bg_type == 'Coverage_bck'):
        print('Bg-type: ' + bg_type + ' has not been implemented yet')
        return

    #Set seed for the random number generators
    # NOTE(review): only the stdlib generator is seeded, while the parameter
    # initialisation below draws from np.random - confirm whether numpy
    # should be seeded as well.
    if args.rnd_seed is not None:
        random.seed(args.rnd_seed)
        print('setting seed')

    #Set the p-value cutoff for the bed-file creation
    pv_cutoff = args.pv_cutoff

    #Load the gene annotation
    print('Loading gene annotation')
    if args.gene_anno_file.split('.')[-1] == 'db':
        GeneAnnotation = gffutils.FeatureDB(args.gene_anno_file, keep_order=True)
    else:
        if os.path.isfile(args.gene_anno_file + '.db'):
            print('Using existing gene annotation database: ' + args.gene_anno_file + '.db')
            GeneAnnotation = gffutils.FeatureDB(args.gene_anno_file + '.db', keep_order=True)
        else:
            print('Creating gene annotation database')
            db = gffutils.create_db(args.gene_anno_file, dbfn=(args.gene_anno_file + '.db'), force=True, keep_order=True, merge_strategy='merge', sort_attribute_values=True, disable_infer_transcripts=True, disable_infer_genes=True)
            GeneAnnotation = gffutils.FeatureDB(args.gene_anno_file + '.db', keep_order=True)
            del db

    GenomeDir = args.genome_dir

    # From here on every warning is raised as an exception (process wide).
    import warnings
    warnings.filterwarnings('error')

    #Load the reads
    if verbosity > 0:
        print('Memory usage: %s (kb)' % resource.getrusage(resource.RUSAGE_SELF).ru_maxrss)
    print('Loading reads')

    EmissionParameters = {}

    #Check whether existing iteration parameters should be used
    restart_from_file = args.restart_from_file
    EmissionParameters['restart_from_file'] = restart_from_file

    EmissionParameters['glm_weight'] = args.glm_weight

    EmissionParameters['mask_flank_variants'] = args.mask_flank_variants

    EmissionParameters['max_mm'] = args.max_mm

    EmissionParameters['rev_strand'] = args.rev_strand

    EmissionParameters['skip_diag_event_mdl'] = args.skip_diag_event_mdl

    EmissionParameters['ign_out_rds'] = args.ign_out_rds

    EmissionParameters['DataOutFile_seq'] = os.path.join(out_path, 'fg_reads.dat')
    EmissionParameters['DataOutFile_bck'] = os.path.join(out_path, 'bg_reads.dat')
    EmissionParameters['tmp_dir'] = args.tmp_dir
    t = time.time()

    Sequences = LoadReads.load_data(args.fg_libs, GenomeDir, GeneAnnotation, EmissionParameters['DataOutFile_seq'], load_from_file = ((not args.overwrite_fg) or restart_from_file), save_results = True, Collapse = args.fg_collapsed, mask_flank_variants=EmissionParameters['mask_flank_variants'], max_mm=EmissionParameters['max_mm'], ign_out_rds=EmissionParameters['ign_out_rds'], rev_strand=EmissionParameters['rev_strand'])
    Background = LoadReads.load_data(args.bg_libs, GenomeDir, GeneAnnotation, EmissionParameters['DataOutFile_bck'], load_from_file = ((not args.overwrite_bg) or restart_from_file), save_results = True, Collapse = args.bg_collapsed, OnlyCoverage = args.only_coverage, mask_flank_variants=EmissionParameters['mask_flank_variants'], max_mm=EmissionParameters['max_mm'], ign_out_rds=EmissionParameters['ign_out_rds'], rev_strand=EmissionParameters['rev_strand'])

    #Mask the positions that overlap miRNA sites in the geneome
    Sequences.close()
    Background.close()

    f_name_read_fg = EmissionParameters['DataOutFile_seq']
    f_name_read_bg = EmissionParameters['DataOutFile_bck']

    #Create temporary read-files that can be modified by the masking operations
    if EmissionParameters['tmp_dir'] is None:
        f_name_read_fg_tmp = EmissionParameters['DataOutFile_seq'].replace('fg_reads.dat', 'fg_reads.tmp.dat')
        f_name_read_bg_tmp = EmissionParameters['DataOutFile_bck'].replace('bg_reads.dat', 'bg_reads.tmp.dat')
    else:
        # NOTE(review): tempfile._get_candidate_names is a private stdlib helper.
        f_name_read_fg_tmp = os.path.join(EmissionParameters['tmp_dir'], next(tempfile._get_candidate_names()) + '.dat')
        f_name_read_bg_tmp = os.path.join(EmissionParameters['tmp_dir'], next(tempfile._get_candidate_names()) + '.dat')

    shutil.copy(f_name_read_fg, f_name_read_fg_tmp)
    shutil.copy(f_name_read_bg, f_name_read_bg_tmp)

    #open the temporary read files
    Sequences = h5py.File(f_name_read_fg_tmp, 'r+')
    Background = h5py.File(f_name_read_bg_tmp, 'r+')

    # All later steps work on the temporary copies.
    EmissionParameters['DataOutFile_seq'] = f_name_read_fg_tmp
    EmissionParameters['DataOutFile_bck'] = f_name_read_bg_tmp

    #Set coverage for regions that overlapp annotated miRNAs to zero
    EmissionParameters['mask_miRNA'] = args.mask_miRNA
    if args.mask_miRNA:
        print('Removing miRNA-coverage')
        Sequences = mask_miRNA_positions(Sequences, GeneAnnotation)

    #Mask regions where genes overlap
    EmissionParameters['mask_ovrlp'] = args.mask_ovrlp

    if EmissionParameters['mask_ovrlp']:
        print('Masking overlapping positions')
        Sequences = mark_overlapping_positions(Sequences, GeneAnnotation)

    #Estimate the library size
    EmissionParameters['BckLibrarySize'] = tools.estimate_library_size(Background)
    EmissionParameters['LibrarySize'] = tools.estimate_library_size(Sequences)

    #Removing genes without any reads in the CLIP data
    print("Removing genes without CLIP coverage")

    genes_to_keep = []
    all_genes = list(Sequences.keys())
    for i, gene in enumerate(Sequences.keys()):
        # Total coverage of the gene summed over all replicates.
        curr_cov = sum([Sequences[gene]['Coverage'][rep][()].sum() for rep in list(Sequences[gene]['Coverage'].keys())])

        if curr_cov <= 100:
            continue

        genes_to_keep.append(gene)
        # Genes after this index are never visited and are therefore deleted.
        if i > args.gene_sample:
            break

    genes_to_del = list(set(all_genes).difference(set(genes_to_keep)))

    for gene in genes_to_del:
        del Sequences[gene]
        del Background[gene]

    del all_genes, genes_to_del, genes_to_keep
    if verbosity > 0:
        print('Done: Elapsed time: ' + str(time.time() - t))
        print('Memory usage: %s (kb)' % resource.getrusage(resource.RUSAGE_SELF).ru_maxrss)

    #Initializing parameters
    print('Initialising the parameters')
    if bg_type == 'Coverage_bck':
        NrOfStates = 4
    else:
        NrOfStates = 3

    #Remove the gene sequence from the Sequences and Background when not needed. Currently this is always the case:
    for gene in list(Sequences.keys()):
        if 'GeneSeq' in Sequences[gene]:
            del Sequences[gene]['GeneSeq']

    for gene in list(Background.keys()):
        if 'GeneSeq' in Background[gene]:
            del Background[gene]['GeneSeq']

    # Start from a transition matrix with doubled weight on the diagonal.
    TransMat = np.ones((NrOfStates, NrOfStates)) + np.eye(NrOfStates)
    TransMat = TransMat / np.sum(np.sum(TransMat))
    TransitionParameters = [TransMat, []]

    NrOfReplicates = len(args.fg_libs)
    # The first remaining gene is used below to size the alpha vectors.
    gene = list(Sequences.keys())[0]

    EmissionParameters['PriorMatrix'] = np.ones((NrOfStates, 1)) / float(NrOfStates)
    EmissionParameters['diag_bg'] = args.diag_bg
    EmissionParameters['emp_var'] = args.emp_var
    EmissionParameters['norm_class'] = args.norm_class

    #Define flag for penalized path prediction
    EmissionParameters['LastIter'] = False
    EmissionParameters['fg_pen'] = args.fg_pen

    EmissionParameters['Diag_event_params'] = {}
    EmissionParameters['Diag_event_params']['nr_mix_comp'] = args.nr_mix_comp
    EmissionParameters['Diag_event_params']['mix_comp'] = {}
    for state in range(NrOfStates):
        mixtures = np.random.uniform(0.0, 1.0, size=(args.nr_mix_comp))
        EmissionParameters['Diag_event_params']['mix_comp'][state] = mixtures / np.sum(mixtures)

    #initialise the parameter vector alpha
    alphashape = (Sequences[gene]['Variants']['0']['shape'][0] + Sequences[gene]['Coverage']['0'][()].shape[0] + Sequences[gene]['Read-ends']['0'][()].shape[0])
    alpha = {}
    for state in range(NrOfStates):
        alpha[state] = np.random.uniform(0.9, 1.1, size=(alphashape, args.nr_mix_comp))

    EmissionParameters['Diag_event_params']['alpha'] = alpha
    EmissionParameters['Diag_event_type'] = args.diag_event_mod
    EmissionParameters['NrOfStates'] = NrOfStates
    EmissionParameters['NrOfReplicates'] = NrOfReplicates
    EmissionParameters['ExpressionParameters'] = [None, None]
    EmissionParameters['BckType'] = bg_type
    EmissionParameters['NrOfBckReplicates'] = len(args.bg_libs)
    EmissionParameters['TransitionType'] = 'binary'
    EmissionParameters['Verbosity'] = args.verbosity
    EmissionParameters['NbProc'] = args.nb_proc
    EmissionParameters['Subsample'] = args.subs

    EmissionParameters['FilterSNPs'] = args.filter_snps
    EmissionParameters['SnpRatio'] = args.snps_thresh
    EmissionParameters['SnpAbs'] = args.snps_min_cov
    EmissionParameters['ign_diag'] = args.ign_diag
    if EmissionParameters['ign_out_rds']:
        EmissionParameters['ign_diag'] = EmissionParameters['ign_out_rds']
    EmissionParameters['ign_GLM'] = args.ign_GLM
    EmissionParameters['only_pred'] = args.only_pred

    EmissionParameters['use_precomp_diagmod'] = args.use_precomp_diagmod

    # Transistion parameters
    IterParameters = [EmissionParameters, TransitionParameters]

    #Start computation

    #Iterativly fit the parameters of the model
    OldLogLikelihood = 0
    CurrLogLikelihood = -np.inf
    CurrIter = 0
    LoglikelihodList = []
    First = 1
    IterSaveFile = os.path.join(out_path, 'IterSaveFile.dat')
    IterSaveFileHist = os.path.join(out_path, 'IterSaveFileHist.dat')
    IterHist = []
    Paths = {}
    iter_cond = True

    # NOTE(review): the save files below are read with pickle; only load
    # files that were written by a trusted run.

    #Check whether to preload the iteration file
    if EmissionParameters['only_pred']:
        with open(IterSaveFile, 'rb') as save_fh:
            IterParameters, args_old = pickle.load(save_fh)
        # Switch to the stored parameters first, then apply the command line
        # overrides. Previously the overrides were written to the old
        # dictionary right before it was replaced, so they had no effect;
        # the order now matches the restart branch below.
        EmissionParameters = IterParameters[0]
        EmissionParameters['mask_miRNA'] = args.mask_miRNA
        EmissionParameters['glm_weight'] = args.glm_weight
        EmissionParameters['restart_from_file'] = restart_from_file
        EmissionParameters['ign_diag'] = args.ign_diag
        if EmissionParameters['ign_out_rds']:
            EmissionParameters['ign_diag'] = EmissionParameters['ign_out_rds']
        EmissionParameters['ign_GLM'] = args.ign_GLM
        TransitionParameters = IterParameters[1]
        TransitionType = EmissionParameters['TransitionType']
        OldLogLikelihood = -np.inf
        fg_state, bg_state = emission_prob.get_fg_and_bck_state(EmissionParameters, final_pred=True)

        Paths, CurrLogLikelihood = tools.ParallelGetMostLikelyPath(Paths, Sequences, Background, EmissionParameters, TransitionParameters, 'nonhomo')
        Sequences = h5py.File(EmissionParameters['DataOutFile_seq'], 'r')
        Background = h5py.File(EmissionParameters['DataOutFile_bck'], 'r')

        First = 0
        # Prediction only: skip the fitting loop.
        iter_cond = False

    if restart_from_file:
        with open(IterSaveFile, 'rb') as save_fh:
            IterParameters, args_old = pickle.load(save_fh)
        EmissionParameters = IterParameters[0]
        EmissionParameters['mask_miRNA'] = args.mask_miRNA
        EmissionParameters['glm_weight'] = args.glm_weight
        EmissionParameters['restart_from_file'] = restart_from_file
        EmissionParameters['ign_diag'] = args.ign_diag
        EmissionParameters['ign_GLM'] = args.ign_GLM
        TransitionParameters = IterParameters[1]
        TransitionType = EmissionParameters['TransitionType']
        OldLogLikelihood = -np.inf
        Paths, CurrLogLikelihood = tools.ParallelGetMostLikelyPath(Paths, Sequences, Background, EmissionParameters, TransitionParameters, 'nonhomo')
        Sequences = h5py.File(EmissionParameters['DataOutFile_seq'], 'r')
        Background = h5py.File(EmissionParameters['DataOutFile_bck'], 'r')
        First = 1
        iter_cond = True

    if not EmissionParameters['use_precomp_diagmod'] is None:
        # Pickle data is binary: the file has to be opened with 'rb'.
        with open(EmissionParameters['use_precomp_diagmod'], 'rb') as precomp_fh:
            IterParametersPreComp, args_old = pickle.load(precomp_fh)
        IterParameters[0]['Diag_event_params'] = IterParametersPreComp[0]['Diag_event_params']

    while iter_cond:
        print("\n")
        print("Iteration: " + str(CurrIter))
        if EmissionParameters['Verbosity'] > 1:
            print(IterParameters[0])

        OldLogLikelihood = CurrLogLikelihood

        CurrLogLikelihood, IterParameters, First, Paths = PerformIteration(Sequences, Background, IterParameters, NrOfStates, First, Paths, verbosity=EmissionParameters['Verbosity'])
        gc.collect()

        # Save the current state so that a run can be restarted.
        with open(IterSaveFile, 'wb') as save_fh:
            pickle.dump([IterParameters, args], save_fh)
        if args.safe_tmp:
            # The history is kept on disk between iterations, not in memory.
            if CurrIter > 0:
                with open(IterSaveFileHist, 'rb') as hist_fh:
                    IterHist = pickle.load(hist_fh)
            IterHist.append([IterParameters, CurrLogLikelihood])
            with open(IterSaveFileHist, 'wb') as hist_fh:
                pickle.dump(IterHist, hist_fh)
            del IterHist

        if verbosity > 1:
            print("Log-likelihood: " + str(CurrLogLikelihood))
        LoglikelihodList.append(CurrLogLikelihood)

        if verbosity > 1:
            print(LoglikelihodList)
        CurrIter += 1

        if CurrIter >= MaxIter:
            print('Maximal number of iterations reached')

        if not restart_from_file:
            if CurrIter < max(3, MaxIter):
                iter_cond = True
            else:
                iter_cond = (CurrIter < MaxIter) and ((abs(CurrLogLikelihood - OldLogLikelihood)/max(abs(CurrLogLikelihood), abs(OldLogLikelihood))) > 0.01) and (abs(CurrLogLikelihood - OldLogLikelihood) > args.tol_lg_lik)
        else:
            if np.isinf(OldLogLikelihood):
                # No relative change can be computed against -inf.
                iter_cond = (CurrIter < MaxIter) and (abs(CurrLogLikelihood - OldLogLikelihood) > args.tol_lg_lik)
            else:
                iter_cond = (CurrIter < MaxIter) and ((abs(CurrLogLikelihood - OldLogLikelihood)/max(abs(CurrLogLikelihood), abs(OldLogLikelihood))) > 0.01) and (abs(CurrLogLikelihood - OldLogLikelihood) > args.tol_lg_lik)

    #Return the fitted parameters
    print('Finished parameter fitting')

    EmissionParameters, TransitionParameters = IterParameters
    if not isinstance(EmissionParameters['ExpressionParameters'][0], np.ndarray):
        print('Emmision parameters have not been fit yet')
        return

    out_file_base = 'pred'
    if EmissionParameters['ign_GLM']:
        out_file_base += '_no_glm'
    if EmissionParameters['ign_diag']:
        out_file_base += '_no_diag'
    OutFile = os.path.join(out_path, out_file_base + '.txt')

    #determine which state has higher weight in fg.
    if verbosity > 0:
        print('Memory usage: %s (kb)' % resource.getrusage(resource.RUSAGE_SELF).ru_maxrss)
    fg_state, bg_state = emission_prob.get_fg_and_bck_state(EmissionParameters, final_pred=True)
    if EmissionParameters['fg_pen'] > 0.0:
        print('Recomputing paths')
        EmissionParameters['LastIter'] = True
        Sequences = h5py.File(EmissionParameters['DataOutFile_seq'], 'r')
        Background = h5py.File(EmissionParameters['DataOutFile_bck'], 'r')
        Paths, LogLike = tools.ParallelGetMostLikelyPath(Paths, Sequences, Background, EmissionParameters, TransitionParameters, 'nonhomo', verbosity=EmissionParameters['Verbosity'])
        Sequences = h5py.File(EmissionParameters['DataOutFile_seq'], 'r')
        Background = h5py.File(EmissionParameters['DataOutFile_bck'], 'r')

    tools.GeneratePred(Paths, Sequences, Background, IterParameters, GeneAnnotation, OutFile, fg_state, bg_state, seq_file=EmissionParameters['DataOutFile_seq'], bck_file=EmissionParameters['DataOutFile_bck'], pv_cutoff=pv_cutoff, verbosity=EmissionParameters['Verbosity'])

    print('Done')

    #Remove the temporary files
    if not (EmissionParameters['tmp_dir'] is None):
        print('removing temporary files')
        os.remove(EmissionParameters['DataOutFile_seq'])
        os.remove(EmissionParameters['DataOutFile_bck'])

    return
def GetSuffStatBck(Sequences, Background, Paths, NrOfStates, Type, ResetNotUsedStates = True, EmissionParameters=None, verbosity=1):
    '''
    This function computes sufficient statistics from the background data
    for the positions whose path state is the foreground state.

    The passed Sequences/Background handles are closed (best effort) and the
    files named in EmissionParameters['DataOutFile_seq'] and
    EmissionParameters['DataOutFile_bck'] are reopened read-only.
    EmissionParameters is subscripted, so it has to be provided despite the
    None default. NrOfStates, ResetNotUsedStates and verbosity are not used.

    Returns a dictionary with the single key fg_state that maps to a
    defaultdict(int) from observed count tuples to the number of positions
    at which the tuple was seen.

    NOTE(review): the file handles opened here are not closed before
    returning - confirm whether that is relied upon elsewhere.
    '''

    #Initialize the sufficent statistcs variable
    print("Getting suffcient statistic")
    t = time.time()
    SuffStatBck = {}

    fg_state, bg_state = emission_prob.get_fg_and_bck_state(EmissionParameters, final_pred=True)

    SuffStatBck[fg_state] = defaultdict(int)

    # Closing is best effort: the handles may already be closed or may not
    # be file objects. Only ordinary errors are ignored, so that
    # KeyboardInterrupt and SystemExit still propagate.
    for handle in (Sequences, Background):
        try:
            handle.close()
        except Exception:
            pass

    Sequences = h5py.File(EmissionParameters['DataOutFile_seq'], 'r')
    Background = h5py.File(EmissionParameters['DataOutFile_bck'], 'r')

    #Fil the sufficent statistcs variable
    for gene in list(Sequences.keys()):
        rep = list(Background[gene]['Coverage'].keys())[0]
        CurrGenePath = Paths[gene]
        #Stack the matrizes together and convert to dense matrix
        Background_per_gene = PreloadSequencesForGene(Background, gene)
        Sequences_per_gene = PreloadSequencesForGene(Sequences, gene)

        if Type == 'Conv':
            CurrStack = StackData(Background_per_gene, add = 'variants')
        else:
            CurrStack = StackData(Background_per_gene, add = 'all')

        if EmissionParameters['FilterSNPs'] and Type == 'Conv':
            Ix = GetModelIx(Background_per_gene, Type='no_snps_conv', snps_thresh=EmissionParameters['SnpRatio'], snps_min_cov=EmissionParameters['SnpAbs'], Background=Background_per_gene)
        else:
            Ix = GetModelIx(Background_per_gene, Type)

        NonZero = np.sum(CurrStack, axis = 0) > 0

        #Determine the nonzeros elements
        CurrState = fg_state
        if EmissionParameters['mask_ovrlp']:
            CurrIx = Ix * (Sequences_per_gene['mask'][rep][0, :] == 0) * NonZero * (CurrGenePath == CurrState) > 0
        else:
            CurrIx = Ix * NonZero * (CurrGenePath == CurrState) > 0

        # View every selected column as one structured record so that
        # np.unique counts whole columns instead of single values.
        data = CurrStack[:,CurrIx].T
        ncols = data.shape[1]
        dtype = data.T.dtype.descr * ncols
        struct = data.view(dtype)

        vals, val_counts = np.unique(struct, return_counts=True)

        #Save the tuples and how many times they have been seen so far.
        for curr_val, curr_count in zip(vals, val_counts):
            SuffStatBck[CurrState][tuple(curr_val)] += curr_count

        #Treat the 0 tuple seperately for speed improvment
        if len(Ix) == 0:
            continue

        NullIx = (NonZero == 0) * (CurrGenePath == CurrState) > 0
        NullCount = np.sum(NullIx)
        if NullCount == 0:
            continue

        NullTuple = np.zeros_like(CurrStack[:, 0])
        NullTuple = tuple(NullTuple.T)
        SuffStatBck[CurrState][NullTuple] += NullCount

        del CurrStack, NonZero, CurrGenePath, Ix

    print('Done: Elapsed time: ' + str(time.time() - t))
    return SuffStatBck
def ParallelGetMostLikelyPathForGene(data):
    '''
    This function computes the most likely path for a gene

    data is a 7-tuple:
        (gene, nr_of_genes, gene_nr, EmissionParameters,
         TransitionParameters, TransitionTypeFirst, RandomNoise)
    TransitionTypeFirst is unpacked but not used here.

    The read files named in EmissionParameters['DataOutFile_seq'] and
    EmissionParameters['DataOutFile_bck'] are opened read-only and are
    closed again on every exit path.

    Returns [gene, CurrPath, Currloglik]. If GetModelIx(..., Type='all')
    sums to zero an empty path array and a log-likelihood of 0 are returned.
    '''

    gene, nr_of_genes, gene_nr, EmissionParameters, TransitionParameters, TransitionTypeFirst, RandomNoise = data

    #Parse the parameters
    PriorMatrix = EmissionParameters['PriorMatrix']
    NrOfStates = EmissionParameters['NrOfStates']

    # The context manager closes both files on the early return and on errors.
    with h5py.File(EmissionParameters['DataOutFile_seq'], 'r') as Sequences, \
            h5py.File(EmissionParameters['DataOutFile_bck'], 'r') as Background:

        fg_state, bg_state = emission_prob.get_fg_and_bck_state(EmissionParameters, final_pred=True)
        fg_pen = EmissionParameters['fg_pen']

        #Score the state sequences
        #1) Determine the positions where an observation is possible
        Sequences_per_gene = PreloadSequencesForGene(Sequences, gene)
        Background_per_gene = PreloadSequencesForGene(Background, gene)

        Ix = GetModelIx(Sequences_per_gene, Type='all')

        if np.sum(Ix) == 0:
            # np.int was removed from NumPy; the builtin int is the same dtype.
            CurrPath = 2 * np.ones((0, Ix.shape[0]), dtype=int)
            return [gene, CurrPath, 0]

        if EmissionParameters['FilterSNPs']:
            Ix = GetModelIx(Sequences_per_gene, Type='no_snps_conv', snps_thresh=EmissionParameters['SnpRatio'], snps_min_cov=EmissionParameters['SnpAbs'], Background=Background_per_gene)
        else:
            Ix = GetModelIx(Sequences_per_gene)

        #2) Compute the probabilities for both states
        EmmisionProbGene = np.ones((NrOfStates, Ix.shape[0])) * (1 / np.float64(NrOfStates))

        CurrStackSum = StackData(Sequences_per_gene)
        CurrStackVar = StackData(Sequences_per_gene, add = 'no')
        CurrStackSumBck = StackData(Background_per_gene, add = 'only_cov')

        # weight1 scales the expression (GLM) term, weight2 the diagnostic
        # event term. The exact values 0 and 1 are nudged so that neither
        # logarithm taken below is log(0).
        glm_weight = EmissionParameters['glm_weight']
        if glm_weight < 0.0:
            weight1 = 1.0
            weight2 = 1.0
        elif glm_weight == 0.0:
            weight1 = 0.0000001
            weight2 = 1.0 - weight1
        elif glm_weight == 1.0:
            weight1 = 0.9999999
            weight2 = 1.0 - weight1
        else:
            weight1 = glm_weight
            weight2 = (1.0 - glm_weight)

        for State in range(NrOfStates):
            if not EmissionParameters['ign_GLM']:
                if isinstance(EmissionParameters['ExpressionParameters'][0], np.ndarray):
                    EmmisionProbGene[State, :] = np.log(weight1) + emission_prob.predict_expression_log_likelihood_for_gene(CurrStackSum, State, nr_of_genes, gene_nr, EmissionParameters)
                    # Both supported background types add the same background term.
                    if EmissionParameters['BckType'] in ('Coverage', 'Coverage_bck'):
                        EmmisionProbGene[State, :] += np.log(weight1) + emission_prob.predict_expression_log_likelihood_for_gene(CurrStackSumBck, State, nr_of_genes, gene_nr, EmissionParameters, 'bg')
            if not EmissionParameters['ign_diag']:
                EmmisionProbGene[State, Ix] += np.log(weight2) + diag_event_model.pred_log_lik(CurrStackVar[:, Ix], State, EmissionParameters)
            if State == fg_state:
                # The foreground penalty is only applied in the last iteration.
                if EmissionParameters['LastIter']:
                    EmmisionProbGene[State, :] -= fg_pen

        if RandomNoise:
            #Add some random noise
            # The noise is drawn just below the smallest finite log-probability.
            min_finite = np.min(EmmisionProbGene[np.isfinite(EmmisionProbGene)])
            EmmisionProbGene = np.logaddexp(EmmisionProbGene, np.random.uniform(min_finite - 4, min_finite - 0.1, EmmisionProbGene.shape))

        #Get the transition probabilities
        TransistionProbabilities = np.float64(np.tile(np.log(TransitionParameters[0]), (EmmisionProbGene.shape[1],1,1)).T)

        CurrPath, Currloglik = viterbi.viterbi(np.float64(EmmisionProbGene), TransistionProbabilities, np.float64(np.log(PriorMatrix)))
        CurrPath = np.int8(CurrPath)

    return [gene, CurrPath, Currloglik]