def pred_sites(args, verbosity=1):
    """Regenerate binding-site predictions from a previously fitted model.

    Restores the model state saved by a prior fitting run in
    ``<out_dir>/IterSaveFile.dat``, reloads the preprocessed foreground
    (CLIP) and background read files and calls ``tools.GeneratePred``.
    No parameter fitting is performed.

    Args:
        args: parsed command-line arguments (argparse.Namespace); must point
            at the same output directory as the original fitting run.
        verbosity: values > 0 print progress and timing information.
    """
    # Note: the original re-ran parser.parse_args() here, clobbering the
    # `args` parameter with an out-of-scope `parser`; the passed-in args are
    # now used directly.
    print(args)

    # Check parameters
    if len(args.fg_libs) == 0:
        sys.exit('No CLIP-libraries given')

    if len(args.bg_libs) == 0:
        bg_type = 'None'
    else:
        bg_type = args.bg_type

    if args.out_dir is None:
        out_path = os.getcwd()
    else:
        out_path = args.out_dir

    # Process the parameters
    if not (bg_type == 'Coverage' or bg_type == 'Coverage_bck'):
        print('Bg-type: ' + bg_type + ' has not been implemented yet')
        return

    # Restore the saved iteration state.  This must happen BEFORE the reads
    # are loaded: the load_data() calls below read
    # EmissionParameters['ign_out_rds'] and ['rev_strand'].
    with open(os.path.join(out_path, 'IterSaveFile.dat'), 'rb') as f:
        tmp_file = pickle.load(f)
    IterParameters = tmp_file[0]
    saved_args = tmp_file[1]  # args of the fitting run; kept for reference
    EmissionParameters = IterParameters[0]
    TransitionParameters = IterParameters[1]

    # Load the gene annotation (expects a prebuilt gffutils database)
    print('Loading gene annotation')
    GeneAnnotation = gffutils.FeatureDB(args.gene_anno_file, keep_order=True)
    GenomeDir = args.genome_dir

    # Load the reads
    t = time.time()
    print('Loading reads')
    DataOutFile = os.path.join(out_path, 'fg_reads.dat')
    Sequences = LoadReads.load_data(
        args.fg_libs, GenomeDir, GeneAnnotation, DataOutFile,
        load_from_file=True, save_results=False, Collapse=args.fg_collapsed,
        ign_out_rds=EmissionParameters['ign_out_rds'],
        rev_strand=EmissionParameters['rev_strand'])

    DataOutFile = os.path.join(out_path, 'bg_reads.dat')
    Background = LoadReads.load_data(
        args.bg_libs, GenomeDir, GeneAnnotation, DataOutFile,
        load_from_file=True, save_results=False, Collapse=args.bg_collapsed,
        OnlyCoverage=True,
        ign_out_rds=EmissionParameters['ign_out_rds'],
        rev_strand=EmissionParameters['rev_strand'])

    # Removing genes without any reads in the CLIP data
    genes_to_keep = []
    all_genes = list(Sequences.keys())
    for i, gene in enumerate(Sequences.keys()):
        curr_cov = np.sum(np.array(
            [np.sum(Sequences[gene]['Coverage'][rep].toarray())
             for rep in list(Sequences[gene]['Coverage'].keys())]))
        # NOTE(review): run_omniCLIP uses `<= 100` for the same filter;
        # the original `< 100` is kept here — confirm which is intended.
        if curr_cov < 100:
            continue
        genes_to_keep.append(gene)
        # Restrict the run to roughly the first args.gene_sample genes.
        if i > args.gene_sample:
            break

    genes_to_del = list(set(all_genes).difference(set(genes_to_keep)))
    for gene in genes_to_del:
        del Sequences[gene]
        del Background[gene]
    del all_genes, genes_to_del, genes_to_keep

    if verbosity > 0:
        print('Done: Elapsed time: ' + str(time.time() - t))

    # Determine which state is foreground vs. background.
    fg_state, bg_state = emission_prob.get_fg_and_bck_state(
        EmissionParameters, final_pred=True)

    # Output file name, derived the same way as in run_omniCLIP.
    out_file_base = 'pred'
    if EmissionParameters['ign_GLM']:
        out_file_base += '_no_glm'
    if EmissionParameters['ign_diag']:
        out_file_base += '_no_diag'
    OutFile = os.path.join(out_path, out_file_base + '.txt')

    # No EM loop ran here, so start from empty paths; GeneratePred works
    # from the read files when paths are not precomputed.
    Paths = {}
    if EmissionParameters['fg_pen'] > 0.0:
        print('Recomputing paths')
        EmissionParameters['LastIter'] = True
        Sequences = h5py.File(EmissionParameters['DataOutFile_seq'], 'r')
        Background = h5py.File(EmissionParameters['DataOutFile_bck'], 'r')
        Paths, LogLike = tools.ParallelGetMostLikelyPath(
            Paths, Sequences, Background, EmissionParameters,
            TransitionParameters, 'nonhomo',
            verbosity=EmissionParameters['Verbosity'])

    Sequences = h5py.File(EmissionParameters['DataOutFile_seq'], 'r')
    Background = h5py.File(EmissionParameters['DataOutFile_bck'], 'r')
    # Pass Paths as the first argument, matching the GeneratePred call in
    # run_omniCLIP (the original omitted it, shifting every positional arg).
    tools.GeneratePred(
        Paths, Sequences, Background, IterParameters, GeneAnnotation,
        OutFile, fg_state, bg_state,
        verbosity=EmissionParameters['Verbosity'])
    print('Done')
def run_omniCLIP(args):
    """Run the full omniCLIP analysis.

    Loads the gene annotation and the foreground/background reads, fits the
    HMM by iterating ``PerformIteration`` until convergence (or ``max_it``),
    and finally writes the binding-site predictions via
    ``tools.GeneratePred``.

    Args:
        args: parsed command-line arguments (argparse.Namespace).
    """
    # Note: the original re-ran parser.parse_args() here, clobbering the
    # `args` parameter with an out-of-scope `parser`; the passed-in args are
    # now used directly.
    verbosity = args.verbosity
    if verbosity > 1:
        print(args)

    # Check parameters
    if len(args.fg_libs) == 0:
        sys.exit('No CLIP-libraries given')

    if len(args.bg_libs) == 0:
        bg_type = 'None'
    else:
        bg_type = args.bg_type

    if args.out_dir is None:
        out_path = os.getcwd()
    else:
        out_path = args.out_dir

    MaxIter = args.max_it

    # Process the parameters
    if not (bg_type == 'Coverage' or bg_type == 'Coverage_bck'):
        print('Bg-type: ' + bg_type + ' has not been implemented yet')
        return

    # Set seed for the random number generators
    if args.rnd_seed is not None:
        random.seed(args.rnd_seed)
        print('setting seed')

    # p-value cutoff for the bed-file creation
    pv_cutoff = args.pv_cutoff

    # Load the gene annotation: use a .db directly, reuse an existing
    # sibling database, or build one from the GFF file.
    print('Loading gene annotation')
    if args.gene_anno_file.split('.')[-1] == 'db':
        GeneAnnotation = gffutils.FeatureDB(args.gene_anno_file,
                                            keep_order=True)
    else:
        if os.path.isfile(args.gene_anno_file + '.db'):
            print('Using existing gene annotation database: '
                  + args.gene_anno_file + '.db')
            GeneAnnotation = gffutils.FeatureDB(args.gene_anno_file + '.db',
                                                keep_order=True)
        else:
            print('Creating gene annotation database')
            db = gffutils.create_db(
                args.gene_anno_file, dbfn=(args.gene_anno_file + '.db'),
                force=True, keep_order=True, merge_strategy='merge',
                sort_attribute_values=True, disable_infer_transcripts=True,
                disable_infer_genes=True)
            GeneAnnotation = gffutils.FeatureDB(args.gene_anno_file + '.db',
                                                keep_order=True)
            del db
    GenomeDir = args.genome_dir

    # Escalate warnings to errors so numerical problems surface early.
    import warnings
    warnings.filterwarnings('error')

    # Load the reads
    if verbosity > 0:
        print('Memory usage: %s (kb)'
              % resource.getrusage(resource.RUSAGE_SELF).ru_maxrss)
    print('Loading reads')

    EmissionParameters = {}
    # Whether existing iteration parameters should be used
    restart_from_file = args.restart_from_file
    EmissionParameters['restart_from_file'] = restart_from_file
    EmissionParameters['glm_weight'] = args.glm_weight
    EmissionParameters['mask_flank_variants'] = args.mask_flank_variants
    EmissionParameters['max_mm'] = args.max_mm
    EmissionParameters['rev_strand'] = args.rev_strand
    EmissionParameters['skip_diag_event_mdl'] = args.skip_diag_event_mdl
    EmissionParameters['ign_out_rds'] = args.ign_out_rds
    EmissionParameters['DataOutFile_seq'] = os.path.join(out_path,
                                                         'fg_reads.dat')
    EmissionParameters['DataOutFile_bck'] = os.path.join(out_path,
                                                         'bg_reads.dat')
    EmissionParameters['tmp_dir'] = args.tmp_dir

    t = time.time()
    Sequences = LoadReads.load_data(
        args.fg_libs, GenomeDir, GeneAnnotation,
        EmissionParameters['DataOutFile_seq'],
        load_from_file=((not args.overwrite_fg) or restart_from_file),
        save_results=True, Collapse=args.fg_collapsed,
        mask_flank_variants=EmissionParameters['mask_flank_variants'],
        max_mm=EmissionParameters['max_mm'],
        ign_out_rds=EmissionParameters['ign_out_rds'],
        rev_strand=EmissionParameters['rev_strand'])

    Background = LoadReads.load_data(
        args.bg_libs, GenomeDir, GeneAnnotation,
        EmissionParameters['DataOutFile_bck'],
        load_from_file=((not args.overwrite_bg) or restart_from_file),
        save_results=True, Collapse=args.bg_collapsed,
        OnlyCoverage=args.only_coverage,
        mask_flank_variants=EmissionParameters['mask_flank_variants'],
        max_mm=EmissionParameters['max_mm'],
        ign_out_rds=EmissionParameters['ign_out_rds'],
        rev_strand=EmissionParameters['rev_strand'])

    # Close the freshly written read files; they are re-opened below from
    # modifiable temporary copies (masking mutates them).
    Sequences.close()
    Background.close()
    f_name_read_fg = EmissionParameters['DataOutFile_seq']
    f_name_read_bg = EmissionParameters['DataOutFile_bck']

    # Create temporary read-files that can be modified by the masking
    # operations
    if EmissionParameters['tmp_dir'] is None:
        f_name_read_fg_tmp = EmissionParameters['DataOutFile_seq'].replace(
            'fg_reads.dat', 'fg_reads.tmp.dat')
        f_name_read_bg_tmp = EmissionParameters['DataOutFile_bck'].replace(
            'bg_reads.dat', 'bg_reads.tmp.dat')
    else:
        f_name_read_fg_tmp = os.path.join(
            EmissionParameters['tmp_dir'],
            next(tempfile._get_candidate_names()) + '.dat')
        f_name_read_bg_tmp = os.path.join(
            EmissionParameters['tmp_dir'],
            next(tempfile._get_candidate_names()) + '.dat')

    shutil.copy(f_name_read_fg, f_name_read_fg_tmp)
    shutil.copy(f_name_read_bg, f_name_read_bg_tmp)

    # Open the temporary read files
    Sequences = h5py.File(f_name_read_fg_tmp, 'r+')
    Background = h5py.File(f_name_read_bg_tmp, 'r+')
    EmissionParameters['DataOutFile_seq'] = f_name_read_fg_tmp
    EmissionParameters['DataOutFile_bck'] = f_name_read_bg_tmp

    # Set coverage for regions that overlap annotated miRNAs to zero
    EmissionParameters['mask_miRNA'] = args.mask_miRNA
    if args.mask_miRNA:
        print('Removing miRNA-coverage')
        Sequences = mask_miRNA_positions(Sequences, GeneAnnotation)

    # Mask regions where genes overlap
    EmissionParameters['mask_ovrlp'] = args.mask_ovrlp
    if EmissionParameters['mask_ovrlp']:
        print('Masking overlapping positions')
        Sequences = mark_overlapping_positions(Sequences, GeneAnnotation)

    # Estimate the library size
    EmissionParameters['BckLibrarySize'] = tools.estimate_library_size(Background)
    EmissionParameters['LibrarySize'] = tools.estimate_library_size(Sequences)

    # Removing genes without any reads in the CLIP data
    print("Removing genes without CLIP coverage")
    genes_to_keep = []
    all_genes = list(Sequences.keys())
    for i, gene in enumerate(Sequences.keys()):
        curr_cov = sum([Sequences[gene]['Coverage'][rep][()].sum()
                        for rep in list(Sequences[gene]['Coverage'].keys())])
        if curr_cov <= 100:
            continue
        genes_to_keep.append(gene)
        # Restrict the run to roughly the first args.gene_sample genes.
        if i > args.gene_sample:
            break

    genes_to_del = list(set(all_genes).difference(set(genes_to_keep)))
    for gene in genes_to_del:
        del Sequences[gene]
        del Background[gene]
    del all_genes, genes_to_del, genes_to_keep

    if verbosity > 0:
        print('Done: Elapsed time: ' + str(time.time() - t))
        print('Memory usage: %s (kb)'
              % resource.getrusage(resource.RUSAGE_SELF).ru_maxrss)

    # Initializing parameters
    print('Initialising the parameters')
    if bg_type == 'Coverage_bck':
        NrOfStates = 4
    else:
        NrOfStates = 3

    # Remove the gene sequence from the Sequences and Background when not
    # needed. Currently this is always the case.
    for gene in list(Sequences.keys()):
        if 'GeneSeq' in Sequences[gene]:
            del Sequences[gene]['GeneSeq']
    for gene in list(Background.keys()):
        if 'GeneSeq' in Background[gene]:
            del Background[gene]['GeneSeq']

    # Near-uniform transition matrix with extra weight on self-transitions.
    TransMat = np.ones((NrOfStates, NrOfStates)) + np.eye(NrOfStates)
    TransMat = TransMat / np.sum(np.sum(TransMat))
    TransitionParameters = [TransMat, []]

    NrOfReplicates = len(args.fg_libs)
    gene = list(Sequences.keys())[0]

    EmissionParameters['PriorMatrix'] = (np.ones((NrOfStates, 1))
                                         / float(NrOfStates))
    EmissionParameters['diag_bg'] = args.diag_bg
    EmissionParameters['emp_var'] = args.emp_var
    EmissionParameters['norm_class'] = args.norm_class
    # Flag for penalized path prediction
    EmissionParameters['LastIter'] = False
    EmissionParameters['fg_pen'] = args.fg_pen

    # Random initialisation of the diagnostic-event mixture model.
    EmissionParameters['Diag_event_params'] = {}
    EmissionParameters['Diag_event_params']['nr_mix_comp'] = args.nr_mix_comp
    EmissionParameters['Diag_event_params']['mix_comp'] = {}
    for state in range(NrOfStates):
        mixtures = np.random.uniform(0.0, 1.0, size=(args.nr_mix_comp))
        EmissionParameters['Diag_event_params']['mix_comp'][state] = (
            mixtures / np.sum(mixtures))

    # Initialise the parameter vector alpha
    alphashape = (Sequences[gene]['Variants']['0']['shape'][0]
                  + Sequences[gene]['Coverage']['0'][()].shape[0]
                  + Sequences[gene]['Read-ends']['0'][()].shape[0])
    alpha = {}
    for state in range(NrOfStates):
        alpha[state] = np.random.uniform(0.9, 1.1,
                                         size=(alphashape, args.nr_mix_comp))
    EmissionParameters['Diag_event_params']['alpha'] = alpha

    EmissionParameters['Diag_event_type'] = args.diag_event_mod
    EmissionParameters['NrOfStates'] = NrOfStates
    EmissionParameters['NrOfReplicates'] = NrOfReplicates
    EmissionParameters['ExpressionParameters'] = [None, None]
    EmissionParameters['BckType'] = bg_type
    EmissionParameters['NrOfBckReplicates'] = len(args.bg_libs)
    EmissionParameters['TransitionType'] = 'binary'
    EmissionParameters['Verbosity'] = args.verbosity
    EmissionParameters['NbProc'] = args.nb_proc
    EmissionParameters['Subsample'] = args.subs
    EmissionParameters['FilterSNPs'] = args.filter_snps
    EmissionParameters['SnpRatio'] = args.snps_thresh
    EmissionParameters['SnpAbs'] = args.snps_min_cov
    EmissionParameters['ign_diag'] = args.ign_diag
    if EmissionParameters['ign_out_rds']:
        EmissionParameters['ign_diag'] = EmissionParameters['ign_out_rds']
    EmissionParameters['ign_GLM'] = args.ign_GLM
    EmissionParameters['only_pred'] = args.only_pred
    EmissionParameters['use_precomp_diagmod'] = args.use_precomp_diagmod

    IterParameters = [EmissionParameters, TransitionParameters]

    # Start computation: iteratively fit the parameters of the model
    OldLogLikelihood = 0
    CurrLogLikelihood = -np.inf
    CurrIter = 0
    LoglikelihodList = []
    First = 1
    IterSaveFile = os.path.join(out_path, 'IterSaveFile.dat')
    IterSaveFileHist = os.path.join(out_path, 'IterSaveFileHist.dat')
    IterHist = []
    Paths = {}
    iter_cond = True

    # Prediction-only mode: reuse the saved iteration state, compute the
    # most likely paths once and skip the fitting loop.
    if EmissionParameters['only_pred']:
        with open(IterSaveFile, 'rb') as f:
            IterParameters, args_old = pickle.load(f)
        # Rebind EmissionParameters to the restored state FIRST, then apply
        # the current command-line overrides.  (The original wrote
        # mask_miRNA/glm_weight/restart_from_file into the fresh dict and
        # immediately lost them when rebinding — same order as the
        # restart_from_file branch below now.)
        EmissionParameters = IterParameters[0]
        EmissionParameters['mask_miRNA'] = args.mask_miRNA
        EmissionParameters['glm_weight'] = args.glm_weight
        EmissionParameters['restart_from_file'] = restart_from_file
        EmissionParameters['ign_diag'] = args.ign_diag
        if EmissionParameters['ign_out_rds']:
            EmissionParameters['ign_diag'] = EmissionParameters['ign_out_rds']
        EmissionParameters['ign_GLM'] = args.ign_GLM
        TransitionParameters = IterParameters[1]
        TransitionType = EmissionParameters['TransitionType']
        OldLogLikelihood = -np.inf
        fg_state, bg_state = emission_prob.get_fg_and_bck_state(
            EmissionParameters, final_pred=True)
        Paths, CurrLogLikelihood = tools.ParallelGetMostLikelyPath(
            Paths, Sequences, Background, EmissionParameters,
            TransitionParameters, 'nonhomo')
        Sequences = h5py.File(EmissionParameters['DataOutFile_seq'], 'r')
        Background = h5py.File(EmissionParameters['DataOutFile_bck'], 'r')
        First = 0
        iter_cond = False

    # Restart mode: restore the saved state but continue fitting.
    if restart_from_file:
        with open(IterSaveFile, 'rb') as f:
            IterParameters, args_old = pickle.load(f)
        EmissionParameters = IterParameters[0]
        EmissionParameters['mask_miRNA'] = args.mask_miRNA
        EmissionParameters['glm_weight'] = args.glm_weight
        EmissionParameters['restart_from_file'] = restart_from_file
        EmissionParameters['ign_diag'] = args.ign_diag
        EmissionParameters['ign_GLM'] = args.ign_GLM
        TransitionParameters = IterParameters[1]
        TransitionType = EmissionParameters['TransitionType']
        OldLogLikelihood = -np.inf
        Paths, CurrLogLikelihood = tools.ParallelGetMostLikelyPath(
            Paths, Sequences, Background, EmissionParameters,
            TransitionParameters, 'nonhomo')
        Sequences = h5py.File(EmissionParameters['DataOutFile_seq'], 'r')
        Background = h5py.File(EmissionParameters['DataOutFile_bck'], 'r')
        First = 1
        iter_cond = True

    # Optionally reuse a precomputed diagnostic-event model.
    if EmissionParameters['use_precomp_diagmod'] is not None:
        # Pickles must be opened in binary mode; the original used text
        # mode ('r'), which fails under Python 3.
        with open(EmissionParameters['use_precomp_diagmod'], 'rb') as f:
            IterParametersPreComp, args_old = pickle.load(f)
        IterParameters[0]['Diag_event_params'] = (
            IterParametersPreComp[0]['Diag_event_params'])

    while iter_cond:
        print("\n")
        print("Iteration: " + str(CurrIter))
        if EmissionParameters['Verbosity'] > 1:
            print(IterParameters[0])

        OldLogLikelihood = CurrLogLikelihood
        CurrLogLikelihood, IterParameters, First, Paths = PerformIteration(
            Sequences, Background, IterParameters, NrOfStates, First, Paths,
            verbosity=EmissionParameters['Verbosity'])
        gc.collect()

        # Persist the current state so the run can be restarted.
        with open(IterSaveFile, 'wb') as f:
            pickle.dump([IterParameters, args], f)

        if args.safe_tmp:
            if CurrIter > 0:
                with open(IterSaveFileHist, 'rb') as f:
                    IterHist = pickle.load(f)
            IterHist.append([IterParameters, CurrLogLikelihood])
            with open(IterSaveFileHist, 'wb') as f:
                pickle.dump(IterHist, f)
            del IterHist

        if verbosity > 1:
            print("Log-likelihood: " + str(CurrLogLikelihood))
        LoglikelihodList.append(CurrLogLikelihood)
        if verbosity > 1:
            print(LoglikelihodList)
        CurrIter += 1

        if CurrIter >= MaxIter:
            print('Maximal number of iterations reached')

        if not restart_from_file:
            # NOTE(review): max(3, MaxIter) forces this branch to iterate
            # for the full MaxIter iterations (the convergence test below is
            # only reachable when MaxIter < 3) — min(3, MaxIter) may have
            # been intended; behavior kept as-is.
            if CurrIter < max(3, MaxIter):
                iter_cond = True
            else:
                iter_cond = ((CurrIter < MaxIter)
                             and ((abs(CurrLogLikelihood - OldLogLikelihood)
                                   / max(abs(CurrLogLikelihood),
                                         abs(OldLogLikelihood))) > 0.01)
                             and (abs(CurrLogLikelihood - OldLogLikelihood)
                                  > args.tol_lg_lik))
        else:
            if np.isinf(OldLogLikelihood):
                iter_cond = ((CurrIter < MaxIter)
                             and (abs(CurrLogLikelihood - OldLogLikelihood)
                                  > args.tol_lg_lik))
            else:
                iter_cond = ((CurrIter < MaxIter)
                             and ((abs(CurrLogLikelihood - OldLogLikelihood)
                                   / max(abs(CurrLogLikelihood),
                                         abs(OldLogLikelihood))) > 0.01)
                             and (abs(CurrLogLikelihood - OldLogLikelihood)
                                  > args.tol_lg_lik))

    # Return the fitted parameters
    print('Finished parameter fitting')
    EmissionParameters, TransitionParameters = IterParameters
    if not isinstance(EmissionParameters['ExpressionParameters'][0],
                      np.ndarray):
        print('Emmision parameters have not been fit yet')
        return

    out_file_base = 'pred'
    if EmissionParameters['ign_GLM']:
        out_file_base += '_no_glm'
    if EmissionParameters['ign_diag']:
        out_file_base += '_no_diag'
    OutFile = os.path.join(out_path, out_file_base + '.txt')

    # Determine which state has higher weight in fg.
    if verbosity > 0:
        print('Memory usage: %s (kb)'
              % resource.getrusage(resource.RUSAGE_SELF).ru_maxrss)
    fg_state, bg_state = emission_prob.get_fg_and_bck_state(
        EmissionParameters, final_pred=True)

    # With a foreground penalty the paths must be recomputed once more with
    # LastIter set.
    if EmissionParameters['fg_pen'] > 0.0:
        print('Recomputing paths')
        EmissionParameters['LastIter'] = True
        Sequences = h5py.File(EmissionParameters['DataOutFile_seq'], 'r')
        Background = h5py.File(EmissionParameters['DataOutFile_bck'], 'r')
        Paths, LogLike = tools.ParallelGetMostLikelyPath(
            Paths, Sequences, Background, EmissionParameters,
            TransitionParameters, 'nonhomo',
            verbosity=EmissionParameters['Verbosity'])

    Sequences = h5py.File(EmissionParameters['DataOutFile_seq'], 'r')
    Background = h5py.File(EmissionParameters['DataOutFile_bck'], 'r')
    tools.GeneratePred(
        Paths, Sequences, Background, IterParameters, GeneAnnotation,
        OutFile, fg_state, bg_state,
        seq_file=EmissionParameters['DataOutFile_seq'],
        bck_file=EmissionParameters['DataOutFile_bck'],
        pv_cutoff=pv_cutoff,
        verbosity=EmissionParameters['Verbosity'])
    print('Done')

    # Remove the temporary files
    if not (EmissionParameters['tmp_dir'] is None):
        print('removing temporary files')
        os.remove(EmissionParameters['DataOutFile_seq'])
        os.remove(EmissionParameters['DataOutFile_bck'])
    return