Code Example #1
def pred_sites(args, verbosity=1):
    # Re-parse the command-line arguments via the module-level argparse
    # parser (see the sketch after this example); this overrides the args
    # passed in
    args = parser.parse_args()
    print(args)

    #Check parameters
    if len(args.fg_libs) == 0:
        sys.exit('No CLIP-libraries given')

    if len(args.bg_libs) == 0:
        bg_type = 'None'
    else:
        bg_type = args.bg_type


    if args.out_dir is None:
        out_path = os.getcwd()
    else:
        out_path = args.out_dir

    MaxIter = args.max_it
    # process the parameters

    if bg_type not in ('Coverage', 'Coverage_bck'):
        print('Bg-type: ' + bg_type + ' has not been implemented yet')
        return 

    #Load the gene annotation
    print('Loading gene annotation')
    GeneAnnotation = gffutils.FeatureDB(args.gene_anno_file, keep_order=True)
    GenomeDir = args.genome_dir

    #Load the reads
    t = time.time()
    print('Loading reads')
    DataOutFile = os.path.join(out_path, 'fg_reads.dat')
    Sequences = LoadReads.load_data(
        args.fg_libs, GenomeDir, GeneAnnotation, DataOutFile,
        load_from_file=True, save_results=False, Collapse=args.fg_collapsed,
        ign_out_rds=args.ign_out_rds, rev_strand=args.rev_strand)

    DataOutFile = os.path.join(out_path, 'bg_reads.dat')
    Background = LoadReads.load_data(
        args.bg_libs, GenomeDir, GeneAnnotation, DataOutFile,
        load_from_file=True, save_results=False, Collapse=args.bg_collapsed,
        OnlyCoverage=True, ign_out_rds=args.ign_out_rds,
        rev_strand=args.rev_strand)

    
    #Removing genes with little or no coverage in the CLIP data
    genes_to_keep = []
    all_genes = list(Sequences.keys())
    for i, gene in enumerate(Sequences.keys()):
        curr_cov = np.sum(np.array([np.sum(Sequences[gene]['Coverage'][rep].toarray()) for rep in list(Sequences[gene]['Coverage'].keys())]))

        if curr_cov < 100:
            continue

        genes_to_keep.append(gene)
        if i > args.gene_sample:
            break
    
    genes_to_del = list(set(all_genes).difference(set(genes_to_keep)))

    for gene in genes_to_del:
        del Sequences[gene]
        del Background[gene]

    del all_genes, genes_to_del, genes_to_keep 
    if verbosity > 0:
        print('Done: Elapsed time: ' + str(time.time() - t))
    
    #Load the fitted model parameters from the saved iteration file
    with open(os.path.join(out_path, 'IterSaveFile.dat'), 'rb') as f:
        IterParameters, args = pickle.load(f)
    EmissionParameters = IterParameters[0]
    TransitionParameters = IterParameters[1]
    # Output file for the predicted sites (named as in run_omniCLIP below)
    OutFile = os.path.join(out_path, 'pred.txt')
    Paths = {}

    fg_state, bg_state = emission_prob.get_fg_and_bck_state(EmissionParameters, final_pred=True)
    if EmissionParameters['fg_pen'] > 0.0:
        print('Recomputing paths')
        EmissionParameters['LastIter'] = True
        Sequences = h5py.File(EmissionParameters['DataOutFile_seq'], 'r')
        Background = h5py.File(EmissionParameters['DataOutFile_bck'], 'r')
        Paths, LogLike = tools.ParallelGetMostLikelyPath(Paths, Sequences, Background, EmissionParameters, TransitionParameters, 'nonhomo', verbosity=EmissionParameters['Verbosity'])
        # Reopen the read files after the path recomputation
        Sequences = h5py.File(EmissionParameters['DataOutFile_seq'], 'r')
        Background = h5py.File(EmissionParameters['DataOutFile_bck'], 'r')

    tools.GeneratePred(Sequences, Background, IterParameters, GeneAnnotation, OutFile, fg_state, bg_state, verbosity=EmissionParameters['Verbosity'])

    print('Done')
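
Both examples re-parse the command line through a module-level argparse parser and rely on module-level imports that are not shown here. The sketch below illustrates that assumed setup; the option strings and defaults are placeholders for illustration only (the attribute names such as fg_libs, bg_libs and max_it are taken from the code itself), and omniCLIP's actual command-line definition may differ.

# Illustrative sketch of the assumed module-level setup; not omniCLIP's
# actual CLI definition
import argparse
import os
import pickle
import sys
import time

import gffutils
import h5py
import numpy as np

# Project modules used by the examples (shipped with omniCLIP):
# import LoadReads, tools, emission_prob
# run_omniCLIP additionally uses gc, random, resource, shutil and tempfile

parser = argparse.ArgumentParser(description='omniCLIP peak calling (sketch)')
parser.add_argument('--fg_libs', nargs='*', default=[])
parser.add_argument('--bg_libs', nargs='*', default=[])
parser.add_argument('--bg_type')
parser.add_argument('--gene_anno_file')
parser.add_argument('--genome_dir')
parser.add_argument('--out_dir', default=None)
parser.add_argument('--max_it', type=int)
parser.add_argument('--gene_sample', type=int)
parser.add_argument('--verbosity', type=int)
# ... further options (fg_collapsed, bg_collapsed, ign_out_rds, rev_strand,
# fg_pen, tol_lg_lik, ...) omitted
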
Code Example #2
def run_omniCLIP(args):
    # Re-parse the command-line arguments via the module-level argparse
    # parser (see the sketch above)
    args = parser.parse_args()

    verbosity = args.verbosity

    if verbosity > 1:
        print(args)

    #Check parameters
    if len(args.fg_libs) == 0:
        sys.exit('No CLIP-libraries given')

    if len(args.bg_libs) == 0:
        bg_type = 'None'
    else:
        bg_type = args.bg_type


    if args.out_dir is None:
        out_path = os.getcwd()
    else:
        out_path = args.out_dir

    MaxIter = args.max_it
    # process the parameters

    if bg_type not in ('Coverage', 'Coverage_bck'):
        print('Bg-type: ' + bg_type + ' has not been implemented yet')
        return 
    
    #Set seed for the random number generators
    if args.rnd_seed is not None:
        random.seed(args.rnd_seed)
        print('setting seed')

    #Set the p-value cutoff for the bed-file creation
    pv_cutoff = args.pv_cutoff

    #Load the gene annotation
    print('Loading gene annotation')
    if args.gene_anno_file.split('.')[-1] == 'db':
        GeneAnnotation = gffutils.FeatureDB(args.gene_anno_file, keep_order=True)
    else:
        if os.path.isfile(args.gene_anno_file + '.db'):
            print('Using existing gene annotation database: ' + args.gene_anno_file + '.db')
            GeneAnnotation = gffutils.FeatureDB(args.gene_anno_file + '.db', keep_order=True)
        else:
            print('Creating gene annotation database')
            db = gffutils.create_db(args.gene_anno_file, dbfn=(args.gene_anno_file + '.db'), force=True, keep_order=True, merge_strategy='merge', sort_attribute_values=True, disable_infer_transcripts=True, disable_infer_genes=True)
            GeneAnnotation = gffutils.FeatureDB(args.gene_anno_file + '.db', keep_order=True)
            del db

    GenomeDir = args.genome_dir
    
    # Treat all warnings as errors from this point on
    import warnings
    warnings.filterwarnings('error')


    #Load the reads
    if verbosity > 0:
        print('Memory usage: %s (kb)' % resource.getrusage(resource.RUSAGE_SELF).ru_maxrss)
    print('Loading reads')

    EmissionParameters = {}
 
    #Check whether existing iteration parameters should be used
    restart_from_file = args.restart_from_file
    EmissionParameters['restart_from_file'] = restart_from_file

    EmissionParameters['glm_weight'] = args.glm_weight

    EmissionParameters['mask_flank_variants'] = args.mask_flank_variants

    EmissionParameters['max_mm'] = args.max_mm

    EmissionParameters['rev_strand'] = args.rev_strand

    EmissionParameters['skip_diag_event_mdl'] = args.skip_diag_event_mdl

    EmissionParameters['ign_out_rds'] = args.ign_out_rds

    EmissionParameters['DataOutFile_seq'] = os.path.join(out_path, 'fg_reads.dat')
    EmissionParameters['DataOutFile_bck'] = os.path.join(out_path, 'bg_reads.dat')
    EmissionParameters['tmp_dir'] = args.tmp_dir
    t = time.time()

    Sequences = LoadReads.load_data(
        args.fg_libs, GenomeDir, GeneAnnotation,
        EmissionParameters['DataOutFile_seq'],
        load_from_file=((not args.overwrite_fg) or restart_from_file),
        save_results=True, Collapse=args.fg_collapsed,
        mask_flank_variants=EmissionParameters['mask_flank_variants'],
        max_mm=EmissionParameters['max_mm'],
        ign_out_rds=EmissionParameters['ign_out_rds'],
        rev_strand=EmissionParameters['rev_strand'])
    Background = LoadReads.load_data(
        args.bg_libs, GenomeDir, GeneAnnotation,
        EmissionParameters['DataOutFile_bck'],
        load_from_file=((not args.overwrite_bg) or restart_from_file),
        save_results=True, Collapse=args.bg_collapsed,
        OnlyCoverage=args.only_coverage,
        mask_flank_variants=EmissionParameters['mask_flank_variants'],
        max_mm=EmissionParameters['max_mm'],
        ign_out_rds=EmissionParameters['ign_out_rds'],
        rev_strand=EmissionParameters['rev_strand'])
    #Mask the positions that overlap miRNA sites in the genome
    
    Sequences.close()
    Background.close()

    f_name_read_fg = EmissionParameters['DataOutFile_seq']
    f_name_read_bg = EmissionParameters['DataOutFile_bck']

    #Create temporary read-files that can be modified by the masking operations
    if EmissionParameters['tmp_dir'] is None:
        f_name_read_fg_tmp = EmissionParameters['DataOutFile_seq'].replace('fg_reads.dat', 'fg_reads.tmp.dat')
        f_name_read_bg_tmp = EmissionParameters['DataOutFile_bck'].replace('bg_reads.dat', 'bg_reads.tmp.dat')
    else:
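        # NOTE: next(tempfile._get_candidate_names()) relies on a private
        # tempfile helper; it only yields a candidate basename and does not
        # create or reserve the file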
        f_name_read_fg_tmp = os.path.join(EmissionParameters['tmp_dir'], next(tempfile._get_candidate_names()) + '.dat') 
        f_name_read_bg_tmp = os.path.join(EmissionParameters['tmp_dir'], next(tempfile._get_candidate_names()) + '.dat') 
        
    shutil.copy(f_name_read_fg, f_name_read_fg_tmp)
    shutil.copy(f_name_read_bg, f_name_read_bg_tmp)

    #open the temporary read files
    Sequences = h5py.File(f_name_read_fg_tmp, 'r+')
    Background = h5py.File(f_name_read_bg_tmp, 'r+')

    EmissionParameters['DataOutFile_seq'] = f_name_read_fg_tmp
    EmissionParameters['DataOutFile_bck'] = f_name_read_bg_tmp
    

    #Set coverage for regions that overlap annotated miRNAs to zero
    EmissionParameters['mask_miRNA'] = args.mask_miRNA
    if args.mask_miRNA: 
        print('Removing miRNA-coverage')
        Sequences = mask_miRNA_positions(Sequences, GeneAnnotation)

    #Mask regions where genes overlap
    EmissionParameters['mask_ovrlp'] = args.mask_ovrlp

    if EmissionParameters['mask_ovrlp']:
        print('Masking overlapping positions')
        Sequences = mark_overlapping_positions(Sequences, GeneAnnotation)

    #Estimate the library size
    EmissionParameters['BckLibrarySize'] = tools.estimate_library_size(Background)
    EmissionParameters['LibrarySize'] = tools.estimate_library_size(Sequences)
    
    #Removing genes without any reads in the CLIP data
    print("Removing genes without CLIP coverage")

    genes_to_keep = []
    all_genes = list(Sequences.keys())
    for i, gene in enumerate(Sequences.keys()):
        # Summed CLIP coverage over all replicates for this gene
        curr_cov = sum([Sequences[gene]['Coverage'][rep][()].sum() for rep in list(Sequences[gene]['Coverage'].keys())])

        # Keep only genes with more than 100 summed coverage
        if curr_cov <= 100:
            continue

        genes_to_keep.append(gene)
        # Stop after args.gene_sample genes have been seen (the cap applies
        # to the enumeration index, not to the number of genes kept)
        if i > args.gene_sample:
            break
    
    genes_to_del = list(set(all_genes).difference(set(genes_to_keep)))

    for gene in genes_to_del:
        del Sequences[gene]
        del Background[gene]

    del all_genes, genes_to_del, genes_to_keep 
    if verbosity > 0:
        print('Done: Elapsed time: ' + str(time.time() - t))
        print('Memory usage: %s (kb)' % resource.getrusage(resource.RUSAGE_SELF).ru_maxrss)

    #Initializing parameters
    print('Initialising the parameters')
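    # The 'Coverage_bck' background model uses one additional state
    # compared to the plain 'Coverage' model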
    if bg_type == 'Coverage_bck':
        NrOfStates = 4
    else:
        NrOfStates = 3

    #Remove the gene sequence from the Sequences and Background when not needed. Currently this is always the case:
    for gene in list(Sequences.keys()):
        if 'GeneSeq' in Sequences[gene]:
            del Sequences[gene]['GeneSeq']

    for gene in list(Background.keys()):
        if 'GeneSeq' in Background[gene]:
            del Background[gene]['GeneSeq']

    # Initialise the transition matrix with uniform probabilities plus extra
    # weight on the diagonal (self-transitions), normalised over all entries
    TransMat = np.ones((NrOfStates, NrOfStates)) + np.eye(NrOfStates)
    TransMat = TransMat / np.sum(TransMat)
    TransitionParameters = [TransMat, []]

    NrOfReplicates = len(args.fg_libs)
    gene = list(Sequences.keys())[0]
    
    EmissionParameters['PriorMatrix'] = np.ones((NrOfStates, 1)) / float(NrOfStates)
    EmissionParameters['diag_bg'] = args.diag_bg
    EmissionParameters['emp_var'] = args.emp_var
    EmissionParameters['norm_class'] = args.norm_class

    #Define flag for penalized path prediction
    EmissionParameters['LastIter'] = False    
    EmissionParameters['fg_pen'] = args.fg_pen

    EmissionParameters['Diag_event_params'] = {}
    EmissionParameters['Diag_event_params']['nr_mix_comp'] = args.nr_mix_comp
    EmissionParameters['Diag_event_params']['mix_comp'] = {}
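    # Randomly initialise the mixture weights of the diagnostic-event model:
    # one weight vector per state, normalised to sum to one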
    for state in range(NrOfStates):
        mixtures = np.random.uniform(0.0, 1.0, size=(args.nr_mix_comp))
        EmissionParameters['Diag_event_params']['mix_comp'][state] = mixtures / np.sum(mixtures)
    
    # Initialise the parameter vector alpha of the diagnostic-event model;
    # its length is the total number of rows of the Variants, Coverage and
    # Read-ends tracks
    alphashape = (Sequences[gene]['Variants']['0']['shape'][0] + Sequences[gene]['Coverage']['0'][()].shape[0] + Sequences[gene]['Read-ends']['0'][()].shape[0])
    alpha = {}
    for state in range(NrOfStates):
        alpha[state] = np.random.uniform(0.9, 1.1, size=(alphashape, args.nr_mix_comp))
    
    EmissionParameters['Diag_event_params']['alpha'] = alpha
    EmissionParameters['Diag_event_type'] = args.diag_event_mod
    EmissionParameters['NrOfStates'] = NrOfStates
    EmissionParameters['NrOfReplicates'] = NrOfReplicates
    EmissionParameters['ExpressionParameters'] = [None, None]
    EmissionParameters['BckType'] = bg_type
    EmissionParameters['NrOfBckReplicates'] = len(args.bg_libs)
    EmissionParameters['TransitionType'] = 'binary'
    EmissionParameters['Verbosity'] = args.verbosity
    EmissionParameters['NbProc'] = args.nb_proc
    EmissionParameters['Subsample'] = args.subs

    EmissionParameters['FilterSNPs'] = args.filter_snps
    EmissionParameters['SnpRatio'] = args.snps_thresh
    EmissionParameters['SnpAbs'] = args.snps_min_cov
    EmissionParameters['ign_diag'] = args.ign_diag
    if EmissionParameters['ign_out_rds']:
        EmissionParameters['ign_diag'] = EmissionParameters['ign_out_rds']
    EmissionParameters['ign_GLM'] = args.ign_GLM
    EmissionParameters['only_pred'] = args.only_pred

    EmissionParameters['use_precomp_diagmod'] = args.use_precomp_diagmod

    # Bundle the emission and transition parameters for the fitting iterations
    IterParameters = [EmissionParameters, TransitionParameters]

    #Start computation

    #Iterativly fit the parameters of the model
    OldLogLikelihood = 0
    CurrLogLikelihood = -np.inf
    CurrIter = 0
    LoglikelihodList = []
    First = 1
    IterSaveFile = os.path.join(out_path, 'IterSaveFile.dat')
    IterSaveFileHist = os.path.join(out_path, 'IterSaveFileHist.dat')
    IterHist = []
    Paths = {}
    iter_cond = True
    #Check whether to preload the iteration file
    if EmissionParameters['only_pred']:
        IterParameters, args_old = pickle.load(open(IterSaveFile, 'rb'))
        EmissionParameters = IterParameters[0]
        # Override the saved settings with the current command-line options
        EmissionParameters['mask_miRNA'] = args.mask_miRNA
        EmissionParameters['glm_weight'] = args.glm_weight
        EmissionParameters['restart_from_file'] = restart_from_file
        EmissionParameters['ign_diag'] = args.ign_diag
        if EmissionParameters['ign_out_rds']:
            EmissionParameters['ign_diag'] = EmissionParameters['ign_out_rds']
        EmissionParameters['ign_GLM'] = args.ign_GLM
        TransitionParameters = IterParameters[1]
        TransitionType = EmissionParameters['TransitionType']
        OldLogLikelihood = -np.inf
        fg_state, bg_state = emission_prob.get_fg_and_bck_state(EmissionParameters, final_pred=True)
        
        Paths, CurrLogLikelihood = tools.ParallelGetMostLikelyPath(Paths, Sequences, Background, EmissionParameters, TransitionParameters, 'nonhomo')
        Sequences = h5py.File(EmissionParameters['DataOutFile_seq'], 'r')
        Background = h5py.File(EmissionParameters['DataOutFile_bck'], 'r')

        First = 0
        iter_cond = False

    if restart_from_file:
        IterParameters, args_old = pickle.load(open(IterSaveFile,'rb'))
        EmissionParameters =  IterParameters[0]
        EmissionParameters['mask_miRNA'] = args.mask_miRNA
        EmissionParameters['glm_weight'] = args.glm_weight
        EmissionParameters['restart_from_file'] = restart_from_file
        EmissionParameters['ign_diag'] = args.ign_diag
        EmissionParameters['ign_GLM'] = args.ign_GLM
        TransitionParameters = IterParameters[1]
        TransitionType = EmissionParameters['TransitionType']
        OldLogLikelihood = -np.inf
        Paths, CurrLogLikelihood = tools.ParallelGetMostLikelyPath(Paths, Sequences, Background, EmissionParameters, TransitionParameters, 'nonhomo')
        Sequences = h5py.File(EmissionParameters['DataOutFile_seq'], 'r')
        Background = h5py.File(EmissionParameters['DataOutFile_bck'], 'r')
        First = 1
        iter_cond = True


    if EmissionParameters['use_precomp_diagmod'] is not None:
        IterParametersPreComp, args_old = pickle.load(open(EmissionParameters['use_precomp_diagmod'], 'rb'))
        IterParameters[0]['Diag_event_params'] = IterParametersPreComp[0]['Diag_event_params']

    while iter_cond:
        print("\n")
        print("Iteration: " + str(CurrIter))
        if EmissionParameters['Verbosity'] > 1:
            print(IterParameters[0])

        OldLogLikelihood  = CurrLogLikelihood
        
        CurrLogLikelihood, IterParameters, First, Paths = PerformIteration(Sequences, Background, IterParameters, NrOfStates, First, Paths, verbosity=EmissionParameters['Verbosity'])
        gc.collect()
        
        # Save the current parameters so the run can be resumed
        pickle.dump([IterParameters, args], open(IterSaveFile, 'wb'))
        if args.safe_tmp:
            if CurrIter > 0:
                IterHist = pickle.load(open(IterSaveFileHist,'rb'))
            IterHist.append([IterParameters, CurrLogLikelihood])
            pickle.dump(IterHist, open(IterSaveFileHist,'wb'))
            del IterHist
        
        if verbosity > 1:
            print("Log-likelihood: " + str(CurrLogLikelihood)) 
        LoglikelihodList.append(CurrLogLikelihood)
        
        if verbosity > 1:
            print(LoglikelihodList)
        CurrIter += 1
        
        if CurrIter >= MaxIter:
            print('Maximal number of iterations reached')

        # Decide whether to continue iterating (see the convergence sketch
        # after this example)
        if not restart_from_file:
            if CurrIter < max(3, MaxIter):
                iter_cond = True
            else:
                iter_cond = (CurrIter < MaxIter) and ((abs(CurrLogLikelihood - OldLogLikelihood)/max(abs(CurrLogLikelihood), abs(OldLogLikelihood))) > 0.01) and (abs(CurrLogLikelihood - OldLogLikelihood) > args.tol_lg_lik)

        else:
            if np.isinf(OldLogLikelihood):
                iter_cond = (CurrIter < MaxIter) and (abs(CurrLogLikelihood - OldLogLikelihood) > args.tol_lg_lik)    
            else:
                iter_cond = (CurrIter < MaxIter) and ((abs(CurrLogLikelihood - OldLogLikelihood)/max(abs(CurrLogLikelihood), abs(OldLogLikelihood))) > 0.01) and (abs(CurrLogLikelihood - OldLogLikelihood) > args.tol_lg_lik)
    
    #Return the fitted parameters
    print('Finished parameter fitting')

    EmissionParameters, TransitionParameters = IterParameters
    if not isinstance(EmissionParameters['ExpressionParameters'][0], np.ndarray):
        print('Emission parameters have not been fit yet')
        return
    out_file_base = 'pred'
    if EmissionParameters['ign_GLM']:
        out_file_base += '_no_glm'
    if EmissionParameters['ign_diag']:
        out_file_base += '_no_diag'
    OutFile = os.path.join(out_path, out_file_base + '.txt')
    #determine which state has higher weight in fg.
    if verbosity > 0:
        print('Memory usage: %s (kb)' % resource.getrusage(resource.RUSAGE_SELF).ru_maxrss)
    fg_state, bg_state = emission_prob.get_fg_and_bck_state(EmissionParameters, final_pred=True)
    if EmissionParameters['fg_pen'] > 0.0:
        print('Recomputing paths')
        EmissionParameters['LastIter'] = True        
        Sequences = h5py.File(EmissionParameters['DataOutFile_seq'], 'r')
        Background = h5py.File(EmissionParameters['DataOutFile_bck'], 'r')
        Paths, LogLike = tools.ParallelGetMostLikelyPath(Paths, Sequences, Background, EmissionParameters, TransitionParameters, 'nonhomo', verbosity=EmissionParameters['Verbosity'])
        Sequences = h5py.File(EmissionParameters['DataOutFile_seq'], 'r')
        Background = h5py.File(EmissionParameters['DataOutFile_bck'], 'r')

    tools.GeneratePred(Paths, Sequences, Background, IterParameters, GeneAnnotation, OutFile, fg_state, bg_state, seq_file=EmissionParameters['DataOutFile_seq'], bck_file=EmissionParameters['DataOutFile_bck'], pv_cutoff=pv_cutoff, verbosity=EmissionParameters['Verbosity'])
    print('Done')

    #Remove the temporary files
    if EmissionParameters['tmp_dir'] is not None:
        print('removing temporary files')
        os.remove(EmissionParameters['DataOutFile_seq'])
        os.remove(EmissionParameters['DataOutFile_bck'])

    return
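
The stopping rule in the fitting loop above combines an iteration cap with a relative tolerance of 1% and an absolute tolerance of args.tol_lg_lik on the change in log-likelihood (in the non-restart branch the first max(3, MaxIter) iterations are always run before the tolerances apply). Below is a simplified, self-contained sketch of that test; the helper name and its arguments are illustrative and not part of omniCLIP.

# Simplified sketch of the convergence test used in the fitting loop above
import numpy as np

def keep_iterating(curr_ll, old_ll, curr_iter, max_iter, tol_abs, rel_tol=0.01):
    """Return True while another fitting iteration should be performed."""
    if curr_iter >= max_iter:
        return False  # iteration cap reached
    abs_change = abs(curr_ll - old_ll)
    # The relative change is only meaningful once old_ll is finite
    if not np.isinf(old_ll):
        rel_change = abs_change / max(abs(curr_ll), abs(old_ll))
        if rel_change <= rel_tol:
            return False  # relative change has dropped below 1%
    return abs_change > tol_abs  # absolute tolerance on the change


# Example: a change of 0.05 in log-likelihood is below both tolerances,
# so the loop would stop
print(keep_iterating(-1000.0, -1000.05, curr_iter=5, max_iter=20, tol_abs=0.1))  # False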