Example #1
0
def em(counts, nr_of_counts, EmissionParameters, x_0=None, First=False, max_nr_iter=15, tol=0.0001, rand_sample_size=10, verbosity=1):
	'''
	Fit the diagnostic-event mixture model for each state via the EM algorithm.

	counts / nr_of_counts: per-state sufficient statistics (dicts keyed by state).
	EmissionParameters: parameter dict; 'Diag_event_params' entries are updated in place.
	x_0, First, tol: currently unused here; kept for interface compatibility with callers.
	Returns the updated EmissionParameters.
	'''

	template_state = 3
	fg_state, bg_state = emission_prob.get_fg_and_bck_state(EmissionParameters, final_pred=True)
	#Becomes True once the first background (template) state has been fit
	check = False

	OldEmissionParameters = deepcopy(EmissionParameters)
	for curr_state in list(counts.keys()):
		#Only compute the emission probabilities once per background:
		#the first non-foreground state is fit and becomes the template;
		#subsequent non-foreground states copy its parameters.
		if EmissionParameters['diag_bg']:
			if curr_state != fg_state:
				if check:
					print('Using template state ' + str(curr_state))
					EmissionParameters['Diag_event_params']['mix_comp'][curr_state] = deepcopy(EmissionParameters['Diag_event_params']['mix_comp'][template_state])
					EmissionParameters['Diag_event_params']['alpha'][curr_state] = deepcopy(EmissionParameters['Diag_event_params']['alpha'][template_state])
					continue
				else:
					print('setting template state ' + str(curr_state))
					check = True
					template_state = curr_state
		print('Estimating state ' + str(curr_state))

		curr_counts = counts[curr_state]
		curr_nr_of_counts = nr_of_counts[curr_state]

		#Estimate from the *old* parameters so states are fit independently
		alpha, mixtures = Parallel_estimate_mixture_params(OldEmissionParameters, curr_counts, curr_nr_of_counts, curr_state, rand_sample_size, max_nr_iter, nr_of_iter=20, stop_crit=1.0, nr_of_init=10, verbosity=verbosity)
		EmissionParameters['Diag_event_params']['alpha'][curr_state] = alpha
		EmissionParameters['Diag_event_params']['mix_comp'][curr_state] = mixtures

	return EmissionParameters
Example #2
0
def FitEmissionParameters(Sequences, Background, NewPaths, OldEmissionParameters, First, verbosity=1):
    '''
    Re-estimate the emission parameters (prior matrix, expression GLM and
    diagnostic-event mixture model) given the current most likely paths.

    Sequences / Background: open h5py files with the CLIP and background reads.
    NewPaths: dict gene -> state path array from the last Viterbi pass.
    OldEmissionParameters: parameter dict from the previous iteration.
    First: passed through to mixture_tools.em.
    Returns the updated emission-parameter dict.
    '''
    print('Fitting emission parameters')
    t = time.time()
    #Unpack the arguments
    OldAlpha = OldEmissionParameters['Diag_event_params']
    NrOfStates = OldEmissionParameters['NrOfStates']
    OldPriorMatrix = OldEmissionParameters['PriorMatrix']
    # NOTE(review): this aliases (does not copy) OldEmissionParameters, so the
    # updates below also mutate the caller's object — presumably intended.
    NewEmissionParameters = OldEmissionParameters

    #Compute new prior matrix: count how often each state occurs in the paths
    PriorMatrix = np.zeros_like(OldPriorMatrix)
    for State in range(NrOfStates):
        for path in NewPaths:
            PriorMatrix[State] += np.sum(NewPaths[path] == State)

    #Check if one of the states is not used and add pseudo gene to prevent singularities during distribution fitting
    if np.sum(PriorMatrix == 0) > 0:
        Sequences.close()
        Background.close()
        Sequences = h5py.File(NewEmissionParameters['DataOutFile_seq'], 'r+')
        Background = h5py.File(NewEmissionParameters['DataOutFile_bck'], 'r+')
        Sequences, Background, NewPaths, pseudo_gene_names = add_pseudo_gene(Sequences, Background, NewPaths, PriorMatrix)
        Sequences.close()
        Background.close()
        print('Added pseudo gene to prevent singular matrix during GLM fitting')

    #Regularise the prior: replace zero entries by a tenth of the smallest
    #positive entry, then renormalise to a probability vector
    CorrectedPriorMatrix = np.copy(PriorMatrix)

    CorrectedPriorMatrix[CorrectedPriorMatrix == 0] = np.min(CorrectedPriorMatrix[CorrectedPriorMatrix > 0])/10
    CorrectedPriorMatrix /= np.sum(CorrectedPriorMatrix)
    #Keep a copy to check which states are not used
    NewEmissionParameters['PriorMatrix'] = CorrectedPriorMatrix

    #Add Pseudo gene to Sequences, Background and Paths
    if NewEmissionParameters['ExpressionParameters'][0] is not None:
        Sequences, Background, NewPaths, pseudo_gene_names = add_pseudo_gene(Sequences, Background, NewPaths, PriorMatrix)

    #Compute parameters for the expression
    sample_size = 10000

    #Insert a row (mean of the per-gene rows) for the pseudo gene so the GLM
    #design matches the extended gene set
    if NewEmissionParameters['BckType'] != 'None':
        if 'Pseudo' in Sequences:
            nr_of_genes = len(list(Sequences.keys()))
            new_pars = NewEmissionParameters['ExpressionParameters'][0]
            new_pars = np.vstack((new_pars[:(nr_of_genes), :], np.mean(new_pars[:(nr_of_genes), :]), new_pars[(nr_of_genes):, :]))
            NewEmissionParameters['ExpressionParameters'][0] = new_pars
    print('Estimating expression parameters')
    if verbosity > 0:
        print('Memory usage: %s (kb)' % resource.getrusage(resource.RUSAGE_SELF).ru_maxrss)

    bg_type = NewEmissionParameters['BckType']
    expr_data = (NewEmissionParameters, Sequences, Background, NewPaths, sample_size, bg_type)
    NewEmissionParameters = emission_prob.estimate_expression_param(expr_data, verbosity=verbosity)

    if verbosity > 0:
        print('Memory usage: %s (kb)' % resource.getrusage(resource.RUSAGE_SELF).ru_maxrss)

    #Remove the pseudo-gene row again after fitting
    if NewEmissionParameters['BckType'] != 'None':
        if 'Pseudo' in Sequences:
            nr_of_genes = len(list(Sequences.keys()))
            new_pars = NewEmissionParameters['ExpressionParameters'][0]
            new_pars = np.vstack((new_pars[:(nr_of_genes-1), :], new_pars[(nr_of_genes):, :]))
            NewEmissionParameters['ExpressionParameters'][0] = new_pars

    # Fix: the original referenced the undefined name 'EmissionParameters'
    # here, raising a NameError whenever this branch was evaluated.
    if (NewEmissionParameters['skip_diag_event_mdl'] == False) or (not (NewEmissionParameters['use_precomp_diagmod'] is None)):
        #Compute parameters for the ratios
        print('Computing sufficient statistic for fitting md')
        if verbosity > 0:
            print('Memory usage: %s (kb)' % resource.getrusage(resource.RUSAGE_SELF).ru_maxrss)
        SuffStat = tools.GetSuffStat(Sequences, Background, NewPaths, NrOfStates, Type='Conv', EmissionParameters=NewEmissionParameters, verbosity=verbosity)

        #Vectorize SuffStat
        Counts, NrOfCounts = tools.ConvertSuffStatToArrays(SuffStat)

        del SuffStat
        if verbosity > 0:
            print('Memory usage: %s (kb)' % resource.getrusage(resource.RUSAGE_SELF).ru_maxrss)
        if NewEmissionParameters['Subsample']:
            Counts, NrOfCounts = tools.subsample_suff_stat(Counts, NrOfCounts)

        print('Fitting md distribution')
        if verbosity > 0:
            print('Memory usage: %s (kb)' % resource.getrusage(resource.RUSAGE_SELF).ru_maxrss)

        if NewEmissionParameters['diag_bg']:
            print("Adjusting background")
            SuffStatBck = tools.GetSuffStatBck(Sequences, Background, NewPaths, NrOfStates, Type='Conv', EmissionParameters=NewEmissionParameters, verbosity=verbosity)
            #Vectorize SuffStat
            CountsBck, NrOfCountsBck = tools.ConvertSuffStatToArrays(SuffStatBck)

            if NewEmissionParameters['Subsample']:
                CountsBck, NrOfCountsBck = tools.subsample_suff_stat(CountsBck, NrOfCountsBck)

            #Overwrite counts in other bins with the background counts of the
            #foreground state
            fg_state, bg_state = emission_prob.get_fg_and_bck_state(NewEmissionParameters, final_pred=True)
            for curr_state in list(Counts.keys()):
                if curr_state != fg_state:
                    Counts[curr_state] = CountsBck[fg_state]
                    NrOfCounts[curr_state] = NrOfCountsBck[fg_state]

            del SuffStatBck

        NewEmissionParameters = mixture_tools.em(Counts, NrOfCounts, NewEmissionParameters, x_0=OldAlpha, First=First, verbosity=verbosity)
        if verbosity > 0:
            print('Memory usage: %s (kb)' % resource.getrusage(resource.RUSAGE_SELF).ru_maxrss)
        del Counts, NrOfCounts

    #Remove the pseudo gene again
    if 'Pseudo' in Sequences:
        del Sequences['Pseudo']
        del Background['Pseudo']
        del NewPaths['Pseudo']

    if verbosity > 0:
        print('Done: Elapsed time: ' + str(time.time() - t))
    return NewEmissionParameters
Example #3
0
def pred_sites(args, verbosity=1):
    '''
    Reload reads and a previously saved iteration state ('IterSaveFile.dat')
    and regenerate the site predictions.

    NOTE(review): this function looks broken as written — `Paths`,
    `TransitionParameters` and `OutFile` are used but never defined here, and
    `EmissionParameters` is read (see below) before it is assigned. Confirm
    against the repository history before relying on it.
    '''
    # Get the args
    # NOTE(review): the *args* parameter is immediately discarded by a fresh
    # parse of the command line.
    args = parser.parse_args()
    print(args)

    #Check parameters
    if len(args.fg_libs) == 0:
        raise sys.exit('No CLIP-libraries given')

    if len(args.bg_libs) == 0:
        bg_type = 'None'
    else:
        bg_type = args.bg_type


    if args.out_dir == None:
        out_path = os.getcwd()
    else:
        out_path = args.out_dir

    MaxIter  = args.max_it
    # process the parameters

    if not (bg_type == 'Coverage' or  bg_type == 'Coverage_bck'):
        print('Bg-type: ' + bg_type + ' has not been implemented yet')
        return 

    #Load the gene annotation
    print('Loading gene annotation')
    GeneAnnotation = gffutils.FeatureDB(args.gene_anno_file, keep_order=True)
    GenomeDir = args.genome_dir

    #Load the reads
    t = time.time()
    print('Loading reads')
    DataOutFile = os.path.join(out_path, 'fg_reads.dat')
    # NOTE(review): EmissionParameters is used here but is only assigned
    # further below (after the pickle load) — this raises a NameError.
    Sequences = LoadReads.load_data(args.fg_libs, GenomeDir, GeneAnnotation, DataOutFile, load_from_file = True, save_results = False, Collapse = args.fg_collapsed, ign_out_rds=EmissionParameters['ign_out_rds'], rev_strand=EmissionParameters['rev_strand'])
    
    DataOutFile = os.path.join(out_path, 'bg_reads.dat')
    Background = LoadReads.load_data(args.bg_libs, GenomeDir, GeneAnnotation, DataOutFile, load_from_file = True, save_results = False, Collapse = args.bg_collapsed, OnlyCoverage = True, ign_out_rds=EmissionParameters['ign_out_rds'], rev_strand=EmissionParameters['rev_strand'])

    
    #Removing genes without any reads in the CLIP data
    genes_to_keep = []
    all_genes = list(Sequences.keys())
    for i, gene in enumerate(Sequences.keys()):
        # Total CLIP coverage over all replicates for this gene
        curr_cov = np.sum(np.array([np.sum(Sequences[gene]['Coverage'][rep].toarray()) for rep in list(Sequences[gene]['Coverage'].keys())]))

        if curr_cov < 100:
            continue

        genes_to_keep.append(gene)
        # Cap the number of genes that are considered
        if i > args.gene_sample:
            break
    
    genes_to_del = list(set(all_genes).difference(set(genes_to_keep)))

    for gene in genes_to_del:
        del Sequences[gene]
        del Background[gene]

    del all_genes, genes_to_del, genes_to_keep 
    if verbosity > 0:
        print('Done: Elapsed time: ' + str(time.time() - t))
    
    #Load data: [IterParameters, args] as saved by run_omniCLIP
    tmp_file = pickle.load(open(os.path.join(out_path, 'IterSaveFile.dat'), 'rb'))
    IterParameters = tmp_file[0]
    args = tmp_file[1]
    EmissionParameters = IterParameters[0]
    fg_state, bg_state = emission_prob.get_fg_and_bck_state(EmissionParameters, final_pred=True)
    if EmissionParameters['fg_pen'] > 0.0:
        print('Recomputing paths')
        EmissionParameters['LastIter'] = True        
        Sequences = h5py.File(EmissionParameters['DataOutFile_seq'], 'r')
        Background = h5py.File(EmissionParameters['DataOutFile_bck'], 'r')
        # NOTE(review): Paths and TransitionParameters are undefined here.
        Paths, LogLike = tools.ParallelGetMostLikelyPath(Paths, Sequences, Background, EmissionParameters, TransitionParameters, 'nonhomo', verbosity=EmissionParameters['Verbosity'])
        Sequences = h5py.File(EmissionParameters['DataOutFile_seq'], 'r')
        Background = h5py.File(EmissionParameters['DataOutFile_bck'], 'r')

    # NOTE(review): OutFile is undefined here.
    tools.GeneratePred(Sequences, Background, IterParameters, GeneAnnotation, OutFile, fg_state, bg_state, verbosity=EmissionParameters['Verbosity'])

    print('Done')
Example #4
0
def run_omniCLIP(args):
    '''
    Run the full omniCLIP pipeline: load reads, initialise the model
    parameters, iteratively fit the HMM via EM and write the final site
    predictions.

    Note: despite the *args* parameter, the arguments are re-parsed from the
    command line via the module-level *parser*.
    '''
    # Get the args
    args = parser.parse_args()

    verbosity = args.verbosity

    if verbosity > 1:
        print(args)

    #Check parameters
    if len(args.fg_libs) == 0:
        # sys.exit raises SystemExit itself; no 'raise' needed
        sys.exit('No CLIP-libraries given')

    if len(args.bg_libs) == 0:
        bg_type = 'None'
    else:
        bg_type = args.bg_type


    if args.out_dir is None:
        out_path = os.getcwd()
    else:
        out_path = args.out_dir

    MaxIter = args.max_it
    # process the parameters

    if not (bg_type == 'Coverage' or bg_type == 'Coverage_bck'):
        print('Bg-type: ' + bg_type + ' has not been implemented yet')
        return

    #Set seed for the random number generators
    if args.rnd_seed is not None:
        random.seed(args.rnd_seed)
        print('setting seed')

    #Set the p-value cutoff for the bed-file creation
    pv_cutoff = args.pv_cutoff

    #Load the gene annotation (create the gffutils database if needed)
    print('Loading gene annotation')
    if args.gene_anno_file.split('.')[-1] == 'db':
        GeneAnnotation = gffutils.FeatureDB(args.gene_anno_file, keep_order=True)
    else:
        if os.path.isfile(args.gene_anno_file + '.db'):
            print('Using existing gene annotation database: ' + args.gene_anno_file + '.db')
            GeneAnnotation = gffutils.FeatureDB(args.gene_anno_file + '.db', keep_order=True)
        else:
            print('Creating gene annotation database')
            db = gffutils.create_db(args.gene_anno_file, dbfn=(args.gene_anno_file + '.db'), force=True, keep_order=True, merge_strategy='merge', sort_attribute_values=True, disable_infer_transcripts=True, disable_infer_genes=True)
            GeneAnnotation = gffutils.FeatureDB(args.gene_anno_file + '.db', keep_order=True)
            del db

    GenomeDir = args.genome_dir

    import warnings
    warnings.filterwarnings('error')


    #Load the reads
    if verbosity > 0:
        print('Memory usage: %s (kb)' % resource.getrusage(resource.RUSAGE_SELF).ru_maxrss)
    print('Loading reads')

    EmissionParameters = {}

    #Check whether existing iteration parameters should be used
    restart_from_file = args.restart_from_file
    EmissionParameters['restart_from_file'] = restart_from_file

    EmissionParameters['glm_weight'] = args.glm_weight

    EmissionParameters['mask_flank_variants'] = args.mask_flank_variants

    EmissionParameters['max_mm'] = args.max_mm

    EmissionParameters['rev_strand'] = args.rev_strand

    EmissionParameters['skip_diag_event_mdl'] = args.skip_diag_event_mdl

    EmissionParameters['ign_out_rds'] = args.ign_out_rds

    EmissionParameters['DataOutFile_seq'] = os.path.join(out_path, 'fg_reads.dat')
    EmissionParameters['DataOutFile_bck'] = os.path.join(out_path, 'bg_reads.dat')
    EmissionParameters['tmp_dir'] = args.tmp_dir
    t = time.time()

    Sequences = LoadReads.load_data(args.fg_libs, GenomeDir, GeneAnnotation, EmissionParameters['DataOutFile_seq'], load_from_file = ((not args.overwrite_fg) or restart_from_file), save_results = True, Collapse = args.fg_collapsed, mask_flank_variants=EmissionParameters['mask_flank_variants'], max_mm=EmissionParameters['max_mm'], ign_out_rds=EmissionParameters['ign_out_rds'], rev_strand=EmissionParameters['rev_strand'])
    Background = LoadReads.load_data(args.bg_libs, GenomeDir, GeneAnnotation, EmissionParameters['DataOutFile_bck'], load_from_file = ((not args.overwrite_bg) or restart_from_file), save_results = True, Collapse = args.bg_collapsed, OnlyCoverage = args.only_coverage,  mask_flank_variants=EmissionParameters['mask_flank_variants'], max_mm=EmissionParameters['max_mm'], ign_out_rds=EmissionParameters['ign_out_rds'], rev_strand=EmissionParameters['rev_strand'])

    Sequences.close()
    Background.close()

    f_name_read_fg = EmissionParameters['DataOutFile_seq']
    f_name_read_bg = EmissionParameters['DataOutFile_bck']

    #Create temporary read-files that can be modified by the masking operations
    if EmissionParameters['tmp_dir'] is None:
        f_name_read_fg_tmp = EmissionParameters['DataOutFile_seq'].replace('fg_reads.dat', 'fg_reads.tmp.dat')
        f_name_read_bg_tmp = EmissionParameters['DataOutFile_bck'].replace('bg_reads.dat', 'bg_reads.tmp.dat')
    else:
        f_name_read_fg_tmp = os.path.join(EmissionParameters['tmp_dir'], next(tempfile._get_candidate_names()) + '.dat')
        f_name_read_bg_tmp = os.path.join(EmissionParameters['tmp_dir'], next(tempfile._get_candidate_names()) + '.dat')

    shutil.copy(f_name_read_fg, f_name_read_fg_tmp)
    shutil.copy(f_name_read_bg, f_name_read_bg_tmp)

    #open the temporary read files
    Sequences = h5py.File(f_name_read_fg_tmp, 'r+')
    Background = h5py.File(f_name_read_bg_tmp, 'r+')

    EmissionParameters['DataOutFile_seq'] = f_name_read_fg_tmp
    EmissionParameters['DataOutFile_bck'] = f_name_read_bg_tmp


    #Set coverage for regions that overlapp annotated miRNAs to zero
    EmissionParameters['mask_miRNA'] = args.mask_miRNA
    if args.mask_miRNA:
        print('Removing miRNA-coverage')
        Sequences = mask_miRNA_positions(Sequences, GeneAnnotation)

    #Mask regions where genes overlap
    EmissionParameters['mask_ovrlp'] = args.mask_ovrlp

    if EmissionParameters['mask_ovrlp']:
        print('Masking overlapping positions')
        Sequences = mark_overlapping_positions(Sequences, GeneAnnotation)

    #Estimate the library size
    EmissionParameters['BckLibrarySize'] = tools.estimate_library_size(Background)
    EmissionParameters['LibrarySize'] = tools.estimate_library_size(Sequences)

    #Removing genes without any reads in the CLIP data
    print("Removing genes without CLIP coverage")

    genes_to_keep = []
    all_genes = list(Sequences.keys())
    for i, gene in enumerate(Sequences.keys()):
        curr_cov = sum([Sequences[gene]['Coverage'][rep][()].sum() for rep in list(Sequences[gene]['Coverage'].keys())])

        if curr_cov <= 100:
            continue

        genes_to_keep.append(gene)
        #Cap the number of genes considered
        if i > args.gene_sample:
            break

    genes_to_del = list(set(all_genes).difference(set(genes_to_keep)))

    for gene in genes_to_del:
        del Sequences[gene]
        del Background[gene]

    del all_genes, genes_to_del, genes_to_keep
    if verbosity > 0:
        print('Done: Elapsed time: ' + str(time.time() - t))
        print('Memory usage: %s (kb)' % resource.getrusage(resource.RUSAGE_SELF).ru_maxrss)

    #Initializing parameters
    print('Initialising the parameters')
    if bg_type == 'Coverage_bck':
        NrOfStates = 4
    else:
        NrOfStates = 3

    #Remove the gene sequence from the Sequences and Background when not needed. Currently this is always the case:
    for gene in list(Sequences.keys()):
        if 'GeneSeq' in Sequences[gene]:
            del Sequences[gene]['GeneSeq']

    for gene in list(Background.keys()):
        if 'GeneSeq' in Background[gene]:
            del Background[gene]['GeneSeq']

    #Uniform transition matrix with a slight preference for staying in-state
    TransMat = np.ones((NrOfStates, NrOfStates)) + np.eye(NrOfStates)
    TransMat = TransMat / np.sum(np.sum(TransMat))
    TransitionParameters = [TransMat, []]

    NrOfReplicates = len(args.fg_libs)
    gene = list(Sequences.keys())[0]

    EmissionParameters['PriorMatrix'] = np.ones((NrOfStates, 1)) / float(NrOfStates)
    EmissionParameters['diag_bg'] = args.diag_bg
    EmissionParameters['emp_var'] = args.emp_var
    EmissionParameters['norm_class'] = args.norm_class

    #Define flag for penalized path prediction
    EmissionParameters['LastIter'] = False
    EmissionParameters['fg_pen'] = args.fg_pen

    #Random initialisation of the mixture components (normalised per state)
    EmissionParameters['Diag_event_params'] = {}
    EmissionParameters['Diag_event_params']['nr_mix_comp'] = args.nr_mix_comp
    EmissionParameters['Diag_event_params']['mix_comp'] = {}
    for state in range(NrOfStates):
        mixtures = np.random.uniform(0.0, 1.0, size=(args.nr_mix_comp))
        EmissionParameters['Diag_event_params']['mix_comp'][state] = mixtures / np.sum(mixtures)

    #initialise the parameter vector alpha (one row per variant/coverage/read-end track)
    alphashape = (Sequences[gene]['Variants']['0']['shape'][0] + Sequences[gene]['Coverage']['0'][()].shape[0] + Sequences[gene]['Read-ends']['0'][()].shape[0])
    alpha = {}
    for state in range(NrOfStates):
            alpha[state] = np.random.uniform(0.9, 1.1, size=(alphashape, args.nr_mix_comp))

    EmissionParameters['Diag_event_params']['alpha'] = alpha
    EmissionParameters['Diag_event_type'] = args.diag_event_mod
    EmissionParameters['NrOfStates'] = NrOfStates
    EmissionParameters['NrOfReplicates'] = NrOfReplicates
    EmissionParameters['ExpressionParameters'] = [None, None]
    EmissionParameters['BckType'] = bg_type
    EmissionParameters['NrOfBckReplicates'] = len(args.bg_libs)
    EmissionParameters['TransitionType'] = 'binary'
    EmissionParameters['Verbosity'] = args.verbosity
    EmissionParameters['NbProc'] = args.nb_proc
    EmissionParameters['Subsample'] = args.subs

    EmissionParameters['FilterSNPs'] = args.filter_snps
    EmissionParameters['SnpRatio'] = args.snps_thresh
    EmissionParameters['SnpAbs'] = args.snps_min_cov
    EmissionParameters['ign_diag'] = args.ign_diag
    if EmissionParameters['ign_out_rds']:
        EmissionParameters['ign_diag'] = EmissionParameters['ign_out_rds']
    EmissionParameters['ign_GLM'] = args.ign_GLM
    EmissionParameters['only_pred'] = args.only_pred

    EmissionParameters['use_precomp_diagmod'] = args.use_precomp_diagmod

    # Transistion parameters
    IterParameters = [EmissionParameters, TransitionParameters]

    #Start computation

    #Iterativly fit the parameters of the model
    OldLogLikelihood = 0
    CurrLogLikelihood = -np.inf
    CurrIter = 0
    LoglikelihodList = []
    First = 1
    IterSaveFile = os.path.join(out_path, 'IterSaveFile.dat')
    IterSaveFileHist = os.path.join(out_path, 'IterSaveFileHist.dat')
    IterHist = []
    Paths = {}
    iter_cond = True
    #Check whether to preload the iteration file
    if EmissionParameters['only_pred']:
        IterParameters, args_old = pickle.load(open(IterSaveFile, 'rb'))
        # Fix: load the saved parameters *first*, then apply the command-line
        # overrides. The original applied mask_miRNA/glm_weight/
        # restart_from_file before the reload, which silently discarded them.
        EmissionParameters = IterParameters[0]
        EmissionParameters['mask_miRNA'] = args.mask_miRNA
        EmissionParameters['glm_weight'] = args.glm_weight
        EmissionParameters['restart_from_file'] = restart_from_file
        EmissionParameters['ign_diag'] = args.ign_diag
        if EmissionParameters['ign_out_rds']:
            EmissionParameters['ign_diag'] = EmissionParameters['ign_out_rds']
        EmissionParameters['ign_GLM'] = args.ign_GLM
        TransitionParameters = IterParameters[1]
        TransitionType = EmissionParameters['TransitionType']
        OldLogLikelihood = -np.inf
        fg_state, bg_state = emission_prob.get_fg_and_bck_state(EmissionParameters, final_pred=True)

        Paths, CurrLogLikelihood = tools.ParallelGetMostLikelyPath(Paths, Sequences, Background, EmissionParameters, TransitionParameters, 'nonhomo')
        Sequences = h5py.File(EmissionParameters['DataOutFile_seq'], 'r')
        Background = h5py.File(EmissionParameters['DataOutFile_bck'], 'r')

        First = 0
        iter_cond = False

    if restart_from_file:
        IterParameters, args_old = pickle.load(open(IterSaveFile, 'rb'))
        EmissionParameters = IterParameters[0]
        EmissionParameters['mask_miRNA'] = args.mask_miRNA
        EmissionParameters['glm_weight'] = args.glm_weight
        EmissionParameters['restart_from_file'] = restart_from_file
        EmissionParameters['ign_diag'] = args.ign_diag
        EmissionParameters['ign_GLM'] = args.ign_GLM
        TransitionParameters = IterParameters[1]
        TransitionType = EmissionParameters['TransitionType']
        OldLogLikelihood = -np.inf
        Paths, CurrLogLikelihood = tools.ParallelGetMostLikelyPath(Paths, Sequences, Background, EmissionParameters, TransitionParameters, 'nonhomo')
        Sequences = h5py.File(EmissionParameters['DataOutFile_seq'], 'r')
        Background = h5py.File(EmissionParameters['DataOutFile_bck'], 'r')
        First = 1
        iter_cond = True


    if not EmissionParameters['use_precomp_diagmod'] is None:
        # Fix: pickle files must be opened in binary mode ('rb'); text mode
        # raises a TypeError in Python 3.
        IterParametersPreComp, args_old = pickle.load(open(EmissionParameters['use_precomp_diagmod'], 'rb'))
        IterParameters[0]['Diag_event_params'] = IterParametersPreComp[0]['Diag_event_params']

    while iter_cond:
        print("\n")
        print("Iteration: " + str(CurrIter))
        if EmissionParameters['Verbosity'] > 1:
            print(IterParameters[0])

        OldLogLikelihood = CurrLogLikelihood

        CurrLogLikelihood, IterParameters, First, Paths = PerformIteration(Sequences, Background, IterParameters, NrOfStates, First, Paths, verbosity=EmissionParameters['Verbosity'])
        gc.collect()

        #Checkpoint the current parameters after each iteration
        pickle.dump([IterParameters, args], open(IterSaveFile, 'wb'))
        if args.safe_tmp:
            if CurrIter > 0:
                IterHist = pickle.load(open(IterSaveFileHist, 'rb'))
            IterHist.append([IterParameters, CurrLogLikelihood])
            pickle.dump(IterHist, open(IterSaveFileHist, 'wb'))
            del IterHist

        if verbosity > 1:
            print("Log-likelihood: " + str(CurrLogLikelihood))
        LoglikelihodList.append(CurrLogLikelihood)

        if verbosity > 1:
            print(LoglikelihodList)
        CurrIter += 1

        if CurrIter >= MaxIter:
            print('Maximal number of iterations reached')

        #Continue until MaxIter or until the (relative and absolute)
        #log-likelihood improvements drop below the tolerances
        if not restart_from_file:
            if CurrIter < max(3, MaxIter):
                iter_cond = True
            else:
                iter_cond = (CurrIter < MaxIter) and ((abs(CurrLogLikelihood - OldLogLikelihood)/max(abs(CurrLogLikelihood), abs(OldLogLikelihood))) > 0.01) and (abs(CurrLogLikelihood - OldLogLikelihood) > args.tol_lg_lik)

        else:
            if np.isinf(OldLogLikelihood):
                iter_cond = (CurrIter < MaxIter) and (abs(CurrLogLikelihood - OldLogLikelihood) > args.tol_lg_lik)
            else:
                iter_cond = (CurrIter < MaxIter) and ((abs(CurrLogLikelihood - OldLogLikelihood)/max(abs(CurrLogLikelihood), abs(OldLogLikelihood))) > 0.01) and (abs(CurrLogLikelihood - OldLogLikelihood) > args.tol_lg_lik)

    #Return the fitted parameters
    print('Finished parameter fitting')

    EmissionParameters, TransitionParameters = IterParameters
    if not isinstance(EmissionParameters['ExpressionParameters'][0], np.ndarray):
        print('Emmision parameters have not been fit yet')
        return
    out_file_base = 'pred'
    if EmissionParameters['ign_GLM']:
       out_file_base += '_no_glm'
    if EmissionParameters['ign_diag']:
       out_file_base += '_no_diag'
    OutFile = os.path.join(out_path, out_file_base + '.txt')
    #determine which state has higher weight in fg.
    if verbosity > 0:
        print('Memory usage: %s (kb)' % resource.getrusage(resource.RUSAGE_SELF).ru_maxrss)
    fg_state, bg_state = emission_prob.get_fg_and_bck_state(EmissionParameters, final_pred=True)
    if EmissionParameters['fg_pen'] > 0.0:
        print('Recomputing paths')
        EmissionParameters['LastIter'] = True
        Sequences = h5py.File(EmissionParameters['DataOutFile_seq'], 'r')
        Background = h5py.File(EmissionParameters['DataOutFile_bck'], 'r')
        Paths, LogLike = tools.ParallelGetMostLikelyPath(Paths, Sequences, Background, EmissionParameters, TransitionParameters, 'nonhomo', verbosity=EmissionParameters['Verbosity'])
        Sequences = h5py.File(EmissionParameters['DataOutFile_seq'], 'r')
        Background = h5py.File(EmissionParameters['DataOutFile_bck'], 'r')

    tools.GeneratePred(Paths, Sequences, Background, IterParameters, GeneAnnotation, OutFile, fg_state, bg_state, seq_file=EmissionParameters['DataOutFile_seq'], bck_file=EmissionParameters['DataOutFile_bck'], pv_cutoff=pv_cutoff, verbosity=EmissionParameters['Verbosity'])
    print('Done')

    #Remove the temporary files
    if not (EmissionParameters['tmp_dir'] is None):
        print('removing temporary files')
        os.remove(EmissionParameters['DataOutFile_seq'])
        os.remove(EmissionParameters['DataOutFile_bck'])

    return
Example #5
0
def GetSuffStatBck(Sequences, Background, Paths, NrOfStates, Type, ResetNotUsedStates = True, EmissionParameters=None, verbosity=1):
    '''
    Compute the sufficient statistics of the *background* reads for the
    foreground state: a mapping {fg_state: {observation-tuple: count}}.

    Only the foreground state is filled; NrOfStates and ResetNotUsedStates
    are accepted for interface compatibility with GetSuffStat.
    '''

    #Initialize the sufficent statistcs variable
    print("Getting suffcient statistic")
    t = time.time()
    SuffStatBck = {}

    fg_state, bg_state = emission_prob.get_fg_and_bck_state(EmissionParameters, final_pred=True)

    SuffStatBck[fg_state] = defaultdict(int)

    #Reopen the read files read-only; closing may fail if they were never
    #opened or already closed, which is fine to ignore here
    try:
        Sequences.close()
    except Exception:
        pass
    try:
        Background.close()
    except Exception:
        pass
    Sequences = h5py.File(EmissionParameters['DataOutFile_seq'], 'r')
    Background = h5py.File(EmissionParameters['DataOutFile_bck'], 'r')

    #Fill the sufficient statistics variable
    for gene in list(Sequences.keys()):
        rep = list(Background[gene]['Coverage'].keys())[0]
        CurrGenePath = Paths[gene]

        #Stack the matrizes together and convert to dense matrix
        Background_per_gene = PreloadSequencesForGene(Background, gene)
        Sequences_per_gene = PreloadSequencesForGene(Sequences, gene)
        if Type == 'Conv':
            CurrStack = StackData(Background_per_gene, add = 'variants')
        else:
            CurrStack = StackData(Background_per_gene, add = 'all')

        if EmissionParameters['FilterSNPs']:
            if Type == 'Conv':
                Ix = GetModelIx(Background_per_gene, Type='no_snps_conv', snps_thresh=EmissionParameters['SnpRatio'], snps_min_cov=EmissionParameters['SnpAbs'], Background=Background_per_gene)
            else:
                Ix = GetModelIx(Background_per_gene, Type)
        else:
            Ix = GetModelIx(Background_per_gene, Type)

        #Positions with at least one observation
        NonZero = np.sum(CurrStack, axis = 0) > 0

        #Select positions in the foreground state (optionally excluding
        #masked, overlapping positions)
        CurrState = fg_state

        if EmissionParameters['mask_ovrlp']:
            CurrIx = Ix * (Sequences_per_gene['mask'][rep][0, :] == 0) * NonZero * (CurrGenePath == CurrState) > 0
        else:
            CurrIx = Ix * NonZero * (CurrGenePath == CurrState) > 0

        #Count identical observation columns via a structured view
        data = CurrStack[:,CurrIx].T
        ncols = data.shape[1]
        dtype = data.T.dtype.descr * ncols
        struct = data.view(dtype)

        vals, val_counts = np.unique(struct, return_counts=True)

        #Save the tuples and how many times they have been seen so far.
        for curr_val, curr_count in zip(vals, val_counts):
            SuffStatBck[CurrState][tuple(curr_val)] += curr_count

        #Treat the 0 tuple seperately for speed improvment
        if len(Ix) == 0:
            continue
        NullIx = (NonZero == 0) * (CurrGenePath == CurrState) > 0
        NullCount = np.sum(NullIx)
        if NullCount == 0:
            continue
        NullTuple = np.zeros_like(CurrStack[:, 0])
        NullTuple = tuple(NullTuple.T)
        SuffStatBck[CurrState][NullTuple] += NullCount

        del CurrStack, NonZero, CurrGenePath, Ix

    print('Done: Elapsed time: ' + str(time.time() - t))

    return SuffStatBck
Example #6
0
def ParallelGetMostLikelyPathForGene(data):
    '''
    Compute the most likely (Viterbi) state path for a single gene.

    data: tuple (gene, nr_of_genes, gene_nr, EmissionParameters,
          TransitionParameters, TransitionTypeFirst, RandomNoise).
    Returns [gene, CurrPath, Currloglik].
    '''

    gene, nr_of_genes, gene_nr, EmissionParameters, TransitionParameters, TransitionTypeFirst, RandomNoise = data

    #Turn the Sequence and Bacground objects into dictionaries again such that the subsequent methods for using these do not need to be modified
    Sequences = h5py.File(EmissionParameters['DataOutFile_seq'], 'r')
    Background = h5py.File(EmissionParameters['DataOutFile_bck'], 'r')

    #Parse the parameters
    alpha = EmissionParameters['Diag_event_params']
    PriorMatrix = EmissionParameters['PriorMatrix']
    NrOfStates = EmissionParameters['NrOfStates']


    fg_state, bg_state = emission_prob.get_fg_and_bck_state(EmissionParameters, final_pred=True)
    fg_pen = EmissionParameters['fg_pen']
    #Score the state sequences
    #1) Determine the positions where an observation is possible

    Sequences_per_gene = PreloadSequencesForGene(Sequences, gene)
    Background_per_gene = PreloadSequencesForGene(Background, gene)

    Ix = GetModelIx(Sequences_per_gene, Type='all')

    if np.sum(Ix) == 0:
        # Fix: np.int was removed in NumPy >= 1.24; use the builtin int.
        # NOTE(review): shape (0, n) yields an empty path — presumably a
        # sentinel for "no observations"; confirm downstream handling.
        CurrPath = 2 * np.ones((0, Ix.shape[0]), dtype=int)
        # Fix: close the h5py handles before the early return to avoid
        # leaking open file handles.
        Sequences.close()
        Background.close()
        return [gene, CurrPath, 0]

    if EmissionParameters['FilterSNPs']:
        Ix = GetModelIx(Sequences_per_gene, Type='no_snps_conv', snps_thresh=EmissionParameters['SnpRatio'], snps_min_cov=EmissionParameters['SnpAbs'], Background=Background_per_gene)
    else:
        Ix = GetModelIx(Sequences_per_gene)

    #2) Compute the probabilities for both states
    EmmisionProbGene = np.ones((NrOfStates, Ix.shape[0])) * (1 / np.float64(NrOfStates))

    CurrStackSum = StackData(Sequences_per_gene)
    CurrStackVar = StackData(Sequences_per_gene, add = 'no')
    CurrStackSumBck = StackData(Background_per_gene, add = 'only_cov')

    #Clamp the GLM/diag-model mixing weights away from 0 and 1
    if EmissionParameters['glm_weight'] < 0.0:
        weight1 = 1.0
        weight2 = 1.0
    elif EmissionParameters['glm_weight'] == 0.0:
        weight1 = 0.0000001
        weight2 = 1.0 - weight1
    elif EmissionParameters['glm_weight'] == 1.0:
        weight1 = 0.9999999
        weight2 = 1.0 - weight1
    else:
        weight1 = EmissionParameters['glm_weight']
        weight2 = (1.0 - EmissionParameters['glm_weight'])

    for State in range(NrOfStates):
        if not EmissionParameters['ign_GLM']:
            if isinstance(EmissionParameters['ExpressionParameters'][0], np.ndarray):
                EmmisionProbGene[State, :] = np.log(weight1) + emission_prob.predict_expression_log_likelihood_for_gene(CurrStackSum, State, nr_of_genes, gene_nr, EmissionParameters)
                if EmissionParameters['BckType'] == 'Coverage':
                    EmmisionProbGene[State, :] += np.log(weight1) + emission_prob.predict_expression_log_likelihood_for_gene(CurrStackSumBck, State, nr_of_genes, gene_nr, EmissionParameters, 'bg')
                if EmissionParameters['BckType'] == 'Coverage_bck':
                    EmmisionProbGene[State, :] += np.log(weight1) + emission_prob.predict_expression_log_likelihood_for_gene(CurrStackSumBck, State, nr_of_genes, gene_nr, EmissionParameters, 'bg')
        if not EmissionParameters['ign_diag']:
            EmmisionProbGene[State, Ix] += np.log(weight2) + diag_event_model.pred_log_lik(CurrStackVar[:, Ix], State, EmissionParameters)
        #Penalise the foreground state in the final (penalised) iteration
        if State == fg_state:
            if EmissionParameters['LastIter']:
                EmmisionProbGene[State, :] -= fg_pen
    if RandomNoise:
        EmmisionProbGene = np.logaddexp(EmmisionProbGene, np.random.uniform(np.min(EmmisionProbGene[np.isfinite(EmmisionProbGene)]) - 4, np.min(EmmisionProbGene[np.isfinite(EmmisionProbGene)]) - 0.1, EmmisionProbGene.shape)) #Add some random noise

    #Get the transition probabilities
    TransistionProbabilities = np.float64(np.tile(np.log(TransitionParameters[0]), (EmmisionProbGene.shape[1],1,1)).T)

    CurrPath, Currloglik = viterbi.viterbi(np.float64(EmmisionProbGene), TransistionProbabilities, np.float64(np.log(PriorMatrix)))
    CurrPath = np.int8(CurrPath)

    del TransistionProbabilities, EmmisionProbGene, CurrStackSum, CurrStackVar, CurrStackSumBck, Ix
    Sequences.close()
    Background.close()

    return [gene, CurrPath, Currloglik]