Example #1
0
def GetSites(Paths, Sequences, Background, EmissionParameters,
             TransitionParameters, TransitionTypeFirst, fg_state,
             merge_neighbouring_sites, minimal_site_length, seq_file='',
             bck_file=''):
    """Get the predicted sites.

    Converts the state paths into candidate sites, scores the sites of
    each gene (serially or in a worker pool, depending on
    EmissionParameters['nb_proc']) and returns a dict mapping gene name
    to its non-empty list of scored sites.
    """
    # Extract the candidate sites from the most likely state paths
    Sites = convert_paths_to_sites(
        Paths, fg_state,
        merge_neighbouring_sites, minimal_site_length)

    nr_of_genes = len(list(Sequences.keys()))
    gene_nr_dict = {gene: i for i, gene in enumerate(Sequences.keys())}

    # Build one argument tuple per gene that has at least one candidate
    # site (replaces the previous default-argument-capturing lambda)
    sites_keys = [key for key in Sites if len(Sites[key]) > 0]
    data = ((Sites[key], key, nr_of_genes, gene_nr_dict[key], seq_file,
             bck_file, EmissionParameters, TransitionParameters,
             TransitionTypeFirst, fg_state, merge_neighbouring_sites,
             minimal_site_length) for key in sites_keys)

    # Close the HDF5 handles before spawning worker processes; the
    # workers reopen the files themselves
    LoadReads.close_data_handles(handles=[Sequences, Background])
    if EmissionParameters['nb_proc'] == 1:
        ScoredSites = dict(GetSitesForGene(curr_slice) for curr_slice in data)
    else:
        pool = multiprocessing.get_context("spawn").Pool(
            EmissionParameters['nb_proc'], maxtasksperchild=10)
        results = pool.imap(GetSitesForGene, data, chunksize=1)
        pool.close()
        pool.join()
        ScoredSites = dict(results)

    # Drop genes for which no site survived scoring
    for key in list(ScoredSites.keys()):
        if len(ScoredSites[key]) == 0:
            del ScoredSites[key]

    return ScoredSites
Example #2
0
def GeneratePred(Paths, Sequences, Background, IterParameters, GeneAnnotation,
                 OutFile, fg_state=1, noise_state=0, pv_cutoff=0.05,
                 verbosity=1):
    """Write the predictions.

    Scores the candidate sites derived from *Paths*, writes the result
    table to *OutFile* and emits a BED file filtered at *pv_cutoff*.
    """
    TransitionParameters = IterParameters[1]
    EmissionParameters = IterParameters[0]

    merge_neighbouring_sites = False
    minimal_site_length = 1

    # Predict the sites
    print('Score peaks')
    # Reopen fresh read-only handles on the data files
    LoadReads.close_data_handles(handles=[Sequences, Background])
    Sequences = h5py.File(EmissionParameters['dat_file_clip'], 'r')
    Background = h5py.File(EmissionParameters['dat_file_bg'], 'r')

    ScoredSites = GetSites(
        Paths, Sequences, Background, EmissionParameters,
        TransitionParameters, 'nonhomo', fg_state, merge_neighbouring_sites,
        minimal_site_length, seq_file=EmissionParameters['dat_file_clip'],
        bck_file=EmissionParameters['dat_file_bg'])

    # GetSites closed the handles; reopen them for writing the results
    Sequences = h5py.File(EmissionParameters['dat_file_clip'], 'r')
    Background = h5py.File(EmissionParameters['dat_file_bg'], 'r')

    print('Write peaks')
    # Write the results
    WriteResults(Sequences, Background, ScoredSites, OutFile, GeneAnnotation)

    # BUG FIX: honour the caller-supplied cutoff instead of the
    # previously hard-coded 0.05
    generate_bed(OutFile, pv_cutoff=pv_cutoff)
    return
Example #3
0
def ParallelGetMostLikelyPath(
        MostLikelyPaths, Sequences, Background, EmissionParameters,
        TransitionParameters, TransitionTypeFirst, RandomNoise=False,
        chunksize=1, verbosity=1):
    """Compute the most likely path for every gene.

    Empties *MostLikelyPaths* in place (to release the old paths), then
    recomputes the Viterbi path per gene — serially or in a spawned
    worker pool — and returns the new path dict together with the
    summed log-likelihood over all genes.
    """
    # Drop the stale paths held by the caller's dict
    MostLikelyPaths.clear()

    gene_names = list(Sequences.keys())
    nr_of_genes = len(gene_names)
    gene_nr_dict = {gene: idx for idx, gene in enumerate(gene_names)}

    print("Computing most likely path")
    start_time = time.time()

    # One argument tuple per gene; scalar parameters are repeated
    data = zip(gene_names, itertools.repeat(nr_of_genes),
               list(gene_nr_dict.values()),
               itertools.repeat(EmissionParameters),
               itertools.repeat(TransitionParameters),
               itertools.repeat(TransitionTypeFirst),
               itertools.repeat(RandomNoise))

    # The workers reopen the HDF5 files, so close our handles first
    LoadReads.close_data_handles(handles=[Sequences, Background])

    if EmissionParameters['nb_proc'] == 1:
        results = [ParallelGetMostLikelyPathForGene(args) for args in data]
    else:
        print("Spawning processes")
        pool = multiprocessing.get_context("spawn").Pool(
            EmissionParameters['nb_proc'], maxtasksperchild=5)
        results = pool.imap(ParallelGetMostLikelyPathForGene, data, chunksize)
        pool.close()
        pool.join()
        print("Collecting results")
        results = list(results)

    # Each result is [gene, path, loglik]
    MostLikelyPaths = {res[0]: res[1] for res in results}

    # Compute the logliklihood of the gene
    LogLikelihood = sum(res[2] for res in results)
    del results

    if verbosity > 0:
        print('\nDone: Elapsed time: ' + str(time.time() - start_time))

    return MostLikelyPaths, LogLikelihood
Example #4
0
def FitEmissionParameters(Sequences, Background, NewPaths,
                          OldEmissionParameters, First, verbosity=1):
    """Fit EmissionParameters.

    Re-estimates (1) the prior state matrix from the state occupancy in
    NewPaths, (2) the expression (GLM) parameters and (3), unless
    skip_diag_event_mdl is set, the diagnostic-event mixture parameters.

    NOTE(review): NewEmissionParameters aliases OldEmissionParameters
    (no copy is taken), so the input parameter dict is mutated in place.
    """
    print('Fitting emission parameters')
    t = time.time()
    # Unpack the arguments
    OldAlpha = OldEmissionParameters['Diag_event_params']
    NrOfStates = OldEmissionParameters['NrOfStates']
    OldPriorMatrix = OldEmissionParameters['PriorMatrix']
    NewEmissionParameters = OldEmissionParameters

    # Compute new prior matrix: count how often each state occurs in the paths
    PriorMatrix = np.zeros_like(OldPriorMatrix)
    for State in range(NrOfStates):
        for path in NewPaths:
            PriorMatrix[State] += np.sum(NewPaths[path] == State)

    # Check if one of the states is not used and add pseudo gene to prevent
    # singularities during distribution fitting
    if np.sum(PriorMatrix == 0) > 0:
        # Reopen the data files writable so the pseudo gene can be inserted
        LoadReads.close_data_handles(handles=[Sequences, Background])
        Sequences = h5py.File(NewEmissionParameters['dat_file_clip'], 'r+')
        Background = h5py.File(NewEmissionParameters['dat_file_bg'], 'r+')
        Sequences, Background, NewPaths = add_pseudo_gene(
            Sequences, Background, NewPaths, PriorMatrix)
        print('Adds pseudo gene to prevent singular matrix during GLM fitting')

    # Replace zero prior entries by a tenth of the smallest positive entry,
    # then renormalise into a probability distribution
    CorrPriorMatrix = np.copy(PriorMatrix)
    CorrPriorMatrix[CorrPriorMatrix == 0] = np.min(
        CorrPriorMatrix[CorrPriorMatrix > 0])/10
    CorrPriorMatrix /= np.sum(CorrPriorMatrix)
    # Keep a copy to check which states are not used
    NewEmissionParameters['PriorMatrix'] = CorrPriorMatrix

    # Add Pseudo gene to Sequences, Background and Paths
    if NewEmissionParameters['ExpressionParameters'][0] is not None:
        Sequences, Background, NewPaths = add_pseudo_gene(
            Sequences, Background, NewPaths, PriorMatrix)

    # Compute parameters for the expression
    Sequences = h5py.File(NewEmissionParameters['dat_file_clip'], 'r')
    if (NewEmissionParameters['bg_type'] != 'None') and not First:
        if 'Pseudo' in list(Sequences.keys()):
            # NOTE(review): presumably inserts a GLM coefficient row for the
            # pseudo gene (the mean of the per-gene rows) before fitting —
            # confirm the intended row layout of ExpressionParameters[0]
            nr_of_genes = len(list(Sequences.keys()))
            new_pars = NewEmissionParameters['ExpressionParameters'][0]
            new_pars = np.vstack(
                (new_pars[:(nr_of_genes), :],
                 np.mean(new_pars[:(nr_of_genes), :]),
                 new_pars[(nr_of_genes):, :]))
            NewEmissionParameters['ExpressionParameters'][0] = new_pars
    print('Estimating expression parameters')
    get_mem_usage(verbosity)

    NewEmissionParameters = emission_prob.estimate_expression_param(
        (NewEmissionParameters, NewPaths), verbosity=verbosity)

    get_mem_usage(verbosity)

    # Reopen read-only handles (the fitting step may have touched the files)
    Sequences = h5py.File(NewEmissionParameters['dat_file_clip'], 'r')
    Background = h5py.File(NewEmissionParameters['dat_file_bg'], 'r')

    if NewEmissionParameters['bg_type'] != 'None':
        if 'Pseudo' in list(Sequences.keys()):
            # Remove the pseudo gene's coefficient row again after fitting
            nr_of_genes = len(list(Sequences.keys()))
            new_pars = NewEmissionParameters['ExpressionParameters'][0]
            new_pars = np.vstack((new_pars[:(nr_of_genes-1), :],
                                  new_pars[(nr_of_genes):, :]))
            NewEmissionParameters['ExpressionParameters'][0] = new_pars

    if NewEmissionParameters['skip_diag_event_mdl'] is False:
        # Compute parameters for the ratios
        print('Computing sufficient statistic for fitting md')
        get_mem_usage(verbosity)

        SuffStat = tools.GetSuffStat(
            NewPaths, NrOfStates, Type='Conv',
            EmissionParameters=NewEmissionParameters, verbosity=verbosity)

        # Vectorize SuffStat
        Counts, NrOfCounts = tools.ConvertSuffStatToArrays(SuffStat)

        del SuffStat
        get_mem_usage(verbosity)
        if NewEmissionParameters['subs']:
            # Optionally subsample the sufficient statistics to save memory
            Counts, NrOfCounts = tools.subsample_suff_stat(Counts, NrOfCounts)

        print('Fitting md distribution')
        get_mem_usage(verbosity)
        if NewEmissionParameters['diag_bg']:
            print("Adjusting background")
            SuffStatBck = tools.GetSuffStatBck(
                NewPaths, NrOfStates, Type='Conv',
                EmissionParameters=NewEmissionParameters, verbosity=verbosity)

            # Vectorize SuffStat
            CountsBck, NrOfCountsBck = tools.ConvertSuffStatToArrays(
                SuffStatBck)

            if NewEmissionParameters['subs']:
                CountsBck, NrOfCountsBck = tools.subsample_suff_stat(
                    CountsBck, NrOfCountsBck)

            # Overwrite counts in other bins
            fg_state, bg_state = emission_prob.get_fg_and_bck_state(
                NewEmissionParameters, final_pred=True)

            # NOTE(review): every non-foreground state receives the
            # *foreground*-state background counts (CountsBck[fg_state]),
            # not CountsBck[curr_state] — confirm this is intended
            for curr_state in list(Counts.keys()):
                if curr_state != fg_state:
                    Counts[curr_state] = CountsBck[fg_state]
                    NrOfCounts[curr_state] = NrOfCountsBck[fg_state]

            del SuffStatBck

        # EM fit of the diagnostic-event mixture, warm-started at OldAlpha
        NewEmissionParameters = mixture_tools.em(
            Counts, NrOfCounts, NewEmissionParameters, x_0=OldAlpha,
            First=First, verbosity=verbosity)

        get_mem_usage(verbosity)
        del Counts, NrOfCounts

    # Clean up: remove the pseudo gene from the data files and the paths
    if 'Pseudo' in list(Sequences.keys()):
        del Sequences['Pseudo']
        del Background['Pseudo']
        del NewPaths['Pseudo']

    if verbosity > 0:
        print('Done: Elapsed time: ' + str(time.time() - t))
    return NewEmissionParameters
Example #5
0
def PerformIteration(Sequences, Background, IterParameters, NrOfStates, First,
                     NewPaths=None, verbosity=1):
    """
    This function performs an iteration of the HMM algorithm.

    One EM-style round: on the first iteration initial paths are
    computed, then the emission and transition parameters are refit and
    the most likely paths are recomputed.

    Returns (log-likelihood, new iteration parameters, First flag,
    new paths).
    """
    # BUG FIX: a mutable default ({}) would be shared and mutated across
    # calls; use a None sentinel instead.
    if NewPaths is None:
        NewPaths = {}

    # Unpack the Iteration parameters
    EmissionParameters = IterParameters[0]
    TransitionParameters = IterParameters[1]

    # Get new most likely path
    if First:
        NewPaths, LogLike = tools.ParallelGetMostLikelyPath(
            NewPaths, Sequences, Background, EmissionParameters,
            TransitionParameters, 'h**o', RandomNoise=True,
            verbosity=verbosity)

        # The path computation closes the handles; reopen them
        Sequences = h5py.File(EmissionParameters['dat_file_clip'], 'r')
        Background = h5py.File(EmissionParameters['dat_file_bg'], 'r')

        get_mem_usage(verbosity)

    # Perform EM to compute the new emission parameters
    print('Fitting emission parameters')
    get_mem_usage(verbosity)

    NewEmissionParameters = FitEmissionParameters(
        Sequences, Background, NewPaths, EmissionParameters, First,
        verbosity=verbosity)

    if First:
        First = 0

    get_mem_usage(verbosity)

    # Fit the transition matrix parameters
    NewTransitionParameters = TransitionParameters
    print('Fitting transition parameters')
    get_mem_usage(verbosity)

    LoadReads.close_data_handles(handles=[Sequences, Background])
    Sequences = h5py.File(EmissionParameters['dat_file_clip'], 'r')
    Background = h5py.File(EmissionParameters['dat_file_bg'], 'r')

    TransistionPredictors = trans.FitTransistionParameters(
        Sequences, Background, TransitionParameters, NewPaths,
        verbosity=verbosity)

    NewTransitionParameters[1] = TransistionPredictors
    get_mem_usage(verbosity)

    NewIterParameters = [NewEmissionParameters, NewTransitionParameters]

    print('Computing most likely path')
    get_mem_usage(verbosity)

    gc.collect()
    # NOTE(review): this pass uses the OLD EmissionParameters and
    # TransitionParameters rather than the freshly fitted ones — confirm
    # this is the intended EM scheme.
    NewPaths, LogLike = tools.ParallelGetMostLikelyPath(
        NewPaths, Sequences, Background, EmissionParameters,
        TransitionParameters, 'nonhomo', verbosity=verbosity)

    Sequences = h5py.File(EmissionParameters['dat_file_clip'], 'r')
    Background = h5py.File(EmissionParameters['dat_file_bg'], 'r')

    CurrLogLikelihood = LogLike
    get_mem_usage(verbosity)
    if verbosity > 1:
        print('LogLik:')
        print(CurrLogLikelihood)
    return CurrLogLikelihood, NewIterParameters, First, NewPaths
Example #6
0
def ParallelGetMostLikelyPathForGene(data):
    """Compute the most likely path for a gene.

    *data* is the tuple (gene, nr_of_genes, gene_nr, EmissionParameters,
    TransitionParameters, TransitionTypeFirst, RandomNoise).

    Returns [gene, path array, log-likelihood].
    """
    (gene, nr_of_genes, gene_nr, EmissionParameters,
     TransitionParameters, TransitionTypeFirst, RandomNoise) = data

    # Turn the Sequence and Background objects into dictionaries again such
    # that the subsequent methods for using these do not need to be modified
    Sequences = h5py.File(EmissionParameters['dat_file_clip'], 'r')
    Background = h5py.File(EmissionParameters['dat_file_bg'], 'r')

    # Parse the parameters
    alpha = EmissionParameters['Diag_event_params']
    PriorMatrix = EmissionParameters['PriorMatrix']
    NrOfStates = EmissionParameters['NrOfStates']

    fg_state, bg_state = emission_prob.get_fg_and_bck_state(
        EmissionParameters, final_pred=True)
    # Score the state sequences
    # 1) Determine the positions where an observation is possible

    Sequences_per_gene = PreloadSequencesForGene(Sequences, gene)
    Background_per_gene = PreloadSequencesForGene(Background, gene)

    Ix = GetModelIx(Sequences_per_gene, Type='all')

    if np.sum(Ix) == 0:
        # No observations at all: return an empty path.
        # BUG FIX: np.int was removed in NumPy 1.24; use the builtin int
        CurrPath = 2 * np.ones((0, Ix.shape[0]), dtype=int)
        return [gene, CurrPath, 0]

    if EmissionParameters['filter_snps']:
        # Mask positions that look like SNPs rather than cross-link events
        Ix = GetModelIx(Sequences_per_gene, Type='no_snps_conv',
                        snps_thresh=EmissionParameters['snps_thresh'],
                        snps_min_cov=EmissionParameters['snps_min_cov'],
                        Background=Background_per_gene)
    else:
        Ix = GetModelIx(Sequences_per_gene)

    # 2) Compute the probabilities for both states, starting from a
    # uniform distribution over states
    EmmisionProbGene = (np.ones((NrOfStates, Ix.shape[0]))
                        * (1 / np.float64(NrOfStates)))

    CurrStackSum = StackData(Sequences_per_gene)
    CurrStackVar = StackData(Sequences_per_gene, add='no')
    CurrStackSumBck = StackData(Background_per_gene, add='only_cov')

    # weight1 scales the GLM (expression) term and weight2 the
    # diagnostic-event term; values 0 and 1 are clamped to avoid log(0),
    # negative glm_weight disables the weighting
    if EmissionParameters['glm_weight'] < 0.0:
        weight1 = 1.0
        weight2 = 1.0
    elif EmissionParameters['glm_weight'] == 0.0:
        weight1 = 0.0000001
        weight2 = 1.0 - weight1
    elif EmissionParameters['glm_weight'] == 1.0:
        weight1 = 0.9999999
        weight2 = 1.0 - weight1
    else:
        weight1 = EmissionParameters['glm_weight']
        weight2 = (1.0 - EmissionParameters['glm_weight'])

    for State in range(NrOfStates):
        if not EmissionParameters['ign_GLM']:
            if isinstance(EmissionParameters['ExpressionParameters'][0], np.ndarray):
                EmmisionProbGene[State, :] = np.log(weight1) + emission_prob.predict_expression_log_likelihood_for_gene(CurrStackSum, State, nr_of_genes, gene_nr, EmissionParameters)
                if EmissionParameters['bg_type'] == 'Coverage':
                    EmmisionProbGene[State, :] += np.log(weight1) + emission_prob.predict_expression_log_likelihood_for_gene(CurrStackSumBck, State, nr_of_genes, gene_nr, EmissionParameters, 'bg')
                if EmissionParameters['bg_type'] == 'Coverage_bck':
                    EmmisionProbGene[State, :] += np.log(weight1) + emission_prob.predict_expression_log_likelihood_for_gene(CurrStackSumBck, State, nr_of_genes, gene_nr, EmissionParameters, 'bg')
        if not EmissionParameters['ign_diag']:
            EmmisionProbGene[State, Ix] += np.log(weight2) + diag_event_model.pred_log_lik(CurrStackVar[:, Ix], State, EmissionParameters)
        if State == fg_state:
            # Penalise the foreground state in the last iteration
            if EmissionParameters['LastIter']:
                EmmisionProbGene[State, :] -= EmissionParameters['fg_pen']
    if RandomNoise:
        # Add some random noise below the smallest finite log-probability
        # to break ties between states
        EmmisionProbGene = np.logaddexp(
            EmmisionProbGene, np.random.uniform(
                np.min(EmmisionProbGene[np.isfinite(EmmisionProbGene)]) - 4,
                np.min(EmmisionProbGene[np.isfinite(EmmisionProbGene)]) - 0.1,
                EmmisionProbGene.shape))  # Add some random noise

    # Get the transition probabilities, tiled along the gene positions
    TransistionProbabilities = np.float64(np.tile(
            np.log(TransitionParameters[0]),
            (EmmisionProbGene.shape[1], 1, 1)).T)

    CurrPath, Currloglik = viterbi.viterbi(
        np.float64(EmmisionProbGene), TransistionProbabilities,
        np.float64(np.log(PriorMatrix)))
    CurrPath = np.int8(CurrPath)

    del (TransistionProbabilities, EmmisionProbGene, CurrStackSum,
         CurrStackVar, CurrStackSumBck, Ix)

    LoadReads.close_data_handles(handles=[Sequences, Background])

    return [gene, CurrPath, Currloglik]
Example #7
0
def GetSitesForGene(data):
    """Determine the score of the sites for each gene.

    *data* is the tuple built by GetSites: (Sites, gene, nr_of_genes,
    gene_nr, seq_file, bck_file, EmissionParameters,
    TransitionParameters, TransitionTypeFirst, fg_state,
    merge_neighbouring_sites, minimal_site_length).

    Returns (gene, sites), where sites is a list of per-site dicts with
    keys such as Start, Stop, Strand, SiteScore, Coverage, Variants, pv.
    """
    # Computing the probabilities for the current gene
    (Sites, gene, nr_of_genes, gene_nr, seq_file, bck_file,
     EmissionParameters, TransitionParameters, TransitionTypeFirst,
     fg_state, merge_neighbouring_sites, minimal_site_length) = data

    # Turn the Sequence and Background objects into dictionaries again such
    # that the subsequent methods for using these do not need to be modified
    if len(Sites) == 0:
        # Nothing to score for this gene
        return gene, []

    NrOfStates = EmissionParameters['NrOfStates']

    # Rewrap the site list keyed by gene name
    Sites = dict([(gene, Sites)])

    Sequences = h5py.File(EmissionParameters['dat_file_clip'], 'r')
    Background = h5py.File(EmissionParameters['dat_file_bg'], 'r')

    Sequences_per_gene = PreloadSequencesForGene(Sequences, gene)
    Background_per_gene = PreloadSequencesForGene(Background, gene)

    # Positions where an observation is possible
    Ix = GetModelIx(Sequences_per_gene, Type='all')

    if np.sum(Ix) == 0:
        return gene, []

    if EmissionParameters['filter_snps']:
        # Mask positions that look like SNPs rather than cross-link events
        Ix = GetModelIx(Sequences_per_gene, Type='no_snps_conv',
                        snps_thresh=EmissionParameters['snps_thresh'],
                        snps_min_cov=EmissionParameters['snps_min_cov'],
                        Background=Background_per_gene)
    else:
        Ix = GetModelIx(Sequences_per_gene, Type='Conv')

    # Only compute the emission probability for regions where a site is
    # (each site extended by one position on either side)
    ix_sites = np.zeros_like(Ix)
    ix_sites_len = Ix.shape[0]
    for currsite in Sites[gene]:
        ix_sites[max(0, currsite[0] - 1): min(ix_sites_len, currsite[1] + 1)] = 1
    ix_sites = ix_sites == 1

    # 2) Compute the probabilities for both states, starting from a
    # uniform log-distribution over states
    EmmisionProbGene = np.log(
        np.ones((NrOfStates, Ix.shape[0])) * (1 / np.float64(NrOfStates)))
    CurrStackSum = StackData(Sequences_per_gene)
    CurrStackVar = StackData(Sequences_per_gene, add='no')
    CurrStackSumBck = StackData(Background_per_gene, add='only_cov')

    CurrStackVarSumm = StackData(Sequences_per_gene, add='only_var_summed')
    EmmisionProbGeneDir = np.zeros_like(EmmisionProbGene)

    # weight1 scales the GLM (expression) term and weight2 the
    # diagnostic-event term; values 0 and 1 are clamped to avoid log(0),
    # negative glm_weight disables the weighting
    if EmissionParameters['glm_weight'] < 0.0:
        weight1 = 1.0
        weight2 = 1.0
    elif EmissionParameters['glm_weight'] == 0.0:
        weight1 = 0.0000001
        weight2 = 1.0 - weight1
    elif EmissionParameters['glm_weight'] == 1.0:
        weight1 = 0.9999999
        weight2 = 1.0 - weight1
    else:
        weight1 = EmissionParameters['glm_weight']
        weight2 = (1.0 - EmissionParameters['glm_weight'])

    for State in range(NrOfStates):
        EmmisionProbGene[State, ix_sites] = np.log(weight1) + emission_prob.predict_expression_log_likelihood_for_gene(CurrStackSum[:, ix_sites], State, nr_of_genes, gene_nr, EmissionParameters)
        if EmissionParameters['bg_type'] == 'Coverage':
            EmmisionProbGene[State, ix_sites] += np.log(weight1) + emission_prob.predict_expression_log_likelihood_for_gene(CurrStackSumBck[:, ix_sites], State, nr_of_genes, gene_nr, EmissionParameters, 'bg')
        if EmissionParameters['bg_type'] == 'Coverage_bck':
            EmmisionProbGene[State, ix_sites] += np.log(weight1) + emission_prob.predict_expression_log_likelihood_for_gene(CurrStackSumBck[:, ix_sites], State, nr_of_genes, gene_nr, EmissionParameters, 'bg')
        EmmisionProbGeneDir[State, Ix] = np.log(weight2) + diag_event_model.pred_log_lik(CurrStackVar[:, Ix], State, EmissionParameters)
        # NOTE(review): EmmisionProbGeneDir already contains np.log(weight2)
        # (assigned just above), so weight2 is applied twice here — confirm
        # this is intended
        EmmisionProbGene[State, Ix] += np.log(weight2) + EmmisionProbGeneDir[State, Ix]

    Counts = StackData(Sequences_per_gene, add='all')

    Score = EmmisionProbGene
    CurrStack = CurrStackVar
    # Compute the scores when staying in the same state
    # RowIx = list(range(16)) + list(range(17, 38)) + list(range(39,44))
    strand = Sequences_per_gene['strand']

    # Get the coverages for the froeground and background
    CountsSeq = StackData(Sequences_per_gene, add='only_cov')
    CountsBck = StackData(Background_per_gene, add='only_cov')

    # A strand value of 0 is remapped to -1 — presumably encoding the
    # minus strand; verify against the data-loading code
    if strand == 0:
        strand = -1
    # Since we the transition probabilty is the same for all States we do not
    # need to compute it for the bayes factor this list contains the returned
    # sites
    sites = []
    for currsite in Sites[gene]:
        (mean_mat_fg, var_mat_fg, mean_mat_bg, var_mat_bg, counts_fg, counts_bg) = ComputeStatsForSite(CountsSeq, CountsBck, currsite, fg_state, nr_of_genes, gene_nr, EmissionParameters)

        site = {}
        site['Start'] = currsite[0]
        site['Stop'] = currsite[1]
        site['Strand'] = strand
        site['SiteScore'] = EvaluateSite(Score, currsite, fg_state)
        site['Coverage'] = np.sum(np.sum(Counts[:, site['Start']:site['Stop']], axis=0))
        site['Variants'] = np.sum(CurrStackVarSumm[:, site['Start']:site['Stop']], axis=1)
        site['mean_mat_fg'] = mean_mat_fg
        site['var_mat_fg'] = var_mat_fg
        site['mean_mat_bg'] = mean_mat_bg
        site['var_mat_bg'] = var_mat_bg
        site['counts_fg'] = counts_fg
        site['counts_bg'] = counts_bg

        # Negative-binomial parameterisation from mean/variance:
        # p = mean/var, n = mean^2/(var - mean); pv is the log survival
        # function, i.e. log P(X >= counts_fg) under the fg model
        p = mean_mat_fg / var_mat_fg
        n = (mean_mat_fg ** 2) / (var_mat_fg - mean_mat_fg)
        site['pv'] = nbinom.logsf(counts_fg, n, p)
        site['max_pos'] = get_max_position(Score, currsite, fg_state, strand)
        site['dir_score'] = EvaluateSite(EmmisionProbGeneDir, currsite, fg_state)
        # Discard sites with a negative overall score
        if site['SiteScore'] < 0.0:
            continue
        sites.append(site)

    LoadReads.close_data_handles(handles=[Sequences, Background])

    return gene, sites