Beispiel #1
0
def FitEmissionParameters(Sequences, Background, NewPaths,
                          OldEmissionParameters, First):
    '''
    Re-estimate the emission parameters from the current state paths.

    The state prior is recomputed from the state counts in NewPaths, the
    expression parameters are refitted by emission.estimate_expression_param
    and, unless the diagnostic event model is skipped, the mixture model is
    refitted by mixture_tools.em.

    Args:
        Sequences: mapping gene -> foreground data (handed to the helpers).
        Background: mapping gene -> background data (handed to the helpers).
        NewPaths: mapping gene -> array of state indices.
        OldEmissionParameters: dict with the current emission parameters.
            NOTE: no copy is taken, the dict is modified in place
            ('PriorMatrix' and possibly 'ExpressionParameters').
        First: passed through unchanged to mixture_tools.em.

    Returns:
        The updated emission parameter dict.
    '''

    def report_memory():
        # Peak resident set size of this process as reported by getrusage
        print('Memory usage: %s (kb)' % resource.getrusage(
            resource.RUSAGE_SELF).ru_maxrss)

    print('Fitting emission parameters')
    t = time.time()
    #Unpack the arguments
    OldAlpha = OldEmissionParameters['Diag_event_params']
    NrOfStates = OldEmissionParameters['NrOfStates']
    OldPriorMatrix = OldEmissionParameters['PriorMatrix']
    #This is an alias of the input dict, not a copy
    NewEmissionParameters = OldEmissionParameters

    #Compute new prior matrix: number of positions assigned to each state
    PriorMatrix = np.zeros_like(OldPriorMatrix)
    for State in range(NrOfStates):
        for path in NewPaths:
            PriorMatrix[State] += np.sum(NewPaths[path] == State)

    #Unused states get a tenth of the smallest observed count so that the
    #normalised prior contains no zeros
    CorrectedPriorMatrix = np.copy(PriorMatrix)
    CorrectedPriorMatrix[CorrectedPriorMatrix == 0] = np.min(
        CorrectedPriorMatrix[CorrectedPriorMatrix > 0]) / 10
    CorrectedPriorMatrix /= np.sum(CorrectedPriorMatrix)
    NewEmissionParameters['PriorMatrix'] = CorrectedPriorMatrix

    #Add Pseudo gene to Sequences, Background and Paths
    #(the uncorrected counts are handed over here)
    if NewEmissionParameters['ExpressionParameters'][0] is not None:
        Sequences, Background, NewPaths, pseudo_gene_names = add_pseudo_gene(
            Sequences, Background, NewPaths, PriorMatrix)

    #Compute parameters for the expression
    sample_size = 10000

    if NewEmissionParameters['BckType'] != 'None':
        if 'Pseudo' in Sequences:
            #Insert one extra row (the overall mean of the leading rows)
            #for the pseudo gene
            nr_of_genes = len(Sequences.keys())
            new_pars = NewEmissionParameters['ExpressionParameters'][0]
            new_pars = np.vstack((new_pars[:(nr_of_genes), :],
                                  np.mean(new_pars[:(nr_of_genes), :]),
                                  new_pars[(nr_of_genes):, :]))
            NewEmissionParameters['ExpressionParameters'][0] = new_pars
    print('Estimating expression parameters')
    report_memory()

    bg_type = NewEmissionParameters['BckType']
    expr_data = (NewEmissionParameters, Sequences, Background, NewPaths,
                 sample_size, bg_type)
    NewEmissionParameters = emission.estimate_expression_param(expr_data)

    report_memory()

    if NewEmissionParameters['BckType'] != 'None':
        if 'Pseudo' in Sequences:
            #Drop the row that belongs to the pseudo gene again
            nr_of_genes = len(Sequences.keys())
            new_pars = NewEmissionParameters['ExpressionParameters'][0]
            new_pars = np.vstack(
                (new_pars[:(nr_of_genes - 1), :], new_pars[(nr_of_genes):, :]))
            NewEmissionParameters['ExpressionParameters'][0] = new_pars

    #The original code read 'use_precomp_diagmod' from an undefined name
    #(EmissionParameters), which raised a NameError whenever
    #'skip_diag_event_mdl' was set. The explicit "== False" is kept on
    #purpose so that non-boolean values behave as before.
    if (NewEmissionParameters['skip_diag_event_mdl'] == False) or (
            NewEmissionParameters['use_precomp_diagmod'] is not None):
        #Compute parameters for the ratios
        print('computing sufficient statitics for fitting md')
        report_memory()
        SuffStat = tools.GetSuffStat(Sequences,
                                     Background,
                                     NewPaths,
                                     NrOfStates,
                                     Type='Conv',
                                     EmissionParameters=NewEmissionParameters)

        #Vectorize SuffStat
        Counts, NrOfCounts = tools.ConvertSuffStatToArrays(SuffStat)

        report_memory()
        if NewEmissionParameters['Subsample']:
            Counts, NrOfCounts = tools.subsample_suff_stat(Counts, NrOfCounts)

        print('fitting md distribution')
        report_memory()

        if NewEmissionParameters['diag_bg']:
            print("Adjusting background")
            SuffStatBck = tools.GetSuffStatBck(
                Sequences,
                Background,
                NewPaths,
                NrOfStates,
                Type='Conv',
                EmissionParameters=NewEmissionParameters)
            #Vectorize SuffStat
            CountsBck, NrOfCountsBck = tools.ConvertSuffStatToArrays(
                SuffStatBck)

            if NewEmissionParameters['Subsample']:
                CountsBck, NrOfCountsBck = tools.subsample_suff_stat(
                    CountsBck, NrOfCountsBck)

            #Overwrite counts in other bins: every state except the
            #foreground state gets the background counts stored under
            #the foreground state key
            fg_state, bg_state = emission.get_fg_and_bck_state(
                NewEmissionParameters, final_pred=True)
            for curr_state in Counts.keys():
                if curr_state != fg_state:
                    Counts[curr_state] = CountsBck[fg_state]
                    NrOfCounts[curr_state] = NrOfCountsBck[fg_state]

        NewEmissionParameters = mixture_tools.em(Counts,
                                                 NrOfCounts,
                                                 NewEmissionParameters,
                                                 x_0=OldAlpha,
                                                 First=First)
        report_memory()
        del Counts, NrOfCounts, SuffStat

    #Remove the pseudo gene again
    if 'Pseudo' in Sequences:
        del Sequences['Pseudo']
        del Background['Pseudo']
        del NewPaths['Pseudo']

    print('Done: Elapsed time: ' + str(time.time() - t))
    return NewEmissionParameters
Beispiel #2
0
def _site_log_odds(log_probs, fg_state, ix_bg):
    '''
    Per-position log-odds of the foreground state against the other states.

    Args:
        log_probs: array (states x positions) of log-probabilities.
        fg_state: row index of the foreground state.
        ix_bg: list with the row indices of all remaining states.

    Returns:
        Array with one score per position. Positions whose normaliser is
        inf or NaN are set to NaN so that the result stays aligned with the
        position axis. If no position has a usable normaliser, the raw
        foreground log-probabilities are returned instead.
    '''
    FGScore = log_probs[fg_state, :]
    norm = logsumexp(log_probs[ix_bg, :], axis=0)
    ix_bad = np.isinf(norm) | np.isnan(norm)
    if np.sum(ix_bad) < norm.shape[0]:
        ix_good = ~ix_bad
        SiteScore = np.full(norm.shape, np.nan)
        SiteScore[ix_good] = FGScore[ix_good] - norm[ix_good]
        return SiteScore
    print('Score problematic')
    return FGScore


def PlotGene(Sequences,
             Background,
             gene,
             IterParameters,
             TransitionTypeFirst='nonhomo',
             no_plot=False,
             Start=0,
             Stop=-1,
             figsize=(6, 8),
             dir_ylim=None,
             out_name=None):
    '''
    This function plots the coverage and the parameters of the model for
    one gene.

    Args:
        Sequences: mapping gene -> foreground data.
        Background: mapping gene -> background data.
        gene: key of the gene to process.
        IterParameters: sequence (EmissionParameters, TransitionParameters).
        TransitionTypeFirst: 'nonhomo' predicts position-specific transitions,
            anything else tiles the log of TransitionParameters[0].
        no_plot: if True, return directly after the Viterbi decoding.
        Start, Stop: plotted position range; Stop == -1 means "up to the end".
        figsize: figure size passed to plt.subplots.
        dir_ylim: optional y-limits of the DMM panel (None or empty: automatic).
        out_name: if not None, the figure is saved to this path.

    Returns:
        (MostLikelyPath, TransistionProbabilities, EmmisionProbGene) if
        no_plot is set, otherwise
        (MostLikelyPath, TransistionProbabilities, EmmisionProbGeneNB_fg).
        NOTE(review): the third element differs between the two paths; kept
        as in the original because callers may rely on it.
    '''

    reload(diag_event_model)
    reload(emission)
    set2 = brewer2mpl.get_map('Dark2', 'qualitative', 8).mpl_colors
    TransitionParameters = IterParameters[1]
    EmissionParameters = IterParameters[0]
    TransitionType = EmissionParameters['TransitionType']
    PriorMatrix = EmissionParameters['PriorMatrix']
    NrOfStates = EmissionParameters['NrOfStates']

    Sequences_per_gene = PreloadSequencesForGene(Sequences, gene)
    Background_per_gene = PreloadSequencesForGene(Background, gene)

    if EmissionParameters['FilterSNPs']:
        Ix = tools.GetModelIx(Sequences_per_gene,
                              Type='no_snps_conv',
                              snps_thresh=EmissionParameters['SnpRatio'],
                              snps_min_cov=EmissionParameters['SnpAbs'],
                              Background=Background_per_gene)
    else:
        Ix = tools.GetModelIx(Sequences_per_gene)

    #2) Compute the probabilities for both states
    def uniform_log_prob():
        #Every call returns a fresh array initialised with log(1 / NrOfStates)
        return np.log(
            np.ones((NrOfStates, Ix.shape[0])) * (1 / np.float64(NrOfStates)))

    EmmisionProbGene = uniform_log_prob()
    EmmisionProbGene_Dir = uniform_log_prob()
    EmmisionProbGeneNB_fg = uniform_log_prob()
    EmmisionProbGeneNB_bg = uniform_log_prob()

    CurrStackSum = tools.StackData(Sequences_per_gene)
    CurrStackVar = tools.StackData(Sequences_per_gene, add='no')
    nr_of_genes = len(Sequences.keys())
    gene_nr_dict = {}
    for i, curr_gene in enumerate(Sequences.keys()):
        gene_nr_dict[curr_gene] = i

    #Compute the emission probapility
    #"is not None" instead of "== None": the parameters are usually an array,
    #for which "==" is evaluated element-wise
    has_expr_model = EmissionParameters['ExpressionParameters'][0] is not None
    uses_bg_cov = EmissionParameters['BckType'] in ('Coverage', 'Coverage_bck')
    for State in range(NrOfStates):
        if has_expr_model:
            fg_log_lik = emission.predict_expression_log_likelihood_for_gene(
                CurrStackSum, State, nr_of_genes, gene_nr_dict[gene],
                EmissionParameters)
            EmmisionProbGene[State, :] = fg_log_lik
            EmmisionProbGeneNB_fg[State, :] = fg_log_lik
            if uses_bg_cov:
                #NOTE(review): here StackData receives the full Background
                #plus the gene, elsewhere the per-gene data without the gene
                #argument - confirm against tools.StackData
                bg_log_lik = emission.predict_expression_log_likelihood_for_gene(
                    tools.StackData(Background, gene, add='only_cov') + 0,
                    State,
                    nr_of_genes,
                    gene_nr_dict[gene],
                    EmissionParameters,
                    curr_type='bg')
                EmmisionProbGene[State, :] += bg_log_lik
                EmmisionProbGeneNB_bg[State, :] = bg_log_lik
        if not EmissionParameters['ign_diag']:
            diag_log_lik = diag_event_model.pred_log_lik(
                CurrStackVar[:, Ix], State, EmissionParameters)
            EmmisionProbGene[State, Ix] += diag_log_lik
            EmmisionProbGene_Dir[State, Ix] = diag_log_lik

    #Get the transition probabilities
    if TransitionTypeFirst == 'nonhomo':
        if TransitionType == 'unif_bck' or TransitionType == 'binary_bck':
            CountsSeq = tools.StackData(Sequences_per_gene, add='all')
            CountsBck = tools.StackData(Background_per_gene, add='only_cov')
            Counts = np.vstack((CountsSeq, CountsBck))
        else:
            Counts = tools.StackData(Sequences_per_gene, add='all')
        TransistionProbabilities = np.float64(
            trans.PredictTransistions(Counts, TransitionParameters, NrOfStates,
                                      TransitionType))
    else:
        TransistionProbabilities = np.float64(
            np.tile(np.log(TransitionParameters[0]),
                    (EmmisionProbGene.shape[1], 1, 1)).T)

    MostLikelyPath, LogLik = viterbi.viterbi(np.float64(EmmisionProbGene),
                                             TransistionProbabilities,
                                             np.float64(np.log(PriorMatrix)))
    #Report how many positions were assigned to each state
    for j in range(NrOfStates):
        print(str(np.sum(MostLikelyPath == j)))

    if no_plot:
        return MostLikelyPath, TransistionProbabilities, EmmisionProbGene

    fig, axes = plt.subplots(nrows=9, figsize=figsize)
    fig.subplots_adjust(hspace=1.001)

    Counts = tools.StackData(Sequences_per_gene, gene, add='no')
    if Stop == -1:
        Stop = Counts.shape[1]
    plt_rng = np.array(range(Start, Stop))

    def plot_states(ax, values_per_state, label_prefix):
        #One line per state, coloured by state index
        for j in range(NrOfStates):
            ppl.plot(ax,
                     plt_rng,
                     values_per_state[j, :][Start:Stop],
                     label=label_prefix + str(j),
                     linewidth=2,
                     color=set2[j])

    def clip_ylim(ax, values_per_state, nr_rows):
        #Lower limit: minimum over the first nr_rows states in the plotted
        #range, upper limit: 1
        ax.set_ylim(
            [np.min(values_per_state[0:nr_rows, :][:, Start:Stop]), 1])

    def finish_axis(ax, ylabel, title):
        ax.set_ylabel(ylabel)
        ax.set_xlabel('Position')
        ax.set_title(title)
        ax.get_xaxis().get_major_formatter().set_useOffset(False)

    #Panel 0: summed count tracks of the foreground and background coverage
    nr_of_rep_fg = len(Sequences[gene]['Coverage'].keys())
    track_groups = [
        ('TC', [2, 16]),
        ('NonTC', [0, 1, 3, 5, 6, 7, 8, 10, 11, 12, 13, 15, 17, 18]),
        ('Read-ends', [20]),
        ('Deletions', [4, 9, 14, 19]),
        ('Coverage', [21]),
    ]
    for color_nr, (label, tracks) in enumerate(track_groups):
        track_ix = repl_track_nr(tracks, 22, nr_of_rep_fg)
        ppl.plot(axes[0],
                 plt_rng, (np.sum(Counts[track_ix, :], axis=0))[Start:Stop],
                 label=label,
                 linewidth=2,
                 color=set2[color_nr])
    finish_axis(axes[0], 'Counts', 'Coverage and Conversions')

    #NOTE(review): the first replicate is read with the integer key 0, the
    #others with string keys, and the sum is accumulated in place into the
    #first replicate - confirm that this is intended
    BckCov = Background_per_gene['Coverage'][0]
    for i in range(1, len(Background_per_gene['Coverage'].keys())):
        BckCov += Background_per_gene['Coverage'][str(i)]

    ppl.plot(axes[0],
             plt_rng, (BckCov.T)[Start:Stop],
             ls='-',
             label='Bck',
             linewidth=2,
             color=set2[len(track_groups)])
    ppl.legend(axes[0])

    #Panel 1: self-transition log-probabilities
    for j in range(NrOfStates):
        ppl.plot(axes[1],
                 plt_rng, (TransistionProbabilities[j, j, :])[Start:Stop],
                 label='Transition ' + str(j) + ' ' + str(j),
                 linewidth=2,
                 color=set2[j])
    ppl.legend(axes[1])
    finish_axis(axes[1], 'log-transition probability',
                'Transition probability')

    #Panel 2: total emission log-probabilities
    plot_states(axes[2], EmmisionProbGene, 'Emission ')
    if EmissionParameters['BckType'] == 'Coverage_bck':
        clip_ylim(axes[2], EmmisionProbGene, 2)
    ppl.legend(axes[2])
    finish_axis(axes[2], 'log-GLM probability', 'Emission probability')

    #Panel 3: Viterbi path
    ppl.plot(axes[3], plt_rng, MostLikelyPath[Start:Stop])
    finish_axis(axes[3], 'State', 'Most likely path')

    #Panel 4: diagnostic event model only
    plot_states(axes[4], EmmisionProbGene_Dir, 'Dir State ')
    if dir_ylim is not None and len(dir_ylim) > 0:
        axes[4].set_ylim(dir_ylim)
    ppl.legend(axes[4])
    finish_axis(axes[4], 'log-DMM probability', 'DMM probability')

    #Panel 5: expression model, foreground
    plot_states(axes[5], EmmisionProbGeneNB_fg, 'NB fg ')
    if EmissionParameters['BckType'] == 'Coverage_bck':
        clip_ylim(axes[5], EmmisionProbGeneNB_fg, 2)
    ppl.legend(axes[5])
    finish_axis(axes[5], 'prob', 'prob-fg')

    #Panel 6: expression model, background
    plot_states(axes[6], EmmisionProbGeneNB_bg, 'NB bg ')
    if EmissionParameters['BckType'] == 'Coverage_bck':
        clip_ylim(axes[6], EmmisionProbGeneNB_bg, 3)
    ppl.legend(axes[6])
    finish_axis(axes[6], 'prob', 'prob-bg')

    #Panels 7 and 8: log-odds of the foreground state against all others
    fg_state, bg_state = emission.get_fg_and_bck_state(EmissionParameters,
                                                       final_pred=True)
    ix_bg = list(range(EmmisionProbGene.shape[0]))
    ix_bg.remove(fg_state)

    SiteScore = _site_log_odds(EmmisionProbGene, fg_state, ix_bg)
    ppl.plot(axes[7], plt_rng, SiteScore[Start:Stop])
    finish_axis(axes[7], 'log-odd score', 'log-odd score')

    SiteScore = _site_log_odds(EmmisionProbGene_Dir, fg_state, ix_bg)
    ppl.plot(axes[8], plt_rng, SiteScore[Start:Stop])
    finish_axis(axes[8], 'DMM log-odd score', 'DMM log-odd score')

    if out_name is not None:
        print('Saving result')
        fig.savefig(out_name)

    plt.show()

    return MostLikelyPath, TransistionProbabilities, EmmisionProbGeneNB_fg
Beispiel #3
0
def pred_sites(args):
    '''
    Predict sites from the parameters stored in 'IterSaveFile.dat'.

    The saved iteration parameters are loaded from the output directory, the
    foreground and background reads are loaded, genes with too little
    coverage are dropped and tools.GeneratePred is called.

    Args:
        args: ignored; the command line is parsed again with the module level
            parser (kept as in the original).

    Returns:
        None.
    '''
    # Get the args
    args = parser.parse_args()
    print(args)

    #Check parameters
    if len(args.fg_libs) == 0:
        #sys.exit raises SystemExit itself, no extra "raise" is needed
        sys.exit('No CLIP-libraries given')

    if len(args.bg_libs) == 0:
        bg_type = 'None'
    else:
        bg_type = args.bg_type

    if args.out_dir is None:
        out_path = os.getcwdu()
    else:
        out_path = args.out_dir

    # process the parameters
    if not (bg_type == 'Coverage' or bg_type == 'Coverage_bck'):
        print('Bg-type: ' + bg_type + ' has not been implemented yet')
        return

    #Load the saved parameters first: the read loading below needs
    #EmissionParameters, which the original code only assigned afterwards
    #(UnboundLocalError).
    #NOTE: unpickling executes code from the file, only load trusted files.
    with open(os.path.join(out_path, 'IterSaveFile.dat'), 'r') as save_file:
        tmp_file = cPickle.load(save_file)
    IterParameters = tmp_file[0]
    EmissionParameters = IterParameters[0]
    TransitionParameters = IterParameters[1]

    #Load the gene annotation
    print('Loading gene annotation')
    GeneAnnotation = gffutils.FeatureDB(args.gene_anno_file, keep_order=True)
    GenomeDir = args.genome_dir

    #Load the reads
    t = time.time()
    print('Loading reads')
    DataOutFile = os.path.join(out_path, 'fg_reads.dat')
    Sequences = LoadReads.load_data(
        args.fg_libs,
        GenomeDir,
        GeneAnnotation,
        DataOutFile,
        load_from_file=True,
        save_results=False,
        Collapse=args.fg_collapsed,
        ign_out_rds=EmissionParameters['ign_out_rds'],
        rev_strand=EmissionParameters['rev_strand'])

    DataOutFile = os.path.join(out_path, 'bg_reads.dat')
    Background = LoadReads.load_data(
        args.bg_libs,
        GenomeDir,
        GeneAnnotation,
        DataOutFile,
        load_from_file=True,
        save_results=False,
        Collapse=args.bg_collapsed,
        OnlyCoverage=True,
        ign_out_rds=EmissionParameters['ign_out_rds'],
        rev_strand=EmissionParameters['rev_strand'])

    #Removing genes without any reads in the CLIP data
    genes_to_keep = []
    all_genes = Sequences.keys()
    for i, gene in enumerate(Sequences.keys()):
        curr_cov = sum(
            np.sum(Sequences[gene]['Coverage'][rep].toarray())
            for rep in Sequences[gene]['Coverage'].keys())
        curr_neg_vars = sum(
            np.sum(Sequences[gene]['Variants'][rep].toarray() < 0)
            for rep in Sequences[gene]['Variants'].keys())

        #Skip genes with low coverage or negative variant counts
        if curr_cov < 100 or curr_neg_vars > 0:
            continue

        genes_to_keep.append(gene)
        #The sample limit is only checked after a gene has been kept
        if i > args.gene_sample:
            break

    genes_to_del = list(set(all_genes).difference(set(genes_to_keep)))

    for gene in genes_to_del:
        del Sequences[gene]
        del Background[gene]

    del all_genes, genes_to_del, genes_to_keep
    print('Done: Elapsed time: ' + str(time.time() - t))

    fg_state, bg_state = emission.get_fg_and_bck_state(EmissionParameters,
                                                       final_pred=True)
    if EmissionParameters['fg_pen'] > 0.0:
        print('Recomputing paths')
        EmissionParameters['LastIter'] = True
        #Start from an empty path dict; the original passed an unassigned
        #local here
        Paths = {}
        Paths, LogLike = tools.ParallelGetMostLikelyPath(
            Paths, Sequences, Background, EmissionParameters,
            TransitionParameters, 'nonhomo')
        Sequences = h5py.File(EmissionParameters['DataOutFile_seq'], 'r')
        Background = h5py.File(EmissionParameters['DataOutFile_bck'], 'r')

    #NOTE(review): OutFile is not defined in this function; unless it is a
    #module level name this call raises a NameError - confirm the intended
    #output path
    tools.GeneratePred(Sequences, Background, IterParameters, GeneAnnotation,
                       OutFile, fg_state, bg_state)

    print('Done')
Beispiel #4
0
def run_omniCLIP(args):
    # Get the args
    args = parser.parse_args()
    print args

    #Check parameters
    if len(args.fg_libs) == 0:
        raise sys.exit('No CLIP-libraries given')

    if len(args.bg_libs) == 0:
        bg_type = 'None'
    else:
        bg_type = args.bg_type

    if args.out_dir == None:
        out_path = os.getcwdu()
    else:
        out_path = args.out_dir

    MaxIter = args.max_it
    # process the parameters

    if not (bg_type == 'Coverage' or bg_type == 'Coverage_bck'):
        print 'Bg-type: ' + bg_type + ' has not been implemented yet'
        return

    #Set seed for the random number generators
    if args.rnd_seed is not None:
        random.seed(args.rnd_seed)
        print 'setting seed'

    #Set the p-value cutoff for the bed-file creation
    pv_cutoff = args.pv_cutoff

    #Load the gene annotation
    print 'Loading gene annotation'
    GeneAnnotation = gffutils.FeatureDB(args.gene_anno_file, keep_order=True)
    GenomeDir = args.genome_dir

    #Load the reads
    print 'Memory usage: %s (kb)' % resource.getrusage(
        resource.RUSAGE_SELF).ru_maxrss
    print 'Loading reads'

    EmissionParameters = {}

    #Check whether existing iteration parameters should be used
    restart_from_file = args.restart_from_file
    EmissionParameters['restart_from_file'] = restart_from_file

    EmissionParameters['glm_weight'] = args.glm_weight

    EmissionParameters['mask_flank_variants'] = args.mask_flank_variants

    EmissionParameters['max_mm'] = args.max_mm

    EmissionParameters['rev_strand'] = args.rev_strand

    EmissionParameters['skip_diag_event_mdl'] = args.skip_diag_event_mdl

    EmissionParameters['ign_out_rds'] = args.ign_out_rds

    EmissionParameters['DataOutFile_seq'] = os.path.join(
        out_path, 'fg_reads.dat')
    EmissionParameters['DataOutFile_bck'] = os.path.join(
        out_path, 'bg_reads.dat')
    EmissionParameters['tmp_dir'] = args.tmp_dir
    t = time.time()

    Sequences = LoadReads.load_data(
        args.fg_libs,
        GenomeDir,
        GeneAnnotation,
        EmissionParameters['DataOutFile_seq'],
        load_from_file=((not args.overwrite_fg) or restart_from_file),
        save_results=True,
        Collapse=args.fg_collapsed,
        mask_flank_variants=EmissionParameters['mask_flank_variants'],
        max_mm=EmissionParameters['max_mm'],
        ign_out_rds=EmissionParameters['ign_out_rds'],
        rev_strand=EmissionParameters['rev_strand'])
    Background = LoadReads.load_data(
        args.bg_libs,
        GenomeDir,
        GeneAnnotation,
        EmissionParameters['DataOutFile_bck'],
        load_from_file=((not args.overwrite_bg) or restart_from_file),
        save_results=True,
        Collapse=args.bg_collapsed,
        OnlyCoverage=args.only_coverage,
        mask_flank_variants=EmissionParameters['mask_flank_variants'],
        max_mm=EmissionParameters['max_mm'],
        ign_out_rds=EmissionParameters['ign_out_rds'],
        rev_strand=EmissionParameters['rev_strand'])
    #pdb.set_trace()
    #Mask the positions that overlap miRNA sites in the geneome

    Sequences.close()
    Background.close()

    f_name_read_fg = EmissionParameters['DataOutFile_seq']
    f_name_read_bg = EmissionParameters['DataOutFile_bck']

    #Create temporary read-files that can be modified by the masking operations
    if EmissionParameters['tmp_dir'] is None:
        f_name_read_fg_tmp = EmissionParameters['DataOutFile_seq'].replace(
            'fg_reads.dat', 'fg_reads.tmp.dat')
        f_name_read_bg_tmp = EmissionParameters['DataOutFile_bck'].replace(
            'bg_reads.dat', 'bg_reads.tmp.dat')
    else:
        f_name_read_fg_tmp = os.path.join(
            EmissionParameters['tmp_dir'],
            next(tempfile._get_candidate_names()) + '.dat')
        f_name_read_bg_tmp = os.path.join(
            EmissionParameters['tmp_dir'],
            next(tempfile._get_candidate_names()) + '.dat')

    shutil.copy(f_name_read_fg, f_name_read_fg_tmp)
    shutil.copy(f_name_read_bg, f_name_read_bg_tmp)

    #open the temporary read files
    Sequences = h5py.File(f_name_read_fg_tmp, 'r+')
    Background = h5py.File(f_name_read_bg_tmp, 'r+')

    EmissionParameters['DataOutFile_seq'] = f_name_read_fg_tmp
    EmissionParameters['DataOutFile_bck'] = f_name_read_bg_tmp

    #Set coverage for regions that overlapp annotated miRNAs to zero
    EmissionParameters['mask_miRNA'] = args.mask_miRNA
    if args.mask_miRNA:
        print 'Removing miRNA-coverage'
        Sequences = mask_miRNA_positions(Sequences, GeneAnnotation)

    #Mask regions where genes overlap
    EmissionParameters['mask_ovrlp'] = args.mask_ovrlp

    if EmissionParameters['mask_ovrlp']:
        print 'Masking overlapping positions'
        Sequences = mark_overlapping_positions(Sequences, GeneAnnotation)

    #Estimate the library size
    EmissionParameters['BckLibrarySize'] = tools.estimate_library_size(
        Background)
    EmissionParameters['LibrarySize'] = tools.estimate_library_size(Sequences)

    #Removing genes without any reads in the CLIP data
    print "Removing genes without CLIP coverage"

    genes_to_keep = []
    all_genes = Sequences.keys()
    for i, gene in enumerate(Sequences.keys()):
        curr_cov = sum([
            Sequences[gene]['Coverage'][rep].value.sum()
            for rep in Sequences[gene]['Coverage'].keys()
        ])
        curr_neg_vars = sum([
            np.sum(np.sum(Sequences[gene]['Variants'][rep].value < 0))
            for rep in Sequences[gene]['Variants'].keys()
        ])

        if curr_cov <= 100 or curr_neg_vars > 0:
            continue

        genes_to_keep.append(gene)
        if i > args.gene_sample:
            break

    genes_to_del = list(set(all_genes).difference(set(genes_to_keep)))

    for gene in genes_to_del:
        del Sequences[gene]
        del Background[gene]

    del all_genes, genes_to_del, genes_to_keep
    print 'Done: Elapsed time: ' + str(time.time() - t)
    print 'Memory usage: %s (kb)' % resource.getrusage(
        resource.RUSAGE_SELF).ru_maxrss

    #Initializing parameters
    print 'Initialising the parameters'
    if bg_type == 'Coverage_bck':
        NrOfStates = 4
    else:
        NrOfStates = 3

    #Remove the gene sequence from the Sequences and Background when not needed. Currently this is always the case:
    for gene in Sequences.keys():
        if 'GeneSeq' in Sequences[gene]:
            del Sequences[gene]['GeneSeq']

    for gene in Background.keys():
        if 'GeneSeq' in Background[gene]:
            del Background[gene]['GeneSeq']

    #pdb.set_trace()
    TransMat = np.ones((NrOfStates, NrOfStates)) + np.eye(NrOfStates)
    TransMat = TransMat / np.sum(np.sum(TransMat))
    TransitionParameters = [TransMat, []]

    NrOfReplicates = len(args.fg_libs)
    gene = Sequences.keys()[0]

    EmissionParameters['PriorMatrix'] = np.ones(
        (NrOfStates, 1)) / float(NrOfStates)
    EmissionParameters['diag_bg'] = args.diag_bg
    EmissionParameters['emp_var'] = args.emp_var
    EmissionParameters['norm_class'] = args.norm_class

    #Define flag for penalized path prediction
    EmissionParameters['LastIter'] = False
    EmissionParameters['fg_pen'] = args.fg_pen

    EmissionParameters['Diag_event_params'] = {}
    EmissionParameters['Diag_event_params']['nr_mix_comp'] = args.nr_mix_comp
    EmissionParameters['Diag_event_params']['mix_comp'] = {}
    for state in range(NrOfStates):
        mixtures = np.random.uniform(0.0, 1.0, size=(args.nr_mix_comp))
        EmissionParameters['Diag_event_params']['mix_comp'][
            state] = mixtures / np.sum(mixtures)

    #initialise the parameter vector alpha
    alphashape = (Sequences[gene]['Variants']['0'].value.shape[0] +
                  Sequences[gene]['Coverage']['0'].value.shape[0] +
                  Sequences[gene]['Read-ends']['0'].value.shape[0])
    alpha = {}
    for state in range(NrOfStates):
        alpha[state] = np.random.uniform(0.9,
                                         1.1,
                                         size=(alphashape, args.nr_mix_comp))

    EmissionParameters['Diag_event_params']['alpha'] = alpha
    EmissionParameters['Diag_event_type'] = args.diag_event_mod
    EmissionParameters['NrOfStates'] = NrOfStates
    EmissionParameters['NrOfReplicates'] = NrOfReplicates
    EmissionParameters['ExpressionParameters'] = [None, None]
    EmissionParameters['BckType'] = bg_type
    EmissionParameters['NrOfBckReplicates'] = len(args.bg_libs)
    EmissionParameters['TransitionType'] = args.tr_type
    EmissionParameters['Verbosity'] = args.verbosity
    EmissionParameters['NbProc'] = args.nb_proc
    EmissionParameters['Subsample'] = args.subs

    EmissionParameters['FilterSNPs'] = args.filter_snps
    EmissionParameters['SnpRatio'] = args.snps_thresh
    EmissionParameters['SnpAbs'] = args.snps_min_cov
    EmissionParameters['ign_diag'] = args.ign_diag
    if EmissionParameters['ign_out_rds']:
        EmissionParameters['ign_diag'] = EmissionParameters['ign_out_rds']
    EmissionParameters['ign_GLM'] = args.ign_GLM
    EmissionParameters['only_pred'] = args.only_pred

    EmissionParameters['use_precomp_diagmod'] = args.use_precomp_diagmod

    # Transistion parameters
    IterParameters = [EmissionParameters, TransitionParameters]

    #Start computation

    #Iterativly fit the parameters of the model
    OldLogLikelihood = 0
    CurrLogLikelihood = -np.inf
    CurrIter = 0
    LoglikelihodList = []
    First = 1
    IterSaveFile = os.path.join(out_path, 'IterSaveFile.dat')
    IterSaveFileHist = os.path.join(out_path, 'IterSaveFileHist.dat')
    IterHist = []
    Paths = {}
    iter_cond = True
    #Check whether to preload the iteration file
    if EmissionParameters['only_pred']:
        IterParameters, args_old = cPickle.load(open(IterSaveFile, 'r'))
        EmissionParameters['mask_miRNA'] = args.mask_miRNA
        EmissionParameters['glm_weight'] = args.glm_weight
        EmissionParameters['restart_from_file'] = restart_from_file
        EmissionParameters = IterParameters[0]
        EmissionParameters['ign_diag'] = args.ign_diag
        if EmissionParameters['ign_out_rds']:
            EmissionParameters['ign_diag'] = EmissionParameters['ign_out_rds']
        EmissionParameters['ign_GLM'] = args.ign_GLM
        TransitionParameters = IterParameters[1]
        TransitionType = EmissionParameters['TransitionType']
        OldLogLikelihood = -np.inf
        fg_state, bg_state = emission.get_fg_and_bck_state(EmissionParameters,
                                                           final_pred=True)

        Paths, CurrLogLikelihood = tools.ParallelGetMostLikelyPath(
            Paths, Sequences, Background, EmissionParameters,
            TransitionParameters, 'nonhomo')
        Sequences = h5py.File(EmissionParameters['DataOutFile_seq'], 'r')
        Background = h5py.File(EmissionParameters['DataOutFile_bck'], 'r')

        First = 0
        iter_cond = False

    if restart_from_file:
        IterParameters, args_old = cPickle.load(open(IterSaveFile, 'r'))
        EmissionParameters = IterParameters[0]
        EmissionParameters['mask_miRNA'] = args.mask_miRNA
        EmissionParameters['glm_weight'] = args.glm_weight
        EmissionParameters['restart_from_file'] = restart_from_file
        EmissionParameters['ign_diag'] = args.ign_diag
        EmissionParameters['ign_GLM'] = args.ign_GLM
        TransitionParameters = IterParameters[1]
        TransitionType = EmissionParameters['TransitionType']
        OldLogLikelihood = -np.inf
        Paths, CurrLogLikelihood = tools.ParallelGetMostLikelyPath(
            Paths, Sequences, Background, EmissionParameters,
            TransitionParameters, 'nonhomo')
        Sequences = h5py.File(EmissionParameters['DataOutFile_seq'], 'r')
        Background = h5py.File(EmissionParameters['DataOutFile_bck'], 'r')
        First = 1
        iter_cond = True

    if not EmissionParameters['use_precomp_diagmod'] is None:
        IterParametersPreComp, args_old = cPickle.load(
            open(EmissionParameters['use_precomp_diagmod'], 'r'))
        IterParameters[0]['Diag_event_params'] = IterParametersPreComp[0][
            'Diag_event_params']

    while iter_cond:
        print "Iteration: " + str(CurrIter)
        if EmissionParameters['Verbosity'] > 0:
            print IterParameters[0]

        OldLogLikelihood = CurrLogLikelihood

        CurrLogLikelihood, IterParameters, First, Paths = PerformIteration(
            Sequences, Background, IterParameters, NrOfStates, First, Paths)
        gc.collect()

        if True:
            cPickle.dump([IterParameters, args], open(IterSaveFile, 'w'))
        if args.safe_tmp:
            if CurrIter > 0:
                IterHist = cPickle.load(open(IterSaveFileHist, 'r'))
            IterHist.append([IterParameters, CurrLogLikelihood])
            cPickle.dump(IterHist, open(IterSaveFileHist, 'w'))
            del IterHist

        print "Log-likelihood: " + str(CurrLogLikelihood)
        LoglikelihodList.append(CurrLogLikelihood)

        print LoglikelihodList
        CurrIter += 1

        if CurrIter >= MaxIter:
            print 'Maximal number of iterations reached'

        if not restart_from_file:
            if CurrIter < max(3, MaxIter):
                iter_cond = True
            else:
                iter_cond = (CurrIter < MaxIter) and (
                    (abs(CurrLogLikelihood - OldLogLikelihood) /
                     max(abs(CurrLogLikelihood), abs(OldLogLikelihood))) >
                    0.01) and (abs(CurrLogLikelihood - OldLogLikelihood) >
                               args.tol_lg_lik)

        else:
            if np.isinf(OldLogLikelihood):
                iter_cond = (CurrIter < MaxIter) and (
                    abs(CurrLogLikelihood - OldLogLikelihood) >
                    args.tol_lg_lik)
            else:
                iter_cond = (CurrIter < MaxIter) and (
                    (abs(CurrLogLikelihood - OldLogLikelihood) /
                     max(abs(CurrLogLikelihood), abs(OldLogLikelihood))) >
                    0.01) and (abs(CurrLogLikelihood - OldLogLikelihood) >
                               args.tol_lg_lik)

    #Return the fitted parameters
    print 'Finished fitting of parameters'

    EmissionParameters, TransitionParameters = IterParameters
    if not isinstance(EmissionParameters['ExpressionParameters'][0],
                      np.ndarray):
        print 'Emmision parameters have not been fit yet'
        return
    out_file_base = 'pred'
    if EmissionParameters['ign_GLM']:
        out_file_base += '_no_glm'
    if EmissionParameters['ign_diag']:
        out_file_base += '_no_diag'
    OutFile = os.path.join(out_path, out_file_base + '.txt')
    #determine which state has higher weight in fg.
    print 'Memory usage: %s (kb)' % resource.getrusage(
        resource.RUSAGE_SELF).ru_maxrss
    fg_state, bg_state = emission.get_fg_and_bck_state(EmissionParameters,
                                                       final_pred=True)
    if EmissionParameters['fg_pen'] > 0.0:
        print 'Recomputing paths'
        EmissionParameters['LastIter'] = True
        Paths, LogLike = tools.ParallelGetMostLikelyPath(
            Paths, Sequences, Background, EmissionParameters,
            TransitionParameters, 'nonhomo')
        Sequences = h5py.File(EmissionParameters['DataOutFile_seq'], 'r')
        Background = h5py.File(EmissionParameters['DataOutFile_bck'], 'r')

    tools.GeneratePred(Paths,
                       Sequences,
                       Background,
                       IterParameters,
                       GeneAnnotation,
                       OutFile,
                       fg_state,
                       bg_state,
                       seq_file=EmissionParameters['DataOutFile_seq'],
                       bck_file=EmissionParameters['DataOutFile_bck'],
                       pv_cutoff=pv_cutoff)
    print 'Done'

    #Remove the temporary files
    if not (EmissionParameters['tmp_dir'] is None):
        print 'removing temporary files'
        os.remove(EmissionParameters['DataOutFile_seq'])
        os.remove(EmissionParameters['DataOutFile_bck'])

    return
Beispiel #5
0
def ParallelGetMostLikelyPathForGene(data):
    '''
    Compute the most likely state path for a single gene.

    The HDF5 files named by EmissionParameters['DataOutFile_seq'] and
    EmissionParameters['DataOutFile_bck'] are opened read-only inside this
    function and are always closed again before it returns, including on the
    early return for genes without usable positions and when an exception is
    raised.

    data is a 7-element sequence:
        gene                 -- key of the gene in both HDF5 files
        nr_of_genes, gene_nr -- forwarded unchanged to
                                emission.predict_expression_log_likelihood_for_gene
        EmissionParameters   -- dict with the emission settings/parameters
        TransitionParameters -- transition parameters; element 0 is used as
                                the transition matrix when TransitionTypeFirst
                                is not 'nonhomo'
        TransitionTypeFirst  -- 'nonhomo' to obtain the transitions from
                                trans.PredictTransistions on the counts
        RandomNoise          -- if true, uniform random noise is mixed into
                                the emission log-probabilities

    Returns the list [gene, CurrPath, Currloglik]. If GetModelIx(Type='all')
    selects no position, CurrPath is an empty integer array and the
    log-likelihood is 0.
    '''

    (gene, nr_of_genes, gene_nr, EmissionParameters, TransitionParameters,
     TransitionTypeFirst, RandomNoise) = data

    # The context managers guarantee that both handles are released on every
    # exit path of this worker function.
    with h5py.File(EmissionParameters['DataOutFile_seq'], 'r') as Sequences, \
            h5py.File(EmissionParameters['DataOutFile_bck'], 'r') as Background:

        # Parse the parameters
        PriorMatrix = EmissionParameters['PriorMatrix']
        NrOfStates = EmissionParameters['NrOfStates']
        TransitionType = EmissionParameters['TransitionType']
        BckType = EmissionParameters['BckType']

        fg_state, _ = emission.get_fg_and_bck_state(EmissionParameters,
                                                    final_pred=True)
        fg_pen = EmissionParameters['fg_pen']

        # Score the state sequences
        # 1) Determine the positions where an observation is possible
        Sequences_per_gene = PreloadSequencesForGene(Sequences, gene)
        Background_per_gene = PreloadSequencesForGene(Background, gene)

        Ix = GetModelIx(Sequences_per_gene, Type='all')

        if np.sum(Ix) == 0:
            # Nothing to decode for this gene. The builtin int replaces the
            # removed alias np.int (they were the same type).
            CurrPath = 2 * np.ones((0, Ix.shape[0]), dtype=int)
            return [gene, CurrPath, 0]

        if EmissionParameters['FilterSNPs']:
            Ix = GetModelIx(Sequences_per_gene,
                            Type='no_snps_conv',
                            snps_thresh=EmissionParameters['SnpRatio'],
                            snps_min_cov=EmissionParameters['SnpAbs'],
                            Background=Background_per_gene)
        else:
            Ix = GetModelIx(Sequences_per_gene)

        # 2) Compute the probabilities for all states
        EmmisionProbGene = np.ones((NrOfStates, Ix.shape[0])) * (1 / np.float64(NrOfStates))

        CurrStackSum = StackData(Sequences_per_gene)
        CurrStackVar = StackData(Sequences_per_gene, add='no')
        CurrStackSumBck = StackData(Background_per_gene, add='only_cov')

        # weight1 scales the expression (GLM) terms, weight2 the diagnostic
        # event term. A negative glm_weight disables the weighting; exact 0
        # and 1 are nudged so that neither np.log() below returns -inf.
        glm_weight = EmissionParameters['glm_weight']
        if glm_weight < 0.0:
            weight1 = 1.0
            weight2 = 1.0
        elif glm_weight == 0.0:
            weight1 = 0.0000001
            weight2 = 1.0 - weight1
        elif glm_weight == 1.0:
            weight1 = 0.9999999
            weight2 = 1.0 - weight1
        else:
            weight1 = glm_weight
            weight2 = 1.0 - glm_weight

        for State in range(NrOfStates):
            if not EmissionParameters['ign_GLM']:
                # The expression model is only usable once it has been fitted
                if isinstance(EmissionParameters['ExpressionParameters'][0], np.ndarray):
                    EmmisionProbGene[State, :] = np.log(weight1) + emission.predict_expression_log_likelihood_for_gene(
                        CurrStackSum, State, nr_of_genes, gene_nr, EmissionParameters)
                    # Both background types add the same background term
                    if BckType in ('Coverage', 'Coverage_bck'):
                        EmmisionProbGene[State, :] += np.log(weight1) + emission.predict_expression_log_likelihood_for_gene(
                            CurrStackSumBck, State, nr_of_genes, gene_nr, EmissionParameters, 'bg')
            if not EmissionParameters['ign_diag']:
                EmmisionProbGene[State, Ix] += np.log(weight2) + diag_event_model.pred_log_lik(
                    CurrStackVar[:, Ix], State, EmissionParameters)
            # Penalise the foreground state only when 'LastIter' is set
            if State == fg_state and EmissionParameters['LastIter']:
                EmmisionProbGene[State, :] -= fg_pen

        if RandomNoise:
            # Add some random noise drawn just below the smallest finite
            # emission log-probability
            finite_min = np.min(EmmisionProbGene[np.isfinite(EmmisionProbGene)])
            noise = np.random.uniform(finite_min - 4, finite_min - 0.1, EmmisionProbGene.shape)
            EmmisionProbGene = np.logaddexp(EmmisionProbGene, noise)

        # Get the transition probabilities
        if TransitionTypeFirst == 'nonhomo':
            if TransitionType == 'unif_bck' or TransitionType == 'binary_bck':
                CountsSeq = StackData(Sequences_per_gene, add='all')
                CountsBck = StackData(Background_per_gene, add='only_cov')
                Counts = np.vstack((CountsSeq, CountsBck))
            else:
                Counts = StackData(Sequences_per_gene, add='all')
            TransistionProbabilities = np.float64(
                trans.PredictTransistions(Counts, TransitionParameters, NrOfStates, TransitionType))
        else:
            # Repeat the log transition matrix once per position
            TransistionProbabilities = np.float64(
                np.tile(np.log(TransitionParameters[0]), (EmmisionProbGene.shape[1], 1, 1)).T)

        CurrPath, Currloglik = viterbi.viterbi(np.float64(EmmisionProbGene),
                                               TransistionProbabilities,
                                               np.float64(np.log(PriorMatrix)))

        return [gene, CurrPath, Currloglik]
Beispiel #6
0
def GetSuffStatBck(Sequences, Background, Paths, NrOfStates, Type, ResetNotUsedStates = True, EmissionParameters=None):
    '''
    Compute the sufficient statistics of the background data for the
    foreground state.

    For every gene the stacked background counts are reduced to the positions
    that are selected by GetModelIx, have a non-zero column sum and are
    assigned to the foreground state in Paths[gene]. Each distinct column
    (as a tuple) is counted; all-zero columns in the foreground state are
    counted separately under the all-zero tuple.

    Sequences, Background -- handles passed in by the caller; they are closed
                             (best effort) and the files named by
                             EmissionParameters['DataOutFile_seq'] and
                             ['DataOutFile_bck'] are reopened read-only for
                             the duration of this call and closed afterwards.
    Paths                 -- mapping gene -> state path of that gene
    NrOfStates            -- not used by this function
    Type                  -- 'Conv' stacks only the variants and (with
                             FilterSNPs) uses the 'no_snps_conv' index;
                             any other value is passed on to GetModelIx
    ResetNotUsedStates    -- not used by this function
    EmissionParameters    -- settings dict; required despite the None default

    Returns a dict {fg_state: defaultdict(int)} mapping count tuples to the
    number of times they were seen.
    '''

    # Initialize the sufficient statistics variable
    print("Getting suffcient statistic")
    t = time.time()
    SuffStatBck = {}

    fg_state, _ = emission.get_fg_and_bck_state(EmissionParameters, final_pred=True)

    SuffStatBck[fg_state] = defaultdict(int)

    # Best effort: the caller may hand in plain dicts or already closed
    # files, so a failing close() is ignored on purpose. Interrupts are no
    # longer swallowed.
    for handle in (Sequences, Background):
        try:
            handle.close()
        except Exception:
            pass

    # Only the foreground state is collected
    CurrState = fg_state

    # The reopened files are released again when the block is left
    with h5py.File(EmissionParameters['DataOutFile_seq'], 'r') as Sequences, \
            h5py.File(EmissionParameters['DataOutFile_bck'], 'r') as Background:

        # Fill the sufficient statistics variable
        for gene in Sequences.keys():
            # First replicate key of the background coverage; used to pick
            # the mask row below
            rep = list(Background[gene]['Coverage'].keys())[0]
            CurrGenePath = Paths[gene]

            # Stack the matrices together and convert to dense matrix
            Background_per_gene = PreloadSequencesForGene(Background, gene)
            Sequences_per_gene = PreloadSequencesForGene(Sequences, gene)
            if Type == 'Conv':
                CurrStack = StackData(Background_per_gene, add='variants')
            else:
                CurrStack = StackData(Background_per_gene, add='all')

            if EmissionParameters['FilterSNPs'] and Type == 'Conv':
                Ix = GetModelIx(Background_per_gene,
                                Type='no_snps_conv',
                                snps_thresh=EmissionParameters['SnpRatio'],
                                snps_min_cov=EmissionParameters['SnpAbs'],
                                Background=Background_per_gene)
            else:
                Ix = GetModelIx(Background_per_gene, Type)

            NonZero = np.sum(CurrStack, axis=0) > 0
            InState = (CurrGenePath == CurrState)

            # Determine the nonzero elements in the current state
            if EmissionParameters['mask_ovrlp']:
                CurrIx = Ix * (Sequences_per_gene['mask'][rep][0, :] == 0) * NonZero * InState > 0
            else:
                CurrIx = Ix * NonZero * InState > 0

            # View every row of data as a single structured element so that
            # np.unique counts distinct rows.
            # NOTE(review): the .view() depends on the memory layout of the
            # indexed and transposed array - keep these lines as they are
            # unless the layout is verified.
            data = CurrStack[:,CurrIx].T
            ncols = data.shape[1]
            dtype = data.T.dtype.descr * ncols
            struct = data.view(dtype)

            vals, val_counts = np.unique(struct, return_counts=True)

            # Save the tuples and how many times they have been seen so far.
            for curr_val, curr_count in zip(vals, val_counts):
                SuffStatBck[CurrState][tuple(curr_val)] += curr_count

            # Treat the 0 tuple separately for speed improvement
            if len(Ix) == 0:
                continue
            NullCount = np.sum((NonZero == 0) * InState > 0)
            if NullCount > 0:
                NullTuple = tuple(np.zeros_like(CurrStack[:, 0]).T)
                SuffStatBck[CurrState][NullTuple] += NullCount

    print('Done: Elapsed time: ' + str(time.time() - t))

    return SuffStatBck
Beispiel #7
0
def em(counts,
       nr_of_counts,
       EmissionParameters,
       x_0=None,
       First=False,
       max_nr_iter=15,
       tol=0.0001,
       rand_sample_size=10):
    '''
    Re-estimate the diagnostic event mixture parameters of every state.

    counts             -- mapping state -> counts of that state
    nr_of_counts       -- mapping state -> number of occurrences of the counts
    EmissionParameters -- parameter dict; its 'Diag_event_params' entries
                          'alpha' and 'mix_comp' are overwritten per state
    x_0, First, tol    -- accepted for compatibility, not used here
    max_nr_iter        -- forwarded to Parallel_estimate_mixture_params
    rand_sample_size   -- forwarded to Parallel_estimate_mixture_params

    If EmissionParameters['diag_bg'] is set, only the first non-foreground
    state that is encountered is estimated (the template state); every other
    non-foreground state receives deep copies of the template's 'mix_comp'
    and 'alpha'.

    Returns the EmissionParameters dict that was passed in, updated in place.
    '''

    fg_state, bg_state = emission.get_fg_and_bck_state(EmissionParameters,
                                                       final_pred=True)

    # State whose parameters are shared by the other non-foreground states;
    # only valid once have_template is True
    template_state = None
    have_template = False

    # The estimation starts from a snapshot of the incoming parameters so
    # that updates made for one state do not influence the next one
    OldEmissionParameters = deepcopy(EmissionParameters)
    for curr_state in counts.keys():
        # Only compute the emission probabilities once for the
        # non-foreground states
        if EmissionParameters['diag_bg'] and curr_state != fg_state:
            if have_template:
                print('Using template state ' + str(curr_state))
                diag_params = EmissionParameters['Diag_event_params']
                diag_params['mix_comp'][curr_state] = deepcopy(
                    diag_params['mix_comp'][template_state])
                diag_params['alpha'][curr_state] = deepcopy(
                    diag_params['alpha'][template_state])
                continue
            print('setting template state ' + str(curr_state))
            have_template = True
            template_state = curr_state

        print('Estimating state ' + str(curr_state))

        curr_counts = counts[curr_state]
        curr_nr_of_counts = nr_of_counts[curr_state]

        alpha, mixtures = Parallel_estimate_mixture_params(
            OldEmissionParameters,
            curr_counts,
            curr_nr_of_counts,
            curr_state,
            rand_sample_size,
            max_nr_iter,
            nr_of_iter=20,
            stop_crit=1.0,
            nr_of_init=10)
        EmissionParameters['Diag_event_params']['alpha'][curr_state] = alpha
        EmissionParameters['Diag_event_params']['mix_comp'][
            curr_state] = mixtures

    return EmissionParameters