Example #1
def FitTransistionParameters(Sequences, Background, TransitionParameters,
                             CurrPath, verbosity=1):
    """Determine optimal logistic regression parameters.

    Return the optimal parameters of the logistic regression for predicting
    the TransitionParameters.
    """
    print('Fitting transition parameters')
    get_mem_usage(verbosity)

    NewTransitionParametersLogReg = FitTransistionParametersSimple(
        Sequences, Background,
        TransitionParameters, CurrPath,
        verbosity=verbosity)

    get_mem_usage(verbosity)

    return NewTransitionParametersLogReg
Example #2
def run_omniCLIP(args):
    """Run omniCLIP function."""
    # Parsing the arguments
    EmissionParameters = ParsingArgs.parsing_argparse(args)

    # Creating temp copies of Sequence and Background
    ParsingArgs.dup_seqfiles(EmissionParameters)

    # Parsing arguments dependents of Sequence and Background
    EmissionParameters = ParsingArgs.parsing_files(args, EmissionParameters)

    # Load the gene annotation
    print('Loading gene annotation')
    if args.gene_anno_file.split('.')[-1] == 'db':
        GeneAnnotation = gffutils.FeatureDB(args.gene_anno_file, keep_order=True)
    else:
        raise ValueError('gene_anno_file must be a gffutils database (.db)')

    import warnings
    warnings.filterwarnings('error')

    # Load the reads
    get_mem_usage(EmissionParameters['verbosity'])
    t = time.time()
    Sequences = h5py.File(EmissionParameters['dat_file_clip'], 'r+')
    Background = h5py.File(EmissionParameters['dat_file_bg'], 'r+')

    msg = 'Done: Elapsed time: ' + str(time.time() - t)
    get_mem_usage(EmissionParameters['verbosity'], t=t, msg=msg)

    # Initializing parameters
    print('Initialising the parameters')
    TransitionParameters = [EmissionParameters['TransMat'], []]

    # Transition parameters
    IterParameters = [EmissionParameters, TransitionParameters]

    # Start computation

    # Iteratively fit the parameters of the model
    OldLogLikelihood = 0
    CurrLogLikelihood = -np.inf
    CurrIter = 0
    LoglikelihodList = []
    First = 1
    Paths = {}
    iter_cond = True

    while iter_cond:
        print("\n")
        print("Iteration: " + str(CurrIter))
        if EmissionParameters['verbosity'] > 1:
            print(IterParameters[0])

        OldLogLikelihood = CurrLogLikelihood

        CurrLogLikelihood, IterParameters, First, Paths = PerformIteration(
            Sequences, Background, IterParameters,
            EmissionParameters['NrOfStates'], First, Paths,
            verbosity=EmissionParameters['verbosity'])

        gc.collect()

        if EmissionParameters['verbosity'] > 1:
            print("Log-likelihood: " + str(CurrLogLikelihood))
        LoglikelihodList.append(CurrLogLikelihood)

        if EmissionParameters['verbosity'] > 1:
            print(LoglikelihodList)
        CurrIter += 1

        if CurrIter >= EmissionParameters['max_it']:
            print('Maximal number of iterations reached')

        # Warm-up for a few iterations, then stop once the iteration cap is
        # reached or the log-likelihood change drops below both the relative
        # (1%) and absolute (args.tol_lg_lik) tolerances.
        if CurrIter < min(3, EmissionParameters['max_it']):
            iter_cond = True
        else:
            iter_cond = (
                (CurrIter < EmissionParameters['max_it'])
                and ((abs(CurrLogLikelihood - OldLogLikelihood)
                      / max(abs(CurrLogLikelihood), abs(OldLogLikelihood)))
                     > 0.01)
                and (abs(CurrLogLikelihood - OldLogLikelihood)
                     > args.tol_lg_lik))

    # Return the fitted parameters
    print('Finished parameter fitting')

    EmissionParameters, TransitionParameters = IterParameters
    if not isinstance(EmissionParameters['ExpressionParameters'][0],
                      np.ndarray):
        print('Emission parameters have not been fitted yet')
        return

    OutFile = os.path.join(EmissionParameters['out_dir'],
                           EmissionParameters['out_file_base'] + '.txt')

    # Determine which state has higher weight in fg.
    get_mem_usage(EmissionParameters['verbosity'])

    fg_state, bg_state = emission_prob.get_fg_and_bck_state(EmissionParameters,
                                                            final_pred=True)
    if EmissionParameters['fg_pen'] > 0.0:
        print('Recomputing paths')
        EmissionParameters['LastIter'] = True
        Sequences = h5py.File(EmissionParameters['dat_file_clip'], 'r')
        Background = h5py.File(EmissionParameters['dat_file_bg'], 'r')

        Paths, LogLike = tools.ParallelGetMostLikelyPath(
            Paths, Sequences, Background, EmissionParameters,
            TransitionParameters, 'nonhomo',
            verbosity=EmissionParameters['verbosity'])

        Sequences = h5py.File(EmissionParameters['dat_file_clip'], 'r')
        Background = h5py.File(EmissionParameters['dat_file_bg'], 'r')

    tools.GeneratePred(
        Paths, Sequences, Background, IterParameters, GeneAnnotation, OutFile,
        fg_state, bg_state, pv_cutoff=EmissionParameters['pv_cutoff'],
        verbosity=EmissionParameters['verbosity'])
    print('Done')

    # Remove the temporary files
    print('Removing temporary files')
    os.remove(EmissionParameters['dat_file_clip'])
    os.remove(EmissionParameters['dat_file_bg'])

    return
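The stopping rule in the fitting loop above combines an iteration cap with a relative (1%) and an absolute (args.tol_lg_lik) change in log-likelihood. Below is a minimal, self-contained sketch of that criterion; the names keep_iterating, rel_tol and abs_tol are illustrative and not part of omniCLIP.

# Hedged sketch of the convergence test used in the loop above; the thresholds
# mirror the 0.01 relative tolerance and the args.tol_lg_lik absolute one.
def keep_iterating(curr_iter, max_it, curr_ll, old_ll,
                   rel_tol=0.01, abs_tol=1e-3, min_it=3):
    """Return True while another fitting iteration should be run."""
    if curr_iter < min(min_it, max_it):   # warm-up iterations
        return True
    delta = abs(curr_ll - old_ll)
    rel_change = delta / max(abs(curr_ll), abs(old_ll))
    return (curr_iter < max_it) and (rel_change > rel_tol) and (delta > abs_tol)

print(keep_iterating(5, 20, curr_ll=-1000.2, old_ll=-1000.3))  # False: converged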
Example #3
def FitEmissionParameters(Sequences, Background, NewPaths,
                          OldEmissionParameters, First, verbosity=1):
    """Fit EmissionParameters."""
    print('Fitting emission parameters')
    t = time.time()
    # Unpack the arguments
    OldAlpha = OldEmissionParameters['Diag_event_params']
    NrOfStates = OldEmissionParameters['NrOfStates']
    OldPriorMatrix = OldEmissionParameters['PriorMatrix']
    NewEmissionParameters = OldEmissionParameters

    # Compute new prior matrix
    PriorMatrix = np.zeros_like(OldPriorMatrix)
    for State in range(NrOfStates):
        for path in NewPaths:
            PriorMatrix[State] += np.sum(NewPaths[path] == State)

    # Check if one of the states is not used and add pseudo gene to prevent
    # singularities during distribution fitting
    if np.sum(PriorMatrix == 0) > 0:
        LoadReads.close_data_handles(handles=[Sequences, Background])
        Sequences = h5py.File(NewEmissionParameters['dat_file_clip'], 'r+')
        Background = h5py.File(NewEmissionParameters['dat_file_bg'], 'r+')
        Sequences, Background, NewPaths = add_pseudo_gene(
            Sequences, Background, NewPaths, PriorMatrix)
        print('Adding a pseudo gene to prevent a singular matrix during '
              'GLM fitting')

    CorrPriorMatrix = np.copy(PriorMatrix)
    CorrPriorMatrix[CorrPriorMatrix == 0] = np.min(
        CorrPriorMatrix[CorrPriorMatrix > 0])/10
    CorrPriorMatrix /= np.sum(CorrPriorMatrix)
    # Keep a copy to check which states are not used
    NewEmissionParameters['PriorMatrix'] = CorrPriorMatrix

    # Add Pseudo gene to Sequences, Background and Paths
    if NewEmissionParameters['ExpressionParameters'][0] is not None:
        Sequences, Background, NewPaths = add_pseudo_gene(
            Sequences, Background, NewPaths, PriorMatrix)

    # Compute parameters for the expression
    Sequences = h5py.File(NewEmissionParameters['dat_file_clip'], 'r')
    if (NewEmissionParameters['bg_type'] != 'None') and not First:
        if 'Pseudo' in list(Sequences.keys()):
            nr_of_genes = len(list(Sequences.keys()))
            new_pars = NewEmissionParameters['ExpressionParameters'][0]
            new_pars = np.vstack(
                (new_pars[:(nr_of_genes), :],
                 np.mean(new_pars[:(nr_of_genes), :]),
                 new_pars[(nr_of_genes):, :]))
            NewEmissionParameters['ExpressionParameters'][0] = new_pars
    print('Estimating expression parameters')
    get_mem_usage(verbosity)

    NewEmissionParameters = emission_prob.estimate_expression_param(
        (NewEmissionParameters, NewPaths), verbosity=verbosity)

    get_mem_usage(verbosity)

    Sequences = h5py.File(NewEmissionParameters['dat_file_clip'], 'r')
    Background = h5py.File(NewEmissionParameters['dat_file_bg'], 'r')

    if NewEmissionParameters['bg_type'] != 'None':
        if 'Pseudo' in list(Sequences.keys()):
            nr_of_genes = len(list(Sequences.keys()))
            new_pars = NewEmissionParameters['ExpressionParameters'][0]
            new_pars = np.vstack((new_pars[:(nr_of_genes-1), :],
                                  new_pars[(nr_of_genes):, :]))
            NewEmissionParameters['ExpressionParameters'][0] = new_pars

    if NewEmissionParameters['skip_diag_event_mdl'] is False:
        # Compute parameters for the ratios
        print('Computing sufficient statistic for fitting md')
        get_mem_usage(verbosity)

        SuffStat = tools.GetSuffStat(
            NewPaths, NrOfStates, Type='Conv',
            EmissionParameters=NewEmissionParameters, verbosity=verbosity)

        # Vectorize SuffStat
        Counts, NrOfCounts = tools.ConvertSuffStatToArrays(SuffStat)

        del SuffStat
        get_mem_usage(verbosity)
        if NewEmissionParameters['subs']:
            Counts, NrOfCounts = tools.subsample_suff_stat(Counts, NrOfCounts)

        print('Fitting md distribution')
        get_mem_usage(verbosity)
        if NewEmissionParameters['diag_bg']:
            print("Adjusting background")
            SuffStatBck = tools.GetSuffStatBck(
                NewPaths, NrOfStates, Type='Conv',
                EmissionParameters=NewEmissionParameters, verbosity=verbosity)

            # Vectorize SuffStat
            CountsBck, NrOfCountsBck = tools.ConvertSuffStatToArrays(
                SuffStatBck)

            if NewEmissionParameters['subs']:
                CountsBck, NrOfCountsBck = tools.subsample_suff_stat(
                    CountsBck, NrOfCountsBck)

            # Overwrite counts in other bins
            fg_state, bg_state = emission_prob.get_fg_and_bck_state(
                NewEmissionParameters, final_pred=True)

            for curr_state in list(Counts.keys()):
                if curr_state != fg_state:
                    Counts[curr_state] = CountsBck[fg_state]
                    NrOfCounts[curr_state] = NrOfCountsBck[fg_state]

            del SuffStatBck

        NewEmissionParameters = mixture_tools.em(
            Counts, NrOfCounts, NewEmissionParameters, x_0=OldAlpha,
            First=First, verbosity=verbosity)

        get_mem_usage(verbosity)
        del Counts, NrOfCounts

    if 'Pseudo' in list(Sequences.keys()):
        del Sequences['Pseudo']
        del Background['Pseudo']
        del NewPaths['Pseudo']

    if verbosity > 0:
        print('Done: Elapsed time: ' + str(time.time() - t))
    return NewEmissionParameters
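The prior matrix computed at the top of this function is simply the empirical frequency of each state over all path positions, with unused states floored to a tenth of the smallest observed count before renormalisation. A toy, self-contained illustration follows; the paths and state count are made up.

import numpy as np

# Toy per-gene state paths (3 states; state 2 is never visited).
paths = {'geneA': np.array([0, 0, 1, 1, 0]),
         'geneB': np.array([0, 1, 1, 0, 0])}
nr_of_states = 3

prior = np.zeros(nr_of_states)
for state in range(nr_of_states):
    for gene in paths:
        prior[state] += np.sum(paths[gene] == state)

# Floor unused states to a tenth of the smallest non-zero count, then normalise.
prior[prior == 0] = np.min(prior[prior > 0]) / 10
prior /= np.sum(prior)
print(prior)  # approximately [0.577, 0.385, 0.038]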
Example #4
def PerformIteration(Sequences, Background, IterParameters, NrOfStates, First,
                     NewPaths=None, verbosity=1):
    """Perform an iteration of the HMM algorithm."""
    if NewPaths is None:
        NewPaths = {}
    # Unpack the Iteration parameters
    EmissionParameters = IterParameters[0]
    TransitionParameters = IterParameters[1]

    # Get new most likely path
    if First:
        NewPaths, LogLike = tools.ParallelGetMostLikelyPath(
            NewPaths, Sequences, Background, EmissionParameters,
            TransitionParameters, 'homo', RandomNoise=True,
            verbosity=verbosity)

        Sequences = h5py.File(EmissionParameters['dat_file_clip'], 'r')
        Background = h5py.File(EmissionParameters['dat_file_bg'], 'r')

        get_mem_usage(verbosity)

    # Perform EM to compute the new emission parameters
    print('Fitting emission parameters')
    get_mem_usage(verbosity)

    NewEmissionParameters = FitEmissionParameters(
        Sequences, Background, NewPaths, EmissionParameters, First,
        verbosity=verbosity)

    if First:
        First = 0

    get_mem_usage(verbosity)

    # Fit the transition matrix parameters
    NewTransitionParameters = TransitionParameters
    print('Fitting transition parameters')
    get_mem_usage(verbosity)

    LoadReads.close_data_handles(handles=[Sequences, Background])
    Sequences = h5py.File(EmissionParameters['dat_file_clip'], 'r')
    Background = h5py.File(EmissionParameters['dat_file_bg'], 'r')

    TransistionPredictors = trans.FitTransistionParameters(
        Sequences, Background, TransitionParameters, NewPaths,
        verbosity=verbosity)

    NewTransitionParameters[1] = TransistionPredictors
    get_mem_usage(verbosity)

    NewIterParameters = [NewEmissionParameters, NewTransitionParameters]

    print('Computing most likely path')
    get_mem_usage(verbosity)

    gc.collect()
    NewPaths, LogLike = tools.ParallelGetMostLikelyPath(
        NewPaths, Sequences, Background, EmissionParameters,
        TransitionParameters, 'nonhomo', verbosity=verbosity)

    Sequences = h5py.File(EmissionParameters['dat_file_clip'], 'r')
    Background = h5py.File(EmissionParameters['dat_file_bg'], 'r')

    CurrLogLikelihood = LogLike
    get_mem_usage(verbosity)
    if verbosity > 1:
        print('LogLik:')
        print(CurrLogLikelihood)
    return CurrLogLikelihood, NewIterParameters, First, NewPaths
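PerformIteration alternates between decoding the most likely state path and refitting the emission and transition models on that path, a "hard" EM scheme. The toy sketch below shows the same alternation on a deliberately simplified one-dimensional model; it is an analogy only and does not use the omniCLIP emission or transition machinery.

import numpy as np

rng = np.random.default_rng(0)
signal = np.concatenate([rng.normal(0, 1, 200), rng.normal(4, 1, 50)])

means = np.array([0.0, 1.0])          # stand-in for emission parameters
for _ in range(10):
    # Decode: assign each position to the closest state mean.
    path = np.argmin(np.abs(signal[:, None] - means[None, :]), axis=1)
    # Refit: re-estimate each state's parameter from its assigned positions.
    for s in (0, 1):
        if np.any(path == s):
            means[s] = signal[path == s].mean()
print(means)  # roughly [0, 4]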
Example #5
def FitTransistionParametersSimple(Sequences, Background, TransitionParameters,
                                   CurrPath, verbosity=1):
    """Determine optimal logistic regression parameters.

    Return the optimal parameters of the logistic regression for predicting
    the TransitionParameters.
    """
    # Generate features from the CurrPaths and the Information in the coverage
    TransitionMatrix = TransitionParameters[0]
    NewTransitionParametersLogReg = {}
    t = time.time()

    # Iterate over the possible transitions
    assert TransitionMatrix.shape[0] > 1, 'At least two states are required'

    genes = list(CurrPath.keys())
    genes = random.sample(genes, min(len(genes), 1000))

    NrOfStates = TransitionMatrix.shape[0]
    Xs = []
    Ys = []
    SampleSame = []
    SampleOther = []
    print("Learning transition model")
    print("Iterating over genes")
    get_mem_usage(verbosity, msg='Fitting transition parameters: I')

    for i, gene in enumerate(genes):
        if i % 1000 == 0:
            sys.stdout.write('.')
            sys.stdout.flush()
        # Get data
        Sequences_per_gene = tools.PreloadSequencesForGene(Sequences, gene)
        CovMat = tools.StackData(Sequences_per_gene, add='all')
        CovMat[CovMat < 0] = 0
        nr_of_samples = CovMat.shape[0]
        for CurrState in range(NrOfStates):
            for NextState in range(NrOfStates):
                # Positions where the path is in the current state
                Ix1 = CurrPath[gene][:-1] == CurrState
                # Positions where the subsequent position path is in the "next"
                # state
                Ix2 = CurrPath[gene][1:] == NextState
                # Positions where the path changes from the current state to
                # the other state
                Ix = np.where(Ix1 * Ix2)[0]

                CovMatIx = GenerateFeatures(Ix, CovMat)

                if CovMatIx.shape[1] == 0:
                    CovMatIx = np.zeros((nr_of_samples, 1))

                if CurrState == NextState:
                    SampleSame.append(CovMatIx)
                else:
                    SampleOther.append(CovMatIx)
        del Sequences_per_gene, CovMat

    get_mem_usage(verbosity, msg='Fitting transition parameters: II')

    len_same = np.sum([Mat.shape[1] for Mat in SampleSame])
    len_other = np.sum([Mat.shape[1] for Mat in SampleOther])

    X = np.concatenate(SampleSame + SampleOther, axis=1).T
    del SampleSame, SampleOther

    # Create Y
    Y = np.hstack(
        (np.ones((1, len_same), dtype=int),
         np.zeros((1, len_other), dtype=int)))[0, :].T
    classes = np.unique(Y)

    get_mem_usage(verbosity, msg='Fitting transition parameters: III')
    n_iter = int(max(5, np.ceil(10**6 / Y.shape[0])))

    NewTransitionParametersLogReg = SGDClassifier(loss="log", max_iter=n_iter)
    ix_shuffle = np.arange(X.shape[0])
    for n in range(n_iter):
        np.random.shuffle(ix_shuffle)
        for batch_ix in np.array_split(ix_shuffle, 50):
            NewTransitionParametersLogReg.partial_fit(
                X[batch_ix, :], Y[batch_ix], classes=classes)

    del Ix1, Ix2,  Ix, X, Y, Xs, Ys
    get_mem_usage(verbosity, t=t, msg='Fitting transition parameters: IV')

    return NewTransitionParametersLogReg
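The transition predictor above is a logistic regression trained incrementally: the stacked feature matrix is reshuffled each epoch and passed to SGDClassifier.partial_fit in mini-batches, with classes= supplied on every call so labels absent from a batch are still handled. Below is a self-contained sketch on synthetic data; note that recent scikit-learn versions name the loss "log_loss", while older releases (as in the code above) use "log".

import numpy as np
from sklearn.linear_model import SGDClassifier

rng = np.random.default_rng(1)
X = rng.normal(size=(5000, 3))
Y = (X[:, 0] + 0.5 * X[:, 1] > 0).astype(int)   # synthetic binary labels
classes = np.unique(Y)

clf = SGDClassifier(loss="log_loss")            # "log" on older scikit-learn
ix = np.arange(X.shape[0])
for epoch in range(5):
    rng.shuffle(ix)                             # reshuffle every epoch
    for batch in np.array_split(ix, 50):        # ~100 samples per mini-batch
        clf.partial_fit(X[batch], Y[batch], classes=classes)

print(clf.score(X, Y))                          # should be close to 1.0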
Example #6
def estimate_expression_param(expr_data, verbosity=1):
    """Estimate the parameters for the expression GLM."""
    (EmissionParameters, Paths) = expr_data

    Sequences = h5py.File(EmissionParameters['dat_file_clip'], 'r')
    Background = h5py.File(EmissionParameters['dat_file_bg'], 'r')

    # 1) Get the library size
    print('Start estimation of expression parameters')
    bg_type = EmissionParameters['bg_type']
    lib_size = EmissionParameters['LibrarySize']
    bck_lib_size = EmissionParameters['BckLibrarySize']
    start_params = EmissionParameters['ExpressionParameters'][0]
    disp = EmissionParameters['ExpressionParameters'][1]

    # 2) Estimate dispersion
    print('Constructing GLM matrix')
    t = time.time()

    # 3) Compute sufficient statistics
    get_mem_usage(
        verbosity,
        msg='Estimating expression parameters: before GLM matrix construction')

    A, w, Y, rep = construct_glm_matrix(
        EmissionParameters, Sequences, Background, Paths)

    print('Estimating expression parameters: GLM matrix construction')
    get_mem_usage(verbosity, t=t)

    # Make sure that matrix A is in the right format
    if not sp.sparse.isspmatrix_csc(A):
        A = csc_matrix(A)

    get_mem_usage(
        verbosity,
        msg='Estimating expression parameters: before GLM matrix')

    # Create the offset for the library size
    offset = np.zeros_like(rep)
    for i in range(EmissionParameters['NrOfReplicates']):
        offset[rep == (i + 1)] = lib_size[str(i)]
    if bg_type != 'None':
        for i in range(EmissionParameters['NrOfBckReplicates']):
            offset[rep == -(i + 1)] = bck_lib_size[str(i)]

    # 4) Fit GLM
    print('Fitting GLM')
    t = time.time()

    print('Estimating expression parameters: before fitting')
    get_mem_usage(verbosity)

    start_params, disp = fit_glm(
        A, w, Y, offset, disp, start_params,
        norm_class=EmissionParameters['norm_class'],
        tmp_dir=EmissionParameters['tmp_dir'])

    get_mem_usage(
        verbosity,
        msg='Estimating expression parameters: after fitting')

    del A, w, Y, offset

    get_mem_usage(
        verbosity, t=t,
        msg='Estimating expression parameters: after cleanup')

    # 5) Process the output
    EmissionParameters['ExpressionParameters'] = [start_params, disp]
    print('Finished expression parameter estimation')

    return EmissionParameters
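The offset constructed before the GLM fit maps each observation's replicate index to the corresponding library size, so that foreground (positive rep values) and background (negative rep values) counts are put on a comparable scale. A small, self-contained illustration of that mapping follows; the replicate layout and library sizes are invented.

import numpy as np

# +k marks CLIP replicate k-1, -k marks background replicate k-1, mirroring
# the convention used when construct_glm_matrix builds the rep vector.
rep = np.array([1, 1, 2, -1, -2, -2])
lib_size = {'0': 2.0e6, '1': 3.0e6}        # CLIP library sizes
bck_lib_size = {'0': 1.5e6, '1': 2.5e6}    # background library sizes

offset = np.zeros(rep.shape, dtype=float)
for i in range(2):
    offset[rep == (i + 1)] = lib_size[str(i)]
    offset[rep == -(i + 1)] = bck_lib_size[str(i)]
print(offset)  # [2000000. 2000000. 3000000. 1500000. 2500000. 2500000.]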