Ejemplo n.º 1
0
    def _buildModel(self, data):
        '''
        Assemble the HMM from explicit matrices and store it on ``self.model``.

        The transition matrix gets 0.5 on the main diagonal (stay) and 0.5 on
        the first super-diagonal (advance to the next state). The start
        probability is concentrated on state 0 and the last state carries an
        end probability of 0.5. Emission distributions come from
        ``self._initDists(data)``.

        data: 2d matrix, every row is a vector of features
        returns: the newly built model (also assigned to ``self.model``)
        '''
        n_states = self.statesNumber

        # Arguments for from_matrix(transition, dists, starts, ends)
        transitions = np.zeros((n_states, n_states))
        state_ids = np.arange(n_states)
        transitions[state_ids, state_ids] = 0.5  # self loops
        transitions[state_ids[:-1], state_ids[1:]] = 0.5  # i -> i + 1
        # The last state's self loop is written on its own so it can be tuned
        # independently of the other states.
        transitions[n_states - 1, n_states - 1] = 0.5

        emissions = self._initDists(data)

        start_probs = np.zeros((n_states, ))
        start_probs[0] = 1

        end_probs = np.zeros((n_states, ))
        end_probs[-1] = 0.5

        self.model = HiddenMarkovModel.from_matrix(transitions,
                                                   emissions,
                                                   start_probs,
                                                   end_probs,
                                                   name=self.mname)
        return self.model
def init():
    """Load the data, mirror every sequence and build the frozen noise model.

    Metabolite and gene sequences are stacked, then a negated copy of every
    sequence is appended. ``tied`` maps each original label to the pair of
    row indices ``[i, i + n]`` holding its positive and negated sequence.

    Returns the tuple ``(gc, mt, sequences, labels, state_labels, tied,
    noise, z_range, state_range)``.
    """
    gene_limit = 1000  # restricts number of genes, used for local testing
    gc, mt, _track = load_data(gene_limit)
    state_range = [5, 10, 25, 50, 100]
    z_range = [3, 5, 10, 20]

    met_seqs, met_names = df_to_sequence_list(mt.data)
    gene_seqs, gene_names = df_to_sequence_list(gc.data)

    stacked = np.concatenate((met_seqs, gene_seqs), 0)
    base_labels = np.concatenate((met_names, gene_names))

    # second half of the array holds the sign-flipped sequences
    sequences = np.concatenate((stacked, -1 * stacked))

    # tie positive and negative expression sequences
    n_base = base_labels.size
    tied = {name: [row, row + n_base]
            for row, name in enumerate(base_labels)}

    state_labels = np.concatenate(((base_labels + '+'), (base_labels + '-')))
    labels = np.concatenate((base_labels, base_labels))

    # Single-state standard-normal model with frozen distributions;
    # genes/metabolites will be assigned to it if other models fail to model
    # them better.
    unit_normal = [NormalDistribution(0, 1)]
    self_loop = np.array([[1]])
    start_probs = np.array([1])
    noise = HiddenMarkovModel.from_matrix(self_loop, unit_normal, start_probs)
    noise.freeze_distributions()

    return (gc, mt, sequences, labels, state_labels, tied, noise, z_range,
            state_range)
def init(m, seed):
    """Load the data, mirror every sequence and build the frozen noise model.

    ``m == -1`` is translated to ``None`` before being handed to
    ``load_data`` together with ``seed``. Labels of the original sequences
    get a ``'+'`` suffix, labels of the negated copies a ``'-'`` suffix, and
    ``tied`` maps each unsuffixed label to ``[i, i + n]``.

    Returns ``(sequences, labels, tied, noise)``.
    """
    limit = None if m == -1 else m
    gc, mt, _track = load_data(limit, seed)

    met_seqs, met_names = df_to_sequence_list(mt.data)
    gene_seqs, gene_names = df_to_sequence_list(gc.data)

    stacked = np.concatenate((met_seqs, gene_seqs), 0)
    base_labels = np.concatenate((met_names, gene_names))

    # second half of the array holds the sign-flipped sequences
    sequences = np.concatenate((stacked, -1 * stacked))

    # tie positive and negative expression sequences
    n_base = base_labels.size
    tied = {name: [row, row + n_base]
            for row, name in enumerate(base_labels)}

    labels = np.concatenate(((base_labels + '+'), (base_labels + '-')))

    # Single-state standard-normal model with frozen distributions;
    # genes/metabolites will be assigned to it if other models fail to model
    # them better.
    unit_normal = [NormalDistribution(0, 1)]
    self_loop = np.array([[1]])
    start_probs = np.array([1])
    noise = HiddenMarkovModel.from_matrix(self_loop, unit_normal, start_probs,
                                          name='noise')
    noise.freeze_distributions()

    return sequences, labels, tied, noise
Ejemplo n.º 4
0
def get_variable_number_of_repeats_matcher_hmm(patterns,
                                               copies=1,
                                               vpaths=None):
    """Wrap the constant-repeat matcher so matching may stop after any unit.

    The model from ``get_constant_number_of_repeats_matcher_hmm`` is extended
    with two silent states (``start_repeating_pattern_match`` and
    ``end_repeating_pattern_match``). The start state is rerouted through the
    new start-repeat state, and every state whose name begins with
    ``unit_end`` gets a 0.5 / 0.5 split between its existing successor and the
    new end-repeat state, which in turn leads to the model end.

    Returns a freshly baked model built from the edited transition matrix.
    """

    def last_nonzero(row):
        # Column of the last non-zero entry in ``row``; None if there is none.
        hit = None
        for col, value in enumerate(row):
            if value != 0:
                hit = col
        return hit

    model = get_constant_number_of_repeats_matcher_hmm(patterns, copies,
                                                       vpaths)
    base = model.dense_transition_matrix()
    n_base = len(base)
    total = n_base + 2
    start_repeats_ind = n_base
    end_repeats_ind = n_base + 1

    # NOTE: this is the model's own state list, so the two extra states are
    # appended to it in place.
    states = model.states
    states.append(State(None, name='start_repeating_pattern_match'))
    states.append(State(None, name='end_repeating_pattern_match'))

    # Grow the matrix by two zero columns and two zero rows.
    mat = np.hstack([base, np.zeros((n_base, 2))])
    mat = np.vstack([mat, np.zeros((2, total))])

    unit_ends = [
        idx for idx, state in enumerate(model.states)
        if state.name.startswith('unit_end')
    ]

    # Reroute start -> first unit through the new start-repeat state.
    start_row = mat[model.start_index]
    first_unit_start = last_nonzero(start_row)
    start_row[first_unit_start] = 0.0
    start_row[start_repeats_ind] = 1
    mat[start_repeats_ind][first_unit_start] = 1

    # Each unit end either continues as before or leaves the repeat region.
    for unit_end in unit_ends:
        unit_row = mat[unit_end]
        successor = last_nonzero(unit_row)
        unit_row[successor] = 0.5
        unit_row[end_repeats_ind] = 0.5

    mat[end_repeats_ind][model.end_index] = 1

    starts = np.zeros(total)
    starts[model.start_index] = 1.0
    ends = np.zeros(total)
    ends[model.end_index] = 1.0

    new_model = Model.from_matrix(
        mat,
        [state.distribution for state in states],
        starts,
        ends,
        name='Repeat Matcher HMM Model',
        state_names=[state.name for state in states],
        merge=None)
    new_model.bake(merge=None)
    return new_model
Ejemplo n.º 5
0
def get_read_matcher_model(left_flanking_region,
                           right_flanking_region,
                           patterns,
                           copies=1,
                           vpaths=None):
    """Build a read matcher: left-flank suffix + repeats + right-flank prefix.

    The three sub-models are concatenated and baked, then the dense
    transition matrix is edited so that

    * the start state enters ``suffix_start_suffix`` with weight 0.3 and
      spreads weight 0.7 evenly over the match states whose name ends in
      ``_0``;
    * every repeat match state (name starts with ``M`` and does not end in
      ``prefix``/``suffix``) gets an extra edge to the end state, with the
      row rescaled by the same factor.

    Returns a freshly baked model built from the edited matrix.
    """
    model = get_suffix_matcher_hmm(left_flanking_region)
    tail_parts = (
        get_variable_number_of_repeats_matcher_hmm(patterns, copies, vpaths),
        get_prefix_matcher_hmm(right_flanking_region),
    )
    for part in tail_parts:
        model.concatenate(part)
    model.bake(merge=None)

    mat = model.dense_transition_matrix()

    first_repeat_matches = []
    repeat_match_states = []
    suffix_start = None
    for idx, state in enumerate(model.states):
        state_name = state.name
        if state_name[0] == 'M':
            last_token = state_name.split('_')[-1]
            if last_token == '0':
                first_repeat_matches.append(idx)
            if last_token not in ('prefix', 'suffix'):
                repeat_match_states.append(idx)
        if state_name == 'suffix_start_suffix':
            suffix_start = idx

    # Entry weights out of the start state.
    start_row = mat[model.start_index]
    start_row[suffix_start] = 0.3
    for idx in first_repeat_matches:
        start_row[idx] = 0.7 / len(first_repeat_matches)

    # Give every repeat match state a way out to the end state.
    for idx in repeat_match_states:
        to_end = 0.7 / len(repeat_match_states)
        total = 1 + to_end
        row = mat[idx]
        row[row != 0] /= total
        row[model.end_index] = to_end / total

    n_states = len(model.states)
    starts = np.zeros(n_states)
    starts[model.start_index] = 1.0
    ends = np.zeros(n_states)
    ends[model.end_index] = 1.0

    new_model = Model.from_matrix(
        mat,
        [state.distribution for state in model.states],
        starts,
        ends,
        name='Read Matcher',
        state_names=[state.name for state in model.states],
        merge=None)
    new_model.bake(merge=None)
    return new_model
Ejemplo n.º 6
0
def test_sample_from_site():
    """Build a three-state HMM from explicit matrices and plot it."""
    emissions = [
        NormalDistribution(mean, spread)
        for mean, spread in ((5, 1), (1, 7), (8, 2))
    ]
    # Rows 0 and 1 sum to 1; row 2 keeps 0.9 and leaves 0.1 for the end.
    transitions = np.array([
        [0.7, 0.3, 0.0],
        [0.0, 0.8, 0.2],
        [0.0, 0.0, 0.9],
    ])
    start_probs = np.array([1.0, 0.0, 0.0])
    end_probs = np.array([0.0, 0.0, 0.1])
    hmm = HiddenMarkovModel.from_matrix(transitions, emissions, start_probs,
                                        end_probs)
    hmm.plot()
Ejemplo n.º 7
0
 def __init__(self,
              length=None,
              n_features=None,
              initial=None,
              match_match=0.9,
              delete_insert=0.1,
              flank_prob=0):  #last is polymorphism dummy
     """Create the profile HMM; ``self.model`` is only built if ``length`` is given.

     The model uses ``3 * length + 1`` states. Transition, emission, start
     and end parameters and the state names all come from this class's
     ``get_*`` helpers. ``flank_prob`` is accepted but not used here.
     """
     super(ProfileHMM, self).__init__()
     if length is None:
         return

     state_total = 3 * length + 1
     transitions = self.get_transmat(state_total, match_match, delete_insert)
     emissions = self.get_emission_dists(state_total, n_features, initial)
     start_probs = self.get_startprob(state_total)
     end_probs = self.get_endprob(state_total)
     names = self.get_state_names(length)
     self.model = HiddenMarkovModel.from_matrix(
         transition_probabilities=transitions,
         distributions=emissions,
         starts=start_probs,
         ends=end_probs,
         state_names=names)
Ejemplo n.º 8
0
 def build_model(self):
     """Randomly initialise a discrete-emission HMM and store it in ``self.model``.

     Draws one random emission distribution over ``self.all_characters`` per
     hidden state, a random row-normalised transition matrix and a random
     normalised start vector, checks the normalisation, then builds and
     bakes the model.
     """
     n_hidden = self.hidden_size

     emissions = []
     for _ in range(n_hidden):
         weights = np.random.random(self.num_characters)
         weights = weights / weights.sum()
         char_probs = dict(zip(self.all_characters, weights))
         emissions.append(DiscreteDistribution(char_probs))

     raw_transitions = np.random.random((n_hidden, n_hidden))
     transitions = raw_transitions / raw_transitions.sum(axis=1, keepdims=1)

     raw_starts = np.random.random(n_hidden)
     start_probs = raw_starts / raw_starts.sum()

     # sanity-check the random initialisation
     np.testing.assert_almost_equal(start_probs.sum(), 1)
     np.testing.assert_array_almost_equal(np.ones(n_hidden),
                                          transitions.sum(axis=1))

     self.model = HiddenMarkovModel.from_matrix(transitions, emissions,
                                                start_probs)
     self.model.bake()
Ejemplo n.º 9
0
    def oriHMMParams(self, numdists=3):
        """
        Build the initial three-state HMM.

        With ``numdists == 1`` each state emits from a single normal
        distribution; otherwise each state emits from a mixture of
        ``numdists`` normal distributions. Transitions start out nearly
        uniform and the start probabilities are uniform.

        Hidden states: 0--downstream, 1--no bias, 2--upstream

        NOTE(review): state 0 gets a negative mean when ``numdists == 1`` but
        positive means otherwise (and vice versa for state 2) -- confirm this
        is intended.
        """
        if numdists == 1:
            dists = [NormalDistribution(mu, 7.5) for mu in (-2.5, 0, 2.5)]
        else:
            span = numdists - 1
            var = 7.5 / span
            component_ids = range(numdists)
            # One list of component means per hidden state.
            means = [
                [i * 7.5 / span + 2.5 for i in component_ids],
                [i * 7.5 * (-1)**i / span for i in component_ids],
                [-i * 7.5 / span - 2.5 for i in component_ids],
            ]
            dists = [
                GeneralMixtureModel(
                    [NormalDistribution(mu, var) for mu in state_means])
                for state_means in means
            ]

        # transition matrix
        A = [[0.34, 0.33, 0.33], [0.33, 0.34, 0.33], [0.33, 0.33, 0.34]]
        starts = np.ones(3) / 3

        return HiddenMarkovModel.from_matrix(
            A,
            dists,
            starts,
            state_names=['0', '1', '2'],
            name='mixture{0}'.format(numdists))
Ejemplo n.º 10
0
def init():
    """Load the data, stack the sequences and build the frozen noise model.

    Only the first ``m`` rows of the gene data are converted to sequences.

    Returns ``(gc, mt, sequences, labels, noise, z_range, state_range)``.
    """
    gene_limit = 1000  # restricts number of genes, used for local testing
    gc, mt, _track = load_data(gene_limit)
    state_range = [5, 10, 25, 50, 100]
    z_range = [3, 5, 10, 20]

    met_seqs, met_names = df_to_sequence_list(mt.data)
    gene_seqs, gene_names = df_to_sequence_list(
        gc.data.iloc[:gene_limit, :])

    sequences = np.concatenate((met_seqs, gene_seqs))
    labels = np.concatenate((met_names, gene_names))

    # Single-state standard-normal model with frozen distributions;
    # genes/metabolites will be assigned to it if other models fail to model
    # them better.
    unit_normal = [NormalDistribution(0, 1)]
    self_loop = np.array([[1]])
    start_probs = np.array([1])
    noise = HiddenMarkovModel.from_matrix(self_loop, unit_normal, start_probs)
    noise.freeze_distributions()

    return gc, mt, sequences, labels, noise, z_range, state_range
Ejemplo n.º 11
0
def init(m, seed):
    """Load the data, stack the sequences and build the frozen noise model.

    ``m == -1`` is translated to ``None`` before being handed to
    ``load_data`` together with ``seed``.

    Returns ``(sequences, labels, noise)``.
    """
    limit = None if m == -1 else m
    gc, mt, _track = load_data(limit, seed)

    met_seqs, met_names = df_to_sequence_list(mt.data)
    gene_seqs, gene_names = df_to_sequence_list(gc.data)

    sequences = np.concatenate((met_seqs, gene_seqs), 0)
    labels = np.concatenate((met_names, gene_names))

    # Single-state standard-normal model with frozen distributions;
    # genes/metabolites will be assigned to it if other models fail to model
    # them better.
    unit_normal = [NormalDistribution(0, 1)]
    self_loop = np.array([[1]])
    start_probs = np.array([1])
    noise = HiddenMarkovModel.from_matrix(self_loop, unit_normal, start_probs,
                                          name='noise')
    noise.freeze_distributions()

    return sequences, labels, noise
Ejemplo n.º 12
0
 def __init__(self,
              length=None,
              n_features=None,
              initial=None,
              match_match=0.9,
              delete_insert=0.1,
              flank_prob=0.9999999):
     """Create the profile HMM; ``self.model`` is only built if ``length`` is given.

     The model uses ``3 * length + 1`` states. ``flank_prob`` is forwarded to
     the transition, start and end helpers; all parameters and the state
     names come from this class's ``get_*`` helpers.
     """
     super(ProfileHMM, self).__init__()
     if length is None:
         return

     state_total = 3 * length + 1
     transitions = self.get_transmat(state_total, match_match, delete_insert,
                                     length, flank_prob)
     emission_dists = self.get_emission_dists(state_total, n_features,
                                              initial)
     start_probs = self.get_startprob(state_total, flank_prob)
     end_probs = self.get_endprob(state_total, flank_prob)
     names = self.get_state_names(length)
     self.model = HiddenMarkovModel.from_matrix(
         transition_probabilities=transitions,
         distributions=emission_dists,
         starts=start_probs,
         ends=end_probs,
         state_names=names)
def init(m=None, seed=None):
    """Load the data, stack the sequences and build the frozen noise model.

    Returns ``(sequences, labels, noise, k_range, state_range)`` where the
    two ranges are the fixed grids of cluster counts and states per model.
    """
    gc, mt, _track = load_data(m, seed)

    met_seqs, met_names = df_to_sequence_list(mt.data)
    gene_seqs, gene_names = df_to_sequence_list(gc.data)

    sequences = np.concatenate((met_seqs, gene_seqs), 0)
    labels = np.concatenate((met_names, gene_names))

    # Single-state standard-normal model with frozen distributions;
    # genes/metabolites will be assigned to it if other models fail to model
    # them better.
    unit_normal = [NormalDistribution(0, 1)]
    self_loop = np.array([[1]])
    start_probs = np.array([1])
    noise = HiddenMarkovModel.from_matrix(self_loop, unit_normal, start_probs,
                                          name='noise')
    noise.freeze_distributions()

    # khmm clustering over a range of k and states-per model
    k_range = [10, 25, 50, 100, 200]
    state_range = [5, 10, 25, 50, 100]
    return sequences, labels, noise, k_range, state_range
Ejemplo n.º 14
0
def roh_poissonhmm(gv,
                   pos,
                   phet_roh=0.001,
                   phet_nonroh=(0.0025, 0.01),
                   transition=1e-3,
                   window_size=1000,
                   min_roh=0,
                   is_accessible=None,
                   contig_size=None):
    """Call ROH (runs of homozygosity) in a single individual given a genotype vector.

    This function computes the likely ROH using a Poisson HMM model. The chromosome is divided into
    equally accessible windows of specified size, then the number of hets observed in each is used
    to fit a Poisson HMM. Note this is much faster than `roh_mhmm`, but at the cost of some
    resolution.

    The model is provided with a probability of observing a het in a ROH (`phet_roh`) and one
    or more probabilities of observing a het in a non-ROH, as this probability may not be
    constant across the genome (`phet_nonroh`).

    Parameters
    ----------
    gv : array_like, int, shape (n_variants, ploidy)
        Genotype vector.
    pos: array_like, int, shape (n_variants,)
        Positions of variants, same 0th dimension as `gv`.
    phet_roh: float, optional
        Probability of observing a heterozygote in a ROH. Appropriate values
        will depend on de novo mutation rate and genotype error rate.
    phet_nonroh: tuple of floats, optional
        One or more probabilities of observing a heterozygote outside of ROH.
        Appropriate values will depend primarily on nucleotide diversity within
        the population, but also on mutation rate and genotype error rate.
    transition: float, optional
        Probability of moving between states. This is based on windows, so a larger window size may
        call for a larger transitional probability.
    window_size: integer, optional
        Window size (equally accessible bases) to consider as a potential ROH. Setting this window
        too small may result in spurious ROH calls, while too large will result in a lack of
        resolution.
    min_roh: integer, optional
        Minimum size (bp) to consider as a ROH. Will depend on contig size and recombination rate.
    is_accessible: array_like, bool, shape (`contig_size`,), optional
        Boolean array for each position in contig describing whether accessible
        or not. Although optional, highly recommended so invariant sites are distinguishable from
        sites where variation is inaccessible.
    contig_size: integer, optional
        If is_accessible is not available, use this to specify the size of the contig, and assume
        all sites are accessible. Ignored (overwritten by ``is_accessible.size``) when
        `is_accessible` is given.

    Returns
    -------
    df_roh: DataFrame
        Data frame where each row describes a run of homozygosity. Columns are 'start',
        'stop', 'length' and 'is_marginal'. Start and stop are 1-based, stop-inclusive.
    froh: float
        Proportion of genome in a ROH.

    Raises
    ------
    ValueError
        If neither `is_accessible` nor `contig_size` is provided.

    Notes
    -----
    This function requires `pomegranate` (>= 0.9.0) to be installed.

    """

    # Validate the accessibility arguments first so that a bad call is
    # reported before the optional dependency is imported.
    if is_accessible is None:
        if contig_size is None:
            raise ValueError(
                "If is_accessible argument is not provided, you must provide contig_size"
            )
        is_accessible = np.ones((contig_size, ), dtype="bool")
    else:
        # The parameter is documented as array_like, so accept lists too.
        is_accessible = np.asarray(is_accessible)
        contig_size = is_accessible.size

    from pomegranate import HiddenMarkovModel, PoissonDistribution

    # equally accessible windows
    eqw = equally_accessible_windows(is_accessible, window_size)

    # number of heterozygous calls per window
    ishet = GenotypeVector(gv).is_het()
    counts, wins, records = windowed_statistic(pos, ishet, np.sum, windows=eqw)

    # heterozygote probabilities; index 0 is the ROH state
    het_px = np.concatenate([(phet_roh, ), phet_nonroh])

    # start probabilities (all equal); 1.0 keeps this a true division even
    # under Python 2 integer-division semantics
    start_prob = np.repeat(1.0 / het_px.size, het_px.size)

    # transition between underlying states
    transition_mx = _hmm_derive_transition_matrix(transition, het_px.size)

    # one Poisson emission per state, with rate = expected hets per window
    dists = [PoissonDistribution(x * window_size) for x in het_px]

    model = HiddenMarkovModel.from_matrix(
        transition_probabilities=transition_mx,
        distributions=dists,
        starts=start_prob)

    prediction = np.array(model.predict(counts[:, None]))

    df_blocks = tabulate_state_blocks(prediction,
                                      states=list(range(len(het_px))))
    # state 0 corresponds to phet_roh, i.e. the ROH state
    df_roh = df_blocks[(df_blocks.state == 0)].reset_index(drop=True)

    # adapt the dataframe for ROH: map window indices back to coordinates
    df_roh["start"] = df_roh.start_ridx.apply(lambda y: eqw[y, 0])
    df_roh["stop"] = df_roh.stop_lidx.apply(lambda y: eqw[y, 1])
    df_roh["length"] = df_roh.stop - df_roh.start

    # filter by ROH size
    if min_roh > 0:
        df_roh = df_roh[df_roh.length >= min_roh]

    # compute FROH
    froh = df_roh.length.sum() / contig_size

    return df_roh[["start", "stop", "length", "is_marginal"]], froh
def gen_model(sequences, labels, algorithm, initialization, restarts, n, k,
              out_dir, base_id, tied):
    """Cluster ``sequences`` with ``restarts`` random restarts; return the best run id.

    For every restart the sequences are randomly split over ``k`` models,
    each model is initialised on its share, a frozen noise model is added,
    and ``cluster`` is run. The restart with the highest
    ``total_log_prob`` wins; its collection id is also written to
    ``<out_dir>/best``.

    Parameters
    ----------
    sequences : 2-d array, one sequence per row (indexed as
        ``sequences[rows, :]``).
    labels : array; only ``labels.size`` is used here, the array itself is
        forwarded to ``cluster``.
    algorithm : forwarded to ``cluster``.
    initialization : ``'rand'``, ``'lr'`` or ``'cycle'``; selects the model
        initialiser.
    restarts : number of random restarts.
    n : sequence; ``n[0]`` is the state count (``'rand'``) or step count,
        ``n[1]`` the states per step (``'lr'`` / ``'cycle'``).
    k : number of models per restart.
    out_dir : output directory, ``'/'``-separated.
    base_id : prefix of each restart's collection id.
    tied : forwarded to ``cluster``.

    Returns
    -------
    The collection id of the best restart, or ``0`` if no restart succeeded.

    Raises
    ------
    ValueError
        If ``initialization`` is not one of the supported modes.
    """
    if initialization == 'rand':
        init_method = init_gaussian_hmm
        init_args = {'n_states': n[0]}
    elif initialization == 'lr':
        init_method = init_lr_hmm
        init_args = {'steps': n[0], 'state_per_step': n[1],
                     'force_end': True}
    elif initialization == 'cycle':
        init_method = init_cycle_hmm
        init_args = {'steps': n[0], 'state_per_step': n[1]}
    else:
        raise ValueError(
            "unknown initialization %r; expected 'rand', 'lr' or 'cycle'"
            % (initialization,))

    def split_by_model(randassign):
        # Map each model id ('0'..'k-1') to the row indices assigned to it.
        return {str(i): np.where(randassign == i)[0].tolist()
                for i in range(k)}

    best = 0
    best_score = float('-inf')

    # genes/metabolites will be assigned to noise model if other models
    # fail to model it better
    noise_dist = [NormalDistribution(0, 1)]
    noise_trans = np.array([[1]])
    starts = np.array([1])
    noise = HiddenMarkovModel.from_matrix(noise_trans, noise_dist, starts,
                                          name='noise')
    noise.freeze_distributions()

    # Draw every restart's random assignment up front.
    np.random.seed(int(time.time()))
    randassigns = [np.random.randint(k, size=labels.size)
                   for _ in range(restarts)]

    # Echo the initial assignments of every restart.
    for randassign in randassigns:
        print(split_by_model(randassign))

    # gen model for number of restarts
    for x, randassign in enumerate(randassigns):
        # Computed outside the try so the error handler can always use them.
        collection_id = base_id + '_' + str(x)
        odir = '/'.join(out_dir.split('/') + [collection_id])
        try:
            print('Learning:  %s' % collection_id)

            # initialize models on the random assignments
            assignments = split_by_model(randassign)
            models = {}
            for model_id, in_model in assignments.items():
                models[model_id] = init_method(sequences[in_model, :],
                                               model_id=model_id,
                                               **init_args)

            # add noise model
            models['noise'] = noise
            assignments['noise'] = []

            # all are un-fixed
            fixed = {model_id: [] for model_id in models}

            # perform clustering
            models, assignments, c = cluster(models=models,
                                             sequences=sequences,
                                             assignments=assignments,
                                             algorithm=algorithm,
                                             fixed=fixed, tied=tied,
                                             labels=labels,
                                             odir=odir)

            score = total_log_prob(models, sequences, assignments)
            if best_score < score:
                best_score = score
                best = collection_id
                bestfile = '/'.join(out_dir.split('/') + ['best'])
                with open(bestfile, 'w') as f:
                    f.write('%s\n' % collection_id)

        except Exception:
            # Best effort: log the failure for this restart and move on.
            error_file = '/'.join(odir.split('/') + ['errors.txt'])
            with open(error_file, 'a') as f:
                f.write('error computing parameters for:  %s\n'
                        % collection_id)
                f.write('Unexpected error: %s\n' % (sys.exc_info()[0],))

    return best