Example #1
def load_segmentation_model(modeldata):
    model = HiddenMarkovModel('model')

    states = {}
    for s in modeldata:
        if len(s['emission']) == 1:
            emission = NormalDistribution(*s['emission'][0][:2])
        else:
            weights = np.array([w for _, _, w in s['emission']])
            dists = [NormalDistribution(mu, sigma)
                     for mu, sigma, _ in s['emission']]
            emission = GeneralMixtureModel(dists, weights=weights)
        state = State(emission, name=s['name'])

        states[s['name']] = state
        model.add_state(state)
        if 'start_prob' in s:
            model.add_transition(model.start, state, s['start_prob'])

    for s in modeldata:
        current = states[s['name']]
        for nextstate, prob in s['transition']:
            model.add_transition(current, states[nextstate], prob)

    model.bake()

    return model
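For reference, a minimal sketch of the modeldata structure the loader above
expects, inferred from the code (the state names and numbers are hypothetical):

# hypothetical input: one entry per state; 'emission' holds (mu, sigma, weight)
# tuples. A single tuple yields a NormalDistribution; several yield a
# GeneralMixtureModel. 'transition' maps successor names to probabilities.
modeldata = [
    {'name': 'background', 'start_prob': 0.5,
     'emission': [(0.0, 1.0, 1.0)],
     'transition': [('background', 0.9), ('signal', 0.1)]},
    {'name': 'signal', 'start_prob': 0.5,
     'emission': [(3.0, 0.5, 0.6), (5.0, 1.0, 0.4)],
     'transition': [('signal', 0.8), ('background', 0.2)]},
]
model = load_segmentation_model(modeldata)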
Example #2
    def update_hmm(self):
        num_states = self.num_states
        start_prob = self.start_prob
        num_emissions = self.num_emissions

        hmm = HiddenMarkovModel('hmm')
        dist = [
            DiscreteDistribution(
                dict(zip(range(num_emissions), self.emissions[i])))
            for i in range(num_states)
        ]
        states = [
            State(dist[i], 's' + str(i).zfill(2)) for i in range(num_states)
        ]
        hmm.add_states(states)
        for i in range(num_states):
            s_i = states[i]
            hmm.add_transition(hmm.start, s_i, start_prob[i])
            for j in range(num_states):
                s_j = states[j]
                p = self.transitions[i, j]
                hmm.add_transition(s_i, s_j, p)

        self.hmm = hmm
        self.hmm.bake()
Example #3
    def _buildModel(self, data):
        '''
        Builds the model, using the data to initialize the distributions
        at a good starting point.
        data: 2d matrix; every row is a vector of features
        '''
        # we want to call from_matrix(transition, dists, starts, ends)
        tm = np.zeros((self.statesNumber, self.statesNumber))
        indices = [(x, x) for x in range(self.statesNumber)]
        indices.extend([(x, x + 1) for x in range(self.statesNumber)])
        # drop the pair (self.statesNumber - 1, self.statesNumber), which is out of bounds
        indices.pop()
        indices = np.array(indices)
        tm[indices[:, 0], indices[:, 1]] = 0.5
        # self-transition of the end state, set separately since we may tune it
        tm[self.statesNumber - 1, self.statesNumber - 1] = 0.5

        dists = self._initDists(data)

        starts = np.zeros((self.statesNumber, ))
        starts[0] = 1

        ends = np.zeros((self.statesNumber, ))
        ends[-1] = 0.5

        self.model = HiddenMarkovModel.from_matrix(tm,
                                                   dists,
                                                   starts,
                                                   ends,
                                                   name=self.mname)

        return self.model
Example #4
def init():
    m = 1000  # restricts number of genes, used for local testing
    gc, mt, track = load_data(m)
    state_range = [5, 10, 25, 50, 100]
    z_range = [3, 5, 10, 20]

    msequences, mlabels = df_to_sequence_list(mt.data)
    gsequences, glabels = df_to_sequence_list(gc.data)

    sequences = np.concatenate((msequences, gsequences), 0)
    labels = np.concatenate((mlabels, glabels))

    sequences = np.concatenate((sequences, -1 * sequences))

    # tie positive and negative expression sequences
    tied = {}
    for i, label in enumerate(labels):
        tied[label] = [i, i+labels.size]

    state_labels = np.concatenate(((labels + '+'), (labels + '-')))
    labels = np.concatenate((labels, labels))

    # noise model trained on all data once
    # genes/metabolites will be assigned to noise model if other models
    # fail to model it better
    noise_dist = [NormalDistribution(0, 1)]
    noise_trans = np.array([[1]])
    starts = np.array([1])
    noise = HiddenMarkovModel.from_matrix(noise_trans, noise_dist, starts)
    noise.freeze_distributions()
    return gc, mt, sequences, labels, state_labels, tied, noise, z_range, \
        state_range
Example #5
def init(m, seed):
    if m == -1:
        m = None
    gc, mt, track = load_data(m, seed)

    msequences, mlabels = df_to_sequence_list(mt.data)
    gsequences, glabels = df_to_sequence_list(gc.data)

    sequences = np.concatenate((msequences, gsequences), 0)
    labels = np.concatenate((mlabels, glabels))

    sequences = np.concatenate((sequences, -1 * sequences))

    # tie positive and negative expression sequences
    tied = {}
    for i, label in enumerate(labels):
        tied[label] = [i, i+labels.size]

    labels = np.concatenate(((labels + '+'), (labels + '-')))

    # noise model trained on all data once
    # genes/metabolites will be assigned to noise model if other models
    # fail to model it better
    noise_dist = [NormalDistribution(0, 1)]
    noise_trans = np.array([[1]])
    starts = np.array([1])
    noise = HiddenMarkovModel.from_matrix(noise_trans, noise_dist, starts,
                                          name='noise')
    noise.freeze_distributions()

    return sequences, labels, tied, noise
Example #6
def insert_delete_main_hmm(data_matrix):
    v_columns = column_clasify(data_matrix)
    v_zones = create_zones(v_columns)
    v_grouped_states = group_states(v_zones, 'test')
    v_model = HiddenMarkovModel()
    v_first_state = State(None, name='ali_start')
    v_last_state = State(None, name='ali_end')
    v_model.add_state(v_first_state)
    v_model.add_transition(v_model.start, v_first_state, 1)
    v_model.add_state(v_last_state)
    add_states(v_model, v_grouped_states)
    v_trans = calculate_transitions(v_first_state, v_last_state,
                                    v_grouped_states)
    apply_transitions(v_model, v_trans)
    v_model.bake()
    return v_model
Example #7
    def HiddenSpaceGenerator(cls, X, n_components):
        """This method creates more features by training a HiddenMarkovModel
        on the game statistics, then returns the hidden state space of each
        timestep/game as a new feature. HOWEVER, note that this doesn't
        return a list of features, instead it returns a HMM that can generate
        more features but can still be used for classification also.

        Arguments:
            X: the input features (with a series_idx)
            n_components: The number of hidden state space variables to
                initialize. Note that (sadly) pomegranate does not implement
                continuous HMMs, so it will discretize every continuous
                variable by K-means so then the outputted space will be
                discrete.
        """
        # no series id is negative, so -1 forces a new series on the first row
        last_series_idx = -1
        # the data is temporal, so split X into a list of per-series arrays
        _X = []
        for row in X:
            if row[0] != last_series_idx:
                # create a new series to train on, start with empty input
                # series then add to them while the row has the same index
                last_series_idx = row[0]
                _X.append(np.full((0, X.shape[1] - 1), None, dtype=None))

            # add the datapoint to the current series
            _X[-1] = np.vstack((_X[-1], row[1:]))

        # now train an HMM to the data
        return HiddenMarkovModel.from_samples(MultivariateGaussianDistribution,
                                              n_components, _X)
Example #8
 def load(self, file_path):
     with open(file_path, 'rb') as f:
         objects = pickle.load(f)
     try:
         self.transformer = objects['transformer']
     except KeyError:  # for backwards compatibility
         self.transformer = objects['lda']
     self.hmm = HiddenMarkovModel.from_json(objects['hmm'])
Example #9
def get_variable_number_of_repeats_matcher_hmm(patterns,
                                               copies=1,
                                               vpaths=None):
    model = get_constant_number_of_repeats_matcher_hmm(patterns, copies,
                                                       vpaths)

    start_repeats_matches = State(None, name='start_repeating_pattern_match')
    end_repeats_matches = State(None, name='end_repeating_pattern_match')
    mat = model.dense_transition_matrix()
    states = model.states
    states.append(start_repeats_matches)
    states.append(end_repeats_matches)
    states_count = len(mat)
    start_repeats_ind = states_count
    end_repeats_ind = states_count + 1
    mat = np.c_[mat, np.zeros(states_count), np.zeros(states_count)]
    mat = np.r_[mat, [np.zeros(states_count + 2)]]
    mat = np.r_[mat, [np.zeros(states_count + 2)]]

    unit_ends = []
    for i, state in enumerate(model.states):
        if state.name.startswith('unit_end'):
            unit_ends.append(i)

    first_unit_start = None
    for i in range(len(mat[model.start_index])):
        if mat[model.start_index][i] != 0:
            first_unit_start = i
    mat[model.start_index][first_unit_start] = 0.0
    mat[model.start_index][start_repeats_ind] = 1
    mat[start_repeats_ind][first_unit_start] = 1

    for unit_end in unit_ends:
        next_state = None
        for j in range(len(mat[unit_end])):
            if mat[unit_end][j] != 0:
                next_state = j
        mat[unit_end][next_state] = 0.5
        mat[unit_end][end_repeats_ind] = 0.5

    mat[end_repeats_ind][model.end_index] = 1

    starts = np.zeros(states_count + 2)
    starts[model.start_index] = 1.0
    ends = np.zeros(states_count + 2)
    ends[model.end_index] = 1.0
    state_names = [state.name for state in states]
    distributions = [state.distribution for state in states]
    name = 'Repeat Matcher HMM Model'
    new_model = Model.from_matrix(mat,
                                  distributions,
                                  starts,
                                  ends,
                                  name=name,
                                  state_names=state_names,
                                  merge=None)
    new_model.bake(merge=None)
    return new_model
Example #10
def hmm(df, emissions, n_states, algorithm):
    model = HiddenMarkovModel.from_samples(
        distribution=MultivariateGaussianDistribution,
        n_components=n_states,
        X=df[emissions].to_numpy(),
        algorithm=algorithm,
        verbose=True,
    )
    return model
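A hedged usage sketch for the wrapper above; the DataFrame and its column
names are invented for illustration:

# hypothetical usage: fit a 3-state HMM on two made-up emission columns
import numpy as np
import pandas as pd

df = pd.DataFrame({'x': np.random.randn(500), 'y': np.random.randn(500)})
model = hmm(df, emissions=['x', 'y'], n_states=3, algorithm='baum-welch')
print(model.predict(df[['x', 'y']].to_numpy()))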
Example #11
def get_read_matcher_model(left_flanking_region,
                           right_flanking_region,
                           patterns,
                           copies=1,
                           vpaths=None):
    model = get_suffix_matcher_hmm(left_flanking_region)
    repeats_matcher = get_variable_number_of_repeats_matcher_hmm(
        patterns, copies, vpaths)
    right_flanking_matcher = get_prefix_matcher_hmm(right_flanking_region)
    model.concatenate(repeats_matcher)
    model.concatenate(right_flanking_matcher)
    model.bake(merge=None)

    mat = model.dense_transition_matrix()

    first_repeat_matches = []
    repeat_match_states = []
    suffix_start = None
    for i, state in enumerate(model.states):
        if state.name[0] == 'M' and state.name.split('_')[-1] == '0':
            first_repeat_matches.append(i)
        if state.name[0] == 'M' and state.name.split('_')[-1] not in [
                'prefix', 'suffix'
        ]:
            repeat_match_states.append(i)
        if state.name == 'suffix_start_suffix':
            suffix_start = i

    mat[model.start_index][suffix_start] = 0.3
    for first_repeat_match in first_repeat_matches:
        mat[model.start_index][first_repeat_match] = \
            0.7 / len(first_repeat_matches)

    for match_state in repeat_match_states:
        to_end = 0.7 / len(repeat_match_states)
        total = 1 + to_end
        for next_state in range(len(mat[match_state])):
            if mat[match_state][next_state] != 0:
                mat[match_state][next_state] /= total
        mat[match_state][model.end_index] = to_end / total

    starts = np.zeros(len(model.states))
    starts[model.start_index] = 1.0
    ends = np.zeros(len(model.states))
    ends[model.end_index] = 1.0
    state_names = [state.name for state in model.states]
    distributions = [state.distribution for state in model.states]
    name = 'Read Matcher'
    new_model = Model.from_matrix(mat,
                                  distributions,
                                  starts,
                                  ends,
                                  name=name,
                                  state_names=state_names,
                                  merge=None)
    new_model.bake(merge=None)
    return new_model
Example #12
def test_sample_from_site():

    dists = [
        NormalDistribution(5, 1),
        NormalDistribution(1, 7),
        NormalDistribution(8, 2)
    ]
    trans_mat = np.array([[0.7, 0.3, 0.0], [0.0, 0.8, 0.2], [0.0, 0.0, 0.9]])
    starts = np.array([1.0, 0.0, 0.0])
    ends = np.array([0.0, 0.0, 0.1])
    model = HiddenMarkovModel.from_matrix(trans_mat, dists, starts, ends)
    model.plot()
Example #13
def gaussian_hmm(n_states, lower, upper, variance, model_id):
    """
    insantiate a model with random parameters
    randomly generates start and transition matrices
    generates nomal distrobutions for each state from partition on sequences
    """
    np.random.seed(int(time.time()))

    model = HiddenMarkovModel(model_id)

    # make states with distributions from random subsets of timepoints
    x = np.linspace(lower, upper, n_states)
    states = []
    for i in range(n_states):
        dist = \
            NormalDistribution(x[i], variance)
        states.append(State(dist, name=str(i)))

    model.add_states(states)

    # add uniform start probabilities
    start_prob = 1.0 / n_states
    start_probs = []
    for i in range(n_states):
        start_probs.append(start_prob + np.random.ranf())
    start_probs = np.array(start_probs)
    start_probs = start_probs / start_probs.sum()
    for i, state in enumerate(states):
        model.add_transition(model.start, state, start_probs[i])

    # add transition probabilities proportional to probability of generating
    # one state mean from another
    for state1 in states:
        transitions = []
        for other_state in states:
            transitions.append(np.exp(state1.distribution.log_probability(
                other_state.distribution.parameters[0])) + np.random.ranf())
        transitions = np.array(transitions)
        transitions = transitions / transitions.sum()
        for i, state2 in enumerate(states):
            model.add_transition(state1, state2, transitions[i])

    model.bake()
    print 'Initialized HMM: ', model.name
    return model
Example #14
def init(base_dir):
    print base_dir
    cluster_directories = \
        glob.glob(base_dir + '/*')

    initial_clusterings = {}
    clusterings = {}
    clusterings_models = {}
    for cluster_dir in cluster_directories:
        try:
            clustering_id = cluster_dir.split('/')[-1:][0]
            # read initial clusters
            initial_clusters = {}
            filepath = '/'.join(cluster_dir.split('/') + ['init_assignments.txt'])
            lines = (open(filepath, 'r').read().splitlines())
            l = 0
            while l < len(lines):
                cluster_name = lines[l]
                cluster_members = lines[l + 1].split('\t')
                initial_clusters[cluster_name] = cluster_members
                l += 4

            initial_clusterings[clustering_id] = initial_clusters

            # read final clusters
            clusters = {}
            filepath = '/'.join(cluster_dir.split('/') + ['assignments.txt'])
            lines = (open(filepath, 'r').read().splitlines())
            l = 0
            while l < len(lines):
                cluster_name = lines[l]
                cluster_members = lines[l + 1].split('\t')
                clusters[cluster_name] = cluster_members
                l += 4

            clusterings[clustering_id] = clusters

            # load models
            models = {}
            model_files = glob.glob(cluster_dir + '/*')
            for model_file in model_files:
                try:
                    model_id = model_file.split('/')[-1:][0]
                    json = open(model_file).read()
                    models[model_id] = HiddenMarkovModel.from_json(json)
                    print 'model loaded from: ', model_file
                except:
                    pass
            clusterings_models[clustering_id] = models
        except:
            pass
    return initial_clusterings, clusterings
Example #15
def lambda_handler(event, context):
    # TODO implement
    content_object = s3.get_object(Bucket='bhargav-ml-trained-models', Key='pos_model.txt')
    file_content = content_object['Body'].read().decode()
    json_content = json.loads(file_content)
    model = HiddenMarkovModel.from_json(json_content)
    sentence = event['body'].split(' ')
    output = simplify_decoding(sentence, model)
    return {
        'statusCode': 200,
        'headers': {'Content-Type': 'text/plain', 'Access-Control-Allow-Origin': '*'},
        'body': output
    }
Example #16
    def get_vntr_matcher_hmm(self, read_length):
        """Try to load trained HMM for this VNTR
        If there was no trained HMM, it will build one and store it for later usage
        """
        logging.info('Using read length %s' % read_length)
        copies = self.get_copies_for_hmm(read_length)

        base_name = str(
            self.reference_vntr.id) + '_' + str(read_length) + '.json'
        stored_hmm_file = settings.TRAINED_HMMS_DIR + base_name
        if settings.USE_TRAINED_HMMS and os.path.isfile(stored_hmm_file):
            model = Model()
            model = model.from_json(stored_hmm_file)
            return model

        flanking_region_size = read_length
        vntr_matcher = self.build_vntr_matcher_hmm(copies,
                                                   flanking_region_size)

        json_str = vntr_matcher.to_json()
        with open(stored_hmm_file, 'w') as outfile:
            outfile.write(json_str)
        return vntr_matcher
Example #17
    def cluster(self):
        if self.preprocessed_data is None:
            print("No preprocessed_data attribute found")
            return -1

        if self.alg == "Kmeans":
            from sklearn.cluster import KMeans
            km = KMeans(n_clusters=self.K, precompute_distances=True)
            km.fit(np.concatenate(
                self.preprocessed_data))  #flattens all dates together
            self.states = [km.predict(d) for d in self.preprocessed_data]

        elif self.alg == "HMM":
            from pomegranate import HiddenMarkovModel, MultivariateGaussianDistribution
            distribution = MultivariateGaussianDistribution
            hmm = HiddenMarkovModel.from_samples(
                distribution, n_components=self.K,
                X=self.preprocessed_data.copy())
            self.states = [
                np.array(hmm.predict(d.copy())) for d in self.preprocessed_data
            ]
        else:
            print("Unrecognised or undefined clustering algorithm.")
            return -1
        self.experiment_progress = 2
Example #18
def create_hidden_MarkovModel(e_df, q_df, start_p_dict):
    """
    Creates a Hidden Markov Model based on DataFrames
    @args:
        - e_df (pd.DataFrame): contains the emission probabilities
        - q_df (pd.DataFrame): contains the transition probabilities
    """
    model = HiddenMarkovModel(name="Example Model")

    '#1: Create a dict for each key in the transition df'
    model_dict = {}
    for key in q_df.keys().values:
        model_dict[key] = {}

    '#2: Create the states'
    for key in model_dict:
        '#2.1.Step: Add the emission prob. to each state, P(observation | state)'
        emission_p = DiscreteDistribution(e_df[key].to_dict())
        model_dict[key] = State(emission_p, name=key)
        model.add_state(model_dict[key])
        '#2.2.Step: Add the start probability for each state'
        model.add_transition(model.start, model_dict[key], start_p_dict[key])

    '#3.Step: Add the transition probability to each state'
    for key, item in q_df.to_dict("index").items():
        for item_name, value in item.items():
            print(key, " , ", item_name, ": ", value)
            tmp_origin = model_dict[key]
            tmp_destination = model_dict[item_name]
            model.add_transition(tmp_origin, tmp_destination,
                                 q_df.loc[key, item_name])
    # finally, call the .bake() method to finalize the model
    model.bake()

    return model
Example #19
def hmm(df, num_states):
    "HMM program"
    # df['value']=df['value'].replace(0,np.nan) #this removes unmappable areas of chr
    # df_dropna=df.dropna(subset=['value']) #this removes unmappable areas of chr (NaN is otherwise considered 0)
    vals = df["value"].values
    model = HiddenMarkovModel.from_samples(NormalDistribution,
                                           X=[vals],
                                           n_components=num_states)
    states = model.predict(vals)

    # Rename states to increase with mean signal
    order = np.argsort(df['value'].groupby(states).mean())
    states = [order[s] for s in states]
    df["state"] = states
    df.loc[np.isnan(df['value']), 'state'] = np.nan
    return df
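A hypothetical call of the function above, assuming a DataFrame with the
'value' column it expects:

# hypothetical usage: segment a two-level signal into 2 states
import numpy as np
import pandas as pd

values = np.concatenate([np.random.normal(0, 1, 200),
                         np.random.normal(5, 1, 200)])
result = hmm(pd.DataFrame({'value': values}), num_states=2)
print(result['state'].value_counts())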
Example #20
def run_test(arg, k):
    np.random.seed(k)
    exp_type = arg['type']
    N = arg['N']
    alpha = arg['alpha']
    n_comp = arg['n_comp']
    norm_params = arg['norm_params']
    save_dir = arg['dir']

    sequence = generator.Sequence(N, alpha, type=exp_type, params=norm_params)
    labels = list(map(myutils.rename_state, sequence.path))
    model = HiddenMarkovModel.from_samples(DiscreteDistribution,
                                           n_components=n_comp,
                                           X=[sequence.sequence],
                                           labels=[labels],
                                           algorithm='labeled')
    return model, sequence.sequence
Example #21
def generate(
    genre_folder: str,
    bpm: int,
    beats: int,
    steps: int,
    onset: str,
    components: int,
    regex: str,
    output: str,
    include: bool,
):
    """
    This command generates a new unique beat, based on the audio files in the given input folder.
    """

    audios = util.read_audio_files(include, Path(genre_folder), regex)

    sequences, samples = util.create_knowledge_base(
        audios, OnsetAlgorithm(onset.lower()), beats, steps
    )

    # Create the model
    # sequences = [add_up_ones(seq) for seq in sequences]
    model: HiddenMarkovModel = HiddenMarkovModel.from_samples(
        DiscreteDistribution,
        n_components=components,
        X=sequences,
        algorithm="viterbi",
        verbose=True,
        name="groover",
    )
    # model: MarkovChain = MarkovChain.from_samples(X=sequences)
    # lengths: List[int] = [len(x) for x in sequences]
    sequence = model.sample(length=beats * steps)
    sequence = sequences[0]
    print(sequence)
    # sequence = ones(sequence)
    # print(len(sequence))
    print(
        "BPM: {}, Beats: {}, Steps:{}, Onset Algorithm: {}".format(
            bpm, beats, steps, onset
        )
    )

    # Save the beat
    util.create_beat(sequence, samples, bpm, beats, steps).save(Path(output))
Example #22
class HMMWrapper:
    def __init__(self):
        self.model = HiddenMarkovModel()
        self.start = self.model.start
        self.end = self.model.end
        self.states_before_bake = []
        self.states = None

    def add_state(self, state, start_prob=0):
        self.states_before_bake.append((state, start_prob))
        self.model.add_state(state)

    def add_transition(self, start_state, end_state, prob):
        # print('adding from', start_state.name, 'to', end_state.name, prob)
        self.model.add_transition(start_state, end_state, prob)

    def bake(self):
        starter_states_no_prob = []
        free_start_prob = 1.0
        for state in self.states_before_bake:
            if 'none' not in state[0].name:
                if not state[1]:
                    starter_states_no_prob.append(state)
                else:
                    free_start_prob -= state[1]
                    print('assigned ' + str(state[1]) + ' to ' + state[0].name)
                    self.add_transition(self.start, state[0], state[1])

        len_no_prob = len(starter_states_no_prob)
        starter_prob = free_start_prob / len_no_prob
        print(len_no_prob, starter_prob)
        for state, _ in starter_states_no_prob:
            self.add_transition(self.start, state, starter_prob)

        self.model.bake()
        self.states = self.model.states

    def make_states_from_alignment(self, first_state, last_state, seq_matrix,
                                   name):
        columns = column_clasify(seq_matrix)
        zones = create_zones(columns)
        grouped_states = group_states(zones, name)
        add_states(self, grouped_states)
        trans = calculate_transitions(first_state, last_state, grouped_states)
        apply_transitions(self, trans)

    def predict(self, *args, **kwargs):
        return self.model.predict(*args, **kwargs)
Example #23
    def fit_hmm(self,
                signal_arrays,
                state_vectors,
                distribution,
                state_transition_threshold=1e-4,
                **kwargs):

        # We want to bunch together artefact states with their
        # corresponding "clean" states.
        state_vectors = [np.abs(vec) for vec in state_vectors]

        # remove 'undefined' samples
        # TODO: let pomegranate handle that
        signal_arrays = [
            arr[vec != 0] for arr, vec in zip(signal_arrays, state_vectors)
        ]
        state_vectors = [vec[vec != 0] for vec in state_vectors]

        # Pomegranate expects string labels for valid states and None for invalid states.
        # labels = [[str(state) if state != 0 else None for state in vec] for vec in state_vectors]
        labels = [[str(state) for state in vec] for vec in state_vectors]

        # construct matching state names
        # state_names = [str(state) for state in np.unique(np.concatenate(state_vectors)) if state != 0]
        state_names = [
            str(state) for state in np.unique(np.concatenate(state_vectors))
        ]

        # fit HMM states to transformed signals
        signals = [self.transform(arr) for arr in signal_arrays]

        hmm = HiddenMarkovModel.from_samples(distribution=distribution,
                                             n_components=len(state_names),
                                             X=signals,
                                             labels=labels,
                                             algorithm='labeled',
                                             state_names=state_names,
                                             **kwargs)

        if state_transition_threshold > 0.:
            new_hmm = _sparsify_hmm(hmm, state_transition_threshold)
            return new_hmm

        else:
            return hmm
Example #24
    def fit(self, data):
        """
        Fits a model---learns transition and emission probabilities

        Arguments:
            data: list of SMILES
        """
        list_data = [list(smiles) for smiles in data]
        self.model = HiddenMarkovModel.from_samples(
            DiscreteDistribution, n_components=self.n_components,
            end_state=True, X=list_data,
            init='kmeans||', verbose=self.verbose, n_jobs=self.n_jobs,
            max_iterations=self.epochs,
            batches_per_epoch=self.batches_per_epoch,
            random_state=self.seed
        )
        self.fitted = True
        return self
Example #25
    def load(cls, path):
        """
        Loads saved model

        Arguments:
            path: path to saved .pkl file

        Returns:
            Loaded HMM
        """
        with open(path, "rb") as f:
            data = pickle.load(f)
        hmm = data['model']
        del data['model']
        model = cls(**data)
        model.model = HiddenMarkovModel.from_json(hmm)
        model.fitted = True
        return model
Example #26
    def fit(self, X, y=None):
        X_processed = self._check_and_preprocess(X, True)
        self.hmmmodel = HiddenMarkovModel.from_samples(
            NormalDistribution,
            self.n_states,
            X_processed,
            algorithm="baum-welch",
            n_jobs=8,
            verbose=self.verbose,
            batches_per_epoch=20,
            max_iterations=self.max_iterations)
        self.hmmmodel.bake()

        self.decision_scores_ = np.zeros(X.shape[0])
        for i, sequence in enumerate(X_processed):
            self.decision_scores_[i] = -self.hmmmodel.log_probability(sequence)

        self._process_decision_scores()
Example #27
def create_casas7_HMM_with_prepared_train_and_test_based_on_seq_of_activities(
        train_set, list_of_persons_in_train, test_set,
        list_of_persons_in_test):
    '''
    create a single HMM for all persons
    train_set = an ndarray that has a train_set for each person separately
    test_set =
    '''

    # concatenate the train_sets and test_sets of all people
    number_of_persons = len(train_set)
    final_train_set = train_set[0]
    final_test_set = test_set[0]
    final_train_set_labels = list_of_persons_in_train[0]
    final_test_set_labels = list_of_persons_in_test[0]
    #print(type(final_train_set) , type(train_set) , type(train_set[1]))
    for per in range(1, number_of_persons):
        final_train_set = np.concatenate((final_train_set, train_set[per]),
                                         axis=0)
        final_test_set = np.concatenate((final_test_set, test_set[per]),
                                        axis=0)
        final_train_set_labels = np.concatenate(
            (final_train_set_labels, list_of_persons_in_train[per]), axis=0)
        final_test_set_labels = np.concatenate(
            (final_test_set_labels, list_of_persons_in_test[per]), axis=0)

    #r = np.shape(final_train_set)
    #for i in range(r[0]):
    #    print(np.shape(final_train_set[i]))
    #final_train_set = np.array([[1,2,3,0,0] , [1,2,0,0,0]], dtype = np.ndarray)
    #final_train_set_labels = np.array([1,2] , dtype= np.ndarray)
    print(type(final_train_set[11]), np.shape(final_train_set[11]))
    print(final_train_set[0:2])
    model = HiddenMarkovModel.from_samples(
        DiscreteDistribution,
        n_components=2,
        X=final_train_set,
        labels=final_train_set_labels,
        algorithm='labeled'
    )  # according to my tests :D n_components is number of hidden states
    print(model)
    #return 0
    #test
    '''predicted_labels = np.zeros_like(actual_labels)
Example #28
 def __init__(self,
              length=None,
              n_features=None,
              initial=None,
              match_match=0.9,
              delete_insert=0.1,
              flank_prob=0):  #last is polymorphism dummy
     super(ProfileHMM, self).__init__()
     if length is not None:
         n_states = 3 * length + 1
         #print(self.get_emission_dists(n_states, n_features, initial)[:3])
         self.model = HiddenMarkovModel.from_matrix(
             transition_probabilities=self.get_transmat(
                 n_states, match_match, delete_insert),
             distributions=self.get_emission_dists(n_states, n_features,
                                                   initial),
             starts=self.get_startprob(n_states),
             ends=self.get_endprob(n_states),
             state_names=self.get_state_names(length))
Example #29
 def build_model(self):
     distributions = []
     for _ in range(self.hidden_size):
         emission_probs = np.random.random(self.num_characters)
         emission_probs = emission_probs / emission_probs.sum()
         distributions.append(
             DiscreteDistribution(
                 dict(zip(self.all_characters, emission_probs))))
     trans_mat = np.random.random((self.hidden_size, self.hidden_size))
     trans_mat = trans_mat / trans_mat.sum(axis=1, keepdims=1)
     starts = np.random.random(self.hidden_size)
     starts = starts / starts.sum()
     # testing initializations
     np.testing.assert_almost_equal(starts.sum(), 1)
     np.testing.assert_array_almost_equal(np.ones(self.hidden_size),
                                          trans_mat.sum(axis=1))
     self.model = HiddenMarkovModel.from_matrix(trans_mat, distributions,
                                                starts)
     self.model.bake()
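The same random-initialization pattern, as a standalone sketch; the sizes and
the alphabet are made up for illustration:

# hypothetical standalone version of the initialization above
import numpy as np
from pomegranate import DiscreteDistribution, HiddenMarkovModel

hidden_size, characters = 4, ['a', 'b', 'c']
distributions = []
for _ in range(hidden_size):
    p = np.random.random(len(characters))
    p = p / p.sum()  # normalize emission probabilities
    distributions.append(DiscreteDistribution(dict(zip(characters, p))))
trans_mat = np.random.random((hidden_size, hidden_size))
trans_mat = trans_mat / trans_mat.sum(axis=1, keepdims=True)  # row-stochastic
starts = np.random.random(hidden_size)
starts = starts / starts.sum()
model = HiddenMarkovModel.from_matrix(trans_mat, distributions, starts)
model.bake()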
Example #30
def create_hmm_from_sample(file_address):

    #data, _ , _ = read_sequence_based_CSV_file_with_activity(file_address = file_address, has_header = True , separate_data_based_on_persons = False )
    #data = read_data_from_CSV_file(dest_file = file_address, data_type = np.int ,  has_header = True , return_as_pandas_data_frame = False )
    '''
    data = np.delete(data , 2, 1)
    data = np.delete(data , 2, 1)
    data = np.delete(data , 0, 1)
    data = np.delete(data , 0, 1)
    data = np.delete(data , 0, 1)
    print(np.shape(data))
    '''
    #print(data)
    data = np.array([['a', 'b'], ['a', 'b']])
    data = np.array([[np.array([1, 2, 3]),
                      np.array([1, 1, 1])],
                     [np.array([1, 1, 2]),
                      np.array([1, 2, 2])]])
    data = [
        np.array([[1, 2, 3], [1, 2, 3]], np.int32),
        np.array([[1, 2, 3], [1, 2, 3]], np.int32),
        np.array([[1, 2, 3], [1, 2, 3]], np.int32)
    ]
    print(data)
    #data = np.array([[['a' , 'b'] , ['a' , 'a']] , [['a' , 'b'] , ['b' , 'b']]])

    #data = create_sequence_of_sensor_events_based_on_activity(address_to_read = file_address, has_header = False, address_for_save = " ", isSave = False)#read_data_from_CSV_file(dest_file = file_address, data_type = numpy.int ,  has_header = False , return_as_pandas_data_frame = False )
    model = HiddenMarkovModel.from_samples(
        MultivariateDistribution, n_components=3, X=data
    )  # according to my tests :D n_components is number of hidden states

    #print(model)
    #print(model._baum_welch_summarize())
    #model.plot()
    '''
    print("dense_transition_matrix:" , model.dense_transition_matrix())
    print("edge_count:" , model.edge_count())
    print("edges:" , model.edges)
    print("name:" , model.name)
    print("state_count:" , model.state_count())
    '''
    print(model)
Example #31
    def oriHMMParams(self, numdists=3):
        """
        Set initial parameters for the Hidden Markov Model (HMM).
        
        """
        # GMM emissions
        # 3 Hidden States:
        # 0--downstream, 1--no bias, 2--upstream
        if numdists == 1:
            dists = [
                NormalDistribution(-2.5, 7.5),
                NormalDistribution(0, 7.5),
                NormalDistribution(2.5, 7.5)
            ]
        else:
            var = 7.5 / (numdists - 1)
            means = [[], [], []]
            for i in range(numdists):
                means[0].append(i * 7.5 / (numdists - 1) + 2.5)
                means[1].append(i * 7.5 * (-1)**i / (numdists - 1))
                means[2].append(-i * 7.5 / (numdists - 1) - 2.5)

            dists = []
            for i, m in enumerate(means):
                tmp = []
                for j in m:
                    tmp.append(NormalDistribution(j, var))
                mixture = GeneralMixtureModel(tmp)
                dists.append(mixture)

        # transition matrix
        A = [[0.34, 0.33, 0.33], [0.33, 0.34, 0.33], [0.33, 0.33, 0.34]]
        starts = np.ones(3) / 3

        hmm = HiddenMarkovModel.from_matrix(A,
                                            dists,
                                            starts,
                                            state_names=['0', '1', '2'],
                                            name='mixture{0}'.format(numdists))

        return hmm
Example #32
def init():
    m = 1000  # restricts number of genes, used for local testing
    gc, mt, track = load_data(m)
    state_range = [5, 10, 25, 50, 100]
    z_range = [3, 5, 10, 20]

    msequences, mlabels = df_to_sequence_list(mt.data)
    gsequences, glabels = df_to_sequence_list(gc.data.iloc[:m, :])

    sequences = np.concatenate((msequences, gsequences))
    labels = np.concatenate((mlabels, glabels))

    # noise model trained on all data once
    # genes/metabolites will be assigned to noise model if other models
    # fail to model it better
    noise_dist = [NormalDistribution(0, 1)]
    noise_trans = np.array([[1]])
    starts = np.array([1])
    noise = HiddenMarkovModel.from_matrix(noise_trans, noise_dist, starts)
    noise.freeze_distributions()
    return gc, mt, sequences, labels, noise, z_range, state_range
Example #33
class ModelWrapper:
    def __init__(self):
        self.model = HiddenMarkovModel()

    def add_state(self, distribution, name):
        state = State(distribution, name=name)
        self.model.add_state(state)
        return state

    def bake(self):
        self.model.bake()

    def viterbi(self, seq):
        return self.model.viterbi(seq)

    def add_transition(self, states, next_state_data):
        for state in states:
            for next_data in next_state_data:
                self.model.add_transition(state, next_data[0], next_data[1])
Example #34
def init(m, seed):
    if m == -1:
        m = None
    gc, mt, track = load_data(m, seed)

    msequences, mlabels = df_to_sequence_list(mt.data)
    gsequences, glabels = df_to_sequence_list(gc.data)

    sequences = np.concatenate((msequences, gsequences), 0)
    labels = np.concatenate((mlabels, glabels))

    # noise model trained on all data once
    # genes/metabolites will be assigned to noise model if other models
    # fail to model it better
    noise_dist = [NormalDistribution(0, 1)]
    noise_trans = np.array([[1]])
    starts = np.array([1])
    noise = HiddenMarkovModel.from_matrix(noise_trans, noise_dist, starts,
                                          name='noise')
    noise.freeze_distributions()

    return sequences, labels, noise
Example #35
 def __init__(self,
              length=None,
              n_features=None,
              initial=None,
              match_match=0.9,
              delete_insert=0.1,
              flank_prob=0.9999999):
     super(ProfileHMM, self).__init__()
     if length is not None:
         n_states = 3 * length + 1
         transmat = self.get_transmat(n_states, match_match, delete_insert,
                                      length, flank_prob)
         #print(transmat.shape)
         #np.set_printoptions(edgeitems=10, linewidth=200)
         #print(transmat.round(2))
         emissions = self.get_emission_dists(n_states, n_features, initial)
         self.model = HiddenMarkovModel.from_matrix(
             transition_probabilities=transmat,
             distributions=emissions,
             starts=self.get_startprob(n_states, flank_prob),
             ends=self.get_endprob(n_states, flank_prob),
             state_names=self.get_state_names(length))
Example #36
def init(m=None, seed=None):
    gc, mt, track = load_data(m, seed)

    msequences, mlabels = df_to_sequence_list(mt.data)
    gsequences, glabels = df_to_sequence_list(gc.data)

    sequences = np.concatenate((msequences, gsequences), 0)
    labels = np.concatenate((mlabels, glabels))

    # noise model trained on all data once
    # genes/metabolites will be assigned to noise model if other models
    # fail to model it better
    noise_dist = [NormalDistribution(0, 1)]
    noise_trans = np.array([[1]])
    starts = np.array([1])
    noise = HiddenMarkovModel.from_matrix(noise_trans, noise_dist, starts,
                                          name='noise')
    noise.freeze_distributions()

    # khmm clustering over a range of k and states-per model
    k_range = [10, 25, 50, 100, 200]
    state_range = [5, 10, 25, 50, 100]
    return sequences, labels, noise, k_range, state_range
Example #37
def create_casas7_hmm(file_address, has_activity):

    if has_activity:
        list_of_data, list_of_persons, _ = read_sequence_based_CSV_file_with_activity(
            file_address=file_address,
            has_header=True,
            separate_data_based_on_persons=False)
    else:
        list_of_data, list_of_persons = read_sequence_based_CSV_file_without_activity(
            file_address=file_address,
            has_header=True,
            separate_data_based_on_persons=False)

    model = ""

    try:
        model = HiddenMarkovModel.from_samples(DiscreteDistribution,
                                               n_components=5,
                                               X=list_of_data,
                                               algorithm='baum-welch')
        #model = HiddenMarkovModel.from_samples(DiscreteDistribution, n_components=2, X=list_of_data , labels = list_of_persons , algorithm = 'labeled' )
    except KeyError:
        print('there is an exception')
    print(model)

    #print((list_of_persons[0]))
    print("np.shape(list_of_data):", np.shape(list_of_data))

    #print(model._baum_welch_summarize())
    model.plot()
    print("dense_transition_matrix:", model.dense_transition_matrix())
    print("edge_count:", model.edge_count())
    print("edges:", model.edges)
    print("name:", model.name)
    print("state_count:", model.state_count())
    #print("summarize:" , model.summarize())
    print(model.thaw())
Example #38
#!/usr/bin/env python2.7
# example.py: Yet Another Hidden Markov Model library
# Contact: Jacob Schreiber ( [email protected] )

"""
A simple example highlighting how to build a model using states, add
transitions, and then run the algorithms, including showing how training
on a sequence improves the probability of the sequence.
"""

import random
from pomegranate import *
from pomegranate import HiddenMarkovModel as Model

random.seed(0)
model = Model(name="ExampleModel")
distribution = UniformDistribution(0.0, 1.0)
state = State(distribution, name="uniform")
state2 = State(NormalDistribution(0, 2), name="normal")
silent = State(None, name="silent")
model.add_state(state)
model.add_state(state2)

model.add_transition(state, state, 0.4)
model.add_transition(state, state2, 0.4)
model.add_transition(state2, state2, 0.4)
model.add_transition(state2, state, 0.4)

model.add_transition(model.start, state, 0.5)
model.add_transition(model.start, state2, 0.5)
model.add_transition(state, model.end, 0.2)
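The snippet is cut off at this point; what follows is a minimal sketch of the
steps the docstring promises (finalize the model, then show training raising
the sequence probability), assuming the standard pomegranate API:

# hypothetical completion: bake the model, then demonstrate that fitting
# on a sequence increases that sequence's log probability
model.bake()

sequence = [0.5, 0.2, 0.8, 0.4]  # illustrative observations
print(model.log_probability(sequence))  # log probability before training
model.fit([sequence])
print(model.log_probability(sequence))  # higher after training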
Example #39
infinite model, with no extra work! This change is passed on to all the
algorithms.
'''

from pomegranate import *
from pomegranate import HiddenMarkovModel as Model
import itertools as it
import numpy as np

# Define the states
s1 = State( NormalDistribution( 5, 2 ), name="S1" )
s2 = State( NormalDistribution( 15, 2 ), name="S2" )
s3 = State( NormalDistribution( 25, 2 ), name="S3 ")

# Define the transitions
model = Model( "infinite" )
model.add_transition( model.start, s1, 0.7 )
model.add_transition( model.start, s2, 0.2 )
model.add_transition( model.start, s3, 0.1 )
model.add_transition( s1, s1, 0.6 )
model.add_transition( s1, s2, 0.1 )
model.add_transition( s1, s3, 0.3 )
model.add_transition( s2, s1, 0.4 )
model.add_transition( s2, s2, 0.4 )
model.add_transition( s2, s3, 0.2 )
model.add_transition( s3, s1, 0.05 )
model.add_transition( s3, s2, 0.15 )
model.add_transition( s3, s3, 0.8 )
model.bake()

sequence = [ 4.8, 5.6, 24.1, 25.8, 14.3, 26.5, 15.9, 5.5, 5.1 ]
Example #40
s2 = State(b, name="M2")
s22 = State(bb, name="M22")
s222 = State(bbb, name="M222")
s2222 = State(bbbb, name="M2222")
s22222 = State(bbbbb, name="M22222")
s222222 = State(bbbbbb, name="M222222")

s3 = State(c, name="M3")
s33 = State(cc, name="M33")
s333 = State(ccc, name="M333")
s3333 = State(cccc, name="M3333")
s33333 = State(ccccc, name="M33333")
s333333 = State(cccccc, name="M333333")

hmm = HiddenMarkovModel()
hmm.add_states(s1, s11, s111, s2, s22, s222, s3, s33, s333,
               s1111, s11111, s111111, s2222, s22222, s222222, s3333, s33333,
               s333333)

hmm.add_transition(hmm.start, s1, 1.)
hmm.add_transition(hmm.start, s11, 1.)
hmm.add_transition(hmm.start, s111, 1.)
hmm.add_transition(hmm.start, s2, 1.)
hmm.add_transition(hmm.start, s22, 1.)
hmm.add_transition(hmm.start, s222, 1.)
hmm.add_transition(hmm.start, s3, 1.)
hmm.add_transition(hmm.start, s33, 1.)
hmm.add_transition(hmm.start, s333, 1.)
hmm.add_transition(hmm.start, s1111, 1.)
hmm.add_transition(hmm.start, s11111, 1.)
Example #41
def prep(cluster_directory_root, depth, genefile):

    # load data
    gc, mt, track = load_data(None, 0)
    genes = load(open(genefile, 'r'))
    gc.data = gc.data.loc[genes, :]

    data = pd.concat([gc.data, mt.data])

    labels = data.index.values
    original_labels = labels
    pos_labels = labels + '+'
    neg_labels = labels + '-'
    pos_data = pd.DataFrame(data=data.as_matrix(), index=pos_labels,
                            columns=data.columns.values)
    neg_data = pd.DataFrame(data=(data.as_matrix() * -1), index=neg_labels,
                            columns=data.columns.values)

    data = pd.concat([data, pos_data, neg_data])

    print data.index.values

    generic_dir = cluster_directory_root.split('/') + (['*'] * depth)
    generic_dir = ('/').join(generic_dir)
    cluster_directories = \
        glob.glob(generic_dir)

    clusterings = {}
    clusterings_models = {}
    for cluster_dir in cluster_directories:
        try:
            clustering_id = cluster_dir.split('/')[-1:][0]
            # read final clusters
            clusters = {}
            filepath = '/'.join(cluster_dir.split('/') + ['assignments.txt'])
            lines = (open(filepath, 'r').read().splitlines())
            l = 0
            while l < len(lines):
                cluster_name = lines[l]
                cluster_members = lines[l + 1].split('\t')
                if cluster_members == ['']:
                    cluster_members = []
                clusters[cluster_name] = cluster_members
                l += 4

            clusterings[clustering_id] = clusters

            # load models
            models = {}
            model_files = glob.glob(cluster_dir + '/*')
            for model_file in model_files:
                try:
                    model_id = model_file.split('/')[-1:][0]
                    json = open(model_file).read()
                    models[model_id] = HiddenMarkovModel.from_json(json)
                    print 'model loaded from: ', model_file
                except:
                    pass
            clusterings_models[clustering_id] = models
        except:
            pass

    """
    background = set()
    for clustering in clusterings.itervalues():
        for cid, members in clustering.iteritems():
            background.update(set(members))
    """

    background = list(original_labels)
    # data = data.loc[background, :]

    # generate random clusterings of the same size k as our models
    random_clusterings = {}
    np.random.seed(int(time.time()))
    for clustering_id, clustering in clusterings.iteritems():
        source = np.array(background)
        random_assignments = np.random.choice(len(clustering), source.size)
        random_clusters = {}
        for i, cluster_id in enumerate(clustering.iterkeys()):
            random_clusters[cluster_id] = \
                source[np.where(random_assignments == i)[0]].tolist()
        random_clusterings[clustering_id] = random_clusters

    # generate random signed clustering
    random_signed_clusterings = {}
    pn = np.array(['+', '-'])
    for clustering_id, clustering in clusterings.iteritems():
        source = np.array(background)
        random_assignments = np.random.choice(len(clustering), source.size)
        random_clusters = {}
        for i, cluster_id in enumerate(clustering.iterkeys()):
            members = source[np.where(random_assignments == i)[0]].tolist()
            signed_members = []
            for member in members:
                sign = np.random.choice(pn, 1)[0]
                signed_members.append(member + sign)

            random_clusters[cluster_id] = signed_members
        random_signed_clusterings[clustering_id] = random_clusters

    return clusterings, random_clusterings, random_signed_clusterings,\
        clusterings_models, data, original_labels
Example #42
def init_lr_hmm(sequences, steps, states_per_step,
                force_end=False, model_id='Left-Right HMM', seed=None):
    """
    instantiate a left-right model with random parameters
    randomly generates start and transition matrices
    generates normal distributions for each state from a partition on sequences
    force_end: if True, require the sequence to end in the end state
    """

    # seed random number generator
    if seed is not None:
        np.random.seed(seed)

    model = HiddenMarkovModel(model_id)
    n_states = steps * states_per_step

    # make distributions from chronological subsets of timepoints
    step_size = int(math.ceil(sequences.shape[1] / float(n_states+1)))

    # generate states
    states = np.empty((steps, states_per_step), dtype=object)
    for i in range(steps):
        for j in range(states_per_step):
            temp_assignment = np.arange(step_size * i, step_size * (i+1))
            dist = \
                NormalDistribution.from_samples(sequences[:, temp_assignment])
            state_name = str(i) + '-' + str(j)
            states[i, j] = State(dist, name=str(state_name))

    # add states to model
    model.add_states(states.flatten().tolist())

    # make random transition from start -> step0
    trans = np.random.ranf(states_per_step)
    trans = trans / trans.sum()
    for j in range(states_per_step):
        model.add_transition(model.start, states[0, j], trans[j])

    # make random transition from step(i) -> step(i+1)
    for i in range(steps-1):
        for j in range(states_per_step):
            trans = np.random.ranf(states_per_step + 1)
            trans = trans / trans.sum()
            # self transition
            model.add_transition(states[i, j], states[i, j], trans[0])
            # out transition
            for x in range(states_per_step):
                model.add_transition(states[i, j], states[i + 1, x],
                                     trans[x + 1])

    # make random transition from stepn -> end
    if force_end:
        for j in range(states_per_step):
            trans = np.random.ranf(2)
            trans = trans / trans.sum()
            # self transition
            model.add_transition(states[(steps - 1), j],
                                 states[(steps - 1), j], trans[0])
            # end transition
            model.add_transition(states[(steps - 1), j], model.end, trans[1])

    model.bake()
    print 'Initialized Left-Right HMM:', model.name, '[', \
        steps, states_per_step, ']'
    return model
Example #43
def init_cycle_hmm(sequences, steps, states_per_step, model_id):
    """
    instantiate a cyclic model with random parameters
    randomly generates start and transition matrices
    generates normal distributions for each state from a partition on sequences
    """
    model = HiddenMarkovModel(model_id)
    n_states = steps * states_per_step

    # make distributions from chronological subsets of timepoints
    step_size = int(math.ceil(sequences.shape[1] / float(n_states+1)))

    # generate states
    states = np.empty((steps, states_per_step), dtype=object)
    for i in range(steps):
        for j in range(states_per_step):
            temp_assignment = np.arange(step_size * i, step_size * (i+1))
            dist = \
                NormalDistribution.from_samples(sequences[:, temp_assignment])
            state_name = str(i) + '-' + str(j)
            states[i, j] = State(dist, name=str(state_name))

    # add states to model
    model.add_states(states.flatten().tolist())

    # make random transition from start -> step0
    trans = np.random.ranf(n_states)
    trans = trans / trans.sum()
    for i, state in enumerate(states.flatten().tolist()):
        model.add_transition(model.start, state, trans[i])

    # make random transition from step(i) -> step(i+1)
    for i in range(steps-1):
        for j in range(states_per_step):
            trans = np.random.ranf(states_per_step + 1)
            trans = trans / trans.sum()
            # self transition
            model.add_transition(states[i, j], states[i, j], trans[0])
            # out transition
            for x in range(states_per_step):
                model.add_transition(states[i, j], states[i + 1, x],
                                     trans[x + 1])

    # make random transition from stepn -> step0
    for j in range(states_per_step):
        trans = np.random.ranf(states_per_step + 1)
        trans = trans / trans.sum()
        # self transition
        model.add_transition(states[(steps - 1), j], states[(steps - 1), j],
                             trans[0])
        # out transition
        for x in range(states_per_step):
            model.add_transition(states[(steps - 1), j], states[0, x],
                                 trans[x + 1])
    model.bake()
    print 'Initialized Cyclic State HMM:', '[', \
        steps, states_per_step, ']'
    return model
Example #44
def run(cluster_directory_root, depth, plottype):

    # load data
    gc, mt, track = load_data(None, 0)
    data = pd.concat([gc.data, mt.data])

    labels = data.index.values
    pos_labels = labels + '+'
    neg_labels = labels + '-'
    pos_data = pd.DataFrame(data=data.as_matrix(), index=pos_labels,
                            columns=data.columns.values)
    neg_data = pd.DataFrame(data=data.as_matrix(), index=neg_labels,
                            columns=data.columns.values)

    data = pd.concat([data, pos_data, neg_data])

    generic_dir = cluster_directory_root.split('/') + (['*'] * depth)
    generic_dir = ('/').join(generic_dir)
    cluster_directories = \
        glob.glob(generic_dir)

    clusterings = {}
    clusterings_models = {}
    for cluster_dir in cluster_directories:
        try:
            clustering_id = cluster_dir.split('/')[-1:][0]
            # read final clusters
            clusters = {}
            filepath = '/'.join(cluster_dir.split('/') + ['assignments.txt'])
            lines = (open(filepath, 'r').read().splitlines())
            l = 0
            while l < len(lines):
                cluster_name = lines[l]
                cluster_members = lines[l + 1].split('\t')
                clusters[cluster_name] = cluster_members
                l += 4

            clusterings[clustering_id] = clusters

            # load models
            models = {}
            model_files = glob.glob(cluster_dir + '/*')
            for model_file in model_files:
                try:
                    model_id = model_file.split('/')[-1:][0]
                    json = open(model_file).read()
                    models[model_id] = HiddenMarkovModel.from_json(json)
                    print 'model loaded from: ', model_file
                except:
                    pass
            clusterings_models[clustering_id] = models
        except:
            pass

    background = set()
    for clustering in clusterings.itervalues():
        for cid, members in clustering.iteritems():
            background.update(set(members))

    background = list(background)
    # data = data.loc[background, :]

    # generate random clusterings of the same size k as our models
    random_clusterings = {}

    for clustering_id, clustering in clusterings.iteritems():
        source = np.array(background)
        random_assignments = np.random.choice(len(clustering), source.size)
        random_clusters = {}
        for i, cluster_id in enumerate(clustering.iterkeys()):
            random_clusters[cluster_id] = \
                source[np.where(random_assignments == i)[0]].tolist()
        random_clusterings[clustering_id] = random_clusters

    # run dunn and davies_bouldin for clusterings and random permutations
    rand_dunn = report_dunn(random_clusterings, clusterings_models, data)
    savename = cluster_directory_root + 'dunn_index_random'
    dump(rand_dunn, open(savename, 'w'))

    rand_davies = report_davies_bouldin(random_clusterings, clusterings_models,
                                        data)
    savename = cluster_directory_root + 'davies_bouldin_index_random'
    dump(rand_davies, open(savename, 'w'))

    if plottype == 'none':
        pass

    elif plottype == 'kn_grid':

        rand_dunn_df = pd.DataFrame()
        rand_davies_df = pd.DataFrame()

        for clustering_id, clustering in clusterings.iteritems():
            cid = clustering_id.replace('k', '_'). \
                replace('n', '_').split('_')
            m = cid[0]
            k = int(cid[1])
            n = int(cid[2])

            rand_dunn_df.loc[k, n] = rand_dunn[clustering_id]
            rand_davies_df.loc[k, n] = rand_davies[clustering_id]

        rand_davies_df = rand_davies_df.fillna(0)
        rand_dunn_df = rand_dunn_df.fillna(0)

        rand_dunn_df = rand_dunn_df.sort_index().sort_index(axis=1)
        rand_davies_df = rand_davies_df.sort_index().sort_index(axis=1)

        odir = cluster_directory_root
        title = 'RANDOM_' + str(m) + ': Dunn Index'
        HeatMap(rand_dunn_df.as_matrix(), rand_dunn_df.index.values,
                rand_dunn_df.columns.values,
                title=title, odir=odir)

        odir = cluster_directory_root
        title = 'RANDOM_' + str(m) + ': Davies-Bouldin Index'
        HeatMap(rand_davies_df.as_matrix(), rand_davies_df.index.values,
                rand_davies_df.columns.values,
                title=title, odir=odir)

    elif plottype == 'row':
        rand_dunn_df = pd.Series()
        rand_davies_df = pd.Series()

        for clustering_id, clustering in clusterings.iteritems():
            rand_dunn_df.loc[clustering_id] = rand_dunn[clustering_id]
            rand_davies_df.loc[clustering_id] = rand_davies[clustering_id]

        rand_davies_df = rand_davies_df.fillna(0)
        rand_dunn_df = rand_dunn_df.fillna(0)

        rand_dunn_df = rand_dunn_df.sort_index()
        rand_davies_df = rand_davies_df.sort_index()

        odir = cluster_directory_root
        title = 'RANDOM' + ': Dunn Index'
        HeatMap(rand_dunn_df.as_matrix().reshape(-1, 1),
                rand_dunn_df.index.values,
                [' '], title=title, odir=odir, cmin=0, cmax=.5)

        odir = cluster_directory_root
        title = 'RANDOM' + ': Davies-Bouldin Index'
        HeatMap(rand_davies_df.as_matrix().reshape(-1, 1),
                rand_davies_df.index.values,
                [' '], title=title, odir=odir, cmin=5, cmax=10)

    return clusterings, clusterings_models
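
# Example invocation (hypothetical layout: clustering directories sit
# `depth` levels below the root, each holding assignments.txt plus the
# per-cluster model JSON files):
#
#     clusterings, models = run('results/clusterings/', 1, 'kn_grid')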
Beispiel #45
0
# the condition guarding these branches is missing from the original
# snippet; assuming labels `y` select the three classes when available,
# with fixed slices as the fallback
if y is not None:
    X_1 = X[y == 0]
    X_2 = X[y == 1]
    X_3 = X[y == 2]
else:
    X_1 = X[2000:4000]
    X_2 = X[400:800]
    X_3 = X[7000:8000]
a = MultivariateGaussianDistribution.from_samples(X_1)
b = MultivariateGaussianDistribution.from_samples(X_2)
c = MultivariateGaussianDistribution.from_samples(X_3)
s1 = State(a, name="M1")
s2 = State(b, name="M2")
s3 = State(c, name="M3")

hmm = HiddenMarkovModel()
hmm.add_states(s1, s2, s3)
hmm.add_transition(hmm.start, s1, 0.34)
hmm.add_transition(hmm.start, s3, 0.33)
hmm.add_transition(hmm.start, s2, 0.33)

hmm.add_transition(s1, s1, 0.9)
hmm.add_transition(s1, s2, 0.05)
hmm.add_transition(s1, s3, 0.05)

hmm.add_transition(s2, s1, 0.05)
hmm.add_transition(s2, s3, 0.05)
hmm.add_transition(s2, s2, 0.9)

hmm.add_transition(s3, s3, 0.9)
hmm.add_transition(s3, s2, 0.05)
# the snippet is truncated here; the transition below completes s3's
# outgoing mass to 1.0 so the model can be baked
hmm.add_transition(s3, s1, 0.05)
hmm.bake()
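
# A short usage sketch: score an illustrative slice of X and recover the
# most likely state path with Viterbi.
logp, path = hmm.viterbi(X[:50])
print 'log-likelihood:', logp
print 'states:', [s.name for _, s in path if s.name in ('M1', 'M2', 'M3')]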
Beispiel #46
0
cc = MultivariateGaussianDistribution.from_samples(X_33)
ccc = MultivariateGaussianDistribution.from_samples(X_333)

s1 = State(a, name="M1")
s11 = State(aa, name="M11")
s111 = State(aaa, name="M111")

s2 = State(b, name="M2")
s22 = State(bb, name="M22")
s222 = State(bbb, name="M222")

s3 = State(c, name="M3")
s33 = State(cc, name="M33")
s333 = State(ccc, name="M333")

hmm = HiddenMarkovModel()
hmm.add_states(s1, s11, s111, s2, s22, s222, s3, s33, s333)

hmm.add_transition(hmm.start, s1, 0.12)
hmm.add_transition(hmm.start, s11, 0.11)
hmm.add_transition(hmm.start, s111, 0.11)
hmm.add_transition(hmm.start, s2, 0.11)
hmm.add_transition(hmm.start, s22, 0.11)
hmm.add_transition(hmm.start, s222, 0.11)
hmm.add_transition(hmm.start, s3, 0.11)
hmm.add_transition(hmm.start, s33, 0.11)
hmm.add_transition(hmm.start, s333, 0.11)


hmm.add_transition(s1, s1, 0.92)
hmm.add_transition(s1, s11, 0.01)
def gen_cluster_plots(cluster_directory_root, depth):
    # load data
    gc, mt, track = load_data(None, 0)
    data = pd.concat([gc.data, mt.data])

    labels = data.index.values
    pos_labels = labels + '+'
    neg_labels = labels + '-'
    pos_data = pd.DataFrame(data=data.as_matrix(), index=pos_labels,
                            columns=data.columns.values)
    neg_data = pd.DataFrame(data=data.as_matrix(), index=neg_labels,
                            columns=data.columns.values)

    data = pd.concat([data, pos_data, neg_data])

    generic_dir = '/'.join(cluster_directory_root.split('/') + ['*'] * depth)
    cluster_directories = glob.glob(generic_dir)

    clusterings = {}
    clusterings_models = {}
    for cluster_dir in cluster_directories:
        try:
            clustering_id = cluster_dir.split('/')[-1]
            # read final clusters; each record in assignments.txt spans
            # four lines, with the cluster name followed by its
            # tab-separated members
            clusters = {}
            filepath = '/'.join(cluster_dir.split('/') + ['assignments.txt'])
            lines = open(filepath, 'r').read().splitlines()
            l = 0
            while l < len(lines):
                cluster_name = lines[l]
                cluster_members = lines[l + 1].split('\t')
                clusters[cluster_name] = cluster_members
                l += 4

            clusterings[clustering_id] = clusters

            # load models, skipping files that are not valid model JSON
            models = {}
            model_files = glob.glob(cluster_dir + '/*')
            for model_file in model_files:
                try:
                    model_id = model_file.split('/')[-1]
                    model_json = open(model_file).read()
                    models[model_id] = HiddenMarkovModel.from_json(model_json)
                    print 'model loaded from: ', model_file
                except Exception:
                    pass
            clusterings_models[clustering_id] = models
        except Exception:
            # skip directories without a readable assignments.txt
            pass

    background = set()
    for clustering in clusterings.itervalues():
        for members in clustering.itervalues():
            background.update(members)

    background = list(background)
    # data = data.loc[background, :]

    # generate line, autocorrelation, and lag plots for each cluster
    for clustering_id, clustering in clusterings.iteritems():
        for model_id, members in clustering.iteritems():
            sequences = data.loc[members, :]
            pltdir = '/'.join(cluster_directory_root.split('/') + ['plots'])

            # make line plots directory
            if not os.path.isdir(pltdir + '/line'):
                print "Creating directory...", pltdir + '/line'
                os.makedirs(pltdir + '/line')

            savename = pltdir + '/line/' + model_id + '_lineplot'

            plt_title = model_id + ' Line Plot'
            ax = sequences.T.plot(legend=False, rot=2)
            ax.set_title(plt_title)
            ax.set_xlabel('Timepoint')
            ax.set_ylabel('Normalized Expression')

            print 'Saving: ', savename
            fig = ax.get_figure()
            fig.savefig(savename)
            fig.clear()

            # make autocorr plots directory
            if not os.path.isdir(pltdir + '/autocorr'):
                print "Creating directory...", pltdir + '/autocorr'
                os.makedirs(pltdir + '/autocorr')

            savename = pltdir + '/autocorr/' + model_id + '_autocorr'

            plt_title = model_id + ' Autocorr Plot'
            for seq in sequences.index:
                ax = autocorrelation_plot(sequences.loc[seq])
            ax.set_title(plt_title)

            print 'Saving: ', savename
            fig = ax.get_figure()
            fig.savefig(savename)
            fig.clear()

            # make lag plots directory
            if not os.path.isdir(pltdir + '/lag'):
                print "Creating directory...", pltdir + '/lag'
                os.makedirs(pltdir + '/lag')

            from matplotlib.pyplot import get_cmap
            NUM_COLORS = len(members)
            cm = get_cmap('gist_rainbow')
            colors = [cm(1. * i / NUM_COLORS) for i in range(NUM_COLORS)]

            savename = pltdir + '/lag/' + model_id + '_lagplot'

            plt_title = model_id + ' Lag Plot'
            for i, seq in enumerate(sequences.index):
                ax = lag_plot(sequences.loc[seq], c=colors[i])
            ax.set_title(plt_title)

            print 'Saving: ', savename
            fig = ax.get_figure()
            fig.savefig(savename)
            fig.clear()

            """
def gen_model(sequences, labels, algorithm, initialization, restarts, n, k,
              out_dir, base_id, tied):

    if initialization == 'rand':
        init_method = init_gaussian_hmm
        init_args = {'n_states': n[0]}
    elif initialization == 'lr':
        init_method = init_lr_hmm
        s = n[0]
        sps = n[1]
        init_args = {'steps': s, 'state_per_step': sps, 'force_end': True}
    elif initialization == 'cycle':
        init_method = init_cycle_hmm
        s = n[0]
        sps = n[1]
        init_args = {'steps': s, 'state_per_step': sps}

    best = None
    best_score = float('-inf')

    # genes/metabolites will be assigned to noise model if other models
    # fail to model it better
    noise_dist = [NormalDistribution(0, 1)]
    noise_trans = np.array([[1]])
    starts = np.array([1])
    noise = HiddenMarkovModel.from_matrix(noise_trans, noise_dist, starts,
                                          name='noise')
    noise.freeze_distributions()

    np.random.seed(int(time.time()))
    randassigns = []
    for x in range(restarts):
        randassigns.append(np.random.randint(k, size=labels.size))

    # debug: print the random assignment used by each restart
    for x in range(restarts):
        randassign = randassigns[x]
        assignments = {}
        for i in range(k):
            model_id = str(i)
            assignments[model_id] = \
                np.where(randassign == i)[0].tolist()
        print assignments
    # gen model for number of restarts
    for x in range(restarts):
        try:
            collection_id = base_id + '_' + str(x)
            odir = '/'.join(out_dir.split('/') + [collection_id])

            print 'Learning: ', collection_id

            # generate random initial assignments
            # initialize models on random assignments
            randassign = randassigns[x]
            assignments = {}
            models = {}
            for i in range(k):
                model_id = str(i)
                assignments[model_id] = \
                    np.where(randassign == i)[0].tolist()
                in_model = assignments[model_id]
                models[model_id] = \
                    init_method(sequences[in_model, :], model_id=model_id,
                                **init_args)

            # add noise model
            models['noise'] = noise
            assignments['noise'] = []

            # all are un-fixed
            fixed = {}
            for model_id, model in models.iteritems():
                fixed[model_id] = []

            # perform clustering
            models, assignments, c = cluster(models=models,
                                             sequences=sequences,
                                             assignments=assignments,
                                             algorithm=algorithm,
                                             fixed=fixed, tied=tied,
                                             labels=labels,
                                             odir=odir)

            score = total_log_prob(models, sequences, assignments)
            if best_score < score:
                best_score = score
                best = collection_id
                bestfile = '/'.join(out_dir.split('/') + ['best'])
                with open(bestfile, 'w') as f:
                    print >> f, collection_id

        except Exception:
            error_file = odir.split('/') + ['errors.txt']
            error_file = '/'.join(error_file)
            f = open(error_file, 'a')
            print >> f, 'error computing parameters for: ', collection_id
            print >> f, "Unexpected error:", sys.exc_info()[0]
            f.close()

    return best
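
# Example invocation (hypothetical arguments; `n` packs the size parameters
# of the chosen initialization: (n_states,) for 'rand', or
# (steps, states_per_step) for 'lr' and 'cycle'):
#
#     best = gen_model(sequences, labels, 'baum-welch', 'rand', restarts=3,
#                      n=(10,), k=5, out_dir='results', base_id='run0',
#                      tied=False)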
Beispiel #49
0
def init_gaussian_hmm(sequences, n_states, model_id, seed=None):
    """
    insantiate a model with random parameters
    randomly generates start and transition matrices
    generates nomal distrobutions for each state from partition on sequences
    """
    """
    # make random transition probability matrix
    # scale each row to sum to 1
    trans = np.random.ranf((n_states, n_states))
    for i in range(n_states):
        trans[i, :] = trans[i, :] / trans[i, :].sum()

    # make distributions from random subsets of timepoints
    x = int(math.ceil(sequences.shape[1] / float(n_states)))
    # x = math.min(3, x)

    dists = []
    for i in range(n_states):
        temp_assignment = np.random.choice(sequences.shape[1], x)
        dists.append(NormalDistribution.from_samples
                     (sequences[:, temp_assignment]))

    # random start probabilities
    # scale to sum to 1
    starts = np.random.ranf(n_states)
    starts = starts / sum(starts)

    model = HiddenMarkovModel.from_matrix(trans, dists, starts, name=model_id)
    """
    # seed random number generator
    if seed is not None:
        np.random.seed(seed)

    model = HiddenMarkovModel(model_id)

    # make states with distributions from random subsets of timepoints
    x = int(math.ceil(sequences.shape[1] / float(n_states)))
    states = []
    for i in range(n_states):
        temp_assignment = np.random.choice(sequences.shape[1], x)
        dist = \
            NormalDistribution.from_samples(sequences[:, temp_assignment])
        states.append(State(dist, name=str(i)))

    model.add_states(states)

    # add random start probabilities
    start_probs = np.random.ranf(n_states)
    start_probs = start_probs / start_probs.sum()
    for i, state in enumerate(states):
        model.add_transition(model.start, state, start_probs[i])

    # add random transition probabilites out of each state
    for state1 in states:
        transitions = np.random.ranf(n_states)
        transitions = transitions / transitions.sum()
        for i, state2 in enumerate(states):
            model.add_transition(state1, state2, transitions[i])

    model.bake()
    print 'Initialized HMM: ', model.name
    return model
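
# A minimal usage sketch (illustrative sizes; init_gaussian_hmm itself
# already requires numpy and math):
seqs = np.random.randn(30, 20)    # 30 sequences x 20 timepoints each
demo = init_gaussian_hmm(seqs, n_states=4, model_id='demo')
demo.fit(seqs)                    # refine parameters with Baum-Welch
print demo.log_probability(seqs[0])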

"""
Example rainy-sunny HMM using yahmm. Example drawn from the wikipedia HMM
article: http://en.wikipedia.org/wiki/Hidden_Markov_model describing what
Bob likes to do on rainy or sunny days.
"""

from pomegranate import *
from pomegranate import HiddenMarkovModel as Model
import random
import math

random.seed(0)

model = Model( name="Rainy-Sunny" )

# Emission probabilities
rainy = State( DiscreteDistribution({ 'walk': 0.1, 'shop': 0.4, 'clean': 0.5 }), name='Rainy' )
sunny = State( DiscreteDistribution({ 'walk': 0.6, 'shop': 0.3, 'clean': 0.1 }), name='Sunny' )

model.add_transition( model.start, rainy, 0.6 )
model.add_transition( model.start, sunny, 0.4 )

# Transition matrix, with 0.05 subtracted from each probability to add to
# the probability of exiting the hmm
model.add_transition( rainy, rainy, 0.65 )
model.add_transition( rainy, sunny, 0.25 )
model.add_transition( sunny, rainy, 0.35 )
model.add_transition( sunny, sunny, 0.55 )

# the original snippet ends above; the 0.1 exit probability per state and
# the bake() call are added here to complete what the comment describes
model.add_transition( rainy, model.end, 0.1 )
model.add_transition( sunny, model.end, 0.1 )
model.bake()
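
# A short usage sketch: score an observation sequence and decode the most
# likely weather with Viterbi (the observations are illustrative).
observations = [ 'walk', 'shop', 'clean' ]
logp, path = model.viterbi( observations )
print 'log P(walk, shop, clean):', logp
print 'weather:', ' -> '.join( state.name for i, state in path
                               if state.name in ('Rainy', 'Sunny') )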