Ejemplo n.º 1
0
 def load(self, file_path):
     """Restore the transformer and HMM from a pickled file.

     Older pickles stored the transformer under the 'lda' key, so that
     key is used as a fallback when 'transformer' is absent.
     """
     with open(file_path, 'rb') as handle:
         payload = pickle.load(handle)
     # Prefer the modern key; fall back to the legacy 'lda' entry.
     if 'transformer' in payload:
         self.transformer = payload['transformer']
     else:
         self.transformer = payload['lda']
     self.hmm = HiddenMarkovModel.from_json(payload['hmm'])
Ejemplo n.º 2
0
def _read_cluster_assignments(filepath):
    """Parse an assignments file into {cluster_name: [members]}.

    The file is laid out in groups of four lines: cluster name on the
    first, tab-separated members on the second; the remaining two lines
    of each group are ignored.
    """
    clusters = {}
    with open(filepath, 'r') as fh:
        lines = fh.read().splitlines()
    for start in range(0, len(lines), 4):
        cluster_name = lines[start]
        cluster_members = lines[start + 1].split('\t')
        clusters[cluster_name] = cluster_members
    return clusters


def init(base_dir):
    """Load initial/final clusterings (and their HMMs) from *base_dir*.

    Each subdirectory of *base_dir* is one clustering, identified by its
    directory name, containing 'init_assignments.txt' and
    'assignments.txt' plus serialized model files.

    Returns:
        (initial_clusterings, clusterings) — two dicts keyed by
        clustering id mapping to {cluster_name: [members]}.  The loaded
        models are collected but, matching the original interface, not
        returned.
    """
    print(base_dir)
    cluster_directories = glob.glob(base_dir + '/*')

    initial_clusterings = {}
    clusterings = {}
    clusterings_models = {}
    for cluster_dir in cluster_directories:
        try:
            clustering_id = cluster_dir.split('/')[-1]

            initial_clusterings[clustering_id] = _read_cluster_assignments(
                '/'.join(cluster_dir.split('/') + ['init_assignments.txt']))

            clusterings[clustering_id] = _read_cluster_assignments(
                '/'.join(cluster_dir.split('/') + ['assignments.txt']))

            # Best-effort model loading: every file in the directory is
            # tried; non-model files simply fail to parse and are skipped.
            models = {}
            for model_file in glob.glob(cluster_dir + '/*'):
                try:
                    model_id = model_file.split('/')[-1]
                    with open(model_file) as fh:
                        models[model_id] = HiddenMarkovModel.from_json(fh.read())
                    print('model loaded from: ', model_file)
                except Exception:
                    pass
            clusterings_models[clustering_id] = models
        except Exception:
            # A malformed cluster directory must not abort the others.
            pass
    return initial_clusterings, clusterings
Ejemplo n.º 3
0
def lambda_handler(event, context):
    """AWS Lambda entry point: POS-tag the sentence in the request body.

    Fetches a serialized HMM from S3, rebuilds it, and decodes the
    whitespace-split sentence with it.
    """
    # Pull the serialized model from S3 and rebuild the HMM.
    s3_response = s3.get_object(Bucket='bhargav-ml-trained-models', Key='pos_model.txt')
    raw_model = s3_response['Body'].read().decode()
    model = HiddenMarkovModel.from_json(json.loads(raw_model))
    # The request body carries a space-separated sentence to tag.
    tokens = event['body'].split(' ')
    decoded = simplify_decoding(tokens, model)
    response = {
        'statusCode': 200,
        'headers': {'Content-Type': 'text/plain', 'Access-Control-Allow-Origin': '*'},
        'body': decoded,
    }
    return response
Ejemplo n.º 4
0
    def load(cls, path):
        """
        Load a model previously saved as a pickle.

        Arguments:
            path: path to the saved .pkl file

        Returns:
            Reconstructed instance with its HMM restored and marked fitted
        """
        with open(path, "rb") as fh:
            payload = pickle.load(fh)
        # Pull the serialized HMM out so the remaining keys can be fed
        # straight to the constructor as keyword arguments.
        hmm_json = payload.pop('model')
        model = cls(**payload)
        model.model = HiddenMarkovModel.from_json(hmm_json)
        model.fitted = True
        return model
Ejemplo n.º 5
0
    def get_vntr_matcher_hmm(self, read_length):
        """Try to load trained HMM for this VNTR.

        If no trained HMM is stored on disk, build one and cache it as
        JSON for later usage.
        """
        logging.info('Using read length %s' % read_length)
        copies = self.get_copies_for_hmm(read_length)

        cache_name = str(self.reference_vntr.id) + '_' + str(read_length) + '.json'
        cache_path = settings.TRAINED_HMMS_DIR + cache_name
        if settings.USE_TRAINED_HMMS and os.path.isfile(cache_path):
            # NOTE(review): from_json is handed a file *path*, not a JSON
            # string — confirm the Model API accepts filenames.
            cached = Model()
            cached = cached.from_json(cache_path)
            return cached

        # No cached model: build a fresh matcher and persist it.
        matcher = self.build_vntr_matcher_hmm(copies, read_length)

        serialized = matcher.to_json()
        with open(cache_path, 'w') as cache_file:
            cache_file.write(serialized)
        return matcher
Ejemplo n.º 6
0
        subseqs.append(seq[cut[0]:cut[1]])
    return subseqs


def predict_path(model, seq):
    """Decode *seq* with Viterbi and return the visited state names."""
    result = model.viterbi(seq)
    best_path = result[1]
    names = []
    for step in best_path:
        names.append(step[1].name)
    return names


# Load the serialized base models (JSON produced elsewhere); paths are
# relative to the current working directory.
with open('coding_model_base_poly.json') as base_model_file:
    coding_model_json = base_model_file.read()

# NOTE(review): the handle is named promoter_model_file but reads the
# UTR model file — presumably one model covers both; confirm.
with open('utr_model_base.json') as promoter_model_file:
    promoter_utr_model_json = promoter_model_file.read()

# Rebuild the HMMs once at import time so the predict_* helpers below
# can reuse them.
coding_model = HiddenMarkovModel.from_json(coding_model_json)
promoter_utr_model = HiddenMarkovModel.from_json(promoter_utr_model_json)


def predict_all_old(seq, string):
    """Viterbi-decode *seq* with the coding model and locate gene cuts.

    NOTE(review): this function appears truncated by the extraction —
    ext_subseq is computed but never used or returned here.
    """
    path_names = predict_path(coding_model, seq)

    # Debug dump: pair each decoded state name with the input character
    # one position ahead, plus an offset from the end of the path.
    print([(string[i + 1], name, i - len(path_names) + 1)
           for i, name in enumerate(path_names) if i + 1 < len(string)])

    # Gene boundaries: starts at 'start zone7', ends at any stop-zone state.
    starts = find_gene_cut_index(path_names, ['start zone7'])
    ends = find_gene_cut_index(
        path_names, ['stop zone taa9', 'stop zone tag9', 'stop zone tga9'])

    ext_subseq = find_intercoding_region(starts, ends, seq)
Ejemplo n.º 7
0
from nltk.stem import WordNetLemmatizer, SnowballStemmer
# Stemmers shared by the preprocessing helpers defined below.
snowball_stemmer = SnowballStemmer("english")
porter_stemmer = PorterStemmer()

# Import LDA corpus data
# Only the first 300k headlines are kept to bound memory/training time.
lda_data = pd.read_csv('abcnews-date-text.csv')
lda_data_text = lda_data[:300000][['headline_text']]
# We only need the Headlines text column from the data
lda_data_text['index'] = lda_data_text.index
documents = lda_data_text

# Import HMM Tagger corpus data and model
hmm_data = Dataset("tags-universal.txt",
                   "brown-universal.txt",
                   train_test_split=0.8)
# NOTE(review): hard-coded absolute Windows path — fails on any other
# machine; consider making it configurable.
hmm_model = HiddenMarkovModel.from_json(
    r'C:\Users\Marco\Desktop\Gits\eg-texttools\hmm-models\model.json')

# Output dictionary. To export data to visualizer.
output_dict = {}


def lemmatize_stemming(text):
    """Lemmatize *text* as a verb, then apply the Snowball stemmer."""
    lemma = WordNetLemmatizer().lemmatize(text, pos='v')
    return snowball_stemmer.stem(lemma)


# Tokenize and lemmatize
def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(
                token) > 3:
Ejemplo n.º 8
0
def filter_repeat(genes):
    """Return *genes* with duplicate names removed, keeping first occurrences.

    Arguments:
        genes: iterable of dicts, each carrying a 'name' key.

    Returns:
        List of the first gene seen for each distinct name, in input order.
    """
    seen = set()  # set membership is O(1) vs. O(n) for the original list scan
    filtered = []
    for gene in genes:
        name = gene['name']
        if name not in seen:
            seen.add(name)
            filtered.append(gene)
    return filtered


if __name__ == '__main__':
    # Rebuild the coding-region HMM from its serialized JSON form.
    with open('coding_model_base.json') as base_model_file:
        model_json = base_model_file.read()

    hmmodel = HiddenMarkovModel.from_json(model_json)
    # NOTE(review): hard-coded local data path — only works on this machine.
    genes = extract(folder_path='/run/media/jose/BE96A68C96A6452D/Asi/Data/', lookfor='CDS', before=500, after=30)
    # State names accepted during evaluation.
    # NOTE(review): 'start one15' looks like a typo for 'start zone15' —
    # confirm against the model's actual state names.
    valid_st = ['start zone8', 'start zone9', 'start zone10', 'start zone11', 'start zone12',
                'start zone13', 'start zone14', 'start one15', 'start zone16',
                'stop zone0', 'stop zone1', 'stop zone2', 'stop zone3', 'stop zone4', 'stop zone5',
                'coding',
                'acceptor016', 'acceptor017', 'acceptor018',
                'acceptor116', 'acceptor117', 'acceptor118', 'acceptor119', 'acceptor120',
                'acceptor216', 'acceptor217', 'acceptor218', 'acceptor219',
                'donor00', 'donor01', 'donor02',
                'donor10', 'donor11', 'donor12', 'donor13',
                'donor20', 'donor21', 'donor22', 'donor23', 'donor24',
                ]

    unique_genes = filter_repeat(genes)
    # NOTE(review): under Python 3 map() is lazy, so nothing runs unless
    # `predicted` is consumed; `test(hmmodel, valid_st)` is presumably a
    # curried callable — confirm.
    predicted = map(test(hmmodel, valid_st), unique_genes)
Ejemplo n.º 9
0
 def load_model(self, path, **kwargs):
     """Deserialize an HMM from the JSON file at *path* into self.model.

     Extra keyword arguments are accepted for interface compatibility
     but ignored.
     """
     with open(path, 'r') as model_file:
         self.model = HiddenMarkovModel.from_json(json.load(model_file))
 def __init__(self, n_trials=3, leave_one_out=1):
     """Set up gait-phase decoder state, load or train the HMM, start ROS I/O.

     Arguments:
         n_trials: number of recorded trials available per patient.
         leave_one_out: index of the trial held out for evaluation.

     NOTE(review): Python 2 code (see the bare `print` statement below).
     """
     self.patient = rospy.get_param("gait_phase_det/patient")
     self.verbose = rospy.get_param("gait_phase_det/verbose")
     self.n_trials = n_trials
     self.n_features = 2      # Raw data and 1st-derivative
     self.leave_one_out = leave_one_out
     self.rec_data = 0.0       # Number of recorded IMU data
     self.proc_data = 0.0      # Number of extracted features
     self.win_size = 3
     self.raw_win = [None] * self.win_size
     # self.fder_win = [0] * self.win_size
     self.ff = [[] for x in range(self.n_trials)]      # Training and test dataset
     self.labels = [[] for x in range(self.n_trials)]  # Reference labels from local data
     self.first_eval = True
     self.model_loaded = False
     # Select decoding and IMU-callback implementations via dispatch tables.
     algorithm = rospy.get_param("gait_phase_det/algorithm")
     rospy.loginfo('Decoding algorithm: {}'.format(algorithm))
     if algorithm not in DECODER_ALGORITHMS:
         raise ValueError("Unknown decoder {!r}".format(algorithm))
     self.decode = {
         "fov": self._run_fov,
         "bvsw": self._run_bvsw
     }[algorithm]
     self.imu_callback = {
         "fov": self._fov_callback,
         "bvsw": self._bvsw_callback
     }[algorithm]
     """HMM variables"""
     ''' State list:
         s1: Heel Strike (HS)
         s2: Flat Foot   (FF)
         s3: Heel Off    (HO)
         s4: Swing Phase (SP)'''
     self.model_name = "Gait"
     self.has_model = False
     self.must_train = False
     self.states = ['s1', 's2', 's3', 's4']
     self.n_states = len(self.states)
     self.state2phase = {"s1": "hs", "s2": "ff", "s3": "ho", "s4": "sp"}
     self.train_data = []
     self.mgds = {}
     self.dis_means = [[] for x in range(self.n_states)]
     self.dis_covars = [[] for x in range(self.n_states)]
     # Uniform initial state distribution.
     self.start_prob = [1.0/self.n_states]*self.n_states
     self.trans_mat = np.array([(0.9, 0.1, 0, 0), (0, 0.9, 0.1, 0), (0, 0, 0.9, 0.1), (0.1, 0, 0, 0.9)])    # Left-right model
     # self.trans_mat = np.array([0.8, 0.1, 0, 0.1], [0.1, 0.8, 0.1, 0], [0, 0.1, 0.8, 0.1], [0.1, 0, 0.1, 0.8])    # Left-right-left model
     self.log_startprob = []
     self.log_transmat = np.empty((self.n_states, self.n_states))
     self.max_win_len = 11       # ms (120 ms: mean IC duration for healthy subjects walking at comfortable speed)
     # Viterbi working buffers sized to the bounded decoding window.
     self.viterbi_path = np.empty((self.max_win_len+1, self.n_states))
     self.backtrack = [[None for x in range(self.n_states)] for y in range(self.max_win_len+1)]
     self.global_path = []
     self.work_buffer = np.empty(self.n_states)
     self.boundary = 1
     self.buff_len = 0
     # Map each state name to its row/column index.
     self.states_pos = {}
     for i in range(len(self.states)):
         self.states_pos[self.states[i]] = i
     self.last_state = -1
     self.curr_state = -1
     self.conv_point = 0
     self.conv_found = False
     self.smp_freq = 100.0   # Hz
     self.fp_thresh = 1/self.smp_freq*4    # Threshold corresponds to 8 samples
     self.time_passed = 0.0
     self.obs = [[None for x in range(self.n_features)] for y in range(self.max_win_len)]
     self.model = HMM(name=self.model_name)
     """ROS init"""
     rospy.init_node('real_time_HMM', anonymous=True)
     rospack = rospkg.RosPack()
     self.packpath = rospack.get_path('hmm_gait_phase_classifier')
     self.init_subs()
     self.init_pubs()
     """HMM-training (if no model exists)"""
     try:
         '''HMM-model loading'''
         with open(self.packpath+'/log/HMM_models/'+self.patient+'.txt') as infile:
             json_model = json.load(infile)
             self.model = HMM.from_json(json_model)
             rospy.logwarn(self.patient + "'s HMM model was loaded.")
             self.has_model = True
     except IOError:
         # No serialized model on disk: pick a training data source.
         if os.path.isfile(self.packpath + "/log/mat_files/" + self.patient + "_proc_data1.mat"):
             """Training with data collected with FSR-based reference system"""
             self.data_ext = 'mat'
             self.must_train = True
         elif os.path.isfile(self.packpath + "/log/IMU_data/" + self.patient + "_labels.csv"):
             """Training with data collected with offline threshold-based gait phase detection method"""
             self.data_ext = 'csv'
             self.must_train = True
         else:
             rospy.logerr("Please collect data for training ({})!".format(self.patient))
     if self.must_train:
         rospy.logwarn("HMM model not trained yet for {}!".format(self.patient))
         rospy.logwarn("Training HMM with local data...")
         self.load_data()
         self.init_hmm()
         self.train_hmm()
         self.has_model = True
     if self.has_model:
         try:
             '''MGDs loading if model exists'''
             # One multivariate Gaussian distribution file per gait phase.
             for st in self.states:
                 with open(self.packpath+'/log/HMM_models/'+self.patient+'_'+self.state2phase[st]+'.txt') as infile:
                     yaml_dis = yaml.safe_load(infile)
                     dis = MGD.from_yaml(yaml_dis)
                     self.mgds[st] = dis
                     rospy.logwarn(self.patient +"'s " + self.state2phase[st] + " MGC was loaded.")
                     '''Loading means and covariance matrix'''
                     self.dis_means[self.states_pos[st]] = self.mgds[st].parameters[0]
                     self.dis_covars[self.states_pos[st]] = self.mgds[st].parameters[1]
         except yaml.YAMLError as exc:
             # NOTE(review): str + exception raises TypeError here —
             # presumably str(exc) was intended.
             rospy.logwarn("Not able to load distributions: " + exc)
         """Transition and initial (log) probabilities matrices upon training"""
         trans_mat = self.model.dense_transition_matrix()[:self.n_states,:self.n_states]
         if self.verbose: print '**TRANSITION MATRIX (post-training)**\n'+ str(trans_mat)
         for i in range(self.n_states):
             self.log_startprob.append(ln(self.start_prob[i]))
             for j in range(self.n_states):
                 self.log_transmat[i,j] = ln(trans_mat[i][j])
         self.model_loaded = True
def run(cluster_directory_root, depth, plottype):
    """Score clusterings against random permutations of their members.

    Loads expression data, reads every clustering found *depth* levels
    under *cluster_directory_root*, builds random clusterings of matching
    sizes, computes Dunn and Davies-Bouldin indices for the random
    baseline, dumps them to disk, and optionally plots heat maps.

    NOTE(review): Python 2 code (print statements, iteritems/itervalues,
    DataFrame.as_matrix) — run under a py2 interpreter.

    Returns:
        (clusterings, clusterings_models) parsed from disk.
    """

    # load data
    gc, mt, track = load_data(None, 0)
    data = pd.concat([gc.data, mt.data])

    # Duplicate every row under '+'/'-' suffixed labels so signed member
    # names from the assignment files resolve against the index.
    labels = data.index.values
    pos_labels = labels + '+'
    neg_labels = labels + '-'
    pos_data = pd.DataFrame(data=data.as_matrix(), index=pos_labels,
                            columns=data.columns.values)
    neg_data = pd.DataFrame(data=data.as_matrix(), index=neg_labels,
                            columns=data.columns.values)

    data = pd.concat([data, pos_data, neg_data])

    # Glob pattern: root/*/.../* with *depth* wildcard components.
    generic_dir = cluster_directory_root.split('/') + (['*'] * depth)
    generic_dir = ('/').join(generic_dir)
    cluster_directories = \
        glob.glob(generic_dir)

    clusterings = {}
    clusterings_models = {}
    for cluster_dir in cluster_directories:
        try:
            clustering_id = cluster_dir.split('/')[-1:][0]
            # read final clusters
            clusters = {}
            filepath = '/'.join(cluster_dir.split('/') + ['assignments.txt'])
            lines = (open(filepath, 'r').read().splitlines())
            # Assignment files come in groups of four lines; only the
            # first two (name, tab-separated members) are used.
            l = 0
            while l < len(lines):
                cluster_name = lines[l]
                cluster_members = lines[l + 1].split('\t')
                clusters[cluster_name] = cluster_members
                l += 4

            clusterings[clustering_id] = clusters

            # load models
            models = {}
            model_files = glob.glob(cluster_dir + '/*')
            for model_file in model_files:
                try:
                    model_id = model_file.split('/')[-1:][0]
                    json = open(model_file).read()
                    models[model_id] = HiddenMarkovModel.from_json(json)
                    print 'model loaded from: ', model_file
                except:
                    # Best-effort: non-model files are skipped silently.
                    pass
            clusterings_models[clustering_id] = models
        except:
            # Malformed cluster directories are ignored entirely.
            pass

    # Union of all cluster members across all clusterings.
    background = set()
    for clustering in clusterings.itervalues():
        for cid, members in clustering.iteritems():
            background.update(set(members))

    background = list(background)
    # data = data.loc[background, :]

    # generate ranomd clusterings of the same size k as our models
    random_clusterings = {}

    for clustering_id, clustering in clusterings.iteritems():
        source = np.array(background)
        random_assignments = np.random.choice(len(clustering), source.size)
        random_clusters = {}
        for i, cluster_id in enumerate(clustering.iterkeys()):
            random_clusters[cluster_id] = \
                source[np.where(random_assignments == i)[0]].tolist()
        random_clusterings[clustering_id] = random_clusters

    # run dunn and davies_bouldin for clusterings and random permutations
    # NOTE(review): the open() handles passed to dump() are never closed.
    rand_dunn = report_dunn(random_clusterings, clusterings_models, data)
    savename = cluster_directory_root + 'dunn_index_random'
    dump(rand_dunn, open(savename, 'w'))

    rand_davies = report_davies_bouldin(random_clusterings, clusterings_models,
                                        data)
    savename = cluster_directory_root + 'davies_bouldin_index_random'
    dump(rand_davies, open(savename, 'w'))

    if plottype == 'none':
        pass

    elif plottype == 'kn_grid':

        rand_dunn_df = pd.DataFrame()
        rand_davies_df = pd.DataFrame()

        # Clustering ids look like '<m>k<k>n<n>'; split out m, k and n.
        for clustering_id, clustering in clusterings.iteritems():
            cid = clustering_id.replace('k', '_'). \
                replace('n', '_').split('_')
            m = cid[0]
            k = int(cid[1])
            n = int(cid[2])

            rand_dunn_df.loc[k, n] = rand_dunn[clustering_id]
            rand_davies_df.loc[k, n] = rand_davies[clustering_id]

        rand_davies_df = rand_davies_df.fillna(0)
        rand_dunn_df = rand_dunn_df.fillna(0)

        rand_dunn_df = rand_dunn_df.sort_index().sort_index(1)
        rand_davies_df = rand_davies_df.sort_index().sort_index(1)

        # NOTE(review): m is the loop variable from above — NameError if
        # clusterings is empty.
        odir = cluster_directory_root
        title = 'RANDOM_' + str(m) + ': Dunn Index'
        HeatMap(rand_dunn_df.as_matrix(), rand_dunn_df.index.values,
                rand_dunn_df.columns.values,
                title=title, odir=odir)

        odir = cluster_directory_root
        title = 'RANDOM_' + str(m) + ': Davies-Bouldin Index'
        HeatMap(rand_davies_df.as_matrix(), rand_davies_df.index.values,
                rand_davies_df.columns.values,
                title=title, odir=odir)

    elif plottype == 'row':
        # One row per clustering id instead of a k-by-n grid.
        rand_dunn_df = pd.Series()
        rand_davies_df = pd.Series()

        for clustering_id, clustering in clusterings.iteritems():
            rand_dunn_df.loc[clustering_id] = rand_dunn[clustering_id]
            rand_davies_df.loc[clustering_id] = rand_davies[clustering_id]

        rand_davies_df = rand_davies_df.fillna(0)
        rand_dunn_df = rand_dunn_df.fillna(0)

        rand_dunn_df = rand_dunn_df.sort_index()
        rand_davies_df = rand_davies_df.sort_index()

        odir = cluster_directory_root
        title = 'RANDOM' + ': Dunn Index'
        HeatMap(rand_dunn_df.as_matrix().reshape(-1, 1),
                rand_dunn_df.index.values,
                [' '], title=title, odir=odir, cmin=0, cmax=.5)

        odir = cluster_directory_root
        title = 'RANDOM' + ': Davies-Bouldin Index'
        HeatMap(rand_davies_df.as_matrix().reshape(-1, 1),
                rand_davies_df.index.values,
                [' '], title=title, odir=odir, cmin=5, cmax=10)

    return clusterings, clusterings_models
Ejemplo n.º 12
0
from pomegranate import HiddenMarkovModel
from converter_to import converter_to
import numpy

# Read the serialized base coding model (pomegranate HMM JSON) from disk.
with open('coding_model_base.json') as base_model_file:
    model_json = base_model_file.read()

# Reconstruct the promoter/UTR HMM from its JSON representation.
promoter_utr_model = HiddenMarkovModel.from_json(model_json)

string = 'ggccctggtgtgtgatgttccccgccctgtgtccaagtgttctcattgttcagttcccacccatgagtgagaacatgctcgcaccgccgcttctaaatgttttaaaaacaaagacaccaatgcccttcattggggaaatgaaagacttttaagtaaaacgattttgagtgaaataatatttgttgttttaaaaagttaatattaaccactctccatcatatattgaaattaacttaagatgtgaaagttaaattagaaaccttgtaaaggaaaaataggaaatagtttcatgaacttgacacaggaaaatatttcttagactagatactgtagcactcaccacaataagaaatcaagcgaattgcacttcatttttaaaaagcttctccttattatgttgttgtttaacaacttaaacgctatctctagaccaggaataattatttgctatataatacagcaaaaaatatgtatgtataaatggactcattcaaaatatataaagaactcctattacaaagaaattgacaaacagcccagtatatcaatgaatataaaaatttgagaagatattttccataagaagatatctaaatgaacattaggcatgagaaaaccaaattttaggatatcactacacacctggtgtagtttaaaagactgaaaatattaagtgtgtgggaatgtagagcaactggaaatggcctacatctttcatagaaatgtaaaacaatacaaatactttgcaaaactctgtccaacattttctacccattcaccaagcaactccatccctagctatagatacccaggaaaataagtatgtatcttcacagaaataattgtatgagaatattcatagttacttatgcacagtagttaccaagtaaacctgtctcccatcagaaaaatggatatcaaattgtgtgataatcatacaatcaataggatattacttggccaaaacaaaatgaaacaagggaaaaacacaatcaaacaaattagtggcatatatacccacctgagtaaagagaagtcggccgggtgcggtggctcatgcctgtaatcccagcactttgggaggccgaggcggcagatcacgaggtcaggagatcgagaccatcctggctaacacagtgaaaccccgtctctactaaaaatacaaaaaaaaaaagaaaaagaaaaattagccgggcgtggtggcgggcgcctgtattcccaactacttgggagtctgaggcaggagaatagcgtgaacctgggaggcagagcttgcagtgagcctagatcgcgtcactgcactccagcctgggcgacagagtgagactctgtctcaaacaaaaaaaaaaaaaaaaaagtcaaaacaagagaacatactaaatgattctatttttttatttatgatttcatgactaccattaagaaaatataacctgttgggaaactgtttctgccttgatgatgttgtacagacaagagataaacagtgaggaatatgcttagatgtattgggaaagacacgggtctgtggcattgtcacaagggtacacgaatactgagagtgaatgctgaaggaatgatccccattggtggtgaccctcaggtgagactagggtgcctgtgtttcagcaaagcctgggcaattggaatgcagggctcctaagattccatgacacccccaccttctaattctgttttgcaactgcagacggttacctggcacgctggccacaatctacctcactcttatcagagtctgcgctactgacagtgctttcagctctgagttgaggcacctcgaaccttgtttttgtggtgaaggatcctaaagtgctgtggggagtgatcacatttttgacaacagtaagttaagaatttcagttacttacatccctcagtcctgattaaacctatttgatttcaccagtttttaacccatcatatgtttgggtttcttctccccagtccctgactccacctcttctgccac
aaacgtcagcatggtggtatcagccggccctttgtccagcgagaaggcagagatgaacattctagaaatcaatgagaaattgcgcccccagttggcagagaagaaacagcagttcagaaacctcaaagagaaatgttttctaactcaactggccggcttcctggccaaccgacagaagaaatacagtaagatctataggctcaccgtcatgaaagtgatgaatgatgtcctgtcttctctctgagacactaaatgctctctccatcaaaaataatttcatccttcctgtacttctaggaaaacagaaatgggtattttaacattttgttaaagttggaagacagaggtaccaaagtatttagcaactttccatgtttgcaatcaggtgggggtgggactagagttaaactgccatttattgatttctgacacaggcacagaatgacctgttttctccaagaggctcaatcatgttttcaagaatcctctctgtaccatataagatcctgcagacaaataacatctagtctgttgttctaaatgtctgagactagtgaacttttattcagttcaagtttctgtggaggcccaacaggcaaagctctgttctagtgactctgagggaaacttggtgatagtagccagtacctgctctgaggggcttcaagaggagtctactcctaatagaacctgtgctgtctataagtgacagcatcaagagcagggagtaggggccgtgcatggtggctcactcctgtaatcccagcactttgggaggctgaggcgggcagatcatgaggtcaggagtttgagaccagcctgggcaacatggagaaaccccatctccactaaaaatacaaaaagtagatgggcgtggtggcaggtgactgtaatcacccctgcctaggaggctgaggcaggagaatcctttgaacccaggaggctgaggttgcagtgagccaagattttgccattgcactccagcctgggcgacagggcaagactgttaaaaaaaaaaataataataatgataaataaaaataagaataaaaagcagagagtagcttggtgagagtgaagtcctgcttcctggggcacagagtcttgttgctaaagaggaagaaagatcgcatccgagaatgtgtggagatagcagtgcagtgtacagagcagggactgtgggcctgtctactgggctccatccaagttgcttgtcttgtctgtccctcagtttcctcacctgttcagagggtactacaataatacctacctctgtaaattgctgcagtgaattacatgagctatttcttgtcaatctcctagaacatttattggcacacagtaaacactatctattagttgttcattctgctgtttctaaattaacacaaactttattagcatttgggcatatttccttcatggccttatggtgttatgtgtcactctttatgcttcagatatgattcttaaaatcataactgaagatatgatttaaaaatcaaagattttaaaaatctttcacatacttgtccttgaaattcccagtaaaagggaaaccatcagtcccatagtcctaggggccttcccgactgtacgagaaatcactacttcatgccccagtgcagtgttttagaggagaggctgcaaggcttgggaaagtggccccgcattcagagtcagacctcagggactgtgaattctgactccacttcgttgtggttgaatcatcttgtcaacttccttgatgtgcccttgaggttctctttcttcatctctaaattttggaggatcagatgccagaaagtcaggagactgaagagtaaagatgtggaaatccctgtctagaccctggtactggggagagttttgtccttgggatggacctggctcctgtcctgtaggcaatgaccacagcagcatgtccagccttccactgaggcaggcgtgtctgtcttttctcagaatatgaagagtgcaaagatctcataaaatttatg
ctgaggaatgagcgacagttcaaggaggagaagcttgcagagcagctcaagcaagctgaggagctcaggtgaggggaccccatgggggcaggcaggggtcaggtgtgtaaatctctgaagtacagcagctcggtggggagatgtaagagctaagctgggccaggggaagggcaggaattgccatggcaggctcgctacacacaaatatttatcaaacagagaagaaggataataaaaatgtatgggttgcagttgtttctcagagccttgttttctctttttcaaacaagtaattgttgatgtgaaatttacataacacaaaattaaccaaaggagtgtgaaccacacagcagcattcagtatactcaaaatggtgtgccatcaccaccccacttacccttagtgagaatcacctcctgactgactgcggcttctcattctttcactcaatcaatgttgccttctcgaccctgtcattcttttcttctttcgtcttttcaattcgccccatctgcacctggcctcatttctgtacatggctttgtatctagtggccgcaagatgcactatgtgtattttcacatggaaatgtccatggccagagtgaggaactgaaaggatgtctttttgaaacggaattaggaagacacctacttttgtttacagaagagaaagatgaatggaacatcatcgaggatcttgcaggagccctctctgatacagaggaagcctgtaaaccattttctattctttctcttggccacagacattcctttaaacatgtgctgaccttctgcttcgaggtctccttgaggacattgtctcagaaatctctgttgcaatatttgagcggatcactcaaccctttccactcttaaattttctctaccgtctcaccttaggcaatataaagtcctggttcacgctcaggaacgagagctgacccagttaagggagaagttacgggaagggagagatgcctcccgctcattgaatgagcatctccaggccctcctcactccggatgagccggacaagtcccaggggcaggacctccaagaacagctggctgaggggtgtagactgacacagcaccttgtccaaaagctcagcccaggtaaggtggccataggccctgatgacccaaaatcccaggcttatgagagactccagacctccatactttcacaatgacagttgtatcaatggtgtttttttccactaagcttatgtggccatgacatgaccaggacttcttgggtaagagcggagatgggaaacccatggggttggaggtcacagtattgcaagtgtccctcctcccttgatggaaggtggtctttggagtatgaggcagcatctgtctagttttaaaggacaggaaggaggctgcgatgggagcaggcttgttagagtgaaaagagctctggactaagaatgaagattcccaggctgtcttttcggcaatgttcttagtaactgtcagagagtgaatgacttgtccttcctgaatttctctctctccgtggcagacaaattgtctcttgcaagggtctgaagcattcaaatgtgggaacacttacaactgctttccaaaatgagatgaaggccctcgccgtgtgatgttggagaaggcactttatgtgggggcgttttgtggtaggaagtgcttcagactggagcactccccatggatagaatgtccctgaataacacagcagaagccacttggaggcttgaaatcttctgatgcatagaggactgtgggacaagtttgtctgcttctaagagaaagaattaggtttgaaatgcaaactgtgacgggacaccaagcctgtgcctgggaatcagatctggcaggatgggggacacagctgccaatgtccagagagaggctgcacaagcctccagtgatatgggaagcaaaaggtcttttcaatatttggccacatcttgatggtggccctccagatcagaaatgcattgcctgatggatcagg
aaaccatgccagggcattctgttaaagataaaacatgagagttttcagttgaacggtgacccatgcctagatgttcatgtctctgttgcacattgggctgactgtgcttgcagactgtgaagtgggaaatatctgaacgaacacttctgtatttacagaaaatgacaacgatgacgatgaagatgttcaagttgaggtggctgagaaagtgcagaaatcgtctgcccccaggtaacactgaatactcaggaacaattaatggatggtaacatatgaggaatatctaggaggcacaccctctctggcatctatgatgggccaaaaacccgcattcgcttggccacagtatgtgaaatataacccagcttagacacagggtgcggcagctgtcatgtttctctgtgtgtgccgagtgtcatgtctgcaccgtacagggatagctgagtcttcatcctcctcagctcctatctgtccagtgcaatgaacagcagctgctctcttcctctctggttcccatggcagccatgctctgttgcagagagaacaggattgcatgttccctcttaatgggaacgtccattttgctttctgggaccactctcttaatgccgcctgtcaaaaccagctaggactccctggggtccaatccctctgtgtttaatcttctgtcatctctgtcccacctggctcatcagggagatgcagaaggctgaagaaaaggaagtccctgaggactcactggaggaatgtgccatcacttgttcaaatagccatggcccttatgactccaaccagccacataagaaaaccaaaatcacatttgaggaagacaaagtcgactcaactctcattggctcatcctctcatgttgaatgggaggatgctgtacacattattccaggtagcctctgttttccttgtgtctcgtacctctctctaggctgaggaagataaactctgaagacaggctctatcaacacaaattcatttgaataaaaaactgtgatgggtttctaaacagatatcagggagtttttttgtccttctcagctaatgtcatgcctttgtctgccagtccccagtatcaagttactcaaccccaggcaagtgtgacaatctcatagtcacctgagtgcaggaggtgcacaggccatatctgtcaggcctcctagcttcgattcagtatttcttgtcatctgtgattaagtcatctgtccctgaacaatgcccatggagtttctatgcctgtttaaggaagctggcagccttgcctttgtatttggaaatatcgttccccaggcttcactgctctcagctttcatctggatctcctttaagtcagcttgcttagctgcacagtcaccctgaaatcaggacggaaacttttcttctttactttgctgatatatttccataaagcaaggctggaccctggttctccaccctgtcaatgcaatggctgatccaatgtttctttgtagcatcgtggattttttttttttttttttttttgcgatggagtcttgctctgtcacccaggctggagtgcagtggcaccatcttggcttggtgcaacctctccctcccagattcaagtgattctcctgcctcagcctcctgagttgctgggaccacaggtgcacaacatcacatctggctaatttttgtatttttagtagagacagggtttccccatattggccagggtagtcctgaactcatgacctcaaatgattcacctgtcttggcctcccaaatcacagattctttttaaagcaagagttgttcaaatttatctatcagtcgtgtttcatgtatagatgcctctaaacatttaatgtccatgttacctggtgatataagtccgtattgcagcaacactcttagaaaattgtttgaccaatttttggagatttttttggggaaaaaattttgtttaactttgactcaggcagggaatatggcattatggtctacacgtagagggagattttggcctg
tgggtctggaaagcagggtcatctaattctcaccaaagttaatctaggacaccctagaatattcctgtcagaatccttattcttgcactgagaatagttatgtccttgtgctatgactggacagtgatttgttcctatgtgaagtatgaattgcttaatgtgacctgcttctctgaatttatttacagaaaatgaaagtgatgatgaggaagaggaagaaaaagggccagtgtctcccaggtaatgttgtggaattgttggctgttaattcagtagtgacatctggagattgtagatttagggaaaatgaggaagtgatgaatagaactatttcttccattcacccagctacaaattgtgctgatttacaatgttgtatgttatttgtggcacttgtattggttttaatttcatagtcctctcaagataggaacttgccatcagatgagccaggtgaactagccaaacagggttttcttgttgatcttttcaaaaaaccagccctggattcattgattttttgaagggttttttgtgtctctatctcctttagttctgctctgatcttagttacttcttgtcttctgctagcttttgaatttgtttgctttgcttctctcgttattttaattgtgatgttaggatgtcaattttagatcttttctgctttctcttgtgggcatttagtgctataattttccctctacacattgctttaaatgtgtctcagagattctggtatgttgtgtctttgttctcattggtttcaaagaacatctttatttctgccttcattttgttattttcccagtagtcattcaggagcaggttgttgagtttccatgtagttgtgcggttttgagtgagtttcttaatcctgggttctaatttgatggcactgtggtctgacagtttgttgtgatttccattcttttacatttgctgacgagtgctttacctccaactatgtggtcaattttggaataagtgtgatgtggtgctgagaagaatgtatattctgttgatttggggtggagagttctgtagatgtcttttaggtctgcttggtggagagctgagttcaagtcctggatatccttgttaagcttctgtctcattgatctgtctaatattgacagtggggtgttaaagtctcccattatgattgtgtggagtctaaatctctttgtaggtctctcagacttgctttatgaatctgggtgctcctgtatagggtgcatatatatttaggatagttaactcttgttgaattgatccctttaccattatgtagtggccttctttgtctcttttgatctttgttggtttaaagtctgttttatcagagactaggattgcaacccctgcctttttttgttttccatttgcttggtagatcttcctccatccctttattttgagcctatgtgtgtctctgcatgtgagatgggtttcctgagtacagcacactgatgggtcttgactctttatccaatttgccattctgtgttttttaactggggcatttagcccatttacatttaaggttaatatcgttatgtgtgaatttgatcctgtcattatgatattagctggttatttcgcccgttagttgatgcagtttcttcctagcgtcaatggtctttacagtttggcatgtttttgtagtggctggtaccggttgttcctttccatgtttagtgcttcctttaggagctcttgtaaggcaggcctggtggtgacaaaatctctcagcatttgcttctctgtaaaggatttatttctccttcacttatgaagctttgtttggctggatatgaaattctgggttgaaaattcttttctttaagaaggttgaagatgctggagaggatgtggagaaataggaacacttttacactgttggtgggactgtaaactagttcaatgattgtggaaggcagtgtggcaattcctcagggatctagaactagaaatactatttgacccagccatcccat
tactgggtgtgtacccaaatgattataaatcatgctgctgtaaagacacatgcacacatatgtttattgtggcactattcacaatagcaaagacttggaaccaagccaaatatccagcaatgatagactggattaagaaaatgtggcacatatacaccatggaatactatgcagctataaaaaatgatgagttcatgtcctttgtaggggcatggatgaagctggaaaccatcattctcagcaaactattgcaaggacaaaaaaccaaataccgcatgttcttactcacaggtgggaattgaacaatgagaacacatggacacagaaaggggaacatcacacactggggcctgttgtagggtggggggagggaggaggggtagcattaggagatatacctaatgttaaatgatgagttaatgggtgaagcacaccaatgtggacatgtatacatatgtaactaacctgcacgttgtgcacatgtaccctaagacttaaagtattaaaatatatatatctgtatatatatatatacatacacacaaaaaataataaaggaaaactatacatatggaaaaaaaaagaatgttgaatattgctcccactctcttctggcttgtagggtttgtgccaagagatctgctgctagtctgatgggcttccctttgtgggtaatccgacctttctctctggctgcccttagcattttttccttcatttcaaccttggtgaatctgacaattaagtgttttggggttgctcttctcgaggagtatctttatggtgttctctgtgtttcctgaatttgaatgttggccttccttgctaggttggggaagtcctcctggataatatcctgaagaatgtttcccagcttggttccattctccccgtcactttcagtacaccaatcaaacgtagatttggtctttccacatagtcccatatttattggaggcttgttcatttctttttactcttttttctctaaacttctcttctcgcttcatttcattaatttgatcttgaatcactgataccatttcttgcacttgatcgaattggctactgaagcttgtgcatgcaccacgtagttctcgtgccatggttttcagctccatcaggtcatttaaggtcttctctacactgttcattctggttagccattcgtctaatcttttttcctttagctcagagaagtttgttattaccgactttctgaagcctacttctgtcagctcatcaaagtcattctccatcctgctttgttccattgctggcgaggagctgcgatcctttggaggagaagggatgtcaggtttttggaattttcagcttttgtgctctggtttctccccacctttgtggttttatctacccttggtctttgatgatggtgacctacagatggggttttggggtggatgtcttttttgttgatgttgatgctattcctttctgtgtgttagttttccttctaacagtcaggtccctcagcttcaggtctgttggagtttgctggaagtccactccagaccctcaaacagggatttctttgtgttgcctattctctcccatgtgtttaaatccagggagaggtgtatacatgctttcttcctatttgttggtagtatgttggctagtatttttgcaagaaaagaaattgaaaaagtaaatatattatatcaaaatattgggaaaatggggcccttaatacacaagatctgtgtctgcactgcgtcaagaactctcttcacttgaatgctgcatgtaaaattcaacccaatttatgcaaagtagttgaagccctgtgtcagttctctgtgctgcaagtcatgatggtagtttacagggagagtctgggtgccctgagttggctcatctgtggcaaatgtactgagcacatgctgcccatttttgctgtgtccccagagcagtcaccctccaccctgtatttagaaggatagttttatttctcttgaaggaaaaatgcctttggt
ttctgtgaccactccattctgtctcccatcagatcatctgggaggttttgttgtctaatgtctgttggttaaatcttctatcatccctgtcctgcctggctcatcaggaatctgcaggagtctgaagaggaggaagtcccccaggagtcctgggatgaaggttattcgactctctcaattcctcctgaaatgttggcctcgtaccagtcttacagcagcacatttcactcattagaggaacagcaagtctgcatggctgttgacataggcagtgagtactccattgtgaaggtgataaagctccagttcatggcccaggtagaccccataatctttgggccttgtgccccttgttgggctgagatttgccatcaccgtgggctgaacctatatatcaatgtagatttcaatcactctggagtcgagtctgaagcacaggcatggggtgggtcagtgagctttgctctcttcctagtctcaggccatgcccgtgccaacctggactgactgtcacgacattgaactcaaggcaggtgtggcaaactcacaccaaactatgcagcacatgcccaggagctgtctgtcagctcagctcatctgaattaaatgtctcttgccagctacaaaattccttatgagttttgttcccaaagcatgtctgtgtggttctttacctgccgaaggccagtgtcacccttgtctacctctcagtgaaagatgtgacccaggtttcactgaatttattcccattttctgtgtcttctaagttcgcttgttttagctcatctgtccatcatgttcctggtacgttttctagataaacagctgacttttcacccacaaaagccataatagctgatgcttctgtgtagaaccaagtttcattttgactcaagagctggtacattgcaccccttcatcaaatctctgtgtccacaatctcataaactatcaaattctgggtatttaatgagagaaagcttaatattgaagtatctctcctatgaggtgttagaactatttgcctacaatttattggggaaaaaattgctcatttgtgtacataaacctaggacagagcacatagggaagataacattccaaaacaggggaattttgcccaaggctcatgaaagaacccaagccagttttctcaagacttgacctcaggcctactggaatatttctctcaaagtctcctgttctcacactgacaagactgatgtccctgtgttaggattggacagaggaatgtttctgtgtgcaaggaagaactgcttaatgtaagagggcccatctgaatttatttgcaggacatcggtgggatcaagtgaaaaaggaggaccaagaggcaacaggtcccaggtgagtctgagaaattgtggagagttaatttgatgttgacacctggagatgccaagtccagggaaaacagtacatgctgaaaataatgattttgtcttgtcagacaagtctgaattatgcctactacattgctttttggttctcattagagtaaatgtttaggtttccatttcttcctacacttatcatttactaacctagtgaaggttgaccatacctcaaaagctgtattctcatggtaactgcagggaaacttgagcacattttatgcaaaattattgaggacatgcttttcatgatcactgttcactgtgtgtcctgagagcacaaatacagagtgtcctttgattccctcatcagtgtgtcacctgaccaattcactgagctcgctctgtgtgtgtgtgtgtgtgtgtgtgtgtgtgtgtgtgtgtctttctctttcatccttttctacctggccctagtctatcccaacataaaggcaataatttgttacctcattaatggatctgtcctttttcttttcaaactcttccttatgttagccatgaaatctagctgggactgtgtggtttctgattccccctggcttattctttactttttcccacttttccaggctcagcagggagctg
ctggatgagaaagagcctgaagtcttgcaggactcactggatagatgttattcaactccttcaggttgtcttgaactgactgactcatgccagccctacagaagtgccttttacgtattggagcaacagcgtgttggcttggctattgacatggatggtgagtacctttctatgaaggtgataaggatccactgagtcttctggttagggtcatattcctactgcaagtggcccttactgagctgagagatgtcattgccacagggaggacctataggcacatgtaggttgaatgaaactctagttccacttggaagcccagacaagggatgggtcagtgagcaaggctctcttcctagtctcaggccatgcctgtggcgccctaatcctactctcatgacgttggacctgggcagatgtgacaaattcacacaactctgattttgtctcaattttgtagatcttgtagatttcatccttcactctaatttcagcgtctaaaatccttgctaccatgaacaatctgagtatttgatgagacagggctgaatagtgcagtttttctcctagcaaccatttgggggcatttgctttaaatcgattggaaaaatatggcataaccatttgcacaaacttgggacaaatgatattgggataacgatctaccagaatagggaattttacccacagtttctgggacaaaaaccaaggaatctctatcgtgatcagccttcaggcctcttgaagaatatctctcacagtgtcctattctcatgctgaggagcctgaagtccctgtgtgaggattagacagtggattgttatgtgtgtaggagaaccagcttaatatgtctgtccatgtctgaacttattgcagaaattgaaaagtaccaagaagtggaagaagaccaagacccatcatgccccaggtaactttgagcaattatggatgcttaattgtgtgttgacacctggagatgccaggtccagggaaaacaagagtgtgttcaatttcatgttttcaacgaaggttgaattactcctcctgacattgctgttggttttcattgcagtagatgtttaggtttccatttcttcctccccttatcatttactaacttactataggttgaccatacctcaaaggctgtatggcaactgcatggaatcttgagcaagtttatggaaaattattgagcccactcttttcatgaccactgttcgctgtgtgtcccgagcgcactaactcagagtgtcctttgaccccttcatcagtgtgtcacccggccaattcgctgagctcactttctcctctctctctctctccctctccctgtctttctctttcattcttttctacctggccctggtctatcccaacataaaggcaataattcattacctcattaatggatccgtcctttttctttttaaacagttccttatgttagccatgaaatctagctggggctgtgtggtttctgattctccctggcttattctttactttttcctacttttccaggctcagcagggagctgctggatgagaaagagcctgaagtcttgcaggactcactggatagatgttattcgactccttcagattatcttgaactgcctgacttaggccagccctacagcagtgctgtttactcattggaggaacagtaccttggcttggctcttgacgtggacagtgagtaccttactgtgaaggtgataagcctccacctggtcttccagataggggtgatattcctgttccaagtggcccttactgacccgagagatgtcattgccgcaggcaggacctatgggcgcatataggttgtaatgaaactgttgtctcagttggaagcctagacatgaaatgggtcagtgagcaaggctctattcctagtctccagccatgcctgtggcaagctgagcccgctctcagcacattggacccaggcagatgtaaaaaattcacagaactatgatttggactcaagggttt
gtagatttcctccttcattctaatttcagtgtctaaaattcttgcatccatgaacgagctgggcatttgatgagacagggctgaatactgcagttttcctcctagaaatcatctggggcattttctttgaactgatgggaacaataaggcataactgtttgcacaaacttgggataaatgattttgggataacgatctaccagaatggggatatttcacccttggttctgagatgcaaaccaaagaatatcatgaccagctttcaggcctcctgaagtatctctctcacattgtcctgttctcatgctgagaagcctgagatccctgtgtggggattagacagtggactgttatgggtgtaggtgaattggcttattttgtctgtccctgtctgaatgtattgcaggaattaaaaaggaccaagaagaggaagaagaccaaggcccaccatgccccaggtaactgagcaattgtgaacagctacttctgtgttgacatctggagactcctggttcagggaaaacagagcgggctgacattatcgattacatcttttccagcaagcctgaattattcctactaacattgctgttggttttcattgcagtagatatttaggtttccatttcttcctccccttatcatttactaacctactgtaggtggaccagacttcaaaaactgtattctcatggcgactgcatggaaacttgagcacattttatggaaaattattgagcacagtcttttcatgatcactgtatgctgtgtgtcctgagggcactaactcagagtgtcctgttactccctcatcagtgtgtcacctggacaattcactgagctcattctctctctctctctctctgtgtgtgtgtgtgtgtgtgtgtgtgtgtgtgtgtgtgtgtgtgtgtgtctatctgtctttctctttcattcttttccatttggccctgttctgtcccaacatgaaggcaataatttgttacctcattaatggatctatccttttacttttttaaccacttccttatgctacccatgaaacctagttggggctctgttgtgtctgatttcccctggcttattctttactttttcctccttttccaggctcagcagggagctgctggaggtagtagagcctgaagtcttgcaggactcactggatagatattattcaactccttccagttgtcttgaacagcctgactcctgccagccctatggaagttccttttatgcattggaggaaaaacatgttggcttttctcttgacgtgggaggtgagtacctttctatgaaggtgataaggatccactgagtcttccatataaagatcatattcctgctccaattggccattactgagctgagagatgtcattgccgcagtgaggacctataggcacatgtaggttgaatgaaactctagttctacctggaagcccagacaagggatgggtcagtgagcaagactctcttcctagtctcaggccatgcctgtggcactctgattctactctcatgacattggacctgggcagatgtgacaaattcagagaactatgattttgactcaagggtttgtagatttcctttttcactctaatttcagtgtctaaagtcctcacaaccatgaacaatctgagtatttgatgagacagggctaaatattgcagtttttctcctagaaatcatttgagggtatttgctttaaattgattggaaaaatatggcataactgtttgcacaaacttgggacaaatgttattgggataacgatctactagaatagggacactttacccacagtttttgggagaaaaactgaggaatttatatcatgaccagccttcagacctcctgaaatatatctctcatggtgtcgtattcttatgctgaggagcctgaggtccctgtgtgaggattagacagtggattgttatatgtgtaggggaatcagcttaatgtgtctgtccatgtctgaatttattgcagaaa
ttgaaaagaaggggaaggggaagaaaagaaggggaagaagatcaaagaagaaaagaagaaggggaagaaaagaaggggaagatgacaacccaccatgccccaggtaactttcagcaattgtggatgcttaattctgtgttaacacctggaggcaacagattcagggaaaccagagtgtgtttgatgtcatgttttcaacgaaggctgaattactcctactgtcattgctgttggttttcattgcagtagatgtttaggtttccatttcttcctccccttatcatttactaacgtaccataggttgaccatacttcaaaagctgtactctgatggccactgcatcaaattttgagcatattttatgggaaactattgagctcactctttttgtgatcacagtttgctgtgtgtcatgagggcactaactcagagtgtccttttactcccttaccagtatgtcacctgggcaattcactagctcactttctctctgtctctgtctctgtctctctctctctgtctttctctttcattgttttctacctggccctgttctatcccaacataaaggcaataaattttttttttttacctcattaatgaatctatcctttttcttttctaaccacttccttatattacttctgaaatctagtggggctctgtggtgtctgattttccctggctgcttctttagttttgtctccttttccaggctcaacggcgtgctgatggaagtggaagagcctgaagtcttacaggactcactggatagatgttattcgactcagtcaatgtactttgaactacctgactcattccagcactacagaagtgtgttttactcatttgaggaagagcatatcagcttcgccctttacgtggacaataggttttttactttgacggtgacaagtctccacctggtgttccagatgggagtcatattcccacaataagcagctcttactaagccgagagatgtcatt'
# Normalize the sequence: lowercase and strip any embedded newlines.
string = string.lower().replace('\n', '')
print(len(string))
# Convert the sequence into the symbol encoding expected by the model.
# `converter_to(lists, 2)` presumably re-encodes single bases into
# order-2 symbols — TODO confirm against converter_to's definition.
lists = list(string)
two = converter_to(lists, 2)
print(two)
# NOTE(review): numpy.unicode_ is deprecated (removed in NumPy 2.0);
# consider numpy.str_ when this snippet is modernized.
seq = numpy.array(two, numpy.unicode_)

# Run Viterbi decoding: log-probability of the best path plus the path
# itself as (state_index, state) pairs.
logp, path = promoter_utr_model.viterbi(seq)

# Extract the human-readable state name for each step of the path.
path_names = [p[1].name for p in path]

# NOTE(review): `count` is assigned but never used below.
count = 0
# Print (next base, decoded state name, offset-from-end) triples for
# every position that still has a following base.
print([(string[i + 1], name, i - len(path_names) + 1) for i, name in enumerate(path_names) if i + 1 < len(string)])
def prep(cluster_directory_root, depth, genefile):
    """Load expression data plus saved clusterings and build random baselines.

    Parameters
    ----------
    cluster_directory_root : str
        Root directory under which clustering result directories live.
    depth : int
        Number of wildcard path components appended to the root when
        globbing for clustering directories.
    genefile : str
        Path to a file (readable by ``load``) listing the genes to keep.

    Returns
    -------
    tuple
        ``(clusterings, random_clusterings, random_signed_clusterings,
        clusterings_models, data, original_labels)``.
    """

    # load data
    gc, mt, track = load_data(None, 0)
    genes = load(open(genefile, 'r'))
    # Restrict the gene-expression table to the requested gene list.
    gc.data = gc.data.loc[genes, :]

    data = pd.concat([gc.data, mt.data])

    # Build signed variants of every row: '<label>+' keeps the values,
    # '<label>-' negates them; all three sets are stacked together.
    # NOTE(review): DataFrame.as_matrix() is deprecated — .values /
    # .to_numpy() is the modern equivalent.
    labels = data.index.values
    original_labels = labels
    pos_labels = labels + '+'
    neg_labels = labels + '-'
    pos_data = pd.DataFrame(data=data.as_matrix(), index=pos_labels,
                            columns=data.columns.values)
    neg_data = pd.DataFrame(data=(data.as_matrix() * -1), index=neg_labels,
                            columns=data.columns.values)

    data = pd.concat([data, pos_data, neg_data])

    print data.index.values

    # Glob for clustering directories `depth` levels below the root,
    # e.g. root/*/* for depth == 2.
    generic_dir = cluster_directory_root.split('/') + (['*'] * depth)
    generic_dir = ('/').join(generic_dir)
    cluster_directories = \
        glob.glob(generic_dir)

    clusterings = {}
    clusterings_models = {}
    for cluster_dir in cluster_directories:
        try:
            # The directory's basename doubles as the clustering id.
            clustering_id = cluster_dir.split('/')[-1:][0]
            # read final clusters
            clusters = {}
            filepath = '/'.join(cluster_dir.split('/') + ['assignments.txt'])
            lines = (open(filepath, 'r').read().splitlines())
            l = 0
            # assignments.txt records each cluster as a 4-line stanza:
            # name, tab-separated members, then two lines skipped here.
            while l < len(lines):
                cluster_name = lines[l]
                cluster_members = lines[l + 1].split('\t')
                # An empty member line splits to [''] — treat as empty.
                if cluster_members == ['']:
                    cluster_members = []
                clusters[cluster_name] = cluster_members
                l += 4

            clusterings[clustering_id] = clusters

            # load models
            models = {}
            model_files = glob.glob(cluster_dir + '/*')
            for model_file in model_files:
                try:
                    model_id = model_file.split('/')[-1:][0]
                    json = open(model_file).read()
                    models[model_id] = HiddenMarkovModel.from_json(json)
                    print 'model loaded from: ', model_file
                # NOTE(review): bare except silently skips any file that
                # is not a valid HMM JSON (deliberate best-effort), but it
                # also hides real errors — consider narrowing.
                except:
                    pass
            clusterings_models[clustering_id] = models
        # NOTE(review): bare except also skips directories missing
        # assignments.txt — again best-effort, but it masks failures.
        except:
            pass

    # Retained (disabled) alternative: derive the background set from the
    # cluster memberships instead of the original data labels.
    """
    background = set()
    for clustering in clusterings.itervalues():
        for cid, members in clustering.iteritems():
            background.update(set(members))
    """

    background = list(original_labels)
    # data = data.loc[background, :]

    # Generate random clusterings of the same size k as our models.
    # NOTE(review): time-based seed makes these baselines irreproducible.
    random_clusterings = {}
    np.random.seed(int(time.time()))
    for clustering_id, clustering in clusterings.iteritems():
        source = np.array(background)
        # Assign every background label to one of k clusters uniformly.
        random_assignments = np.random.choice(len(clustering), source.size)
        random_clusters = {}
        for i, cluster_id in enumerate(clustering.iterkeys()):
            random_clusters[cluster_id] = \
                source[np.where(random_assignments == i)[0]].tolist()
        random_clusterings[clustering_id] = random_clusters

    # generate random signed clustering
    random_signed_clusterings = {}
    pn = np.array(['+', '-'])
    for clustering_id, clustering in clusterings.iteritems():
        source = np.array(background)
        random_assignments = np.random.choice(len(clustering), source.size)
        random_clusters = {}
        for i, cluster_id in enumerate(clustering.iterkeys()):
            members = source[np.where(random_assignments == i)[0]].tolist()
            signed_members = []
            # Append a random '+' or '-' sign to each member label.
            for member in members:
                sign = np.random.choice(pn, 1)[0]
                signed_members.append(member + sign)

            random_clusters[cluster_id] = signed_members
        random_signed_clusterings[clustering_id] = random_clusters

    return clusterings, random_clusterings, random_signed_clusterings,\
        clusterings_models, data, original_labels
Ejemplo n.º 14
0
# Read the serialized partial HMMs (pomegranate JSON) from disk.
with open('partial_model_start_model.json') as start_model_file:
    start_model_json = start_model_file.read()

with open('partial_model_coding_to_stop_model0.json'
          ) as coding_to_stop_model_file0:
    coding_to_stop_model_json0 = coding_to_stop_model_file0.read()

with open('partial_model_coding_to_stop_model1.json'
          ) as coding_to_stop_model_file1:
    coding_to_stop_model_json1 = coding_to_stop_model_file1.read()

with open('partial_model_coding_to_stop_model2.json'
          ) as coding_to_stop_model_file2:
    coding_to_stop_model_json2 = coding_to_stop_model_file2.read()

start_model = HiddenMarkovModel.from_json(start_model_json)

# NOTE(review): coding_to_donor_model_json0/1/2 and
# intron_acceptor_model_json are never defined in this snippet — the
# lines below will raise NameError unless the corresponding
# `open(...).read()` blocks exist elsewhere; they appear to be missing.
coding_to_donor_model0 = HiddenMarkovModel.from_json(
    coding_to_donor_model_json0)
coding_to_donor_model1 = HiddenMarkovModel.from_json(
    coding_to_donor_model_json1)
coding_to_donor_model2 = HiddenMarkovModel.from_json(
    coding_to_donor_model_json2)

intron_acceptor_model = HiddenMarkovModel.from_json(intron_acceptor_model_json)

coding_to_stop_model0 = HiddenMarkovModel.from_json(coding_to_stop_model_json0)
coding_to_stop_model1 = HiddenMarkovModel.from_json(coding_to_stop_model_json1)
coding_to_stop_model2 = HiddenMarkovModel.from_json(coding_to_stop_model_json2)

Ejemplo n.º 15
0
 def load(fileLocation):
     """Load a HiddenMarkovModel previously serialized to JSON.

     Parameters
     ----------
     fileLocation : str
         Path to a pomegranate HMM JSON file.

     Returns
     -------
     HiddenMarkovModel
         The deserialized model.

     Raises
     ------
     IOError
         If the file cannot be opened (propagated from ``open``).
     """
     # The original version had a ``raise RuntimeError(...)`` after the
     # ``return`` inside the ``with`` block; it was unreachable dead code
     # (open/parse failures already raise), so it has been removed.
     with open(fileLocation) as modelFile:
         return HiddenMarkovModel.from_json(modelFile.read())
Ejemplo n.º 16
0
def gen_cluster_plots(cluster_directory_root, depth):
    # load data
    gc, mt, track = load_data(None, 0)
    data = pd.concat([gc.data, mt.data])

    labels = data.index.values
    pos_labels = labels + '+'
    neg_labels = labels + '-'
    pos_data = pd.DataFrame(data=data.as_matrix(), index=pos_labels,
                            columns=data.columns.values)
    neg_data = pd.DataFrame(data=data.as_matrix(), index=neg_labels,
                            columns=data.columns.values)

    data = pd.concat([data, pos_data, neg_data])

    generic_dir = cluster_directory_root.split('/') + (['*'] * depth)
    generic_dir = ('/').join(generic_dir)
    cluster_directories = \
        glob.glob(generic_dir)

    clusterings = {}
    clusterings_models = {}
    for cluster_dir in cluster_directories:
        try:
            clustering_id = cluster_dir.split('/')[-1:][0]
            # read final clusters
            clusters = {}
            filepath = '/'.join(cluster_dir.split('/') + ['assignments.txt'])
            lines = (open(filepath, 'r').read().splitlines())
            l = 0
            while l < len(lines):
                cluster_name = lines[l]
                cluster_members = lines[l + 1].split('\t')
                clusters[cluster_name] = cluster_members
                l += 4

            clusterings[clustering_id] = clusters

            # load models
            models = {}
            model_files = glob.glob(cluster_dir + '/*')
            for model_file in model_files:
                try:
                    model_id = model_file.split('/')[-1:][0]
                    json = open(model_file).read()
                    models[model_id] = HiddenMarkovModel.from_json(json)
                    print 'model loaded from: ', model_file
                except:
                    pass
            clusterings_models[clustering_id] = models
        except:
            pass

    background = set()
    for clustering in clusterings.itervalues():
        for cid, members in clustering.iteritems():
            background.update(set(members))

    background = list(background)
    # data = data.loc[background, :]

    # generate ranomd clusterings of the same size k as our models
    for clustering_id, clustering in clusterings.iteritems():
        for model_id, members in clustering.iteritems():
            sequences = data.loc[members, :]
            pltdir = '/'.join(cluster_directory_root.split('/') + ['plots'])

            # make line plots directory
            if not os.path.isdir(pltdir + '/line'):
                print "Creating directory...", pltdir
                os.mkdir(pltdir + '/line')

            savename = pltdir + '/line/' + model_id + '_lineplot'

            plt_title = model_id + ' Line Plot'
            ax = sequences.T.plot(legend=False, rot=2)
            ax.set_title(plt_title)
            ax.set_xlabel('Timepoint')
            ax.set_ylabel('Normalized Expression')

            print 'Saving: ', savename
            fig = ax.get_figure()
            fig.savefig(savename)
            fig.clear()

            # make autocorr plots directory
            if not os.path.isdir(pltdir + '/autocorr'):
                print "Creating directory...", pltdir
                os.mkdir(pltdir + '/autocorr')

            savename = pltdir + '/autocorr/' + model_id + '_autocorr'

            plt_title = model_id + ' Autocorr Plot'
            for seq in sequences.index:
                ax = autocorrelation_plot(sequences.loc[seq])
            ax.set_title(plt_title)

            print 'Saving: ', savename
            fig = ax.get_figure()
            fig.savefig(savename)
            fig.clear()

            # make lag plots directory
            if not os.path.isdir(pltdir + '/lag'):
                print "Creating directory...", pltdir
                os.mkdir(pltdir + '/lag')

            from pylab import *
            NUM_COLORS = len(members)
            cm = get_cmap('gist_rainbow')
            colors = []
            for i in range(NUM_COLORS):
                colors.append(cm(1.*i/NUM_COLORS))

            savename = pltdir + '/lag/' + model_id + '_lagplot'

            plt_title = model_id + ' Lag Plot'
            for i, seq in enumerate(sequences.index):
                ax = lag_plot(sequences.loc[seq], c=colors[i])
            ax.set_title(plt_title)

            print 'Saving: ', savename
            fig = ax.get_figure()
            fig.savefig(savename)
            fig.clear()

            """
Ejemplo n.º 17
0
 def _model(self):
     """Rebuild and return the HMM from the JSON stored on this object."""
     serialized = self.json_model
     return HiddenMarkovModel.from_json(serialized)