def generate_kmers_all(self, k, save=False):
        '''
        :param k: k-mer length used to build the vocabulary of all 4^k DNA k-mers
        :param save: optional path prefix; when given, the matrix and metadata files are written to disk
        :return: sparse (n_samples x 4^k) matrix of L1-normalized k-mer frequencies
        '''
        self.k=k
        self.vocab = [''.join(xs) for xs in itertools.product('atcg', repeat=k)]
        self.vocab.sort()
        self.vectorizer = TfidfVectorizer(use_idf=False, vocabulary=self.vocab, analyzer='char', ngram_range=(k, k),
                                          norm=None, stop_words=[], lowercase=True, binary=False)

        data = np.zeros((len(self.fasta_files), len(self.vocab))).astype(np.float64)

        # multi processing extraction of k-mer distributions
        t_steps=[]
        s_steps=[]
        pool = Pool(processes=self.num_p)
        for ky, (v,t,s) in tqdm.tqdm(pool.imap_unordered(self.get_kmer_distribution, self.fasta_files, chunksize=1),
                               total=len(self.fasta_files)):
            data[self.indexing[ky], :] = v
            t_steps.append(t)
            s_steps.append(s)
        pool.close()

        # normalize the frequencies
        data = normalize(data, axis=1, norm='l1')
        data = sparse.csr_matrix(data)

        if save:
            FileUtility.save_sparse_csr(save, data)
            FileUtility.save_list(save+'_meta',self.fasta_files)
            # ': '.join(['mean_time', str(np.mean(t_steps))]), ': '.join(['std_time', str(np.std(t_steps))])
            FileUtility.save_list(save+'_log',[': '.join(['mean_size', str(np.mean(s_steps))]), ': '.join(['std_size', str(np.std(s_steps))])])

        return data
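A minimal, self-contained sketch of the counting step above (toy DNA strings instead of the class's FASTA files; the vectorizer settings use_idf=False and norm=None mirror the snippet, so fit_transform yields raw k-mer counts, which are then L1-normalized per read):

import itertools

import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import normalize

k = 2
vocab = sorted(''.join(p) for p in itertools.product('atcg', repeat=k))
vectorizer = TfidfVectorizer(use_idf=False, vocabulary=vocab, analyzer='char',
                             ngram_range=(k, k), norm=None, lowercase=True)

reads = ['acgtacgt', 'aaaaccgg']                        # hypothetical toy reads
counts = vectorizer.fit_transform(reads)                # raw k-mer counts per read
freqs = normalize(counts.toarray(), axis=1, norm='l1')  # k-mer distribution per read
print(dict(zip(vocab, counts.toarray()[0].astype(int))))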
Example #2
    def create_kmer_table(self, path, k, cores=4, override=False):

        save_path = self.output_path + 'sequence_' + str(k) + 'mer'

        if override or not os.path.exists('_'.join(
            [save_path, 'feature', 'vect.npz'])):
            files = FileUtility.recursive_glob(path, '*')
            files.sort()
            input_tuples = []
            for file in files:
                input_tuples.append(
                    (file.split('/')[-1].split('.')[0], file, k))

            strains = []
            mat = []
            kmers = []
            pool = Pool(processes=cores)
            for strain, vec, vocab in tqdm.tqdm(pool.imap_unordered(
                    self._get_kmer_rep, input_tuples, chunksize=cores),
                                                total=len(input_tuples)):
                strains.append(strain)
                mat.append(vec)
                kmers = vocab
            pool.close()
            mat = sparse.csr_matrix(mat)

            FileUtility.save_sparse_csr(save_path + '_feature_vect', mat)
            FileUtility.save_list('_'.join([save_path, 'strains', 'list.txt']),
                                  strains)
            FileUtility.save_list('_'.join([save_path, 'feature', 'list.txt']),
                                  kmers)
        return ('_'.join([save_path]) + ' created')
 def write_in_file(filename, pos, neg):
     lines = [['direction', 'marker', 'p-value']]
     for marker, pval in pos:
         lines.append(['+', marker, str(pval)])
     for marker, pval in neg:
         lines.append(['-', marker, str(pval)])
     FileUtility.save_list(filename, ['\t'.join(line) for line in lines])
 def generate(self,
              vocab_size,
              sample_size,
              output_dir,
              num_p=4,
              backend='Sentencepiece'):
     '''
     :param vocab_size: the size of final vocabulary
     :param sample_size: how many reads from each file
     :param output_dir: where to write the results
     :param num_p: number of cores
     :param backend: segmentation backend, 'Sentencepiece' or 'normalbpe'
     :return:
     '''
     start = timeit.default_timer()
     fasta_files = [(x, sample_size) for x in self.fasta_files]
     corpus = []
     pool = Pool(processes=num_p)
     for ky, v in tqdm.tqdm(pool.imap_unordered(self._get_corpus,
                                                fasta_files,
                                                chunksize=num_p),
                            total=len(fasta_files)):
         corpus = corpus + v
     pool.close()
     print('Corpus size for training NPE is ', len(corpus))
     if backend == 'Sentencepiece':
         FileUtility.save_list('../tmp/tmp_txt', corpus)
         spm.SentencePieceTrainer.Train(
             '--input=../tmp/tmp_txt --model_prefix=' + output_dir +
             ' --add_dummy_prefix false --max_sentencepiece_length=512 --model_type=bpe --mining_sentence_size=5000000 --input_sentence_size=10000000 --vocab_size=' + str(vocab_size)
         )
         FileUtility.save_list('../tmp/tmp_txt', corpus[0:10])
     elif backend == 'normalbpe':
         train_npe(corpus, output_dir, vocab_size, output_dir + '_freq')
     print('The segmentation training took ',
           timeit.default_timer() - start, ' seconds.')
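For reference, the model written by SentencePieceTrainer.Train (at output_dir + '.model') can later be loaded and applied to a read as sketched below; the file name 'npe.model' is only an assumed example:

import sentencepiece as spm

sp = spm.SentencePieceProcessor()
sp.Load('npe.model')                              # model produced by the training call above
print(sp.EncodeAsPieces('acgtacgtacgtacgt'))      # read segmented into the learned pieces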
Example #5
    def create_treefold(self, path, tree_addr, cv, test_ratio, phenotype, mapping=None):

        ## find a mapping from strains to the phenotypes
        if mapping:
            mapping_isolate_label = dict(self.get_new_labeling(mapping)[phenotype])
        else:
            mapping_isolate_label = self.phenotype2labeled_strains_mapping[phenotype]

        # get common strains
        list_of_list_of_strains = list(self.strains.values())
        list_of_list_of_strains.append(list(mapping_isolate_label.keys()))
        final_strains = GenotypePhenotypeAccess.get_common_strains(list_of_list_of_strains)
        final_strains.sort()

        # prepare test
        Y = [mapping_isolate_label[strain] for strain in final_strains]

        isolate_to_group=dict([tuple(l.split('\t')) for l in FileUtility.load_list(tree_addr.replace(tree_addr.split('/')[-1], 'phylogenetic_nodes_and_clusters.txt'))])

        groups=[int(isolate_to_group[iso]) for iso in final_strains]
        group_kfold = GroupKFold(n_splits=round(1/test_ratio))

        train_index, test_index = list(group_kfold.split(final_strains, Y, groups))[0]
        X_test=[final_strains[x] for x in test_index]
        FileUtility.save_list(path.replace('_folds.txt', '_test.txt'), ['\t'.join(X_test)])
        final_strains = [final_strains[ix] for ix in train_index]
        group_kfold = GroupKFold(n_splits=cv)

        folds=[]
        for _, test_index in group_kfold.split(train_index, [Y[idx] for idx in train_index],  [groups[idx] for idx in train_index]):
            folds.append(test_index)
        folds=['\t'.join([final_strains[x] for x in fold.tolist()]) for fold in  folds]
        FileUtility.save_list(path, folds)
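A minimal sketch of why GroupKFold is used here (toy strain names and cluster ids, not real data): every phylogenetic cluster stays on one side of each split, so closely related isolates cannot leak between train and test folds.

from sklearn.model_selection import GroupKFold

strains = ['s1', 's2', 's3', 's4', 's5', 's6']
labels = [0, 1, 0, 1, 0, 1]
groups = [0, 0, 1, 1, 2, 2]                        # phylogenetic cluster of each strain

for train_idx, test_idx in GroupKFold(n_splits=3).split(strains, labels, groups):
    print([strains[i] for i in test_idx])          # each fold contains whole clusters only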
Example #6
 def train_npe(self):
     '''
     Train the NPE segmentation model (SentencePiece BPE) over the input samples.
     :return:
     '''
     print(
         'npe training started.. it might take more than 1 hour for more than 1000 samples'
     )
     DiTaxaWorkflow.blockPrint()
     start = time.time()
     G16s = NPESegmentTrainMetagenomics(self.file_directory,
                                        self.file_extenstion)
     DiTaxaWorkflow.ensure_dir(self.output_directory +
                               'npe_segmentatation/')
     G16s.generate(self.vocab_size,
                   self.seg_train_depth,
                   self.output_directory + 'npe_segmentatation/' +
                   self.dbname + '_' + '_'.join([
                       'unique',
                       str(self.vocab_size), 'v',
                       str(self.seg_train_depth), 's'
                   ]),
                   backend='Sentencepiece',
                   num_p=self.num_p)
     end = time.time()
     spent = (end - start)
     self.log_file.append('training segmentation ' + '_'.join([
         'unique',
         str(self.vocab_size), 'v',
         str(self.seg_train_depth), 's '
     ]) + str(spent) + ' seconds, using ' + str(self.num_p) + ' cores')
     DiTaxaWorkflow.enablePrint()
     FileUtility.save_list(self.output_directory + 'logfile.txt',
                           self.log_file)
Example #7
 def create_continous_mics():
     '''
     Read the MIC table, scale the resistance values to [0, 1], and save them as sparse features.
     '''
     scaler = MinMaxScaler()
     df = pd.read_csv("../data_config/Final_MICs_16.06.16.txt", sep='\t')
     res = df[[
         'Isolates', 'CIP MIC', 'TOB MIC', 'COL MIC', 'CAZ MIC', 'MEM MIC'
     ]]
     # strip inequality prefixes (<=, >=, ≤, ≥) before casting the MIC values to float
     matrix = np.array([[
         float(
             str(x).replace('<=', '').replace('≤', '').replace(
                 '≥', '').replace('>=', '')) for x in row
     ] for row in res[[
         'CIP MIC', 'TOB MIC', 'COL MIC', 'CAZ MIC', 'MEM MIC'
     ]].values])
     # find nans [[(idx,idy) for idy,y in enumerate(x) if y] for idx, x in enumerate(np.isnan(matrix))]
     resistances = np.delete(matrix, [509], axis=0)
     isolates = [
         x[0] for idx, x in enumerate(list(df[['Isolates']].values))
         if not idx == 509
     ]
     # scale to 0-1
     resistances = scaler.fit_transform(resistances)
     features = ['CIP', 'TOB', 'COL', 'CAZ', 'MEM']
     base_path = '/mounts/data/proj/asgari/dissertation/datasets/deepbio/pseudomonas/data_v3/continous_mic_vals'
     resistances = csr_matrix(resistances)
     FileUtility.save_sparse_csr(base_path + '_feature_vect', resistances)
     FileUtility.save_list(base_path + '_isolates_list.txt', isolates)
     FileUtility.save_list(base_path + '_feature_list.txt', features)
Example #8
 def representation_npe(self):
     '''
     Apply the trained NPE segmentation to all samples and save the resulting representations.
     :return:
     '''
     print('npe generation started..')
     start = time.time()
     G16s = NPESegmentApplyMetagenomics(
         self.file_directory,
         self.file_extenstion,
         self.output_directory + 'npe_segmentatation/' + self.dbname + '_' +
         '_'.join([
             'unique',
             str(self.vocab_size), 'v',
             str(self.seg_train_depth), 's.model'
         ]),
         sampling_number=self.rep_sampling_depth,
         num_p=self.num_p)
     DiTaxaWorkflow.ensure_dir(self.output_directory +
                               'npe_representation/')
     G16s.generate_npes_all(save=self.output_directory +
                            'npe_representation/' + self.dbname +
                            '_uniquepiece_' + str(self.rep_sampling_depth))
     end = time.time()
     spent = end - start
     self.log_file.append(
         'generating the representations npe_representation/' +
         self.dbname + '_uniquepiece_' + str(self.rep_sampling_depth) +
         '  ' + str(spent) + ' seconds, using ' + str(self.num_p) +
         ' cores')
     FileUtility.save_list(self.output_directory + 'logfile.txt',
                           self.log_file)
Example #9
def train_resampling_npe(sentenses, outfile, num_symbols, frequency_file, min_frequency=2, verbose=False, is_dict=False, resample_size=10000, N=10):
    """Learn num_symbols BPE operations from vocabulary, and write to outfile.
    """
    outfile_name=outfile
    list_of_seg=[]
    outfile = codecs.open(outfile, 'w', 'utf-8')
    f = codecs.open(frequency_file, 'w', 'utf-8')
    # version 0.2 changes the handling of the end-of-word token ('</w>');
    # version numbering allows backward compatibility
    outfile.write('#version: 0.2\n')
    list_of_seg.append('#version: 0.2')


    vocab = get_vocabulary(sentenses, is_dict)
    vocab = dict([(tuple(x[:-1]) + (x[-1] + '</w>',), y) for (x, y) in vocab.items()])
    sorted_vocab = sorted(vocab.items(), key=lambda x: x[1], reverse=True)

    stats, indices = get_pair_statistics(sorted_vocab)
    big_stats = copy.deepcopy(stats)
    # threshold is inspired by Zipfian assumption, but should only affect speed
    threshold = max(stats.values()) / 10
    for i in tqdm.tqdm(range(num_symbols)):
        if stats:
            most_frequent = max(stats, key=lambda x: (stats[x], x))

        # we probably missed the best pair because of pruning; go back to full statistics
        if not stats or (i and stats[most_frequent] < threshold):
            prune_stats(stats, big_stats, threshold)
            stats = copy.deepcopy(big_stats)
            most_frequent = max(stats, key=lambda x: (stats[x], x))
            # threshold is inspired by Zipfian assumption, but should only affect speed
            threshold = stats[most_frequent] * i / (i + 10000.0)
            prune_stats(stats, big_stats, threshold)

        if stats[most_frequent] < min_frequency:
            sys.stderr.write('no pair has frequency >= {0}. Stopping\n'.format(min_frequency))
            break

        f.write('{0} {1} '.format(*most_frequent) + str(stats[most_frequent]) + '\n')
        list_of_seg.append('{0} {1} '.format(*most_frequent))
        #print('{0} {1} '.format(*most_frequent) + str(stats[most_frequent]) + '\n')
        if verbose:
            sys.stderr.write(
                'pair {0}: {1} {2} -> {1}{2} (frequency {3})\n'.format(i, most_frequent[0], most_frequent[1],
                                                                       stats[most_frequent]))
        outfile.write('{0} {1}\n'.format(*most_frequent))
        changes = replace_pair(most_frequent, sorted_vocab, indices)
        update_pair_statistics(most_frequent, changes, stats, indices)
        stats[most_frequent] = 0
        if not i % 100:
            prune_stats(stats, big_stats, threshold)
            FileUtility.save_list(outfile_name + '_temp', list_of_seg)


    f.close()
    outfile.close()
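The loop above repeatedly merges the most frequent adjacent symbol pair; a toy, self-contained illustration of a single merge step (with a made-up two-word vocabulary, not the helpers used above) looks like this:

from collections import Counter

vocab = {('a', 'c', 'g', 't</w>'): 5, ('a', 'c', 'g', 'g</w>'): 3}

pair_counts = Counter()
for word, freq in vocab.items():
    for left, right in zip(word, word[1:]):
        pair_counts[(left, right)] += freq

best = max(pair_counts, key=pair_counts.get)       # most frequent adjacent pair
merged = {tuple(' '.join(w).replace(' '.join(best), ''.join(best)).split()): f
          for w, f in vocab.items()}
print(best, merged)                                # ('a', 'c') merged into 'ac' in every word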
Example #10
def create_tsne_web(X, Y, tsne_file_coor, tsne_file_label):
    classes = list(set(Y))
    classes.sort()
    L = [classes.index(y) for y in Y]
    tsne_res = np.hstack((X, np.array([L]).T))
    tsne_res[:, 0:2] = np.round(tsne_res[:, 0:2], 2)
    tsne_lines = []
    for l in tsne_res:
        tsne_lines.append('\t'.join([str(l[0]), str(l[1]), str(int(l[2]))]))
    FileUtility.save_list(tsne_file_coor, tsne_lines)
    FileUtility.save_list(tsne_file_label, Y)
    def sequential_crawl(triples, override=False):

        if not override:
            new_list=[]
            for x,y,z in triples:
                if not FileUtility.exists(y+z):
                    new_list.append((x,y,z))
            triples=new_list

        print ('Start crawling..')
        for x in tqdm.tqdm(triples):
            PNGScriptRetrieve(x)
        FileUtility.save_list(triples[0][1]+'log.txt',PNGScriptRetrieve.log)
Example #12
 def numpy2trainfiles(file, name, out='../data/s8_features/'):
     '''
     test_file='/mounts/data/proj/asgari/dissertation/datasets/deepbio/protein_general/ss/data/cb513+profile_split1.npy'
     train_file='/mounts/data/proj/asgari/dissertation/datasets/deepbio/protein_general/ss/data/cullpdb+profile_6133_filtered.npy'
      :param file: path to the .npy profile file (e.g. the CB513 / CullPDB files listed above)
      :param name: base name of the generated output files
      :param out: output directory
      :return:
     '''
     db = np.load(file)
     a = np.arange(0, 21)
     b = np.arange(35, 56)
     c = np.hstack((a, b))
     db = np.reshape(db, (db.shape[0], int(db.shape[1] / 57), 57))
     seq = [
         'A', 'C', 'E', 'D', 'G', 'F', 'I', 'H', 'K', 'M', 'L', 'N', 'Q',
         'P', 'S', 'R', 'T', 'W', 'V', 'Y', 'X', 'NoSeq'
     ]
     label = ['L', 'B', 'E', 'G', 'I', 'H', 'S', 'T']
     sequences = []
     labels = []
     possible_features = dict()
     for i in range(0, db.shape[0]):
         sequences.append(''.join([
             seq[np.argmax(x)] if np.max(x) == 1 else ''
             for x in db[i, :, 0:21]
         ]).lower())
         labels.append(''.join([
             label[np.argmax(y)] if np.max(y) == 1 else ''
             for y in db[i, :, 22:30]
         ]).lower())
     lengths = [len(x) for x in sequences]
     sorted_idxs = np.argsort(lengths)
     lengths.sort()
     sequences = [sequences[i] for i in sorted_idxs]
     labels = [labels[i] for i in sorted_idxs]
     FileUtility.save_list(out + name, [
         '\n'.join([
             ' '.join([elx, labels[idx][idy]])
             for idy, elx in enumerate(list(seq))
         ] + ['']) for idx, seq in enumerate(sequences)
     ])
     db_new = db[sorted_idxs, :, :]
     label_encoding = [[([0] if np.max(row) == 1 else [1]) + row
                        for row in db_new[i, :, 22:30].tolist()]
                       for i in range(0, db.shape[0])]
     np.save(out + name + '_mat_Y', label_encoding)
     db_new = db_new[:, :, c]
     np.save(out + name + '_mat_X', db_new)
     FileUtility.save_list(out + name + '_length.txt',
                           [str(l) for l in lengths])
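The sequence/label decoding above relies on one-hot rows; a toy illustration of that decoding (hypothetical three-letter alphabet), where all-zero padding rows decode to the empty string:

import numpy as np

alphabet = ['A', 'C', 'G']
onehot = np.array([[1, 0, 0],
                   [0, 0, 1],
                   [0, 0, 0]])                     # last row is padding ('NoSeq')
decoded = ''.join(alphabet[np.argmax(r)] if np.max(r) == 1 else '' for r in onehot)
print(decoded)                                     # 'AG'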
 def parallel_crawl(triples, num_p, override=False):
     if not override:
         new_list=[]
         for x,y,z in triples:
             if not FileUtility.exists(y+z):
                 new_list.append((x,y,z))
         triples=new_list
     if len(triples)>0:
         print ('Start parallel crawling..')
         pool = Pool(processes=num_p)
         res=[]
         for x in tqdm.tqdm(pool.imap_unordered(PNGScriptRetrieve, triples, chunksize=num_p),total=len(triples)):
             res.append(x)
         pool.close()
         FileUtility.save_list(triples[0][1]+'log.txt',PNGScriptRetrieve.log)
 def generate_excel(self, filename, settingname):
     '''
     :param filename: path of the output .xlsx file
     :param settingname: sheet name used in the excel file
     :return:
     '''
     df = self.get_pandas_df()
     final_markers = df['marker'].values.tolist()
     FileUtility.save_list(
         filename.replace('.xlsx', '_finalmarker_list.txt').replace(
             'final_outputs', 'intermediate_files/npe_marker_files'),
         final_markers)
     writer = pd.ExcelWriter(filename)
     df.to_excel(writer, settingname)
     writer.save()
Example #15
    def sequential_crawl(triples, override=False):
        '''
        :param triples: crawl jobs; each (args, out_dir, out_name) tuple is passed to BibleCom
        :param override: if False, triples whose output file (out_dir + out_name) already exists are skipped
        :return:
        '''
        if not override:
            new_list=[]
            for x,y,z in triples:
                if not FileUtility.exists(y+z):
                    new_list.append((x,y,z))
            triples=new_list

        print ('Start crawling..')
        for x in tqdm.tqdm(triples):
            BibleCom(x)
        FileUtility.save_list(triples[0][1]+'log.txt',BibleCom.log)
Example #16
 def representation_npe(self):
     '''
     Apply the trained NPE segmentation to all samples and save the resulting representations.
     :return:
     '''
     if self.override == 1 or not DiTaxaWorkflow.exists(
             self.output_directory_inter + 'npe_representation/'):
         print('\t✔ Creating NPE representations ...')
         start = time.time()
         G16s = NPESegmentApplyMetagenomics(
             self.file_directory,
             self.file_extenstion,
             self.output_directory_inter + 'npe_segmentatation/' +
             self.dbname + '_' + '_'.join([
                 'unique',
                 str(self.vocab_size), 'v',
                 str(self.seg_train_depth), 's.model'
             ]),
             sampling_number=self.rep_sampling_depth,
             num_p=self.num_p)
         DiTaxaWorkflow.ensure_dir(self.output_directory_inter +
                                   'npe_representation/')
         G16s.generate_npes_all(
             save=self.output_directory_inter + 'npe_representation/' +
             self.dbname + '_uniquepiece_' + str(self.rep_sampling_depth))
         end = time.time()
         spent = end - start
         print(
             '\t✔ Generating the NPE representations at npe_representation/'
             + self.dbname + '_uniquepiece_' +
             str(self.rep_sampling_depth) + '  ' + str(spent) +
             ' seconds, using ' + str(self.num_p) + ' cores')
         self.log_file.append(
             'Generating the NPE representations at npe_representation/' +
             self.dbname + '_uniquepiece_' + str(self.rep_sampling_depth) +
             '  ' + str(spent) + ' seconds, using ' + str(self.num_p) +
             ' cores')
     else:
         print(
             '\t✔ Representations are already created. Thus, this step is skipped!'
         )
         self.log_file.append(
             'Representations are already created. Thus, this step is skipped!'
         )
     FileUtility.save_list(self.output_directory + 'logfile.txt',
                           self.log_file)
     DiTaxaWorkflow.temp_cleanup()
def tree2mat_group(tree_file, n_group=20):
    '''
    This function maps the phylogenetic tree to an adjacency matrix and performs spectral clustering on it.
    :param tree_file: path to the phylogenetic tree in newick format
    :param n_group: number of spectral clusters
    :return:
    '''
    if not os.path.exists(
            tree_file.replace(
                tree_file.split('/')[-1],
                'phylogenetic_nodes_and_clusters.txt')):
        print('Create phylogenetic information distance and groupings..')
        t = Phylo.read(tree_file, 'newick')
        d = {}
        for x, y in itertools.combinations(t.get_terminals(), 2):
            v = t.distance(x, y)
            d[x.name] = d.get(x.name, {})
            d[x.name][y.name] = v
            d[y.name] = d.get(y.name, {})
            d[y.name][x.name] = v
        for x in t.get_terminals():
            d[x.name][x.name] = 0

        m = pd.DataFrame(d)
        isolates = [x for x in m.axes[0]]
        isolates.sort()
        mat = np.zeros((len(isolates), len(isolates)))
        for x in range(len(isolates)):
            for y in range(len(isolates)):
                mat[x, y] = m[isolates[x]][isolates[y]]
        transferred_mat = np.exp(-mat**2 / (2. * 0.08**2))
        clustering = SpectralClustering(n_clusters=n_group,
                                        assign_labels="kmeans",
                                        random_state=0).fit(transferred_mat)
        np.save(
            tree_file.replace(
                tree_file.split('/')[-1], 'phylogenetic_distance_matrix'),
            transferred_mat)
        FileUtility.save_list(
            tree_file.replace(
                tree_file.split('/')[-1],
                'phylogenetic_nodes_and_clusters.txt'), [
                    '\t'.join([x, str(clustering.labels_[idx])])
                    for idx, x in enumerate(isolates)
                ])
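A small sketch of the distance-to-affinity step on a hypothetical 3x3 distance matrix: tree distances are turned into a Gaussian (RBF) affinity, which SpectralClustering can also consume directly via affinity='precomputed' (the function above passes the transformed matrix to fit() as a feature matrix instead).

import numpy as np
from sklearn.cluster import SpectralClustering

dist = np.array([[0.0, 0.1, 0.9],
                 [0.1, 0.0, 0.8],
                 [0.9, 0.8, 0.0]])                 # toy pairwise tree distances
affinity = np.exp(-dist ** 2 / (2. * 0.08 ** 2))   # same kernel and bandwidth as above
labels = SpectralClustering(n_clusters=2, affinity='precomputed',
                            random_state=0).fit_predict(affinity)
print(labels)                                      # the two close leaves share a cluster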
Example #18
 def convert_to_kmer(input_file, out_file, n=3):
     train = FileUtility.load_list(input_file)
     training_data = [line.split() for line in train]
     final_list = list()
     temp = []
     for x in training_data:
         if x == []:
             final_list.append(temp)
             temp = []
         else:
             temp.append(x)
     res = []
     for prot in final_list:
         sentence = ''.join(['$'] + [aa[0] for aa in prot] + ['#'])
         res += [(sentence[i:i + n], prot[i][1])
                 for i in range(len(sentence) - n + 1)]
         res += ['']
     FileUtility.save_list(out_file, [' '.join(list(x)) for x in res])
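For clarity, this is what the n-gram/label pairing produces on a tiny hypothetical protein (one secondary-structure label per residue, '$' and '#' as boundary markers):

n = 3
prot = [('M', 'h'), ('K', 'h'), ('V', 'e'), ('L', 'e')]        # (residue, label) pairs
sentence = ''.join(['$'] + [aa[0] for aa in prot] + ['#'])     # '$MKVL#'
pairs = [(sentence[i:i + n], prot[i][1]) for i in range(len(sentence) - n + 1)]
print(pairs)   # [('$MK', 'h'), ('MKV', 'h'), ('KVL', 'e'), ('VL#', 'e')]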
Example #19
    def ret_a_book(self, tr_meta):

        isocode, trID, dam_ids = tr_meta

        # store the api call results in json
        file_path = self.output_path + '/api_intermediate/' + '_'.join(
            [isocode, trID]) + '.json'
        f = codecs.open(file_path, 'w', 'utf-8')
        for x in dam_ids:
            response = requests.get('http://dbt.io/library/verse?key=' +
                                    self.key + '&dam_id=' + x + '&v=2')
            f.write(response.content.decode("utf-8") + '\n')
        f.close()

        # read the books
        books = []
        for line in codecs.open(file_path, 'r', 'utf-8'):
            try:
                books.append(json.loads(line))
            except:
                self.to_double_check.append(tr_meta)

        # parse the books
        bible = dict()
        for book in books:
            for rec in book:
                try:
                    key = self.book_map[rec['book_id']] + rec[
                        'chapter_id'].zfill(3) + rec['verse_id'].zfill(3)
                    bible[key] = rec['verse_text'].strip()
                except KeyError as e:
                    pass

        # save the books
        ordered_bible = collections.OrderedDict(sorted(bible.items()))
        bible = ['\t'.join([k, v]) for k, v in ordered_bible.items()]
        if len(bible) > 0:
            FileUtility.save_list(
                self.output_path + '/' + '_'.join([isocode, trID]) +
                '.api.txt', bible)

        return trID, len(bible)
Example #20
    def generate_npes_all(self, save=False, norm=False):
        data = np.zeros(
            (len(self.fasta_files), len(self.npe_vocab))).astype(np.float64)

        # multi processing extraction of npe distributions
        t_steps = []
        s_steps = []
        pool = Pool(processes=self.num_p)
        for ky, (v, t, s) in tqdm.tqdm(
                pool.imap_unordered(self._get_npe_distribution,
                                    self.fasta_files,
                                    chunksize=self.num_p),
                total=len(self.fasta_files)):
            data[self.indexing[ky], :] = v
            t_steps.append(t)
            s_steps.append(s)
        pool.close()
        # normalize the frequencies
        if norm:
            data = normalize(data, axis=1, norm='l1')
        data = sparse.csr_matrix(data)

        if save:
            FileUtility.save_sparse_csr(save, data)
            FileUtility.save_list(save + '_meta', self.fasta_files)
            FileUtility.save_list(save + '_features', self.npe_vocab)
            FileUtility.save_list(save + '_log', [
                ': '.join(['mean_time', str(np.mean(t_steps))]), ': '.join([
                    'std_time', str(np.std(t_steps))
                ]), ': '.join(['mean_size', str(np.mean(s_steps))]), ': '.join(
                    ['std_size', str(np.std(s_steps))])
            ])
        return data
Example #21
 def train_npe(self):
     '''
     Train the NPE segmentation model (SentencePiece BPE), unless the results directory already exists.
     :return:
     '''
     if self.override == 1 or not DiTaxaWorkflow.exists(
             self.output_directory_inter + 'npe_segmentatation/'):
         print('\t✔ Segmentation inference started.. ')
         start = time.time()
         G16s = NPESegmentTrainMetagenomics(self.file_directory,
                                            self.file_extenstion)
         DiTaxaWorkflow.ensure_dir(self.output_directory_inter +
                                   'npe_segmentatation/')
         G16s.generate(self.vocab_size,
                       self.seg_train_depth,
                       self.output_directory_inter + 'npe_segmentatation/' +
                       self.dbname + '_' + '_'.join([
                           'unique',
                           str(self.vocab_size), 'v',
                           str(self.seg_train_depth), 's'
                       ]),
                       backend='Sentencepiece',
                       num_p=self.num_p)
         end = time.time()
         spent = (end - start)
         self.log_file.append('Segmentation inference ' + '_'.join([
             'unique',
             str(self.vocab_size), 'v',
             str(self.seg_train_depth), 's '
         ]) + str(spent) + ' seconds, using ' + str(self.num_p) + ' cores')
     else:
         print(
             '\t✔ Segmentation results directory exists. Thus, the step was bypassed'
         )
         self.log_file.append(
             'Segmentation results directory exists. Thus, the step was bypassed'
         )
     FileUtility.save_list(self.output_directory + 'logfile.txt',
                           self.log_file)
Example #22
    def create_randfold(self, path, cv, test_ratio, phenotype, mapping=None):

        ## find a mapping from strains to the phenotypes
        if mapping:
            mapping_isolate_label = dict(self.get_new_labeling(mapping)[phenotype])
        else:
            mapping_isolate_label = self.phenotype2labeled_strains_mapping[phenotype]

        # get common strains
        list_of_list_of_strains = list(self.strains.values())
        list_of_list_of_strains.append(list(mapping_isolate_label.keys()))
        final_strains = GenotypePhenotypeAccess.get_common_strains(list_of_list_of_strains)
        final_strains.sort()

        # prepare test
        Y = [mapping_isolate_label[strain] for strain in final_strains]
        X_train, X_test, y_train, _ = train_test_split(final_strains, Y, test_size=test_ratio, random_state=0, stratify=Y)
        FileUtility.save_list(path.replace('_folds.txt', '_test.txt'), ['\t'.join(X_test)])

        # prepare train
        spliter=StratifiedKFold(cv)
        folds = ['\t'.join([X_train[x] for x in fold.tolist()]) for _, fold in spliter.split(X_train, y_train)]
        FileUtility.save_list(path, folds)
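A minimal sketch of this split strategy on toy labels: a stratified held-out test set is carved off first, then StratifiedKFold partitions the remaining strains so every fold preserves the class ratio.

from sklearn.model_selection import StratifiedKFold, train_test_split

strains = ['s%d' % i for i in range(10)]
labels = [0, 1] * 5
X_train, X_test, y_train, _ = train_test_split(strains, labels, test_size=0.2,
                                               random_state=0, stratify=labels)
for _, fold_idx in StratifiedKFold(n_splits=2).split(X_train, y_train):
    print([X_train[i] for i in fold_idx])          # class-balanced folds of the training strains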
    def generate_tree(self, path, name):

        path_g = path + '/graphlan_files/'
        FileUtility.ensure_dir(path_g)
        font_map = {1: 15, 2: 14, 3: 13, 4: 12, 5: 8, 6: 7, 7: 4}
        taxonomy = self.get_pandas_df()['taxonomy'].tolist()
        direction = self.get_pandas_df()['direction'].tolist()
        taxlev = self.get_pandas_df()['taxonomylevel'].tolist()

        logpval = [
            round(-np.log(x)) for x in self.get_pandas_df()['pvalue'].tolist()
        ]

        taxonomy = [
            '.'.join(self.refine_ez_taxonomy(x).split(';')) for x in taxonomy
        ]
        tax_freq = dict(FreqDist(taxonomy).most_common())
        logpval_frq = [tax_freq[x] for idx, x in enumerate(taxonomy)]

        #taxonomy=['.'.join(x[0:-1] if isGenomeName(x[-1]) else x) for x in taxonomy]

        dict_color = dict()
        for idx, x in enumerate(direction):
            if len(taxonomy[idx].split('.')) > 5:
                coloring = ('r' if x == '+' else ('b' if x == '-' else 'g'))
                if taxonomy[idx].split('.')[-1] in dict_color:
                    dict_color[taxonomy[idx].split('.')[-1]].append(coloring)
                else:
                    dict_color[taxonomy[idx].split('.')[-1]] = [coloring]

        new_dict_color = dict()
        for tax, colors in dict_color.items():
            freq = FreqDist(colors)
            if freq['r'] / (freq['r'] + freq['b']) > 0.8:
                new_dict_color[tax] = 'r'
            elif freq['b'] / (freq['r'] + freq['b']) > 0.8:
                new_dict_color[tax] = 'b'
            else:
                new_dict_color[tax] = 'w'
        dict_color = new_dict_color

        annot = [
            '\t'.join([
                taxonomy[idx].split('.')[-1], 'annotation_background_color',
                dict_color[taxonomy[idx].split('.')[-1]]
            ]) for idx, x in enumerate(direction)
            if len(taxonomy[idx].split('.')) > 5
        ]

        #annot=['\t'.join([taxonomy[idx].split('.')[-1],'annotation_background_color',('r' if x=='+' else ('b' if x=='-' else 'g'))])  for idx, x in enumerate(direction) if len(taxonomy[idx].split('.'))>5]
        annot = annot + [
            '\t'.join([
                taxonomy[idx].split('.')[-1], 'annotation_background_color',
                'w'
            ]) for idx, x in enumerate(direction)
            if len(taxonomy[idx].split('.')) == 5
        ]
        annot = annot + [
            '\t'.join([
                taxonomy[idx].split('.')[-1], 'annotation',
                taxonomy[idx].split('.')[-1]
            ]) for idx, x in enumerate(direction)
            if len(taxonomy[idx].split('.')) > 5
            if not dict_color[taxonomy[idx].split('.')[-1]] == 'w'
        ]

        #annot=annot+['\t'.join([taxonomy[idx].split('.')[-1],'annotation_background_color','purple'])  for idx, x in enumerate(direction) if len(taxonomy[idx].split('.'))>5]

        ## OUTER RINGS
        annot = annot + [
            '\t'.join([
                taxonomy[idx].split('.')[1], 'annotation',
                taxonomy[idx].split('.')[1]
            ]) for idx, x in enumerate(direction)
            if len(taxonomy[idx].split('.')) > 1
        ]
        annot = annot + [
            '\t'.join(
                [taxonomy[idx].split('.')[1], 'annotation_rotation',
                 str(1)]) for idx, x in enumerate(direction)
            if len(taxonomy[idx].split('.')) > 1
        ]
        annot = annot + [
            '\t'.join(
                [taxonomy[idx].split('.')[1], 'annotation_font_size',
                 str(9)]) for idx, x in enumerate(direction)
            if len(taxonomy[idx].split('.')) > 1
        ]
        annot = annot + [
            '\t'.join([
                taxonomy[idx].split('.')[1], 'annotation_background_color',
                '#eedbfc'
            ]) for idx, x in enumerate(direction)
            if len(taxonomy[idx].split('.')) > 1
        ]

        ## Clades
        annot = annot + [
            '\t'.join([
                taxonomy[idx].split('.')[-1], 'clade_marker_size',
                str(logpval_frq[idx])
            ]) for idx, x in enumerate(direction)
            if len(taxonomy[idx].split('.')) > 5
            if not dict_color[taxonomy[idx].split('.')[-1]] == 'w'
        ]
        annot = annot + [
            '\t'.join([
                taxonomy[idx].split('.')[-1], 'clade_marker_edge_width',
                str(logpval[idx])
            ]) for idx, x in enumerate(direction)
            if len(taxonomy[idx].split('.')) > 5
            if not dict_color[taxonomy[idx].split('.')[-1]] == 'w'
        ]

        annot = annot + [
            '\t'.join(
                [taxonomy[idx].split('.')[-1], 'annotation_rotation',
                 str(1)]) for idx, x in enumerate(direction)
            if len(taxonomy[idx].split('.')) > 5
            if not dict_color[taxonomy[idx].split('.')[-1]] == 'w'
        ]
        annot = annot + [
            '\t'.join([
                taxonomy[idx].split('.')[-1], 'annotation_font_size',
                str(font_map[taxlev[idx]])
            ]) for idx, x in enumerate(direction)
            if len(taxonomy[idx].split('.')) > 5
            if not dict_color[taxonomy[idx].split('.')[-1]] == 'w'
        ]
        annot = annot + ['annotation_background_offset\t0.5']
        annot = annot + ['clade_marker_edge_color\t#4f1a49']
        annot = annot + ['branch_color\t#4f1a49']
        annot = annot + ['annotation_background_separation\t-0.01']
        annot = annot + ['annotation_background_width\t0.2']

        #https://bitbucket.org/nsegata/graphlan/src/default/readme.txt?fileviewer=file-view-default
        #asgari@epsilon1:/mounts/data/proj/asgari/dissertation/libraries/graphlan$ python graphlan_annotate.py --annot ../annot.txt ../test.txt  ../new.xml
        #asgari@epsilon1:/mounts/data/proj/asgari/dissertation/libraries/graphlan$ python graphlan.py ../new.xml image_name.pdf --dpi 1000 --size 15 --external_legends
        taxonomy = [
            x for x in taxonomy if len(x.split('.')) > 5
            if not dict_color[x.split('.')[-1]] == 'w'
        ]

        FileUtility.save_list(path_g + name + '_taxonomy.txt', taxonomy)
        FileUtility.save_list(path_g + name + '_annot.txt', annot)

        subprocess.call("python3 graphlan/graphlan_annotate.py --annot " +
                        path_g + name + '_annot.txt' + " " + path_g + name +
                        '_taxonomy.txt' + "  " + path_g + name + '.xml',
                        shell=True)
        subprocess.call("python3 graphlan/graphlan.py " + path_g + name +
                        '.xml' + " " + path + name +
                        '.pdf --dpi 1000 --size 15 --external_legends',
                        shell=True)
        try:
            FileUtility.remove(path + name + '_legend.pdf')
        except:
            print('')
Example #24
    def biomarker_extraction(self,
                             labeler,
                             label_mapper,
                             name_setting,
                             p_value_threshold=0.05,
                             pos_label=None,
                             neg_label=None):
        '''
        Detect discriminative NPE markers for the given labeling and run their taxonomic analysis.
        :return:
        '''
        print('npe marker detection started')
        DiTaxaWorkflow.blockPrint()
        start = time.time()
        rep_base_path = self.output_directory + 'npe_representation/' + self.dbname + '_uniquepiece_' + str(
            self.rep_sampling_depth)
        filenames = [
            x.split('/')[-1]
            for x in FileUtility.load_list(rep_base_path + '_meta')
        ]

        # CHECK EXISTING LABELS
        if callable(labeler):
            selected_samples = [
                idx for idx, file in enumerate(filenames)
                if labeler(file) in label_mapper
            ]
        else:
            selected_samples = [
                idx for idx, file in enumerate(filenames)
                if labeler[file] in label_mapper
            ]

        if callable(labeler):
            Y = [
                str(label_mapper[labeler(filenames[sample_id])])
                for sample_id in selected_samples
            ]
        else:
            Y = [
                str(label_mapper[labeler[filenames[sample_id]]])
                for sample_id in selected_samples
            ]

        FileUtility.save_list(rep_base_path + '_' + name_setting + '_Y.txt', Y)
        DiTaxaWorkflow.ensure_dir(self.output_directory + 'npe_marker_files/')
        G16s = NPEMarkerDetection(
            rep_base_path + '.npz',
            rep_base_path + '_' + name_setting + '_Y.txt',
            rep_base_path + '_features',
            self.output_directory + 'npe_marker_files/' + name_setting,
            selected_samples)
        G16s.extract_markers()
        end = time.time()
        spent = end - start
        self.log_file.append('biomarker extraction ' + name_setting + '  ' +
                             str(spent) + ' seconds, using ' +
                             str(self.num_p) + ' cores')
        FileUtility.save_list(self.output_directory + 'logfile.txt',
                              self.log_file)
        DiTaxaWorkflow.enablePrint()
        print('npe marker taxonomic detection started')
        start = time.time()

        if callable(labeler):
            phenotypes = [
                labeler(filenames[sample_id]) for sample_id in selected_samples
            ]
        else:
            phenotypes = [
                labeler[filenames[sample_id]] for sample_id in selected_samples
            ]

        fasta_file = self.output_directory + 'npe_marker_files/' + name_setting + '_chi2_relative.fasta'
        matrix_path = rep_base_path + '.npz'
        feature_file_path = rep_base_path + '_features'

        if len(FileUtility.read_fasta_sequences(fasta_file)) > 2000:
            remove_redundants = False
        else:
            remove_redundants = True

        Final_OBJ = NPEMarkerAnlaysis(fasta_file,
                                      matrix_path,
                                      feature_file_path,
                                      phenotypes,
                                      label_mapper,
                                      selected_samples,
                                      p_value_threshold=p_value_threshold,
                                      remove_redundants=remove_redundants,
                                      num_p=self.num_p)
        end = time.time()
        spent = end - start
        DiTaxaWorkflow.ensure_dir(self.output_directory + 'final_outputs/')
        FileUtility.save_obj(
            self.output_directory + 'final_outputs/' + name_setting, Final_OBJ)
        Final_OBJ.generate_tree(self.output_directory + 'final_outputs/',
                                name_setting)
        self.log_file.append('blasting extraction ' + name_setting + '  ' +
                             str(spent) + ' seconds, using ' +
                             str(self.num_p) + ' cores')
        FileUtility.save_list(self.output_directory + 'logfile.txt',
                              self.log_file)
        if pos_label and neg_label:
            Final_OBJ.generate_heatmap(self.output_directory +
                                       'final_outputs/' + name_setting +
                                       '_heatmap',
                                       pos_label=pos_label,
                                       neg_label=neg_label)
Example #25
 def tolower(file):
     lines = [l.lower() for l in FileUtility.load_list(file)]
     FileUtility.save_list(file + 'new', lines)
    def create_read_tabular_file(path,
                                 save_pref='_',
                                 feature_normalization=None,
                                 transpose=False,
                                 override=False):
        '''
        :param path: path of the tab-separated input table
        :param save_pref: prefix used for the saved output files
        :param transpose: if isolates are columns
        :param feature_normalization: 'binary': {0,1}, '01': [0-1],  'percent': {0,1,..,100}, 'zu': zero mean, unit variance
        :param override: recreate the outputs even if they already exist
        :return:
        '''
        print('Start creating ', save_pref)
        if override or not os.path.exists('_'.join(
            [save_pref, 'feature', 'vect.npz'])):
            rows = [
                l.strip() for l in codecs.open(path, 'r', 'utf-8').readlines()
            ]
            tf_vec = sparse.csr_matrix([[
                GenotypeReader.get_float_or_zero(x)
                for x in entry.split('\t')[1::]
            ] for entry in rows[1::]])

            if transpose:
                tf_vec = sparse.csr_matrix(tf_vec.toarray().T)
                isolates = [
                    feat.replace(' ', '')
                    for feat in rows[0].rstrip().split('\t')
                ]
                feature_names = [row.split()[0] for row in rows[1::]]
            else:
                isolates = [row.split()[0] for row in rows[1::]]
                feature_names = [
                    feat.replace(' ', '')
                    for feat in rows[0].rstrip().split('\t')
                ]

            # normalizer / discretizer
            if feature_normalization:
                if feature_normalization == 'binary':
                    tf_vec = np.round(MaxAbsScaler().fit_transform(tf_vec))

                elif feature_normalization == '01':
                    tf_vec = MaxAbsScaler().fit_transform(tf_vec)
                elif feature_normalization == 'percent':
                    tf_vec = np.round(MaxAbsScaler().fit_transform(tf_vec) *
                                      100)
                elif feature_normalization == 'zu':
                    tf_vec = sparse.csr_matrix(
                        preprocessing.StandardScaler().fit_transform(
                            tf_vec.toarray()))

                FileUtility.save_sparse_csr(
                    '_'.join([save_pref, 'feature', 'vect.npz']), tf_vec)
                FileUtility.save_list(
                    '_'.join([save_pref, 'feature', 'list.txt']),
                    feature_names)
                FileUtility.save_list(
                    '_'.join([save_pref, 'strains', 'list.txt']), isolates)
                print(save_pref, ' created successfully containing ',
                      str(len(isolates)), ' strains and ',
                      str(len(feature_names)), ' features')
                return (''.join([
                    save_pref, ' created successfully containing ',
                    str(len(isolates)), ' strains and ',
                    str(len(feature_names)), ' features'
                ]))
        else:
            print(save_pref, ' already exists')
            return (''.join([save_pref, ' already exists']))
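A sketch of the feature_normalization options on a toy sparse count matrix (hypothetical values): MaxAbsScaler rescales each column by its maximum absolute value, and the 'binary' / 'percent' variants are simple roundings of that result.

import numpy as np
from scipy import sparse
from sklearn.preprocessing import MaxAbsScaler

tf_vec = sparse.csr_matrix([[0, 2, 10],
                            [4, 0, 5]])
scaled = MaxAbsScaler().fit_transform(tf_vec)      # '01': each column scaled to [0, 1]
print(np.round(scaled.toarray(), 2))
print(np.round(scaled.toarray()))                  # 'binary': values rounded to {0, 1}
print(np.round(scaled.toarray() * 100))            # 'percent': values in {0, ..., 100}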
Example #27
    def read_data(self):

        self.xmldoc = minidom.parse(self.genml_path)

        # parse project part
        self.project = self.xmldoc.getElementsByTagName('project')
        self.output = self.project[0].attributes['output'].value
        self.project_name = self.project[0].attributes['name'].value

        if self.override and os.path.exists(self.output):
            var = input("Delete existing files at the output path? (y/n)")
            if var == 'y':
                shutil.rmtree(self.output)
        if not os.path.exists(self.output):
            os.makedirs(self.output)

        log_file = self.output + '/' + 'logfile'
        log_info = ['Project ' + self.project_name]


        self.representation_path = self.output + '/intermediate_rep/'
        IC = IntermediateRepCreate(self.representation_path)

        # load tables
        tabless = self.xmldoc.getElementsByTagName('tables')
        for tables in tabless:
            path = tables.attributes['path'].value
            normalization = tables.attributes['normalization'].value
            prefix = tables.firstChild.nodeValue.strip() + '_'
            if len(prefix) == 1:
                prefix = ''
            for file in FileUtility.recursive_glob(path, '*.uniq.mat'):
                log=IC.create_table(file, prefix + file.split('/')[-1], normalization, self.override)
                log_info.append(log)

        tables = self.xmldoc.getElementsByTagName('table')
        for table in tables:
            path = table.attributes['path'].value
            normalization = table.attributes['normalization'].value
            prefix = table.firstChild.nodeValue.strip()
            log=IC.create_table(path, prefix + path.split('/')[-1] if prefix=='' else prefix, normalization, self.override)
            log_info.append(log)

        # load sequences
        sequences = self.xmldoc.getElementsByTagName('sequence')
        for sequence in sequences:
            path = sequence.attributes['path'].value
            kmer = int(sequence.attributes['kmer'].value)
            log=IC.create_kmer_table(path,kmer,cores=min(self.cores,4),override=self.override)
            log_info.append(log)

        ## Adding metadata
        self.metadata_path = self.output + '/metadata/'
        if not os.path.exists(self.metadata_path):
            os.makedirs(self.metadata_path)
        # phenotype
        phenotype = self.xmldoc.getElementsByTagName('phenotype')
        if not os.path.exists(self.metadata_path + 'phenotypes.txt') or self.override:
            FileUtility.save_list(self.metadata_path + 'phenotypes.txt',
                                  FileUtility.load_list(phenotype[0].attributes['path'].value))

        # tree
        phylogentictree = self.xmldoc.getElementsByTagName('phylogentictree')
        if not os.path.exists(self.metadata_path + 'phylogentictree.txt') or self.override:
            FileUtility.save_list(self.metadata_path + 'phylogentictree.txt',
                                  FileUtility.load_list(phylogentictree[0].attributes['path'].value))
        tree2mat_group(self.metadata_path + 'phylogentictree.txt',n_group=20)

        FileUtility.save_list(log_file, log_info)
Example #28
    def biomarker_extraction(self,
                             labeler,
                             label_mapper,
                             phenoname,
                             p_value_threshold=0.05,
                             pos_label=None,
                             neg_label=None,
                             excel=0):
        '''
        Detect discriminative NPE markers for the given labeling, align them taxonomically, and produce the final outputs.
        :return:
        '''
        print('\t✔ NPE marker detection started..')
        start = time.time()
        rep_base_path = self.output_directory_inter + 'npe_representation/' + self.dbname + '_uniquepiece_' + str(
            self.rep_sampling_depth)
        filenames = [
            x.split('/')[-1]
            for x in FileUtility.load_list(rep_base_path + '_meta')
        ]

        # CHECK EXISTING LABELS
        if callable(labeler):
            selected_samples = [
                idx for idx, file in enumerate(filenames)
                if labeler(file) in label_mapper
            ]
        else:
            selected_samples = [
                idx for idx, file in enumerate(filenames)
                if labeler[file] in label_mapper
            ]

        if callable(labeler):
            Y = [
                str(label_mapper[labeler(filenames[sample_id])])
                for sample_id in selected_samples
            ]
        else:
            Y = [
                str(label_mapper[labeler[filenames[sample_id]]])
                for sample_id in selected_samples
            ]

        FileUtility.save_list(rep_base_path + '_' + phenoname + '_Y.txt', Y)
        DiTaxaWorkflow.ensure_dir(self.output_directory_inter +
                                  'npe_marker_files/')

        if self.override == 1 or not DiTaxaWorkflow.exists(
                self.output_directory_inter + 'npe_marker_files/' +
                '_'.join([phenoname, 'chi2_relative.fasta'])):
            with warnings.catch_warnings():
                warnings.simplefilter("ignore")
                G16s = NPEMarkerDetection(
                    rep_base_path + '.npz',
                    rep_base_path + '_' + phenoname + '_Y.txt',
                    rep_base_path + '_features', self.output_directory_inter +
                    'npe_marker_files/' + phenoname, selected_samples)
                G16s.extract_markers()

            end = time.time()
            spent = end - start
            print('\t✔ biomarker extraction ' + phenoname + '  ' + str(spent) +
                  ' seconds, using ' + str(self.num_p) + ' cores')
            self.log_file.append('biomarker extraction ' + phenoname + '  ' +
                                 str(spent) + ' seconds, using ' +
                                 str(self.num_p) + ' cores')
        else:
            print(
                '\t✔ Biomarkers are already extracted. Thus, the statistical test was bypassed'
            )
            self.log_file.append(
                'Biomarkers are already extracted. Thus, the statistical test was bypassed'
            )

        FileUtility.save_list(self.output_directory + 'logfile.txt',
                              self.log_file)

        print('\t✔ Taxonomic assignment of the markers..')

        if callable(labeler):
            phenotypes = [
                labeler(filenames[sample_id]) for sample_id in selected_samples
            ]
        else:
            phenotypes = [
                labeler[filenames[sample_id]] for sample_id in selected_samples
            ]

        fasta_file = self.output_directory_inter + 'npe_marker_files/' + phenoname + '_chi2_relative.fasta'
        matrix_path = rep_base_path + '.npz'
        feature_file_path = rep_base_path + '_features'

        if len(FileUtility.read_fasta_sequences(fasta_file)) > 2000:
            remove_redundants = False
        else:
            remove_redundants = True

        FileUtility.ensure_dir(self.output_directory +
                               'final_outputs/save_states/')
        if self.override == 1 or not DiTaxaWorkflow.exists(
                self.output_directory + 'final_outputs/save_states/' +
                phenoname + '.pickle'):
            start = time.time()
            Final_OBJ = NPEMarkerAnlaysis(fasta_file,
                                          matrix_path,
                                          feature_file_path,
                                          phenotypes,
                                          label_mapper,
                                          selected_samples,
                                          p_value_threshold=p_value_threshold,
                                          remove_redundants=remove_redundants,
                                          num_p=self.num_p,
                                          blastn_path=self.blastn_path)
            end = time.time()
            spent = end - start
            DiTaxaWorkflow.ensure_dir(self.output_directory + 'final_outputs/')
            FileUtility.save_obj(
                self.output_directory + 'final_outputs/save_states/' +
                phenoname, Final_OBJ)
            print('\t✔ Marker analysis and alignment ' + phenoname + '  ' +
                  str(spent) + ' seconds, using ' + str(self.num_p) + ' cores')
            self.log_file.append('Marker analysis and alignment ' + phenoname +
                                 '  ' + str(spent) + ' seconds, using ' +
                                 str(self.num_p) + ' cores')
        else:
            Final_OBJ = FileUtility.load_obj(self.output_directory +
                                             'final_outputs/save_states/' +
                                             phenoname + '.pickle')
            print('\t✔ The aligned markers already existed and are loaded!')
            self.log_file.append(
                'The aligned markers already existed and are loaded!')
        FileUtility.save_list(self.output_directory + 'logfile.txt',
                              self.log_file)

        # generating the tree
        Final_OBJ.generate_tree(self.output_directory + 'final_outputs/',
                                phenoname)

        # representation/marker paths used by both the excel and the heatmap/t-SNE branches below
        X_addr = self.output_directory_inter + 'npe_representation/' + self.dbname + '_uniquepiece_' + str(
            self.rep_sampling_depth) + '.npz'
        feature_addr = self.output_directory_inter + 'npe_representation/' + self.dbname + '_uniquepiece_' + str(
            self.rep_sampling_depth) + '_features'
        markers = self.output_directory_inter + 'npe_marker_files/' + phenoname + '_finalmarker_list.txt'
        Y = self.output_directory_inter + 'npe_representation/' + self.dbname + '_uniquepiece_' + str(
            self.rep_sampling_depth) + '_' + phenoname + "_Y.txt"

        if excel == 1:
            print('\t✔ Creating marker excel file..')
            Final_OBJ.generate_excel(
                self.output_directory + 'final_outputs/' + phenoname + '.xlsx',
                phenoname)
            print('\t✔ Creating t-sne plot..')
            DiTaxaWorkflow.plot_res(self.output_directory + 'final_outputs/' +
                                    phenoname + '_tsne.pdf',
                                    X_addr,
                                    feature_addr,
                                    markers,
                                    Y,
                                    labels=['Negative', 'Positive'])

        if pos_label and neg_label:
            print('\t✔ Creating marker heatmap..')
            Final_OBJ.update_matrix_by_markers_N()
            Final_OBJ.generate_heatmap(self.output_directory +
                                       'final_outputs/' + phenoname +
                                       '_heatmap',
                                       pos_label=pos_label,
                                       neg_label=neg_label)
            if not excel == 1:
                print('\t✔ Creating t-sne plot..')
                DiTaxaWorkflow.plot_res(self.output_directory +
                                        'final_outputs/' + phenoname +
                                        '_tsne.pdf',
                                        X_addr,
                                        feature_addr,
                                        markers,
                                        Y,
                                        labels=[neg_label, pos_label])
        DiTaxaWorkflow.temp_cleanup()
        print(
            '\t⬛ Marker detection and analysis completed. You can find the results at '
            + self.output_directory +
            ', in particular in the final_outputs subdirectory.')
    def generate_tree_comparative(self,
                                  pos_file,
                                  neg_file,
                                  path,
                                  name,
                                  highlight_up=None,
                                  highlight_down=None):

        font_map = {
            -2: 30,
            -1: 25,
            1: 15,
            2: 14,
            3: 13,
            4: 12,
            5: 8,
            6: 7,
            7: 4
        }
        taxonomy = self.get_pandas_df()['taxonomy'].tolist()
        direction = self.get_pandas_df()['direction'].tolist()
        taxlev = self.get_pandas_df()['taxonomylevel'].tolist()

        logpval = [
            round(-np.log(x)) for x in self.get_pandas_df()['pvalue'].tolist()
        ]
        taxonomy = [
            '.'.join(self.refine_ez_taxonomy(x).split(';')) for x in taxonomy
        ]
        tax_freq = dict(FreqDist(taxonomy).most_common())
        logpval_frq = [tax_freq[x] for idx, x in enumerate(taxonomy)]

        dict_color_ditaxa = dict()
        for idx, x in enumerate(direction):
            if len(taxonomy[idx].split('.')) >= 5:
                coloring = ('r' if x == '+' else ('b' if x == '-' else 'g'))
                if taxonomy[idx].split('.')[-1] in dict_color_ditaxa:
                    dict_color_ditaxa[taxonomy[idx].split('.')[-1]].append(
                        coloring)
                else:
                    dict_color_ditaxa[taxonomy[idx].split('.')[-1]] = [
                        coloring
                    ]
        dict_color_ditaxa = self.purify_tax_color(dict_color_ditaxa)

        pos_tax = FileUtility.load_list(pos_file)
        neg_tax = FileUtility.load_list(neg_file)

        dict_color_lefse = dict()
        for pos in pos_tax:
            if len(pos.split('.')) >= 5:
                if pos.split('.')[-1] in dict_color_lefse:
                    dict_color_lefse[pos.split('.')[-1]].append('r')
                else:
                    dict_color_lefse[pos.split('.')[-1]] = ['r']
        for taxonomy_lefse in neg_tax:
            if len(taxonomy_lefse.split('.')) >= 5:
                if taxonomy_lefse.split('.')[-1] in dict_color_lefse:
                    dict_color_lefse[taxonomy_lefse.split('.')[-1]].append('b')
                else:
                    dict_color_lefse[taxonomy_lefse.split('.')[-1]] = ['b']

        dict_color_lefse = self.purify_tax_color(dict_color_lefse)

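        # merge the two colorings: orange = positive in both methods, cyan = negative in
        # both, red/blue = DiTaxa-only, yellow/green = external-only, black = conflicting
        # directions; taxa left as 'w' (white) are skipped when building the annotations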
        final_dict = dict()

        for taxa, color in dict_color_ditaxa.items():
            if taxa in dict_color_lefse:
                if dict_color_ditaxa[taxa] == dict_color_lefse[
                        taxa] and dict_color_lefse[taxa] == 'r':
                    final_dict[taxa] = 'orange'
                elif dict_color_ditaxa[taxa] == dict_color_lefse[
                        taxa] and dict_color_lefse[taxa] == 'b':
                    final_dict[taxa] = 'cyan'
                elif dict_color_ditaxa[taxa] == dict_color_lefse[taxa]:
                    final_dict[taxa] = 'w'
                elif dict_color_ditaxa[taxa] == 'w':
                    final_dict[taxa] = dict_color_lefse[taxa]
                elif dict_color_lefse[taxa] == 'w':
                    final_dict[taxa] = dict_color_ditaxa[taxa]
                else:
                    final_dict[taxa] = 'black'
            else:
                final_dict[taxa] = dict_color_ditaxa[taxa]

        for taxa, color in dict_color_lefse.items():
            if taxa not in dict_color_ditaxa:
                if color == 'r':
                    final_dict[taxa] = 'yellow'
                elif color == 'b':
                    final_dict[taxa] = 'green'
                else:
                    final_dict[taxa] = 'w'

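        # when expected positive/negative taxa are provided, record which of them were
        # recovered with the expected direction ('correct') and which with the opposite
        # one ('wrong_dir'); taxa confirmed by both methods (orange/cyan) are also added
        # to 'correct' and get enlarged annotation fonts further below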
        if highlight_up and highlight_down:
            correct = []
            wrong_dir = []
            for x in highlight_up:
                if x in final_dict:
                    if final_dict[x] == 'r' or final_dict[x] == 'orange':
                        correct.append(x)
                    elif not final_dict[x] == 'w':
                        wrong_dir.append(x)
                # else:
                #     for y,res in final_dict.items():
                #         if x.lower() in y.lower():
                #             if final_dict[y]=='r' or final_dict[y]=='orange':
                #                 correct.append(x)
                #             elif not final_dict[y]=='w':
                #                 wrong_dir.append(x)

            for x in highlight_down:
                if x in final_dict:
                    if final_dict[x] == 'b' or final_dict[x] == 'cyan':
                        correct.append(x)
                    elif not final_dict[x] == 'w':
                        wrong_dir.append(x)
            for i, j in final_dict.items():
                if j == 'cyan' or j == 'orange':
                    correct.append(i)
            correct = list(set(correct))
            # else:
            #     for y,res in final_dict.items():
            #         if x.lower() in y.lower():
            #             if final_dict[y]=='b' or final_dict[y]=='cyan':
            #                 correct.append(x)
            #             elif not final_dict[y]=='w':
            #                 wrong_dir.append(x)

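        # rebuild the dot-joined taxonomy strings and their frequency counts
        # (as above) before assembling the GraPhlAn annotation lines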
        taxonomy = [
            '.'.join(self.refine_ez_taxonomy(x).split(';')) for x in taxonomy
        ]
        tax_freq = dict(FreqDist(taxonomy).most_common())
        logpval_frq = [tax_freq[x] for x in taxonomy]

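        # GraPhlAn annotation directives are tab-separated "<taxon> <option> <value>" lines:
        # leaf taxa with a non-white color get a background color and a text label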
        #taxonomy=['.'.join(x[0:-1] if isGenomeName(x[-1]) else x) for x in taxonomy]
        annot = [
            '\t'.join([
                taxonomy[idx].split('.')[-1], 'annotation_background_color',
                final_dict[taxonomy[idx].split('.')[-1]]
            ]) for idx, x in enumerate(direction)
            if len(taxonomy[idx].split('.')) > 5 and (
                not final_dict[taxonomy[idx].split('.')[-1]] == 'w')
        ]
        annot = annot + [
            '\t'.join([
                taxonomy[idx].split('.')[-1], 'annotation_background_color',
                'w'
            ]) for idx, x in enumerate(direction)
            if len(taxonomy[idx].split('.')) == 5
        ]
        annot = annot + [
            '\t'.join([
                taxonomy[idx].split('.')[-1], 'annotation',
                taxonomy[idx].split('.')[-1]
            ]) for idx, x in enumerate(direction)
            if len(taxonomy[idx].split('.')) > 5 and
            (not final_dict[taxonomy[idx].split('.')[-1]] == 'w')
        ]

        annot = annot + [
            '\t'.join([
                pos.split('.')[-1], 'annotation_background_color',
                final_dict[pos.split('.')[-1]]
            ]) for idx, pos in enumerate(pos_tax)
        ]
        annot = annot + [
            '\t'.join([pos.split('.')[-1], 'annotation',
                       pos.split('.')[-1]])
            for idx, pos in enumerate(pos_tax) if len(pos.split('.')) > 5 and
            (not final_dict[pos.split('.')[-1]] == 'w')
        ]

        annot = annot + [
            '\t'.join([
                neg.split('.')[-1], 'annotation_background_color',
                final_dict[neg.split('.')[-1]]
            ]) for idx, neg in enumerate(neg_tax)
        ]
        annot = annot + [
            '\t'.join([neg.split('.')[-1], 'annotation',
                       neg.split('.')[-1]])
            for idx, neg in enumerate(neg_tax) if len(neg.split('.')) > 5 and
            (not final_dict[neg.split('.')[-1]] == 'w')
        ]

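        # leaf-level (last-field) names from the external negative/positive lists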
        lneg = [neg.split('.')[-1] for idx, neg in enumerate(neg_tax)]
        lpos = [pos.split('.')[-1] for idx, pos in enumerate(pos_tax)]

        ## OUTER RINGS: annotate the second taxonomy field (typically phylum) as rotated ring labels with a shared background color
        annot = annot + [
            '\t'.join([
                taxonomy[idx].split('.')[1], 'annotation',
                taxonomy[idx].split('.')[1]
            ]) for idx, x in enumerate(direction)
            if len(taxonomy[idx].split('.')) > 1
        ]
        annot = annot + [
            '\t'.join(
                [taxonomy[idx].split('.')[1], 'annotation_rotation',
                 str(1)]) for idx, x in enumerate(direction)
            if len(taxonomy[idx].split('.')) > 1
        ]
        annot = annot + [
            '\t'.join(
                [taxonomy[idx].split('.')[1], 'annotation_font_size',
                 str(9)]) for idx, x in enumerate(direction)
            if len(taxonomy[idx].split('.')) > 1
        ]
        annot = annot + [
            '\t'.join([
                taxonomy[idx].split('.')[1], 'annotation_background_color',
                '#eedbfc'
            ]) for idx, x in enumerate(direction)
            if len(taxonomy[idx].split('.')) > 1
        ]

        ## Clades: marker size from taxonomy frequency, edge width from -log(p-value), plus rotated labels sized by taxonomy level
        annot = annot + [
            '\t'.join([
                taxonomy[idx].split('.')[-1], 'clade_marker_size',
                str(logpval_frq[idx])
            ]) for idx, x in enumerate(direction)
            if len(taxonomy[idx].split('.')) > 5 and
            (not final_dict[taxonomy[idx].split('.')[-1]] == 'w')
        ]
        annot = annot + [
            '\t'.join([
                taxonomy[idx].split('.')[-1], 'clade_marker_edge_width',
                str(logpval[idx])
            ]) for idx, x in enumerate(direction)
            if len(taxonomy[idx].split('.')) > 5 and
            (not final_dict[taxonomy[idx].split('.')[-1]] == 'w')
        ]

        annot = annot + [
            '\t'.join(
                [taxonomy[idx].split('.')[-1], 'annotation_rotation',
                 str(1)]) for idx, x in enumerate(direction)
            if len(taxonomy[idx].split('.')) > 5
        ]
        annot = annot + [
            '\t'.join([
                taxonomy[idx].split('.')[-1], 'annotation_font_size',
                str(font_map[taxlev[idx]])
            ]) for idx, x in enumerate(direction)
            if len(taxonomy[idx].split('.')) > 5
        ]

        if highlight_up and highlight_down:
            for taxon in correct:
                if '_' in taxon:
                    annot = annot + [
                        '\t'.join([taxon, 'annotation_font_size', '25'])
                    ]
                else:
                    annot = annot + [
                        '\t'.join([taxon, 'annotation_font_size', '30'])
                    ]

        annot = annot + ['annotation_background_offset\t0.5']
        annot = annot + ['clade_marker_edge_color\t#4f1a49']
        annot = annot + ['branch_color\t#4f1a49']
        annot = annot + ['annotation_background_separation\t-0.01']
        annot = annot + ['annotation_background_width\t0.2']

        # GraPhlAn reference: https://bitbucket.org/nsegata/graphlan/src/default/readme.txt?fileviewer=file-view-default
        # example invocation (from the graphlan directory):
        #   python graphlan_annotate.py --annot ../annot.txt ../test.txt ../new.xml
        #   python graphlan.py ../new.xml image_name.pdf --dpi 1000 --size 15 --external_legends
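        # keep only the leaf taxa that received a non-white color as the tree input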
        taxonomy = [
            x for x in taxonomy if len(x.split('.')) > 5 and (
                not final_dict[x.split('.')[-1]] == 'w')
        ]
        taxonomy += [
            x for x in pos_tax + neg_tax if len(x.split('.')) > 5 and (
                not final_dict[x.split('.')[-1]] == 'w')
        ]

        FileUtility.save_list(path + name + '_taxonomy.txt', taxonomy)
        FileUtility.save_list(path + name + '_annot.txt', annot)

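        # first attach the annotations to the tree, then render the circular tree to PDF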
        subprocess.call("python3 graphlan/graphlan_annotate.py --annot " +
                        path + name + '_annot.txt' + " " + path + name +
                        '_taxonomy.txt' + "  " + path + name + '.xml',
                        shell=True)
        subprocess.call("python3 graphlan/graphlan.py " + path + name +
                        '.xml' + " " + path + name +
                        '.pdf --dpi 1000 --size 15 --external_legends',
                        shell=True)
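
    # A minimal usage sketch (the object construction, class name, and file paths below
    # are assumptions for illustration, not part of this snippet):
    #
    #   analyser.generate_tree_comparative('lefse_pos_taxa.txt', 'lefse_neg_taxa.txt',
    #                                      'results/', 'comparative_tree',
    #                                      highlight_up=['GenusA'],
    #                                      highlight_down=['GenusB'])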