def generate_kmers_all(self, k, save=False):
    '''
    :param k: k-mer length
    :param save: path prefix for saving the results, or False to skip saving
    :return: sample-by-k-mer matrix (CSR, L1-normalized)
    '''
    self.k = k
    self.vocab = [''.join(xs) for xs in itertools.product('atcg', repeat=k)]
    self.vocab.sort()
    self.vectorizer = TfidfVectorizer(use_idf=False, vocabulary=self.vocab, analyzer='char',
                                      ngram_range=(k, k), norm=None, stop_words=[],
                                      lowercase=True, binary=False)
    data = np.zeros((len(self.fasta_files), len(self.vocab))).astype(np.float64)
    # multiprocessing extraction of k-mer distributions
    t_steps = []
    s_steps = []
    pool = Pool(processes=self.num_p)
    for ky, (v, t, s) in tqdm.tqdm(pool.imap_unordered(self.get_kmer_distribution,
                                                       self.fasta_files, chunksize=1),
                                   total=len(self.fasta_files)):
        data[self.indexing[ky], :] = v
        t_steps.append(t)
        s_steps.append(s)
    pool.close()  # bug fix: the pool was never closed
    # normalize the frequencies
    data = normalize(data, axis=1, norm='l1')
    data = sparse.csr_matrix(data)
    if save:
        FileUtility.save_sparse_csr(save, data)
        FileUtility.save_list(save + '_meta', self.fasta_files)
        # ': '.join(['mean_time', str(np.mean(t_steps))]), ': '.join(['std_time', str(np.std(t_steps))])
        FileUtility.save_list(save + '_log',
                              [': '.join(['mean_size', str(np.mean(s_steps))]),
                               ': '.join(['std_size', str(np.std(s_steps))])])
    return data
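# Self-contained sketch of the k-mer counting idea used above: build the closed
# 'atcg' k-mer vocabulary and count character n-grams with TfidfVectorizer
# (use_idf=False, norm=None yields raw counts), then L1-normalize the pooled
# counts into a distribution. The toy reads below are illustrative only.
import itertools
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import normalize

k = 2
vocab = sorted(''.join(xs) for xs in itertools.product('atcg', repeat=k))
vectorizer = TfidfVectorizer(use_idf=False, vocabulary=vocab, analyzer='char',
                             ngram_range=(k, k), norm=None, lowercase=True)
reads = ['ACGTACGT', 'TTTTACGA']                                   # one toy "sample"
counts = np.asarray(vectorizer.fit_transform(reads).sum(axis=0))   # pool reads per sample
distribution = normalize(counts, norm='l1')                        # relative frequencies
print(dict(zip(vocab, distribution[0].round(3))))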
def create_kmer_table(self, path, k, cores=4, override=False):
    save_path = self.output_path + 'sequence_' + str(k) + 'mer'
    if override or not os.path.exists('_'.join([save_path, 'feature', 'vect.npz'])):
        files = FileUtility.recursive_glob(path, '*')
        files.sort()
        input_tuples = []
        for file in files:
            input_tuples.append((file.split('/')[-1].split('.')[0], file, k))
        strains = []
        mat = []
        kmers = []
        pool = Pool(processes=cores)
        for strain, vec, vocab in tqdm.tqdm(pool.imap_unordered(self._get_kmer_rep, input_tuples,
                                                                chunksize=cores),
                                            total=len(input_tuples)):
            strains.append(strain)
            mat.append(vec)
            kmers = vocab
        pool.close()
        mat = sparse.csr_matrix(mat)
        FileUtility.save_sparse_csr(save_path + '_feature_vect', mat)
        FileUtility.save_list('_'.join([save_path, 'strains', 'list.txt']), strains)
        FileUtility.save_list('_'.join([save_path, 'feature', 'list.txt']), kmers)
    return save_path + ' created'
def write_in_file(filename, pos, neg):
    lines = [['direction', 'marker', 'p-value']]
    for marker, pval in pos:
        lines.append(['+', marker, str(pval)])
    for marker, pval in neg:
        lines.append(['-', marker, str(pval)])
    FileUtility.save_list(filename, ['\t'.join(line) for line in lines])
def generate(self, vocab_size, sample_size, output_dir, num_p=4, backend='Sentencepiece'):
    '''
    :param vocab_size: the size of the final vocabulary
    :param sample_size: how many reads to draw from each file
    :param output_dir: where to write the results
    :param num_p: number of cores
    :return:
    '''
    start = timeit.default_timer()
    fasta_files = [(x, sample_size) for x in self.fasta_files]
    corpus = []
    pool = Pool(processes=num_p)
    for ky, v in tqdm.tqdm(pool.imap_unordered(self._get_corpus, fasta_files, chunksize=num_p),
                           total=len(fasta_files)):
        corpus = corpus + v
    pool.close()
    print('Corpus size for training NPE is ', len(corpus))
    if backend == 'Sentencepiece':
        FileUtility.save_list('../tmp/tmp_txt', corpus)
        spm.SentencePieceTrainer.Train(
            '--input=../tmp/tmp_txt --model_prefix=' + output_dir +
            ' --add_dummy_prefix false --max_sentencepiece_length=512 --model_type=bpe'
            ' --mining_sentence_size=5000000 --input_sentence_size=10000000'
            ' --vocab_size=' + str(vocab_size))  # bug fix: vocab_size was hard-coded to 50000
        FileUtility.save_list('../tmp/tmp_txt', corpus[0:10])
    elif backend == 'normalbpe':
        train_npe(corpus, output_dir, vocab_size, output_dir + '_freq')
    # bug fix: timeit.default_timer measures seconds, not ms
    print('The segmentation training took ', timeit.default_timer() - start, ' seconds.')
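# Minimal, self-contained sketch of the SentencePiece backend used above, on a toy
# DNA corpus with a tiny vocabulary. Flags beyond those in generate() are omitted
# and all paths are illustrative; assumes the sentencepiece package is installed.
import sentencepiece as spm

with open('/tmp/tmp_txt', 'w') as f:
    f.write('\n'.join(['acgtacgtacgt', 'ttacgttacgga', 'acggacggttaa'] * 50))

spm.SentencePieceTrainer.Train(
    '--input=/tmp/tmp_txt --model_prefix=/tmp/toy_npe'
    ' --add_dummy_prefix=false --model_type=bpe --vocab_size=30')

sp = spm.SentencePieceProcessor()
sp.Load('/tmp/toy_npe.model')
print(sp.EncodeAsPieces('acgtacggttaa'))  # segmentation into the learned pieces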
def create_treefold(self, path, tree_addr, cv, test_ratio, phenotype, mapping=None):
    ## find a mapping from strains to the phenotypes
    if mapping:
        mapping_isolate_label = dict(self.get_new_labeling(mapping)[phenotype])
    else:
        mapping_isolate_label = self.phenotype2labeled_strains_mapping[phenotype]
    # get common strains
    list_of_list_of_strains = list(self.strains.values())
    list_of_list_of_strains.append(list(mapping_isolate_label.keys()))
    final_strains = GenotypePhenotypeAccess.get_common_strains(list_of_list_of_strains)
    final_strains.sort()
    # prepare test
    Y = [mapping_isolate_label[strain] for strain in final_strains]
    isolate_to_group = dict([tuple(l.split('\t')) for l in FileUtility.load_list(
        tree_addr.replace(tree_addr.split('/')[-1], 'phylogenetic_nodes_and_clusters.txt'))])
    groups = [int(isolate_to_group[iso]) for iso in final_strains]
    group_kfold = GroupKFold(n_splits=round(1 / test_ratio))
    train_index, test_index = list(group_kfold.split(final_strains, Y, groups))[0]
    X_test = [final_strains[x] for x in test_index]
    FileUtility.save_list(path.replace('_folds.txt', '_test.txt'), ['\t'.join(X_test)])
    final_strains = [final_strains[ix] for ix in train_index]
    group_kfold = GroupKFold(n_splits=cv)
    folds = []
    for _, test_index in group_kfold.split(train_index,
                                           [Y[idx] for idx in train_index],
                                           [groups[idx] for idx in train_index]):
        folds.append(test_index)
    folds = ['\t'.join([final_strains[x] for x in fold.tolist()]) for fold in folds]
    FileUtility.save_list(path, folds)
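# Self-contained sketch of the grouping logic behind create_treefold: GroupKFold
# keeps all isolates of one phylogenetic cluster on the same side of each split,
# so folds respect the tree structure. Strains, labels, and groups are toy data.
from sklearn.model_selection import GroupKFold

strains = ['iso%d' % i for i in range(12)]
Y = [0, 1] * 6
groups = [0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3]  # e.g. spectral clusters of the tree

group_kfold = GroupKFold(n_splits=4)
for train_index, test_index in group_kfold.split(strains, Y, groups):
    # no group ever appears on both sides of a split
    print(sorted({groups[i] for i in train_index}),
          sorted({groups[i] for i in test_index}))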
def train_npe(self):
    '''
    :return:
    '''
    print('NPE training started.. it might take more than 1 hour for more than 1000 samples')
    DiTaxaWorkflow.blockPrint()
    start = time.time()
    G16s = NPESegmentTrainMetagenomics(self.file_directory, self.file_extenstion)
    DiTaxaWorkflow.ensure_dir(self.output_directory + 'npe_segmentatation/')
    G16s.generate(self.vocab_size, self.seg_train_depth,
                  self.output_directory + 'npe_segmentatation/' + self.dbname + '_' +
                  '_'.join(['unique', str(self.vocab_size), 'v', str(self.seg_train_depth), 's']),
                  backend='Sentencepiece', num_p=self.num_p)
    end = time.time()
    spent = end - start
    self.log_file.append('training segmentation ' +
                         '_'.join(['unique', str(self.vocab_size), 'v',
                                   str(self.seg_train_depth), 's ']) +
                         str(spent) + ' seconds, using ' + str(self.num_p) + ' cores')
    DiTaxaWorkflow.enablePrint()
    FileUtility.save_list(self.output_directory + 'logfile.txt', self.log_file)
def create_continous_mics():
    '''
    Read the MIC table, strip the inequality signs from the values, drop the row
    with missing values, and save min-max-scaled continuous MICs.
    '''
    scaler = MinMaxScaler()
    df = pd.read_table("../data_config/Final_MICs_16.06.16.txt")
    res = df[['Isolates', 'CIP MIC', 'TOB MIC', 'COL MIC', 'CAZ MIC', 'MEM MIC']]
    matrix = np.array([[float(str(x).replace('<=', '').replace('≤', '')
                        .replace('≥', '').replace('>=', '')) for x in row]
                       for row in res[['CIP MIC', 'TOB MIC', 'COL MIC',
                                       'CAZ MIC', 'MEM MIC']].values])  # .as_matrix() is deprecated
    # interactive check used to locate NaN entries; row 509 is dropped below:
    # [[(idx, idy) for idy, y in enumerate(x) if y] for idx, x in enumerate(np.isnan(matrix))]
    resistances = np.delete(matrix, [509], axis=0)
    isolates = [x[0] for idx, x in enumerate(list(df[['Isolates']].values)) if not idx == 509]
    # scale to 0-1
    resistances = scaler.fit_transform(resistances)
    features = ['CIP', 'TOB', 'COL', 'CAZ', 'MEM']
    base_path = '/mounts/data/proj/asgari/dissertation/datasets/deepbio/pseudomonas/data_v3/continous_mic_vals'
    resistances = csr_matrix(resistances)
    FileUtility.save_sparse_csr(base_path + '_feature_vect', resistances)
    FileUtility.save_list(base_path + '_isolates_list.txt', isolates)
    FileUtility.save_list(base_path + '_feature_list.txt', features)
def representation_npe(self):
    '''
    :return:
    '''
    print('NPE generation started..')
    start = time.time()
    G16s = NPESegmentApplyMetagenomics(
        self.file_directory, self.file_extenstion,
        self.output_directory + 'npe_segmentatation/' + self.dbname + '_' +
        '_'.join(['unique', str(self.vocab_size), 'v', str(self.seg_train_depth), 's.model']),
        sampling_number=self.rep_sampling_depth,
        num_p=self.num_p)
    DiTaxaWorkflow.ensure_dir(self.output_directory + 'npe_representation/')
    G16s.generate_npes_all(save=self.output_directory + 'npe_representation/' +
                                self.dbname + '_uniquepiece_' + str(self.rep_sampling_depth))
    end = time.time()
    spent = end - start
    self.log_file.append('generating the representations npe_representation/' + self.dbname +
                         '_uniquepiece_' + str(self.rep_sampling_depth) + ' ' + str(spent) +
                         ' seconds, using ' + str(self.num_p) + ' cores')
    FileUtility.save_list(self.output_directory + 'logfile.txt', self.log_file)
def train_resampling_npe(sentenses, outfile, num_symbols, frequency_file, min_frequency=2,
                         verbose=False, is_dict=False, resample_size=10000, N=10):
    """Learn num_symbols BPE operations from vocabulary, and write to outfile.
    Note: resample_size and N are accepted for API compatibility but unused in this variant.
    """
    outfile_name = outfile
    list_of_seg = []
    outfile = codecs.open(outfile, 'w', 'utf-8')
    f = codecs.open(frequency_file, 'w', 'utf-8')
    # version 0.2 changes the handling of the end-of-word token ('</w>');
    # version numbering allows backward compatibility
    outfile.write('#version: 0.2\n')
    list_of_seg.append('#version: 0.2')
    vocab = get_vocabulary(sentenses, is_dict)
    vocab = dict([(tuple(x[:-1]) + (x[-1] + '</w>',), y) for (x, y) in vocab.items()])
    sorted_vocab = sorted(vocab.items(), key=lambda x: x[1], reverse=True)

    stats, indices = get_pair_statistics(sorted_vocab)
    big_stats = copy.deepcopy(stats)
    # threshold is inspired by Zipfian assumption, but should only affect speed
    threshold = max(stats.values()) / 10
    for i in tqdm.tqdm(range(num_symbols)):
        if stats:
            most_frequent = max(stats, key=lambda x: (stats[x], x))
        # we probably missed the best pair because of pruning; go back to full statistics
        if not stats or (i and stats[most_frequent] < threshold):
            prune_stats(stats, big_stats, threshold)
            stats = copy.deepcopy(big_stats)
            most_frequent = max(stats, key=lambda x: (stats[x], x))
            # threshold is inspired by Zipfian assumption, but should only affect speed
            threshold = stats[most_frequent] * i / (i + 10000.0)
            prune_stats(stats, big_stats, threshold)
        if stats[most_frequent] < min_frequency:
            sys.stderr.write('no pair has frequency >= {0}. Stopping\n'.format(min_frequency))
            break
        f.write('{0} {1} '.format(*most_frequent) + str(stats[most_frequent]) + '\n')
        list_of_seg.append('{0} {1} '.format(*most_frequent))
        # print('{0} {1} '.format(*most_frequent) + str(stats[most_frequent]) + '\n')
        if verbose:
            sys.stderr.write('pair {0}: {1} {2} -> {1}{2} (frequency {3})\n'.format(
                i, most_frequent[0], most_frequent[1], stats[most_frequent]))
        outfile.write('{0} {1}\n'.format(*most_frequent))
        changes = replace_pair(most_frequent, sorted_vocab, indices)
        update_pair_statistics(most_frequent, changes, stats, indices)
        stats[most_frequent] = 0
        if not i % 100:
            prune_stats(stats, big_stats, threshold)
            FileUtility.save_list(outfile_name + '_temp', list_of_seg)
    outfile.close()  # bug fix: the merge-operations file was never closed
    f.close()
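# Compact, self-contained illustration of the BPE merge loop at the core of
# train_resampling_npe (without the pruning, resampling, or file output): each
# iteration merges the most frequent adjacent symbol pair. The toy vocabulary
# below is hypothetical.
from collections import Counter

vocab = Counter({('l', 'o', 'w', '</w>'): 5,
                 ('l', 'o', 'w', 'e', 'r', '</w>'): 2,
                 ('n', 'e', 'w', 'e', 's', 't', '</w>'): 6})

for step in range(5):
    pairs = Counter()
    for word, freq in vocab.items():
        for a, b in zip(word, word[1:]):
            pairs[(a, b)] += freq
    best = max(pairs, key=pairs.get)
    print('merge %d:' % step, best, 'freq', pairs[best])
    merged = Counter()
    for word, freq in vocab.items():
        w, i = [], 0
        while i < len(word):
            if i < len(word) - 1 and (word[i], word[i + 1]) == best:
                w.append(word[i] + word[i + 1])
                i += 2
            else:
                w.append(word[i])
                i += 1
        merged[tuple(w)] += freq
    vocab = merged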
def create_tsne_web(X, Y, tsne_file_coor, tsne_file_label):
    classes = list(set(Y))
    classes.sort()
    L = [classes.index(y) for y in Y]
    tsne_res = np.hstack((X, np.array([L]).T))
    tsne_res[:, 0:2] = np.round(tsne_res[:, 0:2], 2)
    tsne_lines = []
    for l in tsne_res:
        tsne_lines.append('\t'.join([str(l[0]), str(l[1]), str(int(l[2]))]))
    FileUtility.save_list(tsne_file_coor, tsne_lines)
    FileUtility.save_list(tsne_file_label, Y)
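# Hedged usage sketch: create_tsne_web expects a 2-D embedding X (e.g. from
# sklearn's TSNE) plus the label list Y. The toy features and the output file
# names in the commented call are illustrative only.
import numpy as np
from sklearn.manifold import TSNE

rng = np.random.RandomState(0)
feats = np.vstack([rng.normal(0, 1, (10, 5)), rng.normal(3, 1, (10, 5))])
labels = ['case'] * 10 + ['control'] * 10
X2d = TSNE(n_components=2, perplexity=5, random_state=0).fit_transform(feats)
# create_tsne_web(X2d, labels, 'web/tsne_coords.txt', 'web/tsne_labels.txt')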
def sequential_crawl(triples, override=False):
    if not override:
        new_list = []
        for x, y, z in triples:
            if not FileUtility.exists(y + z):
                new_list.append((x, y, z))
        triples = new_list
    print('Start crawling..')
    for x in tqdm.tqdm(triples):
        PNGScriptRetrieve(x)
    if len(triples) > 0:  # guard added: triples may be empty after filtering
        FileUtility.save_list(triples[0][1] + 'log.txt', PNGScriptRetrieve.log)
def numpy2trainfiles(file, name, out='../data/s8_features/'):
    '''
    test_file='/mounts/data/proj/asgari/dissertation/datasets/deepbio/protein_general/ss/data/cb513+profile_split1.npy'
    train_file='/mounts/data/proj/asgari/dissertation/datasets/deepbio/protein_general/ss/data/cullpdb+profile_6133_filtered.npy'
    :param name: output file name
    :param out: output directory
    :return:
    '''
    db = np.load(file)
    a = np.arange(0, 21)
    b = np.arange(35, 56)
    c = np.hstack((a, b))
    db = np.reshape(db, (db.shape[0], int(db.shape[1] / 57), 57))
    seq = ['A', 'C', 'E', 'D', 'G', 'F', 'I', 'H', 'K', 'M', 'L',
           'N', 'Q', 'P', 'S', 'R', 'T', 'W', 'V', 'Y', 'X', 'NoSeq']
    label = ['L', 'B', 'E', 'G', 'I', 'H', 'S', 'T']
    sequences = []
    labels = []
    possible_features = dict()
    for i in range(0, db.shape[0]):
        sequences.append(''.join([seq[np.argmax(x)] if np.max(x) == 1 else ''
                                  for x in db[i, :, 0:21]]).lower())
        labels.append(''.join([label[np.argmax(y)] if np.max(y) == 1 else ''
                               for y in db[i, :, 22:30]]).lower())
    lengths = [len(x) for x in sequences]
    sorted_idxs = np.argsort(lengths)  # bug fix: argsort was not qualified with np.
    lengths.sort()
    sequences = [sequences[i] for i in sorted_idxs]
    labels = [labels[i] for i in sorted_idxs]
    FileUtility.save_list(out + name, [
        '\n'.join([' '.join([elx, labels[idx][idy]]) for idy, elx in enumerate(list(s))] + [''])
        for idx, s in enumerate(sequences)])
    db_new = db[sorted_idxs, :, :]
    label_encoding = [[([0] if np.max(row) == 1 else [1]) + row
                       for row in db_new[i, :, 22:30].tolist()]
                      for i in range(0, db.shape[0])]
    np.save(out + name + '_mat_Y', label_encoding)
    db_new = db_new[:, :, c]
    np.save(out + name + '_mat_X', db_new)
    FileUtility.save_list(out + name + '_length.txt', [str(l) for l in lengths])
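# For orientation: each position in the CullPDB/CB513 arrays carries 57 features;
# per the slices above, columns 0:21 are one-hot amino acids, 22:30 one-hot 8-state
# secondary-structure labels, and 35:56 the profile features kept via `c`. A hedged
# reshape check on a dummy array of the same layout (700 positions per protein):
import numpy as np

dummy = np.zeros((2, 700 * 57))          # two proteins, flattened like the .npy files
dummy = np.reshape(dummy, (dummy.shape[0], dummy.shape[1] // 57, 57))
print(dummy.shape)                        # (2, 700, 57): protein x position x feature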
def parallel_crawl(triples, num_p, override=False):
    if not override:
        new_list = []
        for x, y, z in triples:
            if not FileUtility.exists(y + z):
                new_list.append((x, y, z))
        triples = new_list
    if len(triples) > 0:
        print('Start parallel crawling..')
        pool = Pool(processes=num_p)
        res = []
        for x in tqdm.tqdm(pool.imap_unordered(PNGScriptRetrieve, triples, chunksize=num_p),
                           total=len(triples)):
            res.append(x)
        pool.close()
        # moved inside the guard: avoids an IndexError when all files already exist
        FileUtility.save_list(triples[0][1] + 'log.txt', PNGScriptRetrieve.log)
def generate_excel(self, filename, settingname):
    '''
    :param filename: path of the output .xlsx file
    :param settingname: sheet name
    :return:
    '''
    df = self.get_pandas_df()
    final_markers = df['marker'].values.tolist()
    FileUtility.save_list(
        filename.replace('.xlsx', '_finalmarker_list.txt').replace(
            'final_outputs', 'intermediate_files/npe_marker_files'),
        final_markers)
    writer = pd.ExcelWriter(filename)
    df.to_excel(writer, settingname)
    writer.save()
def sequential_crawl(triples, override=False):
    '''
    :param triples: list of (item, out_dir, out_name) tuples; out_dir + out_name is the output file
    :param override: re-crawl even if the output file already exists
    :return:
    '''
    if not override:
        new_list = []
        for x, y, z in triples:
            if not FileUtility.exists(y + z):
                new_list.append((x, y, z))
        triples = new_list
    print('Start crawling..')
    for x in tqdm.tqdm(triples):
        BibleCom(x)
    if len(triples) > 0:  # guard added: triples may be empty after filtering
        FileUtility.save_list(triples[0][1] + 'log.txt', BibleCom.log)
def representation_npe(self):
    '''
    :return:
    '''
    if self.override == 1 or not DiTaxaWorkflow.exists(self.output_directory_inter +
                                                       'npe_representation/'):
        print('\t✔ Creating NPE representations ...')
        start = time.time()
        G16s = NPESegmentApplyMetagenomics(
            self.file_directory, self.file_extenstion,
            self.output_directory_inter + 'npe_segmentatation/' + self.dbname + '_' +
            '_'.join(['unique', str(self.vocab_size), 'v', str(self.seg_train_depth), 's.model']),
            sampling_number=self.rep_sampling_depth,
            num_p=self.num_p)
        DiTaxaWorkflow.ensure_dir(self.output_directory_inter + 'npe_representation/')
        G16s.generate_npes_all(save=self.output_directory_inter + 'npe_representation/' +
                                    self.dbname + '_uniquepiece_' + str(self.rep_sampling_depth))
        end = time.time()
        spent = end - start
        message = ('Generating the NPE representations at npe_representation/' + self.dbname +
                   '_uniquepiece_' + str(self.rep_sampling_depth) + ' took ' + str(spent) +
                   ' seconds, using ' + str(self.num_p) + ' cores')
        print('\t✔ ' + message)
        self.log_file.append(message)
    else:
        message = 'Representations are already created. Thus, this step is skipped!'
        print('\t✔ ' + message)
        self.log_file.append(message)
    FileUtility.save_list(self.output_directory + 'logfile.txt', self.log_file)
    DiTaxaWorkflow.temp_cleanup()
def tree2mat_group(tree_file, n_group=20):
    '''
    This function maps the phylogenetic tree to an adjacency matrix and performs
    spectral clustering on it.
    :param tree_file: newick tree file
    :param n_group: number of clusters
    :return:
    '''
    if not os.path.exists(tree_file.replace(tree_file.split('/')[-1],
                                            'phylogenetic_nodes_and_clusters.txt')):
        print('Create phylogenetic information distance and groupings..')
        t = Phylo.read(tree_file, 'newick')
        d = {}
        for x, y in itertools.combinations(t.get_terminals(), 2):
            v = t.distance(x, y)
            d[x.name] = d.get(x.name, {})
            d[x.name][y.name] = v
            d[y.name] = d.get(y.name, {})
            d[y.name][x.name] = v
        for x in t.get_terminals():
            d[x.name][x.name] = 0
        m = pd.DataFrame(d)
        isolates = [x for x in m.axes[0]]
        isolates.sort()
        mat = np.zeros((len(isolates), len(isolates)))
        for x in range(len(isolates)):
            for y in range(len(isolates)):
                mat[x, y] = m[isolates[x]][isolates[y]]
        transferred_mat = np.exp(-mat ** 2 / (2. * 0.08 ** 2))
        clustering = SpectralClustering(n_clusters=n_group, assign_labels="kmeans",
                                        random_state=0).fit(transferred_mat)
        np.save(tree_file.replace(tree_file.split('/')[-1], 'phylogenetic_distance_matrix'),
                transferred_mat)
        FileUtility.save_list(
            tree_file.replace(tree_file.split('/')[-1], 'phylogenetic_nodes_and_clusters.txt'),
            ['\t'.join([x, str(clustering.labels_[idx])]) for idx, x in enumerate(isolates)])
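# Self-contained sketch of the distance-to-affinity step above: pairwise patristic
# distances are turned into an RBF affinity exp(-d^2 / (2*sigma^2)) with sigma = 0.08
# and then spectrally clustered. The 4x4 distance matrix is toy data; here the
# affinity is passed as 'precomputed', which treats the matrix as an affinity
# directly (the call in tree2mat_group lets SpectralClustering apply its own default
# RBF to the rows instead).
import numpy as np
from sklearn.cluster import SpectralClustering

d = np.array([[0.00, 0.05, 0.30, 0.30],
              [0.05, 0.00, 0.30, 0.30],
              [0.30, 0.30, 0.00, 0.05],
              [0.30, 0.30, 0.05, 0.00]])
affinity = np.exp(-d ** 2 / (2.0 * 0.08 ** 2))
clustering = SpectralClustering(n_clusters=2, affinity='precomputed',
                                assign_labels='kmeans', random_state=0).fit(affinity)
print(clustering.labels_)  # isolates 0,1 vs 2,3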
def convert_to_kmer(input_file, out_file, n=3):
    train = FileUtility.load_list(input_file)
    training_data = [line.split() for line in train]
    final_list = list()
    temp = []
    for x in training_data:
        if x == []:
            final_list.append(temp)
            temp = []
        else:
            temp.append(x)
    res = []
    for prot in final_list:
        sentence = ''.join(['$'] + [aa[0] for aa in prot] + ['#'])
        res += [(sentence[i:i + n], prot[i][1]) for i in range(len(sentence) - n + 1)]
        res += ['']
    FileUtility.save_list(out_file, [' '.join(list(x)) for x in res])
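# Worked example of the windowing above with n=3: the sequence is padded with '$'
# (start) and '#' (end), and each residue is paired with the 3-mer that covers it.
# Toy protein 'MKV' with hypothetical per-residue labels 'h', 'h', 'e':
prot = [('M', 'h'), ('K', 'h'), ('V', 'e')]
n = 3
sentence = ''.join(['$'] + [aa for aa, _ in prot] + ['#'])
pairs = [(sentence[i:i + n], prot[i][1]) for i in range(len(sentence) - n + 1)]
print(pairs)  # [('$MK', 'h'), ('MKV', 'h'), ('KV#', 'e')]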
def ret_a_book(self, tr_meta):
    isocode, trID, dam_ids = tr_meta
    # store the api call results in json
    file_path = self.output_path + '/api_intermediate/' + '_'.join([isocode, trID]) + '.json'
    f = codecs.open(file_path, 'w', 'utf-8')
    for x in dam_ids:
        response = requests.get('http://dbt.io/library/verse?key=' + self.key +
                                '&dam_id=' + x + '&v=2')
        f.write(response.content.decode("utf-8") + '\n')
    f.close()
    # read the books
    books = []
    for line in codecs.open(file_path, 'r', 'utf-8'):
        try:
            books.append(json.loads(line))
        except ValueError:  # malformed JSON line; flag this translation for re-checking
            self.to_double_check.append(tr_meta)
    # parse the books
    bible = dict()
    for book in books:
        for rec in book:
            try:
                key = (self.book_map[rec['book_id']] + rec['chapter_id'].zfill(3) +
                       rec['verse_id'].zfill(3))
                bible[key] = rec['verse_text'].strip()
            except KeyError:
                pass
    # save the books
    ordered_bible = collections.OrderedDict(sorted(bible.items()))
    bible = ['\t'.join([k, v]) for k, v in ordered_bible.items()]
    if len(bible) > 0:
        FileUtility.save_list(self.output_path + '/' + '_'.join([isocode, trID]) + '.api.txt',
                              bible)
    return trID, len(bible)
def generate_npes_all(self, save=False, norm=False):
    data = np.zeros((len(self.fasta_files), len(self.npe_vocab))).astype(np.float64)
    # multiprocessing extraction of NPE distributions
    t_steps = []
    s_steps = []
    pool = Pool(processes=self.num_p)
    for ky, (v, t, s) in tqdm.tqdm(pool.imap_unordered(self._get_npe_distribution,
                                                       self.fasta_files, chunksize=self.num_p),
                                   total=len(self.fasta_files)):
        data[self.indexing[ky], :] = v
        t_steps.append(t)
        s_steps.append(s)
    pool.close()
    # normalize the frequencies
    if norm:
        data = normalize(data, axis=1, norm='l1')
    data = sparse.csr_matrix(data)
    if save:
        FileUtility.save_sparse_csr(save, data)
        FileUtility.save_list(save + '_meta', self.fasta_files)
        FileUtility.save_list(save + '_features', self.npe_vocab)
        FileUtility.save_list(save + '_log',
                              [': '.join(['mean_time', str(np.mean(t_steps))]),
                               ': '.join(['std_time', str(np.std(t_steps))]),
                               ': '.join(['mean_size', str(np.mean(s_steps))]),
                               ': '.join(['std_size', str(np.std(s_steps))])])
    return data
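# The save branch above writes four artifacts: '<save>.npz' (the CSR matrix),
# '<save>_meta' (input file order), '<save>_features' (the NPE vocabulary), and
# '<save>_log' (timing/size statistics). A hedged round-trip sketch of the matrix
# part using plain scipy (FileUtility.save_sparse_csr's exact layout may differ):
import numpy as np
from scipy import sparse

X = sparse.csr_matrix(np.array([[0.75, 0.25, 0.0], [0.0, 0.5, 0.5]]))
sparse.save_npz('/tmp/npe_rep.npz', X)           # counterpart of save_sparse_csr
X_back = sparse.load_npz('/tmp/npe_rep.npz')
assert (X != X_back).nnz == 0                    # round-trip preserves the matrix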
def train_npe(self):
    '''
    :return:
    '''
    if self.override == 1 or not DiTaxaWorkflow.exists(self.output_directory_inter +
                                                       'npe_segmentatation/'):
        print('\t✔ Segmentation inference started..')
        start = time.time()
        G16s = NPESegmentTrainMetagenomics(self.file_directory, self.file_extenstion)
        DiTaxaWorkflow.ensure_dir(self.output_directory_inter + 'npe_segmentatation/')
        G16s.generate(self.vocab_size, self.seg_train_depth,
                      self.output_directory_inter + 'npe_segmentatation/' + self.dbname + '_' +
                      '_'.join(['unique', str(self.vocab_size), 'v',
                                str(self.seg_train_depth), 's']),
                      backend='Sentencepiece', num_p=self.num_p)
        end = time.time()
        spent = end - start
        self.log_file.append('Segmentation inference ' +
                             '_'.join(['unique', str(self.vocab_size), 'v',
                                       str(self.seg_train_depth), 's ']) +
                             str(spent) + ' seconds, using ' + str(self.num_p) + ' cores')
    else:
        message = 'Segmentation results directory exists. Thus, the step was bypassed.'
        print('\t✔ ' + message)
        self.log_file.append(message)
    FileUtility.save_list(self.output_directory + 'logfile.txt', self.log_file)
def create_randfold(self, path, cv, test_ratio, phenotype, mapping=None):
    ## find a mapping from strains to the phenotypes
    if mapping:
        mapping_isolate_label = dict(self.get_new_labeling(mapping)[phenotype])
    else:
        mapping_isolate_label = self.phenotype2labeled_strains_mapping[phenotype]
    # get common strains
    list_of_list_of_strains = list(self.strains.values())
    list_of_list_of_strains.append(list(mapping_isolate_label.keys()))
    final_strains = GenotypePhenotypeAccess.get_common_strains(list_of_list_of_strains)
    final_strains.sort()
    # prepare test
    Y = [mapping_isolate_label[strain] for strain in final_strains]
    X_train, X_test, y_train, _ = train_test_split(final_strains, Y, test_size=test_ratio,
                                                   random_state=0, stratify=Y)
    FileUtility.save_list(path.replace('_folds.txt', '_test.txt'), ['\t'.join(X_test)])
    # prepare train
    spliter = StratifiedKFold(cv)
    # bug fix: the split indices refer to X_train, not to final_strains
    folds = ['\t'.join([X_train[x] for x in fold.tolist()])
             for _, fold in list(spliter.split(X_train, y_train))]
    FileUtility.save_list(path, folds)
def generate_tree(self, path, name):
    path_g = path + '/graphlan_files/'
    FileUtility.ensure_dir(path_g)
    font_map = {1: 15, 2: 14, 3: 13, 4: 12, 5: 8, 6: 7, 7: 4}
    taxonomy = self.get_pandas_df()['taxonomy'].tolist()
    direction = self.get_pandas_df()['direction'].tolist()
    taxlev = self.get_pandas_df()['taxonomylevel'].tolist()
    logpval = [round(-np.log(x)) for x in self.get_pandas_df()['pvalue'].tolist()]
    taxonomy = ['.'.join(self.refine_ez_taxonomy(x).split(';')) for x in taxonomy]
    tax_freq = dict(FreqDist(taxonomy).most_common())
    logpval_frq = [tax_freq[x] for idx, x in enumerate(taxonomy)]
    # taxonomy=['.'.join(x[0:-1] if isGenomeName(x[-1]) else x) for x in taxonomy]
    dict_color = dict()
    for idx, x in enumerate(direction):
        if len(taxonomy[idx].split('.')) > 5:
            coloring = ('r' if x == '+' else ('b' if x == '-' else 'g'))
            if taxonomy[idx].split('.')[-1] in dict_color:
                dict_color[taxonomy[idx].split('.')[-1]].append(coloring)
            else:
                dict_color[taxonomy[idx].split('.')[-1]] = [coloring]
    # keep a color only when >80% of the markers of that taxon agree on a direction
    new_dict_color = dict()
    for tax, colors in dict_color.items():
        freq = FreqDist(colors)
        if freq['r'] / (freq['r'] + freq['b']) > 0.8:
            new_dict_color[tax] = 'r'
        elif freq['b'] / (freq['r'] + freq['b']) > 0.8:
            new_dict_color[tax] = 'b'
        else:
            new_dict_color[tax] = 'w'
    dict_color = new_dict_color
    annot = ['\t'.join([taxonomy[idx].split('.')[-1], 'annotation_background_color',
                        dict_color[taxonomy[idx].split('.')[-1]]])
             for idx, x in enumerate(direction) if len(taxonomy[idx].split('.')) > 5]
    # annot=['\t'.join([taxonomy[idx].split('.')[-1],'annotation_background_color',('r' if x=='+' else ('b' if x=='-' else 'g'))]) for idx, x in enumerate(direction) if len(taxonomy[idx].split('.'))>5]
    annot = annot + ['\t'.join([taxonomy[idx].split('.')[-1], 'annotation_background_color', 'w'])
                     for idx, x in enumerate(direction) if len(taxonomy[idx].split('.')) == 5]
    annot = annot + ['\t'.join([taxonomy[idx].split('.')[-1], 'annotation',
                                taxonomy[idx].split('.')[-1]])
                     for idx, x in enumerate(direction)
                     if len(taxonomy[idx].split('.')) > 5
                     if not dict_color[taxonomy[idx].split('.')[-1]] == 'w']
    # annot=annot+['\t'.join([taxonomy[idx].split('.')[-1],'annotation_background_color','purple']) for idx, x in enumerate(direction) if len(taxonomy[idx].split('.'))>5]
    ## OUTER RINGS
    annot = annot + ['\t'.join([taxonomy[idx].split('.')[1], 'annotation',
                                taxonomy[idx].split('.')[1]])
                     for idx, x in enumerate(direction) if len(taxonomy[idx].split('.')) > 1]
    annot = annot + ['\t'.join([taxonomy[idx].split('.')[1], 'annotation_rotation', str(1)])
                     for idx, x in enumerate(direction) if len(taxonomy[idx].split('.')) > 1]
    annot = annot + ['\t'.join([taxonomy[idx].split('.')[1], 'annotation_font_size', str(9)])
                     for idx, x in enumerate(direction) if len(taxonomy[idx].split('.')) > 1]
    annot = annot + ['\t'.join([taxonomy[idx].split('.')[1], 'annotation_background_color',
                                '#eedbfc'])
                     for idx, x in enumerate(direction) if len(taxonomy[idx].split('.')) > 1]
    ## Clades
    annot = annot + ['\t'.join([taxonomy[idx].split('.')[-1], 'clade_marker_size',
                                str(logpval_frq[idx])])
                     for idx, x in enumerate(direction)
                     if len(taxonomy[idx].split('.')) > 5
                     if not dict_color[taxonomy[idx].split('.')[-1]] == 'w']
    annot = annot + ['\t'.join([taxonomy[idx].split('.')[-1], 'clade_marker_edge_width',
                                str(logpval[idx])])
                     for idx, x in enumerate(direction)
                     if len(taxonomy[idx].split('.')) > 5
                     if not dict_color[taxonomy[idx].split('.')[-1]] == 'w']
    annot = annot + ['\t'.join([taxonomy[idx].split('.')[-1], 'annotation_rotation', str(1)])
                     for idx, x in enumerate(direction)
                     if len(taxonomy[idx].split('.')) > 5
                     if not dict_color[taxonomy[idx].split('.')[-1]] == 'w']
    annot = annot + ['\t'.join([taxonomy[idx].split('.')[-1], 'annotation_font_size',
                                str(font_map[taxlev[idx]])])
                     for idx, x in enumerate(direction)
                     if len(taxonomy[idx].split('.')) > 5
                     if not dict_color[taxonomy[idx].split('.')[-1]] == 'w']
    annot = annot + ['annotation_background_offset\t0.5']
    annot = annot + ['clade_marker_edge_color\t#4f1a49']
    annot = annot + ['branch_color\t#4f1a49']
    annot = annot + ['annotation_background_separation\t-0.01']
    annot = annot + ['annotation_background_width\t0.2']
    # https://bitbucket.org/nsegata/graphlan/src/default/readme.txt?fileviewer=file-view-default
    # python graphlan_annotate.py --annot ../annot.txt ../test.txt ../new.xml
    # python graphlan.py ../new.xml image_name.pdf --dpi 1000 --size 15 --external_legends
    taxonomy = [x for x in taxonomy
                if len(x.split('.')) > 5
                if not dict_color[x.split('.')[-1]] == 'w']
    FileUtility.save_list(path_g + name + '_taxonomy.txt', taxonomy)
    FileUtility.save_list(path_g + name + '_annot.txt', annot)
    subprocess.call("python3 graphlan/graphlan_annotate.py --annot " + path_g + name +
                    '_annot.txt' + " " + path_g + name + '_taxonomy.txt' + " " +
                    path_g + name + '.xml', shell=True)
    subprocess.call("python3 graphlan/graphlan.py " + path_g + name + '.xml' + " " +
                    path + name + '.pdf --dpi 1000 --size 15 --external_legends', shell=True)
    try:
        FileUtility.remove(path + name + '_legend.pdf')
    except Exception:
        pass  # the legend file may not have been generated
def biomarker_extraction(self, labeler, label_mapper, name_setting, p_value_threshold=0.05,
                         pos_label=None, neg_label=None):
    '''
    :return:
    '''
    print('NPE marker detection started')
    DiTaxaWorkflow.blockPrint()
    start = time.time()
    rep_base_path = self.output_directory + 'npe_representation/' + self.dbname + \
                    '_uniquepiece_' + str(self.rep_sampling_depth)
    filenames = [x.split('/')[-1] for x in FileUtility.load_list(rep_base_path + '_meta')]
    # CHECK EXISTING LABELS
    if callable(labeler):
        selected_samples = [idx for idx, file in enumerate(filenames)
                            if labeler(file) in label_mapper]
        Y = [str(label_mapper[labeler(filenames[sample_id])]) for sample_id in selected_samples]
    else:
        selected_samples = [idx for idx, file in enumerate(filenames)
                            if labeler[file] in label_mapper]
        Y = [str(label_mapper[labeler[filenames[sample_id]]]) for sample_id in selected_samples]
    FileUtility.save_list(rep_base_path + '_' + name_setting + '_Y.txt', Y)
    DiTaxaWorkflow.ensure_dir(self.output_directory + 'npe_marker_files/')
    G16s = NPEMarkerDetection(rep_base_path + '.npz',
                              rep_base_path + '_' + name_setting + '_Y.txt',
                              rep_base_path + '_features',
                              self.output_directory + 'npe_marker_files/' + name_setting,
                              selected_samples)
    G16s.extract_markers()
    end = time.time()
    spent = end - start
    self.log_file.append('biomarker extraction ' + name_setting + ' ' + str(spent) +
                         ' seconds, using ' + str(self.num_p) + ' cores')
    FileUtility.save_list(self.output_directory + 'logfile.txt', self.log_file)
    DiTaxaWorkflow.enablePrint()
    print('NPE marker taxonomic detection started')
    start = time.time()
    if callable(labeler):
        phenotypes = [labeler(filenames[sample_id]) for sample_id in selected_samples]
    else:
        phenotypes = [labeler[filenames[sample_id]] for sample_id in selected_samples]
    fasta_file = self.output_directory + 'npe_marker_files/' + name_setting + '_chi2_relative.fasta'
    matrix_path = rep_base_path + '.npz'
    feature_file_path = rep_base_path + '_features'
    # redundant-marker removal is too expensive for large marker sets
    remove_redundants = len(FileUtility.read_fasta_sequences(fasta_file)) <= 2000
    Final_OBJ = NPEMarkerAnlaysis(fasta_file, matrix_path, feature_file_path, phenotypes,
                                  label_mapper, selected_samples,
                                  p_value_threshold=p_value_threshold,
                                  remove_redundants=remove_redundants,
                                  num_p=self.num_p)
    end = time.time()
    spent = end - start
    DiTaxaWorkflow.ensure_dir(self.output_directory + 'final_outputs/')
    FileUtility.save_obj(self.output_directory + 'final_outputs/' + name_setting, Final_OBJ)
    Final_OBJ.generate_tree(self.output_directory + 'final_outputs/', name_setting)
    self.log_file.append('blasting extraction ' + name_setting + ' ' + str(spent) +
                         ' seconds, using ' + str(self.num_p) + ' cores')
    FileUtility.save_list(self.output_directory + 'logfile.txt', self.log_file)
    if pos_label and neg_label:
        Final_OBJ.generate_heatmap(self.output_directory + 'final_outputs/' + name_setting +
                                   '_heatmap', pos_label=pos_label, neg_label=neg_label)
def tolower(file):
    lines = [l.lower() for l in FileUtility.load_list(file)]
    FileUtility.save_list(file + 'new', lines)
def create_read_tabular_file(path, save_pref='_', feature_normalization=None, transpose=False,
                             override=False):
    '''
    :param path: path of the tab-separated table
    :param save_pref: prefix for the output files
    :param transpose: set True if isolates are columns
    :param feature_normalization: 'binary': {0,1}, '01': [0-1], 'percent': {0,1,..,100},
                                  'zu': zero mean, unit variance
    :return:
    '''
    print('Start creating ', save_pref)
    if override or not os.path.exists('_'.join([save_pref, 'feature', 'vect.npz'])):
        rows = [l.strip() for l in codecs.open(path, 'r', 'utf-8').readlines()]
        tf_vec = sparse.csr_matrix([[GenotypeReader.get_float_or_zero(x)
                                     for x in entry.split('\t')[1::]]
                                    for entry in rows[1::]])
        if transpose:
            tf_vec = sparse.csr_matrix(tf_vec.toarray().T)
            isolates = [feat.replace(' ', '') for feat in rows[0].rstrip().split('\t')]
            feature_names = [row.split()[0] for row in rows[1::]]
        else:
            isolates = [row.split()[0] for row in rows[1::]]
            feature_names = [feat.replace(' ', '') for feat in rows[0].rstrip().split('\t')]
        # normalizer / discretizer
        if feature_normalization:
            if feature_normalization == 'binary':
                tf_vec = np.round(MaxAbsScaler().fit_transform(tf_vec))
            elif feature_normalization == '01':
                tf_vec = MaxAbsScaler().fit_transform(tf_vec)
            elif feature_normalization == 'percent':
                tf_vec = np.round(MaxAbsScaler().fit_transform(tf_vec) * 100)
            elif feature_normalization == 'zu':
                tf_vec = sparse.csr_matrix(
                    preprocessing.StandardScaler().fit_transform(tf_vec.toarray()))
        FileUtility.save_sparse_csr('_'.join([save_pref, 'feature', 'vect.npz']), tf_vec)
        FileUtility.save_list('_'.join([save_pref, 'feature', 'list.txt']), feature_names)
        FileUtility.save_list('_'.join([save_pref, 'strains', 'list.txt']), isolates)
        message = (save_pref + ' created successfully containing ' + str(len(isolates)) +
                   ' strains and ' + str(len(feature_names)) + ' features')
    else:
        message = save_pref + ' already exists'
    print(message)
    return message
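# Hedged sketch of the layout create_read_tabular_file expects when transpose=False:
# the header row lists feature names, and each following row starts with the isolate
# name followed by tab-separated values. The toy file and the commented call are
# illustrative only.
import codecs

toy_rows = ['geneA\tgeneB\tgeneC',
            'isolate1\t0.0\t2.5\t1.0',
            'isolate2\t1.0\t0.0\t3.5']
with codecs.open('/tmp/toy_table.txt', 'w', 'utf-8') as fh:
    fh.write('\n'.join(toy_rows))
# create_read_tabular_file('/tmp/toy_table.txt', save_pref='/tmp/toy',
#                          feature_normalization='binary')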
def read_data(self):
    self.xmldoc = minidom.parse(self.genml_path)

    # parse project part
    self.project = self.xmldoc.getElementsByTagName('project')
    self.output = self.project[0].attributes['output'].value
    self.project_name = self.project[0].attributes['name'].value
    if self.override and os.path.exists(self.output):
        var = input("Delete existing files at the output path? (y/n)")
        if var == 'y':
            shutil.rmtree(self.output)
    if not os.path.exists(self.output):
        os.makedirs(self.output)
    log_file = self.output + '/' + 'logfile'
    log_info = ['Project ' + self.project_name]
    self.representation_path = self.output + '/intermediate_rep/'
    IC = IntermediateRepCreate(self.representation_path)

    # load tables
    tabless = self.xmldoc.getElementsByTagName('tables')
    for tables in tabless:
        path = tables.attributes['path'].value
        normalization = tables.attributes['normalization'].value
        prefix = tables.firstChild.nodeValue.strip() + '_'
        if len(prefix) == 1:
            prefix = ''
        for file in FileUtility.recursive_glob(path, '*.uniq.mat'):
            log = IC.create_table(file, prefix + file.split('/')[-1], normalization, self.override)
            log_info.append(log)
    tables = self.xmldoc.getElementsByTagName('table')
    for table in tables:
        path = table.attributes['path'].value
        normalization = table.attributes['normalization'].value
        prefix = table.firstChild.nodeValue.strip()
        log = IC.create_table(path, prefix + path.split('/')[-1] if prefix == '' else prefix,
                              normalization, self.override)
        log_info.append(log)

    # load sequences
    sequences = self.xmldoc.getElementsByTagName('sequence')
    for sequence in sequences:
        path = sequence.attributes['path'].value
        kmer = int(sequence.attributes['kmer'].value)
        log = IC.create_kmer_table(path, kmer, cores=min(self.cores, 4), override=self.override)
        log_info.append(log)

    ## Adding metadata
    self.metadata_path = self.output + '/metadata/'
    if not os.path.exists(self.metadata_path):
        os.makedirs(self.metadata_path)
    # phenotype
    phenotype = self.xmldoc.getElementsByTagName('phenotype')
    if not os.path.exists(self.metadata_path + 'phenotypes.txt') or self.override:
        FileUtility.save_list(self.metadata_path + 'phenotypes.txt',
                              FileUtility.load_list(phenotype[0].attributes['path'].value))
    # tree
    phylogentictree = self.xmldoc.getElementsByTagName('phylogentictree')
    if not os.path.exists(self.metadata_path + 'phylogentictree.txt') or self.override:
        FileUtility.save_list(self.metadata_path + 'phylogentictree.txt',
                              FileUtility.load_list(phylogentictree[0].attributes['path'].value))
        tree2mat_group(self.metadata_path + 'phylogentictree.txt', n_group=20)
    FileUtility.save_list(log_file, log_info)
def biomarker_extraction(self, labeler, label_mapper, phenoname, p_value_threshold=0.05,
                         pos_label=None, neg_label=None, excel=0):
    '''
    :return:
    '''
    print('\t✔ NPE Marker detection is started..')
    start = time.time()
    rep_base_path = self.output_directory_inter + 'npe_representation/' + self.dbname + \
                    '_uniquepiece_' + str(self.rep_sampling_depth)
    filenames = [x.split('/')[-1] for x in FileUtility.load_list(rep_base_path + '_meta')]
    # CHECK EXISTING LABELS
    if callable(labeler):
        selected_samples = [idx for idx, file in enumerate(filenames)
                            if labeler(file) in label_mapper]
        Y = [str(label_mapper[labeler(filenames[sample_id])]) for sample_id in selected_samples]
    else:
        selected_samples = [idx for idx, file in enumerate(filenames)
                            if labeler[file] in label_mapper]
        Y = [str(label_mapper[labeler[filenames[sample_id]]]) for sample_id in selected_samples]
    FileUtility.save_list(rep_base_path + '_' + phenoname + '_Y.txt', Y)
    DiTaxaWorkflow.ensure_dir(self.output_directory_inter + 'npe_marker_files/')
    if self.override == 1 or not DiTaxaWorkflow.exists(
            self.output_directory_inter + 'npe_marker_files/' +
            '_'.join([phenoname, 'chi2_relative.fasta'])):
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            G16s = NPEMarkerDetection(rep_base_path + '.npz',
                                      rep_base_path + '_' + phenoname + '_Y.txt',
                                      rep_base_path + '_features',
                                      self.output_directory_inter + 'npe_marker_files/' + phenoname,
                                      selected_samples)
            G16s.extract_markers()
        end = time.time()
        spent = end - start
        message = ('biomarker extraction ' + phenoname + ' ' + str(spent) +
                   ' seconds, using ' + str(self.num_p) + ' cores')
        print('\t✔ ' + message)
        self.log_file.append(message)
    else:
        message = 'Biomarkers are already extracted. Thus, the statistical test was bypassed.'
        print('\t✔ ' + message)
        self.log_file.append(message)
    FileUtility.save_list(self.output_directory + 'logfile.txt', self.log_file)

    print('\t✔ Taxonomic assignment of the markers..')
    if callable(labeler):
        phenotypes = [labeler(filenames[sample_id]) for sample_id in selected_samples]
    else:
        phenotypes = [labeler[filenames[sample_id]] for sample_id in selected_samples]
    fasta_file = self.output_directory_inter + 'npe_marker_files/' + phenoname + '_chi2_relative.fasta'
    matrix_path = rep_base_path + '.npz'
    feature_file_path = rep_base_path + '_features'
    # redundant-marker removal is too expensive for large marker sets
    remove_redundants = len(FileUtility.read_fasta_sequences(fasta_file)) <= 2000
    FileUtility.ensure_dir(self.output_directory + 'final_outputs/save_states/')
    if self.override == 1 or not DiTaxaWorkflow.exists(
            self.output_directory + 'final_outputs/save_states/' + phenoname + '.pickle'):
        start = time.time()
        Final_OBJ = NPEMarkerAnlaysis(fasta_file, matrix_path, feature_file_path, phenotypes,
                                      label_mapper, selected_samples,
                                      p_value_threshold=p_value_threshold,
                                      remove_redundants=remove_redundants,
                                      num_p=self.num_p, blastn_path=self.blastn_path)
        end = time.time()
        spent = end - start
        DiTaxaWorkflow.ensure_dir(self.output_directory + 'final_outputs/')
        FileUtility.save_obj(self.output_directory + 'final_outputs/save_states/' + phenoname,
                             Final_OBJ)
        message = ('Marker analysis and alignment ' + phenoname + ' ' + str(spent) +
                   ' seconds, using ' + str(self.num_p) + ' cores')
        print('\t✔ ' + message)
        self.log_file.append(message)
    else:
        Final_OBJ = FileUtility.load_obj(self.output_directory + 'final_outputs/save_states/' +
                                         phenoname + '.pickle')
        message = 'The aligned markers already existed and are loaded!'
        print('\t✔ ' + message)
        self.log_file.append(message)
    FileUtility.save_list(self.output_directory + 'logfile.txt', self.log_file)

    # generating the tree
    Final_OBJ.generate_tree(self.output_directory + 'final_outputs/', phenoname)

    # bug fix: these paths were previously defined only in the excel branch,
    # causing a NameError for the t-SNE plot when excel != 1
    X_addr = rep_base_path + '.npz'
    feature_addr = rep_base_path + '_features'
    markers = self.output_directory_inter + 'npe_marker_files/' + phenoname + '_finalmarker_list.txt'
    Y_addr = rep_base_path + '_' + phenoname + '_Y.txt'  # path to the labels saved above
    if excel == 1:
        print('\t✔ Creating marker excel file..')
        Final_OBJ.generate_excel(self.output_directory + 'final_outputs/' + phenoname + '.xlsx',
                                 phenoname)
        print('\t✔ Creating t-sne plot..')
        DiTaxaWorkflow.plot_res(self.output_directory + 'final_outputs/' + phenoname + '_tsne.pdf',
                                X_addr, feature_addr, markers, Y_addr,
                                labels=['Negative', 'Positive'])
    if pos_label and neg_label:
        print('\t✔ Creating marker heatmap..')
        Final_OBJ.update_matrix_by_markers_N()
        Final_OBJ.generate_heatmap(self.output_directory + 'final_outputs/' + phenoname +
                                   '_heatmap', pos_label=pos_label, neg_label=neg_label)
        if not excel == 1:
            print('\t✔ Creating t-sne plot..')
            DiTaxaWorkflow.plot_res(self.output_directory + 'final_outputs/' + phenoname +
                                    '_tsne.pdf', X_addr, feature_addr, markers, Y_addr,
                                    labels=[neg_label, pos_label])
    DiTaxaWorkflow.temp_cleanup()
    print('\t⬛ Marker detection and analysis completed. You can find the results at ' +
          self.output_directory + ', in particular in the final_outputs subdirectory.')
def generate_tree_comparative(self, pos_file, neg_file, path, name, highlight_up=None,
                              highlight_down=None):
    font_map = {-2: 30, -1: 25, 1: 15, 2: 14, 3: 13, 4: 12, 5: 8, 6: 7, 7: 4}
    taxonomy = self.get_pandas_df()['taxonomy'].tolist()
    direction = self.get_pandas_df()['direction'].tolist()
    taxlev = self.get_pandas_df()['taxonomylevel'].tolist()
    logpval = [round(-np.log(x)) for x in self.get_pandas_df()['pvalue'].tolist()]
    taxonomy = ['.'.join(self.refine_ez_taxonomy(x).split(';')) for x in taxonomy]
    tax_freq = dict(FreqDist(taxonomy).most_common())
    logpval_frq = [tax_freq[x] for idx, x in enumerate(taxonomy)]

    dict_color_ditaxa = dict()
    for idx, x in enumerate(direction):
        if len(taxonomy[idx].split('.')) >= 5:
            coloring = ('r' if x == '+' else ('b' if x == '-' else 'g'))
            if taxonomy[idx].split('.')[-1] in dict_color_ditaxa:
                dict_color_ditaxa[taxonomy[idx].split('.')[-1]].append(coloring)
            else:
                dict_color_ditaxa[taxonomy[idx].split('.')[-1]] = [coloring]
    dict_color_ditaxa = self.purify_tax_color(dict_color_ditaxa)

    pos_tax = FileUtility.load_list(pos_file)
    neg_tax = FileUtility.load_list(neg_file)
    dict_color_lefse = dict()
    for pos in pos_tax:
        if len(pos.split('.')) >= 5:
            if pos.split('.')[-1] in dict_color_lefse:
                dict_color_lefse[pos.split('.')[-1]].append('r')
            else:
                dict_color_lefse[pos.split('.')[-1]] = ['r']
    for taxonomy_lefse in neg_tax:
        if len(taxonomy_lefse.split('.')) >= 5:
            if taxonomy_lefse.split('.')[-1] in dict_color_lefse:
                dict_color_lefse[taxonomy_lefse.split('.')[-1]].append('b')
            else:
                dict_color_lefse[taxonomy_lefse.split('.')[-1]] = ['b']
    dict_color_lefse = self.purify_tax_color(dict_color_lefse)

    # merge the two colorings: agreement yields orange/cyan, conflicts yield black
    final_dict = dict()
    for taxa, color in dict_color_ditaxa.items():
        if taxa in dict_color_lefse:
            if dict_color_ditaxa[taxa] == dict_color_lefse[taxa] and dict_color_lefse[taxa] == 'r':
                final_dict[taxa] = 'orange'
            elif dict_color_ditaxa[taxa] == dict_color_lefse[taxa] and dict_color_lefse[taxa] == 'b':
                final_dict[taxa] = 'cyan'
            elif dict_color_ditaxa[taxa] == dict_color_lefse[taxa]:
                final_dict[taxa] = 'w'
            elif dict_color_ditaxa[taxa] == 'w':
                final_dict[taxa] = dict_color_lefse[taxa]
            elif dict_color_lefse[taxa] == 'w':
                final_dict[taxa] = dict_color_ditaxa[taxa]
            else:
                final_dict[taxa] = 'black'
        else:
            final_dict[taxa] = dict_color_ditaxa[taxa]
    for taxa, color in dict_color_lefse.items():
        if taxa not in dict_color_ditaxa:
            if color == 'r':
                final_dict[taxa] = 'yellow'
            elif color == 'b':
                final_dict[taxa] = 'green'
            else:
                final_dict[taxa] = 'w'

    if highlight_up and highlight_down:
        correct = []
        wrong_dir = []
        for x in highlight_up:
            if x in final_dict:
                if final_dict[x] == 'r' or final_dict[x] == 'orange':
                    correct.append(x)
                elif not final_dict[x] == 'w':
                    wrong_dir.append(x)
        for x in highlight_down:
            if x in final_dict:
                if final_dict[x] == 'b' or final_dict[x] == 'cyan':
                    correct.append(x)
                elif not final_dict[x] == 'w':
                    wrong_dir.append(x)
        for i, j in final_dict.items():
            if j == 'cyan' or j == 'orange':
                correct.append(i)
        correct = list(set(correct))
        # a fuzzy-matching fallback over final_dict keys was disabled here:
        # for y, res in final_dict.items():
        #     if x.lower() in y.lower(): ...

    taxonomy = ['.'.join(self.refine_ez_taxonomy(x).split(';')) for x in taxonomy]
    tax_freq = dict(FreqDist(taxonomy).most_common())
    logpval_frq = [tax_freq[x] for idx, x in enumerate(taxonomy)]
    # taxonomy=['.'.join(x[0:-1] if isGenomeName(x[-1]) else x) for x in taxonomy]

    annot = ['\t'.join([taxonomy[idx].split('.')[-1], 'annotation_background_color',
                        final_dict[taxonomy[idx].split('.')[-1]]])
             for idx, x in enumerate(direction)
             if len(taxonomy[idx].split('.')) > 5 and
             (not final_dict[taxonomy[idx].split('.')[-1]] == 'w')]
    annot = annot + ['\t'.join([taxonomy[idx].split('.')[-1], 'annotation_background_color', 'w'])
                     for idx, x in enumerate(direction) if len(taxonomy[idx].split('.')) == 5]
    annot = annot + ['\t'.join([taxonomy[idx].split('.')[-1], 'annotation',
                                taxonomy[idx].split('.')[-1]])
                     for idx, x in enumerate(direction)
                     if len(taxonomy[idx].split('.')) > 5 and
                     (not final_dict[taxonomy[idx].split('.')[-1]] == 'w')]
    annot = annot + ['\t'.join([pos.split('.')[-1], 'annotation_background_color',
                                final_dict[pos.split('.')[-1]]])
                     for idx, pos in enumerate(pos_tax)]
    annot = annot + ['\t'.join([pos.split('.')[-1], 'annotation', pos.split('.')[-1]])
                     for idx, pos in enumerate(pos_tax)
                     if len(pos.split('.')) > 5 and (not final_dict[pos.split('.')[-1]] == 'w')]
    annot = annot + ['\t'.join([neg.split('.')[-1], 'annotation_background_color',
                                final_dict[neg.split('.')[-1]]])
                     for idx, neg in enumerate(neg_tax)]
    annot = annot + ['\t'.join([neg.split('.')[-1], 'annotation', neg.split('.')[-1]])
                     for idx, neg in enumerate(neg_tax)
                     if len(neg.split('.')) > 5 and (not final_dict[neg.split('.')[-1]] == 'w')]
    lneg = [neg.split('.')[-1] for idx, neg in enumerate(neg_tax)]
    lpos = [pos.split('.')[-1] for idx, pos in enumerate(pos_tax)]
    ## OUTER RINGS
    annot = annot + ['\t'.join([taxonomy[idx].split('.')[1], 'annotation',
                                taxonomy[idx].split('.')[1]])
                     for idx, x in enumerate(direction) if len(taxonomy[idx].split('.')) > 1]
    annot = annot + ['\t'.join([taxonomy[idx].split('.')[1], 'annotation_rotation', str(1)])
                     for idx, x in enumerate(direction) if len(taxonomy[idx].split('.')) > 1]
    annot = annot + ['\t'.join([taxonomy[idx].split('.')[1], 'annotation_font_size', str(9)])
                     for idx, x in enumerate(direction) if len(taxonomy[idx].split('.')) > 1]
    annot = annot + ['\t'.join([taxonomy[idx].split('.')[1], 'annotation_background_color',
                                '#eedbfc'])
                     for idx, x in enumerate(direction) if len(taxonomy[idx].split('.')) > 1]
    ## Clades
    annot = annot + ['\t'.join([taxonomy[idx].split('.')[-1], 'clade_marker_size',
                                str(logpval_frq[idx])])
                     for idx, x in enumerate(direction)
                     if len(taxonomy[idx].split('.')) > 5 and
                     (not final_dict[taxonomy[idx].split('.')[-1]] == 'w')]
    annot = annot + ['\t'.join([taxonomy[idx].split('.')[-1], 'clade_marker_edge_width',
                                str(logpval[idx])])
                     for idx, x in enumerate(direction)
                     if len(taxonomy[idx].split('.')) > 5 and
                     (not final_dict[taxonomy[idx].split('.')[-1]] == 'w')]
    annot = annot + ['\t'.join([taxonomy[idx].split('.')[-1], 'annotation_rotation', str(1)])
                     for idx, x in enumerate(direction) if len(taxonomy[idx].split('.')) > 5]
    annot = annot + ['\t'.join([taxonomy[idx].split('.')[-1], 'annotation_font_size',
                                str(font_map[taxlev[idx]])])
                     for idx, x in enumerate(direction) if len(taxonomy[idx].split('.')) > 5]
    if highlight_up and highlight_down:
        for taxon in correct:
            if '_' in taxon:
                annot = annot + ['\t'.join([taxon, 'annotation_font_size', '25'])]
            else:
                annot = annot + ['\t'.join([taxon, 'annotation_font_size', '30'])]
    annot = annot + ['annotation_background_offset\t0.5']
    annot = annot + ['clade_marker_edge_color\t#4f1a49']
    annot = annot + ['branch_color\t#4f1a49']
    annot = annot + ['annotation_background_separation\t-0.01']
    annot = annot + ['annotation_background_width\t0.2']
    # https://bitbucket.org/nsegata/graphlan/src/default/readme.txt?fileviewer=file-view-default
    # python graphlan_annotate.py --annot ../annot.txt ../test.txt ../new.xml
    # python graphlan.py ../new.xml image_name.pdf --dpi 1000 --size 15 --external_legends
    taxonomy = [x for x in taxonomy
                if len(x.split('.')) > 5 and (not final_dict[x.split('.')[-1]] == 'w')]
    taxonomy += [x for x in pos_tax + neg_tax
                 if len(x.split('.')) > 5 and (not final_dict[x.split('.')[-1]] == 'w')]
    FileUtility.save_list(path + name + '_taxonomy.txt', taxonomy)
    FileUtility.save_list(path + name + '_annot.txt', annot)
    subprocess.call("python3 graphlan/graphlan_annotate.py --annot " + path + name +
                    '_annot.txt' + " " + path + name + '_taxonomy.txt' + " " +
                    path + name + '.xml', shell=True)
    subprocess.call("python3 graphlan/graphlan.py " + path + name + '.xml' + " " +
                    path + name + '.pdf --dpi 1000 --size 15 --external_legends', shell=True)