def __init__(self, pos_fasta, neg_fasta, output_path, segmentation_schemes=10, topN=100):
    '''
    Load the positive and negative sequence sets, prepare the segmentations
    and run the motif extraction.

    :param pos_fasta: positive sequences. Either an already loaded collection
        of sequences (anything that is not a ``str`` is used as is) or a path
        ending in ``.txt`` (read with ``FileUtility.load_list``) or ``.fasta``
        (read with ``FileUtility.read_fasta_sequences``).
    :param neg_fasta: negative sequences, same accepted forms as ``pos_fasta``.
    :param output_path: output location; it is printed, created through
        ``FileUtility.ensure_dir`` and stored on the instance.
    :param segmentation_schemes: stored on the instance before
        ``prepare_segmentations`` is called (presumably the number of
        segmentation samples -- confirm against ``prepare_segmentations``).
    :param topN: forwarded unchanged to ``self.motif_extraction``.
    :raises ValueError: if a path is given whose extension is neither ``txt``
        nor ``fasta``. Previously the attribute was silently left unset and
        the failure only appeared later as an ``AttributeError``.
    '''

    def _load(source, arg_name):
        # Anything that is not a path string is taken to be the sequences themselves.
        if not isinstance(source, str):
            return source
        extension = source.split('.')[-1]
        if extension == 'txt':
            return FileUtility.load_list(source)
        if extension == 'fasta':
            return FileUtility.read_fasta_sequences(source)
        raise ValueError(
            "%s: unsupported file extension in %r (expected '.txt' or '.fasta')"
            % (arg_name, source))

    self.pos = _load(pos_fasta, 'pos_fasta')
    self.neg = _load(neg_fasta, 'neg_fasta')

    # Positives come first, so the label vector is all ones followed by all zeros.
    self.seqs = [seq.lower() for seq in self.pos + self.neg]
    self.labels = [1] * len(self.pos) + [0] * len(self.neg)

    self.segmentation_schemes = segmentation_schemes
    self.load_alpha_distribution()
    self.prepare_segmentations()

    print(output_path)
    FileUtility.ensure_dir(output_path)
    self.output_path = output_path
    self.motif_extraction(topN)
def pairwise(iterable):
    """Yield consecutive, non-overlapping pairs of *iterable*.

    s -> (s0, s1), (s2, s3), (s4, s5), ...

    Both members of a pair are drawn from the same iterator, so a trailing
    unpaired element is dropped.
    """
    a = iter(iterable)
    return zip(a, a)


# Script for the segmentation of PDB secondary structures according to PPE units
# trained over Swiss-Prot for different vocabulary sizes (here taken from
# "preferred numbers"); the sizes could also be sampled from the alpha
# distribution of the Swiss-Prot PPE length changes.
sampled_lengths = [10000, 20000, 50000, 100000, 200000, 500000, -1]
triples = dict()

# The input does not depend on the merge size, so it is read once, not per iteration.
sequences = FileUtility.read_fasta_sequences('../data_config/ss_N.txt')

for i in sampled_lengths:
    print(i)
    # Create the entry up front so the save loop below never hits a missing
    # key, even when there are no sequence pairs.
    triples[i] = []
    # Keep the merge file open for as long as the CPE object is in use and
    # close it afterwards (the handle used to be leaked on every iteration).
    with open('../data_config/swissprot_ppe', 'r') as f:
        CPE_Applier = CPE(f, separator='', merge_size=i)
        # Entries are consumed in pairs: x is segmented, y is passed to
        # according_segmentation together with the segments of x.
        for pdb_idx, (x, y) in tqdm.tqdm(enumerate(pairwise(sequences))):
            segments = CPE_Applier.segment(x).split()
            label_segments = according_segmentation(segments, y)
            triples[i] += [(seg, label_segments[idx], pdb_idx)
                           for idx, seg in enumerate(segments)]

for i in sampled_lengths:
    FileUtility.save_obj('../data_config/pdbsegments_' + str(i), triples[i])

# Mapping of motifs to PDB ids: collect the header lines (those starting with '>').
# startswith() is used instead of indexing so that blank lines cannot raise IndexError.
seq_ids = [
    x.strip()
    for x in FileUtility.load_list('../data_config/ss_N.txt')
    if x.strip().startswith('>')
]
for vocab in tqdm.tqdm(vocab_sizes): f = open('../../protein_datasets/segmentations/swissprot_cpe', 'r') CPE_Applier = CPE(f, separator='', merge_size=vocab) pool = Pool(processes=nump) for idx, seg in pool.imap_unordered(CPE_Applier.segment_with_keys, sequences, chunksize=nump): if idx not in segmented_seqs: segmented_seqs[idx] = [] segmented_seqs[idx].append(seg) pool.close() return [segmented_seqs[idx] for idx, x in enumerate(sequences)] # read the whole swiss-prot SWSSSEQ = FileUtility.read_fasta_sequences('swiss_prot.fasta') # look at the changes for 1000 sequences with respect to the sampling sizes randseq = random.sample(SWSSSEQ, 1000) size_change = dict() for vocab in tqdm.tqdm(np.arange(10000, 1000000, 10000)): size_change[vocab] = [] f = open('../data_config/swissprot_cpe', 'r') CPE_Applier = CPE(f, separator='', merge_size=int(vocab)) for seq in randseq: size_change[vocab].append(len(CPE_Applier.segment(seq).split())) all_samples = [] for i in tqdm.tqdm(range(0, 1000)): sample = [] for vocab in np.arange(10000, 1000000, 10000):
def _get_kmer_rep(self, inp):
    '''
    Build the nucleotide k-mer representation of a single strain.

    :param inp: a 3-item iterable ``(strain, seq_file, k)``; ``seq_file`` is
        read with ``FileUtility.read_fasta_sequences`` and ``k`` is passed on
        to ``GenotypeReader.get_nuc_kmer_distribution``.
    :return: ``(strain, vec, vocab)`` where ``vec`` and ``vocab`` are the two
        values returned by ``GenotypeReader.get_nuc_kmer_distribution``.
    '''
    strain_id, fasta_path, kmer_size = inp
    strain_sequences = FileUtility.read_fasta_sequences(fasta_path)
    kmer_vec, kmer_vocab = GenotypeReader.get_nuc_kmer_distribution(
        strain_sequences, kmer_size)
    return strain_id, kmer_vec, kmer_vocab
def biomarker_extraction(self, labeler, label_mapper, phenoname, p_value_threshold=0.05, pos_label=None, neg_label=None, excel=0):
    '''
    Detect NPE markers for one phenotype setting, analyse them and write the
    final outputs (saved state, tree, optional excel file, t-SNE plot and
    heatmap).

    :param labeler: either a callable ``filename -> label`` or a mapping
        indexed by filename.
    :param label_mapper: mapping from label to the value written to the
        ``_Y.txt`` file; samples whose label is not a key are skipped.
    :param phenoname: name of the setting, used in all output file names.
    :param p_value_threshold: forwarded to ``NPEMarkerAnlaysis``.
    :param pos_label: positive class label for the heatmap and t-SNE plot.
    :param neg_label: negative class label for the heatmap and t-SNE plot.
    :param excel: when equal to 1 the marker excel file and a t-SNE plot with
        the generic 'Negative'/'Positive' labels are produced.
    :return: None

    NOTE(review): the nesting of the final section was reconstructed from a
    flattened source; the first t-SNE plot is assumed to belong to the
    ``excel == 1`` branch -- confirm. The plot input paths are now computed
    unconditionally, so the ``excel != 1`` plot no longer depends on names
    that were only bound inside the excel branch.
    '''

    def lookup_label(filename):
        # A labeler is either a function of the file name or a mapping keyed by it.
        return labeler(filename) if callable(labeler) else labeler[filename]

    print('\t✔ NPE Marker detection is started..')
    start = time.time()
    rep_base_path = (self.output_directory_inter + 'npe_representation/' + self.dbname
                     + '_uniquepiece_' + str(self.rep_sampling_depth))
    filenames = [
        x.split('/')[-1]
        for x in FileUtility.load_list(rep_base_path + '_meta')
    ]

    # CHECK EXISTING LABELS: keep only the samples whose label is known to label_mapper
    selected_samples = [
        idx for idx, file in enumerate(filenames)
        if lookup_label(file) in label_mapper
    ]
    phenotypes = [lookup_label(filenames[sample_id]) for sample_id in selected_samples]
    encoded_labels = [str(label_mapper[phenotype]) for phenotype in phenotypes]

    matrix_path = rep_base_path + '.npz'
    feature_file_path = rep_base_path + '_features'
    labels_path = rep_base_path + '_' + phenoname + '_Y.txt'
    FileUtility.save_list(labels_path, encoded_labels)

    marker_dir = self.output_directory_inter + 'npe_marker_files/'
    DiTaxaWorkflow.ensure_dir(marker_dir)
    fasta_file = marker_dir + phenoname + '_chi2_relative.fasta'

    # The statistical test is skipped when its fasta output already exists,
    # unless override is set.
    if self.override == 1 or not DiTaxaWorkflow.exists(fasta_file):
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            G16s = NPEMarkerDetection(matrix_path, labels_path, feature_file_path,
                                      marker_dir + phenoname, selected_samples)
            G16s.extract_markers()
        spent = time.time() - start
        message = ('biomarker extraction ' + phenoname + ' ' + str(spent)
                   + ' seconds , using ' + str(self.num_p) + ' cores')
        print('\t✔ ' + message)
        self.log_file.append(message)
    else:
        print('\t✔ Biomarker are already extracted. Thus, the statistical test was bypassed')
        self.log_file.append(' Biomarker are already extracted. Thus, the statistical test was bypassed')
    FileUtility.save_list(self.output_directory + 'logfile.txt', self.log_file)

    print('\t✔ Taxonomic assignment of the markers..')
    # Redundant-marker removal is switched off once there are more than 2000 markers.
    remove_redundants = len(FileUtility.read_fasta_sequences(fasta_file)) <= 2000

    final_dir = self.output_directory + 'final_outputs/'
    state_path = final_dir + 'save_states/' + phenoname
    FileUtility.ensure_dir(final_dir + 'save_states/')

    # The analysis object is reused from its saved state when available,
    # unless override is set.
    if self.override == 1 or not DiTaxaWorkflow.exists(state_path + '.pickle'):
        start = time.time()
        Final_OBJ = NPEMarkerAnlaysis(fasta_file, matrix_path, feature_file_path,
                                      phenotypes, label_mapper, selected_samples,
                                      p_value_threshold=p_value_threshold,
                                      remove_redundants=remove_redundants,
                                      num_p=self.num_p,
                                      blastn_path=self.blastn_path)
        spent = time.time() - start
        DiTaxaWorkflow.ensure_dir(final_dir)
        FileUtility.save_obj(state_path, Final_OBJ)
        message = ('Marker analysis and alignment ' + phenoname + ' ' + str(spent)
                   + ' seconds, using ' + str(self.num_p) + 'cores')
        print('\t✔ ' + message)
        self.log_file.append(message)
    else:
        Final_OBJ = FileUtility.load_obj(state_path + '.pickle')
        print('\t✔ The aligned markers already existed and are loaded!')
        self.log_file.append('The aligned markers already existed and are loaded!')
    FileUtility.save_list(self.output_directory + 'logfile.txt', self.log_file)

    # generating the tree
    Final_OBJ.generate_tree(final_dir, phenoname)

    # Inputs of the t-SNE plot, shared by both plotting branches below.
    markers_path = marker_dir + phenoname + '_finalmarker_list.txt'
    tsne_path = final_dir + phenoname + '_tsne.pdf'

    if excel == 1:
        print('\t✔ Creating marker excel file..')
        Final_OBJ.generate_excel(final_dir + phenoname + '.xlsx', phenoname)
        print('\t✔ Creating t-sne plot..')
        DiTaxaWorkflow.plot_res(tsne_path, matrix_path, feature_file_path,
                                markers_path, labels_path,
                                labels=['Negative', 'Positive'])

    if pos_label and neg_label:
        print('\t✔ Creating marker heatmap..')
        Final_OBJ.update_matrix_by_markers_N()
        Final_OBJ.generate_heatmap(final_dir + phenoname + '_heatmap',
                                   pos_label=pos_label, neg_label=neg_label)
        # Only plot here when the excel branch has not already produced the t-SNE plot.
        if not excel == 1:
            print('\t✔ Creating t-sne plot..')
            DiTaxaWorkflow.plot_res(tsne_path, matrix_path, feature_file_path,
                                    markers_path, labels_path,
                                    labels=[neg_label, pos_label])

    DiTaxaWorkflow.temp_cleanup()
    print('\t⬛ Marker detection and analysis completed. You can find the results at '
          + self.output_directory + ', in partuclar at final_outputs subdirectory.')
def biomarker_extraction(self, labeler, label_mapper, name_setting, p_value_threshold=0.05, pos_label=None, neg_label=None):
    '''
    Detect NPE markers for one setting, run the marker analysis and write the
    saved object, the tree and (optionally) the heatmap.

    :param labeler: either a callable ``filename -> label`` or a mapping
        indexed by filename.
    :param label_mapper: mapping from label to the value written to the
        ``_Y.txt`` file; samples whose label is not a key are skipped.
    :param name_setting: name of the setting, used in all output file names.
    :param p_value_threshold: forwarded to ``NPEMarkerAnlaysis``.
    :param pos_label: positive class label for the heatmap.
    :param neg_label: negative class label for the heatmap; the heatmap is
        only generated when both labels are truthy.
    :return: None
    '''

    def lookup_label(filename):
        # A labeler is either a function of the file name or a mapping keyed by it.
        return labeler(filename) if callable(labeler) else labeler[filename]

    print('npe marker detection started')
    DiTaxaWorkflow.blockPrint()
    try:
        start = time.time()
        rep_base_path = (self.output_directory + 'npe_representation/' + self.dbname
                         + '_uniquepiece_' + str(self.rep_sampling_depth))
        filenames = [
            x.split('/')[-1]
            for x in FileUtility.load_list(rep_base_path + '_meta')
        ]

        # CHECK EXISTING LABELS: keep only the samples whose label is known to label_mapper
        selected_samples = [
            idx for idx, file in enumerate(filenames)
            if lookup_label(file) in label_mapper
        ]
        Y = [
            str(label_mapper[lookup_label(filenames[sample_id])])
            for sample_id in selected_samples
        ]
        labels_path = rep_base_path + '_' + name_setting + '_Y.txt'
        FileUtility.save_list(labels_path, Y)

        DiTaxaWorkflow.ensure_dir(self.output_directory + 'npe_marker_files/')
        G16s = NPEMarkerDetection(
            rep_base_path + '.npz', labels_path, rep_base_path + '_features',
            self.output_directory + 'npe_marker_files/' + name_setting,
            selected_samples)
        G16s.extract_markers()
        spent = time.time() - start
        self.log_file.append('biomarker extraction ' + name_setting + ' ' + str(spent)
                             + ' seconds , using ' + str(self.num_p) + 'cores')
        FileUtility.save_list(self.output_directory + 'logfile.txt', self.log_file)
    finally:
        # Always undo blockPrint(), even when the detection step raises;
        # otherwise the effect of blockPrint() would outlive this call.
        DiTaxaWorkflow.enablePrint()

    print('npe marker taxonomic detection started')
    start = time.time()
    phenotypes = [lookup_label(filenames[sample_id]) for sample_id in selected_samples]
    fasta_file = self.output_directory + 'npe_marker_files/' + name_setting + '_chi2_relative.fasta'
    matrix_path = rep_base_path + '.npz'
    feature_file_path = rep_base_path + '_features'

    # Redundant-marker removal is switched off once there are more than 2000 markers.
    remove_redundants = len(FileUtility.read_fasta_sequences(fasta_file)) <= 2000

    Final_OBJ = NPEMarkerAnlaysis(fasta_file, matrix_path, feature_file_path,
                                  phenotypes, label_mapper, selected_samples,
                                  p_value_threshold=p_value_threshold,
                                  remove_redundants=remove_redundants,
                                  num_p=self.num_p)
    spent = time.time() - start

    final_dir = self.output_directory + 'final_outputs/'
    DiTaxaWorkflow.ensure_dir(final_dir)
    FileUtility.save_obj(final_dir + name_setting, Final_OBJ)
    Final_OBJ.generate_tree(final_dir, name_setting)
    self.log_file.append('blasting extraction ' + name_setting + ' ' + str(spent)
                         + ' seconds, using ' + str(self.num_p) + 'cores')
    FileUtility.save_list(self.output_directory + 'logfile.txt', self.log_file)

    if pos_label and neg_label:
        Final_OBJ.generate_heatmap(final_dir + name_setting + '_heatmap',
                                   pos_label=pos_label, neg_label=neg_label)