コード例 #1
0
    def __init__(self, pos_fasta, neg_fasta, output_path, segmentation_schemes=10, topN=100):
        '''
        Load the positive and negative sequence sets, prepare the
        segmentations and run the motif extraction.

        :param pos_fasta: positive sequences; either the sequences themselves
            (any object that is not a ``str``) or a path ending in ``.txt``
            or ``.fasta``
        :param neg_fasta: negative sequences, same conventions as ``pos_fasta``
        :param output_path: output location; created through
            ``FileUtility.ensure_dir`` and stored as ``self.output_path``
        :param segmentation_schemes: stored as ``self.segmentation_schemes``
        :param topN: forwarded to ``self.motif_extraction``
        :raises ValueError: when a path is given whose extension is neither
            ``txt`` nor ``fasta``
        '''
        def load_sequences(source, role):
            # Anything that is not a path string is used as the sequences.
            if not isinstance(source, str):
                return source
            extension = source.split('.')[-1]
            if extension == 'txt':
                return FileUtility.load_list(source)
            if extension == 'fasta':
                return FileUtility.read_fasta_sequences(source)
            # Fail here with a clear message instead of leaving the attribute
            # unset and crashing later with an unrelated AttributeError.
            raise ValueError(
                'Unsupported ' + role + ' sequence file "' + source +
                '": expected a .txt or .fasta file')

        self.pos = load_sequences(pos_fasta, 'positive')
        self.neg = load_sequences(neg_fasta, 'negative')
        # Positives come first, so the labels line up with self.seqs.
        self.seqs = [seq.lower() for seq in self.pos + self.neg]
        self.labels = [1] * len(self.pos) + [0] * len(self.neg)
        self.segmentation_schemes = segmentation_schemes
        self.load_alpha_distribution()
        self.prepare_segmentations()
        print (output_path)
        FileUtility.ensure_dir(output_path)
        self.output_path = output_path
        self.motif_extraction(topN)
コード例 #2
0
def pairwise(iterable):
    """Yield non-overlapping pairs: s -> (s0, s1), (s2, s3), (s4, s5), ...

    A trailing unpaired element is dropped. Note that this is not the
    sliding-window behaviour of ``itertools.pairwise``.
    """
    # zip pulls twice from the *same* iterator per output tuple, which is
    # what makes the pairs non-overlapping.
    shared = [iter(iterable)] * 2
    return zip(*shared)


## Script for segmenting PDB secondary-structure sequences into PPE units.
## The PPE merge operations were trained over Swiss-Prot and are applied here
## for several vocabulary sizes (taken from "preferred numbers"); the sizes
## could also be sampled from the alpha distribution of Swiss-Prot PPE length
## changes.
sampled_lengths = [10000, 20000, 50000, 100000, 200000, 500000, -1]
# The input does not depend on the vocabulary size, so it is read only once.
sequences = FileUtility.read_fasta_sequences('../data_config/ss_N.txt')
triples = dict()
for i in sampled_lengths:
    print(i)
    # Create the entry up front so the save loop below always finds its key,
    # even when there is nothing to segment.
    triples[i] = []
    # Keep the merge file open only while this CPE instance is in use.
    with open('../data_config/swissprot_ppe', 'r') as f:
        CPE_Applier = CPE(f, separator='', merge_size=i)
        # Consecutive entries are consumed in pairs: x is segmented and y is
        # split accordingly by according_segmentation.
        for pdb_idx, (x, y) in tqdm.tqdm(enumerate(pairwise(sequences))):
            segments = CPE_Applier.segment(x).split()
            label_segments = according_segmentation(segments, y)
            triples[i].extend((seg, label_segments[idx], pdb_idx)
                              for idx, seg in enumerate(segments))
for i in sampled_lengths:
    FileUtility.save_obj('../data_config/pdbsegments_' + str(i), triples[i])

## Mapping of motifs to PDB ids: collect the header lines (starting with '>').
seq_ids = [
    header
    for header in (line.strip()
                   for line in FileUtility.load_list('../data_config/ss_N.txt'))
    # startswith() is safe on blank lines, where indexing [0] would raise.
    if header.startswith('>')
]
    for vocab in tqdm.tqdm(vocab_sizes):
        f = open('../../protein_datasets/segmentations/swissprot_cpe', 'r')
        CPE_Applier = CPE(f, separator='', merge_size=vocab)
        pool = Pool(processes=nump)
        for idx, seg in pool.imap_unordered(CPE_Applier.segment_with_keys,
                                            sequences,
                                            chunksize=nump):
            if idx not in segmented_seqs:
                segmented_seqs[idx] = []
            segmented_seqs[idx].append(seg)
        pool.close()
    return [segmented_seqs[idx] for idx, x in enumerate(sequences)]


# Read the whole Swiss-Prot.
SWSSSEQ = FileUtility.read_fasta_sequences('swiss_prot.fasta')

# Look at how the number of segments changes for 1000 randomly drawn
# sequences with respect to the vocabulary (merge) size.
randseq = random.sample(SWSSSEQ, 1000)
size_change = dict()
for vocab in tqdm.tqdm(np.arange(10000, 1000000, 10000)):
    size_change[vocab] = []
    # One file handle per vocabulary size: close it as soon as this size is
    # done instead of leaking a handle per iteration.
    with open('../data_config/swissprot_cpe', 'r') as f:
        # vocab is a numpy integer; CPE is given a plain int.
        CPE_Applier = CPE(f, separator='', merge_size=int(vocab))
        # Number of segments each sampled sequence is split into.
        size_change[vocab].extend(
            len(CPE_Applier.segment(seq).split()) for seq in randseq)

all_samples = []
for i in tqdm.tqdm(range(0, 1000)):
    sample = []
    for vocab in np.arange(10000, 1000000, 10000):
コード例 #4
0
 def _get_kmer_rep(self, inp):
     """Compute the nucleotide k-mer distribution for one strain.

     :param inp: a 3-item iterable ``(strain, seq_file, k)``; presumably
         packed into one argument so the method can be mapped over a list
         of jobs (confirm against the caller)
     :return: ``(strain, vec, vocab)`` where ``vec`` and ``vocab`` are the
         two values returned by ``GenotypeReader.get_nuc_kmer_distribution``
     """
     strain_name, fasta_path, kmer_size = inp
     sequences = FileUtility.read_fasta_sequences(fasta_path)
     kmer_vector, kmer_vocab = GenotypeReader.get_nuc_kmer_distribution(
         sequences, kmer_size)
     return strain_name, kmer_vector, kmer_vocab
コード例 #5
0
ファイル: DiTaxa.py プロジェクト: seedpcseed/DiTaxa
    def biomarker_extraction(self,
                             labeler,
                             label_mapper,
                             phenoname,
                             p_value_threshold=0.05,
                             pos_label=None,
                             neg_label=None,
                             excel=0):
        '''
        Detect the NPE markers for one phenotype setting, analyse them and
        write the final outputs (tree, and optionally excel sheet, heatmap
        and t-SNE plot).

        Both expensive steps are skipped when their output file already
        exists, unless ``self.override == 1``.

        :param labeler: either a callable taking a sample file name or an
            object indexed by sample file name; gives the sample's phenotype
        :param label_mapper: maps a phenotype to the label written to the Y
            file; samples whose phenotype is not in it are left out
        :param phenoname: name of this setting, used in the output file names
        :param p_value_threshold: forwarded to ``NPEMarkerAnlaysis``
        :param pos_label: label of the positive class; the heatmap is only
            produced when both ``pos_label`` and ``neg_label`` are truthy
        :param neg_label: label of the negative class (see ``pos_label``)
        :param excel: when equal to 1, also write the marker excel file and
            the t-SNE plot
        :return: None
        '''
        print('\t✔ NPE Marker detection is started..')
        start = time.time()

        # All paths are derived once, up front, so that every branch below
        # (in particular both t-SNE plots) sees the same, defined values.
        marker_dir = self.output_directory_inter + 'npe_marker_files/'
        final_dir = self.output_directory + 'final_outputs/'
        state_dir = final_dir + 'save_states/'
        log_path = self.output_directory + 'logfile.txt'
        rep_base_path = self.output_directory_inter + 'npe_representation/' + self.dbname + '_uniquepiece_' + str(
            self.rep_sampling_depth)
        matrix_path = rep_base_path + '.npz'
        feature_file_path = rep_base_path + '_features'
        y_path = rep_base_path + '_' + phenoname + '_Y.txt'
        fasta_file = marker_dir + phenoname + '_chi2_relative.fasta'
        markers_path = marker_dir + phenoname + '_finalmarker_list.txt'
        tsne_path = final_dir + phenoname + '_tsne.pdf'

        filenames = [
            x.split('/')[-1]
            for x in FileUtility.load_list(rep_base_path + '_meta')
        ]

        # The labeler may be a function or a lookup table; hide that here.
        if callable(labeler):
            get_phenotype = labeler
        else:
            get_phenotype = lambda name: labeler[name]

        # CHECK EXISTING LABELS: keep only samples with a mapped phenotype.
        selected_samples = [
            idx for idx, name in enumerate(filenames)
            if get_phenotype(name) in label_mapper
        ]
        phenotypes = [
            get_phenotype(filenames[sample_id])
            for sample_id in selected_samples
        ]
        Y = [str(label_mapper[phenotype]) for phenotype in phenotypes]

        FileUtility.save_list(y_path, Y)
        DiTaxaWorkflow.ensure_dir(marker_dir)

        if self.override == 1 or not DiTaxaWorkflow.exists(fasta_file):
            with warnings.catch_warnings():
                warnings.simplefilter("ignore")
                G16s = NPEMarkerDetection(matrix_path, y_path,
                                          feature_file_path,
                                          marker_dir + phenoname,
                                          selected_samples)
                G16s.extract_markers()

            end = time.time()
            spent = end - start
            print('\t✔ biomarker extraction ' + phenoname + '  ' + str(spent) +
                  ' seconds , using ' + str(self.num_p) + ' cores')
            self.log_file.append('biomarker extraction ' + phenoname + '  ' +
                                 str(spent) + ' seconds , using ' +
                                 str(self.num_p) + ' cores')
        else:
            print(
                '\t✔ Biomarker are already extracted. Thus, the statistical test was bypassed'
            )
            self.log_file.append(
                ' Biomarker are already extracted. Thus, the statistical test was bypassed'
            )

        FileUtility.save_list(log_path, self.log_file)

        print('\t✔ Taxonomic assignment of the markers..')

        # Redundant markers are only removed when there are at most 2000.
        remove_redundants = len(
            FileUtility.read_fasta_sequences(fasta_file)) <= 2000

        FileUtility.ensure_dir(state_dir)
        if self.override == 1 or not DiTaxaWorkflow.exists(
                state_dir + phenoname + '.pickle'):
            start = time.time()
            Final_OBJ = NPEMarkerAnlaysis(fasta_file,
                                          matrix_path,
                                          feature_file_path,
                                          phenotypes,
                                          label_mapper,
                                          selected_samples,
                                          p_value_threshold=p_value_threshold,
                                          remove_redundants=remove_redundants,
                                          num_p=self.num_p,
                                          blastn_path=self.blastn_path)
            end = time.time()
            spent = end - start
            DiTaxaWorkflow.ensure_dir(final_dir)
            FileUtility.save_obj(state_dir + phenoname, Final_OBJ)
            print('\t✔ Marker analysis and alignment ' + phenoname + '  ' +
                  str(spent) + ' seconds, using ' + str(self.num_p) + 'cores')
            self.log_file.append('Marker analysis and alignment ' + phenoname +
                                 '  ' + str(spent) + ' seconds, using ' +
                                 str(self.num_p) + 'cores')
        else:
            Final_OBJ = FileUtility.load_obj(state_dir + phenoname + '.pickle')
            print('\t✔ The aligned markers already existed and are loaded!')
            self.log_file.append(
                'The aligned markers already existed and are loaded!')
        FileUtility.save_list(log_path, self.log_file)

        # generating the tree
        Final_OBJ.generate_tree(final_dir, phenoname)

        if excel == 1:
            print('\t✔ Creating marker excel file..')
            Final_OBJ.generate_excel(final_dir + phenoname + '.xlsx',
                                     phenoname)
            print('\t✔ Creating t-sne plot..')
            DiTaxaWorkflow.plot_res(tsne_path,
                                    matrix_path,
                                    feature_file_path,
                                    markers_path,
                                    y_path,
                                    labels=['Negative', 'Positive'])

        if pos_label and neg_label:
            print('\t✔ Creating marker heatmap..')
            Final_OBJ.update_matrix_by_markers_N()
            Final_OBJ.generate_heatmap(final_dir + phenoname + '_heatmap',
                                       pos_label=pos_label,
                                       neg_label=neg_label)
            if not excel == 1:
                # The plot was not drawn above; draw it with the caller's
                # class labels. It receives the Y *file path*, exactly like
                # the excel branch, not the in-memory label list.
                print('\t✔ Creating t-sne plot..')
                DiTaxaWorkflow.plot_res(tsne_path,
                                        matrix_path,
                                        feature_file_path,
                                        markers_path,
                                        y_path,
                                        labels=[neg_label, pos_label])
        DiTaxaWorkflow.temp_cleanup()
        print(
            '\t⬛ Marker detection and analysis completed. You can find the results at '
            + self.output_directory +
            ', in partuclar at final_outputs subdirectory.')
コード例 #6
0
ファイル: DiTaxa.py プロジェクト: llpberkeley/DiTaxa
    def biomarker_extraction(self,
                             labeler,
                             label_mapper,
                             name_setting,
                             p_value_threshold=0.05,
                             pos_label=None,
                             neg_label=None):
        '''
        Detect the NPE markers for one setting, run the marker analysis on
        them, and save the resulting object, its tree and (optionally) a
        heatmap under ``final_outputs/``.

        :param labeler: either a callable taking a sample file name or an
            object indexed by sample file name; gives the sample's phenotype
        :param label_mapper: maps a phenotype to the label written to the Y
            file; samples whose phenotype is not in it are left out
        :param name_setting: name of this setting, used in output file names
        :param p_value_threshold: forwarded to ``NPEMarkerAnlaysis``
        :param pos_label: label of the positive class; the heatmap is only
            produced when both ``pos_label`` and ``neg_label`` are truthy
        :param neg_label: label of the negative class (see ``pos_label``)
        :return: None
        '''
        print('npe marker detection started')

        rep_base_path = self.output_directory + 'npe_representation/' + self.dbname + '_uniquepiece_' + str(
            self.rep_sampling_depth)
        matrix_path = rep_base_path + '.npz'
        feature_file_path = rep_base_path + '_features'
        y_path = rep_base_path + '_' + name_setting + '_Y.txt'
        marker_dir = self.output_directory + 'npe_marker_files/'
        final_dir = self.output_directory + 'final_outputs/'
        log_path = self.output_directory + 'logfile.txt'

        # The labeler may be a function or a lookup table; hide that here.
        if callable(labeler):
            get_phenotype = labeler
        else:
            get_phenotype = lambda name: labeler[name]

        # NOTE(review): blockPrint/enablePrint presumably silence and restore
        # printing - confirm. The try/finally guarantees the restore call
        # runs even when marker detection raises.
        DiTaxaWorkflow.blockPrint()
        try:
            start = time.time()
            filenames = [
                x.split('/')[-1]
                for x in FileUtility.load_list(rep_base_path + '_meta')
            ]

            # CHECK EXISTING LABELS: keep only samples with a mapped phenotype.
            selected_samples = [
                idx for idx, name in enumerate(filenames)
                if get_phenotype(name) in label_mapper
            ]
            phenotypes = [
                get_phenotype(filenames[sample_id])
                for sample_id in selected_samples
            ]
            Y = [str(label_mapper[phenotype]) for phenotype in phenotypes]

            FileUtility.save_list(y_path, Y)
            DiTaxaWorkflow.ensure_dir(marker_dir)
            G16s = NPEMarkerDetection(matrix_path, y_path, feature_file_path,
                                      marker_dir + name_setting,
                                      selected_samples)
            G16s.extract_markers()
            end = time.time()
            spent = end - start
            self.log_file.append('biomarker extraction ' + name_setting +
                                 '  ' + str(spent) + ' seconds , using ' +
                                 str(self.num_p) + 'cores')
            FileUtility.save_list(log_path, self.log_file)
        finally:
            DiTaxaWorkflow.enablePrint()

        print('npe marker taxonomic detection started')
        start = time.time()

        fasta_file = marker_dir + name_setting + '_chi2_relative.fasta'

        # Redundant markers are only removed when there are at most 2000.
        remove_redundants = len(
            FileUtility.read_fasta_sequences(fasta_file)) <= 2000

        Final_OBJ = NPEMarkerAnlaysis(fasta_file,
                                      matrix_path,
                                      feature_file_path,
                                      phenotypes,
                                      label_mapper,
                                      selected_samples,
                                      p_value_threshold=p_value_threshold,
                                      remove_redundants=remove_redundants,
                                      num_p=self.num_p)
        end = time.time()
        spent = end - start
        DiTaxaWorkflow.ensure_dir(final_dir)
        FileUtility.save_obj(final_dir + name_setting, Final_OBJ)
        Final_OBJ.generate_tree(final_dir, name_setting)
        self.log_file.append('blasting extraction ' + name_setting + '  ' +
                             str(spent) + ' seconds, using ' +
                             str(self.num_p) + 'cores')
        FileUtility.save_list(log_path, self.log_file)
        if pos_label and neg_label:
            Final_OBJ.generate_heatmap(final_dir + name_setting + '_heatmap',
                                       pos_label=pos_label,
                                       neg_label=neg_label)