Example #1
    def generate_kmers_all(self, k, save=False):
        '''
        Compute the k-mer distribution of every FASTA file in parallel.
        :param k: k-mer length; the vocabulary is every k-mer over the alphabet {a, t, c, g}
        :param save: optional path prefix; if given, the matrix, file list, and size log are saved
        :return: sparse matrix of shape (number of FASTA files, 4**k) with L1-normalized k-mer frequencies
        '''
        self.k = k
        self.vocab = [''.join(xs) for xs in itertools.product('atcg', repeat=k)]
        self.vocab.sort()
        self.vectorizer = TfidfVectorizer(use_idf=False, vocabulary=self.vocab, analyzer='char', ngram_range=(k, k),
                                          norm=None, stop_words=[], lowercase=True, binary=False)

        data = np.zeros((len(self.fasta_files), len(self.vocab))).astype(np.float64)

        # multi processing extraction of k-mer distributions
        t_steps=[]
        s_steps=[]
        pool = Pool(processes=self.num_p)
        for ky, (v,t,s) in tqdm.tqdm(pool.imap_unordered(self.get_kmer_distribution, self.fasta_files, chunksize=1),
                               total=len(self.fasta_files)):
            data[self.indexing[ky], :] = v
            t_steps.append(t)
            s_steps.append(s)
        pool.close()

        # normalize the frequencies
        data = normalize(data, axis=1, norm='l1')
        data = sparse.csr_matrix(data)

        if save:
            FileUtility.save_sparse_csr(save, data)
            FileUtility.save_list(save + '_meta', self.fasta_files)
            # ': '.join(['mean_time', str(np.mean(t_steps))]), ': '.join(['std_time', str(np.std(t_steps))])
            FileUtility.save_list(save + '_log',
                                  [': '.join(['mean_size', str(np.mean(s_steps))]),
                                   ': '.join(['std_size', str(np.std(s_steps))])])

        return data
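
A minimal, self-contained sketch of the counting idea used above: with use_idf=False and norm=None the TfidfVectorizer acts as a plain character k-mer counter over the full 4**k vocabulary, and the rows are then L1-normalized into frequencies (the input sequence below is made up):

import itertools
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import normalize

k = 3
vocab = sorted(''.join(xs) for xs in itertools.product('atcg', repeat=k))
# use_idf=False and norm=None turn the vectorizer into a raw k-mer counter
vectorizer = TfidfVectorizer(use_idf=False, vocabulary=vocab, analyzer='char',
                             ngram_range=(k, k), norm=None, lowercase=True)
counts = vectorizer.fit_transform(['atcgatcgatcg'])     # hypothetical sequence
freqs = normalize(counts.toarray(), axis=1, norm='l1')  # each row sums to 1
print(freqs.shape)                                      # (1, 64): all 4**3 possible 3-mers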
Example #2
    def create_kmer_table(self, path, k, cores=4, override=False):
        '''
        Build and save the k-mer table for all sequence files found under path.
        '''
        save_path = self.output_path + 'sequence_' + str(k) + 'mer'

        if override or not os.path.exists('_'.join(
            [save_path, 'feature', 'vect.npz'])):
            files = FileUtility.recursive_glob(path, '*')
            files.sort()
            input_tuples = []
            for file in files:
                input_tuples.append(
                    (file.split('/')[-1].split('.')[0], file, k))

            strains = []
            mat = []
            kmers = []
            pool = Pool(processes=cores)
            for strain, vec, vocab in tqdm.tqdm(pool.imap_unordered(
                    self._get_kmer_rep, input_tuples, chunksize=cores),
                                                total=len(input_tuples)):
                strains.append(strain)
                mat.append(vec)
                kmers = vocab  # each worker returns the same k-mer vocabulary
            pool.close()
            mat = sparse.csr_matrix(mat)

            FileUtility.save_sparse_csr(save_path + '_feature_vect', mat)
            FileUtility.save_list('_'.join([save_path, 'strains', 'list.txt']),
                                  strains)
            FileUtility.save_list('_'.join([save_path, 'feature', 'list.txt']),
                                  kmers)
        return save_path + ' created'
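
For reference, the strain identifier in each input tuple is derived from the file name (basename without extension); a small sketch of that mapping with hypothetical paths and a hypothetical k:

import os

files = ['/data/strains/PA_01.fasta', '/data/strains/PA_02.fasta']  # hypothetical paths
k = 6                                                                # hypothetical k-mer length
input_tuples = [(os.path.basename(f).split('.')[0], f, k) for f in files]
print(input_tuples)  # [('PA_01', '/data/strains/PA_01.fasta', 6), ('PA_02', '/data/strains/PA_02.fasta', 6)]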
Example #3
    def generate_npes_all(self, save=False, norm=False):
        '''
        Compute the NPE distribution of every FASTA file in parallel.
        :param save: optional path prefix; if given, the matrix, file list, features, and log are saved
        :param norm: if True, L1-normalize each row before building the sparse matrix
        :return: sparse matrix of shape (number of FASTA files, len(self.npe_vocab))
        '''
        data = np.zeros(
            (len(self.fasta_files), len(self.npe_vocab))).astype(np.float64)

        # multi processing extraction of npe distributions
        t_steps = []
        s_steps = []
        pool = Pool(processes=self.num_p)
        for ky, (v, t, s) in tqdm.tqdm(
                pool.imap_unordered(self._get_npe_distribution,
                                    self.fasta_files,
                                    chunksize=self.num_p),
                total=len(self.fasta_files)):
            data[self.indexing[ky], :] = v
            t_steps.append(t)
            s_steps.append(s)
        pool.close()
        # normalize the frequencies
        if norm:
            data = normalize(data, axis=1, norm='l1')
        data = sparse.csr_matrix(data)

        if save:
            FileUtility.save_sparse_csr(save, data)
            FileUtility.save_list(save + '_meta', self.fasta_files)
            FileUtility.save_list(save + '_features', self.npe_vocab)
            FileUtility.save_list(save + '_log', [
                ': '.join(['mean_time', str(np.mean(t_steps))]),
                ': '.join(['std_time', str(np.std(t_steps))]),
                ': '.join(['mean_size', str(np.mean(s_steps))]),
                ': '.join(['std_size', str(np.std(s_steps))]),
            ])
        return data
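
Because pool.imap_unordered yields results in arbitrary order, each worker returns its key so the corresponding row can be written to a fixed position through the indexing dictionary. A minimal sketch of that pattern; the worker function and sample keys below are made up:

import numpy as np
from multiprocessing import Pool

def worker(key):
    # hypothetical worker: return the key together with its result vector
    return key, np.full(4, float(len(key)), dtype=np.float64)

if __name__ == '__main__':
    keys = ['sample_a', 'sample_b', 'sample_10']
    indexing = {k: i for i, k in enumerate(keys)}   # key -> fixed row index
    data = np.zeros((len(keys), 4))
    with Pool(processes=2) as pool:
        for key, vec in pool.imap_unordered(worker, keys):
            data[indexing[key], :] = vec            # order-independent row fill
    print(data)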
Example #4
def create_continous_mics():
    '''
    Read the raw MIC table, strip the censoring markers from the MIC values,
    drop the row containing missing values, scale each drug column to [0, 1],
    and save the resulting matrix together with its isolate and feature lists.
    '''
    scaler = MinMaxScaler()
    df = pd.read_table("../data_config/Final_MICs_16.06.16.txt")
    res = df[['Isolates', 'CIP MIC', 'TOB MIC', 'COL MIC', 'CAZ MIC', 'MEM MIC']]
    # strip the censoring markers (<=, ≤, >=, ≥) and convert the MIC values to floats
    matrix = np.array([[
        float(str(x).replace('<=', '').replace('≤', '')
                    .replace('>=', '').replace('≥', ''))
        for x in row
    ] for row in res[['CIP MIC', 'TOB MIC', 'COL MIC', 'CAZ MIC', 'MEM MIC']].values])
    # find nans: [[(idx, idy) for idy, y in enumerate(x) if y] for idx, x in enumerate(np.isnan(matrix))]
    resistances = np.delete(matrix, [509], axis=0)  # drop the row containing NaNs
    isolates = [
        x[0] for idx, x in enumerate(list(df[['Isolates']].values))
        if not idx == 509
    ]
    # scale each column to [0, 1]
    resistances = scaler.fit_transform(resistances)
    features = ['CIP', 'TOB', 'COL', 'CAZ', 'MEM']
    base_path = '/mounts/data/proj/asgari/dissertation/datasets/deepbio/pseudomonas/data_v3/continous_mic_vals'
    resistances = csr_matrix(resistances)
    FileUtility.save_sparse_csr(base_path + '_feature_vect', resistances)
    FileUtility.save_list(base_path + '_isolates_list.txt', isolates)
    FileUtility.save_list(base_path + '_feature_list.txt', features)
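
The MIC values in the table carry censoring markers such as '<=' and '≥' that are stripped before conversion to float, and MinMaxScaler then rescales each drug column to [0, 1]. A tiny sketch of those two steps with made-up values:

import numpy as np
from sklearn.preprocessing import MinMaxScaler

raw = [['<=0.25', '≥32'], ['2', '8'], ['>=64', '≤1']]   # hypothetical MIC strings

def to_float(x):
    # strip the censoring markers and keep the numeric part
    return float(str(x).replace('<=', '').replace('≤', '')
                       .replace('>=', '').replace('≥', ''))

matrix = np.array([[to_float(x) for x in row] for row in raw])
scaled = MinMaxScaler().fit_transform(matrix)           # column-wise scaling to [0, 1]
print(scaled)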
Example #5

    def create_read_tabular_file(path,
                                 save_pref='_',
                                 feature_normalization=None,
                                 transpose=False,
                                 override=False):
        '''
        :param path: path of the tab-separated table to read
        :param save_pref: prefix used for the output feature/strain files
        :param transpose: set to True if isolates are the columns of the table
        :param feature_normalization: 'binary': {0,1}, '01': [0-1], 'percent': {0,1,..,100}, 'zu': zero mean, unit variance
        :param override: recreate the output files even if they already exist
        :return: a message describing the created table
        '''
        print('Start creating ', save_pref)
        if override or not os.path.exists('_'.join(
            [save_pref, 'feature', 'vect.npz'])):
            rows = [
                l.strip() for l in codecs.open(path, 'r', 'utf-8').readlines()
            ]
            tf_vec = sparse.csr_matrix([[
                GenotypeReader.get_float_or_zero(x)
                for x in entry.split('\t')[1::]
            ] for entry in rows[1::]])

            if transpose:
                tf_vec = sparse.csr_matrix(tf_vec.toarray().T)
                isolates = [
                    feat.replace(' ', '')
                    for feat in rows[0].rstrip().split('\t')
                ]
                feature_names = [row.split()[0] for row in rows[1::]]
            else:
                isolates = [row.split()[0] for row in rows[1::]]
                feature_names = [
                    feat.replace(' ', '')
                    for feat in rows[0].rstrip().split('\t')
                ]

            # normalizer / discretizer
            if feature_normalization:
                if feature_normalization == 'binary':
                    # densify before rounding, then re-sparsify
                    tf_vec = sparse.csr_matrix(
                        np.round(MaxAbsScaler().fit_transform(tf_vec).toarray()))
                elif feature_normalization == '01':
                    tf_vec = MaxAbsScaler().fit_transform(tf_vec)
                elif feature_normalization == 'percent':
                    tf_vec = sparse.csr_matrix(
                        np.round(MaxAbsScaler().fit_transform(tf_vec).toarray() * 100))
                elif feature_normalization == 'zu':
                    tf_vec = sparse.csr_matrix(
                        preprocessing.StandardScaler().fit_transform(
                            tf_vec.toarray()))

            FileUtility.save_sparse_csr(
                '_'.join([save_pref, 'feature', 'vect.npz']), tf_vec)
            FileUtility.save_list(
                '_'.join([save_pref, 'feature', 'list.txt']), feature_names)
            FileUtility.save_list(
                '_'.join([save_pref, 'strains', 'list.txt']), isolates)
            message = ''.join([
                save_pref, ' created successfully containing ',
                str(len(isolates)), ' strains and ',
                str(len(feature_names)), ' features'
            ])
            print(message)
            return message
        else:
            message = save_pref + ' already exists'
            print(message)
            return message
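
The feature_normalization switch applies one of four column-wise transforms before saving; a compact sketch of what each option produces on a made-up count matrix:

import numpy as np
from scipy import sparse
from sklearn.preprocessing import MaxAbsScaler, StandardScaler

tf_vec = sparse.csr_matrix(np.array([[0., 2., 10.], [1., 4., 0.]]))  # made-up counts

scaled01 = MaxAbsScaler().fit_transform(tf_vec).toarray()   # '01': each column scaled to [0, 1]
binary = np.round(scaled01)                                  # 'binary': rounded to {0, 1}
percent = np.round(scaled01 * 100)                           # 'percent': integers in {0, ..., 100}
zu = StandardScaler().fit_transform(tf_vec.toarray())        # 'zu': zero mean, unit variance per column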