Example #1
    def __init__(self, pd, study_names, protocol=None):
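        # Index per-study data by folder name, build phenotype/covariate name
        # mappers across studies, and keep only the names shared by every
        # study (phen_order / cov_order).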
        # @timing
        def _check(mapper, keys):
            # Keep only the names that are present in every study.
            values = np.array(list(mapper.dic.values()))
            result = {}
            r = (values == -1).any(axis=1)
            if np.sum(r) == values.shape[0]:
                raise ValueError('There are no common names between studies')
            for i, k in enumerate(keys):
                result[k] = values[~r, i]
            return result

        self.name = None
        self.study_names = study_names
        self.phen_mapper = Mapper()
        self.cov_mapper = Mapper()
        self.covariates = OrderedDict()
        self.pd = OrderedDict()
        for i in pd:
            self.pd[i.folder.name] = i
        keys = []
        if protocol is None:
            for i, k in enumerate(pd):
                self.covariates[k] = [
                    n.split(self.study_names[i] + '_')[1]
                    for n in k.folder._data.metadata['names']
                ]
                if i == 0:
                    self.phen_mapper.fill(k.folder._data.metadata['phenotype'],
                                          k.folder.name,
                                          reference=False)
                    self.cov_mapper.fill(self.covariates[k],
                                         k.folder.name,
                                         reference=False)
                else:
                    self.phen_mapper.push(k.folder._data.metadata['phenotype'],
                                          name=k.folder.name,
                                          new_id=False)
                    self.cov_mapper.push(self.covariates[k],
                                         name=k.folder.name,
                                         new_id=False)
                keys.append(k.folder.name)

            self.phen_order = _check(self.phen_mapper, keys)
            self.cov_order = _check(self.cov_mapper, keys)

        if protocol is not None:
            if not protocol.enable:
                protocol.parse()
Example #2
class MetaParData(object):
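    """Aggregate per-study ParData ("PD") objects for meta-analysis, aligning
    phenotype and covariate names across studies."""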
    def __init__(self, pd, study_names, protocol=None):
        # @timing
        def _check(mapper, keys):
            # Keep only the names that are present in every study.
            values = np.array(list(mapper.dic.values()))
            result = {}
            r = (values == -1).any(axis=1)
            if np.sum(r) == values.shape[0]:
                raise ValueError('There are no common names between studies')
            for i, k in enumerate(keys):
                result[k] = values[~r, i]
            return result

        self.name = None
        self.study_names = study_names
        self.phen_mapper = Mapper()
        self.cov_mapper = Mapper()
        self.covariates = OrderedDict()
        self.pd = OrderedDict()
        for i in pd:
            self.pd[i.folder.name] = i
        keys = []
        if protocol is None:
            for i, k in enumerate(pd):
                self.covariates[k] = [
                    n.split(self.study_names[i] + '_')[1]
                    for n in k.folder._data.metadata['names']
                ]
                if i == 0:
                    self.phen_mapper.fill(k.folder._data.metadata['phenotype'],
                                          k.folder.name,
                                          reference=False)
                    self.cov_mapper.fill(self.covariates[k],
                                         k.folder.name,
                                         reference=False)
                else:
                    self.phen_mapper.push(k.folder._data.metadata['phenotype'],
                                          name=k.folder.name,
                                          new_id=False)
                    self.cov_mapper.push(self.covariates[k],
                                         name=k.folder.name,
                                         new_id=False)
                keys.append(k.folder.name)

            self.phen_order = _check(self.phen_mapper, keys)
            self.cov_order = _check(self.cov_mapper, keys)

        if protocol is not None:
            if not protocol.enable:
                protocol.parse()

    def check_pd(self, old, new):
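        # Diagnostic printout: compare the accumulated matrices (old) with the
        # study being added (new) via normalised covariate blocks and
        # frequency-style ratios at randomly sampled positions.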

        np.set_printoptions(precision=3, suppress=True)
        print("*******PD CHECK*********")
        print('A covariates...')
        print(old[3] / old[3][0, 0])
        print(new[3] / new[3][0, 0])
        print('FREQ TESTs')
        N = np.random.randint(0, np.min((old[0].shape[0], new[0].shape[0])),
                              10)
        print(np.array(old[0][:, 0] / old[3][0, 0] / 2)[N])
        print(np.array(new[0][:, 0] / new[3][0, 0] / 2)[N])
        print('FREQ PHENO')
        M = np.random.randint(0, np.min((old[1].shape[1], new[1].shape[1])),
                              10)
        print(np.array(old[1][0, :] / old[3][0, 0])[M])
        print(np.array(new[1][0, :] / new[3][0, 0])[M])
        print("****************")

    # @timing
    def check_maf(self, SNPs_index):
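        # Collect each study's MAF for the selected variants and raise if the
        # across-study standard deviation exceeds 0.1 for any variant.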
        maf = np.zeros((SNPs_index[0].shape[0], len(self.pd)))

        for i, j in enumerate(self.pd):
            maf[:, i] = np.array(
                self.pd[j].folder._data.metadata['MAF'])[SNPs_index[i].astype(
                    np.int64)]

        if (np.std(maf, axis=1) > 0.1).any():
            raise ValueError('MAF is not consistent between PD data!')

    def get(self,
            SNPs_index=None,
            B4=False,
            regression_model=None,
            random_effect_intercept=False):
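        # Fetch each study's matrices for the requested SNP/phenotype/covariate
        # orders and sum them across studies; optionally stack per-study
        # intercept columns (random_effect_intercept) and return b4 as well.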

        if self.pd is None:
            raise ValueError('Data not defined!')
        k = list(self.pd.keys())
        if SNPs_index is not None:
            if len(self.pd) != len(SNPs_index):
                raise ValueError(
                    'Unequal number of PD and SNP indexes: {} != {}'.format(
                        len(self.pd), len(SNPs_index)))
            a_test, b_cov, C, a_cov = self.pd[k[0]].get(
                gen_order=SNPs_index[0],
                phen_order=self.phen_order[k[0]],
                cov_order=self.cov_order[k[0]])
            if random_effect_intercept and len(self.pd) > 1:  # TODO (high)
                a_test_effect = a_test[:, 0:1]
                b_cov_effect = b_cov[0:1, :]
                a_cov_effect = a_cov[0:1, :]
        else:
            raise ValueError('Indexes are not defined!')

        if B4:
            b4 = self.pd[k[0]].folder._data.b4[SNPs_index[0], :]
            b4 = b4[:, self.phen_order[k[0]]]

        # self.check_maf(SNPs_index)

        for i in range(1, len(self.pd)):
            a, b, c, a_c = self.pd[k[i]].get(gen_order=SNPs_index[i],
                                             phen_order=self.phen_order[k[i]],
                                             cov_order=self.cov_order[k[i]])
            self.check_pd([a_test, b_cov, C, a_cov], [a, b, c, a_c])
            a_test = a_test + a
            b_cov = b_cov + b
            C = C + c
            a_cov = a_cov + a_c
            if random_effect_intercept and i < (len(self.pd) - 1):
                a_test_effect = np.hstack((a_test_effect, a[:, 0:1]))
                b_cov_effect = np.vstack((b_cov_effect, b[0:1, :]))
                a_cov_effect = np.vstack((a_cov_effect, a_c[0:1, :]))
            if B4:
                b4_tmp = self.pd[k[i]].folder._data.b4[SNPs_index[i], :]
                b4 = b4 + b4_tmp[:, self.phen_order[k[i]]]

        if random_effect_intercept and len(self.pd) > 1:
            a_cov_I = np.zeros((len(self.pd) - 1, len(self.pd) - 1))
            np.fill_diagonal(a_cov_I, a_cov_effect[:, 0])
            a_effect = np.hstack((a_cov_I, a_cov_effect))
            a_test = np.hstack((a_test_effect, a_test))
            b_cov = np.vstack((b_cov_effect, b_cov))
            a_cov = np.vstack((a_cov_effect, a_cov))
            a_cov = np.hstack((a_effect.T, a_cov))

        if B4:
            return a_test, b_cov, C, a_cov, b4
        else:
            return a_test, b_cov, C, a_cov

    def maf_pard(self, SNPs_index=None):
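        # Sample-size-weighted average MAF across studies for the selected SNPs.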

        samples = 0
        maf = np.zeros(len(SNPs_index[0]))

        for j, i in enumerate(self.pd):
            n = len(self.pd[i].folder._data.metadata['id'])
            samples += n
            maf = maf + n * np.array(
                self.pd[i].folder._data.metadata['MAF'])[SNPs_index[j]]
        maf = maf / float(samples)
        return maf

    def get_n_id(self):
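        # Total number of sample IDs across all studies.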
        return np.sum(
            [len(self.pd[i].folder._data.metadata['id']) for i in self.pd])

    def get_phenotype_names(self):
        n = {}
        for i in self.pd:
            n[i] = self.pd[i].folder._data.metadata['phenotype']
        return n
Example #3
class MetaPhenotype(object):
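    """Pool phenotypes from several studies, keep only those common to all of
    them (optionally filtered by include/exclude ID tables), and serve them in
    chunks via get()."""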
    def __init__(self, phen, protocol=None, include=None, exclude=None):
        # @timing
        def _check(mapper, keys, index=None):
            # Keep only the names present in every study and, if `index` is
            # given, only those flagged to keep.
            values = np.array(list(mapper.dic.values()))
            result = {}
            r = (values == -1).any(axis=1)
            if index is not None:
                r[np.where(index == False)[0]] = True
            if np.sum(r) == values.shape[0]:
                raise ValueError('There are no common names between studies')
            for i, k in enumerate(keys):
                result[k] = values[~r, i]
            return result, np.array(list(mapper.dic.keys()))[~r]

        self.chunk_size = 10000
        self.exclude = None
        self.include = None
        self.name = None
        self.mapper = Mapper()
        self.keys = []
        self.keep_index = None
        if include is not None:
            self.include = pd.read_csv(include)
            print('Include:')
            print(self.include.head())
            if 'ID' not in self.include.columns:
                raise ValueError(
                    '{} table does not have ID column for phenotypes'.format(
                        include))
        if exclude is not None:
            self.exclude = pd.read_csv(exclude)
            print('Exclude:')
            print(self.exclude.head())
            if 'ID' not in self.exclude.columns:
                raise ValueError(
                    '{} table does not have ID column for phenotypes'.format(
                        exclude))
        if protocol is None:
            for i, k in enumerate(phen):
                phen_names = []
                if i == 0:
                    for j in k.folder.files:
                        if j != 'info_dic.npy':
                            phen_names = phen_names + list(
                                k.folder.data_info[j])
                    self.mapper.fill(phen_names, i, reference=False)
                else:
                    for j in k.folder.files:
                        if j != 'info_dic.npy':
                            phen_names = phen_names + list(
                                k.folder.data_info[j])
                    self.mapper.push(phen_names, name=i, new_id=False)

                self.keys.append(i)
            if self.exclude is not None or self.include is not None:
                phen_names = pd.Series(list(self.mapper.dic.keys()))
                phen_names = phen_names[phen_names.isin(
                    self.include.ID
                )] if self.include is not None else phen_names
                phen_names = phen_names[~phen_names.isin(
                    self.exclude.ID
                )] if self.exclude is not None else phen_names
                self.keep_index = pd.Series(list(
                    self.mapper.dic.keys())).isin(phen_names)
            print(np.sum(self.keep_index))
            self.order, self.phen_names = _check(self.mapper,
                                                 self.keys,
                                                 index=self.keep_index)
            self.n_phenotypes = len(self.order[self.keys[0]])
            print('Loaded {} common phenotypes for meta-analysis'.format(
                self.n_phenotypes))
            self.processed = 0
        else:
            if not protocol.enable:
                protocol.parse()

        self.pool = {i: PhenPool(j.folder) for i, j in enumerate(phen)}

    def get(self):
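        # Return the next chunk of phenotypes (samples stacked across studies)
        # and their names, or (None, None) once everything has been processed.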
        if self.processed == self.n_phenotypes:
            return None, None
        else:
            start = self.processed
            finish = min(self.processed + self.chunk_size, self.n_phenotypes)
            self.processed = finish
            phenotype = np.zeros(
                (np.sum([len(self.pool[i].folder._data.id)
                         for i in self.pool]), finish - start))
        for i, j in enumerate(self.keys):

            if i == 0:
                a, b = 0, self.pool[j].folder._data.id.shape[0]
                phenotype[a:b, :] = self.pool[j].get_chunk(
                    self.order[j][start:finish])
                a = b
                N = np.random.randint(0, phenotype.shape[1], 10)
                print(phenotype.mean(axis=0)[N])
            else:
                ph_tmp = self.pool[j].get_chunk(self.order[j][start:finish])
                print(self.pool[j].folder.path)
                print(ph_tmp.mean(axis=0)[N])
                b += ph_tmp.shape[0]
                phenotype[a:b, :] = ph_tmp
                a = b

        return phenotype, self.phen_names[start:finish]
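
A minimal usage sketch for the chunked interface above (assuming `phen` is the
same list of per-study readers passed to the constructor; this driver loop is
illustrative and not part of the original example):

meta_phen = MetaPhenotype(phen)
while True:
    # get() returns (None, None) once every common phenotype has been served
    chunk, names = meta_phen.get()
    if chunk is None:
        break
    # ... process `chunk` (samples x phenotypes) for this batch of names ...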
Example #4
    def __init__(self, phen, protocol=None, include=None, exclude=None):
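        # Collect phenotype names from every study's files, optionally filter
        # them with include/exclude ID tables, and keep only the names common
        # to all studies.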
        # @timing
        def _check(mapper, keys, index=None):
            # Keep only the names present in every study and, if `index` is
            # given, only those flagged to keep.
            values = np.array(list(mapper.dic.values()))
            result = {}
            r = (values == -1).any(axis=1)
            if index is not None:
                r[np.where(index == False)[0]] = True
            if np.sum(r) == values.shape[0]:
                raise ValueError('There are no common names between studies')
            for i, k in enumerate(keys):
                result[k] = values[~r, i]
            return result, np.array(list(mapper.dic.keys()))[~r]

        self.chunk_size = 10000
        self.exclude = None
        self.include = None
        self.name = None
        self.mapper = Mapper()
        self.keys = []
        self.keep_index = None
        if include is not None:
            self.include = pd.read_csv(include)
            print('Include:')
            print(self.include.head())
            if 'ID' not in self.include.columns:
                raise ValueError(
                    '{} table does not have ID column for phenotypes'.format(
                        include))
        if exclude is not None:
            self.exclude = pd.read_csv(exclude)
            print('Exclude:')
            print(self.exclude.head())
            if 'ID' not in self.exclude.columns:
                raise ValueError(
                    '{} table does not have ID column for phenotypes'.format(
                        exclude))
        if protocol is None:
            for i, k in enumerate(phen):
                phen_names = []
                if i == 0:
                    for j in k.folder.files:
                        if j != 'info_dic.npy':
                            phen_names = phen_names + list(
                                k.folder.data_info[j])
                    self.mapper.fill(phen_names, i, reference=False)
                else:
                    for j in k.folder.files:
                        if j != 'info_dic.npy':
                            phen_names = phen_names + list(
                                k.folder.data_info[j])
                    self.mapper.push(phen_names, name=i, new_id=False)

                self.keys.append(i)
            if self.exclude is not None or self.include is not None:
                phen_names = pd.Series(list(self.mapper.dic.keys()))
                phen_names = phen_names[phen_names.isin(
                    self.include.ID
                )] if self.include is not None else phen_names
                phen_names = phen_names[~phen_names.isin(
                    self.exclude.ID
                )] if self.exclude is not None else phen_names
                self.keep_index = pd.Series(list(
                    self.mapper.dic.keys())).isin(phen_names)
            print(np.sum(self.keep_index))
            self.order, self.phen_names = _check(self.mapper,
                                                 self.keys,
                                                 index=self.keep_index)
            self.n_phenotypes = len(self.order[self.keys[0]])
            print('Loaded {} common phenotypes for meta-analysis'.format(
                self.n_phenotypes))
            self.processed = 0
        else:
            if not protocol.enable:
                protocol.parse()

        self.pool = {i: PhenPool(j.folder) for i, j in enumerate(phen)}
Example #5
                G.split_size = CONVERTER_SPLIT_SIZE
                G.VCF2hdf5(args.out)
            else:
                raise ValueError(
                    'Genotype data should be in PLINK/MINIMAC/VCF format '
                    'and be the only dataset in its folder')

        check_converter(args.out, args.study_name[0])
        print('Time to convert all data: {} sec'.format(t.secs))

    ################################### ENCODING ##############################

    elif args.mode == 'encoding':

        # ARG_CHECKER.check(args,mode='encoding')
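        # Encoding mode: load the reference mapper (including flip information)
        # and open phenotype and genotype readers for the first study.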
        mapper = Mapper()
        mapper.genotype_names = args.study_name
        mapper.chunk_size = MAPPER_CHUNK_SIZE
        mapper.reference_name = args.ref_name
        mapper.load_flip(args.mapper)
        mapper.load(args.mapper)

        phen = Reader('phenotype')
        phen.start(args.phenotype[0])

        gen = Reader('genotype')
        gen.start(args.genotype[0],
                  hdf5=args.hdf5,
                  study_name=args.study_name[0],
                  ID=False)