Ejemplo n.º 1
0
    def convert_genotypes(self):
        """Convert the genotype source to HDF5, one chunk of SNPs at a time.

        Blocks of ``self.split_size`` are pulled from
        ``self.reader.folder.get_bed`` until it returns ``None``. For every
        block ``self.write_data('gen')`` is called and the block is stored as
        an Int8 ``genotype`` carray under the root of ``self.h5_gen_file``,
        which is closed again before the next block is read.

        Raises:
            ValueError: if ``self.split_size`` is ``None``.
        """
        chunk_size = self.split_size
        if chunk_size is None:
            raise ValueError(
                'CONVERTER_SPLIT_SIZE does not define in config file!')

        while True:
            with Timer() as t:
                G = self.reader.folder.get_bed(chunk_size)
            if G is None:
                # The reader signals "no more data" with None.
                break

            print(('Time to read {} SNPs is {} s'.format(G.shape[0], t.secs)))

            # presumably opens self.h5_gen_file for this chunk -- confirm
            # against write_data
            self.write_data('gen')
            try:
                atom = tables.Int8Atom()
                self.genotype = self.h5_gen_file.create_carray(
                    self.h5_gen_file.root,
                    'genotype',
                    atom, (G.shape),
                    title='Genotype',
                    filters=self.pytable_filters)
                with Timer() as t:
                    self.genotype[:] = G

                print(('Time to write {} SNPs is {} s'.format(
                    G.shape[0], t.secs)))
            finally:
                # Close the file even when creating/writing the array fails,
                # so a failed chunk does not leave an open HDF5 handle behind.
                self.h5_gen_file.close()

            # Drop the chunk before reading the next one to limit peak memory.
            G = None
            gc.collect()
Ejemplo n.º 2
0
def hase_convert(args):
    """Convert the genotype data described by ``args`` into HDF5.

    A genotype ``Reader`` is started on ``args.genotype[0]``. Depending on
    the format it reports (PLINK, MINIMAC or VCF) the matching converter is
    created for ``args.study_name[0]``, given ``CONVERTER_SPLIT_SIZE`` as its
    split size and run with ``args.out`` as destination. Afterwards
    ``check_converter`` is called on the output and the elapsed conversion
    time is printed.

    Raises:
        ValueError: if the reader reports any other format.
    """
    reader = Reader('genotype')
    reader.start(args.genotype[0], vcf=args.vcf)

    with Timer() as timer:
        fmt = reader.format
        # Guard clause: reject unsupported formats before building anything.
        if fmt not in ('PLINK', 'MINIMAC', 'VCF'):
            raise ValueError(
                'Genotype data should be in PLINK/MINIMAC/VCF format and alone in folder'
            )

        study = args.study_name[0]
        if fmt == 'PLINK':
            converter = GenotypePLINK(study, reader=reader)
            converter.split_size = CONVERTER_SPLIT_SIZE
            converter.plink2hdf5(out=args.out)
        elif fmt == 'MINIMAC':
            converter = GenotypeMINIMAC(study, reader=reader)
            converter.split_size = CONVERTER_SPLIT_SIZE
            converter.MACH2hdf5(args.out, id=args.id)
        else:
            converter = GenotypeVCF(study, reader=reader)
            converter.split_size = CONVERTER_SPLIT_SIZE
            converter.VCF2hdf5(args.out)

    check_converter(args.out, args.study_name[0])
    print(('Time to convert all data: {} sec'.format(timer.secs)))
Ejemplo n.º 3
0
def hase_convert(args):
    """Convert genotype data to HDF5 unless it already looks converted.

    If ``args.outfolder`` already contains the ``probes``, ``genotype`` and
    ``individuals`` sub-folders, a notice is printed and nothing is converted.
    Otherwise a genotype ``Reader`` is started on ``args.genotype[0]``, the
    converter matching its format (PLINK, MINIMAC or VCF) is run with
    ``args.out`` as destination, ``check_converter`` is called and
    ``args.outfolder`` is set to ``args.genotype``.

    Raises:
        ValueError: if the reader reports any other format.
    """
    expected_subfolders = ('/probes/', '/genotype/', '/individuals/')
    already_converted = all(
        os.path.exists(args.outfolder + sub) for sub in expected_subfolders)
    if already_converted:
        print(
            "The folders: probes, genotype and individuals already exist. Data seems already in HASE format. Delete "
            "the folders if the files are not converted properly. Continuing with the current files:"
        )
        return

    print('using', args.outfolder)

    reader = Reader('genotype')
    reader.start(args.genotype[0], vcf=args.vcf)

    with Timer() as timer:
        fmt = reader.format
        # Guard clause: reject unsupported formats before building anything.
        if fmt not in ('PLINK', 'MINIMAC', 'VCF'):
            raise ValueError(
                'Genotype data should be in PLINK/MINIMAC/VCF format and alone in folder'
            )

        study = args.study_name[0]
        if fmt == 'PLINK':
            converter = GenotypePLINK(study, reader=reader)
            converter.split_size = CONVERTER_SPLIT_SIZE
            converter.plink2hdf5(out=args.out)
        elif fmt == 'MINIMAC':
            converter = GenotypeMINIMAC(study, reader=reader)
            converter.split_size = CONVERTER_SPLIT_SIZE
            converter.MACH2hdf5(args.out, id=args.id)
        else:
            converter = GenotypeVCF(study, reader=reader)
            converter.split_size = CONVERTER_SPLIT_SIZE
            converter.VCF2hdf5(args.out)

    check_converter(args.out, args.study_name[0])
    # NOTE(review): args.genotype is indexed with [0] above, so this stores
    # the whole sequence in args.outfolder -- confirm that is intended.
    args.outfolder = args.genotype
    print(('Time to convert all data: {} sec'.format(timer.secs)))
Ejemplo n.º 4
0
def HASE(b4, A_inverse, b_cov, C, N_con, DF):
    """Compute t-statistics and standard errors from precomputed matrices.

    Args:
        b4: 2-D array; axis 0 is aligned with axis 0 of ``A_inverse`` and
            axis 1 is reported as the number of phenotypes.
        A_inverse: 3-D array; axis 0 is reported as the number of SNPs. The
            first ``N_con`` entries of the last two axes are treated as the
            constant terms, entry ``N_con`` as the tested term.
        b_cov: 2-D array contracted with the constant part of ``A_inverse``.
        C: array that is reshaped to a single row and subtracted per
            phenotype.
        N_con: number of constant terms.
        DF: degrees of freedom used to scale the result.

    Returns:
        Tuple ``(t_stat, SE)``, both with a singleton middle axis because
        they are derived from the single tested row ``N_con``.
    """
    with Timer() as timer:
        constant = slice(0, N_con)
        tested = slice(N_con, N_con + 1)

        # A^-1 . B, split into the constant block and the tested column.
        a1b_constant = np.tensordot(A_inverse[:, :, constant],
                                    b_cov,
                                    axes=([2], [0]))
        a1b_tested = np.einsum('ijk,il->ijl', A_inverse[:, :, tested], b4)
        a1b = a1b_constant + a1b_tested

        # B^T . A^-1 . B, again as constant part plus tested part.
        quad_constant = np.einsum('ij,lji->li', b_cov.T, a1b[:, constant, :])
        quad_tested = np.einsum('ijk,ijk->ijk', b4[:, None, :],
                                a1b[:, tested, :])
        quad = quad_constant[:, None, :] + quad_tested

        # |B^T A^-1 B - C| scaled by the tested diagonal entry of A^-1.
        residual = np.abs(quad - C.reshape(1, -1))
        scale = np.sqrt(residual * A_inverse[:, tested, tested])

        root_df = np.sqrt(DF)
        t_stat = root_df * (a1b[:, tested, :] / scale)
        SE = scale / np.sqrt(DF)

    print("time to compute GWAS for {} phenotypes and {} SNPs .... {} sec".
          format(b4.shape[1], A_inverse.shape[0], timer.secs))
    return t_stat, SE
Ejemplo n.º 5
0
def haseregression(phen, gen, cov, mapper, Analyser, maf, intercept=True, interaction=None):
    """Run the HASE regression for every genotype chunk against every phenotype chunk.

    For each genotype chunk returned by ``merge_genotype`` the SNPs are
    optionally filtered by allele frequency, then every phenotype chunk from
    ``phen.get_next`` is analysed with ``HASE`` and the result is handed to
    ``Analyser.save_result``.

    Args:
        phen: phenotype reader (``get_next``, ``folder``, ``permutation``).
        gen: iterable of genotype readers, passed on to ``merge_genotype``.
        cov: covariate reader (``get_next``, ``folder``).
        mapper: SNP mapper, or ``None`` to read genotypes without a mapping.
        Analyser: object that receives ``rsid``, ``MAF``, ``t_stat``, ``SE``
            and cluster/permutation flags and stores the results.
        maf: frequency threshold; ``0`` disables the filter.
        intercept: forwarded to the matrix-building helpers.
        interaction: accepted but currently has no effect.
    """
    genotype_data = tuple(study.folder._data for study in gen)

    row_index, ids = study_indexes(phenotype=phen.folder._data,
                                   genotype=genotype_data,
                                   covariates=cov.folder._data)

    # Progress counters: [SNPs read, SNPs analysed, total expected].
    SNP = [0, 0, mapper.n_keys if mapper is not None else 'unknown']

    covariates = cov.get_next(index=row_index[2])
    a_cov = A_covariates(covariates, intercept=intercept)

    while True:
        gc.collect()
        SNPs_index = None
        if mapper is not None:
            if mapper.cluster == 'n':
                SNPs_index, keys = mapper.get()
            else:
                ch = mapper.chunk_pop()
                if ch is None:
                    break
                SNPs_index, keys = mapper.get(chunk_number=ch)
            if SNPs_index is None:
                break
            Analyser.rsid = keys

        with Timer() as timer:
            genotype = merge_genotype(gen, SNPs_index, mapper)
        print(('time to read and merge genotype {}s'.format(timer.secs)))
        gc.collect()
        if genotype is None:
            print('All genotype processed!')
            break
        SNP[0] += genotype.shape[0]
        genotype = genotype[:, row_index[0]]

        if mapper is None:
            # Without a mapper the SNPs are identified by their row number.
            Analyser.rsid = np.array(list(range(genotype.shape[0])))

        MAF = np.mean(genotype, axis=1) / 2
        # Result is not used below; kept so the computation is unchanged.
        STD = np.std(genotype, axis=1)

        if maf == 0:
            Analyser.MAF = MAF
        else:
            keep = (MAF > maf) & (MAF < 1 - maf) & (MAF != 0.5)
            genotype = genotype[keep, :]
            Analyser.MAF = MAF[keep]
            Analyser.rsid = Analyser.rsid[keep]

            if genotype.shape[0] == 0:
                print('NO SNPs > MAF')
                continue

        SNP[1] += genotype.shape[0]

        while True:
            phenotype = phen.get_next(index=row_index[1])

            if phenotype is None:
                # Rewind the phenotype reader for the next genotype chunk.
                phen.folder.processed = 0
                print('All phenotypes processed!')
                break

            if phen.permutation:
                np.random.shuffle(phenotype)

            b_cov = B_covariates(covariates, phenotype, intercept=intercept)
            C = C_matrix(phenotype)

            if interaction is not None:
                # Interaction terms are not implemented.
                pass

            a_test = A_tests(covariates, genotype, intercept=intercept)
            a_inv = A_inverse(a_cov, a_test)

            N_con = a_inv.shape[1] - 1
            DF = phenotype.shape[0] - a_inv.shape[1]

            b4 = B4(phenotype, genotype)

            t_stat, SE = HASE(b4, a_inv, b_cov, C, N_con, DF)
            print(('Read {}, processed {}, total {}'.format(SNP[0], SNP[1], SNP[2])))

            Analyser.t_stat = t_stat
            Analyser.SE = SE
            if mapper is not None and mapper.cluster == 'y':
                Analyser.cluster = True
                Analyser.chunk = ch
                Analyser.node = mapper.node[1]
            if phen.permutation:
                Analyser.permutation = True

            data = phen.folder._data
            Analyser.save_result(data.names[data.start:data.finish])

            # Release the large intermediates before the next phenotype chunk.
            t_stat = None
            Analyser.t_stat = None
            del b4, C, b_cov, a_inv, a_test, t_stat
            gc.collect()

    if Analyser.cluster:
        np.save(os.path.join(Analyser.out, str(Analyser.node) + '_node_RSID.npy'), Analyser.rsid_dic)
Ejemplo n.º 6
0
                print('********************************')

            print('r', r)
            if p == 0:
                ID = np.append(ID, b.ID)

            b['counter_ref'] = np.arange(counter_ref,
                                         counter_ref + b.shape[0],
                                         dtype='int32')
            counter_ref += b.shape[0]

            if len(match_index) or len(flip_index):
                print('matched {}'.format(match_index.shape[0]))
                print('flipped {}'.format(flip_index.shape[0]))
                if del_counter_ref.get(r) is not None:
                    with Timer() as t:
                        b = b[~b.counter_ref.isin(del_counter_ref[r])]
                    print('time {}'.format(t.secs))

            match_df = pd.merge(b,
                                a,
                                left_on=merge['straight'],
                                right_on=merge['straight'])
            flip_df = pd.merge(b[~b.counter_ref.isin(match_df.counter_ref)],
                               a,
                               left_on=merge['reverse'],
                               right_on=merge['straight'])

            if len(match_df):
                match_key = np.append(match_key, match_df.counter_ref)
                match_index = np.append(match_index, match_df.counter_prob)
Ejemplo n.º 7
0
def partial_derivatives(save_path=None,
                        COV=None,
                        PHEN=None,
                        GEN=None,
                        MAP=None,
                        MAF=None,
                        R2=None,
                        B4_flag=False,
                        study_name=None,
                        intercept=True):
    """Compute the partial-derivative matrices of one study and save them as ``.npy`` files.

    Written to ``save_path`` (file names prefixed with ``study_name``):
    ``_a_cov``, ``_b_cov`` and ``_C`` from the covariates and phenotypes,
    ``_a_test`` and ``_metadata`` from the genotypes, and ``_b4`` when
    ``B4_flag`` is set. In cluster mode (``MAP.cluster == 'y'``) the
    genotype-derived files get a ``node_<n>_`` prefix and the last node calls
    ``merge_PD``.

    Args:
        save_path: directory the ``.npy`` files are written to.
        COV: covariate reader (``get_next``, ``folder._data``).
        PHEN: phenotype reader (``get_next``, ``folder``).
        GEN: genotype reader (``get_next``, ``folder``).
        MAP: mapper providing ``cluster``, ``node`` and ``flip``.
        MAF: not used in this function body.
        R2: not used in this function body.
        B4_flag: also compute and save ``b4``; switched off automatically
            when the phenotypes are spread over more than one file.
        study_name: prefix for output file names and covariate names.
        intercept: forwarded to the matrix-building helpers.
    """
    row_index, ids = study_indexes(phenotype=PHEN.folder._data,
                                   genotype=GEN.folder._data,
                                   covariates=COV.folder._data)

    metadata = {}

    # TODO (mid) add parameter to compute PD only for new phenotypes or cov
    metadata['id'] = ids
    metadata['MAF'] = []
    metadata['filter'] = []
    metadata['names'] = []  # TODO (low) change to cov_names
    metadata['phenotype'] = []
    b_cov = []
    C = []
    a_test = []
    b4 = []

    covariates = COV.get_next(index=row_index[2])

    # The covariate/phenotype part runs only when not clustered, or on node 1.
    if MAP.cluster == 'n' or MAP.node[1] == 1:
        if intercept:
            metadata['names'].append(study_name + '_intercept')
        metadata['names'] = metadata['names'] + [
            study_name + '_' + i for i in COV.folder._data.get_names()
        ]

        a_cov = A_covariates(covariates, intercept=intercept)
        np.save(os.path.join(save_path, study_name + '_a_cov.npy'), a_cov)

        with Timer() as t_phen:

            while True:

                phenotype = PHEN.get_next(index=row_index[1])
                # None marks the end of the phenotype data: merge the
                # per-chunk results, save them and leave the loop.
                if isinstance(phenotype, type(None)):
                    b_cov = np.concatenate(b_cov, axis=1)
                    C = np.concatenate(C, axis=0)
                    np.save(os.path.join(save_path, study_name + '_b_cov.npy'),
                            b_cov)
                    np.save(os.path.join(save_path, study_name + '_C.npy'), C)
                    break

                metadata['phenotype'] = metadata['phenotype'] + list(
                    PHEN.folder._data.get_names())
                b_cov.append(
                    B_covariates(covariates, phenotype, intercept=intercept))
                C.append(C_matrix(phenotype))

        print(('Time to PD phenotype {} is {} s'.format(
            np.array(C).shape, t_phen.secs)))

    if MAP.cluster == 'y':
        # File names start with an integer index; f_max is the largest one.
        f_max = np.max([int(f.split('_')[0]) for f in GEN.folder.files])
        # Indices 0..f_max are split into MAP.node[0] groups; this node takes
        # group MAP.node[1] - 1 (node numbers are 1-based). The list is
        # reversed so that pop() below returns the files in ascending order.
        files2read = [
            '{}_{}.h5'.format(i, study_name)
            for i in np.array_split(list(range(f_max +
                                               1)), MAP.node[0])[MAP.node[1] -
                                                                 1]
        ][::-1]
        # Files assigned to the nodes before this one.
        filesdone = []
        for i in range(MAP.node[1] - 1):
            filesdone = filesdone + [
                '{}_{}.h5'.format(i, study_name)
                for i in np.array_split(list(range(f_max + 1)), MAP.node[0])[i]
            ]

        # Number of SNPs in those earlier files; used as the starting offset
        # into MAP.flip below.
        N_snps_read = 0
        for f in filesdone:
            file = os.path.join(GEN.folder.path, 'genotype', f)
            N_snps_read += GEN.folder.get_info(file)['shape'][0]
    else:
        N_snps_read = 0
    while True:
        with Timer() as t_gen:
            if MAP.cluster == 'y':
                if len(files2read) != 0:
                    file = os.path.join(GEN.folder.path, 'genotype',
                                        files2read.pop())
                    genotype = GEN.folder.read(file)
                else:
                    genotype = None
            else:
                genotype = GEN.get_next()
            # No more genotype data: save everything collected and stop.
            # NOTE(review): np.concatenate raises ValueError on an empty list,
            # so this presumably expects at least one genotype chunk -- confirm.
            if isinstance(genotype, type(None)):
                if MAP.cluster == 'y':

                    np.save(
                        os.path.join(
                            save_path, 'node_{}_'.format(MAP.node[1]) +
                            study_name + '_a_test.npy'),
                        np.concatenate(a_test).astype(np.float64))
                    np.save(
                        os.path.join(
                            save_path, 'node_{}_'.format(MAP.node[1]) +
                            study_name + '_metadata.npy'), metadata)
                    if B4_flag:
                        b4 = np.concatenate(b4, axis=0)
                        np.save(
                            os.path.join(
                                save_path, 'node_{}_'.format(MAP.node[1]) +
                                study_name + '_b4.npy'), b4.astype(np.float64))
                    # The last node merges the per-node files.
                    if MAP.node[1] == MAP.node[0]:
                        merge_PD(save_path, MAP.node[0], study_name)

                else:
                    np.save(
                        os.path.join(save_path, study_name + '_a_test.npy'),
                        np.concatenate(a_test))
                    np.save(
                        os.path.join(save_path, study_name + '_metadata.npy'),
                        metadata)
                    if B4_flag:
                        b4 = np.concatenate(b4, axis=0)
                        np.save(
                            os.path.join(save_path, study_name + '_b4.npy'),
                            b4)
                break
            # Flip values for exactly the SNP rows of this chunk.
            flip = MAP.flip[GEN.folder.name][N_snps_read:N_snps_read +
                                             genotype.shape[0]]
            N_snps_read += genotype.shape[0]
            flip_index = (flip == -1)
            # Applied per column: rows with flip == -1 become 2 - x, the other
            # rows are multiplied by their flip value (presumably 1 -- confirm).
            genotype = np.apply_along_axis(
                lambda x: flip * (x - 2 * flip_index), 0, genotype)
            genotype = genotype[:, row_index[0]]
            maf = np.mean(genotype, axis=1) / 2
            metadata['MAF'] = metadata['MAF'] + list(maf)

            # TODO (low) add interaction
            a_test.append(A_tests(covariates, genotype, intercept=intercept))

            if B4_flag:
                # works only when all phenotypes in one chunk, if not, do not use this option!
                # it would use to much disk space anyway
                if len([f for f in PHEN.folder.files if f != 'info_dic.npy'
                        ]) > 1:
                    print('pd_full flag disabled!')
                    B4_flag = False
                    # Skips the timing print below for this chunk.
                    continue
                # Rewind the phenotype reader so the single chunk is read again.
                PHEN.folder.processed = 0
                phenotype = PHEN.get_next(index=row_index[1])
                b4.append(B4(phenotype, genotype))

        print(('Time to PD genotype {} is {} s'.format(genotype.shape,
                                                       t_gen.secs)))
Ejemplo n.º 8
0
    if not os.path.isdir(args.out):
        print("Creating output folder {}".format(args.out))
        os.mkdir(args.out)

    if args.np:
        check_np()

    ################################### CONVERTING ##############################
    if args.mode == 'converting':

        # ARG_CHECKER.check(args,mode='converting')

        R = Reader('genotype')
        R.start(args.genotype[0], vcf=args.vcf)

        with Timer() as t:
            if R.format == 'PLINK':
                G = GenotypePLINK(args.study_name[0], reader=R)
                G.split_size = CONVERTER_SPLIT_SIZE
                G.plink2hdf5(out=args.out)

            elif R.format == 'MINIMAC':
                G = GenotypeMINIMAC(args.study_name[0], reader=R)
                if args.cluster == 'y':
                    G.cluster = True
                G.split_size = CONVERTER_SPLIT_SIZE
                G.MACH2hdf5(args.out, id=args.id)

            elif R.format == 'VCF':
                G = GenotypeVCF(args.study_name[0], reader=R)
                if args.cluster == 'y':