Esempio n. 1
0
    def print_subsets(cls, outfilename, snpsubsets, names, add_other=False):
        def snp_info_df(d):
            bfile = d.genotypes_bedfile.filename
            return pd.read_csv(bfile + '.bim',
                    delim_whitespace=True,
                    usecols=[0,1,2,3],
                    names=['CHR','SNP','CM','BP'])

        # check that all snpsubsets have the same data set
        if len(set([ss.dataset for ss in snpsubsets])) > 1:
            print('error: all subsets must have the same underlying dataset')
            return
        if not outfilename.endswith('.gz'):
            print('outfilename must end with ".gz". I only write zipped files')
            return

        # get snp info for this dataset
        d = snpsubsets[0].dataset
        df = snp_info_df(d)

        # add the 'other' annotation if necessary
        if add_other:
            union = IntRangeSet()
            for ss in snpsubsets:
                union.update(ss.irs)
            snpsubsets.append(SnpSubset(d, irs=d.all_snps() - union))
            names.append('OTHER')

        # create the pandas dataframe and output it
        for name, ss in zip(names, snpsubsets):
            df[name] = 0
            df.ix[[i for i in ss.irs], name] = 1
        df = df[['CHR','BP','SNP','CM'] + names]
        with gzip.open(outfilename, 'wt') as write_file:
            df.to_csv(write_file, index=False, sep='\t')