def print_subsets(cls, outfilename, snpsubsets, names, add_other=False):
    """Write a gzipped, tab-separated 0/1 annotation table for SNP subsets.

    The SNP metadata (CHR, SNP, CM, BP) is read from the ``.bim`` file that
    sits next to the dataset's genotype bed file.  One indicator column is
    added per subset: rows whose index appears in the subset's ``irs`` get
    1, all other rows get 0.  Output columns are CHR, BP, SNP, CM followed
    by the subset columns in the order given by ``names``.

    Args:
        cls: Not used by the body.
            NOTE(review): presumably this is a classmethod whose decorator
            is outside the visible source -- confirm.
        outfilename: Destination path; must end with ``.gz``.
        snpsubsets: Sequence of subset objects exposing ``.dataset`` and
            ``.irs`` (an iterable of integer row indices).  All subsets
            must share the same dataset.  Not modified.
        names: Column name for each subset, in the same order as
            ``snpsubsets``.  Not modified.
        add_other: If true, append an extra ``OTHER`` column marking every
            SNP of the dataset that is in none of the given subsets.

    Returns:
        None.  Invalid input (no subsets, mixed datasets, filename without
        a ``.gz`` suffix) is reported with ``print`` and nothing is written.
    """
    def snp_info_df(d):
        """Load the first four columns of the dataset's .bim file."""
        bfile = d.genotypes_bedfile.filename
        # sep=r'\s+' is the supported replacement for the deprecated
        # delim_whitespace=True and parses the file identically.
        return pd.read_csv(
            bfile + '.bim',
            sep=r'\s+',
            usecols=[0, 1, 2, 3],
            names=['CHR', 'SNP', 'CM', 'BP'])

    # Work on copies: the 'OTHER' handling below appends to both lists and
    # must not leak into the caller's objects.
    snpsubsets = list(snpsubsets)
    names = list(names)

    # check that all snpsubsets have the same data set
    if len({ss.dataset for ss in snpsubsets}) > 1:
        print('error: all subsets must have the same underlying dataset')
        return
    if not outfilename.endswith('.gz'):
        print('outfilename must end with ".gz". I only write zipped files')
        return
    # Without at least one subset there is no dataset to read SNP info from.
    if not snpsubsets:
        print('error: at least one subset is required')
        return

    # get snp info for this dataset
    d = snpsubsets[0].dataset
    df = snp_info_df(d)

    # add the 'other' annotation if necessary
    if add_other:
        union = IntRangeSet()
        for ss in snpsubsets:
            union.update(ss.irs)
        snpsubsets.append(SnpSubset(d, irs=d.all_snps() - union))
        names.append('OTHER')

    # create the pandas dataframe and output it
    for name, ss in zip(names, snpsubsets):
        df[name] = 0
        # DataFrame.ix was removed in pandas 1.0.  The frame carries the
        # default integer index, so label-based .loc selects the same rows.
        df.loc[list(ss.irs), name] = 1
    df = df[['CHR', 'BP', 'SNP', 'CM'] + names]
    with gzip.open(outfilename, 'wt') as write_file:
        df.to_csv(write_file, index=False, sep='\t')