def munge_sumstats(args, p=True): if args.out is None: raise ValueError('The --out flag is required.') START_TIME = time.time() log = Logger(args.out + '.log') try: if args.sumstats is None: raise ValueError('The --sumstats flag is required.') if args.no_alleles and args.merge_alleles: raise ValueError( '--no-alleles and --merge-alleles are not compatible.') if p: defaults = vars(parser.parse_args('')) opts = vars(args) non_defaults = [x for x in opts.keys() if opts[x] != defaults[x]] header = MASTHEAD header += "Call: \n" header += './munge_sumstats.py \\\n' options = [ '--' + x.replace('_', '-') + ' ' + str(opts[x]) + ' \\' for x in non_defaults ] header += '\n'.join(options).replace('True', '').replace('False', '') header = header[0:-1] + '\n' log.log(header) file_cnames = read_header(args.sumstats) # note keys not cleaned flag_cnames, signed_sumstat_null = parse_flag_cnames(log, args) if args.ignore: ignore_cnames = [clean_header(x) for x in args.ignore.split(',')] else: ignore_cnames = [] # remove LOG_ODDS, BETA, Z, OR from the default list if args.signed_sumstats is not None or args.a1_inc: mod_default_cnames = { x: default_cnames[x] for x in default_cnames if default_cnames[x] not in null_values } else: mod_default_cnames = default_cnames cname_map = get_cname_map(flag_cnames, mod_default_cnames, ignore_cnames) if args.daner: frq_u = filter(lambda x: x.startswith('FRQ_U_'), file_cnames)[0] frq_a = filter(lambda x: x.startswith('FRQ_A_'), file_cnames)[0] N_cas = float(frq_a[6:]) N_con = float(frq_u[6:]) log.log( 'Inferred that N_cas = {N1}, N_con = {N2} from the FRQ_[A/U] columns.' .format(N1=N_cas, N2=N_con)) args.N_cas = N_cas args.N_con = N_con # drop any N, N_cas, N_con or FRQ columns for c in ['N', 'N_CAS', 'N_CON', 'FRQ']: for d in [x for x in cname_map if cname_map[x] == 'c']: del cname_map[d] cname_map[frq_u] = 'FRQ' cname_translation = { x: cname_map[clean_header(x)] for x in file_cnames if clean_header(x) in cname_map } # note keys not cleaned cname_description = { x: describe_cname[cname_translation[x]] for x in cname_translation } if args.signed_sumstats is None and not args.a1_inc: sign_cnames = [ x for x in cname_translation if cname_translation[x] in null_values ] if len(sign_cnames) > 1: raise ValueError( 'Too many signed sumstat columns. Specify which to ignore with the --ignore flag.' ) if len(sign_cnames) == 0: raise ValueError( 'Could not find a signed summary statistic column.') sign_cname = sign_cnames[0] signed_sumstat_null = null_values[cname_translation[sign_cname]] cname_translation[sign_cname] = 'SIGNED_SUMSTAT' else: sign_cname = 'SIGNED_SUMSTATS' # check that we have all the columns we need if not args.a1_inc: req_cols = ['SNP', 'P', 'SIGNED_SUMSTAT'] else: req_cols = ['SNP', 'P'] for c in req_cols: if c not in cname_translation.values(): raise ValueError('Could not find {C} column.'.format(C=c)) if (not args.N) and (not (args.N_cas and args.N_con)) and ('N' not in cname_translation.values()) and\ (any(x not in cname_translation.values() for x in ['N_CAS', 'N_CON'])): raise ValueError('Could not determine N.') if ('N' in cname_translation.values() or all(x in cname_translation.values() for x in ['N_CAS', 'N_CON']))\ and 'NSTUDY' in cname_translation.values(): nstudy = [ x for x in cname_translation if cname_translation[x] == 'NSTUDY' ] for x in nstudy: del cname_translation[x] if not args.no_alleles and not all(x in cname_translation.values() for x in ['A1', 'A2']): raise ValueError('Could not find A1/A2 columns.') log.log('Interpreting column names as follows:') log.log('\n'.join( [x + ':\t' + cname_description[x] for x in cname_description]) + '\n') if args.merge_alleles: log.log('Reading list of SNPs for allele merge from {F}'.format( F=args.merge_alleles)) (openfunc, compression) = get_compression(args.merge_alleles) merge_alleles = pd.read_csv(args.merge_alleles, compression=compression, header=0, delim_whitespace=True, na_values='.') if any(x not in merge_alleles.columns for x in ["SNP", "A1", "A2"]): raise ValueError( '--merge-alleles must have columns SNP, A1, A2.') log.log( 'Read {N} SNPs for allele merge.'.format(N=len(merge_alleles))) merge_alleles['MA'] = (merge_alleles.A1 + merge_alleles.A2).apply(lambda y: y.upper()) merge_alleles.drop( [x for x in merge_alleles.columns if x not in ['SNP', 'MA']], axis=1, inplace=True) else: merge_alleles = None (openfunc, compression) = get_compression(args.sumstats) # figure out which columns are going to involve sign information, so we can ensure # they're read as floats signed_sumstat_cols = [ k for k, v in cname_translation.items() if v == 'SIGNED_SUMSTAT' ] dat_gen = pd.read_csv( args.sumstats, delim_whitespace=True, header=0, compression=compression, usecols=cname_translation.keys(), na_values=['.', 'NA'], iterator=True, chunksize=args.chunksize, dtype={c: np.float64 for c in signed_sumstat_cols}) dat = parse_dat(dat_gen, cname_translation, merge_alleles, log, args) if len(dat) == 0: raise ValueError('After applying filters, no SNPs remain.') old = len(dat) dat = dat.drop_duplicates(subset='SNP').reset_index(drop=True) new = len(dat) log.log( 'Removed {M} SNPs with duplicated rs numbers ({N} SNPs remain).'. format(M=old - new, N=new)) # filtering on N cannot be done chunkwise dat = process_n(dat, args, log) dat.P = p_to_z(dat.P, dat.N) dat.rename(columns={'P': 'Z'}, inplace=True) if not args.a1_inc: log.log( check_median(dat.SIGNED_SUMSTAT, signed_sumstat_null, 0.1, sign_cname)) dat.Z *= (-1)**(dat.SIGNED_SUMSTAT < signed_sumstat_null) dat.drop('SIGNED_SUMSTAT', inplace=True, axis=1) # do this last so we don't have to worry about NA values in the rest of # the program if args.merge_alleles: dat = allele_merge(dat, merge_alleles, log) out_fname = args.out + '.sumstats' print_colnames = [ c for c in dat.columns if c in ['SNP', 'N', 'Z', 'A1', 'A2'] ] if args.keep_maf and 'FRQ' in dat.columns: print_colnames.append('FRQ') msg = 'Writing summary statistics for {M} SNPs ({N} with nonmissing beta) to {F}.' log.log( msg.format(M=len(dat), F=out_fname + '.gz', N=dat.N.notnull().sum())) if p: dat.to_csv(out_fname, sep="\t", index=False, columns=print_colnames, float_format='%.3f') os.system('gzip -f {F}'.format(F=out_fname)) log.log('\nMetadata:') CHISQ = (dat.Z**2) mean_chisq = CHISQ.mean() log.log('Mean chi^2 = ' + str(round(mean_chisq, 3))) if mean_chisq < 1.02: log.log("WARNING: mean chi^2 may be too small.") log.log('Lambda GC = ' + str(round(CHISQ.median() / 0.4549, 3))) log.log('Max chi^2 = ' + str(round(CHISQ.max(), 3))) log.log( '{N} Genome-wide significant SNPs (some may have been removed by filtering).' .format(N=(CHISQ > 29).sum())) return dat except Exception: log.log('\nERROR converting summary statistics:\n') ex_type, ex, tb = sys.exc_info() log.log(traceback.format_exc(ex)) raise finally: log.log('\nConversion finished at {T}'.format(T=time.ctime())) log.log('Total time elapsed: {T}'.format( T=sec_to_str(round(time.time() - START_TIME, 2))))
def munge_sumstats(args, p=True): if args.out is None: raise ValueError('The --out flag is required.') START_TIME = time.time() log = Logger(args.out + '.log') try: if args.sumstats is None: raise ValueError('The --sumstats flag is required.') if args.no_alleles and args.merge_alleles: raise ValueError( '--no-alleles and --merge-alleles are not compatible.') if p: defaults = vars(parser.parse_args('')) opts = vars(args) non_defaults = [x for x in opts.keys() if opts[x] != defaults[x]] header = MASTHEAD header += "Call: \n" header += './munge_sumstats.py \\\n' options = ['--'+x.replace('_','-')+' '+str(opts[x])+' \\' for x in non_defaults] header += '\n'.join(options).replace('True','').replace('False','') header = header[0:-1]+'\n' log.log(header) file_cnames = read_header(args.sumstats) # note keys not cleaned flag_cnames, signed_sumstat_null = parse_flag_cnames(log, args) if args.ignore: ignore_cnames = [clean_header(x) for x in args.ignore.split(',')] else: ignore_cnames = [] # remove LOG_ODDS, BETA, Z, OR from the default list if args.signed_sumstats is not None or args.a1_inc: mod_default_cnames = {x: default_cnames[ x] for x in default_cnames if default_cnames[x] not in null_values} else: mod_default_cnames = default_cnames cname_map = get_cname_map( flag_cnames, mod_default_cnames, ignore_cnames) if args.daner: frq_u = filter(lambda x: x.startswith('FRQ_U_'), file_cnames)[0] frq_a = filter(lambda x: x.startswith('FRQ_A_'), file_cnames)[0] N_cas = float(frq_a[6:]) N_con = float(frq_u[6:]) log.log( 'Inferred that N_cas = {N1}, N_con = {N2} from the FRQ_[A/U] columns.'.format(N1=N_cas, N2=N_con)) args.N_cas = N_cas args.N_con = N_con # drop any N, N_cas, N_con or FRQ columns for c in ['N', 'N_CAS', 'N_CON', 'FRQ']: for d in [x for x in cname_map if cname_map[x] == 'c']: del cname_map[d] cname_map[frq_u] = 'FRQ' cname_translation = {x: cname_map[clean_header(x)] for x in file_cnames if clean_header(x) in cname_map} # note keys not cleaned cname_description = { x: describe_cname[cname_translation[x]] for x in cname_translation} if args.signed_sumstats is None and not args.a1_inc: sign_cnames = [ x for x in cname_translation if cname_translation[x] in null_values] if len(sign_cnames) > 1: raise ValueError( 'Too many signed sumstat columns. Specify which to ignore with the --ignore flag.') if len(sign_cnames) == 0: raise ValueError( 'Could not find a signed summary statistic column.') sign_cname = sign_cnames[0] signed_sumstat_null = null_values[cname_translation[sign_cname]] cname_translation[sign_cname] = 'SIGNED_SUMSTAT' else: sign_cname = 'SIGNED_SUMSTATS' # check that we have all the columns we need if not args.a1_inc: req_cols = ['SNP', 'P', 'SIGNED_SUMSTAT'] else: req_cols = ['SNP', 'P'] for c in req_cols: if c not in cname_translation.values(): raise ValueError('Could not find {C} column.'.format(C=c)) if (not args.N) and (not (args.N_cas and args.N_con)) and ('N' not in cname_translation.values()) and\ (any(x not in cname_translation.values() for x in ['N_CAS', 'N_CON'])): raise ValueError('Could not determine N.') if ('N' in cname_translation.values() or all(x in cname_translation.values() for x in ['N_CAS', 'N_CON']))\ and 'NSTUDY' in cname_translation.values(): nstudy = [ x for x in cname_translation if cname_translation[x] == 'NSTUDY'] for x in nstudy: del cname_translation[x] if not args.no_alleles and not all(x in cname_translation.values() for x in ['A1', 'A2']): raise ValueError('Could not find A1/A2 columns.') log.log('Interpreting column names as follows:') log.log('\n'.join([x + ':\t' + cname_description[x] for x in cname_description]) + '\n') if args.merge_alleles: log.log( 'Reading list of SNPs for allele merge from {F}'.format(F=args.merge_alleles)) (openfunc, compression) = get_compression(args.merge_alleles) merge_alleles = pd.read_csv(args.merge_alleles, compression=compression, header=0, delim_whitespace=True, na_values='.') if any(x not in merge_alleles.columns for x in ["SNP", "A1", "A2"]): raise ValueError( '--merge-alleles must have columns SNP, A1, A2.') log.log( 'Read {N} SNPs for allele merge.'.format(N=len(merge_alleles))) merge_alleles['MA'] = ( merge_alleles.A1 + merge_alleles.A2).apply(lambda y: y.upper()) merge_alleles.drop( [x for x in merge_alleles.columns if x not in ['SNP', 'MA']], axis=1, inplace=True) else: merge_alleles = None (openfunc, compression) = get_compression(args.sumstats) dat_gen = pd.read_csv(args.sumstats, delim_whitespace=True, header=0, compression=compression, usecols=cname_translation.keys(), na_values=['.', 'NA'], iterator=True, chunksize=args.chunksize) dat = parse_dat(dat_gen, cname_translation, merge_alleles, log, args) if len(dat) == 0: raise ValueError('After applying filters, no SNPs remain.') old = len(dat) dat = dat.drop_duplicates(subset='SNP').reset_index(drop=True) new = len(dat) log.log('Removed {M} SNPs with duplicated rs numbers ({N} SNPs remain).'.format( M=old - new, N=new)) # filtering on N cannot be done chunkwise dat = process_n(dat, args, log) dat.P = p_to_z(dat.P, dat.N) dat.rename(columns={'P': 'Z'}, inplace=True) if not args.a1_inc: log.log( check_median(dat.SIGNED_SUMSTAT, signed_sumstat_null, 0.1, sign_cname)) dat.Z *= (-1) ** (dat.SIGNED_SUMSTAT < signed_sumstat_null) dat.drop('SIGNED_SUMSTAT', inplace=True, axis=1) # do this last so we don't have to worry about NA values in the rest of # the program if args.merge_alleles: dat = allele_merge(dat, merge_alleles, log) out_fname = args.out + '.sumstats' print_colnames = [ c for c in dat.columns if c in ['SNP', 'N', 'Z', 'A1', 'A2']] if args.keep_maf and 'FRQ' in dat.columns: print_colnames.append('FRQ') msg = 'Writing summary statistics for {M} SNPs ({N} with nonmissing beta) to {F}.' log.log( msg.format(M=len(dat), F=out_fname + '.gz', N=dat.N.notnull().sum())) if p: dat.to_csv(out_fname, sep="\t", index=False, columns=print_colnames, float_format='%.3f') os.system('gzip -f {F}'.format(F=out_fname)) log.log('\nMetadata:') CHISQ = (dat.Z ** 2) mean_chisq = CHISQ.mean() log.log('Mean chi^2 = ' + str(round(mean_chisq, 3))) if mean_chisq < 1.02: log.log("WARNING: mean chi^2 may be too small.") log.log('Lambda GC = ' + str(round(CHISQ.median() / 0.4549, 3))) log.log('Max chi^2 = ' + str(round(CHISQ.max(), 3))) log.log('{N} Genome-wide significant SNPs (some may have been removed by filtering).'.format(N=(CHISQ > 29).sum())) return dat except Exception: log.log('\nERROR converting summary statistics:\n') ex_type, ex, tb = sys.exc_info() log.log(traceback.format_exc(ex)) raise finally: log.log('\nConversion finished at {T}'.format(T=time.ctime())) log.log('Total time elapsed: {T}'.format( T=sec_to_str(round(time.time() - START_TIME, 2))))