Example #1
def munge_sumstats(args, p=True):
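    """Clean and standardize GWAS summary statistics.

    Reads args.sumstats, maps the input column names onto the expected set,
    converts P-values to signed Z-scores and, when p is True, writes the
    result to <args.out>.sumstats.gz. Returns the cleaned DataFrame.
    """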
    if args.out is None:
        raise ValueError('The --out flag is required.')

    START_TIME = time.time()
    log = Logger(args.out + '.log')
    try:
        if args.sumstats is None:
            raise ValueError('The --sumstats flag is required.')
        if args.no_alleles and args.merge_alleles:
            raise ValueError(
                '--no-alleles and --merge-alleles are not compatible.')

        if p:
            defaults = vars(parser.parse_args(''))
            opts = vars(args)
            non_defaults = [x for x in opts.keys() if opts[x] != defaults[x]]
            header = MASTHEAD
            header += "Call: \n"
            header += './munge_sumstats.py \\\n'
            options = [
                '--' + x.replace('_', '-') + ' ' + str(opts[x]) + ' \\'
                for x in non_defaults
            ]
            header += '\n'.join(options).replace('True',
                                                 '').replace('False', '')
            header = header[0:-1] + '\n'
            log.log(header)

        file_cnames = read_header(args.sumstats)  # note keys not cleaned
        flag_cnames, signed_sumstat_null = parse_flag_cnames(log, args)
        if args.ignore:
            ignore_cnames = [clean_header(x) for x in args.ignore.split(',')]
        else:
            ignore_cnames = []

        # remove LOG_ODDS, BETA, Z, OR from the default list
        if args.signed_sumstats is not None or args.a1_inc:
            mod_default_cnames = {
                x: default_cnames[x]
                for x in default_cnames if default_cnames[x] not in null_values
            }
        else:
            mod_default_cnames = default_cnames

        cname_map = get_cname_map(flag_cnames, mod_default_cnames,
                                  ignore_cnames)
        if args.daner:
            frq_u = [x for x in file_cnames if x.startswith('FRQ_U_')][0]
            frq_a = [x for x in file_cnames if x.startswith('FRQ_A_')][0]
            N_cas = float(frq_a[6:])
            N_con = float(frq_u[6:])
            log.log(
                'Inferred that N_cas = {N1}, N_con = {N2} from the FRQ_[A/U] columns.'
                .format(N1=N_cas, N2=N_con))
            args.N_cas = N_cas
            args.N_con = N_con
            # drop any N, N_cas, N_con or FRQ columns
            for c in ['N', 'N_CAS', 'N_CON', 'FRQ']:
                for d in [x for x in cname_map if cname_map[x] == c]:
                    del cname_map[d]

            cname_map[frq_u] = 'FRQ'

        cname_translation = {
            x: cname_map[clean_header(x)]
            for x in file_cnames if clean_header(x) in cname_map
        }  # note keys not cleaned
        cname_description = {
            x: describe_cname[cname_translation[x]]
            for x in cname_translation
        }
        if args.signed_sumstats is None and not args.a1_inc:
            sign_cnames = [
                x for x in cname_translation
                if cname_translation[x] in null_values
            ]
            if len(sign_cnames) > 1:
                raise ValueError(
                    'Too many signed sumstat columns. Specify which to ignore with the --ignore flag.'
                )
            if len(sign_cnames) == 0:
                raise ValueError(
                    'Could not find a signed summary statistic column.')

            sign_cname = sign_cnames[0]
            signed_sumstat_null = null_values[cname_translation[sign_cname]]
            cname_translation[sign_cname] = 'SIGNED_SUMSTAT'
        else:
            sign_cname = 'SIGNED_SUMSTATS'

        # check that we have all the columns we need
        if not args.a1_inc:
            req_cols = ['SNP', 'P', 'SIGNED_SUMSTAT']
        else:
            req_cols = ['SNP', 'P']

        for c in req_cols:
            if c not in cname_translation.values():
                raise ValueError('Could not find {C} column.'.format(C=c))

        if (not args.N) and (not (args.N_cas and args.N_con)) and ('N' not in cname_translation.values()) and\
                (any(x not in cname_translation.values() for x in ['N_CAS', 'N_CON'])):
            raise ValueError('Could not determine N.')
        if ('N' in cname_translation.values() or all(x in cname_translation.values() for x in ['N_CAS', 'N_CON']))\
                and 'NSTUDY' in cname_translation.values():
            nstudy = [
                x for x in cname_translation
                if cname_translation[x] == 'NSTUDY'
            ]
            for x in nstudy:
                del cname_translation[x]
        if not args.no_alleles and not all(x in cname_translation.values()
                                           for x in ['A1', 'A2']):
            raise ValueError('Could not find A1/A2 columns.')

        log.log('Interpreting column names as follows:')
        log.log('\n'.join(
            [x + ':\t' + cname_description[x]
             for x in cname_description]) + '\n')

        if args.merge_alleles:
            log.log('Reading list of SNPs for allele merge from {F}'.format(
                F=args.merge_alleles))
            (openfunc, compression) = get_compression(args.merge_alleles)
            merge_alleles = pd.read_csv(args.merge_alleles,
                                        compression=compression,
                                        header=0,
                                        delim_whitespace=True,
                                        na_values='.')
            if any(x not in merge_alleles.columns
                   for x in ["SNP", "A1", "A2"]):
                raise ValueError(
                    '--merge-alleles must have columns SNP, A1, A2.')

            log.log(
                'Read {N} SNPs for allele merge.'.format(N=len(merge_alleles)))
            merge_alleles['MA'] = (merge_alleles.A1 +
                                   merge_alleles.A2).apply(lambda y: y.upper())
            merge_alleles.drop(
                [x for x in merge_alleles.columns if x not in ['SNP', 'MA']],
                axis=1,
                inplace=True)
        else:
            merge_alleles = None

        (openfunc, compression) = get_compression(args.sumstats)

        # figure out which columns are going to involve sign information, so we can ensure
        # they're read as floats
        signed_sumstat_cols = [
            k for k, v in cname_translation.items() if v == 'SIGNED_SUMSTAT'
        ]
        dat_gen = pd.read_csv(
            args.sumstats,
            delim_whitespace=True,
            header=0,
            compression=compression,
            usecols=cname_translation.keys(),
            na_values=['.', 'NA'],
            iterator=True,
            chunksize=args.chunksize,
            dtype={c: np.float64
                   for c in signed_sumstat_cols})

        dat = parse_dat(dat_gen, cname_translation, merge_alleles, log, args)
        if len(dat) == 0:
            raise ValueError('After applying filters, no SNPs remain.')

        old = len(dat)
        dat = dat.drop_duplicates(subset='SNP').reset_index(drop=True)
        new = len(dat)
        log.log(
            'Removed {M} SNPs with duplicated rs numbers ({N} SNPs remain).'.
            format(M=old - new, N=new))
        # filtering on N cannot be done chunkwise
        dat = process_n(dat, args, log)
        dat.P = p_to_z(dat.P, dat.N)
        dat.rename(columns={'P': 'Z'}, inplace=True)
        if not args.a1_inc:
            log.log(
                check_median(dat.SIGNED_SUMSTAT, signed_sumstat_null, 0.1,
                             sign_cname))
            dat.Z *= (-1)**(dat.SIGNED_SUMSTAT < signed_sumstat_null)
            dat.drop('SIGNED_SUMSTAT', inplace=True, axis=1)
        # do this last so we don't have to worry about NA values in the rest of
        # the program
        if args.merge_alleles:
            dat = allele_merge(dat, merge_alleles, log)

        out_fname = args.out + '.sumstats'
        print_colnames = [
            c for c in dat.columns if c in ['SNP', 'N', 'Z', 'A1', 'A2']
        ]
        if args.keep_maf and 'FRQ' in dat.columns:
            print_colnames.append('FRQ')
        msg = 'Writing summary statistics for {M} SNPs ({N} with nonmissing beta) to {F}.'
        log.log(
            msg.format(M=len(dat),
                       F=out_fname + '.gz',
                       N=dat.N.notnull().sum()))
        if p:
            dat.to_csv(out_fname,
                       sep="\t",
                       index=False,
                       columns=print_colnames,
                       float_format='%.3f')
            os.system('gzip -f {F}'.format(F=out_fname))

        log.log('\nMetadata:')
        CHISQ = (dat.Z**2)
        mean_chisq = CHISQ.mean()
        log.log('Mean chi^2 = ' + str(round(mean_chisq, 3)))
        if mean_chisq < 1.02:
            log.log("WARNING: mean chi^2 may be too small.")

        log.log('Lambda GC = ' + str(round(CHISQ.median() / 0.4549, 3)))
        log.log('Max chi^2 = ' + str(round(CHISQ.max(), 3)))
        log.log(
            '{N} Genome-wide significant SNPs (some may have been removed by filtering).'
            .format(N=(CHISQ > 29).sum()))
        return dat

    except Exception:
        log.log('\nERROR converting summary statistics:\n')
        log.log(traceback.format_exc())
        raise
    finally:
        log.log('\nConversion finished at {T}'.format(T=time.ctime()))
        log.log('Total time elapsed: {T}'.format(
            T=sec_to_str(round(time.time() - START_TIME, 2))))
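For orientation, a hypothetical driver for this function might look like the sketch below. It assumes the module-level `parser` that the function itself consults via `parser.parse_args('')`, and that flag names map onto the attribute names read above (`--sumstats` → `args.sumstats`, `--out` → `args.out`, `--N` → `args.N`); the file names and sample size are placeholders.

# Hypothetical usage sketch; paths and sample size are placeholders.
if __name__ == '__main__':
    args = parser.parse_args([
        '--sumstats', 'my_gwas.txt',   # raw summary statistics to munge
        '--out', 'my_gwas_munged',     # output prefix: writes my_gwas_munged.sumstats.gz and .log
        '--N', '50000',                # total sample size, used when the file has no N column
    ])
    dat = munge_sumstats(args, p=True)  # p=True also writes the .sumstats file and gzips it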
Example #2
def munge_sumstats(args, p=True):
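    """Clean and standardize GWAS summary statistics.

    Reads args.sumstats, maps the input column names onto the expected set,
    converts P-values to signed Z-scores and, when p is True, writes the
    result to <args.out>.sumstats.gz. Returns the cleaned DataFrame.
    """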
    if args.out is None:
        raise ValueError('The --out flag is required.')

    START_TIME = time.time()
    log = Logger(args.out + '.log')
    try:
        if args.sumstats is None:
            raise ValueError('The --sumstats flag is required.')
        if args.no_alleles and args.merge_alleles:
            raise ValueError(
                '--no-alleles and --merge-alleles are not compatible.')

        if p:
            defaults = vars(parser.parse_args(''))
            opts = vars(args)
            non_defaults = [x for x in opts.keys() if opts[x] != defaults[x]]
            header = MASTHEAD
            header += "Call: \n"
            header += './munge_sumstats.py \\\n'
            options = ['--'+x.replace('_','-')+' '+str(opts[x])+' \\' for x in non_defaults]
            header += '\n'.join(options).replace('True','').replace('False','')
            header = header[0:-1]+'\n'
            log.log(header)

        file_cnames = read_header(args.sumstats)  # note keys not cleaned
        flag_cnames, signed_sumstat_null = parse_flag_cnames(log, args)
        if args.ignore:
            ignore_cnames = [clean_header(x) for x in args.ignore.split(',')]
        else:
            ignore_cnames = []

        # remove LOG_ODDS, BETA, Z, OR from the default list
        if args.signed_sumstats is not None or args.a1_inc:
            mod_default_cnames = {x: default_cnames[
                x] for x in default_cnames if default_cnames[x] not in null_values}
        else:
            mod_default_cnames = default_cnames

        cname_map = get_cname_map(
            flag_cnames, mod_default_cnames, ignore_cnames)
        if args.daner:
            frq_u = [x for x in file_cnames if x.startswith('FRQ_U_')][0]
            frq_a = [x for x in file_cnames if x.startswith('FRQ_A_')][0]
            N_cas = float(frq_a[6:])
            N_con = float(frq_u[6:])
            log.log(
                'Inferred that N_cas = {N1}, N_con = {N2} from the FRQ_[A/U] columns.'.format(N1=N_cas, N2=N_con))
            args.N_cas = N_cas
            args.N_con = N_con
            # drop any N, N_cas, N_con or FRQ columns
            for c in ['N', 'N_CAS', 'N_CON', 'FRQ']:
                for d in [x for x in cname_map if cname_map[x] == c]:
                    del cname_map[d]

            cname_map[frq_u] = 'FRQ'

        cname_translation = {x: cname_map[clean_header(x)] for x in file_cnames if
                             clean_header(x) in cname_map}  # note keys not cleaned
        cname_description = {
            x: describe_cname[cname_translation[x]] for x in cname_translation}
        if args.signed_sumstats is None and not args.a1_inc:
            sign_cnames = [
                x for x in cname_translation if cname_translation[x] in null_values]
            if len(sign_cnames) > 1:
                raise ValueError(
                    'Too many signed sumstat columns. Specify which to ignore with the --ignore flag.')
            if len(sign_cnames) == 0:
                raise ValueError(
                    'Could not find a signed summary statistic column.')

            sign_cname = sign_cnames[0]
            signed_sumstat_null = null_values[cname_translation[sign_cname]]
            cname_translation[sign_cname] = 'SIGNED_SUMSTAT'
        else:
            sign_cname = 'SIGNED_SUMSTATS'

        # check that we have all the columns we need
        if not args.a1_inc:
            req_cols = ['SNP', 'P', 'SIGNED_SUMSTAT']
        else:
            req_cols = ['SNP', 'P']

        for c in req_cols:
            if c not in cname_translation.values():
                raise ValueError('Could not find {C} column.'.format(C=c))

        if (not args.N) and (not (args.N_cas and args.N_con)) and ('N' not in cname_translation.values()) and\
                (any(x not in cname_translation.values() for x in ['N_CAS', 'N_CON'])):
            raise ValueError('Could not determine N.')
        if ('N' in cname_translation.values() or all(x in cname_translation.values() for x in ['N_CAS', 'N_CON']))\
                and 'NSTUDY' in cname_translation.values():
            nstudy = [
                x for x in cname_translation if cname_translation[x] == 'NSTUDY']
            for x in nstudy:
                del cname_translation[x]
        if not args.no_alleles and not all(x in cname_translation.values() for x in ['A1', 'A2']):
            raise ValueError('Could not find A1/A2 columns.')

        log.log('Interpreting column names as follows:')
        log.log('\n'.join([x + ':\t' + cname_description[x]
                           for x in cname_description]) + '\n')

        if args.merge_alleles:
            log.log(
                'Reading list of SNPs for allele merge from {F}'.format(F=args.merge_alleles))
            (openfunc, compression) = get_compression(args.merge_alleles)
            merge_alleles = pd.read_csv(args.merge_alleles, compression=compression, header=0,
                                        delim_whitespace=True, na_values='.')
            if any(x not in merge_alleles.columns for x in ["SNP", "A1", "A2"]):
                raise ValueError(
                    '--merge-alleles must have columns SNP, A1, A2.')

            log.log(
                'Read {N} SNPs for allele merge.'.format(N=len(merge_alleles)))
            merge_alleles['MA'] = (
                merge_alleles.A1 + merge_alleles.A2).apply(lambda y: y.upper())
            merge_alleles.drop(
                [x for x in merge_alleles.columns if x not in ['SNP', 'MA']], axis=1, inplace=True)
        else:
            merge_alleles = None

        (openfunc, compression) = get_compression(args.sumstats)
        dat_gen = pd.read_csv(args.sumstats, delim_whitespace=True, header=0, compression=compression,
                              usecols=cname_translation.keys(), na_values=['.', 'NA'], iterator=True, chunksize=args.chunksize)

        dat = parse_dat(dat_gen, cname_translation, merge_alleles, log, args)
        if len(dat) == 0:
            raise ValueError('After applying filters, no SNPs remain.')

        old = len(dat)
        dat = dat.drop_duplicates(subset='SNP').reset_index(drop=True)
        new = len(dat)
        log.log('Removed {M} SNPs with duplicated rs numbers ({N} SNPs remain).'.format(
            M=old - new, N=new))
        # filtering on N cannot be done chunkwise
        dat = process_n(dat, args, log)
        dat.P = p_to_z(dat.P, dat.N)
        dat.rename(columns={'P': 'Z'}, inplace=True)
        if not args.a1_inc:
            log.log(
                check_median(dat.SIGNED_SUMSTAT, signed_sumstat_null, 0.1, sign_cname))
            dat.Z *= (-1) ** (dat.SIGNED_SUMSTAT < signed_sumstat_null)
            dat.drop('SIGNED_SUMSTAT', inplace=True, axis=1)
        # do this last so we don't have to worry about NA values in the rest of
        # the program
        if args.merge_alleles:
            dat = allele_merge(dat, merge_alleles, log)

        out_fname = args.out + '.sumstats'
        print_colnames = [
            c for c in dat.columns if c in ['SNP', 'N', 'Z', 'A1', 'A2']]
        if args.keep_maf and 'FRQ' in dat.columns:
            print_colnames.append('FRQ')
        msg = 'Writing summary statistics for {M} SNPs ({N} with nonmissing beta) to {F}.'
        log.log(
            msg.format(M=len(dat), F=out_fname + '.gz', N=dat.N.notnull().sum()))
        if p:
            dat.to_csv(out_fname, sep="\t", index=False,
                       columns=print_colnames, float_format='%.3f')
            os.system('gzip -f {F}'.format(F=out_fname))

        log.log('\nMetadata:')
        CHISQ = (dat.Z ** 2)
        mean_chisq = CHISQ.mean()
        log.log('Mean chi^2 = ' + str(round(mean_chisq, 3)))
        if mean_chisq < 1.02:
            log.log("WARNING: mean chi^2 may be too small.")

        log.log('Lambda GC = ' + str(round(CHISQ.median() / 0.4549, 3)))
        log.log('Max chi^2 = ' + str(round(CHISQ.max(), 3)))
        log.log('{N} Genome-wide significant SNPs (some may have been removed by filtering).'.format(
            N=(CHISQ > 29).sum()))
        return dat

    except Exception:
        log.log('\nERROR converting summary statistics:\n')
        log.log(traceback.format_exc())
        raise
    finally:
        log.log('\nConversion finished at {T}'.format(T=time.ctime()))
        log.log('Total time elapsed: {T}'.format(
            T=sec_to_str(round(time.time() - START_TIME, 2))))
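Both examples lean on helpers defined elsewhere in the module (`read_header`, `parse_flag_cnames`, `parse_dat`, `process_n`, `p_to_z`, `check_median`, `allele_merge`). As a rough, self-contained illustration of the P-to-Z step and the sign flip applied above, the following sketch (an approximation, not the module's own `p_to_z`) converts two-sided p-values to |Z| via the chi-square quantile with one degree of freedom and negates Z wherever the signed statistic falls below its null value.

import numpy as np
import pandas as pd
from scipy.stats import chi2

def p_to_z_sketch(p):
    # Two-sided p-value -> |Z|: the inverse survival function of chi2 with 1 df
    # gives Z**2, so its square root is the Z-score magnitude.
    return np.sqrt(chi2.isf(p, 1))

# Toy data: the signed column is an odds ratio, so its null value is 1.0.
dat = pd.DataFrame({'P': [1e-8, 0.04], 'SIGNED_SUMSTAT': [0.8, 1.3]})
signed_sumstat_null = 1.0
dat['Z'] = p_to_z_sketch(dat.P)
dat.Z *= (-1) ** (dat.SIGNED_SUMSTAT < signed_sumstat_null)  # negative Z for OR < 1
print(dat[['P', 'Z']])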