Example #1
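The function below is an excerpt from a larger module, so its imports and project helpers (e.g. util.count_lines, util.get_chrom_num, is_gz) are not shown. A minimal sketch of the module-level imports it appears to assume follows; the exact import paths in the original codebase may differ, and Examples #2 and #3 assume the same set.

import sys
import gzip

import scipy as sp
from scipy import stats
from numpy import isfinite  # assumed source of isfinite; the original may import it from scipy

# 'util' is a project-local helper module (count_lines, load_hapmap_SNPs,
# get_chrom_num, ok_chromosomes, nts_dtype, sids_dtype, ...); its import path
# here is an assumption.
import util


def is_gz(name):
    # Assumed helper: detect gzip-compressed input by file extension.
    return name.lower().endswith(('.gz', '.gzip'))
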
def parse_sum_stats_custom(filename=None,
                           bimfile=None,
                           only_hm3=False,
                           hdf5_file=None,
                           n=None,
                           ch=None,
                           pos=None,
                           A1=None,
                           A2=None,
                           reffreq=None,
                           case_freq=None,
                           control_freq=None,
                           case_n=None,
                           control_n=None,
                           info=None,
                           rs=None,
                           pval=None,
                           eff=None,
                           ncol=None,
                           input_is_beta=False,
                           match_genomic_pos=False,
                           debug=False,
                           summary_dict=None):
    # Check required fields are here
    assert A2 is not None, 'Require header for non-effective allele'
    assert A1 is not None, 'Require header for effective allele'
    assert rs is not None, 'Require header for RS ID'
    assert eff is not None, 'Require header for Statistics'
    assert pval is not None, 'Require header for pval'
    assert ncol is not None or n is not None or (
        control_n is not None
        and case_n is not None), 'Require either N or NCOL information'

    if ch is None:
        assert bimfile is not None, 'Require bimfile when chromosome header not provided'
        print("Chromosome Header not provided, will use info from bim file")
    if pos is None:
        assert bimfile is not None, 'Require bimfile when position header not provided'
        print("Position Header not provided, will use info from bim file")

    num_lines = util.count_lines(filename)
    snps_pos_map = {}
    if only_hm3:
        if debug:
            print('Loading HapMap3 SNPs')
        hm3_sids = util.load_hapmap_SNPs()

    if bimfile is not None:
        valid_sids = set()
        if debug:
            print('Parsing bim file: %s' % bimfile)

        with open(bimfile) as f:
            for line in f:
                l = line.split()
                chrom = util.get_chrom_num(l[0])
                if chrom not in util.ok_chromosomes:
                    continue
                sid = l[1]
                if only_hm3:
                    if sid in hm3_sids:
                        valid_sids.add(sid)
                        snps_pos_map[sid] = {'pos': int(l[3]), 'chrom': chrom}
                else:
                    valid_sids.add(sid)
                    snps_pos_map[sid] = {'pos': int(l[3]), 'chrom': chrom}

        if len(valid_sids) == 0:
            raise Exception('Unable to parse BIM file')
    else:
        raise Exception(
            'BIM file missing. Please check genotype paths provided.')

    invalid_chr = 0
    invalid_pos = 0
    invalid_p = 0
    invalid_beta = 0
    chrom_dict = {}
    opener = open
    if is_gz(filename):
        opener = gzip.open
    print('Parsing summary statistics file: %s' % filename)
    with opener(filename) as f:
        header = f.readline()
        if is_gz(filename):
            header = header.decode('utf-8')
        if debug:
            print('File header:')
            print(header)
        header_dict = {}
        columns = (header.strip()).split()
        index = 0
        for col in columns:
            header_dict[col] = index
            index += 1
        assert ch is None or ch in header_dict, 'Chromosome header cannot be found in summary statistic file'
        assert A2 in header_dict, 'Non-effective allele column cannot be found in summary statistic file'
        assert A1 in header_dict, 'Effective allele column cannot be found in summary statistic file'
        assert eff in header_dict, 'Effect size column not found in summary statistic file'
        assert rs in header_dict, 'SNP ID column not found in summary statistic file'
        assert pos is None or pos in header_dict, 'Position column not found in summary statistic file'
        assert pval in header_dict, 'P Value column not found in summary statistic file'
        assert n is not None or ncol in header_dict or (
            control_n in header_dict and case_n in header_dict
        ), 'Sample size column not found in summary statistic file and N not provided'
        # header_dict now contains the header column name for each corresponding input
        bad_chromosomes = set()
        line_i = 1
        for line in f:
            line_i += 1
            if line_i % 1000 == 0 and num_lines > 0:
                sys.stdout.write('\b\b\b\b\b\b\b%0.2f%%' %
                                 (100.0 * (float(line_i) / (num_lines))))
                sys.stdout.flush()
            if is_gz(filename):
                line = line.decode('utf-8')
            l = (line.strip()).split()
            # get the SNP ID first
            sid = l[header_dict[rs]]
            # check the SNP ID
            if sid in valid_sids:
                # Get the chromosome information
                chrom = 0
                if ch is not None and ch in header_dict:
                    chrom = util.get_chrom_num(l[header_dict[ch]])
                    # Check if the chromosome of the SNP is correct
                    if chrom != snps_pos_map[sid]['chrom']:
                        invalid_chr += 1
                        continue
                else:
                    chrom = snps_pos_map[sid]['chrom']

                pos_read = 0
                if pos is not None and pos in header_dict:
                    pos_read = int(l[header_dict[pos]])
                    if pos_read != snps_pos_map[sid]['pos']:
                        invalid_pos += 1
                        if match_genomic_pos:
                            continue
                else:
                    pos_read = snps_pos_map[sid]['pos']

                pval_read = float(l[header_dict[pval]])
                if not isfinite(stats.norm.ppf(pval_read)):
                    invalid_p += 1
                    continue

                if not isfinite(float(l[header_dict[eff]])):
                    invalid_beta += 1
                    continue

                if chrom not in chrom_dict:
                    chrom_dict[chrom] = {
                        'ps': [],
                        'log_odds': [],
                        'infos': [],
                        'freqs': [],
                        'betas': [],
                        'nts': [],
                        'sids': [],
                        'positions': []
                    }
                chrom_dict[chrom]['sids'].append(sid)
                chrom_dict[chrom]['positions'].append(pos_read)
                # Check the frequency
                if reffreq is not None and reffreq in header_dict:
                    if l[header_dict[reffreq]] == '.' or l[
                            header_dict[reffreq]] == 'NA':
                        chrom_dict[chrom]['freqs'].append(-1)
                    else:
                        chrom_dict[chrom]['freqs'].append(
                            float(l[header_dict[reffreq]]))
                elif (case_freq is not None and control_freq is not None
                      and case_freq in header_dict
                      and control_freq in header_dict):
                    if (case_n is not None and control_n is not None
                            and case_n in header_dict
                            and control_n in header_dict):
                        if (l[header_dict[control_n]] == '.'
                                or l[header_dict[control_n]] == 'NA'
                                or l[header_dict[case_n]] == '.'
                                or l[header_dict[case_n]] == 'NA'
                                or l[header_dict[control_freq]] == '.'
                                or l[header_dict[control_freq]] == 'NA'
                                or l[header_dict[case_freq]] == '.'
                                or l[header_dict[case_freq]] == 'NA'):
                            chrom_dict[chrom]['freqs'].append(-1)
                        else:
                            case_N = float(l[header_dict[case_n]])
                            control_N = float(l[header_dict[control_n]])
                            tot_N = case_N + control_N
                            a_scalar = case_N / float(tot_N)
                            u_scalar = control_N / float(tot_N)
                            freq = float(
                                l[header_dict[case_freq]]) * a_scalar + float(
                                    l[header_dict[control_freq]]) * u_scalar
                            chrom_dict[chrom]['freqs'].append(freq)
                    else:
                        if (l[header_dict[case_freq]] == '.'
                                or l[header_dict[case_freq]] == 'NA'
                                or l[header_dict[control_freq]] == '.'
                                or l[header_dict[control_freq]] == 'NA'):
                            chrom_dict[chrom]['freqs'].append(-1)
                        else:
                            freq = (float(l[header_dict[case_freq]]) +
                                    float(l[header_dict[control_freq]])) / 2.0
                            chrom_dict[chrom]['freqs'].append(freq)
                else:
                    chrom_dict[chrom]['freqs'].append(-1)
                # Get the INFO score
                info_sc = -1
                if info is not None and info in header_dict:
                    info_sc = float(l[header_dict[info]])
                chrom_dict[chrom]['infos'].append(info_sc)
                chrom_dict[chrom]['ps'].append(pval_read)
                nt = [l[header_dict[A1]].upper(), l[header_dict[A2]].upper()]
                chrom_dict[chrom]['nts'].append(nt)
                raw_beta = float(l[header_dict[eff]])

                if n is None:
                    if ncol not in header_dict:
                        case_N = float(l[header_dict[case_n]])
                        control_N = float(l[header_dict[control_n]])
                        N = case_N + control_N
                    else:
                        # Read the per-SNP sample size from the NCOL column (not the column index)
                        N = float(l[header_dict[ncol]])
                else:
                    N = n
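                # Convert the summary statistic to a standardized effect size:
                # the sign comes from the (log-)effect and the magnitude from the
                # two-sided p-value, scaled by 1/sqrt(N).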
                if not input_is_beta:
                    raw_beta = sp.log(raw_beta)
                    chrom_dict[chrom]['log_odds'].append(raw_beta)
                    beta = sp.sign(raw_beta) * stats.norm.ppf(pval_read / 2.0)
                    chrom_dict[chrom]['betas'].append(beta / sp.sqrt(N))
                else:
                    beta = sp.sign(raw_beta) * stats.norm.ppf(pval_read / 2.0)
                    chrom_dict[chrom]['log_odds'].append(beta / sp.sqrt(N))
                    chrom_dict[chrom]['betas'].append(beta / sp.sqrt(N))

        if len(bad_chromosomes) > 0:
            if debug:
                print('Ignored chromosomes: %s' %
                      (','.join(list(bad_chromosomes))))
                print(
                    'Please note that only data on chromosomes 1-23, and X are parsed.'
                )

    if num_lines > 0:
        sys.stdout.write('\b\b\b\b\b\b\b%0.2f%%\n' % (100.0))
        sys.stdout.flush()
    print('SS file loaded, now sorting and storing in HDF5 file.')
    assert 'sum_stats' not in hdf5_file, 'Something is wrong with HDF5 file?'
    ssg = hdf5_file.create_group('sum_stats')
    num_snps = 0
    num_non_finite = 0
    for chrom in chrom_dict:
        if debug:
            print('%d SNPs on chromosome %s' %
                  (len(chrom_dict[chrom]['positions']), chrom))
        assert len(chrom_dict[chrom]['positions']) == len(
            chrom_dict[chrom]['betas']) == len(chrom_dict[chrom]['ps']) == len(
                chrom_dict[chrom]
                ['nts']), 'Problems with parsing summary stats'
        sl = list(
            zip(chrom_dict[chrom]['positions'], chrom_dict[chrom]['sids'],
                chrom_dict[chrom]['nts'], chrom_dict[chrom]['betas'],
                chrom_dict[chrom]['log_odds'], chrom_dict[chrom]['infos'],
                chrom_dict[chrom]['freqs'], chrom_dict[chrom]['ps']))
        sl.sort()
        ps = []
        betas = []
        nts = []
        sids = []
        positions = []
        log_odds = []
        infos = []
        freqs = []
        prev_pos = -1
        for pos, sid, nt, beta, lo, info, frq, p in sl:
            if pos == prev_pos:
                if debug:
                    print('duplicated position %d' % pos)
                continue
            else:
                prev_pos = pos
            if not sp.isfinite(beta):
                num_non_finite += 1
                continue
            ps.append(p)
            betas.append(beta)
            nts.append(nt)
            sids.append(sid)
            positions.append(pos)
            log_odds.append(lo)
            infos.append(info)
            freqs.append(frq)
        nts = sp.array(nts, dtype=util.nts_dtype)
        sids = sp.array(sids, dtype=util.sids_dtype)
        if debug:
            if num_non_finite != 0:
                print('%d SNPs have non-finite statistics on chromosome %s' %
                      (num_non_finite, chrom))
            print('Still %d SNPs on chromosome %s' % (len(ps), chrom))
        g = ssg.create_group('chrom_%s' % chrom)
        g.create_dataset('ps', data=sp.array(ps))
        g.create_dataset('freqs', data=freqs)
        g.create_dataset('betas', data=betas)
        g.create_dataset('log_odds', data=log_odds)
        num_snps += len(log_odds)
        g.create_dataset('infos', data=infos)
        g.create_dataset('nts', data=nts)
        g.create_dataset('sids', data=sids)
        g.create_dataset('positions', data=positions)
        hdf5_file.flush()
    if debug:
        print('%d SNPs excluded due to invalid chromosome' % invalid_chr)
        if match_genomic_pos:
            print('%d SNPs excluded due to invalid genomic positions' %
                  invalid_pos)
        else:
            print(
                '%d SNPs with non-matching genomic positions (not excluded)' %
                invalid_pos)
        print('%d SNPs excluded due to invalid P-value' % invalid_p)
        print('%d SNPs excluded due to invalid effect sizes' % invalid_beta)
        print('%d SNPs parsed from summary statistics file' % num_snps)
    summary_dict[3.09] = {'name': 'dash', 'value': 'Summary statistics'}
    summary_dict[3.1] = {
        'name': 'Num SNPs parsed from sum stats file',
        'value': num_snps
    }
    if invalid_p > 0:
        summary_dict[3.2] = {
            'name': 'Num invalid P-values in sum stats',
            'value': invalid_p
        }
    if invalid_beta > 0:
        summary_dict[3.21] = {
            'name': 'Num invalid betas in sum stats',
            'value': invalid_beta
        }
    if invalid_chr > 0:
        summary_dict[3.4] = {
            'name': 'SNPs w non-matching chromosomes excluded',
            'value': invalid_chr
        }
    if invalid_pos > 0:
        if match_genomic_pos:
            summary_dict[3.3] = {
                'name': 'SNPs w non-matching positions excluded',
                'value': invalid_pos
            }
        else:
            summary_dict[3.3] = {
                'name': 'SNPs w non-matching positions (not excluded)',
                'value': invalid_pos
            }
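
A minimal, hedged usage sketch for the function above. The HDF5 handle is assumed to be an h5py.File (the function only uses create_group, create_dataset and flush), the column names passed for ch, pos, rs, A1, A2, pval, eff and ncol are hypothetical placeholders for whatever headers the user's summary-statistics file actually has, and the file paths are likewise illustrative.

import h5py

summary_dict = {}
with h5py.File('coord_data.hdf5', 'w') as hf:   # hypothetical output path
    parse_sum_stats_custom(
        filename='sum_stats.txt.gz',            # hypothetical summary-stats file
        bimfile='genotypes.bim',                # hypothetical PLINK .bim file
        hdf5_file=hf,
        ch='CHR', pos='BP', rs='SNP',           # hypothetical column headers
        A1='A1', A2='A2',
        pval='P', eff='OR', ncol='N',
        input_is_beta=False,                    # effect column holds odds ratios
        debug=True,
        summary_dict=summary_dict)

# Inspect the parsing summary collected by the function
for key in sorted(summary_dict):
    print(summary_dict[key]['name'], summary_dict[key]['value'])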
Example #2
def parse_sum_stats_custom(filename=None,
                           bimfile=None,
                           only_hm3=False,
                           hdf5_file=None,
                           n=None,
                           ch=None,
                           pos=None,
                           A1=None,
                           A2=None,
                           reffreq=None,
                           case_freq=None,
                           control_freq=None,
                           case_n=None,
                           control_n=None,
                           info=None,
                           rs=None,
                           pval=None,
                           eff=None,
                           ncol=None,
                           se=None,
                           eff_type='OR',
                           match_genomic_pos=False,
                           debug=True,
                           z_from_se=False,
                           summary_dict=None):
    # Check required fields are here
    assert A2 is not None, 'Require header for non-effective allele'
    assert A1 is not None, 'Require header for effective allele'
    assert rs is not None, 'Require header for RS ID'
    assert eff is not None, 'Require header for Statistics'
    assert pval is not None, 'Require header for pval'
    assert ncol is not None or n is not None or (
        control_n is not None
        and case_n is not None), 'Require either N or NCOL information'

    if ch is None:
        assert bimfile is not None, 'Require bimfile when chromosome header not provided'
        print("Chromosome Header not provided, will use info from bim file")
    if pos is None:
        assert bimfile is not None, 'Require bimfile when position header not provided'
        print("Position Header not provided, will use info from bim file")

    num_lines = util.count_lines(filename)
    snps_pos_map = {}
    if only_hm3:
        if debug:
            print('Loading HapMap3 SNPs')
        hm3_sids = util.load_hapmap_SNPs()

    if bimfile is not None:
        valid_sids = set()
        if debug:
            print('Parsing bim file: %s' % bimfile)

        with open(bimfile) as f:
            for line in f:
                l = line.split()
                chrom = util.get_chrom_num(l[0])
                if chrom not in util.ok_chromosomes:
                    continue
                sid = l[1]
                if only_hm3:
                    if sid in hm3_sids:
                        valid_sids.add(sid)
                        snps_pos_map[sid] = {'pos': int(l[3]), 'chrom': chrom}
                else:
                    valid_sids.add(sid)
                    snps_pos_map[sid] = {'pos': int(l[3]), 'chrom': chrom}

        if len(valid_sids) == 0:
            raise Exception('Unable to parse BIM file')
    else:
        raise Exception(
            'BIM file missing. Please check genotype paths provided.')

    invalid_chr = 0
    invalid_pos = 0
    invalid_p = 0
    invalid_beta = 0
    se_inferred_zscores = 0
    chrom_dict = {}
    opener = open
    if util.is_gz(filename):
        opener = gzip.open
    print('Parsing summary statistics file: %s' % filename)
    with opener(filename) as f:
        header = f.readline()
        if util.is_gz(filename):
            header = header.decode('utf-8')
        if debug:
            print('File header:')
            print(header)
        header_dict = {}
        columns = (header.strip()).split()
        index = 0
        for col in columns:
            header_dict[col] = index
            index += 1

        assert ch is None or ch in header_dict, 'Chromosome header cannot be found in summary statistic file'
        assert A2 in header_dict, 'Non-effective allele column cannot be found in summary statistic file'
        assert A1 in header_dict, 'Effective allele column cannot be found in summary statistic file'
        assert eff in header_dict, 'Effect size column not found in summary statistic file'
        assert rs in header_dict, 'SNP ID column not found in summary statistic file'
        assert pos is None or pos in header_dict, 'Position column not found in summary statistic file'
        assert pval in header_dict, 'P Value column not found in summary statistic file'
        assert n is not None or ncol in header_dict or (
            control_n in header_dict and case_n in header_dict
        ), 'Sample size column not found in summary statistic file and N not provided'

        if z_from_se:
            assert se is not None, 'SE column must be specified to infer z-scores from SEs'
            assert se in header_dict, 'SE column not found in summary stats file, this is required to infer z-scores from SEs'

        # header_dict now contains the header column name for each corresponding input
        bad_chromosomes = set()
        line_i = 1
        for line in f:
            line_i += 1
            if line_i % 1000 == 0 and num_lines > 0:
                sys.stdout.write('\r%0.2f%%' % (100.0 * (float(line_i) /
                                                         (num_lines))))
                sys.stdout.flush()
            if util.is_gz(filename):
                line = line.decode('utf-8')
            l = (line.strip()).split()
            # get the SNP ID first
            sid = l[header_dict[rs]]
            # check the SNP ID
            if sid in valid_sids:
                # Get the chromosome information
                chrom = 0
                if ch is not None and ch in header_dict:
                    chrom = util.get_chrom_num(l[header_dict[ch]])
                    # Check if the chromosome of the SNP is correct
                    if chrom != snps_pos_map[sid]['chrom']:
                        invalid_chr += 1
                        continue
                else:
                    chrom = snps_pos_map[sid]['chrom']

                #Parse position
                pos_read = 0
                if pos is not None and pos in header_dict:
                    pos_read = int(l[header_dict[pos]])
                    if pos_read != snps_pos_map[sid]['pos']:
                        invalid_pos += 1
                        if match_genomic_pos:
                            continue
                else:
                    pos_read = snps_pos_map[sid]['pos']

                #Get the sample size
                N = parse_sample_size(l, n, ncol, case_n, control_n,
                                      header_dict)

                #Parse raw beta
                beta_read = float(l[header_dict[eff]])
                if not isfinite(beta_read):
                    invalid_beta += 1
                    continue

                raw_beta = get_raw_beta(beta_read, eff_type)

                #Parse p-value and effect size
                pval_read = float(l[header_dict[pval]])
                if pval_read == 0 or not isfinite(stats.norm.ppf(pval_read)):
                    invalid_p += 1

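                # get_beta (defined elsewhere in the module) is expected to return
                # the standardized effect, either from the p-value and the sign of
                # the raw effect or, when z_from_se is set, from a z-score inferred
                # as beta/SE; it returns None when the effect cannot be computed.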
                beta = get_beta(pval_read, raw_beta, beta_read, l, header_dict,
                                se, z_from_se, N, eff_type,
                                se_inferred_zscores)

                if beta is None:
                    continue

                #All necessary information was found, so we should store things

                if chrom not in chrom_dict:
                    chrom_dict[chrom] = {
                        'ps': [],
                        'infos': [],
                        'freqs': [],
                        'nts': [],
                        'sids': [],
                        'positions': [],
                        'ns': [],
                        'log_odds': [],
                        'betas': []
                    }
                chrom_dict[chrom]['sids'].append(sid)
                chrom_dict[chrom]['positions'].append(pos_read)
                chrom_dict[chrom]['ps'].append(pval_read)
                chrom_dict[chrom]['ns'].append(N)
                chrom_dict[chrom]['log_odds'].append(raw_beta)
                chrom_dict[chrom]['betas'].append(beta)

                # Get the INFO score
                info_sc = -1
                if info is not None and info in header_dict:
                    info_sc = float(l[header_dict[info]])
                chrom_dict[chrom]['infos'].append(info_sc)

                #Parse nucleotides
                nt = parse_nucleotides(l, header_dict, A1, A2)
                chrom_dict[chrom]['nts'].append(nt)

                # Parse the frequency
                freq = parse_freq(l, header_dict, reffreq, control_freq,
                                  case_freq, control_n, case_n)
                chrom_dict[chrom]['freqs'].append(freq)

        if len(bad_chromosomes) > 0:
            if debug:
                print('Ignored chromosomes: %s' %
                      (','.join(list(bad_chromosomes))))
                print(
                    'Please note that only data on chromosomes 1-23, and X are parsed.'
                )

    if num_lines > 0:
        sys.stdout.write('\r%0.2f%%\n' % (100.0))
        sys.stdout.flush()
    print('SS file loaded, now sorting and storing in HDF5 file.')
    assert 'sum_stats' not in hdf5_file, 'Something is wrong with HDF5 file?'
    ssg = hdf5_file.create_group('sum_stats')
    num_snps = 0
    num_non_finite = 0
    for chrom in chrom_dict:
        if debug:
            print('%d SNPs on chromosome %s' %
                  (len(chrom_dict[chrom]['positions']), chrom))
        assert len(chrom_dict[chrom]['positions']) == len(
            chrom_dict[chrom]['betas']) == len(chrom_dict[chrom]['ps']) == len(
                chrom_dict[chrom]
                ['nts']), 'Problems with parsing summary stats'
        sl = list(
            zip(chrom_dict[chrom]['positions'], chrom_dict[chrom]['sids'],
                chrom_dict[chrom]['nts'], chrom_dict[chrom]['betas'],
                chrom_dict[chrom]['log_odds'], chrom_dict[chrom]['infos'],
                chrom_dict[chrom]['freqs'], chrom_dict[chrom]['ps'],
                chrom_dict[chrom]['ns']))
        sl.sort()
        ps = []
        betas = []
        nts = []
        sids = []
        positions = []
        log_odds = []
        infos = []
        freqs = []
        ns = []
        prev_pos = -1
        for pos, sid, nt, beta, lo, info, frq, p, num_ind in sl:
            if pos == prev_pos:
                if debug:
                    print('duplicated position %d' % pos)
                continue
            else:
                prev_pos = pos
            if not sp.isfinite(beta):
                num_non_finite += 1
                continue
            ps.append(p)
            betas.append(beta)
            nts.append(nt)
            sids.append(sid)
            positions.append(pos)
            log_odds.append(lo)
            infos.append(info)
            freqs.append(frq)
            ns.append(num_ind)
        nts = sp.array(nts, dtype=util.nts_dtype)
        sids = sp.array(sids, dtype=util.sids_dtype)
        if debug:
            if num_non_finite != 0:
                print('%d SNPs have non-finite statistics on chromosome %s' %
                      (num_non_finite, chrom))
            print('Still %d SNPs on chromosome %s' % (len(ps), chrom))
        g = ssg.create_group('chrom_%s' % chrom)
        g.create_dataset('ps', data=sp.array(ps))
        g.create_dataset('freqs', data=freqs)
        g.create_dataset('betas', data=betas)
        g.create_dataset('log_odds', data=log_odds)
        num_snps += len(log_odds)
        g.create_dataset('infos', data=infos)
        g.create_dataset('nts', data=nts)
        g.create_dataset('sids', data=sids)
        g.create_dataset('positions', data=positions)
        g.create_dataset('ns', data=ns)
        hdf5_file.flush()

    summary_dict[3.09] = {'name': 'dash', 'value': 'Summary statistics'}
    summary_dict[3.1] = {
        'name': 'Num SNPs parsed from sum stats file',
        'value': num_snps
    }
    if invalid_p > 0 or debug:
        summary_dict[3.2] = {
            'name': 'Num invalid P-values in sum stats',
            'value': invalid_p
        }
    if invalid_beta > 0 or debug:
        summary_dict[3.21] = {
            'name': 'Num invalid betas in sum stats',
            'value': invalid_beta
        }
    if se_inferred_zscores > 0 or debug:
        summary_dict[3.22] = {
            'name': 'Num z-scores inferred from SEs and effects',
            'value': se_inferred_zscores
        }
    if invalid_chr > 0 or debug:
        summary_dict[3.4] = {
            'name': 'SNPs w non-matching chromosomes excluded',
            'value': invalid_chr
        }
    if invalid_pos > 0 or debug:
        if match_genomic_pos:
            summary_dict[3.3] = {
                'name': 'SNPs w non-matching positions excluded',
                'value': invalid_pos
            }
        else:
            summary_dict[3.3] = {
                'name': 'SNPs w non-matching positions (not excluded)',
                'value': invalid_pos
            }
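
Example #2 factors several steps out into helpers (parse_sample_size, get_raw_beta, get_beta, parse_nucleotides, parse_freq) that are not shown above. The sketches below reconstruct their likely behaviour from the inline logic in Example #1; the signatures follow the call sites in Example #2, but the real implementations (in particular how get_beta handles z_from_se and the se_inferred_zscores counter) may differ.

def parse_sample_size(l, n, ncol, case_n, control_n, header_dict):
    # Per-SNP sample size: a fixed N, an NCOL column, or case + control counts.
    if n is not None:
        return n
    if ncol in header_dict:
        return float(l[header_dict[ncol]])
    return float(l[header_dict[case_n]]) + float(l[header_dict[control_n]])


def get_raw_beta(beta_read, eff_type):
    # Odds ratios are log-transformed; linear effects are used as-is.
    return sp.log(beta_read) if eff_type == 'OR' else beta_read


def get_beta(pval_read, raw_beta, beta_read, l, header_dict, se, z_from_se, N,
             eff_type, se_inferred_zscores):
    # Standardized effect: z-score from the p-value and the effect sign, scaled
    # by 1/sqrt(N); optionally the z-score is inferred from beta/SE instead.
    if z_from_se and se in header_dict:
        se_read = float(l[header_dict[se]])
        if se_read > 0 and isfinite(se_read):
            return (raw_beta / se_read) / sp.sqrt(N)
    if pval_read == 0 or not isfinite(stats.norm.ppf(pval_read)):
        return None
    return sp.sign(raw_beta) * stats.norm.ppf(pval_read / 2.0) / sp.sqrt(N)


def parse_nucleotides(l, header_dict, A1, A2):
    # Effective and non-effective alleles, upper-cased.
    return [l[header_dict[A1]].upper(), l[header_dict[A2]].upper()]


def parse_freq(l, header_dict, reffreq, control_freq, case_freq, control_n, case_n):
    # Effect-allele frequency; -1 marks a missing value (as in Example #1).
    missing = ('.', 'NA')
    if reffreq is not None and reffreq in header_dict:
        val = l[header_dict[reffreq]]
        return -1 if val in missing else float(val)
    if (case_freq is not None and control_freq is not None
            and case_freq in header_dict and control_freq in header_dict):
        cf, uf = l[header_dict[case_freq]], l[header_dict[control_freq]]
        if cf in missing or uf in missing:
            return -1
        if (case_n is not None and control_n is not None
                and case_n in header_dict and control_n in header_dict):
            cn, un = l[header_dict[case_n]], l[header_dict[control_n]]
            if cn in missing or un in missing:
                return -1
            case_N, control_N = float(cn), float(un)
            tot_N = case_N + control_N
            return float(cf) * (case_N / tot_N) + float(uf) * (control_N / tot_N)
        return (float(cf) + float(uf)) / 2.0
    return -1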
Example #3
def parse_sum_stats_custom(filename=None, bimfile=None, only_hm3=False, hdf5_file=None, n=None, ch=None, pos=None,
                    A1=None, A2=None, reffreq=None, case_freq=None, control_freq=None, case_n=None,
                    control_n=None, info=None, rs=None, pval=None, eff=None, ncol=None,
                    input_is_beta=False, match_genomic_pos=False, debug=False, summary_dict = None):
    # Check required fields are here
    assert not A2 is None, 'Require header for non-effective allele'
    assert not A1 is None, 'Require header for effective allele'
    assert not rs is None, 'Require header for RS ID'
    assert not eff is None, 'Require header for Statistics'
    assert not pval is None, 'Require header for pval'
    assert not ncol is None or not n is None or (control_n is not None and case_n is not None), 'Require either N or NCOL information'

    if ch is None:
        assert not bimfile is None, 'Require bimfile when chromosome header not provided'
        print("Chromosome Header not provided, will use info from bim file")
    if pos is None:
        assert not bimfile is None, 'Require bimfile when position header not provided'
        print("Position Header not provided, will use info from bim file")

    num_lines = util.count_lines(filename)
    snps_pos_map = {}
    if only_hm3:
        if debug:
            print('Loading HapMap3 SNPs')
        hm3_sids = util.load_hapmap_SNPs()
  
    if bimfile is not None:
        valid_sids = set()
        if debug:
            print('Parsing bim file: %s' % bimfile)
        
        with open(bimfile) as f:
            for line in f:
                l = line.split()
                chrom = util.get_chrom_num(l[0])
                if chrom not in util.ok_chromosomes:
                    continue
                sid = l[1]
                if only_hm3:
                    if sid in hm3_sids:
                        valid_sids.add(sid)
                        snps_pos_map[sid] = {'pos':int(l[3]), 'chrom':chrom}
                else:
                    valid_sids.add(sid)
                    snps_pos_map[sid] = {'pos':int(l[3]), 'chrom':chrom}

        if len(valid_sids)==0:
            raise Exception('Unable to parse BIM file')
    else:
        raise Exception('BIM file missing. Please check genotype paths provided.')
        
    invalid_chr = 0
    invalid_pos = 0
    invalid_p = 0
    invalid_beta = 0
    chrom_dict = {}
    opener = open
    if is_gz(filename):
        opener = gzip.open
    print('Parsing summary statistics file: %s' % filename)
    with opener(filename) as f:
        header = f.readline()
        if is_gz(filename):
            header = header.decode('utf-8')
        if debug:
            print('File header:')
            print(header)
        header_dict = {}
        columns = (header.strip()).split()
        index = 0
        for col in columns:
            header_dict[col] = index
            index += 1
        assert ch is None or ch in header_dict, 'Chromosome header cannot be found in summary statistic file'
        assert A2 in header_dict, 'Non-effective allele column cannot be found in summary statistic file'
        assert A1 in header_dict, 'Effective allele column cannot be found in summary statistic file'
        assert eff in header_dict, 'Effect size column not found in summary statistic file'
        assert rs in header_dict, 'SNP ID column not found in summary statistic file'
        assert pos is None or pos in header_dict, 'Position column not found in summary statistic file'
        assert pval in header_dict, 'P Value column not found in summary statistic file'
        assert not n is None or ncol in header_dict or (control_n in header_dict and case_n in header_dict), 'Sample size column not found in summary statistic ' \
                                                     'file and N not provided'
        # header_dict now contains the header column name for each corresponding input
        bad_chromosomes = set()
        line_i = 1
        for line in f:
            line_i +=1
            if line_i%1000==0 and num_lines>0:
                sys.stdout.write('\b\b\b\b\b\b\b%0.2f%%' % (100.0 * (float(line_i) / (num_lines))))
                sys.stdout.flush()            
            if is_gz(filename):
                line = line.decode('utf-8')
            l = (line.strip()).split()
            # get the SNP ID first
            sid = l[header_dict[rs]]
            # check the SNP ID
            if sid in valid_sids:
                # Get the chromosome information
                chrom = 0
                if not ch is None and ch in header_dict:
                    chrom = util.get_chrom_num(l[header_dict[ch]])
                    # Check if the chromosome of the SNP is correct
                    if not chrom == snps_pos_map[sid]['chrom']:
                        invalid_chr += 1
                        continue
                else:
                    chrom = snps_pos_map[sid]['chrom']
                
    
                pos_read = 0
                if not pos is None and pos in header_dict:
                    pos_read = int(l[header_dict[pos]])
                    if not pos_read == snps_pos_map[sid]['pos']:
                        invalid_pos += 1
                        if match_genomic_pos:
                            continue
                else:
                    pos_read = snps_pos_map[sid]['pos']

                pval_read = float(l[header_dict[pval]])
                if not isfinite(stats.norm.ppf(pval_read)):
                    invalid_p += 1
                    continue

                if not isfinite(float(l[header_dict[eff]])):
                    invalid_beta += 1
                    continue

                if not chrom in chrom_dict:
                    chrom_dict[chrom] = {'ps':[], 'log_odds':[], 'infos':[], 'freqs':[],
                             'betas':[], 'nts': [], 'sids': [], 'positions': []}
                chrom_dict[chrom]['sids'].append(sid)
                chrom_dict[chrom]['positions'].append(pos_read)
                # Check the frequency
                if reffreq is not None and reffreq in header_dict:
                    if l[header_dict[reffreq]] == '.' or l[header_dict[reffreq]] == 'NA':
                        chrom_dict[chrom]['freqs'].append(-1)
                    else:
                        chrom_dict[chrom]['freqs'].append(float(l[header_dict[reffreq]]))
                elif (case_freq is not None and control_freq is not None 
                      and case_freq in header_dict and control_freq in header_dict):
                    if (case_n is not None and control_n is not None 
                          and case_n in header_dict and control_n in header_dict) :
                        if (l[header_dict[control_n]] == '.' or l[header_dict[control_n]] == 'NA' 
                            or l[header_dict[case_n]] == '.' or l[header_dict[case_n]] == 'NA' 
                            or l[header_dict[control_freq]] == '.' or l[header_dict[control_freq]] == 'NA' 
                            or l[header_dict[case_freq]] == '.' or l[header_dict[case_freq]] == 'NA'):
                            chrom_dict[chrom]['freqs'].append(-1)
                        else:
                            case_N = float(l[header_dict[case_n]])
                            control_N = float(l[header_dict[control_n]])
                            tot_N = case_N + control_N
                            a_scalar = case_N / float(tot_N)
                            u_scalar = control_N / float(tot_N)
                            freq = float(l[header_dict[case_freq]]) * a_scalar + float(l[header_dict[control_freq]]) * u_scalar
                            chrom_dict[chrom]['freqs'].append(freq)
                    else:
                        if (l[header_dict[case_freq]] == '.' or l[header_dict[case_freq]] == 'NA' 
                            or l[header_dict[control_freq]] == '.' or l[header_dict[control_freq]] == 'NA'):
                            chrom_dict[chrom]['freqs'].append(-1)
                        else:
                            freq = (float(l[header_dict[case_freq]]) + float(l[header_dict[control_freq]]))/2.0
                            chrom_dict[chrom]['freqs'].append(freq)
                else:  
                    chrom_dict[chrom]['freqs'].append(-1)
                # Get the INFO score
                info_sc = -1
                if info is not None and info in header_dict:
                    info_sc = float(l[header_dict[info]])
                chrom_dict[chrom]['infos'].append(info_sc)
                chrom_dict[chrom]['ps'].append(pval_read)
                nt = [l[header_dict[A1]].upper(), l[header_dict[A2]].upper()]
                chrom_dict[chrom]['nts'].append(nt)
                raw_beta = float(l[header_dict[eff]])
                
                if n is None:
                    if ncol not in header_dict:
                        case_N = float(l[header_dict[case_n]])
                        control_N = float(l[header_dict[control_n]])
                        N = case_N + control_N
                    else:
                        N = float(l[header_dict[ncol]])
                else:
                    N = n
                if not input_is_beta:
                    raw_beta = sp.log(raw_beta)
                    chrom_dict[chrom]['log_odds'].append(raw_beta)
                    beta = sp.sign(raw_beta) * stats.norm.ppf(pval_read / 2.0)
                    chrom_dict[chrom]['betas'].append(beta / sp.sqrt(N))
                else:
                    beta = sp.sign(raw_beta) * stats.norm.ppf(pval_read / 2.0)
                    chrom_dict[chrom]['log_odds'].append(beta / sp.sqrt(N))
                    chrom_dict[chrom]['betas'].append(beta / sp.sqrt(N))

        if len(bad_chromosomes) > 0:
            if debug:
                print('Ignored chromosomes: %s' % (','.join(list(bad_chromosomes))))
                print('Please note that only data on chromosomes 1-23, and X are parsed.')

    if num_lines>0:
        sys.stdout.write('\b\b\b\b\b\b\b%0.2f%%\n' % (100.0))
        sys.stdout.flush()            
    print('SS file loaded, now sorting and storing in HDF5 file.')
    assert not 'sum_stats' in hdf5_file, 'Something is wrong with HDF5 file?'
    ssg = hdf5_file.create_group('sum_stats')
    num_snps = 0
    num_non_finite = 0
    for chrom in chrom_dict:
        if debug:
            print ('%d SNPs on chromosome %s' % (len(chrom_dict[chrom]['positions']), chrom))
        assert len(chrom_dict[chrom]['positions'])==len(chrom_dict[chrom]['betas'])==len(chrom_dict[chrom]['ps'])==len(chrom_dict[chrom]['nts']), 'Problems with parsing summary stats'
        sl = list(zip(chrom_dict[chrom]['positions'], chrom_dict[chrom]['sids'], chrom_dict[chrom]['nts'],
                 chrom_dict[chrom]['betas'], chrom_dict[chrom]['log_odds'], chrom_dict[chrom]['infos'],
                 chrom_dict[chrom]['freqs'], chrom_dict[chrom]['ps']))
        sl.sort()
        ps = []
        betas = []
        nts = []
        sids = []
        positions = []
        log_odds = []
        infos = []
        freqs = []
        prev_pos = -1
        for pos, sid, nt, beta, lo, info, frq, p in sl:
            if pos == prev_pos:
                if debug:
                    print('duplicated position %d' % pos)
                continue
            else:
                prev_pos = pos
            if not sp.isfinite(beta):
                num_non_finite += 1
                continue
            ps.append(p)
            betas.append(beta)
            nts.append(nt)
            sids.append(sid)
            positions.append(pos)
            log_odds.append(lo)
            infos.append(info)
            freqs.append(frq)
        nts = sp.array(nts, dtype=util.nts_dtype)
        sids = sp.array(sids, dtype=util.sids_dtype)
        if debug:
            if not num_non_finite == 0:
                print('%d SNPs have non-finite statistics on chromosome %s' % (num_non_finite, chrom))
            print ('Still %d SNPs on chromosome %s' % (len(ps), chrom))
        g = ssg.create_group('chrom_%s' % chrom)
        g.create_dataset('ps', data=sp.array(ps))
        g.create_dataset('freqs', data=freqs)
        g.create_dataset('betas', data=betas)
        g.create_dataset('log_odds', data=log_odds)
        num_snps += len(log_odds)
        g.create_dataset('infos', data=infos)
        g.create_dataset('nts', data=nts)
        g.create_dataset('sids', data=sids)
        g.create_dataset('positions', data=positions)
        hdf5_file.flush()
    if debug:
        print('%d SNPs excluded due to invalid chromosome' % invalid_chr)
        if match_genomic_pos:
            print('%d SNPs excluded due to invalid genomic positions' % invalid_pos)
        else:
            print('%d SNPs with non-matching genomic positions (not excluded)' % invalid_pos)
        print('%d SNPs excluded due to invalid P-value' % invalid_p)
        print('%d SNPs excluded due to invalid effect sizes' % invalid_beta)
        print('%d SNPs parsed from summary statistics file' % num_snps)
    summary_dict[3.09]={'name':'dash', 'value':'Summary statistics'}
    summary_dict[3.1]={'name':'Num SNPs parsed from sum stats file','value':num_snps}
    if invalid_p>0:
        summary_dict[3.2]={'name':'Num invalid P-values in sum stats','value':invalid_p}
    if invalid_beta>0:
        summary_dict[3.21]={'name':'Num invalid betas in sum stats','value':invalid_beta}
    if invalid_chr>0:
        summary_dict[3.4]={'name':'SNPs w non-matching chromosomes excluded','value':invalid_chr}
    if invalid_pos>0:
        if match_genomic_pos:
            summary_dict[3.3]={'name':'SNPs w non-matching positions excluded','value':invalid_pos}
        else:
            summary_dict[3.3]={'name':'SNPs w non-matching positions (not excluded)','value':invalid_pos}