コード例 #1
0
def extract_vcf(gwas_file, gwas_id):
    """Dump the per-variant GWAS fields of a VCF into a gzipped text file.

    Runs a ``bcftools query | awk | grep | gzip`` shell pipeline that writes
    one space-separated line per variant: CHROM, POS, ID, ALT, REF, AF, ES,
    SE, 10^-LP and a sample size. Lines containing "inf" are dropped.

    The sample size comes from the per-variant ``SS`` FORMAT field when the
    first record of the file carries one. Otherwise a single constant is used
    for every line: ``TotalControls + TotalCases`` (or ``TotalControls``
    alone) from the ``SAMPLE`` header record, or "." when neither is present.

    Args:
        gwas_file: Path of the VCF file; opened with ``VariantFile`` and
            passed to ``bcftools``.
        gwas_id: Identifier used as the stem of the output file name.

    Returns:
        Path of the gzip-compressed output file, created under the directory
        named by the ``tmpDir`` environment variable (default ``/tmp/``) with
        a random UUID suffix.
    """
    import shlex  # local import: only needed to quote paths for the shell

    # Temporary output file:
    temp_dir = os.getenv('tmpDir', "/tmp/")
    tempout = os.path.join(temp_dir, gwas_id) + "." + str(uuid.uuid4())
    print("Processing vcf to", tempout)

    vcf_in = VariantFile(gwas_file)
    try:
        # Peek at the first record to see whether sample size is available as
        # a per-variant column. A file with no records simply has none.
        first_record = next(vcf_in.fetch(), None)
        has_ss = (first_record is not None
                  and 'SS' in first_record.format.keys())

        if has_ss:
            format_fields = "%AF %ES %SE %LP %SS"
            ss_column = "$10"
        else:
            # Fall back to the study-level counts in the SAMPLE header line.
            sample_meta = next(
                (rec for rec in vcf_in.header.records if rec.key == "SAMPLE"),
                None)
            meta_keys = sample_meta.keys() if sample_meta is not None else ()
            if 'TotalControls' in meta_keys and 'TotalCases' in meta_keys:
                sample_size = (float(sample_meta['TotalControls'])
                               + float(sample_meta['TotalCases']))
            elif 'TotalControls' in meta_keys:
                sample_size = float(sample_meta['TotalControls'])
            else:
                sample_size = '.'
            format_fields = "%AF %ES %SE %LP"
            # awk string literal, emitted unchanged on every output line.
            ss_column = '"' + str(sample_size) + '"'
    finally:
        # Only the header and first record are needed; release the handle
        # before handing the file to bcftools.
        vcf_in.close()

    # Paths are quoted so spaces or shell metacharacters in them cannot break
    # (or inject into) the pipeline.
    cmd = (
        "bcftools query -f'%CHROM %POS %ID %ALT %REF[ " + format_fields
        + "]\n' " + shlex.quote(gwas_file)
        + " | awk '{print $1, $2, $3, $4, $5, $6, $7, $8, 10^-$9, "
        + ss_column + "}'"
        + " | grep -v inf | gzip -c > " + shlex.quote(tempout)
    )
    # NOTE: the pipeline's exit status is not checked (unchanged behavior).
    subprocess.call(cmd, shell=True)
    print("Done")
    return tempout
コード例 #2
0
ファイル: parse.py プロジェクト: explodecomputer/ldsc
def read_vcf(fh, alleles, slh=None):
    """Read GWAS summary statistics from a VCF into a pandas DataFrame.

    For every record, the first sample's ``ES`` and ``SE`` FORMAT values are
    combined into ``Z = ES / SE``. The sample size ``N`` is the per-variant
    ``SS`` FORMAT value when the first record carries one; otherwise it is a
    constant taken from the ``SAMPLE`` header record
    (``TotalControls + TotalCases``, or ``TotalControls`` alone), or NaN when
    neither is present.

    Args:
        fh: File name passed to ``VariantFile``.
        alleles: If truthy, also return ``A1`` (first ALT allele) and ``A2``
            (REF allele) columns.
        slh: Optional path to a SNP list with one ID per line. When given,
            only rows whose ``SNP`` is in the list are returned. The file is
            read with gzip when ``get_compression(slh)`` returns ``"gzip"``.

    Returns:
        DataFrame with columns ``SNP``, ``Z``, ``N`` and, when ``alleles`` is
        truthy, ``A1`` and ``A2``.

    Raises:
        ValueError: If reading the SNP list raises ``AttributeError`` or
            ``ValueError``.
    """
    vcf_in = VariantFile(fh)
    try:
        sample = list(vcf_in.header.samples)[0]

        # Peek at the first record to learn which FORMAT fields exist, then
        # rewind so the main pass starts from the first record again.
        first_record = next(vcf_in.fetch(), None)
        has_ss = (first_record is not None
                  and 'SS' in first_record.format.keys())
        vcf_in.seek(0)

        snp_ids, z_scores, sample_sizes, a1, a2 = [], [], [], [], []
        for rec in vcf_in.fetch():
            call = rec.samples[sample]
            snp_ids.append(rec.id)
            z_scores.append(call['ES'][0] / call['SE'][0])
            if has_ss:
                sample_sizes.append(call['SS'][0])
            if alleles:
                a1.append(rec.alts[0])
                a2.append(rec.ref)

        if not has_ss:
            # No per-variant sample size: use the study-level counts from the
            # SAMPLE header line, if there is one.
            sample_meta = next(
                (rec for rec in vcf_in.header.records if rec.key == "SAMPLE"),
                None)
            meta_keys = sample_meta.keys() if sample_meta is not None else ()
            if 'TotalControls' in meta_keys and 'TotalCases' in meta_keys:
                total_n = (float(sample_meta['TotalControls'])
                           + float(sample_meta['TotalCases']))
            elif 'TotalControls' in meta_keys:
                total_n = float(sample_meta['TotalControls'])
            else:
                total_n = np.nan  # np.NaN alias no longer exists in NumPy 2
            sample_sizes = [total_n] * len(snp_ids)
    finally:
        # Release the handle even if a record is malformed.
        vcf_in.close()

    columns = {
        'SNP': pd.Series(snp_ids, dtype='str'),
        'Z': pd.Series(z_scores, dtype='float'),
        'N': pd.Series(sample_sizes, dtype='float'),
    }
    if alleles:
        columns['A1'] = pd.Series(a1, dtype='str')
        columns['A2'] = pd.Series(a2, dtype='str')
    p = pd.DataFrame(columns)

    if slh is not None:
        opener = gzip.open if get_compression(slh) == "gzip" else open
        try:
            # Text mode matters for gzip: binary mode yields bytes, which
            # never compare equal to the str values in the SNP column.
            with opener(slh, 'rt') as f:
                sl = [line.strip() for line in f]
        except (AttributeError, ValueError) as e:
            raise ValueError('Improperly formatted snplist file: ' +
                             str(e.args))
        p = p.loc[p['SNP'].isin(sl)]

    return p