def extract_vcf(gwas_file, gwas_id):
    """Dump per-variant fields of a GWAS VCF into a gzipped temporary text file.

    Runs ``bcftools query`` piped through ``awk``, ``grep`` and ``gzip``.
    Each output line is space separated and holds, in order: CHROM, POS, ID,
    ALT, REF, AF, ES, SE, ``10^-LP`` and a sample-size column. Lines
    containing the text ``inf`` are dropped by ``grep -v inf``.

    The sample-size column is the per-variant ``SS`` FORMAT field when the
    first record of the file carries one. Otherwise it is a constant taken
    from the first ``SAMPLE`` header record: ``TotalControls + TotalCases``
    when both are present, ``TotalControls`` alone when only that is present,
    and ``.`` when neither (or no ``SAMPLE`` header record) is available.

    Args:
        gwas_file: Path of the VCF file to read (a string).
        gwas_id: Identifier used as the stem of the temporary file name.

    Returns:
        Path of the gzip-compressed output file. It is created under the
        directory named by the ``tmpDir`` environment variable (default
        ``/tmp/``) as ``<gwas_id>.<uuid4>``.

    Note:
        The exit status of the shell pipeline is not checked, matching the
        previous behaviour; a failing ``bcftools`` call still returns a path.
    """
    # shlex.quote only exists on Python 3; pipes.quote is the Python 2 name.
    try:
        from shlex import quote
    except ImportError:
        from pipes import quote

    # Temporary output file:
    tempDir = os.getenv('tmpDir', "/tmp/")
    tempout = os.path.join(tempDir, gwas_id) + "." + str(uuid.uuid4())
    print("Processing vcf to", tempout)

    # Inspect the file with pysam only to decide where sample size comes
    # from, and always release the handle before shelling out to bcftools.
    vcf_in = VariantFile(gwas_file)
    try:
        # Check if Sample size is available as a column. An empty VCF has no
        # first record, so fall through to the header-based sample size.
        first_record = next(vcf_in.fetch(), None)
        has_ss = first_record is not None and 'SS' in first_record.format.keys()

        if has_ss:
            format_suffix = " %SS"
            awk_sample_size = "$10"
        else:
            # First ##SAMPLE header record, if the file has one at all.
            sample_meta = next(
                (x for x in vcf_in.header.records if x.key == "SAMPLE"), None)
            meta_keys = sample_meta.keys() if sample_meta is not None else []
            if 'TotalControls' in meta_keys and 'TotalCases' in meta_keys:
                SS = (float(sample_meta['TotalControls'])
                      + float(sample_meta['TotalCases']))
            elif 'TotalControls' in meta_keys:
                SS = float(sample_meta['TotalControls'])
            else:
                SS = '.'
            format_suffix = ""
            awk_sample_size = "\"" + str(SS) + "\""
    finally:
        vcf_in.close()

    # Paths are shell-quoted so that spaces or metacharacters in them cannot
    # split or alter the pipeline; ordinary paths are passed through as-is.
    cmd = (
        "bcftools query -f'%CHROM %POS %ID %ALT %REF[ %AF %ES %SE %LP"
        + format_suffix + "]\n' "
        + quote(gwas_file)
        + "| awk '{print $1, $2, $3, $4, $5, $6, $7, $8, 10^-$9, "
        + awk_sample_size
        + "}' | grep -v inf | gzip -c > "
        + quote(tempout)
    )
    subprocess.call(cmd, shell=True)
    print("Done")
    return tempout
def _vcf_header_sample_size(header):
    """Return the sample size from the first SAMPLE header record, or None.

    The value is ``TotalControls + TotalCases`` when both keys are present
    and ``TotalControls`` when only that key is present. None is returned
    when ``TotalControls`` is missing or the header has no SAMPLE record.
    """
    for record in header.records:
        if record.key != "SAMPLE":
            continue
        keys = record.keys()
        if 'TotalControls' not in keys:
            return None
        total = float(record['TotalControls'])
        if 'TotalCases' in keys:
            total += float(record['TotalCases'])
        return total
    return None


def read_vcf(fh, alleles, slh=None):
    """Read summary statistics from a GWAS VCF into a pandas DataFrame.

    For every record, values are taken from the first sample listed in the
    header. ``Z`` is computed as ``ES[0] / SE[0]``. ``N`` is the per-variant
    ``SS`` FORMAT value when the first record of the file has an ``SS``
    field; otherwise it is a constant derived from the first ``SAMPLE``
    header record (``TotalControls + TotalCases``, or ``TotalControls``
    alone), and NaN when that information is unavailable.

    Args:
        fh: VCF file path handed to ``VariantFile``.
        alleles: When truthy, also return ``A1`` (first ALT allele) and
            ``A2`` (REF allele) columns.
        slh: Optional path of a SNP list file with one identifier per line
            (surrounding whitespace is stripped). When given, only rows
            whose ``SNP`` is in the list are kept. The file is read through
            ``gzip`` when ``get_compression(slh)`` returns ``"gzip"``.

    Returns:
        DataFrame with columns ``SNP``, ``Z``, ``N`` and, if ``alleles`` is
        truthy, ``A1`` and ``A2``.

    Raises:
        ValueError: If reading the SNP list raises AttributeError or
            ValueError.
    """
    snp_ids, z_scores, sample_sizes = [], [], []
    effect_alleles, other_alleles = [], []

    vcf_in = VariantFile(fh)
    try:
        sample = list(vcf_in.header.samples)[0]
        # Peek at the first record to see which FORMAT fields exist. An empty
        # VCF has no first record and simply yields an empty frame.
        first_record = next(vcf_in.fetch(), None)
        has_ss = first_record is not None and 'SS' in first_record.format.keys()
        vcf_in.seek(0)

        # Read in data
        for rec in vcf_in.fetch():
            call = rec.samples[sample]
            snp_ids.append(rec.id)
            z_scores.append(call['ES'][0] / call['SE'][0])
            if has_ss:
                sample_sizes.append(call['SS'][0])
            if alleles:
                effect_alleles.append(rec.alts[0])
                other_alleles.append(rec.ref)

        if has_ss:
            N = pd.Series(sample_sizes, dtype='float')
        else:
            # Only consult the header when per-variant SS is absent, so files
            # without a SAMPLE header record are still readable.
            header_n = _vcf_header_sample_size(vcf_in.header)
            fill = np.nan if header_n is None else header_n
            N = pd.Series([fill] * len(snp_ids), dtype='float')
    finally:
        vcf_in.close()

    columns = {
        'SNP': pd.Series(snp_ids, dtype='str'),
        'Z': pd.Series(z_scores, dtype='float'),
        'N': N,
    }
    if alleles:
        columns['A1'] = pd.Series(effect_alleles, dtype='str')
        columns['A2'] = pd.Series(other_alleles, dtype='str')
    p = pd.DataFrame(columns)

    if slh is not None:
        compression = get_compression(slh)
        try:
            # Text mode for gzip: binary mode yields bytes, which would never
            # compare equal to the str identifiers in the SNP column.
            if compression == "gzip":
                snplist_file = gzip.open(slh, 'rt')
            else:
                snplist_file = open(slh)
            with snplist_file as f:
                sl = [line.strip() for line in f]
        except (AttributeError, ValueError) as e:
            raise ValueError('Improperly formatted snplist file: ' +
                             str(e.args))
        p = p.loc[p['SNP'].isin(sl)]

    return p