def test_get_location_from_rsid():
    """Look up rs10399793 by rsID and check the returned location.

    The lookup is exercised twice: once right after building the index with
    ``index_rsid()``, and once on a handle opened with an explicit
    ``rsidx_path`` pointing at ``FILE + ".rsidx"``.
    """
    with pygwasvcf.GwasVcf(FILE) as g:
        g.index_rsid()
        chrom, pos = g.get_location_from_rsid("rs10399793")
        # Validate the values that were actually returned; passing literals
        # to the checker would not test the lookup at all.
        check_first_row(chrom, pos)

    with pygwasvcf.GwasVcf(FILE, rsidx_path=FILE + ".rsidx") as g:
        chrom, pos = g.get_location_from_rsid("rs10399793")
        assert chrom == "1"
        assert pos == 49298
def test_get_nc_from_metadata():
    """Check ``get_nc`` when the file metadata is passed in.

    With the metadata supplied, each record in the query window must report
    9. After ``'TotalCases'`` is removed from the trait's metadata, the same
    call must raise ``KeyError``.
    """
    get_nc = pygwasvcf.VariantRecordGwasFuns.get_nc
    window = dict(contig=CHROM, start=START, stop=STOP)

    with pygwasvcf.GwasVcf(FILE) as g:
        metadata = g.get_metadata()

        for rec in g.query(**window):
            assert get_nc(rec, TRAIT, metadata) == 9

        # Drop the metadata entry so the lookup has nothing to read.
        del metadata[TRAIT]['TotalCases']

        with pytest.raises(KeyError):
            for rec in g.query(**window):
                get_nc(rec, TRAIT, metadata)
def test_get_ss_from_metadata():
    """Check ``get_ss`` when the file metadata is passed in.

    With the full metadata, each record in the query window must report
    463001 + 9. After ``'TotalCases'`` is removed from the trait's metadata,
    the reported value must be 463001.
    """
    get_ss = pygwasvcf.VariantRecordGwasFuns.get_ss
    window = dict(contig=CHROM, start=START, stop=STOP)

    with pygwasvcf.GwasVcf(FILE) as g:
        metadata = g.get_metadata()

        for rec in g.query(**window):
            observed = get_ss(rec, TRAIT, metadata)
            assert observed == (463001 + 9)

        del metadata[TRAIT]['TotalCases']

        for rec in g.query(**window):
            observed = get_ss(rec, TRAIT, metadata)
            assert observed == 463001
def test_get_id_chrpos():
    """Check ``get_id`` with and without ``create_if_missing``.

    For each record in the query window: an ID is returned while the sample's
    ``'ID'`` field is present; once that field is deleted,
    ``create_if_missing=False`` must raise ``KeyError`` and
    ``create_if_missing=True`` must return ``"1-49298-T-C"``.
    """
    get_id = pygwasvcf.VariantRecordGwasFuns.get_id

    with pygwasvcf.GwasVcf(FILE) as g:
        for rec in g.query(contig=CHROM, start=START, stop=STOP):
            existing = get_id(rec, TRAIT, create_if_missing=False)
            assert existing is not None

            # Remove the stored ID from this record's sample fields.
            del rec.samples[TRAIT]['ID']

            with pytest.raises(KeyError):
                assert get_id(rec, TRAIT, create_if_missing=False)

            generated = get_id(rec, TRAIT, create_if_missing=True)
            assert generated == "1-49298-T-C"
def test_get_metadata():
    """Check that ``get_metadata`` exposes the expected fields for TRAIT."""
    expected_fields = (
        "TotalVariants",
        "VariantsNotRead",
        "HarmonisedVariants",
        "VariantsNotHarmonised",
        "SwitchedAlleles",
        "TotalControls",
        "TotalCases",
        "StudyType",
    )

    with pygwasvcf.GwasVcf(FILE) as g:
        recs = g.get_metadata()
        assert TRAIT in recs
        for field in expected_fields:
            assert field in recs[TRAIT]
def test_index_rsid():
    """Build the rsID index from scratch and sanity-check its contents.

    Any existing ``FILE + ".rsidx"`` is removed, the index is rebuilt with
    ``index_rsid()``, and every row of the ``rsid_to_coord`` table is checked
    to hold an ``int``, a ``str`` and an ``int`` in its first three columns.
    """
    import contextlib

    index_path = FILE + ".rsidx"

    # delete old index if present
    with contextlib.suppress(FileNotFoundError):
        os.remove(index_path)

    # index GWAS-VCF
    with pygwasvcf.GwasVcf(FILE) as g:
        g.index_rsid()

    # check index exists
    assert os.path.exists(index_path)

    # check contents of index
    # sqlite3's own context manager only scopes a transaction; closing() is
    # what actually releases the connection (and the file handle) afterwards.
    with contextlib.closing(sqlite3.connect(index_path)) as dbconn:
        rows = dbconn.execute("SELECT * FROM rsid_to_coord").fetchall()

    # An empty table would otherwise let the per-row checks pass vacuously.
    assert rows

    for rec in rows:
        assert rec[0] is not None
        assert isinstance(rec[0], int)
        assert rec[1] is not None
        assert isinstance(rec[1], str)
        assert rec[2] is not None
        assert isinstance(rec[2], int)
def read_vcf(fh, alleles, slh=None, trait=None):
    """Read per-variant Z-scores and sample sizes from a GWAS-VCF file.

    Args:
        fh: Path of the GWAS-VCF file, handed to ``pygwasvcf.GwasVcf``.
        alleles: When truthy, the result also carries ``A1`` (first ALT
            allele) and ``A2`` (REF allele) columns.
        slh: Optional path of a SNP list file with one ID per line. When
            given, only rows whose ``SNP`` appears in the list are kept.
            Files that ``get_compression`` reports as ``"gzip"`` are read
            through ``gzip``.
        trait: Name of the trait to read. Defaults to the first trait
            reported by the file.

    Returns:
        A ``pandas.DataFrame`` with columns ``SNP``, ``Z`` (beta divided by
        standard error) and ``N``, plus ``A1``/``A2`` when ``alleles`` is
        truthy.

    Raises:
        AssertionError: If ``trait`` is given but is not present in the file.
        ValueError: If the SNP list file cannot be parsed.
    """
    funs = pygwasvcf.VariantRecordGwasFuns

    with pygwasvcf.GwasVcf(fh) as vcf_in:
        traits = vcf_in.get_traits()
        if trait is not None:
            assert trait in traits, 'trait not found in GWAS-VCF'
        else:
            trait = traits[0]

        # get global field info from header
        metadata = vcf_in.get_metadata()

        # Read in data. Both column layouts walk the same records through
        # query(), the iterator used by the allele branch.
        rows = []
        for rec in vcf_in.query():
            row = [
                # rsid or chr-pos-ref-alt
                funs.get_id(rec, trait, create_if_missing=True),
                funs.get_beta(rec, trait) / funs.get_se(rec, trait),
                # if per-snp sample size unavailable then take from header
                funs.get_ss(rec, trait, metadata),
            ]
            if alleles:
                row += [rec.alts[0], rec.ref]
            rows.append(row)

    columns = {
        'SNP': pd.Series([r[0] for r in rows], dtype='str'),
        'Z': pd.Series([r[1] for r in rows], dtype='float'),
        'N': pd.Series([r[2] for r in rows], dtype='float'),
    }
    if alleles:
        columns['A1'] = pd.Series([r[3] for r in rows], dtype='str')
        columns['A2'] = pd.Series([r[4] for r in rows], dtype='str')
    p = pd.DataFrame(columns)

    if slh is not None:
        opener = gzip.open if get_compression(slh) == "gzip" else open
        try:
            # Text mode is essential for gzip: its default binary mode yields
            # bytes, which never compare equal to the str IDs in p['SNP'].
            with opener(slh, 'rt') as f:
                sl = [line.strip() for line in f]
        except (AttributeError, ValueError) as e:
            raise ValueError('Improperly formatted snplist file: ' +
                             str(e.args)) from e
        p = p.loc[p['SNP'].isin(sl)]

    return p
def test_get_nc():
    """Check that ``get_nc`` raises ``KeyError`` when called without metadata."""
    get_nc = pygwasvcf.VariantRecordGwasFuns.get_nc
    window = dict(contig=CHROM, start=START, stop=STOP)

    with pygwasvcf.GwasVcf(FILE) as g:
        for rec in g.query(**window):
            with pytest.raises(KeyError):
                assert get_nc(rec, TRAIT)
def test_get_id_rsid():
    """Check that ``get_id`` returns ``"rs10399793"`` for the query window."""
    window = dict(contig=CHROM, start=START, stop=STOP)

    with pygwasvcf.GwasVcf(FILE) as g:
        for rec in g.query(**window):
            variant_id = pygwasvcf.VariantRecordGwasFuns.get_id(rec, TRAIT)
            assert variant_id == "rs10399793"
def test_get_af():
    """Check that ``get_af`` returns approximately 0.623765 for the query window."""
    window = dict(contig=CHROM, start=START, stop=STOP)

    with pygwasvcf.GwasVcf(FILE) as g:
        for rec in g.query(**window):
            allele_freq = pygwasvcf.VariantRecordGwasFuns.get_af(rec, TRAIT)
            assert allele_freq == pytest.approx(0.623765)
def test_query_by_rsid():
    """Query by rsID and expect exactly one record at the expected location.

    Each returned record's ``chrom``/``pos`` is passed to ``check_first_row``
    (defined elsewhere in this module).
    """
    with pygwasvcf.GwasVcf(FILE) as g:
        g.index_rsid()

        # Count explicitly so that an empty result fails on the assertion
        # below instead of leaving a loop variable unbound.
        n_rows = 0
        for row in g.query(variant_id="rs10399793"):
            check_first_row(row.chrom, row.pos)
            n_rows += 1

        assert n_rows == 1
def test_query_by_chr_pos():
    """Query by region and expect exactly one record at the expected location.

    Each returned record's ``chrom``/``pos`` is passed to ``check_first_row``
    (defined elsewhere in this module).
    """
    with pygwasvcf.GwasVcf(FILE) as g:
        # Count explicitly so that an empty result fails on the assertion
        # below instead of leaving a loop variable unbound.
        n_rows = 0
        for row in g.query(contig="1", start=49297, stop=49298):
            check_first_row(row.chrom, row.pos)
            n_rows += 1

        assert n_rows == 1
def test_close():
    """Check ``is_closed`` inside a ``with`` block, after it, and on a fresh object."""
    with pygwasvcf.GwasVcf(FILE) as managed:
        # Inside the context manager the handle reports open.
        assert not managed.is_closed()

    # Leaving the block must close it.
    assert managed.is_closed()

    # An object that was constructed but never entered also reports closed.
    never_entered = pygwasvcf.GwasVcf(FILE)
    assert never_entered.is_closed()
def test_query_all():
    """Iterate the whole file: check the first record and expect more than one.

    The first record's ``chrom``/``pos`` is passed to ``check_first_row``
    (defined elsewhere in this module).
    """
    with pygwasvcf.GwasVcf(FILE) as g:
        # Count explicitly so that an empty file fails on the assertion below
        # instead of leaving a loop variable unbound.
        n_rows = 0
        for row in g.query():
            if n_rows == 0:
                check_first_row(row.chrom, row.pos)
            n_rows += 1

        # The original required the last zero-based index to exceed 0,
        # i.e. at least two records.
        assert n_rows > 1