Example #1
0
def test_relatedness_coefficient():
    """Check relatedness coefficients in a hand-wired family.

    The links set below are kid -> mom -> gma -> ggma on the maternal side and
    kid -> dad; one extra sample in the same family has no links at all.
    """
    from io import StringIO

    # (sample_id, paternal_id, maternal_id, sex); family id and phenotype are shared.
    specs = [
        ('kid', 'dad', 'mom', '2'),
        ('dad', '-9', '-9', '1'),
        ('mom', '-9', '-9', '2'),
        ('gma', '-9', '-9', '2'),
        ('ggma', '-9', '-9', '2'),
        ('un', '-9', '-9', '2'),
    ]
    members = {}
    for sample_id, pat, mat, sex in specs:
        members[sample_id] = Sample('fam1', sample_id, pat, mat, sex, '2')

    members['kid'].mom = members['mom']
    members['kid'].dad = members['dad']
    members['mom'].mom = members['gma']
    members['gma'].mom = members['ggma']

    p = Ped(StringIO())
    family_order = ('kid', 'mom', 'dad', 'gma', 'ggma', 'un')
    p.families['fam1'] = Family([members[name] for name in family_order])

    expectations = [
        ("mom", "dad", 0.0),
        ("mom", "kid", 0.5),
        ("dad", "gma", 0.0),
        ("mom", "gma", 0.5),
        ("kid", "gma", 0.25),
        ("kid", "ggma", 0.125),
    ]
    for first, second, want in expectations:
        got = p.relatedness_coefficient(first, second)
        assert got == want, got

    assert p.relatedness_coefficient("mom", "mom") == 1.0
Example #2
0
def t_ped_check():
    """Run ``Ped.ped_check`` on the mendel fixtures and check it returns a DataFrame.

    The check is repeated after dropping the last sample of the first family
    and again after re-adding that sample under a different sample id.
    Returns without doing anything when pandas or cyvcf2 cannot be imported.
    """
    try:
        import pandas as pd
        import cyvcf2  # noqa: F401 -- imported only to probe availability
    except ImportError:
        return
    ped = Ped(op.join(HERE, 'peddy/tests/test.mendel.ped'))
    # Native-str literal: HERE is joined with a str just above, and mixing a
    # bytes component into the same join raises TypeError on Python 3.
    vcf_path = op.join(HERE, 'peddy/tests/test.mendel.vcf.gz')

    result = ped.ped_check(vcf_path)
    assert isinstance(result, pd.DataFrame), result

    # remove samples
    fam = list(ped.families.values())[0]
    n_before = len(fam.samples)
    dropped = fam.samples[-1]
    fam.samples = fam.samples[:-1]
    assert n_before - 1 == len(fam.samples)
    result = ped.ped_check(vcf_path)
    assert isinstance(result, pd.DataFrame), result
    assert "ibs0" in result.columns

    # changed the sample id of a sample
    dropped.sample_id = "XDFSDFX"
    fam.samples.append(dropped)
    result = ped.ped_check(vcf_path)
    assert isinstance(result, pd.DataFrame), result
Example #3
0
def t_ped_check():
    """Run ``Ped.ped_check`` on the mendel fixtures and check it returns a DataFrame.

    The check is repeated after dropping the last sample of the first family
    and again after re-adding that sample under a different sample id.
    Returns without doing anything when pandas or cyvcf2 cannot be imported.
    """
    try:
        import pandas as pd
        import cyvcf2  # noqa: F401 -- imported only to probe availability
    except ImportError:
        return
    ped = Ped(op.join(HERE, 'peddy/tests/test.mendel.ped'))
    # Native-str literal: HERE is joined with a str just above, and mixing a
    # bytes component into the same join raises TypeError on Python 3.
    vcf_path = op.join(HERE, 'peddy/tests/test.mendel.vcf.gz')

    result = ped.ped_check(vcf_path)
    assert isinstance(result, pd.DataFrame), result

    # remove samples
    fam = list(ped.families.values())[0]
    n_before = len(fam.samples)
    dropped = fam.samples[-1]
    fam.samples = fam.samples[:-1]
    assert n_before - 1 == len(fam.samples)
    result = ped.ped_check(vcf_path)
    assert isinstance(result, pd.DataFrame), result
    assert "ibs0" in result.columns

    # changed the sample id of a sample
    dropped.sample_id = "XDFSDFX"
    fam.samples.append(dropped)
    result = ped.ped_check(vcf_path)
    assert isinstance(result, pd.DataFrame), result
Example #4
0
    def create_samples(self):
        """Create and fill the ``samples`` table from the PED file.

        Only PED samples whose fixed name also occurs in ``self.vcf.samples``
        get a row; the ids of the others are reported on stderr.  As a side
        effect ``self.sample_idxs`` is set to the position in
        ``self.vcf.samples`` of each kept sample, in row order.

        Returns:
            list: the fixed sample name of every inserted row.
        """
        ped = Ped(self.ped_path)
        cols = [
            'sample_id', 'family_id', 'name', 'paternal_id', 'maternal_id',
            'sex', 'phenotype'
        ]
        if ped.header is None:
            ped.header = [x for x in cols if x != 'name']
        # header entries past the first six become extra table columns.
        cols.extend(ped.header[6:])

        # First position of each fixed VCF sample name, built in one pass so the
        # loop below does a dict lookup instead of a list.index() scan per sample.
        vcf_pos = {}
        for pos, vcf_name in enumerate(self.vcf.samples):
            vcf_pos.setdefault(fix_sample_name(vcf_name), pos)

        idxs, rows, not_in_vcf = [], [], []
        for s in ped.samples():
            name = fix_sample_name(s.sample_id)
            if name not in vcf_pos:
                not_in_vcf.append(s.sample_id)
                continue
            idxs.append(vcf_pos[name])
            rows.append([
                len(rows) + 1,  # 1-based id counting only the kept samples
                s.family_id,
                name,
                fix_sample_name(str(s.paternal_id)),
                fix_sample_name(str(s.maternal_id)),
                '1' if s.sex == 'male' else '2' if s.sex == 'female' else '-9',
                '2' if s.affected is True else
                '1' if s.affected is False else '-9',
            ] + s.attrs)

        if len(not_in_vcf) > 0:
            print("not in VCF: %s" % ",".join(not_in_vcf), file=sys.stderr)

        scols = [sql.Column('sample_id', sql.Integer, primary_key=True)]
        # every column after sample_id is a Unicode column sized to its longest value.
        for i, col in enumerate(cols[1:], start=1):
            vals = None
            try:
                vals = [r[i] for r in rows]
                width = max(len(v) for v in vals)
                scols.append(sql.Column(col, Unicode(width)))
            except Exception:
                # show the offending column before propagating the error.
                print(col, vals, file=sys.stderr)
                raise

        t = sql.Table('samples', self.metadata, *scols)
        t.drop(checkfirst=True)
        t.create()

        self.engine.execute(t.insert(), [dict(zip(cols, r)) for r in rows])

        # track the order to pull from the genotype fields.
        self.sample_idxs = np.array(idxs)
        return [r[2] for r in rows]
Example #5
0
def test_sex_check():
    """Check that ``Ped.sex_check`` reports the expected columns.

    The test returns immediately (checks nothing) on Python 3.
    """
    if sys.version_info[0] == 3:
        return

    p = Ped(op.join(HERE, 'peddy/tests/test.mendel.ped'))
    df = p.sex_check(op.join(HERE, 'peddy/tests/test.mendel.vcf.gz'))

    assert "predicted_sex" in df.columns
    # A membership test: `assert "ped_sex", df.columns` only asserted that a
    # non-empty string is truthy and therefore could never fail.
    assert "ped_sex" in df.columns
    assert "error" in df.columns
Example #6
0
def test_relatedness_coefficient():
    """Check relatedness coefficients in a hand-wired family.

    The links set below are kid -> mom -> gma -> ggma on the maternal side and
    kid -> dad; one extra sample in the same family has no links at all.
    """
    from io import StringIO

    # (sample_id, paternal_id, maternal_id, sex); family id and phenotype are shared.
    specs = [
        ('kid', 'dad', 'mom', '2'),
        ('dad', '-9', '-9', '1'),
        ('mom', '-9', '-9', '2'),
        ('gma', '-9', '-9', '2'),
        ('ggma', '-9', '-9', '2'),
        ('un', '-9', '-9', '2'),
    ]
    members = {}
    for sample_id, pat, mat, sex in specs:
        members[sample_id] = Sample('fam1', sample_id, pat, mat, sex, '2')

    members['kid'].mom = members['mom']
    members['kid'].dad = members['dad']
    members['mom'].mom = members['gma']
    members['gma'].mom = members['ggma']

    p = Ped(StringIO())
    family_order = ('kid', 'mom', 'dad', 'gma', 'ggma', 'un')
    p.families['fam1'] = Family([members[name] for name in family_order])

    expectations = [
        ("mom", "dad", 0.0),
        ("mom", "kid", 0.5),
        ("dad", "gma", 0.0),
        ("mom", "gma", 0.5),
        ("kid", "gma", 0.25),
        ("kid", "ggma", 0.125),
    ]
    for first, second, want in expectations:
        got = p.relatedness_coefficient(first, second)
        assert got == want, got

    assert p.relatedness_coefficient("mom", "mom") == 1.0
Example #7
0
def test_relatedness_coefficient_missing_parent():
    """Relatedness when a linked parent is not a member of the Family.

    Both kids point at ``dad`` and ``mom``, but only the kids, mom and gma are
    put into the family; the kids must still come out as siblings at 0.5.
    """
    from io import StringIO

    gma = Sample('X28935', 'gma', '-9', '-9', '2', '1')
    mom = Sample('X28935', 'mom', '-9', 'gma', '2', '1')
    dad = Sample('X28935', 'dad', '-9', '-9', '1', '1')

    # Each kid is built once; an earlier pair of constructions with paternal
    # id '-9' was overwritten before use and has been dropped.
    kid1 = Sample('X28935', 'kid1', 'dad', 'mom', '1', '1')
    kid2 = Sample('X28935', 'kid2', 'dad', 'mom', '2', '1')

    kid1.mom = mom
    kid2.mom = mom
    mom.mom = gma
    kid1.dad = dad
    kid2.dad = dad

    p = Ped(StringIO())
    # dad is linked from both kids but is not added to the family.
    p.families['X28935'] = Family([kid1, kid2, mom, gma])

    sib_relation = p.relation('kid1', 'kid2')
    assert "siblings" in sib_relation, sib_relation

    expectations = [
        ('kid1', 'kid2', 0.5),
        ('gma', 'kid2', 0.25),
        ('gma', 'kid1', 0.25),
        ('gma', 'mom', 0.5),
    ]
    for first, second, want in expectations:
        v = p.relatedness_coefficient(first, second)
        assert v == want, v
Example #8
0
def test_relation():
    """The two parents of a shared child are reported as "mom-dad"."""
    from io import StringIO

    child = Sample('fam1', 'kid', 'dad', 'mom', '2', '2')
    father = Sample('fam1', 'dad', '-9', '-9', '1', '2')
    mother = Sample('fam1', 'mom', '-9', '-9', '2', '2')
    child.mom, child.dad = mother, father

    pedigree = Ped(StringIO())
    pedigree.families['fam1'] = Family([child, mother, father])

    label = pedigree.relation("mom", "dad")
    assert label == "mom-dad"
Example #9
0
def test_relation():
    """The two parents of a shared child are reported as "mom-dad"."""
    from io import StringIO

    child = Sample('fam1', 'kid', 'dad', 'mom', '2', '2')
    father = Sample('fam1', 'dad', '-9', '-9', '1', '2')
    mother = Sample('fam1', 'mom', '-9', '-9', '2', '2')
    child.mom, child.dad = mother, father

    pedigree = Ped(StringIO())
    pedigree.families['fam1'] = Family([child, mother, father])

    label = pedigree.relation("mom", "dad")
    assert label == "mom-dad"
Example #10
0
    def create_samples(self):
        """Create and fill the ``samples`` table from the PED file.

        Only PED samples whose fixed name also occurs in ``self.vcf.samples``
        get a row; the ids of the others are reported on stderr.  As a side
        effect ``self.sample_idxs`` is set to the position in
        ``self.vcf.samples`` of each kept sample, in row order.

        Returns:
            list: the fixed sample name of every inserted row.
        """
        ped = Ped(self.ped_path)
        cols = ["sample_id", "family_id", "name", "paternal_id", "maternal_id", "sex", "phenotype"]
        if ped.header is None:
            ped.header = [x for x in cols if x != "name"]
        # header entries past the first six become extra table columns.
        cols.extend(ped.header[6:])

        # First position of each fixed VCF sample name, built in one pass so the
        # loop below does a dict lookup instead of a list.index() scan per sample.
        vcf_pos = {}
        for pos, vcf_name in enumerate(self.vcf.samples):
            vcf_pos.setdefault(fix_sample_name(vcf_name), pos)

        idxs, rows, not_in_vcf = [], [], []
        for s in ped.samples():
            name = fix_sample_name(s.sample_id)
            if name not in vcf_pos:
                not_in_vcf.append(s.sample_id)
                continue
            idxs.append(vcf_pos[name])
            rows.append(
                [
                    len(rows) + 1,  # 1-based id counting only the kept samples
                    s.family_id,
                    name,
                    fix_sample_name(str(s.paternal_id)),
                    fix_sample_name(str(s.maternal_id)),
                    "1" if s.sex == "male" else "2" if s.sex == "female" else "-9",
                    "2" if s.affected is True else "1" if s.affected is False else "-9",
                ]
                + s.attrs
            )

        if len(not_in_vcf) > 0:
            print("not in VCF: %s" % ",".join(not_in_vcf), file=sys.stderr)

        scols = [sql.Column("sample_id", sql.Integer, primary_key=True)]
        # every column after sample_id is a Unicode column sized to its longest value.
        for i, col in enumerate(cols[1:], start=1):
            vals = None
            try:
                vals = [r[i] for r in rows]
                width = max(len(v) for v in vals)
                scols.append(sql.Column(col, Unicode(width)))
            except Exception:
                # show the offending column before propagating the error.
                print(col, vals, file=sys.stderr)
                raise

        t = sql.Table("samples", self.metadata, *scols)
        t.drop(checkfirst=True)
        t.create()

        self.engine.execute(t.insert(), [dict(zip(cols, r)) for r in rows])

        # track the order to pull from the genotype fields.
        self.sample_idxs = np.array(idxs)
        return [r[2] for r in rows]
Example #11
0
def test_relatedness_coefficient_missing_parent():
    """Relatedness when a linked parent is not a member of the Family.

    Both kids point at ``dad`` and ``mom``, but only the kids, mom and gma are
    put into the family; the kids must still come out as siblings at 0.5.
    """
    from io import StringIO

    gma = Sample('X28935', 'gma', '-9', '-9', '2', '1')
    mom = Sample('X28935', 'mom', '-9', 'gma', '2', '1')
    dad = Sample('X28935', 'dad', '-9', '-9', '1', '1')

    # Each kid is built once; an earlier pair of constructions with paternal
    # id '-9' was overwritten before use and has been dropped.
    kid1 = Sample('X28935', 'kid1', 'dad', 'mom', '1', '1')
    kid2 = Sample('X28935', 'kid2', 'dad', 'mom', '2', '1')

    kid1.mom = mom
    kid2.mom = mom
    mom.mom = gma
    kid1.dad = dad
    kid2.dad = dad

    p = Ped(StringIO())
    # dad is linked from both kids but is not added to the family.
    p.families['X28935'] = Family([kid1, kid2, mom, gma])

    sib_relation = p.relation('kid1', 'kid2')
    assert "siblings" in sib_relation, sib_relation

    expectations = [
        ('kid1', 'kid2', 0.5),
        ('gma', 'kid2', 0.25),
        ('gma', 'kid1', 0.25),
        ('gma', 'mom', 0.5),
    ]
    for first, second, want in expectations:
        v = p.relatedness_coefficient(first, second)
        assert v == want, v
Example #12
0
def generate_trios(pedfile, f1=True):
    """
    Yield (kid, mom, dad) sample-id trios from a PED file.

    With ``f1`` true, the kid of each trio is an F1: it has a mom and a dad
    and its mom has no mom of her own.  With ``f1`` false, the kid is an F2:
    it has a mom and a dad and its mom has a mom as well.

    Args:
        pedfile: passed straight to ``peddy.Ped``.
        f1: select F1 kids (default) or F2 kids.

    Yields:
        tuple: ``(kid.sample_id, mom.sample_id, dad.sample_id)``.
    """
    from peddy import Ped
    ped = Ped(pedfile)
    for kid in ped.samples():
        # Both parents are required for either generation: the F2 filter used
        # to skip the dad check, so a kid without a dad crashed on
        # `None.sample_id` when the trio was emitted.
        if not (kid.mom and kid.dad):
            continue
        if f1:
            wanted = kid.mom.mom is None
        else:
            wanted = bool(kid.mom.mom)
        if wanted:
            yield (kid.sample_id, kid.mom.sample_id, kid.dad.sample_id)
Example #13
0
def test_trios():
    """family_4 of a.ped yields three trios whose first members equal its affecteds."""
    ped = Ped(op.join(HERE, 'peddy/tests/a.ped'))
    family = ped.families['family_4']

    found = list(family.trios())
    assert len(found) == 3

    first_members = [trio[0] for trio in found]
    assert first_members == list(family.affecteds)
Example #14
0
def test_relatedness_coefficient_missing_gparent():
    """Check uncle and parent-child coefficients on two PED fixtures.

    The same three pairs are checked against ``test.fam.ped`` and
    ``test.fam2.ped``, each passed to ``Ped`` as an open file handle.
    """
    expectations = [
        # uncle
        ('101806-101806', '101811-101811', 0.25),
        ('101806-101806', '101809-101809', 0.25),
        # parent-child
        ('101806-101806', '101653-101653', 0.5),
    ]
    for fixture in ("peddy/tests/test.fam.ped", "peddy/tests/test.fam2.ped"):
        # The handle stays open while the pedigree is queried and is closed
        # afterwards (it used to be opened inline and never closed).
        with open(os.path.join(HERE, fixture)) as fh:
            p = Ped(fh)
            for first, second, want in expectations:
                v = p.relatedness_coefficient(first, second)
                assert v == want, v
Example #15
0
def test_relatedness_coefficient_missing_gparent():
    """Check uncle and parent-child coefficients on two PED fixtures.

    The same three pairs are checked against ``test.fam.ped`` and
    ``test.fam2.ped``, each passed to ``Ped`` as an open file handle.
    """
    expectations = [
        # uncle
        ('101806-101806', '101811-101811', 0.25),
        ('101806-101806', '101809-101809', 0.25),
        # parent-child
        ('101806-101806', '101653-101653', 0.5),
    ]
    for fixture in ("peddy/tests/test.fam.ped", "peddy/tests/test.fam2.ped"):
        # The handle stays open while the pedigree is queried and is closed
        # afterwards (it used to be opened inline and never closed).
        with open(os.path.join(HERE, fixture)) as fh:
            p = Ped(fh)
            for first, second, want in expectations:
                v = p.relatedness_coefficient(first, second)
                assert v == want, v
Example #16
0
def test_json():
    """The ``to_json`` output of the mendel pedigree contains "CEPH1463"."""
    ped = Ped(op.join(HERE, 'peddy/tests/test.mendel.ped'))
    dumped = ped.to_json()
    # Only a substring is checked: comparing against a full expected string
    # would fail whenever the order of the dicts changes.
    assert "CEPH1463" in dumped, dumped
Example #17
0
def run(args):
    """Collect sibling inheritance states at parent-informative SNPs and pass them to ``get_xo``.

    For every family in the PED file the founder female and founder male
    (samples with neither parent set) are taken as mom and dad, and the
    samples having exactly those two parents as siblings.  Each variant from
    ``vcf(args.chrom)`` that passes the site filters, and at which one parent
    is HET while the other is HOM_REF, adds one row of per-sibling states
    (0 or 1) to ``haps[<het parent id>]`` and its start coordinate to
    ``inf_positions[<het parent id>]``.

    Args:
        args: options object; this function reads ``vcf``, ``ped``,
            ``exclude``, ``chrom``, ``inf_parent`` and ``ab`` and passes the
            whole object on to ``get_xo``.
    """
    print ('\t'.join(['chrom', 'start', 'end', 'sample_id', 'parent_id', 'n_vars', 'hap_start']))
    vcf = VCF(args.vcf)
    ped = Ped(args.ped)

    samples = [s for s in ped.samples()]
    kids = [s for s in samples if s.mom is not None and s.dad is not None]
    fams = set([s.family_id for s in samples])

    smp2ped = dict(zip([s.sample_id for s in samples], samples))

    exclude = read_exclude(args.exclude)

    smp2idx = dict(zip(vcf.samples, range(len(vcf.samples))))

    haps = defaultdict()
    inf_positions = defaultdict(list)

    fam_dict = defaultdict(lambda: defaultdict())

    for fam in fams:
        # NOTE(review): [0] raises IndexError for a family without a founder of either sex.
        mom = [s for s in samples if s.family_id == fam and s.mom is None and s.dad is None and s.sex == 'female'][0]
        dad = [s for s in samples if s.family_id == fam and s.mom is None and s.dad is None and s.sex == 'male'][0]
        sibs = [s.sample_id for s in kids if s.dad == dad and s.mom == mom]

        fam_dict[fam]['mom'] = mom
        fam_dict[fam]['dad'] = dad
        fam_dict[fam]['sibs'] = ','.join(sibs)

    nused, i, report_at, t0 = 0, 0, 1000, time.time()
    for i, v in enumerate(vcf(args.chrom)):
        if i != 0 and i % report_at == 0:
            persec = i / float(time.time() - t0)
            print("%s:%d (%.1f/sec) %.2f%% informative (%d/%d variants)" % (v.CHROM, v.POS,
                  persec, 100.0 * nused/i, nused, i), file=sys.stderr)

        # site-level filters
        if args.exclude and len(exclude[v.CHROM].search(v.start, v.end)) > 0: continue
        if v.var_type != 'snp': continue
        if v.FILTER not in ('PASS', None): continue
        if v.call_rate < 0.90: continue
        if len(v.ALT) > 1: continue
        if len(v.ALT[0]) > 1: continue

        gts = v.gt_types
        quals = v.gt_quals
        rd, ad = v.gt_ref_depths, v.gt_alt_depths

        for fam in fam_dict:

            mom = fam_dict[fam]['mom']
            dad = fam_dict[fam]['dad']
            sibs = fam_dict[fam]['sibs'].split(',')

            # args.inf_parent is compared with sample ids further down, so it is
            # matched against the parents' ids here too (not the Sample objects).
            if args.inf_parent and args.inf_parent not in (mom.sample_id, dad.sample_id): continue

            try:
                mi, di = smp2idx[mom.sample_id], smp2idx[dad.sample_id]
            except KeyError:
                # still aborts the whole run, but now says why.
                sys.exit("parent of family %s (%s or %s) is not a sample in the VCF"
                         % (fam, mom.sample_id, dad.sample_id))

            # ensure we're at an informative site

            if gts[mi] != HOM_REF and gts[di] != HOM_REF: continue
            if gts[mi] == HOM_REF and gts[di] == HOM_REF: continue

            inf_parent = None

            if (gts[mi] == HET and gts[di] == HOM_REF): inf_parent = mom.sample_id
            elif (gts[di] == HET and gts[mi] == HOM_REF): inf_parent = dad.sample_id
            else: continue

            # check that both parents are "high-quality"
            if not is_good_site(mi, quals, gts, ad, rd, het_ab=args.ab): continue
            if not is_good_site(di, quals, gts, ad, rd, het_ab=args.ab): continue

            if args.inf_parent and inf_parent != args.inf_parent: continue

            # catalog the "states" of each child w/r/t the informative parent
            # if kids are HETs, their state w/r/t to the INF parent is 0
            states = []
            sib_pass = []
            # No enumerate() index here: it used to be named `i` and overwrote
            # the variant counter that the progress report relies on.
            for k in sibs:
                k_idx = smp2idx[k]
                k_pass = is_good_site(k_idx, quals, gts, ad, rd, het_ab=args.ab)
                sib_pass.append(k_pass)
                k_state = -1
                # k is a sample id string, so the sex comes from its Sample.
                if v.CHROM in ('chrX', 'X') and smp2ped[k].sex == "male":
                    k_state = 0 if gts[k_idx] == HOM_ALT else 1
                else:
                    k_state = 0 if gts[k_idx] == HET else 1

                states.append(k_state)

            if not all([x is True for x in sib_pass]): continue

            if sum([s < 0 for s in states]) > 0: continue

            if inf_parent not in haps:
                haps[inf_parent] = np.array(states)
            else:
                haps[inf_parent] = np.vstack((haps[inf_parent], np.array(states)))

            inf_positions[inf_parent].append(v.start)

        # NOTE(review): this counts every variant that passes the site filters,
        # whether or not any family found it informative.
        nused += 1

    xos = get_xo(args, haps, inf_positions)
Example #18
0
def test_6():
    """a6.ped yields 14 samples, each with a family id starting with "fam"."""
    ped = Ped(op.join(HERE, 'peddy/tests/a6.ped'))

    n_samples = sum(1 for _ in ped.samples())
    assert n_samples == 14

    assert all(sample.family_id[:3] == "fam" for sample in ped.samples())
Example #19
0
def test_distant():
    """Coefficients and relation labels for the test-unknown-gma.ped fixture."""
    ped = Ped(op.join(HERE, 'peddy/tests/test-unknown-gma.ped'))

    coefficients = [
        ('kid1', 'cousin1', 0.125),
        ('kid1', 'aunt', 0.25),
        ('cousin1', 'aunt', 0.5),
        ('mom', 'aunt', 0.5),
    ]
    for first, second, want in coefficients:
        got = ped.relatedness_coefficient(first, second)
        assert got == want, got

    relations = [
        ('kid1', 'cousin1', 'cousins'),
        ('kid1', 'grandma', 'grandchild'),
        ('kid1', 'aunt', 'niece/nephew'),
        # because we don't know that the uncle is related
        ('kid1', 'uncle', 'related at unknown level'),
        ('cousin1', 'mom', 'niece/nephew'),
        # because we don't know that the dad is related
        ('cousin1', 'dad', 'related at unknown level'),
    ]
    for first, second, want in relations:
        label = ped.relation(first, second)
        assert label == want, label
Example #20
0
def test_ped():
    """a.ped parses into 4 families and 14 samples."""
    ped = Ped(op.join(HERE, 'peddy/tests/a.ped'))

    n_families = len(ped.families)
    assert n_families == 4

    n_samples = sum(1 for _ in ped.samples())
    assert n_samples == 14
Example #21
0
def test_getattr():
    """Filtering ``samples()`` by an ``ethnicity`` keyword returns five matching samples."""
    ped = Ped(op.join(HERE, 'peddy/tests/a.ped'))
    wanted = 'caucasianNEuropean'

    matches = list(ped.samples(ethnicity=wanted))
    assert len(matches) == 5
    assert all(sample.ethnicity == wanted for sample in matches)
Example #22
0
def test_distant():
    """Coefficients and relation labels for the test-unknown-gma.ped fixture."""
    ped = Ped(op.join(HERE, 'peddy/tests/test-unknown-gma.ped'))

    coefficients = [
        ('kid1', 'cousin1', 0.125),
        ('kid1', 'aunt', 0.25),
        ('cousin1', 'aunt', 0.5),
        ('mom', 'aunt', 0.5),
    ]
    for first, second, want in coefficients:
        got = ped.relatedness_coefficient(first, second)
        assert got == want, got

    relations = [
        ('kid1', 'cousin1', 'cousins'),
        ('kid1', 'grandma', 'grandchild'),
        ('kid1', 'aunt', 'niece/nephew'),
        # because we don't know that the uncle is related
        ('kid1', 'uncle', 'related at unknown level'),
        ('cousin1', 'mom', 'niece/nephew'),
        # because we don't know that the dad is related
        ('cousin1', 'dad', 'related at unknown level'),
    ]
    for first, second, want in relations:
        label = ped.relation(first, second)
        assert label == want, label
Example #23
0
def test_json():
    """The ``to_json`` output of the mendel pedigree contains "CEPH1463"."""
    ped = Ped(op.join(HERE, 'peddy/tests/test.mendel.ped'))
    dumped = ped.to_json()
    # Only a substring is checked: comparing against a full expected string
    # would fail whenever the order of the dicts changes.
    assert "CEPH1463" in dumped, dumped
import csv
import sys
from peddy import Ped
from collections import defaultdict

# Script body: argv[1] is a tab-separated table (key, genotype, alleles, reads
# in its first four columns), argv[2] a PED file.  One line is printed for
# every kid whose genotype string matches neither parent's.
samples = list(Ped(sys.argv[2]).samples())
kids = [s for s in samples if not (s.mom is None or s.dad is None)]

# keyed by the first column of the table
gt = defaultdict()
alleles = defaultdict()
reads = defaultdict()

with open(sys.argv[1]) as f:
    fh = csv.reader(f, delimiter='\t')
    for row in fh:
        key = row[0]
        gt[key] = row[1]
        alleles[key] = row[2]
        reads[key] = row[3]

md_seen = []

inh_errors = 0
total_inh = 0

for s in kids:
    # kids of this one dad are skipped
    if s.dad.sample_id == '8477':
        continue
    mom_id, dad_id = s.mom.sample_id, s.dad.sample_id
    parent_gts = [gt[mom_id], gt[dad_id]]
    if gt[s.sample_id] in parent_gts:
        continue
    fields = [
        s.family_id, s.sample_id,
        gt[s.sample_id], gt[mom_id], gt[dad_id],
        alleles[s.sample_id], alleles[mom_id], alleles[dad_id],
        reads[s.sample_id],
    ]
    print('\t'.join(fields))
Example #25
0
def test_getattr():
    """Filtering ``samples()`` by an ``ethnicity`` keyword returns five matching samples."""
    ped = Ped(op.join(HERE, 'peddy/tests/a.ped'))
    wanted = 'caucasianNEuropean'

    matches = list(ped.samples(ethnicity=wanted))
    assert len(matches) == 5
    assert all(sample.ethnicity == wanted for sample in matches)
Example #26
0
def run(args):
    """Scan a VCF for parent-informative SNPs and write per-family crossover input.

    The VCF is restricted to samples present in both the PED file and the
    VCF.  For every variant that passes the site filters, each family is
    checked (``phased_check`` when all its genotypes are phased, otherwise the
    unphased one-HET-parent logic below) and informative sites are written to
    the family's ``fh-dad`` / ``fh-mom`` handles.  The number of families
    that used each variant goes to ``<prefix>.sites``.  When the scan is done,
    ``call_all`` is run on the families kept by ``_remove_empty``.

    Args:
        args: options object; this function reads ``ped``, ``vcf``,
            ``exclude``, ``region``, ``families`` and ``prefix`` and passes
            the whole object to ``get_family_dict`` and
            ``passes_quality_control``.
    """
    ped = Ped(args.ped)
    vcf = VCF(args.vcf, gts012=True)

    ped_samples = [s.sample_id for s in ped.samples()]
    vcf_samples = set(vcf.samples)

    samples = [s for s in ped_samples if s in vcf_samples]

    exclude = read_exclude(args.exclude)

    # re-open the VCF restricted to the shared samples.
    vcf = VCF(args.vcf, samples=samples, gts012=True)
    if args.region:
        vcf_iter = vcf(args.region)
    else:
        vcf_iter = vcf

    # only referenced by the disabled depth-percentile cutoffs further down.
    pctile1 = 10
    # build a dict of sample_id to sample index
    smp2idx = dict(zip(vcf.samples, range(len(vcf.samples))))

    # get the Ped objects for the family of interest
    if args.families is None:
        fams = ped.families.values()
    else:
        fams = [ped.families[f] for f in args.families.split(",")]
    if len(fams) == 0:
        sys.exit('Families %s not found in ped file' % args.families)

    # create a simple dictionary of info for each family member
    # (no `del fam` afterwards: on Python 3 the comprehension variable does
    # not leak, so deleting it raised UnboundLocalError.)
    fs = [get_family_dict(fam, smp2idx, args) for fam in fams]

    try:
        vcf["AB"]
        has_abs = True
    except KeyError:
        has_abs = False

    nused, i, report_at, t0 = 0, 0, 20000, time.time()
    # the with-block closes the sites file even if the scan raises.
    with open("%s.sites" % args.prefix, "w") as fsites:
        for i, v in enumerate(vcf_iter, start=1):
            if i % report_at == 0:
                persec = i / float(time.time() - t0)
                print("%s:%d (%.1f/sec) %.2f%% informative (%d/%d variants)" %
                      (v.CHROM, v.POS, persec, 100.0 * nused / i, nused, i),
                      file=sys.stderr)
                # report less often as the scan progresses.
                if i == 20000:
                    report_at = 100000
                if i == 100000:
                    report_at = 200000
                    for f in fs:
                        for k in f:
                            if k.startswith('fh'): f[k].flush()
                sys.stderr.flush()
            if v.var_type != 'snp': continue  ### no indels
            if len(v.ALT) > 1: continue
            #if len(v.REF) > 3 or len(v.ALT) > 1 or len(v.ALT[0]) > 3:
            #    continue
            #if v.call_rate < 0.95: continue
            if v.call_rate < 0.90: continue
            if v.FILTER is not None: continue
            # NOTE(review): int(None) raises TypeError if a record has no AC in INFO.
            if int(v.INFO.get('AC')) == 1: continue

            if exclude is not None and 0 != len(exclude[v.CHROM].search(
                    v.start, v.end)):
                continue

            # expensive to get gt_bases and we only need it at the crossover.
            gt_bases = None
            gt_types, gt_quals, gt_depths = v.gt_types, v.gt_quals, v.gt_depths

            ### added by tom 2019-11-13, as 1st percentile of depths on chr17
            ### were often negative...
            #gt_depths[gt_depths < 0] = 0

            gt_phases = v.gt_phases
            ipctiles, pctiles = None, None
            sample_abs = None

            nsites = 0  # track the number of families that had this as an informative site.
            for f in fs:
                #if ipctiles is not None and ipctiles[0] < pctile1:
                #    break

                # is_informative only needs gt_types, so we check that first...
                add_genotype_info(f, gt_types=gt_types, gt_phases=gt_phases)

                # ############## PHASED ####################
                if all(f['gt_phase']):
                    if gt_bases is None:
                        gt_bases = v.gt_bases

                    nsites += phased_check(f, v, gt_bases)
                    continue
                # ############ END PHASED ####################

                # need exactly 1 het parent for unphased checks.
                if 1 != ((f['gt_type'][0] == HET) + (f['gt_type'][1] == HET)):
                    continue

                if not is_informative(f):
                    continue
                # now we need to add quality and depth.
                add_genotype_info(f, gt_quals=gt_quals, gt_depths=gt_depths)

                if not passes_quality_control(f, args):
                    continue

                # detect crossovers.
                for parent, (p1, p2) in [("dad", (0, 1)), ("mom", (1, 0))]:

                    if not (f['gt_type'][p1] == HET
                            and f['gt_type'][p2] == HOM_REF):
                        continue

                    if gt_bases is None:
                        gt_bases = v.gt_bases
                    if sample_abs is None:
                        sample_abs = get_allele_balance(v, has_abs)
                        if sample_abs is None: break

                    fam_abs = sample_abs[f['idxs']]
                    if all(np.isnan(fam_abs)): continue
                    off = 0.31  # require that  off <= alt/(ref+alt) <= 1-off
                    if ((fam_abs[p1] >= 1 - off) | (fam_abs[p1] <= off)): continue
                    if np.any((1 - off < fam_abs[2:]) | (fam_abs[2:] <= off)):
                        continue

                    fam_bases = "\t".join(gt_bases[f['idxs']])

                    fam_abs = "|".join("%.2f" % val for val in fam_abs)

                    # calculate on first use. we found that having a low 1st pctile
                    # was a good indicator of increased chance of spurious XO even
                    # in families with decent depth.
                    #print (np.mean(gt_depths), np.median(gt_depths))
                    if pctiles is None:
                        ipctiles = np.percentile(gt_depths, (1, 5, 10, 50, 90))
                        pctiles = "|".join("%.0f" % de for de in ipctiles)
                    #if ipctiles[0] < pctile1:
                    #    break
                    fam_depths = "|".join(map(str, gt_depths[f['idxs']]))
                    nsites += 1
                    val = 1 if f['gt_type'][2] == f['gt_type'][3] else 0
                    f['fh-%s' % parent].write('\t'.join(
                        str(s) for s in [
                            v.CHROM, v.POS - 1, v.POS, f['ids'][p1],
                            f['family_id'], val, fam_bases, fam_depths,
                            "%.2f" % v.call_rate, pctiles, fam_abs
                        ]) + '\n')

            fsites.write("%s:%d\t%d\n" % (v.CHROM, v.POS, nsites))
            if nsites > 0:
                nused += 1

    if i == 0:
        # nothing was iterated: `v` is unbound and nused / i would divide by zero.
        print("finished: no variants were read", file=sys.stderr)
    else:
        persec = i / float(time.time() - t0)
        print("finished at %s:%d (%.1f/sec) %.2f%% informative (%d/%d variants)" %
              (v.CHROM, v.POS, persec, 100.0 * nused / i, nused, i),
              file=sys.stderr)
    kept = _remove_empty(fs)
    call_all(kept, args.prefix, min_sites=20)
Example #27
0
def test_ped():
    """a.ped parses into 4 families and 14 samples."""
    # the path is relative, so it is resolved against the current working directory.
    ped = Ped('peddy/tests/a.ped')

    n_families = len(ped.families)
    assert n_families == 4

    n_samples = sum(1 for _ in ped.samples())
    assert n_samples == 14
Example #28
0
def test_ped():
    """a.ped parses into 4 families and 14 samples."""
    ped = Ped(op.join(HERE, 'peddy/tests/a.ped'))

    n_families = len(ped.families)
    assert n_families == 4

    n_samples = sum(1 for _ in ped.samples())
    assert n_samples == 14
Example #29
0
def run(pedf, region, ref, bams, min_req_alts=MIN_REQ_ALTS):
    """Annotate VCF records streamed from ``CMD`` with candidate mosaic kids.

    ``CMD`` is formatted with this function's local variables and run through
    the shell; its stdout is read as VCF text.  Header lines are echoed (with
    a MOSAIC INFO definition printed when the #CHROM line is seen).  A data
    line is printed, with ``MOSAIC=...`` prepended to its INFO field, only if
    at least one trio has a kid with ``min_req_alts`` or more observations of
    an alt allele for which both parents have an AO of '0'.

    Args:
        pedf: PED file passed to ``Ped``.
        region, ref: only used through ``CMD.format(**locals())``.
        bams: iterable of strings, joined with spaces before formatting ``CMD``.
        min_req_alts: minimum alt observation count required in the kid.

    Raises:
        Exception: if the pedigree contains no trios.
    """
    print(pedf, file=sys.stderr)
    ped = Ped(pedf)
    bams = " ".join(bams)

    # NOTE: the names above (pedf, region, ref, bams, min_req_alts, ped) are
    # what CMD can reference, so they must not be renamed.
    cmd = CMD.format(**locals())

    sample_names = None

    trios = []
    for f in ped.families.values():
        trios.extend(f.trios(affected=None))
    print("found: %d trios" % len(trios), file=sys.stderr)
    if len(trios) == 0:
        raise Exception("found no trios")

    # NOTE(review): shell=True with a command built from caller-supplied
    # values; the arguments must come from a trusted source.
    # universal_newlines=True makes stdout yield text, so the str comparisons
    # and splits below also work on Python 3 (where the pipe yields bytes).
    p = sp.Popen(cmd, shell=True, stderr=sys.stderr, stdout=sp.PIPE,
                 universal_newlines=True)
    atexit.register(p.kill)
    for i, line in enumerate(p.stdout):
        if line[0] == '#':
            if line.startswith("#CHROM"):
                sample_names = line.rstrip().split("\t")[9:]
                print(
                    """##INFO=<ID=MOSAIC,Number=1,Type=String,Description="Pipe-delimited list of samples with evidence of mosaicism">"""
                )

            print(line, end="")
            continue
        toks = line.rstrip().split("\t")
        fmt = toks[8].split(":")
        if i % 1000 == 0:
            print("mosaic: checked ...", i, file=sys.stderr)
            sys.stderr.flush()

        # sample name -> {FORMAT key: value} for this record
        samples = {
            sample_names[k]: dict(zip(fmt, t.split(":")))
            for k, t in enumerate(toks[9:])
        }

        candidates = []
        for kid, mom, dad in trios:
            try:
                mom_ao = samples[mom.sample_id]['AO'].split(",")
                if not any('0' == m for m in mom_ao): continue

                dad_ao = samples[dad.sample_id]['AO'].split(",")
                if not any('0' == d for d in dad_ao): continue

                # '00' marks an alt allele with an AO of '0' in both parents.
                parents = [mom_ao[k] + dad_ao[k] for k in range(len(dad_ao))]
                if '00' not in parents: continue

                skid = samples[kid.sample_id]
                kid_alts = [int(a) for a in skid['AO'].split(",")]
            except KeyError:  # require all samples to be called.
                continue

            # Uses the min_req_alts argument; the module constant was used here
            # before, so a caller-supplied threshold had no effect.
            if not any(a >= min_req_alts and parents[k] == '00'
                       for k, a in enumerate(kid_alts)):
                continue

            candidates.append(
                "%s:%s:%s:%s" %
                (kid.sample_id, skid['RO'], skid['AO'], skid['QA']))
        if not candidates:
            continue

        toks[7] = "MOSAIC=%s;%s" % ("|".join(candidates), toks[7])
        print("\t".join(toks))
        sys.stdout.flush()
Example #30
0
def test_6():
    """a6.ped yields 14 samples, each with a family id starting with "fam"."""
    ped = Ped(op.join(HERE, 'peddy/tests/a6.ped'))

    n_samples = sum(1 for _ in ped.samples())
    assert n_samples == 14

    assert all(sample.family_id[:3] == "fam" for sample in ped.samples())