Example #1
0
def _get_counts(seqarr):

    outarr = reftrick(seqarr[:].view(np.uint8), GETCONS).view("S1")

    ## allele arrays
    seq0 = seqarr.copy()
    seq1 = seqarr.copy()

    seq0[seq0 == "R"] = "G"
    seq0[seq0 == "K"] = "G"
    seq0[seq0 == "S"] = "G"
    seq0[seq0 == "Y"] = "T"
    seq0[seq0 == "W"] = "T"
    seq0[seq0 == "M"] = "C"

    seq1[seq1 == "R"] = "A"
    seq1[seq1 == "K"] = "T"
    seq1[seq1 == "S"] = "C"
    seq1[seq1 == "Y"] = "C"
    seq1[seq1 == "W"] = "A"
    seq1[seq1 == "M"] = "A"

    ## fill array
    ntaxa, nsites = seqarr.shape
    priarr = np.zeros((ntaxa * 2, nsites), dtype=np.uint8)
    secarr = np.zeros((ntaxa * 2, nsites), dtype=np.uint8)

    ## add 2 if allele is the outgroup
    priarr[::2] += seq0 == outarr[:, 0]
    priarr[1::2] += seq1 == outarr[:, 0]
    secarr[::2] += seq0 == outarr[:, 1]
    secarr[1::2] += seq1 == outarr[:, 1]

    ## invert so zero is ancestral and >1 is derived
    return priarr, secarr
Example #2
0
def _loci_to_arr(loci, taxdict, mindict):
    """
    return a frequency array from a loci file for all loci with taxa from 
    taxdict and min coverage from mindict. 
    """

    ## make the array (4 or 5) and a mask array to remove loci without cov
    nloci = len(loci)
    keep = np.zeros(nloci, dtype=np.bool_)
    arr = np.zeros((nloci, 4, 300), dtype=np.float64)

    ## six rows b/c one for each p3, and for the fused p3 ancestor
    if len(taxdict) == 5:
        arr = np.zeros((nloci, 6, 300), dtype=np.float64)

    ## if not mindict, make one that requires 1 in each taxon
    if isinstance(mindict, int):
        mindict = {i: mindict for i in taxdict}
    elif isinstance(mindict, dict):
        mindict = {i: mindict[i] for i in taxdict}
    else:
        mindict = {i: 1 for i in taxdict}

    ## raise error if names are not 'p[int]'
    allowed_names = ['p1', 'p2', 'p3', 'p4', 'p5']
    if any([i not in allowed_names for i in taxdict]):
        raise IPyradError(\
            "keys in taxdict must be named 'p1' through 'p4' or 'p5'")

    ## parse key names
    keys = sorted([i for i in taxdict.keys() if i[0] == 'p'])
    outg = keys[-1]

    ## grab seqs just for the good guys
    for loc in xrange(nloci):

        ## parse the locus
        lines = loci[loc].split("\n")[:-1]
        names = [i.split()[0] for i in lines]
        seqs = np.array([list(i.split()[1]) for i in lines])

        ## check that names cover the taxdict (still need to check by site)
        covs = [sum([j in names for j in taxdict[tax]]) >= mindict[tax] \
                for tax in taxdict]

        ## keep locus
        if all(covs):
            keep[loc] = True

            ## get the refseq
            refidx = np.where([i in taxdict[outg] for i in names])[0]
            refseq = seqs[refidx].view(np.uint8)
            ancestral = np.array([reftrick(refseq, GETCONS2)[:, 0]])

            ## freq of ref in outgroup
            iseq = _reffreq2(ancestral, refseq, GETCONS2)
            arr[loc, -1, :iseq.shape[1]] = iseq

            ## enter 4-taxon freqs
            if len(taxdict) == 4:
                for tidx, key in enumerate(keys[:-1]):

                    ## get idx of names in test tax
                    nidx = np.where([i in taxdict[key] for i in names])[0]
                    sidx = seqs[nidx].view(np.uint8)

                    ## get freq of sidx
                    iseq = _reffreq2(ancestral, sidx, GETCONS2)

                    ## fill it in
                    arr[loc, tidx, :iseq.shape[1]] = iseq

            else:

                ## entere p5; and fill it in
                iseq = _reffreq2(ancestral, refseq, GETCONS2)
                arr[loc, -1, :iseq.shape[1]] = iseq

                ## enter p1
                nidx = np.where([i in taxdict['p1'] for i in names])[0]
                sidx = seqs[nidx].view(np.uint8)
                iseq = _reffreq2(ancestral, sidx, GETCONS2)
                arr[loc, 0, :iseq.shape[1]] = iseq

                ## enter p2
                nidx = np.where([i in taxdict['p2'] for i in names])[0]
                sidx = seqs[nidx].view(np.uint8)
                iseq = _reffreq2(ancestral, sidx, GETCONS2)
                arr[loc, 1, :iseq.shape[1]] = iseq

                ## enter p3 with p4 masked, and p4 with p3 masked
                nidx = np.where([i in taxdict['p3'] for i in names])[0]
                nidy = np.where([i in taxdict['p4'] for i in names])[0]
                sidx = seqs[nidx].view(np.uint8)
                sidy = seqs[nidy].view(np.uint8)
                xseq = _reffreq2(ancestral, sidx, GETCONS2)
                yseq = _reffreq2(ancestral, sidy, GETCONS2)
                mask3 = xseq != 0
                mask4 = yseq != 0
                xseq[mask4] = 0
                yseq[mask3] = 0
                arr[loc, 2, :xseq.shape[1]] = xseq
                arr[loc, 3, :yseq.shape[1]] = yseq

                ## enter p34
                nidx = nidx.tolist() + nidy.tolist()
                sidx = seqs[nidx].view(np.uint8)
                iseq = _reffreq2(ancestral, sidx, GETCONS2)
                arr[loc, 4, :iseq.shape[1]] = iseq

    ## size-down array to the number of loci that have taxa for the test
    arr = arr[keep, :, :]

    ## size-down sites to
    arr = masknulls(arr)

    return arr, keep