Example #1
    def remove_samples(self, samps):
        ## Allow passing in a single sample name as a string
        if isinstance(samps, str):
            samps = [samps]

        if not set(samps).issubset(self.samples_vcforder):
            raise IPyradError(
                "  Trying to remove samples not present in the vcf file: {}".
                format(samps))

        ## Remove the samples from the sample list
        mask = np.isin(self.samples_vcforder, samps)
        self.samples_vcforder = self.samples_vcforder[~mask]

        self.genotypes = self.genotypes[:, ~mask]
        ## Keep only biallelic sites where both alleles are observed more than
        ## once (i.e., remove singletons). Skipping this can cause a nasty
        ## error during the svd, like this:
        ## https://stackoverflow.com/questions/33447808/sklearns-plsregression-valueerror-array-must-not-contain-infs-or-nans
        ac = self.genotypes.count_alleles()
        flt = (ac.max_allele() == 1) & (ac[:, :2].min(axis=1) > 1)
        self.genotypes = self.genotypes.compress(flt, axis=0)

        if len(self.samples_vcforder) < self.ncomponents:
            self.ncomponents = len(self.samples_vcforder)
            print(
                "  INFO: Number of PCs may not exceed the number of samples.\n  Setting number of PCs = {}"
                .format(self.ncomponents))
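
A minimal usage sketch for remove_samples(). It assumes the class defined in pca.py (shown in Example #5 below) is importable as PCA, which is an assumption about this version of ipyrad; the vcf path and sample names are placeholders.

    ## hypothetical import path and input file
    from ipyrad.analysis.pca import PCA
    pca = PCA(data="./analysis-ipyrad/outfiles/data.vcf")

    ## drop a single sample by name, or several at once
    pca.remove_samples("1A_0")
    pca.remove_samples(["1B_0", "1C_0"])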
Example #2
File: pca.py Project: tle003/ipyrad
    def plot_pairwise_dist(self, labels=None, ax=None, cmap=None, cdict=None, metric="euclidean"):
        """
        Plot pairwise distances between all samples

        labels: bool or list
                By default no labels are drawn. If labels is True, sample names are
                read in from the vcf file. Alternatively, labels can be passed in as
                a list, which must be the same length as the number of samples.
        """
        allele_counts = self.genotypes.to_n_alt()
        dist = allel.pairwise_distance(allele_counts, metric=metric)
        if not ax:
            fig = plt.figure(figsize=(5, 5))
            ax = fig.add_subplot(1, 1, 1)

        if isinstance(labels, bool):
            if labels:
                labels = list(self.samples_vcforder)
            else:
                labels = None
        elif isinstance(labels, type(None)):
            pass
        else:
            ## If not bool or None (default), then check to make sure the list passed in
            ## is the right length
            if not len(labels) == len(self.samples_vcforder):
                raise IPyradError(LABELS_LENGTH_ERROR.format(len(labels), len(self.samples_vcforder)))

        allel.plot.pairwise_distance(dist, labels=labels, ax=ax, colorbar=False)
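
A hedged example of calling plot_pairwise_dist(), continuing from the pca object sketched after Example #1; the metric may be any distance name accepted by scipy's pdist.

    ## default: euclidean distances, no tick labels
    pca.plot_pairwise_dist()

    ## label ticks with the sample names from the vcf and use a different metric
    pca.plot_pairwise_dist(labels=True, metric="cityblock")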
Example #3
    def plot(self,
             show_test_labels=True,
             use_edge_lengths=False,
             collapse_outgroup=False,
             pct_tree_x=0.5,
             pct_tree_y=0.7,
             *args,
             **kwargs):
        """ draw a multi-panel figure with tree, tests, and results """

        ## check for attributes
        if not self.newick:
            raise IPyradError("baba plot requires a newick treefile")
        if not self.tests:
            raise IPyradError("baba plot must have a .tests attribute")

        ## ensure tests is a list
        if isinstance(self.tests, dict):
            self.tests = [self.tests]

        ## re-decompose the tree
        ttree = toytree.tree(
            self.newick,
            orient='down',
            use_edge_lengths=use_edge_lengths,
        )

        ## make the plot
        canvas, axes, panel = baba_panel_plot(
            ttree=ttree,
            tests=self.tests,
            boots=self.results_boots,
            show_test_labels=show_test_labels,
            use_edge_lengths=use_edge_lengths,
            collapse_outgroup=collapse_outgroup,
            pct_tree_x=pct_tree_x,
            pct_tree_y=pct_tree_y,
            *args,
            **kwargs)
        return canvas, axes, panel
Example #4
def showstats(parsedict):
    """ loads assembly or dies, and print stats to screen """

    #project_dir = parsedict['1']
    project_dir = parsedict["project_dir"]
    if not project_dir:
        project_dir = "./"
    ## Be nice if somebody also puts in the file extension
    #assembly_name = parsedict['0']
    assembly_name = parsedict["assembly_name"]
    my_assembly = os.path.join(project_dir, assembly_name)

    ## If the project_dir doesn't exist don't even bother trying harder.
    if not os.path.isdir(project_dir):
        msg = """
    Trying to print stats for Assembly ({}) that doesn't exist. You must 
    first run steps before you can show results.
    """.format(project_dir)
        sys.exit(msg)

    if not assembly_name:
        msg = """
    Assembly name is not set in params.txt, meaning it was either changed or
    erased since the Assembly was started. Please restore the original name. 
    You can find the name of your Assembly in the "project dir": {}.
    """.format(project_dir)
        raise IPyradError(msg)

    data = ip.load_json(my_assembly, quiet=True, cli=True)

    print("\nSummary stats of Assembly {}".format(data.name) \
         +"\n------------------------------------------------")
    
    if not data.stats.empty:
        print(data.stats)
        print("\n\nFull stats files"\
         +"\n------------------------------------------------")

        fullcurdir = os.path.realpath(os.path.curdir)
        for i in range(1, 8):
            #enumerate(sorted(data.stats_files)):
            key = "s"+str(i)
            try:
                val = data.stats_files[key]
                val = val.replace(fullcurdir, ".")                
                print("step {}: {}".format(i, val))
            except (KeyError, AttributeError):
                print("step {}: None".format(i))
        print("\n")
    else:
        print("No stats to display")
Example #5
File: pca.py Project: tle003/ipyrad
    def __init__(self, 
        data=None, 
        pops=None,
        ncomps=10,
        quiet=True):
        """ 
        ipyrad.analysis PCA class object.

        Parameters
        ----------
        data : Assembly object or path to file
            Either an ipyrad assembly or a string path to a .vcf file. If
            it's a string path then you will probably also want to specify
            pops, otherwise all samples will be plotted in the same color.
            
        pops : dict or path to file
            A dictionary specifying the population assignment of each
            sample. This is optional, since by default if you used a pops
            file during your assembly the assembly object will include
            the pops info internally.
        ncomps : int
            The number of PCs to calculate. Most users will not need to
            change this, but it is exposed for flexibility.

        Functions
        ---------
        run()
            ...
        plot()
            ...

        """
        self.quiet = quiet
        self.ncomponents = ncomps

        ## parse data as (1) path to data file, or (2) ndarray
        if isinstance(data, Assembly):
            self.assembly = data
            self.pops = data.populations
            try:
                self.data = data.outfiles.vcf
            except AttributeError as inst:
                raise IPyradError(MISSING_VCF_ERROR)  
        else:
            ## You need a dummy assembly because we use the machinery
            ## of _link_populations below to read in the pops data
            self.assembly = Assembly("ipyrad-pca-tmp", quiet=True)
            self.data = os.path.realpath(data)
            self.pops = {}

        if pops:
            if isinstance(pops, dict):
                ## Convert the passed-in pops dict into the same format as an
                ## assembly.populations dict (minsamp, samples) so everything
                ## downstream can be treated uniformly; the minsamp values are
                ## stripped back out a few lines below.
                self.pops = {x:(0, y) for x, y in pops.items()}
            else:
                if not os.path.isfile(pops):
                    raise IPyradError("popfile does not exist - {}".format(pops))

                ## If the popfile lacks the ipyrad minsamp header line,
                ## append a default one requiring 1 sample per population.
                mindat = [i.lstrip("#").lstrip().rstrip() for i in \
                          open(pops, 'r').readlines() if i.startswith("#")]
                if not mindat:
                    lines = open(pops, 'r').readlines()
                    p = set([x.split()[1].strip() for x in lines])
                    with open(pops, 'a') as outfile:
                        outfile.write("# " + " ".join(["{}:1".format(x) for x in p]))

                self.assembly.paramsdict["pop_assign_file"] = os.path.realpath(pops)
                self.assembly._link_populations()
                self.pops = self.assembly.populations
            
        ## The populations dict still carries the minsamp values, which are
        ## not needed here, so strip them and keep only the sample lists.
        tmpdict = {}
        for samp in self.pops:
            tmpdict[samp] = self.pops[samp][1]
        self.pops = tmpdict

        ## Read in the vcf and extract the samples and the data
        ## This will set self.samples_vcforder which is a list of sample names
        ## in the order they appear in the vcf file
        self._load_calldata()

        ## If no pops linked yet (either none in the assembly or none passed in)
        ## then everybody goes into one giant default population.
        if not self.pops:
            self.pops = {"All_samples": self.samples_vcforder}
            print("  No populations assigned, so PCA will be monochrome.")

        if not self.quiet:
            print("  Using populations:\n{}".format(self.pops))
Example #6
File: pca.py Project: tle003/ipyrad
    def plot(self, pcs=[1, 2], ax=None, cmap=None, cdict=None, legend=True, title=None, outfile=None):
        """
        Do the PCA and plot it.

        Parameters
        ----------
        pcs: list of ints
        ...
        ax: matplotlib axis
        ...
        cmap: matplotlib colormap
        ...
        cdict: dictionary mapping pop names to colors
        ...
        legend: boolean, whether or not to show the legend

        """
        ## Specify which 2 pcs to plot, default is pc1 and pc2
        pc1 = pcs[0] - 1
        pc2 = pcs[1] - 1
        if pc1 < 0 or pc2 > self.ncomponents - 1:
            raise IPyradError("PCs are 1-indexed. 1 is min & {} is max".format(self.ncomponents))

        ## Convert genotype data to allele count data
        ## We do this here because we might want to try different ways
        ## of accounting for missing data and "alt" allele counts treat
        ## missing data as "ref"
        allele_counts = self.genotypes.to_n_alt()

        ## Actually do the pca
        if self.ncomponents > len(self.samples_vcforder):
            self.ncomponents = len(self.samples_vcforder)
            print("  INFO: # PCs < # samples. Forcing # PCs = {}".format(self.ncomponents))
        coords, model = allel.pca(allele_counts, n_components=self.ncomponents, scaler='patterson')

        self.pcs = pd.DataFrame(coords,
                                index=self.samples_vcforder,
                                columns=["PC{}".format(x) for x in range(1,self.ncomponents+1)])

        ## Just allow folks to pass in the name of the cmap they want to use
        if isinstance(cmap, str):
            try:
                cmap = cm.get_cmap(cmap)
            except ValueError:
                raise IPyradError("  Bad cmap value: {}".format(cmap))


        if not cmap and not cdict:
            if not self.quiet:
                print("  Using default cmap: Spectral")
            cmap = cm.get_cmap('Spectral')

        if cmap:
            if cdict:
                print("  Passing in both cmap and cdict defaults to using the cmap value.")
            popcolors = cmap(np.arange(len(self.pops))/len(self.pops))
            cdict = {i:j for i, j in zip(self.pops.keys(), popcolors)}

        fig = ""
        if not ax:
            fig = plt.figure(figsize=(6, 5))
            ax = fig.add_subplot(1, 1, 1)

        x = coords[:, pc1]
        y = coords[:, pc2]
        for pop in self.pops:
            ## Skip populations with no samples so they don't clutter the legend.
            ## TODO: Empty pops still show up in the legend in some cases.
            if len(self.pops[pop]) > 0:
                mask = np.isin(self.samples_vcforder, self.pops[pop])
                ax.plot(x[mask], y[mask], marker='o', linestyle=' ', color=cdict[pop], label=pop, markersize=6, mec='k', mew=.5)

        ax.set_xlabel('PC%s (%.1f%%)' % (pc1+1, model.explained_variance_ratio_[pc1]*100))
        ax.set_ylabel('PC%s (%.1f%%)' % (pc2+1, model.explained_variance_ratio_[pc2]*100))

        if legend:
            ax.legend(bbox_to_anchor=(1, 1), loc='upper left')

        if fig:
            fig.tight_layout()

        if title:
            ax.set_title(title)

        if outfile:
            try:
                plt.savefig(outfile, format="png", bbox_inches="tight")
            except Exception:
                print("  pca.plot() failed to save the figure to {}".format(outfile))

        return ax
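
A short usage sketch for plot(), continuing from a constructed pca object; the colors, title, and output filename are placeholders, and the cdict keys must match the population names.

    import matplotlib.pyplot as plt

    ## plot PC1 vs PC2 with defaults and save to disk
    pca.plot(pcs=[1, 2], title="Patterson PCA", outfile="pca_1_2.png")

    ## or draw PC1 vs PC3 into an existing axis with an explicit color mapping
    fig, ax = plt.subplots(figsize=(6, 5))
    pca.plot(pcs=[1, 3], ax=ax,
             cdict={"north": "steelblue", "south": "goldenrod"})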
Example #7
def _loci_to_arr(loci, taxdict, mindict):
    """
    return a frequency array from a loci file for all loci with taxa from 
    taxdict and min coverage from mindict. 
    """

    ## make the array (4 or 5) and a mask array to remove loci without cov
    nloci = len(loci)
    keep = np.zeros(nloci, dtype=np.bool_)
    arr = np.zeros((nloci, 4, 300), dtype=np.float64)

    ## six rows for 5-taxon tests: p1, p2, p3 (p4 masked), p4 (p3 masked),
    ## the fused p3/p4 ancestor, and the outgroup (p5)
    if len(taxdict) == 5:
        arr = np.zeros((nloci, 6, 300), dtype=np.float64)

    ## normalize mindict: an int applies to every taxon, a dict is subset to
    ## the taxdict keys, and anything else defaults to 1 sample per taxon
    if isinstance(mindict, int):
        mindict = {i: mindict for i in taxdict}
    elif isinstance(mindict, dict):
        mindict = {i: mindict[i] for i in taxdict}
    else:
        mindict = {i: 1 for i in taxdict}

    ## raise error if names are not 'p[int]'
    allowed_names = ['p1', 'p2', 'p3', 'p4', 'p5']
    if any([i not in allowed_names for i in taxdict]):
        raise IPyradError(\
            "keys in taxdict must be named 'p1' through 'p4' or 'p5'")

    ## parse key names
    keys = sorted([i for i in taxdict.keys() if i[0] == 'p'])
    outg = keys[-1]

    ## grab seqs just for the good guys
    for loc in xrange(nloci):

        ## parse the locus
        lines = loci[loc].split("\n")[:-1]
        names = [i.split()[0] for i in lines]
        seqs = np.array([list(i.split()[1]) for i in lines])

        ## check that names cover the taxdict (still need to check by site)
        covs = [sum([j in names for j in taxdict[tax]]) >= mindict[tax] \
                for tax in taxdict]

        ## keep locus
        if all(covs):
            keep[loc] = True

            ## get the refseq
            refidx = np.where([i in taxdict[outg] for i in names])[0]
            refseq = seqs[refidx].view(np.uint8)
            ancestral = np.array([reftrick(refseq, GETCONS2)[:, 0]])

            ## freq of ref in outgroup
            iseq = _reffreq2(ancestral, refseq, GETCONS2)
            arr[loc, -1, :iseq.shape[1]] = iseq

            ## enter 4-taxon freqs
            if len(taxdict) == 4:
                for tidx, key in enumerate(keys[:-1]):

                    ## get idx of names in test tax
                    nidx = np.where([i in taxdict[key] for i in names])[0]
                    sidx = seqs[nidx].view(np.uint8)

                    ## get freq of sidx
                    iseq = _reffreq2(ancestral, sidx, GETCONS2)

                    ## fill it in
                    arr[loc, tidx, :iseq.shape[1]] = iseq

            else:

                ## enter p5 (outgroup) freqs and fill them in
                iseq = _reffreq2(ancestral, refseq, GETCONS2)
                arr[loc, -1, :iseq.shape[1]] = iseq

                ## enter p1
                nidx = np.where([i in taxdict['p1'] for i in names])[0]
                sidx = seqs[nidx].view(np.uint8)
                iseq = _reffreq2(ancestral, sidx, GETCONS2)
                arr[loc, 0, :iseq.shape[1]] = iseq

                ## enter p2
                nidx = np.where([i in taxdict['p2'] for i in names])[0]
                sidx = seqs[nidx].view(np.uint8)
                iseq = _reffreq2(ancestral, sidx, GETCONS2)
                arr[loc, 1, :iseq.shape[1]] = iseq

                ## enter p3 with p4 masked, and p4 with p3 masked
                nidx = np.where([i in taxdict['p3'] for i in names])[0]
                nidy = np.where([i in taxdict['p4'] for i in names])[0]
                sidx = seqs[nidx].view(np.uint8)
                sidy = seqs[nidy].view(np.uint8)
                xseq = _reffreq2(ancestral, sidx, GETCONS2)
                yseq = _reffreq2(ancestral, sidy, GETCONS2)
                mask3 = xseq != 0
                mask4 = yseq != 0
                xseq[mask4] = 0
                yseq[mask3] = 0
                arr[loc, 2, :xseq.shape[1]] = xseq
                arr[loc, 3, :yseq.shape[1]] = yseq

                ## enter p34
                nidx = nidx.tolist() + nidy.tolist()
                sidx = seqs[nidx].view(np.uint8)
                iseq = _reffreq2(ancestral, sidx, GETCONS2)
                arr[loc, 4, :iseq.shape[1]] = iseq

    ## size-down array to the number of loci that have taxa for the test
    arr = arr[keep, :, :]

    ## size-down sites by masking null (all-missing) positions
    arr = masknulls(arr)

    return arr, keep
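
A hedged sketch of the inputs _loci_to_arr() expects: loci is a list of locus blocks from a .loci file (split on "|\n", as in batch() below), taxdict maps 'p1' through 'p4' (or 'p5') to lists of sample names, and mindict sets the minimum coverage per taxon. The path and sample names are placeholders.

    with open("./analysis-ipyrad/outfiles/data.loci", 'r') as infile:
        loci = infile.read().strip().split("|\n")

    taxdict = {
        "p1": ["1A_0", "1B_0"],
        "p2": ["2E_0", "2F_0"],
        "p3": ["3I_0", "3J_0"],
        "p4": ["4K_0", "4L_0"],
    }
    arr, keep = _loci_to_arr(loci, taxdict, mindict=1)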
Example #8
def batch(
    baba,
    ipyclient=None,
):
    """
    distributes jobs to the parallel client
    """

    ## parse args
    handle = baba.data
    taxdicts = baba.tests
    mindicts = baba.params.mincov
    nboots = baba.params.nboots

    ## if ms generator make into reusable list
    sims = 0
    if isinstance(handle, types.GeneratorType):
        handle = list(handle)
        sims = 1
    else:
        ## expand locifile path to full path
        handle = os.path.realpath(handle)

    ## parse taxdicts into names and lists if it is a dictionary
    #if isinstance(taxdicts, dict):
    #    names, taxdicts = taxdicts.keys(), taxdicts.values()
    #else:
    #    names = []
    names = []
    if isinstance(taxdicts, dict):
        taxdicts = [taxdicts]

    ## an array to hold results (len(taxdicts), nboots)
    tot = len(taxdicts)
    resarr = np.zeros((tot, 7), dtype=np.float64)
    bootsarr = np.zeros((tot, nboots), dtype=np.float64)
    paneldict = {}

    ## TODO: Setup a wrapper to find and cleanup ipyclient
    ## define the function and parallelization to use,
    ## if no ipyclient then drops back to using multiprocessing.
    if not ipyclient:
        # ipyclient = ip.core.parallel.get_client(**self._ipcluster)
        raise IPyradError("you must enter an ipyparallel.Client() object")
    else:
        lbview = ipyclient.load_balanced_view()

    ## submit jobs to run on the cluster queue
    start = time.time()
    asyncs = {}
    idx = 0

    ## prepare data before sending to engines
    ## if it's a str (locifile) then parse it here just once.
    if isinstance(handle, str):
        with open(handle, 'r') as infile:
            loci = infile.read().strip().split("|\n")
    if isinstance(handle, list):
        pass  #sims()

    ## iterate over tests (repeats mindicts if fewer than taxdicts)
    itests = iter(taxdicts)
    imdict = itertools.cycle([mindicts])

    #for test, mindict in zip(taxdicts, itertools.cycle([mindicts])):
    for i in xrange(len(ipyclient)):

        ## grab the next test; if there are fewer tests than engines, skip the rest
        try:
            test = next(itests)
            mindict = next(imdict)
        except StopIteration:
            continue

        ## if it's sim data then convert to an array
        if sims:
            loci = _msp_to_arr(handle, test)
            args = (loci, test, mindict, nboots)
            print("not yet implemented")
            #asyncs[idx] = lbview.apply_async(dstat, *args)
        else:
            args = [loci, test, mindict, nboots]
            asyncs[idx] = lbview.apply(dstat, *args)
        idx += 1

    ## block until finished, print progress if requested.
    finished = 0
    try:
        while 1:
            keys = [i for (i, j) in asyncs.items() if j.ready()]
            ## check for failures
            for job in keys:
                if not asyncs[job].successful():
                    raise IPyradWarningExit(\
                        " error: {}: {}".format(job, asyncs[job].exception()))
                ## enter results for successful jobs
                else:
                    _res, _bot = asyncs[job].result()

                    ## store D4 results
                    if _res.shape[0] == 1:
                        resarr[job] = _res.T.as_matrix()[:, 0]
                        bootsarr[job] = _bot

                    ## or store D5 results
                    else:
                        paneldict[job] = _res.T

                    ## remove old job
                    del asyncs[job]
                    finished += 1

                    ## submit next job if there is one.
                    try:
                        test = next(itests)
                        mindict = next(imdict)
                        if sims:
                            loci = _msp_to_arr(handle, test)
                            args = (loci, test, mindict, nboots)
                            print("not yet implemented")
                            #asyncs[idx] = lbview.apply_async(dstat, *args)
                        else:
                            args = [loci, test, mindict, nboots]
                            asyncs[idx] = lbview.apply(dstat, *args)
                        idx += 1
                    except StopIteration:
                        pass

            ## count finished and break if all are done.
            #fin = idx - len(asyncs)
            elap = datetime.timedelta(seconds=int(time.time() - start))
            printstr = " calculating D-stats  | {} | "
            progressbar(tot, finished, printstr.format(elap), spacer="")
            time.sleep(0.1)
            if not asyncs:
                print("")
                break

    except KeyboardInterrupt as inst:
        ## cancel all jobs (ipy & multiproc modes) and then raise error
        try:
            ipyclient.abort()
        except Exception:
            pass
        raise inst

    ## dress up resarr as a Pandas DataFrame if 4-part test
    if len(test) == 4:
        if not names:
            names = range(len(taxdicts))
        #print("resarr")
        #print(resarr)
        resarr = pd.DataFrame(resarr,
                              index=names,
                              columns=[
                                  "dstat", "bootmean", "bootstd", "Z", "ABBA",
                                  "BABA", "nloci"
                              ])

        ## sort results and bootsarr to match if test names were supplied
        resarr = resarr.sort_index()
        order = [list(resarr.index).index(i) for i in names]
        bootsarr = bootsarr[order]
        return resarr, bootsarr
    else:
        ## order results dfs
        listres = []
        for key in range(len(paneldict)):
            listres.append(paneldict[key])

        ## make into a multi-index dataframe
        ntests = len(paneldict)
        multi_index = [
            np.array([[i] * 3 for i in range(ntests)]).flatten(),
            np.array(['p3', 'p4', 'shared'] * ntests),
        ]
        resarr = pd.DataFrame(
            data=pd.concat(listres).as_matrix(),
            index=multi_index,
            columns=listres[0].columns,
        )
        return resarr, None
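
A usage sketch for batch(), assuming a baba object (here called bb, a placeholder) with .data, .tests, and .params attributes has already been set up, and that an ipcluster instance is running for ipyparallel to connect to. For 4-taxon tests it returns a DataFrame of D-statistics plus the bootstrap array.

    import ipyparallel as ipp

    ## connect to the running cluster and distribute the D-statistic jobs
    ipyclient = ipp.Client()
    resarr, bootsarr = batch(bb, ipyclient=ipyclient)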
Example #9
    def plot(
            self,
            show_test_labels=True,
            use_edge_lengths=True,
            collapse_outgroup=False,
            pct_tree_x=0.5,
            pct_tree_y=0.2,
            subset_tests=None,
            #toytree_kwargs=None,
            *args,
            **kwargs):
        """ 
        Draw a multi-panel figure with tree, tests, and results 
        
        Parameters:
        -----------
        height: int
        ...

        width: int
        ...

        show_test_labels: bool
        ...

        use_edge_lengths: bool
        ...

        collapse_outgroup: bool
        ...

        pct_tree_x: float
        ...

        pct_tree_y: float
        ...

        subset_tests: list
        ...

        ...

        """

        ## check for attributes
        if not self.newick:
            raise IPyradError("baba plot requires a newick treefile")
        if not self.tests:
            raise IPyradError("baba plot must have a .tests attribute")

        ## ensure tests is a list
        if isinstance(self.tests, dict):
            self.tests = [self.tests]

        ## re-decompose the tree
        ttree = toytree.tree(
            self.newick,
            orient='down',
            use_edge_lengths=use_edge_lengths,
        )

        ## subset test to show fewer
        if subset_tests is not None:
            #tests = self.tests[subset_tests]
            tests = [self.tests[i] for i in subset_tests]
            boots = self.results_boots[subset_tests]
        else:
            tests = self.tests
            boots = self.results_boots

        ## make the plot
        canvas, axes, panel = baba_panel_plot(
            ttree=ttree,
            tests=tests,
            boots=boots,
            show_test_labels=show_test_labels,
            use_edge_lengths=use_edge_lengths,
            collapse_outgroup=collapse_outgroup,
            pct_tree_x=pct_tree_x,
            pct_tree_y=pct_tree_y,
            *args,
            **kwargs)
        return canvas, axes, panel
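
A hedged example of calling plot() on a baba object (bb, a placeholder) that already has a .newick tree, .tests, and bootstrap results; the test indices in subset_tests are illustrative.

    ## draw only the first three tests and hide the test labels
    canvas, axes, panel = bb.plot(
        show_test_labels=False,
        use_edge_lengths=True,
        subset_tests=[0, 1, 2],
    )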