Example #1
    def run_mrbayes(self, ipyclient, force=False, quiet=False):
        """
        calls the mrbayes block in each nexus file.
        """

        ## get all the nexus files for this object
        minidir = os.path.realpath(os.path.join(self.workdir, self.name))
        nexus_files = glob.glob(os.path.join(minidir, "*.nex"))

        ## clear existing files
        #existing = glob.glob(os.path.join(self.workdir, self.name, "*.nex"))
        existing = glob.glob(os.path.join(minidir, "*.nex.*"))
        if any(existing):
            if force:
                for rfile in existing:
                    os.remove(rfile)
            else:
                raise IPyradWarningExit(EXISTING_NEXdot_FILES.format(minidir))

        ## write new nexus files, or should users do that before this?
        #self.write_nexus_files(force=True)

        ## load balancer
        lbview = ipyclient.load_balanced_view()

        ## submit each to be processed
        asyncs = []
        for nex in nexus_files:
            async = lbview.apply(_call_mb, nex)
            asyncs.append(async)

        ## track progress
        start = time.time()
        printstr = "[mb] infer gene-tree posteriors | {} | "
        while 1:
            ready = [i.ready() for i in asyncs]
            elapsed = datetime.timedelta(seconds=int(time.time() - start))
            if not quiet:
                progressbar(len(ready),
                            sum(ready),
                            printstr.format(elapsed),
                            spacer="")
            if len(ready) == sum(ready):
                if not quiet:
                    print("")
                break
            else:
                time.sleep(0.1)

        ## check success
        for async in asyncs:
            if not async.successful():
                raise IPyradWarningExit(async.result())
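
A minimal usage sketch for the method above, assuming a running ipcluster and a pre-configured mrbayes analysis object (called mb_obj here, which is illustrative and not from the source) whose workdir/name already contain the written .nex files:

import ipyparallel as ipp

## connect to a running ipcluster (e.g. started with `ipcluster start -n 4`)
ipyclient = ipp.Client()

## mb_obj is an assumed analysis object exposing run_mrbayes();
## force=True clears any existing *.nex.* result files first
mb_obj.run_mrbayes(ipyclient, force=True)

ipyclient.close()
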
Example #2
    def run_mbsum(self, ipyclient, force=False, quiet=False):
        """
        Sums two replicate mrbayes runs for each locus
        """
        minidir = os.path.realpath(os.path.join(self.workdir, self.name))
        trees1 = glob.glob(os.path.join(minidir, "*.run1.t"))
        trees2 = glob.glob(os.path.join(minidir, "*.run2.t"))

        ## clear existing files
        existing = glob.glob(os.path.join(self.workdir, self.name, "*.sumt"))
        if any(existing):
            if force:
                for rfile in existing:
                    os.remove(rfile)
            else:
                path = os.path.join(self.workdir, self.name)
                raise IPyradWarningExit(EXISTING_SUMT_FILES.format(path))

        ## load balancer
        lbview = ipyclient.load_balanced_view()

        ## submit each to be processed
        asyncs = []
        for tidx in xrange(len(trees1)):
            rep1 = trees1[tidx]
            rep2 = trees2[tidx]
            outname = os.path.join(minidir, str(tidx) + ".sumt")
            async = lbview.apply(_call_mbsum, *(rep1, rep2, outname))
            asyncs.append(async)

        ## track progress
        start = time.time()
        printstr = "[mbsum] sum replicate runs      | {} | "
        while 1:
            ready = [i.ready() for i in asyncs]
            elapsed = datetime.timedelta(seconds=int(time.time() - start))
            if not quiet:
                progressbar(len(ready),
                            sum(ready),
                            printstr.format(elapsed),
                            spacer="")
            if len(ready) == sum(ready):
                if not quiet:
                    print("")
                break
            else:
                time.sleep(0.1)

        ## check success
        for async in asyncs:
            if not async.successful():
                raise IPyradWarningExit(async.result())
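
Both methods above share the same submit-and-poll skeleton: apply() one job per input on a load-balanced view, poll .ready() until every AsyncResult finishes, then surface any engine-side error. A self-contained sketch of that skeleton, with the builtin pow standing in for _call_mb/_call_mbsum and a plain RuntimeError in place of IPyradWarningExit:

import time
import datetime
import ipyparallel as ipp

ipyclient = ipp.Client()
lbview = ipyclient.load_balanced_view()

## submit one job per input, as run_mrbayes/run_mbsum do per file
asyncs = [lbview.apply(pow, i, 2) for i in range(20)]

## poll until every AsyncResult is ready, reporting elapsed time
start = time.time()
while 1:
    ready = [i.ready() for i in asyncs]
    elapsed = datetime.timedelta(seconds=int(time.time() - start))
    print("{}/{} done | {} |".format(sum(ready), len(ready), elapsed))
    if len(ready) == sum(ready):
        break
    time.sleep(0.1)

## surface any engine-side exception
for res in asyncs:
    if not res.successful():
        raise RuntimeError(res.result())
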
Example #3
def batch(
    baba,
    ipyclient=None,
):
    """
    distributes jobs to the parallel client
    """

    ## parse args
    handle = baba.data
    taxdicts = baba.tests
    mindicts = baba.params.mincov
    nboots = baba.params.nboots

    ## if handle is an ms generator, convert it to a reusable list
    sims = 0
    if isinstance(handle, types.GeneratorType):
        handle = list(handle)
        sims = 1
    else:
        ## expand locifile path to full path
        handle = os.path.realpath(handle)

    ## parse taxdicts into names and lists if it is a dictionary
    #if isinstance(taxdicts, dict):
    #    names, taxdicts = taxdicts.keys(), taxdicts.values()
    #else:
    #    names = []
    names = []
    if isinstance(taxdicts, dict):
        taxdicts = [taxdicts]

    ## an array to hold results (len(taxdicts), nboots)
    tot = len(taxdicts)
    resarr = np.zeros((tot, 7), dtype=np.float64)
    bootsarr = np.zeros((tot, nboots), dtype=np.float64)
    paneldict = {}

    ## TODO: Setup a wrapper to find and cleanup ipyclient
    ## define the function and parallelization to use,
    ## if no ipyclient then drops back to using multiprocessing.
    if not ipyclient:
        # ipyclient = ip.core.parallel.get_client(**self._ipcluster)
        raise IPyradError("you must enter an ipyparallel.Client() object")
    else:
        lbview = ipyclient.load_balanced_view()

    ## submit jobs to run on the cluster queue
    start = time.time()
    asyncs = {}
    idx = 0

    ## prepare data before sending to engines
    ## if it's a str (locifile) then parse it here just once.
    if isinstance(handle, str):
        with open(handle, 'r') as infile:
            loci = infile.read().strip().split("|\n")
    if isinstance(handle, list):
        pass  #sims()

    ## iterate over tests (repeats mindicts if fewer than taxdicts)
    itests = iter(taxdicts)
    imdict = itertools.cycle([mindicts])

    #for test, mindict in zip(taxdicts, itertools.cycle([mindicts])):
    for i in xrange(len(ipyclient)):

        ## grab the next test and mindict; if there are fewer tests
        ## than engines, the remaining engines are skipped
        try:
            test = next(itests)
            mindict = next(imdict)
        except StopIteration:
            continue

        ## if it's sim data then convert to an array
        if sims:
            loci = _msp_to_arr(handle, test)
            args = (loci, test, mindict, nboots)
            print("not yet implemented")
            #asyncs[idx] = lbview.apply_async(dstat, *args)
        else:
            args = [loci, test, mindict, nboots]
            asyncs[idx] = lbview.apply(dstat, *args)
        idx += 1

    ## block until finished, print progress if requested.
    finished = 0
    try:
        while 1:
            keys = [i for (i, j) in asyncs.items() if j.ready()]
            ## check for failures
            for job in keys:
                if not asyncs[job].successful():
                    raise IPyradWarningExit(\
                        " error: {}: {}".format(job, asyncs[job].exception()))
                ## enter results for successful jobs
                else:
                    _res, _bot = asyncs[job].result()

                    ## store D4 results
                    if _res.shape[0] == 1:
                        resarr[job] = _res.T.as_matrix()[:, 0]
                        bootsarr[job] = _bot

                    ## or store D5 results
                    else:
                        paneldict[job] = _res.T

                    ## remove old job
                    del asyncs[job]
                    finished += 1

                    ## submit next job if there is one.
                    try:
                        test = next(itests)
                        mindict = next(imdict)
                        if sims:
                            loci = _msp_to_arr(handle, test)
                            args = (loci, test, mindict, nboots)
                            print("not yet implemented")
                            #asyncs[idx] = lbview.apply_async(dstat, *args)
                        else:
                            args = [loci, test, mindict, nboots]
                            asyncs[idx] = lbview.apply(dstat, *args)
                        idx += 1
                    except StopIteration:
                        pass

            ## count finished and break if all are done.
            #fin = idx - len(asyncs)
            elap = datetime.timedelta(seconds=int(time.time() - start))
            printstr = " calculating D-stats  | {} | "
            progressbar(tot, finished, printstr.format(elap), spacer="")
            time.sleep(0.1)
            if not asyncs:
                print("")
                break

    except KeyboardInterrupt as inst:
        ## cancel all jobs (ipy & multiproc modes) and then raise error
        try:
            ipyclient.abort()
        except Exception:
            pass
        raise inst

    ## dress up resarr as a Pandas DataFrame if 4-part test
    if len(test) == 4:
        if not names:
            names = range(len(taxdicts))
        #print("resarr")
        #print(resarr)
        resarr = pd.DataFrame(resarr,
                              index=names,
                              columns=[
                                  "dstat", "bootmean", "bootstd", "Z", "ABBA",
                                  "BABA", "nloci"
                              ])

        ## sort results and bootsarr to match if test names were supplied
        resarr = resarr.sort_index()
        order = [list(resarr.index).index(i) for i in names]
        bootsarr = bootsarr[order]
        return resarr, bootsarr
    else:
        ## order results dfs
        listres = []
        for key in range(len(paneldict)):
            listres.append(paneldict[key])

        ## make into a multi-index dataframe
        ntests = len(paneldict)
        multi_index = [
            np.array([[i] * 3 for i in range(ntests)]).flatten(),
            np.array(['p3', 'p4', 'shared'] * ntests),
        ]
        resarr = pd.DataFrame(
            data=pd.concat(listres).as_matrix(),
            index=multi_index,
            columns=listres[0].columns,
        )
        return resarr, None
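
A hedged sketch of driving batch() from user code, assuming a baba-style object already holds the loci file path in .data and bootstrap settings in .params; the sample names and the p1-p4 key convention shown here are illustrative, not taken from the source:

import ipyparallel as ipp

ipyclient = ipp.Client()

## a single 4-taxon test; batch() wraps a lone dict into a list itself
baba.tests = {
    "p1": ["sample_A"],
    "p2": ["sample_B"],
    "p3": ["sample_C"],
    "p4": ["sample_D"],
}

## for 4-taxon tests batch() returns a DataFrame of D-statistics plus an
## array of bootstrap replicates (columns include dstat, Z, and nloci)
resarr, bootsarr = batch(baba, ipyclient=ipyclient)
print(resarr[["dstat", "Z", "nloci"]])
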
Example #4
    def run_bucky(self, ipyclient, force=False, quiet=False, subname=False):
        """
        Runs bucky for a given set of parameters and stores the result
        in the ipa.bucky object. The results will be stored by default
        with the name '{name}-{alpha}' unless an argument is passed for
        'subname' to customize the output name.

        Parameters:
        -----------
        subname (str):
            A custom name prefix for the output files produced by the bucky
            analysis and output into the {workdir}/{name} directory.
        force (bool):
            If True then existing result files with the same name prefix
            will be overwritten. 
        quiet (bool):
            If True the progress bars will be suppressed. 
        ipyclient (ipyparallel.Client):
            An active ipyparallel client to distribute jobs to.

        """

        ## check for existing results files
        minidir = os.path.realpath(os.path.join(self.workdir, self.name))
        infiles = glob.glob(os.path.join(minidir, "*.sumt"))
        outroot = os.path.realpath(os.path.join(self.workdir, self.name))

        ## build alpha list
        if isinstance(self.params.bucky_alpha, list):
            alphas = self.params.bucky_alpha
        else:
            alphas = [self.params.bucky_alpha]

        ## load balancer
        lbview = ipyclient.load_balanced_view()

        ## submit each to be processed
        asyncs = []
        for alpha in alphas:
            pathname = os.path.join(outroot, "CF-a" + str(alpha))
            if os.path.exists(pathname) and not force:
                print("BUCKy results already exist for this object at "
                      "alpha={}\nuse force=True to overwrite existing results"
                      .format(alpha))
            else:
                args = [
                    alpha, self.params.bucky_nchains, self.params.bucky_nreps,
                    self.params.bucky_niter, pathname, infiles
                ]
                async = lbview.apply(_call_bucky, *args)
                asyncs.append(async)

        ## track progress
        start = time.time()
        printstr = "[bucky] infer CF posteriors     | {} | "
        while 1:
            ready = [i.ready() for i in asyncs]
            elapsed = datetime.timedelta(seconds=int(time.time() - start))
            if not quiet:
                progressbar(len(ready),
                            sum(ready),
                            printstr.format(elapsed),
                            spacer="")
            if len(ready) == sum(ready):
                if not quiet:
                    print("")
                break
            else:
                time.sleep(0.1)

        ## check success
        for async in asyncs:
            if not async.successful():
                raise IPyradWarningExit(async.result())
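
A usage sketch for run_bucky(), assuming the *.sumt inputs were already produced by run_mrbayes()/run_mbsum() on the same object (bucky_obj is an illustrative name, not from the source); passing a list of alphas submits one independent BUCKy job per value, as the loop above shows:

import ipyparallel as ipp

ipyclient = ipp.Client()

## bucky_obj is an assumed, pre-configured object exposing run_bucky()
bucky_obj.params.bucky_alpha = [0.1, 1.0, 10.0]
bucky_obj.run_bucky(ipyclient, force=True)
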
Example #5
    def _submit_jobs(self, force, ipyclient, name_fields, name_separator,
                     dry_run):
        """
        Download the accessions into the designated workdir.
        If a file already exists it will only be overwritten if
        force=True. Temporary files are removed.
        """

        ## get Run data with default fields (1,4,6,30)
        df = self.fetch_runinfo(range(31), quiet=True)
        sys.stdout.flush()

        ## if not ipyclient then use multiprocessing
        if ipyclient:
            lb = ipyclient.load_balanced_view()

        ## if Run has samples with same name (replicates) then
        ## we need to include the accessions in the file names
        if name_fields:
            ## indexing requires -1 ints
            fields = [int(i) - 1 for i in fields_checker(name_fields)]
            ## make accession names, no spaces allowed
            df['Accession'] = pd.Series(df[df.columns[fields[0]]],
                                        index=df.index)
            for field in fields[1:]:
                df.Accession += name_separator + df[df.columns[field]]
            df.Accession = [i.replace(" ", "_") for i in df.Accession]
            ## check that names are unique
            if not df.Accession.shape[0] == df.Accession.unique().shape[0]:
                raise IPyradWarningExit("names are not unique:\n{}"\
                    .format(df.Accession))

        ## backup default naming scheme
        else:
            if len(set(df.SampleName)) != len(df.SampleName):
                accs = (i + "-" + j for i, j in zip(df.SampleName, df.Run))
                df.Accession = accs
            else:
                df.Accession = df.SampleName

        if dry_run:
            print("\rThe following files will be written to: {}".format(
                self.workdir))
            print("{}\n".format(df.Accession))
        else:
            ## iterate over and download
            asyncs = []
            for idx in df.index:

                ## get args for this run
                srr = df.Run[idx]
                outname = df.Accession[idx]
                paired = df.spots_with_mates.values.astype(
                    int).nonzero()[0].any()
                fpath = os.path.join(self.workdir, outname + ".fastq.gz")

                ## skip if exists and not force
                skip = False
                if force:
                    if os.path.exists(fpath):
                        os.remove(fpath)
                else:
                    if os.path.exists(fpath):
                        skip = True
                        sys.stdout.flush()
                        print("[skip] file already exists: {}".format(fpath))

                ## single job progress bar
                tidx = df.Accession.shape[0]
                #if not ipyclient:

                ## submit job to run
                if not skip:
                    args = (self, srr, outname, paired)
                    if ipyclient:
                        async = lb.apply_async(call_fastq_dump_on_SRRs, *args)
                        asyncs.append(async)
                    else:
                        print("Downloading file {}/{}: {}".format(
                            idx + 1, tidx, fpath))
                        call_fastq_dump_on_SRRs(*args)
                        sys.stdout.flush()

            ## progress bar while blocking parallel
            if ipyclient:
                tots = df.Accession.shape[0]
                printstr = " Downloading fastq files | {} | "
                start = time.time()
                while 1:
                    elapsed = datetime.timedelta(seconds=int(time.time() -
                                                             start))
                    ready = sum([i.ready() for i in asyncs])
                    progressbar(tots,
                                ready,
                                printstr.format(elapsed),
                                spacer="")
                    time.sleep(0.1)
                    if tots == ready:
                        print("")
                        break
                self._report(tots)

                ## check for fails
                for async in asyncs:
                    if not async.successful():
                        raise IPyradWarningExit(async.result())
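
The replicate-aware naming in the fallback branch above (duplicate SampleNames get their Run accession appended) can be exercised on its own; a minimal pandas sketch with made-up run-info rows:

import pandas as pd

df = pd.DataFrame({
    "SampleName": ["lizard1", "lizard1", "lizard2"],
    "Run": ["SRR001", "SRR002", "SRR003"],
})

## same rule as the fallback branch: only disambiguate when names collide
if len(set(df.SampleName)) != len(df.SampleName):
    df["Accession"] = [i + "-" + j for i, j in zip(df.SampleName, df.Run)]
else:
    df["Accession"] = df.SampleName

print(df.Accession.tolist())
## -> ['lizard1-SRR001', 'lizard1-SRR002', 'lizard2-SRR003']
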
Example #6
    def _prun(self, force=False, ipyclient=None):
        """
        Download the accessions into the designated workdir.
        If a file already exists it will only be overwritten if
        force=True. Temporary files are removed.
        """

        ## ensure output directory
        if not os.path.exists(self.workdir):
            os.makedirs(self.workdir)

        ## TODO: parallelize with ipyclient...
        lbview = ipyclient.load_balanced_view()

        ## wrap in a try statement to shutdown on interrupt
        try:
            ## download files
            if self.is_project:
                ## get Run data
                srrs, accs = self.fetch_runinfo()

                ## if Run has samples with same name (replicates) then
                ## we need to include the accessions in the file names
                if len(set(accs)) != len(accs):
                    accs = (i + "-" + j for i, j in zip(accs, srrs))

                ## iterate over and download
                skipped = 0
                asyncs = []
                start = time.time()
                for srr, acc in zip(srrs, accs):

                    ## clean up acc if it is not nicely formatted
                    ## i.e., do not allow spaces, ...
                    acc = acc.replace(" ", "_")\
                             .replace('"', "")\
                             .replace("'", "")

                    ## print filename
                    fpath = os.path.join(self.workdir, acc + ".fastq.gz")
                    self._accession = srr

                    ## skip if exists and not force
                    skip = False
                    if force:
                        if os.path.exists(fpath):
                            os.remove(fpath)
                    else:
                        if os.path.exists(fpath):
                            skip = True
                            skipped += 1
                    if not skip:
                        async = lbview.apply(_call_fastq_dump_on_SRRs,
                                             *(self, acc))
                        asyncs.append(async)

                if skipped:
                    print("\nSkipping {} samples already present in workdir"\
                        .format(skipped))

                tots = len(srrs)
                printstr = " Downloading fastq files | {} | "
                while 1:
                    elapsed = datetime.timedelta(seconds=int(time.time() -
                                                             start))
                    ready = sum([i.ready() for i in asyncs])
                    progressbar(tots,
                                ready,
                                printstr.format(elapsed),
                                spacer="")
                    time.sleep(0.1)
                    if tots == ready:
                        print("")
                        break
                self._report(tots)

                ## check for fails
                for async in asyncs:
                    if not async.successful():
                        raise IPyradWarningExit(async.result())

            else:
Example #7
def inference(data, ipyclient, bidx):
    """ run inference and store results """

    ## a distributor of chunks
    njobs = sum(1 for _ in iter(xrange(data.svd.checkpoint_arr, 
                                       data.svd.nquarts, data.svd.chunk)))
    jobiter = iter(xrange(data.svd.checkpoint_arr, 
                          data.svd.nquarts, data.svd.chunk))
    #LOGGER.info("chunksize: %s, start: %s, total: %s, njobs: %s", \
    #        data.svd.chunk, data.svd.checkpoint_arr, data.svd.nquarts, njobs)

    ## make a distributor for engines
    lbview = ipyclient.load_balanced_view()
    #LOGGER.info("sending jobs to %s Engines", len(ipyclient))

    ## open a view to the super h5 array
    with h5py.File(data.svd.h5out, 'w') as out5:
        out5.create_dataset("quartets", (data.svd.nquarts, 4), 
                            dtype=np.uint16, chunks=(data.svd.chunk, 4))
        out5.create_dataset("weights", (data.svd.nquarts,), 
                            dtype=np.float16, chunks=(data.svd.chunk,))

    ## submit initial n jobs
    assert len(ipyclient) > 0, "No ipyparallel Engines found"
    res = {}
    for i in range(len(ipyclient)):
        try:
            res[i] = lbview.apply(worker, [data, jobiter.next()])
        except StopIteration:
            continue

    ## iterate over remaining jobs
    keys = res.keys()
    finished = 0

    while res.keys():
        time.sleep(1)
        if not bidx:
            progressbar(njobs, finished)
        for key in keys:
            try:
                ## query for finished results
                result = res[key].get(0)
                ## put it into the super array
                insert_to_array(data, result)
                ## delete result, update checkpoint
                del res[key]
                finished += 1
                ## update the minimum quartets finished/filled.
                with h5py.File(data.svd.h5out, 'r') as tmp5:
                    ww = tmp5["weights"][:]
                    try:
                        data.svd.checkpoint_arr = np.where(ww == 0)[0].min()
                        #LOGGER.info("arr saved at %s", data.svd.checkpoint_arr)
                    except (ValueError, AttributeError):
                        ## array is full (no zeros)
                        pass
                ## submit new jobs
                try:
                    res[key] = lbview.apply(worker, 
                                    [data, jobiter.next()])
                    #LOGGER.info("new job added to Engine %s", key)
                except StopIteration:
                    continue

            except (ipp.error.TimeoutError, KeyError):
                continue

    if not bidx:
        progressbar(njobs, finished)                
        print("")

    ## convert to txt file for wQMC
    dump(data)    

    ## run quartet joining algorithm
    if not bidx:
        run_qmc(data, boot=0)
    else:
        run_qmc(data, boot=1)

    ## reset the checkpoint_arr
    data.svd.checkpoint_arr = 0
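
The scheduling idea in inference() is to keep every engine saturated: seed one job per engine, then resubmit into a slot as soon as its result has been collected. A stripped-down, self-contained sketch of that refill loop, with pow standing in for worker() and a plain range standing in for the chunked quartet iterator:

import time
import ipyparallel as ipp

ipyclient = ipp.Client()
lbview = ipyclient.load_balanced_view()

jobiter = iter(range(50))          # stands in for the chunk iterator
res = {}

## seed one job per engine
for i in range(len(ipyclient)):
    try:
        res[i] = lbview.apply(pow, next(jobiter), 2)
    except StopIteration:
        break

## collect results and refill each slot until the iterator is exhausted
finished = 0
while res:
    time.sleep(0.5)
    for key in list(res):
        if not res[key].ready():
            continue
        res[key].get()             # fetch (and here discard) the result
        finished += 1
        del res[key]
        try:
            res[key] = lbview.apply(pow, next(jobiter), 2)
        except StopIteration:
            pass

print("finished {} jobs".format(finished))
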
Example #8
def run(data, nboots, method, nquarts, force, ipyclient):
    """ 
    Run svd4tet inference on a sequence or SNP alignment for all samples
    in the Assembly.

    By default the job resumes from wherever it last left off; if
    force=True it starts over from 0.
    """

    ## load svd attributes if they exist
    fresh = 0
    if not force:
        try:
            if data.svd.checkpoint_boot or data.svd.checkpoint_arr:
                print("  loading from svd checkpoint")
                print("  array checkpoint: {}".format(data.svd.checkpoint_arr))
                print("  boots checkpoint: {}".format(data.svd.checkpoint_boot))
                print("  sampling method: {}".format(data.svd.method))
                ## require method to be same as loaded type
                assert method == data.svd.method, (
                    "loaded object method={}; cannot change methods "
                    "midstream. Use the force argument to start a new run "
                    "with a new method.".format(data.svd.method))

            else:
                fresh = 1

        except (AttributeError, IOError):
            fresh = 1

    ## if svd results do not exist or force then restart
    if force or fresh:
        ## make an analysis directory if it doesn't exist
        data.dirs.svd = os.path.realpath(
                            os.path.join(
                                data.dirs.project, data.name+"_analysis_svd"))
        if not os.path.exists(data.dirs.svd):
            try:
                os.mkdir(data.dirs.svd)
            except OSError:
                ## if not there then create new svd directory
                data.dirs.svd = os.path.join(
                                    os.path.curdir, data.name+"_analysis_svd")
                os.mkdir(data.dirs.svd)
                print("  output directory created at: {}".format(data.dirs.svd))

        ## init new svd ObjDict
        data = svd_obj_init(data, method)

        ## get the real seq array into hdf5 h5in
        data = get_seqarray(data, boot=False)

        ## make quartet arrays into hdf5. Allow subsetting samples eventually.
        ## and optimize chunk value given remaining quartets and ipyclient    
        if method == "equal":
            ## print equal header
            print("  loading {} random quartet samples for starting tree inference"\
                  .format(nquarts))
            ## grab test number for starting tree
            data = get_quartets(data, method, nquarts, ipyclient)
            print("  inferring {} x 3 quartet trees for starting tree"\
                  .format(nquarts))
            ## infer starting tree
            inference(data, ipyclient, bidx=0)
            ## sample quartets from starting tree
            print("  loading {} equal-splits quartets from starting tree"\
                  .format(nquarts))            
            data = equal_splits(data, nquarts, ipyclient)
            ## remove starting tree tmp files
            tmps = [data.svd.tre, data.svd.wtre, data.svd.tboots, 
                    data.svd.wboots, data.svd.btre, data.svd.bwtre]
            for tmp in tmps:
                try:
                    os.remove(tmp)
                except OSError:
                    continue

        ## will sample all or random set of quartets    
        else:
            if method == "random":
                print("  loading {} random quartet samples"\
                      .format(nquarts))
            else:   
                nquarts = n_choose_k(len(data.samples), 4)                
                print("  loading all {} possible quartets"\
                      .format(nquarts))
            data = get_quartets(data, method, nquarts, ipyclient)

    ## run the full inference 
    if not data.svd.checkpoint_boot:
        print("  inferring {} x 3 quartet trees".format(nquarts))
        inference(data, ipyclient, bidx=0)
    else:
        print("  full inference finished")
        progressbar(20, 20)

    ## run the bootstrap replicates
    if nboots:
        print("  running {} bootstrap replicates".format(nboots))
    
        ## get current boot
        for bidx in range(data.svd.checkpoint_boot, nboots):
        
            if data.svd.checkpoint_arr == 0:
                data = get_seqarray(data, boot=True)
                #LOGGER.info("  new boot array sampled")
                data.svd.checkpoint_boot = bidx
            ## start boot inference
            progressbar(nboots, bidx)
            inference(data, ipyclient, bidx=True)
        progressbar(20, 20)

        ## write outputs with bootstraps
        write_outputs(data, with_boots=1)

    else:
        ## write outputs without bootstraps
        write_outputs(data, with_boots=0)
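
A hedged invocation sketch for run(), assuming data is an ipyrad Assembly-like object and an ipcluster is running; the method string for the exhaustive branch is assumed here to be "all", and nquarts can be left as None on that branch because it is recomputed as n-choose-4 inside the function:

import ipyparallel as ipp

ipyclient = ipp.Client()

## full inference over all possible quartets plus 100 bootstrap replicates
run(data, nboots=100, method="all", nquarts=None, force=False,
    ipyclient=ipyclient)
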