Example #1
    def run(self, steps=None, ipyclient=None, force=False, quiet=False):
        """
        Submits an ordered list of jobs to a load-balancer to complete 
        the following tasks, and reports a progress bar:
        (1) Write nexus files for each locus
        (2) Run MrBayes on each locus to get a posterior of gene trees
        (3) Run mbsum (a BUCKy tool) on the posterior set of trees
        (4) Run BUCKy on the summarized set of trees for all alpha values.

        Parameters:
        -----------
        ipyclient (ipyparallel.Client())
            A connected ipyparallel Client object used to distribute jobs
        force (bool):
            Whether to overwrite existing files with the same name and workdir
            if they exist. Default is False.
        quiet (bool):
            Whether to suppress progress information. Default is False.
        steps (list):
            A list of integers of steps to perform. This is useful if a 
            job was interrupted, or you created a new bucky object copy, 
            or you wish to run an analysis under a new set of parameters, 
            after having run it once. For example, if you finished running
            steps 1 and 2 (write nexus files and infer mrbayes posteriors), 
            but you want to rerun steps 3 and 4 with new settings, then you
            could enter `steps=[3,4]` and also `force=True` to run steps 3 
            and 4 with a new set of parameters. Default argument is None 
            which means run all steps. 
        """

        ## require ipyclient
        if not ipyclient:
            raise IPyradWarningExit("an ipyclient object is required")

        ## check the steps argument
        if not steps:
            steps = [1, 2, 3, 4]
        if isinstance(steps, str):
            steps = [int(i) for i in steps]
        if isinstance(steps, list):
            if not all(isinstance(i, int) for i in steps):
                raise IPyradWarningExit("steps must be a list of integers")

        ## run steps ------------------------------------------------------
        ## todo: wrap this function so it plays nice when interrupted.
        if 1 in steps:
            self.write_nexus_files(force=force, quiet=quiet)
        if 2 in steps:
            self.run_mrbayes(force=force, quiet=quiet, ipyclient=ipyclient)
        if 3 in steps:
            self.run_mbsum(force=force, quiet=quiet, ipyclient=ipyclient)
        if 4 in steps:
            self.run_bucky(force=force, quiet=quiet, ipyclient=ipyclient)

        ## track progress of the submitted asyncs; poll until all are done
        while 1:
            if self.asyncs and not all(job.ready() for job in self.asyncs):
                time.sleep(1)
            else:
                break
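A minimal usage sketch for the run() method above. It assumes a hypothetical bucky analysis object named `b` that exposes this method, and a running ipcluster that ipyparallel can connect to; rerunning only steps 3 and 4 with force=True follows the pattern described in the docstring.

import ipyparallel as ipp

ipyclient = ipp.Client()          ## connect to a running ipcluster
## rerun only steps 3 (mbsum) and 4 (BUCKy), overwriting earlier results
b.run(steps=[3, 4], force=True, ipyclient=ipyclient)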
Example #2
    def __init__(self, accession, workdir, paired=False):

        ## TODO:
        if paired:
            raise IPyradWarningExit(
                "sorry, paired data is not yet supported, stay tuned.")

        ## check imports
        for binary in ['fastq-dump', 'esearch']:
            if not sps.call(
                    "type " + binary, shell=True, stdout=sps.PIPE,
                    stderr=sps.PIPE) == 0:
                raise IPyradWarningExit(MISSING_IMPORTS)

        ## store attributes
        self.accession = accession
        self.workdir = os.path.abspath(os.path.expanduser(workdir))
        self.is_sample = False
        self.is_project = False

        ## classify the accession as a single Run (SRR/ERR/DRR) or a Project (SRP/ERP/DRP)
        if any([i in self.accession for i in ["SRR", "ERR", "DRR"]]):
            self.is_sample = True
        elif any([i in self.accession for i in ["SRP", "ERP", "DRP"]]):
            self.is_project = True
        else:
            raise IPyradWarningExit(ACCESSION_ID)
Example #3
def main():
    """ main function """
    ## not in ipython
    ip.__interactive__ = 0

    header = \
    "\n --------------------------------------------------"+\
    "\n  Analysis tools for ipyrad [v.{}]".format(ip.__version__)+\
    "\n  svd4tet -- fast quartet and tree inference "+\
    "\n --------------------------------------------------"
    print(header)

    ## parse params file input (returns to stdout if --help or --version)
    args = parse_command_line()

    ## if JSON, load it
    if args.json:
        data = ip.load_json(args.json)
        data.outfiles.svdinput = data.outfiles.svdinput

    ## else create a tmp assembly for the seqarray
    else:
        if not args.output:
            raise IPyradWarningExit("  -o output_prefix required")
        if not args.seq:
            raise IPyradWarningExit("  -s sequence file required")
        ## create new JSON (Assembly) object
        data = ip.Assembly(args.output, quiet=True)
        data.outfiles.svdinput = args.seq
        data.set_params(1, "./")

        ## parse samples from the sequence file
        names = []
        with open(args.seq, 'r') as infile:
            ## skip the header line before collecting sample names
            infile.next()
            while 1:
                try:
                    names.append(infile.next().split()[0])
                except StopIteration:
                    break
        ## store as Samples in Assembly
        data.samples = {name:ip.Sample(name) for name in names}

    ## store ipcluster info
    data._ipcluster["cores"] = args.cores

    if args.MPI:
        data._ipcluster["engines"] = "MPI"
    else:
        data._ipcluster["engines"] = "Local"

    ## launch ipcluster and register for later destruction
    data = ipcontroller_init(data)

    ## run svd4tet
    args = [data, args.boots, args.method, args.nquartets, args.force]
    data._clientwrapper(ipa.svd4tet.run, args, 45)
Example #4
def getassembly(args, parsedict):
    """ 
    loads assembly or creates a new one and set its params from 
    parsedict. Does not launch ipcluster. 
    """

    ## Creating an assembly with a full path in the name will "work"
    ## but it is potentially dangerous, so here we have assembly_name
    ## and assembly_file, name is used for creating new in cwd, file is
    ## used for loading existing.
    ##
    ## Be nice if the user includes the extension.
    #project_dir = ip.core.assembly._expander(parsedict['1'])
    #assembly_name = parsedict['0']
    project_dir = ip.core.assembly._expander(parsedict['project_dir'])
    assembly_name = parsedict['assembly_name']
    assembly_file = os.path.join(project_dir, assembly_name)

    ## Assembly creation will handle error checking  on
    ## the format of the assembly_name

    ## make sure the working directory exists.
    if not os.path.exists(project_dir):
        os.mkdir(project_dir)

    try:
        ## If 1 and force then go ahead and create a new assembly
        if ('1' in args.steps) and args.force:
            data = ip.Assembly(assembly_name, cli=True)
        else:
            data = ip.load_json(assembly_file, cli=True)
            data._cli = True

    except IPyradWarningExit as _:
        ## if no assembly is found then go ahead and make one
        if '1' not in args.steps:
            raise IPyradWarningExit(\
                "  Error: You must first run step 1 on the assembly: {}"\
                .format(assembly_file))
        else:
            ## create a new assembly object
            data = ip.Assembly(assembly_name, cli=True)

    ## set each parameter from the parsed params dict
    for param in parsedict:

        ## trap assignment of assembly_name since it is immutable.
        if param == "assembly_name":
            ## Raise error if user tried to change assembly name
            if parsedict[param] != data.name:
                data.set_params(param, parsedict[param])
        else:
            ## all other params should be handled by set_params
            try:
                data.set_params(param, parsedict[param])
            except IndexError as _:
                print("  Malformed params file: {}".format(args.params))
                print("  Bad parameter {} - {}".format(param, parsedict[param]))
                sys.exit(-1)
    return data
Example #5
def parse_params(args):
    """ Parse the params file args, create and return Assembly object."""

    ## check that params.txt file is correctly formatted.
    try:
        with open(args.params) as paramsin:
            plines = paramsin.readlines()
    except IOError as _:
        sys.exit("  No params file found")

    ## check header: big version changes can be distinguished by the header
    legacy_version = 0
    try:
        ## try to update the Assembly ...
        legacy_version = 1
        if not len(plines[0].split()[0]) == 7:
            raise IPyradWarningExit("""
        Error: file '{}' is not compatible with ipyrad v.{}.
        Please create and update a new params file using the -n argument. 
        For info on which parameters have changed see the changelog:
        (http://ipyrad.readthedocs.io/releasenotes.html)
        """.format(args.params, ip.__version__))

    except IndexError:
        raise IPyradWarningExit("""
        Error: Params file should not have any empty lines at the top
        of the file. Verify there are no blank lines and rerun ipyrad.
        Offending file - {}
        """.format(args.params))

    ## update and backup
    if legacy_version:
        #which version...
        #update_to_6()
        pass

    ## make into a dict. Ignore blank lines at the end of file
    ## Really this will ignore all blank lines
    items = [
        i.split("##")[0].strip() for i in plines[1:] if not i.strip() == ""
    ]

    #keys = [i.split("]")[-2][-1] for i in plines[1:]]
    #keys = range(len(plines)-1)
    keys = ip.Assembly('null', quiet=True).paramsdict.keys()
    parsedict = {str(i): j for i, j in zip(keys, items)}
    return parsedict
Example #6
def branch_assembly(args, parsedict):
    """ 
    Load the passed in assembly and create a branch. Copy it
    to a new assembly, and also write out the appropriate params.txt
    """

    ## Get the current assembly
    data = getassembly(args, parsedict)


    ## get arguments to branch command
    bargs = args.branch

    ## get new name, trim off .txt if it was accidentally added
    newname = bargs[0]
    if newname.endswith(".txt"):
        newname = newname[:-4]

    ## look for subsamples
    if len(bargs) > 1:
        ## Branching and subsampling at step 6 is a bad idea, it messes up
        ## indexing into the hdf5 cluster file. Warn against this.
        if any([x.stats.state == 6 for x in data.samples.values()]):
            print("  Warning: subsampling while branching after step 6 can "
                  "break indexing into the hdf5 cluster file.")

        ## are we removing or keeping listed samples?
        subsamples = bargs[1:]

        ## drop the matching samples
        if bargs[1] == "-":
            ## check drop names
            fails = [i for i in subsamples[1:] if i not in data.samples.keys()]
            if any(fails):
                raise IPyradWarningExit("\
                    \n  Failed: unrecognized names requested, check spelling:\n  {}"\
                    .format("\n  ".join([i for i in fails])))
            print("  dropping {} samples".format(len(subsamples)-1))
            subsamples = list(set(data.samples.keys()) - set(subsamples))

        ## If the arg after the new param name is a file that exists
        if os.path.exists(bargs[1]):
            new_data = data.branch(newname, infile=bargs[1])
        else:
            new_data = data.branch(newname, subsamples)

    ## keeping all samples
    else:
        new_data = data.branch(newname, None)

    print("  creating a new branch called '{}' with {} Samples".\
             format(new_data.name, len(new_data.samples)))

    print("  writing new params file to {}"\
            .format("params-"+new_data.name+".txt\n"))
    new_data.write_params("params-"+new_data.name+".txt", force=args.force)
Example #7
    def fetch_runinfo(self, fields=None, quiet=False):
        """
        Call esearch to grep SRR info for a project (SRP). Use the command
        sra.fetch_fields to see available fields to be fetched. This function
        returns a DataFrame with runinfo for the selected fields.

        Parameters:
        -----------
        fields (tuple or list):
            The default fields returned are 1-30. You can enter a list 
            or tuple of fewer numbers to select fewer fields. For example, 
            (1,4,6,29,30) returns a tidy dataframe with Run IDs, 
            Number of reads (SE and PE), ScientificName, and SampleName. 
        """
        if not quiet:
            print("\rFetching project data...", end="")

        ## if no entry then fetch (nearly) all fields.
        if fields is None:
            fields = range(30)
        fields = fields_checker(fields)

        ## command strings
        es_cmd = [
            "esearch", 
            "-db", "sra", 
            "-query", self.accession,
        ]

        ef_cmd = [
            "efetch", 
            "--format", "runinfo",
        ]

        cut_cmd = [
            "cut", 
            "-d", ",", 
            "-f", ",".join(fields),
        ]

        ## pipe commands together
        proc1 = sps.Popen(es_cmd, stderr=sps.STDOUT, stdout=sps.PIPE)
        proc2 = sps.Popen(ef_cmd, stdin=proc1.stdout, stderr=sps.STDOUT, stdout=sps.PIPE)
        proc3 = sps.Popen(cut_cmd, stdin=proc2.stdout, stderr=sps.STDOUT, stdout=sps.PIPE)
        o, e = proc3.communicate()
        proc2.stdout.close()
        proc1.stdout.close()
        
        if o:
            vals = o.strip().split("\n")
            names = vals[0].split(",")
            items = [i.split(",") for i in vals[1:]]
            return pd.DataFrame(items, columns=names)
        else:
            raise IPyradWarningExit("no samples found in {}".format(self.accession))
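A hedged usage sketch for fetch_runinfo() above; `sra` stands in for an instance of the surrounding class constructed with an SRP project accession (the object name is illustrative).

## fetch a small set of runinfo fields as a pandas DataFrame
df = sra.fetch_runinfo(fields=(1, 4, 6, 29, 30))
print(df.head())                  ## Run IDs, read counts, ScientificName, SampleName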
Example #8
    def fetch_runinfo(self):
        """
        Call esearch to grep SRR info for a project (SRP). 
        Returns two lists: SRRs and ACCs. 
        """
        print("\rFetching project data...", end="")

        es_cmd = [
            "esearch",
            "-db",
            "sra",
            "-query",
            self.accession,
        ]

        ef_cmd = [
            "efetch",
            "--format",
            "runinfo",
        ]

        cut_cmd = [
            "cut",
            "-d",
            ",",
            "-f",
            "1,30",
        ]

        ## this will grep SRR for SRPs, and ERR for ERPs, etc.
        grep_cmd = ["grep", self.accession[:2] + "R"]

        ## pipe commands together
        proc1 = sps.Popen(es_cmd, stderr=sps.STDOUT, stdout=sps.PIPE)
        proc2 = sps.Popen(ef_cmd,
                          stdin=proc1.stdout,
                          stderr=sps.STDOUT,
                          stdout=sps.PIPE)
        proc3 = sps.Popen(cut_cmd,
                          stdin=proc2.stdout,
                          stderr=sps.STDOUT,
                          stdout=sps.PIPE)
        proc4 = sps.Popen(grep_cmd,
                          stdin=proc3.stdout,
                          stderr=sps.STDOUT,
                          stdout=sps.PIPE)
        o, e = proc4.communicate()

        if o:
            srrlist = o.strip().split("\n")
            SRRs, ACCs = zip(*[i.split(",") for i in srrlist])
            return SRRs, ACCs
        else:
            raise IPyradWarningExit("no samples found in {}".format(
                self.accession))
Example #9
def getassembly(args, parsedict):
    """ loads assembly or creates a new one and set its params from 
    parsedict. Does not launch ipcluster. 
    """

    ## Creating an assembly with a full path in the name will "work"
    ## but it is potentially dangerous, so here we have assembly_name
    ## and assembly_file, name is used for creating new in cwd, file is
    ## used for loading existing.
    ##
    ## Be nice if the user includes the extension.
    project_dir = ip.core.assembly.expander(parsedict['1'])
    assembly_name = parsedict['0']
    assembly_file = os.path.join(project_dir, assembly_name)

    ## Assembly creation will handle error checking  on
    ## the format of the assembly_name

    ## make sure the working directory exists.
    if not os.path.exists(project_dir):
        os.mkdir(project_dir)

    try:

        ## If 1 and force then go ahead and create a new assembly
        if '1' in args.steps and args.force:
            data = ip.Assembly(assembly_name)
        else:
            data = ip.load_json(assembly_file)

    except IPyradWarningExit as inst:
        ## if no assembly is found then go ahead and make one
        if '1' not in args.steps:
            raise IPyradWarningExit("""
    Error: Steps >1 ({}) requested but no current assembly found - {}
    """.format(args.steps, assembly_file))
        else:
            ## create a new assembly object
            data = ip.Assembly(assembly_name)

    ## set each parameter from the parsed params dict
    for param in parsedict:
        ## trap assignment of assembly_name since it is immutable.
        if param == str(0):
            ## only pass to set_params if user tried to change assembly_name
            ## it will raise an Exit error
            if parsedict[param] != data.name:
                data.set_params(param, parsedict[param])
        else:
            ## all other params should be handled by set_params
            data.set_params(param, parsedict[param])

    return data
Example #10
    def run_mrbayes(self, ipyclient, force=False, quiet=False):
        """
        calls the mrbayes block in each nexus file.
        """

        ## get all the nexus files for this object
        minidir = os.path.realpath(os.path.join(self.workdir, self.name))
        nexus_files = glob.glob(os.path.join(minidir, "*.nex"))

        ## clear existing files
        #existing = glob.glob(os.path.join(self.workdir, self.name, "*.nex"))
        existing = glob.glob(os.path.join(minidir, "*.nex.*"))
        if any(existing):
            if force:
                for rfile in existing:
                    os.remove(rfile)
            else:
                raise IPyradWarningExit(EXISTING_NEXdot_FILES.format(minidir))

        ## write new nexus files, or should users do that before this?
        #self.write_nexus_files(force=True)

        ## load balancer
        lbview = ipyclient.load_balanced_view()

        ## submit each to be processed
        asyncs = []
        for nex in nexus_files:
            rasync = lbview.apply(_call_mb, nex)
            asyncs.append(rasync)

        ## track progress
        start = time.time()
        printstr = "[mb] infer gene-tree posteriors | {} | "
        while 1:
            ready = [i.ready() for i in asyncs]
            elapsed = datetime.timedelta(seconds=int(time.time() - start))
            if not quiet:
                progressbar(len(ready),
                            sum(ready),
                            printstr.format(elapsed),
                            spacer="")
            if len(ready) == sum(ready):
                if not quiet:
                    print("")
                break
            else:
                time.sleep(0.1)

        ## check success
        for rasync in asyncs:
            if not rasync.successful():
                raise IPyradWarningExit(rasync.result())
Example #11
    def run_mbsum(self, ipyclient, force=False, quiet=False):
        """
        Sums two replicate mrbayes runs for each locus
        """
        minidir = os.path.realpath(os.path.join(self.workdir, self.name))
        ## sort so that run1 and run2 replicates pair up by locus index
        trees1 = sorted(glob.glob(os.path.join(minidir, "*.run1.t")))
        trees2 = sorted(glob.glob(os.path.join(minidir, "*.run2.t")))

        ## clear existing files
        existing = glob.glob(os.path.join(self.workdir, self.name, "*.sumt"))
        if any(existing):
            if force:
                for rfile in existing:
                    os.remove(rfile)
            else:
                path = os.path.join(self.workdir, self.name)
                raise IPyradWarningExit(EXISTING_SUMT_FILES.format(path))

        ## load balancer
        lbview = ipyclient.load_balanced_view()

        ## submit each to be processed
        asyncs = []
        for tidx in xrange(len(trees1)):
            rep1 = trees1[tidx]
            rep2 = trees2[tidx]
            outname = os.path.join(minidir, str(tidx) + ".sumt")
            rasync = lbview.apply(_call_mbsum, *(rep1, rep2, outname))
            asyncs.append(rasync)

        ## track progress
        start = time.time()
        printstr = "[mbsum] sum replicate runs      | {} | "
        while 1:
            ready = [i.ready() for i in asyncs]
            elapsed = datetime.timedelta(seconds=int(time.time() - start))
            if not quiet:
                progressbar(len(ready),
                            sum(ready),
                            printstr.format(elapsed),
                            spacer="")
            if len(ready) == sum(ready):
                if not quiet:
                    print("")
                break
            else:
                time.sleep(0.1)

        ## check success
        for rasync in asyncs:
            if not rasync.successful():
                raise IPyradWarningExit(rasync.result())
Example #12
    def __init__(
        self,
        accession,
        workdir="sra-fastq-data",
    ):

        ## check imports
        for binary in ['fastq-dump', 'esearch']:
            if not sps.call(
                    "type " + binary, shell=True, stdout=sps.PIPE,
                    stderr=sps.PIPE) == 0:
                raise IPyradWarningExit(MISSING_IMPORTS)

        ## store attributes
        self.accession = accession
        self.workdir = os.path.abspath(os.path.expanduser(workdir))
        self.is_sample = False
        self.is_project = False
        self._oldtmpdir = None

        ## cluster attributes
        self._ipcluster = {
            "cluster_id": "",
            "profile": "default",
            "engines": "Local",
            "quiet": 0,
            "timeout": 60,
            "cores": 0,
            "threads": 2,
            "pids": {},
        }

        ## classify the accession as a single Run (SRR/ERR/DRR) or a Project (SRP/ERP/DRP)
        if any([i in self.accession for i in ["SRR", "ERR", "DRR"]]):
            self.is_sample = True
        elif any([i in self.accession for i in ["SRP", "ERP", "DRP"]]):
            self.is_project = True
        else:
            raise IPyradWarningExit(ACCESSION_ID)
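An instantiation sketch for the constructor above. The class name `SRA` and the accession strings are placeholders for illustration; any Run (SRR/ERR/DRR) or Project (SRP/ERP/DRP) accession follows the same pattern.

## a single sequencing Run
run_obj = SRA(accession="SRR1234567", workdir="sra-fastq-data")
## a whole project (study) of Runs
proj_obj = SRA(accession="SRP123456", workdir="sra-fastq-data")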
Example #13
    def __init__(self, name, data, workdir=None, mapfile=None):
        self.name = name
        self.data = os.path.abspath(os.path.expanduser(data))
        self.mainparams = _MainParams()
        self.extraparams = _ExtraParams()
        self.clumppparams = _ClumppParams()
        self.asyncs = []

        ## check that structure is installed and in PATH
        for binary in ['structure']:
            if not subprocess.call("type " + binary,
                                   shell=True,
                                   stdout=subprocess.PIPE,
                                   stderr=subprocess.PIPE) == 0:
                raise IPyradWarningExit(MISSING_IMPORTS)

        ## make workdir if it does not exist
        if workdir:
            self.workdir = os.path.abspath(os.path.expanduser(workdir))
        else:
            self.workdir = OPJ(os.path.abspath('.'), "analysis-structure")
        if not os.path.exists(self.workdir):
            os.makedirs(self.workdir)

        ## check that strfile exists, print and parse some info from it
        with open(data) as ifile:
            lines = ifile.readlines()
            self.ntaxa = len(lines) // 2
            self.nsites = len(lines[0].strip().split()[1:])
            self.labels = [i.split('\t')[0].strip() for i in lines][::2]
            self.popdata = [i.split('\t')[1] for i in lines][::2]
            self.popflag = [i.split('\t')[2] for i in lines][::2]
            self.locdata = [i.split('\t')[3] for i in lines][::2]
            self.phenotype = [i.split('\t')[4] for i in lines][::2]
            #self.extra = [i.split('\t')[5] for i in lines][::2] #default extracols=0
            del lines

        ## if mapfile then parse it to an array
        if mapfile:
            with open(mapfile) as inmap:
                maparr = np.genfromtxt(inmap)[:, [0, 3]].astype(np.uint64)
                spans = np.zeros((maparr[-1, 0], 2), np.uint64)
                spans = get_spans(maparr, spans)
                self.maparr = spans
                self.nsites = spans.shape[0]
        else:
            self.maparr = None
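An instantiation sketch for the constructor above, assuming the surrounding class is the structure wrapper; the class name and file paths are placeholders.

s = Structure(
    name="test",
    data="analysis-ipyrad/test_outfiles/test.str",
    workdir="analysis-structure",
    mapfile=None,
)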
Example #14
def fields_checker(fields):
    """
    Returns the fields argument formatted as a list of strings.
    Zero ('0') is not allowed and is removed.
    """
    ## make sure fields will work
    if isinstance(fields, int):
        fields = str(fields)
    if isinstance(fields, str):
        if "," in fields:
            fields = [str(i) for i in fields.split(",")]
        else:
            fields = [str(fields)]
    elif isinstance(fields, (tuple, list)):
        fields = [str(i) for i in fields]
    else:
        raise IPyradWarningExit("fields not properly formatted")

    ## do not allow zero in fields
    fields = [i for i in fields if i != '0']

    return fields
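A quick illustration of the normalization performed by fields_checker(); each call returns a list of strings with '0' removed.

fields_checker(5)             ## -> ['5']
fields_checker("1,4,30")      ## -> ['1', '4', '30']
fields_checker((0, 1, 29))    ## -> ['1', '29']  (zero is dropped)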
Example #15
def main():
    """ main function """
    ## turn off traceback for the CLI
    ip.__interactive__ = 0

    ## Check for a new version on anaconda
    _check_version()

    ## parse params file input (returns to stdout if --help or --version)
    args = parse_command_line()

    ## Turn the debug output written to ipyrad_log.txt up to 11!
    ## Clean up the old one first, it's cleaner to do this here than
    ## at the end (exceptions, etc)
    if os.path.exists(ip.__debugflag__):
        os.remove(ip.__debugflag__)

    if args.debug:
        print("\n  ** Enabling debug mode ** ")
        ip._debug_on()
        atexit.register(ip._debug_off)        

    ## create new paramsfile if -n
    if args.new:
        ## Create a tmp assembly, call write_params to make default params.txt
        try:
            tmpassembly = ip.Assembly(args.new, quiet=True, cli=True)
            tmpassembly.write_params("params-{}.txt".format(args.new), 
                                     force=args.force)
        except Exception as inst:
            print(inst)
            sys.exit(2)

        print("\n  New file 'params-{}.txt' created in {}\n".\
               format(args.new, os.path.realpath(os.path.curdir)))
        sys.exit(2)


    ## if params then must provide action argument with it
    if args.params:
        if not any([args.branch, args.results, args.steps]):
            print("""
    Must provide action argument along with -p argument for params file. 
    e.g., ipyrad -p params-test.txt -r              ## shows results
    e.g., ipyrad -p params-test.txt -s 12           ## runs steps 1 & 2
    e.g., ipyrad -p params-test.txt -b newbranch    ## branch this assembly
    """)
            sys.exit(2)

    if not args.params:
        if any([args.branch, args.results, args.steps]):
            print("""
    Must provide params file for branching, doing steps, or getting results.
    e.g., ipyrad -p params-test.txt -r              ## shows results
    e.g., ipyrad -p params-test.txt -s 12           ## runs steps 1 & 2
    e.g., ipyrad -p params-test.txt -b newbranch    ## branch this assembly
    """)

    ## if branching, or merging do not allow steps in same command
    ## print spacer
    if any([args.branch, args.merge]):        
        args.steps = ""    
        print("")    

    ## always print the header when doing steps
    header = \
    "\n -------------------------------------------------------------"+\
    "\n  ipyrad [v.{}]".format(ip.__version__)+\
    "\n  Interactive assembly and analysis of RAD-seq data"+\
    "\n -------------------------------------------------------------"

    ## Log the current version. End run around the LOGGER
    ## so it'll always print regardless of log level.
    with open(ip.__debugfile__, 'a') as logfile:
        logfile.write(header)
        logfile.write("\n  Begin run: {}".format(time.strftime("%Y-%m-%d %H:%M")))
        logfile.write("\n  Using args {}".format(vars(args)))
        logfile.write("\n  Platform info: {}".format(os.uname()))

    ## if merging just do the merge and exit
    if args.merge:
        print(header)
        merge_assemblies(args)
        sys.exit(1)

    ## if download data do it and then exit. Runs single core in CLI. 
    if args.download:
        if len(args.download) == 1:
            downloaddir = "sra-fastqs"
        else:
            downloaddir = args.download[1]
        sratools_download(args.download[0], workdir=downloaddir, force=args.force)
        sys.exit(1)

    ## create new Assembly or load existing Assembly, quit if args.results
    elif args.params:
        parsedict = parse_params(args)

        if args.branch:
            branch_assembly(args, parsedict)

        elif args.steps:
            ## print header
            print(header)

            ## Only blank the log file if we're actually going to run a new
            ## assembly. This used to be in __init__, but had the side effect
            ## of occasionally blanking the log file in an undesirable fashion
            ## for instance if you run a long assembly and it crashes and
            ## then you run `-r` and it blanks the log, it's crazymaking.
            if os.path.exists(ip.__debugfile__):
                if os.path.getsize(ip.__debugfile__) > 50000000:
                    with open(ip.__debugfile__, 'w') as clear:
                        clear.write("file reset")

            ## run Assembly steps
            ## launch or load assembly with custom profile/pid
            data = getassembly(args, parsedict)

            ## set CLI ipcluster terms
            data._ipcluster["threads"] = args.threads

            ## if ipyclient is running (and matched profile) then use that one
            if args.ipcluster:
                ipyclient = ipp.Client(profile=args.ipcluster)
                data._ipcluster["cores"] = len(ipyclient)

            ## if not then we need to register and launch an ipcluster instance
            else:
                ## set CLI ipcluster terms
                ipyclient = None
                data._ipcluster["cores"] = args.cores if args.cores else detect_cpus()
                data._ipcluster["engines"] = "Local"
                if args.MPI:
                    data._ipcluster["engines"] = "MPI"
                    if not args.cores:
                        raise IPyradWarningExit("must provide -c argument with --MPI")
                ## register to have a cluster-id with "ip- name"
                data = register_ipcluster(data)

            ## set to print headers
            data._headers = 1

            ## run assembly steps
            steps = list(args.steps)
            data.run(
                steps=steps, 
                force=args.force, 
                preview=args.preview, 
                show_cluster=1, 
                ipyclient=ipyclient)
                     
        if args.results:
            showstats(parsedict)
Example #16
def loci2bpp(name,
             locifile,
             imap,
             guidetree,
             minmap=None,
             maxloci=None,
             infer_sptree=0,
             infer_delimit=0,
             delimit_alg=(0, 5),
             seed=12345,
             burnin=1000,
             nsample=10000,
             sampfreq=2,
             thetaprior=(5, 5),
             tauprior=(4, 2, 1),
             traits_df=None,
             nu=0,
             kappa=0,
             useseqdata=1,
             usetraitdata=1,
             cleandata=0,
             wdir=None,
             finetune=(0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01),
             verbose=0):
    """
    Converts loci file format to bpp file format, i.e., concatenated phylip-like
    format, and produces imap and ctl input files for bpp.

    Parameters:
    -----------
    name:
        A prefix name for output files that will be produced
    locifile:
        A .loci file produced by ipyrad.
    imap:
        A Python dictionary with 'species' names as keys, and lists of sample
        names for the values. Any sample that is not included in the imap
        dictionary will be filtered out of the data when converting the .loci
        file into the bpp formatted sequence file. Each species in the imap
        dictionary must also be present in the input 'guidetree'.
    guidetree:
        A newick string species tree hypothesis [e.g., (((a,b),(c,d)),e);]
        All species in the imap dictionary must also be present in the guidetree

    Optional parameters:
    --------------------
    infer_sptree:
        Default=0, only infer parameters on a fixed species tree. If 1, then the
        input tree is treated as a guidetree and tree search is employed to find
        the best tree. The results will include support values for the inferred
        topology.
    infer_delimit:
        Default=0, no delimitation. If 1 then splits in the tree that separate
        'species' will be collapsed to test whether fewer species are a better
        fit to the data than the number in the input guidetree.
    delimit_alg:
        Species delimitation algorithm. This is a tuple. The first value
        is the algorithm (0 or 1) and the following values are arguments
        for the given algorithm. See other ctl files for examples of what the
        delimitation line looks like. This is where you can enter the params
        (e.g., alpha, migration) for the two different algorithms.
        For example, the following args would produce the following ctl lines:
           alg=0, epsilon=5
           > delimit_alg = (0, 5)
           speciesdelimitation = 1 0 5

           alg=1, alpha=2, migration=1
           > delimit_alg = (1, 2, 1)
           speciesdelimitation = 1 1 2 1

           alg=1, alpha=2, migration=1, diagnosis=0, ?=1
           > delimit_alg = (1, 2, 1, 0, 1)
           speciesdelimitation = 1 1 2 1 0 1
    seed:
        A random number seed at start of analysis.
    burnin:
        Number of burnin generations in mcmc
    nsample:
        Number of mcmc generations to run.
    sampfreq:
        How often to sample from the mcmc chain.
    thetaprior:
        Prior on theta (4Neu), gamma distributed. mean = a/b. e.g., (5, 5)
    tauprior:
        Prior on root tau, gamma distributed mean = a/b. Last number is 
        dirichlet prior for other taus. e.g., (4, 2, 1)
    traits_df:
        A pandas DataFrame with trait data properly formatted. This means only
        quantitative traits are included, and missing values are NaN.
        The first column contains sample names, with "Indiv" as the header.
        The following columns have a header row with trait names. This script
        will write a CSV trait file with trait values mean-standardized, with
        NaN replaced by "NA", and with samples not present in the imap removed.
    nu:
        A prior on phenotypic trait variance (0) for iBPP analysis.
    kappa:
        A prior on phenotypic trait mean (0) for iBPP analysis.
    useseqdata:
        If false inference proceeds without sequence data (can be used to test
        the effect of priors on the tree distributions).
    usetraitdata:
        If false inference proceeds without trait data (can be used to test
        the effect of priors on the trait distributions).
    cleandata:
        If 1 then sites with missing or hetero characters are removed.
    wdir:
        A working directory to write files to.
    finetune:
        See bpp documentation.
    verbose:
        If verbose=1 the ctl file text will also be written to screen (stderr).

    """
    ## check args
    if not imap:
        raise IPyradWarningExit(IMAP_REQUIRED)
    if minmap:
        if minmap.keys() != imap.keys():
            raise IPyradWarningExit(KEYS_DIFFER)

    ## working directory, make sure it exists
    if wdir:
        wdir = os.path.abspath(wdir)
        if not os.path.exists(wdir):
            raise IPyradWarningExit(" working directory (wdir) does not exist")
    else:
        wdir = os.path.curdir

    ## if traits_df then we make '.ibpp' files
    prog = 'bpp'
    if isinstance(traits_df, pd.DataFrame):
        prog = 'ibpp'
    outfile = OPJ(wdir, "{}.{}.seq.txt".format(name, prog))
    mapfile = OPJ(wdir, "{}.{}.imap.txt".format(name, prog))

    ## open outhandles
    fout = open(outfile, 'w')
    fmap = open(mapfile, 'w')

    ## parse the loci file
    with open(locifile, 'r') as infile:
        ## split into loci on the "|\n" end-of-locus delimiter
        loci = infile.read().strip().split("|\n")
        nloci = len(loci)

    ## all samples
    samples = list(itertools.chain(*imap.values()))

    ## iterate over loci, printing to outfile
    nkept = 0
    for iloc in xrange(nloci):
        lines = loci[iloc].split("//")[0].split()
        names = lines[::2]
        names = ["^" + i for i in names]
        seqs = [list(i) for i in lines[1::2]]
        seqlen = len(seqs[0])

        ## whether to skip this locus based on filters below
        skip = 0

        ## if minmap filter for sample coverage
        if minmap:
            covd = {}
            for group, vals in imap.items():
                covd[group] = sum(["^" + i in names for i in vals])
            ## check that coverage is good enough
            if not all([covd[group] >= minmap[group] for group in minmap]):
                skip = 1

        ## too many loci?
        if maxloci:
            if nkept >= maxloci:
                skip = 1

        ## build locus as a string
        if not skip:
            ## convert to phylip with caret starter and replace - with N.
            data = ["{:<30} {}".format(i, "".join(k).replace("-", "N")) for \
                (i, k) in zip(names, seqs) if i[1:] in samples]

            ## if not empty, write to the file
            if data:
                fout.write("{} {}\n\n{}\n\n"\
                           .format(len(data), seqlen, "\n".join(data)))
                nkept += 1

    ## close up shop
    fout.close()

    ## write the imap file:
    data = ["{:<30} {}".format(val, key) for key \
            in sorted(imap) for val in imap[key]]
    fmap.write("\n".join(data))
    fmap.close()

    ## write ctl file
    write_ctl(name, imap, guidetree, nkept, infer_sptree, infer_delimit,
              delimit_alg, seed, burnin, nsample, sampfreq, thetaprior,
              tauprior, traits_df, nu, kappa, cleandata, useseqdata,
              usetraitdata, wdir, finetune, verbose)

    ## print message?
    sys.stderr.write("new files created ({} loci, {} species, {} samples)\n"\
                     .format(nkept, len(imap.keys()),
                             sum([len(i) for i in imap.values()])))
    sys.stderr.write("  {}.{}.seq.txt\n".format(name, prog))
    sys.stderr.write("  {}.{}.imap.txt\n".format(name, prog))
    sys.stderr.write("  {}.{}.ctl.txt\n".format(name, prog))
    if isinstance(traits_df, pd.DataFrame):
        sys.stderr.write("  {}.{}.traits.txt\n".format(name, prog))

    ## return the ctl file string
    return os.path.abspath("{}.{}.ctl.txt".format(OPJ(wdir, name), prog))
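A minimal call sketch for loci2bpp() under assumed inputs: the imap, guidetree, and file path below are placeholders, and only a few of the optional parameters are shown.

imap = {"A": ["a1", "a2"], "B": ["b1", "b2"], "C": ["c1"]}
tree = "((A,B),C);"
ctl = loci2bpp(
    name="test",
    locifile="test_outfiles/test.loci",
    imap=imap,
    guidetree=tree,
    infer_delimit=1,
    delimit_alg=(0, 5),
    burnin=1000,
    nsample=10000,
)
## 'ctl' is the path to the generated .ctl file, which is handed to bpp/ibpp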
Example #17
def main():
    """ main function """

    ## parse params file input (returns to stdout if --help or --version)
    args = parse_command_line()
    print(HEADER.format(ip.__version__))

    ## set random seed
    np.random.seed(args.rseed)

    ## debugger----------------------------------------
    if os.path.exists(ip.__debugflag__):
        os.remove(ip.__debugflag__)
    if args.debug:
        print("\n  ** Enabling debug mode ** ")
        ip._debug_on()

    ## if JSON, load existing Tetrad analysis -----------------------
    if args.json:
        data = ipa.tetrad(name=args.name, workdir=args.workdir, load=True)
        ## if force then remove all results
        if args.force:
            data._refresh()

    ## else create a new tmp assembly for the seqarray-----------------
    else:
        ## create new Tetrad class Object if it doesn't exist
        newjson = os.path.join(args.workdir, args.name + '.tet.json')
        ## if not quiet...
        print("tetrad instance: {}".format(args.name))

        if (not os.path.exists(newjson)) or args.force:
            ## purge any files associated with this name if forced
            if args.force:
                ## init an object in the correct location just to refresh
                ipa.tetrad(name=args.name,
                           workdir=args.workdir,
                           data=args.seq,
                           initarr=False,
                           save_invariants=args.invariants,
                           cli=True,
                           quiet=True)._refresh()

            ## create new tetrad object
            data = ipa.tetrad(
                name=args.name,
                workdir=args.workdir,
                method=args.method,
                data=args.seq,
                resolve=args.resolve,
                mapfile=args.map,
                guidetree=args.tree,
                nboots=args.boots,
                nquartets=args.nquartets,
                cli=True,
                save_invariants=args.invariants,
            )
        else:
            raise SystemExit(QUARTET_EXISTS\
            .format(args.name, args.workdir, args.workdir, args.name, args.name))

    ## boots can be set either for a new object or loaded JSON to continue it
    if args.boots:
        data.params.nboots = int(args.boots)

    ## if ipyclient is running (and matched profile) then use that one
    if args.ipcluster:
        ipyclient = ipp.Client(profile=args.ipcluster)
        data._ipcluster["cores"] = len(ipyclient)

    ## if not then we need to register and launch an ipcluster instance
    else:
        ## set CLI ipcluster terms
        ipyclient = None
        data._ipcluster["cores"] = args.cores if args.cores else detect_cpus()
        data._ipcluster["engines"] = "Local"
        if args.MPI:
            data._ipcluster["engines"] = "MPI"
            if not args.cores:
                raise IPyradWarningExit("must provide -c argument with --MPI")
        ## register to have a cluster-id with "ip- name"
        data = register_ipcluster(data)

    ## message about whether we are continuing from existing
    if data.checkpoint.boots:
        print(
            LOADING_MESSAGE.format(data.name, data.params.method,
                                   data.checkpoint.boots))

    ## run tetrad main function within a wrapper. The wrapper creates an
    ## ipyclient view and appends to the list of arguments to run 'run'.
    data.run(force=args.force, ipyclient=ipyclient)
Example #18
import copy
import itertools
import subprocess
import numpy as np

from collections import Counter
from ipyrad.assemble.util import DUCT, IPyradWarningExit

try:
    ## when you have time go back and set attributes on toytrees
    from toytree import ete3mini as ete
except ImportError:
    raise IPyradWarningExit("""
    Error: bpp requires the dependency 'toytree', which we haven't yet
    included in the ipyrad installation. For now, you can install toytree
    using conda with the following command: 

    conda install toytree -c eaton-lab
    """)


class Bpp(object):
    """
    BPP analysis utility function for creating input files, setting parameters, 
    and submitting bpp jobs to run on a parallel cluster. Converts loci 
    file format data to bpp file format, i.e., concatenated phylip-like
    format, and produces imap and ctl input files for bpp. The main 
    functions are 'write_bpp_files()' and 'run()'.

    Parameters:
    -----------
Example #19
    def __init__(self,
                 name,
                 data=None,
                 workdir="analysis-bpp",
                 guidetree=None,
                 imap=None,
                 *args,
                 **kwargs):

        ## path attributes
        self.name = name
        self.asyncs = []
        self._kwargs = {
            "maxloci": None,
            "minmap": None,
            "minsnps": 0,
            "infer_sptree": 0,
            "infer_delimit": 0,
            "delimit_alg": (0, 5),
            "seed": 12345,
            "burnin": 1000,
            "nsample": 10000,
            "sampfreq": 2,
            "thetaprior": (2, 2000),
            "tauprior": (2, 2000, 1),
            "usedata": 1,
            "cleandata": 0,
            "finetune": (0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01),
            "copied": False,
        }
        self._kwargs.update(kwargs)

        ## support for legacy args
        if self._kwargs.get("locifile"):
            data = self._kwargs.get("locifile")
        if not data:
            raise IPyradWarningExit(
                "must enter a 'data' argument (an ipyrad .loci file).")

        ## set the guidetree
        if not guidetree:
            raise IPyradWarningExit(
                "must enter a 'guidetree' argument (a newick file or string).")
        self.tree = ete.Tree(guidetree)

        ## check workdir
        if workdir:
            self.workdir = os.path.abspath(os.path.expanduser(workdir))
        else:
            self.workdir = os.path.join(os.path.curdir, "analysis-bpp")
        if not os.path.exists(self.workdir):
            os.makedirs(self.workdir)

        ## parsing imap dictionary, or create simple 1-1 mapping
        if not imap:
            self.imap = {i: [i] for i in self.tree.get_leaf_names()}
        else:
            self.imap = {}
            for key, val in imap.items():
                if isinstance(val, (int, str)):
                    self.imap[key] = [str(val)]
                elif isinstance(val, list):
                    self.imap[key] = val
                else:
                    raise IPyradWarningExit(
                        "imap dictionary is not properly formatted")

        ## update stats if alleles instead of loci
        if not self._kwargs["minmap"]:
            self._kwargs["minmap"] = {i: 1 for i in self.tree.get_leaf_names()}

        if ('.alleles.loci' in data) and (not self._kwargs['copied']):
            ## add 0/1 to names
            keys = self.imap.keys()
            for key in keys:
                oldvals = self.imap[key]
                newvals = []
                for val in oldvals:
                    newvals += [val + "_0", val + "_1"]
                self.imap[key] = newvals

            ## double the minmap (copied attribute protects from double 2X)
            self._kwargs["minmap"] = \
                {key: val*2 for key, val in self._kwargs['minmap'].items()}

        ## checks
        assert isinstance(self.imap, dict), "you must enter an IMAP dictionary"
        assert set(self.imap.keys()) == set(self.tree.get_leaf_names()), \
               "IMAP keys must match guidetree names: \n{}\n{}"\
               .format(self.imap.keys(), self.tree.get_leaf_names())

        ## filters
        self.filters = Params()
        self.filters.minmap = self._kwargs["minmap"]
        self.filters.maxloci = self._kwargs["maxloci"]
        self.filters.minsnps = self._kwargs["minsnps"]

        ## set bpp parameters with defaults
        self.params = Params()
        notparams = set(["workdir", "maxloci", "minmap", "minsnps", "copied"])
        for key in set(self._kwargs.keys()) - notparams:
            self.params[key] = self._kwargs[key]

        ## results files
        self.files = Params()
        self.files.data = data
        self.files.mcmcfiles = []
        self.files.outfiles = []
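An instantiation sketch for the Bpp object above; the loci file path, guidetree, and imap are placeholders matching the constructor's expectations (imap keys must match the guidetree tip names).

b = Bpp(
    name="test",
    data="test_outfiles/test.alleles.loci",
    guidetree="((A,B),C);",
    imap={"A": ["a1", "a2"], "B": ["b1", "b2"], "C": ["c1", "c2"]},
)
## the main entry points noted in the class docstring are then
## b.write_bpp_files() and b.run()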
Example #20
def batch(
    baba,
    ipyclient=None,
):
    """
    distributes jobs to the parallel client
    """

    ## parse args
    handle = baba.data
    taxdicts = baba.tests
    mindicts = baba.params.mincov
    nboots = baba.params.nboots

    ## if ms generator make into reusable list
    sims = 0
    if isinstance(handle, types.GeneratorType):
        handle = list(handle)
        sims = 1
    else:
        ## expand locifile path to full path
        handle = os.path.realpath(handle)

    ## parse taxdicts into names and lists if it is a dictionary
    #if isinstance(taxdicts, dict):
    #    names, taxdicts = taxdicts.keys(), taxdicts.values()
    #else:
    #    names = []
    names = []
    if isinstance(taxdicts, dict):
        taxdicts = [taxdicts]

    ## an array to hold results (len(taxdicts), nboots)
    tot = len(taxdicts)
    resarr = np.zeros((tot, 7), dtype=np.float64)
    bootsarr = np.zeros((tot, nboots), dtype=np.float64)
    paneldict = {}

    ## TODO: Setup a wrapper to find and cleanup ipyclient
    ## define the function and parallelization to use,
    ## if no ipyclient then drops back to using multiprocessing.
    if not ipyclient:
        # ipyclient = ip.core.parallel.get_client(**self._ipcluster)
        raise IPyradError("you must enter an ipyparallel.Client() object")
    else:
        lbview = ipyclient.load_balanced_view()

    ## submit jobs to run on the cluster queue
    start = time.time()
    asyncs = {}
    idx = 0

    ## prepare data before sending to engines
    ## if it's a str (locifile) then parse it here just once.
    if isinstance(handle, str):
        with open(handle, 'r') as infile:
            loci = infile.read().strip().split("|\n")
    if isinstance(handle, list):
        pass  #sims()

    ## iterate over tests (repeats mindicts if fewer than taxdicts)
    itests = iter(taxdicts)
    imdict = itertools.cycle([mindicts])

    #for test, mindict in zip(taxdicts, itertools.cycle([mindicts])):
    for i in xrange(len(ipyclient)):

        ## get the next entries; if there are fewer tests than engines, skip
        try:
            test = next(itests)
            mindict = next(imdict)
        except StopIteration:
            continue

        ## if it's sim data then convert to an array
        if sims:
            loci = _msp_to_arr(handle, test)
            args = (loci, test, mindict, nboots)
            print("not yet implemented")
            #asyncs[idx] = lbview.apply_async(dstat, *args)
        else:
            args = [loci, test, mindict, nboots]
            asyncs[idx] = lbview.apply(dstat, *args)
        idx += 1

    ## block until finished, print progress if requested.
    finished = 0
    try:
        while 1:
            keys = [i for (i, j) in asyncs.items() if j.ready()]
            ## check for failures
            for job in keys:
                if not asyncs[job].successful():
                    raise IPyradWarningExit(\
                        " error: {}: {}".format(job, asyncs[job].exception()))
                ## enter results for successful jobs
                else:
                    _res, _bot = asyncs[job].result()

                    ## store D4 results
                    if _res.shape[0] == 1:
                        resarr[job] = _res.T.as_matrix()[:, 0]
                        bootsarr[job] = _bot

                    ## or store D5 results
                    else:
                        paneldict[job] = _res.T

                    ## remove old job
                    del asyncs[job]
                    finished += 1

                    ## submit next job if there is one.
                    try:
                        test = next(itests)
                        mindict = next(imdict)
                        if sims:
                            loci = _msp_to_arr(handle, test)
                            args = (loci, test, mindict, nboots)
                            print("not yet implemented")
                            #asyncs[idx] = lbview.apply_async(dstat, *args)
                        else:
                            args = [loci, test, mindict, nboots]
                            asyncs[idx] = lbview.apply(dstat, *args)
                        idx += 1
                    except StopIteration:
                        pass

            ## count finished and break if all are done.
            #fin = idx - len(asyncs)
            elap = datetime.timedelta(seconds=int(time.time() - start))
            printstr = " calculating D-stats  | {} | "
            progressbar(tot, finished, printstr.format(elap), spacer="")
            time.sleep(0.1)
            if not asyncs:
                print("")
                break

    except KeyboardInterrupt as inst:
        ## cancel all jobs (ipy & multiproc modes) and then raise error
        try:
            ipyclient.abort()
        except Exception:
            pass
        raise inst

    ## dress up resarr as a Pandas DataFrame if 4-part test
    if len(test) == 4:
        if not names:
            names = range(len(taxdicts))
        #print("resarr")
        #print(resarr)
        resarr = pd.DataFrame(resarr,
                              index=names,
                              columns=[
                                  "dstat", "bootmean", "bootstd", "Z", "ABBA",
                                  "BABA", "nloci"
                              ])

        ## sort results and bootsarr to match if test names were supplied
        resarr = resarr.sort_index()
        order = [list(resarr.index).index(i) for i in names]
        bootsarr = bootsarr[order]
        return resarr, bootsarr
    else:
        ## order results dfs
        listres = []
        for key in range(len(paneldict)):
            listres.append(paneldict[key])

        ## make into a multi-index dataframe
        ntests = len(paneldict)
        multi_index = [
            np.array([[i] * 3 for i in range(ntests)]).flatten(),
            np.array(['p3', 'p4', 'shared'] * ntests),
        ]
        resarr = pd.DataFrame(
            data=pd.concat(listres).as_matrix(),
            index=multi_index,
            columns=listres[0].columns,
        )
        return resarr, None
Example #21
File: pca.py Project: tle003/ipyrad
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import pandas as pd
import numpy as np
import itertools
import copy
import os

try:
    ## scikit-allel is required for PCA; if missing, point the user to conda
    import allel
except ImportError:
    raise IPyradWarningExit("""
    Error: pca requires the dependency 'scikit-allel', which we haven't yet
    included in the ipyrad installation. For now, you can install scikit-allel
    using conda with the following command: 

    conda install scikit-allel -c conda-forge
    """)


## set floating point precision in data frames to 3 for prettier printing
pd.set_option('precision', 3)


class PCA(object):
    "new pca class object"
    def __init__(self, 
        data=None, 
        pops=None,
        ncomps=10,
Example #22
def write_ctl(name, imap, guidetree, nloci, infer_sptree, infer_delimit,
              delimit_alg, seed, burnin, nsample, sampfreq, thetaprior,
              tauprior, traits_df, nu0, kappa0, cleandata, useseqdata,
              usetraitdata, wdir, finetune, verbose):
    """ write outfile with any args in argdict """

    ## a list to store ctl file lines
    ctl = []

    ## check the tree (can do this better once we install ete3 w/ ipyrad)
    if not guidetree.endswith(";"):
        guidetree += ";"

    ## if traits_df then we make '.ibpp' files
    prog = 'bpp'
    if isinstance(traits_df, pd.DataFrame):
        prog = 'ibpp'

    ## write the top header info
    ctl.append("seed = {}".format(seed))
    ctl.append("seqfile = {}.{}.seq.txt".format(OPJ(wdir, name), prog))
    ctl.append("Imapfile = {}.{}.imap.txt".format(OPJ(wdir, name), prog))
    ctl.append("mcmcfile = {}.{}.mcmc.txt".format(OPJ(wdir, name), prog))
    ctl.append("outfile = {}.{}.out.txt".format(OPJ(wdir, name), prog))
    if isinstance(traits_df, pd.DataFrame):
        ctl.append("traitfile = {}.{}.traits.txt".format(
            OPJ(wdir, name), prog))

    ## number of loci (checks that seq file exists and parses from there)
    ctl.append("nloci = {}".format(nloci))
    ctl.append("usedata = {}".format(useseqdata))
    ctl.append("cleandata = {}".format(cleandata))

    ## infer species tree
    if infer_sptree:
        ctl.append("speciestree = 1 0.4 0.2 0.1")
    else:
        ctl.append("speciestree = 0")

    ## infer delimitation (with algorithm 1 by default)
    ctl.append("speciesdelimitation = {} {} {}"\
               .format(infer_delimit, delimit_alg[0],
                       " ".join([str(i) for i in delimit_alg[1:]])))

    ## if using iBPP (if no traits_df is given, we assume you're using bpp v.3.3+)
    if isinstance(traits_df, pd.DataFrame):
        ## check that the data frame is properly formatted
        try:
            traits_df.values.astype(float)
        except Exception:
            raise IPyradWarningExit(PDREAD_ERROR)

        ## subsample to keep only samples that are in IMAP; we do not need to
        ## standardize traits b/c ibpp does that for us.
        samples = sorted(list(itertools.chain(*imap.values())))
        didx = [list(traits_df.index).index(i) for i in traits_df.index \
                if i not in samples]
        dtraits = traits_df.drop(traits_df.index[didx])

        ## mean standardize traits values after excluding samples
        straits = dtraits.apply(lambda x: (x - x.mean()) / (x.std()))

        ## convert NaN to "NA" (the missing-data code ibpp expects) and write to file
        ftraits = straits.fillna("NA")
        traitdict = ftraits.T.to_dict("list")

        ## get reverse imap dict
        rev = {val: key for key in sorted(imap) for val in imap[key]}

        ## write trait file
        traitfile = "{}.{}.traits.txt".format(os.path.join(wdir, name), prog)
        with open(traitfile, 'w') as tout:
            tout.write("Indiv\n")
            tout.write("\t".join(['Species'] + list(ftraits.columns)) + "\n")
            nindT = 0
            for ikey in sorted(imap.keys()):
                samps = imap[ikey]
                for samp in sorted(samps):
                    if samp in traitdict:
                        tout.write("\t".join([samp, rev[samp]] + \
                            [str(i) for i in traitdict[samp]])+"\n"
                        )
                        nindT += 1

        ## write ntraits and nindT and traitfilename
        ctl.append("ntraits = {}".format(traits_df.shape[1]))
        ctl.append("nindT = {}".format(nindT))  #traits_df.shape[0]))
        ctl.append("usetraitdata = {}".format(usetraitdata))
        ctl.append("useseqdata = {}".format(useseqdata))

        ## trait priors
        ctl.append("nu0 = {}".format(nu0))
        ctl.append("kappa0 = {}".format(kappa0))

        ## remove ibpp-incompatible options; match the exact strings appended above
        ctl.remove("usedata = {}".format(useseqdata))
        if infer_sptree:
            ctl.remove("speciestree = 1 0.4 0.2 0.1")
        else:
            ctl.remove("speciestree = 0")

    ## get tree values
    nspecies = str(len(imap))
    species = " ".join(sorted(imap))
    ninds = " ".join([str(len(imap[i])) for i in sorted(imap)])

    ## write the tree
    ctl.append("""\
species&tree = {} {}
                 {}
                 {}""".format(nspecies, species, ninds, guidetree))

    ## priors
    ctl.append("thetaprior = {} {}".format(*thetaprior))
    ctl.append("tauprior = {} {} {}".format(*tauprior))

    ## other values, fixed for now
    ctl.append("finetune = 1: {}".format(" ".join([str(i) for i in finetune])))
    #CTL.append("finetune = 1: 1 0.002 0.01 0.01 0.02 0.005 1.0")
    ctl.append("print = 1 0 0 0")
    ctl.append("burnin = {}".format(burnin))
    ctl.append("sampfreq = {}".format(sampfreq))
    ctl.append("nsample = {}".format(nsample))

    ## write out the ctl file
    with open("{}.{}.ctl.txt".format(OPJ(wdir, name), prog), 'w') as out:
        out.write("\n".join(ctl))

    ## if verbose print ctl
    if verbose:
        sys.stderr.write("ctl file\n--------\n" + "\n".join(ctl) +
                         "\n--------\n\n")
Example No. 23
0
def loci2cf(name, locifile, popdict, wdir=None, ipyclient=None):
    """ 
    Convert ipyrad .loci file to an iqtree-pomo 'counts' file

    Parameters:
    -----------
    name:
        A prefix name for output files that will be produced
    locifile:
        A .loci file produced by ipyrad.
    popdict: 
        A python dictionary grouping Clade names to Sample names. 
        Example: {"A": ['a', 'b', 'c'], "B": ['d', 'e', 'f']}
    wdir:
        Output directory for the .cf file. Defaults to the current directory.
    ipyclient:
        If you pass an ipyclient then work is distributed over remote
        engines; otherwise multiprocessing is used (todo).
    """

    ## working directory, make sure it exists
    if wdir:
        wdir = os.path.abspath(wdir)
        if not os.path.exists(wdir):
            raise IPyradWarningExit(" working directory (wdir) does not exist")
    else:
        wdir = os.path.curdir

    ## output file path
    name = name.rsplit(".cf")[0]
    outfile = os.path.join(wdir, "{}.cf".format(name))
    out = open(outfile, 'w')

    ## parse loci file
    with open(locifile) as inloc:
        loci = inloc.read().strip().split("|\n")

    ## get all names
    names = list(itertools.chain(*popdict.values()))
    popkeys = sorted(popdict.keys())

    ## count nsites
    nsites = sum(len(loc.split("\n")[0].split()[1]) for loc in loci[:])

    ## print the header
    out.write(
        HEADER.format(**{
            "NPOP": len(popdict),
            "NSITES": nsites,
            "VTAXA": "\t".join(popkeys)
        }))

    ## build print string
    outstr = "chr{:<8}  {:<4}  "
    for cidx in range(len(popkeys)):
        outstr += "{:<8}  "

    toprint = []
    for idx in range(len(loci)):
        dat = loci[idx].split("\n")
        seqs = np.array([list(i.split()[1]) for i in dat[:-1]])
        names = [i.split()[0] for i in dat[:-1]]
        data = np.zeros((seqs.shape[1], len(popkeys), 4), dtype=np.uint16)

        for sidx in range(seqs.shape[1]):
            for cidx in range(len(popkeys)):
                for name in popdict[popkeys[cidx]]:
                    if name in names:
                        base = seqs[names.index(name), sidx]
                        if base in list("ACGT"):
                            data[sidx, cidx, BASE2IDX[base]] += 2
                        elif base in list("RSYMKW"):
                            base1, base2 = AMBIGS[base]
                            data[sidx, cidx, BASE2IDX[base1]] += 1
                            data[sidx, cidx, BASE2IDX[base2]] += 1

            ## print string for one locus
            sdat = [",".join(str(j) for j in counts.tolist()) for counts in data[sidx]]
            toprint.append(outstr.format(idx + 1, sidx + 1, *sdat))

        ## write and clear the buffer every 10K loci
        if not idx % 10000:
            out.write("\n".join(toprint) + "\n")
            toprint = []

    ## close handle
    out.write("\n".join(toprint) + "\n")
    out.close()
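A short usage sketch for loci2cf (the paths and population dictionary below are placeholders, not values from the source); note that wdir must already exist or an IPyradWarningExit is raised:

## hypothetical inputs; clade assignments and paths are placeholders
popdict = {"A": ["a1", "a2", "a3"], "B": ["b1", "b2", "b3"]}
loci2cf(
    name="test",
    locifile="./analysis-ipyrad/data_outfiles/data.loci",   # placeholder path
    popdict=popdict,
    wdir="./analysis-pomo",                                  # must already exist
)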
Example No. 24
0
                        break
                self._report(tots)

                ## check for fails ('async' is a reserved word in Python 3.7+)
                for rasync in asyncs:
                    if not rasync.successful():
                        raise IPyradWarningExit(rasync.result())

            else:
                self._accession = self.accession
                _call_fastq_dump_on_SRRs(self)
                self._report(1)

        except KeyboardInterrupt as inst:
            if ipyclient:
                raise IPyradWarningExit("interrupted -- ipcluster shutdown")
            else:
                raise IPyradWarningExit("interrupted")

        finally:
            ## ipyclient may be None if jobs were run locally
            if ipyclient:
                if not ipyclient.outstanding:
                    ipyclient.purge_everything()
                else:
                    ## nanny: kill the engines left running, report kill.
                    ipyclient.shutdown(hub=True, block=False)
                    ipyclient.close()
                    print("\nwarning: ipcluster shutdown and must be restarted")

    def _run(self, force=False, ipyclient=None):
        """
        Download the accessions into the designated workdir.
Example No. 25
0
    def write_nexus_files(self, force=False, quiet=False):
        """
        Write nexus files to {workdir}/{name}/[0-N].nex. If the directory already
        exists an exception will be raised unless you use the force flag, which
        will remove all existing files in the directory.

        Parameters:
        -----------
        force (bool):
            If True then all files in {workdir}/{name}/*.nex* will be removed. 

        """

        ## clear existing files
        existing = glob.glob(os.path.join(self.workdir, self.name, "*.nex"))
        if any(existing):
            if force:
                for rfile in existing:
                    os.remove(rfile)
            else:
                path = os.path.join(self.workdir, self.name)
                raise IPyradWarningExit(EXISTING_NEX_FILES.format(path))

        ## parse the loci or alleles file
        with open(self.files.data) as infile:
            loci = iter(infile.read().strip().split("|\n"))

        ## use entered samples or parse them from the file
        if not self.samples:
            with open(self.files.data) as infile:
                samples = set((i.split()[0] for i in infile.readlines() \
                               if "//" not in i))
        else:
            samples = set(self.samples)

        ## keep track of how many loci pass filtering
        totn = len(samples)
        nloci = 0

        ## this set is only used for name matching; the actual allele is
        ## randomly re-sampled within each locus below so that it varies
        if self._alleles:
            msamples = {i + rbin() for i in samples}
        else:
            msamples = samples

        ## write subsampled set of loci
        for loc in loci:
            ## get names and seqs from locus
            dat = loc.split("\n")[:-1]
            try:
                names = [i.split()[0] for i in dat]
                snames = set(names)
                seqs = np.array([list(i.split()[1]) for i in dat])
            except IndexError:
                print(ALLELESBUGFIXED)
                continue

            ## check name matches
            if len(snames.intersection(msamples)) == totn:

                ## prune sample names if alleles. Done here so it is randomly
                ## different in every locus which allele is selected from
                ## each sample (e.g., 0 or 1)
                if self._alleles:
                    _samples = [i + rbin() for i in samples]
                else:
                    _samples = samples

                ## re-order seqs to be in set order
                seqsamp = seqs[[names.index(tax) for tax in _samples]]

                ## resolve ambiguities randomly if .loci file otherwise
                ## sample one of the alleles if .alleles file.
                if not self._alleles:
                    seqsamp = _resolveambig(seqsamp)

                ## find parsimony informative sites
                if _count_PIS(seqsamp, self.params.minsnps):
                    ## keep the locus
                    nloci += 1

                    ## remove empty columns given this sampling
                    copied = seqsamp.copy()
                    copied[copied == "-"] = "N"
                    rmcol = np.all(copied == "N", axis=0)
                    seqsamp = seqsamp[:, ~rmcol]

                    ## write nexus file
                    if self._alleles:
                        ## trim off the allele number
                        samps = [i.rsplit("_", 1)[0] for i in _samples]
                        mdict = dict(
                            zip(samps, [i.tostring() for i in seqsamp]))
                    else:
                        mdict = dict(
                            zip(_samples, [i.tostring() for i in seqsamp]))
                    self._write_nex(mdict, nloci)

                    ## quit early if using maxloci
                    if nloci == self.params.maxloci:
                        break

        ## print data size
        if not quiet:
            path = os.path.join(self.workdir, self.name)
            path = path.replace(os.path.expanduser("~"), "~")
            print("wrote {} nexus files to {}".format(nloci, path))
Example No. 26
0
def get_client(cluster_id, profile, engines, timeout, cores, quiet, spacer,
               **kwargs):
    """ 
    Creates a client to view ipcluster engines for a given profile and 
    returns it with at least one engine spun up and ready to go. If no 
    engines are found after nwait amount of time then an error is raised.
    If engines==MPI it waits a bit longer to find engines. If the number
    of engines is set then it waits even longer to try to find that number
    of engines.
    """

    ## save the std streams; we temporarily redirect them to suppress external printing
    save_stdout = sys.stdout
    save_stderr = sys.stderr
    sys.stdout = cStringIO.StringIO()
    sys.stderr = cStringIO.StringIO()

    ## get cluster_info print string
    connection_string = "{}establishing parallel connection:".format(spacer)

    ## wrapped search for ipcluster
    try:
        ## are we looking for a running ipcluster instance?
        if profile not in [None, "default"]:
            args = {'profile': profile, "timeout": timeout}
        else:
            clusterargs = [cluster_id, profile, timeout]
            argnames = ["cluster_id", "profile", "timeout"]
            args = {key: value for key, value in zip(argnames, clusterargs)}

        ## get connection within timeout window of wait time and hide messages
        ipyclient = ipp.Client(**args)
        sys.stdout = save_stdout
        sys.stderr = save_stderr

        ## check that all engines have connected
        if (engines == "MPI") or ("ipyrad-cli-" in cluster_id):
            if not quiet:
                print(connection_string)

        for _ in range(6000):
            initid = len(ipyclient)
            time.sleep(0.01)
            ## If MPI then wait for all engines to start so we can report
            ## how many cores are on each host. If Local then only wait for
            ## one engine to be ready and then just go.
            if (engines == "MPI") or ("ipyrad-cli-" in cluster_id):
                ## wait for cores to be connected
                if cores:
                    time.sleep(0.1)
                    if initid == cores:
                        break
                if initid:
                    time.sleep(3)
                    if len(ipyclient) == initid:
                        break
            else:
                if cores:
                    if initid == cores:
                        break
                else:
                    if initid:
                        break

    except KeyboardInterrupt as inst:
        ## ensure stdout is reset even if Exception was raised
        sys.stdout = save_stdout
        sys.stderr = save_stderr
        raise inst

    ## This is raised if ipcluster is not running ------------
    except IOError as inst:
        ## ensure stdout is reset even if Exception was raised
        sys.stdout = save_stdout
        sys.stderr = save_stderr
        if "ipyrad-cli-" in cluster_id:
            raise IPyradWarningExit(NO_IPCLUSTER_API)
        else:
            raise IPyradWarningExit(NO_IPCLUSTER_CLI)

    except (ipp.TimeoutError, ipp.NoEnginesRegistered) as inst:
        ## raised by ipp if no connection file is found for 'nwait' seconds
        sys.stdout = save_stdout
        sys.stderr = save_stderr
        raise inst

    except Exception as inst:
        ## if any other exceptions were missed...
        sys.stdout = save_stdout
        sys.stderr = save_stderr
        raise inst

    finally:
        ## ensure that no matter what we reset the stds
        sys.stdout = save_stdout
        sys.stderr = save_stderr

    return ipyclient
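A hedged example of calling get_client; the argument values are placeholders, and an ipcluster instance must already be running for the connection to succeed:

## placeholder arguments; a local ipcluster instance must already be running
ipyclient = get_client(
    cluster_id="",      # default connection file
    profile=None,
    engines="Local",
    timeout=60,
    cores=4,
    quiet=False,
    spacer="  ",
)
print(len(ipyclient), "engines connected")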
Example No. 27
0
    def _submit_jobs(self, force, ipyclient, name_fields, name_separator,
                     dry_run):
        """
        Download the accessions into the designated workdir.
        If a file already exists it will only be overwritten if
        force=True. Temporary files are removed.
        """

        ## get Run data with default fields (1,4,6,30)
        df = self.fetch_runinfo(range(31), quiet=True)
        sys.stdout.flush()

        ## if not ipyclient then use multiprocessing
        if ipyclient:
            lb = ipyclient.load_balanced_view()

        ## if Run has samples with same name (replicates) then
        ## we need to include the accessions in the file names
        if name_fields:
            ## indexing requires -1 ints
            fields = [int(i) - 1 for i in fields_checker(name_fields)]
            ## make accession names, no spaces allowed
            df['Accession'] = pd.Series(df[df.columns[fields[0]]],
                                        index=df.index)
            for field in fields[1:]:
                df.Accession += name_separator + df[df.columns[field]]
            df.Accession = [i.replace(" ", "_") for i in df.Accession]
            ## check that names are unique
            if not df.Accession.shape[0] == df.Accession.unique().shape[0]:
                raise IPyradWarningExit("names are not unique:\n{}"\
                    .format(df.Accession))

        ## backup default naming scheme
        else:
            if len(set(df.SampleName)) != len(df.SampleName):
                ## append the Run id so replicate sample names are unique
                df['Accession'] = [
                    i + "-" + j for i, j in zip(df.SampleName, df.Run)]
            else:
                df['Accession'] = df.SampleName

        if dry_run:
            print("\rThe following files will be written to: {}".format(
                self.workdir))
            print("{}\n".format(df.Accession))
        else:
            ## iterate over and download
            asyncs = []
            for idx in df.index:

                ## get args for this run
                srr = df.Run[idx]
                outname = df.Accession[idx]
                paired = df.spots_with_mates.values.astype(
                    int).nonzero()[0].any()
                fpath = os.path.join(self.workdir, outname + ".fastq.gz")

                ## skip if exists and not force
                skip = False
                if force:
                    if os.path.exists(fpath):
                        os.remove(fpath)
                else:
                    if os.path.exists(fpath):
                        skip = True
                        sys.stdout.flush()
                        print("[skip] file already exists: {}".format(fpath))

                ## single job progress bar
                tidx = df.Accession.shape[0]

                ## submit job to run
                if not skip:
                    args = (self, srr, outname, paired)
                    if ipyclient:
                        rasync = lb.apply_async(call_fastq_dump_on_SRRs, *args)
                        asyncs.append(rasync)
                    else:
                        print("Downloading file {}/{}: {}".format(
                            idx + 1, tidx, fpath))
                        call_fastq_dump_on_SRRs(*args)
                        sys.stdout.flush()

            ## progress bar while blocking parallel
            if ipyclient:
                tots = df.Accession.shape[0]
                printstr = " Downloading fastq files | {} | "
                start = time.time()
                while 1:
                    elapsed = datetime.timedelta(seconds=int(time.time() -
                                                             start))
                    ready = sum([i.ready() for i in asyncs])
                    progressbar(tots,
                                ready,
                                printstr.format(elapsed),
                                spacer="")
                    time.sleep(0.1)
                    if tots == ready:
                        print("")
                        break
                self._report(tots)

                ## check for fails ('async' is a reserved word in Python 3.7+)
                for rasync in asyncs:
                    if not rasync.successful():
                        raise IPyradWarningExit(rasync.result())
Example No. 28
0
def main():
    """ main function """

    ## parse params file input (returns to stdout if --help or --version)
    args = parse_command_line()
    print(HEADER.format(ip.__version__))

    ## set random seed
    np.random.seed(args.rseed)
    random.seed(args.rseed)

    ## debugger----------------------------------------
    if os.path.exists(ip.__debugflag__):
        os.remove(ip.__debugflag__)
    if args.debug:
        print("\n  ** Enabling debug mode ** ")
        ip.debug_on()
        atexit.register(ip.debug_off)

    ## if JSON, load existing Tetrad analysis -----------------------
    if args.json:
        #data = ipa.tetrad.load_json(args.json)
        data = ipa.tetrad(name=args.name, workdir=args.workdir, load=True)
        ## if force then remove all results
        if args.force:
            data.refresh()

    ## else create a new tmp assembly for the seqarray-----------------
    else:
        ## create new Tetrad class Object if it doesn't exist
        newjson = os.path.join(args.workdir, args.name + '.tet.json')
        if (not os.path.exists(newjson)) or args.force:
            ## purge any files associated with this name if forced
            if args.force:
                ipa.tetrad(name=args.name,
                           workdir=args.workdir,
                           seqfile=args.seq,
                           initarr=False,
                           quiet=True).refresh()

            ## create new tetrad object
            data = ipa.tetrad(
                name=args.name,
                workdir=args.workdir,
                method=args.method,
                seqfile=args.seq,
                resolve=args.resolve,
                mapfile=args.map,
                guidetreefile=args.tree,
                nboots=args.boots,
                nquartets=args.nquartets,
                cli=True,
            )
            ## if not quiet...
            print("tetrad instance: {}".format(args.name))

        else:
            raise SystemExit(QUARTET_EXISTS\
            .format(args.name, args.workdir, args.workdir, args.name, args.name))

    ## boots can be set either for a new object or loaded JSON to continue it
    if args.boots:
        data.nboots = int(args.boots)

    ## set CLI ipcluster terms
    data._ipcluster["cores"] = args.cores if args.cores else detect_cpus()

    ## if more ipcluster args from command-line then use those
    if args.MPI:
        data._ipcluster["engines"] = "MPI"
        if not args.cores:
            raise IPyradWarningExit("must provide -c argument with --MPI")
    else:
        data._ipcluster["engines"] = "Local"

    ## launch a NEW ipcluster instance and register "cluster_id"
    ## for later destruction, and to avoid conflicts between
    ## simultaneous ipcluster instances. If a user wanted to use
    ## an ipcluster instance that is already running instead then
    ## they have to use the API, or to have set args.ipcluster
    if args.ipcluster:
        data._ipcluster["cluster_id"] = ""
    else:
        data = register_ipcluster(data)

    ## message about whether we are continuing from existing
    if data.checkpoint.boots or data.checkpoint.arr:
        print(
            ipa.tetrad.LOADING_MESSAGE.format(data.name, data.method,
                                              data.checkpoint.boots,
                                              data.checkpoint.arr))

    ## run tetrad main function within a wrapper. The wrapper creates an
    ## ipyclient view and appends to the list of arguments to run 'run'.
    data.run(force=args.force)
Example No. 29
0
def parse_command_line():
    """ Parse CLI args. Only three options now. """

    ## create the parser
    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
  * Example command-line usage ---------------------------------------------- 

  * Read in sequence/SNP data file, provide linkage, output name, ambig option. 
     tetrad -s data.snps.phy -n test             ## input phylip and give name
     tetrad -s data.snps.phy -l data.snps.map    ## use one SNP per locus
     tetrad -s data.snps.phy -n noambigs -r 0    ## do not use hetero sites

  * Load saved/checkpointed analysis from '.tet.json' file, or force restart. 
     tetrad -j test.tet.json -b 100         ## continue 'test' until 100 boots
     tetrad -j test.tet.json -b 100 -f      ## force restart of 'test'

  * Sampling modes: 'equal' uses guide tree to sample quartets more efficiently 
     tetrad -s data.snps -m all                         ## sample all quartets
     tetrad -s data.snps -m random -q 1e6 -x 123        ## sample 1M randomly
     tetrad -s data.snps -m equal -q 1e6 -t guide.tre   ## sample 1M across tree

  * HPC optimization: Set -c to the number of nodes to improve efficiency
     tetrad -s data.phy -c 16               ## e.g., use 16 cores across 4 nodes

  * Documentation: http://ipyrad.readthedocs.org/en/latest/
    """)

    ## add arguments

    ## get version from ipyrad
    ipyversion = str(pkg_resources.get_distribution('ipyrad'))
    parser.add_argument('-v',
                        '--version',
                        action='version',
                        version="tetrad " + ipyversion.split()[1])

    parser.add_argument('-f',
                        "--force",
                        action='store_true',
                        help="force overwrite of existing data")

    #parser.add_argument('-q', "--quiet", action='store_true',
    #    help="do not print to stderror or stdout.")

    parser.add_argument(
        '-s',
        metavar="seq",
        dest="seq",
        type=str,
        default=None,
        help="path to input phylip file (SNPs of full sequence file)")

    parser.add_argument(
        '-j',
        metavar='json',
        dest="json",
        type=str,
        default=None,
        help="load checkpointed/saved analysis from JSON file.")

    parser.add_argument(
        '-m',
        metavar="method",
        dest="method",
        type=str,
        default="all",
        help="method for sampling quartets (all, random, or equal)")

    parser.add_argument('-q',
                        metavar="nquartets",
                        dest="nquartets",
                        type=int,
                        default=0,
                        help="number of quartets to sample (if not -m all)")

    parser.add_argument('-b',
                        metavar="boots",
                        dest="boots",
                        type=int,
                        default=0,
                        help="number of non-parametric bootstrap replicates")

    parser.add_argument(
        '-l',
        metavar="map_file",
        dest="map",
        type=str,
        default=None,
        help="map file of snp linkages (e.g., ipyrad .snps.map)")

    parser.add_argument('-r',
                        metavar="resolve",
                        dest='resolve',
                        type=int,
                        default=1,
                        help="randomly resolve heterozygous sites (default=1)")

    parser.add_argument('-n',
                        metavar="name",
                        dest="name",
                        type=str,
                        default="test",
                        help="output name prefix (default: 'test')")

    parser.add_argument(
        '-o',
        metavar="workdir",
        dest="workdir",
        type=str,
        default="./analysis-tetrad",
        help="output directory (default: creates ./analysis-tetrad)")

    parser.add_argument(
        '-t',
        metavar="starting_tree",
        dest="tree",
        type=str,
        default=None,
        help="newick file starting tree for equal splits sampling")

    parser.add_argument(
        "-c",
        metavar="CPUs/cores",
        dest="cores",
        type=int,
        default=0,
        help="setting n Nodes improves parallel efficiency on HPC")

    parser.add_argument(
        "-x",
        metavar="random_seed",
        dest="rseed",
        type=int,
        default=None,
        help="random seed for quartet sampling and/or bootstrapping")

    parser.add_argument(
        '-d',
        "--debug",
        action='store_true',
        help="print lots more info to debugger: ipyrad_log.txt.")

    parser.add_argument("--MPI",
                        action='store_true',
                        help="connect to parallel CPUs across multiple nodes")

    parser.add_argument(
        "--ipcluster",
        action='store_true',
        help="connect to ipcluster instance with profile=default")

    ## if no args then return help message
    if len(sys.argv) == 1:
        parser.print_help()
        sys.exit(1)

    ## parse args
    args = parser.parse_args()

    ## RAISE errors right away for some bad argument combinations:
    if args.method not in ["random", "equal", "all"]:
        raise IPyradWarningExit("  method argument (-m) must be one of "
                                '"all", "random", or "equal".\n')

    ## if 'random' require nquarts argument
    if args.method == 'random':
        if not args.nquartets:
            raise IPyradWarningExit(\
            "  Number of quartets (-q) is required with method = random\n")

    ## if 'equal' method require starting tree and nquarts
    if args.method == 'equal':
        raise IPyradWarningExit(\
            "  The equal sampling method is currently for developers only.\n")
        if not args.nquartets:
            raise IPyradWarningExit(\
            "  Number of quartets (-q) is required with method = equal\n")
        if not args.tree:
            raise IPyradWarningExit(\
            "  Input guide tree (-t) is required with method = equal\n")

    ## required args (check the parsed values, not just the key names)
    if not (args.seq or args.json):
        print("""
    Bad arguments: tetrad command must include at least one of (-s or -j) 
    """)
        parser.print_help()
        sys.exit(1)

    return args
Example No. 30
0
    def run(self,
            force=False,
            ipyclient=None,
            name_fields=30,
            name_separator="_",
            dry_run=False):
        """
        Download the accessions into the designated workdir.

        Parameters
        ----------
        force: (bool)
            If force=True then existing files with the same name
            will be overwritten. 

        ipyclient: (ipyparallel.Client)
            If provided, work will be distributed across a parallel
            client, otherwise download will be run on a single core.

        name_fields: (int, str)
            Provide the index of the name fields to be used as a prefix
            for fastq output files. The default is 30, which is the 
            SampleName field. Use sra.fetch_fields to see all available
            fields and their indices. A likely alternative is 1 (Run). 
            If multiple are listed then they will be joined by a "_" 
            character. For example (29,30) would yield something like:
            latin-name_sample-name (e.g., mus_musculus-NR10123).

        dry_run: (bool)
            If True then a table of file names that _would_ be downloaded
            will be shown, but the actual files will not be downloaded.
        """

        ## temporarily set directory for tmpfiles used by fastq-dump
        ## if this fails then just skip it.
        try:
            ## ensure output directory, also used as tmpdir
            if not os.path.exists(self.workdir):
                os.makedirs(self.workdir)

            ## get original directory for sra files
            ## probably /home/ncbi/public/sra by default.
            self._set_vdbconfig_path()

            ## register ipyclient for cleanup
            if ipyclient:
                self._ipcluster["pids"] = {}
                for eid in ipyclient.ids:
                    engine = ipyclient[eid]
                    if not engine.outstanding:
                        pid = engine.apply(os.getpid).get()
                        self._ipcluster["pids"][eid] = pid

            ## submit jobs to engines or local
            self._submit_jobs(
                force=force,
                ipyclient=ipyclient,
                name_fields=name_fields,
                name_separator=name_separator,
                dry_run=dry_run,
            )

        except IPyradWarningExit as inst:
            print(inst)
        ## exceptions to catch, cleanup and handle ipyclient interrupts
        except KeyboardInterrupt:
            print("keyboard interrupt...")
        except Exception as inst:
            print("Exception in run() - {}".format(inst))
        finally:
            ## reset working sra path
            self._restore_vdbconfig_path()

            ## if it made a new sra directory then it should be empty when
            ## we are finished if all .sra files were removed. If so, then
            ## let's also remove the dir. if not empty, leave it.
            sradir = os.path.join(self.workdir, "sra")
            if os.path.exists(sradir) and (not os.listdir(sradir)):
                shutil.rmtree(sradir)
            else:
                ## print warning
                try:
                    print(FAILED_DOWNLOAD.format(os.listdir(sradir)))
                except OSError as inst:
                    ## If sra dir doesn't even exist something very bad is broken.
                    raise IPyradWarningExit("Download failed. Exiting.")
                ## remove fastq file matching to cached sra file
                for srr in os.listdir(sradir):
                    isrr = srr.split(".")[0]
                    ipath = os.path.join(self.workdir, "*_{}*.gz".format(isrr))
                    ifile = glob.glob(ipath)[0]
                    if os.path.exists(ifile):
                        os.remove(ifile)
                ## remove cache of sra files
                shutil.rmtree(sradir)

            ## cleanup ipcluster shutdown
            if ipyclient:
                ## send SIGINT (2) to all engines still running tasks
                try:
                    ipyclient.abort()
                    time.sleep(0.5)
                    for engine_id, pid in self._ipcluster["pids"].items():
                        if ipyclient.queue_status()[engine_id]["tasks"]:
                            os.kill(pid, 2)
                        time.sleep(0.1)
                except ipp.NoEnginesRegistered:
                    pass
                ## clean memory space
                if not ipyclient.outstanding:
                    ipyclient.purge_everything()
                ## uh oh, kill everything, something bad happened
                else:
                    ipyclient.shutdown(hub=True, block=False)
                    ipyclient.close()
                    print(
                        "\nwarning: ipcluster shutdown and must be restarted")