def run(self, steps=None, ipyclient=None, force=False, quiet=False):
    """
    Submits an ordered list of jobs to a load-balancer to complete the
    following tasks, and reports a progress bar:
    (1) Write nexus files for each locus
    (2) Run mrBayes on each locus to get a posterior of gene trees
    (3) Run mbsum (a bucky tool) on the posterior set of trees
    (4) Run Bucky on the summarized set of trees for all alpha values.

    Parameters:
    -----------
    ipyclient (ipyparallel.Client())
        A connected ipyparallel Client object used to distribute jobs.
    force (bool):
        Whether to overwrite existing files with the same name and workdir
        if they exist. Default is False.
    quiet (bool):
        Whether to suppress progress information. Default is False.
    steps (list):
        A list of integers of steps to perform. This is useful if a job
        was interrupted, or you created a new bucky object copy, or you
        wish to run an analysis under a new set of parameters after having
        run it once. For example, if you finished running steps 1 and 2
        (write nexus files and infer mrbayes posteriors), but you want to
        rerun steps 3 and 4 with new settings, then you could enter
        `steps=[3,4]` and also `force=True` to run steps 3 and 4 with a
        new set of parameters. Default argument is None, which means run
        all steps.
    """
    ## require ipyclient
    if not ipyclient:
        raise IPyradWarningExit("an ipyclient object is required")

    ## check the steps argument
    if not steps:
        steps = [1, 2, 3, 4]
    if isinstance(steps, str):
        steps = [int(i) for i in steps]
    if isinstance(steps, list):
        if not all(isinstance(i, int) for i in steps):
            raise IPyradWarningExit("steps must be a list of integers")

    ## run steps ------------------------------------------------------
    ## todo: wrap this function so it plays nice when interrupted.
    if 1 in steps:
        self.write_nexus_files(force=force, quiet=quiet)
    if 2 in steps:
        self.run_mrbayes(force=force, quiet=quiet, ipyclient=ipyclient)
    if 3 in steps:
        self.run_mbsum(force=force, quiet=quiet, ipyclient=ipyclient)
    if 4 in steps:
        self.run_bucky(force=force, quiet=quiet, ipyclient=ipyclient)

    ## block until any asyncs stored on this object have finished,
    ## polling rather than spinning in a tight loop.
    while 1:
        if self.asyncs and not all(i.ready() for i in self.asyncs):
            time.sleep(0.5)
        else:
            break
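## Usage sketch for the run() API above (hedged): `b` stands for an
## already-constructed bucky analysis object (its constructor is not shown
## in this snippet), and the default ipcluster profile is assumed to be
## running. Only the run() signature itself comes from the code above.
#
#   import ipyparallel as ipp
#   client = ipp.Client()
#   ## rerun only the mbsum and BUCKy steps with new settings
#   b.run(steps=[3, 4], force=True, ipyclient=client)
#   ## the string form is also accepted and converted to a list of ints
#   b.run(steps="34", force=True, ipyclient=client)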
def __init__(self, accession, workdir, paired=False): ## TODO: if paired: raise IPyradWarningExit( "sorry, paired data is not yet supported, stay tuned.") ## check imports for binary in ['fastq-dump', 'esearch']: if not sps.call( "type " + binary, shell=True, stdout=sps.PIPE, stderr=sps.PIPE) == 0: raise IPyradWarningExit(MISSING_IMPORTS) ## store attributes self.accession = accession self.workdir = os.path.abspath(os.path.expanduser(workdir)) self.is_sample = False self.is_project = False ## if any([i in self.accession for i in ["SRR", "ERR", "DRR"]]): self.is_sample = True elif any([i in self.accession for i in ["SRP", "ERP", "DRP"]]): self.is_project = True else: raise IPyradWarningExit(ACCESSION_ID)
def main(): """ main function """ ## not in ipython ip.__interactive__ = 0 header = \ "\n --------------------------------------------------"+\ "\n Analysis tools for ipyrad [v.{}]".format(ip.__version__)+\ "\n svd4tet -- fast quartet and tree inference "+\ "\n --------------------------------------------------" print(header) ## parse params file input (returns to stdout if --help or --version) args = parse_command_line() ## if JSON, load it if args.json: data = ip.load_json(args.json) data.outfiles.svdinput = data.outfiles.svdinput ## else create a tmp assembly for the seqarray else: if not args.output: raise IPyradWarningExit(" -o output_prefix required") if not args.seq: raise IPyradWarningExit(" -s sequence file required") ## create new JSON (Assembly) object data = ip.Assembly(args.output, quiet=True) data.outfiles.svdinput = args.seq data.set_params(1, "./") ## parse samples from the sequence file names = [] with iter(open(args.seq, 'r')) as infile: infile.next().strip().split() while 1: try: names.append(infile.next().split()[0]) except StopIteration: break ## store as Samples in Assembly data.samples = {name:ip.Sample(name) for name in names} ## store ipcluster info data._ipcluster["cores"] = args.cores if args.MPI: data._ipcluster["engines"] = "MPI" else: data._ipcluster["engines"] = "Local" ## launch ipcluster and register for later destruction data = ipcontroller_init(data) ## run svd4tet args = [data, args.boots, args.method, args.nquartets, args.force] data._clientwrapper(ipa.svd4tet.run, args, 45)
def getassembly(args, parsedict): """ loads assembly or creates a new one and set its params from parsedict. Does not launch ipcluster. """ ## Creating an assembly with a full path in the name will "work" ## but it is potentially dangerous, so here we have assembly_name ## and assembly_file, name is used for creating new in cwd, file is ## used for loading existing. ## ## Be nice if the user includes the extension. #project_dir = ip.core.assembly._expander(parsedict['1']) #assembly_name = parsedict['0'] project_dir = ip.core.assembly._expander(parsedict['project_dir']) assembly_name = parsedict['assembly_name'] assembly_file = os.path.join(project_dir, assembly_name) ## Assembly creation will handle error checking on ## the format of the assembly_name ## make sure the working directory exists. if not os.path.exists(project_dir): os.mkdir(project_dir) try: ## If 1 and force then go ahead and create a new assembly if ('1' in args.steps) and args.force: data = ip.Assembly(assembly_name, cli=True) else: data = ip.load_json(assembly_file, cli=True) data._cli = True except IPyradWarningExit as _: ## if no assembly is found then go ahead and make one if '1' not in args.steps: raise IPyradWarningExit(\ " Error: You must first run step 1 on the assembly: {}"\ .format(assembly_file)) else: ## create a new assembly object data = ip.Assembly(assembly_name, cli=True) ## for entering some params... for param in parsedict: ## trap assignment of assembly_name since it is immutable. if param == "assembly_name": ## Raise error if user tried to change assembly name if parsedict[param] != data.name: data.set_params(param, parsedict[param]) else: ## all other params should be handled by set_params try: data.set_params(param, parsedict[param]) except IndexError as _: print(" Malformed params file: {}".format(args.params)) print(" Bad parameter {} - {}".format(param, parsedict[param])) sys.exit(-1) return data
def parse_params(args): """ Parse the params file args, create and return Assembly object.""" ## check that params.txt file is correctly formatted. try: with open(args.params) as paramsin: plines = paramsin.readlines() except IOError as _: sys.exit(" No params file found") ## check header: big version changes can be distinguished by the header legacy_version = 0 try: ## try to update the Assembly ... legacy_version = 1 if not len(plines[0].split()[0]) == 7: raise IPyradWarningExit(""" Error: file '{}' is not compatible with ipyrad v.{}. Please create and update a new params file using the -n argument. For info on which parameters have changed see the changelog: (http://ipyrad.readthedocs.io/releasenotes.html) """.format(args.params, ip.__version__)) except IndexError: raise IPyradWarningExit(""" Error: Params file should not have any empty lines at the top of the file. Verify there are no blank lines and rerun ipyrad. Offending file - {} """.format(args.params)) ## update and backup if legacy_version: #which version... #update_to_6() pass ## make into a dict. Ignore blank lines at the end of file ## Really this will ignore all blank lines items = [ i.split("##")[0].strip() for i in plines[1:] if not i.strip() == "" ] #keys = [i.split("]")[-2][-1] for i in plines[1:]] #keys = range(len(plines)-1) keys = ip.Assembly('null', quiet=True).paramsdict.keys() parsedict = {str(i): j for i, j in zip(keys, items)} return parsedict
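## Worked example of the line parsing above (hedged illustration): each
## non-header params line keeps only the text left of the "##" comment and
## is then zipped, in order, against the Assembly parameter keys. The file
## content shown here is hypothetical.
#
#   line = "./pedicularis      ## [1] [project_dir]: Project dir"
#   line.split("##")[0].strip()    # -> "./pedicularis"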
def branch_assembly(args, parsedict): """ Load the passed in assembly and create a branch. Copy it to a new assembly, and also write out the appropriate params.txt """ ## Get the current assembly data = getassembly(args, parsedict) ## get arguments to branch command bargs = args.branch ## get new name, trim off .txt if it was accidentally added newname = bargs[0] if newname.endswith(".txt"): newname = newname[:-4] ## look for subsamples if len(bargs) > 1: ## Branching and subsampling at step 6 is a bad idea, it messes up ## indexing into the hdf5 cluster file. Warn against this. if any([x.stats.state == 6 for x in data.samples.values()]): pass ## TODODODODODO #print("wat") ## are we removing or keeping listed samples? subsamples = bargs[1:] ## drop the matching samples if bargs[1] == "-": ## check drop names fails = [i for i in subsamples[1:] if i not in data.samples.keys()] if any(fails): raise IPyradWarningExit("\ \n Failed: unrecognized names requested, check spelling:\n {}"\ .format("\n ".join([i for i in fails]))) print(" dropping {} samples".format(len(subsamples)-1)) subsamples = list(set(data.samples.keys()) - set(subsamples)) ## If the arg after the new param name is a file that exists if os.path.exists(bargs[1]): new_data = data.branch(newname, infile=bargs[1]) else: new_data = data.branch(newname, subsamples) ## keeping all samples else: new_data = data.branch(newname, None) print(" creating a new branch called '{}' with {} Samples".\ format(new_data.name, len(new_data.samples))) print(" writing new params file to {}"\ .format("params-"+new_data.name+".txt\n")) new_data.write_params("params-"+new_data.name+".txt", force=args.force)
def fetch_runinfo(self, fields=None, quiet=False):
    """
    Call esearch to fetch SRR runinfo for a project (SRP). Use the command
    sra.fetch_fields to see the available fields. This function returns a
    DataFrame with runinfo for the selected fields.

    Parameters:
    -----------
    fields (tuple or list):
        The default fields returned are 1-30. You can enter a list or
        tuple of fewer numbers to select fewer fields. For example,
        (1, 4, 6, 29, 30) returns a neat dataframe with Run IDs, number
        of reads (SE and PE), ScientificName, and SampleName.
    """
    if not quiet:
        print("\rFetching project data...", end="")

    ## if no entry then fetch all default fields (1-30); fields_checker
    ## drops the 0 index column.
    if fields is None:
        fields = range(31)
    fields = fields_checker(fields)

    ## command strings
    es_cmd = [
        "esearch",
        "-db", "sra",
        "-query", self.accession,
    ]
    ef_cmd = [
        "efetch",
        "--format", "runinfo",
    ]
    cut_cmd = [
        "cut",
        "-d", ",",
        "-f", ",".join(fields),
    ]

    ## pipe commands together
    proc1 = sps.Popen(es_cmd, stderr=sps.STDOUT, stdout=sps.PIPE)
    proc2 = sps.Popen(ef_cmd, stdin=proc1.stdout, stderr=sps.STDOUT, stdout=sps.PIPE)
    proc3 = sps.Popen(cut_cmd, stdin=proc2.stdout, stderr=sps.STDOUT, stdout=sps.PIPE)
    o, e = proc3.communicate()
    proc2.stdout.close()
    proc1.stdout.close()

    if o:
        vals = o.strip().split("\n")
        names = vals[0].split(",")
        items = [i.split(",") for i in vals[1:]]
        return pd.DataFrame(items, columns=names)
    else:
        raise IPyradWarningExit("no samples found in {}".format(self.accession))
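## Usage sketch (hedged): `sra` stands for an already-constructed sratools
## object for an SRP project accession (constructor not shown here). The
## field numbers follow the docstring above; the return value is a pandas
## DataFrame.
#
#   df = sra.fetch_runinfo(fields=(1, 4, 6, 29, 30))
#   df.head()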
def fetch_runinfo(self): """ Call esearch to grep SRR info for a project (SRP). Returns two lists: SRRs and ACCs. """ print("\rFetching project data...", end="") es_cmd = [ "esearch", "-db", "sra", "-query", self.accession, ] ef_cmd = [ "efetch", "--format", "runinfo", ] cut_cmd = [ "cut", "-d", ",", "-f", "1,30", ] ## this will grep SRR for SRPs, and ERR for ERPs, etc. grep_cmd = ["grep", self.accession[:2] + "R"] ## pipe commands together proc1 = sps.Popen(es_cmd, stderr=sps.STDOUT, stdout=sps.PIPE) proc2 = sps.Popen(ef_cmd, stdin=proc1.stdout, stderr=sps.STDOUT, stdout=sps.PIPE) proc3 = sps.Popen(cut_cmd, stdin=proc2.stdout, stderr=sps.STDOUT, stdout=sps.PIPE) proc4 = sps.Popen(grep_cmd, stdin=proc3.stdout, stderr=sps.STDOUT, stdout=sps.PIPE) o, e = proc4.communicate() if o: srrlist = o.strip().split("\n") SRRs, ACCs = zip(*[i.split(",") for i in srrlist]) return SRRs, ACCs else: raise IPyradWarningExit("no samples found in {}".format( self.accession))
def getassembly(args, parsedict): """ loads assembly or creates a new one and set its params from parsedict. Does not launch ipcluster. """ ## Creating an assembly with a full path in the name will "work" ## but it is potentially dangerous, so here we have assembly_name ## and assembly_file, name is used for creating new in cwd, file is ## used for loading existing. ## ## Be nice if the user includes the extension. project_dir = ip.core.assembly.expander(parsedict['1']) assembly_name = parsedict['0'] assembly_file = os.path.join(project_dir, assembly_name) ## Assembly creation will handle error checking on ## the format of the assembly_name ## make sure the working directory exists. if not os.path.exists(project_dir): os.mkdir(project_dir) try: ## If 1 and force then go ahead and create a new assembly if '1' in args.steps and args.force: data = ip.Assembly(assembly_name) else: data = ip.load_json(assembly_file) except IPyradWarningExit as inst: ## if no assembly is found then go ahead and make one if '1' not in args.steps: raise IPyradWarningExit(""" Error: Steps >1 ({}) requested but no current assembly found - {} """.format(args.steps, assembly_file)) else: ## create a new assembly object data = ip.Assembly(assembly_name) ## for entering some params... for param in parsedict: ## trap assignment of assembly_name since it is immutable. if param == str(0): ## only pass to set_params if user tried to change assembly_name ## it will raise an Exit error if parsedict[param] != data.name: data.set_params(param, parsedict[param]) else: ## all other params should be handled by set_params data.set_params(param, parsedict[param]) return data
def run_mrbayes(self, ipyclient, force=False, quiet=False): """ calls the mrbayes block in each nexus file. """ ## get all the nexus files for this object minidir = os.path.realpath(os.path.join(self.workdir, self.name)) nexus_files = glob.glob(os.path.join(minidir, "*.nex")) ## clear existing files #existing = glob.glob(os.path.join(self.workdir, self.name, "*.nex")) existing = glob.glob(os.path.join(minidir, "*.nex.*")) if any(existing): if force: for rfile in existing: os.remove(rfile) else: raise IPyradWarningExit(EXISTING_NEXdot_FILES.format(minidir)) ## write new nexus files, or should users do that before this? #self.write_nexus_files(force=True) ## load balancer lbview = ipyclient.load_balanced_view() ## submit each to be processed asyncs = [] for nex in nexus_files: async = lbview.apply(_call_mb, nex) asyncs.append(async) ## track progress start = time.time() printstr = "[mb] infer gene-tree posteriors | {} | " while 1: ready = [i.ready() for i in asyncs] elapsed = datetime.timedelta(seconds=int(time.time() - start)) if not quiet: progressbar(len(ready), sum(ready), printstr.format(elapsed), spacer="") if len(ready) == sum(ready): if not quiet: print("") break else: time.sleep(0.1) ## check success for async in asyncs: if not async .successful(): raise IPyradWarningExit(async .result())
def run_mbsum(self, ipyclient, force=False, quiet=False): """ Sums two replicate mrbayes runs for each locus """ minidir = os.path.realpath(os.path.join(self.workdir, self.name)) trees1 = glob.glob(os.path.join(minidir, "*.run1.t")) trees2 = glob.glob(os.path.join(minidir, "*.run2.t")) ## clear existing files existing = glob.glob(os.path.join(self.workdir, self.name, "*.sumt")) if any(existing): if force: for rfile in existing: os.remove(rfile) else: path = os.path.join(self.workdir, self.name) raise IPyradWarningExit(EXISTING_SUMT_FILES.format(path)) ## load balancer lbview = ipyclient.load_balanced_view() ## submit each to be processed asyncs = [] for tidx in xrange(len(trees1)): rep1 = trees1[tidx] rep2 = trees2[tidx] outname = os.path.join(minidir, str(tidx) + ".sumt") async = lbview.apply(_call_mbsum, *(rep1, rep2, outname)) asyncs.append(async) ## track progress start = time.time() printstr = "[mbsum] sum replicate runs | {} | " while 1: ready = [i.ready() for i in asyncs] elapsed = datetime.timedelta(seconds=int(time.time() - start)) if not quiet: progressbar(len(ready), sum(ready), printstr.format(elapsed), spacer="") if len(ready) == sum(ready): if not quiet: print("") break else: time.sleep(0.1) ## check success for async in asyncs: if not async .successful(): raise IPyradWarningExit(async .result())
def __init__( self, accession, workdir="sra-fastq-data", ): ## check imports for binary in ['fastq-dump', 'esearch']: if not sps.call( "type " + binary, shell=True, stdout=sps.PIPE, stderr=sps.PIPE) == 0: raise IPyradWarningExit(MISSING_IMPORTS) ## store attributes self.accession = accession self.workdir = os.path.abspath(os.path.expanduser(workdir)) self.is_sample = False self.is_project = False self._oldtmpdir = None ## cluster attributes self._ipcluster = { "cluster_id": "", "profile": "default", "engines": "Local", "quiet": 0, "timeout": 60, "cores": 0, "threads": 2, "pids": {}, } ## if any([i in self.accession for i in ["SRR", "ERR", "DRR"]]): self.is_sample = True elif any([i in self.accession for i in ["SRP", "ERP", "DRP"]]): self.is_project = True else: raise IPyradWarningExit(ACCESSION_ID)
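## Construction sketch (hedged): this __init__ is assumed to belong to the
## sratools downloader class; the `ipa.sratools` name used below is an
## assumption about how it is exposed, and the accession is only an example
## of the SRP/ERP/DRP project format that the checks above accept.
#
#   import ipyrad.analysis as ipa
#   sra = ipa.sratools(accession="SRP065788", workdir="sra-fastqs")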
def __init__(self, name, data, workdir=None, mapfile=None):
    self.name = name
    self.data = os.path.abspath(os.path.expanduser(data))
    self.mainparams = _MainParams()
    self.extraparams = _ExtraParams()
    self.clumppparams = _ClumppParams()
    self.asyncs = []

    ## check that structure is installed and in the path
    for binary in ['structure']:
        if not subprocess.call(
                "type " + binary,
                shell=True,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE) == 0:
            raise IPyradWarningExit(MISSING_IMPORTS)

    ## make workdir if it does not exist
    if workdir:
        self.workdir = os.path.abspath(os.path.expanduser(workdir))
    else:
        self.workdir = OPJ(os.path.abspath('.'), "analysis-structure")
    if not os.path.exists(self.workdir):
        os.makedirs(self.workdir)

    ## check that strfile exists, print and parse some info from it
    with open(data) as ifile:
        lines = ifile.readlines()
        self.ntaxa = len(lines) // 2
        self.nsites = len(lines[0].strip().split()[1:])
        self.labels = [i.split('\t')[0].strip() for i in lines][::2]
        self.popdata = [i.split('\t')[1] for i in lines][::2]
        self.popflag = [i.split('\t')[2] for i in lines][::2]
        self.locdata = [i.split('\t')[3] for i in lines][::2]
        self.phenotype = [i.split('\t')[4] for i in lines][::2]
        #self.extra = [i.split('\t')[5] for i in lines][::2]  #default extracols=0
        del lines

    ## if mapfile then parse it to an array
    if mapfile:
        with open(mapfile) as inmap:
            maparr = np.genfromtxt(inmap)[:, [0, 3]].astype(np.uint64)
            spans = np.zeros((maparr[-1, 0], 2), np.uint64)
            spans = get_spans(maparr, spans)
            self.maparr = spans
            self.nsites = spans.shape[0]
    else:
        self.maparr = None
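## Construction sketch (hedged): this __init__ is assumed to belong to the
## STRUCTURE wrapper class; the `ipa.structure` name and the file paths
## below are assumptions used only for illustration. The data file is a
## STRUCTURE-format (.str) file with two rows per sample, as parsed above.
#
#   import ipyrad.analysis as ipa
#   s = ipa.structure(
#       name="test",
#       data="outfiles/test.str",
#       mapfile="outfiles/test.snps.map",
#   )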
def fields_checker(fields):
    """
    Returns the fields argument formatted as a list of strings,
    with zero (the index column) disallowed.
    """
    ## make sure fields will work
    if isinstance(fields, int):
        fields = str(fields)
    if isinstance(fields, str):
        if "," in fields:
            fields = [str(i) for i in fields.split(",")]
        else:
            fields = [str(fields)]
    elif isinstance(fields, (tuple, list)):
        fields = [str(i) for i in fields]
    else:
        raise IPyradWarningExit("fields not properly formatted")

    ## do not allow zero in fields
    fields = [i for i in fields if i != '0']
    return fields
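## Worked examples of the normalization above (derived directly from the
## function): ints and comma-strings become lists of digit strings, and any
## zero entries are dropped.
#
#   fields_checker((1, 4, 6, 29, 30))   # -> ['1', '4', '6', '29', '30']
#   fields_checker("1,4,30")            # -> ['1', '4', '30']
#   fields_checker(0)                   # -> []  (zero is not allowed)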
def main(): """ main function """ ## turn off traceback for the CLI ip.__interactive__ = 0 ## Check for a new version on anaconda _check_version() ## parse params file input (returns to stdout if --help or --version) args = parse_command_line() ## Turn the debug output written to ipyrad_log.txt up to 11! ## Clean up the old one first, it's cleaner to do this here than ## at the end (exceptions, etc) if os.path.exists(ip.__debugflag__): os.remove(ip.__debugflag__) if args.debug: print("\n ** Enabling debug mode ** ") ip._debug_on() atexit.register(ip._debug_off) ## create new paramsfile if -n if args.new: ## Create a tmp assembly, call write_params to make default params.txt try: tmpassembly = ip.Assembly(args.new, quiet=True, cli=True) tmpassembly.write_params("params-{}.txt".format(args.new), force=args.force) except Exception as inst: print(inst) sys.exit(2) print("\n New file 'params-{}.txt' created in {}\n".\ format(args.new, os.path.realpath(os.path.curdir))) sys.exit(2) ## if params then must provide action argument with it if args.params: if not any([args.branch, args.results, args.steps]): print(""" Must provide action argument along with -p argument for params file. e.g., ipyrad -p params-test.txt -r ## shows results e.g., ipyrad -p params-test.txt -s 12 ## runs steps 1 & 2 e.g., ipyrad -p params-test.txt -b newbranch ## branch this assembly """) sys.exit(2) if not args.params: if any([args.branch, args.results, args.steps]): print(""" Must provide params file for branching, doing steps, or getting results. e.g., ipyrad -p params-test.txt -r ## shows results e.g., ipyrad -p params-test.txt -s 12 ## runs steps 1 & 2 e.g., ipyrad -p params-test.txt -b newbranch ## branch this assembly """) ## if branching, or merging do not allow steps in same command ## print spacer if any([args.branch, args.merge]): args.steps = "" print("") ## always print the header when doing steps header = \ "\n -------------------------------------------------------------"+\ "\n ipyrad [v.{}]".format(ip.__version__)+\ "\n Interactive assembly and analysis of RAD-seq data"+\ "\n -------------------------------------------------------------" ## Log the current version. End run around the LOGGER ## so it'll always print regardless of log level. with open(ip.__debugfile__, 'a') as logfile: logfile.write(header) logfile.write("\n Begin run: {}".format(time.strftime("%Y-%m-%d %H:%M"))) logfile.write("\n Using args {}".format(vars(args))) logfile.write("\n Platform info: {}".format(os.uname())) ## if merging just do the merge and exit if args.merge: print(header) merge_assemblies(args) sys.exit(1) ## if download data do it and then exit. Runs single core in CLI. if args.download: if len(args.download) == 1: downloaddir = "sra-fastqs" else: downloaddir = args.download[1] sratools_download(args.download[0], workdir=downloaddir, force=args.force) sys.exit(1) ## create new Assembly or load existing Assembly, quit if args.results elif args.params: parsedict = parse_params(args) if args.branch: branch_assembly(args, parsedict) elif args.steps: ## print header print(header) ## Only blank the log file if we're actually going to run a new ## assembly. This used to be in __init__, but had the side effect ## of occasionally blanking the log file in an undesirable fashion ## for instance if you run a long assembly and it crashes and ## then you run `-r` and it blanks the log, it's crazymaking. 
if os.path.exists(ip.__debugfile__): if os.path.getsize(ip.__debugfile__) > 50000000: with open(ip.__debugfile__, 'w') as clear: clear.write("file reset") ## run Assembly steps ## launch or load assembly with custom profile/pid data = getassembly(args, parsedict) ## set CLI ipcluster terms data._ipcluster["threads"] = args.threads ## if ipyclient is running (and matched profile) then use that one if args.ipcluster: ipyclient = ipp.Client(profile=args.ipcluster) data._ipcluster["cores"] = len(ipyclient) ## if not then we need to register and launch an ipcluster instance else: ## set CLI ipcluster terms ipyclient = None data._ipcluster["cores"] = args.cores if args.cores else detect_cpus() data._ipcluster["engines"] = "Local" if args.MPI: data._ipcluster["engines"] = "MPI" if not args.cores: raise IPyradWarningExit("must provide -c argument with --MPI") ## register to have a cluster-id with "ip- name" data = register_ipcluster(data) ## set to print headers data._headers = 1 ## run assembly steps steps = list(args.steps) data.run( steps=steps, force=args.force, preview=args.preview, show_cluster=1, ipyclient=ipyclient) if args.results: showstats(parsedict)
def loci2bpp(name, locifile, imap, guidetree, minmap=None, maxloci=None, infer_sptree=0, infer_delimit=0, delimit_alg=(0, 5), seed=12345, burnin=1000, nsample=10000, sampfreq=2, thetaprior=(5, 5), tauprior=(4, 2, 1), traits_df=None, nu=0, kappa=0, useseqdata=1, usetraitdata=1, cleandata=0, wdir=None, finetune=(0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01), verbose=0): """ Converts loci file format to bpp file format, i.e., concatenated phylip-like format, and produces imap and ctl input files for bpp. Parameters: ----------- name: A prefix name for output files that will be produced locifile: A .loci file produced by ipyrad. imap: A Python dictionary with 'species' names as keys, and lists of sample names for the values. Any sample that is not included in the imap dictionary will be filtered out of the data when converting the .loci file into the bpp formatted sequence file. Each species in the imap dictionary must also be present in the input 'guidetree'. guidetree: A newick string species tree hypothesis [e.g., (((a,b),(c,d)),e);] All species in the imap dictionary must also be present in the guidetree Optional parameters: -------------------- infer_sptree: Default=0, only infer parameters on a fixed species tree. If 1, then the input tree is treated as a guidetree and tree search is employed to find the best tree. The results will include support values for the inferred topology. infer_delimit: Default=0, no delimitation. If 1 then splits in the tree that separate 'species' will be collapsed to test whether fewer species are a better fit to the data than the number in the input guidetree. delimit_alg: Species delimitation algorithm. This is a tuple. The first value is the algorithm (0 or 1) and the following values are arguments for the given algorithm. See other ctl files for examples of what the delimitation line looks like. This is where you can enter the params (e.g., alpha, migration) for the two different algorithms. For example, the following args would produce the following ctl lines: alg=0, epsilon=5 > delimit_alg = (0, 5) speciesdelimitation = 1 0 5 alg=1, alpha=2, migration=1 > delimit_alg = (1, 2, 1) speciesdelimitation = 1 1 2 1 alg=1, alpha=2, migration=1, diagnosis=0, ?=1 > delimit_alg = (1, 2, 1, 0, 1) speciesdelimitation = 1 1 2 1 0 1 seed: A random number seed at start of analysis. burnin: Number of burnin generations in mcmc nsample: Number of mcmc generations to run. sampfreq: How often to sample from the mcmc chain. thetaprior: Prior on theta (4Neu), gamma distributed. mean = a/b. e.g., (5, 5) tauprior Prior on root tau, gamma distributed mean = a/b. Last number is dirichlet prior for other taus. e.g., (4, 2, 1) traits_df: A pandas DataFrame with trait data properly formatted. This means only quantitative traits are included, and missing values are NaN. The first column contains sample names, with "Indiv" as the header. The following columns have a header row with trait names. This script will write a CSV trait file with trait values mean-standardized, with NaN replaced by "NA", and with sample not present in IMAP removed. nu: A prior on phenotypic trait variance (0) for iBPP analysis. kappa: A prior on phenotypic trait mean (0) for iBPP analysis. useseqdata: If false inference proceeds without sequence data (can be used to test the effect of priors on the tree distributions). usetraitdata: If false inference proceeds without trait data (can be used to test the effect of priors on the trait distributions). 
cleandata: If 1 then sites with missing or hetero characters are removed. wdir: A working directory to write files to. finetune: See bpp documentation. verbose: If verbose=1 the ctl file text will also be written to screen (stderr). """ ## check args if not imap: raise IPyradWarningExit(IMAP_REQUIRED) if minmap: if minmap.keys() != imap.keys(): raise IPyradWarningExit(KEYS_DIFFER) ## working directory, make sure it exists if wdir: wdir = os.path.abspath(wdir) if not os.path.exists(wdir): raise IPyradWarningExit(" working directory (wdir) does not exist") else: wdir = os.path.curdir ## if traits_df then we make '.ibpp' files prog = 'bpp' if isinstance(traits_df, pd.DataFrame): prog = 'ibpp' outfile = OPJ(wdir, "{}.{}.seq.txt".format(name, prog)) mapfile = OPJ(wdir, "{}.{}.imap.txt".format(name, prog)) ## open outhandles fout = open(outfile, 'w') fmap = open(mapfile, 'w') ## parse the loci file with open(locifile, 'r') as infile: ## split on "//" for legacy compatibility loci = infile.read().strip().split("|\n") nloci = len(loci) ## all samples samples = list(itertools.chain(*imap.values())) ## iterate over loci, printing to outfile nkept = 0 for iloc in xrange(nloci): lines = loci[iloc].split("//")[0].split() names = lines[::2] names = ["^" + i for i in names] seqs = [list(i) for i in lines[1::2]] seqlen = len(seqs[0]) ## whether to skip this locus based on filters below skip = 0 ## if minmap filter for sample coverage if minmap: covd = {} for group, vals in imap.items(): covd[group] = sum(["^" + i in names for i in vals]) ## check that coverage is good enough if not all([covd[group] >= minmap[group] for group in minmap]): skip = 1 ## too many loci? if maxloci: if nkept >= maxloci: skip = 1 ## build locus as a string if not skip: ## convert to phylip with caret starter and replace - with N. data = ["{:<30} {}".format(i, "".join(k).replace("-", "N")) for \ (i, k) in zip(names, seqs) if i[1:] in samples] ## if not empty, write to the file if data: fout.write("{} {}\n\n{}\n\n"\ .format(len(data), seqlen, "\n".join(data))) nkept += 1 ## close up shop fout.close() ## write the imap file: data = ["{:<30} {}".format(val, key) for key \ in sorted(imap) for val in imap[key]] fmap.write("\n".join(data)) fmap.close() ## write ctl file write_ctl(name, imap, guidetree, nkept, infer_sptree, infer_delimit, delimit_alg, seed, burnin, nsample, sampfreq, thetaprior, tauprior, traits_df, nu, kappa, cleandata, useseqdata, usetraitdata, wdir, finetune, verbose) ## print message? sys.stderr.write("new files created ({} loci, {} species, {} samples)\n"\ .format(nkept, len(imap.keys()), sum([len(i) for i in imap.values()]))) sys.stderr.write(" {}.{}.seq.txt\n".format(name, prog)) sys.stderr.write(" {}.{}.imap.txt\n".format(name, prog)) sys.stderr.write(" {}.{}.ctl.txt\n".format(name, prog)) if isinstance(traits_df, pd.DataFrame): sys.stderr.write(" {}.{}.traits.txt\n".format(name, prog)) ## return the ctl file string return os.path.abspath("{}.{}.ctl.txt".format(OPJ(wdir, name), prog))
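## Usage sketch (hedged): the file paths and sample names below are
## hypothetical; only the call signature follows the loci2bpp function
## defined above. The return value is the path to the written .ctl file.
#
#   imap = {"A": ["a1", "a2"], "B": ["b1", "b2"], "C": ["c1"]}
#   ctlfile = loci2bpp(
#       name="test",
#       locifile="outfiles/test.loci",
#       imap=imap,
#       guidetree="((A,B),C);",
#       minmap={"A": 1, "B": 1, "C": 1},
#       infer_delimit=1,
#       delimit_alg=(0, 5),
#       burnin=1000,
#       nsample=10000,
#   )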
def main(): """ main function """ ## parse params file input (returns to stdout if --help or --version) args = parse_command_line() print(HEADER.format(ip.__version__)) ## set random seed np.random.seed(args.rseed) ## debugger---------------------------------------- if os.path.exists(ip.__debugflag__): os.remove(ip.__debugflag__) if args.debug: print("\n ** Enabling debug mode ** ") ip._debug_on() ## if JSON, load existing Tetrad analysis ----------------------- if args.json: data = ipa.tetrad(name=args.name, workdir=args.workdir, load=True) ## if force then remove all results if args.force: data._refresh() ## else create a new tmp assembly for the seqarray----------------- else: ## create new Tetrad class Object if it doesn't exist newjson = os.path.join(args.workdir, args.name + '.tet.json') ## if not quiet... print("tetrad instance: {}".format(args.name)) if (not os.path.exists(newjson)) or args.force: ## purge any files associated with this name if forced if args.force: ## init an object in the correct location just to refresh ipa.tetrad(name=args.name, workdir=args.workdir, data=args.seq, initarr=False, save_invariants=args.invariants, cli=True, quiet=True)._refresh() ## create new tetrad object data = ipa.tetrad( name=args.name, workdir=args.workdir, method=args.method, data=args.seq, resolve=args.resolve, mapfile=args.map, guidetree=args.tree, nboots=args.boots, nquartets=args.nquartets, cli=True, save_invariants=args.invariants, ) else: raise SystemExit(QUARTET_EXISTS\ .format(args.name, args.workdir, args.workdir, args.name, args.name)) ## boots can be set either for a new object or loaded JSON to continue it if args.boots: data.params.nboots = int(args.boots) ## if ipyclient is running (and matched profile) then use that one if args.ipcluster: ipyclient = ipp.Client(profile=args.ipcluster) data._ipcluster["cores"] = len(ipyclient) ## if not then we need to register and launch an ipcluster instance else: ## set CLI ipcluster terms ipyclient = None data._ipcluster["cores"] = args.cores if args.cores else detect_cpus() data._ipcluster["engines"] = "Local" if args.MPI: data._ipcluster["engines"] = "MPI" if not args.cores: raise IPyradWarningExit("must provide -c argument with --MPI") ## register to have a cluster-id with "ip- name" data = register_ipcluster(data) ## message about whether we are continuing from existing if data.checkpoint.boots: print( LOADING_MESSAGE.format(data.name, data.params.method, data.checkpoint.boots)) ## run tetrad main function within a wrapper. The wrapper creates an ## ipyclient view and appends to the list of arguments to run 'run'. data.run(force=args.force, ipyclient=ipyclient)
import copy import itertools import subprocess import numpy as np from collections import Counter from ipyrad.assemble.util import DUCT, IPyradWarningExit try: ## when you have time go back and set attrubutes on toytrees from toytree import ete3mini as ete except ImportError: raise IPyradWarningExit(""" Error: bpp requires the dependency 'toytree', which we haven't yet included in the ipyrad installation. For now, you can install toytree using conda with the following command: conda install toytree -c eaton-lab """) class Bpp(object): """ BPP analysis utility function for creating input files, setting parameters, and submitting bpp jobs to run on a parallel cluster. Converts loci file format data to bpp file format, i.e., concatenated phylip-like format, and produces imap and ctl input files for bpp. The main functions are 'write_bpp_files()' and 'run()'. Parameters: -----------
def __init__(self, name, data=None, workdir="analysis-bpp", guidetree=None, imap=None, *args, **kwargs): ## path attributes self.name = name self.asyncs = [] self._kwargs = { "maxloci": None, "minmap": None, "minsnps": 0, "infer_sptree": 0, "infer_delimit": 0, "delimit_alg": (0, 5), "seed": 12345, "burnin": 1000, "nsample": 10000, "sampfreq": 2, "thetaprior": (2, 2000), "tauprior": (2, 2000, 1), "usedata": 1, "cleandata": 0, "finetune": (0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01), "copied": False, } self._kwargs.update(kwargs) ## support for legacy args if self._kwargs.get("locifile"): data = self._kwargs.get("locifile") if not data: raise IPyradWarningExit( "must enter a 'data' argument (an ipyrad .loci file).") ## set the guidetree if not guidetree: raise IPyradWarningExit( "must enter a 'guidetree' argument (a newick file or string).") self.tree = ete.Tree(guidetree) ## check workdir if workdir: self.workdir = os.path.abspath(os.path.expanduser(workdir)) else: self.workdir = os.path.join(os.path.curdir, "analysis-bpp") if not os.path.exists(self.workdir): os.makedirs(self.workdir) ## parsing imap dictionary, or create simple 1-1 mapping if not imap: self.imap = {i: [i] for i in self.tree.get_leaf_names()} else: self.imap = {} for key, val in imap.items(): if isinstance(val, (int, str)): self.imap[key] = [str(val)] elif isinstance(val, list): self.imap[key] = val else: raise IPyradWarningExit( "imap dictionary is not properly formatted") ## update stats if alleles instead of loci if not self._kwargs["minmap"]: self._kwargs["minmap"] = {i: 1 for i in self.tree.get_leaf_names()} if ('.alleles.loci' in data) and (not self._kwargs['copied']): ## add 0/1 to names keys = self.imap.keys() for key in keys: oldvals = self.imap[key] newvals = [] for val in oldvals: newvals += [val + "_0", val + "_1"] self.imap[key] = newvals ## double the minmap (copied attribute protects from double 2X) self._kwargs["minmap"] = \ {key: val*2 for key, val in self._kwargs['minmap'].items()} ## checks assert isinstance(self.imap, dict), "you must enter an IMAP dictionary" assert set(self.imap.keys()) == set(self.tree.get_leaf_names()), \ "IMAP keys must match guidetree names: \n{}\n{}"\ .format(self.imap.keys(), self.tree.get_leaf_names()) ## filters self.filters = Params() self.filters.minmap = self._kwargs["minmap"] self.filters.maxloci = self._kwargs["maxloci"] self.filters.minsnps = self._kwargs["minsnps"] ## set bpp parameters with defaults self.params = Params() notparams = set(["workdir", "maxloci", "minmap", "minsnps", "copied"]) for key in set(self._kwargs.keys()) - notparams: self.params[key] = self._kwargs[key] ## results files self.files = Params() self.files.data = data self.files.mcmcfiles = [] self.files.outfiles = []
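## Construction sketch (hedged): paths and sample names are hypothetical;
## the keyword arguments map onto the defaults set in self._kwargs above,
## and the IMAP keys must match the guidetree tip names.
#
#   imap = {"A": ["a1", "a2"], "B": ["b1", "b2"], "C": ["c1", "c2"]}
#   b = Bpp(
#       name="test",
#       data="outfiles/test.alleles.loci",
#       guidetree="((A,B),C);",
#       imap=imap,
#       burnin=2000,
#       nsample=20000,
#   )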
def batch( baba, ipyclient=None, ): """ distributes jobs to the parallel client """ ## parse args handle = baba.data taxdicts = baba.tests mindicts = baba.params.mincov nboots = baba.params.nboots ## if ms generator make into reusable list sims = 0 if isinstance(handle, types.GeneratorType): handle = list(handle) sims = 1 else: ## expand locifile path to full path handle = os.path.realpath(handle) ## parse taxdicts into names and lists if it a dictionary #if isinstance(taxdicts, dict): # names, taxdicts = taxdicts.keys(), taxdicts.values() #else: # names = [] names = [] if isinstance(taxdicts, dict): taxdicts = [taxdicts] ## an array to hold results (len(taxdicts), nboots) tot = len(taxdicts) resarr = np.zeros((tot, 7), dtype=np.float64) bootsarr = np.zeros((tot, nboots), dtype=np.float64) paneldict = {} ## TODO: Setup a wrapper to find and cleanup ipyclient ## define the function and parallelization to use, ## if no ipyclient then drops back to using multiprocessing. if not ipyclient: # ipyclient = ip.core.parallel.get_client(**self._ipcluster) raise IPyradError("you must enter an ipyparallel.Client() object") else: lbview = ipyclient.load_balanced_view() ## submit jobs to run on the cluster queue start = time.time() asyncs = {} idx = 0 ## prepare data before sending to engines ## if it's a str (locifile) then parse it here just once. if isinstance(handle, str): with open(handle, 'r') as infile: loci = infile.read().strip().split("|\n") if isinstance(handle, list): pass #sims() ## iterate over tests (repeats mindicts if fewer than taxdicts) itests = iter(taxdicts) imdict = itertools.cycle([mindicts]) #for test, mindict in zip(taxdicts, itertools.cycle([mindicts])): for i in xrange(len(ipyclient)): ## next entries unless fewer than len ipyclient, skip try: test = next(itests) mindict = next(imdict) except StopIteration: continue ## if it's sim data then convert to an array if sims: loci = _msp_to_arr(handle, test) args = (loci, test, mindict, nboots) print("not yet implemented") #asyncs[idx] = lbview.apply_async(dstat, *args) else: args = [loci, test, mindict, nboots] asyncs[idx] = lbview.apply(dstat, *args) idx += 1 ## block until finished, print progress if requested. finished = 0 try: while 1: keys = [i for (i, j) in asyncs.items() if j.ready()] ## check for failures for job in keys: if not asyncs[job].successful(): raise IPyradWarningExit(\ " error: {}: {}".format(job, asyncs[job].exception())) ## enter results for successful jobs else: _res, _bot = asyncs[job].result() ## store D4 results if _res.shape[0] == 1: resarr[job] = _res.T.as_matrix()[:, 0] bootsarr[job] = _bot ## or store D5 results else: paneldict[job] = _res.T ## remove old job del asyncs[job] finished += 1 ## submit next job if there is one. try: test = next(itests) mindict = next(imdict) if sims: loci = _msp_to_arr(handle, test) args = (loci, test, mindict, nboots) print("not yet implemented") #asyncs[idx] = lbview.apply_async(dstat, *args) else: args = [loci, test, mindict, nboots] asyncs[idx] = lbview.apply(dstat, *args) idx += 1 except StopIteration: pass ## count finished and break if all are done. 
#fin = idx - len(asyncs) elap = datetime.timedelta(seconds=int(time.time() - start)) printstr = " calculating D-stats | {} | " progressbar(tot, finished, printstr.format(elap), spacer="") time.sleep(0.1) if not asyncs: print("") break except KeyboardInterrupt as inst: ## cancel all jobs (ipy & multiproc modes) and then raise error try: ipyclient.abort() except Exception: pass raise inst ## dress up resarr as a Pandas DataFrame if 4-part test if len(test) == 4: if not names: names = range(len(taxdicts)) #print("resarr") #print(resarr) resarr = pd.DataFrame(resarr, index=names, columns=[ "dstat", "bootmean", "bootstd", "Z", "ABBA", "BABA", "nloci" ]) ## sort results and bootsarr to match if test names were supplied resarr = resarr.sort_index() order = [list(resarr.index).index(i) for i in names] bootsarr = bootsarr[order] return resarr, bootsarr else: ## order results dfs listres = [] for key in range(len(paneldict)): listres.append(paneldict[key]) ## make into a multi-index dataframe ntests = len(paneldict) multi_index = [ np.array([[i] * 3 for i in range(ntests)]).flatten(), np.array(['p3', 'p4', 'shared'] * ntests), ] resarr = pd.DataFrame( data=pd.concat(listres).as_matrix(), index=multi_index, columns=listres[0].columns, ) return resarr, None
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import pandas as pd
import numpy as np
import itertools
import copy
import os

## error class import path follows the other analysis modules
from ipyrad.assemble.util import IPyradWarningExit

try:
    import allel
except ImportError:
    raise IPyradWarningExit("""
    Error: pca requires the dependency 'scikit-allel', which we haven't yet
    included in the ipyrad installation. For now, you can install scikit-allel
    using conda with the following command:

    conda install scikit-allel -c conda-forge
    """)

## set floating point precision in data frames to 3 for prettier printing
pd.set_option('precision', 3)


class PCA(object):
    "new pca class object"
    def __init__(self, data=None, pops=None, ncomps=10,
def write_ctl(name, imap, guidetree, nloci, infer_sptree, infer_delimit, delimit_alg, seed, burnin, nsample, sampfreq, thetaprior, tauprior, traits_df, nu0, kappa0, cleandata, useseqdata, usetraitdata, wdir, finetune, verbose): """ write outfile with any args in argdict """ ## A string to store ctl info ctl = [] ## check the tree (can do this better once we install ete3 w/ ipyrad) if not guidetree.endswith(";"): guidetree += ";" ## if traits_df then we make '.ibpp' files prog = 'bpp' if isinstance(traits_df, pd.DataFrame): prog = 'ibpp' ## write the top header info ctl.append("seed = {}".format(seed)) ctl.append("seqfile = {}.{}.seq.txt".format(OPJ(wdir, name), prog)) ctl.append("Imapfile = {}.{}.imap.txt".format(OPJ(wdir, name), prog)) ctl.append("mcmcfile = {}.{}.mcmc.txt".format(OPJ(wdir, name), prog)) ctl.append("outfile = {}.{}.out.txt".format(OPJ(wdir, name), prog)) if isinstance(traits_df, pd.DataFrame): ctl.append("traitfile = {}.{}.traits.txt".format( OPJ(wdir, name), prog)) ## number of loci (checks that seq file exists and parses from there) ctl.append("nloci = {}".format(nloci)) ctl.append("usedata = {}".format(useseqdata)) ctl.append("cleandata = {}".format(cleandata)) ## infer species tree if infer_sptree: ctl.append("speciestree = 1 0.4 0.2 0.1") else: ctl.append("speciestree = 0") ## infer delimitation (with algorithm 1 by default) ctl.append("speciesdelimitation = {} {} {}"\ .format(infer_delimit, delimit_alg[0], " ".join([str(i) for i in delimit_alg[1:]]))) ## if using iBPP (if not traits_df, we assume you're using bpp (v.3.3+) if isinstance(traits_df, pd.DataFrame): ## check that the data frame is properly formatted try: traits_df.values.astype(float) except Exception: raise IPyradWarningExit(PDREAD_ERROR) ## subsample to keep only samples that are in IMAP, we do not need to ## standarize traits b/c ibpp does that for us. 
samples = sorted(list(itertools.chain(*imap.values()))) didx = [list(traits_df.index).index(i) for i in traits_df.index \ if i not in samples] dtraits = traits_df.drop(traits_df.index[didx]) ## mean standardize traits values after excluding samples straits = dtraits.apply(lambda x: (x - x.mean()) / (x.std())) ## convert NaN to "NA" cuz that's what ibpp likes, and write to file ftraits = straits.fillna("NA") traitdict = ftraits.T.to_dict("list") ## get reverse imap dict rev = {val: key for key in sorted(imap) for val in imap[key]} ## write trait file traitfile = "{}.{}.traits.txt".format(os.path.join(wdir, name), prog) with open(traitfile, 'w') as tout: tout.write("Indiv\n") tout.write("\t".join(['Species'] + list(ftraits.columns)) + "\n") #for key in sorted(traitdict): # tout.write("\t".join([key, rev[key]] + \ # ["^"+str(i) for i in traitdict[key]])+"\n" # ) nindT = 0 for ikey in sorted(imap.keys()): samps = imap[ikey] for samp in sorted(samps): if samp in traitdict: tout.write("\t".join([samp, rev[samp]] + \ [str(i) for i in traitdict[samp]])+"\n" ) nindT += 1 # tout.write("Indiv\n"+"\t".join(["Species"]+\ # ["t_{}".format(i) for i in range(len(traitdict.values()[0]))])+"\n") # for key in sorted(traitdict): # print >>tout, "\t".join([key, rev[key]] + \ # [str(i) for i in traitdict[key]]) #ftraits.to_csv(traitfile) ## write ntraits and nindT and traitfilename ctl.append("ntraits = {}".format(traits_df.shape[1])) ctl.append("nindT = {}".format(nindT)) #traits_df.shape[0])) ctl.append("usetraitdata = {}".format(usetraitdata)) ctl.append("useseqdata = {}".format(useseqdata)) ## trait priors ctl.append("nu0 = {}".format(nu0)) ctl.append("kappa0 = {}".format(kappa0)) ## remove ibpp incompatible options ctl.remove("usedata = {}".format(useseqdata)) ctl.remove("speciestree = {}".format(infer_sptree)) ## get tree values nspecies = str(len(imap)) species = " ".join(sorted(imap)) ninds = " ".join([str(len(imap[i])) for i in sorted(imap)]) ## write the tree ctl.append("""\ species&tree = {} {} {} {}""".format(nspecies, species, ninds, guidetree)) ## priors ctl.append("thetaprior = {} {}".format(*thetaprior)) ctl.append("tauprior = {} {} {}".format(*tauprior)) ## other values, fixed for now ctl.append("finetune = 1: {}".format(" ".join([str(i) for i in finetune]))) #CTL.append("finetune = 1: 1 0.002 0.01 0.01 0.02 0.005 1.0") ctl.append("print = 1 0 0 0") ctl.append("burnin = {}".format(burnin)) ctl.append("sampfreq = {}".format(sampfreq)) ctl.append("nsample = {}".format(nsample)) ## write out the ctl file with open("{}.{}.ctl.txt".format(OPJ(wdir, name), prog), 'w') as out: out.write("\n".join(ctl)) ## if verbose print ctl if verbose: sys.stderr.write("ctl file\n--------\n" + "\n".join(ctl) + "\n--------\n\n")
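## Worked example of the species delimitation line written above (derived
## from the format string in this function): with infer_delimit=1 and
## delimit_alg=(0, 5) the ctl file gets
#
#   speciesdelimitation = 1 0 5
#
## and with infer_delimit=1 and delimit_alg=(1, 2, 1) it gets
#
#   speciesdelimitation = 1 1 2 1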
def loci2cf(name, locifile, popdict, wdir=None, ipyclient=None): """ Convert ipyrad .loci file to an iqtree-pomo 'counts' file Parameters: ----------- name: A prefix name for output files that will be produced locifile: A .loci file produced by ipyrad. popdict: A python dictionary grouping Clade names to Sample names. Example: {"A": ['a', 'b', 'c'], "B": ['d', 'e', 'f']} ipyclient: If you pass it an ipyclient it will distribute work over remote engines, otherwise we use multiprocessing (todo). """ ## working directory, make sure it exists if wdir: wdir = os.path.abspath(wdir) if not os.path.exists(wdir): raise IPyradWarningExit(" working directory (wdir) does not exist") else: wdir = os.path.curdir ## output file path name = name.rsplit(".cf")[0] outfile = os.path.join(wdir, "{}.cf".format(name)) out = open(outfile, 'w') ## parse loci file with open(locifile) as inloc: loci = inloc.read().strip().split("|\n") ## get all names names = list(itertools.chain(*popdict.values())) popkeys = sorted(popdict.keys()) ## count nsites nsites = sum(len(loc.split("\n")[0].split()[1]) for loc in loci[:]) ## print the header out.write( HEADER.format(**{ "NPOP": len(popdict), "NSITES": nsites, "VTAXA": "\t".join(popkeys) })) ## build print string outstr = "chr{:<8} {:<4} " for cidx in xrange(len(popkeys)): outstr += "{:<8} " toprint = [] for idx in xrange(len(loci)): dat = loci[idx].split("\n") seqs = np.array([list(i.split()[1]) for i in dat[:-1]]) names = [i.split()[0] for i in dat[:-1]] data = np.zeros((seqs.shape[1], len(popkeys), 4), dtype=np.uint16) for sidx in xrange(seqs.shape[1]): for cidx in xrange(len(popkeys)): for name in popdict[popkeys[cidx]]: if name in names: base = seqs[names.index(name), sidx] if base in list("ACGT"): data[sidx, cidx, BASE2IDX[base]] += 2 elif base in list("RSYMKW"): base1, base2 = AMBIGS[base] data[sidx, cidx, BASE2IDX[base1]] += 1 data[sidx, cidx, BASE2IDX[base2]] += 1 ## print string for one locus sdat = [",".join([str(i) for i in i.tolist()]) for i in data[sidx]] #print outstr.format(idx+1, sidx+1, *sdat) toprint.append(outstr.format(idx + 1, sidx + 1, *sdat)) ## if 10K loci, then print and clear if not idx % 10000: out.write("\n".join(toprint) + "\n") toprint = [] ## close handle out.write("\n".join(toprint) + "\n") out.close()
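## Usage sketch (hedged): the file path and clade names are hypothetical
## (the popdict follows the example in the docstring above); the call
## signature follows the loci2cf function above, which writes {name}.cf
## into wdir.
#
#   popdict = {"A": ["a", "b", "c"], "B": ["d", "e", "f"]}
#   loci2cf(
#       name="test",
#       locifile="outfiles/test.loci",
#       popdict=popdict,
#       wdir="analysis-pomo",
#   )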
break self._report(tots) ## check for fails for async in asyncs: if not async .successful(): raise IPyradWarningExit(async .result()) else: self._accession = self.accession _call_fastq_dump_on_SRRs(self) self.report(1) except KeyboardInterrupt as inst: if ipyclient: raise IPyradWarningExit("interrupted -- ipcluster shutdown") else: raise IPyradWarningExit("interrupted") finally: if not ipyclient.outstanding: ipyclient.purge_everything() else: ## nanny: kill the engines left running, report kill. ipyclient.shutdown(hub=True, block=False) ipyclient.close() print("\nwarning: ipcluster shutdown and must be restarted") def _run(self, force=False, ipyclient=None): """ Download the accessions into a the designated workdir.
def write_nexus_files(self, force=False, quiet=False):
    """
    Write nexus files to {workdir}/{name}/[0-N].nex. If the directory
    already exists an exception will be raised unless you use the force
    flag, which will remove all files in the directory.

    Parameters:
    -----------
    force (bool):
        If True then all files in {workdir}/{name}/*.nex* will be removed.
    """
    ## clear existing files
    existing = glob.glob(os.path.join(self.workdir, self.name, "*.nex"))
    if any(existing):
        if force:
            for rfile in existing:
                os.remove(rfile)
        else:
            path = os.path.join(self.workdir, self.name)
            raise IPyradWarningExit(EXISTING_NEX_FILES.format(path))

    ## parse the loci or alleles file
    with open(self.files.data) as infile:
        loci = iter(infile.read().strip().split("|\n"))

    ## use entered samples or parse them from the file
    if not self.samples:
        with open(self.files.data) as infile:
            samples = set((i.split()[0] for i in infile.readlines()
                           if "//" not in i))
    else:
        samples = set(self.samples)

    ## keep track of how many loci pass filtering
    totn = len(samples)
    nloci = 0

    ## this set is just used for matching, then we randomly
    ## subsample for real within the locus so it varies
    if self._alleles:
        msamples = {i + rbin() for i in samples}
    else:
        msamples = samples

    ## write subsampled set of loci
    for loc in loci:

        ## get names and seqs from locus
        dat = loc.split("\n")[:-1]
        try:
            names = [i.split()[0] for i in dat]
            snames = set(names)
            seqs = np.array([list(i.split()[1]) for i in dat])
        except IndexError:
            print(ALLELESBUGFIXED)
            continue

        ## check name matches
        if len(snames.intersection(msamples)) == totn:

            ## prune sample names if alleles. Done here so it is randomly
            ## different in every locus which allele is selected from
            ## each sample (e.g., 0 or 1)
            if self._alleles:
                _samples = [i + rbin() for i in samples]
            else:
                _samples = samples

            ## re-order seqs to be in set order
            seqsamp = seqs[[names.index(tax) for tax in _samples]]

            ## resolve ambiguities randomly if .loci file otherwise
            ## sample one of the alleles if .alleles file.
            if not self._alleles:
                seqsamp = _resolveambig(seqsamp)

            ## find parsimony informative sites
            if _count_PIS(seqsamp, self.params.minsnps):
                ## keep the locus
                nloci += 1

                ## remove empty columns given this sampling (treat '-' as N)
                copied = seqsamp.copy()
                copied[copied == "-"] = "N"
                rmcol = np.all(copied == "N", axis=0)
                seqsamp = seqsamp[:, ~rmcol]

                ## write nexus file
                if self._alleles:
                    ## trim off the allele number
                    samps = [i.rsplit("_", 1)[0] for i in _samples]
                    mdict = dict(zip(samps, [i.tostring() for i in seqsamp]))
                else:
                    mdict = dict(zip(_samples, [i.tostring() for i in seqsamp]))
                self._write_nex(mdict, nloci)

                ## quit early if using maxloci
                if nloci == self.params.maxloci:
                    break

    ## print data size
    if not quiet:
        path = os.path.join(self.workdir, self.name)
        path = path.replace(os.path.expanduser("~"), "~")
        print("wrote {} nexus files to {}".format(nloci, path))
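## Usage sketch (hedged): `b` stands for an already-constructed bucky
## analysis object (constructor not shown here). Nexus files are written to
## {workdir}/{name}/[0-N].nex as described in the docstring above.
#
#   b.write_nexus_files(force=True)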
def get_client(cluster_id, profile, engines, timeout, cores, quiet, spacer, **kwargs): """ Creates a client to view ipcluster engines for a given profile and returns it with at least one engine spun up and ready to go. If no engines are found after nwait amount of time then an error is raised. If engines==MPI it waits a bit longer to find engines. If the number of engines is set then it waits even longer to try to find that number of engines. """ ## save stds for later, we're gonna hide them to prevent external printing save_stdout = sys.stdout save_stderr = sys.stderr sys.stdout = cStringIO.StringIO() sys.stderr = cStringIO.StringIO() ## get cluster_info print string connection_string = "{}establishing parallel connection:".format(spacer) ## wrapped search for ipcluster try: ## are we looking for a running ipcluster instance? if profile not in [None, "default"]: args = {'profile': profile, "timeout": timeout} else: clusterargs = [cluster_id, profile, timeout] argnames = ["cluster_id", "profile", "timeout"] args = {key: value for key, value in zip(argnames, clusterargs)} ## get connection within timeout window of wait time and hide messages ipyclient = ipp.Client(**args) sys.stdout = save_stdout sys.stderr = save_stderr ## check that all engines have connected if (engines == "MPI") or ("ipyrad-cli-" in cluster_id): if not quiet: print(connection_string) for _ in range(6000): initid = len(ipyclient) time.sleep(0.01) ## If MPI then wait for all engines to start so we can report ## how many cores are on each host. If Local then only wait for ## one engine to be ready and then just go. if (engines == "MPI") or ("ipyrad-cli-" in cluster_id): ## wait for cores to be connected if cores: time.sleep(0.1) if initid == cores: break if initid: time.sleep(3) if len(ipyclient) == initid: break else: if cores: if initid == cores: break else: if initid: break except KeyboardInterrupt as inst: ## ensure stdout is reset even if Exception was raised sys.stdout = save_stdout sys.stderr = save_stderr raise inst ## This is raised if ipcluster is not running ------------ except IOError as inst: ## ensure stdout is reset even if Exception was raised sys.stdout = save_stdout sys.stderr = save_stderr if "ipyrad-cli-" in cluster_id: raise IPyradWarningExit(NO_IPCLUSTER_API) else: raise IPyradWarningExit(NO_IPCLUSTER_CLI) except (ipp.TimeoutError, ipp.NoEnginesRegistered) as inst: ## raised by ipp if no connection file is found for 'nwait' seconds sys.stdout = save_stdout sys.stderr = save_stderr raise inst except Exception as inst: ## if any other exceptions were missed... sys.stdout = save_stdout sys.stderr = save_stderr raise inst finally: ## ensure that no matter what we reset the stds sys.stdout = save_stdout sys.stderr = save_stderr return ipyclient
def _submit_jobs(self, force, ipyclient, name_fields, name_separator, dry_run): """ Download the accessions into a the designated workdir. If file already exists it will only be overwritten if force=True. Temporary files are removed. """ ## get Run data with default fields (1,4,6,30) df = self.fetch_runinfo(range(31), quiet=True) sys.stdout.flush() ## if not ipyclient then use multiprocessing if ipyclient: lb = ipyclient.load_balanced_view() ## if Run has samples with same name (replicates) then ## we need to include the accessions in the file names if name_fields: ## indexing requires -1 ints fields = [int(i) - 1 for i in fields_checker(name_fields)] ## make accession names, no spaces allowed df['Accession'] = pd.Series(df[df.columns[fields[0]]], index=df.index) for field in fields[1:]: df.Accession += name_separator + df[df.columns[field]] df.Accession = [i.replace(" ", "_") for i in df.Accession] ## check that names are unique if not df.Accession.shape[0] == df.Accession.unique().shape[0]: raise IPyradWarningExit("names are not unique:\n{}"\ .format(df.Accession)) ## backup default naming scheme else: if len(set(df.SampleName)) != len(df.SampleName): accs = (i + "-" + j for i, j in zip(df.SampleName, df.Run)) df.Accession = accs else: df.Accession = df.SampleName if dry_run: print("\rThe following files will be written to: {}".format( self.workdir)) print("{}\n".format(df.Accession)) else: ## iterate over and download asyncs = [] for idx in df.index: ## get args for this run srr = df.Run[idx] outname = df.Accession[idx] paired = df.spots_with_mates.values.astype( int).nonzero()[0].any() fpath = os.path.join(self.workdir, outname + ".fastq.gz") ## skip if exists and not force skip = False if force: if os.path.exists(fpath): os.remove(fpath) else: if os.path.exists(fpath): skip = True sys.stdout.flush() print("[skip] file already exists: {}".format(fpath)) ## single job progress bar tidx = df.Accession.shape[0] #if not ipyclient: ## submit job to run if not skip: args = (self, srr, outname, paired) if ipyclient: async = lb.apply_async(call_fastq_dump_on_SRRs, *args) asyncs.append(async) else: print("Downloading file {}/{}: {}".format( idx + 1, tidx, fpath)) call_fastq_dump_on_SRRs(*args) sys.stdout.flush() ## progress bar while blocking parallel if ipyclient: tots = df.Accession.shape[0] printstr = " Downloading fastq files | {} | " start = time.time() while 1: elapsed = datetime.timedelta(seconds=int(time.time() - start)) ready = sum([i.ready() for i in asyncs]) progressbar(tots, ready, printstr.format(elapsed), spacer="") time.sleep(0.1) if tots == ready: print("") break self._report(tots) ## check for fails for async in asyncs: if not async .successful(): raise IPyradWarningExit(async .result())
def main():
    """ main function """
    ## parse params file input (returns to stdout if --help or --version)
    args = parse_command_line()
    print(HEADER.format(ip.__version__))

    ## set random seed
    np.random.seed(args.rseed)
    random.seed(args.rseed)

    ## debugger----------------------------------------
    if os.path.exists(ip.__debugflag__):
        os.remove(ip.__debugflag__)
    if args.debug:
        print("\n  ** Enabling debug mode ** ")
        ip.debug_on()
        atexit.register(ip.debug_off)

    ## if JSON, load existing Tetrad analysis -----------------------
    if args.json:
        #data = ipa.tetrad.load_json(args.json)
        data = ipa.tetrad(name=args.name, workdir=args.workdir, load=True)

        ## if force then remove all results
        if args.force:
            data.refresh()

    ## else create a new tmp assembly for the seqarray-----------------
    else:
        ## create new Tetrad class Object if it doesn't exist
        newjson = os.path.join(args.workdir, args.name + '.tet.json')
        if (not os.path.exists(newjson)) or args.force:
            ## purge any files associated with this name if forced
            if args.force:
                ipa.tetrad(name=args.name,
                           workdir=args.workdir,
                           seqfile=args.seq,
                           initarr=False,
                           quiet=True).refresh()

            ## create new tetrad object
            data = ipa.tetrad(
                name=args.name,
                workdir=args.workdir,
                method=args.method,
                seqfile=args.seq,
                resolve=args.resolve,
                mapfile=args.map,
                guidetreefile=args.tree,
                nboots=args.boots,
                nquartets=args.nquartets,
                cli=True,
                )

            ## if not quiet...
            print("tetrad instance: {}".format(args.name))

        else:
            raise SystemExit(QUARTET_EXISTS\
                .format(args.name, args.workdir, args.workdir, args.name, args.name))

    ## boots can be set either for a new object or loaded JSON to continue it
    if args.boots:
        data.nboots = int(args.boots)

    ## set CLI ipcluster terms
    data._ipcluster["cores"] = args.cores if args.cores else detect_cpus()

    ## if more ipcluster args from command-line then use those
    if args.MPI:
        data._ipcluster["engines"] = "MPI"
        if not args.cores:
            raise IPyradWarningExit("must provide -c argument with --MPI")
    else:
        data._ipcluster["engines"] = "Local"

    ## launch a NEW ipcluster instance and register "cluster_id"
    ## for later destruction, and to avoid conflicts between
    ## simultaneous ipcluster instances. If a user wanted to use
    ## an ipcluster instance that is already running instead then
    ## they have to use the API, or to have set args.ipcluster
    if args.ipcluster:
        data._ipcluster["cluster_id"] = ""
    else:
        data = register_ipcluster(data)

    ## message about whether we are continuing from existing
    if data.checkpoint.boots or data.checkpoint.arr:
        print(ipa.tetrad.LOADING_MESSAGE.format(
            data.name, data.method, data.checkpoint.boots, data.checkpoint.arr))

    ## run tetrad main function within a wrapper. The wrapper creates an
    ## ipyclient view and appends to the list of arguments to run 'run'.
    data.run(force=args.force)
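## Roughly the API equivalent of `tetrad -s data.snps.phy -n test -b 100`.
## A sketch only: the parameter names follow the ipa.tetrad() constructor call
## used in main() above, and parallel setup is left to run()/register_ipcluster.
def _example_tetrad_api():
    import ipyrad.analysis as ipa
    data = ipa.tetrad(
        name="test",
        workdir="analysis-tetrad",
        method="all",
        seqfile="data.snps.phy",
        nboots=100,
        )
    data.run(force=True)
    return data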
def parse_command_line():
    """ Parse CLI args. """

    ## create the parser
    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
  * Example command-line usage ----------------------------------------------

  * Read in sequence/SNP data file, provide linkage, output name, ambig option.
     tetrad -s data.snps.phy -n test             ## input phylip and give name
     tetrad -s data.snps.phy -l data.snps.map    ## use one SNP per locus
     tetrad -s data.snps.phy -n noambigs -r 0    ## do not use hetero sites

  * Load saved/checkpointed analysis from '.tet.json' file, or force restart.
     tetrad -j test.tet.json -b 100         ## continue 'test' until 100 boots
     tetrad -j test.tet.json -b 100 -f      ## force restart of 'test'

  * Sampling modes: 'equal' uses guide tree to sample quartets more efficiently
     tetrad -s data.snps -m all                         ## sample all quartets
     tetrad -s data.snps -m random -q 1e6 -x 123        ## sample 1M randomly
     tetrad -s data.snps -m equal -q 1e6 -t guide.tre   ## sample 1M across tree

  * HPC optimization: Set -c to the total number of cores to improve efficiency
     tetrad -s data.phy -c 16               ## e.g., use 16 cores across 4 nodes

  * Documentation: http://ipyrad.readthedocs.org/en/latest/
    """)

    ## add arguments
    ## get version from ipyrad
    ipyversion = str(pkg_resources.get_distribution('ipyrad'))
    parser.add_argument('-v', '--version', action='version',
        version="tetrad " + ipyversion.split()[1])

    parser.add_argument('-f', "--force", action='store_true',
        help="force overwrite of existing data")

    #parser.add_argument('-q', "--quiet", action='store_true',
    #    help="do not print to stderror or stdout.")

    parser.add_argument('-s', metavar="seq", dest="seq",
        type=str, default=None,
        help="path to input phylip file (SNPs or full sequence file)")

    parser.add_argument('-j', metavar='json', dest="json",
        type=str, default=None,
        help="load checkpointed/saved analysis from JSON file.")

    parser.add_argument('-m', metavar="method", dest="method",
        type=str, default="all",
        help="method for sampling quartets (all, random, or equal)")

    parser.add_argument('-q', metavar="nquartets", dest="nquartets",
        type=int, default=0,
        help="number of quartets to sample (if not -m all)")

    parser.add_argument('-b', metavar="boots", dest="boots",
        type=int, default=0,
        help="number of non-parametric bootstrap replicates")

    parser.add_argument('-l', metavar="map_file", dest="map",
        type=str, default=None,
        help="map file of snp linkages (e.g., ipyrad .snps.map)")

    parser.add_argument('-r', metavar="resolve", dest='resolve',
        type=int, default=1,
        help="randomly resolve heterozygous sites (default=1)")

    parser.add_argument('-n', metavar="name", dest="name",
        type=str, default="test",
        help="output name prefix (default: 'test')")

    parser.add_argument('-o', metavar="workdir", dest="workdir",
        type=str, default="./analysis-tetrad",
        help="output directory (default: creates ./analysis-tetrad)")

    parser.add_argument('-t', metavar="starting_tree", dest="tree",
        type=str, default=None,
        help="newick file starting tree for equal splits sampling")

    parser.add_argument("-c", metavar="CPUs/cores", dest="cores",
        type=int, default=0,
        help="number of CPU cores to use (default=0 uses all detected cores)")

    parser.add_argument("-x", metavar="random_seed", dest="rseed",
        type=int, default=None,
        help="random seed for quartet sampling and/or bootstrapping")

    parser.add_argument('-d', "--debug", action='store_true',
        help="print lots more info to debugger: ipyrad_log.txt.")

    parser.add_argument("--MPI", action='store_true',
        help="connect to parallel CPUs across multiple nodes")

    parser.add_argument("--ipcluster", action='store_true',
        help="connect to ipcluster instance with profile=default")

    ## if no args then return help message
    if len(sys.argv) == 1:
        parser.print_help()
        sys.exit(1)

    ## parse args
    args = parser.parse_args()

    ## RAISE errors right away for some bad argument combinations:
    if args.method not in ["random", "equal", "all"]:
        raise IPyradWarningExit(
            "  method argument (-m) must be one of 'all', 'random', or 'equal'.\n")

    ## if 'random' require nquarts argument
    if args.method == 'random':
        if not args.nquartets:
            raise IPyradWarningExit(
                "  Number of quartets (-q) is required with method = random\n")

    ## if 'equal' method require starting tree and nquarts
    ## (currently disabled; the checks below apply once 'equal' is re-enabled)
    if args.method == 'equal':
        raise IPyradWarningExit(
            "  The equal sampling method is currently for developers only.\n")
        if not args.nquartets:
            raise IPyradWarningExit(
                "  Number of quartets (-q) is required with method = equal\n")
        if not args.tree:
            raise IPyradWarningExit(
                "  Input guide tree (-t) is required with method = equal\n")

    ## required args: at least one of -s or -j must be provided
    if not (args.seq or args.json):
        print("""
    Bad arguments: tetrad command must include at least one of (-s or -j)
    """)
        parser.print_help()
        sys.exit(1)

    return args
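## A tiny, hypothetical smoke test of the parser above. Faking sys.argv is
## only for illustration; in normal use argparse reads the real command line.
def _example_parse_args():
    import sys
    sys.argv = ["tetrad", "-s", "data.snps.phy", "-n", "test", "-b", "100"]
    args = parse_command_line()
    assert args.seq == "data.snps.phy"
    assert args.boots == 100 and args.method == "all"
    return args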
def run(self,
        force=False,
        ipyclient=None,
        name_fields=30,
        name_separator="_",
        dry_run=False):
    """
    Download the accessions into the designated workdir.

    Parameters
    ----------
    force: (bool)
        If force=True then existing files with the same name
        will be overwritten.

    ipyclient: (ipyparallel.Client)
        If provided, work will be distributed across a parallel
        client, otherwise download will be run on a single core.

    name_fields: (int, str):
        Provide the index of the name fields to be used as a prefix
        for fastq output files. The default is 30, which is the
        SampleName field. Use sra.fetch_fields to see all available
        fields and their indices. A likely alternative is 1 (Run).
        If multiple are listed then they will be joined by a "_"
        character. For example (29,30) would yield something like:
        latin-name_sample-name (e.g., mus_musculus_NR10123).

    dry_run: (bool)
        If True then a table of file names that _would_ be downloaded
        will be shown, but the actual files will not be downloaded.
    """
    ## temporarily set directory for tmpfiles used by fastq-dump
    ## if this fails then just skip it.
    try:
        ## ensure output directory, also used as tmpdir
        if not os.path.exists(self.workdir):
            os.makedirs(self.workdir)

        ## get original directory for sra files
        ## probably /home/ncbi/public/sra by default.
        self._set_vdbconfig_path()

        ## register ipyclient for cleanup
        if ipyclient:
            self._ipcluster["pids"] = {}
            for eid in ipyclient.ids:
                engine = ipyclient[eid]
                if not engine.outstanding:
                    pid = engine.apply(os.getpid).get()
                    self._ipcluster["pids"][eid] = pid

        ## submit jobs to engines or local
        self._submit_jobs(
            force=force,
            ipyclient=ipyclient,
            name_fields=name_fields,
            name_separator=name_separator,
            dry_run=dry_run,
            )

    except IPyradWarningExit as inst:
        print(inst)
    ## exceptions to catch, cleanup and handle ipyclient interrupts
    except KeyboardInterrupt:
        print("keyboard interrupt...")
    except Exception as inst:
        print("Exception in run() - {}".format(inst))
    finally:
        ## reset working sra path
        self._restore_vdbconfig_path()

        ## if it made a new sra directory then it should be empty when
        ## we are finished if all .sra files were removed. If so, then
        ## let's also remove the dir. If not empty, leave it.
        sradir = os.path.join(self.workdir, "sra")
        if os.path.exists(sradir) and (not os.listdir(sradir)):
            shutil.rmtree(sradir)
        else:
            ## print warning
            try:
                print(FAILED_DOWNLOAD.format(os.listdir(sradir)))
            except OSError as inst:
                ## If sra dir doesn't even exist something very bad is broken.
                raise IPyradWarningExit("Download failed. Exiting.")

            ## remove fastq file matching to cached sra file
            for srr in os.listdir(sradir):
                isrr = srr.split(".")[0]
                ipath = os.path.join(self.workdir, "*_{}*.gz".format(isrr))
                ifile = glob.glob(ipath)[0]
                if os.path.exists(ifile):
                    os.remove(ifile)

            ## remove cache of sra files
            shutil.rmtree(sradir)

        ## cleanup ipcluster shutdown
        if ipyclient:
            ## send SIGINT (2) to all engines still running tasks
            try:
                ipyclient.abort()
                time.sleep(0.5)
                for engine_id, pid in self._ipcluster["pids"].items():
                    if ipyclient.queue_status()[engine_id]["tasks"]:
                        os.kill(pid, 2)
                    time.sleep(0.1)
            except ipp.NoEnginesRegistered:
                pass

            ## clean memory space
            if not ipyclient.outstanding:
                ipyclient.purge_everything()
            ## uh oh, kill everything, something bad happened
            else:
                ipyclient.shutdown(hub=True, block=False)
                ipyclient.close()
                print("\nwarning: ipcluster was shut down and must be restarted")
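## A minimal usage sketch, not part of the class. The accession ID below is a
## placeholder, and the class is assumed to be exposed as ipa.sratools taking
## accession and workdir arguments; a running ipcluster is also assumed.
def _example_sra_download():
    import ipyparallel as ipp
    import ipyrad.analysis as ipa

    sra = ipa.sratools(accession="SRP065788", workdir="sra-fastqs")

    ## dry run first: show the file names that would be written
    sra.run(dry_run=True, name_fields=(1, 30))

    ## then download in parallel across the running ipcluster
    ipyclient = ipp.Client()
    sra.run(force=True, ipyclient=ipyclient, name_fields=(1, 30))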