def __init__(self, name=""): ## a sample name self.name = name self.barcode = "" self.merged = 0 ## stats dictionary self.stats = pd.Series(index=[ "state", "reads_raw", "reads_filtered", "refseq_mapped_reads", "refseq_unmapped_reads", "clusters_total", "clusters_kept", "hetero_est", "error_est", "reads_consens", ]) ## link to files self.files = ObjDict({ "fastqs": [], "edits": [], "mapped_reads": [], "unmapped_reads": [], "clusters": [], "depths": [], "consens": [], "database": [] }) ## store cluster depth information self.depths = ObjDict() self.depths.total = [] self.depths.mjmin = [] self.depths.statmin = [] ## assignments for hierarchical clustering self.group = []
class Assembly(object):
    """ An ipyrad Assembly class object.

    The core object in ipyrad used to store and retrieve results, to call
    assembly functions, and to link to Sample objects.

    Parameters
    ----------
    name : str
        A name should be passed when creating a new Assembly object. This
        name will be used as a prefix for all files saved to disk
        associated with this Assembly. It is automatically set as the
        prefix name (parameter 14).

    Attributes
    ----------
    name : str
        A name for the Assembly object. Used for all saved files on disk.
    samples : dict
        Returns a dict with Sample names as keys and Sample objects as
        values.
    barcodes : dict
        Returns a dictionary with Sample names as keys and barcodes as
        values. The barcodes information is fetched from parameter 3
        `[Assembly].paramsdict['barcodes_path']`.
    vsearch : str
        The path to the default vsearch executable. If not found, this can
        be changed by setting `[Assembly].vsearch = [newpath]`.
    muscle : str
        The path to the default muscle executable. If not found, this can
        be changed by setting `[Assembly].muscle = [newpath]`.
    smalt : str
        The path to the default smalt executable. If not found, this can
        be changed by setting `[Assembly].smalt = [newpath]`.
    samtools : str
        The path to the default samtools executable. If not found, this
        can be changed by setting `[Assembly].samtools = [newpath]`.
    log : list
        A list of all modifications to the Assembly object and its Samples
        with time stamps. Use `print [Assembly].log` for easier viewing.
    dirs : dict
        Returns a dictionary with the location of directories that contain
        linked Sample object files and stats results.

    Returns
    -------
    object
        A new Assembly object is returned.
    """

    def __init__(self, name, controller="Local"):
        ## obj name
        self.name = name
        print("New Assembly object `{}` created".format(self.name))

        ## launch ipcluster and register for later destruction
        self.__ipname__ = ipcontroller_init(controller)

        ## get binaries of dependencies
        self.vsearch, self.muscle, self.smalt, self.samtools = getbins()

        ## link a log history of executed workflow
        self.log = []
        self._stamp(self.name + " created")
        self.statsfiles = ObjDict()

        ## samples linked
        self.samples = ObjDict()

        ## multiplex files linked
        self.barcodes = ObjDict()

        ## an object for storing data directories for this Assembly
        self.dirs = ObjDict()

        ## the default params dict
        self.paramsdict = OrderedDict([
            ("working_directory", os.path.realpath(os.path.curdir)),
            ("raw_fastq_path", os.path.join(
                os.path.realpath(os.path.curdir), "*.fastq")),
            ("barcodes_path", os.path.join(
                os.path.realpath(os.path.curdir), "*.barcodes.txt")),
            ("sorted_fastq_path", ""),
            ("restriction_overhang", ("TGCAG", "")),
            ("max_low_qual_bases", 5),
            ("engines_per_job", 4),
            ("mindepth_statistical", 6),
            ("mindepth_majrule", 6),
            ("datatype", 'rad'),
            ("clust_threshold", .85),
            ("minsamp", 4),
            ("max_shared_heterozygosity", .25),
            ("prefix_outname", self.name),
            ("phred_Qscore_offset", 33),
            ("max_barcode_mismatch", 1),
            ("filter_adapters", 0),
            ("filter_min_trim_len", 35),
            ("ploidy", 2),
            ("max_stack_size", 1000),
            ("max_Ns_consens", (5, 5)),
            ("max_Hs_consens", (8, 8)),
            ("max_SNPs_locus", (100, 100)),
            ("max_Indels_locus", (5, 99)),
            ("trim_overhang", (1, 2, 2, 1)),
            ("hierarchical_clustering", 0),
            ("assembly_method", "denovo"),
            ("reference_sequence", "")
        ])

    def __str__(self):
        return "<ipyrad.Assembly object {}>".format(self.name)

    @property
    def stats(self):
        """ Returns a data frame with Sample data and state. """
        nameordered = self.samples.keys()
        nameordered.sort()
        return pd.DataFrame([self.samples[i].stats for i in nameordered],
                            index=nameordered).dropna(axis=1, how='all')
        #dtype=[int, int, int, int, int, float, float, int])

    @property
    def files(self):
        """ Returns a data frame with Sample files. Not very readable... """
        nameordered = self.samples.keys()
        nameordered.sort()
        ## replace curdir with . for shorter printing
        #fullcurdir = os.path.realpath(os.path.curdir)
        return pd.DataFrame([self.samples[i].files for i in nameordered],
                            index=nameordered).dropna(axis=1, how='all')

    def _stamp(self, event):
        """ Stamps an event into the log history. """
        tev = time.strftime("%m/%d/%y %H:%M:%S", time.gmtime())
        self.log.append((self.name, tev, event))

    def link_fastqs(self, path=None, merged=False, force=False,
                    append=False):
        """ Create Sample objects for samples in sorted_fastq_path.

        Note
        ----
        link_fastqs() is called automatically during step2() if no Samples
        are yet present in the Assembly object (data were not
        demultiplexed in step1()). It looks for demultiplexed data files
        located in the [sorted_fastq_path].

        Parameters
        ----------
        path : str
            Path to the fastq files to be linked to Sample objects. The
            default is to select all files in the 'sorted_fastq_path'.
            Alternatively a different path can be entered here.
        merged : bool
            Set to True if files represent first and second reads that
            were merged using some external software such as `PEAR` or
            `VSEARCH`.
        append : bool
            The default action is to overwrite fastq files linked to
            Samples if they already have linked files. Use append=True to
            instead append additional fastq files to a Sample (file names
            should be formatted the same as usual, e.g.,
            [name]_R1_[optional].fastq.gz).

        Returns
        -------
        str
            Prints the number of new Sample objects created and the number
            of fastq files linked to Sample objects in the Assembly
            object.
        """
        ## cannot both force and append at once
        if force and append:
            raise Exception("Cannot use force and append at the same time.")

        if self.samples and not (force or append):
            raise Exception("Files already linked to `{}`. ".format(self.name)
                + "Use force=True to replace all files, or append=True to "
                + "add additional files to existing Samples.")

        ## get path to data files
        if not path:
            path = self.paramsdict["sorted_fastq_path"]

        ## if a directory was given, select all files within it
        if os.path.isdir(path):
            path += "*"

        ## grab fastqs/fq/gzip
        fastqs = glob.glob(path)
        fastqs = [i for i in fastqs if i.endswith(".gz")
                  or i.endswith(".fastq")
                  or i.endswith(".fq")]
        ## sort alphabetical
        fastqs.sort()

        ## link pairs into tuples
        if 'pair' in self.paramsdict["datatype"]:
            ## check that names fit the paired naming convention
            r1_files = [i for i in fastqs if "_R1_" in i]
            r2_files = [i.replace("_R1_", "_R2_") for i in r1_files]
            if not any(["_R1_" in i for i in fastqs]) or \
                    (len(r1_files) != len(r2_files)):
                raise Exception("File name format error: paired file names "
                    + "must be identical except for _R1_ and _R2_ in "
                    + "their names.")
            fastqs = [(i, j) for i, j in zip(r1_files, r2_files)]

        ## data are not paired, create empty tuple pair
        else:
            if any(["_R2_" in i for i in fastqs]):
                print("Given the presence of '_R2_' in file names, this "
                    + "is a warning that if your data are paired-end you "
                    + "should set the Assembly object datatype to a paired "
                    + "type (e.g., pairddrad or pairgbs) prior to running "
                    + "link_fastqs().")
            fastqs = [(i, ) for i in fastqs]

        ## counters for the printed output
        created = 0
        linked = 0
        appended = 0
        for fastqtuple in list(fastqs):
            assert isinstance(fastqtuple, tuple), "fastqs not a tuple."
            ## local counters
            createdinc = 0
            linkedinc = 0
            appendinc = 0
            ## remove file extension from name
            sname = _name_from_file(fastqtuple[0])

            if sname not in self.samples:
                ## create new Sample
                self.samples[sname] = Sample(sname)
                self.samples[sname].stats.state = 1
                self.samples[sname].barcode = None
                self.samples[sname].files.fastqs.append(fastqtuple)
                createdinc += 1
                linkedinc += 1
            else:
                ## if not forcing, shouldn't be here with existing Samples
                if append:
                    if fastqtuple not in self.samples[sname].files.fastqs:
                        self.samples[sname].files.fastqs.append(fastqtuple)
                        appendinc += 1
                    else:
                        print("The files {} are already in Sample {}; "\
                              .format(fastqtuple, sname)
                            + "cannot append duplicate files to a "
                            + "Sample.\n")
                elif force:
                    ## create new Sample
                    self.samples[sname] = Sample(sname)
                    self.samples[sname].stats.state = 1
                    self.samples[sname].barcode = None
                    self.samples[sname].files.fastqs.append(fastqtuple)
                    createdinc += 1
                    linkedinc += 1
                else:
                    print("Files are already linked to Sample {}."\
                          .format(sname)
                        + " Use append=True to append additional files to "
                        + "a Sample, or force=True to replace all existing "
                        + "Samples.")

            ## record whether data were merged.
            if merged:
                self.samples[sname].merged = 1

            ## do not allow merged=False and .forward in file names
            if (not merged) and ('forward' in fastqtuple[0]):
                print("If R1 and R2 data are merged (e.g., with PEAR) "
                    + "use link_fastqs(merged=True) to indicate this. You "
                    + "may need force=True to overwrite existing files.\n")

            ## if fastqs already demultiplexed, try to link stats
            if any([linkedinc, createdinc, appendinc]):
                gzipped = bool(fastqtuple[0].endswith(".gz"))
                nreads = 0
                ## iterate over files if there are multiple
                for alltuples in self.samples[sname].files.fastqs:
                    nreads += bufcount(alltuples[0], gzipped)
                self.samples[sname].stats.reads_raw = nreads / 4
                created += createdinc
                linked += linkedinc
                appended += appendinc

        ## print if data were linked
        print("{} new Samples created in `{}`.".format(created, self.name))
        if linked:
            print("{} fastq files linked to {} new Samples.".\
                  format(linked, len(self.samples)))
        if appended:
            print("{} fastq files appended to {} existing Samples.".\
                  format(appended, len(self.samples)))

    def link_barcodes(self):
        """ Creates a self.barcodes object to save barcodes info as a
        dictionary, if there is a barcodes file in
        self.paramsdict["barcodes_path"]. """
        ## in case fuzzy selected
        try:
            barcodefile = glob.glob(self.paramsdict["barcodes_path"])[0]
        except IndexError:
            print("Barcodes file not found: {}".format(
                  self.paramsdict["barcodes_path"]))
            return

        ## parse barcodefile
        bdf = pd.read_csv(barcodefile, header=None, delim_whitespace=1)
        bdf = bdf.dropna()
        ## make sure upper case
        bdf[1] = bdf[1].str.upper()
        ## set attribute on Assembly object
        self.barcodes = dict(zip(bdf[0], bdf[1]))

        # ## for each barcode create a Sample
        # for key in self.barcodes:
        #     samp = Sample(key)
        #     samp.state = 0
        #     samp.barcode = self.barcodes[key]
        #     if samp not in self.samples:
        #         self.samples[samp.name] = samp

    def get_params(self, param=""):
        """ Pretty prints params if called as a function. """
        fullcurdir = os.path.realpath(os.path.curdir)
        if not param:
            for index, (key, value) in enumerate(self.paramsdict.items()):
                if isinstance(value, str):
                    value = value.replace(fullcurdir, ".")
                sys.stdout.write("  {:<4}{:<28}{:<45}\n".format(
                    index + 1, key, value))
        else:
            try:
                if int(param):
                    #sys.stdout.write(self.paramsdict.values()[int(param)-1])
                    return self.paramsdict.values()[int(param) - 1]
            except (ValueError, TypeError, NameError, IndexError):
                return 'key not recognized'

    #def save(self, name=""):
    #    if not name:
    #        print("must enter a filename for saved object")
    #    else:
    #        json.dumps(self)

    def set_params(self, param, newvalue):
        """ Set a parameter to a new value. Raises an error if newvalue is
        the wrong type.

        Note
        ----
        Use `[Assembly].get_params()` to see the parameter values currently
        linked to the Assembly object.

        Parameters
        ----------
        param : int or str
            The index (e.g., 1) or string name (e.g., "working_directory")
            for the parameter that will be changed.
        newvalue : int, str, or tuple
            The new value for the parameter selected for `param`. Use
            `ipyrad.get_params_info()` to get further information about a
            given parameter. If the wrong type is entered for newvalue
            (e.g., a str when it should be an int), an error will be
            raised. Further information about each parameter is also
            available in the documentation.

        Examples
        --------
        ## param 1 takes only a str as input
        [Assembly].set_params(1, 'new_directory')
        [Assembly].set_params('working_directory', 'new_directory')

        ## param 5 must be a tuple or str; if str it is converted to a
        ## tuple with the second entry empty.
        [Assembly].set_params(5, 'TGCAG')
        [Assembly].set_params('restriction_overhang', ('CTGCAG', 'CCGG'))

        ## param 13 can be an int or a float:
        [Assembly].set_params(13, 4)
        [Assembly].set_params('max_shared_heterozygosity', 0.25)
        """
        ## require parameter recognition
        assert (param in range(50)) or \
               (param in [str(i) for i in range(50)]) or \
               (param in self.paramsdict.keys()), \
            "Parameter key not recognized: `{}`.".format(param)

        ## make string
        param = str(param)

        ## if matching
        if param in ['1', 'working_directory']:
            self.paramsdict['working_directory'] = expander(newvalue)
            self._stamp("[1] set to " + newvalue)
            self.dirs["working"] = self.paramsdict["working_directory"]

        elif param in ['2', 'raw_fastq_path']:
            fullrawpath = expander(newvalue)
            if os.path.isdir(fullrawpath):
                fullrawpath = os.path.join(fullrawpath, "*.gz")
            self.paramsdict['raw_fastq_path'] = fullrawpath
            self._stamp("[2] set to " + newvalue)
            #if not self.paramsdict["raw_fastq_path"]:
            self.dirs["fastqs"] = os.path.dirname(
                self.paramsdict["raw_fastq_path"])

        elif param in ['3', 'barcodes_path']:
            #assert type(newvalue) is StringType, "arg must be a string"
            fullbarpath = expander(newvalue)
            if glob.glob(fullbarpath):
                self.paramsdict['barcodes_path'] = fullbarpath
                self.link_barcodes()
                self._stamp("[3] set to " + newvalue)
            elif not fullbarpath:
                self.paramsdict['barcodes_path'] = fullbarpath
                self._stamp("[3] set to empty")
            else:
                print('cannot find barcodes file')

        elif param in ['4', 'sorted_fastq_path']:
            assert isinstance(newvalue, str), \
                "sorted_fastq_path must be a string, e.g., " \
                "/home/data/fastqs/*"
            newvalue = expander(newvalue)
            if os.path.isdir(newvalue):
                newvalue = os.path.join(newvalue, "*.gz")
            self.paramsdict['sorted_fastq_path'] = newvalue
            ## link_fastqs will check that files exist
            #self.link_fastqs()
            self._stamp("[4] set to " + newvalue)
            self.dirs["fastqs"] = os.path.dirname(
                self.paramsdict["sorted_fastq_path"])

        elif param in ['5', 'restriction_overhang']:
            newvalue = tuplecheck(newvalue, str)
            assert isinstance(newvalue, tuple), \
                "cut site must be a tuple, e.g., (TGCAG, '') or " \
                "(TGCAG, CCGG)"
            self.paramsdict['restriction_overhang'] = newvalue
            self._stamp("[5] set to " + str(newvalue))

        elif param in ['6', 'max_low_qual_bases']:
            self.paramsdict['max_low_qual_bases'] = int(newvalue)
            self._stamp("[6] set to " + str(newvalue))

        elif param in ['7', 'engines_per_job']:
            self.paramsdict['engines_per_job'] = int(newvalue)
            self._stamp("[7] set to " + str(newvalue))

        elif param in ['8', 'mindepth_statistical']:
            ## do not allow values below 5
            if int(newvalue) < 5:
                print("error: mindepth_statistical cannot be set < 5")
            ## do not allow statistical to be less than majrule
            elif int(newvalue) < self.paramsdict["mindepth_majrule"]:
                print("error: mindepth_statistical cannot be less than "
                      "mindepth_majrule")
            else:
                self.paramsdict['mindepth_statistical'] = int(newvalue)
                self._stamp("[8] set to " + str(newvalue))

        elif param in ['9', 'mindepth_majrule']:
            if int(newvalue) > self.paramsdict["mindepth_statistical"]:
                print("error: mindepth_majrule cannot be greater than "
                      "mindepth_statistical")
            else:
                self.paramsdict['mindepth_majrule'] = int(newvalue)
                self._stamp("[9] set to " + str(newvalue))

        elif param in ['10', 'datatype']:
            ## list of allowed datatypes
            datatypes = ['rad', 'gbs', 'ddrad', 'pairddrad',
                         'pairgbs', 'merged', '2brad']
            ## check the new value, not the current setting
            if str(newvalue) not in datatypes:
                print("error: datatype not recognized")
            else:
                self.paramsdict['datatype'] = str(newvalue)
                self._stamp("[10] set to " + newvalue)

        elif param in ['11', 'clust_threshold']:
            self.paramsdict['clust_threshold'] = float(newvalue)
            self._stamp("[11] set to {}".format(newvalue))

        elif param in ['12', 'minsamp']:
            self.paramsdict['minsamp'] = int(newvalue)
            self._stamp("[12] set to {}".format(int(newvalue)))

        elif param in ['13', 'max_shared_heterozygosity']:
            self.paramsdict['max_shared_heterozygosity'] = newvalue
            self._stamp("[13] set to {}".format(newvalue))

        elif param in ['14', 'prefix_outname']:
            self.paramsdict['prefix_outname'] = newvalue
            self._stamp("[14] set to {}".format(newvalue))

        elif param in ['15', 'phred_Qscore_offset']:
            self.paramsdict['phred_Qscore_offset'] = int(newvalue)
            self._stamp("[15] set to {}".format(int(newvalue)))

        elif param in ['16', 'max_barcode_mismatch']:
            self.paramsdict['max_barcode_mismatch'] = int(newvalue)
            self._stamp("[16] set to {}".format(int(newvalue)))

        ### ....
        elif param in ['17', 'filter_adapters']:
            self.paramsdict['filter_adapters'] = int(newvalue)
            self._stamp("[17] set to " + str(newvalue))

        elif param in ['18', 'filter_min_trim_len']:
            self.paramsdict['filter_min_trim_len'] = int(newvalue)
            self._stamp("[18] set to {}".format(int(newvalue)))

        elif param in ['19', 'ploidy']:
            self.paramsdict['ploidy'] = int(newvalue)
            self._stamp("[19] set to {}".format(int(newvalue)))

        elif param in ['20', 'max_stack_size']:
            self.paramsdict['max_stack_size'] = int(newvalue)
            self._stamp("[20] set to {}".format(int(newvalue)))

        elif param in ['21', 'max_Ns_consens']:
            newvalue = tuplecheck(newvalue)
            assert isinstance(newvalue, tuple), \
                "max_Ns_consens should be a tuple, e.g., (5, 5)"
            self.paramsdict['max_Ns_consens'] = newvalue
            self._stamp("[21] set to {}".format(newvalue))

        elif param in ['22', 'max_Hs_consens']:
            newvalue = tuplecheck(newvalue)
            assert isinstance(newvalue, tuple), \
                "max_Hs_consens should be a tuple, e.g., (8, 8)"
            self.paramsdict['max_Hs_consens'] = newvalue
            self._stamp("[22] set to {}".format(newvalue))

        elif param in ['23', 'max_SNPs_locus']:
            newvalue = tuplecheck(newvalue)
            assert isinstance(newvalue, tuple), \
                "max_SNPs_locus should be a tuple, e.g., (20, 20)"
            self.paramsdict['max_SNPs_locus'] = newvalue
            self._stamp("[23] set to {}".format(newvalue))

        elif param in ['24', 'max_Indels_locus']:
            newvalue = tuplecheck(newvalue)
            assert isinstance(newvalue, tuple), \
                "max_Indels_locus should be a tuple, e.g., (5, 100)"
            self.paramsdict['max_Indels_locus'] = newvalue
            self._stamp("[24] set to {}".format(newvalue))

        elif param in ['25', 'trim_overhang']:
            newvalue = tuplecheck(newvalue)
            assert isinstance(newvalue, tuple), \
                "trim_overhang should be a tuple, e.g., (1, 2, 2, 1)"
            self.paramsdict['trim_overhang'] = newvalue
            self._stamp("[25] set to {}".format(newvalue))

        elif param in ['27', 'assembly_method']:
            ## validate before storing the new value
            assert newvalue in ["denovo", "reference", "hybrid"], \
                "The assembly_method option must be one of the following: "+\
                "denovo, reference, or hybrid."
            self.paramsdict['assembly_method'] = newvalue
            LOGGER.info("assembly method set to %s", newvalue)
            self._stamp("[27] set to {}".format(newvalue))

        elif param in ['28', 'reference_sequence']:
            fullrawpath = expander(newvalue)
            if not os.path.isfile(fullrawpath):
                raise Exception(
                    "Reference sequence file not found. This must be an "
                    + "absolute path (/home/wat/ipyrad/data/reference.gz) "
                    + "or a path relative to the directory where you're "
                    + "running ipyrad (./data/reference.gz).")
            self.paramsdict['reference_sequence'] = fullrawpath
            self._stamp("[28] set to " + fullrawpath)

    def copy(self, newname):
        """ Returns a copy of the Assembly object. Does not allow Assembly
        object names to be replicated in namespace or path. """
        if (newname == self.name) or \
           (os.path.exists(newname + ".assembly")):
            print("Assembly object named {} already exists".format(newname))
        else:
            ## create a copy of the Assembly obj
            newobj = copy.deepcopy(self)
            newobj.name = newname
            newobj.set_params(14, newname)

            ## create copies of each Sample obj
            for sample in self.samples:
                newobj.samples[sample] = copy.deepcopy(self.samples[sample])
            return newobj

    def file_tree(self):
        """ Prints the project data structure. TODO: this needs work;
        prints way too much other junk if [work] is home dir. """
        startpath = self.paramsdict["working_directory"]
        if startpath in [".", "", "./", os.path.expanduser(startpath)]:
            print("./")
        else:
            for root, _, files in os.walk(startpath):
                level = root.replace(startpath, '').count(os.sep)
                indent = ' ' * 4 * level
                print('{}{}/'.format(indent, os.path.basename(root)))
                subindent = ' ' * 4 * (level + 1)
                for fname in files:
                    print('{}{}'.format(subindent, fname))

    def _save(self):
        """ Pickle the Assembly object. Could be used for checkpointing
        before and after assembly steps. Currently it is called after
        assembly steps. """
        dillout = open(os.path.join(
            self.paramsdict["working_directory"],
            self.name + ".assembly"), "wb")
        dill.dump(self, dillout)
        dillout.close()

    def step1(self, preview=0):
        """ Step 1: demultiplex raw reads. """
        ## launch parallel client before the guarded statement so the
        ## finally clause can always reach it
        ipyclient = ipp.Client(cluster_id=self.__ipname__)
        try:
            if not self.samples:
                assemble.demultiplex.run(self, preview, ipyclient)
                self._stamp("s1_demultiplexing:")
            else:
                print("Samples already found in `{}`. ".format(self.name)
                    + "Use ip.merge() to combine samples\nfrom multiple "
                    + "Assembly objects.\n")
        except (KeyboardInterrupt, SystemExit, AttributeError):
            logging.error("assembly step1 interrupted.")
            raise
        ## close client when done or if interrupted
        finally:
            ipyclient.shutdown(block=1)
            ipyclient.close()

        ## pickle the data obj
        self._save()

    ## TODO: make a step Class object
    def step2(self, samples="", preview=0, force=False):
        """ Step 2: edit raw reads. Takes dictionary keys (sample names)
        either individually, or as a list, or it takes no argument to
        select all samples in the Assembly object. Only samples in state
        =1 will be edited; all others are skipped. To overwrite data use
        the argument force=True.
        """
        ## launch parallel client within guarded statement
        ipyclient = ipp.Client(cluster_id=self.__ipname__)
        try:
            if samples:
                ## if sample key, replace with sample obj
                assert isinstance(samples, list), \
                    "to subselect samples enter as a list, e.g., [A, B]."
                for sample in samples:
                    ## get sample from dict key
                    sample = self.samples[sample]
                    assemble.rawedit.run(self, sample, ipyclient, force)
            else:
                ## TODO: Remove return of client
                assert self.samples, "No Samples in " + self.name
                for _, sample in self.samples.items():
                    assemble.rawedit.run(self, sample, ipyclient, force)

        except (KeyboardInterrupt, AttributeError, SystemExit):
            LOGGER.error("assembly step2 interrupted!")
            raise
        ## close parallel client if done or interrupted
        finally:
            logging.info("assembly step2 cleaning up.")
            ipyclient.shutdown(block=1)
            ipyclient.close()

        ## checkpoint the data obj
        self._save()

    def step3(self, samples=None, preview=0, noreverse=0, force=False):
        """ Step 3: clustering within samples. """
        ## require a reference sequence for reference-based methods
        if self.paramsdict['assembly_method'] != "denovo":
            assert self.paramsdict['reference_sequence'], \
                "Reference or hybrid assembly requires a value for the "+\
                "reference_sequence parameter."
            ## index the reference sequence
            index_reference_sequence(self)

        ## launch parallel client
        ipyclient = ipp.Client(cluster_id=self.__ipname__)

        try:
            ## sampling
            if samples:
                ## if string make a list(tuple)
                assert isinstance(samples, list), \
                    "to subselect samples enter as a list, e.g., [A, B]."
                ## make into a tuple list with (key, sample);
                ## allows for names as keys or Sample objects
                subsamples = []
                for sample in samples:
                    if self.samples.get(sample):
                        subsamples.append((sample, self.samples[sample]))

                if subsamples:
                    print("Clustering {} samples using {} engines per job."\
                          .format(len(samples),
                                  self.paramsdict["engines_per_job"]))
                    ## run
                    assemble.cluster_within.run(self, subsamples, ipyclient,
                                                preview, noreverse, force)
                else:
                    print("No samples found. Check that names are correct.")
            else:
                ## if no samples selected and no samples exist
                assert self.samples, \
                    "no Samples found in {}".format(self.name)

                ## print to screen
                print("clustering {} samples using {} engines per job"\
                      .format(len(self.samples),
                              self.paramsdict["engines_per_job"]))
                ## run
                assemble.cluster_within.run(self, self.samples.items(),
                                            ipyclient, preview, noreverse,
                                            force)

        except (KeyboardInterrupt, SystemExit):
            print("assembly step3 interrupted")
            raise
        ## close parallel client if done or interrupted
        finally:
            ipyclient.close()
            if preview:
                print(".")

        ## pickle the data object
        self._save()

    def step4(self, samples=None, preview=0, force=False, subsample=None):
        """ Step 4: joint estimation of error rate and heterozygosity.
        If you want to overwrite data for a file, first set its state
        to 3: data.samples['sample'].stats['state'] = 3
        """
        ## launch parallel client
        ipyclient = ipp.Client(cluster_id=self.__ipname__)

        try:
            ## sampling
            if samples:
                ## make a list of keys or samples
                if isinstance(samples, str):
                    samples = list([samples])
                else:
                    samples = list(samples)

                ## if keys are in list make into a subsampled sample dict
                if any([isinstance(i, str) for i in samples]):
                    subsamples = {i: self.samples[i] for i in samples}
                else:
                    ## Sample objects were passed directly
                    subsamples = {i.name: i for i in samples}

                ## send to function
                assemble.jointestimate.run(self, subsamples.values(),
                                           ipyclient, force, subsample)
            else:
                ## if no sample, then do all samples
                if not self.samples:
                    ## if no samples in data, try linking edits from
                    ## working dir
                    #self.link_clustfiles()
                    if not self.samples:
                        print("Assembly object has no samples in state 3.")
                ## run joint estimation for all samples
                assemble.jointestimate.run(self, self.samples.values(),
                                           ipyclient, force, subsample)

        except (KeyboardInterrupt, SystemExit):
            print("assembly step4 interrupted")
            raise
        ## close parallel client if done or interrupted
        finally:
            ipyclient.close()
            if preview:
                print(".")

        ## pickle the data object
        self._save()

    def step5(self, samples="", preview=0):
        """ Step 5: consensus base calling from clusters within samples.
        If you want to overwrite data for a file, first set its state to
        3 or 4, e.g., data.samples['sample'].stats['state'] = 3
        """
        ## sampling
        if samples:
            ## make a list of keys or samples
            if isinstance(samples, str):
                samples = list([samples])
            else:
                samples = list(samples)

            ## if keys are in list make into a subsampled sample dict
            if any([isinstance(i, str) for i in samples]):
                subsamples = {i: self.samples[i] for i in samples}
            else:
                ## Sample objects were passed directly
                subsamples = {i.name: i for i in samples}

            ## send to function
            assemble.consens_se.run(self, subsamples.values())
        else:
            ## if no sample, then do all samples
            if not self.samples:
                ## if no samples in data, try linking edits from working dir
                #self.link_clustfiles()
                if not self.samples:
                    print("Assembly object has no samples in state=3")
            ## run consensus calling for all samples
            assemble.consens_se.run(self, self.samples.values())

        ## pickle the data object
        self._save()

    def run(self, steps=0, force=False, preview=False):
        """ Select steps of an analysis. If no steps are entered then all
        steps are run. Enter steps as a string, e.g., "1", "123", "12345".
        """
        if not steps:
            steps = "123457"
        else:
            steps = str(steps)

        if '1' in steps:
            self.step1(preview=preview)
        if '2' in steps:
            self.step2(force=force, preview=preview)
        if '3' in steps:
            self.step3(force=force, preview=preview)
        if '4' in steps:
            self.step4(force=force, preview=preview)
        if '5' in steps:
            self.step5(preview=preview)
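## A minimal end-to-end sketch (an addition, not part of the original
## module): the interactive workflow the Assembly API above supports.
## The assembly name and all paths here are hypothetical.
def _example_assembly_workflow():
    data = Assembly("test")
    data.set_params(1, "./test_wd")            ## working_directory
    data.set_params(2, "./data/*.fastq.gz")    ## raw_fastq_path
    data.set_params(3, "./data/barcodes.txt")  ## barcodes_path
    data.get_params()      ## pretty-print the current settings
    data.run("12")         ## demultiplex (step1), then edit reads (step2)
    print(data.stats)      ## per-Sample stats as a pandas DataFrame
    return data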
def consensus(args):
    """ from a clust file handle, reads in all copies at a locus, sorts
    bases at each site, tests for errors at the site according to the
    error rate, and calls the consensus sequence. """

    ## unpack args
    data, sample, tmpchunk, point = args

    ## read in cluster file 2 lines at a time
    infile = gzip.open(tmpchunk)  #sample.files["clusters"])
    duo = itertools.izip(*[iter(infile)]*2)

    ## store read depth info for later output files
    datadict = {}

    ## counters
    locus = 0
    minsamp_filtered = 0
    nheteros = 0

    ## iterate over clusters
    while 1:
        try:
            first = duo.next()
        except StopIteration:
            break
        itera = [first[0], first[1]]
        fname = itera[0].split(";")[0]

        ## local containers and counters for this locus
        locus += 1    ## recording n loci
        sloc = []     ## list for sequence data
        nloc = []     ## list for names used for gbs filters

        ## grab seqs until end of cluster
        while itera[0] != "//\n":
            ## append sequence * number of dereps
            nreps = int(itera[0].split(";")[-2].split("=")[1])
            for _ in xrange(nreps):
                sloc.append(tuple(itera[1].strip()))
                nloc.append(itera[0])
            ## move on to the next sequence
            itera = duo.next()

        ## now that all seqs in this loc are read in,
        ## check that none overlap the leftjust overhang if gbs
        if data.paramsdict["datatype"] in ['gbs', 'merged']:
            ## TODO: test these new changes to gbs filter
            ## edge filters
            leftjust = rightjust = None
            rights = []

            ## get leftjust and rights
            for i, j in zip(nloc, sloc):
                leftjust, rights = gbs_edgefilter(i, j, leftjust, rights)

            if rights:
                ## record in name that there was a reverse hit
                fname = fname[:-2]+"c1"
                try:
                    rightjust = min([min(i) for i in rights])
                except ValueError:
                    sloc = ""

            for seq in xrange(len(sloc)):
                sloc[seq] = sloc[seq][leftjust:]
                if rightjust:
                    sloc[seq] = sloc[seq][:rightjust+1]

        ## apply depth filter
        if (len(sloc) >= data.paramsdict["mindepth_majrule"]) and \
           (len(sloc) <= data.paramsdict["max_stack_size"]):

            ## this loc passed the mindepth filter
            minsamp_filtered += 1

            ## get stacks of bases at each site
            arrayed = numpy.array(sloc)
            stacked = [Counter(seq) for seq in arrayed.T]

            ## apply functions to the list of sites in stacked:
            ## filter by site for paralogs and make consens calls
            consens = [filter2(data, site) for site in stacked]

            ## filter by locus for paralogs
            if "@" not in consens:
                ## get hetero sites
                heteros = [i[0] for i in enumerate(consens) \
                           if i[1] in list("RKSYWM")]

                ## filter for max number of hetero sites. NB: the
                ## max_Hs_consens param is a tuple, so the first value
                ## is used here (assumes single-end data)
                exceedmaxploid = 0
                if len(heteros) <= data.paramsdict["max_Hs_consens"][0]:
                    ## filter for more than x alleles given ploidy. Only
                    ## relevant if locus is polymorphic at more than one site
                    if len(heteros) > 1:
                        consens, exceedmaxploid = filter3(data, consens,
                                                          heteros, sloc)

                    ## if the locus passed paralog filtering
                    if not exceedmaxploid:
                        consens = "".join(consens).replace("-", "N")

                        ## if a site is stripped then remove the site
                        ## from the site counter (stacked) as well
                        shortconl = consens.lstrip("N")
                        if len(shortconl) < len(consens):
                            stacked = stacked[-len(shortconl):]
                        shortcon = consens.rstrip("N")
                        if len(shortcon) < len(shortconl):
                            stacked = stacked[:len(shortcon)]

                        ## removes low coverage sites next to poly repeats
                        ## that are likely sequencing errors; also edits
                        ## 'stacked'
                        shortcon, stacked = removerepeat_Ns(shortcon, stacked)

                        ## only allow maxN internal "N"s in a locus. NB:
                        ## max_Ns_consens is a tuple; the first value is
                        ## used here (assumes single-end data)
                        if shortcon.count("N") <= int(
                                data.paramsdict["max_Ns_consens"][0]):
                            ## minimum length for clustering in vsearch
                            if len(shortcon) >= 32:
                                ## keep for counter
                                nheteros += len(heteros)

                                ## create dataobj w/ name fname to store
                                ## the consens seq and site depth data
                                dataobj = ObjDict()
                                dataobj.seq = shortcon
                                dataobj.Cs = [i["C"] for i in stacked]
                                dataobj.As = [i["A"] for i in stacked]
                                dataobj.Ts = [i["T"] for i in stacked]
                                dataobj.Gs = [i["G"] for i in stacked]

                                tag = "_".join(fname.split("_")[-2:])
                                datadict[tag] = dataobj
                        else:
                            pass  #print "maxN filtered loc", locus
                    else:
                        pass  #print "ploid filtered loc", locus
                else:
                    pass  #print "maxH filtered loc", locus
            else:
                pass  #print "third base filtered loc", locus
        else:
            pass  #print "mindepth filtered loc", locus

    infile.close()

    ## set and create the consens output directory
    data.dirs.consens = os.path.join(data.dirs.clusts, "consens")
    if not os.path.exists(data.dirs.consens):
        os.mkdir(data.dirs.consens)

    ## get output filename; the original left this blank, so the name
    ## used here is an assumption based on the commented-out hint
    consenshandle = os.path.join(data.dirs.consens,
                                 sample.name+"_consens.gz")

    ## write consens seqs to file
    with gzip.open(consenshandle, 'wb') as outfile:
        outfile.write("\n".join([">"+sample.name+"_"+obj+"\n"+\
                                 datadict[obj].seq for obj in datadict]))

    ## get the number of sites to subtract per locus (the restriction
    ## overhang, plus a 4-base separator in paired data) before
    ## counting polymorphic sites
    if 'ddrad' in data.paramsdict["datatype"]:
        if 'pair' in data.paramsdict["datatype"]:
            sub = 4 + len(data.paramsdict["restriction_overhang"][0])
        else:
            sub = len(data.paramsdict["restriction_overhang"][0])
    elif 'gbs' in data.paramsdict["datatype"]:
        if 'pair' in data.paramsdict["datatype"]:
            sub = 4 + len(data.paramsdict["restriction_overhang"][0])*2
        else:
            sub = len(data.paramsdict["restriction_overhang"][0])
    elif data.paramsdict["datatype"] == "merged":
        sub = len(data.paramsdict["restriction_overhang"][0])*2
    else:
        sub = len(data.paramsdict["restriction_overhang"][0])

    ## count the proportion of polymorphic sites
    nsites = sum([len(datadict[i].seq)-sub for i in datadict])
    ldic = len(datadict)
    try:
        poly = nheteros/float(nsites)
    except ZeroDivisionError:
        poly = 0.

    ## TODO: dump the quality score and depth info into a pickle
    #pickleout = gzip.open(handle.replace("clustS.gz", "bindata"), 'wb')
    #pickle.dump(datadict, pickleout)
    #pickleout.close()

    return [sample.name, locus, minsamp_filtered, ldic,
            nsites, nheteros, poly]
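
## ----------------------------------------------------------------------
## A self-contained sketch (illustrative only, not part of this module)
## of the site-stacking step used in consensus() above: reads in a
## cluster become rows of a numpy character array, and each column
## (site) is summarized as a Counter of base counts, which the
## downstream filters use to make consensus base calls.
import numpy
from collections import Counter

def stack_sites(reads):
    """ return one Counter of base counts per aligned site """
    arrayed = numpy.array([tuple(r) for r in reads])
    return [Counter(site) for site in arrayed.T]

## e.g., stack_sites(["ACGT", "ACGT", "ACGA"])[-1]
## -> Counter({'T': 2, 'A': 1}); a site with two well-supported bases
## would be called as an IUPAC ambiguity code (e.g., W for A/T).
## ----------------------------------------------------------------------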