## module-level imports assumed by the methods below; ObjDict,
## ipcontroller_init, getbins, filter2, filter3, gbs_edgefilter, and
## removerepeat_Ns are defined elsewhere in this package
import os
import gzip
import itertools
from collections import Counter, OrderedDict

import numpy
import pandas as pd


def __init__(self, name, controller="Local"):
    ## obj name
    self.name = name
    print("New Assembly object `{}` created".format(self.name))

    ## launch ipcluster and register for later destruction
    self.__ipname__ = ipcontroller_init(controller)

    ## get binaries of dependencies
    self.vsearch, self.muscle, self.smalt, self.samtools = getbins()

    ## link a log history of executed workflow
    self.log = []
    self._stamp(self.name + " created")
    self.statsfiles = ObjDict()

    ## samples linked
    self.samples = ObjDict()

    ## multiplex files linked
    self.barcodes = ObjDict()

    ## an object for storing data directories for this Assembly
    self.dirs = ObjDict()

    ## the default params dict
    self.paramsdict = OrderedDict([
        ("working_directory", os.path.realpath(os.path.curdir)),
        ("raw_fastq_path", os.path.join(os.path.realpath(os.path.curdir),
                                        "*.fastq")),
        ("barcodes_path", os.path.join(os.path.realpath(os.path.curdir),
                                       "*.barcodes.txt")),
        ("sorted_fastq_path", ""),
        ("restriction_overhang", ("TGCAG", "")),
        ("max_low_qual_bases", 5),
        ("engines_per_job", 4),
        ("mindepth_statistical", 6),
        ("mindepth_majrule", 6),
        ("datatype", 'rad'),
        ("clust_threshold", .85),
        ("minsamp", 4),
        ("max_shared_heterozygosity", .25),
        ("prefix_outname", self.name),
        ("phred_Qscore_offset", 33),
        ("max_barcode_mismatch", 1),
        ("filter_adapters", 0),
        ("filter_min_trim_len", 35),
        ("ploidy", 2),
        ("max_stack_size", 1000),
        ("max_Ns_consens", (5, 5)),
        ("max_Hs_consens", (8, 8)),
        ("max_SNPs_locus", (100, 100)),
        ("max_Indels_locus", (5, 99)),
        ("trim_overhang", (1, 2, 2, 1)),
        ("hierarchical_clustering", 0),
        ("assembly_method", "denovo"),
        ("reference_sequence", "")
    ])
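## --------------------------------------------------------------------
## Hedged usage sketch (not part of the class): how an Assembly object
## might be created and its params adjusted. It assumes this __init__
## belongs to the Assembly class named in the print statement above,
## that ipcontroller_init can launch a "Local" ipcluster, and that
## getbins() finds the vsearch, muscle, smalt, and samtools binaries.
## The param values below are arbitrary examples, not recommendations.
## --------------------------------------------------------------------
def _demo_assembly_usage():
    """ toy example: build an Assembly and tweak a few params """
    data1 = Assembly("data1")
    data1.paramsdict["working_directory"] = "./testdata"
    data1.paramsdict["clust_threshold"] = 0.90
    data1.paramsdict["mindepth_majrule"] = 8
    return data1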
def __init__(self, name=""): ## a sample name self.name = name self.barcode = "" self.merged = 0 ## stats dictionary self.stats = pd.Series(index=[ "state", "reads_raw", "reads_filtered", "refseq_mapped_reads", "refseq_unmapped_reads", "clusters_total", "clusters_kept", "hetero_est", "error_est", "reads_consens", ]) ## link to files self.files = ObjDict({ "fastqs": [], "edits": [], "mapped_reads": [], "unmapped_reads": [], "clusters": [], "depths": [], "consens": [], "database": [] }) ## store cluster depth information self.depths = ObjDict() self.depths.total = [] self.depths.mjmin = [] self.depths.statmin = [] ## assignments for hierarchical clustering self.group = []
def consensus(args):
    """
    From a clust file handle, reads in all copies at a locus, stacks the
    bases at each site, tests each site for sequencing error according
    to the estimated error rate, and calls the consensus sequence.
    """
    ## unpack args
    data, sample, tmpchunk, point = args

    ## read in cluster file two lines at a time
    infile = gzip.open(tmpchunk)
    duo = itertools.izip(*[iter(infile)] * 2)

    ## store read depth info for later output files
    datadict = {}

    ## counters
    locus = 0
    minsamp_filtered = 0
    nheteros = 0

    ## iterate over clusters
    while 1:
        try:
            first = duo.next()
        except StopIteration:
            break
        itera = [first[0], first[1]]
        fname = itera[0].split(";")[0]

        ## local containers and counters for this locus
        locus += 1          ## recording n loci
        sloc = []           ## list for sequence data
        nloc = []           ## list of names used for gbs filters

        ## grab seqs until end of cluster
        while itera[0] != "//\n":
            ## append sequence * number of dereps
            nreps = int(itera[0].split(";")[-2].split("=")[1])
            for _ in xrange(nreps):
                sloc.append(tuple(itera[1].strip()))
                nloc.append(itera[0])
            ## move on to the next sequence
            itera = duo.next()

        ## now that all seqs in this loc are read in, check that none
        ## overlap the leftjust overhang if datatype is gbs
        if data.paramsdict["datatype"] in ['gbs', 'merged']:
            ## TODO: test these new changes to the gbs filter
            ## edge filters
            leftjust = rightjust = None
            rights = []

            ## get leftjust and rights
            for i, j in zip(nloc, sloc):
                leftjust, rights = gbs_edgefilter(i, j, leftjust, rights)

            if rights:
                ## record in name that there was a reverse hit
                fname = fname[:-2] + "c1"
                try:
                    rightjust = min([min(i) for i in rights])
                except ValueError:
                    ## no usable right edge; empty sloc so this locus
                    ## fails the depth filter below and is skipped
                    sloc = ""

            for seq in xrange(len(sloc)):
                sloc[seq] = sloc[seq][leftjust:]
                if rightjust:
                    sloc[seq] = sloc[seq][:rightjust + 1]

        ## apply depth filter
        if (len(sloc) >= data.paramsdict["mindepth_majrule"]) and \
           (len(sloc) <= data.paramsdict["max_stack_size"]):

            ## this loc passed the minsamp filter
            minsamp_filtered += 1

            ## get stacks of bases at each site
            arrayed = numpy.array(sloc)
            stacked = [Counter(seq) for seq in arrayed.T]

            ## apply functions to the list of sites in stacked: filter
            ## each site for paralogs and make consens base calls
            consens = [filter2(data, site) for site in stacked]

            ## filtered by locus for paralog
            if "@" not in consens:
                ## get hetero sites
                heteros = [i[0] for i in enumerate(consens)
                           if i[1] in list("RKSYWM")]

                ## filter for max number of hetero sites; max_Hs_consens
                ## is stored as an (R1, R2) tuple, so the first value is
                ## assumed to apply here
                exceedmaxploid = 0
                if len(heteros) <= data.paramsdict["max_Hs_consens"][0]:
                    ## filter for more than x alleles given ploidy; only
                    ## relevant if the locus is polymorphic at more than
                    ## one site
                    if len(heteros) > 1:
                        consens, exceedmaxploid = filter3(
                            data, consens, heteros, sloc)

                    ## if the locus passed paralog filtering
                    if not exceedmaxploid:
                        consens = "".join(consens).replace("-", "N")

                        ## if an edge is stripped then the same sites
                        ## must be removed from the site counter (stacked)
                        shortconl = consens.lstrip("N")
                        if len(shortconl) < len(consens):
                            stacked = stacked[-len(shortconl):]
                        ## strip the right edge from the already left-
                        ## stripped sequence
                        shortcon = shortconl.rstrip("N")
                        if len(shortcon) < len(shortconl):
                            stacked = stacked[:len(shortcon)]

                        ## removes low coverage sites next to poly
                        ## repeats that are likely sequencing errors;
                        ## also edits 'stacked'
                        shortcon, stacked = removerepeat_Ns(shortcon,
                                                            stacked)

                        ## only allow maxN internal "N"s in a locus;
                        ## max_Ns_consens is an (R1, R2) tuple, so the
                        ## first value is assumed to apply here
                        if shortcon.count("N") <= \
                                data.paramsdict["max_Ns_consens"][0]:
                            ## minimum length for clustering in vsearch
                            if len(shortcon) >= 32:
                                ## keep for counter
                                nheteros += len(heteros)

                                ## create dataobj w/ name fname to store
                                ## the consens seq and its depth data
                                dataobj = ObjDict()
                                dataobj.seq = shortcon
                                dataobj.Cs = [i["C"] for i in stacked]
                                dataobj.As = [i["A"] for i in stacked]
                                dataobj.Ts = [i["T"] for i in stacked]
                                dataobj.Gs = [i["G"] for i in stacked]
                                tag = "_".join(fname.split("_")[-2:])
                                datadict[tag] = dataobj
                        else:
                            pass  ## maxN filtered loc
                    else:
                        pass  ## ploid filtered loc
                else:
                    pass  ## maxH filtered loc
            else:
                pass  ## third base filtered loc
        else:
            pass  ## mindepth filtered loc
    infile.close()

    ## make output directory for consens files if it does not yet exist
    data.dirs.consens = os.path.join(data.dirs.clusts, "consens")
    if not os.path.exists(data.dirs.consens):
        os.mkdir(data.dirs.consens)

    ## get filename; naming follows the hint left in the original code
    consenshandle = os.path.join(data.dirs.consens,
                                 sample.name + "consens.gz")

    ## write to file
    with gzip.open(consenshandle, 'wb') as outfile:
        outfile.write("\n".join([">" + sample.name + "_" + obj + "\n" +
                                 datadict[obj].seq for obj in datadict]))

    ## count the number of polymorphic sites, subtracting the expected
    ## invariant restriction overhang sites from the total site count
    if 'ddrad' in data.paramsdict["datatype"]:
        if 'pair' in data.paramsdict["datatype"]:
            sub = 4 + len(data.paramsdict["restriction_overhang"][0])
        else:
            sub = len(data.paramsdict["restriction_overhang"][0])
    elif 'gbs' in data.paramsdict["datatype"]:
        if 'pair' in data.paramsdict["datatype"]:
            sub = 4 + len(data.paramsdict["restriction_overhang"][0]) * 2
        else:
            sub = len(data.paramsdict["restriction_overhang"][0])
    elif data.paramsdict["datatype"] == "merged":
        sub = len(data.paramsdict["restriction_overhang"][0]) * 2
    else:
        sub = len(data.paramsdict["restriction_overhang"][0])

    nsites = sum([len(datadict[i].seq) - sub for i in datadict])
    ldic = len(datadict)
    try:
        poly = nheteros / float(nsites)
    except ZeroDivisionError:
        poly = 0.

    ## dump of the quality score and depth info into a pickle
    #pickleout = gzip.open(handle.replace("clustS.gz", "bindata"), 'wb')
    #pickle.dump(datadict, pickleout)
    #pickleout.close()

    return [sample.name, locus, minsamp_filtered,
            ldic, nsites, nheteros, poly]
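## --------------------------------------------------------------------
## Minimal runnable sketch (illustrative only) of the per-site stacking
## step used in consensus() above: reads are laid out as rows of a numpy
## array, transposed so each column is one site, and a Counter per site
## yields the base counts that filter2() consumes. The toy reads below
## are invented for illustration.
## --------------------------------------------------------------------
def _demo_site_stacking():
    """ toy demonstration of per-site base counting """
    sloc = [tuple("ATGCAG"),
            tuple("ATGCAG"),
            tuple("ATGTAG")]
    arrayed = numpy.array(sloc)
    stacked = [Counter(site) for site in arrayed.T]
    ## the fourth site splits C/T across reads:
    ## stacked[3] == Counter({'C': 2, 'T': 1})
    return stacked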