def renormalize(self):
    """
    renormalize the experiment selected in the main list and add the result as a new experiment

    Requires exactly one selected item in self.bMainList; otherwise prints
    "Need 1 item" and does nothing.
    The new experiment is produced by hs.normalizereads() and its studyname gets
    the suffix '_norm' before it is handed to self.addexp().
    """
    selected=self.bMainList.selectedItems()
    if len(selected)==1:
        # exactly one entry, so take it directly instead of looping
        entry=selected[0]
        expname=str(entry.text())
        normexp=hs.normalizereads(self.explist[expname])
        normexp.studyname=normexp.studyname+'_norm'
        self.addexp(normexp)
        return
    print("Need 1 item")
def filterandnormalize(expdat,seqs,exclude=False,subseq=False,numreads=10000):
    """
    filter sequences out of (or into) the experiment, then renormalize the data and
    recalculate origreads per sample

    input:
    expdat
        the experiment to filter
    seqs
        a list of sequences (ACGT) to act on
    exclude
        False (default) to remove seqs, True to only keep seqs
        (passed on to hs.filterseqs as exclude=not(exclude))
    subseq
        False to look for exact match only, True to look for subsequence match (slower)
    numreads
        number of reads passed to hs.normalizereads (default 10000)

    output:
    newexp
        the filtered experiment, renormalized with fixorig=True
    """
    # snapshot of the call arguments for the command log - keep this as the first statement
    params=locals()
    filtered=hs.filterseqs(expdat,seqs,subseq=subseq,exclude=not exclude)
    normalized=hs.normalizereads(filtered,numreads=numreads,fixorig=True)
    filtermsg="filter sequences and normalize to numreads %d" % numreads
    normalized.filters.append(filtermsg)
    hs.addcommand(normalized,"filterandnormalize",replaceparams={'expdat':expdat},params=params)
    return normalized
def cleantaxonomy(self):
    """
    open the CleanTaxonomyWindow dialog for the selected experiment and, if accepted,
    add a cleaned + renormalized copy of it

    Requires exactly one selected item in self.bMainList; otherwise prints
    "Need 1 item" and does nothing.
    Each checked box in the dialog removes (exclude=True) the matching taxonomy strings:
        cMitochondria - 'mitochondria'
        cChloroplast  - 'Streptophyta' and 'Chloroplast'
        cUnknown      - 'nknown'
        cBacteria     - 'Bacteria;' (exact=True)
    The result is passed through hs.normalizereads(), named <studyname>.ct and
    handed to self.addexp().
    """
    selected=self.bMainList.selectedItems()
    if len(selected)!=1:
        print("Need 1 item")
        return
    for entry in selected:
        srcexp=self.explist[str(entry.text())]
        dialog=CleanTaxonomyWindow(srcexp)
        accepted=dialog.exec_()==QtGui.QDialog.Accepted
        if not accepted:
            continue
        cleaned=hs.copyexp(srcexp)
        # (checkbox attribute name, ((taxonomy string, extra filtertaxonomy kwargs), ...))
        # checkboxes are looked up and queried one at a time, in this order
        steps=(
            ('cMitochondria',(('mitochondria',{}),)),
            ('cChloroplast',(('Streptophyta',{}),('Chloroplast',{}))),
            ('cUnknown',(('nknown',{}),)),
            ('cBacteria',(('Bacteria;',{'exact':True}),)),
        )
        for boxname,taxa in steps:
            if not getattr(dialog,boxname).checkState():
                continue
            for taxname,extra in taxa:
                cleaned=hs.filtertaxonomy(cleaned,taxname,exclude=True,**extra)
        cleaned=hs.normalizereads(cleaned)
        cleaned.studyname=srcexp.studyname+'.ct'
        self.addexp(cleaned)
def filtern(expdat):
    """
    delete sequences containing "N" (or "n") from the experiment and renormalize

    input:
    expdat : Experiment

    output:
    newexp : Experiment
        experiment without sequences containing "N"/"n", passed through hs.normalizereads()
    """
    # snapshot of the call arguments for the command log - keep this as the first statement
    params=locals()
    # positions of the sequences that contain neither an upper nor a lower case N
    keeplist=[pos for pos,seq in enumerate(expdat.seqs) if not ("N" in seq or "n" in seq)]
    newexp=hs.normalizereads(hs.reorderbacteria(expdat,keeplist))
    newexp.filters.append('Filter sequences containing N')
    hs.addcommand(newexp,"filtern",params=params,replaceparams={'expdat':expdat})
    numbefore=len(expdat.seqs)
    numafter=len(newexp.seqs)
    hs.Debug(6,'%d sequences before filtering, %d after' % (numbefore,numafter))
    return newexp
def cleantaxonomy(expdat,mitochondria=True,chloroplast=True,bacteria=True,unknown=True,exclude=True):
    """
    remove common non-16s sequences from the experiment and renormalize

    input:
    expdat : Experiment
    mitochondria : bool
        select mitochondrial sequences (taxonomy 'mitochondria')
    chloroplast : bool
        select chloroplast sequences (taxonomy 'Streptophyta' or 'Chloroplast')
    bacteria : bool
        select sequences only identified as "Bacteria" (no finer identification;
        taxonomy 'Bacteria;' with exact=True)
    unknown : bool
        select unknown sequences (taxonomy 'Unknown', or 'Unclassified;' with exact=True)
    exclude : bool
        True (default) to remove the selected sequences, False to keep them and throw
        away all others

    output:
    newexp : Experiment
        exclude=True : the experiment without the selected sequences, passed through
        hs.normalizereads()
        exclude=False : the experiment containing only the selected sequences
        (hs.normalizereads() is not called in this mode)
    """
    # snapshot of the call arguments for the command log - keep this as the first statement
    params=locals()
    newexp=hs.copyexp(expdat)

    # (taxonomy string, extra filtertaxonomy keyword arguments) for every enabled category
    taxfilters=[]
    if mitochondria:
        taxfilters.append(('mitochondria',{}))
    if chloroplast:
        taxfilters.append(('Streptophyta',{}))
        taxfilters.append(('Chloroplast',{}))
    if unknown:
        taxfilters.append(('Unknown',{}))
        taxfilters.append(('Unclassified;',{'exact':True}))
    if bacteria:
        taxfilters.append(('Bacteria;',{'exact':True}))

    if exclude:
        # remove the categories one after the other, each from the previous result
        for ctax,ckwargs in taxfilters:
            newexp=hs.filtertaxonomy(newexp,ctax,exclude=False if False else True,**ckwargs)
        newexp=hs.normalizereads(newexp)
    else:
        # collect the sequences of every enabled category (each taken from the unfiltered copy).
        # Only enabled categories are visited, so a disabled category no longer leaves an
        # unbound variable behind (previously an UnboundLocalError).
        # Duplicates are dropped keeping first-seen order so the list passed to filterseqs
        # does not depend on set iteration order.
        allseqs=[]
        seen=set()
        for ctax,ckwargs in taxfilters:
            cexp=hs.filtertaxonomy(newexp,ctax,exclude=False,**ckwargs)
            for cseq in cexp.seqs:
                if cseq not in seen:
                    seen.add(cseq)
                    allseqs.append(cseq)
        newexp=hs.filterseqs(newexp,allseqs)

    newexp.filters.append('Clean Taxonomy (remove mitochondria etc.)')
    hs.addcommand(newexp,"cleantaxonomy",params=params,replaceparams={'expdat':expdat})
    hs.Debug(6,'%d sequences before filtering, %d after' % (len(expdat.seqs),len(newexp.seqs)))
    return newexp
def filtersimilarsamples(expdat,field,method='mean'):
    """
    join similar samples into one sample (i.e. to remove samples of same individual)

    input:
    expdat : Experiment
        not modified (the joined sample metadata is built on a copy)
    field : string
        Name of the field containing the values (for which similar values will be joined)
    method : string
        What to do with samples with similar value. options:
        'mean' - replace with a sample containing the mean of the samples
        'median'- replace with a sample containing the median of the samples
        'random' - replace with a single random sample out of these samples
        'sum' - replace with sum of original reads in all samples (data converted with
                hs.toorigreads), renormalized afterwards with hs.normalizereads and
                origreads updated to the summed origreads
        'fracpres' - replace with fraction of samples where the bacteria is present

    output:
    newexp : Experiment
        like the input experiment but only one sample per unique value in field.
        For joined samples, every metadata field whose value is not identical in all
        joined samples is set to 'NA'.
        False is returned if method is not supported (this is only detected when a value
        with more than one sample is encountered).
    """
    # snapshot of the call arguments for the command log - keep this as the first statement
    params=locals()
    newexp=hs.copyexp(expdat)
    if method=='sum':
        newexp=hs.toorigreads(newexp)
    uvals=hs.getfieldvals(expdat,field,ounique=True)
    keep=[]
    for cval in uvals:
        cpos=hs.findsamples(expdat,field,cval)
        if len(cpos)==1:
            keep.append(cpos[0])
            continue
        if method=='random':
            keep.append(cpos[np.random.randint(len(cpos))])
            continue

        firstpos=cpos[0]
        firstsample=expdat.samples[firstpos]

        # set the mapping file values
        # work on a copy: writing 'NA' into expdat.smap[...] directly would alter the
        # input experiment's metadata
        cmap=expdat.smap[firstsample].copy()
        for ccpos in cpos[1:]:
            othermap=expdat.smap[expdat.samples[ccpos]]
            for cfield in cmap.keys():
                if cmap[cfield]!=othermap[cfield]:
                    cmap[cfield]='NA'

        # the joined sample is stored in the column of the first sample of the group
        if method=='mean':
            joined=np.mean(expdat.data[:,cpos],axis=1)
        elif method=='median':
            joined=np.median(expdat.data[:,cpos],axis=1)
        elif method=='sum':
            # newexp holds the original-reads data here (see hs.toorigreads call above)
            joined=np.sum(newexp.data[:,cpos],axis=1)
            newexp.origreads[firstpos]=np.sum(hs.reorder(expdat.origreads,cpos))
        elif method=='fracpres':
            # float() keeps this a true division even where "/" on integers truncates
            joined=np.sum(expdat.data[:,cpos]>0,axis=1)/float(len(cpos))
        else:
            hs.Debug(9,'method %s not supported' % method)
            return False
        newexp.data[:,firstpos]=joined
        keep.append(firstpos)
        newexp.smap[firstsample]=cmap

    newexp=hs.reordersamples(newexp,keep)
    if method=='sum':
        newexp=hs.normalizereads(newexp)
    newexp.filters.append('Filter similar samples field %s method %s' % (field,method))
    hs.addcommand(newexp,"filtersimilarsamples",params=params,replaceparams={'expdat':expdat})
    hs.Debug(6,'%d samples before filtering, %d after' % (len(expdat.samples),len(newexp.samples)))
    return newexp