def make_VDJtools_dir(
        indir,
        outdir=False,
        VDJout_dir=None,
        genes=False,
        emptycols=None,
        overwrite=False,
        filetype='fastq',
        ):
    """Convert every matching sequence file in ``indir`` to a VDJtools table.

    For each file in ``indir`` whose extension is in the set returned by
    ``reptools.select_filetypes(filetype)``, ``reptools.make_VDJtools`` is
    called to write ``<input root>.tab`` into the output directory.

    Parameters:
        indir: directory containing the input files.
        outdir: parent output directory; a falsy value means ``indir``.
        VDJout_dir: passed to ``reptools.build_path`` together with the
            default name ``'VDJtools'`` and ``outdir`` to resolve the
            directory the tables are written to.
        genes: gene mapping forwarded to ``reptools.make_VDJtools``; a falsy
            value selects ``{'V': 'V', 'J': 'J', 'C': 'C'}``.
        emptycols: forwarded to ``reptools.make_VDJtools``; ``None`` selects
            ``['D']``.  A fresh list is created on every call so the default
            can never be shared (or mutated) between calls.
        overwrite: if true, the output directory is removed (recursively)
            before being recreated.
        filetype: forwarded to ``reptools.select_filetypes`` and to
            ``reptools.make_VDJtools``.

    Returns:
        None.  If no matching file is found a message is printed and nothing
        is created.
    """
    if not genes:
        genes = {'V': 'V', 'J': 'J', 'C': 'C'}
    if emptycols is None:
        emptycols = ['D']
    if not outdir:
        outdir = indir

    VDJout_dir, _ = reptools.build_path(True, VDJout_dir, 'VDJtools', outdir)

    filetypes = reptools.select_filetypes(filetype)
    typefiles = [
        fn for fn in os.listdir(indir)
        if os.path.splitext(fn)[1] in filetypes
    ]
    if not typefiles:
        print('No files of specified type found.\n')
        return

    if overwrite:
        reptools.remove_dir(VDJout_dir, recursive=True)
    reptools.cautious_mkdir(VDJout_dir)

    for fn in typefiles:
        outfn = os.path.splitext(fn)[0] + '.tab'
        reptools.make_VDJtools(
            os.path.join(indir, fn),
            os.path.join(VDJout_dir, outfn),
            genes,
            emptycols,
            filetype,
        )
def denoise_dir(
        indir,
        outdir=False,
        weight_by_qual=True,
        threshold=10,
        indel_threshold=100,
        FASTQout_dir=None,
        subs=True,
        indels=True,
        deambig=True,
        filetype='fastq',
        overwrite=False):
    """Run ``reptools.denoise_file`` on every matching file in ``indir``.

    Files are selected by comparing their lower-cased extension with the set
    returned by ``reptools.select_filetypes(filetype)``.  Only FASTQ output
    is produced; FASTA output is currently disabled in this function.

    Parameters:
        indir: directory containing the input files.
        outdir: parent output directory; a falsy value means ``indir``.
        weight_by_qual, threshold, indel_threshold, subs, indels, deambig:
            forwarded unchanged to ``reptools.denoise_file``.
        FASTQout_dir: passed to ``reptools.build_path`` together with the
            default name ``'denoisedCDR3'`` and ``outdir`` to resolve the
            output directory.
        filetype: forwarded to ``reptools.select_filetypes``.
        overwrite: forwarded to ``reptools.cautious_mkdir`` and to
            ``reptools.denoise_file``.

    Returns:
        The resolved FASTQ output directory, or None (after printing a
        message) when no input file matches.
    """
    filetypes = reptools.select_filetypes(filetype)
    infiles = [
        fn for fn in os.listdir(indir)
        if os.path.splitext(fn.lower())[1] in filetypes
    ]
    if not infiles:
        print('No fastq files found.\n')
        return

    if not outdir:
        outdir = indir
    FASTQout_dir, _ = reptools.build_path(True, FASTQout_dir, 'denoisedCDR3', outdir)

    # Make the output directory, deleting pre-existing data if overwrite is
    # set.  Uses reptools.cautious_mkdir, the same entry point as
    # EEfilter_dir, rather than the former reptools.reptools.cautious_mkdir.
    if FASTQout_dir:
        reptools.cautious_mkdir(FASTQout_dir, overwrite=overwrite)

    for fn in infiles:
        FASTQout = reptools.make_unpaired_filepaths(
            FASTQout_dir, os.path.splitext(fn)[0]
        )
        reptools.denoise_file(
            os.path.join(indir, fn),
            weight_by_qual=weight_by_qual,
            threshold=threshold,
            indel_threshold=indel_threshold,
            FASTQout=True,
            FASTAout=False,
            FASTQout_fn=FASTQout,
            change_logs=False,
            subs=subs,
            indels=indels,
            deambig=deambig,
            overwrite=overwrite,
        )
    return FASTQout_dir
def EEfilter_dir(indir,
                 outdir=False,
                 FASTQout_dir=None,
                 FASTAout_dir=False,
                 maxee=1,
                 overwrite=False,
                 filetype='fastq'):
    """Run ``reptools.EEfilter_file`` on every matching file in ``indir``.

    Files are selected by comparing their lower-cased extension with the set
    returned by ``reptools.select_filetypes(filetype)``.

    Parameters:
        indir: directory containing the input files.
        outdir: parent output directory; a falsy value means ``indir``.
        FASTQout_dir: passed to ``reptools.build_path`` together with the
            default name ``'EEfilteredCDR3'`` and ``outdir`` to resolve the
            FASTQ output directory.
        FASTAout_dir: optional directory for FASTA output.  The value given
            by the caller is now honoured; previously it was unconditionally
            reset to False, so the argument had no effect.  The default
            (False) behaves exactly as before.
        maxee: forwarded to ``reptools.EEfilter_file``.
        overwrite: forwarded to ``reptools.cautious_mkdir``.
        filetype: forwarded to ``reptools.select_filetypes``.

    Returns:
        The resolved FASTQ output directory, or None (after printing a
        message) when no input file matches.

    Raises:
        ValueError: if neither output directory is set.
    """
    if not outdir:
        outdir = indir
    FASTQout_dir, _ = reptools.build_path(True, FASTQout_dir, 'EEfilteredCDR3', outdir)

    if not FASTQout_dir and not FASTAout_dir:
        raise ValueError(
            'Please supply one or both of FASTQout_dir and FASTAout_dir to EEfilter_dir()'
        )

    # Output directories are created before the inputs are listed, as in the
    # original implementation.
    for pth in (FASTQout_dir, FASTAout_dir):
        if pth:
            reptools.cautious_mkdir(pth, overwrite=overwrite)

    filetypes = reptools.select_filetypes(filetype)
    infiles = [
        fn for fn in os.listdir(indir)
        if os.path.splitext(fn.lower())[1] in filetypes
    ]
    if not infiles:
        print('No fastq files found.\n')
        return

    for fn in infiles:
        root = os.path.splitext(fn)[0]
        FASTQout = reptools.make_unpaired_filepaths(FASTQout_dir, root)
        FASTAout = reptools.make_unpaired_filepaths(FASTAout_dir, root, 'fas')
        reptools.EEfilter_file(
            os.path.join(indir, fn),
            FASTQout=FASTQout,
            FASTAout=FASTAout,
            maxee=maxee,
        )
    return FASTQout_dir
def counts_csv(csvfile, paireddirs=(), unpaireddirs=(), pairsuffixes=('_1', '_2'),
               filetype=None, overwrite=False, basename=False):
    """Write a CSV table of sequence counts per sample root and directory.

    The table has one row per sample root (sorted) and one column per
    directory, paired directories first.  A root missing from a directory is
    reported as ``0``.

    Parameters:
        csvfile: path of the CSV file to write.
        paireddirs: directories holding paired files.  The pair suffix is
            stripped from each file's root and the counts of all files that
            share a root are summed.
        unpaireddirs: directories holding unpaired files; each file's root is
            used as-is.
        pairsuffixes: suffixes marking the mates of a pair.  The suffix that
            actually ends the file root is stripped; if none matches, as many
            characters as the first suffix is long are stripped (the former
            behaviour).
        filetype: forwarded to ``reptools.select_filetypes`` to obtain the
            accepted file extensions.
        overwrite: allow an existing ``csvfile`` to be replaced.
        basename: if true, column titles are the final path component of
            each directory instead of the path as given.

    Raises:
        IOError: if ``csvfile`` exists and ``overwrite`` is false.
    """
    if not overwrite and os.path.exists(csvfile):
        raise IOError('Target file already exists. To overwrite, set overwrite=True.')

    filetypes = reptools.select_filetypes(filetype)

    # Accept any sequence type (list, tuple, ...) for either argument, and
    # allow the two to be of different types.
    paireddirs = list(paireddirs)
    unpaireddirs = list(unpaireddirs)
    all_dirs = paireddirs + unpaireddirs

    def matching_files(dirpath):
        # Full paths of the files in dirpath with an accepted extension.
        return [
            os.path.join(dirpath, fn) for fn in os.listdir(dirpath)
            if os.path.splitext(fn)[1] in filetypes
        ]

    def count_records(path):
        # Empty files are not handed to the counters.
        if os.path.getsize(path) == 0:
            return 0
        if os.path.splitext(path)[1] in ('.fastq', '.fq'):
            return reptools.fastqcounter(path)
        return reptools.fascounter(path)

    def strip_pair_suffix(stem):
        for suffix in pairsuffixes:
            if suffix and stem.endswith(suffix):
                return stem[:-len(suffix)]
        # No suffix matched: fall back to trimming the length of the first
        # suffix, guarding against a zero length (stem[:-0] would be empty).
        trim = len(pairsuffixes[0]) if pairsuffixes else 0
        return stem[:-trim] if trim else stem

    counts = {}
    for dirpath in paireddirs:
        dir_counts = counts[dirpath] = {}
        for path in matching_files(dirpath):
            stem = os.path.splitext(os.path.split(path)[1])[0]
            root = strip_pair_suffix(stem)
            # Accumulate per root within this directory.  The former check
            # looked the root up among the directory keys, so the running
            # total was reset to zero for every file.
            dir_counts[root] = dir_counts.get(root, 0) + count_records(path)

    for dirpath in unpaireddirs:
        dir_counts = counts[dirpath] = {}
        for path in matching_files(dirpath):
            root = os.path.splitext(os.path.split(path)[1])[0]
            dir_counts[root] = count_records(path)

    allroots = sorted({root for dir_counts in counts.values() for root in dir_counts})

    if basename:
        # normpath drops a trailing separator so 'a/b/' yields 'b', not ''.
        titles = [os.path.basename(os.path.normpath(path)) for path in all_dirs]
    else:
        titles = all_dirs

    # Text mode: writing str to a file opened with 'wb' fails on Python 3.
    with open(csvfile, 'w') as out_handle:
        out_handle.write('root,%s\n' % (','.join(titles)))
        for root in allroots:
            row = [str(counts[dirpath].get(root, 0)) for dirpath in all_dirs]
            out_handle.write('%s,%s\n' % (root, ','.join(row)))
def v081_CDR3slice_dir(
        indir,
        outdir,
        genedict,
        db_files,
        db_dir=False,
        hitsDir=False,
        store_search_out=False,
        mincols=(20, 15),
        id=(0.93, 0.93),
        strand='both',
        evalue=(0.001, 0.001),
        genes=('V', 'J'),
        locations=('VregionC104start', 'JregionF118start'),
        filetype='fastq',
        overwrite=False,
        usearchpath='usearch',
        stellarPath='stellar',
        swipePath='swipe',
        blastPath='blastn',
        makeblastdbPath='makeblastdb',
        alnout=False,  # for debugging use only
        algorithm='swipe',
        Vdb_length=30,
        threads=10,
        verbose=True,
        **kwargs):
    """Run ``reptools.test.v081_CDR3slice`` on every matching file in ``indir``.

    Modification relative to the plain directory slicer: the first database
    and the gene dictionary are replaced by temporary versions produced by
    ``reptools.make_shorter_db`` so that only the last ``Vdb_length`` bases
    of V are considered.  The temporary files are removed when the function
    finishes, including when slicing raises.

    Parameters:
        indir: directory containing the input files.
        outdir: directory receiving one output file per input file (same
            file name).
        genedict, db_files: gene dictionary and the two database files;
            ``db_files[0]`` is shortened, ``db_files[1]`` is used as given.
        db_dir: if set, prepended to every entry of ``db_files``.
        hitsDir: if set, directory searched for existing hit files whose
            extension matches ``algorithm`` and whose root ends with a gene
            name; exactly one file per gene is required.
        store_search_out: True, False, or a path.  True means
            ``<outdir>/search_reports``; any truthy value is created as a
            directory and forwarded.
        alnout: if set, directory receiving ``<input root>.aln`` files.
        algorithm: one of "swipe", "stellar", "blast", "local" or "ublast"
            (case-insensitive).
        filetype: forwarded to ``reptools.select_filetypes``.
        overwrite: if true, ``outdir`` (and ``alnout`` if set) are removed
            before being recreated.
        All remaining parameters and ``**kwargs`` are forwarded unchanged to
        ``reptools.test.v081_CDR3slice``.

    Returns:
        None.  If no input file matches, a message is printed, ``outdir`` is
        ensured to exist and nothing else happens.

    Raises:
        ValueError: for an unknown ``algorithm``, or when the number of hit
            files found in ``hitsDir`` differs from the number of genes.
    """
    import os
    import reptools
    import reptools.test

    filetypes = reptools.select_filetypes(filetype)
    typefiles = [
        fn for fn in os.listdir(indir)
        if os.path.splitext(fn)[1] in filetypes
    ]
    if not typefiles:
        print('No files of specified type found.\n')
        reptools.ensure_dir(outdir)
        return

    if overwrite:
        reptools.remove_dir(outdir)
        if alnout:
            reptools.remove_dir(alnout)
    reptools.cautious_mkdir(outdir)
    if alnout:
        reptools.cautious_mkdir(alnout)

    # store_search_out can be True, False, or a path to write search files to.
    if store_search_out is True:
        store_search_out = os.path.join(outdir, 'search_reports')
    if store_search_out:
        reptools.cautious_mkdir(store_search_out)

    if db_dir:
        db_files = [os.path.join(db_dir, fn) for fn in db_files]

    hit_extensions = {
        'swipe': '.tsv',
        'stellar': '.gff',
        'local': '.u14',
        'ublast': '.u14',
        'blast': '.b6',
    }
    hitExt = hit_extensions.get(algorithm.lower())
    if hitExt is None:
        raise ValueError(
            'Unknown algorithm: must be "swipe","stellar","blast","local" or "ublast". \n'
            'local uses usearch local.')

    # Make the modified V database and gene dictionary, restricted to the
    # last Vdb_length bases of V.
    (temp_Vdb, temp_genedict) = reptools.make_shorter_db(db_files[0], genedict, Vdb_length)
    try:
        db_files = [temp_Vdb, db_files[1]]  # new db_files list

        for fn in typefiles:
            if hitsDir:
                # Compare the file *extension* with hitExt; the former code
                # compared the root, so no hit file was ever selected.
                candidates = [
                    name for name in os.listdir(hitsDir)
                    if os.path.splitext(name)[1] == hitExt
                ]
                # One pass per gene over the fixed candidate list, instead
                # of extending the list that is being iterated.
                # NOTE(review): as before, candidates are not matched to the
                # current input file name; with several inputs sharing one
                # hitsDir the length check below will fail - confirm the
                # intended naming scheme.
                hitFiles = [
                    os.path.join(hitsDir, name)
                    for gene in genes
                    for name in candidates
                    if os.path.splitext(name)[0].endswith(gene)
                ]
                if len(hitFiles) != len(genes):
                    raise ValueError('Missing hits file for %s' % fn)
            else:
                hitFiles = False

            infile = os.path.join(indir, fn)
            outfile = os.path.join(outdir, fn)
            alnoutfile = False
            if alnout:
                alnoutfile = os.path.join(alnout, os.path.splitext(fn)[0] + '.aln')

            reptools.test.v081_CDR3slice(
                infile,
                outfile,
                db_files=db_files,
                genedict=temp_genedict,
                hitFiles=hitFiles,
                store_search_out=store_search_out,
                mincols=mincols,
                id=id,
                strand=strand,
                evalue=evalue,
                genes=genes,
                locations=locations,
                usearchpath=usearchpath,
                stellarPath=stellarPath,
                swipePath=swipePath,
                blastPath=blastPath,
                makeblastdbPath=makeblastdbPath,
                alnout=alnoutfile,
                algorithm=algorithm,
                threads=threads,
                verbose=verbose,
                **kwargs)
    finally:
        # Always clean up the temporary files, even if slicing failed.
        os.remove(temp_Vdb)
        os.remove(temp_genedict)