def snp(jobs, cpu_job, outdir, reference, tracker):
    """Drive the CFSAN SNP pipeline: stage reads, call SNPs, build the tree.

    Each stage is guarded by the tracker so a restarted run skips
    already-completed steps.
    """
    # Stage 1: hard-link each trimmed read pair into a per-sample subfolder,
    # the directory layout the CFSAN SNP pipeline expects.
    if not tracker.check_status('initalize'):
        trimmed_dir = os.path.join(outdir, 'trimmed')
        checkexists(os.path.join(outdir, 'snp_reads'))
        pairs = list(getfiles(trimmed_dir))[0]
        for pair in pairs:
            sample_id = os.path.basename(pair[0]).split('_')[0]
            sample_dir = os.path.join(outdir, 'snp_reads', sample_id)
            checkexists(sample_dir)
            for read in (pair[0], pair[1]):
                os.link(read, os.path.join(sample_dir, os.path.basename(read)))
        tracker.update_status_done('initalize')

    # Stage 2: run the CFSAN SNP caller and surface its distance matrix.
    if not tracker.check_status('cfsan'):
        cfsan_snp(outdir, reference, jobs * cpu_job)
        matrix_src = os.path.join(outdir, 'cfsan', 'snp_distance_matrix.tsv')
        matrix_dst = os.path.join(outdir, 'snp_distance_matrix.tsv')
        shutil.copyfile(matrix_src, matrix_dst)
        tracker.update_status_done('cfsan')

    # Stage 3: build the phylogeny and copy the tree up to the output root.
    if not tracker.check_status('snp_tree'):
        tree_name = build_tree(outdir)
        shutil.copyfile(os.path.join(outdir, 'snp_tree', tree_name),
                        os.path.join(outdir, 'snp_tree.tree'))
        tracker.update_status_done('snp_tree')
def assemble_reads(jobs, cpu_job, outdir):
    """Assemble trimmed read pairs with Shovill (via Docker), in parallel.

    Parameters
    ----------
    jobs : int
        Number of assemblies to run concurrently.
    cpu_job : int
        CPUs allotted to each Shovill job.
    outdir : str
        Pipeline output directory; reads come from <outdir>/trimmed and
        assemblies are written to <outdir>/assemblies.
    """
    logfile = os.path.join(outdir, 'assembly.log')
    input_path = os.path.join(outdir, 'trimmed')

    # Split currently-free RAM (GB) evenly across the parallel jobs.
    # NOTE(review): if free_ram < jobs this yields --ram 0, which Shovill
    # would likely reject — confirm callers size jobs appropriately.
    free_ram = int(psutil.virtual_memory()[1] / 1000000000)
    ram_job = int(free_ram / jobs)

    assemblies_path = os.path.join(outdir, "assemblies")
    checkexists(assemblies_path)

    # get trimmed read pairs
    fastqs = list(getfiles(input_path))[0]

    cmds = []
    read_path = ''
    for read_pair in fastqs:
        read1 = os.path.basename(read_pair[0])
        read2 = os.path.basename(read_pair[1])
        sid = os.path.basename(read_pair[0]).split('_')[0]
        cmds.append('bash -c \"shovill --R1 /data/{0} --R2 /data/{1} --outdir /output/{2} --cpus {3} --ram {4} && mv /output/{2}/contigs.fa /output/{2}/{2}.fa \"'.format(read1, read2, sid, cpu_job, ram_job))
        # All reads must live in one directory: it is bind-mounted once.
        if read_path == '':
            read_path = os.path.dirname(os.path.abspath(read_pair[1]))
        elif read_path != os.path.dirname(os.path.abspath(read_pair[1])):
            print("Reads cannot be in multiple locations. Exiting.")
            sys.exit()

    print("Begining assembly of reads:\n Number of Jobs: {0}\n CPUs/Job: {1}".format(jobs, cpu_job))

    # denote logs
    with open(logfile, 'a') as outlog:
        outlog.write('***********\n')
        outlog.write('Assembly\n')
        # BUGFIX: the pool was previously never closed/joined, leaking worker
        # processes; the context manager guarantees cleanup.
        with mp.Pool(processes=jobs) as pool:
            results = pool.starmap_async(
                cd.call,
                [['staphb/shovill:1.0.4', cmd, '/data',
                  {read_path: "/data",
                   os.path.join(outdir, 'assemblies'): "/output"}]
                 for cmd in cmds])
            stdouts = results.get()
        for stdout in stdouts:
            outlog.write('-----------\n')
            outlog.write(stdout)
        # denote end of logs
        outlog.write('***********\n')
    print("Finished Assembling Reads")
def build_tree(outdir, model='GTR+G'):
    """Build a phylogeny from the CFSAN SNP alignment using IQ-TREE.

    Parameters
    ----------
    outdir : str
        Pipeline output directory; reads <outdir>/cfsan/snpma.fasta and
        works inside <outdir>/snp_tree.
    model : str
        Substitution model passed to ``iqtree -m``.

    Returns
    -------
    str
        Filename of the resulting tree inside <outdir>/snp_tree.
    """
    logfile = os.path.join(outdir, 'tree.log')
    input_path = os.path.join(outdir, "cfsan")

    # start from a clean snp_tree directory
    shutil.rmtree(os.path.join(outdir, 'snp_tree'), ignore_errors=True)
    cg_path = os.path.join(outdir, "snp_tree")
    checkexists(cg_path)
    alignment = os.path.join(cg_path, 'snpma.fasta')
    shutil.copyfile(os.path.join(input_path, 'snpma.fasta'), alignment)

    # Count isolates (FASTA header lines) in the alignment.
    # BUGFIX: startswith() instead of line[0] avoids an IndexError on blank
    # lines; iterating the file lazily avoids readlines() materialization.
    with open(alignment, 'r') as infasta:
        isolate_counter = sum(1 for line in infasta if line.startswith('>'))

    # Ultrafast bootstrap (-bb) requires at least 4 taxa; otherwise run a
    # plain ML tree and pick up the corresponding output file.
    if isolate_counter > 3:
        command = "sh -c 'iqtree -s snpma.fasta -m {0} -bb 1000 '".format(model)
        tree_file = 'snpma.fasta.contree'
    else:
        command = "sh -c 'iqtree -s snpma.fasta -m {0} '".format(model)
        tree_file = 'snpma.fasta.treefile'

    print("Building the Phylogeny")
    # denote logs
    with open(logfile, 'a') as outlog:
        outlog.write('***********\n')
        outlog.write('Building Tree\n')
        stdout = cd.call('staphb/iqtree:1.6.7', command, '/data',
                         {cg_path: "/data"}, sig_default=False)
        outlog.write('-----------\n')
        outlog.write(stdout)
        # denote end of logs
        outlog.write('***********\n')
    print("Finished Bulding Phylogeny")
    return tree_file
def annotate_assemblies(jobs, cpu_job, outdir):
    """Annotate assemblies with Prokka (via Docker), in parallel.

    Parameters
    ----------
    jobs : int
        Number of annotation jobs to run concurrently.
    cpu_job : int
        CPUs allotted to each Prokka job.
    outdir : str
        Pipeline output directory; assemblies come from <outdir>/assemblies
        and annotations are written to <outdir>/annotated.
    """
    logfile = os.path.join(outdir, 'annotation.log')
    input_path = os.path.join(outdir, "assemblies")

    annotated_path = os.path.join(outdir, "annotated")
    checkexists(annotated_path)

    # get assemblies
    fastas = list(getfiles(input_path))[0]

    # build one prokka command per assembly
    cmds = []
    for path in fastas:
        assembly_file = os.path.basename(path)
        assembly_dir = os.path.basename(os.path.dirname(path))
        sid = os.path.basename(path).split('.')[0]
        cmds.append('prokka --compliant --outdir /output/{0} --prefix {1} --cpus {2} {3}/{4}'.format(sid, sid, cpu_job, assembly_dir, assembly_file))

    print("Begining annotation of assemblies:\n Number of Jobs: {0}\n CPUs/Job: {1}".format(jobs, cpu_job))

    # denote logs
    with open(logfile, 'a') as outlog:
        outlog.write('***********\n')
        outlog.write('Annotating\n')
        # BUGFIX: the pool was previously never closed/joined, leaking worker
        # processes; the context manager guarantees cleanup.
        with mp.Pool(processes=jobs) as pool:
            results = pool.starmap_async(
                cd.call,
                [['staphb/prokka:1.14.0', cmd, '/data',
                  {input_path: "/data",
                   os.path.join(outdir, 'annotated'): "/output"}]
                 for cmd in cmds])
            stdouts = results.get()
        for stdout in stdouts:
            outlog.write('-----------\n')
            outlog.write(stdout)
        # denote end of logs
        outlog.write('***********\n')

    # move each finished .gff up one directory level
    for root, dirs, files in os.walk(annotated_path):
        for file in files:
            if '.gff' in file:
                current_path = os.path.join(root, file)
                destination_path = os.path.join(os.path.dirname(root), file)
                os.rename(current_path, destination_path)
    print("Finished Annotating Assemblies")
def q_trim(reads, jobs, cpu, outdir, tracker):
    """Quality-trim read pairs with Trimmomatic (via Docker), in parallel.

    Parameters
    ----------
    reads : iterable of (str, str)
        Paths to paired read files; all pairs must share one directory.
    jobs : int
        Number of trimming jobs to run concurrently.
    cpu : int
        Threads allotted to each Trimmomatic job.
    outdir : str
        Pipeline output directory; trimmed reads go to <outdir>/trimmed.
    tracker : object
        Status tracker; marked 'trimmed' done on completion.
    """
    # trimming parameters: sliding window of 4 bases at Q30, min length 100
    minlength = 100
    windowsize = 4
    qscore = 30
    logfile = os.path.join(outdir, 'qtrim.log')

    cmds = []
    read_path = ''
    for read_pair in reads:
        main_cmd = 'java -jar /Trimmomatic-0.39/trimmomatic-0.39.jar PE -threads {0}'.format(
            cpu)
        sid = os.path.basename(read_pair[0]).split('_')[0]
        args = ' {read1} {read2} -baseout /output/{sid}.fastq.gz SLIDINGWINDOW:{windowsize}:{qscore} MINLEN:{minlength}'.format(
            minlength=minlength,
            windowsize=windowsize,
            qscore=qscore,
            read1=os.path.basename(read_pair[0]),
            read2=os.path.basename(read_pair[1]),
            sid=sid)
        cmds.append(main_cmd + args)
        # All reads must live in one directory: it is bind-mounted once.
        if read_path == '':
            read_path = os.path.dirname(os.path.abspath(read_pair[1]))
        elif read_path != os.path.dirname(os.path.abspath(read_pair[1])):
            print("Reads cannot be in multiple locations. Exiting.")
            sys.exit()

    checkexists(os.path.join(outdir, "trimmed"))

    print(
        "Begining quality trimming of reads:\n Number of Jobs: {0}\n CPUs/Job: {1}"
        .format(jobs, cpu))

    # denote logs
    with open(logfile, 'a') as outlog:
        outlog.write('***********\n')
        outlog.write('Trimmomatic\n')
        # BUGFIX: the pool was previously never closed/joined, leaking worker
        # processes; the context manager guarantees cleanup.
        with mp.Pool(processes=jobs) as pool:
            results = pool.starmap_async(cd.call, [[
                'staphb/trimmomatic:0.39', cmd, '/data', {
                    read_path: "/data",
                    os.path.join(outdir, 'trimmed'): "/output"
                }
            ] for cmd in cmds])
            stdouts = results.get()
        for stdout in stdouts:
            outlog.write('-----------\n')
            outlog.write(stdout)
        # denote end of logs
        outlog.write('***********\n')

    # remove the unpaired-read outputs Trimmomatic's -baseout produces
    for root, dirs, files in os.walk(os.path.join(outdir, 'trimmed')):
        for file in files:
            if "U.fastq.gz" in file:
                os.remove(os.path.join(root, file))

    print("Finished Quality Trimming Reads")
    # update status
    tracker.update_status_done('trimmed')