# analysis parameters for the gene-clustering step
covmode = 1
cov = 0.8
boots = 10
min_genomes = 10
remove_singleton_gcs = True
module_cutoff = 0.75

temp_folder = pjoin(config_file['temp_folder'], "binsets", binset_name)
gc_folder = pjoin(temp_folder, "gene_clusters")
kegg_folder = pjoin(temp_folder, "KEGGs")

freetxt_line("Creating temp folder: " + temp_folder, logfile)
os.makedirs(temp_folder, exist_ok=True)
os.makedirs(gc_folder, exist_ok=True)

title2log("copying bins to temp_folder", logfile)
os.makedirs(pjoin(temp_folder, "bins"), exist_ok=True)
for bin_ in tqdm(os.listdir(pjoin(out_folder, "bins"))):
    shutil.copyfile(pjoin(out_folder, "bins", bin_, bin_ + ".gff"), pjoin(temp_folder, "bins", bin_ + ".gff"))
    shutil.copyfile(pjoin(out_folder, "bins", bin_, bin_ + ".faa"), pjoin(temp_folder, "bins", bin_ + ".faa"))
    shutil.copyfile(pjoin(out_folder, "bins", bin_, bin_ + ".db"), pjoin(temp_folder, "bins", bin_ + ".db"))

title2log("copying big faa to temp_folder", logfile)
shutil.copyfile(pjoin(out_folder, binset_name + ".faa"), pjoin(temp_folder, "all_proteoms.faa"))

title2log("Running mmseqs easy-cluster for gene clusters", logfile)
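# `pjoin`, `freetxt_line`, and `title2log` are assumed to come from the
# pipeline's shared utilities; a minimal sketch of what they plausibly look
# like (hypothetical implementations, shown for readability only):
from datetime import datetime
from os.path import join as pjoin

def freetxt_line(text, logfile):
    # append a timestamped free-text line to the log
    with open(logfile, "a") as handle:
        handle.write("{}\t{}\n".format(datetime.now().isoformat(), text))

def title2log(title, logfile):
    # log a step title as a visually separated banner
    freetxt_line("==== " + title + " ====", logfile)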
from Bio import SeqIO

# positional command-line arguments
script, binning_name, config_file, root_folder, out_folder, threads = sys.argv

logfile = pjoin(out_folder, 'logs', "binning.log")
config_file = generate_config(config_file)

# snapshot the conda environment and the settings used for this run
call("conda env export > {out_folder}/logs/binning.yaml".format(out_folder=out_folder), shell=True)
with open("{out_folder}/logs/binning_settings.json".format(out_folder=out_folder), "w") as handle:
    json.dump(config_file, handle, indent=2, sort_keys=True)

title2log("copying assemblies to temp_folder", logfile)

def anvio_completeness(fna):
    # run anvi'o's single-copy-gene completeness estimate on one genome;
    # note the thread count is hard-coded to 24 below rather than taken
    # from the script-level `threads` argument
    os.makedirs("temp", exist_ok=True)
    shutil.copy(fna, "temp/genome.fna")
    call("""
    anvi-setup-scg-taxonomy -T {threads} 2> /dev/null
    anvi-script-reformat-fasta temp/genome.fna -o temp/contigs4anvio.fa -l 0 --simplify-names 2> /dev/null
    anvi-gen-contigs-database -f temp/contigs4anvio.fa -o temp/contigs.db -T {threads} 2> /dev/null
    anvi-run-hmms -c temp/contigs.db -T {threads} 2> /dev/null
    anvi-run-scg-taxonomy -c temp/contigs.db -T {threads} 2> /dev/null
    anvi-estimate-genome-completeness -c temp/contigs.db -o temp/completeness.txt 2> /dev/null
    """.format(threads=24), shell=True)
    with open("temp/completeness.txt") as handle:
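# `generate_config` is assumed to parse the JSON config passed on the command
# line; a hypothetical sketch (the real pipeline may also merge in defaults
# or validate fields):
import json

def generate_config(path):
    with open(path) as handle:
        return json.load(handle)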
call("conda env export > {out_folder}/logs/library_rrna_spliting.yaml".format( out_folder=out_folder), shell=True) with open( "{out_folder}/logs/library_rrna_spliting_settings.json".format( out_folder=out_folder), "w") as handle: json.dump(config_file, handle, indent=2, sort_keys=True) temp_folder = pjoin(config_file['temp_folder'], "library_processing", lib_name) freetxt_line("Creating temp folder: " + temp_folder, logfile) os.makedirs(temp_folder, exist_ok=True) title2log( "Ungzipping {lib_name}'s reads to temp_folder".format(lib_name=lib_name), logfile) call(""" unpigz -kc {out_folder}/{lname}_fwd.fastq.gz > {temp}/fwd.fastq unpigz -kc {out_folder}/{lname}_rev.fastq.gz > {temp}/rev.fastq unpigz -kc {out_folder}/{lname}_unp.fastq.gz > {temp}/unp.fastq """.format(out_folder=out_folder, lname=lib_name, temp=temp_folder), shell=True) refs = "".join( [" --ref " + f for f in config_file['libraries_config']['sortmerna_refs']]) title2log("Running sortmeRNA on {lib_name}".format(lib_name=lib_name), logfile) call("""
binnings = [pjoin(root_folder, "binnings", binni, "bins")
            for binni in config_file['binsets'][binset_name]['binnings']]
binsets = config_file['binsets'][binset_name]['binsets']
external_bins = config_file['binsets'][binset_name]['external_bins']

tbinfoder = "{temp}/bins".format(temp=temp_folder)
cbinfoder = "{temp}/clean_bins".format(temp=temp_folder)
os.makedirs(cbinfoder, exist_ok=True)

stats = {}
formating_dat = {
    'out_folder': out_folder,
    'temp_folder': temp_folder,
    'threads': threads,
    'binset_name': binset_name,
    'logfile': logfile,
}

if binnings != [] or external_bins != "":
    title2log("copying bins to temp_folder", logfile)
    os.makedirs(pjoin(temp_folder, "bins"), exist_ok=True)
    for binni in binnings:
        title2log("copying bins from " + binni.split("/")[-2], logfile)
        call("cp {binni}/* {temp}/bins/".format(binni=binni, temp=temp_folder), shell=True)
    if external_bins:
        title2log("copying bins from the external_bins folder " + external_bins, logfile)
        for b in os.listdir(external_bins):
            shutil.copy(f"{external_bins}/{b}", f"{temp_folder}/bins/")

    title2log("Handle unbinned", logfile)
    # append a fasta file's contents to the binset's "unkept" pool
    append2unkept = lambda f: call("cat {folder}/{f} >> {ofolder}/{binset}_unkept.fna".format(
        f=f, folder=tbinfoder, ofolder=cbinfoder, binset=binset_name), shell=True)
    for f in os.listdir(tbinfoder):
        if f.endswith("-unbinned.fna"):
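# a pure-Python equivalent of `append2unkept`, avoiding the shell round-trip
# (a sketch for illustration; the pipeline itself shells out to `cat` above,
# and this relies on the surrounding script's shutil import and folder names):
def append_to_unkept(f):
    with open(pjoin(cbinfoder, binset_name + "_unkept.fna"), "ab") as out_handle, \
         open(pjoin(tbinfoder, f), "rb") as in_handle:
        shutil.copyfileobj(in_handle, out_handle)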
alternate_root = config_file['mappings'][mapping_name]['alternate_root']
ani = config_file['mappings'][mapping_name]['min_nucleotide_id']
min_len = config_file['mappings'][mapping_name]['min_len']
threads = int(threads)
mrna_flag = "_mrna" if is_rna else ""

if not alternate_root:
    alternate_root = binset

temp_folder = pjoin(config_file['temp_folder'], "mappings", mapping_name)
freetxt_line("Creating temp folder: " + temp_folder, logfile)
os.makedirs(temp_folder, exist_ok=True)

title2log("copying binset to temp_folder", logfile)
shutil.copy(pjoin(root_folder, "binsets", binset, alternate_root + ".fna"),
            pjoin(temp_folder, "binset.fna"))

# preclustering thresholds for mmseqs
seqid = 0.95
cov = 0.9
covmode = 2
if precluster:
    # cluster near-identical contigs to deduplicate the reference before indexing
    call(
        f"mmseqs easy-cluster --min-seq-id {seqid} --cov-mode {covmode} -c {cov} --threads {threads} {temp_folder}/binset.fna {temp_folder}/binset {temp_folder}/mmseqs_temp >> {logfile}",
        shell=True)

title2log("indexing binset to temp_folder", logfile)
if method == "bwa-mem2":
    call(
        "bwa-mem2 index {temp}/binset.fna >> {out_folder}/logs/mapping.log 2>&1"
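# mmseqs easy-cluster writes its results next to the given prefix:
#   {temp_folder}/binset_rep_seq.fasta   cluster representatives
#   {temp_folder}/binset_cluster.tsv     representative -> member pairs
#   {temp_folder}/binset_all_seqs.fasta  all sequences, grouped by cluster
# one plausible follow-up (a sketch, not necessarily the pipeline's own code)
# is to swap the representatives in as the mapping reference:
#   shutil.move(f"{temp_folder}/binset_rep_seq.fasta", f"{temp_folder}/binset.fna")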
    else:
        return (inbin, len(lens))

min_bin_size = config_file['binnings'][binning_name]['min_bin_size']
method = config_file['binnings'][binning_name]['binner']
cuda = config_file['binnings'][binning_name]['other_parameters'].get('cuda')
if cuda:
    cuda = bool(cuda)

temp_folder = pjoin(config_file['temp_folder'], "binnings", binning_name)
freetxt_line("Creating temp folder: " + temp_folder, logfile)
os.makedirs(temp_folder, exist_ok=True)

title2log("copying assemblies to temp_folder", logfile)
assemblies = [pjoin(root_folder, "assemblies", ass, "assembly.fna")
              for ass in config_file['binnings'][binning_name]['assemblies']]
for ass in assemblies:
    # concatenate all assemblies into a single reference for co-binning
    call("cat {ass} >> {temp}/assembly.fna".format(ass=ass, temp=temp_folder), shell=True)

title2log("indexing assembly to temp_folder", logfile)
call("bwa-mem2 index {temp}/assembly.fna >> {out_folder}/logs/binning.log 2>&1".format(
    temp=temp_folder, out_folder=out_folder), shell=True)

freetxt_line("Starting mappings", logfile)
for lib in config_file['binnings'][binning_name]['libraries']:
    title2log("copying {lib} to temp_folder".format(lib=lib), logfile)
    call("""
unpigz -kc {root_folder}/libraries/{lname}/{lname}_fwd.fastq.gz >> {temp}/fwd.fastq 2>> {out_folder}/logs/binning.log
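# the mapping command itself is truncated in this excerpt; a hypothetical
# sketch of a typical bwa-mem2 + samtools pipeline for this layout (the bam
# file name is a placeholder):
#   call("bwa-mem2 mem -t {threads} {temp}/assembly.fna {temp}/fwd.fastq {temp}/rev.fastq"
#        " | samtools sort -@ {threads} -o {temp}/{lib}.bam -".format(
#            threads=threads, temp=temp_folder, lib=lib), shell=True)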
config_file = generate_config(config_file)
call("conda env export > {out_folder}/logs/assembly.yaml".format(out_folder=out_folder), shell=True)
with open("{out_folder}/logs/assembly_settings.json".format(out_folder=out_folder), "w") as handle:
    json.dump(config_file, handle, indent=2, sort_keys=True)

temp_folder = pjoin(config_file['temp_folder'], "assemblies", ass_name)
freetxt_line("Creating temp folder: " + temp_folder, logfile)
os.makedirs(temp_folder, exist_ok=True)

title2log("cat-ing libs to temp_folder", logfile)
for lib in config_file['assemblies'][ass_name]['libraries']:
    # gzip files concatenate cleanly, so the pooled libraries stay valid .gz
    call("""
cat {root_folder}/libraries/{lname}/{lname}_fwd.fastq.gz >> {temp}/fwd.fastq.gz
cat {root_folder}/libraries/{lname}/{lname}_rev.fastq.gz >> {temp}/rev.fastq.gz
cat {root_folder}/libraries/{lname}/{lname}_unp.fastq.gz >> {temp}/unp.fastq.gz
""".format(root_folder=root_folder, lname=lib, temp=temp_folder), shell=True)

if config_file['assemblies'][ass_name]['preprocess'] == 'none':
    pass
elif config_file['assemblies'][ass_name]['preprocess'] == 'bbnorm':
    title2log("Running bbnorm diginorm", logfile)
    call("""
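# the bbnorm command is truncated in this excerpt; a hypothetical digital-
# normalization call with BBTools' bbnorm.sh (the target depth and output
# names are assumptions, not the pipeline's actual settings):
#   bbnorm.sh in={temp}/fwd.fastq.gz in2={temp}/rev.fastq.gz \
#       out={temp}/fwd_norm.fastq.gz out2={temp}/rev_norm.fastq.gz \
#       target=40 threads={threads}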
script, binset_name, config_file, root_folder, out_folder, threads = sys.argv

logfile = pjoin(out_folder, 'logs', "gtdbtk.log")
config_file = generate_config(config_file)

call("conda env export > {out_folder}/logs/gtdbtk.yaml".format(out_folder=out_folder), shell=True)
with open("{out_folder}/logs/gtdbtk_settings.json".format(out_folder=out_folder), "w") as handle:
    json.dump(config_file, handle, indent=2, sort_keys=True)

temp_folder = pjoin(config_file['temp_folder'], "gtdbtk", binset_name)
freetxt_line("Creating temp folder: " + temp_folder, logfile)
os.makedirs(temp_folder, exist_ok=True)

title2log("copying bins to temp_folder", logfile)
os.makedirs(pjoin(temp_folder, "bins"), exist_ok=True)
for bin_ in tqdm(os.listdir(pjoin(out_folder, "bins"))):
    shutil.copyfile(pjoin(out_folder, "bins", bin_, bin_ + ".fna"),
                    pjoin(temp_folder, "bins", bin_ + ".fna"))

call(f"gtdbtk classify_wf --out_dir {temp_folder}/gtdbtk --genome_dir {temp_folder}/bins/ -x fna --cpus {threads} --scratch_dir {temp_folder} --pplacer_cpus {threads} >> {logfile}", shell=True)

# merge the archaeal and bacterial classification summaries into one dict
classif = csv2dict(f"{temp_folder}/gtdbtk/gtdbtk.ar122.summary.tsv", sep="\t")
classif.update(csv2dict(f"{temp_folder}/gtdbtk/gtdbtk.bac120.summary.tsv", sep="\t"))

binset_stats = csv2dict(pjoin(root_folder, "binsets", binset_name, binset_name + "_basics.csv"))
cleanz = {k: {'gtdbtk_classif': v['classification'],
              'gtdbtk_notes': ";".join([field + "=" + v[field].replace(" ", "_")
                                        for field in ['note', 'classification_method', 'warnings']
                                        if v[field] != "N/A"]),
              'translation_table': v["translation_table"]}
          for k, v in classif.items()}
for k, v in cleanz.items():
    binset_stats[k].update(v)
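# `csv2dict` is assumed to load a delimited table into a dict of row-dicts
# keyed on the first column; a hypothetical sketch:
import csv

def csv2dict(path, sep=","):
    with open(path) as handle:
        rows = list(csv.DictReader(handle, delimiter=sep))
    key = list(rows[0])[0]  # first column name, e.g. the genome/bin id
    return {row[key]: {k: v for k, v in row.items() if k != key} for row in rows}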
call("conda env export > {out_folder}/logs/library_processing.yaml".format( out_folder=out_folder), shell=True) with open( "{out_folder}/logs/library_processing_settings.json".format( out_folder=out_folder), "w") as handle: json.dump(config_file, handle, indent=2, sort_keys=True) rna = config_file['libraries'][lib_name]["rna"] refs = " ".join([ "--ref " + f for f in config_file['libraries'][lib_name]["sortmerna_refs"].split(";") ]) title2log("Starting processing library {}".format(lib_name), logfile) temp_folder = pjoin(config_file['temp_folder'], "library_processing", lib_name) freetxt_line("Creating temp folder: " + temp_folder, logfile) os.makedirs(temp_folder, exist_ok=True) os.makedirs(pjoin(out_folder, "logs/fastp_logs/")) paired_fastp_line = "fastp -h /dev/null -j {temp}/{lib1}.json --in1 {temp}/{lib1} --in2 {temp}/{lib2} --out1 {temp}/{lib1}_clean.fastq --out2 {temp}/{lib2}_clean.fastq --unpaired1 {temp}/{lib1}_unp.fastq --unpaired2 {temp}/{lib2}_unp.fastq -w {threads} >> {log} 2>&1" single_fastp_line = "fastp -h /dev/null -j {temp}/{lib}.json --in1 {temp}/{lib} --out1 {temp}/{lib}_clean.fastq -w {threads} >> {log} 2>&1" qc_log = {'paired': dict(), 'unpaired': dict()} for fwd, rev in zip(config_file['libraries'][lib_name]["fwd"], config_file['libraries'][lib_name]["rev"]): title2log( "QCing paired reads_library {} and {}, ".format(