        out_folder=out_folder), "w") as handle:
    json.dump(config_file, handle, indent=2, sort_keys=True)

threads = int(threads)
seqid = 0.7
covmode = 1
cov = 0.8
boots = 10
min_genomes = 10
remove_singleton_gcs = True
module_cutoff = 0.75

temp_folder = pjoin(config_file['temp_folder'], "binsets", binset_name)
gc_folder = pjoin(temp_folder, "gene_clusters")
kegg_folder = pjoin(temp_folder, "KEGGs")

freetxt_line("Creating temp folder: " + temp_folder, logfile)
os.makedirs(temp_folder, exist_ok=True)
os.makedirs(gc_folder, exist_ok=True)

title2log("copying bins to temp_folder", logfile)
os.makedirs(pjoin(temp_folder, "bins"), exist_ok=True)
for bin_ in tqdm(os.listdir(pjoin(out_folder, "bins"))):
    shutil.copyfile(pjoin(out_folder, "bins", bin_, bin_ + ".gff"),
                    pjoin(temp_folder, "bins", bin_ + ".gff"))
    shutil.copyfile(pjoin(out_folder, "bins", bin_, bin_ + ".faa"),
                    pjoin(temp_folder, "bins", bin_ + ".faa"))
    shutil.copyfile(pjoin(out_folder, "bins", bin_, bin_ + ".db"),
                    pjoin(temp_folder, "bins", bin_ + ".db"))
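# Illustrative sketch, not from the source: the thresholds set above (seqid,
# covmode, cov) match mmseqs2's --min-seq-id / --cov-mode / -c options, so the
# gene-cluster step presumably concatenates the copied .faa files and clusters
# them along these lines; the file and prefix names below are assumptions.
call("cat {temp}/bins/*.faa > {temp}/all_proteoms.faa".format(temp=temp_folder), shell=True)
call(f"mmseqs easy-cluster --min-seq-id {seqid} --cov-mode {covmode} -c {cov} "
     f"--threads {threads} {temp_folder}/all_proteoms.faa {gc_folder}/gene_clusters "
     f"{temp_folder}/mmseqs_temp >> {logfile} 2>&1", shell=True)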
logfile = pjoin(out_folder, 'logs', "binset.log")

config_file = generate_config(config_file)
call("conda env export > {out_folder}/logs/binset.yaml".format(out_folder=out_folder), shell=True)
with open("{out_folder}/logs/binning_settings.json".format(out_folder=out_folder), "w") as handle:
    json.dump(config_file, handle, indent=2, sort_keys=True)

min_completeness = config_file['binsets'][binset_name]['min_completeness']
max_contamination = config_file['binsets'][binset_name]['max_contamination']
min_size = config_file['binsets'][binset_name]['min_size']
min_coding = config_file['binsets'][binset_name]['min_coding']
keep_fails = config_file['binsets'][binset_name]['keep_fails']

temp_folder = pjoin(config_file['temp_folder'], "binsets", binset_name)
freetxt_line("Creating temp folder: " + temp_folder, logfile)
os.makedirs(temp_folder, exist_ok=True)

binnings = [pjoin(root_folder, "binnings", binni, "bins")
            for binni in config_file['binsets'][binset_name]['binnings']]
binsets = config_file['binsets'][binset_name]['binsets']
external_bins = config_file['binsets'][binset_name]['external_bins']

tbinfoder = "{temp}/bins".format(temp=temp_folder)
cbinfoder = "{temp}/clean_bins".format(temp=temp_folder)
os.makedirs(cbinfoder, exist_ok=True)

stats = {}
formating_dat = {
    'out_folder': out_folder,
    'temp_folder': temp_folder,
    'threads': threads,
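    # Hypothetical sketch, not from the source: the thresholds loaded above
    # suggest a per-bin quality filter of roughly this shape, where `bin_stats`
    # is an assumed dict of per-bin completeness/contamination/size/coding
    # values, and `keep_fails` presumably decides whether failing bins are
    # still kept (e.g. moved aside) rather than dropped:
    #
    #   def passes_binset_filters(bin_stats):
    #       return (bin_stats['completeness'] >= min_completeness
    #               and bin_stats['contamination'] <= max_contamination
    #               and bin_stats['length'] >= min_size
    #               and bin_stats['coding_density'] >= min_coding)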
taxfield = config_file['mappings'][mapping_name]['taxfield']
precluster = config_file['mappings'][mapping_name]['precluster']
keep_mapped = config_file['mappings'][mapping_name]['keep_mapped']
alternate_root = config_file['mappings'][mapping_name]['alternate_root']
ani = config_file['mappings'][mapping_name]['min_nucleotide_id']
min_len = config_file['mappings'][mapping_name]['min_len']
threads = int(threads)
mrna_flag = "_mrna" if is_rna else ""

if not alternate_root:
    alternate_root = binset

temp_folder = pjoin(config_file['temp_folder'], "mappings", mapping_name)
freetxt_line("Creating temp folder: " + temp_folder, logfile)
os.makedirs(temp_folder, exist_ok=True)

title2log("copying binset to temp_folder", logfile)
shutil.copy(pjoin(root_folder, "binsets", binset, alternate_root + ".fna"),
            pjoin(temp_folder, "binset.fna"))

# clustering thresholds for the optional preclustering of the reference
seqid = 0.95
cov = 0.9
covmode = 2
if precluster:
    call(f"mmseqs easy-cluster --min-seq-id {seqid} --cov-mode {covmode} -c {cov} "
         f"--threads {threads} {temp_folder}/binset.fna {temp_folder}/binset "
         f"{temp_folder}/mmseqs_temp >> {logfile} 2>&1", shell=True)
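    # mmseqs easy-cluster writes {temp_folder}/binset_rep_seq.fasta (the
    # cluster representatives) and {temp_folder}/binset_cluster.tsv, among
    # others; a plausible next step (an assumption, not shown in the source)
    # is to swap the reference for the dereplicated representatives:
    #
    #   shutil.move(pjoin(temp_folder, "binset_rep_seq.fasta"),
    #               pjoin(temp_folder, "binset.fna"))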
script, lib_name, config_file, root_folder, out_folder, threads = sys.argv

logfile = pjoin(out_folder, 'logs', lib_name + ".log")

config_file = validate_description_json(config_file)
call("conda env export > {out_folder}/logs/library_rrna_spliting.yaml".format(out_folder=out_folder), shell=True)
with open("{out_folder}/logs/library_rrna_spliting_settings.json".format(out_folder=out_folder), "w") as handle:
    json.dump(config_file, handle, indent=2, sort_keys=True)

temp_folder = pjoin(config_file['temp_folder'], "library_processing", lib_name)
freetxt_line("Creating temp folder: " + temp_folder, logfile)
os.makedirs(temp_folder, exist_ok=True)

title2log("Ungzipping {lib_name}'s reads to temp_folder".format(lib_name=lib_name), logfile)
call("""
unpigz -kc {out_folder}/{lname}_fwd.fastq.gz > {temp}/fwd.fastq
unpigz -kc {out_folder}/{lname}_rev.fastq.gz > {temp}/rev.fastq
unpigz -kc {out_folder}/{lname}_unp.fastq.gz > {temp}/unp.fastq
""".format(out_folder=out_folder, lname=lib_name, temp=temp_folder), shell=True)

refs = "".join(
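# Hedged sketch, not from the source: once `refs` collects the --ref arguments,
# the actual rRNA/mRNA split would presumably be a SortMeRNA 4.x call roughly
# like the following; the output basenames are illustrative:
#
#   call(f"sortmerna {refs} --reads {temp_folder}/fwd.fastq --reads {temp_folder}/rev.fastq "
#        f"--paired_in --out2 --fastx --aligned {temp_folder}/rrna --other {temp_folder}/mrna "
#        f"--workdir {temp_folder}/sortmerna --threads {threads} >> {logfile} 2>&1", shell=True)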
           shell=True)
with open("{out_folder}/logs/library_processing_settings.json".format(out_folder=out_folder), "w") as handle:
    json.dump(config_file, handle, indent=2, sort_keys=True)

rna = config_file['libraries'][lib_name]["rna"]
refs = " ".join(["--ref " + f
                 for f in config_file['libraries'][lib_name]["sortmerna_refs"].split(";")])

title2log("Starting processing library {}".format(lib_name), logfile)

temp_folder = pjoin(config_file['temp_folder'], "library_processing", lib_name)
freetxt_line("Creating temp folder: " + temp_folder, logfile)
os.makedirs(temp_folder, exist_ok=True)
os.makedirs(pjoin(out_folder, "logs/fastp_logs/"), exist_ok=True)

paired_fastp_line = ("fastp -h /dev/null -j {temp}/{lib1}.json --in1 {temp}/{lib1} --in2 {temp}/{lib2} "
                     "--out1 {temp}/{lib1}_clean.fastq --out2 {temp}/{lib2}_clean.fastq "
                     "--unpaired1 {temp}/{lib1}_unp.fastq --unpaired2 {temp}/{lib2}_unp.fastq "
                     "-w {threads} >> {log} 2>&1")
single_fastp_line = ("fastp -h /dev/null -j {temp}/{lib}.json --in1 {temp}/{lib} "
                     "--out1 {temp}/{lib}_clean.fastq -w {threads} >> {log} 2>&1")

qc_log = {'paired': dict(), 'unpaired': dict()}
for fwd, rev in zip(config_file['libraries'][lib_name]["fwd"],
                    config_file['libraries'][lib_name]["rev"]):
    title2log("QCing paired read libraries {} and {}".format(
        os.path.basename(fwd), os.path.basename(rev)), logfile)
    freetxt_line("first copying reads to {temp_folder} ".format(
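    # For reference (illustrative, not from the source): a concrete fastp call
    # comes from filling the paired template above, e.g.
    #
    #   call(paired_fastp_line.format(temp=temp_folder,
    #                                 lib1=os.path.basename(fwd),
    #                                 lib2=os.path.basename(rev),
    #                                 threads=threads,
    #                                 log=pjoin(out_folder, "logs/fastp_logs",
    #                                           lib_name + ".log")),
    #        shell=True)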