# Parameters for this binset step. Only their values are set here; the code
# that consumes them lies outside this excerpt.
covmode, cov = 1, 0.8
boots = min_genomes = 10
remove_singleton_gcs = True
module_cutoff = 0.75

# Scratch layout: <temp_folder>/binsets/<binset_name>/{gene_clusters,KEGGs}
temp_folder = pjoin(config_file['temp_folder'], "binsets", binset_name)
gc_folder, kegg_folder = (pjoin(temp_folder, sub)
                          for sub in ("gene_clusters", "KEGGs"))

freetxt_line("Creating temp folder: " + temp_folder, logfile)

# Note: kegg_folder is only named here, it is not created at this point.
os.makedirs(temp_folder, exist_ok=True)
os.makedirs(gc_folder, exist_ok=True)

title2log("copying bins to temp_folder", logfile)

# Every entry of <out_folder>/bins is treated as a per-bin directory holding
# <bin>.gff, <bin>.faa and <bin>.db; the three files are flattened into
# <temp_folder>/bins.
_bins_src = pjoin(out_folder, "bins")
_bins_dst = pjoin(temp_folder, "bins")
os.makedirs(_bins_dst, exist_ok=True)
for bin_ in tqdm(os.listdir(_bins_src)):
    for _ext in (".gff", ".faa", ".db"):
        shutil.copyfile(pjoin(_bins_src, bin_, bin_ + _ext),
                        pjoin(_bins_dst, bin_ + _ext))

title2log("copying big faa to temp_folder", logfile)
shutil.copyfile(pjoin(out_folder, binset_name + ".faa"),
                pjoin(temp_folder, "all_proteoms.faa"))

title2log("Running mmseqs easy-cluster for gene clusters", logfile)
Exemple #2
0
from Bio import SeqIO

# Positional command line:
#   <script> <binning_name> <config_file> <root_folder> <out_folder> <threads>
# Unpacking fails with ValueError when the argument count differs.
(script, binning_name, config_file,
 root_folder, out_folder, threads) = sys.argv

logfile = pjoin(out_folder, 'logs', "binning.log")
# config_file is rebound from the command-line value to whatever
# generate_config() returns for it.
config_file = generate_config(config_file)

# Record the conda environment and the resolved settings next to the logs.
call(f"conda env export > {out_folder}/logs/binning.yaml", shell=True)
with open(f"{out_folder}/logs/binning_settings.json", "w") as settings_out:
    json.dump(config_file, settings_out, indent=2, sort_keys=True)

title2log("copying assemblies to temp_folder", logfile)


def anvio_completeness(fna):
    os.makedirs("temp")
    shutil.copy(fna, "temp/genome.fna")
    call("""
    anvi-setup-scg-taxonomy -T {threads} 2> /dev/null
    anvi-script-reformat-fasta temp/genome.fna -o temp/contigs4anvio.fa -l 0 --simplify-names 2> /dev/null
    anvi-gen-contigs-database -f temp/contigs4anvio.fa -o temp/contigs.db -T {threads} 2> /dev/null
    anvi-run-hmms -c temp/contigs.db -T {threads} 2> /dev/null
    anvi-run-scg-taxonomy -c temp/contigs.db -T {threads} 2> /dev/null
    anvi-estimate-genome-completeness -c  temp/contigs.db -o temp/completeness.txt 2> /dev/null
    """.format(threads=24),
         shell=True)
    with open("temp/completeness.txt") as handle:
# Record the conda environment and the resolved settings next to the logs.
call(f"conda env export > {out_folder}/logs/library_rrna_spliting.yaml",
     shell=True)
with open(f"{out_folder}/logs/library_rrna_spliting_settings.json",
          "w") as settings_out:
    json.dump(config_file, settings_out, indent=2, sort_keys=True)

temp_folder = pjoin(config_file['temp_folder'], "library_processing", lib_name)
freetxt_line("Creating temp folder: " + temp_folder, logfile)

os.makedirs(temp_folder, exist_ok=True)

title2log(f"Ungzipping {lib_name}'s reads to temp_folder", logfile)

# One unpigz line per read file (forward, reverse, unpaired), all run in a
# single shell invocation; each writes its decompressed stream to
# <temp_folder>/<mate>.fastq.
_unzip_cmd = "\n" + "".join(
    f"unpigz -kc {out_folder}/{lib_name}_{mate}.fastq.gz > {temp_folder}/{mate}.fastq\n"
    for mate in ("fwd", "rev", "unp"))
call(_unzip_cmd, shell=True)

# Each configured reference becomes its own " --ref <path>" argument.
refs = "".join(
    " --ref " + ref_path
    for ref_path in config_file['libraries_config']['sortmerna_refs'])

title2log(f"Running sortmeRNA on {lib_name}", logfile)

call("""
Exemple #4
0
# Settings of the binset being built, looked up once.
_binset_cfg = config_file['binsets'][binset_name]

# Each configured binning contributes <root_folder>/binnings/<name>/bins.
binnings = [
    pjoin(root_folder, "binnings", binning, "bins")
    for binning in _binset_cfg['binnings']
]
binsets = _binset_cfg['binsets']
external_bins = _binset_cfg['external_bins']

# Staging folder for the collected bins and output folder for the cleaned ones;
# only the latter is created here.
tbinfoder = f"{temp_folder}/bins"
cbinfoder = f"{temp_folder}/clean_bins"
os.makedirs(cbinfoder, exist_ok=True)

stats = dict()
# Values bundled under the placeholder names used for string formatting.
formating_dat = dict(
    out_folder=out_folder,
    temp_folder=temp_folder,
    threads=threads,
    binset_name=binset_name,
    logfile=logfile,
)
if binnings != [] or external_bins != "":
    title2log("copying bins to temp_folder", logfile)
    os.makedirs(pjoin(temp_folder, "bins"), exist_ok=True)

    # Bins coming from the configured binnings: the content of each "bins"
    # folder is copied with a shell glob. The log label is the path component
    # just above "bins".
    for binni in binnings:
        binning_label = binni.split("/")[-2]
        title2log("copying bins from " + binning_label, logfile)
        call(f"cp {binni}/* {temp_folder}/bins/", shell=True)

    # Bins from the configured external folder, when one is set.
    if external_bins:
        title2log("copying bins from the external_bins folder " + external_bins,
                  logfile)
        for b in os.listdir(external_bins):
            shutil.copy(f"{external_bins}/{b}", f"{temp_folder}/bins/")

    title2log("Handle unbinned", logfile)

    def append2unkept(f):
        """Append file ``f`` from the staging folder to ``<binset>_unkept.fna``.

        Returns the exit status reported by ``call``.
        """
        return call(
            f"cat {tbinfoder}/{f} >> {cbinfoder}/{binset_name}_unkept.fna",
            shell=True)

    for f in os.listdir(tbinfoder):
        if f.endswith("-unbinned.fna"):
Exemple #5
0
# Settings of the mapping being run, looked up once.
_mapping_cfg = config_file['mappings'][mapping_name]
alternate_root = _mapping_cfg['alternate_root']
ani = _mapping_cfg['min_nucleotide_id']
min_len = _mapping_cfg['min_len']

threads = int(threads)
if is_rna:
    mrna_flag = "_mrna"
else:
    mrna_flag = ""

# Without an explicit alternate root, the binset's own name is used as the
# stem of the reference FASTA.
alternate_root = alternate_root or binset

temp_folder = pjoin(config_file['temp_folder'], "mappings", mapping_name)
freetxt_line("Creating temp folder: " + temp_folder, logfile)

os.makedirs(temp_folder, exist_ok=True)

title2log("copying binset to temp_folder", logfile)

_reference_fna = pjoin(root_folder, "binsets", binset, alternate_root + ".fna")
shutil.copy(_reference_fna, pjoin(temp_folder, "binset.fna"))

# Fixed parameters passed to mmseqs easy-cluster below.
seqid, cov, covmode = 0.95, 0.9, 2
if precluster:
    # NOTE(review): the "#" before "> {logfile}" makes the shell treat the
    # redirection as a comment, so mmseqs output is not written to the logfile.
    _cluster_cmd = (
        f"mmseqs easy-cluster --min-seq-id {seqid} --cov-mode {covmode} -c {cov} "
        f"--threads {threads} {temp_folder}/binset.fna {temp_folder}/binset "
        f"{temp_folder}/mmseqs_temp #> {logfile}"
    )
    call(_cluster_cmd, shell=True)

title2log("indexing binset to temp_folder", logfile)
if method == "bwa-mem2":
    call(
        "bwa-mem2 index {temp}/binset.fna >> {out_folder}/logs/mapping.log  2>&1"
    else :
        return (inbin, len(lens))

# Settings of the binning being run, looked up once.
_binning_cfg = config_file['binnings'][binning_name]
min_bin_size = _binning_cfg['min_bin_size']
method = _binning_cfg['binner']

# A truthy 'cuda' entry is normalised to True; a missing or falsy one is kept
# as found (None when the key is absent).
_raw_cuda = _binning_cfg['other_parameters'].get('cuda')
cuda = True if _raw_cuda else _raw_cuda

temp_folder = pjoin(config_file['temp_folder'], "binnings", binning_name)
freetxt_line("Creating temp folder: " + temp_folder, logfile)

os.makedirs(temp_folder, exist_ok=True)

title2log("copying assemblies to temp_folder", logfile)

# All configured assemblies are appended, in order, to a single
# <temp_folder>/assembly.fna.
assemblies = [
    pjoin(root_folder, "assemblies", assembly_name, "assembly.fna")
    for assembly_name in _binning_cfg['assemblies']
]
for ass in assemblies:
    call(f"cat {ass} >> {temp_folder}/assembly.fna", shell=True)

title2log("indexing assembly to temp_folder", logfile)

# Index output (stdout and stderr) is appended to the binning log.
call(
    f"bwa-mem2 index {temp_folder}/assembly.fna >> {out_folder}/logs/binning.log  2>&1",
    shell=True)

freetxt_line("Starting mappings", logfile)

for lib in config_file['binnings'][binning_name]['libraries']:
    title2log("copying {lib} to temp_folder".format(lib = lib), logfile)
    call("""
    unpigz -kc {root_folder}/libraries/{lname}/{lname}_fwd.fastq.gz >> {temp}/fwd.fastq 2>> {out_folder}/logs/binning.log
Exemple #7
0
# config_file is rebound to whatever generate_config() returns for it.
config_file = generate_config(config_file)

# Record the conda environment and the resolved settings next to the logs.
call(f"conda env export > {out_folder}/logs/assembly.yaml", shell=True)
with open(f"{out_folder}/logs/assembly_settings.json", "w") as settings_out:
    json.dump(config_file, settings_out, indent=2, sort_keys=True)

temp_folder = pjoin(config_file['temp_folder'], "assemblies", ass_name)
freetxt_line("Creating temp folder: " + temp_folder, logfile)

os.makedirs(temp_folder, exist_ok=True)

title2log("cat-ing libs to temp_folder", logfile)

# Shell template run once per library: the gzipped forward, reverse and
# unpaired reads are appended to the matching file in the temp folder.
_CAT_LIBS_CMD = """
    cat {root_folder}/libraries/{lname}/{lname}_fwd.fastq.gz >> {temp}/fwd.fastq.gz
    cat {root_folder}/libraries/{lname}/{lname}_rev.fastq.gz >> {temp}/rev.fastq.gz
    cat {root_folder}/libraries/{lname}/{lname}_unp.fastq.gz >> {temp}/unp.fastq.gz
    """

for lib in config_file['assemblies'][ass_name]['libraries']:
    call(_CAT_LIBS_CMD.format(root_folder=root_folder,
                              lname=lib,
                              temp=temp_folder),
         shell=True)

if config_file['assemblies'][ass_name]['preprocess'] == 'none':
    pass
elif config_file['assemblies'][ass_name]['preprocess'] == 'bbnorm':
    title2log("Running bbnorm diginorm".format(ass_name=ass_name), logfile)

    call("""
# Positional command line:
#   <script> <binset_name> <config_file> <root_folder> <out_folder> <threads>
script, binset_name, config_file, root_folder, out_folder, threads = sys.argv

logfile = pjoin(out_folder, 'logs', "gtdbtk.log")
# config_file is rebound to whatever generate_config() returns for it.
config_file = generate_config(config_file)

# Record the conda environment and the resolved settings next to the logs.
call("conda env export > {out_folder}/logs/gtdbtk.yaml".format(out_folder=out_folder), shell=True)
with open("{out_folder}/logs/gtdbtk_settings.json".format(out_folder=out_folder), "w") as handle:
    json.dump(config_file, handle, indent=2, sort_keys=True)


temp_folder = pjoin(config_file['temp_folder'], "gtdbtk", binset_name)
freetxt_line("Creating temp folder: " + temp_folder, logfile)
os.makedirs(temp_folder, exist_ok=True)

title2log("copying bins to temp_folder", logfile)

# Every entry of <out_folder>/bins is treated as a per-bin directory holding
# <bin>.fna; the FASTA files are flattened into <temp_folder>/bins.
os.makedirs(pjoin(temp_folder, "bins"), exist_ok=True)
for bin_ in tqdm(os.listdir(pjoin(out_folder, "bins"))):
    shutil.copyfile(pjoin(out_folder, "bins", bin_, bin_ + ".fna"), pjoin(temp_folder, "bins", bin_ + ".fna"))

call(f"gtdbtk classify_wf --out_dir {temp_folder}/gtdbtk --genome_dir {temp_folder}/bins/ -x fna --cpus {threads} --scratch_dir {temp_folder} --pplacer_cpus {threads} >> {logfile}", shell=True)

# GTDB-Tk writes one summary table per domain and is expected to omit the
# table of a domain no genome was assigned to, so only existing tables are
# loaded. "ar53" is the archaeal table name used by newer GTDB-Tk releases;
# "ar122" is the older one.
classif = {}
for domain in ("ar122", "ar53", "bac120"):
    summary = f"{temp_folder}/gtdbtk/gtdbtk.{domain}.summary.tsv"
    if os.path.exists(summary):
        classif.update(csv2dict(summary, sep="\t"))
    else:
        freetxt_line("No GTDB-Tk summary at: " + summary, logfile)

binset_stats = csv2dict(pjoin(root_folder, "binsets", binset_name, binset_name + "_basics.csv"))

# Keep, per genome, the classification, the translation table and a compact
# "field=value" string built from the note/method/warnings columns that are
# not "N/A" (spaces replaced by underscores).
cleanz = {}
for genome, row in classif.items():
    cleanz[genome] = {
        'gtdbtk_classif': row['classification'],
        'gtdbtk_notes': ";".join(
            field + "=" + row[field].replace(" ", "_")
            for field in ('note', 'classification_method', 'warnings')
            if row[field] != "N/A"),
        'translation_table': row["translation_table"],
    }

# Merge the GTDB-Tk fields into the existing per-bin statistics; a genome
# missing from binset_stats raises KeyError, as before.
for k, v in cleanz.items():
    binset_stats[k].update(v)
Exemple #9
0
# Record the conda environment and the resolved settings next to the logs.
call("conda env export > {out_folder}/logs/library_processing.yaml".format(
    out_folder=out_folder),
     shell=True)
with open(
        "{out_folder}/logs/library_processing_settings.json".format(
            out_folder=out_folder), "w") as handle:
    json.dump(config_file, handle, indent=2, sort_keys=True)

rna = config_file['libraries'][lib_name]["rna"]
# "sortmerna_refs" is a ";"-separated string here; each entry becomes its own
# "--ref <path>" argument.
refs = " ".join(
    "--ref " + f
    for f in config_file['libraries'][lib_name]["sortmerna_refs"].split(";"))

title2log("Starting processing library {}".format(lib_name), logfile)

temp_folder = pjoin(config_file['temp_folder'], "library_processing", lib_name)
freetxt_line("Creating temp folder: " + temp_folder, logfile)

os.makedirs(temp_folder, exist_ok=True)
# exist_ok=True, like every other folder creation in this pipeline, so that
# re-running the step over an existing output folder does not raise
# FileExistsError.
os.makedirs(pjoin(out_folder, "logs/fastp_logs/"), exist_ok=True)

# fastp command templates, filled in with str.format for each read file
# (pair). Placeholders: temp, lib1/lib2 (paired) or lib (single), threads, log.
paired_fastp_line = "fastp -h /dev/null -j {temp}/{lib1}.json  --in1 {temp}/{lib1} --in2 {temp}/{lib2} --out1 {temp}/{lib1}_clean.fastq --out2 {temp}/{lib2}_clean.fastq --unpaired1 {temp}/{lib1}_unp.fastq --unpaired2 {temp}/{lib2}_unp.fastq  -w {threads}  >> {log} 2>&1"
single_fastp_line = "fastp -h /dev/null -j {temp}/{lib}.json  --in1 {temp}/{lib}  --out1 {temp}/{lib}_clean.fastq  -w {threads}  >> {log} 2>&1"
# Per-run QC records, kept separately for paired and unpaired inputs.
qc_log = {'paired': dict(), 'unpaired': dict()}

for fwd, rev in zip(config_file['libraries'][lib_name]["fwd"],
                    config_file['libraries'][lib_name]["rev"]):
    title2log(
        "QCing paired reads_library {} and {}, ".format(