def run_cutadapt(input_list: list):
    """
    Trim Illumina adapters from a paired-end sample with `cutadapt`.

    :param input_list: [sample_name, sample_file_1, sample_file_2].
        NOTE: the order matters — it mirrors the column order of the
        upstream dataframe this list is taken from.
    :return: dict with the sample name and the two trimmed FASTQ paths.
    :raises ValueError: if output files cannot be removed due to permissions.
    """
    # Order is significant: must match the upstream dataframe columns
    sample_name, sample_file_1, sample_file_2 = input_list
    # Standard Illumina TruSeq adapter prefix
    _ADAPTER = "AGATCGGAAGAG"
    out_file_1, out_file_2, log_file = [
        os.path.join(cutadaptDir, "{}_cutadapt.{}".format(sample_name, i))
        for i in ("1.fq.gz", "2.fq.gz", "log")
    ]
    # `-m 50` drops reads shorter than 50 bp after trimming
    cmd = "cutadapt -a {ad} -A {ad} -m 50 -o {o1} -p {o2} {i1} {i2}".format(
        ad=_ADAPTER, i1=sample_file_1, i2=sample_file_2,
        o1=out_file_1, o2=out_file_2)
    try:
        # cutadapt refuses to overwrite, so clear any stale outputs first
        for _f in [out_file_1, out_file_2, log_file]:
            if os.path.exists(_f):
                os.remove(_f)
        log = subprocess.getoutput(cmd)
    except PermissionError as e:
        # Chain the original exception explicitly for a clearer traceback
        raise ValueError(
            "Permission denied, please run `sudo chmod -R 777 {}`".format(
                os.path.dirname(sample_file_1))) from e
    Utilities.dump_string(log, file=log_file)
    return {
        "sample_name": sample_name,
        "trimmed_file_1": out_file_1,
        "trimmed_file_2": out_file_2
    }
def run_spades(input_list: list):
    """
    Assemble a paired-end sample with SPAdes into a per-sample directory.

    :param input_list: [sample_name, sample_file_1, sample_file_2].
        NOTE: same ordering contract as `run_cutadapt` — mirrors the
        upstream dataframe columns.
    :return: dict with the sample name and the path to `contigs.fasta`.
    """
    # Order is significant: must match the upstream dataframe columns
    sample_name, sample_file_1, sample_file_2 = input_list
    out_dir = os.path.join(spadesDir, sample_name)
    # Use shutil.rmtree instead of shelling out `rm -rf`: no extra shell
    # process and no command injection through `sample_name`
    import shutil
    shutil.rmtree(out_dir, ignore_errors=True)
    os.makedirs(out_dir)
    cmd = "spades.py --careful -o {out} -1 {i1} -2 {i2}".format(
        out=out_dir, i1=sample_file_1, i2=sample_file_2)
    log = subprocess.getoutput(cmd)
    log_file = os.path.join(out_dir, "{}_spades.log".format(sample_name))
    Utilities.dump_string(log, file=log_file)
    return {
        "sample_name": sample_name,
        "assembly": os.path.join(out_dir, "contigs.fasta")
    }
def mp_get_and_blast_largest_contig(assembly_file: str):
    """
    BLAST a randomized slice of the largest contig of an assembly via NCBI.

    Parses `assembly_file` (FASTA), picks the longest contig, submits it to
    NCBI `blastn` against `nt`, filters HSPs by e-value and dumps the sorted
    hits next to the assembly as `<assembly>.BLAST.json`.

    :param assembly_file: path to a FASTA assembly (e.g. SPAdes `contigs.fasta`).
    :return: list of HSP dicts sorted by score (descending), or None for an
        empty / contig-less input file.
    """
    if os.path.getsize(assembly_file) == 0:
        print("Cannot process the empty file: '{}'".format(assembly_file))
        return
    with open(assembly_file) as f:
        # `with` closes the handle; `key=len` sorts contigs longest-first
        contig_records = sorted(SeqIO.parse(f, "fasta"), key=len, reverse=True)
    if not contig_records:
        # Non-empty file with no parseable FASTA records — mirror the
        # empty-file path instead of raising IndexError below
        print("No FASTA records in file: '{}'".format(assembly_file))
        return
    largest_contig = randomize_gene_slice(contig_records[0]).format("fasta")
    # The delay to avoid NCBI ban
    randomize_sleep()
    # NCBI query
    result_handle = attempt_func(NCBIWWW.qblast, ("blastn", "nt", largest_contig))
    blast_record = NCBIXML.read(result_handle)
    # Based on: https://biopython.org/DIST/docs/tutorial/Tutorial.html#htoc95
    _E_VALUE_THRESH = 0.04
    _QUERY_REPORT_SYMBOLS = 75
    high_scoring_pairs = []
    for alignment in blast_record.alignments:
        for hsp in alignment.hsps:
            if hsp.expect < _E_VALUE_THRESH:
                high_scoring_pairs.append(
                    dict(title=alignment.title, length=alignment.length,
                         expect=hsp.expect, score=hsp.score, bits=hsp.bits,
                         identities=hsp.identities, positives=hsp.positives,
                         assembly_file=assembly_file,
                         # Keep only a short alignment preview in the report
                         query="...\n".join([
                             hsp.query[:_QUERY_REPORT_SYMBOLS],
                             hsp.match[:_QUERY_REPORT_SYMBOLS],
                             hsp.sbjct[:_QUERY_REPORT_SYMBOLS], ""
                         ])))
    high_scoring_pairs = sorted(high_scoring_pairs,
                                key=lambda x: x.get("score"), reverse=True)
    # Export BLAST results
    Utilities.dump_string(
        json.dumps(high_scoring_pairs, sort_keys=True, indent=4),
        "{}.BLAST.json".format(os.path.splitext(assembly_file)[0]))
    return high_scoring_pairs
def dump_index_guide(input_nucleotide_fasta: str, output_dir: str):
    """
    Write an `index.sh` helper script that indexes a reference inside Docker.

    The script pulls the `ivasilyev/bwt_filtering_pipeline_worker` image and
    runs its `cook_the_reference.py` over the given FASTA; it is meant to be
    executed manually on the host (outside of any Docker container).

    :param input_nucleotide_fasta: path to the reference nucleotide FASTA.
    :param output_dir: directory to index into; `index.sh` is written here.
    :raises ValueError: if the input FASTA fails `Utilities.is_file_valid`.
    """
    if not Utilities.is_file_valid(input_nucleotide_fasta):
        raise ValueError(f"Invalid file: '{input_nucleotide_fasta}'")
    # Shell command template; NOTE(review): /data, /data1, /data2 mounts are
    # assumed to exist on the host — confirm for the target machine
    cmd_0 = f"""
    export IMG=ivasilyev/bwt_filtering_pipeline_worker:latest && \
    docker pull "$IMG" && \
    docker run --rm -v /data:/data -v /data1:/data1 -v /data2:/data2 -it "$IMG" \
    bash -c '
        cd "{output_dir}";
        python3 "$HOME/scripts/cook_the_reference.py" \
            --input "{input_nucleotide_fasta}" \
            --output "{output_dir}";
    '
    """
    # Normalize the multi-line template into the final script text
    cmd = Utilities.join_lines(cmd_0)
    out_file = os.path.join(output_dir, "index.sh")
    Utilities.dump_string(cmd, out_file)
    print(f"For indexing, run outside of Docker: 'bash \"{out_file}\"'")
"Tmt": "Trimethoprim", "Bla": "CBL", "Bla_ESBL": "ESBL", "Bla_broad": "BSBL", "Bla_broad_inhR": "BSBL-inhR" }, inplace=True) phenotype_df = pd.concat([initial_sample_data_df, antibiogram_df], axis=1, sort=False).sort_index() phenotype_df.index.names = [INDEX_COL_NAME] phenotype_df = process_header(phenotype_df).transpose().reset_index() # Utilities.dump_tsv(phenotype_df, os.path.join(article_dir, "phenotype.tsv")) Utilities.dump_string(phenotype_df.to_latex(index=False, header=True), os.path.join(article_dir, "phenotype.tex")) genotype_df = pd.concat( [ ncbi_accessions_df, combined_assembly_statistics_df, kleborate_results_df ], axis=1, sort=False).sort_index() # .sort_values(["Patient ID", "Sample Number"]) genotype_df.index.names = [INDEX_COL_NAME] # genotype_df.replace({"_": "\\_"}, regex=True) genotype_df = process_header(genotype_df, capitalize=False).transpose().reset_index() # Utilities.dump_tsv(genotype_df, os.path.join(article_dir, "genotype.tsv")) Utilities.dump_string(genotype_df.to_latex(index=False, header=True),
# Render a per-sample text report from a Jinja2 template, combining assembly
# statistics with the NCBI submission report.
templates_dir = os.path.join(ProjectDescriber.ROOT_DIR, "reports", "1")
template = jinja2.Template(
    Utilities.load_string(os.path.join(templates_dir, "template.txt")))
# The source tables and their merge are identical for every sample, so load
# and combine them once instead of once per loop iteration
combined_assembly_statistics_df = Utilities.load_tsv(
    os.path.join(".", ProjectDescriber.OWNER, ProjectDescriber.NAME, "data",
                 "tables", "combined_assembly_statistics.tsv"))
submission_report_df = Utilities.load_tsv(
    os.path.join(".", ProjectDescriber.OWNER, ProjectDescriber.NAME, "data",
                 "tables", "ncbi", "submission_report.tsv"))
submission_combined_df = pd.concat([
    i.set_index(INDEX_COL_NAME)
    for i in (combined_assembly_statistics_df, submission_report_df)
], axis=1, sort=False)
submission_combined_df.index.names = [INDEX_COL_NAME]
out_dir = os.path.join(templates_dir, "out")
os.makedirs(out_dir, exist_ok=True)
for sample_name in SAMPLE_NAMES:
    # Per-sample values feed the template; tool versions are appended so the
    # report states what software produced the numbers
    rendering_dict = submission_combined_df.loc[sample_name, :].to_dict()
    rendering_dict.update(TOOL_VERSIONS)
    Utilities.dump_string(template.render(rendering_dict),
                          os.path.join(out_dir, "{}.txt".format(sample_name)))