def samtools_filter_fixmate_sort_single_job(sample_id, candidates, paths):
    """
    Submit one combined samtools filter/fixmate/sort job per candidate.

    For every candidate the alignment
    ``<paths['plasmid_output']>/<accession>.sam`` is passed to the
    ``samtools_filter_fixmate_sort.sh`` job script together with the flag
    value 1540, and the script is asked to write ``<accession>.bam`` next to
    the input. All job descriptions are handed to ``run_jobs`` in one call.

    Args:
        sample_id (str): Sample identifier; becomes part of each job name.
        candidates (list of dict): Each entry must have an 'accession' key.
        paths (dict): Must provide the 'plasmid_output' and 'logs' keys.
    Returns:
        (void)
    """
    # NOTE(review): `job_script_path` is not assigned inside this function;
    # it is assumed to be available at module level -- TODO confirm.
    script = os.path.join(job_script_path, 'samtools_filter_fixmate_sort.sh')

    def build_job(candidate):
        alignment = "/".join([
            paths['plasmid_output'],
            candidate['accession'] + ".sam",
        ])
        return {
            'job_name': "_".join([
                'samtools_filter_fixmate_sort',
                sample_id,
                candidate['accession'],
            ]),
            'output_path': paths['logs'],
            'error_path': paths['logs'],
            'native_specification': '-pe smp 4',
            'remote_command': script,
            'args': [
                "--input", alignment,
                "--flags", 1540,
                # Raw string: '\.' in a plain literal is an invalid escape.
                "--output", re.sub(r'\.sam$', '.bam', alignment),
            ],
        }

    run_jobs([build_job(candidate) for candidate in candidates])
def main(args, logger=None):
    """
    main entrypoint

    Reads a tab-separated input file (sample_id, reads1, reads2; lines
    starting with '#' are skipped), creates the 'logs' and 'snippy' output
    directories under ``args.outdir`` and then submits, one ``run_jobs`` call
    after another: snippy (one job per sample), snippy-core, snp-dists,
    iqtree, clonalframeml, maskrc-svg, snp-sites, iqtree again and finally
    two snp-dists jobs. The process then exits with status 0.

    Args:
        args(): parsed arguments; ``outdir``, ``reference``, ``input_file``
            and ``result_dir`` are read.
        logger: optional logger. When falsy, logging/structlog are configured
            here and a logger bound to this run's analysis id is created.
    Returns:
        (void) -- does not return; ends with ``sys.exit(0)``.
    """
    analysis_id = uuid.uuid4()
    output_dir = args.outdir
    # metadata_file = args.metadata_file
    reference = os.path.abspath(args.reference)
    # sensitivePath = str(options.sensitivePath).lstrip().rstrip()
    # sensitiveCols = str(options.sensitiveCols).lstrip().rstrip()
    # outputFile = str(options.outputFile).lstrip().rstrip()
    # bcidCol = str( str(options.bcidCol).lstrip().rstrip() )
    # naValue = str( str(options.naValue).lstrip().rstrip() )
    # metadata = result_parsers.parse_workflow_results(metadata_file)
    # distance = read(distancePath)
    # treeFile = "".join(read(treePath))

    if not logger:
        logging.basicConfig(
            format="%(message)s",
            stream=sys.stdout,
            level=logging.DEBUG,
        )
        structlog.configure_once(
            processors=[
                structlog.stdlib.add_log_level,
                structlog.processors.JSONRenderer()
            ],
            logger_factory=structlog.stdlib.LoggerFactory(),
            wrapper_class=structlog.stdlib.BoundLogger,
            context_class=structlog.threadlocal.wrap_dict(dict),
        )
        logger = structlog.get_logger(
            # Bind the id generated above instead of a second, unrelated UUID.
            analysis_id=str(analysis_id),
            pipeline_version=cpo_pipeline.__version__,
        )

    with open(args.input_file) as input_file:
        fieldnames = [
            'sample_id',
            'reads1',
            'reads2',
        ]
        reader = csv.DictReader(
            (row for row in input_file if not row.startswith('#')),
            delimiter='\t',
            fieldnames=fieldnames)
        inputs = list(reader)

    os.environ['QT_QPA_PLATFORM'] = 'offscreen'

    paths = {
        'logs': os.path.abspath(os.path.join(
            output_dir,
            'logs',
        )),
        'snippy_output': os.path.abspath(os.path.join(output_dir, "snippy")),
    }

    for output_subdir in paths.values():
        try:
            os.makedirs(output_subdir)
        except OSError as exc:
            # An already existing directory is fine; anything else is not.
            if exc.errno != errno.EEXIST:
                raise

    job_script_path = resource_filename('data', 'job_scripts')

    def snippy_file(filename):
        # Path of a file inside the shared snippy output directory.
        return os.path.join(paths["snippy_output"], filename)

    def make_job(job_name, script, job_args,
                 native_specification='-pe smp 8'):
        # Job description in the dict layout handed to run_jobs.
        return {
            'job_name': job_name,
            'output_path': paths['logs'],
            'error_path': paths['logs'],
            'native_specification': native_specification,
            'remote_command': os.path.join(job_script_path, script),
            'args': job_args,
        }

    contigs_paths = [
        os.path.abspath(
            os.path.join(args.result_dir, sample["sample_id"], "assembly",
                         "contigs.fa"))
        for sample in inputs
    ]

    # The per-sample snippy directory is named after the directory two
    # levels above contigs.fa (<result_dir>/<sample_id>/assembly/contigs.fa).
    snippy_dirs = [
        os.path.join(
            paths['snippy_output'],
            os.path.basename(os.path.dirname(os.path.dirname(contigs))))
        for contigs in contigs_paths
    ]

    snippy_jobs = [
        make_job(
            'snippy', 'snippy.sh',
            [
                "--ref", reference,
                "--R1", sample['reads1'],
                "--R2", sample['reads2'],
                "--outdir", os.path.join(
                    paths['snippy_output'],
                    sample['sample_id'],
                ),
            ],
            native_specification='-pe smp 8 -shell y',
        )
        for sample in inputs
    ]
    run_jobs(snippy_jobs)

    snippy_core_jobs = [
        make_job(
            'snippy-core', 'snippy-core.sh',
            [
                "--ref", reference,
                "--outdir", paths["snippy_output"],
            ] + snippy_dirs,
            native_specification='-pe smp 8 -shell y',
        )
    ]
    run_jobs(snippy_core_jobs)

    snp_dists_jobs = [
        make_job('snp-dists', 'snp-dists.sh', [
            "--alignment", snippy_file("core.aln"),
            "--output_file", snippy_file("core.aln.matrix.tsv"),
        ])
    ]
    run_jobs(snp_dists_jobs)

    iqtree_jobs = [
        make_job('iqtree', 'iqtree.sh', [
            "--alignment", snippy_file("core.full.aln"),
            "--model", "GTR+G4",
        ])
    ]
    run_jobs(iqtree_jobs)

    clonalframeml_jobs = [
        make_job('clonalframeml', 'clonalframeml.sh', [
            "--alignment", snippy_file("core.full.aln"),
            "--treefile", snippy_file("core.full.aln.treefile"),
            "--output_file", snippy_file("core.full.aln.clonalframeml"),
        ])
    ]
    run_jobs(clonalframeml_jobs)

    maskrc_svg_jobs = [
        make_job('maskrc-svg', 'maskrc-svg.sh', [
            "--alignment", snippy_file("core.full.aln"),
            "--svg", snippy_file("core.full.maskrc.svg"),
            "--clonalframeml", snippy_file("core.full.aln.clonalframeml"),
            "--output_file", snippy_file("core.full.maskrc.aln"),
        ])
    ]
    run_jobs(maskrc_svg_jobs)

    snp_sites_jobs = [
        make_job('snp-sites', 'snp-sites.sh', [
            "--alignment", snippy_file("core.full.maskrc.aln"),
            "--output_file", snippy_file("core.full.maskrc.snp.aln"),
        ])
    ]
    run_jobs(snp_sites_jobs)

    # NOTE(review): this run uses an ascertainment-bias model ("+ASC") on
    # core.full.maskrc.aln rather than on the SNP-only alignment produced by
    # the previous step -- confirm that this is the intended input.
    iqtree_jobs = [
        make_job('iqtree', 'iqtree.sh', [
            "--alignment", snippy_file("core.full.maskrc.aln"),
            "--model", "GTR+G+ASC",
        ])
    ]
    run_jobs(iqtree_jobs)

    # Both jobs run snp-dists.sh; they were previously labelled 'snp-sites'.
    snp_dists_jobs = [
        make_job('snp-dists', 'snp-dists.sh', [
            "--alignment", snippy_file("core.aln"),
            "--output_file", snippy_file("core.matrix.tab"),
        ]),
        make_job('snp-dists', 'snp-dists.sh', [
            "--alignment", snippy_file("core.full.maskrc.snp.aln"),
            "--output_file", snippy_file("core.full.maskrc.snp.matrix.tab"),
        ]),
    ]
    run_jobs(snp_dists_jobs)

    sys.exit(0)

    # ------------------------------------------------------------------
    # UNREACHABLE: nothing below runs because of the sys.exit(0) above.
    # The section also uses names that are never assigned in this function
    # (distance, metadata, treePath, sensitivePath, sensitive_meta_data,
    # outputFile); most of their assignments are commented out at the top.
    # It is kept as found so the tree rendering can be revived or removed
    # deliberately.
    # ------------------------------------------------------------------
    distanceDict = {}  #store the distance matrix as rowname:list<string>
    for i in range(len(distance)):
        temp = distance[i].split("\t")
        distanceDict[temp[0]] = temp[1:]

    #region create box tree
    #region step5: tree construction
    treeFile = "".join(read(treePath))
    t = e.Tree(treeFile)
    t.set_outgroup(t & "Reference")

    #set the tree style
    ts = e.TreeStyle()
    ts.show_leaf_name = True
    ts.show_branch_length = True
    ts.scale = 2000  #pixel per branch length unit
    ts.branch_vertical_margin = 15  #pixel between branches
    style2 = e.NodeStyle()
    style2["fgcolor"] = "#000000"
    style2["shape"] = "circle"
    style2["vt_line_color"] = "#0000aa"
    style2["hz_line_color"] = "#0000aa"
    style2["vt_line_width"] = 2
    style2["hz_line_width"] = 2
    style2["vt_line_type"] = 0  # 0 solid, 1 dashed, 2 dotted
    style2["hz_line_type"] = 0
    for n in t.traverse():
        n.set_style(style2)

    #find the plasmid origins
    plasmidIncs = {}
    for key in metadata:
        for plasmid in metadata[key]['plasmids']:
            for inc in plasmid['PlasmidRepType'].split(","):
                if (inc.lower().find("inc") > -1):
                    if not (inc in plasmidIncs):
                        plasmidIncs[inc] = [metadata[key]['ID']]
                    else:
                        if metadata[key]['ID'] not in plasmidIncs[inc]:
                            plasmidIncs[inc].append(metadata[key]['ID'])
    #plasmidIncs = sorted(plasmidIncs)

    for n in t.traverse():  #loop through the nodes of a tree
        if (n.is_leaf() and n.name == "Reference"):
            #if its the reference branch, populate the faces with column headers
            index = 0

            if len(sensitivePath) > 0:  #sensitive metadata @ chris
                for sensitive_data_column in sensitive_meta_data.get_columns():
                    (t & "Reference").add_face(addFace(sensitive_data_column), index, "aligned")
                    index = index + 1

            (t & "Reference").add_face(addFace("SampleID"), index, "aligned")
            index = index + 1
            (t & "Reference").add_face(addFace("New?"), index, "aligned")
            index = index + 1
            for i in range(
                    len(plasmidIncs)
            ):  #this loop adds the columns (aka the incs) to the reference node
                (t & "Reference").add_face(
                    addFace(list(plasmidIncs.keys())[i]), i + index, "aligned")
            index = index + len(plasmidIncs)
            (t & "Reference").add_face(addFace("MLSTScheme"), index, "aligned")
            index = index + 1
            (t & "Reference").add_face(addFace("Sequence Type"), index, "aligned")
            index = index + 1
            (t & "Reference").add_face(addFace("Carbapenamases"), index, "aligned")
            index = index + 1
            (t & "Reference").add_face(addFace("Plasmid Best Match"), index, "aligned")
            index = index + 1
            (t & "Reference").add_face(addFace("Best Match Identity"), index, "aligned")
            index = index + 1
            for i in range(len(
                    distanceDict[list(distanceDict.keys())
                                 [0]])):  #this loop adds the distance matrix
                (t & "Reference").add_face(
                    addFace(distanceDict[list(distanceDict.keys())[0]][i]),
                    index + i, "aligned")
            index = index + len(distanceDict[list(distanceDict.keys())[0]])
        elif (n.is_leaf() and not n.name == "Reference"):
            #not reference branches, populate with metadata
            index = 0

            if (n.name.replace(".fa", "") in metadata.keys()):
                mData = metadata[n.name.replace(".fa", "")]
            else:
                mData = metadata["na"]
            n.add_face(addFace(mData.ID), index, "aligned")
            index = index + 1
            if (mData['new']):  #new column
                face = e.RectFace(
                    30, 30, "green",
                    "green")  # TextFace("Y",fsize=10,tight_text=True)
                face.border.margin = 5
                face.margin_right = 5
                face.margin_left = 5
                face.vt_align = 1
                face.ht_align = 1
                n.add_face(face, index, "aligned")
            index = index + 1
            for incs in plasmidIncs:  #this loop adds presence/absence to the sample nodes
                if (n.name.replace(".fa", "") in plasmidIncs[incs]):
                    face = e.RectFace(
                        30, 30, "black",
                        "black")  # TextFace("Y",fsize=10,tight_text=True)
                    face.border.margin = 5
                    face.margin_right = 5
                    face.margin_left = 5
                    face.vt_align = 1
                    face.ht_align = 1
                    n.add_face(face,
                               list(plasmidIncs.keys()).index(incs) + index,
                               "aligned")
            index = index + len(plasmidIncs)
            n.add_face(addFace(mData['MLSTSpecies']), index, "aligned")
            index = index + 1
            n.add_face(addFace(mData['SequenceType']), index, "aligned")
            index = index + 1
            n.add_face(addFace(mData['CarbapenemResistanceGenes']), index, "aligned")
            index = index + 1
            n.add_face(addFace(mData['plasmidBestMatch']), index, "aligned")
            index = index + 1
            n.add_face(addFace(mData['plasmididentity']), index, "aligned")
            index = index + 1
            for i in range(len(
                    distanceDict[list(distanceDict.keys())
                                 [0]])):  #this loop adds distance matrix
                if (n.name in distanceDict
                    ):  #make sure the column is in the distance matrix
                    n.add_face(addFace(list(distanceDict[n.name])[i]),
                               index + i, "aligned")

    t.render(outputFile, w=5000, units="mm",
             tree_style=ts)  #save it as a png, pdf, svg or an phyloxml
def refseq_plasmids(sample_id, paths):
    """
    Screen the reads against the RefSeq plasmid mash database and download
    the FASTA sequence of every hit.

    Steps:
      1. Submit a ``mash_screen.sh`` job (minimum identity 0.975) and parse
         the resulting ``mash_screen.tsv``.
      2. Extract each hit's accession from its ``ref|<accession>|`` query id.
      3. Write identity/accession to ``candidates.tsv`` and read the file
         back, so every candidate field is a string as stored on disk.
      4. Download each candidate from NCBI efetch to its 'fasta_path'.

    Args:
        sample_id (str): Sample identifier, used in job names and log events.
        paths (dict): Must provide 'logs', 'job_scripts', 'reads1_fastq',
            'reads2_fastq', 'mash_refseq_plasmid_db' and
            'refseq_plasmid_output'.
    Returns:
        list of dict: one entry per candidate with the keys 'identity',
        'accession', 'fasta_path' and 'database' (always 'refseq').
    Raises:
        AttributeError: if a query id does not contain ``ref|...|``.
    """
    output_dir = paths['refseq_plasmid_output']
    mash_screen_result_path = os.path.join(output_dir, 'mash_screen.tsv')
    candidates_path = os.path.join(output_dir, 'candidates.tsv')

    mash_jobs = [
        {
            'job_name': "_".join(['mash_screen_refseq_plasmid', sample_id]),
            'output_path': paths['logs'],
            'error_path': paths['logs'],
            'native_specification': '-pe smp 8',
            'remote_command': os.path.join(paths['job_scripts'], 'mash_screen.sh'),
            'args': [
                "--R1", paths['reads1_fastq'],
                "--R2", paths['reads2_fastq'],
                "--queries", paths['mash_refseq_plasmid_db'],
                "--min-identity", 0.975,
                "--output_file", mash_screen_result_path,
            ],
        },
    ]
    run_jobs(mash_jobs)

    mash_screen_results = result_parsers.parse_mash_screen_result(
        mash_screen_result_path
    )
    logger.info(
        "parsed_result_file",
        timestamp=str(now()),
        filename=os.path.abspath(mash_screen_result_path)
    )

    # Raw string: '\|' in a plain literal is an invalid escape sequence.
    accession_pattern = re.compile(r'ref\|(.*)\|')
    for result in mash_screen_results:
        result['accession'] = accession_pattern.search(
            result['query_id']).group(1)

    candidates_keys = [
        'identity',
        'accession',
    ]

    # newline='' is what the csv module expects for files it reads/writes.
    with open(candidates_path, 'w+', newline='') as candidates_file:
        writer = csv.DictWriter(candidates_file, candidates_keys,
                                delimiter='\t', extrasaction='ignore')
        writer.writerows(mash_screen_results)

    candidates = []
    with open(candidates_path, 'r', newline='') as candidates_file:
        reader = csv.DictReader(candidates_file, fieldnames=candidates_keys,
                                delimiter='\t')
        for row in reader:
            row['fasta_path'] = os.path.join(
                output_dir,
                'candidates',
                row['accession'] + '.fna',
            )
            row['database'] = 'refseq'
            candidates.append(row)

    # Upper bound on attempts per file; the previous implementation retried
    # by unbounded recursion while NCBI kept answering 429.
    max_download_attempts = 10

    def download_retry(url, candidate):
        """
        NCBI Rate-limits refseq downloads to 3 per second from each IP.
        When multiple files are being analyzed simultaneously this limit
        may be exceeded. Retry after a pause on HTTP 429; give up (and log
        "download_failed") on any other HTTP error or once the attempt
        budget is spent.
        """
        for attempt in range(1, max_download_attempts + 1):
            try:
                urllib.request.urlretrieve(url, candidate['fasta_path'])
            except HTTPError as e:
                if int(e.code) == 429 and attempt < max_download_attempts:
                    time.sleep(5)
                    logger.info(
                        "retried_download",
                        timestamp=str(now()),
                        url=url,
                        accession=candidate['accession'],
                        sample_id=sample_id,
                    )
                    continue
                logger.error(
                    "download_failed",
                    timestamp=str(now()),
                    url=url,
                    accession=candidate['accession'],
                    sample_id=sample_id,
                )
                return
            else:
                logger.info(
                    "file_downloaded",
                    timestamp=str(now()),
                    url=url,
                    accession=candidate['accession'],
                    sample_id=sample_id,
                )
                return

    # NCBI Rate-limits downloads to 3 per second.
    for candidate in candidates:
        url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?" + \
              "&".join([
                  "db=nucleotide",
                  "id=" + candidate['accession'],
                  "rettype=fasta",
              ])
        download_retry(url, candidate)
        time.sleep(2)

    return candidates
def samtools_filter_fixmate_sort_discrete_jobs(sample_id, candidates, paths):
    """
    Run the samtools post-processing of each candidate alignment as five
    separate job batches: view, sort (name order), fixmate, sort, markdup.

    Every stage submits one job per candidate through ``run_jobs`` and
    consumes the file the previous stage was asked to write, so the file
    name grows by one suffix per stage::

        <accession>.sam
        <accession>.mapped.dedup.bam
        <accession>.mapped.dedup.namesort.bam
        <accession>.mapped.dedup.namesort.fixmate.bam
        <accession>.mapped.dedup.namesort.fixmate.coordsort.bam
        <accession>.mapped.dedup.namesort.fixmate.coordsort.markdup.bam

    Args:
        sample_id (str): Sample identifier; becomes part of each job name.
        candidates (list of dict): Each entry must have an 'accession' key.
        paths (dict): Must provide the 'plasmid_output' and 'logs' keys.
    Returns:
        (void)
    """
    # NOTE(review): `job_script_path` is not assigned inside this function;
    # it is assumed to be available at module level -- TODO confirm.

    def run_stage(tool, input_suffix, output_suffix, extra_args=()):
        # Submit one `samtools_<tool>.sh` job per candidate, reading
        # <accession><input_suffix> and writing <accession><output_suffix>.
        jobs = []
        for candidate in candidates:
            accession = candidate['accession']
            # All stages use the same 'plasmid_output' key (the first stage
            # used to look up 'plasmid_output_path' instead).
            source = os.path.join(paths['plasmid_output'],
                                  accession + input_suffix)
            target = os.path.join(paths['plasmid_output'],
                                  accession + output_suffix)
            jobs.append({
                'job_name': "_".join(['samtools_' + tool, sample_id, accession]),
                'output_path': paths['logs'],
                'error_path': paths['logs'],
                'native_specification': '-pe smp 4',
                'remote_command': os.path.join(job_script_path,
                                               'samtools_' + tool + '.sh'),
                'args': ["--input", source]
                        + list(extra_args)
                        + ["--output", target],
            })
        run_jobs(jobs)

    # '--flags 1540' excludes the following reads:
    #   - read unmapped (0x4)
    #   - read fails platform/vendor quality checks (0x200)
    #   - read is PCR or optical duplicate (0x400)
    run_stage('view', ".sam", ".mapped.dedup.bam",
              extra_args=("--flags", 1540))

    run_stage('sort', ".mapped.dedup.bam", ".mapped.dedup.namesort.bam",
              extra_args=("--name-order",))

    run_stage('fixmate', ".mapped.dedup.namesort.bam",
              ".mapped.dedup.namesort.fixmate.bam")

    run_stage('sort', ".mapped.dedup.namesort.fixmate.bam",
              ".mapped.dedup.namesort.fixmate.coordsort.bam")

    # The markdup job name was built with `"_".join([...], accession)`,
    # which raises TypeError; the shared helper builds it like the others.
    run_stage('markdup', ".mapped.dedup.namesort.fixmate.coordsort.bam",
              ".mapped.dedup.namesort.fixmate.coordsort.markdup.bam")
def custom_plasmids(sample_id, paths):
    """
    Screen the reads against the custom plasmid mash database and copy the
    FASTA file of every hit into the sample's candidates directory.

    Steps:
      1. Submit a ``mash_screen_custom_db.sh`` job (minimum identity 0.996)
         and parse the resulting ``mash_screen.tsv``.
      2. Load every ``data/*.dat`` record of the custom database and attach
         allele, circularity, plasmid length and incompatibility group to
         each hit (the accession is the query id without ``.fna``).
      3. Order the hits with a chain of stable sorts; the last sort is the
         primary key: incompatibility_group, circularity, identity
         (descending), plasmid_length (descending), accession.
      4. Write the hits to ``candidates.tsv`` and read the file back, so
         every candidate field is a string as stored on disk.
      5. Copy ``<accession>.fna`` from the database directory to each
         candidate's 'fasta_path'.

    Args:
        sample_id (str): Sample identifier, used in job names and log events.
        paths (dict): Must provide 'logs', 'job_scripts', 'reads1_fastq',
            'reads2_fastq', 'mash_custom_plasmid_db' and
            'custom_plasmid_output'.
    Returns:
        list of dict: one entry per candidate with the candidates.tsv
        columns plus 'fasta_path' and 'database' (always 'custom').
    Raises:
        KeyError: if a hit's accession has no record in the database.
    """
    output_dir = paths['custom_plasmid_output']
    mash_screen_result_path = os.path.join(output_dir, 'mash_screen.tsv')
    candidates_path = os.path.join(output_dir, 'candidates.tsv')

    mash_jobs = [
        {
            'job_name': "_".join(['mash_screen_custom_plasmid', sample_id]),
            'output_path': paths['logs'],
            'error_path': paths['logs'],
            'native_specification': '-pe smp 8 -shell y',
            'remote_command': os.path.join(paths['job_scripts'], 'mash_screen_custom_db.sh'),
            'args': [
                "--R1", paths['reads1_fastq'],
                "--R2", paths['reads2_fastq'],
                "--min-identity", 0.996,
                "--plasmid-db-dir", os.path.join(
                    paths['mash_custom_plasmid_db'],
                    "mash",
                ),
                "--output_file", mash_screen_result_path,
            ],
        },
    ]
    run_jobs(mash_jobs)

    mash_screen_results = result_parsers.parse_mash_screen_result(
        mash_screen_result_path
    )

    custom_plasmid_db_data = {}
    dat_glob = os.path.join(paths['mash_custom_plasmid_db'], "data", "*.dat")
    for dat_file in glob.glob(dat_glob):
        # The unpacking requires exactly one record per .dat file.
        [dat] = parsers.custom_plasmid_db_dat_parser(dat_file)
        custom_plasmid_db_data[dat['accession']] = dat

    db_fields = (
        'allele',
        'circularity',
        'plasmid_length',
        'incompatibility_group',
    )
    for mash_screen_result in mash_screen_results:
        # Raw string: '\.' in a plain literal is an invalid escape sequence.
        accession = re.sub(r'\.fna$', '', mash_screen_result['query_id'])
        mash_screen_result['accession'] = accession
        db_record = custom_plasmid_db_data[accession]
        for field in db_fields:
            mash_screen_result[field] = db_record[field]

    # list.sort is stable, so the key applied last has the highest priority.
    mash_screen_results.sort(key=operator.itemgetter('accession'))
    mash_screen_results.sort(key=operator.itemgetter('plasmid_length'), reverse=True)
    mash_screen_results.sort(key=operator.itemgetter('identity'), reverse=True)
    mash_screen_results.sort(key=operator.itemgetter('circularity'))
    mash_screen_results.sort(key=operator.itemgetter('incompatibility_group'))

    candidates_keys = [
        'identity',
        'accession',
        'circularity',
        'plasmid_length',
        'allele',
        'incompatibility_group',
    ]

    # newline='' is what the csv module expects for files it reads/writes.
    with open(candidates_path, 'w+', newline='') as candidates_file:
        writer = csv.DictWriter(candidates_file, candidates_keys,
                                delimiter='\t', extrasaction='ignore')
        writer.writerows(mash_screen_results)

    candidates = []
    with open(candidates_path, 'r', newline='') as candidates_file:
        reader = csv.DictReader(candidates_file, fieldnames=candidates_keys,
                                delimiter='\t')
        for row in reader:
            row['fasta_path'] = os.path.join(
                output_dir,
                'candidates',
                row['accession'] + '.fna',
            )
            row['database'] = 'custom'
            candidates.append(row)

    for candidate in candidates:
        candidate_fasta_db_path = os.path.join(
            paths['mash_custom_plasmid_db'],
            candidate['accession'] + ".fna"
        )
        shutil.copyfile(candidate_fasta_db_path, candidate['fasta_path'])
        logger.info(
            "file_copied",
            timestamp=str(now()),
            accession=candidate['accession'],
            sample_id=sample_id
        )

    return candidates
def main(args):
    """
    main entrypoint

    Runs the assembly module for one sample:
      1. pre-assembly QC jobs (mash dist against the genome db, fastqc,
         seqtk total base count),
      2. parsing of those results and download of the closest-match
         reference reported by mash,
      3. estimation of the depth of coverage (total bp / estimated genome
         size), written to the estimated coverage stats TSV,
      4. assembly with shovill,
      5. quast on the assembly and parsing of the busco/quast results.

    QC verdicts are collected in a local dict; they are not returned.

    Args:
        args(): parsed arguments; ``config_file``, ``sample_id``,
            ``reads1_fastq``, ``reads2_fastq``, ``outdir`` and
            ``expected_organism_ncbi_taxid`` are read. ``mash_genome_db`` is
            optional and falls back to ``[databases] mash_genome_db`` of the
            config file when absent or empty.
    Returns:
        (void)
    Raises:
        Exception: the configuration error is re-raised (after logging)
            when no usable mash genome db can be determined; likewise the
            lookup error when mash produced no closest-match reference.
    """
    config = configparser.ConfigParser()
    config.read(args.config_file)

    # An attribute that exists but is None/empty is treated as "not given",
    # which is what argparse produces for an omitted option.
    mash_genome_db = getattr(args, 'mash_genome_db', None)
    if not mash_genome_db:
        try:
            mash_genome_db = config['databases']['mash_genome_db']
            if not os.path.exists(mash_genome_db):
                raise FileNotFoundError(errno.ENOENT,
                                        os.strerror(errno.ENOENT),
                                        mash_genome_db)
        except Exception as e:
            logger.error(
                "configuration_failed",
                timestamp=str(now()),
                configuration_attribute="databases/mash_genome_db",
                error_message=str(e),
            )
            # Nothing below can work without the database.
            raise
        else:
            logger.info(
                "configuration_loaded",
                timestamp=str(now()),
                configuration_attribute="databases/mash_genome_db",
                configuration_value=mash_genome_db,
            )

    sample_id = args.sample_id
    reads1_fastq = args.reads1_fastq
    reads2_fastq = args.reads2_fastq
    output_dir = args.outdir

    prepare_output_directories(output_dir, sample_id)

    #dictionary to store QC PASS/FAIL flags
    qc_verdicts = {
        "multiple_species_contamination": None,
        "fastq_contains_plasmids": None,
        "acceptable_coverage": None,
        "acceptable_fastqc_forward": None,
        "acceptable_fastqc_reverse": None,
        "acceptable_quast_assembly_metrics": None,
        "acceptable_busco_assembly_metrics": None
    }

    qc_thresholds = {
        # genome mash will include all hits with scores (top hit score - $thisvalue)
        "mash_hits_genome_score_cutoff": 300,
        # plasmid mash will include all hits with scores (top hit score - $thisvalue)
        "mash_hits_plasmid_score_cutoff": 100,
        # sequencing coverage greater than ($thisvalue) will pass the QC
        "coverage_cutoff": 30,
        # QUAST QC: assembly length within +-($thisvalue) percent
        # in reference to reference length will pass the QC
        "quast_assembly_length_cutoff": 0.10,
        # BUSCO QC: complete single genes greater than ($thisvalue) percent will pass the QC
        "busco_complete_single_cutoff": 0.90,
        # BUSCO QC: complete duplicate genes less than ($thisvalue) percent will pass the QC
        "busco_complete_duplicate_cutoff": 0.10
    }

    paths = {
        "output_dir": output_dir,
        'logs': os.path.join(
            output_dir,
            sample_id,
            'logs',
        ),
        "mash_genome_path": os.path.join(output_dir, sample_id, "pre-assembly_qc", "mash_dist.genome.tsv"),
        "fastqc_output_path": os.path.join(output_dir, sample_id, "pre-assembly_qc", "fastqc"),
        "totalbp_path": os.path.join(output_dir, sample_id, "pre-assembly_qc", "totalbp"),
        "estimated_coverage_stats_path": os.path.join(output_dir, sample_id, "pre-assembly_qc", "estimated_coverage_stats.tsv"),
        "reference_genome_path": os.path.join(output_dir, sample_id, "reference"),
        "assembly_output": os.path.join(output_dir, sample_id, "assembly"),
        "quast_path": os.path.join(output_dir, sample_id, "post-assembly_qc", "quast"),
    }

    job_script_path = resource_filename('data', 'job_scripts')

    estimated_genome_sizes_path = resource_filename(
        'data', 'estimated_genome_sizes.tsv')
    estimated_genome_sizes = input_parsers.parse_estimated_genome_sizes(
        estimated_genome_sizes_path)

    pre_assembly_qc_jobs = [{
        'job_name': "_".join(['mash_dist_sort_head', sample_id]),
        'output_path': paths['logs'],
        'error_path': paths['logs'],
        'native_specification': '-pe smp 8',
        'remote_command': os.path.join(job_script_path, 'mash_dist_sort_head.sh'),
        'args': [
            "--R1", reads1_fastq,
            "--R2", reads2_fastq,
            "--queries", mash_genome_db,
            "--output_file", paths['mash_genome_path']
        ],
    }, {
        'job_name': "_".join(['fastqc', sample_id]),
        'output_path': paths['logs'],
        'error_path': paths['logs'],
        'native_specification': '-pe smp 8',
        'remote_command': os.path.join(job_script_path, 'fastqc.sh'),
        'args': [
            "--R1", reads1_fastq,
            "--R2", reads2_fastq,
            "--output_dir", paths['fastqc_output_path']
        ],
    }, {
        'job_name': "_".join(['seqtk_totalbp', sample_id]),
        'output_path': paths['logs'],
        'error_path': paths['logs'],
        'native_specification': '-pe smp 8',
        'remote_command': os.path.join(job_script_path, 'seqtk_totalbp.sh'),
        'args': [
            "--R1", reads1_fastq,
            "--R2", reads2_fastq,
            "--output_file", paths['totalbp_path']
        ],
    }]
    run_jobs(pre_assembly_qc_jobs)

    #parse genome mash results
    mash_dist_results = []
    try:
        mash_dist_results = result_parsers.parse_mash_dist_result(
            paths["mash_genome_path"])
        logger.info(
            "parsed_result_file",
            timestamp=str(now()),
            filename=os.path.abspath(paths["mash_genome_path"]),
            closest_match_reference_id=mash_dist_results[0]['reference_id'],
        )
    except Exception as e:
        # str(e): Python 3 exceptions have no `.message` attribute.
        logger.error(
            "result_parsing_failed",
            timestamp=str(now()),
            filename=os.path.abspath(paths["mash_genome_path"]),
            error_message=str(e),
        )

    # parse fastqc
    fastqc_modules = [
        "basic_statistics",
        "per_base_sequence_quality",
        "per_tile_sequence_quality",
        "per_sequence_quality_scores",
        "per_base_sequence_content",
        "per_sequence_gc_content",
        "per_base_n_content",
        "sequence_length_distribution",
        "sequence_duplication_levels",
        "overrepresented_sequences",
        "adapter_content",
    ]
    fastqc_results = {}
    for read in ["R1", "R2"]:
        # Built before the try so the failure log can always name it.
        fastqc_summary_glob = os.path.join(
            paths['fastqc_output_path'],
            "*_" + read + "_*" + "fastqc",
            'summary.txt')
        try:
            [fastqc_result_summary_path] = glob.glob(fastqc_summary_glob)
            fastqc_results[read] = result_parsers.parse_fastqc_result(
                fastqc_result_summary_path)
        except Exception as e:
            logger.error(
                "result_parsing_failed",
                timestamp=str(now()),
                filename=fastqc_summary_glob,
                error_message=str(e),
            )
            # Only the read that failed is marked; a successfully parsed
            # mate keeps its real result.
            fastqc_results[read] = dict.fromkeys(fastqc_modules,
                                                 "FAILED_TO_PARSE")
        else:
            logger.info(
                "parsed_result_file",
                timestamp=str(now()),
                filename=os.path.abspath(fastqc_result_summary_path),
                summary=fastqc_results[read],
            )

    #look at fastqc results
    qc_verdicts["acceptable_fastqc_forward"] = qc.fastqc_qc_check(
        fastqc_results["R1"])
    qc_verdicts["acceptable_fastqc_reverse"] = qc.fastqc_qc_check(
        fastqc_results["R2"])

    try:
        reference_genome = mash_dist_results[0]['reference_id']
    except Exception as e:
        logger.error(
            "failed_quality_control_check",
            timestamp=str(now()),
            qc_check_failed="determine_reference_sequence",
            error_message=str(e),
        )
        # Without a reference id the download below cannot run.
        raise

    # build the save paths
    try:
        os.makedirs(paths['reference_genome_path'])
    except OSError as exc:
        if exc.errno != errno.EEXIST:
            raise

    download_refseq_reference(reference_genome, paths['reference_genome_path'])

    # If the user passes an expected organism NCBI taxonomy ID, then
    # use that to estimate the genome size. Otherwise, use the downloaded reference.
    estimated_genome_size = DEFAULT_ESTIMATED_GENOME_SIZE
    if args.expected_organism_ncbi_taxid:
        estimated_genome_size = get_estimated_genome_size(
            estimated_genome_sizes, args.expected_organism_ncbi_taxid)
    else:
        reference_genome_assembly_stats_path = None
        try:
            [reference_genome_assembly_stats_path] = glob.glob(
                paths["reference_genome_path"] + "/*_assembly_stats.txt")
        except ValueError:
            # Zero or several matches: keep the default genome size.
            logger.error(
                "result_parsing_failed",
                timestamp=str(now()),
                filename=str(os.path.abspath(paths["reference_genome_path"])) +
                "/*_assembly_stats.txt",
            )

        if reference_genome_assembly_stats_path is not None:
            try:
                reference_genome_assembly_stats = result_parsers.parse_reference_genome_assembly_stats(
                    reference_genome_assembly_stats_path)
                logger.info(
                    "parsed_result_file",
                    timestamp=str(now()),
                    filename=os.path.abspath(reference_genome_assembly_stats_path),
                    total_length=reference_genome_assembly_stats['total_length'],
                    contig_count=reference_genome_assembly_stats['contig_count'],
                    contig_N50=reference_genome_assembly_stats['contig_N50'],
                    organism_name=reference_genome_assembly_stats['organism_name'],
                    infraspecific_name=reference_genome_assembly_stats[
                        'infraspecific_name'],
                    ncbi_taxonomy_id=reference_genome_assembly_stats['taxid'],
                    refseq_assembly_accession=reference_genome_assembly_stats[
                        'refseq_assembly_accession'],
                )
                estimated_genome_size = reference_genome_assembly_stats[
                    'total_length']
            except Exception as e:
                logger.error(
                    "result_parsing_failed",
                    timestamp=str(now()),
                    filename=os.path.abspath(reference_genome_assembly_stats_path),
                    error_message=str(e),
                )

    total_bp = result_parsers.parse_total_bp(paths["totalbp_path"])
    logger.info(
        "parsed_result_file",
        timestamp=str(now()),
        filename=os.path.abspath(paths["totalbp_path"]),
        total_bp=total_bp,
    )

    estimated_depth_of_coverage = total_bp / estimated_genome_size
    # Record an explicit False on failure instead of leaving the verdict None.
    qc_verdicts["acceptable_coverage"] = (
        estimated_depth_of_coverage >= int(qc_thresholds["coverage_cutoff"]))

    estimated_coverage_stats_headers = [
        'sample_id',
        'total_bp',
        'estimated_genome_size',
        'estimated_depth_of_coverage',
    ]

    with open(paths['estimated_coverage_stats_path'], 'w+') as f:
        writer = csv.DictWriter(f,
                                fieldnames=estimated_coverage_stats_headers,
                                delimiter='\t')
        writer.writeheader()
        writer.writerow({
            'sample_id': sample_id,
            'total_bp': int(total_bp),
            'estimated_genome_size': int(estimated_genome_size),
            'estimated_depth_of_coverage': round(estimated_depth_of_coverage, 4),
        })

    assembly_jobs = [{
        'job_name': "_".join(['shovill', sample_id]),
        'output_path': paths['logs'],
        'error_path': paths['logs'],
        'native_specification': '-pe smp 16 -l h_vmem=4G',
        'remote_command': os.path.join(job_script_path, 'shovill.sh'),
        'args': [
            "--R1", reads1_fastq,
            "--R2", reads2_fastq,
            "--mincov", "3",
            "--minlen", "500",
            "--output_dir", paths['assembly_output']
        ],
    }]
    run_jobs(assembly_jobs)

    post_assembly_qc_jobs = [
        {
            'job_name': "_".join(['quast', sample_id]),
            'output_path': paths['logs'],
            'error_path': paths['logs'],
            'native_specification': '-pe smp 8',
            'remote_command': os.path.join(job_script_path, 'quast.sh'),
            'args': [
                "--input", os.path.join(paths['assembly_output'], "contigs.fa"),
                "--outdir", paths['quast_path']
            ]
        },
    ]
    run_jobs(post_assembly_qc_jobs)

    busco_short_summary_contigs_path = os.path.abspath(
        paths["quast_path"] + "/busco_stats/short_summary_contigs.txt")
    busco_results = result_parsers.parse_busco_result(
        busco_short_summary_contigs_path)
    logger.info(
        "parsed_result_file",
        timestamp=str(now()),
        filename=os.path.abspath(busco_short_summary_contigs_path),
        busco_results=busco_results,
    )

    quast_report_path = os.path.abspath(paths["quast_path"] + "/report.txt")
    quast_results = result_parsers.parse_quast_result(quast_report_path)
    logger.info(
        "parsed_result_file",
        timestamp=str(now()),
        filename=os.path.abspath(quast_report_path),
        num_contigs=quast_results["num_contigs"],
        N50=quast_results["N50"],
    )

    qc_verdicts["acceptable_busco_assembly_metrics"] = qc.busco_qc_check(
        busco_results, qc_thresholds)
    qc_verdicts["acceptable_quast_assembly_metrics"] = qc.quast_qc_check(
        quast_results, estimated_genome_size)
def main(args):
    """
    main entrypoint

    Submits the mlst, abricate (plasmidfinder database) and mob_recon jobs
    for one sample, then parses and logs the reports they produce.

    Args:
        args: Parsed arguments object. ``config_file``, ``sample_id`` and
            ``outdir`` are read unconditionally; ``assembly`` and
            ``mlst_scheme_map_file`` are optional and fall back to
            defaults when absent.
    Returns:
        (void)
    Raises:
        ValueError: if the MLST report does not contain exactly one result.
    """

    config = configparser.ConfigParser()
    config.read(args.config_file)

    sample_id = args.sample_id
    output_dir = args.outdir

    # Optional arguments: callers may hand in a namespace without them.
    assembly = getattr(
        args, 'assembly',
        os.path.join(output_dir, sample_id, 'assembly', 'contigs.fa'))
    # A missing attribute and an empty value both fall back to the bundled
    # scheme/species map.
    mlst_scheme_map_file = (
        getattr(args, 'mlst_scheme_map_file', None)
        or resource_filename('data', 'scheme_species_map.tab'))

    paths = {
        "output_dir": output_dir,
        'logs': os.path.join(
            output_dir,
            sample_id,
            'logs',
        ),
        'mlst_path': os.path.join(
            output_dir, sample_id, 'typing', 'mlst', 'mlst.tsv'),
        'mob_recon_path': os.path.join(
            output_dir, sample_id, 'typing', 'mob_recon'),
        'abricate_plasmidfinder_path': os.path.join(
            output_dir, sample_id, 'typing', 'abricate',
            'abricate_plasmidfinder.tsv'),
    }

    job_script_path = resource_filename('data', 'job_scripts')

    typing_jobs = [
        {
            'job_name': "_".join(['mlst', sample_id]),
            'output_path': paths['logs'],
            'error_path': paths['logs'],
            'native_specification': '-pe smp 8',
            'remote_command': os.path.join(job_script_path, 'mlst.sh'),
            'args': [
                "--input", assembly,
                "--label", sample_id,
                "--output_file", paths['mlst_path'],
            ],
        },
        {
            'job_name': "_".join(['abricate', sample_id]),
            'output_path': paths['logs'],
            'error_path': paths['logs'],
            'native_specification': '-pe smp 8',
            'remote_command': os.path.join(job_script_path, 'abricate.sh'),
            'args': [
                "--input", assembly,
                "--database", "plasmidfinder",
                "--output_file", paths['abricate_plasmidfinder_path'],
            ],
        },
        {
            'job_name': "_".join(['mob_recon', sample_id]),
            'output_path': paths['logs'],
            'error_path': paths['logs'],
            'native_specification': '-pe smp 8',
            'remote_command': os.path.join(job_script_path, 'mob_recon.sh'),
            'args': [
                "--input", assembly,
                "--output_dir", paths['mob_recon_path'],
            ],
        },
    ]

    run_jobs(typing_jobs)

    # Read the report from the same location the mlst job was told to write.
    mlst_report = paths['mlst_path']
    mlst_hits = list(result_parsers.parse_mlst_result(mlst_report))
    # TODO: handle cases where the report is malformed.
    if len(mlst_hits) != 1:
        raise ValueError(
            "expected exactly one MLST result in {}, found {}".format(
                mlst_report, len(mlst_hits)))
    mlst_hit = mlst_hits[0]
    logger.info(
        "parsed_result_file",
        timestamp=str(now()),
        filename=os.path.abspath(mlst_report),
        scheme_id=mlst_hit["scheme_id"],
        sequence_type=mlst_hit["sequence_type"],
    )

    mlst_scheme_map = input_parsers.parse_scheme_species_map(
        mlst_scheme_map_file)
    mlst_species = "Undefined"
    # No early exit: when several schemes match, the last one wins.
    for scheme in mlst_scheme_map:
        if 'species' in scheme and scheme['scheme_id'] == mlst_hit['scheme_id']:
            mlst_species = scheme['species']

    mob_recon_contig_report_path = os.path.join(
        paths['mob_recon_path'], "contig_report.txt")
    mob_recon_contig_report = result_parsers.parse_mob_recon_contig_report(
        mob_recon_contig_report_path)
    logger.info(
        "parsed_result_file",
        timestamp=str(now()),
        filename=os.path.abspath(mob_recon_contig_report_path),
        num_records=len(mob_recon_contig_report),
    )

    mob_recon_aggregate_report_path = os.path.join(
        paths['mob_recon_path'], "mobtyper_aggregate_report.txt")
    mob_recon_aggregate_report = (
        result_parsers.parse_mob_recon_mobtyper_aggregate_report(
            mob_recon_aggregate_report_path))
    logger.info(
        "parsed_result_file",
        timestamp=str(now()),
        filename=os.path.abspath(mob_recon_aggregate_report_path),
        num_records=len(mob_recon_aggregate_report),
    )

    def extract_contig_num(contig_id):
        """
        Given a contig_id from a mob_recon contig_report.txt file, return only the contig number.
        Args:
            contig_id (str): contig_id field from mob_recon contig_report.txt
            For example: "contigs.fa|contig00054_len=2672_cov=424.9_corr=0_origname=NODE_54_length_2672_cov_424.949312_pilon_sw=shovill-spades/1.0.1_date=20181024"
        Returns:
            str: contig number, i.e. the text between '|contig' and the
            following '_len='. When either marker is missing the contig_id
            is returned unchanged instead of an arbitrary slice of it.
            For example: "00054"
        """
        prefix = '|contig'
        suffix = '_len='
        _, prefix_found, remainder = contig_id.partition(prefix)
        contig_num, suffix_found, _ = remainder.partition(suffix)
        if not (prefix_found and suffix_found):
            return contig_id
        return contig_num

    def get_plasmid_contigs(mob_recon_contig_report):
        """
        Given a list of dicts generated by parsing a mob_recon contig_report.txt file,
        return the contig numbers of records that have a 'rep_type'.
        Args:
            mob_recon_contig_report (list of dict):
        Returns:
            list: plasmid contigs, without duplicates, in report order
            For example: ['00021', '00022', '00032', ...]
        """
        plasmid_contigs = []
        for contig_report_record in mob_recon_contig_report:
            contig_num = extract_contig_num(contig_report_record['contig_id'])
            if (contig_num not in plasmid_contigs
                    and contig_report_record['rep_type']):
                plasmid_contigs.append(contig_num)
        return plasmid_contigs

    def get_likely_plasmid_contigs(mob_recon_contig_report):
        """
        Given a list of dicts generated by parsing a mob_recon contig_report.txt file,
        return the contig numbers of records that have no 'rep_type'.
        Args:
            mob_recon_contig_report (list of dict):
        Returns:
            list: likely plasmid contigs, without duplicates, in report order
            For example: ['00054', '00039', '00061', ...]
        """
        likely_plasmid_contigs = []
        for contig_report_record in mob_recon_contig_report:
            contig_num = extract_contig_num(contig_report_record['contig_id'])
            if (contig_num not in likely_plasmid_contigs
                    and not contig_report_record['rep_type']):
                likely_plasmid_contigs.append(contig_num)
        return likely_plasmid_contigs

    def get_plasmid_origins(mob_recon_contig_report):
        """
        Given a list of dicts generated by parsing a mob_recon contig_report.txt file,
        return the distinct non-empty 'rep_type' values.
        Args:
            mob_recon_contig_report (list of dict):
        Returns:
            list: plasmid origins, without duplicates, in report order
            For example: ['rep_cluster_1254', 'IncL/M', 'IncN', ...]
        """
        origins = []
        for contig_report_record in mob_recon_contig_report:
            rep_type = contig_report_record['rep_type']
            if rep_type and rep_type not in origins:
                origins.append(rep_type)
        return origins

    plasmid_contigs = get_plasmid_contigs(mob_recon_contig_report)
    likely_plasmid_contigs = get_likely_plasmid_contigs(
        mob_recon_contig_report)
    origins = get_plasmid_origins(mob_recon_contig_report)
def main(args):
    """
    main entrypoint

    Resolves the CARD and abricate database settings, submits the abricate
    and rgi jobs for one sample, then parses and logs both reports.

    Args:
        args(argparse.Namespace): Parsed command-line arguments. Reads
            ``config_file``, ``sample_id`` and ``outdir``; ``assembly``,
            ``card_json``, ``abricate_datadir`` and
            ``abricate_cpo_plasmid_db`` are optional and fall back to a
            default path or to the ``databases`` section of the config file.
    Returns:
        (void)
    """

    config = configparser.ConfigParser()
    config.read(args.config_file)

    sample_id = args.sample_id
    output_dir = args.outdir

    try:
        assembly = args.assembly
    except AttributeError:
        assembly = os.path.join(
            output_dir, sample_id, 'assembly', 'contigs.fa')

    # Each database setting is taken from args when the attribute exists,
    # otherwise from the config file. Failures are logged, not raised.
    # NOTE: if the config lookup itself fails, the variable stays unbound
    # and building the job list below raises.
    try:
        card_path = args.card_json
    except AttributeError:
        try:
            card_path = config['databases']['card_json']
            if not os.path.exists(card_path):
                raise FileNotFoundError(
                    errno.ENOENT, os.strerror(errno.ENOENT), card_path)
            logger.info(
                "configuration_loaded",
                timestamp=str(now()),
                configuration_attribute="databases/card_json",
                configuration_value=card_path,
            )
        except Exception as e:
            logger.error(
                "configuration_failed",
                timestamp=str(now()),
                configuration_attribute="databases/card_json",
                error_message=str(e),
            )

    try:
        abricate_datadir = args.abricate_datadir
    except AttributeError:
        try:
            abricate_datadir = config['databases']['abricate_datadir']
            if not os.path.exists(abricate_datadir):
                raise FileNotFoundError(
                    errno.ENOENT, os.strerror(errno.ENOENT),
                    abricate_datadir)
            logger.info(
                "configuration_loaded",
                timestamp=str(now()),
                configuration_attribute="databases/abricate_datadir",
                configuration_value=abricate_datadir,
            )
        except Exception as e:
            logger.error(
                "configuration_failed",
                timestamp=str(now()),
                configuration_attribute="databases/abricate_datadir",
                error_message=str(e),
            )

    try:
        abricate_cpo_plasmid_db = args.abricate_cpo_plasmid_db
    except AttributeError:
        try:
            abricate_cpo_plasmid_db = config['databases'][
                'abricate_cpo_plasmid_db']
            # The database is a name looked up inside abricate_datadir.
            if not os.path.exists(
                    os.path.join(abricate_datadir, abricate_cpo_plasmid_db)):
                raise FileNotFoundError(
                    errno.ENOENT, os.strerror(errno.ENOENT),
                    abricate_cpo_plasmid_db)
            logger.info(
                "configuration_loaded",
                timestamp=str(now()),
                configuration_attribute="databases/abricate_cpo_plasmid_db",
                configuration_value=abricate_cpo_plasmid_db,
            )
        except Exception as e:
            logger.error(
                "configuration_failed",
                timestamp=str(now()),
                configuration_attribute="databases/abricate_cpo_plasmid_db",
                # Report the cause, as the two handlers above do.
                error_message=str(e),
            )

    paths = {
        "output_dir": output_dir,
        'logs': os.path.join(
            output_dir,
            sample_id,
            'logs',
        ),
        'abricate_path': os.path.join(
            output_dir, sample_id, 'resistance', 'abricate', 'abricate.tsv'),
        'rgi_path': os.path.join(
            output_dir, sample_id, 'resistance', 'rgi'),
    }

    job_script_path = resource_filename('data', 'job_scripts')

    resistance_jobs = [
        {
            'job_name': "_".join(['abricate', sample_id]),
            'output_path': paths['logs'],
            'error_path': paths['logs'],
            'native_specification': '-pe smp 8',
            'remote_command': os.path.join(job_script_path, 'abricate.sh'),
            'args': [
                "--input", assembly,
                "--datadir", abricate_datadir,
                "--database", abricate_cpo_plasmid_db,
                "--output_file", paths['abricate_path'],
            ],
        },
        {
            'job_name': "_".join(['rgi', sample_id]),
            'output_path': paths['logs'],
            'error_path': paths['logs'],
            'native_specification': '-pe smp 8',
            'remote_command': os.path.join(job_script_path, 'rgi.sh'),
            'args': [
                "--input", assembly,
                "--card_json", card_path,
                "--output_dir", paths['rgi_path'],
            ],
        },
    ]

    run_jobs(resistance_jobs)

    # Read each report from the location its job was told to write to.
    abricate_report_path = paths['abricate_path']
    abricate_report = result_parsers.parse_abricate_result(
        abricate_report_path)
    logger.info(
        "parsed_result_file",
        timestamp=str(datetime.datetime.utcnow().replace(
            tzinfo=datetime.timezone.utc).isoformat()),
        filename=os.path.abspath(abricate_report_path),
        resistance_genes=[
            {
                key: record[key]
                for key in [
                    "gene",
                    "accession",
                    "database",
                    "percent_coverage",
                    "percent_identity",
                ]
            }
            for record in abricate_report
        ],
    )

    rgi_report_path = os.path.join(paths['rgi_path'], "rgi.txt")
    rgi_report = result_parsers.parse_rgi_result_txt(rgi_report_path)
    logger.info(
        "parsed_result_file",
        timestamp=str(datetime.datetime.utcnow().replace(
            tzinfo=datetime.timezone.utc).isoformat()),
        filename=os.path.abspath(rgi_report_path),
        resistance_genes=[
            {
                key: record[key]
                for key in [
                    "best_hit_aro",
                    "aro",
                ]
            }
            for record in rgi_report
        ],
    )

    def get_abricate_carbapenemases(abricate_report):
        """
        Given a list of dicts generated by parsing an abricate report file,
        return the 'gene' field of every record.
        Args:
            abricate_report (list of dict):
        Returns:
            list: gene names, one per record, in report order
            For example: ['NDM-1', '', '', ...]
        """
        return [record['gene'] for record in abricate_report]

    def get_rgi_carbapenemases(rgi_report):
        """
        Given a list of dicts generated by parsing an rgi report file,
        return the 'best_hit_aro' field of every record whose 'drug_class'
        contains "carbapenem".
        Args:
            rgi_report (list of dict):
        Returns:
            list: best_hit_aro values of the matching records, in report order
            For example: ['', '', '', ...]
        """
        return [
            record['best_hit_aro']
            for record in rgi_report
            if "carbapenem" in record['drug_class']
        ]
def main(args):
    """
    main entrypoint

    Resolves the mash plasmid databases, collects candidate reference
    plasmids, runs the per-candidate indexing / mapping / depth / variant
    jobs, and writes two tab-separated summaries of the custom-database
    candidates.

    Args:
        args: Parsed arguments object. Reads ``config_file``, ``sample_id``,
            ``outdir``, ``reads1_fastq`` and ``reads2_fastq``;
            ``mash_refseq_plasmid_db`` and ``mash_custom_plasmid_db`` are
            optional and fall back to the ``databases`` section of the
            config file.
    Returns:
        (void)
    """

    config = configparser.ConfigParser()
    config.read(args.config_file)

    # Each database setting is taken from args when the attribute exists,
    # otherwise from the config file. Failures are logged, not raised.
    try:
        mash_refseq_plasmid_db = args.mash_refseq_plasmid_db
    except AttributeError:
        try:
            mash_refseq_plasmid_db = config['databases'][
                'mash_refseq_plasmid_db']
            if not os.path.exists(mash_refseq_plasmid_db):
                raise FileNotFoundError(
                    errno.ENOENT, os.strerror(errno.ENOENT),
                    mash_refseq_plasmid_db)
            logger.info(
                "configuration_loaded",
                timestamp=str(now()),
                configuration_attribute="databases/mash_refseq_plasmid_db",
                configuration_value=mash_refseq_plasmid_db,
            )
        except Exception as e:
            logger.error(
                "configuration_failed",
                timestamp=str(now()),
                configuration_attribute="databases/mash_refseq_plasmid_db",
                error_message=str(e),
            )

    try:
        mash_custom_plasmid_db = args.mash_custom_plasmid_db
    except AttributeError:
        try:
            mash_custom_plasmid_db = config['databases'][
                'mash_custom_plasmid_db']
            if not os.path.exists(mash_custom_plasmid_db):
                raise FileNotFoundError(
                    errno.ENOENT, os.strerror(errno.ENOENT),
                    mash_custom_plasmid_db)
            logger.info(
                "configuration_loaded",
                timestamp=str(now()),
                configuration_attribute="databases/mash_custom_plasmid_db",
                configuration_value=mash_custom_plasmid_db,
            )
        except Exception as e:
            logger.error(
                "configuration_failed",
                timestamp=str(now()),
                configuration_attribute="databases/mash_custom_plasmid_db",
                error_message=str(e),
            )

    sample_id = args.sample_id
    output_dir = args.outdir

    paths = {
        'job_scripts': resource_filename('data', 'job_scripts'),
        'reads1_fastq': args.reads1_fastq,
        'reads2_fastq': args.reads2_fastq,
        'mash_custom_plasmid_db': mash_custom_plasmid_db,
        'mash_refseq_plasmid_db': mash_refseq_plasmid_db,
        'output_dir': output_dir,
        'logs': os.path.join(
            output_dir,
            sample_id,
            'logs',
        ),
        'plasmid_output': os.path.join(
            output_dir,
            sample_id,
            "plasmids",
        ),
        "refseq_plasmid_output": os.path.join(
            output_dir,
            sample_id,
            "plasmids",
            "refseq_plasmids",
        ),
        "custom_plasmid_output": os.path.join(
            output_dir,
            sample_id,
            "plasmids",
            "custom_plasmids",
        ),
    }

    os.makedirs(paths['logs'], exist_ok=True)
    os.makedirs(
        os.path.join(paths['custom_plasmid_output'], 'candidates'),
        exist_ok=True)
    os.makedirs(
        os.path.join(paths['refseq_plasmid_output'], 'candidates'),
        exist_ok=True)

    refseq_candidates = strategies.refseq_plasmids(sample_id, paths)
    custom_candidates = strategies.custom_plasmids(sample_id, paths)
    candidates = refseq_candidates + custom_candidates

    def sibling_path(candidate, extension):
        """Return the candidate's fasta path with a trailing '.fna' replaced by ``extension``."""
        # Anchored on purpose: only the file suffix may change, so every
        # step derives the same file names.
        return re.sub(r'\.fna$', extension, candidate['fasta_path'])

    def candidate_job(tool, candidate, native_specification, job_args):
        """Build the job description that runs ``<tool>.sh`` for one candidate."""
        return {
            'job_name': "_".join([tool, sample_id, candidate['accession']]),
            'output_path': paths['logs'],
            'error_path': paths['logs'],
            'native_specification': native_specification,
            'remote_command': os.path.join(
                paths['job_scripts'], tool + '.sh'),
            'args': job_args,
        }

    # Index every candidate reference. All faidx jobs are listed before the
    # bwa index jobs, as before.
    samtools_faidx_jobs = [
        candidate_job(
            'samtools_faidx', candidate, '-pe smp 2',
            ["--fasta", candidate['fasta_path']])
        for candidate in candidates
    ]
    bwa_index_jobs = [
        candidate_job(
            'bwa_index', candidate, '-pe smp 2',
            ["--fasta", candidate['fasta_path']])
        for candidate in candidates
    ]
    run_jobs(samtools_faidx_jobs + bwa_index_jobs)

    # Map the reads against every candidate.
    bwa_mem_jobs = [
        candidate_job(
            'bwa_mem', candidate, '-pe smp 8 -shell y',
            [
                "--reference", candidate['fasta_path'],
                "--R1", paths['reads1_fastq'],
                "--R2", paths['reads2_fastq'],
                "--output", sibling_path(candidate, '.sam'),
            ])
        for candidate in candidates
    ]
    run_jobs(bwa_mem_jobs)

    samtools_filter_fixmate_sort_jobs = [
        candidate_job(
            'samtools_filter_fixmate_sort', candidate, '-pe smp 4',
            [
                "--input", sibling_path(candidate, '.sam'),
                "--flags", 1540,
                "--output", sibling_path(candidate, '.bam'),
            ])
        for candidate in candidates
    ]
    run_jobs(samtools_filter_fixmate_sort_jobs)

    # The SAM files are removed once the filter/sort step has run.
    for candidate in candidates:
        os.remove(sibling_path(candidate, '.sam'))

    samtools_index_jobs = [
        candidate_job(
            'samtools_index', candidate, '-pe smp 4',
            ["--input", sibling_path(candidate, '.bam')])
        for candidate in candidates
    ]
    run_jobs(samtools_index_jobs)

    samtools_depth_jobs = [
        candidate_job(
            'samtools_depth', candidate, '-pe smp 1',
            [
                "--input", sibling_path(candidate, '.bam'),
                "--output", sibling_path(candidate, '.depth'),
            ])
        for candidate in candidates
    ]
    run_jobs(samtools_depth_jobs)

    MINIMUM_DEPTH = 10
    # Kept from the original; nothing in this block applies it.
    MINIMUM_COVERAGE_PERCENT = 95.0

    for candidate in candidates:
        positions_above_minimum_depth = 0
        total_length = 0
        with open(sibling_path(candidate, '.depth')) as depth_file:
            for line in depth_file:
                # Exactly three whitespace-separated fields per line; the
                # third is read as the depth.
                _, _, depth_at_position = line.split()
                total_length += 1
                if int(depth_at_position) >= MINIMUM_DEPTH:
                    positions_above_minimum_depth += 1
        candidate['bases_above_minimum_depth'] = positions_above_minimum_depth
        # Despite the key name this is a fraction in [0, 1], not a
        # percentage. An empty depth file yields 0.0.
        candidate['percent_above_minimum_depth'] = (
            positions_above_minimum_depth / total_length
            if total_length else 0.0)

    freebayes_jobs = [
        candidate_job(
            'freebayes', candidate, '-pe smp 8',
            [
                "--input", sibling_path(candidate, '.bam'),
                "--reference", candidate['fasta_path'],
                "--output", sibling_path(candidate, '.vcf'),
            ])
        for candidate in candidates
    ]
    run_jobs(freebayes_jobs)

    bcftools_view_jobs = [
        candidate_job(
            'bcftools_view', candidate, '-pe smp 2 -shell y',
            [
                "--input", sibling_path(candidate, '.vcf'),
                "--output", sibling_path(candidate, '.snps.vcf'),
            ])
        for candidate in candidates
    ]
    run_jobs(bcftools_view_jobs)

    # Every non-header line of the filtered VCF counts as one SNP.
    for candidate in candidates:
        with open(sibling_path(candidate, '.snps.vcf'), 'r') as f:
            candidate['snps'] = sum(
                1 for line in f if not line.startswith('#'))

    plasmid_output_summary = os.path.join(
        paths['plasmid_output'], 'custom_plasmid.txt')
    plasmid_output_final = os.path.join(
        output_dir, sample_id, 'final_plasmid.tsv')

    custom_candidates = [c for c in candidates if c['database'] == 'custom']
    # Three stable sorts, least significant key first. Final order is:
    # highest fraction above minimum depth, then longest plasmid, then
    # fewest SNPs.
    custom_candidates.sort(key=operator.itemgetter('snps'))
    custom_candidates.sort(
        key=operator.itemgetter('plasmid_length'), reverse=True)
    custom_candidates.sort(
        key=operator.itemgetter('percent_above_minimum_depth'), reverse=True)
    custom_best_candidate = next(iter(custom_candidates), None)

    fieldnames = [
        'sample_id',
        'accession',
        'circularity',
        'plasmid_length',
        'bases_above_minimum_depth',
        'percent_above_minimum_depth',
        'snps',
        'allele',
        'incompatibility_group',
    ]

    def write_candidates(output_path, rows):
        """Write a header plus one tab-separated line per candidate in ``rows``."""
        with open(output_path, 'w+') as f:
            writer = csv.DictWriter(
                f, fieldnames=fieldnames, delimiter='\t',
                extrasaction='ignore')
            writer.writeheader()
            for row in rows:
                # Round floats to 4 digits
                record = {
                    k: round(v, 4) if isinstance(v, float) else v
                    for k, v in row.items()
                }
                # The sample id goes through the writer so that each row
                # has exactly one cell per header column.
                record['sample_id'] = args.sample_id
                writer.writerow(record)

    write_candidates(
        plasmid_output_final,
        [custom_best_candidate] if custom_best_candidate else [])
    write_candidates(plasmid_output_summary, custom_candidates)