def download_retry(url, candidate):
    """
    NCBI rate-limits RefSeq downloads to 3 per second from each IP.
    When multiple files are being analyzed simultaneously this limit
    may be exceeded, so retry after a short delay when an HTTP 429
    response is received.
    (`sample_id` is expected to be defined in the enclosing scope.)
    """
    try:
        urllib.request.urlretrieve(url, candidate['fasta_path'])
        logger.info(
            "file_downloaded",
            timestamp=str(now()),
            url=url,
            accession=candidate['accession'],
            sample_id=sample_id,
        )
    except HTTPError as e:
        if e.code == 429:
            time.sleep(5)
            logger.info(
                "retried_download",
                timestamp=str(now()),
                url=url,
                accession=candidate['accession'],
                sample_id=sample_id,
            )
            download_retry(url, candidate)
        else:
            logger.error(
                "download_failed",
                timestamp=str(now()),
                url=url,
                sample_id=sample_id,
            )
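
# The recursive retry above has no upper bound: if NCBI keeps answering with
# HTTP 429 it will recurse indefinitely. A minimal iterative sketch with a
# capped number of attempts and exponential backoff (illustrative only;
# `max_attempts` and `base_delay` are assumptions, not part of the pipeline):
def download_with_backoff(url, destination, max_attempts=5, base_delay=5):
    for attempt in range(max_attempts):
        try:
            urllib.request.urlretrieve(url, destination)
            return True
        except HTTPError as e:
            if e.code == 429 and attempt < max_attempts - 1:
                # back off progressively: 5s, 10s, 20s, ...
                time.sleep(base_delay * 2 ** attempt)
            else:
                return False
    return False
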
def custom_plasmids(sample_id, paths):
    mash_jobs = [
        {
            'job_name': "_".join(['mash_screen_custom_plasmid', sample_id]),
            'output_path': paths['logs'],
            'error_path': paths['logs'],
            'native_specification': '-pe smp 8 -shell y',
            'remote_command': os.path.join(paths['job_scripts'], 'mash_screen_custom_db.sh'),
            'args': [
                "--R1", paths['reads1_fastq'],
                "--R2", paths['reads2_fastq'],
                "--min-identity", 0.996,
                "--plasmid-db-dir", os.path.join(
                    paths['mash_custom_plasmid_db'],
                    "mash",
                ),
                "--output_file", os.path.join(
                    paths['custom_plasmid_output'],
                    'mash_screen.tsv',
                )
            ],
        },
    ]
    
    run_jobs(mash_jobs)

    
    mash_screen_results = result_parsers.parse_mash_screen_result(
        os.path.join(
            paths['custom_plasmid_output'],
            'mash_screen.tsv',
        )
    )

    custom_plasmid_db_data = {}
    for dat_file in glob.glob(os.path.join(paths['mash_custom_plasmid_db'], "data", "*.dat")):
        [dat] = parsers.custom_plasmid_db_dat_parser(dat_file)
        custom_plasmid_db_data[dat['accession']] = dat

    for mash_screen_result in mash_screen_results:
        accession = re.sub(r'\.fna$', '', mash_screen_result['query_id'])
        mash_screen_result['accession'] = accession
        mash_screen_result['allele'] = custom_plasmid_db_data[accession]['allele']
        mash_screen_result['circularity'] = custom_plasmid_db_data[accession]['circularity']
        mash_screen_result['plasmid_length'] = custom_plasmid_db_data[accession]['plasmid_length']
        mash_screen_result['incompatibility_group'] = custom_plasmid_db_data[accession]['incompatibility_group']

    mash_screen_results.sort(key=operator.itemgetter('accession'))
    mash_screen_results.sort(key=operator.itemgetter('plasmid_length'), reverse=True)
    mash_screen_results.sort(key=operator.itemgetter('identity'), reverse=True)
    mash_screen_results.sort(key=operator.itemgetter('circularity'))
    mash_screen_results.sort(key=operator.itemgetter('incompatibility_group'))

    candidates_keys = [
        'identity',
        'accession',
        'circularity',
        'plasmid_length',
        'allele',
        'incompatibility_group',
    ]
    
    with open(os.path.join(paths['custom_plasmid_output'], 'candidates.tsv'), 'w+') as candidates_file:
        writer = csv.DictWriter(candidates_file, candidates_keys,
                                delimiter='\t', extrasaction='ignore')
        writer.writerows(mash_screen_results)

    candidates = []
    with open(os.path.join(paths['custom_plasmid_output'], 'candidates.tsv'), 'r') as candidates_file:
        reader = csv.DictReader(candidates_file, fieldnames=candidates_keys, delimiter='\t')
        for row in reader:
            row['fasta_path'] = os.path.join(
                paths['custom_plasmid_output'],
                'candidates',
                row['accession'] + '.fna',
            )
            candidates.append(row)

    for candidate in candidates:
        candidate['database'] = 'custom'

    for candidate in candidates:
        candidate_fasta_db_path = os.path.join(
            paths['mash_custom_plasmid_db'],
            candidate['accession'] + ".fna"
        )
        shutil.copyfile(candidate_fasta_db_path, candidate['fasta_path'])
        logger.info(
            "file_copied",
            timestamp=str(now()),
            accession=candidate['accession'],
            sample_id=sample_id
        )
    return candidates
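
# The cascade of list.sort() calls above relies on Python's stable sort:
# sorting by the least significant key first and the most significant key
# last yields a multi-key ordering (incompatibility_group, then circularity,
# then identity descending, then plasmid_length descending, then accession).
# A sketch of the equivalent single call, assuming identity and
# plasmid_length are numeric (hypothetical helper name):
def sort_candidates_by_composite_key(results):
    return sorted(
        results,
        key=lambda r: (
            r['incompatibility_group'],
            r['circularity'],
            -float(r['identity']),
            -int(r['plasmid_length']),
            r['accession'],
        ),
    )
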
def refseq_plasmids(sample_id, paths):

    mash_jobs = [
        {
            'job_name': "_".join(['mash_screen_refseq_plasmid', sample_id]),
            'output_path': paths['logs'],
            'error_path': paths['logs'],
            'native_specification': '-pe smp 8',
            'remote_command': os.path.join(paths['job_scripts'], 'mash_screen.sh'),
            'args': [
                "--R1", paths['reads1_fastq'],
                "--R2", paths['reads2_fastq'],
                "--queries", paths['mash_refseq_plasmid_db'],
                "--min-identity", 0.975,
                "--output_file", os.path.join(
                    paths['refseq_plasmid_output'],
                    'mash_screen.tsv',
                ),
            ],
        },
    ]
    run_jobs(mash_jobs)

    mash_screen_result_path = os.path.join(
        paths['refseq_plasmid_output'],
        'mash_screen.tsv',
    )
    mash_screen_results = result_parsers.parse_mash_screen_result(
        mash_screen_result_path
    )
    logger.info(
        "parsed_result_file",
        timestamp=str(now()),
        filename=os.path.abspath(mash_screen_result_path)
    )
    
    for result in mash_screen_results:
        result['accession'] = re.search(r'ref\|(.*)\|', result['query_id']).group(1)
        
    candidates_keys = [
        'identity',
        'accession',
    ]
    
    with open(os.path.join(paths['refseq_plasmid_output'], 'candidates.tsv'), 'w+') as candidates_file:
        writer = csv.DictWriter(candidates_file, candidates_keys,
                                delimiter='\t', extrasaction='ignore')
        writer.writerows(mash_screen_results)

    candidates = []
    with open(os.path.join(paths['refseq_plasmid_output'], 'candidates.tsv'), 'r') as candidates_file:
        reader = csv.DictReader(candidates_file, fieldnames=candidates_keys, delimiter='\t')
        for row in reader:
            row['fasta_path'] = os.path.join(
                paths['refseq_plasmid_output'],
                'candidates',
                row['accession'] + '.fna',
            )
            candidates.append(row)

    for candidate in candidates:
        candidate['database'] = 'refseq'
    
    # NCBI Rate-limits downloads to 3 per second.
    for candidate in candidates:
        candidate_fasta = candidate['fasta_path']
        url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?" + \
            "&".join([
                "db=nucleotide",
                "id=" + candidate['accession'],
                "rettype=fasta",
            ])
        def download_retry(url, candidate):
            """
            NCBI rate-limits RefSeq downloads to 3 per second from each IP.
            When multiple files are being analyzed simultaneously this limit
            may be exceeded, so retry after a short delay when an HTTP 429
            response is received.
            """
            try:
                urllib.request.urlretrieve(url, candidate['fasta_path'])
                logger.info(
                    "file_downloaded",
                    timestamp=str(now()),
                    url=url,
                    accession=candidate['accession'],
                    sample_id=sample_id,
                )
            except HTTPError as e:
                if int(e.code) == 429:
                    time.sleep(5)
                    logger.info(
                        "retried_download",
                        timestamp=str(now()),
                        url=url,
                        accession=candidate['accession'],
                        sample_id=sample_id,
                    )
                    download_retry(url, candidate)
                else:
                    logger.error(
                        "download_failed",
                        timestamp=str(now()),
                        url=url,
                        sample_id=sample_id,
                    )
        download_retry(url, candidate)
        time.sleep(2)

    return candidates
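
# The efetch URL above is assembled by joining raw strings. An equivalent
# construction with urllib.parse.urlencode is a little more robust against
# characters that need escaping (a sketch; behaviour is the same for plain
# accessions):
from urllib.parse import urlencode

def build_efetch_url(accession):
    params = {
        'db': 'nucleotide',
        'id': accession,
        'rettype': 'fasta',
    }
    return ("https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?"
            + urlencode(params))
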
def main(args):
    """
    main entrypoint
    Args:
        args(argparse.Namespace): Parsed command-line arguments.
    Returns:
        (void)
    """

    config = configparser.ConfigParser()
    config.read(args.config_file)

    analysis_id = uuid.uuid4()

    logger.new(
        analysis_id=str(analysis_id),
        sample_id=args.sample_id,
        pipeline_version=cpo_pipeline.__version__,
    )

    logger.info(
        "analysis_started",
        timestamp=str(now()),
    )

    cpo_pipeline.plasmids.pipeline.main(args)

    cpo_pipeline.assembly.pipeline.main(args)

    cpo_pipeline.typing.pipeline.main(args)

    cpo_pipeline.resistance.pipeline.main(args)

    final_outputs = collect_final_outputs(args.outdir, args.sample_id)
    logger.info(
        "collected_final_outputs",
        final_outputs=final_outputs,
    )

    final_output_path = os.path.join(
        args.outdir, args.sample_id, 'final_output.tsv')

    final_outputs_headers = [
        'sample_id',
        'bp',
        'est_genome_size',
        'Coverage',
        'MASH_BEST_HIT',
        'MLST_SCHEME',
        'MLST',
        'MLST_ALLELE_1',
        'MLST_ALLELE_2',
        'MLST_ALLELE_3',
        'MLST_ALLELE_4',
        'MLST_ALLELE_5',
        'MLST_ALLELE_6',
        'MLST_ALLELE_7',
    ]

    with open(final_output_path, 'w+') as f:
        writer = csv.DictWriter(f,
                                fieldnames=final_outputs_headers,
                                delimiter='\t')
        writer.writeheader()
        writer.writerow(final_outputs)

    logger.info(
        "analysis_completed",
        timestamp=str(now()),
    )
def collect_final_outputs(outdir, sample_id):
    final_outputs = {}
    final_outputs['sample_id'] = sample_id
    total_bp_path = os.path.join(outdir, sample_id, 'pre-assembly_qc',
                                 'totalbp')

    try:
        total_bp = cpo_pipeline.assembly.parsers.result_parsers.parse_total_bp(
            total_bp_path)
        logger.info(
            "parsed_result_file",
            timestamp=str(now()),
            filename=os.path.abspath(total_bp_path),
        )
    except FileNotFoundError:
        logger.error(
            "output_parsing_failed",
            timestamp=str(now()),
            filename=os.path.abspath(total_bp_path),
        )
        total_bp = None

    estimated_genome_coverage_stats_path = os.path.join(
        outdir, sample_id, 'pre-assembly_qc', 'estimated_coverage_stats.tsv')

    try:
        estimated_coverage_stats = cpo_pipeline.assembly.parsers.result_parsers.parse_estimated_coverage_stats(
            estimated_genome_coverage_stats_path)
        logger.info(
            "parsed_result_file",
            timestamp=str(now()),
            filename=os.path.abspath(estimated_genome_coverage_stats_path),
        )
    except FileNotFoundError:
        logger.error(
            "output_parsing_failed",
            timestamp=str(now()),
            filename=os.path.abspath(estimated_genome_coverage_stats_path),
        )
        estimated_coverage_stats = {
            'sample_id': sample_id,
            'total_bp': '-',
            'estimated_genome_size': '-',
            'estimated_depth_of_coverage': '-',
        }

    reference_genome_assembly_stats_glob = os.path.join(
        outdir, sample_id, 'reference', "*_assembly_stats.txt")
    try:
        [reference_genome_assembly_stats_path
         ] = glob.glob(reference_genome_assembly_stats_glob)
        reference_genome_assembly_stats = cpo_pipeline.assembly.parsers.result_parsers.parse_reference_genome_assembly_stats(
            reference_genome_assembly_stats_path)
    except (ValueError, FileNotFoundError):
        # ValueError: no single *_assembly_stats.txt file matched the glob.
        # FileNotFoundError: the matched file could not be read.
        logger.error(
            "output_parsing_failed",
            timestamp=str(now()),
            filename=str(reference_genome_assembly_stats_glob),
        )
        reference_genome_assembly_stats = {
            'organism_name': 'Unknown (parsing failed)',
            'infraspecific_name': 'Unknown (parsing failed)',
            'refseq_assembly_accession': 'Unknown (parsing failed)',
            'taxid': 'Unknown (parsing failed)',
            'total_length': 0,
            'contig_count': 0,
            'contig_N50': 0,
        }

    mlst_result_path = os.path.join(outdir, sample_id, 'typing', 'mlst',
                                    'mlst.tsv')
    try:
        [mlst_result
         ] = cpo_pipeline.typing.parsers.result_parsers.parse_mlst_result(
             mlst_result_path)
    except ValueError:
        logger.error(
            "output_parsing_failed",
            timestamp=str(now()),
            filename=os.path.abspath(mlst_result_path),
        )
        mlst_result = {
            'contig_file':
            os.path.join(outdir, sample_id, 'assembly', 'contigs.fa'),
            'scheme_id':
            '-',
            'sequence_type':
            '-',
            'multi_locus_alleles': {
                'adk': '-',
                'fumc': '-',
                'gyrB': '-',
                'icd': '-',
                'mdh': '-',
                'purA': '-',
                'recA': '-'
            }
        }

    final_outputs['bp'] = total_bp
    final_outputs['est_genome_size'] = estimated_coverage_stats[
        'estimated_genome_size']
    final_outputs['Coverage'] = round(
        estimated_coverage_stats['estimated_depth_of_coverage'], 2)
    final_outputs['MASH_BEST_HIT'] = " ".join([
        reference_genome_assembly_stats['organism_name'],
        reference_genome_assembly_stats['infraspecific_name'],
    ])
    final_outputs['MLST_SCHEME'] = mlst_result['scheme_id']
    final_outputs['MLST'] = mlst_result['sequence_type']
    allele_number = 1
    for key, value in mlst_result['multi_locus_alleles'].items():
        final_outputs['MLST_ALLELE_' +
                      str(allele_number)] = key + "(" + value + ")"
        allele_number += 1

    return final_outputs
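
# The allele-numbering loop at the end of collect_final_outputs() could also
# be written as a small helper; a sketch (hypothetical name), relying on
# dict insertion order being preserved (Python 3.7+):
def format_mlst_alleles(multi_locus_alleles):
    # e.g. {'adk': '2', 'fumc': '4', ...} -> {'MLST_ALLELE_1': 'adk(2)', 'MLST_ALLELE_2': 'fumc(4)', ...}
    return {
        'MLST_ALLELE_' + str(i): gene + "(" + allele + ")"
        for i, (gene, allele) in enumerate(multi_locus_alleles.items(), start=1)
    }
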
def main(args):
    """
    main entrypoint
    Args:
        args(argparse.Namespace): Parsed command-line arguments.
    Returns:
        (void)
    """

    config = configparser.ConfigParser()
    config.read(args.config_file)

    try:
        mash_genome_db = args.mash_genome_db
    except AttributeError:
        try:
            mash_genome_db = config['databases']['mash_genome_db']
            if not os.path.exists(mash_genome_db):
                raise FileNotFoundError(errno.ENOENT,
                                        os.strerror(errno.ENOENT),
                                        mash_genome_db)
            logger.info(
                "configuration_loaded",
                timestamp=str(now()),
                configuration_attribute="databases/mash_genome_db",
                configuration_value=mash_genome_db,
            )
        except Exception as e:
            logger.error(
                "configuration_failed",
                timestamp=str(now()),
                configuration_attribute="databases/mash_genome_db",
                error_message=str(e),
            )

    sample_id = args.sample_id
    reads1_fastq = args.reads1_fastq
    reads2_fastq = args.reads2_fastq
    output_dir = args.outdir

    prepare_output_directories(output_dir, sample_id)

    #dictionary to store QC PASS/FAIL flags
    qc_verdicts = {
        "multiple_species_contamination": None,
        "fastq_contains_plasmids": None,
        "acceptable_coverage": None,
        "acceptable_fastqc_forward": None,
        "acceptable_fastqc_reverse": None,
        "acceptable_quast_assembly_metrics": None,
        "acceptable_busco_assembly_metrics": None
    }

    qc_thresholds = {
        # genome mash will include all hits with scores (top hit score - $thisvalue)
        "mash_hits_genome_score_cutoff": 300,
        # plasmid mash will include all hits with scores (top hit score - $thisvalue)
        "mash_hits_plasmid_score_cutoff": 100,
        # sequencing coverage greater than ($thisvalue) will pass the QC
        "coverage_cutoff": 30,
        # QUAST QC: assembly length within +-($thisvalue) percent
        # in reference to reference length will pass the QC
        "quast_assembly_length_cutoff": 0.10,
        # BUSCO QC: complete single genes greater than ($thisvalue) percent will pass the QC
        "busco_complete_single_cutoff": 0.90,
        # BUSCO QC: complete duplicate genes less than ($thisvalue) percent will pass the QC
        "busco_complete_duplicate_cutoff": 0.10
    }

    paths = {
        "output_dir":
        output_dir,
        'logs':
        os.path.join(
            output_dir,
            sample_id,
            'logs',
        ),
        "mash_genome_path":
        os.path.join(output_dir, sample_id, "pre-assembly_qc",
                     "mash_dist.genome.tsv"),
        "fastqc_output_path":
        os.path.join(output_dir, sample_id, "pre-assembly_qc", "fastqc"),
        "totalbp_path":
        os.path.join(output_dir, sample_id, "pre-assembly_qc", "totalbp"),
        "estimated_coverage_stats_path":
        os.path.join(output_dir, sample_id, "pre-assembly_qc",
                     "estimated_coverage_stats.tsv"),
        "reference_genome_path":
        os.path.join(output_dir, sample_id, "reference"),
        "assembly_output":
        os.path.join(output_dir, sample_id, "assembly"),
        "quast_path":
        os.path.join(output_dir, sample_id, "post-assembly_qc", "quast"),
    }

    job_script_path = resource_filename('data', 'job_scripts')
    estimated_genome_sizes_path = resource_filename(
        'data', 'estimated_genome_sizes.tsv')
    estimated_genome_sizes = input_parsers.parse_estimated_genome_sizes(
        estimated_genome_sizes_path)

    pre_assembly_qc_jobs = [{
        'job_name':
        "_".join(['mash_dist_sort_head', sample_id]),
        'output_path':
        paths['logs'],
        'error_path':
        paths['logs'],
        'native_specification':
        '-pe smp 8',
        'remote_command':
        os.path.join(job_script_path, 'mash_dist_sort_head.sh'),
        'args': [
            "--R1", reads1_fastq, "--R2", reads2_fastq, "--queries",
            mash_genome_db, "--output_file", paths['mash_genome_path']
        ],
    }, {
        'job_name':
        "_".join(['fastqc', sample_id]),
        'output_path':
        paths['logs'],
        'error_path':
        paths['logs'],
        'native_specification':
        '-pe smp 8',
        'remote_command':
        os.path.join(job_script_path, 'fastqc.sh'),
        'args': [
            "--R1", reads1_fastq, "--R2", reads2_fastq, "--output_dir",
            paths['fastqc_output_path']
        ],
    }, {
        'job_name':
        "_".join(['seqtk_totalbp', sample_id]),
        'output_path':
        paths['logs'],
        'error_path':
        paths['logs'],
        'native_specification':
        '-pe smp 8',
        'remote_command':
        os.path.join(job_script_path, 'seqtk_totalbp.sh'),
        'args': [
            "--R1", reads1_fastq, "--R2", reads2_fastq, "--output_file",
            paths['totalbp_path']
        ],
    }]

    run_jobs(pre_assembly_qc_jobs)

    #parse genome mash results
    mash_dist_results = []
    try:
        mash_dist_results = result_parsers.parse_mash_dist_result(
            paths["mash_genome_path"])
        logger.info(
            "parsed_result_file",
            timestamp=str(now()),
            filename=os.path.abspath(paths["mash_genome_path"]),
            closest_match_reference_id=mash_dist_results[0]['reference_id'],
        )
    except Exception as e:
        logger.error(
            "result_parsing_failed",
            timestamp=str(now()),
            filename=os.path.abspath(paths["mash_genome_path"]),
            error_message=str(e),
        )

    # parse fastqc
    fastqc_results = {}
    for read in ["R1", "R2"]:
        try:
            [fastqc_result_summary_path] = glob.glob(
                os.path.join(paths['fastqc_output_path'],
                             "*_" + read + "_*" + "fastqc", 'summary.txt'))
            fastqc_results[read] = result_parsers.parse_fastqc_result(
                fastqc_result_summary_path)
            logger.info(
                "parsed_result_file",
                timestamp=str(now()),
                filename=os.path.abspath(fastqc_result_summary_path),
                summary=fastqc_results[read],
            )
        except Exception as e:
            logger.error(
                "result_parsing_failed",
                timestamp=str(now()),
                read=read,
                error_message=str(e),
            )
            # Fall back to placeholder values for the read that failed to
            # parse so the downstream QC checks can still run.
            fastqc_results[read] = {
                "basic_statistics": "FAILED_TO_PARSE",
                "per_base_sequence_quality": "FAILED_TO_PARSE",
                "per_tile_sequence_quality": "FAILED_TO_PARSE",
                "per_sequence_quality_scores": "FAILED_TO_PARSE",
                "per_base_sequence_content": "FAILED_TO_PARSE",
                "per_sequence_gc_content": "FAILED_TO_PARSE",
                "per_base_n_content": "FAILED_TO_PARSE",
                "sequence_length_distribution": "FAILED_TO_PARSE",
                "sequence_duplication_levels": "FAILED_TO_PARSE",
                "overrepresented_sequences": "FAILED_TO_PARSE",
                "adapter_content": "FAILED_TO_PARSE",
            }

    #look at fastqc results
    qc_verdicts["acceptable_fastqc_forward"] = qc.fastqc_qc_check(
        fastqc_results["R1"])
    qc_verdicts["acceptable_fastqc_reverse"] = qc.fastqc_qc_check(
        fastqc_results["R2"])

    try:
        reference_genome = mash_dist_results[0]['reference_id']
    except Exception as e:
        logger.error(
            "failed_quality_control_check",
            timestamp=str(now()),
            qc_check_failed="determine_reference_sequence",
            error_message=str(e),
        )

    # build the save paths
    try:
        os.makedirs(paths['reference_genome_path'])
    except OSError as exc:
        if exc.errno != errno.EEXIST:
            raise

    download_refseq_reference(reference_genome, paths['reference_genome_path'])

    # If the user passes an expected organism NCBI taxonomy ID, then
    # use that to estimate the genome size. Otherwise, use the downloaded reference.
    estimated_genome_size = DEFAULT_ESTIMATED_GENOME_SIZE
    if args.expected_organism_ncbi_taxid:
        estimated_genome_size = get_estimated_genome_size(
            estimated_genome_sizes, args.expected_organism_ncbi_taxid)
    else:
        try:
            [reference_genome_assembly_stats_path
             ] = glob.glob(paths["reference_genome_path"] +
                           "/*_assembly_stats.txt")
        except ValueError:
            logger.error(
                "result_parsing_failed",
                timestamp=str(now()),
                filename=str(os.path.abspath(paths["reference_genome_path"])) +
                "/*_assembly_stats.txt",
            )

        try:
            reference_genome_assembly_stats = result_parsers.parse_reference_genome_assembly_stats(
                reference_genome_assembly_stats_path)
            logger.info(
                "parsed_result_file",
                timestamp=str(now()),
                filename=os.path.abspath(reference_genome_assembly_stats_path),
                total_length=reference_genome_assembly_stats['total_length'],
                contig_count=reference_genome_assembly_stats['contig_count'],
                contig_N50=reference_genome_assembly_stats['contig_N50'],
                organism_name=reference_genome_assembly_stats['organism_name'],
                infraspecific_name=reference_genome_assembly_stats[
                    'infraspecific_name'],
                ncbi_taxonomy_id=reference_genome_assembly_stats['taxid'],
                refseq_assembly_accession=reference_genome_assembly_stats[
                    'refseq_assembly_accession'],
            )
            estimated_genome_size = reference_genome_assembly_stats[
                'total_length']
        except Exception as e:
            logger.error(
                "result_parsing_failed",
                timestamp=str(now()),
                filename=os.path.abspath(reference_genome_assembly_stats_path),
                error_message=str(e),
            )

    total_bp = result_parsers.parse_total_bp(paths["totalbp_path"])
    logger.info(
        "parsed_result_file",
        timestamp=str(now()),
        filename=os.path.abspath(paths["totalbp_path"]),
        total_bp=total_bp,
    )

    estimated_depth_of_coverage = total_bp / estimated_genome_size

    if estimated_depth_of_coverage >= int(qc_thresholds["coverage_cutoff"]):
        qc_verdicts["acceptable_coverage"] = True

    estimated_coverage_stats_headers = [
        'sample_id',
        'total_bp',
        'estimated_genome_size',
        'estimated_depth_of_coverage',
    ]

    with open(paths['estimated_coverage_stats_path'], 'w+') as f:
        writer = csv.DictWriter(f,
                                fieldnames=estimated_coverage_stats_headers,
                                delimiter='\t')
        writer.writeheader()
        writer.writerow({
            'sample_id':
            sample_id,
            'total_bp':
            int(total_bp),
            'estimated_genome_size':
            int(estimated_genome_size),
            'estimated_depth_of_coverage':
            round(estimated_depth_of_coverage, 4),
        })

    assembly_jobs = [{
        'job_name':
        "_".join(['shovill', sample_id]),
        'output_path':
        paths['logs'],
        'error_path':
        paths['logs'],
        'native_specification':
        '-pe smp 16 -l h_vmem=4G',
        'remote_command':
        os.path.join(job_script_path, 'shovill.sh'),
        'args': [
            "--R1", reads1_fastq, "--R2", reads2_fastq, "--mincov", "3",
            "--minlen", "500", "--output_dir", paths['assembly_output']
        ],
    }]

    run_jobs(assembly_jobs)

    post_assembly_qc_jobs = [
        {
            'job_name':
            "_".join(['quast', sample_id]),
            'output_path':
            paths['logs'],
            'error_path':
            paths['logs'],
            'native_specification':
            '-pe smp 8',
            'remote_command':
            os.path.join(job_script_path, 'quast.sh'),
            'args': [
                "--input",
                os.path.join(paths['assembly_output'], "contigs.fa"),
                "--outdir", paths['quast_path']
            ]
        },
    ]

    run_jobs(post_assembly_qc_jobs)

    busco_short_summary_contigs_path = os.path.abspath(
        paths["quast_path"] + "/busco_stats/short_summary_contigs.txt")
    busco_results = result_parsers.parse_busco_result(
        busco_short_summary_contigs_path)
    logger.info("parsed_result_file",
                timestamp=str(now()),
                filename=os.path.abspath(busco_short_summary_contigs_path),
                busco_results=busco_results)
    quast_report_path = os.path.abspath(paths["quast_path"] + "/report.txt")
    quast_results = result_parsers.parse_quast_result(quast_report_path)
    logger.info(
        "parsed_result_file",
        timestamp=str(now()),
        filename=os.path.abspath(quast_report_path),
        num_contigs=quast_results["num_contigs"],
        N50=quast_results["N50"],
    )

    qc_verdicts["acceptable_busco_assembly_metrics"] = qc.busco_qc_check(
        busco_results, qc_thresholds)
    qc_verdicts["acceptable_quast_assembly_metrics"] = qc.quast_qc_check(
        quast_results, estimated_genome_size)
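
# qc.busco_qc_check() and qc.quast_qc_check() are defined elsewhere in the
# pipeline. Hypothetical sketches of the kind of comparison they might make
# against the thresholds defined above; the result keys ('complete_single',
# 'complete_duplicate', 'total_length') are assumptions for illustration,
# not the real parser output:
def busco_qc_check_sketch(busco_results, thresholds):
    return (busco_results['complete_single'] >=
            thresholds['busco_complete_single_cutoff'] and
            busco_results['complete_duplicate'] <=
            thresholds['busco_complete_duplicate_cutoff'])

def quast_qc_check_sketch(quast_results, estimated_genome_size,
                          length_cutoff=0.10):
    # assembly length within +/- length_cutoff (10%) of the genome size estimate
    return (abs(quast_results['total_length'] - estimated_genome_size) <=
            length_cutoff * estimated_genome_size)
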
def download_refseq_reference(reference_id, download_path):
    """
    Given a mash reference ID, download the reference genome and its
    assembly stats from the NCBI FTP servers.
    Will fail if the download_path doesn't exist.
    Args:
        reference_id(str): Mash reference ID (column 1 of mash dist report)
        download_path(str):
    Returns:
        (void)
    """
    def mash_reference_id_to_ncbi_ftp_path(reference_id):
        """
        Args:
            query_id (str): Mash reference ID (column 1 of mash dist report)
        Returns:
            list: Directory names used to locate reference genome
                  on ftp://ftp.ncbi.nlm.nih.gov/genomes/all/
        For example:
            "GCF/001/022/155"
        """
        prefix = reference_id.split('_')[0]
        digits = reference_id.split('_')[1].split('.')[0]
        path_list = [prefix
                     ] + [digits[i:i + 3] for i in range(0, len(digits), 3)]

        return "/".join(path_list)

    ncbi_ftp_path = mash_reference_id_to_ncbi_ftp_path(reference_id)
    assembly = reference_id[:reference_id.find("_genomic.fna.gz")]

    ncbi_ftp_server_base = "ftp://ftp.ncbi.nlm.nih.gov"
    fasta_url = "/".join([
        ncbi_ftp_server_base, "genomes", "all", ncbi_ftp_path, assembly,
        reference_id
    ])
    assembly_stat_url = "/".join([
        ncbi_ftp_server_base, "genomes", "all", ncbi_ftp_path, assembly,
        assembly + "_assembly_stats.txt"
    ])

    #fetch the files
    try:
        urllib.request.urlretrieve(fasta_url,
                                   "/".join([download_path, reference_id]))
        logger.info(
            "file_downloaded",
            timestamp=str(now()),
            url=fasta_url,
        )
    except Exception as e:
        logger.error(
            "download_failed",
            timestamp=str(now()),
            url=fasta_url,
            error_message=str(e),
        )
    try:
        urllib.request.urlretrieve(
            assembly_stat_url,
            "/".join([download_path, assembly + "_assembly_stats.txt"]))
        logger.info(
            "file_downloaded",
            timestamp=str(now()),
            url=assembly_stat_url,
        )
    except Exception as e:
        logger.error(
            "download_failed",
            timestamp=str(now()),
            url=assembly_stat_url,
            error_message=str(e),
        )
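
# Worked example of the ID-to-path mapping performed by
# mash_reference_id_to_ncbi_ftp_path() (values for illustration):
#   reference_id = "GCF_001022155.1_ASM102215v1_genomic.fna.gz"
#   prefix  -> "GCF"
#   digits  -> "001022155"
#   result  -> "GCF/001/022/155"
# so both files are fetched from
# ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/001/022/155/GCF_001022155.1_ASM102215v1/
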
    args = parser.parse_args()

    logging.basicConfig(
        format="%(message)s",
        stream=sys.stdout,
        level=logging.DEBUG,
    )

    structlog.configure_once(
        processors=[
            structlog.stdlib.add_log_level,
            structlog.processors.JSONRenderer()
        ],
        logger_factory=structlog.stdlib.LoggerFactory(),
        wrapper_class=structlog.stdlib.BoundLogger,
        context_class=structlog.threadlocal.wrap_dict(dict),
    )

    logger = structlog.get_logger(
        analysis_id=str(uuid.uuid4()),
        sample_id=args.sample_id,
        pipeline_version=cpo_pipeline.__version__,
    )

    logger.info(
        "analysis_started",
        timestamp=str(now()),
    )

    main(args)
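
# With the processors configured above (add_log_level + JSONRenderer), each
# event is written to stdout as a single JSON object. An illustrative line
# (values are examples, not real run data):
#   {"analysis_id": "8c1f...", "sample_id": "SAMPLE-01",
#    "pipeline_version": "0.1.0", "timestamp": "2019-01-01T00:00:00+00:00",
#    "event": "analysis_started", "level": "info"}
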
def run_jobs(jobs):
    """
    Submit a list of job dicts via DRMAA, wait for all of them to finish,
    and log resource usage for each completed job.
    Args:
        jobs(list of dict): Job descriptions (job_name, remote_command, args, ...)
    Returns:
        (void)
    """
    with drmaa.Session() as session:
        running_jobs = []
        for job in jobs:
            prepared_job = prepare_job(job, session)
            job_id = session.runJob(prepared_job)
            job_name = prepared_job.jobName
            logger.info(
                "job_submitted",
                timestamp=str(now()),
                job_name=job_name,
                job_id=job_id,
            )
            running_jobs.append({"id": job_id, "name": job_name})
        session.synchronize([x['id'] for x in running_jobs],
                            drmaa.Session.TIMEOUT_WAIT_FOREVER, False)

        for job in running_jobs:
            job_info = session.wait(job["id"],
                                    drmaa.Session.TIMEOUT_WAIT_FOREVER)
            resource_usage = job_info.resourceUsage
            float_fields = [
                "io",
                "iow",
                "mem",
                "cpu",
                "vmem",
                "maxvmem",
                "priority",
                "ru_wallclock",
                "ru_utime",
                "ru_stime",
                "ru_maxrss",
                "ru_ixrss",
                "ru_ismrss",
                "ru_idrss",
                "ru_isrss",
                "ru_minflt",
                "ru_majflt",
                "ru_nswap",
                "ru_inblock",
                "ru_oublock",
                "ru_msgsnd",
                "ru_msgrcv",
                "ru_nsignals",
                "ru_nvcsw",
                "ru_nivcsw",
                "acct_cpu",
                "acct_mem",
                "acct_io",
                "acct_iow",
                "acct_maxvmem",
            ]
            for float_field in float_fields:
                resource_usage[float_field] = float(
                    resource_usage[float_field])
            int_fields = ["exit_status"]
            for int_field in int_fields:
                resource_usage[int_field] = int(
                    float(resource_usage[int_field]))
            # Convert unix epoch timestamps to ISO8601 (YYYY-MM-DDTHH:mm:ss+tz)
            time_fields = [
                "submission_time",
                "start_time",
                "end_time",
            ]
            for time_field in time_fields:
                unix_timestamp = resource_usage[time_field]
                iso8601_timestamp = str(
                    datetime.datetime.fromtimestamp(
                        int(float(unix_timestamp)),
                        datetime.timezone.utc).isoformat())
                resource_usage[time_field] = iso8601_timestamp
            logger.info(
                "job_completed",
                timestamp=str(now()),
                job_id=job["id"],
                job_name=job["name"],
                resource_usage=resource_usage,
                exit_status=job_info.exitStatus,
            )
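
# prepare_job() is not shown in this snippet. A sketch of how one of the job
# dicts used throughout the pipeline could be turned into a DRMAA job
# template (an assumption about its behaviour, not the actual
# implementation; attribute names follow the drmaa-python JobTemplate API):
def prepare_job_sketch(job, session):
    job_template = session.createJobTemplate()
    job_template.jobName = job['job_name']
    job_template.remoteCommand = job['remote_command']
    # DRMAA argument lists must be strings, so numeric args are converted.
    job_template.args = [str(arg) for arg in job['args']]
    job_template.nativeSpecification = job['native_specification']
    # Output/error paths take the "[hostname]:path" form; a leading ":" means any host.
    job_template.outputPath = ':' + job['output_path']
    job_template.errorPath = ':' + job['error_path']
    return job_template
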
def main(args):
    """
    main entrypoint
    Args:
        args(argparse.Namespace): Parsed command-line arguments.
    Returns:
        (void)
    """

    config = configparser.ConfigParser()
    config.read(args.config_file)

    sample_id = args.sample_id
    output_dir = args.outdir

    try:
        assembly = args.assembly
    except AttributeError:
        assembly = os.path.join(output_dir, sample_id, 'assembly',
                                'contigs.fa')

    try:
        mlst_scheme_map_file = args.mlst_scheme_map_file
    except AttributeError:
        mlst_scheme_map_file = resource_filename('data',
                                                 'scheme_species_map.tab')
    if not mlst_scheme_map_file:
        mlst_scheme_map_file = resource_filename('data',
                                                 'scheme_species_map.tab')

    paths = {
        "output_dir":
        output_dir,
        'logs':
        os.path.join(
            output_dir,
            sample_id,
            'logs',
        ),
        'mlst_path':
        os.path.join(output_dir, sample_id, 'typing', 'mlst', 'mlst.tsv'),
        'mob_recon_path':
        os.path.join(output_dir, sample_id, 'typing', 'mob_recon'),
        'abricate_plasmidfinder_path':
        os.path.join(output_dir, sample_id, 'typing', 'abricate',
                     'abricate_plasmidfinder.tsv'),
    }

    job_script_path = resource_filename('data', 'job_scripts')

    typing_jobs = [{
        'job_name':
        "_".join(['mlst', sample_id]),
        'output_path':
        paths['logs'],
        'error_path':
        paths['logs'],
        'native_specification':
        '-pe smp 8',
        'remote_command':
        os.path.join(job_script_path, 'mlst.sh'),
        'args': [
            "--input", assembly, "--label", sample_id, "--output_file",
            paths['mlst_path']
        ]
    }, {
        'job_name':
        "_".join(['abricate', sample_id]),
        'output_path':
        paths['logs'],
        'error_path':
        paths['logs'],
        'native_specification':
        '-pe smp 8',
        'remote_command':
        os.path.join(job_script_path, 'abricate.sh'),
        'args': [
            "--input", assembly, "--database", "plasmidfinder",
            "--output_file", paths['abricate_plasmidfinder_path']
        ]
    }, {
        'job_name':
        "_".join(['mob_recon', sample_id]),
        'output_path':
        paths['logs'],
        'error_path':
        paths['logs'],
        'native_specification':
        '-pe smp 8',
        'remote_command':
        os.path.join(job_script_path, 'mob_recon.sh'),
        'args': ["--input", assembly, "--output_dir", paths['mob_recon_path']]
    }]

    run_jobs(typing_jobs)

    mlst_report = os.path.join(output_dir, sample_id, "typing", "mlst",
                               "mlst.tsv")
    mlst_hits = result_parsers.parse_mlst_result(mlst_report)
    # TODO: Check that there is only one MLST result in the report, and handle
    #       cases where the report is malformed.
    [mlst_hit] = mlst_hits
    logger.info(
        "parsed_result_file",
        timestamp=str(now()),
        filename=os.path.abspath(mlst_report),
        scheme_id=mlst_hit["scheme_id"],
        sequence_type=mlst_hit["sequence_type"],
    )
    mlst_scheme_map = input_parsers.parse_scheme_species_map(
        mlst_scheme_map_file)
    mlst_species = "Undefined"
    for scheme in mlst_scheme_map:
        if 'species' in scheme and scheme['scheme_id'] == mlst_hit['scheme_id']:
            mlst_species = scheme['species']

    mob_recon_contig_report_path = os.path.join(output_dir, sample_id,
                                                "typing", "mob_recon",
                                                "contig_report.txt")

    mob_recon_contig_report = result_parsers.parse_mob_recon_contig_report(
        mob_recon_contig_report_path)
    logger.info(
        "parsed_result_file",
        timestamp=str(now()),
        filename=os.path.abspath(mob_recon_contig_report_path),
        num_records=len(mob_recon_contig_report),
    )

    mob_recon_aggregate_report_path = os.path.join(
        output_dir, sample_id, "typing", "mob_recon",
        "mobtyper_aggregate_report.txt")

    mob_recon_aggregate_report = result_parsers.parse_mob_recon_mobtyper_aggregate_report(
        mob_recon_aggregate_report_path)
    logger.info(
        "parsed_result_file",
        timestamp=str(now()),
        filename=os.path.abspath(mob_recon_aggregate_report_path),
        num_records=len(mob_recon_aggregate_report),
    )

    def extract_contig_num(contig_id):
        """
        Given a contig_id from a mob_recon contig_report.txt file, return only the contig number.
        Args:
            contig_id (str): contig_id field from mob_recon contig_report.txt
            For example: "contigs.fa|contig00054_len=2672_cov=424.9_corr=0_origname=NODE_54_length_2672_cov_424.949312_pilon_sw=shovill-spades/1.0.1_date=20181024"
        Returns:
            str: contig number.
            For example: "00054"
        """
        prefix = '|contig'
        suffix = '_len='
        prefix_index = contig_id.find(prefix) + len(prefix)
        suffix_index = contig_id.find(suffix)
        contig_num = contig_id[prefix_index:suffix_index]
        return contig_num

    def get_plasmid_contigs(mob_recon_contig_report):
        """
        Given a list of dicts generated by parsing a mob_recon contig_report.txt file,
        return a list of plasmid contigs.
        Args:
            mob_recon_contig_report (list of dict):
        Returns:
            list: plasmid contigs
            For example: ['00021', '00022', '00032', ...]
        """
        plasmid_contigs = []
        for contig_report_record in mob_recon_contig_report:
            contig_num = extract_contig_num(contig_report_record['contig_id'])
            if contig_num not in plasmid_contigs and contig_report_record[
                    'rep_type']:
                plasmid_contigs.append(contig_num)
        return plasmid_contigs

    def get_likely_plasmid_contigs(mob_recon_contig_report):
        """
        Given a list of dicts generated by parsing a mob_recon contig_report.txt file,
        return a list of likely plasmid contigs.
        Args:
            mob_recon_contig_report (list of dict):
        Returns:
            list: likely plasmid contigs
            For example: ['00054', '00039', '00061', ...]
        """
        likely_plasmid_contigs = []
        for contig_report_record in mob_recon_contig_report:
            contig_num = extract_contig_num(contig_report_record['contig_id'])
            if contig_num not in likely_plasmid_contigs and not contig_report_record[
                    'rep_type']:
                likely_plasmid_contigs.append(contig_num)
        return likely_plasmid_contigs

    def get_plasmid_origins(mob_recon_contig_report):
        """
        Given a list of dicts generated by parsing a mob_recon contig_report.txt file,
        return a list of plasmid origins.
        Args:
            mob_recon_contig_report (list of dict):
        Returns:
            list: plasmid origins
            For example: ['rep_cluster_1254', 'IncL/M', 'IncN', ...]
        """
        origins = []
        for contig_report_record in mob_recon_contig_report:
            if contig_report_record['rep_type']:
                if contig_report_record['rep_type'] not in origins:
                    origins.append(contig_report_record['rep_type'])
        return origins

    plasmid_contigs = get_plasmid_contigs(mob_recon_contig_report)
    likely_plasmid_contigs = get_likely_plasmid_contigs(
        mob_recon_contig_report)
    origins = get_plasmid_origins(mob_recon_contig_report)
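
# Worked example of extract_contig_num(), using the contig_id quoted in its
# docstring:
#   extract_contig_num(
#       "contigs.fa|contig00054_len=2672_cov=424.9_corr=0_origname=NODE_54_..."
#   )
#   -> "00054"
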
def main(args):
    """
    main entrypoint
    Args:
        args(argparse.Namespace): Parsed command-line arguments.
    Returns:
        (void)
    """

    config = configparser.ConfigParser()
    config.read(args.config_file)

    sample_id = args.sample_id
    output_dir = args.outdir

    try:
        assembly = args.assembly
    except AttributeError:
        assembly = os.path.join(output_dir, sample_id, 'assembly',
                                'contigs.fa')

    try:
        card_path = args.card_json
    except AttributeError:
        try:
            card_path = config['databases']['card_json']
            if not os.path.exists(card_path):
                raise FileNotFoundError(errno.ENOENT,
                                        os.strerror(errno.ENOENT), card_path)
            logger.info(
                "configuration_loaded",
                timestamp=str(now()),
                configuration_attribute="databases/card_json",
                configuration_value=card_path,
            )
        except Exception as e:
            logger.error(
                "configuration_failed",
                timestamp=str(now()),
                configuration_attribute="databases/card_json",
                error_message=str(e),
            )

    try:
        abricate_datadir = args.abricate_datadir
    except AttributeError:
        try:
            abricate_datadir = config['databases']['abricate_datadir']
            if not os.path.exists(abricate_datadir):
                raise FileNotFoundError(errno.ENOENT,
                                        os.strerror(errno.ENOENT),
                                        abricate_datadir)
            logger.info(
                "configuration_loaded",
                timestamp=str(now()),
                configuration_attribute="databases/abricate_datadir",
                configuration_value=abricate_datadir,
            )
        except Exception as e:
            logger.error(
                "configuration_failed",
                timestamp=str(now()),
                configuration_attribute="databases/abricate_datadir",
                error_message=str(e),
            )

    try:
        abricate_cpo_plasmid_db = args.abricate_cpo_plasmid_db
    except AttributeError:
        try:
            abricate_cpo_plasmid_db = config['databases'][
                'abricate_cpo_plasmid_db']
            if not os.path.exists(
                    os.path.join(abricate_datadir, abricate_cpo_plasmid_db)):
                raise FileNotFoundError(errno.ENOENT,
                                        os.strerror(errno.ENOENT),
                                        abricate_cpo_plasmid_db)
            logger.info(
                "configuration_loaded",
                timestamp=str(now()),
                configuration_attribute="databases/abricate_cpo_plasmid_db",
                configuration_value=abricate_cpo_plasmid_db,
            )
        except Exception as e:
            logger.error(
                "configuration_failed",
                timestamp=str(now()),
                configuration_attribute="databases/abricate_cpo_plasmid_db",
                error_message=str(e),
            )

    paths = {
        "output_dir":
        output_dir,
        'logs':
        os.path.join(
            output_dir,
            sample_id,
            'logs',
        ),
        'abricate_path':
        os.path.join(output_dir, sample_id, 'resistance', 'abricate',
                     'abricate.tsv'),
        'rgi_path':
        os.path.join(output_dir, sample_id, 'resistance', 'rgi'),
    }

    job_script_path = resource_filename('data', 'job_scripts')

    resistance_jobs = [{
        'job_name':
        "_".join(['abricate', sample_id]),
        'output_path':
        paths['logs'],
        'error_path':
        paths['logs'],
        'native_specification':
        '-pe smp 8',
        'remote_command':
        os.path.join(job_script_path, 'abricate.sh'),
        'args': [
            "--input", assembly, "--datadir", abricate_datadir, "--database",
            abricate_cpo_plasmid_db, "--output_file", paths['abricate_path']
        ]
    }, {
        'job_name':
        "_".join(['rgi', sample_id]),
        'output_path':
        paths['logs'],
        'error_path':
        paths['logs'],
        'native_specification':
        '-pe smp 8',
        'remote_command':
        os.path.join(job_script_path, 'rgi.sh'),
        'args': [
            "--input", assembly, "--card_json", card_path, "--output_dir",
            paths['rgi_path']
        ]
    }]

    run_jobs(resistance_jobs)

    abricate_report_path = os.path.join(output_dir, sample_id, "resistance",
                                        "abricate", "abricate.tsv")
    abricate_report = result_parsers.parse_abricate_result(
        abricate_report_path)
    logger.info("parsed_result_file",
                timestamp=str(now()),
                filename=os.path.abspath(abricate_report_path),
                resistance_genes=[{
                    key: record[key]
                    for key in [
                        "gene",
                        "accession",
                        "database",
                        "percent_coverage",
                        "percent_identity",
                    ]
                } for record in abricate_report])

    rgi_report_path = os.path.join(output_dir, sample_id, "resistance", "rgi",
                                   "rgi.txt")
    rgi_report = result_parsers.parse_rgi_result_txt(rgi_report_path)
    logger.info("parsed_result_file",
                timestamp=str(now()),
                filename=os.path.abspath(rgi_report_path),
                resistance_genes=[{
                    key: record[key]
                    for key in [
                        "best_hit_aro",
                        "aro",
                    ]
                } for record in rgi_report])

    def get_abricate_carbapenemases(abricate_report):
        """
        Given a list of dicts generated by parsing an abricate report file,
        return a list of carbapenemases.
        Args:
            abricate_report (list of dict):
        Returns:
            list: carbapenemase gene names
            For example: ['NDM-1', ...]
        """
        abricate_carbapenemases = []
        for abricate_report_record in abricate_report:
            abricate_carbapenemases.append(abricate_report_record['gene'])
        return abricate_carbapenemases

    def get_rgi_carbapenemases(rgi_report):
        """
        Given a list of dicts generated by parsing an rgi report file,
        return a list of carbapenemases.
        Args:
            rgi_report (list of dict):
        Returns:
            list: carbapenemase gene names (best hit ARO terms)
        """
        rgi_carbapenemases = []
        for rgi_report_record in rgi_report:
            if re.search("carbapenem", rgi_report_record['drug_class']):
                rgi_carbapenemases.append(rgi_report_record['best_hit_aro'])
        return rgi_carbapenemases
def main(args):
    """
    main entrypoint
    Args:
        args(argparse.Namespace): Parsed command-line arguments.
    Returns:
        (void)
    """

    config = configparser.ConfigParser()
    config.read(args.config_file)

    try:
        mash_refseq_plasmid_db = args.mash_refseq_plasmid_db
    except AttributeError:
        try:
            mash_refseq_plasmid_db = config['databases'][
                'mash_refseq_plasmid_db']
            if not os.path.exists(mash_refseq_plasmid_db):
                raise FileNotFoundError(errno.ENOENT,
                                        os.strerror(errno.ENOENT),
                                        mash_refseq_plasmid_db)
            logger.info(
                "configuration_loaded",
                timestamp=str(now()),
                configuration_attribute="databases/mash_refseq_plasmid_db",
                configuration_value=mash_refseq_plasmid_db,
            )
        except Exception as e:
            logger.error(
                "configuration_failed",
                timestamp=str(now()),
                configuration_attribute="databases/mash_refseq_plasmid_db",
                error_message=str(e),
            )

    try:
        mash_custom_plasmid_db = args.mash_custom_plasmid_db
    except AttributeError:
        try:
            mash_custom_plasmid_db = config['databases'][
                'mash_custom_plasmid_db']
            if not os.path.exists(mash_custom_plasmid_db):
                raise FileNotFoundError(errno.ENOENT,
                                        os.strerror(errno.ENOENT),
                                        mash_custom_plasmid_db)
            logger.info(
                "configuration_loaded",
                timestamp=str(now()),
                configuration_attribute="databases/mash_custom_plasmid_db",
                configuration_value=mash_custom_plasmid_db,
            )
        except Exception as e:
            logger.error(
                "configuration_failed",
                timestamp=str(now()),
                configuration_attribute="databases/mash_custom_plasmid_db",
                error_message=str(e),
            )

    sample_id = args.sample_id
    output_dir = args.outdir

    paths = {
        'job_scripts':
        resource_filename('data', 'job_scripts'),
        'reads1_fastq':
        args.reads1_fastq,
        'reads2_fastq':
        args.reads2_fastq,
        'mash_custom_plasmid_db':
        mash_custom_plasmid_db,
        'mash_refseq_plasmid_db':
        mash_refseq_plasmid_db,
        'output_dir':
        output_dir,
        'logs':
        os.path.join(
            output_dir,
            sample_id,
            'logs',
        ),
        'plasmid_output':
        os.path.join(
            output_dir,
            sample_id,
            "plasmids",
        ),
        "refseq_plasmid_output":
        os.path.join(
            output_dir,
            sample_id,
            "plasmids",
            "refseq_plasmids",
        ),
        "custom_plasmid_output":
        os.path.join(
            output_dir,
            sample_id,
            "plasmids",
            "custom_plasmids",
        ),
    }

    os.makedirs(paths['logs'], exist_ok=True)

    os.makedirs(os.path.join(
        paths['custom_plasmid_output'],
        'candidates',
    ),
                exist_ok=True)

    os.makedirs(os.path.join(
        paths['refseq_plasmid_output'],
        'candidates',
    ),
                exist_ok=True)

    refseq_candidates = strategies.refseq_plasmids(sample_id, paths)
    custom_candidates = strategies.custom_plasmids(sample_id, paths)

    candidates = refseq_candidates + custom_candidates

    samtools_faidx_jobs = []
    bwa_index_jobs = []
    for candidate in candidates:
        samtools_faidx_job = {
            'job_name':
            "_".join(['samtools_faidx', sample_id, candidate['accession']]),
            'output_path':
            paths['logs'],
            'error_path':
            paths['logs'],
            'native_specification':
            '-pe smp 2',
            'remote_command':
            os.path.join(paths['job_scripts'], 'samtools_faidx.sh'),
            'args': [
                "--fasta",
                candidate['fasta_path'],
            ]
        }
        bwa_index_job = {
            'job_name':
            "_".join(['bwa_index', sample_id, candidate['accession']]),
            'output_path': paths['logs'],
            'error_path': paths['logs'],
            'native_specification': '-pe smp 2',
            'remote_command': os.path.join(paths['job_scripts'],
                                           'bwa_index.sh'),
            'args': [
                "--fasta",
                candidate['fasta_path'],
            ]
        }
        samtools_faidx_jobs.append(samtools_faidx_job)
        bwa_index_jobs.append(bwa_index_job)

    run_jobs(samtools_faidx_jobs + bwa_index_jobs)

    bwa_mem_jobs = []
    for candidate in candidates:
        bwa_mem_job = {
            'job_name':
            "_".join(['bwa_mem', sample_id, candidate['accession']]),
            'output_path':
            paths['logs'],
            'error_path':
            paths['logs'],
            'native_specification':
            '-pe smp 8 -shell y',
            'remote_command':
            os.path.join(paths['job_scripts'], 'bwa_mem.sh'),
            'args': [
                "--reference", candidate['fasta_path'], "--R1",
                paths['reads1_fastq'], "--R2", paths['reads2_fastq'],
                "--output",
                re.sub(r"\.fna$", ".sam", candidate['fasta_path'])
            ]
        }
        bwa_mem_jobs.append(bwa_mem_job)

    run_jobs(bwa_mem_jobs)

    samtools_filter_fixmate_sort_jobs = []
    for candidate in candidates:
        alignment = re.sub(r"\.fna$", ".sam", candidate['fasta_path'])
        samtools_filter_fixmate_sort_job = {
            'job_name':
            "_".join([
                'samtools_filter_fixmate_sort', sample_id,
                candidate['accession']
            ]),
            'output_path':
            paths['logs'],
            'error_path':
            paths['logs'],
            'native_specification':
            '-pe smp 4',
            'remote_command':
            os.path.join(paths['job_scripts'],
                         'samtools_filter_fixmate_sort.sh'),
            'args': [
                "--input",
                alignment,
                "--flags",
                1540,
                "--output",
                re.sub(r'\.sam$', '.bam', alignment),
            ]
        }
        samtools_filter_fixmate_sort_jobs.append(
            samtools_filter_fixmate_sort_job)

    run_jobs(samtools_filter_fixmate_sort_jobs)

    for candidate in candidates:
        sam_alignment = re.sub(r'\.fna$', '.sam', candidate['fasta_path'])
        os.remove(sam_alignment)

    samtools_index_jobs = []
    for candidate in candidates:
        alignment = re.sub(r'\.fna$', '.bam', candidate['fasta_path'])
        samtools_index_job = {
            'job_name':
            "_".join(['samtools_index', sample_id, candidate['accession']]),
            'output_path':
            paths['logs'],
            'error_path':
            paths['logs'],
            'native_specification':
            '-pe smp 4',
            'remote_command':
            os.path.join(paths['job_scripts'], 'samtools_index.sh'),
            'args': [
                "--input",
                alignment,
            ]
        }
        samtools_index_jobs.append(samtools_index_job)

    run_jobs(samtools_index_jobs)

    samtools_depth_jobs = []
    for candidate in candidates:
        alignment = re.sub(r'\.fna$', '.bam', candidate['fasta_path'])
        samtools_depth_job = {
            'job_name':
            "_".join(['samtools_depth', sample_id, candidate['accession']]),
            'output_path':
            paths['logs'],
            'error_path':
            paths['logs'],
            'native_specification':
            '-pe smp 1',
            'remote_command':
            os.path.join(paths['job_scripts'], 'samtools_depth.sh'),
            'args': [
                "--input",
                alignment,
                "--output",
                re.sub(r'\.bam$', '.depth', alignment),
            ]
        }
        samtools_depth_jobs.append(samtools_depth_job)

    run_jobs(samtools_depth_jobs)

    for candidate in candidates:
        depth = re.sub(r'\.fna$', '.depth', candidate['fasta_path'])
        MINIMUM_DEPTH = 10
        MINIMUM_COVERAGE_PERCENT = 95.0
        positions_above_minimum_depth = 0
        total_length = 0
        with open(depth) as depth_file:
            for line in depth_file:
                # samtools depth output columns: reference, position, depth
                [_, _, depth_at_position] = line.split()
                total_length += 1
                if int(depth_at_position) >= MINIMUM_DEPTH:
                    positions_above_minimum_depth += 1
        candidate['bases_above_minimum_depth'] = positions_above_minimum_depth
        try:
            candidate[
                'percent_above_minimum_depth'] = positions_above_minimum_depth / total_length
        except ZeroDivisionError:
            candidate['percent_above_minimum_depth'] = 0.0

    freebayes_jobs = []
    for candidate in candidates:
        alignment = re.sub(r'\.fna$', '.bam', candidate['fasta_path'])
        reference = candidate['fasta_path']
        vcf = re.sub(r'\.fna$', '.vcf', candidate['fasta_path'])
        freebayes_job = {
            'job_name':
            "_".join(['freebayes', sample_id, candidate['accession']]),
            'output_path':
            paths['logs'],
            'error_path':
            paths['logs'],
            'native_specification':
            '-pe smp 8',
            'remote_command':
            os.path.join(paths['job_scripts'], 'freebayes.sh'),
            'args': [
                "--input",
                alignment,
                "--reference",
                reference,
                "--output",
                vcf,
            ]
        }
        freebayes_jobs.append(freebayes_job)

    run_jobs(freebayes_jobs)

    bcftools_view_jobs = []
    for candidate in candidates:
        vcf = re.sub(r'\.fna$', '.vcf', candidate['fasta_path'])
        bcftools_view_job = {
            'job_name':
            "_".join(['bcftools_view', sample_id, candidate['accession']]),
            'output_path':
            paths['logs'],
            'error_path':
            paths['logs'],
            'native_specification':
            '-pe smp 2 -shell y',
            'remote_command':
            os.path.join(paths['job_scripts'], 'bcftools_view.sh'),
            'args': [
                "--input",
                vcf,
                "--output",
                re.sub(r'\.vcf$', '.snps.vcf', vcf),
            ]
        }
        bcftools_view_jobs.append(bcftools_view_job)

    run_jobs(bcftools_view_jobs)

    for candidate in candidates:
        snps_vcf = re.sub(r'\.fna$', '.snps.vcf', candidate['fasta_path'])
        snps = 0
        with open(snps_vcf, 'r') as f:
            for line in f:
                if not line.startswith('#'):
                    snps += 1
        candidate['snps'] = snps

    plasmid_output_summary = os.path.join(paths['plasmid_output'],
                                          'custom_plasmid.txt')

    plasmid_output_final = os.path.join(output_dir, sample_id,
                                        'final_plasmid.tsv')

    custom_candidates = [c for c in candidates if c['database'] == 'custom']
    custom_candidates.sort(key=operator.itemgetter('snps'))
    custom_candidates.sort(key=operator.itemgetter('plasmid_length'),
                           reverse=True)
    custom_candidates.sort(
        key=operator.itemgetter('percent_above_minimum_depth'), reverse=True)
    custom_best_candidate = next(iter(custom_candidates), None)

    with open(plasmid_output_final, 'w+') as f:
        fieldnames = [
            'sample_id', 'accession', 'circularity', 'plasmid_length',
            'bases_above_minimum_depth', 'percent_above_minimum_depth', 'snps',
            'allele', 'incompatibility_group'
        ]

        writer = csv.DictWriter(f,
                                fieldnames=fieldnames,
                                delimiter='\t',
                                extrasaction='ignore')
        writer.writeheader()
        if custom_best_candidate:
            # Truncate floats to 4 digits and add the sample_id column
            row = {
                k: round(v, 4) if isinstance(v, float) else v
                for k, v in custom_best_candidate.items()
            }
            row['sample_id'] = args.sample_id
            writer.writerow(row)

    with open(plasmid_output_summary, 'w+') as f:
        fieldnames = [
            'sample_id', 'accession', 'circularity', 'plasmid_length',
            'bases_above_minimum_depth', 'percent_above_minimum_depth', 'snps',
            'allele', 'incompatibility_group'
        ]
        writer = csv.DictWriter(f,
                                fieldnames=fieldnames,
                                delimiter='\t',
                                extrasaction='ignore')
        writer.writeheader()
        for candidate in custom_candidates:
            # Truncate floats to 4 digits and add the sample_id column
            row = {
                k: round(v, 4) if isinstance(v, float) else v
                for k, v in candidate.items()
            }
            row['sample_id'] = args.sample_id
            writer.writerow(row)