def samtools_filter_fixmate_sort_single_job(sample_id, candidates, paths):
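    """
    Build one samtools_filter_fixmate_sort job per candidate plasmid
    alignment and submit them together with run_jobs().
    Args:
        sample_id (str): sample identifier
        candidates (list of dict): candidate plasmids, each with an 'accession' key
        paths (dict): output paths, including 'plasmid_output' and 'logs'
    Returns:
        (void)
    """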
    samtools_filter_fixmate_sort_jobs = []
    for candidate in candidates:
        alignment = "/".join([
            paths['plasmid_output'],
            candidate['accession'] + ".sam",
        ])
        samtools_filter_fixmate_sort_job = {
            'job_name':
            "_".join([
                'samtools_filter_fixmate_sort', sample_id,
                candidate['accession']
            ]),
            'output_path':
            paths['logs'],
            'error_path':
            paths['logs'],
            'native_specification':
            '-pe smp 4',
            'remote_command':
            os.path.join(job_script_path, 'samtools_filter_fixmate_sort.sh'),
            'args': [
                "--input",
                alignment,
                "--flags",
                1540,
                "--output",
                re.sub(r'\.sam$', '.bam', alignment),
            ]
        }
        samtools_filter_fixmate_sort_jobs.append(
            samtools_filter_fixmate_sort_job)

    run_jobs(samtools_filter_fixmate_sort_jobs)
def main(args, logger=None):
    """
    main entrypoint
    Args:
        args(argparse.Namespace): Parsed command-line arguments.
    Returns:
        (void)
    """

    analysis_id = uuid.uuid4()

    curDir = os.getcwd()
    output_dir = args.outdir
    # metadata_file = args.metadata_file
    reference = os.path.abspath(args.reference)

    # sensitivePath = str(options.sensitivePath).lstrip().rstrip()
    # sensitiveCols = str(options.sensitiveCols).lstrip().rstrip()
    # outputFile = str(options.outputFile).lstrip().rstrip()
    # bcidCol = str( str(options.bcidCol).lstrip().rstrip() )
    # naValue = str( str(options.naValue).lstrip().rstrip() )

    # metadata = result_parsers.parse_workflow_results(metadata_file)
    # distance = read(distancePath)
    # treeFile = "".join(read(treePath))

    if not logger:
        logging.basicConfig(
            format="%(message)s",
            stream=sys.stdout,
            level=logging.DEBUG,
        )

        structlog.configure_once(
            processors=[
                structlog.stdlib.add_log_level,
                structlog.processors.JSONRenderer()
            ],
            logger_factory=structlog.stdlib.LoggerFactory(),
            wrapper_class=structlog.stdlib.BoundLogger,
            context_class=structlog.threadlocal.wrap_dict(dict),
        )
        logger = structlog.get_logger(
            analysis_id=str(analysis_id),
            pipeline_version=cpo_pipeline.__version__,
        )

    inputs = []
    with open(args.input_file) as input_file:
        fieldnames = [
            'sample_id',
            'reads1',
            'reads2',
        ]
        reader = csv.DictReader(
            (row for row in input_file if not row.startswith('#')),
            delimiter='\t',
            fieldnames=fieldnames)
        for row in reader:
            inputs.append(row)

    os.environ['QT_QPA_PLATFORM'] = 'offscreen'
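    # 'offscreen' lets Qt-based tools render without a display server (e.g. for tree images).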

    paths = {
        'logs': os.path.abspath(os.path.join(
            output_dir,
            'logs',
        )),
        'snippy_output': os.path.abspath(os.path.join(output_dir, "snippy")),
    }

    for output_subdir in paths.values():
        try:
            os.makedirs(output_subdir)
        except OSError as e:
            if e.errno != errno.EEXIST:
                raise

    job_script_path = resource_filename('data', 'job_scripts')

    contigs_paths = []
    for sample_id in [input["sample_id"] for input in inputs]:
        contigs = os.path.abspath(
            os.path.join(args.result_dir, sample_id, "assembly", "contigs.fa"))
        contigs_paths.append(contigs)

    snippy_dirs = [
        os.path.join(
            paths['snippy_output'],
            os.path.basename(os.path.dirname(os.path.dirname(contigs))))
        for contigs in contigs_paths
    ]

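    # Call variants for each sample against the reference genome with snippy.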
    snippy_jobs = [{
        'job_name':
        'snippy',
        'output_path':
        paths['logs'],
        'error_path':
        paths['logs'],
        'native_specification':
        '-pe smp 8 -shell y',
        'remote_command':
        os.path.join(job_script_path, 'snippy.sh'),
        'args': [
            "--ref",
            reference,
            "--R1",
            input['reads1'],
            "--R2",
            input['reads2'],
            "--outdir",
            os.path.join(
                paths['snippy_output'],
                input['sample_id'],
            ),
        ]
    } for input in inputs]

    run_jobs(snippy_jobs)

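    # Combine the per-sample snippy outputs into core-genome alignments (core.aln, core.full.aln) with snippy-core.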
    snippy_core_jobs = [{
        'job_name':
        'snippy-core',
        'output_path':
        paths['logs'],
        'error_path':
        paths['logs'],
        'native_specification':
        '-pe smp 8 -shell y',
        'remote_command':
        os.path.join(job_script_path, 'snippy-core.sh'),
        'args': [
            "--ref",
            reference,
            "--outdir",
            paths["snippy_output"],
        ] + snippy_dirs
    }]

    run_jobs(snippy_core_jobs)

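    # Compute a pairwise SNP distance matrix from the core alignment with snp-dists.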
    snp_dists_jobs = [{
        'job_name':
        'snp-dists',
        'output_path':
        paths['logs'],
        'error_path':
        paths['logs'],
        'native_specification':
        '-pe smp 8',
        'remote_command':
        os.path.join(job_script_path, 'snp-dists.sh'),
        'args': [
            "--alignment",
            os.path.join(paths["snippy_output"], "core.aln"),
            "--output_file",
            os.path.join(paths["snippy_output"], "core.aln.matrix.tsv"),
        ]
    }]

    run_jobs(snp_dists_jobs)

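    # Build a maximum-likelihood tree from the full core alignment with IQ-TREE (GTR+G4).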
    iqtree_jobs = [{
        'job_name':
        'iqtree',
        'output_path':
        paths['logs'],
        'error_path':
        paths['logs'],
        'native_specification':
        '-pe smp 8',
        'remote_command':
        os.path.join(job_script_path, 'iqtree.sh'),
        'args': [
            "--alignment",
            os.path.join(paths["snippy_output"], "core.full.aln"),
            "--model",
            "GTR+G4",
        ]
    }]

    run_jobs(iqtree_jobs)

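    # Detect recombination in the full core alignment with ClonalFrameML, guided by the IQ-TREE treefile.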
    clonalframeml_jobs = [{
        'job_name':
        'clonalframeml',
        'output_path':
        paths['logs'],
        'error_path':
        paths['logs'],
        'native_specification':
        '-pe smp 8',
        'remote_command':
        os.path.join(job_script_path, 'clonalframeml.sh'),
        'args': [
            "--alignment",
            os.path.join(paths["snippy_output"], "core.full.aln"),
            "--treefile",
            os.path.join(paths["snippy_output"], "core.full.aln.treefile"),
            "--output_file",
            os.path.join(paths["snippy_output"],
                         "core.full.aln.clonalframeml"),
        ]
    }]

    run_jobs(clonalframeml_jobs)

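    # Mask the recombinant regions identified by ClonalFrameML and render an SVG summary with maskrc-svg.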
    maskrc_svg_jobs = [{
        'job_name':
        'maskrc-svg',
        'output_path':
        paths['logs'],
        'error_path':
        paths['logs'],
        'native_specification':
        '-pe smp 8',
        'remote_command':
        os.path.join(job_script_path, 'maskrc-svg.sh'),
        'args': [
            "--alignment",
            os.path.join(paths["snippy_output"], "core.full.aln"),
            "--svg",
            os.path.join(paths["snippy_output"], "core.full.maskrc.svg"),
            "--clonalframeml",
            os.path.join(paths["snippy_output"],
                         "core.full.aln.clonalframeml"),
            "--output_file",
            os.path.join(paths["snippy_output"], "core.full.maskrc.aln"),
        ]
    }]

    run_jobs(maskrc_svg_jobs)

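    # Extract variant sites from the recombination-masked alignment with snp-sites.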
    snp_sites_jobs = [{
        'job_name':
        'snp-sites',
        'output_path':
        paths['logs'],
        'error_path':
        paths['logs'],
        'native_specification':
        '-pe smp 8',
        'remote_command':
        os.path.join(job_script_path, 'snp-sites.sh'),
        'args': [
            "--alignment",
            os.path.join(paths["snippy_output"], "core.full.maskrc.aln"),
            "--output_file",
            os.path.join(paths["snippy_output"], "core.full.maskrc.snp.aln"),
        ]
    }]

    run_jobs(snp_sites_jobs)

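    # Rebuild the tree from the recombination-masked alignment with IQ-TREE (GTR+G+ASC).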
    iqtree_jobs = [{
        'job_name':
        'iqtree',
        'output_path':
        paths['logs'],
        'error_path':
        paths['logs'],
        'native_specification':
        '-pe smp 8',
        'remote_command':
        os.path.join(job_script_path, 'iqtree.sh'),
        'args': [
            "--alignment",
            os.path.join(paths["snippy_output"], "core.full.maskrc.aln"),
            "--model",
            "GTR+G+ASC",
        ]
    }]

    run_jobs(iqtree_jobs)

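    # Compute SNP distance matrices for the unmasked core alignment and the masked SNP alignment.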
    snp_dists_jobs = [{
        'job_name':
        'snp-dists',
        'output_path':
        paths['logs'],
        'error_path':
        paths['logs'],
        'native_specification':
        '-pe smp 8',
        'remote_command':
        os.path.join(job_script_path, 'snp-dists.sh'),
        'args': [
            "--alignment",
            os.path.join(paths["snippy_output"], "core.aln"),
            "--output_file",
            os.path.join(paths["snippy_output"], "core.matrix.tab"),
        ]
    }, {
        'job_name':
        'snp-dists',
        'output_path':
        paths['logs'],
        'error_path':
        paths['logs'],
        'native_specification':
        '-pe smp 8',
        'remote_command':
        os.path.join(job_script_path, 'snp-dists.sh'),
        'args': [
            "--alignment",
            os.path.join(paths["snippy_output"], "core.full.maskrc.snp.aln"),
            "--output_file",
            os.path.join(paths["snippy_output"],
                         "core.full.maskrc.snp.matrix.tab"),
        ]
    }]

    run_jobs(snp_dists_jobs)

    sys.exit(0)
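    # Everything below this early exit is unreachable legacy tree-rendering code.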
    distanceDict = {}  #store the distance matrix as rowname:list<string>

    for i in range(len(distance)):
        temp = distance[i].split("\t")
        distanceDict[temp[0]] = temp[1:]

    #region create box tree
    #region step5: tree construction
    treeFile = "".join(read(treePath))
    t = e.Tree(treeFile)
    t.set_outgroup(t & "Reference")

    #set the tree style
    ts = e.TreeStyle()
    ts.show_leaf_name = True
    ts.show_branch_length = True
    ts.scale = 2000  #pixel per branch length unit
    ts.branch_vertical_margin = 15  #pixel between branches
    style2 = e.NodeStyle()
    style2["fgcolor"] = "#000000"
    style2["shape"] = "circle"
    style2["vt_line_color"] = "#0000aa"
    style2["hz_line_color"] = "#0000aa"
    style2["vt_line_width"] = 2
    style2["hz_line_width"] = 2
    style2["vt_line_type"] = 0  # 0 solid, 1 dashed, 2 dotted
    style2["hz_line_type"] = 0
    for n in t.traverse():
        n.set_style(style2)

    #find the plasmid origins
    plasmidIncs = {}
    for key in metadata:
        for plasmid in metadata[key]['plasmids']:
            for inc in plasmid['PlasmidRepType'].split(","):
                if (inc.lower().find("inc") > -1):
                    if not (inc in plasmidIncs):
                        plasmidIncs[inc] = [metadata[key]['ID']]
                    else:
                        if metadata[key]['ID'] not in plasmidIncs[inc]:
                            plasmidIncs[inc].append(metadata[key]['ID'])
    #plasmidIncs = sorted(plasmidIncs)
    for n in t.traverse():  #loop through the nodes of a tree
        if (n.is_leaf() and n.name == "Reference"):
            #if its the reference branch, populate the faces with column headers
            index = 0

            if len(sensitivePath) > 0:  #sensitive metadata @ chris
                for sensitive_data_column in sensitive_meta_data.get_columns():
                    (t & "Reference").add_face(addFace(sensitive_data_column),
                                               index, "aligned")
                    index = index + 1

            (t & "Reference").add_face(addFace("SampleID"), index, "aligned")
            index = index + 1
            (t & "Reference").add_face(addFace("New?"), index, "aligned")
            index = index + 1
            for i in range(
                    len(plasmidIncs)
            ):  #this loop adds the columns (aka the incs) to the reference node
                (t & "Reference").add_face(
                    addFace(list(plasmidIncs.keys())[i]), i + index, "aligned")
            index = index + len(plasmidIncs)
            (t & "Reference").add_face(addFace("MLSTScheme"), index, "aligned")
            index = index + 1
            (t & "Reference").add_face(addFace("Sequence Type"), index,
                                       "aligned")
            index = index + 1
            (t & "Reference").add_face(addFace("Carbapenamases"), index,
                                       "aligned")
            index = index + 1
            (t & "Reference").add_face(addFace("Plasmid Best Match"), index,
                                       "aligned")
            index = index + 1
            (t & "Reference").add_face(addFace("Best Match Identity"), index,
                                       "aligned")
            index = index + 1
            for i in range(len(
                    distanceDict[list(distanceDict.keys())
                                 [0]])):  #this loop adds the distance matrix
                (t & "Reference").add_face(
                    addFace(distanceDict[list(distanceDict.keys())[0]][i]),
                    index + i, "aligned")
            index = index + len(distanceDict[list(distanceDict.keys())[0]])
        elif (n.is_leaf() and not n.name == "Reference"):
            #not reference branches, populate with metadata
            index = 0

            if (n.name.replace(".fa", "") in metadata.keys()):
                mData = metadata[n.name.replace(".fa", "")]
            else:
                mData = metadata["na"]
            n.add_face(addFace(mData['ID']), index, "aligned")
            index = index + 1
            if (mData['new']):  #new column
                face = e.RectFace(
                    30, 30, "green",
                    "green")  # TextFace("Y",fsize=10,tight_text=True)
                face.border.margin = 5
                face.margin_right = 5
                face.margin_left = 5
                face.vt_align = 1
                face.ht_align = 1
                n.add_face(face, index, "aligned")
            index = index + 1
            for incs in plasmidIncs:  #this loop adds presence/absence to the sample nodes
                if (n.name.replace(".fa", "") in plasmidIncs[incs]):
                    face = e.RectFace(
                        30, 30, "black",
                        "black")  # TextFace("Y",fsize=10,tight_text=True)
                    face.border.margin = 5
                    face.margin_right = 5
                    face.margin_left = 5
                    face.vt_align = 1
                    face.ht_align = 1
                    n.add_face(face,
                               list(plasmidIncs.keys()).index(incs) + index,
                               "aligned")
            index = index + len(plasmidIncs)
            n.add_face(addFace(mData['MLSTSpecies']), index, "aligned")
            index = index + 1
            n.add_face(addFace(mData['SequenceType']), index, "aligned")
            index = index + 1
            n.add_face(addFace(mData['CarbapenemResistanceGenes']), index,
                       "aligned")
            index = index + 1
            n.add_face(addFace(mData['plasmidBestMatch']), index, "aligned")
            index = index + 1
            n.add_face(addFace(mData['plasmididentity']), index, "aligned")
            index = index + 1
            for i in range(len(
                    distanceDict[list(distanceDict.keys())
                                 [0]])):  #this loop adds distance matrix
                if (n.name in distanceDict
                    ):  #make sure the column is in the distance matrix
                    n.add_face(addFace(list(distanceDict[n.name])[i]),
                               index + i, "aligned")

    t.render(outputFile, w=5000, units="mm",
             tree_style=ts)  #save it as a png, pdf, svg or a phyloxml
def refseq_plasmids(sample_id, paths):
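    """
    Screen the sample's reads against the RefSeq plasmid mash database,
    record candidate hits in candidates.tsv, and download each candidate's
    FASTA from NCBI (retrying when the rate limit is hit).
    Args:
        sample_id (str): sample identifier
        paths (dict): input/output paths, including 'refseq_plasmid_output'
    Returns:
        list of dict: candidate plasmids with 'accession', 'fasta_path' and 'database' keys
    """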

    mash_jobs = [
        {
            'job_name': "_".join(['mash_screen_refseq_plasmid', sample_id]),
            'output_path': paths['logs'],
            'error_path': paths['logs'],
            'native_specification': '-pe smp 8',
            'remote_command': os.path.join(paths['job_scripts'], 'mash_screen.sh'),
            'args': [
                "--R1", paths['reads1_fastq'],
                "--R2", paths['reads2_fastq'],
                "--queries", paths['mash_refseq_plasmid_db'],
                "--min-identity", 0.975,
                "--output_file", os.path.join(
                    paths['refseq_plasmid_output'],
                    'mash_screen.tsv',
                ),
            ],
        },
    ]
    run_jobs(mash_jobs)

    mash_screen_result_path = os.path.join(
        paths['refseq_plasmid_output'],
        'mash_screen.tsv',
    )
    mash_screen_results = result_parsers.parse_mash_screen_result(
        mash_screen_result_path
    )
    logger.info(
        "parsed_result_file",
        timestamp=str(now()),
        filename=os.path.abspath(mash_screen_result_path)
    )
    
    for result in mash_screen_results:
        result['accession'] = re.search(r'ref\|(.*)\|', result['query_id']).group(1)
        
    candidates_keys = [
        'identity',
        'accession',
    ]
    
    with open(os.path.join(paths['refseq_plasmid_output'], 'candidates.tsv'), 'w+') as candidates_file:
        writer = csv.DictWriter(candidates_file, candidates_keys,
                                delimiter='\t', extrasaction='ignore')
        writer.writerows(mash_screen_results)

    candidates = []
    with open(os.path.join(paths['refseq_plasmid_output'], 'candidates.tsv'), 'r') as candidates_file:
        reader = csv.DictReader(candidates_file, fieldnames=candidates_keys, delimiter='\t')
        for row in reader:
            row['fasta_path'] = os.path.join(
                paths['refseq_plasmid_output'],
                'candidates',
                row['accession'] + '.fna',
            )
            candidates.append(row)

    for candidate in candidates:
        candidate['database'] = 'refseq'
    
    # NCBI Rate-limits downloads to 3 per second.
    for candidate in candidates:
        candidate_fasta = os.path.join(
            candidate['fasta_path']
        )
        url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?" + \
            "&".join([
                "db=nucleotide",
                "id=" + candidate['accession'],
                "rettype=fasta",
            ])
        def download_retry(url, candidate):
            """
              NCBI Rate-limits refseq downloads to 3 per second from each IP.
              When multiple files are being analyzed simultaneously this 
              limit may be exceeded.
              Retry
            """
            try:
                urllib.request.urlretrieve(url, candidate['fasta_path'])
                logger.info(
                    "file_downloaded",
                    timestamp=str(now()),
                    url=url,
                    accession=candidate['accession'],
                    sample_id=sample_id,
                )
            except HTTPError as e:
                if int(e.code) == 429:
                    time.sleep(5)
                    logger.info(
                        "retried_download",
                        timestamp=str(now()),
                        url=url,
                        accession=candidate['accession'],
                        sample_id=sample_id,
                    )
                    download_retry(url, candidate)
                else:
                    logger.error(
                        "download_failed",
                        timestamp=str(now()),
                        url=url,
                        sample_id=sample_id,
                    )
        download_retry(url, candidate)
        time.sleep(2)

    return candidates
def samtools_filter_fixmate_sort_discrete_jobs(sample_id, candidates, paths):
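    """
    Run the plasmid alignment cleanup as a chain of discrete jobs per
    candidate: samtools view (filter), name-sort, fixmate, coordinate-sort,
    and markdup. Each stage is submitted with run_jobs() before the next
    stage's jobs are built.
    Args:
        sample_id (str): sample identifier
        candidates (list of dict): candidate plasmids, each with an 'accession' key
        paths (dict): output paths, including 'plasmid_output' and 'logs'
    Returns:
        (void)
    """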
    samtools_view_jobs = []
    for candidate in candidates:
        alignment = os.path.join(
            paths['plasmid_output'],
            candidate['accession'] + ".sam",
        )
        samtools_view_job = {
            'job_name': "_".join(['samtools_view', sample_id, candidate['accession']]),
            'output_path': paths['logs'],
            'error_path': paths['logs'],
            'native_specification': '-pe smp 4',
            'remote_command': os.path.join(job_script_path, 'samtools_view.sh'),
            # '--flags 1540' excludes the following reads:
            # - read unmapped (0x4)
            # - read fails platform/vendor quality checks (0x200)
            # - read is PCR or optical duplicate (0x400)
            'args': [
                "--input", alignment,
                "--flags", 1540,
                "--output", re.sub("\.sam$", ".mapped.dedup.bam", alignment),
            ]
        }
        samtools_view_jobs.append(samtools_view_job)

    run_jobs(samtools_view_jobs)

    samtools_sort_jobs = []
    for candidate in candidates:
        alignment = "/".join([
            paths['plasmid_output'],
            candidate['accession'] + ".mapped.dedup.bam",
        ])
        samtools_sort_job = {
            'job_name': "_".join(['samtools_sort', sample_id, candidate['accession']]),
            'output_path': paths['logs'],
            'error_path': paths['logs'],
            'native_specification': '-pe smp 4',
            'remote_command': os.path.join(job_script_path, 'samtools_sort.sh'),
            'args': [
                "--input", alignment,
                "--name-order",
                "--output", re.sub("\.bam$", ".namesort.bam", alignment),
            ]
        }
        samtools_sort_jobs.append(samtools_sort_job)

    run_jobs(samtools_sort_jobs)

    samtools_fixmate_jobs = []
    for candidate in candidates:
        alignment = "/".join([
            paths['plasmid_output'],
            candidate['accession'] + ".mapped.dedup.namesort.bam",
        ])
        samtools_fixmate_job = {
            'job_name': "_".join(['samtools_fixmate', sample_id, candidate['accession']]),
            'output_path': paths['logs'],
            'error_path': paths['logs'],
            'native_specification': '-pe smp 4',
            'remote_command': os.path.join(job_script_path, 'samtools_fixmate.sh'),
            'args': [
                "--input", alignment,
                "--output", re.sub("\.bam$", ".fixmate.bam", alignment),
            ]
        }
        samtools_fixmate_jobs.append(samtools_fixmate_job)

    run_jobs(samtools_fixmate_jobs)

    samtools_sort_jobs = []
    for candidate in candidates:
        alignment = "/".join([
            paths['plasmid_output'],
            candidate['accession'] + ".mapped.dedup.namesort.fixmate.bam",
        ])
        samtools_sort_job = {
            'job_name': "_".join(['samtools_sort', sample_id, candidate['accession']]),
            'output_path': paths['logs'],
            'error_path': paths['logs'],
            'native_specification': '-pe smp 4',
            'remote_command': os.path.join(job_script_path, 'samtools_sort.sh'),
            'args': [
                "--input", alignment,
                "--output", re.sub("\.bam$", ".coordsort.bam", alignment),
            ]
        }
        samtools_sort_jobs.append(samtools_sort_job)

    run_jobs(samtools_sort_jobs)

    samtools_markdup_jobs = []
    for candidate in candidates:
        alignment = "/".join([
            paths['plasmid_output'],
            candidate['accession'] + ".mapped.dedup.namesort.fixmate.coordsort.bam",
        ])
        samtools_markdup_job = {
            'job_name': "_".join(['samtools_markdup', sample_id], candidate['accession']),
            'output_path': paths['logs'],
            'error_path': paths['logs'],
            'native_specification': '-pe smp 4',
            'remote_command': os.path.join(job_script_path, 'samtools_markdup.sh'),
            'args': [
                "--input", alignment,
                "--output", re.sub("\.bam$", ".markdup.bam", alignment),
            ]
        }
        samtools_markdup_jobs.append(samtools_markdup_job)

    run_jobs(samtools_markdup_jobs)
def custom_plasmids(sample_id, paths):
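    """
    Screen the sample's reads against the custom plasmid mash database,
    annotate hits with metadata from the database's .dat files, sort and
    write them to candidates.tsv, and copy each candidate's FASTA out of
    the database directory.
    Args:
        sample_id (str): sample identifier
        paths (dict): input/output paths, including 'custom_plasmid_output'
    Returns:
        list of dict: candidate plasmids with 'accession', 'fasta_path' and 'database' keys
    """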
    mash_jobs = [
        {
            'job_name': "_".join(['mash_screen_custom_plasmid', sample_id]),
            'output_path': paths['logs'],
            'error_path': paths['logs'],
            'native_specification': '-pe smp 8 -shell y',
            'remote_command': os.path.join(paths['job_scripts'], 'mash_screen_custom_db.sh'),
            'args': [
                "--R1", paths['reads1_fastq'],
                "--R2", paths['reads2_fastq'],
                "--min-identity", 0.996,
                "--plasmid-db-dir", os.path.join(
                    paths['mash_custom_plasmid_db'],
                    "mash",
                ),
                "--output_file", os.path.join(
                    paths['custom_plasmid_output'],
                    'mash_screen.tsv',
                )
            ],
        },
    ]
    
    run_jobs(mash_jobs)

    
    mash_screen_results = result_parsers.parse_mash_screen_result(
        os.path.join(
            paths['custom_plasmid_output'],
            'mash_screen.tsv',
        )
    )

    custom_plasmid_db_data = {}
    for dat_file in glob.glob(os.path.join(paths['mash_custom_plasmid_db'], "data", "*.dat")):
        [dat] = parsers.custom_plasmid_db_dat_parser(dat_file)
        custom_plasmid_db_data[dat['accession']] = dat

    for mash_screen_result in mash_screen_results:
        accession = re.sub(r'\.fna$', '', mash_screen_result['query_id'])
        mash_screen_result['accession'] = accession
        mash_screen_result['allele'] = custom_plasmid_db_data[accession]['allele']
        mash_screen_result['circularity'] = custom_plasmid_db_data[accession]['circularity']
        mash_screen_result['plasmid_length'] = custom_plasmid_db_data[accession]['plasmid_length']
        mash_screen_result['incompatibility_group'] = custom_plasmid_db_data[accession]['incompatibility_group']

    mash_screen_results.sort(key=operator.itemgetter('accession'))
    mash_screen_results.sort(key=operator.itemgetter('plasmid_length'), reverse=True)
    mash_screen_results.sort(key=operator.itemgetter('identity'), reverse=True)
    mash_screen_results.sort(key=operator.itemgetter('circularity'))
    mash_screen_results.sort(key=operator.itemgetter('incompatibility_group'))

    candidates_keys = [
        'identity',
        'accession',
        'circularity',
        'plasmid_length',
        'allele',
        'incompatibility_group',
    ]
    
    with open(os.path.join(paths['custom_plasmid_output'], 'candidates.tsv'), 'w+') as candidates_file:
        writer = csv.DictWriter(candidates_file, candidates_keys,
                                delimiter='\t', extrasaction='ignore')
        writer.writerows(mash_screen_results)

    candidates = []
    with open(os.path.join(paths['custom_plasmid_output'], 'candidates.tsv'), 'r') as candidates_file:
        reader = csv.DictReader(candidates_file, fieldnames=candidates_keys, delimiter='\t')
        for row in reader:
            row['fasta_path'] = os.path.join(
                paths['custom_plasmid_output'],
                'candidates',
                row['accession'] + '.fna',
            )
            candidates.append(row)

    for candidate in candidates:
        candidate['database'] = 'custom'

    for candidate in candidates:
        candidate_fasta_db_path = os.path.join(
            paths['mash_custom_plasmid_db'],
            candidate['accession'] + ".fna"
        )
        shutil.copyfile(candidate_fasta_db_path, candidate['fasta_path'])
        logger.info(
            "file_copied",
            timestamp=str(now()),
            accession=candidate['accession'],
            sample_id=sample_id
        )
    return candidates
def main(args):
    """
    main entrypoint
    Args:
        args(argparse.Namespace): Parsed command-line arguments.
    Returns:
        (void)
    """

    config = configparser.ConfigParser()
    config.read(args.config_file)

    try:
        mash_genome_db = args.mash_genome_db
    except AttributeError:
        try:
            mash_genome_db = config['databases']['mash_genome_db']
            if not os.path.exists(mash_genome_db):
                raise FileNotFoundError(errno.ENOENT,
                                        os.strerror(errno.ENOENT),
                                        mash_genome_db)
            logger.info(
                "configuration_loaded",
                timestamp=str(now()),
                configuration_attribute="databases/mash_genome_db",
                configuration_value=mash_genome_db,
            )
        except Exception as e:
            logger.error(
                "configuration_failed",
                timestamp=str(now()),
                configuration_attribute="databases/mash_genome_db",
                error_message=str(e),
            )

    sample_id = args.sample_id
    reads1_fastq = args.reads1_fastq
    reads2_fastq = args.reads2_fastq
    output_dir = args.outdir

    prepare_output_directories(output_dir, sample_id)

    #dictionary to store QC PASS/FAIL flags
    qc_verdicts = {
        "multiple_species_contamination": None,
        "fastq_contains_plasmids": None,
        "acceptable_coverage": None,
        "acceptable_fastqc_forward": None,
        "acceptable_fastqc_reverse": None,
        "acceptable_quast_assembly_metrics": None,
        "acceptable_busco_assembly_metrics": None
    }

    qc_thresholds = {
        # genome mash will include all hits with scores (top hit score - $thisvalue)
        "mash_hits_genome_score_cutoff": 300,
        # plasmid mash will include all hits with scores (top hit score - $thisvalue)
        "mash_hits_plasmid_score_cutoff": 100,
        # sequencing coverage greater than ($thisvalue) will pass the QC
        "coverage_cutoff": 30,
        # QUAST QC: assembly length within +-($thisvalue) percent
        # in reference to reference length will pass the QC
        "quast_assembly_length_cutoff": 0.10,
        # BUSCO QC: complete single genes greater than ($thisvalue) percent will pass the QC
        "busco_complete_single_cutoff": 0.90,
        # BUSCO QC: complete duplicate genes less than ($thisvalue) percent will pass the QC
        "busco_complete_duplicate_cutoff": 0.10
    }

    paths = {
        "output_dir":
        output_dir,
        'logs':
        os.path.join(
            output_dir,
            sample_id,
            'logs',
        ),
        "mash_genome_path":
        os.path.join(output_dir, sample_id, "pre-assembly_qc",
                     "mash_dist.genome.tsv"),
        "fastqc_output_path":
        os.path.join(output_dir, sample_id, "pre-assembly_qc", "fastqc"),
        "totalbp_path":
        os.path.join(output_dir, sample_id, "pre-assembly_qc", "totalbp"),
        "estimated_coverage_stats_path":
        os.path.join(output_dir, sample_id, "pre-assembly_qc",
                     "estimated_coverage_stats.tsv"),
        "reference_genome_path":
        os.path.join(output_dir, sample_id, "reference"),
        "assembly_output":
        os.path.join(output_dir, sample_id, "assembly"),
        "quast_path":
        os.path.join(output_dir, sample_id, "post-assembly_qc", "quast"),
    }

    job_script_path = resource_filename('data', 'job_scripts')
    estimated_genome_sizes_path = resource_filename(
        'data', 'estimated_genome_sizes.tsv')
    estimated_genome_sizes = input_parsers.parse_estimated_genome_sizes(
        estimated_genome_sizes_path)

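    # Pre-assembly QC: closest reference genome (mash dist), read quality (fastqc), and total bases (seqtk), run in parallel.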
    pre_assembly_qc_jobs = [{
        'job_name':
        "_".join(['mash_dist_sort_head', sample_id]),
        'output_path':
        paths['logs'],
        'error_path':
        paths['logs'],
        'native_specification':
        '-pe smp 8',
        'remote_command':
        os.path.join(job_script_path, 'mash_dist_sort_head.sh'),
        'args': [
            "--R1", reads1_fastq, "--R2", reads2_fastq, "--queries",
            mash_genome_db, "--output_file", paths['mash_genome_path']
        ],
    }, {
        'job_name':
        "_".join(['fastqc', sample_id]),
        'output_path':
        paths['logs'],
        'error_path':
        paths['logs'],
        'native_specification':
        '-pe smp 8',
        'remote_command':
        os.path.join(job_script_path, 'fastqc.sh'),
        'args': [
            "--R1", reads1_fastq, "--R2", reads2_fastq, "--output_dir",
            paths['fastqc_output_path']
        ],
    }, {
        'job_name':
        "_".join(['seqtk_totalbp', sample_id]),
        'output_path':
        paths['logs'],
        'error_path':
        paths['logs'],
        'native_specification':
        '-pe smp 8',
        'remote_command':
        os.path.join(job_script_path, 'seqtk_totalbp.sh'),
        'args': [
            "--R1", reads1_fastq, "--R2", reads2_fastq, "--output_file",
            paths['totalbp_path']
        ],
    }]

    run_jobs(pre_assembly_qc_jobs)

    #parse genome mash results
    mash_dist_results = []
    try:
        mash_dist_results = result_parsers.parse_mash_dist_result(
            paths["mash_genome_path"])
        logger.info(
            "parsed_result_file",
            timestamp=str(now()),
            filename=os.path.abspath(paths["mash_genome_path"]),
            closest_match_reference_id=mash_dist_results[0]['reference_id'],
        )
    except Exception as e:
        logger.info(
            "result_parsing_failed",
            timestamp=str(now()),
            filename=os.path.abspath(paths["mash_genome_path"]),
            error_message=str(e),
        )

    # parse fastqc
    fastqc_results = {}
    for read in ["R1", "R2"]:
        try:
            [fastqc_result_summary_path] = glob.glob(
                os.path.join(paths['fastqc_output_path'],
                             "*_" + read + "_*" + "fastqc", 'summary.txt'))
            fastqc_results[read] = result_parsers.parse_fastqc_result(
                fastqc_result_summary_path)
            logger.info(
                "parsed_result_file",
                timestamp=str(now()),
                filename=os.path.abspath(fastqc_result_summary_path),
                summary=fastqc_results[read],
            )
        except Exception as e:
            logger.error("result_parsing_failed",
                         timestamp=str(now()),
                         filename=fastqc_result_summary_path)
            fastqc_results["R1"] = {
                "basic_statistics": "FAILED_TO_PARSE",
                "per_base_sequence_quality": "FAILED_TO_PARSE",
                "per_tile_sequence_quality": "FAILED_TO_PARSE",
                "per_sequence_quality_scores": "FAILED_TO_PARSE",
                "per_base_sequence_content": "FAILED_TO_PARSE",
                "per_sequence_gc_content": "FAILED_TO_PARSE",
                "per_base_n_content": "FAILED_TO_PARSE",
                "sequence_length_distribution": "FAILED_TO_PARSE",
                "sequence_duplication_levels": "FAILED_TO_PARSE",
                "overrepresented_sequences": "FAILED_TO_PARSE",
                "adapter_content": "FAILED_TO_PARSE",
            }

            fastqc_results["R2"] = {
                "basic_statistics": "FAILED_TO_PARSE",
                "per_base_sequence_quality": "FAILED_TO_PARSE",
                "per_tile_sequence_quality": "FAILED_TO_PARSE",
                "per_sequence_quality_scores": "FAILED_TO_PARSE",
                "per_base_sequence_content": "FAILED_TO_PARSE",
                "per_sequence_gc_content": "FAILED_TO_PARSE",
                "per_base_n_content": "FAILED_TO_PARSE",
                "sequence_length_distribution": "FAILED_TO_PARSE",
                "sequence_duplication_levels": "FAILED_TO_PARSE",
                "overrepresented_sequences": "FAILED_TO_PARSE",
                "adapter_content": "FAILED_TO_PARSE",
            }

    #look at fastqc results
    qc_verdicts["acceptable_fastqc_forward"] = qc.fastqc_qc_check(
        fastqc_results["R1"])
    qc_verdicts["acceptable_fastqc_reverse"] = qc.fastqc_qc_check(
        fastqc_results["R2"])

    try:
        reference_genome = mash_dist_results[0]['reference_id']
    except Exception as e:
        logger.error(
            "failed_quality_control_check",
            timestamp=str(now()),
            qc_check_failed="determine_reference_sequence",
            error_message=str(e),
        )

    # build the save paths
    try:
        os.makedirs(paths['reference_genome_path'])
    except OSError as exc:
        if exc.errno != errno.EEXIST:
            raise

    download_refseq_reference(reference_genome, paths['reference_genome_path'])

    # If the user passes an expected organism NCBI taxonomy ID, then
    # use that to estimate the genome size. Otherwise, use the downloaded reference.
    estimated_genome_size = DEFAULT_ESTIMATED_GENOME_SIZE
    if args.expected_organism_ncbi_taxid:
        estimated_genome_size = get_estimated_genome_size(
            estimated_genome_sizes, args.expected_organism_ncbi_taxid)
    else:
        try:
            [reference_genome_assembly_stats_path
             ] = glob.glob(paths["reference_genome_path"] +
                           "/*_assembly_stats.txt")
        except ValueError:
            logger.error(
                "result_parsing_failed",
                timestamp=str(now()),
                filename=str(os.path.abspath(paths["reference_genome_path"])) +
                "/*_assembly_stats.txt",
            )

        try:
            reference_genome_assembly_stats = result_parsers.parse_reference_genome_assembly_stats(
                reference_genome_assembly_stats_path)
            logger.info(
                "parsed_result_file",
                timestamp=str(now()),
                filename=os.path.abspath(reference_genome_assembly_stats_path),
                total_length=reference_genome_assembly_stats['total_length'],
                contig_count=reference_genome_assembly_stats['contig_count'],
                contig_N50=reference_genome_assembly_stats['contig_N50'],
                organism_name=reference_genome_assembly_stats['organism_name'],
                infraspecific_name=reference_genome_assembly_stats[
                    'infraspecific_name'],
                ncbi_taxonomy_id=reference_genome_assembly_stats['taxid'],
                refseq_assembly_accession=reference_genome_assembly_stats[
                    'refseq_assembly_accession'],
            )
            estimated_genome_size = reference_genome_assembly_stats[
                'total_length']
        except Exception as e:
            logger.error(
                "result_parsing_failed",
                timestamp=str(now()),
                filename=os.path.abspath(reference_genome_assembly_stats_path),
                error_message=str(e),
            )

    total_bp = result_parsers.parse_total_bp(paths["totalbp_path"])
    logger.info(
        "parsed_result_file",
        timestamp=str(now()),
        filename=os.path.abspath(paths["totalbp_path"]),
        total_bp=total_bp,
    )

    estimated_depth_of_coverage = total_bp / estimated_genome_size

    if estimated_depth_of_coverage >= int(qc_thresholds["coverage_cutoff"]):
        qc_verdicts["acceptable_coverage"] = True

    estimated_coverage_stats_headers = [
        'sample_id',
        'total_bp',
        'estimated_genome_size',
        'estimated_depth_of_coverage',
    ]

    with open(paths['estimated_coverage_stats_path'], 'w+') as f:
        writer = csv.DictWriter(f,
                                fieldnames=estimated_coverage_stats_headers,
                                delimiter='\t')
        writer.writeheader()
        writer.writerow({
            'sample_id':
            sample_id,
            'total_bp':
            int(total_bp),
            'estimated_genome_size':
            int(estimated_genome_size),
            'estimated_depth_of_coverage':
            round(estimated_depth_of_coverage, 4),
        })

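    # Assemble the reads with shovill (--mincov 3, --minlen 500).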
    assembly_jobs = [{
        'job_name':
        "_".join(['shovill', sample_id]),
        'output_path':
        paths['logs'],
        'error_path':
        paths['logs'],
        'native_specification':
        '-pe smp 16 -l h_vmem=4G',
        'remote_command':
        os.path.join(job_script_path, 'shovill.sh'),
        'args': [
            "--R1", reads1_fastq, "--R2", reads2_fastq, "--mincov", "3",
            "--minlen", "500", "--output_dir", paths['assembly_output']
        ],
    }]

    run_jobs(assembly_jobs)

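    # Post-assembly QC with QUAST; BUSCO results are read from busco_stats/ under the QUAST output below.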
    post_assembly_qc_jobs = [
        {
            'job_name':
            "_".join(['quast', sample_id]),
            'output_path':
            paths['logs'],
            'error_path':
            paths['logs'],
            'native_specification':
            '-pe smp 8',
            'remote_command':
            os.path.join(job_script_path, 'quast.sh'),
            'args': [
                "--input",
                os.path.join(paths['assembly_output'], "contigs.fa"),
                "--outdir", paths['quast_path']
            ]
        },
    ]

    run_jobs(post_assembly_qc_jobs)

    busco_short_summary_contigs_path = os.path.abspath(
        paths["quast_path"] + "/busco_stats/short_summary_contigs.txt")
    busco_results = result_parsers.parse_busco_result(
        busco_short_summary_contigs_path)
    logger.info("parsed_result_file",
                timestamp=str(now()),
                filename=os.path.abspath(busco_short_summary_contigs_path),
                busco_results=busco_results)
    quast_report_path = os.path.abspath(paths["quast_path"] + "/report.txt")
    quast_results = result_parsers.parse_quast_result(quast_report_path)
    logger.info(
        "parsed_result_file",
        timestamp=str(now()),
        filename=os.path.abspath(quast_report_path),
        num_contigs=quast_results["num_contigs"],
        N50=quast_results["N50"],
    )

    qc_verdicts["acceptable_busco_assembly_metrics"] = qc.busco_qc_check(
        busco_results, qc_thresholds)
    qc_verdicts["acceptable_quast_assembly_metrics"] = qc.quast_qc_check(
        quast_results, estimated_genome_size)
def main(args):
    """
    main entrypoint
    Args:
        args(argparse.Namespace): Parsed command-line arguments.
    Returns:
        (void)
    """

    config = configparser.ConfigParser()
    config.read(args.config_file)

    sample_id = args.sample_id
    output_dir = args.outdir

    try:
        assembly = args.assembly
    except AttributeError:
        assembly = os.path.join(output_dir, sample_id, 'assembly',
                                'contigs.fa')

    try:
        mlst_scheme_map_file = args.mlst_scheme_map_file
    except AttributeError:
        mlst_scheme_map_file = resource_filename('data',
                                                 'scheme_species_map.tab')
    if not mlst_scheme_map_file:
        mlst_scheme_map_file = resource_filename('data',
                                                 'scheme_species_map.tab')

    paths = {
        "output_dir":
        output_dir,
        'logs':
        os.path.join(
            output_dir,
            sample_id,
            'logs',
        ),
        'mlst_path':
        os.path.join(output_dir, sample_id, 'typing', 'mlst', 'mlst.tsv'),
        'mob_recon_path':
        os.path.join(output_dir, sample_id, 'typing', 'mob_recon'),
        'abricate_plasmidfinder_path':
        os.path.join(output_dir, sample_id, 'typing', 'abricate',
                     'abricate_plasmidfinder.tsv'),
    }

    job_script_path = resource_filename('data', 'job_scripts')

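    # Typing: MLST, plasmid replicon detection (abricate plasmidfinder), and plasmid reconstruction (mob_recon).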
    typing_jobs = [{
        'job_name':
        "_".join(['mlst', sample_id]),
        'output_path':
        paths['logs'],
        'error_path':
        paths['logs'],
        'native_specification':
        '-pe smp 8',
        'remote_command':
        os.path.join(job_script_path, 'mlst.sh'),
        'args': [
            "--input", assembly, "--label", sample_id, "--output_file",
            paths['mlst_path']
        ]
    }, {
        'job_name':
        "_".join(['abricate', sample_id]),
        'output_path':
        paths['logs'],
        'error_path':
        paths['logs'],
        'native_specification':
        '-pe smp 8',
        'remote_command':
        os.path.join(job_script_path, 'abricate.sh'),
        'args': [
            "--input", assembly, "--database", "plasmidfinder",
            "--output_file", paths['abricate_plasmidfinder_path']
        ]
    }, {
        'job_name':
        "_".join(['mob_recon', sample_id]),
        'output_path':
        paths['logs'],
        'error_path':
        paths['logs'],
        'native_specification':
        '-pe smp 8',
        'remote_command':
        os.path.join(job_script_path, 'mob_recon.sh'),
        'args': ["--input", assembly, "--output_dir", paths['mob_recon_path']]
    }]

    run_jobs(typing_jobs)

    mlst_report = os.path.join(output_dir, sample_id, "typing", "mlst",
                               "mlst.tsv")
    mlst_hits = result_parsers.parse_mlst_result(mlst_report)
    # TODO: Check that there is only one MLST result in the report, and handle
    #       cases where the report is malformed.
    [mlst_hit] = mlst_hits
    logger.info(
        "parsed_result_file",
        timestamp=str(now()),
        filename=os.path.abspath(mlst_report),
        scheme_id=mlst_hit["scheme_id"],
        sequence_type=mlst_hit["sequence_type"],
    )
    mlst_scheme_map = input_parsers.parse_scheme_species_map(
        mlst_scheme_map_file)
    mlst_species = "Undefined"
    for scheme in mlst_scheme_map:
        if 'species' in scheme and scheme['scheme_id'] == mlst_hit['scheme_id']:
            mlst_species = scheme['species']

    mob_recon_contig_report_path = os.path.join(output_dir, sample_id,
                                                "typing", "mob_recon",
                                                "contig_report.txt")

    mob_recon_contig_report = result_parsers.parse_mob_recon_contig_report(
        mob_recon_contig_report_path)
    logger.info(
        "parsed_result_file",
        timestamp=str(now()),
        filename=os.path.abspath(mob_recon_contig_report_path),
        num_records=len(mob_recon_contig_report),
    )

    mob_recon_aggregate_report_path = os.path.join(
        output_dir, sample_id, "typing", "mob_recon",
        "mobtyper_aggregate_report.txt")

    mob_recon_aggregate_report = result_parsers.parse_mob_recon_mobtyper_aggregate_report(
        mob_recon_aggregate_report_path)
    logger.info(
        "parsed_result_file",
        timestamp=str(now()),
        filename=os.path.abspath(mob_recon_aggregate_report_path),
        num_records=len(mob_recon_aggregate_report),
    )

    def extract_contig_num(contig_id):
        """
        Given a contig_id from a mob_recon contig_report.txt file, return only the contig number.
        Args:
            contig_id (str): contig_id field from mob_recon contig_report.txt
            For example: "contigs.fa|contig00054_len=2672_cov=424.9_corr=0_origname=NODE_54_length_2672_cov_424.949312_pilon_sw=shovill-spades/1.0.1_date=20181024"
        Returns:
            str: contig number.
            For example: "00054"
        """
        prefix = '|contig'
        suffix = '_len='
        prefix_index = contig_id.find(prefix) + len(prefix)
        suffix_index = contig_id.find(suffix)
        contig_num = contig_id[prefix_index:suffix_index]
        return contig_num

    def get_plasmid_contigs(mob_recon_contig_report):
        """
        Given a list of dicts generated by parsing a mob_recon contig_report.txt file,
        return a list of plasmid contigs.
        Args:
            mob_recon_contig_report (list of dict):
        Returns:
            list: plasmid contigs
            For example: ['00021', '00022', '00032', ...]
        """
        plasmid_contigs = []
        for contig_report_record in mob_recon_contig_report:
            contig_num = extract_contig_num(contig_report_record['contig_id'])
            if contig_num not in plasmid_contigs and contig_report_record[
                    'rep_type']:
                plasmid_contigs.append(contig_num)
        return plasmid_contigs

    def get_likely_plasmid_contigs(mob_recon_contig_report):
        """
        Given a list of dicts generated by parsing a mob_recon contig_report.txt file,
        return a list of likely plasmid contigs.
        Args:
            mob_recon_contig_report (list of dict):
        Returns:
            list: likely plasmid contigs
            For example: ['00054', '00039', '00061', ...]
        """
        likely_plasmid_contigs = []
        for contig_report_record in mob_recon_contig_report:
            contig_num = extract_contig_num(contig_report_record['contig_id'])
            if contig_num not in likely_plasmid_contigs and not contig_report_record[
                    'rep_type']:
                likely_plasmid_contigs.append(contig_num)
        return likely_plasmid_contigs

    def get_plasmid_origins(mob_recon_contig_report):
        """
        Given a list of dicts generated by parsing a mob_recon contig_report.txt file,
        return a list of plasmid origins.
        Args:
            mob_recon_contig_report (list of dict):
        Returns:
            list: plasmid origins
            For example: ['rep_cluster_1254', 'IncL/M', 'IncN', ...]
        """
        origins = []
        for contig_report_record in mob_recon_contig_report:
            if contig_report_record['rep_type']:
                if contig_report_record['rep_type'] not in origins:
                    origins.append(contig_report_record['rep_type'])
        return origins

    plasmid_contigs = get_plasmid_contigs(mob_recon_contig_report)
    likely_plasmid_contigs = get_likely_plasmid_contigs(
        mob_recon_contig_report)
    origins = get_plasmid_origins(mob_recon_contig_report)
def main(args):
    """
    main entrypoint
    Args:
        args(argparse.Namespace): Parsed command-line arguments.
    Returns:
        (void)
    """

    config = configparser.ConfigParser()
    config.read(args.config_file)

    sample_id = args.sample_id
    output_dir = args.outdir

    try:
        assembly = args.assembly
    except AttributeError:
        assembly = os.path.join(output_dir, sample_id, 'assembly',
                                'contigs.fa')

    try:
        card_path = args.card_json
    except AttributeError:
        try:
            card_path = config['databases']['card_json']
            if not os.path.exists(card_path):
                raise FileNotFoundError(errno.ENOENT,
                                        os.strerror(errno.ENOENT), card_path)
            logger.info(
                "configuration_loaded",
                timestamp=str(now()),
                configuration_attribute="databases/card_json",
                configuration_value=card_path,
            )
        except Exception as e:
            logger.error(
                "configuration_failed",
                timestamp=str(now()),
                configuration_attribute="databases/card_json",
                error_message=str(e),
            )

    try:
        abricate_datadir = args.abricate_datadir
    except AttributeError:
        try:
            abricate_datadir = config['databases']['abricate_datadir']
            if not os.path.exists(abricate_datadir):
                raise FileNotFoundError(errno.ENOENT,
                                        os.strerror(errno.ENOENT),
                                        abricate_datadir)
            logger.info(
                "configuration_loaded",
                timestamp=str(now()),
                configuration_attribute="databases/abricate_datadir",
                configuration_value=abricate_datadir,
            )
        except Exception as e:
            logger.error(
                "configuration_failed",
                timestamp=str(now()),
                configuration_attribute="databases/abricate_datadir",
                error_message=str(e),
            )

    try:
        abricate_cpo_plasmid_db = args.abricate_cpo_plasmid_db
    except AttributeError:
        try:
            abricate_cpo_plasmid_db = config['databases'][
                'abricate_cpo_plasmid_db']
            if not os.path.exists(
                    os.path.join(abricate_datadir, abricate_cpo_plasmid_db)):
                raise FileNotFoundError(errno.ENOENT,
                                        os.strerror(errno.ENOENT),
                                        abricate_cpo_plasmid_db)
            logger.info(
                "configuration_loaded",
                timestamp=str(now()),
                configuration_attribute="databases/abricate_cpo_plasmid_db",
                configuration_value=abricate_cpo_plasmid_db,
            )
        except Exception as e:
            logger.error(
                "configuration_failed",
                timestamp=str(now()),
                configuration_attribute="databases/abricate_cpo_plasmid_db",
            )

    paths = {
        "output_dir":
        output_dir,
        'logs':
        os.path.join(
            output_dir,
            sample_id,
            'logs',
        ),
        'abricate_path':
        os.path.join(output_dir, sample_id, 'resistance', 'abricate',
                     'abricate.tsv'),
        'rgi_path':
        os.path.join(output_dir, sample_id, 'resistance', 'rgi'),
    }

    job_script_path = resource_filename('data', 'job_scripts')

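    # Resistance gene detection with abricate (CPO plasmid database) and RGI (CARD).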
    resistance_jobs = [{
        'job_name':
        "_".join(['abricate', sample_id]),
        'output_path':
        paths['logs'],
        'error_path':
        paths['logs'],
        'native_specification':
        '-pe smp 8',
        'remote_command':
        os.path.join(job_script_path, 'abricate.sh'),
        'args': [
            "--input", assembly, "--datadir", abricate_datadir, "--database",
            abricate_cpo_plasmid_db, "--output_file", paths['abricate_path']
        ]
    }, {
        'job_name':
        "_".join(['rgi', sample_id]),
        'output_path':
        paths['logs'],
        'error_path':
        paths['logs'],
        'native_specification':
        '-pe smp 8',
        'remote_command':
        os.path.join(job_script_path, 'rgi.sh'),
        'args': [
            "--input", assembly, "--card_json", card_path, "--output_dir",
            paths['rgi_path']
        ]
    }]

    run_jobs(resistance_jobs)

    abricate_report_path = os.path.join(output_dir, sample_id, "resistance",
                                        "abricate", "abricate.tsv")
    abricate_report = result_parsers.parse_abricate_result(
        abricate_report_path)
    logger.info("parsed_result_file",
                timestamp=str(datetime.datetime.utcnow().replace(
                    tzinfo=datetime.timezone.utc).isoformat()),
                filename=os.path.abspath(abricate_report_path),
                resistance_genes=[{
                    key: record[key]
                    for key in [
                        "gene",
                        "accession",
                        "database",
                        "percent_coverage",
                        "percent_identity",
                    ]
                } for record in abricate_report])

    rgi_report_path = os.path.join(output_dir, sample_id, "resistance", "rgi",
                                   "rgi.txt")
    rgi_report = result_parsers.parse_rgi_result_txt(rgi_report_path)
    logger.info("parsed_result_file",
                timestamp=str(datetime.datetime.utcnow().replace(
                    tzinfo=datetime.timezone.utc).isoformat()),
                filename=os.path.abspath(rgi_report_path),
                resistance_genes=[{
                    key: record[key]
                    for key in [
                        "best_hit_aro",
                        "aro",
                    ]
                } for record in rgi_report])

    def get_abricate_carbapenemases(abricate_report):
        """
        Given a list of dicts generated by parsing an abricate report file,
        return a list of carbapenemases.
        Args:
            abricate_report (list of dict):
        Returns:
            list: carbapenemase genes
            For example: ['NDM-1', ...]
        """
        abricate_carbapenemases = []
        for abricate_report_record in abricate_report:
            abricate_carbapenemases.append(abricate_report_record['gene'])
        return abricate_carbapenemases

    def get_rgi_carbapenemases(rgi_report):
        """
        Given a list of dicts generated by parsing an rgi report file,
        return a list of carbapenemases.
        Args:
            rgi_report (list of dict):
        Returns:
            list: carbapenemase gene names (best-hit ARO terms)
            For example: ['NDM-1', ...]
        """
        rgi_carbapenemases = []
        for rgi_report_record in rgi_report:
            if re.search("carbapenem", rgi_report_record['drug_class']):
                rgi_carbapenemases.append(rgi_report_record['best_hit_aro'])
        return rgi_carbapenemases


def main(args):
    """
    main entrypoint
    Args:
        args():
    Returns:
        (void)
    """

    config = configparser.ConfigParser()
    config.read(args.config_file)

    # `logger` is used below but is not defined in this function; a
    # module-level structlog logger is assumed here.
    logger = structlog.get_logger()

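    # Database locations may be supplied as command-line arguments; otherwise
    # fall back to the config file, and log an error if the configured path
    # does not exist.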
    try:
        mash_refseq_plasmid_db = args.mash_refseq_plasmid_db
    except AttributeError:
        try:
            mash_refseq_plasmid_db = config['databases'][
                'mash_refseq_plasmid_db']
            if not os.path.exists(mash_refseq_plasmid_db):
                raise FileNotFoundError(errno.ENOENT,
                                        os.strerror(errno.ENOENT),
                                        mash_refseq_plasmid_db)
            logger.info(
                "configuration_loaded",
                timestamp=str(now()),
                configuration_attribute="databases/mash_refseq_plasmid_db",
                configuration_value=mash_refseq_plasmid_db,
            )
        except Exception as e:
            logger.error(
                "configuration_failed",
                timestamp=str(now()),
                configuration_attribute="databases/mash_refseq_plasmid_db",
                error_message=str(e),
            )

    try:
        mash_custom_plasmid_db = args.mash_custom_plasmid_db
    except AttributeError:
        try:
            mash_custom_plasmid_db = config['databases'][
                'mash_custom_plasmid_db']
            if not os.path.exists(mash_custom_plasmid_db):
                raise FileNotFoundError(errno.ENOENT,
                                        os.strerror(errno.ENOENT),
                                        mash_custom_plasmid_db)
            logger.info(
                "configuration_loaded",
                timestamp=str(now()),
                configuration_attribute="databases/mash_custom_plasmid_db",
                configuration_value=mash_custom_plasmid_db,
            )
        except Exception as e:
            logger.error(
                "configuration_failed",
                timestamp=str(now()),
                configuration_attribute="databases/mash_custom_plasmid_db",
                error_message=str(e),
            )

    sample_id = args.sample_id
    output_dir = args.outdir

    paths = {
        'job_scripts':
        resource_filename('data', 'job_scripts'),
        'reads1_fastq':
        args.reads1_fastq,
        'reads2_fastq':
        args.reads2_fastq,
        'mash_custom_plasmid_db':
        mash_custom_plasmid_db,
        'mash_refseq_plasmid_db':
        mash_refseq_plasmid_db,
        'output_dir':
        output_dir,
        'logs':
        os.path.join(
            output_dir,
            sample_id,
            'logs',
        ),
        'plasmid_output':
        os.path.join(
            output_dir,
            sample_id,
            "plasmids",
        ),
        "refseq_plasmid_output":
        os.path.join(
            output_dir,
            sample_id,
            "plasmids",
            "refseq_plasmids",
        ),
        "custom_plasmid_output":
        os.path.join(
            output_dir,
            sample_id,
            "plasmids",
            "custom_plasmids",
        ),
    }

    os.makedirs(paths['logs'], exist_ok=True)

    os.makedirs(os.path.join(
        paths['custom_plasmid_output'],
        'candidates',
    ),
                exist_ok=True)

    os.makedirs(os.path.join(
        paths['refseq_plasmid_output'],
        'candidates',
    ),
                exist_ok=True)

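    # Identify candidate plasmid sequences using both the RefSeq plasmid
    # database and the custom plasmid database (mash-based screening is
    # assumed to happen inside these strategy functions).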
    refseq_candidates = strategies.refseq_plasmids(sample_id, paths)
    custom_candidates = strategies.custom_plasmids(sample_id, paths)

    candidates = refseq_candidates + custom_candidates

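    # Index every candidate reference so it can be used for alignment:
    # samtools faidx builds the FASTA index, bwa index builds the BWA index.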
    samtools_faidx_jobs = []
    bwa_index_jobs = []
    for candidate in candidates:
        samtools_faidx_job = {
            'job_name':
            "_".join(['samtools_faidx', sample_id, candidate['accession']]),
            'output_path':
            paths['logs'],
            'error_path':
            paths['logs'],
            'native_specification':
            '-pe smp 2',
            'remote_command':
            os.path.join(paths['job_scripts'], 'samtools_faidx.sh'),
            'args': [
                "--fasta",
                candidate['fasta_path'],
            ]
        }
        bwa_index_job = {
            'job_name':
            "_".join(['bwa_index', sample_id, candidate['accession']]),
            'output_path': paths['logs'],
            'error_path': paths['logs'],
            'native_specification': '-pe smp 2',
            'remote_command': os.path.join(paths['job_scripts'],
                                           'bwa_index.sh'),
            'args': [
                "--fasta",
                candidate['fasta_path'],
            ]
        }
        samtools_faidx_jobs.append(samtools_faidx_job)
        bwa_index_jobs.append(bwa_index_job)

    run_jobs(samtools_faidx_jobs + bwa_index_jobs)

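    # Align the sample's reads to each candidate plasmid with bwa mem,
    # writing one SAM file per candidate.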
    bwa_mem_jobs = []
    for candidate in candidates:
        bwa_mem_job = {
            'job_name':
            "_".join(['bwa_mem', sample_id, candidate['accession']]),
            'output_path':
            paths['logs'],
            'error_path':
            paths['logs'],
            'native_specification':
            '-pe smp 8 -shell y',
            'remote_command':
            os.path.join(paths['job_scripts'], 'bwa_mem.sh'),
            'args': [
                "--reference", candidate['fasta_path'], "--R1",
                paths['reads1_fastq'], "--R2", paths['reads2_fastq'],
                "--output",
                re.sub(r"\.fna$", ".sam", candidate['fasta_path'])
            ]
        }
        bwa_mem_jobs.append(bwa_mem_job)

    run_jobs(bwa_mem_jobs)

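    # Filter, fixmate and coordinate-sort each alignment. SAM flag 1540
    # excludes unmapped reads (0x4), reads failing QC (0x200) and
    # PCR/optical duplicates (0x400).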
    samtools_filter_fixmate_sort_jobs = []
    for candidate in candidates:
        alignment = re.sub(r"\.fna$", ".sam", candidate['fasta_path'])
        samtools_filter_fixmate_sort_job = {
            'job_name':
            "_".join([
                'samtools_filter_fixmate_sort', sample_id,
                candidate['accession']
            ]),
            'output_path':
            paths['logs'],
            'error_path':
            paths['logs'],
            'native_specification':
            '-pe smp 4',
            'remote_command':
            os.path.join(paths['job_scripts'],
                         'samtools_filter_fixmate_sort.sh'),
            'args': [
                "--input",
                alignment,
                "--flags",
                1540,
                "--output",
                re.sub(r'\.sam$', '.bam', alignment),
            ]
        }
        samtools_filter_fixmate_sort_jobs.append(
            samtools_filter_fixmate_sort_job)

    run_jobs(samtools_filter_fixmate_sort_jobs)

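    # The sorted BAMs have been written, so the intermediate SAM files can be
    # removed.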
    for candidate in candidates:
        sam_alignment = re.sub(r'\.fna$', '.sam', candidate['fasta_path'])
        os.remove(sam_alignment)

    samtools_index_jobs = []
    for candidate in candidates:
        alignment = re.sub(r'\.fna$', '.bam', candidate['fasta_path'])
        samtools_index_job = {
            'job_name':
            "_".join(['samtools_index', sample_id, candidate['accession']]),
            'output_path':
            paths['logs'],
            'error_path':
            paths['logs'],
            'native_specification':
            '-pe smp 4',
            'remote_command':
            os.path.join(paths['job_scripts'], 'samtools_index.sh'),
            'args': [
                "--input",
                alignment,
            ]
        }
        samtools_index_jobs.append(samtools_index_job)

    run_jobs(samtools_index_jobs)

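    # Compute per-position read depth for each candidate alignment with
    # samtools depth.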
    samtools_depth_jobs = []
    for candidate in candidates:
        alignment = re.sub(r'\.fna$', '.bam', candidate['fasta_path'])
        samtools_depth_job = {
            'job_name':
            "_".join(['samtools_depth', sample_id, candidate['accession']]),
            'output_path':
            paths['logs'],
            'error_path':
            paths['logs'],
            'native_specification':
            '-pe smp 1',
            'remote_command':
            os.path.join(paths['job_scripts'], 'samtools_depth.sh'),
            'args': [
                "--input",
                alignment,
                "--output",
                re.sub(r'\.bam$', '.depth', alignment),
            ]
        }
        samtools_depth_jobs.append(samtools_depth_job)

    run_jobs(samtools_depth_jobs)

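    # Summarise coverage per candidate: the fraction of reference positions
    # covered at or above MINIMUM_DEPTH.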
    MINIMUM_DEPTH = 10
    MINIMUM_COVERAGE_PERCENT = 95.0
    for candidate in candidates:
        depth_path = re.sub(r'\.fna$', '.depth', candidate['fasta_path'])
        positions_above_minimum_depth = 0
        total_length = 0
        with open(depth_path) as depth_file:
            for line in depth_file:
                [_, position, depth] = line.split()
                total_length += 1
                if int(depth) >= MINIMUM_DEPTH:
                    positions_above_minimum_depth += 1
        candidate['bases_above_minimum_depth'] = positions_above_minimum_depth
        try:
            candidate['percent_above_minimum_depth'] = (
                positions_above_minimum_depth / total_length)
        except ZeroDivisionError:
            candidate['percent_above_minimum_depth'] = 0.0

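    # Call variants against each candidate plasmid reference with freebayes.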
    freebayes_jobs = []
    for candidate in candidates:
        alignment = re.sub(r'\.fna$', '.bam', candidate['fasta_path'])
        reference = candidate['fasta_path']
        vcf = re.sub(r'\.fna$', '.vcf', candidate['fasta_path'])
        freebayes_job = {
            'job_name':
            "_".join(['freebayes', sample_id, candidate['accession']]),
            'output_path':
            paths['logs'],
            'error_path':
            paths['logs'],
            'native_specification':
            '-pe smp 8',
            'remote_command':
            os.path.join(paths['job_scripts'], 'freebayes.sh'),
            'args': [
                "--input",
                alignment,
                "--reference",
                reference,
                "--output",
                vcf,
            ]
        }
        freebayes_jobs.append(freebayes_job)

    run_jobs(freebayes_jobs)

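    # Filter each VCF down to SNP records (the bcftools_view.sh wrapper is
    # assumed to do the selection), writing a .snps.vcf per candidate.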
    bcftools_view_jobs = []
    for candidate in candidates:
        vcf = re.sub(r'\.fna$', '.vcf', candidate['fasta_path'])
        bcftools_view_job = {
            'job_name':
            "_".join(['bcftools_view', sample_id, candidate['accession']]),
            'output_path':
            paths['logs'],
            'error_path':
            paths['logs'],
            'native_specification':
            '-pe smp 2 -shell y',
            'remote_command':
            os.path.join(paths['job_scripts'], 'bcftools_view.sh'),
            'args': [
                "--input",
                vcf,
                "--output",
                re.sub(r'\.vcf$', '.snps.vcf', vcf),
            ]
        }
        bcftools_view_jobs.append(bcftools_view_job)

    run_jobs(bcftools_view_jobs)

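    # Count SNPs per candidate by counting non-header records in each
    # .snps.vcf file.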
    for candidate in candidates:
        snps_vcf = re.sub(r'\.fna$', '.snps.vcf', candidate['fasta_path'])
        snps = 0
        with open(snps_vcf, 'r') as f:
            for line in f:
                if not line.startswith('#'):
                    snps += 1
        candidate['snps'] = snps

    plasmid_output_summary = os.path.join(paths['plasmid_output'],
                                          'custom_plasmid.txt')

    plasmid_output_final = os.path.join(output_dir, sample_id,
                                        'final_plasmid.tsv')

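    # Rank custom-database candidates using successive stable sorts: the last
    # sort key takes precedence, so candidates are ordered primarily by
    # percent_above_minimum_depth (descending), then plasmid_length
    # (descending), then snps (ascending).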
    custom_candidates = [c for c in candidates if c['database'] == 'custom']
    custom_candidates.sort(key=operator.itemgetter('snps'))
    custom_candidates.sort(key=operator.itemgetter('plasmid_length'),
                           reverse=True)
    custom_candidates.sort(
        key=operator.itemgetter('percent_above_minimum_depth'), reverse=True)
    custom_best_candidate = next(iter(custom_candidates), None)

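    # Write only the best-ranked candidate to the final report, and all
    # custom-database candidates to the summary file.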
    with open(plasmid_output_final, 'w+') as f:
        fieldnames = [
            'sample_id', 'accession', 'circularity', 'plasmid_length',
            'bases_above_minimum_depth', 'percent_above_minimum_depth', 'snps',
            'allele', 'incompatibility_group'
        ]

        writer = csv.DictWriter(f,
                                fieldnames=fieldnames,
                                delimiter='\t',
                                extrasaction='ignore')
        writer.writeheader()
        if custom_best_candidate:
            # Truncate floats to 4 digits and include the sample ID in the row
            # rather than writing it separately, which would misalign columns.
            row = {
                k: round(v, 4) if isinstance(v, float) else v
                for k, v in custom_best_candidate.items()
            }
            row['sample_id'] = args.sample_id
            writer.writerow(row)

    with open(plasmid_output_summary, 'w+') as f:
        fieldnames = [
            'sample_id', 'accession', 'circularity', 'plasmid_length',
            'bases_above_minimum_depth', 'percent_above_minimum_depth', 'snps',
            'allele', 'incompatibility_group'
        ]
        writer = csv.DictWriter(f,
                                fieldnames=fieldnames,
                                delimiter='\t',
                                extrasaction='ignore')
        writer.writeheader()
        for candidate in custom_candidates:
            # Truncate floats to 4 digits and include the sample ID in the row
            # rather than writing it separately, which would misalign columns.
            row = {
                k: round(v, 4) if isinstance(v, float) else v
                for k, v in candidate.items()
            }
            row['sample_id'] = args.sample_id
            writer.writerow(row)