Esempio n. 1
0
def _typer_validate_bounds(params, lower, upper):
    """Abort the run if any value in ``params`` lies outside ``[lower, upper]``.

    Args:
        params: mapping of command-line option name to its numeric value.
        lower: smallest accepted value (inclusive).
        upper: largest accepted value (inclusive).

    Logs an error naming the offending option and exits with status -1.
    """
    for name, value in params.items():
        if value < lower:
            logging.error(
                "Error: {} is too low, please specify an integer between {} - {}".format(name, lower, upper))
            sys.exit(-1)
        if value > upper:
            logging.error(
                "Error: {} is too high, please specify an integer between {} - {}".format(name, lower, upper))
            sys.exit(-1)


def _typer_collect_hits(contig_hits):
    """Flatten per-contig hit lists into a single ``accession -> type`` dict.

    Every hit id must split on ``|`` into exactly two fields (accession and
    type); anything else raises ``ValueError`` from the unpacking.
    """
    found = dict()
    for contig_id in contig_hits:
        for hit in contig_hits[contig_id]:
            accession, hit_type = hit.split('|')
            found[accession] = hit_type
    return found


def _typer_join_or_dash(values):
    """Comma-join ``values``, or return ``"-"`` when there are none."""
    values = list(values)
    return ",".join(values) if values else "-"


def main():
    """Command-line entry point for MOB-typer.

    Validates the parsed arguments, runs the replicon, relaxase, mpf and orit
    blasts plus a mash search against the input fasta, writes a one-row
    tab-separated report to ``<outdir>/mobtyper_<input name>_report.txt`` and
    prints the same row to stdout.

    Exits with status -1 on any invalid argument.
    """
    default_database_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'databases')
    args = parse_args()
    if args.debug:
        init_console_logger(3)
    logging.info('Running Mob-typer v. {}'.format(__version__))

    # Fatal problems are logged at ERROR (INFO is hidden unless --debug raised
    # the level) and exit non-zero so calling scripts can detect the failure.
    if not args.outdir:
        logging.error('Error, no output directory specified, please specify one')
        sys.exit(-1)

    if not args.infile:
        logging.error('Error, no fasta specified, please specify one')
        sys.exit(-1)

    if not os.path.isfile(args.infile):
        logging.error('Error, fasta file does not exist')
        sys.exit(-1)

    if not os.path.isdir(args.outdir):
        os.mkdir(args.outdir, 0o755)

    # Values that convert cleanly (e.g. "4") are still accepted; anything else
    # stops here instead of raising a traceback further down.
    try:
        num_threads = int(args.num_threads)
    except (TypeError, ValueError):
        logging.error('Error number of threads must be an integer, you specified "{}"'.format(args.num_threads))
        sys.exit(-1)

    database_dir = os.path.abspath(args.database_directory)
    verify_init(logging, database_dir)

    # Script arguments
    input_fasta = args.infile
    out_dir = args.outdir
    keep_tmp = args.keep_tmp
    if database_dir == default_database_dir:
        # Default location: honour the individual database path options.
        mob_ref = args.plasmid_mob
        mpf_ref = args.plasmid_mpf
        orit_ref = args.plasmid_orit
        mash_db = args.plasmid_mash_db
        replicon_ref = args.plasmid_replicons
    else:
        # Custom database directory: use the fixed file names inside it.
        mob_ref = os.path.join(database_dir, 'mob.proteins.faa')
        mpf_ref = os.path.join(database_dir, 'mpf.proteins.faa')
        orit_ref = os.path.join(database_dir, 'orit.fas')
        mash_db = os.path.join(database_dir, 'ncbi_plasmid_full_seqs.fas.msh')
        replicon_ref = os.path.join(database_dir, 'rep.dna.fas')

    tmp_dir = os.path.join(out_dir, '__tmp')
    file_id = os.path.basename(input_fasta)
    fixed_fasta = os.path.join(tmp_dir, 'fixed.input.fasta')
    replicon_blast_results = os.path.join(tmp_dir, 'replicon_blast_results.txt')
    mob_blast_results = os.path.join(tmp_dir, 'mobtyper_blast_results.txt')
    mpf_blast_results = os.path.join(tmp_dir, 'mpf_blast_results.txt')
    orit_blast_results = os.path.join(tmp_dir, 'orit_blast_results.txt')
    report_file = os.path.join(out_dir, 'mobtyper_' + file_id + '_report.txt')
    mash_file = os.path.join(tmp_dir, 'mash_' + file_id + '.txt')

    # Remove blast results left behind by a previous run in the same outdir.
    for stale_file in (mob_blast_results, mpf_blast_results, orit_blast_results, replicon_blast_results):
        if os.path.isfile(stale_file):
            os.remove(stale_file)

    # Input numeric params
    min_rep_ident = float(args.min_rep_ident)
    min_mob_ident = float(args.min_mob_ident)
    min_ori_ident = float(args.min_ori_ident)
    min_mpf_ident = float(args.min_mpf_ident)
    _typer_validate_bounds({'min_rep_ident': min_rep_ident,
                            'min_mob_ident': min_mob_ident,
                            'min_ori_ident': min_ori_ident,
                            'min_mpf_ident': min_mpf_ident}, 60, 100)

    min_rep_cov = float(args.min_rep_cov)
    min_mob_cov = float(args.min_mob_cov)
    min_ori_cov = float(args.min_ori_cov)
    min_mpf_cov = float(args.min_mpf_cov)
    _typer_validate_bounds({'min_rep_cov': min_rep_cov,
                            'min_mob_cov': min_mob_cov,
                            'min_ori_cov': min_ori_cov,
                            'min_mpf_cov': min_mpf_cov}, 60, 100)

    min_rep_evalue = float(args.min_rep_evalue)
    min_mob_evalue = float(args.min_mob_evalue)
    min_ori_evalue = float(args.min_ori_evalue)
    min_mpf_evalue = float(args.min_mpf_evalue)
    evalues = {'min_rep_evalue': min_rep_evalue,
               'min_mob_evalue': min_mob_evalue,
               'min_ori_evalue': min_ori_evalue,
               'min_mpf_evalue': min_mpf_evalue}
    for name, value in evalues.items():
        if value < 0:
            logging.error("Error: {} is too low, please specify an float evalue between 0 to 1".format(name))
            sys.exit(-1)
        if value > 1:
            logging.error("Error: {} is too high, please specify an float evalue between 0 to 1".format(name))
            sys.exit(-1)

    check_dependencies(logging)

    # orit_ref is included because it is blasted below just like the others.
    needed_dbs = [replicon_ref, mob_ref, mash_db, mpf_ref, orit_ref]
    for db in needed_dbs:
        if not os.path.isfile(db):
            logging.warning('Warning! Needed database missing "{}"'.format(db))
            mob_suite.mob_init.main()

    if not os.path.isdir(tmp_dir):
        os.mkdir(tmp_dir, 0o755)

    fix_fasta_header(input_fasta, fixed_fasta)

    # run individual marker blasts
    logging.info('Running replicon blast on {}'.format(replicon_ref))
    found_replicons = _typer_collect_hits(getRepliconContigs(
        replicon_blast(replicon_ref, fixed_fasta, min_rep_ident, min_rep_cov, min_rep_evalue, tmp_dir,
                       replicon_blast_results, num_threads=num_threads)))

    logging.info('Running relaxase blast on {}'.format(mob_ref))
    found_mob = _typer_collect_hits(getRepliconContigs(
        mob_blast(mob_ref, fixed_fasta, min_mob_ident, min_mob_cov, min_mob_evalue, tmp_dir,
                  mob_blast_results, num_threads=num_threads)))

    logging.info('Running mpf blast on {}'.format(mpf_ref))
    found_mpf = _typer_collect_hits(getRepliconContigs(
        mob_blast(mpf_ref, fixed_fasta, min_mpf_ident, min_mpf_cov, min_mpf_evalue, tmp_dir,
                  mpf_blast_results, num_threads=num_threads)))

    logging.info('Running orit blast on {}'.format(orit_ref))
    found_orit = _typer_collect_hits(getRepliconContigs(
        replicon_blast(orit_ref, fixed_fasta, min_ori_ident, min_ori_cov, min_ori_evalue, tmp_dir,
                       orit_blast_results, num_threads=num_threads)))

    # Get closest neighbor by mash distance. The output handle is closed
    # before the file is read back so no buffered output can be missed.
    m = mash()
    with open(mash_file, 'w') as mashfile_handle:
        m.run_mash(mash_db, fixed_fasta, mashfile_handle)
    mash_results = m.read_mash(mash_file)
    mash_top_hit = getMashBestHit(mash_results)

    rep_types = _typer_join_or_dash(found_replicons.values())
    rep_acs = _typer_join_or_dash(found_replicons.keys())
    mob_types = _typer_join_or_dash(found_mob.values())
    mob_acs = _typer_join_or_dash(found_mob.keys())
    mpf_type = determine_mpf_type(found_mpf) if found_mpf else "-"
    mpf_acs = _typer_join_or_dash(found_mpf.keys())
    orit_types = _typer_join_or_dash(found_orit.values())
    orit_acs = _typer_join_or_dash(found_orit.keys())

    stats = calcFastaStats(fixed_fasta)

    # Relaxase plus mpf => conjugative; relaxase or orit alone => mobilizable.
    if mob_acs != '-' and mpf_acs != '-':
        predicted_mobility = 'Conjugative'
    elif mob_acs != '-' or orit_acs != '-':
        predicted_mobility = 'Mobilizable'
    else:
        predicted_mobility = 'Non-mobilizable'

    header = "file_id\tnum_contigs\ttotal_length\tgc\t" \
             "rep_type(s)\trep_type_accession(s)\t" \
             "relaxase_type(s)\trelaxase_type_accession(s)\t" \
             "mpf_type\tmpf_type_accession(s)\t" \
             "orit_type(s)\torit_accession(s)\tPredictedMobility\t" \
             "mash_nearest_neighbor\tmash_neighbor_distance\tmash_neighbor_cluster\n"

    fields = (file_id, stats['num_seq'], stats['size'], stats['gc_content'],
              rep_types, rep_acs, mob_types, mob_acs, mpf_type, mpf_acs,
              orit_types, orit_acs, predicted_mobility,
              mash_top_hit['top_hit'], mash_top_hit['mash_hit_score'], mash_top_hit['clustid'])
    string = "\t".join("{}".format(field) for field in fields)

    # The data row is written without a trailing newline, as before.
    with open(report_file, 'w') as results_fh:
        results_fh.write(header)
        results_fh.write(string)

    if not keep_tmp:
        shutil.rmtree(tmp_dir)

    print("{}".format(string))
Esempio n. 2
0
def _recon_validate_bounds(params, lower, upper):
    """Abort the run if any value in ``params`` lies outside ``[lower, upper]``.

    Args:
        params: mapping of command-line option name to its numeric value.
        lower: smallest accepted value (inclusive).
        upper: largest accepted value (inclusive).

    Logs an error naming the offending option and exits with status -1.
    """
    for name, value in params.items():
        if value < lower:
            logging.error(
                "Error: {} is too low, please specify an integer between {} - {}".format(name, lower, upper))
            sys.exit(-1)
        if value > upper:
            logging.error(
                "Error: {} is too high, please specify an integer between {} - {}".format(name, lower, upper))
            sys.exit(-1)


def _recon_best_mash_hit(masher, mash_db, fasta_file, mash_file):
    """Run mash on ``fasta_file`` and return the best hit from the results.

    The output handle is closed before ``mash_file`` is read back so that no
    buffered output can be missed.
    """
    with open(mash_file, 'w') as mashfile_handle:
        masher.run_mash(mash_db, fasta_file, mashfile_handle)
    return getMashBestHit(masher.read_mash(mash_file))


def _recon_summarise_hits(hit_ids):
    """Return ``(types, accessions)`` as comma-joined strings.

    Every hit id must split on ``|`` into exactly two fields (accession and
    type). Duplicates are dropped, keeping first-seen order.
    """
    hit_types = dict()
    accessions = dict()
    for hit_id in hit_ids:
        accession, hit_type = hit_id.split('|')
        hit_types[hit_type] = ''
        accessions[accession] = ''
    return ','.join(hit_types), ','.join(accessions)


def main():
    """Command-line entry point for MOB-recon.

    Validates the parsed arguments, blasts the input contigs against the
    replicon, relaxase, plasmid and repetitive-element databases, groups
    contigs into clusters, and writes to the output directory:

    * ``repetitive_blast_report.txt`` and ``contig_report.txt``
    * one ``plasmid_<cluster>.fasta`` per retained cluster
    * ``chromosome.fasta`` with every contig not assigned to a plasmid
    * ``mobtyper_aggregate_report.txt`` when ``args.run_typer`` is set

    Exits with status -1 on any invalid argument or missing database.
    """
    args = parse_args()

    if args.debug:
        init_console_logger(3)
    logging.info("MOB-recon v. {} ".format(__version__))

    if not args.outdir:
        logging.error(
            'Error, no output directory specified, please specify one')
        sys.exit(-1)

    if not args.infile:
        logging.error('Error, no fasta specified, please specify one')
        sys.exit(-1)

    if not os.path.isfile(args.infile):
        logging.error('Error, input fasta file does not exist: "{}"'.format(
            args.infile))
        sys.exit(-1)

    logging.info('Processing fasta file {}'.format(args.infile))
    logging.info('Analysis directory {}'.format(args.outdir))

    if not os.path.isdir(args.outdir):
        os.mkdir(args.outdir, 0o755)

    # Check that the needed databases have been initialized
    database_dir = os.path.abspath(args.database_directory)
    verify_init(logging, database_dir)
    status_file = os.path.join(database_dir, 'status.txt')

    if not os.path.isfile(status_file):
        logging.warning(
            'Warning! Needed databases have not been initialize please run mob_init and try again'
        )
        mob_suite.mob_init.main()

    # Values that convert cleanly (e.g. "4") are accepted; anything else stops
    # here rather than being passed on to the blast wrappers.
    try:
        num_threads = int(args.num_threads)
    except (TypeError, ValueError):
        logging.error(
            'Error number of threads must be an integer, you specified "{}"'.
            format(args.num_threads))
        sys.exit(-1)

    plasmid_files = dict()
    input_fasta = args.infile
    out_dir = args.outdir
    tmp_dir = os.path.join(out_dir, '__tmp')
    file_id = os.path.basename(input_fasta)
    fixed_fasta = os.path.join(tmp_dir, 'fixed.input.fasta')
    chromosome_file = os.path.join(out_dir, 'chromosome.fasta')
    replicon_blast_results = os.path.join(tmp_dir,
                                          'replicon_blast_results.txt')
    mob_blast_results = os.path.join(tmp_dir, 'mobrecon_blast_results.txt')
    repetitive_blast_results = os.path.join(tmp_dir,
                                            'repetitive_blast_results.txt')
    contig_blast_results = os.path.join(tmp_dir, 'contig_blast_results.txt')

    # Input numeric params (validated once)
    min_rep_ident = float(args.min_rep_ident)
    min_mob_ident = float(args.min_mob_ident)
    min_con_ident = float(args.min_con_ident)
    min_rpp_ident = float(args.min_rpp_ident)
    _recon_validate_bounds({
        'min_rep_ident': min_rep_ident,
        'min_mob_ident': min_mob_ident,
        'min_con_ident': min_con_ident,
        'min_rpp_ident': min_rpp_ident,
    }, 60, 100)

    min_rep_cov = float(args.min_rep_cov)
    min_mob_cov = float(args.min_mob_cov)
    min_con_cov = float(args.min_con_cov)
    min_rpp_cov = float(args.min_rpp_cov)
    _recon_validate_bounds({
        'min_rep_cov': min_rep_cov,
        'min_mob_cov': min_mob_cov,
        'min_con_cov': min_con_cov,
        'min_rpp_cov': min_rpp_cov,
    }, 60, 100)

    min_rep_evalue = float(args.min_rep_evalue)
    min_mob_evalue = float(args.min_mob_evalue)
    min_con_evalue = float(args.min_con_evalue)
    min_rpp_evalue = float(args.min_rpp_evalue)
    evalues = {
        'min_rep_evalue': min_rep_evalue,
        'min_mob_evalue': min_mob_evalue,
        'min_con_evalue': min_con_evalue,
        'min_rpp_evalue': min_rpp_evalue,
    }
    for name, value in evalues.items():
        if value < 0:
            logging.error(
                "Error: {} is too low, please specify an float evalue between 0 to 1"
                .format(name))
            sys.exit(-1)
        if value > 1:
            logging.error(
                "Error: {} is too high, please specify an float evalue between 0 to 1"
                .format(name))
            sys.exit(-1)

    # Check that both options parse as integers, then hand the parsed argument
    # values through unchanged: that is what the previous code effectively did
    # (its int() results were overwritten by the raw values further down).
    try:
        int(args.min_overlap)
        int(args.min_length)
    except (TypeError, ValueError):
        logging.error(
            'Error min_overlap and min_length must be integers, you specified "{}" and "{}"'
            .format(args.min_overlap, args.min_length))
        sys.exit(-1)
    min_overlapp = args.min_overlap
    min_length = args.min_length

    # Input Databases
    default_database_dir = os.path.join(
        os.path.dirname(os.path.realpath(__file__)), 'databases')
    if database_dir == default_database_dir:
        # Default location: honour the individual database path options.
        plasmid_ref_db = args.plasmid_db
        replicon_ref = args.plasmid_replicons
        mob_ref = args.plasmid_mob
        mash_db = args.plasmid_mash_db
        repetitive_mask_file = args.repetitive_mask
    else:
        # Custom database directory: use the fixed file names inside it.
        plasmid_ref_db = os.path.join(database_dir,
                                      'ncbi_plasmid_full_seqs.fas')
        replicon_ref = os.path.join(database_dir, 'rep.dna.fas')
        mob_ref = os.path.join(database_dir, 'mob.proteins.faa')
        mash_db = os.path.join(database_dir, 'ncbi_plasmid_full_seqs.fas.msh')
        repetitive_mask_file = os.path.join(database_dir, 'repetitive.dna.fas')

    check_dependencies(logging)

    needed_dbs = [
        plasmid_ref_db, replicon_ref, mob_ref, mash_db, repetitive_mask_file,
        "{}.nin".format(repetitive_mask_file)
    ]

    for db in needed_dbs:
        if not os.path.isfile(db):
            logging.error('Error needed database missing "{}"'.format(db))
            sys.exit(-1)

    contig_report_file = os.path.join(out_dir, 'contig_report.txt')
    minimus_prefix = os.path.join(tmp_dir, 'minimus')
    filtered_blast = os.path.join(tmp_dir, 'filtered_blast.txt')
    repetitive_blast_report = os.path.join(out_dir,
                                           'repetitive_blast_report.txt')
    mobtyper_results_file = os.path.join(out_dir,
                                         'mobtyper_aggregate_report.txt')
    keep_tmp = args.keep_tmp

    run_circlator = args.run_circlator
    unicycler_contigs = args.unicycler_contigs

    logging.info('Creating tmp working directory {}'.format(tmp_dir))

    if not os.path.isdir(tmp_dir):
        os.mkdir(tmp_dir, 0o755)

    logging.info(
        'Writing cleaned header input fasta file from {} to {}'.format(
            input_fasta, fixed_fasta))
    fix_fasta_header(input_fasta, fixed_fasta)
    contig_seqs = read_fasta_dict(fixed_fasta)

    logging.info('Running replicon blast on {}'.format(replicon_ref))
    replicon_contigs = getRepliconContigs(
        replicon_blast(replicon_ref,
                       fixed_fasta,
                       min_rep_ident,
                       min_rep_cov,
                       min_rep_evalue,
                       tmp_dir,
                       replicon_blast_results,
                       num_threads=num_threads))

    logging.info('Running relaxase blast on {}'.format(mob_ref))
    mob_contigs = getRepliconContigs(
        mob_blast(mob_ref,
                  fixed_fasta,
                  min_mob_ident,
                  min_mob_cov,
                  min_mob_evalue,
                  tmp_dir,
                  mob_blast_results,
                  num_threads=num_threads))

    logging.info('Running contig blast on {}'.format(plasmid_ref_db))
    contig_blast(fixed_fasta, plasmid_ref_db, min_con_ident, min_con_cov,
                 min_con_evalue, min_length, tmp_dir, contig_blast_results)

    pcl_clusters = contig_blast_group(filtered_blast, min_overlapp)

    logging.info(
        'Running repetitive contig masking blast on {}'.format(
            repetitive_mask_file))
    repetitive_contigs = repetitive_blast(fixed_fasta,
                                          repetitive_mask_file,
                                          min_rpp_ident,
                                          min_rpp_cov,
                                          min_rpp_evalue,
                                          min_length,
                                          tmp_dir,
                                          repetitive_blast_results,
                                          num_threads=num_threads)

    circular_contigs = dict()

    if run_circlator:
        logging.info('Running circlator minimus2 on {}'.format(fixed_fasta))
        circular_contigs = circularize(fixed_fasta, minimus_prefix)

    if unicycler_contigs:
        # Contigs whose id carries 'circular=true' are taken as circular.
        for seqid in contig_seqs:
            if 'circular=true' in seqid:
                circular_contigs[seqid] = ''

    # Repetitive-element report; the per-contig summary string is reused as
    # the trailing columns of the contig report further down.
    repetitive_dna = dict()
    with open(repetitive_blast_report, 'w') as results_fh:
        results_fh.write(
            "contig_id\tmatch_id\tmatch_type\tscore\tcontig_match_start\tcontig_match_end\n"
        )
        for contig_id in repetitive_contigs:
            match = repetitive_contigs[contig_id]
            match_info = match['id'].split('|')
            repetitive_dna[contig_id] = "{}\t{}\t{}\t{}\t{}".format(
                match_info[1], match_info[-1], match['score'],
                match['contig_start'], match['contig_end'])
            results_fh.write("{}\t{}\n".format(contig_id,
                                               repetitive_dna[contig_id]))

    # Record one bitscore per cluster, using the first cluster listed for each
    # contig.
    seq_clusters = dict()
    cluster_bitscores = dict()
    for seqid in pcl_clusters:
        cluster_id = next(iter(pcl_clusters[seqid]))
        cluster_bitscores[cluster_id] = pcl_clusters[seqid][cluster_id]

    # Best-scoring cluster first. The ascending sort followed by reverse() is
    # kept on purpose: for tied scores it orders clusters differently from
    # sorted(reverse=True), and that order decides which cluster claims a
    # contig listed under several clusters.
    sorted_cluster_bitscores = sorted(cluster_bitscores.items(),
                                      key=operator.itemgetter(1))
    sorted_cluster_bitscores.reverse()

    contigs_assigned = dict()
    for cluster_id, bitscore in sorted_cluster_bitscores:
        if cluster_id not in seq_clusters:
            seq_clusters[cluster_id] = dict()
        for seqid in pcl_clusters:
            if cluster_id not in pcl_clusters[seqid]:
                continue
            if seqid in contig_seqs and seqid not in contigs_assigned:
                seq_clusters[cluster_id][seqid] = contig_seqs[seqid]
                contigs_assigned[seqid] = cluster_id

    # Add contigs carrying a known relaxase (first pass) or a known replicon
    # (second pass) that were not placed in any cluster: each one gets its own
    # "Novel_<n>" cluster. The counter advances for every unclustered contig,
    # even one missing from contig_seqs.
    clust_id = 0
    for marker_contigs in (mob_contigs, replicon_contigs):
        for contig_id in marker_contigs:
            if contig_id in pcl_clusters:
                continue
            if contig_id in contig_seqs:
                novel_id = "Novel_" + str(clust_id)
                if novel_id not in seq_clusters:
                    seq_clusters[novel_id] = dict()
                # Only the key is read back later; 0 is a placeholder score.
                pcl_clusters[contig_id] = {novel_id: 0}
                seq_clusters[novel_id][contig_id] = contig_seqs[contig_id]
            clust_id += 1

    # Count replicon hits per cluster (by each contig's first listed cluster).
    replicon_clusters = dict()
    for contig_id in replicon_contigs:
        if contig_id not in pcl_clusters or not pcl_clusters[contig_id]:
            # No cluster recorded for this contig; nothing to count against.
            continue
        first_cluster = next(iter(pcl_clusters[contig_id]))
        for hit_id in replicon_contigs[contig_id]:
            replicon_clusters[first_cluster] = replicon_clusters.get(
                first_cluster, 0) + 1

    # split out circular sequences from each other: a circular contig in a
    # multi-contig cluster with more than one replicon hit moves to its own
    # novel cluster.
    refined_clusters = dict()
    for cluster_id in seq_clusters:
        members = seq_clusters[cluster_id]

        if cluster_id not in refined_clusters:
            refined_clusters[cluster_id] = dict()

        for contig_id in members:
            if contig_id in circular_contigs and len(members) > 1 and (
                    cluster_id in replicon_clusters
                    and replicon_clusters[cluster_id] > 1):
                novel_id = "Novel_" + str(clust_id)
                if novel_id not in refined_clusters:
                    refined_clusters[novel_id] = dict()
                refined_clusters[novel_id][contig_id] = members[contig_id]
                clust_id += 1
                continue

            refined_clusters[cluster_id][contig_id] = members[contig_id]

    seq_clusters = refined_clusters

    m = mash()
    filter_list = dict()  # contigs assigned to a reported plasmid cluster
    counter = 0  # numbering for 'novel_<n>' plasmid output files

    with open(contig_report_file, 'w') as results_fh:
        # The leading space in " mash_neighbor_distance" is kept as-is so the
        # report header stays byte-identical for downstream parsers.
        results_fh.write("file_id\tcluster_id\tcontig_id\tcontig_length\tcircularity_status\trep_type\t" \
                         "rep_type_accession\trelaxase_type\trelaxase_type_accession\tmash_nearest_neighbor\t"
                         " mash_neighbor_distance\trepetitive_dna_id\tmatch_type\tscore\tcontig_match_start\tcontig_match_end\n")

        for cluster in seq_clusters:
            clusters = seq_clusters[cluster]

            count_seqs = len(clusters)
            count_rep = 0
            count_small = 0
            total_cluster_length = 0
            for contig_id in clusters:
                if contig_id in repetitive_contigs:
                    count_rep += 1
                length = len(clusters[contig_id])
                total_cluster_length += length
                if length < 3000:
                    count_small += 1

            # Drop clusters that are entirely repetitive, mostly repetitive
            # and made only of short contigs, or too short overall. The first
            # test also catches empty clusters before the division.
            if count_rep == count_seqs or (
                    float(count_rep) / count_seqs * 100 > 50
                    and count_small == count_seqs) or total_cluster_length < 1500:
                continue

            for contig_id in clusters:
                filter_list[contig_id] = ''

            cluster_file = os.path.join(tmp_dir,
                                        'clust_' + str(cluster) + '.fasta')
            mash_file = os.path.join(tmp_dir, 'clust_' + str(cluster) + '.txt')
            write_fasta_dict(clusters, cluster_file)

            mash_top_hit = _recon_best_mash_hit(m, mash_db, cluster_file,
                                                mash_file)

            # delete low scoring clusters unless a member has a replicon, a
            # relaxase or is circular
            if float(mash_top_hit['mash_hit_score']) > 0.05:
                has_marker = any(
                    contig_id in replicon_contigs
                    or contig_id in circular_contigs
                    or contig_id in mob_contigs for contig_id in clusters)
                if not has_marker:
                    for contig_id in clusters:
                        del filter_list[contig_id]
                    continue

            new_clust_file = None
            if os.path.isfile(cluster_file):
                if float(mash_top_hit['mash_hit_score']) < 0.05:
                    cluster = mash_top_hit['clustid']
                else:
                    cluster = 'novel_' + str(counter)
                    counter += 1
                new_clust_file = os.path.join(out_dir,
                                              'plasmid_' + cluster + ".fasta")

                if os.path.isfile(new_clust_file):
                    # An output file for this cluster already exists: append
                    # these contigs to it and recompute the mash hit.
                    with open(cluster_file, 'r') as source_fh:
                        data = source_fh.read()
                    with open(new_clust_file, 'a') as target_fh:
                        target_fh.write(data)
                    mash_file = os.path.join(tmp_dir,
                                             'clust_' + str(cluster) + '.txt')
                    mash_top_hit = _recon_best_mash_hit(
                        m, mash_db, cluster_file, mash_file)
                else:
                    os.rename(cluster_file, new_clust_file)

            if new_clust_file is not None:
                plasmid_files[new_clust_file] = ''

            for contig_id in clusters:
                found_replicon_string = ''
                found_replicon_id_string = ''
                found_mob_string = ''
                found_mob_id_string = ''
                contig_status = 'Incomplete'
                if contig_id in circular_contigs:
                    contig_status = 'Circular'

                if contig_id in replicon_contigs:
                    found_replicon_string, found_replicon_id_string = \
                        _recon_summarise_hits(replicon_contigs[contig_id])

                if contig_id in mob_contigs:
                    found_mob_string, found_mob_id_string = \
                        _recon_summarise_hits(mob_contigs[contig_id])

                # Four tabs keep the five repetitive-DNA columns empty.
                rep_dna_info = "\t\t\t\t"
                if contig_id in repetitive_dna:
                    rep_dna_info = repetitive_dna[contig_id]

                results_fh.write(
                    "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n".format(
                        file_id, cluster, contig_id, len(clusters[contig_id]),
                        contig_status, found_replicon_string,
                        found_replicon_id_string, found_mob_string,
                        found_mob_id_string, mash_top_hit['top_hit'],
                        mash_top_hit['mash_hit_score'], rep_dna_info))

        # Everything not assigned to a plasmid cluster is reported as
        # chromosome.
        chr_contigs = dict()
        for contig_id in contig_seqs:
            if contig_id in filter_list:
                continue
            chr_contigs[contig_id] = contig_seqs[contig_id]
            rep_dna_info = "\t\t\t\t"
            if contig_id in repetitive_dna:
                rep_dna_info = repetitive_dna[contig_id]
            contig_status = 'Incomplete'
            if contig_id in circular_contigs:
                contig_status = 'Circular'
            results_fh.write(
                "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n".format(
                    file_id, 'chromosome', contig_id,
                    len(contig_seqs[contig_id]), contig_status, '', '', '', '',
                    '', '', rep_dna_info))

    write_fasta_dict(chr_contigs, chromosome_file)

    if args.run_typer:
        # Header row followed by whatever run_mob_typer returns for each
        # plasmid file.
        typer_parts = [
            "file_id\tnum_contigs\ttotal_length\tgc\t"
            "rep_type(s)\trep_type_accession(s)\t"
            "relaxase_type(s)\trelaxase_type_accession(s)\t"
            "mpf_type\tmpf_type_accession(s)\t"
            "orit_type(s)\torit_accession(s)\tPredictedMobility\t"
            "mash_nearest_neighbor\tmash_neighbor_distance\tmash_neighbor_cluster\n"
        ]
        for plasmid_file in plasmid_files:
            typer_parts.append("{}".format(
                run_mob_typer(plasmid_file,
                              out_dir,
                              str(num_threads),
                              database_dir=database_dir)))
        with open(mobtyper_results_file, 'w') as typer_fh:
            typer_fh.write("".join(typer_parts))

    if not keep_tmp:
        shutil.rmtree(tmp_dir)
Esempio n. 3
0
def main():
    """Type a single plasmid FASTA and write a one-row Mob-typer report.

    Reads its configuration from ``parse_args()``. The input FASTA is copied
    with cleaned headers into ``<outdir>/__tmp/fixed.input.fasta`` and then
    searched against the replicon, relaxase (mob), mate-pair formation (mpf)
    and oriT reference databases. The nearest mash neighbour is looked up,
    and a tab-separated header plus one result row are written to
    ``<outdir>/mobtyper_<input basename>_report.txt``. The result row is also
    printed to stdout.

    The ``__tmp`` directory is removed afterwards unless ``args.keep_tmp`` is
    set. The function exits through ``sys.exit()`` when the output directory
    or input FASTA argument is missing, or when the input file does not exist.
    """
    args = parse_args()
    if args.debug:
        init_console_logger(3)
    logging.info('Running Mob-typer v. {}'.format(__version__))
    if not args.outdir:
        logging.info(
            'Error, no output directory specified, please specify one')
        sys.exit()

    if not args.infile:
        logging.info('Error, no fasta specified, please specify one')
        sys.exit()

    if not os.path.isfile(args.infile):
        logging.info('Error, fasta file does not exist')
        sys.exit()

    if not os.path.isdir(args.outdir):
        os.mkdir(args.outdir, 0o755)

    # Only reported, not fatal: a numeric string still passes the int()
    # conversion below.
    if not isinstance(args.num_threads, int):
        logging.info(
            'Error number of threads must be an integer, you specified "{}"'.
            format(args.num_threads))

    verify_init(logging)

    # Script arguments
    input_fasta = args.infile
    out_dir = args.outdir
    num_threads = int(args.num_threads)
    evalue = args.evalue
    keep_tmp = args.keep_tmp
    mob_ref = args.plasmid_mob
    mpf_ref = args.plasmid_mpf
    orit_ref = args.plasmid_orit
    mash_db = args.plasmid_mash_db
    replicon_ref = args.plasmid_replicons

    # Working and output paths
    tmp_dir = os.path.join(out_dir, '__tmp')
    file_id = os.path.basename(input_fasta)
    fixed_fasta = os.path.join(tmp_dir, 'fixed.input.fasta')
    replicon_blast_results = os.path.join(tmp_dir,
                                          'replicon_blast_results.txt')
    mob_blast_results = os.path.join(tmp_dir, 'mobtyper_blast_results.txt')
    mpf_blast_results = os.path.join(tmp_dir, 'mpf_blast_results.txt')
    orit_blast_results = os.path.join(tmp_dir, 'orit_blast_results.txt')
    report_file = os.path.join(out_dir, 'mobtyper_' + file_id + '_report.txt')
    mash_file = os.path.join(tmp_dir, 'mash_' + file_id + '.txt')

    # Drop result files left over from a previous run in the same directory.
    for stale_results in (mob_blast_results, mpf_blast_results,
                          orit_blast_results, replicon_blast_results):
        if os.path.isfile(stale_results):
            os.remove(stale_results)

    if not os.path.isdir(tmp_dir):
        os.mkdir(tmp_dir, 0o755)

    fix_fasta_header(input_fasta, fixed_fasta)

    # Run the individual marker blasts
    logging.info('Running replicon blast on {}'.format(replicon_ref))
    found_replicons = _hits_by_accession(
        getRepliconContigs(
            replicon_blast(replicon_ref,
                           fixed_fasta,
                           80,
                           80,
                           evalue,
                           tmp_dir,
                           replicon_blast_results,
                           num_threads=num_threads)))

    logging.info('Running relaxase blast on {}'.format(mob_ref))
    found_mob = _hits_by_accession(
        getRepliconContigs(
            mob_blast(mob_ref,
                      fixed_fasta,
                      80,
                      80,
                      evalue,
                      tmp_dir,
                      mob_blast_results,
                      num_threads=num_threads)))

    # The log line now names the database that is actually searched.
    logging.info('Running mpf blast on {}'.format(mpf_ref))
    found_mpf = _hits_by_accession(
        getRepliconContigs(
            mob_blast(mpf_ref,
                      fixed_fasta,
                      85,
                      85,
                      evalue,
                      tmp_dir,
                      mpf_blast_results,
                      num_threads=num_threads)))

    logging.info('Running orit blast on {}'.format(orit_ref))
    found_orit = _hits_by_accession(
        getRepliconContigs(
            replicon_blast(orit_ref,
                           fixed_fasta,
                           90,
                           90,
                           evalue,
                           tmp_dir,
                           orit_blast_results,
                           num_threads=num_threads)))

    # Get closest neighbor by mash distance. The handle is closed before the
    # results file is read back.
    m = mash()
    with open(mash_file, 'w') as mashfile_handle:
        m.run_mash(mash_db, fixed_fasta, mashfile_handle)
    mash_top_hit = getMashBestHit(m.read_mash(mash_file))

    rep_types = _join_or_dash(found_replicons.values())
    rep_acs = _join_or_dash(found_replicons.keys())
    mob_types = _join_or_dash(found_mob.values())
    mob_acs = _join_or_dash(found_mob.keys())
    if len(found_mpf) > 0:
        mpf_type = determine_mpf_type(found_mpf)
    else:
        mpf_type = "-"
    mpf_acs = _join_or_dash(found_mpf.keys())
    orit_types = _join_or_dash(found_orit.values())
    orit_acs = _join_or_dash(found_orit.keys())

    stats = calcFastaStats(fixed_fasta)

    # A relaxase or an oriT makes the plasmid mobilizable; a relaxase together
    # with an mpf hit makes it conjugative.
    predicted_mobility = 'Non-mobilizable'
    if mob_acs != '-' or orit_acs != '-':
        predicted_mobility = 'Mobilizable'
    if mob_acs != '-' and mpf_acs != '-':
        predicted_mobility = 'Conjugative'

    string = "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}".format(
        file_id, stats['num_seq'], stats['size'], stats['gc_content'],
        rep_types, rep_acs, mob_types, mob_acs, mpf_type, mpf_acs, orit_types,
        orit_acs, predicted_mobility, mash_top_hit['top_hit'],
        mash_top_hit['mash_hit_score'], mash_top_hit['clustid'])

    # The result row is written without a trailing newline, as before.
    with open(report_file, 'w') as results_fh:
        results_fh.write("file_id\tnum_contigs\ttotal_length\tgc\t"
                         "rep_type(s)\trep_type_accession(s)\t"
                         "relaxase_type(s)\trelaxase_type_accession(s)\t"
                         "mpf_type\tmpf_type_accession(s)\t"
                         "orit_type(s)\torit_accession(s)\tPredictedMobility\t"
                         "mash_nearest_neighbor\tmash_neighbor_distance\tmash_neighbor_cluster\n")
        results_fh.write(string)

    if not keep_tmp:
        shutil.rmtree(tmp_dir)

    print("{}".format(string))


def _hits_by_accession(contig_hits):
    """Flatten per-contig hits into an ``accession -> type`` dict.

    Args:
        contig_hits: mapping of contig id to an iterable of hit ids, each of
            the form ``"<accession>|<type>"``.

    Returns:
        dict keyed by accession; a later hit with the same accession
        overwrites an earlier one.

    Raises:
        ValueError: if a hit id does not split into exactly two ``|`` fields.
    """
    found = dict()
    for contig_id in contig_hits:
        for hit in contig_hits[contig_id]:
            acs, hit_type = hit.split('|')
            found[acs] = hit_type
    return found


def _join_or_dash(values):
    """Return ``values`` comma-joined, or ``"-"`` when there are none."""
    values = list(values)
    if len(values) > 0:
        return ",".join(values)
    return "-"
Esempio n. 4
0
def main():
    """Reconstruct plasmids from an assembly FASTA and write the reports.

    Configuration comes from ``parse_args()``. In order, the function:

    1. Checks that an output directory and an input FASTA were given
       (``sys.exit()`` otherwise) and creates the output directory and its
       ``__tmp`` working directory.
    2. Writes a header-cleaned copy of the input to
       ``__tmp/fixed.input.fasta`` and loads it.
    3. Runs the replicon, relaxase, reference-plasmid and repetitive-element
       blasts, and collects circular contigs (circlator and/or the
       ``circular=true`` header tag).
    4. Writes ``repetitive_blast_report.txt``.
    5. Groups contigs into clusters, gives replicon/relaxase contigs that were
       not clustered their own ``Novel_<n>`` cluster, and splits circular
       contigs out of multi-contig clusters with more than one replicon hit.
    6. Runs mash on every retained cluster, writes ``plasmid_<id>.fasta``
       files, ``contig_report.txt`` and ``chromosome.fasta``.
    7. When ``args.run_typer`` is set, runs ``run_mob_typer`` on each plasmid
       file and writes ``mobtyper_aggregate_report.txt``.

    ``__tmp`` is removed at the end unless ``args.keep_tmp`` is set.
    """
    args = parse_args()
    if args.debug:
        init_console_logger(3)
    logging.info('Running plasmid detector v. {}'.format(__version__))
    if not args.outdir:
        logging.info(
            'Error, no output directory specified, please specify one')
        sys.exit()
    if not args.infile:
        logging.info('Error, no fasta specified, please specify one')
        sys.exit()
    logging.info('Processing fasta file {}'.format(args.infile))
    logging.info('Analysis directory {}'.format(args.outdir))

    if not os.path.isdir(args.outdir):
        os.mkdir(args.outdir, 0o755)

    verify_init(logging)

    # Inputs and thresholds
    input_fasta = args.infile
    out_dir = args.outdir
    num_threads = args.num_threads
    min_ident = args.min_ident
    min_cov = args.min_cov
    evalue = args.evalue
    min_length = args.min_length
    keep_tmp = args.keep_tmp

    # Reference databases
    plasmid_ref_db = args.plasmid_db
    replicon_ref = args.plasmid_replicons
    mob_ref = args.plasmid_mob
    mash_db = args.plasmid_mash_db
    repetitive_mask_file = args.repetitive_mask

    # Working files (tmp_dir) and reports (out_dir)
    tmp_dir = os.path.join(out_dir, '__tmp')
    file_id = os.path.basename(input_fasta)
    fixed_fasta = os.path.join(tmp_dir, 'fixed.input.fasta')
    replicon_blast_results = os.path.join(tmp_dir,
                                          'replicon_blast_results.txt')
    mob_blast_results = os.path.join(tmp_dir, 'mobrecon_blast_results.txt')
    repetitive_blast_results = os.path.join(tmp_dir,
                                            'repetitive_blast_results.txt')
    contig_blast_results = os.path.join(tmp_dir, 'contig_blast_results.txt')
    minimus_prefix = os.path.join(tmp_dir, 'minimus')
    filtered_blast = os.path.join(tmp_dir, 'filtered_blast.txt')
    chromosome_file = os.path.join(out_dir, 'chromosome.fasta')
    contig_report_file = os.path.join(out_dir, 'contig_report.txt')
    repetitive_blast_report = os.path.join(out_dir,
                                           'repetitive_blast_report.txt')
    mobtyper_results_file = os.path.join(out_dir,
                                         'mobtyper_aggregate_report.txt')

    # Both switches arrive as strings; anything other than 'False'/'false'
    # enables the feature.
    run_circlator = not (args.run_circlator == 'False'
                         or args.run_circlator == 'false')
    unicycler_contigs = not (args.unicycler_contigs == 'False'
                             or args.unicycler_contigs == 'false')

    # Only reported, not fatal.
    if not isinstance(args.num_threads, int):
        logging.info(
            'Error number of threads must be an integer, you specified "{}"'.
            format(args.num_threads))

    logging.info('Creating tmp working directory {}'.format(tmp_dir))

    if not os.path.isdir(tmp_dir):
        os.mkdir(tmp_dir, 0o755)

    logging.info(
        'Writing cleaned header input fasta file from {} to {}'.format(
            input_fasta, fixed_fasta))
    fix_fasta_header(input_fasta, fixed_fasta)
    contig_seqs = read_fasta_dict(fixed_fasta)

    logging.info('Running replicon blast on {}'.format(replicon_ref))
    replicon_contigs = getRepliconContigs(
        replicon_blast(replicon_ref,
                       fixed_fasta,
                       80,
                       80,
                       evalue,
                       tmp_dir,
                       replicon_blast_results,
                       num_threads=num_threads))

    logging.info('Running relaxase blast on {}'.format(mob_ref))
    mob_contigs = getRepliconContigs(
        mob_blast(mob_ref,
                  fixed_fasta,
                  80,
                  80,
                  evalue,
                  tmp_dir,
                  mob_blast_results,
                  num_threads=num_threads))

    logging.info('Running contig blast on {}'.format(plasmid_ref_db))
    contig_blast(fixed_fasta, plasmid_ref_db, min_ident, min_cov, evalue,
                 min_length, tmp_dir, contig_blast_results)

    # NOTE(review): filtered_blast is presumably produced inside tmp_dir by
    # contig_blast above; nothing in this function writes it. Confirm.
    pcl_clusters = contig_blast_group(filtered_blast, 10)

    # The log line now names the mask database that is actually searched.
    logging.info('Running repetitive contig masking blast on {}'.format(
        repetitive_mask_file))
    repetitive_contigs = repetitive_blast(fixed_fasta,
                                          repetitive_mask_file,
                                          min_ident,
                                          min_cov,
                                          evalue,
                                          300,
                                          tmp_dir,
                                          repetitive_blast_results,
                                          num_threads=num_threads)

    circular_contigs = dict()

    if run_circlator:
        logging.info('Running circlator minimus2 on {}'.format(fixed_fasta))
        circular_contigs = circularize(fixed_fasta, minimus_prefix)

    if unicycler_contigs:
        for seqid in contig_seqs:
            if 'circular=true' in seqid:
                circular_contigs[seqid] = ''

    # Repetitive-element report. The tab-joined match details are kept per
    # contig so they can be appended to the contig report rows later.
    repetitive_dna = dict()
    with open(repetitive_blast_report, 'w') as results_fh:
        results_fh.write(
            "contig_id\tmatch_id\tmatch_type\tscore\tcontig_match_start\tcontig_match_end\n"
        )
        for contig_id in repetitive_contigs:
            hit = repetitive_contigs[contig_id]
            match_info = hit['id'].split('|')
            repetitive_dna[contig_id] = "{}\t{}\t{}\t{}\t{}".format(
                match_info[1], match_info[len(match_info) - 1], hit['score'],
                hit['contig_start'], hit['contig_end'])
            results_fh.write("{}\t{}\n".format(contig_id,
                                               repetitive_dna[contig_id]))

    # Record one bitscore per cluster, taken from the first cluster listed
    # for each contig.
    seq_clusters = dict()
    cluster_bitscores = dict()
    for seqid in pcl_clusters:
        cluster_id = list(pcl_clusters[seqid].keys())[0]
        cluster_bitscores[cluster_id] = pcl_clusters[seqid][cluster_id]

    # Ascending sort followed by reverse() is kept deliberately: it orders
    # tied bitscores differently from sorted(..., reverse=True).
    sorted_cluster_bitscores = sorted(list(cluster_bitscores.items()),
                                      key=operator.itemgetter(1))
    sorted_cluster_bitscores.reverse()

    # Highest-scoring clusters claim their contigs first; each contig is
    # assigned to at most one cluster.
    contigs_assigned = dict()
    for cluster_id, _bitscore in sorted_cluster_bitscores:
        if cluster_id not in seq_clusters:
            seq_clusters[cluster_id] = dict()
        for seqid in pcl_clusters:
            if cluster_id not in pcl_clusters[seqid]:
                continue
            if seqid in contig_seqs and seqid not in contigs_assigned:
                seq_clusters[cluster_id][seqid] = contig_seqs[seqid]
                contigs_assigned[seqid] = cluster_id

    # Add sequences with known relaxases, then sequences with known
    # replicons, regardless of whether they belong to a cluster.
    clust_id = 0
    clust_id = _add_novel_clusters(mob_contigs, contig_seqs, pcl_clusters,
                                   seq_clusters, clust_id)
    clust_id = _add_novel_clusters(replicon_contigs, contig_seqs,
                                   pcl_clusters, seq_clusters, clust_id)

    # Count replicon hits per cluster, using the first cluster listed for
    # each replicon-bearing contig.
    replicon_clusters = dict()
    for contig_id in replicon_contigs:
        for hit_id in replicon_contigs[contig_id]:
            # Unpacking is kept so a malformed hit id still raises ValueError.
            _hit_acs, _rep_type = hit_id.split('|')
            cluster = list(pcl_clusters[contig_id].keys())[0]
            if cluster not in replicon_clusters:
                replicon_clusters[cluster] = 0
            replicon_clusters[cluster] += 1

    # Split out circular sequences from each other: in a multi-contig cluster
    # with more than one replicon hit, every circular contig gets its own
    # Novel_<n> cluster.
    refined_clusters = dict()
    for cluster_id in seq_clusters:
        members = seq_clusters[cluster_id]
        if cluster_id not in refined_clusters:
            refined_clusters[cluster_id] = dict()

        for contig_id in members:
            if contig_id in circular_contigs and len(members) > 1 and (
                    cluster_id in replicon_clusters
                    and replicon_clusters[cluster_id] > 1):
                novel_key = "Novel_" + str(clust_id)
                refined_clusters.setdefault(
                    novel_key, dict())[contig_id] = members[contig_id]
                clust_id += 1
                continue

            refined_clusters[cluster_id][contig_id] = members[contig_id]

    seq_clusters = refined_clusters

    m = mash()

    # Report rows are collected and written in one go at the end. The leading
    # space in " mash_neighbor_distance" is kept from the original header so
    # existing consumers of the report see the same column names.
    report_rows = [
        "file_id\tcluster_id\tcontig_id\tcontig_length\tcircularity_status\trep_type\t"
        "rep_type_accession\trelaxase_type\trelaxase_type_accession\tmash_nearest_neighbor\t"
        " mash_neighbor_distance\trepetitive_dna_id\tmatch_type\tscore\tcontig_match_start\tcontig_match_end\n"
    ]
    row_template = "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n"
    no_rep_dna_info = "\t\t\t\t"

    plasmid_files = dict()
    filter_list = dict()  # contigs assigned to a plasmid cluster
    counter = 0  # numbering for novel_<n> plasmid files

    for cluster in seq_clusters:
        cluster_seqs = seq_clusters[cluster]
        count_seqs = len(cluster_seqs)
        count_rep = 0
        count_small = 0
        total_cluster_length = 0

        for contig_id in cluster_seqs:
            if contig_id in repetitive_contigs:
                count_rep += 1
            length = len(cluster_seqs[contig_id])
            total_cluster_length += length
            if length < 3000:
                count_small += 1

        # Skip clusters that are entirely repetitive, mostly repetitive and
        # made only of small contigs, or too short overall. The first test
        # also covers empty clusters, so the division below is never by zero.
        if count_rep == count_seqs or (
                float(count_rep) / count_seqs * 100 > 50
                and count_small == count_seqs) or total_cluster_length < 1500:
            continue

        for contig_id in cluster_seqs:
            filter_list[contig_id] = ''

        cluster_file = os.path.join(tmp_dir,
                                    'clust_' + str(cluster) + '.fasta')
        mash_file = os.path.join(tmp_dir, 'clust_' + str(cluster) + '.txt')
        write_fasta_dict(cluster_seqs, cluster_file)

        with open(mash_file, 'w') as mashfile_handle:
            m.run_mash(mash_db, cluster_file, mashfile_handle)
        mash_top_hit = getMashBestHit(m.read_mash(mash_file))

        # Delete low scoring clusters unless one of their contigs carries a
        # replicon, a relaxase, or is circular.
        if float(mash_top_hit['mash_hit_score']) > 0.05:
            has_marker = any(contig_id in replicon_contigs
                             or contig_id in circular_contigs
                             or contig_id in mob_contigs
                             for contig_id in cluster_seqs)
            if not has_marker:
                for contig_id in cluster_seqs:
                    del filter_list[contig_id]
                continue

        # The reported cluster name becomes the mash cluster id for close
        # matches and novel_<n> otherwise.
        cluster_label = cluster
        new_clust_file = None
        if os.path.isfile(cluster_file):
            if float(mash_top_hit['mash_hit_score']) < 0.05:
                cluster_label = mash_top_hit['clustid']
            else:
                cluster_label = 'novel_' + str(counter)
                counter += 1
            new_clust_file = os.path.join(
                out_dir, 'plasmid_' + cluster_label + ".fasta")

            if os.path.isfile(new_clust_file):
                # Another cluster already produced this plasmid file: append
                # to it and redo the mash lookup under the new name.
                with open(cluster_file, 'r') as source_fh:
                    data = source_fh.read()
                with open(new_clust_file, 'a') as target_fh:
                    target_fh.write(data)
                mash_file = os.path.join(
                    tmp_dir, 'clust_' + str(cluster_label) + '.txt')
                with open(mash_file, 'w') as mashfile_handle:
                    m.run_mash(mash_db, cluster_file, mashfile_handle)
                mash_top_hit = getMashBestHit(m.read_mash(mash_file))
            else:
                os.rename(cluster_file, new_clust_file)

        if new_clust_file is not None:
            plasmid_files[new_clust_file] = ''

        for contig_id in cluster_seqs:
            contig_status = 'Incomplete'
            if contig_id in circular_contigs:
                contig_status = 'Circular'

            found_replicon_string = ''
            found_replicon_id_string = ''
            if contig_id in replicon_contigs:
                found_replicon_string, found_replicon_id_string = \
                    _summarize_hits(replicon_contigs[contig_id])

            found_mob_string = ''
            found_mob_id_string = ''
            if contig_id in mob_contigs:
                found_mob_string, found_mob_id_string = _summarize_hits(
                    mob_contigs[contig_id])

            rep_dna_info = no_rep_dna_info
            if contig_id in repetitive_dna:
                rep_dna_info = repetitive_dna[contig_id]

            report_rows.append(
                row_template.format(
                    file_id, cluster_label, contig_id,
                    len(cluster_seqs[contig_id]), contig_status,
                    found_replicon_string, found_replicon_id_string,
                    found_mob_string, found_mob_id_string,
                    mash_top_hit['top_hit'], mash_top_hit['mash_hit_score'],
                    rep_dna_info))

    # Everything not assigned to a plasmid cluster is reported as chromosome.
    chr_contigs = dict()
    for contig_id in contig_seqs:
        if contig_id in filter_list:
            continue
        chr_contigs[contig_id] = contig_seqs[contig_id]
        rep_dna_info = no_rep_dna_info
        if contig_id in repetitive_dna:
            rep_dna_info = repetitive_dna[contig_id]
        contig_status = 'Incomplete'
        if contig_id in circular_contigs:
            contig_status = 'Circular'
        report_rows.append(
            row_template.format(file_id, 'chromosome', contig_id,
                                len(contig_seqs[contig_id]), contig_status,
                                '', '', '', '', '', '', rep_dna_info))

    with open(contig_report_file, 'w') as results_fh:
        results_fh.writelines(report_rows)
    write_fasta_dict(chr_contigs, chromosome_file)

    if args.run_typer:
        mobtyper_parts = [
            "file_id\tnum_contigs\ttotal_length\tgc\t"
            "rep_type(s)\trep_type_accession(s)\t"
            "relaxase_type(s)\trelaxase_type_accession(s)\t"
            "mpf_type\tmpf_type_accession(s)\t"
            "orit_type(s)\torit_accession(s)\tPredictedMobility\t"
            "mash_nearest_neighbor\tmash_neighbor_distance\tmash_neighbor_cluster\n"
        ]
        for plasmid_file in plasmid_files:
            mobtyper_parts.append("{}".format(
                run_mob_typer(plasmid_file, out_dir, str(num_threads))))
        with open(mobtyper_results_file, 'w') as fh:
            fh.write("".join(mobtyper_parts))

    if not keep_tmp:
        shutil.rmtree(tmp_dir)


def _add_novel_clusters(marker_contigs, contig_seqs, pcl_clusters,
                        seq_clusters, clust_id):
    """Give unclustered marker contigs their own ``Novel_<n>`` cluster.

    Args:
        marker_contigs: iterable of contig ids carrying a marker hit.
        contig_seqs: mapping of contig id to sequence.
        pcl_clusters: mapping of contig id to ``{cluster id: score}``; updated
            in place for every contig that gets a novel cluster.
        seq_clusters: mapping of cluster id to ``{contig id: sequence}``;
            updated in place.
        clust_id: next free novel-cluster number.

    Returns:
        The next free novel-cluster number. The counter advances for every
        contig missing from ``pcl_clusters``, even when the contig is not in
        ``contig_seqs`` and so no cluster is created for it.
    """
    for contig_id in marker_contigs:
        if contig_id in pcl_clusters:
            continue
        if contig_id in contig_seqs:
            novel_key = "Novel_" + str(clust_id)
            seq_clusters.setdefault(novel_key,
                                    dict())[contig_id] = contig_seqs[contig_id]
            # The score is a placeholder; only the key is read afterwards.
            pcl_clusters[contig_id] = {novel_key: 0}
        clust_id += 1
    return clust_id


def _summarize_hits(hit_ids):
    """Return ``(types, accessions)`` for ``"<accession>|<type>"`` hit ids.

    Both values are comma-joined strings with duplicates removed.

    Raises:
        ValueError: if a hit id does not split into exactly two ``|`` fields.
    """
    hit_types = dict()
    accessions = dict()
    for hit_id in hit_ids:
        acs, hit_type = hit_id.split('|')
        hit_types[hit_type] = ''
        accessions[acs] = ''
    return ','.join(list(hit_types.keys())), ','.join(list(accessions.keys()))
Esempio n. 5
0
def main():
    default_database_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'databases')
    args = parse_args()

    if args.debug:
       logger = init_console_logger(3)
    else:
       logger = init_console_logger(2)

    logger.info('Running Mob-typer version {}'.format(__version__))

    if not args.outdir:
        logger.info('Error, no output directory specified, please specify one')
        sys.exit()

    if not args.infile:
        logger.info('Error, no fasta specified, please specify one')
        sys.exit()

    if not os.path.isfile(args.infile):
        logger.info('Error, fasta file does not exist')
        sys.exit()

    if not os.path.isdir(args.outdir):
        os.mkdir(args.outdir, 0o755)

    if not isinstance(args.num_threads, int):
        logger.info('Error number of threads must be an integer, you specified "{}"'.format(args.num_threads))

    database_dir = os.path.abspath(args.database_directory)

    verify_init(logger,database_dir)
    # Script arguments
    input_fasta = args.infile
    out_dir = args.outdir
    num_threads = int(args.num_threads)
    keep_tmp = args.keep_tmp


    if database_dir == default_database_dir:
        mob_ref = args.plasmid_mob
        mpf_ref = args.plasmid_mpf
        orit_ref = args.plasmid_orit
        mash_db = args.plasmid_mash_db
        replicon_ref = args.plasmid_replicons
    else:
        mob_ref = os.path.join(database_dir, 'mob.proteins.faa')
        mpf_ref = os.path.join(database_dir, 'mpf.proteins.faa')
        orit_ref = os.path.join(database_dir, 'orit.fas')
        mash_db = os.path.join(database_dir, 'ncbi_plasmid_full_seqs.fas.msh')
        replicon_ref = os.path.join(database_dir, 'rep.dna.fas')


    tmp_dir = os.path.join(out_dir, '__tmp')
    file_id = os.path.basename(input_fasta)
    #output_file_prefix = re.sub(r"\..*", "", file_id)  # remove file extension by matching everything  before dot

    fixed_fasta = os.path.join(tmp_dir, 'fixed.input.fasta')
    replicon_blast_results = os.path.join(tmp_dir, 'replicon_blast_results.txt')
    mob_blast_results = os.path.join(tmp_dir, 'mobtyper_blast_results.txt')
    mpf_blast_results = os.path.join(tmp_dir, 'mpf_blast_results.txt')
    orit_blast_results = os.path.join(tmp_dir, 'orit_blast_results.txt')
    if os.path.isfile(mob_blast_results):
        os.remove(mob_blast_results)
    if os.path.isfile(mpf_blast_results):
        os.remove(mpf_blast_results)
    if os.path.isfile(orit_blast_results):
        os.remove(orit_blast_results)
    if os.path.isfile(replicon_blast_results):
        os.remove(replicon_blast_results)
    report_file = os.path.join(out_dir, 'mobtyper_' + file_id + '_report.txt')
    mash_file = os.path.join(tmp_dir, 'mash_' + file_id + '.txt')


    # Input numeric params

    min_rep_ident = float(args.min_rep_ident)
    min_mob_ident = float(args.min_mob_ident)
    min_ori_ident = float(args.min_ori_ident)
    min_mpf_ident = float(args.min_mpf_ident)

    idents = {'min_rep_ident': min_rep_ident, 'min_mob_ident': min_mob_ident, 'min_ori_ident': min_ori_ident}

    for param in idents:

        value = float(idents[param])

        if value < 60:
            logger.error("Error: {} is too low, please specify an integer between 70 - 100".format(param))
            sys.exit(-1)
        if value > 100:
            logger.error("Error: {} is too high, please specify an integer between 70 - 100".format(param))
            sys.exit(-1)


    min_rep_cov = float(args.min_rep_cov)
    min_mob_cov = float(args.min_mob_cov)
    min_ori_cov = float(args.min_ori_cov)
    min_mpf_cov = float(args.min_mpf_cov)



    covs = {'min_rep_cov': min_rep_cov, 'min_mob_cov': min_mob_cov, 'min_con_cov': min_ori_cov,
            'min_rpp_cov': min_ori_cov}

    for param in covs:

        value = float(covs[param])

        if value < 60:
            logger.error("Error: {} is too low, please specify an integer between 50 - 100".format(param))
            sys.exit(-1)
        if value > 100:
            logger.error("Error: {} is too high, please specify an integer between 50 - 100".format(param))
            sys.exit(-1)


    min_rep_evalue = float(args.min_rep_evalue)
    min_mob_evalue = float(args.min_mob_evalue)
    min_ori_evalue = float(args.min_ori_evalue)
    min_mpf_evalue = float(args.min_mpf_evalue)


    evalues = {'min_rep_evalue': min_rep_evalue, 'min_mob_evalue': min_mob_evalue, 'min_con_evalue': min_ori_evalue}

    for param in evalues:

        value = float(evalues[param])

        if value > 1:
            logger.error("Error: {} is too high, please specify an float evalue between 0 to 1".format(param))
            sys.exit(-1)


    check_dependencies(logger)

    needed_dbs = [replicon_ref, mob_ref, mash_db, mpf_ref]

    for db in needed_dbs:
        if (not os.path.isfile(db)):
            logger.info('Warning! Needed database missing "{}"'.format(db))
            mob_suite.mob_init.main()


    if not os.path.isdir(tmp_dir):
        os.mkdir(tmp_dir, 0o755)

    fix_fasta_header(input_fasta, fixed_fasta)

    # run individual marker blasts
    logger.info('Running replicon blast on {}'.format(replicon_ref))
    replicon_contigs = getRepliconContigs(
        replicon_blast(replicon_ref, fixed_fasta, min_rep_ident, min_rep_cov, min_rep_evalue, tmp_dir, replicon_blast_results,
                       num_threads=num_threads))
    found_replicons = dict()

    for contig_id in replicon_contigs:
        for hit in replicon_contigs[contig_id]:
            acs, type = hit.split('|')
            found_replicons[acs] = type

    #print("These replicons are found")
    #print(list(found_replicons.values()))

    logger.info('Running relaxase blast on {}'.format(mob_ref))

    mob_contigs = getRepliconContigs(
        mob_blast(mob_ref, fixed_fasta, min_mob_ident, min_mob_cov, min_mob_evalue, tmp_dir, mob_blast_results, num_threads=num_threads))
    found_mob = dict()
    for contig_id in mob_contigs:
        for hit in mob_contigs[contig_id]:
            acs, type = hit.split('|')
            found_mob[acs] = type
    #print ("These are relaxeses found")
    #print (list(found_mob.values()))


    logger.info('Running mpf blast on {}'.format(mob_ref))
    mpf_contigs = getRepliconContigs(
        mob_blast(mpf_ref, fixed_fasta, min_mpf_ident, min_mpf_cov, min_mpf_evalue, tmp_dir, mpf_blast_results, num_threads=num_threads))
    found_mpf = dict()
    for contig_id in mpf_contigs:
        for hit in mpf_contigs[contig_id]:
            acs, type = hit.split('|')
            found_mpf[acs] = type

    # print(found_mpf)

    logger.info('Running orit blast on {}'.format(replicon_ref))
    orit_contigs = getRepliconContigs(
        replicon_blast(orit_ref, fixed_fasta, min_ori_ident, min_ori_cov, min_ori_evalue, tmp_dir, orit_blast_results,
                       num_threads=num_threads))
    found_orit = dict()
    for contig_id in orit_contigs:
        for hit in orit_contigs[contig_id]:
            acs, type = hit.split('|')
            found_orit[acs] = type


    # Get closest neighbor by mash distance in the entire plasmid database
    m = mash()
    #mash_distances = dict()
    mashfile_handle = open(mash_file, 'w')
    m.run_mash(mash_db, fixed_fasta, mashfile_handle)
    mash_results = m.read_mash(mash_file)
    mash_top_hit = getMashBestHit(mash_results)


    # GET HOST RANGE
    host_range_literature_report_df = pandas.DataFrame()
    if args.host_range_detailed and found_replicons:
        (host_range_refseq_rank, host_range_refseq_name, taxids, taxids_df, stats_host_range) = getRefSeqHostRange(
            replicon_name_list=list(found_replicons.values()),
            mob_cluster_id_list=[mash_top_hit['clustid']],
            relaxase_name_acc_list=None,
            relaxase_name_list=None,
            matchtype="loose_match",hr_obs_data = loadHostRangeDB())

        if '-' in taxids:
            host_range_refseq_rank = None;
            host_range_refseq_name = None

        else:
            refseqtree = getTaxonomyTree(taxids) #refseq tree
            renderTree(
                       tree=refseqtree,
                       filename_prefix=args.outdir+"/"+file_id+"_refseqhostrange_")

            #get literature report summary dataframe (might be more than 1 row if multiple replicons are present)
            host_range_literature_report_df, littaxids = getLiteratureBasedHostRange(replicon_names = list(found_replicons.values()),
                                                                                      plasmid_lit_db = loadliteratureplasmidDB(),
                                                                                      input_seq = args.infile )




            if littaxids:
                littree = getTaxonomyTree(littaxids) #get literature tree
                renderTree(
                           tree=littree,
                           filename_prefix=args.outdir+"/"+file_id+ "_literaturehostrange_")


            #write hostrange reports
            writeOutHostRangeReports(filename_prefix = args.outdir+"/"+file_id,
                                     samplename=file_id,
                                     replicon_name_list = list(found_replicons.values()),
                                     mob_cluster_id_list = [mash_top_hit['clustid']],
                                     relaxase_name_acc_list = None,
                                     relaxase_name_list = None,
                                     convergance_rank=host_range_refseq_rank,
                                     convergance_taxonomy=host_range_refseq_name,
                                     stats_host_range_dict=stats_host_range,
                                     literature_hr_report=host_range_literature_report_df)
    elif args.host_range_detailed and found_mob: #by MOB_accession numbers
        (host_range_refseq_rank, host_range_refseq_name, taxids, taxids_df, stats_host_range) = getRefSeqHostRange(
                                                                                                                    replicon_name_list=None,
                                                                                                                    mob_cluster_id_list=[mash_top_hit['clustid']],
                                                                                                                    relaxase_name_acc_list=found_mob.keys(),
                                                                                                                    relaxase_name_list=None,
                                                                                                                    matchtype="loose_match", hr_obs_data=loadHostRangeDB())

        refseqtree = getTaxonomyTree(taxids)  # refseq tree
        renderTree(
                    tree=refseqtree,
                    filename_prefix=args.outdir + "/" + file_id + "_refseqhostrange_")

        writeOutHostRangeReports(filename_prefix=args.outdir + "/" + file_id,
                                 samplename=file_id,
                                 replicon_name_list=None,
                                 mob_cluster_id_list=[mash_top_hit['clustid']],
                                 relaxase_name_acc_list=None,
                                 relaxase_name_list=None,
                                 convergance_rank=host_range_refseq_rank,
                                 convergance_taxonomy=host_range_refseq_name,
                                 stats_host_range_dict=stats_host_range
                                 )

        #print(host_range_refseq_rank, host_range_refseq_name, taxids_df["Organism"])

    else:
        host_range_refseq_rank=None; host_range_refseq_name=None

    #END HOST RANGE MODULE

    if len(found_replicons) > 0:
        found_replicons = OrderedDict(sorted(found_replicons.items(), key=itemgetter(1), reverse=False))
        rep_types = ",".join(list(found_replicons.values()))
        rep_acs = ",".join(list(found_replicons.keys()))
    else:
        rep_types = "-"
        rep_acs = "-"

    if len(found_mob) > 0:
        found_mob = OrderedDict(sorted(found_mob.items(), key=itemgetter(1), reverse=False))
        mob_types = ",".join(list(found_mob.values()))
        mob_acs = ",".join(list(found_mob.keys()))
    else:
        mob_types = "-"
        mob_acs = "-"

    if len(found_mpf) > 0:
        found_mpf = OrderedDict(sorted(found_mpf.items(), key=itemgetter(1), reverse=False))
        mpf_type = determine_mpf_type(found_mpf)
        mpf_acs = ",".join(list(found_mpf.keys()))
    else:
        mpf_type = "-"
        mpf_acs = "-"

    if len(found_orit) > 0:
        found_orit = OrderedDict(sorted(found_orit.items(), key=itemgetter(1), reverse=False))
        orit_types = ",".join(list(found_orit.values()))
        orit_acs = ",".join(list(found_orit.keys()))
    else:
        orit_types = "-"
        orit_acs = "-"
    stats = calcFastaStats(fixed_fasta)
    predicted_mobility = 'Non-mobilizable'

    if mob_acs != '-' or orit_acs != '-':
        predicted_mobility = 'Mobilizable'

    if mob_acs != '-' and mpf_acs != '-':
        predicted_mobility = 'Conjugative'


    main_report_data_dict=collections.OrderedDict({"file_id":re.sub(r"\.(fasta|fa|fas){1,1}","",file_id), "num_contigs":stats['num_seq'], "total_length": stats['size'], "gc":stats['gc_content'],
                           "rep_type(s)": rep_types, "rep_type_accession(s)": rep_acs, "relaxase_type(s)":mob_types,
                           "relaxase_type_accession(s)": mob_acs, "mpf_type": mpf_type, "mpf_type_accession(s)": mpf_acs,
                           "orit_type(s)": orit_types, "orit_accession(s)": orit_acs, "PredictedMobility": predicted_mobility,
                           "mash_nearest_neighbor": mash_top_hit['top_hit'],"mash_neighbor_distance": mash_top_hit['mash_hit_score'],
                           "mash_neighbor_cluster": mash_top_hit['clustid'], "NCBI-HR-rank":"-","NCBI-HR-Name":"-",
                           "LitRepHRPlasmClass":"-","LitPredDBHRRank":"-","LitPredDBHRRankSciName":"-",
                           "LitRepHRRankInPubs":"-", "LitRepHRNameInPubs":"-","LitMeanTransferRate":"-",
                           "LitClosestRefAcc":"-", "LitClosestRefDonorStrain":"-",
                           "LitClosestRefRecipientStrain":"-","LitClosestRefTransferRate":"-", "LitClosestConjugTemp":"-",
                           "LitPMIDs":"-","LitPMIDsNumber":"-"})
    main_report_mobtyper_df = pandas.DataFrame(columns=main_report_data_dict.keys())


    #print(host_range_literature_report_collapsed_df)
    if host_range_refseq_rank and host_range_refseq_name:
        main_report_data_dict.update({"NCBI-HR-rank":host_range_refseq_rank,"NCBI-HR-Name":host_range_refseq_name})


    if host_range_literature_report_df.empty == False:
        if host_range_literature_report_df.shape[0] >= 2: # collapse the host range report when it has 2 or more rows
            host_range_literature_report_df = collapseLiteratureReport(host_range_literature_report_df)
        main_report_data_dict.update({"LitRepHRPlasmClass":host_range_literature_report_df["LiteratureReportedHostRangePlasmidClass"].values[0],
                                      "LitPredDBHRRank":host_range_literature_report_df["LiteraturePredictedHostRangeTreeRank"].values[0],
                                      "LitPredDBHRRankSciName": host_range_literature_report_df["LiteraturePredictedHostRangeTreeRankSciName"].values[0],
                                      "LitRepHRRankInPubs":host_range_literature_report_df["LiteratureReportedHostRangeRankInPubs"].values[0],
                                      "LitRepHRNameInPubs": host_range_literature_report_df["LiteratureReportedHostRangeNameInPubs"].values[0],
                                      "LitMeanTransferRate":host_range_literature_report_df["LiteratureMeanTransferRateRange"].values[0],
                                      "LitClosestRefAcc":host_range_literature_report_df["LiteratureClosestRefrencePlasmidAcc"].values[0],
                                      "LitClosestMashDist": host_range_literature_report_df["LiteratureClosestReferenceMashDistance"].values[0],
                                      "LitClosestRefDonorStrain": host_range_literature_report_df["LiteratureClosestReferenceDonorStrain"].values[0],
                                      "LitClosestRefRecipientStrain": host_range_literature_report_df["LiteratureClosestReferenceRecipientStrain"].values[0],
                                      "LitClosestRefTransferRate": host_range_literature_report_df["LiteratureClosestReferenceTransferRate"].values[0],
                                      "LitClosestConjugTemp": host_range_literature_report_df["LiteratureClosestReferenceConjugationTemperature"].values[0],
                                      "LitPMIDs": host_range_literature_report_df["LiteraturePMIDs"].values[0],
                                      "LitPMIDsNumber":host_range_literature_report_df["LiteraturePublicationsNumber"].values[0]
                                      })


    main_report_mobtyper_df = main_report_mobtyper_df.append(pandas.DataFrame([main_report_data_dict]),sort=False)


    main_report_mobtyper_df.to_csv(report_file, sep="\t", mode="w",encoding="UTF-8",index=False)
    if not keep_tmp:
        shutil.rmtree(tmp_dir)
    logger.info("Run completed")