def main():
    """Type a single plasmid fasta and write a one-row MOB-typer report.

    Reads its settings from ``parse_args()``, runs the replicon, relaxase,
    mpf and oriT blasts against the reference databases, looks up the best
    mash hit, then writes ``mobtyper_<file>_report.txt`` into the output
    directory and prints the same result row to stdout.

    Exits with status -1 (after logging an error) when a required argument
    is missing, the input fasta does not exist, the thread count is not an
    integer, or a threshold is outside its accepted range.
    """
    default_database_dir = os.path.join(
        os.path.dirname(os.path.realpath(__file__)), 'databases')
    args = parse_args()

    if args.debug:
        init_console_logger(3)

    logging.info('Running Mob-typer v. {}'.format(__version__))

    if not args.outdir:
        logging.error('Error, no output directory specified, please specify one')
        sys.exit(-1)

    if not args.infile:
        logging.error('Error, no fasta specified, please specify one')
        sys.exit(-1)

    if not os.path.isfile(args.infile):
        logging.error('Error, fasta file does not exist')
        sys.exit(-1)

    if not os.path.isdir(args.outdir):
        os.mkdir(args.outdir, 0o755)

    # Accept anything int() can convert (e.g. "4"), but stop on real garbage
    # instead of logging and carrying on.
    try:
        num_threads = int(args.num_threads)
    except (TypeError, ValueError):
        logging.error(
            'Error number of threads must be an integer, you specified "{}"'.format(
                args.num_threads))
        sys.exit(-1)

    database_dir = os.path.abspath(args.database_directory)
    verify_init(logging, database_dir)

    # Script arguments
    input_fasta = args.infile
    out_dir = args.outdir
    keep_tmp = args.keep_tmp

    # Individual database paths given on the command line are only honoured
    # for the default database directory; otherwise use the standard names.
    if database_dir == default_database_dir:
        mob_ref = args.plasmid_mob
        mpf_ref = args.plasmid_mpf
        orit_ref = args.plasmid_orit
        mash_db = args.plasmid_mash_db
        replicon_ref = args.plasmid_replicons
    else:
        mob_ref = os.path.join(database_dir, 'mob.proteins.faa')
        mpf_ref = os.path.join(database_dir, 'mpf.proteins.faa')
        orit_ref = os.path.join(database_dir, 'orit.fas')
        mash_db = os.path.join(database_dir, 'ncbi_plasmid_full_seqs.fas.msh')
        replicon_ref = os.path.join(database_dir, 'rep.dna.fas')

    tmp_dir = os.path.join(out_dir, '__tmp')
    file_id = os.path.basename(input_fasta)
    fixed_fasta = os.path.join(tmp_dir, 'fixed.input.fasta')
    replicon_blast_results = os.path.join(tmp_dir, 'replicon_blast_results.txt')
    mob_blast_results = os.path.join(tmp_dir, 'mobtyper_blast_results.txt')
    mpf_blast_results = os.path.join(tmp_dir, 'mpf_blast_results.txt')
    orit_blast_results = os.path.join(tmp_dir, 'orit_blast_results.txt')

    # Remove blast results left over from a previous run.
    for stale_result in (mob_blast_results, mpf_blast_results,
                         orit_blast_results, replicon_blast_results):
        if os.path.isfile(stale_result):
            os.remove(stale_result)

    report_file = os.path.join(out_dir, 'mobtyper_' + file_id + '_report.txt')
    mash_file = os.path.join(tmp_dir, 'mash_' + file_id + '.txt')

    def _check_bounds(params, low, high):
        """Log an error and exit if any named threshold is outside [low, high]."""
        for name, value in params.items():
            if value < low:
                logging.error(
                    "Error: {} is too low, please specify a value between {} - {}".format(
                        name, low, high))
                sys.exit(-1)
            if value > high:
                logging.error(
                    "Error: {} is too high, please specify a value between {} - {}".format(
                        name, low, high))
                sys.exit(-1)

    # Input numeric params
    min_rep_ident = float(args.min_rep_ident)
    min_mob_ident = float(args.min_mob_ident)
    min_ori_ident = float(args.min_ori_ident)
    min_mpf_ident = float(args.min_mpf_ident)
    _check_bounds({
        'min_rep_ident': min_rep_ident,
        'min_mob_ident': min_mob_ident,
        'min_ori_ident': min_ori_ident,
        'min_mpf_ident': min_mpf_ident,
    }, 60, 100)

    min_rep_cov = float(args.min_rep_cov)
    min_mob_cov = float(args.min_mob_cov)
    min_ori_cov = float(args.min_ori_cov)
    min_mpf_cov = float(args.min_mpf_cov)
    _check_bounds({
        'min_rep_cov': min_rep_cov,
        'min_mob_cov': min_mob_cov,
        'min_ori_cov': min_ori_cov,
        'min_mpf_cov': min_mpf_cov,
    }, 60, 100)

    min_rep_evalue = float(args.min_rep_evalue)
    min_mob_evalue = float(args.min_mob_evalue)
    min_ori_evalue = float(args.min_ori_evalue)
    min_mpf_evalue = float(args.min_mpf_evalue)
    _check_bounds({
        'min_rep_evalue': min_rep_evalue,
        'min_mob_evalue': min_mob_evalue,
        'min_ori_evalue': min_ori_evalue,
        'min_mpf_evalue': min_mpf_evalue,
    }, 0, 1)

    check_dependencies(logging)

    # A missing database triggers (re-)initialisation of the databases.
    needed_dbs = [replicon_ref, mob_ref, mash_db, mpf_ref]
    for db in needed_dbs:
        if not os.path.isfile(db):
            logging.info('Warning! Needed database missing "{}"'.format(db))
            mob_suite.mob_init.main()

    if not os.path.isdir(tmp_dir):
        os.mkdir(tmp_dir, 0o755)

    fix_fasta_header(input_fasta, fixed_fasta)

    def _collect_hits(contigs):
        """Map hit accession -> hit type over every 'accession|type' hit id."""
        found = dict()
        for contig_id in contigs:
            for hit in contigs[contig_id]:
                accession, hit_type = hit.split('|')
                found[accession] = hit_type
        return found

    # run individual marker blasts
    logging.info('Running replicon blast on {}'.format(replicon_ref))
    found_replicons = _collect_hits(getRepliconContigs(
        replicon_blast(replicon_ref, fixed_fasta, min_rep_ident, min_rep_cov,
                       min_rep_evalue, tmp_dir, replicon_blast_results,
                       num_threads=num_threads)))

    logging.info('Running relaxase blast on {}'.format(mob_ref))
    found_mob = _collect_hits(getRepliconContigs(
        mob_blast(mob_ref, fixed_fasta, min_mob_ident, min_mob_cov,
                  min_mob_evalue, tmp_dir, mob_blast_results,
                  num_threads=num_threads)))

    logging.info('Running mpf blast on {}'.format(mpf_ref))
    found_mpf = _collect_hits(getRepliconContigs(
        mob_blast(mpf_ref, fixed_fasta, min_mpf_ident, min_mpf_cov,
                  min_mpf_evalue, tmp_dir, mpf_blast_results,
                  num_threads=num_threads)))

    logging.info('Running orit blast on {}'.format(orit_ref))
    found_orit = _collect_hits(getRepliconContigs(
        replicon_blast(orit_ref, fixed_fasta, min_ori_ident, min_ori_cov,
                       min_ori_evalue, tmp_dir, orit_blast_results,
                       num_threads=num_threads)))

    # Get closest neighbor by mash distance. The handle is closed before the
    # result file is read back.
    m = mash()
    with open(mash_file, 'w') as mashfile_handle:
        m.run_mash(mash_db, fixed_fasta, mashfile_handle)
    mash_results = m.read_mash(mash_file)
    mash_top_hit = getMashBestHit(mash_results)

    # "-" marks a marker class without any hit.
    if len(found_replicons) > 0:
        rep_types = ",".join(list(found_replicons.values()))
        rep_acs = ",".join(list(found_replicons.keys()))
    else:
        rep_types = "-"
        rep_acs = "-"

    if len(found_mob) > 0:
        mob_types = ",".join(list(found_mob.values()))
        mob_acs = ",".join(list(found_mob.keys()))
    else:
        mob_types = "-"
        mob_acs = "-"

    if len(found_mpf) > 0:
        mpf_type = determine_mpf_type(found_mpf)
        mpf_acs = ",".join(list(found_mpf.keys()))
    else:
        mpf_type = "-"
        mpf_acs = "-"

    if len(found_orit) > 0:
        orit_types = ",".join(list(found_orit.values()))
        orit_acs = ",".join(list(found_orit.keys()))
    else:
        orit_types = "-"
        orit_acs = "-"

    stats = calcFastaStats(fixed_fasta)

    # A relaxase or an oriT makes the plasmid mobilizable; a relaxase together
    # with an mpf hit makes it conjugative.
    predicted_mobility = 'Non-mobilizable'
    if mob_acs != '-' or orit_acs != '-':
        predicted_mobility = 'Mobilizable'
    if mob_acs != '-' and mpf_acs != '-':
        predicted_mobility = 'Conjugative'

    string = "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}".format(
        file_id, stats['num_seq'], stats['size'], stats['gc_content'],
        rep_types, rep_acs, mob_types, mob_acs, mpf_type, mpf_acs,
        orit_types, orit_acs, predicted_mobility,
        mash_top_hit['top_hit'], mash_top_hit['mash_hit_score'],
        mash_top_hit['clustid'])

    with open(report_file, 'w') as results_fh:
        results_fh.write("file_id\tnum_contigs\ttotal_length\tgc\t"
                         "rep_type(s)\trep_type_accession(s)\t"
                         "relaxase_type(s)\trelaxase_type_accession(s)\t"
                         "mpf_type\tmpf_type_accession(s)\t"
                         "orit_type(s)\torit_accession(s)\tPredictedMobility\t"
                         "mash_nearest_neighbor\tmash_neighbor_distance\tmash_neighbor_cluster\n")
        results_fh.write(string)

    if not keep_tmp:
        shutil.rmtree(tmp_dir)

    print("{}".format(string))
def main():
    """Split an assembly into plasmid clusters and a chromosome (MOB-recon).

    Reads its settings from ``parse_args()`` and then:

    1. blasts the contigs against the replicon, relaxase, plasmid reference
       and repetitive-element databases;
    2. groups contigs into clusters from the plasmid reference blast and adds
       one-contig "Novel_N" clusters for contigs that carry a relaxase or a
       replicon but were not grouped;
    3. optionally gives circular contigs their own cluster;
    4. looks up the best mash hit of each cluster, discards clusters that are
       mostly repetitive/small or have a distant hit and no marker, and
       writes the rest as ``plasmid_<cluster>.fasta``;
    5. writes ``contig_report.txt``, ``repetitive_blast_report.txt`` and
       ``chromosome.fasta`` and, if requested, an aggregate MOB-typer report.

    Exits with status -1 (after logging an error) on missing arguments or
    databases, a non-integer thread count, or out-of-range thresholds.
    """
    args = parse_args()

    if args.debug:
        init_console_logger(3)

    logging.info("MOB-recon v. {} ".format(__version__))

    if not args.outdir:
        logging.error('Error, no output directory specified, please specify one')
        sys.exit(-1)

    if not args.infile:
        logging.error('Error, no fasta specified, please specify one')
        sys.exit(-1)

    if not os.path.isfile(args.infile):
        logging.error('Error, input fasta file does not exist: "{}"'.format(
            args.infile))
        sys.exit(-1)

    logging.info('Processing fasta file {}'.format(args.infile))
    logging.info('Analysis directory {}'.format(args.outdir))

    if not os.path.isdir(args.outdir):
        os.mkdir(args.outdir, 0o755)

    # Check that the needed databases have been initialized
    database_dir = os.path.abspath(args.database_directory)
    verify_init(logging, database_dir)
    status_file = os.path.join(database_dir, 'status.txt')

    if not os.path.isfile(status_file):
        logging.info(
            'Warning! Needed databases have not been initialize please run mob_init and try again'
        )
        mob_suite.mob_init.main()

    plasmid_files = dict()
    input_fasta = args.infile
    out_dir = args.outdir

    # Accept anything int() can convert, but stop on real garbage instead of
    # logging and carrying on.
    try:
        num_threads = int(args.num_threads)
    except (TypeError, ValueError):
        logging.error(
            'Error number of threads must be an integer, you specified "{}"'.
            format(args.num_threads))
        sys.exit(-1)

    tmp_dir = os.path.join(out_dir, '__tmp')
    file_id = os.path.basename(input_fasta)
    fixed_fasta = os.path.join(tmp_dir, 'fixed.input.fasta')
    chromosome_file = os.path.join(out_dir, 'chromosome.fasta')
    replicon_blast_results = os.path.join(tmp_dir, 'replicon_blast_results.txt')
    mob_blast_results = os.path.join(tmp_dir, 'mobrecon_blast_results.txt')
    repetitive_blast_results = os.path.join(tmp_dir, 'repetitive_blast_results.txt')
    contig_blast_results = os.path.join(tmp_dir, 'contig_blast_results.txt')

    def _check_bounds(params, low, high):
        """Log an error and exit if any named threshold is outside [low, high]."""
        for name, value in params.items():
            if value < low:
                logging.error(
                    "Error: {} is too low, please specify a value between {} - {}".format(
                        name, low, high))
                sys.exit(-1)
            if value > high:
                logging.error(
                    "Error: {} is too high, please specify a value between {} - {}".format(
                        name, low, high))
                sys.exit(-1)

    # Input numeric params (validated once)
    min_rep_ident = float(args.min_rep_ident)
    min_mob_ident = float(args.min_mob_ident)
    min_con_ident = float(args.min_con_ident)
    min_rpp_ident = float(args.min_rpp_ident)
    _check_bounds({
        'min_rep_ident': min_rep_ident,
        'min_mob_ident': min_mob_ident,
        'min_con_ident': min_con_ident,
        'min_rpp_ident': min_rpp_ident,
    }, 60, 100)

    min_rep_cov = float(args.min_rep_cov)
    min_mob_cov = float(args.min_mob_cov)
    min_con_cov = float(args.min_con_cov)
    min_rpp_cov = float(args.min_rpp_cov)
    _check_bounds({
        'min_rep_cov': min_rep_cov,
        'min_mob_cov': min_mob_cov,
        'min_con_cov': min_con_cov,
        'min_rpp_cov': min_rpp_cov,
    }, 60, 100)

    min_rep_evalue = float(args.min_rep_evalue)
    min_mob_evalue = float(args.min_mob_evalue)
    min_con_evalue = float(args.min_con_evalue)
    min_rpp_evalue = float(args.min_rpp_evalue)
    _check_bounds({
        'min_rep_evalue': min_rep_evalue,
        'min_mob_evalue': min_mob_evalue,
        'min_con_evalue': min_con_evalue,
        'min_rpp_evalue': min_rpp_evalue,
    }, 0, 1)

    min_overlapp = int(args.min_overlap)
    min_length = int(args.min_length)

    # Input Databases
    # Individual database paths given on the command line are only honoured
    # for the default database directory; otherwise use the standard names.
    default_database_dir = os.path.join(
        os.path.dirname(os.path.realpath(__file__)), 'databases')
    if database_dir == default_database_dir:
        plasmid_ref_db = args.plasmid_db
        replicon_ref = args.plasmid_replicons
        mob_ref = args.plasmid_mob
        mash_db = args.plasmid_mash_db
        repetitive_mask_file = args.repetitive_mask
    else:
        plasmid_ref_db = os.path.join(database_dir, 'ncbi_plasmid_full_seqs.fas')
        replicon_ref = os.path.join(database_dir, 'rep.dna.fas')
        mob_ref = os.path.join(database_dir, 'mob.proteins.faa')
        mash_db = os.path.join(database_dir, 'ncbi_plasmid_full_seqs.fas.msh')
        repetitive_mask_file = os.path.join(database_dir, 'repetitive.dna.fas')

    check_dependencies(logging)

    needed_dbs = [
        plasmid_ref_db, replicon_ref, mob_ref, mash_db, repetitive_mask_file,
        "{}.nin".format(repetitive_mask_file)
    ]
    for db in needed_dbs:
        if not os.path.isfile(db):
            logging.error('Error needed database missing "{}"'.format(db))
            sys.exit(-1)

    contig_report_file = os.path.join(out_dir, 'contig_report.txt')
    minimus_prefix = os.path.join(tmp_dir, 'minimus')
    filtered_blast = os.path.join(tmp_dir, 'filtered_blast.txt')
    repetitive_blast_report = os.path.join(out_dir, 'repetitive_blast_report.txt')
    mobtyper_results_file = os.path.join(out_dir, 'mobtyper_aggregate_report.txt')
    keep_tmp = args.keep_tmp
    run_circlator = args.run_circlator
    unicycler_contigs = args.unicycler_contigs

    logging.info('Creating tmp working directory {}'.format(tmp_dir))
    if not os.path.isdir(tmp_dir):
        os.mkdir(tmp_dir, 0o755)

    logging.info(
        'Writing cleaned header input fasta file from {} to {}'.format(
            input_fasta, fixed_fasta))
    fix_fasta_header(input_fasta, fixed_fasta)
    contig_seqs = read_fasta_dict(fixed_fasta)

    logging.info('Running replicon blast on {}'.format(replicon_ref))
    replicon_contigs = getRepliconContigs(
        replicon_blast(replicon_ref, fixed_fasta, min_rep_ident, min_rep_cov,
                       min_rep_evalue, tmp_dir, replicon_blast_results,
                       num_threads=num_threads))

    logging.info('Running relaxase blast on {}'.format(mob_ref))
    mob_contigs = getRepliconContigs(
        mob_blast(mob_ref, fixed_fasta, min_mob_ident, min_mob_cov,
                  min_mob_evalue, tmp_dir, mob_blast_results,
                  num_threads=num_threads))

    logging.info('Running contig blast on {}'.format(plasmid_ref_db))
    contig_blast(fixed_fasta, plasmid_ref_db, min_con_ident, min_con_cov,
                 min_con_evalue, min_length, tmp_dir, contig_blast_results)

    # NOTE(review): filtered_blast is not written in this function; it is
    # presumably produced in tmp_dir by contig_blast above - confirm.
    pcl_clusters = contig_blast_group(filtered_blast, min_overlapp)

    logging.info('Running repetitive contig masking blast on {}'.format(
        repetitive_mask_file))
    repetitive_contigs = repetitive_blast(fixed_fasta, repetitive_mask_file,
                                          min_rpp_ident, min_rpp_cov,
                                          min_rpp_evalue, min_length, tmp_dir,
                                          repetitive_blast_results,
                                          num_threads=num_threads)

    circular_contigs = dict()
    if run_circlator:
        logging.info('Running circlator minimus2 on {}'.format(fixed_fasta))
        circular_contigs = circularize(fixed_fasta, minimus_prefix)

    # Contig ids containing "circular=true" are taken as circular when the
    # unicycler option is set.
    if unicycler_contigs:
        for seqid in contig_seqs:
            if 'circular=true' in seqid:
                circular_contigs[seqid] = ''

    # Record repetitive-element matches, both for their own report and for
    # the trailing columns of the contig report.
    repetitive_dna = dict()
    with open(repetitive_blast_report, 'w') as results_fh:
        results_fh.write(
            "contig_id\tmatch_id\tmatch_type\tscore\tcontig_match_start\tcontig_match_end\n"
        )
        for contig_id in repetitive_contigs:
            match = repetitive_contigs[contig_id]
            match_info = match['id'].split('|')
            repetitive_dna[contig_id] = "{}\t{}\t{}\t{}\t{}".format(
                match_info[1], match_info[len(match_info) - 1],
                match['score'], match['contig_start'], match['contig_end'])
            results_fh.write("{}\t{}\n".format(contig_id,
                                               repetitive_dna[contig_id]))

    # One score per cluster: the value stored under the first cluster key of
    # each contig (later contigs overwrite earlier ones for the same cluster).
    seq_clusters = dict()
    cluster_bitscores = dict()
    for seqid in pcl_clusters:
        cluster_id = list(pcl_clusters[seqid].keys())[0]
        cluster_bitscores[cluster_id] = pcl_clusters[seqid][cluster_id]

    # Highest score first; sort-then-reverse is kept so ties keep their
    # original relative order.
    sorted_cluster_bitscores = sorted(list(cluster_bitscores.items()),
                                      key=operator.itemgetter(1))
    sorted_cluster_bitscores.reverse()

    # Each contig joins only the first (best scoring) cluster that lists it.
    contigs_assigned = dict()
    for cluster_id, bitscore in sorted_cluster_bitscores:
        if cluster_id not in seq_clusters:
            seq_clusters[cluster_id] = dict()
        for seqid in pcl_clusters:
            if cluster_id not in pcl_clusters[seqid]:
                continue
            if seqid in contig_seqs and seqid not in contigs_assigned:
                seq_clusters[cluster_id][seqid] = contig_seqs[seqid]
                contigs_assigned[seqid] = cluster_id

    def _seed_novel_clusters(marker_contigs, next_id):
        """Give each ungrouped marker contig its own "Novel_N" cluster.

        Returns the next unused cluster number.
        """
        for contig_id in marker_contigs:
            if contig_id in pcl_clusters:
                continue
            if contig_id in contig_seqs:
                novel_id = "Novel_" + str(next_id)
                seq_clusters[novel_id] = {contig_id: contig_seqs[contig_id]}
                pcl_clusters[contig_id] = {novel_id: 0}
            next_id += 1
        return next_id

    # Add sequences with known relaxases regardless of whether they belong to a mcl cluster
    clust_id = _seed_novel_clusters(mob_contigs, 0)

    # Add sequences with known replicons regardless of whether they belong to a mcl cluster
    clust_id = _seed_novel_clusters(replicon_contigs, clust_id)

    # Count replicon hits per cluster.
    replicon_clusters = dict()
    for contig_id in replicon_contigs:
        for hit_id in replicon_contigs[contig_id]:
            # The unpack is kept only to reject malformed hit ids.
            _hit_acs, _rep_type = hit_id.split('|')
            cluster = list(pcl_clusters[contig_id].keys())[0]
            if cluster not in replicon_clusters:
                replicon_clusters[cluster] = 0
            replicon_clusters[cluster] += 1

    # split out circular sequences from each other: a circular contig in a
    # multi-contig cluster with more than one replicon hit gets its own
    # cluster.
    refined_clusters = dict()
    for cluster_key in seq_clusters:
        cluster = seq_clusters[cluster_key]
        if cluster_key not in refined_clusters:
            refined_clusters[cluster_key] = dict()

        for contig_id in cluster:
            if contig_id in circular_contigs and len(cluster) > 1 and (
                    cluster_key in replicon_clusters
                    and replicon_clusters[cluster_key] > 1):
                refined_clusters["Novel_" + str(clust_id)] = {
                    contig_id: cluster[contig_id]
                }
                clust_id += 1
                continue
            refined_clusters[cluster_key][contig_id] = cluster[contig_id]

    seq_clusters = refined_clusters

    m = mash()

    def _mash_best_hit(query_fasta, result_path):
        """Run mash for query_fasta into result_path and return the best hit."""
        with open(result_path, 'w') as mashfile_handle:
            m.run_mash(mash_db, query_fasta, mashfile_handle)
        return getMashBestHit(m.read_mash(result_path))

    def _summarize_hits(hits):
        """Return (types, accessions) as comma-joined unique values."""
        hit_types = dict()
        hit_accessions = dict()
        for hit_id in hits:
            accession, hit_type = hit_id.split('|')
            hit_types[hit_type] = ''
            hit_accessions[accession] = ''
        return ','.join(hit_types), ','.join(hit_accessions)

    # NOTE(review): the header below has a leading space in
    # " mash_neighbor_distance"; left as is because it is part of the output.
    contig_report = [
        "file_id\tcluster_id\tcontig_id\tcontig_length\tcircularity_status\trep_type\t"
        "rep_type_accession\trelaxase_type\trelaxase_type_accession\tmash_nearest_neighbor\t"
        " mash_neighbor_distance\trepetitive_dna_id\tmatch_type\tscore\tcontig_match_start\tcontig_match_end\n"
    ]

    # Contigs listed here are reported as plasmid, everything else as
    # chromosome.
    filter_list = dict()
    counter = 0

    for cluster_key in seq_clusters:
        clusters = seq_clusters[cluster_key]
        total_cluster_length = 0
        count_seqs = len(clusters)
        count_rep = 0
        count_small = 0

        for contig_id in clusters:
            if contig_id in repetitive_contigs:
                count_rep += 1
            length = len(clusters[contig_id])
            total_cluster_length += length
            if length < 3000:
                count_small += 1

        # Skip clusters that are entirely repetitive, mostly repetitive and
        # made of small contigs only, or very short overall. The first test
        # also catches empty clusters before the division.
        if count_rep == count_seqs or (
                float(count_rep) / count_seqs * 100 > 50
                and count_small == count_seqs) or total_cluster_length < 1500:
            continue

        for contig_id in clusters:
            filter_list[contig_id] = ''

        cluster_label = cluster_key
        cluster_file = os.path.join(tmp_dir,
                                    'clust_' + str(cluster_key) + '.fasta')
        mash_file = os.path.join(tmp_dir, 'clust_' + str(cluster_key) + '.txt')
        write_fasta_dict(clusters, cluster_file)
        mash_top_hit = _mash_best_hit(cluster_file, mash_file)
        top_hit_score = float(mash_top_hit['mash_hit_score'])

        # delete low scoring clusters unless a contig carries a replicon, a
        # relaxase or is circular
        if top_hit_score > 0.05:
            has_marker = any(contig_id in replicon_contigs
                             or contig_id in circular_contigs
                             or contig_id in mob_contigs
                             for contig_id in clusters)
            if not has_marker:
                for contig_id in clusters:
                    del filter_list[contig_id]
                continue

        new_clust_file = None
        if os.path.isfile(cluster_file):
            if top_hit_score < 0.05:
                cluster_label = mash_top_hit['clustid']
                new_clust_file = os.path.join(
                    out_dir, 'plasmid_' + cluster_label + ".fasta")
            else:
                cluster_label = 'novel_' + str(counter)
                new_clust_file = os.path.join(
                    out_dir, 'plasmid_' + cluster_label + ".fasta")
                counter += 1

            if os.path.isfile(new_clust_file):
                # Another cluster already mapped to this name: append to the
                # existing fasta and redo the mash lookup.
                with open(cluster_file, 'r') as temp_fh:
                    data = temp_fh.read()
                with open(new_clust_file, 'a') as temp_fh:
                    temp_fh.write(data)

                mash_file = os.path.join(
                    tmp_dir, 'clust_' + str(cluster_label) + '.txt')
                mash_top_hit = _mash_best_hit(cluster_file, mash_file)
            else:
                os.rename(cluster_file, new_clust_file)

        if new_clust_file is not None:
            plasmid_files[new_clust_file] = ''

        for contig_id in clusters:
            found_replicon_string = ''
            found_replicon_id_string = ''
            found_mob_string = ''
            found_mob_id_string = ''
            contig_status = 'Incomplete'
            if contig_id in circular_contigs:
                contig_status = 'Circular'

            if contig_id in replicon_contigs:
                found_replicon_string, found_replicon_id_string = \
                    _summarize_hits(replicon_contigs[contig_id])

            if contig_id in mob_contigs:
                found_mob_string, found_mob_id_string = \
                    _summarize_hits(mob_contigs[contig_id])

            # Four tabs = five empty repetitive-element columns.
            rep_dna_info = "\t\t\t\t"
            if contig_id in repetitive_dna:
                rep_dna_info = repetitive_dna[contig_id]

            contig_report.append(
                "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n".format(
                    file_id, cluster_label, contig_id,
                    len(clusters[contig_id]), contig_status,
                    found_replicon_string, found_replicon_id_string,
                    found_mob_string, found_mob_id_string,
                    mash_top_hit['top_hit'], mash_top_hit['mash_hit_score'],
                    rep_dna_info))

    # Everything not kept in a plasmid cluster is reported as chromosome.
    chr_contigs = dict()
    for contig_id in contig_seqs:
        if contig_id in filter_list:
            continue
        chr_contigs[contig_id] = contig_seqs[contig_id]

        rep_dna_info = "\t\t\t\t"
        if contig_id in repetitive_dna:
            rep_dna_info = repetitive_dna[contig_id]

        contig_status = 'Incomplete'
        if contig_id in circular_contigs:
            contig_status = 'Circular'

        contig_report.append(
            "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n".format(
                file_id, 'chromosome', contig_id, len(contig_seqs[contig_id]),
                contig_status, '', '', '', '', '', '', rep_dna_info))

    with open(contig_report_file, 'w') as results_fh:
        results_fh.writelines(contig_report)

    write_fasta_dict(chr_contigs, chromosome_file)

    if args.run_typer:
        mobtyper_results = [
            "file_id\tnum_contigs\ttotal_length\tgc\t"
            "rep_type(s)\trep_type_accession(s)\t"
            "relaxase_type(s)\trelaxase_type_accession(s)\t"
            "mpf_type\tmpf_type_accession(s)\t"
            "orit_type(s)\torit_accession(s)\tPredictedMobility\t"
            "mash_nearest_neighbor\tmash_neighbor_distance\tmash_neighbor_cluster\n"
        ]
        for plasmid_file in plasmid_files:
            mobtyper_results.append("{}".format(
                run_mob_typer(plasmid_file, out_dir, str(num_threads),
                              database_dir=database_dir)))

        with open(mobtyper_results_file, 'w') as fh:
            fh.write("".join(mobtyper_results))

    if not keep_tmp:
        shutil.rmtree(tmp_dir)
def main():
    """Build or update a MOB-cluster reference database.

    Reads its settings from ``parse_args()``. The MOB-typer report and the
    input fasta must describe exactly the same set of sequence ids. The
    sequences are sketched and clustered with mash at the primary and
    secondary distance thresholds, then:

    * ``build`` mode writes the new assignments to ``clusters.txt`` and
      copies the input fasta to ``references_updated.fasta``;
    * ``update`` mode merges the new assignments into an existing cluster
      file and copies the existing reference fasta to
      ``references_updated.fasta``.

    Finally the resulting fasta is sketched with mash and indexed as a
    nucleotide blast database, and the temporary directory is removed.

    Exits with status -1 (after logging an error) on invalid input.
    """
    args = parse_args()

    if args.debug:
        logger = init_console_logger(3)
    else:
        logger = init_console_logger(2)

    logger.info(
        'Running Mob-Suite Clustering toolkit v. {}'.format(__version__))
    logger.info('Processing fasta file {}'.format(args.infile))
    logger.info('Analysis directory {}'.format(args.outdir))

    check_dependencies(logger)

    input_fasta = args.infile
    if not os.path.isfile(input_fasta):
        logger.error('Error, input fasta specified does not exist: {}'.format(
            input_fasta))
        sys.exit(-1)

    mob_typer_report_file = args.mob_typer_file
    if not os.path.isfile(mob_typer_report_file):
        logger.error(
            'Error, input metadata file specified does not exist: {}'.format(
                mob_typer_report_file))
        sys.exit(-1)

    mode = str(args.mode).lower()
    if mode not in ('update', 'build'):
        logger.error(
            'Error you have not entered a valid mode of build or update, you entered: {}'
            .format(mode))
        sys.exit(-1)

    out_dir = args.outdir
    num_threads = args.num_threads

    if not 0 <= args.primary_cluster_dist <= 1:
        logger.error(
            'Error distance thresholds must be between 0 - 1: {}'.format(
                args.primary_cluster_dist))
        sys.exit(-1)
    primary_distance = args.primary_cluster_dist

    if not 0 <= args.secondary_cluster_dist <= 1:
        logger.error(
            'Error distance thresholds must be between 0 - 1: {}'.format(
                args.secondary_cluster_dist))
        sys.exit(-1)
    secondary_distance = args.secondary_cluster_dist

    if not os.path.isdir(out_dir):
        logger.info('Creating directory {}'.format(out_dir))
        os.mkdir(out_dir, 0o755)

    tmp_dir = os.path.join(out_dir, '__tmp')
    if not os.path.isdir(tmp_dir):
        logger.info('Creating directory {}'.format(tmp_dir))
        os.mkdir(tmp_dir, 0o755)

    taxonomy_file = args.taxonomy

    # Index the MOB-typer records by sample id and reject duplicates.
    records = read_file_to_dict(mob_typer_report_file,
                                MOB_TYPER_REPORT_HEADER,
                                separater="\t")
    seq_ids = []
    new_seq_info = {}
    duplicate_keys = []
    for record in records:
        sample_id = record['sample_id']
        seq_ids.append(sample_id)
        if sample_id not in new_seq_info:
            new_seq_info[sample_id] = record
        else:
            duplicate_keys.append(sample_id)

    if len(duplicate_keys) > 0:
        logger.error(
            "Duplicate sequence identifiers in fasta file. Please make every sequence id unique in the input file before using this tool"
        )
        logger.error("Duplicate sequence ids: {}".format(
            ",".join(duplicate_keys)))
        sys.exit(-1)

    # Attach the organism from the taxonomy file; unknown or blank organisms
    # are recorded as 'Bacteria'.
    record_identifications = read_file_to_dict(taxonomy_file,
                                               ['sample_id', 'organism'],
                                               separater="\t")
    organisms = []
    for record in record_identifications:
        organism = record['organism']
        if organism == 'unknown' or organism == '' or organism == 'Unknown':
            organism = 'Bacteria'
        organisms.append(organism)
        seq_id = record['sample_id']
        if seq_id in new_seq_info:
            new_seq_info[seq_id]['organism'] = organism

    taxids = NamesToTaxIDs(organisms)
    del organisms

    for seq_id in new_seq_info:
        seq_record = new_seq_info[seq_id]
        # A sample without an organism gets the same fallback as an unknown
        # organism instead of failing with a KeyError.
        if 'organism' not in seq_record:
            logger.info(
                'Warning! No organism found for sequence {}, using Bacteria'.
                format(seq_id))
            seq_record['organism'] = 'Bacteria'
        organism = seq_record['organism']
        if organism in taxids:
            seq_record['taxid'] = taxids[organism][0]
        else:
            # Fallback taxid used when the organism name has no match.
            seq_record['taxid'] = 2

    if len(new_seq_info) == 0:
        logger.error(
            'Error no MOB-typer results for sequences. Sequences must be typed with MOB-typer first'
        )
        sys.exit(-1)

    fasta_dict = read_fasta_dict(input_fasta)
    if len(fasta_dict) == 0:
        logger.error(
            'Error no sequences found in input fasta: {}..cannot continue'.
            format(input_fasta))
        sys.exit(-1)

    # The report and the fasta must contain exactly the same ids.
    key_set_1 = set(seq_ids)
    key_set_2 = set(fasta_dict.keys())
    if len(key_set_1 ^ key_set_2) > 0:
        logger.error(
            'Error MOB-typer results: {} and input fasta: {} do not have the same set of identifiers, these must match in order to proceed'
            .format(mob_typer_report_file, input_fasta))
        logger.error(
            'Keys present in MOB-typer results: {} and not in input fasta: {} are: {}'
            .format(mob_typer_report_file, input_fasta,
                    list(key_set_1 - key_set_2)))
        logger.error(
            'Keys present in input fasta: {} and not in MOB-typer results: {} are: {}'
            .format(input_fasta, mob_typer_report_file,
                    list(key_set_2 - key_set_1)))
        sys.exit(-1)

    tmp_cluster_file = os.path.join(out_dir, 'clusters.txt')
    tmp_ref_fasta_file = os.path.join(tmp_dir, 'references_tmp.fasta')
    update_fasta = os.path.join(out_dir, 'references_updated.fasta')

    # Sketch and calculate distances within update sequences
    if len(fasta_dict) > 1:
        mashObj = mash()
        mashObj.mashsketch(input_fasta,
                           input_fasta + ".msh",
                           num_threads=num_threads)
        distance_matrix_file = os.path.join(tmp_dir, 'mash_dist_matrix.txt')
        distance_table = mashObj.run_mash(input_fasta + '.msh',
                                          input_fasta + '.msh',
                                          table=True,
                                          num_threads=num_threads).decode()
        with open(distance_matrix_file, 'w',
                  encoding="utf-8") as mashfile_handle:
            mashfile_handle.write(distance_table)
        clust_assignments = build_cluster_db(
            distance_matrix_file, (primary_distance, secondary_distance))
    else:
        # A single sequence needs no distance matrix; it gets a fixed
        # assignment.
        seq_id = next(iter(fasta_dict))
        clust_assignments = {seq_id: [0, 1]}

    logger.info('Running MOB-cluster in {} mode'.format(mode))
    if mode == 'update':

        if args.ref_fasta_file is None:
            logger.error(
                'Reference fasta file must be specified, please check help for parameter reference'
            )
            sys.exit(-1)

        ref_fasta = args.ref_fasta_file
        if not os.path.isfile(ref_fasta):
            logger.error(
                'Reference fasta file specified does not exist: {}'.format(
                    ref_fasta))
            sys.exit(-1)

        if args.ref_cluster_file is None:
            logger.error(
                'Reference cluster file must be specified, please check help for parameter reference'
            )
            sys.exit(-1)

        ref_cluster_file = args.ref_cluster_file
        if not os.path.isfile(ref_cluster_file):
            logger.error(
                'Reference cluster file specified does not exist: {}'.format(
                    ref_cluster_file))
            sys.exit(-1)

        mob_cluster_seq_info = read_sequence_info(ref_cluster_file,
                                                  MOB_CLUSTER_INFO_HEADER)

        logger.info(
            'Running mob-cluster in update mode with input file: {}'.format(
                input_fasta))
        logger.info(
            'Running mob-cluster in update mode with output directory: {}'.
            format(out_dir))
        logger.info(
            'Running mob-cluster in update mode on reference fasta file: {}'.
            format(ref_fasta))
        logger.info(
            'Reading previous cluster reference assignments from : {}'.format(
                ref_cluster_file))

        shutil.copy(ref_cluster_file, tmp_cluster_file)
        shutil.copy(ref_fasta, tmp_ref_fasta_file)

        logger.info('Creating new cluster assignments')
        new_seq_info = update_existing_db(new_seq_info, mob_cluster_seq_info,
                                          clust_assignments, primary_distance,
                                          secondary_distance, num_threads)

        # New records win over existing ones with the same id.
        cluster_assignments = {**mob_cluster_seq_info, **new_seq_info}
        logger.info(
            'Writting cluster assignments to : {}'.format(tmp_cluster_file))
        writeClusterAssignments(tmp_cluster_file, MOB_CLUSTER_INFO_HEADER,
                                cluster_assignments)
        # update_fasta already includes out_dir; joining out_dir again would
        # double a relative output directory.
        shutil.copy(tmp_ref_fasta_file, update_fasta)

    else:
        cluster_acs = convert_num_to_acs(clust_assignments)
        for seq_id in cluster_acs:
            primary_key = cluster_acs[seq_id][0]
            secondary_key = cluster_acs[seq_id][1]
            new_seq_info[seq_id]['primary_cluster_id'] = primary_key
            new_seq_info[seq_id]['primary_dist'] = primary_distance
            new_seq_info[seq_id]['secondary_cluster_id'] = secondary_key
            new_seq_info[seq_id]['secondary_dist'] = secondary_distance

        writeClusterAssignments(tmp_cluster_file, MOB_CLUSTER_INFO_HEADER,
                                new_seq_info)
        shutil.copy(input_fasta, update_fasta)

    logger.info("Sketching new fasta {}".format(update_fasta))
    mash_db_file = "{}.msh".format(update_fasta)
    mObj = mash()
    mObj.mashsketch(update_fasta, mash_db_file, num_threads=num_threads)

    logger.info("Building blastdb {}".format(update_fasta))
    blast_runner = BlastRunner(update_fasta, '')
    blast_runner.makeblastdb(update_fasta, 'nucl', logging=logger)

    logger.info("Removing temporary directory")
    shutil.rmtree(tmp_dir)

    logger.info(
        "MOB-cluster completed, analysis results written to {}".format(
            out_dir))
def main():
    """Run MOB-typer on one fasta file and write the typing report.

    Steps, in order:
      1. Parse the command line and validate the input file, the mash
         distance thresholds and the blast identity/coverage/e-value
         thresholds.
      2. Resolve the reference database paths (command-line values when the
         default database directory is used, fixed file names otherwise) and
         call ``mob_suite.mob_init.main()`` for any needed file that is missing.
      3. Normalise the fasta headers, build a blast database and annotate
         every contig through ``identify_biomarkers``.
      4. Sketch the input with mash and look up the nearest reference plasmid,
         either per contig (``--multi``) or for the file as a whole.
      5. Add host-range fields and the predicted mobility to each record and
         write the report with ``writeReport``.

    The process exits with status -1 on any validation failure, when the ETE3
    database status check returns False, or when the blast database cannot be
    built.
    """
    args = parse_args()

    logger = init_console_logger(3 if args.debug else 2)

    logger.info('Running Mob-typer version {}'.format(__version__))
    logger.info('Processing fasta file {}'.format(args.infile))

    if not os.path.isfile(args.infile):
        logger.error('Error, fasta file does not exist {}'.format(args.infile))
        sys.exit(-1)

    if not args.analysis_dir:
        # mkdtemp leaves the directory in place until it is removed explicitly.
        # TemporaryDirectory(...).name discarded the owning object, so the
        # directory could be deleted by its finalizer at any later point.
        tmp_dir = tempfile.mkdtemp(dir=tempfile.gettempdir())
    else:
        tmp_dir = args.analysis_dir

    if not os.path.isdir(tmp_dir):
        os.mkdir(tmp_dir, 0o755)

    if not isinstance(args.num_threads, int):
        # Non-fatal: int() below still accepts numeric strings and raises on
        # anything else.
        logger.info(
            'Error number of threads must be an integer, you specified "{}"'.
            format(args.num_threads))

    database_dir = os.path.abspath(args.database_directory)

    if args.sample_id is None:
        # Strip a fasta extension only when it is a whole dot-delimited
        # component. The longest alternative comes first so that ".fas" is not
        # cut down to a stray "s".
        sample_id = re.sub(r"\.(fasta|fas|fa)(?=\.|$)", "",
                           os.path.basename(args.infile))
    else:
        sample_id = args.sample_id

    # Script arguments
    input_fasta = args.infile
    report_file = args.out_file
    num_threads = int(args.num_threads)
    keep_tmp = args.keep_tmp
    multi = bool(args.multi)

    # Both mash distance thresholds must lie in [0, 1].
    for distance in (args.primary_cluster_dist, args.secondary_cluster_dist):
        if not 0 <= distance <= 1:
            logger.error(
                'Error distance thresholds must be between 0 - 1: {}'.format(
                    distance))
            sys.exit(-1)
    primary_distance = float(args.primary_cluster_dist)

    if database_dir == default_database_dir:
        mob_ref = args.plasmid_mob
        mash_db = args.plasmid_mash_db
        replicon_ref = args.plasmid_replicons
        plasmid_meta = args.plasmid_meta
        mpf_ref = args.plasmid_mpf
        plasmid_orit = args.plasmid_orit
        verify_init(logger, database_dir)
    else:
        mob_ref = os.path.join(database_dir, 'mob.proteins.faa')
        mash_db = os.path.join(database_dir, 'ncbi_plasmid_full_seqs.fas.msh')
        replicon_ref = os.path.join(database_dir, 'rep.dna.fas')
        plasmid_meta = os.path.join(database_dir, 'clusters.txt')
        mpf_ref = os.path.join(database_dir, 'mpf.proteins.faa')
        plasmid_orit = os.path.join(database_dir, 'orit.fas')

    lit_plasmid_taxonomy_file = os.path.join(
        database_dir, "host_range_literature_plasmidDB.txt")
    ncbi_plasmid_taxonomy_file = plasmid_meta

    fixed_fasta = os.path.join(tmp_dir, 'fixed.input.fasta')
    replicon_blast_results = os.path.join(tmp_dir,
                                          'replicon_blast_results.txt')
    mob_blast_results = os.path.join(tmp_dir, 'mobtyper_blast_results.txt')
    mpf_blast_results = os.path.join(tmp_dir, 'mpf_blast_results.txt')
    orit_blast_results = os.path.join(tmp_dir, 'orit_blast_results.txt')
    repetitive_blast_results = os.path.join(tmp_dir,
                                            'repetitive_blast_results.txt')

    # Remove results left over from a previous run in the same directory.
    for stale_file in (mob_blast_results, mpf_blast_results,
                       orit_blast_results, replicon_blast_results):
        if os.path.isfile(stale_file):
            os.remove(stale_file)

    # Input numeric params. The MPF search reuses the relaxase thresholds;
    # the replicon thresholds are validated below, and no separate oriT
    # thresholds are passed to identify_biomarkers.
    min_rep_ident = float(args.min_rep_ident)
    min_mob_ident = float(args.min_mob_ident)
    min_mpf_ident = float(args.min_mob_ident)

    for param, value in (('min_rep_ident', min_rep_ident),
                         ('min_mob_ident', min_mob_ident)):
        if value < 60:
            logger.error(
                "Error: {} is too low, please specify an integer between 60 - 100"
                .format(param))
            sys.exit(-1)
        if value > 100:
            logger.error(
                "Error: {} is too high, please specify an integer between 60 - 100"
                .format(param))
            sys.exit(-1)

    min_rep_cov = float(args.min_rep_cov)
    min_mob_cov = float(args.min_mob_cov)
    min_mpf_cov = float(args.min_mob_cov)

    for param, value in (('min_rep_cov', min_rep_cov),
                         ('min_mob_cov', min_mob_cov)):
        if value < 60:
            logger.error(
                "Error: {} is too low, please specify an integer between 60 - 100"
                .format(param))
            sys.exit(-1)
        if value > 100:
            logger.error(
                "Error: {} is too high, please specify an integer between 60 - 100"
                .format(param))
            sys.exit(-1)

    min_rep_evalue = float(args.min_rep_evalue)
    min_mob_evalue = float(args.min_mob_evalue)
    min_mpf_evalue = float(args.min_mob_evalue)

    for param, value in (('min_rep_evalue', min_rep_evalue),
                         ('min_mob_evalue', min_mob_evalue)):
        if value > 1:
            logger.error(
                "Error: {} is too high, please specify an float evalue between 0 to 1"
                .format(param))
            sys.exit(-1)

    check_dependencies(logger)

    needed_dbs = [replicon_ref, mob_ref, mash_db, mpf_ref]
    for db in needed_dbs:
        if not os.path.isfile(db):
            logger.info('Warning! Needed database missing "{}"'.format(db))
            mob_suite.mob_init.main()

    if not os.path.isdir(tmp_dir):
        os.mkdir(tmp_dir, 0o755)

    # Test that ETE3 db is ok and lock process check.
    # The helpers below are handed the logging module, exactly as before.
    dbstatus = ETE3_db_status_check(1, ETE3_LOCK_FILE, ETE3DBTAXAFILE, logging)
    if dbstatus == False:
        logger.error(
            "Exiting due to lock file not removed: {}".format(ETE3_LOCK_FILE))
        sys.exit(-1)

    # Get cluster information
    reference_sequence_meta = read_sequence_info(plasmid_meta,
                                                 MOB_CLUSTER_INFO_HEADER)

    # Initialise the master record for every contig
    fix_fasta_header(input_fasta, fixed_fasta)
    contig_seqs = read_fasta_dict(fixed_fasta)
    contig_info = {}
    for contig_id in contig_seqs:
        seq = contig_seqs[contig_id]
        info = {feature: '' for feature in MOB_TYPER_REPORT_HEADER}
        info['md5'] = calc_md5(seq)
        info['gc'] = GC(seq)
        info['size'] = len(seq)
        info['contig_id'] = contig_id
        info['sample_id'] = sample_id
        contig_info[contig_id] = info

    # Makeblastdb
    blast_runner = BlastRunner(fixed_fasta, tmp_dir)
    build_success = blast_runner.makeblastdb(fixed_fasta,
                                             'nucl',
                                             logging=logging)
    if build_success == False:
        logger.error(
            "Could not build blast database, check error messages..cannot continue"
        )
        sys.exit(-1)

    # Run individual marker blasts.
    # NOTE(review): num_threads is fixed to 1 here although a thread count is
    # parsed above -- confirm this is intentional.
    contig_info = identify_biomarkers(
        contig_info, fixed_fasta, tmp_dir, 25, logging,
        replicon_ref, min_rep_ident, min_rep_cov, min_rep_evalue,
        replicon_blast_results,
        mob_ref, min_mob_ident, min_mob_cov, min_mob_evalue,
        mob_blast_results,
        mpf_ref, min_mpf_ident, min_mpf_cov, min_mpf_evalue,
        mpf_blast_results,
        None, None, None, None,
        plasmid_orit, orit_blast_results, repetitive_blast_results,
        num_threads=1)

    m = mash()
    mobtyper_results = []
    mash_input_fasta = fixed_fasta + '.msh'

    ncbi = dict_from_alt_key_list(
        read_file_to_dict(ncbi_plasmid_taxonomy_file,
                          MOB_CLUSTER_INFO_HEADER,
                          separater="\t"), "sample_id")
    lit = dict_from_alt_key_list(
        read_file_to_dict(lit_plasmid_taxonomy_file,
                          LIT_PLASMID_TAXONOMY_HEADER,
                          separater="\t"), "sample_id")

    # The two modes differ only in whether sequences are sketched individually.
    m.mashsketch(input_fasta=fixed_fasta,
                 output_path=mash_input_fasta,
                 sketch_ind=multi,
                 num_threads=num_threads)
    mash_results = parseMash(
        m.run_mash(reference_db=mash_db,
                   input_fasta=mash_input_fasta,
                   table=False,
                   num_threads=num_threads))

    if multi:
        # One report record per sequence that mash returned results for.
        for seq_id in mash_results:
            info = contig_info[seq_id]
            record = {
                field: info[field] if field in info else ''
                for field in MOB_TYPER_REPORT_HEADER
            }
            record['sample_id'] = seq_id
            record['num_contigs'] = 1

            distances = OrderedDict(
                sorted(mash_results[seq_id].items(),
                       key=itemgetter(1),
                       reverse=False))

            # Use the closest neighbour that has cluster metadata.
            for mash_neighbor_id in distances:
                if mash_neighbor_id not in reference_sequence_meta:
                    continue
                neighbor_meta = reference_sequence_meta[mash_neighbor_id]
                record['mash_nearest_neighbor'] = mash_neighbor_id
                record['mash_neighbor_distance'] = distances[mash_neighbor_id]
                record['primary_cluster_id'] = neighbor_meta[
                    'primary_cluster_id']
                record['secondary_cluster_id'] = neighbor_meta[
                    'secondary_cluster_id']
                record['mash_neighbor_identification'] = neighbor_meta[
                    'organism']
                break

            mobtyper_results.append(record)
    else:
        # A single record describing the whole input file.
        record = {field: '' for field in MOB_TYPER_REPORT_HEADER}
        record['sample_id'] = sample_id

        fasta_seq_stats = calcFastaStats(fixed_fasta)
        record['md5'] = fasta_seq_stats['md5']
        record['total_length'] = fasta_seq_stats['size']
        record['num_contigs'] = fasta_seq_stats['num_seq']
        record['gc'] = fasta_seq_stats['gc_content']
        record['mash_nearest_neighbor'] = '-'
        record['mash_neighbor_distance'] = 1
        record['primary_cluster_id'] = '-'
        record['secondary_cluster_id'] = '-'
        record['mash_neighbor_identification'] = '-'

        for seq_id in mash_results:
            distances = OrderedDict(
                sorted(mash_results[seq_id].items(),
                       key=itemgetter(1),
                       reverse=False))
            if not distances:
                # No mash hit at all: keep the '-' placeholders.
                continue
            # Only the single closest neighbour is considered in this mode.
            mash_neighbor_id = next(iter(distances))
            if mash_neighbor_id not in reference_sequence_meta:
                continue
            neighbor_meta = reference_sequence_meta[mash_neighbor_id]
            record['mash_nearest_neighbor'] = mash_neighbor_id
            record['mash_neighbor_distance'] = distances[mash_neighbor_id]
            record['primary_cluster_id'] = neighbor_meta['primary_cluster_id']
            record['secondary_cluster_id'] = neighbor_meta[
                'secondary_cluster_id']
            record['mash_neighbor_identification'] = neighbor_meta['organism']

        # Pool the per-contig marker calls into the file-level record.
        for field in ('rep_type(s)', 'rep_type_accession(s)',
                      'relaxase_type(s)', 'relaxase_type_accession(s)',
                      'mpf_type', 'mpf_type_accession(s)',
                      'orit_type(s)', 'orit_accession(s)'):
            record[field] = [contig_info[seq_id][field]
                             for seq_id in contig_info]

        # Flatten comma-separated values into lists of non-empty items.
        for field in record:
            value = record[field]
            if value is None:
                continue
            parts = []
            if isinstance(value, list):
                for item in value:
                    parts += item.split(',')
            elif isinstance(value, str) and len(value) > 0:
                parts += value.split(',')
            if len(parts) > 0:
                record[field] = [part for part in parts if len(part) > 0]

        mobtyper_results.append(record)

    # (types field, accessions field) pairs handed to sort_biomarkers.
    marker_pairs = (
        ('rep_type(s)', 'rep_type_accession(s)'),
        ('relaxase_type(s)', 'relaxase_type_accession(s)'),
        ('mpf_type', 'mpf_type_accession(s)'),
        ('orit_type(s)', 'orit_accession(s)'),
    )

    # Records are dicts mutated in place, so no write-back into the list is
    # needed.
    for record in mobtyper_results:
        bio_markers = sort_biomarkers({
            index: {
                'types': record[types_field],
                'acs': record[acs_field]
            }
            for index, (types_field, acs_field) in enumerate(marker_pairs)
        })
        for index, (types_field, acs_field) in enumerate(marker_pairs):
            record[types_field] = bio_markers[index]['types']
            record[acs_field] = bio_markers[index]['acs']

        # The cluster id is only trusted when the neighbour is numerically
        # within the primary distance threshold.
        neighbor_distance = record['mash_neighbor_distance']
        if isinstance(neighbor_distance,
                      (float, int)) and neighbor_distance <= primary_distance:
            mob_cluster_id = record['primary_cluster_id']
        else:
            mob_cluster_id = None

        # Marker fields are lists when contigs were merged into one record and
        # strings otherwise; normalise to strings before splitting them again.
        if isinstance(record['rep_type(s)'], list):
            record['rep_type(s)'] = ",".join(record['rep_type(s)'])
        if isinstance(record['relaxase_type_accession(s)'], list):
            record['relaxase_type_accession(s)'] = ",".join(
                record['relaxase_type_accession(s)'])

        host_range = hostrange(record['rep_type(s)'].split(','),
                               record['relaxase_type_accession(s)'].split(','),
                               mob_cluster_id, ncbi, lit)
        for field in host_range:
            record[field] = host_range[field]

        if isinstance(record['mpf_type'], list):
            record['mpf_type'] = determine_mpf_type(record['mpf_type'])
        elif isinstance(record['mpf_type'], str):
            record['mpf_type'] = determine_mpf_type(
                record['mpf_type'].split(','))

        for field in record:
            if isinstance(record[field], list):
                record[field] = ",".join(record[field])

        record['predicted_mobility'] = 'non-mobilizable'
        if len(record['relaxase_type(s)']) > 0 and len(record['mpf_type']):
            record['predicted_mobility'] = 'conjugative'
        elif len(record['relaxase_type(s)']) > 0 or len(
                record['orit_type(s)']) > 0:
            record['predicted_mobility'] = 'mobilizable'

    writeReport(mobtyper_results, MOB_TYPER_REPORT_HEADER, report_file)

    if not keep_tmp:
        shutil.rmtree(tmp_dir)

    logger.info(
        "MOB-typer completed and results written to {}".format(report_file))
def _hits_by_accession(contig_hits):
    """Map accession -> type for every 'accession|type' hit of every contig."""
    found = dict()
    for contig_id in contig_hits:
        for hit in contig_hits[contig_id]:
            acs, hit_type = hit.split('|')
            found[acs] = hit_type
    return found


def _sorted_by_type(found):
    """Return the accession -> type mapping as an OrderedDict sorted by type."""
    return OrderedDict(sorted(found.items(), key=itemgetter(1), reverse=False))


def main():
    """Run MOB-typer on one fasta file and write a tab-separated report.

    Steps, in order:
      1. Parse the command line, validate the input/output locations and the
         blast identity/coverage/e-value thresholds.
      2. Resolve the reference database paths and call
         ``mob_suite.mob_init.main()`` for any needed file that is missing.
      3. Blast the input against the replicon, relaxase, MPF and oriT
         references and collect the hits by accession.
      4. Find the closest reference plasmid with mash.
      5. When ``--host_range_detailed`` is set and replicons or relaxases were
         found, compute the host range, render the trees and write the
         host-range reports.
      6. Write the main report to ``mobtyper_<input file name>_report.txt`` in
         the output directory.

    The process exits with status -1 when a required argument is missing, the
    fasta file does not exist or a threshold is out of range.
    """
    default_database_dir = os.path.join(
        os.path.dirname(os.path.realpath(__file__)), 'databases')
    args = parse_args()

    logger = init_console_logger(3 if args.debug else 2)
    logger.info('Running Mob-typer version {}'.format(__version__))

    if not args.outdir:
        logger.error('Error, no output directory specified, please specify one')
        sys.exit(-1)

    if not args.infile:
        logger.error('Error, no fasta specified, please specify one')
        sys.exit(-1)

    if not os.path.isfile(args.infile):
        logger.error('Error, fasta file does not exist')
        sys.exit(-1)

    if not os.path.isdir(args.outdir):
        os.mkdir(args.outdir, 0o755)

    if not isinstance(args.num_threads, int):
        # Non-fatal: int() below still accepts numeric strings and raises on
        # anything else.
        logger.info(
            'Error number of threads must be an integer, you specified "{}"'.
            format(args.num_threads))

    database_dir = os.path.abspath(args.database_directory)
    verify_init(logger, database_dir)

    # Script arguments
    input_fasta = args.infile
    out_dir = args.outdir
    num_threads = int(args.num_threads)
    keep_tmp = args.keep_tmp

    if database_dir == default_database_dir:
        mob_ref = args.plasmid_mob
        mpf_ref = args.plasmid_mpf
        orit_ref = args.plasmid_orit
        mash_db = args.plasmid_mash_db
        replicon_ref = args.plasmid_replicons
    else:
        mob_ref = os.path.join(database_dir, 'mob.proteins.faa')
        mpf_ref = os.path.join(database_dir, 'mpf.proteins.faa')
        orit_ref = os.path.join(database_dir, 'orit.fas')
        mash_db = os.path.join(database_dir, 'ncbi_plasmid_full_seqs.fas.msh')
        replicon_ref = os.path.join(database_dir, 'rep.dna.fas')

    tmp_dir = os.path.join(out_dir, '__tmp')
    file_id = os.path.basename(input_fasta)
    # Common prefix of the host-range output files.
    report_prefix = os.path.join(out_dir, file_id)

    fixed_fasta = os.path.join(tmp_dir, 'fixed.input.fasta')
    replicon_blast_results = os.path.join(tmp_dir,
                                          'replicon_blast_results.txt')
    mob_blast_results = os.path.join(tmp_dir, 'mobtyper_blast_results.txt')
    mpf_blast_results = os.path.join(tmp_dir, 'mpf_blast_results.txt')
    orit_blast_results = os.path.join(tmp_dir, 'orit_blast_results.txt')

    # Remove results left over from a previous run in the same directory.
    for stale_file in (mob_blast_results, mpf_blast_results,
                       orit_blast_results, replicon_blast_results):
        if os.path.isfile(stale_file):
            os.remove(stale_file)

    report_file = os.path.join(out_dir, 'mobtyper_' + file_id + '_report.txt')
    mash_file = os.path.join(tmp_dir, 'mash_' + file_id + '.txt')

    # Input numeric params. Every threshold that is passed to a blast search
    # is validated, including the MPF ones.
    min_rep_ident = float(args.min_rep_ident)
    min_mob_ident = float(args.min_mob_ident)
    min_ori_ident = float(args.min_ori_ident)
    min_mpf_ident = float(args.min_mpf_ident)

    for param, value in (('min_rep_ident', min_rep_ident),
                         ('min_mob_ident', min_mob_ident),
                         ('min_ori_ident', min_ori_ident),
                         ('min_mpf_ident', min_mpf_ident)):
        if value < 60:
            logger.error(
                "Error: {} is too low, please specify an integer between 60 - 100"
                .format(param))
            sys.exit(-1)
        if value > 100:
            logger.error(
                "Error: {} is too high, please specify an integer between 60 - 100"
                .format(param))
            sys.exit(-1)

    min_rep_cov = float(args.min_rep_cov)
    min_mob_cov = float(args.min_mob_cov)
    min_ori_cov = float(args.min_ori_cov)
    min_mpf_cov = float(args.min_mpf_cov)

    for param, value in (('min_rep_cov', min_rep_cov),
                         ('min_mob_cov', min_mob_cov),
                         ('min_ori_cov', min_ori_cov),
                         ('min_mpf_cov', min_mpf_cov)):
        if value < 60:
            logger.error(
                "Error: {} is too low, please specify an integer between 60 - 100"
                .format(param))
            sys.exit(-1)
        if value > 100:
            logger.error(
                "Error: {} is too high, please specify an integer between 60 - 100"
                .format(param))
            sys.exit(-1)

    min_rep_evalue = float(args.min_rep_evalue)
    min_mob_evalue = float(args.min_mob_evalue)
    min_ori_evalue = float(args.min_ori_evalue)
    min_mpf_evalue = float(args.min_mpf_evalue)

    for param, value in (('min_rep_evalue', min_rep_evalue),
                         ('min_mob_evalue', min_mob_evalue),
                         ('min_ori_evalue', min_ori_evalue),
                         ('min_mpf_evalue', min_mpf_evalue)):
        if value > 1:
            logger.error(
                "Error: {} is too high, please specify an float evalue between 0 to 1"
                .format(param))
            sys.exit(-1)

    check_dependencies(logger)

    needed_dbs = [replicon_ref, mob_ref, mash_db, mpf_ref]
    for db in needed_dbs:
        if not os.path.isfile(db):
            logger.info('Warning! Needed database missing "{}"'.format(db))
            mob_suite.mob_init.main()

    if not os.path.isdir(tmp_dir):
        os.mkdir(tmp_dir, 0o755)

    fix_fasta_header(input_fasta, fixed_fasta)

    # Run individual marker blasts
    logger.info('Running replicon blast on {}'.format(replicon_ref))
    found_replicons = _hits_by_accession(
        getRepliconContigs(
            replicon_blast(replicon_ref,
                           fixed_fasta,
                           min_rep_ident,
                           min_rep_cov,
                           min_rep_evalue,
                           tmp_dir,
                           replicon_blast_results,
                           num_threads=num_threads)))

    logger.info('Running relaxase blast on {}'.format(mob_ref))
    found_mob = _hits_by_accession(
        getRepliconContigs(
            mob_blast(mob_ref,
                      fixed_fasta,
                      min_mob_ident,
                      min_mob_cov,
                      min_mob_evalue,
                      tmp_dir,
                      mob_blast_results,
                      num_threads=num_threads)))

    logger.info('Running mpf blast on {}'.format(mpf_ref))
    found_mpf = _hits_by_accession(
        getRepliconContigs(
            mob_blast(mpf_ref,
                      fixed_fasta,
                      min_mpf_ident,
                      min_mpf_cov,
                      min_mpf_evalue,
                      tmp_dir,
                      mpf_blast_results,
                      num_threads=num_threads)))

    logger.info('Running orit blast on {}'.format(orit_ref))
    found_orit = _hits_by_accession(
        getRepliconContigs(
            replicon_blast(orit_ref,
                           fixed_fasta,
                           min_ori_ident,
                           min_ori_cov,
                           min_ori_evalue,
                           tmp_dir,
                           orit_blast_results,
                           num_threads=num_threads)))

    # Get closest neighbor by mash distance in the entire plasmid database.
    # The handle is closed before the file is read back.
    m = mash()
    with open(mash_file, 'w') as mashfile_handle:
        m.run_mash(mash_db, fixed_fasta, mashfile_handle)
    mash_results = m.read_mash(mash_file)
    mash_top_hit = getMashBestHit(mash_results)

    # GET HOST RANGE
    host_range_literature_report_df = pandas.DataFrame()
    if args.host_range_detailed and found_replicons:
        (host_range_refseq_rank, host_range_refseq_name, taxids, taxids_df,
         stats_host_range) = getRefSeqHostRange(
             replicon_name_list=list(found_replicons.values()),
             mob_cluster_id_list=[mash_top_hit['clustid']],
             relaxase_name_acc_list=None,
             relaxase_name_list=None,
             matchtype="loose_match",
             hr_obs_data=loadHostRangeDB())

        if '-' in taxids:
            host_range_refseq_rank = None
            host_range_refseq_name = None
        else:
            refseqtree = getTaxonomyTree(taxids)  # refseq tree
            renderTree(tree=refseqtree,
                       filename_prefix=report_prefix + "_refseqhostrange_")

        # Literature report summary dataframe (might be more than 1 row if
        # multiple replicons are present)
        host_range_literature_report_df, littaxids = getLiteratureBasedHostRange(
            replicon_names=list(found_replicons.values()),
            plasmid_lit_db=loadliteratureplasmidDB(),
            input_seq=args.infile)

        if littaxids:
            littree = getTaxonomyTree(littaxids)  # literature tree
            renderTree(tree=littree,
                       filename_prefix=report_prefix + "_literaturehostrange_")

        # Write host-range reports
        writeOutHostRangeReports(
            filename_prefix=report_prefix,
            samplename=file_id,
            replicon_name_list=list(found_replicons.values()),
            mob_cluster_id_list=[mash_top_hit['clustid']],
            relaxase_name_acc_list=None,
            relaxase_name_list=None,
            convergance_rank=host_range_refseq_rank,
            convergance_taxonomy=host_range_refseq_name,
            stats_host_range_dict=stats_host_range,
            literature_hr_report=host_range_literature_report_df)

    elif args.host_range_detailed and found_mob:
        # No replicon hit: fall back to the relaxase accession numbers.
        (host_range_refseq_rank, host_range_refseq_name, taxids, taxids_df,
         stats_host_range) = getRefSeqHostRange(
             replicon_name_list=None,
             mob_cluster_id_list=[mash_top_hit['clustid']],
             relaxase_name_acc_list=found_mob.keys(),
             relaxase_name_list=None,
             matchtype="loose_match",
             hr_obs_data=loadHostRangeDB())

        refseqtree = getTaxonomyTree(taxids)  # refseq tree
        renderTree(tree=refseqtree,
                   filename_prefix=report_prefix + "_refseqhostrange_")

        writeOutHostRangeReports(
            filename_prefix=report_prefix,
            samplename=file_id,
            replicon_name_list=None,
            mob_cluster_id_list=[mash_top_hit['clustid']],
            relaxase_name_acc_list=None,
            relaxase_name_list=None,
            convergance_rank=host_range_refseq_rank,
            convergance_taxonomy=host_range_refseq_name,
            stats_host_range_dict=stats_host_range)
    else:
        host_range_refseq_rank = None
        host_range_refseq_name = None
    # END HOST RANGE MODULE

    # Report columns: hits sorted by type, "-" when nothing was found.
    if len(found_replicons) > 0:
        found_replicons = _sorted_by_type(found_replicons)
        rep_types = ",".join(found_replicons.values())
        rep_acs = ",".join(found_replicons.keys())
    else:
        rep_types = "-"
        rep_acs = "-"

    if len(found_mob) > 0:
        found_mob = _sorted_by_type(found_mob)
        mob_types = ",".join(found_mob.values())
        mob_acs = ",".join(found_mob.keys())
    else:
        mob_types = "-"
        mob_acs = "-"

    if len(found_mpf) > 0:
        found_mpf = _sorted_by_type(found_mpf)
        mpf_type = determine_mpf_type(found_mpf)
        mpf_acs = ",".join(found_mpf.keys())
    else:
        mpf_type = "-"
        mpf_acs = "-"

    if len(found_orit) > 0:
        found_orit = _sorted_by_type(found_orit)
        orit_types = ",".join(found_orit.values())
        orit_acs = ",".join(found_orit.keys())
    else:
        orit_types = "-"
        orit_acs = "-"

    stats = calcFastaStats(fixed_fasta)

    # A relaxase or oriT makes the plasmid mobilizable; a relaxase together
    # with an MPF system makes it conjugative.
    predicted_mobility = 'Non-mobilizable'
    if mob_acs != '-' or orit_acs != '-':
        predicted_mobility = 'Mobilizable'
    if mob_acs != '-' and mpf_acs != '-':
        predicted_mobility = 'Conjugative'

    main_report_data_dict = collections.OrderedDict({
        # Strip a fasta extension only when it is a whole dot-delimited
        # component. The longest alternative comes first so that ".fas" is not
        # cut down to a stray "s".
        "file_id": re.sub(r"\.(fasta|fas|fa)(?=\.|$)", "", file_id),
        "num_contigs": stats['num_seq'],
        "total_length": stats['size'],
        "gc": stats['gc_content'],
        "rep_type(s)": rep_types,
        "rep_type_accession(s)": rep_acs,
        "relaxase_type(s)": mob_types,
        "relaxase_type_accession(s)": mob_acs,
        "mpf_type": mpf_type,
        "mpf_type_accession(s)": mpf_acs,
        "orit_type(s)": orit_types,
        "orit_accession(s)": orit_acs,
        "PredictedMobility": predicted_mobility,
        "mash_nearest_neighbor": mash_top_hit['top_hit'],
        "mash_neighbor_distance": mash_top_hit['mash_hit_score'],
        "mash_neighbor_cluster": mash_top_hit['clustid'],
        "NCBI-HR-rank": "-",
        "NCBI-HR-Name": "-",
        "LitRepHRPlasmClass": "-",
        "LitPredDBHRRank": "-",
        "LitPredDBHRRankSciName": "-",
        "LitRepHRRankInPubs": "-",
        "LitRepHRNameInPubs": "-",
        "LitMeanTransferRate": "-",
        "LitClosestRefAcc": "-",
        "LitClosestRefDonorStrain": "-",
        "LitClosestRefRecipientStrain": "-",
        "LitClosestRefTransferRate": "-",
        "LitClosestConjugTemp": "-",
        "LitPMIDs": "-",
        "LitPMIDsNumber": "-"
    })

    main_report_mobtyper_df = pandas.DataFrame(
        columns=list(main_report_data_dict.keys()))

    if host_range_refseq_rank and host_range_refseq_name:
        main_report_data_dict.update({
            "NCBI-HR-rank": host_range_refseq_rank,
            "NCBI-HR-Name": host_range_refseq_name
        })

    if not host_range_literature_report_df.empty:
        if host_range_literature_report_df.shape[0] >= 2:
            # Collapse a literature report of two or more rows
            host_range_literature_report_df = collapseLiteratureReport(
                host_range_literature_report_df)

        # (report column, literature dataframe column); the first row is used.
        # NOTE(review): "LitClosestMashDist" has no "-" placeholder above, so
        # the column only appears when literature data is present.
        literature_columns = (
            ("LitRepHRPlasmClass", "LiteratureReportedHostRangePlasmidClass"),
            ("LitPredDBHRRank", "LiteraturePredictedHostRangeTreeRank"),
            ("LitPredDBHRRankSciName",
             "LiteraturePredictedHostRangeTreeRankSciName"),
            ("LitRepHRRankInPubs", "LiteratureReportedHostRangeRankInPubs"),
            ("LitRepHRNameInPubs", "LiteratureReportedHostRangeNameInPubs"),
            ("LitMeanTransferRate", "LiteratureMeanTransferRateRange"),
            ("LitClosestRefAcc", "LiteratureClosestRefrencePlasmidAcc"),
            ("LitClosestMashDist", "LiteratureClosestReferenceMashDistance"),
            ("LitClosestRefDonorStrain",
             "LiteratureClosestReferenceDonorStrain"),
            ("LitClosestRefRecipientStrain",
             "LiteratureClosestReferenceRecipientStrain"),
            ("LitClosestRefTransferRate",
             "LiteratureClosestReferenceTransferRate"),
            ("LitClosestConjugTemp",
             "LiteratureClosestReferenceConjugationTemperature"),
            ("LitPMIDs", "LiteraturePMIDs"),
            ("LitPMIDsNumber", "LiteraturePublicationsNumber"),
        )
        for report_key, literature_key in literature_columns:
            main_report_data_dict[report_key] = host_range_literature_report_df[
                literature_key].values[0]

    # DataFrame.append was removed in pandas 2.0; concat is the equivalent.
    main_report_mobtyper_df = pandas.concat(
        [main_report_mobtyper_df,
         pandas.DataFrame([main_report_data_dict])],
        sort=False)
    main_report_mobtyper_df.to_csv(report_file,
                                   sep="\t",
                                   mode="w",
                                   encoding="UTF-8",
                                   index=False)

    if not keep_tmp:
        shutil.rmtree(tmp_dir)
    logger.info("Run completed")