def do_run(args, data_path, match_score, vfs_list, cons_run):
    """
    Perform a SeqFindr run

    Either BLASTs every input FASTA file against args.seqs_of_interest, or
    (when args.existing_data is set) re-parses the BLAST XML files found
    under <existing_data>/BLAST_results/, and builds one matrix row per
    strain.

    :param args: the argparse args. Reads args.index_file,
                 args.existing_data, args.seqs_of_interest, args.tol and
                 args.careful (args is also handed to blast.run_BLAST)
    :param data_path: location given to util.get_fasta_files to collect the
                      input files
    :param match_score: passed unchanged to build_matrix_row
    :param vfs_list: passed unchanged to build_matrix_row
    :param cons_run: truthy for a consensus run. Selects the existing XML
                     files whose path contains "cons_DB=" (falsy selects
                     the others) and is forwarded to blast.run_BLAST

    :rtype: a tuple (matrix, y_label). Every matrix row is the list returned
            by build_matrix_row with the strain id inserted at index 0 &
            y_label holds the strain ids in row order
    """
    matrix, y_label = [], []
    in_files = util.get_fasta_files(data_path)
    # Reorder if requested
    if args.index_file is not None:
        in_files = util.order_inputs(args.index_file, in_files)
    reuse_existing = args.existing_data is not None
    # Handle an existing run
    if reuse_existing:
        results_glob = (os.path.abspath(args.existing_data)
                        + "/BLAST_results/*")
        # Keep only the XML files of the requested kind (consensus or not)
        blast_xml = [xml for xml in glob.glob(results_glob)
                     if ("cons_DB=" in xml) == bool(cons_run)]
        # Remember which XML file every strain id was extracted from
        known = [(xml.split("ID=")[-1].split("_blast.xml")[0], xml)
                 for xml in blast_xml]
        # Walk the inputs in their (possibly re-ordered) order & record the
        # label together with its own XML file. Building both lists in the
        # same pass keeps y_label[idx] and in_files[idx] referring to the
        # same strain, whatever order glob returned the XML files in
        exist_ord = []
        for fasta in in_files:
            for sid, xml in known:
                if sid in fasta:
                    y_label.append(sid)
                    exist_ord.append(xml)
                    break
        in_files = exist_ord
    for idx, subject in enumerate(in_files):
        if reuse_existing:
            # subject is already a BLAST XML file
            strain_id = y_label[idx]
            blast_result = subject
        else:
            strain_id = blast.make_BLAST_database(subject)
            y_label.append(strain_id)
            database = os.path.basename(subject)
            blast_result = blast.run_BLAST(
                args.seqs_of_interest,
                os.path.join(os.getcwd(), "DBs/" + database),
                args, cons_run)
        accepted_hits = blast.parse_BLAST(blast_result, float(args.tol),
                                          args.careful)
        row = build_matrix_row(vfs_list, accepted_hits, match_score)
        row.insert(0, strain_id)
        matrix.append(row)
    return matrix, y_label
def do_run(args, data_path, match_score, vfs_list, cons_run):
    """
    Perform a SeqFindr run

    Either BLASTs every input FASTA file against args.seqs_of_interest, or
    (when args.existing_data is set) re-parses the BLAST XML files found
    under <existing_data>/BLAST_results/, and builds one matrix row per
    strain.

    :param args: the argparse args. Reads args.index_file,
                 args.existing_data, args.seqs_of_interest, args.tol and
                 args.careful (args is also handed to blast.run_BLAST)
    :param data_path: location given to util.get_fasta_files to collect the
                      input files
    :param match_score: passed unchanged to build_matrix_row
    :param vfs_list: passed unchanged to build_matrix_row
    :param cons_run: truthy for a consensus run. Selects the existing XML
                     files whose path contains "cons_DB=" (falsy selects
                     the others) and is forwarded to blast.run_BLAST

    :rtype: a tuple (matrix, y_label). Every matrix row is the list returned
            by build_matrix_row with the strain id inserted at index 0 &
            y_label holds the strain ids in row order
    """
    matrix, y_label = [], []
    in_files = util.get_fasta_files(data_path)
    # Reorder if requested
    if args.index_file is not None:
        in_files = util.order_inputs(args.index_file, in_files)
    reuse_existing = args.existing_data is not None
    # Handle an existing run
    if reuse_existing:
        results_glob = (os.path.abspath(args.existing_data)
                        + "/BLAST_results/*")
        # Keep only the XML files of the requested kind (consensus or not)
        blast_xml = [xml for xml in glob.glob(results_glob)
                     if ("cons_DB=" in xml) == bool(cons_run)]
        # Remember which XML file every strain id was extracted from
        known = [(xml.split("ID=")[-1].split("_blast.xml")[0], xml)
                 for xml in blast_xml]
        # Walk the inputs in their (possibly re-ordered) order & record the
        # label together with its own XML file. Building both lists in the
        # same pass keeps y_label[idx] and in_files[idx] referring to the
        # same strain, whatever order glob returned the XML files in
        exist_ord = []
        for fasta in in_files:
            for sid, xml in known:
                if sid in fasta:
                    y_label.append(sid)
                    exist_ord.append(xml)
                    break
        in_files = exist_ord
    for idx, subject in enumerate(in_files):
        if reuse_existing:
            # subject is already a BLAST XML file
            strain_id = y_label[idx]
            blast_result = subject
        else:
            strain_id = blast.make_BLAST_database(subject)
            y_label.append(strain_id)
            database = os.path.basename(subject)
            blast_result = blast.run_BLAST(
                args.seqs_of_interest,
                os.path.join(os.getcwd(), "DBs/" + database),
                args, cons_run)
        accepted_hits = blast.parse_BLAST(blast_result, float(args.tol),
                                          args.careful)
        row = build_matrix_row(vfs_list, accepted_hits, match_score)
        row.insert(0, strain_id)
        matrix.append(row)
    return matrix, y_label
def _write_trimmed_fasta(src, dest, n_bases):
    """
    Copy the FASTA file src to dest, dropping n_bases from both ends of
    every record

    :param src: path of the FASTA file to read
    :param dest: path of the FASTA file to write (overwritten)
    :param n_bases: non-negative number of bases removed from each end
    """
    # Plain "r" rather than "rU": the "U" flag was removed in Python 3.11
    with open(src, "r") as fin, open(dest, 'w') as fout:
        for rec in SeqIO.parse(fin, "fasta"):
            # Use an explicit end index. seq[n:-n] would collapse to an
            # empty sequence when n == 0 (as -0 == 0)
            rec.seq = rec.seq[n_bases:len(rec.seq) - n_bases]
            SeqIO.write(rec, fout, "fasta")


def strip_bases(args):
    """
    Strip the 1st and last 'N' bases from mapping consensuses

    Uses:
        * args.cons
        * args.seqs_of_interest
        * args.strip

    To avoid the effects of lead in and lead out coverage resulting in
    uncalled bases

    The trimmed consensuses are written to a 'stripped' sub-directory of
    args.cons & the trimmed sequences of interest to a sibling file with
    '_trimmed' inserted before the extension. args is modified in place:
    args.cons & args.seqs_of_interest point to the trimmed locations and
    args.strip is stored as an int.

    :param args: the argparse args containing args.strip value
    :type args: argparse args

    :raises ValueError: if args.strip is not an integer or is negative
    :raises OSError: if the stripped directory can not be created

    :rtype: the updated args to reflect the args.cons &
            args.seqs_of_interest location
    """
    n_bases = int(args.strip)
    if n_bases < 0:
        raise ValueError("Number of bases to strip must be >= 0, got %d"
                         % n_bases)
    # Get in the fasta files in the consensus directory
    fasta_in = util.get_fasta_files(args.cons)
    # Build a stripped directory
    new_cons_dir = os.path.join(args.cons, 'stripped')
    try:
        os.mkdir(new_cons_dir)
    except OSError:
        # Only a pre-existing directory is tolerated. Any other failure
        # (permissions, missing parent...) must not be reported as "exists"
        if not os.path.isdir(new_cons_dir):
            raise
        sys.stderr.write("A stripped directory exists. Overwriting\n")
    # Update the args.cons to the stripped directory
    args.cons = new_cons_dir
    args.strip = n_bases
    # Strip the start and end
    for fa in fasta_in:
        out = os.path.join(args.cons, os.path.basename(fa))
        _write_trimmed_fasta(fa, out, n_bases)
    # Trim the db as well. splitext only looks at the final path component,
    # so dots in directory names or a missing extension are handled
    root, ext = os.path.splitext(args.seqs_of_interest)
    stripdb = root + '_trimmed' + ext
    _write_trimmed_fasta(args.seqs_of_interest, stripdb, n_bases)
    # Update the args.seqs_of_interest
    args.seqs_of_interest = stripdb
    return args
def _write_trimmed_fasta(src, dest, n_bases):
    """
    Copy the FASTA file src to dest, dropping n_bases from both ends of
    every record

    :param src: path of the FASTA file to read
    :param dest: path of the FASTA file to write (overwritten)
    :param n_bases: non-negative number of bases removed from each end
    """
    # Plain "r" rather than "rU": the "U" flag was removed in Python 3.11
    with open(src, "r") as fin, open(dest, 'w') as fout:
        for rec in SeqIO.parse(fin, "fasta"):
            # Use an explicit end index. seq[n:-n] would collapse to an
            # empty sequence when n == 0 (as -0 == 0)
            rec.seq = rec.seq[n_bases:len(rec.seq) - n_bases]
            SeqIO.write(rec, fout, "fasta")


def strip_bases(args):
    """
    Strip the 1st and last 'N' bases from mapping consensuses

    Uses:
        * args.cons
        * args.seqs_of_interest
        * args.strip

    To avoid the effects of lead in and lead out coverage resulting in
    uncalled bases

    The trimmed consensuses are written to a 'stripped' sub-directory of
    args.cons & the trimmed sequences of interest to a sibling file with
    '_trimmed' inserted before the extension. args is modified in place:
    args.cons & args.seqs_of_interest point to the trimmed locations and
    args.strip is stored as an int.

    :param args: the argparse args containing args.strip value
    :type args: argparse args

    :raises ValueError: if args.strip is not an integer or is negative
    :raises OSError: if the stripped directory can not be created

    :rtype: the updated args to reflect the args.cons &
            args.seqs_of_interest location
    """
    n_bases = int(args.strip)
    if n_bases < 0:
        raise ValueError("Number of bases to strip must be >= 0, got %d"
                         % n_bases)
    # Get in the fasta files in the consensus directory
    fasta_in = util.get_fasta_files(args.cons)
    # Build a stripped directory
    new_cons_dir = os.path.join(args.cons, 'stripped')
    try:
        os.mkdir(new_cons_dir)
    except OSError:
        # Only a pre-existing directory is tolerated. Any other failure
        # (permissions, missing parent...) must not be reported as "exists"
        if not os.path.isdir(new_cons_dir):
            raise
        sys.stderr.write("A stripped directory exists. Overwriting\n")
    # Update the args.cons to the stripped directory
    args.cons = new_cons_dir
    args.strip = n_bases
    # Strip the start and end
    for fa in fasta_in:
        out = os.path.join(args.cons, os.path.basename(fa))
        _write_trimmed_fasta(fa, out, n_bases)
    # Trim the db as well. splitext only looks at the final path component,
    # so dots in directory names or a missing extension are handled
    root, ext = os.path.splitext(args.seqs_of_interest)
    stripdb = root + '_trimmed' + ext
    _write_trimmed_fasta(args.seqs_of_interest, stripdb, n_bases)
    # Update the args.seqs_of_interest
    args.seqs_of_interest = stripdb
    return args