def test_find_homologs(self):
    """Check find_homologs output: raw BLAST lines, kept ids, dropped ids."""
    # Build the nucleotide BLAST database that the searches below use.
    system('formatdb -p F -o T -i %s' % self.subjectdb_fp)

    # Record the files formatdb leaves behind in the clean-up list.
    self._paths_to_clean_up.append("formatdb.log")
    db_suffixes = ["nhr", "nin", "nsd", "nsi", "nsq"]
    self._paths_to_clean_up.extend(
        [".".join([self.subjectdb_fp, ext]) for ext in db_suffixes])

    # Settings shared by every find_homologs call in this test; only the
    # query file and the percent_aligned threshold vary between calls.
    shared_settings = dict(e_value=1e-4, max_hits=100, working_dir="./",
                           blast_mat_root=None, wordsize=28, DEBUG=False)

    raw_lines, kept_ids, dropped_ids = find_homologs(
        self.query_fp, self.subjectdb_fp, percent_aligned=0.98,
        **shared_settings)
    self.assertEqual(kept_ids, set(["bth:BT_0001", "hsa:8355"]))
    self.assertEqual(dropped_ids, set([]))
    for idx, observed in enumerate(raw_lines):
        # Comment lines depend on the temp file name, so skip testing them.
        if observed.startswith("#"):
            continue
        self.assertEqual(raw_lines[idx], EXP_BLAST_OUTPUT[idx])

    # Ensure low % alignment seqs are removed
    raw_lines, kept_ids, dropped_ids = find_homologs(
        self.query2_fp, self.subjectdb_fp, percent_aligned=1.00,
        **shared_settings)
    self.assertEqual(kept_ids, set(["bth:BT_0001"]))
    self.assertEqual(dropped_ids, set(["hsa:8355_tweaked"]))

    # Ensure high % alignment seqs are not removed
    raw_lines, kept_ids, dropped_ids = find_homologs(
        self.query2_fp, self.subjectdb_fp, percent_aligned=0.75,
        **shared_settings)
    self.assertEqual(kept_ids, set(["bth:BT_0001", "hsa:8355_tweaked"]))
    self.assertEqual(dropped_ids, set([]))
def main(): option_parser, options, args = parse_command_line_parameters(**script_info) DEBUG = options.verbose check_options(option_parser, options) start_time = time() option_lines = format_options_as_lines(options) if DEBUG: print FORMAT_BAR print "Running with options:" for line in sorted(option_lines): print line print FORMAT_BAR # because the blast app controller uses absolute paths, make sure subject # db path is fully specified subject_db = options.subjectdb if not subject_db.startswith('/'): subject_db = join(getcwd(), subject_db) if not options.no_format_db: # initialize object inpath = FilePath(abspath(options.subjectdb)) subject_dir, subj_file = split(inpath) fdb = FormatDb(WorkingDir=subject_dir) # Currently we do not support protein blasts, but # this would be easy to add in the future... fdb.Parameters['-p'].on('F') # Create indices for record lookup fdb.Parameters['-o'].on('T') # Set input database fdb.Parameters['-i'].on(subject_db) formatdb_cmd = fdb.BaseCommand if DEBUG: print "Formatting db with command: %s" % formatdb_cmd app_result = fdb(subject_db) formatdb_filepaths = [] for v in app_result.values(): try: formatdb_filepaths.append(v.name) except AttributeError: # not a file object, so no path to return pass db_format_time = time() - start_time if DEBUG: print "Formatting subject db took: %2.f seconds" % db_format_time print "formatdb log file written to: %s" % app_result['log'] print FORMAT_BAR else: db_format_time = time() - start_time formatdb_cmd = "None (formatdb not called)" # Check that User-Supplied subjectdb is valid db_ext = [".nhr", ".nin", ".nsd", ".nsi", ".nsq"] formatdb_filepaths = [subject_db + ext for ext in db_ext] if DEBUG: print "Checking that pre-existing formatdb files exist and can be read." 
print "Files to be checked:" for fp in formatdb_filepaths: print fp print FORMAT_BAR try: formatdb_files = [open(db_f, "U") for db_f in formatdb_filepaths] [f.close() for f in formatdb_files] except IOError: if DEBUG: print "Cannot open user-supplied database file:", db_f option_parser.error( """Problem with -d and --no_format_db option combination: Cannot open the following user-supplied database file: %s. Consider running without --no_format_db to let formatdb generate these required files""" % db_f) if DEBUG: print "OK: BLAST Database files exist and can be read." print FORMAT_BAR # Perform BLAST search blast_results, hit_ids, removed_hit_ids = find_homologs(options.querydb, subject_db, options.e_value, options.max_hits, options.working_dir, options.blastmatroot, options.wordsize, options.percent_aligned, DEBUG=DEBUG) blast_time = (time() - start_time) - db_format_time if DEBUG: print "BLAST search took: %2.f minute(s)" % (blast_time / 60.0) print FORMAT_BAR # Create output folder outputdir = options.outputdir try: makedirs(outputdir) except OSError: pass # Record raw blast results raw_blast_results_path = join(outputdir, "raw_blast_results.txt") f = open(raw_blast_results_path, 'w') f.writelines(blast_results) f.close() # Record excluded seqs excluded_seqs_path = join(outputdir, "matching.fna") ids_to_seq_file(hit_ids, options.querydb, excluded_seqs_path, "") # Record included (screened) seqs included_seqs_path = join(outputdir, "non-matching.fna") all_ids = ids_from_fasta_lines(open(options.querydb)) included_ids = set(all_ids) - hit_ids ids_to_seq_file(included_ids, options.querydb, included_seqs_path, "") log_lines = compose_logfile_lines(start_time, db_format_time, blast_time, option_lines, formatdb_cmd, blast_results, options, all_ids, hit_ids, removed_hit_ids, included_ids, DEBUG) log_path = join(outputdir, "sequence_exclusion.log") if DEBUG: print "Writing summary to: %s" % log_path f = open(log_path, 'w') f.writelines(log_lines) f.close() if not 
options.no_clean: if DEBUG: print FORMAT_BAR print "| Cleanup |" print FORMAT_BAR if not options.no_format_db: if options.verbose: print "Cleaning up formatdb files:", formatdb_filepaths remove_files(formatdb_filepaths) else: if options.verbose: print "Formatdb not run...nothing to clean"
def main(): option_parser, options, args = parse_command_line_parameters(**script_info) DEBUG = options.verbose check_options(option_parser, options) start_time = time() option_lines = format_options_as_lines(options) if DEBUG: print FORMAT_BAR print "Running with options:" for line in sorted(option_lines): print line print FORMAT_BAR #because the blast app controller uses absolute paths, make sure subject #db path is fully specified subject_db = options.subjectdb if not subject_db.startswith('/'): subject_db = join(getcwd(), subject_db) if not options.no_format_db: #initialize object inpath = FilePath(abspath(options.subjectdb)) subject_dir, subj_file = split(inpath) fdb = FormatDb(WorkingDir=subject_dir) # Currently we do not support protein blasts, but # this would be easy to add in the future... fdb.Parameters['-p'].on('F') # Create indices for record lookup fdb.Parameters['-o'].on('T') # Set input database fdb.Parameters['-i'].on(subject_db) formatdb_cmd = fdb.BaseCommand if DEBUG: print "Formatting db with command: %s" % formatdb_cmd app_result = fdb(subject_db) formatdb_filepaths = [] for v in app_result.values(): try: formatdb_filepaths.append(v.name) except AttributeError: # not a file object, so no path to return pass db_format_time = time() - start_time if DEBUG: print "Formatting subject db took: %2.f seconds" % db_format_time print "formatdb log file written to: %s" % app_result['log'] print FORMAT_BAR else: db_format_time = time() - start_time formatdb_cmd = "None (formatdb not called)" # Check that User-Supplied subjectdb is valid db_ext = [".nhr", ".nin", ".nsd", ".nsi", ".nsq"] formatdb_filepaths = [subject_db + ext for ext in db_ext] if DEBUG: print "Checking that pre-existing formatdb files exist and can be read." 
print "Files to be checked:" for fp in formatdb_filepaths: print fp print FORMAT_BAR try: formatdb_files = [open(db_f, "U") for db_f in formatdb_filepaths] [f.close() for f in formatdb_files] except IOError: if DEBUG: print "Cannot open user-supplied database file:", db_f option_parser.error( """Problem with -d and --no_format_db option combination: Cannot open the following user-supplied database file: %s. Consider running without --no_format_db to let formatdb generate these required files""" % db_f) if DEBUG: print "OK: BLAST Database files exist and can be read." print FORMAT_BAR # Perform BLAST search blast_results,hit_ids, removed_hit_ids = find_homologs(options.querydb,\ subject_db, options.e_value,options.max_hits,\ options.working_dir,options.blastmatroot, options.wordsize,\ options.percent_aligned, DEBUG=DEBUG) blast_time = (time() - start_time) - db_format_time if DEBUG: print "BLAST search took: %2.f minute(s)" % (blast_time / 60.0) print FORMAT_BAR #Create output folder outputdir = options.outputdir try: makedirs(outputdir) except OSError: pass #Record raw blast results raw_blast_results_path = join(outputdir, "raw_blast_results.txt") f = open(raw_blast_results_path, 'w') f.writelines(blast_results) f.close() #Record excluded seqs excluded_seqs_path = join(outputdir, "matching.fna") ids_to_seq_file(hit_ids, options.querydb, excluded_seqs_path, "") #Record included (screened) seqs included_seqs_path = join(outputdir, "non-matching.fna") all_ids = ids_from_fasta_lines(open(options.querydb)) included_ids = set(all_ids) - hit_ids ids_to_seq_file(included_ids, options.querydb, included_seqs_path, "") log_lines = compose_logfile_lines(start_time, db_format_time, blast_time,\ option_lines,formatdb_cmd,\ blast_results,options,all_ids,\ hit_ids,removed_hit_ids,\ included_ids,DEBUG) log_path = join(outputdir, "sequence_exclusion.log") if DEBUG: print "Writing summary to: %s" % log_path f = open(log_path, 'w') f.writelines(log_lines) f.close() if not 
options.no_clean: if DEBUG: print FORMAT_BAR print "| Cleanup |" print FORMAT_BAR if not options.no_format_db: if options.verbose: print "Cleaning up formatdb files:", formatdb_filepaths remove_files(formatdb_filepaths) else: if options.verbose: print "Formatdb not run...nothing to clean"