Ejemplo n.º 1
0
    def test_find_homologs(self):
        """find_homologs should return raw data, filtered and removed ids.

        Three searches are run against the same formatted subject database,
        varying only the query file and the percent_aligned threshold, and
        the returned hit / removed id sets are checked for each.
        """
        # Index the subject database with formatdb and register every file
        # it is expected to leave behind in self._paths_to_clean_up.
        system('formatdb -p F -o T -i %s' % self.subjectdb_fp)
        self._paths_to_clean_up.append("formatdb.log")
        for extension in ("nhr", "nin", "nsd", "nsi", "nsq"):
            index_fp = "%s.%s" % (self.subjectdb_fp, extension)
            self._paths_to_clean_up.append(index_fp)

        # Keyword arguments shared by all three find_homologs calls.
        shared = dict(e_value=1e-4, max_hits=100, working_dir="./",
                      blast_mat_root=None, wordsize=28, DEBUG=False)

        # Case 1: first query file, 98% alignment threshold.
        blast_output, hit_ids, removed_hit_ids = find_homologs(
            self.query_fp, self.subjectdb_fp, percent_aligned=0.98, **shared)

        self.assertEqual(hit_ids, set(["bth:BT_0001", "hsa:8355"]))
        self.assertEqual(removed_hit_ids, set([]))

        # Compare the raw output line by line against the expected output.
        for position, line in enumerate(blast_output):
            if line.startswith("#"):
                # Comment lines depend on the temporary file name: skip them.
                continue
            self.assertEqual(line, EXP_BLAST_OUTPUT[position])

        # Case 2: with a 100% threshold the tweaked sequence must be removed.
        blast_output, hit_ids, removed_hit_ids = find_homologs(
            self.query2_fp, self.subjectdb_fp, percent_aligned=1.00, **shared)

        self.assertEqual(hit_ids, set(["bth:BT_0001"]))
        self.assertEqual(removed_hit_ids, set(["hsa:8355_tweaked"]))

        # Case 3: with a 75% threshold the tweaked sequence must be kept.
        blast_output, hit_ids, removed_hit_ids = find_homologs(
            self.query2_fp, self.subjectdb_fp, percent_aligned=0.75, **shared)

        self.assertEqual(hit_ids, set(["bth:BT_0001", "hsa:8355_tweaked"]))
        self.assertEqual(removed_hit_ids, set([]))
Ejemplo n.º 2
0
    def test_find_homologs(self):
        """find_homologs should return raw data, filtered and removed ids.

        The subject database is formatted once, then find_homologs is called
        three times with different query files / percent_aligned values and
        the resulting id sets are verified after each call.
        """
        # Run formatdb on the subject database; remember its by-products in
        # self._paths_to_clean_up.
        system('formatdb -p F -o T -i %s' % self.subjectdb_fp)
        self._paths_to_clean_up.append("formatdb.log")
        for extension in ("nhr", "nin", "nsd", "nsi", "nsq"):
            index_fp = "%s.%s" % (self.subjectdb_fp, extension)
            self._paths_to_clean_up.append(index_fp)

        # Options that stay the same across all three searches.
        fixed_options = dict(e_value=1e-4, max_hits=100, working_dir="./",
                             blast_mat_root=None, wordsize=28, DEBUG=False)

        # Search 1: first query file at a 0.98 alignment threshold.
        blast_output, hit_ids, removed_hit_ids = find_homologs(
            self.query_fp, self.subjectdb_fp, percent_aligned=0.98,
            **fixed_options)

        self.assertEqual(hit_ids, set(["bth:BT_0001", "hsa:8355"]))
        self.assertEqual(removed_hit_ids, set([]))

        # Raw output must match the expected lines at the same positions.
        for index, output_line in enumerate(blast_output):
            if output_line.startswith("#"):
                # Header lines embed the temp file name, so they are skipped.
                continue
            self.assertEqual(output_line, EXP_BLAST_OUTPUT[index])

        # Search 2: low % alignment sequences are removed at threshold 1.00.
        blast_output, hit_ids, removed_hit_ids = find_homologs(
            self.query2_fp, self.subjectdb_fp, percent_aligned=1.00,
            **fixed_options)

        self.assertEqual(hit_ids, set(["bth:BT_0001"]))
        self.assertEqual(removed_hit_ids, set(["hsa:8355_tweaked"]))

        # Search 3: high % alignment sequences are kept at threshold 0.75.
        blast_output, hit_ids, removed_hit_ids = find_homologs(
            self.query2_fp, self.subjectdb_fp, percent_aligned=0.75,
            **fixed_options)

        self.assertEqual(hit_ids, set(["bth:BT_0001", "hsa:8355_tweaked"]))
        self.assertEqual(removed_hit_ids, set([]))
Ejemplo n.º 3
0
def main():
    """Screen query sequences against a subject database with BLAST.

    Steps, in order:
      1. Parse the command line (script_info) and validate the options.
      2. Either run formatdb on the subject database or, when no_format_db
         is set, check that the five expected index files can be opened.
      3. Call find_homologs with the query and subject databases.
      4. Write raw_blast_results.txt, matching.fna, non-matching.fna and
         sequence_exclusion.log into the output directory.
      5. Unless no_clean is set, remove the files produced by formatdb.

    Every print below passes a single parenthesised argument, so the output
    is the same whether print is a statement or a function.
    """
    option_parser, options, args = parse_command_line_parameters(**script_info)
    DEBUG = options.verbose
    check_options(option_parser, options)
    start_time = time()
    option_lines = format_options_as_lines(options)
    if DEBUG:
        print(FORMAT_BAR)
        print("Running with options:")
        for line in sorted(option_lines):
            print(line)
        print(FORMAT_BAR)

    # because the blast app controller uses absolute paths, make sure subject
    # db path is fully specified
    subject_db = options.subjectdb
    if not subject_db.startswith('/'):
        subject_db = join(getcwd(), subject_db)

    if not options.no_format_db:
        # Run formatdb from the directory that holds the subject database.
        inpath = FilePath(abspath(options.subjectdb))
        subject_dir, subj_file = split(inpath)

        fdb = FormatDb(WorkingDir=subject_dir)
        # Currently we do not support protein blasts, but
        # this would be easy to add in the future...
        fdb.Parameters['-p'].on('F')

        # Create indices for record lookup
        fdb.Parameters['-o'].on('T')

        # Set input database
        fdb.Parameters['-i'].on(subject_db)

        formatdb_cmd = fdb.BaseCommand

        if DEBUG:
            print("Formatting db with command: %s" % formatdb_cmd)

        app_result = fdb(subject_db)

        # Collect the paths of the files formatdb produced so they can be
        # removed during cleanup.
        formatdb_filepaths = []
        for v in app_result.values():
            try:
                formatdb_filepaths.append(v.name)
            except AttributeError:
                # not a file object, so no path to return
                pass

        db_format_time = time() - start_time

        if DEBUG:
            # "%.2f" (two decimals); the former "%2.f" rounded to an integer.
            print("Formatting subject db took: %.2f seconds" % db_format_time)
            print("formatdb log file written to: %s" % app_result['log'])
            print(FORMAT_BAR)
    else:
        db_format_time = time() - start_time
        formatdb_cmd = "None (formatdb not called)"
        # Check that User-Supplied subjectdb is valid
        db_ext = [".nhr", ".nin", ".nsd", ".nsi", ".nsq"]
        formatdb_filepaths = [subject_db + ext for ext in db_ext]

        if DEBUG:
            print("Checking that pre-existing formatdb files exist and can be read.")
            print("Files to be checked:")
            for fp in formatdb_filepaths:
                print(fp)
            print(FORMAT_BAR)

        # Open the files one at a time: each handle is closed before the next
        # is opened, and db_f names the offending path if an open fails.
        try:
            for db_f in formatdb_filepaths:
                with open(db_f, "U"):
                    pass
        except IOError:
            if DEBUG:
                print("Cannot open user-supplied database file: %s" % db_f)
            option_parser.error(
                """Problem with -d and --no_format_db option combination: Cannot open the following user-supplied database file: %s. Consider running without --no_format_db to let formatdb generate these required files""" %
                db_f)

        if DEBUG:
            print("OK: BLAST Database files exist and can be read.")
            print(FORMAT_BAR)

    # Perform BLAST search
    blast_results, hit_ids, removed_hit_ids = find_homologs(
        options.querydb, subject_db, options.e_value, options.max_hits,
        options.working_dir, options.blastmatroot, options.wordsize,
        options.percent_aligned, DEBUG=DEBUG)

    blast_time = (time() - start_time) - db_format_time

    if DEBUG:
        # Two decimals, so searches shorter than a minute do not print as 0.
        print("BLAST search took: %.2f minute(s)" % (blast_time / 60.0))
        print(FORMAT_BAR)

    # Create output folder
    outputdir = options.outputdir
    try:
        makedirs(outputdir)
    except OSError:
        # Deliberate best effort (presumably the directory already exists);
        # an unusable directory makes the open() calls below fail instead.
        pass

    # Record raw blast results
    raw_blast_results_path = join(outputdir, "raw_blast_results.txt")
    with open(raw_blast_results_path, 'w') as raw_results_file:
        raw_results_file.writelines(blast_results)

    # Record excluded seqs
    excluded_seqs_path = join(outputdir, "matching.fna")
    ids_to_seq_file(hit_ids, options.querydb, excluded_seqs_path, "")

    # Record included (screened) seqs
    included_seqs_path = join(outputdir, "non-matching.fna")
    # all_ids is iterated here and passed on again below, so it is assumed
    # to be a fully-read collection once ids_from_fasta_lines returns.
    with open(options.querydb) as query_file:
        all_ids = ids_from_fasta_lines(query_file)
    included_ids = set(all_ids) - hit_ids
    ids_to_seq_file(included_ids, options.querydb, included_seqs_path, "")

    log_lines = compose_logfile_lines(start_time, db_format_time, blast_time,
                                      option_lines, formatdb_cmd,
                                      blast_results, options, all_ids,
                                      hit_ids, removed_hit_ids,
                                      included_ids, DEBUG)

    log_path = join(outputdir, "sequence_exclusion.log")
    if DEBUG:
        print("Writing summary to: %s" % log_path)

    with open(log_path, 'w') as log_file:
        log_file.writelines(log_lines)

    if not options.no_clean:
        if DEBUG:
            print(FORMAT_BAR)
            print("|                           Cleanup                        |")
            print(FORMAT_BAR)

        if not options.no_format_db:
            # Only files generated by this run are removed; a user-supplied
            # database is left untouched.
            if DEBUG:
                print("Cleaning up formatdb files: %s" % (formatdb_filepaths,))
            remove_files(formatdb_filepaths)
        elif DEBUG:
            print("Formatdb not run...nothing to clean")
Ejemplo n.º 4
0
def main():
    """Screen query sequences against a subject database with BLAST.

    Steps, in order:
      1. Parse the command line (script_info) and validate the options.
      2. Either run formatdb on the subject database or, when no_format_db
         is set, check that the five expected index files can be opened.
      3. Call find_homologs with the query and subject databases.
      4. Write raw_blast_results.txt, matching.fna, non-matching.fna and
         sequence_exclusion.log into the output directory.
      5. Unless no_clean is set, remove the files produced by formatdb.

    Every print below passes a single parenthesised argument, so the output
    is the same whether print is a statement or a function.
    """
    option_parser, options, args = parse_command_line_parameters(**script_info)
    DEBUG = options.verbose
    check_options(option_parser, options)
    start_time = time()
    option_lines = format_options_as_lines(options)
    if DEBUG:
        print(FORMAT_BAR)
        print("Running with options:")
        for line in sorted(option_lines):
            print(line)
        print(FORMAT_BAR)

    # because the blast app controller uses absolute paths, make sure subject
    # db path is fully specified
    subject_db = options.subjectdb
    if not subject_db.startswith('/'):
        subject_db = join(getcwd(), subject_db)

    if not options.no_format_db:
        # Run formatdb from the directory that holds the subject database.
        inpath = FilePath(abspath(options.subjectdb))
        subject_dir, subj_file = split(inpath)

        fdb = FormatDb(WorkingDir=subject_dir)
        # Currently we do not support protein blasts, but
        # this would be easy to add in the future...
        fdb.Parameters['-p'].on('F')

        # Create indices for record lookup
        fdb.Parameters['-o'].on('T')

        # Set input database
        fdb.Parameters['-i'].on(subject_db)

        formatdb_cmd = fdb.BaseCommand

        if DEBUG:
            print("Formatting db with command: %s" % formatdb_cmd)

        app_result = fdb(subject_db)

        # Collect the paths of the files formatdb produced so they can be
        # removed during cleanup.
        formatdb_filepaths = []
        for v in app_result.values():
            try:
                formatdb_filepaths.append(v.name)
            except AttributeError:
                # not a file object, so no path to return
                pass

        db_format_time = time() - start_time

        if DEBUG:
            # "%.2f" (two decimals); the former "%2.f" rounded to an integer.
            print("Formatting subject db took: %.2f seconds" % db_format_time)
            print("formatdb log file written to: %s" % app_result['log'])
            print(FORMAT_BAR)
    else:
        db_format_time = time() - start_time
        formatdb_cmd = "None (formatdb not called)"
        # Check that User-Supplied subjectdb is valid
        db_ext = [".nhr", ".nin", ".nsd", ".nsi", ".nsq"]
        formatdb_filepaths = [subject_db + ext for ext in db_ext]

        if DEBUG:
            print("Checking that pre-existing formatdb files exist and can be read.")
            print("Files to be checked:")
            for fp in formatdb_filepaths:
                print(fp)
            print(FORMAT_BAR)

        # Open the files one at a time: each handle is closed before the next
        # is opened, and db_f names the offending path if an open fails.
        try:
            for db_f in formatdb_filepaths:
                with open(db_f, "U"):
                    pass
        except IOError:
            if DEBUG:
                print("Cannot open user-supplied database file: %s" % db_f)
            option_parser.error(
                """Problem with -d and --no_format_db option combination: Cannot open the following user-supplied database file: %s. Consider running without --no_format_db to let formatdb generate these required files"""
                % db_f)

        if DEBUG:
            print("OK: BLAST Database files exist and can be read.")
            print(FORMAT_BAR)

    # Perform BLAST search
    blast_results, hit_ids, removed_hit_ids = find_homologs(
        options.querydb, subject_db, options.e_value, options.max_hits,
        options.working_dir, options.blastmatroot, options.wordsize,
        options.percent_aligned, DEBUG=DEBUG)

    blast_time = (time() - start_time) - db_format_time

    if DEBUG:
        # Two decimals, so searches shorter than a minute do not print as 0.
        print("BLAST search took: %.2f minute(s)" % (blast_time / 60.0))
        print(FORMAT_BAR)

    # Create output folder
    outputdir = options.outputdir
    try:
        makedirs(outputdir)
    except OSError:
        # Deliberate best effort (presumably the directory already exists);
        # an unusable directory makes the open() calls below fail instead.
        pass

    # Record raw blast results
    raw_blast_results_path = join(outputdir, "raw_blast_results.txt")
    with open(raw_blast_results_path, 'w') as raw_results_file:
        raw_results_file.writelines(blast_results)

    # Record excluded seqs
    excluded_seqs_path = join(outputdir, "matching.fna")
    ids_to_seq_file(hit_ids, options.querydb, excluded_seqs_path, "")

    # Record included (screened) seqs
    included_seqs_path = join(outputdir, "non-matching.fna")
    # all_ids is iterated here and passed on again below, so it is assumed
    # to be a fully-read collection once ids_from_fasta_lines returns.
    with open(options.querydb) as query_file:
        all_ids = ids_from_fasta_lines(query_file)
    included_ids = set(all_ids) - hit_ids
    ids_to_seq_file(included_ids, options.querydb, included_seqs_path, "")

    log_lines = compose_logfile_lines(start_time, db_format_time, blast_time,
                                      option_lines, formatdb_cmd,
                                      blast_results, options, all_ids,
                                      hit_ids, removed_hit_ids,
                                      included_ids, DEBUG)

    log_path = join(outputdir, "sequence_exclusion.log")
    if DEBUG:
        print("Writing summary to: %s" % log_path)

    with open(log_path, 'w') as log_file:
        log_file.writelines(log_lines)

    if not options.no_clean:
        if DEBUG:
            print(FORMAT_BAR)
            print("|                           Cleanup                        |")
            print(FORMAT_BAR)

        if not options.no_format_db:
            # Only files generated by this run are removed; a user-supplied
            # database is left untouched.
            if DEBUG:
                print("Cleaning up formatdb files: %s" % (formatdb_filepaths,))
            remove_files(formatdb_filepaths)
        elif DEBUG:
            print("Formatdb not run...nothing to clean")