def test_ids_to_seq_file(self):
        """ids_to_seq_file should lookup and write out seqs for given ids"""
        self.id_test_fp = get_tmp_filename_as_str(
            prefix='ExcludeByBlastTests_', suffix='.fasta')

        # Queue the output path on the fixture's clean-up list
        # (presumably removed during tearDown -- not visible here).
        self._paths_to_clean_up.append(self.id_test_fp)

        ids = ["bth:BT_0001"]
        ids_to_seq_file(ids, self.query_fp, self.id_test_fp)

        # Everything from the third line of the query file onwards is the
        # bth entry, which is what should have been written out.
        with open(self.query_fp) as query_f:
            exp_lines = query_f.readlines()[2:]

        # Read the result back under a context manager so the handle is
        # closed deterministically instead of being left to the GC.
        with open(self.id_test_fp) as result_f:
            obs_lines = result_f.readlines()

        self.assertEqual(obs_lines, exp_lines)
    def test_ids_to_seq_file(self):
        """ids_to_seq_file should lookup and write out seqs for given ids"""
        # mkstemp hands back an open OS-level descriptor plus the path; only
        # the path is needed, so the descriptor is closed straight away.
        fd, self.id_test_fp = mkstemp(prefix='ExcludeByBlastTests_',
                                      suffix='.fasta')
        close(fd)

        # Queue the output path on the fixture's clean-up list
        # (presumably removed during tearDown -- not visible here).
        self._paths_to_clean_up.append(self.id_test_fp)

        ids = ["bth:BT_0001"]
        ids_to_seq_file(ids, self.query_fp, self.id_test_fp)

        # Everything from the third line of the query file onwards is the
        # bth entry, which is what should have been written out.
        with open(self.query_fp) as query_f:
            exp_lines = query_f.readlines()[2:]

        # Read the result back under a context manager so the handle is
        # closed deterministically instead of being left to the GC.
        with open(self.id_test_fp) as result_f:
            obs_lines = result_f.readlines()

        self.assertEqual(obs_lines, exp_lines)
    def test_ids_to_seq_file(self):
        """ids_to_seq_file should lookup and write out seqs for given ids"""
        # mkstemp hands back an open OS-level descriptor plus the path; only
        # the path is needed, so the descriptor is closed straight away.
        fd, self.id_test_fp = mkstemp(prefix='ExcludeByBlastTests_',
                                      suffix='.fasta')
        close(fd)

        # Queue the output path on the fixture's clean-up list
        # (presumably removed during tearDown -- not visible here).
        self._paths_to_clean_up.append(self.id_test_fp)

        ids = ["bth:BT_0001"]
        ids_to_seq_file(ids, self.query_fp, self.id_test_fp)

        # Everything from the third line of the query file onwards is the
        # bth entry, which is what should have been written out.
        with open(self.query_fp) as query_f:
            exp_lines = query_f.readlines()[2:]

        # Read the result back under a context manager so the handle is
        # closed deterministically instead of being left to the GC.
        with open(self.id_test_fp) as result_f:
            obs_lines = result_f.readlines()

        self.assertEqual(obs_lines, exp_lines)
def main():
    """Run the BLAST-based sequence exclusion workflow from the command line.

    Steps, in order:
      1. Parse the command-line options and validate them with
         check_options.
      2. Either build a formatdb index for the subject database, or (with
         --no_format_db) check that the pre-built index files can be opened.
      3. Search the query sequences against the subject database via
         find_homologs.
      4. Write raw_blast_results.txt, matching.fna, non-matching.fna and
         sequence_exclusion.log into options.outputdir.
      5. Unless options.no_clean is set, remove the formatdb files that this
         run created (user-supplied index files are left alone).

    Progress messages are printed only when options.verbose is set.  All
    print calls use the single-argument parenthesised form, which produces
    the same output under the Python 2 print statement and the Python 3
    print function.
    """
    option_parser, options, args = parse_command_line_parameters(**script_info)
    DEBUG = options.verbose
    check_options(option_parser, options)
    start_time = time()
    option_lines = format_options_as_lines(options)
    if DEBUG:
        print(FORMAT_BAR)
        print("Running with options:")
        for line in sorted(option_lines):
            print(line)
        print(FORMAT_BAR)

    # because the blast app controller uses absolute paths, make sure subject
    # db path is fully specified
    subject_db = options.subjectdb
    if not subject_db.startswith('/'):
        subject_db = join(getcwd(), subject_db)

    if not options.no_format_db:
        # formatdb is run with the subject db's own directory as its
        # working directory
        inpath = FilePath(abspath(options.subjectdb))
        subject_dir, _ = split(inpath)

        fdb = FormatDb(WorkingDir=subject_dir)
        # Currently we do not support protein blasts, but
        # this would be easy to add in the future...
        fdb.Parameters['-p'].on('F')

        # Create indices for record lookup
        fdb.Parameters['-o'].on('T')

        # Set input database
        fdb.Parameters['-i'].on(subject_db)

        formatdb_cmd = fdb.BaseCommand

        if DEBUG:
            print("Formatting db with command: %s" % formatdb_cmd)

        app_result = fdb(subject_db)

        # Collect the paths of the files formatdb produced so they can be
        # removed during cleanup.
        formatdb_filepaths = []
        for v in app_result.values():
            try:
                formatdb_filepaths.append(v.name)
            except AttributeError:
                # not a file object, so no path to return
                pass

        db_format_time = time() - start_time

        if DEBUG:
            # %.2f (two decimals); the former %2.f rounded to a whole number
            print("Formatting subject db took: %.2f seconds" % db_format_time)
            print("formatdb log file written to: %s" % app_result['log'])
            print(FORMAT_BAR)
    else:
        db_format_time = time() - start_time
        formatdb_cmd = "None (formatdb not called)"
        # Check that User-Supplied subjectdb is valid
        db_ext = [".nhr", ".nin", ".nsd", ".nsi", ".nsq"]
        formatdb_filepaths = [subject_db + ext for ext in db_ext]

        if DEBUG:
            print("Checking that pre-existing formatdb files exist and can be read.")
            print("Files to be checked:")
            for fp in formatdb_filepaths:
                print(fp)
            print(FORMAT_BAR)

        # Open and close each file in turn so that no handle is left open
        # when one of them fails, and so the failing path is an ordinary
        # loop variable rather than a leaked comprehension variable.
        for db_f in formatdb_filepaths:
            try:
                db_file = open(db_f, "U")
            except IOError:
                if DEBUG:
                    print("Cannot open user-supplied database file: %s" % db_f)
                option_parser.error(
                    ("Problem with -d and --no_format_db option combination: "
                     "Cannot open the following user-supplied database file: %s. "
                     "Consider running without --no_format_db to let formatdb "
                     "generate these required files") % db_f)
            else:
                db_file.close()

        if DEBUG:
            print("OK: BLAST Database files exist and can be read.")
            print(FORMAT_BAR)

    # Perform BLAST search
    blast_results, hit_ids, removed_hit_ids = find_homologs(
        options.querydb, subject_db, options.e_value, options.max_hits,
        options.working_dir, options.blastmatroot, options.wordsize,
        options.percent_aligned, DEBUG=DEBUG)

    # Everything since start_time that was not spent formatting the db
    blast_time = (time() - start_time) - db_format_time

    if DEBUG:
        # %.2f (two decimals); the former %2.f rounded to a whole number
        print("BLAST search took: %.2f minute(s)" % (blast_time / 60.0))
        print(FORMAT_BAR)

    # Create output folder
    outputdir = options.outputdir
    try:
        makedirs(outputdir)
    except OSError:
        # Deliberate best effort: any OSError from makedirs is ignored here.
        # If the directory is genuinely unusable, opening the first output
        # file below will fail.
        pass

    # Record raw blast results
    raw_blast_results_path = join(outputdir, "raw_blast_results.txt")
    with open(raw_blast_results_path, 'w') as raw_results_f:
        raw_results_f.writelines(blast_results)

    # Record excluded seqs
    excluded_seqs_path = join(outputdir, "matching.fna")
    ids_to_seq_file(hit_ids, options.querydb, excluded_seqs_path, "")

    # Record included (screened) seqs
    included_seqs_path = join(outputdir, "non-matching.fna")
    # NOTE(review): this handle is still left to the garbage collector; it
    # is not visible here whether ids_from_fasta_lines reads the file
    # eagerly -- confirm before wrapping it in a `with` block.
    all_ids = ids_from_fasta_lines(open(options.querydb))
    included_ids = set(all_ids) - hit_ids
    ids_to_seq_file(included_ids, options.querydb, included_seqs_path, "")

    log_lines = compose_logfile_lines(start_time, db_format_time, blast_time,
                                      option_lines, formatdb_cmd,
                                      blast_results, options, all_ids,
                                      hit_ids, removed_hit_ids,
                                      included_ids, DEBUG)

    log_path = join(outputdir, "sequence_exclusion.log")
    if DEBUG:
        print("Writing summary to: %s" % log_path)

    with open(log_path, 'w') as log_f:
        log_f.writelines(log_lines)

    if not options.no_clean:
        if DEBUG:
            print(FORMAT_BAR)
            print("|                           Cleanup                        |")
            print(FORMAT_BAR)

        if not options.no_format_db:
            # Only files generated by this run are removed
            if DEBUG:
                print("Cleaning up formatdb files: %s" % (formatdb_filepaths,))
            remove_files(formatdb_filepaths)
        elif DEBUG:
            print("Formatdb not run...nothing to clean")
# Example no. 5
# 0
def main():
    """Run the BLAST-based sequence exclusion workflow from the command line.

    Steps, in order:
      1. Parse the command-line options and validate them with
         check_options.
      2. Either build a formatdb index for the subject database, or (with
         --no_format_db) check that the pre-built index files can be opened.
      3. Search the query sequences against the subject database via
         find_homologs.
      4. Write raw_blast_results.txt, matching.fna, non-matching.fna and
         sequence_exclusion.log into options.outputdir.
      5. Unless options.no_clean is set, remove the formatdb files that this
         run created (user-supplied index files are left alone).

    Progress messages are printed only when options.verbose is set.  All
    print calls use the single-argument parenthesised form, which produces
    the same output under the Python 2 print statement and the Python 3
    print function.
    """
    option_parser, options, args = parse_command_line_parameters(**script_info)
    DEBUG = options.verbose
    check_options(option_parser, options)
    start_time = time()
    option_lines = format_options_as_lines(options)
    if DEBUG:
        print(FORMAT_BAR)
        print("Running with options:")
        for line in sorted(option_lines):
            print(line)
        print(FORMAT_BAR)

    # because the blast app controller uses absolute paths, make sure subject
    # db path is fully specified
    subject_db = options.subjectdb
    if not subject_db.startswith('/'):
        subject_db = join(getcwd(), subject_db)

    if not options.no_format_db:
        # formatdb is run with the subject db's own directory as its
        # working directory
        inpath = FilePath(abspath(options.subjectdb))
        subject_dir, _ = split(inpath)

        fdb = FormatDb(WorkingDir=subject_dir)
        # Currently we do not support protein blasts, but
        # this would be easy to add in the future...
        fdb.Parameters['-p'].on('F')

        # Create indices for record lookup
        fdb.Parameters['-o'].on('T')

        # Set input database
        fdb.Parameters['-i'].on(subject_db)

        formatdb_cmd = fdb.BaseCommand

        if DEBUG:
            print("Formatting db with command: %s" % formatdb_cmd)

        app_result = fdb(subject_db)

        # Collect the paths of the files formatdb produced so they can be
        # removed during cleanup.
        formatdb_filepaths = []
        for v in app_result.values():
            try:
                formatdb_filepaths.append(v.name)
            except AttributeError:
                # not a file object, so no path to return
                pass

        db_format_time = time() - start_time

        if DEBUG:
            # %.2f (two decimals); the former %2.f rounded to a whole number
            print("Formatting subject db took: %.2f seconds" % db_format_time)
            print("formatdb log file written to: %s" % app_result['log'])
            print(FORMAT_BAR)
    else:
        db_format_time = time() - start_time
        formatdb_cmd = "None (formatdb not called)"
        # Check that User-Supplied subjectdb is valid
        db_ext = [".nhr", ".nin", ".nsd", ".nsi", ".nsq"]
        formatdb_filepaths = [subject_db + ext for ext in db_ext]

        if DEBUG:
            print("Checking that pre-existing formatdb files exist and can be read.")
            print("Files to be checked:")
            for fp in formatdb_filepaths:
                print(fp)
            print(FORMAT_BAR)

        # Open and close each file in turn so that no handle is left open
        # when one of them fails, and so the failing path is an ordinary
        # loop variable rather than a leaked comprehension variable.
        for db_f in formatdb_filepaths:
            try:
                db_file = open(db_f, "U")
            except IOError:
                if DEBUG:
                    print("Cannot open user-supplied database file: %s" % db_f)
                option_parser.error(
                    ("Problem with -d and --no_format_db option combination: "
                     "Cannot open the following user-supplied database file: %s. "
                     "Consider running without --no_format_db to let formatdb "
                     "generate these required files") % db_f)
            else:
                db_file.close()

        if DEBUG:
            print("OK: BLAST Database files exist and can be read.")
            print(FORMAT_BAR)

    # Perform BLAST search
    blast_results, hit_ids, removed_hit_ids = find_homologs(
        options.querydb, subject_db, options.e_value, options.max_hits,
        options.working_dir, options.blastmatroot, options.wordsize,
        options.percent_aligned, DEBUG=DEBUG)

    # Everything since start_time that was not spent formatting the db
    blast_time = (time() - start_time) - db_format_time

    if DEBUG:
        # %.2f (two decimals); the former %2.f rounded to a whole number
        print("BLAST search took: %.2f minute(s)" % (blast_time / 60.0))
        print(FORMAT_BAR)

    # Create output folder
    outputdir = options.outputdir
    try:
        makedirs(outputdir)
    except OSError:
        # Deliberate best effort: any OSError from makedirs is ignored here.
        # If the directory is genuinely unusable, opening the first output
        # file below will fail.
        pass

    # Record raw blast results
    raw_blast_results_path = join(outputdir, "raw_blast_results.txt")
    with open(raw_blast_results_path, 'w') as raw_results_f:
        raw_results_f.writelines(blast_results)

    # Record excluded seqs
    excluded_seqs_path = join(outputdir, "matching.fna")
    ids_to_seq_file(hit_ids, options.querydb, excluded_seqs_path, "")

    # Record included (screened) seqs
    included_seqs_path = join(outputdir, "non-matching.fna")
    # NOTE(review): this handle is still left to the garbage collector; it
    # is not visible here whether ids_from_fasta_lines reads the file
    # eagerly -- confirm before wrapping it in a `with` block.
    all_ids = ids_from_fasta_lines(open(options.querydb))
    included_ids = set(all_ids) - hit_ids
    ids_to_seq_file(included_ids, options.querydb, included_seqs_path, "")

    log_lines = compose_logfile_lines(start_time, db_format_time, blast_time,
                                      option_lines, formatdb_cmd,
                                      blast_results, options, all_ids,
                                      hit_ids, removed_hit_ids,
                                      included_ids, DEBUG)

    log_path = join(outputdir, "sequence_exclusion.log")
    if DEBUG:
        print("Writing summary to: %s" % log_path)

    with open(log_path, 'w') as log_f:
        log_f.writelines(log_lines)

    if not options.no_clean:
        if DEBUG:
            print(FORMAT_BAR)
            print("|                           Cleanup                        |")
            print(FORMAT_BAR)

        if not options.no_format_db:
            # Only files generated by this run are removed
            if DEBUG:
                print("Cleaning up formatdb files: %s" % (formatdb_filepaths,))
            remove_files(formatdb_filepaths)
        elif DEBUG:
            print("Formatdb not run...nothing to clean")