Example #1
0
def get_db_filename_for_query_from_db_csv(taxon, main_data_dir):
    """Look up the database filename recorded for a given taxon.

    Take a database name/species abbreviation or taxon name extracted from a
    query filename. If it appears in the 'Taxon' column of the database info
    csv file given by DataPaths(main_data_dir).db_info_csv, return the value
    in the 'Filename' column for that row. Otherwise, return '-'.

    If the csv file cannot be loaded as a dataframe, an error message is
    printed and the script exits via sys.exit().
    """
    db_info_csv = DataPaths(main_data_dir).db_info_csv

    # Guard only the file loading, and catch only ordinary exceptions so that
    # KeyboardInterrupt and SystemExit are not swallowed.
    try:
        df = pd.read_csv(db_info_csv, encoding='utf-8')
    except Exception:
        # Print an error message.
        print(
            """Error: Could not load contents of csv file as pandas dataframe:\n\n
        \t%s\n\nCheck that the file was saved properly in comma separated value
        format (UTF-8 encoding).""" % db_info_csv)

        # Exit the script.
        print('Quitting script.')
        sys.exit()

    # If the taxon is listed, return the filename of the corresponding
    # database.
    if taxon in df['Taxon'].tolist():
        return df.set_index('Taxon').loc[taxon]['Filename']

    # Not applicable: no row for this taxon.
    return '-'
Example #2
0
def get_query_title_from_csv(query_filename, main_data_dir):
    """Return the query title recorded for a query filename.

    The query info csv at DataPaths(main_data_dir).query_info_csv is indexed
    by its 'Filename' column, and the 'Query title' value for the given
    filename is returned. Assertions fail if the lookup yields a dataframe or
    a value that is not a string.
    """
    query_info_csv_path = DataPaths(main_data_dir).query_info_csv

    # Index the table by filename and pull out the title for this query.
    query_table = pd.read_csv(query_info_csv_path,
                              encoding='utf-8').set_index('Filename')
    title = query_table.loc[query_filename]['Query title']

    # A dataframe here means the lookup did not resolve to a single value.
    assert not isinstance(title, pd.DataFrame), """Could not properly
    parse query info file (%s), there may be duplicate entries in this file.""" \
        % query_info_csv_path

    # Anything other than a string is not a usable title.
    assert isinstance(title, str), """Could not properly identify query
    title for query file %s in CSV file %s. Please ensure that the query title
    contains some alphabetic characters. Query title identified: %s""" \
            % (query_filename, query_info_csv_path, title)

    return title
Example #3
0
def get_subseq_from_fasta_db(db_name, acc, subseq_coord, main_data_dir):
    """Return a SeqRecord for a subsequence of one sequence in a database.

    The sequence with accession 'acc' is fetched with esl-sfetch from the
    file named os.path.basename(db_name) in the database directory given by
    DataPaths(main_data_dir).dbdirpath. The slices of that sequence given by
    'subseq_coord' (an iterable of [start, end] pairs) are concatenated in
    order and set as the sequence of the returned record, and str(subseq_coord)
    is appended to the record description.

    Note: the input subsequence coordinates ('subseq_coord') are the start and
    end residue numbers for the subsequence, not python-style slices.

    Raises AssertionError if the database path is not a file. Errors raised
    while parsing the fetched FASTA text (for example when esl-sfetch wrote
    nothing) propagate to the caller; the temporary file is removed either
    way.
    """
    # Get database directory from DataPaths(main_data_dir) module.
    db_dir = DataPaths(main_data_dir).dbdirpath

    # Get database filepath (only the basename of db_name is used).
    db_path = os.path.join(db_dir, os.path.basename(db_name))
    assert os.path.isfile(db_path), """Path is not a file: %s""" % db_path

    # Use esl-sfetch to retrieve the sequence and write it to a temporary
    # file, which is always cleaned up, even if parsing fails.
    temp_fa_path = db_path + '_TEMP_FASTA.fa'
    try:
        # Mode 'w' discards any stale temporary file from an earlier run.
        with open(temp_fa_path, 'w') as o:
            # Get sequence as text.
            subprocess.call(['esl-sfetch', db_path, acc], stdout=o)

        # Parse the fasta file to get a single SeqRecord object.
        seq_obj = SeqIO.read(temp_fa_path, 'fasta')
    finally:
        if os.path.isfile(temp_fa_path):
            os.remove(temp_fa_path)

    seq_obj.description = seq_obj.description.rstrip('\"') + ' ' + str(
        subseq_coord) + '\"'

    # Construct new sequence. Start from an empty slice of the parsed sequence
    # so that the result has the same type as the original sequence even when
    # no coordinates are given.
    new_seq = seq_obj.seq[0:0]
    for subseq in subseq_coord:
        start = subseq[0]
        end = subseq[1]
        # NOTE(review): this slice treats 'start' as a 0-based index and 'end'
        # as inclusive. Confirm this against the coordinate convention used by
        # the callers (the original code was marked "Double-check this!").
        new_seq = new_seq + seq_obj.seq[start:end + 1]

    seq_obj.seq = new_seq

    # Return the single sequence record.
    return seq_obj
Example #4
0
def get_query_taxon_from_csv(query_filename, main_data_dir):
    """Return the query taxon recorded for a query filename.

    The query info csv at DataPaths(main_data_dir).query_info_csv is indexed
    by its 'Filename' column, and the value in the
    'Query taxon (species if applicable)' column for the given filename is
    returned.
    """
    csv_path = DataPaths(main_data_dir).query_info_csv
    query_table = pd.read_csv(csv_path, encoding='utf-8').set_index('Filename')

    # Select the row for this query, then the taxon column within it.
    row = query_table.loc[query_filename]
    return row['Query taxon (species if applicable)']
Example #5
0
def get_species_for_db_filename(db_filename, main_data_dir):
    """Return the species name listed for a database filename.

    The value is read from the 'Species (if applicable)' column of the
    database info csv file (DataPaths(main_data_dir).db_info_csv), in the row
    whose 'Filename' matches db_filename. An assertion fails if the value
    retrieved is not a plain string.
    """
    db_info_csv = DataPaths(main_data_dir).db_info_csv
    db_table = pd.read_csv(db_info_csv, encoding='utf-8').set_index('Filename')
    sp = db_table.loc[db_filename]['Species (if applicable)']

    # A non-string value means the lookup did not resolve to one cell.
    assert type(sp) is str, """There is more than one entry (row) for the
    filename %s in the file %s.""" % (db_filename, db_info_csv)

    return sp
Example #6
0
def get_species_from_db_csv(taxon, main_data_dir):
    """Return the species name for the database whose filename stem is taxon.

    The 'Filename' column of the database info csv file
    (DataPaths(main_data_dir).db_info_csv) is scanned in order for the first
    filename that equals the given taxon once its final extension is removed.
    The 'Species (if applicable)' value for that filename is returned, or '-'
    if no filename matches.
    """
    db_table = pd.read_csv(DataPaths(main_data_dir).db_info_csv,
                           encoding='utf-8')

    # First filename whose stem (text before the last '.') equals the taxon.
    matching_filename = next(
        (name for name in db_table['Filename']
         if name.rsplit('.', 1)[0] == taxon),
        None)

    # Species name is not applicable when nothing matched.
    if matching_filename is None:
        return '-'

    indexed = db_table.set_index('Filename')
    return indexed.loc[matching_filename]['Species (if applicable)']
Example #7
0
    def hit_sequence(self, hit_rank):
        """Return the sequence object for the full subject sequence of a hit.

        The hit ID for hit_rank is looked up with self.hit_id, and the
        matching record is retrieved from the database file self.db_file via
        get_seqs_from_fasta_db. Assertions fail if the database path is not a
        file or if the retrieved object is None.
        """
        databases_dir = DataPaths(self.main_data_dir).dbdirpath
        hit_accession = self.hit_id(hit_rank)

        # Confirm that the database file is present before retrieval.
        full_db_path = os.path.join(databases_dir, self.db_file)
        assert os.path.isfile(full_db_path), """Given path is not a file."""

        # Retrieval takes the database file name and resolves the path itself.
        records = get_seqs_from_fasta_db(self.db_file, [hit_accession],
                                         self.main_data_dir)
        full_seq = records[0]

        assert full_seq is not None, """Could not retrieve sequence for hit."""

        return full_seq
Example #8
0
                        stdout=o,
                        stderr=subprocess.STDOUT)


if __name__ == '__main__':
    # Parse input.
    command_line_list = sys.argv
    query_faa = str(command_line_list[1])
    target_fna_name = str(command_line_list[2])
    target_seq_id = str(command_line_list[3])
    target_subseq_start = str(command_line_list[4])
    target_subseq_end = str(command_line_list[5])
    genetic_code = str(command_line_list[6])

    # Get filepath for specified query FASTA filename.
    query_dir = DataPaths(main_data_dir).querydirpath
    query_faa_path = os.path.join(query_dir, query_faa)
    assert os.path.isfile(query_faa_path), """Specified query file path is
    not a file: %s""" % query_faa_path

    # Get filepath for specified subject FASTA filename.
    db_dir = DataPaths(main_data_dir).dbdirpath
    target_fna_path = os.path.join(db_dir, target_fna_name)
    assert os.path.isfile(target_fna_path), """Specified database file path is
    not a file: %s""" % target_fna_path

    # Define path to FASTA file with subsequence of interest from target
    # nucleotide sequence.
    subseq_fasta_path = query_faa.rsplit('.', 1)[0] + '_subject_subseq.fna'

    # Extract relevant subsequence from input target sequence (region identified in
Example #9
0
    def __init__(self, filepath, main_data_dir):
        """Parse a similarity search result file and record basic facts.

        Sets self.filepath, self.main_data_dir, self.program, self.version and
        self.format (from get_srch_file_info), then self.hits, self.num_hits,
        self.db_file and self.db_file_path. For every format other than
        'hhsearch' the file is parsed with Bio.SearchIO and must contain
        exactly one search result. For 'hmmer3-text' the hits are converted to
        a list sorted by the smallest per-domain E-value of each hit.

        Assertions fail if the file does not exist, if program, version or
        format could not be determined, if the format is 'hmmer3-tab', if the
        database file name could not be determined, or if the database path is
        not a file.
        """
        # The input file must exist.
        assert os.path.isfile(filepath), """Input filepath does not exist:
        %s""" % filepath

        self.filepath = filepath
        self.main_data_dir = main_data_dir

        # Basic info from the file: (program, version, format, ...).
        info = get_srch_file_info(filepath)

        # Program used to generate the input file.
        self.program = info[0]
        assert self.program is not None, """Could not determine the name of
        the program that produced the similarity search result file: %s"""\
        % filepath

        # Version of that program.
        self.version = info[1]
        assert self.version is not None, """Could not determine the version of
        the program that produced the similarity search result file: %s"""\
        % filepath

        # Format type of the input file.
        self.format = info[2]
        assert self.format is not None, """Could not determine the name of
        the format type of the similarity search result file: %s"""\
        % filepath
        assert self.format != 'hmmer3-tab', """Does not work with tabular
        format."""

        # Hits, hit count and database file name stay None for hhsearch
        # output, which is not parsed here.
        self.hits = None
        self.num_hits = None
        self.db_file = None

        is_hhsearch = self.format == 'hhsearch'
        if not is_hhsearch:
            # The file must hold results for a search with a single query.
            num_results = len(list(SearchIO.parse(filepath, self.format)))
            assert num_results == 1, """More than
            one search result contained in input file: %s""" % filepath

            # SearchIO object holding the hits.
            self.hits = SearchIO.read(self.filepath, self.format)

            # Number of hits in the input file.
            self.num_hits = len(self.hits)
            assert self.num_hits is not None, """Could not determine the number of
            hits listed in the similarity search result file: %s"""\
            % filepath

            # Database file name as recorded in the search result.
            self.db_file = os.path.basename(self.hits.target)

        assert self.db_file is not None, """Could not determine database file
        name listed in the similarity search result file: %s"""\
        % filepath

        # Full path to the database file, which must be a real file.
        self.db_file_path = os.path.join(
            DataPaths(self.main_data_dir).dbdirpath, self.db_file)
        assert os.path.isfile(self.db_file_path), """Path to database is not a file:
        %s""" % self.db_file_path

        # Handle hmmsearch results differently.
        if self.format == 'hmmer3-text':
            # Re-order hits by ascending E-value of the best single domain
            # (otherwise sequences with multiple repetitive domains may be
            # retrieved with lower E-values despite low sequence similarity
            # of each of the constituent domains with the query HMM). The
            # minimum of the per-domain E-values of each hit is the sort key.
            self.hits = sorted(
                self.hits,
                key=lambda hit: min([domain.evalue for domain in hit]))
Example #10
0
    def hit_subsequence_and_coord(self, hit_rank, max_gap=10000):
        """Return the aligned subsequence of a hit and its coordinates.

        The hit at position hit_rank (counting from 0) in self.hits is
        processed according to the file format and program:

        - 'blast-xml' with program 'blastp' or 'tblastn': the work is handed
          to get_blastp_hit_seq_obj_and_coord or
          get_tblastn_hit_seq_obj_and_coord, together with max_gap.
        - 'hmmer3-text': coordinates come from get_hmmer_hit_seq_coord and the
          sequence from get_subseq_from_fasta_db.

        Returns a two-element list: [subsequence object, coordinates].

        Raises AssertionError if no hit exists at hit_rank, or if the
        format/program combination is not handled so that no sequence or
        coordinates were produced.
        """
        subseq_obj = None
        subseq_coord = None

        # Get hit object with SearchIO parser (self.hits may be a SearchIO
        # object or a plain list, so iterate rather than index).
        searchio_hit_obj = None
        for hit_num, hit in enumerate(self.hits):
            if hit_num == hit_rank:
                searchio_hit_obj = hit
                break

        # Check that hit object was retrieved. Identity comparison avoids
        # invoking any overloaded comparison on the hit object.
        assert searchio_hit_obj is not None, """Could not retrieve Bio.SearchIO
        hit object from file."""

        # Process SearchIO hit object to get subsequence object and
        # coordinates differently depending on format.
        if self.format == 'blast-xml':
            # Need to concatenate HSPs in a logical manner, and differently for
            # blastp vs. tblastn.
            get_seq_obj_and_coord = None
            if self.program == 'blastp':
                get_seq_obj_and_coord = get_blastp_hit_seq_obj_and_coord
            elif self.program == 'tblastn':
                get_seq_obj_and_coord = get_tblastn_hit_seq_obj_and_coord

            if get_seq_obj_and_coord is not None:
                xlist = get_seq_obj_and_coord(searchio_hit_obj, max_gap)
                subseq_obj = xlist[0]
                subseq_coord = xlist[1]

        elif self.format == 'hmmer3-text':
            # More straight-forward, because the sequences can be taken
            # directly from the database file.

            # Get path for databases directory.
            dbdir_path = DataPaths(self.main_data_dir).dbdirpath

            # Get coordinates.
            subseq_coord = get_hmmer_hit_seq_coord(searchio_hit_obj,
                                                   self.db_file,
                                                   self.main_data_dir)

            # Get sequence object.
            seq_id = self.hit_id(hit_rank)
            db_path = os.path.join(dbdir_path, self.db_file)
            subseq_obj = get_subseq_from_fasta_db(db_path, seq_id,
                                                  subseq_coord,
                                                  self.main_data_dir)

        # Other formats are not handled; the assertions below then fail.

        # Check that it worked.
        assert subseq_obj is not None, """Could not retrieve sequence for hit."""
        assert subseq_coord is not None, """Could not retrieve sequence
        coordinates for hit."""

        # Return sequence object and coordinates.
        return [subseq_obj, subseq_coord]
Example #11
0
def get_seqs_from_fasta_db(db_name, accs, main_data_dir, slow=False):
    """Return a list of SeqRecord objects for accessions in a database file.

    The database file is the one named os.path.basename(db_name) in the
    directory given by DataPaths(main_data_dir).dbdirpath. The sequences are
    first written to a temporary FASTA file next to the database
    (<db_path>_TEMP_FASTA.fa), then parsed back and returned in the order in
    which they were written. The temporary file is removed afterwards, even
    if an error occurs.

    If 'slow' is false, each accession is fetched with esl-sfetch. If 'slow'
    is true, the database is scanned with SeqIO instead: an exact ID match is
    used if present, otherwise the first record whose ID starts with the
    accession (a message is printed when there is no such record, or more
    than one).

    Raises AssertionError if the database path is not a file.
    """
    # Get database directory from DataPaths(main_data_dir) module.
    db_dir = DataPaths(main_data_dir).dbdirpath

    # Get database filepath (only the basename of db_name is used).
    db_path = os.path.join(db_dir, os.path.basename(db_name))
    assert os.path.isfile(db_path), """Path is not a file: %s""" % db_path

    # Retrieve sequences from fasta file and write to a temporary file.
    temp_fa_path = db_path + '_TEMP_FASTA.fa'
    try:
        # Mode 'w' discards any stale temporary file from an earlier run.
        with open(temp_fa_path, 'w') as o:
            if not slow:
                # Use esl-sfetch to retrieve the sequences.
                for acc in accs:
                    subprocess.call(['esl-sfetch', db_path, acc], stdout=o)
            else:
                # Slower method that does not make use of esl-sfetch.
                for acc in accs:
                    record = _find_fasta_record_for_acc(db_path, acc)
                    if record is not None:
                        SeqIO.write([record], o, 'fasta')

        # Parse the temporary fasta file to get the sequence objects.
        seq_objs = list(SeqIO.parse(temp_fa_path, 'fasta'))
    finally:
        # Remove the temporary fasta file.
        if os.path.isfile(temp_fa_path):
            os.remove(temp_fa_path)

    # Return the list of sequence objects.
    return seq_objs


def _find_fasta_record_for_acc(db_path, acc):
    """Return the record in a FASTA file that best matches an accession.

    The file is scanned once. A record whose ID equals 'acc' is returned as
    soon as it is found. If there is none, the first record whose ID starts
    with 'acc' is returned (or None), and a message is printed when the number
    of such records is not exactly one.
    """
    first_prefix_match = None
    num_prefix_matches = 0
    with open(db_path) as db_handle:
        for record in SeqIO.parse(db_handle, 'fasta'):
            if record.id == acc:
                return record
            if record.id.startswith(acc):
                num_prefix_matches += 1
                if first_prefix_match is None:
                    first_prefix_match = record

    if num_prefix_matches < 1:
        print("No accessions start with %s" % acc)
    elif num_prefix_matches > 1:
        print("More than one accession starts with %s" % acc)

    return first_prefix_match
Example #12
0
def run_all_searches(query_file_list,
                     db_file_list,
                     outdir,
                     blast_report_evalue_cutoff,
                     blast_max_target_seqs,
                     hmmer_report_evalue_cutoff,
                     hmmer_report_score_cutoff,
                     num_threads_similarity_searching,
                     main_data_dir,
                     query_dir=None):
    """Run a search for every pairing of the given query and database files.

    The query and database file names are first written to list files in
    outdir. Each query/database pair is then searched with run_any_search,
    except that a '.afaa' query is not searched against a '.fna' database (a
    warning is printed and logged instead). The command description and run
    time of each search, and the total run time, are written to
    '0_search_log.txt' in outdir.

    Query files are looked up in query_dir (DataPaths(main_data_dir)
    .querydirpath if not given) and database files in
    DataPaths(main_data_dir).dbdirpath. Assertions fail if a database name
    contains '.faa' before its final extension, or if a query or database
    path is not a file.
    """
    # Overall start time.
    overall_start = time.time()

    # Resolve the query and database directories.
    if query_dir == None:
        query_dir = DataPaths(main_data_dir).querydirpath
    db_dir = DataPaths(main_data_dir).dbdirpath

    # Record the query file names in the output directory.
    with open(get_out_query_list_path(outdir), 'w') as list_handle:
        for query_name in query_file_list:
            list_handle.write(query_name + '\n')

    # Record the database file names in the output directory.
    with open(get_out_db_list_path(outdir), 'w') as list_handle:
        for db_name in db_file_list:
            list_handle.write(db_name + '\n')

    # Search every query/database pair, logging as we go.
    log_path = os.path.join(outdir, '0_search_log.txt')
    with open(log_path, 'w') as log:
        srch_num = 0
        for query_name in query_file_list:
            for db_name in db_file_list:

                # The database name must look like a single FASTA file.
                assert '.faa' not in db_name.rsplit('.', 1)[0], """The database file
                name %s does not appear to be formatted correctly. This may
                have resulted from a file parsing error.""" % db_name

                # Profile queries are not searched in nucleotide data.
                profile_vs_nucleotide = (
                    query_name.rsplit('.', 1)[1] == 'afaa'
                    and db_name.rsplit('.', 1)[1] == 'fna')

                if profile_vs_nucleotide:
                    warning_text = """\nWARNING: Not searching with profile query %s
                    in nucleotide data %s\n\n""" % (query_name, db_name)
                    print(warning_text)
                    log.write(warning_text)
                else:
                    srch_num += 1

                    # Path of the file the search results go to.
                    result_path = search_result_filepath(
                        query_name, db_name, outdir)

                    # Full query path, which must exist.
                    query_path = os.path.join(query_dir, query_name)
                    assert os.path.isfile(query_path), """Specified query file path is
                    not a file: %s""" % query_path

                    # Full database path, which must exist.
                    db_path = os.path.join(db_dir, db_name)
                    assert os.path.isfile(db_path), """Specified database file path
                    is not a file: %s\n Please ensure that a FASTA file with
                    the filename %s exists in the input data.""" % (db_path,
                                                                    db_name)

                    # Time the individual search.
                    search_started = time.time()

                    # Run the search; the returned text describes the command.
                    description = run_any_search(
                        query_path, db_path, result_path,
                        blast_report_evalue_cutoff, blast_max_target_seqs,
                        hmmer_report_evalue_cutoff, hmmer_report_score_cutoff,
                        num_threads_similarity_searching)
                    log.write(description + '\n')

                    # Log how long this search took.
                    search_seconds = time.time() - search_started
                    log.write(
                        'Run time: ' +
                        str(datetime.timedelta(seconds=search_seconds)) + '\n')

        # Log the total time taken.
        total_seconds = time.time() - overall_start
        log.write('Total run time: ' +
                  str(datetime.timedelta(seconds=total_seconds)) + '\n')
Example #13
0
def run_any_search(queryfile, dbfile, outfile, blast_report_evalue_cutoff,
                   blast_max_target_seqs, hmmer_report_evalue_cutoff,
                   hmmer_report_score_cutoff,
                   num_threads_similarity_searching):
    """Run a single similarity search and return the command line used.

    The search method is chosen by determine_search_method from the
    extensions of queryfile and dbfile. BLAST programs (blastp, tblastn,
    blastx, blastn) are run with XML output ('-outfmt 5'), the given E-value
    cutoff and maximum number of target sequences. HMMER programs (hmmsearch,
    hmmscan, nhmmer) are run with the given score cutoff ('-T'); hmmsearch
    and nhmmer use the HMM file given by get_out_hmm_path(queryfile) as the
    query. hmmer_report_evalue_cutoff is accepted but not used in any command.

    Returns the command as a single space-separated string.

    Raises AssertionError if the method is not one of those listed above, or
    if the output file is empty after the search.
    """
    # Determine method to use based on the input file types.
    query_exten = queryfile.rsplit('.', 1)[1]
    dbfile_exten = dbfile.rsplit('.', 1)[1]
    method = determine_search_method(query_exten, dbfile_exten)

    # Command line arguments must be strings.
    blast_evalcut = str(blast_report_evalue_cutoff)
    blast_max_target_seqs = str(blast_max_target_seqs)
    hmmer_scorecut = str(hmmer_report_score_cutoff)
    num_threads = str(num_threads_similarity_searching)

    # Construct search command.
    run_command = []
    if method in ('blastp', 'tblastn', 'blastx', 'blastn'):
        # All BLAST programs share the same basic options.
        run_command = [
            method, '-query', queryfile, '-db', dbfile, '-out', outfile,
            '-num_threads', num_threads, '-outfmt', '5', '-evalue',
            blast_evalcut, '-max_target_seqs', blast_max_target_seqs
        ]
        if method == 'tblastn':
            # Set the genetic code, falling back to the standard code ('1').
            # NOTE(review): main_data_dir is not a parameter of this function,
            # so this lookup only succeeds if a module-level main_data_dir
            # exists; otherwise the NameError is caught and '1' is always
            # used. Confirm whether main_data_dir should be passed in.
            try:
                tblastn_ncbi_gen_code = str(
                    DataPaths(main_data_dir).tblastn_ncbi_gen_code)
            except Exception:
                tblastn_ncbi_gen_code = '1'
            run_command += ['-db_gencode', tblastn_ncbi_gen_code]
    elif method in ('hmmsearch', 'nhmmer'):
        # Use HMM file rather than the '.afaa'/'.afna' alignment file.
        actual_queryfile = get_out_hmm_path(queryfile)
        run_command = [
            method, "-T", hmmer_scorecut, "--cpu", num_threads, '-o', outfile,
            actual_queryfile, dbfile
        ]
    elif method == 'hmmscan':
        # hmmscan takes the database before the query.
        run_command = [
            method, "-T", hmmer_scorecut, "--cpu", num_threads, '-o', outfile,
            dbfile, queryfile
        ]

    # Fail with a clear message rather than handing an empty command to
    # subprocess.
    assert run_command, """Error: Unrecognized search method: %s""" % method

    # Run command.
    subprocess.call(run_command)

    # Check that the output file is not empty.
    assert os.path.getsize(outfile) != 0, """Error: Search output file is
    empty. This may be due to low memory available on this system.\nEmpty file:
    %s""" % outfile

    # Return string with command used to run search.
    search_descr = ' '.join(run_command)
    return search_descr