Example #1
    def test_parse_first_database(self):
        """Parse first db should build best_hits correctly"""

        out_total_queries, out_best_hits = parse_first_database(self.db1, [0.80], [50])
        self.assertEquals(out_total_queries, 2)
        self.assertEquals(
            out_best_hits,
            {
                "HABJ36W02EXF44": [
                    {
                        "a": {
                            "evalue": 0.0,
                            "subject_id": "NZ_ABEH01000018_641736102",
                            "bit_score": 1005.0,
                            "percentage_id": 99.42,
                            "alg_length": 519,
                        },
                        "b": {"subject_id": None, "bit_score": -1},
                    }
                ],
                "HABJ36W02DLDSY": [
                    {
                        "a": {
                            "evalue": 0.0,
                            "subject_id": "NZ_ABEH01000005_641736102",
                            "bit_score": 959.0,
                            "percentage_id": 99.22,
                            "alg_length": 512,
                        },
                        "b": {"subject_id": None, "bit_score": -1},
                    }
                ],
            },
        )
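
The expected value above also documents the shape of best_hits: each query id
maps to a list with one entry per filter combination, where 'a' holds the best
hit found in the first (interest) database and 'b' is a placeholder
(subject_id None, bit_score -1), presumably filled in once the second database
is parsed. A minimal sketch of consuming that structure (the helper name
best_subjects is hypothetical):

def best_subjects(best_hits):
    """Map each query to its best first-database subject and bit score.

    A sketch only; it assumes the structure asserted above and inspects just
    the first filter combination.
    """
    summary = {}
    for query_id, filter_results in best_hits.items():
        first = filter_results[0]['a']  # best hit in the interest database
        summary[query_id] = (first['subject_id'], first['bit_score'])
    return summary

# e.g. {'HABJ36W02EXF44': ('NZ_ABEH01000018_641736102', 1005.0), ...}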
Example #3
    def test_parse_first_database(self):
        """Parse first db should build best_hits correctly"""

        out_total_queries, out_best_hits = parse_first_database(self.db1,
                                                                [.80], [50])
        self.assertEquals(out_total_queries, 4)
        self.assertEquals(
            out_best_hits,
            {'HABJ36W02EXF44': [
                {'a': {'evalue': 0.0,
                       'subject_id': 'NZ_ABEH01000018_641736102',
                       'bit_score': 1005.0, 'percentage_id': 99.42,
                       'alg_length': 519},
                 'b': {'subject_id': None, 'bit_score': -1}}],
             'BLANK-TEST-NOT-IN-SECOND': [
                {'a': {'evalue': 4e-133,
                       'subject_id': 'NZ_ACZD01000120_647000262',
                       'bit_score': 482.0, 'percentage_id': 88.79,
                       'alg_length': 455},
                 'b': {'subject_id': None, 'bit_score': -1}}],
             'HABJ36W02DLDSY': [
                {'a': {'evalue': 0.0,
                       'subject_id': 'NZ_ABEH01000005_641736102',
                       'bit_score': 959.0, 'percentage_id': 99.22,
                       'alg_length': 512},
                 'b': {'subject_id': None, 'bit_score': -1}}]})
Example #4
def compare(interest_fp, other_fp, output_dir='blast-results-compare',
            interest_pcts=None, interest_alg_lens=None, other_pcts=None,
            other_alg_lens=None, hits_to_first=False, hits_to_second=False):
    """Compare two databases and write the outputs

    Parameters
    ----------
    interest_fp : str
        BLAST results when searching against the database of interest.
    other_fp : str
        BLAST results when searching against the other database.
    output_dir : str, optional
        Path to the output directory.
    interest_pcts : list, optional
        Minimum percentage identity to be considered as a valid result in the
        interest database search results. If None is passed, it defaults to
        `[70]`.
    interest_alg_lens : list, optional
        Minimum alignment length to be considered a valid result in the
        interest database search results. If None is passed, it defaults to
        `[50]`.
    other_pcts : list, optional
        Minimum percentage identity to be considered as a valid result in the
        other database search results. If None is passed, it defaults to
        `[70]`.
    other_alg_lens : list, optional
        Minimum alignment length to be considered a valid result in the
        other database search results. If None is passed, it defaults to
        `[50]`.
    hits_to_first : bool, optional
        Outputs the labels and counts of the sequences being hit in the first
        database. Defaults to False.
    hits_to_second : bool, optional
        Outputs the labels and counts of the sequences being hit in the second
        database. Defaults to False.

    Raises
    ------
    click.BadParameter
        If the `interest_pcts` and the `other_pcts` lists are of different
        length.
        If the `interest_alg_lens` and the `other_alg_lens` lists are of
        different length.
    """

    if interest_pcts is None:
        interest_pcts = [70]
    if interest_alg_lens is None:
        interest_alg_lens = [50]

    db_a = open(interest_fp, 'U')
    db_b = open(other_fp, 'U')

    # try to create the output directory, if it exists, just continue
    create_dir(output_dir, False)

    # run some validations on the input parameters
    if other_pcts:
        if len(interest_pcts) != len(other_pcts):
            raise BadParameter("The percentage values for both databases "
                               "should be the same length: %s - %s" %
                               (interest_pcts, other_pcts))
    else:
        other_pcts = interest_pcts

    if other_alg_lens:
        if len(interest_alg_lens) != len(other_alg_lens):
            raise BadParameter("The alignment length values for both databases"
                               " should be the length : %s - %s" %
                               (interest_alg_lens, other_alg_lens))
    else:
        other_alg_lens = interest_alg_lens

    # process databases
    total_queries, best_hits = parse_first_database(db_a, interest_pcts,
                                                    interest_alg_lens)
    parse_second_database(db_b, best_hits, other_pcts,
                          other_alg_lens)

    # parse results
    results = process_results(interest_pcts, interest_alg_lens,
                              other_pcts, other_alg_lens, best_hits)

    # Collating output and writing full results
    for i, item in enumerate(results):
        filename = join(output_dir, "summary_" + item['filename'] + ".txt")

        with open(filename, 'w') as fd:
            fd.write('\n'.join(item['summary']))

        if i == 0:
            combined_results = []
            combined_results.append(['filename'])
            combined_results.append(['interest db (%s)' %
                                     basename(interest_fp)])
            combined_results.append(['other db (%s)' % basename(other_fp)])
            combined_results.append(['only interest'])
            combined_results.append(['both dbs'])
            combined_results.append(['no hits in interest db'])

        no_hits = total_queries - item['db_interest'] - item['db_other'] - \
            item['perfect_interest'] - item['equal']

        combined_results[0].append(item['filename'])
        combined_results[1].append(str(item['db_interest']))
        combined_results[2].append(str(item['db_other']))
        combined_results[3].append(str(item['perfect_interest']))
        combined_results[4].append(str(item['equal']))
        combined_results[5].append(str(no_hits))

        # tiny helper function to save hits files
        def save_hits(data, name):

            s_hits = sorted(data, key=itemgetter(1), reverse=True)
            filename = join(output_dir, name)
            with open(filename, 'w') as fd:
                fd.write('\n'.join(['%s\t%d' % (k, v)
                                    for k, v in s_hits if v != 0]))

        if hits_to_first:
            save_hits(item['db_seqs_counts']['a'].items(),
                      "hits_to_first_db_%s.txt" % item['filename'])

        if hits_to_second:
            save_hits(item['db_seqs_counts']['b'].items(),
                      "hits_to_second_db_%s.txt" % item['filename'])

    # saving collated results
    with open(join(output_dir, "compile_output.txt"), 'w') as compiled_output:
        compiled_output.write('\n'.join(['\t'.join(item)
                                         for item in combined_results]))

    fn = join(output_dir, "compile_output_no_nohits.txt")
    with open(fn, 'w') as fd:
        fd.write('\n'.join(['\t'.join(item)
                            for item in combined_results[:-1]]))
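
A minimal usage sketch for the function above; the two input file names are
hypothetical and are opened by compare() itself, and the thresholds are just
example values:

compare('vs_interest_db.txt', 'vs_other_db.txt',
        output_dir='blast-results-compare',
        interest_pcts=[70, 90], interest_alg_lens=[50, 100],
        hits_to_first=True)
# Writes one summary_*.txt (and, here, one hits_to_first_db_*.txt) per result
# item, plus the collated compile_output.txt and compile_output_no_nohits.txt.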
Example #5
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    # turn a comma-separated list of numbers into a 1-D list of integers
    list_of_ints = lambda string: map(int, string.split(","))

    input_path_interest = opts.input_path_interest
    input_path_other = opts.input_path_other
    percentage_ids = list_of_ints(opts.percentage_ids)
    alignment_lengths = list_of_ints(opts.alignment_lengths)
    hits_to_1st = opts.hits_to_1st
    hits_to_2nd = opts.hits_to_2nd
    percentage_ids_other = opts.percentage_ids_other
    alignment_lengths_other = opts.alignment_lengths_other
    output_dir = opts.output_dir

    db_a = open(input_path_interest, "U")
    db_b = open(input_path_other, "U")

    # try to create the output directory, if it exists, just continue
    create_dir(output_dir, False)

    # run some validations on the input parameters
    if percentage_ids_other:
        percentage_ids_other = list_of_ints(percentage_ids_other)
        if len(percentage_ids) != len(percentage_ids_other):
            option_parser.error(
                "The percentage values for both databases "
                "should be the same length: %s - %s" % (percentage_ids, percentage_ids_other)
            )
    else:
        percentage_ids_other = percentage_ids

    if alignment_lengths_other:
        alignment_lengths_other = list_of_ints(alignment_lengths_other)

        if len(alignment_lengths) != len(alignment_lengths_other):
            option_parser.error(
                "The alignment length values for both databases"
                " should be the length : %s - %s" % (alignment_lengths, alignment_lengths_other)
            )
    else:
        alignment_lengths_other = alignment_lengths

    # Process databases
    total_queries, best_hits = parse_first_database(db_a, percentage_ids, alignment_lengths)
    parse_second_database(db_b, best_hits, percentage_ids_other, alignment_lengths_other)

    # Parse results
    results = process_results(
        percentage_ids,
        alignment_lengths,
        percentage_ids_other,
        alignment_lengths_other,
        best_hits,
        input_path_interest,
        input_path_other,
    )

    # Collating output and writing full results
    for i, item in enumerate(results):
        filename = join(output_dir, "summary_" + item["filename"] + ".txt")

        fd = open(filename, "w")
        fd.write("\n".join(item["summary"]))
        fd.close()

        if i == 0:
            combined_results = []
            combined_results.append(["filename"])
            combined_results.append(["interestdb (%s)" % input_path_interest])
            combined_results.append(["other db (%s)" % input_path_other])
            combined_results.append(["only interest"])
            combined_results.append(["both dbs"])
            combined_results.append(["no hits in interest db"])

        no_hits = total_queries - item["db_interest"] - item["db_other"] - item["perfect_interest"] - item["equal"]
        combined_results[0].append(item["filename"])
        combined_results[1].append(str(item["db_interest"]))
        combined_results[2].append(str(item["db_other"]))
        combined_results[3].append(str(item["perfect_interest"]))
        combined_results[4].append(str(item["equal"]))
        combined_results[5].append(str(no_hits))

        # Printing count of hits to the db
        if hits_to_1st:
            s_hits = sorted(item["db_seqs_counts"]["a"].items(), key=itemgetter(1), reverse=True)

            filename = join(output_dir, "hits_to_1st_db_" + item["filename"] + ".txt")

            fd = open(filename, "w")
            fd.write("\n".join(["%s\t%d" % (k, v) for k, v in s_hits if v != 0]))
            fd.close()

        if hits_to_2nd:
            s_hits = sorted(item["db_seqs_counts"]["b"].items(), key=itemgetter(1), reverse=True)

            filename = join(output_dir, "hits_to_2nd_db_" + item["filename"] + ".txt")

            fd = open(filename, "w")
            fd.write("\n".join(["%s: %d" % (k, v) for k, v in s_hits if v != 0]))
            fd.close()

    # Printing collated results
    compiled_output_fd = open(join(output_dir, "compile_output.txt"), "w")
    compiled_output_fd.write("\n".join(["\t".join(item) for item in combined_results]))
    compiled_output_fd.close()

    compiled_output_no_hits_fd = open(join(output_dir, "compile_output_no_nohits.txt"), "w")
    compiled_output_no_hits_fd.write("\n".join(["\t".join(item) for item in combined_results[:-1]]))
    compiled_output_no_hits_fd.close()
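
The list_of_ints helper above is the only parsing applied to the threshold
options and relies on Python 2's map() returning a list. A Python 3 friendly
sketch of the same helper:

def list_of_ints(string):
    # Turn a comma-separated list of numbers into a list of integers,
    # e.g. "97,95,90" -> [97, 95, 90]; the explicit list() keeps the later
    # len() comparisons working on Python 3, where map() returns an iterator.
    return list(map(int, string.split(",")))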
Example #6
def compare(interest_fp,
            other_fp,
            output_dir='blast-results-compare',
            interest_pcts=None,
            interest_alg_lens=None,
            other_pcts=None,
            other_alg_lens=None,
            hits_to_first=False,
            hits_to_second=False):
    """Compare two databases and write the outputs

    Parameters
    ----------
    interest_fp : str
        BLAST results when searching against the database of interest.
    other_fp : str
        BLAST results when searching against the other database.
    output_dir : str, optional
        Path to the output directory.
    interest_pcts : list, optional
        Minimum percentage identity to be considered as a valid result in the
        interest database search results. If None is passed, it defaults to
        `[70]`.
    interest_alg_lens : list, optional
        Minimum alignment length to be considered a valid result in the
        interest database search results. If None is passed, it defaults to
        `[50]`.
    other_pcts : list, optional
        Minimum percentage identity to be considered as a valid result in the
        other database search results. If None is passed, it defaults to
        `[70]`.
    other_alg_lens : list, optional
        Minimum alignment length to be considered a valid result in the
        other database search results. If None is passed, it defaults to
        `[50]`.
    hits_to_first : bool, optional
        Outputs the labels and counts of the sequences being hit in the first
        database. Defaults to False.
    hits_to_second : bool, optional
        Outputs the labels and counts of the sequences being hit in the second
        database. Defaults to False.

    Raises
    ------
    click.BadParameter
        If the `interest_pcts` and the `other_pcts` lists are of different
        length.
        If the `interest_alg_lens` and the `other_alg_lens` lists are of
        different length.
    """

    if interest_pcts is None:
        interest_pcts = [70]
    if interest_alg_lens is None:
        interest_alg_lens = [50]

    db_a = open(interest_fp, 'U')
    db_b = open(other_fp, 'U')

    # try to create the output directory, if it exists, just continue
    create_dir(output_dir, False)

    # run some validations on the input parameters
    if other_pcts:
        if len(interest_pcts) != len(other_pcts):
            raise BadParameter("The percentage values for both databases "
                               "should be the same length: %s - %s" %
                               (interest_pcts, other_pcts))
    else:
        other_pcts = interest_pcts

    if other_alg_lens:
        if len(interest_alg_lens) != len(other_alg_lens):
            raise BadParameter("The alignment length values for both databases"
                               " should be the length : %s - %s" %
                               (interest_alg_lens, other_alg_lens))
    else:
        other_alg_lens = interest_alg_lens

    # process databases
    total_queries, best_hits = parse_first_database(db_a, interest_pcts,
                                                    interest_alg_lens)
    parse_second_database(db_b, best_hits, other_pcts, other_alg_lens)

    # parse results
    results = process_results(interest_pcts, interest_alg_lens, other_pcts,
                              other_alg_lens, best_hits)

    # Collating output and writing full results
    for i, item in enumerate(results):
        filename = join(output_dir, "summary_" + item['filename'] + ".txt")

        with open(filename, 'w') as fd:
            fd.write('\n'.join(item['summary']))

        if i == 0:
            combined_results = []
            combined_results.append(['filename'])
            combined_results.append(
                ['interest db (%s)' % basename(interest_fp)])
            combined_results.append(['other db (%s)' % basename(other_fp)])
            combined_results.append(['only interest'])
            combined_results.append(['both dbs'])
            combined_results.append(['no hits in interest db'])

        no_hits = total_queries - item['db_interest'] - item['db_other'] - \
            item['perfect_interest'] - item['equal']

        combined_results[0].append(item['filename'])
        combined_results[1].append(str(item['db_interest']))
        combined_results[2].append(str(item['db_other']))
        combined_results[3].append(str(item['perfect_interest']))
        combined_results[4].append(str(item['equal']))
        combined_results[5].append(str(no_hits))

        # tiny helper function to save hits files
        def save_hits(data, name):

            s_hits = sorted(data, key=itemgetter(1), reverse=True)
            filename = join(output_dir, name)
            with open(filename, 'w') as fd:
                fd.write('\n'.join(
                    ['%s\t%d' % (k, v) for k, v in s_hits if v != 0]))

        if hits_to_first:
            save_hits(item['db_seqs_counts']['a'].items(),
                      "hits_to_first_db_%s.txt" % item['filename'])

        if hits_to_second:
            save_hits(item['db_seqs_counts']['b'].items(),
                      "hits_to_second_db_%s.txt" % item['filename'])

    # saving collated results
    with open(join(output_dir, "compile_output.txt"), 'w') as compiled_output:
        compiled_output.write('\n'.join(
            ['\t'.join(item) for item in combined_results]))

    fn = join(output_dir, "compile_output_no_nohits.txt")
    with open(fn, 'w') as fd:
        fd.write('\n'.join(['\t'.join(item)
                            for item in combined_results[:-1]]))
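
For reference, compile_output.txt ends up as a small tab-separated table with
one labelled row per metric and one column per result item; a sketch of
reading it back (the path assumes the default output_dir):

import csv

# Row labels written above: 'filename', 'interest db (...)', 'other db (...)',
# 'only interest', 'both dbs', 'no hits in interest db'.
with open('blast-results-compare/compile_output.txt') as fh:
    rows = {row[0]: row[1:] for row in csv.reader(fh, delimiter='\t')}

no_hits_per_filter = rows['no hits in interest db']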