Example #1
0
    def test_process_results(self):
        """Check results are processed and summarized correctly.

        Both queries only carry a real hit for database ``a``; their ``b``
        entry has no subject id and a bit score of -1. The expected output
        therefore reports both of them under ``perfect_interest`` and lists
        an empty second column in the summary.
        """
        best_hits = {
            "HABJ36W02EXF44": [
                {
                    "a": {
                        "evalue": 0.0,
                        "subject_id": "NZ_ABEH01000018_641736102",
                        "bit_score": 1005.0,
                        "percentage_id": 99.42,
                        "alg_length": 519,
                    },
                    "b": {"subject_id": None, "bit_score": -1},
                }
            ],
            "HABJ36W02DLDSY": [
                {
                    "a": {
                        "evalue": 0.0,
                        "subject_id": "NZ_ABEH01000005_641736102",
                        "bit_score": 959.0,
                        "percentage_id": 99.22,
                        "alg_length": 512,
                    },
                    "b": {"subject_id": None, "bit_score": -1},
                }
            ],
        }

        # every threshold list holds one value; a single result is expected
        out_results = process_results([0.80], [50], [0.30], [30], best_hits)

        # assertEqual is used because the assertEquals alias is deprecated
        self.assertEqual(
            out_results,
            [
                {
                    "db_interest": 0,
                    "db_other": 0,
                    "db_seqs_counts": {
                        "a": {"NZ_ABEH01000005_641736102": 1, "NZ_ABEH01000018_641736102": 1},
                        "b": {None: 0},
                    },
                    "perfect_interest": 2,
                    "equal": 0,
                    "summary": [
                        "#SeqId\tFirst\tSecond",
                        "HABJ36W02EXF44\tNZ_ABEH01000018_641736102\t",
                        "HABJ36W02DLDSY\tNZ_ABEH01000005_641736102\t",
                    ],
                    "filename": "p1_0-a1_50_p2_0-a2_30",
                }
            ],
        )
    def test_process_results(self):
        """Check results are processed and summarized correctly.

        The input mixes four kinds of queries: two that only have a real hit
        in database ``a``, one whose ``a`` and ``b`` hits carry identical
        scores (``SAME-VALUES``) and one whose ``b`` hit has the higher
        scores (``OTHER-BETTER``). The expected output pins the counters, the
        per-database subject counts, the summary lines and the file name.
        """
        best_hits = {
            # hit in 'a' only: 'b' has no subject id and a bit score of -1
            'HABJ36W02EXF44': [{
                'a': {'evalue': 0.0, 'subject_id': 'NZ_ABEH01000018_641736102',
                      'bit_score': 1005.0, 'percentage_id': 99.42,
                      'alg_length': 519},
                'b': {'subject_id': None, 'bit_score': -1}}],
            'HABJ36W02DLDSY': [{
                'a': {'evalue': 0.0, 'subject_id': 'NZ_ABEH01000005_641736102',
                      'bit_score': 959.0, 'percentage_id': 99.22,
                      'alg_length': 512},
                'b': {'subject_id': None, 'bit_score': -1}}],
            # identical values for both databases
            'SAME-VALUES': [{
                'a': {'evalue': 0.0, 'subject_id': 'RESULT-A',
                      'bit_score': 959.0, 'percentage_id': 99.22,
                      'alg_length': 512},
                'b': {'evalue': 0.0, 'subject_id': 'RESULT-B',
                      'bit_score': 959.0, 'percentage_id': 99.22,
                      'alg_length': 512}}],
            # the hit in 'b' has the higher bit score, identity and length
            'OTHER-BETTER': [{
                'a': {'evalue': 0.0, 'subject_id': 'RESULT-A',
                      'bit_score': 10.0, 'percentage_id': 10.0,
                      'alg_length': 10},
                'b': {'evalue': 0.0, 'subject_id': 'RESULT-B',
                      'bit_score': 959.0, 'percentage_id': 100,
                      'alg_length': 900}}]
        }

        out_results = process_results([0.80], [50], [0.30], [30], best_hits)

        # assertEqual is used because the assertEquals alias is deprecated
        self.assertEqual(out_results, [{
            'db_interest': 0, 'db_other': 1, 'db_seqs_counts': {
                'a': {'NZ_ABEH01000005_641736102': 1,
                      'RESULT-A': 1,
                      'NZ_ABEH01000018_641736102': 1},
                'b': {None: 0, 'RESULT-B': 2}},
            'perfect_interest': 2, 'equal': 1, 'summary':
            ['#SeqId\tFirst\tSecond',
             'HABJ36W02EXF44\tNZ_ABEH01000018_641736102\t',
             'OTHER-BETTER\n\t',
             'SAME-VALUES\tRESULT-A\tRESULT-B',
             'HABJ36W02DLDSY\tNZ_ABEH01000005_641736102\t'],
            'filename': 'p1_0-a1_50_p2_0-a2_30'}])
Example #3
0
    def test_process_results(self):
        """Check results are processed and summarized correctly.

        Four queries are fed in: two with a real hit only in database ``a``,
        ``SAME-VALUES`` with identical scores in ``a`` and ``b``, and
        ``OTHER-BETTER`` where the ``b`` hit has the higher scores. The
        assertion covers the counters, the per-database subject counts, the
        summary lines and the generated file name.
        """
        best_hits = {
            # hit in 'a' only: 'b' has no subject id and a bit score of -1
            'HABJ36W02EXF44': [{
                'a': {'evalue': 0.0, 'subject_id': 'NZ_ABEH01000018_641736102',
                      'bit_score': 1005.0, 'percentage_id': 99.42,
                      'alg_length': 519},
                'b': {'subject_id': None, 'bit_score': -1}}],
            'HABJ36W02DLDSY': [{
                'a': {'evalue': 0.0, 'subject_id': 'NZ_ABEH01000005_641736102',
                      'bit_score': 959.0, 'percentage_id': 99.22,
                      'alg_length': 512},
                'b': {'subject_id': None, 'bit_score': -1}}],
            # identical values for both databases
            'SAME-VALUES': [{
                'a': {'evalue': 0.0, 'subject_id': 'RESULT-A',
                      'bit_score': 959.0, 'percentage_id': 99.22,
                      'alg_length': 512},
                'b': {'evalue': 0.0, 'subject_id': 'RESULT-B',
                      'bit_score': 959.0, 'percentage_id': 99.22,
                      'alg_length': 512}}],
            # the hit in 'b' has the higher bit score, identity and length
            'OTHER-BETTER': [{
                'a': {'evalue': 0.0, 'subject_id': 'RESULT-A',
                      'bit_score': 10.0, 'percentage_id': 10.0,
                      'alg_length': 10},
                'b': {'evalue': 0.0, 'subject_id': 'RESULT-B',
                      'bit_score': 959.0, 'percentage_id': 100,
                      'alg_length': 900}}]
        }

        out_results = process_results([0.80], [50], [0.30], [30], best_hits)

        # assertEqual is used because the assertEquals alias is deprecated
        self.assertEqual(out_results, [{
            'db_interest': 0, 'db_other': 1, 'db_seqs_counts': {
                'a': {'NZ_ABEH01000005_641736102': 1,
                      'RESULT-A': 1,
                      'NZ_ABEH01000018_641736102': 1},
                'b': {None: 0, 'RESULT-B': 2}},
            'perfect_interest': 2, 'equal': 1, 'summary':
            ['#SeqId\tFirst\tSecond',
             'HABJ36W02EXF44\tNZ_ABEH01000018_641736102\t',
             'OTHER-BETTER\n\t',
             'SAME-VALUES\tRESULT-A\tRESULT-B',
             'HABJ36W02DLDSY\tNZ_ABEH01000005_641736102\t'],
            'filename': 'p1_0-a1_50_p2_0-a2_30'}])
Example #4
0
    def test_process_results(self):
        """Check results are processed and summarized correctly.

        Besides queries with hits in one or both databases, the input holds
        a query without any values (``'NO-VALS': [None]``). The
        ``summary_fh`` and ``db_seqs_counts`` entries are dropped from the
        result before comparing, so only the counters and the file name are
        verified here.
        """
        best_hits = {
            # hit in 'a' only: 'b' has no subject id and a bit score of -1
            'HABJ36W02EXF44': [{
                'a': {'evalue': 0.0, 'subject_id': 'NZ_ABEH01000018_641736102',
                      'bit_score': 1005.0, 'percentage_id': 99.42,
                      'alg_length': 519},
                'b': {'subject_id': None, 'bit_score': -1}}],
            'HABJ36W02DLDSY': [{
                'a': {'evalue': 0.0, 'subject_id': 'NZ_ABEH01000005_641736102',
                      'bit_score': 959.0, 'percentage_id': 99.22,
                      'alg_length': 512},
                'b': {'subject_id': None, 'bit_score': -1}}],
            # identical values for both databases
            'SAME-VALUES': [{
                'a': {'evalue': 0.0, 'subject_id': 'RESULT-A',
                      'bit_score': 959.0, 'percentage_id': 99.22,
                      'alg_length': 512},
                'b': {'evalue': 0.0, 'subject_id': 'RESULT-B',
                      'bit_score': 959.0, 'percentage_id': 99.22,
                      'alg_length': 512}}],
            # the hit in 'b' has the higher bit score, identity and length
            'OTHER-BETTER': [{
                'a': {'evalue': 0.0, 'subject_id': 'RESULT-A',
                      'bit_score': 10.0, 'percentage_id': 10.0,
                      'alg_length': 10},
                'b': {'evalue': 0.0, 'subject_id': 'RESULT-B',
                      'bit_score': 959.0, 'percentage_id': 100,
                      'alg_length': 900}}],
            # query without any hit information
            'NO-VALS': [None]
        }

        out_results = process_results([0.80], [50], [0.30], [30], best_hits,
                                      self.base, False, False)
        # removing the file pointers so we don't need to test them; pop()
        # raises KeyError if either key is missing from the result
        out_results[0].pop('summary_fh')
        out_results[0].pop('db_seqs_counts')

        # assertEqual is used because the assertEquals alias is deprecated
        self.assertEqual(out_results, [{
            'db_interest': 0, 'db_other': 1, 'perfect_interest': 2, 'equal': 1,
            'filename': 'p1_0-a1_50_p2_0-a2_30'}])
Example #5
0
    def test_process_results(self):
        """Check results are processed and summarized correctly.

        The input contains queries with hits in one or both databases plus a
        query without any values (``'NO-VALS': [None]``). Only the counters
        and the file name are compared: ``summary_fh`` and
        ``db_seqs_counts`` are removed from the result beforehand.
        """
        best_hits = {
            # hit in 'a' only: 'b' has no subject id and a bit score of -1
            'HABJ36W02EXF44': [{
                'a': {'evalue': 0.0, 'subject_id': 'NZ_ABEH01000018_641736102',
                      'bit_score': 1005.0, 'percentage_id': 99.42,
                      'alg_length': 519},
                'b': {'subject_id': None, 'bit_score': -1}}],
            'HABJ36W02DLDSY': [{
                'a': {'evalue': 0.0, 'subject_id': 'NZ_ABEH01000005_641736102',
                      'bit_score': 959.0, 'percentage_id': 99.22,
                      'alg_length': 512},
                'b': {'subject_id': None, 'bit_score': -1}}],
            # identical values for both databases
            'SAME-VALUES': [{
                'a': {'evalue': 0.0, 'subject_id': 'RESULT-A',
                      'bit_score': 959.0, 'percentage_id': 99.22,
                      'alg_length': 512},
                'b': {'evalue': 0.0, 'subject_id': 'RESULT-B',
                      'bit_score': 959.0, 'percentage_id': 99.22,
                      'alg_length': 512}}],
            # the hit in 'b' has the higher bit score, identity and length
            'OTHER-BETTER': [{
                'a': {'evalue': 0.0, 'subject_id': 'RESULT-A',
                      'bit_score': 10.0, 'percentage_id': 10.0,
                      'alg_length': 10},
                'b': {'evalue': 0.0, 'subject_id': 'RESULT-B',
                      'bit_score': 959.0, 'percentage_id': 100,
                      'alg_length': 900}}],
            # query without any hit information
            'NO-VALS': [None]
        }

        out_results = process_results([0.80], [50], [0.30], [30], best_hits,
                                      self.base, False, False)
        # removing the file pointers so we don't need to test them; pop()
        # raises KeyError if either key is missing from the result
        out_results[0].pop('summary_fh')
        out_results[0].pop('db_seqs_counts')

        # assertEqual is used because the assertEquals alias is deprecated
        self.assertEqual(out_results, [{
            'db_interest': 0, 'db_other': 1, 'perfect_interest': 2, 'equal': 1,
            'filename': 'p1_0-a1_50_p2_0-a2_30'}])
Example #6
0
def compare(interest_fp, other_fp, output_dir='blast-results-compare',
            interest_pcts=None, interest_alg_lens=None, other_pcts=None,
            other_alg_lens=None, hits_to_first=False, hits_to_second=False):
    """Compare two databases and write the outputs

    For every result returned by `process_results` a summary file is
    written to `output_dir`, optionally followed by the per-database hit
    counts. Two collated tables (`compile_output.txt` and
    `compile_output_no_nohits.txt`) are written at the end.

    Parameters
    ----------
    interest_fp : str
        BLAST results when searching against the database of interest.
    other_fp : str
        BLAST results when searching against the other database.
    output_dir : str, optional
        Name of the output file path.
    interest_pcts : list, optional
        Minimum percentage identity to be considered as a valid result in the
        interest database search results. If None is passed, it defaults to
        `[70]`.
    interest_alg_lens : list, optional
        Minimum alignment length to be considered a valid result in the
        interest database search results. If None is passed, it defaults to
        `[50]`.
    other_pcts : list, optional
        Minimum percentage identity to be considered as a valid result in the
        other database search results. If None (or an empty list) is passed,
        the values in `interest_pcts` are used.
    other_alg_lens : list, optional
        Minimum alignment length to be considered a valid result in the
        other database search results. If None (or an empty list) is passed,
        the values in `interest_alg_lens` are used.
    hits_to_first : bool, optional defaults to False
        Outputs the labels and counts of the sequences being hit in the first
        database.
    hits_to_second : bool, optional defaults to False
        Outputs the labels and counts of the sequences being hit in the second
        database.

    Raises
    ------
    click.BadParameter
        If the `interest_pcts` and the `other_pcts` lists are of different
        length.
        If the `interest_alg_lens` and the `other_alg_lens` lists are of
        different length.
    """

    if interest_pcts is None:
        interest_pcts = [70]
    if interest_alg_lens is None:
        interest_alg_lens = [50]

    # The context manager guarantees both inputs are closed, even when the
    # validation or the parsing below raises.
    # NOTE(review): the 'U' mode is kept from the original code; it was
    # removed in Python 3.11 -- confirm the supported interpreter versions
    # before changing it.
    with open(interest_fp, 'U') as db_a, open(other_fp, 'U') as db_b:
        # try to create the output directory, if it exists, just continue
        create_dir(output_dir, False)

        # run some validations on the input parameters
        if other_pcts:
            if len(interest_pcts) != len(other_pcts):
                raise BadParameter("The percentage values for both databases "
                                   "should be the same length: %s - %s" %
                                   (interest_pcts, other_pcts))
        else:
            other_pcts = interest_pcts

        if other_alg_lens:
            if len(interest_alg_lens) != len(other_alg_lens):
                raise BadParameter("The alignment length values for both "
                                   "databases should be the length : %s - %s"
                                   % (interest_alg_lens, other_alg_lens))
        else:
            other_alg_lens = interest_alg_lens

        # process databases
        total_queries, best_hits = parse_first_database(db_a, interest_pcts,
                                                        interest_alg_lens)
        parse_second_database(db_b, best_hits, other_pcts,
                              other_alg_lens)

    # parse results
    results = process_results(interest_pcts, interest_alg_lens,
                              other_pcts, other_alg_lens, best_hits)

    # tiny helper function to save hits files, defined once instead of on
    # every iteration of the loop below
    def save_hits(data, name):
        # highest counts first; entries with a count of zero are skipped
        s_hits = sorted(data, key=itemgetter(1), reverse=True)
        with open(join(output_dir, name), 'w') as fd:
            fd.write('\n'.join(['%s\t%d' % (k, v)
                                for k, v in s_hits if v != 0]))

    # One row per reported quantity; the first cell is the row label and one
    # cell is appended per result. Building the table before the loop keeps
    # it defined even when `results` is empty.
    combined_results = [
        ['filename'],
        ['interest db (%s)' % basename(interest_fp)],
        ['other db (%s)' % basename(other_fp)],
        ['only interest'],
        ['both dbs'],
        ['no hits in interest db'],
    ]

    # Collating output and writing full results
    for item in results:
        filename = join(output_dir, "summary_" + item['filename'] + ".txt")

        with open(filename, 'w') as fd:
            fd.write('\n'.join(item['summary']))

        # queries that were not counted in any of the four categories
        no_hits = total_queries - item['db_interest'] - item['db_other'] - \
            item['perfect_interest'] - item['equal']

        combined_results[0].append(item['filename'])
        combined_results[1].append(str(item['db_interest']))
        combined_results[2].append(str(item['db_other']))
        combined_results[3].append(str(item['perfect_interest']))
        combined_results[4].append(str(item['equal']))
        combined_results[5].append(str(no_hits))

        if hits_to_first:
            save_hits(item['db_seqs_counts']['a'].items(),
                      "hits_to_first_db_%s.txt" % item['filename'])

        if hits_to_second:
            save_hits(item['db_seqs_counts']['b'].items(),
                      "hits_to_second_db_%s.txt" % item['filename'])

    # saving collated results
    with open(join(output_dir, "compile_output.txt"), 'w') as compiled_output:
        compiled_output.write('\n'.join(['\t'.join(item)
                                         for item in combined_results]))

    # same table without the trailing "no hits" row
    fn = join(output_dir, "compile_output_no_nohits.txt")
    with open(fn, 'w') as fd:
        fd.write('\n'.join(['\t'.join(item)
                            for item in combined_results[:-1]]))
Example #7
0
def main():
    """Compare two BLAST result files using the command line options.

    Reads the options through `parse_command_line_parameters`, parses both
    input files, and writes to the output directory one summary file per
    result, the optional per-database hit counts and two collated tables
    (`compile_output.txt` and `compile_output_no_nohits.txt`).
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    def list_of_ints(string):
        # turn a comma-separated list of numbers into a 1-D list of integers;
        # a real list is needed because len() is applied to the result
        return [int(value) for value in string.split(",")]

    input_path_interest = opts.input_path_interest
    input_path_other = opts.input_path_other
    percentage_ids = list_of_ints(opts.percentage_ids)
    alignment_lengths = list_of_ints(opts.alignment_lengths)
    hits_to_1st = opts.hits_to_1st
    hits_to_2nd = opts.hits_to_2nd
    percentage_ids_other = opts.percentage_ids_other
    alignment_lengths_other = opts.alignment_lengths_other
    output_dir = opts.output_dir

    # The context manager guarantees both inputs are closed, even when the
    # validation or the parsing below fails.
    # NOTE(review): the "U" mode is kept from the original code; it was
    # removed in Python 3.11 -- confirm the supported interpreter versions
    # before changing it.
    with open(input_path_interest, "U") as db_a, \
            open(input_path_other, "U") as db_b:
        # try to create the output directory, if it exists, just continue
        create_dir(output_dir, False)

        # run some validations on the input parameters
        if percentage_ids_other:
            percentage_ids_other = list_of_ints(percentage_ids_other)
            if len(percentage_ids) != len(percentage_ids_other):
                option_parser.error(
                    "The percentage values for both databases "
                    "should be the same length: %s - %s"
                    % (percentage_ids, percentage_ids_other)
                )
        else:
            percentage_ids_other = percentage_ids

        if alignment_lengths_other:
            alignment_lengths_other = list_of_ints(alignment_lengths_other)

            if len(alignment_lengths) != len(alignment_lengths_other):
                option_parser.error(
                    "The alignment length values for both databases"
                    " should be the length : %s - %s"
                    % (alignment_lengths, alignment_lengths_other)
                )
        else:
            alignment_lengths_other = alignment_lengths

        # Process databases
        total_queries, best_hits = parse_first_database(
            db_a, percentage_ids, alignment_lengths)
        parse_second_database(
            db_b, best_hits, percentage_ids_other, alignment_lengths_other)

    # Parse results
    results = process_results(
        percentage_ids,
        alignment_lengths,
        percentage_ids_other,
        alignment_lengths_other,
        best_hits,
        input_path_interest,
        input_path_other,
    )

    # One row per reported quantity; the first cell is the row label and one
    # cell is appended per result. Building the table before the loop keeps
    # it defined even when `results` is empty.
    combined_results = [
        ["filename"],
        ["interestdb (%s)" % input_path_interest],
        ["other db (%s)" % input_path_other],
        ["only interest"],
        ["both dbs"],
        ["no hits in interest db"],
    ]

    # Collating output and writing full results
    for item in results:
        filename = join(output_dir, "summary_" + item["filename"] + ".txt")
        with open(filename, "w") as fd:
            fd.write("\n".join(item["summary"]))

        # queries that were not counted in any of the four categories
        no_hits = (total_queries - item["db_interest"] - item["db_other"]
                   - item["perfect_interest"] - item["equal"])
        combined_results[0].append(item["filename"])
        combined_results[1].append(str(item["db_interest"]))
        combined_results[2].append(str(item["db_other"]))
        combined_results[3].append(str(item["perfect_interest"]))
        combined_results[4].append(str(item["equal"]))
        combined_results[5].append(str(no_hits))

        # Printing count of hits to the db, highest counts first; entries
        # with a count of zero are skipped
        if hits_to_1st:
            s_hits = sorted(item["db_seqs_counts"]["a"].items(),
                            key=itemgetter(1), reverse=True)

            filename = join(output_dir,
                            "hits_to_1st_db_" + item["filename"] + ".txt")
            with open(filename, "w") as fd:
                fd.write("\n".join(["%s\t%d" % (k, v)
                                    for k, v in s_hits if v != 0]))

        if hits_to_2nd:
            s_hits = sorted(item["db_seqs_counts"]["b"].items(),
                            key=itemgetter(1), reverse=True)

            filename = join(output_dir,
                            "hits_to_2nd_db_" + item["filename"] + ".txt")
            # NOTE(review): this file uses "label: count" while the first
            # database file is tab separated; kept as in the original
            # output format -- confirm whether the difference is intended.
            with open(filename, "w") as fd:
                fd.write("\n".join(["%s: %d" % (k, v)
                                    for k, v in s_hits if v != 0]))

    # Printing collated results
    with open(join(output_dir, "compile_output.txt"), "w") as fd:
        fd.write("\n".join(["\t".join(item) for item in combined_results]))

    # same table without the trailing "no hits" row
    with open(join(output_dir, "compile_output_no_nohits.txt"), "w") as fd:
        fd.write("\n".join(["\t".join(item)
                            for item in combined_results[:-1]]))
Example #8
0
def compare(interest_fp,
            other_fp,
            output_dir='blast-results-compare',
            interest_pcts=None,
            interest_alg_lens=None,
            other_pcts=None,
            other_alg_lens=None,
            hits_to_first=False,
            hits_to_second=False):
    """Compare two databases and write the outputs

    A summary file is written to `output_dir` for every result returned by
    `process_results`, optionally followed by the per-database hit counts.
    Two collated tables (`compile_output.txt` and
    `compile_output_no_nohits.txt`) are written at the end.

    Parameters
    ----------
    interest_fp : str
        BLAST results when searching against the database of interest.
    other_fp : str
        BLAST results when searching against the other database.
    output_dir : str, optional
        Name of the output file path.
    interest_pcts : list, optional
        Minimum percentage identity to be considered as a valid result in the
        interest database search results. If None is passed, it defaults to
        `[70]`.
    interest_alg_lens : list, optional
        Minimum alignment length to be considered a valid result in the
        interest database search results. If None is passed, it defaults to
        `[50]`.
    other_pcts : list, optional
        Minimum percentage identity to be considered as a valid result in the
        other database search results. If None (or an empty list) is passed,
        the values in `interest_pcts` are used.
    other_alg_lens : list, optional
        Minimum alignment length to be considered a valid result in the
        other database search results. If None (or an empty list) is passed,
        the values in `interest_alg_lens` are used.
    hits_to_first : bool, optional defaults to False
        Outputs the labels and counts of the sequences being hit in the first
        database.
    hits_to_second : bool, optional defaults to False
        Outputs the labels and counts of the sequences being hit in the second
        database.

    Raises
    ------
    click.BadParameter
        If the `interest_pcts` and the `other_pcts` lists are of different
        length.
        If the `interest_alg_lens` and the `other_alg_lens` lists are of
        different length.
    """

    if interest_pcts is None:
        interest_pcts = [70]
    if interest_alg_lens is None:
        interest_alg_lens = [50]

    # Opening both inputs in a context manager closes them on every exit
    # path, including a failed validation or a parsing error.
    # NOTE(review): the 'U' mode is kept from the original code; it was
    # removed in Python 3.11 -- confirm the supported interpreter versions
    # before changing it.
    with open(interest_fp, 'U') as db_a, open(other_fp, 'U') as db_b:
        # try to create the output directory, if it exists, just continue
        create_dir(output_dir, False)

        # run some validations on the input parameters
        if other_pcts:
            if len(interest_pcts) != len(other_pcts):
                raise BadParameter("The percentage values for both databases "
                                   "should be the same length: %s - %s" %
                                   (interest_pcts, other_pcts))
        else:
            other_pcts = interest_pcts

        if other_alg_lens:
            if len(interest_alg_lens) != len(other_alg_lens):
                raise BadParameter("The alignment length values for both "
                                   "databases should be the length : %s - %s"
                                   % (interest_alg_lens, other_alg_lens))
        else:
            other_alg_lens = interest_alg_lens

        # process databases
        total_queries, best_hits = parse_first_database(db_a, interest_pcts,
                                                        interest_alg_lens)
        parse_second_database(db_b, best_hits, other_pcts, other_alg_lens)

    # parse results
    results = process_results(interest_pcts, interest_alg_lens, other_pcts,
                              other_alg_lens, best_hits)

    # tiny helper function to save hits files, defined once instead of on
    # every iteration of the loop below
    def save_hits(data, name):
        # highest counts first; entries with a count of zero are skipped
        s_hits = sorted(data, key=itemgetter(1), reverse=True)
        with open(join(output_dir, name), 'w') as fd:
            fd.write('\n'.join(
                ['%s\t%d' % (k, v) for k, v in s_hits if v != 0]))

    # One row per reported quantity; the first cell is the row label and one
    # cell is appended per result. Building the table before the loop keeps
    # it defined even when `results` is empty.
    combined_results = [
        ['filename'],
        ['interest db (%s)' % basename(interest_fp)],
        ['other db (%s)' % basename(other_fp)],
        ['only interest'],
        ['both dbs'],
        ['no hits in interest db'],
    ]

    # Collating output and writing full results
    for item in results:
        filename = join(output_dir, "summary_" + item['filename'] + ".txt")

        with open(filename, 'w') as fd:
            fd.write('\n'.join(item['summary']))

        # queries that were not counted in any of the four categories
        no_hits = total_queries - item['db_interest'] - item['db_other'] - \
            item['perfect_interest'] - item['equal']

        combined_results[0].append(item['filename'])
        combined_results[1].append(str(item['db_interest']))
        combined_results[2].append(str(item['db_other']))
        combined_results[3].append(str(item['perfect_interest']))
        combined_results[4].append(str(item['equal']))
        combined_results[5].append(str(no_hits))

        if hits_to_first:
            save_hits(item['db_seqs_counts']['a'].items(),
                      "hits_to_first_db_%s.txt" % item['filename'])

        if hits_to_second:
            save_hits(item['db_seqs_counts']['b'].items(),
                      "hits_to_second_db_%s.txt" % item['filename'])

    # saving collated results
    with open(join(output_dir, "compile_output.txt"), 'w') as compiled_output:
        compiled_output.write('\n'.join(
            ['\t'.join(item) for item in combined_results]))

    # same table without the trailing "no hits" row
    fn = join(output_dir, "compile_output_no_nohits.txt")
    with open(fn, 'w') as fd:
        fd.write('\n'.join(['\t'.join(item)
                            for item in combined_results[:-1]]))