def test_parse_first_database(self):
    """Parse first db should build best_hits correctly"""
    out_total_queries, out_best_hits = parse_first_database(
        self.db1, [0.80], [50])
    # assertEquals is a deprecated alias (removed in Python 3.12);
    # use assertEqual instead
    self.assertEqual(out_total_queries, 2)
    self.assertEqual(
        out_best_hits,
        {
            "HABJ36W02EXF44": [
                {
                    "a": {
                        "evalue": 0.0,
                        "subject_id": "NZ_ABEH01000018_641736102",
                        "bit_score": 1005.0,
                        "percentage_id": 99.42,
                        "alg_length": 519,
                    },
                    "b": {"subject_id": None, "bit_score": -1},
                }
            ],
            "HABJ36W02DLDSY": [
                {
                    "a": {
                        "evalue": 0.0,
                        "subject_id": "NZ_ABEH01000005_641736102",
                        "bit_score": 959.0,
                        "percentage_id": 99.22,
                        "alg_length": 512,
                    },
                    "b": {"subject_id": None, "bit_score": -1},
                }
            ],
        },
    )
def test_parse_first_database(self):
    """Parse first db should build best_hits correctly"""
    out_total_queries, out_best_hits = parse_first_database(
        self.db1, [.80], [50])
    # assertEquals is a deprecated alias (removed in Python 3.12);
    # use assertEqual instead
    self.assertEqual(out_total_queries, 4)
    self.assertEqual(
        out_best_hits,
        {
            'HABJ36W02EXF44': [
                {
                    'a': {
                        'evalue': 0.0,
                        'subject_id': 'NZ_ABEH01000018_641736102',
                        'bit_score': 1005.0,
                        'percentage_id': 99.42,
                        'alg_length': 519,
                    },
                    'b': {'subject_id': None, 'bit_score': -1},
                }
            ],
            'BLANK-TEST-NOT-IN-SECOND': [
                {
                    'a': {
                        'evalue': 4e-133,
                        'subject_id': 'NZ_ACZD01000120_647000262',
                        'bit_score': 482.0,
                        'percentage_id': 88.79,
                        'alg_length': 455,
                    },
                    'b': {'subject_id': None, 'bit_score': -1},
                }
            ],
            'HABJ36W02DLDSY': [
                {
                    'a': {
                        'evalue': 0.0,
                        'subject_id': 'NZ_ABEH01000005_641736102',
                        'bit_score': 959.0,
                        'percentage_id': 99.22,
                        'alg_length': 512,
                    },
                    'b': {'subject_id': None, 'bit_score': -1},
                }
            ],
        },
    )
def compare(interest_fp, other_fp, output_dir='blast-results-compare',
            interest_pcts=None, interest_alg_lens=None, other_pcts=None,
            other_alg_lens=None, hits_to_first=False, hits_to_second=False):
    """Compare two databases and write the outputs

    Parameters
    ----------
    interest_fp : str
        BLAST results when searching against the database of interest.
    other_fp : str
        BLAST results when searching against the other database.
    output_dir : str, optional
        Name of the output file path.
    interest_pcts : list, optional
        Minimum percentage identity to be considered as a valid result in
        the interest database search results. If None is passed, it
        defaults to `[70]`.
    interest_alg_lens : list, optional
        Minimum alignment length to be considered a valid result in the
        interest database search results. If None is passed, it defaults
        to `[50]`.
    other_pcts : list, optional
        Minimum percentage identity to be considered as a valid result in
        the other database search results. If None is passed, it defaults
        to the value of `interest_pcts`.
    other_alg_lens : list, optional
        Minimum alignment length to be considered a valid result in the
        other database search results. If None is passed, it defaults to
        the value of `interest_alg_lens`.
    hits_to_first : bool, optional defaults to False
        Outputs the labels and counts of the sequences being hit in the
        first database.
    hits_to_second : bool, optional defaults to False
        Outputs the labels and counts of the sequences being hit in the
        second database.

    Raises
    ------
    click.BadParameter
        If the `interest_pcts` and the `other_pcts` lists are of different
        length.
        If the `interest_alg_lens` and the `other_alg_lens` lists are of
        different length.
    """
    if interest_pcts is None:
        interest_pcts = [70]
    if interest_alg_lens is None:
        interest_alg_lens = [50]

    # try to create the output directory, if it exists, just continue
    create_dir(output_dir, False)

    # run some validations on the input parameters
    if other_pcts:
        if len(interest_pcts) != len(other_pcts):
            raise BadParameter("The percentage values for both databases "
                               "should be the same length: %s - %s" %
                               (interest_pcts, other_pcts))
    else:
        other_pcts = interest_pcts
    if other_alg_lens:
        if len(interest_alg_lens) != len(other_alg_lens):
            raise BadParameter("The alignment length values for both "
                               "databases should be the length : %s - %s" %
                               (interest_alg_lens, other_alg_lens))
    else:
        other_alg_lens = interest_alg_lens

    # process databases; the original opened these with mode 'U' (deprecated
    # since Python 3.3, removed in 3.11) and never closed the handles -- the
    # default text mode already provides universal newlines, and the context
    # managers guarantee the files are closed
    with open(interest_fp) as db_a, open(other_fp) as db_b:
        total_queries, best_hits = parse_first_database(
            db_a, interest_pcts, interest_alg_lens)
        parse_second_database(db_b, best_hits, other_pcts, other_alg_lens)

    # parse results
    results = process_results(interest_pcts, interest_alg_lens, other_pcts,
                              other_alg_lens, best_hits)

    # Collating output and writing full results
    for i, item in enumerate(results):
        filename = join(output_dir, "summary_" + item['filename'] + ".txt")
        with open(filename, 'w') as fd:
            fd.write('\n'.join(item['summary']))

        if i == 0:
            # one row per collated statistic; first column is the label
            combined_results = [
                ['filename'],
                ['interest db (%s)' % basename(interest_fp)],
                ['other db (%s)' % basename(other_fp)],
                ['only interest'],
                ['both dbs'],
                ['no hits in interest db'],
            ]

        # queries that matched neither database under these thresholds
        no_hits = (total_queries - item['db_interest'] - item['db_other'] -
                   item['perfect_interest'] - item['equal'])

        combined_results[0].append(item['filename'])
        combined_results[1].append(str(item['db_interest']))
        combined_results[2].append(str(item['db_other']))
        combined_results[3].append(str(item['perfect_interest']))
        combined_results[4].append(str(item['equal']))
        combined_results[5].append(str(no_hits))

        # tiny helper function to save hits files
        def save_hits(data, name):
            s_hits = sorted(data, key=itemgetter(1), reverse=True)
            with open(join(output_dir, name), 'w') as fd:
                fd.write('\n'.join(['%s\t%d' % (k, v)
                                    for k, v in s_hits if v != 0]))

        if hits_to_first:
            save_hits(item['db_seqs_counts']['a'].items(),
                      "hits_to_first_db_%s.txt" % item['filename'])
        if hits_to_second:
            save_hits(item['db_seqs_counts']['b'].items(),
                      "hits_to_second_db_%s.txt" % item['filename'])

    # saving collated results
    with open(join(output_dir, "compile_output.txt"), 'w') as compiled_output:
        compiled_output.write('\n'.join(['\t'.join(item)
                                         for item in combined_results]))

    fn = join(output_dir, "compile_output_no_nohits.txt")
    with open(fn, 'w') as fd:
        fd.write('\n'.join(['\t'.join(item)
                            for item in combined_results[:-1]]))
def main():
    """Script entry point: compare BLAST results against two databases
    and write summary/collated output files to ``opts.output_dir``.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    def list_of_ints(string):
        # turn a comma-separated list of numbers into a 1-D list of integers;
        # a real list (not a lazy map object) is required because the
        # validations below call len() on the result, which would raise
        # TypeError on Python 3's map iterator
        return [int(value) for value in string.split(",")]

    input_path_interest = opts.input_path_interest
    input_path_other = opts.input_path_other
    percentage_ids = list_of_ints(opts.percentage_ids)
    alignment_lengths = list_of_ints(opts.alignment_lengths)
    hits_to_1st = opts.hits_to_1st
    hits_to_2nd = opts.hits_to_2nd
    percentage_ids_other = opts.percentage_ids_other
    alignment_lengths_other = opts.alignment_lengths_other
    output_dir = opts.output_dir

    # try to create the output directory, if it exists, just continue
    create_dir(output_dir, False)

    # run some validations on the input parameters
    if percentage_ids_other:
        percentage_ids_other = list_of_ints(percentage_ids_other)
        if len(percentage_ids) != len(percentage_ids_other):
            option_parser.error(
                "The percentage values for both databases "
                "should be the same length: %s - %s"
                % (percentage_ids, percentage_ids_other)
            )
    else:
        percentage_ids_other = percentage_ids
    if alignment_lengths_other:
        alignment_lengths_other = list_of_ints(alignment_lengths_other)
        if len(alignment_lengths) != len(alignment_lengths_other):
            option_parser.error(
                "The alignment length values for both databases"
                " should be the length : %s - %s"
                % (alignment_lengths, alignment_lengths_other)
            )
    else:
        alignment_lengths_other = alignment_lengths

    # Process databases; mode 'U' was removed in Python 3.11 -- the default
    # text mode already provides universal newlines -- and the context
    # managers close the handles, which the original never did
    with open(input_path_interest) as db_a, open(input_path_other) as db_b:
        total_queries, best_hits = parse_first_database(
            db_a, percentage_ids, alignment_lengths)
        parse_second_database(
            db_b, best_hits, percentage_ids_other, alignment_lengths_other)

    # Parse results
    results = process_results(
        percentage_ids,
        alignment_lengths,
        percentage_ids_other,
        alignment_lengths_other,
        best_hits,
        input_path_interest,
        input_path_other,
    )

    # Collating output and writing full results
    for i, item in enumerate(results):
        filename = join(output_dir, "summary_" + item["filename"] + ".txt")
        with open(filename, "w") as fd:
            fd.write("\n".join(item["summary"]))

        if i == 0:
            # one row per collated statistic; first column is the label
            combined_results = [
                ["filename"],
                ["interestdb (%s)" % input_path_interest],
                ["other db (%s)" % input_path_other],
                ["only interest"],
                ["both dbs"],
                ["no hits in interest db"],
            ]

        # queries that matched neither database under these thresholds
        no_hits = (total_queries - item["db_interest"] - item["db_other"]
                   - item["perfect_interest"] - item["equal"])

        combined_results[0].append(item["filename"])
        combined_results[1].append(str(item["db_interest"]))
        combined_results[2].append(str(item["db_other"]))
        combined_results[3].append(str(item["perfect_interest"]))
        combined_results[4].append(str(item["equal"]))
        combined_results[5].append(str(no_hits))

        # Printing count of hits to the db
        if hits_to_1st:
            s_hits = sorted(item["db_seqs_counts"]["a"].items(),
                            key=itemgetter(1), reverse=True)
            filename = join(output_dir,
                            "hits_to_1st_db_" + item["filename"] + ".txt")
            with open(filename, "w") as fd:
                fd.write("\n".join(["%s\t%d" % (k, v)
                                    for k, v in s_hits if v != 0]))
        if hits_to_2nd:
            s_hits = sorted(item["db_seqs_counts"]["b"].items(),
                            key=itemgetter(1), reverse=True)
            filename = join(output_dir,
                            "hits_to_2nd_db_" + item["filename"] + ".txt")
            with open(filename, "w") as fd:
                # bug fix: this file used "%s: %d" while the 1st-db file
                # is tab-separated; use the tab format consistently so
                # both hit-count files share one parseable layout
                fd.write("\n".join(["%s\t%d" % (k, v)
                                    for k, v in s_hits if v != 0]))

    # Printing collated results
    with open(join(output_dir, "compile_output.txt"), "w") as fd:
        fd.write("\n".join(["\t".join(item) for item in combined_results]))

    with open(join(output_dir, "compile_output_no_nohits.txt"), "w") as fd:
        fd.write("\n".join(["\t".join(item)
                            for item in combined_results[:-1]]))
def compare(interest_fp, other_fp, output_dir='blast-results-compare',
            interest_pcts=None, interest_alg_lens=None, other_pcts=None,
            other_alg_lens=None, hits_to_first=False, hits_to_second=False):
    """Compare two databases and write the outputs

    Parameters
    ----------
    interest_fp : str
        BLAST results when searching against the database of interest.
    other_fp : str
        BLAST results when searching against the other database.
    output_dir : str, optional
        Name of the output file path.
    interest_pcts : list, optional
        Minimum percentage identity to be considered as a valid result in
        the interest database search results. If None is passed, it
        defaults to `[70]`.
    interest_alg_lens : list, optional
        Minimum alignment length to be considered a valid result in the
        interest database search results. If None is passed, it defaults
        to `[50]`.
    other_pcts : list, optional
        Minimum percentage identity to be considered as a valid result in
        the other database search results. If None is passed, it defaults
        to the value of `interest_pcts`.
    other_alg_lens : list, optional
        Minimum alignment length to be considered a valid result in the
        other database search results. If None is passed, it defaults to
        the value of `interest_alg_lens`.
    hits_to_first : bool, optional defaults to False
        Outputs the labels and counts of the sequences being hit in the
        first database.
    hits_to_second : bool, optional defaults to False
        Outputs the labels and counts of the sequences being hit in the
        second database.

    Raises
    ------
    click.BadParameter
        If the `interest_pcts` and the `other_pcts` lists are of different
        length.
        If the `interest_alg_lens` and the `other_alg_lens` lists are of
        different length.
    """
    if interest_pcts is None:
        interest_pcts = [70]
    if interest_alg_lens is None:
        interest_alg_lens = [50]

    # try to create the output directory, if it exists, just continue
    create_dir(output_dir, False)

    # run some validations on the input parameters
    if other_pcts:
        if len(interest_pcts) != len(other_pcts):
            raise BadParameter("The percentage values for both databases "
                               "should be the same length: %s - %s" %
                               (interest_pcts, other_pcts))
    else:
        other_pcts = interest_pcts
    if other_alg_lens:
        if len(interest_alg_lens) != len(other_alg_lens):
            raise BadParameter("The alignment length values for both "
                               "databases should be the length : %s - %s" %
                               (interest_alg_lens, other_alg_lens))
    else:
        other_alg_lens = interest_alg_lens

    # process databases; the original opened these with mode 'U' (deprecated
    # since Python 3.3, removed in 3.11) and never closed the handles -- the
    # default text mode already provides universal newlines, and the context
    # managers guarantee the files are closed
    with open(interest_fp) as db_a, open(other_fp) as db_b:
        total_queries, best_hits = parse_first_database(
            db_a, interest_pcts, interest_alg_lens)
        parse_second_database(db_b, best_hits, other_pcts, other_alg_lens)

    # parse results
    results = process_results(interest_pcts, interest_alg_lens, other_pcts,
                              other_alg_lens, best_hits)

    # Collating output and writing full results
    for i, item in enumerate(results):
        filename = join(output_dir, "summary_" + item['filename'] + ".txt")
        with open(filename, 'w') as fd:
            fd.write('\n'.join(item['summary']))

        if i == 0:
            # one row per collated statistic; first column is the label
            combined_results = [
                ['filename'],
                ['interest db (%s)' % basename(interest_fp)],
                ['other db (%s)' % basename(other_fp)],
                ['only interest'],
                ['both dbs'],
                ['no hits in interest db'],
            ]

        # queries that matched neither database under these thresholds
        no_hits = (total_queries - item['db_interest'] - item['db_other'] -
                   item['perfect_interest'] - item['equal'])

        combined_results[0].append(item['filename'])
        combined_results[1].append(str(item['db_interest']))
        combined_results[2].append(str(item['db_other']))
        combined_results[3].append(str(item['perfect_interest']))
        combined_results[4].append(str(item['equal']))
        combined_results[5].append(str(no_hits))

        # tiny helper function to save hits files
        def save_hits(data, name):
            s_hits = sorted(data, key=itemgetter(1), reverse=True)
            with open(join(output_dir, name), 'w') as fd:
                fd.write('\n'.join(['%s\t%d' % (k, v)
                                    for k, v in s_hits if v != 0]))

        if hits_to_first:
            save_hits(item['db_seqs_counts']['a'].items(),
                      "hits_to_first_db_%s.txt" % item['filename'])
        if hits_to_second:
            save_hits(item['db_seqs_counts']['b'].items(),
                      "hits_to_second_db_%s.txt" % item['filename'])

    # saving collated results
    with open(join(output_dir, "compile_output.txt"), 'w') as compiled_output:
        compiled_output.write('\n'.join(['\t'.join(item)
                                         for item in combined_results]))

    fn = join(output_dir, "compile_output_no_nohits.txt")
    with open(fn, 'w') as fd:
        fd.write('\n'.join(['\t'.join(item)
                            for item in combined_results[:-1]]))