def postSeqResult():
    """Search the COBS index in BacQuerya storage and return search results and metadata to the frontend."""
    # Guard clause: reject non-JSON posts up front (the original double-checked
    # request.json with two consecutive ifs).
    if not request.json:
        return "not a json post"
    sequence_dict = request.json
    # Normalise the query sequence: strip spaces, uppercase nucleotides.
    query_sequence = sequence_dict['searchTerm'].replace(" ", "").upper()
    # search for uploaded sequence in COBS index
    sys.stderr.write("\nSearching COBS index\n")
    index_name = os.path.join(gene_dir, "31_index.cobs_compact")
    index = cobs.Search(index_name)
    result = index.search(query_sequence, threshold=0.8)
    # load metadata for identified sequences
    query_length = len(query_sequence)
    # k-mer length is encoded as the leading component of the index filename
    # (e.g. "31_index.cobs_compact" -> 31).
    kmer_length = int(os.path.basename(index_name).split("_")[0])
    sys.stderr.write("\nExtracting metadata for COBS result\n")
    # Keep only unique gene names, retaining the highest match proportion when
    # the same gene appears multiple times (doc names look like "<gene>_v<N>").
    no_duplicates = {}
    for res in tqdm(result):
        match_count = int(res.score)
        geneName = res.doc_name.split("_v")[0]
        # Percentage of the query's k-mers that matched this document.
        match_proportion = round(
            match_count * 100 / ((query_length - kmer_length) + 1), 2)
        if geneName not in no_duplicates or no_duplicates[geneName] < match_proportion:
            no_duplicates[geneName] = match_proportion
    result_metrics = [
        {"geneName": key, "numberMatching": value}
        for key, value in no_duplicates.items()
    ]
    sys.stderr.write("\nPosting results to frontend\n")
    response = {"resultMetrics": result_metrics}
    return jsonify(response)
def test_compact_construct_query(self):
    """Construct a compact index over the FASTA test data, then verify a known query."""
    index_path = datadir + "/python_test.cobs_compact"
    # Build the compact index; clobber overwrites any stale file from a prior run.
    params = cobs.CompactIndexParameters()
    params.clobber = True
    cobs.compact_construct(input=datadir + "/fasta",
                           out_file=index_path,
                           index_params=params)
    self.assertTrue(os.path.isfile(index_path))
    # Query the freshly built index and check the expected hit list.
    searcher = cobs.Search(index_path)
    hits = searcher.search("AGTCAACGCTAAGGCATTTCCCCCCTGCCTCCTGCCTGCTGCCAAGCCCT")
    self.assertEqual(len(hits), 7)
    self.assertEqual(hits[0].doc_name, "sample1")
    self.assertEqual(hits[0].score, 20)
def search_index(kmer, sequence, name):
    """Search a compact COBS index at every threshold from 0.0 to 1.0 in 0.1 steps.

    Parameters:
        kmer: k-mer length; selects "indexes_all/<kmer>_index.cobs_compact".
        sequence: query sequence passed straight to COBS.
        name: expected document name, compared against the top hit of the
            unfiltered (threshold 0.0) search after splitting on ';'.

    Returns:
        (found, results) where found is 1 if the top threshold-0.0 hit matches
        `name`, else 0, and results is the list of raw search results in
        ascending threshold order.
    """
    index = cobs.Search("indexes_all/" + str(kmer) + "_index.cobs_compact")
    # i / 10 reproduces the exact float literals 0.0, 0.1, ..., 1.0 the
    # original spelled out in eleven separate statements.
    results = [index.search(sequence, threshold=i / 10) for i in range(11)]
    # The threshold-0.0 search is unfiltered; its best hit decides whether the
    # expected document was recovered at all.
    top_doc = results[0][0][1]
    found = 1 if name == top_doc.split(";")[0] else 0
    return (found, results)
if not len(result) == 0: result = (str(result[0]).split("'")[1]).split(";")[0] else: result = "[]" #else: # result = "" if result == "[]": result = "NULL" except: result = "ERROR" else: result = "LENGTH/ISOLATE" return result index = cobs.Search("INDEX/10_index_index.cobs_compact") all_annotations = pd.read_csv("reference_sparc_merged/all_annotations.csv") tqdm.pandas() #all_annotations["COBS 75%"] = all_annotations.progress_apply(lambda row: identify(row["strand"], row["sequence"], index, 0.75, row["Isolate"]), axis = 1) #all_annotations["COBS 80%"] = all_annotations.progress_apply(lambda row: identify(row["strand"], row["sequence"], index, 0.8, row["Isolate"]), axis = 1) all_annotations["COBS 85%"] = all_annotations.progress_apply( lambda row: identify(row["strand"], row["sequence"], index, 0.85, row[ "Isolate"]), axis=1) #all_annotations["COBS 90%"] = all_annotations.progress_apply(lambda row: identify(row["strand"], row["sequence"], index, 0.9, row["Isolate"]), axis = 1) #all_annotations["COBS 95%"] = all_annotations.progress_apply(lambda row: identify(row["strand"], row["sequence"], index, 0.95, row["Isolate"]), axis = 1) all_annotations.to_csv(
def search_index(kmer, sequence, name, file_dir):
    """Search a compact COBS index at thresholds 0.0 through 1.0 in 0.05 steps.

    Parameters:
        kmer: k-mer length; selects "ini_cluster_indexes/<kmer>_index.cobs_compact".
        sequence: query sequence passed straight to COBS.
        name: expected document name, compared against the top hit of the
            unfiltered (threshold 0.0) search after splitting on ';'.
        file_dir: forwarded to get_characteristic.

    Returns:
        (found, processed) where found is 1 if the top threshold-0.0 hit
        matches `name`, else 0, and processed is the list of
        get_characteristic outputs in ascending threshold order.
    """
    index = cobs.Search("ini_cluster_indexes/" + str(kmer) + "_index.cobs_compact")
    # i / 20 reproduces the exact float literals 0.0, 0.05, ..., 1.0 the
    # original spelled out in twenty-one separate statements.
    raw = [index.search(sequence, threshold=i / 20) for i in range(21)]
    # The threshold-0.0 result is the unfiltered baseline used both for the
    # found flag and as the reference argument to every get_characteristic call.
    baseline = raw[0]
    found = 1 if name == baseline[0][1].split(";")[0] else 0
    # The original processed the baseline entry LAST; preserve that call order
    # in case get_characteristic has side effects, while keeping the returned
    # list in ascending threshold order.
    processed = [None] * 21
    for i in range(1, 21):
        processed[i] = get_characteristic(name, found, baseline, raw[i],
                                          sequence, file_dir)
    processed[0] = get_characteristic(name, found, baseline, baseline,
                                      sequence, file_dir)
    return (found, processed)