Example #1
def postSeqResult():
    """Search through COBS index located in BacQuerya storage and return search results and metadata to frontend"""
    if not request.json:
        return "not a json post"
    sequence_dict = request.json
    query_sequence = sequence_dict['searchTerm'].replace(" ", "").upper()
    # search for uploaded sequence in COBS index
    sys.stderr.write("\nSearching COBS index\n")
    index_name = os.path.join(gene_dir, "31_index.cobs_compact")
    index = cobs.Search(index_name)
    result = index.search(query_sequence, threshold=0.8)
    # load metadata for identified sequences
    query_length = len(query_sequence)
    kmer_length = int(os.path.basename(index_name).split("_")[0])
    sys.stderr.write("\nExtracting metadata for COBS result\n")
    # return only unique gene names and the highest match proportion for duplicate gene names in search results
    no_duplicates = {}
    for res in tqdm(result):
        match_count = int(res.score)
        geneName = res.doc_name.split("_v")[0]
        match_proportion = round(
            match_count * 100 / ((query_length - kmer_length) + 1), 2)
        if geneName not in no_duplicates or no_duplicates[geneName] < match_proportion:
            no_duplicates[geneName] = match_proportion
    result_metrics = []
    for key, value in no_duplicates.items():
        metrics = {"geneName": key, "numberMatching": value}
        result_metrics.append(metrics)
    sys.stderr.write("\nPosting results to frontend\n")
    response = {"resultMetrics": result_metrics}
    return jsonify(response)
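The proportion reported above follows from how COBS scores a query: the score counts matching k-mers, and a query of length L contains L - k + 1 k-mers of size k. A minimal standalone sketch of the same calculation (the helper name and the example numbers are illustrative, not taken from the code above):

def kmer_match_percent(match_count, query_length, kmer_length):
    # a query of length L contains (L - k + 1) k-mers of size k
    total_kmers = (query_length - kmer_length) + 1
    return round(match_count * 100 / total_kmers, 2)

# e.g. a 100 bp query over 31-mers contains 70 k-mers, so 56 matching k-mers is 80%
print(kmer_match_percent(56, 100, 31))  # 80.0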
Example #2
    def test_compact_construct_query(self):
        index_file = datadir + "/python_test.cobs_compact"

        # construct compact index
        p = cobs.CompactIndexParameters()
        p.clobber = True
        cobs.compact_construct(input=datadir + "/fasta",
                               out_file=index_file,
                               index_params=p)
        self.assertTrue(os.path.isfile(index_file))

        # run queries
        s = cobs.Search(index_file)
        r = s.search("AGTCAACGCTAAGGCATTTCCCCCCTGCCTCCTGCCTGCTGCCAAGCCCT")
        #print(r)
        self.assertEqual(len(r), 7)
        self.assertEqual(r[0].doc_name, "sample1")
        self.assertEqual(r[0].score, 20)
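The query above is 50 bp long and the index evidently uses 31-mers (COBS' default term size), since the top score is 20 = 50 - 31 + 1, i.e. every query k-mer was found in sample1. A minimal sketch of filtering hits with the threshold keyword, reusing the index and the result fields exercised by the test above (the 0.9 cutoff is an arbitrary illustrative value):

s = cobs.Search(index_file)
# keep only documents sharing at least 90% of the query's k-mers
hits = s.search("AGTCAACGCTAAGGCATTTCCCCCCTGCCTCCTGCCTGCTGCCAAGCCCT", threshold=0.9)
for hit in hits:
    print(hit.doc_name, hit.score)  # document name and number of matching k-mers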
Example #3
def search_index(kmer, sequence, name):
    index = cobs.Search("indexes_all/" + str(kmer) + "_index.cobs_compact")
    # query the same sequence at thresholds 0.0, 0.1, ..., 1.0
    results = [index.search(sequence, threshold=t / 10) for t in range(11)]
    # [0][1] is the document name of the first hit returned at threshold 0.0;
    # it decides whether the expected document was found
    if name == results[0][0][1].split(";")[0]:
        found = 1
    else:
        found = 0
    return (found, results)
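A hedged usage sketch for the helper above, assuming indexes_all/ holds a 31-mer compact index and that its document names have the form "<name>;..." as the split(";") implies (the k-mer size, query, and expected name are illustrative):

found, per_threshold = search_index(
    31, "AGTCAACGCTAAGGCATTTCCCCCCTGCCTCCTGCCTGCTGCCAAGCCCT", "sample1")
print(found)               # 1 if the first threshold-0.0 hit is named "sample1"
print(len(per_threshold))  # 11 result lists, one per threshold 0.0, 0.1, ..., 1.0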
Example #4
                if len(result) != 0:
                    result = (str(result[0]).split("'")[1]).split(";")[0]
                else:
                    result = "[]"
            #else:
            # result = ""
            if result == "[]":
                result = "NULL"
        except:
            result = "ERROR"
    else:
        result = "LENGTH/ISOLATE"
    return result


index = cobs.Search("INDEX/10_index_index.cobs_compact")

all_annotations = pd.read_csv("reference_sparc_merged/all_annotations.csv")

tqdm.pandas()

#all_annotations["COBS 75%"] = all_annotations.progress_apply(lambda row: identify(row["strand"], row["sequence"], index, 0.75, row["Isolate"]), axis = 1)
#all_annotations["COBS 80%"] = all_annotations.progress_apply(lambda row: identify(row["strand"], row["sequence"], index, 0.8, row["Isolate"]), axis = 1)
all_annotations["COBS 85%"] = all_annotations.progress_apply(
    lambda row: identify(row["strand"], row["sequence"], index, 0.85,
                         row["Isolate"]),
    axis=1)
#all_annotations["COBS 90%"] = all_annotations.progress_apply(lambda row: identify(row["strand"], row["sequence"], index, 0.9, row["Isolate"]), axis = 1)
#all_annotations["COBS 95%"] = all_annotations.progress_apply(lambda row: identify(row["strand"], row["sequence"], index, 0.95, row["Isolate"]), axis = 1)

all_annotations.to_csv(
Example #5
def search_index(kmer, sequence, name, file_dir):
    index = cobs.Search("ini_cluster_indexes/" + str(kmer) +
                        "_index.cobs_compact")
    # query the same sequence at thresholds 0.00, 0.05, ..., 1.00
    thresholds = [t / 20 for t in range(21)]
    raw_results = [index.search(sequence, threshold=t) for t in thresholds]
    result_zero = raw_results[0]

    # the first hit at threshold 0.0 decides whether the expected document was found
    if name == result_zero[0][1].split(";")[0]:
        found = 1
    else:
        found = 0

    # post-process every threshold's result (each call also receives the raw
    # threshold-0.0 result); the threshold-0.0 entry is handled last, after all
    # the others, so that every call sees the unprocessed threshold-0.0 hits
    characteristics = [None] * len(raw_results)
    for i in list(range(1, len(raw_results))) + [0]:
        characteristics[i] = get_characteristic(name, found, result_zero,
                                                raw_results[i], sequence,
                                                file_dir)

    return (found, characteristics)