Example #1
def test_validity():
    # assumes that the mu dictionary has already been dumped to JSON
    # (fm appears to be the project's file manager module; tqdm provides the progress bar)
    mu = fm.unjson_it("../src/c22_mu")
    geno = fm.read_unambiguous("../data/22.fa")

    myd = {}

    for key in tqdm(mu, desc="checking uniqueness"):
        # mu maps a start index (stored as a string key) to an inclusive end index
        seq = geno[int(key):mu[key] + 1]
        assert seq not in myd
        myd[seq] = 1
        if key == "700":
            # apparently a spot check that the loop really runs: raises KeyError
            # at key "700" unless "ACGT" happens to be a stored sequence
            myd["ACGT"]
Example #2
def temp_forward_unique_check():
    geno = read_unambiguous(filename=PATH + FORWARD)
    s_arr = read_byte_numpy(append_file_name('data/22.sa'))

    inv_suff, lcp = kasai(geno, s_arr)

    # map each suffix start position to its LCP value (built but not used below)
    myd = {}
    for num in range(len(s_arr)):
        myd[s_arr[num]] = lcp[num]

    trues0 = list(get_uniques(lcp))
    json_it(trues0, "c22_forward_uniques")
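
kasai() itself is not shown in these examples. If it follows the standard Kasai et al. construction, it derives the LCP array from the text and its suffix array in linear time and returns the inverse suffix array alongside it, matching the (inv_suff, lcp) unpacking above. A self-contained sketch of that algorithm (a hedged reimplementation, not necessarily the project's exact code):

def kasai_sketch(text, s_arr):
    """Kasai's linear-time LCP construction.

    Returns (inv_suff, lcp), where lcp[i] is the length of the longest
    common prefix between the suffixes at s_arr[i - 1] and s_arr[i].
    """
    n = len(s_arr)
    inv_suff = [0] * n
    for i in range(n):
        inv_suff[s_arr[i]] = i

    lcp = [0] * n
    k = 0
    for i in range(n):                      # walk suffixes in text order
        if inv_suff[i] == 0:
            k = 0
            continue
        j = s_arr[inv_suff[i] - 1]          # suffix just before suffix i in sorted order
        while i + k < n and j + k < n and text[i + k] == text[j + k]:
            k += 1
        lcp[inv_suff[i]] = k
        if k > 0:
            k -= 1                          # the LCP can drop by at most one per step
    return inv_suff, lcp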
Example #3
def SA_file(filename: str):
    print('Reading file: ')
    sequence = read_unambiguous(filename)
    print('File read!\nCreating Suffix Array Naively:')

    s_array, _ = naive_SA(sequence)

    # store the array length as the first entry (presumably so the reader
    # knows how many values to expect)
    length = len(s_array)
    s_array.insert(0, length)

    print('Suffix Array Created!\nWriting to file: ')

    write_array_to_byte(filename='fake_genome_sa', byte_arr=s_array)
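
naive_SA() is also not shown. A naive suffix-array construction simply sorts every suffix start position by the suffix it begins, which is fine for the small "fake_genome" test input used here but far too slow and memory-hungry for a real chromosome. A minimal sketch with the same two-value return shape (the second value is discarded above, so its contents are a guess):

def naive_SA_sketch(sequence):
    """Naive suffix array: sort start positions by their suffixes.

    The second return value (the sorted suffixes themselves) is a guess;
    the real naive_SA may return something different there.
    """
    sa = sorted(range(len(sequence)), key=lambda i: sequence[i:])
    return sa, [sequence[i:] for i in sa]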
Example #4
def genome_reads_test(filename):
    # testing reads() from file manager
    filename = append_file_name(filename)
    past = time.time()
    read_bioseq = read_unambiguous(filename=filename)
    current = time.time()
    print('genome read with Bio.Seq. Time elapsed: ', current - past)
    past = current

    read_reads = reads(filename=filename)
    current = time.time()
    print('genome read with Reads.py. Time elapsed: ', current - past)

    assert type(read_bioseq) is Bio.Seq.Seq
    assert type(read_reads) is str
    assert read_reads == str(read_bioseq)
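
read_unambiguous() and reads() are project helpers whose definitions are not included here; the asserts only pin down their contract, namely that one returns a Bio.Seq.Seq and the other a plain str with identical content. A rough sketch of two readers satisfying that contract, assuming a single-record FASTA file and ignoring whatever ambiguity filtering the real read_unambiguous performs:

from Bio import SeqIO


def read_unambiguous_sketch(filename):
    # Biopython-based reader: returns a Bio.Seq.Seq object
    return SeqIO.read(filename, "fasta").seq


def reads_sketch(filename):
    # plain-Python reader: concatenate every non-header line into one string
    with open(filename) as handle:
        return "".join(line.strip() for line in handle if not line.startswith(">"))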
Example #5
def driver():
    geno = read_unambiguous(TWO_WAY_GENO)
    s_arr = read_byte_numpy(S_ARRAY)
Example #6
def sesame_plant():
    s_arr = read_byte_numpy(append_file_name('data/22.sa'))
    geno = read_unambiguous(append_file_name('data/22.fa'))

    se = SAGuide(s_arr=s_arr, geno=geno)
    print(se)
Example #7
def mu_driver():
    """
        similar function as driver.py, except include minimal uniques instead of finding 20-100 uniquemers
    :return:
    """

    try:

        # gitignore()
        print('reading original genome: ', end='')
        chrs, geno = chr_splits(filename=PATH + ORIGINAL)
        json_it(chrs, append_file_name("json_chrs"))
        del chrs
        print('done.\nreading original SA...: ', end='')
        s_arr = read_byte_numpy(append_file_name('data/genome.sa'))

        lcp1 = kasai(geno, s_arr)[1]
        d1 = OrderedDict(mu(SA=s_arr, LCP=lcp1))
        del lcp1
        del s_arr

        # materialize au up front so the debug print does not exhaust a generator
        au = list(_part_2(genome_file_name=PATH + ORIGINAL))
        print("au list: ", au)

        # *************************
        # (2) flipped
        # *************************

        print("performing flips: ")
        geno2 = read_unambiguous(PATH + FLIPPED)

        s_arr2 = read_byte_numpy(append_file_name('data/flippedGeno.sa'))

        lcp2 = kasai(geno2, s_arr2)[1]
        del geno2

        mu_result = dict(compare(d=d1, SA=s_arr2, LCP=lcp2))
        del lcp2
        mu_result = OrderedDict(sort_mu(mu_result))

        mu_result = OrderedDict(true_address_dict(mu_result, au))

        json_it(mu_result, append_file_name(files['MU_RESULT']))

        #contigs = list(find_contigs(d=old_mu_result_without_true_addresses, bot=20, top=100))
        contigs = OrderedDict(
            find_perfect_contigs(d=mu_result, bot=20, top=100))

        json_it(contigs, append_file_name(files['PERFECT_CONTIGS']))

        contigs = list(within_distance(d=contigs, distance=300))

        json_it(contigs,
                append_file_name(files['PERFECT_CONTIGS_WITH_DISTANCE']))

        print("number of contigs: ", len(contigs))

        print("done")

    except Exception:
        # nothing to handle here; re-raise unchanged
        raise
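
The mu() call above (and the start/end dictionaries written out by these drivers) rests on a standard suffix-array fact worth spelling out: the shortest substring occurring exactly once in the text and starting at position SA[i] has length max(LCP[i], LCP[i + 1]) + 1, because anything shorter is shared with a neighbouring suffix in sorted order. A hedged sketch of that computation, yielding (start, inclusive end) pairs like the dictionaries used elsewhere in these examples; the real mu() may use a different LCP convention or extra filtering:

def mu_sketch(SA, LCP):
    """Yield (start, end) for the shortest unique substring starting at each
    text position, assuming LCP[i] is the LCP between SA[i - 1] and SA[i]."""
    n = len(SA)
    for i in range(n):
        left = LCP[i] if i > 0 else 0
        right = LCP[i + 1] if i + 1 < n else 0
        length = max(left, right) + 1
        start = SA[i]
        if start + length <= n:       # skip starts whose unique extension runs off the text
            yield start, start + length - 1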
Example #8
def efficient_mu_driver():
    """
        NOTES:
            07/05: You MUST run get_uniques first before sorting the lcp

    :return:
    """

    try:
        geno = reads(filename=PATH + FORWARD)
        geno_length = len(geno)
        s_arr = read_byte_numpy(append_file_name('data/22.sa'))

        # forward strand: inverse suffix array and LCP array in one pass
        inv_suff, lcp = kasai(geno, s_arr)
        del geno, s_arr

        au = _part_2(genome_file_name=PATH + FORWARD)

        lcp = list(get_uniques(lcp))

        trues0 = list(sort_lcp(lcp=lcp, inv_suff=inv_suff))

        del lcp

        bad_address = forbiddens(inv_suff=inv_suff, lcp=trues0, au=au)
        del inv_suff

        geno = read_unambiguous(filename=PATH + FLIPPED)
        s_arr = read_byte_numpy(append_file_name('data/f22.sa'))

        # flipped strand: same construction as for the forward strand
        inv_2, lcp = kasai(geno, s_arr)
        del geno, s_arr

        lcp = list(get_uniques(lcp))

        trues1 = list(sort_lcp(lcp=lcp, inv_suff=inv_2))
        del lcp, inv_2

        # mu_s, mu_e = list(compare(inv0=inv_suff, trues0=trues0, inv1=inv_2, trues1=trues, bad_address=bad_address))
        mu_s = []
        mu_e = []

        au_dict = {}
        for item in list(au):
            au_dict[item[0]] = item[1]

        del au

        u_ceil = list(au_dict)[0]
        u_floor = 0
        a_offset = au_dict[u_ceil]

        # mu_s, mu_e = list(compare(trues0=trues0, trues1=trues1, bad_address=bad_address))
        for tup in compare_no_inv_suff(trues0=trues0,
                                       trues1=trues1,
                                       bad_address=bad_address,
                                       geno_length=geno_length):
            sa = tup[0]

            if sa < u_floor:
                raise Exception(
                    "SA is less than u_floor; is s_arr sorted correctly?")

            if sa > u_ceil and len(au_dict) > 1:
                u_floor = u_ceil
                del au_dict[u_ceil]
                u_ceil = list(au_dict)[0]
                a_offset = au_dict[u_ceil]

            elif len(au_dict) < 1:
                print("au_dict exhausted")
                break

            # mu_s.append(tup[0])
            mu_s.append(sa + a_offset)
            mu_e.append(tup[1])

        # TODO: 07/05 changed the call below to return a dict and also accept geno;
        #   to revert, do not pass geno and output two lists instead
        # myd = dict(compare(trues0 = trues0, trues1 = trues1, bad_address=bad_address))
        # json_it(data=myd, filename="c22_mu")
        assert len(mu_s) == len(mu_e)
        just_dump(myl=mu_s, fn="c22_mu_starts_0709", print_length=True)
        just_dump(myl=mu_e, fn="c22_mu_ends_0709", print_length=True)
        # 07/08: changed get_uniques so that it doesn't yield past or lcp + 1

        # json_it(mu_s, "efficient_mu_starts")
        # json_it(mu_e, "efficient_mu_ends")

        # stitched = list(stitch(starts=mu_s, uniques=mu_e))
        # json_it(stitched, "stitched")

        # print("Number of stitched: " + str(len(stitched)))

        # print("Number of MU: " + str(len(mu_s)))
        # findmean(mys = mu_s, mye=mu_e)

    except IndexError:
        # IndexError is swallowed silently here (intent unclear in the original)
        pass
    except Exception:
        print(traceback.format_exc())
        breakpoint()  # drop into the debugger on any unexpected failure