Ejemplo n.º 1
0
def byte_read_array_numpy(filename: str):
    array_array = read_byte_array(filename)
    numpy_array = read_byte_numpy(filename)

    assert type(numpy_array) is np.ndarray
    assert type(array_array) is array.ArrayType
    assert list(numpy_array) == list(array_array)
Ejemplo n.º 2
0
def temp_forward_unique_check():

    geno = read_unambiguous(filename=PATH + FORWARD)
    # comment()
    s_arr = read_byte_numpy(append_file_name('data/22.sa'))

    inv_suff, lcp = kasai(geno, s_arr)

    myd = {}
    for num in range(len(s_arr)):
        myd[s_arr[num]] = lcp[num]

    #trues0 = list(get_uniques(lcp))
    json_it(trues0, "c22_forward_uniques")
Ejemplo n.º 3
0
def _part_0(args=None, print=print):
    try:
        start = time.time()
        # _____________________________________________
        print('\n_____________________________________')
        print('PART 0: READ ARGS AND GENOME/SA FILES')
        print('_____________________________________\n')
        # _____________________________________________

        past = start
        print('reading SA...\n')

        # read suffix array from bytes to ints
        # reading with numpy then converting to 1-D array much slower than array.array
        # however, array cannot read files larger than ~3GB

        s_array = read_byte_numpy(filename=args.SA)
        print('SA read.\n')
        past = get_time(past, print=print)
        print('reading genome...\n')

        # read with Reads instead
        # ! genome has ambs
        #genome = reads(filename=args.genome)
        chrs, genome = chr_splits(filename=args.genome)

        json_it(data=chrs,
                filename=append_file_name(args.outfile + "json_chrs"))

        print('genome read.\n')
        past = get_time(past, print=print)

        # TODO: change below line as necessary
        # args.LCPfile = '../data/lcp_pickle'

        return genome, past, s_array, start

    except Exception as e:
        raise
Ejemplo n.º 4
0
def driver():

    geno = read_unambiguous(TWO_WAY_GENO)
    s_arr = read_byte_numpy(S_ARRAY)
Ejemplo n.º 5
0
def sesame_plant():
    s_arr = read_byte_numpy(append_file_name('data/22.sa'))
    geno = read_unambiguous(append_file_name('data/22.fa'))

    se = SAGuide(s_arr=s_arr, geno=geno)
    print(se)
Ejemplo n.º 6
0
def naive_lcp_22():
    s_arr = fm.read_byte_numpy(filename=fm.append_file_name('data/22.sa'))
    lcp = test_kasai.naive_lcp(s_array=s_arr,T=simple_genome())
    fm.json_it(data=lcp,filename=fm.append_file_name('output/naive_lcp_22'))
Ejemplo n.º 7
0
def mu_driver():
    """
        similar function as driver.py, except include minimal uniques instead of finding 20-100 uniquemers
    :return:
    """

    try:

        # gitignore()
        print('reading original genome: ', end='')
        chrs, geno = chr_splits(filename=PATH + ORIGINAL)
        json_it(chrs, append_file_name("json_chrs"))
        del chrs
        print('done.\nreading original SA...: ', end='')
        s_arr = read_byte_numpy(append_file_name('data/genome.sa'))

        lcp1 = kasai(geno, s_arr)[1]
        d1 = OrderedDict(mu(SA=s_arr, LCP=lcp1))
        del lcp1
        del s_arr

        au = _part_2(genome_file_name=PATH + ORIGINAL)
        print("au list: ", list(au))

        # *************************
        # (2) flipped
        # *************************

        print("performing flips: ")
        geno2 = read_unambiguous(PATH + FLIPPED)

        s_arr2 = read_byte_numpy(append_file_name('data/flippedGeno.sa'))

        lcp2 = kasai(geno2, s_arr2)[1]
        del geno2

        mu_result = dict(compare(d=d1, SA=s_arr2, LCP=lcp2))
        del lcp2
        mu_result = OrderedDict(sort_mu(mu_result))

        mu_result = OrderedDict(true_address_dict(mu_result, au))

        json_it(mu_result, append_file_name(files['MU_RESULT']))

        #contigs = list(find_contigs(d=old_mu_result_without_true_addresses, bot=20, top=100))
        contigs = OrderedDict(
            find_perfect_contigs(d=mu_result, bot=20, top=100))

        json_it(contigs, append_file_name(files['PERFECT_CONTIGS']))

        contigs = list(within_distance(d=contigs, distance=300))

        json_it(contigs,
                append_file_name(files['PERFECT_CONTIGS_WITH_DISTANCE']))

        print("number of contigs: ", len(contigs))

        print("done")

    except Exception as e:
        raise
Ejemplo n.º 8
0
def efficient_mu_driver():
    """
        NOTES:
            07/05: You MUST run get_uniques first before sorting the lcp

    :return:
    """

    try:
        # comment()
        geno = reads(filename=PATH + FORWARD)
        geno_length = len(geno)
        # comment()
        s_arr = read_byte_numpy(append_file_name('data/22.sa'))

        inv_suff, lcp = kasai(geno, s_arr)
        lcp = kasai(geno, s_arr)[1]
        del geno, s_arr

        # comment()
        au = _part_2(genome_file_name=PATH + FORWARD)

        lcp = list(get_uniques(lcp))

        trues0 = list(sort_lcp(lcp=lcp, inv_suff=inv_suff))

        del lcp

        bad_address = forbiddens(inv_suff=inv_suff, lcp=trues0, au=au)
        del inv_suff

        geno = read_unambiguous(filename=PATH + FLIPPED)
        s_arr = read_byte_numpy(append_file_name('data/f22.sa'))

        inv_2, lcp = kasai(geno, s_arr)
        lcp = kasai(geno, s_arr)[1]
        del geno, s_arr

        lcp = list(get_uniques(lcp))

        trues1 = list(sort_lcp(lcp=lcp, inv_suff=inv_2))
        del lcp, inv_2

        # mu_s, mu_e = list(compare(inv0=inv_suff, trues0=trues0, inv1=inv_2, trues1=trues, bad_address=bad_address))
        mu_s = []
        mu_e = []

        au_dict = {}
        for item in list(au):
            au_dict[item[0]] = item[1]

        del au

        u_ceil = list(au_dict)[0]
        u_floor = 0
        a_offset = au_dict[u_ceil]

        # mu_s, mu_e = list(compare(trues0=trues0, trues1=trues1, bad_address=bad_address))
        for tup in compare_no_inv_suff(trues0=trues0,
                                       trues1=trues1,
                                       bad_address=bad_address,
                                       geno_length=geno_length):
            sa = tup[0]

            if sa < u_floor:
                raise Exception(
                    "SA is less than u_floor. Possible that s_arr not sorted correctly?"
                )

            if sa > u_ceil and len(au_dict) > 1:
                u_floor = u_ceil
                del au_dict[u_ceil]
                u_ceil = list(au_dict)[0]
                a_offset = au_dict[u_ceil]

            elif len(au_dict) < 1:
                print("not au_dict reached")
                break

            # mu_s.append(tup[0])
            mu_s.append(sa + a_offset)
            mu_e.append(tup[1])

        # TODO: 07/05 made the line below return a dict as well as accept geno
        #   to return to before, do not input geno and output two lists
        # myd = dict(compare(trues0 = trues0, trues1 = trues1, bad_address=bad_address))
        # json_it(data=myd, filename="c22_mu")
        assert len(mu_s) == len(mu_e)
        just_dump(myl=mu_s, fn="c22_mu_starts_0709", print_length=True)
        just_dump(myl=mu_e, fn="c22_mu_ends_0709", print_length=True)
        # 07/08: changed get_uniques so that it doesn't yield past or lcp + 1

        # json_it(mu_s, "efficient_mu_starts")
        # json_it(mu_e, "efficient_mu_ends")

        # stitched = list(stitch(starts=mu_s, uniques=mu_e))
        # json_it(stitched, "stitched")

        # print("Number of stitched: " + str(len(stitched)))

        # print("Number of MU: " + str(len(mu_s)))
        # findmean(mys = mu_s, mye=mu_e)

    except IndexError:
        pass
    except Exception:
        print(traceback.format_exc())
        breakpoint()