Example #1
0
def temp_forward_unique_check():
    """Compute the unique start lengths for chromosome 22 (forward strand)
    and dump them to JSON as "c22_forward_uniques".

    Reads the unambiguous forward genome and its suffix array, runs
    Kasai's algorithm to obtain the LCP array, then extracts the unique
    lengths from the LCP array and writes them out.
    """
    geno = read_unambiguous(filename=PATH + FORWARD)
    s_arr = read_byte_numpy(append_file_name('data/22.sa'))

    inv_suff, lcp = kasai(geno, s_arr)

    # map each suffix-array entry to its LCP value
    # NOTE(review): myd is built but never used below — presumably left
    # over from debugging; kept for now, candidate for deletion.
    myd = {}
    for num in range(len(s_arr)):
        myd[s_arr[num]] = lcp[num]

    # BUG FIX: trues0 was referenced but its definition was commented
    # out, so this function raised NameError. Restore the computation.
    trues0 = list(get_uniques(lcp))
    json_it(trues0, "c22_forward_uniques")
Example #2
0
def _part_0(args=None, print=print):
    """PART 0: read the genome and suffix-array files named in *args*.

    :param args: namespace with .SA, .genome and .outfile attributes
    :param print: print-like callable (injectable so callers can
        redirect or silence progress output)
    :return: tuple (genome, past, s_array, start) where `start` and
        `past` are time.time() checkpoints used for progress reporting
    """
    # NOTE(review): the original wrapped this body in
    # `try: ... except Exception as e: raise`, a no-op that only
    # obscured tracebacks; removed.
    start = time.time()
    # _____________________________________________
    print('\n_____________________________________')
    print('PART 0: READ ARGS AND GENOME/SA FILES')
    print('_____________________________________\n')
    # _____________________________________________

    past = start
    print('reading SA...\n')

    # read suffix array from bytes to ints
    # reading with numpy then converting to 1-D array much slower than array.array
    # however, array cannot read files larger than ~3GB
    s_array = read_byte_numpy(filename=args.SA)
    print('SA read.\n')
    past = get_time(past, print=print)
    print('reading genome...\n')

    # genome contains ambiguous bases, so split it per chromosome
    # rather than reading it as a single unambiguous string
    chrs, genome = chr_splits(filename=args.genome)

    json_it(data=chrs,
            filename=append_file_name(args.outfile + "json_chrs"))

    print('genome read.\n')
    past = get_time(past, print=print)

    return genome, past, s_array, start
Example #3
0
def naive_lcp_22():
    """Build chromosome 22's LCP array with the naive algorithm and
    write it to 'output/naive_lcp_22' as JSON."""
    sa_path = fm.append_file_name('data/22.sa')
    suffix_array = fm.read_byte_numpy(filename=sa_path)
    lcp_array = test_kasai.naive_lcp(s_array=suffix_array,
                                     T=simple_genome())
    out_path = fm.append_file_name('output/naive_lcp_22')
    fm.json_it(data=lcp_array, filename=out_path)
Example #4
0
def mu_driver():
    """
        similar function as driver.py, except include minimal uniques instead of finding 20-100 uniquemers

        Pipeline:
          (1) original genome: read genome + SA, run Kasai, build the
              minimal-unique (MU) dict
          (2) flipped genome:  read flipped genome + SA, run Kasai,
              compare against the original MU dict
          then sort, translate to true addresses, extract perfect
          contigs (length 20-100), filter by distance (300), and json
          each stage's result.
    :return: None (results are written to JSON files)
    """
    # NOTE(review): the original wrapped this body in
    # `try: ... except Exception as e: raise`, a no-op; removed.

    # *************************
    # (1) original genome
    # *************************
    print('reading original genome: ', end='')
    chrs, geno = chr_splits(filename=PATH + ORIGINAL)
    json_it(chrs, append_file_name("json_chrs"))
    del chrs  # free memory before the large arrays below
    print('done.\nreading original SA...: ', end='')
    s_arr = read_byte_numpy(append_file_name('data/genome.sa'))

    # kasai() returns (inv_suff, lcp); only the LCP array is needed
    lcp1 = kasai(geno, s_arr)[1]
    d1 = OrderedDict(mu(SA=s_arr, LCP=lcp1))
    del lcp1
    del s_arr

    au = _part_2(genome_file_name=PATH + ORIGINAL)
    print("au list: ", list(au))

    # *************************
    # (2) flipped
    # *************************

    print("performing flips: ")
    geno2 = read_unambiguous(PATH + FLIPPED)

    s_arr2 = read_byte_numpy(append_file_name('data/flippedGeno.sa'))

    lcp2 = kasai(geno2, s_arr2)[1]
    del geno2

    mu_result = dict(compare(d=d1, SA=s_arr2, LCP=lcp2))
    del lcp2
    mu_result = OrderedDict(sort_mu(mu_result))

    mu_result = OrderedDict(true_address_dict(mu_result, au))

    json_it(mu_result, append_file_name(files['MU_RESULT']))

    contigs = OrderedDict(
        find_perfect_contigs(d=mu_result, bot=20, top=100))

    json_it(contigs, append_file_name(files['PERFECT_CONTIGS']))

    contigs = list(within_distance(d=contigs, distance=300))

    json_it(contigs,
            append_file_name(files['PERFECT_CONTIGS_WITH_DISTANCE']))

    print("number of contigs: ", len(contigs))

    print("done")
Example #5
0
def _part_1(genome, past, s_array, args=None, print=print):
    """PART 1: compute (or load) the LCP array.

    If args.lcpfile names an existing JSON file, load the LCP from it —
    detecting whether an old pickle was stored as {sa: lcp} or
    {lcp: sa} — otherwise run Kasai's algorithm and json the result.

    :param genome: genome sequence passed through to kasai()
    :param past: previous time.time() checkpoint for progress reporting
    :param s_array: suffix array
    :param args: namespace with .lcpfile, .inverse and .outfile attrs
    :param print: print-like callable (injectable for logging)
    :return: tuple (past, lcp, inv_suff)
    """
    # NOTE(review): the original wrapped this body in
    # `try: ... except Exception as e: raise`, a no-op; removed.
    inv_suff = []

    # ___________________________________________
    print('\n_____________________________________')
    print('PART 1: COMPUTE LCP ARRAY')
    print('_____________________________________\n')
    # ____________________________________________

    # if user has specified a LCP pickle file that already exists
    if args.lcpfile and os.path.isfile(path=args.lcpfile):

        print("uniques file exists: ")

        # TODO: change this as necessary
        #   hopefully start_uniques will be pickled/json/msgpacked in the future
        lcp = unjson_it(args.lcpfile)

        # find out what format the lcp was pickled.
        # BUG FIX: in Python 3 dict views are not subscriptable, so
        # lcp.keys()[0] raised TypeError — use next(iter(...)).
        # isinstance replaces the exact-type comparisons and also
        # covers OrderedDict (a dict subclass).
        if isinstance(lcp, dict):
            key = next(iter(lcp))
            value = lcp[key]

            if key > value and (100 >= value >= 20):
                print("old lcp pickle was in format sa:lcp")
                lcp = deque(lcp.values())

            elif key < value and (100 >= key >= 20):
                print("old lcp pickle was in format lcp:sa")
                lcp = deque(lcp.keys())

            else:
                print("not sure what's going on here for sa_lcp dict")
                raise KeyboardInterrupt

            s_array = deque(s_array)

        elif isinstance(lcp, list):
            print('LCP file read as list')

        print("uniques unpacked\n")
        past = get_time(past, print=print)
        print("Computing Unique Start Lengths")

        # combine sa and lcp to form a dict with keys: sa, values: unique_starts
        # TODO: creating OrderedDict consumes too much memory

        filename = append_file_name('json_lcp')
        if args.outfile:
            filename = args.outfile

        if args.inverse:
            inv_suff = unjson_it(args.inverse)
        else:
            inv_suff = inverse1(s_array=s_array)

    else:
        if args.inverse:
            inv_suff = unjson_it(args.inverse)
        inv_suff, lcp = kasai(genome=genome,
                              inv_suff=inv_suff,
                              s_array=s_array,
                              print=print)
        past = time.time()

        print('Completed.')

        # json it
        filename = append_file_name('json_lcp')
        if args.outfile:
            filename = append_file_name(args.outfile + 'json_lcp')

        # BUG FIX: print() does not %-format its arguments; the original
        # printed a literal "%s" followed by the filename. Interpolate
        # explicitly instead.
        print('json\'ing lcp array to %s' % filename)
        json_it(data=lcp, filename=filename)

        print('LCP json\'ed!')
        past = get_time(past, print=print)

    return past, lcp, inv_suff