Beispiel #1
0
def crawler(filename: str, geno_file: str):
    """Check the ambs/unam groups stored in a JSON file against a genome.

    The JSON file is loaded with ``fm.unjson_it`` and iterated as a mapping
    whose keys are unambiguous-run lengths ("unam") and whose values are
    ambiguous-run lengths ("ambs").  The genome is loaded with ``fm.reads``.
    For every group the characters at the running offsets are asserted to be
    a base (A/C/G/T) or ``'N'`` as appropriate.

    Args:
        filename: path of the ambs/unam JSON file.
        geno_file: path of the genome file.

    Returns:
        None.

    Raises:
        AssertionError: if a character at a checked offset is not the
            expected kind.
    """
    a_u_dict = fm.unjson_it(filename)
    seq = fm.reads(geno_file)
    seq_len = len(seq)  # do not shadow the builtin ``max``
    bases = ('A', 'C', 'G', 'T')
    u_offset = 0
    a_offset = 0  # important to note that ambs measures length and not position

    for una in a_u_dict:
        # key = unam
        # value = ambs
        unam_len = int(una)
        ambs_len = int(a_u_dict[una])

        u_offset += unam_len
        # NOTE(review): a_offset accumulates "- 1" on every iteration, so it
        # drifts relative to u_offset -- confirm this is intended.
        a_offset += ambs_len + unam_len - 1

        if u_offset != 0:
            assert seq[u_offset] in bases

            # Only look one past the offset when that index really exists;
            # the previous "offset < len" guard still indexed seq[len].
            if u_offset + 1 < seq_len:
                assert seq[u_offset + 1] == 'N'

        assert seq[a_offset] == 'N'

        if a_offset + 1 < seq_len:
            assert seq[a_offset + 1] in bases

        u_offset += ambs_len
Beispiel #2
0
def read_json_file(filename: str):
    """Dump the keys and values of a JSON mapping into two text files.

    The mapping is loaded with ``fm.unjson_it``.  Each key is written on its
    own line to the file ``unam`` and each corresponding value on its own
    line to the file ``ambs``; both files are created (or truncated) in the
    current working directory.

    Args:
        filename: path of the JSON file to load.
    """
    mapping = fm.unjson_it(filename)
    # Open order matches the original: 'ambs' first, then 'unam'.
    with open('ambs', 'w') as ambs_out, open('unam', 'w') as unam_out:
        for unam_key in mapping:
            unam_out.write(f"{unam_key!s}\n")
            ambs_out.write(f"{mapping[unam_key]!s}\n")
Beispiel #3
0
def read_trues_chr(true_file: str, chrs_file: str):
    """Load the true-address data and the chrs dictionary, reporting progress.

    Args:
        true_file: path passed to ``read_byte_to_queue``.
        chrs_file: path passed to ``unjson_it``.

    Returns:
        A ``(trues, chrs_dict)`` tuple holding the two loaded objects.
    """
    print("Reading data: ")

    # Progress text is printed before each read and closed off after it.
    print("Reading True Address file: ", end="")
    true_addresses = read_byte_to_queue(true_file)
    print("done.\nReading chrs file: ", end='')

    chromosomes = unjson_it(chrs_file)
    print("done.")

    loaded = (true_addresses, chromosomes)
    return loaded
Beispiel #4
0
def read_dict(filename: str, filetype='pickle'):
    """Load a dictionary from *filename* with the reader chosen by *filetype*.

    Args:
        filename: path handed to the selected reader.
        filetype: one of ``'pickle'`` (default), ``'json'`` or ``'msgpack'``;
            matched case-insensitively.

    Returns:
        Whatever the selected reader (``unpickle_dict``, ``unjson_it`` or
        ``msgunpack_dict``) returns.

    Raises:
        ValueError: if *filetype* is not one of the supported names.
    """
    kind = filetype.lower()  # normalise once instead of per comparison

    if kind == 'pickle':
        return unpickle_dict(filename)
    if kind == 'json':
        return unjson_it(filename)
    if kind == 'msgpack':
        return msgunpack_dict(filename=filename)

    # ValueError is a subclass of Exception, so callers that caught the
    # previous bare Exception keep working.
    raise ValueError('unknown file type: %r' % (filetype,))
Beispiel #5
0
def read_data(true_file: str, tops_file: str, lcps_file: str, chrs_file: str):
    """Load the trues, tops, lcps and chrs inputs, reporting progress.

    The first three files are read with ``read_byte_to_queue`` and the last
    with ``unjson_it``; a progress message is printed around each read.

    Args:
        true_file: path of the true-address file.
        tops_file: path of the tops file.
        lcps_file: path of the unique-start (lcps) file.
        chrs_file: path of the chrs JSON file.

    Returns:
        A ``(trues, tops, lcps, chrs_dict)`` tuple.
    """
    print("Reading data: ")
    print("Reading True Address file: ", end="")

    # Each entry pairs a file to read with the message printed once that
    # read has finished (which also announces the next read).
    steps = (
        (true_file, "done.\nReading Tops file: "),
        (tops_file, "done.\nReading Unique Start file: "),
        (lcps_file, "done.\nReading chrs file: "),
    )
    queues = []
    for path, follow_up in steps:
        queues.append(read_byte_to_queue(path))
        print(follow_up, end="")

    chrs_dict = unjson_it(chrs_file)
    print("done.")

    trues, tops, lcps = queues
    return trues, tops, lcps, chrs_dict
Beispiel #6
0
def test_validity():
    """Assert that every stored mu interval selects a distinct genome slice.

    Loads the mapping from ``../src/c22_mu`` and the genome from
    ``../data/22.fa``; for each key the slice ``geno[int(key):mu[key] + 1]``
    must not have been seen for an earlier key.

    Raises:
        AssertionError: if two keys yield the same slice.
    """
    # assumes that mu's have already been json'ed
    mu = fm.unjson_it("../src/c22_mu")
    geno = fm.read_unambiguous("../data/22.fa")

    # A set is enough: only membership of previously seen slices matters.
    seen = set()

    for key in tqdm(mu, desc="checking uniqueness"):
        seq = geno[int(key):mu[key] + 1]
        assert seq not in seen, "duplicate sequence for key %s" % (key,)
        seen.add(seq)
        # The former ``myd["ACGT"]`` lookup at key "700" was a leftover debug
        # statement that raised KeyError unrelated to the check; removed.
Beispiel #7
0
# Script: loads a JSON'ed dict and a genome, then walks the dict entries and
# counts how many of the genome windows they describe contain an 'N'.
import sys
import os
from tqdm import tqdm


# Make the project's src directory importable: everything in the current
# working directory path up to the first 'uniquekmer' is kept and
# 'uniquekmer/src' is appended to it.
sys.path.append(os.getcwd().split('uniquekmer')[0] + 'uniquekmer/src')

import file_manager as fm


filename = '../22_json_default_dict'
print('reading jsoned dict')
d = fm.unjson_it(filename)
print('read!')

print('reading genome')
genome = fm.reads('../data/22.fa')
print('read!')

# Counters: ambs counts windows containing 'N'; tops is initialised here but
# not updated in the visible part of the script.
ambs=0
tops=0

# checking how many tops are less than 100
for sa in tqdm(d, desc='checking dict'):
    # The second element of each entry is used as the window length.
    top = int(d[sa][1])
    # Keys are converted to int so they can be used as genome offsets.
    sa = int(sa)
    string = genome[sa:sa+top]
    if 'N' in string:
        ambs+=1

    # NOTE(review): the body of this branch is not visible in this excerpt.
    if top < 100:
Beispiel #8
0
def temp():
    """Re-dump the ``c22_mu`` mapping as separate key and value lists.

    Loads ``c22_mu`` with ``unjson_it`` and hands its keys and values, each
    as a list, to ``just_dump`` under the name ``c22_mu_just_dump``.
    """
    mapping = unjson_it(filename="c22_mu")
    key_list, value_list = list(mapping.keys()), list(mapping.values())
    just_dump(key_list, value_list, fn="c22_mu_just_dump")
Beispiel #9
0
def _lcp_values_from_dict(lcp, print=print):
    """Return the LCP values of a dict whose orientation (sa:lcp or lcp:sa) is unknown."""
    # ``dict.keys()`` is a view in Python 3 and cannot be indexed, so the
    # first entry is taken through an iterator instead of ``keys()[0]``.
    first_item = next(iter(lcp.items()), None)

    if first_item is not None:
        key, value = first_item
        # NOTE(review): if unjson_it decodes plain JSON the keys are strings,
        # and comparing them with int values raises TypeError -- confirm the
        # on-disk format.
        if key > value and (100 >= value >= 20):
            print("old lcp pickle was in format sa:lcp")
            return deque(lcp.values())

        if key < value and (100 >= key >= 20):
            print("old lcp pickle was in format lcp:sa")
            return deque(lcp.keys())

    print("not sure what's going on here for sa_lcp dict")
    # Kept as KeyboardInterrupt because that is what callers of the original
    # code observed for an unrecognised (or empty) dict.
    raise KeyboardInterrupt


def _part_1(genome, past, s_array, args=None, print=print):
    """Part 1 of the pipeline: obtain the LCP array and inverse suffix array.

    If ``args.lcpfile`` names an existing file, the LCP data is read from it
    with ``unjson_it`` and the inverse suffix array is read from
    ``args.inverse`` or built with ``inverse1``.  Otherwise both are computed
    with ``kasai`` and the LCP array is written out with ``json_it``.

    Args:
        genome: genome data passed through to ``kasai``.
        past: timestamp passed to ``get_time``.
        s_array: suffix array passed to ``inverse1`` / ``kasai``.
        args: object that may carry ``lcpfile``, ``outfile`` and ``inverse``
            attributes; ``None`` or a missing attribute means "not given".
        print: print-like callable used for all progress output.

    Returns:
        A ``(past, lcp, inv_suff)`` tuple.

    Raises:
        KeyboardInterrupt: if the LCP file holds a dict whose layout cannot
            be recognised.
    """
    # Read the options defensively so the documented default ``args=None``
    # no longer fails with AttributeError.
    lcpfile = getattr(args, 'lcpfile', None)
    outfile = getattr(args, 'outfile', None)
    inverse = getattr(args, 'inverse', None)

    inv_suff = []

    # ___________________________________________
    print('\n_____________________________________')
    print('PART 1: COMPUTE LCP ARRAY')
    print('_____________________________________\n')
    # ____________________________________________

    # if user has specified a LCP file that already exists
    if lcpfile and os.path.isfile(path=lcpfile):

        print("uniques file exists: ")

        # TODO: change this as necessary
        #   hopefully start_uniques will be pickled/json/msgpacked in the future
        lcp = unjson_it(lcpfile)

        # find out what format the lcp was stored in
        # (isinstance also covers OrderedDict, which subclasses dict)
        if isinstance(lcp, dict):
            lcp = _lcp_values_from_dict(lcp, print=print)
            s_array = deque(s_array)

        elif isinstance(lcp, list):
            print('LCP file read as list')

        print("uniques unpacked\n")
        past = get_time(past, print=print)
        print("Computing Unique Start Lengths")

        # combine sa and lcp to form a dict with keys: sa, values: unique_starts
        # TODO: creating OrderedDict consumes too much memory

        # NOTE(review): ``filename`` is not used again on this path; the
        # calls are kept in case append_file_name has side effects.
        filename = append_file_name('json_lcp')
        if outfile:
            filename = outfile

        if inverse:
            inv_suff = unjson_it(inverse)
        else:
            inv_suff = inverse1(s_array=s_array)

    else:
        if inverse:
            inv_suff = unjson_it(inverse)
        inv_suff, lcp = kasai(genome=genome,
                              inv_suff=inv_suff,
                              s_array=s_array,
                              print=print)
        past = time.time()

        print('Completed.')

        # json it
        filename = append_file_name('json_lcp')
        if outfile:
            filename = append_file_name(outfile + 'json_lcp')

        # Interpolate the name: passing it as a second print argument left
        # the literal "%s" in the output.
        print('json\'ing lcp array to %s' % filename)
        json_it(data=lcp, filename=filename)

        print('LCP json\'ed!')
        past = get_time(past, print=print)

    return past, lcp, inv_suff