Ejemplo n.º 1
0
def crawler(filename: str, geno_file: str):
    """Sanity-check the unam/ambs groups of a JSON file against a genome.

    The mapping loaded from ``filename`` is walked in iteration order; each
    key is treated as a "unam" number and each value as an "ambs" number
    (both converted with ``int``). Two running offsets are advanced per
    entry and the characters of the sequence at (and right after) those
    offsets are asserted to be either a concrete base or ``'N'``.

    :param filename: path handed to ``fm.unjson_it`` (the ambs/unam mapping)
    :param geno_file: path handed to ``fm.reads`` (the sequence)
    :return: None
    :raises AssertionError: when a checked position holds an unexpected
        character
    """
    bases = ('A', 'C', 'G', 'T')
    a_u_dict = fm.unjson_it(filename)
    seq = fm.reads(geno_file)
    seq_len = len(seq)  # was named ``max``, which shadowed the builtin
    u_offset = 0
    a_offset = 0  # important to note that ambs measures length and not position

    for una in a_u_dict:
        # key = unam
        # value = ambs
        unam_len = int(una)
        ambs_len = int(a_u_dict[una])

        u_offset += unam_len
        a_offset += ambs_len + unam_len - 1

        if u_offset != 0:
            assert seq[u_offset] in bases

            # Only look one position ahead when that position exists; the
            # previous ``u_offset < len(seq)`` guard still indexed past the
            # end when u_offset was the last valid index.
            if u_offset + 1 < seq_len:
                assert seq[u_offset + 1] == 'N'

        assert seq[a_offset] == 'N'

        # Same look-ahead guard as above.
        if a_offset + 1 < seq_len:
            assert seq[a_offset + 1] in bases

        u_offset += ambs_len
Ejemplo n.º 2
0
def main():
    """Check that consecutive suffix-array entries are in sorted order.

    Reads the suffix array from ``data/22.sa`` and the genome from
    ``data/22.fa``, strips every ``'N'`` from the genome, and then, for each
    adjacent pair of suffix-array entries, asserts that the first
    ``length + 1`` characters of the earlier suffix compare ``<=`` to those
    of the later one. Both prefixes are printed for every compared pair.

    :return: None
    :raises AssertionError: when an adjacent pair is out of order
    """
    print('Reading SA: ')
    s_array = read_byte_array(append_file_name('data/22.sa'))
    print('SA read!\nReading genome: ')
    # !! reads returns ambs
    # Drop every 'N' in one pass instead of growing a string with ``+=``.
    # NOTE(review): read_unambiguous(filename=...) was previously noted here
    # as an alternative way to obtain the genome - confirm which is wanted.
    genome = ''.join(reads(filename=append_file_name('data/22.fa')).split('N'))
    print('Genome read!')

    length = 30
    s_len = len(s_array)
    for i in trange(s_len, desc='Checking validity of suffix array: '):
        sa = s_array[i]
        # The last entry has no successor; without the ``i + 1`` guard the
        # lookup of s_array[i + 1] runs past the end of the array.
        if i + 1 < s_len and sa + length + 1 < s_len:
            successor = s_array[i + 1]
            s0 = genome[sa:sa + length + 1]
            s1 = genome[successor:successor + length + 1]

            print('s0: ', s0)
            print('s1: ', s1)

            assert s0 <= s1
Ejemplo n.º 3
0
def with_args():
    """Prompt for a sequence file, split it, and write the result to disk.

    The object returned by ``split_sequence`` is used as a mapping: its keys
    are written under a ``unam`` header and its values under an ``ambs``
    header, one entry per line, to the file named by
    ``fm.append_file_name('ambs_unam')``.

    :return: None
    """
    seq_path = input('Enter file name of sequence file: ')
    d = split_sequence(sequence=fm.reads(seq_path))
    print('writing to file: ')
    with open(fm.append_file_name('ambs_unam'), 'w') as out:
        # snapshot both views before anything is written
        unam_keys = list(d.keys())
        ambs_values = list(d.values())

        out.write('unam\n')
        out.writelines(f'{u!s}\n' for u in unam_keys)

        out.write('\n\nambs\n')
        out.writelines(f'{a!s}\n' for a in ambs_values)
Ejemplo n.º 4
0
def specific_k(k: int,
               d: dict,
               seq_file='',
               sequence='',
               high=0,
               outfile=append_file_name('k_mer')):
    """
    Collect the k-mers of length ``k`` that occur exactly once among the
    addresses of ``d`` whose range admits ``k``.

        d: { key( true address of genome ): [ unique start, top ] }

    An address ``sa`` is considered only when
    ``d[sa][0] + 1 < k < d[sa][1]``; its k-mer is
    ``sequence[int(sa):int(sa) + k]``.

    :param k: length of the k-mers to extract
    :param d: mapping described above
    :param seq_file: file read with ``reads`` when ``sequence`` is empty
    :param sequence: the sequence itself; takes precedence over ``seq_file``
    :param high: not used by this function (kept for call compatibility)
    :param outfile: not used by this function (kept for call compatibility)
    :return: dict mapping each k-mer seen exactly once to its address key
    :raises InsufficientArguments: when neither ``sequence`` nor
        ``seq_file`` is given
    """
    # if the sequence has been passed, then use sequence.
    # else, if only seq_file was passed, then read sequence
    if not sequence:
        if not seq_file:
            raise InsufficientArguments
        sequence = reads(seq_file)
        print('length of sequence: ' + str(len(sequence)))

    valids = {}
    # keys: seq
    # values: sa

    # k-mers already seen more than once. Without this, deleting a k-mer on
    # its second sighting let a third sighting re-enter ``valids`` as if it
    # were unique.
    repeated = set()

    try:
        for sa in tqdm(d, desc='finding ' + str(k) + '-mers: '):
            top = d[sa][1]
            unique_start = d[sa][0]
            if not unique_start + 1 < k < top:
                continue

            seq = sequence[int(sa):int(sa) + k]
            if seq in repeated:
                continue
            if seq in valids:
                del valids[seq]
                repeated.add(seq)
            else:
                valids[seq] = sa

    except IndexError:
        # Best-effort behaviour kept from the original: an entry of ``d``
        # with fewer than two elements ends the scan and the k-mers found so
        # far are returned.
        pass

    return valids
Ejemplo n.º 5
0
def genome_reads_test(filename):
    """Compare ``read_unambiguous`` with ``reads`` on the same file.

    Both readers are timed and the elapsed times printed. The test then
    asserts the result types (``Bio.Seq.Seq`` and ``str`` respectively) and
    that the two results are equal once the former is converted to ``str``.

    :param filename: name resolved through ``append_file_name``
    :return: None
    """
    path = append_file_name(filename)

    started = time.time()
    bioseq_genome = read_unambiguous(filename=path)
    after_bioseq = time.time()
    print('genome read with Bio.Seq. Time elapsed: ', after_bioseq - started)

    plain_genome = reads(filename=path)
    after_reads = time.time()
    print('genome read with Reads.py. Time elapsed: ', after_reads - after_bioseq)

    # exact type checks, not isinstance
    assert type(bioseq_genome) is Bio.Seq.Seq
    assert type(plain_genome) is str

    assert plain_genome == str(bioseq_genome)
Ejemplo n.º 6
0
def _test_part_0(args:Args):
    """Run ``driver._part_0`` and check the types of what it returns.

    When ``args.SA`` is falsy, a suffix array is first built with
    ``naive_SA`` from the genome named by ``args.genome``, written to
    ``test/fake_SA`` and ``args.SA`` is pointed at that file.

    :param args: argument object; ``args.SA`` may be modified as described
    :return: the tuple ``(genome, past, s_array, start)``
    """
    if not args.SA:
        # no suffix array supplied: build a stand-in and store it on disk
        built_sa, _ = naive_SA(string=fm.reads(args.genome))
        fm.write_array_to_byte(byte_arr = built_sa, filename=fm.append_file_name('test/fake_SA'))
        args.SA = fm.append_file_name('test/fake_SA')

    genome, past, s_array, start = driver._part_0(args=args)

    # genome must be a non-empty plain string
    assert type(genome) is str
    assert genome

    # s_array must be a numpy array; start and past must be floats
    for value, expected_type in ((s_array, np.ndarray), (start, float), (past, float)):
        assert type(value) is expected_type

    return genome, past, s_array, start
Ejemplo n.º 7
0
import os
import sys  # was missing: sys.path is used below
from tqdm import tqdm


# Make <repo>/uniquekmer/src importable from wherever inside the repository
# the script is started.
sys.path.append(os.getcwd().split('uniquekmer')[0] + 'uniquekmer/src')

import file_manager as fm


filename = '../22_json_default_dict'
print('reading jsoned dict')
d = fm.unjson_it(filename)
print('read!')

print('reading genome')
genome = fm.reads('../data/22.fa')
print('read!')

ambs=0  # entries whose window genome[sa:sa+top] contains an 'N'
tops=0  # entries whose top is below 100

# checking how many tops are less than 100
# (and, along the way, how many windows contain an ambiguous base)
for sa in tqdm(d, desc='checking dict'):
    top = int(d[sa][1])
    sa = int(sa)
    string = genome[sa:sa+top]
    if 'N' in string:
        ambs+=1

    if top < 100:
        tops += 1
Ejemplo n.º 8
0
def efficient_mu_driver():
    """
    Build the start/end lists from the forward and flipped genomes and
    dump them with ``just_dump``.

    Steps, as performed below: for the forward and for the flipped genome,
    run ``kasai`` on the genome and its suffix array, filter the LCP with
    ``get_uniques`` and order it with ``sort_lcp``; feed both results and
    the ``forbiddens`` output to ``compare_no_inv_suff``; add to every
    yielded start the offset looked up in the mapping built from
    ``_part_2``; finally dump starts and ends.

        NOTES:
            07/05: You MUST run get_uniques first before sorting the lcp
            07/08: changed get_uniques so that it doesn't yield past or lcp + 1

    :return: None
    """

    try:
        geno = reads(filename=PATH + FORWARD)
        geno_length = len(geno)
        s_arr = read_byte_numpy(append_file_name('data/22.sa'))

        # A single kasai call supplies both values; the previous code ran
        # kasai a second time only to re-read the LCP it had just unpacked.
        inv_suff, lcp = kasai(geno, s_arr)
        del geno, s_arr

        au = _part_2(genome_file_name=PATH + FORWARD)

        lcp = list(get_uniques(lcp))

        trues0 = list(sort_lcp(lcp=lcp, inv_suff=inv_suff))

        del lcp

        bad_address = forbiddens(inv_suff=inv_suff, lcp=trues0, au=au)
        del inv_suff

        geno = read_unambiguous(filename=PATH + FLIPPED)
        s_arr = read_byte_numpy(append_file_name('data/f22.sa'))

        # Same as above: one kasai call instead of two.
        inv_2, lcp = kasai(geno, s_arr)
        del geno, s_arr

        lcp = list(get_uniques(lcp))

        trues1 = list(sort_lcp(lcp=lcp, inv_suff=inv_2))
        del lcp, inv_2

        mu_s = []
        mu_e = []

        # first element of each au item -> second element of that item
        au_dict = {}
        for item in list(au):
            au_dict[item[0]] = item[1]

        del au

        # The first key of au_dict is the current ceiling; a_offset is the
        # value stored for it and is added to every start below.
        u_ceil = list(au_dict)[0]
        u_floor = 0
        a_offset = au_dict[u_ceil]

        for tup in compare_no_inv_suff(trues0=trues0,
                                       trues1=trues1,
                                       bad_address=bad_address,
                                       geno_length=geno_length):
            sa = tup[0]

            if sa < u_floor:
                raise Exception(
                    "SA is less than u_floor. Possible that s_arr not sorted correctly?"
                )

            if sa > u_ceil and len(au_dict) > 1:
                # Passed the current ceiling: drop it and move on to the
                # next key (and its offset).
                u_floor = u_ceil
                del au_dict[u_ceil]
                u_ceil = list(au_dict)[0]
                a_offset = au_dict[u_ceil]

            elif len(au_dict) < 1:
                print("not au_dict reached")
                break

            mu_s.append(sa + a_offset)
            mu_e.append(tup[1])

        # TODO: 07/05 made the line below return a dict as well as accept geno
        #   to return to before, do not input geno and output two lists
        # myd = dict(compare(trues0 = trues0, trues1 = trues1, bad_address=bad_address))
        # json_it(data=myd, filename="c22_mu")
        assert len(mu_s) == len(mu_e)
        just_dump(myl=mu_s, fn="c22_mu_starts_0709", print_length=True)
        just_dump(myl=mu_e, fn="c22_mu_ends_0709", print_length=True)

    except IndexError:
        # NOTE(review): an IndexError anywhere above (e.g. an empty au
        # mapping at ``list(au_dict)[0]``) ends the run silently with nothing
        # dumped - confirm this best-effort behaviour is intended.
        pass
    except Exception:
        # Any other failure: show the traceback and drop into the debugger.
        print(traceback.format_exc())
        breakpoint()
Ejemplo n.º 9
0
def convert_geno_to_num(filename):
    """Read a genome and translate it element by element through ``alpha``.

    :param filename: path handed to ``fm.reads``
    :return: list with ``alpha[x]`` for every element ``x`` of the genome,
        in order
    """
    bases = fm.reads(filename)
    encoded = []
    for base in tqdm(bases, desc="converting genome to numbers: "):
        encoded.append(alpha[base])
    return encoded
Ejemplo n.º 10
0
    return


def with_args():
    """Prompt for a sequence file, split it, and write the result to disk.

    The object returned by ``split_sequence`` is used as a mapping: its keys
    are written under a ``unam`` header and its values under an ``ambs``
    header, one entry per line, to the file named by
    ``fm.append_file_name('ambs_unam')``.

    :return: None
    """
    seq_path = input('Enter file name of sequence file: ')
    d = split_sequence(sequence=fm.reads(seq_path))
    print('writing to file: ')
    with open(fm.append_file_name('ambs_unam'), 'w') as out:
        # snapshot both views before anything is written
        unam_keys = list(d.keys())
        ambs_values = list(d.values())

        out.write('unam\n')
        out.writelines(f'{u!s}\n' for u in unam_keys)

        out.write('\n\nambs\n')
        out.writelines(f'{a!s}\n' for a in ambs_values)


if __name__ == '__main__':
    # Script entry point: split the chromosome 22 FASTA, build the rb
    # structure from the two parts, print its contents, then run
    # test_rb_ambs on it against the full sequence.
    fasta_path = '../data/22.fa'

    ambs, unam = split_sequence(filename=fasta_path)
    rb = rb_tree_ambs(ambs, unam)
    print(list(rb))

    seq = fm.reads(fasta_path)
    test_rb_ambs(rb, sequence=seq)



Ejemplo n.º 11
0
def test_chr_splits():
    """Check that the sequence from ``chr_splits`` matches that of ``reads``.

    ``reads`` is taken as the trusted reference; the second element of the
    pair returned by ``chr_splits`` must equal it.
    """
    # ASSUMES THAT reads() HAS BEEN THOROUGHLY TESTED AND VALID
    fasta_path = '../data/22.fa'
    expected = reads(filename=fasta_path)
    _chrs, actual = chr_splits(fasta_path)

    assert expected == actual