def crawler(filename: str, geno_file: str):
    """Walk the ambs/unam JSON groups and assert they agree with the genome.

    Goes through the ambs/unam json file to ensure the validity of the
    ambs/unam groups against the raw sequence text.

    :param filename: path of the JSON file holding the unam -> ambs mapping
    :param geno_file: path of the genome/sequence file
    """
    a_u_dict = fm.unjson_it(filename)
    seq = fm.reads(geno_file)
    seq_len = len(seq)  # renamed from `max`, which shadowed the builtin
    u_offset = 0
    a_offset = 0
    # important to note that ambs measures length and not position
    for una, ambs in a_u_dict.items():
        # key = unam, value = ambs
        u_offset += int(una)
        a_offset += int(ambs) + int(una) - 1
        if u_offset != 0:
            # end of the unambiguous run must be a real base ...
            assert seq[u_offset] in {'A', 'C', 'G', 'T'}
            # ... followed by the start of the ambiguous ('N') run.
            # Guard fixed to u_offset + 1 < seq_len: the old `u_offset < max`
            # let seq[u_offset + 1] index one past the end of the sequence.
            if u_offset + 1 < seq_len:
                assert seq[u_offset + 1] == 'N'
        # end of the ambiguous run must be 'N', followed by a real base
        assert seq[a_offset] == 'N'
        # same off-by-one fix as above for the a_offset + 1 access
        if a_offset + 1 < seq_len:
            assert seq[a_offset + 1] in {'A', 'C', 'G', 'T'}
        u_offset += int(ambs)
    return
def main():
    """Sanity-check that the chr22 suffix array is sorted.

    Reads the suffix array and the genome, strips the ambiguous 'N'
    characters, then asserts every (length + 1)-char prefix is <= the
    prefix of its successor in suffix-array order.
    """
    print('Reading SA: ')
    s_array = read_byte_array(append_file_name('data/22.sa'))
    print('SA read!\nReading genome: ')
    # !! reads returns ambs
    genome = reads(filename=append_file_name('data/22.fa'))
    # drop every 'N' in one pass (was a quadratic split-then-concatenate loop)
    genome = genome.replace('N', '')
    # genome = read_unambiguous(filename=append_file_name('data/22.fa'))
    print('Genome read!')
    length = 30
    s_len = len(s_array)
    for i in trange(s_len, desc='Checking validity of suffix array: '):
        sa = s_array[i]
        # i + 1 < s_len guards s_array[i + 1]; the original indexed one past
        # the end of s_array on the final iteration.
        if i + 1 < s_len and sa + length + 1 < s_len:
            s0 = genome[sa:sa + length + 1]
            s1 = genome[s_array[i + 1]:s_array[i + 1] + length + 1]
            print('s0: ', s0)
            print('s1: ', s1)
            assert s0 <= s1
def with_args():
    """Prompt for a sequence file, split it, and dump unam/ambs groups to disk."""
    seq = fm.reads(input('Enter file name of sequence file: '))
    groups = split_sequence(sequence=seq)
    print('writing to file: ')
    with open(fm.append_file_name('ambs_unam'), 'w') as out:
        # unam section: one dict key per line
        out.write('unam\n')
        for key in groups.keys():
            out.write(str(key) + '\n')
        # ambs section: one dict value per line
        out.write('\n\nambs\n')
        for value in groups.values():
            out.write(str(value) + '\n')
def specific_k(k: int, d: dict, seq_file='', sequence='', high=0, outfile=append_file_name('k_mer')):
    """Collect the k-mers of *sequence* that occur exactly once.

    d: { key( true address of genome ): [ unique start, top ] }

    :param k: k-mer length
    :param d: mapping of suffix-array address -> [unique start, top]
    :param seq_file: file to read the sequence from when *sequence* is empty
    :param sequence: the sequence text itself (takes precedence over seq_file)
    :param high: sentinel "top" value; entries differing from it are counted
    :param outfile: unused here; kept for interface compatibility
    :raises InsufficientArguments: when neither sequence nor seq_file is given
    :return: dict mapping each unique k-mer string to its suffix-array address
    """
    # if the sequence has been passed, then use sequence.
    # else, if only seq_file was passed, then read sequence
    if not sequence and seq_file:
        sequence = reads(seq_file)
        print('length of sequence: ' + str(len(sequence)))
    elif sequence:
        pass
    else:
        raise InsufficientArguments
    valids = {}   # keys: seq, values: sa
    seen = set()  # k-mers observed more than once, so later repeats stay excluded
    try:
        special_end = 0
        for sa in tqdm(d, desc='finding ' + str(k) + '-mers: '):
            # first find all the k-mers
            if d[sa][1] != high:
                special_end += 1
            if d[sa][0] + 1 < k < d[sa][1]:
                seq = sequence[int(sa):int(sa) + k]
                if seq in seen:
                    # BUG FIX: third (or later) occurrence. Previously the
                    # k-mer was re-inserted here, wrongly reporting a
                    # duplicate k-mer as unique.
                    continue
                if seq in valids:
                    # second occurrence: not unique after all
                    del valids[seq]
                    seen.add(seq)
                    continue
                valids[seq] = sa
    except IndexError:
        pass
    return valids
def genome_reads_test(filename):
    """Benchmark and cross-check the two genome readers on one file.

    Times read_unambiguous (Bio.Seq) against reads (Reads.py) and asserts
    both produce the same text.
    """
    filename = append_file_name(filename)
    t0 = time.time()
    read_bioseq = read_unambiguous(filename=filename)
    t1 = time.time()
    print('genome read with Bio.Seq. Time elapsed: ', t1 - t0)
    read_reads = reads(filename=filename)
    t2 = time.time()
    print('genome read with Reads.py. Time elapsed: ', t2 - t1)
    assert type(read_bioseq) is Bio.Seq.Seq
    assert type(read_reads) is str
    assert read_reads == str(read_bioseq)
def _test_part_0(args: Args):
    """Run driver._part_0 on *args* and sanity-check the returned types."""
    if not args.SA:
        # No suffix array supplied: build a naive one, stash it on disk,
        # and point args at the freshly written file.
        sequence = fm.reads(args.genome)
        s_array, L = naive_SA(string=sequence)
        fm.write_array_to_byte(byte_arr=s_array, filename=fm.append_file_name('test/fake_SA'))
        args.SA = fm.append_file_name('test/fake_SA')
    genome, past, s_array, start = driver._part_0(args=args)
    assert type(genome) is str and genome  # non-empty sequence string
    assert type(s_array) is np.ndarray     # suffix array as a numpy array
    assert type(start) is float            # time.time() timestamp
    assert type(past) is float             # time.time() timestamp
    return genome, past, s_array, start
# Script: scan the jsoned chr22 dict and count ambiguous strings / small tops.
import os
import sys  # BUG FIX: sys.path is mutated below but sys was never imported

from tqdm import tqdm

# make the project src importable regardless of the working directory
sys.path.append(os.getcwd().split('uniquekmer')[0] + 'uniquekmer/src')
import file_manager as fm

filename = '../22_json_default_dict'
print('reading jsoned dict')
d = fm.unjson_it(filename)
print('read!')
print('reading genome')
genome = fm.reads('../data/22.fa')
print('read!')
ambs = 0
tops = 0
# checking how many tops are less than 100
for sa in tqdm(d, desc='checking dict'):
    top = int(d[sa][1])
    sa = int(sa)
    string = genome[sa:sa + top]
    if 'N' in string:
        ambs += 1
    if top < 100:
        tops += 1
def efficient_mu_driver():
    """Compute maximal-unique (MU) start/end lists for chr22 and dump them.

    NOTES: 07/05: You MUST run get_uniques first before sorting the lcp
    :return: None; writes "c22_mu_starts_0709" / "c22_mu_ends_0709" via just_dump
    """
    try:
        # ---- forward pass -------------------------------------------------
        geno = reads(filename=PATH + FORWARD)
        geno_length = len(geno)
        s_arr = read_byte_numpy(append_file_name('data/22.sa'))
        # BUG FIX: kasai() was called twice back-to-back here (and again in
        # the flipped pass), discarding the first lcp — one expensive
        # recomputation per pass removed.
        inv_suff, lcp = kasai(geno, s_arr)
        del geno, s_arr
        au = _part_2(genome_file_name=PATH + FORWARD)
        # 07/08: get_uniques no longer yields past or lcp + 1
        lcp = list(get_uniques(lcp))
        trues0 = list(sort_lcp(lcp=lcp, inv_suff=inv_suff))
        del lcp
        bad_address = forbiddens(inv_suff=inv_suff, lcp=trues0, au=au)
        del inv_suff
        # ---- flipped (reverse-complement) pass ----------------------------
        geno = read_unambiguous(filename=PATH + FLIPPED)
        s_arr = read_byte_numpy(append_file_name('data/f22.sa'))
        inv_2, lcp = kasai(geno, s_arr)
        del geno, s_arr
        lcp = list(get_uniques(lcp))
        trues1 = list(sort_lcp(lcp=lcp, inv_suff=inv_2))
        del lcp, inv_2
        # ---- merge: map comparison hits back to genome coordinates --------
        mu_s = []
        mu_e = []
        au_dict = {}
        for item in list(au):
            au_dict[item[0]] = item[1]
        del au
        # sliding window over the unam boundaries; a_offset shifts each SA
        # position back into true genome coordinates
        u_ceil = list(au_dict)[0]
        u_floor = 0
        a_offset = au_dict[u_ceil]
        for tup in compare_no_inv_suff(trues0=trues0, trues1=trues1,
                                       bad_address=bad_address,
                                       geno_length=geno_length):
            sa = tup[0]
            if sa < u_floor:
                raise Exception(
                    "SA is less than u_floor. Possible that s_arr not sorted correctly?"
                )
            if sa > u_ceil and len(au_dict) > 1:
                # advance to the next unam boundary and its offset
                u_floor = u_ceil
                del au_dict[u_ceil]
                u_ceil = list(au_dict)[0]
                a_offset = au_dict[u_ceil]
            elif len(au_dict) < 1:
                print("not au_dict reached")
                break
            mu_s.append(sa + a_offset)
            mu_e.append(tup[1])
        assert len(mu_s) == len(mu_e)
        just_dump(myl=mu_s, fn="c22_mu_starts_0709", print_length=True)
        just_dump(myl=mu_e, fn="c22_mu_ends_0709", print_length=True)
    except IndexError:
        pass
    except Exception:
        print(traceback.format_exc())
        # NOTE(review): interactive debugging aid — drops into pdb on any
        # unexpected failure; remove before unattended runs.
        breakpoint()
def convert_geno_to_num(filename):
    """Read *filename* and map every base character through the alpha table."""
    sequence = fm.reads(filename)
    return [
        alpha[base]
        for base in tqdm(sequence, desc="converting genome to numbers: ")
    ]
# NOTE(review): stray `return` below — the tail of a definition whose start is
# not visible in this chunk; left untouched rather than guessed at.
return

def with_args():
    # Prompt for a sequence file, split it into unam/ambs groups, and write
    # the two lists (keys, then values) to the 'ambs_unam' output file.
    seq = fm.reads(input('Enter file name of sequence file: '))
    d = split_sequence(sequence=seq)
    print('writing to file: ')
    with open(fm.append_file_name('ambs_unam'),'w') as file:
        unam = list(d.keys())
        ambs = list(d.values())
        file.write('unam\n')
        for u in unam:
            file.write(str(u) + '\n')
        file.write('\n\nambs\n')
        for a in ambs:
            file.write(str(a) + '\n')

if __name__ == '__main__':
    # Smoke test: build the red-black ambs tree for chr22 and validate it
    # against the raw sequence.
    ambs, unam = split_sequence(filename='../data/22.fa')
    rb = rb_tree_ambs(ambs, unam)
    print(list(rb))
    seq = fm.reads('../data/22.fa')
    test_rb_ambs(rb,sequence=seq)
def test_chr_splits():
    """Verify chr_splits() reproduces the sequence text from reads().

    ASSUMES THAT reads() HAS BEEN THOROUGHLY TESTED AND VALID.
    """
    expected = reads(filename='../data/22.fa')
    chrs, actual = chr_splits('../data/22.fa')
    assert expected == actual