def test_validity():
    """Assert that no substring addressed by the ``c22_mu`` table repeats.

    The table is loaded from ``../src/c22_mu`` and the genome from
    ``../data/22.fa``.  For every key, the genome is sliced from ``int(key)``
    up to and including index ``mu[key]``, and the test fails as soon as a
    slice that was already recorded shows up again.
    """
    # assumes that mu's have already been json'ed
    mu_table = fm.unjson_it("../src/c22_mu")
    genome = fm.read_unambiguous("../data/22.fa")

    seen = {}
    for start in tqdm(mu_table, desc="checking uniqueness"):
        begin = int(start)
        stop = mu_table[start] + 1
        fragment = genome[begin:stop]
        assert fragment not in seen
        seen[fragment] = 1
        if start == "700":
            # NOTE(review): bare lookup with no assignment -- it raises
            # KeyError unless "ACGT" has already been recorded.  Looks like a
            # leftover debugging probe; confirm before removing.
            seen["ACGT"]
def temp_forward_unique_check():
    """Dump the unique LCP entries of the forward genome as JSON.

    Reads the forward genome (``PATH + FORWARD``) and the suffix array stored
    in ``data/22.sa``, runs ``kasai`` on them, filters the resulting LCP array
    through ``get_uniques`` and writes the outcome to ``c22_forward_uniques``
    via ``json_it``.

    :return: None
    """
    geno = read_unambiguous(filename=PATH + FORWARD)
    s_arr = read_byte_numpy(append_file_name('data/22.sa'))

    # kasai() returns a pair; only its second element (the LCP array) is
    # needed here.
    _, lcp = kasai(geno, s_arr)

    # `trues0` must be bound before it is serialised; without this assignment
    # the json_it() call below fails with NameError.
    trues0 = list(get_uniques(lcp))
    json_it(trues0, "c22_forward_uniques")
def SA_file(filename:str):
    """Build a suffix array for ``filename`` naively and write it to disk.

    The genome is read with ``read_unambiguous``, its suffix array is built
    with ``naive_SA``, the array's length is placed in front of the entries,
    and the result is written with ``write_array_to_byte``.  The output name
    is always ``fake_genome_sa``, whatever the input file is.

    :param filename: path of the genome file to read
    :return: None
    """
    print('Reading file: ')
    genome = read_unambiguous(filename)

    print('File read!\nCreating Suffix Array Naively:')
    suffixes, _unused = naive_SA(genome)
    # The length is evaluated before the insert runs, so it counts only the
    # suffix entries and not the extra leading element.
    suffixes.insert(0, len(suffixes))

    print('Suffix Array Created!\nWriting to file: ')
    write_array_to_byte(filename='fake_genome_sa', byte_arr=suffixes)
    return None
def genome_reads_test(filename):
    """Time both genome readers on one file and check that they agree.

    ``read_unambiguous`` and ``reads`` are each run on the resolved path and
    their wall-clock durations are printed.  The function then asserts that
    the first result is exactly a ``Bio.Seq.Seq``, the second is exactly a
    ``str``, and that both hold the same characters.

    :param filename: name passed through ``append_file_name`` to get the path
    """
    # testing reads() from file manager
    full_path = append_file_name(filename)

    started = time.time()
    via_bioseq = read_unambiguous(filename=full_path)
    bioseq_done = time.time()
    print('genome read with Bio.Seq. Time elapsed: ', bioseq_done - started)

    # The second interval starts at the timestamp taken right after the first
    # read, so it also covers the print above.
    via_reads = reads(filename=full_path)
    reads_done = time.time()
    print('genome read with Reads.py. Time elapsed: ', reads_done - bioseq_done)

    assert type(via_bioseq) is Bio.Seq.Seq
    assert type(via_reads) is str
    assert via_reads == str(via_bioseq)
def driver():
    """Load the two-way genome and its suffix array.

    Both values are read (from ``TWO_WAY_GENO`` and ``S_ARRAY``) and bound to
    locals only; nothing is returned or processed further.
    """
    genome = read_unambiguous(TWO_WAY_GENO)
    suffix_array = read_byte_numpy(S_ARRAY)
def sesame_plant():
    """Build an ``SAGuide`` from the chromosome-22 files and print it.

    The suffix array comes from ``data/22.sa`` and the genome from
    ``data/22.fa``, both resolved through ``append_file_name``.
    """
    suffix_array = read_byte_numpy(append_file_name('data/22.sa'))
    genome = read_unambiguous(append_file_name('data/22.fa'))
    print(SAGuide(s_arr=suffix_array, geno=genome))
def mu_driver():
    """
    Compute minimal uniques (MU) for the original genome, compare them against
    the flipped genome, and dump the results as JSON.

    Similar to driver.py, except that it includes minimal uniques instead of
    finding 20-100 uniquemers.

    Files written (all resolved through ``append_file_name``):
      * ``json_chrs`` -- first value returned by ``chr_splits``
      * ``files['MU_RESULT']`` -- MU table after ``sort_mu`` and
        ``true_address_dict``
      * ``files['PERFECT_CONTIGS']`` -- output of ``find_perfect_contigs``
        with ``bot=20, top=100``
      * ``files['PERFECT_CONTIGS_WITH_DISTANCE']`` -- output of
        ``within_distance`` with ``distance=300``

    Exceptions raised by any step propagate to the caller unchanged.

    :return: None
    """
    # *************************
    # (1) original
    # *************************
    print('reading original genome: ', end='')
    chrs, geno = chr_splits(filename=PATH + ORIGINAL)
    json_it(chrs, append_file_name("json_chrs"))
    del chrs

    print('done.\nreading original SA...: ', end='')
    s_arr = read_byte_numpy(append_file_name('data/genome.sa'))
    lcp1 = kasai(geno, s_arr)[1]
    # The genome is not referenced again in this function; drop it so it is
    # not held while the flipped genome is loaded below.
    del geno
    d1 = OrderedDict(mu(SA=s_arr, LCP=lcp1))
    del lcp1
    del s_arr

    # Materialise once: the value is printed here and consumed again by
    # true_address_dict() further down.  A one-shot iterator would already be
    # exhausted by the time of that second use.
    au = list(_part_2(genome_file_name=PATH + ORIGINAL))
    print("au list: ", au)

    # *************************
    # (2) flipped
    # *************************
    print("performing flips: ")
    geno2 = read_unambiguous(PATH + FLIPPED)
    s_arr2 = read_byte_numpy(append_file_name('data/flippedGeno.sa'))
    lcp2 = kasai(geno2, s_arr2)[1]
    del geno2

    mu_result = dict(compare(d=d1, SA=s_arr2, LCP=lcp2))
    del lcp2

    mu_result = OrderedDict(sort_mu(mu_result))
    mu_result = OrderedDict(true_address_dict(mu_result, au))
    json_it(mu_result, append_file_name(files['MU_RESULT']))

    contigs = OrderedDict(
        find_perfect_contigs(d=mu_result, bot=20, top=100))
    json_it(contigs, append_file_name(files['PERFECT_CONTIGS']))

    contigs = list(within_distance(d=contigs, distance=300))
    json_it(contigs, append_file_name(files['PERFECT_CONTIGS_WITH_DISTANCE']))

    print("number of contigs: ", len(contigs))
    print("done")
def efficient_mu_driver():
    """
    Compute minimal-unique start/end lists from the forward and flipped
    genomes and dump them with ``just_dump``.

    Outline:
      1. Forward genome (``PATH + FORWARD``, ``data/22.sa``): run ``kasai``,
         filter the LCP with ``get_uniques``, order it with ``sort_lcp``, and
         derive ``bad_address`` via ``forbiddens``.
      2. Flipped genome (``PATH + FLIPPED``, ``data/f22.sa``): same
         kasai / get_uniques / sort_lcp pipeline.
      3. Walk the tuples yielded by ``compare_no_inv_suff`` and add the
         current ``au`` offset to each start before recording it.
      4. Dump starts to ``c22_mu_starts_0709`` and ends to
         ``c22_mu_ends_0709``.

    NOTES: 07/05: You MUST run get_uniques first before sorting the lcp

    Nothing is propagated to the caller: an ``IndexError`` is reported with
    its traceback, and any other exception is reported and followed by
    ``breakpoint()``.

    :return: None
    """
    try:
        geno = reads(filename=PATH + FORWARD)
        geno_length = len(geno)

        s_arr = read_byte_numpy(append_file_name('data/22.sa'))
        # A single kasai() pass yields both arrays; running it again only to
        # re-read the LCP would repeat the whole computation.
        inv_suff, lcp = kasai(geno, s_arr)
        del geno, s_arr

        # Materialise once: `au` is handed to forbiddens() and iterated again
        # when the offset table is built, so it must survive two passes.
        au = list(_part_2(genome_file_name=PATH + FORWARD))

        lcp = list(get_uniques(lcp))
        trues0 = list(sort_lcp(lcp=lcp, inv_suff=inv_suff))
        del lcp
        bad_address = forbiddens(inv_suff=inv_suff, lcp=trues0, au=au)
        del inv_suff

        geno = read_unambiguous(filename=PATH + FLIPPED)
        s_arr = read_byte_numpy(append_file_name('data/f22.sa'))
        inv_2, lcp = kasai(geno, s_arr)
        del geno, s_arr

        lcp = list(get_uniques(lcp))
        trues1 = list(sort_lcp(lcp=lcp, inv_suff=inv_2))
        del lcp, inv_2

        # First element of each `au` item -> second element.  The loop below
        # uses the key as an upper bound for starts and the value as the
        # amount added to them.
        au_dict = {item[0]: item[1] for item in au}
        del au
        if not au_dict:
            # Without any boundary there is no offset to apply.
            print("au is empty; nothing to dump")
            return

        mu_s = []
        mu_e = []
        u_ceil = next(iter(au_dict))  # first key in insertion order
        u_floor = 0
        a_offset = au_dict[u_ceil]

        for tup in compare_no_inv_suff(trues0=trues0,
                                       trues1=trues1,
                                       bad_address=bad_address,
                                       geno_length=geno_length):
            sa = tup[0]
            if sa < u_floor:
                raise Exception(
                    "SA is less than u_floor. Possible that s_arr not sorted correctly?"
                )
            # NOTE(review): at most one boundary is crossed per tuple.  If two
            # consecutive starts can skip a whole au interval this probably
            # needs to loop -- confirm against compare_no_inv_suff's output.
            if sa > u_ceil and len(au_dict) > 1:
                u_floor = u_ceil
                del au_dict[u_ceil]
                u_ceil = next(iter(au_dict))
                a_offset = au_dict[u_ceil]
            elif len(au_dict) < 1:
                # NOTE(review): au_dict is non-empty on entry and entries are
                # only removed while more than one remains, so this branch
                # does not appear reachable.
                print("not au_dict reached")
                break

            mu_s.append(sa + a_offset)
            mu_e.append(tup[1])

        # TODO: 07/05 made the line below return a dict as well as accept geno
        # to return to before, do not input geno and output two lists
        # myd = dict(compare(trues0 = trues0, trues1 = trues1, bad_address=bad_address))
        # json_it(data=myd, filename="c22_mu")

        assert len(mu_s) == len(mu_e)
        just_dump(myl=mu_s, fn="c22_mu_starts_0709", print_length=True)
        just_dump(myl=mu_e, fn="c22_mu_ends_0709", print_length=True)

        # 07/08: changed get_uniques so that it doesn't yield past or lcp + 1
    except IndexError:
        # Report rather than discard silently; unlike other failures this one
        # does not drop into the debugger.
        print(traceback.format_exc())
    except Exception:
        print(traceback.format_exc())
        breakpoint()