def byte_read_array_numpy(filename: str):
    # Read the same file with both back ends and confirm they agree.
    array_array = read_byte_array(filename)
    numpy_array = read_byte_numpy(filename)
    assert type(numpy_array) is np.ndarray
    assert type(array_array) is array.ArrayType
    assert list(numpy_array) == list(array_array)
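# For context, a minimal sketch of what the two readers under test might look
# like, assuming the suffix array is stored as fixed-width integers on disk.
# The dtype/typecode and these *_sketch signatures are assumptions, not the
# project's actual implementations; imports are repeated so the sketch stands
# alone.
import array

import numpy as np


def read_byte_numpy_sketch(filename: str) -> np.ndarray:
    # np.fromfile reads the raw bytes straight into a typed array.
    return np.fromfile(filename, dtype=np.uint32)


def read_byte_array_sketch(filename: str) -> array.array:
    # 'I' is C unsigned int (usually 4 bytes, platform-dependent).
    arr = array.array('I')
    with open(filename, 'rb') as f:
        arr.frombytes(f.read())
    return arr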
def temp_forward_unique_check():
    geno = read_unambiguous(filename=PATH + FORWARD)
    s_arr = read_byte_numpy(append_file_name('data/22.sa'))
    inv_suff, lcp = kasai(geno, s_arr)
    # map each SA position to its LCP value
    myd = {}
    for num in range(len(s_arr)):
        myd[s_arr[num]] = lcp[num]
    trues0 = list(get_uniques(lcp))
    json_it(trues0, "c22_forward_uniques")
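# kasai(geno, s_arr) above returns (inv_suff, lcp). For reference, a minimal
# sketch of Kasai's O(n) LCP construction, assuming text is an indexable
# sequence and sa its suffix array; the project's kasai may differ in details,
# so treat this as illustrative only.
def kasai_sketch(text, sa):
    n = len(text)
    inv_suff = [0] * n          # inv_suff[i] = rank of suffix starting at i
    for i in range(n):
        inv_suff[sa[i]] = i
    lcp = [0] * n               # lcp[r] = LCP of suffixes at ranks r and r+1
    k = 0
    for i in range(n):
        if inv_suff[i] == n - 1:
            k = 0
            continue
        j = sa[inv_suff[i] + 1]  # suffix that follows suffix i in SA order
        while i + k < n and j + k < n and text[i + k] == text[j + k]:
            k += 1
        lcp[inv_suff[i]] = k
        if k:
            k -= 1               # LCP can drop by at most 1 per step
    return inv_suff, lcp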
def _part_0(args=None, print=print):
    try:
        start = time.time()
        # _____________________________________________
        print('\n_____________________________________')
        print('PART 0: READ ARGS AND GENOME/SA FILES')
        print('_____________________________________\n')
        # _____________________________________________
        past = start
        print('reading SA...\n')
        # Read the suffix array from bytes to ints.
        # Reading with numpy and then converting to a 1-D array is much slower
        # than array.array; however, array.array cannot read files larger than ~3GB.
        s_array = read_byte_numpy(filename=args.SA)
        print('SA read.\n')
        past = get_time(past, print=print)
        print('reading genome...\n')
        # Read with Reads instead.
        # ! genome has ambs
        # genome = reads(filename=args.genome)
        chrs, genome = chr_splits(filename=args.genome)
        json_it(data=chrs, filename=append_file_name(args.outfile + "json_chrs"))
        print('genome read.\n')
        past = get_time(past, print=print)
        # TODO: change below line as necessary
        # args.LCPfile = '../data/lcp_pickle'
        return genome, past, s_array, start
    except Exception:
        raise
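# chr_splits is not shown in this file. A plausible sketch, assuming a plain
# FASTA input, that records each chromosome header's offset into the
# concatenated sequence; the name and return shape are assumptions based on
# how (chrs, genome) is used above.
def chr_splits_sketch(filename: str):
    # Returns ({chromosome name: start offset}, concatenated sequence).
    chrs = {}
    parts = []
    offset = 0
    with open(filename) as f:
        for line in f:
            line = line.strip()
            if line.startswith('>'):
                name = line[1:].split()[0]
                chrs[name] = offset
            elif line:
                parts.append(line)
                offset += len(line)
    return chrs, ''.join(parts)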
def driver():
    geno = read_unambiguous(TWO_WAY_GENO)
    s_arr = read_byte_numpy(S_ARRAY)
def sesame_plant():
    s_arr = read_byte_numpy(append_file_name('data/22.sa'))
    geno = read_unambiguous(append_file_name('data/22.fa'))
    se = SAGuide(s_arr=s_arr, geno=geno)
    print(se)
def naive_lcp_22():
    s_arr = fm.read_byte_numpy(filename=fm.append_file_name('data/22.sa'))
    lcp = test_kasai.naive_lcp(s_array=s_arr, T=simple_genome())
    fm.json_it(data=lcp, filename=fm.append_file_name('output/naive_lcp_22'))
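# test_kasai.naive_lcp is presumably the quadratic reference used to validate
# Kasai's output. A sketch under that assumption (same lcp[r] convention as
# kasai_sketch above); useful only as a test oracle on small inputs.
def naive_lcp_sketch(s_array, T):
    # Compare each adjacent pair of suffixes in SA order, character by
    # character; O(n^2) worst case.
    lcp = [0] * len(s_array)
    for r in range(len(s_array) - 1):
        i, j = s_array[r], s_array[r + 1]
        k = 0
        while i + k < len(T) and j + k < len(T) and T[i + k] == T[j + k]:
            k += 1
        lcp[r] = k
    return lcp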
def mu_driver():
    """
    Same pipeline as driver.py, except it finds minimal uniques instead of
    20-100 uniquemers.
    :return:
    """
    try:
        # gitignore()
        print('reading original genome: ', end='')
        chrs, geno = chr_splits(filename=PATH + ORIGINAL)
        json_it(chrs, append_file_name("json_chrs"))
        del chrs
        print('done.\nreading original SA...: ', end='')
        s_arr = read_byte_numpy(append_file_name('data/genome.sa'))
        lcp1 = kasai(geno, s_arr)[1]
        d1 = OrderedDict(mu(SA=s_arr, LCP=lcp1))
        del lcp1
        del s_arr
        au = _part_2(genome_file_name=PATH + ORIGINAL)
        print("au list: ", list(au))

        # *************************
        # (2) flipped
        # *************************
        print("performing flips: ")
        geno2 = read_unambiguous(PATH + FLIPPED)
        s_arr2 = read_byte_numpy(append_file_name('data/flippedGeno.sa'))
        lcp2 = kasai(geno2, s_arr2)[1]
        del geno2
        mu_result = dict(compare(d=d1, SA=s_arr2, LCP=lcp2))
        del lcp2
        mu_result = OrderedDict(sort_mu(mu_result))
        mu_result = OrderedDict(true_address_dict(mu_result, au))
        json_it(mu_result, append_file_name(files['MU_RESULT']))
        # contigs = list(find_contigs(d=old_mu_result_without_true_addresses, bot=20, top=100))
        contigs = OrderedDict(find_perfect_contigs(d=mu_result, bot=20, top=100))
        json_it(contigs, append_file_name(files['PERFECT_CONTIGS']))
        contigs = list(within_distance(d=contigs, distance=300))
        json_it(contigs, append_file_name(files['PERFECT_CONTIGS_WITH_DISTANCE']))
        print("number of contigs: ", len(contigs))
        print("done")
    except Exception:
        raise
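# mu(SA=..., LCP=...) yields the minimal uniques. A common SA/LCP-based way to
# derive candidates, and only a guess at what mu does here: the shortest unique
# prefix of the suffix at rank r is one character longer than its longest match
# with either SA neighbor. A real minimal-uniques pass would further filter out
# candidates that contain a shorter unique substring.
def mu_sketch(SA, LCP, n=None):
    n = n if n is not None else len(SA)
    for r in range(len(SA)):
        left = LCP[r - 1] if r > 0 else 0
        right = LCP[r] if r < len(SA) - 1 else 0
        length = max(left, right) + 1
        if SA[r] + length <= n:           # must fit inside the text
            yield SA[r], length           # (start position, unique length)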
def efficient_mu_driver():
    """
    NOTES 07/05: you MUST run get_uniques BEFORE sorting the LCP.
    :return:
    """
    try:
        geno = reads(filename=PATH + FORWARD)
        geno_length = len(geno)
        s_arr = read_byte_numpy(append_file_name('data/22.sa'))
        inv_suff, lcp = kasai(geno, s_arr)
        del geno, s_arr
        au = _part_2(genome_file_name=PATH + FORWARD)
        lcp = list(get_uniques(lcp))
        trues0 = list(sort_lcp(lcp=lcp, inv_suff=inv_suff))
        del lcp
        bad_address = forbiddens(inv_suff=inv_suff, lcp=trues0, au=au)
        del inv_suff

        geno = read_unambiguous(filename=PATH + FLIPPED)
        s_arr = read_byte_numpy(append_file_name('data/f22.sa'))
        inv_2, lcp = kasai(geno, s_arr)
        del geno, s_arr
        lcp = list(get_uniques(lcp))
        trues1 = list(sort_lcp(lcp=lcp, inv_suff=inv_2))
        del lcp, inv_2

        mu_s = []
        mu_e = []
        # au maps ambiguous-region ceilings to address offsets; walk it in
        # order while consuming SA positions.
        au_dict = {}
        for item in list(au):
            au_dict[item[0]] = item[1]
        del au
        u_ceil = list(au_dict)[0]
        u_floor = 0
        a_offset = au_dict[u_ceil]
        # mu_s, mu_e = list(compare(trues0=trues0, trues1=trues1, bad_address=bad_address))
        for tup in compare_no_inv_suff(trues0=trues0, trues1=trues1,
                                       bad_address=bad_address,
                                       geno_length=geno_length):
            sa = tup[0]
            if sa < u_floor:
                raise Exception(
                    "SA is less than u_floor. Possible that s_arr not sorted correctly?")
            if sa > u_ceil and len(au_dict) > 1:
                # advance to the next ambiguous region
                u_floor = u_ceil
                del au_dict[u_ceil]
                u_ceil = list(au_dict)[0]
                a_offset = au_dict[u_ceil]
            elif len(au_dict) < 1:
                print("not au_dict reached")
                break
            mu_s.append(sa + a_offset)  # translate back to true genome address
            mu_e.append(tup[1])

        # TODO 07/05: made the line below return a dict as well as accept geno;
        # to return to before, do not input geno and output two lists.
        # myd = dict(compare(trues0=trues0, trues1=trues1, bad_address=bad_address))
        # json_it(data=myd, filename="c22_mu")
        assert len(mu_s) == len(mu_e)
        just_dump(myl=mu_s, fn="c22_mu_starts_0709", print_length=True)
        just_dump(myl=mu_e, fn="c22_mu_ends_0709", print_length=True)
        # 07/08: changed get_uniques so that it doesn't yield past or lcp + 1
        # json_it(mu_s, "efficient_mu_starts")
        # json_it(mu_e, "efficient_mu_ends")
        # stitched = list(stitch(starts=mu_s, uniques=mu_e))
        # json_it(stitched, "stitched")
        # print("Number of stitched: " + str(len(stitched)))
        # print("Number of MU: " + str(len(mu_s)))
        # findmean(mys=mu_s, mye=mu_e)
    except IndexError:
        pass
    except Exception:
        # drop into the debugger on unexpected errors
        print(traceback.format_exc())
        breakpoint()
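# Toy illustration of the address remapping performed in the loop above,
# assuming au maps each region's ceiling position to the cumulative offset of
# stripped ambiguous bases (the exact shape of _part_2's output is a guess).
# Positions must arrive sorted ascending, like the SA starts consumed above.
def remap_sketch(positions, au):
    au_dict = dict(au)
    ceilings = list(au_dict)
    u_ceil = ceilings.pop(0)
    a_offset = au_dict[u_ceil]
    for sa in positions:
        while sa > u_ceil and ceilings:
            u_ceil = ceilings.pop(0)       # advance past the current region
            a_offset = au_dict[u_ceil]
        yield sa + a_offset

# e.g. list(remap_sketch([10, 120, 300], [(100, 0), (250, 40), (10**9, 90)]))
# -> [10, 160, 390]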