def main():
    """Sanity-check a suffix array against its genome.

    Reads the chr22 suffix array and genome, strips ambiguous 'N' runs,
    then asserts that consecutive suffixes (truncated to length + 1 chars)
    are in non-decreasing lexicographic order.
    """
    print('Reading SA: ')
    s_array = read_byte_array(append_file_name('data/22.sa'))
    print('SA read!\nReading genome: ')
    # NOTE: reads() returns the sequence including ambiguous 'N' bases,
    # so strip them here before comparing against the SA.
    genome = reads(filename=append_file_name('data/22.fa'))
    # ''.join is O(n); the original '+=' loop over parts was quadratic
    genome = ''.join(genome.split('N'))
    print('Genome read!')
    length = 30
    s_len = len(s_array)
    for i in trange(s_len, desc='Checking validity of suffix array: '):
        sa = s_array[i]
        # BUG FIX: also guard i + 1 — the original only checked the genome
        # slice bound and could IndexError on s_array[i + 1] at the last
        # element whenever that guard happened to pass.
        if i + 1 < s_len and sa + length + 1 < s_len:
            s0 = genome[sa:sa + length + 1]
            s1 = genome[s_array[i + 1]:s_array[i + 1] + length + 1]
            print('s0: ', s0)
            print('s1: ', s1)
            assert s0 <= s1
def bytes_main():
    """Count true addresses per chromosome and print the tallies."""
    args = specific_unique_args()
    trues, chrs_d = read_trues_chr(
        chrs_file=append_file_name(args.chrs),
        true_file=append_file_name(args.trues))
    chr_counts = _count_trues(trues=trues, chr_d=chrs_d)
    print('chrs: ')
    print(chr_counts)
def driver():
    """Dump chr22 SNP positions and characters to two parallel text files.

    Each tuple from parse_SNP is (position, character); positions go to
    ez_c22_SNP_pos and characters to ez_c22_SNP_char, one per line.
    """
    # Single pass: the original parsed the SNP file twice and also built
    # two local lists (addr/snp) that were never used afterwards.
    with open("../output/ez_c22_SNP_pos", "w") as file, \
            open("../output/ez_c22_SNP_char", 'w') as file_2:
        for tup in parse_SNP(filename=append_file_name("data/22.snp")):
            file.write(str(tup[0]) + '\n')
            file_2.write(str(tup[1]) + '\n')
def test_ambs_unam_split():
    """ambs_unam_split should yield non-empty, strictly increasing deques."""
    filename = fm.append_file_name('test/fake_genome')
    # make a fake genome sequence if one is not already on disk
    if not os.path.isfile(filename):
        fm.fake_genome(filename=filename, lines=1000, alpha=alpha)
    try:
        ambs, unam = ambs_unam_split(chr_option=False, filename=filename)
        assert ambs is not None and unam is not None
        # BUG FIX: 'len(x) is not 0' tested object identity against an int
        # literal (SyntaxWarning on CPython >= 3.8, implementation-defined
        # result); compare by value instead.
        assert len(ambs) != 0 and len(unam) != 0
        # test to see that the entries in each deque are increasing
        past_amb = ambs.popleft()
        past_una = unam.popleft()
        while ambs:
            cur_amb = ambs.popleft()
            assert cur_amb > past_amb
            past_amb = cur_amb
            cur_una = unam.popleft()
            assert cur_una > past_una
            past_una = cur_una
    except IndexError as e:
        print(e)
def compare2(filename: str):
    """Cross-check split_sequence against the naive ambiguous splitter,
    then dump both group-length lists to 'ambs_unam_2'."""
    d = split_sequence(filename=fm.append_file_name('test/' + filename))
    ambs = list(d.values())
    unam = list(d.keys())
    naive_a, naive_u = naive_ambiguous_split(filename=filename)
    assert len(ambs) == len(naive_a)
    assert len(unam) == len(naive_u)
    for got, expected in zip(ambs, naive_a):
        assert got == expected
    for got, expected in zip(unam, naive_u):
        assert got == expected
    with open('ambs_unam_2', 'w') as file:
        file.write('ambs\n')
        for value in ambs:
            file.write(str(value) + '\n')
        file.write('\n\n')
        file.write('unam\n')
        for value in unam:
            file.write(str(value) + '\n')
def simple_genome():
    """Read data/22.fa and return its raw sequence (header lines skipped)."""
    with open(fm.append_file_name('data/22.fa'), 'r') as file:
        parts = [line.split()[0] for line in file if line[0] != '>']
    return ''.join(parts)
def get_kmers(filename):
    """Build the list of unique k-mers, each encoded as an int.

    :param filename: genome file passed through to convert_geno_to_num
    :return: list of ints, one per true address
    """
    # numeric encoding of the genome; presumably one digit per base —
    # TODO confirm against convert_geno_to_num
    geno = convert_geno_to_num(filename)
    print("reading true addresses: ", end='')
    trues = fm.read_byte_to_queue(
        fm.append_file_name('0415_c22_1132_true_addresses'))
    print("done!\nreading unique start addresses: ", end='')
    uniques = fm.read_byte_to_queue(
        fm.append_file_name('0415_c22_1132_unique_starts'))
    print("done!")
    kmer_list = []
    for i in trange(len(trues), desc="finding kmers: "):
        t = trues[i]
        # kmer = filter(str.isdigit, repr(geno[t:t+uniques[i]]))
        # concatenate the digit codes of the slice and parse as one int;
        # NOTE(review): assumes uniques[i] is the k-mer length starting at
        # trues[i] — verify against the two files read above
        kmer = int(''.join(map(str, geno[t:t + uniques[i]])))
        kmer_list.append(kmer)
    return kmer_list
def create_fake(filename: str):
    """Write a fake fasta-style genome with known ambiguous/unambiguous runs.

    file_manager's fake_genome() cannot be used because the test needs the
    exact group lengths; those are recorded to '<filename>ambs_unam' so a
    splitter's output can be checked against them.

    :param filename: base name under test/ for the genome and its answer key
    """
    # lengths of ~40 alternating ambiguous ('N') / unambiguous runs
    ambs = random.choices(population=range(1, 50), k=40)
    unam = random.choices(population=range(50, 100), k=40)
    file1 = fm.append_file_name('test/' + filename + 'ambs_unam')
    # write the ambs/unam answer key
    with open(file1, 'w') as file:
        file.write('ambs: ')
        for a in ambs:
            file.write(str(a) + ' ')
        file.write('\nunam: ')
        for u in unam:
            file.write(str(u) + ' ')
    file2 = fm.append_file_name('test/' + filename)
    # Build the sequence starting with an ambiguous run. String repetition
    # and ''.join replace the original character-by-character '+=' loops,
    # which were quadratic in the sequence length.
    parts = []
    for a, u in zip(ambs, unam):
        parts.append('N' * a)
        parts.append(''.join(random.choice('ACGT') for _ in range(u)))
    sequence = ''.join(parts)
    # split string into 60 chars, as the fasta file is
    lines = [sequence[i:i + 60] for i in range(0, len(sequence), 60)]
    with open(file2, 'w') as file:
        for line in lines:
            file.write(line + '\n')
def _test_part_0(args: Args):
    """Run driver._part_0 and sanity-check the types it returns.

    If args.SA is unset, a suffix array is first built naively from the
    genome and written to test/fake_SA so _part_0 has something to read.
    """
    if not args.SA:
        sequence = fm.reads(args.genome)
        s_array, L = naive_SA(string=sequence)
        fm.write_array_to_byte(byte_arr=s_array,
                               filename=fm.append_file_name('test/fake_SA'))
        args.SA = fm.append_file_name('test/fake_SA')
    genome, past, s_array, start = driver._part_0(args=args)
    # genome is a non-empty string of the sequence
    assert type(genome) is str and genome
    # s_array is a numpy array
    assert type(s_array) is np.ndarray
    # start is a time.time() float
    assert type(start) is float
    # past is also a time.time() float
    assert type(past) is float
    return genome, past, s_array, start
def temp_forward_unique_check():
    """Compute forward-strand unique lengths for chr22 and dump them to JSON."""
    geno = read_unambiguous(filename=PATH + FORWARD)
    s_arr = read_byte_numpy(append_file_name('data/22.sa'))
    inv_suff, lcp = kasai(geno, s_arr)
    # map each suffix start address to its LCP value
    myd = {}
    for num in range(len(s_arr)):
        myd[s_arr[num]] = lcp[num]
    # BUG FIX: this assignment was commented out while trues0 was still
    # used below, which raised NameError at run time.
    trues0 = list(get_uniques(lcp))
    json_it(trues0, "c22_forward_uniques")
def main():
    """Exercise driver parts 0-3 end to end against the fake test genome."""
    args = Args()
    args.genome = fm.append_file_name('test/fake_genome')
    genome, past, s_array, start = _test_part_0(args)
    past, sa_uniques = _test_part_1(genome=genome, s_array=s_array,
                                    args=args, past=past)
    a_u_dict, past, unam = _test_part_2(args=args, past=past)
    _test_part_3(args=args, past=past, sa_uniques=sa_uniques,
                 unam=unam, a_u_dict=a_u_dict)
def append_file_test():
    """Smoke-test append_file_name by writing 0..99 to test/append_test.txt."""
    appended = append_file_name('test/append_test.txt')
    with open(appended, 'w') as file:
        file.writelines(str(i) + '\n' for i in range(100))
def with_args():
    """Prompt for a sequence file, split it, and write the unam/ambs lists."""
    seq = fm.reads(input('Enter file name of sequence file: '))
    d = split_sequence(sequence=seq)
    print('writing to file: ')
    with open(fm.append_file_name('ambs_unam'), 'w') as file:
        file.write('unam\n')
        for key in d.keys():
            file.write(str(key) + '\n')
        file.write('\n\nambs\n')
        for value in d.values():
            file.write(str(value) + '\n')
def specific_k(k: int, d: dict, seq_file='', sequence='', high=0,
               outfile=append_file_name('k_mer')):
    """Collect the k-mers that occur exactly once among the candidates in d.

    d: { key( true address of genome ): [ unique start, top ] }

    :param k: k-mer length
    :param d: candidate addresses as described above
    :param seq_file: file to read the sequence from when `sequence` is empty
    :param sequence: the genome sequence itself (takes precedence)
    :param high: sentinel 'top' value; entries differing from it are tallied
    :param outfile: unused here; kept for interface compatibility
    :return: dict mapping each unique k-mer string to its address
    :raises InsufficientArguments: when neither sequence nor seq_file is given
    """
    # if the sequence has been passed, then use sequence.
    # else, if only seq_file was passed, then read sequence
    if not sequence and seq_file:
        sequence = reads(seq_file)
        print('length of sequence: ' + str(len(sequence)))
    elif sequence:
        pass
    else:
        raise InsufficientArguments
    valids = {}  # keys: seq  values: sa
    # BUG FIX: the original removed a k-mer from valids on its second
    # occurrence but then re-added it on the third (it was no longer "in
    # valids"); remember every duplicate so it stays excluded for good.
    duplicates = set()
    try:
        special_end = 0
        for sa in tqdm(d, desc='finding ' + str(k) + '-mers: '):
            # first find all the k-mers
            if d[sa][1] != high:
                special_end += 1
            if d[sa][0] + 1 < k < d[sa][1]:
                seq = sequence[int(sa):int(sa) + k]
                if seq in duplicates:
                    continue
                if seq in valids:
                    del valids[seq]
                    duplicates.add(seq)
                    continue
                valids[seq] = sa
    except IndexError:
        pass
    return valids
def genome_reads_test(filename):
    """Time and compare read_unambiguous (Bio.Seq) against reads() (str)."""
    filename = append_file_name(filename)
    t0 = time.time()
    read_bioseq = read_unambiguous(filename=filename)
    t1 = time.time()
    print('genome read with Bio.Seq. Time elapsed: ', t1 - t0)
    read_reads = reads(filename=filename)
    t2 = time.time()
    print('genome read with Reads.py. Time elapsed: ', t2 - t1)
    # both readers must agree on content, differing only in type
    assert type(read_bioseq) is Bio.Seq.Seq
    assert type(read_reads) is str
    assert read_reads == str(read_bioseq)
def with_args():
    """Read the unique-address dict, extract k-mers, write and verify them."""
    args = specific_unique_args()
    print('reading dict:')
    d = sl.read_dict(filename=args.dfile, filetype=args.filetype)
    print('dict read!')
    if args.outfile:
        output = append_file_name('output/' + args.outfile)
    else:
        output = ''
    valids, sequence, special_end = sl.specific_k(
        k=args.length, d=d, seq_file=args.genome, outfile=output)
    sl.file_write(valids=valids, outfile=output, sequence=sequence,
                  k=args.length, special_end=special_end, d=d)
    # naive re-checks of the written output
    naive_address_uniqueness_test(args.dfile)
    naive_kmer_uniqueness_test(output)
def _part_0(args=None, print=print):
    """PART 0 of the driver: read args, the suffix array, and the genome.

    :param args: parsed arguments providing .SA, .genome and .outfile
    :param print: injectable print (lets callers silence/redirect output)
    :return: (genome string, past timestamp, suffix array, start timestamp)
    """
    # The original wrapped the body in `try: ... except Exception as e:
    # raise`, which re-raises unchanged — dropped as a no-op.
    start = time.time()
    # _____________________________________________
    print('\n_____________________________________')
    print('PART 0: READ ARGS AND GENOME/SA FILES')
    print('_____________________________________\n')
    # _____________________________________________
    past = start
    print('reading SA...\n')
    # read suffix array from bytes to ints
    # reading with numpy then converting to 1-D array much slower than
    # array.array; however, array cannot read files larger than ~3GB
    s_array = read_byte_numpy(filename=args.SA)
    print('SA read.\n')
    past = get_time(past, print=print)
    print('reading genome...\n')
    # NOTE: the genome still contains ambiguous bases at this point
    chrs, genome = chr_splits(filename=args.genome)
    json_it(data=chrs, filename=append_file_name(args.outfile + "json_chrs"))
    print('genome read.\n')
    past = get_time(past, print=print)
    return genome, past, s_array, start
def naive_ambiguous_split(filename: str):
    """Naively tally alternating ambiguous ('N') / unambiguous run counters.

    Walks the file character by character, incrementing a_count on 'N' and
    u_count on A/C/G/T, and recording the current counter each time the run
    type flips.

    :param filename: name under test/ of the genome file to scan
    :return: (ambs, unam) lists of counter values recorded at each flip
    """
    ambs = []
    unam = []
    is_amb = False
    first = True
    with open(fm.append_file_name('test/' + filename), 'r') as file:
        # a_count starts at -1, offsetting the first recorded ambs entry;
        # NOTE(review): both counters are cumulative (never reset per run) —
        # confirm callers expect running totals, not per-run lengths
        a_count = -1
        u_count = 0
        for line in file:
            for char in line.strip():
                if char == 'N':
                    if u_count and not is_amb:
                        # flip: unambiguous run -> 'N' run
                        first = False
                        is_amb = True
                        unam.append(u_count)
                    a_count += 1
                elif char == 'A' or char == 'C' or char == 'G' or char == 'T':
                    if (a_count and is_amb) or first:
                        # flip: 'N' run -> unambiguous run (or very first base)
                        is_amb = False
                        first = False
                        ambs.append(a_count)
                    u_count += 1
    # flush the final run; the membership tests avoid duplicating the last
    # recorded value — NOTE(review): this also skips a legitimately
    # repeated total, which looks fragile
    if a_count not in ambs:
        ambs.append(a_count)
    if u_count not in unam:
        unam.append(u_count)
    return ambs, unam
def mu_driver():
    """
    similar function as driver.py, except include minimal uniques instead
    of finding 20-100 uniquemers
    :return:
    """
    try:
        # gitignore()
        # ----- (1) forward genome -----
        print('reading original genome: ', end='')
        chrs, geno = chr_splits(filename=PATH + ORIGINAL)
        json_it(chrs, append_file_name("json_chrs"))
        del chrs  # free the chromosome map early; only geno is needed below
        print('done.\nreading original SA...: ', end='')
        s_arr = read_byte_numpy(append_file_name('data/genome.sa'))
        lcp1 = kasai(geno, s_arr)[1]
        # minimal-unique candidates on the forward strand
        d1 = OrderedDict(mu(SA=s_arr, LCP=lcp1))
        del lcp1
        del s_arr
        au = _part_2(genome_file_name=PATH + ORIGINAL)
        print("au list: ", list(au))
        # *************************
        # (2) flipped
        # *************************
        print("performing flips: ")
        geno2 = read_unambiguous(PATH + FLIPPED)
        s_arr2 = read_byte_numpy(append_file_name('data/flippedGeno.sa'))
        lcp2 = kasai(geno2, s_arr2)[1]
        del geno2
        # compare the forward candidates against the flipped-strand SA/LCP
        # (see MU_internal.compare for the exact semantics)
        mu_result = dict(compare(d=d1, SA=s_arr2, LCP=lcp2))
        del lcp2
        mu_result = OrderedDict(sort_mu(mu_result))
        # translate SA addresses back to true genome addresses via au
        mu_result = OrderedDict(true_address_dict(mu_result, au))
        json_it(mu_result, append_file_name(files['MU_RESULT']))
        #contigs = list(find_contigs(d=old_mu_result_without_true_addresses, bot=20, top=100))
        contigs = OrderedDict(
            find_perfect_contigs(d=mu_result, bot=20, top=100))
        json_it(contigs, append_file_name(files['PERFECT_CONTIGS']))
        contigs = list(within_distance(d=contigs, distance=300))
        json_it(contigs,
                append_file_name(files['PERFECT_CONTIGS_WITH_DISTANCE']))
        print("number of contigs: ", len(contigs))
        print("done")
    except Exception as e:
        raise
parser = argparse.ArgumentParser() parser.add_argument('-g', dest='genome') parser.add_argument('-s', dest='SA') parser.add_argument('-o', dest='outfile') parser.add_argument('-m', dest='length') parser.add_argument('-l', dest='low') parser.add_argument('-q', dest='quiet') parser.add_argument('-d', dest='distance') args = parser.parse_args() args.length = 100 if not args.length else int(args.length) args.low = 20 if not args.low else int(args.low) if not args.genome: args.genome = fm.append_file_name('test/fake_genome') args.distance = 300 if not args.distance else int(args.distance) args.quiet = False if (args.quiet == 'f' or args.quiet == 'F' or args.quiet == 'false' or args.quiet == 'False') else True return args """ if __name__ == '__main__': args = Args() args.genome = fm.append_file_name('data/genome.fa') past = time.time() driver._part_2(past=past, args=args)
from collections import deque, OrderedDict

from ambiguous_split import split_sequence, rb_tree_ambs
from file_manager import read_byte_numpy, reads, msgunpack_dict, append_file_name, json_it, unjson_it, chr_splits, write_array_to_byte, read_byte_array, read_unambiguous
from true_address import true_address_with_sort, true_address_no_sort, true_address_dict, forbiddens, just_true_address

# TODO: will hard code directory paths for now 05/21
# pick the src directory depending on whether we run on the cluster
# ("bioinformatics" in the cwd) or on a local PyCharm checkout
if "bioinformatics" in os.getcwd():
    path.append('/work/bioinformatics/s187520/minimal_unique/src')
else:
    path.append(os.getcwd().split('Documents')[0] + 'PycharmProjects/minimal_uniquemer/src')
from MU_internal import mu, compare, sort_mu
from contiguous import find_contigs, contig_seqs, find_perfect_contigs, within_distance

# data directory and input file base names
PATH = append_file_name('data/')
#ORIGINAL = '22.fa'
#FLIPPED = 'flip22'
ORIGINAL = 'genome.fa'
FLIPPED = 'flippedGeno'
# bowtie2-produced suffix array base names
bt2_SA1 = '22'
bt2_SA2 = 'flip22'
# output file names keyed by pipeline stage
files = {
    'MU_RESULT': '/output/c22_mu_result',
    'PERFECT_CONTIGS': '/output/c22_perfect_contigs',
    'PERFECT_CONTIGS_WITH_DISTANCE': '/output/c22_perfect_contigs_with_distance'
}
def sesame_plant():
    """Build a SAGuide from the chr22 SA and genome, then print it."""
    suffix_array = read_byte_numpy(append_file_name('data/22.sa'))
    genome = read_unambiguous(append_file_name('data/22.fa'))
    guide = SAGuide(s_arr=suffix_array, geno=genome)
    print(guide)
import sys import os from tqdm import tqdm sys.path.append(os.getcwd().split('uniquekmer')[0] + 'uniquekmer/src') from file_manager import append_file_name, read_unambiguous, read_byte_numpy from kasai import kasai TWO_WAY_GENO = append_file_name('data/2_way_genome.fa') S_ARRAY = append_file_name('data/2_way_genome.sa') # todo: does building a 2-way genome and computing SA and LCP intermingle the forward and reverse? class OddSuffixArrayLength(Exception): """the length of forward + reverse suffix array is not even""" raise Exception("Length of suffix array is not even!") def driver(): geno = read_unambiguous(TWO_WAY_GENO) s_arr = read_byte_numpy(S_ARRAY) def mu_2way_internal(geno:str, s_arr): try: # check if the length of s_arr is even. (forward length + reverse length) if not len(s_arr) % 2 == 0: raise OddSuffixArrayLength
def efficient_mu_driver():
    """
    NOTES: 07/05: You MUST run get_uniques first before sorting the lcp
    :return:
    """
    try:
        # ----- forward strand -----
        geno = reads(filename=PATH + FORWARD)
        geno_length = len(geno)
        s_arr = read_byte_numpy(append_file_name('data/22.sa'))
        # BUG FIX: kasai() was called twice per strand (its result was
        # immediately recomputed and the first discarded); one call
        # provides both inv_suff and lcp.
        inv_suff, lcp = kasai(geno, s_arr)
        del geno, s_arr
        au = _part_2(genome_file_name=PATH + FORWARD)
        # get_uniques must run before sort_lcp (see docstring note)
        lcp = list(get_uniques(lcp))
        trues0 = list(sort_lcp(lcp=lcp, inv_suff=inv_suff))
        del lcp
        bad_address = forbiddens(inv_suff=inv_suff, lcp=trues0, au=au)
        del inv_suff
        # ----- flipped strand -----
        geno = read_unambiguous(filename=PATH + FLIPPED)
        s_arr = read_byte_numpy(append_file_name('data/f22.sa'))
        inv_2, lcp = kasai(geno, s_arr)
        del geno, s_arr
        lcp = list(get_uniques(lcp))
        trues1 = list(sort_lcp(lcp=lcp, inv_suff=inv_2))
        del lcp, inv_2
        mu_s = []
        mu_e = []
        # au yields (unambiguous ceiling, ambiguous offset) pairs; walk them
        # in order while translating SA addresses back to genome addresses
        au_dict = {}
        for item in list(au):
            au_dict[item[0]] = item[1]
        del au
        u_ceil = list(au_dict)[0]
        u_floor = 0
        a_offset = au_dict[u_ceil]
        for tup in compare_no_inv_suff(trues0=trues0, trues1=trues1,
                                       bad_address=bad_address,
                                       geno_length=geno_length):
            sa = tup[0]
            if sa < u_floor:
                raise Exception(
                    "SA is less than u_floor. Possible that s_arr not sorted correctly?"
                )
            if sa > u_ceil and len(au_dict) > 1:
                # advance to the next unambiguous window
                u_floor = u_ceil
                del au_dict[u_ceil]
                u_ceil = list(au_dict)[0]
                a_offset = au_dict[u_ceil]
            elif len(au_dict) < 1:
                print("not au_dict reached")
                break
            mu_s.append(sa + a_offset)
            mu_e.append(tup[1])
        assert len(mu_s) == len(mu_e)
        just_dump(myl=mu_s, fn="c22_mu_starts_0709", print_length=True)
        just_dump(myl=mu_e, fn="c22_mu_ends_0709", print_length=True)
        # 07/08: changed get_uniques so that it doesn't yield past or lcp + 1
    except IndexError:
        pass
    except Exception:
        print(traceback.format_exc())
        breakpoint()
import sys
import os

sys.path.append(os.getcwd().split('uniquekmer')[0] + 'uniquekmer/src')
from file_manager import append_file_name, read_byte_numpy, read_unambiguous

sys.path.append(append_file_name('experimental'))
from SeSAme import SAGuide


def sesame_seed():
    # exercise SAGuide's successor-string helper on a few inputs
    se = SAGuide()
    test_str = "A" * 10
    a = se._next_string(test_str)
    print(a)
    test_str = "ATTTTT"
    print(se._next_string(test_str))
    # all-'T' input — presumably the wrap-around case; TODO confirm
    print(se._next_string("TTTTT"))


def sesame_plant():
    # build a SAGuide from the chr22 suffix array and genome and print it
    s_arr = read_byte_numpy(append_file_name('data/22.sa'))
    geno = read_unambiguous(append_file_name('data/22.fa'))
    se = SAGuide(s_arr=s_arr, geno=geno)
    print(se)


if __name__ == "__main__":
def naive_lcp_22():
    """Compute chr22's LCP array naively and dump it to JSON."""
    suffix_array = fm.read_byte_numpy(
        filename=fm.append_file_name('data/22.sa'))
    lcp_values = test_kasai.naive_lcp(s_array=suffix_array, T=simple_genome())
    fm.json_it(data=lcp_values,
               filename=fm.append_file_name('output/naive_lcp_22'))
        # tail of naive_SA (its def is outside this view): accumulate the
        # last column and advance the progress bar
        L += string[index - 1]
        pbar.update(1)
    if verbose:
        print("Suffix array: ", sa_array)
        print("The last column is: ", L)
    return sa_array, L


def SA_file(filename: str):
    """Build a suffix array for `filename` naively and write it to bytes.

    The array's own length is prepended as element 0 before writing.
    """
    print('Reading file: ')
    sequence = read_unambiguous(filename)
    print('File read!\nCreating Suffix Array Naively:')
    s_array, _ = naive_SA(sequence)
    length = len(s_array)
    # store the length as a header element so readers can pre-allocate
    s_array.insert(0, length)
    print('Suffix Array Created!\nWriting to file: ')
    write_array_to_byte(filename='fake_genome_sa', byte_arr=s_array)
    return


if __name__ == '__main__':
    SA_file(append_file_name('test/fake_genome'))
    # tail of a timing function (its def is outside this view)
    past = adr
    return


def with_args():
    """Read the unique dict, compute k-mers, write them, and verify output."""
    args = specific_unique_args()
    print('reading dict:')
    d = sl.read_dict(filename=args.dfile, filetype=args.filetype)
    print('dict read!')
    # only prefix the output path when the user actually named a file
    output = '' if not args.outfile else append_file_name('output/' + args.outfile)
    valids, sequence, special_end = sl.specific_k(k=args.length, d=d,
                                                  seq_file=args.genome,
                                                  outfile=output)
    sl.file_write(valids=valids, outfile=output, sequence=sequence,
                  k=args.length, special_end=special_end, d=d)
    # naive re-checks of the written output
    naive_address_uniqueness_test(args.dfile)
    naive_kmer_uniqueness_test(output)


if __name__ == '__main__':
    naive_kmer_uniqueness_test(
        filename=append_file_name('output/0403_c22_30mer'))
        # tail of an rb-tree builder (its def is outside this view):
        # insert k only if not already present, then return the tree
        if not rb.contains(k):
            rb.add(k)
    return rb


def get_kmers(filename):
    """Collect each k-mer (encoded as an int) from the genome.

    :param filename: genome file passed through to convert_geno_to_num
    :return: list of ints, one per true address
    """
    # numeric encoding of the genome; presumably one digit per base —
    # TODO confirm against convert_geno_to_num
    geno = convert_geno_to_num(filename)
    print("reading true addresses: ", end='')
    trues = fm.read_byte_to_queue(
        fm.append_file_name('0415_c22_1132_true_addresses'))
    print("done!\nreading unique start addresses: ", end='')
    uniques = fm.read_byte_to_queue(
        fm.append_file_name('0415_c22_1132_unique_starts'))
    print("done!")
    kmer_list = []
    for i in trange(len(trues), desc="finding kmers: "):
        t = trues[i]
        # kmer = filter(str.isdigit, repr(geno[t:t+uniques[i]]))
        # concatenate the digit codes of the slice and parse as one int
        kmer = int(''.join(map(str, geno[t:t + uniques[i]])))
        kmer_list.append(kmer)
    return kmer_list


if __name__ == "__main__":
    main(fm.append_file_name("data/22.fa"))
def _part_3(lcp, au: RedBlackTree, past, inv_suff, args=None, print=print):
    """PART 3 of the driver: validate starting addresses and write them out.

    :param lcp: LCP array for the genome's suffix array
    :param au: red-black tree of ambiguous/unambiguous boundaries
    :param past: timestamp of the previous stage (for get_time reporting)
    :param inv_suff: inverse suffix array
    :param args: needs .length, .low, .distance and .outfile
    :param print: injectable print (lets callers silence/redirect output)
    """
    try:
        # ____________________________________
        print('\n_____________________________________')
        print('PART 3: VALIDATE STARTING ADDRESSES')
        print('_____________________________________\n')
        # ____________________________________
        true_addresses = []
        unique_starts = []
        for tup in true_address_with_sort(lcp=lcp, au=au, top=args.length,
                                          bot=args.low,
                                          distance=args.distance,
                                          inv_suff=inv_suff):
            true_addresses.append(tup[0])
            unique_starts.append(tup[1])
        print('valid addresses calculated\n')
        past = get_time(past, print=print)
        # BUG FIX: the original passed printf-style placeholders to print()
        # ('addresses within %s%s', ...), which printed '%s%s' literally;
        # print() takes plain arguments, not a format string.
        print('addresses within ' + str(args.distance) + ' calculated')
        past = get_time(past, print=print)
        # MSGPACK DOES NOT PRESERVE ORDER, so the addresses are written as
        # raw byte arrays instead
        filename = append_file_name(filename=args.outfile + 'true_addresses')
        print('saving true addresses as byte file')
        write_array_to_byte(filename=filename, byte_arr=true_addresses)
        # NOTE: the original also printed 'saving tops as byte file' while
        # the actual save was commented out — dropped the misleading message.
        print('saving unique starts as byte file')
        filename = append_file_name(filename=args.outfile + 'unique_starts')
        write_array_to_byte(filename=filename, byte_arr=unique_starts)
        print('default dict msgpack\'ed\n')
        get_time(past, print=print)
        return
    except InsufficientArguments as e:
        print("Insufficient number of arguments passed!")
    except MemoryError:
        print_memory_usage()
        raise