def fix_sequence(self):
    """
    Remove internal primer and restriction sites from the coding sequence.

    Repeatedly scans ``self.nt_seq`` for forbidden subsequences (the
    restriction/primer sequences held on the instance, four 5-nt
    homopolymers, and the reverse complements of all of these) and for
    stretches longer than 10 nt shared with any primer on either strand.
    While any offending codon exists, one is picked at random with
    ``self.rng`` and re-sampled through ``self.sample_new_codon``.  Once the
    sequence is clean, its GC content must fall within 35-65 (as reported by
    ``GC``); otherwise a random codon is re-sampled and the scan starts over.

    Returns
    -------
    None.
    """
    forward_sites = [
        self.typeIIs, self.asmf_re, self.asmr_re, self.gsp_f, self.gsp_r,
        self.asm_f, self.asm_r, 'AAAAA', 'GGGGG', 'CCCCC', 'TTTTT'
    ]
    forbidden = forward_sites + [
        utils.rev_comp(site) for site in forward_sites
    ]
    primers = [
        self.gsp_f,
        utils.rev_comp(self.gsp_r), self.asm_f,
        utils.rev_comp(self.asm_r)
    ]
    gc_min, gc_max = 35, 65

    while True:
        offending = set()

        # exact forbidden sites: only the first occurrence is flagged per
        # pass, later occurrences are picked up on subsequent passes
        for site in forbidden:
            hit = self.nt_seq.find(site)
            if hit > -1:
                span = np.arange(hit, hit + len(site))
                offending.update(utils.get_codons(span))

        # long partial matches to the primers
        for primer in primers:
            shared_len, shared_pos, _ = utils.lcs(self.nt_seq, primer)
            if shared_len > 10:
                offending.update(utils.get_codons(shared_pos))
            # primers are single stranded, but the templates are not
            # (after one cycle, at least)
            # NOTE(review): the positions returned here presumably index the
            # reverse-complemented string, yet they are mapped to codons as
            # if they were forward coordinates -- confirm against utils.lcs
            shared_len, shared_pos, _ = utils.lcs(
                utils.rev_comp(self.nt_seq), primer)
            if shared_len > 10:
                offending.update(utils.get_codons(shared_pos))

        if offending:
            # something is still wrong: re-sample one offending codon
            self.sample_new_codon(self.rng.choice(list(offending)))
            continue

        if gc_min <= GC(self.nt_seq) <= gc_max:
            break

        # no restriction sites but a bad GC content... pick a random site
        # to change
        self.sample_new_codon(self.rng.choice(np.arange(len(self.codons))))
    return
def gen():
    """Endlessly yield batches of three synthetic "extra" examples.

    Each yielded tuple is ``(mirs, images, labels, stypes)``:

    * ``mirs``   -- array of three byte strings (two miRNA names drawn from
      ``TRAIN_MIRS_KDS`` and the literal ``b'random'``),
    * ``images`` -- stack of the three outer products of the one-hot encoded
      miRNA and target sequences,
    * ``labels`` -- ``[[2.0], [2.0], [label]]`` where ``label`` starts at -5
      and is shifted by a fixed weight for every flanking nucleotide,
    * ``stypes`` -- three copies of ``b'extra'``.
    """
    # additive contribution of each flanking nucleotide to the label of the
    # random 8mer example (a previous base value of -5.367 was replaced by -5)
    flank_weights = {
        'A': -0.34923908,
        'T': -0.24840472,
        'C': 0.12640774,
        'G': 0.47123606
    }

    def pair_image(mirseq, target):
        # outer product of the two one-hot encodings
        return np.outer(utils.one_hot_encode(mirseq),
                        utils.one_hot_encode(target))

    while True:
        # (1) one of the RBNS miRNAs with a target produced by
        #     get_target_no_match; assigned a log kd of 2
        first_mir = np.random.choice(TRAIN_MIRS_KDS)
        first_mirseq = MIRNA_DATA.loc[first_mir]['guide_seq'][:options.MIRLEN]
        first_target = utils.get_target_no_match(first_mirseq, SEQLEN)
        first_image = pair_image(first_mirseq, first_target)

        # (2) a target built around the reverse complement of guide
        #     positions [1:7] of an RBNS miRNA, flanked by random 3-mers,
        #     with a miRNA produced by get_mir_no_match; log kd of 2
        second_mir = np.random.choice(TRAIN_MIRS_KDS)
        left_pad = utils.generate_random_seq(3)
        seed_site = utils.rev_comp(
            MIRNA_DATA.loc[second_mir]['guide_seq'][1:7])
        right_pad = utils.generate_random_seq(3)
        second_target = left_pad + seed_site + right_pad
        second_mirseq = utils.get_mir_no_match(second_target, options.MIRLEN)
        second_image = pair_image(second_mirseq, second_target)

        # (3) random miRNA paired with its own 8mer site; the label is the
        #     base value adjusted for the four flanking nucleotides
        random_mirseq = utils.generate_random_seq(options.MIRLEN)
        up_flank = utils.generate_random_seq(2)
        down_flank = utils.generate_random_seq(2)
        random_target = (up_flank + utils.rev_comp(random_mirseq[1:8]) + 'A' +
                         down_flank)
        random_image = pair_image(random_mirseq, random_target)

        flanks = up_flank + down_flank
        random_label = -5
        for nt, weight in flank_weights.items():
            random_label += weight * flanks.count(nt)

        yield np.array([
            first_mir.encode('utf-8'),
            second_mir.encode('utf-8'), b'random'
        ]), np.stack([first_image, second_image, random_image]), np.array(
            [[2.0], [2.0], [random_label]]), np.array(
                [b'extra', b'extra', b'extra'])
def gen():
    """Endlessly yield batches of three synthetic "no site" examples.

    Every batch holds, in this order, a fully random miRNA, an RBNS miRNA
    with a target from ``get_target_no_match``, and an RBNS-derived target
    with a miRNA from ``get_mir_no_match``.  The yielded tuple is
    ``(mirs, images, labels, stypes)`` where all labels are ``0.0`` and all
    site types are ``b'no site'``.
    """

    def pair_image(mirseq, target):
        # outer product of the two one-hot encodings
        return np.outer(utils.one_hot_encode(mirseq),
                        utils.one_hot_encode(target))

    while True:
        # fully random miRNA and a target generated for it
        rand_mirseq = utils.generate_random_seq(options.MIRLEN)
        rand_target = utils.get_target_no_match(rand_mirseq, SEQLEN)
        rand_image = pair_image(rand_mirseq, rand_target)

        # RBNS miRNA (truncated to MIRLEN) and a target generated for it
        first_mir = np.random.choice(TRAIN_MIRS_KDS)
        first_mirseq = MIRNA_DATA.loc[first_mir]['guide_seq'][:options.MIRLEN]
        first_target = utils.get_target_no_match(first_mirseq, SEQLEN)
        first_image = pair_image(first_mirseq, first_target)

        # target built around the reverse complement of guide positions
        # [1:7] of an RBNS miRNA, and a miRNA generated for that target
        second_mir = np.random.choice(TRAIN_MIRS_KDS)
        left_pad = utils.generate_random_seq(3)
        seed_site = utils.rev_comp(
            MIRNA_DATA.loc[second_mir]['guide_seq'][1:7])
        right_pad = utils.generate_random_seq(3)
        second_target = left_pad + seed_site + right_pad
        second_mirseq = utils.get_mir_no_match(second_target, options.MIRLEN)
        second_image = pair_image(second_mirseq, second_target)

        names = np.array([
            b'random',
            first_mir.encode('utf-8'),
            second_mir.encode('utf-8')
        ])
        images = np.stack([rand_image, first_image, second_image])
        labels = np.array([[0.0], [0.0], [0.0]])
        stypes = np.array([b'no site', b'no site', b'no site'])
        yield names, images, labels, stypes
def generate_repeats(sizes, atomic):
    """Generates all possible motifs for repeats in a given length range.

    Parameters
    ----------
    sizes : iterable of int
        Motif lengths to generate.  The argument is not modified; a sorted
        copy is used internally.
    atomic : bool
        When true, a motif of a requested size is skipped if its expansion
        equals the expansion of a shorter motif whose length is a factor of
        that size and is not itself one of the requested sizes.

    Returns
    -------
    list of str
        One tab-separated record per distinct cycle:
        ``cycle, repeat, len(cycle), strand`` where ``repeat`` is the motif
        the cycle was derived from and ``strand`` is ``'+'`` for cycles of
        the motif and ``'-'`` for cycles of its reverse complement.  An empty
        ``sizes`` yields an empty list.
    """
    # sorted copy: the original implementation sorted the caller's list in
    # place, which is a surprising side effect
    sizes = sorted(sizes)
    if not sizes:
        return []
    requested = set(sizes)  # O(1) membership tests
    min_size = sizes[0]
    max_size = sizes[-1]

    alphabet = ['A', 'C', 'G', 'T']
    generated_repeats = []
    expanded_set = set()
    repeat_set = set()

    # expansions of every motif whose length was NOT requested; only filled
    # in atomic mode, but the (possibly empty) sets always exist so that the
    # lookups below behave the same in both modes
    non_atomic_repeats = {}
    for s in range(1, max_size):
        if s in requested:
            continue
        non_atomic_repeats[s] = set()
        if atomic:
            for combination in product(alphabet, repeat=s):
                non_atomic_repeats[s].add(
                    expand_repeat(''.join(combination), max_size))

    for i in sizes:
        factors = num_factors(i)
        for combination in product(alphabet, repeat=i):
            repeat = ''.join(combination)
            repeat_revcomp = rev_comp(repeat)
            expanded = expand_repeat(repeat, max_size)

            # already produced through another cycle / strand
            if expanded in expanded_set:
                continue
            # atomic mode: drop motifs that are just a shorter, unrequested
            # motif repeated several times
            if atomic and any(
                    factor not in requested
                    and expanded in non_atomic_repeats[factor]
                    for factor in factors):
                continue

            for strand, motif in (('+', repeat), ('-', repeat_revcomp)):
                # a reverse-palindromic motif has no distinct minus strand
                if strand == '-' and repeat_revcomp == repeat:
                    break
                for cycle in get_cycles(motif):
                    expanded_set.add(expand_repeat(cycle, max_size))
                    if cycle in repeat_set:
                        continue
                    repeat_set.add(cycle)
                    if len(cycle) >= min_size:
                        generated_repeats.append('\t'.join(
                            [cycle, repeat,
                             str(len(cycle)), strand]))
    return generated_repeats
def write_12mers(mirname, mirseq, outfile):
    """Write one TFRecord example per 12mer for the given miRNA.

    The 8mer site is the reverse complement of ``mirseq[1:8]`` followed by
    ``'A'``.  ``generate_12mers`` must return exactly 262144 sequences for
    that site.  Each example stores the miRNA name, the one-hot encodings of
    the miRNA and of the 12mer, a placeholder ``log_kd`` of ``-0.0``, the
    aligned site type and a ``keep_prob`` (0.001 for ``'no site'`` 12mers,
    1.0 otherwise).

    Parameters
    ----------
    mirname: string, miRNA name
    mirseq: string, miRNA sequence
    outfile: string, path of the TFRecord file to write

    Raises
    ------
    ValueError: if the number of generated 12mers is not 262144
    """
    site8 = utils.rev_comp(mirseq[1:8]) + 'A'
    all_12mers = generate_12mers(site8)
    if len(all_12mers) != 262144:
        raise ValueError("all_12mers should be 262144 in length")

    mir_bytes = mirname.encode('utf-8')
    with tf.python_io.TFRecordWriter(outfile) as tfwriter:
        for siteseq in all_12mers:
            aligned_stype = utils.get_centered_stype(site8, siteseq)
            # 'no site' 12mers get a much smaller keep probability
            keep_prob = 0.001 if aligned_stype == 'no site' else 1.0

            features = tf.train.Features(
                feature={
                    'mir':
                    tf_utils._bytes_feature(mir_bytes),
                    'mir_1hot':
                    tf_utils._float_feature(utils.one_hot_encode(mirseq)),
                    'seq_1hot':
                    tf_utils._float_feature(utils.one_hot_encode(siteseq)),
                    'log_kd':
                    tf_utils._float_feature([-0.0]),
                    'keep_prob':
                    tf_utils._float_feature([keep_prob]),
                    'stype':
                    tf_utils._bytes_feature(aligned_stype.encode('utf-8')),
                })
            serialized = tf.train.Example(
                features=features).SerializeToString()
            tfwriter.write(serialized)
def calculate_threep_score(mirseq, utr, site_start, upstream_limit):
    """
    Calculate the three-prime pairing score

    The miRNA from index 8 onward is compared against the reverse complement
    of the UTR window ``[site_start - upstream_limit, site_start + 2)``.
    Identical characters at ``(i, j)`` score 0.5, or 1.0 when ``j`` (index
    within the miRNA 3' region) is 4-7.  A match that extends a diagonal run
    adds to that run's score and becomes a candidate; the first match of a
    run is stored minus an offset penalty of ``0.5 * (|i - j| - 2)`` (never
    negative) and is not itself a candidate.  The best candidate is returned.

    Parameters
    ----------
    mirseq: string, miRNA sequence
    utr: string, utr sequence
    site_start: int, start of 12mer site
    upstream_limit: int, how far upstream to look for 3p pairing

    Output
    ------
    float: 3' pairing score (0 when ``site_start <= 0`` or no run of at
        least two matches exists)
    """
    if site_start <= 0:
        return 0

    # miRNA sequence from position 9 onward
    mir_tail = mirseq[8:]
    # site sequence up to edges of possible 8mer site, reverse complemented
    window_start = max(0, site_start - upstream_limit)
    target = utils.rev_comp(utr[window_start:site_start + 2])

    # dynamic programming table; NaN marks "no run ends here"
    run_scores = np.full((len(target) + 1, len(mir_tail) + 1), np.nan)
    candidates = [0]

    for i, utr_nt in enumerate(target):
        for j, mir_nt in enumerate(mir_tail):
            if utr_nt != mir_nt:
                run_scores[i + 1, j + 1] = float('NaN')
                continue

            pair_score = 1.0 if 3 < j < 8 else 0.5
            previous = run_scores[i, j]
            if np.isnan(previous):
                # start of a new run: penalise by how far off-diagonal it is
                offset_penalty = max(0, (abs(i - j) - 2) * 0.5)
                run_scores[i + 1, j + 1] = pair_score - offset_penalty
            else:
                # extend the run ending on the previous diagonal cell
                pair_score += previous
                run_scores[i + 1, j + 1] = pair_score
                candidates.append(pair_score)

    return np.nanmax(candidates)
def generate_repeats(min_size, max_size, atomic):
    """Generates all possible motifs for repeats in a given length range.

    Motif lengths run from ``min_size`` (when ``atomic`` is true) or from 1
    (otherwise) up to and including ``max_size``.  A motif is skipped when
    its expansion to ``max_size`` has already been seen.  For every kept
    motif, all cycles of the motif ('+' strand) and, unless the motif equals
    its own reverse complement, all cycles of the reverse complement ('-'
    strand) are registered; each new cycle of length >= ``min_size`` yields
    one tab-separated record ``cycle, motif, len(cycle), strand``.

    Returns the list of records in generation order.
    """
    records = []
    seen_expansions = set()
    seen_cycles = set()
    first_size = min_size if atomic else 1

    for size in range(first_size, max_size + 1):
        for letters in product('ACGT', repeat=size):
            repeat = ''.join(letters)
            repeat_revcomp = rev_comp(repeat)
            if expand_repeat(repeat, max_size) in seen_expansions:
                continue

            for strand, motif in (('+', repeat), ('-', repeat_revcomp)):
                # a motif identical to its reverse complement has no
                # separate minus strand entries
                if strand == '-' and repeat_revcomp == repeat:
                    break
                for cycle in get_cycles(motif):
                    seen_expansions.add(expand_repeat(cycle, max_size))
                    if cycle in seen_cycles:
                        continue
                    seen_cycles.add(cycle)
                    if len(cycle) >= min_size:
                        fields = [cycle, repeat, str(len(cycle)), strand]
                        records.append('\t'.join(fields))
    return records
parser.add_option("--overlap_dist", dest="OVERLAP_DIST", help="minimum distance between neighboring sites", type=int) parser.add_option("--upstream_limit", dest="UPSTREAM_LIMIT", help="how far upstream to look for 3p pairing", type=int) parser.add_option("--only_canon", dest="ONLY_CANON", help="only use canonical sites", default=False, action='store_true') (options, args) = parser.parse_args() TRANSCRIPTS = pd.read_csv(options.TRANSCRIPTS, sep='\t', index_col='transcript') mirseqs = pd.read_csv(options.MIR_SEQS, sep='\t', index_col='mir') if '_pass' in options.MIR: MIRSEQ = mirseqs.loc[options.MIR.replace('_pass', '')]['pass_seq'] FAMILY = mirseqs.loc[options.MIR.replace('_pass', '')]['pass_family'] else: MIRSEQ = mirseqs.loc[options.MIR]['guide_seq'] FAMILY = mirseqs.loc[options.MIR]['guide_family'] SITE8 = utils.rev_comp(MIRSEQ[1:8]) + 'A' print(options.MIR, SITE8) # if KD file provided, find sites based on KD file if options.KDS is not None: KDS = pd.read_csv(options.KDS, sep='\t') if options.ONLY_CANON: KDS = KDS[KDS['aligned_stype'] != 'no site'] KDS = KDS[KDS['best_stype'] == KDS['aligned_stype']] temp = KDS[KDS['mir'] == FAMILY] if len(temp) == 0: raise ValueError('{} not in kd files'.format(FAMILY)) mir_kd_dict = {x: y for (x, y) in zip(temp['12mer'], temp['log_kd']) if (y < options.KD_CUTOFF)} # find all the sites and KDs
def calculate_12mer_kds(mirname, mirseq, mirlen, load_model, outfile):
    """
    For a given miRNA sequence, use a saved ConvNet to generate predictions

    The saved graph at ``load_model`` is restored into a fresh TensorFlow
    session, every sequence returned by ``generate_12mers`` is scored in 64
    batches of 6856, and one tab-separated row per sequence is written to
    ``outfile`` (columns: 12mer, log_kd, mir, mirseq, aligned_stype,
    best_stype).  The written ``log_kd`` is the negated network output.

    Parameters
    ----------
    mirname: string, miRNA name written to the output
    mirseq: string, miRNA sequence (at least 12 nt, only A/C/G/T)
    mirlen: int, number of leading miRNA nucleotides fed to the network
    load_model: string, checkpoint path (``load_model + '.meta'`` must exist)
    outfile: string, path of the output table

    Raises
    ------
    ValueError: if the miRNA is too short, contains other characters, or
        ``generate_12mers`` does not return exactly 438784 sequences
    """
    if len(mirseq) < 12:
        raise ValueError("miRNA must be at least 12 nt long")
    if any(nt not in 'ACGT' for nt in mirseq):
        raise ValueError("miRNA must only contain A, C, T, G")

    site8 = utils.rev_comp(mirseq[1:8]) + 'A'
    mirseq_one_hot = utils.one_hot_encode(mirseq[:mirlen])

    # generate all sequences to score; exactly 438784 are expected
    # (64 batches x 6856 sequences)
    kmers = generate_12mers(site8)
    if len(kmers) != 438784:
        raise ValueError("kmers should be 438784 in length")

    # load trained model
    tf.reset_default_graph()
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        saver = tf.train.import_meta_graph(load_model + '.meta')
        print('Restoring from {}'.format(load_model))
        saver.restore(sess, load_model)

        graph = tf.get_default_graph()
        _dropout_rate = graph.get_tensor_by_name('dropout_rate:0')
        _phase_train = graph.get_tensor_by_name('phase_train:0')
        _combined_x = graph.get_tensor_by_name('combined_x:0')
        _prediction = graph.get_tensor_by_name('final_layer/pred_ka:0')

        num_batches = 64
        batch_size = 6856
        with open(outfile, 'w') as outfile_writer:
            outfile_writer.write(
                '12mer\tlog_kd\tmir\tmirseq\taligned_stype\tbest_stype\n')
            for batch in range(num_batches):
                batch_end = (batch + 1) * batch_size
                print("Processing {}/{}...".format(batch_end, 438784))
                seqs = kmers[batch * batch_size:batch_end]

                # one miRNA x target outer-product image per sequence
                input_data = np.stack([
                    np.outer(mirseq_one_hot, utils.one_hot_encode(seq))
                    for seq in seqs
                ])

                # inference mode: no dropout, batch norm in test phase
                feed_dict = {
                    _dropout_rate: 0.0,
                    _phase_train: False,
                    _combined_x: input_data
                }
                # the tensor is named pred_ka; its sign is flipped before
                # being written as log_kd
                pred_kds = -1 * sess.run(_prediction,
                                         feed_dict=feed_dict).flatten()

                aligned_stypes = [
                    utils.get_centered_stype(site8, seq) for seq in seqs
                ]
                best_stypes = [
                    utils.get_best_stype(site8, seq) for seq in seqs
                ]

                rows = zip(seqs, pred_kds, aligned_stypes, best_stypes)
                for seq, kd, aligned_stype, best_stype in rows:
                    outfile_writer.write('{}\t{}\t{}\t{}\t{}\t{}\n'.format(
                        seq, kd, mirname, mirseq, aligned_stype, best_stype))
action='store_true') (options, args) = parser.parse_args() ### READ miRNA DATA and filter for ones to keep ### MIRNAS = pd.read_csv(options.MIR_SEQS, sep='\t') MIRNAS = MIRNAS[MIRNAS['use_tpms']] ALL_GUIDES = sorted(list(MIRNAS['mir'].values)) MIR_DICT = {} for row in MIRNAS.iterrows(): guide_seq = row[1]['guide_seq'] pass_seq = row[1]['pass_seq'] MIR_DICT[row[1]['mir']] = { 'mirseq': guide_seq, 'site8': utils.rev_comp(guide_seq[1:8]) + 'A', 'one_hot': utils.one_hot_encode(guide_seq[:options.MIRLEN]) } MIR_DICT[row[1]['mir'] + '*'] = { 'mirseq': pass_seq, 'site8': utils.rev_comp(pass_seq[1:8]) + 'A', 'one_hot': utils.one_hot_encode(pass_seq[:options.MIRLEN]) } ### READ EXPRESSION DATA ### TPM = pd.read_csv(options.TPM_FILE, sep='\t', index_col=0).sort_index() for mir in ALL_GUIDES: if mir not in TPM.columns: raise ValueError( '{} given in mirseqs file but not in TPM file.'.format(mir))
dest="OUTFILE", help="file for writing outputs, use MIR as placeholder") parser.add_option("--passenger", dest="PASSENGER", help="also calculate for passenger strand", default=False, action='store_true') (options, args) = parser.parse_args() mirseqs = pd.read_csv(options.MIRSEQS, sep='\t', index_col='mir') for row in mirseqs.iterrows(): # get names and 8mer sites for the guide and passenger strands, if applicable mirnames = [row[0]] site8s = [utils.rev_comp(row[1]['guide_seq'][1:8]) + 'A'] if options.PASSENGER: mirnames += [row[0] + '_pass'] site8s += [utils.rev_comp(row[1]['pass_seq'][1:8]) + 'A'] # read in RNAplfold results for mirname, site8 in zip(mirnames, site8s): SA_bg = [] for ix in range(options.NBINS): seqs = pd.read_csv(os.path.join( options.INFILE_SEQS.replace('MIR', mirname).replace( 'IX', str(ix))), sep='\t') temp = pd.read_csv(os.path.join( options.INFILE_BG.replace('MIR', mirname).replace('IX', str(ix))),
def split_kmers(self,
                min_overlap=16,
                max_overlap=35,
                min_tm=59,
                max_tm=64,
                min_gc=40,
                max_gc=60,
                min_ddg=-3,
                min_dimer=-9):
    """
    Split the coding sequence into overlapping oligos.

    The full assembly (asm_f + asmf_re + nt_seq + stop + asmr_re +
    reverse complement of asm_r) is cut into fragments that share an overlap
    satisfying the length / Tm / GC / structure constraints below.  Each
    fragment is then wrapped as
    gsp_f + typeIIs + fragment + rev_comp(typeIIs) + padding + rev_comp(gsp_r).

    Parameters
    ----------
    (int) min_overlap -- minimum bp in overlap sequences. Default 16
    (int) max_overlap -- maximum bp in overlap sequences. Default 35
    (float) min_tm -- minimum melting temp in overlap sequences. Default 59
    (float) max_tm -- maximum melting temp in overlap sequences. Default 64
    (float) min_gc -- minimum %GC content in overlap sequences. Default 40
    (float) max_gc -- maximum %GC content in overlap sequences. Default 60
    (float) min_ddg=-3 -- minimum hairpin/secondary structure deltaG from ViennaRNA
                          RNAfold (DNA parameters)
                          If None, do not check this
    (float) min_dimer=-9 -- minimum self-association deltaG from ViennaRNA
                            RNAduplex (DNA parameters)
                            If None, do not check this

    Returns
    -------
    None.  Oligos stored in oligos field of Gene object.  The overlaps,
    overlap_gc and overlap_tm fields are filled alongside.

    Raises
    ------
    Exception -- when no fragment length within the allowed range produces a
                 valid set of overlaps or a valid assembly
    AssertionError -- when a primer / restriction site appears an unexpected
                      number of times in the assembly or in a finished oligo
    """
    # calculate expected number of oligos necessary so that length can be equal-ish
    # NOTE(review): len(self.stop) is not part of this total although the
    # assembled sequence below includes self.stop -- presumably aa_seq already
    # accounts for the stop codon; confirm.
    tot_assembled = len(self.aa_seq)*3+len(self.asm_f)+len(self.asmf_re) + \
        len(self.asmr_re)+len(self.asm_r)  # total sequence that needs to be split into fragments
    available_nt = self.oligo_size - len(self.gsp_f) - len(
        self.gsp_r) - 2 * len(
            self.typeIIs)  # non-constant oligo region size
    expected_overlap_nt = (
        np.ceil(tot_assembled / available_nt) - 1
    ) * max_overlap  # expected number of bp doubly represented due to overlaps
    expected_oligos = np.ceil(
        (tot_assembled + expected_overlap_nt) /
        available_nt)  # expected number of oligos including overlap region
    target_length = int(
        (tot_assembled + expected_overlap_nt) // expected_oligos
    )  # length to target per fragment to get roughly equal lengths

    # The basic gist of how this works is you start with the full sequence, which you have already determined the nucleotides
    # for (this is the main difference from Bill's code). You take as much of that sequence as you can to fill the current
    # oligo, cut it back until you get a GC on the 3' end, then work backwards until you get a good overlap. If you can't get
    # a good overlap (usually due to GC content), start over with a different max length. Max lengths go from n, n-1, n+1, n-2, n+2, ...
    # If the max length becomes greater than the allowed length (or shorter than max_overlap+2, but usually the former happens first),
    # can't assemble the sequence. Try with a different random seed to produce a different sequence.
    # Note: the former constraint can be relaxed to try some shorter lengths too, but seems to work OK for now
    all_oligos = False  # keep track of progress
    curr_max = target_length  # max number of bp allowed in single oligo
    curr_oligo = self.asm_f + self.asmf_re + self.nt_seq + self.stop + self.asmr_re + utils.rev_comp(
        self.asm_r)
    next_oligo = ""
    while not all_oligos:
        if len(curr_oligo) > curr_max:
            # just take what will fit on the current oligo, save the
            # rest for later
            next_oligo = curr_oligo[curr_max:]
            curr_oligo = curr_oligo[:curr_max]
        if len(next_oligo) == 0:
            # don't need to find an overlap bc done
            self.oligos.append(curr_oligo)
            # check the assembly: rebuild the gene by locating the last 10 nt
            # of each oligo inside the following one
            gene = ""
            badoverlap = False
            for i in range(len(self.oligos)):
                if i == 0:
                    gene += self.oligos[i]
                else:
                    common = self.oligos[i].find(self.oligos[i - 1][-10:])
                    if common < 0:
                        badoverlap = True
                        break
                    gene += self.oligos[i][common + 10:]
            # check for additional overlap sites between different oligos
            for i, olap in enumerate(self.overlaps):
                for j, oligo in enumerate(self.oligos):
                    # overlap i corresponds to oligo i and oligo i+1
                    if j == i:  # overlap at the end
                        true_occur = oligo.find(olap)
                        trimmed_seq = oligo[:true_occur]
                    elif j == i + 1:  #overlap at the beginning
                        true_occur = oligo.find(olap)
                        trimmed_seq = oligo[true_occur + len(olap):]
                    else:
                        trimmed_seq = oligo
                    # a shared stretch longer than 10 nt (either strand)
                    # outside the intended overlap is treated as mispriming
                    match_len_fwd, _, _ = utils.lcs(trimmed_seq, olap)
                    match_len_rev, _, _ = utils.lcs(
                        trimmed_seq, utils.rev_comp(olap))
                    if match_len_fwd > 10 or match_len_rev > 10:
                        print("Bad overlap due to possible mispriming.")
                        print("Oligo %d overlap %d match %d bp" %
                              (j, i, max(match_len_fwd, match_len_rev)))
                        badoverlap = True
                        # only leaves the inner loop; the remaining overlaps
                        # are still scanned (and may print again)
                        break
            if badoverlap or \
                    gene != self.asm_f + self.asmf_re + self.nt_seq + \
                    self.stop + self.asmr_re + utils.rev_comp(self.asm_r):
                # assembly failed: move to the next candidate max length
                # (alternating below / above target_length) and start over
                if curr_max < target_length:
                    curr_max = 2 * target_length - curr_max
                else:
                    curr_max = 2 * target_length - curr_max - 1
                if curr_max > available_nt or curr_max < max_overlap + 2:
                    raise Exception(
                        "Couldn't find oligos with given framework, failed at assembly"
                    )
                curr_oligo = self.asm_f + self.asmf_re + self.nt_seq + self.stop + self.asmr_re + utils.rev_comp(
                    self.asm_r)
                self.oligos = []
                self.overlaps = []
                self.overlap_gc = []
                self.overlap_tm = []
            else:
                assert gene.count(self.asm_f) == 1, "Incorrect number AsmF"
                assert gene.count(
                    self.asmf_re) == 1, "Incorrect number AsmF RE"
                assert gene.count(utils.rev_comp(
                    self.asm_r)) == 1, "Incorrect number AsmR"
                assert gene.count(
                    self.asmr_re) == 1, "Incorrect number AsmR RE"
                all_oligos = True
            # either finished (loop condition ends the loop) or restarting
            continue
        # trim back to g or c
        while curr_oligo[-1] not in 'GC':
            next_oligo = curr_oligo[-1] + next_oligo
            curr_oligo = curr_oligo[:-1]
        # find the overlap
        overlap_pos = len(curr_oligo) - min_overlap + 1
        # sentinel values that fail the Tm/GC tests and pass the
        # structure tests until real values are computed
        curr_tm = 0
        curr_gc = 0
        curr_ss_ddg = 10
        curr_dimer = 10
        while (curr_oligo[overlap_pos] not in 'GC'
               or not min_tm <= curr_tm <= max_tm
               or not min_gc <= curr_gc <= max_gc
               or (min_ddg is not None and not curr_ss_ddg > min_ddg)
               or (min_dimer is not None and not curr_dimer > min_dimer)):
            overlap_pos -= 1  # initial case accounted for in math above
            if overlap_pos < len(
                    curr_oligo
            ) - max_overlap or curr_tm > max_tm:  #Tm is never going to decrease
                break  # no good overlap... try different max length and
                # restart the loop
            # don't bother with expensive calcs if the loop is just going to fail anyway
            if curr_oligo[overlap_pos] not in 'GC':
                continue
            temp_overlap = Seq(curr_oligo[overlap_pos:])
            # tm calculation with salt correction for KOD reaction
            curr_tm = mt.Tm_NN(temp_overlap, Mg=1.5, dNTPs=0.8)
            curr_gc = GC(temp_overlap)
            # ViennaRNA external software
            if min_ddg is not None:
                curr_ss_ddg = utils.pred_ss_ddg(str(
                    temp_overlap))  # this calculation slows it down a LOT
            if min_dimer is not None:
                curr_dimer = utils.pred_dimer(str(temp_overlap),
                                              str(temp_overlap))
        # same test as the loop condition above (written with != None here)
        if (curr_oligo[overlap_pos] not in 'GC'
                or not min_tm <= curr_tm <= max_tm
                or not min_gc <= curr_gc <= max_gc
                or (min_ddg != None and not curr_ss_ddg > min_ddg)
                or (min_dimer != None and not curr_dimer > min_dimer)):
            # this means the above loop broke, so try diff max length
            # and restart the loop
            if curr_max < target_length:
                curr_max = 2 * target_length - curr_max
            else:
                curr_max = 2 * target_length - curr_max - 1
            if curr_max > available_nt or curr_max < max_overlap + 2:
                raise Exception(
                    "Couldn't find oligos with given framework, failed at melting temp"
                )
            curr_oligo = self.asm_f + self.asmf_re + self.nt_seq + self.stop + self.asmr_re + utils.rev_comp(
                self.asm_r)
            self.oligos = []
            self.overlaps = []
            self.overlap_gc = []
            self.overlap_tm = []
            continue
        # otherwise process the overlap!
        self.oligos.append(curr_oligo)
        self.overlaps.append(curr_oligo[overlap_pos:])
        self.overlap_gc.append(curr_gc)
        self.overlap_tm.append(curr_tm)
        curr_oligo = self.overlaps[-1] + next_oligo  # add in the overlap
        next_oligo = ""
    # add the gsps, type IIs, any buffer residues to everything
    full_oligos = []
    for i, oligo in enumerate(self.oligos):
        # add buffer between 3' GSP site, TypeIIs site to bring oligo
        # up to full size
        padding_size = available_nt - len(oligo)
        # array of single characters so that individual positions can be
        # re-drawn in place below
        padding = self.rng.choice(['A', 'C', 'G', 'T'], padding_size)
        good_padding = False
        to_exclude = [
            self.typeIIs, self.asmf_re, self.asmr_re, self.gsp_f,
            self.gsp_r, self.asm_f, self.asm_r, 'AAAAA', 'GGGGG', 'CCCCC',
            'TTTTT'
        ]
        to_exclude.extend(
            [utils.rev_comp(subseq) for subseq in to_exclude])
        primers = [
            self.gsp_f,
            utils.rev_comp(self.gsp_r), self.asm_f,
            utils.rev_comp(self.asm_r)
        ]
        left_boundary = oligo[-4:] + utils.rev_comp(self.typeIIs)
        # indices (within subseq below) that belong to the padding and may
        # therefore be changed
        fixable_pos = set(
            range(len(left_boundary),
                  len(left_boundary) + padding_size))
        # make sure you're not getting any restriction/priming sites in the buffer bp that will
        # mess things up
        while not good_padding:
            subseq = left_boundary + "".join(padding) + utils.rev_comp(
                self.gsp_r)[:5]
            bad_pos = set()
            for site in to_exclude:  # include boundaries
                substr = subseq.find(site)
                if substr >= 0:
                    bad_pos.update(
                        fixable_pos.intersection(
                            range(substr, substr + len(site))))
            for primer in primers:
                match_len, bad_nt_pos, _ = utils.lcs(subseq, primer)
                if match_len > 10:
                    bad_pos.update(fixable_pos.intersection(bad_nt_pos))
                # primers are single stranded, but the templates are not (after one cycle, at least)
                # NOTE(review): these positions presumably index the
                # reverse-complemented string but are intersected with
                # forward-strand indices -- confirm against utils.lcs
                match_len, bad_nt_pos, _ = utils.lcs(
                    utils.rev_comp(subseq), primer)
                if match_len > 10:
                    bad_pos.update(fixable_pos.intersection(bad_nt_pos))
            if len(bad_pos) == 0:
                good_padding = True
            else:
                # re-draw one offending padding nucleotide and re-check
                to_fix = self.rng.choice(list(bad_pos))
                padding[to_fix - len(left_boundary)] = self.rng.choice(
                    ['A', 'C', 'G', 'T'])
        padding = "".join(padding)
        complete_oligo = self.gsp_f + self.typeIIs + oligo + utils.rev_comp(self.typeIIs) + \
            padding + utils.rev_comp(self.gsp_r)
        # already checked the full assembly, so now make sure nothing was accidentally introduced
        # at boundaries
        assert complete_oligo.count(
            self.gsp_f) == 1, "GSP F not found in %d -th oligo" % i
        assert complete_oligo.count(utils.rev_comp(
            self.gsp_r)) == 1, "GSP_R not found in %d -th oligo" % i
        assert complete_oligo.count(utils.rev_comp(
            self.gsp_f)) == 0, "GSP_F RC found in %d -th oligo" % i
        assert complete_oligo.count(
            self.gsp_r) == 0, "GSP_R RC found in %d -th oligo" % i
        # a palindromic Type IIS site is counted once per flank by the same
        # search string, hence 2 instead of 1 + 1
        if self.typeIIs == utils.rev_comp(self.typeIIs):
            assert complete_oligo.count(
                self.typeIIs
            ) == 2, "Extra Type IIS sites in %d -th oligo" % i
        else:
            assert complete_oligo.count(
                self.typeIIs
            ) == 1, "Extra Type IIS sites in %d -th oligo" % i
            assert complete_oligo.count(
                utils.rev_comp(self.typeIIs)
            ) == 1, "Extra Type IIS sites in %d -th oligo" % i
        assert complete_oligo.count(utils.rev_comp(
            self.asm_f)) == 0, "AsmF RC in in %d -th oligo" % i
        assert complete_oligo.count(
            self.asm_r) == 0, "AsmR in %d -th oligo" % i
        # palindromic assembly restriction sites cannot be told apart from
        # their reverse complement, so only check non-palindromic ones
        if self.asmf_re != utils.rev_comp(self.asmf_re):
            assert complete_oligo.count(utils.rev_comp(
                self.asmf_re)) == 0, "AsmF RE RC in %d -th oligo" % i
        if self.asmr_re != utils.rev_comp(self.asmr_re):
            assert complete_oligo.count(utils.rev_comp(
                self.asmr_re)) == 0, "AsmR RE RC in %d -th oligo" % i
        full_oligos.append(complete_oligo)
    self.oligos = full_oligos
    return