def fix_sequence(self):
    """
    Resample codons until the coding sequence passes every sequence check.

    On each pass the nucleotide sequence (``self.nt_seq``) is scanned for:

    * the first occurrence of every forbidden site (Type IIS site, assembly
      restriction sites, primers, 5-nt homopolymers) and of each site's
      reverse complement;
    * a common substring longer than 10 nt with any primer, on either strand.

    If anything is flagged, one flagged codon is picked at random and
    resampled with ``self.sample_new_codon``.  If nothing is flagged but the
    GC content is outside 35-65 %, a codon chosen at random over the whole
    sequence is resampled instead.  The loop ends once both checks pass.

    Returns
    -------
    None.
    """
    forward_sites = [
        self.typeIIs, self.asmf_re, self.asmr_re, self.gsp_f, self.gsp_r,
        self.asm_f, self.asm_r, 'AAAAA', 'GGGGG', 'CCCCC', 'TTTTT'
    ]
    forbidden = forward_sites + [
        utils.rev_comp(site) for site in forward_sites
    ]
    primer_seqs = [
        self.gsp_f,
        utils.rev_comp(self.gsp_r),
        self.asm_f,
        utils.rev_comp(self.asm_r),
    ]
    gc_min, gc_max = 35, 65

    while True:
        flagged = set()

        # Exact forbidden sites: only the first hit per site is recorded,
        # later hits are picked up on subsequent passes.
        for site in forbidden:
            hit = self.nt_seq.find(site)
            if hit > -1:
                flagged.update(
                    utils.get_codons(np.arange(hit, hit + len(site))))

        # Partial primer matches.  Primers are single stranded, but the
        # templates are not (after one cycle, at least), so both strands
        # are checked.
        for primer in primer_seqs:
            fwd_len, fwd_pos, _ = utils.lcs(self.nt_seq, primer)
            if fwd_len > 10:
                flagged.update(utils.get_codons(fwd_pos))
            rev_len, rev_pos, _ = utils.lcs(
                utils.rev_comp(self.nt_seq), primer)
            if rev_len > 10:
                flagged.update(utils.get_codons(rev_pos))

        if flagged:
            self.sample_new_codon(self.rng.choice(list(flagged)))
            continue

        # No bad sites left: accept if the GC content is in range ...
        if gc_min <= GC(self.nt_seq) <= gc_max:
            return

        # ... otherwise pick a random site to change and start over.
        self.sample_new_codon(
            self.rng.choice(np.arange(len(self.codons))))
def align_podcast_tokens(args, df):
    """Align embedding tokens with the podcast cloze datum.

    Reads ``data/<args.project_id>/podcast-datum-cloze.csv`` relative to the
    current working directory, lower-cases its ``word`` column and aligns it
    against ``df['token2word']`` with ``lcs``.  The rows selected on each
    side are concatenated column-wise, and for columns that appear more than
    once only the first occurrence is kept.

    Args:
        args (Namespace): project parameters; only ``project_id`` is read
        df (DataFrame): embeddings dataframe with a ``token2word`` column

    Returns:
        DataFrame: aligned/filtered dataframe (goes into encoding)
    """
    datum_path = os.path.join(os.getcwd(), 'data', args.project_id,
                              'podcast-datum-cloze.csv')
    datum = pd.read_csv(datum_path, sep=',')

    datum_words = [str.lower(word) for word in datum.word.tolist()]
    token_words = df['token2word'].tolist()

    # lcs yields one row selector per input list
    datum_rows, token_rows = lcs(datum_words, token_words)

    aligned_datum = datum.iloc[datum_rows, :].reset_index(drop=True)
    aligned_tokens = df.iloc[token_rows, :].reset_index(drop=True)

    merged = pd.concat([aligned_tokens, aligned_datum], axis=1)
    return merged.loc[:, ~merged.columns.duplicated()]
def map_attributes_to_mesh(value, banned_columns=None):
    """Translate a free-text term into a MeSH code via the catalogue service.

    The term is posted to ``CATALOGUE_TRANSLATE_URL``.  Among the returned
    candidates that are not banned, the one with the highest score wins:
    codes starting with ``"D"`` get a fixed score of 1000, all others are
    scored with ``lcs(value, mesh_label)``.  Ties keep the earliest
    candidate.

    Args:
        value (str): term to translate.
        banned_columns: optional container of MeSH codes that must not be
            returned.  ``None`` (the default) bans nothing.

    Returns:
        ``None`` if the service does not answer with HTTP 200; otherwise the
        best MeSH code, or ``value`` itself when no candidate qualifies.

    Raises:
        AssertionError: if ``value`` is not a string.
    """
    assert isinstance(
        value, str
    ), "Value passed to 'map_attributes_to_mesh' must be of type <str>"

    data = {'term': value, 'terminology': 'text', 'result_format': 'json'}
    response = requests.post(CATALOGUE_TRANSLATE_URL, data=data)
    if response.status_code != 200:
        return None

    best_score = None
    best_code = None
    for mesh_candidate in response.json():
        candidate_code = mesh_candidate['mesh_code']

        # The None check has to come first: the original tested membership
        # before it, so the default banned_columns=None raised TypeError.
        if banned_columns is not None and candidate_code in banned_columns:
            continue

        if candidate_code.startswith("D"):
            score = 1000
        else:
            score = lcs(value, mesh_candidate['mesh_label'])

        if best_score is None or score > best_score:
            best_score = score
            best_code = candidate_code

    return best_code if best_code is not None else value
def get_candidate(k2, candidates, ref_scores=None):
    """Rank ``candidates`` by similarity to the keyword ``k2``.

    Both the keyword and each candidate are passed through
    ``utils.accent2bare`` before comparison.  The raw score of a candidate is

        len(lcs(bare_keyword, bare_candidate))
        + 0.1 if keyword and candidate share their first character
        + log(100 + ref_score) / log(1000)

    divided by the length of the bare keyword.  Scores are then ordered with
    ``utils.sort_array_indices``.

    Args:
        k2: keyword (must be non-empty; its first character is inspected,
            and an empty bare keyword leads to a division by zero).
        candidates: indexable sequence of candidate strings.
        ref_scores: optional per-candidate reference scores, indexed like
            ``candidates``.  When omitted, every candidate gets a reference
            score of 0 (the original indexed ``None`` and raised TypeError).

    Returns:
        tuple: ``(candidates reordered by sorted_indices, sorted_scores)``,
        both in the order produced by ``utils.sort_array_indices``.
    """
    if ref_scores is None:
        ref_scores = [0] * len(candidates)

    bare_keyword = utils.accent2bare(k2)
    keyword_len = len(bare_keyword) * 1.0

    similarities = []
    for i, candidate in enumerate(candidates):
        bare_candidate = utils.accent2bare(candidate)
        score = len(utils.lcs(bare_keyword, bare_candidate)) * 1.0
        if k2[0] == candidate[0]:
            # small bonus for sharing the (accented) first character
            score += 0.1
        score += math.log(100.0 + ref_scores[i]) / math.log(1000)
        similarities.append(score / keyword_len)

    sorted_scores, sorted_indices = utils.sort_array_indices(similarities)
    ranked = [candidates[ind] for ind in sorted_indices]
    return ranked, sorted_scores
def tag_brand_name(pt_text, attr_vals):
    """Return the brand portion shared between ``pt_text`` and the brand attribute.

    Args:
        pt_text: product text to search.
        attr_vals: mapping of attribute names to values.

    Returns:
        ``str(utils.lcs(pt_text, attr_vals['brand']))`` when a ``'brand'``
        entry exists, otherwise ``None``.
    """
    if 'brand' not in attr_vals:
        return None
    return str(utils.lcs(pt_text, attr_vals['brand']))
def cal_same_terms(ori_quests, cand_quests, features):
    """
    Append the shared-term ratio of each question pair to ``features``.

    For every index the original and candidate questions are materialised as
    lists, ``lcs`` supplies the list of common terms, and the score
    ``len(common) / len(original)`` is appended to ``features[idx]`` in
    place.  An empty original question raises ZeroDivisionError.
    """
    for idx in np.arange(len(ori_quests)):
        source_terms = list(ori_quests[idx])
        candidate_terms = list(cand_quests[idx])
        _, shared_terms = lcs(source_terms, candidate_terms)
        ratio = float(len(shared_terms)) / len(source_terms)
        features[idx].append(ratio)
def cal_max_similarity_term(ori_quests, cand_quests, features):
    """
    Append the shared-term length ratio of each question pair to ``features``.

    For every index ``lcs`` supplies the list of terms common to the original
    and candidate question.  The score is the summed length of those common
    terms divided by the summed length of all terms of the original
    question; it is appended to ``features[idx]`` in place.  An original
    question with zero total length raises ZeroDivisionError.
    """
    for idx in np.arange(len(ori_quests)):
        source_terms = list(ori_quests[idx])
        candidate_terms = list(cand_quests[idx])
        _, shared_terms = lcs(source_terms, candidate_terms)

        shared_chars = sum(len(term) for term in shared_terms)
        source_chars = sum(len(term) for term in source_terms)

        features[idx].append(float(shared_chars) / source_chars)
def tag_processor_type(pt_text, attr_vals):
    """Extract the processor mention from ``pt_text``.

    The common substring of ``pt_text`` and the ``'processor_type'``
    attribute (from ``utils.lcs``) is searched for in ``pt_text``, case
    insensitively.  The match is extended over an optional trailing model
    designation: optional whitespace, an optional ``<letter><digits>``
    prefix followed by ``-`` or a space, four digits, and optional trailing
    letters.

    Args:
        pt_text (str): product text to search.
        attr_vals: mapping of attribute names to values.

    Returns:
        The first matching span of ``pt_text``, or ``None`` when the
        attribute is missing or empty, or nothing matches.
    """
    if 'processor_type' not in attr_vals:
        return None
    proc = attr_vals['processor_type']
    if len(proc) == 0:
        return None

    common = utils.lcs(pt_text, proc)
    # re.escape: the common substring is literal text, not a regex.  Without
    # it, characters such as "(", "+" or "." in a processor name would break
    # or silently alter the pattern.  The suffix is a raw string so that
    # "\s" is not treated as an (invalid) string escape.
    pattern = re.compile(
        '(' + re.escape(common) +
        r')\s*(([A-Za-z][0-9]+[- ])?\s*[0-9]{4}([A-Za-z]+)?)?',
        re.IGNORECASE)

    match = pattern.search(pt_text)
    if match is None:
        return None
    return match.group(0)
def predict_lcs(lcs_classificator, nn_tree, testSample, use_min=False):
    """Classify ``testSample`` by its average LCS score against each class.

    Every element of ``testSample`` is first replaced by its single nearest
    neighbour from ``nn_tree.query(..., k=1, return_distance=False)``
    (presumably a scikit-learn style KDTree/BallTree -- confirm).  For each
    class in ``lcs_classificator.classes`` every stored sample is scored:
    both sequences are reduced to the items they have in common (order
    preserved) and passed to ``lcs``; samples sharing nothing score 0.0.
    The class score is the sum of the sample scores divided by the number of
    samples in the class.

    Args:
        lcs_classificator: object exposing ``classes``, an iterable of
            classes, each a sized collection of samples.
        nn_tree: nearest-neighbour index used to quantise the test sample.
        testSample: query points accepted by ``nn_tree.query``.
        use_min: unused; kept for interface compatibility.

    Returns:
        Index (``np.argmax``) of the class with the highest score.
    """
    neighbours = nn_tree.query(testSample, k=1, return_distance=False)
    testSample = [row[0] for row in neighbours]
    test_items = set(testSample)  # loop invariant, built once

    scores = []
    for cl in lcs_classificator.classes:
        class_score = []
        for sample in cl:
            intersection = test_items.intersection(sample)
            if len(intersection) > 0:
                # Real lists rather than filter(): on Python 3 filter()
                # returns a one-shot iterator without len()/indexing.
                test = [x for x in testSample if x in intersection]
                shared = [x for x in sample if x in intersection]
                class_score.append(lcs(test, shared))
            else:
                class_score.append(0.0)
        # Progress output kept from the original (was a Python 2 print
        # statement, which is a SyntaxError on Python 3).
        print(scores)
        scores.append(sum(np.array(class_score)) / float(len(cl)))
    return np.argmax(scores)
def split_kmers(self,
                min_overlap=16,
                max_overlap=35,
                min_tm=59,
                max_tm=64,
                min_gc=40,
                max_gc=60,
                min_ddg=-3,
                min_dimer=-9):
    """
    Split the coding sequence into overlapping oligos.

    The full construct (asm_f + asmf_re + nt_seq + stop + asmr_re +
    rev_comp(asm_r)) is cut into fragments whose 3' overlaps satisfy the
    length, Tm, GC and (optionally) secondary-structure / self-dimer
    constraints below.  Each fragment is then wrapped as
    gsp_f + typeIIs + fragment + rev_comp(typeIIs) + random padding +
    rev_comp(gsp_r).

    Parameters
    ----------
    (int) min_overlap -- minimum bp in overlap sequences. Default 16
    (int) max_overlap -- maximum bp in overlap sequences. Default 35
    (float) min_tm -- minimum melting temp in overlap sequences. Default 59
    (float) max_tm -- maximum melting temp in overlap sequences. Default 64
    (float) min_gc -- minimum %GC content in overlap sequences. Default 40
    (float) max_gc -- maximum %GC content in overlap sequences. Default 60
    (float) min_ddg=-3 -- minimum hairpin/secondary structure deltaG from
        ViennaRNA RNAfold (DNA parameters). If None, do not check this
    (float) min_dimer=-9 -- minimum self-association deltaG from
        ViennaRNA RNAduplex (DNA parameters). If None, do not check this

    Returns
    -------
    None. Oligos stored in oligos field of Gene object. The overlaps,
    overlap_gc and overlap_tm fields are filled alongside.

    Raises
    ------
    Exception -- when the per-oligo length limit being tried exceeds the
        available oligo space or drops below max_overlap + 2, i.e. no
        valid split was found.
    AssertionError -- when a sanity check on the reassembled gene or on a
        finished oligo fails.

    Notes
    -----
    Assumes self.oligos, self.overlaps, self.overlap_gc and self.overlap_tm
    already exist as lists when the method is called (they are appended to
    before being reset) -- TODO confirm against the constructor.
    """
    # calculate expected number of oligos necessary so that length can be equal-ish
    tot_assembled = len(self.aa_seq)*3+len(self.asm_f)+len(self.asmf_re) + \
        len(self.asmr_re)+len(self.asm_r)  # total sequence that needs to be split into fragments
    available_nt = self.oligo_size - len(self.gsp_f) - len(
        self.gsp_r) - 2 * len(
            self.typeIIs)  # non-constant oligo region size
    expected_overlap_nt = (
        np.ceil(tot_assembled / available_nt) - 1
    ) * max_overlap  # expected number of bp doubly represented due to overlaps
    expected_oligos = np.ceil(
        (tot_assembled + expected_overlap_nt) /
        available_nt)  # expected number of oligos including overlap region
    target_length = int(
        (tot_assembled + expected_overlap_nt) // expected_oligos
    )  # length to target per fragment to get roughly equal lengths

    # The basic gist of how this works is you start with the full sequence, which you have already determined the nucleotides
    # for (this is the main difference from Bill's code). You take as much of that sequence as you can to fill the current
    # oligo, cut it back until you get a GC on the 3' end, then work backwards until you get a good overlap. If you can't get
    # a good overlap (usually due to GC content), start over with a different max length. Max lengths go from n, n-1, n+1, n-2, n+2, ...
    # If the max length becomes greater than the allowed length (or shorter than max_overlap+2, but usually the former happens first),
    # can't assemble the sequence. Try with a different random seed to produce a different sequence.
    # Note: the former constraint can be relaxed to try some shorter lengths too, but seems to work OK for now
    all_oligos = False  # keep track of progress
    curr_max = target_length  # max number of bp allowed in single oligo
    curr_oligo = self.asm_f + self.asmf_re + self.nt_seq + self.stop + self.asmr_re + utils.rev_comp(
        self.asm_r)
    next_oligo = ""
    while not all_oligos:
        if len(curr_oligo) > curr_max:
            # just take what will fit on the current oligo, save the
            # rest for later
            next_oligo = curr_oligo[curr_max:]
            curr_oligo = curr_oligo[:curr_max]
        if len(next_oligo) == 0:
            # don't need to find an overlap bc done
            self.oligos.append(curr_oligo)
            # check the assembly: re-join the oligos on the last 10 nt of
            # each predecessor and compare with the intended construct
            gene = ""
            badoverlap = False
            for i in range(len(self.oligos)):
                if i == 0:
                    gene += self.oligos[i]
                else:
                    common = self.oligos[i].find(self.oligos[i - 1][-10:])
                    if common < 0:
                        badoverlap = True
                        break
                    gene += self.oligos[i][common + 10:]
            # check for additional overlap sites between different oligos
            for i, olap in enumerate(self.overlaps):
                for j, oligo in enumerate(self.oligos):
                    # overlap i corresponds to oligo i and oligo i+1;
                    # the legitimate copy is trimmed off before searching
                    if j == i:  # overlap at the end
                        true_occur = oligo.find(olap)
                        trimmed_seq = oligo[:true_occur]
                    elif j == i + 1:  # overlap at the beginning
                        true_occur = oligo.find(olap)
                        trimmed_seq = oligo[true_occur + len(olap):]
                    else:
                        trimmed_seq = oligo
                    match_len_fwd, _, _ = utils.lcs(trimmed_seq, olap)
                    match_len_rev, _, _ = utils.lcs(
                        trimmed_seq, utils.rev_comp(olap))
                    if match_len_fwd > 10 or match_len_rev > 10:
                        print("Bad overlap due to possible mispriming.")
                        print("Oligo %d overlap %d match %d bp" %
                              (j, i, max(match_len_fwd, match_len_rev)))
                        badoverlap = True
                        # NOTE(review): this only leaves the inner loop; the
                        # remaining overlaps are still scanned (and may print
                        # again). The outcome is unaffected because
                        # badoverlap stays True.
                        break
            if badoverlap or \
                    gene != self.asm_f + self.asmf_re + self.nt_seq + \
                    self.stop + self.asmr_re + utils.rev_comp(self.asm_r):
                # assembly failed: move to the next candidate max length
                # (n, n-1, n+1, n-2, n+2, ...) and start from scratch
                if curr_max < target_length:
                    curr_max = 2 * target_length - curr_max
                else:
                    curr_max = 2 * target_length - curr_max - 1
                if curr_max > available_nt or curr_max < max_overlap + 2:
                    raise Exception(
                        "Couldn't find oligos with given framework, failed at assembly"
                    )
                curr_oligo = self.asm_f + self.asmf_re + self.nt_seq + self.stop + self.asmr_re + utils.rev_comp(
                    self.asm_r)
                self.oligos = []
                self.overlaps = []
                self.overlap_gc = []
                self.overlap_tm = []
            else:
                assert gene.count(self.asm_f) == 1, "Incorrect number AsmF"
                assert gene.count(
                    self.asmf_re) == 1, "Incorrect number AsmF RE"
                assert gene.count(utils.rev_comp(
                    self.asm_r)) == 1, "Incorrect number AsmR"
                assert gene.count(
                    self.asmr_re) == 1, "Incorrect number AsmR RE"
                all_oligos = True
            # either finished (loop condition ends the loop) or restarting
            continue
        # trim back to g or c
        while curr_oligo[-1] not in 'GC':
            next_oligo = curr_oligo[-1] + next_oligo
            curr_oligo = curr_oligo[:-1]
        # find the overlap: grow it leftwards from min_overlap until every
        # constraint holds or max_overlap / max_tm is exceeded
        overlap_pos = len(curr_oligo) - min_overlap + 1
        curr_tm = 0
        curr_gc = 0
        curr_ss_ddg = 10
        curr_dimer = 10
        while (curr_oligo[overlap_pos] not in 'GC'
               or not min_tm <= curr_tm <= max_tm
               or not min_gc <= curr_gc <= max_gc
               or (min_ddg is not None and not curr_ss_ddg > min_ddg)
               or (min_dimer is not None and not curr_dimer > min_dimer)):
            overlap_pos -= 1  # initial case accounted for in math above
            if overlap_pos < len(
                    curr_oligo
            ) - max_overlap or curr_tm > max_tm:  # Tm is never going to decrease
                break  # no good overlap... try different max length and
                # restart the loop
            # don't bother with expensive calcs if the loop is just going to fail anyway
            if curr_oligo[overlap_pos] not in 'GC':
                continue
            temp_overlap = Seq(curr_oligo[overlap_pos:])
            # tm calculation with salt correction for KOD reaction
            # (Seq / mt / GC are presumably Biopython's Bio.Seq.Seq,
            # Bio.SeqUtils.MeltingTemp and Bio.SeqUtils.GC -- confirm
            # against the file's imports)
            curr_tm = mt.Tm_NN(temp_overlap, Mg=1.5, dNTPs=0.8)
            curr_gc = GC(temp_overlap)
            # ViennaRNA external software
            if min_ddg is not None:
                curr_ss_ddg = utils.pred_ss_ddg(str(
                    temp_overlap))  # this calculation slows it down a LOT
            if min_dimer is not None:
                curr_dimer = utils.pred_dimer(str(temp_overlap),
                                              str(temp_overlap))
        # same condition as the loop above: still true here only if the
        # loop was left through `break`
        if (curr_oligo[overlap_pos] not in 'GC'
                or not min_tm <= curr_tm <= max_tm
                or not min_gc <= curr_gc <= max_gc
                or (min_ddg != None and not curr_ss_ddg > min_ddg)
                or (min_dimer != None and not curr_dimer > min_dimer)):
            # this means the above loop broke, so try diff max length
            # and restart the loop
            if curr_max < target_length:
                curr_max = 2 * target_length - curr_max
            else:
                curr_max = 2 * target_length - curr_max - 1
            if curr_max > available_nt or curr_max < max_overlap + 2:
                raise Exception(
                    "Couldn't find oligos with given framework, failed at melting temp"
                )
            curr_oligo = self.asm_f + self.asmf_re + self.nt_seq + self.stop + self.asmr_re + utils.rev_comp(
                self.asm_r)
            self.oligos = []
            self.overlaps = []
            self.overlap_gc = []
            self.overlap_tm = []
            continue
        # otherwise process the overlap!
        self.oligos.append(curr_oligo)
        self.overlaps.append(curr_oligo[overlap_pos:])
        self.overlap_gc.append(curr_gc)
        self.overlap_tm.append(curr_tm)
        curr_oligo = self.overlaps[-1] + next_oligo  # add in the overlap
        next_oligo = ""
    # add the gsps, type IIs, any buffer residues to everything
    full_oligos = []
    for i, oligo in enumerate(self.oligos):
        # add buffer between 3' GSP site, TypeIIs site to bring oligo
        # up to full size
        padding_size = available_nt - len(oligo)
        padding = self.rng.choice(['A', 'C', 'G', 'T'], padding_size)
        good_padding = False
        to_exclude = [
            self.typeIIs, self.asmf_re, self.asmr_re, self.gsp_f, self.gsp_r,
            self.asm_f, self.asm_r, 'AAAAA', 'GGGGG', 'CCCCC', 'TTTTT'
        ]
        to_exclude.extend(
            [utils.rev_comp(subseq) for subseq in to_exclude])
        primers = [
            self.gsp_f,
            utils.rev_comp(self.gsp_r), self.asm_f,
            utils.rev_comp(self.asm_r)
        ]
        left_boundary = oligo[-4:] + utils.rev_comp(self.typeIIs)
        # indices (within subseq below) that belong to the padding and may
        # therefore be mutated
        fixable_pos = set(
            range(len(left_boundary),
                  len(left_boundary) + padding_size))
        # make sure you're not getting any restriction/priming sites in the buffer bp that will
        # mess things up
        while not good_padding:
            subseq = left_boundary + "".join(padding) + utils.rev_comp(
                self.gsp_r)[:5]
            bad_pos = set()
            for site in to_exclude:  # include boundaries
                substr = subseq.find(site)
                if substr >= 0:
                    bad_pos.update(
                        fixable_pos.intersection(
                            range(substr, substr + len(site))))
            for primer in primers:
                match_len, bad_nt_pos, _ = utils.lcs(subseq, primer)
                if match_len > 10:
                    bad_pos.update(fixable_pos.intersection(bad_nt_pos))
                # primers are single stranded, but the templates are not (after one cycle, at least)
                # NOTE(review): bad_nt_pos here presumably indexes into the
                # reverse-complemented string, yet it is intersected with
                # forward-strand padding indices -- confirm how utils.lcs
                # reports positions.
                match_len, bad_nt_pos, _ = utils.lcs(
                    utils.rev_comp(subseq), primer)
                if match_len > 10:
                    bad_pos.update(fixable_pos.intersection(bad_nt_pos))
            if len(bad_pos) == 0:
                good_padding = True
            else:
                # redraw one offending padding base and re-check
                to_fix = self.rng.choice(list(bad_pos))
                padding[to_fix - len(left_boundary)] = self.rng.choice(
                    ['A', 'C', 'G', 'T'])
        padding = "".join(padding)
        complete_oligo = self.gsp_f + self.typeIIs + oligo + utils.rev_comp(self.typeIIs) + \
            padding + utils.rev_comp(self.gsp_r)
        # already checked the full assembly, so now make sure nothing was accidentally introduced
        # at boundaries
        assert complete_oligo.count(
            self.gsp_f) == 1, "GSP F not found in %d -th oligo" % i
        assert complete_oligo.count(utils.rev_comp(
            self.gsp_r)) == 1, "GSP_R not found in %d -th oligo" % i
        assert complete_oligo.count(utils.rev_comp(
            self.gsp_f)) == 0, "GSP_F RC found in %d -th oligo" % i
        assert complete_oligo.count(
            self.gsp_r) == 0, "GSP_R RC found in %d -th oligo" % i
        if self.typeIIs == utils.rev_comp(self.typeIIs):
            # palindromic site: both flanking copies read the same
            assert complete_oligo.count(
                self.typeIIs
            ) == 2, "Extra Type IIS sites in %d -th oligo" % i
        else:
            assert complete_oligo.count(
                self.typeIIs
            ) == 1, "Extra Type IIS sites in %d -th oligo" % i
            assert complete_oligo.count(
                utils.rev_comp(self.typeIIs)
            ) == 1, "Extra Type IIS sites in %d -th oligo" % i
        assert complete_oligo.count(utils.rev_comp(
            self.asm_f)) == 0, "AsmF RC in in %d -th oligo" % i
        assert complete_oligo.count(
            self.asm_r) == 0, "AsmR in %d -th oligo" % i
        # reverse-complement checks only make sense for non-palindromic sites
        if self.asmf_re != utils.rev_comp(self.asmf_re):
            assert complete_oligo.count(utils.rev_comp(
                self.asmf_re)) == 0, "AsmF RE RC in %d -th oligo" % i
        if self.asmr_re != utils.rev_comp(self.asmr_re):
            assert complete_oligo.count(utils.rev_comp(
                self.asmr_re)) == 0, "AsmR RE RC in %d -th oligo" % i
        full_oligos.append(complete_oligo)
    self.oligos = full_oligos
    return