def get_spcrs(sequence):
    last_rep = fuzzysearch.find_near_matches(Repeat[0:15], sequence.seq,
                                             max_l_dist=3)
    results = fuzzysearch.find_near_matches(Repeat, sequence.seq, max_l_dist=6)
    if (len(results) == 3 and len(last_rep) >= 1
            and last_rep[-1].start > results[-1].start):
        spacer_list = [sequence[results[0].end:results[1].start]]
        spacer_list.append(sequence[results[1].end:results[2].start])
        spacer_list.append(sequence[results[2].end:last_rep[-1].start])
    elif (len(results) == 2 and len(last_rep) >= 1
            and last_rep[-1].start > results[-1].start):
        spacer_list = [sequence[results[0].end:results[1].start]]
        spacer_list.append(sequence[results[1].end:last_rep[-1].start])
    elif (len(results) == 1 and len(last_rep) >= 1
            and last_rep[-1].start > results[-1].start):
        spacer_list = [sequence[results[0].end:last_rep[-1].start]]
    elif len(results) == 4:
        spacer_list = [sequence[results[0].end:results[1].start]]
        spacer_list.append(sequence[results[1].end:results[2].start])
        spacer_list.append(sequence[results[2].end:results[3].start])
    elif len(results) == 3:
        spacer_list = [sequence[results[0].end:results[1].start]]
        spacer_list.append(sequence[results[1].end:results[2].start])
    elif len(results) == 2:
        spacer_list = [sequence[results[0].end:results[1].start]]
    else:
        spacer_list = []
    return spacer_list

def is_covid19(entry):
    for covid_word in covid19_words:
        for key in ['abstract', 'title']:
            if key in entry.keys():
                if isinstance(entry[key], list):
                    text = " ".join(entry[key])
                else:
                    text = entry[key]
                if text is not None:
                    # "provide" is really painful here: its "ovid" substring
                    # fuzzy-matches "covid" within one edit, so those hits
                    # are filtered out.
                    if len([m for m in find_near_matches(covid_word, text,
                                                         max_l_dist=1,
                                                         max_substitutions=1)
                            if m.matched != "ovid"]) > 0:
                        return True
        for key in ['keywords_ML', 'keywords']:
            if key in entry.keys():
                if isinstance(entry[key], list):
                    text = " ".join(entry[key])
                else:
                    text = entry[key]
                if text is not None:
                    if len(find_near_matches(covid_word, text, max_l_dist=1,
                                             max_substitutions=1)) > 0:
                        return True
        if 'body_text' in entry.keys():
            if entry['body_text'] is not None:
                for e in entry['body_text']:
                    if 'Text' in e.keys() and e['Text'] is not None:
                        # search the paragraph text itself, not the `text`
                        # variable left over from the loops above
                        if len([m for m in find_near_matches(
                                    covid_word, e['Text'], max_l_dist=1,
                                    max_substitutions=1)
                                if m.matched != "ovid"]) > 0:
                            return True
    return False

def Kernel_func(seq1, seq2, k=5, tolerance=1, kernel='bio'):
    """
    Compute the kernel Kernel_func(seq1, seq2) directly;
    the projection is not made explicit.

    Inputs:
        seq1, seq2: DNA sequences (with ATGC letters)
        k: length of subsequences considered
    Output:
        Kernel value (a real number)
    """
    result = 0
    if kernel == 'mismatch':
        subseqs1 = seq2subseq(seq1, k)
        for subseq1 in subseqs1:
            result += len(find_near_matches(subseq1, seq2,
                                            max_l_dist=tolerance))
        subseqs2 = seq2subseq(seq2, k)
        for subseq2 in subseqs2:
            result += len(find_near_matches(subseq2, seq1,
                                            max_l_dist=tolerance))
    elif kernel == 'bio':
        result = pairwise2.align.globalxx(seq1, seq2, score_only=True,
                                          penalize_extend_when_opening=True,
                                          penalize_end_gaps=True)
    return result

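# Usage sketch for Kernel_func above (a minimal, hedged example, not the
# original project's code): `seq2subseq` is not defined in this excerpt, so a
# hypothetical k-mer enumerator is assumed here to make the snippet runnable.
from fuzzysearch import find_near_matches

def seq2subseq(seq, k):
    # Assumed helper: all overlapping k-length subsequences of `seq`.
    return [seq[i:i + k] for i in range(len(seq) - k + 1)]

def mismatch_kernel(seq1, seq2, k=5, tolerance=1):
    # Same counting scheme as Kernel_func(kernel='mismatch'): count
    # near-matches (within `tolerance` edits) of each k-mer of one
    # sequence inside the other, in both directions.
    result = 0
    for sub in seq2subseq(seq1, k):
        result += len(find_near_matches(sub, seq2, max_l_dist=tolerance))
    for sub in seq2subseq(seq2, k):
        result += len(find_near_matches(sub, seq1, max_l_dist=tolerance))
    return result

print(mismatch_kernel("ATGCGATGCA", "ATGCGTTGCA"))  # symmetric similarity score
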
def extract_information(file):
    path = Path(file)
    # TODO: use OCR when text cannot be extracted from the PDF
    string_1 = "corte suprema de justicia de la nación"
    string_2 = "oficina de violencia doméstica"
    i = 0
    all_pages = []
    for text in extract_text_from_pdf(path, full=True):
        matches_1 = find_near_matches(string_1, text.lower(), max_l_dist=4)
        matches_2 = find_near_matches(string_2, text.lower(), max_l_dist=4)
        if matches_1 and matches_2:
            all_pages.append(i)
        i += 1
    if len(all_pages) > 2:
        all_pages = filter_pages(all_pages, 10)
        all_pages = sorted(all_pages)
        with open(f"{path.stem}.txt", "w", encoding="UTF8",
                  errors="ignore") as f:
            for text in extract_text_from_pdf(path,
                                              pages=(all_pages[0],
                                                     all_pages[-1] + 1)):
                if len(text) > 0:
                    f.write("============ page ==============")
                    f.write(text)
    else:
        # "No OVD page was found"
        print("No se encontró ninguna página de la OVD")

def extract_barcode_v2(sequence1):
    '''
    Function to extract barcodes
    '''
    # Parse out barcodes if we can ID the constants
    try:
        # use some approximate, yet generous, indices to facilitate faster matching
        c1_hit = find_near_matches(c1, sequence1[7:25], max_l_dist=2)
        c2_hit = find_near_matches(c2, sequence1[23:42], max_l_dist=2)
        nxt_hit = find_near_matches(nxt, sequence1[33:65], max_l_dist=2)
        me_hit = find_near_matches(me, sequence1[55:], max_l_dist=2)

        # Now grab the barcodes
        bc1, mm1 = prove_barcode(sequence1[0:7], barcodes, n_mismatch)
        bc2, mm2 = prove_barcode(sequence1[c1_hit[0].end + 7:c2_hit[0].start + 23],
                                 barcodes, n_mismatch)
        bc3, mm3 = prove_barcode(sequence1[c2_hit[0].end + 23:nxt_hit[0].start + 33],
                                 barcodes, n_mismatch)
        seq = sequence1[me_hit[0].end + 55:]
        return (bc1 + "_" + bc2 + "_" + bc3, seq,
                str(mm1) + "," + str(mm2) + "," + str(mm3))
    except:
        return (dumb, sequence1, "0,0,0")

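# Illustration (hedged sketch, not part of the pipeline above): why
# extract_barcode_v2 adds offsets such as `+ 7`. find_near_matches reports
# positions relative to the slice it was given, so a hit found in
# sequence1[7:25] must be shifted by 7 to index the full read.
from fuzzysearch import find_near_matches

read = "AAAAAAACONSTANT1BBBBBBB"  # toy read; "CONSTANT1" stands in for c1
hit = find_near_matches("CONSTANT1", read[7:25], max_l_dist=2)[0]
absolute_start = hit.start + 7  # shift back into full-read coordinates
assert read[absolute_start:absolute_start + len("CONSTANT1")] == "CONSTANT1"
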
def extractbarcode_v2_tn5(sequence1):
    '''
    Function to extract barcodes
    '''
    # Parse out barcodes if we can ID the constants
    try:
        c1_hit = find_near_matches(c1, sequence1[7:25], max_l_dist=2)
        c2_hit = find_near_matches(c2, sequence1[23:42], max_l_dist=2)
        nxt_hit = find_near_matches(nxt, sequence1[33:65], max_l_dist=2)
        me_hit = find_near_matches(me, sequence1[55:], max_l_dist=2)

        # Now grab the barcodes
        bc1, mm1 = prove_barcode(sequence1[0:7], barcodes, n_mismatch)
        bc2, mm2 = prove_barcode(sequence1[c1_hit[0][1] + 7:c2_hit[0][0] + 23],
                                 barcodes, n_mismatch)
        bc3, mm3 = prove_barcode(sequence1[c2_hit[0][1] + 23:nxt_hit[0][0] + 33],
                                 barcodes, n_mismatch)
        bc_tn5, mm4 = prove_barcode(sequence1[nxt_hit[0][1] + 33:me_hit[0][0] + 55],
                                    tn5, n_mismatch)
        seq = sequence1[me_hit[0][1] + 55:]
        return (bc1 + "_" + bc2 + "_" + bc3 + "_" + bc_tn5, seq,
                str(mm1) + "," + str(mm2) + "," + str(mm3) + "," + str(mm4))
    except:
        return (dumb, sequence1, "0,0,0,0")

def test_levenshtein(self):
    """test cases where 0 < max_l_dist <= max(others)"""
    # in these cases, find_near_matches should call
    # find_near_matches_levenshtein
    self.patch_concrete_search_classes()
    self.mock_find_near_matches_levenshtein.return_value = \
        [mock.sentinel.SENTINEL]

    self.assertEqual(
        find_near_matches('a', 'a', 1, 1, 1, 1),
        [mock.sentinel.SENTINEL],
    )
    self.assertEqual(self.mock_find_near_matches_levenshtein.call_count, 1)

    self.assertEqual(
        find_near_matches('a', 'a', 2, 2, 2, 2),
        [mock.sentinel.SENTINEL],
    )
    self.assertEqual(self.mock_find_near_matches_levenshtein.call_count, 2)

    self.assertEqual(
        find_near_matches('a', 'a', 5, 3, 7, 2),
        [mock.sentinel.SENTINEL],
    )
    self.assertEqual(self.mock_find_near_matches_levenshtein.call_count, 3)

def test_levenshtein(self):
    """test cases where 0 < max_l_dist <= max(others)"""
    # in these cases, find_near_matches should call
    # find_near_matches_levenshtein
    self.patch_concrete_search_methods()
    self.mock_find_near_matches_levenshtein.return_value = \
        [mock.sentinel.SENTINEL]

    self.assertEqual(
        find_near_matches('a', 'a', 1, 1, 1, 1),
        [mock.sentinel.SENTINEL],
    )
    self.assertEqual(self.mock_find_near_matches_levenshtein.call_count, 1)

    self.assertEqual(
        find_near_matches('a', 'a', 2, 2, 2, 2),
        [mock.sentinel.SENTINEL],
    )
    self.assertEqual(self.mock_find_near_matches_levenshtein.call_count, 2)

    self.assertEqual(
        find_near_matches('a', 'a', 5, 3, 7, 2),
        [mock.sentinel.SENTINEL],
    )
    self.assertEqual(self.mock_find_near_matches_levenshtein.call_count, 3)

def extract_cdrs(sequence1, sequence2):
    '''
    Function to extract barcodes
    '''
    # Parse out barcodes if we can ID the constants
    try:
        # use some approximate, yet generous, indices to facilitate faster matching
        cdr3_hit1 = find_near_matches(const_cdr3_R1, sequence1[17:42],
                                      max_l_dist=n_mismatch)
        # c2R1_hit = find_near_matches(const2R1, sequence1[50:], max_l_dist=n_mismatch)
        cdr1_hit1 = find_near_matches(const_cdr1_R2, sequence2[60:85],
                                      max_l_dist=n_mismatch)
        cdr2_hit1 = find_near_matches(const_cdr2_R2, sequence2[115:145],
                                      max_l_dist=n_mismatch)

        # R1 stuff
        cdr3longseq = reverse_complement_table(
            sequence1[(cdr3_hit1[0].end + 17):(cdr3_hit1[0].end + 17 + cdr3n_long)])
        cdr3medseq = reverse_complement_table(
            sequence1[(cdr3_hit1[0].end + 17):(cdr3_hit1[0].end + 17 + cdr3n_med)])
        cdr3shortseq = reverse_complement_table(
            sequence1[(cdr3_hit1[0].end + 17):(cdr3_hit1[0].end + 17 + cdr3n_short)])

        # Try to decide which CDR3 by identifying the immediate downstream
        # constant sequence
        upstream_const_seq = "TATTATTGCGCG"
        if len(find_near_matches(upstream_const_seq, cdr3medseq[0:15],
                                 max_l_dist=n_mismatch)) > 0:
            whichone = "short"
            cdr3 = cdr3shortseq
        elif len(find_near_matches(upstream_const_seq, cdr3longseq[0:15],
                                   max_l_dist=n_mismatch)) > 0:
            whichone = "medium"
            cdr3 = cdr3medseq
        else:
            whichone = "long"
            cdr3 = cdr3longseq

        # R2 stuff
        cdr1 = sequence2[(cdr1_hit1[0].end + 60):(cdr1_hit1[0].end + 60 + cdr1n)]
        cdr2 = sequence2[(cdr2_hit1[0].end + 115):(cdr2_hit1[0].end + 115 + cdr2n)]
        return (cdr1, cdr2, cdr3,
                translate_dna_to_protein(cdr1),
                translate_dna_to_protein(cdr2),
                translate_dna_to_protein(cdr3),
                whichone)
    except:
        return ("NA", "NA", "NA", "NA", "NA", "NA", "NA")

def process_speech(transcript):
    if find_near_matches('mantra', transcript, max_l_dist=2):
        return 'mantra'
    if find_near_matches('breath', transcript, max_l_dist=2):
        return 'breathing'
    if find_near_matches('focus', transcript, max_l_dist=2):
        return 'object'
    # pick a random command
    print("we're going random")
    index = random.randint(0, len(commands) - 1)
    return commands[index]

def is_need_out_barcode(self):
    """
    Check whether the outer barcode is present in both reads of a pair.

    Returns:
        True or False
    """
    if fuzzysearch.find_near_matches(self.out_barcode,
                                     self.ob_read1.description,
                                     1, 0, 0, 1) \
            and fuzzysearch.find_near_matches(self.out_barcode,
                                              self.ob_read2.description,
                                              1, 0, 0, 1):
        return True
    else:
        return False

def find_specific_chunk(chunk_needle, haystack, common_prefix,
                        common_prefix_matches, max_substs, max_inserts,
                        max_dels):
    max_levenshtein = max_substs + max_inserts + max_dels
    results = []
    # tobytes() replaces the long-deprecated numpy tostring()
    chunk_needle_str = chunk_needle.astype(np.uint8).tobytes()
    haystack_str = haystack.astype(np.uint8).tobytes()
    for i in range(len(common_prefix_matches)):
        # We have an approximate match against the common prefix.
        # To speed things up, now search for an approximate match
        # for the remaining part of the string only, with a max
        # Levenshtein distance that keeps the distance for the whole
        # string (common prefix plus unique part) within our bounds.
        common_length = len(common_prefix)
        corrupted_common_match_len = len(common_prefix_matches[i][3])
        start_pos = common_prefix_matches[i][0]
        needle_not_common = chunk_needle_str[common_length:]
        hay_not_common = haystack_str[
            start_pos + corrupted_common_match_len:
            start_pos + corrupted_common_match_len
            + len(chunk_needle) - common_length + max_inserts]
        fuzzy_result = fuzzysearch.find_near_matches(
            needle_not_common, hay_not_common, max_substs, max_inserts,
            max_dels, max_levenshtein - common_prefix_matches[i][2])
        # The actual fuzzy match data gives no useful information
        # because all the offsets are off. So instead just record the
        # match start position given by common_prefix_matches, and the end
        # position according to the fuzzy search result.
        # Sometimes we get a false positive (match) with a nonzero
        # start index, which would count as an insertion error over the
        # whole string. So do a search of the whole string if we get a
        # match, and only *then* admit it.
        if len(fuzzy_result) > 0:
            fuzzy_result = fuzzysearch.find_near_matches(
                chunk_needle_str,
                haystack_str[start_pos:start_pos + len(chunk_needle)
                             + max_inserts],
                max_substs, max_inserts, max_dels)
            if len(fuzzy_result) > 0:
                result = fuzzy_result[0]
                results.append((result.start + start_pos,
                                result.end + start_pos, result.dist))
    return results

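# Standalone sketch (hedged) of the two-stage idea used above: anchor on a
# short common prefix first, then fuzzy-search only the remainder with the
# leftover error budget. Toy byte strings here; the real code runs on uint8
# buffers converted from numpy arrays.
from fuzzysearch import find_near_matches

needle = b"PREFIXuniquepart"
hay = b"xxxxPREFIKuniqueparTyyyy"  # one error in the prefix, one in the tail
prefix = b"PREFIX"
budget = 2  # total Levenshtein budget for the whole needle

for pm in find_near_matches(prefix, hay, max_l_dist=budget):
    rest = needle[len(prefix):]
    tail = hay[pm.end:pm.end + len(rest) + budget]
    # spend only what the prefix match left over
    if find_near_matches(rest, tail, max_l_dist=budget - pm.dist):
        print("candidate match starting at", pm.start)
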
def trim_primer(
    seq_record: SeqIO.SeqRecord,
    primer_seqs: List[str],
    max_mismatch: Union[float, int] = 0.14,
) -> SeqIO.SeqRecord:
    """
    Trim primer sequences.

    Parameters
    ----------
    seq_record : Bio.Seq.SeqRecord
        input sequence record
    primer_seqs : list
        list of the forward and reverse primer sequences
    max_mismatch : float
        Maximum number (or proportion) of mismatches allowed when searching
        for primer sequences (default: 0.14)
    """
    seq = seq_record.seq
    fwd, rev = primer_seqs
    rev_rc = revc(rev)
    len_fwd, len_rev = len(fwd), len(rev)
    if max_mismatch > 1:
        max_l_dist1 = max_l_dist2 = max_mismatch
    elif max_mismatch > 0:
        max_l_dist1 = round(len_fwd * max_mismatch)
        max_l_dist2 = round(len_rev * max_mismatch)
    else:
        raise ValueError("max_mismatch must be a positive value")
    m0 = find_near_matches(fwd, str(seq), max_l_dist=max_l_dist1)
    m1 = find_near_matches(rev_rc, str(seq), max_l_dist=max_l_dist2)
    if len(m0) > 0:
        match_fwd = get_best_match_in_group(m0)
    if len(m1) > 0:
        match_rev_rc = get_best_match_in_group(m1)
    if len(m0) > 0 and len(m1) > 0:
        tr = seq_record[match_fwd.end : match_rev_rc.start]
    elif len(m0) > 0:
        tr = seq_record[match_fwd.end :]
    elif len(m1) > 0:
        tr = seq_record[: match_rev_rc.start]
    else:
        tr = seq_record[:]
    return tr

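# Usage sketch for trim_primer (hedged): the read and primers below are made
# up, and `revc` / `get_best_match_in_group` are assumed to be defined in the
# surrounding module as the function expects.
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

record = SeqRecord(Seq("TTTACGCA" + "AAAACCCC" + "ATGGATCC"), id="read1")
# forward primer, then reverse primer (revc("GGATCCAT") == "ATGGATCC")
trimmed = trim_primer(record, ["TTTACGCA", "GGATCCAT"], max_mismatch=0.14)
print(trimmed.seq)  # the insert between the two primer hits: AAAACCCC
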
def fuzzy_search(string, primer_pairs, max_dist):
    """ Search `string` for both primer sequences """
    for (i, primer_pair) in enumerate(primer_pairs):
        search1 = fuzzysearch.find_near_matches(primer_pair[0], string,
                                                max_l_dist=max_dist)
        if search1:
            search2 = fuzzysearch.find_near_matches(primer_pair[1], string,
                                                    max_l_dist=max_dist)
            if search2:
                return i, search1[0], search2[0]

def searchText_old(self, names, titles, channelName):
    for name in names:
        for screenshot in self.channelData:
            if ("TESSERACT" not in screenshot["customLabels"]
                    and "ACTIVATION" not in screenshot["customLabels"]
                    and "LOGIN" not in screenshot["customLabels"]):
                screenshot["imdbName"] = []
                screenshot["ocrName"] = []
                screenshot["nameMatchDistance"] = {}
                compareResult = find_near_matches(
                    name, screenshot["textBody"],
                    max_l_dist=self.matchingDistance)
                if len(compareResult) == 0:
                    continue
                else:
                    if compareResult[0].end != compareResult[0].start:
                        if "IMDB_NAME" not in screenshot["customLabels"]:
                            screenshot["customLabels"].append("IMDB_NAME")
                        screenshot["imdbName"].append(name)
                        screenshot["ocrName"].append(
                            screenshot["textBody"][compareResult[0].start:compareResult[0].end])
                        screenshot["nameMatchDistance"][name] = compareResult[0].dist
    for title in titles:
        if len(title) < MIN_IMDB_SEARCH_LEN:
            continue
        if EXCLUDE_MENU_INTERFACE_STRINGS and title in MENU_INTERFACE_STRINGS:
            continue
        for screenshot in self.channelData:
            if ("TESSERACT" in screenshot["customLabels"]
                    or "ACTIVATION" in screenshot["customLabels"]
                    or "LOGIN" in screenshot["customLabels"]):
                continue
            screenshot["imdbTitle"] = []
            screenshot["ocrTitle"] = []
            screenshot["titleMatchDistance"] = {}
            for screenshot_str in screenshot["textBody"].split('\n'):
                if not screenshot_str or len(screenshot_str) < MIN_IMDB_SEARCH_LEN:
                    continue
                compareResult = find_near_matches(
                    screenshot_str, title,
                    max_l_dist=self.matchingDistance,
                    max_deletions=0, max_insertions=0)
                # compareResult = find_near_matches(title, screenshot["textBody"], max_l_dist=self.matchingDistance)
                if len(compareResult) == 0:
                    continue
                else:
                    if compareResult[0].end != compareResult[0].start:
                        if "IMDB_TITLE" not in screenshot["customLabels"]:
                            screenshot["customLabels"].append("IMDB_TITLE")
                        print("Matched with IMDB title", screenshot_str,
                              title, self.matchingDistance)
                        print("compareResult", compareResult)
                        screenshot["imdbTitle"].append(title)
                        screenshot["ocrTitle"].append(
                            screenshot["textBody"][compareResult[0].start:compareResult[0].end])
                        screenshot["titleMatchDistance"][title] = compareResult[0].dist

def extract_and_match(sequence):
    left_flank = fuzzysearch.find_near_matches(
        Region_dict[edit_site]['flanking'][0], sequence, max_l_dist=4)
    right_flank = fuzzysearch.find_near_matches(
        Region_dict[edit_site]['flanking'][1], sequence, max_l_dist=4)
    if len(left_flank) == 1 and len(right_flank) == 1:
        region = sequence[left_flank[0].end:right_flank[0].start]
        if region == Region_dict[edit_site]['wt_or_edited'][0]:
            return 'wt'
        elif region == Region_dict[edit_site]['wt_or_edited'][1]:
            return 'edited'
        else:
            return 'undetermined_no_site_match'
    else:
        return 'undetermined_no_flanking_match'

def get_position(context_no_space, text_with_space, valid_length, max_error,
                 margin_error):
    context = context_no_space
    text = text_with_space.split()
    accumulate_position = 0
    count = 0
    for word in text:
        if len(word) < valid_length:
            continue
        error_distance = len(word) - margin_error
        if error_distance < 0:
            error_distance = 0
        if error_distance > max_error:
            error_distance = max_error
        all_position = find_near_matches(word, context,
                                         max_l_dist=error_distance)
        for position in all_position:
            count += 1
            accumulate_position += (position[0] + position[1]) / 2
    if count == 0:
        return -1
    return accumulate_position / count

def phrase_match(phrase_list, page_text, max_l_dist=4):
    '''Fuzzy-searches `page_text` for phrases passed in `phrase_list`.
    String matching is performed using the
    [fuzzysearch](https://github.com/taleinat/fuzzysearch) Python package.

    :param phrase_list: phrase list corresponding to a certain form component
    :type phrase_list: list
    :param page_text: page text
    :type page_text: string
    :param max_l_dist: maximum Levenshtein distance for fuzzysearch, default 4
    :type max_l_dist: integer
    :returns: bool depending on whether a phrase was matched within the given
        Levenshtein distance
    :rtype: bool
    '''
    detected_phrase = False

    #####################################################
    # Iterate through the phrase list and detect matches
    #####################################################
    # Fuzzysearch: https://github.com/taleinat/fuzzysearch
    for phrase in phrase_list:
        # Search page text for the phrase within max_l_dist character edits
        # (Levenshtein distance)
        if len(fuzzysearch.find_near_matches(phrase.lower(),
                                             page_text.lower(),
                                             max_l_dist=max_l_dist)):
            detected_phrase = True
            break
    return detected_phrase

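# Usage sketch for phrase_match: fuzzy matching tolerates small OCR errors
# (two substitutions here, well within the default max_l_dist of 4).
page = "Pleose sign and dote this form before submission."
print(phrase_match(["please sign and date"], page))  # True
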
def fuzzyExtract(self, qs, ls, threshold):
    '''
    Fuzzy-matches `qs` in `ls` and returns a list of tuples of (word, index).

    TODO: fuzzy search separation in words
    :param qs: query string
    :param ls: large string
    :param threshold: threshold
    :return:
    '''
    if len(qs) < self.fuzzySearchOptimumLength:
        processThreshold = 60
        max_l_dist = 0
    else:
        processThreshold = threshold
        max_l_dist = 1
    for word, confidence in process.extractBests(
            qs, (ls,), score_cutoff=processThreshold):
        print('word {}'.format(word), confidence)
        for match in find_near_matches(qs, word, max_l_dist=max_l_dist):
            match = word[match.start:match.end]
            print('match {}'.format(match))
            index = ls.find(match)

def search_subtitle_2(subtitles, line, offset):
    allsub = ""
    partialsum = []
    for sub in subtitles:
        allsub = allsub + sub.content + " "
        partialsum.append(len(allsub))
    line = line.lower()
    allsub = allsub.lower()
    t1 = datetime.now()
    result = find_near_matches(line, allsub, max_l_dist=5)
    t2 = datetime.now()
    print((t2 - t1).seconds)
    if len(result) != 1:
        return -1
    result = result[0]
    print(result)
    if offset > 0:
        # Searching for "after":
        # find the index of the subtitle containing the end of the substring
        i = 0
        for val in partialsum:
            if val > result.end:
                return i
            i += 1
    else:
        # Searching for "before":
        # find the index of the subtitle containing the beginning of the substring
        i = len(partialsum)
        for val in reversed(partialsum):
            if val < result.start:
                return i + 1
            i -= 1
    return -1

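# Note (hedged aside, not the author's code): the linear scans over
# `partialsum` above map a character offset in the concatenated subtitle text
# back to a subtitle index; the bisect module does the same lookup in
# O(log n), since `partialsum` is sorted by construction.
import bisect

def subtitle_index(partialsum, char_pos):
    # partialsum[i] is the length of the concatenation up to subtitle i
    return bisect.bisect_right(partialsum, char_pos)

print(subtitle_index([10, 25, 40], 12))  # offset 12 falls in subtitle 1
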
def extract_information(pages_list):
    string_1 = "corte suprema de justicia de la nación"
    string_2 = "oficina de violencia doméstica"
    pages_dict = {}
    for idx, text in enumerate(pages_list):
        matches_1 = find_near_matches(string_1, text.lower(), max_l_dist=4)
        matches_2 = find_near_matches(string_2, text.lower(), max_l_dist=4)
        if matches_1 and matches_2:
            pages_dict[idx] = text
    return pages_dict

def get_barcode_index(fq, barcode='hiseq'):
    sys.stdout.write("%s get barcode index start process at %s"
                     % (fq, time.ctime()))
    if re.findall(r'gz', fq):
        fq = gzip.open(fq)
    out_barcode = setting.SeqIndex.out_barcode
    barcode_index = {}
    for k, v in enumerate(setting.SeqIndex.barcode[barcode]):
        barcode_index[v] = k + 1
    iter_fq = SeqIO.parse(fq, "fastq")
    l_barcode = []
    for ob_fq in iter_fq:
        fq_out_barcode = ob_fq.description[-6:]
        fq_barcode = str(ob_fq.seq)[:6]
        try:
            if fuzzysearch.find_near_matches(out_barcode, fq_out_barcode,
                                             1, 1, 1, 1):
                out_str = ('1\t' + fq_out_barcode + '\t'
                           + str(barcode_index[fq_barcode]) + '\t' + fq_barcode)
            else:
                out_str = ('0\t' + fq_out_barcode + '\t'
                           + str(barcode_index[fq_barcode]) + '\t' + fq_barcode)
        except KeyError:
            out_str = '2\t' + fq_out_barcode
        l_barcode.append(out_str)
    return l_barcode

def return_song():
    args = sys.argv[1:]
    n_args = len(args)
    access_token = __get_token()
    try:
        with open('./assets/genres.json', 'r') as infile:
            valid_genres = json.load(infile)
    except FileNotFoundError:
        print("Couldn't find genres file!")
    if n_args == 0:
        selected_genre = random.choice(valid_genres)
    else:
        selected_genre = (" ".join(args)).lower()
    if selected_genre in valid_genres:
        result = __request_valid_song(access_token, genre=selected_genre)
        return result
    else:
        valid_genres_to_text = " ".join(valid_genres)
        try:
            closest_genre = find_near_matches(selected_genre,
                                              valid_genres_to_text,
                                              max_l_dist=2)[0].matched
            result = __request_valid_song(access_token, genre=closest_genre)
            return result
        except IndexError:
            print("Genre not found")

def get_fuzzymatches(sentence: str, text: str, q_factor: int, qmax: int,
                     qstep: int) -> List[Match]:
    """
    .. py:function:: get_fuzzymatches(sentence, text, q_factor, qmax, qstep)

    Finds fuzzy matches of sentence in text

    :param str sentence: Input sentence to find
    :param str text: Input text
    :param int q_factor: Initial max Levenshtein distance
    :param int qmax: Maximum Levenshtein distance
    :param int qstep: Levenshtein distance increment step
    :return: List of fuzzy matches
    :rtype: list[Match]
    """
    fuzzymatches = []
    while q_factor <= qmax:
        fuzzymatches = find_near_matches(sentence.lower(), text,
                                         max_l_dist=q_factor)
        if fuzzymatches:
            break
        q_factor += qstep
        if q_factor >= qmax:
            logger.warning("Cannot continue fuzzing. Max Q-Factor reached. "
                           f"The sentence is `{sentence}`")
            break
    return fuzzymatches

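# Usage sketch for get_fuzzymatches: start strict and loosen max_l_dist one
# step at a time until something matches (values below are illustrative).
text = "the quick brown fox jumps over the lazy dog"
matches = get_fuzzymatches("quick brwn fox", text, q_factor=1, qmax=5, qstep=1)
print(matches[0].start, matches[0].dist)  # e.g. 4 1 (one missing character)
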
def fuzzy_search_extract_in_orig_doc(original_doc_text, searched_text,
                                     stored_matches):
    # Note: scripts return blanks instead of null values
    if searched_text != '':
        matches = []
        threshold = 1
        while matches == [] and threshold < 10:
            threshold += 2
            matches = find_near_matches(searched_text, original_doc_text,
                                        max_l_dist=threshold)
        match = get_first_elem_or_none(matches)
        if len(matches) == 1:
            return ((match[0], match[1]), match)
        while match:
            if not check_if_stored_already(match, stored_matches):
                print_if_debug(("MATCH FOUND: ",
                                original_doc_text[match[0]:match[1]],
                                threshold))
                matches += match
                return ((match[0], match[1]), match)
            matches = matches[1:len(matches)]
            match = get_first_elem_or_none(matches)
    return None

def pattern():
    with open('cutibacterium_acnes.fasta', 'r') as genome:
        seq = SeqIO.read(genome, 'fasta').seq
    sequence = st.text_area('Sequence to search in C. acnes genome')
    mismatch = st.number_input('Select number of mismatches', value=0,
                               min_value=0, max_value=5)
    if len(sequence) < 15:
        st.error('Please write a sequence longer than 15bp')
    subseq = Seq(sequence.upper())
    if sequence:
        fw = fz.find_near_matches(subseq, seq, max_l_dist=mismatch,
                                  max_deletions=0, max_insertions=0)
        rv = fz.find_near_matches(subseq.reverse_complement(), seq,
                                  max_l_dist=mismatch, max_deletions=0,
                                  max_insertions=0)
        if fw or rv:
            st.write(
                f'## You have {len(fw)+len(rv)} matches for your sequence in KPA'
            )
            c1, c2, c3, c4 = st.beta_columns(4)
            c1.write('### Position')
            c2.write('### Mismatch')
            c3.write('### Sequence')
            c4.write('### Direction')
            for match in fw:
                c1.write(match.start)
                c2.write(match.dist)
                c3.write(match.matched)
                c4.write('`forward`')
            for match in rv:
                c1.write(match.start)
                c2.write(match.dist)
                c3.write(match.matched)
                c4.write('`reverse`')
        else:
            st.error('No matches found')

def get_fuzzy_search(input_text, subtitle_data, max_dist=5):
    match_str = find_near_matches(input_text, subtitle_data,
                                  max_l_dist=max_dist)
    if len(match_str) > 0:
        return match_str[0].start
    else:
        return -1

def test_all_zero(self):
    self.patch_concrete_search_methods()
    self.mock_search_exact.return_value = [42]
    self.assertEqual(
        find_near_matches('a', 'a', 0, 0, 0, 0),
        [Match(42, 43, 0)],
    )
    self.assertEqual(self.mock_search_exact.call_count, 1)

def search(self, subsequence, sequence, start_index=0, end_index=None):
    if end_index is None:
        end_index = len(sequence)
    sequence = sequence[start_index:end_index]
    return [
        start_index + match.start
        for match in find_near_matches(subsequence, sequence, max_l_dist=0)
    ]

def test_all_zero(self):
    self.patch_concrete_search_classes()
    self.mock_search_exact.return_value = [Match(42, 43, 0)]
    self.assertEqual(
        find_near_matches('a', 'a', 0, 0, 0, 0),
        [Match(42, 43, 0)],
    )
    self.assertEqual(self.mock_search_exact.call_count, 1)

def fuzzy_regex(label, string_list):
    ret = ''
    # Only remove the best-matched string if we have a good score. If the
    # score is poor, it's possible that it's a wrong match, and deleting it
    # would cause that field to end up with a wrong match.
    threshold_to_remove = 85
    filtered_string_list = list(
        filter(lambda x: len(x) >= len(label), string_list))
    if len(filtered_string_list) == 0:
        ret = Regex_Utils.REGEX_FAILURE
    else:
        # First, search and extract the highest match:
        string_match = process.extractOne(label, filtered_string_list,
                                          scorer=fuzz.partial_ratio)
        string = string_match[0]
        score = string_match[1]
        # If it's a sufficiently high match, then remove it from the list to
        # help subsequent searches
        if score >= threshold_to_remove:
            string_list.remove(string)
        # Fuzzysearch for the best match of label within the string:
        match = find_near_matches(label, string, max_l_dist=2)
        # Need to sort and pull out the best match:
        if len(match) == 0:
            ret = Regex_Utils.REGEX_FAILURE
        else:
            best_match = sorted(match, key=lambda i: i.dist)[0]
            # Convert the best match into an actual string slice
            best_match_string = string[best_match.start:best_match.end]
            # Construct a regex based on this slice
            regex = best_match_string + r'\s*(.*)'
            try:
                # Perform the regex search to find the text of interest
                output = re.search(regex, string)
                ret = output.group(1)
            except:
                print(label + ' Failed searches')
                ret = Regex_Utils.REGEX_FAILURE
    return ret, string_list

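# The core idea of fuzzy_regex in isolation (hedged sketch, independent of
# Regex_Utils and fuzzywuzzy): fuzzy-locate a label in noisy OCR text, then
# regex-capture whatever follows the matched span.
import re
from fuzzysearch import find_near_matches

line = "Invoice Numbor: INV-00042"  # OCR typo in the label
hits = find_near_matches("Invoice Number", line, max_l_dist=2)
best = sorted(hits, key=lambda m: m.dist)[0]
value = re.search(re.escape(line[best.start:best.end]) + r'\s*:?\s*(.*)',
                  line).group(1)
print(value)  # INV-00042
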
def search(description: str, question: list):
    # `description` is split into a list of sentences
    sentences = list(filter(None, split_text(description)))
    lowerSentences = []
    for sentence in sentences:
        lowerSentences.append(sentence.lower())
    resultSentences = []
    for keyword in question:
        if type(keyword) != dict:
            if len(keyword) < 5:
                max_l_dist = 0
                max_deletions = 0
                max_insertions = 0
                max_substitutions = 0
            else:
                max_l_dist = 2
                max_deletions = 4
                max_insertions = 1
                max_substitutions = 0
            for sentence in lowerSentences:
                if find_near_matches(keyword, sentence,
                                     max_l_dist=max_l_dist,
                                     max_deletions=max_deletions,
                                     max_insertions=max_insertions,
                                     max_substitutions=max_substitutions
                                     ) and sentence not in resultSentences:
                    resultSentences.append(sentence)
        else:
            synonym = list(keyword.keys())[0]
            addWords = list(keyword.values())[0]
            for word in addWords:
                for sentence in lowerSentences:
                    if find_near_matches(synonym + word, sentence,
                                         max_l_dist=2,
                                         max_deletions=4,
                                         max_insertions=2,
                                         max_substitutions=0
                                         ) and sentence not in resultSentences:
                        resultSentences.append(sentence)
    # print(resultSentences)
    return resultSentences

def flanktrim():
    for record in SeqIO.parse(args.fasta, "fasta"):
        seqid = record.name
        seq = record.seq
        fwflank = args.fwflank
        rvflank = args.rvflank
        ed1 = round(len(fwflank) * float(args.edit))
        ed2 = round(len(rvflank) * float(args.edit))
        fwmatch = find_near_matches(fwflank, seq, max_l_dist=ed1)
        rvmatch = find_near_matches(rvflank, seq, max_l_dist=ed2)
        # continue with next locus if either forward or reverse match is not found
        if len(fwmatch) == 0 or len(rvmatch) == 0:
            continue
        # for both forward and reverse, pick the match with the lowest edit distance
        else:
            best_fw = min(fwmatch, key=lambda m: m.dist)
            fwstart, fwend = best_fw.start, best_fw.end
            fwdist, fwseq = best_fw.dist, best_fw.matched
            best_rv = min(rvmatch, key=lambda m: m.dist)
            rvstart, rvend = best_rv.start, best_rv.end
            rvdist, rvseq = best_rv.dist, best_rv.matched
        # skip locus if the reverse match ends upstream of the forward match
        if rvend < fwstart:
            continue
        # trim region
        else:
            trim = seq[fwend:rvstart]
            trimlen = len(trim)
        # check if trimmed sequence is within the fragment length range
        if trimlen <= 20 or trimlen >= 300:
            continue
        print(seqid, "\t", trim, "\t", trimlen, "\t", fwdist, "\t", rvdist, "\t")

def is_need_out_barcode(self):
    """
    Check whether the outer barcode is present in both reads of a pair.

    Returns:
        True or False
    """
    try:
        if re.match(r'[ATCG]',
                    self.ob_read1.description.split()[1].split(":")[-1].strip()):
            if fuzzysearch.find_near_matches(self.out_barcode,
                                             self.ob_read1.description,
                                             1, 0, 0, 1) \
                    and fuzzysearch.find_near_matches(self.out_barcode,
                                                      self.ob_read2.description,
                                                      1, 0, 0, 1):
                return True
            else:
                return False
        else:
            return True
    except IndexError:
        return True

def test_all_none_except_max_l_dist(self):
    self.patch_concrete_search_classes()
    self.mock_find_near_matches_levenshtein.return_value = [Match(42, 43, 0)]
    self.assertEqual(
        find_near_matches('a', 'a', max_l_dist=1),
        [Match(42, 43, 0)],
    )
    self.assertEqual(self.mock_find_near_matches_levenshtein.call_count, 1)

def search_teacher(self, teacher_name):
    matched_teachers = []
    all_teachers = self.teachers_database.find()
    for teacher in all_teachers:
        if len(find_near_matches(teacher_name, teacher['name'],
                                 max_l_dist=1)) > 0:
            matched_teachers += [teacher]
    return matched_teachers

def test_only_substitutions(self):
    self.patch_concrete_search_classes()
    self.mock_find_near_matches_substitutions.return_value = [Match(42, 43, 0)]
    self.assertEqual(
        find_near_matches('a', 'a', 1, 0, 0),
        [Match(42, 43, 0)],
    )
    self.assertEqual(
        self.mock_find_near_matches_substitutions.call_count, 1,
    )

    self.assertEqual(
        find_near_matches('a', 'a', 1, 0, 0, 1),
        [Match(42, 43, 0)],
    )
    self.assertEqual(
        self.mock_find_near_matches_substitutions.call_count, 2,
    )

def test_generic(self):
    self.patch_concrete_search_classes()
    self.mock_find_near_matches_generic.return_value = [Match(42, 43, 0)]
    self.assertEqual(
        find_near_matches('a', 'a', 1, 1, 1),
        [Match(42, 43, 0)],
    )
    self.assertEqual(
        self.mock_find_near_matches_generic.call_count, 1,
    )

    self.assertEqual(
        find_near_matches('a', 'a', 1, 1, 1, 2),
        [Match(42, 43, 0)],
    )
    self.assertEqual(
        self.mock_find_near_matches_generic.call_count, 2,
    )

def define_adapter_presence_substitutions_only(self, adapter, max_substitutions):
    """Sets the adapter_present attribute according to whether a match is
    found with a number of substitutions less than or equal to
    max_substitutions.

    Args:
        self, adapter, max_substitutions

    Returns:
        None
    """
    # Solves the straightforward case where the adapter is exactly present
    # in the read
    if adapter in self.sequence_line:
        self.adapter_present = True
        return
    # Otherwise, look for an approximate match of the adapter, less than
    # max_substitutions different.
    # First do a preliminary (hopefully faster) check whether the adapter is
    # present as an approximate match in the read sequence within a
    # Levenshtein distance equal to the allowed number of substitutions.
    # Two sequences less than N substitutions apart are automatically less
    # than N edits apart (the reciprocal is not true). If the Levenshtein
    # code is faster, it saves a lot of time by pre-filtering the reads
    # within the Levenshtein distance, which are much more likely to contain
    # adapters. The slower code checking the mismatch distance is then
    # called only on these pre-filtered reads, to confirm whether there is
    # an actual substituted match or whether the Levenshtein match involved
    # insertions and deletions.
    # If 1 or more approximate matches of the adapter were found within a
    # Levenshtein distance of max_substitutions:
    if len(fuzzysearch.find_near_matches(adapter, self.sequence_line,
                                         max_substitutions)):
        # scan the read sequence for a potential substituted match:
        # for each subsequence of sequence_line of length identical to the
        # adapter (the last starting position is the adapter's length before
        # the end of the read)
        for index in range(len(self.sequence_line) - len(adapter) + 1):
            # check if the adapter is less than max_substitutions away from
            # the subsequence
            result = ApproxMatch.approx_substitute(
                adapter, self.sequence_line[index:index + len(adapter)],
                max_substitutions)
            # If a match was found, set the adapter presence to True (and
            # stop the function here)
            if result:
                self.adapter_present = True
                return

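# Aside (hedged): fuzzysearch can also perform the substitution-only
# confirmation directly, which would replace the manual sliding-window scan
# above with a single call:
from fuzzysearch import find_near_matches

def has_substituted_match(adapter, read, max_substitutions):
    # insertions and deletions disabled, so only substituted matches count
    return bool(find_near_matches(adapter, read,
                                  max_substitutions=max_substitutions,
                                  max_insertions=0, max_deletions=0))

print(has_substituted_match("ACGTACGT", "TTACGAACGTTT", 1))  # True: one sub
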
def fuzzy_search_barcode(self, barcodes_name, inner_barcode):
    """
    Fuzzy-search the inner barcode against the barcode table (tolerates
    errors introduced by the sequencing vendor).

    :param barcodes_name:
    :param inner_barcode:
    :return: the position in the barcode table
    """
    position = 0
    flag = 0
    tag = ''
    while position < len(setting.SeqIndex.barcode[barcodes_name]):
        if flag > 1:
            tag = ''
            break
        if fuzzysearch.find_near_matches(
                setting.SeqIndex.barcode[barcodes_name][position],
                inner_barcode, 1, 1, 1, 1):
            tag = str(position + 1)
            flag += 1
        position += 1
    return tag

def test_zero_max_l_dist(self):
    self.patch_concrete_search_classes()
    self.mock_search_exact.return_value = [Match(42, 43, 0)]

    call_count = 0
    for (max_subs, max_ins, max_dels) in [
        (1, 0, 0),
        (0, 1, 0),
        (0, 0, 1),
        (1, 1, 0),
        (1, 0, 1),
        (0, 1, 1),
        (1, 1, 1),
    ]:
        with self.subTest('max_l_dist=0, max_subs={0}, max_ins={1}, max_dels={2}'.format(
                max_subs, max_ins, max_dels)):
            self.assertEqual(
                find_near_matches('a', 'a', max_subs, max_ins, max_dels, 0),
                [Match(42, 43, 0)],
            )
            call_count += 1
            self.assertEqual(self.mock_search_exact.call_count, call_count)

def search(self, subsequence, sequence, max_l_dist):
    return find_near_matches(subsequence, sequence, max_l_dist=max_l_dist)

def search(self, subsequence, sequence, max_subs):
    return find_near_matches(subsequence, sequence,
                             max_insertions=0, max_deletions=0,
                             max_substitutions=max_subs)

def search(self, pattern, sequence, max_subs, max_ins, max_dels,
           max_l_dist=None):
    return find_near_matches(pattern, sequence, max_subs, max_ins, max_dels,
                             max_l_dist)

def test_no_limitations(self):
    with self.assertRaises(Exception):
        find_near_matches('a', 'a')

# (fragment: begins mid-script; the opening `if` guard below is
# reconstructed as the complement of the `elif` that follows)
if PixEt in Pixet_dict and len(Pixet_dict[PixEt]) >= 5:
    for i in range(0, 5):
        counts.append(Pixet_dict[PixEt][i][1])
    Pixet_least_count_dict[PixEt] = counts[4]
elif PixEt in Pixet_dict and len(Pixet_dict[PixEt]) < 5:
    Too_few_Pixets_list.append(PixEt)

print("Pixets with 5 or more sequences: %s" % len(Pixet_least_count_dict))
print("Pixets with fewer than 5 sequences: %s" % len(Too_few_Pixets_list))

Pixets_ordered_by_least_count = sorted(Pixet_least_count_dict.items(),
                                       key=operator.itemgetter(1))
Pixets_ordered_by_least_count.reverse()  # makes it a descending list
for P in Pixets_ordered_by_least_count:
    Pixets_by_least_count_list.append(P[0])

for doubles in SeqIO.parse("%s/double_expansion_sequences_two_read_seqs.fastq"
                           % Data_Path, "fastq"):
    repeats = fuzzysearch.find_near_matches(Repeat, doubles.seq,
                                            max_l_dist=dist_repeat)
    spacer_list = [doubles.seq[repeats[0].end:repeats[1].start]]
    spacer_list.append(doubles.seq[repeats[1].end:repeats[2].start])
    double_list.append(spacer_list)

for doubles in SeqIO.parse("%s/double_expansion_sequences_three_read_seqs.fastq"
                           % Data_Path, "fastq"):
    repeats = fuzzysearch.find_near_matches(Repeat, doubles.seq,
                                            max_l_dist=dist_repeat)
    spacer_list = [doubles.seq[repeats[0].end:repeats[1].start]]
    spacer_list.append(doubles.seq[repeats[1].end:repeats[2].start])
    double_list.append(spacer_list)

for triples in SeqIO.parse("%s/triple_expansion_sequences_three_read_seqs.fastq"
                           % Data_Path, "fastq"):
    repeats = fuzzysearch.find_near_matches(Repeat, triples.seq,
                                            max_l_dist=dist_repeat)
    spacer_list = [triples.seq[repeats[0].end:repeats[1].start]]
    spacer_list.append(triples.seq[repeats[1].end:repeats[2].start])
    spacer_list.append(triples.seq[repeats[2].end:repeats[3].start])
    triple_list.append(spacer_list)

def test_unlimited_parameter(self):
    with self.assertRaises(Exception):
        find_near_matches('a', 'a', max_substitutions=1)

    with self.assertRaises(Exception):
        find_near_matches('a', 'a', max_insertions=1)

    with self.assertRaises(Exception):
        find_near_matches('a', 'a', max_deletions=1)

    with self.assertRaises(Exception):
        find_near_matches('a', 'a', max_substitutions=1, max_insertions=1)

    with self.assertRaises(Exception):
        find_near_matches('a', 'a', max_substitutions=1, max_deletions=1)

    with self.assertRaises(Exception):
        find_near_matches('a', 'a', max_insertions=1, max_deletions=1)

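# The constraint these tests pin down: find_near_matches refuses to run
# unless every edit type is bounded, either individually or via max_l_dist.
from fuzzysearch import find_near_matches

find_near_matches('a', 'abc', max_l_dist=1)  # OK: bounds all edit types at once
find_near_matches('a', 'abc', max_substitutions=1, max_insertions=1,
                  max_deletions=1)  # OK: each edit type bounded individually
# find_near_matches('a', 'abc', max_substitutions=1)  # raises: ins/dels unbounded
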