def get_spcrs(sequence):
    last_rep = fuzzysearch.find_near_matches(Repeat[0:15], sequence.seq, max_l_dist=3)
    results = fuzzysearch.find_near_matches(Repeat, sequence.seq, max_l_dist=6)
    if len(results) == 3 and len(last_rep) >= 1 and last_rep[-1].start > results[-1].start:
        spacer_list = [sequence[results[0].end:results[1].start]]
        spacer_list.append(sequence[results[1].end:results[2].start])
        spacer_list.append(sequence[results[2].end:last_rep[-1].start])
    elif len(results) == 2 and len(last_rep) >= 1 and last_rep[-1].start > results[-1].start:
        spacer_list = [sequence[results[0].end:results[1].start]]
        spacer_list.append(sequence[results[1].end:last_rep[-1].start])
    elif len(results) == 1 and len(last_rep) >= 1 and last_rep[-1].start > results[-1].start:
        spacer_list = [sequence[results[0].end:last_rep[-1].start]]

    elif len(results) == 4:
        spacer_list = [sequence[results[0].end:results[1].start]]
        spacer_list.append(sequence[results[1].end:results[2].start])
        spacer_list.append(sequence[results[2].end:results[3].start])
    elif len(results) == 3:
        spacer_list = [sequence[results[0].end:results[1].start]]
        spacer_list.append(sequence[results[1].end:results[2].start])
    elif len(results) == 2:
        spacer_list = [sequence[results[0].end:results[1].start]]
    else:
        spacer_list = []
    return spacer_list
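For context, a minimal sketch of how the Match objects returned by find_near_matches drive the slicing above; the repeat and read below are made-up stand-ins for the module-level Repeat constant and the Biopython record:

from fuzzysearch import find_near_matches

# Hypothetical toy data: a short repeat and a read containing it twice.
repeat = "GTTTTAGAGCTA"
read = "AAAC" + repeat + "TTTTTTTTTT" + repeat + "GGGG"

hits = find_near_matches(repeat, read, max_l_dist=1)
# Each Match carries .start, .end and .dist; a spacer is the slice between
# the end of one repeat hit and the start of the next.
if len(hits) >= 2:
    print(read[hits[0].end:hits[1].start])  # TTTTTTTTTT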
Example #2
def is_covid19(entry):

    for covid_word in covid19_words:
        for key in ['abstract', 'title']:
            if key in entry.keys():
                if isinstance(entry[key], list):
                    text = " ".join(entry[key])
                else:
                    text = entry[key]
                if text is not None:
                    #"provide" is really painful here
                    if len([m for m in find_near_matches(covid_word, text, max_l_dist=1, max_substitutions=1) if m.matched != "ovid"]) > 0:
                        return True
        for key in ['keywords_ML', 'keywords']:
            if key in entry.keys():
                if isinstance(entry[key], list):
                    text = " ".join(entry[key])
                else:
                    text = entry[key]
                if text is not None:
                    if len(find_near_matches(covid_word, text, max_l_dist=1, max_substitutions=1)) > 0:
                        return True

        if 'body_text' in entry.keys():
            if entry['body_text'] is not None:
                for e in entry['body_text']:
                    if 'Text' in e.keys() and e['Text'] is not None:
                        if len([m for m in find_near_matches(covid_word, e['Text'], max_l_dist=1, max_substitutions=1) if m.matched != "ovid"]) > 0:
                            return True
    return False
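The filter on m.matched exists because, at max_l_dist=1, "covid" matches the substring "ovid" inside words such as "provide" (one deletion). A small sketch of the effect (the exact matches reported can vary with the fuzzysearch version):

from fuzzysearch import find_near_matches

hits = find_near_matches("covid", "we provide data", max_l_dist=1)
# The near match inside "provide" has matched text "ovid" (or an
# overlapping variant), which is what the filter above discards.
print([(m.matched, m.dist) for m in hits])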
def Kernel_func(seq1, seq2, k=5, tolerance=1, kernel='bio'):
    """
    ------------------------------------------------------------------------------
    Compute directly the kernel Kernel_func(seq1,seq2)
    The projection is not made explicit
    ------------------------------------------------------------------------------
    
    Inputs:
    seq1,seq2: DNAs sequences (with ATGC letters)
    k: length of subsequences considered
    
    Output:
    Kernel function (a real number)
    """
    result = 0

    if kernel == 'mismatch':
        subseqs1 = seq2subseq(seq1, k)
        for subseq1 in subseqs1:
            result += len(
                find_near_matches(subseq1, seq2, max_l_dist=tolerance))

        subseqs2 = seq2subseq(seq2, k)
        for subseq2 in subseqs2:
            result += len(
                find_near_matches(subseq2, seq1, max_l_dist=tolerance))

    elif kernel == 'bio':
        result = pairwise2.align.globalxx(seq1,
                                          seq2,
                                          score_only=True,
                                          penalize_extend_when_opening=True,
                                          penalize_end_gaps=True)

    return result
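The seq2subseq helper is not shown in the source; a plausible reading is a plain k-mer enumerator, under which assumption the 'mismatch' branch can be exercised like this:

def seq2subseq(seq, k):
    # Hypothetical helper: enumerate all length-k subsequences (k-mers).
    return [seq[i:i + k] for i in range(len(seq) - k + 1)]

# Count near matches (within `tolerance` edits) of every 5-mer of each
# sequence inside the other sequence.
print(Kernel_func("ATGCGATGCA", "ATGCGTTGCA", k=5, tolerance=1,
                  kernel='mismatch'))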
def extract_information(file):
    path = Path(file)
    # TODO: use OCR when the text cannot be extracted from the PDF

    string_1 = "corte suprema de justicia de la nación"
    string_2 = "oficina de violencia doméstica"

    all_pages = []
    for i, text in enumerate(extract_text_from_pdf(path, full=True)):
        matches_1 = find_near_matches(string_1, text.lower(), max_l_dist=4)
        matches_2 = find_near_matches(string_2, text.lower(), max_l_dist=4)
        if matches_1 and matches_2:
            all_pages.append(i)

    if len(all_pages) > 2:
        all_pages = filter_pages(all_pages, 10)
        all_pages = sorted(all_pages)
        with open(f"{path.stem}.txt", "w", encoding="UTF8",
                  errors="ignore") as f:
            for text in extract_text_from_pdf(path,
                                              pages=(all_pages[0],
                                                     all_pages[-1] + 1)):
                if len(text) > 0:
                    f.write("============ page ==============")
                    f.write(text)
    else:
        print("No se encontró ninguna página de la OVD")
Example #6
def extract_barcode_v2(sequence1):
    '''
    Function to extract barcodes
    '''

    # Parse out barcodes if we can ID the constants
    try:

        # use some approximate, yet generous, indices to facilitate faster matching
        c1_hit = find_near_matches(c1, sequence1[7:25], max_l_dist=2)
        c2_hit = find_near_matches(c2, sequence1[23:42], max_l_dist=2)
        nxt_hit = find_near_matches(nxt, sequence1[33:65], max_l_dist=2)
        me_hit = find_near_matches(me, sequence1[55:], max_l_dist=2)

        # Now grab the barcodes
        bc1, mm1 = prove_barcode(sequence1[0:7], barcodes, n_mismatch)
        bc2, mm2 = prove_barcode(
            sequence1[c1_hit[0].end + 7:c2_hit[0].start + 23], barcodes,
            n_mismatch)
        bc3, mm3 = prove_barcode(
            sequence1[c2_hit[0].end + 23:nxt_hit[0].start + 33], barcodes,
            n_mismatch)
        seq = sequence1[me_hit[0].end + 55:]

        return (bc1 + "_" + bc2 + "_" + bc3, seq,
                str(mm1) + "," + str(mm2) + "," + str(mm3))
    except Exception:
        # Any failed constant match (empty hit list) lands here.
        return (dumb, sequence1, "0,0,0")
Example #7
def extractbarcode_v2_tn5(sequence1):
    '''
    Function to extract barcodes
    '''

    # Parse out barcodes if we can ID the constants
    try:

        c1_hit = find_near_matches(c1, sequence1[7:25], max_l_dist=2)
        c2_hit = find_near_matches(c2, sequence1[23:42], max_l_dist=2)
        nxt_hit = find_near_matches(nxt, sequence1[33:65], max_l_dist=2)
        me_hit = find_near_matches(me, sequence1[55:], max_l_dist=2)

        # Now grab the barcodes
        bc1, mm1 = prove_barcode(sequence1[0:7], barcodes, n_mismatch)
        bc2, mm2 = prove_barcode(sequence1[c1_hit[0][1] + 7:c2_hit[0][0] + 23],
                                 barcodes, n_mismatch)
        bc3, mm3 = prove_barcode(
            sequence1[c2_hit[0][1] + 23:nxt_hit[0][0] + 33], barcodes,
            n_mismatch)
        bc_tn5, mm4 = prove_barcode(
            sequence1[nxt_hit[0][1] + 33:me_hit[0][0] + 55], tn5, n_mismatch)
        seq = sequence1[me_hit[0][1] + 55:]

        return (bc1 + "_" + bc2 + "_" + bc3 + "_" + bc_tn5, seq,
                str(mm1) + "," + str(mm2) + "," + str(mm3) + "," + str(mm4))
    except Exception:
        # Any failed constant match (empty hit list) lands here.
        return (dumb, sequence1, "0,0,0,0")
    def test_levenshtein(self):
        """test cases where 0 < max_l_dist <= max(others)"""
        # in these cases, find_near_matches should call
        # find_near_matches_levenshtein
        self.patch_concrete_search_classes()
        self.mock_find_near_matches_levenshtein.return_value = \
            [mock.sentinel.SENTINEL]

        self.assertEqual(
            find_near_matches('a', 'a', 1, 1, 1, 1),
            [mock.sentinel.SENTINEL],
        )
        self.assertEqual(self.mock_find_near_matches_levenshtein.call_count, 1)

        self.assertEqual(
            find_near_matches('a', 'a', 2, 2, 2, 2),
            [mock.sentinel.SENTINEL],
        )
        self.assertEqual(self.mock_find_near_matches_levenshtein.call_count, 2)

        self.assertEqual(
            find_near_matches('a', 'a', 5, 3, 7, 2),
            [mock.sentinel.SENTINEL],
        )
        self.assertEqual(self.mock_find_near_matches_levenshtein.call_count, 3)
    def test_levenshtein(self):
        """test cases where 0 < max_l_dist <= max(others)"""
        # in these cases, find_near_matches should call
        # find_near_matches_levenshtein
        self.patch_concrete_search_methods()
        self.mock_find_near_matches_levenshtein.return_value = \
            [mock.sentinel.SENTINEL]

        self.assertEqual(
            find_near_matches('a', 'a', 1, 1, 1, 1),
            [mock.sentinel.SENTINEL],
        )
        self.assertEqual(self.mock_find_near_matches_levenshtein.call_count, 1)

        self.assertEqual(
            find_near_matches('a', 'a', 2, 2, 2, 2),
            [mock.sentinel.SENTINEL],
        )
        self.assertEqual(self.mock_find_near_matches_levenshtein.call_count, 2)

        self.assertEqual(
            find_near_matches('a', 'a', 5, 3, 7, 2),
            [mock.sentinel.SENTINEL],
        )
        self.assertEqual(self.mock_find_near_matches_levenshtein.call_count, 3)
def extract_cdrs(sequence1, sequence2):
    '''
    Function to extract CDR sequences
    '''
    # Parse out barcodes if we can ID the constants
    try:
        # use some approximate, yet generous, indices to facilitate faster matching
        cdr3_hit1 = find_near_matches(const_cdr3_R1,
                                      sequence1[17:42],
                                      max_l_dist=n_mismatch)
        #c2R1_hit = find_near_matches(const2R1, sequence1[50:], max_l_dist = n_mismatch)
        cdr1_hit1 = find_near_matches(const_cdr1_R2,
                                      sequence2[60:85],
                                      max_l_dist=n_mismatch)
        cdr2_hit1 = find_near_matches(const_cdr2_R2,
                                      sequence2[115:145],
                                      max_l_dist=n_mismatch)

        # R1 stuff
        cdr3longseq = reverse_complement_table(
            sequence1[(cdr3_hit1[0].end + 17):(cdr3_hit1[0].end + 17 +
                                               cdr3n_long)])
        cdr3medseq = reverse_complement_table(
            sequence1[(cdr3_hit1[0].end + 17):(cdr3_hit1[0].end + 17 +
                                               cdr3n_med)])
        cdr3shortseq = reverse_complement_table(
            sequence1[(cdr3_hit1[0].end + 17):(cdr3_hit1[0].end + 17 +
                                               cdr3n_short)])

        # Try to decide which CDR3 by identifying the immediate downstream constant sequence
        upstream_const_seq = "TATTATTGCGCG"
        if (len(
                find_near_matches(upstream_const_seq,
                                  cdr3medseq[0:15],
                                  max_l_dist=n_mismatch)) > 0):
            whichone = "short"
            cdr3 = cdr3shortseq
        elif (len(
                find_near_matches(upstream_const_seq,
                                  cdr3longseq[0:15],
                                  max_l_dist=n_mismatch)) > 0):
            whichone = "medium"
            cdr3 = cdr3medseq
        else:
            whichone = "long"
            cdr3 = cdr3longseq

        # R2 stuff
        cdr1 = (sequence2[(cdr1_hit1[0].end + 60):(cdr1_hit1[0].end + 60 +
                                                   cdr1n)])
        cdr2 = (sequence2[(cdr2_hit1[0].end + 115):(cdr2_hit1[0].end + 115 +
                                                    cdr2n)])
        return (cdr1, cdr2, cdr3, translate_dna_to_protein(cdr1),
                translate_dna_to_protein(cdr2), translate_dna_to_protein(cdr3),
                whichone)
    except Exception:
        return ("NA", "NA", "NA", "NA", "NA", "NA", "NA")
def process_speech(transcript):
    if find_near_matches('mantra', transcript, max_l_dist=2):
        return 'mantra'
    if find_near_matches('breath', transcript, max_l_dist=2):
        return 'breathing'
    if find_near_matches('focus', transcript, max_l_dist=2):
        return 'object'
    # fall back to a random command
    print("we're going random")
    return random.choice(commands)
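A quick usage sketch: with max_l_dist=2, lightly misspelled transcripts still route to the right command (the commands list is a module-level global assumed by the fallback branch):

print(process_speech("let's do the mantro now"))  # 'mantra' (distance 1)
print(process_speech("take a deep brath"))        # 'breathing' (distance 1)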
Example #12
    def is_need_out_barcode(self):
        """
        Judge out barcode in each reads pair
        Args:
        Returns: True or False

        """
        if fuzzysearch.find_near_matches(self.out_barcode, self.ob_read1.description, 1, 0, 0, 1) \
                and fuzzysearch.find_near_matches(self.out_barcode, self.ob_read2.description, 1, 0, 0, 1):
            return True
        else:
            return False
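The four positional arguments passed here map onto find_near_matches' keyword parameters in the order (max_substitutions, max_insertions, max_deletions, max_l_dist), so the calls above allow one substitution but no indels. Spelled out with keywords:

import fuzzysearch

# Equivalent to fuzzysearch.find_near_matches(pattern, text, 1, 0, 0, 1):
hits = fuzzysearch.find_near_matches(
    "ACGTAC", "xxACGTTCxx",
    max_substitutions=1, max_insertions=0, max_deletions=0, max_l_dist=1)
print(hits)  # one match with dist == 1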
def find_specific_chunk(chunk_needle, haystack, common_prefix,
                        common_prefix_matches, max_substs, max_inserts,
                        max_dels):

    max_levenshtein = max_substs + max_inserts + max_dels
    results = []

    chunk_needle_str = chunk_needle.astype(np.uint8).tobytes()
    haystack_str = haystack.astype(np.uint8).tobytes()

    for i in range(len(common_prefix_matches)):
        # We have an approximate match against the common prefix.
        # To speed things up, now search for an approximate match
        # for the remaining part of the string only, with a max
        # Levenshtein distance that makes the distance for the whole
        # string (common prefix plus unique) within our bounds.

        common_length = len(common_prefix)
        corrupted_common_match_len = len(common_prefix_matches[i][3])
        start_pos = common_prefix_matches[i][0]

        needle_not_common = chunk_needle_str[common_length:]
        hay_not_common = haystack_str[start_pos +
                                      corrupted_common_match_len:start_pos +
                                      corrupted_common_match_len +
                                      len(chunk_needle) - common_length +
                                      max_inserts]
        fuzzy_result = fuzzysearch.find_near_matches(
            needle_not_common, hay_not_common, max_substs, max_inserts,
            max_dels, max_levenshtein - common_prefix_matches[i][2])

        # The actual fuzzy match data gives no useful information
        # because all the offsets are off. So instead just record the
        # match start position given by common_prefix_matches, and end
        # position according to the fuzzy search result.

        # Sometimes we get a false positive (match) with a nonzero
        # start index, which would count as an insertion error over the
        # whole string. So do a search of the whole string if we get a
        # match, and only *then* admit.

        if len(fuzzy_result) > 0:
            fuzzy_result = fuzzysearch.find_near_matches(
                chunk_needle_str,
                haystack_str[start_pos:start_pos + len(chunk_needle) +
                             max_inserts], max_substs, max_inserts, max_dels)

        if len(fuzzy_result) > 0:
            result = fuzzy_result[0]
            results.append((result.start + start_pos, result.end + start_pos,
                            result.dist))

    return results
Example #14
def trim_primer(
    seq_record: SeqIO.SeqRecord,
    primer_seqs: List[str],
    max_mismatch: Union[float, int] = 0.14,
) -> SeqIO.SeqRecord:
    """
    Trim primer sequences.

    Parameters
    ----------
    seq_record : Bio.Seq.SeqRecord
        input sequence record
    primer_seqs : list
        list of the forward and reverse primer sequences
    max_mismatch : float
        Maximum number (or proportion) of mismatches allowed when searching for
        primer sequences (default: 0.14)

    """
    seq = seq_record.seq
    fwd, rev = primer_seqs
    rev_rc = revc(rev)
    len_fwd, len_rev = len(fwd), len(rev)

    if max_mismatch > 1:
        max_l_dist1 = max_l_dist2 = max_mismatch
    elif max_mismatch > 0:
        max_l_dist1 = round(len_fwd * max_mismatch)
        max_l_dist2 = round(len_rev * max_mismatch)
    else:
        raise ValueError("max_mismatch must be a positive value")

    m0 = find_near_matches(fwd, str(seq), max_l_dist=max_l_dist1)
    m1 = find_near_matches(rev_rc, str(seq), max_l_dist=max_l_dist2)

    if len(m0) > 0:
        match_fwd = get_best_match_in_group(m0)
    if len(m1) > 0:
        match_rev_rc = get_best_match_in_group(m1)

    if len(m0) > 0 and len(m1) > 0:
        tr = seq_record[match_fwd.end : match_rev_rc.start]
    elif len(m0) > 0:
        tr = seq_record[match_fwd.end :]
    elif len(m1) > 0:
        tr = seq_record[: match_rev_rc.start]
    else:
        tr = seq_record[:]

    return tr
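A usage sketch, assuming the helpers revc (reverse complement) and get_best_match_in_group referenced above are defined alongside trim_primer; here the insert CCCCCCCC sits between a forward primer and the reverse complement of the reverse primer:

from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

rec = SeqRecord(Seq("ACGTACGTCCCCCCCCTTAAGGCC"), id="read1")
# revc("GGCCTTAA") == "TTAAGGCC", so both primers match and the record
# is trimmed down to the insert between them.
print(trim_primer(rec, ["ACGTACGT", "GGCCTTAA"], max_mismatch=0.14).seq)  # CCCCCCCC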
def fuzzy_search(string, primer_pairs, max_dist):
    """
    Search `string` for both primer sequences
    """
    for (i, primer_pair) in enumerate(primer_pairs):
        search1 = fuzzysearch.find_near_matches(primer_pair[0],
                                                string,
                                                max_l_dist=max_dist)
        if search1:
            search2 = fuzzysearch.find_near_matches(primer_pair[1],
                                                    string,
                                                    max_l_dist=max_dist)
            if search2:
                return i, search1[0], search2[0]
Example #16
    def searchText_old(self, names, titles, channelName):
        for name in names:
            for screenshot in self.channelData:
                if "TESSERACT" not in screenshot["customLabels"] and "ACTIVATION" not in screenshot["customLabels"] and "LOGIN" not in screenshot["customLabels"] :
                    screenshot["imdbName"] = []
                    screenshot["ocrName"] = [] 
                    screenshot["nameMatchDistance"] = {}               
                    compareResult = find_near_matches(name, screenshot["textBody"], max_l_dist=self.matchingDistance)
                    if len(compareResult) == 0:
                        continue
                    else:
                        if compareResult[0].end != compareResult[0].start:
                            if "IMDB_NAME" not in screenshot["customLabels"]:
                                screenshot["customLabels"].append("IMDB_NAME")
                            screenshot["imdbName"].append(name)
                            screenshot["ocrName"].append(screenshot["textBody"][compareResult[0].start:compareResult[0].end])
                            screenshot["nameMatchDistance"][name] = compareResult[0].dist

        for title in titles:
            if len(title) < MIN_IMDB_SEARCH_LEN:
                continue
            if EXCLUDE_MENU_INTERFACE_STRINGS and title in MENU_INTERFACE_STRINGS:
                continue

            for screenshot in self.channelData:
                if "TESSERACT" in screenshot["customLabels"] or "ACTIVATION" in screenshot["customLabels"] or "LOGIN" in screenshot["customLabels"]:
                    continue
                screenshot["imdbTitle"] = []
                screenshot["ocrTitle"] = []
                screenshot["titleMatchDistance"] = {}
                for screenshot_str in screenshot["textBody"].split('\n'):
                    if not screenshot_str or len(screenshot_str) < MIN_IMDB_SEARCH_LEN:
                        continue
                    compareResult = find_near_matches(screenshot_str, title,
                                                      max_l_dist=self.matchingDistance,
                                                      max_deletions=0,
                                                      max_insertions=0)
                    # compareResult = find_near_matches(title, screenshot["textBody"], max_l_dist=self.matchingDistance)
                    if len(compareResult) == 0:
                        continue
                    else:
                        if compareResult[0].end != compareResult[0].start:
                            if "IMDB_TITLE" not in screenshot["customLabels"]:
                                screenshot["customLabels"].append("IMDB_TITLE")
                            print("Matched with IMDB title", screenshot_str, title, self.matchingDistance)
                            print("compareResult", compareResult)
                            screenshot["imdbTitle"].append(title)
                            screenshot["ocrTitle"].append(screenshot["textBody"][compareResult[0].start:compareResult[0].end])
                            screenshot["titleMatchDistance"][title] = compareResult[0].dist
Example #17
def extract_and_match(sequence):
    left_flank = fuzzysearch.find_near_matches(
        Region_dict[edit_site]['flanking'][0], sequence, max_l_dist=4)
    right_flank = fuzzysearch.find_near_matches(
        Region_dict[edit_site]['flanking'][1], sequence, max_l_dist=4)
    if len(left_flank) == 1 and len(right_flank) == 1:
        region = sequence[left_flank[0].end:right_flank[0].start]
        if region == Region_dict[edit_site]['wt_or_edited'][0]:
            return 'wt'
        elif region == Region_dict[edit_site]['wt_or_edited'][1]:
            return 'edited'
        else:
            return 'undetermined_no_site_match'
    else:
        return 'undetermined_no_flanking_match'
Example #18
def get_position(context_no_space, text_with_space, valid_length, max_error, margin_error):
    context = context_no_space
    text = text_with_space.split()

    accumulate_position = 0
    count = 0

    for word in text:
        if len(word) < valid_length:
            continue
        error_distance = len(word) - margin_error
        if error_distance < 0:
            error_distance = 0
        if error_distance > max_error:
            error_distance = max_error
        all_position = find_near_matches(word, context, max_l_dist=error_distance)

        for position in all_position:
            count += 1
            accumulate_position += (position.start + position.end) / 2

    if count == 0:
        return -1

    return accumulate_position/count
Example #19
    def phrase_match(phrase_list, page_text, max_l_dist=4):
        '''Fuzzy-searches `page_text` for phrases passed in `phrase_list`.

        String matching performed using the [fuzzysearch](https://github.com/taleinat/fuzzysearch)
        Python package.

        :param phrase_list: phrase list corresponding to certain form component
        :type phrase_list: list
        :param page_text: page text
        :type page_text: string
        :param max_l_dist: maximum Levenshtein distance for fuzzysearch, default 4
        :type max_l_dist: integer

        :returns: True if a phrase was matched within the given Levenshtein distance
        :rtype: bool
        '''

        detected_phrase = False

        # Iterate through the phrase list and detect matches
        # (fuzzysearch: https://github.com/taleinat/fuzzysearch)
        for phrase in phrase_list:
            # Search the page text for the phrase within max_l_dist character
            # edits (Levenshtein distance)
            if len(
                    fuzzysearch.find_near_matches(phrase.lower(),
                                                  page_text.lower(),
                                                  max_l_dist=max_l_dist)):
                detected_phrase = True
                break

        return detected_phrase
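A usage sketch (phrase_match takes no self, so it behaves as a plain function): OCR noise within the allowed edit distance still counts as a hit:

print(phrase_match(["total amount due"], "TOTAL AMOVNT DUE: $42"))  # True
print(phrase_match(["total amount due"], "completely unrelated"))   # False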
Example #20
    def fuzzyExtract(self, qs, ls, threshold):
        '''
        Fuzzy-matches 'qs' in 'ls' and returns a list of (word, index) tuples.

        TODO: fuzzy search separation into words
        :param qs: query string
        :param ls: large string
        :param threshold: threshold
        :return:
        '''

        if len(qs) < self.fuzzySearchOptimumLength:
            processThreshold = 60
            max_l_dist = 0
        else:
            processThreshold = threshold
            max_l_dist = 1

        for word, confidence in process.extractBests(qs, (ls,), score_cutoff=processThreshold):
            print('word {}'.format(word), confidence)
            for match in find_near_matches(qs, word, max_l_dist=max_l_dist):
                match = word[match.start:match.end]
                print('match {}'.format(match))
                index = ls.find(match)
Example #21
def search_subtitle_2(subtitles, line, offset):
    allsub = ""
    partialsum = []
    for sub in subtitles:
        allsub = allsub + sub.content + " "
        partialsum.append(len(allsub))

    line = line.lower()
    allsub = allsub.lower()
    t1 = datetime.now()
    result = find_near_matches(line, allsub, max_l_dist=5)
    t2 = datetime.now()
    print((t2 - t1).seconds)
    if len(result) != 1:
        return -1

    result = result[0]
    print(result)
    if offset > 0:  # Searching for after
        # Searching for the index of the end of the substring
        i = 0
        for val in partialsum:
            if val > result.end:
                return i
            i += 1

    else:  # Searching for before
        # Searching for the index of the beginning of the substring
        i = len(partialsum)
        for val in reversed(partialsum):
            if val < result.start:
                return i + 1
            i -= 1
    return -1
Example #22
def extract_information(pages_list):

    string_1 = "corte suprema de justicia de la nación"
    string_2 = "oficina de violencia doméstica"

    pages_dict = {}

    for idx, text in enumerate(pages_list):

        matches_1 = find_near_matches(string_1, text.lower(), max_l_dist=4)
        matches_2 = find_near_matches(string_2, text.lower(), max_l_dist=4)

        if matches_1 and matches_2:
            pages_dict[idx] = text

    return pages_dict
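A quick usage sketch with two made-up pages; only the page containing both header strings (within 4 edits each, case-insensitively) is kept:

pages = [
    "CORTE SUPREMA DE JUSTICIA DE LA NACIÓN\nOFICINA DE VIOLENCIA DOMÉSTICA\n...",
    "an unrelated page",
]
print(extract_information(pages).keys())  # dict_keys([0])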
Example #23
def get_barcode_index(fq, barcode='hiseq'):
    sys.stdout.write("%s get barcode index start process at %s" % (fq, time.ctime()))
    if fq.endswith('.gz'):
        fq = gzip.open(fq, 'rt')  # SeqIO.parse needs a text-mode handle for gzipped input
    out_barcode = setting.SeqIndex.out_barcode

    barcode_index = {}
    for k, v in enumerate(setting.SeqIndex.barcode[barcode]):
        barcode_index[v] = k + 1

    iter_fq = SeqIO.parse(fq, "fastq")

    l_barcode = []
    for ob_fq in iter_fq:
        fq_out_barcode = ob_fq.description[-6:]
        fq_barcode = str(ob_fq.seq)[:6]

        try:
            if fuzzysearch.find_near_matches(out_barcode, fq_out_barcode, 1, 1, 1, 1):
                out_str = '1\t' + fq_out_barcode + '\t' + str(barcode_index[fq_barcode]) + '\t' + fq_barcode
            else:
                out_str = '0\t' + fq_out_barcode + '\t' + str(barcode_index[fq_barcode]) + '\t' + fq_barcode
        except KeyError:
            out_str = '2\t' + fq_out_barcode
        l_barcode.append(out_str)

    return l_barcode
Example #24
def return_song():
    args = sys.argv[1:]
    n_args = len(args)

    access_token = __get_token()

    try:
        with open('./assets/genres.json', 'r') as infile:
            valid_genres = json.load(infile)
    except FileNotFoundError:
        print("Couldn't find genres file!")
        return None

    if n_args == 0:
        selected_genre = random.choice(valid_genres)
    else:
        selected_genre = (" ".join(args)).lower()

    if selected_genre in valid_genres:
        result = __request_valid_song(access_token, genre=selected_genre)
        return result
    else:
        valid_genres_to_text = " ".join(valid_genres)
        try:
            closest_genre = find_near_matches(selected_genre,
                                              valid_genres_to_text,
                                              max_l_dist=2)[0].matched
            result = __request_valid_song(access_token, genre=closest_genre)
            return result
        except IndexError:
            print("Genre not found")
Example #25
def get_fuzzymatches(sentence: str, text: str, q_factor: int, qmax: int,
                     qstep: int) -> List[Match]:
    """
        .. py:function:: get_fuzzymatches(sentence, text, q_factor, qmax, qstep)

        Finds fuzzy matches of sentence in text

        :param str sentence: Input sentence to find
        :param str text: Input text
        :param int q_factor: Initial maximum Levenshtein distance
        :param int qmax: Maximum Levenshtein distance
        :param int qstep: Step by which the Levenshtein distance is increased

        :return: List of fuzzy matches
        :rtype: list[Match]
    """
    fuzzymatches = []
    while q_factor <= qmax:
        fuzzymatches = find_near_matches(sentence.lower(),
                                         text,
                                         max_l_dist=q_factor)
        if fuzzymatches:
            break
        q_factor += qstep
        if q_factor >= qmax:
            logger.warning("Cannot continue fuzzing. Max Q-Factor reached. "
                           f"The sentence is `{sentence}`")
            break
    return fuzzymatches
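A usage sketch: the search starts strict (q_factor) and widens the allowed Levenshtein distance by qstep up to qmax until something matches:

ms = get_fuzzymatches("fuzzy matchng", "this text has fuzzy matching in it",
                      q_factor=1, qmax=5, qstep=1)
print(ms)  # matched on the first pass: one deletion, distance 1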
Example #26
def fuzzy_search_extract_in_orig_doc(original_doc_text, searched_text,
                                     stored_matches):
    # Note: scripts return blanks instead of null values
    if searched_text != '':
        matches = []
        threshold = 1
        while matches == [] and threshold < 10:
            threshold += 2
            matches = find_near_matches(searched_text,
                                        original_doc_text,
                                        max_l_dist=threshold)

        match = get_first_elem_or_none(matches)
        if len(matches) == 1:
            return ((match[0], match[1]), match)
        while match:
            if not check_if_stored_already(match, stored_matches):
                print_if_debug(
                    ("MATCH FOUND: ", original_doc_text[match[0]:match[1]],
                     threshold))
                matches.append(match)
                return ((match[0], match[1]), match)
            matches = matches[1:]
            match = get_first_elem_or_none(matches)

    return None
Example #27
def pattern():
    with open('cutibacterium_acnes.fasta', 'r') as genome:
        seq = SeqIO.read(genome, 'fasta').seq

    sequence = st.text_area('Sequence to search in C. acnes genome')
    mismatch = st.number_input('Select number of mismatches',
                               value=0,
                               min_value=0,
                               max_value=5)

    if len(sequence) < 15:
        st.error('Please write a sequence longer than 15 bp')
    subseq = Seq(sequence.upper())

    if sequence:
        fw = fz.find_near_matches(subseq,
                                  seq,
                                  max_l_dist=mismatch,
                                  max_deletions=0,
                                  max_insertions=0)
        rv = fz.find_near_matches(subseq.reverse_complement(),
                                  seq,
                                  max_l_dist=mismatch,
                                  max_deletions=0,
                                  max_insertions=0)

        if fw or rv:
            st.write(
                f'## You have {len(fw)+len(rv)} matches for your sequence in KPA'
            )
            c1, c2, c3, c4 = st.columns(4)
            c1.write('### Position')
            c2.write('### Mismatch')
            c3.write('### Sequence')
            c4.write('### Direction')
            for match in fw:
                c1.write(match.start)
                c2.write(match.dist)
                c3.write(match.matched)
                c4.write('`forward`')
            for match in rv:
                c1.write(match.start)
                c2.write(match.dist)
                c3.write(match.matched)
                c4.write('`reverse`')
        else:
            st.error('No matches found')
Example #28
def get_fuzzy_search(input_text, subtitle_data, max_dist=5):
    match_str = find_near_matches(input_text,
                                  subtitle_data,
                                  max_l_dist=max_dist)
    if len(match_str) > 0:
        return match_str[0].start
    else:
        return -1
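A usage sketch: the helper returns the start offset of the first near match, or -1 when nothing is within max_dist edits:

print(get_fuzzy_search("hello world", "she said helo world loudly"))  # 9
print(get_fuzzy_search("absent", "nothing to see here", max_dist=1))  # -1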
 def test_all_zero(self):
     self.patch_concrete_search_methods()
     self.mock_search_exact.return_value = [42]
     self.assertEqual(
         find_near_matches('a', 'a', 0, 0, 0, 0),
         [Match(42, 43, 0)],
     )
     self.assertEqual(self.mock_search_exact.call_count, 1)
 def search(self, subsequence, sequence, start_index=0, end_index=None):
     if end_index is None:
         end_index = len(sequence)
     sequence = sequence[start_index:end_index]
     return [
         start_index + match.start
         for match in find_near_matches(subsequence, sequence, max_l_dist=0)
     ]
 def test_all_zero(self):
     self.patch_concrete_search_classes()
     self.mock_search_exact.return_value = [Match(42, 43, 0)]
     self.assertEqual(
         find_near_matches('a', 'a', 0, 0, 0, 0),
         [Match(42, 43, 0)],
     )
     self.assertEqual(self.mock_search_exact.call_count, 1)
Example #33
    def fuzzy_regex(label, string_list):

        ret = ''

        # Only remove the best-matched string if we have a good score. If the
        # score is poor, it may be a wrong match, and deleting it would cause
        # that field to end up with a wrong match.
        threshold_to_remove = 85

        filtered_string_list = list(
            filter(lambda x: len(x) >= len(label), string_list))

        if (len(filtered_string_list) == 0):

            ret = Regex_Utils.REGEX_FAILURE

        else:

            # First, search and extract the highest match:
            string_match = process.extractOne(label,
                                              filtered_string_list,
                                              scorer=fuzz.partial_ratio)
            string = string_match[0]
            score = string_match[1]

            # If it's a sufficiently high match, remove it from the list to help later searches
            if (score >= threshold_to_remove):
                string_list.remove(string)

            # Fuzzy-search for the best match of the label within the string:
            match = find_near_matches(label, string, max_l_dist=2)

            # Need to sort and pull out best match:

            if (len(match) == 0):
                ret = Regex_Utils.REGEX_FAILURE

            else:

                best_match = sorted(match, key=lambda i: i.dist)[0]

                # Convert the best match into actual string slice
                best_match_string = string[best_match.start:best_match.end]

                # Construct a regex based on this slice (escape it, since the
                # matched slice may contain regex metacharacters)
                regex = re.escape(best_match_string) + r'\s*(.*)'

                try:
                    # Perform the regex search to find the text of interest
                    output = re.search(regex, string)

                    ret = output.group(1)

                except AttributeError:  # re.search found nothing
                    print(label + ' Failed searches')
                    ret = Regex_Utils.REGEX_FAILURE

        return ret, string_list
def search(description: str, question: list):
    # description is split into a list of sentences
    sentences = list(filter(None, split_text(description)))
    lowerSentences = []
    for sentence in sentences:
        lowerSentences.append(sentence.lower())
    resultSentences = []

    for keyword in question:
        if not isinstance(keyword, dict):
            if len(keyword) < 5:
                max_l_dist = 0
                max_deletions = 0
                max_insertions = 0
                max_substitutions = 0
            else:
                max_l_dist = 2
                max_deletions = 4
                max_insertions = 1
                max_substitutions = 0
            for sentence in lowerSentences:
                if find_near_matches(keyword,
                                     sentence,
                                     max_l_dist=max_l_dist,
                                     max_deletions=max_deletions,
                                     max_insertions=max_insertions,
                                     max_substitutions=max_substitutions
                                     ) and sentence not in resultSentences:
                    resultSentences.append(sentence)
        else:
            synonym = list(keyword.keys())[0]
            addWords = list(keyword.values())[0]
            for word in addWords:
                for sentence in lowerSentences:
                    if find_near_matches(synonym + word,
                                         sentence,
                                         max_l_dist=2,
                                         max_deletions=4,
                                         max_insertions=2,
                                         max_substitutions=0
                                         ) and sentence not in resultSentences:
                        resultSentences.append(sentence)

    # print(resultSentences)
    return resultSentences
Example #35
def flanktrim():
    for record in SeqIO.parse(args.fasta, "fasta"):
        seqid = record.name
        seq = record.seq
        fwflank = args.fwflank
        rvflank = args.rvflank
        ed1 = round(len(fwflank) * float(args.edit))
        ed2 = round(len(rvflank) * float(args.edit))
        fwmatch = find_near_matches(fwflank, seq, max_l_dist=ed1)
        rvmatch = find_near_matches(rvflank, seq, max_l_dist=ed2)

        # continue with next locus if either forward or reverse match are not found
        if len(fwmatch) == 0 or len(rvmatch) == 0:
            continue
        # for both forward and reverse, pick the lowest-edit-distance match
        else:
            best_fw = min(fwmatch, key=lambda m: m.dist)
            fwstart, fwend = best_fw.start, best_fw.end
            fwdist, fwseq = best_fw.dist, best_fw.matched

            best_rv = min(rvmatch, key=lambda m: m.dist)
            rvstart, rvend = best_rv.start, best_rv.end
            rvdist, rvseq = best_rv.dist, best_rv.matched

        # skip this locus if the reverse match lies upstream of the forward match
        if rvend < fwstart:
            continue
        # trim region
        else:
            trim = seq[fwend:rvstart]
            trimlen = len(trim)

        # check if trimmed sequence is within the fragment length range
        if trimlen <= 20 or trimlen >= 300:
            continue
        print(seqid, "\t", trim, "\t", trimlen, "\t", fwdist, "\t", rvdist,
              "\t")
Example #36
    def is_need_out_barcode(self):
        """
        Judge out barcode in each reads pair
        Args:
        Returns: True or False

        """
        try:
            if re.match(r'[ATCG]', self.ob_read1.description.split()[1].split(":")[-1].strip()):
                return bool(
                    fuzzysearch.find_near_matches(self.out_barcode, self.ob_read1.description, 1, 0, 0, 1)
                    and fuzzysearch.find_near_matches(self.out_barcode, self.ob_read2.description, 1, 0, 0, 1))
            else:
                return True
        except IndexError:
            return True
    def test_all_none_except_max_l_dist(self):
        self.patch_concrete_search_classes()
        self.mock_find_near_matches_levenshtein.return_value = [Match(42, 43, 0)]

        self.assertEqual(
            find_near_matches('a', 'a', max_l_dist=1),
            [Match(42, 43, 0)],
        )
        self.assertEqual(self.mock_find_near_matches_levenshtein.call_count, 1)
Example #38
    def search_teacher(self, teacher_name):
        matched_teachers = []
        all_teachers = self.teachers_database.find()

        for teacher in all_teachers:
            if find_near_matches(teacher_name, teacher['name'], max_l_dist=1):
                matched_teachers.append(teacher)

        return matched_teachers
    def test_only_substitutions(self):
        self.patch_concrete_search_classes()
        self.mock_find_near_matches_substitutions.return_value = [Match(42, 43, 0)]

        self.assertEqual(
            find_near_matches('a', 'a', 1, 0, 0),
            [Match(42, 43, 0)],
        )
        self.assertEqual(
            self.mock_find_near_matches_substitutions.call_count,
            1,
        )

        self.assertEqual(
            find_near_matches('a', 'a', 1, 0, 0, 1),
            [Match(42, 43, 0)],
        )
        self.assertEqual(
            self.mock_find_near_matches_substitutions.call_count,
            2,
        )
    def test_generic(self):
        self.patch_concrete_search_classes()
        self.mock_find_near_matches_generic.return_value = [Match(42, 43, 0)]

        self.assertEqual(
            find_near_matches('a', 'a', 1, 1, 1),
            [Match(42, 43, 0)],
        )
        self.assertEqual(
            self.mock_find_near_matches_generic.call_count,
            1,
        )

        self.assertEqual(
            find_near_matches('a', 'a', 1, 1, 1, 2),
            [Match(42, 43, 0)],
        )
        self.assertEqual(
            self.mock_find_near_matches_generic.call_count,
            2,
        )
    def define_adapter_presence_substitutions_only(self, adapter, max_substitutions):
        """Sets the adapter_absent attribute according to whether a match is found with a number of substitutions
        less or equal to
        edit_threshold.

        :rtype : None
        Args:
            self, adapter, edit_threshold

        Returns:
            None

        Args:
            self, adapter, max_substitutions

        Returns:
            None
        """
        # Solves the straightforward case where the adapter is exactly present in the read
        if adapter in self.sequence_line:
            self.adapter_present = True
            return
        # Otherwise, look for an approximate match of the adapter at most max_substitutions away.
        # First do a preliminary (hopefully faster) check for an approximate match of the adapter
        # in the read sequence within a Levenshtein distance equal to the allowed number of
        # substitutions. Two sequences less than N substitutions apart are automatically less
        # than N edits apart (the reciprocal is not true). If the Levenshtein code is faster, it
        # saves a lot of time by pre-filtering the reads within the Levenshtein distance, which
        # are much more likely to contain adapters. The slower code checking the mismatch
        # distance is then called on these pre-filtered reads to confirm whether it is an actual
        # substituted match or whether the Levenshtein match involved insertions and deletions.
        # If 1 or more approximate matches of the adapter were found within a Levenshtein distance of max_substitutions
        if len(fuzzysearch.find_near_matches(adapter, self.sequence_line, max_l_dist=max_substitutions)):
            # scan the read sequence for a potential substituted match
            # for each subsequence of the sequence_line of length identical to the adapter (last starting position is
            # length of the adapter before the end of the read)
            for index in range(len(self.sequence_line) - len(adapter) + 1):
                # check if the adapter is less than max_substitutions away from the subsequence
                result = ApproxMatch.approx_substitute(adapter, self.sequence_line[index:index + len(adapter)],
                                                       max_substitutions)
                # If a match was found (result is TRUE)
                if result:
                    # set the adapter presence to True (and stop the function here)
                    self.adapter_present = True
                    return
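The prefilter rests on the fact that N substitutions imply a Levenshtein distance of at most N, while the converse fails: an indel match within distance N need not be a substituted match. A small sketch of both cases:

import fuzzysearch

# 1 substitution ("ACGT" -> "AXGT") is also within Levenshtein distance 1:
print(fuzzysearch.find_near_matches("ACGT", "ttAXGTtt", max_l_dist=1))
# One deletion ("ACGT" -> "AGT") passes the Levenshtein prefilter ...
print(fuzzysearch.find_near_matches("ACGT", "ttAGTtt", max_l_dist=1))
# ... but is rejected when only substitutions are allowed, which is what
# the slower confirmation pass above checks:
print(fuzzysearch.find_near_matches("ACGT", "ttAGTtt", max_substitutions=1,
                                    max_insertions=0, max_deletions=0))  # []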
Example #42
 def fuzzy_search_barcode(self, barcodes_name, inner_barcode):
     """
     Fuzzy search inner barcode in barcodes table, for sequencing company's problem
     :param barcodes_name:
     :param inner_barcode:
     :return: the position of barcodes table
     """
     position = 0
     flag = 0
     tag = ''
     while position < len(setting.SeqIndex.barcode[barcodes_name]):
         if flag > 1:
             tag = ''
             break
         if fuzzysearch.find_near_matches(setting.SeqIndex.barcode[barcodes_name][position], inner_barcode, 1, 1, 1, 1):
             tag = str(position + 1)
             flag += 1
         position += 1
     return tag
    def test_zero_max_l_dist(self):
        self.patch_concrete_search_classes()
        self.mock_search_exact.return_value = [Match(42, 43, 0)]

        call_count = 0
        for (max_subs, max_ins, max_dels) in [
            (1, 0, 0),
            (0, 1, 0),
            (0, 0, 1),
            (1, 1, 0),
            (1, 0, 1),
            (0, 1, 1),
            (1, 1, 1),
        ]:
            with self.subTest('max_l_dist=0, max_subs={0}, max_ins={1}, max_dels={2}'.format(
                    max_subs, max_ins, max_dels)):
                self.assertEqual(
                    find_near_matches('a', 'a', max_subs, max_ins, max_dels, 0),
                    [Match(42, 43, 0)],
                )
                call_count += 1
                self.assertEqual(self.mock_search_exact.call_count, call_count)
 def search(self, subsequence, sequence, max_l_dist):
     return find_near_matches(subsequence, sequence, max_l_dist=max_l_dist)
 def search(self, subsequence, sequence, max_subs):
     return find_near_matches(subsequence, sequence,
                              max_insertions=0, max_deletions=0,
                              max_substitutions=max_subs)
 def search(self, pattern, sequence, max_subs, max_ins, max_dels,
            max_l_dist=None):
     return find_near_matches(pattern, sequence,
                              max_subs, max_ins, max_dels, max_l_dist)
 def test_no_limitations(self):
     with self.assertRaises(Exception):
         find_near_matches('a', 'a')
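As the test above asserts, find_near_matches refuses to run with no limits at all; at least one max_* parameter has to be supplied:

from fuzzysearch import find_near_matches

try:
    find_near_matches('a', 'a')
except Exception as exc:  # the library rejects calls with no limits set
    print(type(exc).__name__)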
		for i in range(0,5):
			counts.append(Pixet_dict[PixEt][i][1])
		Pixet_least_count_dict[PixEt] = counts[4]
	elif PixEt in Pixet_dict and len(Pixet_dict[PixEt]) < 5:
		Too_few_Pixets_list.append(PixEt)

print "Pixets with 5 or more sequences: %s" % len(Pixet_least_count_dict)
print "Pixets with fewer than 5 sequences: %s" % len(Too_few_Pixets_list)

Pixets_ordered_by_least_count = sorted(Pixet_least_count_dict.items(), key=operator.itemgetter(1))
Pixets_ordered_by_least_count.reverse() #makes into a descending tuple
for P in Pixets_ordered_by_least_count:
	Pixets_by_least_count_list.append(P[0])

for doubles in SeqIO.parse("%s/double_expansion_sequences_two_read_seqs.fastq" % Data_Path, "fastq"):
    repeats = fuzzysearch.find_near_matches(Repeat, doubles.seq, max_l_dist=dist_repeat)
    spacer_list = [doubles.seq[repeats[0].end:repeats[1].start]]
    spacer_list.append(doubles.seq[repeats[1].end:repeats[2].start])
    double_list.append(spacer_list)
for doubles in SeqIO.parse("%s/double_expansion_sequences_three_read_seqs.fastq" % Data_Path, "fastq"):
    repeats = fuzzysearch.find_near_matches(Repeat, doubles.seq, max_l_dist=dist_repeat)
    spacer_list = [doubles.seq[repeats[0].end:repeats[1].start]]
    spacer_list.append(doubles.seq[repeats[1].end:repeats[2].start])
    double_list.append(spacer_list)
for triples in SeqIO.parse("%s/triple_expansion_sequences_three_read_seqs.fastq" % Data_Path, "fastq"):
    repeats = fuzzysearch.find_near_matches(Repeat, triples.seq, max_l_dist=dist_repeat)
    spacer_list = [triples.seq[repeats[0].end:repeats[1].start]]
    spacer_list.append(triples.seq[repeats[1].end:repeats[2].start])
    spacer_list.append(triples.seq[repeats[2].end:repeats[3].start])
    triple_list.append(spacer_list)
    def test_unlimited_parameter(self):
        with self.assertRaises(Exception):
            find_near_matches('a', 'a', max_substitutions=1)

        with self.assertRaises(Exception):
            find_near_matches('a', 'a', max_insertions=1)

        with self.assertRaises(Exception):
            find_near_matches('a', 'a', max_deletions=1)

        with self.assertRaises(Exception):
            find_near_matches('a', 'a', max_substitutions=1, max_insertions=1)

        with self.assertRaises(Exception):
            find_near_matches('a', 'a', max_substitutions=1, max_deletions=1)

        with self.assertRaises(Exception):
            find_near_matches('a', 'a', max_insertions=1, max_deletions=1)