Example #1
    def get_nn(self, q, nonn="None"):
        from Levenshtein import hamming

        assert (len(q) == self.l)

        set_for_check = set()

        # Pigeonhole lookup: any read within Hamming distance tau of q
        # must share at least one of the tau + 1 pieces with it.
        for j in range(self.tau + 1):
            substr = q[j * self.piece_len:j * self.piece_len +
                       self.piece_lens[j]]
            if substr in self.sets_interval[j]:
                set_for_check.update(self.sets_interval[j][substr])

        if len(set_for_check) == 0:
            return None

        i = min(set_for_check,
                key=lambda i: (hamming(self.reads[i], q), -self.priority[i]))

        result = self.reads[i]

        if hamming(result, q) > self.tau and nonn == "None":
            result = None

        return result
Example #2
def hamming_graph_naive(reads, tau=1, **kwargs):
    """
    Construct hamming(tau) graph using naive O(N**2 d) algorithm
    """
    import igraph as ig
    import numpy as np
    from Levenshtein import hamming

    l = len(reads[0])

    for read in reads:
        assert (len(read) == l)

    N = len(reads)
    m = np.zeros((N, N), dtype=int)

    for i in range(N):
        for j in range(i):
            dist = hamming(reads[i], reads[j])
            m[i, j] = m[j, i] = dist if dist <= tau else 0

    # Note: zero entries in the matrix mean "no edge" here, so duplicate
    # reads (distance 0) never produce an edge.
    g = ig.Graph.Weighted_Adjacency(m.tolist(),
                                    mode="UNDIRECTED",
                                    attr="weight",
                                    loops=False)

    g.vs["read"] = reads

    for attr_name, attr_data in kwargs.items():
        g.vs[attr_name] = attr_data

    return g
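A minimal usage sketch (the reads and the extra `count` attribute below are invented for illustration; python-igraph and python-Levenshtein must be installed):

reads = ["ACGT", "ACGA", "TTTT"]
g = hamming_graph_naive(reads, tau=1, count=[10, 5, 1])
print(g.ecount())      # 1 -- only ACGT/ACGA are within distance 1
print(g.es["weight"])  # [1]
print(g.vs["count"])   # [10, 5, 1]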
Example #3
def filter(msg, filterWords, englishWords, hammingDistance,
           levenshteinDistance):
    from Levenshtein import distance, hamming

    #Strip non-alphanumeric characters from the string
    filteredMsg = ''.join(e for e in msg if e.isalnum())
    #for each word in filter list
    for word in filterWords:
        #For each criterion, make sure it's not a proper English word before filtering it:
        #if a word matches a filter criterion and is not an English word, an asterisk string is returned.
        #Only check Hamming distance when the strings are the same length
        if len(word) == len(filteredMsg):
            #if the Hamming distance is the value passed in or less, censor it
            if hamming(word, filteredMsg) <= hammingDistance:
                if not WordChecker.check_word_exists_in(
                        englishWords, filteredMsg):
                    return generateRandomAsteriskString()
            #if not within hamming distance, check levenshtein distance is within range to be filtered
            elif distance(word, filteredMsg) <= levenshteinDistance:
                if not WordChecker.check_word_exists_in(
                        englishWords, filteredMsg):
                    return generateRandomAsteriskString()
        #if not the same length, check whether the Levenshtein distance is within range
        elif abs(len(word) - len(filteredMsg)) <= levenshteinDistance:
            if distance(word, filteredMsg) <= levenshteinDistance:
                if not WordChecker.check_word_exists_in(
                        englishWords, filteredMsg):
                    return generateRandomAsteriskString()
    # Otherwise, return original string
    return msg
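A hypothetical call (the word lists and the WordChecker / generateRandomAsteriskString helpers are assumed to exist elsewhere in the project):

# "h3llo" is within Hamming distance 1 of "hello" and is not an English
# word, so an asterisk string would be returned here.
censored = filter("h3llo", ["hello"], english_words, 1, 2)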
Example #4
from typing import Dict, Iterator

import numpy as np
from Levenshtein import hamming

def hamming_distance(words: Iterator[str], vocabulary: Dict[str, int]):
    """Corrects the words based on Hamming distances

    Args:
        words (Iterator[str]): Iterator over the misspelled words
        vocabulary (Dict[str,int]) : dictionary holding words and their frequency
    """

    for word in words:
        distances = []
        suggestions = []
        vocab_list = list(vocabulary)
        for vocab in vocab_list:
            if len(vocab) == len(word):
                distances.append(hamming(word, vocab))
            else:
                # sentinel distance larger than any real word length
                distances.append(120)
        idx = np.array(distances).argsort()[:5]
        
        # Among the closest candidates, break distance ties in favor of
        # the more frequent vocabulary word.
        for i in range(len(idx)):
            for j in range(i+1, len(idx)):
                if distances[idx[i]] == distances[idx[j]]:
                    if vocabulary.get(vocab_list[idx[i]]) < vocabulary.get(vocab_list[idx[j]]):
                        idx[i], idx[j] = idx[j], idx[i]

        for i in idx:
            suggestions.append(vocab_list[i])

        output("{misspelled}\t{corrections}".format(
            misspelled=word,
            corrections="\t".join(suggestions)
        ))  # may cause IO bottleneck
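The argsort-plus-swap passes above can be expressed as one composite sort key; a sketch (not part of the original) that keeps the same 120 sentinel and the same frequency tie-break:

from Levenshtein import hamming

def top5_suggestions(word, vocabulary):
    """Rank vocabulary by (Hamming distance, -frequency) and keep five."""
    def dist(vocab):
        # same sentinel as above for length mismatches
        return hamming(word, vocab) if len(vocab) == len(word) else 120
    return sorted(vocabulary, key=lambda v: (dist(v), -vocabulary[v]))[:5]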
Example #5
File: utils.py Project: suleymanov/BCD100
def hamm(str1, str2):
    """ Hamming distance with option to return None
    for strings of different lengths.
    :param str1: string
    :param str2: string
    :return: int or None
    """
    return hamming(str1, str2) if len(str1) == len(str2) else None
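A quick check of the length guard:

assert hamm("karolin", "kathrin") == 3
assert hamm("short", "shorter") is None  # different lengths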
Example #6
def checkHamming(barcodes, barcode):
    match = False
    for bc in barcodes:
        hd = hamming(barcode, bc)
        if hd <= 2:
            match = True
            barcode = bc
            break
    return (match, barcode)
Example #7
def checkHamming(barcodes, barcode):
    '''Given a list of barcodes, check whether the given barcode is within
    edit distance 2 of any barcode in the list.'''
    match = False
    for bc in barcodes:
        hd = hamming(barcode, bc)
        if hd <= 2:
            match = True
            barcode = bc
            break
    return (match, barcode)
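Hypothetical usage, correcting an observed barcode against a small whitelist (assumes `from Levenshtein import hamming` as in the other examples):

match, corrected = checkHamming(["ACGT", "TTTT"], "ACGA")
# hamming("ACGA", "ACGT") == 1 <= 2, so match is True and corrected == "ACGT"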
Example #8
def segment_match(feature_strings, target_segment):
    '''Returns the best match for the IPA string of the given Segment, from the
    given list of tuples containing feature strings. The first item in each
    tuple is the phoneme and the second is the feature string.

    '''
    target_feature_string = feature_string(target_segment)

    # If the segment has previously been matched, return the cached value
    if target_feature_string in deparse_cache:
        return deparse_cache[target_feature_string]

    # Find the distance of the initial candidate to serve as a benchmark.
    best_distance = hamming(target_feature_string, feature_strings[0][1])
    best_strings = [feature_strings[0][0]]

    # Loop through the rest of the available strings. If the distance between
    # the string and the target is greater than the current best, jump to the
    # next string. Otherwise, if it's the same add it to best_strings, or if
    # it's less overwrite best_strings.
    for string in feature_strings[1:]:
        new_distance = hamming(target_feature_string, string[1])

        if new_distance > best_distance:
            continue

        elif new_distance < best_distance:
            best_distance = new_distance
            best_strings = [string[0]]

        else:
            best_strings.append(string[0])

    # Find the shortest of these strings, because we want to deparse
    # into the simplest segments possible.
    deparsed_segment = min(best_strings, key=len)

    # Add the new match to the cache.
    deparse_cache[target_feature_string] = deparsed_segment

    return deparsed_segment
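Ignoring the cache, the benchmark-and-scan loop is essentially one min() call with a composite key; a sketch of that reduction (ties on distance fall back to the shorter phoneme, matching the final min(best_strings, key=len)):

from Levenshtein import hamming

def segment_match_compact(feature_strings, target_feature_string):
    phoneme, _ = min(
        feature_strings,
        key=lambda fs: (hamming(target_feature_string, fs[1]), len(fs[0])),
    )
    return phoneme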
Example #9
def segment_match(feature_strings, target_segment):
    '''Returns the best match for the IPA string of the given Segment, from the
    given list of tuples containing feature strings. The first item in each
    tuple is the phoneme and the second is the feature string.

    '''
    target_feature_string = feature_string(target_segment)

    # If the segment has previously been matched, return the cached value
    if target_feature_string in deparse_cache:
        return deparse_cache[target_feature_string]

    # Find the distance of the initial candidate to serve as a benchmark.
    best_distance = hamming(target_feature_string, feature_strings[0][1])
    best_strings = [feature_strings[0][0]]

    # Loop through the rest of the available strings. If the distance between
    # the string and the target is greater than the current best, jump to the
    # next string. Otherwise, if it's the same add it to best_strings, or if
    # it's less overwrite best_strings.
    for string in feature_strings[1:]:
        new_distance = hamming(target_feature_string, string[1])

        if new_distance > best_distance:
            continue

        elif new_distance < best_distance:
            best_distance = new_distance
            best_strings = [string[0]]

        else:
            best_strings.append(string[0])

    # Find the shortest of these strings, because we want to deparse
    # into the simplest segments possible.
    deparsed_segment = min(best_strings, key=len)

    # Add the new match to the cache.
    deparse_cache[target_feature_string] = deparsed_segment

    return deparsed_segment
Example #10
def main():
    args = get_args()
    #pdb.set_trace()
    fastqs = fastq.FastqReader(args.input)
    out_fastq = fastq.FastqWriter(args.output)
    for read in fastqs:
        read_tag = read.identifier.split('#')[-1].split('/')[0]
        # keep only reads whose tag is within one mismatch of the target tag
        if hamming(read_tag, args.tag) <= 1:
            out_fastq.write(read)
    out_fastq.close()
Example #11
def main():
    args = get_args()
    #pdb.set_trace()
    fastqs = fastq.FastqReader(args.input)
    out_fastq = fastq.FastqWriter(args.output)
    for read in fastqs:
        read_tag = read.identifier.split('#')[-1].split('/')[0]
        # keep only reads whose tag is within one mismatch of the target tag
        if hamming(read_tag, args.tag) <= 1:
            out_fastq.write(read)
    out_fastq.close()
Example #12
	def check_hamming_distance(self, iList, datatype, d_type, split_line):
		MAX_SPELLING_ERRORS = 2

		if len(d_type) == len(datatype):
			if hamming(d_type, datatype) <= MAX_SPELLING_ERRORS:
				print('\tno regex match on %s, using hamming distance' % datatype)
				if datatype == 'indicators of compromise':
					return self.ret_indicators_of_compromise(iList)
				return ''.join(split_line[1:])
		return ''
Example #13
def merge_paths(paths, MIN_DIST=1):
    paths_sorted = sorted(paths, key=lambda tup: tup[1])
    num_paths = len(paths)

    paths_merged = {tup[0]: tup for tup in paths_sorted}
    get_seq = lambda tup: tup[0]
    for (i, path) in enumerate(paths_sorted):
        for j in range(i + 1, num_paths):
            ham_dist = hamming(get_seq(paths_sorted[i]), get_seq(paths_sorted[j]))
            if ham_dist <= MIN_DIST:
                # of two near-identical paths, drop the one with the smaller score
                bad_path = min([paths_sorted[i], paths_sorted[j]], key=lambda tup: tup[1])
                if get_seq(bad_path) in paths_merged:
                    del paths_merged[get_seq(bad_path)]
    return list(paths_merged.values())
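A small usage sketch with invented (sequence, abundance) tuples:

paths = [("AAAA", 10), ("AAAT", 2), ("CCCC", 7)]
print(merge_paths(paths, MIN_DIST=1))
# "AAAT" is within distance 1 of "AAAA" and has the lower abundance,
# so it is dropped: [('CCCC', 7), ('AAAA', 10)]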
Example #14
def hamming_graph_knuth(reads, tau=1, **kwargs):
    """
    Construct hamming(tau) graph using Knuth's algorithm
    """
    import igraph as ig
    from collections import defaultdict
    from Levenshtein import hamming

    l = len(reads[0])

    for read in reads:
        assert (len(read) == l)

    piece_len = int(l / (tau + 1))
    piece_len_last = l - piece_len * tau

    piece_lens = [piece_len] * tau + [piece_len_last]

    g = ig.Graph(len(reads))
    g.vs["read"] = reads

    for attr_name, attr_data in kwargs.items():
        g.vs[attr_name] = attr_data

    edges_for_check = set()

    for j in range(tau + 1):
        sets = defaultdict(list)
        for i in range(len(reads)):
            substr = reads[i][j * piece_len:j * piece_len + piece_lens[j]]
            sets[substr].append(i)

        for v_list in sets.values():
            # print("N += %d" % len(v_list))
            for i1 in range(len(v_list)):
                for i2 in range(i1 + 1, len(v_list)):
                    edges_for_check.add((v_list[i1], v_list[i2]))

    # print("Edges for check %d" % len(edges_for_check))
    for v1, v2 in edges_for_check:
        read1, read2 = reads[v1], reads[v2]
        d = hamming(read1, read2)
        if d <= tau:
            g.add_edge(v1, v2, weight=d)

    return g
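The bucketing step works because of the pigeonhole principle: two equal-length strings within Hamming distance tau must agree on at least one of the tau + 1 pieces, so every true edge lands in some bucket. A tiny sketch of that invariant:

def share_a_piece(a, b, tau):
    """True if a and b agree on at least one of the tau + 1 pieces."""
    l = len(a)
    piece_len = int(l / (tau + 1))
    piece_lens = [piece_len] * tau + [l - piece_len * tau]
    start = 0
    for p in piece_lens:
        if a[start:start + p] == b[start:start + p]:
            return True
        start += p
    return False

# distance 1 <= tau, so at least one piece must match
assert share_a_piece("ACGTAC", "ACGTAT", tau=1)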
Example #15
def assign_read(params):
    (consensus_bcs, (reads_data, reads_offset), (barcodes_data,
                                                 barcodes_offset)) = params

    obs_bc = reads_data[1].strip()[args['barcode_start']:args['barcode_end']]

    min_dist = None
    assignment = []
    for consensus_bc in consensus_bcs:
        dist = hamming(obs_bc, consensus_bc)
        if min_dist is None or dist < min_dist:
            min_dist = dist
            assignment = [consensus_bc]
        #in the case of a tie,
        elif dist == min_dist:
            assignment.append(consensus_bc)
    #return the best unique assignment
    if len(assignment) == 1:
        return (assignment[0], reads_offset, barcodes_offset)
    #or don't assign read (in the case of a tie)
    return ('unassigned', reads_offset, barcodes_offset)
Example #16
    group_cnt += 1
    if group_cnt < 39490:
        continue
    print(index)

    groupname1, groupname2 = row[0], row[1]
    dna_series = df[(df[0] == groupname1) & (df[1] == groupname2)][2]

    #dna_list holds the raw data, dna_result_dict the grouped results
    dna_list = list(dna_series)  #store this group's sequences in a list
    dna_result_dict = {}

    for dna in dna_list:
        isNewKey = True
        for key in dna_result_dict.keys():
            if hamming(dna, key) < 3:  #distance below 3 counts as similar
                dna_result_dict[key].append(dna)
                isNewKey = False
                break
        if isNewKey:
            dna_result_dict[dna] = [dna]

    #sequences with no similar partner go into their own group
    no_match_list = []
    group_index = 1
    for key, value in dna_result_dict.items():
        if len(value) == 1:
            no_match_list.append(key)
            # del dna_result_dict[key]
        else:
            result_list.append([groupname1, groupname2, group_index, value])
Example #17
 def dist(i, j):
     return hamming(reads[i], reads[j])
Example #18
 def test_zero_differences(self):
     """[hamming-c] no differences"""
     expected = 0
     observed = hamming('wonderful', 'wonderful')
     self.assertEqual(expected, observed)
Example #19
                      priority=original_barcode_mult)
    print "Index constructed"

    bad_barcodes = []
    barcode_barcode = {}
    dists = []

    for barcode in data_barcodes:
        if barcode in original_barcodes:
            barcode_barcode[barcode] = barcode
            dists.append(0)
        else:
            neib = tree.get_nn(barcode, nonn="None")
            if neib is not None:
                barcode_barcode[barcode] = neib
                dists.append(hamming(neib, barcode))
            else:
                bad_barcodes.append(barcode)
                dists.append(-1)

    print "Bad-coded reads %d unique barcodes %d" % (sum(barcodes_count[barcode] for barcode in bad_barcodes),
                                                     len(bad_barcodes))

    print "Well-coded reads %d unique barcodes %d" % (sum(barcodes_count[barcode] for barcode in barcode_barcode.iterkeys()),
                                                      len(barcode_barcode))


    dist_hist = defaultdict(int)
    dist_barcodes = defaultdict(list)
    for dist, barcode in zip(dists, data_barcodes):
        dist_hist[dist] += 1
Example #20
    remaining_seqs.pop(max_seq)
    seqs_in_cluster = 1
    for sample_ID in all_samples_reads:
        if max_seq in all_samples_reads[sample_ID]:
            all_samples_clusters[sample_ID][max_seq] = all_samples_reads[
                sample_ID][max_seq]

    for next_seq, matches in sorted(remaining_seqs.items(),
                                    key=lambda x: x[1],
                                    reverse=True):

        if len(next_seq) != len(max_seq):
            mismatches = 99  #wrong length: sentinel that blocks a Hamming match
        else:
            mismatches = hamming(max_seq, next_seq)

        if mismatches <= 1 or (next_seq
                               in max_seq):  #allow missing bases at ends

            #add_to_cluster
            current_cluster += remaining_seqs[next_seq]

            for sample_ID in all_samples_reads:
                if next_seq in all_samples_reads[sample_ID]:
                    increment(all_samples_clusters[sample_ID], max_seq,
                              all_samples_reads[sample_ID][next_seq])

            remaining_seqs.pop(next_seq)

            seqs_in_cluster += 1
Example #21
  def even_split_mismatching(self, kmers, kmer_dict, rev_kmer_dict, peptide_length):
    '''Look up each evenly spaced k-mer of a peptide and sum Hamming
    distances to its neighboring k-mers to find near-matches.
    '''
    # record matches in a set so as to not duplicate matches
    matches = set()

    for i in range(0, len(kmers), self.split):

      # find each hit for each k-mer
      try:
        for hit in kmer_dict[kmers[i]]:

          mismatches = 0

          # if the k-mer is found in the middle or end, check the neighboring
          # k-mers to the left
          for j in range(0, i, self.split):
            
            # use reverse dictionary to retrieve k-mers for Hamming distance
            try:
              mismatches += hamming(rev_kmer_dict[hit+j-i], kmers[j])
              
              # if mismatches ever reach threshold, break out of loop
              if mismatches >= self.max_mismatches + 1:
                break

            # if first k-mer finds nothing, set mismatches to 100 to disqualify this
            # peptide from matching with this area
            except KeyError:
              mismatches = 100

          # if the k-mer is found in the middle or end, check the neighboring
          # k-mers to the right
          for k in range(i+self.split, len(kmers), self.split):
            try:

              # use reverse dictionary to retrieve k-mers for Hamming distance
              mismatches += hamming(rev_kmer_dict[hit+k-i], kmers[k])

              # if mismatches ever reach threshold, break out of loop
              if mismatches >= self.max_mismatches + 1:
                break

            # if last k-mer finds nothing, set mismatches to 100 to disqualify this
            # peptide from matching with this area
            except KeyError:
              mismatches = 100

          # if the total mismatch count stayed below the threshold for all
          # neighbors, then it's a match
          if mismatches < self.max_mismatches + 1:
            matched_peptide = ''

            try:
              for s in range(0, peptide_length, self.split):
                matched_peptide += rev_kmer_dict[hit-i+s]
            except KeyError:
              continue

            matches.add((matched_peptide, mismatches, hit - i))

            if self.best_match and not mismatches:
              return matches

      # if nothing is found, you can check the next k-mer, since it can still be a match
      except KeyError:
        continue

    return matches
Example #22
def determine_edit_distance(G1,G2):
	G1str = str(graph2str(G1))
	G2str = str(graph2str(G2))
	#G1str and G2str MUST have same length
	return hamming(G1str,G2str)
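graph2str is not shown here; a plausible stand-in (an assumption, not the project's actual helper) flattens an adjacency matrix, so graphs with the same number of vertices always serialize to equal-length strings:

def graph2str(G):
    # hypothetical: G is an adjacency matrix (list of lists of 0/1 flags)
    return ''.join(str(cell) for row in G for cell in row)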
Example #23
 def test_one_substitution(self):
     """[hamming-c] one difference"""
     expected = 1
     observed = hamming('wonderful', 'wondirful')
     self.assertEqual(expected, observed)
Example #24
 def test_zero_differences(self):
     """[hamming-c] no differences"""
     expected = 0
     observed = hamming('wonderful', 'wonderful')
     self.assertEqual(expected, observed)
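A natural companion to the two tests above (a sketch, relying on python-Levenshtein raising ValueError for strings of unequal length):

 def test_unequal_lengths(self):
     """[hamming-c] unequal lengths raise"""
     with self.assertRaises(ValueError):
         hamming('wonderful', 'wonder')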
Example #25
def hammng(a, b):
    """return the hamming distance between a and b"""
    return hamming(a, b)
Example #26
def matched_name_in_snt(name, alias_names, snt, mode):
    user_utterance = snt
    domEnt = name
    alias_domEnts = alias_names
    # check 4968
    if mode == 'exact':
        # hard matching
        # escape the entity in case it contains regex metacharacters
        pattern = r"(^{0}\W|\W{0}\W|\W{0}$)".format(re.escape(domEnt))
        searchObj = re.search(pattern, user_utterance, re.I)
        if searchObj:
            return True
        
    if mode == 'hamming':
        # hamming distance; names this short are too error-prone to fuzzy-match
        if len(domEnt) <= 4:
            return False

        for start in range(len(user_utterance) - len(domEnt) + 2):
            ## only consider windows that sit on word boundaries
            if (
                    start == 0 and start + len(domEnt) < len(user_utterance) and\
                        not user_utterance[start + len(domEnt)].isalnum() or\
                    start == len(user_utterance) - len(domEnt) and start > 0 and\
                        not user_utterance[start-1].isalnum() or\
                    (start-1 > 0 and not user_utterance[start-1].isalnum() and\
                     start + len(domEnt) < len(user_utterance) and\
                     not user_utterance[start + len(domEnt)].isalnum())
                ):
                

                if hamming(domEnt.lower(), user_utterance.lower()[start: start + len(domEnt)]) <= 1 and user_utterance[start].lower() == domEnt[0].lower():
                    return True
            # +1
            if (
                    start == 0 and start + len(domEnt) + 1 < len(user_utterance) and\
                        not user_utterance[start + len(domEnt) + 1].isalnum() or\
                    start == len(user_utterance) - len(domEnt) - 1 and start > 0 and\
                        not user_utterance[start-1].isalnum() or\
                    (start-1 > 0 and not user_utterance[start-1].isalnum() and\
                     start + len(domEnt) + 1 < len(user_utterance) and\
                     not user_utterance[start + len(domEnt) + 1].isalnum())
                ):
                
                if fuzz.ratio(domEnt.lower(), user_utterance.lower()[start: start + len(domEnt) + 1]) >= 90 and user_utterance[start].lower() == domEnt[0].lower():
                    return True
    
            # -1
            if (
                    start == 0 and start + len(domEnt) - 1 < len(user_utterance) and\
                        not user_utterance[start + len(domEnt) - 1].isalnum() or\
                    start == len(user_utterance) - len(domEnt) + 1 and start > 0 and\
                        not user_utterance[start-1].isalnum() or\
                    (start-1 > 0 and not user_utterance[start-1].isalnum() and\
                     start + len(domEnt) - 1 < len(user_utterance) and\
                     not user_utterance[start + len(domEnt) - 1].isalnum())
                ):
                if fuzz.ratio(domEnt.lower(), user_utterance.lower()[start: start + len(domEnt) - 1]) >= 90 and user_utterance[start].lower() == domEnt[0].lower():
                    return True
                      
    
    if mode == 'alias':
        # alias matching
        for domEnt in alias_domEnts:
            pattern = r"(^{0}\W|\W{0}\W|\W{0}$)".format(re.escape(domEnt))
            searchObj = re.search(pattern, user_utterance, re.I)
            if searchObj:
                if user_utterance[searchObj.span(1)[0]+1].isupper():
                    return True
                else:
                    # print (domEnt, "vs", user_utterance)
                    return False
            # If we require uppercase we cannot match 5325, 5326, etc., where the user types a lowercase incomplete entity name;
            # but without the uppercase check, "ask" would be identified as Ask Restaurant.
            
      
    if mode == 'lemma':
        lemmatizer = WordNetLemmatizer().lemmatize # lemmatizer function or None
        tokenizer = nltk.tokenize.WordPunctTokenizer().tokenize
        
        tokenized = tokenizer(user_utterance)
        tokenized_pos = nltk.pos_tag(tokenized)
        tokenized_joined = " ".join([lemmatize(lemmatizer, token.lower(),
                                               get_wordnet_pos(pos) or wordnet.NOUN).lower()
                                     for token, pos in tokenized_pos])
        # lemma matching
        pattern = r"(^{0}\W|\W{0}\W|\W{0}$)".format(re.escape(domEnt))
        searchObj = re.search(pattern, tokenized_joined, re.I)
        if searchObj:
            return True
    return False
Example #27
def hammng(a, b):
    """return the hamming distance between a and b"""
    return hamming(a, b)
Example #28
 def test_one_substitution(self):
     """[hamming-c] one difference"""
     expected = 1
     observed = hamming('wonderful', 'wondirful')
     self.assertEqual(expected, observed)
Example #29
bc = dict()
first = True
with open(args.barcodesFile, 'r') as f:
    for line in f:
        barcode, count = line.rstrip().split("\t")
        count = int(count)
        if first:
            bc[barcode] = count
            first = False
        else:
            found = False
            for b in list(bc):
                # Hamming distance is only defined for equal lengths. Different lengths should not occur under the current CaTCH design, but may come from diagnostic counts or unforeseen irregularities.
                # Explicitly exclude any diagnostic categories from having their distances compared.
                if len(b) == len(barcode) and b not in ["unknown", "SampleUnknown", "unmatched", "BCUnmatched", "empty", "EmptyVector", "spike", "SpikeIn"]:
                    h = hamming(b, barcode)
                    sh = str(h)
                    hbcstat.update([sh])
                    if sh in hrmin:
                        hrmin[sh] = min([hrmin[sh], count, bc[b]])
                        hrmax[sh] = max([hrmax[sh], count, bc[b]])
                    else:
                        hrmin[sh] = count
                        hrmax[sh] = count
                    if h <= args.hammDist and not found:
                        if bc[b] >= count:
                            # If existing sequence is more abundant, update its count and discard the new sequence
                            # This is the most likely scenario, as the quantifier orders the barcodes by decreasing abundance.
                            bc[b] = bc[b] + count
                            found = True
                            # break