Example #1
def hamming(ahash, phash):
    """return the 10 most similar pics based on the hamming distance"""
    result = find_aHash()
    dist_a = 9
    ahash_db = 0
    phash = str(convert_hex_to_bin(phash))
    for i in result:
        if distance.hamming(i, ahash) < dist_a:
            dist_a = distance.hamming(i, ahash)
            ahash_db = i
    result_p = find_pHash_a(ahash_db)
    print(result_p)
    resu = []
    for x in result_p:
        y = list(x)
        y[0] = str(convert_hex_to_bin(y[0]))
        y.append(distance.hamming(phash, y[0]))
        resu.append(y)
    resu = sorted(resu, key=lambda x: x[2])
    if len(resu) <= 10:
        return resu
    else:
        return resu[0:10]
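A minimal sketch of the convert_hex_to_bin helper this function appears to assume (find_aHash and find_pHash_a are database lookups not shown here):

def convert_hex_to_bin(hex_str):
    # hex digest -> fixed-width binary string, 4 bits per hex digit
    return bin(int(hex_str, 16))[2:].zfill(len(hex_str) * 4)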
Example #2
    def _getLinkerIndices(self, readKmers):
        minLinker1Ed = 2
        minLinker2Ed = 2
        linker1Index = None
        linker2Index = None
        for i in range(len(readKmers)):
            kmer = readKmers[i]
            linker1Ed = distance.hamming(kmer,self.linker1)
            if linker1Ed < minLinker1Ed:
                minLinker1Ed = linker1Ed
                linker1Index = i
                break
        if linker1Index is not None:
            linker2Start = linker1Index + 15
        else:
            linker2Start = 0

        for i in range(linker2Start,len(readKmers)):
            kmer = readKmers[i]
            linker2Ed = distance.hamming(kmer,self.linker2)
            if linker2Ed < minLinker2Ed:
                minLinker2Ed = linker2Ed
                linker2Index = i
                break
        return linker1Index, linker2Index
Example #3
def findSameHash(simhash):
    # check the first hash table, keyed on hash bits 0-15
    num = int(simhash[:16], 2)
    if len(hash1[num]) == 0:
        hash1[num].append(simhash)
    else:
        for i in range(len(hash1[num])):
            if distance.hamming(simhash, hash1[num][i]) < 3:
                return True
    # check the second hash table, keyed on hash bits 16-31
    num2 = int(simhash[16:32], 2)
    if len(hash2[num2]) == 0:
        hash2[num2].append(simhash)
    else:
        for i in range(len(hash2[num2])):
            if distance.hamming(simhash, hash2[num2][i]) < 3:
                return True
    # check the third hash table, keyed on hash bits 32-47
    num3 = int(simhash[32:48], 2)
    if len(hash3[num3]) == 0:
        hash3[num3].append(simhash)
    else:
        for i in range(len(hash3[num3])):
            if distance.hamming(simhash, hash3[num3][i]) < 3:
                return True
    # check the fourth hash table, keyed on hash bits 48-63
    num4 = int(simhash[48:64], 2)
    if len(hash4[num4]) == 0:
        hash4[num4].append(simhash)
    else:
        for i in range(len(hash4[num4])):
            if distance.hamming(simhash, hash4[num4][i]) < 3:
                return True
    return False
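hash1 through hash4 are module-level tables; a minimal sketch of the setup this function appears to assume, with one bucket list per possible 16-bit slice value:

hash1 = [[] for _ in range(1 << 16)]
hash2 = [[] for _ in range(1 << 16)]
hash3 = [[] for _ in range(1 << 16)]
hash4 = [[] for _ in range(1 << 16)]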
Example #4
def EdgeComp(e1, e2):
	u, v = e1
	x, y = e2
	leveluv = distance.hamming(profiles[u], profiles[v])
	levelxy = distance.hamming(profiles[x], profiles[y])

	if leveluv != levelxy:
		return leveluv - levelxy

	for k in range(maxlen):
		maxuv = max(lvs[u][k], lvs[v][k])
		maxxy = max(lvs[x][k], lvs[y][k])

		if maxuv != maxxy:
			return maxxy - maxuv

		minuv = min(lvs[u][k], lvs[v][k])
		minxy = min(lvs[x][k], lvs[y][k])

		if minuv != minxy:
			return minxy - minuv

		maxuv = max(u, v)
		maxxy = max(x, y)

		if maxuv != maxxy:
			return maxxy - maxuv

		minuv = min(u, v)
		minxy = min(x, y)

		if minuv != minxy:
			return minxy - minuv
Example #6
def filter_contigs(contigs, assembly_min_uniq=0.01):
    """
    given a list of contigs, removes similar contigs to leave the highest (of the similar) scoring contig only
    """
    filtered_contigs = {}
    # ordering: highest scoring, then longest, then alphanumeric
    for contig in sorted(contigs, key=lambda x: (-1 * x.score, -1 * len(x.seq), x.seq)):
        rseq = reverse_complement(contig.seq)
        if contig.seq in filtered_contigs or rseq in filtered_contigs:
            continue
        drop = False
        # drop all contigs that are more than 'x' percent similar to existing contigs
        for other_seq in filtered_contigs:
            kmer_length = min(len(other_seq), len(contig.seq))
            okmer_list = set(kmers(other_seq, kmer_length))

            for okmer, ckmer in itertools.product(okmer_list, set(kmers(contig.seq, kmer_length))):
                if distance.hamming(okmer, ckmer, normalized=True) < assembly_min_uniq:
                    drop = True
                    break
            if not drop:
                for okmer, ckmer in itertools.product(okmer_list, set(kmers(rseq, kmer_length))):
                    if distance.hamming(okmer, ckmer, normalized=True) < assembly_min_uniq:
                        drop = True
                        break
            if drop:
                break

        if not drop:
            filtered_contigs[contig.seq] = contig

    return list(filtered_contigs.values())
Example #7
def get_best_re(x, target_re):
    x_count = sorted(dict(Counter(x)).items(), key=itemgetter(1), reverse=True)
    x_count_max = [(re,a) for re,a in x_count if a == x_count[0][1]]
    if len(x_count_max) == 1:
        return((x_count_max[0][0],x_count_max[0][1],distance.hamming(x_count_max[0][0], target_re)))
    else:
        x_ham = [(re,a,distance.hamming(re, target_re)) for re,a in x_count_max]
        return(max(x_ham, key=itemgetter(2)))
Example #8
 def blocksAreCorrect(self,block1,block2):
     if (len(block2) < 4):
         return False
     block1Ed = distance.hamming(block1,"ACG")
     block2Ed = distance.hamming(block2,"GACT")
     if (block1Ed > 1) or (block2Ed > 1):
         return False
     else:
         return True
Example #9
def groupbyHamm(seqrlist, Hamm_limit):
    """Takes a list of seq records
    Makes a copy of the list

    Calculates the Hamming distance between all pairs of sequences
    including reverse_complements and groups them together into a list
    if their Hamming distances from each other are <= Hamm_limit

    If a group is found, those sequences are removed from
    subsequent consideration by adding them to a passlist

    Outputs list_of_groups, which is a list of lists containing seq groups"""

    seqrcopy = list(seqrlist)
    list_of_groups = []
    rcidlist = []
    passlist = []
    for record1 in seqrcopy:
        if record1.id not in passlist:
            # print "%s not in passlist... creating new seqgroup and beginning search..." % record1.id
            seqgroup = []
            seqgroup.append(record1)
            passlist.append(record1.id)
            fwrecord1 = record1.seq
            rvrecord1 = record1.reverse_complement()

            for record2 in seqrcopy:

                if record2.id not in passlist:
                    # print "%s not in passlist... testing..." % record2.id
                    fwrecord2 = record2.seq
                    # print ("%s/%s --> %s"
                    # % (record1.id, record2.id,
                    # min(distance.hamming(fwrecord1,fwrecord2),
                    # distance.hamming(rvrecord1,fwrecord2)))
                    # )

                    dH = distance.hamming(fwrecord1,fwrecord2)
                    dHr = distance.hamming(rvrecord1,fwrecord2)

                    # print dH
                    # print dHr

                    if dH <= Hamm_limit:
                        # print "Pass! Adding %s to seqgroup and passlist" % record2.id
                        seqgroup.append(record2)
                        passlist.append(record2.id)
                    elif dHr <= Hamm_limit:
                        seqgroup.append(record2)
                        passlist.append(record2.id)
                        rcidlist.append(record2.id)

            list_of_groups.append(seqgroup)

    return list_of_groups,rcidlist
Example #10
def pack(lines):
    s = ['', '', '', '']
    ref = 'A' * readlen  # ref is the running reference; matching each read against the previous read roughly doubles the noise, so a consensus reference is kept instead
    prev = 'A' * readlen
    count = [[1] * readlen, [0] * readlen, [0] * readlen, [0] * readlen,
             [0] * readlen
             ]  #number of A's,C's,T's,G's and N's seen at each position in ref
    #Note: N is never considered in the ref - we arbitrarily place an A if only N's are seen at some position
    for current in lines:
        flag = 0
        for i in range(maxmatch):
            if (hamming(current[:(readlen - i)], ref[i:]) <= thresh):
                if (hamming(current[:(readlen - i)], ref[i:]) <= hamming(
                        current[:(readlen - i)], prev[i:])):
                    s[1] += 'r'
                    s[0] += (current[(readlen - i):] + '\n')
                    prevj = 0
                    for j in range(readlen - i):
                        count[char2index(current[j])][i + j] += 1
                        if current[j] != ref[i + j]:
                            s[2] += (current[j])
                            s[3] += ("%02d" % (j - prevj))  #delta encoding
                            prevj = j
                else:
                    s[1] += ('p')
                    s[0] += (current[(readlen - i):] + '\n')
                    prevj = 0
                    for j in range(readlen - i):
                        count[char2index(current[j])][i + j] += 1
                        if current[j] != prev[i + j]:
                            s[2] += (current[j])
                            s[3] += ("%02d" % (j - prevj))  #delta encoding
                            prevj = j

                count = [count[j][i:] + [0] * i for j in range(5)]
                for j in range(readlen - i, readlen):
                    count[char2index(current[j])][j] = 1

                ref = findmajority(count)
                #ref = current#ref[i:]+current[readlen-i:]
                s[2] += ('\n')
                flag = 1
                break

        if flag == 0:
            s[1] += ('0')
            s[0] += (current + '\n')
            count = [[0] * readlen for j in range(5)]
            for j in range(readlen):
                count[char2index(current[j])][j] = 1
            ref = findmajority(count)
        prev = current
    return s
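pack() leans on several module-level helpers; a hedged sketch of what char2index and findmajority might look like (readlen, maxmatch, and thresh are assumed integer globals, and these bodies are guesses from the call sites above):

def char2index(c):
    # map a base to its row in the count matrix
    return {'A': 0, 'C': 1, 'T': 2, 'G': 3, 'N': 4}[c]

def findmajority(count):
    # majority base per position; N (row 4) is never placed in the reference,
    # and ties fall back to the first maximum, which is A
    return ''.join('ACTG'[max(range(4), key=lambda b: col[b])]
                   for col in zip(*count[:4]))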
Example #11
    def calculate_intra_hamming_distance_between_elements(
            self, files, length=391716):
        total = 0
        count = 0
        highest = 0
        highest_pct = 0
        lowest = length
        lowest_pct = 101
        distances = {}
        if isinstance(files, dict):
            print('i')
            for a1, b1 in itertools.combinations(files, 2):
                a = files[a1]
                b = files[b1]
                if len(a) != length and len(b) != length:
                    continue
                dis = distance.hamming(a, b)
                pct = (dis / length) * 100
                total += pct
                count += 1
                name = a1 + "-" + b1
                distances[name] = pct
                if highest < dis:
                    highest = dis
                    highest_pct = pct
                if lowest > dis:
                    lowest = dis
                    lowest_pct = pct
        else:
            print('j')
            for a, b in itertools.combinations(files, 2):

                if len(a) != length and len(b) != length:
                    continue
                dis = distance.hamming(a, b)
                pct = (dis / length) * 100

                total += pct
                count += 1
                if highest < dis:
                    highest = dis
                    highest_pct = pct
                if lowest > dis:
                    lowest = dis
                    lowest_pct = pct
                    # print(str(dis) + ", " + str(pct) + "%")
        average = total / count if count > 0 else 0
        return [
            average, highest, lowest, highest_pct,
            lowest_pct if lowest_pct != 101 else 0, distances
        ]
Example #12
    def findOtherBlocks(self):
        print "Finding other blocks ..."
        for i in range(len(self.linkerIndices)):
            indexPair = self.linkerIndices[i]
            if not self.isInvalidIndexPair(indexPair):
                linker1Index = indexPair[0]
                linker2Index = indexPair[1]
                sequenceName = self.sequenceNames[i]
                sequence = self.sequences[i]
                placeHolder = self.sequencePlaceHolders[i]
                qualityScore = self.qualityScores[i]
                R2sequenceName = self.R2sequenceNames[i]
                R2sequence = self.R2sequences[i]
                R2placeHolder = self.R2sequencePlaceHolders[i]
                R2qualityScore = self.R2qualityScores[i]

                barCode1 = sequence[linker1Index-6:linker1Index]
                barCode2 = sequence[linker1Index+15:linker2Index]
                barCode3 = sequence[linker2Index+15:linker2Index+15+6]
                acgBlock = sequence[linker2Index+15+6:linker2Index+15+6+3]
                umiBlock = sequence[linker2Index+15+6+3:linker2Index+15+6+3+8]
                polyTBlock = sequence[linker2Index+15+6+3+8:linker2Index+15+6+3+8+4]

                barCode1QS = qualityScore[linker1Index-6:linker1Index]
                barCode2QS = qualityScore[linker1Index+15:linker2Index]
                barCode3QS = qualityScore[linker2Index+15:linker2Index+15+6]
                umiBlockQS = qualityScore[linker2Index+15+6+3:linker2Index+15+6+3+8]
                if self.barCodesAreCorrect(barCode1,barCode2,barCode3):
                    for possibleBarCode in self.possibleBarCodes:
                        bc1Ed = distance.hamming(barCode1,possibleBarCode)
                        bc2Ed = distance.hamming(barCode2,possibleBarCode)
                        bc3Ed = distance.hamming(barCode3,possibleBarCode)
                        if bc1Ed == 1:
                            barCode1 = possibleBarCode
                            #self.numCorrectedBarCodes = self.numCorrectedBarCodes + 1
                        if bc2Ed == 1:
                            barCode2 = possibleBarCode
                            #self.numCorrectedBarCodes = self.numCorrectedBarCodes + 1
                        if bc3Ed == 1:
                            barCode3 = possibleBarCode
                            #self.numCorrectedBarCodes = self.numCorrectedBarCodes + 1
                if self.blocksAreCorrect(acgBlock,polyTBlock):
                    self.blockSequenceNames.append(sequenceName)
                    self.blocks.append([barCode1,barCode2,barCode3,acgBlock,umiBlock,polyTBlock])
                    self.blockPlaceHolders.append(placeHolder)
                    self.blockQualityScores.append([barCode1QS,barCode2QS,barCode3QS,umiBlockQS])
                    self.read2Lines.extend([R2sequenceName,R2sequence,R2placeHolder,R2qualityScore])
                else:
                    self.filterIndices.append(i)
Example #13
def find_sim():
    # image_filenames = [os.path.join(userpath, path) for path in os.listdir(userpath) if is_image(path)]
    # images={}
    f = open(sys.argv[2])
    shopitem_dic = {}
    list = []
    type_item_set = set()
    for line in f:
        dv = line.split()
        type = dv[0]
        shopid = dv[1]
        itemid = dv[2]
        shopitem_dic[itemid] = shopid
        if type == '':
            type_item_set.add(itemid)
    for line in open(sys.argv[3]):
        # hash = hashfunc(Image.open(img))
        # hash = test_dhash(Image.open(img))
        img, hash = line.split()
        if 'home' in img:
            itemid = img.split('/')[-1].split('_')[0]
        else:
            itemid = img.split('_')[0]
        if itemid in type_item_set:
            list.append((img, shopitem_dic.get(itemid), str(hash)))
    num = len(list)
    print(num)
    for i in range(num):
        item1 = list[i]
        for j in range(num - i):
            item2 = list[i + j]
            if item1[1] == item2[1]: continue
            sim = distance.hamming(item1[2], item2[2])
            if sim < 4:
                print("-".join(item1), "-".join(item2), sim)
Example #14
 def calculate_hamming_distance(self, files, length=1048576):
     total = 0
     count = 0
     highest = 0
     highest_pct = 0
     lowest = length
     lowest_pct = 100
     keys = list(files.keys())
     a = files[keys[0]]
     high = 0
     name = ""
     print(keys[0])
     for c, d in itertools.combinations(files, 2):
     # for c in keys[1:]:
     #     b = files[c]
         a = files[c]
         b = files[d]
         if len(a) != length and len(b) != length:
             continue
         dis = distance.hamming(a, b)
         pct = (dis / length) * 100
         total += pct
         count += 1
         if highest < dis:
             highest = dis
             highest_pct = pct
             high = count
             # name = c
         if lowest > dis:
             lowest = dis
             lowest_pct = pct
         print(str(c) + "-" + str(d) + " : " + str(dis) + ", " + str(pct) + "%")
     return [total/count, highest, lowest, highest_pct,lowest_pct, high, name]
Example #15
 def get_best_match(self, photo_path):
     """Try to match the photo with the nearest image in the database"""
     #split = cv2.split(warp)
     s = time.time()
     photo = cv2.imread(photo_path)
     # Get the 90° cropped view of the card
     framed = self.get_framed_card(photo)
     photo_array = Image.fromarray(framed)
     # Detect language of the card: https://stackoverflow.com/questions/37235932/python-langdetect-choose-between-one-language-or-the-other-only
     # text = pytesseract.image_to_string(photo_array)
     # try:
         # lang = detect(text)
     # except:
         # lang = "NF"
     phash_photo = str(ih.phash(photo_array))
     # Get 20 best results from phash matching
     distances = sorted([(distance.hamming(phash_photo, i[0]), i[2]) for i in self.phashes], key=itemgetter(0))[:20]
     # Extract descriptor from framed card
     kp1, des_photo = self.orb.detectAndCompute(cv2.cvtColor(framed, cv2.COLOR_BGR2GRAY), None)
     # Get descriptors for the 20 best phash matches
     cards_list = self.database.get_descriptors([p[1] for p in distances])
     # Get the card with the minimum hamming distance between descriptors
     sc, cpath, best_match = min([(self.orb_score(des_photo, des_im), cpath, (name, set_code, scryfall_id)) for des_im, cpath, name, set_code, scryfall_id in cards_list], key = itemgetter(0))
     # scores = []
     # for phash, im_path in distances:
         # scores.append((self.score_images(framed, im_path), im_path))
     # sc, best_match = min(scores, key = itemgetter(0))
     e = time.time()
     logging.info("===> Result : orb_score = {0} / time = {1}s / lang = {2} / best match = {3}".format(sc, e - s, "None", os.path.basename(cpath)))
     print("===> Result : orb_score = {0} / time = {1}s / best match = {2}".format(sc, e - s, os.path.basename(cpath)))
     return best_match 
Example #16
def mydist(args):
    query, mydict, thresh = args[:]
    mydistance = [(idx, distance.hamming(query, mydict[idx]))
                  for idx in range(len(mydict))]
    mydistance.sort(key=lambda x: x[1])
    return mydistance[0] if mydistance[0][1] != mydistance[1][1] else (
        -1, mydistance[0][1])
Example #17
 def get_similar_photos(user, album):
     with getcursor() as cur:
         cur.execute(
             "SELECT phash, id FROM photos WHERE owner = %s AND album = %s AND phash IS NOT NULL ORDER BY phash",
             (
                 user.id,
                 album,
             ))
         rows = cur.fetchall()
     # TODO clusters should be cached based on rows as a key ie. if the photos haven't changed, the clustering won't have changed
     clusters = []
     for phash, id in (rows):
         found = False
         for cluster in clusters:
             for photo in cluster:
                 if distance.hamming(photo['phash'], phash) <= 8:
                     # add photo to an existing cluster
                     cluster.append({"phash": phash, "id": id})
                     # stop looking once first cluster is found
                     found = True
                     break
             if found:
                 break
         if not found:
             # start a new cluster
             clusters.append([{"phash": phash, "id": id}])
     return list(filter(lambda x: len(x) > 1, clusters))
Example #18
def find_ss(s):
    count = 0
    len_ss = len(ss)
    for i in range(len(s) - len_ss + 1):  # include the final window
        if distance.hamming(s[i:i+len_ss], ss) <= 3:
            count += 1
    print(count)
Example #19
    def _repost_checker_proc(self, to_be_checked, records, hashsize, hd):

        results = [{
            'image_id': to_be_checked['image_id'],
            'older_images': []
        }]

        found_repost = None

        hash_size = 'hash' + hashsize

        for r in records:

            if to_be_checked['image_id'] == r['image_id'] or r['user'] == to_be_checked['user']:
                continue

            try:
                hamming_distance = hamming(to_be_checked[hash_size], r[hash_size])
            except ValueError:
                continue

            if hamming_distance < hd:
                found_repost = True
                results[0]['older_images'].append(r)

        return results if found_repost else None
Example #20
File: adam2.py Project: gaeun/adam2
    def gen_hamming_matrices(self):
        """
        Generate Hamming distance and similarity matrices for all sequences
        """
        # For all target sequences
        for target in tqdm(self.target_list,
                           desc="Generating Hamming distance matrices"):

            # Sort the list of k-mers unique to the target and all observed kmers in the set alphabetically
            unique_kmer_list = sorted(list(target.unique_kmers))
            other_kmer_list = sorted(
                list(self.all_kmers.difference(target.unique_kmers)))

            # Get the number of unique k-mers in current target sequence and all observed kmers
            target_unique = len(unique_kmer_list)
            other_kmers = len(other_kmer_list)

            # Generate a matrix of appropriate size
            ham_array = np.zeros((target_unique, other_kmers))

            # Loop over both lists and fill in the Hamming distance matrix
            for i in range(target_unique):
                for j in range(other_kmers):
                    ham_array[i, j] = hamming(unique_kmer_list[i],
                                              other_kmer_list[j])

            # Assign the lists as instance variables to the target sequences in the list
            target.unique_kmer_list = unique_kmer_list
            target.other_kmer_list = other_kmer_list

            # Assign the Hamming distance and similarity score matrices to the target sequences in the list
            target.hamming_dist_matrix = ham_array
            target.similarity_matrix = np.full_like(ham_array,
                                                    self.k) - ham_array
Example #21
def find_closest(output, input_patterns):
    distances = []
    for pattern in input_patterns:
        ham_dist = distance.hamming(output, pattern)
        distances.append(ham_dist)
    val, idx = min((val, idx) for (idx, val) in enumerate(distances))
    return idx
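A hypothetical usage: the function returns the index of the input pattern nearest to the output bit string.

patterns = ["0001", "0111", "1100"]
print(find_closest("0110", patterns))  # prints 1: "0111" differs in only one position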
Example #22
    def find_highest(self, input_offer, bucket_num):
        """
        must call this before updating the round number
        :param bucket_num: the number of utility_buckets we are willing to look in; the best option from the highest_utility bucket is chosen
        :return: an offer that works best for us with the smallest hamming distance from the given offer
        """
        if bucket_num > len(self.utility_buckets) - 1:
            # ensure validity of utility_bucket
            bucket_num = len(self.utility_buckets) - 1
        if bucket_num < 1:
            # self.num_buckets = 1
            bucket_num = 1

        best_off = []
        hamming_best = float("inf")
        i = 0
        for key in self.utility_buckets:
            if i == bucket_num:
                break
            for offer in self.map_util_to_list[key]:
                #tempdist = distance.hamming(offer, self.received_offers[self.cur_round])
                tempdist = distance.hamming(offer, input_offer)
                if tempdist < hamming_best and offer != input_offer:
                    hamming_best = tempdist
                    best_off = offer
                    #print("New best offer to counter ",self.received_offers[self.cur_round]," is ",bestOff, "with distance",hamming_best,"and utility",key)
                    #print("New best offer to counter ",input_offer," is ",bestOff, "with distance",hamming_best,"and utility",key)
            i += 1
        return best_off
Example #23
 def calculate_hamming_distance_list(self, files, length=1048576):
     total = 0
     count = 0
     highest = 0
     highest_pct = 0
     lowest = length
     lowest_pct = 100
     a = files[0]
     high = 0
     name = ""
     for a, b in itertools.combinations(files, 2):
     # for c in keys[1:]:
     #     b = files[c]
         if len(a) != length and len(b) != length:
             continue
         dis = distance.hamming(a, b)
         pct = (dis / length) * 100
         total += pct
         count += 1
         if highest < dis:
             highest = dis
             highest_pct = pct
             high = count
         if lowest > dis:
             lowest = dis
             lowest_pct = pct
         print(str(dis) + ", " + str(pct) + "%")
     return [total/count, highest, lowest, highest_pct,lowest_pct, high]
Example #24
def find_putative_CS(start, end, kmer, genome, leader_core_sequence):
    search_seq = genome[start:end]
    min_dist = kmer
    res_seq = ''
    res_index = 0
    find = 0
    gaac_list = []
    for i in range(0, len(search_seq) - kmer + 1):
        query_seq = search_seq[i:i + kmer]
        index = start + i + 1
        dist = distance.hamming(leader_core_sequence, query_seq)
        downflnk = search_seq[i + kmer:i + kmer + 3]  # the three bases immediately downstream
        if leader_core_sequence[-4:] == query_seq[-4:] and downflnk == "AAA":
            find = 1
            return query_seq + "(AAA)", dist, index, find
        if downflnk == "AAA":
            find = 1
            gaac_list.append([query_seq + "(AAA)", dist, index, find])
        if dist <= min_dist or leader_core_sequence[-4:] == query_seq[-4:]:
            min_dist = dist
            res_seq = query_seq
            res_index = index
    if len(gaac_list) != 0:
        return sorted(gaac_list, key=lambda x: x[1])[0]
    else:
        if min_dist <= 2:
            find = 1
        return res_seq, min_dist, res_index, find
Example #25
def detect_images(signatures):
    print(
        '\n[+] Detecting images that meet similarity threshold of signatures ('
        + str(parsed_args.min_similarity_threshold) + '%)...')
    images = []
    for i in os.listdir(dir_image):
        images.append(i)

    with open(csv_file, mode='w') as csv_out:
        csv_writer = csv.writer(csv_out,
                                delimiter=',',
                                quotechar='"',
                                quoting=csv.QUOTE_MINIMAL)
        csv_writer.writerow([
            'Document_SHA256', 'Average_Hash', 'Signature_Name', 'Similarity'
        ])

        for i in images:
            image_path = os.path.join(dir_image, i)
            image_hash = str(imagehash.average_hash(Image.open(image_path)))

            for sig_hash, sig_name in signatures.items():
                hamming_distance = distance.hamming(image_hash, sig_hash)
                image_similarity = 100 - ((hamming_distance / 16) * 100)

                if image_similarity >= parsed_args.min_similarity_threshold:
                    csv_writer.writerow(
                        [i, image_hash, sig_name, image_similarity])
                    print('[+] Document ' + i + ' matched ' + sig_name + ' (' +
                          str('%.0f' % image_similarity) + '% similarity).')

    print('[+] Saved results to ' + csv_file + '.')
    return True
Example #26
def generate_matrix(db, str_rel):
	dic = {}
	for ind,item in enumerate(db.index):
		print(str_rel, ind + 1, len(db.index))
		secdic = {}
		for secitem in db.index:
			if secitem in dic:
				secdic[secitem] = dic[secitem][item]
				continue
			a = db.loc[item].tolist()
			b = db.loc[secitem].tolist()
			secdic[secitem] = hamming(a,b,normalized=True)

		dic[item] = secdic


	pd_pre = []

	for key in sorted(dic.keys()):
		newlst = [dic[key][i] for i in sorted(dic.keys())]
		pd_pre.append(newlst)

		

	new_pd = pd.DataFrame(pd_pre,columns=sorted(dic.keys()),index=sorted(dic.keys()))
	new_pd.to_csv('/Users/virpatel/Desktop/pub_stuff/relevant_data/%s_dataframe.txt' %(str_rel), sep='\t', encoding='utf-8')

	return dic
Example #27
def generate_matrix(db, str_rel):
	dic = {}
	for ind,item in enumerate(db.keys()):
		print(str_rel, ind + 1, len(db.keys()))
		secdic = {}
		for secitem in db.keys():
			if secitem in dic:
				secdic[secitem] = dic[secitem][item]
				continue
			a = db[item]
			b = db[secitem]
			secdic[secitem] = hamming(a,b,normalized=True)

		dic[item] = secdic


	pd_pre = []

	for key in sorted(dic.keys()):
		newlst = [dic[key][i] for i in sorted(dic.keys())]
		pd_pre.append(newlst)

		

	new_pd = pd.DataFrame(pd_pre,columns=sorted(dic.keys()),index=sorted(dic.keys()))
	print(new_pd)
	new_pd.to_csv('../relevant_data/%s_dataframe.txt' %(str_rel), sep='\t', encoding='utf-8')

	return new_pd
Example #28
def find_sim(userpath, hashfunc=imagehash.average_hash):
    image_filenames = [
        os.path.join(userpath, path) for path in os.listdir(userpath)
        if is_image(path)
    ]
    images = {}
    f = open(sys.argv[3])
    shopitem_dic = {}
    list = []
    for line in f:
        dv = line.split()
        shopid = dv[1]
        itemid = dv[2]
        shopitem_dic[itemid] = shopid
    for img in sorted(image_filenames):
        # hash = hashfunc(Image.open(img))
        hash = test_dhash(Image.open(img))
        itemid = img.split('/')[-1].split('_')[0]
        list.append((img.split('/')[-1], shopitem_dic.get(itemid), str(hash)))
        # itemid=img.split('/')[-1].split('_')[0]
        # print itemid
        # images[hash] = images.get(hash, []) + [img]
    num = len(list)
    for i in range(num):
        item1 = list[i]
        for j in range(num - i):
            item2 = list[i + j]
            if item1[1] == item2[1]: continue
            sim = distance.hamming(item1[2], item2[2])
            if sim < 4:
                print("-".join(item1), "-".join(item2), sim)
Example #29
def just_sim(userpath):
    images = {}
    f = open(sys.argv[3])
    shopitem_dic = {}
    list = []
    for line in f:
        dv = line.split()
        shopid = dv[1]
        itemid = dv[2]
        shopitem_dic[itemid] = shopid
    fr = open(userpath)
    for img in fr:
        # hash = hashfunc(Image.open(img))
        # hash = test_dhash(Image.open(img))
        item, hash = img.split()
        itemid = item.split('_')[0]
        list.append((item, shopitem_dic.get(itemid), str(hash)))
        # itemid=img.split('/')[-1].split('_')[0]
        # print itemid
        # images[hash] = images.get(hash, []) + [img]
    num = len(list)
    for i in range(num):
        item1 = list[i]
        for j in range(num - i):
            item2 = list[i + j]
            if item1[1] == item2[1]: continue
            sim = distance.hamming(item1[2], item2[2])
            if sim < 4:
                print("-".join(item1), "-".join(item2), sim)
Example #30
def call_CDR3_end(VDJ_seq, CDR3_start):
    '''Find the end of the CDR3'''
    try:
        from distance import hamming
    except ImportError:
        from util import hamming

    CDR3_end_anchor_sequence = 'CTGGGG'
    minimum_match_distance = 1
    CDR3_end = -1

    for i in range(CDR3_start,
                   len(VDJ_seq) - len(CDR3_end_anchor_sequence) + 1):
        seq1 = VDJ_seq[i:i + len(CDR3_end_anchor_sequence)]
        seq2 = CDR3_end_anchor_sequence
        if len(seq1) == len(seq2):
            d = hamming(seq1, seq2)
        else:
            print('expected two strings of the same length for hamming')
            d = 10
        if d <= minimum_match_distance:
            CDR3_end = i + 1
            minimum_match_distance = d

    return CDR3_end
Example #31
    def _final_meme_filter(self, searched_hash: Text,
                           matches: List[ImageSearchMatch],
                           target_hamming) -> List[ImageSearchMatch]:
        results = []
        log.debug('MEME FILTER - Filtering %s matches', len(matches))
        if len(matches) == 0:
            return matches

        for match in matches:
            try:
                match_hash = self._get_meme_hash(match.post.url)
            except Exception as e:
                log.error('Failed to get meme hash for %s', match.post.id)
                continue

            h_distance = hamming(searched_hash, match_hash)

            if h_distance > target_hamming:
                log.info(
                    'Meme Hamming Filter Reject - Target: %s Actual: %s - %s',
                    target_hamming, h_distance,
                    f'https://redd.it/{match.post.post_id}')
                continue
            log.debug('Match found: %s - H:%s',
                      f'https://redd.it/{match.post.post_id}', h_distance)
            match.hamming_distance = h_distance
            match.hash_size = len(searched_hash)
            results.append(match)

        return results
Example #32
 def _get_adj_list_directional_adjacency(self,
                                         umis,
                                         counts,
                                         threshold,
                                         use_hamming=False,
                                         countRatio=1.5):
     ''' identify all umis within the hamming distance threshold
     and where the count of the first umi is > (1.5 * second umi count) - 1'''
     if use_hamming:
         return {
             umi: [
                 umi2 for umi2 in umis if hamming(umi, umi2) <= threshold
                 and counts[umi] >= (counts[umi2] * countRatio) - 1
             ]
             for umi in umis
         }
     else:
         return {
             umi: [
                 umi2 for umi2 in umis
                 if edit_distance(umi, umi2) <= threshold and counts[umi] >=
                 (counts[umi2] * countRatio) - 1
             ]
             for umi in umis
         }
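A guess at the imports the method above relies on; the distance package exposes both functions, though edit_distance may be bound differently in the original project:

from distance import hamming
from distance import levenshtein as edit_distance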
Example #33
def buildcontig(reads):
    if (len(reads) == 1):  #singleton read
        return [reads[0], [0]]
    count = [[0, 0, 0, 0, 0] for i in range(readlen)
             ]  #number of A's,C's,T's,G's,N's seen at each position in ref
    pos = [0]
    for i in range(readlen):
        count[i][char2index[reads[0][i]]] = 1
    prevread = reads[0]
    for currentread in reads[1:]:
        flag = 0
        bestmatch = readlen
        besti = 0
        for i in range(maxmatch):
            hammingdist = hamming(currentread[:(readlen - i)], prevread[i:])
            if (hammingdist <= thresh):
                pos.append(i + pos[-1])
                count = count + [[0, 0, 0, 0, 0] for j in range(i)]
                for j in range(readlen):
                    count[pos[-1] + j][char2index[currentread[j]]] += 1
                flag = 1
                break
            if (hammingdist < bestmatch):
                bestmatch = hammingdist
                bestmatchpos = i
        if flag == 0:  # no match found, which might happen because of matchsort9's
            # handling of N's (if only N's are seen at a position, matchsort9 makes it A in the ref)
            pos.append(bestmatchpos + pos[-1])
            count = count + [[0, 0, 0, 0, 0] for j in range(bestmatchpos)]
            for j in range(readlen):
                count[pos[-1] + j][char2index[currentread[j]]] += 1
        prevread = currentread
    ref = findmajority(count)
    return [ref, pos]
Example #34
 def correctBarcodes(self):
     print "Correcting barcodes ..."
     for i in range(len(self.blocks)):
         bc1 = self.blocks[i][0]
         bc2 = self.blocks[i][1]
         bc3 = self.blocks[i][2]
         for possibleBarCode in self.possibleBarCodes:
             bc1Ed = distance.hamming(bc1,possibleBarCode)
             bc2Ed = distance.hamming(bc2,possibleBarCode)
             bc3Ed = distance.hamming(bc3,possibleBarCode)
             if bc1Ed == 1:
                 self.blocks[i][0] = possibleBarCode
             if bc2Ed == 1:
                 self.blocks[i][1] = possibleBarCode
             if bc3Ed == 1:
                 self.blocks[i][2] = possibleBarCode
Example #35
def mapseq(seq, pri):
    for offset in range(len(seq) - len(pri) + 1):  # include the final window
        qseq = seq[offset:offset + len(pri)]
        if len(qseq) < len(pri): break
        if hamming(qseq, pri) <= 3:
            return offset
    return 0
Example #36
def get_distance(data, vec1):
    distance_list = []
    for i in range(1, len(data)+1):
        vec2 = data[i][0]
        distance_list.append((distance.hamming(vec1, vec2), i))
    distance_list.sort()
    return distance_list
Example #37
 def calculate_intra_hamming_distance(self, files, length=1048576):
     total = 0
     count = 0
     highest = 0
     highest_pct = 0
     lowest = length
     lowest_pct = 100
     keys = list(files.keys())
     a = files[keys[0]]
     distances = {}
     alphabet = 'B'
     for c in keys[1:]:
         b = files[c]
         if len(a) != length and len(b) != length:
             continue
         dis = distance.hamming(a, b)
         pct = (dis / length) * 100
         total += pct
         count += 1
         name = "A-" + alphabet
         distances[name] = pct
         if highest < dis:
             highest = dis
             highest_pct = pct
         if lowest > dis:
             lowest = dis
             lowest_pct = pct
         alphabet = chr(ord(alphabet) + 1)
     return [
         total / count, highest, lowest, highest_pct, lowest_pct, distances
     ]
Example #38
def dhash(image1, image2):
    i1 = Image.open(image1)
    i2 = Image.open(image2)

    h1 = _dhash(i1)
    h2 = _dhash(i2)

    return distance.hamming(h1, h2)
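A hypothetical usage, with illustrative file names and an illustrative threshold; _dhash is assumed to return equal-length hash strings for both images:

score = dhash("photo_a.jpg", "photo_b.jpg")
print("near-duplicates" if score < 6 else "different images")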
Example #39
def hamming_distance_calculator(one, two):
    t1 = anagramfunctions.stripped_string(one["tweet_text"])
    t2 = anagramfunctions.stripped_string(two["tweet_text"])

    comparitor.set_seqs(t1, t2)
    dist = 1.0 - float(distance.hamming(t1, t2)) / len(t1)
    if dist < 1:
        print(t1, t2, str(dist), str(comparitor.ratio()) + "\n\n", sep="\n")
Example #40
 def hamming(self,sig1,sig2):
     assert len(sig1) == len(sig2)
     if (sig1 == sig2):
         return 0
     elif sig1 in self._hamming and sig2 in self._hamming[sig1]:
         return self._hamming[sig1][sig2]
     else:
         self._hamming[sig1][sig2] = self._hamming[sig2][sig1] = (hamming(sig1,sig2)/self.length)
         return self._hamming[sig1][sig2]
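The memoized wrapper above indexes self._hamming by both signatures before assigning, so a nested default dictionary is the likely setup; a minimal sketch under that assumption (the class name is hypothetical):

from collections import defaultdict

class SignatureIndex:
    def __init__(self, length):
        self.length = length                 # signature length used for normalization
        self._hamming = defaultdict(dict)    # _hamming[sig1][sig2] -> normalized distance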
Example #41
def primeiracamada(matriz, palavra, raio):
    # read the word, compare it against the address matrix, and return the rows within the radius
    matrizretorno = []
    for linha in range(len(matriz)):
        dif = distance.hamming(matriz[linha], palavra)
        if dif <= raio:
            matrizretorno.append(matriz[linha])

    return matrizretorno
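A hypothetical usage: return the address-matrix rows within Hamming radius 1 of the probe word.

matriz = ["0000", "0001", "0111"]
print(primeiracamada(matriz, "0011", 1))  # prints ['0001', '0111']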
Example #42
def gdm(pop):
    s1 = 0
    for i in range(len(pop) - 1):
        s2 = 0
        for j in range(i + 1, len(pop)):
            s2 += distance.hamming(pop[i], pop[j], normalized=True)
        s1 += (s2/(len(pop) - i))
    measure = s1 / len(pop)
    return measure
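A hypothetical usage: gdm() averages the normalized pairwise Hamming distances, so an identical population scores 0 and a fully diverse one approaches 1.

pop = ["0000", "0011", "1111"]
print(gdm(pop))  # prints 0.25 for this population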
Example #43
def hammingDistance(motifTable, t):
	'''
	Calculate the hamming distance between one motif and others
	motifTable is a list contains t motifs
	The return value will be the score.
	'''
	motif = motifTable[0]
	minDistance = 0
	for i in range(t):
		minDistance += distance.hamming(motif, motifTable[i])
	for i in range(t):
		motif = motifTable[i]
		dis = 0
		for j in range(t):
			dis += distance.hamming(motif, motifTable[j])
		if dis < minDistance:
			minDistance = dis
	return minDistance
Example #44
def search(a, b, count, min_dist):
    min_list=[]
    l={}
    if distance.hamming(a, b) == 0:
        print(count)
        print(a)
        sys.exit()
        return
    count+=1
    for i in range(0, len(a)):
        for j in range(i, len(a)):
            if i==0 and j==0:
                a1=a[j:i+1:-1]+a[j:]
            elif i==0 and j>0:
                a1=a[j::-1]+a[i+j+1:]
            else:
                a1=a[:i]+a[j+1:i-1:-1]+a[j+2:]
            str1=""
            for q in a1:
                str1+=q
                str1+=" "
            #print str(j)+str(a1)
            l[str1]=distance.hamming(a1, b)
#            if distance.hamming(a1, b)<min_dist:
#                l[a1]=distance.hamming(a1, b)
#            if distance.hamming(a1, b)==0:
#                print count
#                print a1
#                sys.exit()
#                return

    l1 = sorted(l.items(), key=operator.itemgetter(1))
    for i in l1:
        if i[1]==l1[0][1] and i[1]<min_dist:
            min_list.append(i[0].split())
    for i in l1:
        if i[1]==l1[0][1]:
            min_dist=i[1]
    for i in min_list:
        o.write(str(i)+"\t"+str(count)+"\n")
        my_count=count
        my_dist=min_dist
        search(i, b, my_count, my_dist)
    return
Example #45
def find_similar_images(userpath = "localS3Images", hashfunc = imagehash.dhash, inputUrl = "https://s3.amazonaws.com/treblalee.images/watches7.jpg", inputFilePath = ""):
    import os
    global cache

    def is_image(filename):
        f = filename.lower()
        return f.endswith(".png") or f.endswith(".jpg") or \
            f.endswith(".jpeg") or f.endswith(".bmp") or f.endswith(".gif")

    # get image url to detail page mapping
    imageToDetailPageMapping = {}
    mappingFile = open("imageUrlToDetailPageMapping.txt")
    for line in mappingFile:
        fields = line.strip("\n").split(",")
        imageUrl = str(urllib.unquote(fields[0]).decode('utf8'))
        detailPageUrl = str(urllib.unquote(fields[1]).decode('utf8'))
        imageToDetailPageMapping[imageUrl] = detailPageUrl

    # compute hash of input image
    try:
        if len(inputFilePath) > 0:
            image_file = inputFilePath
            inputAsString = inputFilePath
        else:
            fd = urllib.urlopen(inputUrl)
            image_file = io.BytesIO(fd.read())
            inputAsString = inputUrl
        inputHash = str(hashfunc(Image.open(image_file)))
    except:
        traceback.print_exc(file=sys.stdout)
        return json.dumps({}, sort_keys=True, indent=4, separators=(',', ': '))
    
    # compute hashes of all images in DB (currently just a directory)
    image_filenames = [os.path.join(userpath, path) for path in os.listdir(userpath) if is_image(path)]
    simList = []
    for img in sorted(image_filenames):
        if img in cache:
            hash = cache[img]
        else:
            hash = str(hashfunc(Image.open(img)))
            cache[img] = hash
        dist = distance.hamming(inputHash, hash)
        print(inputHash + " " + hash + " " + str(dist))
        if 0 < dist < 6:
            imageUrl = img.replace('localS3Images/', 'https://s3.amazonaws.com/treblalee.images/')
            detailPageUrl = imageToDetailPageMapping[imageUrl]
            pair = {}
            pair["imageUrl"] = imageUrl
            pair["detailPageUrl"] = detailPageUrl
            simList.append(pair)

    result = {}
    result["input"] = inputAsString
    result["output"] = simList 
    #print result
    return json.dumps(result, sort_keys=True, indent=4, separators=(',', ': ')) 
Example #46
 def check_left_shift_conflicts(self):
     # checks if indexes from the same library conflict after a left shift
     for kit_type in self.indexes_by_kit:  # for each lib kit type
         for index_type in self.indexes_by_kit[kit_type]:  # for each type of indexes
             for index_name, index_seq in self.indexes_by_kit[kit_type][index_type].items():
                 fake_index = index_seq[1:] + "A"
                 for index_name_check, index_seq_check in self.indexes_by_kit[kit_type][index_type].items():
                     hamming_dist = distance.hamming(index_seq_check, fake_index)
                     if hamming_dist <= 2:
                         print("{} {} {} {} {}".format(index_seq, index_seq_check, fake_index, hamming_dist, kit_type))
def dis(kmers, ownKmers):
    hammingDistance = {}
    minDistance = 0
    for each in ownKmers:
        hammingDistance[each] = []
        for i in range(len(ownKmers[each])):
            hammingDistance[each].append(distance.hamming(kmers, ownKmers[each][i]))
    for each in hammingDistance:
        minDistance += min(hammingDistance[each])
    return minDistance
Example #48
def func(a, b):
    c = 0  # best (maximum) count seen so far
    for j in range(32, 41):
        for i in range(len(a) - j):
            c1 = 0
            for k in range(len(b) - j):
                if distance.hamming(a[i:i + j], b[k:k + j]) <= 3:
                    c1 += 1
                    if c1 > c:
                        c = c1
    return c
Example #49
    def get_distances(self):
        dm = [[0 for x in range(self.pl)] for x in range(self.n)]
        pm = self.pm

        for i in range(self.n):
            for j in range(i + 1, self.n):
                dm[i][j] = distance.hamming(pm[i], pm[j])

        self.dm = dm

        return self.dm
Example #50
 def force(p1, p2):
     """
     estimate force between p1 and p2
     p must be a Position
     """
     ## TODO: remake it later
     dist = hamming(p1.entity, p2.entity)
     dist = cannot_be_zero(dist)
     f_abs = G * (p1.mass * p2.mass) / dist
     force = {item: p * f_abs for item, p in (p1.entity - p2.entity).items()}
     return MappingParticle.Velocity(force)
Example #51
def family_homogenity_collapsed(human_mirlst, mirna2disease, mirna2age):

	family_avg_age = []
	family_avg_hamming = []
	family_percent_invoved_dis = []
	
	all_mir_vector_df = pd.DataFrame()

	dislst = get_list_of_dictionary(mirna2disease)

	all_fam_mir = list(itertools.chain.from_iterable(human_mirlst.values()))

	for mir in all_fam_mir:
		if mir in mirna2disease:
			vec = generate_class_vector(dislst, mirna2disease[mir])
			tmp = pd.DataFrame([vec,],index=[str(mir),], columns=dislst)
			all_mir_vector_df = all_mir_vector_df.append(tmp)


	for fam in human_mirlst:
		family_vector = []
		mirlst = [a for a in human_mirlst[fam] if a in mirna2disease]
		if len(mirlst) < 4: continue
		for mir in mirlst:
			for other_mir in mirlst:
				if mir == other_mir: continue
				family_vector.append(hamming(all_mir_vector_df.loc[mir], all_mir_vector_df.loc[other_mir],normalized=True))
		
		family_avg_hamming.append(mean(family_vector))
		family_avg_age.append(round(mean([float(mirna2age[mirna]) for mirna in mirlst if mirna in mirna2age]),1))
		family_percent_invoved_dis.append(float(len(mirlst)) / float(len(human_mirlst[fam])))



	print(spearmanr(family_percent_invoved_dis, family_avg_hamming))

	fam_df = pd.DataFrame(zip(family_avg_age,family_avg_hamming,family_percent_invoved_dis),columns=['fam_age','fam_hamming','fam_per'])


	fam_df = fam_df.sort_values('fam_age', ascending=True)

	f = plt.gcf()
	f.set_size_inches(20, 10)

	sns.boxplot(x='fam_age',y='fam_hamming',data=fam_df)

	plt.xticks(range(0,len(list(set(family_avg_age)))), [str(a) for a in sorted(list(set(family_avg_age)))])
	plt.gca().set_ylim([0,.094])
	plt.ylabel('Average Family Disease Vector Hamming Distance (0-1)', fontsize=15)
	plt.xlabel('Average Family Age',fontsize=15)
	plt.subplots_adjust(bottom=0.20)
	plt.savefig('figures/family_disease_hamming_collapsed.pdf',bbox_inches='tight')
	plt.close()
Example #52
 def calculate_changes_in_fitness(self,population,number_of_trials):
     original_fitnesses = ar(self.fitness_many(population))
     print(original_fitnesses.shape)
     sample = [self.sample_dA([i]) for i in population]
     # print sample.shape
     sample_fitnesses = ar(self.fitness_many([j for j in sample]))
     # return original_fitnesses,sample,sample_fitnesses
     print(sample_fitnesses.shape)
     print(sample_fitnesses[0:10])
     differences = sample_fitnesses - original_fitnesses
     distances = [[distance.hamming(population[k],sample[k]) for k in range(len(sample))]]
     # pdb.set_trace()
     for i in range(number_of_trials):
         print "trial:",i
         new_sample = [self.sample_dA([j]) for j in population]
         new_sample_fitnesses = ar(self.fitness_many([j for j in new_sample]))
         new_difference = new_sample_fitnesses - original_fitnesses
         sample_fitnesses = np.vstack((sample_fitnesses,new_sample_fitnesses))
         differences = np.vstack((differences,new_difference))
         distances.append([distance.hamming(population[k],sample[k]) for k in range(len(sample))])
     return sample_fitnesses,differences,distances
Example #53
 def create_graph(graph, word_list):
     """
     :param graph: graph of words with each word as parent and its children as words which differ from parent by one place
     :param word_list: list of all n-letter words sourced from dictionary
     :return: created graph of words and their one-letter spaced children
     """
     for word1 in word_list:
         neighbours_list = []
         for word2 in word_list:
             if distance.hamming(word1, word2) == 1:
                 if word2 not in neighbours_list:
                     neighbours_list.append(word2)
         graph[word1] = set(neighbours_list)
Example #54
 def get_statistics(self,population,sample,get_distances=False,original_fitnesses=False):
     if original_fitnesses:
         original_fitnesses = ar(self.fitness_many(population))
     sample_fitnesses = ar(self.fitness_many(sample))
     if original_fitnesses is False:
         original_fitnesses = sample_fitnesses
     if get_distances:
         differences = sample_fitnesses - original_fitnesses
         distances = [distance.hamming(population[k],sample[k]) for k in range(len(sample))]
     else:
         differences = []
         distances = []
     return original_fitnesses,sample_fitnesses,differences,distances
Example #55
def collapse_by_hamming(x, maxham):
    x_cllps = deque(sorted(Counter(x).items(), key=itemgetter(1),
                   reverse=True))
    while(len(x_cllps) > 0):
        seq, count = x_cllps.popleft()
        uniq = 1
        #x_cllps_save = deque()
        for aseq, acount in list(x_cllps):
            if distance.hamming(seq, aseq) <= maxham:
                count += acount
                uniq += 1
                x_cllps.remove((aseq, acount))
        yield ((seq, str(len(seq)), str(count), str(uniq)), len(x_cllps))
Example #56
def test(n):
    import time
    import distance
    from simhash import Simhash, SimhashIndex

    WIDTH = 3

    def gg():
        import random
        from random import randint
        from simhash import Simhash, SimhashIndex
        from itertools import groupby
        # text = str(bin(randint(2**63, 2**64-1)))[2:]
        # tokens = [text[i:i + WIDTH] for i in range(max(len(text) - WIDTH + 1, 1))]
        # return text, Simhash({k: sum(1 for _ in g) for k, g in groupby(sorted(tokens))})
        text = ''.join([random.choice('0123456789abcdef') for _ in range(36)])
        return text, Simhash(text)

    hashes = [gg() for _ in range(n)]
    d1, d2 = [], []
    test_string, test_hash = gg()

    start = time.time()
    for s, h in hashes:
        d1.append([distance.hamming(test_string, s), s])
    print(time.time() - start)

    start = time.time()
    index = SimhashIndex(hashes, k=5)
    for st in index.get_near_dups(test_hash):
        d2.append([distance.hamming(test_string, st), st])
    print(time.time() - start)

    print(len(d1), len(d2))

    for a, b in zip(sorted(d1)[:20], sorted(d2)):
        print(a[1] == b[1], '\t', a, '\t', b)
Example #57
def best_match(my_bc5, my_bc3, bc5_list, bc3_list, bc5_max_mismatch, bc3_max_mismatch):
  
  ## Find the edit distance between the given 3' barcode and all barcodes within the list
  edit_dist3 = []
  for i in bc3_list:
    edit_dist3.append(distance.hamming(my_bc3, i))
  edit_dist3_min = min(edit_dist3)
  
  ## Find the edit distance between the given 5' barcode and all barcodes within the list
  edit_dist5 = []
  for i in bc5_list:
    edit_dist5.append(distance.hamming(my_bc5, i))
  edit_dist5_min = min(edit_dist5)
  
  ## If the number of mismatches is lower than the given threshold AND only one barcode has the lowest
  ## number of mismatches, then the index is assigned
  if edit_dist3_min <= bc3_max_mismatch and edit_dist3.count(edit_dist3_min) == 1 and edit_dist5_min <= bc5_max_mismatch and edit_dist5.count(edit_dist5_min) == 1:
    tmp_bc3 = bc3_list[edit_dist3.index(edit_dist3_min)]
    tmp_bc5 = bc5_list[edit_dist5.index(edit_dist5_min)]
  else: 
    tmp_bc3 = ""
    tmp_bc5 = ""
  
  return (tmp_bc5, tmp_bc3)
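A hypothetical usage: both barcodes must sit within their mismatch thresholds and match a unique best candidate, otherwise empty strings come back.

bc5_list = ["ACGT", "TTAG"]
bc3_list = ["GGCA", "CATC"]
print(best_match("ACGA", "CATC", bc5_list, bc3_list, 1, 1))  # prints ('ACGT', 'CATC')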
Example #58
    def CalcLVs(self):
        maxlen = len(self.pm[0])
        nprof = len(self.pm)

        lvs = [[0 for x in range(maxlen)] for x in range(nprof)]

        for i in range(nprof):
            for j in range(i + 1, nprof):
                diff = distance.hamming(self.pm[i], self.pm[j])
                lvs[i][diff - 1] += 1
                lvs[j][diff - 1] += 1

        self.lvs = lvs

        return self.lvs
Example #59
def call_CDR3_start(VDJ_seq):

    CDR3_start_anchor_sequence = 'TATTACTGT'
    minimum_match_distance = 2
    CDR3_start = -1

    for i in range(0, len(VDJ_seq) - len(CDR3_start_anchor_sequence) - 1):
        
        d = hamming(VDJ_seq[i:i+len(CDR3_start_anchor_sequence)], CDR3_start_anchor_sequence)
       
        if d <= minimum_match_distance:
            CDR3_start = i + len(CDR3_start_anchor_sequence) + 1
            minimum_match_distance = d


    return CDR3_start
Example #60
def call_CDR3_end(VDJ_seq, CDR3_start):

    CDR3_end_anchor_sequence = 'CTGGGG'
    minimum_match_distance = 1
    CDR3_end = -1

    for i in range(CDR3_start, len(VDJ_seq) - len(CDR3_end_anchor_sequence) + 1):
        try:
            d = hamming(VDJ_seq[i:i+len(CDR3_end_anchor_sequence)], CDR3_end_anchor_sequence)
        except ValueError:
            print('expected two strings of the same length for hamming')
            d = 10
        if d <= minimum_match_distance:
            CDR3_end = i + 1
            minimum_match_distance = d

    return CDR3_end