# Assumes DEBUG and const (with const.BIT_DICT) are module-level globals.
from Bio.Seq import Seq
from bitarray import bitarray, bitdiff


def checkFirstCodonPos(refSequence, refSeqPosition, querySequence, queryLength,
                       mask_array, threshold, reverse):
    if DEBUG: print "Ref Pos   :", refSeqPosition
    #if DEBUG: print "Reference :", refSequence
    if DEBUG: print "Query1    :", querySequence
    if reverse:
        # Get the reverse complement
        querySequence = str(Seq(querySequence).reverse_complement())
        refSeqPosition = abs(refSeqPosition)
    # Convert to bitarray
    querySequenceBitarray = bitarray()
    querySequenceBitarray.encode(const.BIT_DICT, querySequence)
    bitStart = abs(refSeqPosition * 4)
    bitEnd = bitStart + queryLength * 4
    refBitArray = refSequence[0][refSeqPosition*4:(refSeqPosition+queryLength)*4]
    if DEBUG: print "Query2    :", querySequence
    if DEBUG: print "Ref Corres:", refSequence[1][refSeqPosition:refSeqPosition+queryLength]
    if DEBUG: print "Ref Bit   :", refBitArray
    if DEBUG: print "Query Bit :", querySequenceBitarray
    try:
        diffPercent = float(queryLength - bitdiff(refBitArray, querySequenceBitarray)/2)/queryLength
        if DEBUG: print "Diff Bit  :", diffPercent, bitdiff(refBitArray, querySequenceBitarray)/2
        return diffPercent >= threshold
    except Exception:
        return False
def rNN_OutLSH(TABLEs, HLLs, LSHs, query, data):
    L = len(LSHs)
    hashVALUEs = [0] * L

    #start = time.clock()
    for idxTable in range(L):
        hashVALUEs[idxTable] = hashComputation(LSHs[idxTable], query)
    #print("Hash Computation time: ", time.clock() - start, " in second")

    # Estimate output size
    #start = time.clock()
    candidateSizeEst, numCollision = outputSizeEstimate(HLLs, hashVALUEs)
    #print("Output Size Estimation Time: ", time.clock() - start, " in second")
    #print("Candidate Size Estimate:", candidateSizeEst, " Number of Collisions: ", numCollision)

    # Searching
    rNN = list()
    N = getN()
    if candidateSizeEst + numCollision * getCPURatio() > N:
        #print("Use Linear Search for OutputSensitive LSH...")
        for idxPoint in range(N):
            if bitdiff(query, data[idxPoint]) <= 5:
                rNN.append(idxPoint)
    else:
        #print("Use standard LSH for OutputSensitive LSH...")
        candidateNN = set()  # Remove duplicates
        for idxTable in range(L):
            hashValue = hashVALUEs[idxTable]
            if TABLEs[idxTable][hashValue]:
                candidateNN = candidateNN.union(TABLEs[idxTable][hashValue])

        # Compute distance
        for idxPoint in candidateNN:
            if bitdiff(query, data[idxPoint]) <= 5:
                rNN.append(idxPoint)

    return rNN  #, candidateSizeEst, numCollision
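# A reading of the branch in rNN_OutLSH above (inferred, not stated in the
# source): the HyperLogLog estimate prices both strategies in units of
# distance computations,
#     linear scan ~ N
#     LSH lookup  ~ candidateSizeEst + numCollision * getCPURatio()
# and the linear scan is used only when the estimated lookup cost exceeds N.
# This makes the search output-sensitive: cheap queries stay cheap, and
# queries with huge candidate sets fall back to the predictable linear pass.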
import scipy.spatial.distance


def dist(x, y):
    if getType() in ("bit_sampling", "basic_covering"):
        return bitdiff(x, y)
    elif getType() == "l1":
        return scipy.spatial.distance.cityblock(x, y)
    elif getType() == "l2":
        return scipy.spatial.distance.euclidean(x, y)
    elif getType() == "cosine":
        return scipy.spatial.distance.cosine(x, y)
def hamming_distance(bv1, bv2):
    """
    Return the Hamming distance between the `bv1` and `bv2` bitvectors, i.e.
    the number of positions at which their bits differ (equivalently, the
    count of bits set to one in an XOR of the two bit strings).

    `bv1` and `bv2` must both be bitarray instances of the same length.

    See http://en.wikipedia.org/wiki/Hamming_distance

    For example:
    >>> b1 = bitarray('0001010111100001111')
    >>> b2 = bitarray('0001010111100001111')
    >>> hamming_distance(b1, b2)
    0
    >>> b1 = bitarray('11110000')
    >>> b2 = bitarray('00001111')
    >>> hamming_distance(b1, b2)
    8
    >>> b1 = bitarray('11110000')
    >>> b2 = bitarray('00110011')
    >>> hamming_distance(b1, b2)
    4
    """
    return int(bitdiff(bv1, bv2))
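# NOTE: bitdiff() was deprecated in bitarray 1.2 and removed in bitarray 2.0;
# the snippets in this section assume an older release. A minimal
# compatibility shim for newer versions, where bitarray.util.count_xor(a, b)
# returns the same value (the number of differing bit positions):
try:
    from bitarray import bitdiff
except ImportError:
    from bitarray.util import count_xor as bitdiff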
def showPos(self):
    length = self.table.rowCount()
    curstate = bitarray.bitarray(length)
    curstate.setall(False)
    for i in range(self.table.rowCount()):
        if self.table.item(i, 1).checkState():
            curstate[i] = 1
    if bitarray.bitdiff(curstate, self.laststate):
        self.laststate = curstate
def rNN_Linear(query, data):
    # Searching
    rNN = list()
    for idxPoint in range(getN()):
        if bitdiff(query, data[idxPoint]) <= 5:
            rNN.append(idxPoint)
    return rNN
def hamming_distance(self, fingerprint1, fingerprint2):
    """
    Return the Hamming distance between the two given fingerprints.

    The Hamming distance is the number of bit positions at which two binary
    strings differ. Files whose fingerprints have a smaller Hamming distance
    tend to be more similar.
    """
    distance = bitdiff(fingerprint1, fingerprint2)
    return int(distance)
def hamming_distance(h1, h2):
    """
    Computes the hamming distance between hashes h1 and h2

    :param h1: A locality sensitive hash
    :param h2: A second locality sensitive hash produced from the same basis as h1
    :return: The hamming distance between h1 and h2
    """
    b1 = bitarray.bitarray()
    b1.frombytes(h1)
    b2 = bitarray.bitarray()
    b2.frombytes(h2)
    return bitarray.bitdiff(b1, b2)
def dehamming(bits):
    # Codewords of an extended Hamming(8,4) code (minimum distance 4):
    # a single bit error is correctable, a double error is detectable.
    G = ['00000000', '11010010', '01010101', '10000111',
         '10011001', '01001011', '11001100', '00011110',
         '11100001', '00110011', '10110100', '01100110',
         '01111000', '10101010', '00101101', '11111111']
    for c in G:
        bc = bitarray(c, endian='little')
        n = bitdiff(bc, bits)
        if n == 0 or n == 1:
            # Exact match or single correctable error: return the data bits.
            return ([bc[2], bc[4], bc[5], bc[6]], 0)
        elif n == 2:
            # Double error: detected but not correctable.
            return ([False, False, False, False], 1)
    return ([False, False, False, False], 1)
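# Hypothetical usage of dehamming() (the input is invented for illustration):
# a received word one bit away from a codeword still decodes to that
# codeword's data bits, with the error flag unset.
from bitarray import bitarray

received = bitarray('11010011', endian='little')  # '11010010' with its last bit flipped
data, err = dehamming(received)
print(data, err)  # expected: [False, False, False, True] 0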
def rNN_LSH(TABLEs, LSHs, query, data):
    L = len(LSHs)
    candidateNN = set()  # Removing duplicates
    for idxTable in range(L):
        hashValue = hashComputation(LSHs[idxTable], query)
        if TABLEs[idxTable][hashValue]:
            candidateNN = candidateNN.union(TABLEs[idxTable][hashValue])

    # Compute distance
    rNN = list()
    for idxPoint in candidateNN:
        if bitdiff(query, data[idxPoint]) <= 5:
            rNN.append(idxPoint)
    return rNN
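# A minimal, self-contained driver for rNN_LSH, assuming bit-sampling LSH.
# hashComputation, the table layout, and the toy data below are stand-ins for
# the snippet's real (unshown) helpers.
from collections import defaultdict
from random import getrandbits, sample
from bitarray import bitarray
from bitarray.util import count_xor as bitdiff  # see the shim above

D, N, L, K = 64, 100, 4, 8  # bits per point, points, tables, sampled bits

def hashComputation(lsh, point):
    # Bit-sampling LSH: the hash is the tuple of bits at the sampled positions.
    return tuple(point[i] for i in lsh)

data = [bitarray(format(getrandbits(D), '064b')) for _ in range(N)]
LSHs = [sample(range(D), K) for _ in range(L)]
TABLEs = [defaultdict(set) for _ in range(L)]
for idx, point in enumerate(data):
    for t in range(L):
        TABLEs[t][hashComputation(LSHs[t], point)].add(idx)

print(rNN_LSH(TABLEs, LSHs, data[0], data))  # always contains 0 (the query itself)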
def _bit_candidates(qbitvector, bitvectors_by_rid, min_score=100, ignore_empty=True):
    """
    Return rule candidates for further matching based on matching bitvectors
    """
    if ignore_empty and not qbitvector.any():
        return []

    matchable_len = len(qbitvector)
    candidates = []
    candidates_append = candidates.append
    for rid, ibitvector in bitvectors_by_rid:
        # we compute the AND that tells us all tokenids that exist in both, then
        # the hamming distance of that bitarray to the rule array
        distance = bitdiff(qbitvector & ibitvector, ibitvector)
        # a difference means we have some common bits (i.e. tokenids)
        if distance != matchable_len:
            if min_score == 100:
                if distance == 0:
                    # only keep possible 100% matches
                    candidates_append((distance, rid,))
            else:
                candidates_append((distance, rid,))
    return candidates
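# A hypothetical call with four-bit token vectors (rule ids and vectors
# invented for illustration): under the default min_score=100, only the rule
# whose set bits are fully contained in the query survives.
from bitarray import bitarray

query = bitarray('1110')
rules = [(1, bitarray('0110')),  # every bit of rule 1 appears in the query
         (2, bitarray('0001'))]  # rule 2 has a bit the query lacks
print(_bit_candidates(query, rules))  # expected: [(0, 1)]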
#     plt.ylim(0, 6)
    plt.legend()
    plt.show()


if __name__ == '__main__':
    b1 = 0
    b2 = 99
    t = 10
    s = 512
    ran = []
    for i in range(s):
        ran.append(np.random.random() * (b2 - b1 + 2 * t) + b1 - t)
    # inputData = np.random.random(1000) * 100
    # outputData = [distanceEncoding(x, b1, b2, t, s) for x in inputData]
    inputNum = [i for i in range(b2 + 1)]
    outputData = [distanceEncoding(x, b1, b2, t, s, ran) for x in inputNum]
    onesNum = [x.count() for x in outputData]
    factEw = np.mean(onesNum)
    Ew = s * 2. * t / (b2 - b1)
    factDist = [bitarray.bitdiff(outputData[0], outputData[i])
                for i in range(1, b2 + 1)]
    de = [(b2 - b1) * dh / (2. * s) for dh in factDist]
    draw(b2, de, s)
        output[i] = True
    return output


if __name__ == '__main__':
    b1 = 0
    b2 = 999
    t = 50
    s = 512
    # ran = []
    # for i in range(s):
    #     ran.append(np.random.random() * (b2 - b1) + b1)
    # inputData = np.random.random(1000) * 100
    # outputData = [distanceEncoding(x, b1, b2, t, s) for x in inputData]
    inputNum = [i for i in range(b2 + 1)]
    outputData = [distanceEncoding(x, b1, b2, t, s) for x in inputNum]
    onesNum = [x.count() for x in outputData]
    factEw = np.mean(onesNum)
    Ew = s * 2. * t / (b2 - b1)
    factDist = [bitarray.bitdiff(outputData[0], outputData[i])
                for i in range(1, b2 + 1)]
    de = [(b2 - b1) * dh / (2. * s) for dh in factDist]
    draw(b2, de, s)
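# What the two experiments above appear to check (an inference, not stated in
# the source): each value x in [b1, b2] is encoded as an s-bit vector whose
# set bits mark the random anchors falling within distance t of x. Then:
#   expected ones per encoding:
#     E[w] = s * 2t / (b2 - b1) = 512 * 2 * 50 / 999 ~ 51.3   (compare factEw)
#   for |x - y| <= 2t the two anchor windows overlap and their symmetric
#   difference has width 2|x - y|, so the expected Hamming distance is
#     dh ~ s * 2|x - y| / (b2 - b1)
#   which inverts to the decoder used above:
#     |x - y| ~ (b2 - b1) * dh / (2 * s)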
# bitmask to keep track of kmers that can be eliminated
junk = bitarray(total)
junk.setall(False)
for i in xrange(0, len(encodedklist)):
    if progress_count % math.ceil(total / 10000.0) == 0:
        p = (float(progress_count) / total) * 100
        reduction = total - junk.count()
        sys.stdout.write("\r%.2f%% progress. " % p +
                         "Feature set reduced to %d kmers ... " % reduction)
        sys.stdout.flush()
    progress_count += 1
    if junk[i]:
        continue
    for j in xrange(i + 1, len(encodedklist)):
        if junk[j]:
            continue
        if bitdiff(encodedklist[i], encodedklist[j]) / 2 <= args.threshold:
            junk[j] = 1

print
print "Done reducing kmer set ... "

for i in xrange(0, len(encodedklist)):
    if junk[i]:
        continue
    featureMap[''.join(encodedklist[i].decode(encoding))] = f
    f += 1

print "Creating positive feature vectors in svm-light format ... "
output = "pos_svm_light_" + repr(args.k) + "_" + repr(args.threshold) + "_" + args.distance
print "Feature vectors will be saved in " + output + " ... "
def areImageSigsSimilar(sig1, sig2):
    """
    Compare 2 image "signatures" and return True if they seem to come from
    a similar image, False otherwise.
    """
    return bitarray.bitdiff(sig1, sig2) < 100
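# Hypothetical check of the 100-bit threshold (signature size and contents
# invented for illustration): two 256-bit signatures differing in 50 bits
# count as similar.
import bitarray

sig_a = bitarray.bitarray(256)
sig_a.setall(False)
sig_b = bitarray.bitarray(sig_a)      # copy
sig_b[0:50] = bitarray.bitarray('1' * 50)  # flip 50 bits
print(areImageSigsSimilar(sig_a, sig_b))   # True, since 50 < 100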
def scoreHash(h):
    return 128 - bitdiff(h, getTargetHash())
def test_ber():
    # Generate a random bit stream
    Nbits = 1000
    bits = bitarray.bitarray((random.rand(Nbits) > 0.5).tolist())

    # modulate
    sig = afsk1200(bits, fs=44100.0)

    # add noise
    sig_n = sig + 1 * random.randn(len(sig))  # with noise or without?

    # demodulate and decode bits with non-coherent and FM demodulators
    NRZa_nc = nc_afskDemod(sig_n, tbw=2.0)
    NRZa_fm = fm_afskDemod(sig_n, TBW=4)

    # Compute Bit Error Rate curves
    BER_nc = []
    BER_fm = []
    for sigma in r_[0.1:8.0:0.2]:
        bits_temp = bitarray.bitarray((random.rand(Nbits) > 0.5).tolist())

        # modulate and add noise
        sig = afsk1200(bits_temp, fs=44100.0)
        sig_n = sig + sigma * random.randn(len(sig))

        # demodulate and decode bits with non-coherent and FM demodulators
        NRZa_nc = nc_afskDemod(sig_n, tbw=2.0, fs=44100.0)
        NRZa_fm = fm_afskDemod(sig_n, TBW=4, N=74)
        NRZ_nc = np.sign(NRZa_nc)
        NRZ_fm = np.sign(NRZa_fm)

        fs = 44100.0
        bits_dec_nc = decode_bits(NRZ_nc, Nbits, 56, fs=fs)
        E_nc = int(bitarray.bitdiff(bits_temp, bits_dec_nc[0:min(len(bits_temp), len(bits_dec_nc))]))
        bits_dec_fm = decode_bits(NRZ_fm, Nbits, 92, fs=fs)
        E_fm = int(bitarray.bitdiff(bits_temp, bits_dec_fm[0:min(len(bits_temp), len(bits_dec_fm))]))
        BER_nc.append(1.0 * E_nc / len(bits_temp))
        BER_fm.append(1.0 * E_fm / len(bits_temp))

    print BER_nc
    print BER_fm

    # plot
    f = plt.figure()
    plt.loglog(1 / (r_[0.1:8.1:0.2]), BER_nc)
    plt.loglog(1 / (r_[0.1:8.1:0.2]), BER_fm, 'r')
    plt.title("empirical BER for AFSK demodulation")
    plt.xlabel("SNR")
    plt.ylabel("BER")
    plt.legend(("non-coherent", "FM"))
    plt.show()
def distance(self, other):
    """
    Return the Hamming distance between this hash and another hash.
    """
    return int(bitdiff(self.hash(), other.hash()))
def equalBitArrays(ba1, ba2):
    return bitdiff(ba1, ba2) == 0