Esempio n. 1
0
 def lsh2(kmer):
     """Return a single min-hash style value for ``kmer``.

     For every window start position, a 6-character slice of ``kmer`` is
     converted to an integer with ``kmerutils.int_value``. Each integer is
     scrambled by multiplying with a fixed constant modulo 2**32, and the
     smallest scrambled value is returned.

     The time spent building the shingle list is added to the module-level
     ``global_timer``.

     Raises:
         ValueError: if ``kmer`` is shorter than 16 characters, so there is
             no window position to hash.
     """
     global global_timer

     def permute(x):
         # Multiplicative scramble, reduced to 32 bits.
         return (2654435761 * x) % (2**32)

     # NOTE(review): the number of window positions is derived from
     # shingle_len (16), but each slice taken below is only 6 characters
     # wide. Kept as-is so existing hash values do not change -- confirm
     # which width is actually intended.
     shingle_len = 16
     slice_len = 6
     num_shingles = len(kmer) - shingle_len + 1

     # time.clock() was removed in Python 3.8; prefer perf_counter() and only
     # fall back to clock() on interpreters that do not provide it.
     clock = getattr(time, "perf_counter", None) or time.clock
     start_time = clock()
     shingles = [
         kmerutils.int_value(kmer[i:i + slice_len])
         for i in range(num_shingles)
     ]
     global_timer += clock() - start_time

     if not shingles:
         # min() would raise ValueError here anyway; say why instead.
         raise ValueError(
             "kmer of length %d is shorter than the shingle length %d"
             % (len(kmer), shingle_len)
         )
     return min(permute(x) for x in shingles)
Esempio n. 2
0
 def lsh(kmer):
     """Return a locality-sensitive hash of ``kmer`` with one bit per seed.

     The k-mer is cut into overlapping 6-character shingles, each converted
     to an integer with ``kmerutils.int_value`` and de-duplicated. For every
     seed in the module-level ``SEEDS``, each shingle value ``x`` is mapped
     to a seeded pseudo-random integer in ``[0, x]`` and the minimum over all
     shingles is taken. The low bit of each minimum is appended, first seed
     in the most significant position, to form the returned integer.

     Raises:
         ValueError: from ``min()`` when ``kmer`` is shorter than the shingle
             length and ``SEEDS`` is non-empty.
     """
     # Approach loosely follows:
     # http://nlp.stanford.edu/IR-book/html/htmledition/near-duplicates-and-shingling-1.html
     def permute(x, seed):
         # A private generator produces the same value as reseeding the
         # module-level one, without clobbering the process-wide RNG state
         # that other code may rely on.
         return random.Random(seed).randint(0, x)

     # Parameters worth exploring:
     # * shingle length
     # * number of seeds to use
     # * selecting the shingles in a strided pattern
     shingle_len = 6  # original author's note: 2^6 == 64, 1 bit per shingle
     shingles = {
         kmerutils.int_value(kmer[i:i + shingle_len])
         for i in range(len(kmer) - shingle_len + 1)
     }

     value = 0
     for seed in SEEDS:
         min_permuted = min(permute(x, seed) for x in shingles)
         # Shift in the low bit of this seed's minimum.
         value = value * 2 + min_permuted % 2
     return value