Beispiel #1
0
def UseCSearch(seq1, set1, seq2, set2, min_score, meaningful_length, allowed_consecutive_errors, allowed_distance_error, count_misses):
    """Delegate a sequence comparison to ``OptimizeCompare.compare``.

    The shorter of the two sequences is paired with the set that belongs to
    the *other* (longer) sequence; on equal lengths ``seq2`` and ``set1``
    are used. ``min_score`` and ``count_misses`` are accepted for signature
    compatibility but are not forwarded by this function.

    Returns:
        Whatever ``OptimizeCompare.compare`` returns.
    """
    first_is_shorter = len(seq1) < len(seq2)
    if first_is_shorter:
        candidate, reference_set = seq1, set2
    else:
        candidate, reference_set = seq2, set1
    return OptimizeCompare.compare(
        candidate,
        reference_set,
        meaningful_length,
        allowed_consecutive_errors,
        allowed_distance_error,
    )
Beispiel #2
0
 def __shingle_and_hash(self, words, shingle_size, L_tables):
     """Build the per-table minimum-hash array for a sequence of words.

     The path is selected by ``Cfg.myOptions.useC``:

     * Pure Python: each window of ``shingle_size`` consecutive words is
       hashed as a tuple, XOR-ed with the per-table mask taken from
       ``Utility.myMasks.masks``, and the smallest value seen for each
       table is kept.
     * C extension: every word is hashed individually and the hashes are
       passed to ``OptimizeCompare.shingle_and_hash`` (presumably the same
       algorithm in native code -- TODO confirm against the extension).

     Args:
         words: Sequence of hashable words (must support ``len`` and
             slicing).
         shingle_size: Number of consecutive words per shingle.
         L_tables: Number of hash tables; on the Python path this is the
             length of the returned array.

     Returns:
         An ``array('l')`` holding the minimum hashes.
     """
     if Cfg.myOptions.useC:
         hashes = OptimizeCompare.HashSequence()
         hashes.resize(len(words), 0)
         for i, word in enumerate(words):
             hashes[i] = hash(word)
         myhashes = OptimizeCompare.shingle_and_hash(hashes, Utility.myMasks.masks, L_tables, shingle_size)
         # Convert to a native Python array: it is small but read frequently.
         return array('l', myhashes)

     # Every slot starts at 2**63 - 1 so any smaller hash replaces it.
     minhashes = array('l', [9223372036854775807]) * L_tables
     # Bind the mask table once; this attribute chain is loop-invariant and
     # the inner loop below is the hot spot of the pure-Python path.
     masks = Utility.myMasks.masks
     # NOTE(review): this bound skips the final window starting at
     # len(words) - shingle_size; kept as-is -- confirm whether that matches
     # the C implementation before changing it.
     for i in xrange(len(words) - shingle_size):
         hshingle = hash(tuple(words[i:i + shingle_size]))
         # NOTE(review): only the first L_tables - 1 slots are ever updated,
         # so the last slot keeps its initial maximum value -- confirm this
         # is intentional.
         for h in xrange(L_tables - 1):
             myhash = hshingle ^ masks[h]
             if myhash < minhashes[h]:
                 minhashes[h] = myhash
     return minhashes