def UseCSearch(seq1, set1, seq2, set2, min_score, meaningful_length,
               allowed_consecutive_errors, allowed_distance_error,
               count_misses):
    """Run the C comparison with the shorter sequence against the other set.

    Whichever of ``seq1`` / ``seq2`` has the smaller ``len()`` is used as the
    sequence argument; it is paired with the set belonging to the *other*
    input (``set2`` when ``seq1`` is shorter, otherwise ``set1``).  On equal
    lengths ``seq2`` and ``set1`` are used.

    ``min_score`` and ``count_misses`` are accepted but not used by this
    function; they are not forwarded to ``OptimizeCompare.compare``.

    Returns whatever ``OptimizeCompare.compare`` returns.
    """
    first_is_shorter = len(seq1) < len(seq2)
    probe_seq = seq1 if first_is_shorter else seq2
    target_set = set2 if first_is_shorter else set1
    return OptimizeCompare.compare(
        probe_seq,
        target_set,
        meaningful_length,
        allowed_consecutive_errors,
        allowed_distance_error,
    )
def __shingle_and_hash(self, words, shingle_size, L_tables):
    """Compute a min-hash value per table over the shingles of *words*.

    Parameters:
        words: indexable, sliceable sequence of hashable items.
        shingle_size: number of consecutive items forming one shingle.
        L_tables: number of min-hash slots to produce.

    Returns:
        An ``array('l')``.  When ``Cfg.myOptions.useC`` is true it is built
        from the result of ``OptimizeCompare.shingle_and_hash``; otherwise it
        has ``L_tables`` entries computed in pure Python by XOR-ing each
        shingle's ``hash()`` with ``Utility.myMasks.masks[h]`` and keeping
        the minimum per slot.
    """
    if Cfg.myOptions.useC:
        # C path: hand the per-word hashes to the extension module and let it
        # do the shingling and masking.
        word_count = len(words)
        word_hashes = OptimizeCompare.HashSequence()
        word_hashes.resize(word_count, 0)
        for i in xrange(word_count):
            word_hashes[i] = hash(words[i])
        c_minhashes = OptimizeCompare.shingle_and_hash(
            word_hashes, Utility.myMasks.masks, L_tables, shingle_size)
        # Convert to native python type because this array is small, but will
        # be accessed frequently!
        return array('l', c_minhashes)

    # Pure-Python path.  Every slot starts at the largest signed 64-bit value
    # so that any smaller masked hash replaces it.
    minhashes = array('l', [9223372036854775807]) * L_tables

    # This double loop is the hot spot (the original author measured over a
    # second per book), so the loop-invariant attribute chain and the index
    # range are resolved once instead of on every inner iteration.
    masks = Utility.myMasks.masks
    # NOTE(review): the range stops at L_tables - 1, so the last slot is never
    # updated and keeps its initial maximum value.  Left as-is because changing
    # it would change the produced hashes -- confirm whether this is intended.
    table_indexes = xrange(L_tables - 1)

    # NOTE(review): this visits start positions 0 .. len(words)-shingle_size-1,
    # i.e. the final full-width window is skipped.  Left as-is for the same
    # reason -- confirm whether this is intended.
    for start in xrange(len(words) - shingle_size):
        shingle_hash = hash(tuple(words[start:start + shingle_size]))
        for h in table_indexes:
            candidate = shingle_hash ^ masks[h]
            if candidate < minhashes[h]:
                minhashes[h] = candidate
    return minhashes