def get_low_strdiff_pairs(self):
    """Yield token-id pairs whose lowercased surface forms are (nearly) equal.

    For every source id in ``self.interesting[0]`` the candidate target ids
    stored under it are scanned in iteration order.  The first candidate
    that qualifies is yielded and the remaining candidates for that source
    id are skipped, so at most one pair is produced per source id.

    A candidate is ignored when ``len(src_index[src]) / len(tgt_index[tgt])``
    is greater than 3 or smaller than 1/3.

    Yields:
        ``(((src,), (tgt,)), score)`` where ``score`` is

        * ``1.0`` -- the lowercased tokens are identical;
        * ``0.8`` -- both tokens have at least 5 characters and
          ``levenshtein`` returns 1;
        * ``0.6`` -- both tokens have at least 7 characters, their lengths
          differ by at most 1 and ``levenshtein`` returns 2.

    NOTE(review): a second definition with this same name follows this one
    in the file; if both live in the same scope the later one takes effect
    -- confirm which copy should be kept.
    """
    logging.info("String difference phase started")
    src_index = self._src._index
    tgt_index = self._tgt._index
    for src in self.interesting[0]:
        src_tok = self._src.ints_to_tokens([src])[0].lower()
        # Only the candidate ids are used, so iterate the mapping directly
        # (also avoids the Python 2-only ``iteritems``).
        for tgt in self.interesting[0][src]:
            tgt_tok = self._tgt.ints_to_tokens([tgt])[0].lower()
            # Skip pairs whose index entries differ in size by more than 3x.
            ratio = float(len(src_index[src])) / len(tgt_index[tgt])
            if ratio > 3 or ratio < 1 / 3.0:
                continue
            if src_tok == tgt_tok:
                logging.debug("{0} added".format(repr((src_tok, tgt_tok))))
                yield ((src,), (tgt,)), 1.0
                break
            src_len = len(src_tok)
            tgt_len = len(tgt_tok)
            # Both fuzzy tiers need at least 5 characters on each side, so
            # the distance is not worth computing for shorter tokens.
            if src_len < 5 or tgt_len < 5:
                continue
            idiff = levenshtein(src_tok, tgt_tok)
            if idiff == 1:
                logging.debug("{0} = {1} added".format(
                    repr((src_tok, tgt_tok)), idiff))
                yield ((src,), (tgt,)), 0.8
                break
            if (idiff == 2 and src_len >= 7 and tgt_len >= 7
                    and abs(tgt_len - src_len) <= 1):
                logging.debug("{0} = {1} added".format(
                    repr((src_tok, tgt_tok)), idiff))
                yield ((src,), (tgt,)), 0.6
                break
    logging.info("String difference phase done")
def get_low_strdiff_pairs(self):
    """Yield token-id pairs whose lowercased surface forms are (nearly) equal.

    For every source id in ``self.interesting[0]`` the candidate target ids
    stored under it are scanned in iteration order.  The first candidate
    that qualifies is yielded and the remaining candidates for that source
    id are skipped, so at most one pair is produced per source id.

    A candidate is ignored when ``len(src_index[src]) / len(tgt_index[tgt])``
    is greater than 3 or smaller than 1/3.

    Yields:
        ``(((src,), (tgt,)), score)`` where ``score`` is

        * ``1.0`` -- the lowercased tokens are identical;
        * ``0.8`` -- both tokens have at least 5 characters and
          ``levenshtein`` returns 1;
        * ``0.6`` -- both tokens have at least 7 characters, their lengths
          differ by at most 1 and ``levenshtein`` returns 2.

    NOTE(review): a definition with this same name also appears directly
    before this one in the file; if both live in the same scope this later
    one is the one that takes effect -- confirm which copy should be kept.
    """
    logging.info("String difference phase started")
    src_index = self._src._index
    tgt_index = self._tgt._index
    for src in self.interesting[0]:
        src_tok = self._src.ints_to_tokens([src])[0].lower()
        # Only the candidate ids are used, so iterate the mapping directly
        # (also avoids the Python 2-only ``iteritems``).
        for tgt in self.interesting[0][src]:
            tgt_tok = self._tgt.ints_to_tokens([tgt])[0].lower()
            # Skip pairs whose index entries differ in size by more than 3x.
            ratio = float(len(src_index[src])) / len(tgt_index[tgt])
            if ratio > 3 or ratio < 1 / 3.0:
                continue
            if src_tok == tgt_tok:
                logging.debug("{0} added".format(repr((src_tok, tgt_tok))))
                yield ((src, ), (tgt, )), 1.0
                break
            src_len = len(src_tok)
            tgt_len = len(tgt_tok)
            # Both fuzzy tiers need at least 5 characters on each side, so
            # the distance is not worth computing for shorter tokens.
            if src_len < 5 or tgt_len < 5:
                continue
            idiff = levenshtein(src_tok, tgt_tok)
            if idiff == 1:
                logging.debug("{0} = {1} added".format(
                    repr((src_tok, tgt_tok)), idiff))
                yield ((src, ), (tgt, )), 0.8
                break
            if (idiff == 2 and src_len >= 7 and tgt_len >= 7
                    and abs(tgt_len - src_len) <= 1):
                logging.debug("{0} = {1} added".format(
                    repr((src_tok, tgt_tok)), idiff))
                yield ((src, ), (tgt, )), 0.6
                break
    logging.info("String difference phase done")
def choose_most_similar_stemming(self, stemmed_versions, b):
    """Return the element of ``stemmed_versions`` closest to ``b``.

    Closeness is the value returned by ``levenshtein(candidate, b)``; the
    candidate with the smallest value wins, and on ties the one that comes
    first in iteration order is returned.

    Args:
        stemmed_versions: iterable of candidates to choose from.
        b: the reference value every candidate is compared against.

    Raises:
        IndexError: if ``stemmed_versions`` is empty.
    """
    candidates = list(stemmed_versions)
    if not candidates:
        # ``min`` would raise ValueError here; keep the IndexError that
        # indexing an empty sorted list used to produce.
        raise IndexError("stemmed_versions is empty")
    # A single linear pass is enough -- no need to sort just to take the head.
    return min(candidates, key=lambda candidate: levenshtein(candidate, b))