Beispiel #1
0
    def get_low_strdiff_pairs(self):
        """Yield (source, target) id pairs whose tokens are (nearly) identical.

        For every source id in ``self.interesting[0]`` the associated target
        ids are visited in the mapping's iteration order.  Both tokens are
        lower-cased before comparison.  At most one pair is yielded per source
        id: the first target that satisfies one of the rules below.

        A target is ignored when the sizes of the two ``_index`` entries
        differ by more than a factor of three (or when the target entry is
        empty, for which no ratio can be formed).

        Yields:
            ``(((src,), (tgt,)), score)`` with ``score`` being

            * ``1.0`` -- the lower-cased tokens are equal,
            * ``0.8`` -- ``levenshtein`` reports 1 and both tokens have at
              least 5 characters,
            * ``0.6`` -- ``levenshtein`` reports 2, both tokens have at least
              7 characters and their lengths differ by at most one.
        """
        logging.info("String difference phase started")
        src_index = self._src._index
        tgt_index = self._tgt._index
        candidates = self.interesting[0]
        for src in candidates:
            src_tok = self._src.ints_to_tokens([src])[0].lower()
            src_len = len(src_tok)
            # Iterate the keys directly: the mapped values are never used, and
            # unlike ``iteritems()`` this works on Python 2 and Python 3 alike.
            for tgt in candidates[src]:
                tgt_tok = self._tgt.ints_to_tokens([tgt])[0].lower()

                tgt_count = len(tgt_index[tgt])
                if not tgt_count:
                    # An empty target entry has no meaningful ratio; skip it
                    # instead of aborting the generator with ZeroDivisionError.
                    continue
                # float() keeps this a true division under Python 2 as well.
                ratio = float(len(src_index[src])) / tgt_count
                if ratio > 3 or ratio < 1 / 3.0:
                    continue

                if src_tok == tgt_tok:
                    logging.debug("{0} added".format(repr((src_tok, tgt_tok))))
                    yield ((src,), (tgt,)), 1.0
                    break

                tgt_len = len(tgt_tok)
                if src_len < 5 or tgt_len < 5:
                    # Every fuzzy rule needs at least 5 characters on both
                    # sides, so the distance would never be consulted.
                    continue

                idiff = levenshtein(src_tok, tgt_tok)
                if idiff == 1:
                    logging.debug("{0} = {1} added".format(
                        repr((src_tok, tgt_tok)), idiff))
                    yield ((src,), (tgt,)), 0.8
                    break

                if (idiff == 2 and src_len >= 7 and tgt_len >= 7
                        and abs(tgt_len - src_len) <= 1):
                    logging.debug("{0} = {1} added".format(
                        repr((src_tok, tgt_tok)), idiff))
                    yield ((src,), (tgt,)), 0.6
                    break

        logging.info("String difference phase done")
Beispiel #2
0
    def get_low_strdiff_pairs(self):
        """Yield (source, target) id pairs whose tokens are (nearly) identical.

        For every source id in ``self.interesting[0]`` the associated target
        ids are visited in the mapping's iteration order.  Both tokens are
        lower-cased before comparison.  At most one pair is yielded per source
        id: the first target that satisfies one of the rules below.

        A target is ignored when the sizes of the two ``_index`` entries
        differ by more than a factor of three (or when the target entry is
        empty, for which no ratio can be formed).

        Yields:
            ``(((src,), (tgt,)), score)`` with ``score`` being

            * ``1.0`` -- the lower-cased tokens are equal,
            * ``0.8`` -- ``levenshtein`` reports 1 and both tokens have at
              least 5 characters,
            * ``0.6`` -- ``levenshtein`` reports 2, both tokens have at least
              7 characters and their lengths differ by at most one.
        """
        logging.info("String difference phase started")
        src_index = self._src._index
        tgt_index = self._tgt._index
        candidates = self.interesting[0]
        for src in candidates:
            src_tok = self._src.ints_to_tokens([src])[0].lower()
            src_len = len(src_tok)
            # Iterate the keys directly: the mapped values are never used, and
            # unlike ``iteritems()`` this works on Python 2 and Python 3 alike.
            for tgt in candidates[src]:
                tgt_tok = self._tgt.ints_to_tokens([tgt])[0].lower()

                tgt_count = len(tgt_index[tgt])
                if not tgt_count:
                    # An empty target entry has no meaningful ratio; skip it
                    # instead of aborting the generator with ZeroDivisionError.
                    continue
                # float() keeps this a true division under Python 2 as well.
                ratio = float(len(src_index[src])) / tgt_count
                if ratio > 3 or ratio < 1 / 3.0:
                    continue

                if src_tok == tgt_tok:
                    logging.debug("{0} added".format(repr((src_tok, tgt_tok))))
                    yield ((src, ), (tgt, )), 1.0
                    break

                tgt_len = len(tgt_tok)
                if src_len < 5 or tgt_len < 5:
                    # Every fuzzy rule needs at least 5 characters on both
                    # sides, so the distance would never be consulted.
                    continue

                idiff = levenshtein(src_tok, tgt_tok)
                if idiff == 1:
                    logging.debug("{0} = {1} added".format(
                        repr((src_tok, tgt_tok)), idiff))
                    yield ((src, ), (tgt, )), 0.8
                    break

                if (idiff == 2 and src_len >= 7 and tgt_len >= 7
                        and abs(tgt_len - src_len) <= 1):
                    logging.debug("{0} = {1} added".format(
                        repr((src_tok, tgt_tok)), idiff))
                    yield ((src, ), (tgt, )), 0.6
                    break

        logging.info("String difference phase done")
Beispiel #3
0
 def choose_most_similar_stemming(self, stemmed_versions, b):
     """Return the entry of ``stemmed_versions`` that scores lowest against ``b``.

     Each candidate is scored with ``levenshtein(candidate, b)``.  When
     several candidates share the lowest score, the one that appears first
     in ``stemmed_versions`` is returned.

     Args:
         stemmed_versions: iterable of candidates to choose from.
         b: reference value every candidate is compared with.

     Raises:
         IndexError: if ``stemmed_versions`` is empty.
     """
     candidates = list(stemmed_versions)
     if not candidates:
         # Keep the exception type that indexing an empty sorted list raised.
         raise IndexError("stemmed_versions is empty")
     # A single linear pass is enough; a full sort is not needed to find
     # the minimum, and min() keeps the first of several tied candidates.
     return min(candidates, key=lambda candidate: levenshtein(candidate, b))
Beispiel #4
0
 def choose_most_similar_stemming(self, stemmed_versions, b):
     """Return the entry of ``stemmed_versions`` that scores lowest against ``b``.

     Each candidate is scored with ``levenshtein(candidate, b)``.  When
     several candidates share the lowest score, the one that appears first
     in ``stemmed_versions`` is returned.

     Args:
         stemmed_versions: iterable of candidates to choose from.
         b: reference value every candidate is compared with.

     Raises:
         IndexError: if ``stemmed_versions`` is empty.
     """
     candidates = list(stemmed_versions)
     if not candidates:
         # Keep the exception type that indexing an empty sorted list raised.
         raise IndexError("stemmed_versions is empty")
     # A single linear pass is enough; a full sort is not needed to find
     # the minimum, and min() keeps the first of several tied candidates.
     return min(candidates, key=lambda candidate: levenshtein(candidate, b))