Esempio n. 1
0
class NGramSimilarity():
    """Score how much two documents overlap in their word three-grams.

    Each document is reduced to the set of MD5 digests of its consecutive
    three-token windows; the score is the size of the intersection divided
    by the size of the smaller set.
    """

    def __init__(self):
        # Preprocessor is defined elsewhere in the project; only its
        # ``remove_mark`` method is used by this class.
        self.preprocessor = Preprocessor()
        self.current_dir = os.path.dirname(__file__)

    def check(self, doc1, doc2):
        """Return the three-gram overlap ratio of ``doc1`` and ``doc2``.

        Both documents are passed through ``self.preprocessor.remove_mark``
        before being split into three-grams.

        Returns:
            A float in ``[0.0, 1.0]``: shared three-grams divided by the
            three-gram count of the smaller document. ``0.0`` is returned
            when either document yields no three-grams (fewer than three
            tokens), since there is nothing to compare.
        """
        grams1 = self.three_grams(self.preprocessor.remove_mark(doc1))
        grams2 = self.three_grams(self.preprocessor.remove_mark(doc2))

        smaller = min(len(grams1), len(grams2))
        if not smaller:
            # Avoid ZeroDivisionError for documents shorter than three tokens.
            return 0.0

        return float(len(grams1 & grams2)) / smaller

    def three_grams(self, doc):
        """Return the set of MD5 digests of every three-token window in ``doc``.

        Tokens are obtained by splitting on single space characters, so the
        result is empty when ``doc`` has fewer than three tokens.
        """
        words = doc.split(' ')
        checksums = set()

        for start in range(len(words) - 2):
            gram = ' '.join(words[start:start + 3])
            # hashlib only accepts bytes; encode text, leave byte strings as-is.
            if not isinstance(gram, bytes):
                gram = gram.encode('utf-8')
            checksums.add(hashlib.md5(gram).digest())

        return checksums