Exemple #1
0
def test_generate_shingles():
    """Check ``generate_shingles`` with ``count=2`` on one to four tokens.

    The mapper is a pass-through, so the expected values are the raw
    shingles: a lone token comes back unchanged, longer inputs produce
    every pair of adjacent tokens joined by a single space.
    """
    def passthrough(value):
        return value

    # (input tokens, expected shingles) pairs, shortest input first.
    expectations = [
        (["a"], ["a"]),
        (["b", "c"], ["b c"]),
        (["d", "e", "f"], ["d e", "e f"]),
        (["g", "h", "i", "j"], ["g h", "h i", "i j"]),
    ]

    for tokens, expected in expectations:
        produced = list(
            generate_shingles(tokens, count=2, mapper=passthrough))
        assert produced == expected
Exemple #2
0
    def addDocument(self, id, body):
        """Compute the signature of ``body`` and store it under ``id``.

        Returns a ``(message, status)`` tuple when the document is rejected
        (the status values look like HTTP codes -- confirm with the caller):

        * ``('Id too long', 400)`` when ``id`` has more than 42 characters.
        * ``('Document already exists', 409)`` when ``id`` is already in
          ``self.store``.

        On success nothing is returned explicitly (i.e. ``None``).
        """
        id_too_long = len(id) > 42
        if id_too_long:
            return 'Id too long', 400

        already_known = id in self.store
        if already_known:
            return 'Document already exists', 409

        # The body is tokenised on single spaces only; no other
        # normalisation is applied before shingling.
        tokens = body.split(" ")
        signature = minhash.calculate_signature(
            list(minhash.generate_shingles(tokens)), self.hash_funcs)
        self.store.add(id, signature)
Exemple #3
0
    def similarByContent(self, content):
        """Return stored documents whose signature resembles ``content``.

        ``content`` is split on single spaces, shingled and turned into a
        signature with ``self.hash_funcs``; that signature is scored against
        every signature in ``self.store.sigs``.

        Returns a list of ``{"id": ..., "score": ...}`` dicts, one per stored
        document whose score is strictly above 0.42. ``score`` is the raw
        score multiplied by 100 and truncated to an int.
        """
        tokens = content.split(" ")
        query_sig = minhash.calculate_signature(
            list(minhash.generate_shingles(tokens)), self.hash_funcs)
        scores = minhash.approx_jaccard_score(query_sig, self.store.sigs, 1)
        hits = scores > .42  # TODO: find appropriate threshold

        # `hits` is used as a mask on both the ids and the scores so the two
        # sequences stay aligned (assumes both are numpy arrays -- confirm).
        matched_ids = self.store.ids[hits]
        percentages = (scores[hits] * 100).astype(int).tolist()

        results = []
        for doc_id, percent in zip(matched_ids, percentages):
            results.append({"id": doc_id, "score": percent})
        return results
Exemple #4
0
def __main__():
    """Compute (or load) document signatures and report likely duplicates.

    Steps, in order:

    1. Seed ``random`` from the ``SEED`` environment variable (``None`` when
       it is unset).
    2. If ``signatures.json`` exists, load ids and signatures from it;
       otherwise read ``docs.json``, compute one signature per document and
       write them to ``signatures.json``.
    3. Print a preview of the first and last four signatures.
    4. Score every signature against all later ones.
    5. Print a histogram of the scores in steps of 0.1.
    6. Write every pair whose score lies strictly between 0.42 and 1 to the
       file ``discovered_dups``.
    """
    sig_len = 42
    cache_path = "signatures.json"

    seed = os.getenv("SEED")
    random.seed(seed)
    print("using seed:", seed)
    print("signature length:", sig_len)

    if not os.path.isfile(cache_path):
        with open("docs.json") as docs_file:
            docs = json.load(docs_file)

        ids = [entry['id'] for entry in docs]
        # NOTE(review): the head preview starts at index 1, not 0, unlike
        # the signature preview further down -- confirm this is intended.
        head = " ".join(str(doc_id) for doc_id in ids[1:5])
        tail = " ".join(str(doc_id) for doc_id in ids[-4:])
        print(len(ids), ":", head, "...", tail)

        hash_funcs = list(generate_hash_funcs(sig_len))

        with Timer("signature time"):
            sigs = np.empty((len(docs), sig_len))
            for row, entry in enumerate(docs):
                tokens = simple_preprocess(
                    entry['text'], min_len=0, max_len=4242)
                shingles = list(generate_shingles(tokens))
                sigs[row] = calculate_signature(shingles, hash_funcs)

        with open(cache_path, 'w') as cache_file:
            records = [{
                "id": doc_id,
                "sig": sig.astype(int).tolist()
            } for doc_id, sig in zip(ids, sigs)]
            json.dump(records, cache_file)
    else:
        print("loading known signatures")
        with open(cache_path, "r") as cache_file:
            data = json.load(cache_file)
            ids = []
            # TODO: sig_len may be different from the length that was used
            # when the cached signatures were written
            sigs = np.empty((len(data), sig_len))
            for row, entry in enumerate(data):
                ids.append(entry['id'])
                sigs[row] = entry['sig']

    def show_rows(rows):
        # Print the first and last four values of each signature in `rows`.
        for sig in rows:
            leading = " ".join(map(str, sig[:4]))
            trailing = " ".join(map(str, sig[-4:]))
            print("[", leading, "...", trailing, "]")

    show_rows(sigs[:4])
    print("...")
    show_rows(sigs[-4:])

    # Only the upper-right triangle of the score matrix is built: row i holds
    # the scores of signature i against signatures i+1, i+2, ...
    # Entries on or below the main diagonal do not exist, so the score for
    # the pair (x, y) with y > x lives at scores[x][y - x - 1].
    with Timer("score time"):
        scores = []
        for i, current in enumerate(sigs):
            scores.append(approx_jaccard_score(current, sigs[i + 1:], 1))

    with Timer("bin time"):
        # np.histogram uses last bin as max, to include 1.0 need a bin >1.0
        bins = (0, .1, .2, .3, .4, .5, .6, .7, .8, .9, 1, 42)
        # One counter per left bin edge; the final edge (42) only closes the
        # last bin and therefore gets no counter of its own.
        hist = {edge: 0 for edge in bins[:-1]}
        for row in scores:
            # Truncate each score to one decimal before binning.
            truncated = (row * 10).astype(int) / 10
            counts, _ = np.histogram(truncated, bins)
            for edge, count in zip(bins, counts):
                hist[edge] += count
    print(hist)

    with open("discovered_dups", "w") as fp:
        threshold = .42
        total = len(scores)
        for i, row in enumerate(scores):
            for offset in range(total - i - 1):
                j = i + 1 + offset
                score = row[offset]
                # Exact matches (score of 1 or more) are deliberately
                # excluded, as are scores at or below the threshold.
                if threshold < score < 1:
                    print(ids[i], ids[j], score, file=fp)
Exemple #5
0
 def calculate_signature(self, text):
     """Return the signature of ``text`` using ``self.hash_funcs``.

     ``text`` may be a string, which is split on single spaces, or any
     other value, which is passed to the shingle generator unchanged
     (presumably an iterable of tokens -- confirm with callers).
     """
     tokens = text.split(" ") if isinstance(text, str) else text
     shingles = list(minhash.generate_shingles(tokens))
     return minhash.calculate_signature(shingles, self.hash_funcs)