Example 1
def test_approx_jaccard_score():
    from numpy import array as a
    assert approx_jaccard_score(a([0, 0, 0, 0]), a([0, 0, 0, 0])) == 1
    assert approx_jaccard_score(a([0, 0, 0, 0]), a([1, 0, 0, 0])) == 3 / 4
    assert approx_jaccard_score(a([0, 0, 0, 0]), a([1, 0, 1, 0])) == 2 / 4
    assert approx_jaccard_score(a([0, 0, 0, 0]), a([1, 1, 1, 0])) == 1 / 4
    assert approx_jaccard_score(a([0, 0, 0, 0]), a([1, 1, 1, 1])) == 0
    assert approx_jaccard_score(a([0, 0, 0]), a([0, 0, 0])) == 1
    assert approx_jaccard_score(a([0, 0, 0]), a([1, 0, 0])) == 2 / 3
    assert approx_jaccard_score(a([0, 0, 0]), a([1, 0, 1])) == 1 / 3
    assert approx_jaccard_score(a([0, 0, 0]), a([1, 1, 1])) == 0
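
From these assertions, approx_jaccard_score is the fraction of positions at which two equal-length signatures agree. A minimal numpy sketch consistent with the tests (the third argument seen in the later examples is assumed here to be the axis handed to numpy, so one signature can be scored against a whole matrix of stored signatures at once):

import numpy as np

def approx_jaccard_score(a, b, axis=None):
    # fraction of positions where the signatures agree; with a 2-D `b`
    # and axis=1 this scores `a` against every row of `b` in one call
    return np.mean(a == b, axis=axis)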
Example 2
    def similarByContent(self, content):
        shingles = list(minhash.generate_shingles(content.split(" ")))
        sig = minhash.calculate_signature(shingles, self.hash_funcs)
        scores = minhash.approx_jaccard_score(sig, self.store.sigs, 1)
        hits = scores > .42  # TODO: find appropriate threshold

        return [{
            "id": id,
            "score": score
        } for id, score in zip(self.store.ids[hits], (
            scores[hits] * 100).astype(int).tolist())]
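
similarByContent splits the text into tokens, shingles them, hashes the shingles into a minhash signature, and scores that signature against every stored signature in one call; ids scoring above the 0.42 threshold come back with their scores as integer percentages. nutai's generate_shingles is not shown in this excerpt; a typical token-shingling sketch (the window size and tuple output are assumptions) would be:

def generate_shingles(tokens, size=3):
    # yield each run of `size` consecutive tokens as one shingle
    for i in range(len(tokens) - size + 1):
        yield tuple(tokens[i:i + size])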
Example 3
    def similarById(self, id):
        if id not in self.store:
            return 'Not Found', 404

        sig = self.store.sigs[self.store.ids == id][0]
        scores = minhash.approx_jaccard_score(sig, self.store.sigs, 1)
        hits = scores > .42  # TODO: find appropriate threshold

        return [{
            "id": id,
            "score": score
        } for id, score in zip(self.store.ids[hits], (
            scores[hits] * 100).astype(int).tolist())]
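
Note that when the requested id is in the store, its own signature scores a perfect 1.0, so the query document also appears in its result list with a score of 100; a caller that only wants other documents would have to filter that entry out.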
Example 4
File: cli.py Project: mattf/nutai
def test_minhash(documents, stopwords, labeled):
    with open(stopwords, 'rb') as fp:
        stops = set(msgpack.load(fp))
    filter_processor = simple_preprocess_and_filter_stopwords(stops)
    docs = load_docs(documents,
                     process_text=lambda doc: filter_processor(
                         combine_issue_and_body_filter_labels(doc)))
    labels = load_testset(labeled, docs)
    model = nutai.minhash.Model()

    true = [label for _, _, label in labels]
    pred = [
        approx_jaccard_score(model.calculate_signature(docs[id0]['text']),
                             model.calculate_signature(docs[id1]['text']))
        for id0, id1, _ in labels
    ]
    best_thresh = calculate_best_threshold(pred, labels)

    print("best threshold:", best_thresh)
    print_confusion_matrix(
        confusion_matrix(true, [p > best_thresh for p in pred]))
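
calculate_best_threshold is not part of this excerpt. Purely as an illustrative sketch (its real signature and metric are assumptions), a threshold sweep that keeps the cut-off with the best F1 over the labeled pairs could look like:

from sklearn.metrics import f1_score

def calculate_best_threshold(pred, labels):
    # try a grid of cut-offs and keep the one with the highest F1
    # against the labeled pairs (the metric choice is an assumption)
    true = [label for _, _, label in labels]
    candidates = [t / 100 for t in range(1, 100)]
    return max(candidates, key=lambda t: f1_score(true, [p > t for p in pred]))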
Example 5
def __main__():
    sig_len = 42

    seed = os.getenv("SEED")
    random.seed(seed)
    print("using seed:", seed)
    print("signature length:", sig_len)

    if os.path.isfile("signatures.json"):
        print("loading known signatures")
        with open("signatures.json", "r") as fp:
            data = json.load(fp)
            ids = []
            sigs = np.empty(
                (len(data), sig_len))  # TODO: sig_len may be different
            for i, doc in enumerate(data):
                ids.append(doc['id'])
                sigs[i] = doc['sig']
    else:
        with open("docs.json") as fp:
            docs = json.load(fp)

        ids = [doc['id'] for doc in docs]
        print(len(ids), ":", " ".join(map(str, ids[1:5])), "...",
              " ".join(map(str, ids[-4:])))

        hash_funcs = list(generate_hash_funcs(sig_len))

        with Timer("signature time"):
            sigs = np.empty((len(docs), sig_len))
            for i, doc in enumerate(docs):
                shingles = list(
                    generate_shingles(
                        simple_preprocess(doc['text'], min_len=0,
                                          max_len=4242)))
                sigs[i] = calculate_signature(shingles, hash_funcs)

        with open("signatures.json", 'w') as fp:
            json.dump([{
                "id": id,
                "sig": sig.astype(int).tolist()
            } for id, sig in zip(ids, sigs)], fp)

    for sig in sigs[:4]:
        print("[", " ".join(map(str, sig[:4])), "...",
              " ".join(map(str, sig[-4:])), "]")
    print("...")
    for sig in sigs[-4:]:
        print("[", " ".join(map(str, sig[:4])), "...",
              " ".join(map(str, sig[-4:])), "]")

    # this builds the upper triangle of the pairwise score matrix as a
    # ragged list: entries on and below the main diagonal are not stored,
    # so the score for the pair (x, y) with x < y lives at scores[x][y - x - 1]
    with Timer("score time"):
        scores = [
            approx_jaccard_score(a, sigs[i + 1:], 1)
            for i, a in enumerate(sigs)
        ]

    with Timer("bin time"):
        # np.histogram's last bin includes its right edge, so an extra
        # edge above 1.0 gives scores of exactly 1.0 their own bucket
        bins = (0, .1, .2, .3, .4, .5, .6, .7, .8, .9, 1, 42)
        hist = {
            0: 0,
            .1: 0,
            .2: 0,
            .3: 0,
            .4: 0,
            .5: 0,
            .6: 0,
            .7: 0,
            .8: 0,
            .9: 0,
            1: 0
        }
        for row in scores:
            counts, _ = np.histogram((row * 10).astype(int) / 10, bins)
            for i, c in enumerate(counts):
                hist[bins[i]] += c
    print(hist)

    with open("discovered_dups", "w") as fp:
        threshold = .42
        for i in range(len(scores)):
            for j in range(i + 1, len(scores)):
                if threshold < scores[i][j - i - 1] < 1:
                    print(ids[i], ids[j], scores[i][j - i - 1], file=fp)
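
The triangular layout described in the comments above stores, for row i, only the comparisons of signature i against signatures i+1 onward, so the pair (x, y) with x < y lives at scores[x][y - x - 1]. A small hypothetical helper that makes the read-back explicit:

def pair_score(scores, x, y):
    # row x only stores comparisons against signatures x+1, x+2, ...
    if x > y:
        x, y = y, x
    return scores[x][y - x - 1]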
Example 6
    def calculate_similarity(self, signature):
        return minhash.approx_jaccard_score(signature, self.store.sigs, 1)
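
As in Examples 2 and 3, the trailing 1 lets a single signature be scored against the whole 2-D self.store.sigs matrix in one broadcast call. Assuming the sketch of approx_jaccard_score from Example 1, a made-up usage example:

import numpy as np

sigs = np.array([[1, 2, 3, 4],   # hypothetical stored signatures, one per row
                 [1, 0, 0, 0],
                 [9, 9, 9, 9]])
query = np.array([1, 2, 3, 0])
print(approx_jaccard_score(query, sigs, 1))  # [0.75, 0.5, 0.0]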