コード例 #1
0
ファイル: deduper.py プロジェクト: mychapati/nomenklatura
def generate_pairings(threshold=100):
    """Queue a dedupe run and top up the pool of pairings.

    Sleeps for a random 0-4 seconds, queues ``dedupe_generate_pairings``
    through its ``.delay()`` hook and then, if ``LOCK_GENERATE`` can be
    acquired, keeps calling ``generate_best_random_pairing()`` for as long
    as ``query_pairings(False).count()`` is at most ``KEEP_SIZE``.

    If the lock cannot be acquired the function returns without generating
    anything and without touching the lock.

    :param threshold: forwarded unchanged to ``dedupe_generate_pairings``.
    :returns: ``None``.
    """
    # NOTE(review): presumably random jitter so concurrent workers do not
    # all contend for the lock at the same instant -- confirm.
    time.sleep(random.uniform(0, 4))
    dedupe_generate_pairings.delay(threshold=threshold)

    # Acquire *before* entering the try block: the ``finally`` clause must
    # only release a lock this call actually holds. Acquiring inside the
    # ``try`` would release the lock on the early-return path as well,
    # i.e. a lock that some other caller may currently own.
    if not Lock.acquire(LOCK_GENERATE):
        return

    try:
        while query_pairings(False).count() <= KEEP_SIZE:
            generate_best_random_pairing()
    finally:
        Lock.release(LOCK_GENERATE)
コード例 #2
0
ファイル: deduper.py プロジェクト: mychapati/nomenklatura
def dedupe_generate_pairings(threshold=100):
    """Train a deduper on the existing pairings and store new candidates.

    The run is skipped unless ``query_pairings(True).count()`` is a
    non-zero multiple of ``threshold`` (and at least ``threshold``), and
    unless ``LOCK_DEDUPE`` can be acquired. Otherwise a ``dedupe.Dedupe``
    instance is trained and every match it reports is reduced to its two
    highest-scoring members. Pairs scoring at least 50 that
    ``same_as.match`` does not already report are written with
    ``Pairing.update`` and committed one at a time.

    :param threshold: number of training pairings between two runs.
    :returns: ``None``.
    """
    num = query_pairings(True).count()
    log.info('Triggered dedupe, with %s pairings of training data', num)
    # Only run each time the amount of training data reaches another full
    # multiple of ``threshold``; every other invocation is a no-op.
    if num < threshold or num % threshold != 0:
        return
    # NOTE(review): presumably random jitter to stagger concurrent
    # workers -- confirm.
    time.sleep(random.uniform(0, 4))

    # Acquire *before* entering the try block: the ``finally`` clause must
    # only release a lock this call actually holds. Acquiring inside the
    # ``try`` would release the lock on the early-return path as well,
    # i.e. a lock that some other caller may currently own.
    if not Lock.acquire(LOCK_DEDUPE):
        return

    try:
        log.info("Dedupe to generate pairings candidates")
        fields = make_fields()
        data = make_data(fields)
        pairs = make_pairs(data)

        deduper = dedupe.Dedupe(fields)
        deduper.sample(data)
        deduper.markPairs(pairs)
        deduper.train()

        matches = []
        for match in deduper.match(data):
            # assumes match[0] holds entity ids and match[1] the parallel
            # scores -- TODO confirm against the dedupe version in use.
            # Index-based key functions replace the Python 2-only
            # tuple-parameter lambdas (removed by PEP 3113).
            scored = sorted(zip(match[0], match[1]),
                            key=lambda pair: pair[1], reverse=True)
            # Keep the two best-scoring members; the unpacking below
            # raises ValueError if fewer than two are present.
            (e1, s1), (e2, s2) = scored[:2]
            # Mean of the two member scores, scaled by 100.
            score = ((s1 + s2) / 2.0) * 100.0
            matches.append((e1, e2, score))

        # Process the strongest candidates first.
        matches = sorted(matches, key=lambda item: item[2], reverse=True)
        for (left_id, right_id, score) in matches:
            if score < 50:
                continue
            if not same_as.match(left_id, right_id):
                Pairing.update({'left_id': left_id, 'right_id': right_id},
                               None, score=score)
                db.session.commit()
    finally:
        Lock.release(LOCK_DEDUPE)