Exemple #1
0
def generate_best_random_pairing(num_rounds=10, cutoff=None):
    """Sample random pairings and store the highest-scoring one.

    Draws up to ``num_rounds`` candidates from ``generate_random_pairing()``.
    A candidate whose score reaches ``cutoff`` (when one is given) is stored
    immediately and ends the search. Otherwise the best candidate with a
    score above zero is stored once sampling has finished.

    :param num_rounds: maximum number of random candidates to draw.
    :param cutoff: optional score at or above which a candidate is accepted
        right away instead of continuing the search.
    :returns: the result of ``Pairing.update`` for the stored pair, or
        ``None`` when no candidate was stored.
    """
    best_pair = None
    # Starting at zero means only candidates with a positive score can win.
    best_score = 0
    pairing = None
    for _ in range(num_rounds):
        candidate = generate_random_pairing()
        if candidate is None:
            if best_pair is None:
                # Nothing collected so far: leave without logging/committing.
                return None
            # No further candidate is available, but earlier rounds already
            # produced one; stop sampling and store it instead of dropping it.
            break
        left_id, right_id, score = candidate
        if cutoff is not None and score >= cutoff:
            # Good enough: store this pair now and skip the remaining rounds.
            pairing = Pairing.update({
                'left_id': left_id,
                'right_id': right_id
            }, None, score=score)
            break
        if score > best_score:
            best_score = score
            best_pair = (left_id, right_id)

    log.info('Generated best match, %r -> score %s', best_pair, best_score)
    if pairing is None and best_pair is not None:
        # The cutoff was never reached; fall back to the best pair seen.
        pairing = Pairing.update({
            'left_id': best_pair[0],
            'right_id': best_pair[1]
        }, None, score=best_score)
    db.session.commit()
    return pairing
def store():
    """Save a pairing built from the request data and return it as JSON.

    Requires the ``authz.system_edit()`` permission, passes the request
    payload and the current user to ``Pairing.update``, applies the result,
    commits the session and then calls ``generate_pairings.delay()``.

    :returns: the ``jsonify`` response for the saved pairing.
    """
    authz.require(authz.system_edit())
    payload = request_data()
    saved = Pairing.update(payload, current_user)
    saved.apply()
    db.session.commit()
    # Triggered only after the commit above has gone through.
    generate_pairings.delay()
    return jsonify(saved)
Exemple #3
0
def dedupe_generate_pairings(threshold=100):
    """Train a dedupe model and store the candidate pairings it proposes.

    Runs only when the count returned by ``query_pairings(True)`` is a
    non-zero multiple of ``threshold``. While holding ``LOCK_DEDUPE`` it
    trains ``dedupe.Dedupe`` on the prepared data, scores each matched
    cluster by the mean of its two highest member scores (scaled to 0-100),
    and stores a ``Pairing`` for every pair scoring at least 50 for which
    ``same_as.match`` returns a falsy value.

    :param threshold: the pairing count must be at least this value and
        divisible by it for the job to run.
    :returns: ``None``.
    """
    # do this only on full moon.
    num = query_pairings(True).count()
    log.info('Triggered dedupe, with %s pairings of training data', num)
    if num < threshold or num % threshold != 0:
        return
    # Random delay of up to four seconds before trying for the lock.
    # NOTE(review): presumably to spread out concurrent workers -- confirm.
    time.sleep(random.uniform(0, 4))

    # Acquire outside the try block: if acquisition fails, the finally clause
    # must not release a lock this call never obtained.
    if not Lock.acquire(LOCK_DEDUPE):
        return

    try:
        log.info("Dedupe to generate pairings candidates")
        fields = make_fields()
        data = make_data(fields)
        pairs = make_pairs(data)

        deduper = dedupe.Dedupe(fields)
        deduper.sample(data)
        deduper.markPairs(pairs)
        deduper.train()

        matches = []
        for match in deduper.match(data):
            # match[0] and match[1] are zipped into (member, score) tuples;
            # keep the two highest-scoring members of the cluster.
            ranked = sorted(zip(match[0], match[1]),
                            key=lambda member: member[1], reverse=True)
            top = ranked[:2]
            if len(top) < 2:
                # A pairing needs two members; skip degenerate clusters.
                continue
            (e1, s1), (e2, s2) = top
            score = ((s1 + s2) / 2.0) * 100.0
            matches.append((e1, e2, score))

        # Store the highest-scoring candidates first.
        matches = sorted(matches, key=lambda item: item[2], reverse=True)
        for (left_id, right_id, score) in matches:
            if score < 50:
                continue
            if not same_as.match(left_id, right_id):
                Pairing.update({'left_id': left_id, 'right_id': right_id},
                               None, score=score)
                db.session.commit()
    finally:
        Lock.release(LOCK_DEDUPE)