Beispiel #1
0
def generate_best_random_pairing(num_rounds=10, cutoff=None):
    """Sample random candidate pairings and persist the best-scoring one.

    Calls ``generate_random_pairing()`` up to ``num_rounds`` times. When
    ``cutoff`` is given, the first candidate whose score reaches it is
    accepted immediately and the search stops. Otherwise the candidate with
    the highest score seen is kept (a score must exceed 0 to be considered).

    The chosen pair is stored through ``Pairing.update`` and the database
    session is committed.

    :param num_rounds: maximum number of random candidates to sample.
    :param cutoff: optional score at or above which a candidate is accepted
        without sampling further rounds.
    :return: whatever ``Pairing.update`` returns for the chosen pair, or
        ``None`` when no round produced a usable candidate.
    """
    best_pair = None
    best_score = 0
    for _ in range(num_rounds):
        candidate = generate_random_pairing()
        if candidate is None:
            # This round found nothing. Keep sampling rather than discarding
            # a good candidate that an earlier round may already have found.
            continue
        left_id, right_id, score = candidate
        if score is None:
            # An unscored candidate can neither reach the cutoff nor beat
            # the current best, so skip it instead of comparing against None.
            continue
        if cutoff is not None and score >= cutoff:
            # Good enough: accept right away and stop sampling.
            best_pair = (left_id, right_id)
            best_score = score
            break
        if score > best_score:
            best_score = score
            best_pair = (left_id, right_id)

    log.info('Generated best match, %r -> score %s', best_pair, best_score)
    pairing = None
    if best_pair is not None:
        pairing = Pairing.update({
            'left_id': best_pair[0],
            'right_id': best_pair[1]
        }, None, score=best_score)
    db.session.commit()
    return pairing
Beispiel #2
0
def store():
    """Save a pairing built from the request data and return it as JSON.

    The caller must pass the ``authz.system_edit()`` check. The pairing is
    created or updated on behalf of ``current_user``, applied, and committed;
    afterwards ``generate_pairings.delay()`` is triggered (presumably a
    background task that produces further candidates -- confirm).
    """
    authz.require(authz.system_edit())

    payload = request_data()
    saved = Pairing.update(payload, current_user)
    saved.apply()
    db.session.commit()

    generate_pairings.delay()
    return jsonify(saved)
Beispiel #3
0
def view(id):
    """Return one pairing together with its left and right entities as JSON.

    The caller must pass the ``authz.system_edit()`` check. The pairing
    lookup goes through ``obj_or_404`` (which, judging by its name, answers
    with a 404 when the pairing does not exist -- confirm).

    :param id: identifier of the pairing to show.
    """
    authz.require(authz.system_edit())

    record = obj_or_404(Pairing.by_id(id))
    left_entity = EntityQuery.by_id(record.left_id)
    right_entity = EntityQuery.by_id(record.right_id)

    response = {
        "status": "ok",
        "left": left_entity,
        "right": right_entity,
        "pairing": record,
    }
    return jsonify(response)
Beispiel #4
0
def dedupe_generate_pairings(threshold=100):
    """Train a dedupe model on existing pairings and store new candidates.

    Runs only when the count of ``query_pairings(True)`` is at least
    ``threshold`` and an exact multiple of it, and only if the
    ``LOCK_DEDUPE`` lock can be acquired. Each match reported by the deduper
    is reduced to its two highest-scoring members; their mean score is
    scaled to 0-100 and pairs scoring 50 or more that are not already
    linked according to ``same_as.match`` are stored via ``Pairing.update``.

    :param threshold: pairing count that must be reached (and evenly
        divided) before a run is triggered.
    :return: ``None``.
    """
    # do this only on full moon.
    num = query_pairings(True).count()
    log.info('Triggered dedupe, with %s pairings of training data', num)
    if num < threshold or num % threshold != 0:
        return
    # Random delay before taking the lock (presumably to stagger workers
    # that were triggered at the same moment -- confirm).
    time.sleep(random.uniform(0, 4))

    # Acquire outside the try block: if acquisition fails, the finally
    # clause must not release a lock that this call never held.
    if not Lock.acquire(LOCK_DEDUPE):
        return

    try:
        log.info("Dedupe to generate pairings candidates")
        fields = make_fields()
        data = make_data(fields)
        pairs = make_pairs(data)

        deduper = dedupe.Dedupe(fields)
        deduper.sample(data)
        deduper.markPairs(pairs)
        deduper.train()

        matches = []
        for match in deduper.match(data):
            # match[0] and match[1] are zipped into (id, score) pairs;
            # keep the two members with the highest scores.
            top_two = sorted(zip(match[0], match[1]),
                             key=lambda pair: pair[1], reverse=True)[:2]
            if len(top_two) < 2:
                # A pairing needs two members; nothing to store otherwise.
                continue
            (e1, s1), (e2, s2) = top_two
            score = ((s1 + s2) / 2.0) * 100.0
            matches.append((e1, e2, score))

        # Store the strongest candidates first.
        matches.sort(key=lambda item: item[2], reverse=True)
        for (left_id, right_id, score) in matches:
            if score < 50:
                continue
            if not same_as.match(left_id, right_id):
                Pairing.update({'left_id': left_id, 'right_id': right_id},
                               None, score=score)
                db.session.commit()
    finally:
        Lock.release(LOCK_DEDUPE)
Beispiel #5
0
def request_pairing(num_rounds=10, cutoff=95, exclude=None):
    """Return the next pairing to decide on, generating one if none exists.

    Looks for the highest-scoring pairing with ``decided=False``, optionally
    skipping the ids in ``exclude``. When no such pairing exists, one is
    produced with ``generate_best_random_pairing`` and
    ``generate_pairings.delay()`` is triggered.

    :param num_rounds: forwarded to ``generate_best_random_pairing``.
    :param cutoff: forwarded to ``generate_best_random_pairing``.
    :param exclude: optional collection of pairing ids to leave out.
    :return: a pairing, or ``None`` if none exists and none could be generated.
    """
    undecided = Pairing.all().filter_by(decided=False)
    if exclude is not None:
        undecided = undecided.filter(~Pairing.id.in_(exclude))

    candidate = undecided.order_by(Pairing.score.desc()).first()
    if candidate is None:
        # Nothing queued up: create a pairing on the spot and ask for more.
        candidate = generate_best_random_pairing(num_rounds=num_rounds,
                                                 cutoff=cutoff)
        generate_pairings.delay()
    return candidate
Beispiel #6
0
def generate_random_pairing():
    """Pick a random entity and find one similarly labelled counterpart.

    Queries for a randomly sorted entity with ``same_as`` forbidden, then
    searches for entities whose label matches it (``label%=``), excluding
    the ids from ``same_as.expand`` and ``Pairing.existing`` for that entity.

    :return: a ``(candidate_id, entity_id, score)`` tuple for the first
        result, or ``None`` when no entity or no candidate is found.
    """
    query = {
        'label': None,
        'sort': 'random',
        'same_as': {'optional': 'forbidden'}
    }
    ent = execute_query(query).get('result')
    if ent is None:
        # No entity to start from; callers treat None as "no pairing".
        return None
    ent_id = ent.get('id')

    # Ids that must not be proposed as a counterpart for this entity.
    avoid = same_as.expand(ent_id)
    avoid.update(Pairing.existing(ent_id))
    q = {
        'id|!=': list(avoid),
        'label%=': ent.get('label')
    }
    # A missing result is treated as "no candidates" rather than an error.
    candidates = execute_query([q]).get('result') or []
    res = next(iter(candidates), None)
    if res is None:
        return None
    return (res.get('id'), ent_id, res.get('score'))