def generate_best_random_pairing(num_rounds=10, cutoff=None):
    """Sample random pairings and store the highest-scoring candidate.

    Up to ``num_rounds`` candidates are drawn from
    ``generate_random_pairing()``. Sampling stops early as soon as a
    candidate scores at least ``cutoff`` (when a cutoff is given), or when
    the generator returns ``None``.

    :param num_rounds: maximum number of random candidates to draw.
    :param cutoff: optional score at or above which a candidate is accepted
        immediately without drawing further candidates.
    :returns: the value returned by ``Pairing.update`` for the selected
        pair, or ``None`` when no candidate with a usable score was found.
    """
    best_pair = None
    best_score = 0
    for _ in range(num_rounds):
        candidate = generate_random_pairing()
        if candidate is None:
            if best_pair is None:
                # Nothing found so far and nothing more to draw.
                return None
            # Keep the best candidate from earlier rounds instead of
            # silently discarding it.
            break
        left_id, right_id, score = candidate
        if cutoff is not None and score >= cutoff:
            # Good enough: accept this candidate and stop sampling.
            best_pair = (left_id, right_id)
            best_score = score
            break
        # Strictly greater than the initial 0, so zero-scored candidates
        # are only ever accepted through the cutoff branch above.
        if score > best_score:
            best_score = score
            best_pair = (left_id, right_id)

    log.info('Generated best match, %r -> score %s', best_pair, best_score)
    pairing = None
    if best_pair is not None:
        pairing = Pairing.update({
            'left_id': best_pair[0],
            'right_id': best_pair[1]
        }, None, score=best_score)
    db.session.commit()
    return pairing
def store():
    """Save the submitted pairing, apply it, and queue pairing generation.

    Access is gated by ``authz.require(authz.system_edit())``. The request
    payload is passed to ``Pairing.update`` together with the current user,
    the resulting pairing is applied and the session committed, and
    ``generate_pairings.delay()`` is then invoked. The pairing is returned
    through ``jsonify``.
    """
    authz.require(authz.system_edit())

    payload = request_data()
    record = Pairing.update(payload, current_user)
    record.apply()
    db.session.commit()

    # Triggered only after the commit above has completed.
    generate_pairings.delay()
    return jsonify(record)
def dedupe_generate_pairings(threshold=100):
    """Train a dedupe model and store the candidate pairings it finds.

    The run is skipped unless the number of pairings returned by
    ``query_pairings(True)`` is a non-zero multiple of ``threshold``, and
    unless the ``LOCK_DEDUPE`` lock can be acquired. Each matched cluster
    is reduced to its two highest-scoring members; their averaged score is
    scaled to 0-100, and pairs scoring at least 50 that are not rejected by
    ``same_as.match`` are stored through ``Pairing.update``.

    :param threshold: run only when the training pairing count is at least
        this value and exactly divisible by it.
    :returns: ``None``.
    """
    # do this only on full moon.
    num = query_pairings(True).count()
    log.info('Triggered dedupe, with %s pairings of training data', num)
    if num < threshold or num % threshold != 0:
        return

    # Random delay before competing for the lock.
    # NOTE(review): presumably staggers concurrently triggered runs - confirm.
    time.sleep(random.uniform(0, 4))

    # Acquire outside the try block: if the lock is not obtained we must
    # not reach the `finally` clause, which would release a lock that is
    # held by someone else.
    if not Lock.acquire(LOCK_DEDUPE):
        return
    try:
        log.info("Dedupe to generate pairings candidates")
        fields = make_fields()
        data = make_data(fields)
        pairs = make_pairs(data)
        deduper = dedupe.Dedupe(fields)
        deduper.sample(data)
        deduper.markPairs(pairs)
        deduper.train()

        matches = []
        for match in deduper.match(data):
            # match[0] and match[1] are zipped element-wise and ranked by
            # the second value; only the top two entries are kept.
            scored = sorted(zip(match[0], match[1]),
                            key=lambda pair: pair[1],
                            reverse=True)
            (e1, s1), (e2, s2) = scored[:2]
            score = ((s1 + s2) / 2.0) * 100.0
            matches.append((e1, e2, score))

        # Highest-scoring candidates first.
        matches = sorted(matches, key=lambda item: item[2], reverse=True)
        for (left_id, right_id, score) in matches:
            if score < 50:
                continue
            if not same_as.match(left_id, right_id):
                Pairing.update({'left_id': left_id, 'right_id': right_id},
                               None, score=score)
        db.session.commit()
    finally:
        Lock.release(LOCK_DEDUPE)