def generate_pairings(threshold=100):
    """Top up the pool of pairings while holding ``LOCK_GENERATE``.

    Steps, in order:

    1. Sleep for a random 0-4 seconds.
    2. Enqueue ``dedupe_generate_pairings`` via ``.delay`` with the same
       ``threshold``.
    3. Try to acquire ``LOCK_GENERATE``; if that fails, return without
       generating anything.
    4. Call ``generate_best_random_pairing()`` repeatedly for as long as
       ``query_pairings(False).count()`` is at most ``KEEP_SIZE``.

    :param threshold: forwarded unchanged to ``dedupe_generate_pairings``;
        it is not otherwise used here.
    :returns: ``None``.
    """
    # NOTE(review): the random delay presumably staggers workers that are
    # triggered at the same moment -- confirm against the task scheduler.
    time.sleep(random.uniform(0, 4))
    dedupe_generate_pairings.delay(threshold=threshold)

    # Acquire *outside* the try block: if the lock is not obtained we must
    # not reach the ``finally`` below, otherwise we would release a lock
    # that this caller never held.
    if not Lock.acquire(LOCK_GENERATE):
        return
    try:
        while query_pairings(False).count() <= KEEP_SIZE:
            generate_best_random_pairing()
    finally:
        Lock.release(LOCK_GENERATE)
def dedupe_generate_pairings(threshold=100):
    """Train a ``dedupe`` model and store the candidate pairings it finds.

    The function returns early, doing nothing beyond logging, unless the
    count reported by ``query_pairings(True)`` is at least ``threshold``
    and an exact multiple of it. It also returns early if ``LOCK_DEDUPE``
    cannot be acquired.

    When it does run, it builds fields, data and training pairs with
    ``make_fields`` / ``make_data`` / ``make_pairs``, trains a
    ``dedupe.Dedupe`` instance, and turns every match into an
    ``(entity, entity, score)`` triple. The score is the mean of the two
    highest per-record scores in the match, scaled to 0-100. Triples
    scoring at least 50 that ``same_as.match`` does not already report are
    written with ``Pairing.update`` and committed.

    :param threshold: batch size that gates the run (see above).
    :returns: ``None``.
    """
    # do this only on full moon.
    num = query_pairings(True).count()
    log.info('Triggered dedupe, with %s pairings of training data', num)
    if num < threshold or num % threshold != 0:
        return

    # NOTE(review): the random delay presumably staggers workers that are
    # triggered at the same moment -- confirm against the task scheduler.
    time.sleep(random.uniform(0, 4))

    # Acquire *outside* the try block: if the lock is not obtained we must
    # not reach the ``finally`` below, otherwise we would release a lock
    # that this caller never held.
    if not Lock.acquire(LOCK_DEDUPE):
        return
    try:
        log.info("Dedupe to generate pairings candidates")
        fields = make_fields()
        data = make_data(fields)
        pairs = make_pairs(data)

        deduper = dedupe.Dedupe(fields)
        deduper.sample(data)
        deduper.markPairs(pairs)
        deduper.train()

        matches = []
        for match in deduper.match(data):
            # match[0] and match[1] are zipped into (entity, score) pairs;
            # order them best score first. An index-based key is used
            # instead of a tuple-unpacking lambda so the code parses on
            # both Python 2 and Python 3.
            scored = sorted(zip(match[0], match[1]),
                            key=lambda item: item[1],
                            reverse=True)
            # Only the two best-scoring records are kept. Unpacking raises
            # ValueError if a match has fewer than two records.
            # NOTE(review): presumably dedupe never yields such a match --
            # confirm against the dedupe version in use.
            (e1, s1), (e2, s2) = scored[:2]
            score = ((s1 + s2) / 2.0) * 100.0
            matches.append((e1, e2, score))

        # Highest-scoring candidates are processed first.
        matches = sorted(matches, key=lambda triple: triple[2], reverse=True)
        for (left_id, right_id, score) in matches:
            if score < 50:
                continue
            if not same_as.match(left_id, right_id):
                Pairing.update({'left_id': left_id, 'right_id': right_id},
                               None, score=score)
        # NOTE(review): the original indentation was lost, so it is unclear
        # whether this commit ran once or per row -- confirm.
        db.session.commit()
    finally:
        Lock.release(LOCK_DEDUPE)