def _index_predicates(self, candidates):
    """
    Build the blocker's indices and freeze its index predicates.

    ``candidates`` is an iterable of record pairs. The first and second
    members of the pairs are de-duplicated separately; every index field
    is indexed on the values found in the second-member records only,
    and each index predicate is then frozen against both collections.
    """
    blocker = self.block_learner.blocker

    # Split the pairs into their two sides before de-duplicating each.
    first_members, second_members = zip(*candidates)
    left_records = core.unique(first_members)
    right_records = core.unique(second_members)

    for field_name in blocker.index_fields:
        field_values = set()
        for record in right_records:
            field_values.add(record[field_name])
        blocker.index(field_values, field_name)

    for predicate in blocker.index_predicates:
        predicate.freeze(left_records, right_records)
def _index_predicates(self, candidates):
    """
    Build the fingerprinter's indices and freeze its index predicates.

    Every record appearing in any candidate pair is collected once; each
    index field is indexed on the distinct values those records hold,
    and each index predicate is frozen against the record collection.
    """
    fingerprinter = self.fingerprinter

    # Flatten the pairs into a single stream of records, then de-duplicate.
    flattened = (record for pair in candidates for record in pair)
    records = core.unique(flattened)

    for field_name in fingerprinter.index_fields:
        distinct_values = set(record[field_name] for record in records)
        fingerprinter.index(distinct_values, field_name)

    for predicate in fingerprinter.index_predicates:
        predicate.freeze(records)
def _index_predicates(self, candidates):
    """
    Build the blocker's indices and freeze its index predicates.

    The records of all candidate pairs are pooled and de-duplicated.
    Each of the blocker's index fields is indexed on the set of values
    found in that pool, after which every index predicate is frozen
    against the pooled records.
    """
    blocker = self.block_learner.blocker
    pooled = core.unique(
        member for candidate in candidates for member in candidate
    )

    for name in blocker.index_fields:
        values = set()
        values.update(member[name] for member in pooled)
        blocker.index(values, name)

    for index_predicate in blocker.index_predicates:
        index_predicate.freeze(pooled)
def _index_predicates(self, candidates):
    """
    Build the blocker's indices and freeze its index predicates.

    Logs the concrete blocker type at debug level, pools and
    de-duplicates the records of all candidate pairs, indexes each index
    field on the values present in that pool, and finally freezes every
    index predicate against the pooled records.
    """
    blocker = self.block_learner.blocker
    logger.debug(
        f"labeler.DedupeBlockLearner blocker: {type(self.block_learner.blocker)}"
    )

    every_record = (record for pair in candidates for record in pair)
    records = core.unique(every_record)

    for field_name in blocker.index_fields:
        seen_values = {item[field_name] for item in records}
        blocker.index(seen_values, field_name)

    for predicate in blocker.index_predicates:
        predicate.freeze(records)
def consoleLabel(deduper):  # pragma: no cover
    '''
    Command line interface that shows candidate record pairs to the
    user and records whether they refer to the same thing.

    Pairs come from ``deduper.uncertainPairs()``. Each pair is printed
    to stderr field by field and an answer is read from stdin. Answers
    are held in a short undo buffer so the most recent one can be
    revisited with "p"; once an answer leaves the buffer (or labeling
    ends) it is passed to ``deduper.markPairs`` if it was a definite
    match / distinct judgement.

    Argument :
    A deduper object
    '''

    def submit(pair, verdict):
        # Only definite answers become training data; "uncertain" is dropped.
        if verdict in ('distinct', 'match'):
            batch = {'distinct': [], 'match': []}
            batch[verdict].append(pair)
            deduper.markPairs(batch)

    answer_labels = {'y': 'match', 'n': 'distinct', 'u': 'uncertain'}
    field_names = unique(field.field
                         for field in deduper.data_model.primary_fields)

    undo_depth = 1  # Max number of previous operations
    answered = []   # (record_pair, label) tuples, newest first
    queue = []      # pairs waiting to be shown
    done = False
    go_back = False

    while not done:
        if go_back:
            # Re-present the most recent answer instead of a new pair.
            go_back = False
            record_pair = answered.pop(0)[0]
        else:
            if not queue:
                queue = deduper.uncertainPairs()
            try:
                record_pair = queue.pop()
            except IndexError:
                # Nothing left to label.
                break

        # Count answers still in the undo buffer together with those
        # already handed to the deduper.
        buffered_labels = [label for _, label in answered]
        n_match = (len(deduper.training_pairs['match'])
                   + buffered_labels.count('match'))
        n_distinct = (len(deduper.training_pairs['distinct'])
                      + buffered_labels.count('distinct'))

        for record in record_pair:
            for name in field_names:
                print("%s : %s" % (name, record[name]), file=sys.stderr)
            print(file=sys.stderr)

        print("{0}/10 positive, {1}/10 negative".format(n_match, n_distinct),
              file=sys.stderr)
        print('Do these records refer to the same thing?', file=sys.stderr)

        # "previous" is only offered when there is an answer to go back to.
        if answered:
            prompt = '(y)es / (n)o / (u)nsure / (f)inished / (p)revious'
            accepted = {'y', 'n', 'u', 'f', 'p'}
        else:
            prompt = '(y)es / (n)o / (u)nsure / (f)inished'
            accepted = {'y', 'n', 'u', 'f'}

        user_input = ''
        while user_input not in accepted:
            print(prompt, file=sys.stderr)
            user_input = input()

        if user_input in answer_labels:
            answered.insert(0, (record_pair, answer_labels[user_input]))
        elif user_input == 'f':
            print('Finished labeling', file=sys.stderr)
            done = True
        elif user_input == 'p':
            go_back = True
            # Put the current pair back so it is shown again afterwards.
            queue.append(record_pair)

        if len(answered) > undo_depth:
            # The oldest answer can no longer be undone: commit it.
            submit(*answered.pop())

    # Commit whatever is still sitting in the undo buffer.
    for buffered_pair, verdict in answered:
        submit(buffered_pair, verdict)
def console_label(deduper: dedupe.api.ActiveMatching) -> None:  # pragma: no cover
    '''
    Train a matcher instance (Dedupe, RecordLink, or Gazetteer) from the command line.

    Candidate pairs are taken from ``deduper.uncertain_pairs()``, printed
    to stderr field by field, and an answer is read from stdin. Answers
    are kept in a one-entry undo buffer so the latest one can be
    revisited with "p". When an answer leaves the buffer, or labeling
    ends, it is passed to ``deduper.mark_pairs`` if it was a definite
    "match" / "distinct" judgement; "uncertain" answers are discarded.

    Example

    .. code:: python

       > deduper = dedupe.Dedupe(variables)
       > deduper.prepare_training(data)
       > dedupe.console_label(deduper)
    '''

    def _commit(pair, label):
        # Single place where buffered answers are handed to the matcher,
        # so the in-loop flush and the final flush cannot drift apart.
        if label in {'distinct', 'match'}:
            examples: TrainingData = {'distinct': [], 'match': []}
            examples[label].append(pair)  # type: ignore
            deduper.mark_pairs(examples)

    finished = False
    use_previous = False
    fields = unique(field.field
                    for field in deduper.data_model.primary_fields)

    buffer_len = 1  # Max number of previous operations
    examples_buffer: List[Tuple[TrainingExample,
                                Literal['match', 'distinct', 'uncertain']]] = []
    uncertain_pairs: List[TrainingExample] = []

    while not finished:
        if use_previous:
            # Re-present the most recent answer instead of a new pair.
            record_pair, _ = examples_buffer.pop(0)
            use_previous = False
        else:
            try:
                if not uncertain_pairs:
                    uncertain_pairs = deduper.uncertain_pairs()
                record_pair = uncertain_pairs.pop()
            except IndexError:
                # No more pairs to label.
                break

        # Tallies include answers still waiting in the undo buffer.
        n_match = (len(deduper.training_pairs['match']) +
                   sum(label == 'match' for _, label in examples_buffer))
        n_distinct = (len(deduper.training_pairs['distinct']) +
                      sum(label == 'distinct' for _, label in examples_buffer))

        for pair in record_pair:
            for field in fields:
                line = "%s : %s" % (field, pair[field])
                print(line, file=sys.stderr)
            print(file=sys.stderr)

        print("{0}/10 positive, {1}/10 negative".format(n_match, n_distinct),
              file=sys.stderr)
        print('Do these records refer to the same thing?', file=sys.stderr)

        valid_response = False
        user_input = ''
        while not valid_response:
            # "previous" is only offered when there is an answer to undo.
            if examples_buffer:
                prompt = '(y)es / (n)o / (u)nsure / (f)inished / (p)revious'
                valid_responses = {'y', 'n', 'u', 'f', 'p'}
            else:
                prompt = '(y)es / (n)o / (u)nsure / (f)inished'
                valid_responses = {'y', 'n', 'u', 'f'}

            print(prompt, file=sys.stderr)
            user_input = input()
            if user_input in valid_responses:
                valid_response = True

        if user_input == 'y':
            examples_buffer.insert(0, (record_pair, 'match'))
        elif user_input == 'n':
            examples_buffer.insert(0, (record_pair, 'distinct'))
        elif user_input == 'u':
            examples_buffer.insert(0, (record_pair, 'uncertain'))
        elif user_input == 'f':
            print('Finished labeling', file=sys.stderr)
            finished = True
        elif user_input == 'p':
            use_previous = True
            # Put the current pair back so it is shown again afterwards.
            uncertain_pairs.append(record_pair)

        if len(examples_buffer) > buffer_len:
            # The oldest answer can no longer be undone: commit it.
            _commit(*examples_buffer.pop())

    # Commit whatever is still sitting in the undo buffer.
    for record_pair, label in examples_buffer:
        _commit(record_pair, label)
def console_label(deduper: dedupe.api.ActiveMatching) -> list:  # noqa: C901
    """
    Train a matcher instance (Dedupe, RecordLink, or Gazetteer) from the command line.

    Candidate pairs are taken from ``deduper.uncertain_pairs()``, printed
    to stderr field by field, and an answer is read from stdin. Answers
    are kept in a short undo buffer so the latest one can be revisited
    with "p"; answers leaving the buffer are passed to
    ``deduper.mark_pairs`` when they are "match" or "distinct".

    Once a pair is confirmed as a match, the ``"uri"`` of both records is
    remembered. Later candidates that involve a remembered uri are not
    shown; they are queued as "distinct" automatically.

    Returns the list of ``(uri, uri)`` tuples the user confirmed as
    matches. A confirmation is withdrawn again if the user goes back to
    that answer with "p".

    Example

    .. code:: python

       > deduper = dedupe.Dedupe(variables)
       > deduper.prepare_training(data)
       > dedupe.console_label(deduper)
    """

    def _commit(pair, label):
        # Single place where buffered answers are handed to the matcher.
        if label in {"distinct", "match"}:
            examples: TrainingData = {"distinct": [], "match": []}
            examples[label].append(pair)  # type: ignore
            deduper.mark_pairs(examples)

    confirmed_matches = []
    finished = False
    use_previous = False
    fields = unique(field.field for field in deduper.data_model.primary_fields)

    buffer_len = 1  # Max number of previous operations
    examples_buffer: List[
        Tuple[TrainingExample, Literal["match", "distinct", "uncertain"]]
    ] = []
    uncertain_pairs: List[TrainingExample] = []

    # don't re-use items that are confirmed with a mapping
    mapped_items = set()

    while not finished:
        if use_previous:
            record_pair, previous_label = examples_buffer.pop(0)
            use_previous = False
            if previous_label == "match":
                # The earlier "yes" is being reconsidered, so it must not
                # stay confirmed unless the user answers "yes" again.
                withdrawn = (record_pair[0]["uri"], record_pair[1]["uri"])
                if withdrawn in confirmed_matches:
                    confirmed_matches.remove(withdrawn)
                mapped_items = {
                    uri for pair in confirmed_matches for uri in pair
                }
        else:
            try:
                while True:
                    # Refill inside the loop: otherwise skipping the last
                    # queued pair would end labeling even though the
                    # matcher may still have candidates to offer.
                    if not uncertain_pairs:
                        uncertain_pairs = deduper.uncertain_pairs()
                    record_pair = uncertain_pairs.pop()
                    if any(
                        record["uri"] in mapped_items for record in record_pair
                    ):
                        examples_buffer.insert(0, (record_pair, "distinct"))
                        # TODO: do i need to process these?
                    else:
                        break
            except IndexError:
                # No more pairs to label.
                break

        # Tallies include answers still waiting in the undo buffer.
        n_match = len(deduper.training_pairs["match"]) + sum(
            label == "match" for _, label in examples_buffer
        )
        n_distinct = len(deduper.training_pairs["distinct"]) + sum(
            label == "distinct" for _, label in examples_buffer
        )

        for pair in record_pair:
            for field in fields:
                line = "%s : %s" % (field, pair[field])
                print(line, file=sys.stderr)
            print(file=sys.stderr)

        print(
            "{0}/10 positive, {1}/10 negative".format(n_match, n_distinct),
            file=sys.stderr,
        )
        print(
            Fore.YELLOW
            + "Do these records refer to the same thing?"
            + Style.RESET_ALL,
            file=sys.stderr,
        )

        valid_response = False
        user_input = ""
        while not valid_response:
            # "previous" is only offered when there is an answer to undo.
            if examples_buffer:
                prompt = "(y)es / (n)o / (u)nsure / (f)inished / (p)revious"
                valid_responses = {"y", "n", "u", "f", "p"}
            else:
                prompt = "(y)es / (n)o / (u)nsure / (f)inished"
                valid_responses = {"y", "n", "u", "f"}

            print(Fore.YELLOW + prompt + Style.RESET_ALL, file=sys.stderr)
            user_input = input()
            if user_input in valid_responses:
                valid_response = True

        if user_input == "y":
            examples_buffer.insert(0, (record_pair, "match"))
            mapped_items.add(record_pair[0]["uri"])
            mapped_items.add(record_pair[1]["uri"])
            confirmed_matches.append((record_pair[0]["uri"], record_pair[1]["uri"]))
        elif user_input == "n":
            examples_buffer.insert(0, (record_pair, "distinct"))
        elif user_input == "u":
            examples_buffer.insert(0, (record_pair, "uncertain"))
        elif user_input == "f":
            print(Fore.GREEN + "Finished labeling" + Style.RESET_ALL, file=sys.stderr)
            finished = True
        elif user_input == "p":
            use_previous = True
            # Put the current pair back so it is shown again afterwards.
            uncertain_pairs.append(record_pair)

        if len(examples_buffer) > buffer_len:
            # The oldest buffered answer can no longer be undone: commit it.
            _commit(*examples_buffer.pop())

    # Commit whatever is still sitting in the undo buffer.
    for record_pair, label in examples_buffer:
        _commit(record_pair, label)

    return confirmed_matches
def _merge_features(fields, g1_features, g2_features): while True: if len(g1_features) == len(g2_features) == 1: linked_records = ( (list(g1_features.keys())[0], list(g2_features.keys())[0]), 1.0, ) g1_features = [] g2_features = [] break linker = dedupe.RecordLink(fields) linker.prepare_training(g1_features, g2_features) confirmed_matches = console_label(linker) linker.train() linked_records = linker.join( g1_features, g2_features, 0.0, constraint="one-to-one" ) # remove records from linked_records that are in confirmed_matches for (e1, e2) in confirmed_matches: idx = 0 while idx < len(linked_records): pair = linked_records[idx] for (pair, _) in linked_records: if e1 in pair or e2 in pair: linked_records.pop(idx) break idx += 1 # replace linked record scores with 1.0 if the user explicitly # marked them as equivalent. Then fill in other user-marked pairs # of linked records at the end idx = 0 while idx < len(confirmed_matches): pair = confirmed_matches[idx] for lidx, (lpair, _) in enumerate(linked_records): if pair == lpair: linked_records[lidx] = (pair, 1.0) confirmed_matches.pop(idx) idx -= 1 # cancel out the increment break idx += 1 linked_records.extend([(pair, 1.0) for pair in confirmed_matches]) print( Style.BRIGHT + Fore.YELLOW + "Is this matching correct?" + Style.RESET_ALL ) for (e1, e2), similarity in linked_records: for field in unique(field["field"] for field in fields): g1_val = g1_features[e1][field] g2_val = g2_features[e2][field] print(f"{g1_val:<50} | {g2_val:<50}") print(f"Similarity: {similarity}") print("-" * 20) ans = input(Fore.YELLOW + "[y/n]? " + Style.RESET_ALL) if ans.lower() == "y": print( Fore.GREEN + "All correct! Moving on to any stragglers" + Style.RESET_ALL ) break else: print(Fore.RED + "Re-labeling..." 
+ Style.RESET_ALL) linked_entities = _unpack_linked_records(linked_records) if len(linked_entities) != len(g1_features) or len(linked_entities) != len( g2_features ): leftover_g1 = set(g1_features.keys()).difference(linked_entities) leftover_g2 = set(g2_features.keys()).difference(linked_entities) leftover_g1 = {k: v for (k, v) in g1_features.items() if k in leftover_g1} leftover_g2 = {k: v for (k, v) in g2_features.items() if k in leftover_g2} return linked_records, leftover_g1, leftover_g2