コード例 #1
0
    def _index_predicates(self, candidates):
        """Index field values and freeze the blocker's index predicates.

        ``candidates`` is unzipped into its first and second members, so it
        must be a non-empty iterable of two-element pairs. Each side is
        passed through ``core.unique``. Field values are collected from the
        second side only; every index predicate is then frozen against both
        sides.
        """
        blocker = self.block_learner.blocker

        # Split the pairs into their two sides before reducing each side.
        first_side, second_side = zip(*candidates)
        first_side = core.unique(first_side)
        second_side = core.unique(second_side)

        for field_name in blocker.index_fields:
            field_values = {rec[field_name] for rec in second_side}
            blocker.index(field_values, field_name)

        for predicate in blocker.index_predicates:
            predicate.freeze(first_side, second_side)
コード例 #2
0
ファイル: learner.py プロジェクト: ahill187/dedupe
    def _index_predicates(self, candidates):
        """Index field values and freeze the fingerprinter's index predicates.

        Every member of every pair in ``candidates`` is flattened into one
        stream and passed through ``core.unique``. The resulting records
        feed both the per-field indexes and each predicate's ``freeze``.
        """
        fp = self.fingerprinter

        flattened = (member for candidate in candidates for member in candidate)
        distinct_records = core.unique(flattened)

        for index_field in fp.index_fields:
            fp.index({rec[index_field] for rec in distinct_records}, index_field)

        for index_predicate in fp.index_predicates:
            index_predicate.freeze(distinct_records)
コード例 #3
0
    def _index_predicates(self, candidates):
        """Build the blocker's field indexes and freeze its index predicates.

        All records appearing in any candidate pair are gathered through
        ``core.unique``; the same collection is used to index each field in
        ``blocker.index_fields`` and to freeze each predicate in
        ``blocker.index_predicates``.
        """
        blocker = self.block_learner.blocker

        all_members = (item for duo in candidates for item in duo)
        pool = core.unique(all_members)

        for name in blocker.index_fields:
            values_for_field = {item[name] for item in pool}
            blocker.index(values_for_field, name)

        for frozen_pred in blocker.index_predicates:
            frozen_pred.freeze(pool)
コード例 #4
0
    def _index_predicates(self, candidates):
        """Log the blocker type, then index fields and freeze predicates.

        The records of all candidate pairs are flattened and passed through
        ``core.unique``. Their field values are handed to ``blocker.index``
        for each index field, and each index predicate is frozen on the
        same records.
        """
        blocker = self.block_learner.blocker
        logger.debug(
            f"labeler.DedupeBlockLearner blocker: {type(self.block_learner.blocker)}"
        )

        paired_records = (entry for couple in candidates for entry in couple)
        seen_records = core.unique(paired_records)

        for column in blocker.index_fields:
            column_values = {entry[column] for entry in seen_records}
            blocker.index(column_values, column)

        for idx_predicate in blocker.index_predicates:
            idx_predicate.freeze(seen_records)
コード例 #5
0
ファイル: convenience.py プロジェクト: jamal2300/dedupe-1
def consoleLabel(deduper):  # pragma: no cover
    '''
    Interactively label training pairs from the command line.

    Pairs come from ``deduper.uncertainPairs()``. Each pair is printed to
    stderr and the answer is read with ``input()``. The latest answer is
    held in a one-element buffer so that '(p)revious' can revisit it; older
    'match' / 'distinct' answers are handed to ``deduper.markPairs``.
    Labeling stops on '(f)inished' or when no pair is left to pop.

    Argument :
    A deduper object
    '''

    def commit(pair, label):
        # Only definite answers become training data; 'uncertain' is dropped.
        if label in ['distinct', 'match']:
            examples = {'distinct': [], 'match': []}
            examples[label].append(pair)
            deduper.markPairs(examples)

    answer_labels = {'y': 'match', 'n': 'distinct', 'u': 'uncertain'}
    buffer_len = 1  # Max number of previous operations
    examples_buffer = []
    uncertain_pairs = []
    fields = unique(field.field for field in deduper.data_model.primary_fields)

    done = False
    revisit = False

    while not done:
        if revisit:
            # Re-show the most recently answered pair.
            record_pair, _ = examples_buffer.pop(0)
            revisit = False
        else:
            if not uncertain_pairs:
                uncertain_pairs = deduper.uncertainPairs()

            try:
                record_pair = uncertain_pairs.pop()
            except IndexError:
                break

        # Progress counts include answers still waiting in the buffer.
        pending = [label for _, label in examples_buffer]
        n_match = len(deduper.training_pairs['match']) + pending.count('match')
        n_distinct = (len(deduper.training_pairs['distinct']) +
                      pending.count('distinct'))

        for record in record_pair:
            for field in fields:
                print("%s : %s" % (field, record[field]), file=sys.stderr)
            print(file=sys.stderr)

        print("{0}/10 positive, {1}/10 negative".format(n_match, n_distinct),
              file=sys.stderr)
        print('Do these records refer to the same thing?', file=sys.stderr)

        # '(p)revious' is only offered when there is a buffered answer.
        if examples_buffer:
            prompt = '(y)es / (n)o / (u)nsure / (f)inished / (p)revious'
            valid_responses = {'y', 'n', 'u', 'f', 'p'}
        else:
            prompt = '(y)es / (n)o / (u)nsure / (f)inished'
            valid_responses = {'y', 'n', 'u', 'f'}

        user_input = ''
        while user_input not in valid_responses:
            print(prompt, file=sys.stderr)
            user_input = input()

        if user_input in answer_labels:
            examples_buffer.insert(0, (record_pair, answer_labels[user_input]))
        elif user_input == 'f':
            print('Finished labeling', file=sys.stderr)
            done = True
        elif user_input == 'p':
            revisit = True
            # Put the current pair back so it is shown again afterwards.
            uncertain_pairs.append(record_pair)

        if len(examples_buffer) > buffer_len:
            commit(*examples_buffer.pop())

    # Flush whatever is still buffered once labeling ends.
    for record_pair, label in examples_buffer:
        commit(record_pair, label)
コード例 #6
0
ファイル: convenience.py プロジェクト: zhenglinyi/dedupe
def console_label(
        deduper: dedupe.api.ActiveMatching) -> None:  # pragma: no cover
    '''
    Train a matcher instance (Dedupe, RecordLink, or Gazetteer) from the command line.

    Pairs returned by ``deduper.uncertain_pairs()`` are printed to stderr
    one at a time and the answer is read with ``input()``. The most recent
    answer is buffered so '(p)revious' can revisit it; older 'match' and
    'distinct' answers are passed to ``deduper.mark_pairs``. Labeling ends
    on '(f)inished' or when an ``IndexError`` is raised while fetching the
    next pair.

    Example

    .. code:: python

       > deduper = dedupe.Dedupe(variables)
       > deduper.prepare_training(data)
       > dedupe.console_label(deduper)
    '''

    def mark_if_definite(pair, label):
        # Single place that turns a buffered answer into training data;
        # 'uncertain' answers are intentionally discarded.
        if label in {'distinct', 'match'}:
            examples: TrainingData
            examples = {'distinct': [], 'match': []}
            examples[label].append(pair)  # type: ignore
            deduper.mark_pairs(examples)

    finished = False
    use_previous = False
    fields = unique(field.field for field in deduper.data_model.primary_fields)

    buffer_len = 1  # Max number of previous operations
    examples_buffer: List[Tuple[TrainingExample, Literal['match', 'distinct',
                                                         'uncertain']]] = []
    uncertain_pairs: List[TrainingExample] = []

    while not finished:
        if use_previous:
            # Re-show the most recently answered pair.
            record_pair, _ = examples_buffer.pop(0)
            use_previous = False
        else:
            # The try covers uncertain_pairs() as well, so an IndexError
            # from either the refill or the pop ends the session.
            try:
                if not uncertain_pairs:
                    uncertain_pairs = deduper.uncertain_pairs()

                record_pair = uncertain_pairs.pop()
            except IndexError:
                break

        # Progress counts include answers still waiting in the buffer.
        n_match = (len(deduper.training_pairs['match']) +
                   sum(label == 'match' for _, label in examples_buffer))
        n_distinct = (len(deduper.training_pairs['distinct']) +
                      sum(label == 'distinct' for _, label in examples_buffer))

        for pair in record_pair:
            for field in fields:
                line = "%s : %s" % (field, pair[field])
                print(line, file=sys.stderr)
            print(file=sys.stderr)

        print("{0}/10 positive, {1}/10 negative".format(n_match, n_distinct),
              file=sys.stderr)
        print('Do these records refer to the same thing?', file=sys.stderr)

        # '(p)revious' is only offered when there is a buffered answer.
        if examples_buffer:
            prompt = '(y)es / (n)o / (u)nsure / (f)inished / (p)revious'
            valid_responses = {'y', 'n', 'u', 'f', 'p'}
        else:
            prompt = '(y)es / (n)o / (u)nsure / (f)inished'
            valid_responses = {'y', 'n', 'u', 'f'}

        user_input = ''
        while user_input not in valid_responses:
            print(prompt, file=sys.stderr)
            user_input = input()

        if user_input == 'y':
            examples_buffer.insert(0, (record_pair, 'match'))
        elif user_input == 'n':
            examples_buffer.insert(0, (record_pair, 'distinct'))
        elif user_input == 'u':
            examples_buffer.insert(0, (record_pair, 'uncertain'))
        elif user_input == 'f':
            print('Finished labeling', file=sys.stderr)
            finished = True
        elif user_input == 'p':
            use_previous = True
            # Put the current pair back so it is shown again afterwards.
            uncertain_pairs.append(record_pair)

        if len(examples_buffer) > buffer_len:
            mark_if_definite(*examples_buffer.pop())

    # Flush whatever is still buffered once labeling ends.
    for record_pair, label in examples_buffer:
        mark_if_definite(record_pair, label)
コード例 #7
0
def console_label(deduper: dedupe.api.ActiveMatching) -> "List[tuple]":  # noqa: C901
    """
    Train a matcher instance (Dedupe, RecordLink, or Gazetteer) from the command line.

    Pairs returned by ``deduper.uncertain_pairs()`` are printed to stderr
    one at a time and the answer is read with ``input()``. Once the user
    confirms a pair as a match, the ``"uri"`` values of both records are
    remembered, and any later pair touching one of those URIs is labeled
    "distinct" automatically without being shown.

    Example

    .. code:: python

       > deduper = dedupe.Dedupe(variables)
       > deduper.prepare_training(data)
       > dedupe.console_label(deduper)

    Returns the list of ``(uri_1, uri_2)`` tuples for every pair the user
    answered '(y)es' to, in the order they were confirmed.
    """

    def mark_if_definite(pair, label):
        # Single place that turns a buffered answer into training data;
        # "uncertain" answers are intentionally discarded.
        if label in {"distinct", "match"}:
            examples: TrainingData
            examples = {"distinct": [], "match": []}
            examples[label].append(pair)  # type: ignore
            deduper.mark_pairs(examples)

    confirmed_matches: List[tuple] = []

    finished = False
    use_previous = False
    fields = unique(field.field for field in deduper.data_model.primary_fields)

    buffer_len = 1  # Max number of previous operations
    examples_buffer: List[
        Tuple[TrainingExample, Literal["match", "distinct", "uncertain"]]
    ] = []
    uncertain_pairs: List[TrainingExample] = []

    # don't re-use items that are confirmed with a mapping
    mapped_items = set()

    while not finished:
        if use_previous:
            # Re-show the entry at the front of the buffer.
            record_pair, _ = examples_buffer.pop(0)
            use_previous = False
        else:
            try:
                if not uncertain_pairs:
                    uncertain_pairs = deduper.uncertain_pairs()
                # NOTE(review): the list is not refilled inside this loop,
                # so if every fetched pair is auto-skipped the pop() below
                # raises IndexError and labeling ends -- confirm intended.
                while True:
                    record_pair = uncertain_pairs.pop()
                    if any(
                        record["uri"] in mapped_items for record in record_pair
                    ):
                        # One side is already matched elsewhere, so this
                        # pair is treated as "distinct" without asking.
                        examples_buffer.insert(0, (record_pair, "distinct"))
                        # TODO: do i need to process these?
                    else:
                        break
            except IndexError:
                break

        # Progress counts include answers still waiting in the buffer.
        n_match = len(deduper.training_pairs["match"]) + sum(
            label == "match" for _, label in examples_buffer
        )
        n_distinct = len(deduper.training_pairs["distinct"]) + sum(
            label == "distinct" for _, label in examples_buffer
        )

        for pair in record_pair:
            for field in fields:
                line = "%s : %s" % (field, pair[field])
                print(line, file=sys.stderr)
            print(file=sys.stderr)

        print(
            "{0}/10 positive, {1}/10 negative".format(n_match, n_distinct),
            file=sys.stderr,
        )
        print(
            Fore.YELLOW + "Do these records refer to the same thing?" + Style.RESET_ALL,
            file=sys.stderr,
        )

        # "(p)revious" is only offered when there is a buffered answer.
        if examples_buffer:
            prompt = "(y)es / (n)o / (u)nsure / (f)inished / (p)revious"
            valid_responses = {"y", "n", "u", "f", "p"}
        else:
            prompt = "(y)es / (n)o / (u)nsure / (f)inished"
            valid_responses = {"y", "n", "u", "f"}

        user_input = ""
        while user_input not in valid_responses:
            print(Fore.YELLOW + prompt + Style.RESET_ALL, file=sys.stderr)
            user_input = input()

        if user_input == "y":
            examples_buffer.insert(0, (record_pair, "match"))
            # NOTE(review): these entries are not retracted if the user
            # later goes back with "(p)revious" and changes the answer.
            first_uri = record_pair[0]["uri"]
            second_uri = record_pair[1]["uri"]
            mapped_items.add(first_uri)
            mapped_items.add(second_uri)
            confirmed_matches.append((first_uri, second_uri))
        elif user_input == "n":
            examples_buffer.insert(0, (record_pair, "distinct"))
        elif user_input == "u":
            examples_buffer.insert(0, (record_pair, "uncertain"))
        elif user_input == "f":
            print(Fore.GREEN + "Finished labeling" + Style.RESET_ALL, file=sys.stderr)
            finished = True
        elif user_input == "p":
            use_previous = True
            # Put the current pair back so it is shown again afterwards.
            uncertain_pairs.append(record_pair)

        if len(examples_buffer) > buffer_len:
            mark_if_definite(*examples_buffer.pop())

    # Flush whatever is still buffered once labeling ends.
    for record_pair, label in examples_buffer:
        mark_if_definite(record_pair, label)
    return confirmed_matches
コード例 #8
0
def _apply_confirmed_matches(linked_records, confirmed_matches):
    """Reconcile model-produced links with the pairs the user confirmed.

    Returns a new list of ``(pair, score)`` tuples in which:

    * a link equal to a confirmed pair keeps its position, with score 1.0;
    * a link that shares an entity with a confirmed pair but is not that
      pair is dropped, since it contradicts the user's answer;
    * confirmed pairs the model did not produce are appended with score 1.0.

    Neither input is mutated.
    """
    # dict.fromkeys removes duplicates while keeping confirmation order.
    confirmed_pairs = list(dict.fromkeys(tuple(pair) for pair in confirmed_matches))
    confirmed_lookup = set(confirmed_pairs)
    confirmed_entities = {entity for pair in confirmed_pairs for entity in pair}

    reconciled = []
    already_linked = set()
    for pair, score in linked_records:
        pair = tuple(pair)
        if pair in confirmed_lookup:
            reconciled.append((pair, 1.0))
            already_linked.add(pair)
        elif confirmed_entities.isdisjoint(pair):
            reconciled.append((pair, score))
        # Otherwise the link conflicts with a confirmed match and is dropped.

    reconciled.extend(
        (pair, 1.0) for pair in confirmed_pairs if pair not in already_linked
    )
    return reconciled


def _merge_features(fields, g1_features, g2_features):
    """Interactively link the records of ``g1_features`` and ``g2_features``.

    ``g1_features`` and ``g2_features`` are dicts of records keyed by
    entity id. ``fields`` is the field definition list handed to
    ``dedupe.RecordLink``; each item is indexed with ``["field"]`` to get
    the record key to display.

    When each side holds exactly one record, the two are linked directly
    with score 1.0. Otherwise a linker is trained through ``console_label``,
    its one-to-one join is reconciled with the user-confirmed pairs, and the
    result is shown for approval. Answering anything but "y" repeats the
    labeling round.

    Returns ``(linked_records, leftover_g1, leftover_g2)``:
    ``linked_records`` is a list of ``((id_1, id_2), score)`` tuples, and
    the leftovers are dicts holding the records of each side that
    ``_unpack_linked_records`` does not report as linked (empty when
    nothing is left over).
    """
    # Exactly one record per side: nothing to train on, link them directly.
    if len(g1_features) == len(g2_features) == 1:
        only_pair = (next(iter(g1_features)), next(iter(g2_features)))
        return [(only_pair, 1.0)], {}, {}

    # The displayed field names do not change between rounds.
    field_names = list(unique(field["field"] for field in fields))

    while True:
        linker = dedupe.RecordLink(fields)
        linker.prepare_training(g1_features, g2_features)
        confirmed_matches = console_label(linker)
        linker.train()
        linked_records = linker.join(
            g1_features, g2_features, 0.0, constraint="one-to-one"
        )

        # User-confirmed pairs override whatever the model proposed.
        linked_records = _apply_confirmed_matches(linked_records, confirmed_matches)

        print(
            Style.BRIGHT + Fore.YELLOW + "Is this matching correct?" + Style.RESET_ALL
        )
        for (e1, e2), similarity in linked_records:
            for field in field_names:
                g1_val = g1_features[e1][field]
                g2_val = g2_features[e2][field]
                # !s keeps the padding working for None and other values
                # that reject a width format spec.
                print(f"{g1_val!s:<50} | {g2_val!s:<50}")
            print(f"Similarity: {similarity}")
            print("-" * 20)
        ans = input(Fore.YELLOW + "[y/n]? " + Style.RESET_ALL)
        if ans.lower() == "y":
            print(
                Fore.GREEN
                + "All correct! Moving on to any stragglers"
                + Style.RESET_ALL
            )
            break
        print(Fore.RED + "Re-labeling..." + Style.RESET_ALL)

    linked_entities = _unpack_linked_records(linked_records)
    # Default to "nothing left over" so the return below is always bound.
    leftover_g1, leftover_g2 = {}, {}
    if len(linked_entities) != len(g1_features) or len(linked_entities) != len(
        g2_features
    ):
        linked = set(linked_entities)
        leftover_g1 = {k: v for (k, v) in g1_features.items() if k not in linked}
        leftover_g2 = {k: v for (k, v) in g2_features.items() if k not in linked}
    return linked_records, leftover_g1, leftover_g2