Example #1
def learn_model(distance_pairs, input_signatures, input_records,
                distance_model, verbose=0, ethnicity_estimator=None,
                fast=False):
    """Learn the distance model for pairs of signatures.

    Parameters
    ----------
    :param distance_pairs: string
        Path to the file with signature pairs. The content should be a JSON
        array of tuples (`signature_id1`, `signature_id2`, `target`),
        where `target = 0` if both signatures belong to the same author,
        and `target = 1` otherwise.

        [(0, 1, 0), (2, 3, 0), (4, 5, 1), ...]

    :param input_signatures: string
        Path to the file with signatures. The content should be a JSON array
        of dictionaries holding metadata about signatures.

        [{"signature_id": 0,
          "author_name": "Doe, John",
          "publication_id": 10, ...}, { ... }, ...]

    :param input_records: string
        Path to the file with records. The content should be a JSON array of
        dictionaries holding metadata about records.

        [{"publication_id": 0,
          "title": "Author disambiguation using Beard", ... }, { ... }, ...]

    :param distance_model: string
        Path to the output file for the distance model. The fitted model
        is pickled to this file.

    :param fast: boolean
        Whether the distance model should be built on a reduced set of
        features.
    """
    pairs = json.load(open(distance_pairs, "r"))
    signatures, records = load_signatures(input_signatures, input_records)

    X = np.empty((len(pairs), 2), dtype=object)
    y = np.empty(len(pairs), dtype=int)

    for k, (i, j, target) in enumerate(pairs):
        X[k, 0] = signatures[i]
        X[k, 1] = signatures[j]
        y[k] = target

    # Learn a distance estimator on paired signatures
    distance_estimator = _build_distance_estimator(
        X, y,
        verbose=verbose, ethnicity_estimator=ethnicity_estimator, fast=fast
    )

    pickle.dump(distance_estimator,
                open(distance_model, "wb"),
                protocol=pickle.HIGHEST_PROTOCOL)
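
A minimal usage sketch for the function above, assuming it is importable from the surrounding module together with `load_signatures` and `_build_distance_estimator`; the file names and toy contents below are hypothetical:

import json

# Hypothetical toy inputs matching the documented JSON formats.
pairs = [[0, 1, 0], [0, 2, 1]]  # (signature_id1, signature_id2, target)
signatures = [
    {"signature_id": 0, "author_name": "Doe, John", "publication_id": 10},
    {"signature_id": 1, "author_name": "Doe, J.", "publication_id": 11},
    {"signature_id": 2, "author_name": "Smith, Jane", "publication_id": 12},
]
records = [
    {"publication_id": 10, "title": "Author disambiguation using Beard"},
    {"publication_id": 11, "title": "A follow-up study"},
    {"publication_id": 12, "title": "An unrelated paper"},
]

for name, payload in [("pairs.json", pairs),
                      ("signatures.json", signatures),
                      ("records.json", records)]:
    with open(name, "w") as f:
        json.dump(payload, f)

# Reads the three JSON files and pickles the fitted estimator.
learn_model("pairs.json", "signatures.json", "records.json",
            "distance_model.pkl", verbose=1)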
Example #2
def learn_model(distance_pairs, input_signatures, input_records,
                distance_model, verbose=0, ethnicity_estimator=None):
    """Learn the distance model for pairs of signatures.

    Parameters
    ----------
    :param distance_pairs: string
        Path to the file with signature pairs. The content should be a JSON
        array of tuples (`signature_id1`, `signature_id2`, `target`),
        where `target = 0` if both signatures belong to the same author,
        and `target = 1` otherwise.

        [(0, 1, 0), (2, 3, 0), (4, 5, 1), ...]

    :param input_signatures: string
        Path to the file with signatures. The content should be a JSON array
        of dictionaries holding metadata about signatures.

        [{"signature_id": 0,
          "author_name": "Doe, John",
          "publication_id": 10, ...}, { ... }, ...]

    :param input_records: string
        Path to the file with records. The content should be a JSON array of
        dictionaries holding metadata about records.

        [{"publication_id": 0,
          "title": "Author disambiguation using Beard", ... }, { ... }, ...]

    :param distance_model: string
        Path to the output file for the distance model. The fitted model
        is pickled to this file.
    """
    pairs = json.load(open(distance_pairs, "r"))
    signatures, records = load_signatures(input_signatures, input_records)

    X = np.empty((len(pairs), 2), dtype=object)
    y = np.empty(len(pairs), dtype=int)

    for k, (i, j, target) in enumerate(pairs):
        X[k, 0] = signatures[i]
        X[k, 1] = signatures[j]
        y[k] = target

    # Learn a distance estimator on paired signatures
    distance_estimator = _build_distance_estimator(
        X, y, verbose=verbose, ethnicity_estimator=ethnicity_estimator
    )

    pickle.dump(distance_estimator,
                open(distance_model, "wb"),
                protocol=pickle.HIGHEST_PROTOCOL)
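
Both variants persist the estimator with ``pickle.HIGHEST_PROTOCOL``; reading it back is symmetric. A small sketch, using a context manager so the file handle is closed (the bare ``open(...)`` calls above leak theirs); the path is illustrative:

import pickle

with open("distance_model.pkl", "rb") as f:  # hypothetical path
    distance_estimator = pickle.load(f)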
Example #3
    def signature_update(self, logger):
        try:
            tmpfile = tempfile.NamedTemporaryFile()
            response = urllib2.urlopen(self.settings().signature_url)
            sigjson = response.read()
            tmpfile.write(sigjson)
            tmpfile.flush()

            logger.debug("Successfully got file from %s" % self.settings().signature_url)
            # test the import without caching it
            if not utils.load_signatures(tmpfile.name, cache=False):
                logger.error("Downloaded signatures failed test load (tempfile = %s)" % tmpfile.name)
                return False

            # rewrite the real signature file and import it for real
            f = open(self.settings().signature_path, "w")
            f.write(sigjson)
            f.close()

            return utils.load_signatures(self.settings().signature_path)
        except:
            utils.log_exc(logger)
            return False
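
The method follows a download/validate/swap pattern: fetch into a temp file, test-load without caching, and only then overwrite the live signature file. A rough Python 3 sketch of the same pattern, assuming a ``load_signatures(path, cache=...)`` helper with the semantics shown above:

import tempfile
import urllib.request

def signature_update(url, live_path, load_signatures, logger):
    # Download into a temp file first, so a bad fetch can never
    # clobber the known-good live file.
    with urllib.request.urlopen(url) as response:
        sigjson = response.read()
    with tempfile.NamedTemporaryFile() as tmpfile:
        tmpfile.write(sigjson)
        tmpfile.flush()
        # Test the import without caching it.
        if not load_signatures(tmpfile.name, cache=False):
            logger.error("Downloaded signatures failed test load")
            return False
    # Only now replace the real signature file and import it for real.
    with open(live_path, "wb") as f:
        f.write(sigjson)
    return load_signatures(live_path)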
Example #4
    def __init__(self, is_cobblerd=False):
        """
        Constructor
        """

        # FIXME: this should be switchable through some simple system

        self.__dict__ = BootAPI.__shared_state
        self.perms_ok = False
        if not BootAPI.__has_loaded:

            if os.path.exists("/etc/cobbler/use.couch"):
                 self.use_couch = True
            else:
                 self.use_couch = False

            # NOTE: we do not log all API actions, because
            # a simple CLI invocation may call adds and such
            # to load the config, which would just fill up
            # the logs, so we'll do that logging at CLI
            # level (and remote.py web service level) instead.

            random.seed()
            self.is_cobblerd = is_cobblerd

            try:
                self.logger = clogger.Logger("/var/log/cobbler/cobbler.log")
            except CX:
                # return to CLI/other but perms are not valid
                # perms_ok is False
                return

            # FIXME: consolidate into 1 server instance

            self.selinux_enabled = utils.is_selinux_enabled()
            self.dist = utils.check_dist()
            self.os_version = utils.os_release()

            BootAPI.__has_loaded   = True

            # load the modules first, or nothing else works...
            module_loader.load_modules()

            self._config = config.Config(self)
            self.deserialize()

            # import signatures
            if not utils.load_signatures(self.settings().signature_path):
                return
            else:
                self.log("%d breeds and %d OS versions read from the signature file" % ( \
                         len(utils.get_valid_breeds()), \
                         len(utils.get_valid_os_versions())))

            self.authn = self.get_module_from_file(
                "authentication",
                "module",
                "authn_configfile"
            )
            self.authz  = self.get_module_from_file(
                "authorization",
                "module",
                "authz_allowall"
            )

            # FIXME: pass more loggers around, and also see that those
            # using things via tasks construct their own kickgen/yumgen/
            # pxegen versus reusing this one, which has the wrong logger
            # (most likely) for background tasks.

            self.kickgen = kickgen.KickGen(self._config)
            self.yumgen  = yumgen.YumGen(self._config)
            self.pxegen  = pxegen.PXEGen(self._config, logger=self.logger)
            self.logger.debug("API handle initialized")
            self.perms_ok = True
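
The assignment ``self.__dict__ = BootAPI.__shared_state`` combined with the ``__has_loaded`` guard is the Borg (monostate) pattern: every instance shares one attribute dictionary, so the expensive initialization above runs only once per process. A stripped-down sketch of just that mechanism:

class Borg:
    __shared_state = {}   # one dict shared by every instance
    __has_loaded = False

    def __init__(self):
        # Alias this instance's attributes to the shared dict...
        self.__dict__ = Borg.__shared_state
        # ...so the expensive setup runs only for the first instance.
        if not Borg.__has_loaded:
            Borg.__has_loaded = True
            self.config = "loaded once"

a, b = Borg(), Borg()
b.config = "changed"
assert a.config == "changed"   # state is shared, not copied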
Example #5
            elif object_action in [
                    "poweron", "poweroff", "powerstatus", "reboot"
            ]:
                power = {}
                power["power"] = object_action.replace("power", "")
                power["systems"] = [options.name]
                task_id = self.remote.background_power_system(
                    power, self.token)
            elif object_action == "update":
                task_id = self.remote.background_signature_update(
                    utils.strip_none(vars(options), omit_none=True),
                    self.token)
            elif object_action == "reload":
                filename = opt(options, "filename",
                               "/var/lib/cobbler/distro_signatures.json")
                if not utils.load_signatures(filename, cache=True):
                    print "There was an error loading the signature data in %s." % filename
                    print "Please check the JSON file or run 'cobbler signature update'."
                    return False
                else:
                    print "Signatures were successfully loaded"
            else:
                raise exceptions.NotImplementedError()
        else:
            raise exceptions.NotImplementedError()

        # FIXME: add tail/polling code here
        if task_id != -1:
            self.print_task(task_id)
            self.follow_task(task_id)
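
The ``update`` branch converts the argparse options to a plain dict and drops ``None`` values before sending them over the remote API, since XML-RPC cannot marshal ``None`` by default. ``utils.strip_none`` is Cobbler's helper; a minimal, hypothetical stand-in for the behavior this call site relies on (the real helper also handles nested structures) might look like:

def strip_none_sketch(data, omit_none=False):
    # Hypothetical stand-in for utils.strip_none: either drop None
    # entries or substitute a marshallable placeholder string.
    if omit_none:
        return {k: v for k, v in data.items() if v is not None}
    return {k: ("~" if v is None else v) for k, v in data.items()}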
Example #6
def clustering(input_signatures,
               input_records,
               distance_model,
               input_clusters=None,
               output_clusters=None,
               verbose=1,
               n_jobs=-1,
               clustering_method="average",
               train_signatures_file=None,
               clustering_threshold=None,
               results_file=None,
               blocking_function="block_phonetic",
               blocking_threshold=1,
               blocking_phonetic_alg="nysiis"):
    """Cluster signatures using a pretrained distance model.

    Parameters
    ----------
    :param input_signatures: string
        Path to the file with signatures. The content should be a JSON array
        of dictionaries holding metadata about signatures.

        [{"signature_id": 0,
          "author_name": "Doe, John",
          "publication_id": 10, ...}, { ... }, ...]

    :param input_records: string
        Path to the file with records. The content should be a JSON array of
        dictionaries holding metadata about records.

        [{"publication_id": 0,
          "title": "Author disambiguation using Beard", ... }, { ... }, ...]

    :param distance_model: string
        Path to the file with the distance model. The file should be a pickle
        created using the ``distance.py`` script.

    :param input_clusters: string
        Path to the file with known clusters. The content should be a JSON
        dictionary, where keys are cluster labels and values are lists of the
        `signature_id`s of the signatures grouped in that cluster. Signatures
        assigned to the cluster with label "-1" are not clustered.

        {"0": [0, 1, 3], "1": [2, 5], ...}

    :param output_clusters: string
        Path to the output file. It will be filled with the predicted
        clusters, using the same format as ``input_clusters``.

    :param verbose: int
        If not zero, the function will print scores to stdout.

    :param n_jobs: int
        Parameter passed to joblib. Number of threads to be used.

    :param clustering_method: string
        Linkage method passed to ``ScipyHierarchicalClustering``.

    :param train_signatures_file: str
        Path to the file with the train-set signatures. The format is the
        same as for ``input_signatures``.

    :param clustering_threshold: float
        Threshold passed to ``ScipyHierarchicalClustering``.

    :param results_file: str
        Path to the file where the results will be written. It will include
        additional information about the pairwise variant of the scores.

    :param blocking_function: string
        Must be one of the defined blocking functions:
        - "block_last_name_first_initial"
        - "block_phonetic"

    :param blocking_threshold: int or None
        Maximum allowed size of a block on the last name. It must be:
        -   None, if the blocking function is "block_last_name_first_initial"
        -   an int, if the blocking function is "block_phonetic"; please
            check the documentation of phonetic blocking in
            beard.clustering.blocking_funcs.py

    :param blocking_phonetic_alg: string or None
        If not None, determines which phonetic algorithm is used. Options:
        -  "double_metaphone"
        -  "nysiis" (only for Python 2)
        -  "soundex" (only for Python 2)
    """
    # Assumes that 'distance_estimator' lives in the global scope, so that
    # parallel workers can reuse it without re-loading the pickle
    global distance_estimator
    distance_estimator = pickle.load(open(distance_model, "rb"))

    try:
        # Run the estimator single-threaded; BlockClustering already
        # parallelizes across blocks via joblib.
        distance_estimator.steps[-1][1].set_params(n_jobs=1)
    except Exception:
        pass

    signatures, records = load_signatures(input_signatures, input_records)

    indices = {}
    X = np.empty((len(signatures), 1), dtype=object)
    for i, signature in enumerate(
            sorted(signatures.values(), key=lambda s: s["signature_id"])):
        X[i, 0] = signature
        indices[signature["signature_id"]] = i

    if blocking_function == "block_last_name_first_initial":
        block_function = block_last_name_first_initial
    else:
        block_function = partial(block_phonetic,
                                 threshold=blocking_threshold,
                                 phonetic_algorithm=blocking_phonetic_alg)

    # Semi-supervised block clustering
    if input_clusters:
        true_clusters = json.load(open(input_clusters, "r"))
        y_true = -np.ones(len(X), dtype=int)

        for label, signature_ids in true_clusters.items():
            for signature_id in signature_ids:
                y_true[indices[signature_id]] = label

        y = -np.ones(len(X), dtype=int)

        if train_signatures_file:
            train_signatures = json.load(open(train_signatures_file, "r"))
            train_ids = [x['signature_id'] for x in train_signatures]
            del train_signatures
            y[train_ids] = y_true[train_ids]
            test_ids = list(
                set(x['signature_id']
                    for x in signatures.values()) - set(train_ids))
        else:
            y = y_true

    else:
        y = None

    clusterer = BlockClustering(blocking=block_function,
                                base_estimator=ScipyHierarchicalClustering(
                                    affinity=_affinity,
                                    threshold=clustering_threshold,
                                    method=clustering_method,
                                    supervised_scoring=b3_f_score),
                                verbose=verbose,
                                n_jobs=n_jobs).fit(X, y)

    labels = clusterer.labels_

    # Save predicted clusters
    if output_clusters:
        clusters = {}

        for label in np.unique(labels):
            mask = (labels == label)
            clusters[str(label)] = [r[0]["signature_id"] for r in X[mask]]

        json.dump(clusters, open(output_clusters, "w"))

    # Statistics
    if verbose and input_clusters:
        print("Number of blocks =", len(clusterer.clusterers_))
        print("True number of clusters", len(np.unique(y_true)))
        print("Number of computed clusters", len(np.unique(labels)))

        b3_overall = b3_precision_recall_fscore(y_true, labels)
        print("B^3 F-score (overall) =", b3_overall[2])

        if train_signatures_file:
            b3_train = b3_precision_recall_fscore(y_true[train_ids],
                                                  labels[train_ids])
            b3_test = b3_precision_recall_fscore(y_true[test_ids],
                                                 labels[test_ids])
            print("B^3 F-score (train) =", b3_train[2])
            print("B^3 F-score (test) =", b3_test[2])
            if results_file:
                paired_overall = paired_precision_recall_fscore(y_true, labels)
                paired_train = paired_precision_recall_fscore(
                    y_true[train_ids], labels[train_ids])
                paired_test = paired_precision_recall_fscore(
                    y_true[test_ids], labels[test_ids])

                json.dump(
                    {
                        "description": ["precision", "recall", "f_score"],
                        "b3": {
                            "overall": list(b3_overall),
                            "train": list(b3_train),
                            "test": list(b3_test)
                        },
                        "paired": {
                            "overall": list(paired_overall),
                            "train": list(paired_train),
                            "test": list(paired_test)
                        }
                    }, open(results_file, 'w'))
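
Note how the blocking function is selected above: ``functools.partial`` pre-binds the threshold and phonetic algorithm, so both choices expose the same one-argument signature to ``BlockClustering``. A minimal illustration of that dispatch, with stand-in functions:

from functools import partial

def block_last_name_first_initial(X):
    return ["initial-block" for _ in X]          # stand-in

def block_phonetic(X, threshold=1, phonetic_algorithm="nysiis"):
    return [phonetic_algorithm for _ in X]       # stand-in

def pick_block_function(name, threshold, alg):
    if name == "block_last_name_first_initial":
        return block_last_name_first_initial
    # partial() freezes the extra keyword arguments, so both
    # branches return a callable taking only X.
    return partial(block_phonetic, threshold=threshold,
                   phonetic_algorithm=alg)

fn = pick_block_function("block_phonetic", 1, "nysiis")
print(fn(["sig0", "sig1"]))   # -> ['nysiis', 'nysiis']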
Example #7
def clustering(input_signatures, input_records, distance_model,
               input_clusters=None, output_clusters=None,
               verbose=1, n_jobs=-1, clustering_method="average",
               clustering_random_state=42, clustering_test_size=None,
               clustering_threshold=None):
    """Cluster signatures using a pretrained distance model.

    Parameters
    ----------
    :param input_signatures: string
        Path to the file with signatures. The content should be a JSON array
        of dictionaries holding metadata about signatures.

        [{"signature_id": 0,
          "author_name": "Doe, John",
          "publication_id": 10, ...}, { ... }, ...]

    :param input_records: string
        Path to the file with records. The content should be a JSON array of
        dictionaries holding metadata about records.

        [{"publication_id": 0,
          "title": "Author disambiguation using Beard", ... }, { ... }, ...]

    :param distance_model: string
        Path to the file with the distance model. The file should be a pickle
        created using the ``distance.py`` script.

    :param input_clusters: string
        Path to the file with known clusters. The content should be a JSON
        dictionary, where keys are cluster labels and values are lists of the
        `signature_id`s of the signatures grouped in that cluster. Signatures
        assigned to the cluster with label "-1" are not clustered.

        {"0": [0, 1, 3], "1": [2, 5], ...}

    :param output_clusters: string
        Path to the output file. It will be filled with the predicted
        clusters, using the same format as ``input_clusters``.

    :param verbose: int
        If not zero, the function will print scores to stdout.

    :param n_jobs: int
        Parameter passed to joblib. Number of threads to be used.

    :param clustering_method: string
        Linkage method passed to ``ScipyHierarchicalClustering``.

    :param clustering_random_state: int or RandomState
        Random state for splitting the data into train and test sets.

    :param clustering_test_size: float
        Fraction of the data used as the test set.

    :param clustering_threshold: float
        Threshold passed to ``ScipyHierarchicalClustering``.

    """
    # Assumes that 'distance_estimator' lives in the global scope, so that
    # parallel workers can reuse it without re-loading the pickle
    global distance_estimator

    distance_estimator = pickle.load(open(distance_model, "rb"))
    signatures, records = load_signatures(input_signatures,
                                          input_records)

    indices = {}
    X = np.empty((len(signatures), 1), dtype=object)
    for i, signature in enumerate(sorted(signatures.values(),
                                         key=lambda s: s["signature_id"])):
        X[i, 0] = signature
        indices[signature["signature_id"]] = i

    # Semi-supervised block clustering
    if input_clusters:
        true_clusters = json.load(open(input_clusters, "r"))
        y_true = -np.ones(len(X), dtype=int)

        for label, signature_ids in true_clusters.items():
            for signature_id in signature_ids:
                y_true[indices[signature_id]] = label

        if clustering_test_size is not None:
            train, test = train_test_split(
                np.arange(len(X)),
                test_size=clustering_test_size,
                random_state=clustering_random_state)

            y = -np.ones(len(X), dtype=int)
            y[train] = y_true[train]

        else:
            y = y_true

    else:
        y = None

    clusterer = BlockClustering(
        blocking=block_last_name_first_initial,
        base_estimator=ScipyHierarchicalClustering(
            affinity=_affinity,
            threshold=clustering_threshold,
            method=clustering_method,
            supervised_scoring=b3_f_score),
        verbose=verbose,
        n_jobs=n_jobs).fit(X, y)

    labels = clusterer.labels_

    # Save predicted clusters
    if output_clusters:
        clusters = {}

        for label in np.unique(labels):
            mask = (labels == label)
            # JSON object keys must be strings (numpy ints are not serializable)
            clusters[str(label)] = [r[0]["signature_id"] for r in X[mask]]

        json.dump(clusters, open(output_clusters, "w"))

    # Statistics
    if verbose and input_clusters:
        print("Number of blocks =", len(clusterer.clusterers_))
        print("True number of clusters", len(np.unique(y_true)))
        print("Number of computed clusters", len(np.unique(labels)))
        print("B^3 F-score (overall) =", b3_f_score(y_true, labels))

        if clustering_test_size:
            print("B^3 F-score (train) =",
                  b3_f_score(y_true[train], labels[train]))
            print("B^3 F-score (test) =",
                  b3_f_score(y_true[test], labels[test]))
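
The semi-supervised branch encodes "unlabeled" as -1: true labels are revealed only at the training indices, so the clusterer can tune itself against them while the held-out part stays hidden. A self-contained sketch of that masking, with ``train_test_split`` from scikit-learn as in the snippet:

import numpy as np
from sklearn.model_selection import train_test_split

y_true = np.array([0, 0, 1, 1, 2, 2])      # known cluster labels
train, test = train_test_split(np.arange(len(y_true)),
                               test_size=0.5, random_state=42)

y = -np.ones(len(y_true), dtype=int)       # -1 means "unlabeled"
y[train] = y_true[train]                   # reveal only the train labels
print(y)                                   # test positions stay -1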
Example #8
             data = self.remote.get_blended_data("", options.name)
         # FIXME: pretty-printing and sorting here
         keys = data.keys()
         keys.sort()
         for x in keys:
            print "%s : %s" % (x, data[x])
     elif object_action in [ "poweron", "poweroff", "powerstatus", "reboot" ]:
         power={}
         power["power"] = object_action.replace("power","")
         power["systems"] = [options.name]
         task_id = self.remote.background_power_system(power, self.token)
     elif object_action == "update":
         task_id = self.remote.background_signature_update(utils.strip_none(vars(options),omit_none=True), self.token)
     elif object_action == "reload":
         filename = opt(options,"filename","/var/lib/cobbler/distro_signatures.json")
         if not utils.load_signatures(filename,cache=True):
             print "There was an error loading the signature data in %s." % filename
             print "Please check the JSON file or run 'cobbler signature update'."
             return False
         else:
             print "Signatures were successfully loaded"
     else:
         raise exceptions.NotImplementedError()
 else:
     raise exceptions.NotImplementedError()

 # FIXME: add tail/polling code here
 if task_id != -1:
     self.print_task(task_id)
     self.follow_task(task_id)
Example #9
def clustering(input_signatures, input_records, distance_model,
               input_clusters=None, output_clusters=None,
               verbose=1, n_jobs=-1, clustering_method="average",
               train_signatures_file=None, clustering_threshold=None,
               results_file=None, blocking_function="block_phonetic",
               blocking_threshold=1, blocking_phonetic_alg="nysiis"):
    """Cluster signatures using a pretrained distance model.

    Parameters
    ----------
    :param input_signatures: string
        Path to the file with signatures. The content should be a JSON array
        of dictionaries holding metadata about signatures.

        [{"signature_id": 0,
          "author_name": "Doe, John",
          "publication_id": 10, ...}, { ... }, ...]

    :param input_records: string
        Path to the file with records. The content should be a JSON array of
        dictionaries holding metadata about records.

        [{"publication_id": 0,
          "title": "Author disambiguation using Beard", ... }, { ... }, ...]

    :param distance_model: string
        Path to the file with the distance model. The file should be a pickle
        created using the ``distance.py`` script.

    :param input_clusters: string
        Path to the file with known clusters. The content should be a JSON
        dictionary, where keys are cluster labels and values are lists of the
        `signature_id`s of the signatures grouped in that cluster. Signatures
        assigned to the cluster with label "-1" are not clustered.

        {"0": [0, 1, 3], "1": [2, 5], ...}

    :param output_clusters: string
        Path to the output file. It will be filled with the predicted
        clusters, using the same format as ``input_clusters``.

    :param verbose: int
        If not zero, the function will print scores to stdout.

    :param n_jobs: int
        Parameter passed to joblib. Number of threads to be used.

    :param clustering_method: string
        Linkage method passed to ``ScipyHierarchicalClustering``.

    :param train_signatures_file: str
        Path to the file with the train-set signatures. The format is the
        same as for ``input_signatures``.

    :param clustering_threshold: float
        Threshold passed to ``ScipyHierarchicalClustering``.

    :param results_file: str
        Path to the file where the results will be written. It will include
        additional information about the pairwise variant of the scores.

    :param blocking_function: string
        Must be one of the defined blocking functions:
        - "block_last_name_first_initial"
        - "block_phonetic"

    :param blocking_threshold: int or None
        Maximum allowed size of a block on the last name. It must be:
        -   None, if the blocking function is "block_last_name_first_initial"
        -   an int, if the blocking function is "block_phonetic"; please
            check the documentation of phonetic blocking in
            beard.clustering.blocking_funcs.py

    :param blocking_phonetic_alg: string or None
        If not None, determines which phonetic algorithm is used. Options:
        -  "double_metaphone"
        -  "nysiis" (only for Python 2)
        -  "soundex" (only for Python 2)
    """
    # Assumes that 'distance_estimator' lives in the global scope, so that
    # parallel workers can reuse it without re-loading the pickle
    global distance_estimator
    distance_estimator = pickle.load(open(distance_model, "rb"))

    try:
        # Run the estimator single-threaded; BlockClustering already
        # parallelizes across blocks via joblib.
        distance_estimator.steps[-1][1].set_params(n_jobs=1)
    except Exception:
        pass

    signatures, records = load_signatures(input_signatures,
                                          input_records)

    indices = {}
    X = np.empty((len(signatures), 1), dtype=object)
    for i, signature in enumerate(sorted(signatures.values(),
                                         key=lambda s: s["signature_id"])):
        X[i, 0] = signature
        indices[signature["signature_id"]] = i

    if blocking_function == "block_last_name_first_initial":
        block_function = block_last_name_first_initial
    else:
        block_function = partial(block_phonetic,
                                 threshold=blocking_threshold,
                                 phonetic_algorithm=blocking_phonetic_alg)

    # Semi-supervised block clustering
    if input_clusters:
        true_clusters = json.load(open(input_clusters, "r"))
        y_true = -np.ones(len(X), dtype=int)

        for label, signature_ids in true_clusters.items():
            for signature_id in signature_ids:
                y_true[indices[signature_id]] = label

        y = -np.ones(len(X), dtype=int)

        if train_signatures_file:
            train_signatures = json.load(open(train_signatures_file, "r"))
            train_ids = [x['signature_id'] for x in train_signatures]
            del train_signatures
            y[train_ids] = y_true[train_ids]
            test_ids = list(set(x['signature_id'] for x in
                                signatures.values()) - set(train_ids))
        else:
            y = y_true

    else:
        y = None

    clusterer = BlockClustering(
        blocking=block_function,
        base_estimator=ScipyHierarchicalClustering(
            affinity=_affinity,
            threshold=clustering_threshold,
            method=clustering_method,
            supervised_scoring=b3_f_score),
        verbose=verbose,
        n_jobs=n_jobs).fit(X, y)

    labels = clusterer.labels_

    # Save predicted clusters
    if output_clusters:
        clusters = {}

        for label in np.unique(labels):
            mask = (labels == label)
            clusters[str(label)] = [r[0]["signature_id"] for r in X[mask]]

        json.dump(clusters, open(output_clusters, "w"))

    # Statistics
    if verbose and input_clusters:
        print("Number of blocks =", len(clusterer.clusterers_))
        print("True number of clusters", len(np.unique(y_true)))
        print("Number of computed clusters", len(np.unique(labels)))

        b3_overall = b3_precision_recall_fscore(y_true, labels)
        print("B^3 F-score (overall) =", b3_overall[2])

        if train_signatures_file:
            b3_train = b3_precision_recall_fscore(
                y_true[train_ids],
                labels[train_ids]
            )
            b3_test = b3_precision_recall_fscore(
                y_true[test_ids],
                labels[test_ids]
            )
            print("B^3 F-score (train) =", b3_train[2])
            print("B^3 F-score (test) =", b3_test[2])
            if results_file:
                paired_overall = paired_precision_recall_fscore(y_true, labels)
                paired_train = paired_precision_recall_fscore(
                    y_true[train_ids],
                    labels[train_ids]
                )
                paired_test = paired_precision_recall_fscore(
                    y_true[test_ids],
                    labels[test_ids]
                )

                json.dump({
                    "description": ["precision", "recall", "f_score"],
                    "b3": {"overall": list(b3_overall),
                           "train": list(b3_train),
                           "test": list(b3_test)
                           },
                    "paired": {"overall": list(paired_overall),
                               "train": list(paired_train),
                               "test": list(paired_test)
                               }
                }, open(results_file, 'w'))
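
The ``global distance_estimator`` lines hint at how ``_affinity`` (not shown in these snippets) reaches the model: it reads a module-level global, so joblib's forked workers inherit the already-unpickled estimator instead of re-loading it per block. A hedged sketch of that arrangement; Beard's actual ``_affinity`` differs in detail:

import numpy as np

distance_estimator = None   # unpickled once at module level

def _affinity(X):
    # Hypothetical sketch: score every pair of signatures in a block
    # with the shared estimator and return the condensed distance
    # vector that hierarchical clustering expects.
    i, j = np.triu_indices(len(X), k=1)
    pairs = np.empty((len(i), 2), dtype=object)
    pairs[:, 0] = X[i, 0]
    pairs[:, 1] = X[j, 0]
    return distance_estimator.predict(pairs)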