def learn_model(distance_pairs, input_signatures, input_records,
                distance_model, verbose=0, ethnicity_estimator=None,
                fast=False):
    """Learn the distance model for pairs of signatures.

    Parameters
    ----------
    :param distance_pairs: string
        Path to the file with signature pairs. The content should be a JSON
        array of tuples (`signature_id1`, `signature_id2`, `target`), where
        `target = 0` if both signatures belong to the same author, and
        `target = 1` otherwise.

        [(0, 1, 0), (2, 3, 0), (4, 5, 1), ...]

    :param input_signatures: string
        Path to the file with signatures. The content should be a JSON array
        of dictionaries holding metadata about signatures.

        [{"signature_id": 0,
          "author_name": "Doe, John",
          "publication_id": 10, ...}, { ... }, ...]

    :param input_records: string
        Path to the file with records. The content should be a JSON array of
        dictionaries holding metadata about records.

        [{"publication_id": 0,
          "title": "Author disambiguation using Beard", ... }, { ... }, ...]

    :param distance_model: string
        Path to the file with the distance model. The file should be pickled.

    :param fast: boolean
        Whether the distance model should be built on a reduced set of
        features.
    """
    pairs = json.load(open(distance_pairs, "r"))
    signatures, records = load_signatures(input_signatures, input_records)

    X = np.empty((len(pairs), 2), dtype=np.object)
    y = np.empty(len(pairs), dtype=np.int)

    for k, (i, j, target) in enumerate(pairs):
        X[k, 0] = signatures[i]
        X[k, 1] = signatures[j]
        y[k] = target

    # Learn a distance estimator on paired signatures
    distance_estimator = _build_distance_estimator(
        X, y,
        verbose=verbose, ethnicity_estimator=ethnicity_estimator, fast=fast)

    pickle.dump(distance_estimator,
                open(distance_model, "wb"),
                protocol=pickle.HIGHEST_PROTOCOL)

def learn_model(distance_pairs, input_signatures, input_records,
                distance_model, verbose=0, ethnicity_estimator=None):
    """Learn the distance model for pairs of signatures.

    Parameters
    ----------
    :param distance_pairs: string
        Path to the file with signature pairs. The content should be a JSON
        array of tuples (`signature_id1`, `signature_id2`, `target`), where
        `target = 0` if both signatures belong to the same author, and
        `target = 1` otherwise.

        [(0, 1, 0), (2, 3, 0), (4, 5, 1), ...]

    :param input_signatures: string
        Path to the file with signatures. The content should be a JSON array
        of dictionaries holding metadata about signatures.

        [{"signature_id": 0,
          "author_name": "Doe, John",
          "publication_id": 10, ...}, { ... }, ...]

    :param input_records: string
        Path to the file with records. The content should be a JSON array of
        dictionaries holding metadata about records.

        [{"publication_id": 0,
          "title": "Author disambiguation using Beard", ... }, { ... }, ...]

    :param distance_model: string
        Path to the file with the distance model. The file should be pickled.
    """
    pairs = json.load(open(distance_pairs, "r"))
    signatures, records = load_signatures(input_signatures, input_records)

    X = np.empty((len(pairs), 2), dtype=np.object)
    y = np.empty(len(pairs), dtype=np.int)

    for k, (i, j, target) in enumerate(pairs):
        X[k, 0] = signatures[i]
        X[k, 1] = signatures[j]
        y[k] = target

    # Learn a distance estimator on paired signatures
    distance_estimator = _build_distance_estimator(
        X, y,
        verbose=verbose, ethnicity_estimator=ethnicity_estimator)

    pickle.dump(distance_estimator,
                open(distance_model, "wb"),
                protocol=pickle.HIGHEST_PROTOCOL)

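# Hedged usage sketch (not part of the original scripts): a minimal call to
# learn_model. The file paths below are hypothetical placeholders; the inputs
# must follow the JSON formats documented in the docstrings above, and the
# fitted estimator is pickled to the distance_model path.
learn_model(distance_pairs="pairs.json",
            input_signatures="signatures.json",
            input_records="records.json",
            distance_model="linkage.dat",
            verbose=1)
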
def signature_update(self, logger):
    try:
        tmpfile = tempfile.NamedTemporaryFile()
        response = urllib2.urlopen(self.settings().signature_url)
        sigjson = response.read()
        tmpfile.write(sigjson)
        tmpfile.flush()
        logger.debug("Successfully got file from %s" %
                     self.settings().signature_url)

        # test the import without caching it
        if not utils.load_signatures(tmpfile.name, cache=False):
            logger.error("Downloaded signatures failed test load "
                         "(tempfile = %s)" % tmpfile.name)
            return False

        # rewrite the real signature file and import it for real
        f = open(self.settings().signature_path, "w")
        f.write(sigjson)
        f.close()

        return utils.load_signatures(self.settings().signature_path)
    except:
        utils.log_exc(logger)
        return False

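# Hedged sketch (illustrative names, not Cobbler API): signature_update above
# follows a validate-before-commit pattern. The download is test-loaded from
# a temporary file with cache=False before the real signature file is
# overwritten. The same pattern in a self-contained form; safe_replace and
# validate are hypothetical names introduced here.
import tempfile

def safe_replace(path, new_bytes, validate):
    """Overwrite path with new_bytes only if validate() accepts them first."""
    with tempfile.NamedTemporaryFile() as tmp:
        tmp.write(new_bytes)
        tmp.flush()
        if not validate(tmp.name):
            # reject without touching the real file
            return False
    with open(path, "wb") as f:
        f.write(new_bytes)
    return True
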
def __init__(self, is_cobblerd=False):
    """
    Constructor
    """

    # FIXME: this should be switchable through some simple system
    self.__dict__ = BootAPI.__shared_state
    self.perms_ok = False
    if not BootAPI.__has_loaded:

        if os.path.exists("/etc/cobbler/use.couch"):
            self.use_couch = True
        else:
            self.use_couch = False

        # NOTE: we do not log all API actions, because
        # a simple CLI invocation may call adds and such
        # to load the config, which would just fill up
        # the logs, so we'll do that logging at CLI
        # level (and remote.py web service level) instead.

        random.seed()
        self.is_cobblerd = is_cobblerd

        try:
            self.logger = clogger.Logger("/var/log/cobbler/cobbler.log")
        except CX:
            # return to CLI/other but perms are not valid
            # perms_ok is False
            return

        # FIXME: consolidate into 1 server instance
        self.selinux_enabled = utils.is_selinux_enabled()
        self.dist = utils.check_dist()
        self.os_version = utils.os_release()

        BootAPI.__has_loaded = True

        # load the modules first, or nothing else works...
        module_loader.load_modules()

        self._config = config.Config(self)
        self.deserialize()

        # import signatures
        if not utils.load_signatures(self.settings().signature_path):
            return
        else:
            self.log("%d breeds and %d OS versions read from the signature file" % (
                len(utils.get_valid_breeds()),
                len(utils.get_valid_os_versions())))

        self.authn = self.get_module_from_file(
            "authentication",
            "module",
            "authn_configfile"
        )
        self.authz = self.get_module_from_file(
            "authorization",
            "module",
            "authz_allowall"
        )

        # FIXME: pass more loggers around, and also see that those
        # using things via tasks construct their own kickgen/yumgen/
        # pxegen versus reusing this one, which has the wrong logger
        # (most likely) for background tasks.
        self.kickgen = kickgen.KickGen(self._config)
        self.yumgen = yumgen.YumGen(self._config)
        self.pxegen = pxegen.PXEGen(self._config, logger=self.logger)
        self.logger.debug("API handle initialized")
        self.perms_ok = True

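# Hedged usage sketch (illustrative, not from the original source): BootAPI
# shares state across instances via __shared_state (the Borg pattern), so
# repeated construction reuses the state loaded on first init. perms_ok
# reports whether the log file could be opened with sufficient permissions.
api = BootAPI()
if not api.perms_ok:
    raise SystemExit("insufficient permissions to initialize the Cobbler API")
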
elif object_action in [ "poweron", "poweroff", "powerstatus", "reboot" ]: power = {} power["power"] = object_action.replace("power", "") power["systems"] = [options.name] task_id = self.remote.background_power_system( power, self.token) elif object_action == "update": task_id = self.remote.background_signature_update( utils.strip_none(vars(options), omit_none=True), self.token) elif object_action == "reload": filename = opt(options, "filename", "/var/lib/cobbler/distro_signatures.json") if not utils.load_signatures(filename, cache=True): print "There was an error loading the signature data in %s." % filename print "Please check the JSON file or run 'cobbler signature update'." return False else: print "Signatures were successfully loaded" else: raise exceptions.NotImplementedError() else: raise exceptions.NotImplementedError() # FIXME: add tail/polling code here if task_id != -1: self.print_task(task_id) self.follow_task(task_id)
def clustering(input_signatures, input_records, distance_model,
               input_clusters=None, output_clusters=None, verbose=1,
               n_jobs=-1, clustering_method="average",
               train_signatures_file=None, clustering_threshold=None,
               results_file=None, blocking_function="block_phonetic",
               blocking_threshold=1, blocking_phonetic_alg="nysiis"):
    """Cluster signatures using a pretrained distance model.

    Parameters
    ----------
    :param input_signatures: string
        Path to the file with signatures. The content should be a JSON array
        of dictionaries holding metadata about signatures.

        [{"signature_id": 0,
          "author_name": "Doe, John",
          "publication_id": 10, ...}, { ... }, ...]

    :param input_records: string
        Path to the file with records. The content should be a JSON array of
        dictionaries holding metadata about records.

        [{"publication_id": 0,
          "title": "Author disambiguation using Beard", ... }, { ... }, ...]

    :param distance_model: string
        Path to the file with the distance model. The file should be a pickle
        created using the ``distance.py`` script.

    :param input_clusters: string
        Path to the file with known clusters. The file should be a dictionary,
        where keys are cluster labels and values are the `signature_id` of the
        signatures grouped in the clusters. Signatures assigned to the cluster
        with label "-1" are not clustered.

        {"0": [0, 1, 3], "1": [2, 5], ...}

    :param output_clusters: string
        Path to the file with output clusters. The file will be filled with
        clusters, using the same format as ``input_clusters``.

    :param verbose: int
        If not zero, function will output scores on stdout.

    :param n_jobs: int
        Parameter passed to joblib. Number of threads to be used.

    :param clustering_method: string
        Linkage method passed to ``ScipyHierarchicalClustering``.

    :param train_signatures_file: str
        Path to the file with train set signatures. Format the same as in
        ``input_signatures``.

    :param clustering_threshold: float
        Threshold passed to ``ScipyHierarchicalClustering``.

    :param results_file: str
        Path to the file where the results will be output. It will give
        additional information about the pairwise variant of the scores.

    :param blocking_function: string
        Must be a defined blocking function. Defined functions are:
        - "block_last_name_first_initial"
        - "block_phonetic"

    :param blocking_threshold: int or None
        Maximum allowed size of a block on the last name. It can only be:
        - None; if the blocking function is block_last_name_first_initial
        - int; if the blocking function is block_phonetic
        Please check the documentation of phonetic blocking in
        beard.clustering.blocking_funcs.py.

    :param blocking_phonetic_alg: string or None
        If not None, determines which phonetic algorithm is used. Options:
        - "double_metaphone"
        - "nysiis" (only for Python 2)
        - "soundex" (only for Python 2)
    """
    # Assumes that 'distance_estimator' lives in the global namespace,
    # making things fast.
    global distance_estimator
    distance_estimator = pickle.load(open(distance_model, "rb"))

    try:
        distance_estimator.steps[-1][1].set_params(n_jobs=1)
    except Exception:
        pass

    signatures, records = load_signatures(input_signatures, input_records)

    indices = {}
    X = np.empty((len(signatures), 1), dtype=np.object)
    for i, signature in enumerate(
            sorted(signatures.values(), key=lambda s: s["signature_id"])):
        X[i, 0] = signature
        indices[signature["signature_id"]] = i

    if blocking_function == "block_last_name_first_initial":
        block_function = block_last_name_first_initial
    else:
        block_function = partial(block_phonetic,
                                 threshold=blocking_threshold,
                                 phonetic_algorithm=blocking_phonetic_alg)

    # Semi-supervised block clustering
    if input_clusters:
        true_clusters = json.load(open(input_clusters, "r"))
        y_true = -np.ones(len(X), dtype=np.int)

        for label, signature_ids in true_clusters.items():
            for signature_id in signature_ids:
                y_true[indices[signature_id]] = label

        y = -np.ones(len(X), dtype=np.int)

        if train_signatures_file:
            train_signatures = json.load(open(train_signatures_file, "r"))
            train_ids = [x['signature_id'] for x in train_signatures]
            del train_signatures
            y[train_ids] = y_true[train_ids]
            test_ids = list(
                set([x['signature_id'] for _, x in signatures.iteritems()]) -
                set(train_ids))
        else:
            y = y_true
    else:
        y = None

    clusterer = BlockClustering(
        blocking=block_function,
        base_estimator=ScipyHierarchicalClustering(
            affinity=_affinity,
            threshold=clustering_threshold,
            method=clustering_method,
            supervised_scoring=b3_f_score),
        verbose=verbose,
        n_jobs=n_jobs).fit(X, y)

    labels = clusterer.labels_

    # Save predicted clusters
    if output_clusters:
        clusters = {}

        for label in np.unique(labels):
            mask = (labels == label)
            clusters[str(label)] = [r[0]["signature_id"] for r in X[mask]]

        json.dump(clusters, open(output_clusters, "w"))

    # Statistics
    if verbose and input_clusters:
        print("Number of blocks =", len(clusterer.clusterers_))
        print("True number of clusters", len(np.unique(y_true)))
        print("Number of computed clusters", len(np.unique(labels)))

        b3_overall = b3_precision_recall_fscore(y_true, labels)
        print("B^3 F-score (overall) =", b3_overall[2])

        if train_signatures_file:
            b3_train = b3_precision_recall_fscore(y_true[train_ids],
                                                  labels[train_ids])
            b3_test = b3_precision_recall_fscore(y_true[test_ids],
                                                 labels[test_ids])
            print("B^3 F-score (train) =", b3_train[2])
            print("B^3 F-score (test) =", b3_test[2])

            if results_file:
                paired_overall = paired_precision_recall_fscore(y_true, labels)
                paired_train = paired_precision_recall_fscore(
                    y_true[train_ids], labels[train_ids])
                paired_test = paired_precision_recall_fscore(
                    y_true[test_ids], labels[test_ids])

                json.dump({
                    "description": ["precision", "recall", "f_score"],
                    "b3": {"overall": list(b3_overall),
                           "train": list(b3_train),
                           "test": list(b3_test)},
                    "paired": {"overall": list(paired_overall),
                               "train": list(paired_train),
                               "test": list(paired_test)}
                }, open(results_file, 'w'))

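# Hedged sketch (an assumption, not the verbatim helper): clustering() above
# hands ScipyHierarchicalClustering an _affinity callable that this excerpt
# does not define. Using the module-level distance_estimator loaded above, it
# could look roughly like this: score every unordered pair of signatures in a
# block with the pairwise model and return a condensed distance vector. This
# assumes the pickled estimator is a classifier whose class 1 means
# "different authors", matching the target encoding in learn_model.
import numpy as np

def _affinity(X):
    """Condensed pairwise distances for the signatures in one block."""
    i_idx, j_idx = np.triu_indices(len(X), k=1)
    pairs = np.empty((len(i_idx), 2), dtype=np.object)
    for k, (i, j) in enumerate(zip(i_idx, j_idx)):
        pairs[k, 0] = X[i, 0]
        pairs[k, 1] = X[j, 0]
    # Probability of "different authors" acts as the distance.
    return distance_estimator.predict_proba(pairs)[:, 1]
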
def clustering(input_signatures, input_records, distance_model,
               input_clusters=None, output_clusters=None, verbose=1,
               n_jobs=-1, clustering_method="average",
               clustering_random_state=42, clustering_test_size=None,
               clustering_threshold=None):
    """Cluster signatures using a pretrained distance model.

    Parameters
    ----------
    :param input_signatures: string
        Path to the file with signatures. The content should be a JSON array
        of dictionaries holding metadata about signatures.

        [{"signature_id": 0,
          "author_name": "Doe, John",
          "publication_id": 10, ...}, { ... }, ...]

    :param input_records: string
        Path to the file with records. The content should be a JSON array of
        dictionaries holding metadata about records.

        [{"publication_id": 0,
          "title": "Author disambiguation using Beard", ... }, { ... }, ...]

    :param distance_model: string
        Path to the file with the distance model. The file should be a pickle
        created using the ``distance.py`` script.

    :param input_clusters: string
        Path to the file with known clusters. The file should be a dictionary,
        where keys are cluster labels and values are the `signature_id` of the
        signatures grouped in the clusters. Signatures assigned to the cluster
        with label "-1" are not clustered.

        {"0": [0, 1, 3], "1": [2, 5], ...}

    :param output_clusters: string
        Path to the file with output clusters. The file will be filled with
        clusters, using the same format as ``input_clusters``.

    :param verbose: int
        If not zero, function will output scores on stdout.

    :param n_jobs: int
        Parameter passed to joblib. Number of threads to be used.

    :param clustering_method: string
        Parameter passed to ``ScipyHierarchicalClustering``. Used only if
        ``clustering_test_size`` is specified.

    :param clustering_random_state: int or RandomState
        Random state for splitting the data into training and test data.

    :param clustering_test_size: float
        Part of data used in the test set.

    :param clustering_threshold: float
        Threshold passed to ``ScipyHierarchicalClustering``.
    """
    # Assumes that 'distance_estimator' lives in the global namespace,
    # making things fast.
    global distance_estimator
    distance_estimator = pickle.load(open(distance_model, "rb"))

    signatures, records = load_signatures(input_signatures, input_records)

    indices = {}
    X = np.empty((len(signatures), 1), dtype=np.object)
    for i, signature in enumerate(sorted(signatures.values(),
                                         key=lambda s: s["signature_id"])):
        X[i, 0] = signature
        indices[signature["signature_id"]] = i

    # Semi-supervised block clustering
    if input_clusters:
        true_clusters = json.load(open(input_clusters, "r"))
        y_true = -np.ones(len(X), dtype=np.int)

        for label, signature_ids in true_clusters.items():
            for signature_id in signature_ids:
                y_true[indices[signature_id]] = label

        if clustering_test_size is not None:
            train, test = train_test_split(
                np.arange(len(X)),
                test_size=clustering_test_size,
                random_state=clustering_random_state)

            y = -np.ones(len(X), dtype=np.int)
            y[train] = y_true[train]
        else:
            y = y_true
    else:
        y = None

    clusterer = BlockClustering(
        blocking=block_last_name_first_initial,
        base_estimator=ScipyHierarchicalClustering(
            affinity=_affinity,
            threshold=clustering_threshold,
            method=clustering_method,
            supervised_scoring=b3_f_score),
        verbose=verbose,
        n_jobs=n_jobs).fit(X, y)

    labels = clusterer.labels_

    # Save predicted clusters
    if output_clusters:
        clusters = {}

        for label in np.unique(labels):
            mask = (labels == label)
            # Cast to str: numpy integer labels are not valid JSON keys.
            clusters[str(label)] = [r[0]["signature_id"] for r in X[mask]]

        json.dump(clusters, open(output_clusters, "w"))

    # Statistics
    if verbose and input_clusters:
        print("Number of blocks =", len(clusterer.clusterers_))
        print("True number of clusters", len(np.unique(y_true)))
        print("Number of computed clusters", len(np.unique(labels)))
        print("B^3 F-score (overall) =", b3_f_score(y_true, labels))

        if clustering_test_size:
            print("B^3 F-score (train) =",
                  b3_f_score(y_true[train], labels[train]))
            print("B^3 F-score (test) =",
                  b3_f_score(y_true[test], labels[test]))

        data = self.remote.get_blended_data("", options.name)
        # FIXME: pretty-printing and sorting here
        keys = data.keys()
        keys.sort()
        for x in keys:
            print "%s : %s" % (x, data[x])
    elif object_action in ["poweron", "poweroff", "powerstatus", "reboot"]:
        power = {}
        power["power"] = object_action.replace("power", "")
        power["systems"] = [options.name]
        task_id = self.remote.background_power_system(power, self.token)
    elif object_action == "update":
        task_id = self.remote.background_signature_update(
            utils.strip_none(vars(options), omit_none=True), self.token)
    elif object_action == "reload":
        filename = opt(options, "filename",
                       "/var/lib/cobbler/distro_signatures.json")
        if not utils.load_signatures(filename, cache=True):
            print "There was an error loading the signature data in %s." % filename
            print "Please check the JSON file or run 'cobbler signature update'."
            return False
        else:
            print "Signatures were successfully loaded"
    else:
        raise exceptions.NotImplementedError()
else:
    raise exceptions.NotImplementedError()

# FIXME: add tail/polling code here
if task_id != -1:
    self.print_task(task_id)
    self.follow_task(task_id)

def clustering(input_signatures, input_records, distance_model,
               input_clusters=None, output_clusters=None, verbose=1,
               n_jobs=-1, clustering_method="average",
               train_signatures_file=None, clustering_threshold=None,
               results_file=None, blocking_function="block_phonetic",
               blocking_threshold=1, blocking_phonetic_alg="nysiis"):
    """Cluster signatures using a pretrained distance model.

    Parameters
    ----------
    :param input_signatures: string
        Path to the file with signatures. The content should be a JSON array
        of dictionaries holding metadata about signatures.

        [{"signature_id": 0,
          "author_name": "Doe, John",
          "publication_id": 10, ...}, { ... }, ...]

    :param input_records: string
        Path to the file with records. The content should be a JSON array of
        dictionaries holding metadata about records.

        [{"publication_id": 0,
          "title": "Author disambiguation using Beard", ... }, { ... }, ...]

    :param distance_model: string
        Path to the file with the distance model. The file should be a pickle
        created using the ``distance.py`` script.

    :param input_clusters: string
        Path to the file with known clusters. The file should be a dictionary,
        where keys are cluster labels and values are the `signature_id` of the
        signatures grouped in the clusters. Signatures assigned to the cluster
        with label "-1" are not clustered.

        {"0": [0, 1, 3], "1": [2, 5], ...}

    :param output_clusters: string
        Path to the file with output clusters. The file will be filled with
        clusters, using the same format as ``input_clusters``.

    :param verbose: int
        If not zero, function will output scores on stdout.

    :param n_jobs: int
        Parameter passed to joblib. Number of threads to be used.

    :param clustering_method: string
        Linkage method passed to ``ScipyHierarchicalClustering``.

    :param train_signatures_file: str
        Path to the file with train set signatures. Format the same as in
        ``input_signatures``.

    :param clustering_threshold: float
        Threshold passed to ``ScipyHierarchicalClustering``.

    :param results_file: str
        Path to the file where the results will be output. It will give
        additional information about the pairwise variant of the scores.

    :param blocking_function: string
        Must be a defined blocking function. Defined functions are:
        - "block_last_name_first_initial"
        - "block_phonetic"

    :param blocking_threshold: int or None
        Maximum allowed size of a block on the last name. It can only be:
        - None; if the blocking function is block_last_name_first_initial
        - int; if the blocking function is block_phonetic
        Please check the documentation of phonetic blocking in
        beard.clustering.blocking_funcs.py.

    :param blocking_phonetic_alg: string or None
        If not None, determines which phonetic algorithm is used. Options:
        - "double_metaphone"
        - "nysiis" (only for Python 2)
        - "soundex" (only for Python 2)
    """
    # Assumes that 'distance_estimator' lives in the global namespace,
    # making things fast.
    global distance_estimator
    distance_estimator = pickle.load(open(distance_model, "rb"))

    try:
        distance_estimator.steps[-1][1].set_params(n_jobs=1)
    except Exception:
        pass

    signatures, records = load_signatures(input_signatures, input_records)

    indices = {}
    X = np.empty((len(signatures), 1), dtype=np.object)
    for i, signature in enumerate(sorted(signatures.values(),
                                         key=lambda s: s["signature_id"])):
        X[i, 0] = signature
        indices[signature["signature_id"]] = i

    if blocking_function == "block_last_name_first_initial":
        block_function = block_last_name_first_initial
    else:
        block_function = partial(block_phonetic,
                                 threshold=blocking_threshold,
                                 phonetic_algorithm=blocking_phonetic_alg)

    # Semi-supervised block clustering
    if input_clusters:
        true_clusters = json.load(open(input_clusters, "r"))
        y_true = -np.ones(len(X), dtype=np.int)

        for label, signature_ids in true_clusters.items():
            for signature_id in signature_ids:
                y_true[indices[signature_id]] = label

        y = -np.ones(len(X), dtype=np.int)

        if train_signatures_file:
            train_signatures = json.load(open(train_signatures_file, "r"))
            train_ids = [x['signature_id'] for x in train_signatures]
            del train_signatures
            y[train_ids] = y_true[train_ids]
            test_ids = list(
                set([x['signature_id'] for _, x in signatures.iteritems()]) -
                set(train_ids))
        else:
            y = y_true
    else:
        y = None

    clusterer = BlockClustering(
        blocking=block_function,
        base_estimator=ScipyHierarchicalClustering(
            affinity=_affinity,
            threshold=clustering_threshold,
            method=clustering_method,
            supervised_scoring=b3_f_score),
        verbose=verbose,
        n_jobs=n_jobs).fit(X, y)

    labels = clusterer.labels_

    # Save predicted clusters
    if output_clusters:
        clusters = {}

        for label in np.unique(labels):
            mask = (labels == label)
            clusters[str(label)] = [r[0]["signature_id"] for r in X[mask]]

        json.dump(clusters, open(output_clusters, "w"))

    # Statistics
    if verbose and input_clusters:
        print("Number of blocks =", len(clusterer.clusterers_))
        print("True number of clusters", len(np.unique(y_true)))
        print("Number of computed clusters", len(np.unique(labels)))

        b3_overall = b3_precision_recall_fscore(y_true, labels)
        print("B^3 F-score (overall) =", b3_overall[2])

        if train_signatures_file:
            b3_train = b3_precision_recall_fscore(y_true[train_ids],
                                                  labels[train_ids])
            b3_test = b3_precision_recall_fscore(y_true[test_ids],
                                                 labels[test_ids])
            print("B^3 F-score (train) =", b3_train[2])
            print("B^3 F-score (test) =", b3_test[2])

            if results_file:
                paired_overall = paired_precision_recall_fscore(y_true, labels)
                paired_train = paired_precision_recall_fscore(
                    y_true[train_ids], labels[train_ids])
                paired_test = paired_precision_recall_fscore(
                    y_true[test_ids], labels[test_ids])

                json.dump({
                    "description": ["precision", "recall", "f_score"],
                    "b3": {"overall": list(b3_overall),
                           "train": list(b3_train),
                           "test": list(b3_test)},
                    "paired": {"overall": list(paired_overall),
                               "train": list(paired_train),
                               "test": list(paired_test)}
                }, open(results_file, 'w'))

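# Hedged post-processing sketch (not part of the original script): the
# predicted clusters are serialized as {"label": [signature_id, ...]}, so
# inverting the mapping gives a signature_id -> cluster label lookup. The
# path below is an illustrative placeholder.
import json

clusters = json.load(open("predicted_clusters.json", "r"))
label_of = {signature_id: label
            for label, signature_ids in clusters.items()
            for signature_id in signature_ids}
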