Beispiel #1
0
def test_load_signatures(curated_signature, non_curated_signature):
    """Check that ``load_signatures`` maps each signature's UUID to the signature."""
    fixtures = [curated_signature, non_curated_signature]

    signatures_by_uuid = load_signatures(fixtures)

    # Every input signature is expected under its own "signature_uuid" key.
    expected_signatures_by_uuid = {
        signature["signature_uuid"]: signature for signature in fixtures
    }
    assert signatures_by_uuid == expected_signatures_by_uuid
    def load_data(self, curated_signatures, pairs, pairs_size):
        """Loads training data to the estimator vectors

        Fills ``self.X`` with one row per pair (holding the two signatures of
        that pair) and ``self.y`` with the label of the pair: ``0`` when
        ``pair["same_cluster"]`` is truthy, ``1`` otherwise.

        Args:
            curated_signatures (iterable): Signatures for the training
            pairs (iterable): Pairs of signatures and clusters for the training
            pairs_size (int): Amount of pairs

        Raises:
            IndexError: If ``pairs`` yields more than ``pairs_size`` items.

        Note:
            The arrays are pre-sized from ``pairs_size``; if ``pairs`` yields
            fewer items, the trailing rows are left uninitialised.

        """
        signatures_by_uuid = load_signatures(curated_signatures)

        # The ``np.object`` / ``np.int`` aliases were removed in NumPy 1.24;
        # the builtins select exactly the same dtypes on every NumPy version.
        self.X = np.empty((pairs_size, 2), dtype=object)
        self.y = np.empty(pairs_size, dtype=int)

        for i, pair in enumerate(pairs):
            first_uuid = pair["signature_uuids"][0]
            second_uuid = pair["signature_uuids"][1]
            self.X[i, 0] = signatures_by_uuid[first_uuid]
            self.X[i, 1] = signatures_by_uuid[second_uuid]
            self.y[i] = 0 if pair["same_cluster"] else 1
    def load_data(self, signatures, input_clusters):
        """Loads data to the estimator vectors

        Fills ``self.X`` with one signature per row and ``self.y`` with the
        ``cluster_id`` of the input cluster that lists the signature. Rows are
        written in the order the clusters enumerate their signature UUIDs.

        Args:
            signatures (iterable): Signatures which should be processed
            input_clusters (iterable): Input clusters built for provided signatures
                see: `inspire_disambiguation.core.es.readers.get_input_clusters`

        Note:
            ``self.y`` starts out as ``-1`` everywhere, so trailing rows that
            are never filled (signatures not listed by any cluster) keep the
            label ``-1``.

        """
        signatures_by_uuid = load_signatures(signatures)

        # The ``np.object`` / ``np.int`` aliases were removed in NumPy 1.24;
        # the builtins select exactly the same dtypes on every NumPy version.
        self.X = np.empty((len(signatures_by_uuid), 1), dtype=object)
        self.y = np.full(len(self.X), -1, dtype=int)

        # Next free row; only advanced when a signature is actually stored.
        i = 0
        for cluster in input_clusters:
            for signature_uuid in cluster["signature_uuids"]:
                if signature_uuid not in signatures_by_uuid:
                    continue  # TODO figure out how this can happen
                self.X[i, 0] = signatures_by_uuid[signature_uuid]
                self.y[i] = cluster["cluster_id"]
                i += 1