Example #1
def merge_pseudo_labels(mapper, truth, pred, merge_by="max"):
    """Merge the prediction results of truth and prediction, combining the real labels
    and pseudo labels. The combination algorithm can be max or mean.

    Args:
        mapper: The label mapping matrix. Find the actual label id from the new label id.
        truth: The groundtruth matrix.
        pred: The prediction results.
        merge_by: If "max" is used, aggregate the scores by max operator. If "mean" is 
            used, aggregated by average.
    
    Returns:
        The aggregated scores.
    """
    # pred: N x Lnew, Lnew x Lold ==> N x Lold
    n_labels = pred.shape[1] - len(mapper)
    n_new_labels = pred.shape[1]
    if len(mapper) == 0:
        return truth, pred
    if merge_by == "mean":
        normalized_mapper = build_label_mapping_matrix(mapper, n_labels,
                                                       n_new_labels)
        pred1 = clib.sparse_matmul(pred, normalized_mapper)
        # debiasing
        pred_sign = pred.sign()
        bias = clib.sparse_matmul(pred_sign, normalized_mapper)
        bias.data = 1.0 / np.clip(bias.data, a_min=0.1, a_max=None)
        pred = pred1.multiply(bias)
    else:
        pred = pred.tolil()
        truth = truth.tolil()
        for pseudo_id, real_id in mapper.items():
            pred[:, real_id] = pred[:, real_id].maximum(pred[:, pseudo_id])
    return truth[:, :n_labels], pred[:, :n_labels]
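A minimal usage sketch (made-up shapes and a hypothetical mapper) of how merge_pseudo_labels might be called, assuming truth and pred are scipy.sparse matrices whose last len(mapper) columns are pseudo labels:

# Hypothetical example: 4 real labels plus 2 pseudo labels (columns 4 and 5)
# that map back to real labels 1 and 2.
import numpy as np
import scipy.sparse as smat

truth = smat.csr_matrix(np.eye(3, 6, dtype=np.float32))
pred = smat.csr_matrix(np.random.rand(3, 6).astype(np.float32))
mapper = {4: 1, 5: 2}  # pseudo label id -> real label id

truth_m, pred_m = merge_pseudo_labels(mapper, truth, pred, merge_by="max")
print(truth_m.shape, pred_m.shape)  # both are trimmed to N x 4 after merging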
Example #2
def do_analyze(args):
    # Load Data
    Xt = XLinearModel.load_feature_matrix(args.inst_path)
    Yt = XLinearModel.load_label_matrix(args.label_path)

    # Optionally load mapper
    mapper = {}
    if args.mapper is not None:
        with open(args.mapper, "rb") as reader:
            mapper = pkl.load(reader)
    unused_label_set = {}
    if args.unused_labels is not None:
        with open(args.unused_labels, "rb") as reader:
            unused_label_set = pkl.load(reader)

    # Model prediction
    xlinear_model = XLinearModel.load(args.model_folder)
    kwargs = {
        "beam_size": args.beam_size,
        "only_topk": 160,
        "post_processor": "l3-hinge",
    }

    pred = None
    batch_size = 8192 * 16
    pred_batches = []
    M_batches = []
    for i in range((Xt.shape[0] - 1) // batch_size + 1):
        beg, end = i * batch_size, (i + 1) * batch_size
        end = min(end, Xt.shape[0])
        X_batch = Xt[beg:end, :]
        M_batch = forward_matcher(xlinear_model, X_batch, **kwargs)
        # pred_batch = forward_ranker(xlinear_model, X_batch, M_batch, **kwargs)
        pred_batch = xlinear_model.predict(Xt[beg:end, :], **kwargs)
        M_batches.append(M_batch)
        pred_batches.append(pred_batch)

    Mb = smat_util.binarized(smat.vstack(M_batches))
    C = xlinear_model.model.model_chain[-1].pC.buf
    MC = clib.sparse_matmul(Mb, C.transpose())
    avg_inner_prod = MC.sum(axis=1).mean(axis=0)[0, 0]

    pred = smat.vstack(pred_batches)
    unused_label_transformer = build_score_transformer(unused_label_set,
                                                       pred.shape[1])
    # we set the j-th column of pred to zero iff j is an unused label; this prevents label j
    # from being accidentally ranked to the front.
    pred = clib.sparse_matmul(pred, unused_label_transformer)
    truth = Yt

    print("Merging pseudo labels")
    truth, pred = merge_pseudo_labels(mapper, truth, pred, merge_by="mean")
    truth = truth.tocsr()
    pred = pred.tocsr()
    print("Calculating metrics")
    metric = smat_util.Metrics.generate(truth, pred, topk=10)
    print(metric)
    print("Average #inner prod: ", avg_inner_prod)
Example #3
    def generate_matching_chain(self, M_dict):
        """Generate a chain of instance to cluster matching matrix for user supplied negative (usn) from partial matching chain.

        Args:
            M_dict (dict): dictionary of partial matching chains, with keys being number of layers above leaf elements.
                M_dict[i].shape[0] == nr_inst, for all i.
                M_dict[0].shape[1] == self.chain[-1].shape[0],
                M_dict[i].shape[1] == self.chain[-i].shape[1], for i >= 1
                M_dict.keys() \\subset range(len(self.chain)+1)

        Returns:
            matching_chain: list of csc matrices for user supplied negatives
        """
        matching_chain = [None] * (len(self) + 1)
        # if nothing is given, return a chain of None
        if M_dict is None or all(M_dict[x] is None for x in M_dict):
            return matching_chain

        nr_insts, nr_labels = self.matrix_chain_dimension_check(M_dict)

        # construct matching chain from incomplete chain
        if M_dict.get(0, None) is not None:
            matching_chain[0] = smat_util.binarized(M_dict[0])
        else:
            matching_chain[0] = smat.csc_matrix((nr_insts, nr_labels),
                                                dtype=np.float32)
        for i in range(1, len(self) + 1):
            matching_chain[i] = clib.sparse_matmul(matching_chain[i - 1],
                                                   self.chain[-i])
            if M_dict.get(i, None) is not None:
                matching_chain[i] += smat_util.binarized(M_dict[i])
            matching_chain[i] = matching_chain[i].tocsc().sorted_indices()
        matching_chain.reverse()

        return matching_chain[:-1]
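A small, self-contained illustration (made-up shapes, not from the library) of the aggregation step in the loop above: multiplying an instance-to-child matching matrix by a child-to-parent cluster matrix C yields the instance-to-parent matching for the next level up.

import numpy as np
import scipy.sparse as smat

M_leaf = smat.csr_matrix(np.array([[1, 0, 0, 1],
                                   [0, 1, 0, 0],
                                   [0, 0, 1, 0]], dtype=np.float32))  # 3 instances x 4 leaf clusters
C = smat.csc_matrix(np.array([[1, 0],
                              [1, 0],
                              [0, 1],
                              [0, 1]], dtype=np.float32))             # 4 leaf clusters -> 2 parents

M_parent = M_leaf.dot(C).tocsc().sorted_indices()
print(M_parent.toarray())  # instance 0 now matches both parent clusters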
Example #4
    def generate_relevance_chain(self, R_dict, norm_type=None, induce=True):
        """Generate a chain of instance to cluster relevance matrix for cost sensitive learning from partial relevance chain.

        Args:
            R_dict (dict): dictionary of partial relevance chains, with keys being number of layers above leaf elements.
                R_dict[i].shape[0] == nr_inst, for all i.
                R_dict[0].shape[1] == self.chain[-1].shape[0],
                R_dict[i].shape[1] == self.chain[-i].shape[1], for i >= 1
                R_dict.keys() \\subset range(len(self.chain)+1)
            norm_type (str, optional): row-wise normalization of the resulting relevance matrices. Default None to ignore.
                Options: 'l1', 'l2', 'max', 'no-norm', None
            induce (bool, optional): whether to induce missing relevance matrix by label aggregation. Default True

        Returns:
            relevance_chain: list of csc matrices for relevance
        """

        relevance_chain = [None] * (len(self) + 1)
        # if nothing is given, return a chain of None
        if R_dict is None or all(R_dict[x] is None for x in R_dict):
            return relevance_chain

        self.matrix_chain_dimension_check(R_dict)

        # construct relevance chain from incomplete chain
        relevance_chain[0] = R_dict.get(0, None)
        for i in range(1, len(self) + 1):
            if R_dict.get(i, None) is not None:
                relevance_chain[i] = R_dict[i]
            elif relevance_chain[i - 1] is not None and induce:
                relevance_chain[i] = clib.sparse_matmul(
                    relevance_chain[i - 1], self.chain[-i])
            else:
                relevance_chain[i] = None
        relevance_chain.reverse()

        if norm_type not in [None, "no-norm"]:
            relevance_chain = [
                sk_normalize(rr.tocsr(), norm=norm_type)
                if rr is not None else None for rr in relevance_chain
            ]

        return relevance_chain[1:]
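A quick sketch of what the norm_type option does to each relevance matrix, assuming sk_normalize is sklearn.preprocessing.normalize (which the call signature suggests): every row is rescaled independently by its l1, l2, or max norm.

import numpy as np
import scipy.sparse as smat
from sklearn.preprocessing import normalize as sk_normalize

R = smat.csr_matrix(np.array([[3.0, 1.0, 0.0],
                              [0.0, 2.0, 2.0]], dtype=np.float32))
print(sk_normalize(R, norm="l1").toarray())   # each row sums to 1
print(sk_normalize(R, norm="max").toarray())  # the max entry of each row becomes 1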
Example #5
    def train(
        cls,
        prob,
        clustering=None,
        val_prob=None,
        train_params=None,
        pred_params=None,
        **kwargs,
    ):
        """Train the XR-Transformer model with the given input data.

        Args:
            prob (MLProblemWithText): ML problem to solve.
            clustering (ClusterChain, optional): preliminary hierarchical label tree
                on which the transformer is fine-tuned.
            val_prob (MLProblemWithText, optional): ML problem for validation.
            train_params (XTransformer.TrainParams): training parameters for XTransformer
            pred_params (XTransformer.PredParams): prediction parameters for XTransformer
            kwargs:
                label_feat (ndarray or csr_matrix, optional): label features on which to generate preliminary HLT
                saved_trn_pt (str, optional): path to save the tokenized trn text. Use a tempdir if not given
                saved_val_pt (str, optional): path to save the tokenized val text. Use a tempdir if not given
                matmul_threads (int, optional): number of threads to use for
                    constructing label tree. Default to use at most 32 threads
                beam_size (int, optional): overrides only_topk for all models except
                    the bottom-layer one

        Returns:
            XTransformer
        """
        # tempdir to save tokenized text
        temp_dir = tempfile.TemporaryDirectory()
        saved_trn_pt = kwargs.get("saved_trn_pt", "")
        if not saved_trn_pt:
            saved_trn_pt = f"{temp_dir.name}/X_trn.pt"

        saved_val_pt = kwargs.get("saved_val_pt", "")
        if not saved_val_pt:
            saved_val_pt = f"{temp_dir.name}/X_val.pt"

        # construct train_params
        if train_params is None:
            # fill all BaseParams class with their default value
            train_params = cls.TrainParams.from_dict(dict(), recursive=True)
        else:
            train_params = cls.TrainParams.from_dict(train_params)
        # construct pred_params
        if pred_params is None:
            # fill all BaseParams with their default value
            pred_params = cls.PredParams.from_dict(dict(), recursive=True)
        else:
            pred_params = cls.PredParams.from_dict(pred_params)

        if not train_params.do_fine_tune:
            if isinstance(train_params.matcher_params_chain, list):
                matcher_train_params = train_params.matcher_params_chain[-1]
            else:
                matcher_train_params = train_params.matcher_params_chain

            if isinstance(pred_params.matcher_params_chain, list):
                matcher_pred_params = pred_params.matcher_params_chain[-1]
            else:
                matcher_pred_params = pred_params.matcher_params_chain

            device, n_gpu = torch_util.setup_device(matcher_train_params.use_gpu)

            if matcher_train_params.init_model_dir:
                parent_model = cls.load(matcher_train_params.init_model_dir)
                LOGGER.info("Loaded encoder from {}.".format(matcher_train_params.init_model_dir))
            else:
                parent_model = TransformerMatcher.download_model(
                    matcher_train_params.model_shortcut,
                )
                LOGGER.info(
                    "Downloaded encoder from {}.".format(matcher_train_params.model_shortcut)
                )

            parent_model.to_device(device, n_gpu=n_gpu)
            _, inst_embeddings = parent_model.predict(
                prob.X_text,
                pred_params=matcher_pred_params,
                batch_size=matcher_train_params.batch_size * max(1, n_gpu),
                batch_gen_workers=matcher_train_params.batch_gen_workers,
                only_embeddings=True,
            )
            if val_prob:
                _, val_inst_embeddings = parent_model.predict(
                    val_prob.X_text,
                    pred_params=matcher_pred_params,
                    batch_size=matcher_train_params.batch_size * max(1, n_gpu),
                    batch_gen_workers=matcher_train_params.batch_gen_workers,
                    only_embeddings=True,
                )
            else:
                val_inst_embeddings = None
        else:
            # 1. Constructing primary Hierarchical Label Tree
            if clustering is None:
                label_feat = kwargs.get("label_feat", None)
                if label_feat is None:
                    if prob.X_feat is None:
                        raise ValueError(
                            "Instance features are required to generate label features!"
                        )
                    label_feat = LabelEmbeddingFactory.pifa(prob.Y, prob.X_feat)

                clustering = Indexer.gen(
                    label_feat,
                    train_params=train_params.preliminary_indexer_params,
                )
            else:
                # assert cluster chain in clustering is valid
                clustering = ClusterChain(clustering)
                if clustering[-1].shape[0] != prob.nr_labels:
                    raise ValueError("nr_labels mismatch!")
            prelim_hierarchy = [cc.shape[0] for cc in clustering]
            LOGGER.info("Hierarchical label tree: {}".format(prelim_hierarchy))

            # get the fine-tuning task numbers
            nr_transformers = sum(i <= train_params.max_match_clusters for i in prelim_hierarchy)

            LOGGER.info(
                "Fine-tune Transformers with nr_labels={}".format(
                    [cc.shape[0] for cc in clustering[:nr_transformers]]
                )
            )

            steps_scale = kwargs.get("steps_scale", None)
            if steps_scale is None:
                steps_scale = [1.0] * nr_transformers
            if len(steps_scale) != nr_transformers:
                raise ValueError(f"steps-scale length error: {len(steps_scale)}!={nr_transformers}")

            # construct fields with chain now we know the depth
            train_params = HierarchicalMLModel._duplicate_fields_with_name_ending_with_chain(
                train_params, cls.TrainParams, nr_transformers
            )

            LOGGER.debug(
                f"XTransformer train_params: {json.dumps(train_params.to_dict(), indent=True)}"
            )

            pred_params = HierarchicalMLModel._duplicate_fields_with_name_ending_with_chain(
                pred_params, cls.PredParams, nr_transformers
            )
            pred_params = pred_params.override_with_kwargs(kwargs)

            LOGGER.debug(
                f"XTransformer pred_params: {json.dumps(pred_params.to_dict(), indent=True)}"
            )

            def get_negative_samples(mat_true, mat_pred, scheme):
                if scheme == "tfn":
                    result = smat_util.binarized(mat_true)
                elif scheme == "man":
                    result = smat_util.binarized(mat_pred)
                elif "tfn" in scheme and "man" in scheme:
                    result = smat_util.binarized(mat_true) + smat_util.binarized(mat_pred)
                else:
                    raise ValueError("Unrecognized negative sampling method {}".format(scheme))
                LOGGER.debug(
                    f"Construct {scheme} with shape={result.shape} avr_M_nnz={result.nnz/result.shape[0]}"
                )
                return result

            # construct label chain for training and validation set
            # avoid large matmul_threads to prevent overhead in Y.dot(C) and save memory
            matmul_threads = kwargs.get("threads", os.cpu_count())
            matmul_threads = min(32, matmul_threads)
            YC_list = [prob.Y]
            for cur_C in reversed(clustering[1:]):
                Y_t = clib.sparse_matmul(YC_list[-1], cur_C, threads=matmul_threads).tocsr()
                YC_list.append(Y_t)
            YC_list.reverse()

            if val_prob is not None:
                val_YC_list = [val_prob.Y]
                for cur_C in reversed(clustering[1:]):
                    Y_t = clib.sparse_matmul(val_YC_list[-1], cur_C, threads=matmul_threads).tocsr()
                    val_YC_list.append(Y_t)
                val_YC_list.reverse()

            parent_model = None
            M, val_M = None, None
            M_pred, val_M_pred = None, None
            bootstrapping, inst_embeddings = None, None
            for i in range(nr_transformers):
                cur_train_params = train_params.matcher_params_chain[i]
                cur_pred_params = pred_params.matcher_params_chain[i]
                cur_train_params.max_steps = steps_scale[i] * cur_train_params.max_steps
                cur_train_params.num_train_epochs = (
                    steps_scale[i] * cur_train_params.num_train_epochs
                )

                cur_ns = cur_train_params.negative_sampling

                # construct train and val problem for level i
                # note that final layer do not need X_feat
                if i > 0:
                    M = get_negative_samples(YC_list[i - 1], M_pred, cur_ns)

                cur_prob = MLProblemWithText(
                    prob.X_text,
                    YC_list[i],
                    X_feat=None if i == nr_transformers - 1 else prob.X_feat,
                    C=clustering[i],
                    M=M,
                )
                if val_prob is not None:
                    if i > 0:
                        val_M = get_negative_samples(val_YC_list[i - 1], val_M_pred, cur_ns)
                    cur_val_prob = MLProblemWithText(
                        val_prob.X_text,
                        val_YC_list[i],
                        X_feat=None if i == nr_transformers - 1 else val_prob.X_feat,
                        C=clustering[i],
                        M=val_M,
                    )
                else:
                    cur_val_prob = None

                avr_trn_labels = (
                    float(cur_prob.M.nnz) / YC_list[i].shape[0]
                    if cur_prob.M is not None
                    else YC_list[i].shape[1]
                )
                LOGGER.info(
                    "Fine-tuning XR-Transformer with {} at level {}, nr_labels={}, avr_M_nnz={}".format(
                        cur_ns, i, YC_list[i].shape[1], avr_trn_labels
                    )
                )

                # bootstrapping with previous text_encoder and instance embeddings
                if parent_model is not None:
                    init_encoder = deepcopy(parent_model.text_encoder)
                    init_text_model = deepcopy(parent_model.text_model)
                    bootstrapping = (init_encoder, inst_embeddings, init_text_model)

                # determine whether train prediction and instance embeddings are needed
                return_train_pred = (
                    i + 1 < nr_transformers
                ) and "man" in train_params.matcher_params_chain[i + 1].negative_sampling
                return_train_embeddings = (
                    i + 1 == nr_transformers
                ) or "linear" in cur_train_params.bootstrap_method

                res_dict = TransformerMatcher.train(
                    cur_prob,
                    csr_codes=M_pred,
                    val_prob=cur_val_prob,
                    val_csr_codes=val_M_pred,
                    train_params=cur_train_params,
                    pred_params=cur_pred_params,
                    bootstrapping=bootstrapping,
                    return_dict=True,
                    return_train_pred=return_train_pred,
                    return_train_embeddings=return_train_embeddings,
                    saved_trn_pt=saved_trn_pt,
                    saved_val_pt=saved_val_pt,
                )
                parent_model = res_dict["matcher"]
                M_pred = res_dict["trn_pred"]
                val_M_pred = res_dict["val_pred"]
                inst_embeddings = res_dict["trn_embeddings"]
                val_inst_embeddings = res_dict["val_embeddings"]

        if train_params.save_emb_dir:
            os.makedirs(train_params.save_emb_dir, exist_ok=True)
            if inst_embeddings is not None:
                smat_util.save_matrix(
                    os.path.join(train_params.save_emb_dir, "X.trn.npy"),
                    inst_embeddings,
                )
                LOGGER.info(f"Trn embeddings saved to {train_params.save_emb_dir}/X.trn.npy")
            if val_inst_embeddings is not None:
                smat_util.save_matrix(
                    os.path.join(train_params.save_emb_dir, "X.val.npy"),
                    val_inst_embeddings,
                )
                LOGGER.info(f"Val embeddings saved to {train_params.save_emb_dir}/X.val.npy")

        ranker = None
        if not train_params.only_encoder:
            # construct X_concat
            X_concat = TransformerMatcher.concat_features(
                prob.X_feat,
                inst_embeddings,
                normalize_emb=True,
            )
            del inst_embeddings
            LOGGER.info("Constructed instance feature matrix with shape={}".format(X_concat.shape))

            # 3. construct refined HLT
            if not train_params.fix_clustering:
                clustering = Indexer.gen(
                    LabelEmbeddingFactory.pifa(prob.Y, X_concat),
                    train_params=train_params.refined_indexer_params,
                )
            LOGGER.info(
                "Hierarchical label tree for ranker: {}".format([cc.shape[0] for cc in clustering])
            )

            # the HLT could have changed depth
            train_params.ranker_params.hlm_args = (
                HierarchicalMLModel._duplicate_fields_with_name_ending_with_chain(
                    train_params.ranker_params.hlm_args,
                    HierarchicalMLModel.TrainParams,
                    len(clustering),
                )
            )
            pred_params.ranker_params.hlm_args = (
                HierarchicalMLModel._duplicate_fields_with_name_ending_with_chain(
                    pred_params.ranker_params.hlm_args,
                    HierarchicalMLModel.PredParams,
                    len(clustering),
                )
            )
            pred_params.ranker_params.override_with_kwargs(kwargs)

            # train the ranker
            LOGGER.info("Start training ranker...")

            ranker = XLinearModel.train(
                X_concat,
                prob.Y,
                C=clustering,
                train_params=train_params.ranker_params,
                pred_params=pred_params.ranker_params,
            )

        return cls(parent_model, ranker)
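A minimal sketch (toy shapes, not the library's API) of the YC_list construction used during fine-tuning above: the instance-to-label matrix Y is repeatedly multiplied by cluster matrices so each level of the label tree gets its own instance-to-cluster target matrix.

import numpy as np
import scipy.sparse as smat

Y = smat.csr_matrix(np.array([[1, 0, 0, 1],
                              [0, 1, 1, 0]], dtype=np.float32))   # 2 instances x 4 labels
C1 = smat.csc_matrix(np.array([[1, 0], [1, 0], [0, 1], [0, 1]],
                              dtype=np.float32))                  # 4 labels -> 2 clusters

YC_list = [Y]
for cur_C in [C1]:                     # reversed(clustering[1:]) in the code above
    YC_list.append(YC_list[-1].dot(cur_C).tocsr())
YC_list.reverse()
print([yc.shape for yc in YC_list])    # [(2, 2), (2, 4)], coarse to fine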
Example #6
def do_spmm_exp(args):
    # load data
    Y = smat_util.load_matrix(args.y_npz_path).astype(np.float32)
    X = smat_util.load_matrix(args.x_npz_path).astype(np.float32)
    YT_csr = Y.T.tocsr()
    X_csr = X.tocsr()

    # The number of threads is controlled by env variables (except for pecos)
    # e.g., export OMP_NUM_THREADS=16, export MKL_NUM_THREADS=16.
    run_time = 0.0
    if args.spmm_algo == "pecos":
        start = time.time()
        Z = pecos_clib.sparse_matmul(
            YT_csr,
            X_csr,
            eliminate_zeros=False,
            sorted_indices=True,
            threads=args.threads,
        )
        run_time += time.time() - start
        Z_data = Z.data
    elif args.spmm_algo == "intel-mkl":
        from sparse_dot_mkl import dot_product_mkl
        # make sure to set the index type to int64 for large matrices
        # export MKL_INTERFACE_LAYER=ILP64
        start = time.time()
        Z = dot_product_mkl(YT_csr, X_csr, reorder_output=True)
        run_time += time.time() - start
        Z_data = Z.data
    elif args.spmm_algo == "scipy":
        # scipy will not sort the indices within each row,
        # so we do it explicitly
        start = time.time()
        Z = YT_csr.dot(X_csr)
        Z.sort_indices()
        run_time += time.time() - start
        Z_data = Z.data
    elif args.spmm_algo == "pytorch":
        import torch

        def get_pt_data(A_csr):
            A_indices, A_values = csr_to_coo(A_csr)
            A_pt = torch.sparse_coo_tensor(
                A_indices.T.astype(np.int64),
                A_values.astype(np.float32),
                A_csr.shape,
            )
            return A_pt

        YT_pt = get_pt_data(YT_csr)
        X_pt = get_pt_data(X_csr)
        start = time.time()
        Z_pt = torch.sparse.mm(YT_pt, X_pt)
        run_time += time.time() - start
        Z_data = Z_pt.coalesce().values().numpy()
    elif args.spmm_algo == "tensorflow":
        import tensorflow.compat.v1 as tf
        from tensorflow.python.ops.linalg.sparse import sparse_csr_matrix_ops

        def get_tf_data(A_csr):
            # Define (COO format) Sparse Tensors over Numpy arrays
            A_indices, A_values = csr_to_coo(A_csr)
            A_st = tf.sparse.SparseTensor(
                A_indices.astype(np.int64),
                A_values.astype(np.float32),
                A_csr.shape,
            )
            return A_st

        # Tensorflow (v2.5.0) usage, as of 07/20/2021:
        # https://www.tensorflow.org/api_docs/python/tf/raw_ops/SparseMatrixSparseMatMul
        with tf.Session() as sess:
            YT_st = get_tf_data(YT_csr)
            X_st = get_tf_data(X_csr)
            sess.run(YT_st)
            sess.run(X_st)
            YT_sm = sparse_csr_matrix_ops.sparse_tensor_to_csr_sparse_matrix(
                YT_st.indices, YT_st.values, YT_st.dense_shape)
            X_sm = sparse_csr_matrix_ops.sparse_tensor_to_csr_sparse_matrix(
                X_st.indices, X_st.values, X_st.dense_shape)
            start = time.time()
            Z_sm = sparse_csr_matrix_ops.sparse_matrix_sparse_mat_mul(
                a=YT_sm, b=X_sm, type=tf.float32)
            Z_st = sparse_csr_matrix_ops.csr_sparse_matrix_to_sparse_tensor(
                Z_sm, tf.float32)
            Z_data = sess.run(Z_st.values)
            run_time += time.time() - start
    else:
        raise ValueError(f"spmm_algo={args.spmm_algo} is not valid")
    print(
        "algo {:16s} time(s) {:9.5f} nnz(Z) {:12d} mu(Z.data) {:8.4f}".format(
            args.spmm_algo,
            run_time,
            len(Z_data),
            np.mean(Z_data),
        ))
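csr_to_coo is referenced in the torch and tensorflow branches above but not defined here; a plausible implementation (an assumption, not the original helper) returns an (nnz, 2) index array and the matching value array, which is the layout both branches expect:

import numpy as np

def csr_to_coo(A_csr):
    # Convert a scipy CSR matrix into COO-style (row, col) index pairs and values.
    A_coo = A_csr.tocoo()
    indices = np.stack([A_coo.row, A_coo.col], axis=1)  # shape (nnz, 2)
    values = A_coo.data
    return indices, values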