def merge_pseudo_labels(mapper, truth, pred, merge_by="max"):
    """Merge pseudo-label columns into their real-label columns in both the
    ground-truth and prediction matrices. The combination operator can be max or mean.

    Args:
        mapper: Dict that maps each pseudo label id to its real label id.
        truth: The ground-truth matrix.
        pred: The prediction results.
        merge_by: If "max", aggregate the scores with the max operator.
            If "mean", aggregate by averaging.

    Returns:
        The aggregated scores.
    """
    # pred: N x Lnew, Lnew x Lold ==> N x Lold
    n_labels = pred.shape[1] - len(mapper)
    n_new_labels = pred.shape[1]
    if len(mapper) == 0:
        return truth, pred
    if merge_by == "mean":
        normalized_mapper = build_label_mapping_matrix(mapper, n_labels, n_new_labels)
        pred1 = clib.sparse_matmul(pred, normalized_mapper)
        # debiasing
        pred_sign = pred.sign()
        bias = clib.sparse_matmul(pred_sign, normalized_mapper)
        bias.data = 1.0 / np.clip(bias.data, a_min=0.1, a_max=None)
        pred = pred1.multiply(bias)
    else:
        pred = pred.tolil()
        truth = truth.tolil()
        for pseudo_id, real_id in mapper.items():
            pred[:, real_id] = pred[:, real_id].maximum(pred[:, pseudo_id])
    return truth[:, :n_labels], pred[:, :n_labels]
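# The sketch below illustrates the "max" merge semantics of merge_pseudo_labels
# using plain scipy in place of pecos' clib. The toy mapper {3: 1} (pseudo label 3
# folds into real label 1) and the matrices are illustrative assumptions, not real data.
def _demo_merge_pseudo_labels_max():
    import numpy as np
    import scipy.sparse as smat

    toy_pred = smat.lil_matrix(
        np.array([[0.2, 0.1, 0.0, 0.9], [0.0, 0.5, 0.3, 0.4]], dtype=np.float32)
    )
    toy_mapper = {3: 1}
    n_real = toy_pred.shape[1] - len(toy_mapper)
    for pseudo_id, real_id in toy_mapper.items():
        toy_pred[:, real_id] = toy_pred[:, real_id].maximum(toy_pred[:, pseudo_id])
    # column 1 now holds max(real, pseudo): [0.9, 0.5];
    # the pseudo column is then dropped by truncating to the real labels
    return toy_pred[:, :n_real].tocsr()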
def do_analyze(args):
    # Load data
    Xt = XLinearModel.load_feature_matrix(args.inst_path)
    Yt = XLinearModel.load_label_matrix(args.label_path)

    # Optionally load mapper
    mapper = {}
    if args.mapper is not None:
        with open(args.mapper, "rb") as reader:
            mapper = pkl.load(reader)
    unused_label_set = {}
    if args.unused_labels is not None:
        with open(args.unused_labels, "rb") as reader:
            unused_label_set = pkl.load(reader)

    # Model prediction
    xlinear_model = XLinearModel.load(args.model_folder)
    kwargs = {
        "beam_size": args.beam_size,
        "only_topk": 160,
        "post_processor": "l3-hinge",
    }
    batch_size = 8192 * 16
    pred_batches = []
    M_batches = []
    for i in range((Xt.shape[0] - 1) // batch_size + 1):
        beg, end = i * batch_size, (i + 1) * batch_size
        end = min(end, Xt.shape[0])
        X_batch = Xt[beg:end, :]
        M_batch = forward_matcher(xlinear_model, X_batch, **kwargs)
        # pred_batch = forward_ranker(xlinear_model, X_batch, M_batch, **kwargs)
        pred_batch = xlinear_model.predict(Xt[beg:end, :], **kwargs)
        M_batches.append(M_batch)
        pred_batches.append(pred_batch)
    Mb = smat_util.binarized(smat.vstack(M_batches))
    C = xlinear_model.model.model_chain[-1].pC.buf
    MC = clib.sparse_matmul(Mb, C.transpose())
    avg_inner_prod = MC.sum(axis=1).mean(axis=0)[0, 0]
    pred = smat.vstack(pred_batches)

    unused_label_transformer = build_score_transformer(unused_label_set, pred.shape[1])
    # We set the j-th column of pred to zero iff j is an unused label; this prevents
    # label j from being accidentally ranked to the front.
    pred = clib.sparse_matmul(pred, unused_label_transformer)
    truth = Yt

    print("Merging pseudo labels")
    truth, pred = merge_pseudo_labels(mapper, truth, pred, merge_by="mean")
    truth = truth.tocsr()
    pred = pred.tocsr()

    print("Calculating metrics")
    metric = smat_util.Metrics.generate(truth, pred, topk=10)
    print(metric)
    print("Average #inner prod: ", avg_inner_prod)
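# build_score_transformer is not defined in this file; per the comment above, it zeroes
# out unused-label columns. The sketch below shows one plausible realization of that
# masking idea as a sparse diagonal matrix, using plain scipy in place of pecos' clib.
# The helper name and exact pecos implementation are assumptions.
def _demo_unused_label_mask():
    import numpy as np
    import scipy.sparse as smat

    n_labels = 5
    unused = {1, 3}
    diag = np.ones(n_labels, dtype=np.float32)
    diag[list(unused)] = 0.0
    mask = smat.diags(diag, format="csr")

    pred = smat.csr_matrix(np.arange(10, dtype=np.float32).reshape(2, 5))
    masked = pred.dot(mask)  # columns 1 and 3 are now all zeros
    return masked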
def generate_matching_chain(self, M_dict):
    """Generate a chain of instance-to-cluster matching matrices for user supplied
    negatives (usn) from a partial matching chain.

    Args:
        M_dict (dict): dictionary of partial matching chains, with keys being the
            number of layers above the leaf elements.
            M_dict[i].shape[0] == nr_inst, for all i.
            M_dict[0].shape[1] == self.chain[-1].shape[0],
            M_dict[i].shape[1] == self.chain[-i].shape[1], for i >= 1
            M_dict.keys() \\subset range(len(self.chain)+1)

    Returns:
        matching_chain: list of csc matrices for user supplied negatives
    """
    matching_chain = [None] * (len(self) + 1)
    # if nothing is given, return a chain of None
    if M_dict is None or all(M_dict[x] is None for x in M_dict):
        return matching_chain
    nr_insts, nr_labels = self.matrix_chain_dimension_check(M_dict)

    # construct the matching chain from the incomplete chain
    if M_dict.get(0, None) is not None:
        matching_chain[0] = smat_util.binarized(M_dict[0])
    else:
        matching_chain[0] = smat.csc_matrix((nr_insts, nr_labels), dtype=np.float32)
    for i in range(1, len(self) + 1):
        matching_chain[i] = clib.sparse_matmul(matching_chain[i - 1], self.chain[-i])
        if M_dict.get(i, None) is not None:
            matching_chain[i] += smat_util.binarized(M_dict[i])
        matching_chain[i] = matching_chain[i].tocsc().sorted_indices()
    matching_chain.reverse()
    return matching_chain[:-1]
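# The sketch below illustrates the upward propagation performed in the loop above,
# using plain scipy in place of clib: a leaf-level matching matrix M is lifted one
# level by M @ C, where C maps leaf clusters to their parents, then binarized (like
# smat_util.binarized). The toy chain is an illustrative assumption.
def _demo_matching_chain_propagation():
    import numpy as np
    import scipy.sparse as smat

    # 2 instances matched against 4 leaf clusters
    M0 = smat.csc_matrix(np.array([[1, 0, 0, 1], [0, 1, 0, 0]], dtype=np.float32))
    # 4 leaf clusters grouped into 2 parents
    C = smat.csc_matrix(np.array([[1, 0], [1, 0], [0, 1], [0, 1]], dtype=np.float32))
    M1 = (M0.dot(C) > 0).astype(np.float32)  # parent-level matching, binarized
    return M1.tocsc().sorted_indices()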
def generate_relevance_chain(self, R_dict, norm_type=None, induce=True):
    """Generate a chain of instance-to-cluster relevance matrices for cost sensitive
    learning from a partial relevance chain.

    Args:
        R_dict (dict): dictionary of partial relevance chains, with keys being the
            number of layers above the leaf elements.
            R_dict[i].shape[0] == nr_inst, for all i.
            R_dict[0].shape[1] == self.chain[-1].shape[0],
            R_dict[i].shape[1] == self.chain[-i].shape[1], for i >= 1
            R_dict.keys() \\subset range(len(self.chain)+1)
        norm_type (str, optional): row-wise normalization of the resulting relevance
            matrices. Default None to ignore. Options: 'l1', 'l2', 'max', 'no-norm', None
        induce (bool, optional): whether to induce a missing relevance matrix by
            label aggregation. Default True

    Returns:
        relevance_chain: list of csc matrices for relevance
    """
    relevance_chain = [None] * (len(self) + 1)
    # if nothing is given, return a chain of None
    if R_dict is None or all(R_dict[x] is None for x in R_dict):
        return relevance_chain
    self.matrix_chain_dimension_check(R_dict)

    # construct the relevance chain from the incomplete chain
    relevance_chain[0] = R_dict.get(0, None)
    for i in range(1, len(self) + 1):
        if R_dict.get(i, None) is not None:
            relevance_chain[i] = R_dict[i]
        elif relevance_chain[i - 1] is not None and induce:
            relevance_chain[i] = clib.sparse_matmul(relevance_chain[i - 1], self.chain[-i])
        else:
            relevance_chain[i] = None
    relevance_chain.reverse()
    if norm_type not in [None, "no-norm"]:
        relevance_chain = [
            sk_normalize(rr.tocsr(), norm=norm_type) if rr is not None else None
            for rr in relevance_chain
        ]
    return relevance_chain[1:]
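# The sketch below illustrates relevance induction and normalization as done above,
# using plain scipy/sklearn in place of clib: leaf-level relevance R is aggregated to
# the parent level by R @ C and then l1-normalized row-wise. The toy matrices are
# illustrative assumptions.
def _demo_relevance_induction():
    import numpy as np
    import scipy.sparse as smat
    from sklearn.preprocessing import normalize as sk_normalize

    R0 = smat.csr_matrix(
        np.array([[0.5, 0.0, 0.0, 1.5], [0.0, 2.0, 1.0, 0.0]], dtype=np.float32)
    )
    C = smat.csr_matrix(np.array([[1, 0], [1, 0], [0, 1], [0, 1]], dtype=np.float32))
    R1 = R0.dot(C)  # induced parent-level relevance: [[0.5, 1.5], [2.0, 1.0]]
    return sk_normalize(R1.tocsr(), norm="l1")  # each row now sums to 1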
@classmethod
def train(
    cls,
    prob,
    clustering=None,
    val_prob=None,
    train_params=None,
    pred_params=None,
    **kwargs,
):
    """Train the XR-Transformer model with the given input data.

    Args:
        prob (MLProblemWithText): ML problem to solve.
        clustering (ClusterChain, optional): preliminary hierarchical label tree
            on which the transformer is fine-tuned.
        val_prob (MLProblemWithText, optional): ML problem for validation.
        train_params (XTransformer.TrainParams): training parameters for XTransformer
        pred_params (XTransformer.PredParams): pred parameters for XTransformer
        kwargs:
            label_feat (ndarray or csr_matrix, optional): label features on which to
                generate the preliminary HLT
            saved_trn_pt (str, optional): path to save the tokenized trn text.
                Use a tempdir if not given
            saved_val_pt (str, optional): path to save the tokenized val text.
                Use a tempdir if not given
            threads (int, optional): number of threads to use for constructing the
                label tree. Default to use at most 32 threads
            steps_scale (list of float, optional): per-level scaling of max_steps
                and num_train_epochs. Default 1.0 for every level
            beam_size (int, optional): overrides only_topk for all models except
                the bottom-layer one

    Returns:
        XTransformer
    """
    # tempdir to save tokenized text
    temp_dir = tempfile.TemporaryDirectory()
    saved_trn_pt = kwargs.get("saved_trn_pt", "")
    if not saved_trn_pt:
        saved_trn_pt = f"{temp_dir.name}/X_trn.pt"
    saved_val_pt = kwargs.get("saved_val_pt", "")
    if not saved_val_pt:
        saved_val_pt = f"{temp_dir.name}/X_val.pt"

    # construct train_params
    if train_params is None:
        # fill all BaseParams classes with their default values
        train_params = cls.TrainParams.from_dict(dict(), recursive=True)
    else:
        train_params = cls.TrainParams.from_dict(train_params)
    # construct pred_params
    if pred_params is None:
        # fill all BaseParams classes with their default values
        pred_params = cls.PredParams.from_dict(dict(), recursive=True)
    else:
        pred_params = cls.PredParams.from_dict(pred_params)

    if not train_params.do_fine_tune:
        if isinstance(train_params.matcher_params_chain, list):
            matcher_train_params = train_params.matcher_params_chain[-1]
        else:
            matcher_train_params = train_params.matcher_params_chain
        if isinstance(pred_params.matcher_params_chain, list):
            matcher_pred_params = pred_params.matcher_params_chain[-1]
        else:
            matcher_pred_params = pred_params.matcher_params_chain

        device, n_gpu = torch_util.setup_device(matcher_train_params.use_gpu)

        if matcher_train_params.init_model_dir:
            parent_model = cls.load(matcher_train_params.init_model_dir)
            LOGGER.info("Loaded encoder from {}.".format(matcher_train_params.init_model_dir))
        else:
            parent_model = TransformerMatcher.download_model(
                matcher_train_params.model_shortcut,
            )
            LOGGER.info(
                "Downloaded encoder from {}.".format(matcher_train_params.model_shortcut)
            )
        parent_model.to_device(device, n_gpu=n_gpu)
        _, inst_embeddings = parent_model.predict(
            prob.X_text,
            pred_params=matcher_pred_params,
            batch_size=matcher_train_params.batch_size * max(1, n_gpu),
            batch_gen_workers=matcher_train_params.batch_gen_workers,
            only_embeddings=True,
        )
        if val_prob:
            _, val_inst_embeddings = parent_model.predict(
                val_prob.X_text,
                pred_params=matcher_pred_params,
                batch_size=matcher_train_params.batch_size * max(1, n_gpu),
                batch_gen_workers=matcher_train_params.batch_gen_workers,
                only_embeddings=True,
            )
        else:
            val_inst_embeddings = None
    else:
        # 1. Construct the primary Hierarchical Label Tree
        if clustering is None:
            label_feat = kwargs.get("label_feat", None)
            if label_feat is None:
                if prob.X_feat is None:
                    raise ValueError(
                        "Instance features are required to generate label features!"
                    )
                label_feat = LabelEmbeddingFactory.pifa(prob.Y, prob.X_feat)
            clustering = Indexer.gen(
                label_feat,
                train_params=train_params.preliminary_indexer_params,
            )
        else:
            # assert that the cluster chain in clustering is valid
            clustering = ClusterChain(clustering)
            if clustering[-1].shape[0] != prob.nr_labels:
                raise ValueError("nr_labels mismatch!")
        prelim_hierarchy = [cc.shape[0] for cc in clustering]
        LOGGER.info("Hierarchical label tree: {}".format(prelim_hierarchy))

        # get the number of fine-tuning tasks
        nr_transformers = sum(i <= train_params.max_match_clusters for i in prelim_hierarchy)
        LOGGER.info(
            "Fine-tune Transformers with nr_labels={}".format(
                [cc.shape[0] for cc in clustering[:nr_transformers]]
            )
        )
        steps_scale = kwargs.get("steps_scale", None)
        if steps_scale is None:
            steps_scale = [1.0] * nr_transformers
        if len(steps_scale) != nr_transformers:
            raise ValueError(f"steps-scale length error: {len(steps_scale)}!={nr_transformers}")

        # construct fields ending with _chain now that we know the depth
        train_params = HierarchicalMLModel._duplicate_fields_with_name_ending_with_chain(
            train_params, cls.TrainParams, nr_transformers
        )
        LOGGER.debug(
            f"XTransformer train_params: {json.dumps(train_params.to_dict(), indent=True)}"
        )
        pred_params = HierarchicalMLModel._duplicate_fields_with_name_ending_with_chain(
            pred_params, cls.PredParams, nr_transformers
        )
        pred_params = pred_params.override_with_kwargs(kwargs)
        LOGGER.debug(
            f"XTransformer pred_params: {json.dumps(pred_params.to_dict(), indent=True)}"
        )

        def get_negative_samples(mat_true, mat_pred, scheme):
            if scheme == "tfn":
                result = smat_util.binarized(mat_true)
            elif scheme == "man":
                result = smat_util.binarized(mat_pred)
            elif "tfn" in scheme and "man" in scheme:
                result = smat_util.binarized(mat_true) + smat_util.binarized(mat_pred)
            else:
                raise ValueError("Unrecognized negative sampling method {}".format(scheme))
            LOGGER.debug(
                f"Construct {scheme} with shape={result.shape} avr_M_nnz={result.nnz / result.shape[0]}"
            )
            return result

        # construct label chains for the training and validation sets;
        # avoid large matmul_threads to prevent overhead in Y.dot(C) and to save memory
        matmul_threads = kwargs.get("threads", os.cpu_count())
        matmul_threads = min(32, matmul_threads)
        YC_list = [prob.Y]
        for cur_C in reversed(clustering[1:]):
            Y_t = clib.sparse_matmul(YC_list[-1], cur_C, threads=matmul_threads).tocsr()
            YC_list.append(Y_t)
        YC_list.reverse()
        if val_prob is not None:
            val_YC_list = [val_prob.Y]
            for cur_C in reversed(clustering[1:]):
                Y_t = clib.sparse_matmul(val_YC_list[-1], cur_C, threads=matmul_threads).tocsr()
                val_YC_list.append(Y_t)
            val_YC_list.reverse()

        # 2. Fine-tune the transformer matchers level by level
        parent_model = None
        M, val_M = None, None
        M_pred, val_M_pred = None, None
        bootstrapping, inst_embeddings = None, None
        val_inst_embeddings = None
        for i in range(nr_transformers):
            cur_train_params = train_params.matcher_params_chain[i]
            cur_pred_params = pred_params.matcher_params_chain[i]
            cur_train_params.max_steps = steps_scale[i] * cur_train_params.max_steps
            cur_train_params.num_train_epochs = (
                steps_scale[i] * cur_train_params.num_train_epochs
            )

            cur_ns = cur_train_params.negative_sampling

            # construct the train and val problems for level i;
            # note that the final layer does not need X_feat
            if i > 0:
                M = get_negative_samples(YC_list[i - 1], M_pred, cur_ns)

            cur_prob = MLProblemWithText(
                prob.X_text,
                YC_list[i],
                X_feat=None if i == nr_transformers - 1 else prob.X_feat,
                C=clustering[i],
                M=M,
            )
            if val_prob is not None:
                if i > 0:
                    val_M = get_negative_samples(val_YC_list[i - 1], val_M_pred, cur_ns)
                cur_val_prob = MLProblemWithText(
                    val_prob.X_text,
                    val_YC_list[i],
                    X_feat=None if i == nr_transformers - 1 else val_prob.X_feat,
                    C=clustering[i],
                    M=val_M,
                )
            else:
                cur_val_prob = None

            avr_trn_labels = (
                float(cur_prob.M.nnz) / YC_list[i].shape[0]
                if cur_prob.M is not None
                else YC_list[i].shape[1]
            )
            LOGGER.info(
                "Fine-tuning XR-Transformer with {} at level {}, nr_labels={}, avr_M_nnz={}".format(
                    cur_ns, i, YC_list[i].shape[1], avr_trn_labels
                )
            )

            # bootstrapping with the previous text_encoder and instance embeddings
            if parent_model is not None:
                init_encoder = deepcopy(parent_model.text_encoder)
                init_text_model = deepcopy(parent_model.text_model)
                bootstrapping = (init_encoder, inst_embeddings, init_text_model)

            # determine whether train predictions and instance embeddings are needed
            return_train_pred = (
                i + 1 < nr_transformers
            ) and "man" in train_params.matcher_params_chain[i + 1].negative_sampling
            return_train_embeddings = (
                i + 1 == nr_transformers
            ) or "linear" in cur_train_params.bootstrap_method

            res_dict = TransformerMatcher.train(
                cur_prob,
                csr_codes=M_pred,
                val_prob=cur_val_prob,
                val_csr_codes=val_M_pred,
                train_params=cur_train_params,
                pred_params=cur_pred_params,
                bootstrapping=bootstrapping,
                return_dict=True,
                return_train_pred=return_train_pred,
                return_train_embeddings=return_train_embeddings,
                saved_trn_pt=saved_trn_pt,
                saved_val_pt=saved_val_pt,
            )
            parent_model = res_dict["matcher"]
            M_pred = res_dict["trn_pred"]
            val_M_pred = res_dict["val_pred"]
            inst_embeddings = res_dict["trn_embeddings"]
            val_inst_embeddings = res_dict["val_embeddings"]

    if train_params.save_emb_dir:
        os.makedirs(train_params.save_emb_dir, exist_ok=True)
        if inst_embeddings is not None:
            smat_util.save_matrix(
                os.path.join(train_params.save_emb_dir, "X.trn.npy"),
                inst_embeddings,
            )
            LOGGER.info(f"Trn embeddings saved to {train_params.save_emb_dir}/X.trn.npy")
        if val_inst_embeddings is not None:
            smat_util.save_matrix(
                os.path.join(train_params.save_emb_dir, "X.val.npy"),
                val_inst_embeddings,
            )
            LOGGER.info(f"Val embeddings saved to {train_params.save_emb_dir}/X.val.npy")

    ranker = None
    if not train_params.only_encoder:
        # construct X_concat from instance features and normalized embeddings
        X_concat = TransformerMatcher.concat_features(
            prob.X_feat,
            inst_embeddings,
            normalize_emb=True,
        )
        del inst_embeddings
        LOGGER.info("Constructed instance feature matrix with shape={}".format(X_concat.shape))

        # 3. Construct the refined HLT
        if not train_params.fix_clustering:
            clustering = Indexer.gen(
                LabelEmbeddingFactory.pifa(prob.Y, X_concat),
                train_params=train_params.refined_indexer_params,
            )
        LOGGER.info(
            "Hierarchical label tree for ranker: {}".format([cc.shape[0] for cc in clustering])
        )

        # the HLT could have changed depth
        train_params.ranker_params.hlm_args = (
            HierarchicalMLModel._duplicate_fields_with_name_ending_with_chain(
                train_params.ranker_params.hlm_args,
                HierarchicalMLModel.TrainParams,
                len(clustering),
            )
        )
        pred_params.ranker_params.hlm_args = (
            HierarchicalMLModel._duplicate_fields_with_name_ending_with_chain(
                pred_params.ranker_params.hlm_args,
                HierarchicalMLModel.PredParams,
                len(clustering),
            )
        )
        pred_params.ranker_params.override_with_kwargs(kwargs)

        # train the ranker
        LOGGER.info("Start training ranker...")
        ranker = XLinearModel.train(
            X_concat,
            prob.Y,
            C=clustering,
            train_params=train_params.ranker_params,
            pred_params=pred_params.ranker_params,
        )
    return cls(parent_model, ranker)
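# A minimal usage sketch for the train() classmethod above. The toy corpus, label
# matrix, feature matrix, and the idea of calling train() with all-default parameters
# are illustrative assumptions; real runs load text, TF-IDF features, and label
# matrices from disk. Assumes XTransformer and MLProblemWithText are in scope.
def _demo_xtransformer_train():
    import numpy as np
    import scipy.sparse as smat

    X_text = ["red apple", "green pear", "yellow banana", "ripe apple"]
    # 4 instances x 3 labels ground truth, CSR
    Y = smat.csr_matrix(
        np.array([[1, 0, 0], [0, 1, 0], [0, 0, 1], [1, 0, 0]], dtype=np.float32)
    )
    # hypothetical instance features (e.g. TF-IDF), CSR
    X_feat = smat.csr_matrix(np.random.rand(4, 16).astype(np.float32))

    prob = MLProblemWithText(X_text, Y, X_feat=X_feat)
    # train_params/pred_params fall back to their defaults when omitted
    model = XTransformer.train(prob)
    return model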
def do_spmm_exp(args):
    # load data
    Y = smat_util.load_matrix(args.y_npz_path).astype(np.float32)
    X = smat_util.load_matrix(args.x_npz_path).astype(np.float32)
    YT_csr = Y.T.tocsr()
    X_csr = X.tocsr()

    # The #threads is controlled by env variables (except for pecos),
    # e.g., export OMP_NUM_THREADS=16, export MKL_NUM_THREADS=16.
    run_time = 0.0
    if args.spmm_algo == "pecos":
        start = time.time()
        Z = pecos_clib.sparse_matmul(
            YT_csr,
            X_csr,
            eliminate_zeros=False,
            sorted_indices=True,
            threads=args.threads,
        )
        run_time += time.time() - start
        Z_data = Z.data
    elif args.spmm_algo == "intel-mkl":
        from sparse_dot_mkl import dot_product_mkl

        # make sure to set the index type to int64 for large matrices:
        # export MKL_INTERFACE_LAYER=ILP64
        start = time.time()
        Z = dot_product_mkl(YT_csr, X_csr, reorder_output=True)
        run_time += time.time() - start
        Z_data = Z.data
    elif args.spmm_algo == "scipy":
        # scipy does not sort the indices within each row,
        # so we do it explicitly
        start = time.time()
        Z = YT_csr.dot(X_csr)
        Z.sort_indices()
        run_time += time.time() - start
        Z_data = Z.data
    elif args.spmm_algo == "pytorch":
        import torch

        def get_pt_data(A_csr):
            A_indices, A_values = csr_to_coo(A_csr)
            A_pt = torch.sparse_coo_tensor(
                A_indices.T.astype(np.int64),
                A_values.astype(np.float32),
                A_csr.shape,
            )
            return A_pt

        YT_pt = get_pt_data(YT_csr)
        X_pt = get_pt_data(X_csr)
        start = time.time()
        Z_pt = torch.sparse.mm(YT_pt, X_pt)
        run_time += time.time() - start
        Z_data = Z_pt.coalesce().values().numpy()
    elif args.spmm_algo == "tensorflow":
        import tensorflow.compat.v1 as tf
        from tensorflow.python.ops.linalg.sparse import sparse_csr_matrix_ops

        def get_tf_data(A_csr):
            # Define (COO format) Sparse Tensors over Numpy arrays
            A_indices, A_values = csr_to_coo(A_csr)
            A_st = tf.sparse.SparseTensor(
                A_indices.astype(np.int64),
                A_values.astype(np.float32),
                A_csr.shape,
            )
            return A_st

        # Tensorflow (v2.5.0) usage, as of 07/20/2021:
        # https://www.tensorflow.org/api_docs/python/tf/raw_ops/SparseMatrixSparseMatMul
        with tf.Session() as sess:
            YT_st = get_tf_data(YT_csr)
            X_st = get_tf_data(X_csr)
            sess.run(YT_st)
            sess.run(X_st)
            YT_sm = sparse_csr_matrix_ops.sparse_tensor_to_csr_sparse_matrix(
                YT_st.indices, YT_st.values, YT_st.dense_shape
            )
            X_sm = sparse_csr_matrix_ops.sparse_tensor_to_csr_sparse_matrix(
                X_st.indices, X_st.values, X_st.dense_shape
            )
            start = time.time()
            Z_sm = sparse_csr_matrix_ops.sparse_matrix_sparse_mat_mul(
                a=YT_sm, b=X_sm, type=tf.float32
            )
            Z_st = sparse_csr_matrix_ops.csr_sparse_matrix_to_sparse_tensor(
                Z_sm, tf.float32
            )
            Z_data = sess.run(Z_st.values)
            run_time += time.time() - start
    else:
        raise ValueError(f"spmm_algo={args.spmm_algo} is not valid")

    print(
        "algo {:16s} time(s) {:9.5f} nnz(Z) {:12d} mu(Z.data) {:8.4f}".format(
            args.spmm_algo,
            run_time,
            len(Z_data),
            np.mean(Z_data),
        )
    )
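# The benchmark above relies on a csr_to_coo helper that is not defined in this file.
# The sketch below shows one plausible implementation, under the assumption that it
# returns an (nnz, 2) row/col index array plus the matching value array, which is the
# layout the torch/tensorflow constructors above expect.
def _demo_csr_to_coo():
    import numpy as np
    import scipy.sparse as smat

    def csr_to_coo(A_csr):
        A_coo = A_csr.tocoo()
        indices = np.stack([A_coo.row, A_coo.col], axis=1)  # shape (nnz, 2)
        return indices, A_coo.data

    A = smat.random(4, 6, density=0.3, format="csr", dtype=np.float32)
    indices, values = csr_to_coo(A)
    return indices, values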