def do_encode(args): """Generate text embeddings with XTransformer and save to file. Args: args (argparse.Namespace): Command line arguments parsed by `parser.parse_args()` """ if os.path.isdir(args.save_emb_path): args.save_emb_path = os.path.join(args.save_emb_path, "embeddings.npy") xtf = XTransformer.load(args.model_folder) # load instance feature and text X_text = Preprocessor.load_data_from_file(args.text_path, label_text_path=None, text_pos=0)[ "corpus" ] X_emb = xtf.encode( X_text, batch_size=args.batch_size, batch_gen_workers=args.batch_gen_workers, use_gpu=args.use_gpu, max_pred_chunk=args.max_pred_chunk, ) smat_util.save_matrix(args.save_emb_path, X_emb)
def save_feature_matrix(tgt, feat_mat): """Save feature matrix to file Args: tgt (str or file-like object): destination to save the feature matrix feat_mat (sparse matrix or ndarray): feature matrix to save """ smat_util.save_matrix(tgt, feat_mat)
def save(self, folder): """Save to disk. Args: folder (str): Folder to save to. """ os.makedirs(folder, exist_ok=True) with open(os.path.join(folder, "config.json"), "w", encoding="utf-8") as fout: fout.write(json.dumps({"len": len(self)})) for i, C in enumerate(self): smat_util.save_matrix(os.path.join(folder, f"C{i}.npz"), C)
def do_predict(args): """Predict and Evaluate for HNSW model Args: args (argparse.Namespace): Command line arguments parsed by `parser.parse_args()` """ # Load data Xt = smat_util.load_matrix(args.inst_path).astype(np.float32) # Load model model = HNSW.load(args.model_folder) # Setup HNSW Searchers for thread-safe inference threads = os.cpu_count() if args.threads <= 0 else args.threads searchers = model.searchers_create(num_searcher=threads) # Setup prediction params # pred_params.threads will be overrided if searchers are provided in model.predict() pred_params = HNSW.PredParams( efS=args.efSearch, topk=args.only_topk, threads=threads, ) # Model Predicting Yt_pred = model.predict( Xt, pred_params=pred_params, searchers=searchers, ret_csr=True, ) # Save prediction if args.save_pred_path: smat_util.save_matrix(args.save_pred_path, Yt_pred) # Evaluate Recallk@k if args.label_path: Yt = smat_util.load_matrix(args.label_path) # assuming ground truth is similarity-based (larger the better) Yt_topk = smat_util.sorted_csr(Yt, only_topk=args.only_topk) # assuming prediction matrix is distance-based, so need 1-dist=similiarty Yt_pred.data = 1.0 - Yt_pred.data metric = smat_util.Metrics.generate(Yt_topk, Yt_pred, topk=args.only_topk) print("Recall{}@{} {:.6f}%".format(args.only_topk, args.only_topk, 100.0 * metric.recall[-1]))
def do_predict(args): """Predict and Evaluate for xlinear model Args: args (argparse.Namespace): Command line arguments parsed by `parser.parse_args()` """ # Load data Xt = XLinearModel.load_feature_matrix(args.inst_path) if args.selected_output is not None: # Selected Output selected_outputs_csr = XLinearModel.load_feature_matrix( args.selected_output) xlinear_model = XLinearModel.load(args.model_folder, is_predict_only=True, weight_matrix_type="CSC") else: # TopK selected_outputs_csr = None xlinear_model = XLinearModel.load(args.model_folder, is_predict_only=True) # Model Predicting Yt_pred = xlinear_model.predict( Xt, selected_outputs_csr=selected_outputs_csr, only_topk=args.only_topk, beam_size=args.beam_size, post_processor=args.post_processor, threads=args.threads, max_pred_chunk=args.max_pred_chunk, ) # Save prediction if args.save_pred_path: smat_util.save_matrix(args.save_pred_path, Yt_pred) # Evaluate if args.label_path: Yt = XLinearModel.load_label_matrix(args.label_path) metric = smat_util.Metrics.generate(Yt, Yt_pred, topk=10) print("==== evaluation results ====") print(metric)
def do_predict(args): """Predict with XTransformer and save the result. Args: args (argparse.Namespace): Command line arguments parsed by `parser.parse_args()` """ if os.path.isdir(args.save_pred_path): args.save_pred_path = os.path.join(args.save_pred_path, "P.npz") torch_util.set_seed(args.seed) xtf = XTransformer.load(args.model_folder) # load instance feature and text if args.feat_path: X_feat = smat_util.load_matrix(args.feat_path) else: X_feat = None X_text = Preprocessor.load_data_from_file(args.text_path, label_text_path=None, text_pos=0)["corpus"] P_matrix = xtf.predict( X_text, X_feat=X_feat, batch_size=args.batch_size, batch_gen_workers=args.batch_gen_workers, use_gpu=args.use_gpu, beam_size=args.beam_size, only_topk=args.only_topk, post_processor=args.post_processor, max_pred_chunk=args.max_pred_chunk, threads=args.threads, ) smat_util.save_matrix(args.save_pred_path, P_matrix)
def run(args): """Preprocess text using an existing preprocessor. Args: args (argparse.Namespace): Command line argument parsed by `parser.parse_args()` """ preprocessor = Preprocessor.load(args.input_preprocessor_folder) if args.from_file and not args.output_label_path and not args.output_rel_path: Y, R = None, None corpus = args.input_text_path else: result = Preprocessor.load_data_from_file( args.input_text_path, label_text_path=args.label_text_path, maxsplit=args.maxsplit, text_pos=args.text_pos, label_pos=args.label_pos, ) Y = result["label_matrix"] R = result["label_relevance"] corpus = result["corpus"] X = preprocessor.predict( corpus, batch_size=args.batch_size, use_gpu_if_available=args.use_gpu, buffer_size=args.buffer_size, threads=args.threads, ) smat_util.save_matrix(args.output_inst_path, X) if args.output_label_path and Y is not None: smat_util.save_matrix(args.output_label_path, Y) if args.output_rel_path and R is not None: smat_util.save_matrix(args.output_rel_path, R)
@classmethod
def train(
    cls,
    input_text_path,
    output_text_path,
    label_embed_type="pifa",
    vectorizer_config=None,
    train_params=None,
    pred_params=None,
    workspace_folder=None,
    **kwargs,
):
    """Train a Text2Text model.

    Args:
        input_text_path (str): Text input file name. Format: in each line,
            OUTPUT_ID1,OUTPUT_ID2,OUTPUT_ID3,...\t INPUT_TEXT
            where OUTPUT_IDs are the zero-based output item indices corresponding to
            the line numbers of OUTPUT_ITEM_PATH. We assume utf-8 encoding for text.
        output_text_path (str): The file path for output text items. Format: each line
            corresponds to a representation of the output item. We assume utf-8 encoding for text.
        label_embed_type (str or list of str): Label embedding type(s) (default "pifa").
            We support pifa, pifa_lf_concat::Z=path, and pifa_lf_convex_combine::Z=path::alpha=scalar_value.
            Multiple values will lead to different individual models for ensembling.
        vectorizer_config (str or dict): JSON-format string or dict for vectorizer config (default None)
        train_params (Text2Text.TrainParams): params to train Text2Text model
        pred_params (Text2Text.PredParams): params to predict Text2Text model
        workspace_folder (str, default=None): A folder name for storing intermediate
            variables during training
        kwargs: {"beam_size": INT, "only_topk": INT, "post_processor": STR},
            Default None to use HierarchicalMLModel.PredParams defaults

    Returns:
        A Text2Text object
    """
    ws = CachedWorkspace(workspace_folder)
    dtype = np.float32

    # Train Preprocessor and obtain X, Y
    XY_kwargs = dict(
        input_text_path=input_text_path,
        output_text_path=output_text_path,
        vectorizer_config=vectorizer_config,
        dtype=str(dtype),
    )

    # Prepare Preprocessor
    preprocessor_path = ws.get_path_for_name_and_kwargs("preprocessor", XY_kwargs)
    if path.exists(preprocessor_path):
        LOGGER.info("Loading existing preprocessor...")
        preprocessor = Preprocessor.load(preprocessor_path)
    else:
        LOGGER.info("Parsing text files...")
        parsed_result = Preprocessor.load_data_from_file(input_text_path, output_text_path)
        Y = parsed_result["label_matrix"]
        R = parsed_result["label_relevance"]
        corpus = parsed_result["corpus"]
        LOGGER.info(
            f"Training {vectorizer_config['type']} vectorizer on {len(corpus)} input texts..."
        )
        preprocessor = Preprocessor.train(corpus, vectorizer_config, dtype=dtype)
        preprocessor.save(preprocessor_path)

    # Prepare X, X could be dense or sparse
    X_path = ws.get_path_for_name_and_kwargs("X", XY_kwargs)
    if path.exists(X_path):
        X = XLinearModel.load_feature_matrix(X_path)
    else:
        if "corpus" not in locals():
            parsed_result = Preprocessor.load_data_from_file(input_text_path, output_text_path)
            Y = parsed_result["label_matrix"]
            R = parsed_result["label_relevance"]
            corpus = parsed_result["corpus"]
        LOGGER.info(f"Vectorizing {len(corpus)} texts...")
        X = preprocessor.predict(corpus)
        XLinearModel.save_feature_matrix(X_path, X)
    LOGGER.info(
        f"{vectorizer_config['type']} input X loaded: {X.shape[0]} samples with {X.shape[1]} features."
    )

    # Prepare Y, Y is always sparse
    Y_path = ws.get_path_for_name_and_kwargs("Y", XY_kwargs) + ".npz"
    if path.exists(Y_path):
        Y = smat_util.load_matrix(Y_path)
    else:
        if "Y" not in locals():
            parsed_result = Preprocessor.load_data_from_file(input_text_path, output_text_path)
            Y = parsed_result["label_matrix"]
            R = parsed_result["label_relevance"]
        smat_util.save_matrix(Y_path, Y)
    LOGGER.info(f"Output label Y loaded: {Y.shape[0]} samples with {Y.shape[1]} labels.")

    # Prepare R, R should have the same sparsity pattern as Y
    R_path = ws.get_path_for_name_and_kwargs("R", XY_kwargs) + ".npz"
    if path.exists(R_path):
        R = smat_util.load_matrix(R_path)
    else:
        if "R" not in locals():
            parsed_result = Preprocessor.load_data_from_file(input_text_path, output_text_path)
            R = parsed_result["label_relevance"]
        if R is not None:
            smat_util.save_matrix(R_path, R)
    if R is not None:
        LOGGER.info("Relevance matrix R loaded, cost-sensitive learning enabled.")

    # construct indexing, training and prediction params
    if train_params is None:
        # fill all BaseParams class with their default value
        train_params = cls.TrainParams.from_dict(dict(), recursive=True)
    else:
        train_params = cls.TrainParams.from_dict(train_params)

    # construct pred_params
    if pred_params is None:
        # fill all BaseParams with their default value
        pred_params = cls.PredParams.from_dict(dict(), recursive=True)
    else:
        pred_params = cls.PredParams.from_dict(pred_params)
    pred_params = pred_params.override_with_kwargs(kwargs)

    # 1. Generate label features
    label_embed_kwargs = dict(
        input_text_path=input_text_path,
        output_text_path=output_text_path,
        dtype=str(dtype),
        vectorizer_config=vectorizer_config,
        embed_type=label_embed_type,
    )
    label_embed_path = ws.get_path_for_name_and_kwargs("L", label_embed_kwargs)
    if path.exists(label_embed_path):
        LOGGER.info(f"Loading existing {label_embed_type} features for {Y.shape[1]} labels...")
        label_feat = XLinearModel.load_feature_matrix(label_embed_path)
    else:
        LOGGER.info(f"Generating {label_embed_type} features for {Y.shape[1]} labels...")
        # parse embed_type string, expect one of the following three cases:
        # (1) pifa
        # (2) pifa_lf_concat::Z=path
        # (3) pifa_lf_convex_combine::Z=path::alpha=value
        lemb_key_val_list = label_embed_type.split("::")
        lemb_type = lemb_key_val_list[0]
        lemb_kwargs = {}
        for key_val_str in lemb_key_val_list[1:]:
            key, val = key_val_str.split("=")
            if key == "Z":
                Z = smat_util.load_matrix(val)
                lemb_kwargs.update({"Z": Z})
            elif key == "alpha":
                alpha = float(val)
                lemb_kwargs.update({"alpha": alpha})
            else:
                raise ValueError(f"key={key}, val={val} is not supported!")
        if "lf" in lemb_type and lemb_kwargs.get("Z", None) is None:
            raise ValueError(
                "pifa_lf_concat/pifa_lf_convex_combine must provide external path for Z."
            )
        # Create label features
        label_feat = LabelEmbeddingFactory.create(
            Y,
            X,
            method=lemb_type,
            **lemb_kwargs,
        )
        XLinearModel.save_feature_matrix(label_embed_path, label_feat)

    # 2. Indexing
    indexer_kwargs_dict = train_params.indexer_params.to_dict()
    C_path = ws.get_path_for_name_and_kwargs("C", indexer_kwargs_dict)
    if path.exists(C_path):
        LOGGER.info(f"Loading existing clustering code with params {indexer_kwargs_dict}")
        C = ClusterChain.load(C_path)
    else:
        C = Indexer.gen(label_feat, train_params=train_params.indexer_params)
        LOGGER.info("Hierarchical label tree: {}".format([cc.shape[0] for cc in C]))
        C.save(C_path)
    del label_feat
    gc.collect()

    # Ensemble Models
    m = XLinearModel.train(
        X,
        Y,
        C=C,
        R=R,
        train_params=train_params.xlinear_params,
        pred_params=pred_params.xlinear_params,
        pred_kwargs=kwargs,
    )
    xlinear_models = [[m, train_params.to_dict()]]

    # Load output items
    with open(output_text_path, "r", encoding="utf-8") as f:
        output_items = [q.strip() for q in f]

    return cls(preprocessor, xlinear_models, output_items)
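# A hedged usage sketch for Text2Text.train; all file names, the vectorizer
# config, and the workspace folder are illustrative. The tfidf config dict
# mirrors the "type" key the function above reads.
from pecos.apps.text2text import Text2Text

t2t = Text2Text.train(
    input_text_path="./train.txt",           # "IDs\ttext" lines, as documented above
    output_text_path="./output_items.txt",   # one output item per line
    label_embed_type="pifa",                 # or e.g. "pifa_lf_concat::Z=./Z.npz"
    vectorizer_config={"type": "tfidf", "kwargs": {}},
    workspace_folder="./workspace",
)
t2t.save("./t2t-model")  # assumed: the model exposes the usual save/load pair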
@classmethod
def train(
    cls,
    prob,
    clustering=None,
    val_prob=None,
    train_params=None,
    pred_params=None,
    **kwargs,
):
    """Train the XR-Transformer model with the given input data.

    Args:
        prob (MLProblemWithText): ML problem to solve.
        clustering (ClusterChain, optional): preliminary hierarchical label tree
            on which the transformer is fine-tuned.
        val_prob (MLProblemWithText, optional): ML problem for validation.
        train_params (XTransformer.TrainParams): training parameters for XTransformer
        pred_params (XTransformer.PredParams): pred parameters for XTransformer
        kwargs:
            label_feat (ndarray or csr_matrix, optional): label features on which to
                generate preliminary HLT
            saved_trn_pt (str, optional): path to save the tokenized trn text.
                Use a tempdir if not given
            saved_val_pt (str, optional): path to save the tokenized val text.
                Use a tempdir if not given
            matmul_threads (int, optional): number of threads to use for constructing
                label tree. Default to use at most 32 threads
            beam_size (int, optional): overrides only_topk for all models except
                the bottom layer one

    Returns:
        XTransformer
    """
    # tempdir to save tokenized text
    temp_dir = tempfile.TemporaryDirectory()
    saved_trn_pt = kwargs.get("saved_trn_pt", "")
    if not saved_trn_pt:
        saved_trn_pt = f"{temp_dir.name}/X_trn.pt"

    saved_val_pt = kwargs.get("saved_val_pt", "")
    if not saved_val_pt:
        saved_val_pt = f"{temp_dir.name}/X_val.pt"

    # construct train_params
    if train_params is None:
        # fill all BaseParams class with their default value
        train_params = cls.TrainParams.from_dict(dict(), recursive=True)
    else:
        train_params = cls.TrainParams.from_dict(train_params)

    # construct pred_params
    if pred_params is None:
        # fill all BaseParams with their default value
        pred_params = cls.PredParams.from_dict(dict(), recursive=True)
    else:
        pred_params = cls.PredParams.from_dict(pred_params)

    if not train_params.do_fine_tune:
        if isinstance(train_params.matcher_params_chain, list):
            matcher_train_params = train_params.matcher_params_chain[-1]
        else:
            matcher_train_params = train_params.matcher_params_chain

        if isinstance(train_params.matcher_params_chain, list):
            matcher_pred_params = pred_params.matcher_params_chain[-1]
        else:
            matcher_pred_params = pred_params.matcher_params_chain

        device, n_gpu = torch_util.setup_device(matcher_train_params.use_gpu)

        if matcher_train_params.init_model_dir:
            parent_model = cls.load(matcher_train_params.init_model_dir)
            LOGGER.info("Loaded encoder from {}.".format(matcher_train_params.init_model_dir))
        else:
            parent_model = TransformerMatcher.download_model(
                matcher_train_params.model_shortcut,
            )
            LOGGER.info(
                "Downloaded encoder from {}.".format(matcher_train_params.model_shortcut)
            )
        parent_model.to_device(device, n_gpu=n_gpu)

        _, inst_embeddings = parent_model.predict(
            prob.X_text,
            pred_params=matcher_pred_params,
            batch_size=matcher_train_params.batch_size * max(1, n_gpu),
            batch_gen_workers=matcher_train_params.batch_gen_workers,
            only_embeddings=True,
        )
        val_inst_embeddings = None
        if val_prob:
            _, val_inst_embeddings = parent_model.predict(
                val_prob.X_text,
                pred_params=matcher_pred_params,
                batch_size=matcher_train_params.batch_size * max(1, n_gpu),
                batch_gen_workers=matcher_train_params.batch_gen_workers,
                only_embeddings=True,
            )
    else:
        # 1. Constructing primary Hierarchical Label Tree
        if clustering is None:
            label_feat = kwargs.get("label_feat", None)
            if label_feat is None:
                if prob.X_feat is None:
                    raise ValueError(
                        "Instance features are required to generate label features!"
                    )
                label_feat = LabelEmbeddingFactory.pifa(prob.Y, prob.X_feat)

            clustering = Indexer.gen(
                label_feat,
                train_params=train_params.preliminary_indexer_params,
            )
        else:
            # assert cluster chain in clustering is valid
            clustering = ClusterChain(clustering)
            if clustering[-1].shape[0] != prob.nr_labels:
                raise ValueError("nr_labels mismatch!")

        prelim_hierarchy = [cc.shape[0] for cc in clustering]
        LOGGER.info("Hierarchical label tree: {}".format(prelim_hierarchy))

        # get the fine-tuning task numbers
        nr_transformers = sum(i <= train_params.max_match_clusters for i in prelim_hierarchy)
        LOGGER.info(
            "Fine-tune Transformers with nr_labels={}".format(
                [cc.shape[0] for cc in clustering[:nr_transformers]]
            )
        )

        steps_scale = kwargs.get("steps_scale", None)
        if steps_scale is None:
            steps_scale = [1.0] * nr_transformers
        if len(steps_scale) != nr_transformers:
            raise ValueError(f"steps-scale length error: {len(steps_scale)}!={nr_transformers}")

        # construct fields with chain now that we know the depth
        train_params = HierarchicalMLModel._duplicate_fields_with_name_ending_with_chain(
            train_params, cls.TrainParams, nr_transformers
        )
        LOGGER.debug(
            f"XTransformer train_params: {json.dumps(train_params.to_dict(), indent=True)}"
        )
        pred_params = HierarchicalMLModel._duplicate_fields_with_name_ending_with_chain(
            pred_params, cls.PredParams, nr_transformers
        )
        pred_params = pred_params.override_with_kwargs(kwargs)
        LOGGER.debug(
            f"XTransformer pred_params: {json.dumps(pred_params.to_dict(), indent=True)}"
        )

        def get_negative_samples(mat_true, mat_pred, scheme):
            if scheme == "tfn":
                result = smat_util.binarized(mat_true)
            elif scheme == "man":
                result = smat_util.binarized(mat_pred)
            elif "tfn" in scheme and "man" in scheme:
                result = smat_util.binarized(mat_true) + smat_util.binarized(mat_pred)
            else:
                raise ValueError("Unrecognized negative sampling method {}".format(scheme))
            LOGGER.debug(
                f"Construct {scheme} with shape={result.shape} avr_M_nnz={result.nnz/result.shape[0]}"
            )
            return result

        # construct label chain for training and validation set
        # avoid large matmul_threads to prevent overhead in Y.dot(C) and save memory
        matmul_threads = kwargs.get("threads", os.cpu_count())
        matmul_threads = min(32, matmul_threads)
        YC_list = [prob.Y]
        for cur_C in reversed(clustering[1:]):
            Y_t = clib.sparse_matmul(YC_list[-1], cur_C, threads=matmul_threads).tocsr()
            YC_list.append(Y_t)
        YC_list.reverse()

        if val_prob is not None:
            val_YC_list = [val_prob.Y]
            for cur_C in reversed(clustering[1:]):
                Y_t = clib.sparse_matmul(val_YC_list[-1], cur_C, threads=matmul_threads).tocsr()
                val_YC_list.append(Y_t)
            val_YC_list.reverse()

        parent_model = None
        M, val_M = None, None
        M_pred, val_M_pred = None, None
        bootstrapping, inst_embeddings = None, None
        for i in range(nr_transformers):
            cur_train_params = train_params.matcher_params_chain[i]
            cur_pred_params = pred_params.matcher_params_chain[i]
            cur_train_params.max_steps = steps_scale[i] * cur_train_params.max_steps
            cur_train_params.num_train_epochs = (
                steps_scale[i] * cur_train_params.num_train_epochs
            )

            cur_ns = cur_train_params.negative_sampling

            # construct train and val problem for level i
            # note that the final layer does not need X_feat
            if i > 0:
                M = get_negative_samples(YC_list[i - 1], M_pred, cur_ns)
            cur_prob = MLProblemWithText(
                prob.X_text,
                YC_list[i],
                X_feat=None if i == nr_transformers - 1 else prob.X_feat,
                C=clustering[i],
                M=M,
            )
            if val_prob is not None:
                if i > 0:
                    val_M = get_negative_samples(val_YC_list[i - 1], val_M_pred, cur_ns)
                cur_val_prob = MLProblemWithText(
                    val_prob.X_text,
                    val_YC_list[i],
                    X_feat=None if i == nr_transformers - 1 else val_prob.X_feat,
                    C=clustering[i],
                    M=val_M,
                )
            else:
                cur_val_prob = None

            avr_trn_labels = (
                float(cur_prob.M.nnz) / YC_list[i].shape[0]
                if cur_prob.M is not None
                else YC_list[i].shape[1]
            )
            LOGGER.info(
                "Fine-tuning XR-Transformer with {} at level {}, nr_labels={}, avr_M_nnz={}".format(
                    cur_ns, i, YC_list[i].shape[1], avr_trn_labels
                )
            )

            # bootstrapping with previous text_encoder and instance embeddings
            if parent_model is not None:
                init_encoder = deepcopy(parent_model.text_encoder)
                init_text_model = deepcopy(parent_model.text_model)
                bootstrapping = (init_encoder, inst_embeddings, init_text_model)

            # determine whether train prediction and instance embeddings are needed
            return_train_pred = (
                i + 1 < nr_transformers
            ) and "man" in train_params.matcher_params_chain[i + 1].negative_sampling
            return_train_embeddings = (
                i + 1 == nr_transformers
            ) or "linear" in cur_train_params.bootstrap_method

            res_dict = TransformerMatcher.train(
                cur_prob,
                csr_codes=M_pred,
                val_prob=cur_val_prob,
                val_csr_codes=val_M_pred,
                train_params=cur_train_params,
                pred_params=cur_pred_params,
                bootstrapping=bootstrapping,
                return_dict=True,
                return_train_pred=return_train_pred,
                return_train_embeddings=return_train_embeddings,
                saved_trn_pt=saved_trn_pt,
                saved_val_pt=saved_val_pt,
            )
            parent_model = res_dict["matcher"]
            M_pred = res_dict["trn_pred"]
            val_M_pred = res_dict["val_pred"]
            inst_embeddings = res_dict["trn_embeddings"]
            val_inst_embeddings = res_dict["val_embeddings"]

    if train_params.save_emb_dir:
        os.makedirs(train_params.save_emb_dir, exist_ok=True)
        if inst_embeddings is not None:
            smat_util.save_matrix(
                os.path.join(train_params.save_emb_dir, "X.trn.npy"),
                inst_embeddings,
            )
            LOGGER.info(f"Trn embeddings saved to {train_params.save_emb_dir}/X.trn.npy")
        if val_inst_embeddings is not None:
            smat_util.save_matrix(
                os.path.join(train_params.save_emb_dir, "X.val.npy"),
                val_inst_embeddings,
            )
            LOGGER.info(f"Val embeddings saved to {train_params.save_emb_dir}/X.val.npy")

    ranker = None
    if not train_params.only_encoder:
        # construct X_concat
        X_concat = TransformerMatcher.concat_features(
            prob.X_feat,
            inst_embeddings,
            normalize_emb=True,
        )
        del inst_embeddings
        LOGGER.info("Constructed instance feature matrix with shape={}".format(X_concat.shape))

        # 3. construct refined HLT
        if not train_params.fix_clustering:
            clustering = Indexer.gen(
                LabelEmbeddingFactory.pifa(prob.Y, X_concat),
                train_params=train_params.refined_indexer_params,
            )
        LOGGER.info(
            "Hierarchical label tree for ranker: {}".format([cc.shape[0] for cc in clustering])
        )

        # the HLT could have changed depth
        train_params.ranker_params.hlm_args = (
            HierarchicalMLModel._duplicate_fields_with_name_ending_with_chain(
                train_params.ranker_params.hlm_args,
                HierarchicalMLModel.TrainParams,
                len(clustering),
            )
        )
        pred_params.ranker_params.hlm_args = (
            HierarchicalMLModel._duplicate_fields_with_name_ending_with_chain(
                pred_params.ranker_params.hlm_args,
                HierarchicalMLModel.PredParams,
                len(clustering),
            )
        )
        pred_params.ranker_params.override_with_kwargs(kwargs)

        # train the ranker
        LOGGER.info("Start training ranker...")
        ranker = XLinearModel.train(
            X_concat,
            prob.Y,
            C=clustering,
            train_params=train_params.ranker_params,
            pred_params=pred_params.ranker_params,
        )

    return cls(parent_model, ranker)
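# A toy sketch of the label-chain construction above: multiplying the
# instance-to-label matrix Y by each cluster-assignment matrix C maps labels
# to their parent clusters, yielding one training target per tree level.
# Plain scipy .dot stands in for clib.sparse_matmul here.
import numpy as np
import scipy.sparse as smat

Y = smat.csr_matrix(np.array([[1, 0, 1, 0]], dtype=np.float32))  # 1 instance, 4 labels
C = smat.csr_matrix(  # 4 labels assigned to 2 clusters
    np.array([[1, 0], [1, 0], [0, 1], [0, 1]], dtype=np.float32)
)
Y_parent = Y.dot(C).tocsr()  # instance-to-cluster targets for the coarser level
print(Y_parent.toarray())    # [[1. 1.]]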
def do_predict(args): """Predict and Evaluate for xlinear model Args: args (argparse.Namespace): Command line arguments parsed by `parser.parse_args()` """ # Load data LOGGER.info("| loading data begin...") start_time = time.time() Xt = XLinearModel.load_feature_matrix(args.inst_path) Xt = normalize(Xt, axis=1, norm="l2") run_time_data = time.time() - start_time LOGGER.info( "| loading data finsihed | time(s) {:9.4f}".format(run_time_data)) LOGGER.info("| loading model begin...") start_time = time.time() if args.selected_output is not None: # Selected Output selected_outputs_csr = XLinearModel.load_feature_matrix( args.selected_output) xlinear_model = XLinearModel.load(args.model_folder, is_predict_only=True, weight_matrix_type="CSC") else: # TopK selected_outputs_csr = None xlinear_model = XLinearModel.load(args.model_folder, is_predict_only=True) run_time_io = time.time() - start_time LOGGER.info( "| loading model finsihed | time(s) {:9.4f}".format(run_time_io)) # Model Predicting LOGGER.info("| inference model begin...") start_time = time.time() Yt_pred = xlinear_model.predict( Xt, selected_outputs_csr=selected_outputs_csr, only_topk=args.only_topk, beam_size=args.beam_size, post_processor=args.post_processor, threads=args.threads, max_pred_chunk=args.max_pred_chunk, ) run_time_pred = time.time() - start_time LOGGER.info( "| inference model finsihed | time(s) {:9.4f} latency(ms/q) {:9.4f}". format( run_time_pred, run_time_pred / Xt.shape[0] * 1000, )) # Save prediction if args.save_pred_path: smat_util.save_matrix(args.save_pred_path, Yt_pred) # Evaluate if args.label_path: Yt = XLinearModel.load_label_matrix(args.label_path) metric = smat_util.Metrics.generate(Yt, Yt_pred, topk=10) print("==== evaluation results ====") print(metric)
def main():
    parser = argparse.ArgumentParser(description="Prepare data for Giant-XRT")
    parser.add_argument(
        "--raw-text-path",
        type=str,
        required=True,
        help="Path of raw text (.txt file, each row corresponds to a node)",
    )
    parser.add_argument(
        "--vectorizer-config-path",
        type=str,
        required=True,
        help="Path to a json file that specifies the tfidf hyper-parameters",
    )
    parser.add_argument("--data-root-dir", type=str, default="./dataset")
    parser.add_argument("--xrt-data-dir", type=str, default="./proc_data_xrt")
    parser.add_argument("--dataset", type=str, default="ogbn-arxiv")
    parser.add_argument("--max-deg", type=int, default=1000)
    args = parser.parse_args()
    print(args)

    # Save outputs under args.xrt_data_dir/args.dataset
    save_data_dir = os.path.join(args.xrt_data_dir, args.dataset)
    os.makedirs(save_data_dir, exist_ok=True)
    dataset = PygNodePropPredDataset(name=args.dataset, root=args.data_root_dir)
    data = dataset[0]
    edge_index = data.edge_index

    # Make sure edge_index is undirected!!!
    if not is_undirected(edge_index):
        edge_index = to_undirected(edge_index)

    # Filter out nodes whose number of edges >= max_deg
    node_degree = degree(edge_index[0])
    filtered_idx = torch.where(node_degree < args.max_deg)[0]
    print("Number of original nodes: {}".format(data.x.shape[0]))
    print("Number of filtered nodes: {}".format(len(filtered_idx)))

    # Construct and save label matrix (adjacency matrix) Y
    Y_csr_all = smat.csr_matrix(to_scipy_sparse_matrix(edge_index))
    Y_csr_trn = Y_csr_all[filtered_idx]
    smat_util.save_matrix(f"{save_data_dir}/Y.trn.npz", Y_csr_trn)
    smat_util.save_matrix(f"{save_data_dir}/Y.all.npz", Y_csr_all)
    print("Saved Y.trn.npz and Y.all.npz")

    # Apply the same filtering for raw text
    with open(args.raw_text_path, "r") as fin:
        node_text_list = fin.readlines()
    print("|node_text_list|={}".format(len(node_text_list)))
    count = 0
    with open(f"{save_data_dir}/X.trn.txt", "w") as fout:
        for cur_idx, line in enumerate(node_text_list):
            if count < len(filtered_idx) and filtered_idx[count].item() == cur_idx:
                fout.writelines(line)
                count += 1
    assert count == len(filtered_idx), "count={}, len(filtered_idx)={}".format(
        count, len(filtered_idx)
    )
    print("Saved X.trn.txt")

    # Apply the same filtering for tfidf features
    vectorizer_config = Vectorizer.load_config_from_args(args)  # using args.vectorizer_config_path
    preprocessor = Preprocessor.train(node_text_list, vectorizer_config, dtype=np.float32)
    preprocessor.save(f"{save_data_dir}/tfidf-model")
    X_tfidf_all = preprocessor.predict(node_text_list)
    X_tfidf_trn = X_tfidf_all[filtered_idx]
    smat_util.save_matrix(f"{save_data_dir}/X.all.tfidf.npz", X_tfidf_all)
    smat_util.save_matrix(f"{save_data_dir}/X.trn.tfidf.npz", X_tfidf_trn)
    print("Saved X.trn.tfidf.npz and X.all.tfidf.npz")
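# A hedged invocation sketch for the preparation script above; the script
# name and file paths are illustrative.
#
#   python prepare_data.py \
#       --raw-text-path ./X.all.txt \
#       --vectorizer-config-path ./tfidf-config.json \
#       --dataset ogbn-arxiv \
#       --max-deg 1000
#
if __name__ == "__main__":
    main()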