Code Example #1
File: test_hnsw.py  Project: OctoberChang/pecos
def test_save_and_load(tmpdir):
    import random
    import numpy as np
    from pytest import approx
    from pecos.ann.hnsw import HNSW
    from pecos.utils import smat_util

    random.seed(1234)
    np.random.seed(1234)
    X_trn = smat_util.load_matrix(
        "test/tst-data/ann/X.trn.l2-normalized.npy").astype(np.float32)
    X_tst = smat_util.load_matrix(
        "test/tst-data/ann/X.tst.l2-normalized.npy").astype(np.float32)
    model_folder = tmpdir.join("hnsw_model_dir")

    train_params = HNSW.TrainParams(M=36, efC=90, metric_type="ip", threads=1)
    pred_params = HNSW.PredParams(efS=80, topk=10, threads=1)
    model = HNSW.train(
        X_trn,
        train_params=train_params,
        pred_params=pred_params,
    )
    Yp_from_mem, _ = model.predict(X_tst, ret_csr=False)
    model.save(model_folder)
    del model

    model = HNSW.load(model_folder)
    Yp_from_file, _ = model.predict(X_tst,
                                    pred_params=pred_params,
                                    ret_csr=False)
    assert Yp_from_mem == approx(
        Yp_from_file,
        abs=0.0), "save and load failed: Yp_from_mem != Yp_from_file"
Code Example #2
    @classmethod
    def load(cls, path_to_cluster):
        """Load from disk.

        Args:
            path_to_cluster (str): Folder where `ClusterChain` was saved to using `ClusterChain.save`.

        Returns:
            ClusterChain: The loaded object.
        """

        if os.path.isfile(path_to_cluster):
            C = smat_util.load_matrix(path_to_cluster)
            return cls.from_partial_chain(C)

        config_path = os.path.join(path_to_cluster, "config.json")
        if not os.path.exists(config_path):
            raise ValueError(
                f"Cluster config file, {config_path}, does not exist")

        with open(config_path, "r", encoding="utf-8") as fin:
            config = json.loads(fin.read())
            length = config.get("len", None)
            if length is None:
                raise ValueError(
                    f'Cluster config file, {config_path}, does not have "len" parameter'
                )

        chain = []
        for i in range(length):
            chain.append(
                smat_util.load_matrix(
                    os.path.join(path_to_cluster,
                                 f"C{i}.npz")).tocsc().astype(np.float32))

        return cls(chain)
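
For context, a minimal usage sketch of the loader above; the paths are hypothetical and the import path is assumed (adjust to wherever `ClusterChain` lives in your installation):

from pecos.xmc import ClusterChain  # import path assumed

# Case 1: a folder written by ClusterChain.save (contains config.json and C0.npz, C1.npz, ...)
chain = ClusterChain.load("work/clustering")   # hypothetical path
print([C.shape for C in chain])                # one clustering matrix per level

# Case 2: a single clustering matrix file; load() falls back to from_partial_chain
chain = ClusterChain.load("work/C.npz")        # hypothetical path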
Code Example #3
def preprocessor_cli(tmpdir, config_path, src_input_file, tgt_input_file, label_file):
    import subprocess
    import shlex

    model_folder = str(tmpdir.join("vectorizer"))
    x_file = str(tmpdir.join("x"))
    y_file = str(tmpdir.join("y.npz"))

    # Build
    cmd = []
    cmd += ["python3 -m pecos.utils.featurization.text.preprocess"]
    cmd += ["build"]
    cmd += ["-i {}".format(src_input_file)]
    cmd += ["--text-pos 1"]
    cmd += ["--vectorizer-config-path {}".format(config_path)]
    cmd += ["-m {}".format(model_folder)]
    print(" ".join(cmd))
    process = subprocess.run(shlex.split(" ".join(cmd)),
                             stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE)
    assert process.returncode == 0

    # Run
    cmd = []
    cmd += ["python3 -m pecos.utils.featurization.text.preprocess"]
    cmd += ["run"]
    cmd += ["-i {}".format(src_input_file)]
    cmd += ["-l {}".format(label_file)]
    cmd += ["-p {}".format(model_folder)]
    cmd += ["-x {}".format(x_file)]
    cmd += ["-y {}".format(y_file)]
    cmd += ["--text-pos 1"]
    cmd += ["--label-pos 0"]
    cmd += ["--threads 1"]
    print(" ".join(cmd))
    process = subprocess.run(shlex.split(" ".join(cmd)),
                             stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE)
    assert process.returncode == 0
    X = smat_util.load_matrix(x_file)
    Xtgt = smat_util.load_matrix(tgt_input_file)
    assert_matrix_equal(Xtgt, X)

    # Run without labels
    cmd = []
    cmd += ["python3 -m pecos.utils.featurization.text.preprocess"]
    cmd += ["run"]
    cmd += ["-i {}".format(src_input_file)]
    cmd += ["-p {}".format(model_folder)]
    cmd += ["-x {}".format(x_file)]
    cmd += ["--text-pos 1"]
    print(" ".join(cmd))
    process = subprocess.run(shlex.split(" ".join(cmd)),
                             stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE)
    assert process.returncode == 0
    X = smat_util.load_matrix(x_file)
    Xtgt = smat_util.load_matrix(tgt_input_file)
    assert_matrix_equal(Xtgt, X)
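
For reference, a rough Python-API equivalent of the CLI build/run steps above, pieced together from the `Preprocessor` calls that appear in the other examples here; the file path and vectorizer config are placeholders:

from pecos.utils.featurization.text.preprocess import Preprocessor

# "build": train a vectorizer on raw text and save it
parsed = Preprocessor.load_data_from_file("train.txt", label_text_path=None, text_pos=0)  # placeholder path
preprocessor = Preprocessor.train(parsed["corpus"], {"type": "tfidf", "kwargs": {}})       # placeholder config
preprocessor.save("vectorizer_model")

# "run": load the saved vectorizer and featurize the text into a feature matrix X
preprocessor = Preprocessor.load("vectorizer_model")
X = preprocessor.predict(parsed["corpus"])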
Code Example #4
def do_evaluation(args):
    """ Evaluate xlinear predictions """
    assert len(args.tags) == len(args.pred_path)
    Y_true = sorted_csr(load_matrix(args.truth_path).tocsr())
    Y_pred = [sorted_csr(load_matrix(pp).tocsr()) for pp in args.pred_path]
    print("==== evaluation results ====")
    CsrEnsembler.print_ens(Y_true,
                           Y_pred,
                           args.tags,
                           ens_method=args.ens_method)
Code Example #5
File: evaluate.py  Project: OctoberChang/pecos
def do_evaluation(args):
    """Evaluate xlinear predictions

    Args:
        args (argparse.Namespace): Command line arguments parsed by `parser.parse_args()`
    """

    Y_true = smat_util.load_matrix(args.truth_path).tocsr()
    Y_pred = smat_util.load_matrix(args.pred_path).tocsr()
    metric = smat_util.Metrics.generate(Y_true, Y_pred, topk=args.topk)
    print("==== evaluation results ====")
    print(metric)
Code Example #6
def do_predict(args):
    """Predict and Evaluate for HNSW model

    Args:
        args (argparse.Namespace): Command line arguments parsed by `parser.parse_args()`
    """

    # Load data
    Xt = smat_util.load_matrix(args.inst_path).astype(np.float32)

    # Load model
    model = HNSW.load(args.model_folder)

    # Setup HNSW Searchers for thread-safe inference
    threads = os.cpu_count() if args.threads <= 0 else args.threads
    searchers = model.searchers_create(num_searcher=threads)

    # Setup prediction params
    # pred_params.threads will be overridden if searchers are provided to model.predict()
    pred_params = HNSW.PredParams(
        efS=args.efSearch,
        topk=args.only_topk,
        threads=threads,
    )

    # Model Predicting
    Yt_pred = model.predict(
        Xt,
        pred_params=pred_params,
        searchers=searchers,
        ret_csr=True,
    )

    # Save prediction
    if args.save_pred_path:
        smat_util.save_matrix(args.save_pred_path, Yt_pred)

    # Evaluate Recallk@k
    if args.label_path:
        Yt = smat_util.load_matrix(args.label_path)
        # assuming ground truth is similarity-based (larger is better)
        Yt_topk = smat_util.sorted_csr(Yt, only_topk=args.only_topk)
        # the prediction matrix is distance-based, so convert distance to similarity via 1 - dist
        Yt_pred.data = 1.0 - Yt_pred.data
        metric = smat_util.Metrics.generate(Yt_topk,
                                            Yt_pred,
                                            topk=args.only_topk)
        print("Recall{}@{} {:.6f}%".format(args.only_topk, args.only_topk,
                                           100.0 * metric.recall[-1]))
Code Example #7
def test_xtransformer_python_api():
    import numpy as np
    from pecos.utils import smat_util
    from pecos.utils.featurization.text.preprocess import Preprocessor

    from pecos.xmc.xtransformer.model import XTransformer
    from pecos.xmc.xtransformer.module import MLProblemWithText

    X_trn_file = "test/tst-data/xmc/xtransformer/train.txt"
    Y_trn_file = "test/tst-data/xmc/xtransformer/train_label.npz"

    trn_corpus = Preprocessor.load_data_from_file(
        X_trn_file,
        label_text_path=None,
        text_pos=0,
    )["corpus"]
    X_trn = smat_util.load_matrix(train_feat_file, dtype=np.float32)
    Y_trn = smat_util.load_matrix(Y_trn_file, dtype=np.float32)
    trn_prob = MLProblemWithText(trn_corpus, Y_trn, X_feat=X_trn)
    train_params = XTransformer.TrainParams.from_dict({}, recursive=True)

    train_params.matcher_params_chain.init_model_dir = bert_model_path
    train_params.matcher_params_chain.batch_size = 1
    train_params.matcher_params_chain.num_train_epochs = 1
    train_params.matcher_params_chain.save_steps = 2
    train_params.matcher_params_chain.batch_gen_workers = 2

    pred_params = XTransformer.PredParams.from_dict({}, recursive=True)
    pred_params.matcher_params_chain.only_topk = 2
    pred_params.ranker_params.hlm_args.model_chain.only_topk = 2

    print(train_params.to_dict())
    print(pred_params.to_dict())

    xtf = XTransformer.train(
        trn_prob,
        train_params=train_params,
        pred_params=pred_params,
    )
    P = xtf.predict(trn_corpus, X_trn)
    metric = smat_util.Metrics.generate(Y_trn, P, topk=10)
    std_output = "prec   = 100.00 100.00 66.67 50.00 40.00 33.33 28.57 25.00 22.22 20.00\nrecall = 41.67 83.33 83.33 83.33 83.33 83.33 83.33 83.33 83.33 83.33"
    assert str(metric) == std_output, f"{str(metric)} != {std_output}"
Code Example #8
File: model.py  Project: OctoberChang/pecos
    @staticmethod
    def load_label_matrix(src, for_training=False):
        """Load label matrix from file

        Args:
            src (str): path of the file to load the label matrix from
            for_training (bool, optional): if False (default), return csr_matrix; otherwise return csc_matrix

        Returns:
            matrix (csr_matrix or csc_matrix): loaded label matrix
        """
        assert isinstance(src, str), "src for load_label_matrix must be a str"
        dtype = np.float32
        feat_mat = smat_util.load_matrix(src)
        feat_mat = feat_mat.tocsc() if for_training else feat_mat.tocsr()
        return feat_mat.astype(dtype)
Code Example #9
File: model.py  Project: OctoberChang/pecos
    @staticmethod
    def load_feature_matrix(src):
        """Load feature matrix from file

        Args:
            src (str or file-like object): file to load the feature matrix

        Returns:
            matrix (csr_matrix or ndarray): loaded feature matrix
        """
        feat_mat = smat_util.load_matrix(src)
        if isinstance(feat_mat, np.ndarray):
            feat_mat = np.ascontiguousarray(feat_mat)
        elif isinstance(feat_mat, smat.spmatrix):
            feat_mat = feat_mat.tocsr()
            feat_mat.sort_indices()
        return feat_mat
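
A short usage sketch of the two loaders above (paths are hypothetical); this mirrors how they are called in the training scripts further below:

# hypothetical paths
X = XLinearModel.load_feature_matrix("data/X.trn.npz")                   # csr with sorted indices, or contiguous ndarray
Y = XLinearModel.load_label_matrix("data/Y.trn.npz", for_training=True)  # float32 csc for training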
Code Example #10
File: train.py  Project: OctoberChang/pecos
def do_train(args):
    """Train and Save HNSW model

    Args:
        args (argparse.Namespace): Command line arguments parsed by `parser.parse_args()`
    """

    # Create model folder
    if not os.path.exists(args.model_folder):
        os.makedirs(args.model_folder)

    # Load training inputs
    X = smat_util.load_matrix(args.inst_path).astype(np.float32)

    # Setup training and prediction params
    # Note that prediction params can be overridden at inference time
    train_params = HNSW.TrainParams(
        M=args.max_edge_per_node,
        efC=args.efConstruction,
        metric_type=args.metric_type,
        max_level_upper_bound=args.max_level_upper_bound,
        threads=args.threads,
    )
    pred_params = HNSW.PredParams(
        efS=args.efSearch,
        topk=args.only_topk,
        threads=args.threads,
    )

    # train and save HNSW indexer
    model = HNSW.train(
        X,
        train_params=train_params,
        pred_params=pred_params,
    )

    model.save(args.model_folder)
Code Example #11
def do_predict(args):
    """Predict with XTransformer and save the result.

    Args:
        args (argparse.Namespace): Command line arguments parsed by `parser.parse_args()`
    """
    if os.path.isdir(args.save_pred_path):
        args.save_pred_path = os.path.join(args.save_pred_path, "P.npz")

    torch_util.set_seed(args.seed)

    xtf = XTransformer.load(args.model_folder)

    # load instance feature and text
    if args.feat_path:
        X_feat = smat_util.load_matrix(args.feat_path)
    else:
        X_feat = None
    X_text = Preprocessor.load_data_from_file(args.text_path,
                                              label_text_path=None,
                                              text_pos=0)["corpus"]

    P_matrix = xtf.predict(
        X_text,
        X_feat=X_feat,
        batch_size=args.batch_size,
        batch_gen_workers=args.batch_gen_workers,
        use_gpu=args.use_gpu,
        beam_size=args.beam_size,
        only_topk=args.only_topk,
        post_processor=args.post_processor,
        max_pred_chunk=args.max_pred_chunk,
        threads=args.threads,
    )

    smat_util.save_matrix(args.save_pred_path, P_matrix)
Code Example #12
File: mlp.py  Project: OctoberChang/pecos
def main():
    parser = argparse.ArgumentParser(description='OGBN-Products (MLP)')
    parser.add_argument('--device', type=int, default=0)
    parser.add_argument('--log_steps', type=int, default=1)
    parser.add_argument('--use_node_embedding', action='store_true')
    parser.add_argument('--num_layers', type=int, default=3)
    parser.add_argument('--hidden_channels', type=int, default=256)
    parser.add_argument('--dropout', type=float, default=0.0)
    parser.add_argument('--lr', type=float, default=0.01)
    parser.add_argument('--epochs', type=int, default=300)
    parser.add_argument('--runs', type=int, default=10)
    parser.add_argument('--bn', action='store_true')
    parser.add_argument('--data_root_dir', type=str, default='../../dataset')
    parser.add_argument('--node_emb_path', type=str, default=None)
    args = parser.parse_args()
    print(args)

    device = f'cuda:{args.device}' if torch.cuda.is_available() else 'cpu'
    device = torch.device(device)

    dataset = PygNodePropPredDataset(name='ogbn-products',
                                     root=args.data_root_dir)
    split_idx = dataset.get_idx_split()
    data = dataset[0]

    if args.node_emb_path:
        data.x = torch.from_numpy(
            smat_util.load_matrix(args.node_emb_path).astype(np.float32))
        print("Loaded pre-trained node embeddings of shape={} from {}".format(
            data.x.shape, args.node_emb_path))

    x = data.x
    x = x.to(device)

    y_true = data.y.to(device)
    train_idx = split_idx['train'].to(device)

    model = MLP(x.size(-1), args.hidden_channels, dataset.num_classes,
                args.num_layers, args.dropout, args.bn).to(device)

    evaluator = Evaluator(name='ogbn-products')
    logger = Logger(args.runs, args)

    for run in range(args.runs):
        model.reset_parameters()
        optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
        for epoch in range(1, 1 + args.epochs):
            loss = train(model, x, y_true, train_idx, optimizer)
            result = test(model, x, y_true, split_idx, evaluator)
            logger.add_result(run, result)

            if epoch % args.log_steps == 0:
                train_acc, valid_acc, test_acc = result
                print(f'Run: {run + 1:02d}, '
                      f'Epoch: {epoch:02d}, '
                      f'Loss: {loss:.4f}, '
                      f'Train: {100 * train_acc:.2f}%, '
                      f'Valid: {100 * valid_acc:.2f}%, '
                      f'Test: {100 * test_acc:.2f}%')

        logger.print_statistics(run)
    logger.print_statistics()
Code Example #13
File: model.py  Project: OctoberChang/pecos
    def train(
        cls,
        input_text_path,
        output_text_path,
        label_embed_type="pifa",
        vectorizer_config=None,
        train_params=None,
        pred_params=None,
        workspace_folder=None,
        **kwargs,
    ):
        """Train a Text2Text model

        Args:

            input_text_path (str): Text input file name.
                Format: in each line, OUTPUT_ID1,OUTPUT_ID2,OUTPUT_ID3,...\t INPUT_TEXT
                where OUTPUT_IDs are the zero-based output item indices
                corresponding to the line numbers of OUTPUT_ITEM_PATH.
                We assume utf-8 encoding for text.
            output_text_path (str): The file path for output text items.
                Format: each line corresponds to a representation
                of the output item. We assume utf-8 encoding for text.
            label_embed_type (list of str): Label embedding types. (default pifa).
                We support pifa, pifa_lf_concat::Z=path, and pifa_lf_convex_combine::Z=path::alpha=scalar_value.
                Multiple values will lead to different individual models for ensembling.
            vectorizer_config (dict): config dict for the vectorizer (default None)
            train_params (Text2Text.TrainParams): params to train Text2Text model
            pred_params (Text2Text.PredParams): params for Text2Text model prediction
            workspace_folder (str, default=None): A folder name for storing intermediate
                variables during training
            kwargs:
                {"beam_size": INT, "only_topk": INT, "post_processor": STR},
                    Default None to use HierarchicalMLModel.PredParams defaults

        Returns:
            A Text2Text object
        """

        ws = CachedWorkspace(workspace_folder)
        dtype = np.float32

        # Train Preprocessor and obtain X, Y
        XY_kwargs = dict(
            input_text_path=input_text_path,
            output_text_path=output_text_path,
            vectorizer_config=vectorizer_config,
            dtype=str(dtype),
        )

        # Prepare Preprocessor
        preprocessor_path = ws.get_path_for_name_and_kwargs("preprocessor", XY_kwargs)
        if path.exists(preprocessor_path):
            LOGGER.info("Loading existing preprocessor...")
            preprocessor = Preprocessor.load(preprocessor_path)
        else:
            LOGGER.info("Parsing text files...")
            parsed_result = Preprocessor.load_data_from_file(input_text_path, output_text_path)
            Y = parsed_result["label_matrix"]
            R = parsed_result["label_relevance"]
            corpus = parsed_result["corpus"]

            LOGGER.info(
                f"Training {vectorizer_config['type']} vectorizer on {len(corpus)} input texts..."
            )
            preprocessor = Preprocessor.train(corpus, vectorizer_config, dtype=dtype)
            preprocessor.save(preprocessor_path)

        # Prepare X, X could be dense or sparse
        X_path = ws.get_path_for_name_and_kwargs("X", XY_kwargs)

        if path.exists(X_path):
            X = XLinearModel.load_feature_matrix(X_path)
        else:
            if "corpus" not in locals():
                parse_result = Preprocessor.load_data_from_file(input_text_path, output_text_path)
                Y = parse_result["label_matrix"]
                R = parse_result["label_relevance"]
                corpus = parse_result["corpus"]
            LOGGER.info(f"Vectorizing {len(corpus)} texts...")
            X = preprocessor.predict(corpus)
            XLinearModel.save_feature_matrix(X_path, X)
        LOGGER.info(
            f"{vectorizer_config['type']} input X loaded: {X.shape[0]} samples with {X.shape[1]} features."
        )

        # Prepare Y, Y is always sparse
        Y_path = ws.get_path_for_name_and_kwargs("Y", XY_kwargs) + ".npz"
        if path.exists(Y_path):
            Y = smat_util.load_matrix(Y_path)
        else:
            if "Y" not in locals():
                parsed_result = Preprocessor.load_data_from_file(input_text_path, output_text_path)
                Y = parsed_result["label_matrix"]
                R = parsed_result["label_relevance"]
            smat_util.save_matrix(Y_path, Y)
        LOGGER.info(f"Output label Y loaded: {Y.shape[0]} samples with {Y.shape[1]} labels.")

        # Prepare R, R should have same sparsity pattern as Y
        R_path = ws.get_path_for_name_and_kwargs("R", XY_kwargs) + ".npz"
        if path.exists(R_path):
            R = smat_util.load_matrix(R_path)
        else:
            if "R" not in locals():
                parsed_result = Preprocessor.load_data_from_file(input_text_path, output_text_path)
                R = parsed_result["label_relevance"]
            if R is not None:
                smat_util.save_matrix(R_path, R)
        if R is not None:
            LOGGER.info(f"Relevance matrix R loaded, cost sensitive learning enabled.")

        # construct indexing, training and prediction params
        if train_params is None:
            # fill all BaseParams class with their default value
            train_params = cls.TrainParams.from_dict(dict(), recursive=True)
        else:
            train_params = cls.TrainParams.from_dict(train_params)

        # construct pred_params
        if pred_params is None:
            # fill all BaseParams with their default value
            pred_params = cls.PredParams.from_dict(dict(), recursive=True)
        else:
            pred_params = cls.PredParams.from_dict(pred_params)
        pred_params = pred_params.override_with_kwargs(kwargs)

        # 1. Generate label features
        label_embed_kwargs = dict(
            input_text_path=input_text_path,
            output_text_path=output_text_path,
            dtype=str(dtype),
            vectorizer_config=vectorizer_config,
            embed_type=label_embed_type,
        )
        label_embed_path = ws.get_path_for_name_and_kwargs("L", label_embed_kwargs)
        if path.exists(label_embed_path):
            LOGGER.info(f"Loading existing {label_embed_type} features for {Y.shape[1]} labels...")
            label_feat = XLinearModel.load_feature_matrix(label_embed_path)
        else:
            LOGGER.info(f"Generating {label_embed_type} features for {Y.shape[1]} labels...")
            # parse embed_type string, expect either the following three cases:
            # (1) pifa
            # (2) pifa_lf_concat::Z=path
            # (3) pifa_lf_convex_combine::Z=path::alpha=value
            lemb_key_val_list = label_embed_type.split("::")
            lemb_type = lemb_key_val_list[0]
            lemb_kwargs = {}
            for key_val_str in lemb_key_val_list[1:]:
                key, val = key_val_str.split("=")
                if key == "Z":
                    Z = smat_util.load_matrix(val)
                    lemb_kwargs.update({"Z": Z})
                elif key == "alpha":
                    alpha = float(val)
                    lemb_kwargs.update({"alpha": alpha})
                else:
                    raise ValueError(f"key={key}, val={val} is not supported!")
            if "lf" in lemb_type and lemb_kwargs.get("Z", None) is None:
                raise ValueError(
                    "pifa_lf_concat/pifa_lf_convex_combine must provide external path for Z."
                )
            # Create label features
            label_feat = LabelEmbeddingFactory.create(
                Y,
                X,
                method=lemb_type,
                **lemb_kwargs,
            )
            XLinearModel.save_feature_matrix(label_embed_path, label_feat)

        # 2. Indexing
        indexer_kwargs_dict = train_params.indexer_params.to_dict()
        C_path = ws.get_path_for_name_and_kwargs("C", indexer_kwargs_dict)
        if path.exists(C_path):
            LOGGER.info(f"Loading existing clustering code with params {indexer_kwargs_dict}")
            C = ClusterChain.load(C_path)
        else:
            C = Indexer.gen(label_feat, train_params=train_params.indexer_params)
            LOGGER.info("Hierarchical label tree: {}".format([cc.shape[0] for cc in C]))
            C.save(C_path)

        del label_feat
        gc.collect()

        # Ensemble Models
        m = XLinearModel.train(
            X,
            Y,
            C=C,
            R=R,
            train_params=train_params.xlinear_params,
            pred_params=pred_params.xlinear_params,
            pred_kwargs=kwargs,
        )

        xlinear_models = [[m, train_params.to_dict()]]

        # Load output items
        with open(output_text_path, "r", encoding="utf-8") as f:
            output_items = [q.strip() for q in f]

        return cls(preprocessor, xlinear_models, output_items)
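
A hedged sketch of how the trainer above might be invoked; all paths and the vectorizer config are placeholders, and only the plain "pifa" label embedding is shown:

t2t = Text2Text.train(
    input_text_path="train.tsv",           # placeholder; lines look like "ID1,ID2,...\tINPUT_TEXT"
    output_text_path="output_items.txt",   # placeholder; one output item per line
    label_embed_type="pifa",               # or e.g. "pifa_lf_concat::Z=path/to/Z.npz"
    vectorizer_config={"type": "tfidf", "kwargs": {}},  # placeholder config dict
    workspace_folder="t2t_workspace",      # placeholder folder for cached intermediates
)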
Code Example #14
File: graph_saint.py  Project: OctoberChang/pecos
def main():
    parser = argparse.ArgumentParser(description='OGBN-Products (GraphSAINT)')
    parser.add_argument('--device', type=int, default=0)
    parser.add_argument('--log_steps', type=int, default=1)
    parser.add_argument('--inductive', action='store_true')
    parser.add_argument('--num_layers', type=int, default=3)
    parser.add_argument('--hidden_channels', type=int, default=256)
    parser.add_argument('--dropout', type=float, default=0.5)
    parser.add_argument('--batch_size', type=int, default=20000)
    parser.add_argument('--walk_length', type=int, default=3)
    parser.add_argument('--lr', type=float, default=0.001)
    parser.add_argument('--num_steps', type=int, default=30)
    parser.add_argument('--epochs', type=int, default=50)
    parser.add_argument('--eval_steps', type=int, default=2)
    parser.add_argument('--runs', type=int, default=10)
    parser.add_argument('--data_root_dir', type=str, default='../../dataset')
    parser.add_argument('--node_emb_path', type=str, default=None)
    args = parser.parse_args()
    print(args)

    device = f'cuda:{args.device}' if torch.cuda.is_available() else 'cpu'
    device = torch.device(device)

    dataset = PygNodePropPredDataset(name='ogbn-products',
                                     root=args.data_root_dir)
    split_idx = dataset.get_idx_split()
    data = dataset[0]

    # Load Pretrained node features from PECOS
    if args.node_emb_path:
        data.x = torch.from_numpy(
            smat_util.load_matrix(args.node_emb_path).astype(np.float32))
        print("Loaded pre-trained node embeddings of shape={} from {}".format(
            data.x.shape, args.node_emb_path))

    # Convert split indices to boolean masks and add them to `data`.
    for key, idx in split_idx.items():
        mask = torch.zeros(data.num_nodes, dtype=torch.bool)
        mask[idx] = True
        data[f'{key}_mask'] = mask

    # We omit normalization factors here since those are only defined for the
    # inductive learning setup.
    sampler_data = data
    if args.inductive:
        sampler_data = to_inductive(data)

    loader = GraphSAINTRandomWalkSampler(sampler_data,
                                         batch_size=args.batch_size,
                                         walk_length=args.walk_length,
                                         num_steps=args.num_steps,
                                         sample_coverage=0,
                                         save_dir=dataset.processed_dir)

    model = SAGE(data.x.size(-1), args.hidden_channels, dataset.num_classes,
                 args.num_layers, args.dropout).to(device)

    subgraph_loader = NeighborSampler(data.edge_index,
                                      sizes=[-1],
                                      batch_size=4096,
                                      shuffle=False,
                                      num_workers=12)

    evaluator = Evaluator(name='ogbn-products')
    logger = Logger(args.runs, args)

    for run in range(args.runs):
        model.reset_parameters()
        optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
        for epoch in range(1, 1 + args.epochs):
            loss = train(model, loader, optimizer, device)
            if epoch % args.log_steps == 0:
                print(f'Run: {run + 1:02d}, '
                      f'Epoch: {epoch:02d}, '
                      f'Loss: {loss:.4f}')

            if epoch > 9 and epoch % args.eval_steps == 0:
                result = test(model, data, evaluator, subgraph_loader, device)
                logger.add_result(run, result)
                train_acc, valid_acc, test_acc = result
                print(f'Run: {run + 1:02d}, '
                      f'Epoch: {epoch:02d}, '
                      f'Train: {100 * train_acc:.2f}%, '
                      f'Valid: {100 * valid_acc:.2f}% '
                      f'Test: {100 * test_acc:.2f}%')

        logger.add_result(run, result)
        logger.print_statistics(run)
    logger.print_statistics()
Code Example #15
def do_train(args):
    """Train and Save xr-linear model

    Args:
        args (argparse.Namespace): Command line arguments parsed by `parser.parse_args()`
    """
    params = dict()
    if args.generate_params_skeleton:
        params["train_params"] = XLinearModel.TrainParams.from_dict(
            {}, recursive=True).to_dict()
        params["pred_params"] = XLinearModel.PredParams.from_dict(
            {}, recursive=True).to_dict()
        params["indexer_params"] = HierarchicalKMeans.TrainParams.from_dict(
            {}, recursive=True).to_dict()
        print(f"{json.dumps(params, indent=True)}")
        return

    if args.params_path:
        with open(args.params_path, "r") as fin:
            params = json.load(fin)

    train_params = params.get("train_params", None)
    pred_params = params.get("pred_params", None)
    indexer_params = params.get("indexer_params", None)

    if train_params is not None:
        train_params = XLinearModel.TrainParams.from_dict(train_params)
    else:
        train_params = XLinearModel.TrainParams.from_dict(
            {k: v
             for k, v in vars(args).items() if v is not None},
            recursive=True,
        )

    if pred_params is not None:
        pred_params = XLinearModel.PredParams.from_dict(pred_params)
    else:
        pred_params = XLinearModel.PredParams.from_dict(
            {k: v
             for k, v in vars(args).items() if v is not None},
            recursive=True,
        )

    if indexer_params is not None:
        indexer_params = HierarchicalKMeans.TrainParams.from_dict(
            indexer_params)
    else:
        indexer_params = HierarchicalKMeans.TrainParams.from_dict(
            {k: v
             for k, v in vars(args).items() if v is not None},
            recursive=True,
        )
    if args.seed:
        indexer_params.seed = args.seed

    if not os.path.exists(args.model_folder):
        os.makedirs(args.model_folder)

    LOGGER.info("| loading data begin...")
    start_time = time.time()
    X = XLinearModel.load_feature_matrix(args.inst_path)
    X = normalize(X, axis=1, norm="l2")
    Y = XLinearModel.load_label_matrix(args.label_path, for_training=True)
    run_time_io = time.time() - start_time
    LOGGER.info(
        "| loading data finished | time(s) {:9.4f}".format(run_time_io))

    LOGGER.info("| building HLT...")
    start_time = time.time()
    if args.code_path:
        cluster_chain = ClusterChain.load(args.code_path)
    else:
        if args.label_feat_path:
            label_feat = XLinearModel.load_feature_matrix(args.label_feat_path)
        else:
            label_feat = LabelEmbeddingFactory.create(Y, X, method="pifa")

        cluster_chain = Indexer.gen(label_feat, train_params=indexer_params)
    run_time_hlt = time.time() - start_time
    LOGGER.info(
        "| building HLT finished | time(s) {:9.4f}".format(run_time_hlt))

    # load label importance matrix if given
    if args.usn_label_path:
        usn_label_mat = smat_util.load_matrix(args.usn_label_path)
    else:
        usn_label_mat = None
    # load user supplied matching matrix if given
    if args.usn_match_path:
        usn_match_mat = smat_util.load_matrix(args.usn_match_path)
    else:
        usn_match_mat = None
    usn_match_dict = {0: usn_label_mat, 1: usn_match_mat}

    # load relevance matrix for cost-sensitive learning
    if args.rel_path:
        R = smat_util.load_matrix(args.rel_path)
    else:
        R = None

    pred_kwargs = {}
    for kw in ["beam_size", "only_topk", "post_processor"]:
        if getattr(args, kw, None) is not None:
            pred_kwargs[kw] = getattr(args, kw)

    LOGGER.info("| training XR-Linear...")
    start_time = time.time()
    xlm = XLinearModel.train(
        X,
        Y,
        C=cluster_chain,
        R=R,
        user_supplied_negatives=usn_match_dict,
        train_params=train_params,
        pred_params=pred_params,
        pred_kwargs=pred_kwargs,
    )
    run_time_xrl = time.time() - start_time
    LOGGER.info(
        "| training XR-Linear finished | time(s) {:9.4f}".format(run_time_xrl))

    xlm.save(args.model_folder)
    LOGGER.info(
        "| Finished with run_time(s) | total {:9.4f} hlt {:9.4f} xrl {:9.4f}".
        format(
            run_time_hlt + run_time_xrl,
            run_time_hlt,
            run_time_xrl,
        ))
Code Example #16
def main():
    parser = argparse.ArgumentParser(description='OGBN-papers100M (MLP)')
    parser.add_argument('--device', type=int, default=0)
    parser.add_argument('--log_steps', type=int, default=1)
    parser.add_argument('--num_layers', type=int, default=3)
    parser.add_argument('--hidden_channels', type=int, default=256)
    parser.add_argument('--dropout', type=float, default=0)
    parser.add_argument('--lr', type=float, default=0.01)
    parser.add_argument('--batch_size', type=int, default=256)
    parser.add_argument('--epochs', type=int, default=30)
    parser.add_argument('--runs', type=int, default=5)
    parser.add_argument('--data_root_dir', type=str, default='../../dataset')
    parser.add_argument('--node_emb_path', type=str, default=None)
    args = parser.parse_args()
    print(args)

    device = f'cuda:{args.device}' if torch.cuda.is_available() else 'cpu'
    device = torch.device(device)

    dataset = PygNodePropPredDataset(name='ogbn-papers100M', root=args.data_root_dir)
    split_idx = dataset.get_idx_split()
    data = dataset[0]

    if args.node_emb_path:
        data.x = torch.from_numpy(smat_util.load_matrix(args.node_emb_path).astype(np.float32))
        print("Loaded pre-trained node embeddings of shape={} from {}".format(data.x.shape, args.node_emb_path))

    x = data.x
    y = data.y.to(torch.long)
    train_dataset = SimpleDataset(x[split_idx['train']], y[split_idx['train']])
    valid_dataset = SimpleDataset(x[split_idx['valid']], y[split_idx['valid']])
    test_dataset = SimpleDataset(x[split_idx['test']], y[split_idx['test']])

    train_loader = DataLoader(train_dataset, batch_size=args.batch_size, shuffle=True)
    valid_loader = DataLoader(valid_dataset, batch_size=args.batch_size * 4, shuffle=False)
    test_loader = DataLoader(test_dataset, batch_size=args.batch_size * 4, shuffle=False)

    model = MLP(x.size(-1), args.hidden_channels, dataset.num_classes,
                args.num_layers, args.dropout).to(device)

    evaluator = Evaluator(name='ogbn-papers100M')
    logger = Logger(args.runs, args)

    for run in range(args.runs):
        model.reset_parameters()
        optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
        for epoch in range(1, 1 + args.epochs):
            train(model, device, train_loader, optimizer)
            train_acc = test(model, device, train_loader, evaluator)
            valid_acc = test(model, device, valid_loader, evaluator)
            test_acc = test(model, device, test_loader, evaluator)

            logger.add_result(run, (train_acc, valid_acc, test_acc))

            if epoch % args.log_steps == 0:
                print(f'Run: {run + 1:02d}, '
                      f'Epoch: {epoch:02d}, '
                      f'Train: {100 * train_acc:.2f}%, '
                      f'Valid: {100 * valid_acc:.2f}%, '
                      f'Test: {100 * test_acc:.2f}%')

        logger.print_statistics(run)
    logger.print_statistics()
Code Example #17
File: train.py  Project: OctoberChang/pecos
def do_train(args):
    """Train and Save xlinear model

    Args:
        args (argparse.Namespace): Command line arguments parsed by `parser.parse_args()`
    """
    params = dict()
    if args.generate_params_skeleton:
        params["train_params"] = XLinearModel.TrainParams.from_dict(
            {}, recursive=True).to_dict()
        params["pred_params"] = XLinearModel.PredParams.from_dict(
            {}, recursive=True).to_dict()
        params["indexer_params"] = HierarchicalKMeans.TrainParams.from_dict(
            {}, recursive=True).to_dict()
        print(f"{json.dumps(params, indent=True)}")
        return

    if args.params_path:
        with open(args.params_path, "r") as fin:
            params = json.load(fin)

    train_params = params.get("train_params", None)
    pred_params = params.get("pred_params", None)
    indexer_params = params.get("indexer_params", None)

    if train_params is not None:
        train_params = XLinearModel.TrainParams.from_dict(train_params)
    else:
        train_params = XLinearModel.TrainParams.from_dict(
            {k: v
             for k, v in vars(args).items() if v is not None},
            recursive=True,
        )

    if pred_params is not None:
        pred_params = XLinearModel.PredParams.from_dict(pred_params)
    else:
        pred_params = XLinearModel.PredParams.from_dict(
            {k: v
             for k, v in vars(args).items() if v is not None},
            recursive=True,
        )

    if indexer_params is not None:
        indexer_params = HierarchicalKMeans.TrainParams.from_dict(
            indexer_params)
    else:
        indexer_params = HierarchicalKMeans.TrainParams.from_dict(
            {k: v
             for k, v in vars(args).items() if v is not None},
            recursive=True,
        )

    # Create model folder
    if not os.path.exists(args.model_folder):
        os.makedirs(args.model_folder)

    # Load training inputs and labels
    X = XLinearModel.load_feature_matrix(args.inst_path)
    Y = XLinearModel.load_label_matrix(args.label_path, for_training=True)

    if args.code_path:
        cluster_chain = ClusterChain.load(args.code_path)
    else:
        if args.label_feat_path:
            label_feat = XLinearModel.load_feature_matrix(args.label_feat_path)
        else:
            label_feat = LabelEmbeddingFactory.create(Y, X, method="pifa")

        cluster_chain = Indexer.gen(label_feat, train_params=indexer_params)

    # load label importance matrix if given
    if args.usn_label_path:
        usn_label_mat = smat_util.load_matrix(args.usn_label_path)
    else:
        usn_label_mat = None
    # load user supplied matching matrix if given
    if args.usn_match_path:
        usn_match_mat = smat_util.load_matrix(args.usn_match_path)
    else:
        usn_match_mat = None
    usn_match_dict = {0: usn_label_mat, 1: usn_match_mat}

    # load relevance matrix for cost-sensitive learning
    if args.rel_path:
        R = smat_util.load_matrix(args.rel_path)
    else:
        R = None

    pred_kwargs = {}
    for kw in ["beam_size", "only_topk", "post_processor"]:
        if getattr(args, kw, None) is not None:
            pred_kwargs[kw] = getattr(args, kw)

    xlm = XLinearModel.train(
        X,
        Y,
        C=cluster_chain,
        R=R,
        user_supplied_negatives=usn_match_dict,
        train_params=train_params,
        pred_params=pred_params,
        pred_kwargs=pred_kwargs,
    )

    xlm.save(args.model_folder)
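
As a hedged follow-up, the saved model can typically be loaded back and applied to a feature matrix; the call names below follow the standard pecos XLinearModel API but are not shown in the original excerpt, and the paths are placeholders:

xlm = XLinearModel.load("placeholder/model_folder")
X_tst = XLinearModel.load_feature_matrix("data/X.tst.npz")  # placeholder path
P = xlm.predict(X_tst)  # csr matrix of top-scoring labels per instance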
Code Example #18
File: train.py  Project: OctoberChang/pecos
def do_train(args):
    """Train and save XR-Transformer model.

    Args:
        args (argparse.Namespace): Command line arguments parsed by `parser.parse_args()`
    """
    params = dict()
    if args.generate_params_skeleton:
        params["train_params"] = XTransformer.TrainParams.from_dict(
            {}, recursive=True).to_dict()
        params["pred_params"] = XTransformer.PredParams.from_dict(
            {}, recursive=True).to_dict()
        print(f"{json.dumps(params, indent=True)}")
        return

    if args.params_path:
        with open(args.params_path, "r") as fin:
            params = json.load(fin)

    train_params = params.get("train_params", None)
    pred_params = params.get("pred_params", None)

    if train_params is not None:
        train_params = XTransformer.TrainParams.from_dict(train_params)
    else:
        train_params = XTransformer.TrainParams.from_dict(
            {k: v
             for k, v in vars(args).items() if v is not None},
            recursive=True,
        )

    if pred_params is not None:
        pred_params = XTransformer.PredParams.from_dict(pred_params)
    else:
        pred_params = XTransformer.PredParams.from_dict(
            {k: v
             for k, v in vars(args).items() if v is not None},
            recursive=True,
        )

    torch_util.set_seed(args.seed)
    LOGGER.info("Setting random seed {}".format(args.seed))

    # Load training feature
    if args.trn_feat_path:
        X_trn = smat_util.load_matrix(args.trn_feat_path, dtype=np.float32)
        LOGGER.info("Loaded training feature matrix with shape={}".format(
            X_trn.shape))
    else:
        X_trn = None
        LOGGER.info("Training feature matrix not provided")
        if not args.label_feat_path and not args.code_path:
            raise ValueError(
                "trn-feat is required unless code-path or label-feat is provided."
            )

    # Load training labels
    Y_trn = smat_util.load_matrix(args.trn_label_path, dtype=np.float32)
    LOGGER.info("Loaded training label matrix with shape={}".format(
        Y_trn.shape))

    # Load test feature if given
    if args.tst_feat_path:
        X_tst = smat_util.load_matrix(args.tst_feat_path, dtype=np.float32)
        LOGGER.info("Loaded test feature matrix with shape={}".format(
            X_tst.shape))
    else:
        X_tst = None

    # Load test labels if given
    if args.tst_label_path:
        Y_tst = smat_util.load_matrix(args.tst_label_path, dtype=np.float32)
        LOGGER.info("Loaded test label matrix with shape={}".format(
            Y_tst.shape))
    else:
        Y_tst = None

    # Load training texts
    trn_corpus = Preprocessor.load_data_from_file(
        args.trn_text_path,
        label_text_path=None,
        text_pos=0,
    )["corpus"]
    LOGGER.info("Loaded {} training sequences".format(len(trn_corpus)))

    # Load test text if given
    if args.tst_text_path:
        tst_corpus = Preprocessor.load_data_from_file(
            args.tst_text_path,
            label_text_path=None,
            text_pos=0,
        )["corpus"]
        LOGGER.info("Loaded {} test sequences".format(len(tst_corpus)))
    else:
        tst_corpus = None

    # load cluster chain or label features
    cluster_chain, label_feat = None, None
    if os.path.exists(args.code_path):
        cluster_chain = ClusterChain.from_partial_chain(
            smat_util.load_matrix(args.code_path),
            min_codes=args.min_codes,
            nr_splits=args.nr_splits,
        )
        LOGGER.info("Loaded from code-path: {}".format(args.code_path))
    else:
        if os.path.isfile(args.label_feat_path):
            label_feat = smat_util.load_matrix(args.label_feat_path,
                                               dtype=np.float32)
            LOGGER.info("Loaded label feature matrix shape={}, from {}".format(
                label_feat.shape, args.label_feat_path))

    trn_prob = MLProblemWithText(trn_corpus, Y_trn, X_feat=X_trn)
    if all(v is not None for v in [tst_corpus, Y_tst]):
        val_prob = MLProblemWithText(tst_corpus, Y_tst, X_feat=X_tst)
    else:
        val_prob = None

    xtf = XTransformer.train(
        trn_prob,
        clustering=cluster_chain,
        val_prob=val_prob,
        train_params=train_params,
        pred_params=pred_params,
        beam_size=args.beam_size,
        steps_scale=args.steps_scale,
        label_feat=label_feat,
    )

    xtf.save(args.model_dir)
Code Example #19
File: test_hnsw.py  Project: OctoberChang/pecos
def test_predict_and_recall():
    import random
    import numpy as np
    import scipy.sparse as smat
    from pytest import approx
    from pecos.utils import smat_util
    from pecos.ann.hnsw import HNSW

    random.seed(1234)
    np.random.seed(1234)
    top_k = 10
    efS_list = [50, 75, 100]
    num_searcher_online = 2

    def calc_recall(Y_true, Y_pred):
        n_data, top_k = Y_true.shape
        recall = 0.0
        for qid in range(n_data):
            yt = set(Y_true[qid, :].flatten().data)
            yp = set(Y_pred[qid, :].flatten().data)
            recall += len(yt.intersection(yp)) / top_k
        recall = recall / n_data
        return recall

    # load data matrices
    X_trn = smat_util.load_matrix(
        "test/tst-data/ann/X.trn.l2-normalized.npy").astype(np.float32)
    X_tst = smat_util.load_matrix(
        "test/tst-data/ann/X.tst.l2-normalized.npy").astype(np.float32)
    dense_model_folder = "test/tst-data/ann/hnsw-model-dense"
    sparse_model_folder = "test/tst-data/ann/hnsw-model-sparse"

    # compute exact NN ground truth
    # for both ip and cosine similarity, since data is l2-normalized
    Y_true = 1.0 - X_tst.dot(X_trn.T)
    Y_true = np.argsort(Y_true)[:, :top_k]

    # test dense features
    model = HNSW.load(dense_model_folder)
    searchers = model.searchers_create(num_searcher_online)
    pred_params = model.get_pred_params()
    for efS in efS_list:
        pred_params.efS = efS
        Y_pred, _ = model.predict(X_tst,
                                  pred_params=pred_params,
                                  searchers=searchers,
                                  ret_csr=False)
        recall = calc_recall(Y_true, Y_pred)
        assert recall == approx(
            1.0, abs=1e-2
        ), f"hnsw inference failed: data_type=drm, efS={efS}, recall={recall}"
    del searchers, model

    # test csr features, we just reuse the Y_true since data are the same
    X_trn = smat.csr_matrix(X_trn).astype(np.float32)
    X_tst = smat.csr_matrix(X_tst).astype(np.float32)
    model = HNSW.load(sparse_model_folder)
    searchers = model.searchers_create(num_searcher_online)
    pred_params = model.get_pred_params()
    for efS in efS_list:
        pred_params.efS = efS
        Y_pred, _ = model.predict(X_tst,
                                  pred_params=pred_params,
                                  searchers=searchers,
                                  ret_csr=False)
        recall = calc_recall(Y_true, Y_pred)
        assert recall == approx(
            1.0, abs=1e-2
        ), f"hnsw inference failed: data_type=csr, efS={efS}, recall={recall}"
    del searchers, model
Code Example #20
def do_spmm_exp(args):
    # load data
    Y = smat_util.load_matrix(args.y_npz_path).astype(np.float32)
    X = smat_util.load_matrix(args.x_npz_path).astype(np.float32)
    YT_csr = Y.T.tocsr()
    X_csr = X.tocsr()

    # The number of threads is controlled by env variables (except for pecos),
    # e.g., export OMP_NUM_THREADS=16, export MKL_NUM_THREADS=16.
    run_time = 0.0
    if args.spmm_algo == "pecos":
        start = time.time()
        Z = pecos_clib.sparse_matmul(
            YT_csr,
            X_csr,
            eliminate_zeros=False,
            sorted_indices=True,
            threads=args.threads,
        )
        run_time += time.time() - start
        Z_data = Z.data
    elif args.spmm_algo == "intel-mkl":
        from sparse_dot_mkl import dot_product_mkl
        # make sure to set the index type to int64 for large matrices, e.g.,
        # export MKL_INTERFACE_LAYER=ILP64
        start = time.time()
        Z = dot_product_mkl(YT_csr, X_csr, reorder_output=True)
        run_time += time.time() - start
        Z_data = Z.data
    elif args.spmm_algo == "scipy":
        # scipy does not sort the indices within each row,
        # so we do it explicitly
        start = time.time()
        Z = YT_csr.dot(X_csr)
        Z.sort_indices()
        run_time += time.time() - start
        Z_data = Z.data
    elif args.spmm_algo == "pytorch":
        import torch

        def get_pt_data(A_csr):
            A_indices, A_values = csr_to_coo(A_csr)
            A_pt = torch.sparse_coo_tensor(
                A_indices.T.astype(np.int64),
                A_values.astype(np.float32),
                A_csr.shape,
            )
            return A_pt

        YT_pt = get_pt_data(YT_csr)
        X_pt = get_pt_data(X_csr)
        start = time.time()
        Z_pt = torch.sparse.mm(YT_pt, X_pt)
        run_time += time.time() - start
        Z_data = Z_pt.coalesce().values().numpy()
    elif args.spmm_algo == "tensorflow":
        import tensorflow.compat.v1 as tf
        from tensorflow.python.ops.linalg.sparse import sparse_csr_matrix_ops

        def get_tf_data(A_csr):
            # Define (COO format) Sparse Tensors over Numpy arrays
            A_indices, A_values = csr_to_coo(A_csr)
            A_st = tf.sparse.SparseTensor(
                A_indices.astype(np.int64),
                A_values.astype(np.float32),
                A_csr.shape,
            )
            return A_st

        # Tensorflow (v2.5.0) usage, as of 07/20/2021:
        # https://www.tensorflow.org/api_docs/python/tf/raw_ops/SparseMatrixSparseMatMul
        with tf.Session() as sess:
            YT_st = get_tf_data(YT_csr)
            X_st = get_tf_data(X_csr)
            sess.run(YT_st)
            sess.run(X_st)
            YT_sm = sparse_csr_matrix_ops.sparse_tensor_to_csr_sparse_matrix(
                YT_st.indices, YT_st.values, YT_st.dense_shape)
            X_sm = sparse_csr_matrix_ops.sparse_tensor_to_csr_sparse_matrix(
                X_st.indices, X_st.values, X_st.dense_shape)
            start = time.time()
            Z_sm = sparse_csr_matrix_ops.sparse_matrix_sparse_mat_mul(
                a=YT_sm, b=X_sm, type=tf.float32)
            Z_st = sparse_csr_matrix_ops.csr_sparse_matrix_to_sparse_tensor(
                Z_sm, tf.float32)
            Z_data = sess.run(Z_st.values)
            run_time += time.time() - start
    else:
        raise ValueError(f"spmm_algo={args.spmm_algo} is not valid")
    print(
        "algo {:16s} time(s) {:9.5f} nnz(Z) {:12d} mu(Z.data) {:8.4f}".format(
            args.spmm_algo,
            run_time,
            len(Z_data),
            np.mean(Z_data),
        ))
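
The pytorch and tensorflow branches above rely on a `csr_to_coo` helper that is not included in this excerpt. A minimal sketch of what such a helper could look like, returning an (nnz, 2) index array plus the matching values, under the assumption that the inputs are scipy CSR matrices:

import numpy as np
import scipy.sparse as smat

def csr_to_coo(A_csr):
    """Convert a scipy CSR matrix into (indices, values) arrays in COO layout."""
    A_coo = smat.csr_matrix(A_csr).tocoo()
    indices = np.vstack([A_coo.row, A_coo.col]).T  # shape (nnz, 2): one (row, col) pair per nonzero
    values = A_coo.data
    return indices, values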