Exemple #1
0
def random_forest_eval(
    data_cnf, model_cnf, data_name, model_name, model_path, emb_init,
    tree_id, output_suffix, dry_run, num_tree,
):
    """Predict with every random-forest tree and merge the per-tree rankings.

    One label binarizer is loaded per tree from
    ``<labels_binarizer>_RF_<i>`` and one ``FastAttentionXML`` model is built
    per tree with the output suffix ``<output_suffix>-<i>``.  The per-tree
    scores and labels are stacked column-wise and every row is re-ordered by
    descending score before being handed to ``output_res`` / ``log_results``.

    Args:
        data_cnf: data configuration; ``labels_binarizer``, ``test.texts`` and
            ``output.res`` are read here.
        model_cnf: model configuration; must contain ``cluster``.  The number
            of predictions per tree is ``predict.rf_k`` (default
            ``100 // num_tree``).
        data_name, model_name: used to build the result file name.
        model_path, emb_init: accepted but not used by this function.
        tree_id: suffix forwarded to the model and to the result name.
        output_suffix: base suffix of the per-tree models and of the results.
        dry_run: forwarded to ``log_results``.
        num_tree: number of trees to load and evaluate.

    Raises:
        Exception: if ``cluster`` is missing from ``model_cnf``.
    """
    logger.info('Loading Test Set')
    mlb = get_mlb(data_cnf['labels_binarizer'])
    labels_num = len(mlb.classes_)  # not read again below
    test_x, _ = get_data(data_cnf['test']['texts'], None)
    logger.info(F'Size of Test Set: {len(test_x):,}')

    logger.info('Predicting')
    if 'cluster' not in model_cnf:
        raise Exception("AttentionXML is not currently supported random forest mode")

    binarizer_prefix = data_cnf['labels_binarizer']
    tree_mlbs = [
        get_mlb(f"{binarizer_prefix}_RF_{tree_idx}")
        for tree_idx in range(num_tree)
    ]

    per_tree_scores = []
    per_tree_labels = []
    for tree_idx, tree_mlb in enumerate(tree_mlbs):
        logger.info(f"Predicting RF {tree_idx}")
        tree_model = FastAttentionXML(
            len(tree_mlb.classes_), data_cnf, model_cnf, tree_id,
            f"{output_suffix}-{tree_idx}")
        tree_scores, tree_label_ids = tree_model.predict(
            test_x, model_cnf['predict'].get('rf_k', 100 // num_tree))
        per_tree_scores.append(tree_scores)
        # Map the tree-local label indices back to the actual label values.
        per_tree_labels.append(tree_mlb.classes_[tree_label_ids])
        logger.info(f"Finish Prediting RF {tree_idx}")

    scores = np.hstack(per_tree_scores)
    labels = np.hstack(per_tree_labels)

    # Re-rank every row by descending score across all trees
    # (assumes scores/labels are 2-D: samples x predictions -- TODO confirm).
    rows = np.arange(len(scores))[:, None]
    order = np.argsort(scores)[:, ::-1]
    scores, labels = scores[rows, order], labels[rows, order]

    logger.info('Finish Predicting')
    score_path, label_path = output_res(
        data_cnf['output']['res'],
        f'{model_name}-{data_name}{tree_id}',
        scores, labels, output_suffix)

    log_results(score_path, label_path, dry_run)
Exemple #2
0
def default_eval(
    data_cnf, model_cnf, data_name, model_name, model_path, emb_init,
    tree_id, output_suffix, dry_run,
):
    """Predict on the test set with a single model and write the results.

    Builds a ``FastAttentionXML`` model when ``model_cnf`` contains
    ``cluster``; otherwise builds an ``AttentionRNN``-based ``Model`` with
    ``load_model=True``.  The predicted label indices are mapped through the
    label binarizer's ``classes_`` and the results are passed to
    ``output_res`` and ``log_results``.

    Args:
        data_cnf: data configuration; ``labels_binarizer``, ``test.texts``,
            ``model`` and ``output.res`` are read here.
        model_cnf: model configuration; ``model``, ``predict`` and optionally
            ``loss`` / ``cluster`` are read.  ``model_cnf['model']`` is
            mutated: any ``load_model`` entry is removed.
        data_name, model_name: used to build the result file name.
        model_path, emb_init: forwarded to ``Model`` in the non-cluster case.
        tree_id: suffix forwarded to ``FastAttentionXML`` and the result name.
        output_suffix: forwarded to ``FastAttentionXML`` and ``output_res``.
        dry_run: forwarded to ``log_results``.
    """
    logger.info('Loading Test Set')
    mlb = get_mlb(data_cnf['labels_binarizer'])
    labels_num = len(mlb.classes_)
    test_x, _ = get_data(data_cnf['test']['texts'], None)
    logger.info(F'Size of Test Set: {len(test_x):,}')

    logger.info('Predicting')
    # Model(...) below receives load_model=True explicitly; a configured
    # value would presumably collide with it when the dict is unpacked.
    model_cnf['model'].pop('load_model', None)

    if 'cluster' in model_cnf:
        model = FastAttentionXML(labels_num, data_cnf, model_cnf,
                                 tree_id, output_suffix)
        scores, label_ids = model.predict(
            test_x, model_cnf['predict'].get('k', 100))
    else:
        test_loader = DataLoader(
            MultiLabelDataset(test_x),
            model_cnf['predict']['batch_size'],
            num_workers=4)

        # Defaults when no loss section is configured.
        gamma, loss_name = None, 'bce'
        if 'loss' in model_cnf:
            loss_cnf = model_cnf['loss']
            # NOTE(review): default_train falls back to gamma=2.0 while this
            # function uses 1.0 -- confirm which default is intended.
            gamma = loss_cnf.get('gamma', 1.0)
            loss_name = loss_cnf['name']

        model = Model(
            network=AttentionRNN, labels_num=labels_num,
            model_path=model_path, emb_init=emb_init,
            load_model=True, loss_name=loss_name, gamma=gamma,
            **data_cnf['model'], **model_cnf['model'])

        scores, label_ids = model.predict(
            test_loader, k=model_cnf['predict'].get('k', 100))

    # Translate predicted label indices into the original label values.
    labels = mlb.classes_[label_ids]

    logger.info('Finish Predicting')
    score_path, label_path = output_res(
        data_cnf['output']['res'],
        f'{model_name}-{data_name}{tree_id}',
        scores, labels, output_suffix)

    log_results(score_path, label_path, dry_run)
Exemple #3
0
def main(data_cnf, model_cnf, mode, tree_id):
    """Train and/or evaluate a model described by two YAML config files.

    Args:
        data_cnf: path to the data configuration YAML file.
        model_cnf: path to the model configuration YAML file.
        mode: ``'train'`` or ``'eval'`` runs only that phase; ``None`` runs
            training followed by evaluation, re-using the trained model.
        tree_id: optional tree identifier; when not ``None`` the suffix
            ``-Tree-<tree_id>`` is appended to model and result names.
    """
    run_train = mode is None or mode == 'train'
    run_eval = mode is None or mode == 'eval'

    tree_id = '' if tree_id is None else F'-Tree-{tree_id}'
    yaml = YAML(typ='safe')
    data_cnf, model_cnf = yaml.load(Path(data_cnf)), yaml.load(Path(model_cnf))

    model = None
    model_name = model_cnf['name']
    data_name = data_cnf['name']
    run_name = F'{model_name}-{data_name}{tree_id}'
    model_path = os.path.join(model_cnf['path'], run_name)
    emb_init = get_word_emb(data_cnf['embedding']['emb_init'])
    logger.info(F'Model Name: {model_name}')

    if run_train:
        logger.info('Loading Training and Validation Set')
        train_x, train_labels = get_data(data_cnf['train']['texts'],
                                         data_cnf['train']['labels'])
        valid_cnf = data_cnf['valid']
        if 'size' in valid_cnf:
            # Carve the validation set out of the training data.
            seed = valid_cnf.get('random_state', 1240)
            train_x, valid_x, train_labels, valid_labels = train_test_split(
                train_x, train_labels,
                test_size=valid_cnf['size'],
                random_state=seed)
        else:
            valid_x, valid_labels = get_data(valid_cnf['texts'], valid_cnf['labels'])

        mlb = get_mlb(data_cnf['labels_binarizer'],
                      np.hstack((train_labels, valid_labels)))
        train_y = mlb.transform(train_labels)
        valid_y = mlb.transform(valid_labels)
        labels_num = len(mlb.classes_)
        logger.info(F'Number of Labels: {labels_num}')
        logger.info(F'Size of Training Set: {len(train_x)}')
        logger.info(F'Size of Validation Set: {len(valid_x)}')

        logger.info('Training')
        if 'cluster' in model_cnf:
            model = FastAttentionXML(labels_num, data_cnf, model_cnf, tree_id)
            model.train(train_x, train_y, valid_x, valid_y, mlb)
        else:
            train_loader = DataLoader(
                MultiLabelDataset(train_x, train_y),
                model_cnf['train']['batch_size'], shuffle=True, num_workers=4)
            valid_loader = DataLoader(
                MultiLabelDataset(valid_x, valid_y, training=False),
                model_cnf['valid']['batch_size'], num_workers=4)
            model = Model(
                network=AttentionRNN, labels_num=labels_num,
                model_path=model_path, emb_init=emb_init,
                **data_cnf['model'], **model_cnf['model'])
            model.train(train_loader, valid_loader, **model_cnf['train'])
        logger.info('Finish Training')

    if run_eval:
        logger.info('Loading Test Set')
        mlb = get_mlb(data_cnf['labels_binarizer'])
        labels_num = len(mlb.classes_)
        test_x, _ = get_data(data_cnf['test']['texts'], None)
        logger.info(F'Size of Test Set: {len(test_x)}')

        logger.info('Predicting')
        # A model built during the training phase above is re-used as is.
        if 'cluster' in model_cnf:
            if model is None:
                model = FastAttentionXML(labels_num, data_cnf, model_cnf, tree_id)
            scores, label_ids = model.predict(test_x)
        else:
            test_loader = DataLoader(
                MultiLabelDataset(test_x),
                model_cnf['predict']['batch_size'],
                num_workers=4)
            if model is None:
                model = Model(
                    network=AttentionRNN, labels_num=labels_num,
                    model_path=model_path, emb_init=emb_init,
                    **data_cnf['model'], **model_cnf['model'])
            scores, label_ids = model.predict(
                test_loader, k=model_cnf['predict'].get('k', 100))
        logger.info('Finish Predicting')

        # Translate predicted label indices into the original label values.
        labels = mlb.classes_[label_ids]
        output_res(data_cnf['output']['res'], run_name, scores, labels)
def spectral_clustering_train(
    data_cnf, data_cnf_path, model_cnf, model_cnf_path,
    emb_init, model_path, tree_id, output_suffix, dry_run,
):
    """Train models on label groups found by spectral clustering.

    When ``model_cnf`` has no ``cluster`` section, the labels are partitioned
    with ``MySpectralClustering`` on a label-by-label matrix built from the
    binarized training targets, and one ``AttentionRNN``-based ``Model`` is
    trained per label group (saved under ``<model_path>-<i>``).  When
    ``cluster`` is present, a single ``FastAttentionXML`` model is trained on
    the whole data set and no clustering is performed.

    Args:
        data_cnf: data configuration; ``labels_binarizer``, ``valid`` and
            ``model`` are read here (plus whatever ``load_dataset`` reads).
        data_cnf_path, model_cnf_path: forwarded to ``log_config``.
        model_cnf: model configuration; ``spectral_clustering`` is always
            read, ``train`` / ``valid`` / ``model`` in the non-cluster case.
        emb_init: forwarded to ``Model``.
        model_path: base path of the per-cluster models.
        tree_id: forwarded to ``FastAttentionXML``.
        output_suffix: used for the per-cluster binarizer file names and
            forwarded to ``FastAttentionXML``.
        dry_run: when true, per-cluster models are only saved, not trained,
            and the ``FastAttentionXML`` model is not trained.

    Raises:
        Exception: in the non-cluster case when ``data_cnf['valid']`` has no
            ``size`` entry (explicit validation sets are not supported).
    """
    train_xs = []
    valid_xs = []
    train_labels_list = []
    valid_labels_list = []
    train_ys = []
    valid_ys = []
    mlb_list = []
    # Filled below but not read again inside this function.
    indices_list = []

    # These keys are read unconditionally, i.e. also when 'cluster' is set.
    n_clusters = model_cnf['spectral_clustering']['num_clusters']
    n_components = model_cnf['spectral_clustering']['n_components']
    alg = model_cnf['spectral_clustering']['alg']
    size_min = model_cnf['spectral_clustering']['size_min']
    size_max = model_cnf['spectral_clustering']['size_max']

    train_x, train_labels = load_dataset(data_cnf)

    if 'cluster' not in model_cnf:
        mlb = get_mlb(data_cnf['labels_binarizer'], train_labels)
        train_y = mlb.transform(train_labels)

        logger.info('Build label adjacency matrix')
        # Label-by-label product of the binarized targets; these are
        # co-occurrence counts if train_y is a samples x labels indicator
        # matrix -- TODO confirm.
        adj = train_y.T @ train_y
        # Drop the diagonal (a label paired with itself) and prune the zeros.
        adj.setdiag(0)
        adj.eliminate_zeros()
        # The logged value is the fraction of non-zero entries.
        logger.info(f"Sparsity: {adj.count_nonzero() / adj.shape[0] ** 2}")
        clustering = MySpectralClustering(n_clusters=n_clusters, affinity='precomputed',
                                          n_components=n_components, n_init=1,
                                          size_min=size_min,
                                          size_max=size_max,
                                          assign_labels=alg, n_jobs=-1)
        logger.info('Start Spectral Clustering')
        clustering.fit(adj)
        logger.info('Finish Spectral Clustering')

        # groups[c] collects the label indices assigned to cluster c.
        groups = [[] for _ in range(n_clusters)]
        for i, group in enumerate(clustering.labels_):
            groups[group].append(i)

        # Convert the index groups into groups of actual label values.
        splitted_labels = []
        for indices in groups:
            splitted_labels.append(mlb.classes_[indices])

        # Select the training samples belonging to each label group.
        for labels in splitted_labels:
            indices = get_splitted_samples(labels, train_labels)
            indices_list.append(indices)
            train_xs.append(train_x[indices])
            train_labels_list.append(train_labels[indices])

        # NOTE(review): this check only runs after the clustering above.
        if 'size' in data_cnf['valid']:
            # NOTE: the loop rebinds train_x / train_labels to per-cluster data.
            for i, (train_x, train_labels) in enumerate(zip(train_xs, train_labels_list)):
                valid_size = data_cnf['valid']['size']
                # If the configured size would leave less than 80% of this
                # cluster's samples for training, use a 20% split instead
                # (assumes 'size' is an absolute sample count -- TODO confirm).
                if len(train_x) * 0.8 > len(train_x) - valid_size:
                    valid_size = 0.2
                # No random_state is passed, so the split is not seeded here.
                train_x, valid_x, train_labels, valid_labels = train_test_split(
                    train_x, train_labels, test_size=valid_size,
                )
                train_xs[i] = train_x
                train_labels_list[i] = train_labels
                valid_xs.append(valid_x)
                valid_labels_list.append(valid_labels)

        else:
            raise Exception("Setting valid set explicitly is not "
                            "supported spectral clustering mode.")

        labels_binarizer_path = data_cnf['labels_binarizer']
        # Upper-case the suffix and turn dashes into underscores for file names.
        suffix = output_suffix.upper().replace('-', '_')
        for i, labels in enumerate(splitted_labels):
            filename = f"{labels_binarizer_path}_{suffix}_{i}"
            # One binarizer per cluster, built from that cluster's labels only.
            mlb_tree = get_mlb(filename, labels[None, ...], force=True)
            mlb_list.append(mlb_tree)
            logger.info(f"Number of labels of cluster {i}: {len(labels):,}")
            logger.info(f"Number of Training Set of cluster {i}: {len(train_xs[i]):,}")
            logger.info(f"Number of Validation Set of cluster {i}: {len(valid_xs[i]):,}")

            # stderr is silenced during the transforms, presumably to hide
            # warnings about labels outside this cluster -- TODO confirm.
            with redirect_stderr(None):
                train_y = mlb_tree.transform(train_labels_list[i])
                valid_y = mlb_tree.transform(valid_labels_list[i])

            train_ys.append(train_y)
            valid_ys.append(valid_y)

    else:
        # 'cluster' is configured: a single model on the full data set.
        if 'size' in data_cnf['valid']:
            train_x, valid_x, train_labels, valid_labels = train_test_split(
                train_x, train_labels, test_size=data_cnf['valid']['size'],
            )

        else:
            valid_x, valid_labels = get_data(data_cnf['valid']['texts'], data_cnf['valid']['labels'])

        mlb = get_mlb(data_cnf['labels_binarizer'], np.hstack((
            train_labels, valid_labels,
        )))

        train_y, valid_y = mlb.transform(train_labels), mlb.transform(valid_labels)


    logger.info('Training')
    if 'cluster' not in model_cnf:
        # One AttentionRNN model per label cluster.
        for i, (train_x, train_y, valid_x, valid_y) in enumerate(zip(
            train_xs, train_ys, valid_xs, valid_ys,
        )):
            train_loader = DataLoader(
                MultiLabelDataset(train_x, train_y),
                model_cnf['train']['batch_size'], shuffle=True, num_workers=4)
            valid_loader = DataLoader(
                MultiLabelDataset(valid_x, valid_y, training=False),
                model_cnf['valid']['batch_size'], num_workers=4)
            model = Model(
                network=AttentionRNN, labels_num=len(mlb_list[i].classes_),
                model_path=f'{model_path}-{i}', emb_init=emb_init,
                **data_cnf['model'], **model_cnf['model'])

            if not dry_run:
                logger.info(f"Start Training Cluster {i}")
                model.train(train_loader, valid_loader, **model_cnf['train'])
                logger.info(f"Finish Training Cluster {i}")
            else:
                # Dry run: skip training and only save the untrained model.
                model.save_model()

    else:
        model = FastAttentionXML(
            len(mlb.classes_), data_cnf, model_cnf, tree_id, output_suffix,
        )

        # In a dry run the model is constructed but never trained.
        if not dry_run:
            model.train(train_x, train_y, valid_x, valid_y, mlb)

    log_config(data_cnf_path, model_cnf_path, dry_run)
def spectral_clustering_eval(
    data_cnf, model_cnf, data_name, model_name, model_path, emb_init,
    tree_id, output_suffix, dry_run,
):
    """Predict on the test set with the spectral-clustering models.

    Without a ``cluster`` section in ``model_cnf`` one ``AttentionRNN``-based
    ``Model`` per label cluster is loaded from ``<model_path>-<i>``; each one
    predicts ``predict.k // num_clusters`` labels (``k`` defaults to 100), the
    per-cluster results are stacked column-wise and every row is re-ordered
    by descending score.  With ``cluster`` a single ``FastAttentionXML`` model
    predicts ``predict.k`` labels.  Results go to ``output_res`` and
    ``log_results``.

    Args:
        data_cnf: data configuration; ``labels_binarizer``, ``test.texts``,
            ``model`` and ``output.res`` are read here.
        model_cnf: model configuration; ``spectral_clustering.num_clusters``
            is always read, ``predict`` / ``model`` as needed.
        data_name, model_name: used to build the result file name.
        model_path: base path of the per-cluster models.
        emb_init: forwarded to ``Model``.
        tree_id: forwarded to ``FastAttentionXML`` and the result name.
        output_suffix: selects the per-cluster binarizer files and is
            forwarded to ``FastAttentionXML`` / ``output_res``.
        dry_run: forwarded to ``log_results``.
    """
    n_clusters = model_cnf['spectral_clustering']['num_clusters']
    labels_binarizer_path = data_cnf['labels_binarizer']

    logger.info('Loading Test Set')
    test_x, _ = get_data(data_cnf['test']['texts'], None)
    logger.info(F'Size of Test Set: {len(test_x):,}')

    logger.info('Predicting')
    if 'cluster' in model_cnf:
        mlb = get_mlb(data_cnf['labels_binarizer'])
        model = FastAttentionXML(len(mlb.classes_), data_cnf, model_cnf,
                                 tree_id, output_suffix)

        scores, label_ids = model.predict(
            test_x, model_cnf['predict'].get('k', 100))
        labels = mlb.classes_[label_ids]
    else:
        # Same file-name scheme as used when the binarizers were written.
        suffix = output_suffix.upper().replace('-', '_')
        cluster_mlbs = [
            get_mlb(f"{labels_binarizer_path}_{suffix}_{cluster_idx}")
            for cluster_idx in range(n_clusters)
        ]

        test_loader = DataLoader(
            MultiLabelDataset(test_x),
            model_cnf['predict']['batch_size'],
            num_workers=4)

        per_cluster_scores = []
        per_cluster_labels = []
        for cluster_idx, cluster_mlb in enumerate(cluster_mlbs):
            logger.info(f"Predicting Cluster {cluster_idx}")
            labels_num = len(cluster_mlb.classes_)
            # Each cluster contributes an equal share of the k predictions.
            k = model_cnf['predict'].get('k', 100) // n_clusters

            model = Model(
                network=AttentionRNN, labels_num=labels_num,
                model_path=f'{model_path}-{cluster_idx}', emb_init=emb_init,
                load_model=True,
                **data_cnf['model'], **model_cnf['model'])

            cluster_scores, cluster_label_ids = model.predict(test_loader, k=k)
            per_cluster_scores.append(cluster_scores)
            # Map cluster-local label indices back to the label values.
            per_cluster_labels.append(cluster_mlb.classes_[cluster_label_ids])
            logger.info(f"Finish Prediting Cluster {cluster_idx}")

        scores = np.hstack(per_cluster_scores)
        labels = np.hstack(per_cluster_labels)

        # Re-rank every row by descending score across all clusters
        # (assumes scores/labels are 2-D: samples x predictions -- TODO confirm).
        rows = np.arange(len(scores))[:, None]
        order = np.argsort(scores)[:, ::-1]
        scores, labels = scores[rows, order], labels[rows, order]

    logger.info('Finish Predicting')
    score_path, label_path = output_res(
        data_cnf['output']['res'],
        f'{model_name}-{data_name}{tree_id}',
        scores, labels, output_suffix)

    log_results(score_path, label_path, dry_run)
Exemple #6
0
def random_forest_train(
    data_cnf, data_cnf_path, model_cnf, model_cnf_path,
    emb_init, model_path, tree_id, output_suffix, dry_run,
    num_tree,
):
    """Train one ``FastAttentionXML`` model per random-forest label partition.

    The unique training labels are split into ``num_tree`` groups.  For every
    group the matching samples are selected, split into train/validation
    parts, binarized with a per-tree binarizer stored under
    ``<labels_binarizer>_RF_<i>``, and finally used to train a model whose
    output suffix is ``<output_suffix>-<i>``.  ``log_config`` is called once
    after all trees have been processed.

    Args:
        data_cnf: data configuration; ``valid.size`` and ``labels_binarizer``
            are read here (plus whatever ``load_dataset`` reads).
        data_cnf_path, model_cnf_path: forwarded to ``log_config``.
        model_cnf: model configuration; must contain ``cluster``.
        emb_init, model_path: accepted but not used by this function.
        tree_id: forwarded to ``FastAttentionXML``.
        output_suffix: base suffix of the per-tree models.
        dry_run: when true the models are constructed but not trained.
        num_tree: number of label partitions / models.

    Raises:
        Exception: if ``data_cnf['valid']`` has no ``size`` entry, or if
            ``model_cnf`` has no ``cluster`` section.
    """
    # Reject unsupported configurations before any data is loaded or any
    # binarizer file is written.
    if 'size' not in data_cnf['valid']:
        raise Exception("Setting valid set explicitly is not "
                        "supported random forest mode.")
    if 'cluster' not in model_cnf:
        raise Exception("AttentionXML is not currently supported for "
                        "random forest mode")

    indices_list = []
    train_xs = []
    valid_xs = []
    train_labels_list = []
    valid_labels_list = []
    train_ys = []
    valid_ys = []
    mlb_list = []

    all_x, all_labels = load_dataset(data_cnf)

    unique_labels = get_unique_labels(all_labels)
    splitted_labels = split_labels(unique_labels, num_tree)

    # Select the samples belonging to each label group.
    for labels in splitted_labels:
        indices = get_splitted_samples(labels, all_labels)
        indices_list.append(indices)
        train_xs.append(all_x[indices])
        train_labels_list.append(all_labels[indices])

    # Split every tree's samples into train/validation parts.  Dedicated
    # loop names keep the full data set (all_x / all_labels) intact.
    valid_size = data_cnf['valid']['size']
    for i, (tree_x, tree_labels) in enumerate(zip(train_xs, train_labels_list)):
        tree_train_x, tree_valid_x, tree_train_labels, tree_valid_labels = train_test_split(
            tree_x, tree_labels, test_size=valid_size,
        )
        train_xs[i] = tree_train_x
        train_labels_list[i] = tree_train_labels
        valid_xs.append(tree_valid_x)
        valid_labels_list.append(tree_valid_labels)

    labels_binarizer_path = data_cnf['labels_binarizer']
    # The global binarizer must cover every label of the data set, not just
    # the labels that happen to occur in the last tree's split.
    get_mlb(labels_binarizer_path, all_labels)

    for i, labels in enumerate(splitted_labels):
        filename = f"{labels_binarizer_path}_RF_{i}"
        mlb_tree = get_mlb(filename, labels[None, ...])
        mlb_list.append(mlb_tree)
        logger.info(f"Number of labels of Tree {i}: {len(labels):,}")
        logger.info(f"Number of Training Set of Tree {i}: {len(train_xs[i]):,}")
        logger.info(f"Number of Validation Set of Tree {i}: {len(valid_xs[i]):,}")

        # stderr is silenced during the transforms, presumably to hide
        # warnings about labels outside this tree -- TODO confirm.
        with redirect_stderr(None):
            train_y = mlb_tree.transform(train_labels_list[i])
            valid_y = mlb_tree.transform(valid_labels_list[i])

        train_ys.append(train_y)
        valid_ys.append(valid_y)

    # Training happens once, after every tree's data has been prepared, so
    # each tree is trained exactly one time.
    logger.info('Training')
    for i, (train_x, train_y, valid_x, valid_y, indices) in enumerate(zip(
        train_xs, train_ys, valid_xs, valid_ys, indices_list
    )):
        model = FastAttentionXML(
            len(mlb_list[i].classes_), data_cnf, model_cnf, tree_id,
            f"{output_suffix}-{i}")

        if not dry_run:
            logger.info(f"Start Training RF {i}")
            model.train(train_x, train_y, valid_x, valid_y, mlb_list[i], indices)
            logger.info(f"Finish Training RF {i}")

    log_config(data_cnf_path, model_cnf_path, dry_run)
Exemple #7
0
def default_train(
    data_cnf, data_cnf_path, model_cnf, model_cnf_path,
    emb_init, model_path, tree_id, output_suffix, dry_run,
):
    """Train a single model on the whole data set.

    Builds a ``FastAttentionXML`` model when ``model_cnf`` contains
    ``cluster``; otherwise builds an ``AttentionRNN``-based ``Model``.  The
    validation set is either split off the training data
    (``data_cnf['valid']['size']``) or loaded from the configured files.
    ``log_config`` is called at the end in both cases.

    Args:
        data_cnf: data configuration; ``valid``, ``labels_binarizer``,
            ``model`` and optionally ``propensity`` are read here (plus
            whatever ``load_dataset`` reads).
        data_cnf_path, model_cnf_path: forwarded to ``log_config``.
        model_cnf: model configuration; ``train``, ``valid``, ``model`` and
            optionally ``loss`` / ``cluster`` are read.
        emb_init, model_path: forwarded to ``Model``.
        tree_id, output_suffix: forwarded to ``FastAttentionXML``.
        dry_run: when true the ``Model`` is only saved and the
            ``FastAttentionXML`` model is not trained.
    """
    train_x, train_labels = load_dataset(data_cnf)

    valid_cnf = data_cnf['valid']
    if 'size' in valid_cnf:
        # No random_state is passed, so the split is not seeded here.
        train_x, valid_x, train_labels, valid_labels = train_test_split(
            train_x, train_labels, test_size=valid_cnf['size'],
        )
    else:
        valid_x, valid_labels = get_data(valid_cnf['texts'], valid_cnf['labels'])

    all_labels = np.hstack((train_labels, valid_labels))
    mlb = get_mlb(data_cnf['labels_binarizer'], all_labels)
    # Column sums of the binarized labels over train + validation data.
    freq = mlb.transform(all_labels).sum(axis=0).A1
    train_y = mlb.transform(train_labels)
    valid_y = mlb.transform(valid_labels)
    labels_num = len(mlb.classes_)
    logger.info(F'Number of Labels: {labels_num}')
    logger.info(F'Size of Training Set: {len(train_x):,}')
    logger.info(F'Size of Validation Set: {len(valid_x):,}')

    logger.info('Training')
    if 'cluster' in model_cnf:
        model = FastAttentionXML(labels_num, data_cnf, model_cnf, tree_id, output_suffix)

        # In a dry run the model is constructed but never trained.
        if not dry_run:
            model.train(train_x, train_y, valid_x, valid_y, mlb)
    else:
        pos_weight = None
        if 'propensity' in data_cnf:
            propensity_cnf = data_cnf['propensity']
            pos_weight = get_inv_propensity(
                train_y, propensity_cnf['a'], propensity_cnf['b'])

        train_loader = DataLoader(
            MultiLabelDataset(train_x, train_y),
            model_cnf['train']['batch_size'], shuffle=True, num_workers=4)
        valid_loader = DataLoader(
            MultiLabelDataset(valid_x, valid_y, training=False),
            model_cnf['valid']['batch_size'], num_workers=4)

        # Defaults when no loss section is configured.
        gamma, loss_name = None, 'bce'
        if 'loss' in model_cnf:
            loss_cnf = model_cnf['loss']
            gamma = loss_cnf.get('gamma', 2.0)
            loss_name = loss_cnf['name']

        model = Model(
            network=AttentionRNN, labels_num=labels_num, model_path=model_path,
            emb_init=emb_init, pos_weight=pos_weight, loss_name=loss_name, gamma=gamma,
            freq=freq, **data_cnf['model'], **model_cnf['model'])

        if dry_run:
            # Dry run: skip training and only save the untrained model.
            model.save_model()
        else:
            model.train(train_loader, valid_loader, mlb=mlb, **model_cnf['train'])

    log_config(data_cnf_path, model_cnf_path, dry_run)