Esempio n. 1
0
def transformer_eval(
    data_cnf,
    model_cnf,
    data_name,
    model_name,
    model_path,
    tree_id,
    output_suffix,
    dry_run,
):
    """Predict on the test set with a pretrained transformer and log results.

    The label binarizer and the test texts named in ``data_cnf`` are loaded,
    a ``TransformerXML`` model is restored from ``model_path``, top-k
    predictions are written through ``output_res`` and the resulting file
    paths are passed to ``log_results``.

    Args:
        data_cnf: Data configuration mapping. The keys ``labels_binarizer``,
            ``test``, ``model`` and ``output`` are read.
        model_cnf: Model configuration mapping. The keys ``model`` and
            ``predict`` are read; ``model_cnf['model']['load_model']`` is
            removed in place when present.
        data_name: Data set name used in the output file prefix.
        model_name: Model name used in the output file prefix.
        model_path: Path handed to ``TransformerXML``.
        tree_id: String appended to the output file prefix.
        output_suffix: Suffix forwarded to ``output_res``.
        dry_run: Flag forwarded to ``log_results``.
    """
    logger.info("Loading Test Set")
    mlb = get_mlb(data_cnf["labels_binarizer"])
    num_labels = len(mlb.classes_)
    encoded, _ = get_data(data_cnf["test"]["texts"], None)
    attention_mask = encoded["attention_mask"]
    input_ids = encoded["input_ids"]

    logger.info(f"Size of Test Set: {len(input_ids):,}")

    logger.info("Predicting")
    dataset = MultiLabelDataset(input_ids, attention_mask=attention_mask)
    test_loader = DataLoader(
        dataset,
        model_cnf["predict"]["batch_size"],
        num_workers=4,
    )

    net_cnf = model_cnf["model"]
    network_cls = MODEL_TYPE[net_cnf["base"]]
    network = network_cls.from_pretrained(
        net_cnf["pretrained"],
        num_labels=num_labels,
    )

    # load_model is passed explicitly below, so a configured value is dropped
    # first to avoid handing the same keyword to TransformerXML twice.
    net_cnf.pop('load_model', None)
    model = TransformerXML(
        network,
        model_path,
        load_model=True,
        **data_cnf["model"],
        **net_cnf,
    )

    top_k = model_cnf["predict"].get("k", 100)
    scores, label_idx = model.predict(test_loader, k=top_k)
    # Map predicted class indices back to the label names.
    labels = mlb.classes_[label_idx]

    logger.info("Finish Predicting")
    result_prefix = f"{model_name}-{data_name}{tree_id}"
    score_path, label_path = output_res(
        data_cnf["output"]["res"],
        result_prefix,
        scores,
        labels,
        output_suffix,
    )

    log_results(score_path, label_path, dry_run)
Esempio n. 2
0
def random_forest_eval(
    data_cnf, model_cnf, data_name, model_name, model_path, emb_init,
    tree_id, output_suffix, dry_run, num_tree,
):
    """Evaluate a random forest of FastAttentionXML models on the test set.

    One label binarizer per tree is loaded from
    ``f"{data_cnf['labels_binarizer']}_RF_{i}"``. Every tree predicts its own
    top labels, the per-tree predictions are concatenated column-wise and
    each row is re-sorted by descending score before the results are written
    with ``output_res`` and reported through ``log_results``.

    Args:
        data_cnf: Data configuration mapping. The keys ``labels_binarizer``,
            ``test`` and ``output`` are read.
        model_cnf: Model configuration mapping. It must contain ``cluster``;
            ``model_cnf['predict']['rf_k']`` sets the number of labels
            predicted per tree (default ``100 // num_tree``).
        data_name: Data set name used in the output file prefix.
        model_name: Model name used in the output file prefix.
        model_path: Unused here; kept for a uniform evaluator signature.
        emb_init: Unused here; kept for a uniform evaluator signature.
        tree_id: String appended to the output file prefix and forwarded to
            ``FastAttentionXML``.
        output_suffix: Suffix of the output files; tree ``i`` uses
            ``f"{output_suffix}-{i}"`` for its model.
        dry_run: Flag forwarded to ``log_results``.
        num_tree: Number of trees in the forest.

    Raises:
        Exception: If ``model_cnf`` has no ``cluster`` section.
        ValueError: If ``num_tree`` is smaller than 1.
    """
    # Fail fast, before any data is loaded, when the mode is unsupported.
    if 'cluster' not in model_cnf:
        raise Exception("AttentionXML is not currently supported random forest mode")
    if num_tree < 1:
        raise ValueError(f"num_tree must be at least 1, got {num_tree}")

    logger.info('Loading Test Set')
    test_x, _ = get_data(data_cnf['test']['texts'], None)
    logger.info(F'Size of Test Set: {len(test_x):,}')

    logger.info('Predicting')
    labels_binarizer_path = data_cnf['labels_binarizer']
    tree_mlbs = [
        get_mlb(f"{labels_binarizer_path}_RF_{tree_no}")
        for tree_no in range(num_tree)
    ]

    # The per-tree budget does not depend on the tree, so compute it once.
    rf_k = model_cnf['predict'].get('rf_k', 100 // num_tree)

    scores_list = []
    labels_list = []
    for tree_no, tree_mlb in enumerate(tree_mlbs):
        logger.info(f"Predicting RF {tree_no}")
        model = FastAttentionXML(
            len(tree_mlb.classes_), data_cnf, model_cnf, tree_id,
            f"{output_suffix}-{tree_no}")
        tree_scores, tree_label_idx = model.predict(test_x, rf_k)
        scores_list.append(tree_scores)
        # Each tree has its own label space; translate indices to label names
        # with that tree's binarizer before merging.
        labels_list.append(tree_mlb.classes_[tree_label_idx])
        logger.info(f"Finish Predicting RF {tree_no}")

    scores = np.hstack(scores_list)
    labels = np.hstack(labels_list)

    # Re-rank every row of the merged predictions by descending score.
    rows = np.arange(len(scores))[:, None]
    order = np.argsort(scores)[:, ::-1]
    scores = scores[rows, order]
    labels = labels[rows, order]

    logger.info('Finish Predicting')
    score_path, label_path = output_res(data_cnf['output']['res'],
                                        f'{model_name}-{data_name}{tree_id}',
                                        scores, labels, output_suffix)

    log_results(score_path, label_path, dry_run)
Esempio n. 3
0
def default_eval(
    data_cnf, model_cnf, data_name, model_name, model_path, emb_init,
    tree_id, output_suffix, dry_run,
):
    """Predict on the test set with a single model and log the results.

    When ``model_cnf`` has a ``cluster`` section a ``FastAttentionXML`` model
    is used; otherwise an ``AttentionRNN``-based ``Model`` is restored from
    ``model_path``. Predicted class indices are mapped to label names, the
    results are written with ``output_res`` and reported via ``log_results``.

    Args:
        data_cnf: Data configuration mapping. The keys ``labels_binarizer``,
            ``test``, ``model`` and ``output`` are read.
        model_cnf: Model configuration mapping. The keys ``model`` and
            ``predict`` are read, plus the optional ``cluster`` and ``loss``;
            ``model_cnf['model']['load_model']`` is removed in place.
        data_name: Data set name used in the output file prefix.
        model_name: Model name used in the output file prefix.
        model_path: Path handed to ``Model`` (non-cluster mode only).
        emb_init: Embedding initialisation handed to ``Model``.
        tree_id: String appended to the output file prefix.
        output_suffix: Suffix forwarded to ``output_res``.
        dry_run: Flag forwarded to ``log_results``.
    """
    logger.info('Loading Test Set')
    mlb = get_mlb(data_cnf['labels_binarizer'])
    labels_num = len(mlb.classes_)
    test_x, _ = get_data(data_cnf['test']['texts'], None)
    logger.info(F'Size of Test Set: {len(test_x):,}')

    logger.info('Predicting')
    # load_model is passed explicitly to Model, so drop any configured value.
    model_cnf['model'].pop('load_model', None)
    if 'cluster' in model_cnf:
        model = FastAttentionXML(labels_num, data_cnf, model_cnf,
                                 tree_id, output_suffix)
        top_k = model_cnf['predict'].get('k', 100)
        scores, label_idx = model.predict(test_x, top_k)
    else:
        test_loader = DataLoader(
            MultiLabelDataset(test_x),
            model_cnf['predict']['batch_size'],
            num_workers=4)

        # Without a loss section fall back to BCE with no gamma.
        gamma, loss_name = None, 'bce'
        if 'loss' in model_cnf:
            loss_cnf = model_cnf['loss']
            gamma = loss_cnf.get('gamma', 1.0)
            loss_name = loss_cnf['name']

        model = Model(
            network=AttentionRNN, labels_num=labels_num,
            model_path=model_path, emb_init=emb_init,
            load_model=True, loss_name=loss_name, gamma=gamma,
            **data_cnf['model'], **model_cnf['model'])

        top_k = model_cnf['predict'].get('k', 100)
        scores, label_idx = model.predict(test_loader, k=top_k)

    # Both branches return class indices; convert them to label names.
    labels = mlb.classes_[label_idx]

    logger.info('Finish Predicting')
    result_prefix = f'{model_name}-{data_name}{tree_id}'
    score_path, label_path = output_res(data_cnf['output']['res'],
                                        result_prefix,
                                        scores, labels, output_suffix)

    log_results(score_path, label_path, dry_run)
Esempio n. 4
0
def _load_hierarchy_edges(hierarchy_path, classes):
    """Read (parent_id, child_id) label edges from a hierarchy file.

    Each non-blank line holds whitespace-separated labels: the first is the
    parent, the rest are its children. Labels missing from ``classes`` are
    ignored. Ids are positions in ``classes`` (first occurrence).
    """
    # A dict lookup replaces the per-token list scan (list.index / `in`),
    # which was quadratic in the number of labels.
    class_to_id = {}
    for idx, name in enumerate(classes):
        class_to_id.setdefault(name, idx)

    edges = set()
    with open(hierarchy_path) as fin:
        for line in fin:
            tokens = line.split()
            if not tokens:
                # Blank lines (e.g. a trailing newline) carry no edge.
                continue
            parent_id = class_to_id.get(tokens[0])
            if parent_id is None:
                continue
            for child in tokens[1:]:
                child_id = class_to_id.get(child)
                if child_id is not None:
                    edges.add((parent_id, child_id))
    return edges


def main(data_cnf, model_cnf, mode, reg):
    """Train and/or evaluate a MATCH model described by two YAML configs.

    Args:
        data_cnf: Path to the data configuration YAML file.
        model_cnf: Path to the model configuration YAML file.
        mode: ``'train'``, ``'eval'`` or ``None``; ``None`` runs training
            followed by evaluation with the freshly trained model.
        reg: When truthy, label hierarchy edges are read from
            ``data_cnf['hierarchy']`` and passed to the model; otherwise an
            empty edge set is used. The flag itself is also forwarded to
            ``Model`` as ``reg``.
    """
    yaml = YAML(typ='safe')
    data_cnf, model_cnf = yaml.load(Path(data_cnf)), yaml.load(Path(model_cnf))
    model, model_name, data_name = None, model_cnf['name'], data_cnf['name']
    model_path = os.path.join(model_cnf['path'], F'{model_name}-{data_name}')
    emb_init = get_word_emb(data_cnf['embedding']['emb_init'])
    logger.info(F'Model Name: {model_name}')

    if mode is None or mode == 'train':
        logger.info('Loading Training and Validation Set')
        train_x, train_labels = get_data(data_cnf['train']['texts'],
                                         data_cnf['train']['labels'])
        if 'size' in data_cnf['valid']:
            # Carve the validation set out of the training data.
            random_state = data_cnf['valid'].get('random_state', 1240)
            train_x, valid_x, train_labels, valid_labels = train_test_split(
                train_x,
                train_labels,
                test_size=data_cnf['valid']['size'],
                random_state=random_state)
        else:
            valid_x, valid_labels = get_data(data_cnf['valid']['texts'],
                                             data_cnf['valid']['labels'])
        mlb = get_mlb(data_cnf['labels_binarizer'],
                      np.hstack((train_labels, valid_labels)))
        train_y, valid_y = mlb.transform(train_labels), mlb.transform(
            valid_labels)
        labels_num = len(mlb.classes_)
        logger.info(F'Number of Labels: {labels_num}')
        logger.info(F'Size of Training Set: {len(train_x)}')
        logger.info(F'Size of Validation Set: {len(valid_x)}')

        edges = set()
        if reg:
            edges = _load_hierarchy_edges(data_cnf['hierarchy'],
                                          mlb.classes_.tolist())
            logger.info(F'Number of Edges: {len(edges)}')

        logger.info('Training')
        train_loader = DataLoader(MultiLabelDataset(train_x, train_y),
                                  model_cnf['train']['batch_size'],
                                  shuffle=True,
                                  num_workers=4)
        valid_loader = DataLoader(MultiLabelDataset(valid_x,
                                                    valid_y,
                                                    training=True),
                                  model_cnf['valid']['batch_size'],
                                  num_workers=4)
        model = Model(network=MATCH,
                      labels_num=labels_num,
                      model_path=model_path,
                      emb_init=emb_init,
                      mode='train',
                      reg=reg,
                      hierarchy=edges,
                      **data_cnf['model'],
                      **model_cnf['model'])
        opt_params = {
            'lr': model_cnf['train']['learning_rate'],
            'betas':
            (model_cnf['train']['beta1'], model_cnf['train']['beta2']),
            'weight_decay': model_cnf['train']['weight_decay']
        }
        # opt_params is passed explicitly in addition to the raw train config.
        model.train(train_loader,
                    valid_loader,
                    opt_params=opt_params,
                    **model_cnf['train'])
        logger.info('Finish Training')

    if mode is None or mode == 'eval':
        logger.info('Loading Test Set')
        mlb = get_mlb(data_cnf['labels_binarizer'])
        labels_num = len(mlb.classes_)
        test_x, _ = get_data(data_cnf['test']['texts'], None)
        logger.info(F'Size of Test Set: {len(test_x)}')

        logger.info('Predicting')
        test_loader = DataLoader(MultiLabelDataset(test_x),
                                 model_cnf['predict']['batch_size'],
                                 num_workers=4)
        # Reuse the model trained above when there is one.
        if model is None:
            model = Model(network=MATCH,
                          labels_num=labels_num,
                          model_path=model_path,
                          emb_init=emb_init,
                          mode='eval',
                          **data_cnf['model'],
                          **model_cnf['model'])
        scores, labels = model.predict(test_loader,
                                       k=model_cnf['predict'].get('k', 100))
        logger.info('Finish Predicting')
        # Map predicted class indices back to label names.
        labels = mlb.classes_[labels]
        output_res(data_cnf['output']['res'], F'{model_name}-{data_name}',
                   scores, labels)
Esempio n. 5
0
def main(data_cnf, model_cnf, mode, tree_id):
    """Train and/or evaluate an AttentionXML model from two YAML configs.

    Args:
        data_cnf: Path to the data configuration YAML file.
        model_cnf: Path to the model configuration YAML file. When the loaded
            config has a ``cluster`` section ``FastAttentionXML`` is used,
            otherwise an ``AttentionRNN``-based ``Model``.
        mode: ``'train'``, ``'eval'`` or ``None``; ``None`` runs training and
            then evaluation with the freshly trained model.
        tree_id: Optional tree identifier; when not ``None`` the string
            ``-Tree-<tree_id>`` is appended to the model path and to the
            output file prefix.
    """
    if tree_id is None:
        tree_id = ''
    else:
        tree_id = F'-Tree-{tree_id}'

    yaml = YAML(typ='safe')
    data_cnf = yaml.load(Path(data_cnf))
    model_cnf = yaml.load(Path(model_cnf))

    model = None
    model_name = model_cnf['name']
    data_name = data_cnf['name']
    model_path = os.path.join(model_cnf['path'], F'{model_name}-{data_name}{tree_id}')
    emb_init = get_word_emb(data_cnf['embedding']['emb_init'])
    logger.info(F'Model Name: {model_name}')

    run_train = mode is None or mode == 'train'
    run_eval = mode is None or mode == 'eval'

    if run_train:
        logger.info('Loading Training and Validation Set')
        train_x, train_labels = get_data(data_cnf['train']['texts'], data_cnf['train']['labels'])
        if 'size' not in data_cnf['valid']:
            valid_x, valid_labels = get_data(data_cnf['valid']['texts'], data_cnf['valid']['labels'])
        else:
            # Carve the validation set out of the training data.
            split_seed = data_cnf['valid'].get('random_state', 1240)
            train_x, valid_x, train_labels, valid_labels = train_test_split(
                train_x, train_labels,
                test_size=data_cnf['valid']['size'],
                random_state=split_seed)
        all_labels = np.hstack((train_labels, valid_labels))
        mlb = get_mlb(data_cnf['labels_binarizer'], all_labels)
        train_y = mlb.transform(train_labels)
        valid_y = mlb.transform(valid_labels)
        labels_num = len(mlb.classes_)
        logger.info(F'Number of Labels: {labels_num}')
        logger.info(F'Size of Training Set: {len(train_x)}')
        logger.info(F'Size of Validation Set: {len(valid_x)}')

        logger.info('Training')
        if 'cluster' in model_cnf:
            model = FastAttentionXML(labels_num, data_cnf, model_cnf, tree_id)
            model.train(train_x, train_y, valid_x, valid_y, mlb)
        else:
            train_set = MultiLabelDataset(train_x, train_y)
            train_loader = DataLoader(train_set, model_cnf['train']['batch_size'],
                                      shuffle=True, num_workers=4)
            valid_set = MultiLabelDataset(valid_x, valid_y, training=False)
            valid_loader = DataLoader(valid_set, model_cnf['valid']['batch_size'],
                                      num_workers=4)
            model = Model(network=AttentionRNN, labels_num=labels_num,
                          model_path=model_path, emb_init=emb_init,
                          **data_cnf['model'], **model_cnf['model'])
            model.train(train_loader, valid_loader, **model_cnf['train'])
        logger.info('Finish Training')

    if run_eval:
        logger.info('Loading Test Set')
        mlb = get_mlb(data_cnf['labels_binarizer'])
        labels_num = len(mlb.classes_)
        test_x, _ = get_data(data_cnf['test']['texts'], None)
        logger.info(F'Size of Test Set: {len(test_x)}')

        logger.info('Predicting')
        # In both branches a model trained above is reused when available.
        if 'cluster' in model_cnf:
            if model is None:
                model = FastAttentionXML(labels_num, data_cnf, model_cnf, tree_id)
            scores, label_idx = model.predict(test_x)
        else:
            test_loader = DataLoader(MultiLabelDataset(test_x),
                                     model_cnf['predict']['batch_size'],
                                     num_workers=4)
            if model is None:
                model = Model(network=AttentionRNN, labels_num=labels_num,
                              model_path=model_path, emb_init=emb_init,
                              **data_cnf['model'], **model_cnf['model'])
            top_k = model_cnf['predict'].get('k', 100)
            scores, label_idx = model.predict(test_loader, k=top_k)
        logger.info('Finish Predicting')
        # Map predicted class indices back to label names.
        labels = mlb.classes_[label_idx]
        output_res(data_cnf['output']['res'], F'{model_name}-{data_name}{tree_id}', scores, labels)
Esempio n. 6
0
def spectral_clustering_eval(
    data_cnf, model_cnf, data_name, model_name, model_path, emb_init,
    tree_id, output_suffix, dry_run,
):
    """Evaluate per-cluster models on the test set and log merged results.

    Without a ``cluster`` section in ``model_cnf`` one ``AttentionRNN`` model
    per spectral cluster is restored from ``f'{model_path}-{i}'`` together
    with its own label binarizer. Every model predicts an equal share of the
    configured ``k``, the predictions are concatenated column-wise and each
    row is re-sorted by descending score. With a ``cluster`` section a single
    ``FastAttentionXML`` model is used instead.

    Args:
        data_cnf: Data configuration mapping. The keys ``labels_binarizer``,
            ``test``, ``model`` and ``output`` are read.
        model_cnf: Model configuration mapping. The keys
            ``spectral_clustering``, ``model`` and ``predict`` are read, plus
            the optional ``cluster``.
        data_name: Data set name used in the output file prefix.
        model_name: Model name used in the output file prefix.
        model_path: Base path of the per-cluster models.
        emb_init: Embedding initialisation handed to ``Model``.
        tree_id: String appended to the output file prefix.
        output_suffix: Suffix forwarded to ``output_res``; its upper-cased,
            underscore-separated form selects the per-cluster binarizers.
        dry_run: Flag forwarded to ``log_results``.

    Raises:
        ValueError: If the configured number of clusters is smaller than 1
            in the per-cluster mode.
    """
    n_clusters = model_cnf['spectral_clustering']['num_clusters']
    labels_binarizer_path = data_cnf['labels_binarizer']

    logger.info('Loading Test Set')
    test_x, _ = get_data(data_cnf['test']['texts'], None)
    logger.info(F'Size of Test Set: {len(test_x):,}')

    logger.info('Predicting')
    if 'cluster' not in model_cnf:
        if n_clusters < 1:
            raise ValueError(
                f"num_clusters must be at least 1, got {n_clusters}")

        suffix = output_suffix.upper().replace('-', '_')
        cluster_mlbs = [
            get_mlb(f"{labels_binarizer_path}_{suffix}_{cluster_no}")
            for cluster_no in range(n_clusters)
        ]

        test_loader = DataLoader(
            MultiLabelDataset(test_x),
            model_cnf['predict']['batch_size'],
            num_workers=4)

        # Each cluster gets the same share of k, so compute it once.
        k = model_cnf['predict'].get('k', 100) // n_clusters

        # load_model is passed explicitly below; a configured value would
        # otherwise reach Model twice and raise a TypeError.
        model_kwargs = {
            key: value
            for key, value in model_cnf['model'].items()
            if key != 'load_model'
        }

        scores_list = []
        labels_list = []
        for cluster_no, cluster_mlb in enumerate(cluster_mlbs):
            logger.info(f"Predicting Cluster {cluster_no}")
            model = Model(
                network=AttentionRNN, labels_num=len(cluster_mlb.classes_),
                model_path=f'{model_path}-{cluster_no}', emb_init=emb_init,
                load_model=True,
                **data_cnf['model'], **model_kwargs)

            cluster_scores, cluster_label_idx = model.predict(test_loader, k=k)
            scores_list.append(cluster_scores)
            # Every cluster has its own label space; translate indices with
            # that cluster's binarizer before merging.
            labels_list.append(cluster_mlb.classes_[cluster_label_idx])
            logger.info(f"Finish Predicting Cluster {cluster_no}")

        scores = np.hstack(scores_list)
        labels = np.hstack(labels_list)

        # Re-rank every row of the merged predictions by descending score.
        rows = np.arange(len(scores))[:, None]
        order = np.argsort(scores)[:, ::-1]
        scores = scores[rows, order]
        labels = labels[rows, order]

    else:
        mlb = get_mlb(data_cnf['labels_binarizer'])
        model = FastAttentionXML(len(mlb.classes_), data_cnf, model_cnf,
                                 tree_id, output_suffix)

        scores, labels = model.predict(test_x, model_cnf['predict'].get('k', 100))
        labels = mlb.classes_[labels]

    logger.info('Finish Predicting')
    score_path, label_path = output_res(data_cnf['output']['res'],
                                        f'{model_name}-{data_name}{tree_id}',
                                        scores, labels, output_suffix)

    log_results(score_path, label_path, dry_run)
Esempio n. 7
0
def main(data_cnf, model_cnf, mode):
    """Train and/or evaluate a CorNet-augmented model from two YAML configs.

    Args:
        data_cnf: Path to the data configuration YAML file.
        model_cnf: Path to the model configuration YAML file. Its file name
            (without extension) is used in the log file name. When the loaded
            config has a ``gpipe`` key a ``GPipeModel`` is built, otherwise a
            ``Model`` around ``model_dict[model_name]``.
        mode: ``'train'``, ``'eval'`` or ``None``; ``None`` runs training and
            then evaluation with the freshly trained model.
    """
    config_stem = os.path.split(model_cnf)[1].split(".")[0]
    yaml = YAML(typ='safe')
    data_cnf, model_cnf = yaml.load(Path(data_cnf)), yaml.load(Path(model_cnf))

    n_cornet_blocks = model_cnf['model']['n_cornet_blocks']
    cornet_dim = model_cnf['model']['cornet_dim']

    # Set the log file location.
    logfile("./logs/logfile_{0}_cornet_{1}_cornet_dim_{2}.log".format(
        config_stem, n_cornet_blocks, cornet_dim))

    model, model_name, data_name = None, model_cnf['name'], data_cnf['name']
    model_path = os.path.join(
        model_cnf['path'],
        F'{model_name}-{data_name}-{n_cornet_blocks}-{cornet_dim}'
    )
    emb_init = get_word_emb(data_cnf['embedding']['emb_init'])
    logger.info(F'Model Name: {model_name}')

    if mode is None or mode == 'train':
        logger.info('Loading Training and Validation Set')
        train_x, train_labels = get_data(data_cnf['train']['texts'],
                                         data_cnf['train']['labels'])
        if 'size' in data_cnf['valid']:
            # Carve the validation set out of the training data.
            random_state = data_cnf['valid'].get('random_state', 1240)
            train_x, valid_x, train_labels, valid_labels = train_test_split(
                train_x,
                train_labels,
                test_size=data_cnf['valid']['size'],
                random_state=random_state)
        else:
            valid_x, valid_labels = get_data(data_cnf['valid']['texts'],
                                             data_cnf['valid']['labels'])
        mlb = get_mlb(data_cnf['labels_binarizer'],
                      np.hstack((train_labels, valid_labels)))
        train_y, valid_y = mlb.transform(train_labels), mlb.transform(
            valid_labels)
        labels_num = len(mlb.classes_)
        logger.info(F'Number of Labels: {labels_num}')
        logger.info(F'Size of Training Set: {len(train_x)}')
        logger.info(F'Size of Validation Set: {len(valid_x)}')

        logger.info('Training')
        train_loader = DataLoader(MultiLabelDataset(train_x, train_y),
                                  model_cnf['train']['batch_size'],
                                  shuffle=True,
                                  num_workers=4)
        valid_loader = DataLoader(MultiLabelDataset(valid_x,
                                                    valid_y,
                                                    training=True),
                                  model_cnf['valid']['batch_size'],
                                  num_workers=4)

        if 'gpipe' not in model_cnf:
            model = Model(network=model_dict[model_name],
                          labels_num=labels_num,
                          model_path=model_path,
                          emb_init=emb_init,
                          **data_cnf['model'],
                          **model_cnf['model'])
        else:
            model = GPipeModel(model_name,
                               labels_num=labels_num,
                               model_path=model_path,
                               emb_init=emb_init,
                               **data_cnf['model'],
                               **model_cnf['model'])
        loss, p1, p5 = model.train(train_loader, valid_loader,
                                   **model_cnf['train'])

        # The three training curves share one file-name suffix.
        curve_suffix = "{0}_cornet_{1}_cornet_dim_{2}.npy".format(
            model_name, n_cornet_blocks, cornet_dim)
        np.save(model_cnf['np_loss'] + curve_suffix, loss)
        np.save(model_cnf['np_p1'] + curve_suffix, p1)
        np.save(model_cnf['np_p5'] + curve_suffix, p5)
        logger.info('Finish Training')

    if mode is None or mode == 'eval':
        logger.info('Loading Test Set')
        # A %s placeholder is required: an extra positional argument without
        # one makes the logging call fail to format and the path is lost.
        logger.info('model path: %s', model_path)
        mlb = get_mlb(data_cnf['labels_binarizer'])
        labels_num = len(mlb.classes_)
        test_x, _ = get_data(data_cnf['test']['texts'], None)
        logger.info(F'Size of Test Set: {len(test_x)}')

        logger.info('Predicting')
        test_loader = DataLoader(MultiLabelDataset(test_x),
                                 model_cnf['predict']['batch_size'],
                                 num_workers=4)
        # Reuse the model trained above when there is one.
        if model is None:
            if 'gpipe' not in model_cnf:
                model = Model(network=model_dict[model_name],
                              labels_num=labels_num,
                              model_path=model_path,
                              emb_init=emb_init,
                              **data_cnf['model'],
                              **model_cnf['model'])
            else:
                model = GPipeModel(model_name,
                                   labels_num=labels_num,
                                   model_path=model_path,
                                   emb_init=emb_init,
                                   **data_cnf['model'],
                                   **model_cnf['model'])
        scores, labels = model.predict(test_loader,
                                       k=model_cnf['predict'].get('k', 3801))
        logger.info('Finish Predicting')
        # Map predicted class indices back to label names.
        labels = mlb.classes_[labels]
        output_res(data_cnf['output']['res'], F'{model_name}-{data_name}',
                   scores, labels)
Esempio n. 8
0
def splitting_head_tail_eval(
    data_cnf,
    model_cnf,
    data_name,
    model_name,
    model_path,
    emb_init,
    tree_id,
    output_suffix,
    dry_run,
    split_ratio,
    head_labels,
    tail_labels,
    head_model,
    tail_model,
):
    """Evaluate separate head-label and tail-label models on the test set.

    The head and tail models each predict their own top labels, which are
    mapped to label names with the matching binarizer
    (``<labels_binarizer>_h_<split_ratio>`` / ``..._t_<split_ratio>``). The
    two prediction sets are concatenated column-wise, every row is re-sorted
    by descending score, and the result is written with ``output_res`` and
    reported through ``log_results``.

    Args:
        data_cnf: Data configuration mapping. The keys ``labels_binarizer``,
            ``test``, ``train``, ``model`` and ``output`` are read.
        model_cnf: Model configuration mapping. It must not contain
            ``cluster``. ``model_cnf['predict']`` may set ``top_head_k``
            (default 30) and ``top_tail_k`` (default 70).
        data_name: Data set name used in the output file prefix.
        model_name: Model name used in the output file prefix.
        model_path: Base path; models are restored from
            ``f'{model_path}-head'`` and ``f'{model_path}-tail'``.
        emb_init: Embedding initialisation handed to ``Model``.
        tree_id: String appended to the output file prefix.
        output_suffix: Suffix forwarded to ``output_res``.
        dry_run: Flag forwarded to ``log_results``.
        split_ratio: Head/tail split ratio; selects the binarizer files and
            is forwarded to ``get_head_tail_labels``.
        head_labels: Head labels, or ``None`` to recompute both head and tail
            labels from the training labels.
        tail_labels: Tail labels; replaced when ``head_labels`` is ``None``.
        head_model: Ready head model, or ``None`` to restore it from disk.
        tail_model: Ready tail model, or ``None`` to restore it from disk.

    Raises:
        Exception: If ``model_cnf`` contains a ``cluster`` section.
    """
    # Fail fast, before any data is loaded, when the mode is unsupported.
    if 'cluster' in model_cnf:
        raise Exception("FastAttention is not currently supported for "
                        "splited head and tail dataset")

    logger.info('Loading Test Set')
    test_x, _ = get_data(data_cnf['test']['texts'], None)
    logger.info(F'Size of Test Set: {len(test_x):,}')

    labels_binarizer_path = data_cnf['labels_binarizer']
    mlb_h = get_mlb(f"{labels_binarizer_path}_h_{split_ratio}")
    mlb_t = get_mlb(f"{labels_binarizer_path}_t_{split_ratio}")

    if head_labels is None:
        _, train_labels = get_data(data_cnf['train']['texts'],
                                   data_cnf['train']['labels'])
        head_labels, _, tail_labels, _ = get_head_tail_labels(
            train_labels,
            split_ratio,
        )

    logger.info('Predicting')
    test_loader = DataLoader(MultiLabelDataset(test_x),
                             model_cnf['predict']['batch_size'],
                             num_workers=4)

    # load_model is passed explicitly below; a configured value would
    # otherwise reach Model twice and raise a TypeError.
    model_kwargs = {
        key: value
        for key, value in model_cnf['model'].items()
        if key != 'load_model'
    }

    if head_model is None:
        head_model = Model(network=AttentionRNN,
                           labels_num=len(head_labels),
                           model_path=f'{model_path}-head',
                           emb_init=emb_init,
                           load_model=True,
                           **data_cnf['model'],
                           **model_kwargs)

    logger.info('Predicting Head Model')
    h_k = model_cnf['predict'].get('top_head_k', 30)
    scores_h, labels_h = head_model.predict(test_loader, k=h_k)
    # Head indices refer to the head binarizer's label space.
    labels_h = mlb_h.classes_[labels_h]
    logger.info('Finish Predicting Head Model')

    if tail_model is None:
        tail_model = Model(network=AttentionRNN,
                           labels_num=len(tail_labels),
                           model_path=f'{model_path}-tail',
                           emb_init=emb_init,
                           load_model=True,
                           **data_cnf['model'],
                           **model_kwargs)

    logger.info('Predicting Tail Model')
    t_k = model_cnf['predict'].get('top_tail_k', 70)
    scores_t, labels_t = tail_model.predict(test_loader, k=t_k)
    # Tail indices refer to the tail binarizer's label space.
    labels_t = mlb_t.classes_[labels_t]
    logger.info('Finish Predicting Tail Model')

    scores = np.c_[scores_h, scores_t]
    labels = np.c_[labels_h, labels_t]

    # Re-rank every row of the merged predictions by descending score.
    rows = np.arange(len(scores))[:, None]
    order = np.argsort(scores)[:, ::-1]
    scores = scores[rows, order]
    labels = labels[rows, order]

    logger.info('Finish Predicting')
    score_path, label_path = output_res(data_cnf['output']['res'],
                                        f'{model_name}-{data_name}{tree_id}',
                                        scores, labels, output_suffix)

    log_results(score_path, label_path, dry_run)