Example #1
import os
import torch

# Repo-local helpers (configs, fix_seed, prepare_data, train_and_val,
# train_gc, train_syn, evaluate, GcnEncoderGraph, GcnEncoderNode) are
# assumed to be imported from the surrounding project.
def main():

    args = configs.arg_parse()
    fix_seed(args.seed)

    # Load the dataset
    data = prepare_data(args.dataset, args.train_ratio, args.input_dim,
                        args.seed)

    # Define and train the model
    if args.dataset in ['Cora', 'PubMed']:
        # Look up the model class and its training hyperparameters by name
        # (e.g. hparams_<dataset>_<model> / params_<dataset>_<model>)
        hyperparam = ''.join(['hparams_', args.dataset, '_', args.model])
        param = ''.join(['params_', args.dataset, '_', args.model])
        model = eval(args.model)(input_dim=data.num_features,
                                 output_dim=data.num_classes,
                                 **eval(hyperparam))
        train_and_val(model, data, **eval(param))
        _, test_acc = evaluate(data, model, data.test_mask)
        print('Test accuracy is {:.4f}'.format(test_acc))

    elif args.dataset in ['syn6', 'Mutagenicity']:
        input_dims = data.x.shape[-1]
        model = GcnEncoderGraph(input_dims,
                                args.hidden_dim,
                                args.output_dim,
                                data.num_classes,
                                args.num_gc_layers,
                                bn=args.bn,
                                dropout=args.dropout,
                                args=args)
        train_gc(data, model, args)
        _, test_acc = evaluate(data, model, data.test_mask)
        print('Test accuracy is {:.4f}'.format(test_acc))

    else:
        # For pytorch geometric model
        #model = GCNNet(args.input_dim, args.hidden_dim,
        #       data.num_classes, args.num_gc_layers, args=args)
        model = GcnEncoderNode(data.num_features,
                               args.hidden_dim,
                               args.output_dim,
                               data.num_classes,
                               args.num_gc_layers,
                               bn=args.bn,
                               dropout=args.dropout,
                               args=args)
        train_syn(data, model, args)
        _, test_acc = evaluate(data, model, data.test_mask)
        print('Test accuracy is {:.4f}'.format(test_acc))

    # Save the trained model (torch.save pickles the entire module)
    os.makedirs('models', exist_ok=True)
    model_path = 'models/{}_model_{}.pth'.format(args.model, args.dataset)
    if not os.path.exists(model_path) or args.save:
        torch.save(model, model_path)
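Note that torch.save(model, model_path) pickles the whole module, so loading it back later (as Example #2 does with torch.load) requires the original class definition to be importable. A minimal sketch of the more portable state_dict pattern, using the GcnEncoderNode branch as an illustration and assuming the same constructor arguments are available at load time:

import torch

# Save only the learned weights rather than the pickled module.
torch.save(model.state_dict(), model_path)

# To restore, rebuild the architecture first, then load the weights
# (constructor arguments must match the training-time configuration).
model = GcnEncoderNode(data.num_features,
                       args.hidden_dim,
                       args.output_dim,
                       data.num_classes,
                       args.num_gc_layers,
                       bn=args.bn,
                       dropout=args.dropout,
                       args=args)
model.load_state_dict(torch.load(model_path))
model.eval()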
Example #2
import numpy as np
import torch

# Repo-local helpers (configs, fix_seed, prepare_data, evaluate, test,
# GraphSVX) are assumed to be imported from the surrounding project.
def main():

    args = configs.arg_parse()
    fix_seed(args.seed)

    # Load the dataset
    data = prepare_data(args.dataset, args.train_ratio,
                        args.input_dim, args.seed)

    # Load the full pickled module saved in Example #1
    model_path = 'models/{}_model_{}.pth'.format(args.model, args.dataset)
    model = torch.load(model_path)
    
    # Evaluate the model 
    if args.dataset in ['Cora', 'PubMed']:
        _, test_acc = evaluate(data, model, data.test_mask)
    else: 
        test_acc = test(data, model, data.test_mask)
    print('Test accuracy is {:.4f}'.format(test_acc))

    # Explain it with GraphSVX
    explainer = GraphSVX(data, model, args.gpu)

    # Distinguish graph classification from node classification
    if args.dataset in ['Mutagenicity', 'syn6']:
        explanations = explainer.explain_graphs(args.indexes,
                                                args.hops,
                                                args.num_samples,
                                                args.info,
                                                args.multiclass,
                                                args.fullempty,
                                                args.S,
                                                'graph_classification',
                                                args.feat,
                                                args.coal,
                                                args.g,
                                                args.regu,
                                                True)
    else:
        explanations = explainer.explain(args.indexes,
                                         args.hops,
                                         args.num_samples,
                                         args.info,
                                         args.multiclass,
                                         args.fullempty,
                                         args.S,
                                         args.hv,
                                         args.feat,
                                         args.coal,
                                         args.g,
                                         args.regu,
                                         True)

    print('Sum explanations: ', [np.sum(explanation) for explanation in explanations])
    print('Base value: ', explainer.base_values)
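Both examples call fix_seed(args.seed) before loading data. The helper is repo-local; a plausible sketch, assuming it seeds the Python, NumPy, and PyTorch RNGs (the actual implementation may differ):

import random
import numpy as np
import torch

def fix_seed(seed):
    # Seed every RNG the pipeline may draw from, for reproducible runs.
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)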
Example #3
config = {
    # (earlier hyperparameters omitted from this excerpt)
    'reg_lambda': .1,
    'subsample': .9,
    'min_split_gain': .01,
    'min_child_weight': 2,
    'colsample_bytree': .9,  # Subsample ratio of columns when constructing each tree.
    'scale_pos_weight': 9,  # up-weight the positive class: training data is imbalanced
    'verbose': -1
}

features = list(pd.read_csv(DATA_FOLDER + '/v3/importances.csv', index_col=0).head(800).index)
train_features = [*features, "target"]


train = pd.read_pickle(DATA_FOLDER + '/v3/train.pkl')[train_features]
folds = prepare_folds(train)

models, result = train_folds(folds, config)

test = load_test(DATA_FOLDER + '/v3/test.pkl')[features]
test_target = evaluate(models, test)

print("AUC: %.4f, F1: %.4f" % (result['auc'], result['f1']))

importance = result['importances'].groupby(['feature']) \
    .agg({'importance': 'mean'}) \
    .sort_values(by="importance", ascending=False)

importance.to_csv(DATA_FOLDER + "/v3/importances.csv")

prepare_submission(test_target, "v3_AUC_%.4f_F1_%.4f" % (result['auc'], result['f1']))
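prepare_folds, train_folds, load_test, evaluate, and prepare_submission are project-specific helpers. Judging by the hyperparameters above (min_split_gain, colsample_bytree, scale_pos_weight), the model is LightGBM; a rough sketch of what the fold-training step might look like, with every name below an assumption rather than the project's actual code:

import lightgbm as lgb
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, f1_score

def train_folds_sketch(train, config, n_splits=5):
    # Cross-validated LightGBM training on an imbalanced binary target.
    X = train.drop(columns=['target'])
    y = train['target']
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    models, aucs, f1s = [], [], []
    for train_idx, val_idx in skf.split(X, y):
        model = lgb.LGBMClassifier(**config)
        model.fit(X.iloc[train_idx], y.iloc[train_idx])
        proba = model.predict_proba(X.iloc[val_idx])[:, 1]
        aucs.append(roc_auc_score(y.iloc[val_idx], proba))
        f1s.append(f1_score(y.iloc[val_idx], proba > 0.5))
        models.append(model)
    return models, {'auc': np.mean(aucs), 'f1': np.mean(f1s)}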
Example #4
        # Build the evaluation dataset (the opening of this call mirrors
        # the commented-out predict block below)
        dataset = ECBDataset(
            encodings=encodings,
            batch_indices=batch_indices,
            sentence_map=encoded_sentence_map,
            gold_starts=gold_starts,
            gold_ends=gold_ends,
            cluster_ids=cluster_ids)

        loader = DataLoader(
            dataset,
            batch_size=1,
            shuffle=False
        )

        model.load_state_dict(torch.load(args['model_path']))

        # Print the model architecture
        print(model)

        evaluate(loader, model, device)
    
    # if args['mode'] == 'predict':
    #     all_sentences, tokens, batch_indices, mentions, gold_starts, gold_ends, clusters = process_ecb_plus(args['data_path'], args['mention_type'])
    #     tokenizer = AutoTokenizer.from_pretrained("SpanBERT/spanbert-base-cased", use_fast=True)
    #     encodings = tokenizer(all_sentences, return_offsets_mapping=True, is_split_into_words=True, truncation=True, padding=True)
    #     encoded_tokens = fix_tokens_with_offsets(tokens, encodings.offset_mapping, batch_indices)
    #     encoded_sentence_map = create_unmasked_sentence_map(encodings.offset_mapping, batch_indices)
    #     encoded_tokens, gold_starts, gold_ends, mentions = process_gold_mentions(encoded_tokens, gold_starts, gold_ends, mentions, encodings.attention_mask, batch_indices)
    #     cluster_ids = get_cluster_ids(mentions, clusters)

    #     dataset = ECBDataset(
    #         encodings=encodings, 
    #         batch_indices=batch_indices,
    #         sentence_map=encoded_sentence_map,