Example #1
def main(): 

    # Load a configuration
    args = configs.arg_parse()
    fix_seed(args.seed)

    # GPU or CPU
    if args.gpu:
        print("CUDA")
    else:
        print("Using CPU")

    # Load dataset
    data = prepare_data(args.dataset, args.train_ratio, args.input_dim, args.seed)
    
    # Load model 
    model_path = 'models/GCN_model_{}.pth'.format(args.dataset)
    model = torch.load(model_path)

    # Evaluate GraphSVX 
    if args.dataset == 'Mutagenicity':
        data = selected_data(data, args.dataset)
        eval_Mutagenicity(data, model, args)
    elif args.dataset == 'syn6': 
        eval_syn6(data, model, args)
    else: 
        eval_syn(data, model, args)
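A portability note on the `torch.load` call above (an editorial addition, not from the original repo): a checkpoint pickled on a GPU machine will fail to load where CUDA is unavailable unless a `map_location` is given. A minimal hedged sketch, with a hypothetical path following the snippet's naming scheme:

import torch

# Sketch: load the checkpoint onto whichever device is actually available,
# instead of the device it was saved on. The path below is illustrative.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model_path = 'models/GCN_model_syn6.pth'
model = torch.load(model_path, map_location=device)
model.eval()  # evaluation-only usage, as in the example above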
Example #2
def main():
    hparams = AmHparams()
    parser = hparams.parser
    am_hp = parser.parse_args()

    rate = 1
    out_path = Const.NoiseOutPath
    delete_files(out_path)
    train_data = prepare_data('train', am_hp, shuffle=True, length=None)
    pathlist = train_data.path_lst
    pylist = train_data.pny_lst
    hzlist = train_data.han_lst
    length = len(pathlist)
    rand_list = random.sample(range(length), int(rate * length))

    pre_list = []
    for i in rand_list:
        path = pathlist[i]
        pre_list.append(os.path.join(Const.SpeechDataPath, path))
    _, filename_list = add_noise(pre_list,
                                 out_path=Const.NoiseOutPath,
                                 keep_bits=False)

    data = ''
    with open(Const.NoiseDataTxT, 'w') as f:
        for i in range(len(rand_list)):
            pinyin = pylist[rand_list[i]]
            hanzi = hzlist[rand_list[i]]
            data += filename_list[i] + '\t' + pinyin + '\t' + hanzi + '\n'
        f.write(data[:-1])
    print('--------------- Noise data generation complete ------------')
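For reference, a minimal sketch (not part of the original project) of reading the manifest back, assuming the `filename\tpinyin\thanzi` layout written above and the same `Const.NoiseDataTxT` path from the snippet:

# Sketch: iterate over the tab-separated noise manifest produced above.
with open(Const.NoiseDataTxT, encoding='utf-8') as f:
    for line in f:
        filename, pinyin, hanzi = line.rstrip('\n').split('\t')
        print(filename, pinyin, hanzi)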
Example #3
def main():
    hparams = TransformerHparams()
    parser = hparams.parser
    hp = parser.parse_args()
    # Prepare the data
    test_data = prepare_data('test', hp, shuffle=True, length=None)
    test_data.feature_dim = hp.feature_dim
    transformer_test(hp, test_data)
Example #4
def main():
    hparams = TransformerHparams()
    parser = hparams.parser
    hp = parser.parse_args()

    # Prepare the data
    train_data = prepare_data('train', hp, shuffle=True, length=None)
    transformer_train(hp, train_data)
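Examples #2 through #4 share one idiom: a hyper-parameter object exposes an `argparse` parser, and the parsed namespace is handed straight to the data-preparation and training routines. A hypothetical minimal version of such a class (the real `TransformerHparams` lives in the project and surely defines many more options):

import argparse

class HparamsSketch:
    """Hypothetical stand-in for the project's *Hparams classes."""
    def __init__(self):
        self.parser = argparse.ArgumentParser()
        self.parser.add_argument('--feature_dim', type=int, default=200)
        self.parser.add_argument('--lr', type=float, default=1e-4)

hp = HparamsSketch().parser.parse_args([])  # [] -> use the defaults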
Example #5
def main():

    args = configs.arg_parse()
    fix_seed(args.seed)

    # Load the dataset
    data = prepare_data(args.dataset, args.train_ratio, args.input_dim,
                        args.seed)

    # Define and train the model
    if args.dataset in ['Cora', 'PubMed']:
        # Retrieve the model and training hyperparameters depending on the data/model given as input
        hyperparam = ''.join(['hparams_', args.dataset, '_', args.model])
        param = ''.join(['params_', args.dataset, '_', args.model])
        model = eval(args.model)(input_dim=data.num_features,
                                 output_dim=data.num_classes,
                                 **eval(hyperparam))
        train_and_val(model, data, **eval(param))
        _, test_acc = evaluate(data, model, data.test_mask)
        print('Test accuracy is {:.4f}'.format(test_acc))

    elif args.dataset in ['syn6', 'Mutagenicity']:
        input_dims = data.x.shape[-1]
        model = GcnEncoderGraph(input_dims,
                                args.hidden_dim,
                                args.output_dim,
                                data.num_classes,
                                args.num_gc_layers,
                                bn=args.bn,
                                dropout=args.dropout,
                                args=args)
        train_gc(data, model, args)
        _, test_acc = evaluate(data, model, data.test_mask)
        print('Test accuracy is {:.4f}'.format(test_acc))

    else:
        # For pytorch geometric model
        #model = GCNNet(args.input_dim, args.hidden_dim,
        #       data.num_classes, args.num_gc_layers, args=args)
        input_dims = data.x.shape[-1]
        model = GcnEncoderNode(data.num_features,
                               args.hidden_dim,
                               args.output_dim,
                               data.num_classes,
                               args.num_gc_layers,
                               bn=args.bn,
                               dropout=args.dropout,
                               args=args)
        train_syn(data, model, args)
        _, test_acc = evaluate(data, model, data.test_mask)
        print('Test accuracy is {:.4f}'.format(test_acc))

    # Save model
    model_path = 'models/{}_model_{}.pth'.format(args.model, args.dataset)
    if not os.path.exists(model_path) or args.save:
        torch.save(model, model_path)
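The example pickles the entire model object, which ties the checkpoint to the exact class definition. A common alternative, shown here as a generic sketch with a stand-in `nn.Linear` rather than the repo's GNNs, is to save only the state dict and rebuild the architecture before loading:

import torch
import torch.nn as nn

net = nn.Linear(4, 2)                        # stand-in for the trained GNN
torch.save(net.state_dict(), 'demo.pth')     # weights only, no pickled class
net2 = nn.Linear(4, 2)                       # rebuild the same architecture
net2.load_state_dict(torch.load('demo.pth'))
net2.eval()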
Example #6
def main():

    args = configs.arg_parse()
    fix_seed(args.seed)

    # Load the dataset
    data = prepare_data(args.dataset, args.train_ratio,
                        args.input_dim, args.seed)

    # Load the model
    model_path = 'models/{}_model_{}.pth'.format(args.model, args.dataset)
    model = torch.load(model_path)
    
    # Evaluate the model 
    if args.dataset in ['Cora', 'PubMed']:
        _, test_acc = evaluate(data, model, data.test_mask)
    else: 
        test_acc = test(data, model, data.test_mask)
    print('Test accuracy is {:.4f}'.format(test_acc))

    # Explain it with GraphSVX
    explainer = GraphSVX(data, model, args.gpu)

    # Distinguish graph classification from node classification
    if args.dataset in ['Mutagenicity', 'syn6']:
        explanations = explainer.explain_graphs(args.indexes,
                                         args.hops,
                                         args.num_samples,
                                         args.info,
                                         args.multiclass,
                                         args.fullempty,
                                         args.S,
                                         'graph_classification',
                                         args.feat,
                                         args.coal,
                                         args.g,
                                         args.regu,
                                         True)
    else: 
        explanations = explainer.explain(args.indexes,
                                        args.hops,
                                        args.num_samples,
                                        args.info,
                                        args.multiclass,
                                        args.fullempty,
                                        args.S,
                                        args.hv,
                                        args.feat,
                                        args.coal,
                                        args.g,
                                        args.regu,
                                        True)

    print('Sum explanations: ', [np.sum(explanation) for explanation in explanations])
    print('Base value: ', explainer.base_values)
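The two prints above hint at the SHAP-style efficiency property: the base value plus the sum of the explanation coefficients should approximately reconstruct the model's output for the explained class. A toy numpy check with made-up values (the real numbers come from `explainer.base_values` and the returned explanations):

import numpy as np

base_value = 0.21                        # hypothetical base value
coefs = np.array([0.30, -0.05, 0.12])    # hypothetical explanation weights
pred = 0.58                              # hypothetical model output
assert np.isclose(base_value + coefs.sum(), pred, atol=1e-2)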
Example #7
def recognition(mode='dfcnn'):

    file = '../wav_file/input1.wav'

    # Recognize live input
    receive_wav(file)
    if mode == 'dfcnn':
        # 1. Acoustic model -----------------------------------
        hparams = AmHparams()
        parser = hparams.parser
        hp = parser.parse_args()
        am_model = CNNCTCModel(hp)
        print('loading acoustic model...')
        select_model_step = 'model_04-14.91'
        am_model.load_model(select_model_step)

        # 2. Language model -----------------------------------
        hparams = LmHparams()
        parser = hparams.parser
        hp = parser.parse_args()
        hp.is_training = False
        print('loading language model...')
        lm_model = Language_Model(hp)
        gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.1)
        sess = tf.Session(graph=lm_model.graph,
                          config=tf.ConfigProto(gpu_options=gpu_options))
        with lm_model.graph.as_default():
            saver = tf.train.Saver()
        with sess.as_default():
            latest = tf.train.latest_checkpoint(Const.LmModelFolder)
            saver.restore(sess, latest)
        while True:
            dfcnn_speech(sess, am_model, lm_model, file)

    if mode == 'transformer':
        hparams = TransformerHparams()
        parser = hparams.parser
        hp = parser.parse_args()
        hp.is_training = False
        train_data = prepare_data('train', hp, shuffle=True, length=None)

        model = Transformer(hp)
        with model.graph.as_default():
            saver = tf.train.Saver()
        with tf.Session(graph=model.graph) as sess:
            latest = tf.train.latest_checkpoint(Const.TransformerFolder)
            saver.restore(sess, latest)
            # Keep the loop inside the with-block: the session is closed on
            # exit, so using it afterwards would raise an error.
            while True:
                transformer_speech(sess, model, train_data, file)
Example #8
def train_model(is_streamlit=False):
    # Trigger preprocessing of the data
    print('Preparing data...')
    if is_streamlit:
        st.write('Preparing data...')
    X_full, y_full, X_train, y_train, X_test, y_test = prepare_data()

    print('Preparing data... OK')
    # Trigger training
    print('Training data...')
    if is_streamlit:
        st.write('Training data...')
    response = train_breast_cancer(X_full, y_full, X_train, y_train, X_test,
                                   y_test, is_streamlit)
    print('Training data... OK')

    # Report success
    print(response)
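Given the call to `train_breast_cancer`, this `prepare_data` plausibly wraps scikit-learn's breast-cancer dataset. A minimal sketch under that assumption (the six return values mirror the unpacking above; the 80/20 split is a guess):

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

def prepare_data_sketch():
    # Assumption: mirrors the six values unpacked in the example above.
    X_full, y_full = load_breast_cancer(return_X_y=True)
    X_train, X_test, y_train, y_test = train_test_split(
        X_full, y_full, test_size=0.2, random_state=42)
    return X_full, y_full, X_train, y_train, X_test, y_test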
Example #9
def load_data(self):
    ''' Load data for training/validation '''
    self.tr_set, self.dv_set, _, self.audio_dim, msg = \
        prepare_data(self.paras.njobs, self.paras.dev_njobs, self.paras.gpu,
                     self.paras.pin_memory, **self.config['data'])
    self.verbose(msg)
Example #10
parser.add_argument("--model",
                    type=str,
                    default='GAT',
                    help="Name of the GNN: GCN or GAT")
parser.add_argument(
    "--dataset",
    type=str,
    default='PubMed',
    help="Name of the dataset among Cora, PubMed, Amazon, PPI, Reddit")
parser.add_argument("--seed", type=int, default=10)
parser.add_argument("--save",
                    type=str,
                    default=False,
                    help="True to save the trained model obtained")
args = parser.parse_args()

# Load the dataset
data = prepare_data(args.dataset, args.seed)

# Train the model - specific case for PPI dataset
if args.dataset == "PPI":
    model = main_ppi(type=args.model)
    # test_ppi shows how to compute predictions (run model(); positive values => predict this class)

else:
    # Retrieve the model and training hyperparameters depending on the data/model given as input
    hyperparam = ''.join(['hparams_', args.dataset, '_', args.model])
    param = ''.join(['params_', args.dataset, '_', args.model])

    # Define the model
    if args.model == 'GCN':
        model = GCN(input_dim=data.x.size(1),
                    output_dim=data.num_classes,
Example #11
        return f1, precision, recall, dict_pt_verbs
    else:
        return f1

def main(name_file='all_f1', train_dir='all', test_dir='test', dir_files='data/disambiguation/', dir_results='results_2/', max_length=120, cuda_id=0, cuda=True, n_epochs=9, seed=0, lr=0.0001):
    
    dir_train = os.path.join(dir_files, train_dir)
    dir_test = os.path.join(dir_files, test_dir)
    dir_results = os.path.join(dir_results, train_dir, name_file)
    os.makedirs(dir_results, exist_ok=True)
    
    input_lang, output_lang, pairs_train, pairs_test, senses_per_sentence = prepare_data(name_file, 'verbs_selected_lemma', max_length=max_length, dir_train=dir_train, dir_test=dir_test)
    selected_synsets = np.load(os.path.join(dir_files, 'selected_synsets.npy'))

    reader = Seq2SeqDatasetReader(
        source_tokenizer=WordTokenizer(),
        target_tokenizer=WordTokenizer(),
        source_token_indexers={'tokens': SingleIdTokenIndexer()},
        target_token_indexers={'tokens': SingleIdTokenIndexer(namespace='target_tokens')})
    train_dataset = reader.read(os.path.join(dir_train, name_file + '.tsv'))
    validation_dataset = reader.read(os.path.join(dir_test, 'verbs_selected_lemma.tsv'))

    vocab = Vocabulary.from_instances(train_dataset + validation_dataset,
Example #12
def filter_useless_nodes_multiclass(args_dataset,
                                    args_model,
                                    args_explainers,
                                    args_hops,
                                    args_num_samples,
                                    args_test_samples,
                                    args_prop_noise_nodes,
                                    args_connectedness,
                                    node_indices,
                                    args_K,
                                    info,
                                    args_hv,
                                    args_feat,
                                    args_coal,
                                    args_g,
                                    args_multiclass,
                                    args_regu,
                                    args_gpu,
                                    args_fullempty,
                                    args_S, 
                                    seed):
    """ Add noisy neighbours to dataset and check how many are included in explanations
    The fewest, the better the explainer.

    Args:
        Arguments defined in argument parser of script_eval.py
    
    """

    # Define dataset
    data = prepare_data(args_dataset, seed=10)
    args_num_noise_nodes = int(args_prop_noise_nodes * data.x.size(0))
    args_c = eval('EVAL1_' + data.name)['args_c']
    args_p = eval('EVAL1_' + data.name)['args_p']
    args_binary = eval('EVAL1_' + data.name)['args_binary']

    # Select a random subset of nodes to eval the explainer on.
    if not node_indices:
        node_indices = extract_test_nodes(data, args_test_samples, seed)

    # Add noisy neighbours to the graph, with random features
    data = add_noise_neighbours(data, args_num_noise_nodes, node_indices,
                                binary=args_binary, p=args_p, connectedness=args_connectedness)

    # Define training parameters depending on (model-dataset) couple
    hyperparam = ''.join(['hparams_', args_dataset, '_', args_model])
    param = ''.join(['params_', args_dataset, '_', args_model])

    # Define the model
    if args_model == 'GCN':
        model = GCN(input_dim=data.x.size(
            1), output_dim=data.num_classes, **eval(hyperparam))
    else:
        model = GAT(input_dim=data.x.size(
            1), output_dim=data.num_classes, **eval(hyperparam))

    # Re-train the model on dataset with noisy features
    train_and_val(model, data, **eval(param))

    model.eval()
    with torch.no_grad():
        log_logits = model(x=data.x, edge_index=data.edge_index)  # [2708, 7]
    test_acc = accuracy(log_logits[data.test_mask], data.y[data.test_mask])
    print('Test accuracy is {:.4f}'.format(test_acc))
    del log_logits

    # Study attention weights of noisy nodes in GAT model - compare attention with explanations
    if isinstance(model, GAT):
        study_attention_weights(data, model, args_test_samples)
    
    # Adaptable K - top k explanations we look at for each node
    # Depends on number of existing features/neighbours considered for GraphSVX
    # if 'GraphSVX' in args_explainers:
    # 	K = []
    # else:
    # 	K = [5]*len(node_indices)

    # Do for several explainers
    for c, explainer_name in enumerate(args_explainers):
        
        print('EXPLAINER: ', explainer_name)
        # Define the explainer
        explainer = eval(explainer_name)(data, model, args_gpu)

        # Loop over each test sample and count how many noisy nodes appear among
        # the K most influential features in our explanations
        # 1 el per test sample - count number of noisy nodes in explanations
        total_num_noise_neis = []
        # 1 el per test sample - count number of noisy nodes in explanations for 1 class
        pred_class_num_noise_neis = []
        # 1 el per test sample - count number of noisy nodes in subgraph
        total_num_noisy_nei = []
        total_neigbours = []  # 1 el per test sample - number of neighbours of v in subgraph
        M = []  # 1 el per test sample - number of non zero features
        for node_idx in tqdm(node_indices, desc='explain node', leave=False):

            # Look only at coefficients for nodes (not node features)
            if explainer_name == 'Greedy':
                coefs = explainer.explain_nei(node_index=node_idx,
                                              hops=args_hops,
                                              num_samples=args_num_samples,
                                              info=False,
                                              multiclass=True)

            elif explainer_name == 'GNNExplainer':
                _ = explainer.explain(node_index=node_idx,
                                      hops=args_hops,
                                      num_samples=args_num_samples,
                                      info=False,
                                      multiclass=True)
                coefs = explainer.coefs

            else:
                # Explanations via GraphSVX
                coefs = explainer.explain([node_idx],
                                          args_hops,
                                          args_num_samples,
                                          info,
                                          args_multiclass,
                                          args_fullempty,
                                          args_S,
                                          args_hv,
                                          args_feat,
                                          args_coal,
                                          args_g,
                                          args_regu)
                coefs = coefs[0].T[explainer.F:]
            
            # if explainer.F > 50:
            # 	K.append(10)
            # else:
            # 	K.append(int(explainer.F * args_K))

            # Check how many non zero features
            M.append(explainer.M)

            # Number of noisy nodes in the subgraph of node_idx
            num_noisy_nodes = len(
                [n_idx for n_idx in explainer.neighbours if n_idx >= data.x.size(0)-args_num_noise_nodes])

            # Number of neighbours in the subgraph
            total_neigbours.append(len(explainer.neighbours))

            # Multilabel classification - consider all classes instead of focusing on the
            # class that is predicted by our model
            num_noise_neis = []  # one element for each class of a test sample
            true_conf, predicted_class = model(x=data.x, edge_index=data.edge_index).exp()[
                node_idx].max(dim=0)

            for i in range(data.num_classes):

                # Store indexes of K most important features, for each class
                nei_indices = np.abs(coefs[:, i]).argsort()[-args_K:].tolist()

                # Number of noisy features that appear in explanations - use index to spot them
                num_noise_nei = sum(
                    idx >= (explainer.neighbours.shape[0] - num_noisy_nodes) for idx in nei_indices)
                num_noise_neis.append(num_noise_nei)

                if i == predicted_class:
                    #nei_indices = coefs[:,i].argsort()[-args_K:].tolist()
                    #num_noise_nei = sum(idx >= (explainer.neighbours.shape[0] - num_noisy_nodes) for idx in nei_indices)
                    pred_class_num_noise_neis.append(num_noise_nei)

            # Return this number => number of times noisy neighbours are provided as explanations
            total_num_noise_neis.append(sum(num_noise_neis))
            # Return number of noisy nodes adjacent to node of interest
            total_num_noisy_nei.append(num_noisy_nodes)

        if info:
            print('Noisy neighbours included in explanations: ',
                  total_num_noise_neis)

            print('There are {} noise neighbours found in the explanations of {} test samples, an average of {} per sample'
                  .format(sum(total_num_noise_neis), args_test_samples, sum(total_num_noise_neis)/args_test_samples))

            print(np.sum(pred_class_num_noise_neis) /
                  args_test_samples, 'for the predicted class only')

            print('Proportion of explanations showing noisy neighbours: {:.2f}%'.format(
                100 * sum(total_num_noise_neis) / (args_K * args_test_samples * data.num_classes)))

            perc = 100 * sum(total_num_noise_neis) / (args_test_samples *
                                                      args_num_noise_nodes * data.num_classes)
            perc2 = 100 * ((args_K * args_test_samples * data.num_classes) -
                           sum(total_num_noise_neis)) / (np.sum(M) - sum(total_num_noisy_nei))
            print('Proportion of noisy neighbours found in explanations vs normal features: {:.2f}% vs {:.2f}%'.format(
                perc, perc2))

            print('Proportion of nodes in subgraph that are noisy: {:.2f}%'.format(
                100 * sum(total_num_noisy_nei) / sum(total_neigbours)))

            print('Proportion of noisy neighbours in subgraph found in explanations: {:.2f}%'.format(
                100 * sum(total_num_noise_neis) / (sum(total_num_noisy_nei) * data.num_classes)))

        # Plot of kernel density estimates of number of noisy features included in explanation
        # Do for all benchmarks (with diff colors) and plt.show() to get on the same graph
        total_num_noise_neis = [item/data.num_classes for item in total_num_noise_neis]
        plot_dist(total_num_noise_neis,
                    label=explainer_name, color=COLOURS[c])
        # else:  # consider only predicted class
        # 	plot_dist(pred_class_num_noise_neis,
        # 			  label=explainer_name, color=COLOURS[c])

    # Random explainer - plot estimated kernel density
    total_num_noise_neis = noise_nodes_for_random(
        data, model, args_K, args_num_noise_nodes, node_indices)
    
    total_num_noise_neis= [item/data.num_classes for item in total_num_noise_neis]
    plot_dist(total_num_noise_neis, label='Random',
              color='y')

    plt.savefig('results/eval1_node_{}'.format(data.name))
    #plt.show()

    return total_num_noise_neis
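The counting logic above hinges on one numpy idiom: the last `args_K` positions of `np.abs(coefs).argsort()` are the indices of the K largest-magnitude coefficients, and any index that falls in the noisy tail of the neighbour list counts as a noisy explanation. A standalone illustration with made-up numbers:

import numpy as np

coefs = np.array([0.02, -0.40, 0.10, 0.35, -0.01])   # 5 neighbours, last 2 noisy
K = 2
top_k = np.abs(coefs).argsort()[-K:].tolist()        # -> [3, 1]
num_noisy = sum(idx >= len(coefs) - 2 for idx in top_k)
print(top_k, num_noisy)                              # [3, 1] 1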
Example #13
def filter_useless_nodes(args_dataset,
                         args_model,
                         args_explainers,
                         args_hops,
                         args_num_samples,
                         args_test_samples,
                         args_K,
                         args_prop_noise_nodes,
                         args_connectedness,
                         node_indices,
                         info,
                         args_hv,
                         args_feat,
                         args_coal,
                         args_g,
                         args_multiclass,
                         args_regu,
                         args_gpu,
                         args_fullempty,
                         args_S,
                         seed):
    """ Add noisy neighbours to dataset and check how many are included in explanations
    The fewest, the better the explainer.

    Args:
        Arguments defined in argument parser of script_eval.py
    
    """

    # Define dataset
    data = prepare_data(args_dataset, seed=seed)

    # Select a random subset of nodes to eval the explainer on.
    if not node_indices:
        node_indices = extract_test_nodes(data, args_test_samples, seed)
    
    # Define number of noisy nodes according to dataset size
    args_num_noise_nodes = int(args_prop_noise_nodes * data.x.size(0))
    args_c = eval('EVAL1_' + data.name)['args_c']
    args_p = eval('EVAL1_' + data.name)['args_p']
    args_binary = eval('EVAL1_' + data.name)['args_binary']

    # Add noisy neighbours to the graph, with random features
    data = add_noise_neighbours(data, args_num_noise_nodes, node_indices,
                                binary=args_binary, p=args_p, 
                                connectedness=args_connectedness, c=args_c)

    # Define training parameters depending on (model-dataset) couple
    hyperparam = ''.join(['hparams_', args_dataset, '_', args_model])
    param = ''.join(['params_', args_dataset, '_', args_model])

    # Define the model
    if args_model == 'GCN':
        model = GCN(input_dim=data.x.size(
            1), output_dim=data.num_classes, **eval(hyperparam))
    else:
        model = GAT(input_dim=data.x.size(
            1), output_dim=data.num_classes, **eval(hyperparam))

    # Re-train the model on dataset with noisy features
    train_and_val(model, data, **eval(param))
    
    # Evaluate model
    model.eval()
    with torch.no_grad():
        log_logits = model(x=data.x, edge_index=data.edge_index)  # [2708, 7]
    test_acc = accuracy(log_logits[data.test_mask], data.y[data.test_mask])
    print('Test accuracy is {:.4f}'.format(test_acc))

    # Derive predicted class for each test node
    with torch.no_grad():
        true_confs, predicted_classes = log_logits.exp()[node_indices].max(dim=1)
    del log_logits

    if args_regu == 1:
        args_regu = 0

    # Study attention weights of noisy nodes in GAT model - compare attention with explanations
    if isinstance(model, GAT):
        study_attention_weights(data, model, args_test_samples)

    # Do for several explainers
    for c, explainer_name in enumerate(args_explainers):

        print('EXPLAINER: ', explainer_name)	
        # Define the explainer
        explainer = eval(explainer_name)(data, model, args_gpu)

        # Loop over each test sample and count how many noisy nodes appear among
        # the K most influential features in our explanations
        # count number of noisy nodes in explanations
        pred_class_num_noise_neis = []
        # count number of noisy nodes in subgraph
        total_num_noisy_nei = []
        # Number of neighbours of v in subgraph
        total_neigbours = []
        # Stores number of most important neighbours we look at, for each node 
        K = []
        # To retrieve the predicted class
        j = 0
        for node_idx in tqdm(node_indices, desc='explain node', leave=False):

            # Look only at coefficients for nodes (not node features)
            if explainer_name == 'Greedy':
                coefs = explainer.explain_nei(node_idx,
                                              args_hops,
                                              args_num_samples)

            elif explainer_name == 'GNNExplainer':
                _ = explainer.explain(node_idx,
                                      args_hops,
                                      args_num_samples)
                coefs = explainer.coefs

            else:
                # Explanations via GraphSVX
                coefs = explainer.explain([node_idx],
                                          args_hops,
                                          args_num_samples,
                                          info,
                                          args_multiclass,
                                          args_fullempty,
                                          args_S,
                                          args_hv,
                                          args_feat,
                                          args_coal,
                                          args_g,
                                          args_regu)
                coefs = coefs[0].T[explainer.F:]

            # Number of noisy nodes in the subgraph of node_idx
            num_noisy_nodes = len(
                [n_idx for n_idx in explainer.neighbours if n_idx >= data.x.size(0)-args_num_noise_nodes])

            # Number of neighbours in the subgraph
            total_neigbours.append(len(explainer.neighbours))

            # Adaptable K - vary according to number of nodes in the subgraph
            if len(explainer.neighbours) > 100:
                K.append(int(args_K * 100))
            else:
                K.append( max(1, int(args_K * len(explainer.neighbours))) )

            # Store indexes of K most important features, for each class
            nei_indices = coefs.argsort()[-K[j]:].tolist()

            # Number of noisy features that appear in explanations - use index to spot them
            noise_nei = [idx for idx in nei_indices if idx > (explainer.neighbours.shape[0] - num_noisy_nodes)]

            # If the node importance of the top-K neighbours is insignificant, discard it.
            # Possible because we have an informative importance measure, unlike other explainers.
            if explainer_name == 'GraphSVX':
                explainable_part = true_confs[c] - \
                                explainer.base_values[c]
                noise_nei = [idx for idx in noise_nei if np.abs(coefs[idx]) > 0.05*np.abs(explainable_part)]
            
            num_noise_nei = len(noise_nei)
            pred_class_num_noise_neis.append(num_noise_nei)

            # Return number of noisy nodes adjacent to node of interest
            total_num_noisy_nei.append(num_noisy_nodes)

            j += 1

        print('Noisy neighbours included in explanations: ',
              pred_class_num_noise_neis)

        print('There are {} noise neighbours found in the explanations of {} test samples, an average of {} per sample'
              .format(sum(pred_class_num_noise_neis), args_test_samples, sum(pred_class_num_noise_neis) / args_test_samples))

        print('Proportion of explanations showing noisy neighbours: {:.2f}%'.format(
            100 * sum(pred_class_num_noise_neis) / sum(K)))

        perc = 100 * sum(pred_class_num_noise_neis) / (sum(total_num_noisy_nei))
        perc2 = 100 * (sum(K) - sum(pred_class_num_noise_neis)) \
        / (sum(total_neigbours) - sum(total_num_noisy_nei))
        print('Proportion of noisy neighbours found in explanations vs normal neighbours (in subgraph): {:.2f}% vs {:.2f}%'.format(
            perc, perc2))

        print('Proportion of nodes in subgraph that are noisy: {:.2f}%'.format(
            100 * sum(total_num_noisy_nei) / sum(total_neigbours)))

        print('Proportion of noisy neighbours found in explanations (entire graph): {:.2f}%'.format(
            100 * sum(pred_class_num_noise_neis) / (args_test_samples * args_num_noise_nodes)))
        
        print('------------------------------------')

        # Plot of kernel density estimates of number of noisy features included in explanation
        # Do for all benchmarks (with diff colors) and plt.show() to get on the same graph
        plot_dist(pred_class_num_noise_neis,
                    label=explainer_name, color=COLOURS[c])

    # Random explainer - plot estimated kernel density
    total_num_noise_neis = noise_nodes_for_random(
        data, model, K, node_indices, total_num_noisy_nei, total_neigbours)
    plot_dist(total_num_noise_neis, label='Random',
              color='y')
    
    # Store graph - with key params and time
    now = datetime.now()
    current_time = now.strftime("%H:%M:%S")
    plt.savefig('results/eval1_node_{}_{}_{}_{}_{}.pdf'.format(data.name,
                                                           args_coal, 
                                                           args_feat, 
                                                           args_hv, 
                                                           current_time))
    plt.close()
    #plt.show()

    return total_num_noise_neis
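Worth isolating from the loop above: the adaptive K caps the subgraph size at 100 before applying the ratio `args_K`, so densely connected nodes do not dominate the metric while tiny subgraphs still contribute at least one neighbour. A sketch of the rule on its own (the default ratio is an assumption):

def adaptive_k(num_neighbours, args_K=0.1):
    # Mirrors the rule used above: cap at 100 neighbours, keep at least 1.
    if num_neighbours > 100:
        return int(args_K * 100)
    return max(1, int(args_K * num_neighbours))

print(adaptive_k(250))  # 10
print(adaptive_k(30))   # 3
print(adaptive_k(4))    # 1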
Example #14
node_indices = [
    2332, 2101, 1769, 2546, 2595, 1913, 1804, 2419, 2530, 1872, 2629, 2272,
    1739, 2394, 1770, 2030, 2123, 2176, 1999, 2608
]
#node_indices= [2420,2455,1783,2165,2628,1822,2682,2261,1896,1880,2137,2237,2313,2218,1822,1719,1763,2263,2020,1988]
node_indices = [
    10, 18, 89, 178, 333, 356, 378, 456, 500, 2222, 1220, 1900, 1328, 189,
    1111, 124, 666, 684, 1556, 1881
]
node_indices = [
    1834, 2512, 2591, 2101, 1848, 1853, 2326, 1987, 2359, 2453, 2230, 2267,
    2399, 2150, 2400, 2546, 1825, 2529, 2559, 1883
]

# Define dataset - include noisy features
data = prepare_data(args_dataset, seed=10)
data, noise_feat = add_noise_features(data,
                                      num_noise=args_num_noise_feat,
                                      binary=args_binary,
                                      p=args_p)

# Define training parameters depending on (model-dataset) couple
hyperparam = ''.join(['hparams_', args_dataset, '_', args_model])
param = ''.join(['params_', args_dataset, '_', args_model])

# Define the model
if args_model == 'GCN':
    model = GCN(input_dim=data.x.size(1),
                output_dim=data.num_classes,
                **eval(hyperparam))
else:
Example #15
def filter_useless_nodes(args_model,
                         args_dataset,
                         args_hops,
                         args_num_samples,
                         args_test_samples,
                         args_K,
                         args_num_noise_nodes,
                         args_p,
                         args_binary,
                         args_connectedness,
                         node_indices=None,
                         info=True):
    """
	Arguments defined in argument parser in script_eval.py
	Add noisy features to dataset and check how many are included in explanations
	The fewest, the better the explainer.
	"""
    '''
	####### Input in script_eval file
	args_dataset = 'Cora'
	args_model = 'GAT'
	args_hops = 2
	args_num_samples = 100
	args_test_samples = 10
	args_num_noise_nodes = 20
	args_K= 5 # maybe def depending on M 
	args_p = 0.013
	args_connectedness = 'medium'
	args_binary=True
	'''
    #### Create function from here. Maybe create training fct first, to avoid retraining the model.

    # Define dataset
    data = prepare_data(args_dataset, seed=10)

    # Select random subset of nodes to eval the explainer on.
    if not node_indices:
        node_indices = extract_test_nodes(data, args_test_samples)

    # Include noisy neighbours
    data = add_noise_neighbors(data,
                               args_num_noise_nodes,
                               node_indices,
                               binary=args_binary,
                               p=args_p,
                               connectedness=args_connectedness)
    # data, noise_feat = add_noise_features(data, num_noise=args_num_noise_feat, binary=True)

    # Define training parameters depending on (model-dataset) couple
    hyperparam = ''.join(['hparams_', args_dataset, '_', args_model])
    param = ''.join(['params_', args_dataset, '_', args_model])

    # Define the model
    if args_model == 'GCN':
        model = GCN(input_dim=data.x.size(1),
                    output_dim=data.num_classes,
                    **eval(hyperparam))
    else:
        model = GAT(input_dim=data.x.size(1),
                    output_dim=data.num_classes,
                    **eval(hyperparam))

    # Re-train the model on dataset with noisy features
    train_and_val(model, data, **eval(param))

    # Study attention weights of noisy nodes - for 20 new nodes
    def study_attention_weights(data, model):
        """
		Studies the attention weights of the GAT model 
		"""
        _, alpha, alpha_bis = model(data.x, data.edge_index, att=True)

        edges, alpha1 = alpha[0][:, :-(data.x.size(0) - 1)], alpha[1][:-(
            data.x.size(0) - 1), :]  # remove self loops att
        alpha2 = alpha_bis[1][:-(data.x.size(0) - 1)]

        att1 = []
        att2 = []
        for i in range(
                data.x.size(0) - args_test_samples, (data.x.size(0) - 1)):
            ind = (edges == i).nonzero()
            for j in ind[:, 1]:
                att1.append(torch.mean(alpha1[j]))
                att2.append(alpha2[j][0])
        print('shape attention noisy', len(att2))

        # It looks like these noisy nodes are very important
        print('av attention',
              (torch.mean(alpha1) + torch.mean(alpha2)) / 2)  # 0.18
        (torch.mean(torch.stack(att1)) +
         torch.mean(torch.stack(att2))) / 2  # 0.32

        # In fact, noisy nodes are slightly below average in terms of attention received
        # Importance of interest: look only at imp. of noisy nei for test nodes
        print('attention 1 av. for noisy nodes: ',
              torch.mean(torch.stack(att1[0::2])))
        print('attention 2 av. for noisy nodes: ',
              torch.mean(torch.stack(att2[0::2])))

    # Study attention weights
    if isinstance(model, GAT):
        study_attention_weights(data, model)

    # Define explainer
    graphshap = GraphSHAP(data, model)

    # Loop over each test sample and count how many noisy features appear among
    # the K most influential features in our explanations
    total_num_noise_neis = []        # 1 el per test sample - number of noisy nodes in explanations
    pred_class_num_noise_neis = []   # 1 el per test sample - noisy nodes in explanations for 1 class
    total_num_noisy_nei = []         # 1 el per test sample - number of noisy nodes in subgraph
    total_neigbours = []             # 1 el per test sample - number of neighbours of v in subgraph
    M = []                           # 1 el per test sample - number of non-zero features
    for node_idx in tqdm(node_indices, desc='explain node', leave=False):

        # Explanations via GraphSHAP
        coefs = graphshap.explainer(node_index=node_idx,
                                    hops=args_hops,
                                    num_samples=args_num_samples,
                                    info=False)

        # Check how many non zero features
        M.append(graphshap.M)

        # Number of noisy nodes in the subgraph of node_idx
        num_noisy_nodes = len([
            n_idx for n_idx in graphshap.neighbors
            if n_idx >= data.x.size(0) - args_num_noise_nodes
        ])

        total_neigbours.append(len(graphshap.neighbors))

        # Multilabel classification - consider all classes instead of focusing on the
        # class that is predicted by our model
        num_noise_neis = []  # one element for each class of a test sample
        true_conf, predicted_class = model(
            x=data.x, edge_index=data.edge_index).exp()[node_idx].max(dim=0)

        for i in range(data.num_classes):

            # Store indexes of K most important features, for each class
            nei_indices = np.abs(coefs[:, i]).argsort()[-args_K:].tolist()

            # Number of noisy features that appear in explanations - use index to spot them
            num_noise_nei = sum(idx >= graphshap.M - num_noisy_nodes
                                for idx in nei_indices)
            num_noise_neis.append(num_noise_nei)

            if i == predicted_class:
                pred_class_num_noise_neis.append(num_noise_nei)

        # Return this number => number of times noisy neighbours are provided as explanations
        total_num_noise_neis.append(sum(num_noise_neis))
        # Return number of noisy nodes adjacent to node of interest
        total_num_noisy_nei.append(num_noisy_nodes)

    if info:
        print('Noisy neighbours included in explanations: ',
              total_num_noise_neis)

        print('There are {} noise neighbours found in the explanations of {} test samples, an average of {} per sample'
              .format(sum(total_num_noise_neis), args_test_samples, sum(total_num_noise_neis) / args_test_samples))

        print(
            np.sum(pred_class_num_noise_neis) / args_test_samples,
            'for the predicted class only')

        print('Proportion of explanations showing noisy neighbours: {:.2f}%'.
              format(100 * sum(total_num_noise_neis) /
                     (args_K * args_test_samples * data.num_classes)))

        perc = 100 * sum(total_num_noise_neis) / (
            args_test_samples * args_num_noise_nodes * data.num_classes)
        perc2 = 100 * (
            (args_K * args_test_samples * data.num_classes) -
            sum(total_num_noise_neis)) / (np.sum(M) - sum(total_num_noisy_nei))
        print(
            'Proportion of noisy neighbours found in explanations vs normal features: {:.2f}% vs {:.2f}%'
            .format(perc, perc2))

        print('Proportion of nodes in subgraph that are noisy: {:.2f}%'.format(
            100 * sum(total_num_noisy_nei) / sum(total_neigbours)))

        print('Proportion of noisy neighbours among features: {:.2f}%'.format(
            100 * sum(total_num_noisy_nei) / np.sum(M)))

    # Plot of kernel density estimates of number of noisy features included in explanation
    # Do for all benchmarks (with diff colors) and plt.show() to get on the same graph
    plot_dist(total_num_noise_neis, label='GraphSHAP', color='g')
    #plt.show()

    return total_num_noise_neis
Example #16
def filter_useless_features(args_dataset,
                            args_model,
                            args_explainers,
                            args_hops,
                            args_num_samples,
                            args_test_samples,
                            args_K,
                            args_prop_noise_feat,
                            node_indices,
                            info,
                            args_hv,
                            args_feat,
                            args_coal,
                            args_g,
                            args_multiclass,
                            args_regu,
                            args_gpu,
                            args_fullempty,
                            args_S, 
                            seed):
    """ Add noisy features to dataset and check how many are included in explanations
    The fewest, the better the explainer.

    Args:
        Arguments defined in argument parser of script_eval.py
    
    """

    # Define dataset 
    data = prepare_data(args_dataset, seed=seed)
    args_num_noise_feat = int(data.x.size(1) * args_prop_noise_feat)
    args_p = eval('EVAL1_' + data.name)['args_p']
    args_binary = eval('EVAL1_' + data.name)['args_binary']

    # Include noisy neighbours
    data, noise_feat = add_noise_features(
        data, num_noise=args_num_noise_feat, binary=args_binary, p=args_p)

    # Define training parameters depending on (model-dataset) couple
    hyperparam = ''.join(['hparams_', args_dataset, '_', args_model])
    param = ''.join(['params_', args_dataset, '_', args_model])

    # Define the model
    if args_model == 'GCN':
        model = GCN(input_dim=data.x.size(
            1), output_dim=data.num_classes, **eval(hyperparam))
    else:
        model = GAT(input_dim=data.x.size(
            1), output_dim=data.num_classes,  **eval(hyperparam))

    # Re-train the model on dataset with noisy features
    train_and_val(model, data, **eval(param))

    # Select random subset of nodes to eval the explainer on.
    if not node_indices:
        node_indices = extract_test_nodes(data, args_test_samples, seed)
    
    # Evaluate the model on test set
    model.eval()
    with torch.no_grad():
        log_logits = model(x=data.x, edge_index=data.edge_index)  
    test_acc = accuracy(log_logits[data.test_mask], data.y[data.test_mask])
    print('Test accuracy is {:.4f}'.format(test_acc))

    # Derive predicted class for each test sample
    with torch.no_grad():
        true_confs, predicted_classes = log_logits.exp()[node_indices].max(dim=1)
    del log_logits

    # Adaptable K - top k explanations we look at for each node
    # Depends on number of existing features considered for GraphSVX
    if 'GraphSVX' in args_explainers:
        K = []
    else:
        K = [10]*len(node_indices)
    #for node_idx in node_indices:
    #	K.append(int(data.x[node_idx].nonzero().shape[0] * args_K))

    if args_regu == 0:
        args_regu = 1

    # Loop on the different explainers selected
    for c, explainer_name in enumerate(args_explainers):
        
        # Define explainer
        explainer = eval(explainer_name)(data, model, args_gpu)
        print('EXPLAINER: ', explainer_name)

        # count noisy features found in explanations 
        pred_class_num_noise_feats = []
        # count number of noisy features considered
        total_num_noise_feat_considered = []
        # count number of features   
        F = []

        # Loop over each test sample and count how many noisy features appear among
        # the K most influential features in our explanations
        j = 0
        for node_idx in tqdm(node_indices, desc='explain node', leave=False):
            
            # Explanations via GraphSVX
            if explainer_name == 'GraphSVX':
                coefs = explainer.explain(
                                [node_idx],
                                args_hops,
                                args_num_samples,
                                info,
                                args_multiclass,
                                args_fullempty,
                                args_S,
                                args_hv,
                                args_feat,
                                args_coal,
                                args_g,
                                args_regu,
                                )
                # Look only at features coefficients 
                # Neighbours are irrelevant here
                coefs = coefs[0][:explainer.F]
                
                # Adaptable K
                if explainer.F > 100:
                    K.append(int(args_K * 100))
                else:
                    K.append( max(1, int(explainer.F * args_K)) )

                # Num_features_considered
                if args_feat == 'Null':
                    feat_idx = noise_feat[explainer.neighbours, :].mean(axis=0).nonzero()
                    num_noise_feat_considered = feat_idx.size()[0]

                # Consider all features (+ use expectation like below)
                elif args_feat == 'All':
                    num_noise_feat_considered = args_num_noise_feat

                # Consider only features whose aggregated value is different from expected one
                else:
                    # Stats dataset
                    var = noise_feat.std(axis=0)
                    mean = noise_feat.mean(axis=0)
                    # Feature intermediate rep
                    mean_subgraph = noise_feat[explainer.neighbours, :].mean(axis=0)
                    # Select relevant features only - (E-e,E+e)
                    mean_subgraph = torch.where(mean_subgraph > mean - 0.25*var, mean_subgraph,
                                        torch.ones_like(mean_subgraph)*100)
                    mean_subgraph = torch.where(mean_subgraph < mean + 0.25*var, mean_subgraph,
                                        torch.ones_like(mean_subgraph)*100)
                    feat_idx = (mean_subgraph == 100).nonzero()
                    num_noise_feat_considered = feat_idx.shape[0]
                    del mean, mean_subgraph, var
                
            else:
                coefs = explainer.explain(node_idx,
                                        args_hops,
                                        args_num_samples,
                                        info=False,
                                        multiclass=False
                                        )[:explainer.F]
                # All features are considered
                num_noise_feat_considered = args_num_noise_feat

            # Features considered 
            F.append(explainer.F)

            # Store indexes of K most important node features, for each class
            feat_indices = coefs.argsort()[-K[j]:].tolist()

            # Number of noisy features that appear in explanations - use index to spot them
            num_noise_feat = [idx for idx in feat_indices if idx > (explainer.F - num_noise_feat_considered)]

            # If the node importance of the top-K features is insignificant, discard it.
            # Possible because we have an informative importance measure, unlike other explainers.
            if explainer_name == 'GraphSVX':
                explainable_part = true_confs[c] - \
                                explainer.base_values[c]
                num_noise_feat = [idx for idx in num_noise_feat if np.abs(coefs[idx]) > 0.05*np.abs(explainable_part)]
            
            # Count number of noisy that appear in explanations
            num_noise_feat = len(num_noise_feat)
            pred_class_num_noise_feats.append(num_noise_feat)

            # Return number of noisy features considered in this test sample
            total_num_noise_feat_considered.append(num_noise_feat_considered)

            j+=1

        print('Noisy features included in explanations: ',
              sum(pred_class_num_noise_feats))
        print('For the predicted class, there are {} noise features found in the explanations of {} test samples, an average of {} per sample'
                        .format(sum(pred_class_num_noise_feats), args_test_samples, sum(pred_class_num_noise_feats)/args_test_samples))

        print(pred_class_num_noise_feats)

        if sum(F) != 0:
            perc = 100 * sum(total_num_noise_feat_considered) / sum(F)
            print(
                'Proportion of considered noisy features among features: {:.2f}%'.format(perc))
        if sum(K) != 0:
            perc = 100 * sum(pred_class_num_noise_feats) / sum(K)
            print('Proportion of explanations showing noisy features: {:.2f}%'.format(perc))

        if sum(total_num_noise_feat_considered) != 0:
            perc = 100 * sum(pred_class_num_noise_feats) / (sum(total_num_noise_feat_considered))
            perc2 = 100 * (sum(K) - sum(pred_class_num_noise_feats)) / (sum(F) - sum(total_num_noise_feat_considered)) 
            print('Proportion of noisy features found in explanations vs proportion of normal features (among considered ones): {:.2f}% vs {:.2f}%, over considered features only'.format(
                perc, perc2))

        print('------------------------------------')

        # Plot of kernel density estimates of number of noisy features included in explanation
        # Do for all benchmarks (with diff colors) and plt.show() to get on the same graph
        plot_dist(pred_class_num_noise_feats, 
                    label=explainer_name, color=COLOURS[c])

    # Random explainer - plot estimated kernel density
    total_num_noise_feats = noise_feats_for_random(
        data, model, K, args_num_noise_feat, node_indices)
    save_path = 'results/eval1_feat'
    plot_dist(total_num_noise_feats, label='Random', color='y')

    # Store graph - with key params and time
    now = datetime.now()
    current_time = now.strftime("%H:%M:%S")
    plt.savefig('results/eval1_feat_{}_{}_{}_{}_{}.pdf'.format(data.name,
                                                           args_coal, 
                                                           args_feat, 
                                                           args_hv, 
                                                           current_time))
    plt.close()
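`plot_dist` is a project helper; assuming it draws a kernel-density estimate of per-sample counts (which is what the comments above describe), a minimal equivalent with seaborn would look like:

import matplotlib.pyplot as plt
import seaborn as sns

def plot_dist_sketch(values, label, color):
    # Assumption: one KDE curve per explainer over the noisy-item counts.
    sns.kdeplot(values, label=label, color=color)
    plt.xlabel('noisy items in explanation')
    plt.legend()

plot_dist_sketch([0, 1, 1, 2, 3], label='GraphSVX', color='g')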
Example #17
def filter_useless_features(args_model,
                            args_dataset,
                            args_explainers,
                            args_hops,
                            args_num_samples,
                            args_test_samples,
                            args_K,
                            args_num_noise_feat,
                            args_p,
                            args_binary,
                            node_indices,
                            info=True):
    """
	Arguments defined in argument parser of script_eval.py
	Add noisy features to dataset and check how many are included in explanations
	The fewest, the better the explainer.
	"""
    '''
	####### Input in script_eval file
	args_dataset = 'Cora'
	args_model = 'GCN'
	args_explainers = ['GraphSHAP', 'Greedy']
	args_hops = 2
	args_num_samples = 100 # size shap dataset
	args_test_samples = 20 # number of test samples
	args_num_noise_feat= 25 # number of noisy features
	args_K= 5 # maybe def depending on M
	args_binary = True 
	args_p = 0.5
	info=True

	node_indices= [2420,2455,1783,2165,2628,1822,2682,2261,1896,1880,2137,2237,2313,2218,1822,1719,1763,2263,2020,1988]
	node_indices = [10, 18, 89, 178, 333, 356, 378, 456, 500, 2222, 1220, 1900, 1328, 189, 1111]
	node_indices = [1834,2512,2591,2101,1848,1853,2326,1987,2359,2453,2230,2267,2399, 2150,2400]
	'''

    #### Create function from here. Maybe create training fct first, to avoid retraining the model.

    # Define dataset - include noisy features
    data = prepare_data(args_dataset, seed=10)
    data, noise_feat = add_noise_features(data,
                                          num_noise=args_num_noise_feat,
                                          binary=args_binary,
                                          p=args_p)

    # Define training parameters depending on (model-dataset) couple
    hyperparam = ''.join(['hparams_', args_dataset, '_', args_model])
    param = ''.join(['params_', args_dataset, '_', args_model])

    # Define the model
    if args_model == 'GCN':
        model = GCN(input_dim=data.x.size(1),
                    output_dim=data.num_classes,
                    **eval(hyperparam))
    else:
        model = GAT(input_dim=data.x.size(1),
                    output_dim=data.num_classes,
                    **eval(hyperparam))

    # Re-train the model on dataset with noisy features
    train_and_val(model, data, **eval(param))

    # Select random subset of nodes to eval the explainer on.
    if not node_indices:
        node_indices = extract_test_nodes(data, args_test_samples)

    for explainer_name in args_explainers:

        # Define explainer
        explainer = eval(explainer_name)(data, model)

        # count noisy features found in explanations for each test sample (for each class)
        total_num_noise_feats = []
        # count noisy features found in explanations for each test sample for the class of interest
        pred_class_num_noise_feats = []
        # count number of noisy features considered for each test sample
        total_num_non_zero_noise_feat = []
        # count number of non-zero features for each test sample
        M = []

        # Loop over each test sample and count how many noisy features appear among
        # the K most influential features in our explanations
        for node_idx in tqdm(node_indices, desc='explain node', leave=False):

            # Explanations via GraphSHAP
            coefs = explainer.explainer(node_index=node_idx,
                                        hops=args_hops,
                                        num_samples=args_num_samples,
                                        info=False)

            # Check how many non zero features
            M.append(explainer.M)

            # Number of non zero noisy features
            num_non_zero_noise_feat = len(
                [val for val in noise_feat[node_idx] if val != 0])

            # Multilabel classification - consider all classes instead of focusing on the
            # class that is predicted by our model
            num_noise_feats = []
            true_conf, predicted_class = model(
                x=data.x,
                edge_index=data.edge_index).exp()[node_idx].max(dim=0)

            for i in range(data.num_classes):

                # Store indexes of K most important features, for each class
                feat_indices = np.abs(coefs[:, i]).argsort()[-args_K:].tolist()

                # Number of noisy features that appear in explanations - use index to spot them
                num_noise_feat = sum(idx < num_non_zero_noise_feat
                                     for idx in feat_indices)
                num_noise_feats.append(num_noise_feat)

                if i == predicted_class:
                    pred_class_num_noise_feats.append(num_noise_feat)

            # Total number of noisy features appearing in this node's explanations (all classes)
            total_num_noise_feats.append(sum(num_noise_feats))
            # Number of noisy features considered for this test sample
            total_num_non_zero_noise_feat.append(num_non_zero_noise_feat)

        if info:
            print('Noise features included in explanations: ',
                  total_num_noise_feats)
            print('There are {} noise features found in the explanations of {} test samples, '
                  'an average of {:.2f} per sample'.format(
                      sum(total_num_noise_feats), args_test_samples,
                      sum(total_num_noise_feats) / args_test_samples))

            # Average number of noisy features found in explanations for the predicted class
            print(np.sum(pred_class_num_noise_feats) / args_test_samples,
                  'for the predicted class only')

            perc = 100 * sum(total_num_non_zero_noise_feat) / np.sum(M)
            print('Overall proportion of considered noisy features: {:.2f}%'.format(perc))

            perc = 100 * sum(total_num_noise_feats) / (
                args_K * args_test_samples * data.num_classes)
            print('Percentage of explanations showing noisy features: {:.2f}%'.format(perc))

            if sum(total_num_non_zero_noise_feat) != 0:
                perc = 100 * sum(total_num_noise_feats) / (
                    sum(total_num_non_zero_noise_feat) * data.num_classes)
                perc2 = 100 * (args_K * args_test_samples * data.num_classes
                               - sum(total_num_noise_feats)) / (
                    data.num_classes * (sum(M) - sum(total_num_non_zero_noise_feat)))
                print('Proportion of noisy features found in explanations vs normal '
                      'features: {:.2f}% vs {:.2f}%, over considered features only'
                      .format(perc, perc2))
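                # Worked example with made-up numbers: K = 5, 2 test samples,
                # 4 classes, 8 noisy features selected in total, 10 noisy features
                # considered, 100 considered features overall. Then
                # perc  = 100 * 8 / (10 * 4)           = 20%  of noisy-feature slots
                #                                              reach a top-K explanation;
                # perc2 = 100 * (5*2*4 - 8) / (4 * 90) ~ 8.9% of normal features do.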

            print('------------------------------------')
        # Plot kernel density estimates of the number of noisy features included in
        # explanations; all explainers are drawn (in different colours) on the same figure
        plot_dist(total_num_noise_feats, label=explainer_name, color=COLOURS[c])

    plt.show()
    return sum(total_num_noise_feats)
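
# A minimal stand-in for the repository's plot_dist helper, assuming it overlays
# a kernel density estimate of the per-sample counts on the current matplotlib
# figure (signature inferred from the calls above; the body is an assumption,
# not the repository's implementation):
import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import gaussian_kde

def plot_dist(counts, label=None, color=None):
    """Plot a Gaussian KDE of `counts` on the current axes."""
    xs = np.linspace(min(counts), max(counts), 200)
    plt.plot(xs, gaussian_kde(counts)(xs), label=label, color=color)
    plt.xlabel('noisy features in top-K explanations')
    plt.ylabel('estimated density')
    plt.legend()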
Ejemplo n.º 18
0
def eval_shap(args_dataset,
              args_model,
              args_test_samples,
              args_hops,
              args_K,
              args_num_samples,
              node_indices,
              info,
              args_hv,
              args_feat,
              args_coal,
              args_g,
              args_multiclass,
              args_regu,
              args_gpu,
              args_fullempty,
              args_S, 
              seed):
    """
    Compares SHAP and GraphSVX on graph based datasets
    Check if they agree on features'contribution towards prediction for several test samples
    """

    # Define dataset (the fixed seed keeps the data split deterministic)
    data = prepare_data(args_dataset, seed=10)

    # Select a random subset of nodes to eval the explainer on.
    if not node_indices:
        node_indices = extract_test_nodes(data, args_test_samples, seed)

    # Define training parameters depending on (model-dataset) couple
    hyperparam = ''.join(['hparams_', args_dataset, '_', args_model])
    param = ''.join(['params_', args_dataset, '_', args_model])

    # Define the model
    if args_model == 'GCN':
        model = GCN(input_dim=data.x.size(1),
                    output_dim=data.num_classes,
                    **eval(hyperparam))
    else:
        model = GAT(input_dim=data.x.size(1),
                    output_dim=data.num_classes,
                    **eval(hyperparam))

    # Train the model on the dataset (no noisy features here)
    train_and_val(model, data, **eval(param))

    # Store metrics
    iou = []
    prop_contrib_diff = []

    # Iterate over test samples
    for node_idx in tqdm(node_indices, desc='explain node', leave=False):

        # Define the explainers we want to compare
        graphshap = GraphSVX(data, model, args_gpu)
        shap = SHAP(data, model, args_gpu)

        # Explanations via GraphSVX
        graphshap_coefs = graphshap.explain([node_idx],
                                            args_hops,
                                            args_num_samples,
                                            info,
                                            args_multiclass,
                                            args_fullempty,
                                            args_S,
                                            args_hv,
                                            args_feat,   # 'All'
                                            args_coal,   # 'Random' or 'SmarterSoftRegu'
                                            args_g,      # 'WLS'
                                            args_regu)   # 1
        # Keep the node-feature coefficients only (first F entries)
        graphshap_coefs = graphshap_coefs[0].T[:graphshap.F]

        shap_coefs = shap.explain(node_idx,
                                  args_hops,
                                  args_num_samples,
                                  info=False,
                                  multiclass=False
                                  )[:shap.F]

        # Predicted class and its confidence - node features are compared for this class only
        true_conf, predicted_class = model(
            x=data.x, edge_index=data.edge_index).exp()[node_idx].max(dim=0)

        # TODO: apply regularisation here

        # Difference in proportional contribution towards the prediction
        prop_contrib_diff.append(np.abs(
            graphshap_coefs.sum() / np.abs(graphshap_coefs).sum()
            - shap_coefs.sum() / np.abs(shap_coefs).sum()))
        # print('GraphSVX proportional contribution to pred: {:.2f}'.format(
        #     graphshap_coefs.sum() / np.abs(graphshap_coefs).sum()))
        # print('SHAP proportional contribution to pred: {:.2f}'.format(
        #     shap_coefs.sum() / np.abs(shap_coefs).sum()))

        # Overlap of the 10 most important features (intersection over union)
        graphshap_feat_indices = np.abs(graphshap_coefs).argsort()[-10:].tolist()
        shap_feat_indices = np.abs(shap_coefs).argsort()[-10:].tolist()
        iou.append(len(set(graphshap_feat_indices).intersection(set(shap_feat_indices)))
                   / len(set(graphshap_feat_indices).union(set(shap_feat_indices))))
        # print('IoU of important features: ', iou)

    print('Average IoU of important features:', np.mean(iou))
    print('Average difference in contributions towards prediction:',
          np.mean(prop_contrib_diff))
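
# Worked example of the two agreement metrics above, on made-up coefficient
# vectors (illustration only; the helper name is hypothetical):
def _agreement_demo():
    import numpy as np
    g = np.array([0.4, -0.1, 0.3, 0.0, 0.2])  # pretend GraphSVX coefficients
    s = np.array([0.5, -0.2, 0.1, 0.1, 0.3])  # pretend SHAP coefficients

    # Proportional contribution: signed sum over absolute sum, i.e. the share
    # of total attribution mass pushing towards the predicted class
    prop_g = g.sum() / np.abs(g).sum()   # 0.8 / 1.0 = 0.80
    prop_s = s.sum() / np.abs(s).sum()   # 0.8 / 1.2 ~ 0.67
    print(abs(prop_g - prop_s))          # ~ 0.13

    # IoU of the top-2 features by absolute weight
    top_g = set(np.abs(g).argsort()[-2:].tolist())   # {0, 2}
    top_s = set(np.abs(s).argsort()[-2:].tolist())   # {0, 4}
    print(len(top_g & top_s) / len(top_g | top_s))   # 1 / 3 ~ 0.33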
Ejemplo n.º 19
0
def filter_useless_features_multiclass(args_dataset,
                                       args_model,
                                       args_explainers,
                                       args_hops,
                                       args_num_samples,
                                       args_test_samples,
                                       args_prop_noise_feat,
                                       args_connectedness,
                                       node_indices,
                                       args_K,
                                       info,
                                       args_hv,
                                       args_feat,
                                       args_coal,
                                       args_g,
                                       args_multiclass,
                                       args_regu,
                                       args_gpu,
                                       args_fullempty,
                                       args_S, 
                                       seed):
    """ Add noisy features to dataset and check how many are included in explanations
    The fewest, the better the explainer.

    Args:
        Arguments defined in argument parser of script_eval.py
    
    """

    # Define dataset and add noisy features to it
    data = prepare_data(args_dataset, seed=10)  # fixed seed keeps the split deterministic
    args_num_noise_feat = int(data.x.size(1) * args_prop_noise_feat)
    args_p = eval('EVAL1_' + data.name)['args_p']
    args_binary = eval('EVAL1_' + data.name)['args_binary']
    data, noise_feat = add_noise_features(
        data, num_noise=args_num_noise_feat, binary=args_binary, p=args_p)
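    # EVAL1_<dataset> is assumed to be a config dict from the evaluation script,
    # holding the noise probability and the binary/continuous flag for the
    # added features.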

    # Define training parameters depending on (model-dataset) couple
    hyperparam = ''.join(['hparams_', args_dataset, '_', args_model])
    param = ''.join(['params_', args_dataset, '_', args_model])

    # Define the model
    if args_model == 'GCN':
        model = GCN(input_dim=data.x.size(1),
                    output_dim=data.num_classes,
                    **eval(hyperparam))
    else:
        model = GAT(input_dim=data.x.size(1),
                    output_dim=data.num_classes,
                    **eval(hyperparam))

    # Re-train the model on dataset with noisy features
    train_and_val(model, data, **eval(param))

    # Select random subset of nodes to eval the explainer on.
    if not node_indices:
        node_indices = extract_test_nodes(data, args_test_samples, seed)

    # Evaluate the model on the test set
    model.eval()
    with torch.no_grad():
        # log-probabilities, shape (num_nodes, num_classes) - e.g. [2708, 7] for Cora
        log_logits = model(x=data.x, edge_index=data.edge_index)
    test_acc = accuracy(log_logits[data.test_mask], data.y[data.test_mask])
    print('Test accuracy is {:.4f}'.format(test_acc))
    del log_logits

    # Loop on different explainers selected
    for c, explainer_name in enumerate(args_explainers):

        # Define explainer
        explainer = eval(explainer_name)(data, model, args_gpu)

        # Count noisy features found in explanations for each test sample (for each class)
        total_num_noise_feats = []
        # Count noisy features found in explanations for each test sample, for the predicted class
        pred_class_num_noise_feats = []
        # Count noisy features considered for each test sample (for each class)
        total_num_noise_feat_considered = []
        F = []  # number of non-zero features for each test sample

        # Loop over each test sample and store how many noisy features appear among
        # the K most influential features in our explanations
        for node_idx in tqdm(node_indices, desc='explain node', leave=False):

            # GraphSVX: positional API, multiclass coefficients, features first
            if explainer_name == 'GraphSVX':
                coefs = explainer.explain(
                    [node_idx],
                    args_hops,
                    args_num_samples,
                    info,
                    args_multiclass,
                    args_fullempty,
                    args_S,
                    args_hv,
                    args_feat,
                    args_coal,
                    args_g,
                    args_regu)
                # Keep the node-feature coefficients only (first F entries)
                coefs = coefs[0].T[:explainer.F]

            # Other explainers share a keyword-based API and return multiclass coefficients
            else:
                coefs = explainer.explain(node_index=node_idx,
                                          hops=args_hops,
                                          num_samples=args_num_samples,
                                          info=False,
                                          multiclass=True)

            # Record the number of non-zero features considered for this node
            F.append(explainer.F)

            # Number of noisy features actually considered: GraphSVX and SHAP only
            # consider the node's non-zero features, while the other explainers
            # consider every feature
            if explainer_name in ('GraphSVX', 'SHAP'):
                num_noise_feat_considered = len(
                    [val for val in noise_feat[node_idx] if val != 0])
            else:
                num_noise_feat_considered = args_num_noise_feat

            # Multi-class setting: consider every class instead of focusing only
            # on the class predicted by the model
            num_noise_feats = []
            true_conf, predicted_class = model(
                x=data.x, edge_index=data.edge_index).exp()[node_idx].max(dim=0)

            for i in range(data.num_classes):

                # Store the indices of the K most important node features for this class
                feat_indices = np.abs(
                    coefs[:explainer.F, i]).argsort()[-args_K:].tolist()

                # Noisy features occupy the lowest indices among considered features,
                # so an index below num_noise_feat_considered flags a noisy one
                num_noise_feat = sum(
                    idx < num_noise_feat_considered for idx in feat_indices)
                num_noise_feats.append(num_noise_feat)

                # For predicted class only
                if i == predicted_class:
                    pred_class_num_noise_feats.append(num_noise_feat)

            # Total number of noisy features appearing in this node's explanations (all classes)
            total_num_noise_feats.append(sum(num_noise_feats))

            # Number of noisy features considered for this test sample
            total_num_noise_feat_considered.append(num_noise_feat_considered)

        if info:
            print('Noise features included in explanations: ',
                  total_num_noise_feats)
            print('There are {} noise features found in the explanations of {} test samples, '
                  'an average of {:.2f} per sample'.format(
                      sum(total_num_noise_feats), args_test_samples,
                      sum(total_num_noise_feats) / args_test_samples))

            # Average number of noisy features found in explanations for the predicted class
            print(np.sum(pred_class_num_noise_feats) / args_test_samples,
                  'for the predicted class only')

            perc = 100 * sum(total_num_noise_feat_considered) / np.sum(F)
            print(
                'Proportion of non-zero noisy features among non-zero features: {:.2f}%'.format(perc))

            perc = 100 * sum(total_num_noise_feats) / (
                args_K * args_test_samples * data.num_classes)
            print('Proportion of explanations showing noisy features: {:.2f}%'.format(perc))

            if sum(total_num_noise_feat_considered) != 0:
                perc = 100 * sum(total_num_noise_feats) / (
                    sum(total_num_noise_feat_considered) * data.num_classes)
                perc2 = 100 * (args_K * args_test_samples * data.num_classes
                               - sum(total_num_noise_feats)) / (
                    data.num_classes * (sum(F) - sum(total_num_noise_feat_considered)))
                print('Proportion of noisy features found in explanations vs normal '
                      'features (among considered ones): {:.2f}% vs {:.2f}%'.format(
                          perc, perc2))

            print('------------------------------------')

        # Plot kernel density estimates of the number of noisy features included in
        # explanations; all explainers are drawn (in different colours) on the same figure
        plot_dist(total_num_noise_feats, label=explainer_name, color=COLOURS[c])

    # Random explainer baseline - plot its estimated kernel density
    total_num_noise_feats = noise_feats_for_random(
        data, model, args_K, args_num_noise_feat, node_indices)
    plot_dist(total_num_noise_feats, label='Random', color='y')

    save_path = 'results/eval1_feat_{}'.format(data.name)
    plt.savefig(save_path)
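
# A plausible sketch of the random baseline used above: for each test node and
# each class, draw K feature indices uniformly at random and count how many fall
# among the noisy columns (assumed, as in the checks above, to occupy the first
# feature indices). Name and signature mirror the call above; the body is an
# assumption, not the repository's implementation.
import numpy as np

def noise_feats_for_random(data, model, K, num_noise_feat, node_indices):
    rng = np.random.default_rng(0)
    num_feat = data.x.size(1)
    totals = []
    for _ in node_indices:  # the model is ignored: the baseline is random
        count = 0
        for _ in range(data.num_classes):
            picks = rng.choice(num_feat, size=K, replace=False)
            count += int((picks < num_noise_feat).sum())
        totals.append(count)
    return totals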