def load_bert(args):
    s_train, p_train = load_data('penn_treebank_dataset', 'train')
    _, sen_id_train, _ = construct_graph(p_train)  # only the sentence ids are used here
    bert_train_paths = bert_embeddings(args, s_train, '_train')
    bert_train = np.load(bert_train_paths[-1])

    s_dev, p_dev = load_data('penn_treebank_dataset', 'dev')
    _, sen_id_dev, _ = construct_graph(p_dev)
    bert_dev_paths = bert_embeddings(args, s_dev, '_dev')
    bert_dev = np.load(bert_dev_paths[-1])

    s_test, p_test = load_data('penn_treebank_dataset', 'test')
    _, sen_id_test, _ = construct_graph(p_test)
    bert_test_paths = bert_embeddings(args, s_test, '_test')
    bert_test = np.load(bert_test_paths[-1])

    return bert_train, bert_dev, bert_test, sen_id_train, sen_id_dev, sen_id_test
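

# Hedged usage sketch (illustrative; not part of the original listing). It only
# assumes the signature of load_bert() defined above; demo_load_bert is a
# hypothetical helper.
def demo_load_bert(args):
    bert_train, bert_dev, bert_test, sid_tr, sid_de, sid_te = load_bert(args)
    # one embedding matrix per split, aligned with the per-split sentence ids
    print(bert_train.shape, len(sid_tr))

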
def load_graph(args, data_split=True):
    if not data_split:
        # build one global graph over all three splits (mirrors load_embeddings);
        # constructing per-split graphs here would overwrite doc_id/global_graph
        # and leave them inconsistent with the concatenated sentence ids
        _, p_train = load_data('penn_treebank_dataset', 'train')
        _, p_dev = load_data('penn_treebank_dataset', 'dev')
        _, p_test = load_data('penn_treebank_dataset', 'test')
        parsed = p_train + p_dev + p_test
        doc_id, sen_id, global_graph = construct_graph(parsed)
        graph_emb = graph_embeddings(args, global_graph, doc_id, sen_id)
        return graph_emb, sen_id
    else:
        _, p_train = load_data('penn_treebank_dataset', 'train')
        doc_id, sen_id_train, global_graph = construct_graph(p_train)
        ge_train = graph_embeddings(args, global_graph, doc_id, sen_id_train,
                                    '_train')
        _, p_dev = load_data('penn_treebank_dataset', 'dev')
        doc_id, sen_id_dev, global_graph = construct_graph(p_dev)
        ge_dev = graph_embeddings(args, global_graph, doc_id, sen_id_dev,
                                  '_dev')
        _, p_test = load_data('penn_treebank_dataset', 'test')
        doc_id, sen_id_test, global_graph = construct_graph(p_test)
        ge_test = graph_embeddings(args, global_graph, doc_id, sen_id_test,
                                   '_test')
        return ge_train, ge_dev, ge_test, sen_id_train, sen_id_dev, sen_id_test
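

# Hedged usage sketch (illustrative): the two branches of load_graph() differ
# in whether graph embeddings are computed per split or over one combined PTB
# graph; demo_load_graph is a hypothetical helper.
def demo_load_graph(args):
    ge_train, ge_dev, ge_test, sid_tr, sid_de, sid_te = load_graph(args, data_split=True)
    graph_emb, sen_id = load_graph(args, data_split=False)
    print(len(sid_tr) + len(sid_de) + len(sid_te), len(sen_id))
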
Example #3
def mi_mlps_ptb(args):
    # load data
    s_train, p_train = load_data('penn_treebank_dataset', 'train')
    s_dev, p_dev = load_data('penn_treebank_dataset', 'dev')
    s_test, p_test = load_data('penn_treebank_dataset', 'test')
    sentences = s_train + s_dev + s_test
    parsed = p_train + p_dev + p_test
    doc_id, sen_id, global_graph = construct_graph(parsed)
    s_train, p_train, s_dev, p_dev, s_test, p_test = [], [], [], [], [], []  # free the per-split copies

    # load embeddings
    graph_emb = graph_embeddings(args, global_graph, doc_id, sen_id)
    bert_emb = load_glove(args, sentences)  # GloVe vectors, kept under the bert_emb name for a uniform interface
    # bert_emb = load_elmo(args, sentences)

    # bert_emb_paths = bert_embeddings(args, sentences)
    # bert_emb = np.load(bert_emb_paths[0], allow_pickle=True)

    # initialize mi
    mir = [0.] * len(sentences)
    mig = [0.] * len(sentences)
    mib = [[0.] * len(sentences) for _ in range(args.bert_layers_num)]

    if args.baselines:
        print('3.1 start to calculate MI baselines...')
        # calculate MI baselines
        for r in range(args.repeat):
            tmp_mir = mine_probe(args, graph_emb, bert_emb, len(sentences), 'lower')
            tmp_mig = mine_probe(args, graph_emb, bert_emb, len(sentences), 'upper')
            # get sum value
            mir = [mir[s]+tmp_mir[s] for s in range(len(tmp_mir))]
            mig = [mig[s]+tmp_mig[s] for s in range(len(tmp_mig))]

    print('3.2 start to calculate MI of BERT hidden states...')
    for r in range(args.repeat):
        tmp_mib = mine_probe(args, graph_emb, bert_emb, len(sentences),
                             args.bert_layers_num - 1)
        mib[-1] = [mib[-1][s] + tmp_mib[s] for s in range(len(tmp_mib))]
    mib_layers = sum(mib[-1]) / (len(mib[-1]) * args.repeat)
    print('MI(G, GloVe): {} |'.format(mib_layers))
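
    # Hedged follow-up (added): the baseline sums in mir/mig above are never
    # averaged or reported in this function; mirroring mi_bert_ptb() further
    # down this page, one could finish with:
    if args.baselines:
        mir = [mi / args.repeat for mi in mir]
        mig = [mi / args.repeat for mi in mig]
        print('MI(G, R): {} | MI(G, G): {} |'.format(
            sum(mir) / len(mir), sum(mig) / len(mig)))
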
def load_embeddings(args):
    # load data
    s_train, p_train = load_data('penn_treebank_dataset', 'train')
    s_dev, p_dev = load_data('penn_treebank_dataset', 'dev')
    s_test, p_test = load_data('penn_treebank_dataset', 'test')
    sentences = s_train + s_dev + s_test
    parsed = p_train + p_dev + p_test
    # sentences = s_test
    # parsed = p_test
    doc_id, sen_id, global_graph = construct_graph(parsed)
    # load embeddings
    graph_emb = graph_embeddings(args, global_graph, doc_id, sen_id)
    bert_emb_paths = bert_embeddings(args, sentences)
    # graph_emb = graph_embeddings(args, global_graph, doc_id, sen_id, '_test')
    # bert_emb_paths = bert_embeddings(args, sentences, '_test')
    bert_emb = np.load(bert_emb_paths[-1])  # embeddings from the last BERT layer

    return graph_emb, bert_emb
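

# Hedged usage sketch (illustrative): the two views returned above are indexed
# by the same sentence ids, so they can be paired for a probe such as
# mine_probe(); demo_load_embeddings is a hypothetical helper.
def demo_load_embeddings(args):
    graph_emb, bert_emb = load_embeddings(args)
    print(type(graph_emb), bert_emb.shape)
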
Example #5
        # (1, dim) -> (1, dim) -> (1, )
        logit = th.sigmoid(th.sum(src * dst))
        preds.append(logit.detach().numpy().tolist())
        labels.append(edge.label)

    fpr, tpr, thresholds = metrics.roc_curve(labels, preds, pos_label=1)

    print("Evaluate link prediction AUC: {:.4f}".format(metrics.auc(fpr, tpr)))


if __name__ == "__main__":
    args = utils.init_args()

    valid_sku_raw_ids = utils.get_valid_sku_set(args.item_info_data)

    g, sku_encoder, sku_decoder = utils.construct_graph(
        args.action_data, args.session_interval_sec, valid_sku_raw_ids)

    train_g, test_g = utils.split_train_test_graph(g)

    sku_info_encoder, sku_info_decoder, sku_info = \
        utils.encode_sku_fields(args.item_info_data, sku_encoder, sku_decoder)

    num_skus = len(sku_encoder)
    num_brands = len(sku_info_encoder["brand"])
    num_shops = len(sku_info_encoder["shop"])
    num_cates = len(sku_info_encoder["cate"])

    print(
        "Num skus: {}, num brands: {}, num shops: {}, num cates: {}".format(
            num_skus, num_brands, num_shops, num_cates)
    )
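
    # Hedged sketch (illustrative only) of the encode/decode convention the
    # counts above assume: raw ids map to contiguous ints and back.
    demo_encoder = {raw: i for i, raw in enumerate(["sku_a", "sku_b"])}
    demo_decoder = {i: raw for raw, i in demo_encoder.items()}
    assert demo_decoder[demo_encoder["sku_b"]] == "sku_b"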
Example #6

                l = list(nx.all_simple_paths(G, source=idx_s, target=idx_d))
                paths_len.append(len(l))
            else:
                continue
    return np.array(paths_len)


if __name__ == '__main__':
    """ Load rules from ClassBench filter file, build a graph,
	and print graph statistics """
    args = parse_args()

    ruleset = load_ruleset(args.ruleset, except_zero=False, random_priority=0)

    # build graph
    G = construct_graph(ruleset, True)

    # each node's (in-degree, out-degree)
    node_degree = []
    node_list = list(G.nodes())
    for node in node_list:
        node_degree.append((G.in_degree(node), G.out_degree(node)))
    """Calculation of nodes and edges"""
    # number of edges of each component
    edge_num_by_component = []
    weak_list = list(nx.weakly_connected_components(G))

    for component in weak_list:
        # if len(component) != 1:
        #     edge_num_by_component.append(len(list(G.edges(component))))
        edge_num_by_component.append(len(list(G.edges(component))))
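
    # Hedged summary printout (added for illustration; it only uses the
    # statistics gathered above and standard networkx calls).
    in_deg = [d for d, _ in node_degree]
    out_deg = [d for _, d in node_degree]
    print("nodes: {}, edges: {}".format(G.number_of_nodes(), G.number_of_edges()))
    print("max in/out degree: {}/{}".format(max(in_deg), max(out_deg)))
    print("weakly connected components: {}".format(len(weak_list)))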
Example #7
def mi_bert_ptb(args, npeet=False, uncontext=False):
    # load data
    s_train, p_train = load_data('penn_treebank_dataset', 'train')
    s_dev, p_dev = load_data('penn_treebank_dataset', 'dev')
    s_test, p_test = load_data('penn_treebank_dataset', 'test')
    sentences = s_train + s_dev + s_test
    parsed = p_train + p_dev + p_test
    doc_id, sen_id, global_graph = construct_graph(parsed)
    s_train, p_train, s_dev, p_dev, s_test, p_test = [], [], [], [], [], []  # free the per-split copies

    # load embeddings
    graph_emb = graph_embeddings(args, global_graph, doc_id, sen_id)
    if uncontext:
        bert_emb = load_glove(args, sentences)
        # bert_emb = load_elmo(args, sentences)
    else:
        bert_emb_paths = bert_embeddings(args, sentences)
        # bert_emb_paths = load_elmos(args, sentences)
        bert_emb = np.load(bert_emb_paths[0], allow_pickle=True)

    # initialize mi
    mir = [0.] * len(sentences)
    mig = [0.] * len(sentences)
    mib = [[0.] * len(sentences) for _ in range(args.bert_layers_num)]

    if args.baselines:
        print('3.1 start to calculate MI baselines...')
        # calculate MI baselines
        for r in range(args.repeat):
            tmp_mir = mine_probe(args, graph_emb, bert_emb, len(sentences), 'lower')
            tmp_mig = mine_probe(args, graph_emb, bert_emb, len(sentences), 'upper')
            # get sum value
            mir = [mir[s]+tmp_mir[s] for s in range(len(tmp_mir))]
            mig = [mig[s]+tmp_mig[s] for s in range(len(tmp_mig))]

    print('3.2 start to calculate MI of BERT hidden states...')
    if uncontext:
        for r in range(args.repeat):
            tmp_mib = mine_probe(args, graph_emb, bert_emb, len(sentences),
                                 args.bert_layers_num - 1)
            mib[-1] = [mib[-1][s] + tmp_mib[s] for s in range(len(tmp_mib))]
        mib_layers = sum(mib[-1]) / (len(mib[-1]) * args.repeat)
        print('MI(G, GloVe): {} |'.format(mib_layers))
    else:
        # calculate MI of BERT
        for l in range(args.bert_layers_num):
            bert_emb = np.load(bert_emb_paths[l], allow_pickle=True)
            for r in range(args.repeat):
                tmp_mib = mine_probe(args, graph_emb, bert_emb, len(sentences), l)
                mib[l] = [mib[l][s]+tmp_mib[s] for s in range(len(tmp_mib))]
        # compute average values for all results
        mir = [mi/args.repeat for mi in mir]
        mig = [mi/args.repeat for mi in mig]
        for l in range(args.bert_layers_num):
            mib[l] = [mi/args.repeat for mi in mib[l]]
        mib_layers = [sum(mib[l])/len(mib[l]) for l in range(len(mib))]

        # print general results
        results = {'lower': mir, 'upper': mig, 'bert': mib}
        # print('\n', results, '\n')
        
        print('MI(G, R): {} | MI(G, G): {} | MI(G, BERT): {} |'.format(
            sum(mir) / len(mir), sum(mig) / len(mig), mib_layers))

    return
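

# Hedged usage sketch (illustrative): the two flags mirror mi_bert_ptb()'s own
# parameters; demo_mi_bert is a hypothetical helper.
def demo_mi_bert(args):
    mi_bert_ptb(args, uncontext=False)  # per-layer MI(G, BERT)
    mi_bert_ptb(args, uncontext=True)   # single MI(G, GloVe) value
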
Example #8
# run everything in float64
torch.set_default_tensor_type(torch.DoubleTensor)
torch.set_default_dtype(torch.float64)
np.random.seed(config['seed'])
torch.manual_seed(config['seed'])

# -------------------------------------------------------------------------
# Setup logger
# -------------------------------------------------------------------------
logger.info(f"Add file handle to logger...")
logzero.logfile(os.path.join(result_dir, 'logs.log'))

# -------------------------------------------------------------------------
# Construct graph from config file
# -------------------------------------------------------------------------
logger.info("Construct graph...")
g, g_noy = utils.construct_graph(config)

# -------------------------------------------------------------------------
# Load data according to config
# -------------------------------------------------------------------------
data_type = config['data']['type']
logger.info(f"Load {data_type} data; A: {config['data']['protected']} ...")
data = data_loader.get_data(data_type, config['data'], graph=g)
a, y = data['A'], data['Y']
config['data']['samples'] = len(y)
config['max_cfu']['n_original'] = len(y)

if debug > 0:
    logger.info("Create and save scatter plot of features...")
    plotters.plot_scatter_matrix(data, g, fig_dir, save=True)
    logger.info("Create conditional histograms...")