Example #1
def main(argv):

    #parse command line arguments
    parser = argparse.ArgumentParser(add_help=False)
    parser.add_argument("-d",
                        "--data_dir",
                        type=str,
                        required=True,
                        dest="data_dir",
                        help="name of directory where data is stored")
    parser.add_argument(
        "-s",
        "--dataset",
        type=str,
        required=True,
        dest="data_name",
        help="name of dataset to create (without hdf5 extension)")
    parser.add_argument(
        "-n",
        "--normed",
        type=int,
        default=1,
        dest="use_normed",
        help="choose whether to use normalized features or not")

    args = parser.parse_args()

    data_path = args.data_dir
    data_name = args.data_name
    use_normed = args.use_normed

    infile_prefix = data_path + data_name + "_"
    if use_normed:
        ext = '.normed'
    else:
        ext = ''

    total_true = total_edges = total_b = total_c = 0

    dataset_len = np.zeros(3)
    for i, dataset_type in enumerate(['train', 'val', 'test']):
        g_list = []
        graphs = dgl.load_graphs(infile_prefix + dataset_type + ext +
                                 ".bin")[0]
        dataset_len[i] = len(graphs)

        for graph in graphs:
            ntracks = graph.num_nodes()
            graph = prune_graph(graph)

            #only consider training dataset when writing values to paramfile
            if dataset_type == 'train':
                total_true += int(th.sum(graph.edata['bin_labels'][:, 0]))
                total_b += int(th.sum(graph.edata['mult_labels'][:, 0] == 1))
                total_c += int(th.sum(graph.edata['mult_labels'][:, 0] == 2))
                total_edges += list(graph.edata['bin_labels'][:, 0].size())[0]

            g_list.append(graph)

        random.shuffle(g_list)
        dgl.save_graphs(infile_prefix + dataset_type + ext + '.pruned.bin',
                        g_list)

    #store important values in paramfile
    paramfile = open(infile_prefix + 'params', "w")
    paramfile.write(str(dataset_len[0]) + '\n')  #train length
    paramfile.write(str(dataset_len[1]) + '\n')  #val length
    paramfile.write(str(dataset_len[2]) + '\n')  #test length
    paramfile.write(str(total_true / total_edges) + '\n')
    paramfile.write(str(total_b / total_edges) + '\n')
    paramfile.write(str(total_c / total_edges) + '\n')
    paramfile.close()
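The parameter file written here is read back later by the training script (see Example #10), which casts the first three lines via int(float(...)); a minimal reader sketch, with read_params as a hypothetical helper:

def read_params(paramfile_name):
    #the dataset lengths above are written as floats (e.g. "1000.0"), so cast via float first
    with open(paramfile_name, "r") as paramfile:
        train_len = int(float(paramfile.readline()))
        val_len = int(float(paramfile.readline()))
        test_len = int(float(paramfile.readline()))
        truth_frac = float(paramfile.readline())
        b_frac = float(paramfile.readline())
        c_frac = float(paramfile.readline())
    return train_len, val_len, test_len, truth_frac, b_frac, c_frac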
Example #2
def create_old_heterograph_files():
    path = os.path.join(os.path.dirname(__file__), "data/hetero1.bin")
    g_list0 = create_heterographs(F.int64) + create_heterographs(F.int32)
    labels_dict = {"graph_label": F.ones(54)}
    dgl.save_graphs(path, g_list0, labels_dict)
    # assign node feature: origin_id(not node_id)
    for entity in entity_dic.keys():
        node_feature = th.tensor(entity_dic[entity])
        g.nodes[entity].data['id'] = node_feature
    # assign edge feature: time
    for edge in edge_dic.keys():
        edge_feature = th.tensor(edge_dic[edge])
        g.edges[edge].data['timestamp'] = edge_feature

    return g
    # print(g)
    # print(g.number_of_nodes('process'))
    # print(g.nodes['process'].data['id'])
    # print(g.edges[('process', 'open', 'file')].data['timestamp'])


if __name__ == '__main__':

    scenario = 'VGame'
    for graph_id in range(200, 299):
        dgl_graph = data_to_heterograph(scenario, graph_id)
        dgl_graphname = "dataset/dglGraph/" + scenario + "/" + str(
            graph_id) + ".bin"
        graph_labels = {"glabel": th.tensor([graph_id])}
        dgl.save_graphs(dgl_graphname, [dgl_graph], graph_labels)
        print("graph #" + str(graph_id) + " of scenario " + scenario +
              " has been saved!")

# load graph from disk
# glist, label_dict = dgl.load_graphs("dataset/dglGraph/YouTube/0.bin")
Example #4
def main(argv):

    #parse command line arguments
    parser = argparse.ArgumentParser(add_help=False)
    parser.add_argument("-d",
                        "--data_dir",
                        type=str,
                        required=True,
                        dest="data_dir",
                        help="name of directory where data is stored")
    parser.add_argument(
        "-s",
        "--dataset",
        type=str,
        required=True,
        dest="data_name",
        help="name of dataset to create (without hdf5 extension)")
    args = parser.parse_args()

    data_path = args.data_dir
    data_name = args.data_name

    train_infile_name = data_path + data_name + "_train.bin"
    val_infile_name = data_path + data_name + "_val.bin"
    test_infile_name = data_path + data_name + "_test.bin"
    train_outfile_name = data_path + data_name + "_train.normed.bin"
    val_outfile_name = data_path + data_name + "_val.normed.bin"
    test_outfile_name = data_path + data_name + "_test.normed.bin"
    normfile_name = data_path + data_name + "_norm"

    incl_errors = incl_hits = incl_corr = incl_vweight = False
    train_graphs = dgl.load_graphs(train_infile_name)[0]
    num_features_base = train_graphs[0].ndata['features_base'].size()[1]
    mean_features_base = np.zeros(num_features_base)
    std_features_base = np.zeros(num_features_base)
    if 'features_vweight' in train_graphs[0].ndata.keys():
        incl_vweight = True
        num_features_vweight = train_graphs[0].ndata['features_vweight'].size(
        )[1]
        mean_features_vweight = np.zeros(num_features_vweight)
        std_features_vweight = np.zeros(num_features_vweight)
    if 'features_errors' in train_graphs[0].ndata.keys():
        incl_errors = True
        num_features_errors = train_graphs[0].ndata['features_errors'].size(
        )[1]
        mean_features_errors = np.zeros(num_features_errors)
        std_features_errors = np.zeros(num_features_errors)
    if 'features_hits' in train_graphs[0].ndata.keys():
        incl_hits = True
        num_features_hits = train_graphs[0].ndata['features_hits'].size()[1]
        mean_features_hits = np.zeros(num_features_hits)
        std_features_hits = np.zeros(num_features_hits)
    if 'features_corr' in train_graphs[0].ndata.keys():
        incl_corr = True
        num_features_corr = train_graphs[0].ndata['features_corr'].size()[1]
        mean_features_corr = np.zeros(num_features_corr)
        std_features_corr = np.zeros(num_features_corr)

    print("Calculating mean of features")
    #calculate mean of training features - error and cov scaling is based only on scaling of base features
    total_tracks = 0
    for graph in train_graphs:
        features_base = graph.ndata['features_base'].numpy()
        mean_features_base += np.sum(features_base, axis=0)
        if incl_vweight:
            features_vweight = graph.ndata['features_vweight'].numpy()
            mean_features_vweight += np.sum(features_vweight, axis=0)
        if incl_hits:
            features_hits = graph.ndata['features_hits'].numpy()
            mean_features_hits += np.sum(features_hits, axis=0)
        total_tracks += graph.ndata['features_base'].size()[0]
    mean_features_base = mean_features_base / total_tracks
    if incl_vweight:
        mean_features_vweight = mean_features_vweight / total_tracks
    if incl_hits: mean_features_hits = mean_features_hits / total_tracks

    print("Calculating STD of features")
    #calculate std of training features
    for graph in train_graphs:
        features_base = graph.ndata['features_base'].numpy()
        std_features_base += np.sum(np.square(features_base -
                                              mean_features_base),
                                    axis=0)
        if incl_vweight:
            features_vweight = graph.ndata['features_vweight'].numpy()
            std_features_vweight += np.sum(np.square(features_vweight -
                                                     mean_features_vweight),
                                           axis=0)
        if incl_hits:
            features_hits = graph.ndata['features_hits'].numpy()
            std_features_hits += np.sum(np.square(features_hits -
                                                  mean_features_hits),
                                        axis=0)
    std_features_base = np.sqrt(std_features_base / total_tracks)
    if incl_vweight:
        std_features_vweight = np.sqrt(std_features_vweight / total_tracks)
    if incl_hits: std_features_hits = np.sqrt(std_features_hits / total_tracks)

    #manually set normalization parameters for special features (features that have a fixed range are set to vary from -1 to 1)
    mean_features_base[1] = math.pi / 2.  #track theta varies from 0 to pi
    std_features_base[1] = math.pi / 2.
    mean_features_base[2] = 0  #track phi varies from -pi to pi
    std_features_base[2] = math.pi
    mean_features_base[7] = 0  #jet phi varies from -pi to pi
    std_features_base[7] = math.pi
    if incl_errors:  #take std of variance to be std of features squared
        std_features_errors[0] = std_features_base[0]
        std_features_errors[1] = std_features_base[1]
        std_features_errors[2] = std_features_base[2]
        std_features_errors[3] = std_features_base[3]
        std_features_errors[4] = std_features_base[4]
    if incl_corr:  #take std of covariance to be product of std of features
        std_features_corr[0] = std_features_base[0] * std_features_base[1]
        std_features_corr[1] = std_features_base[0] * std_features_base[2]
        std_features_corr[2] = std_features_base[0] * std_features_base[3]
        std_features_corr[3] = std_features_base[0] * std_features_base[4]
        std_features_corr[4] = std_features_base[1] * std_features_base[2]
        std_features_corr[5] = std_features_base[1] * std_features_base[3]
        std_features_corr[6] = std_features_base[1] * std_features_base[4]
        std_features_corr[7] = std_features_base[2] * std_features_base[3]
        std_features_corr[8] = std_features_base[2] * std_features_base[4]
        std_features_corr[9] = std_features_base[3] * std_features_base[4]

    #store normalization parameters in file
    normfile = open(normfile_name, "w")
    for i in range(len(mean_features_base)):
        normfile.write(str(mean_features_base[i]) + '\n')
        normfile.write(str(std_features_base[i]) + '\n')
    if incl_vweight:
        for i in range(len(mean_features_vweight)):
            normfile.write(str(mean_features_vweight[i]) + '\n')
            normfile.write(str(std_features_vweight[i]) + '\n')
    if incl_errors:
        for i in range(len(mean_features_errors)):
            normfile.write(str(mean_features_errors[i]) + '\n')
            normfile.write(str(std_features_errors[i]) + '\n')
    if incl_corr:
        for i in range(len(mean_features_corr)):
            normfile.write(str(mean_features_corr[i]) + '\n')
            normfile.write(str(std_features_corr[i]) + '\n')
    if incl_hits:
        for i in range(len(mean_features_hits)):
            normfile.write(str(mean_features_hits[i]) + '\n')
            normfile.write(str(std_features_hits[i]) + '\n')
    normfile.close()

    #apply normalization from training data to all graph features
    print("Normalizing {} training graphs".format(len(train_graphs)))
    for graph in train_graphs:
        features_base = graph.ndata['features_base'].numpy()
        normed_features_base = np.divide(features_base - mean_features_base,
                                         std_features_base)
        graph.ndata['features_base'] = th.from_numpy(normed_features_base)
        if incl_vweight:
            features_vweight = graph.ndata['features_vweight'].numpy()
            normed_features_vweight = np.divide(
                features_vweight - mean_features_vweight, std_features_vweight)
            graph.ndata['features_vweight'] = th.from_numpy(
                normed_features_vweight)
        if incl_errors:
            features_errors = graph.ndata['features_errors'].numpy()
            normed_features_errors = np.divide(
                features_errors - mean_features_errors, std_features_errors)
            graph.ndata['features_errors'] = th.from_numpy(
                normed_features_errors)
        if incl_corr:
            features_corr = graph.ndata['features_corr'].numpy()
            normed_features_corr = np.divide(
                features_corr - mean_features_corr, std_features_corr)
            graph.ndata['features_corr'] = th.from_numpy(normed_features_corr)
        if incl_hits:
            features_hits = graph.ndata['features_hits'].numpy()
            normed_features_hits = np.divide(
                features_hits - mean_features_hits, std_features_hits)
            graph.ndata['features_hits'] = th.from_numpy(normed_features_hits)
    dgl.save_graphs(train_outfile_name, train_graphs)

    val_graphs = dgl.load_graphs(val_infile_name)[0]
    print("Normalizing {} validation graphs".format(len(val_graphs)))
    for graph in val_graphs:
        features_base = graph.ndata['features_base'].numpy()
        normed_features_base = np.divide(features_base - mean_features_base,
                                         std_features_base)
        graph.ndata['features_base'] = th.from_numpy(normed_features_base)
        if incl_vweight:
            features_vweight = graph.ndata['features_vweight'].numpy()
            normed_features_vweight = np.divide(
                features_vweight - mean_features_vweight, std_features_vweight)
            graph.ndata['features_vweight'] = th.from_numpy(
                normed_features_vweight)
        if incl_errors:
            features_errors = graph.ndata['features_errors'].numpy()
            normed_features_errors = np.divide(
                features_errors - mean_features_errors, std_features_errors)
            graph.ndata['features_errors'] = th.from_numpy(
                normed_features_errors)
        if incl_corr:
            features_corr = graph.ndata['features_corr'].numpy()
            normed_features_corr = np.divide(
                features_corr - mean_features_corr, std_features_corr)
            graph.ndata['features_corr'] = th.from_numpy(normed_features_corr)
        if incl_hits:
            features_hits = graph.ndata['features_hits'].numpy()
            normed_features_hits = np.divide(
                features_hits - mean_features_hits, std_features_hits)
            graph.ndata['features_hits'] = th.from_numpy(normed_features_hits)
    dgl.save_graphs(val_outfile_name, val_graphs)

    test_graphs = dgl.load_graphs(test_infile_name)[0]
    print("Normalizing {} testing graphs".format(len(test_graphs)))
    for graph in test_graphs:
        features_base = graph.ndata['features_base'].numpy()
        normed_features_base = np.divide(features_base - mean_features_base,
                                         std_features_base)
        graph.ndata['features_base'] = th.from_numpy(normed_features_base)
        if incl_vweight:
            features_vweight = graph.ndata['features_vweight'].numpy()
            normed_features_vweight = np.divide(
                features_vweight - mean_features_vweight, std_features_vweight)
            graph.ndata['features_vweight'] = th.from_numpy(
                normed_features_vweight)
        if incl_errors:
            features_errors = graph.ndata['features_errors'].numpy()
            normed_features_errors = np.divide(
                features_errors - mean_features_errors, std_features_errors)
            graph.ndata['features_errors'] = th.from_numpy(
                normed_features_errors)
        if incl_corr:
            features_corr = graph.ndata['features_corr'].numpy()
            normed_features_corr = np.divide(
                features_corr - mean_features_corr, std_features_corr)
            graph.ndata['features_corr'] = th.from_numpy(normed_features_corr)
        if incl_hits:
            features_hits = graph.ndata['features_hits'].numpy()
            normed_features_hits = np.divide(
                features_hits - mean_features_hits, std_features_hits)
            graph.ndata['features_hits'] = th.from_numpy(normed_features_hits)
    dgl.save_graphs(test_outfile_name, test_graphs)
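The normalization file written above is a flat list of alternating mean/std lines, one pair per feature, starting with the base features; a minimal sketch of reading that first block back, with read_norm_base as a hypothetical helper:

def read_norm_base(normfile_name, num_features_base):
    #returns mean/std arrays for the base-feature block written first in the file
    means = np.zeros(num_features_base)
    stds = np.zeros(num_features_base)
    with open(normfile_name, "r") as normfile:
        for i in range(num_features_base):
            means[i] = float(normfile.readline())
            stds[i] = float(normfile.readline())
    return means, stds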
def sample_opti_sequenz_graph(iter=None, maxIteration=501):
    # initialize LEP
    LEP = LEProblem(bridge2Dstochastic())
    tau_adapter = tauadaptor(LEP.get_tau())
    mesh_adapter = meshadaptor(mesh_0=LEP.mesh, CVaR=LEP.CVaR)

    LEP = get_unique_dist_g(LEP, iter=None)

    IterationStep = 0
    get_new_graph_iter = 4

    iters = []
    taus = []
    taus.append(LEP.tau_0)
    gammas = []
    es = []
    controls = []

    for k in range(maxIteration):
        IterationStep += 1
        get_new_graph_iter += 1
        iters.append(IterationStep)
        print("\nIter: {IterationStep}".format(IterationStep=IterationStep))

        # solving
        SE = LEP.SE(LEP.u, LEP.phi_n[0], LEP.v_u, LEP.g)
        solve(lhs(SE) == rhs(SE), LEP.u_n, bcs=LEP.bcSE, solver_parameters={"linear_solver": "umfpack", "preconditioner": "default"}, form_compiler_parameters=None)

        #get u_k and phi_{k-1}
        if get_new_graph_iter >= 5:

            # get edges from fenics
            edge_from = []
            edge_to = []
            #element = LEP.F.element()
            dofmap = LEP.F.dofmap()
            for cell in cells(LEP.mesh):
                finite_element_node_index = dofmap.cell_dofs(cell.index())
                edge_from += list(list(zip(*itertools.permutations(finite_element_node_index[:-1], 2)))[0])
                edge_to += list(list(zip(*itertools.permutations(finite_element_node_index[:-1], 2)))[1])

            # initialize graph
            g = dgl.graph((edge_from, edge_to)) # initialize Graph (nodes and edges)


            g.ndata['value'] = torch.tensor([ [0.5] for i in range(g.number_of_nodes())], dtype=torch.float32)
            g.ndata['position'] = torch.tensor([ [0.001, 0.001] for i in range(g.number_of_nodes()) ], dtype=torch.float32)
            g.ndata['u'] = torch.tensor([ [0.001, 0.001] for i in range(g.number_of_nodes()) ], dtype=torch.float32)


            # get graph data from fenics
            phi_val = LEP.phi_n.vector()[:-1]
            ux, uy = LEP.u_n.split(deepcopy=True)
            ux_val = ux.vector()[:]
            uy_val = uy.vector()[:]
            phi_coor = LEP.F.tabulate_dof_coordinates()[:-1]


            for i in range(len(phi_val)):
                g.ndata['value'][i] = phi_val[i]
                g.ndata['u'][i] = torch.tensor([ux_val[i], uy_val[i]], dtype=torch.float32)
                g.ndata['position'][i] = torch.tensor(phi_coor[i], dtype=torch.float32)

            dgl.save_graphs('data/' + str(iter)+ '_' +str(IterationStep)+ '_(' + str(LEP.g(0)[0]) + ',' + str(LEP.g(0)[1]) + ')_truncnorm_bridge2D_GRAPH_50_sequence.bin', g)


            get_new_graph_iter = 0


        LEP.p_n.assign(LEP.u_n)  # p_n = u_n

        # solve gradient equation
        GE = LEP.GE(LEP.phi, LEP.phi_n, LEP.v_phi, LEP.p_n, LEP.u_n, LEP.tau[0], LEP.gamma)
        solve(lhs(GE) == rhs(GE), LEP.phi_next, bcs=None, solver_parameters={"linear_solver": "umfpack", "preconditioner": "default"}, form_compiler_parameters=None)

        LEP.project_phi()

        # optimization adaptions
        tau_n = LEP.get_tau()
        taus.append(tau_n)

        e_n = LEP.get_e_n()
        es.append(e_n)

        control = LEP.get_control_change()
        controls.append(control / tau_n)

        gammas.append(LEP.gamma(0))

        LEP.tau.vector()[:] = tau_adapter.nextTau(e_n)[0]

        LEP.new_adaptGamma()

        NewMeshIndicator, new_mesh = mesh_adapter.update(LEP.phi_next, LEP.u_n, controls[-1], LEP.mesh)

        if NewMeshIndicator:
            LEP.updateMesh(new_mesh)

        LEP.phi_last.assign(LEP.phi_n)
        LEP.phi_n.assign(LEP.phi_next)



    return
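The per-iteration graphs saved by sample_opti_sequenz_graph above can be gathered back with dgl.load_graphs; a minimal sketch, assuming dgl is imported as in the snippet and matching the filename suffix used in the save call:

import glob

graph_files = sorted(glob.glob('data/*_truncnorm_bridge2D_GRAPH_50_sequence.bin'))
graphs = [dgl.load_graphs(f)[0][0] for f in graph_files]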
        user_table_path = 's3://xhs.alpha/reddm/' + args.user_table + '/dtm=%s' % args.dsnodash
        user_features = pq.ParquetDataset(user_table_path, filesystem=s3).read().to_pandas()
        device_table_path = 's3://xhs.alpha/reddm/' + args.device_table + '/dtm=%s' % args.dsnodash
        device_features = pq.ParquetDataset(device_table_path, filesystem=s3).read().to_pandas()
        relation_table_path = 's3://xhs.alpha/reddm/' + args.relation_table + '/dtm=%s' % args.dsnodash
        relation_df = pq.ParquetDataset(relation_table_path, filesystem=s3).read().to_pandas()
        label_table_path = 's3://xhs.alpha/reddm/' + args.label_table + '/dtm=%s' % args.dsnodash
        labels = pq.ParquetDataset(label_table_path, filesystem=s3).read().to_pandas()
        # Build graph
        graph_builder = PandasGraphBuilder()
        graph_builder.add_entities(user_features, 'user_entity_id', 'user')
        graph_builder.add_entities(device_features, 'device_entity_id', 'device')
        graph_builder.add_binary_relations(relation_df, 'user_entity_id', 'device_entity_id', 'used')
        graph_builder.add_binary_relations(relation_df, 'device_entity_id', 'user_entity_id', 'used-by')
        g = graph_builder.build()
        dgl.save_graphs('./dataset/dgl_graph', [g])
        # Assign features.
        user_features = user_features.sort_values(by='user_entity_id').values[:, 1:]
        device_features = device_features.sort_values(by='device_entity_id').values[:, 1:]
        labels = labels.values
        np.random.shuffle(labels)
        pos_label_count = np.count_nonzero(labels[:, 1] > 0)
        neg_labels = labels[labels[:, 1] == 0]
        neg_labels = neg_labels[:pos_label_count*args.sample_ratio, :]
        labels = np.vstack((labels[labels[:, 1] > 0], neg_labels))
        np.savez_compressed('./dataset/feat_and_label', user_f=user_features, device_f=device_features, labels=labels)
    else:
        g = dgl.load_graphs('./dataset/dgl_graph')[0][0]
        np_ds = np.load('./dataset/feat_and_label.npz')
        user_features, device_features, labels = np_ds['user_f'], np_ds['device_f'], np_ds['labels']
Example #7
            torch.arange(dataset.num_papers)
        ]))
    g.edata['etype'] = g.edata[dgl.ETYPE].byte()
    del g.edata[dgl.ETYPE]
    del g.ndata[dgl.NTYPE]
    del g.ndata[dgl.NID]

    # Process feature
    full_feat = np.memmap(args.full_output_path,
                          mode='w+',
                          dtype='float16',
                          shape=(dataset.num_authors +
                                 dataset.num_institutions + dataset.num_papers,
                                 dataset.num_paper_features))
    BLOCK_ROWS = 100000
    for start in tqdm.trange(0, dataset.num_authors, BLOCK_ROWS):
        end = min(dataset.num_authors, start + BLOCK_ROWS)
        full_feat[author_offset + start:author_offset +
                  end] = author_feat[start:end]
    for start in tqdm.trange(0, dataset.num_institutions, BLOCK_ROWS):
        end = min(dataset.num_institutions, start + BLOCK_ROWS)
        full_feat[inst_offset + start:inst_offset + end] = inst_feat[start:end]
    for start in tqdm.trange(0, dataset.num_papers, BLOCK_ROWS):
        end = min(dataset.num_papers, start + BLOCK_ROWS)
        full_feat[paper_offset + start:paper_offset +
                  end] = paper_feat[start:end]

# Convert the graph to the given format and save.  (The RGAT baseline needs CSC graph)
g = g.formats(args.graph_format)
dgl.save_graphs(args.graph_output_path, g)
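The float16 memmap written above can be re-opened read-only with the same shape for downstream training; a minimal sketch, reusing dataset and args from the snippet:

num_nodes = dataset.num_authors + dataset.num_institutions + dataset.num_papers
full_feat = np.memmap(args.full_output_path,
                      mode='r',
                      dtype='float16',
                      shape=(num_nodes, dataset.num_paper_features))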
Example #8
def load_from_ogbl_with_name(name):    
    choices = ['ogbl-collab', 'ogbl-ddi', 'ogbl-ppa', 'ogbl-citation']
    assert name in choices, "name must be selected from " + str(choices)
    dataset = DglLinkPropPredDataset(name)
    return dataset[0]

def load_from_ogbn_with_name(name):
    choices = ['ogbn-products', 'ogbn-proteins', 'ogbn-arxiv', 'ogbn-papers100M']
    assert name in choices, "name must be selected from " + str(choices)
    graph, label = DglNodePropPredDataset(name)[0]
    return graph

if __name__ == "__main__":
    """ load datasets as net.txt format """
    parser = argparse.ArgumentParser()
    parser.add_argument('--name', type=str,
        choices=['ogbl-collab', 'ogbl-ddi', 'ogbl-ppa', 'ogbl-citation', 
            'ogbn-products', 'ogbn-proteins', 'ogbn-arxiv', 'ogbn-papers100M'],
        default='ogbl-collab',
        help="name of datasets by ogb")
    args = parser.parse_args()

    name = args.name
    if name.startswith("ogbl"):
        g = load_from_ogbl_with_name(name=name)
    else:
        g = load_from_ogbn_with_name(name=name)

    dgl.save_graphs(name + "-graph.bin", g)
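Reading the saved binary back is symmetric: dgl.load_graphs returns a (graph_list, label_dict) pair; a short usage sketch for the file written above:

glist, _ = dgl.load_graphs(name + "-graph.bin")
g = glist[0]
print(g)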
Example #9
    g.edges['listened'].data['created_at'] = torch.LongTensor(
        events['created_at'].values)
    g.edges['listened-by'].data['created_at'] = torch.LongTensor(
        events['created_at'].values)

    n_edges = g.number_of_edges('listened')
    train_indices, val_indices, test_indices = train_test_split_by_time(
        events, 'created_at', 'user_id')
    train_g = build_train_graph(g, train_indices, 'user', 'track', 'listened',
                                'listened-by')
    assert train_g.out_degrees(etype='listened').min() > 0
    val_matrix, test_matrix = build_val_test_matrix(g, val_indices,
                                                    test_indices, 'user',
                                                    'track', 'listened')

    dgl.save_graphs(os.path.join(out_directory, 'train_g.bin'), train_g)

    dataset = {
        'val-matrix': val_matrix,
        'test-matrix': test_matrix,
        'item-texts': {},
        'item-images': None,
        'user-type': 'user',
        'item-type': 'track',
        'user-to-item-type': 'listened',
        'item-to-user-type': 'listened-by',
        'timestamp-edge-column': 'created_at'
    }

    with open(os.path.join(out_directory, 'data.pkl'), 'wb') as f:
        pickle.dump(dataset, f)
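At training time the two artifacts written here are typically reloaded together; a minimal sketch, assuming the same out_directory:

train_g = dgl.load_graphs(os.path.join(out_directory, 'train_g.bin'))[0][0]
with open(os.path.join(out_directory, 'data.pkl'), 'rb') as f:
    dataset = pickle.load(f)
val_matrix, test_matrix = dataset['val-matrix'], dataset['test-matrix']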
Example #10
def main(argv):
    gROOT.SetBatch(True)

    #parse command line arguments
    parser = argparse.ArgumentParser(add_help=False)
    parser.add_argument("-r",
                        "--runnumber",
                        type=str,
                        default="0",
                        dest="runnumber",
                        help="unique identifier for current run")
    parser.add_argument("-e",
                        "--epochs",
                        type=int,
                        default=20,
                        dest="nepochs",
                        help="number of epochs for training")
    parser.add_argument("-d",
                        "--data_dir",
                        type=str,
                        required=True,
                        dest="data_dir",
                        help="name of directory where data is stored")
    parser.add_argument("-o",
                        "--output_dir",
                        type=str,
                        required=True,
                        dest="output_dir",
                        help="name of directory where GNN output is stored")
    parser.add_argument(
        "-s",
        "--dataset",
        type=str,
        required=True,
        dest="infile_name",
        help="name of dataset to train on (without hdf5 extension)")
    parser.add_argument(
        "-n",
        "--normed",
        type=int,
        default=1,
        dest="use_normed",
        help="choose whether to use normalized features or not")
    parser.add_argument(
        "-m",
        "--multiclass",
        type=int,
        default=0,
        dest="multi_class",
        help="choose whether to perform binary of multi-class classification")
    args = parser.parse_args()

    runnumber = args.runnumber
    nepochs = args.nepochs
    infile_name = args.infile_name
    infile_path = args.data_dir
    outfile_path = args.output_dir
    use_normed = args.use_normed
    multi_class = args.multi_class

    #import options from option file
    learning_rate = options.learning_rate
    batch_size = options.batch_size
    attention_heads = options.attention_heads
    nodemlp_sizes = options.nodemlp_sizes
    gat_sizes = options.gat_sizes
    edgemlp_sizes = options.edgemlp_sizes
    reweight = options.reweight  #reweight positive labels in loss to make positives and negatives equally important
    load_checkpoint = options.load_checkpoint
    use_lr_scheduler = options.use_lr_scheduler

    #---------------------------------------------------DATA-IMPORT-------------------------------------------------

    start_time = time.time()
    print("Importing input data.")

    #set relevant filenames
    if use_normed:
        ext = ".normed.pruned"
    else:
        ext = ".pruned"
    paramfile_name = infile_path + infile_name + "_params"
    train_infile_name = infile_path + infile_name + "_train" + ext + ".bin"
    val_infile_name = infile_path + infile_name + "_val" + ext + ".bin"
    test_infile_name = infile_path + infile_name + "_test" + ext + ".bin"
    checkpointfile_name = outfile_path + runnumber + "/" + infile_name + "_" + runnumber + "_model.pt"

    #calculate number of features in graphs
    sample_graph = dgl.load_graphs(train_infile_name, [0])[0][0]
    incl_errors = incl_corr = incl_hits = incl_vweight = False
    nnfeatures_base = sample_graph.ndata['features_base'].size()[1]
    in_features = nnfeatures_base
    if 'features_vweight' in sample_graph.ndata.keys():
        nnfeatures_vweight = sample_graph.ndata['features_vweight'].size()[1]
        incl_vweight = True
        in_features += nnfeatures_vweight
    if 'features_errors' in sample_graph.ndata.keys():
        nnfeatures_errors = sample_graph.ndata['features_errors'].size()[1]
        incl_errors = True
        in_features += nnfeatures_errors
    if 'features_hits' in sample_graph.ndata.keys():
        nnfeatures_hits = sample_graph.ndata['features_hits'].size()[1]
        incl_hits = True
        in_features += nnfeatures_hits
    if 'features_corr' in sample_graph.ndata.keys():
        nnfeatures_corr = sample_graph.ndata['features_corr'].size()[1]
        incl_corr = True
        in_features += nnfeatures_corr

    #read in values from parameter file
    if os.path.isfile(paramfile_name):
        paramfile = open(paramfile_name, "r")
        train_len = int(float(paramfile.readline()))
        val_len = int(float(paramfile.readline()))
        test_len = int(float(paramfile.readline()))
        truth_frac = float(paramfile.readline())
        b_frac = float(paramfile.readline())
        c_frac = float(paramfile.readline())
    else:
        print("ERROR: Specified parameter file not found")
        return 1

    p_time = time.time() - start_time
    print("Finished importing input data. Time elapsed: {}s.\n".format(p_time))

    #reweight positive labels automatically if desired
    if reweight:
        pos_weight = th.tensor([0.5 * (1 - truth_frac) / truth_frac])
        mult_weights = th.tensor(
            [1. / (1 - b_frac - c_frac), 1. / b_frac, 1. / c_frac])
        print("Setting positive weight to {}".format(pos_weight))
    else:
        pos_weight = th.tensor([1])
        mult_weights = th.tensor([1., 1., 1.])

    #calculate number of testing, training and validation batches
    test_batches = int(math.ceil(test_len / batch_size))
    val_batches = int(math.ceil(val_len / batch_size))
    train_batches = int(math.ceil(train_len / batch_size))

    device = th.device('cuda' if th.cuda.is_available() else
                       'cpu')  #automatically run on GPU if available

    #set up loss
    if not multi_class:
        loss = nn.BCEWithLogitsLoss(pos_weight=pos_weight,
                                    reduction='sum').to(device)
        outfeats = 1
        cm = np.zeros((2, 2), dtype=int)
        activation = nn.Sigmoid()
        labeltype = 'bin_labels'
    else:
        loss = nn.CrossEntropyLoss(weight=mult_weights).double().to(device)
        outfeats = 3
        cm = np.zeros((3, 3), dtype=int)
        activation = nn.Softmax(dim=1)
        labeltype = 'mult_labels'

    model = EdgePredModel(nodemlp_sizes, gat_sizes, edgemlp_sizes, in_features,
                          outfeats, attention_heads).double().to(device)
    opt = th.optim.Adam(model.parameters(), lr=learning_rate)
    if use_lr_scheduler:
        scheduler = th.optim.lr_scheduler.OneCycleLR(
            opt, 0.1, epochs=nepochs, steps_per_epoch=train_batches
        )  #th.optim.lr_scheduler.ReduceLROnPlateau(opt,patience=5)

    train_loss_array = np.zeros(nepochs)
    val_loss_array = np.zeros(nepochs)

    #print model parameters
    print("Model built. Parameters:")
    for name, param in model.named_parameters():
        print(name, param.size(), param.requires_grad)
    print("")

    #load existing checkpoint
    if load_checkpoint and os.path.exists(checkpointfile_name):
        checkpoint = th.load(checkpointfile_name)
        model.load_state_dict(checkpoint['model_state_dict'])
        opt.load_state_dict(checkpoint['optimizer_state_dict'])
        start_epoch = checkpoint['epoch'] + 1
        print("Loading previous model. Starting from epoch {}.".format(
            start_epoch))
    else:
        start_epoch = 1

    #----------------------------------------------------TRAINING---------------------------------------------------

    #main training loop
    t_time = time.time() - start_time
    print("Beginning training. Running on {}. Time elapsed: {}s.\n".format(
        device, t_time))
    for epoch in range(start_epoch, nepochs + 1):
        print("Epoch: {}".format(epoch))

        #training
        total_labels = 0
        model.train()
        for ibatch in range(train_batches):

            #load batch from file
            istart = ibatch * batch_size
            if ibatch == (train_batches - 1) and train_len % batch_size != 0:
                iend = istart + (train_len % batch_size)
            else:
                iend = (ibatch + 1) * batch_size
            batch = dgl.batch(
                dgl.load_graphs(train_infile_name, list(range(istart,
                                                              iend)))[0])

            #construct feature matrix
            features = batch.ndata['features_base']
            if incl_vweight:
                features = th.cat((features, batch.ndata['features_vweight']),
                                  dim=1)
            if incl_errors:
                features = th.cat((features, batch.ndata['features_errors']),
                                  dim=1)
            if incl_hits:
                features = th.cat((features, batch.ndata['features_hits']),
                                  dim=1)
            if incl_corr:
                features = th.cat((features, batch.ndata['features_corr']),
                                  dim=1)

            #process batch
            batch = batch.to(device)  #transfer batch to relevant device
            features = features.to(device)
            pred = model(batch, features)
            target = batch.edata[labeltype]
            if multi_class: target = target[:, 0].long()
            pred_lt = loss(pred, target)

            opt.zero_grad()
            pred_lt.backward()
            opt.step()

            #evaluate loss
            batch_labels = batch.edata['bin_labels'].size()[0]
            total_labels += batch_labels
            print("Training loss: {}".format(pred_lt.item() / batch_labels))
            train_loss_array[epoch - 1] += pred_lt.item()

            if use_lr_scheduler: scheduler.step()

        #normalize loss
        train_loss_array[epoch -
                         1] = train_loss_array[epoch - 1] / total_labels

        #save checkpoint
        th.save(
            {
                'epoch': epoch,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': opt.state_dict()
            }, checkpointfile_name)

        #validation
        total_labels = 0
        model.eval()
        for ibatch in range(val_batches):

            #load batch from file
            istart = ibatch * batch_size
            if ibatch == (val_batches - 1) and val_len % batch_size != 0:
                iend = istart + (val_len % batch_size)
            else:
                iend = (ibatch + 1) * batch_size
            val_batch = dgl.batch(
                dgl.load_graphs(val_infile_name, list(range(istart, iend)))[0])

            #construct feature matrix
            val_features = val_batch.ndata['features_base']
            if incl_vweight:
                val_features = th.cat(
                    (val_features, val_batch.ndata['features_vweight']), dim=1)
            if incl_errors:
                val_features = th.cat(
                    (val_features, val_batch.ndata['features_errors']), dim=1)
            if incl_hits:
                val_features = th.cat(
                    (val_features, val_batch.ndata['features_hits']), dim=1)
            if incl_corr:
                val_features = th.cat(
                    (val_features, val_batch.ndata['features_corr']), dim=1)

            #process batch
            val_batch = val_batch.to(device)
            val_features = val_features.to(device)
            pred = model(val_batch, val_features)
            target = val_batch.edata[labeltype]
            if multi_class: target = target[:, 0].long()
            pred_lv = loss(pred, target)

            #evaluate loss
            batch_labels = val_batch.edata['bin_labels'].size()[0]
            total_labels += batch_labels
            print("Validation loss: {}".format(pred_lv.item() / batch_labels))
            val_loss_array[epoch - 1] += pred_lv.item()

        #normalize loss
        val_loss_array[epoch - 1] = val_loss_array[epoch - 1] / total_labels

        #print validation results
        e_time = time.time() - start_time
        print('Time elapsed: {}s.\n'.format(e_time))

    print("Training finished. Evaluating model.\n")

    #---------------------------------------------------EVALUATION--------------------------------------------------

    overall_g_list = []

    #testing
    model.eval()
    for ibatch in range(test_batches):

        #load batch from file
        istart = ibatch * batch_size
        if ibatch == (test_batches - 1) and test_len % batch_size != 0:
            iend = istart + (test_len % batch_size)
        else:
            iend = (ibatch + 1) * batch_size
        test_batch = dgl.batch(
            dgl.load_graphs(test_infile_name, list(range(istart, iend)))[0])

        #construct feature matrix
        test_features = test_batch.ndata['features_base']
        if incl_vweight:
            test_features = th.cat(
                (test_features, test_batch.ndata['features_vweight']), dim=1)
        if incl_errors:
            test_features = th.cat(
                (test_features, test_batch.ndata['features_errors']), dim=1)
        if incl_hits:
            test_features = th.cat(
                (test_features, test_batch.ndata['features_hits']), dim=1)
        if incl_corr:
            test_features = th.cat(
                (test_features, test_batch.ndata['features_corr']), dim=1)

        #process batch
        test_batch = test_batch.to(device)
        test_features = test_features.to(device)
        edge_labels = test_batch.edata[labeltype]

        #evaluate results
        pred = activation(model(test_batch,
                                test_features).float()).cpu().detach().numpy()
        true = test_batch.edata[labeltype].cpu().numpy().astype(int)

        test_batch.edata['pred'] = activation(test_batch.edata['pred'])

        g_test_list = dgl.unbatch(test_batch)
        overall_g_list.extend(g_test_list)

        if not multi_class:
            cm += evaluate_confusion_bin(true, pred.round().astype(int))
        else:
            cm += evaluate_confusion_mult(true, pred.round().astype(int))

    #print test results
    print_output(multi_class, cm)

    #save results to file
    outfile_name = outfile_path + runnumber + "/" + infile_name + "_" + runnumber
    dgl.save_graphs(outfile_name + "_results.bin", overall_g_list)

    #plot loss
    plt.ioff()
    plt.plot(range(nepochs), train_loss_array, label="Training")
    plt.plot(range(nepochs), val_loss_array, label="Validation")
    plt.legend()
    plt.xlabel("Epoch")
    plt.savefig(outfile_name + "_lossplot.png")
Example #11
    compact_g1.ndata[dgl.NTYPE] = compact_g.ndata[dgl.NTYPE][reshuffle_nodes]
    compact_g1.ndata[dgl.NID] = compact_g.ndata[dgl.NID][reshuffle_nodes]
    compact_g1.ndata['inner_node'] = compact_g.ndata['inner_node'][
        reshuffle_nodes]
    compact_g1.edata['orig_id'] = compact_g.edata['orig_id'][compact_g1.edata[
        dgl.EID]]
    compact_g1.edata[dgl.ETYPE] = compact_g.edata[dgl.ETYPE][compact_g1.edata[
        dgl.EID]]
    compact_g1.edata['inner_edge'] = compact_g.edata['inner_edge'][
        compact_g1.edata[dgl.EID]]
    compact_g1.edata[dgl.EID] = compact_g.edata[dgl.EID][compact_g1.edata[
        dgl.EID]]

    part_dir = output_dir + '/part' + str(part_id)
    os.makedirs(part_dir, exist_ok=True)
    dgl.save_graphs(part_dir + '/graph.dgl', [compact_g1])

part_metadata = {
    'graph_name': graph_name,
    'num_nodes': num_nodes,
    'num_edges': num_edges,
    'part_method': 'metis',
    'num_parts': num_parts,
    'halo_hops': 1,
    'node_map': node_map_val,
    'edge_map': edge_map_val,
    'ntypes': ntypes_map,
    'etypes': etypes_map
}

for part_id in range(num_parts):
Example #12
    # To eliminate 0-in-degree nodes
    # bg = dgl.add_reverse_edges(g, copy_ndata=True, copy_edata=True)
    # return bg
    return g


if __name__ == "__main__":
    start_time = time()
    list_user_domian = ['U66@DOM1']
    list_authentication_type = [
        '?', 'NTLM', 'Kerberos', 'Negotiate',
        'MICROSOFT_AUTHENTICATION_PACKAGE_V1_0', 'N'
    ]
    list_logon_type = [
        '?', 'Network', 'Batch', 'NetworkCleartext', 'Unlock',
        'RemoteInteractive', 'Interactive', 'Service', 'CachedInteractive',
        'NewCredentials'
    ]
    list_authentication_orientation = [
        'TGS', 'TGT', 'LogOn', 'LogOff', 'AuthMap', 'ScreenLock',
        'ScreenUnlock'
    ]
    list_failure_success = ['Fail', 'Success']

    graph = graph_construction()
    dgl.save_graphs('/data/LANL/data_including_all_malhosts/train/auth.bin',
                    [graph])
    print("[+] graph of auth has been saved!")
    end_time = time()
    print("Time used: " + str(end_time - start_time))
Example #13
    def _pre_process(self,
                     smiles_to_graph,
                     node_featurizer,
                     edge_featurizer,
                     load,
                     log_every,
                     init_mask,
                     n_jobs=1):
        """Pre-process the dataset

        * Convert molecules from smiles format into DGLGraphs
          and featurize their atoms
        * Set missing labels to be 0 and use a binary masking
          matrix to mask them

        Parameters
        ----------
        smiles_to_graph : callable, SMILES -> DGLGraph
            Function for converting a SMILES (str) into a DGLGraph.
        node_featurizer : callable, rdkit.Chem.rdchem.Mol -> dict
            Featurization for nodes like atoms in a molecule, which can be used to update
            ndata for a DGLGraph.
        edge_featurizer : callable, rdkit.Chem.rdchem.Mol -> dict
            Featurization for edges like bonds in a molecule, which can be used to update
            edata for a DGLGraph.
        load : bool
            Whether to load the previously pre-processed dataset or pre-process from scratch.
            ``load`` should be False when we want to try different graph construction and
            featurization methods and need to preprocess from scratch. Default to True.
        log_every : int
            Print a message every time ``log_every`` molecules are processed. It only comes
            into effect when :attr:`n_jobs` is 1.
        init_mask : bool
            Whether to initialize a binary mask indicating the existence of labels.
        n_jobs : int
            Degree of parallelism for pre processing. Default to 1.
        """
        if os.path.exists(self.cache_file_path) and load:
            # DGLGraphs have been constructed before, reload them
            print('Loading previously saved dgl graphs...')
            self.graphs, label_dict = load_graphs(self.cache_file_path)
            self.labels = label_dict['labels']
            if init_mask:
                self.mask = label_dict['mask']
            self.valid_ids = label_dict['valid_ids'].tolist()
        else:
            print('Processing dgl graphs from scratch...')
            if n_jobs > 1:
                self.graphs = pmap(smiles_to_graph,
                                   self.smiles,
                                   node_featurizer=node_featurizer,
                                   edge_featurizer=edge_featurizer,
                                   n_jobs=n_jobs)
            else:
                self.graphs = []
                for i, s in enumerate(self.smiles):
                    if (i + 1) % log_every == 0:
                        print('Processing molecule {:d}/{:d}'.format(
                            i + 1, len(self)))
                    self.graphs.append(
                        smiles_to_graph(s,
                                        node_featurizer=node_featurizer,
                                        edge_featurizer=edge_featurizer))

            # Keep only valid molecules
            self.valid_ids = []
            graphs = []
            for i, g in enumerate(self.graphs):
                if g is not None:
                    self.valid_ids.append(i)
                    graphs.append(g)
            self.graphs = graphs
            _label_values = self.df[self.task_names].values
            # np.nan_to_num will also turn inf into a very large number
            self.labels = F.zerocopy_from_numpy(
                np.nan_to_num(_label_values).astype(
                    np.float32))[self.valid_ids]
            valid_ids = torch.tensor(self.valid_ids)
            if init_mask:
                self.mask = F.zerocopy_from_numpy(
                    (~np.isnan(_label_values)).astype(
                        np.float32))[self.valid_ids]
                save_graphs(self.cache_file_path,
                            self.graphs,
                            labels={
                                'labels': self.labels,
                                'mask': self.mask,
                                'valid_ids': valid_ids
                            })
            else:
                self.mask = None
                save_graphs(self.cache_file_path,
                            self.graphs,
                            labels={
                                'labels': self.labels,
                                'valid_ids': valid_ids
                            })

        self.smiles = [self.smiles[i] for i in self.valid_ids]
Example #14
    def save(self):
        """save the graph list and the labels"""
        graph_path = os.path.join(
            self.data_save_path,
            'dgl_graph_{}_{}.bin'.format(self.hash, self.sub))
        save_graphs(str(graph_path), self.graphs, {'labels': self.labels})
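The matching load/has_cache side of this DGLDataset-style caching is not shown; a minimal sketch, assuming load_graphs is imported alongside save_graphs and the same path convention:

    def load(self):
        """reload the graph list and labels written by save()"""
        graph_path = os.path.join(
            self.data_save_path,
            'dgl_graph_{}_{}.bin'.format(self.hash, self.sub))
        self.graphs, label_dict = load_graphs(str(graph_path))
        self.labels = label_dict['labels']

    def has_cache(self):
        graph_path = os.path.join(
            self.data_save_path,
            'dgl_graph_{}_{}.bin'.format(self.hash, self.sub))
        return os.path.exists(graph_path)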
Example #15
def save(self, data_dir):
    # save graphs
    graph_path = os.path.join(data_dir, 'kgat_dgl_graph.bin')
    save_graphs(graph_path, self.train_graph)
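A corresponding loader mirrors this; a minimal sketch, assuming load_graphs is imported from the same place as save_graphs:

def load(self, data_dir):
    # reload the training graph; load_graphs returns (graph_list, label_dict)
    graph_path = os.path.join(data_dir, 'kgat_dgl_graph.bin')
    self.train_graph = load_graphs(graph_path)[0][0]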
Example #16
def create_dataset(args, processor, retrievers, relation_list, evaluate, input_dir):
    if args.local_rank not in [-1, 0]:
        # Make sure only the first process in distributed training processes the dataset; the others will use the cache
        torch.distributed.barrier()
    definition_info = DefinitionInfo()
    tokenizer, _ = configure_tokenizer_model(args, logger, retrievers, is_preprocess=True)

    logger.info("tokenizer: {}".format(tokenizer))
    if args.test:
        temp_mark = "test"
    elif evaluate:
        temp_mark = "dev"
    else:
        temp_mark = "train"
    cached_features_file = os.path.join(
        input_dir,
        "cached_{}_{}_{}".format(
            temp_mark,
            args.model_type,
            str(args.cache_file_suffix),
        ),
    )

    if os.path.exists(cached_features_file):
        logger.warning("cache file exist and exit program")
        exit()

    logger.info("Creating features from dataset file at %s", input_dir)

    if not os.path.exists(cached_features_file + "_example"):
        if args.test:
            examples = processor.get_test_examples(args.data_dir, filename=args.predict_file)
        else:
            examples = processor.get_dev_examples(args.data_dir, filename=args.predict_file)
        torch.save(examples, cached_features_file + "_example")
    else:
        logger.info("Loading examples from cached files.")
        examples = torch.load(cached_features_file + "_example")

    examples_tokenized = processor.tokenization_on_examples(examples, tokenizer, is_testing=args.test)

    features = processor.convert_examples_to_features(args, examples_tokenized, tokenizer, retrievers, not evaluate, debug=args.debug)

    features, dataset, all_kgs_graphs = processor.pad_and_index_features_all(
        features, retrievers, args, tokenizer, relation_list, encoder=None, definition_info=definition_info, is_training=not evaluate, debug=args.debug)

    if args.local_rank in [-1, 0]:
        if args.model_type == "kelm":
            all_kgs_graphs_label_dict = {"glabel": torch.tensor([i for i in range(len(all_kgs_graphs))])}
            save_graphs(cached_features_file+"_all_kgs_graphs.bin", all_kgs_graphs, all_kgs_graphs_label_dict)
        logger.info("complete data preprocessing")

        logger.info("Saving features into cached file %s", cached_features_file)

        for f in features:
            del f.kgs_conceptids2synset
        torch.save({"features": features, "dataset": dataset, "examples": examples_tokenized}, cached_features_file)

        logger.info("Saving knowledge graph retrievers")
        for kg, retriever in retrievers.items():
            if not os.path.exists(os.path.join(input_dir, args.kg_paths[kg])):
                os.mkdir(os.path.join(input_dir, args.kg_paths[kg]))
            torch.save(retriever, os.path.join(input_dir, args.kg_paths[kg], kg + args.cache_file_suffix))

        logger.info("data create is done")

    if args.local_rank == 0:
        # Make sure only the first process in distributed training processes the dataset; the others will use the cache
        torch.distributed.barrier()
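The cached knowledge-graph file written above can be read back with the matching loader; a minimal sketch, assuming load_graphs is the counterpart of the save_graphs call used here and cached_features_file is built the same way:

all_kgs_graphs, all_kgs_graphs_label_dict = load_graphs(cached_features_file + "_all_kgs_graphs.bin")
glabels = all_kgs_graphs_label_dict["glabel"]  # per-graph indices saved above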
Example #17
newg.edges()

######################################################################
# Loading and Saving Graphs
# -------------------------
#
# You can save a graph or a list of graphs via ``dgl.save_graphs`` and
# load them back with ``dgl.load_graphs``.
#

# Save graphs
print(
    "-----------------------------------------------------------------------------------"
)
print("Step 5: Loading and Saving Graphs: ")
dgl.save_graphs('graph.dgl', g)
dgl.save_graphs('graphs.dgl', [g, sg1, sg2])

# Load graphs
(g, ), _ = dgl.load_graphs('graph.dgl')
print("graph g: ")
print(g)  # each graph contains its nodes and edges along with their features
(g, sg1, sg2), _ = dgl.load_graphs('graphs.dgl')
print("graph g: ")
print(g)
print("graph sg1: ")
print(sg1)
print("graph sg2: ")
print(sg2)
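Both functions also handle graph-level labels: dgl.save_graphs accepts an optional dict of label tensors and dgl.load_graphs returns it as the second element; a short sketch reusing the graphs above:

import torch as th

label_dict = {"glabel": th.tensor([0, 1, 2])}  # one entry per graph in the list
dgl.save_graphs('graphs_labeled.dgl', [g, sg1, sg2], label_dict)
graph_list, loaded_labels = dgl.load_graphs('graphs_labeled.dgl')
print(loaded_labels['glabel'])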
Example #18
def main(argv):
    gROOT.SetBatch(True)

    #parse command line arguments
    parser = argparse.ArgumentParser(add_help=False)
    parser.add_argument("-n",
                        "--ntuple",
                        type=str,
                        required=True,
                        dest="ntuple",
                        help="name of HDF5 file to be processed")
    parser.add_argument("-d",
                        "--dataset",
                        type=str,
                        required=True,
                        dest="dataset",
                        help="name of dataset to be created")
    parser.add_argument("-i",
                        "--input_dir",
                        type=str,
                        required=True,
                        dest="infile_dir",
                        help="name of input directory")
    parser.add_argument("-o",
                        "--output_dir",
                        type=str,
                        required=True,
                        dest="outfile_dir",
                        help="name of output directory")
    parser.add_argument("-e",
                        "--max_graphs",
                        type=int,
                        default=0,
                        dest="max_graphs",
                        help="maximum number of graphs to create")
    args = parser.parse_args()

    max_graphs = args.max_graphs

    #input data parameters
    connect_btoc = options.connect_btoc
    incl_errors = options.incl_errors
    incl_corr = options.incl_corr
    incl_hits = options.incl_hits
    incl_vweight = options.incl_vweight
    jet_pt_cut = options.jet_pt_cut
    jet_eta_cut = options.jet_eta_cut
    track_pt_cut = options.track_pt_cut
    track_eta_cut = options.track_eta_cut
    track_z0_cut = options.track_z0_cut
    vweight_pileup_cut = options.vweight_pileup_cut
    vweight_pv_cut = options.vweight_pv_cut

    nnfeatures_base = 8
    nnfeatures_errors = 5
    nnfeatures_corrs = 10
    nnfeatures_hits = 10

    #file names
    infiles = glob.glob(args.infile_dir + args.ntuple + "_*.hdf5")
    infiles.sort()
    outfiles = []
    for infile in infiles:
        infile_name = os.path.splitext(os.path.basename(infile))[0]
        outfiles.append(args.outfile_dir + "/" + infile_name + ".bin")

    start_time = time.time()

    print(
        "--------------------------------------------------------------------")

    #check if track to vertex association information is contained in HDF5 file
    infile = h5py.File(infiles[0], "r")
    if 'tfeatures_w' in infile.keys():
        ttv_avail = True
    else:
        ttv_avail = False
        print("WARNING: Track to vertex association information not found")

    total_jets = 0
    for infile_name in infiles:
        infile = h5py.File(infile_name, "r")
        total_jets += len(infile['jinfo']['event_no'])
        infile.close()
    if max_graphs == 0 or max_graphs > total_jets: max_graphs = total_jets
    print("Total number of jets in dataset: {}".format(total_jets))
    print("Maximum number of jets desired: {}".format(max_graphs))

    passed_graphs = cut_graphs = jet_req_cuts = track_req_cuts = tracks_kept = tracks_cut = 0

    #loop through input files
    for ifile, infile_name in enumerate(infiles):

        if passed_graphs >= max_graphs:
            break  #stop reading in new files if maximum desired jet number is reached

        #check if outfile already exists and skip if it's newer than infile, unless it's the last previously processed file (in case more entries are being added)
        if ifile != len(outfiles) - 1 and os.path.exists(
                outfiles[ifile + 1]) and os.path.exists(
                    outfiles[ifile]) and os.path.getmtime(
                        outfiles[ifile]) > os.path.getmtime(infile_name):
            print("Current version of " + os.path.basename(outfiles[ifile]) +
                  " already exists. Skipping file.")
            continue

        infile = h5py.File(infile_name, "r")
        file_jets = len(infile['jinfo']['event_no'])
        g_list = []

        track_offset = 0  #tracks are stored in a contiguous chunk -> need to offset indices for each jet
        event_index = previous_event = -1
        for ientry in range(file_jets):

            #read in event/jet information
            current_event = infile['jinfo']['event_no'][ientry]
            if current_event != previous_event:
                event_index += 1
                previous_event = current_event
                if passed_graphs >= max_graphs:
                    break  #stop processing events once specified maximum jet number has been read in

            current_jet = infile['jinfo']['jet_no'][ientry]
            ntracks = infile['jinfo']['ntracks'][ientry]
            pv_x = infile['efeatures']['pv_x'][event_index]
            pv_y = infile['efeatures']['pv_y'][event_index]
            pv_z = infile['efeatures']['pv_z'][event_index]
            jet_pt = infile['jfeatures']['pt'][ientry]
            jet_eta = infile['jfeatures']['eta'][ientry]
            jet_phi = infile['jfeatures']['phi'][ientry]
            nedges = ntracks * (ntracks - 1)

            #apply jet cuts
            if jet_pt > jet_pt_cut and abs(jet_eta) < jet_eta_cut:

                #make jet flavor label definitions consistent
                jet_flavor = infile['jinfo']['jet_flavor'][ientry]
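                #map MC flavor codes onto the compact labels used downstream: b (5) -> 1, c (4) -> 2, tau (15) and everything else -> 0 (light)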
                if jet_flavor == 5:  #b-jet
                    jet_flavor = 1
                elif jet_flavor == 4:  #c-jet
                    jet_flavor = 2
                elif jet_flavor == 15:  #tau-jets
                    jet_flavor = 0
                else:
                    jet_flavor = 0

                node_features_base = np.zeros((ntracks, nnfeatures_base))
                if incl_corr:
                    node_features_corrs = np.zeros((ntracks, nnfeatures_corrs))
                if incl_errors:
                    node_features_errors = np.zeros(
                        (ntracks, nnfeatures_errors))
                if incl_hits:
                    node_features_hits = np.zeros((ntracks, nnfeatures_hits))
                if incl_vweight: node_features_vweight = np.zeros((ntracks, 1))

                jet_info = np.zeros(
                    (ntracks, 4)
                )  #store jet info - jet truth label (0 = l, 1 = b, 2 = c), jet pv coordinates
                track_info = np.zeros(
                    (ntracks, 4)
                )  #store track general info - track label (see process_ntuples), track sv coordinates
                track_ancestors = np.zeros(
                    (ntracks, 4))  #store track ancestor info
                #edge_features = np.zeros((nedges,nefeatures))

                #initialize track feature arrays
                hf_ancestors = np.zeros((ntracks, 1))
                prev_b_ancestors = np.zeros((ntracks, 1))
                track_flavors = np.zeros((ntracks, 1))
                reco_use = np.zeros((ntracks, 2))  #use of track in SV0, SV1
                passed_cuts = np.zeros((ntracks, 1))
                bin_labels = np.zeros((nedges, 1))
                mult_labels = np.zeros((nedges, 1))

                #read in features for each track
                for j in range(ntracks):
                    track_pt = infile['tfeatures_b']['pt'][track_offset + j]
                    track_eta = infile['tfeatures_b']['eta'][track_offset + j]
                    track_theta = infile['tfeatures_b']['theta'][track_offset +
                                                                 j]
                    track_phi = infile['tfeatures_b']['phi'][track_offset + j]
                    track_d0 = infile['tfeatures_b']['d0'][track_offset + j]
                    track_z0 = infile['tfeatures_b']['z0'][track_offset + j]
                    track_q = infile['tfeatures_b']['q'][track_offset + j]
                    if ttv_avail:
                        track_vweight = infile['tfeatures_w']['vweight'][
                            track_offset + j]
                        track_vtype = infile['tinfo']['vertex_type'][
                            track_offset + j]
                    if incl_errors:
                        track_cov_d0d0 = math.sqrt(
                            infile['tfeatures_e']['cov_d0d0'][track_offset +
                                                              j])
                        track_cov_z0z0 = math.sqrt(
                            infile['tfeatures_e']['cov_z0z0'][track_offset +
                                                              j])
                        track_cov_phiphi = math.sqrt(
                            infile['tfeatures_e']['cov_phiphi'][track_offset +
                                                                j])
                        track_cov_thetatheta = math.sqrt(
                            infile['tfeatures_e']['cov_thetatheta'][
                                track_offset + j])
                        track_cov_qoverpqoverp = math.sqrt(
                            abs(infile['tfeatures_e']['cov_qoverpqoverp'][
                                track_offset + j]))
                    if incl_corr:
                        track_cov_d0z0 = infile['tfeatures_c']['cov_d0z0'][
                            track_offset + j]
                        track_cov_d0phi = infile['tfeatures_c']['cov_d0phi'][
                            track_offset + j]
                        track_cov_d0theta = infile['tfeatures_c'][
                            'cov_d0theta'][track_offset + j]
                        track_cov_d0qoverp = infile['tfeatures_c'][
                            'cov_d0qoverp'][track_offset + j]
                        track_cov_z0phi = infile['tfeatures_c']['cov_z0phi'][
                            track_offset + j]
                        track_cov_z0theta = infile['tfeatures_c'][
                            'cov_z0theta'][track_offset + j]
                        track_cov_z0qoverp = infile['tfeatures_c'][
                            'cov_z0qoverp'][track_offset + j]
                        track_cov_phitheta = infile['tfeatures_c'][
                            'cov_phitheta'][track_offset + j]
                        track_cov_phiqoverp = infile['tfeatures_c'][
                            'cov_phiqoverp'][track_offset + j]
                        track_cov_thetaqoverp = infile['tfeatures_c'][
                            'cov_thetaqoverp'][track_offset + j]
                    if incl_hits:
                        track_nPixHits = infile['tfeatures_h']['nPixHits'][
                            track_offset + j]
                        track_nSCTHits = infile['tfeatures_h']['nSCTHits'][
                            track_offset + j]
                        track_nBLHits = infile['tfeatures_h']['nBLHits'][
                            track_offset + j]
                        track_nPixHoles = infile['tfeatures_h']['nPixHoles'][
                            track_offset + j]
                        track_nSCTHoles = infile['tfeatures_h']['nSCTHoles'][
                            track_offset + j]
                        track_nPixShared = infile['tfeatures_h']['nPixShared'][
                            track_offset + j]
                        track_nSCTShared = infile['tfeatures_h']['nSCTShared'][
                            track_offset + j]
                        track_nBLShared = infile['tfeatures_h']['nBLShared'][
                            track_offset + j]
                        track_nPixSplit = infile['tfeatures_h']['nPixSplit'][
                            track_offset + j]
                        track_nBLSplit = infile['tfeatures_h']['nBLSplit'][
                            track_offset + j]

                    track_algo = infile['tinfo']['algo'][track_offset + j]
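                    #bits 2 and 3 of the algo bitmask record whether the track was used by SV0 and SV1 respectively; dividing by 4 and 8 converts the masked bits to 0/1 flags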
                    reco_use[j] = [(track_algo & 1 << 2) / 4,
                                   (track_algo & 1 << 3) / 8]

                    hf_ancestors[j] = infile['tinfo']['hf_ancestor'][
                        track_offset + j]
                    hf_pdgid = infile['tinfo']['hf_pdgid'][track_offset + j]
                    prev_b_ancestors[j] = infile['tinfo']['prev_b_ancestor'][
                        track_offset + j]
                    prev_b_pdgid = infile['tinfo']['prev_b_pdgid'][track_offset
                                                                   + j]
                    track_flavors[j] = infile['tinfo']['track_flavor'][
                        track_offset + j]
                    sv_x = infile['tinfo']['sv_x'][track_offset + j]
                    sv_y = infile['tinfo']['sv_y'][track_offset + j]
                    sv_z = infile['tinfo']['sv_z'][track_offset + j]

                    #make cuts on track level
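                    #a track passes the vertex requirement only if it is loosely associated (weight below the cut) to either the primary vertex (vertex_type 1, assumed) or a pileup vertex (vertex_type 2, assumed)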
                    if ttv_avail:
                        vertex_condition = (
                            track_vweight < vweight_pv_cut and track_vtype
                            == 1) or (track_vweight < vweight_pileup_cut
                                      and track_vtype == 2)
                    else:
                        vertex_condition = True
                    if track_pt > track_pt_cut and abs(
                            track_eta) < track_eta_cut and abs(
                                track_z0) < track_z0_cut and vertex_condition:
                        passed_cuts[j] = 1
                    else:
                        passed_cuts[j] = 0

                    #store information in feature arrays
                    node_features_base[j] = [
                        track_q / track_pt, track_theta, track_phi, track_d0,
                        track_z0, jet_pt, jet_eta, jet_phi
                    ]
                    if incl_vweight:
                        node_features_vweight[j] = [track_vweight]
                    if incl_errors:
                        node_features_errors[j] = [
                            track_cov_qoverpqoverp, track_cov_thetatheta,
                            track_cov_phiphi, track_cov_d0d0, track_cov_z0z0
                        ]
                    if incl_corr:
                        node_features_corrs[j] = [
                            track_cov_thetaqoverp, track_cov_phiqoverp,
                            track_cov_d0qoverp, track_cov_z0qoverp,
                            track_cov_phitheta, track_cov_d0theta,
                            track_cov_z0theta, track_cov_d0phi,
                            track_cov_z0phi, track_cov_d0z0
                        ]
                    if incl_hits:
                        node_features_hits[j] = [
                            track_nPixHits, track_nSCTHits, track_nBLHits,
                            track_nPixHoles, track_nSCTHoles, track_nPixShared,
                            track_nSCTShared, track_nBLShared, track_nPixSplit,
                            track_nBLSplit
                        ]
                    track_ancestors[j] = [
                        hf_ancestors[j], hf_pdgid, prev_b_ancestors[j],
                        prev_b_pdgid
                    ]
                    track_info[j] = [track_flavors[j], sv_x, sv_y, sv_z]
                    jet_info[j] = [jet_flavor, pv_x, pv_y, pv_z]

                #calculate edge features and truth labels
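                #each unordered track pair (j, k) corresponds to two directed edges, so labels are written in blocks of two and the counter advances by 2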
                counter = 0
                for j in range(ntracks):
                    for k in range(j + 1, ntracks):

                        #set edge features
                        #delta_pt = abs(node_features_base[j][0] - node_features_base[k][0])
                        #edge_features[counter:counter+2] = [delta_pt]

                        #truth labels - vertices have to share the same HF ancestor
                        if hf_ancestors[k] == hf_ancestors[
                                j] and hf_ancestors[k] > 0 and track_flavors[
                                    j] == 1 and track_flavors[
                                        k] == 1:  #matching direct ancestors for non secondaries (B to B)
                            bin_labels[counter:counter + 2] = 1
                            mult_labels[counter:counter + 2] = 1
                        elif hf_ancestors[k] == hf_ancestors[
                                j] and hf_ancestors[k] > 0 and track_flavors[
                                    j] == 2 and track_flavors[
                                        k] == 2:  #matching direct ancestors for non secondaries (prompt C to prompt C)
                            bin_labels[counter:counter + 2] = 1
                            mult_labels[counter:counter + 2] = 2
                        elif hf_ancestors[k] == hf_ancestors[
                                j] and hf_ancestors[k] > 0 and track_flavors[
                                    j] == 3 and track_flavors[
                                        k] == 3:  #matching direct ancestors for non secondaries (B->C to B->C for same C)
                            bin_labels[counter:counter + 2] = 1
                            mult_labels[counter:counter + 2] = 1
                        elif prev_b_ancestors[k] == prev_b_ancestors[
                                j] and prev_b_ancestors[
                                    k] > 0:  #matching second ancestors (B->C to B->C for different C)
                            bin_labels[counter:counter + 2] = connect_btoc
                            mult_labels[counter:counter + 2] = connect_btoc
                        elif (
                                prev_b_ancestors[k] == hf_ancestors[j]
                                and hf_ancestors[j] > 0
                        ) or (
                                prev_b_ancestors[j] == hf_ancestors[k]
                                and hf_ancestors[k] > 0
                        ):  #matching second ancestor and direct ancestor (B to B->C)
                            bin_labels[counter:counter + 2] = connect_btoc
                            mult_labels[counter:counter + 2] = connect_btoc
                        counter += 2

                #create graph objects and append them to the list
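                #only jets with at least two tracks passing the cuts are turned into graphs, since fewer tracks cannot form an edge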
                if np.sum(passed_cuts) > 1:
                    g = dgl.graph((create_edge_list(ntracks)))
                    g.ndata['features_base'] = th.from_numpy(
                        node_features_base)
                    if incl_vweight:
                        g.ndata['features_vweight'] = th.from_numpy(
                            node_features_vweight)
                    if incl_errors:
                        g.ndata['features_errors'] = th.from_numpy(
                            node_features_errors)
                    if incl_hits:
                        g.ndata['features_hits'] = th.from_numpy(
                            node_features_hits)
                    if incl_corr:
                        g.ndata['features_corr'] = th.from_numpy(
                            node_features_corrs)
                    g.ndata['jet_info'] = th.from_numpy(jet_info)
                    g.ndata['track_info'] = th.from_numpy(track_info)
                    g.ndata['track_ancestors'] = th.from_numpy(track_ancestors)
                    g.ndata['reco_use'] = th.from_numpy(reco_use)
                    g.ndata['passed_cuts'] = th.from_numpy(passed_cuts)
                    g.edata['bin_labels'] = th.from_numpy(bin_labels)
                    g.edata['mult_labels'] = th.from_numpy(mult_labels)
                    g_list.append(g)
                    tracks_kept += np.sum(passed_cuts == 1)
                    tracks_cut += np.sum(passed_cuts == 0)
                    passed_graphs += 1
                else:
                    track_req_cuts += 1
                    cut_graphs += 1

            else:
                jet_req_cuts += 1
                cut_graphs += 1

            track_offset += ntracks

            #output progress
            sys.stdout.write(
                "\rJets processed: {} (Passed: {}, Cut: {}); Files processed: {}/{}"
                .format(cut_graphs + passed_graphs, passed_graphs, cut_graphs,
                        ifile, len(infiles)))
            sys.stdout.flush()

        #save graphs to file
        dgl.save_graphs(outfiles[ifile], g_list)

    if passed_graphs >= max_graphs:
        print("Found enough good jets to reach desired sample size. "
              "Finishing up...")
    else:
        print("Reached end of input files before the desired sample size. "
              "Finishing up...")
    print(
        "--------------------------------------------------------------------")

    p_time = time.time() - start_time
    print("\nGraphs cut due to jet requirements: {}".format(jet_req_cuts))
    print("Graphs cut due to track requirements: {}".format(track_req_cuts))
    print("Fraction of tracks cut from passed jets: {}".format(
        tracks_cut / (tracks_cut + tracks_kept)))
    print("Finished creating graphs. Time elapsed: {}s.".format(p_time))
Example No. 19
0
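# saves this instance's graph as a one-element list under <preprocessed_data_dir>/<city>/city_graph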
def _save_graph(self):
    data_dir = os.path.join(args.preprocessed_data_dir,
                            args.city_list[self.city_id])
    dgl.save_graphs(os.path.join(data_dir, "city_graph"), [self.graph])
Example No. 20
0
def save_building_block_data(building_block_smis, building_block_molgraphs):
    with open(f"{PROCESSED_DATA_DIR}/building_block_smis.pt", "wb") as f:
        torch.save(building_block_smis, f)

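    # dgl.save_graphs writes DGL's own binary format, so the .pt extension here is just a file-naming choice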
    dgl.save_graphs(f"{PROCESSED_DATA_DIR}/building_block_molgraphs.pt",
                    building_block_molgraphs)