Beispiel #1
0
dataset = MovieLens(path, model_name='all-MiniLM-L6-v2')
data = dataset[0].to(device)

# Add user node features for message passing:
data['user'].x = torch.eye(data['user'].num_nodes, device=device)
del data['user'].num_nodes

# Add a reverse ('movie', 'rev_rates', 'user') relation for message passing:
data = T.ToUndirected()(data)
del data['movie', 'rev_rates', 'user'].edge_label  # Remove "reverse" label.

# Perform a link-level split into training, validation, and test edges:
train_data, val_data, test_data = T.RandomLinkSplit(
    num_val=0.1,
    num_test=0.1,
    neg_sampling_ratio=0.0,
    edge_types=[('user', 'rates', 'movie')],
    rev_edge_types=[('movie', 'rev_rates', 'user')],
)(data)

# We have an unbalanced dataset with many labels for rating 3 and 4, and very
# few for 0 and 1. Therefore we use a weighted MSE loss.
if args.use_weighted_loss:
    weight = torch.bincount(train_data['user', 'movie'].edge_label)
    weight = weight.max() / weight
else:
    weight = None


def weighted_mse_loss(pred, target, weight=None):
    weight = 1. if weight is None else weight[target].to(pred.dtype)
parser.add_argument('--variational', action='store_true')
parser.add_argument('--linear', action='store_true')
parser.add_argument('--dataset',
                    type=str,
                    default='Cora',
                    choices=['Cora', 'CiteSeer', 'PubMed'])
parser.add_argument('--epochs', type=int, default=400)
args = parser.parse_args()

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
transform = T.Compose([
    T.NormalizeFeatures(),
    T.ToDevice(device),
    T.RandomLinkSplit(num_val=0.05,
                      num_test=0.1,
                      is_undirected=True,
                      split_labels=True,
                      add_negative_train_samples=False),
])
path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data', 'Planetoid')
dataset = Planetoid(path, args.dataset, transform=transform)
train_data, val_data, test_data = dataset[0]


class GCNEncoder(torch.nn.Module):
    def __init__(self, in_channels, out_channels):
        super().__init__()
        self.conv1 = GCNConv(in_channels, 2 * out_channels)
        self.conv2 = GCNConv(2 * out_channels, out_channels)

    def forward(self, x, edge_index):
Beispiel #3
0
def _setup(args, device):
    if not args['hvg_file_path']:
        adata_hvg, adata_khvg, X_hvg, X_khvg = _prepare_training_data(args)
    else:
        assert args['khvg_file_path'] is not None
        adata_hvg = _load_separate_hvg(hvg_path=args['hvg_file_path'])
        adata_khvg = _load_separate_hvg(hvg_path=args['khvg_file_path'])
        if args['transpose_input']:
            print(f'Transposing input HVG file to {adata_hvg.shape[::-1]}...')
            adata_hvg = adata_hvg.copy().transpose()
            print(f'Transposing input KHVG file to {adata_khvg.shape[::-1]}...')
            adata_khvg = adata_khvg.copy().transpose()

        X_hvg = adata_hvg.X
        X_khvg = adata_khvg.X

    if not args['graph_file_path']:
        try:
            edgelist = _prepare_graphs(adata_khvg, X_khvg, args)
        except ValueError as ve:
            print()
            print(colored('Exception: ' + str(ve), 'red'))
            print('Might need to transpose input with the --transpose_input argument.')
            sys.exit(1)
    else:
        edgelist = _load_separate_graph_edgelist(args['graph_file_path'])

    num_nodes = X_hvg.shape[0]
    print(f'Number of nodes in graph: {num_nodes}.')
    edge_index = np.array(edgelist).astype(int).T
    edge_index = to_undirected(torch.from_numpy(edge_index).to(torch.long), num_nodes)

    scaler = MinMaxScaler()
    scaled_x = torch.from_numpy(scaler.fit_transform(X_hvg))

    data_obj = Data(edge_index=edge_index, x=scaled_x)
    data_obj.num_nodes = X_hvg.shape[0]

    data_obj.train_mask = data_obj.val_mask = data_obj.test_mask = data_obj.y = None

    if (args['load_model_path'] is not None):
        print('Assuming loaded model is used for testing.')
        # PyTorch Geometric does not allow 0 training samples (all test), so we need to store all test data as 'training'.
        test_split = 0.0
        val_split = 0.0
    else:
        test_split = args['test_split']
        val_split = args['val_split']

    # Can set validation ratio
    try:
        add_negative_train_samples = args['load_model_path'] is not None

        transform = T.RandomLinkSplit(num_val=val_split, num_test=test_split, is_undirected=True, add_negative_train_samples=add_negative_train_samples, split_labels=True)
        train_data, val_data, test_data = transform(data_obj)
    except IndexError as ie:
        print()
        print(colored('Exception: ' + str(ie), 'red'))
        print('Might need to transpose input with the --transpose_input argument.')
        sys.exit(1)

    num_features = data_obj.num_features

    if args['graph_convolution'] in ['GAT', 'GATv2']:
        num_heads = {}
        if len(args['num_heads']) == 4:
            num_heads['first'] = args['num_heads'][0]
            num_heads['second'] = args['num_heads'][1]
            num_heads['mean'] = args['num_heads'][2]
            num_heads['std'] = args['num_heads'][3]
        elif len(args['num_heads']) == 5:
            num_heads['first'] = args['num_heads'][0]
            num_heads['second'] = args['num_heads'][1]
            num_heads['third'] = args['num_heads'][2]
            num_heads['mean'] = args['num_heads'][3]
            num_heads['std'] = args['num_heads'][4]

        encoder = CellVGAE_Encoder(
            in_channels=num_features, num_hidden_layers=args['num_hidden_layers'],
            num_heads=num_heads,
            hidden_dims=args['hidden_dims'],
            dropout=args['dropout'],
            latent_dim=args['latent_dim'],
            v2=args['graph_convolution'] == 'GATv2',
            concat={'first': True, 'second': True})
    else:
        encoder = CellVGAE_GCNEncoder(
            in_channels=num_features,
            num_hidden_layers=args['num_hidden_layers'],
            hidden_dims=args['hidden_dims'],
            latent_dim=args['latent_dim'])

    model = CellVGAE(encoder=encoder, decoder_nn_dim1=args['decoder_nn_dim1'], gcn_or_gat=args['graph_convolution'])
    optimizer = torch.optim.Adam(model.parameters(), lr=args['lr'])
    model = model.to(device)

    return model, optimizer, train_data, val_data, test_data