# Esempio n. 1 (Example no. 1, score 0)
    def inference(self, graph: dgl.DGLHeteroGraph, relation_target_node_features: dict, relation_embedding: dict = None,
                  device: str = 'cuda:0') -> tuple:
        """
        Mini-batch inference of the final representation over all node types.

        Outer loop: iterate over the model layers; inner loop: iterate over node
        batches — so the whole graph never has to be resident on the GPU at once.

        :param graph: the whole relational graph
        :param relation_target_node_features: target node features under each relation,
            dict, {(srctype, etype, dsttype): features}
        :param relation_embedding: embedding for each relation, dict, {etype: feature} or None;
            when None, falls back to this module's own self.relation_embedding (flattened)
        :param device: device str, e.g. 'cuda:0'
        :return: tuple (relation_fusion_embedding_dict, relation_target_node_features) —
            fused embeddings per destination node type, and the per-relation node
            features after the final layer (both moved to `device`)
        """
        with torch.no_grad():

            if relation_embedding is None:
                # Fall back to the module's own learned relation embeddings, flattened to 1-D.
                relation_embedding = {}
                for etype in self.relation_embedding:
                    relation_embedding[etype] = self.relation_embedding[etype].flatten()

            # interate over each layer
            for index, layer in enumerate(self.layers):
                # Tensor, features of all relation embeddings of the target nodes, store on cpu
                # (keeps GPU memory bounded; per-batch outputs are copied back with .cpu() below)
                y = {
                    (stype, etype, dtype): torch.zeros(graph.number_of_nodes(dtype), self.hidden_dim * self.n_heads) for
                    stype, etype, dtype in graph.canonical_etypes}

                # full sample for each type of nodes (single-hop full-neighbor sampling per layer)
                sampler = dgl.dataloading.MultiLayerFullNeighborSampler(1)
                # NOTE(review): shuffle=True does not change the result here (every node is
                # processed exactly once per layer), but it is unusual for inference.
                dataloader = dgl.dataloading.NodeDataLoader(
                    graph,
                    {ntype: torch.arange(graph.number_of_nodes(ntype)) for ntype in graph.ntypes},
                    sampler,
                    batch_size=1280,
                    shuffle=True,
                    drop_last=False,
                    num_workers=4)

                tqdm_dataloader = tqdm(dataloader, ncols=120)
                for batch, (input_nodes, output_nodes, blocks) in enumerate(tqdm_dataloader):
                    block = blocks[0].to(device)

                    # for relational graphs that only contain a single type of nodes, construct the input and output node dictionary
                    # (the dataloader returns bare tensors instead of per-ntype dicts in that case)
                    if len(set(blocks[0].ntypes)) == 1:
                        input_nodes = {blocks[0].ntypes[0]: input_nodes}
                        output_nodes = {blocks[0].ntypes[0]: output_nodes}

                    # Gather this batch's input-node features for every relation, moved to `device`.
                    input_features = {(stype, etype, dtype): relation_target_node_features[(stype, etype, dtype)][
                        input_nodes[dtype]].to(device)
                                      for stype, etype, dtype in relation_target_node_features.keys()}

                    input_relation_features = relation_embedding

                    if index == 0:
                        # target relation feature projection for the first layer in the full batch inference
                        for stype, reltype, dtype in input_features:
                            input_features[(stype, reltype, dtype)] = self.projection_layer[dtype](
                                input_features[(stype, reltype, dtype)])
                    h, input_relation_features = layer(block, input_features, input_relation_features)
                    # Scatter this batch's outputs into the CPU-side buffers allocated above.
                    for stype, reltype, dtype in h.keys():
                        y[(stype, reltype, dtype)][output_nodes[dtype]] = h[(stype, reltype, dtype)].cpu()

                    tqdm_dataloader.set_description(f'inference for the {batch}-th batch in model {index}-th layer')

                # update the features of all the nodes (after the graph convolution) in the whole graph
                relation_target_node_features = y
                # relation embedding is updated after each layer
                relation_embedding = input_relation_features

            # Move the final per-relation node features back to the target device for fusion.
            for stype, etype, dtype in relation_target_node_features:
                relation_target_node_features[(stype, etype, dtype)] = relation_target_node_features[
                    (stype, etype, dtype)].to(device)

            relation_fusion_embedding_dict = {}
            # relation_target_node_features -> {(srctype, etype, dsttype): target_node_features}
            for dsttype in set([dtype for _, _, dtype in relation_target_node_features]):

                # NOTE(review): this comprehension keys by etype over ALL relations, so it is
                # only correct if etype names are unique across destination types — confirm
                # against the graph schema.
                relation_target_node_features_dict = {etype: relation_target_node_features[(stype, etype, dtype)]
                                                      for stype, etype, dtype in relation_target_node_features}
                etypes = [etype for stype, etype, dtype in relation_target_node_features if dtype == dsttype]
                dst_node_features = [relation_target_node_features_dict[etype] for etype in etypes]
                dst_relation_embeddings = [relation_embedding[etype] for etype in etypes]
                dst_node_feature_transformation_weight = [self.node_transformation_weight[etype] for etype in etypes]
                dst_relation_embedding_transformation_weight = [self.relation_transformation_weight[etype] for etype in etypes]

                # use mini-batch to avoid out of memory in inference
                relation_fusion_embedding = []
                index = 0
                batch_size = 2560
                while index < dst_node_features[0].shape[0]:
                    # Tensor, shape (heads_num * hidden_dim)
                    relation_fusion_embedding.append(self.relation_fusing(
                        [dst_node_feature[index: index + batch_size, :] for dst_node_feature in dst_node_features],
                        dst_relation_embeddings,
                        dst_node_feature_transformation_weight,
                        dst_relation_embedding_transformation_weight))
                    index += batch_size
                relation_fusion_embedding_dict[dsttype] = torch.cat(relation_fusion_embedding, dim=0)

            # relation_fusion_embedding_dict, {ntype: tensor -> (nodes, n_heads * hidden_dim)}
            # relation_target_node_features, {ntype: tensor -> (num_relations, nodes, n_heads * hidden_dim)}
            return relation_fusion_embedding_dict, relation_target_node_features
# Esempio n. 2 (Example no. 2, score 0)
def _user_features(g, user_feat_df, ctm_id, ctm_id_type):
    """Build the (num 'user' nodes, 2) gender-flag tensor; rows stay 0 for users without data."""
    user_feat_df = user_feat_df.merge(ctm_id, how='inner', on=ctm_id_type)

    ids = user_feat_df.ctm_new_id.values.astype(int)
    feats = np.stack(
        (user_feat_df.is_male.values, user_feat_df.is_female.values), axis=1)

    user_feat = np.zeros((g.num_nodes('user'), 2))
    user_feat[ids] = feats
    return torch.tensor(user_feat).float()


def _item_features(g, item_feat_df, pdt_id, item_id_type):
    """Build the (num 'item' nodes, 4) item-attribute tensor; all zeros when only general IDs exist.

    Raises KeyError if item_id_type is not recognized.
    """
    if item_id_type in ['SPECIFIC ITEM IDENTIFIER']:
        item_feat_df = item_feat_df.merge(pdt_id, how='left', on=item_id_type)
        # Only IDs that are in the graph. This also drops rows whose pdt_new_id is
        # NaN after the left merge, since NaN < n evaluates to False.
        item_feat_df = item_feat_df[
            item_feat_df.pdt_new_id < g.num_nodes('item')]

        ids = item_feat_df.pdt_new_id.values.astype(int)
        feats = np.stack((
            item_feat_df.is_junior.values,
            item_feat_df.is_male.values,
            item_feat_df.is_female.values,
            item_feat_df.eco_design.values,
        ),
                         axis=1)

        item_feat = np.zeros((g.num_nodes('item'), feats.shape[1]))
        item_feat[ids] = feats
        return torch.tensor(item_feat).float()
    if item_id_type in ['GENERAL ITEM IDENTIFIER']:
        # No specific item attributes available: zero placeholder of matching width.
        return torch.zeros((g.num_nodes('item'), 4))
    raise KeyError(f'Item ID {item_id_type} not recognized.')


def _sport_features(g, sport_onehot_df, spt_id, spt_id_type):
    """Build the one-hot sport feature tensor, row-aligned with g.nodes['sport']."""
    sport_onehot_df = sport_onehot_df.merge(spt_id,
                                            how='inner',
                                            on=spt_id_type)
    # Values need to be sorted by node id to align with g.nodes['sport']
    sport_onehot_df = sport_onehot_df.sort_values(by='spt_new_id')
    feats = sport_onehot_df.drop(labels=[spt_id_type, 'spt_new_id'],
                                 axis=1).values
    assert feats.shape[0] == g.num_nodes('sport')
    return torch.tensor(feats).float()


def _item_popularity(g, user_item_train, pdt_id, item_id_type, num_days_pop):
    """Build the (num 'item' nodes, 1) purchase-share tensor over the last num_days_pop days."""
    item_popularity = np.zeros((g.num_nodes('item'), 1))
    pop_df = user_item_train.merge(pdt_id, how='left', on=item_id_type)

    # Keep only interactions within num_days_pop of the most recent hit date.
    most_recent_date = datetime.strptime(max(pop_df.hit_date), '%Y-%m-%d')
    limit_date = (most_recent_date - timedelta(days=num_days_pop)).strftime('%Y-%m-%d')
    pop_df = pop_df[pop_df.hit_date >= limit_date]

    # Normalized purchase counts per item, indexed by pdt_new_id.
    pop_df = pd.DataFrame(pop_df.pdt_new_id.value_counts())
    pop_df.columns = ['purchases']
    pop_df['score'] = pop_df.purchases / pop_df.purchases.sum()
    pop_df.sort_index(inplace=True)

    ids = pop_df.index.values.astype(int)
    scores = pop_df.score.values
    item_popularity[ids] = np.expand_dims(scores, axis=1)
    return torch.tensor(item_popularity).float()


def import_features(
    g: dgl.DGLHeteroGraph,
    user_feat_df,
    item_feat_df,
    sport_onehot_df,
    ctm_id: pd.DataFrame,
    pdt_id: pd.DataFrame,
    spt_id: pd.DataFrame,
    user_item_train,
    get_popularity: bool,
    num_days_pop: int,
    item_id_type: str,
    ctm_id_type: str,
    spt_id_type: str,
):
    """
    Import features to a dict for all node types.

    For user and item, initializes feature arrays with only 0, then fills the values if they are available.

    Parameters
    ----------
    get_popularity, num_days_pop:
        The recommender system can be enhanced by giving score boost for items that were popular. If get_popularity,
        popularity of the items will be computed. Num_days_pop defines the number of days to include in the
        computation.
    item_id_type, ctm_id_type, spt_id_type:
        See utils_data for details.
    all other parameters:
        See other functions in this file for details.

    Returns
    -------
    features_dict:
        Dictionary with keys 'user_feat' and 'item_feat' (always), 'sport_feat'
        (when the graph has 'sport' nodes), and 'item_pop' (when get_popularity).

    Raises
    ------
    KeyError
        If item_id_type is not a recognized identifier type.
    """
    features_dict = {
        'user_feat': _user_features(g, user_feat_df, ctm_id, ctm_id_type),
        'item_feat': _item_features(g, item_feat_df, pdt_id, item_id_type),
    }

    # Sport one-hot features only exist in graphs that model sport nodes.
    if 'sport' in g.ntypes:
        features_dict['sport_feat'] = _sport_features(
            g, sport_onehot_df, spt_id, spt_id_type)

    if get_popularity:
        features_dict['item_pop'] = _item_popularity(
            g, user_item_train, pdt_id, item_id_type, num_days_pop)

    return features_dict