def generate_train_test_samples(file_name, train_begin_date, train_end_date,
                                test_begin_date, test_end_date,
                                save_name='data', cold_start=False,
                                minT=1, maxT=32, p=0.15, max_length=5):
    """Run the full data-preparation pipeline and pickle every artefact.

    Steps, in order: split the raw file into train/test portions, build a
    ``SocialGraph`` from the training portion, generate random paths on it,
    turn the paths into training samples, and build the test samples.

    Files written (all prefixed with ``save_name``):
        ``<save>_<train_begin>_<train_end>_graph.cpkl``
        ``<save>_<train_begin>_<train_end>_path.pkl``
        ``<save>_<train_begin>_<train_end>_train_data.cpkl``
        ``<save>_<test_begin>_<test_end>_test_data.cpkl``

    Args:
        file_name: Forwarded to ``split_train_test_data``.
        train_begin_date, train_end_date: Training window; forwarded to
            ``split_train_test_data`` and used in output file names.
        test_begin_date, test_end_date: Test window; same usage.
        save_name: Prefix of every output file.
        cold_start: Forwarded to ``generate_test_samples``.
        minT, maxT, p, max_length: Forwarded unchanged to
            ``social_implicit_path_generator``.

    Returns:
        ``True`` once every artefact has been written.
    """

    def _dump(obj, begin, end, suffix):
        # Close the handle deterministically so the pickle is fully flushed
        # to disk before the next (potentially long) stage starts.
        path = '{}_{}_{}_{}'.format(save_name, begin, end, suffix)
        with open(path, 'wb') as handle:
            cPickle.dump(obj, handle)

    print('Spliting Data...')
    train_data, test_data = split_train_test_data(
        file_name, train_begin_date, train_end_date,
        test_begin_date, test_end_date)

    print('Building Graph...')
    g = SocialGraph()
    g.build(data=train_data)
    print('Saving Graph...')
    _dump(g, train_begin_date, train_end_date, 'graph.cpkl')

    print('Generating Random Path...')
    random_path = social_implicit_path_generator(
        g, minT=minT, maxT=maxT, p=p, max_length=max_length)
    print('Saving Random Path...')
    # NOTE: this artefact uses the ".pkl" suffix (not ".cpkl"); kept as-is
    # because loaders elsewhere may depend on the exact name.
    _dump(random_path, train_begin_date, train_end_date, 'path.pkl')

    print('Generating Training Data...')
    train_data = sample_generator(g, random_path)
    print('Saving Training Data...')
    _dump(train_data, train_begin_date, train_end_date, 'train_data.cpkl')

    print('Generating Testing Data...')
    test_data = generate_test_samples(test_data, cold_start=cold_start)
    print('Saving Testing Data...')
    _dump(test_data, test_begin_date, test_end_date, 'test_data.cpkl')

    print('Finish!')
    return True
def social_implicit_path_generator(g=None, minT=1, maxT=32, p=0.15,
                                   max_length=5):
    """Generate random-walk paths on the user-user and item-item graphs.

    The social graph is split with ``g.split()`` and
    ``weighted_random_walk_generator`` is run once on each half with the
    same walk parameters.

    Args:
        g: The social graph to walk on. When omitted, a fresh empty
            ``SocialGraph`` is created for this call (the graph is no
            longer instantiated once at import time and shared).
        minT, maxT, p, max_length: Forwarded unchanged to
            ``weighted_random_walk_generator``.

    Returns:
        A dict with two entries: ``'user_path'`` and ``'item_path'``, each
        holding whatever ``weighted_random_walk_generator`` returned for
        the corresponding graph.
    """
    if g is None:
        g = SocialGraph()

    # Stage 1. Split social graph into user-user and item-item graphs
    user_graph, item_graph = g.split()

    walk_options = dict(minT=minT, maxT=maxT, p=p, max_length=max_length)
    random_path = dict()

    # Stage 2. Random walk in user-user graph
    print('Random walk in user-user graph')
    random_path['user_path'] = weighted_random_walk_generator(
        user_graph, **walk_options)

    # Stage 3. Random walk in item-item graph
    print('Random walk in item-item graph')
    random_path['item_path'] = weighted_random_walk_generator(
        item_graph, **walk_options)

    return random_path
def fit(self, data, g=None, epoch=5, batch_size=32):
    """Train the joint model on paired samples backed by a social graph.

    Before training, the method
      * merges the graph's user / item ids into ``self.config['user_id']``
        and ``self.config['item_id']``,
      * runs ``self.preprocess`` over the graph's reviews (re-using
        ``self.config['review_tokenizer']`` when one is already stored) and
        replaces ``g.review`` with the preprocessed reviews,
      * builds the model via ``self.build_model()`` if ``self.joint_model``
        is still ``None``.

    Args:
        data: Mapping of sample columns; ``len(data['user1'])`` is taken as
            the number of training samples.
        g: The social graph. NOTE: ``g.review`` is overwritten in place.
            When omitted, a fresh empty ``SocialGraph`` is created for this
            call instead of mutating one instance shared by all calls.
        epoch: Number of passes over ``data``.
        batch_size: Number of samples per training step.

    Side effects:
        Appends the mean training loss of every epoch to
        ``self.config['loss_history']``.
    """
    if g is None:
        g = SocialGraph()

    def _merge_ids(known, fresh):
        """Return ``known`` followed by the ids of ``fresh`` it lacks."""
        fresh = np.asarray(list(fresh))
        if known is None:
            return fresh
        known = np.asarray(known)
        seen = set(known.tolist())
        extra = [node for node in fresh.tolist() if node not in seen]
        if not extra:
            return known
        # Existing ids keep their positions; only unseen ids are appended.
        # (The previous `known + fresh` added the arrays element-wise.)
        return np.concatenate([known, np.asarray(extra)])

    # preprocess data
    self.config['user_id'] = _merge_ids(self.config['user_id'],
                                        g.node_u.keys())
    self.config['item_id'] = _merge_ids(self.config['item_id'],
                                        g.node_i.keys())

    print('Review preprocessing...')
    review_keys = list(g.review.keys())
    review_texts = list(g.review.values())
    tokenizer = self.config['review_tokenizer']
    if tokenizer is None:
        reviews, tokenizer = self.preprocess(review_texts)
    else:
        reviews, tokenizer = self.preprocess(review_texts, token=tokenizer)
    self.config['review_tokenizer'] = tokenizer
    g.review = dict(zip(review_keys, reviews))

    # Build Model
    if self.joint_model is None:
        self.build_model()

    # Training Model
    num_train = len(data['user1'])
    for i in range(epoch):
        print('{}-th epoch begin:'.format(i))
        data = self.data_shuffle(data)
        # Ceil division: a trailing partial batch still gets its own step.
        num_iters = (num_train + batch_size - 1) // batch_size
        iters = tqdm(range(num_iters))
        train_loss = []
        for j in iters:
            train_data = self.train_data_generator(data, g, j, batch_size,
                                                   num_train)
            history = self.joint_model.fit(train_data, epochs=1,
                                           batch_size=batch_size, verbose=0)
            last_loss = history.history['loss'][-1]
            train_loss.append(last_loss)
            iters.set_description(
                'Training loss: {:.4} >>>>'.format(last_loss))
        epoch_loss = np.mean(train_loss)
        self.config['loss_history'].append(epoch_loss)
        print('{}-th epoch ended, training loss {:.4}.'.format(i, epoch_loss))
def negative_sampling_prob(g=None, random_path=None):
    """Compute per-node sampling probabilities from random-walk visits.

    Every occurrence of a node inside a path counts as one visit; a node's
    probability is its visit count divided by the total number of visits on
    its side (users or items).

    Args:
        g: Graph exposing ``node_u`` and ``node_i`` mappings. When omitted,
            a fresh empty ``SocialGraph`` is created for this call.
        random_path: Dict with ``'user_path'`` and ``'item_path'`` entries,
            each an iterable of paths (a path being an iterable of nodes).
            Required despite the ``None`` default.

    Returns:
        ``(user_prob, item_prob)``: two 1-D float arrays whose entries are
        aligned with the iteration order of ``g.node_u`` and ``g.node_i``
        respectively, so they can be passed as ``p=`` next to an array
        built from the same keys.

    Raises:
        TypeError: if ``random_path`` is ``None``.
        KeyError: if a path contains a node that is not in the graph.

    Note:
        When a side has nodes but no visits at all, the division is 0/0 and
        the result is NaN for every node (as before this rewrite).
    """
    if g is None:
        g = SocialGraph()
    if random_path is None:
        raise TypeError("random_path is required: expected a dict with "
                        "'user_path' and 'item_path' entries")

    def _visit_frequencies(nodes, walks):
        nodes = list(nodes)
        counts = dict.fromkeys(nodes, 0)
        total = 0
        for walk in walks:
            for node in walk:
                counts[node] += 1
                total += 1
        # Read the counts back in the graph's own key order rather than in
        # the iteration order of the temporary dict.
        hits = np.asarray([counts[node] for node in nodes], dtype=float)
        return hits / float(total)

    user_prob = _visit_frequencies(g.node_u.keys(), random_path['user_path'])
    item_prob = _visit_frequencies(g.node_i.keys(), random_path['item_path'])
    return user_prob, item_prob
def get_suggested_friends(user_id):
    """Suggest friends-of-friends that `user_id` is not yet connected to.

    A small ``SocialGraph`` is built whose nodes are the user and the
    user's active friends; edges lead from the user to each friend and from
    each friend to that friend's own active friends. Candidates are the
    second-hop ids that are neither the user nor already a node of the
    graph (i.e. not already a direct friend).

    Args:
        user_id: Id of the user to compute suggestions for.

    Returns:
        A list of whatever ``User.query.get`` returns for each suggested
        id, without duplicates. The search stops after the first friend
        whose connections bring the list to more than one suggestion.
    """
    # `== True` is intentional: it is a SQL expression, not a Python test.
    friends = Friend.query.filter(Friend.user_1 == user_id,
                                  Friend.active == True).all()  # noqa: E712

    # only suggesting friends of friends
    # only user's friends are the keys in the social graph's underlying dict
    social_graph = SocialGraph()
    social_graph.add_friend_node(user_id)

    for friend in friends:
        friend_id = friend.user_2
        social_graph.add_friend_edge(user_id, friend_id)
        # Membership is keyed by user id; testing the Friend row itself
        # (as the code used to) was always "not in".
        if friend_id not in social_graph:
            social_graph.add_friend_node(friend_id)

        friends_friends = Friend.query.filter(
            Friend.user_1 == friend_id,
            Friend.active == True).all()  # noqa: E712
        for friend_of_friend in friends_friends:
            social_graph.add_friend_edge(friend_id, friend_of_friend.user_2)

    suggested_friends = []
    # Ids already suggested. The result list holds User objects, so it
    # cannot be used to de-duplicate ids.
    suggested_ids = set()

    for friend_user_id in social_graph.get_friend_connections(user_id):
        for candidate_id in social_graph.get_friend_connections(friend_user_id):
            if (candidate_id == user_id
                    or candidate_id in social_graph
                    or candidate_id in suggested_ids):
                continue
            suggested_ids.add(candidate_id)
            suggested_friends.append(User.query.get(candidate_id))

        # NOTE(review): the original indentation was lost; this early exit
        # is placed at the outer loop, where `break` actually ends the
        # search. Confirm against the intended cap on suggestions.
        if len(suggested_friends) > 1:
            break

    return suggested_friends
def _pair_anchor(samples, g, user, item):
    """Append the observed (user, item) review as the first half of a pair."""
    review_key = (user, item)
    samples['user1'].append(user)
    samples['item1'].append(item)
    samples['review1'].append(review_key)
    samples['rating1'].append(g.rating[review_key])
    samples['label1'].append(g.label[review_key])
    samples['success1'].append(1)


def _pair_observed_partner(samples, g, user, item):
    """Append an existing (user, item) review as the second half of a pair."""
    review_key = (user, item)
    samples['user2'].append(user)
    samples['item2'].append(item)
    samples['review2'].append(review_key)
    samples['rating2'].append(g.rating[review_key])
    samples['label2'].append(g.label[review_key])
    samples['success2'].append(1)


def _draw_negative(candidates, conflicts):
    """Pick a node via ``negative_sample``; fall back to a uniform draw.

    Returns ``(node, fell_back)`` where ``fell_back`` is True when
    ``negative_sample`` returned ``None`` and a uniformly random candidate
    was used instead.
    """
    picked = negative_sample(sets=candidates, conflicts=conflicts)
    if picked is None:
        return np.random.choice(candidates), True
    return picked, False


def _pair_unobserved_partner(samples, g, user, item, fell_back,
                             review_keys, sample_index, uni_labels):
    """Append a (user, item) pair with no known review as the second half.

    ``success2`` is 0 for a proper negative and 1 when the node came from
    the uniform fallback (the pair may then be a real interaction).
    Review, rating and label are filled with randomly chosen values.
    """
    samples['success2'].append(1 if fell_back else 0)
    samples['user2'].append(user)
    samples['item2'].append(item)
    # NOTE(review): `update_sample_index` is called once for the review and
    # once for the rating, exactly as before. If it advances a cursor, the
    # two come from different reviews; confirm this is intended.
    samples['review2'].append(review_keys[update_sample_index(sample_index)])
    samples['rating2'].append(
        g.rating[review_keys[update_sample_index(sample_index)]])
    samples['label2'].append(np.random.choice(uni_labels))


def sample_generator(g=None, random_path=None):
    """Build paired training samples from a social graph and random paths.

    Every sample pairs an observed review ``(user1, item1)`` with a second
    ``(user2, item2)`` and records three flags: ``success2`` (is the second
    pair an observed review), ``context_u`` (is user2 in user1's context)
    and ``context_i`` (is item2 in item1's context). ``success1`` is
    always 1.

    For each observed review, four kinds of second pair are produced, once
    driven by the user context and once by the item context:
      1. in-context node with one of its observed reviews,
      2. the same in-context node paired with a negatively sampled
         counterpart,
      3. a negatively sampled (out-of-context) node with one of its
         observed reviews,
      4. that node paired with a negatively sampled counterpart.

    Args:
        g: Graph exposing ``node_u``, ``node_i``, ``review``, ``rating``
            and ``label`` mappings. When omitted, a fresh empty
            ``SocialGraph`` is created for this call.
        random_path: Dict with ``'user_path'`` and ``'item_path'``.
            Required despite the ``None`` default.

    Returns:
        Whatever ``shuffle_samples`` returns for the dict of sample
        columns (all columns have equal length before shuffling).

    Raises:
        TypeError: if ``random_path`` is ``None``.
    """
    if g is None:
        g = SocialGraph()
    if random_path is None:
        raise TypeError("random_path is required: expected a dict with "
                        "'user_path' and 'item_path' entries")

    samples = {'user1': [], 'item1': [], 'review1': [], 'rating1': [],
               'label1': [], 'context_u': [], 'success1': [],
               'user2': [], 'item2': [], 'review2': [], 'rating2': [],
               'label2': [], 'context_i': [], 'success2': []}

    print('Transfer random path to context...')
    user_context = path2context(list(g.node_u.keys()),
                                random_path['user_path'])
    item_context = path2context(list(g.node_i.keys()),
                                random_path['item_path'])
    user_nodes = np.asarray(list(g.node_u.keys()))  # the user id
    item_nodes = np.asarray(list(g.node_i.keys()))  # the item id

    print('Calculate negative sampling probability...')
    user_sample_prob, item_sample_prob = negative_sampling_prob(g, random_path)
    user_cursor = 0  # the user negative sampling index
    item_cursor = 0  # the item negative sampling index
    # Pre-drawn pools consumed by `fast_negative_sample`.
    negative_user_set = np.random.choice(
        user_nodes, size=len(user_nodes) * 10, p=user_sample_prob)
    negative_item_set = np.random.choice(
        item_nodes, size=len(item_nodes) * 10, p=item_sample_prob)

    num_reviews = len(g.review)  # number of reviews
    sample_index = np.r_[0:num_reviews]  # the index set used to random selection
    np.random.shuffle(sample_index)
    review_keys = list(g.review.keys())  # must be indexable by position
    uni_labels = np.unique(list(g.label.values()))  # unique labels

    # Generate user samples
    print('Generate user samples')
    for user in tqdm(user_nodes):
        for item in g.node_u[user]:
            # In context: the partner user comes from `user`'s context.
            for other_user in user_context[user]:
                # In context, success pair
                _pair_anchor(samples, g, user, item)
                samples['context_u'].append(1)
                other_items = g.node_u[other_user]
                other_item = np.random.choice(other_items)
                _pair_observed_partner(samples, g, other_user, other_item)
                samples['context_i'].append(
                    1 if other_item in item_context[item] else 0)

                # In context, not success pair
                _pair_anchor(samples, g, user, item)
                samples['context_u'].append(1)
                negative_item, fell_back = _draw_negative(item_nodes,
                                                          other_items)
                _pair_unobserved_partner(samples, g, other_user, negative_item,
                                         fell_back, review_keys, sample_index,
                                         uni_labels)
                samples['context_i'].append(
                    1 if negative_item in item_context[item] else 0)

            # Not in context: as many negative users as context users.
            for _ in range(len(user_context[user])):
                # Not in context, success pair
                _pair_anchor(samples, g, user, item)
                negative_user, user_cursor = fast_negative_sample(
                    random_sets=negative_user_set,
                    conflicts=user_context[user],
                    index=user_cursor)
                if negative_user is None:
                    # Fallback draw may land inside the context, so flag 1.
                    negative_user = np.random.choice(user_nodes)
                    in_context = 1
                else:
                    in_context = 0
                # Same flag for this sample and the "not success" one below.
                samples['context_u'].extend([in_context, in_context])
                other_items = g.node_u[negative_user]
                other_item = np.random.choice(other_items)
                _pair_observed_partner(samples, g, negative_user, other_item)
                samples['context_i'].append(
                    1 if other_item in item_context[item] else 0)

                # Not in context, not success pair
                _pair_anchor(samples, g, user, item)
                negative_item, fell_back = _draw_negative(item_nodes,
                                                          other_items)
                _pair_unobserved_partner(samples, g, negative_user,
                                         negative_item, fell_back, review_keys,
                                         sample_index, uni_labels)
                samples['context_i'].append(
                    1 if negative_item in item_context[item] else 0)

    # Generate item samples
    print('Generate item samples')
    for item in tqdm(item_nodes):
        for user in g.node_i[item]:
            # In context: the partner item comes from `item`'s context.
            for other_item in item_context[item]:
                # In context, success pair
                _pair_anchor(samples, g, user, item)
                samples['context_i'].append(1)
                other_users = g.node_i[other_item]
                other_user = np.random.choice(other_users)
                _pair_observed_partner(samples, g, other_user, other_item)
                samples['context_u'].append(
                    1 if other_user in user_context[user] else 0)

                # In context, not success pair
                _pair_anchor(samples, g, user, item)
                samples['context_i'].append(1)
                negative_user, fell_back = _draw_negative(user_nodes,
                                                          other_users)
                # The negative user is drawn to conflict with the users of
                # `other_item`, so it must be paired with `other_item`
                # (previously `item` was stored, which may be a real review
                # and contradicts both success2=0 and context_i=1).
                _pair_unobserved_partner(samples, g, negative_user, other_item,
                                         fell_back, review_keys, sample_index,
                                         uni_labels)
                samples['context_u'].append(
                    1 if negative_user in user_context[user] else 0)

            # Not in context: as many negative items as context items.
            for _ in range(len(item_context[item])):
                # Not in context, success pair
                _pair_anchor(samples, g, user, item)
                negative_item, item_cursor = fast_negative_sample(
                    random_sets=negative_item_set,
                    conflicts=item_context[item],
                    index=item_cursor)
                if negative_item is None:
                    negative_item = np.random.choice(item_nodes)
                    in_context = 1
                else:
                    in_context = 0
                samples['context_i'].extend([in_context, in_context])
                other_users = g.node_i[negative_item]
                other_user = np.random.choice(other_users)
                _pair_observed_partner(samples, g, other_user, negative_item)
                samples['context_u'].append(
                    1 if other_user in user_context[user] else 0)

                # Not in context, not success pair
                _pair_anchor(samples, g, user, item)
                negative_user, fell_back = _draw_negative(user_nodes,
                                                          other_users)
                _pair_unobserved_partner(samples, g, negative_user,
                                         negative_item, fell_back, review_keys,
                                         sample_index, uni_labels)
                samples['context_u'].append(
                    1 if negative_user in user_context[user] else 0)

    return shuffle_samples(samples)