def _fetch_member_friends(self, user_subset):
    """Fetch friend lists for the selected members and cache them on ``self``.

    Returns immediately when ``self.member_friends`` is already set.
    Otherwise one ``friends.get`` request per selected member is queued
    through a ``vk_api.VkRequestsPool``. Friends kept from the successful
    requests are stored in ``self.member_friends`` (a ``defaultdict(list)``
    keyed by member id), then ``self._compress_users()`` is called and the
    mapping is pickled under ``'raw_users_data.member_friends'``.

    Args:
        user_subset: Container of user ids (only used with ``in``). Members
            are queried only if their id is in it; friends are kept only if
            their id is *not* in it.
    """
    # Already fetched (or restored from elsewhere): nothing to do.
    if self.member_friends is not None:
        return
    log_method_begin()
    members = [
        member for member in self.members if member['id'] in user_subset
    ]
    print('{} users to fetch'.format(len(members)))
    # (member id, pending request) pairs; the request objects are only
    # inspected after the ``with`` block below has been left.
    pool_results = []
    with vk_api.VkRequestsPool(self.vk_session) as pool:
        for member in members:
            pool_results.append(
                (member['id'],
                 pool.method('friends.get', {
                     'user_id': member['id'],
                     'fields': 'photo'
                 })))
    self.member_friends = defaultdict(list)
    for member_id, friend_request in pool_results:
        # Requests that did not succeed are skipped without any message.
        if friend_request.ok:
            for friend in friend_request.result['items']:
                # NOTE(review): the original indentation was lost; this
                # assumes BOTH statements below belong to the ``if`` (only
                # friends outside user_subset are flagged and stored).
                # Confirm against version control.
                if friend['id'] not in user_subset:
                    friend['is_member'] = False
                    self.member_friends[member_id].append(friend)
    self._compress_users()
    self._save_pickle('raw_users_data.member_friends', self.member_friends)
    log_method_end()
def fit(self, post_subset=None):
    """Train the like and repost models on the action table.

    The table comes from ``self.action_data.get_all()``. Identifier and
    label columns are removed to form the feature matrix; ``is_liked`` and
    ``is_reposted`` are the respective targets. ``self.is_fitted`` is set to
    ``True`` once both models have been fitted.

    Args:
        post_subset: Optional collection of post ids. When given, only rows
            whose ``post_id`` is in it are used for training.
    """
    table = self.action_data.get_all()
    if post_subset is not None:
        wanted_rows = table['post_id'].isin(post_subset)
        table = table[wanted_rows]

    log_method_begin()

    # Everything that is an identifier or a label is not a feature.
    non_feature_columns = [
        'user_id', 'post_id', 'is_member', 'is_liked', 'is_reposted'
    ]
    features = table.drop(non_feature_columns, axis=1)

    self.like_model.fit(features, table['is_liked'])
    self.repost_model.fit(features, table['is_reposted'])
    self.is_fitted = True

    log_method_end()
def get_true(self, subset=None):
    """Count the observed likes and reposts per post, split by membership.

    Every user id recorded under ``post['likes']['user_ids']`` and
    ``post['reposts']['user_ids']`` is resolved with
    ``self.raw_users_data.find_user``. Unknown users (``None``) are ignored;
    the rest are tallied as "direct" when ``user['is_member']`` is truthy
    and "non direct" otherwise.

    Args:
        subset: Optional container of post ids; posts outside it are skipped.

    Returns:
        A ``pandas.DataFrame`` indexed by post id with the columns
        ``direct_likes_count``, ``direct_reposts_count``,
        ``non_direct_likes_count`` and ``non_direct_reposts_count``. Only
        posts that received at least one counted action appear.
    """
    print('GroupPredict.get_true for group {}'.format(self.group_id))
    log_method_begin()

    member_likes, member_reposts = Counter(), Counter()
    outsider_likes, outsider_reposts = Counter(), Counter()
    # (key inside the post, tally for members, tally for non-members);
    # likes are processed before reposts for every post.
    tallies = (
        ('likes', member_likes, outsider_likes),
        ('reposts', member_reposts, outsider_reposts),
    )

    for post in tqdm(self.raw_wall_data.posts):
        post_id = post['id']
        if subset is None or post_id in subset:
            for action, member_tally, outsider_tally in tallies:
                for user_id in post[action]['user_ids']:
                    user = self.raw_users_data.find_user(user_id)
                    if user is not None:
                        if user['is_member']:
                            member_tally[post_id] += 1
                        else:
                            outsider_tally[post_id] += 1

    # Union of every post id that was counted at least once.
    post_ids = list(member_likes.keys() | member_reposts.keys()
                    | outsider_likes.keys() | outsider_reposts.keys())
    # Reading a missing key from a Counter yields 0 without inserting it.
    rows = [[
        member_likes[post_id], member_reposts[post_id],
        outsider_likes[post_id], outsider_reposts[post_id]
    ] for post_id in post_ids]
    column_names = [
        'direct_likes_count', 'direct_reposts_count',
        'non_direct_likes_count', 'non_direct_reposts_count'
    ]
    result = pd.DataFrame(rows, index=post_ids, columns=column_names)

    log_method_end()
    return result
def _fetch_members(self):
    """Load the group's member list once and flag every entry as a member.

    When ``self.members`` is still ``None`` the list is requested through
    ``self.vk_tools.get_all('groups.getMembers', ...)`` for
    ``self.group_id`` with the fields in ``self.member_fields``, and each
    returned member dict gets ``'is_member': True``. A later call is a
    no-op.
    """
    if self.members is None:
        log_method_begin()

        response = self.vk_tools.get_all(
            'groups.getMembers', 1000,
            {'group_id': self.group_id, 'fields': self.member_fields})
        self.members = response['items']
        print('{} members'.format(len(self.members)))

        for entry in self.members:
            entry['is_member'] = True

        log_method_end()
def fit(self):
    """Build the user/post action table and store it in ``self.table``.

    For every member that has a ``'groups'`` entry, one row per post is
    produced with ``self.get_row(user, post, True, is_liked, is_reposted)``.
    When the member reposted the post, each of the member's friends (taken
    from ``self.raw_users_data.member_friends``) that has a ``'groups'``
    entry also contributes a row with the member flag set to ``False``; a
    given (friend, post) pair is emitted at most once over the whole run.

    Summary statistics are printed before returning. The member mask is
    cast to ``bool`` first: with no rows at all the ``is_member`` column is
    an empty object-dtype Series, which pandas would interpret as a list of
    column labels instead of a boolean mask, so the statistics would raise
    ``KeyError`` instead of reporting zeros.

    Returns:
        The ``pandas.DataFrame`` built from the rows, with columns
        ``self.get_labels()``. The same object is stored in ``self.table``.
    """
    log_method_begin()

    users_data = self.raw_users_data
    posts = self.raw_wall_data.posts
    print("{} members, {} posts".format(len(users_data.members), len(posts)))

    rows = []
    # (friend id, post id) pairs that already produced a row.
    friend_post_pairs = set()
    for user in tqdm(users_data.members, 'ActionData.get_all: for members'):
        # Members without a 'groups' entry are left out of the table.
        if 'groups' not in user:
            continue
        user_id = user['id']
        for post in posts:
            likers = post['likes']['user_ids']
            reposters = post['reposts']['user_ids']
            is_liked = user_id in likers
            is_reposted = user_id in reposters
            if is_reposted:
                # Friends are only added for posts the member reposted.
                for friend in users_data.member_friends[user_id]:
                    if 'groups' not in friend:
                        continue
                    friend_post_pair = (friend['id'], post['id'])
                    if friend_post_pair in friend_post_pairs:
                        continue
                    rows.append(
                        self.get_row(friend, post, False,
                                     friend['id'] in likers,
                                     friend['id'] in reposters))
                    friend_post_pairs.add(friend_post_pair)
            rows.append(self.get_row(user, post, True, is_liked, is_reposted))

    result = pd.DataFrame(rows, columns=self.get_labels())
    self.table = result

    # Explicit bool cast keeps this a boolean mask even for an empty table.
    member_rows = result[result['is_member'].astype(bool)]
    print("{} rows".format(len(result)))
    print("{} liked, {} reposted".format(sum(result['is_liked']),
                                         sum(result['is_reposted'])))
    print("{} liked, {} reposted by members".format(
        sum(member_rows['is_liked']), sum(member_rows['is_reposted'])))

    log_method_end()
    return result
def predict(self, post_subset=None):
    """Predict likes and reposts for the rows of the action table.

    The table comes from ``self.action_data.get_all()``; identifier and
    label columns are removed before the features are passed to
    ``self.like_model`` and ``self.repost_model``.

    Args:
        post_subset: Optional collection of post ids. When given, only rows
            whose ``post_id`` is in it are scored.

    Returns:
        A ``pandas.DataFrame`` with the columns ``user_id``, ``post_id``,
        ``is_member``, ``is_liked`` and ``is_reposted``, where the last two
        hold the model predictions. The frame is built from one numpy
        array, so all columns share that array's dtype and the index of
        the source table is not carried over.
    """
    table = self.action_data.get_all()
    if post_subset is not None:
        table = table[table['post_id'].isin(post_subset)]

    log_method_begin()

    passthrough_columns = ['user_id', 'post_id', 'is_member']
    label_columns = ['is_liked', 'is_reposted']
    output_columns = passthrough_columns + label_columns

    features = table.drop(output_columns, axis=1)

    # One sequence per output column, transposed into rows below.
    stacked = [table[name] for name in passthrough_columns]
    stacked.append(self.like_model.predict(features))
    stacked.append(self.repost_model.predict(features))

    result = pd.DataFrame(np.array(stacked).T, columns=output_columns)

    log_method_end()
    return result
def predict(self, post_subset=None):
    """Aggregate predicted actions into per-post like and repost counts.

    ``self.predict_action_model.predict(post_subset)`` supplies one row per
    (user, post) pair. Each row with a truthy ``is_liked`` / ``is_reposted``
    adds one to the post's "direct" tally when ``is_member`` is truthy and
    to the "non direct" tally otherwise.

    Args:
        post_subset: Passed through unchanged to the action model.

    Returns:
        A ``pandas.DataFrame`` indexed by post id with the columns
        ``direct_likes_count``, ``direct_reposts_count``,
        ``non_direct_likes_count`` and ``non_direct_reposts_count``. Only
        posts with at least one predicted action appear.
    """
    log_method_begin()

    member_likes, member_reposts = Counter(), Counter()
    outsider_likes, outsider_reposts = Counter(), Counter()

    predictions = self.predict_action_model.predict(post_subset)
    for _, row in predictions.iterrows():
        if row['is_liked']:
            tally = member_likes if row['is_member'] else outsider_likes
            tally[row['post_id']] += 1
        if row['is_reposted']:
            tally = member_reposts if row['is_member'] else outsider_reposts
            tally[row['post_id']] += 1

    # Union of every post id that was counted at least once.
    post_ids = list(member_likes.keys() | member_reposts.keys()
                    | outsider_likes.keys() | outsider_reposts.keys())
    # Reading a missing key from a Counter yields 0 without inserting it.
    rows = [[
        member_likes[post_id], member_reposts[post_id],
        outsider_likes[post_id], outsider_reposts[post_id]
    ] for post_id in post_ids]
    column_names = [
        'direct_likes_count', 'direct_reposts_count',
        'non_direct_likes_count', 'non_direct_reposts_count'
    ]
    result = pd.DataFrame(rows, index=post_ids, columns=column_names)

    log_method_end()
    return result
def _fetch_activity(self):
    """Attach the ids of users who liked or reposted each post.

    For every post in ``self.posts`` two ``likes.getList`` requests are
    queued through a ``vk_api.VkRequestsPool``: one with filter ``'likes'``
    and one with filter ``'copies'``. Afterwards
    ``post['likes']['user_ids']`` and ``post['reposts']['user_ids']`` are
    set to the returned ids as a ``set``, or to an empty set when the
    request was not successful. Missing ``'likes'`` / ``'reposts'`` dicts
    are created.

    Each request asks for ``count: 1000`` and this method issues no
    follow-up requests for a post.
    """
    log_method_begin()
    print('{} posts to fetch'.format(len(self.posts)))

    def build_query(post_id, kind):
        # Parameters of one likes.getList call for a wall post of the group.
        return {'item_id': post_id, 'owner_id': -self.group_id,
                'type': 'post', 'count': 1000, 'filter': kind}

    pending = []
    with vk_api.VkRequestsPool(self.vk_session) as pool:
        for post in self.posts:
            likes_request = pool.method(
                'likes.getList', build_query(post['id'], 'likes'))
            reposts_request = pool.method(
                'likes.getList', build_query(post['id'], 'copies'))
            pending.append((post, likes_request, reposts_request))

    # The request objects are read only after the pool block has been left.
    for post, likes_request, reposts_request in pending:
        for key, request in (('likes', likes_request),
                             ('reposts', reposts_request)):
            section = post.setdefault(key, dict())
            if request.ok:
                section['user_ids'] = set(request.result['items'])
            else:
                section['user_ids'] = set()

    log_method_end()
def fit(self):
    """Create ``self.lda_maker`` from this object's corpora.

    The corpora come from ``self._get_corpora_for_lda()`` and are handed to
    ``LdaMaker`` together with ``self.num_topics``.
    """
    log_method_begin()

    corpora = self._get_corpora_for_lda()
    self.lda_maker = LdaMaker(corpora, self.num_topics)

    log_method_end()
def _fetch_wall(self):
    """Download the group's wall posts into ``self.posts``.

    Calls ``self.vk_tools.get_all('wall.get', 100, ...)`` with the negated
    ``self.group_id`` as ``owner_id`` and ``extended`` set to 1, keeps the
    ``'items'`` of the response and prints how many posts were received.
    """
    log_method_begin()

    wall = self.vk_tools.get_all(
        'wall.get', 100, {'owner_id': -self.group_id, 'extended': 1})
    self.posts = wall['items']
    print('{} posts'.format(len(self.posts)))

    log_method_end()
def _fetch_groups(self, user_subset):
    """Fetch group descriptions for the selected users, batch by batch.

    Users come from ``self.get_all_users()`` filtered by ``user_subset`` and
    are processed in slices of 1000. For every user that has no ``'groups'``
    key yet, a ``groups.get`` request is queued through a
    ``vk_api.VkRequestsPool``. Each successful request sets
    ``user['groups']`` to a list of ``{'description': ...}`` dicts, one per
    returned group that has a description.

    A failing batch is retried once after a one-minute pause; a second
    failure within 120 seconds gives up on that batch and calls
    ``self.mark_fetch_groups_error()``. ``self.members`` and
    ``self.member_friends`` are pickled so that users already fetched keep
    their ``'groups'`` key.

    Args:
        user_subset: Container of user ids (only used with ``in``); users
            outside it are not queried.
    """
    # NOTE(review): the original indentation was lost. The nesting below
    # (in particular the position of ``last_error_time = time.time()`` and
    # of the two ``_save_pickle`` calls) is reconstructed; confirm against
    # version control.
    log_method_begin()
    all_users = [
        user for user in self.get_all_users() if user['id'] in user_subset
    ]
    print('{} users to fetch'.format(len(all_users)))
    all_users_processing_step = 1000
    fetch_start = time.time()
    for i in range(0, len(all_users), all_users_processing_step):
        print('Fetching from {} to {}...'.format(
            i, i + all_users_processing_step))
        users = all_users[i:i + all_users_processing_step]
        # After two hours of fetching, pause for 30 minutes and restart
        # the timer.
        if time.time() - fetch_start > 2 * 60 * 60:
            print('Cooldown for 30 minutes')
            time.sleep(30 * 60)
            fetch_start = time.time()
        do_fetch = True
        # -1 guarantees the first failure of a batch is never "recent".
        last_error_time = -1
        while do_fetch:
            try:
                pool_results = []
                with vk_api.VkRequestsPool(self.vk_session) as pool:
                    for user in users:
                        # Users that already carry 'groups' (from an earlier
                        # attempt or run) are not requested again.
                        if 'groups' not in user:
                            pool_results.append(
                                (user,
                                 pool.method(
                                     'groups.get', {
                                         'user_id': user['id'],
                                         'count': 1000,
                                         'extended': 1,
                                         'fields': self.group_fields
                                     })))
                do_fetch = False
                self.unmark_fetch_groups_error()
            except Exception as e:
                print('Can\'t fetch groups because of', e)
                traceback.print_exc()
                if time.time() - last_error_time < 120:
                    # Second failure within two minutes: stop retrying
                    # this batch.
                    # NOTE(review): despite the message, only the retry
                    # loop ends; the outer ``for`` continues with the next
                    # batch, whose success calls
                    # unmark_fetch_groups_error() (presumably clearing the
                    # mark set here; confirm).
                    print(
                        'Can\'t do anything, exit. Restart will reuse fetched users'
                    )
                    do_fetch = False
                    self.mark_fetch_groups_error()
                else:
                    print('Trying again in 1 minute')
                    time.sleep(60)
                last_error_time = time.time()
            finally:
                # Runs after every attempt, failed or not, so whatever was
                # answered is kept and persisted.
                for user, groups_request in pool_results:
                    if groups_request.ok and groups_request.ready:
                        user['groups'] = []
                        for group in groups_request.result['items']:
                            # Only the description is kept; groups without
                            # one are dropped.
                            if 'description' in group:
                                user['groups'].append(
                                    {'description': group['description']})
                self._save_pickle('raw_users_data.members', self.members)
                self._save_pickle('raw_users_data.member_friends',
                                  self.member_friends)
    log_method_end()