def sampling_cate_topics(regions, size, g_percentages):
    """ Sampling category topics from the database

    Loads the checkins of every region, keeps only the categories (``cid``)
    that occur in *all* regions, and samples categories per ``z_category``
    via ``stratified_samples``. Each sampled category yields one topic row
    per region; afterwards one topic row per (``z_category``, region) pair
    is appended for the top-level categories themselves.

    :param regions: re-iterable collection (it is walked several times) of
        dicts; ``r['value']`` is handed to ``KnowledgeBase.fromMongo`` and
        ``r['name']`` is stored in the ``region`` column.
    :param size: sampling budget; every ``z_category`` is given ``size / 9``.
    :param g_percentages: forwarded unchanged to ``stratified_samples``.
    :returns: ``pd.DataFrame`` with the ``TOPIC_SCHEMA`` columns (empty when
        ``regions`` is empty).
    """
    topics = pd.DataFrame(columns=TOPIC_SCHEMA)
    checkins = None
    cate_set = set()
    for r in regions:
        kbase = KnowledgeBase.fromMongo(db.checkin, r['value'])
        region_cates = set(kbase.checkins['cid'].unique())
        if checkins is None:
            # First region: seed the set. Intersecting with the initial
            # empty set would leave it empty forever.
            cate_set = region_cates
            checkins = kbase.checkins
        else:
            # Later regions: keep only categories shared with all
            # previously seen regions.
            cate_set &= region_cates
            checkins = checkins.append(kbase.checkins, ignore_index=True)

    if checkins is None:
        # No regions were supplied, so there is nothing to sample from.
        _LOGGER.info('# Total Cate_topics: %d', len(topics))
        return topics

    _LOGGER.info('%d checkins loaded for cate_topics', len(checkins))
    # Count every user at most once per place.
    # NOTE(review): `cols` is the legacy pandas keyword (later renamed
    # `subset`); kept as-is for the pandas version this file targets.
    checkins.drop_duplicates(cols=['pid', 'user'], inplace=True)

    for zcate, group in checkins.groupby('z_category'):
        # One "cid<TAB>name" entry per remaining checkin whose category is
        # present in every region.
        cidgroup = [
            cid + '\t' + cname
            for cid, cname in group[['cid', 'category']].values
            if cid in cate_set
        ]
        # NOTE(review): 9 is presumably the number of z_categories - confirm.
        for gid, g in enumerate(
                stratified_samples(cidgroup, g_percentages, size / 9)):
            for s in g:
                # maxsplit=1 so a tab inside the category name cannot break
                # the two-way unpacking.
                cid, cname = s.split('\t', 1)
                for r in regions:
                    topics = topics.append([{
                        'topic_id': CATE_ID.next(),
                        'topic': cname,
                        'region': r['name'],
                        'associate_id': cid,
                        'zcategory': zcate,
                        'group': gid
                    }])
    _LOGGER.info('# CATE_topics: %d', len(topics))

    # Add the top-level categories themselves, once per region. These rows
    # carry no 'group' value.
    for zcate, group in checkins.groupby('z_category'):
        for r in regions:
            topics = topics.append([{
                'topic_id': ZCATE_ID.next(),
                'topic': zcate,
                'region': r['name'],
                'associate_id': group['zcid'].values[0],
                'zcategory': zcate
            }])
    _LOGGER.info('# Total Cate_topics: %d', len(topics))
    return topics
def sampling_cate_topics(regions, size, g_percentages):
    """ Sampling category topics from the database

    Gathers the checkins of all regions, restricts the candidate categories
    (``cid``) to those seen in every region, and lets ``stratified_samples``
    pick categories within each ``z_category``. A topic row is emitted for
    every (sampled category, region) pair, followed by one topic row for
    every (``z_category``, region) pair.

    :param regions: re-iterable collection of dicts providing ``'value'``
        (argument to ``KnowledgeBase.fromMongo``) and ``'name'`` (written to
        the ``region`` column).
    :param size: sampling budget; ``size / 9`` is passed per ``z_category``.
    :param g_percentages: passed through to ``stratified_samples``.
    :returns: ``pd.DataFrame`` with the ``TOPIC_SCHEMA`` columns; empty if
        no regions were given.
    """
    topics = pd.DataFrame(columns=TOPIC_SCHEMA)
    checkins = None
    cate_set = set()
    for r in regions:
        kbase = KnowledgeBase.fromMongo(db.checkin, r['value'])
        cates_here = set(kbase.checkins['cid'].unique())
        if checkins is not None:
            # Not the first region: narrow down to the common categories.
            cate_set &= cates_here
            checkins = checkins.append(kbase.checkins, ignore_index=True)
        else:
            # First region: start from its categories (an intersection with
            # the initial empty set would always be empty).
            cate_set = cates_here
            checkins = kbase.checkins

    if checkins is None:
        # Empty `regions`: return the empty frame instead of failing on
        # len(None) below.
        _LOGGER.info('# Total Cate_topics: %d', len(topics))
        return topics

    _LOGGER.info('%d checkins loaded for cate_topics', len(checkins))
    # A user contributes at most one checkin per place.
    # NOTE(review): `cols` is the legacy pandas spelling of `subset`;
    # left untouched for compatibility with the pandas version in use.
    checkins.drop_duplicates(cols=['pid', 'user'], inplace=True)

    for zcate, group in checkins.groupby('z_category'):
        # Entries are "cid<TAB>name" strings, one per remaining checkin,
        # limited to categories available in all regions.
        cidgroup = [cid + '\t' + cname
                    for cid, cname in group[['cid', 'category']].values
                    if cid in cate_set]
        # NOTE(review): the divisor 9 looks like the z_category count -
        # confirm against the data.
        samples = stratified_samples(cidgroup, g_percentages, size / 9)
        for gid, g in enumerate(samples):
            for s in g:
                # Split only on the first tab; the name may contain tabs.
                cid, cname = s.split('\t', 1)
                for r in regions:
                    topics = topics.append([{'topic_id': CATE_ID.next(),
                                             'topic': cname,
                                             'region': r['name'],
                                             'associate_id': cid,
                                             'zcategory': zcate,
                                             'group': gid}])
    _LOGGER.info('# CATE_topics: %d', len(topics))

    # Top-level categories become topics as well (no 'group' value).
    for zcate, group in checkins.groupby('z_category'):
        for r in regions:
            topics = topics.append([{'topic_id': ZCATE_ID.next(),
                                     'topic': zcate,
                                     'region': r['name'],
                                     'associate_id': group['zcid'].values[0],
                                     'zcategory': zcate}])
    _LOGGER.info('# Total Cate_topics: %d', len(topics))
    return topics
def sampling_poi_topics(region, size, g_percentages):
    """ Sampling poi topics from the database

    Loads the checkins of one region, de-duplicates them per
    (``pid``, ``user``), and samples POIs within each ``z_category`` via
    ``stratified_samples``. Every sampled POI becomes one topic row.

    :param region: dict; ``region['value']`` is handed to
        ``KnowledgeBase.fromMongo`` and ``region['name']`` is stored in the
        ``region`` column.
    :param size: sampling budget; every ``z_category`` is given ``size / 9``.
    :param g_percentages: forwarded unchanged to ``stratified_samples``.
    :returns: ``pd.DataFrame`` with the ``TOPIC_SCHEMA`` columns.
    """
    topics = pd.DataFrame(columns=TOPIC_SCHEMA)
    kbase = KnowledgeBase.fromMongo(db.checkin, region['value'])
    # Count every user at most once per place.
    # NOTE(review): `cols` is the legacy pandas keyword (later renamed
    # `subset`); kept as-is for the pandas version this file targets.
    kbase.checkins.drop_duplicates(cols=['pid', 'user'], inplace=True)
    for zcate, group in kbase.checkins.groupby('z_category'):
        # One "pid<TAB>name" entry per remaining checkin.
        pidgroup = [pid + '\t' + pname
                    for pid, pname in group[['pid', 'place']].values]
        # NOTE(review): 9 is presumably the number of z_categories - confirm.
        for gid, g in enumerate(
                stratified_samples(pidgroup, g_percentages, size / 9)):
            for s in g:
                # maxsplit=1 so a tab inside the place name cannot break
                # the two-way unpacking.
                pid, pname = s.split('\t', 1)
                topics = topics.append([{'topic_id': POI_ID.next(),
                                         'topic': pname,
                                         'region': region['name'],
                                         'associate_id': pid,
                                         'zcategory': zcate,
                                         'group': gid}])
    _LOGGER.info('# POI_topics: %d', len(topics))
    return topics
def sampling_poi_topics(region, size, g_percentages):
    """ Sampling poi topics from the database

    Reads the region's checkins, keeps one checkin per (``pid``, ``user``)
    pair, and asks ``stratified_samples`` for POIs inside every
    ``z_category``. One topic row is produced per sampled POI.

    :param region: dict providing ``'value'`` (argument to
        ``KnowledgeBase.fromMongo``) and ``'name'`` (written to the
        ``region`` column).
    :param size: sampling budget; ``size / 9`` is passed per ``z_category``.
    :param g_percentages: passed through to ``stratified_samples``.
    :returns: ``pd.DataFrame`` with the ``TOPIC_SCHEMA`` columns.
    """
    topics = pd.DataFrame(columns=TOPIC_SCHEMA)
    kbase = KnowledgeBase.fromMongo(db.checkin, region['value'])
    # A user contributes at most one checkin per place.
    # NOTE(review): `cols` is the legacy pandas spelling of `subset`;
    # left untouched for compatibility with the pandas version in use.
    kbase.checkins.drop_duplicates(cols=['pid', 'user'], inplace=True)
    for zcate, group in kbase.checkins.groupby('z_category'):
        # Entries are "pid<TAB>name" strings, one per remaining checkin.
        pidgroup = [
            pid + '\t' + pname
            for pid, pname in group[['pid', 'place']].values
        ]
        # NOTE(review): the divisor 9 looks like the z_category count -
        # confirm against the data.
        samples = stratified_samples(pidgroup, g_percentages, size / 9)
        for gid, g in enumerate(samples):
            for s in g:
                # Split only on the first tab; the name may contain tabs.
                pid, pname = s.split('\t', 1)
                topics = topics.append([{
                    'topic_id': POI_ID.next(),
                    'topic': pname,
                    'region': region['name'],
                    'associate_id': pid,
                    'zcategory': zcate,
                    'group': gid
                }])
    _LOGGER.info('# POI_topics: %d', len(topics))
    return topics