def process():
    """Classify the stored users and persist those that received a location.

    Reads users from the ``pre_user`` collection whose ``review_count`` is
    greater than 2, runs ``reviewer_classifier`` on each of them and counts
    how often every classification code is returned.  A user for which the
    classifier yields a location gets that value stored under ``local`` and
    is queued for ``mongo_functions.batch_update``.  Progress and a per-code
    summary are written to stdout.
    """
    # `reviewer_classifier` begins by discarding users with too few reviews,
    # so that filtering is already done in the query.
    # NOTE(review): the earlier comment spoke of "less than 2" reviews while
    # the query keeps only review_count > 2 -- confirm the intended threshold.
    candidates = mongo_functions.mongo_get(
        collection='pre_user',
        filter={'review_count': {'$gt': 2}},
        fields={'reviews': 1, 'review_count': 1, 'grouped_reviews': 1},
        page_size=2000000
    )
    total = len(candidates)
    code_counts = {}
    located_users = []

    print('\n')
    for position, user in enumerate(candidates, start=1):
        # '\r' rewrites the same console line so the progress stays compact.
        sys.stdout.write('\rProcessing user {}/{}...'.format(position, total))
        sys.stdout.flush()

        user_location, code = reviewer_classifier(user)
        if code is not None:
            code_counts[code] = code_counts.get(code, 0) + 1
        if user_location is not None:
            user['local'] = user_location
            located_users.append(user)

    # Report the codes in ascending code order.
    summary_rows = sorted(code_counts.items(), key=lambda pair: pair[0])
    print('\nSummary:')
    for code, count in summary_rows:
        print('{} users classified as: {}'.format(count, classifier_code_map[code]))

    # The update expression is passed as a string; presumably `batch_update`
    # evaluates it once per element with that element bound to `item`.
    mongo_functions.batch_update(located_users, collection='pre_user', update='{"$set": item}')
def prepare():
    """Derive the city-centre attributes of every metropolitan area.

    For each document of ``pre_metropolitan_area`` the businesses located in
    the area's ``tiles`` are grouped by ``_group_business_by_tile15``.  The
    tile holding the most businesses is used as the seed passed to
    ``_group`` and as the source of ``city_center``.  The area document is
    extended with ``city_center``, ``city_tiles15`` and ``city_businesses``,
    and all areas are written back in one ``batch_update`` call.

    An area without any business raises ``IndexError`` when the busiest tile
    is looked up.
    """
    areas = mongo_functions.mongo_get(collection='pre_metropolitan_area')

    for area in areas:
        area_filter = {'tile10': {'$in': area['tiles']}}
        businesses = mongo_functions.mongo_get(collection='pre_business', filter=area_filter)
        businesses_by_tile = _group_business_by_tile15(businesses)

        # Busiest tile first; ties keep the dictionary's own order.
        ranked_tiles = sorted(
            businesses_by_tile.items(),
            key=lambda entry: len(entry[1]),
            reverse=True)
        busiest_tile = ranked_tiles[0][0]

        city_area = _group(set(businesses_by_tile), busiest_tile)
        city_tiles = city_area[0]

        city_business = []
        for tile in city_tiles:
            city_business.extend(businesses_by_tile[tile])

        # Tile keys look like "<a>_<b>"; both parts are handed to
        # `tile_center` together with the constant 15.
        key_parts = busiest_tile.split('_')
        center = tile_functions.tile_center(int(key_parts[0]), int(key_parts[1]), 15)

        area['city_center'] = list(center)
        area['city_tiles15'] = city_tiles
        area['city_businesses'] = len(city_business)

    mongo_functions.batch_update(areas, collection='pre_metropolitan_area', update="{'$set': item}")
def yearly_dissimilarity_ratio():
    """Store a per-year dissimilarity ratio on every reviewed business.

    For each metropolitan area the reviews returned by ``_reviews_per_year``
    are processed year by year: the number of unique visitors and residents
    is computed, the reviews are grouped per business, and
    ``business_dissimilarity_ratio`` is evaluated for each business.  The
    result, limited to nine decimal places, is collected under
    ``ratio_yearly[<year>]`` and written to ``pre_business`` once per area.
    """
    areas = mongo_functions.mongo_get(collection='pre_metropolitan_area', fields={})

    for area in areas:
        ratios_by_business = {}

        for year, reviews in _reviews_per_year(area).items():
            total_visitors = len(city_unique_visitors_get(reviews, area))
            total_residents = len(city_unique_residents_get(reviews, area))
            reviews_by_business = business_reviews_dict_group(reviews)

            for business_id, business_reviews in reviews_by_business.items():
                entry = ratios_by_business.setdefault(
                    business_id, {'_id': business_id, 'ratio_yearly': {}})
                ratio = business_dissimilarity_ratio(
                    business_reviews, total_visitors, total_residents, area)
                # Round-trip through a formatted string to cap the precision.
                entry['ratio_yearly'][str(year)] = float('{0:.9f}'.format(ratio))

        mongo_functions.batch_update(
            list(ratios_by_business.values()),
            collection='pre_business',
            update='{"$set": {"ratio_yearly": item["ratio_yearly"]}}')
def prepare():
    """Tag the reviews of every area with the ``local`` value of their author.

    Per metropolitan area: the reviews of ``pre_review`` are loaded, the
    ``local`` field of their authors is fetched from ``pre_user`` in chunks
    of 100000 ids, and each review whose author has that field receives it
    as ``user_from``.  The tagged reviews are written back with
    ``batch_update``.

    If collecting the user ids or loading the users fails, the error is
    printed and the whole function returns, leaving later areas unprocessed.
    """
    chunk_size = 100000
    areas = mongo_functions.mongo_get(collection='pre_metropolitan_area', fields={})

    for area in areas:
        reviews = mongo_functions.mongo_get(
            collection='pre_review',
            filter={'city_area': area['_id']},
            fields={'user_id': 1})

        try:
            user_ids = list(set(review['user_id'] for review in reviews))
        except Exception as e:
            print(e)
            return

        try:
            user_type_dict = {}
            # Query the users chunk by chunk to keep each `$in` list bounded.
            for start in range(0, len(user_ids), chunk_size):
                id_chunk = user_ids[start:start + chunk_size]
                located = mongo_functions.mongo_get(
                    collection='pre_user',
                    filter={'_id': {'$in': id_chunk}, 'local': {'$exists': True}},
                    fields={'local': 1})
                for user in located:
                    user_type_dict[user['_id']] = user['local']
        except Exception as e:
            print(e)
            return

        tagged_reviews = []
        # Consume the review list from its end; it is empty afterwards.
        while reviews:
            review = reviews.pop()
            try:
                review['user_from'] = user_type_dict[review['user_id']]
            except KeyError:
                # Author unknown or without a `local` value: skip the review.
                continue
            except Exception as e:
                print(e)
                continue
            tagged_reviews.append(review)

        mongo_functions.batch_update(
            tagged_reviews,
            collection='pre_review',
            update='{"$set": {"user_from": item["user_from"]}}')
def prepare():
    """Write ``norm_categories`` for the businesses of every metropolitan area.

    The category dictionary is built once with ``category_dict_prepare``.
    For each area the businesses located in its ``tiles`` are loaded, passed
    to ``business_prepare`` together with that dictionary, and then written
    back to ``pre_business``.
    """
    category_dict = category_dict_prepare()
    areas = mongo_functions.mongo_get(
        collection='pre_metropolitan_area',
        fields={'tiles': 1, 'name': 1})

    for area in areas:
        print('processing business for ', area['_id'])

        tile_filter = {"tile10": {'$in': area['tiles']}}
        wanted_fields = {'name': 1, 'categories': 1}
        business = mongo_functions.mongo_get(
            collection='pre_business',
            filter=tile_filter,
            fields=wanted_fields)

        # Presumably fills in `norm_categories` on each business in place;
        # the update expression below reads that key -- verify in the helper.
        business_prepare(business, category_dict)

        mongo_functions.batch_update(
            business,
            collection='pre_business',
            update='{"$set": {"norm_categories": item["norm_categories"]}}')
def main():
    """Store the overall dissimilarity ratio of every reviewed business.

    For each metropolitan area the reviews that carry a ``user_from`` value
    are loaded, the number of unique visitors and residents is computed and
    the reviews are grouped per business.  ``business_dissimilarity_ratio``
    is evaluated for each business and the result is written to
    ``pre_business`` as ``raw_ratio``.
    """
    review_fields = {
        'business_id': 1,
        'user_id': 1,
        'tile15': 1,
        'tile18': 1,
        'user_from': 1
    }
    areas = mongo_functions.mongo_get(collection='pre_metropolitan_area', fields={})

    for area in areas:
        review_filter = {'city_area': area['_id'], 'user_from': {'$exists': True}}
        reviews = mongo_functions.mongo_get(
            collection='pre_review',
            filter=review_filter,
            fields=review_fields)

        total_visitors = len(city_unique_visitors_get(reviews, area))
        total_residents = len(city_unique_residents_get(reviews, area))
        reviews_by_business = business_reviews_dict_group(reviews)

        business_list = [
            {
                '_id': business_id,
                'raw_ratio': business_dissimilarity_ratio(
                    business_reviews, total_visitors, total_residents, area)
            }
            for business_id, business_reviews in reviews_by_business.items()
        ]

        mongo_functions.batch_update(
            business_list,
            collection='pre_business',
            update='{"$set": {"raw_ratio": item["raw_ratio"]}}')