def loadBusinesses(chunk, num_processes=None):
    """
    Load one chunk of businesses and process them in a pool of workers.

    Args:
        chunk: Which subset of the challenge set to load. Must be one of the
            strings '0', '1' or '2'; it is converted to an int and handed to
            ``loader.get_challengeset``.
        num_processes: Number of worker processes (anything ``int()``
            accepts). Falls back to ``cpu_count()`` when falsy.

    Raises:
        AssertionError: If ``chunk`` is not '0', '1' or '2'.

    Side effects:
        Rebinds the module-level ``featurizer`` global and draws a progress
        bar on stdout.
    """
    global featurizer

    # Raised explicitly instead of via ``assert`` so the check is not
    # stripped under ``python -O``; the exception type seen by callers is
    # unchanged.
    if chunk not in ('0', '1', '2'):
        raise AssertionError("Chunk must be 0, 1, or 2")

    businesses = loader.get_challengeset(int(chunk))
    num_processes = int(num_processes) if num_processes else cpu_count()
    total = len(businesses)

    # late import
    from scorers import Featurizer
    # NOTE(review): presumably concurrent_process reads this global inside the
    # workers, which is why it is created before the pool -- confirm.
    featurizer = Featurizer()

    pool = Pool(processes=num_processes)
    print("Loading {} businesseses".format(total))

    done = 0
    try:
        # Businesses are handed to the pool one group at a time, with
        # num_processes passed to chunker as the group size argument.
        for group in chunker(businesses, num_processes):
            pool.map(concurrent_process, group)
            # Count the group only once it has been processed, and report
            # exactly that count, so the bar ends at 100% (total/total).
            done += len(group)
            sys.stdout.write('\r')
            sys.stdout.write("[%-50s] %d%% (%d/%d) " % (
                '=' * (done * 50 // total),
                done * 100 // total,
                done,
                total,
            ))
            sys.stdout.flush()
    finally:
        # Always release the worker processes, even if a task raised.
        pool.close()
        pool.join()

    if done:
        # Finish the progress line so later output starts on a fresh line.
        sys.stdout.write('\n')
        sys.stdout.flush()
def get_all_business_types():
    """
    Fill in a type description for every business that does not have one.

    For each business in the challenge set whose ``unique_id`` is missing
    from the stored business-types dict, fetch candidate places with
    ``get_places`` at the business's stored (lat, lon) and pick the place
    whose name scores highest with ``cosine_sim`` against the business name.
    That place's types, minus a fixed list of ignored tokens and with
    underscores turned into spaces, are stored as one space-separated string.
    ``None`` is stored when no candidate scores above 0.

    Side effects:
        Prints progress to stdout and saves the dict through
        ``loader.dump_business_dict`` after every new entry.
    """
    # Type tokens that are dropped from the stored description. Built once
    # here instead of once per candidate type.
    ignored_types = frozenset((
        'point_of_interest', 'establishment', 'sublocality', 'route', 'real',
        'political', 'of', 'or', 'local', 'locality', 'intersection', '1',
    ))

    businesses = loader.get_challengeset()
    idtoloc = loader.get_idtoloc()
    business_types_dict = loader.get_business_types()
    print("Done {} of {}".format(len(business_types_dict), len(businesses)))

    for business in businesses:
        unique_id = business['unique_id']
        # Test membership on the dict itself: ``.keys()`` builds a full list
        # on Python 2, which made this check linear per business.
        if unique_id in business_types_dict:
            continue

        print(business['name'])
        # The location must already be stored for this business.
        lat, lon = idtoloc[unique_id]

        # Keep the first candidate with the strictly highest similarity;
        # candidates scoring 0 or less are never selected.
        closest_place, best_sim = None, 0
        for place in get_places(lat, lon):
            sim = cosine_sim(place['name'], business['name'])
            if sim > best_sim:
                closest_place, best_sim = place, sim

        if closest_place:
            # Filter whole tokens first, then turn underscores into spaces.
            kept = [t for t in closest_place['types']
                    if t not in ignored_types]
            types = " ".join(kept).replace("_", " ")
        else:
            types = None
        print(types)

        business_types_dict[unique_id] = types
        # Save after each new lookup so an interrupted run keeps its results.
        loader.dump_business_dict(business_types_dict)
def get_business_lat_lon():
    """
    Geocode every business that has no stored location yet.

    Walks the challenge set and, for each business whose ``unique_id`` has no
    (non-None) entry in the id-to-location mapping, resolves its address with
    ``GoogleGeocodingApi.decode_address`` and stores the resulting
    ``(lat, lng)`` tuple.

    Side effects:
        Prints the name of each business being geocoded and rewrites
        ``../../data/id_to_loc.pickle`` (relative to the current working
        directory) after every new entry.
    """
    gapi = GoogleGeocodingApi()
    id_to_loc = loader.get_idtoloc()
    businesses = loader.get_challengeset()

    for business in businesses:
        unique_id = business['unique_id']
        address = business['address']
        # Skip businesses that already have a location.
        if id_to_loc.get(unique_id) is not None:
            continue

        print(business['name'])
        lat, lng = gapi.decode_address(address)
        id_to_loc[unique_id] = (lat, lng)

        # Save after each new lookup so an interrupted run keeps its results.
        # The file is opened in binary mode, as pickle expects, and closed
        # deterministically instead of leaking the handle.
        with open('../../data/id_to_loc.pickle', 'wb') as handle:
            pickle.dump(id_to_loc, handle)