Ejemplo n.º 1
0
def loadBusinesses(chunk, num_processes=None):
    """
    Load one chunk of the challenge set and process it with a worker pool.

    Businesses are fetched with ``loader.get_challengeset(int(chunk))``, split
    into groups by ``chunker`` and each group is handed to
    ``pool.map(concurrent_process, group)``. A text progress bar is redrawn on
    stdout after each group has been processed.

    Args:
        chunk: Which subset to load, given as the string '0', '1' or '2'.
        num_processes: Number of worker processes (anything ``int()``
            accepts). Falsy values fall back to ``cpu_count()``.

    Raises:
        AssertionError: If ``chunk`` is not one of the accepted strings.
    """
    # Raised explicitly (rather than via an ``assert`` statement) so the check
    # still runs under ``python -O``; the exception type is unchanged.
    if chunk not in ('0', '1', '2'):
        raise AssertionError("Chunk must be 0, 1, or 2")
    businesses = loader.get_challengeset(int(chunk))
    num_processes = int(num_processes) if num_processes else cpu_count()
    total = len(businesses)
    global featurizer
    # late import
    from scorers import Featurizer
    # NOTE(review): the global is assigned before the pool is created,
    # presumably so that worker processes can see it -- confirm.
    featurizer = Featurizer()

    pool = Pool(processes=num_processes)

    print("Loading {} businesseses".format(total))
    done = 0
    try:
        for group in chunker(businesses, num_processes):
            pool.map(concurrent_process, group)
            # Count a group only once it has been processed, so the bar never
            # overshoots ``total`` and ends at exactly 100%.
            done += len(group)
            sys.stdout.write('\r')
            sys.stdout.write("[%-50s] %d%% (%d/%d) " %
                             ('=' * (done * 50 // total),
                              done * 100 // total, done, total))
            sys.stdout.flush()
    except BaseException:
        # Stop the workers right away; the original error is re-raised.
        pool.terminate()
        raise
    else:
        pool.close()
    finally:
        # Always reap the worker processes so none are leaked.
        pool.join()
Ejemplo n.º 2
0
def get_all_business_types():
    """
    Fill in a type description for every business that lacks one.

    For each business from ``loader.get_challengeset()`` whose ``unique_id``
    is not yet a key of the dict from ``loader.get_business_types()``, the
    places returned by ``get_places(lat, lon)`` are compared by name with
    ``cosine_sim``. The types of the best-scoring place (similarity above 0),
    minus a fixed set of ignored tags, are joined into one space-separated
    string with underscores turned into spaces. ``None`` is stored when no
    place scores above 0.

    The dict is saved through ``loader.dump_business_dict`` after every new
    entry, and the business name and resulting types are printed to stdout.

    Raises:
        KeyError: If a business without a stored type has no entry in the
            mapping returned by ``loader.get_idtoloc()``.
    """
    # Tags dropped from the description; built once instead of per place type.
    ignored_types = frozenset([
        'point_of_interest', 'establishment', 'sublocality', 'route',
        'real', 'political', 'of', 'or', 'local', 'locality', 'intersection',
        '1',
    ])

    businesses = loader.get_challengeset()
    idtoloc = loader.get_idtoloc()
    business_types_dict = loader.get_business_types()
    print("Done {} of {}".format(len(business_types_dict), len(businesses)))
    for business in businesses:
        unique_id = business['unique_id']
        # Direct membership test: ``.keys()`` would build and scan a list on
        # every iteration under Python 2.
        if unique_id in business_types_dict:
            continue

        print(business['name'])
        closest_place, best_sim = None, 0
        lat, lon = idtoloc[unique_id]
        for place in get_places(lat, lon):
            sim = cosine_sim(place['name'], business['name'])
            if sim > best_sim:
                closest_place = place
                best_sim = sim

        if closest_place:
            kept = [t for t in closest_place['types']
                    if t not in ignored_types]
            types = " ".join(kept).replace("_", " ")
        else:
            types = None
        print(types)

        business_types_dict[unique_id] = types
        # Persist after each new entry so finished lookups survive a crash.
        loader.dump_business_dict(business_types_dict)
Ejemplo n.º 3
0
def get_business_lat_lon():
    """
    Geocode every business that has no stored location yet.

    Iterates over ``loader.get_challengeset()`` and, for each business whose
    ``unique_id`` has no non-None entry in the mapping returned by
    ``loader.get_idtoloc()``, prints the business name, resolves its address
    with ``GoogleGeocodingApi.decode_address`` and stores the resulting
    ``(lat, lng)`` tuple. The whole mapping is pickled to
    ``../../data/id_to_loc.pickle`` after each new lookup.
    """
    gapi = GoogleGeocodingApi()
    id_to_loc = loader.get_idtoloc()
    businesses = loader.get_challengeset()
    for business in businesses:
        unique_id = business['unique_id']
        if id_to_loc.get(unique_id) is not None:
            continue

        print(business['name'])
        lat, lng = gapi.decode_address(business['address'])
        id_to_loc[unique_id] = (lat, lng)

        # Checkpoint only when the mapping actually changed, and close the
        # file deterministically. Pickle data is written in binary mode.
        with open('../../data/id_to_loc.pickle', 'wb') as handle:
            pickle.dump(id_to_loc, handle)
Ejemplo n.º 4
0
def loadBusinesses(chunk, num_processes=None):
    """
    Load one chunk of the challenge set and process it with a worker pool.

    Businesses are fetched with ``loader.get_challengeset(int(chunk))``, split
    into groups by ``chunker`` and each group is handed to
    ``pool.map(concurrent_process, group)``. A text progress bar is redrawn on
    stdout after each group has been processed.

    Args:
        chunk: Which subset to load, given as the string '0', '1' or '2'.
        num_processes: Number of worker processes (anything ``int()``
            accepts). Falsy values fall back to ``cpu_count()``.

    Raises:
        AssertionError: If ``chunk`` is not one of the accepted strings.
    """
    # Raised explicitly (rather than via an ``assert`` statement) so the check
    # still runs under ``python -O``; the exception type is unchanged.
    if chunk not in ('0', '1', '2'):
        raise AssertionError("Chunk must be 0, 1, or 2")
    businesses = loader.get_challengeset(int(chunk))
    num_processes = int(num_processes) if num_processes else cpu_count()
    total = len(businesses)
    global featurizer
    # late import
    from scorers import Featurizer
    # NOTE(review): the global is assigned before the pool is created,
    # presumably so that worker processes can see it -- confirm.
    featurizer = Featurizer()

    pool = Pool(processes=num_processes)

    print("Loading {} businesseses".format(total))
    done = 0
    try:
        for group in chunker(businesses, num_processes):
            pool.map(concurrent_process, group)
            # Count a group only once it has been processed, so the bar never
            # overshoots ``total`` and ends at exactly 100%.
            done += len(group)
            sys.stdout.write('\r')
            sys.stdout.write("[%-50s] %d%% (%d/%d) " %
                             ('=' * (done * 50 // total),
                              done * 100 // total, done, total))
            sys.stdout.flush()
    except BaseException:
        # Stop the workers right away; the original error is re-raised.
        pool.terminate()
        raise
    else:
        pool.close()
    finally:
        # Always reap the worker processes so none are leaked.
        pool.join()