def __init__(self, pois, level=None):
    """
    Keep a reference to the POI table and, if asked, reduce its categories
    to a single level.

    :param pois: dataframe with POIs and respective coordinates. It must
        expose a ``categories`` column whenever ``level`` is given.
    :param level: level of the category we want
        (e.g. Shops & Services:Gas Stations -> level 0: Shops & Services).
        Use it when a clean text file has to be produced for word2vec.
        With ``None`` (the default) the categories are left untouched.
    """
    self._pois = pois
    self._level = level

    # Nothing to normalise when no level was requested.
    if level is None:
        return

    # The ``categories`` column is overwritten in place, i.e. on the very
    # dataframe object the caller handed in (no copy is taken).
    raw_categories = list(pois['categories'])
    pois['categories'] = utils.select_category(raw_categories, level)
def from_csv(cls, input, sep='\t', category_column='categories', level=5):
    """
    Alternate constructor: build an instance from a delimited text file.

    :param input: path or buffer forwarded to ``pd.read_csv``.
    :param sep: field separator of the file (tab by default).
    :param category_column: name of the column holding the raw category
        strings.
    :param level: category level forwarded to ``utils.select_category``.
    :return: ``cls`` built from the rows that have a usable category.
    """
    # Load the Foursquare dataset mapped on a particular grid.
    frame = pd.read_csv(input, sep=sep)

    # Force the raw categories to text so they can be handed over as
    # plain strings.
    frame[category_column] = frame[category_column].astype(str)

    # Derive the per-record category at the requested level.
    raw_categories = list(frame[category_column])
    frame.loc[:, "category"] = utils.select_category(raw_categories, level)

    # Discard the records whose derived category is the literal "nan".
    has_category = frame["category"] != "nan"
    return cls(frame.loc[has_category])
def cell_vector_representation(poi_grid, w2v_model, level, output_file, size):
    """
    Write one summed Word2Vec vector per grid cell to a tab-separated file.

    For each cell (rows grouped by ``cellID``):
        - every entry of the ``categories`` column is split on ``':'`` and
          reduced with ``utils.select_category`` for the given level; the
          last element of the result is looked up in the Word2Vec model
        - categories missing from the model vocabulary are skipped
        - the vectors found are combined with ``sum_vectors``; a cell with
          no known category gets a zero vector of length ``size``

    Each output line has the form ``<cellID>\\t<v0>\\t<v1>...``.

    :param poi_grid: input handed to ``csv_2_geodf`` (spatial grid with the
        POIs mapped on each cell).
    :param w2v_model: path of a saved gensim Word2Vec model.
    :param level: category level forwarded to ``utils.select_category``.
    :param output_file: path of the text file to write (overwritten).
    :param size: length of the zero vector used for cells without any
        known category; presumably the model dimensionality.
    :return: None. The result is only written to ``output_file``.
    """
    # load the POIs mapped on the grid
    gdf = csv_2_geodf(poi_grid)

    # load w2v_model
    model = gensim.models.Word2Vec.load(w2v_model)
    # Indexing the model object directly is deprecated and no longer
    # supported by gensim >= 4.0; the word vectors live on ``model.wv``.
    # Fall back to the model itself for very old releases without ``wv``.
    vectors = getattr(model, 'wv', model)

    with open(output_file, 'w') as out:
        # group every cell
        for cell, group in gdf.groupby('cellID'):
            # Only the current cell's vectors are kept, so memory use does
            # not grow with the number of cells.
            cell_vectors = []
            for categories_raw in group['categories']:
                # select level
                category = utils.select_category(
                    categories_raw.split(':'), level)[-1]
                # lookup category in w2v
                try:
                    vector = vectors[category]
                except KeyError:
                    # category not in the vocabulary: deliberately ignored
                    continue
                cell_vectors.append(np.array(vector))

            if not cell_vectors:
                cell_vectors = [np.zeros(int(size))]

            # sum vectors
            summed = sum_vectors(cell_vectors)
            out.write(str(cell) + '\t' + '\t'.join(map(str, summed)) + '\n')
def generate(self, model, area_based=True, strategy='avg',
             output_file=None, size=None):
    """
    Build one summed Word2Vec vector for every grid cell of the stored POIs.

    The POIs held by the instance are grouped by ``cellID``. Every entry of
    the ``categories`` column is split on ``':'``, reduced with
    ``utils.select_category`` at the instance level, and the last element
    of the result is looked up in the Word2Vec model. Categories missing
    from the vocabulary are skipped, and a cell without any known category
    gets a zero vector.

    TODO: create the NOT area based function

    NOTE(review): ``area_based`` and ``strategy`` are accepted but not used
    yet; vectors are always summed with ``sum_vectors``, whatever the value
    of ``strategy``.
    NOTE(review): the instance level is forwarded as is, even when it is
    ``None``; confirm that ``utils.select_category`` supports that.

    :param model: path of a saved gensim Word2Vec model.
    :param area_based: currently unused.
    :param strategy: currently unused.
    :param output_file: optional path; when given, one line per cell is
        written as ``<cellID>\\t<v0>\\t<v1>...`` (file is overwritten).
    :param size: length of the zero vector used for cells without any
        known category. Defaults to the dimensionality of the model.
    :return: dict mapping each ``cellID`` to its summed vector.
    """
    # load w2v_model
    w2v = gensim.models.Word2Vec.load(model)
    # Indexing the model object directly is deprecated and no longer
    # supported by gensim >= 4.0; the word vectors live on ``model.wv``.
    # Fall back to the model itself for very old releases without ``wv``.
    vectors = getattr(w2v, 'wv', w2v)
    dim = int(size) if size is not None else vectors.vector_size

    cell_vectors = {}
    # group every cell
    # assumes the stored dataframe carries a ``cellID`` column (POIs
    # already mapped on a grid) -- TODO confirm
    for cell, group in self._pois.groupby('cellID'):
        found = []
        for categories_raw in group['categories']:
            # select level
            category = utils.select_category(
                categories_raw.split(':'), self._level)[-1]
            # lookup category in w2v
            try:
                vector = vectors[category]
            except KeyError:
                # category not in the vocabulary: deliberately ignored
                continue
            found.append(np.array(vector))

        if not found:
            found = [np.zeros(dim)]

        # sum vectors
        cell_vectors[cell] = sum_vectors(found)

    if output_file is not None:
        with open(output_file, 'w') as out:
            for cell, summed in cell_vectors.items():
                out.write(
                    str(cell) + '\t' + '\t'.join(map(str, summed)) + '\n')

    return cell_vectors