Example #1
    def distance_based_sequence(self, band_size, outfile):

        obs = self._distance(band_size)

        # First step - get the categories for observation ID
        obs_1 = obs.merge(self._pois[['categories']], left_on='observation', right_index=True).rename(
            columns={'categories': 'cat_observation'})

        # Second step - get the categories for observed ID
        obs_2 = obs_1.merge(self._pois[['categories']], left_on='observed', right_index=True).rename(
            columns={'categories': 'cat_observed'})

        # Order by the inverse of distance, which is not the real distance but the interaction value from PySAL.
        # The interaction among points decreases as the distance increases.
        obs_2.sort_values(by=['observation', 'distance'], ascending=False, inplace=True)

        # Third step - build the sequence joining the words. We keep sequences with at least 3 words.
        obs_3 = obs_2.groupby(['observation', 'cat_observation']).apply(
            lambda x: '\t'.join(x['cat_observed']) if len(x) > 2 else None).reset_index().dropna().rename(
            columns={0: "sequence"})
        obs_3.loc[:, "complete"] = obs_3['cat_observation'] + "\t" + obs_3['sequence']

        # Fourth step - join the pois dataframe with the sequences and save into a csv
        logger.info("Save sequences")
        self._pois[['categories', 'geometry']].merge(obs_3, left_index=True, right_on='observation')[
            ['categories', 'geometry', 'complete']].to_csv(outfile.split(".csv")[0] + "_check.csv", sep='\t', index=False)

        obs_3[['complete']].to_csv(outfile, index=False, header=False)
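
The sequence-building step (the third step above) can be tried in isolation; a minimal sketch with toy data, assuming only pandas:

import pandas as pd

# Toy long-format table: one row per (observation, its category, a neighbouring category)
obs = pd.DataFrame({
    'observation': [1, 1, 1, 2],
    'cat_observation': ['Cafe', 'Cafe', 'Cafe', 'Bank'],
    'cat_observed': ['Bar', 'Museum', 'Park', 'Hotel'],
})

# Keep observations with at least 3 neighbours and join their categories into a tab-separated sequence
seq = obs.groupby(['observation', 'cat_observation']).apply(
    lambda x: '\t'.join(x['cat_observed']) if len(x) > 2 else None).reset_index().dropna().rename(
    columns={0: 'sequence'})
seq.loc[:, 'complete'] = seq['cat_observation'] + '\t' + seq['sequence']
print(seq)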
Example #2
    def _distance(self, band_size=100):

        logger.info("Building sequences for each point in the sapce")
        wthresh = pysal.weights.DistanceBand.from_dataframe(
            self._pois, band_size, p=2, binary=False, ids=self._pois.index)

        ds = []
        for index, indexes in wthresh.neighbors.items():
            if len(indexes) == 0:
                d = {}
                d['observation'] = index
                d['observed'] = index
                d['distance'] = None
                ds.append(d)
            else:
                for i in range(len(indexes)):
                    d = {}
                    d['observation'] = index
                    d['observed'] = indexes[i]
                    d['distance'] = wthresh.weights[index][i]
                    ds.append(d)

        obs = pd.DataFrame(ds)

        return obs
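
A minimal sketch of the distance-band weights used above, assuming libpysal is available (the legacy pysal namespace exposes the same DistanceBand class); the coordinates are hypothetical:

from libpysal.weights import DistanceBand

points = [(0, 0), (0, 50), (40, 0), (300, 300)]  # hypothetical coordinates, in metres
w = DistanceBand.from_array(points, threshold=100, p=2, binary=False)

# w.neighbors maps each point id to the ids within the band;
# with binary=False, w.weights holds inverse-distance interaction values (larger means closer)
print(w.neighbors)
print(w.weights)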
Example #3
def merge_features_targets(features_path, targets_path, merge_strategy):
    """
    Reads Features and Targets Dataframes (Urban Atlas data)
    Merges them according to the provided merge strategy
    """

    # load Targets DataFrame
    targets_df = pd.read_csv(targets_path)

    # load Features DataFrame
    features_df = pd.read_csv(features_path, sep="\t")

    # select only relevant columns
    targets_df_relevant = targets_df.loc[:, ['cellID', 'predominant']]
    features_df_relevant = features_df.loc[:, ['cellID']]
    features_df_relevant.drop_duplicates(subset=["cellID"], inplace=True)
    # Merge Features and Targets
    if merge_strategy not in [0, 1]:
        logger.info(
            "Please select a valid merge strategy. Options are (0) Left Join, (1) Inner Join."
        )
    if merge_strategy == 0:
        # Merge the UA and cellvector dataframes
        merged_df = targets_df_relevant.merge(features_df_relevant,
                                              on="cellID",
                                              how='left')
    else:
        # Merge the UA and cellvector dataframes
        merged_df = targets_df_relevant.merge(features_df_relevant,
                                              on="cellID",
                                              how='inner')
    return merged_df
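
A hypothetical invocation (the file paths are placeholders): merge_strategy=0 performs a left join on cellID, any other accepted value an inner join:

merged = merge_features_targets('features.tsv', 'targets.csv', merge_strategy=0)
print(merged.head())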
Example #4
    def __init__(self, pois, w2v_model, binary=False):

        super(cell2vec, self).__init__()
        logger.info("Loading w2v model")
        self._pois = pois
        self._categories = pois["category"].drop_duplicates().values

        self._model = gensim.models.KeyedVectors.load_word2vec_format(w2v_model, binary=binary)
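
A hypothetical construction of the object, assuming a pre-trained word2vec file in binary format (the toy POIs frame and the model file name are placeholders):

import pandas as pd

pois_dataframe = pd.DataFrame({'category': ['Coffee Shops', 'Banks']})  # toy POIs; real data has more columns
c2v = cell2vec(pois_dataframe, 'GoogleNews-vectors-negative300.bin', binary=True)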
Example #5
    def alphabetically_sequence(self, outfile):

        if 'cellID' not in self._pois.columns:
            raise ValueError(
                "The input file with POIs must contain the column cellID.")

        logger.info("Build the sequences")
        self._pois.sort_values(by=["cellID", "categories"]).groupby('cellID')\
            .apply(lambda x: '\t'.join(x['categories']) if len(x) > 2 else None).dropna().to_csv(outfile, index=False, header=None)
Example #6
def merge_features_targets(features_path, targets_path, merge_strategy):
    """
    Reads Features and Targets Dataframes (Urban Atlas data)
    Merges them according to the provided merge strategy
    """

    # load Features DataFrame
    features_df = pd.read_csv(features_path, sep='\t', header=None)

    cols = [int(i) for i in features_df.columns]
    cols[0] = 'cellID'
    features_df.columns = cols
    features_df.columns = list(
        map(lambda x: 'f_fs_' + str(x)
            if x != "cellID" else x, features_df.columns))

    # load Targets DataFrame
    targets_df = pd.read_csv(targets_path)
    targets_df.columns = list(
        map(lambda x: 't_' + x if x != "cellID" else x, targets_df.columns))

    # select only relevant columns
    targets_df_relevant = targets_df.loc[:, ['cellID', 't_predominant']]

    # Merge Features and Targets
    if merge_strategy not in [1, 2, 3]:
        logger.info(
            "Please select a valid merge strategy. Options are (1) Left, (2) Right, (3) Inner Join."
        )
    if merge_strategy == 1:
        # Left join: keep every Urban Atlas (targets) cell
        merged_features_targets = targets_df_relevant.merge(features_df,
                                                            on="cellID",
                                                            how='left')

    elif merge_strategy == 2:
        # Right join: keep every cellvector (features) cell
        merged_features_targets = targets_df_relevant.merge(features_df,
                                                            on="cellID",
                                                            how='right')

    else:
        # Inner join: keep only cells present in both dataframes
        merged_features_targets = targets_df_relevant.merge(features_df,
                                                            on="cellID",
                                                            how='inner')

    # remove empty
    merged_features_targets.dropna(inplace=True)

    return merged_features_targets
Example #7
    def from_csv(cls, input, model, binary=False, sep='\t', category_column='categories', level=5):

        logger.info("Loading mapped POIs")

        # load foursquare dataset mapped on a particular grid
        df = pd.read_csv(input, sep=sep)
        df[category_column] = df[category_column].astype(str)

        # assign category to each record of the dataset

        df.loc[:, "category"] = utils.select_category(list(df[category_column]), level)

        # drop entry with empty category
        df = df.loc[df["category"] != "nan"]

        return cls(df, model, binary=binary)
Example #8
    def from_csv(cls, inputfile, sep='\t', crs=constants.default_crs):
        """
        Read csv file with POIs details, including latitude and longitude
        :param inputfile:
        :param sep:
        :return:
        """
        #  Read foursquare MAPPED onto the grid
        logger.info("Reading POIs dataset.")
        df = pd.read_csv(inputfile, sep=sep)

        # Create GeoDataFrame from the read DataFrame
        logger.info("Create GeoDataFrame")
        geometry = [Point(xy) for xy in zip(df.longitude, df.latitude)]
        gdf = gpd.GeoDataFrame(
            df, index=df.index, geometry=geometry, crs={'init': crs})

        return cls(gdf.to_crs({'init': constants.universal_crs}))
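
The GeoDataFrame construction and reprojection can be sketched on their own, assuming a recent geopandas; the target CRS below is only illustrative (the method above reprojects to constants.universal_crs instead):

import pandas as pd
import geopandas as gpd
from shapely.geometry import Point

# Hypothetical POIs with WGS84 coordinates
df = pd.DataFrame({'name': ['a', 'b'], 'longitude': [9.19, 9.21], 'latitude': [45.46, 45.48]})
geometry = [Point(xy) for xy in zip(df.longitude, df.latitude)]
gdf = gpd.GeoDataFrame(df, geometry=geometry, crs='epsg:4326')
gdf_utm = gdf.to_crs('epsg:32632')  # reproject to a metric CRS (UTM zone 32N, chosen for illustration)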
Example #9
    def start(self, grid, output, restart=None):

        # Store the output path on the instance
        self.output = output

        # Initialize Foursquare client authentication
        fs_client = foursquare.Foursquare(self.client_id, self.client_secret)

        start_point = 0

        # Remove the file if it already exists
        if restart is None:
            try:
                os.remove(output)
            except OSError:
                pass
        else:
            start_point = restart

        logger.info("Calls to do: " + str(len(grid) - start_point))

        #  Iterate over the spatial grid cells. For each cell call Foursquare API
        for ind in range(start_point, len(grid)):

            # Set bounding box for the request
            row = grid.iloc[ind]
            g = str(row.geometry)
            g_parse = g.split("((")[1].split("))")[0].split(", ")
            sw = g_parse[0].split(" ")  # South-West
            ne = g_parse[2].split(" ")  # North-East

            logger.info(
                str(ind) + " - " + str(sw[1]) + ", " + str(sw[0]) + ", " +
                str(ne[1]) + ", " + str(ne[0]))

            # Setup parameters for calling venue search API
            params = dict(sw=sw[1] + ", " + sw[0],
                          ne=ne[1] + ", " + ne[0],
                          intent="browse")

            self.get_venues_search(fs_client, params)

        self.write_file()

        # Sanity check and removing duplicates
        logger.info("Sanity check and removing duplicates.")
        df = pd.read_csv(self.output)
        df.drop_duplicates(['name', 'latitude', 'longitude'], inplace=True)
        df.to_csv(self.output, encoding='utf-8', index=False)
Example #10
    def nearest_based_sequence(self, outfile, inputgrid):

        logger.info("Load the grid.")

        # Load inputgrid
        g = Grid.from_file(inputgrid)
        grid = g.grid.to_crs({'init': constants.universal_crs})
        grid.loc[:, 'centroid'] = grid.centroid

        df = self._pois.copy()

        df = df.merge(grid[['cellID', 'centroid']], on='cellID')

        logger.info("Compute centroid for cells and build the sequences")
        df.loc[:, 'distance'] = df.apply(self._centroid_distance, axis=1)
        df.sort_values(by=['cellID', 'distance'], inplace=True, ascending=True)

        logger.info("Save sequences")
        df.groupby('cellID').apply(self._nearest).dropna().to_csv(outfile, index=False, header=None)
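
The centroid-distance ordering can be illustrated with toy data; _centroid_distance and _nearest are not shown above, so this sketch inlines equivalent logic:

import pandas as pd
from shapely.geometry import Point

centroid = Point(0, 0)  # hypothetical cell centroid
df = pd.DataFrame({'cellID': [1, 1, 1],
                   'categories': ['Bar', 'Cafe', 'Park'],
                   'geometry': [Point(3, 0), Point(1, 0), Point(2, 0)]})

# Sort the POIs of the cell by distance from the centroid and join the categories, nearest first
df['distance'] = df['geometry'].apply(lambda p: p.distance(centroid))
df.sort_values(by=['cellID', 'distance'], ascending=True, inplace=True)
print('\t'.join(df['categories']))  # Cafe, Park, Bar, separated by tabs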
Example #11
    def get_venues_search(self, fs_client, params):
        call_flag = False

        self.request_counter += 1

        logger.info("# Requests " + str(self.request_counter))

        url = "https://api.foursquare.com/v2/venues/search"

        logger.info(url)

        # ------------ start request! ---------------

        while call_flag is False:

            try:
                data = fs_client.venues.search(params)
                call_flag = True
            except foursquare.RateLimitExceeded as rle:
                waiting_time = 3600
                logger.info("wait", waiting_time)
                self.write_file()
                time.sleep(waiting_time)
            except Exception as exc:
                logger.error("ERROR: {0}".format(exc))

        # ----------- end request ---------------------

        tot = data['venues']
        logger.info("Number of venues: " + str(len(tot)))

        # Iterate over venues
        for glob in range(0, len(tot)):
            current_cat = data['venues'][glob]['categories']
            if len(current_cat) == 0:
                continue

            checkin = data['venues'][glob]['stats']['checkinsCount']
            user = data['venues'][glob]['stats']['usersCount']
            name = data['venues'][glob]['name']
            current_loc = data['venues'][glob]['location']
            lat = current_loc['lat']
            lon = current_loc['lng']

            # Check presence of address and cross street
            if 'address' not in current_loc:
                address = ""
            else:
                address = current_loc['address']
            if 'crossStreet' not in current_loc:
                crossStreet = ""
            else:
                crossStreet = current_loc['crossStreet']

            # Get categories
            if ('pluralName' in current_cat[0]):
                current_cat = current_cat[0]['pluralName']
            else:
                current_cat = current_cat[0]['name']

            if current_cat not in self.cat.index:
                continue

            cat_name = [
                self.cat.loc[current_cat][e]
                for e in self.cat.loc[current_cat].index
                if e.endswith('name') and self.cat.loc[current_cat][e] != "-"
            ]

            # Append the venue record to the results dataframe
            self.foursquare_data = self.foursquare_data.append(
                {
                    "name": name,
                    "address": address,
                    "crossStreet": crossStreet,
                    "categories": ':'.join(cat_name),
                    "checkin": checkin,
                    "usercount": user,
                    "latitude": lat,
                    "longitude": lon
                },
                ignore_index=True)

        # Check if there is still rate remaining to call API
        if int(fs_client.rate_remaining) <= 100 and int(
                fs_client.rate_limit) > 0:
            waiting_time = 3600
            logger.info("wait", waiting_time)
            self.write_file()
            time.sleep(waiting_time)

        # Recursive if there are more than 10 places and the distance is greater than 20 meters
        if len(tot) >= 10 and great_circle(params['ne'],
                                           params['sw']).meters >= 20:

            x1, y1 = params['ne'].split(',')
            x2, y2 = params['sw'].split(',')

            x12 = str((float(x1) + float(x2)) / 2.0)
            y12 = str((float(y1) + float(y2)) / 2.0)

            new_params = [
                dict(ne=x12 + ", " + y1, sw=x2 + ", " + y12, intent="browse"),
                dict(ne=x1 + ", " + y1, sw=x12 + ", " + y12, intent="browse"),
                dict(ne=x12 + ", " + y12, sw=x2 + ", " + y2, intent="browse"),
                dict(ne=x1 + ", " + y12, sw=x12 + ", " + y2, intent="browse"),
            ]

            for param in new_params:
                self.get_venues_search(fs_client, param)
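
The recursion at the end splits the bounding box into four quadrants around its midpoint; a standalone sketch of that split with hypothetical corner coordinates:

# ne/sw strings in the "lat, lng" form used by the request parameters
ne, sw = "45.48, 9.22", "45.46, 9.18"
x1, y1 = map(float, ne.split(','))
x2, y2 = map(float, sw.split(','))
x12, y12 = (x1 + x2) / 2.0, (y1 + y2) / 2.0

# Four sub-boxes, one per quadrant, mirroring new_params above
quadrants = [
    {'ne': (x12, y1), 'sw': (x2, y12)},
    {'ne': (x1, y1), 'sw': (x12, y12)},
    {'ne': (x12, y12), 'sw': (x2, y2)},
    {'ne': (x1, y12), 'sw': (x12, y2)},
]
print(quadrants)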
Example #12
def main(argv):

    parser = argparse.ArgumentParser('Build your own grid.')

    parser.add_argument('-o',
                        '--outputfolder',
                        help='Output folder where to save the matrix.',
                        action='store',
                        dest='outputfolder',
                        required=True,
                        type=str)

    parser.add_argument(
        '-i',
        '--input',
        help=
        'Input file with points of interest. NOTE: in the case of strategy=nearest|alphabetically, the input file must contain the column cellID.',
        action='store',
        dest='inputfile',
        required=True,
        type=str)

    parser.add_argument('-a',
                        '--area',
                        action='store',
                        dest='area',
                        help='Area name',
                        default=None,
                        type=str)

    parser.add_argument(
        '-s',
        '--size',
        action='store',
        dest='size',
        help='Word2Vec vector size. Used when employing the Google News model.',
        default=None,
        type=str)

    parser.add_argument('-v',
                        '--verbose',
                        help='Level of output verbosity.',
                        action='store',
                        dest='verbosity',
                        default=0,
                        type=int,
                        nargs="?")

    args = parser.parse_args()

    if (args.verbosity == 1):
        logging.basicConfig(format='%(levelname)s: %(message)s',
                            level=logging.INFO)

    elif (args.verbosity == 2):
        logging.basicConfig(format='%(levelname)s: %(message)s',
                            level=logging.DEBUG)

    logger.info("Loading w2v model.")

    model = None

    ext = tuple([".biz", ".bin"])

    if (args.inputfile.endswith(ext)):
        model = gensim.models.KeyedVectors.load_word2vec_format(args.inputfile,
                                                                binary=True)
    else:
        model = gensim.models.Word2Vec.load(args.inputfile)

    tree = pd.read_csv(pkg_resources.resource_filename(
        'geol', '/resources/category_tree.csv'),
                       encoding='iso-8859-1')

    words = tree['level1_name'].dropna().drop_duplicates().tolist() + \
        tree['level2_name'].dropna().drop_duplicates().tolist() + \
        tree['level3_name'].dropna().drop_duplicates().tolist() + \
        tree['level4_name'].dropna().drop_duplicates().tolist()

    m = re.search('_s([0-9]+)_', args.inputfile)

    if args.size:
        size = args.size
    elif m:
        size = m.group(1)
    else:
        # Without an explicit size and no match in the filename, size would otherwise be undefined
        parser.error("Word2Vec size not provided and not found in the model filename. Use -s/--size.")

    m = re.search('.+/(.+).model', args.inputfile)

    if m:
        model_details = m.group(1)
    else:
        model_details = 'gnews'

    outputfile = os.path.abspath(
        os.path.join(args.outputfolder,
                     "matrix_" + args.area + "_" + model_details + ".txt"))

    f = open(outputfile, 'w', encoding='utf-8')

    for word in words:

        word = utils.normalize_word(word)

        w = word.split(' ')
        v = [0] * int(size)

        if len(w) > 1:
            tmp_w2v = []
            for e in w:
                if e in model:
                    tmp_w2v.append(model[e])
            if len(tmp_w2v) > 0:
                v = np.mean(tmp_w2v, axis=0)
        elif word in model:
            v = model[word]

        s = ','.join(map(str, v))
        f.write(word.replace(" ", "_") + "::n" + "\t1.0\t0\t" + s + "\n")

    f.close()
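
The vector averaging for multi-word categories can be checked in isolation with toy vectors (the numbers are made up):

import numpy as np

vecs = {'coffee': np.array([0.1, 0.2]), 'shop': np.array([0.3, 0.0])}
words = 'coffee shop'.split(' ')

# Mean of the available word vectors; here [0.2, 0.1]
v = np.mean([vecs[w] for w in words if w in vecs], axis=0)
s = ','.join(map(str, v))
print(s)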
Example #13
def main(argv):

    parser = argparse.ArgumentParser('Foursquare mapping to a spatial grid.')

    parser.add_argument('-i', '--input',
                        help='POIs file with relative coordinates.',
                        action='store',
                        dest='input',
                        required=True,
                        type=str)

    parser.add_argument('-p', '--prefix',
                        action='store',
                        dest='prefix',
                        help='Prefix for the filename specifying the city name.',
                        required=True,
                        type=str)

    parser.add_argument('-g', '--grid',
                        help='Input grid for the mapping. If crs is not WGS84, specify it with the param -c',
                        action='store',
                        dest='grid',
                        required=True,
                        type=str)

    parser.add_argument('-c', '--crs',
                        help='Coordinate Reference System for the input grid. It is requested only if it is different from WGS84.',
                        action='store',
                        dest='crs',
                        default='epsg:4326',
                        type=str)

    parser.add_argument('-o', '--outputfolder',
                        help='Output folder where to save the mapped file.',
                        action='store',
                        dest='outputfolder',
                        required=True,
                        type=str)

    parser.add_argument('-lat', '--latitude',
                        help='Latitude name.',
                        action='store',
                        dest='latitude',
                        default='latitude',
                        type=str)

    parser.add_argument('-long', '--longitude',
                        help='Longitude name.',
                        action='store',
                        dest='longitude',
                        default='longitude',
                        type=str)

    parser.add_argument('-v', '--verbose',
                        help='Level of output verbosity.',
                        action='store',
                        dest='verbosity',
                        default=0,
                        type=int,
                        nargs="?")

    args = parser.parse_args()

    latitude = args.latitude
    longitude = args.longitude

    if(args.verbosity == 1):
        logger.setLevel(logging.INFO)

    elif(args.verbosity == 2):
        logger.setLevel(logging.DEBUG)

    # Load the grid
    logger.info("Load the grid")
    gdf = gpd.GeoDataFrame.from_file(args.grid)
    gdf.crs = {'init': args.crs}

    if args.crs != 'epsg:4326':
        gdf = gdf.to_crs({'init': 'epsg:4326'})

    # Load POIs
    logger.info("Load POIs")
    df = pd.DataFrame(pd.read_csv(args.input, sep=",", low_memory=False))

    # Create Point from latitude, longitude pairs and build a GeoDataFrame
    logger.info("Build geometry")
    geometry = [Point(xy) for xy in zip(df[longitude], df[latitude])]
    data = gpd.GeoDataFrame(df, crs={'init': 'epsg:4326'}, geometry=geometry)
    data.to_crs(gdf.crs, inplace=True)

    # Check Geometry Validity
    data = data[data.geometry.is_valid]

    # Spatial Join with the grid to associate each entry to the related cell ('within') - LEFT
    join = gpd.sjoin(gdf[['cellID', 'geometry']], data, how='left', op='within')

    # Remove additional columns
    join.drop(['index_right', 'geometry'], axis=1, inplace=True)

    # Save output
    logger.info("Save output file")
    outputfile = os.path.abspath(os.path.join(args.outputfolder, args.prefix + "_mapped_foursquare_pois.csv"))
    join.to_csv(outputfile, index=False, sep='\t', float_format='%.6f')