def test_lcss():
    """Check generate_lcss output for the two fixture trajectories.

    Both trajectories are built from the module-level fixtures and compared
    with a threshold argument of 60; four matching rows are expected.
    """
    column_names = {
        'latitude': LATITUDE,
        'longitude': LONGITUDE,
        'datetime': DATETIME,
        'traj_id': TRAJ_ID,
    }
    first_traj = MoveDataFrame(data=list_data, **column_names)
    second_traj = MoveDataFrame(data=list_data_2, **column_names)

    move_lcss = generate_lcss(first_traj, second_traj, 60)

    # (hour:minute shared by both timestamps, expected edge) for each row.
    matches = [
        ('07:02', [1938809894, 2527401909]),
        ('07:03', [2527401903, 2527388956]),
        ('07:04', [2527401895, 6502622934]),
        ('07:05', [2862103272, 2862103258]),
    ]
    expected_rows = []
    for minute, edge in matches:
        expected_rows.append([
            1,
            2,
            Timestamp('2019-06-05 ' + minute + ':42'),
            Timestamp('2019-06-05 ' + minute + ':40'),
            2,
            True,
            edge,
        ])

    expected = DataFrame(
        data=expected_rows,
        columns=[
            'ida', 'idb', 'datetime_ida', 'datetime_idb',
            'difference', 'equals', 'edge',
        ],
        index=[0, 1, 2, 3],
    )

    assert_frame_equal(move_lcss, expected)
    assert len(move_lcss) == 4
def convert_to(
    self, new_type: Text
) -> Union['PandasMoveDataFrame', 'DaskMoveDataFrame']:
    """
    Convert this object to the requested backing type.

    Parameters
    ----------
    new_type: 'pandas' or 'dask'
        The type the object should be converted to.

    Returns
    -------
    A subclass of MoveDataFrameAbstractModel
        ``self`` when the dask type is requested, or a new object built from
        the computed data when the pandas type is requested. Any other value
        yields ``None``.

    """
    # Already the dask flavour: nothing to convert.
    if new_type == TYPE_DASK:
        return self

    if new_type == TYPE_PANDAS:
        # Materialise the lazy data before handing it to the factory.
        computed = self._data.compute()
        return MoveDataFrame(
            computed,
            latitude=LATITUDE,
            longitude=LONGITUDE,
            datetime=DATETIME,
            traj_id=TRAJ_ID,
            type_=TYPE_PANDAS,
        )

    # Unrecognised type: no conversion is performed.
    return None
def _default_move_df():
    """Build the small four-point MoveDataFrame used as a test fixture.

    Rows are ``[lat, lon, datetime, id]``; the last point repeats the third
    one but carries trajectory id 2.
    """
    points = [
        [39.984094, 116.319236, '2008-10-23 05:53:05', 1],
        [39.984198, 116.319322, '2008-10-23 05:53:06', 1],
        [39.984224, 116.319402, '2008-10-23 05:53:11', 1],
        [39.984224, 116.319402, '2008-10-23 05:53:11', 2],
    ]
    return MoveDataFrame(data=points)
def read_csv(
    filepath_or_buffer: FilePathOrBuffer,
    latitude: str = LATITUDE,
    longitude: str = LONGITUDE,
    datetime: str = DATETIME,
    traj_id: str = TRAJ_ID,
    type_: str = TYPE_PANDAS,
    n_partitions: int = 1,
    **kwargs
):
    """
    Load a `csv` file and wrap its contents in a MoveDataFrame.

    Parameters
    ----------
    filepath_or_buffer : str or path object or file-like object
        Any valid string path is acceptable. The string could be a URL.
        Valid URL schemes include http, ftp, s3, gs, and file.
        For file URLs, a host is expected.
        A local file could be: file://localhost/path/to/table.csv.
        If you want to pass in a path object, pandas accepts any os.PathLike.
        By file-like object, we refer to objects with a read() method,
        such as a file handle (e.g. via builtin open function) or StringIO.
    latitude : str, optional
        Name of the latitude column, by default 'lat'
    longitude : str, optional
        Name of the longitude column, by default 'lon'
    datetime : str, optional
        Name of the datetime column, by default 'datetime'
    traj_id : str, optional
        Name of the trajectory id column, by default 'id'
    type_ : str, optional
        Type of the MoveDataFrame to build, by default 'pandas'
    n_partitions : int, optional
        Number of partitions for DaskMoveDataFrame, by default 1
    **kwargs : Pandas read_csv arguments
        Forwarded unchanged to the underlying reader.
        https://pandas.pydata.org/docs/reference/api/pandas.read_csv.html?highlight=read_csv#pandas.read_csv

    Returns
    -------
    MoveDataFrameAbstract subclass
        Trajectory data

    Examples
    --------
    >>> from pymove.utils.trajectories import read_csv
    >>> move_df = read_csv('geolife_sample.csv')
    >>> move_df.head()
              lat          lon              datetime  id
    0   39.984094   116.319236   2008-10-23 05:53:05   1
    1   39.984198   116.319322   2008-10-23 05:53:06   1
    2   39.984224   116.319402   2008-10-23 05:53:11   1
    3   39.984211   116.319389   2008-10-23 05:53:16   1
    4   39.984217   116.319422   2008-10-23 05:53:21   1
    >>> type(move_df)
    <class 'pymove.core.pandas.PandasMoveDataFrame'>

    """
    raw_frame = _read_csv(filepath_or_buffer, **kwargs)

    # Column names and backend options are passed positionally, in the
    # order the MoveDataFrame factory is called with here.
    return MoveDataFrame(
        raw_frame,
        latitude,
        longitude,
        datetime,
        traj_id,
        type_,
        n_partitions,
    )
def test_map_matching_node():
    """Check that map_matching_node updates the frame in place.

    After the call the frame is expected to hold the listed lat/lon values
    and a ``geometry`` column with the corresponding points.
    """
    move_df = MoveDataFrame(data=dict_data)
    map_matching_node(move_df)

    # (timestamp, lat, lon, (x, y) of the expected geometry point)
    samples = [
        ('2008-06-12 12:00:50', -3.779240, -38.678747, (-38.6787469, -3.7792405)),
        ('2008-06-12 12:00:56', -3.779240, -38.678747, (-38.6787469, -3.7792405)),
        ('2008-06-12 12:01:01', -3.778692, -38.678440, (-38.6784397, -3.7786924)),
        ('2008-06-12 12:01:06', -3.778692, -38.678440, (-38.6784397, -3.7786924)),
    ]
    expected_rows = [
        [1, lat, lon, Timestamp(moment), Point(x, y)]
        for moment, lat, lon, (x, y) in samples
    ]
    expected = DataFrame(
        data=expected_rows,
        columns=['id', 'lat', 'lon', 'datetime', 'geometry'],
        index=[0, 1, 2, 3],
    )

    assert_frame_equal(move_df, expected)
    assert move_df.len() == 4
def read_csv(
    filepath_or_buffer: FilePathOrBuffer,
    latitude: Optional[Text] = LATITUDE,
    longitude: Optional[Text] = LONGITUDE,
    datetime: Optional[Text] = DATETIME,
    traj_id: Optional[Text] = TRAJ_ID,
    type_: Optional[Text] = TYPE_PANDAS,
    n_partitions: Optional[int] = 1,
    **kwargs
):
    """
    Load a .csv file and wrap its contents in the structure used by PyMove.

    Parameters
    ----------
    filepath_or_buffer : str or path object or file-like object
        Any valid string path is acceptable. The string could be a URL.
        Valid URL schemes include http, ftp, s3, gs, and file.
        For file URLs, a host is expected.
        A local file could be: file://localhost/path/to/table.csv.
        If you want to pass in a path object, pandas accepts any os.PathLike.
        By file-like object, we refer to objects with a read() method,
        such as a file handle (e.g. via builtin open function) or StringIO.
    latitude : str, optional
        Name of the latitude column, by default 'lat'
    longitude : str, optional
        Name of the longitude column, by default 'lon'
    datetime : str, optional
        Name of the datetime column, by default 'datetime'
    traj_id : str, optional
        Name of the trajectory id column, by default 'id'
    type_ : str, optional
        Type of the MoveDataFrame to build, by default 'pandas'
    n_partitions : int, optional
        Number of partitions for DaskMoveDataFrame, by default 1
    **kwargs : Pandas read_csv arguments
        Forwarded unchanged to the underlying reader.
        https://pandas.pydata.org/docs/reference/api/pandas.read_csv.html?highlight=read_csv#pandas.read_csv

    Returns
    -------
    MoveDataFrameAbstract subclass
        Trajectory data

    """
    loaded = _read_csv(filepath_or_buffer, **kwargs)

    # Arguments are handed over positionally, exactly as received.
    return MoveDataFrame(
        loaded,
        latitude,
        longitude,
        datetime,
        traj_id,
        type_,
        n_partitions,
    )
def test_validate_columns():
    """Check MoveDataFrame.validate_move_data_frame on valid and invalid input.

    A valid frame is expected to end up with float/datetime/int dtypes after
    the call (the frame itself is inspected, so the call is expected to act
    in place). Three invalid frames must raise, respectively:

    - KeyError when a required column is absent,
    - ValueError when a coordinate cannot be converted,
    - ParserError when the datetime string cannot be parsed.
    """
    df = DataFrame(
        data=[[0, 0, '01-01-2020', 0]],
        columns=['lat', 'lon', 'datetime', 'id'],
    )
    MoveDataFrame.validate_move_data_frame(df)
    expected = Series(
        data=['float64', 'float64', '<M8[ns]', 'int64'],
        index=['lat', 'lon', 'datetime', 'id'],
        dtype='object',
        name=None,
    )
    assert_series_equal(df.dtypes, expected)

    # Missing 'datetime' column.
    df = DataFrame(data=[[0, 0]], columns=['lat', 'lon'])
    try:
        MoveDataFrame.validate_move_data_frame(df)
    except KeyError:
        pass
    else:
        # The failure message names the exception that was actually expected.
        raise AssertionError('KeyError not raised by MoveDataFrame')

    # Non-numeric latitude.
    df = DataFrame(
        data=[['a', 0, '01-01-2020']],
        columns=['lat', 'lon', 'datetime'],
    )
    try:
        MoveDataFrame.validate_move_data_frame(df)
    except ValueError:
        pass
    else:
        raise AssertionError('ValueError not raised by MoveDataFrame')

    # Unparseable datetime string.
    df = DataFrame(data=[[0, 0, '0']], columns=['lat', 'lon', 'datetime'])
    try:
        MoveDataFrame.validate_move_data_frame(df)
    except ParserError:
        pass
    else:
        raise AssertionError('ParserError not raised by MoveDataFrame')
def test_has_columns():
    """Check MoveDataFrame.has_columns with and without a 'datetime' column."""
    complete = DataFrame(columns=['lat', 'lon', 'datetime'])
    assert MoveDataFrame.has_columns(complete)

    # 'time' is not accepted in place of 'datetime'.
    incomplete = DataFrame(columns=['lat', 'lon', 'time'])
    assert not MoveDataFrame.has_columns(incomplete)
def test_format_labels():
    """Check the rename mapping produced by MoveDataFrame.format_labels.

    The four positional arguments are expected to map, in order, to
    'id', 'lat', 'lon' and 'datetime'.
    """
    labels = MoveDataFrame.format_labels('col1', 'col2', 'col3', 'col4')
    expected = {
        'col1': 'id',
        'col2': 'lat',
        'col3': 'lon',
        'col4': 'datetime',
    }
    assert_equal(labels, expected)
def __init__(
    self,
    data: Union[DataFrame, List, Dict],
    latitude: Optional[Text] = LATITUDE,
    longitude: Optional[Text] = LONGITUDE,
    datetime: Optional[Text] = DATETIME,
    traj_id: Optional[Text] = TRAJ_ID,
    n_partitions: Optional[int] = 1,
):
    """
    Build a dask-backed MoveDataFrame from raw trajectory data.

    The input is converted to a pandas DataFrame when needed, its columns
    are renamed to the PyMove standard names, the result is validated and
    finally stored as a dask dataframe. The following attributes are set:

    - self._data : the trajectory data as a dask dataframe.
    - self._type : the type of the underlying layer (TYPE_DASK).
    - self.last_operation : the last operation performed, initially None.

    Parameters
    ----------
    data : dict, list, numpy array or pandas.core.DataFrame
        Input trajectory data.
    latitude : str, optional, default 'lat'.
        Represents column name latitude.
    longitude : str, optional, default 'lon'.
        Represents column name longitude.
    datetime : str, optional, default 'datetime'.
        Represents column name datetime.
    traj_id : str, optional, default 'id'.
        Represents column name trajectory id.
    n_partitions : int, optional, default 1.
        Number of partitions of the dask dataframe.

    Raises
    ------
    AttributeError
        If the renamed data is missing one of the required columns.
    ValueError, ParserError
        If the data types can't be converted (presumably propagated from
        ``validate_move_data_frame`` -- confirm against that method).

    """
    if isinstance(data, dict):
        data = pd.DataFrame.from_dict(data)
    elif isinstance(data, (list, np.ndarray)) and len(data) >= 4:
        # NOTE(review): ``len(data)`` counts rows, not columns, so inputs
        # with fewer than four rows skip this conversion -- confirm whether
        # that is intended. Behaviour is left unchanged here.
        default_names = [LATITUDE, LONGITUDE, DATETIME, TRAJ_ID]
        n_columns = len(data[0])
        # Columns beyond the four standard ones are named by their position.
        # The previous loop indexed the list and caught KeyError, but a list
        # raises IndexError, so wider inputs crashed instead.
        extra_names = list(range(len(default_names), n_columns))
        data = pd.DataFrame(data, columns=default_names + extra_names)

    mapping_columns = MoveDataFrame.format_labels(
        traj_id, latitude, longitude, datetime
    )
    dsk = data.rename(columns=mapping_columns)

    if not MoveDataFrame.has_columns(dsk):
        raise AttributeError(
            'Couldn\'t instantiate MoveDataFrame because data has missing columns.'
        )

    MoveDataFrame.validate_move_data_frame(dsk)
    self._data = dask.dataframe.from_pandas(
        dsk, npartitions=n_partitions
    )
    self._type = TYPE_DASK
    self.last_operation = None
] def _default_move_df(): return MoveDataFrame(data=[ [39.984094, 116.319236, '2008-10-23 05:53:05', 1], [39.984198, 116.319322, '2008-10-23 05:53:06', 1], [39.984224, 116.319402, '2008-10-23 05:53:11', 1], [39.984224, 116.319402, '2008-10-23 05:53:11', 2], ]) move_df = MoveDataFrame( data=list_data, latitude=0, longitude=1, datetime=2, traj_id=3, ) def test_generate_distances(): move_distances = generate_distances(move_df) cols = [ 'lat', 'lon', 'datetime', 'id', 'edgeDistance', 'distFromTrajStartToCurrPoint' ] expected = DataFrame(data=[[
def read_csv(
    filename,
    latitude=LATITUDE,
    longitude=LONGITUDE,
    datetime=DATETIME,
    traj_id=TRAJ_ID,
    type_=TYPE_PANDAS,
    n_partitions=1,
    sep=',',
    encoding='utf-8',
    header='infer',
    names=None,
    index_col=None,
    usecols=None,
    dtype=None,
    nrows=None,
):
    """
    Load a .csv file with pandas and wrap it in the structure used by PyMove.

    The column named by ``datetime`` is parsed as dates while reading.

    Parameters
    ----------
    filename : String.
        The file to read; passed as the first argument to pandas read_csv.
    latitude : String, optional, default 'lat'.
        Represents the column name of feature latitude.
    longitude : String, optional, default 'lon'.
        Represents the column name of feature longitude.
    datetime : String, optional, default 'datetime'.
        Represents the column name of feature datetime.
    traj_id : String, optional, default 'id'.
        Represents the column name of feature id trajectory.
    type_ : String, optional, default 'pandas'.
        Represents the type of the MoveDataFrame
    n_partitions : int, optional, default 1.
        Represents number of partitions for DaskMoveDataFrame
    sep : String, optional, default ','.
        Delimiter to use.
    encoding : String, optional, default 'utf-8'.
        Encoding to use for UTF when reading/writing
    header : int, list of int, default 'infer'
        Row number(s) to use as the column names, and the start of the data.
        Default behavior is to infer the column names: if no names are
        passed the behavior is identical to header=0 and column names are
        inferred from the first line of the file, if column names are
        passed explicitly then the behavior is identical to header=None
    names : array-like, optional
        List of column names to use. If the file contains a header row,
        then you should explicitly pass header=0 to override the column
        names. Duplicates in this list are not allowed.
    index_col : int, str, sequence of int / str, or False, default None
        Column(s) to use as the row labels of the DataFrame, either given
        as string name or column index. If a sequence of int / str is
        given, a MultiIndex is used.
    usecols : list-like or callable, optional, default None
        Return a subset of the columns. If list-like, all elements must
        either be positional (i.e. integer indices into the document
        columns) or strings that correspond to column names provided either
        by the user in names or inferred from the document header row(s).
    dtype : Type name or dict of column -> type, optional, default None
        Data type for data or columns.
        E.g. {'a': np.float64, 'b': np.int32, 'c': 'Int64'}
        Use str or object together with suitable na_values settings to
        preserve and not interpret dtype.
    nrows : int, optional, default None
        Number of rows of file to read.
        Useful for reading pieces of large files.

    Returns
    -------
    pymove.core.MoveDataFrameAbstract subclass.
        Trajectory data.

    """
    # Options forwarded to pandas; the datetime column is always parsed.
    reader_options = {
        'sep': sep,
        'encoding': encoding,
        'header': header,
        'names': names,
        'parse_dates': [datetime],
        'index_col': index_col,
        'usecols': usecols,
        'dtype': dtype,
        'nrows': nrows,
    }
    frame = pd.read_csv(filename, **reader_options)

    return MoveDataFrame(
        frame,
        latitude,
        longitude,
        datetime,
        traj_id,
        type_,
        n_partitions,
    )
def test_map_matching_edge():
    """Check that map_matching_edge updates the frame in place.

    After the call every point is expected to carry the same ``edge`` pair
    and the same ``geometry`` line.
    """
    move_df = MoveDataFrame(data=dict_data)
    map_matching_edge(move_df)

    def matched_line():
        # A fresh geometry per row, mirroring one literal per expected row.
        return LineString([
            (-38.6784397, -3.7786924),
            (-38.6784773, -3.7787981),
            (-38.6785128, -3.7788737),
            (-38.678547, -3.7789333),
            (-38.6787079, -3.7791822),
            (-38.6787469, -3.7792405),
        ])

    # (lat, lon, timestamp) of each expected row.
    samples = [
        (-3.779936, -38.67921, '2008-06-12 12:00:50'),
        (-3.779240, -38.678747, '2008-06-12 12:00:56'),
        (-3.778692, -38.67844, '2008-06-12 12:01:01'),
        (-3.778191, -38.678071, '2008-06-12 12:01:06'),
    ]
    expected_rows = []
    for lat, lon, moment in samples:
        expected_rows.append([
            1,
            lat,
            lon,
            Timestamp(moment),
            (3971291384, 7625732459),
            matched_line(),
        ])

    expected = DataFrame(
        data=expected_rows,
        columns=['id', 'lat', 'lon', 'datetime', 'edge', 'geometry'],
        index=[0, 1, 2, 3],
    )

    assert_frame_equal(move_df, expected)
    assert move_df.len() == 4