def from_query(self, query, append=False):
    """Fill the DataFrame with the result set of a database query.

    Args:
        query: The query used to retrieve the data.
        append (bool): If a DataFrame already exists, should the table be
            appended?

    Returns:
        The current DataFrame instance, allowing method chaining.
    """
    # Assemble the JSON command understood by the getml engine.
    cmd = {
        "type_": "DataFrame.from_query",
        "name_": self.name,
        "query_": query,
        "categoricals_": self.categorical_names,
        "discretes_": self.discrete_names,
        "join_keys_": self.join_key_names,
        "numericals_": self.numerical_names,
        "targets_": self.target_names,
        "time_stamps_": self.time_stamp_names,
        "append_": append,
    }

    comm.send(cmd)

    return self
def group_by(self, join_key, name, aggregations):
    """Creates a new DataFrame by grouping over a join key.

    Args:
        join_key (str): Name of the join key to group by.
        name (str): Name of the new DataFrame.
        aggregations: List containing aggregations.

    Returns:
        :class:`~getml.engine.DataFrame`
    """
    # Assemble and dispatch the engine command.
    cmd = {
        "name_": name,
        "type_": "DataFrame.group_by",
        "join_key_name_": join_key,
        "df_name_": self.name,
        "aggregations_": [agg.thisptr for agg in aggregations],
    }

    comm.send(cmd)

    # Return a refreshed handle pointing at the newly created data frame.
    return DataFrame(name).refresh()
def connect_sqlite3(
        name=":memory:",
        time_formats=None):
    """ Creates a new SQLite3 database connection.

    Args:
        name (str): Name of the sqlite3 file. If the file does not exist, it
            will be created. Set to ":memory:" for a purely in-memory SQLite3
            database.
        time_formats (List[str], optional): The formats tried when parsing
            time stamps. Defaults to
            ``["%Y-%m-%dT%H:%M:%s%z", "%Y-%m-%d %H:%M:%S", "%Y-%m-%d"]``.
            Check out
            https://pocoproject.org/docs/Poco.DateTimeFormatter.html#9946
            for the options.
    """
    # -------------------------------------------
    # Avoid a shared mutable default argument - build the list per call.

    if time_formats is None:
        time_formats = [
            "%Y-%m-%dT%H:%M:%s%z", "%Y-%m-%d %H:%M:%S", "%Y-%m-%d"]

    # -------------------------------------------
    # Prepare command.

    cmd = dict()
    cmd["name_"] = os.path.abspath(name)
    cmd["type_"] = "Database.new"
    cmd["db_"] = "sqlite3"
    cmd["time_formats_"] = time_formats

    # -------------------------------------------
    # Send JSON command to engine.

    comm.send(cmd)
def from_db(self, table_name, append=False):
    """Fill the DataFrame from a table in the database.

    Args:
        table_name (str): Table from which we want to retrieve the data.
        append (bool): If a DataFrame already exists, should the table be
            appended?

    Returns:
        The current DataFrame instance, allowing method chaining.
    """
    # Assemble the JSON command understood by the getml engine.
    cmd = {
        "type_": "DataFrame.from_db",
        "name_": self.name,
        "table_name_": table_name,
        "categoricals_": self.categorical_names,
        "discretes_": self.discrete_names,
        "join_keys_": self.join_key_names,
        "numericals_": self.numerical_names,
        "targets_": self.target_names,
        "time_stamps_": self.time_stamp_names,
        "append_": append,
    }

    comm.send(cmd)

    return self
def send(self):
    """ Send this RelboostModel to the getml engine.

    Raises:
        ValueError: If the population or the peripheral tables have not
            been set.

    Returns:
        The current model instance, allowing method chaining.
    """
    # -------------------------------------------
    # Validate required parameters before talking to the engine.
    # ValueError is a subclass of Exception, so existing callers that
    # catch Exception keep working.

    if self.params["population"] is None:
        raise ValueError("Population cannot be None!")

    if self.params["peripheral"] is None:
        raise ValueError("Peripheral cannot be None!")

    # -------------------------------------------
    # Send own JSON command to getML engine.

    cmd = dict()
    cmd["name_"] = self.name
    cmd["type_"] = "RelboostModel"
    cmd["population_"] = self.params["population"].thisptr
    # The engine only needs the names of the peripheral tables.
    cmd["peripheral_"] = [
        per.thisptr["name_"] for per in self.params["peripheral"]
    ]
    cmd["hyperparameters_"] = self.__make_hyperparameters()

    comm.send(cmd)

    # -------------------------------------------

    return self
def where(self, name, condition):
    """Creates a new DataFrame as a subselection of this one.

    Args:
        name (str): Name of the new DataFrame.
        condition (bool): Boolean column indicating the rows you want to
            select.

    Returns:
        A refreshed handle to the newly created DataFrame.
    """
    # Assemble and dispatch the engine command.
    cmd = {
        "type_": "DataFrame.where",
        "name_": self.name,
        "new_df_": name,
        "condition_": condition.thisptr,
    }

    comm.send(cmd)

    # Return a refreshed handle pointing at the newly created data frame.
    return DataFrame(name).refresh()
def set_unit(self, unit):
    """Sets the unit of the column.

    Args:
        unit: The new unit.
    """
    # Start from a shallow copy of the column's command representation
    # and turn it into a set_unit request.
    cmd = dict(self.thisptr)
    cmd["unit_"] = unit
    cmd["type_"] = cmd["type_"] + ".set_unit"

    comm.send(cmd)

    # Only update the local handle once the engine accepted the command.
    self.thisptr["unit_"] = unit
def to_csv(self, fname, quotechar='"', sep=','):
    """Writes the data frame into a newly created CSV file.

    Args:
        fname (str): The name of the CSV file.
        quotechar (str): The character used to wrap strings.
        sep (str): The separator used for separating fields.
    """
    # Assemble and dispatch the engine command. The engine expects an
    # absolute path.
    cmd = {
        "type_": "DataFrame.to_csv",
        "name_": self.name,
        "fname_": os.path.abspath(fname),
        "quotechar_": quotechar,
        "sep_": sep,
    }

    comm.send(cmd)
def join(
        self,
        name,
        other,
        join_key,
        other_join_key=None,
        cols=None,
        other_cols=None,
        how="inner",
        where=None):
    """Create a new DataFrame by joining this DataFrame with another
    DataFrame.

    Args:
        name (str): The name of the new DataFrame.
        other (DataFrame): The other DataFrame.
        join_key (str): Name of the join key in this DataFrame.
        other_join_key (str, optional): Name of the join key in the other
            table (if not identical to join_key).
        cols (optional): List of columns from this DataFrame to be
            included. If left blank, all columns from this DataFrame will
            be included.
        other_cols (optional): List of columns from the other DataFrame to
            be included. If left blank, all columns from the other
            DataFrame will be included.
        how (str): Type of the join. Supports "left", "right" and "inner".
        where (bool): Boolean column that imposes WHERE conditions on the
            join.

    Returns:
        A refreshed handle to the newly created DataFrame.
    """
    # Assemble the JSON command understood by the getml engine. An empty
    # column list tells the engine to include all columns.
    cmd = {
        "type_": "DataFrame.join",
        "name_": name,
        "df1_name_": self.name,
        "df2_name_": other.name,
        "join_key_used_": join_key,
        # Fall back to join_key when no separate key is given.
        "other_join_key_used_": other_join_key or join_key,
        "cols1_": [col.thisptr for col in (cols or [])],
        "cols2_": [col.thisptr for col in (other_cols or [])],
        "how_": how,
    }

    if where is not None:
        cmd["where_"] = where.thisptr

    comm.send(cmd)

    # Return a refreshed handle pointing at the newly created data frame.
    return DataFrame(name=name).refresh()
def read_csv(
        self,
        fnames,
        append=True,
        quotechar='"',
        sep=',',
        time_formats=None):
    """ Read CSV file

    It is assumed that the first line of each CSV file contains the column
    names.

    Args:
        fnames (List[str]): CSV file names to be read.
        append (bool): If a DataFrame already exists, should the file be
            appended?
        quotechar (str): The character used to wrap strings.
        sep (str): The separator used for separating fields.
        time_formats (List[str], optional): The formats tried when parsing
            time stamps. Defaults to
            ``["%Y-%m-%dT%H:%M:%s%z", "%Y-%m-%d %H:%M:%S", "%Y-%m-%d"]``.
            Refer to
            https://pocoproject.org/docs/Poco.DateTimeFormatter.html#9946
            for the options.

    Returns:
        The current DataFrame instance, allowing method chaining.
    """
    # -------------------------------------------
    # Avoid a shared mutable default argument - build the list per call.

    if time_formats is None:
        time_formats = [
            "%Y-%m-%dT%H:%M:%s%z", "%Y-%m-%d %H:%M:%S", "%Y-%m-%d"]

    # -------------------------------------------
    # Transform paths - the engine expects absolute paths.

    fnames_ = [os.path.abspath(fname) for fname in fnames]

    # -------------------------------------------
    # Send JSON command to getml engine

    cmd = dict()
    cmd["type_"] = "DataFrame.read_csv"
    cmd["name_"] = self.name
    cmd["fnames_"] = fnames_
    cmd["append_"] = append
    cmd["quotechar_"] = quotechar
    cmd["sep_"] = sep
    cmd["time_formats_"] = time_formats
    cmd["categoricals_"] = self.categorical_names
    cmd["discretes_"] = self.discrete_names
    cmd["join_keys_"] = self.join_key_names
    cmd["numericals_"] = self.numerical_names
    cmd["targets_"] = self.target_names
    cmd["time_stamps_"] = self.time_stamp_names

    comm.send(cmd)

    # -------------------------------------------

    return self
def __save(self):
    """Saves the model as a JSON file on the engine."""
    # Assemble and dispatch the engine command.
    cmd = {
        "type_": "RelboostModel.save",
        "name_": self.name,
    }
    comm.send(cmd)
def load(self):
    """Loads the DataFrame object from the engine.

    Returns:
        A refreshed handle to this DataFrame.
    """
    # Assemble and dispatch the engine command.
    cmd = {
        "type_": "DataFrame.load",
        "name_": self.name,
    }
    comm.send(cmd)

    # Refresh so the local handle reflects the loaded state.
    return self.refresh()
def __rm_col(self, name, role):
    """Removes the column *name* with the given *role* from this DataFrame
    on the engine, then refreshes the local handle."""
    # Assemble and dispatch the engine command.
    cmd = {
        "type_": "DataFrame.remove_column",
        "name_": name,
        "df_name_": self.name,
        "role_": role,
    }
    comm.send(cmd)

    # Keep the local handle in sync with the engine.
    self.refresh()
def load(self):
    """Loads the model from a JSON file.

    Returns:
        A refreshed handle to this model.
    """
    # Assemble and dispatch the engine command.
    cmd = {
        "type_": "RelboostModel.load",
        "name_": self.name,
    }
    comm.send(cmd)

    # Refresh so the local handle reflects the loaded state.
    return self.refresh()
def delete_project(name):
    """Deletes the project.

    All data and models contained in the project directory will be lost.

    Args:
        name (str): Name of your project.

    Raises:
        ConnectionRefusedError: If unable to connect to engine
    """
    # Assemble and dispatch the engine command.
    cmd = {
        "type_": "delete_project",
        "name_": name,
    }
    comm.send(cmd)
def delete(self, mem_only=False):
    """Deletes the model from the engine.

    Args:
        mem_only (bool): If True, then the data frame will be deleted from
            memory only, but not from disk. Default: False.
    """
    # Assemble and dispatch the engine command.
    cmd = {
        "type_": "RelboostModel.delete",
        "name_": self.name,
        "mem_only_": mem_only,
    }
    comm.send(cmd)
def delete(self, mem_only=False):
    """Deletes the data frame from the engine.

    Args:
        mem_only (bool): If True, the data frame will be deleted from
            memory only, but not from disk.
    """
    # Assemble and dispatch the engine command.
    cmd = {
        "type_": "DataFrame.delete",
        "name_": self.name,
        "mem_only_": mem_only,
    }
    comm.send(cmd)
def to_db(self, table_name):
    """Writes the data frame into a newly created table in the database.

    Args:
        table_name (str): Name of the table to be created. If a table of
            that name already exists, it will be replaced.
    """
    # Assemble and dispatch the engine command.
    cmd = {
        "type_": "DataFrame.to_db",
        "name_": self.name,
        "table_name_": table_name,
    }
    comm.send(cmd)
def __add_column(self, col, name, role, unit):
    """Adds column *col* under *name* with the given *role* and *unit* to
    this DataFrame on the engine, then refreshes the local handle."""
    # Assemble and dispatch the engine command.
    cmd = {
        "type_": "DataFrame.add_column",
        "name_": name,
        "col_": col.thisptr,
        "df_name_": self.name,
        "role_": role,
        "unit_": unit,
    }
    comm.send(cmd)

    # Keep the local handle in sync with the engine.
    self.refresh()
def save(self):
    """Saves the DataFrame on the engine.

    To be saved on the engine, it already has to be present there. You can
    use the :meth:`~getml.engine.DataFrame.send` function to upload it to
    the engine.

    Returns:
        :class:`~getml.engine.DataFrame`:
            The current instance of the DataFrame class.
    """
    # Assemble and dispatch the engine command.
    cmd = {
        "type_": "DataFrame.save",
        "name_": self.name,
    }
    comm.send(cmd)

    return self
def connect_postgres(
        pg_host,
        pg_hostaddr,
        pg_port,
        dbname,
        user,
        password,
        time_formats=None):
    """ Creates a new PostgreSQL database connection.

    Args:
        pg_host (str): Host of the PostgreSQL database.
        pg_hostaddr (str): IP address of the PostgreSQL database.
        pg_port (int): Port of the PostgreSQL database.
        dbname (str): Name of the database to connect to.
        user (str): User name with which to log into the PostgreSQL
            database.
        password (str): Password with which to log into the PostgreSQL
            database.
        time_formats (List[str], optional): The formats tried when parsing
            time stamps. Defaults to
            ``["%Y-%m-%dT%H:%M:%s%z", "%Y-%m-%d %H:%M:%S", "%Y-%m-%d"]``.
            Check out
            https://pocoproject.org/docs/Poco.DateTimeFormatter.html#9946
            for the options.
    """
    # -------------------------------------------
    # Avoid a shared mutable default argument - build the list per call.

    if time_formats is None:
        time_formats = [
            "%Y-%m-%dT%H:%M:%s%z", "%Y-%m-%d %H:%M:%S", "%Y-%m-%d"]

    # -------------------------------------------
    # Prepare command.

    cmd = dict()
    cmd["name_"] = ""
    cmd["type_"] = "Database.new"
    cmd["db_"] = "postgres"
    cmd["host_"] = pg_host
    cmd["hostaddr_"] = pg_hostaddr
    cmd["port_"] = pg_port
    cmd["dbname_"] = dbname
    cmd["user_"] = user
    cmd["password_"] = password
    cmd["time_formats_"] = time_formats

    # -------------------------------------------
    # Send JSON command to engine.

    comm.send(cmd)
def drop_table(name):
    """Drops a table from the database.

    Args:
        name (str): The table to be dropped.
    """
    # Assemble and dispatch the engine command.
    cmd = {
        "name_": name,
        "type_": "Database.drop_table",
    }
    comm.send(cmd)
def copy(self, other):
    """Copies the parameters and hyperparameters from another model.

    Args:
        other (:class:`getml.models.RelboostModel`): The other model.
    """
    # Assemble and dispatch the engine command.
    cmd = {
        "type_": "RelboostModel.copy",
        "name_": self.name,
        "other_": other.name,
    }
    comm.send(cmd)

    # Refresh so the local handle reflects the copied parameters.
    self.refresh()
def set_project(name):
    """Select a project.

    All data frames and models will be stored in the corresponding project
    directory. If a project of that name does not already exist, a new one
    will be created.

    Args:
        name (str): Name of your project.

    Raises:
        ConnectionRefusedError: If unable to connect to engine
    """
    # Fail early with a clear message when the engine is unreachable.
    if not is_alive():
        raise ConnectionRefusedError(
            "Cannot connect to getML engine. Make sure the engine is running and you are logged in.")

    # Assemble and dispatch the engine command.
    cmd = {
        "type_": "set_project",
        "name_": name,
    }
    comm.send(cmd)
def read_csv(name, fnames, header=True, quotechar='"', sep=',', skip=0):
    """Reads a CSV file into the database.

    Args:
        name (str): Name of the table in which the data is to be inserted.
        fnames (List[str]): The list of CSV file names to be read.
        header (bool, optional): Whether the CSV file contains a header
            with the column names. Default to True.
        quotechar (str, optional): The character used to wrap strings.
            Default:`"`
        sep (str, optional): The separator used for separating fields.
            Default:`,`
        skip (int, optional): Number of lines to skip at the beginning of
            each file (Default: 0). If *header* is True, the lines will be
            skipped before the header.
    """
    # Assemble the engine command. The engine expects absolute paths.
    cmd = {
        "name_": name,
        "type_": "Database.read_csv",
        "fnames_": [os.path.abspath(fname) for fname in fnames],
        "header_": header,
        "quotechar_": quotechar,
        "sep_": sep,
        "skip_": skip,
    }

    comm.send(cmd)