def generateDataSet(self, sparkSession, numSims: int, columns: List[Tuple[str, str, List[Any]]]):
    # Build a Java ColumnSummary[] from the (name, type, values) tuples:
    # numeric columns carry their values as scala.util.Left(double[]),
    # all other columns as scala.util.Right(String[]).
    _jColNameTypeArray = toJArray(
        self.gateway,
        self.ColumnSummary,
        [
            self.ColumnSummary(
                v[0],
                v[1],
                self.gateway.jvm.scala.util.Left(
                    toJArray(self.gateway, self.gateway.jvm.double, v[2]))
                if v[1] == self.ColumnType.Numeric().toString()
                else self.gateway.jvm.scala.util.Right(
                    toJArray(self.gateway, self.gateway.jvm.String, v[2])),
            )
            for v in columns
        ],
    )
    # Delegate to the JVM-side generator and wrap the result as a PySpark DataFrame.
    return DataFrame(
        self._java_obj.generateDataSet(
            sparkSession._instantiatedSession._jsparkSession,
            numSims,
            _jColNameTypeArray,
        ),
        sparkSession._instantiatedSession,
    )
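# A minimal usage sketch for generateDataSet, assuming `simulator` is an
# instance of the enclosing wrapper (which carries `gateway`, `ColumnSummary`,
# and `ColumnType`) and `spark` is an active SparkSession; the column names,
# type strings, and value lists below are hypothetical.
numeric_type = simulator.ColumnType.Numeric().toString()
columns = [
    ("age", numeric_type, [21.0, 35.5, 62.0]),       # numeric -> Left(double[])
    ("plan", "Categorical", ["free", "pro", "max"]), # other   -> Right(String[])
]
sim_df = simulator.generateDataSet(spark, numSims=1000, columns=columns)
sim_df.show()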
def load(self, path=None, format=None, schema=None, **options):
    """Loads data from a data source and returns it as a :class:`DataFrame`.

    :param path: optional string for file-system backed data sources.
    :param format: optional string for format of the data source. Defaults to 'parquet'.
    :param schema: optional :class:`StructType` for the input schema.
    :param options: all other string options

    >>> df = sqlContext.read.load('python/test_support/sql/parquet_partitioned', opt1=True,
    ...     opt2=1, opt3='str')
    >>> df.dtypes
    [('name', 'string'), ('year', 'int'), ('month', 'int'), ('day', 'int')]

    >>> df = sqlContext.read.format('json').load(['python/test_support/sql/people.json',
    ...     'python/test_support/sql/people1.json'])
    >>> df.dtypes
    [('age', 'bigint'), ('aka', 'string'), ('name', 'string')]
    """
    if format is not None:
        self.format(format)
    if schema is not None:
        self.schema(schema)
    self.options(**options)
    if path is not None:
        if type(path) == list:
            # A list of paths is converted to a java.lang.String[] before
            # calling the JVM reader.
            paths = path
            gateway = self._sqlContext._sc._gateway
            jpaths = utils.toJArray(gateway, gateway.jvm.java.lang.String, paths)
            return self._df(self._jreader.load(jpaths))
        else:
            return self._df(self._jreader.load(path))
    else:
        return self._df(self._jreader.load())
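# A short, hedged usage sketch of load() through the public reader API,
# assuming an active session; the file paths are placeholders. Passing a list
# of paths exercises the toJArray branch above.
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
# Single path with an explicit format:
df = spark.read.load("data/people.json", format="json")
# Multiple paths cross the gateway as a java.lang.String[]:
df2 = spark.read.format("json").load(["data/people.json", "data/people1.json"])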
def jdbc(self, url, table, column=None, lowerBound=None, upperBound=None,
         numPartitions=None, predicates=None, properties=None):
    """
    Construct a :class:`DataFrame` representing the database table named ``table``
    accessible via JDBC URL ``url`` and connection ``properties``.

    Partitions of the table will be retrieved in parallel if either ``column`` or
    ``predicates`` is specified. ``lowerBound``, ``upperBound`` and ``numPartitions``
    are needed when ``column`` is specified.

    If both ``column`` and ``predicates`` are specified, ``column`` will be used.

    .. note:: Don't create too many partitions in parallel on a large cluster; \
        otherwise Spark might crash your external database systems.

    :param url: a JDBC URL of the form ``jdbc:subprotocol:subname``
    :param table: the name of the table
    :param column: the name of an integer column that will be used for partitioning;
                   if this parameter is specified, then ``numPartitions``, ``lowerBound``
                   (inclusive), and ``upperBound`` (exclusive) will form partition strides
                   for generated WHERE clause expressions used to split the column
                   ``column`` evenly
    :param lowerBound: the minimum value of ``column`` used to decide partition stride
    :param upperBound: the maximum value of ``column`` used to decide partition stride
    :param numPartitions: the number of partitions
    :param predicates: a list of expressions suitable for inclusion in WHERE clauses;
                       each one defines one partition of the :class:`DataFrame`
    :param properties: a dictionary of JDBC database connection arguments. Normally at
                       least properties "user" and "password" with their corresponding
                       values. For example { 'user' : 'SYSTEM', 'password' : 'mypassword' }
    :return: a DataFrame
    """
    if properties is None:
        properties = dict()
    jprop = JavaClass("java.util.Properties", self._spark._sc._gateway._gateway_client)()
    for k in properties:
        jprop.setProperty(k, properties[k])
    if column is not None:
        assert lowerBound is not None, "lowerBound can not be None when ``column`` is specified"
        assert upperBound is not None, "upperBound can not be None when ``column`` is specified"
        assert numPartitions is not None, \
            "numPartitions can not be None when ``column`` is specified"
        return self._df(self._jreader.jdbc(url, table, column, int(lowerBound), int(upperBound),
                                           int(numPartitions), jprop))
    if predicates is not None:
        gateway = self._spark._sc._gateway
        jpredicates = utils.toJArray(gateway, gateway.jvm.java.lang.String, predicates)
        return self._df(self._jreader.jdbc(url, table, jpredicates, jprop))
    return self._df(self._jreader.jdbc(url, table, jprop))
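# A hedged usage sketch of the column-partitioned path through the public
# reader API, reusing the `spark` session from the sketch above and assuming a
# reachable database; the URL, table, column, and credentials are placeholders.
orders = spark.read.jdbc(
    url="jdbc:postgresql://localhost:5432/shop",
    table="orders",
    column="order_id",     # integer column; strides computed from the bounds
    lowerBound=1,          # inclusive
    upperBound=1_000_000,  # exclusive
    numPartitions=8,
    properties={"user": "SYSTEM", "password": "mypassword"},
)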
def aggregate_properties(self, app_name, entity_type, channel_name=None,
                         start_time=None, until_time=None, required=None):
    # Aggregate entity properties from the PredictionIO event store; the
    # `required` property names are passed to the JVM as a String[].
    pes = self._sc._jvm.org.apache.predictionio.data.store.python.PPythonEventStore
    jdf = pes.aggregateProperties(app_name, entity_type, channel_name,
                                  start_time, until_time,
                                  utils.toJArray(self._sc._gateway,
                                                 self._sc._gateway.jvm.String,
                                                 required),
                                  self._jss)
    return DataFrame(jdf, self.sql_ctx)
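# A hedged usage sketch, assuming this method lives on the PEventStore wrapper
# constructed in the training script below; the app name and property names
# are placeholders.
sql_context = spark._wrapped
event_store = PEventStore(spark._jsparkSession, sql_context)
users = event_store.aggregate_properties("MyApp", "user", required=["age", "gender"])
users.show()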
def jdbc(
    self,
    url,
    table,
    column=None,
    lowerBound=None,
    upperBound=None,
    numPartitions=None,
    predicates=None,
    properties=None,
):
    """
    Construct a :class:`DataFrame` representing the database table named ``table``
    accessible via JDBC URL ``url`` and connection ``properties``.

    Partitions of the table will be retrieved in parallel if either ``column`` or
    ``predicates`` is specified.

    If both ``column`` and ``predicates`` are specified, ``column`` will be used.

    .. note:: Don't create too many partitions in parallel on a large cluster; \
        otherwise Spark might crash your external database systems.

    :param url: a JDBC URL of the form ``jdbc:subprotocol:subname``
    :param table: the name of the table
    :param column: the name of an integer column that will be used for partitioning;
                   if this parameter is specified, then ``numPartitions``, ``lowerBound``
                   (inclusive), and ``upperBound`` (exclusive) will form partition strides
                   for generated WHERE clause expressions used to split the column
                   ``column`` evenly
    :param lowerBound: the minimum value of ``column`` used to decide partition stride
    :param upperBound: the maximum value of ``column`` used to decide partition stride
    :param numPartitions: the number of partitions
    :param predicates: a list of expressions suitable for inclusion in WHERE clauses;
                       each one defines one partition of the :class:`DataFrame`
    :param properties: a dictionary of JDBC database connection arguments. Normally at
                       least properties "user" and "password" with their corresponding
                       values. For example { 'user' : 'SYSTEM', 'password' : 'mypassword' }
    :return: a DataFrame
    """
    if properties is None:
        properties = dict()
    jprop = JavaClass("java.util.Properties", self._spark._sc._gateway._gateway_client)()
    for k in properties:
        jprop.setProperty(k, properties[k])
    if column is not None:
        if numPartitions is None:
            numPartitions = self._spark._sc.defaultParallelism
        return self._df(
            self._jreader.jdbc(url, table, column, int(lowerBound), int(upperBound),
                               int(numPartitions), jprop)
        )
    if predicates is not None:
        gateway = self._spark._sc._gateway
        jpredicates = utils.toJArray(gateway, gateway.jvm.java.lang.String, predicates)
        return self._df(self._jreader.jdbc(url, table, jpredicates, jprop))
    return self._df(self._jreader.jdbc(url, table, jprop))
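# A companion sketch for the predicates path, reusing the placeholder database
# above: each WHERE-clause expression becomes exactly one partition, and the
# list crosses the gateway as a java.lang.String[] built with toJArray.
predicates = [
    "order_date <  '2020-01-01'",
    "order_date >= '2020-01-01'",
]
orders_split = spark.read.jdbc(
    url="jdbc:postgresql://localhost:5432/shop",
    table="orders",
    predicates=predicates,
    properties={"user": "SYSTEM", "password": "mypassword"},
)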
def run_pio_workflow(model, userdict, itemdict, orig_sys_args):
    # Restore the original command-line args, publish the trained artifacts to
    # the engine's JVM-side references, then stop the Python SparkContext
    # before handing control to CreateWorkflow on the JVM.
    sys.argv = orig_sys_args
    template_engine = sc._jvm.org.example.vanilla.VanillaEngine
    template_engine.modelRef().set(model)
    template_engine.userdictRef().set(userdict)
    template_engine.itemdictRef().set(itemdict)
    main_args = utils.toJArray(sc._gateway, sc._gateway.jvm.String, sys.argv)
    create_workflow = sc._jvm.org.apache.predictionio.workflow.CreateWorkflow
    sc.stop()
    create_workflow.main(main_args)
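# A hedged calling sketch: capture sys.argv at script start (before anything
# rewrites it), train, then hand off to the JVM workflow. `model`, `userdict`,
# and `itemdict` stand in for whatever the training code produced.
orig_sys_args = list(sys.argv)
# ... train `model` and build `userdict` / `itemdict` here ...
run_pio_workflow(model, userdict, itemdict, orig_sys_args)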
def jdbc(self, url, table, column=None, lowerBound=None, upperBound=None,
         numPartitions=None, predicates=None, properties=None):
    """
    Construct a :class:`DataFrame` representing the database table accessible
    via JDBC URL `url` named `table` using connection `properties`.

    The `column` parameter can be used to partition the table; it will then be
    retrieved in parallel based on the parameters passed to this function.

    The `predicates` parameter gives a list of expressions suitable for
    inclusion in WHERE clauses; each one defines one partition of the
    :class:`DataFrame`.

    .. note:: Don't create too many partitions in parallel on a large cluster;
        otherwise Spark might crash your external database systems.

    :param url: a JDBC URL
    :param table: name of the table
    :param column: the column used to partition the table
    :param lowerBound: the lower bound of the partition column
    :param upperBound: the upper bound of the partition column
    :param numPartitions: the number of partitions
    :param predicates: a list of expressions
    :param properties: JDBC database connection arguments, a list of arbitrary
        string tag/value pairs. Normally at least a "user" and "password"
        property should be included.
    :return: a DataFrame
    """
    if properties is None:
        properties = dict()
    jprop = JavaClass("java.util.Properties",
                      self._sqlContext._sc._gateway._gateway_client)()
    for k in properties:
        jprop.setProperty(k, properties[k])
    if column is not None:
        if numPartitions is None:
            numPartitions = self._sqlContext._sc.defaultParallelism
        return self._df(
            self._jreader.jdbc(url, table, column, int(lowerBound), int(upperBound),
                               int(numPartitions), jprop))
    if predicates is not None:
        gateway = self._sqlContext._sc._gateway
        jpredicates = utils.toJArray(gateway, gateway.jvm.java.lang.String, predicates)
        return self._df(self._jreader.jdbc(url, table, jpredicates, jprop))
    return self._df(self._jreader.jdbc(url, table, jprop))
def jdbc(
    self,
    url,
    table,
    column=None,
    lowerBound=None,
    upperBound=None,
    numPartitions=None,
    predicates=None,
    properties=None,
):
    """
    Construct a :class:`DataFrame` representing the database table accessible
    via JDBC URL `url` named `table` using connection `properties`.

    The `column` parameter can be used to partition the table; it will then be
    retrieved in parallel based on the parameters passed to this function.

    The `predicates` parameter gives a list of expressions suitable for
    inclusion in WHERE clauses; each one defines one partition of the
    :class:`DataFrame`.

    .. note:: Don't create too many partitions in parallel on a large cluster;
        otherwise Spark might crash your external database systems.

    :param url: a JDBC URL
    :param table: name of the table
    :param column: the column used to partition the table
    :param lowerBound: the lower bound of the partition column
    :param upperBound: the upper bound of the partition column
    :param numPartitions: the number of partitions
    :param predicates: a list of expressions
    :param properties: JDBC database connection arguments, a list of arbitrary
        string tag/value pairs. Normally at least a "user" and "password"
        property should be included.
    :return: a DataFrame
    """
    if properties is None:
        properties = dict()
    jprop = JavaClass("java.util.Properties",
                      self._sqlContext._sc._gateway._gateway_client)()
    for k in properties:
        jprop.setProperty(k, properties[k])
    if column is not None:
        if numPartitions is None:
            numPartitions = self._sqlContext._sc.defaultParallelism
        return self._df(
            self._jreader.jdbc(url, table, column, int(lowerBound), int(upperBound),
                               int(numPartitions), jprop)
        )
    if predicates is not None:
        gateway = self._sqlContext._sc._gateway
        jpredicates = utils.toJArray(gateway, gateway.jvm.java.lang.String, predicates)
        return self._df(self._jreader.jdbc(url, table, jpredicates, jprop))
    return self._df(self._jreader.jdbc(url, table, jprop))
def aggregate_properties(self, app_name, entity_type, channel_name=None,
                         start_time=None, until_time=None, required=None):
    pes = self._sc._jvm.org.apache.predictionio.data.store.python.PPythonEventStore
    jdf = pes.aggregateProperties(
        app_name,
        entity_type,
        channel_name,
        start_time,
        until_time,
        utils.toJArray(self._sc._gateway, self._sc._gateway.jvm.String, required),
        self._jss)
    return DataFrame(jdf, self.sql_ctx)
def jdbc(self, url, table, column=None, lowerBound=None, upperBound=None,
         numPartitions=None, predicates=None, properties=None):
    """
    Construct a :class:`DataFrame` representing the database table named ``table``
    accessible via JDBC URL ``url`` and connection ``properties``.

    Partitions of the table will be retrieved in parallel if either ``column`` or
    ``predicates`` is specified. ``lowerBound``, ``upperBound`` and ``numPartitions``
    are needed when ``column`` is specified.

    If both ``column`` and ``predicates`` are specified, ``column`` will be used.

    .. versionadded:: 1.4.0

    Parameters
    ----------
    table : str
        the name of the table
    column : str, optional
        alias of ``partitionColumn`` option. Refer to ``partitionColumn`` in
        `Data Source Option <https://spark.apache.org/docs/latest/sql-data-sources-jdbc.html#data-source-option>`_
        in the version you use.
    predicates : list, optional
        a list of expressions suitable for inclusion in WHERE clauses;
        each one defines one partition of the :class:`DataFrame`
    properties : dict, optional
        a dictionary of JDBC database connection arguments. Normally at least
        properties "user" and "password" with their corresponding values.
        For example { 'user' : 'SYSTEM', 'password' : 'mypassword' }

    Other Parameters
    ----------------
    Extra options
        For the extra options, refer to
        `Data Source Option <https://spark.apache.org/docs/latest/sql-data-sources-jdbc.html#data-source-option>`_
        in the version you use.

        .. # noqa

    Notes
    -----
    Don't create too many partitions in parallel on a large cluster;
    otherwise Spark might crash your external database systems.

    Returns
    -------
    :class:`DataFrame`
    """
    if properties is None:
        properties = dict()
    jprop = JavaClass("java.util.Properties", self._spark._sc._gateway._gateway_client)()
    for k in properties:
        jprop.setProperty(k, properties[k])
    if column is not None:
        assert lowerBound is not None, "lowerBound can not be None when ``column`` is specified"
        assert upperBound is not None, "upperBound can not be None when ``column`` is specified"
        assert numPartitions is not None, \
            "numPartitions can not be None when ``column`` is specified"
        return self._df(self._jreader.jdbc(url, table, column, int(lowerBound), int(upperBound),
                                           int(numPartitions), jprop))
    if predicates is not None:
        gateway = self._spark._sc._gateway
        jpredicates = utils.toJArray(gateway, gateway.jvm.java.lang.String, predicates)
        return self._df(self._jreader.jdbc(url, table, jpredicates, jprop))
    return self._df(self._jreader.jdbc(url, table, jprop))
def run_pio_workflow(model, userdict, itemdict, orig_sys_args):
    sys.argv = orig_sys_args
    template_engine = sc._jvm.org.example.vanilla.VanillaEngine
    template_engine.modelRef().set(model)
    template_engine.userdictRef().set(userdict)
    template_engine.itemdictRef().set(itemdict)
    main_args = utils.toJArray(sc._gateway, sc._gateway.jvm.String, sys.argv)
    create_workflow = sc._jvm.org.apache.predictionio.workflow.CreateWorkflow
    sc.stop()
    create_workflow.main(main_args)


sqlContext = spark._wrapped
sqlCtx = sqlContext

app_name = 'NCF'
event_names = utils.toJArray(sc._gateway, sc._gateway.jvm.String, ['purchased-event'])
p_event_store = PEventStore(spark._jsparkSession, sqlContext)
event_df = p_event_store.find(app_name, entity_type='user', target_entity_type='item',
                              event_names=event_names)
ratings = event_df.toPandas().rename(index=str, columns={'entityId': 'userid',
                                                         'targetEntityId': 'itemid',
                                                         'eventTime': 'timestamp'})

# For running with eval only, drop duplicate user-item interactions and users
# with fewer than 2 interactions.
ratings = ratings.drop_duplicates(subset=["userid", "itemid"], keep="last")
ratings = ratings[ratings.duplicated(subset=['userid'], keep=False)]
ratings['rating'] = 1
# Strip the string prefixes from the entity ids and coerce every column numeric.
ratings['userid'] = pd.to_numeric(ratings['userid'].str[5:]).astype(int)
ratings['itemid'] = pd.to_numeric(ratings['itemid'].str[6:]).astype(int)
ratings['timestamp'] = pd.to_numeric(ratings['timestamp'])