Example #1
    def generateDataSet(self, sparkSession, numSims: int,
                        columns: List[Tuple[str, str, List[Any]]]):
        # Build a java array of ColumnSummary objects. Numeric columns wrap their values
        # in scala.util.Left as double[], every other column type in scala.util.Right as String[].
        _jColNameTypeArray = toJArray(
            self.gateway,
            self.ColumnSummary,
            [
                self.ColumnSummary(
                    v[0],
                    v[1],
                    self.gateway.jvm.scala.util.Left(
                        toJArray(self.gateway, self.gateway.jvm.double, v[2]))
                    if v[1] == self.ColumnType.Numeric().toString() else
                    self.gateway.jvm.scala.util.Right(
                        toJArray(self.gateway, self.gateway.jvm.String, v[2])),
                ) for v in columns
            ],
        )
        # Invoke the JVM-side generator and wrap the returned java DataFrame for Python.
        return DataFrame(
            self._java_obj.generateDataSet(
                sparkSession._instantiatedSession._jsparkSession,
                numSims,
                _jColNameTypeArray,
            ),
            sparkSession._instantiatedSession,
        )
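
Every example on this page funnels a Python list through toJArray before calling into the JVM. As a point of reference, here is a minimal sketch of what such a helper does with a Py4J gateway; it mirrors the shape of pyspark.sql.utils.toJArray, but treat the exact module path as an assumption about the Spark version in use.

def toJArray(gateway, jtype, arr):
    """Convert a Python list to a Java array of ``jtype`` via the Py4J gateway."""
    jarray = gateway.new_array(jtype, len(arr))
    for i in range(len(arr)):
        jarray[i] = arr[i]
    return jarray
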
Example #2
    def load(self, path=None, format=None, schema=None, **options):
        """Loads data from a data source and returns it as a :class`DataFrame`.

        :param path: optional string for file-system backed data sources.
        :param format: optional string for the format of the data source. Defaults to 'parquet'.
        :param schema: optional :class:`StructType` for the input schema.
        :param options: all other string options

        >>> df = sqlContext.read.load('python/test_support/sql/parquet_partitioned', opt1=True,
        ...     opt2=1, opt3='str')
        >>> df.dtypes
        [('name', 'string'), ('year', 'int'), ('month', 'int'), ('day', 'int')]
        >>> df = sqlContext.read.format('json').load(['python/test_support/sql/people.json',
        ...     'python/test_support/sql/people1.json'])
        >>> df.dtypes
        [('age', 'bigint'), ('aka', 'string'), ('name', 'string')]
        """
        if format is not None:
            self.format(format)
        if schema is not None:
            self.schema(schema)
        self.options(**options)
        if path is not None:
            if type(path) == list:
                paths = path
                gateway = self._sqlContext._sc._gateway
                jpaths = utils.toJArray(gateway, gateway.jvm.java.lang.String,
                                        paths)
                return self._df(self._jreader.load(jpaths))
            else:
                return self._df(self._jreader.load(path))
        else:
            return self._df(self._jreader.load())
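
The keyword arguments above are just sugar for the reader's builder chain: the method body calls format(), schema() and options() before load(). A sketch of the equivalent explicit calls, using the same test file as the doctest and an illustrative schema and option:

from pyspark.sql.types import StructType, StructField, StringType, LongType

# Hypothetical schema for the sample file; adjust the fields to your data.
schema = StructType([StructField('name', StringType(), True),
                     StructField('age', LongType(), True)])
df = (sqlContext.read
      .format('json')
      .schema(schema)
      .options(mode='PERMISSIVE')   # any remaining keyword options end up here
      .load('python/test_support/sql/people.json'))
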
Example #3
    def load(self, path=None, format=None, schema=None, **options):
        """Loads data from a data source and returns it as a :class`DataFrame`.

        :param path: optional string for file-system backed data sources.
        :param format: optional string for the format of the data source. Defaults to 'parquet'.
        :param schema: optional :class:`StructType` for the input schema.
        :param options: all other string options

        >>> df = sqlContext.read.load('python/test_support/sql/parquet_partitioned', opt1=True,
        ...     opt2=1, opt3='str')
        >>> df.dtypes
        [('name', 'string'), ('year', 'int'), ('month', 'int'), ('day', 'int')]
        >>> df = sqlContext.read.format('json').load(['python/test_support/sql/people.json',
        ...     'python/test_support/sql/people1.json'])
        >>> df.dtypes
        [('age', 'bigint'), ('aka', 'string'), ('name', 'string')]
        """
        if format is not None:
            self.format(format)
        if schema is not None:
            self.schema(schema)
        self.options(**options)
        if path is not None:
            if type(path) == list:
                paths = path
                gateway = self._sqlContext._sc._gateway
                jpaths = utils.toJArray(gateway, gateway.jvm.java.lang.String, paths)
                return self._df(self._jreader.load(jpaths))
            else:
                return self._df(self._jreader.load(path))
        else:
            return self._df(self._jreader.load())
Example #4
    def jdbc(self,
             url,
             table,
             column=None,
             lowerBound=None,
             upperBound=None,
             numPartitions=None,
             predicates=None,
             properties=None):
        """
        Construct a :class:`DataFrame` representing the database table named ``table``
        accessible via JDBC URL ``url`` and connection ``properties``.

        Partitions of the table will be retrieved in parallel if either ``column`` or
        ``predicates`` is specified. ``lowerBound``, ``upperBound`` and ``numPartitions``
        are needed when ``column`` is specified.

        If both ``column`` and ``predicates`` are specified, ``column`` will be used.

        .. note:: Don't create too many partitions in parallel on a large cluster; \
        otherwise Spark might crash your external database systems.

        :param url: a JDBC URL of the form ``jdbc:subprotocol:subname``
        :param table: the name of the table
        :param column: the name of an integer column that will be used for partitioning;
                       if this parameter is specified, then ``numPartitions``, ``lowerBound``
                       (inclusive), and ``upperBound`` (exclusive) will form partition strides
                       for generated WHERE clause expressions used to split the column
                       ``column`` evenly
        :param lowerBound: the minimum value of ``column`` used to decide partition stride
        :param upperBound: the maximum value of ``column`` used to decide partition stride
        :param numPartitions: the number of partitions
        :param predicates: a list of expressions suitable for inclusion in WHERE clauses;
                           each one defines one partition of the :class:`DataFrame`
        :param properties: a dictionary of JDBC database connection arguments. Normally at
                           least properties "user" and "password" with their corresponding values.
                           For example { 'user' : 'SYSTEM', 'password' : 'mypassword' }
        :return: a DataFrame
        """
        if properties is None:
            properties = dict()
        jprop = JavaClass("java.util.Properties",
                          self._spark._sc._gateway._gateway_client)()
        for k in properties:
            jprop.setProperty(k, properties[k])
        if column is not None:
            assert lowerBound is not None, "lowerBound can not be None when ``column`` is specified"
            assert upperBound is not None, "upperBound can not be None when ``column`` is specified"
            assert numPartitions is not None, \
                "numPartitions can not be None when ``column`` is specified"
            return self._df(
                self._jreader.jdbc(url, table, column, int(lowerBound),
                                   int(upperBound), int(numPartitions), jprop))
        if predicates is not None:
            gateway = self._spark._sc._gateway
            jpredicates = utils.toJArray(gateway, gateway.jvm.java.lang.String,
                                         predicates)
            return self._df(self._jreader.jdbc(url, table, jpredicates, jprop))
        return self._df(self._jreader.jdbc(url, table, jprop))
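
A minimal usage sketch for the predicates branch above, assuming a running SparkSession named spark, a reachable PostgreSQL instance, and its JDBC driver on the classpath (all of these connection details are hypothetical):

# Each predicate becomes one partition of the resulting DataFrame.
df = spark.read.jdbc(
    url="jdbc:postgresql://localhost:5432/shop",
    table="public.orders",
    predicates=["order_date <  DATE '2020-01-01'",
                "order_date >= DATE '2020-01-01'"],
    properties={"user": "SYSTEM", "password": "mypassword"},
)
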
Example #5
    def aggregate_properties(self, app_name, entity_type, channel_name=None,
                             start_time=None, until_time=None, required=None):
        # Resolve the JVM-side PPythonEventStore helper through the SparkContext gateway.
        pes = self._sc._jvm.org.apache.predictionio.data.store.python.PPythonEventStore
        # ``required`` is a Python list of property names; convert it to a java String[]
        # before calling the Scala API.
        jdf = pes.aggregateProperties(app_name, entity_type, channel_name,
                                      start_time, until_time,
                                      utils.toJArray(self._sc._gateway,
                                                     self._sc._gateway.jvm.String,
                                                     required),
                                      self._jss)
        return DataFrame(jdf, self.sql_ctx)
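
A hedged usage sketch for the method above, assuming it is exposed on the PEventStore wrapper instantiated later on this page and that the app and property names exist (both assumptions):

# Hypothetical app and property names; required is converted to a java String[] internally.
user_props_df = p_event_store.aggregate_properties(
    app_name='MyApp',
    entity_type='user',
    required=['age', 'gender'],
)
user_props_df.printSchema()
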
Example #6
    def jdbc(
        self,
        url,
        table,
        column=None,
        lowerBound=None,
        upperBound=None,
        numPartitions=None,
        predicates=None,
        properties=None,
    ):
        """
        Construct a :class:`DataFrame` representing the database table named ``table``
        accessible via JDBC URL ``url`` and connection ``properties``.

        Partitions of the table will be retrieved in parallel if either ``column`` or
        ``predicates`` is specified.

        If both ``column`` and ``predicates`` are specified, ``column`` will be used.

        .. note:: Don't create too many partitions in parallel on a large cluster; \
        otherwise Spark might crash your external database systems.

        :param url: a JDBC URL of the form ``jdbc:subprotocol:subname``
        :param table: the name of the table
        :param column: the name of an integer column that will be used for partitioning;
                       if this parameter is specified, then ``numPartitions``, ``lowerBound``
                       (inclusive), and ``upperBound`` (exclusive) will form partition strides
                       for generated WHERE clause expressions used to split the column
                       ``column`` evenly
        :param lowerBound: the minimum value of ``column`` used to decide partition stride
        :param upperBound: the maximum value of ``column`` used to decide partition stride
        :param numPartitions: the number of partitions
        :param predicates: a list of expressions suitable for inclusion in WHERE clauses;
                           each one defines one partition of the :class:`DataFrame`
        :param properties: a dictionary of JDBC database connection arguments. Normally at
                           least properties "user" and "password" with their corresponding values.
                           For example { 'user' : 'SYSTEM', 'password' : 'mypassword' }
        :return: a DataFrame
        """
        if properties is None:
            properties = dict()
        jprop = JavaClass("java.util.Properties", self._spark._sc._gateway._gateway_client)()
        for k in properties:
            jprop.setProperty(k, properties[k])
        if column is not None:
            if numPartitions is None:
                numPartitions = self._spark._sc.defaultParallelism
            return self._df(
                self._jreader.jdbc(url, table, column, int(lowerBound), int(upperBound), int(numPartitions), jprop)
            )
        if predicates is not None:
            gateway = self._spark._sc._gateway
            jpredicates = utils.toJArray(gateway, gateway.jvm.java.lang.String, predicates)
            return self._df(self._jreader.jdbc(url, table, jpredicates, jprop))
        return self._df(self._jreader.jdbc(url, table, jprop))
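
A sketch of the column-partitioning path in the variant above, with hypothetical MySQL connection details; Spark issues numPartitions queries whose WHERE clauses stride over the partition column between lowerBound and upperBound:

df = spark.read.jdbc(
    url="jdbc:mysql://localhost:3306/shop",
    table="orders",
    column="id",              # integer column used to compute the partition strides
    lowerBound=1,
    upperBound=1000000,
    numPartitions=8,
    properties={"user": "SYSTEM", "password": "mypassword"},
)
print(df.rdd.getNumPartitions())  # expected: 8
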
Example #7
def run_pio_workflow(model, userdict, itemdict, orig_sys_args):
    # Restore the original command-line arguments so CreateWorkflow sees them.
    sys.argv = orig_sys_args
    # Stash the trained model and dictionaries in the engine's JVM-side references.
    template_engine = sc._jvm.org.example.vanilla.VanillaEngine
    template_engine.modelRef().set(model)
    template_engine.userdictRef().set(userdict)
    template_engine.itemdictRef().set(itemdict)
    # Convert the Python argument list to a java String[] for the Scala entry point.
    main_args = utils.toJArray(sc._gateway, sc._gateway.jvm.String, sys.argv)
    create_workflow = sc._jvm.org.apache.predictionio.workflow.CreateWorkflow
    # Stop the Python-side SparkContext before handing control to the JVM workflow.
    sc.stop()
    create_workflow.main(main_args)
Example #8
    def jdbc(self,
             url,
             table,
             column=None,
             lowerBound=None,
             upperBound=None,
             numPartitions=None,
             predicates=None,
             properties=None):
        """
        Construct a :class:`DataFrame` representing the database table accessible
        via JDBC URL `url` named `table` and connection `properties`.

        The `column` parameter can be used to partition the table; the table will then
        be retrieved in parallel based on the parameters passed to this function.

        The `predicates` parameter gives a list of expressions suitable for inclusion
        in WHERE clauses; each one defines one partition of the :class:`DataFrame`.

        .. note:: Don't create too many partitions in parallel on a large cluster;
            otherwise Spark might crash your external database systems.

        :param url: a JDBC URL
        :param table: name of table
        :param column: the column used to partition
        :param lowerBound: the lower bound of partition column
        :param upperBound: the upper bound of the partition column
        :param numPartitions: the number of partitions
        :param predicates: a list of expressions
        :param properties: JDBC database connection arguments, a list of arbitrary string
                           tag/value. Normally at least a "user" and "password" property
                           should be included.
        :return: a DataFrame
        """
        if properties is None:
            properties = dict()
        jprop = JavaClass("java.util.Properties",
                          self._sqlContext._sc._gateway._gateway_client)()
        for k in properties:
            jprop.setProperty(k, properties[k])
        if column is not None:
            if numPartitions is None:
                numPartitions = self._sqlContext._sc.defaultParallelism
            return self._df(
                self._jreader.jdbc(url, table, column, int(lowerBound),
                                   int(upperBound), int(numPartitions), jprop))
        if predicates is not None:
            gateway = self._sqlContext._sc._gateway
            jpredicates = utils.toJArray(gateway, gateway.jvm.java.lang.String,
                                         predicates)
            return self._df(self._jreader.jdbc(url, table, jpredicates, jprop))
        return self._df(self._jreader.jdbc(url, table, jprop))
Example #9
    def jdbc(
        self,
        url,
        table,
        column=None,
        lowerBound=None,
        upperBound=None,
        numPartitions=None,
        predicates=None,
        properties=None,
    ):
        """
        Construct a :class:`DataFrame` representing the database table accessible
        via JDBC URL `url` named `table` and connection `properties`.

        The `column` parameter can be used to partition the table; the table will then
        be retrieved in parallel based on the parameters passed to this function.

        The `predicates` parameter gives a list of expressions suitable for inclusion
        in WHERE clauses; each one defines one partition of the :class:`DataFrame`.

        .. note:: Don't create too many partitions in parallel on a large cluster;
            otherwise Spark might crash your external database systems.

        :param url: a JDBC URL
        :param table: name of table
        :param column: the column used to partition
        :param lowerBound: the lower bound of partition column
        :param upperBound: the upper bound of the partition column
        :param numPartitions: the number of partitions
        :param predicates: a list of expressions
        :param properties: JDBC database connection arguments, a list of arbitrary string
                           tag/value. Normally at least a "user" and "password" property
                           should be included.
        :return: a DataFrame
        """
        if properties is None:
            properties = dict()
        jprop = JavaClass("java.util.Properties", self._sqlContext._sc._gateway._gateway_client)()
        for k in properties:
            jprop.setProperty(k, properties[k])
        if column is not None:
            if numPartitions is None:
                numPartitions = self._sqlContext._sc.defaultParallelism
            return self._df(
                self._jreader.jdbc(url, table, column, int(lowerBound), int(upperBound), int(numPartitions), jprop)
            )
        if predicates is not None:
            gateway = self._sqlContext._sc._gateway
            jpredicates = utils.toJArray(gateway, gateway.jvm.java.lang.String, predicates)
            return self._df(self._jreader.jdbc(url, table, jpredicates, jprop))
        return self._df(self._jreader.jdbc(url, table, jprop))
Example #10
    def aggregate_properties(self,
                             app_name,
                             entity_type,
                             channel_name=None,
                             start_time=None,
                             until_time=None,
                             required=None):
        pes = self._sc._jvm.org.apache.predictionio.data.store.python.PPythonEventStore
        jdf = pes.aggregateProperties(
            app_name, entity_type, channel_name, start_time, until_time,
            utils.toJArray(self._sc._gateway, self._sc._gateway.jvm.String,
                           required), self._jss)
        return DataFrame(jdf, self.sql_ctx)
Example #11
    def jdbc(self,
             url,
             table,
             column=None,
             lowerBound=None,
             upperBound=None,
             numPartitions=None,
             predicates=None,
             properties=None):
        """
        Construct a :class:`DataFrame` representing the database table named ``table``
        accessible via JDBC URL ``url`` and connection ``properties``.

        Partitions of the table will be retrieved in parallel if either ``column`` or
        ``predicates`` is specified. ``lowerBound``, ``upperBound`` and ``numPartitions``
        are needed when ``column`` is specified.

        If both ``column`` and ``predicates`` are specified, ``column`` will be used.

        .. versionadded:: 1.4.0

        Parameters
        ----------
        table : str
            the name of the table
        column : str, optional
            alias of ``partitionColumn`` option. Refer to ``partitionColumn`` in
            `Data Source Option <https://spark.apache.org/docs/latest/sql-data-sources-jdbc.html#data-source-option>`_
            in the version you use.
        predicates : list, optional
            a list of expressions suitable for inclusion in WHERE clauses;
            each one defines one partition of the :class:`DataFrame`
        properties : dict, optional
            a dictionary of JDBC database connection arguments. Normally at
            least properties "user" and "password" with their corresponding values.
            For example { 'user' : 'SYSTEM', 'password' : 'mypassword' }

        Other Parameters
        ----------------
        Extra options
            For the extra options, refer to
            `Data Source Option <https://spark.apache.org/docs/latest/sql-data-sources-jdbc.html#data-source-option>`_
            in the version you use.

            .. # noqa

        Notes
        -----
        Don't create too many partitions in parallel on a large cluster;
        otherwise Spark might crash your external database systems.

        Returns
        -------
        :class:`DataFrame`
        """
        if properties is None:
            properties = dict()
        jprop = JavaClass("java.util.Properties",
                          self._spark._sc._gateway._gateway_client)()
        for k in properties:
            jprop.setProperty(k, properties[k])
        if column is not None:
            assert lowerBound is not None, "lowerBound can not be None when ``column`` is specified"
            assert upperBound is not None, "upperBound can not be None when ``column`` is specified"
            assert numPartitions is not None, \
                "numPartitions can not be None when ``column`` is specified"
            return self._df(
                self._jreader.jdbc(url, table, column, int(lowerBound),
                                   int(upperBound), int(numPartitions), jprop))
        if predicates is not None:
            gateway = self._spark._sc._gateway
            jpredicates = utils.toJArray(gateway, gateway.jvm.java.lang.String,
                                         predicates)
            return self._df(self._jreader.jdbc(url, table, jpredicates, jprop))
        return self._df(self._jreader.jdbc(url, table, jprop))
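
Since this variant documents ``column`` as an alias of the partitionColumn data source option, a roughly equivalent option-based form looks like the sketch below; the connection details are hypothetical, while the option names are the standard Spark JDBC data source options:

df = (spark.read.format("jdbc")
      .option("url", "jdbc:postgresql://localhost:5432/shop")
      .option("dbtable", "public.orders")
      .option("user", "SYSTEM")
      .option("password", "mypassword")
      .option("partitionColumn", "id")
      .option("lowerBound", "1")
      .option("upperBound", "1000000")
      .option("numPartitions", "8")
      .load())
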
Example #12
def run_pio_workflow(model, userdict, itemdict, orig_sys_args):
    sys.argv = orig_sys_args
    template_engine = sc._jvm.org.example.vanilla.VanillaEngine
    template_engine.modelRef().set(model)
    template_engine.userdictRef().set(userdict)
    template_engine.itemdictRef().set(itemdict)
    main_args = utils.toJArray(sc._gateway, sc._gateway.jvm.String, sys.argv)
    create_workflow = sc._jvm.org.apache.predictionio.workflow.CreateWorkflow
    sc.stop()
    create_workflow.main(main_args)

sqlContext = spark._wrapped
sqlCtx = sqlContext

app_name = 'NCF'
event_names = utils.toJArray(sc._gateway, sc._gateway.jvm.String, ['purchased-event'])

p_event_store = PEventStore(spark._jsparkSession, sqlContext)
event_df = p_event_store.find(app_name, entity_type='user', target_entity_type='item', event_names=event_names)
ratings = event_df.toPandas().rename(
    index=str,
    columns={'entityId': 'userid', 'targetEntityId': 'itemid', 'eventTime': 'timestamp'})

# For running with eval only, drop duplicate user-item interactions and users with < 2 interactions

ratings = ratings.drop_duplicates(subset=["userid", "itemid"], keep="last")
ratings = ratings[ratings.duplicated(subset=['userid'], keep=False)]

# Treat every remaining interaction as an implicit positive rating.
ratings['rating'] = 1

# Strip the fixed-width string prefix from the entity IDs and keep the numeric part.
ratings['userid'] = pd.to_numeric(ratings['userid'].str[5:]).astype(int)
ratings['itemid'] = pd.to_numeric(ratings['itemid'].str[6:]).astype(int)
ratings['timestamp'] = pd.to_numeric(ratings['timestamp'])
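
To see what the de-duplication and the "users with < 2 interactions" filter above do, here is a standalone pandas sketch on synthetic rows (the IDs are made up and no Spark is needed):

import pandas as pd

toy = pd.DataFrame({'userid':    ['u1', 'u1', 'u1', 'u2'],
                    'itemid':    ['i1', 'i1', 'i2', 'i3'],
                    'timestamp': [1, 2, 3, 4]})
# Keep only the most recent copy of each user-item pair...
toy = toy.drop_duplicates(subset=['userid', 'itemid'], keep='last')
# ...then keep only users that still have at least two interactions.
toy = toy[toy.duplicated(subset=['userid'], keep=False)]
print(toy)   # u1 keeps (i1, ts=2) and (i2, ts=3); u2 had a single interaction and is dropped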