Code example #1
# Imports assumed for this excerpt (standard AWS Glue / PySpark job preamble).
import pytz
from datetime import datetime, timedelta

from pyspark.context import SparkContext
from pyspark.sql import functions as f
from pyspark.sql.functions import udf, from_unixtime
from pyspark.sql.types import LongType

from awsglue.context import GlueContext
from awsglue.dynamicframe import DynamicFrame
from awsglue.transforms import Filter, ApplyMapping, ResolveChoice


def main():
    sc = SparkContext()
    glueContext = GlueContext(sc)
    spark = glueContext.spark_session
    spark.conf.set("spark.sql.session.timeZone", "GMT+07:00")

    ho_chi_minh_timezone = pytz.timezone('Asia/Ho_Chi_Minh')
    today = datetime.now(ho_chi_minh_timezone)
    today = today.replace(hour=0, minute=0, second=0, microsecond=0)
    first_day_of_month = today.replace(day=1)
    print('today: ', today)
    yesterday = today - timedelta(1)
    print('yesterday: ', yesterday)
    today_id = long(today.strftime("%Y%m%d"))
    yesterday_id = long(yesterday.strftime("%Y%m%d"))
    today_id_0h00 = long(today.strftime("%s"))
    print('today_id: ', today_id)
    print('yesterday_id: ', yesterday_id)
    print('today_id_0h00: ', today_id_0h00)

    date_end = 1573232400L
    General = 'General'
    Vocabulary = 'Vocabulary'
    Grammar = 'Grammar'
    Speaking = 'Speaking'
    Listening = 'Listening'
    Phrasal_Verb = 'Phrasal'
    Pronunciation = 'Pronunciation'

    # Question-category list from the original comments:
    #   Speaking, 2 General, 3 Phrasal Verb, 4 Grammar,
    #   5 Vocabulary, 6 Pronunciation, 7 Listening

    is_dev = True
    is_just_monthly_exam = False
    is_limit_test = False

    start_load_date = 0L

    BEHAVIOR_ID_TEST_TUAN = 22L
    BEHAVIOR_ID_TEST_THANG = 23L

    PERIOD_DAYLY = 1L
    PERIOD_WEEKLY = 2L
    PERIOD_MONTHLY = 3L

    def doCheckClassID(code):
        if code is None:
            return None
        code = str(code)
        if code == General:
            return 61L
        if code == Vocabulary:
            return 62L
        if code == Grammar:
            return 63L
        if code == Speaking:
            return 64L
        if code == Listening:
            return 65L
        if code == Pronunciation:
            return 66L
        if Phrasal_Verb in code:
            return 67L
        return None

    check_class_id = udf(doCheckClassID, LongType())

    # ------------------------------------------------------------------------------------------------------------------#
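    # Load weekly/monthly test behaviours (behavior_id 22 or 23) from the Glue catalog,
    # pushing the partition filter down so only those partitions are read.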
    my_partition_predicate = "(behavior_id=='22' or behavior_id=='23')"
    dyf_student_behavior = glueContext.create_dynamic_frame.from_catalog(
        database="od_student_behavior",
        table_name="student_behavior",
        push_down_predicate=my_partition_predicate)

    dyf_student_behaviors = dyf_student_behavior.resolveChoice(
        specs=[('behavior_id', 'cast:long'), ('transformed_at', 'cast:long')])

    # try:
    #     # read the flag checkpoint from S3
    #     df_flag = spark.read.parquet("s3://dts-odin/flag/flag_student_testing_history.parquet")
    #     max_key = df_flag.collect()[0]['flag']
    #     print('read from index: ', max_key)
    #
    #     # compare the datasource _key with the flag and keep only rows with key > flag
    #     dyf_student_behaviors = Filter.apply(frame=dyf_student_behaviors, f=lambda x: x['transformed_at'] > max_key)
    # except:
    #     print('read flag error ')

    if dyf_student_behaviors.count() > 0:

        dyf_student_behaviors = Filter.apply(
            frame=dyf_student_behaviors,
            f=lambda x: x["student_behavior_id"] is not None and x[
                "student_id"] is not None
            # and x["behavior_id"] in [BEHAVIOR_ID_TEST_TUAN,
            #                          BEHAVIOR_ID_TEST_THANG
            #                          ]
            and start_load_date <= x["student_behavior_date"] < today_id_0h00)

        number_dyf_student_behavior = dyf_student_behaviors.count()
        print('number_dyf_student_behavior after filtering: ',
              number_dyf_student_behavior)
        if number_dyf_student_behavior == 0:
            return

        dyf_student_behavior = dyf_student_behaviors \
            .select_fields(['student_behavior_id',
                            'student_behavior_date',
                            'student_id',
                            'behavior_id'])

        df_student_behavior = dyf_student_behavior.toDF()
        df_student_behavior = df_student_behavior.drop_duplicates(
            ['student_behavior_id'])
        if is_limit_test:
            df_student_behavior = df_student_behavior.limit(1000)

        df_student_behavior = df_student_behavior.repartition('behavior_id')
        df_student_behavior.cache()

        student_behavior_number = df_student_behavior.count()

        if is_dev:
            print('dy_student_behavior')
            print('student_behavior_number: ', student_behavior_number)
            df_student_behavior.printSchema()
            df_student_behavior.show(3)

        if student_behavior_number == 0:
            return

        # ------------------------------------------------------------------------------------------------------------------#
        dyf_student_test_mark = glueContext.create_dynamic_frame.from_catalog(
            database="od_student_behavior",
            table_name="student_test_mark",
            push_down_predicate=my_partition_predicate)

        dyf_student_test_mark = dyf_student_test_mark.select_fields(
            ['student_behavior_id', 'question_category', 'grade'])

        # dyf_student_test_mark = Filter.apply(frame=dyf_student_test_mark,
        #                                     f=lambda x: x["behavior_id"] in [BEHAVIOR_ID_TEST_TUAN,
        #                                                                      BEHAVIOR_ID_TEST_THANG
        #                                                                      ]
        #                                     )

        df_student_test_mark = dyf_student_test_mark.toDF()

        number_student_test_mark = df_student_test_mark.count()

        if is_dev:
            print('df_student_test_mark')
            print('df_student_test_mark: ', number_student_test_mark)
            df_student_test_mark.printSchema()
            df_student_test_mark.show(3)

        if number_student_test_mark == 0:
            return

        df_student_behavior_mark = df_student_behavior\
            .join(df_student_test_mark,
                    on='student_behavior_id',
                    how='left')

        if is_dev:
            print('df_student_behavior_mark')
            print('df_student_behavior_mark: ', df_student_behavior_mark)
            df_student_behavior_mark.printSchema()
            df_student_behavior_mark.show(3)

        df_student_behavior_mark = df_student_behavior_mark.dropDuplicates([
            'student_behavior_id', 'student_id', 'behavior_id',
            'question_category'
        ])

        df_student_behavior_mark_week = df_student_behavior_mark\
            .filter(df_student_behavior_mark.behavior_id == BEHAVIOR_ID_TEST_TUAN)
        df_student_behavior_mark_month = df_student_behavior_mark.filter(
            df_student_behavior_mark.behavior_id == BEHAVIOR_ID_TEST_THANG)


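        # Derive aggregation keys from the epoch-second student_behavior_date:
        # weekly tests get a yyyyww bucket, monthly tests a yyyyMM bucket.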
        df_student_behavior_mark_week = df_student_behavior_mark_week\
            .withColumn('agg_week_id',  from_unixtime(df_student_behavior_mark_week.student_behavior_date, "yyyyww"))

        df_student_behavior_mark_month = df_student_behavior_mark_month \
            .withColumn('agg_month_id',
                        from_unixtime(df_student_behavior_mark_month.student_behavior_date, "yyyyMM"))

        if is_dev:
            print('df_student_behavior_mark_week')
            df_student_behavior_mark_week.printSchema()
            df_student_behavior_mark_week.show(3)

            print('df_student_behavior_mark_month')
            df_student_behavior_mark_month.printSchema()
            df_student_behavior_mark_month.show(3)

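        # Map each question_category to a class_id via the UDF, then keep the best
        # (max, rounded) grade per student, week and class.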
        df_student_behavior_mark_week = df_student_behavior_mark_week \
            .withColumn("class_id", check_class_id(df_student_behavior_mark_week.question_category))

        df_student_behavior_mark_week_agg = df_student_behavior_mark_week.groupby(
            'student_id', 'agg_week_id', 'class_id').agg(
                f.round(f.max(df_student_behavior_mark_week.grade)).cast(
                    'long').alias('grade_total'),
                f.lit(PERIOD_WEEKLY).alias('period_type_id'),
                f.lit(None).cast('string').alias('agg_date_id'),
                f.lit(None).cast('string').alias('agg_month_id'))

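        # Monthly tests: sum the per-category grades of each attempt, then keep the
        # best attempt per student and month under the fixed class_id 68.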
        df_student_behavior_mark_month = df_student_behavior_mark_month.na.fill(
            {'grade': 0})

        df_student_behavior_mark_month = df_student_behavior_mark_month.groupby(
            'student_behavior_id').agg(
                f.first('student_id').alias('student_id'),
                f.first('agg_month_id').alias('agg_month_id'),
                f.round(
                    f.sum('grade')).cast('long').alias('grade_total_attempt'),
            )

        df_student_behavior_mark_month_agg = df_student_behavior_mark_month.groupby(
            'student_id', 'agg_month_id').agg(
                f.max(
                    df_student_behavior_mark_month.grade_total_attempt).alias(
                        'grade_total'),
                f.lit(PERIOD_MONTHLY).alias('period_type_id'),
                f.lit(None).cast('string').alias('agg_date_id'),
                f.lit(None).cast('string').alias('agg_week_id'),
                f.lit(68L).cast('long').alias('class_id'))

        df_student_behavior_mark_month_agg = df_student_behavior_mark_month_agg.select(
            'student_id', 'agg_week_id', 'class_id', 'grade_total',
            'period_type_id', 'agg_date_id', 'agg_month_id')

        if is_dev:
            print('df_student_behavior_mark_week_agg')
            df_student_behavior_mark_week_agg.printSchema()
            df_student_behavior_mark_week_agg.show(3)

            print('df_student_behavior_mark_month_agg')
            df_student_behavior_mark_month_agg.printSchema()
            df_student_behavior_mark_month_agg.show(3)

        df_student_behavior_mark_agg = df_student_behavior_mark_week_agg.union(
            df_student_behavior_mark_month_agg)

        if is_dev:
            print('df_student_behavior_mark_agg')
            df_student_behavior_mark_agg.printSchema()
            df_student_behavior_mark_agg.show(3)

        dyf_student_behavior_mark_agg = DynamicFrame.fromDF(
            df_student_behavior_mark_agg, glueContext,
            'dyf_student_behavior_mark_agg')
        dyf_student_behavior_mark_agg = Filter.apply(
            frame=dyf_student_behavior_mark_agg,
            f=lambda x: x["class_id"] is not None)
        dyf_student_behavior_mark_agg.show(3)
        apply_output_month = ApplyMapping.apply(
            frame=dyf_student_behavior_mark_agg,
            mappings=[("student_id", "long", "student_id", "long"),
                      ("class_id", "long", "class_id", "long"),
                      ("period_type_id", "long", "period_type_id", "long"),
                      ("agg_date_id", "string", "created_date_id", "long"),
                      ("agg_week_id", "string", "created_week_id", "long"),
                      ("agg_month_id", "string", "created_month_id", "long"),
                      ("grade_total", "long", "measure1", "long")])

        dfy_output_month = ResolveChoice.apply(
            frame=apply_output_month,
            choice="make_cols",
            transformation_ctx="resolvechoice2")

        datasink4 = glueContext.write_dynamic_frame.from_jdbc_conf(
            frame=dfy_output_month,
            catalog_connection="nvn_knowledge",
            connection_options={
                "dbtable": "student_learning_history",
                "database": "nvn_knowledge_v2"
            },
            redshift_tmp_dir=
            "s3n://dtsodin/temp/nvn_knowledge_v2/student_learning_history",
            transformation_ctx="datasink4")

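        # Persist the high-water mark (max transformed_at) so the next run only
        # processes newer records.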
        df_temp = dyf_student_behaviors.toDF()
        flag = df_temp.agg({"transformed_at": "max"}).collect()[0][0]

        flag_data = [flag]
        df = spark.createDataFrame(flag_data, "long").toDF('flag')
        # overwrite the _key flag on S3
        df.write.parquet(
            "s3a://dts-odin/flag/flag_student_testing_history.parquet",
            mode="overwrite")
Code example #2
    )

    if ctx.job_name == 'chief':
        print("Exporting saved_model to {}".format(args.export_dir))
        classifier.export_saved_model(args.export_dir,
                                      serving_input_receiver_fn)


if __name__ == "__main__":

    from pyspark.context import SparkContext
    from pyspark.conf import SparkConf
    from tensorflowonspark import TFCluster
    import argparse

    sc = SparkContext(conf=SparkConf().setAppName("mnist_estimator"))
    executors = sc._conf.get("spark.executor.instances")
    num_executors = int(executors) if executors is not None else 1

    parser = argparse.ArgumentParser()
    parser.add_argument("--batch_size",
                        help="number of records per batch",
                        type=int,
                        default=64)
    parser.add_argument("--buffer_size",
                        help="size of shuffle buffer",
                        type=int,
                        default=10000)
    parser.add_argument("--cluster_size",
                        help="number of nodes in the cluster",
                        type=int,
Code example #3
                cnt += 1
                # output_file.write("lbl: {} pred: {}\n".format(truelbl,np.argmax(p) ))
                if truelbl != np.argmax(p):
                    output_file.write("lbl: {} pred: {}\n".format(
                        truelbl, np.argmax(p)))
        except tf.errors.OutOfRangeError:
            break

    output_file.close()


if __name__ == '__main__':
    from pyspark.context import SparkContext
    from pyspark.conf import SparkConf
    import argparse

    sc = SparkContext(conf=SparkConf().setAppName("mnist_inference"))
    executors = sc._conf.get("spark.executor.instances")
    num_executors = int(executors) if executors is not None else 1

    parser = argparse.ArgumentParser()
    parser.add_argument("--cluster_size",
                        help="number of nodes in the cluster (for Spark Standalone)",
                        type=int,
                        default=num_executors)
    parser.add_argument('--images_labels',
                        type=str,
                        help='Directory for input images with labels')
    parser.add_argument("--export",
                        help="HDFS path to export model",
Code example #4
File: regression.py  Project: sushmitkarar/spark
        """
        Gets the value of stepSize or its default value.
        """
        return self.getOrDefault(self.stepSize)


class GBTRegressionModel(TreeEnsembleModels):
    """
    Model fitted by GBTRegressor.

    .. versionadded:: 1.4.0
    """


if __name__ == "__main__":
    import doctest
    from pyspark.context import SparkContext
    from pyspark.sql import SQLContext
    globs = globals().copy()
    # The small batch size here ensures that we see multiple batches,
    # even in these small test examples:
    sc = SparkContext("local[2]", "ml.regression tests")
    sqlContext = SQLContext(sc)
    globs['sc'] = sc
    globs['sqlContext'] = sqlContext
    (failure_count, test_count) = doctest.testmod(globs=globs,
                                                  optionflags=doctest.ELLIPSIS)
    sc.stop()
    if failure_count:
        exit(-1)
Code example #5
                        required=True,
                        help='hdfs path to output data')

    existing_model_group = parser.add_mutually_exclusive_group(required=True)
    existing_model_group.add_argument(
        '--model_pkl',
        dest='model_pkl',
        type=str,
        default=None,
        help='a pickled LOPQModel to evaluate on the data')
    existing_model_group.add_argument(
        '--model_proto',
        dest='model_proto',
        type=str,
        default=None,
        help='a protobuf LOPQModel to evaluate on the data')

    args = parser.parse_args()

    sc = SparkContext(appName='LOPQ code computation')

    # Load UDF module if provided
    if args.data_udf:
        udf_module = __import__(args.data_udf, fromlist=['udf'])
        load_udf = udf_module.udf
        main(sc, args, data_load_fn=load_udf)
    else:
        main(sc, args)

    sc.stop()
Code example #6
        >>> algo.getInitSteps()
        10
        """
        self._paramMap[self.initSteps] = value
        return self

    @since("1.5.0")
    def getInitSteps(self):
        """
        Gets the value of `initSteps`
        """
        return self.getOrDefault(self.initSteps)


if __name__ == "__main__":
    import doctest
    from pyspark.context import SparkContext
    from pyspark.sql import SQLContext
    globs = globals().copy()
    # The small batch size here ensures that we see multiple batches,
    # even in these small test examples:
    sc = SparkContext("local[2]", "ml.clustering tests")
    sqlContext = SQLContext(sc)
    globs['sc'] = sc
    globs['sqlContext'] = sqlContext
    (failure_count, test_count) = doctest.testmod(globs=globs,
                                                  optionflags=doctest.ELLIPSIS)
    sc.stop()
    if failure_count:
        exit(-1)
Code example #7
                                    y_: batch_ys
                                })))

                        if sv.is_chief:
                            summary_writer.add_summary(summary, step)

            if sv.should_stop() or step >= args.steps:
                tf_feed.terminate()

        # Ask for all the services to stop.
        print("{0} stopping supervisor".format(datetime.now().isoformat()))
        sv.stop()


if __name__ == '__main__':
    import argparse
    from pyspark.context import SparkContext
    from pyspark.conf import SparkConf
    from pyspark.sql import HiveContext

    sc = SparkContext(conf=SparkConf().setAppName("read hdfs save to hdfs "))
    hive_context = HiveContext(sc)
    executors = sc._conf.get("spark.executor.instances")
    num_executors = int(executors) if executors is not None else 1
    num_ps = 1

    parser = argparse.ArgumentParser()
    parser.add_argument("-i", "--input", help="input hdfs path")
    parser.add_argument(
        "-m",
        "--model",
        help="HDFS path to save/load model during train/inference",
        default="mnist_model")
    parser.add_argument("-tb",
                        "--tensorboard",
                        help="launch tensorboard process",
Code example #8
                           steps_per_epoch=steps_per_epoch,
                           callbacks=callbacks)

    from tensorflow_estimator.python.estimator.export import export_lib
    export_dir = export_lib.get_timestamped_export_dir(args.export_dir)
    compat.export_saved_model(multi_worker_model, export_dir,
                              ctx.job_name == 'chief')


if __name__ == '__main__':
    import argparse
    from pyspark.context import SparkContext
    from pyspark.conf import SparkConf
    from tensorflowonspark import TFCluster

    sc = SparkContext(conf=SparkConf().setAppName("mnist_keras"))
    executors = sc._conf.get("spark.executor.instances")
    num_executors = int(executors) if executors is not None else 1

    parser = argparse.ArgumentParser()
    parser.add_argument("--batch_size",
                        help="number of records per batch",
                        type=int,
                        default=64)
    parser.add_argument("--buffer_size",
                        help="size of shuffle buffer",
                        type=int,
                        default=10000)
    parser.add_argument("--cluster_size",
                        help="number of nodes in the cluster",
                        type=int,
Code example #9
File: tuning.py  Project: sgaviner/spark-1
    def copy(self, extra=None):
        """
        Creates a copy of this instance with a randomly generated uid
        and some extra params. This copies the underlying bestModel,
        creates a deep copy of the embedded paramMap, and
        copies the embedded and extra parameters over.
        :param extra: Extra parameters to copy to the new instance
        :return: Copy of this instance
        """
        if extra is None:
            extra = dict()
        return CrossValidatorModel(self.bestModel.copy(extra))


if __name__ == "__main__":
    import doctest
    from pyspark.context import SparkContext
    from pyspark.sql import SQLContext
    globs = globals().copy()
    # The small batch size here ensures that we see multiple batches,
    # even in these small test examples:
    sc = SparkContext("local[2]", "ml.tuning tests")
    sqlContext = SQLContext(sc)
    globs['sc'] = sc
    globs['sqlContext'] = sqlContext
    (failure_count, test_count) = doctest.testmod(globs=globs,
                                                  optionflags=doctest.ELLIPSIS)
    sc.stop()
    if failure_count:
        exit(-1)
Code example #10
import argparse

from pyspark.context import SparkContext
from pyspark.conf import SparkConf

import redis_logger_handler


def parseFile(images_path, labels_path, fmt):
    if fmt == "csv":
        images = sc.textFile(images_path).map(
            lambda ln: [int(x) for x in ln.split(',')])
        labels = sc.textFile(labels_path).map(
            lambda ln: [int(x) for x in ln.split(',')])
    else:
        images = sc.pickleFile(images_path)
        labels = sc.pickleFile(labels_path)
    return images, labels


sc = SparkContext(conf=SparkConf().setAppName("lstm_ctc_ocr_spark"))
executors = sc._conf.get("spark.executor.instances")
num_executors = int(executors) if executors is not None else 1
num_ps = 1

parser = argparse.ArgumentParser()
parser.add_argument("-b",
                    "--batch_size",
                    help="number of records per batch",
                    type=int,
                    default=64)
parser.add_argument("-e",
                    "--epochs",
                    help="number of epochs",
                    type=int,
                    default=1)
Code example #11
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

from pyspark.context import SparkContext
from pyspark.conf import SparkConf

import argparse
from datetime import datetime

from tensorflowonspark import TFCluster

import criteo_dist

if __name__ == "__main__":
    sc = SparkContext(conf=SparkConf().setAppName("criteo_spark"))
    executors = sc._conf.get("spark.executor.instances")
    if executors is None:
        raise Exception(
            "Could not retrieve the number of executors from the SparkContext")
    num_executors = int(executors)
    num_ps = 1

    parser = argparse.ArgumentParser()
    parser.add_argument("-b",
                        "--batch_size",
                        help="number of records per batch",
                        type=int,
                        default=100)
    parser.add_argument("-e",
                        "--epochs",
Code example #12
def main():
    sc = SparkContext()
    glueContext = GlueContext(sc)
    spark = glueContext.spark_session
    spark.conf.set("spark.sql.session.timeZone", "GMT+07:00")
    # get dynamic frame source

    #------------------------------------------------------------------------------------------------------------------#
    dyf_native_talk = glueContext.create_dynamic_frame.from_catalog(database='native_talk',
                                                                table_name='native_talk_history_log_api')

    dyf_native_talk = dyf_native_talk.resolveChoice(specs=[('id', 'cast:long')])

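    # Incremental load: read the last processed id from the S3 flag file and keep only newer rows.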
    try:
        df_flag = spark.read.parquet("s3a://dts-odin/flag/student_status/tu_hoc/tu_hoc_native_talk.parquet")
        read_from_index = df_flag.collect()[0]['flag']
        print('read from index: ', read_from_index)
        dyf_native_talk = Filter.apply(frame=dyf_native_talk,
                                       f=lambda x: x["id"] > read_from_index)
    except:
        print('read flag file error ')

    dyf_native_talk = dyf_native_talk.select_fields(
        ['id', 'learning_date', 'speaking_dialog_score', 'username', 'updated_time'])

    dy_cache = dyf_native_talk.toDF()
    dy_cache = dy_cache.cache()
    dyf_native_talk = DynamicFrame.fromDF(dy_cache, glueContext, 'dyf_native_talk')

    print('dy_cache------------')
    dy_cache.printSchema()
    print('dy_cache: ', dy_cache.count())
    dy_cache.show(2)

    #------------------------------------------------------------------------------------------------------------------#

    if (dyf_native_talk.count() > 0):

        #---------------------------------------------------------datasource0-----------------------------------------------------#
        dyf_native_talk = Filter.apply(frame=dyf_native_talk,
                                              f=lambda x: x["username"] is not None and x["username"] != ''
                                                          and x["speaking_dialog_score"] is not None
                                                          and x["learning_date"] is not None and x["learning_date"] != '')
        # ----------------------------------datasource1---------------------------------------------------------------------------#
        if (dyf_native_talk.count() > 0):
            dyf_nt_account_mapping = glueContext.create_dynamic_frame.from_catalog(database='native_talk',
                                                                        table_name='native_talk_account_mapping')

            dyf_nt_account_mapping = dyf_nt_account_mapping.select_fields(['contact_id', 'username']).rename_field('username', 'nativetalk_user')
            dy_cache_2 = dyf_nt_account_mapping.toDF()
            dy_cache_2 = dy_cache_2.cache()
            dyf_nt_account_mapping = DynamicFrame.fromDF(dy_cache_2, glueContext, 'dyf_nt_account_mapping')

            dyf_nt_account_mapping = Filter.apply(frame=dyf_nt_account_mapping,
                                                  f=lambda x: x["nativetalk_user"] is not None and x["nativetalk_user"] != '')
            # ----------------------------------datasource1---------------------------------------------------------------------------#

            # -------------------------------------------------------------------------------------------------------------#
            join = Join.apply(dyf_native_talk, dyf_nt_account_mapping, 'username', 'nativetalk_user')
            if(join.count() > 0):
                df_nativetalk = join.toDF()
                df_nativetalk = df_nativetalk.withColumn('sogio', f.lit(0.083333))  # 5 minutes, expressed in hours
                df_nativetalk = df_nativetalk.withColumn('id_time',
                                                         from_unixtime(
                                                             unix_timestamp(df_nativetalk.learning_date, "yyyy-MM-dd"),
                                                             "yyyyMMdd"))
                df_nativetalk = df_nativetalk.where("contact_id IS NOT NULL")

                data_nativetalk = DynamicFrame.fromDF(df_nativetalk, glueContext, 'data_nativetalk')
                data_nativetalk = data_nativetalk.resolveChoice(specs=[('sogio', 'cast:float')])
                # -------------------------------------------------------------------------------------------------------------#
                print('data_nativetalk----------')
                data_nativetalk.printSchema()


                # build the "fact_hieusuathoctap" (study-performance) fact table
                df_hieusuathoctap = data_nativetalk.toDF()
                # per contact_id and day (id_time): total study hours and number of study sessions
                df_hieusuathoctap = df_hieusuathoctap.groupby('contact_id', 'id_time').agg(f.sum('sogio'),
                                                                                               f.count('contact_id'))

                df_hieusuathoctap = df_hieusuathoctap.withColumn('tu_hoc_type_id', f.lit(400))
                data_hieusuathoctap = DynamicFrame.fromDF(df_hieusuathoctap, glueContext, 'data_hieusuathoctap')
                data_hieusuathoctap = data_hieusuathoctap.resolveChoice(specs=[('sum(sogio)', 'cast:double')])

                print('data_hieusuathoctap::data_hieusuathoctap::data_hieusuathoctap------------------------------------------')
                data_hieusuathoctap.printSchema()

                applymapping2 = ApplyMapping.apply(frame=data_hieusuathoctap,
                                                   mappings=[("contact_id", "string", "contact_id", "string"),
                                                             ("id_time", 'string', 'id_time', 'bigint'),
                                                             ("count(contact_id)", 'long', 'soca', 'int'),
                                                             ("sum(sogio)", 'double', 'sogio', 'double'),
                                                             ("tu_hoc_type_id", 'int', "tu_hoc_type_id", "int")])


                resolvechoice2 = ResolveChoice.apply(frame=applymapping2, choice="make_cols",
                                                     transformation_ctx="resolvechoice2")
                dropnullfields2 = DropNullFields.apply(frame=resolvechoice2, transformation_ctx="dropnullfields2")

                print('dropnullfields2 number: ', dropnullfields2.count())

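                # Stage the rows in Redshift; the postactions then merge them into
                # mapping_changed_status_student and drop the staging table.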
                datasink2 = glueContext.write_dynamic_frame.from_jdbc_conf(frame=dropnullfields2,
                                                                           catalog_connection="glue_redshift",
                                                                           connection_options={"dbtable": "temp_staging_lich_su_tu_hoc_native_talk",
                                                                                               "database": "dts_odin",
                                                                                               "postactions": """INSERT into mapping_changed_status_student(user_id, change_status_date_id, to_status_id, measure1, measure2)
                                                                                                                            SELECT um.user_id, hwb.id_time, 53, hwb.soca, round(hwb.sogio, 4)
                                                                                                                            FROM temp_staging_lich_su_tu_hoc_native_talk hwb
                                                                                                                            LEFT JOIN user_map um
                                                                                                                                ON um.source_type = 1
                                                                                                                                AND um.source_id = hwb.contact_id;
                                                                                                                 DROP TABLE IF EXISTS public.temp_staging_lich_su_tu_hoc_native_talk    
                                                                                                                """
                                                                                               },
                                                                           redshift_tmp_dir="s3n://dts-odin/temp/tu-hoc/hwb/",
                                                                           transformation_ctx="datasink2")

                df_datasource = dyf_native_talk.toDF()
                flag = df_datasource.agg({"id": "max"}).collect()[0][0]
                print('flag: ', flag)
                flag_data = [flag]
                df = spark.createDataFrame(flag_data, "long").toDF('flag')
                df.write.parquet("s3a://dts-odin/flag/student_status/tu_hoc/tu_hoc_native_talk.parquet", mode="overwrite")
                dy_cache.unpersist()
                dy_cache_2.unpersist()
Code example #13
SOURCE_SYSTEM = ARGS['source_system']  # must
JSON_FILE_NAME = TARGET_TABLE + \
    "_" + SOURCE_SYSTEM  # ar_invc_hdr_f_must
STAGE_TABLE = ARGS['rs_stage_table']  # ar_invc_hdr_f_stage_must
CTLG_CONNECTION = ARGS['glue_conn']  # TestRedshift3
REDSHIFTDB = ARGS['rs_db']  # usinnovationredshift
S3_BUCKET = ARGS['bkt_name']  # "odp-us-innovation-raw"
MD5_COLUMN_SCD1 = TARGET_TABLE + "_md5_scd1"  # ar_invc_hdr_f_md5_scd1
TARGET_TABLE_COLUMNS = ARGS['target_cols']  # As per DDL(col1,col2,col3)
STAGE_TABLE_COLUMNS = ARGS['stage_cols']  # As per DDL(col1,col2,col3)
DBTABLE_STG = STAGE_DATABASE_NAME + "." + STAGE_TABLE

URL = ARGS["jdbc_url"]
IAM_ROLE = ARGS["iam_role"]

SC = SparkContext()
GLUECONTEXT = GlueContext(SC)
SPARK = GLUECONTEXT.spark_session
JOB = Job(GLUECONTEXT)
JOB.init(ARGS['JOB_NAME'], ARGS)
RUN_ID = ARGS['JOB_RUN_ID']
JOB_NAME = ARGS['JOB_NAME']
TEMPDIR = ARGS['TempDir']

SRC_NOTEMPTY = True
try:
    # @type: DataSource
    # @args: [database = "db_mrr_must",
    ## table_name = "billing"
    # transformation_ctx = "billing_df"]
    # @return: DynamicFrame
Code example #14
                    tags=[tag_constants.SERVING],
                    signature_def_map={'predict': signature},
                    clear_devices=True)
                builder.save()

            if args.input_mode == 'spark':
                tf_feed.terminate()


if __name__ == '__main__':
    import argparse
    from pyspark.context import SparkContext
    from pyspark.conf import SparkConf
    from tensorflowonspark import TFCluster

    sc = SparkContext(conf=SparkConf().setAppName("mnist_mlp"))
    executors = sc._conf.get("spark.executor.instances")
    num_executors = int(executors) if executors is not None else 1
    num_ps = 1

    parser = argparse.ArgumentParser()
    parser.add_argument("--cluster_size",
                        help="number of nodes in the cluster",
                        type=int,
                        default=num_executors)
    parser.add_argument("--epochs",
                        help="number of epochs of training data",
                        type=int,
                        default=20)
    parser.add_argument("--export_dir", help="directory to export saved_mode")
    parser.add_argument(
Code example #15
import sys
import time

from pyspark import SparkContext


def dohash(value):
    return abs(hash(value))


if __name__ == "__main__":
    # ensure number of inputs is 4: py file, input files, output files
    if len(sys.argv) != 4:
        print(
            "This script requires 3 input arguments to run: 1 inputFile and 1 outputFile"
        )

        # break it
        sys.exit(1)

    # create an interface between pyspark and spark server
    sc = SparkContext('local[*]')

    # to simplify output
    # sc.setLogLevel("ERROR")
    """
    DEFAULT APPROACH
    """

    # start timer
    startTimer1 = time.time()

    # get input file and import into the SparkContext object
    task2Input1 = sc.textFile(sys.argv[1])

    answer1, builder1 = task2Processor(task2Input1)
Code example #16
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

from pyspark.context import SparkContext
from pyspark.conf import SparkConf

import argparse
import numpy
import tensorflow as tf
from datetime import datetime

from tensorflowonspark import TFCluster
import wiki_dist

sc = SparkContext(conf=SparkConf().setAppName("wiki_spark"))
executors = sc._conf.get("spark.executor.instances")
num_executors = int(executors) if executors is not None else 1
num_ps = 1

parser = argparse.ArgumentParser()
parser.add_argument("--batch_size",
                    help="number of records per batch",
                    type=int,
                    default=100)
parser.add_argument("--epochs", help="number of epochs", type=int, default=1)
parser.add_argument("--format",
                    help="example format: (csv|pickle|tfr)",
                    choices=["csv", "pickle", "tfr"],
                    default="csv")
parser.add_argument("--images",
Code example #17
File: mylfm.py  Project: ChenHaoyang/lfm
'''
Created on 2015/12/08

@author: charles
'''
from pyspark.mllib.recommendation import ALS, MatrixFactorizationModel, Rating
from pyspark.context import SparkContext

sc = SparkContext("local")
# Load and parse the data
data = sc.textFile("data/mllib/als/test.data")
ratings = data.map(lambda l: l.split(',')).map(
    lambda l: Rating(int(l[0]), int(l[1]), float(l[2])))

# Build the recommendation model using Alternating Least Squares
rank = 10
numIterations = 10
model = ALS.train(ratings, rank, numIterations)

# Evaluate the model on training data
testdata = ratings.map(lambda p: (p[0], p[1]))
predictions = model.predictAll(testdata).map(lambda r: ((r[0], r[1]), r[2]))
ratesAndPreds = ratings.map(lambda r: ((r[0], r[1]), r[2])).join(predictions)
MSE = ratesAndPreds.map(lambda r: (r[1][0] - r[1][1])**2).mean()
print("Mean Squared Error = " + str(MSE))

# Save and load model
model.save(sc, "myModelPath")
sameModel = MatrixFactorizationModel.load(sc, "myModelPath")
Code example #18
import calendar
import datetime
import pdb

from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.sql.types import StringType
from pyspark.sql.functions import udf, lit, col


def findDay(date):
    ref = [
        'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday',
        'Sunday'
    ]
    born = datetime.datetime.strptime(date, '%d %m %Y').weekday()
    return ref[int(born)]


sc = SparkContext('local')
# spark = SparkSession(sc)
sql = SQLContext(sc)

sample_udf = udf(lambda x: findDay(x), StringType())

# df = sql.read.csv("KCcrime2010To2018.csv", inferSchema = True, header = True)
df = sql.read.csv("joined_date - Copy.csv", inferSchema=True, header=True)

df.createTempView(name='kc_crime')

pdb.set_trace()

#getting the number of crimes in 2010 count by month
df2 = sql.sql(
    "select Reported_month, count(1) from kc_crime where Reported_year = '2010' group by Reported_month order by Reported_month"
Code example #19
from __future__ import print_function
from pyspark.conf import SparkConf
from pyspark.context import SparkContext




config = SparkConf()
config.setAppName("SPARK_WORD_COUNT_JOB")
config.setMaster("local[*]")

sc = SparkContext(conf=config)
sc.setLogLevel("info")
text_file_rdd = sc.textFile("/home/dharshekthvel/history_1.txt")
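# Classic word count: tokenize each line, emit (word, 1) pairs, then sum the counts per word.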
flat_mapped_rdd=text_file_rdd.flatMap(lambda each: each.split(' '))
mapped_rdd = flat_mapped_rdd.map(lambda each: (each,1))
mapped_rdd.reduceByKey(lambda x,y: x+y)\
    .foreach(print)
Code example #20
def main():
    sc = SparkContext()
    glueContext = GlueContext(sc)
    spark = glueContext.spark_session
    # job = Job(glueContext)
    # job.init(args['JOB_NAME'], args)
    spark.conf.set("spark.sql.session.timeZone", "GMT+07:00")

    dyf_care_call = glueContext.create_dynamic_frame.from_catalog(
        database='tig_advisor', table_name='care_call')

    dyf_care_call = dyf_care_call.resolveChoice(specs=[('_key', 'cast:long')])
    # print schema and select fields
    print('original schema')
    dyf_care_call.printSchema()
    dyf_care_call.show(10)

    # try:
    #     df_flag = spark.read.parquet("s3a://dts-odin/flag/student_status/temp_ls_a1_dong_tien_tc.parquet")
    #     read_from_index = df_flag.collect()[0]['flag']
    #     print('read from index: ', read_from_index)
    #     dyf_care_call = Filter.apply(frame=dyf_care_call,
    #                                            f=lambda x: x["_key"] > read_from_index)
    # except:
    #     print('read flag file error ')
    # print('the number of new contacts: ', dyf_care_call.count())

    dyf_care_call = dyf_care_call.select_fields(
        ['_key', 'id', 'phone', 'duration', 'call_status',
         'time_created']).rename_field('time_created', 'call_date')

    dy_source_care_call_cache = dyf_care_call.toDF()
    dy_source_care_call_cache = dy_source_care_call_cache.dropDuplicates(
        ['id'])
    dy_source_care_call_cache = dy_source_care_call_cache.cache()
    dyf_care_call = DynamicFrame.fromDF(dy_source_care_call_cache, glueContext,
                                        'dyf_care_call')
    #
    if (dyf_care_call.count() > 0):
        dyf_care_call = Filter.apply(
            frame=dyf_care_call,
            f=lambda x: x["phone"] is not None and x["phone"] != '' and
            (x["call_status"] == 'success' or x["call_status"] ==
             'call_success') and x["call_date"] is not None and x["call_date"]
            != '' and x["duration"] is not None and x["duration"] > 30)
        #
        print('dyf_care_call::corrcect')
        print('dyf_care_call number', dyf_care_call.count())
        if (dyf_care_call.count() > 0):

            dyf_ad_contact_phone = glueContext.create_dynamic_frame.from_catalog(
                database='tig_advisor', table_name='student_contact_phone')

            dyf_ad_contact_phone = dyf_ad_contact_phone.select_fields(
                ['phone', 'contact_id'])

            dyf_ad_contact_phone = Filter.apply(
                frame=dyf_ad_contact_phone,
                f=lambda x: x["phone"] is not None and x["phone"] != '' and x[
                    "contact_id"] is not None and x["contact_id"] != '')

            print('dyf_ad_contact_phone::schema')
            dyf_ad_contact_phone.printSchema()

            #         dyf_advisor_ip_phone = glueContext.create_dynamic_frame.from_catalog(database='callcenter',
            #                                                                              table_name='advisor_ip_phone')
            #
            #         dyf_advisor_ip_phone = Filter.apply(frame=dyf_advisor_ip_phone,
            #                                             f=lambda x: x["ip_phone"] is not None and x["ip_phone"] != '')
            #
            #
            #
            #
            #
            #
            #-----------------------------------------------------------------------------------------------------------#

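            # Join successful care calls to student contacts on phone number.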
            join_call_contact = Join.apply(dyf_care_call, dyf_ad_contact_phone,
                                           'phone', 'phone')
            # join_call_contact = join_call_contact.select_fields(['id_time', 'answertime', 'calldate', 'phonenumber_correct', 'calldate', 'ipphone', 'contact_id'])
            # print('join_call_contact::schema------------')
            join_call_contact.printSchema()
            join_call_contact.show(2)
            print('join: ', join_call_contact.count())
            #
            #
            #         #-----------------------------------------------------------------------------------------------------------#
            #
            dyf_source_ls_dong_tien = glueContext.create_dynamic_frame.from_catalog(
                database='poss', table_name='nvn_poss_lich_su_dong_tien')

            dyf_source_ls_dong_tien = Filter.apply(
                frame=dyf_source_ls_dong_tien,
                f=lambda x: x["contact_id"] is not None and x["contact_id"] !=
                '' and x["ngay_thanhtoan"] is not None and x["ngay_thanhtoan"
                                                             ] != '')

            dyf_source_ls_dong_tien = dyf_source_ls_dong_tien.select_fields([
                '_key', 'id', 'contact_id', 'ngay_thanhtoan', 'ngay_tao',
                'makh'
            ]).rename_field('ngay_tao', 'ngay_a0')

            dy_source_ls_dt_cache = dyf_source_ls_dong_tien.toDF()
            dy_source_ls_dt_cache = dy_source_ls_dt_cache.dropDuplicates(
                ['id'])
            dy_source_ls_dt_cache = dy_source_ls_dt_cache.cache()
            dyf_source_ls_dong_tien = DynamicFrame.fromDF(
                dy_source_ls_dt_cache, glueContext, 'dyf_source_ls_dong_tien')
            #
            join_call_contact_ao = Join.apply(join_call_contact,
                                              dyf_source_ls_dong_tien,
                                              'contact_id', 'contact_id')
            #
            print('join_call_contact_ao::schema------------')
            join_call_contact_ao.printSchema()
            join_call_contact_ao.show(2)
            print('join: ', join_call_contact_ao.count())
            #
            #         # join_call_contact_ao = join_call_contact_ao.resolveChoice(specs=[('calldate', 'cast:timestamp'),
            #         #                                                                  ('ngay_a0', 'cast:timestamp')])
            #
            #
            join_call_contact_ao = Filter.apply(
                frame=join_call_contact_ao,
                f=lambda x: x["call_date"] is not None and x[
                    "ngay_a0"] is not None and x["call_date"] > x["ngay_a0"])
            #
            print(
                'join_call_contact_ao::after filter calldate > ngay_a0------------'
            )
            # join_call_contact_ao.printSchema()
            join_call_contact_ao.show(2)
            print('join_call_contact_ao: ', join_call_contact_ao.count())
            #
            # get the history of successful welcome calls (ngay_a1)
            df_join_call_contact_ao = join_call_contact_ao.toDF()
            df_join_call_contact_ao = df_join_call_contact_ao.groupby(
                'contact_id', 'makh').agg(f.min('call_date').alias("ngay_a1"))

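            # ngay_a1 is the earliest successful call made after ngay_a0;
            # id_time is its yyyyMMdd date key.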
            df_join_call_contact_ao = df_join_call_contact_ao.withColumn(
                'id_time',
                from_unixtime(
                    unix_timestamp(df_join_call_contact_ao.ngay_a1,
                                   "yyyy-MM-dd HH:mm:ss"), "yyyyMMdd"))
            dyf_result = DynamicFrame.fromDF(df_join_call_contact_ao,
                                             glueContext, 'dyf_result')
            #
            #         print('dyf_result------------')
            # join_call_contact_ao.printSchema()
            dyf_result.show(2)
            print('dyf_result: ', dyf_result.count())
            #
            #
            #
            #
            # choose the fields to map
            applymapping1 = ApplyMapping.apply(
                frame=dyf_result,
                mappings=[("contact_id", "string", "contact_id", "string"),
                          ("id_time", "string", "id_time", "bigint"),
                          ("makh", "int", "makh", "int"),
                          ("ngay_a1", "string", "ngay_a1", "timestamp")])
            #
            resolvechoice2 = ResolveChoice.apply(
                frame=applymapping1,
                choice="make_cols",
                transformation_ctx="resolvechoice2")
            dropnullfields3 = DropNullFields.apply(
                frame=resolvechoice2, transformation_ctx="dropnullfields3")

            # print('dropnullfields3::printSchema')
            # dropnullfields3.printSchema()
            # dropnullfields3.show(2)

            # write the data to Redshift
            datasink4 = glueContext.write_dynamic_frame.from_jdbc_conf(
                frame=dropnullfields3,
                catalog_connection="glue_redshift",
                connection_options={
                    "dbtable":
                    "temp_ls_dong_tien_a1_v3",
                    "database":
                    "dts_odin",
                    "postactions":
                    """
                                                                                        INSERT into mapping_changed_status_student(description, user_id, change_status_date_id, to_status_id, timestamp1)
                                                                                        SELECT 'contact_id: ' + temp_a1.contact_id +' - makh: ' + temp_a1.makh, um.user_id ,temp_a1.id_time, 2, temp_a1.ngay_a1
                                                                                        FROM temp_ls_dong_tien_a1_v3 temp_a1
                                                                                        LEFT JOIN user_map um
                                                                                             ON um.source_type = 1
                                                                                             AND um.source_id = temp_a1.contact_id
                                                                                        ;
                                                                                        DROP TABLE IF EXISTS public.temp_ls_dong_tien_a1_v3;
                                                                                        CALL update_a1_exception_from_eg()
                                                                           """
                },
                redshift_tmp_dir="s3n://dts-odin/temp/temp_ls_dong_tien/v2",
                transformation_ctx="datasink4")
            df_datasource = dyf_care_call.toDF()
            flag = df_datasource.agg({"_key": "max"}).collect()[0][0]
            flag_data = [flag]
            df = spark.createDataFrame(flag_data, "long").toDF('flag')
            df.write.parquet(
                "s3a://dts-odin/flag/student_status/temp_ls_a1_dong_tien_tc.parquet",
                mode="overwrite")
            dy_source_care_call_cache.unpersist()
Code example #21
    return sorted(connectedGraph, key=lambda x: -len(x))


def calculateModularity(communities, modularityDict):
    Q = 0
    for c in communities:
        for i in c:
            for j in c:
                Q += modularityDict[(i, j)]
    return Q / (2 * M)


if __name__ == "__main__":

    startTime = time.time()
    sc = SparkContext('local[*]', '578task')
    sc.setLogLevel("WARN")

    input_file = "edgeList.csv"
    output_file = "output_cluster.csv"

    rdd = sc.textFile(input_file)

    edges = rdd.map(lambda x: x.split(","))\
        .map(lambda x: (x[0], x[1]))

    M = edges.count()
    print("edgesNumber:", M)

    # record every users' neighbor, (user1, [user2, user3...])
    userNeighbor = edges.flatMap(lambda x: [(x[0], [x[1]]), (x[1], [x[0]])]
Code example #22
# coding: utf-8

# In[1]:


from pyspark.context import SparkContext
from pyspark.sql import SparkSession
from pyspark import SparkConf
conf = SparkConf()
sc = SparkContext('local', conf=conf)
spark = SparkSession(sc)


# In[2]:


import pyspark.sql.functions as F
from pyspark.ml.feature import HashingTF, IDF, Tokenizer
from pyspark.ml.clustering import KMeans


# In[3]:


spark.read.json('C:/Users/maxen/Downloads/Test_DS/Test_DS/Brisbane_CityBike.json').show(150,False)


# In[42]:

Code example #23
# Imports assumed for this excerpt; in the original this is presumably a pytest fixture.
import logging

from pyspark import SparkContext


def spark_context(request):
    sc = SparkContext('local', 'tests_practicas_spark')
    request.addfinalizer(lambda: sc.stop())
    logger = logging.getLogger('py4j')
    logger.setLevel(logging.WARN)
    return sc
Code example #24
# Imports assumed for this excerpt.
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession


def stablish_spark_connection():
    # SparkConf's first argument is loadDefaults, not the master URL,
    # so the master has to be set explicitly.
    sConf = SparkConf().setMaster("spark://localhost:7077")
    sc = SparkContext(conf=sConf)
    spark = SparkSession(sc)

    return sConf, sc, spark
Code example #25
File: mnist_spark.py  Project: wucng/Tensorflow
from pyspark.context import SparkContext
from pyspark.conf import SparkConf

import argparse
import os
import numpy
import sys
import tensorflow as tf
import threading
import time
from datetime import datetime

# from com.yahoo.ml.tf import TFCluster
from tensorflowonspark import TFCluster
import mnist_dist

sc = SparkContext(
    conf=SparkConf().setAppName("mnist_spark"))  # the app name "mnist_spark" can be changed as needed
executors = sc._conf.get("spark.executor.instances")  # number of Spark executor (worker) instances
num_executors = int(executors) if executors is not None else 1
num_ps = 1  # number of parameter-server (ps) nodes

parser = argparse.ArgumentParser()
parser.add_argument("-b",
                    "--batch_size",
                    help="number of records per batch",
                    type=int,
                    default=128)  # number of samples per training step
parser.add_argument("-e",
                    "--epochs",
                    help="number of epochs",
                    type=int,
                    default=1)  # total number of epochs over the data
Code example #26
if sparkVersion.isImportAllPackageUnderSparkSql():
    java_import(gateway.jvm, "org.apache.spark.sql.*")
    java_import(gateway.jvm, "org.apache.spark.sql.hive.*")
else:
    java_import(gateway.jvm, "org.apache.spark.sql.SQLContext")
    java_import(gateway.jvm, "org.apache.spark.sql.hive.HiveContext")
    java_import(gateway.jvm, "org.apache.spark.sql.hive.LocalHiveContext")
    java_import(gateway.jvm, "org.apache.spark.sql.hive.TestHiveContext")

java_import(gateway.jvm, "scala.Tuple2")

_zcUserQueryNameSpace = {}

jconf = intp.getSparkConf()
conf = SparkConf(_jvm=gateway.jvm, _jconf=jconf)
sc = _zsc_ = SparkContext(jsc=jsc, gateway=gateway, conf=conf)
_zcUserQueryNameSpace["_zsc_"] = _zsc_
_zcUserQueryNameSpace["sc"] = sc

if sparkVersion.isSpark2():
    spark = __zSpark__ = SparkSession(sc, intp.getSparkSession())
    sqlc = __zSqlc__ = __zSpark__._wrapped
    _zcUserQueryNameSpace["sqlc"] = sqlc
    _zcUserQueryNameSpace["__zSqlc__"] = __zSqlc__
    _zcUserQueryNameSpace["spark"] = spark
    _zcUserQueryNameSpace["__zSpark__"] = __zSpark__
else:
    sqlc = __zSqlc__ = SQLContext(sparkContext=sc,
                                  sqlContext=intp.getSQLContext())
    _zcUserQueryNameSpace["sqlc"] = sqlc
    _zcUserQueryNameSpace["__zSqlc__"] = sqlc
Code example #27
File: evaluation.py  Project: yshthdn/spark
    @keyword_only
    def setParams(self,
                  predictionCol="prediction",
                  labelCol="label",
                  metricName="f1"):
        """
        setParams(self, predictionCol="prediction", labelCol="label", \
                  metricName="f1")
        Sets params for multiclass classification evaluator.
        """
        kwargs = self.setParams._input_kwargs
        return self._set(**kwargs)


if __name__ == "__main__":
    import doctest
    from pyspark.context import SparkContext
    from pyspark.sql import SQLContext
    globs = globals().copy()
    # The small batch size here ensures that we see multiple batches,
    # even in these small test examples:
    sc = SparkContext("local[2]", "ml.evaluation tests")
    sqlContext = SQLContext(sc)
    globs['sc'] = sc
    globs['sqlContext'] = sqlContext
    (failure_count, test_count) = doctest.testmod(globs=globs,
                                                  optionflags=doctest.ELLIPSIS)
    sc.stop()
    if failure_count:
        exit(-1)
Code example #28
File: recommendation.py  Project: terrytong0876/spark
        `features`
        """
        return self._call_java("userFactors")

    @property
    @since("1.4.0")
    def itemFactors(self):
        """
        a DataFrame that stores item factors in two columns: `id` and
        `features`
        """
        return self._call_java("itemFactors")


if __name__ == "__main__":
    import doctest
    import pyspark.ml.recommendation
    from pyspark.context import SparkContext
    from pyspark.sql import SQLContext
    globs = pyspark.ml.recommendation.__dict__.copy()
    # The small batch size here ensures that we see multiple batches,
    # even in these small test examples:
    sc = SparkContext("local[2]", "ml.recommendation tests")
    sqlContext = SQLContext(sc)
    globs['sc'] = sc
    globs['sqlContext'] = sqlContext
    (failure_count, test_count) = doctest.testmod(globs=globs, optionflags=doctest.ELLIPSIS)
    sc.stop()
    if failure_count:
        exit(-1)
Code example #29
import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
import boto3

target_format = "parquet"

## @params: [JOB_NAME]
args = getResolvedOptions(
    sys.argv,
    ['JOB_NAME', 'DL_BUCKET', 'DL_PREFIX', 'DL_REGION', 'GLUE_SRC_DATABASE'])

sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session

job = Job(glueContext)

dataLakeBucket = args["DL_BUCKET"]
dataLakePrefix = args["DL_PREFIX"]
aws_region = args["DL_REGION"]
glue_database = args["GLUE_SRC_DATABASE"]

job.init(args['JOB_NAME'], args)

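# List the tables registered in the source Glue database; the rest of the job
# (not shown here) presumably converts each one to parquet in the data-lake bucket.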
client = boto3.client(service_name='glue', region_name=aws_region)
responseGetTables = client.get_tables(DatabaseName=glue_database)
Code example #30
                    "gameweek": i['gameweek'],
                    "goals": i['goals'],
                    "own_goals": i['own_goals'],
                    "yellow_cards": i['yellow_cards'],
                    "red_cards": i['red_cards']
                }
                json_object = json.dumps(dictionary, indent=4)
                # Writing to sample.json
                with open("output_req_2.json", "w") as outfile:
                    print("Writing....to JSON")
                    outfile.write(json_object)
                break


if __name__ == "__main__":
    sp_context = SparkContext('local[2]', "UI")
    sp_sess = SparkSession.builder.appName('user_input').getOrCreate()
    sp_context.addFile("model.py")
    input_file = sys.argv[1]
    with open(input_file, 'r') as file:
        content = file.read()
        input_data = eval(content)
        if input_data["req_type"] == 1:
            # calling predict function:
            """
            output = predict(input_)
            """
            predict_helper(input_data)

        elif input_data["req_type"] == 2:
            # calling profile function