Example #1
    def test_tf_column_filter(self):
        """InputMode.TENSORFLOW TFEstimator saving temporary TFRecords, filtered by input_mapping columns"""

        # create a Spark DataFrame of training examples (features, labels)
        trainDF = self.spark.createDataFrame(self.train_examples,
                                             ['col1', 'col2'])

        # and add some extra columns
        df = trainDF.withColumn('extra1', trainDF.col1)
        df = df.withColumn('extra2', trainDF.col2)
        self.assertEqual(len(df.columns), 4)

        # train model
        args = {}
        estimator = TFEstimator(self.get_function('tf/train'), args, export_fn=self.get_function('tf/export')) \
                      .setInputMapping({'col1': 'x', 'col2': 'y_'}) \
                      .setInputMode(TFCluster.InputMode.TENSORFLOW) \
                      .setModelDir(self.model_dir) \
                      .setExportDir(self.export_dir) \
                      .setTFRecordDir(self.tfrecord_dir) \
                      .setClusterSize(self.num_workers) \
                      .setNumPS(1) \
                      .setBatchSize(10)
        model = estimator.fit(df)
        self.assertTrue(os.path.isdir(self.model_dir))
        self.assertTrue(os.path.isdir(self.tfrecord_dir))

        df_tmp = dfutil.loadTFRecords(self.sc, self.tfrecord_dir)
        self.assertEqual(df_tmp.columns, ['col1', 'col2'])
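
As a companion sketch (not part of the test): in InputMode.TENSORFLOW each worker reads the temporary TFRecords directly, so a reader along these lines could consume them. The feature names follow the columns asserted above; the scalar float dtypes and the 'tfrecords_dir' path are assumptions.

import tensorflow as tf

def parse_example(serialized):
    # one feature per retained DataFrame column; scalar float dtypes assumed
    features = {
        'col1': tf.io.FixedLenFeature([], tf.float32),
        'col2': tf.io.FixedLenFeature([], tf.float32),
    }
    return tf.io.parse_single_example(serialized, features)

# part files under the estimator's setTFRecordDir() location (path assumed)
files = tf.io.gfile.glob('tfrecords_dir/part-*')
dataset = tf.data.TFRecordDataset(files).map(parse_example).batch(10)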
Example #2
    def test_dfutils(self):
        # create a DataFrame of a single row of standard types (binary, int, int_array, float, float_array, binary)
        row1 = (bytearray(b'text string'), 1, [2, 3, 4, 5], -1.1,
                [-2.2, -3.3, -4.4, -5.5], bytearray(b'\xff\xfe\xfd\xfc'))
        rdd = self.sc.parallelize([row1])
        df1 = self.spark.createDataFrame(rdd, ['a', 'b', 'c', 'd', 'e', 'f'])
        print("schema: {}".format(df1.schema))

        # save the DataFrame as TFRecords
        dfutil.saveAsTFRecords(df1, self.tfrecord_dir)
        self.assertTrue(hdfs_util.isdir(self.tfrecord_dir))
        # self.assertTrue(os.path.isdir(self.tfrecord_dir))

        # reload the DataFrame from exported TFRecords
        df2 = dfutil.loadTFRecords(self.sc,
                                   self.tfrecord_dir,
                                   binary_features=['a', 'f'])
        row2 = df2.take(1)[0]

        print("row_saved: {}".format(row1))
        print("row_loaded: {}".format(row2))

        # confirm loaded values match original/saved values
        self.assertEqual(row1[0], row2['a'])
        self.assertEqual(row1[1], row2['b'])
        self.assertEqual(row1[2], row2['c'])
        self.assertAlmostEqual(row1[3], row2['d'], 6)
        for i in range(len(row1[4])):
            self.assertAlmostEqual(row1[4][i], row2['e'][i], 6)
        print("type(f): {}".format(type(row2['f'])))
        for i in range(len(row1[5])):
            self.assertEqual(row1[5][i], row2['f'][i])

        # check origin of each DataFrame
        self.assertFalse(dfutil.isLoadedDF(df1))
        self.assertTrue(dfutil.isLoadedDF(df2))

        # references are equivalent
        df_ref = df2
        self.assertTrue(dfutil.isLoadedDF(df_ref))

        # derived DataFrames are not recognized as loaded, even if their contents are identical
        df3 = df2.filter(df2.a == 'string_label')
        self.assertFalse(dfutil.isLoadedDF(df3))

        # re-assigned variables follow the DataFrame they now reference
        df2 = df3
        self.assertFalse(dfutil.isLoadedDF(df2))
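
For illustration, a hand-built tf.train.Example approximating what a row like row1 serializes to. The exact encoding is dfutil's concern, so the bytes/int64/float layout below is an assumption about standard TFRecord features:

import tensorflow as tf

example = tf.train.Example(features=tf.train.Features(feature={
    'a': tf.train.Feature(bytes_list=tf.train.BytesList(value=[b'text string'])),
    'b': tf.train.Feature(int64_list=tf.train.Int64List(value=[1])),
    'c': tf.train.Feature(int64_list=tf.train.Int64List(value=[2, 3, 4, 5])),
    'd': tf.train.Feature(float_list=tf.train.FloatList(value=[-1.1])),
    'e': tf.train.Feature(float_list=tf.train.FloatList(value=[-2.2, -3.3, -4.4, -5.5])),
    'f': tf.train.Feature(bytes_list=tf.train.BytesList(value=[b'\xff\xfe\xfd\xfc'])),
}))
print(len(example.SerializeToString()))

TFRecord float features are 32-bit, which is why the test compares 'd' and 'e' with assertAlmostEqual rather than assertEqual; binary_features=['a', 'f'] on load keeps those bytes features as raw binary instead of decoding them as strings.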
                        help="path to export saved_model",
                        default="mnist_export")
    parser.add_argument("--output",
                        help="HDFS path to save predictions",
                        type=str,
                        default="predictions")
    parser.add_argument("--tensorboard",
                        help="launch tensorboard process",
                        action="store_true")

    args = parser.parse_args()
    print("args:", args)

    if args.format == 'tfr':
        # load TFRecords as a DataFrame
        df = dfutil.loadTFRecords(sc, args.images_labels)
    else:  # args.format == 'csv':
        # create RDD of input data
        def parse(ln):
            vec = [int(x) for x in ln.split(',')]
            return (vec[1:], vec[0])

        images_labels = sc.textFile(args.images_labels).map(parse)
        df = spark.createDataFrame(images_labels, ['image', 'label'])

    df.show()

    if args.mode == 'train':
        estimator = TFEstimator(main_fun, args) \
            .setInputMapping({'image': 'image', 'label': 'label'}) \
            .setModelDir(args.model_dir) \
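
For a quick standalone check of the parse helper defined above (pure Python, no Spark required; the sample CSV line is made up):

def parse(ln):
    # label-first CSV layout: label, then pixel values
    vec = [int(x) for x in ln.split(',')]
    return (vec[1:], vec[0])

features, label = parse('7,0,128,255,64')
print(features)  # [0, 128, 255, 64]
print(label)     # 7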
Example #4
                        type=str)
    parser.add_argument("--train_data",
                        help="HDFS path to training data",
                        type=str)
    parser.add_argument("--validation_data",
                        help="HDFS path to validation data",
                        type=str)

    (args, rem) = parser.parse_known_args()

    input_mode = TFCluster.InputMode.SPARK if args.input_mode == 'spark' else TFCluster.InputMode.TENSORFLOW

    print("{0} ===== Start".format(datetime.now().isoformat()))

    df = dfutil.loadTFRecords(sc,
                              args.train_data,
                              binary_features=['image/encoded'])
    estimator = TFEstimator(main_fun, sys.argv, export_fn=inception_export.export) \
            .setModelDir(args.train_dir) \
            .setExportDir(args.export_dir) \
            .setTFRecordDir(args.tfrecord_dir) \
            .setClusterSize(args.cluster_size) \
            .setNumPS(args.num_ps) \
            .setInputMode(TFCluster.InputMode.TENSORFLOW) \
            .setTensorboard(args.tensorboard)

    print("{0} ===== Train".format(datetime.now().isoformat()))
    model = estimator.fit(df)
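    # (sketch, not in the original snippet) assuming TensorFlowOnSpark's TFModel
    # API, the fitted model is a Spark ML transformer, so batch inference would
    # look roughly like:
    #   preds = model.setInputMapping({...}).setOutputMapping({...}).transform(df)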

    print("{0} ===== Inference".format(datetime.now().isoformat()))
    df = dfutil.loadTFRecords(sc,