def test_parse_maxcompute_dsn(self):
    """Check that parseMaxComputeDSN splits a DSN into its four parts.

    The expected tuple is (access id, access key, endpoint URL, project).
    """
    # DSN layout exercised here:
    #   <access_id>:<access_key>@<host>/<path>?curr_project=<project>&scheme=<scheme>
    dsn = ("access_id:access_key@maxcompute-service.com/api"
           "?curr_project=test_ci&scheme=http")
    expected = ("access_id", "access_key",
                "http://maxcompute-service.com/api", "test_ci")
    self.assertEqual(expected, parseMaxComputeDSN(dsn))
def pai_maxcompute_input_fn():
    """Build a feature-only dataset over a single MaxCompute table.

    Takes no arguments; ``pai_table``, ``datasource``,
    ``feature_column_names`` and ``feature_metas`` are read from the
    enclosing scope.

    Returns:
        The ``tf.data.TableRecordDataset`` mapped so that each element is a
        dict from feature column name to that column's tensor reshaped to
        one dimension.

    Raises:
        ValueError: if ``pai_table`` contains more than one ``"."``.
    """
    pieces = pai_table.split(".")
    if len(pieces) not in (1, 2):
        raise ValueError("error database.table format: %s" % pai_table)

    if len(pieces) == 2:
        database, table_name = pieces
    else:
        # No explicit database in the table name: take the last element
        # returned by parseMaxComputeDSN for the datasource instead.
        _, dsn = datasource.split("://")
        database = parseMaxComputeDSN(dsn)[-1]
        table_name = pai_table

    tables = ["odps://%s/tables/%s" % (database, table_name)]

    # One default value per selected column, in column order.
    record_defaults = [
        tf.constant(0,
                    dtype=get_dtype(feature_metas[col]["dtype"]),
                    shape=feature_metas[col]["shape"])
        for col in feature_column_names
    ]

    dataset = tf.data.TableRecordDataset(
        tables,
        record_defaults=record_defaults,
        selected_cols=",".join(feature_column_names))

    def tensor_to_dict(*args):
        # args arrive positionally in the same order as feature_column_names.
        return {
            col: tf.reshape(args[position], [-1])
            for position, col in enumerate(feature_column_names)
        }

    return dataset.map(tensor_to_dict)
def pai_maxcompute_input_fn(pai_table,
                            datasource,
                            feature_column_names,
                            feature_metas,
                            label_meta,
                            num_workers=1,
                            worker_id=0):
    """Build a (features, label) dataset over a single MaxCompute table.

    Args:
        pai_table: table name, either ``"table"`` or ``"database.table"``.
        datasource: connection string of the form ``"<driver>://<dsn>"``;
            only consulted when ``pai_table`` has no database part.
        feature_column_names: list of feature column names to read.
        feature_metas: dict keyed by feature name; each entry provides
            ``"dtype"`` and ``"shape"``.
        label_meta: dict providing ``"feature_name"``, ``"dtype"`` and
            ``"shape"`` for the label column.
        num_workers: forwarded as ``slice_count``; ``0`` is treated as ``1``.
        worker_id: forwarded as ``slice_id``.

    Returns:
        The dataset mapped to ``(features_dict, label)`` pairs, where
        ``features_dict`` maps each feature name to its tensor reshaped to
        one dimension.

    Raises:
        ValueError: if ``pai_table`` contains more than one ``"."``.
    """
    # NOTE(typhoonzero): datasource is only used to get current selected
    # maxcompute project(database).
    pieces = pai_table.split(".")
    if len(pieces) not in (1, 2):
        raise ValueError("error database.table format: %s" % pai_table)

    if len(pieces) == 2:
        database, table_name = pieces
    else:
        _, dsn = datasource.split("://")
        database = parseMaxComputeDSN(dsn)[-1]
        table_name = pai_table

    tables = ["odps://%s/tables/%s" % (database, table_name)]

    # Defaults follow the selected column order: every feature, then the label.
    record_defaults = [
        tf.constant(0,
                    dtype=get_dtype(feature_metas[col]["dtype"]),
                    shape=feature_metas[col]["shape"])
        for col in feature_column_names
    ]
    label_default = tf.constant(0,
                                get_dtype(label_meta["dtype"]),
                                shape=label_meta["shape"])
    record_defaults.append(label_default)

    # Work on a copy so the caller's feature list is left untouched.
    columns = copy.copy(feature_column_names)
    columns.append(label_meta["feature_name"])

    slice_count = 1 if num_workers == 0 else num_workers

    dataset = tf.data.TableRecordDataset(tables,
                                         record_defaults=record_defaults,
                                         selected_cols=",".join(columns),
                                         slice_id=worker_id,
                                         slice_count=slice_count)

    def tensor_to_dict(*args):
        # The label is the positional value right after the last feature.
        feature_count = len(feature_column_names)
        label = args[feature_count]
        features = {
            col: tf.reshape(args[position], [-1])
            for position, col in enumerate(feature_column_names)
        }
        return features, label

    return dataset.map(tensor_to_dict)
def pai_maxcompute_input_fn(datasetStr):
    """Build a dataset over one MaxCompute table, parsed by parse_sparse_feature.

    ``datasource``, ``pai_table``, ``feature_column_names``,
    ``feature_metas`` and ``label_meta`` are read from the enclosing scope.

    Args:
        datasetStr: accepted for interface compatibility; not used in the
            body.

    Returns:
        The ``tf.data.TableRecordDataset`` mapped through
        ``parse_sparse_feature`` (bound to the feature names and metas).
    """
    _, dsn = datasource.split("://")
    _, _, _, database = parseMaxComputeDSN(dsn)
    tables = ["odps://%s/tables/%s" % (database, pai_table)]

    # Build a fresh list: appending the label to feature_column_names itself
    # would mutate the shared list, so the loop below would treat the label
    # as a feature and every later call would add the label once more.
    selected_cols = list(feature_column_names) + [label_meta["name"]]

    # Defaults follow the selected column order: every feature, then the label.
    record_defaults = []
    for name in feature_column_names:
        dtype = get_dtype(feature_metas[name]["dtype"])
        record_defaults.append(
            tf.constant(0, dtype=dtype, shape=feature_metas[name]["shape"]))
    record_defaults.append(
        tf.constant(0,
                    get_dtype(label_meta["dtype"]),
                    shape=label_meta["shape"]))

    # Column names are handed over as one comma-separated string.
    dataset = tf.data.TableRecordDataset(tables,
                                         record_defaults=record_defaults,
                                         selected_cols=",".join(selected_cols))
    ds_mapper = functools.partial(parse_sparse_feature,
                                  feature_column_names=feature_column_names,
                                  feature_metas=feature_metas)
    return dataset.map(ds_mapper)
def pai_maxcompute_input_fn(pai_table,
                            datasource,
                            feature_column_names,
                            feature_metas,
                            label_meta,
                            num_workers=1,
                            worker_id=0,
                            map_to_dict=True):
    """Build a (features, label) dataset over a single MaxCompute table.

    Args:
        pai_table: table name, either ``"table"`` or ``"database.table"``.
        datasource: connection string of the form ``"<driver>://<dsn>"``;
            only consulted when ``pai_table`` has no database part.
        feature_column_names: list of feature column names to read.
        feature_metas: dict keyed by feature name; each entry provides
            ``"dtype"``, ``"delimiter"`` and ``"shape"``.
        label_meta: dict providing ``"feature_name"``, ``"dtype"`` and
            ``"shape"`` for the label column.
        num_workers: forwarded as ``slice_count``; ``0`` is treated as ``1``.
        worker_id: forwarded as ``slice_id``.
        map_to_dict: when true, map elements to ``(features_dict, label)``;
            otherwise map ``dataset.as_numpy()`` to
            ``(feature_list, label)`` using ``.eval()`` on each value.

    Returns:
        The mapped dataset, in the form selected by ``map_to_dict``.

    Raises:
        ValueError: if ``pai_table`` contains more than one ``"."``.
    """
    # NOTE(typhoonzero): datasource is only used to get current selected
    # maxcompute project(database).
    name_parts = pai_table.split(".")
    if len(name_parts) not in (1, 2):
        raise ValueError("error database.table format: %s" % pai_table)

    if len(name_parts) == 2:
        database, table_name = name_parts
    else:
        _, dsn = datasource.split("://")
        database = parseMaxComputeDSN(dsn)[-1]
        table_name = pai_table

    tables = ["odps://%s/tables/%s" % (database, table_name)]

    # Every feature default is declared with shape [1]; the "shape" entry of
    # the feature meta is not used for the defaults.
    record_defaults = []
    for col in feature_column_names:
        col_meta = feature_metas[col]
        col_dtype = get_dtype(col_meta["dtype"])
        if col_meta["delimiter"] == "":
            default = tf.constant(0, dtype=col_dtype, shape=[1])
        else:
            # Delimited columns are read as strings and decoded later.
            default = tf.constant("", dtype=tf.string, shape=[1])
        record_defaults.append(default)
    label_default = tf.constant(0,
                                get_dtype(label_meta["dtype"]),
                                shape=label_meta["shape"])
    record_defaults.append(label_default)

    # Work on a copy so the caller's feature list is left untouched.
    columns = copy.copy(feature_column_names)
    columns.append(label_meta["feature_name"])

    slice_count = 1 if num_workers == 0 else num_workers

    dataset = paiio.TableRecordDataset(tables,
                                       record_defaults=record_defaults,
                                       selected_cols=",".join(columns),
                                       slice_id=worker_id,
                                       slice_count=slice_count)

    def tensor_to_dict(*args):
        # The label is the positional value right after the last feature.
        feature_count = len(feature_column_names)
        label = args[feature_count]
        features = {}
        for position, col in enumerate(feature_column_names):
            col_meta = feature_metas[col]
            if col_meta["delimiter"] == "":
                tensor = tf.reshape(args[position], [-1])
            else:
                # process as CSV
                col_dtype = get_dtype(col_meta["dtype"])
                # FIXME(typhoonzero): when shape has multiple dimensions, do
                # not use field_meta["shape"][0]
                csv_defaults = [
                    tf.constant(0, dtype=col_dtype, shape=[1])
                    for _ in range(col_meta["shape"][0])
                ]
                tensor = tf.io.decode_csv(args[position],
                                          csv_defaults,
                                          field_delim=col_meta["delimiter"])
            features[col] = tensor
        return features, label

    def tensor_to_list(*args):
        feature_count = len(feature_column_names)
        label = args[feature_count]
        # Features are evaluated first, then the label.
        evaluated = [value.eval() for value in args[:feature_count]]
        return evaluated, label.eval()

    if not map_to_dict:
        return dataset.as_numpy().map(tensor_to_list)
    return dataset.map(tensor_to_dict)