def train_test_split(unrolled, test_size):
    cutPoint = unrolled.count() - test_size
    train = unrolled.filter(lambda x: x.index < cutPoint) \
        .map(lambda x: Sample.from_ndarray(np.array(x.feature), np.array(x.label)))
    test = unrolled.filter(lambda x: x.index >= cutPoint) \
        .map(lambda x: Sample.from_ndarray(np.array(x.feature), np.array(x.label)))
    return [train, test]
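# --- Usage sketch (hedged): train_test_split above expects an RDD whose records
# expose `index`, `feature` and `label` attributes. `UnrolledRecord` is a
# hypothetical record type introduced only for illustration; `sc` is assumed to
# be an existing SparkContext, with `np` and `Sample` imported as in the snippet.
class UnrolledRecord(object):
    def __init__(self, index, feature, label):
        self.index = index
        self.feature = feature
        self.label = label

records = [UnrolledRecord(i, np.random.rand(10), [float(i % 2)]) for i in range(100)]
unrolled = sc.parallelize(records)

train_rdd, test_rdd = train_test_split(unrolled, test_size=20)
print(train_rdd.count(), test_rdd.count())  # expected: 80 20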
def gen_rand_user_item_feature(user_num, item_num, class_num):
    user_id = random.randint(1, user_num)
    item_id = random.randint(1, item_num)
    rating = random.randint(1, class_num)
    sample = Sample.from_ndarray(np.array([user_id, item_id]), np.array([rating]))
    return UserItemFeature(user_id, item_id, sample)
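# --- Usage sketch (hedged): build a small RDD of random user-item pairs with the
# helper above. Assumes `sc`, `random`, `np`, `Sample` and `UserItemFeature` are
# already in scope, and that UserItemFeature exposes the wrapped Sample as `.sample`
# (an assumption; adjust the attribute name to your UserItemFeature definition).
user_num, item_num, class_num = 200, 100, 5

pair_feature_rdd = sc.parallelize(range(1000)) \
    .map(lambda _: gen_rand_user_item_feature(user_num, item_num, class_num))
sample_rdd = pair_feature_rdd.map(lambda pair: pair.sample)
print(sample_rdd.count())  # expected: 1000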
def to_sample(vectors, label, embedding_dim):
    # flatten nested list
    flatten_features = list(itertools.chain(*vectors))
    features = np.array(flatten_features, dtype='float').reshape(
        [sequence_len, embedding_dim])
    return Sample.from_ndarray(features, np.array(label))
def get_featureset(x, y, shuffle=True):
    x = np.split(x.data.numpy(), x.shape[0])
    y = np.split(y.data.numpy(), y.shape[0])
    print(x[0].shape)
    print(y[0].shape)
    samples = [Sample.from_ndarray(np.squeeze(x[i]), np.squeeze(y[i]))
               for i in range(len(x))]
    sample_rdd = sc.parallelize(samples)
    return FeatureSet.sample_rdd(sample_rdd, shuffle=shuffle)
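# --- Usage sketch (hedged): feed random PyTorch tensors through get_featureset.
# Assumes `torch` is available in addition to the `np`, `sc`, `Sample` and
# `FeatureSet` names the function already relies on; shapes are illustrative.
import torch

x = torch.randn(64, 10)   # 64 feature vectors
y = torch.randn(64, 5)    # 64 five-dimensional targets

train_set = get_featureset(x, y, shuffle=True)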
def to_sample(vectors, label, embedding_dim):
    # flatten nested list
    flatten_features = list(itertools.chain(*vectors))
    features = np.array(flatten_features, dtype='float').reshape(
        [sequence_len, embedding_dim])
    if model_type.lower() == "cnn":
        features = features.transpose(1, 0)
    return Sample.from_ndarray(features, np.array(label))
def to_sample(data):
    from bigdl.util.common import Sample
    data = check_type_and_convert(data, allow_list=True, allow_tuple=False)
    features = data["x"]
    labels = data["y"]
    length = features[0].shape[0]
    for i in range(length):
        fs = [feat[i] for feat in features]
        ls = [l[i] for l in labels]
        yield Sample.from_ndarray(np.array(fs), np.array(ls))
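# --- Usage sketch (hedged): the generator above yields one Sample per batch row.
# Assumes `check_type_and_convert` and `np` are in scope and that `data` uses the
# {"x": [...], "y": [...]} layout with the batch as the first dimension. The per-row
# feature arrays are stacked with np.array, so they must share a shape.
data = {
    "x": [np.random.rand(32, 10), np.random.rand(32, 10)],  # two feature arrays, batch of 32
    "y": [np.random.randint(0, 2, size=(32, 1))],
}

samples = list(to_sample(data))
print(len(samples))  # expected: 32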
def to_sample_rdd(x, y, numSlices=None):
    """
    Convert x and y into RDD[Sample].

    :param x: ndarray whose first dimension is the batch dimension
    :param y: ndarray whose first dimension is the batch dimension
    :param numSlices: number of partitions for the resulting RDD
    :return: RDD[Sample]
    """
    from bigdl.util.common import Sample
    x_rdd = sc.parallelize(x, numSlices)
    y_rdd = sc.parallelize(y, numSlices)
    return x_rdd.zip(y_rdd).map(lambda item: Sample.from_ndarray(item[0], item[1]))
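# --- Usage sketch (hedged): `sc` is assumed to be the module-level SparkContext the
# function refers to; the arrays are random placeholders with the batch dimension first.
x = np.random.rand(128, 28, 28).astype('float32')   # 128 feature tensors
y = np.random.randint(0, 10, size=(128, 1)).astype('float32')

sample_rdd = to_sample_rdd(x, y, numSlices=4)
print(sample_rdd.count())  # expected: 128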
def row_to_sample(row, column_info, model_type="wide_n_deep"):
    wide_tensor = get_wide_tensor(row, column_info)
    deep_tensor = JTensor.from_ndarray(get_deep_tensor(row, column_info))
    label = row[column_info.label]
    model_type = model_type.lower()
    if model_type == "wide_n_deep":
        feature = [wide_tensor, deep_tensor]
    elif model_type == "wide":
        feature = wide_tensor
    elif model_type == "deep":
        feature = deep_tensor
    else:
        raise TypeError("Unsupported model_type: %s" % model_type)
    return Sample.from_jtensor(feature, label)
def row_to_sample(row, column_info, model_type="wide_n_deep"):
    """
    Convert a row to a Sample given the column feature information of a WideAndDeep model.

    :param row: Row of userId, itemId, features and label
    :param column_info: ColumnFeatureInfo specifying the different features
    :param model_type: one of "wide_n_deep", "wide" or "deep"
    :return: TensorSample as input for the WideAndDeep model
    """
    wide_tensor = get_wide_tensor(row, column_info)
    deep_tensor = get_deep_tensors(row, column_info)
    deep_tensors = [JTensor.from_ndarray(ele) for ele in deep_tensor]
    label = row[column_info.label]
    model_type = model_type.lower()
    if model_type == "wide_n_deep":
        feature = [wide_tensor] + deep_tensors
    elif model_type == "wide":
        feature = wide_tensor
    elif model_type == "deep":
        feature = deep_tensors
    else:
        raise TypeError("Unsupported model_type: %s" % model_type)
    return Sample.from_jtensor(feature, label)
import os
import numpy as np
import matplotlib.pyplot as plt
from pyspark import SparkContext
from bigdl.util.common import init_engine
from bigdl.util.common import create_spark_conf
from bigdl.util.common import JavaCreator
from bigdl.util.common import Sample
from vision.image3d.transformation import *
import h5py
from math import pi

img_path = os.path.abspath(__file__ + "/../../resources/image_3d/a.mat")
sample = h5py.File(img_path)['meniscus_im']
sample = np.array(sample)
sample = Sample.from_ndarray(features=sample, label=np.array(-1))
# sample = np.expand_dims(sample, 0)
print(sample.features[0].shape)

sc = SparkContext(appName="test", conf=create_spark_conf())
JavaCreator.set_creator_class(
    "com.intel.analytics.zoo.transform.vision.image3d.python.api.VisionPythonBigDL")
init_engine()

data_rdd = sc.parallelize([sample])

start_loc = [13, 80, 125]
patch = [5, 40, 40]
# end_loc = [17, 119, 164]
def prepare_data(sc, folder, vocabsize, training_split):
    if not folder.startswith('hdfs://'):
        file = download_data(folder)
    else:
        file = folder
    sentences_rdd = sc.textFile(file) \
        .map(lambda line: sentence.sentences_split(line))
    pad_sent = sentences_rdd.flatMap(lambda x: x). \
        map(lambda sent: sentence.sentences_bipadding(sent))
    tokens = pad_sent.map(lambda pad: sentence.sentence_tokenizer(pad))
    train_tokens, val_tokens = tokens.randomSplit(
        [training_split, 1 - training_split])
    train_tokens.cache()
    val_tokens.cache()

    train_max_len = train_tokens.map(lambda x: len(x)).max()
    print("max length %s" % train_max_len)
    words = train_tokens.flatMap(lambda x: x)
    print("%s words and %s sentences processed in train data"
          % (words.count(), train_tokens.count()))

    val_max_len = val_tokens.map(lambda x: len(x)).max()
    print("val max length %s" % val_max_len)
    val_words = val_tokens.flatMap(lambda x: x)
    print("%s words and %s sentences processed in validation data"
          % (val_words.count(), val_tokens.count()))

    sort_words = words.map(lambda w: (w, 1)) \
        .reduceByKey(lambda a, b: a + b) \
        .sortBy(lambda w_c: w_c[1])
    vocabulary = np.array(sort_words.map(lambda w: w[0]).collect())

    fre_len = vocabulary.size
    if vocabsize > fre_len:
        length = fre_len
    else:
        length = vocabsize
    discard_vocab = vocabulary[:fre_len - length]
    used_vocab = vocabulary[fre_len - length:fre_len]
    used_vocab_size = used_vocab.size
    index = np.arange(used_vocab_size)
    index2word = dict(enumerate(used_vocab))
    word2index = dict(zip(used_vocab, index))
    total_vocab_len = used_vocab_size + 1
    startIdx = word2index.get("SENTENCESTART")
    endIdx = word2index.get("SENTENCEEND")

    def text2labeled(sent):
        indexes = [word2index.get(x, used_vocab_size) for x in sent]
        data = indexes[0:-1]
        label = indexes[1:len(indexes)]
        return data, label

    def labeled2onehotformat(labeled_sent):
        label = [x + 1 for x in labeled_sent[1]]
        size = len(labeled_sent[0])
        feature_onehot = np.zeros(size * total_vocab_len,
                                  dtype='int').reshape([size, total_vocab_len])
        for i, el in enumerate(labeled_sent[0]):
            feature_onehot[i, el] = 1
        return feature_onehot, label

    def padding(features, label, length):
        pad_len = length - len(label)
        padded_label = (label + [startIdx] * length)[:length]
        feature_padding = np.zeros((pad_len, total_vocab_len), dtype='int')
        feature_padding[:, endIdx + 1] = np.ones(pad_len)
        padded_features = np.concatenate((features, feature_padding), axis=0)
        return padded_features, padded_label

    sample_rdd = train_tokens.map(lambda sentence_te: text2labeled(sentence_te)) \
        .map(lambda labeled_sent: labeled2onehotformat(labeled_sent)) \
        .map(lambda x: padding(x[0], x[1], train_max_len)) \
        .map(lambda vectors_label: Sample.from_ndarray(vectors_label[0],
                                                       np.array(vectors_label[1]))).cache()

    val_sample_rdd = val_tokens.map(lambda sentence_t: text2labeled(sentence_t)) \
        .map(lambda labeled_sent: labeled2onehotformat(labeled_sent)) \
        .map(lambda x: padding(x[0], x[1], val_max_len)) \
        .map(lambda vectors_label: Sample.from_ndarray(vectors_label[0],
                                                       np.array(vectors_label[1]))).cache()

    return sample_rdd, val_sample_rdd, total_vocab_len
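# --- Usage sketch (hedged): a call to prepare_data, assuming the module-level
# `sentence` utilities, `download_data`, `np` and `Sample` it relies on are imported
# and `sc` is a live SparkContext. The folder path is a placeholder.
train_rdd, val_rdd, vocab_len = prepare_data(
    sc, folder="/tmp/rnn_data", vocabsize=4000, training_split=0.8)

print("vocabulary size (incl. unknown token): %d" % vocab_len)
print("train samples: %d, validation samples: %d"
      % (train_rdd.count(), val_rdd.count()))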
def get_rdd(x, y, shuffle=False):
    x = np.split(x.data.numpy(), x.shape[0])
    y = np.split(y.data.numpy(), y.shape[0])
    samples = [Sample.from_ndarray(np.squeeze(x[i]), np.squeeze(y[i]))
               for i in range(len(x))]
    sample_rdd = sc.parallelize(samples)
    return sample_rdd
def normalizer(mean, std):
    """
    Normalize features by subtracting the mean and dividing by the standard deviation.
    """
    return lambda sample: Sample.from_ndarray((sample.features - mean) / std,
                                              sample.label, sample.bigdl_type)
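# --- Usage sketch (hedged): apply the returned closure to an RDD of Samples, e.g. one
# produced by to_sample_rdd above. Assumes the ndarray-backed Sample layout
# (`features`, `label`, `bigdl_type`) that the closure itself relies on; the statistics
# are placeholder scalars, and per-feature ndarrays broadcast the same way.
mean, std = 0.5, 0.25   # in practice, compute these from the training set

normalized_rdd = sample_rdd.map(normalizer(mean, std))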