Example #1
def train_test_split(unrolled, test_size):
    # Hold out the last `test_size` elements (by index) as the test set;
    # everything before the cut point becomes the training set.
    cutPoint = unrolled.count() - test_size
    train = unrolled.filter(lambda x: x.index < cutPoint) \
        .map(lambda x: Sample.from_ndarray(np.array(x.feature), np.array(x.label)))
    test = unrolled.filter(lambda x: x.index >= cutPoint) \
        .map(lambda x: Sample.from_ndarray(np.array(x.feature), np.array(x.label)))
    return [train, test]
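
A minimal usage sketch (not part of the original example), assuming an existing SparkContext `sc`, `numpy` imported as `np`, `Sample` imported from `bigdl.util.common`, and that each element of `unrolled` exposes `.index`, `.feature` and `.label`; the namedtuple and `zipWithIndex` pattern below are illustrative assumptions:

from collections import namedtuple

Row = namedtuple("Row", ["index", "feature", "label"])

# toy data: ten (feature, label) pairs, indexed with zipWithIndex
pairs = [([float(i)], [float(i % 2)]) for i in range(10)]
unrolled = sc.parallelize(pairs).zipWithIndex() \
    .map(lambda p: Row(p[1], p[0][0], p[0][1]))

train_rdd, test_rdd = train_test_split(unrolled, test_size=2)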
Example #2
def gen_rand_user_item_feature(user_num, item_num, class_num):
    # Draw a random (user, item, rating) triple and wrap the id pair and
    # the rating as a BigDL Sample inside a UserItemFeature.
    user_id = random.randint(1, user_num)
    item_id = random.randint(1, item_num)
    rating = random.randint(1, class_num)
    sample = Sample.from_ndarray(np.array([user_id, item_id]),
                                 np.array([rating]))
    return UserItemFeature(user_id, item_id, sample)
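
A hedged usage sketch: building an RDD of random user-item features for smoke testing. The `UserItemFeature` import path is an assumption (it may differ between Analytics Zoo versions), and `sc` is an existing SparkContext:

import random
import numpy as np
from bigdl.util.common import Sample
from zoo.models.recommendation import UserItemFeature  # assumed import path

# 1000 random (user, item, rating) triples wrapped as UserItemFeature
pair_rdd = sc.parallelize(range(1000)).map(
    lambda _: gen_rand_user_item_feature(user_num=200, item_num=100, class_num=5))
print(pair_rdd.take(1))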
Example #4
def to_sample(vectors, label, embedding_dim):
    # flatten nested list of word vectors; `sequence_len` is defined
    # elsewhere in the original script
    flatten_features = list(itertools.chain(*vectors))
    features = np.array(flatten_features, dtype='float').reshape(
        [sequence_len, embedding_dim])

    return Sample.from_ndarray(features, np.array(label))
Example #5
def get_featureset(x, y, shuffle=True):
    x = np.split(x.data.numpy(), x.shape[0])
    y = np.split(y.data.numpy(), y.shape[0])
    print(x[0].shape)
    print(y[0].shape)
    samples = [Sample.from_ndarray(np.squeeze(x[i]), np.squeeze(y[i])) for i in range(len(x))]
    sample_rdd = sc.parallelize(samples)
    return FeatureSet.sample_rdd(sample_rdd, shuffle=shuffle)
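
A short usage sketch for the helper above, assuming PyTorch tensors as input, an existing SparkContext `sc`, and an Analytics Zoo environment providing `FeatureSet`:

import torch

# eight samples with three features each, plus matching labels
x = torch.randn(8, 3)
y = torch.randn(8, 1)
feature_set = get_featureset(x, y, shuffle=True)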
Example #6
def to_sample(vectors, label, embedding_dim):
    # flatten nested list of word vectors; `sequence_len` and `model_type`
    # are defined elsewhere in the original script
    flatten_features = list(itertools.chain(*vectors))
    features = np.array(flatten_features,
                        dtype='float').reshape([sequence_len, embedding_dim])

    if model_type.lower() == "cnn":
        # the CNN variant expects features shaped (embedding_dim, sequence_len)
        features = features.transpose(1, 0)
    return Sample.from_ndarray(features, np.array(label))
Example #7
def to_sample(data):
    from bigdl.util.common import Sample
    data = check_type_and_convert(data, allow_list=True, allow_tuple=False)
    features = data["x"]
    labels = data["y"]
    length = features[0].shape[0]

    for i in range(length):
        fs = [feat[i] for feat in features]
        ls = [l[i] for l in labels]
        yield Sample.from_ndarray(np.array(fs), np.array(ls))
Example #8
def to_sample_rdd(x, y, numSlices=None):
    """
    Convert x and y into an RDD[Sample]
    :param x: ndarray whose first dimension is the batch dimension
    :param y: ndarray whose first dimension is the batch dimension
    :param numSlices: number of partitions for the resulting RDDs (optional)
    :return: RDD[Sample]
    """
    from bigdl.util.common import Sample
    x_rdd = sc.parallelize(x, numSlices)
    y_rdd = sc.parallelize(y, numSlices)
    return x_rdd.zip(y_rdd).map(lambda item: Sample.from_ndarray(item[0], item[1]))
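
A quick usage example, assuming an existing SparkContext `sc`:

import numpy as np

# 100 feature vectors of length 4 and matching binary labels
x = np.random.rand(100, 4).astype(np.float32)
y = np.random.randint(0, 2, size=(100, 1)).astype(np.float32)

sample_rdd = to_sample_rdd(x, y, numSlices=4)
print(sample_rdd.take(1))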
Example #9
def prepare_data(sc, folder, vocabsize, training_split):
    if not folder.startswith('hdfs://'):
        file = download_data(folder)
    else:
        file = folder
    sentences_rdd = sc.textFile(file) \
        .map(lambda line: sentence.sentences_split(line))
    pad_sent = sentences_rdd.flatMap(lambda x: x). \
        map(lambda sent: sentence.sentences_bipadding(sent))
    tokens = pad_sent.map(lambda pad: sentence.sentence_tokenizer(pad))
    train_tokens, val_tokens = tokens.randomSplit(
        [training_split, 1 - training_split])
    train_tokens.cache()
    val_tokens.cache()

    train_max_len = train_tokens.map(lambda x: len(x)).max()
    print("max length %s" % train_max_len)

    words = train_tokens.flatMap(lambda x: x)
    print("%s words and %s sentences processed in train data" %
          (words.count(), train_tokens.count()))

    val_max_len = val_tokens.map(lambda x: len(x)).max()
    print("val max length %s" % val_max_len)

    val_words = val_tokens.flatMap(lambda x: x)
    print("%s words and %s sentences processed in validation data" %
          (val_words.count(), val_tokens.count()))

    sort_words = words.map(lambda w: (w, 1)) \
                .reduceByKey(lambda a, b: a + b) \
                .sortBy(lambda w_c: w_c[1])
    vocabulary = np.array(sort_words.map(lambda w: w[0]).collect())

    fre_len = vocabulary.size
    if vocabsize > fre_len:
        length = fre_len
    else:
        length = vocabsize
    discard_vocab = vocabulary[:fre_len - length]
    used_vocab = vocabulary[fre_len - length:fre_len]
    used_vocab_size = used_vocab.size
    index = np.arange(used_vocab_size)
    index2word = dict(enumerate(used_vocab))
    word2index = dict(zip(used_vocab, index))
    total_vocab_len = used_vocab_size + 1
    startIdx = word2index.get("SENTENCESTART")
    endIdx = word2index.get("SENTENCEEND")

    def text2labeled(sent):
        indexes = [word2index.get(x, used_vocab_size) for x in sent]
        data = indexes[0:-1]
        label = indexes[1:len(indexes)]
        return data, label

    def labeled2onehotformat(labeled_sent):
        label = [x + 1 for x in labeled_sent[1]]
        size = len(labeled_sent[0])
        feature_onehot = np.zeros(size * total_vocab_len,
                                  dtype='int').reshape([size, total_vocab_len])
        for i, el in enumerate(labeled_sent[0]):
            feature_onehot[i, el] = 1
        return feature_onehot, label

    def padding(features, label, length):
        pad_len = length - len(label)
        padded_label = (label + [startIdx] * length)[:length]
        feature_padding = np.zeros((pad_len, total_vocab_len), dtype=int)
        feature_padding[:, endIdx + 1] = np.ones(pad_len)
        padded_features = np.concatenate((features, feature_padding), axis=0)
        return padded_features, padded_label

    sample_rdd = train_tokens.map(lambda sentence_te: text2labeled(sentence_te)) \
        .map(lambda labeled_sent: labeled2onehotformat(labeled_sent)) \
        .map(lambda x: padding(x[0], x[1], train_max_len)) \
        .map(lambda vectors_label: Sample.from_ndarray(vectors_label[0],
                                                       np.array(vectors_label[1]))).cache()

    val_sample_rdd = val_tokens.map(lambda sentence_t: text2labeled(sentence_t)) \
        .map(lambda labeled_sent: labeled2onehotformat(labeled_sent)) \
        .map(lambda x: padding(x[0], x[1], val_max_len)) \
        .map(lambda vectors_label: Sample.from_ndarray(vectors_label[0],
                                                       np.array(vectors_label[1]))).cache()

    return sample_rdd, val_sample_rdd, total_vocab_len
Example #10
def get_rdd(x, y, shuffle=False):
    # Split the batched tensors into per-sample arrays and wrap each
    # (feature, label) pair as a BigDL Sample.
    x = np.split(x.data.numpy(), x.shape[0])
    y = np.split(y.data.numpy(), y.shape[0])
    samples = [Sample.from_ndarray(np.squeeze(x[i]), np.squeeze(y[i])) for i in range(len(x))]
    sample_rdd = sc.parallelize(samples)
    return sample_rdd
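
A usage sketch mirroring the PyTorch-style inputs this helper expects (assuming an existing SparkContext `sc` and a BigDL environment):

import torch

x = torch.randn(16, 10)   # 16 samples, 10 features
y = torch.randn(16, 1)    # 16 matching labels
rdd = get_rdd(x, y)
print(rdd.count())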
Example #13
def normalizer(mean, std):
    """
    Return a transform that normalizes a Sample's features by subtracting
    the mean and dividing by the standard deviation.
    """
    return lambda sample: Sample.from_ndarray((sample.features - mean) / std,
                                              sample.label, sample.bigdl_type)
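
A usage sketch: map the returned closure over an existing RDD[Sample] that follows the older BigDL Sample layout this snippet was written against. The mean/std values are illustrative only:

# normalize every Sample in an existing RDD[Sample]
normalized_rdd = sample_rdd.map(normalizer(mean=0.1307, std=0.3081))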
Example #14
import os
from math import pi

import h5py
import matplotlib.pyplot as plt
import numpy as np

from pyspark import SparkContext

from bigdl.util.common import init_engine
from bigdl.util.common import create_spark_conf
from bigdl.util.common import JavaCreator
from bigdl.util.common import Sample
from vision.image3d.transformation import *

img_path = os.path.abspath(__file__ + "/../../resources/image_3d/a.mat")
sample = h5py.File(img_path)['meniscus_im']
sample = np.array(sample)
sample = Sample.from_ndarray(features=sample, label=np.array(-1))
# sample = np.expand_dims(sample,0)

print(sample.features[0].shape)

sc = SparkContext(appName="test", conf=create_spark_conf())
JavaCreator.set_creator_class(
    "com.intel.analytics.zoo.transform.vision.image3d.python.api.VisionPythonBigDL"
)
init_engine()

data_rdd = sc.parallelize([sample])

start_loc = [13, 80, 125]
patch = [5, 40, 40]
# end_loc = [17,119,164]