    def test_fit_and_transform_generator(self):
        source = numpy.vstack((
            numpy.array(list(self.data_source(20))),
            numpy.array(list(self.data_source(20))) * 100000,
        ))

        # fit on the two-cluster data stacked above
        self.clustering.fit(source)

        # infinite list
        encoded = self.clustering.predict_generator(
            self.data_source(100000000)) | pipe.take(60) | pipe.as_list

        source = numpy.array(list(self.data_source(60)))

        # sample from each cluster
        counts = [0, 0]
        for vec, label in zip(source, encoded):
            counts[label] += 1
            if label == 0:
                self.assertLessEqual(
                    self.clustering.distance(vec, self.clustering.min_vec),
                    self.clustering.distance(vec, self.clustering.max_vec))
            else:
                self.assertLessEqual(
                    self.clustering.distance(vec, self.clustering.max_vec),
                    self.clustering.distance(vec, self.clustering.min_vec))
        self.assertEqual(counts[0], counts[1])
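# `data_source` is an assumed fixture (a method on the test case in the
# snippets above) and is not shown here; for the balanced-cluster assertion to
# hold it presumably alternates between low- and high-magnitude vectors.
# A purely hypothetical sketch:
def data_source(n, dim=2):
    for i in range(n):
        scale = 1.0 if i % 2 == 0 else 100000.0
        yield numpy.random.rand(dim) * scale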
def test(positives, negatives, predictor, head=1000, prefix=""):
    data_loader_positive = data_loader(positives) | max_length_filter
    data_loader_negative = data_loader(negatives) | max_length_filter
    num_correct = 0
    num_total = 0
    for sample in data_loader_positive | pipe.take(head):
        predicted = chainer.functions.softmax(predictor(sample))
        if numpy.argmax(predicted.data[0]) == 1:
            num_correct += 1
        num_total += 1
    for sample in data_loader_negative | pipe.take(head):
        predicted = chainer.functions.softmax(predictor(sample))
        if numpy.argmax(predicted.data[0]) == 0:
            num_correct += 1
        num_total += 1
    logging.info("{}correct: {}, total: {}".format(prefix, num_correct, num_total))
    logging.info("{}accuracy: {}".format(prefix, float(num_correct)/num_total))
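# `data_loader` and `max_length_filter` are assumed helpers not shown here; a
# minimal sketch of a Pipe-style length filter (the default threshold is a
# hypothetical placeholder):
@pipe.Pipe
def max_length_filter(iterable, max_length=1000):
    for sample in iterable:
        if len(sample) <= max_length:
            yield sample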
Example #3
    def test_fit_and_transform_generator(self):
        self.encoder.fit(numpy.array(list(self.data_source(300))))

        # infinite list
        encoded = self.encoder.transform_generator(
            self.data_source(100000000)) | pipe.take(60) | pipe.as_list

        for i in range(0, len(encoded), 3):
            numpy.testing.assert_array_almost_equal(encoded[i], encoded[i + 1])
            numpy.testing.assert_array_almost_equal(encoded[i], encoded[i + 2])
    def test_fit_and_transform_generator(self):
        self.clustering.fit(numpy.array(list(self.data_source(20))))

        # infinite list
        encoded = self.clustering.predict_generator(
            self.data_source(100000000)) | pipe.take(60) | pipe.as_list

        source = numpy.array(list(self.data_source(60)))

        # sample from each cluster
        counts = [0, 0]
        for vec, label in zip(source, encoded):
            counts[int(label)] += 1
        self.assertEqual(counts[0], counts[1])
@Pipe  # assumes `from pipe import Pipe` above the snippet
def as_npy(iterable):
    # collect a piped sequence into a numpy array
    return np.array(iterable)

def balancedSignalGenerator(X, y, num_classes=12):
    # map each class label to the indices of its samples
    class_map = {}
    for c in range(num_classes):
        class_map[c] = list(np.where(y == c)[0])

    # one infinite, cycling (label, sample) iterator per class
    D = (range(num_classes)
         | select(lambda c: class_map[c] | pcycle | select(lambda i: (c, X[i])))
         | as_list)

    # round-robin over the classes so every label appears equally often
    while True:
        for c in D:
            label, sample = next(c)
            yield label, sample
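# `pcycle` is not defined in this snippet; a minimal sketch, assuming it simply
# wraps itertools.cycle as a Pipe:
from itertools import cycle
from pipe import Pipe
pcycle = Pipe(cycle)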

        
data = balancedSignalGenerator(X_train,y_train) | take(4200) | as_list

# note that we need as_list on the data: it is consumed twice below (once for
# X, once for y), so it must be materialized rather than left as a generator
X_train_bal = data | select(lambda el: el[1])  | as_list | as_npy
y_train_bal = data | select(lambda el: el[0])  | as_list | as_npy


#%%
print( X_train_bal.shape, y_train_bal.shape )

#%% [markdown]
# And we are now balanced!

#%%
def labelDist(y, title):
    plt.figure()
    plt.hist(y, bins=len(np.unique(y)))  # assumed body: histogram of labels
    plt.title(title)
Example #6
try:
    os.makedirs(args.model_output_dir)  # assumed: the call this except guards
except OSError:
    print("{} already exists".format(args.model_output_dir))

if args.gpu >= 0:
    chainer.cuda.check_cuda_available()
    chainer.cuda.get_device(args.gpu).use()
    xp = chainer.cuda.cupy
else:
    xp = numpy

################
## vocabulary
################
if args.vocabulary is None:
    count_dict = collections.defaultdict(int)
    for _, comment in load_id_comments_pretty(args.comment_tsv) | pipe.take(100000):
        for character in comment:
            count_dict[character] += 1


    count_dict[CHARACTER_START] = MINIMUM_FREQUENCY + 1
    count_dict[CHARACTER_END] = MINIMUM_FREQUENCY + 1
    vocabulary = [character for character, count in count_dict.items() if count >= MINIMUM_FREQUENCY]
    print(len(vocabulary))
else:
    vocabulary = [line.rstrip().decode("utf-8") for line in open(args.vocabulary)]
character_embedder = illust2comment.model.WordEmbedder(vocabulary)
character_embedder.save_vocabulary(os.path.join(args.model_output_dir, "vocabulary.txt"))
print(len(vocabulary))
print("vocabulary size: ", character_embedder.vecsize)
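# `load_id_comments_pretty` is an assumed helper; a minimal sketch, assuming it
# yields (id, comment) pairs from a TSV file so that pipe.take can cap the stream:
def load_id_comments_pretty(path):
    for line in open(path):
        image_id, comment = line.rstrip("\n").split("\t", 1)
        yield image_id, comment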
Example #7
     """Returns a list of versions for package :param:`package_name`"""
     version_list, outdated = self.package_versions.get(package_name,
                                                        fallback=True)
     if outdated == True:
         try:
             new_list = self.pypi_proxy.get_versions(package_name)
         except DataUnavailable, err:
             if version_list is None:
                 raise err
         else:
             version_list  = new_list
             self.package_versions.put(package_name, version_list)
     if version_list is None:
         version_list = []
         
     return version_list | pipe.take(10) \
             | pipe.map(lambda version: Version(package_name, version)) \
             | pipe.as_list
 
 def get_version_info(self, package_name, version):
     """Returns a Version instance with realease data for the version
     :param:`version` of the package :param:`package_name"""
     version_info = self.version_info.get(package_name, version)
         
     if version_info is None:
         version_info = self.pypi_proxy.get_version_info(package_name, 
                                                         version)
         self.version_info.put(package_name, version, version_info)
          
     return version_info
 
def train(positives, negatives, predictor, optimizer, batch=10, epoch_size=10000000, num_augment=1):
    xp = predictor.xp
    # num_sample = min(len(postivies), len(negatives), epoch_size)
    data_loader_positive = data_loader(positives) | max_length_filter | compilable_filter | pipe.take(epoch_size)
    data_loader_negative = data_loader(negatives) | max_length_filter | compilable_filter | pipe.take(epoch_size)
    if num_augment > 1:
        data_loader_positive = data_loader_positive | augmentation(num_augment=num_augment)
        data_loader_negative = data_loader_negative | augmentation(num_augment=num_augment)
    while True:
        loss = chainer.Variable(xp.zeros((), dtype=xp.float32))
        try:
            for _i in range(batch):
                positive_predicted = predictor(next(data_loader_positive))
                loss += chainer.functions.softmax_cross_entropy(positive_predicted, chainer.Variable(xp.array([1], xp.int32)))
                negative_predicted = predictor(next(data_loader_negative))
                loss += chainer.functions.softmax_cross_entropy(negative_predicted, chainer.Variable(xp.array([0], xp.int32)))
        except StopIteration:
            optimizer.zero_grads()
            loss.backward()
            optimizer.update()
            break
        optimizer.zero_grads()
        loss.backward()
        optimizer.update()
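# `data_loader`, `compilable_filter` and `augmentation` are assumed helpers not
# shown here. A minimal sketch of a parametrized Pipe in the style of
# `augmentation` (the body is a hypothetical placeholder):
@pipe.Pipe
def augmentation(iterable, num_augment=1):
    for sample in iterable:
        for _ in range(num_augment):
            yield sample  # a real implementation would perturb each copy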
Example #9
try:
    os.makedirs(args.model_output_dir)  # assumed: the call this except guards
except OSError:
    print("{} already exists".format(args.model_output_dir))

if args.gpu >= 0:
    chainer.cuda.check_cuda_available()
    chainer.cuda.get_device(args.gpu).use()
    xp = chainer.cuda.cupy
else:
    xp = numpy

################
## vocabulary
################
if args.vocabulary is None:
    count_dict = collections.defaultdict(int)
    for _, comment in load_id_comments_pretty(
            args.comment_tsv) | pipe.take(100000):
        for character in comment:
            count_dict[character] += 1

    count_dict[CHARACTER_START] = MINIMUM_FREQUENCY + 1
    count_dict[CHARACTER_END] = MINIMUM_FREQUENCY + 1
    vocabulary = [
        character for character, count in count_dict.items()
        if count >= MINIMUM_FREQUENCY
    ]
    print(len(vocabulary))
else:
    vocabulary = [
        line.rstrip().decode("utf-8") for line in open(args.vocabulary)
    ]
character_embedder = illust2comment.model.WordEmbedder(vocabulary)
Example #10
from itertools import repeat
from pipe import chain_with, take

def pipe_pad_to(seq, padded_length):
    # append an infinite run of Nones, then cut at padded_length: this pads
    # short sequences and truncates long ones
    return (seq | chain_with(repeat(None))
                | take(padded_length))
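# usage sketch:
#   list(pipe_pad_to([1, 2, 3], 5))  # -> [1, 2, 3, None, None]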