def test_fit_and_transform_generator(self):
    source = numpy.vstack((
        numpy.array(list(self.data_source(20))),
        numpy.array(list(self.data_source(20))) * 100000,
    ))
    self.clustering.fit(numpy.array(list(self.data_source(20))))

    # infinite list
    encoded = self.clustering.predict_generator(
        self.data_source(100000000)) | pipe.take(60) | pipe.as_list
    source = numpy.array(list(self.data_source(60)))

    # sample from each cluster
    counts = [0, 0]
    for vec, label in zip(source, encoded):
        counts[label] += 1
        if label == 0:
            self.assertLessEqual(
                self.clustering.distance(vec, self.clustering.min_vec),
                self.clustering.distance(vec, self.clustering.max_vec))
        else:
            self.assertLessEqual(
                self.clustering.distance(vec, self.clustering.max_vec),
                self.clustering.distance(vec, self.clustering.min_vec))
    self.assertEqual(counts[0], counts[1])
def test(postivies, negatives, predictor, head=1000, prefix=""): data_loader_positive = data_loader(postivies) | max_length_filter data_loader_negative = data_loader(negatives) | max_length_filter num_correct = 0 num_total = 0 for sample in data_loader_positive | pipe.take(head): predicted = chainer.functions.softmax(predictor(sample)) if numpy.argmax(predicted.data[0]) == 1: num_correct += 1 num_total += 1 for sample in data_loader_negative | pipe.take(head): predicted = chainer.functions.softmax(predictor(sample)) if numpy.argmax(predicted.data[0]) == 0: num_correct += 1 num_total += 1 logging.info("{}correct: {}, total: {}".format(prefix, num_correct, num_total)) logging.info("{}accuracy: {}".format(prefix, float(num_correct)/num_total))
def test_fit_and_transform_generator(self):
    self.encoder.fit(numpy.array(list(self.data_source(300))))

    # infinite list
    encoded = self.encoder.transform_generator(
        self.data_source(100000000)) | pipe.take(60) | pipe.as_list
    for i in range(0, len(encoded), 3):
        numpy.testing.assert_array_almost_equal(encoded[i], encoded[i + 1])
        numpy.testing.assert_array_almost_equal(encoded[i], encoded[i + 2])
def test_fit_and_transform_generator(self):
    self.clustering.fit(numpy.array(list(self.data_source(20))))

    # infinite list
    encoded = self.clustering.predict_generator(
        self.data_source(100000000)) | pipe.take(60) | pipe.as_list
    source = numpy.array(list(self.data_source(60)))

    # sample from each cluster
    counts = [0, 0]
    for vec, label in zip(source, encoded):
        counts[int(label)] += 1
    self.assertEqual(counts[0], counts[1])
    return np.array(iterable)

def balancedSignalGenerator(X, y, num_classes=12):
    class_map = {}
    for c in range(num_classes):
        class_map[c] = list(np.where(y == c)[0])
    D = range(num_classes) | select(
        lambda c: class_map[c] | pcycle | select(
            lambda i: (c, X[i])
        )
    ) | as_list
    while True:
        for c in D:
            yield next(c)[0], next(c)[1]

data = balancedSignalGenerator(X_train, y_train) | take(4200) | as_list  # note that we need as_list on the data
X_train_bal = data | select(lambda el: el[1]) | as_list | as_npy
y_train_bal = data | select(lambda el: el[0]) | as_list | as_npy

#%%
print(X_train_bal.shape, y_train_bal.shape)

#%% [markdown]
# And we are now balanced!

#%%
def labelDist(y, title):
    plt.figure()
except: print("{} already exists".format(args.model_output_dir)) if args.gpu >= 0: chainer.cuda.check_cuda_available() chainer.cuda.get_device(args.gpu).use() xp = chainer.cuda.cupy else: xp = numpy ################ ## vocabulary ################ if args.vocabulary is None: count_dict = collections.defaultdict(int) for _, comment in load_id_comments_pretty(args.comment_tsv) | pipe.take(100000): for character in comment: count_dict[character] += 1 count_dict[CHARACTER_START] = MINIMUM_FREQUENCY + 1 count_dict[CHARACTER_END] = MINIMUM_FREQUENCY + 1 vocabulary = [character for character, count in count_dict.items() if count >= MINIMUM_FREQUENCY] print(len(vocabulary)) else: vocabulary = [line.rstrip().decode("utf-8") for line in open(args.vocabulary)] character_embedder = illust2comment.model.WordEmbedder(vocabulary) character_embedder.save_vocabulary(os.path.join(args.model_output_dir, "vocabulary.txt")) print(len(vocabulary)) print("vocabulary size: ", character_embedder.vecsize)
"""Returns a list of versions for package :param:`package_name`""" version_list, outdated = self.package_versions.get(package_name, fallback=True) if outdated == True: try: new_list = self.pypi_proxy.get_versions(package_name) except DataUnavailable, err: if version_list is None: raise err else: version_list = new_list self.package_versions.put(package_name, version_list) if version_list is None: version_list = [] return version_list | pipe.take(10) \ | pipe.map(lambda version: Version(package_name, version)) \ | pipe.as_list def get_version_info(self, package_name, version): """Returns a Version instance with realease data for the version :param:`version` of the package :param:`package_name""" version_info = self.version_info.get(package_name, version) if version_info is None: version_info = self.pypi_proxy.get_version_info(package_name, version) self.version_info.put(package_name, version, version_info) return version_info
def train(postivies, negatives, predictor, optimizer, batch=10, epoch_size=10000000, num_augment=1):
    xp = predictor.xp
    # num_sample = min(len(postivies), len(negatives), epoch_size)
    data_loader_positive = data_loader(postivies) | max_length_filter | compilable_filter | pipe.take(epoch_size)
    data_loader_negative = data_loader(negatives) | max_length_filter | compilable_filter | pipe.take(epoch_size)
    if num_augment > 1:
        data_loader_positive = data_loader_positive | augmentation(num_augment=num_augment)
        data_loader_negative = data_loader_negative | augmentation(num_augment=num_augment)
    while True:
        loss = chainer.Variable(xp.zeros((), dtype=xp.float32))
        try:
            for _i in range(batch):
                postive_predicted = predictor(data_loader_positive.next())
                loss += chainer.functions.softmax_cross_entropy(postive_predicted, chainer.Variable(xp.array([1], xp.int32)))
                negative_predicted = predictor(data_loader_negative.next())
                loss += chainer.functions.softmax_cross_entropy(negative_predicted, chainer.Variable(xp.array([0], xp.int32)))
        except StopIteration:
            optimizer.zero_grads()
            loss.backward()
            optimizer.update()
            break
        optimizer.zero_grads()
        loss.backward()
        optimizer.update()
print("{} already exists".format(args.model_output_dir)) if args.gpu >= 0: chainer.cuda.check_cuda_available() chainer.cuda.get_device(args.gpu).use() xp = chainer.cuda.cupy else: xp = numpy ################ ## vocabulary ################ if args.vocabulary is None: count_dict = collections.defaultdict(int) for _, comment in load_id_comments_pretty( args.comment_tsv) | pipe.take(100000): for character in comment: count_dict[character] += 1 count_dict[CHARACTER_START] = MINIMUM_FREQUENCY + 1 count_dict[CHARACTER_END] = MINIMUM_FREQUENCY + 1 vocabulary = [ character for character, count in count_dict.items() if count >= MINIMUM_FREQUENCY ] print(len(vocabulary)) else: vocabulary = [ line.rstrip().decode("utf-8") for line in open(args.vocabulary) ] character_embedder = illust2comment.model.WordEmbedder(vocabulary)
def pipe_pad_to(seq, padded_length):
    return (seq
            | chain_with(repeat(None))
            | take(padded_length))
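# A minimal usage sketch for pipe_pad_to above, assuming `chain_with` and
# `take` come from the pipe library and `repeat` from itertools, as the
# definition implies; the sample inputs are purely illustrative.
from itertools import repeat
from pipe import chain_with, take

# Shorter input is padded with None up to the requested length,
# while take() also truncates input that is already longer.
print(list(pipe_pad_to([1, 2, 3], 5)))     # [1, 2, 3, None, None]
print(list(pipe_pad_to([1, 2, 3, 4], 2)))  # [1, 2]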