import FWCore.ParameterSet.Config as cms
# createDataset is expected to come from the accompanying dataset helper module


def datasetToSource(user, dataset, pattern='.*root', readCache=False):
    # print user, dataset, pattern
    data = createDataset(user, dataset, pattern, readCache)
    source = cms.Source(
        "PoolSource",
        noEventSort=cms.untracked.bool(True),
        duplicateCheckMode=cms.untracked.string("noDuplicateCheck"),
        fileNames=cms.untracked.vstring()
    )
    source.fileNames.extend(data.listOfGoodFiles())
    return source
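# Usage sketch (an assumption, not part of the snippet above): plug the returned
# source into a CMSSW process configuration. The process name, user name and
# dataset path are placeholders.
process = cms.Process("ANA")
process.source = datasetToSource('someUser',
                                 '/SomePrimaryDataset/Run2012A/AOD',
                                 pattern='.*root')
process.maxEvents = cms.untracked.PSet(input=cms.untracked.int32(-1))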
import numpy as np
import joblib
from sklearn import svm
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV, train_test_split

import dataset


def CreateModel(dataset_fileName, use_mid=False, from_raw=False):
    # Load the feature matrix and labels, either from raw files or from a pickle.
    if from_raw:
        X, y, Decoder = dataset.createDataset()
    else:
        X, y = joblib.load(dataset_fileName)

    if use_mid:
        # Keep only the middle accelerometer/gyroscope channels, reshaping each
        # one to (n_samples, 1, window_length).
        accx_mid = X[:, 2, :].reshape(X.shape[0], 1, 50)
        accy_mid = X[:, 7, :].reshape(X.shape[0], 1, 50)
        accz_mid = X[:, 12, :].reshape(X.shape[0], 1, 50)
        gyrx_mid = X[:, 17, :].reshape(X.shape[0], 1, 50)
        gyry_mid = X[:, 22, :].reshape(X.shape[0], 1, 50)
        gyrz_mid = X[:, 27, :].reshape(X.shape[0], 1, 50)
        X = np.concatenate(
            (accx_mid, accy_mid, accz_mid, gyrx_mid, gyry_mid, gyrz_mid),
            axis=1)

    print(X.shape)
    # Flatten each sample into a single feature vector.
    X = X.reshape(X.shape[0], X.shape[1] * X.shape[2])
    x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.25,
                                                        random_state=24)
    print("split complete")

    # Grid-search an SVM over regularization strength and kernel type.
    params = {'C': [0.001, 0.01, 0.1, 1],
              'kernel': ['linear', 'rbf', 'poly']}
    svc = svm.SVC(probability=True)
    clf = GridSearchCV(svc, params, verbose=10, n_jobs=8)

    print("fitting Model")
    clf.fit(x_train, y_train)

    print("Confusion Matrix:")
    Y_predicted = clf.predict(x_test)
    print(confusion_matrix(y_test, Y_predicted))

    print("\nBest estimator parameters: ")
    print(clf.best_estimator_)

    # Score of the best estimator found, evaluated on the held-out test split.
    score = clf.score(x_test, y_test)
    print("\nSCORE: {score}\n".format(score=score))

    print("Saving model....")
    if use_mid:
        joblib.dump(clf, "MODEL_MID.pkl")
    else:
        joblib.dump(clf, "MODEL.pkl")
    return score
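# Usage sketch (assumed entry point, not part of the function above): train on a
# pickled dataset and later reload the persisted grid-search model. The file name
# "dataset.pkl" is a placeholder.
if __name__ == "__main__":
    CreateModel("dataset.pkl", use_mid=False)

    # The dumped GridSearchCV object can be reloaded and reused directly:
    clf = joblib.load("MODEL.pkl")
    # clf.predict(new_samples) / clf.predict_proba(new_samples) work on data
    # with the same flattened feature layout as the training set.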
        ToTensor(),
        Normalize([.485, .456, .406], [.229, .224, .225]),
        RandomHorizontalFlip(),
    ]
else:
    print("without data augmentation")
    transformList = [
        Resize(opt.loadSize),
        ToTensor(),
        Normalize([.485, .456, .406], [.229, .224, .225])
    ]
transform = Compose(transformList)

supervisedADataset = createDataset([opt.supervisedADataset],
                                   transform=transform, outputFile=False)[0]
supervisedBDataset = createDataset([opt.supervisedBDataset],
                                   transform=transform, outputFile=False)[0]
unsupervisedADataset = createDataset([opt.unsupervisedADataset],
                                     transform=transform, outputFile=False)[0]
unsupervisedBDataset = createDataset([opt.unsupervisedBDataset],
                                     transform=transform, outputFile=False)[0]
dataset = CycleMcdDataset(supervisedA=supervisedADataset,
                          unsupervisedA=unsupervisedADataset,
                          supervisedB=supervisedBDataset,
                          unsupervisedB=unsupervisedBDataset)
        Resize(opt.loadSize),
        ToTensor(),
        Normalize([.485, .456, .406], [.229, .224, .225]),
        RandomHorizontalFlip(),
    ]
else:
    transformList = [
        Resize(opt.loadSize),
        ToTensor(),
        Normalize([.485, .456, .406], [.229, .224, .225])
    ]
transform = Compose(transformList)

sourceDataset = createDataset(opt.sourceDataset, transform=transform,
                              outputFile=False)
targetDataset = createDataset(opt.targetDataset, transform=transform,
                              outputFile=False)
dataLoader = torch.utils.data.DataLoader(
    ConcatDataset(source=sourceDataset, target=targetDataset),
    batch_size=opt.batchSize, shuffle=True)

# set visualizer
visualizer = Visualizer(opt, dataLoader.dataset).reset()
        recommendedSongs = recommendedSongs.append(
            pd.Series(row, index=recommendedSongs.columns, name=index))
    return recommendedSongs[['genre', 'track_id', 'artist_name', 'title']]


timbreFeaturesAndGenre = [
    'avg_timbre1', 'avg_timbre2', 'avg_timbre3', 'avg_timbre4', 'avg_timbre5',
    'avg_timbre6', 'avg_timbre7', 'avg_timbre8', 'avg_timbre9', 'avg_timbre10',
    'avg_timbre11', 'avg_timbre12', 'var_timbre1', 'var_timbre2', 'var_timbre3',
    'var_timbre4', 'var_timbre5', 'var_timbre6', 'var_timbre7', 'var_timbre8',
    'var_timbre9', 'var_timbre10', 'var_timbre11', 'var_timbre12',
    'track_id', 'genre'
]

datasetMinusSamples, testingDataset = createDataset()
datasetMinusSamples = removeTestingSongsFromDataset(datasetMinusSamples,
                                                    testingDataset)
formattedDataset = datasetMinusSamples[[
    'genre', 'track_id', 'artist_name', 'title', 'avg_timbre1'
]]
formattedDataset.to_csv('testingDataSet.csv', sep='\t')

# Cluster the test songs on their timbre features only (the last two list
# entries, track_id and genre, are not clustering features).
featureVector = testingDataset[timbreFeaturesAndGenre]
kMeans = KMeans(n_clusters=10)
timbreFeatures = timbreFeaturesAndGenre[:-2]
kMeans.fit(featureVector[timbreFeatures])
featureVector.loc[:, 'genre'] = kMeans.labels_
genreNumbers = featureVector[['genre']]
genreTitles = testingDataset[['genre']]
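# Sketch (an assumption, not part of the snippet above): one simple way to see
# how the KMeans cluster ids line up with the labelled genres is a contingency
# table between the assigned cluster and the original genre column.
import pandas as pd

clusterVsGenre = pd.crosstab(genreNumbers['genre'].rename('cluster'),
                             genreTitles['genre'])
print(clusterVsGenre)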
        RandomResizedCrop(),
        Resize(opt.loadSize),
        ToTensor(),
        Normalize([.485, .456, .406], [.229, .224, .225]),
        RandomHorizontalFlip(),
    ]
else:
    transformList = [
        Resize(opt.loadSize),
        ToTensor(),
        Normalize([.485, .456, .406], [.229, .224, .225])
    ]
transform = Compose(transformList)

datasetA = createDataset([opt.datasetA], transform=transform, outputFile=False)
datasetB = createDataset([opt.datasetB], transform=transform, outputFile=False)
dataset = ConcatDataset(source=datasetA, target=datasetB)
dataLoader = torch.utils.data.DataLoader(dataset, batch_size=opt.batchSize,
                                         shuffle=True)

# set visualizer
visualizer = Visualizer(opt, dataLoader.dataset).reset()
import sys
from time import sleep

from dataset import createDataset
from pybrain.supervised.trainers import BackpropTrainer
from pybrain.tools.shortcuts import buildNetwork
from pybrain.structure.modules import TanhLayer


def getArgOne():
    # if len(sys.argv) < 2:
    #     raise NameError("no argument given")
    # elif len(sys.argv) > 2:
    #     raise NameError("too many arguments given")
    return sys.argv[1]


filename = getArgOne()  # read filename
((dsTrn, dsTst), (scalizer, normalizer)) = createDataset(filename)  # create dataset

##############
print "Training data length:", len(dsTrn)
print "Test data length:", len(dsTst)
# for inpt, target in dsTst:
#     print inpt, target
# print dsTrn['input']
# print dsTrn['target']

net = buildNetwork(dsTrn.indim, dsTrn.indim, dsTrn.indim / 2, dsTrn.outdim,
                   bias=True, recurrent=True)  # create NN
trainer = BackpropTrainer(net, dsTrn, learningrate=0.03, momentum=0.1,
                          verbose=True)
print "Network :", net
sleep(3)

# for epoch in range(0, 1000):
#     print "Epoch", epoch
#     trainer.train()
trainer.trainUntilConvergence(maxEpochs=10000)  # train NN

# test result
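# Sketch of the "test result" step (an assumption, not the original code):
# dsTst yields (input, target) pairs as in the commented-out loop above, and
# net.activate() runs a single forward pass on one sample.
for inpt, target in dsTst:
    prediction = net.activate(inpt)
    print "input:", inpt, "predicted:", prediction, "expected:", target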
from fuel.streams import DataStream
from fuel.schemes import SequentialScheme
from blocks.extensions import FinishAfter, Printing, ProgressBar
# from blocks.extensions.saveload import load
from blocks.serialization import load
from blocks.monitoring import aggregation  # ???

from dataset import Corpus, createDataset

args = getArguments()
corpus = Corpus(open(args.corpus).read())
train_data, vocab_size = createDataset(corpus=corpus,
                                       sequence_length=750,
                                       repeat=20)

if args.mode == "train":
    seq_len = 100
    dim = 100
    feedback_dim = 100

    # Build the bricks and initialize them
    transition = GatedRecurrent(name="transition", dim=dim, activation=Tanh())
    generator = SequenceGenerator(
        Readout(
            readout_dim=vocab_size,
            source_names=["states"],  # transition.apply.states ???
            emitter=SoftmaxEmitter(name="emitter"),
            feedback_brick=LookupFeedback(vocab_size,
import dataset
import tensorflow as tf
import cv2
import numpy as np
import matplotlib.pyplot as plt

# Raw string so the Windows backslashes are kept literally.
tfrecord_path_train = r'D:\Coco\coco_train.tfrecord'
dataset = dataset.createDataset(tfrecord_path_train)

# COCO category ids to names (id 12 is unused in this label map)
category_map = {
    0: '',
    1: 'person',
    2: 'bicycle',
    3: 'car',
    4: 'motorcycle',
    5: 'airplane',
    6: 'bus',
    7: 'train',
    8: 'truck',
    9: 'boat',
    10: 'traffic light',
    11: 'fire hydrant',
    13: 'stop sign',
    14: 'parking meter',
    15: 'bench',
    16: 'bird',
    17: 'cat',
    18: 'dog',
    19: 'horse',
    20: 'sheep',
    21: 'cow',
from dataset import Corpus, createDataset
from rnn_model import create_rnn

if __name__ == "__main__":
    args = parser.parse_args()

    if args.retrain:
        main_loop = load(args.model)
    else:
        # create Corpus and Dataset
        corpus = Corpus(open(args.corpus).read())
        train_data, vocab_size = createDataset(
            corpus=corpus,
            sequence_length=750,
            repeat=20
        )

        # create Computation Graph
        cg, layers, y_hat, cost = create_rnn(args.hidden, vocab_size,
                                             mode=args.mode)

        # create training loop
        main_loop = MainLoop(
            data_stream=DataStream(
                train_data,
                iteration_scheme=SequentialScheme(
                    train_data.num_examples,
                    batch_size=50
                )
            ),
            algorithm=GradientDescent(
                cost=cost,
# PyTorch includes
import torch
from torch.autograd import Variable

# Custom includes
from options import TrainOptions
from dataset import createDataset
from models import createModel
from visualizer import ProgressVisualizer

assert Variable

parser = TrainOptions()
opt = parser.parse()

# set dataloader
trainDataset = createDataset(opt, split='train', nInput=opt.nInput)
valDataset = createDataset(opt, split='val', nInput=opt.nInput)
trainDataLoader = torch.utils.data.DataLoader(trainDataset,
                                              batch_size=opt.batchSize,
                                              shuffle=True,
                                              num_workers=opt.nThreads)
valDataLoader = torch.utils.data.DataLoader(valDataset,
                                            batch_size=opt.batchSize,
                                            shuffle=True,
                                            num_workers=opt.nThreads)

# set model
model = createModel(opt)
model.setup(opt)
plt.show()

opt = TestOptions().parse()

# set dataloader
transformList = [
    Resize(opt.loadSize),
    ToTensor(),
    Normalize([.485, .456, .406], [.229, .224, .225])
]
transform = Compose(transformList)
dataset = createDataset(opt.dataset, transform=transform, outputFile=True)
dataLoader = torch.utils.data.DataLoader(dataset, batch_size=opt.batchSize,
                                         shuffle=False)

# set model
model = createModel(opt)
model.setup(opt)
model.eval()

# set visualizer
visualizer = TestVisualizer(opt, dataset)

# set evaluator
import dataset
import readability
import information
import textProp
import numpy as np
from sklearn.svm import SVC

features_train = []
labels_train = []

dataset.createDataset()
data = dataset.data
data = sorted(data, key=lambda k: k['qId'])

count = 0
for row in data:
    featureVector = []
    ts = readability.TextStatistics()
    tp = textProp.TextProperties()
    inf = information.Informativity()

    totalEntropy = 0
    model, stats = inf.markov_model(inf.chars(row['answerBody']), 3)
    for prefix in stats:
        totalEntropy = totalEntropy + inf.entropy(stats, stats[prefix])
    featureVector.append(abs(totalEntropy))  # information
    featureVector.append(
        tp.relevancy(row['answerBody'], row['qBody'], row['qTags']))  # relevancy
    featureVector.append(tp.UniqueWords(row['answerBody']))  # unique
    featureVector.append(tp.NonstopWords(row['answerBody']))  # nonstop
    featureVector.append(tp.subjective(row['answerBody']))  # subjectivity