def runTest(self):
    """Cross-validate a k-nearest-neighbour classifier and check a positive runtime was recorded."""
    cleaned = cleanData(self.filename)
    classifier = getDiscreetClassifier('knearest')
    outcome = CV.doShuffleCrossValidation(classifier, cleaned.data, cleaned.target)
    message = 'knearest result: ' + str(outcome.meanScore) + ' Time taken: ' + str(outcome.timeTaken)
    print(message)
    self.assertTrue(outcome.timeTaken > 0)
def testAlgorithm():
    """Validate the submitted algorithm specs and queue a test run for each valid one.

    Reads `name` and repeated `algorithm` JSON params from the request. Each
    algorithm spec is JSON-decoded and its params coerced to numeric types,
    then instantiated once as a sanity check before being queued via celery.
    Returns 200 with the accepted specs, or raises a 500 HTTPResponse listing
    every spec that failed to parse or validate.
    """
    userName = getUsername()
    check_request_for_params(["name", "algorithm"])
    csynapseName = re_space(request.params.get('name'))
    algos = request.params.getall('algorithm')
    failedAlgos = []
    successAlgos = []
    for j in algos:
        # Try and load the algorithm and return errors with information about
        # the algorithms that failed
        algoData = ''
        try:
            algoData = json.loads(j)
            successAlgos.append(algoData)
        except Exception as e:
            failedAlgos.append({'error': 'json data {0} is invalid'.format(j),
                                "error_type": "invalid_json",
                                "exception_message": str(e)})
            continue
        try:
            if ('params' in algoData):
                paramsData = algoData['params']
                # Coerce request-string params to the numeric types sklearn
                # expects; n_estimators must be an int, everything else float.
                for key, val in paramsData.iteritems():
                    try:
                        if (key == u'n_estimators'):
                            paramsData[key] = int(val)
                        else:
                            paramsData[key] = float(val)
                    except Exception:
                        continue
                # Instantiate once purely to validate algorithm name + params.
                getDiscreetClassifier(algoData['algorithm'], algoData['params'])
        except Exception as e:
            failedAlgos.append({'error': 'algo params {0} is invalid'.format(j),
                                "error_type": "invalid_params",
                                "exception_message": str(e)})
            # BUG FIX: previously a spec that failed validation was still
            # submitted to runAlgoTest below; skip it instead.
            continue
        runAlgoTest.delay(algoData, userName, csynapseName)
    if (len(failedAlgos) > 0):
        raise HTTPResponse(status=500,
                           body=json.dumps({"status": "error",
                                            'failedAlgorithms': failedAlgos,
                                            "error_type": "failedAlgorithms"}))
    else:
        return HTTPResponse(status=200, body=json.dumps({
            "status": "ok",
            "message": "submitted for testing",
            "csynapse": csynapseName,
            "algorithms": successAlgos
        }))
def runTest(self):
    """A trained SVM's prediction accuracy should be close to its cross-validation mean score."""
    algoName = 'svm'
    crossValScore = CV.doShuffleCrossValidation(
        getDiscreetClassifier(algoName), self.data.data, self.data.target).meanScore
    fitted = TrainClassifier.trainWithLabels(
        getDiscreetClassifier(algoName), self.data.data, self.data.target)
    guesses = predict(fitted, self.data.data)
    # Count how many predicted labels match the ground truth
    hits = sum(1 for guess, truth in zip(guesses, self.data.target) if guess[0] == truth)
    predictionScore = float(hits) / len(self.data.target)
    # Ensure the score from predicting is close to the cross validation score
    self.assertTrue(abs(crossValScore - predictionScore) < .05)
def runTest(self):
    """Serialising and reloading a trained classifier should not change its score much."""
    classifier = getDiscreetClassifier('svm')
    fitted = TrainClassifier.trainWithLabels(classifier, self.data.data, self.data.target)
    baseline = CV.doShuffleCrossValidation(classifier, self.data.data, self.data.target).meanScore
    serialised = saveClassifierAsString(fitted)
    restored = loadClassifierFromString(serialised)
    reloadedScore = restored.score(self.data.data, self.data.target)
    # Ensure the score after reloading the classifier is close to the cross validation score
    self.assertTrue(abs(baseline - reloadedScore) < .05)
def classifyForDemo(trainFile, newImageFile, algorithmName='passiveAggressive'):
    """Train on the labelled demo file and classify a single new image.

    trainFile lines are parsed as (label, pixels) rows. The new image's pixels
    are appended as the final row so PCA transforms them in the same space,
    then the classifier is fit on everything except that last row.
    Returns the predicted label for the new image.
    """
    newPixels = getPixels(newImageFile, (Constants.DEMO_SIZE, Constants.DEMO_SIZE))
    # get data out of file
    rows = [makeInts(line) for line in trainFile.read().split('\n')]
    labels = [row[0] for row in rows]
    pixels = [row[1] for row in rows]
    # the unlabeled image rides along as the last row of the photo list
    pixels.append(newPixels)
    componentCount = min(len(pixels), 200)
    reducer = PCA(n_components=componentCount, copy=False)
    reduced = reducer.fit_transform(pixels)
    classifier = getDiscreetClassifier(algorithmName)
    classifier.fit(reduced[:-1], labels)
    return predict(classifier, reduced[-1])[0][0]
def runTest(self):
    """Print the perceptron's shuffle cross-validation score and runtime."""
    classifier = getDiscreetClassifier('perceptron')
    outcome = CV.doShuffleCrossValidation(classifier, self.data.data, self.data.target)
    print('Perceptron result: ' + str(outcome.meanScore) +
          ' Time taken:' + str(outcome.timeTaken) + ' seconds')
def runTest(self):
    """Print the orthogonal matching pursuit shuffle cross-validation score and runtime."""
    classifier = getDiscreetClassifier('orthogonalMatchingPursuit')
    outcome = CV.doShuffleCrossValidation(classifier, self.data.data, self.data.target)
    print('OrthogonalMatchingPursuit result: ' + str(outcome.meanScore) +
          ' Time taken:' + str(outcome.timeTaken) + ' seconds')
def classify(newDataId, oldDataId, algorithm, params, userName, csynapseName, dataName):
    """Classify new data using a model trained on old data and store the result.

    For "homegrown" external algorithms the work is delegated to a bundled jar
    that reads/writes files on disk; otherwise an sklearn-style classifier is
    trained in-process. The classified output (prefixed with the training
    file's headers when present) is saved to GridFS and its id recorded under
    the user's csynapse document. Returns None.
    """
    result = None
    finalString = ''
    # special case for homegrown algorithms
    if (algorithm in externalAlgorithms):
        # training data
        trainingPath = getDataFile(oldDataId)
        # get headers
        possibleHeaders = getHeaders(trainingPath)
        # new Data
        newDataPath = getDataFile(newDataId)
        resultsPath = trainingPath + 'results'
        runEx.execute(
            'java -jar externalExecutables/{0}.jar {1} {2} {3}'.format(
                algorithm, trainingPath, newDataPath, resultsPath))
        # BUG FIX: the results file used to be read only when headers were
        # present, so header-less data stored an empty string. Always read it.
        with open(resultsPath, 'r') as f:
            content = f.read()
        if (possibleHeaders != ''):
            finalString = possibleHeaders + content
        else:
            finalString = content
    else:
        # Get old data
        oldData = cleanData(getDataFile(oldDataId))
        # Instantiate Classifier with the requested params
        alg = getDiscreetClassifier(algorithm, params)
        # Train on data
        alg.fit(oldData.data, oldData.target)
        # Get new data
        newData = cleanUntagged(getDataFile(newDataId))
        # Get headers from old file to save with results
        possibleHeaders = oldData.headers
        # Predict the new data
        result = predict(alg, newData.data)
        finalStringList = []
        if (possibleHeaders != ''):
            finalStringList.append(possibleHeaders)
        # Serialise each prediction as "label,v1,v2,..." one row per sample
        for x in result:
            finalStringList.append(str(x[0]) + ',')
            for v in x[1][:-1]:
                finalStringList.append(str(v))
                finalStringList.append(',')
            finalStringList[-1] = '\n'
        finalString = ''.join(finalStringList)
    # Put classified data file into the database
    classifiedDataId = db.files.put(finalString)
    # Save data Id to cynapse
    users = db.users
    users.update_one({'_id': userName},
                     {'$set': {'csynapses.{0}.classified.{1}'.format(csynapseName, dataName): classifiedDataId}})
    return
def runAlgoTest(algoData, userName, csynapseName):
    """Run a cross-validation benchmark for one algorithm spec and record it in Mongo.

    Creates an algorithm entry on the user's csynapse with status "processing",
    waits (up to ~20s) for the csynapse's data to exist, runs either an
    external jar or an in-process sklearn classifier, then writes the score,
    time and status "complete". Any failure marks the entry status "error"
    with the traceback text.
    """
    newObjectId = str(ObjectId())
    # Initialise before the try so the error handler below can always
    # reference them, even if reading algoData raises early.
    algorithm = None
    params = {}
    try:
        userCollection = db.users
        algorithm = algoData['algorithm']
        if ('params' in algoData):
            params = algoData['params']
        set_time = strftime("%a, %d %b %Y %H:%M:%S +0000", gmtime())
        userCollection.update_one(
            {'_id': userName},
            {'$set': {'csynapses.{0}.algorithms.{1}'.format(csynapseName, newObjectId):
                      {'algoId': algorithm, 'params': params,
                       'status': "processing", "last_updated": set_time}}})
        doc = userCollection.find_one({'_id': userName})
        # the following is ATROCIOUS code, but should work for now.
        # need better reporting tools to do better
        if "data_id" not in (doc['csynapses'][csynapseName]).keys(
        ) and "multipart_data" not in (doc['csynapses'][csynapseName]).keys():
            set_time = strftime("%a, %d %b %Y %H:%M:%S +0000", gmtime())
            userCollection.update_one(
                {'_id': userName},
                {'$set': {'csynapses.{0}.algorithms.{1}'.format(csynapseName, newObjectId):
                          {'status': "error", "last_updated": set_time}}})
            raise DataException("Data Not Available")
        found = False
        if "data_id" in (doc['csynapses'][csynapseName]).keys():
            found = True
        cycles_to_wait = 20
        cycles = 0
        if not found:
            # Poll once a second for the multipart data pipeline to finish.
            while not found:
                doc = userCollection.find_one({'_id': userName})
                if ("multipart_data" in (doc['csynapses'][csynapseName]).keys()
                        and "multipart_reduced" in (doc['csynapses'][csynapseName]).keys()):
                    found = True
                else:
                    cycles = cycles + 1
                    if cycles == cycles_to_wait:
                        set_time = strftime("%a, %d %b %Y %H:%M:%S +0000", gmtime())
                        userCollection.update_one(
                            {'_id': userName},
                            {'$set': {'csynapses.{0}.algorithms.{1}'.format(csynapseName, newObjectId):
                                      {'status': "error", "last_updated": set_time}}})
                        raise WaitException("Took Too Long to Generate Data")
                    sleep(1)
            # grace period for the reduced data to be fully written
            sleep(3)
        dataId = doc['csynapses'][csynapseName]['data_id']
        ret = {}
        # special case for homegrown algos
        if (algorithm in externalAlgorithms):
            # get file
            path = getDataFile(dataId)
            resultsPath = path + 'results'
            script_dir = os.path.dirname(os.path.realpath(__file__))
            runEx.execute(
                'java -jar {3}/externalExecutables/{0}.jar {1} {2}'.format(
                    algorithm, path, resultsPath, script_dir))
            # get results from file; jar writes "score,time" as percent / ms
            with open(resultsPath, 'r') as f:
                data = f.read()
            score, time = data.split(',')
            ret['score'] = float(score) / 100
            ret['time'] = float(time) / 1000
        else:
            # Instantiate Classifier
            alg = getDiscreetClassifier(algorithm, params)
            # Get data from file
            data = cleanData(getDataFile(dataId))
            # Run Cross Validation
            meanScoreTime = doShuffleCrossValidation(alg, data.data, data.target)
            ret['score'] = meanScoreTime.meanScore
            ret['time'] = meanScoreTime.timeTaken
        # save result in db
        set_time = strftime("%a, %d %b %Y %H:%M:%S +0000", gmtime())
        userCollection.update_one(
            {'_id': userName},
            {'$set': {'csynapses.{0}.algorithms.{1}'.format(csynapseName, newObjectId):
                      {'score': ret['score'], 'time': ret['time'],
                       'algoId': algorithm, 'params': params,
                       'status': "complete", "last_updated": set_time}}})
        return
    except Exception:
        # BUG FIX: was a bare `except:`, which also swallowed SystemExit /
        # KeyboardInterrupt. Record the traceback and mark the entry failed.
        tb = traceback.format_exc()
        set_time = strftime("%a, %d %b %Y %H:%M:%S +0000", gmtime())
        userCollection.update_one(
            {'_id': userName},
            {'$set': {'csynapses.{0}.algorithms.{1}'.format(csynapseName, newObjectId):
                      {'status': "error", "error_text": "{}".format(tb),
                       "algoId": algorithm, "params": params,
                       "last_updated": set_time}}})
def classifyImages(dataIds, oldDataId, algorithm, params, userName, csynapseName, dataName):
    # Classify a batch of unlabeled image files using the csynapse's tagged
    # multipart data as the training set, then store the "label,filename"
    # result rows in GridFS and record the file id on the user's csynapse.
    # NOTE(review): oldDataId is accepted but never read here — presumably
    # kept for signature parity with classify(); confirm before removing.
    fileNames = getMultiPartDataFiles(userName, csynapseName)
    userCollection = db.users
    doc = userCollection.find_one({'_id': userName})
    labelMap = doc['csynapses'][csynapseName]['multipart_data_tagmap']
    # make dictionary mapping labels to lists of filenames
    mappedFiles = {}
    for key, value in labelMap.iteritems():
        mappedFiles[key] = [getDataFile(mongoId) for mongoId in value]
    # (original name, local file path) pairs for the images to classify
    unlabeledFiles = []
    for x in dataIds:
        filename, dataId = x
        unlabeledFiles.append((filename, getDataFile(dataId)))
    finalString = ''
    # special case for homegrown algorithms
    if (algorithm in externalAlgorithms):
        # Get file paths for old and new data; uuid suffix keeps concurrent
        # runs from clobbering each other's scratch files
        unique = str(uuid.uuid4())
        trainingPath = 'train' + unique
        newDataPath = 'classify' + unique
        resultsPath = 'results' + unique
        script_dir = os.path.dirname(os.path.realpath(__file__))
        # writes vectorized train/classify files to disk, returns image names
        # in the same order the jar will emit result rows
        imageNames = vectorizeForClassify(mappedFiles, unlabeledFiles,
                                          trainingPath, newDataPath)
        runEx.execute(
            'java -jar {4}/externalExecutables/{0}.jar {1} {2} {3}'.format(
                algorithm, trainingPath, newDataPath, resultsPath, script_dir))
        # pair each result line's predicted label with its image name
        stringBuilder = []
        with open(resultsPath, 'r') as f:
            for i, line in enumerate(f):
                stringBuilder.append(line.split(',')[0] + ',' + imageNames[i])
        finalString = '\n'.join(stringBuilder)
    else:
        trainingData, toClassify = vectorizeForClassify(
            mappedFiles, unlabeledFiles)
        # Instantiate Classifier
        alg = getDiscreetClassifier(algorithm, params)
        # Train on data
        alg.fit(trainingData.data, trainingData.target)
        predictResult = predict(alg, toClassify.data)
        classifiedData = []
        # add file names back
        for index, label in enumerate(toClassify.names):
            classifiedData.append((predictResult[index][0], label))
        commaStringList = [','.join(x) for x in classifiedData]
        finalString = '\n'.join(commaStringList)
    classifiedDataId = db.files.put(finalString.encode("UTF-8"))
    # Save data Id to cynapse
    users = db.users
    users.update_one({'_id':userName},\
        {'$set':{'csynapses.{0}.classified.{1}'.format(csynapseName,dataName):classifiedDataId}})
def runTest(self):
    """Print the least squares shuffle cross-validation score and runtime."""
    classifier = getDiscreetClassifier('leastSquares')
    outcome = CV.doShuffleCrossValidation(classifier, self.data.data, self.data.target)
    print('LeastSquares result: ' + str(outcome.meanScore) +
          ' Time taken:' + str(outcome.timeTaken) + ' seconds')
def runTest(self):
    """Print the decision tree shuffle cross-validation score and runtime."""
    classifier = getDiscreetClassifier('decisionTree')
    outcome = CV.doShuffleCrossValidation(classifier, self.data.data, self.data.target)
    print('Decision Tree result: ' + str(outcome.meanScore) +
          ' Time taken:' + str(outcome.timeTaken) + ' seconds')
def runTest(self):
    """Print the passive aggressive shuffle cross-validation score and runtime."""
    classifier = getDiscreetClassifier('passiveAggressive')
    outcome = CV.doShuffleCrossValidation(classifier, self.data.data, self.data.target)
    print('Passive Aggressive result: ' + str(outcome.meanScore) +
          ' Time taken:' + str(outcome.timeTaken) + ' seconds')
def runTest(self):
    """Print the nearest centroid shuffle cross-validation score and runtime."""
    classifier = getDiscreetClassifier('nearestCentroid')
    outcome = CV.doShuffleCrossValidation(classifier, self.data.data, self.data.target)
    print('Nearest Centroid result: ' + str(outcome.meanScore) +
          ' Time taken:' + str(outcome.timeTaken) + ' seconds')
def runTest(self):
    """Print the random forest shuffle cross-validation score and runtime."""
    classifier = getDiscreetClassifier('randomForest')
    outcome = CV.doShuffleCrossValidation(classifier, self.data.data, self.data.target)
    print('Random Forest result: ' + str(outcome.meanScore) +
          ' Time taken:' + str(outcome.timeTaken) + ' seconds')
def runTest(self):
    """Print the SGD shuffle cross-validation score and runtime."""
    classifier = getDiscreetClassifier('sgd')
    outcome = CV.doShuffleCrossValidation(classifier, self.data.data, self.data.target)
    print('SGD result: ' + str(outcome.meanScore) +
          ' Time taken:' + str(outcome.timeTaken) + ' seconds')
def runTest(self):
    """Print the Gaussian naive Bayes shuffle cross-validation score and runtime."""
    classifier = getDiscreetClassifier('guassNB')
    outcome = CV.doShuffleCrossValidation(classifier, self.data.data, self.data.target)
    print('Guassian Naive Bayes result: ' + str(outcome.meanScore) +
          ' Time taken:' + str(outcome.timeTaken) + ' seconds')
def runTest(self):
    """Requesting a classifier by an unknown name must raise ValueError."""
    bogusName = 'blksci'
    with self.assertRaises(ValueError):
        getDiscreetClassifier(bogusName)
def runTest(self):
    """Print the logistic regression shuffle cross-validation score and runtime."""
    classifier = getDiscreetClassifier('logisticRegression')
    outcome = CV.doShuffleCrossValidation(classifier, self.data.data, self.data.target)
    print('logisticRegression result: ' + str(outcome.meanScore) +
          ' Time taken:' + str(outcome.timeTaken) + ' seconds')
def runTest(self):
    """Print the Bayesian ridge shuffle cross-validation score and runtime."""
    classifier = getDiscreetClassifier('bayesianRidge')
    outcome = CV.doShuffleCrossValidation(classifier, self.data.data, self.data.target)
    print('BayesianRidge result: ' + str(outcome.meanScore) +
          ' Time taken:' + str(outcome.timeTaken) + ' seconds')