def main():
    """Entry point for the Kafka host diagnostics tool.

    Parses the command line and dispatches to one of three commands:
      gather  - collect diagnostics into a directory (default; a temp
                directory is created when --directory is not given)
      display - render previously gathered data from a directory
      test    - read gathered data back in and run self-tests

    Raises:
        Exception: if `display` or `test` is requested without --directory.
    """
    parser = argparse.ArgumentParser(
        description='Gather information about the Apache Kafka host.')
    parser.add_argument(
        'command', nargs='?', default="gather",
        choices=["gather", "display", "test"],
        # Fixed: the old help text said "analyze", which is not a choice.
        help='command to perform, either gather (default), display or test')
    parser.add_argument('--directory', help='input or output directory')
    # Fixed: the previous `nargs="?", type=bool` pattern could never yield
    # False from an explicit value (bool("false") is True in Python);
    # a plain store_true flag is the intended semantics.
    parser.add_argument(
        '--force', '-f', action='store_true', default=False,
        help='force the output even if the directory contains data')
    ns = parser.parse_args(sys.argv[1:])
    if ns.command == "gather":
        directory = ns.directory
        if directory is None:
            # No directory given: gather into a fresh temporary directory.
            directory = tempfile.mkdtemp('_output', 'kdiag_')
        writer.validate(directory, force=ns.force)
        gather()
        sys.stdout.write("\n")
        writer.write(directory, environment.Environment.getInstance(),
                     force=ns.force)
    elif ns.command == "display":
        directory = ns.directory
        if directory is None:
            raise Exception("require input directory to be specified")
        display(directory)
    elif ns.command == "test":
        directory = ns.directory
        if directory is None:
            raise Exception("require input directory to be specified")
        reader.read(directory)
        test()
def main(argv):
    """Disabled CLI entry point for the naive-Bayes classifier.

    This module is meant to be driven interactively; calling main() always
    raises immediately.  Everything after the raise is intentionally
    unreachable and kept only for reference.

    Raises:
        Exception: always.
    """
    # Fixed typo in the message ("Dont'" -> "Don't").
    raise Exception(
        "Don't call this from main. Instead, open up a Python interpreter "
        "in the project directory and type `from clf import *` ")
    # --- unreachable below this point ---
    if len(argv) < 3:
        print("Usage: python naive_bayes.py <train_data> <test_data>")
        sys.exit(1)
    y, X = reader.read(argv[1], **{
        'extractFeaturesFn': extractFeatures2009,
        'extractLabelsFn': extractLabel,
        'limit': LIMIT})
    testY, testX = reader.read(argv[2], **{
        'extractFeaturesFn': extractFeatures2010,
        'extractLabelsFn': extractLabel,
        'limit': LIMIT})
def main(argv):
    """Run PCA (k=3) over the training data.

    argv: command-line arguments; argv[1] is the training-data path and
    argv[2] the test-data path (test data is currently unused — the read
    is commented out below).
    """
    if len(argv) < 3:
        # Single-argument print() form: identical output on Python 2 and 3.
        print("Usage: python pca.py <train_data> <test_data>")
        sys.exit(1)
    # NOTE(review): assumes reader.read returns (labels, features) — TODO
    # confirm against the project reader.
    y, x = reader.read(argv[1],
                       extractFeaturesFn=extractFeatures,
                       extractLabelsFn=extractLabel,
                       limit=LIMIT)
    # testY, testX = reader.read(argv[2], extractFeaturesFn=extractFeatures,
    #                            extractLabelsFn=extractLabel, limit=LIMIT)
    k = 3
    model = Model(k)
    compressedFeatures = model.pca(x)
def test_weightedquickunion_compression():
    """Weighted quick-union with path compression over the fixture data."""
    uf = UFapi.WeightedQuickUnion(10, True)
    for a, b in reader.read(PATH):
        uf.union(a, b)
    # Expected roots after all unions: 1 1 1 3 3 1 1 1 3 3
    assert uf.connected(0, 1)
    assert uf.connected(4, 9)
    assert uf.find(6) == 1
    assert uf.find(2) == 1
    assert uf.find(8) == 3
    assert uf.count() == 2
def test_quickunion():
    """Plain quick-union (no weighting) over the fixture data."""
    uf = UFapi.QuickUnion(10)
    for a, b in reader.read(PATH):
        uf.union(a, b)
    # Expected roots after all unions: 1 1 1 8 8 1 1 1 8 8
    assert uf.connected(0, 1)
    assert uf.connected(4, 9)
    assert uf.find(6) == 1
    assert uf.find(2) == 1
    assert uf.find(8) == 8
    assert uf.count() == 2
def main(argv):
    """Train the model and run K-means over the training data.

    argv: command-line arguments; argv[1] is the training-data path and
    argv[2] the test-data path (test data is currently unused — the read
    is commented out below).
    """
    if len(argv) < 3:
        # Single-argument print() form: identical output on Python 2 and 3.
        print("Usage: python kmeans.py <train_data> <test_data>")
        sys.exit(1)
    y, x = reader.read(argv[1],
                       extractFeaturesFn=extractFeatures,
                       extractLabelsFn=extractLabel,
                       limit=LIMIT)
    # testY, testX = reader.read(argv[2], extractFeaturesFn=extractFeatures,
    #                            extractLabelsFn=extractLabel, limit=LIMIT)
    print(np.shape(x))
    print(np.shape(y))
    model = Model()
    model.train(x, y)
    model.runKmeans(x)
    distances = model.distanceToCentroids(x)
if __name__ == "__main__":
    # main(sys.argv)
    # This module is interactive-only: refuse to run as a script.
    # (Single-argument print() form: identical output on Python 2 and 3.)
    print(WARNING + "Don't call this via the command line; instead, open up ipython and type in `from final import *`")
    sys.exit(1)
else:
    # -----------------------------------------------------------
    # Imported (e.g. `from final import *` in ipython): load the
    # 2009 training set and the 2010 test set at import time.
    print(OKBLUE)
    print("Reading in data")
    print(ENDC)
    y, X = reader.read("data/2009", **{
        'extractFeaturesFn': extractFeatures2009,
        'extractLabelsFn': extractLabel,
        'limit': LIMIT})
    testY, testX = reader.read("data/2010", **{
        'extractFeaturesFn': extractFeatures2010,
        'extractLabelsFn': extractLabel,
        'limit': LIMIT})
    print(OKGREEN)
    print("Done reading data")
    print(ENDC)
    # -----------------------------------------------------------
    # Preprocess for linear regression
    'limit': LIMIT })  # NOTE(review): tail of a call whose opening lies before this chunk
# ----------------------------------------------------------
# Exec
if __name__ == "__main__":
    main(sys.argv)
else:
    # ------- Actual ---------
    # setup for scripting
    # Load the 2009 data set via the project reader.
    # NOTE(review): assumes reader.read returns (labels, features) — TODO confirm.
    y, X = reader.read("data/2009", **{
        'extractFeaturesFn': extractFeatures2009,
        'extractLabelsFn': extractLabel,
        'limit': LIMIT })
    # Commented-out experiments kept for reference:
    # idxs = np.where(X[:, 3] == 0)
    # X = X[idxs]
    # y = y[idxs]
    # Process and remove drugs and procedures and
    # leave it up to PCA to reduce dimensionality
    # procedures = X[:, 3:11]
    # drugTypes = X[:, 12:19]
    # X = np.delete(X, range(3, 19), axis=1)
    # idxs = np.where(y==10)[0][:25]
    # X = np.delete(X, idxs, axis=0)
def main(argv):
    """Train a ridge-regression model on the data file named in argv[1].

    Pipeline: read the data, impute invalid cells (<= -7) with the column
    average, drop rows and columns with too many invalid cells, append
    K-means cluster features, fit RidgeCV, feature-select, and plot
    train/test error.  (Python 2 — uses print statements.)
    """
    if len(argv) < 2:
        print "Usage: python linear_regression.py <data>"
        sys.exit(1)
    # NOTE(review): assumes reader.read returns (labels, features) — TODO confirm.
    Y, X = reader.read(argv[1], **{
        'extractFeaturesFn': extractFeatures,
        'extractLabelsFn': extractTarget,
        'limit': LIMIT })
    # Take out invalid values
    # takeOutInvalid is hard-wired False, so this filter currently keeps
    # every row; flip it to drop rows containing any value <= -7.
    takeOutInvalid = False
    XY = np.array([xy for xy in np.hstack((X, Y.reshape(-1, 1)))
                   if not takeOutInvalid or all([i > -7 for i in xy])])
    # Shuffle rows so later train/test splits are random.
    XY = np.random.permutation(XY)
    X = XY[:, :-1]
    Y = XY[:, -1]
    print len(Y)
    # Applying average feature values to invalid values
    averages = []            # per-column mean (over all rows, including invalid cells)
    numInvalidByFeature = [] # per-column count of imputed cells
    for j in range(len(X[0])):
        averages.append(sum([x[j] for x in X]) / len(X))
        numInvalidByFeature.append(0)
    numInvalid = 0       # total imputed cells
    numBadX = 0          # rows dropped for having too many invalid cells
    numInvalidInX = 0
    cleanX = []
    cleanY = []
    for i in range(len(X)):
        numInvalidInX = 0
        for j in range(len(X[0])):
            if X[i][j] <= -7:
                # Replace an invalid cell with the column average.
                X[i][j] = averages[j]
                numInvalid = numInvalid + 1
                numInvalidInX = numInvalidInX + 1
                numInvalidByFeature[j] = numInvalidByFeature[j] + 1
        # Keep the row only if fewer than 1/15 of its cells were invalid.
        if numInvalidInX > len(X[0]) / 15:
            numBadX = numBadX + 1
        else:
            cleanX.append(X[i])
            cleanY.append(Y[i])
    print numInvalid
    # 107123
    print numBadX
    # 9854 xs have >1/10 invalid cells
    # 14582 xs have >1/15 invalid cells
    # 19452 xs have >1/20 invalid cells
    # Take out invalid features (i.e. features with too many invalid values)
    invalidFeatures = []  # Indices of invalid features
    for j in range(len(X[0])):
        # A column is invalid when more than 1/10 of its cells were imputed.
        if numInvalidByFeature[j] > len(X) / 10:
            invalidFeatures.append(j)
    for i in range(len(cleanX)):
        cleanX[i] = np.delete(cleanX[i], invalidFeatures)
    X = np.array(cleanX)
    Y = np.array(cleanY)
    print X.shape, Y.shape
    # Add K-means features
    X = addKMeansFeatures(X, Y)
    # Create model
    # model = linear_model.LinearRegression()
    # model = linear_model.Lasso(alpha=.01)
    # model = linear_model.LassoCV(alphas=[0.001, 0.01, 0.1, 1, 10, 100])
    # model.fit(X, Y)
    # print model.alpha_
    # model = linear_model.Ridge(alpha=100)
    model = linear_model.RidgeCV(normalize=True,
                                 alphas=[0.0001, 0.001, 0.01, 0.05, 0.1, 0.2,
                                         0.5, 1, 10, 100, 1000, 10000, 100000])
    model.fit(X, Y)
    print 'alpha:', model.alpha_
    # model = linear_model.ElasticNet(alpha=.1, l1_ratio=.1)
    # NOTE(review): model.fit(X, Y) is called a second time here — redundant
    # with the fit above unless the ElasticNet line is uncommented; verify.
    model.fit(X, Y)
    print 'intercept:', model.intercept_
    print 'coef:', model.coef_
    # Feature selection, increases performance a lot
    X = fsel(model, X, Y)
    # Error over m
    plotTrainingTestError(model, X, Y)
    # NOTE(review): this chunk begins mid-function — the `def main(argv)`
    # header and the argv-length check precede this sys.exit(1) outside
    # this view.
    sys.exit(1)
    y, x = reader.read(argv[1], extractFeaturesFn=extractFeatures, extractLabelsFn=extractLabel, limit=LIMIT)
    # testY, testX = reader.read(argv[2], extractFeaturesFn=extractFeatures, extractLabelsFn=extractLabel, limit=LIMIT)
    print np.shape(x)
    print np.shape(y)
    model = Model()
    model.train(x, y)
    model.runKmeans(x)
    distances = model.distanceToCentroids(x)


if __name__ == "__main__":
    main(sys.argv)
else:
    # Imported for interactive use: train on 2009 data, test on 2010.
    DEFAULT_TRAIN = './data/2009'
    DEFAULT_TEST = './data/2010'
    y, x = reader.read(DEFAULT_TRAIN, extractFeaturesFn=extractFeatures, extractLabelsFn=extractLabel, limit=LIMIT)
    model = Model()
    model.train(x, y)
    ty, tx = reader.read(DEFAULT_TEST, extractFeaturesFn=extractFeatures, extractLabelsFn=extractLabel)
    # NOTE(review): ty/tx are loaded but not passed to model.test() —
    # presumably the model reads them elsewhere; verify against Model.
    model.test()
# Export the 2010 data set to ./2010.csv: one header row of feature names
# plus the label column, then one comma-joined row per example.
outfile = open('./2010.csv', 'w')


def writeline(arr):
    """Write *arr* (a list of strings) to the output CSV as one line.

    Returns whatever file.write returns.  (Removed the needless
    `global outfile` — reading a module-level name requires no
    declaration; `global` is only needed for assignment.)
    """
    return outfile.write(",".join(arr) + "\n")


def extractTimeWithMd(line):
    """Parse the time-with-MD value from the raw fixed-width record.

    NOTE(review): slice [291:293] is a 2-character field at offset 291 —
    presumably per the 2010 record layout; confirm against the spec.
    """
    return int(line[291:293])


def extractFeatures2010(line):
    """Extract features based on specs from 2010."""
    return [extract(line, spec) for _, spec in features["2010"]]


def extractLabel(line):
    """Main label extraction fn."""
    return extractTimeWithMd(line)


y, X = read("./data/2010", **{
    'extractFeaturesFn': extractFeatures2010,
    'extractLabelsFn': extractLabel})
# Header row: feature names followed by the label column name.
writeline([spec[0] for spec in features["2010"]] + ['timeWithMD(y)'])
# Data rows: features with the label appended as the final column.
for example in np.hstack([X, y.reshape(-1, 1)]):
    writeline([str(a) for a in example])
outfile.close()