def main(): """高维数据的迭代次数统计,及与理论迭代次数上限的比较 """ DATA_NUM = 100 DATA_DIM = 10 ITER_NUM = 10000 print('DATA_NUM:', DATA_NUM) print('DATA_DIM:', DATA_DIM) T = [] t = [] for i in range(ITER_NUM): print(f'\riter: {i+1:d} / {ITER_NUM:d}', end='', flush=True) x, y, wf = get_data(DATA_NUM, DATA_DIM) T.append(calculate_T(x, y, wf)) p = Perceptron(data_dim=DATA_DIM) p.train(x, y) t.append(p.n_iter) print() print(f'meant/meanT: {np.mean(t):.2f}/{np.mean(T):.2f}') draw_hist(t, 't', ITER_NUM, 'fig1_t') draw_hist(T, 'T', ITER_NUM, 'fig2_T') draw_hist(np.log(t), 'log_t', ITER_NUM, 'fig3_log_t') draw_hist(np.log(T), 'log_T', ITER_NUM, 'fig4_log_T')
def main(): """二维数据的迭代过程作图 """ DATA_NUM = 20 # num of data DATA_DIM = 2 # dimension of data x, y, wf = get_data(DATA_NUM, DATA_DIM) print('x:', x) print('y:', y) print('wf:', wf) positive_id = np.where(y > 0) negative_id = np.where(y < 0) print('true result:') print('positive_id:', positive_id) print('negative_id:', negative_id) p = Perceptron(data_dim=DATA_DIM) stop = False while not stop: draw(wf, p.wt, x[positive_id], x[negative_id], p.n_iter) stop = p.update(x, y) p.wt /= sum(p.wt) print('iter num:', p.n_iter) print('wt:', p.wt) wt_dot_x = np.dot(p.wt[:-1], x.transpose()) + p.wt[-1] positive_id = np.where(wt_dot_x > 0) negative_id = np.where(wt_dot_x < 0) print('perceptron result:') print('positive_id:', positive_id) print('negative_id:', negative_id)
import datetime as dt

import numpy as np
import pandas as pd
import scipy.optimize as spo

from util import get_data  # assumed helper module; get_data also loads SPY


def optimize_portfolio(sd=dt.datetime(2008, 1, 1), ed=dt.datetime(2009, 1, 1),
                       syms=['GOOG', 'AAPL', 'GLD', 'XOM'], gen_plot=False):
    # Read in adjusted closing prices for the given symbols and date range
    dates = pd.date_range(sd, ed)
    prices_all = get_data(syms, dates)  # automatically adds SPY
    prices = prices_all[syms]           # only portfolio symbols
    prices_SPY = prices_all['SPY']      # only SPY, for comparison later

    # find the allocations for the optimal portfolio
    # note that the values here ARE NOT meant to be correct for a test case
    allocs = np.ones(len(syms))  # initial guess for the optimizer
    # add code here to find the allocations
    constraint = ({'type': 'eq', 'fun': lambda inputs: 1.0 - np.sum(inputs)})
    bounds = ((0, 2),) * len(syms)
    result = spo.minimize(err, allocs, args=(prices,), method='SLSQP',
                          constraints=constraint, bounds=bounds)
    allocs = result.x
    sr = sharpe(allocs, prices)
    adr = average_daily_return(allocs, prices)
    cr = cumulative_return(allocs, prices)
    sddr = volatility(allocs, prices)

    # Get daily portfolio value
    port_val = prices_SPY  # add code here to compute daily portfolio values

    # Compare daily portfolio value with SPY using a normalized plot
    if gen_plot:
        # add code to plot here
        df_temp = pd.concat([port_val, prices_SPY],
                            keys=['Portfolio', 'SPY'], axis=1)
        pass

    return allocs, cr, adr, sddr, sr
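# A hedged sketch of the helper functions optimize_portfolio calls but does
# not define (err, sharpe, average_daily_return, cumulative_return,
# volatility). Their names and signatures come from the snippet; the bodies
# are one plausible reading, and portfolio_daily_returns is a helper
# introduced here, not from the source.
import numpy as np

def portfolio_daily_returns(allocs, prices):
    # normalize each price series to 1.0, weight by allocation, sum across symbols
    port_val = (prices / prices.iloc[0] * allocs).sum(axis=1)
    daily = port_val / port_val.shift(1) - 1
    return daily.iloc[1:], port_val

def cumulative_return(allocs, prices):
    _, port_val = portfolio_daily_returns(allocs, prices)
    return port_val.iloc[-1] / port_val.iloc[0] - 1

def average_daily_return(allocs, prices):
    daily, _ = portfolio_daily_returns(allocs, prices)
    return daily.mean()

def volatility(allocs, prices):
    daily, _ = portfolio_daily_returns(allocs, prices)
    return daily.std()

def sharpe(allocs, prices, k=np.sqrt(252)):
    # annualized Sharpe ratio, assuming a zero risk-free rate
    daily, _ = portfolio_daily_returns(allocs, prices)
    return k * daily.mean() / daily.std()

def err(allocs, prices):
    # SLSQP minimizes, so negate the Sharpe ratio to maximize it
    return -sharpe(allocs, prices)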
def main():
    DATA_NUM = 20
    DATA_DIM = 2
    x, y, wf = get_data(DATA_NUM, DATA_DIM)
    p = Perceptron(data_dim=DATA_DIM)
    p.train(x, y)
    print(p.n_iter)
    print(p.wt)
def extract_data(flatten):
    data, labels = get_data(_DATA_PATH, class_labels=_CLASS_LABELS,
                            flatten=flatten)
    x_train, x_test, y_train, y_test = train_test_split(
        data, labels, test_size=0.2, random_state=42)
    return (np.array(x_train), np.array(x_test), np.array(y_train),
            np.array(y_test), len(_CLASS_LABELS))
def get_num_sections(subject_id, course_id):
    data = get_data(subject_id)
    num_sections = 0
    for course in data:
        if course["courseNumber"] == str(course_id):
            for section in course["sections"]:
                if section["printed"] == "Y":
                    num_sections += 1
    print(num_sections)
    return num_sections
def get_open_sections(subject_id, course_id):
    data = get_data(subject_id)
    num_sections = 0
    for course in data:
        if course["courseNumber"] == str(course_id):
            for section in course["sections"]:
                if section["openStatus"] and section["printed"] == "Y":
                    print(section["index"])
                    num_sections += 1
    return num_sections
def __init__(self, symbols, sd, ed, sv, verbose=False):
    self.symbols = symbols
    self.sd = sd
    self.ed = ed
    self.sv = sv
    self.dates = pd.date_range(sd, ed)
    self.stocks = utility.get_data(self.symbols, self.dates)
    self.tenmean_pd = []
    self.twentymean_pd = []
    self.return_pd = []
def create_csv(subject_id):
    data = get_data(subject_id)
    with open("static/csv/" + str(subject_id) + ".csv", 'w') as target:
        target.write("Course,Open,Closed\n")
        for course in data:
            num_sections = 0
            num_open = 0
            for section in course["sections"]:
                if section["printed"] == "Y":
                    num_sections += 1
                    if section["openStatus"]:
                        num_open += 1
            if num_sections != 0:
                target.write(course["courseNumber"] + "," + str(num_open) +
                             "," + str(num_sections - num_open) + "\n")
label_data = ut.add_labels_to_data(label_data, labels)

# initialize empty dataframes for acc and gyro data at different frequencies
acc_hz_5 = ut.get_empty_dataframe()
acc_hz_10 = ut.get_empty_dataframe()
acc_hz_25 = ut.get_empty_dataframe()
acc_hz_50 = ut.get_empty_dataframe()
gyro_hz_5 = ut.get_empty_dataframe()
gyro_hz_10 = ut.get_empty_dataframe()
gyro_hz_25 = ut.get_empty_dataframe()
gyro_hz_50 = ut.get_empty_dataframe()

# process each file
for file in files:
    # read the file into a dataframe
    df = ut.get_data(file)
    # get the filename from the filepath
    filename = ut.get_file_name_from_path(file)
    # handle all raw-data files other than the labels.txt file
    if 'labels' not in filename:
        # parse experimentID and userID from a name such as
        # 'acc_exp01_user01' -> exp_id = 1, user_id = 1
        exp_id = int(filename.split('_')[1][3:])
        user_id = int(filename.split('_')[2][4:])
        # attach experimentID, userID, and a placeholder activityID
        df['experimentID'] = exp_id
        df['userID'] = user_id
        df['activityID'] = 0
import datetime
import operator
import pickle
import time

import matplotlib.pyplot as plt
import numpy as np
import xgboost as xgb
from scipy import sparse
from scipy.spatial.distance import cosine
from sklearn import preprocessing
from sklearn.cross_validation import StratifiedKFold  # assumed: the old-style CV API used below
from sklearn.metrics import accuracy_score, pairwise_distances

from utility import get_data, app_random_state_value  # assumed local helpers

fullTrainFile = 'C:/myD/workarea/KaggleWallmartWorkarea/kaggle_wallmart/python/src/preprocess/BOTH_output_files/train_svm_light.v5.4.BOTH.2015-12-26_16-48-50.txt'
fullTestFile = 'C:/myD/workarea/KaggleWallmartWorkarea/kaggle_wallmart/python/src/preprocess/BOTH_output_files/test_svm_light.v5.4.BOTH.2015-12-26_09-24-51.txt'

X, Y = get_data(fullTrainFile)
# X = sparse.csr_matrix(X)[:, list(range(0, 155))]
# Y_14 = np.where(Y == 14)  # to delete TT_14 rows
# Y = np.delete(Y, Y_14, 0)
# X = delete_rows_csr(X, Y_14)

le = preprocessing.LabelEncoder()
Y = le.fit_transform(Y).astype(np.int32)
# X = X.astype(np.float32)

skf = StratifiedKFold(Y, n_folds=3, random_state=app_random_state_value)  # , shuffle=True)
skfList = list(skf)
train_index, test_index = skfList[0]

X = X[:, 0:5391]
XD = X  # .todense()
xTr, xTe = XD[train_index], XD[test_index]
yTr, yTe = Y[train_index], Y[test_index]
def get_subject_name(subject_id, course_id):
    data = get_data(subject_id)
    for course in data:
        if course["courseNumber"] == str(course_id):
            return course["title"]
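# A hedged sketch of the get_data helper that the course-schedule functions
# above rely on: it returns a list of course dicts, each with "courseNumber",
# "title", and a "sections" list whose entries carry "printed", "openStatus",
# and "index". The endpoint URL is a placeholder, not from the source.
import json
from urllib.request import urlopen

def get_data(subject_id):
    url = "https://example.com/api/courses.json?subject={}".format(subject_id)
    with urlopen(url) as response:
        return json.load(response)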
def train(imgs_train_path, imgs_test_path, output_models_path,
          output_images_path, batch_size, epochs, epoch_size=1000,
          training_images_to_load=3000, test_images=50,
          training_for_generator_each_batch=2, save_data=True):
    train_x, train_y = utility.get_data(imgs_train_path, IMAGE_SHAPE,
                                        training_images_to_load)
    test_x, test_y = utility.get_data(imgs_test_path, IMAGE_SHAPE,
                                      test_images, False)
    if save_data:
        utility.save_data_as_pickle(train_x, "dataset/operative_data/train_x")
        utility.save_data_as_pickle(train_y, "dataset/operative_data/train_y")
        utility.save_data_as_pickle(test_x, "dataset/operative_data/test_x")
        utility.save_data_as_pickle(test_y, "dataset/operative_data/test_y")
        print("--SAVED DATA")

    generator = model.get_generator(IMAGE_SHAPE)
    discriminator = model.get_discriminator(IMAGE_SHAPE)
    loss_generator = loss.VGG_LOSS(IMAGE_SHAPE)
    generator.compile(loss=loss_generator.vgg_loss, optimizer=OPTIMIZER)
    discriminator.compile(loss="binary_crossentropy", optimizer=OPTIMIZER)
    gan = model.get_gan_model(DOWNSCALED_IMG_SHAPE, generator, discriminator,
                              loss_generator.vgg_loss, "binary_crossentropy",
                              OPTIMIZER)
    gan.summary()

    n_batch = int(epoch_size / batch_size)
    n_batch_test = int(test_images / batch_size)
    true_batch_vector = np.ones((batch_size, 1))
    false_batch_vector = np.zeros((batch_size, 1))

    print("--START TRAINING")
    for epoch in range(epochs):
        discriminator_losses = []
        gan_losses = []
        epoch_start_time = time.time()
        # train each batch
        for batch in range(n_batch):
            random_indexes = np.random.randint(0, len(train_x), size=batch_size)
            batch_x = np.array(train_x)[random_indexes.astype(int)]
            batch_y = np.array(train_y)[random_indexes.astype(int)]
            generated_images = generator.predict(x=batch_x, batch_size=batch_size)

            discriminator.trainable = True
            # we can decide to perform more than one discriminator update per batch
            for _ in range(training_for_generator_each_batch):
                d_loss_r = discriminator.train_on_batch(batch_y, true_batch_vector)
                # noisy (smoothed) labels for the generated batch
                d_loss_f = discriminator.train_on_batch(
                    generated_images, np.random.random_sample(batch_size) * 0.2)
                discriminator_losses.append(0.5 * np.add(d_loss_f, d_loss_r))
            discriminator.trainable = False

            # train the generator, with one-sided label smoothing on the real labels
            gan_Y = np.ones(batch_size) - np.random.random_sample(batch_size) * 0.2
            gan_loss = gan.train_on_batch(batch_x, [batch_y, gan_Y])
            gan_losses.append(gan_loss)

        test_losses = []
        for i in range(n_batch_test):
            batch_x = np.array(test_x)[i * batch_size:(i + 1) * batch_size]
            batch_y = np.array(test_y)[i * batch_size:(i + 1) * batch_size]
            gan_Y = np.ones(batch_size) - np.random.random_sample(batch_size) * 0.2
            gan_loss = gan.test_on_batch(batch_x, [batch_y, gan_Y])
            test_losses.append(gan_loss)
        # print("discriminator loss: ", np.mean(discriminator_losses),
        #       " gan losses: ", [np.mean(x) for x in zip(*gan_losses)],
        #       " time: ", time.time() - epoch_start_time)
        print("test: ", [np.mean(x) for x in zip(*test_losses)])

        if epoch % 3 == 0 or epoch == 0:
            generator.save(output_models_path + 'gen_model%d.h5' % epoch)
            discriminator.save(output_models_path + 'dis_model%d.h5' % epoch)
            utility.plot_generated_images(output_images_path, epoch,
                                          generator, test_y, test_x)
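# Example invocation of train(); the paths are illustrative, not from the
# source, and everything else uses the defaults in the signature above.
train(imgs_train_path="dataset/train/",
      imgs_test_path="dataset/test/",
      output_models_path="models/",
      output_images_path="outputs/",
      batch_size=16,
      epochs=30)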
import operator

import matplotlib.pyplot as plt
import numpy as np
import xgboost as xgb
from scipy import sparse
from scipy.spatial.distance import cosine
from sklearn import preprocessing
from sklearn.cross_validation import StratifiedKFold  # assumed: the old-style CV API used below
from sklearn.metrics import (accuracy_score, confusion_matrix,
                             pairwise_distances,
                             precision_recall_fscore_support)

from utility import delete_rows_csr
from utility import get_data, app_random_state_value  # assumed local helpers

fullTrainFile = 'C:/myD/workarea/KaggleWallmartWorkarea/kaggle_wallmart/python/src/preprocess/BOTH_output_files/train_svm_light.v5.4.BOTH.2015-12-26_16-48-50.txt'
fullTestFile = 'C:/myD/workarea/KaggleWallmartWorkarea/kaggle_wallmart/python/src/preprocess/BOTH_output_files/test_svm_light.v5.4.BOTH.2015-12-26_09-24-51.txt'

X, Y = get_data(fullTrainFile)
# X = sparse.csr_matrix(X)[:, list(range(0, 155))]
# Y_14 = np.where(Y == 14)  # to delete TT_14 rows
# Y = np.delete(Y, Y_14, 0)
# X = delete_rows_csr(X, Y_14)

le = preprocessing.LabelEncoder()
Y = le.fit_transform(Y).astype(np.int32)
# X = X.astype(np.float32)

skf = StratifiedKFold(Y, n_folds=3, random_state=app_random_state_value)
skfList = list(skf)
train_index, test_index = skfList[0]

XD = X  # .todense()
xTr, xTe = XD[train_index], XD[test_index]
yTr, yTe = Y[train_index], Y[test_index]
import numpy as np
from sklearn.cross_validation import StratifiedKFold  # assumed: the old-style CV API used below
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import (mean_absolute_error, mean_squared_error,
                             precision_recall_fscore_support)
from sklearn.naive_bayes import MultinomialNB

from utility import get_data, app_random_state_value  # assumed local helpers

fullTrainFile = '../files/train_low_freq_removed_500k.txt'
maxIterations = 5  # number of CV results you want to see, a value between 1 and 5
printPredictionToFile = False  # deprecated: should not be used since we are doing CV; keep it False
algoVerbose = False
predFile = 'files/nb.mn.25.75.predictions'  # sys.argv[2]
useTfIdf = True  # this becomes feature 3! True REDUCES MAE (good)

# Hyperparameters list
##

X, Y = get_data(fullTrainFile)
if useTfIdf:
    transformer = TfidfTransformer()
    X = transformer.fit_transform(X, Y)

classes = np.unique(Y)  # assumption: 'classes' is used below but never defined in the snippet
skf = StratifiedKFold(Y, n_folds=5, random_state=app_random_state_value)
scoreSumMatrix = np.zeros((4, len(classes)))
mseSum = 0
maeSum = 0
count = 1
for train_index, test_index in skf:
    xTr, xTe = X[train_index], X[test_index]
    yTr, yTe = Y[train_index], Y[test_index]
    clf = MultinomialNB()
    yhTe = clf.fit(xTr, yTr).predict(xTe)
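    # A hedged sketch of how this CV loop might finish: accumulate the
    # per-fold metrics into the sums initialized above. The actual
    # continuation is not shown in the snippet.
    scoreSumMatrix += np.array(
        precision_recall_fscore_support(yTe, yhTe, labels=classes))
    mseSum += mean_squared_error(yTe, yhTe)
    maeSum += mean_absolute_error(yTe, yhTe)
    count += 1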
import csv

import matplotlib.pyplot as plt
import sklearn.model_selection as ms
import xgboost as xgb

from utility import get_data

# load the data
data, label = get_data(training=True)
train_data, val_data, train_label, val_label = ms.train_test_split(
    data, label, test_size=0.25, random_state=1)

feature_name = [
    'Pclass', 'Sex', 'Fare', 'is_child', 'family_size', 'Embarked-c',
    'Embarked-s', 'Embarked-q'
]
data_matrix = xgb.DMatrix(data, label, feature_names=feature_name)
train_matrix = xgb.DMatrix(train_data, train_label, feature_names=feature_name)
val_matrix = xgb.DMatrix(val_data, val_label, feature_names=feature_name)
test_matrix = xgb.DMatrix(get_data(training=False), feature_names=feature_name)

num_trees = 47
eval_list = [(train_matrix, 'train'), (val_matrix, 'eval')]
params = {
    'objective': 'reg:logistic',
    'eval_metric': 'error',
    'max_depth': 5,
    'min_child_weight': 1,
    'eta': 0.5,
    # ... (the remaining entries are truncated in the original)
}
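# A hedged sketch of the training step these matrices typically feed into;
# num_trees and eval_list come from the snippet, and xgb.train /
# booster.predict are the standard xgboost calls. It assumes the params dict
# above has been completed, since the original snippet truncates mid-dictionary.
booster = xgb.train(params, train_matrix, num_boost_round=num_trees,
                    evals=eval_list)
test_predictions = booster.predict(test_matrix)  # probabilities under reg:logistic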
import datetime as dt

import matplotlib.pyplot as plt
# from matplotlib.font_manager import FontProperties
import numpy as np
import pandas as pd

import StrategyLearner as sl  # assumed: module providing StrategyLearner
import utility

sd = dt.datetime(2010, 1, 1)
ed = dt.datetime(2010, 12, 31)
symbol = "GLD"
learner = sl.StrategyLearner(symbols=[symbol],
                             sd=sd,
                             ed=ed,
                             sv=100000,
                             verbose=False)  # constructor
result = learner.train()

dates = pd.date_range(sd, ed)
stocks = utility.get_data(symbol, dates)
prices = pd.DataFrame(np.zeros(((ed - sd).days + 1, 1)))
prices.columns = [symbol]
result = np.zeros((251, 1)) + 10
portfolio = 0
inv = 0
out = 100000
value = 100000
money = True
for i in range(0, 250):
    prices[symbol].iloc[i] = stocks[symbol].iloc[i]
    if i > 14:
        # trailing 15-day window of prices
        fifteen_days = prices[symbol].loc[i - 14:i]
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from keras.models import model_from_json

from utility import change_to_sequence_data, get_data  # assumed local helpers

sns.set(style="darkgrid")


def load_model(model_name, model_weights_filename):
    with open("model/{}.json".format(model_name), "r") as json_file:
        model = model_from_json(json_file.read())
    model.load_weights("model/{}.h5".format(model_weights_filename))
    model.compile(loss="categorical_crossentropy", optimizer="adam",
                  metrics=["categorical_accuracy"])
    return model


if __name__ == "__main__":
    model = load_model("CNN-RNN", "CNN-RNN-61(-3395)")

    # Change your file path
    data_path = "./data/focused/darkfanxing_1.csv"
    data = get_data(data_path)
    data = change_to_sequence_data(data)
    predictions = np.argmax(model.predict(data), axis=-1)
    # predictions = np.where(predictions == 1)[1]
    sns.lineplot(x=[time for time in range(predictions.shape[0])],
                 y=predictions).set(xlabel="time(s)", ylabel="is_focused")
    plt.show()
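# A hedged sketch of the change_to_sequence_data helper assumed above:
# slicing per-timestep samples into fixed-length windows of the shape a
# CNN-RNN expects. The window length and reshape convention are assumptions,
# not from the source.
def change_to_sequence_data(data, window=10):
    data = np.asarray(data, dtype=np.float32)
    n = data.shape[0] // window          # number of complete windows
    return data[:n * window].reshape(n, window, -1)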
'''
Created on Dec 14, 2015

@author: vaibhav
'''
import datetime
import time

import numpy as np
import xgboost as xgb
from sklearn import preprocessing

from utility import get_data
from utility import app_random_state_value

bothDataDir = 'C:/myD/workarea/KaggleWallmartWorkarea/kaggle_wallmart/python/src/preprocess/BOTH_output_files/'
fullTrainFile = bothDataDir + 'train_svm_light.v5.4.BOTH.2015-12-26_16-48-50.txt'

xTr, yTr = get_data(fullTrainFile)
le = preprocessing.LabelEncoder()
le.fit(yTr)
yTr = le.transform(yTr)

fullTestFile = bothDataDir + 'test_svm_light.v5.4.BOTH.2015-12-26_16-48-50.txt'
xTe, vnTe = get_data(fullTestFile)  # vnTe is the visit number for the TEST FILE

xg_train = xgb.DMatrix(xTr, label=yTr)
xg_test = xgb.DMatrix(xTe)  # , label=yTe)

param = {'max_depth': 50, 'eta': 0.15, 'silent': 1,
         'objective': 'multi:softprob', 'max_delta_step': 1,
         'num_class': len(le.classes_), 'eval_metric': 'mlogloss',
         'seed': 54325, 'nthread': 4, 'subsample': 0.8,
         'colsample_bytree': 0.8, 'min_child_weight': 2,
         'lambda': 8, 'alpha': 3, 'gamma': 1}
# param = {'max_depth': 50, 'eta': 0.1, 'silent': 1, 'objective': 'multi:softprob',
#          'num_class': 38, "eval_metric": "mlogloss", "seed": app_random_state_value}

watchlist = [(xg_train, 'train')]  # , (xg_test, 'test')]
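# A hedged sketch of the training and prediction step that typically follows
# this setup; num_round is illustrative (the snippet does not specify it),
# everything else uses the objects defined above.
num_round = 100
bst = xgb.train(param, xg_train, num_round, watchlist)
prob = bst.predict(xg_test)  # shape (n_samples, num_class) under multi:softprob
pred_labels = le.inverse_transform(np.argmax(prob, axis=1))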
def get_label_data(files):
    # find and load the labels file (e.g. labels.txt) among the raw-data files
    for file in files:
        if 'labels' in file:
            return ut.get_data(file)