def main():
    """高维数据的迭代次数统计,及与理论迭代次数上限的比较
    """
    DATA_NUM = 100
    DATA_DIM = 10
    ITER_NUM = 10000

    print('DATA_NUM:', DATA_NUM)
    print('DATA_DIM:', DATA_DIM)
    T = []
    t = []

    for i in range(ITER_NUM):
        print(f'\riter: {i+1:d} / {ITER_NUM:d}', end='', flush=True)
        x, y, wf = get_data(DATA_NUM, DATA_DIM)
        T.append(calculate_T(x, y, wf))

        p = Perceptron(data_dim=DATA_DIM)
        p.train(x, y)
        t.append(p.n_iter)

    print()
    print(f'mean t / mean T: {np.mean(t):.2f} / {np.mean(T):.2f}')

    draw_hist(t, 't', ITER_NUM, 'fig1_t')
    draw_hist(T, 'T', ITER_NUM, 'fig2_T')
    draw_hist(np.log(t), 'log_t', ITER_NUM, 'fig3_log_t')
    draw_hist(np.log(T), 'log_T', ITER_NUM, 'fig4_log_T')
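get_data, calculate_T, Perceptron, and draw_hist come from the surrounding project and are not shown here. A minimal sketch of calculate_T, assuming it evaluates the classic Novikoff perceptron convergence bound T = R^2 / gamma^2, with R the largest augmented sample norm and gamma the margin of the true weights wf (whose last component is taken to be the bias, as p.wt is used in Example 2):

import numpy as np

def calculate_T(x, y, wf):
    """Hypothetical helper: perceptron convergence bound T = R^2 / gamma^2.

    Assumes x has shape (n, d), y is an array with values in {-1, +1}, and wf
    is the true separating weight vector with the bias as its last component.
    """
    xb = np.hstack([x, np.ones((x.shape[0], 1))])   # append a constant bias feature
    R = np.max(np.linalg.norm(xb, axis=1))          # largest sample norm
    margins = y * (xb @ wf) / np.linalg.norm(wf)    # signed distances to the separating plane
    gamma = np.min(margins)                         # worst-case margin
    return (R / gamma) ** 2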
Example 2
def main():
    """二维数据的迭代过程作图
    """
    DATA_NUM = 20  # num of data
    DATA_DIM = 2  # dimension of data
    x, y, wf = get_data(DATA_NUM, DATA_DIM)
    print('x:', x)
    print('y:', y)
    print('wf:', wf)

    positive_id = np.where(y > 0)
    negative_id = np.where(y < 0)

    print('true result:')
    print('positive_id:', positive_id)
    print('negative_id:', negative_id)

    p = Perceptron(data_dim=DATA_DIM)

    stop = False
    while not stop:
        draw(wf, p.wt, x[positive_id], x[negative_id], p.n_iter)
        stop = p.update(x, y)
    p.wt /= sum(p.wt)

    print('iter num:', p.n_iter)
    print('wt:', p.wt)

    wt_dot_x = np.dot(p.wt[:-1], x.transpose()) + p.wt[-1]
    positive_id = np.where(wt_dot_x > 0)
    negative_id = np.where(wt_dot_x < 0)

    print('perceptron result:')
    print('positive_id:', positive_id)
    print('negative_id:', negative_id)
Example 3
def optimize_portfolio(sd=dt.datetime(2008, 1, 1), ed=dt.datetime(2009, 1, 1),
                       syms=['GOOG', 'AAPL', 'GLD', 'XOM'], gen_plot=False):

    # Read in adjusted closing prices for given symbols, date range
    dates = pd.date_range(sd, ed)
    prices_all = get_data(syms, dates)  # automatically adds SPY
    prices = prices_all[syms]  # only portfolio symbols
    prices_SPY = prices_all['SPY']  # only SPY, for comparison later

    # find the allocations for the optimal portfolio
    # note that the values here ARE NOT meant to be correct for a test case
    allocs = np.ones(len(syms)) # add code here to find the allocations
    constraint = ({ 'type': 'eq', 'fun': lambda inputs: 1.0 - np.sum(inputs) })
    bounds = ((0,2),) * len(syms)
    result = spo.minimize(err, allocs, args=(prices,), method="SLSQP",
                          constraints=constraint, bounds=bounds)
    sr = sharpe(result.x, prices)
    allocs = result.x
    adr = average_daily_return(allocs, prices)
    cr = cumulative_return(allocs, prices)
    sddr = volatility(allocs, prices)

    # Get daily portfolio value
    port_val = prices_SPY # add code here to compute daily portfolio values

    # Compare daily portfolio value with SPY using a normalized plot
    if gen_plot:
        # add code to plot here
        df_temp = pd.concat([port_val, prices_SPY], keys=['Portfolio', 'SPY'], axis=1)
        pass

    return allocs, cr, adr, sddr, sr
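The helpers err, sharpe, average_daily_return, cumulative_return, and volatility are defined elsewhere in the project. A minimal sketch of the conventional definitions of those statistics, assuming daily data, a 252-day annualization factor, a zero risk-free rate, and that err is simply the negative Sharpe ratio handed to the optimizer:

import numpy as np

def port_value(allocs, prices):
    """Normalized prices weighted by the allocations, summed across symbols."""
    normed = prices / prices.iloc[0]
    return (normed * allocs).sum(axis=1)

def cumulative_return(allocs, prices):
    pv = port_value(allocs, prices)
    return pv.iloc[-1] / pv.iloc[0] - 1.0

def average_daily_return(allocs, prices):
    return port_value(allocs, prices).pct_change().dropna().mean()

def volatility(allocs, prices):
    return port_value(allocs, prices).pct_change().dropna().std()

def sharpe(allocs, prices, rf=0.0):
    return np.sqrt(252.0) * (average_daily_return(allocs, prices) - rf) / volatility(allocs, prices)

def err(allocs, prices):
    """Objective handed to spo.minimize: maximize Sharpe by minimizing its negative."""
    return -sharpe(allocs, prices)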
Example 4
def main():
    DATA_NUM = 20
    DATA_DIM = 2
    x, y, wf = get_data(DATA_NUM, DATA_DIM)
    p = Perceptron(data_dim=DATA_DIM)
    p.train(x, y)
    print(p.n_iter)
    print(p.wt)
Example 5
def extract_data(flatten):
    data, labels = get_data(_DATA_PATH,
                            class_labels=_CLASS_LABELS,
                            flatten=flatten)
    x_train, x_test, y_train, y_test = train_test_split(data,
                                                        labels,
                                                        test_size=0.2,
                                                        random_state=42)
    return (np.array(x_train), np.array(x_test), np.array(y_train),
            np.array(y_test), len(_CLASS_LABELS))
Example 6
def get_num_sections(subject_id, course_id):
	data = get_data(subject_id)
	num_sections = 0

	for course in data:
		if(course["courseNumber"]==str(course_id)):
			for section in course["sections"]:
				if (section["printed"]=="Y"):
					num_sections = num_sections + 1
			print(num_sections)
			return num_sections
Example 7
def get_open_sections(subject_id, course_id):
	data = get_data(subject_id)
	num_sections = 0

	for course in data:
		if(course["courseNumber"]==str(course_id)):
			for section in course["sections"]:
				if (section["openStatus"] and section["printed"]=="Y"):
					print section["index"]
					num_sections = num_sections + 1
			return num_sections
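These section-counting helpers assume get_data(subject_id) returns a list of course records. A hedged reconstruction of that shape, inferred only from the keys accessed in Examples 6, 7, 9, and 12 (the values below are made up):

example_course_data = [
    {
        "courseNumber": "101",                 # matched against str(course_id)
        "title": "Introduction to Something",  # returned by get_subject_name
        "sections": [
            {"index": "00001", "printed": "Y", "openStatus": True},
            {"index": "00002", "printed": "N", "openStatus": False},
        ],
    },
]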
Example 8
    def __init__(self, symbols, sd, ed, sv, verbose=False):
        self.symbols = symbols
        self.sd = sd
        self.ed = ed
        self.sv = sv

        self.dates = pd.date_range(sd, ed)
        self.stocks = utility.get_data(self.symbols, self.dates)
        self.tenmean_pd = []
        self.twentymean_pd = []
        self.return_pd = []
Example 9
def create_csv(subject_id):
	target = open("static/csv/"+str(subject_id)+".csv", 'w')
	target.write("Course,Open,Closed\n")

	data = get_data(subject_id)
	num_sections = 0
	num_open = 0

	for course in data:
		for section in course["sections"]:
			if (section["printed"]=="Y"):
				num_sections = num_sections + 1
				if (section["openStatus"]):
					num_open = num_open +1
		if(num_sections!=0):
			target.write(course["courseNumber"]+","+str(num_open)+","+str(num_sections-num_open)+"\n")
		num_sections=0
		num_open=0

	target.close()
Example 10
label_data = ut.add_labels_to_data(label_data, labels)

# initialize empty dataframes for acc and gyro data for different frequencies
acc_hz_5 = ut.get_empty_dataframe()
acc_hz_10 = ut.get_empty_dataframe()
acc_hz_25 = ut.get_empty_dataframe()
acc_hz_50 = ut.get_empty_dataframe()
gyro_hz_5 = ut.get_empty_dataframe()
gyro_hz_10 = ut.get_empty_dataframe()
gyro_hz_25 = ut.get_empty_dataframe()
gyro_hz_50 = ut.get_empty_dataframe()

# for each file
for file in files:
    # read into dataframe
    df = ut.get_data(file)

    # get filename from filepath
    filename = ut.get_file_name_from_path(file)

    # for all files other than labels.txt file
    if 'labels' not in filename:

        # get experimentID and userID from filename
        exp_id = int(filename.split('_')[1][3:])
        user_id = int(filename.split('_')[2][4:])

        # add experimentID userID and activityID to the data
        df['experimentID'] = exp_id
        df['userID'] = user_id
        df['activityID'] = 0
Example 11
from utility import get_data, app_random_state_value
from sklearn.cross_validation import StratifiedKFold  # legacy sklearn API (n_folds=...)
from sklearn.metrics import accuracy_score
import numpy as np
import matplotlib.pyplot as plt
import xgboost as xgb
from sklearn import preprocessing
import operator
from scipy import sparse
from sklearn.metrics import pairwise_distances
from scipy.spatial.distance import cosine
import time
import pickle
import datetime

fullTrainFile = 'C:/myD/workarea/KaggleWallmartWorkarea/kaggle_wallmart/python/src/preprocess/BOTH_output_files/train_svm_light.v5.4.BOTH.2015-12-26_16-48-50.txt'
fullTestFile = 'C:/myD/workarea/KaggleWallmartWorkarea/kaggle_wallmart/python/src/preprocess/BOTH_output_files/test_svm_light.v5.4.BOTH.2015-12-26_09-24-51.txt'
X, Y = get_data(fullTrainFile)
#X = sparse.csr_matrix(X)[:,list(range(0,155))]
#Y_14 = np.where(Y==14) # to delete TT_14 rows
#Y = np.delete(Y, Y_14, 0)
#X = delete_rows_csr(X,Y_14)
le = preprocessing.LabelEncoder()
Y = le.fit_transform(Y).astype(np.int32)
#X=X.astype(np.float32)

skf = StratifiedKFold(Y, n_folds=3, random_state=app_random_state_value)#,shuffle=True)
skfList = list(skf)
train_index, test_index = skfList[0]
X=X[:,0:5391]
XD = X#.todense()
xTr, xTe = XD[train_index], XD[test_index]
yTr, yTe = Y[train_index], Y[test_index]
Example 12
def get_subject_name(subject_id, course_id):
	data = get_data(subject_id)
	for course in data:
		if(course["courseNumber"]==str(course_id)):
			return course["title"]
Example 13
def train(imgs_train_path, imgs_test_path, output_models_path, output_images_path,
          batch_size, epochs, epoch_size=1000, training_images_to_load=3000,
          test_images=50, training_for_generator_each_batch=2, save_data=True):

    train_x,train_y = utility.get_data(imgs_train_path,IMAGE_SHAPE,training_images_to_load)
    test_x,test_y = utility.get_data(imgs_test_path,IMAGE_SHAPE,test_images,False)

    if save_data:
        utility.save_data_as_pickle(train_x,"dataset/operative_data/train_x")
        utility.save_data_as_pickle(train_y,"dataset/operative_data/train_y")
        utility.save_data_as_pickle(test_x,"dataset/operative_data/test_x")
        utility.save_data_as_pickle(test_y,"dataset/operative_data/test_y")
        print("--SAVED DATA")


    generator = model.get_generator(IMAGE_SHAPE)
    discriminator = model.get_discriminator(IMAGE_SHAPE)
    loss_generator = loss.VGG_LOSS(IMAGE_SHAPE)
    generator.compile(loss=loss_generator.vgg_loss, optimizer=OPTIMIZER)
    discriminator.compile(loss="binary_crossentropy", optimizer=OPTIMIZER)
    gan = model.get_gan_model(DOWNSCALED_IMG_SHAPE,generator,discriminator,loss_generator.vgg_loss,"binary_crossentropy",OPTIMIZER)
    gan.summary()

    n_batch = int((epoch_size)/ batch_size)
    n_batch_test = int((test_images/batch_size))

    true_batch_vector = np.ones((batch_size,1))
    false_batch_vector = np.zeros((batch_size,1))
    

    print("--START TRAINING")
    for epoch in range(epochs):

        discriminator_losses = []
        gan_losses = []
        epoch_start_time = time.time()

        # train each batch
        for batch in range(n_batch):
            random_indexes = np.random.randint(0, len(train_x), size=batch_size)

            batch_x  =  np.array(train_x)[random_indexes.astype(int)]
            batch_y =  np.array(train_y)[random_indexes.astype(int)]

            generated_images = generator.predict(x=batch_x, batch_size=batch_size)

            discriminator.trainable = True

            #we can decide to perform more than one train for batch on the discriminator 
            for _ in range(training_for_generator_each_batch):
                d_loss_r = discriminator.train_on_batch(batch_y, true_batch_vector)
                d_loss_f = discriminator.train_on_batch(generated_images, np.random.random_sample(batch_size)*0.2)      

                discriminator_losses.append(0.5 * np.add(d_loss_f, d_loss_r))

            discriminator.trainable = False
            
            # train the generator 
            gan_Y = np.ones(batch_size) - np.random.random_sample(batch_size)*0.2
            gan_loss = gan.train_on_batch(batch_x, [batch_y, gan_Y])
            gan_losses.append(gan_loss)

        test_losses = []
        for i in range(n_batch_test):
            batch_x = np.array(test_x)[i*batch_size:(i+1)*batch_size]
            batch_y = np.array(test_y)[i*batch_size:(i+1)*batch_size]
            gan_Y = np.ones(batch_size) - np.random.random_sample(batch_size)*0.2
            gan_loss = gan.test_on_batch(batch_x, [batch_y, gan_Y])
            test_losses.append(gan_loss)

        #print("discriminator loss: ",np.mean(disciminator_losses), " gan losses: ",[np.mean(x) for x in zip(*gan_losses)] ," time: ",time.time()-epoch_start_time)
        print("test: ",[np.mean(x) for x in zip(*test_losses)])

        if epoch % 3 == 0 or epoch == 0:
            generator.save(output_models_path + 'gen_model%d.h5' % epoch)
            discriminator.save(output_models_path + 'dis_model%d.h5' % epoch)
            utility.plot_generated_images(output_images_path,epoch,generator,test_y,test_x)
Example 14
from utility import delete_rows_csr, get_data, app_random_state_value
from sklearn.cross_validation import StratifiedKFold  # legacy sklearn API (n_folds=...)
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import accuracy_score
import numpy as np
import matplotlib.pyplot as plt
import xgboost as xgb
from sklearn import preprocessing
import operator
from scipy import sparse
from sklearn.metrics import pairwise_distances
from scipy.spatial.distance import cosine

fullTrainFile = 'C:/myD/workarea/KaggleWallmartWorkarea/kaggle_wallmart/python/src/preprocess/BOTH_output_files/train_svm_light.v5.4.BOTH.2015-12-26_16-48-50.txt'
fullTestFile = 'C:/myD/workarea/KaggleWallmartWorkarea/kaggle_wallmart/python/src/preprocess/BOTH_output_files/test_svm_light.v5.4.BOTH.2015-12-26_09-24-51.txt'
X, Y = get_data(fullTrainFile)
#X = sparse.csr_matrix(X)[:,list(range(0,155))]
#Y_14 = np.where(Y==14) # to delete TT_14 rows
#Y = np.delete(Y, Y_14, 0)
#X = delete_rows_csr(X,Y_14)
le = preprocessing.LabelEncoder()
Y = le.fit_transform(Y).astype(np.int32)
#X=X.astype(np.float32)

skf = StratifiedKFold(Y, n_folds=3, random_state=app_random_state_value)
skfList = list(skf)
train_index, test_index = skfList[0]
XD = X#.todense()
xTr, xTe = XD[train_index], XD[test_index]
yTr, yTe = Y[train_index], Y[test_index]
Example 15
from utility import get_data, app_random_state_value
from sklearn.cross_validation import StratifiedKFold  # legacy sklearn API (n_folds=...)
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
import numpy as np

fullTrainFile = '../files/train_low_freq_removed_500k.txt'
maxIterations = 5  # number of CV results you want to see, value between 1 to 5
printPredictionToFile = False  # deprecated.. should not use since we are doing CV, keep it false
algoVerbose = False
predFile = 'files/nb.mn.25.75.predictions'  #sys.argv[2];
useTfIdf = True  # this becomes feature 3! # True REDUCES MAE(good)
#Hyperparameters list

##

X, Y = get_data(fullTrainFile)
if (useTfIdf):
    transformer = TfidfTransformer()
    X = transformer.fit_transform(X, Y)
skf = StratifiedKFold(Y, n_folds=5, random_state=app_random_state_value)

scoreSumMatrix = np.zeros((4, len(classes)))
mseSum = 0
maeSum = 0
count = 1
for train_index, test_index in skf:
    xTr, xTe = X[train_index], X[test_index]
    yTr, yTe = Y[train_index], Y[test_index]

    clf = MultinomialNB()
    yhTe = clf.fit(xTr, yTr).predict(xTe)
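    # --- hypothetical continuation (not in the original snippet) ---
    # The loop body is cut off after the prediction step; given the pre-allocated
    # scoreSumMatrix, mseSum, maeSum, and count, it presumably accumulates per-fold
    # metrics. 'classes' is assumed to be the label list defined elsewhere in the project.
    scoreSumMatrix += np.array(precision_recall_fscore_support(yTe, yhTe, labels=classes))
    mseSum += mean_squared_error(yTe, yhTe)
    maeSum += mean_absolute_error(yTe, yhTe)
    if count >= maxIterations:
        break
    count += 1

# averaged cross-validation results (hypothetical reporting step)
print('avg MSE:', mseSum / count)
print('avg MAE:', maeSum / count)
print('avg precision / recall / f1 / support per class:')
print(scoreSumMatrix / count)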
Example 16
import csv

import xgboost as xgb
import matplotlib.pyplot as plt
import sklearn.model_selection as ms

from utility import get_data

# load the data
data, label = get_data(training=True)
train_data, val_data, train_label, val_label = ms.train_test_split(
    data, label, test_size=0.25, random_state=1)

feature_name = [
    'Pclass', 'Sex', 'Fare', 'is_child', 'family_size', 'Embarked-c',
    'Embarked-s', 'Embarked-q'
]

data_matrix = xgb.DMatrix(data, label, feature_names=feature_name)
train_matrix = xgb.DMatrix(train_data, train_label, feature_names=feature_name)
val_matrix = xgb.DMatrix(val_data, val_label, feature_names=feature_name)
test_matrix = xgb.DMatrix(get_data(training=False), feature_names=feature_name)

num_trees = 47
eval_list = [(train_matrix, 'train'), (val_matrix, 'eval')]
params = {
    'objective': 'reg:logistic',
    'eval_metric': 'error',
    'max_depth': 5,
    'min_child_weight': 1,
    'eta': 0.5,
Example 18
import datetime as dt
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
##from matplotlib.font_manager import FontProperties
import utility
# 'sl' is the project module that provides StrategyLearner; its import is not shown in this snippet

sd = dt.datetime(2010, 1, 1)
ed = dt.datetime(2010, 12, 31)

symbol = "GLD"
learner = sl.StrategyLearner(symbols=[symbol],\
                             sd = sd,\
                             ed = ed,\
                             sv = 100000, \
                             verbose = False) # constructor
result = learner.train()

dates = pd.date_range(sd, ed)
stocks = utility.get_data(symbol, dates)
prices = pd.DataFrame(np.zeros(((ed - sd).days + 1, 1)))
prices.columns = [symbol]
result = np.zeros((251, 1)) + 10
portfolio = 0
inv = 0
out = 100000

####3

value = 100000
money = True
for i in range(0, 250):
    prices[symbol].iloc[i] = stocks[symbol].iloc[i]
    if i > 14:
        fiftendays = prices[symbol].loc[i - 14:i]
Example 19
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from keras.models import model_from_json
# get_data and change_to_sequence_data are project-local helpers; their import is not shown in this snippet

sns.set(style="darkgrid")


def load_model(model_name, model_weights_filename):
    with open("model/{}.json".format(model_name), "r") as json_file:
        model = model_from_json(json_file.read())

    model.load_weights("model/{}.h5".format(model_weights_filename))
    model.compile(loss="categorical_crossentropy",
                  optimizer="adam",
                  metrics=["categorical_accuracy"])

    return model


if __name__ == "__main__":
    model = load_model("CNN-RNN", "CNN-RNN-61(-3395)")

    # Change your file path
    data_path = "./data/focused/darkfanxing_1.csv"
    data = get_data(data_path)
    data = change_to_sequence_data(data)

    predictions = np.argmax(model.predict(data), axis=-1)
    # predictions = np.where(predictions==1)[1]

    sns.lineplot(x=[time for time in range(predictions.shape[0])],
                 y=predictions).set(xlabel="time(s)", ylabel="is_focused")
    plt.show()
Example 20
'''
Created on Dec 14, 2015

@author: vaibhav
'''
from utility import get_data
from utility import app_random_state_value
import numpy as np
import xgboost as xgb
from sklearn import preprocessing
import time
import datetime

bothDataDir = 'C:/myD/workarea/KaggleWallmartWorkarea/kaggle_wallmart/python/src/preprocess/BOTH_output_files/'
fullTrainFile = bothDataDir+'train_svm_light.v5.4.BOTH.2015-12-26_16-48-50.txt'
xTr, yTr = get_data(fullTrainFile)
le = preprocessing.LabelEncoder()
le.fit(yTr)
yTr = le.transform(yTr) 
fullTestFile = bothDataDir+'test_svm_light.v5.4.BOTH.2015-12-26_16-48-50.txt'
xTe, vnTe = get_data(fullTestFile) # vnTe is visit number for TEST FILE


xg_train = xgb.DMatrix( xTr, label=yTr)
xg_test = xgb.DMatrix(xTe)#, label=yTe)
param = {'max_depth':50, 'eta':0.15, 'silent':1, 'objective':'multi:softprob', 'max_delta_step':1,
         'num_class':len(le.classes_), 'eval_metric':'mlogloss', 'seed':54325, 'nthread':4,
         'subsample':0.8, 'colsample_bytree':0.8, 'min_child_weight':2, 'lambda':8, 'alpha':3, 'gamma':1}
# param = {'max_depth':50, 'eta':0.1, 'silent':1, 'objective':'multi:softprob', 
#          'num_class':38, "eval_metric":"mlogloss", "seed":app_random_state_value}
watchlist = [ (xg_train,'train') ]#, (xg_test, 'test') ]
Example 21
def get_label_data(files):
    for file in files:
        if 'labels' in file:
            return ut.get_data(file)
    raise FileNotFoundError('no labels file found in the given file list')