def valid(self, epoch):
    self.dataset.mode = 'valid'
    dataloader = DataLoad(self.dataset, batch_size=self.batch_size, shuffle=False)
    self.model.eval()
    total_acc = 0
    total_loss = 0
    cnt = 0
    for batch_idx, data in enumerate(dataloader):
        contexts, questions, answers = data
        batch_size = len(answers)
        output = self.model(contexts, questions, self.dataset.train_tree_dict)
        predict = self.model.predict(output, questions, self.dataset.train_tree_dict)
        acc = self.metrics(predict, answers)
        m = nn.LogSoftmax(dim=1)
        loss = self.criteria(m(output), answers)
        print('|', batch_size, 'acc:', acc, 'loss:', loss, '|')
        total_loss += loss.item()
        total_acc += acc * batch_size
        cnt += batch_size
    total_acc = total_acc / cnt
    # Keep the best weights and track patience for early stopping.
    if total_acc > self.best_acc:
        self.best_acc = total_acc
        self.best_state = self.model.state_dict()
        self.early_stopping_cnt = 0
    else:
        self.early_stopping_cnt += 1
        if self.early_stopping_cnt >= 256:
            self.early_stopping_flag = True
    with open(self.LOG_PATH, 'a+') as fp:
        fp.write(f'[Run {self.run}, Task {self.task_id}, Epoch {epoch}] '
                 f'[Validate] Accuracy : {total_acc: 5.4} Loss : {total_loss / cnt: 5.4}\n')
    return total_acc
def train(self, epoch):
    self.dataset.mode = 'train'
    # Each fetched batch is grouped again by context length.
    dataloader = DataLoad(self.dataset, batch_size=self.batch_size)
    self.model.train()
    total_acc = 0
    total_loss = 0
    cnt = 0
    # Groups such as 0-6, 7-12, 13-18, ...: loop once per group;
    # contexts within one batch have similar lengths.
    for batch_idx, data in enumerate(dataloader):
        self.optimizer.zero_grad()
        contexts, questions, answers = data
        batch_size = len(answers)
        '''
        batch_idx 0
        contexts ([0, 1, 3, 4, 6, 7, 9, 10, 12, 13],
                  [0, 1, 3, 4, 6, 7],
                  [15, 16, 18, 19, 21, 22],
                  [15, 16, 18, 19, 21, 22, 24, 25],
                  [0, 1, 3, 4, 6, 7, 9, 10])
        questions tensor([[10, 11, 14],
                          [10, 11, 12],
                          [10, 11, 14],
                          [10, 11,  7],
                          [10, 11, 12]])
        answers tensor([5, 9, 5, 16, 16])
        '''
        output = self.model(contexts, questions, self.dataset.train_tree_dict)
        predict = self.model.predict(output, questions, self.dataset.train_tree_dict)
        m = nn.LogSoftmax(dim=1)
        loss = self.criteria(m(output), answers)
        total_loss += loss.item()
        acc = self.metrics(predict, answers)
        total_acc += acc * batch_size
        cnt += batch_size
        loss.backward()
        self.optimizer.step()
    with open(self.LOG_PATH, 'a+') as fp:
        fp.write(f'[Run {self.run}, Task {self.task_id}, Epoch {epoch}] '
                 f'[Training] loss : {total_loss / cnt: 10.8}, acc : {total_acc / cnt: 5.4}, '
                 f'batch_idx : {batch_idx}\n')
def test(self, epoch):
    self.dataset.mode = 'test'
    dataloader = DataLoad(self.dataset, batch_size=self.batch_size, shuffle=False)
    # Evaluate with the best weights found during validation.
    self.model.load_state_dict(self.best_state)
    self.model.eval()
    total_acc = 0
    cnt = 0
    for batch_idx, data in enumerate(dataloader):
        contexts, questions, answers = data
        batch_size = len(answers)
        output = self.model(contexts, questions, self.dataset.test_tree_dict)
        predict = self.model.predict(output, questions, self.dataset.test_tree_dict)
        acc = self.metrics(predict, answers)
        total_acc += acc * batch_size
        cnt += batch_size
    with open(self.LOG_PATH, 'a+') as fp:
        fp.write(f'[Run {self.run}, Task {self.task_id}, Epoch {epoch}] '
                 f'[Test] Accuracy : {total_acc / cnt: 5.4}\n')
    os.makedirs('models', exist_ok=True)
    with open(f'models/task{self.task_id}_epoch{epoch}_run{self.run}_acc{total_acc / cnt}.pth', 'wb') as fp:
        torch.save(self.model.state_dict(), fp)
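# A minimal sketch (not part of the original code) of how train(), valid(), and test()
# could be driven for one task. The function name `run_task` and the attribute
# `trainer.epochs` are assumptions; only the three methods, early_stopping_flag, and
# best_state come from the class above.
def run_task(trainer):
    epoch = 0
    # `trainer.epochs` is a hypothetical epoch budget.
    for epoch in range(trainer.epochs):
        trainer.train(epoch)
        trainer.valid(epoch)
        # valid() raises early_stopping_flag after 256 epochs without improvement.
        if trainer.early_stopping_flag:
            break
    # test() reloads best_state saved by valid() before evaluating.
    trainer.test(epoch)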
        D['X_val'] = np.load(datadirec + 'X_val.npy')
        D['y_val'] = np.load(datadirec + 'y_val.npy')
        print('Read data from pickles')
    except:
        print('One or more files are missing from ' + datadirec)
else:
    os.makedirs(datadirec)
    # Build (male, female) wav-file pairs for each speaker index.
    names = 3
    filenames = []
    for i in range(names):
        filenames.append(
            ('/home/rob/Dropbox/ml_projects/deepclust/data/male' + str(i + 1) + '.wav',
             '/home/rob/Dropbox/ml_projects/deepclust/data/female' + str(i + 1) + '.wav'))
    dl = DataLoad(seq_len, Nn, low_freq=6000)
    dl.read_data(filenames)
    dl.strip_zero()
    D = dl.return_data()
    # Cache the prepared arrays so the next run can load them directly.
    for key, value in D.items():
        np.save(datadirec + key + '.npy', value)
    print('Finished saving data')

# Obtain some sizes
N = D['X_train'].shape[0]
Nval = D['X_val'].shape[0]

"""Visualize some data"""
row = 10
ind = np.random.choice(D['X_train'].shape[0], row)
f, axarr = plt.subplots(row, 1)
def test_file_exists(self):
    # A DataFrame must be returned when the path exists
    print('testing valid file...')
    DL = DataLoad()
    df = DL.load_csv('data/ross_train.csv', show_details=False)
    self.assertIsInstance(df, pd.DataFrame)

def test_invalid_format(self):
    # None must be returned for a file that is not a CSV
    print('testing invalid file format - not CSV...')
    DL = DataLoad()
    df = DL.load_csv('data/transportation.json')
    self.assertIsNone(df)

def test_dir(self):
    # None must be returned for a directory path
    print('testing dir instead of file...')
    DL = DataLoad()
    df = DL.load_csv('data')
    self.assertIsNone(df)

def test_corrupted_file(self):
    # None must be returned for a corrupted CSV
    print('testing corrupted file...')
    DL = DataLoad()
    df = DL.load_csv('data/corrupted.csv')
    self.assertIsNone(df)

def test_file_does_not_exist(self):
    # None must be returned when the file does not exist
    print('testing nonexistent file...')
    DL = DataLoad()
    df = DL.load_csv('data/ross_trai.csv')
    self.assertIsNone(df)
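# A minimal sketch, an assumption rather than the project's actual implementation, of a
# DataLoad.load_csv that would satisfy the tests above: return a DataFrame for a readable
# .csv file and None for a directory, a non-CSV path, a corrupted file, or a missing file.
# The parameter names mirror the calls seen elsewhere in this repo.
import os
import pandas as pd

class DataLoad:
    def load_csv(self, file_path, date=None, target=None, show_details=False):
        # Reject anything that is not an existing .csv file.
        if not os.path.isfile(file_path) or not file_path.endswith('.csv'):
            return None
        try:
            df = pd.read_csv(file_path)
        except Exception:
            # Unparsable / corrupted file.
            return None
        if show_details:
            df.info()
        return df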
from dataload import DataLoad
from evaluation import EvaluateModel
from preprocessing import DataSplit
from models import Exponential_Smoothing, AutoArima, EnsembleModels, FBProphet
import matplotlib.pyplot as plt
import pandas as pd
import warnings
warnings.simplefilter("ignore")
from datapreparation import DataVisualization

print('Time Series model composition')

# Loading Data
DL = DataLoad()
df = DL.load_csv('data/ross_train.csv')

# Keep only store 1 and index its daily sales by date.
df = df[df['Store'] == 1]
df = df[['Date', 'Sales']]
df['Date'] = pd.to_datetime(df['Date'], format='%Y-%m-%d')
df.set_index('Date', inplace=True)
df.sort_index(inplace=True)
print(df)

# Analyzing using different frequencies
x = []
y = []
title = []

# Daily
x.append(df.index)
y.append(df.Sales)
title.append('Daily')
from models import Exponential_Smoothing, Arima, EnsembleModels, FBProphet, RandomForest, XGBoost, SupportVectorRegressor
import warnings
from dataload import DataLoad
from analytics import DataVisualization
from evaluation import EvaluateModel
from preprocessing import FeatureEngineering
warnings.simplefilter("ignore")

print('Time Series model composition')

# Loading Data
DL = DataLoad()
df = DL.load_csv(file_path='data/ross_train.csv', date='Date', target='Sales', show_details=True)
print(df)

# Analyzing using different frequencies
x = []
y = []
title = []

# Daily
x.append(df.index)
y.append(df.Sales)
title.append('Daily')

freq = {'W': 'Weekly', 'M': 'Monthly', 'Y': 'Yearly'}
for f in freq.keys():
    sampled_df = df.resample(f).sum()
    x.append(sampled_df.index)