def split(self):
    # processing used when the dataset is split into two parts, one of which contains lots of missing values
    self.train = loader('train.csv')
    self.test = loader('test.csv')
    self.test, self.id = dp.Pandas_dataProcess().label_extract(self.test, id=True, istest=True)
    self.train, feature_name = dp.Pandas_dataProcess().trans_categorical(self.train)
    self.test = dp.Pandas_dataProcess().trans_categorical(self.test, feature_name=feature_name, istest=True)
    self.train = dp.Pandas_dataProcess().id_droper(self.train)
    self.train = self.train.values.tolist()
    self.test = self.test.values.tolist()
    self.misstrain, self.normaltrain = dp.List_dataProcess().missingset_split(self.train)
    self.misstrain, self.misstrain_y = dp.List_dataProcess().label_extract(self.misstrain)
    self.normaltrain, self.normaltrain_y = dp.List_dataProcess().label_extract(self.normaltrain)
    self.misstest, self.normaltest = dp.List_dataProcess().missingset_split(self.test)
    self.normaltrain = dp.Pandas_dataProcess().trans_topanda(self.normaltrain)
    self.normaltrain = dp.Pandas_dataProcess().fill_missvalue(self.normaltrain, value=self.missing)
    self.normaltest = dp.Pandas_dataProcess().trans_topanda(self.normaltest)
    self.normaltest = dp.Pandas_dataProcess().fill_missvalue(self.normaltest, value=self.missing)
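# The dp.List_dataProcess().missingset_split helper used above is not shown in
# this file. A minimal standalone sketch of the assumed behaviour (partition
# rows by whether they contain a missing value) follows; the name
# missingset_split_sketch and the None/NaN convention are assumptions, not the
# actual dp implementation.
import math

def missingset_split_sketch(rows):
    """Split list-of-lists rows into (rows with missing values, rows without)."""
    def is_missing(v):
        # after DataFrame.values.tolist(), missing entries typically appear as NaN
        return v is None or (isinstance(v, float) and math.isnan(v))

    with_missing = [r for r in rows if any(is_missing(v) for v in r)]
    complete = [r for r in rows if not any(is_missing(v) for v in r)]
    return with_missing, complete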
def normal(self):
    # processing used when the dataset is not split into two parts
    self.train = loader('train.csv')
    self.train_x, self.train_y = dp.Pandas_dataProcess().label_extract(self.train, real_extract=False)
    self.test = loader('test.csv')
    self.test, self.id = dp.Pandas_dataProcess().label_extract(self.test, id=True, istest=True)
    self.train_x, feature_name = dp.Pandas_dataProcess().trans_categorical(self.train_x)
    self.test = dp.Pandas_dataProcess().trans_categorical(self.test, feature_name=feature_name, istest=True)
    self.train_x = dp.Pandas_dataProcess().fill_missvalue(self.train_x, value=0)
    self.test = dp.Pandas_dataProcess().fill_missvalue(self.test, value=0)
    # shuffle the training rows; `random` here must be numpy.random, since the
    # standard-library random module has no permutation()
    self.train_x = self.train_x.iloc[random.permutation(len(self.train_x))]
    self.train_x, self.train_y = dp.Pandas_dataProcess().label_extract(self.train_x)
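# For reference, the row-shuffle idiom used above, written out with an explicit
# numpy import; the DataFrame and column names here are made-up example data,
# not the competition dataset.
import numpy as np
import pandas as pd

df = pd.DataFrame({'v1': [0.1, 0.2, 0.3, 0.4], 'target': [0, 1, 0, 1]})
shuffled = df.iloc[np.random.permutation(len(df))]  # same rows, random order
print(shuffled)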
def doclassify(self):
    train = loader("train.csv")
    target = train['target'].values
    test = loader("test.csv")
    id_test = test['ID'].values
    # process the v22 feature, an important high-cardinality categorical feature:
    # left-pad every value with '@' to 4 characters, then encode each character
    # position as its ordinal code
    train['v22-1'] = train['v22'].fillna('@@@@').apply(lambda x: '@' * (4 - len(str(x))) + str(x)).apply(lambda x: ord(x[0]))
    test['v22-1'] = test['v22'].fillna('@@@@').apply(lambda x: '@' * (4 - len(str(x))) + str(x)).apply(lambda x: ord(x[0]))
    train['v22-2'] = train['v22'].fillna('@@@@').apply(lambda x: '@' * (4 - len(str(x))) + str(x)).apply(lambda x: ord(x[1]))
    test['v22-2'] = test['v22'].fillna('@@@@').apply(lambda x: '@' * (4 - len(str(x))) + str(x)).apply(lambda x: ord(x[1]))
    train['v22-3'] = train['v22'].fillna('@@@@').apply(lambda x: '@' * (4 - len(str(x))) + str(x)).apply(lambda x: ord(x[2]))
    test['v22-3'] = test['v22'].fillna('@@@@').apply(lambda x: '@' * (4 - len(str(x))) + str(x)).apply(lambda x: ord(x[2]))
    train['v22-4'] = train['v22'].fillna('@@@@').apply(lambda x: '@' * (4 - len(str(x))) + str(x)).apply(lambda x: ord(x[3]))
    test['v22-4'] = test['v22'].fillna('@@@@').apply(lambda x: '@' * (4 - len(str(x))) + str(x)).apply(lambda x: ord(x[3]))
    drop_list = ['v91', 'v1', 'v8', 'v25', 'v29', 'v34', 'v41', 'v46', 'v54', 'v67',
                 'v97', 'v105', 'v122', 'v38', 'v72', 'v24', 'v52']
    train = train.drop(['ID', 'target'] + drop_list, axis=1).fillna(self.missing)
    test = test.drop(['ID'] + drop_list, axis=1).fillna(self.missing)
    # train, train_y = dp.Pandas_dataProcess().label_extract(train, real_extract=True)
    # test, id = dp.Pandas_dataProcess().label_extract(test, id=True, istest=True)
    # train, feature_name = dp.Pandas_dataProcess().trans_categorical(train)
    # test = dp.Pandas_dataProcess().trans_categorical(test, feature_name=feature_name, istest=True)
    # train = dp.Pandas_dataProcess().fill_missvalue(train, value='mean')
    # test = dp.Pandas_dataProcess().fill_missvalue(test, value='mean')
    refcols = list(train.columns)
    print refcols
    # factorize the remaining object columns; round numeric columns to 5 decimals
    for elt in refcols:
        if train[elt].dtype == 'O':
            train[elt], temp = pd.factorize(train[elt])
            test[elt] = temp.get_indexer(test[elt])
        else:
            train[elt] = train[elt].round(5)
            test[elt] = test[elt].round(5)
    a = dp.AddNearestNeighbourLinearFeatures(n_neighbours=self.n_ft, max_elts=self.max_elts,
                                             verbose=True, random_state=self.rnd)
    a.fit(train, target)
    train = a.transform(train)
    test = a.transform(test)
    clf = ensemble.ExtraTreesClassifier(n_estimators=1200, max_features=30, criterion='entropy',
                                        min_samples_split=2, max_depth=35, min_samples_leaf=2,
                                        n_jobs=-1, random_state=self.rnd)
    clf.fit(train, target)
    pred_et = clf.predict_proba(test)
    submission = pd.read_csv('sample_submission.csv')
    submission.index = submission.ID
    submission.PredictedProb = pred_et[:, 1]
    submission.to_csv('./addNNLinearFt.csv', index=False)
    submission.PredictedProb.hist(bins=30)
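# Quick illustration of the v22 encoding used above; the padding assumes values
# of at most 4 characters, and 'AZ' / 'GHJ' are made-up example values, not
# taken from the competition data.
import pandas as pd

v22 = pd.Series(['AZ', 'GHJ', None])
padded = v22.fillna('@@@@').apply(lambda x: '@' * (4 - len(str(x))) + str(x))
print(padded.tolist())                             # ['@@AZ', '@GHJ', '@@@@']
print(padded.apply(lambda x: ord(x[3])).tolist())  # [90, 74, 64] -> 'Z', 'J', '@'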
def plot2d():
    ld = dl.loader(dimensions=2)
    tsi = ld.getTestInp()
    tso = ld.getTestOut()
    tri = ld.getTrainInp()
    tro = ld.getTrainOut()
    mlp = MLP(tri, tro, (20, 25), epoches=2500)
    plt.plot(tri, tro, "r+")  # training data
    out = mlp.calc(tri)
    plt.plot(tri, out, "b-")  # network output on the training inputs
    out = mlp.calc(tsi)
    plt.plot(tsi, out, "go")  # network output on the test inputs
    plt.show()
def plot3d():
    ld = dl.loader(dimensions=3)
    ai = ld.getAllInp()
    ao = ld.getAllOut()
    x, y, z = mesh_data(ai.T[0], ai.T[1], ao.T[0])
    fig = plt.figure()
    axes = fig.gca(projection='3d')
    axes.scatter(x, y, z, color="#bbff8016")
    mlp = MLP(ai, ao, (25,), epoches=1000)
    out = mlp.calc(ai)
    x, y, z = mesh_data(ai.T[0], ai.T[1], out)
    axes.plot_surface(x, y, z, cmap=cm.get_cmap("coolwarm"), linewidth=0, antialiased=True)
    plt.show()
def painting():
    Not_number = [
        'v3', 'v22', 'v24', 'v30', 'v31', 'v47', 'v52', 'v56', 'v66', 'v71',
        'v74', 'v75', 'v79', 'v91', 'v107', 'v110', 'v112', 'v113', 'v125'
    ]
    start = time.time()
    print 'painting...'
    dataset = dataloader.loader('train.csv')
    p = Paint()
    # p.missingVL_abs(dataset)
    # p.correlation_digonal(dataset)
    # p.hexbin(dataset, 'v83', 'v130')
    # p.hexbin(dataset, 'v1', 'v37')
    p.label_prop(dataset, 'v125')
    # p.label_prop(dataset, 'v24')
    # p.hexbin(dataset, 'v10', 'v131')
    # p.distribution(dataset, 'v1')
    # p.distribution(dataset, 'v131')
    # p.distribution(dataset, 'target')
    end = time.time()
    print '-' * 100
    print 'total run time is:', end - start, 's'
def sta():
    dataset = loader('train.csv')
    stat.Categorical_stat().co_relation(dataset)
        return support_index

    def observer(self, step=1):
        color = array([[0, 0, 1], [1, 0, 0], [0, 1, 0]])  # colour per class label
        datacolor = [color[lbl] for lbl in self.label]
        X = zeros((len(self.label), len(self.points[0])))  # init the data matrix
        for i in xrange(0, len(self.label)):
            X[i][0], X[i][1] = self.points[i][0], self.points[i][1]
        # clear the previous figure
        pyplot.plot(hold=False)
        pyplot.hold = True
        pyplot.scatter(X[:, 0], X[:, 1], c=datacolor)
        pyplot.savefig("data/twofeature.png", format="png")

    def get_label(self):
        return self.label

    def get_points(self):
        return self.points


if __name__ == "__main__":
    label, points = dataloader.loader("twofeature.txt")
    svm = SVM(label=label, points=points)
    svm.observer()
import cv2
import numpy as np
from dataloader import loader
from unet import Models
from sklearn.model_selection import train_test_split
from keras.preprocessing.image import ImageDataGenerator
from keras.optimizers import Adam
import matplotlib.pyplot as plt

w = 256
h = 256
c = 3

mod = Models(w, h, c)
auto_encoder = mod.arch3()
load_img = loader()
auto_encoder.summary()

x_data, y_data = load_img.load()
x_data = np.array(x_data, dtype='float') / 255.0
y_data = np.array(y_data, dtype='float') / 255.0

opt = Adam(lr=0.001, decay=0.001 / 50)
train_x, test_x, train_y, test_y = train_test_split(x_data, y_data, test_size=0.1, random_state=30)
auto_encoder.compile(optimizer=Adam(lr=1e-4), loss='binary_crossentropy', metrics=['accuracy'])
history = auto_encoder.fit(train_x, train_y, batch_size=1,
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    np.random.seed(seed)
    random.seed(seed)


setup_seed(44)

rel2id, id2rel = map_id_rel()
print(id2rel)

USE_CUDA = torch.cuda.is_available()
# USE_CUDA = False

path = "data/questions.json"
train, dev = loader(path)

data = train
train_text = data['text']
train_mask = data['mask']
train_label = data['label']
train_text = [t.numpy() for t in train_text]
train_mask = [t.numpy() for t in train_mask]
train_text = torch.tensor(train_text)
train_mask = torch.tensor(train_mask)
train_label = torch.tensor(train_label)

data = dev
dev_text = data['text']
dev_mask = data['mask']
        return model
    elif type == "dns201":
        model = torch.hub.load('pytorch/vision:v0.9.0', 'densenet201', pretrained=False)
        return model
    elif type == "rsnxt-50":
        model = pretrainedmodels.__dict__["se_resnext50_32x4d"](num_classes=2, pretrained=None)
        return model


if __name__ == '__main__':
    dataloaders, dataset_sizes = loader("/content/drive/MyDrive/competitions/recog-r2/train.csv", 0.2)
    model_ft = mdl("res50")
    model_ft = model_ft.to(DEVICE)
    criterion = nn.CrossEntropyLoss()
    # Observe that all parameters are being optimized
    # optimizer_ft = optim.SGD(model_ft.parameters(), lr=0.001, momentum=0.9)
    optimizer_ft = optim.Adam(model_ft.parameters(), lr=0.001)
    # Decay LR by a factor of 0.1 every 7 epochs
    exp_lr_scheduler = lr_scheduler.StepLR(optimizer_ft, step_size=7, gamma=0.1)
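# Illustration of the StepLR comment above (decay by a factor of 0.1 every 7
# epochs), using a throwaway parameter and optimizer rather than model_ft.
import torch
from torch import nn, optim
from torch.optim import lr_scheduler

param = nn.Parameter(torch.zeros(1))
opt = optim.Adam([param], lr=0.001)
sched = lr_scheduler.StepLR(opt, step_size=7, gamma=0.1)

for epoch in range(14):
    opt.step()    # optimizer step before scheduler step, per PyTorch convention
    sched.step()
    print(epoch + 1, opt.param_groups[0]['lr'])  # lr drops to 1e-4 after epoch 7, 1e-5 after 14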
import torch
import torchvision
import torch.nn as nn
import datetime as dt
import Levenshtein as L
import torch.optim as optim
from model import ZLNet
from utils import details
from constant import CHARSET
from dataloader import loader

DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

test_data_file = '../data/test.npy'
testloader = loader(test_data_file)

model = ZLNet().to(DEVICE)
model.load_state_dict(torch.load("model.pt"))
model.eval()

result = []
for inputs, lens in testloader:
    lens = lens.to(DEVICE)
    inputs = inputs.to(DEVICE)
    charindice = model(inputs, lens)
    charindice = charindice[0]
    pred = ''.join([CHARSET[idx] for idx in charindice])
    result.append(pred[1:-1])
            save_ckp(checkpoint, checkpoint_path)
        print()

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60))
    print('Best val Acc: {:4f}'.format(best_acc))

    # load best model weights
    model.load_state_dict(best_model_wts)
    plot(loss_p, acc_p, num_epochs)
    return model, best_acc


if __name__ == '__main__':
    dataloaders, dataset_sizes = loader(use_pretrained=True)
    model_ft = mdl("res34")
    # model_ft = drklrd()
    model_ft = model_ft.to(config.DEVICE)
    criterion = nn.CrossEntropyLoss()
    optimizer_ft = optim.Adam(model_ft.parameters(), lr=0.0001)
    # Decay LR by a factor of 0.1 every 7 epochs
    exp_lr_scheduler = lr_scheduler.StepLR(optimizer_ft, step_size=7, gamma=0.1)
    checkpoint_path = "/content/drive/MyDrive/competitions/mosaic-r2/weights/res34_albu_seg.pt"
    model_ft, best_acc = train_model(model_ft, dataloaders, criterion, optimizer_ft,
                                     exp_lr_scheduler, dataset_sizes, checkpoint_path,
                                     num_epochs=config.NUM_EPOCHS)
def __init__(self, config_path):
    with open(os.path.join(config_path, "config.yml")) as cf:
        config = yaml.load(cf, Loader=yaml.FullLoader)

    self.num_layers = config["num_layers"]
    self.d_model = config["d_model"]
    self.dff = config["dff"]
    self.num_heads = config["num_heads"]
    self.dropout_rate = config["dropout_rate"]
    self.max_length = config["max_length"]
    self.epochs = config["epochs"]
    self.batch_size = config["batch_size"]
    self.target_vocab_size = config["target_vocab_size"]
    self.checkpoint = config["checkpoint"]
    self.max_checkpoint = config["max_checkpoint"]
    self.custom_checkpoint = config["custom_checkpoint"]
    self.eval_limit = config["eval_limit"]
    self.exit_phrase = config["exit_phrase"]

    if config["storage_path"] is not None:
        self.storage_path = config["storage_path"]
    else:
        self.storage_path = "./"

    if config["ckpt_path"] is not None:
        self.ckpt_path = config["ckpt_path"]
    else:
        self.ckpt_path = "./"

    if not self.storage_path.endswith("/"):
        self.storage_path += "/"
    if not self.ckpt_path.endswith("/"):
        self.ckpt_path += "/"

    self.data_path = f"{self.storage_path}data"
    self.checkpoint_path = f"{self.ckpt_path}checkpoints/train"
    self.tokenizer_path = f"{self.storage_path}tokenizers"
    self.inputs_savepath = f"{self.tokenizer_path}/inputs_token"
    self.outputs_savepath = f"{self.tokenizer_path}/outputs_token"

    if not os.path.exists(f"{self.ckpt_path}checkpoints"):
        os.mkdir(f"{self.ckpt_path}checkpoints")
    if not os.path.exists(f"{self.ckpt_path}checkpoints/train"):
        os.mkdir(f"{self.ckpt_path}checkpoints/train")
    if not os.path.exists(f"{self.storage_path}tokenizers"):
        os.mkdir(f"{self.storage_path}tokenizers")
    if not os.path.exists(f"{self.storage_path}models"):
        os.mkdir(f"{self.storage_path}models")

    if config["mode"] in ["train", "eval"]:
        if os.path.exists(os.path.join(config_path, "data/train.from")) and \
                os.path.exists(os.path.join(config_path, "data/train.to")):
            pass
        else:
            if config["reddit_data"]:
                print("Starting to generate train data from Subreddits.")
                get_data(config_path)
            loader(config_path)

    self.inputs, self.outputs = load_data(f"{self.data_path}/training_data.txt")

    try:
        self.inputs_tokenizer, self.outputs_tokenizer = load_tokenizers(
            inputs_outputs_savepaths=[self.inputs_savepath, self.outputs_savepath])
    except:
        print("No tokenizers have been created yet, creating new tokenizers...")
        self.inputs_tokenizer, self.outputs_tokenizer = create_tokenizers(
            inputs_outputs=[self.inputs, self.outputs],
            inputs_outputs_savepaths=[self.inputs_savepath, self.outputs_savepath],
            target_vocab_size=self.target_vocab_size)

    self.input_vocab_size = self.inputs_tokenizer.vocab_size + 2
    self.target_vocab_size = self.outputs_tokenizer.vocab_size + 2

    self.learning_rate = CustomSchedule(self.d_model)
    self.optimizer = tf.keras.optimizers.Adam(self.learning_rate,
                                              beta_1=0.9,
                                              beta_2=0.98,
                                              epsilon=1e-9)
    self.train_loss = tf.keras.metrics.Mean(name='train_loss')
    self.train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(name='train_accuracy')

    self.transformer = Transformer(self.num_layers,
                                   self.d_model,
                                   self.num_heads,
                                   self.dff,
                                   self.input_vocab_size,
                                   self.target_vocab_size,
                                   pe_input=self.input_vocab_size,
                                   pe_target=self.target_vocab_size,
                                   rate=self.dropout_rate)

    self.ckpt = tf.train.Checkpoint(transformer=self.transformer, optimizer=self.optimizer)
    self.ckpt_manager = tf.train.CheckpointManager(self.ckpt,
                                                   self.checkpoint_path,
                                                   max_to_keep=self.max_checkpoint)

    if self.custom_checkpoint:
        self.ckpt.restore(self.custom_checkpoint)
        print(f"Custom checkpoint restored: {self.custom_checkpoint}")
    # if a checkpoint exists, restore the latest checkpoint.
    elif self.ckpt_manager.latest_checkpoint:
        self.ckpt.restore(self.ckpt_manager.latest_checkpoint)
        print(f"Latest checkpoint restored: {self.ckpt_manager.latest_checkpoint}")

    if config["mode"] == "train":
        print("\nMODE: train\n===========\n")
        self.train_dataset = prepare_data(self.batch_size,
                                          [self.inputs, self.outputs],
                                          [self.inputs_tokenizer, self.outputs_tokenizer],
                                          self.max_length)
        self.train()

        eval_indexes = random.choices(range(len(self.inputs)), k=int(len(self.inputs) * 0.01))
        for i in eval_indexes:
            predicted_sentence, attention_weights, sentence, result = self.reply(self.inputs[i])
            print(f"\nInput: {self.inputs[i]}")
            print(f"Predicted: {predicted_sentence}")
            print(f"Sample output: {self.outputs[i]}")

    elif config["mode"] == "eval":
        print("\nMODE: eval\n==========\n")
        self.inputs = self.inputs[:self.eval_limit]
        self.outputs = self.outputs[:self.eval_limit]

        for (ins, outs) in zip(self.inputs, self.outputs):
            predicted_sentence, attention_weights, sentence, result = self.reply(ins)
            print(f"\nInput: {ins}")
            print(f"Predicted: {predicted_sentence}")
            print(f"Sample output: {outs}")

    elif config["mode"] == "test":
        print("\nMODE: test\n==========\n")
        while True:
            usr_input = input("[USER]: ")
            if usr_input == self.exit_phrase:
                print("Exiting test mode...")
                break
            else:
                predicted_sentence, _, _, _ = self.reply(usr_input)
                print(f"[BOT]: {predicted_sentence}")

    elif config["mode"] == "script":
        print("\nMODE: script\n==========\n")
import argparse

from analog.load import ExperimentLog
from dataloader import loader
from plots import plot_learning_curves

parser = argparse.ArgumentParser()
parser.add_argument('--logdir', type=str, required=True)
parser.add_argument('--exp_names', nargs='+', type=str, required=True)
parser.add_argument('--std_type', type=str, choices=['time', 'run'])
parser.add_argument('--min_t', type=float, default=0.)
parser.add_argument('--max_t', type=float, default=100.)
args = parser.parse_args()

start_date = 'last'
stop_date = None

expdata: ExperimentLog = ExperimentLog()
for exp_name in args.exp_names:
    expdata.extend(loader(args.logdir, exp_name, start_date=start_date, stop_date=stop_date))

# def filter(args) -> bool:
#     return ('noscale' in args and args['noscale'] and 'value' in args['algo']) \
#         or 'advantage' in args['algo']
# expdata = filter(expdata, filter)

plot_learning_curves(expdata, ['Return'], args.exp_names[0],
                     mint=args.min_t, maxt=args.max_t, gtype=args.std_type + "_std")
expdata.repr_rawlogs("Return", 5)