def main():
    # find where data is
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--data_folder',
        type=str,
        default='',
        dest='data_folder',
        help='folder in datastore where training data is located')
    args = parser.parse_args()

    run = Run.get_context()
    run.tag('data_folder', utils.last_two_folders_if_exists(args.data_folder))

    # read and process data
    # df = utils.read_raw_data(data_folder)
    # df = utils.process_raw_data(df)
    df = utils.read_train_data(args.data_folder)
    x_train, x_test, y_train, y_test = split_data(df)
    model, rmse, mape = train_model(x_train, x_test, y_train, y_test)
    run.log('rmse', rmse)
    run.log('mape', mape)

    # save the model
    os.makedirs('outputs', exist_ok=True)
    model_file = os.path.join('outputs', consts.model_name)
    run.tag('model_file', model_file)
    joblib.dump(value=model, filename=model_file)
def test_process_raw_data():
    raw_data_dir = 'tests/unit/test_data/raw'
    train_data_dir = 'tests/unit/test_data/processed'
    dfraw = utils.read_raw_data(raw_data_dir)
    dfuut = utils.process_raw_data(dfraw)
    dfexpected = utils.read_train_data(train_data_dir)
    # assert the comparison; otherwise its result is computed and discarded
    # and the test can never fail
    assert np.array_equal(dfuut.values, dfexpected.values)
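# A hedged alternative to the comparison above: pandas.testing.assert_frame_equal
# raises a descriptive error on the first mismatch instead of returning a boolean.
# This is only a sketch; it assumes both readers return DataFrames whose columns
# line up after processing (check_dtype=False relaxes dtype differences).
def test_process_raw_data_frame_equal():
    import pandas.testing as pdt
    dfraw = utils.read_raw_data('tests/unit/test_data/raw')
    dfuut = utils.process_raw_data(dfraw)
    dfexpected = utils.read_train_data('tests/unit/test_data/processed')
    pdt.assert_frame_equal(dfuut.reset_index(drop=True),
                           dfexpected.reset_index(drop=True),
                           check_dtype=False)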
def run(model, cv):
    experiment_id = str(uuid.uuid4())
    train_data = read_train_data()
    cv = config.cv_schemes[cv]
    pipe = model_dispatcher.models[model]

    # note we pass in all data as pandas df
    # it is up to the model pipeline
    # to select which cols it wants
    X = train_data
    groups = train_data['date'].values
    y = (train_data['resp'] > 0).astype(int).values

    cv_scores = cross_val_score(pipe, X, y, cv=cv, groups=groups,
                                scoring='roc_auc', verbose=10)
    print(cv_scores)
    print(f'mean: {np.mean(cv_scores)} std: {np.std(cv_scores)}')

    # save the model for reproducibility later
    # we haven't fit the model though, just run CV
    joblib.dump(pipe, f'../models/{experiment_id}_pipe_notfitted.bin')
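# A minimal sketch of what an entry in config.cv_schemes might look like; the real
# schemes live in the project's config module, so the splitter below is only an
# assumption. Because groups come from the 'date' column, a group-aware splitter
# such as sklearn's GroupKFold keeps all rows of one date in the same fold, which
# is what cross_val_score(..., groups=groups) expects.
from sklearn.model_selection import GroupKFold

cv_schemes = {
    'group_kfold_5': GroupKFold(n_splits=5),
}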
def test():
    print("=== Test ===")
    args = get_args()
    print(args)

    data_dir = f"./../../asset/{args.dataset}/"
    if args.train:
        test_labels, test_texts = read_train_data(data_dir)
    else:
        test_labels, test_texts = read_test_data(data_dir)

    # test_texts = list(test_texts)[:100]
    # test_labels = list(test_labels)[:100]
    test_texts = list(test_texts)
    test_labels = list(test_labels)

    model_name = args.model
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    test_encodings = tokenizer(
        test_texts, truncation=True, padding=True, max_length=512)
    test_dataset = CustomDataset(test_encodings, test_labels)

    checkpoint_dir = f"./models/{args.task}/{args.model}/"
    best_checkpoint = find_best_checkpoint(checkpoint_dir)
    model = AutoModelForSequenceClassification.from_pretrained(best_checkpoint)

    test_trainer = Trainer(model)
    test_loader = DataLoader(
        test_dataset, batch_size=args.batch_size, shuffle=False)
    raw_pred, _, _ = test_trainer.prediction_loop(
        test_loader, description="prediction")

    # Preprocess raw predictions
    y_pred = np.argmax(raw_pred, axis=1)
    metrics = compute_metrics(y_pred, test_labels)
    print(metrics)

    if args.train:
        fpath = os.path.join(data_dir, f"train-predictions/{args.model}.pkl")
    else:
        fpath = os.path.join(data_dir, f"predictions/{args.model}.pkl")
    parent_dir = "/".join(str(fpath).split('/')[:-1])
    if not os.path.exists(parent_dir):
        os.makedirs(parent_dir)
    with open(fpath, 'wb') as f:
        pickle.dump(y_pred, f)
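# Trainer.prediction_loop is an internal helper; the public Trainer.predict API
# returns the same raw logits without building a DataLoader by hand. A hedged
# sketch only, reusing the test_dataset, compute_metrics and model defined above:
def predict_with_trainer(model, test_dataset, test_labels):
    test_trainer = Trainer(model)
    raw_pred = test_trainer.predict(test_dataset).predictions
    y_pred = np.argmax(raw_pred, axis=1)
    return y_pred, compute_metrics(y_pred, test_labels)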
def get_datas():
    train_data = read_train_data()
    comment = train_data[0]
    result = train_data[1]
    test_data = read_test_data()

    # encode the string labels as integer indices into the sorted class names
    lab = []
    classes_name, classes_count = np.unique(result, return_counts=True)
    for i in range(len(result)):
        lab.append(np.where(classes_name == result[i])[0][0])
    lab = np.asarray(lab)

    return comment, lab, test_data, classes_name
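# The label-encoding loop above does an np.where lookup per sample; np.unique can
# produce the same integer codes in one call via return_inverse (classes_name is
# sorted in both versions, so the codes match). A minimal, hedged sketch:
def encode_labels(result):
    classes_name, lab, classes_count = np.unique(
        result, return_inverse=True, return_counts=True)
    return classes_name, lab, classes_count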
def main():
    # set model
    model = getattr(models, args.model)(args)

    if args.data == 'cifar10':
        image_size = 32
        args.num_classes = 10
    elif args.data == 'cifar100':
        image_size = 32
        args.num_classes = 100
    elif args.data == 'imagenet':
        image_size = 224
        args.num_classes = 1000
    else:
        raise NotImplementedError

    n_flops, n_params = measure_model(model, image_size, image_size)
    print('FLOPs: %.2fM, Params: %.2fM' % (n_flops / 1e6, n_params / 1e6))

    if torch.cuda.device_count():
        model = torch.nn.DataParallel(model)  # for multi-GPU training
    if torch.cuda.is_available():
        model.cuda()
    print(model)

    if args.mode == 'train':
        # get the training loader and validation loader
        train_set, val_set = read_train_data(datadir=args.data_dir, data=args.data)
        # set the start epoch value
        if args.resume:
            start_epoch = None
        else:
            start_epoch = args.start_epoch
        train(startepoch=start_epoch, epochs=args.epochs, model=model,
              train_set=train_set, val_set=val_set, resume=args.resume)
    elif args.mode == 'test':
        test_set = read_test_data(datadir=args.data_dir, data=args.data, mode='test')
        test(model=model, test_set=test_set)
    else:
        raise NotImplementedError
def test_train_model():
    """
    Test XGBoost training on a single machine.

    :return: trained model
    """
    rank = 1
    world_size = 10
    place = "/tmp/data"
    dmatrix = read_train_data(rank, world_size, place)

    param_xgboost_default = {
        'max_depth': 2,
        'eta': 1,
        'silent': 1,
        'objective': 'multi:softprob',
        'num_class': 3
    }

    booster = xgb.train(param_xgboost_default, dtrain=dmatrix)

    assert booster is not None
    return booster
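# A hedged usage sketch: xgb.train returns a Booster, and with the
# 'multi:softprob' objective Booster.predict yields one probability per class.
# Reusing the training DMatrix here is only for illustration; a held-out
# evaluation matrix would normally be passed instead.
def predict_with_booster(booster, dmatrix):
    proba = booster.predict(dmatrix)   # shape: (n_rows, num_class)
    return proba.argmax(axis=1)        # hard class labels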
            if predictions[i]['Category'] == result[i]:
                count += 1
        return count / len(predictions)

    def define_alpha(self, validation_comments, validation_result):
        """
        Helper function to find a good value for hyper param alpha
        """
        alpha = [0.00001, 0.00005, 0.0001, 0.0005, 0.001, 0.005, 0.01]
        result = np.zeros(len(alpha))
        for i in range(len(alpha)):
            print('Alpha ', i + 1, '/', len(alpha), ' : ', alpha[i])
            predict = bayes_classifier.predict(validation_comments, alpha[i])
            result[i] = bayes_classifier.score(predict, validation_result)
            print(result[i])
        print(result)
        print(alpha[np.argmax(result)])
        return alpha[np.argmax(result)]


if __name__ == "__main__":
    train_data = read_train_data()
    test_data = read_test_data()
    comment = train_data[0]
    result = train_data[1]

    bayes_classifier = BayesClassifier()
    alpha_star = 0.01
    bayes_classifier.train(comment, result)
    predictions = bayes_classifier.predict(test_data, alpha_star)
    convert_to_csv(predictions)
def train(args):
    """
    :param args: configuration for train job
    :return: XGBoost model
    """
    # get env from xgboost operator
    start = time.time()
    addr, port, rank = extract_xgbooost_cluster_env(args.scheduler_ip_file)
    world_size = args.num_workers
    n_estimators = args.n_estimators
    rabit_tracker = None

    initial_pool_size = 7 << 30
    rmm.reinitialize(
        pool_allocator=True,
        managed_memory=True,
        initial_pool_size=initial_pool_size,
        logging=False,
    )

    try:
        # start to build the network
        if world_size > 1:
            if rank == 0:
                logger.info("start the master node")

                rabit = RabitTracker(hostIP="0.0.0.0", nslave=world_size)
                rabit.start(world_size)
                rabit_tracker = rabit
                logger.info('###### RabitTracker Setup Finished ######')

            envs = [
                'DMLC_NUM_WORKER=%d' % world_size,
                'DMLC_TRACKER_URI=%s' % addr,
                'DMLC_TRACKER_PORT=%d' % port,
                'DMLC_TASK_ID=%d' % rank,
                'DMLC_WORKER_CONNECT_RETRY=99999999'
            ]
            logger.info('##### Rabit rank setup with below envs #####')
            for i, env in enumerate(envs):
                logger.info(env)
                envs[i] = str.encode(env)

            xgb.rabit.init(envs)
            logger.info('##### Rabit rank = %d' % xgb.rabit.get_rank())
            rank = xgb.rabit.get_rank()
        else:
            world_size = 1
            logging.info("Start the train in a single node")

        logger.info('Init with DMLC rabit: ' + str(time.time() - start))
        start = time.time()

        dmatrix = read_train_data(args.train_files, rank=rank)
        logger.info('IO time with cudf: ' + str(time.time() - start))
        start = time.time()

        kwargs = {}
        kwargs["dtrain"] = dmatrix
        kwargs["num_boost_round"] = int(n_estimators)
        param_xgboost_default = {'learning_rate': 0.3,
                                 'max_depth': 8,
                                 'silent': True,
                                 'objective': 'reg:squarederror',
                                 'subsample': 0.1,
                                 'gamma': 1,
                                 'verbose_eval': True,
                                 'tree_method': 'gpu_hist'}
        kwargs["params"] = param_xgboost_default

        logging.info("starting to train xgboost at node with rank %d", rank)
        bst = xgb.train(**kwargs)
        logger.info('xgboost training time with cudf and rabit on GPU: ' +
                    str(time.time() - start))

        if rank == 0:
            model = bst
        else:
            model = None

        logging.info("finish xgboost training at node with rank %d", rank)
    except Exception as e:
        logger.error("something went wrong: %s", traceback.format_exc())
        raise e
    finally:
        logger.info("xgboost training job finished!")
        if world_size > 1:
            xgb.rabit.finalize()
        if rabit_tracker:
            rabit_tracker.join()

    return model
import numpy as np
import pandas as pd
from utils import (read_train_data, read_test_data, flip_images,
                   eltransform_images, allmasks_to_rles, train_masks_to_rles,
                   draw_grid, elastic_transform, add_noise, affine_transform,
                   rotate_images, invert_images, blur_images, crop_images)
from model import build_unet, dice_coef, mean_iou
from keras.models import load_model
from keras.callbacks import EarlyStopping, ModelCheckpoint
from skimage.transform import resize, warp, AffineTransform, rotate
from skimage import io, img_as_ubyte
from skimage.util import invert
from matplotlib import pyplot as plt
import random
from cv2 import GaussianBlur

# get train data
X_train, Y_train = read_train_data()

if 1:
    ix = 2
    img = X_train[ix]
    label = Y_train[ix]

    X_tf, Y_tf = crop_images(X_train, Y_train)
    img_tf = X_tf[ix]
    label_tf = Y_tf[ix]

    plt.figure(figsize=(8, 8))
    plt.subplot(221)
    plt.title('image')
    io.imshow(img)
    plt.subplot(222)
import numpy as np
from utils import read_train_data, read_test_data

# read training data
train_imgs, train_gts = read_train_data('train_data')

# remove duplicate training imgs
idx_to_rmv = []
for i in range(len(train_imgs) - 1):
    for j in range(i + 1, len(train_imgs)):
        if np.all(train_imgs[i] == train_imgs[j]):
            idx_to_rmv.append(i)
            if train_gts[i] != train_gts[j]:
                idx_to_rmv.append(j)

idx = [i for i in range(len(train_imgs)) if i not in idx_to_rmv]
print('unique train imgs:', len(idx))

# save unique training imgs
np.save('unique_train_imgs_rot_fixed', np.array(train_imgs)[idx])
np.save('unique_train_gts_rot_fixed', np.array(train_gts)[idx])

# read test data
test_imgs, test_gts, ids = read_test_data('test_data')

# save test data
np.save('test_imgs_rot_fixed', np.array(test_imgs))
np.save('test_gts', np.array(test_gts))
np.save('ids', np.array(ids))
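# The pairwise scan above is O(n^2) in the number of images. A hedged sketch of a
# hash-based alternative with the same policy (drop every earlier copy of a
# duplicate, and drop both copies when their labels disagree). It assumes the
# images are NumPy arrays of identical shape and dtype, so equal images share the
# same raw bytes.
def find_indices_to_remove(train_imgs, train_gts):
    seen = {}            # image bytes -> indices of earlier occurrences
    idx_to_rmv = set()
    for j, img in enumerate(train_imgs):
        key = np.asarray(img).tobytes()
        for i in seen.get(key, []):
            idx_to_rmv.add(i)                    # drop the earlier copy
            if train_gts[i] != train_gts[j]:
                idx_to_rmv.add(j)                # labels disagree: drop both
        seen.setdefault(key, []).append(j)
    return idx_to_rmv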
    help='path to training data folder',
    default='train_data',
    type=str)
parser.add_argument('--test_data_path',
                    help='path to test data folder',
                    default='test_data',
                    type=str)
parser.add_argument(
    '--save_path',
    help='save path for training and test numpy matrices of images',
    default='.',
    type=str)
args = parser.parse_args()

# read training data
train_imgs, train_gts = read_train_data(args.train_data_path)

# remove duplicate training imgs
idx_to_rmv = []
for i in range(len(train_imgs) - 1):
    for j in range(i + 1, len(train_imgs)):
        if np.all(train_imgs[i] == train_imgs[j]):
            idx_to_rmv.append(i)
            if train_gts[i] != train_gts[j]:
                idx_to_rmv.append(j)

idx = [i for i in range(len(train_imgs)) if i not in idx_to_rmv]
print('unique train imgs:', len(idx))

# save unique training imgs
np.save(os.path.join(args.save_path, 'unique_train_imgs_rot_fixed'),
def train(args):
    """
    :param args: configuration for train job
    :return: XGBoost model
    """
    addr, port, rank, world_size = extract_xgbooost_cluster_env()
    rabit_tracker = None

    try:
        # start to build the network
        if world_size > 1:
            if rank == 0:
                logger.info("start the master node")

                rabit = RabitTracker(hostIP="0.0.0.0", nslave=world_size,
                                     port=port, port_end=port + 1)
                rabit.start(world_size)
                rabit_tracker = rabit
                logger.info('###### RabitTracker Setup Finished ######')

            envs = [
                'DMLC_NUM_WORKER=%d' % world_size,
                'DMLC_TRACKER_URI=%s' % addr,
                'DMLC_TRACKER_PORT=%d' % port,
                'DMLC_TASK_ID=%d' % rank
            ]
            logger.info('##### Rabit rank setup with below envs #####')
            for i, env in enumerate(envs):
                logger.info(env)
                envs[i] = str.encode(env)

            xgb.rabit.init(envs)
            logger.info('##### Rabit rank = %d' % xgb.rabit.get_rank())
            rank = xgb.rabit.get_rank()
        else:
            world_size = 1
            logging.info("Start the train in a single node")

        df = read_train_data(rank=rank, num_workers=world_size, path=None)

        kwargs = {}
        kwargs["dtrain"] = df
        kwargs["num_boost_round"] = int(args.n_estimators)
        param_xgboost_default = {'max_depth': 2, 'eta': 1, 'silent': 1,
                                 'objective': 'multi:softprob', 'num_class': 3}
        kwargs["params"] = param_xgboost_default

        logging.info("starting to train xgboost at node with rank %d", rank)
        bst = xgb.train(**kwargs)

        if rank == 0:
            model = bst
        else:
            model = None

        logging.info("finish xgboost training at node with rank %d", rank)
    except Exception as e:
        logger.error("something went wrong: %s", traceback.format_exc())
        raise e
    finally:
        logger.info("xgboost training job finished!")
        if world_size > 1:
            xgb.rabit.finalize()
        if rabit_tracker:
            rabit_tracker.join()

    return model
        # Work back in time through the backpointers to get the best sequence
        predicted_tags = [None for x in range(len(S))]
        predicted_tags[0] = self.tags[numpy.argmax(T[0, :])]
        predicted_tags[-1] = self.tags[numpy.argmax(T[-1, :])]
        for i in range(len(S) - 2, 0, -1):
            ind = numpy.argmax(T[i, :])
            tag = self.tags[int(backpointers[i + 1, ind])]
            predicted_tags[i] = tag
        return predicted_tags

    def train(self, sentences, targets):
        # Get word-counts in the training set
        self.get_word_counts(sentences)

        # Use counts to replace infrequent words with UNK
        if self.handle_unks:
            self.replace_UNK(sentences, self.unk_freq)

        # Get counts to compute transition and emission probs later
        self.compute_counts(sentences, targets)

        # Compute P(w|t) and P(ti|ti-1)
        self.estimate_params()
        self.tags = list(self.pos_unigram_counts.keys())
        self.tags.remove("<START>")


if __name__ == "__main__":
    train_sentences, train_targets = read_train_data('train.txt')
    test_sentences, test_targets = read_train_data('test.txt')

    hmm = HMM(unk_freq=1)
    hmm.train(train_sentences, train_targets)
    test_accuracy(hmm, test_sentences, test_targets)
def test_read_data():
    # smoke test: reading the training data should not raise and should return data
    df = read_train_data()
    assert df is not None
def __init__(self,
             data_path=os.path.join(DATA_DIR, "train.txt"),
             vocab_path=os.path.join(DATA_DIR, "conv_word_dict.txt"),
             SPO_vocab_path=os.path.join(DATA_DIR, "p_word_dict.txt"),
             entities_vocab_path=os.path.join(DATA_DIR, "entities_dict.txt"),
             segment_vacab_path=os.path.join(DATA_DIR, "Segment_dict.json"),
             goaltype_vocab_path=os.path.join(DATA_DIR, "goal_type_dict.txt"),
             goals_str_path=os.path.join(DATA_DIR, "goals_dict.json"),
             CLS_token='[CLS]',
             SEP_token='[SEP]',
             UNK_token='[UNK]',
             PAD_token='[PAD]',
             MASK_token='[MASK]',
             limit=None):
    """
    Todo:
        1. get dataset
        2. get the dict for word, entities, segment type
        3. get
    :param data_path:
    """
    # get raw dataset
    self.raw_dataset = read_train_data(data_path, limit=limit)

    # load the vocabularies and build the reverse lookups
    self.word2idx = read_json(vocab_path)
    self.idx2word = {value: key for key, value in self.word2idx.items()}
    self.p_dict = read_json(SPO_vocab_path)
    self.id2p = {value: key for key, value in self.p_dict.items()}
    self.entity2id = read_json(entities_vocab_path)
    self.id2entity = {value: key for key, value in self.entity2id.items()}
    self.segtype2id = read_json(segment_vacab_path)
    self.id2segtype = {value: key for key, value in self.segtype2id.items()}
    self.goaltype2id = read_json(goaltype_vocab_path)
    self.id2goaltype = {value: key for key, value in self.goaltype2id.items()}
    self.goal2str = read_json(goals_str_path)

    # the special tokens
    self.CLS_id = self.word2idx[CLS_token]
    self.SEP_id = self.word2idx[SEP_token]
    self.PAD_id = self.word2idx[PAD_token]
    self.UNK_id = self.word2idx[UNK_token]
    self.MASK_id = self.word2idx[MASK_token]
    self.CLS_token = CLS_token
    self.SEP_token = SEP_token
    self.PAD_token = PAD_token
    self.UNK_token = UNK_token
    self.MASK_token = MASK_token

    # situation and knowledge-graph metadata
    self.situation_type = ["date", "workday", "time", "location", "theme"]
    self.situation_sp_token = '<date>'
    self.kg_type = ["S", "P", "O"]
    self.task_type = 'gtsp'

    # flattened list of turns, without the per-dialogue grouping
    self.flatten_dataset = [dial for item in self.raw_dataset for dial in item]
    self.bot_dial = [item for item in self.flatten_dataset if Raw_data(*item).role == "bot"]
    self.whole_dial = [item[-1] for item in self.raw_dataset]

    # integer dtype used when encoding
    self.int_type = np.int64