def test_lstm(self):
    df_train, df_dev, df_test, metadata = get_fake_dataset(with_text_col=True)
    # Change this to wherever you store the pre-trained GloVe file.
    glove_file_path = 'glove/glove.6B.50d.txt'
    text_config = Mapping()
    text_config.mode = 'glove'
    text_config.max_words = 20
    text_config.maxlen = 5
    text_config.embedding_dim = 50
    text_config.embeddings_index = open_glove(glove_file_path)
    encoder = Encoder(metadata, text_config=text_config)
    y_train, X_train_struc, X_train_text = encoder.fit_transform(df_train)
    y_dev, X_dev_struc, X_dev_text = encoder.transform(df_dev)
    y_test, X_test_struc, X_test_text = encoder.transform(df_test)
    text_config.embedding_matrix = encoder.embedding_matrix
    model_config = get_fake_modelconfig('./outputs_test')
    model_config.output_dir = os.path.join(model_config.output_dir, 'lstm')
    if not os.path.exists(model_config.output_dir):
        os.makedirs(model_config.output_dir)
    model = Model(text_config, model_config)
    # Validate on the training data itself so the tiny fake dataset can be fit perfectly.
    hist = model.train(y_train, X_train_struc, X_train_text,
                       y_train, X_train_struc, X_train_text)
    val_acc_true = 1.0
    self.assertTrue(np.isclose(val_acc_true, hist.history['val_acc'][-1]))
def test_tfidf(self):
    df_train, df_dev, df_test, metadata = get_fake_dataset(with_text_col=True)
    text_config = Mapping()
    text_config.mode = 'tfidf'
    text_config.max_words = 20
    encoder = Encoder(metadata, text_config)
    y_train, X_train_struc, X_train_text = encoder.fit_transform(df_train)
    y_dev, X_dev_struc, X_dev_text = encoder.transform(df_dev)
    y_test, X_test_struc, X_test_text = encoder.transform(df_test)
    model_config = get_fake_modelconfig('./outputs_test')
    model_config.output_dir = os.path.join(model_config.output_dir, 'tfidf_text_only')
    if not os.path.exists(model_config.output_dir):
        os.makedirs(model_config.output_dir)
    model = Model(text_config, model_config)
    hist = model.train(y_train, X_train_struc, X_train_text,
                       y_train, X_train_struc, X_train_text)
    val_acc_true = 1.0
    self.assertTrue(np.isclose(val_acc_true, hist.history['val_acc'][-1]))
def test_db_stride1_forward(self):
    cfg.merge_from_file('../../experiments/base_db.yml')
    cfg.MODEL.DB.STRIDE = 1
    model = Model(cfg, is_training=False)
    x = torch.ones([1, 3, 512, 512])
    x = model(x)
    self.assertEqual(x[0].shape, torch.Size([1, 1, 512, 512]))
def test_pse_stride4_forward(self):
    cfg.merge_from_file('../../experiments/base_pse.yml')
    cfg.MODEL.PSE.STRIDE = 4
    model = Model(cfg, is_training=False)
    x = torch.ones([1, 3, 512, 512])
    x = model(x)
    self.assertEqual(x[0].shape, torch.Size([1, 6, 128, 128]))
def test_strucdata_only(self):
    df_train, df_dev, df_test, metadata = get_fake_dataset(with_text_col=False)
    encoder = Encoder(metadata, text_config=None)
    y_train, X_train_struc, X_train_text = encoder.fit_transform(df_train)
    y_dev, X_dev_struc, X_dev_text = encoder.transform(df_dev)
    y_test, X_test_struc, X_test_text = encoder.transform(df_test)
    print(X_train_text, X_dev_text, X_test_text)
    model_config = get_fake_modelconfig('./outputs_test')
    model_config.output_dir = os.path.join(model_config.output_dir, 'dense_mlp')
    if not os.path.exists(model_config.output_dir):
        os.makedirs(model_config.output_dir)
    model = Model(text_config=None, model_config=model_config)
    hist = model.train(y_train, X_train_struc, X_train_text,
                       y_train, X_train_struc, X_train_text)
    val_acc_true = 1.0
    self.assertTrue(np.isclose(val_acc_true, hist.history['val_acc'][-1]))
def get(self):
    country = request.args.get('country')
    sector = request.args.get('sector')
    subject = request.args.getlist('subject')
    # request.args.getlist() returns an empty list (never None) when the
    # parameter is absent, so an emptiness check covers the missing case.
    if not subject:
        return 1
    quality = classify_subject(country, sector, str(subject[0]),
                               Model.get_instance(), LemmasPt.get_instance())
    return int(quality[0])
from modeling import Model
from cross_validation import CrossValidationDQNN

if __name__ == '__main__':
    from config import config

    model = Model(config=config)
    model.add_csv_data('data/csv/test.csv')
    model.train_model()
    model.init_onehot()

    cross_validation = CrossValidationDQNN(config=config)
    estimate = cross_validation.run()
class CrossValidationDQNN:

    def __init__(self, config):
        self.config = config
        self.model = Model(config)
        self.model_dqnn = ModelDQNN()
        # Note: (1) is the integer 1, not a one-element tuple; use (1,) if
        # init_dqnn expects a shape tuple.
        self.model_dqnn.init_dqnn(None, (1))

    def train(self, intervals):
        finish = False
        prepare = Preparer(intervals, **self.config)
        while not finish:
            # get events
            rows, flag = prepare.get_data_from_db()
            batch_states = []
            batch_newstates = []
            batch_actions = []
            for row in rows:
                # get info about the event (placeholders to be filled from `row`)
                time_event = None
                tag_id = None
                user_id = None
                time_delta = None
                # init features
                state = self.model.get_features(user_id, tag_id, time_event)
                next_state = self.model.get_features(user_id, tag_id,
                                                     time_event + time_delta)
                action = 1
                batch_states.append(state)
                batch_newstates.append(next_state)
                batch_actions.append(action)
            if len(batch_states) > 0:
                self.model_dqnn.train(batch_states, batch_newstates, batch_actions)
            if not flag:
                finish = prepare.next_iteration()

    def predict(self, intervals):
        finish = False
        prepare = Preparer(intervals, **self.config)
        while not finish:
            # get events
            rows, flag = prepare.get_data_from_db()
            for row in rows:
                # get info about the event (placeholders to be filled from `row`)
                time_event = None
                tag_id = None
                user_id = None
                time_delta = None
                # init features
                state = self.model.get_features(user_id, tag_id, time_event)
                predict = self.model_dqnn.predict(state)
            # advance to the next chunk, mirroring train(), so the loop terminates
            if not flag:
                finish = prepare.next_iteration()

    def run(self):
        self.init_interval()
        for train_interval, test_interval in zip(self.train_interval,
                                                 self.test_interval):
            self.train(train_interval)
            estimate = self.predict(test_interval)

    def init_interval(self):
        train_dates, test_dates = get_cv_data(db, parts_count=3)
        self.train_interval = train_dates
        self.test_interval = test_dates

    def prepare(self):
        pass
import yaml

from data_utils import DataLoader
from training_utils import Trainer, TrainerConfig
import training_utils
from modeling import Model

if __name__ == '__main__':
    with open("config/config.yaml", "r") as f:
        config = yaml.safe_load(f)

    model = Model(config)
    args = getattr(training_utils, "baseline")

    dl = DataLoader(args)
    tr_dataset, val_dataset = dl.setup()
    tr_dataset = dl.train_dataloader(tr_dataset)
    val_dataset = dl.val_dataloader(val_dataset)

    trainer = Trainer(model, args)
    trainer.fit(tr_dataset, val_dataset)
crawler = Crawler()

# Collect data
weather_data = crawler.weather_fetch()
yesterday_data = crawler.yesterday_fetch()
print(weather_data)
print(yesterday_data)

# Process weather events
weather_events = Events(weather_data)
weather_events.process_events()
weather_info = weather_events.temp_max

# Modeling
now_model = Model(weather_info)
yes_model = Model(yesterday_data)
today_visitor = now_model.modeling()
yesterday_visitor = yes_model.modeling()

# Mood decision
mood = Mood()
template = mood.decision(today_visitor, yesterday_visitor)
print(template)

# Generate the article
article = Article(template, weather_events, today_visitor,
                  yesterday_data, yesterday_visitor)
article_text = article.generate()
print(article_text)

with open("better_than_yesterday.txt", "w", encoding="UTF-8") as f:
    f.write(article_text)
class Trainer(object):

    def __init__(self, cfg):
        self.storage = {}
        self.device = cfg.SOLVER.DEVICE
        self.max_iter = cfg.SOLVER.MAX_ITERS
        self.log_dir = cfg.SOLVER.TENSORBOARD_WRITER.LOG_DIR
        self.base_lr = cfg.SOLVER.LR.BASE_LR
        optimizer_name = cfg.SOLVER.OPTIMIZER
        self.weight_decay = cfg.SOLVER.WEIGHT_DECAY
        self.weights = cfg.SOLVER.WEIGHTS
        self.image_period = cfg.SOLVER.TENSORBOARD_WRITER.IMAGE_PERIOD
        self.scalar_period = cfg.SOLVER.TENSORBOARD_WRITER.SCALAR_PERIOD
        self.save_period = cfg.SOLVER.CHECKPOINT_PERIOD
        self.save_model_dir = cfg.SOLVER.SAVE_DIR
        self.model_name = cfg.SOLVER.CHECKPOINT_NAME
        data_loader = build_train_data_loader(cfg)
        self._data_loader_iter = iter(data_loader)
        self.model = Model(cfg, True).train().to(self.device)
        self.optimizer = self.build_optimizer(optimizer_name, self.model)
        self.lr_scheduler = build_LRscheduler(self.optimizer, cfg)
        self.iter = 0
        self.writer = None
        self.tic = 0
        self.toc = 0

    def build_optimizer(self, name: str,
                        model: torch.nn.Module) -> torch.optim.Optimizer:
        """No bias decay: Bag of Tricks for Image Classification with
        Convolutional Neural Networks (https://arxiv.org/pdf/1812.01187.pdf)"""
        weight_p, bias_p = [], []
        for p_name, p in model.named_parameters():
            if 'bias' in p_name:
                bias_p += [p]
            else:
                weight_p += [p]
        # Note: only the SWA branch below uses these no-bias-decay parameter
        # groups; the Adam and SGD branches fall back to model.parameters().
        parameters = [{
            'params': weight_p,
            'weight_decay': self.weight_decay
        }, {
            'params': bias_p,
            'weight_decay': 0
        }]
        if name == 'Adam':
            return torch.optim.Adam(model.parameters(), lr=self.base_lr)
        if name == 'SGD':
            return torch.optim.SGD(model.parameters(), lr=self.base_lr)
        if name == 'SWA':
            """Stochastic Weight Averaging: Averaging Weights Leads to Wider
            Optima and Better Generalization (https://arxiv.org/pdf/1803.05407.pdf)"""
            base_opt = torch.optim.SGD(parameters, lr=self.base_lr)
            return SWA(base_opt, swa_start=10, swa_freq=5, swa_lr=self.base_lr)

    def before_train(self):
        if self.weights != '':
            checkpoint = torch.load(self.weights)
            self.model.load_state_dict(checkpoint)
        if not os.path.exists(self.save_model_dir):
            os.makedirs(self.save_model_dir)
        self.writer = SummaryWriter(self.log_dir)
        self.model.train()

    def after_train(self):
        model_name = self.model_name + '_' + str(self.iter) + '.pth'
        torch.save(self.model.state_dict(),
                   os.path.join(self.save_model_dir, model_name))

    def before_step(self):
        self.tic = time.time()

    def after_step(self):
        # timing statistics
        self.toc = time.time()
        iter_time = self.toc - self.tic
        self.storage['iter_time'] = iter_time
        # write to TensorBoard
        for key in self.storage:
            if isinstance(self.storage[key], dict):
                sub_dict = self.storage[key]
                for sub_key in sub_dict:
                    value = sub_dict[sub_key]
                    self._write_tensorboard(key + '/' + sub_key, value)
            else:
                value = self.storage[key]
                self._write_tensorboard(key, value)
        # save a checkpoint
        if self.iter % self.save_period == 0:
            model_name = self.model_name + '_' + str(self.iter) + '.pth'
            torch.save(self.model.state_dict(),
                       os.path.join(self.save_model_dir, model_name))

    def _write_tensorboard(self, key: str,
                           value: Union[torch.Tensor, int, float]):
        if isinstance(value, torch.Tensor) and len(value.shape) == 4:
            if self.iter % self.image_period == 0:
                self.writer.add_images(key, value, self.iter)
        elif self.iter % self.scalar_period == 0:
            self.writer.add_scalar(key, value, self.iter)

    def train(self, start_iter=0):
        try:
            self.before_train()
            for self.iter in range(start_iter, self.max_iter):
                self.before_step()
                self.run_step()
                self.after_step()
        finally:
            # runs exactly once, whether training finishes normally or is interrupted
            self.after_train()

    def run_step(self):
        data = next(self._data_loader_iter)
        total_loss, losses, metrics = self.model(data)
        self.storage['total_loss'] = total_loss
        self.storage['losses'] = losses
        self.storage['image'] = data['image']
        self.storage['training_mask'] = data['training_mask']
        self.storage['metrics'] = metrics
        grads = {}
        self.storage['grads'] = grads
        self.optimizer.zero_grad()
        total_loss.backward()
        self.optimizer.step()
        self.storage['lr'] = self.lr_scheduler.get_lr()[0]
        self.lr_scheduler.step()
        for name, parm in self.model.named_parameters():
            if parm.grad is not None:
                grads[name] = torch.mean(torch.abs(parm.grad))
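# Minimal usage sketch (an assumption, not part of the original file): `cfg` is the
# same yacs-style config object used elsewhere in this repo, e.g. populated via
# cfg.merge_from_file as in the forward tests above.
#
#   trainer = Trainer(cfg)
#   trainer.train(start_iter=0)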
import datetime

from modeling import Model

if __name__ == '__main__':
    # create an untrained Model object based on config.py
    from config import config
    model = Model(config=config)
    # or load a previously created model (built on the same DB) with its own configuration
    # model = Model(model_file="/home/user/repos/python_deep_reinforcement_sequence/data/LDA/model/1d439b93-fbc9-4ae4-b76d-7d1332259344_model.pickle")

    # Create the main dictionary from a .csv file: this completely replaces the data in the DB.
    # Call add_csv_data ONLY ONCE, before the first launch of training!
    # model.add_csv_data('data/csv/test_seasons.csv')  # drops the DB, but does not delete model files built from it
    # Alternatively, build it from all .csv files in a given folder, assuming they all have the proper format.
    model.add_csv_data('data/csv/test.csv')  # drops the DB, but does not delete model files built from it

    # a, b = model.get_part_of_data(parts=10)
    model.compute_context_features([datetime.timedelta(days=2),
                                    datetime.timedelta(hours=1)])
    print(1)

    # Train the model.
    # If a model description with matching config parameters is found in the DB,
    # the training phase is skipped. Otherwise, a new model is trained and saved
    # for further use, and its model_id and config parameters are added to the DB.
    # This function also creates the word_id index according to the
    # occurrences_threshold parameter.
    model.train_model()

    # get vector for the given text
"toxic": "This comment is toxic.", "severe_toxic": "This comment is severely toxic.", "obscene": "This comment is obscene.", "threat": "This comment is a threat.", "insult": "This comment is an insult.", "identity_hate": "This comment is hate speech." } max_label_len = max( [len(word_tokenize(x)) for x in labelSentencesDict.values()]) print("Train Model") model = Model(binaryClassification=args["binaryClassification"], model_str=tokenizer_model[1], doLower=args["doLower"], train_batchSize=args["train_batchSize"], testval_batchSize=args["testval_batchSize"], learningRate=args["learningRate"], doLearningRateScheduler=args["doLearningRateScheduler"], labelSentences=labelSentencesDict, max_label_len=max_label_len, device=device) model.run(train_data=train_df[data_column], train_target=train_df[args["targets"]], val_data=val_df[data_column], val_target=val_df[args["targets"]], test_data=test_df[data_column], test_target=test_df[args["targets"]], epochs=args["numEpochs"]) wandb.log({'finished': True}) run_infos = wandb_summarizer.download.get_results(wandb_project_name)
# German auxiliary sentences, roughly: "The rating {} is {}."
texts = [
    "Die Bewertung {} ist {}.".format(cathegoryDict[x.split("_")[0]],
                                      sentimentDict[x.split("_")[1]])
    for x in labels
]
labelSentencesDict = dict(zip(labels, texts))
max_label_len = max(
    [len(word_tokenize(x)) for x in labelSentencesDict.values()])

print("Make Predictions")
model = Model(binaryClassification=model_useBinary,
              model_str=model_technique,
              doLower=args["doLower"],
              train_batchSize=args["train_batchSize"],
              testval_batchSize=args["testval_batchSize"],
              learningRate=args["learningRate"],
              doLearningRateScheduler=args["doLearningRateScheduler"],
              labelSentences=labelSentencesDict,
              smartBatching=args["smartBatching"],
              max_label_len=max_label_len,
              device=device)
model.load(os.path.join(args["model_path"], "apple-flambee-545.pt"))
pred = model.predict(data=predict_df["processed"], device=device)
pd.concat((predict_df.reset_index(drop=True), pred), axis=1).to_csv(
    os.path.join(args["data_path"], "predict",
                 filename[:-4] + "_predictions_raw.csv"))


def logits_to_pred(column):
import datetime

from modeling import Model

if __name__ == '__main__':
    # create an untrained Model object based on config.py
    from config import config
    model = Model(config=config)

    model.create_usarname_tag_onehot_tables(
        usernames=["summer", "autumn", "winter", "newseason"],
        tags=["tag1", "tag2", "newtag"])

    # model._dictionary.test()
    average_user = model._dictionary.get_average_user(
        2, datetime.datetime.fromtimestamp(0))
    pass

    # # memory usage
    # import os
    # import psutil
    # process = psutil.Process(os.getpid())
    # print(process.memory_info().rss)
    # pass
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--encoded_data_dir', type=str,
                        # default='/data/home/t-chepan/projects/MS-intern-project/raw_data',
                        help=('directory to load the encoded data.'))
    # this one is optional
    parser.add_argument('--data_name', type=str,
                        # default='KICK',
                        help=('which data will be used? (kickstarter or indiegogo?)'))
    parser.add_argument('--search_space_filepath', type=str,
                        # default='path/to/search_space.json',
                        help=('where to load the search space file?'))
    parser.add_argument('--output_dir', type=str,
                        # default='path/to/save/outputs',
                        help=('directory to save the trained model and related model_config.'))
    parser.add_argument('--task_type', type=str, default='classification',
                        help=('what is the type of this task? (classification or regression?)'))
    parser.add_argument('--num_classes', type=int,
                        help=('what is the number of classes (classification) or outputs (regression)?'))
    parser.add_argument('--model_type', type=str, default='mlp',
                        help=('what type of NN model do you want to try? (mlp or skip_connections?)'))
    parser.add_argument('--num_trials', type=int, default=1,
                        help=('how many trials do you want to run the model?'))

    args = parser.parse_args()

    if args.data_name is not None and args.encoded_data_dir is not None:
        path_to_data = os.path.join(args.encoded_data_dir, args.data_name)
        path_to_save = os.path.join(args.output_dir, args.data_name)
        if not os.path.exists(path_to_save):
            os.makedirs(path_to_save)
    elif args.data_name is None and args.encoded_data_dir is not None:
        path_to_data = args.encoded_data_dir
        path_to_save = args.output_dir
    else:
        raise argparse.ArgumentTypeError(str(args.data_name) + ' or ' +
                                         str(args.encoded_data_dir) +
                                         " can't be recognized.")

    ###########################################
    ## load encoded training set and dev set ##
    ###########################################

    y_train_path = os.path.join(path_to_data, 'y_train.npy')
    if os.path.exists(y_train_path):
        y_train = np.load(y_train_path, mmap_mode='r')
    else:
        raise ValueError('y_train is not found!')

    X_train_struc_path = os.path.join(path_to_data, 'X_train_struc.npy')
    if os.path.exists(X_train_struc_path):
        X_train_struc = np.load(X_train_struc_path, mmap_mode='r')
    else:
        X_train_struc = None

    X_train_text_path = os.path.join(path_to_data, 'X_train_text.npy')
    if os.path.exists(X_train_text_path):
        X_train_text = np.load(X_train_text_path, mmap_mode='r')
    else:
        X_train_text = None

    y_dev_path = os.path.join(path_to_data, 'y_dev.npy')
    if os.path.exists(y_dev_path):
        y_dev = np.load(y_dev_path, mmap_mode='r')
    else:
        raise ValueError('y_dev is not found!')

    X_dev_struc_path = os.path.join(path_to_data, 'X_dev_struc.npy')
    if os.path.exists(X_dev_struc_path):
        X_dev_struc = np.load(X_dev_struc_path, mmap_mode='r')
    else:
        X_dev_struc = None

    X_dev_text_path = os.path.join(path_to_data, 'X_dev_text.npy')
    if os.path.exists(X_dev_text_path):
        X_dev_text = np.load(X_dev_text_path, mmap_mode='r')
    else:
        X_dev_text = None

    text_config_path = os.path.join(path_to_data, 'text_config.json')
    if os.path.exists(text_config_path):
        with open(text_config_path, 'r') as f:
            text_config = json.load(f)
        text_config = Mapping(text_config)
    else:
        text_config = None

    if text_config is not None and text_config.mode == 'glove':
        embedding_matrix_path = text_config.embedding_matrix_path
        if os.path.exists(embedding_matrix_path):
            embedding_matrix = np.load(embedding_matrix_path, mmap_mode='r')
            text_config.embedding_matrix = embedding_matrix
        else:
            raise ValueError('embedding_matrix is not found!')
    else:
        embedding_matrix = None
    ###########################################
    ## sample model config from search space ##
    ###########################################

    if args.task_type is not None and args.num_classes is not None:
        print('you are choosing ' + args.model_type + ' as the model type!')
        default_model_config = create_default_modelconfig(args.task_type,
                                                          args.num_classes,
                                                          args.model_type,
                                                          path_to_save)
    else:
        raise ValueError('You are missing task_type or num_classes or both!')

    ## load the search space file provided by the user ##
    with open(args.search_space_filepath, 'r') as f:
        search_space = json.load(f)
    search_space = Mapping(search_space)

    #######################################################################
    ## update default model_config based on search_space and train model ##
    #######################################################################

    for i in range(args.num_trials):
        model_config = sample_modelconfig(search_space, default_model_config)
        model_name = 'model_{}'.format(i)
        model_config = Mapping(model_config)
        print('*' * 20)
        print('model_config: ' + model_config.output_dir)

        model_config.output_dir = os.path.join(default_model_config.output_dir,
                                               model_name)
        if not os.path.exists(model_config.output_dir):
            os.makedirs(model_config.output_dir)

        model = Model(text_config, model_config)
        # validate on the dev split loaded above
        hist = model.train(y_train, X_train_struc, X_train_text,
                           y_dev, X_dev_struc, X_dev_text)

        ## save hist.history and model_config ##
        history_path = os.path.join(model_config.output_dir, 'history.json')
        with open(history_path, 'w') as hf:
            json.dump(hist.history, hf)

        model_config_savepath = os.path.join(model_config.output_dir,
                                             'model_config.json')
        with open(model_config_savepath, 'w') as mf:
            json.dump(model_config, mf)
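# Illustration only (an assumption, not shown in the original repo): the structure of
# search_space.json depends on what sample_modelconfig() expects. Assuming it draws
# each hyperparameter from a list of candidate values, a minimal file might look like:
#
#   {
#       "hidden_units": [64, 128, 256],
#       "num_layers": [1, 2, 3],
#       "dropout_rate": [0.0, 0.25, 0.5],
#       "learning_rate": [0.01, 0.001, 0.0001]
#   }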
## create the auxiliary sentences, which are needed if binary classification is done
texts = [
    "The article is about {}.".format(cathegoryDict[x]) for x in args["targets"]
]
labelSentencesDict = dict(zip(args["targets"], texts))
max_label_len = max(
    [len(word_tokenize(x)) for x in labelSentencesDict.values()])

print("Train Model")
model = Model(args=tokenizer_model,
              doLower=args["doLower"],
              train_batchSize=args["train_batchSize"],
              testval_batchSize=args["testval_batchSize"],
              learningRate=args["learningRate"],
              doLearningRateScheduler=args["doLearningRateScheduler"],
              labelSentences=labelSentencesDict,
              smartBatching=args["smartBatching"],
              max_label_len=max_label_len,
              device=device,
              target_columns=args["targets"])

# train and test the model
model.run(train_data=train_data,
          train_target=train_target,
          val_data=val_data,
          val_target=val_target,
          test_data=test_data,
          test_target=test_target,
          epochs=args["numEpochs"])