def predict(self, fn):
    """Predict the label of a single sample given its file name."""
    pt = Preprocessor()
    tmp = pt.img2vec(fn)
    X_test = tmp.reshape(1, -1)
    ans = self.clf.predict(X_test)
    return ans
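A minimal usage sketch for the method above. The owning class is not shown here; going by the related MNIST snippets in this collection it is assumed to be the Tester wrapper holding a fitted classifier in self.clf, and the image path is invented for illustration.

# Hypothetical usage, assuming predict() lives on the Tester wrapper
# from the related snippets; "digit.png" is a made-up sample path.
tester = Tester("mnist_svm.m")
label = tester.predict("digit.png")  # img2vec -> reshape(1, -1) -> clf.predict
print(label)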
def main(args):
    if args.dataset == 'Dolphin18k':
        from utils import Preprocessor
    elif args.dataset == 'Math23k':
        from utils import Math23kPreprocessor as Preprocessor
    else:
        logging.error('Incompatible dataset!')
        return

    if args.index is not None:
        with open(args.index) as f:
            shuffled_index = json.load(f)
    else:
        shuffled_index = None

    preprocessor = Preprocessor(args.embedding_path)
    train, valid = preprocessor.get_train_valid_dataset(
        args.data_path, args.valid_ratio,
        index=shuffled_index, char_based=args.char_based)

    with open(args.output, 'wb') as f:
        pickle.dump({'train': train,
                     'valid': valid,
                     'preprocessor': preprocessor}, f)
def eval_db_agent(env, params):
    if params['use_preproc']:
        preprocessor = Preprocessor(params['state_dim'], params['history'],
                                    params['use_luminance'], params['resize_shape'])
        params['state_dim'] = preprocessor.state_shape
    else:
        preprocessor = None

    agent = VAE(params['state_dim'], params['action_dim'])
    if params['use_cuda']:
        agent = agent.cuda()
        agent.load_state_dict(
            torch.load('./agents/{0}_{1}'.format(params['arch'], params['env_name'])))
    else:
        agent.load_state_dict(
            torch.load('./agents/{0}_{1}'.format(params['arch'], params['env_name']),
                       map_location='cpu'))
    agent.eval()

    agent_steps = 0
    episode_rewards = []
    start = time.time()

    for episode in range(1, params['num_episodes'] + 1):
        env_state = env.reset()
        episode_reward = 0.0
        for t in range(1, params['max_steps'] + 1):
            if params['env_render']:
                env.render()
            if preprocessor:
                state = preprocessor.process_state(env_state)
            else:
                state = env_state
            var_state = createVariable(state, use_cuda=params['use_cuda'])
            action, state_val = agent.sample_action_eval(var_state)

            reward = 0.0
            for _ in range(1):
                env_state, r, terminal, _ = env.step(action)
                reward += r
                if terminal:
                    break
            episode_reward += reward
            if terminal:
                break

        episode_rewards.append(episode_reward)
        agent_steps += t
        if preprocessor:
            preprocessor.reset()

        print('Episode {0} | Total Steps {1} | Total Reward {2} | Mean Reward {3} | Total Time {4}'
              .format(episode, agent_steps, episode_reward,
                      sum(episode_rewards[-100:]) / 100,
                      timeSince(start, episode / params['num_episodes'])))
class TestFeatureAdder(unittest2.TestCase):
    def setUp(self):
        self.stats_calc = StatsCalculator()
        self.preprocessor = Preprocessor()
        self.feature_adder = FeatureAdder()
        self.col_names = [f'feature_{i}' for i in range(FEATURES)]

    def tearDown(self):
        self.stats_calc = None
        self.preprocessor = None
        self.feature_adder = None

    def _get_df(self):
        df = pd.read_csv('data/train.tsv', sep='\t')
        df = self.preprocessor.split_features(df)
        df = self.preprocessor.f_to_int(df)
        return df

    def test_max_index_feature(self):
        """
        Test that the new feature 'max_feature_2_index' lies in the proper
        range and has dtype 'int64'.
        """
        df = self._get_df()
        new_feature = 'max_feature_2_index'
        df = self.feature_adder.max_index_feature(df)
        valid_range, valid_dtype = (0, 255), 'int64'
        assert df[new_feature].between(*valid_range).all() and df[new_feature].dtype == valid_dtype, \
            "max_feature_2_index feature not in range OR has wrong dtype"

    def test_abs_mean_diff_feature(self):
        """
        Test that the new feature 'max_feature_2_abs_mean_diff' is valid.
        """
        df = self._get_df()
        df = self.feature_adder.max_index_feature(df)
        new_feature = 'max_feature_2_abs_mean_diff'
        cols = np.array(self.col_names)[df['max_feature_2_index'].values]
        train_stats = find_train_stats('data/train.tsv', chunksize=10000)
        df = self.feature_adder.abs_mean_diff_feature(
            df.loc[:, df.columns != 'id_job'], train_stats)
        results = []
        for i, col in enumerate(cols):
            # keep in mind outliers in test data
            lower_bound, upper_bound = 0, train_stats[col]['std']
            results.append(lower_bound <= df[new_feature][i] <= upper_bound)
        self.assertTrue(
            np.all(results),
            "max_feature_2_abs_mean_diff feature not in expected range OR has wrong dtype"
        )
def main(test_csv, test_target_csv, prediction_csv, model_dir):
    start_time = time.time()

    # load model
    model_config_filename = os.path.join(model_dir, 'model_config.pkl')
    metric_file = os.path.join(model_dir, 'rating.txt')
    with open(model_config_filename, 'rb') as fin:
        model_config = pickle.load(fin)

    # read dataset
    df = pd.read_csv(test_csv)
    print('Dataset read, shape {}'.format(df.shape))
    line_id = df['line_id']

    preprocessor = Preprocessor(model_config['features'])
    df_X = preprocessor.transform(df)

    model = model_config['model']
    if model_config['mode'] == 'regression':
        df['prediction'] = model.predict(df_X.values)
    elif model_config['mode'] == 'classification':
        df['prediction'] = model.predict_proba(df_X.values)[:, 1]

    df['line_id'] = line_id
    df[['line_id', 'prediction']].to_csv(prediction_csv, index=False)
    print('Prediction time: {}'.format(time.time() - start_time))

    if test_target_csv:
        def save_metric(metric):
            with open(metric_file, 'a') as f:
                f.write('{}\n'.format(metric))

        # read targets
        test = pd.read_csv(test_target_csv)
        print('Read targets, shape {}'.format(test.shape))

        if model_config['mode'] == 'regression':
            pred = preprocessor.target_inverse_transform(df['prediction'])
            mse = np.mean((test.target - pred) ** 2)
            r_2 = 1 - mse / np.std(test.target) ** 2
            print('MSE: {}'.format(mse))
            print('R^2: {}'.format(r_2))
            save_metric(r_2)
        elif model_config['mode'] == 'classification':
            auc = roc_auc_score(test.target, df['prediction'])
            print('AUC: {}'.format(auc))
            save_metric(auc)
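A hedged invocation sketch for the prediction entry point above. All paths are hypothetical; it assumes model_dir already contains the model_config.pkl written by the matching training script elsewhere in this collection.

# Hypothetical paths, for illustration only.
main(test_csv='data/test.csv',
     test_target_csv='data/test-target.csv',
     prediction_csv='predictions/test-prediction.csv',
     model_dir='models/run_01')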
def run():
    pt = Preprocessor()
    tr = Trainer()
    X_train, y_train = pt.load_data()
    X_test, y_test = pt.load_data("mnist_test_data.npz")
    x1 = X_train.reshape((-1, 28, 28, 1))
    x2 = X_test.reshape((-1, 28, 28, 1))
    y1 = keras.utils.to_categorical(y_train, len(np.unique(y_train)))
    y2 = keras.utils.to_categorical(y_test, len(np.unique(y_test)))
    clf = tr.cnn(x1, y1, x2, y2)
    tr.save(clf, "cnn_mnist_keras.h5")
    return clf
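A short follow-up sketch, assuming the standard Keras API: reload the CNN saved by run() above and classify a few test digits. Only the "cnn_mnist_keras.h5" filename and the Preprocessor loader come from the snippet; the rest is illustrative.

from keras.models import load_model
import numpy as np

model = load_model("cnn_mnist_keras.h5")
pt = Preprocessor()
X_test, y_test = pt.load_data("mnist_test_data.npz")  # same loader as run()
probs = model.predict(X_test[:10].reshape((-1, 28, 28, 1)))
print(np.argmax(probs, axis=1))  # predicted digit labels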
def train_agent(cmdl):
    step_cnt = 0
    ep_cnt = 0
    preprocess = Preprocessor(cmdl.env_class).transform
    env = utils.get_new_env(cmdl.env_name)
    agent = get_agent(cmdl.agent.name)(env.action_space, cmdl.agent)
    display_setup(env, cmdl)
    start_time = time.time()

    while step_cnt < cmdl.training.step_no:
        ep_cnt += 1
        o, r, done = env.reset(), 0, False
        s = preprocess(o)

        while not done:
            a = agent.evaluate_policy(s)
            o, r, done, _ = env.step(a)
            _s, _a = s, a
            s = preprocess(o)

            agent.improve_policy(_s, _a, r, s, done)

            step_cnt += 1
            agent.gather_stats(r, done)

        if ep_cnt % cmdl.report_freq == 0:
            agent.display_stats(start_time)
            agent.display_model_stats()

    end_time = time.time()
    display_stats(ep_cnt, step_cnt, end_time - start_time)
def main():
    # create prediction dataframe and score dataframe list
    prediction_df = pd.DataFrame()
    score_df_list = []

    # transform, predict, score for each combination
    for scaler, (model_class, params) in product(config.scalers, config.models):
        # assign preprocessor and model
        preprocessor = Preprocessor(scaler)
        model = model_class(**params)

        # process train, validation, test data
        train_score, validation_score = process_train_data(preprocessor, model)
        test_prediction, test_score = process_test_data(preprocessor, model)

        # add predicted data to prediction dataframe
        prediction_df[f"{scaler.__name__}-{model}"] = test_prediction

        # add score dataframes to the list
        score_df_list.extend([train_score, validation_score, test_score])

    # concatenate score dataframes
    report_df = pd.concat(score_df_list)

    # save prediction and report to csv files
    prediction_df.to_csv(config.prediction_file_path)
    report_df.to_csv(config.report_file_path)
def __init__(self, config_path):
    self.stream_api = StreamAPI.StreamAPI(config_path)
    self.sorter = TweetSorter.TweetSorter()
    self.preprocessor = Preprocessor.Preprocessor()
    self.selector = AttributeSelector.AttributeSelector()
    self.saver = ImageSaver.ImageSaver()
    self.vader = SentimentAnalyzer.SentimentAnalyzer()
    self.db = []
    self.tweets_num = 0
def run_train():
    # t0 = time.time()
    pt = Preprocessor()
    tr = Trainer_nn()
    X_train, y_train = pt.get_data_labels()
    X_test, y_test = pt.get_data_labels("test")
    # X_train, y_train = pt.load_data()
    # X_test, y_test = pt.load_data("mnist_test_data.npz")
    clf = tr.mlp(X_train, y_train)
    tr.save_model(clf, "mlp_mnist_Hu300x300ReluSgdIter100Acc96Sample60000.m")
    tester = Tester("mlp_mnist_Hu300x300ReluSgdIter100Acc96Sample60000.m")
    mt, score, repo = tester.clf_quality(X_test, y_test)
    print(mt, score, repo)
    return clf
def run_train():
    t0 = time.time()
    pt = Preprocessor()
    tr = Trainer()
    X_train, y_train = pt.get_data_labels()
    X_test, y_test = pt.get_data_labels("test")
    t1 = time.time()
    print(t1 - t0)
    clf = tr.svc(X_train, y_train)
    print(time.time() - t1)
    tr.save_model(clf, "mnist_svm.m")
    tester = Tester("mnist_svm.m")
    mt, score, repo = tester.clf_quality(X_test, y_test)
    print(mt, score, repo)
    return clf
def run():
    pt = Preprocessor()
    tr = Trainer()
    ts = Tester()
    t0 = time.time()
    X_train, y_train = pt.load_data()
    X_test, y_test = pt.load_data("mnist_test_data.npz")
    X_train, y_train = make_shuffle(X_train, y_train)
    X_test, y_test = make_shuffle(X_test, y_test)
    X_train = X_train.reshape((-1, 1, 28, 28))
    X_test = X_test.reshape((-1, 1, 28, 28))
    print(time.time() - t0)
    t1 = time.time()
    clf = tr.net(X_train, y_train)
    print(time.time() - t1)
    acc = ts.get_acc(clf, X_test, y_test)  # acc = 97.8%
    return clf, acc
def main():
    emotionals, rationals = emotional_rational()
    preprocessor = Preprocessor()
    emotionals = preprocessor.parse_sentences(emotionals)
    rationals = preprocessor.parse_sentences(rationals)

    train_pos = emotionals[:len(emotionals) // 2]
    train_neg = rationals[:len(rationals) // 2]
    test_pos = emotionals[len(emotionals) // 2:]
    test_neg = rationals[len(rationals) // 2:]

    vectorizer = CountVectorizer()
    X_train = vectorizer.fit_transform(train_pos + train_neg)
    y_train = np.array([1] * len(train_pos) + [0] * len(train_neg))
    X_test = vectorizer.transform(test_pos + test_neg)
    y_test = np.array([1] * len(test_pos) + [0] * len(test_neg))
    print('Vocabulary size : {}'.format(len(vectorizer.vocabulary_)))

    nbsvm = NBSVM()
    nbsvm.fit(X_train, y_train)
    print('Test accuracy : {}'.format(nbsvm.score(X_test, y_test)))

    y_pred = nbsvm.predict(X_test)
    print('F1 score : {}'.format(f1_score(y_test, y_pred, average='macro')))

    fpr, tpr, thresholds = roc_curve(y_test, y_pred, pos_label=1)
    roc_auc = auc(fpr, tpr)
    print('AUC of emotionals : {}'.format(roc_auc))
    plot_roc_curve(fpr, tpr, roc_auc, 'nbsvm_emotional_roc.png')

    fpr, tpr, thresholds = roc_curve(y_test, y_pred, pos_label=0)
    roc_auc = auc(fpr, tpr)
    print('AUC of rationals : {}'.format(roc_auc))
    plot_roc_curve(fpr, tpr, roc_auc, 'nbsvm_rational_roc.png')
def evaluate_agent(crt_training_step, eval_env, eval_agent, policy, cmdl):
    print("[Evaluator] Initializing at %d training steps:" % crt_training_step)
    agent = eval_agent
    eval_env.get_crt_step(crt_training_step)
    agent.policy_evaluation.policy.load_state_dict(policy.state_dict())
    preprocess = Preprocessor(cmdl.env_class).transform

    step_cnt = 0
    o, r, done = eval_env.reset(), 0, False
    while step_cnt < cmdl.evaluator.eval_steps:
        s = preprocess(o)
        a = agent.evaluate_policy(s)
        o, r, done, _ = eval_env.step(a)
        step_cnt += 1
        if done:
            o, r, done = eval_env.reset(), 0, False
def train_agent(cmdl):
    step_cnt = 0
    ep_cnt = 0
    start_time = time.time()

    env = utils.get_new_env(cmdl.env_name, cmdl)
    eval_env = EvaluationMonitor(gym.make(cmdl.env_name), cmdl)
    name = cmdl.agent.name
    agent = get_agent(name)(env.action_space, cmdl.agent)
    eval_agent = get_agent(name)(eval_env.action_space, cmdl.agent, False)
    preprocess = Preprocessor(cmdl.env_class).transform
    agent.display_setup(env, cmdl)

    while step_cnt < cmdl.training.step_no:
        ep_cnt += 1
        o, r, done = env.reset(), 0, False
        s = preprocess(o)

        while not done:
            a = agent.evaluate_policy(s)
            o, r, done, _ = env.step(a)
            _s, _a = s, a
            s = preprocess(o)

            agent.improve_policy(_s, _a, r, s, done)

            step_cnt += 1
            agent.gather_stats(r, done)

            if step_cnt % cmdl.report_freq == 0:
                agent.display_stats(start_time)
                agent.display_model_stats()
                gc.collect()

            if step_cnt % cmdl.eval_freq == 0:
                evaluate_agent(step_cnt, eval_env, eval_agent,
                               agent.policy, cmdl)

    end_time = time.time()
    agent.display_final_report(ep_cnt, step_cnt, end_time - start_time)
def main():
    emotionals, rationals = emotional_rational()
    preprocessor = Preprocessor()
    emotionals = preprocessor.parse_sentences(emotionals)
    rationals = preprocessor.parse_sentences(rationals)
    emotionals = emotionals[:len(emotionals)]
    rationals = rationals[:len(emotionals)]
    sentences = emotionals + rationals
    Y = np.array([[0, 1]] * len(emotionals) + [[1, 0]] * len(rationals))

    max_features = 200
    tokenizer = Tokenizer(num_words=max_features, split=' ')
    tokenizer.fit_on_texts(sentences)
    X = tokenizer.texts_to_sequences(sentences)
    X = pad_sequences(X, maxlen=MAX_LEN)
    epochs = 15

    # --- Add Features ---
    dict_loader = EmotionalDict('dataset/nouns', 'dataset/verbs')
    emotional_dict = dict_loader.load()
    features_loader = AdditionalFeatures(emotionals + rationals, emotional_dict)
    add_features = features_loader.emotional_features()
    ######################

    x_aux_train = add_features[:848]
    x_aux_test = add_features[848:]

    model = build_model(x_aux_train.shape)
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.33,
                                                        random_state=42)
    print(X_train.shape, Y_train.shape)
    print(X_test.shape, Y_test.shape)

    batch_size = 32
    model.fit({'main_input': X_train, 'add_input': x_aux_train}, Y_train,
              epochs=epochs, batch_size=batch_size, verbose=2)
    score, acc = model.evaluate({'main_input': X_test, 'add_input': x_aux_test},
                                Y_test, verbose=2, batch_size=batch_size)
    print('score: {}'.format(score))
    print('acc: {}'.format(acc))

    Y_pred = model.predict({'main_input': X_test, 'add_input': x_aux_test},
                           batch_size=1, verbose=2)
    print(classification_report(Y_test[:, 1], np.round(Y_pred[:, 1]),
                                target_names=['rationals', 'emotionals']))

    fpr, tpr, _ = roc_curve(Y_test[:, 1], Y_pred[:, 1])
    roc_auc = auc(fpr, tpr)
    plot_roc_curve(fpr, tpr, roc_auc, 'roc.png')

    cnf_matrix = confusion_matrix(Y_test[:, 1], np.round(Y_pred[:, 1]))
    plot_confusion_matrix(cnf_matrix, ['rationals', 'emotionals'], 'cnf.png')

    attention_vector = np.mean(
        get_activations(model, X_test, True, 'attention_vec')[0],
        axis=2).squeeze()
    attention_vector = np.mean(attention_vector, axis=0)

    import matplotlib.pyplot as plt
    import pandas as pd
    pd.DataFrame(attention_vector, columns=['attention (%)']).plot(
        kind='bar', title='Attention')
    plt.savefig('attention_vec.png')

    attention_vector_indices = np.argsort(attention_vector)[::-1]
    word_index = tokenizer.word_index
    word_index_inv = {v: k for k, v in word_index.items()}
    with open('attention_word.txt', 'w') as f:
        for i, attention_index in enumerate(attention_vector_indices, start=1):
            try:
                print('No.{} : {}'.format(i, word_index_inv[attention_index]), file=f)
            except KeyError:
                continue
from utils import Preprocessor

if __name__ == '__main__':
    print('Initializing preprocessor')
    preprocessor = Preprocessor()
    print('Running preprocessor')
    preprocessor.run()
    print('Saving trie and inverted index')
    preprocessor.save()

    print('Preprocessor stats')
    max_key_length = max(map(len, preprocessor.stats.keys()))
    for k, v in preprocessor.stats.items():
        print(f"{k.ljust(max_key_length)}: {v}")

    print('Most common tokens')
    max_key_length = max(map(len, preprocessor.most_common.keys()))
    for token, token_count in preprocessor.most_common.items():
        print(f"{repr(token).ljust(max_key_length + 2)} appeared at least ONCE in {str(token_count).ljust(5)} documents")

    print('Done')
def train(args):
    vocab = Vocab.load(args.vocab, max_size=args.vocab_size)
    data_reader = DataReader(data_dir=args.data_dir, shuffle=True)
    preprocessor = Preprocessor(
        predict_prev=args.predict_prev, predict_cur=args.predict_cur,
        predict_next=args.predict_next, vocab=vocab,
        max_length=args.max_length, gpu=args.gpu)
    model = SkipThought(
        rnn_type=args.rnn_type, num_words=len(vocab),
        word_dim=args.word_dim, hidden_dim=args.hidden_dim,
        bidirectional=args.bidirectional, predict_prev=args.predict_prev,
        predict_cur=args.predict_cur, predict_next=args.predict_next)
    print(model)
    if args.pretrained is not None:
        print(f'Loading pretrained model from {args.pretrained}')
        model.load_state_dict(
            torch.load(args.pretrained,
                       map_location=lambda storage, loc: storage))
    if args.gpu > -1:
        model.cuda(args.gpu)
    optimizer = optim.Adam(model.parameters())

    summary_writer = SummaryWriter(os.path.join(args.save_dir, 'log'))

    def add_scalar_summary(name, value, step):
        summary_writer.add_scalar(tag=name, scalar_value=value,
                                  global_step=step)

    def add_text_summary(name, value, step):
        summary_writer.add_text(tag=name, text_string=value, global_step=step)

    def variable(tensor, volatile=False):
        return Variable(tensor, volatile=volatile)

    def run_train_iter(batch):
        if not model.training:
            model.train()
        src, tgt = preprocessor(batch)
        src = (variable(src[0]), src[1])
        for k in tgt:
            tgt[k] = (variable(tgt[k][0]), tgt[k][1])
        logits = model.forward(src=src, tgt=tgt)
        loss = 0
        for k in tgt:
            logits_k = logits[k]
            tgt_k = tgt[k]
            loss = loss + basic.sequence_cross_entropy(
                logits=logits_k[:-1], targets=tgt_k[0][1:],
                length=tgt_k[1] - 1)
        optimizer.zero_grad()
        loss.backward()
        clip_grad_norm(model.parameters(), max_norm=10)
        optimizer.step()
        return loss.data[0]

    def ids_to_words(ids):
        words = []
        eos_id = vocab.stoi(vocab.eos)
        for id_ in ids:
            words.append(vocab.itos(id_))
            if id_ == eos_id:
                break
        return words

    def generate_using_decoder(name, src, max_length):
        _, encoder_state = model.encoder(words=src[0], length=src[1])
        if isinstance(encoder_state, tuple):  # LSTM
            encoder_state = encoder_state[0]
        context = (encoder_state.transpose(0, 1).contiguous()
                   .view(-1, args.hidden_dim))
        batch_size = src[1].size(0)
        bos_id = vocab.stoi(vocab.bos)
        bos = Variable(src[1].new(1, batch_size).fill_(bos_id))
        decoder = model.get_decoder(name)
        prev_pred = bos
        done = torch.zeros(batch_size).byte()
        hyps = []
        prev_state = context.unsqueeze(0)
        for t in range(max_length):
            if done.all():
                break
            decoder_input = prev_pred
            logit, prev_state = decoder(words=decoder_input,
                                        prev_state=prev_state)
            pred = logit.max(2)[1]
            prev_pred = pred
            hyps.append(pred.data)
        hyps = torch.cat(hyps, dim=0).transpose(0, 1).tolist()
        return hyps

    def generate(batch):
        # Greedy search
        src, tgt = preprocessor(batch)
        src = (variable(src[0]), src[1])
        for k in tgt:
            tgt[k] = (variable(tgt[k][0], volatile=True), tgt[k][1])
        batch_size = src[0].size(1)
        max_length = src[0].size(0) * 2
        generated = {}
        for k in tgt:
            generated[k] = generate_using_decoder(
                name=k, src=src, max_length=max_length)
        results = []
        for i in range(batch_size):
            res = {'src': ' '.join(ids_to_words(src[0][:src[1][i], i].data)),
                   'tgt': {}, 'out': {}}
            for k in tgt:
                res['tgt'][k] = ' '.join(ids_to_words(tgt[k][0][1:, i].data))
                res['out'][k] = ' '.join(ids_to_words(generated[k][i]))
            results.append(res)
        return results

    def generate_synthetic_batch(real_batch):
        def sort_by_length(tgt_of_key):
            sorted_length, sort_inds = tgt_of_key[1].sort(
                dim=0, descending=True)
            return tgt_of_key[0][:, sort_inds], sorted_length

        # Forward: given prev, generate cur'
        _, tgt = preprocessor(real_batch)
        tgt_prev, tgt_prev_length = sort_by_length(tgt['prev'])
        syn_src_fw = generate_using_decoder(
            name='next',
            src=(variable(tgt_prev[1:], volatile=True), tgt_prev_length - 1),
            max_length=args.max_length)
        # Backward: given next, generate cur''
        tgt_next, tgt_next_length = sort_by_length(tgt['next'])
        syn_src_bw = generate_using_decoder(
            name='prev',
            src=(variable(tgt_next[1:], volatile=True), tgt_next_length - 1),
            max_length=args.max_length)
        syn_batch_fw = []
        syn_batch_bw = []
        for i in range(len(real_batch)):
            syn_src_fw_str = ' '.join(ids_to_words(syn_src_fw[i]))
            syn_src_bw_str = ' '.join(ids_to_words(syn_src_bw[i]))
            syn_batch_fw.append(
                (real_batch[i][0], syn_src_fw_str, real_batch[i][2]))
            syn_batch_bw.append(
                (real_batch[i][0], syn_src_bw_str, real_batch[i][2]))
        return syn_batch_fw, syn_batch_bw

    global_step = 0

    def print_samples():
        model.eval()
        num_samples = 2
        samples = data_reader.next_batch(size=num_samples, peek=True)
        syn_samples_fw, syn_samples_bw = generate_synthetic_batch(samples)
        gen_results = generate(samples)
        syn_gen_results_fw = generate(syn_samples_fw)
        syn_gen_results_bw = generate(syn_samples_bw)
        text_val = ''
        for i, res in enumerate(gen_results):
            text_val += f'* sample (real) #{i}\n'
            text_val += f'\t* src: {res["src"]}\n'
            for k in res['tgt']:
                tgt_k = res['tgt'][k]
                out_k = res['out'][k]
                text_val += f'\t* {k} (tgt): {tgt_k}\n'
                text_val += f'\t* {k} (out): {out_k}\n'
        for i, res in enumerate(syn_gen_results_fw):
            text_val += f'* sample (syn_fw) #{i}\n'
            text_val += f'\t* src: {res["src"]}\n'
            for k in res['tgt']:
                tgt_k = res['tgt'][k]
                out_k = res['out'][k]
                text_val += f'\t* {k} (tgt): {tgt_k}\n'
                text_val += f'\t* {k} (out): {out_k}\n'
        for i, res in enumerate(syn_gen_results_bw):
            text_val += f'* sample (syn_bw) #{i}\n'
            text_val += f'\t* src: {res["src"]}\n'
            for k in res['tgt']:
                tgt_k = res['tgt'][k]
                out_k = res['out'][k]
                text_val += f'\t* {k} (tgt): {tgt_k}\n'
                text_val += f'\t* {k} (out): {out_k}\n'
        add_text_summary('Sample', value=text_val, step=global_step)

    for epoch in range(args.max_epoch):
        data_reader.start_epoch()
        for batch in tqdm(data_reader.iterator(args.batch_size),
                          desc=f'Epoch {epoch}'):
            # Train on real batch
            real_loss = run_train_iter(batch)
            # Train on synthetic batches
            syn_batch_fw, syn_batch_bw = generate_synthetic_batch(batch)
            syn_loss_fw = run_train_iter(syn_batch_fw)
            syn_loss_bw = run_train_iter(syn_batch_bw)
            global_step += 1
            add_scalar_summary(name='real_loss', value=real_loss,
                               step=global_step)
            add_scalar_summary(name='syn_loss_fw', value=syn_loss_fw,
                               step=global_step)
            add_scalar_summary(name='syn_loss_bw', value=syn_loss_bw,
                               step=global_step)
            if global_step % args.print_every == 0:
                print_samples()
            if global_step % args.save_every == 0:
                model_filename = f'model-{global_step}.pt'
                model_path = os.path.join(args.save_dir, model_filename)
                torch.save(model.state_dict(), model_path)
                print(f'\nIter #{global_step}: '
                      f'Saved checkpoint to {model_path}')
        text, return_offsets_mapping=True, add_special_tokens=False)[
            "offset_mapping"]
elif config["encoder"] == "BiLSTM":
    tokenize = lambda text: text.split(" ")

    def get_tok2char_span_map(text):
        tokens = tokenize(text)
        tok2char_span = []
        char_num = 0
        for tok in tokens:
            tok2char_span.append((char_num, char_num + len(tok)))
            char_num += len(tok) + 1  # +1: whitespace
        return tok2char_span

preprocessor = Preprocessor(tokenize_func=tokenize,
                            get_tok2char_span_map_func=get_tok2char_span_map)

ori_format = config["ori_data_format"]
if ori_format != "tplinker":  # if already in tplinker format, skip transforming
    for file_name, data in file_name2data.items():
        if "train" in file_name:
            data_type = "train"
        if "valid" in file_name:
            data_type = "valid"
        if "test" in file_name:
            data_type = "test"
        data = preprocessor.transform_data(data, ori_format=ori_format,
                                           dataset_type=data_type, add_id=True)
        file_name2data[file_name] = data
from configuration import Config
from utils import CharaterTable, Preprocessor
from CaptionModel import CaptionModel
from tensorflow import flags

FLAGS = flags.FLAGS
flags.DEFINE_integer("caption_len", 25, "The length of caption")
flags.DEFINE_string(
    "model_weights",
    "/home/suxin/ImageCaption/ImageCaption_coding_test/checkpoint/weights.031-0.776.hdf5",
    "The weights file of test model")

config = Config()
data = Preprocessor(config)
ctable = CharaterTable(data.train_captions + data.val_captions)
caption_len = FLAGS.caption_len

caption_model = CaptionModel(image_len=data.image_len,
                             caption_len=caption_len,
                             vocab_size=ctable.vocab_size,
                             ifpool=config.ifpool)
caption_model.build_inference_model(FLAGS.model_weights, beam_search=False)

result = caption_model.inference(data.val_set)
num = result.shape[0]
captions = [ctable.decode(result[i], calc_argmax=False) for i in range(num)]
for i, caption in enumerate(captions):
    print(i + 8001, end=' ')
    for word in caption:
from utils import save_model
from utils import loader
from featurizer import Featurize
from preprocessor import Preprocessor  # NOTE: module path assumed; the import is missing in the original
from sentiment_analyzer import NaiveBayes
from sklearn.model_selection import train_test_split
from evaluator import evaluate_accuracy
import pickle

X, y = loader()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                    random_state=42)
print(X_train)

token = Preprocessor()
with open('preprocessor.obj', 'wb') as file1:
    pickle.dump(token, file1)

train_preprocessed = [token.tweet_cleaner(i) for i in X_train]

f = Featurize()
train_features = f.vectorize_train(train_preprocessed)
with open('featurizer.obj', 'wb') as file2:
    pickle.dump(f, file2)

model = NaiveBayes()
clf = model.train(train_features, labels=y_train)
save_model(clf)
print('Model is trained')
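A minimal inference-side sketch for the pickles written above. It only shows loading the saved preprocessor/featurizer and cleaning new text with tweet_cleaner(), which appears in the training script; the featurizer's inference-time transform method is not shown in the original, so no call to it is attempted here.

import pickle

with open('preprocessor.obj', 'rb') as f1:
    token = pickle.load(f1)
with open('featurizer.obj', 'rb') as f2:
    featurizer = pickle.load(f2)

new_tweets = ["This movie was surprisingly good!"]  # made-up sample input
cleaned = [token.tweet_cleaner(t) for t in new_tweets]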
def cache_abstraction(env, params):
    if os.path.exists('./out/{0}'.format(params['env_name'])):
        shutil.rmtree('./out/{0}'.format(params['env_name']))

    if params['use_preproc']:
        preprocessor = Preprocessor(params['state_dim'], params['history'],
                                    params['use_luminance'], params['resize_shape'])
        params['state_dim'] = preprocessor.state_shape
    else:
        preprocessor = None

    agent = VAE(params['state_dim'], params['action_dim'])
    if params['use_cuda']:
        agent = agent.cuda()
        agent.load_state_dict(
            torch.load('./agents/{0}_{1}'.format(params['arch'], params['env_name'])))
    else:
        agent.load_state_dict(
            torch.load('./agents/{0}_{1}'.format(params['arch'], params['env_name']),
                       map_location='cpu'))
    agent.eval()

    agent_steps = 0
    episode_rewards = []
    start = time.time()

    for episode in range(1):
        env_state = env.reset()
        episode_reward = 0.0
        for t in range(1, params['max_steps'] + 1):
            if params['env_render']:
                env.render()
            if preprocessor:
                state = preprocessor.process_state(env_state)
            else:
                state = env_state
            var_state = createVariable(state, use_cuda=params['use_cuda'])
            # action, state_val = agent.sample_action_eval(var_state)
            action, state_val, code = agent.sample_action_eval_code(var_state)

            if not os.path.exists('./out/{0}/{1}'.format(params['env_name'], code)):
                os.makedirs('./out/{0}/{1}'.format(params['env_name'], code))
            preprocessor.get_img_state().save(
                './out/{0}/{1}/{2}.png'.format(params['env_name'], code, t))

            reward = 0.0
            for _ in range(1):
                env_state, r, terminal, _ = env.step(action)
                reward += r
                if terminal:
                    break
            episode_reward += reward
            if terminal:
                break

        episode_rewards.append(episode_reward)
        agent_steps += t
        if preprocessor:
            preprocessor.reset()

        print('Episode {0} | Total Steps {1} | Total Reward {2} | Mean Reward {3}'
              .format(episode, agent_steps, episode_reward,
                      sum(episode_rewards[-100:]) / 100))
def eval_agent_parallel(envs, params):
    preprocessors = []
    for _ in range(params['num_envs']):
        if params['use_preproc']:
            preprocessor = Preprocessor(params['state_dim'], params['history'],
                                        params['use_luminance'], params['resize_shape'])
            params['state_dim'] = preprocessor.state_shape
        else:
            preprocessor = None
        preprocessors.append(preprocessor)

    agent = agent_lookup(params)
    restore_model(agent, params['restore'], params['use_cuda'])
    if params['use_cuda']:
        agent.cuda()
    agent.eval()

    episode_rewards = []
    start = time.time()

    for episode in range(1, params['num_episodes'] + 1):
        env_states = [env.reset() for env in envs]
        states = [
            preprocessors[i].process_state(env_states[i])
            if preprocessors[i] else env_states[i]
            for i in range(len(envs))
        ]
        env_status = [False for _ in envs]
        episode_reward = [0.0 for _ in envs]

        for t in range(1, params['max_steps'] + 1):
            if all(env_status):  # replaces reduce(lambda x, y: x and y, ...)
                break
            for i, env in enumerate(envs):
                if params['env_render']:
                    env.render()
                if env_status[i]:
                    continue
                var_state = createVariable(states[i], use_cuda=params['use_cuda'])
                action, state_val = agent.sample_action_eval(var_state)

                reward = 0.0
                for _ in range(1):
                    env_states[i], r, terminal, _ = env.step(action)
                    reward += r
                    if terminal:
                        env_status[i] = True
                        break
                # episode_reward[i] += reward
                states[i] = preprocessors[i].process_state(
                    env_states[i]) if preprocessors[i] else env_states[i]

        for p in preprocessors:
            p.reset()
        episode_rewards.extend(episode_reward)

        if episode % params['print_every'] == 0:
            print('Episode {0} | Total Reward {1} | Mean Reward {2} | Total Time {3}'
                  .format(episode, episode_reward,
                          sum(episode_rewards[-100:]) / 100,
                          timeSince(start, episode / params['num_episodes'])))
def cache_eval_episode(env, params):
    cache_states, cache_distros = [], []

    if params['use_preproc']:
        preprocessor = Preprocessor(params['state_dim'], params['history'],
                                    params['use_luminance'], params['resize_shape'])
        params['state_dim'] = preprocessor.state_shape
    else:
        preprocessor = None

    agent = agent_lookup(params)
    if params['use_cuda']:
        agent = agent.cuda()
        agent.load_state_dict(
            torch.load('./agents/{0}_{1}'.format(params['arch'], params['env_name'])))
    else:
        agent.load_state_dict(
            torch.load('./agents/{0}_{1}'.format(params['arch'], params['env_name']),
                       map_location='cpu'))

    agent_steps = 0
    episode_rewards = []
    start = time.time()

    for episode in range(1):
        env_state = env.reset()
        episode_reward = 0.0
        for t in range(1, params['max_steps'] + 1):
            if params['env_render']:
                env.render()
            if preprocessor:
                state = preprocessor.process_state(env_state)
            else:
                state = env_state
            var_state = createVariable(state, use_cuda=params['use_cuda'])
            action, state_val, distro = agent.sample_action_distro(var_state)
            cache_states.append(state)
            cache_distros.append(distro.cpu().numpy())

            reward = 0.0
            for _ in range(1):
                env_state, r, terminal, _ = env.step(action)
                reward += r
                if terminal:
                    break
            episode_reward += reward
            if terminal:
                break

        episode_rewards.append(episode_reward)
        agent_steps += t
        if preprocessor:
            preprocessor.reset()

        if episode % params['print_every'] == 0:
            print('Episode {0} | Total Steps {1} | Total Reward {2} | Mean Reward {3}'
                  .format(episode, agent_steps, episode_reward,
                          sum(episode_rewards[-100:]) / 100))

    cache_states, cache_distros = np.array(cache_states), np.array(cache_distros)
    pickle.dump((cache_states, cache_distros),
                open('./out/{0}_{1}_episode.pkl'.format(params['arch'], params['env_name']),
                     'wb'), -1)
class TestStatsCalculator(unittest2.TestCase):
    def setUp(self):
        self.stats_calc = StatsCalculator()
        self.preprocessor = Preprocessor()
        self.col_names = [f'feature_{i}' for i in range(FEATURES)]

    def tearDown(self):
        self.stats_calc = None
        self.preprocessor = None

    def _get_df(self):
        df = pd.read_csv('data/train.tsv', sep='\t')
        df = self.preprocessor.split_features(df)
        df = self.preprocessor.f_to_int(df)
        return df

    def test_mean_calc(self):
        df = self._get_df()
        col = random.choice(self.col_names)
        res = self.stats_calc.calc_mean(df, col)
        valid_res = np.mean(df[col])
        self.assertEqual(res, valid_res, "Wrong mean calculation")

    def test_std_calc(self):
        df = self._get_df()
        col = random.choice(self.col_names)
        res = self.stats_calc.calc_std(df, col)
        valid_res = np.std(df[col])
        self.assertEqual(res, valid_res, "Wrong std calculation")

    def test_speed(self):
        """Test parallelized mean calculation."""
        df = self._get_df()
        col = random.choice(self.col_names)

        def wrapper(func):
            def inner(df, col, multiproc=False):
                start = time.time()
                result = func(df, col)
                end = time.time()
                print(f'\nResult of calculation: {result}')
                if multiproc:
                    print(f'Timing of calc in parallel: {end - start}')
                else:
                    print(f'Timing of sequential calc: {end - start}')
                return result
            return inner

        seq_calc = wrapper(self.stats_calc.calc_mean)
        res = seq_calc(df, col)
        parallel_calc = wrapper(self.stats_calc.calc_mean)
        parallel_calc(df, col, multiproc=True)
        true_value = np.mean(df[col])
        self.assertEqual(res, true_value, "Wrong mean calculation")
def train(**kwargs):
    kwargs["real_batch_size"] = kwargs["batch_size"] * kwargs["gradient_accumulation"]
    kwargs["train_dataset"] = kwargs["train_dataset"].split("-")
    kwargs["validation_dataset"] = kwargs["validation_dataset"].split("-")
    args = SimpleNamespace(**kwargs)
    if args.dryrun:
        os.environ['WANDB_MODE'] = 'dryrun'
    run = wandb.init(project="SAPAUT-PAUSES", name=args.run_name)
    if args.load:
        if not os.path.exists(f"models/{args.load}"):
            artifact = run.use_artifact(args.load + ":latest")
            artifact.download(root=f"models/{args.load}")
        args.model_name = f"models/{args.load}"
    special_tokens = ["<punct>"]
    if args.include_pauses:
        if not args.replace_pause:
            special_tokens.append("<pause>")
        ds_type = "ref-pauses"
    else:
        ds_type = "ref"
    if args.teacher_forcing:
        ds_type += "-tf"
    if args.tagging:
        ds_type += "-tag"
    if args.pause_threshold != 0.2:
        download_mode = "reuse_cache_if_exists"
    else:
        download_mode = "reuse_dataset_if_exists"
    ds_train = load_dataset(
        f"punctuation-iwslt2011/{args.train_dataset[0]}.py",
        ds_type,
        download_mode=download_mode,
        splits=[args.train_dataset[1]],
        ignore_verifications=True,
        lookahead_range=args.lookahead,
        pause_threshold=args.pause_threshold,
    )
    print("len", len(ds_train["validation"]))
    tokenizer = AutoTokenizer.from_pretrained(
        args.model_name,
        fast=True,
        additional_special_tokens=special_tokens,
        add_prefix_space=args.tagging,
    )
    if not args.tagging:
        label_names = ds_train[args.train_dataset[1]].features["label"].names
    else:
        label_names = ds_train[args.train_dataset[1]].features["label"].feature.names
    preprocessor = Preprocessor(
        tokenizer,
        args,
        label_names,
        args.replace_pause,
        args.tagging,
    )
    ds_train = ds_train.map(
        preprocessor.preprocess, batched=False, num_proc=args.num_proc
    )
    ds_train.rename_column_("label", "labels")
    ds_valid = load_dataset(
        f"punctuation-iwslt2011/{args.validation_dataset[0]}.py",
        ds_type,
        download_mode=download_mode,
        splits=[args.validation_dataset[1]],
        ignore_verifications=True,
        lookahead_range=args.lookahead,
        pause_threshold=args.pause_threshold,
    )
    ds_valid = ds_valid.map(
        preprocessor.preprocess, batched=False, num_proc=args.num_proc
    )
    ds_valid.rename_column_("label", "labels")
    train = ds_train[args.train_dataset[1]]
    valid = ds_valid[args.validation_dataset[1]]
    train.shuffle(42)
    valid.shuffle(42)
    training_args = TrainingArguments(
        output_dir="./results",
        num_train_epochs=args.epochs,
        per_device_train_batch_size=args.batch_size,
        per_device_eval_batch_size=args.batch_size,
        weight_decay=args.weight_decay,
        logging_dir="./logs",
        logging_steps=args.log_steps,
        evaluation_strategy="steps",
        gradient_accumulation_steps=args.gradient_accumulation,
        eval_steps=args.log_steps,
    )
    config = AutoConfig.from_pretrained(
        args.model_name,
        num_labels=4,
    )
    if not args.tagging:
        model = AutoModelForSequenceClassification.from_pretrained(
            args.model_name, config=config
        )
    else:
        if args.bilstm:
            model = RobertaBiLSTMForTokenClassification.from_pretrained(
                args.model_name, config=config
            )
        else:
            model = AutoModelForTokenClassification.from_pretrained(
                args.model_name, config=config
            )
    model.resize_token_embeddings(len(tokenizer))
    optimizer = AdamW(
        [
            {"params": model.base_model.parameters()},
            {"params": model.classifier.parameters()},
        ],
        lr=args.lr,
        weight_decay=args.weight_decay,
    )
    if args.resample != "None":
        function_dict = {
            "Max": np.max,
            "Mean": np.mean,
            "Median": np.median,
        }
        np_function = function_dict[args.resample]
        mean_samples_excl_none = int(
            np_function(sorted(np.unique(train["labels"], return_counts=True)[1])[:-1])
        )
        per_class_samples = mean_samples_excl_none
        balanced_filter = np.concatenate(
            [
                np.where(np.array(train["labels"]) == i)[0][:per_class_samples]
                for i in range(4)
            ],
            axis=0,
        )
        train = train.select(balanced_filter)
    total_steps = len(train) // args.real_batch_size
    total_steps = total_steps * args.epochs
    schedule = get_linear_schedule_with_warmup(optimizer, total_steps // 2, total_steps)
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train,
        eval_dataset=valid,
        compute_metrics=preprocessor.compute_metrics,
        optimizers=(optimizer, schedule),
    )
    wandb.config.update(args.__dict__)
    if not args.no_train:
        trainer.train()
    if args.tagging:
        return
    lookahead_test = []
    for i in range(5):
        lookahead_test.append(
            valid.select(np.where(np.array(valid["lookahead"]) == i)[0])
        )
    la_metrics = []
    for l_test in lookahead_test:
        la_metrics.append(trainer.predict(l_test).metrics)
    for k in la_metrics[0].keys():
        data = [[i, m[k]] for i, m in enumerate(la_metrics)]
        table = wandb.Table(data=data, columns=["lookahead", k])
        wandb.log(
            {
                f"{k}_lookahead": wandb.plot.line(
                    table, "lookahead", k, title=f"{k} vs. lookahead"
                )
            }
        )
    if args.save:
        trainer.save_model(f"models/{args.save}")
        tokenizer.save_pretrained(f"models/{args.save}")
        model_artifact = wandb.Artifact(args.save, type="model")
        for path in glob.glob(f"models/{args.save}/**/*.*", recursive=True):
            model_artifact.add_file(path)
        wandb.run.log_artifact(model_artifact)
    for i in range(5):
        res_dict = {key: round(val * 100, 1) for key, val in la_metrics[i].items()}
        print(f"------- {i} ----------")
        print(
            "COMMA",
            res_dict["eval_precision_<comma>"],
            res_dict["eval_recall_<comma>"],
            res_dict["eval_f1_<comma>"],
        )
        print(
            "PERIOD",
            res_dict["eval_precision_<period>"],
            res_dict["eval_recall_<period>"],
            res_dict["eval_f1_<period>"],
        )
        print(
            "QUESTION",
            res_dict["eval_precision_<question>"],
            res_dict["eval_recall_<question>"],
            res_dict["eval_f1_<question>"],
        )
        print(
            "OVERALL",
            res_dict["eval_precision"],
            res_dict["eval_recall"],
            res_dict["eval_f1"],
        )
        print()
CSV_FILES = ["sentiment140_labeled_done.csv"]

# first read in the data
data = dict()
for file in TRAINING_FILES:
    with open(os.path.join(TRAINING_DATA_PATH, file), "rb") as f:
        data[file] = pickle.load(f, encoding='latin1')
for file in CSV_FILES:
    data[file] = pd.read_csv(os.path.join(TRAINING_DATA_PATH, file),
                             encoding="ISO-8859-1", header=0,
                             names=["id", "text", "labels"])

# second, preprocess the data for model ingestion
pp = Preprocessor(data, debug=False)
X_base_train, y_base_train, vocab_processor_base, X_base_test, y_base_test = pp.preprocess(
    datasource="s140")
X1_train, y1_train, vocab_processor1, X1_test, y1_test = pp.preprocess(
    datasource="scv1")
X2_train, y2_train, vocab_processor2, X2_test, y2_test = pp.preprocess(
    datasource="scv2")
X_val, y_val = pp.preprocess(datasource="s140", split=False)

# third, run the models
# baseline
cnn_model_base = SarcasmCNN(data=((X_base_train, y_base_train),
                                  (X_base_test, y_base_test)),
                            vocab_processor=vocab_processor_base)
print("Baseline Performance")
cnn_model_base.run()
def main(train_csv, model_dir, mode):
    start_time = time.time()

    # df = pd.read_csv(args.train_csv, low_memory=False)
    df = pd.read_csv(train_csv)
    is_big = df.memory_usage().sum() > BIG_DATASET_SIZE

    # dict with data necessary to make predictions
    model_config = {}
    model_config['is_big'] = is_big

    preprocessor = Preprocessor()
    df_X, df_y = preprocessor.fit_transform(df)
    model_config['features'] = preprocessor.features
    print('Dataset read, shape {}'.format(df_X.shape))

    # fitting
    model_config['mode'] = mode
    if mode == 'regression':
        ridge_model = Ridge()
        cb_model = cb.CatBoostRegressor(
            iterations=300,
            boosting_type=('Ordered' if len(df_X) < 1000 else 'Plain'),
            od_type="IncToDec",
            depth=6,
            od_pval=0.0001,
            # learning_rate=0.03,
            loss_function='RMSE')
        models = [ridge_model, cb_model]
    else:
        log_reg_model = LogisticRegression()
        cb_model = cb.CatBoostClassifier(
            iterations=300,
            boosting_type=('Ordered' if len(df_X) < 1000 else 'Plain'),
            od_type="IncToDec",
            depth=6,
            od_pval=0.0001,
            # learning_rate=0.03,
            loss_function='Logloss',
            logging_level='Verbose')
        models = [log_reg_model, cb_model]

    for model in models:
        model.fit(df_X, df_y)

    # weight each model by the inverse variance of its training residuals
    D = [1 / np.std(model.predict(df_X) - df_y) ** 2 for model in models]
    s = sum(D)
    coef = [d / s for d in D]
    model = Model(models, coef)
    model_config['model'] = model

    model_config_filename = os.path.join(model_dir, 'model_config.pkl')
    with open(model_config_filename, 'wb') as fout:
        pickle.dump(model_config, fout, protocol=pickle.HIGHEST_PROTOCOL)

    print('Train time: {}'.format(time.time() - start_time))
import cv2
import numpy as np
from utils.Face_Detector import *
from utils.Preprocessor import *
from utils.Model_Loader import *

haar_face_detector = Haar_Face_Detector('./models/front_face_cascade.xml')
preprocessor = Preprocessor(96, 96)
model_loader = Model_Loader('./models/model_2.h5')
model_loader.load_model()

video_capture = cv2.VideoCapture(0)
video_capture.set(cv2.CAP_PROP_FRAME_WIDTH, 1280)
video_capture.set(cv2.CAP_PROP_FRAME_HEIGHT, 720)

while True:
    _, frame = video_capture.read()
    rois = haar_face_detector.detect(frame)
    try:
        x, y, w, h = rois[0]
        roi = frame[y:y + h, x:x + w]
        preprocessed_roi = preprocessor.process(roi)
        eye_x, eye_y = model_loader.get_coordinates(preprocessed_roi,
                                                    preprocessed_roi.shape[0],
                                                    preprocessed_roi.shape[1])
        # cv2.circle(preprocessed_roi, (int(eye_x[0]), int(eye_y[0])), 5, (0, 255, 0), -1)