def main():
    if len(sys.argv) <= 2:
        print(
            "please specify a number of examples and a model name (e.g. models.baseline.random_handle)"
        )
        sys.exit(1)
    eval_set_size = int(sys.argv[1])
    module_name = sys.argv[2]

    # splitting training data
    print("splitting training data into", eval_set_size, "(test) v. rest (train)")
    data.load_train()
    tweets = np.array(data.TRAIN)
    np.random.seed(SEED)
    np.random.shuffle(tweets)
    test_tweets, train_tweets = tweets[:eval_set_size], tweets[eval_set_size:]

    hyper_parameters = models.parse_hyper_parameters(sys.argv[3:])
    model_class = importlib.import_module(module_name).Model
    print("Model:", module_name, hyper_parameters)

    print("Training...")
    model = model_class(tqdm(train_tweets, dynamic_ncols=True), **hyper_parameters)

    print("Evaluating...")
    accuracy, correct, tests = eval.evaluate(
        model, tqdm(test_tweets, dynamic_ncols=True))
    print(f"Label accuracy: {correct}/{tests} ({accuracy:%})")
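# Hedged usage sketch (not part of the original source; the script name and the
# example hyper-parameter are assumptions): assuming this entry point lives in
# evaluate.py and SEED is defined at module level, it would be run as, e.g.:
#
#     python evaluate.py 1000 models.baseline.random_handle alpha=0.5
#
# where 1000 is the size of the held-out evaluation split, the second argument
# is the model module, and any trailing key=value pairs are parsed by
# models.parse_hyper_parameters.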
def train(model: keras.Model):
    from tensorflow.keras.callbacks import ModelCheckpoint, TensorBoard, \
        ReduceLROnPlateau
    from tensorflow.keras.optimizers import SGD

    model.compile(optimizer=SGD(learning_rate=0.1, momentum=0.9, nesterov=True),
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])
    weight_path = 'weights/%s.h5' % model.name

    x_train, y_train = data.load_train('cifar10', channel_first=False)
    train_iter, val_iter = data.get_train_val_iterator(x_train, y_train)

    callbacks = [
        ReduceLROnPlateau(patience=10, min_lr=1e-3, verbose=1),
        ModelCheckpoint(weight_path, verbose=1, save_best_only=True,
                        save_weights_only=True),
        TensorBoard(),
    ]
    steps_per_epoch = int(len(x_train) * (1 - data.val_split)) // common.batch_size
    model.fit(train_iter,
              epochs=200,
              callbacks=callbacks,
              steps_per_epoch=steps_per_epoch,
              validation_data=val_iter)
def train(options):
    attributes_train, labels_train = preprocess(load_train(), normalize=options.normalize)
    attributes_val, labels_val = preprocess(load_val(), normalize=options.normalize)
    n_attributes = attributes_train.shape[1]
    model = get_model(options, n_attributes)
    model.train(attributes_train, labels_train, attributes_val, labels_val)

    # save model
    if options.save_model is not None:
        model.save(options.save_model)

    # compute validation scores
    predictions_val = model.predict(attributes_val)
    return get_binary_class_scores(labels_val, predictions_val)
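# Hedged usage sketch (not from the original project): train() only touches
# options.normalize and options.save_model directly, but get_model() will
# typically expect additional, project-specific fields (model type,
# hyper-parameters), so any options object must provide those as well, e.g. an
# argparse.Namespace produced by the project's own argument parser:
#
#     options = parse_arguments()   # hypothetical CLI parser
#     scores = train(options)       # returns whatever get_binary_class_scores produces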
from keras.optimizers import Adam, Adadelta
from keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from keras import backend as K
from matplotlib import pyplot as plt

from data import load_train
from models.cnn_vgg import VGG

print(K.tensorflow_backend._get_available_gpus())

# Data properties
num_classes = 10
img_x, img_y = 64, 64

# Load training data
train_images, train_labels = load_train()
x_train, x_valid, y_train, y_valid = train_test_split(train_images,
                                                      train_labels,
                                                      test_size=0.2,
                                                      random_state=42,
                                                      stratify=train_labels)

# Reshape and normalize images
x_train = x_train.reshape(x_train.shape[0], img_x, img_y, 1)
x_valid = x_valid.reshape(x_valid.shape[0], img_x, img_y, 1)
x_train = x_train.astype('float32')
x_valid = x_valid.astype('float32')
x_train /= 255.
x_valid /= 255.

# One-hot encode labels
y_train = to_categorical(y_train, num_classes)
y_valid = to_categorical(y_valid, num_classes)

print(f'Train images dim: {x_train.shape}')
def plot_confusion_matrix(X, Y, figsize=(10, 6), cmap=plt.cm.Greens):
    Y_pred = model.predict(X)
    Y_pred = np.argmax(Y_pred, axis=1)
    Y_true = np.argmax(Y, axis=1)
    cm = confusion_matrix(Y_true, Y_pred)
    plt.figure(figsize=figsize)
    ax = sns.heatmap(cm, cmap=cmap, annot=True, square=True)
    ax.set_ylabel('Actual', fontsize=30)
    ax.set_xlabel('Predicted', fontsize=30)
    plt.show()


if __name__ == '__main__':
    train_data = load_train()
    X_train, Y_train = separate_train(train_data)
    X_train, Y_train = preprocess_input(X_train, Y_train)
    X_train, X_val, Y_train, Y_val = train_test_split(X_train, Y_train,
                                                      test_size=0.1,
                                                      random_state=SEED)
    # train_model(X_train, X_val, Y_train, Y_val)
    model = load_model('model.h5')
    # To test new data, load the data, separate the features from the labels
    # and preprocess the data. Then change X_val and Y_val to the desired data
    final_loss, final_accuracy = model.evaluate(X_val, Y_val, verbose=0)
    print('Final Loss: {:.4f}, Final Accuracy: {:.4f}'.format(
        final_loss, final_accuracy))
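# Hedged usage sketch (not in the original script): plot_confusion_matrix reads
# the module-level `model` loaded in the __main__ block above, so once that
# block has run, the validation confusion matrix could be plotted with:
#
#     plot_confusion_matrix(X_val, Y_val)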
def main():
    options = parse_arguments()

    functional_features, non_functional_features, normal_ff, normal_nff = \
        split_features(load_train(), selected_attack_class=options.attack)
    nff_attributes, labels_mal = preprocess(non_functional_features,
                                            normalize=options.normalize)
    normal_attributes, labels_nor = preprocess(normal_nff,
                                               normalize=options.normalize)
    n_attributes = nff_attributes.shape[1]
    trainingset = (normal_attributes, nff_attributes, labels_nor, labels_mal)

    functional_features, non_functional_features, normal_ff, normal_nff = \
        split_features(load_val(), selected_attack_class=options.attack)
    nff_attributes, labels_mal = preprocess(non_functional_features,
                                            normalize=options.normalize)
    normal_attributes, labels_nor = preprocess(normal_nff,
                                               normalize=options.normalize)
    n_attributes = nff_attributes.shape[1]
    validationset = (normal_attributes, nff_attributes, labels_nor, labels_mal)

    model = WGAN(options, n_attributes)
    model.train(trainingset, validationset)

    # save model
    if options.save_model is not None:
        save_model_directory = os.path.join(options.save_model, options.name)
        os.makedirs(save_model_directory, exist_ok=True)
        model.save(save_model_directory)
    writer.writerow(review.__dict__)


def remove_diacritic(input):
    """
    Accept a unicode string, and return a normal string without any
    diacritical marks.

    input arguments:
        input: the string to strip accents from
    output arguments:
        the stripped input
    """
    return unicodedata.normalize('NFKD', input).encode('ASCII', 'ignore')


if __name__ == "__main__":
    dataset = sys.argv[1]
    if dataset == 'train':
        reviews = data.load_train()
    elif dataset == 'test':
        reviews = data.load_test()
    else:
        raise ValueError('No dataset ' + dataset + ' found!')
    print "reviews loaded"

    reviews_dict_languages = split_by_language(reviews)
    for k, v in reviews_dict_languages.iteritems():
        print k
        review_list = correct_spelling_and_stem(k, v)
        print "corrected and stemmed"
        save_reviews_to_csv(k, review_list, dataset)
        print "saved to csv"
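# Hedged example for remove_diacritic (not in the original, Python 2 style to
# match the script above): NFKD normalization splits an accented character into
# a base character plus a combining mark, and the ASCII encode with 'ignore'
# then drops the marks.
#
#     >>> remove_diacritic(u'Crème brûlée')
#     'Creme brulee'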
'''
from nltk.corpus import stopwords
from textblob import TextBlob
import nltk
from nltk import word_tokenize as wt
from nltk.stem import WordNetLemmatizer

from gensim import corpora, models, similarities
from gensim.models import LdaModel

stop = set(stopwords.words())

from data import load_train

sentences, label = load_train()

texts = [[word for word in document.lower().split() if word not in stop]
         for document in sentences]

dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

topic_num = 100
lda = LdaModel(corpus, num_topics=topic_num, eval_every=5, passes=200)  # train model
max_num = topic_num + 10
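# Follow-up sketch (an assumption, not part of the original script): once the
# LdaModel above is trained, its topics can be inspected with gensim's
# show_topics, e.g. the top words of the first few topics.
for topic_id, words in lda.show_topics(num_topics=5, num_words=8, formatted=False):
    print(topic_id, [word for word, _ in words])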
    print(title)
    start = time.time()
    pipeline.fit(x, y)
    train_time = time.time()
    print(f'Training time: {train_time-start}')
    print(f'Training accuracy: {pipeline.score(x, y)}')
    print(f'Validation accuracy: {pipeline.score(x_v, y_v)}')
    print(f'Scoring time: {time.time()-train_time}')
    save_model(pipeline, f'{model_name}.joblib')


if __name__ == '__main__':
    print('Loading data...')
    start = time.time()
    max_features = None
    data_train = load_train()
    train, validation = train_test_split(data_train, test_size=0.3, random_state=42)
    x, y = zip(*train)
    x_v, y_v = zip(*validation)
    print(f'Time to load data: {time.time()-start}')
    print(f'Training with max_features: {max_features}')

    logreg_unigram_tfidf = logreg_bigram_tfidf = sgd_unigram_tfidf = \
        sgd_bigram_tfidf = linsvc_unigram_tfidf = linsvc_bigram_tfidf = \
        nb_unigram = True
    # logreg_unigram_tfidf = False
    # logreg_bigram_tfidf = False
    # sgd_unigram_tfidf = False
    # sgd_bigram_tfidf = False
    # linsvc_unigram_tfidf = False
import data
from bs4 import BeautifulSoup
import re
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from sklearn.linear_model import LogisticRegression
from data import load_train
from data import load_test
from matplotlib import pyplot as plt
from create_submission import write_predictions_to_csv
import math

# Loading training set
tot_train = load_train()

# Extracting the reviews' contents
tot_x_content_train = [review.content for review in tot_train][:196539]

# Creating the labels for the training set, taking them directly from the data
tot_y_train = np.array([review.rating for review in tot_train])[:196539]

import nltk
nltk.download()
from nltk.corpus import stopwords

stop_words = stopwords.words("english")
print stop_words

# Add words that occur very frequently in hotel reviews to the stop-words list.
stop_words.extend([
    u'hotel', u'hotels', u'room', u'rooms', u'night', u'nights', u'location',
    u'bed', u'beds', u'place', u'breakfast', u'position', u'station', u'stay',
    u'stayed', u'staff', u'accomodation', u'accommodations', u'during',
        pass
    if len(vecs) == 0:
        printv('Warning: entirely OOV tweet, zeroing...')
        return np.zeros(300)
    return np.mean(vecs, axis=0)


if __name__ == '__main__':
    print('Loading word2vec...')
    embeddings = gensim.models.KeyedVectors.load_word2vec_format(WORD2VEC, binary=True)

    print('Loading training data...')
    train = data.load_train()  # [:LIMIT]
    dic = data.load_dic()

    print('Computing tweet averages...')
    X = np.zeros(shape=(len(train), 300))
    y = np.zeros(shape=(len(train), ), dtype=int)
    for i, tweet in enumerate(train):
        X[i] = tweet_embedding_by_average(tweet[0], dic, embeddings)
        y[i] = tweet[1]

    print('Training the model...')
    clf = RandomForestClassifier(n_estimators=200, max_depth=10)
    clf.fit(X, y)

    print('Loading test data...')
    test = data.load_test()  # [:LIMIT]
    batch_scores = sess.run(cnn.q_ap_cosine, feed_dict)
    for score, qid, label in zip(batch_scores, qids, labels):
        scoreDict.setdefault(qid, [])  # default must be a new list, not the list type
        scoreDict[qid].append([score, label])

    lev1 = .0
    lev0 = .0
    for k, v in scoreDict.items():
        v.sort(key=operator.itemgetter(0), reverse=True)
        score, flag = v[0]
        if flag == '1':
            lev1 += 1
        if flag == '0':
            lev0 += 1
    # numbers of correct and incorrect answers
    print('correct answers: ' + str(lev1))
    print('incorrect answers: ' + str(lev0))
    print('accuracy: ' + str(float(lev1) / (lev1 + lev0)))


# evaluate every 5000 steps
evaluate_every = 5000

# start training and evaluation
sess.run(tf.global_variables_initializer())
for i in range(config.num_epochs):
    for (_, x_batch_1, x_batch_2, x_batch_3) in data.load_train(
            config.batch_size, config.sequence_length, config.sequence_length):
        train_step(x_batch_1, x_batch_2, x_batch_3)
        if (i + 1) % evaluate_every == 0:
            print("\nEvaluation {}:".format((i + 1) / evaluate_every))
            dev_step()
            print()
def main():
    logs = {
        'start-time': now(),
        'lock': LOCK,
        'num_workers': WORKERS,
        'reg_lambda': REG_LAMBDA,
        'epochs': EPOCHS,
        'learning_rate': LEARNING_RATE
    }

    # Logging configuration
    logging.basicConfig(filename='logs/tmp_logs.txt', level=logging.WARNING)

    with Manager() as manager:
        logging.warning("{}:Loading Training Data...".format(now()))
        logging.warning("{}:FULL TEST {}".format(now(), FULL_TEST))
        logging.warning("{}:WORKERS {}".format(now(), WORKERS))
        logging.warning("{}:LOCK {}".format(now(), LOCK))

        val, train = data.load_train()
        train = manager.dict(train)
        dim = max([max(k) for k in train['features']]) + 1
        init_w = [0.0] * dim

        if LOCK:
            lock = Lock()
            w = Array(c_double, init_w, lock=lock)
        else:
            w = RawArray(c_double, init_w)

        logs['start-compute-time'] = now()
        start_time = time()
        logging.warning("{}:Starting SGD...".format(
            logs['start-compute-time']))

        val_queue = Queue()
        workers = []
        for worker in range(WORKERS):
            p = Process(target=sgd, args=(worker, train, w, val_queue))
            p.start()
            workers.append(p)

        logs['epochs-stats'] = []

        # Initial early stopping variables
        persistence = [0.0] * PERSISTENCE
        smallest_val_loss = float('inf')
        workers_done = [False] * WORKERS

        while True:
            workers_alive = any([p.is_alive() for p in workers])
            if not workers_alive:
                logging.warning("{}:WORKERS DONE!".format(now()))
                logs['end-compute-time'] = now()
                logging.warning("{}:END TIME {}".format(
                    now(), time() - start_time))
            if not workers_alive and val_queue.empty():
                logging.warning("{}:WORKERS DONE AND QUEUE EMPTY!".format(
                    now()))
                final_weights = w[:]
                break

            # Block until getting a message
            val_queue_item = val_queue.get()
            worker = val_queue_item['worker']
            epoch = val_queue_item['epoch']
            weights = val_queue_item['weights']
            val_loss = loss(val, weights)
            logging.warning("{}:EPOCH:{}".format(now(), epoch))
            logging.warning("{}:VAL. LOSS:{}".format(now(), val_loss))
            logs['epochs-stats'].append({
                'epoch_number': epoch,
                'val_loss': val_loss
            })

            # Early stopping criteria
            persistence[epoch % PERSISTENCE] = val_loss
            if smallest_val_loss < min(persistence):
                # Early stop
                logging.warning("{}:EARLY STOP!".format(now()))
                # Terminate all workers, but save the weights before
                # because a worker could have a lock on them. Terminating
                # a worker doesn't release its lock.
                final_weights = w[:]
                for p in workers:
                    p.terminate()
                logs['end-compute-time'] = now()
                logging.warning("{}:END TIME {}".format(
                    now(), time() - start_time))
                break
            else:
                smallest_val_loss = val_loss if val_loss < smallest_val_loss else smallest_val_loss

        # Close queue
        val_queue.close()
        val_queue.join_thread()

        logging.warning("{}:Calculating Train Accuracy".format(now()))
        train_accuracy = accuracy(train, final_weights)
        logs['train_accuracy'] = train_accuracy
        logging.warning("{}:TRAIN ACC:{}".format(now(), train_accuracy))

        # Calculate test accuracy
        logging.warning("{}:Calculating Test Accuracy".format(now()))
        test = data.load_test(FULL_TEST)
        test_accuracy = accuracy(test, final_weights)
        logs['test_accuracy'] = test_accuracy
        logging.warning("{}:TEST ACC:{}".format(now(), test_accuracy))

        logs['end_time'] = now()
        with open(
                'logs/logs.w_{}.l_{}.e_{}.time_{}.json'.format(
                    WORKERS, LOCK, EPOCHS, logs['start-time']), 'w') as f:
            json.dump([logs], f)
                     emb_dropout=0.1)

    model = MoCo(dim=args.moco_dim,
                 K=args.moco_k,
                 m=args.moco_m,
                 T=args.moco_t,
                 ver=args.version,
                 arch=args.arch,
                 bn_splits=args.bn_splits,
                 symmetric=args.symmetric,
                 v3_encoder=vit).cuda()
    print(model)
    # exit(0)

    train_data, train_loader = load_train(args)
    memory_data, memory_loader = load_memory(args)
    test_data, test_loader = load_test(args)

    # define optimizer
    if args.version == 3:
        optimizer = torch.optim.AdamW(model.parameters(),
                                      lr=args.lr,
                                      weight_decay=args.wd)
    else:
        optimizer = torch.optim.SGD(model.parameters(),
                                    lr=args.lr,
                                    weight_decay=args.wd,
                                    momentum=0.9)

    # load model if resume
def main(unused_argv):
    '''
    Start training and evaluation.
    '''
    with tf.device('/gpu:0'), tf.Session(config=config.cf) as sess:
        # build the CNN network
        cnn = QACNN(config, sess)
        # writer for metrics data
        tf_writer = tf.summary.FileWriter(logdir=os.path.join(
            curdir, 'sdist/'), graph=sess.graph)

        # Summaries for loss and accuracy during training
        summary_loss = tf.summary.scalar("train/loss", cnn.loss)
        summary_accu = tf.summary.scalar("train/accuracy", cnn.accu)
        summary_op = tf.summary.merge([summary_loss, summary_accu])

        # training step
        def train_step(x_batch_1, x_batch_2, x_batch_3):
            feed_dict = {
                cnn.q: x_batch_1,
                cnn.aplus: x_batch_2,
                cnn.aminus: x_batch_3,
                cnn.keep_prob: config.keep_prob
            }
            _, step, loss, accuracy, summaries = sess.run([
                cnn.train_op, cnn.global_step, cnn.loss, cnn.accu, summary_op
            ], feed_dict)
            tf_writer.add_summary(summaries, step)
            time_str = datetime.datetime.now().isoformat()
            print("{}: step {}, loss {:g}, acc {:g}".format(
                time_str, step, loss, accuracy))
            return time_str, step, loss, accuracy

        # evaluation step
        def dev_step(step):
            # evaluate with a confusion matrix
            # http://www.uta.fi/sis/tie/tl/index/Rates.pdf
            quality = {'tp': 0, 'tn': 0, 'fp': 0, 'fn': 0}
            losses = []
            labels = []
            scores = []
            pbar = tqdm(config.test_data)
            pbar.set_description("evaluate step %s" % step)
            for x in pbar:
                _, loss, score = cnn.predict(
                    dict({
                        'question': x[1],
                        'utterance': x[2]
                    }), x[3])
                scores.append(score)
                losses.append(loss)
                labels.append(x[3])
                # use the ROC curve to derive a threshold
                # http://alexkong.net/2013/06/introduction-to-auc-and-roc/
                fpr, tpr, th = metrics.roc_curve(labels, scores)
                threshold = round(metrics.auc(fpr, tpr), 5)
                if score >= threshold and x[3] == 1:
                    quality['tp'] += 1
                elif score >= threshold and x[3] == 0:
                    quality['fp'] += 1
                elif score < threshold and x[3] == 1:
                    quality['fn'] += 1
                else:
                    quality['tn'] += 1
            accuracy = float(quality['tp'] + quality['tn']) / (
                quality['tp'] + quality['tn'] + quality['fp'] + quality['fn'])
            loss = tf.reduce_mean(losses).eval()
            tf_writer.add_summary(
                tf.Summary(value=[
                    tf.Summary.Value(tag="evaluate/loss", simple_value=loss),
                    tf.Summary.Value(tag="evaluate/accuracy",
                                     simple_value=accuracy)
                ]), step)
            print('evaluation @ step %d: accuracy: %f, loss: %s, threshold: %f'
                  % (step, accuracy, loss, threshold))

        # evaluate every 500 steps
        # start training and evaluation
        sess.run(tf.global_variables_initializer())
        for i in range(config.num_epochs):
            for (_, x_question, x_utterance, y) in data.load_train(
                    config.batch_size, config.sequence_length,
                    config.sequence_length):
                # in the last mini-batch of an epoch, the number of examples
                # may not equal batch_size
                if len(_) == config.batch_size:
                    _, global_step, _, _ = train_step(x_question, x_utterance, y)
                    if global_step % FLAGS.evaluate_every == 0:
                        dev_step(global_step)
def main():
    logs = {
        'start-time': now(),
        'num_workers': PARTITIONS,
        'reg_lambda': REG_LAMBDA,
        'epochs': EPOCHS,
        'batch': BATCH,
        'learning_rate': LEARNING_RATE
    }

    # Logging configuration
    logging.basicConfig(filename='/data/logs/tmp_logs.txt',
                        level=logging.WARNING)
    logging.warning("{}:Loading Training Data...".format(now()))

    # Load data
    val_df, train_df = data.load_train(spark)

    # Collect validation for loss computation
    val_collected = val_df.collect()

    # Create initial weight vector
    dimensions = train_df.rdd \
        .map(lambda row: max(row.features.keys())).max() + 1
    w = [0.0] * dimensions

    # Create the partitions of the train dataset
    partitions = train_df.rdd.zipWithIndex() \
        .map(lambda x: (x[1], x[0])) \
        .partitionBy(PARTITIONS)

    persistence = [0.0] * PERSISTENCE
    smallest_val_loss = float('inf')

    logs['start-compute-time'] = now()
    logging.warning("{}:Starting SGD...".format(logs['start-compute-time']))
    logs['epochs-stats'] = []

    for epoch in range(EPOCHS):
        epoch_stat = {'epoch_number': epoch, 'epoch_start': now()}
        logging.warning("{}:EPOCH:{}".format(now(), epoch))

        # Broadcast w to make it available for each worker
        w_b = sc.broadcast(w)

        # Calculate Mini Batch Gradient Descent for each partition
        partition_deltas_w = \
            partitions.mapPartitions(lambda x: sgd(x, w_b)).collect()

        # Collect total update weights for all workers in one epoch
        total_delta_w = {}
        for delta_w in partition_deltas_w:
            for k, v in delta_w.items():
                if k in total_delta_w:
                    total_delta_w[k] += v
                else:
                    total_delta_w[k] = v

        # Update weights
        for k, v in total_delta_w.items():
            w[k] += LEARNING_RATE * v

        val_loss = loss(val_collected, w)
        epoch_stat['val_loss'] = val_loss
        epoch_stat['epoch_end'] = now()
        logs['epochs-stats'].append(epoch_stat)
        logging.warning("{}:VAL. LOSS:{}".format(now(), val_loss))

        # Early stopping criteria
        persistence[epoch % PERSISTENCE] = val_loss
        if smallest_val_loss < min(persistence):
            # Early stop
            logging.warning("{}:EARLY STOP!".format(now()))
            break
        else:
            smallest_val_loss = val_loss if val_loss < smallest_val_loss else smallest_val_loss

    logs['end-compute-time'] = now()

    logging.warning("{}:Calculating Train Accuracy".format(now()))
    train_accuracy = accuracy(train_df, w)
    logs['train_accuracy'] = train_accuracy
    logging.warning("{}:TRAIN ACC:{}".format(now(), train_accuracy))

    logging.warning("{}:Calculating Test Accuracy".format(now()))
    test_df = data.load_test(spark)
    test_accuracy = accuracy(test_df, w)
    logs['test_accuracy'] = test_accuracy
    logging.warning("{}:TEST ACC:{}".format(now(), test_accuracy))

    spark.stop()
    logs['end_time'] = now()

    with open(
            '/data/logs/logs.workers_{}.batch_{}.epochs_{}.time_{}.json'.format(
                PARTITIONS, BATCH, EPOCHS, logs['start-time']), 'w') as f:
        json.dump([logs], f)
import sys
import importlib

from tqdm import tqdm

import data

data.load_train()

import models


def main():
    # interpret command line arguments
    if len(sys.argv) <= 1:
        print(
            "please specify a model name (e.g. models.baseline.random_handle)")
        sys.exit(1)
    module_name = sys.argv[1]
    hyper_parameters = models.parse_hyper_parameters(sys.argv[2:])

    # training model
    module = importlib.import_module(module_name)
    print(
        f"Training {module_name}.Model with hyperparameters {hyper_parameters}"
    )
    model = module.Model(tqdm(data.TRAIN, dynamic_ncols=True),
                         **hyper_parameters)
    print("Training done!")
    models.save(model, module_name, hyper_parameters)
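# Hedged usage sketch (the script name is an assumption): with the layout above,
# a model would be trained from the command line as, e.g.:
#
#     python train.py models.baseline.random_handle
#
# optionally followed by key=value hyper-parameters understood by
# models.parse_hyper_parameters; the trained model is then persisted via
# models.save().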
def main(
    data_path,
    train_data_path,
    val_data_path,
    test_data_path,
    output_path,
    prediction_name='suggestion.json',
    cache_dir=None,
    model_type='lda',
):
    '''
    Train a model and make a prediction.

    Args:
        data_path: path to the data json file
        train_data_path: path to the train data
        val_data_path: path to the val data
        test_data_path: path to the test data
        output_path: path to the output dir
        prediction_name: the name of the prediction output file
        cache_dir: where to save the cache
        model_type: which model to use

    Returns:
        None
    '''
    # load data
    print('Loading data')
    documents, titles = data.load_doc_title(
        data_path,
        cache_path=os.path.join(cache_dir, 'preproccessed')
        if cache_dir is not None else None,
    )
    train_data = data.load_train(train_data_path)
    val_data = data.load_val(val_data_path)
    test_data = data.load_test(test_data_path)

    # convert to corpus if needed
    if model_type in ('lda', ):
        print('Preparing corpus')
        dictionary = utils.make_dictionary(
            documents.content,
            cache_path=os.path.join(cache_dir, 'dictionary')
            if cache_dir is not None else None,
            filter_=False,
        )
        documents['bow'] = utils.make_corpus(documents.content, dictionary)
        titles['bow'] = utils.make_corpus(titles.content, dictionary)

    # train
    print('Training model')
    if model_type == 'lda':
        model = engine.CustomLDA(documents, titles, dictionary)
        model = model.train(train_data, val_data, output_path)
    elif model_type == 'doc2vec':
        model = engine.CustomDoc2vec(documents, titles)
        model = model.train(train_data, val_data, output_path)
    else:
        raise ValueError(model_type)

    # inference
    prediction = model.predict(test_data)
    prediction_output = os.path.join(output_path, prediction_name)
    data.dump_prediction(prediction, prediction_output)
    return