def printUpgrades(currentGear, allGear):
    headerString = 'UPGRADES'
    print('\n\n\n%s\n%s\n' % (headerString, '-' * len(headerString)))
    headers = ['DPS Diff', 'Name', 'ilvl', 'Location', 'Boss']
    for slot in sorted(list(currentGear.keys())):
        piece = currentGear[slot]
        print('\n\n\n%s (%s, %.2f DPS)' % (slot, piece['Name'], piece['DPS']))
        actualSlotString = CalcUtil.removeUnderscore(slot)
        # Collect every piece for this slot that would be a DPS upgrade
        outputItems = []
        for otherName in allGear[actualSlotString]:
            otherPiece = copy.deepcopy(allGear[actualSlotString][otherName])
            otherPiece['DPS Diff'] = otherPiece['DPS'] - piece['DPS']
            if otherPiece['DPS Diff'] > 0:
                outputItems.append(otherPiece)
        # Sort upgrades from highest to lowest DPS (key-based sort works on both Python 2 and 3)
        outputItems.sort(key=lambda p: p['DPS'], reverse=True)
        print('')
        print(DataUtil.getTabulated(outputItems, headers))
        print('')
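DataUtil.getTabulated is project-specific; a minimal sketch of the same idea using the third-party tabulate package (an assumption for illustration, not the project's implementation):

# Sketch of tabulating a list of dicts, assuming the `tabulate` package is installed.
from tabulate import tabulate

def get_tabulated(items, headers):
    # Build rows in the requested column order, leaving blanks for missing keys.
    rows = [[item.get(h, '') for h in headers] for item in items]
    return tabulate(rows, headers=headers)

print(get_tabulated([{'Name': 'Ring of Testing', 'ilvl': 385, 'DPS': 120.5}],
                    ['Name', 'ilvl', 'DPS']))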
def main(num_epochs=100, n_splits=5):
    data_util = DataUtil('data', 'spectrogram_data')
    X, y = data_util.get_data()
    kf = KFold(n_splits=n_splits, shuffle=True)
    test_accuracy_sum = 0
    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        model = Model(data_util.height, data_util.width)
        param_values, threshold = train_and_validate(model, X_train, y_train, num_epochs)
        model.set_param_values(param_values)
        test_accuracy_sum += perform_validation(model, X_test, y_test, threshold)
    print("Cross-validation results:")
    print("  accuracy:\t\t{:.2f} %".format(test_accuracy_sum / n_splits * 100))
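The snippet above averages test accuracy over KFold splits; a self-contained sketch of the same pattern with scikit-learn, where LogisticRegression stands in for the project's Model and helpers:

# Self-contained k-fold accuracy averaging; the classifier and data are stand-ins.
import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold

X, y = make_classification(n_samples=200, n_features=10, random_state=0)
kf = KFold(n_splits=5, shuffle=True, random_state=0)
accuracy_sum = 0.0
for train_index, test_index in kf.split(X):
    clf = LogisticRegression(max_iter=1000)
    clf.fit(X[train_index], y[train_index])
    accuracy_sum += clf.score(X[test_index], y[test_index])
print("mean accuracy: {:.2f} %".format(accuracy_sum / kf.get_n_splits() * 100))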
def __init__(self, charInfoPath):
    """
    Slots (currently - adding weapons and trinkets later):
        'Back', 'Belt', 'Bracer', 'Chest', 'Feet', 'Gloves',
        'Head', 'Legs', 'Neck', 'Ring', 'Shoulder'
    """
    paths = {'allGear': 'AllGear.json', 'trinkets': 'Trinkets.json'}
    self.charInfo = FileUtil.getJSONContents(charInfoPath)
    allGearList = FileUtil.getJSONContents(paths['allGear'])
    allTrinketsList = FileUtil.getJSONContents(paths['trinkets'])
    # They don't explicitly say that they're trinkets
    for trink in allTrinketsList:
        trink['Slot'] = 'Trinket'
    # Combine all gear into one list
    self.allGear = []
    self.allGear.extend(allGearList)
    self.allGear.extend(allTrinketsList)
    # Then turn that list into a map from name to the piece of gear
    self.allGear = DataUtil.toMap(self.allGear, 'Name')
    # Load the current gear into memory
    self.currentGear = DataUtil.statifyNamedGear(
        self.charInfo['Current Gear'], self.allGear)
    # TODO
    # See if this doesn't break things later on in execution
    # (might not be kosher if slotified this early)
    self.allGear = CalcUtil.slotifyAllGear(self.allGear)
    # Calculate each piece's DPS
    for name in self.currentGear:
        piece = self.currentGear[name]
        piece['DPS'] = CalcUtil.calcDPS(piece, self.charInfo)
    # Get some basic overall stats about the current gear
    self.totalStats = CalcUtil.getTotalStats(self.currentGear, Globals.allStats)
def __init__(self):
    self.entity_extractor = EntityExtractor()
    self.encoder = Encoder()
    self.action_manipulator = ActionManipulator()
    self.config = Config()
    self.du = DataUtil()
    obs_size = (self.config.u_embed_size + self.config.vocab_size
                + self.config.feature_size)
    self.action_templates = self.action_manipulator.get_action_templates()
    self.model = Model()
def ensureUniqueGearNames():
    allGearList = FileUtil.getJSONContents('AllGear.json')
    allGear = DataUtil.toMap(allGearList, 'Name')
    print('%d items in list' % len(allGearList))
    print('%d items in map' % len(list(allGear.keys())))
    nRings = 0
    for name in allGear:
        piece = allGear[name]
        if piece['Slot'] == 'Ring':
            nRings += 1
    print('%d rings' % nRings)
def printAllGear(slot, globs):
    nameToPiece = globs.allGear[slot]
    charInfo = globs.charInfo
    for name in nameToPiece:
        piece = nameToPiece[name]
        piece['DPS'] = CalcUtil.calcDPS(piece, charInfo)
    # Sort pieces from highest to lowest DPS (key-based sort works on both Python 2 and 3)
    items = sorted(nameToPiece.values(), key=lambda p: p['DPS'], reverse=True)
    headers = ['Name', 'ilvl', 'DPS', 'Location', 'Boss']
    print(DataUtil.getTabulated(items, headers))
def predict(self, data):
    logging.info("predict data....")
    # data must be a list
    if type(data) is not list:
        logging.error("data's type is not list.")
        raise Exception("data's type is not list.")
    if len(data) == 0:
        logging.error("num of data is 0!")
        raise Exception("num of data is 0!")
    if not os.path.exists('news_classifier_model.h5'):
        logging.info("news_classifier model does not exist!")
        self.train()
    wordVec = Word2Vector()
    embeddings = np.array(wordVec.embeddings)
    model = self.model(embeddings)
    adam = Adagrad(lr=0.01, epsilon=1e-06)
    model.compile(loss='binary_crossentropy', metrics=[ut.f_score], optimizer=adam)
    model.load_weights('news_classifier_model.h5')
    dataUtil = DataUtil("articles_testN")
    pre_data = dataUtil.filter_data(data, 1)
    pre_data = dataUtil.transfer_form(pre_data)
    # Pad/truncate content and title to fixed lengths before prediction
    pre_data["processed_content"] = sequence.pad_sequences(
        pre_data['processed_content'], maxlen=self.content_max_len,
        padding='post', truncating='post')
    pre_data["processed_title"] = sequence.pad_sequences(
        pre_data['processed_title'], maxlen=self.title_max_len,
        padding='post', truncating='post')
    result = model.predict_classes(
        [pre_data["processed_content"], pre_data["processed_title"]],
        batch_size=self.batch_size, verbose=1)
    for i in range(len(data)):
        data[i]["artitle_label"] = result[i][0]
    return data
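A small stand-alone illustration of the padding step above; the import path is the classic keras one (on newer installs it may live under tensorflow.keras.preprocessing), and the sequences and maxlen are made up:

# Post-padding and post-truncating variable-length sequences to a fixed length.
from keras.preprocessing import sequence

seqs = [[3, 7, 2], [9, 1, 4, 4, 8, 6], [5]]
padded = sequence.pad_sequences(seqs, maxlen=5, padding='post', truncating='post')
print(padded)
# [[3 7 2 0 0]
#  [9 1 4 4 8]
#  [5 0 0 0 0]]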
def printAllGear(allGear):
    headerString = 'ALL GEAR'
    print('\n\n\n%s\n%s\n' % (headerString, '-' * len(headerString)))
    for slot in sorted(list(allGear.keys())):
        print('\n\n%s' % slot.upper())
        slotPieces = allGear[slot]
        # Sort pieces from highest to lowest DPS (key-based sort works on both Python 2 and 3)
        sortedPieces = sorted(slotPieces.values(), key=lambda p: p['DPS'], reverse=True)
        headers = ['DPS', 'Name', 'ilvl', 'Location', 'Boss']
        print('')
        print(DataUtil.getTabulated(sortedPieces, headers))
        print('')
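The original sorts here used Python 2 cmp-style comparators; if a comparator must be kept verbatim under Python 3, functools.cmp_to_key adapts it. A tiny sketch with made-up data:

# Reusing an old cmp-style comparator on Python 3 via functools.cmp_to_key.
from functools import cmp_to_key

pieces = [{'Name': 'A', 'DPS': 10.0}, {'Name': 'B', 'DPS': 25.0}, {'Name': 'C', 'DPS': 17.5}]
pieces.sort(key=cmp_to_key(lambda p1, p2: int(p2['DPS'] - p1['DPS'])))
print([p['Name'] for p in pieces])  # ['B', 'C', 'A']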
def calculateDiffs(globs):
    # Assign stats to the current gear and print it out
    currentGear = copy.deepcopy(globs.currentGear)

    # Print current gear
    items = [currentGear[slot] for slot in sorted(list(currentGear.keys()))]
    headers = ['Slot', 'Name', 'ilvl', 'Location', 'Boss', 'DPS']
    print('\nCurrent gear (%s %s):\n' % (globs.charInfo['Spec'], globs.charInfo['Class']))
    print(DataUtil.getTabulated(items, headers))

    # Print stat DPS
    print('\n\n\nStat DPS:\n')
    for stat in globs.charInfo['Stat DPS']:
        value = globs.charInfo['Stat DPS'][stat]
        print('%s:\t%.4f' % (stat, value))
    print('')

    # TODO
    # Factored this out into the Globals constructor.
    # Might want to get rid of this for sure later.
    # # Partition all gear into slots
    # allGear = CalcUtil.slotifyAllGear(globs.allGear)

    # Return a new object, because mutation is wonky
    out = {}
    for slot in list(globs.allGear.keys()):
        out[slot] = {}
        # curPiece = currentGear[slot]
        actualSlotString = CalcUtil.removeUnderscore(slot)
        otherPieces = globs.allGear[actualSlotString]
        for name in otherPieces:
            # Making a deep copy gets rid of issues with having 2 ring slots:
            # the second DPSDiff calculation would clobber the original one.
            otherPiece = copy.deepcopy(otherPieces[name])
            # otherPiece['DPSDiff'] = CalcUtil.calcDPSDiff(curPiece, otherPiece, globs.statDPS)
            otherPiece['DPS'] = CalcUtil.calcDPS(otherPiece, globs.charInfo)
            out[slot][name] = otherPiece
    return out
from data_util import DataUtil
from lstm import SemiLSTM

if __name__ == '__main__':
    # Following the Weibo convention, text is truncated to a maximum length of 140
    data_util = DataUtil()

    # 1. Build the LSTM network
    lstm = SemiLSTM(lr=1e-4, epochs=20, batch_size=50)
    feature, label = data_util.load_data('data/train.txt', True)
    unlabeled_data, _ = data_util.load_data('data/unlabeled.txt', False)
    test_data, test_label = data_util.load_data('data/test.txt', True)
    lstm.build_lstm([32])
    lstm.train_semi(feature, label, test_data, test_label, unlabeled_data,
                    round=5, saved_model='my-lstm')
    lstm.test(test_data, test_label)

    # 2. Use the trained model to predict whether a text is harmful speech
    saved_lstm = SemiLSTM(lr=1e-4, epochs=20, batch_size=50)
    text = '如何真正为自己的利益发声,而不被境外势力利用?那些势力并不关心你想要的民主,它们只想要中国弱下去'
    feature = data_util.extract_feature(text)
    result = saved_lstm.test_text(feature, saved_model='my-lstm')
    print(result)

    text = '菅义伟在开记者会,两次鞠躬、向国民道歉,“没能解除紧急事态,我非常抱歉”。记者问,“没能解除紧急事态的原因是什么?您自己觉得充分向国民说明了吗?”v光计划 。'
    feature = data_util.extract_feature(text)
    result = saved_lstm.test_text(feature, saved_model='my-lstm')
    print(result)
class Coref_cluster(object):
    def __init__(self, config):
        self.config = config
        self.load_data()
        self.add_placeholder()
        scores = self.add_model()
        self.add_loss_and_train_op(scores)
        self.add_predict_op(scores)
        self.init_op = tf.initialize_all_variables()
        self.saver = tf.train.Saver()

    def load_data(self):
        self.du = DataUtil(self.config)
        self.max_as_count = self.du.max_as_count

    def add_placeholder(self):
        self.inputs = tf.placeholder(tf.float32)
        self.labels = tf.placeholder(tf.int32)
        self.deltas = tf.placeholder(tf.float32)

    def create_feed_dict(self, inputs, deltas=None, labels=None):
        feed = {self.inputs: inputs}
        if labels:
            feed[self.deltas] = deltas
            feed[self.labels] = labels
        return feed

    def add_model(self):
        # Three-layer feed-forward scorer over the mention-pair features
        x = tf.reshape(self.inputs, (-1, self.config.I))
        W1 = tf.get_variable('W1', [self.config.I, self.config.M1])
        b1 = tf.get_variable('b1', [self.config.M1])
        fc1 = tf.matmul(x, W1) + b1
        relu1 = tf.nn.relu(fc1)
        W2 = tf.get_variable('W2', [self.config.M1, self.config.M2])
        b2 = tf.get_variable('b2', [self.config.M2])
        fc2 = tf.matmul(relu1, W2) + b2
        relu2 = tf.nn.relu(fc2)
        W3 = tf.get_variable('W3', [self.config.M2, 1])
        b3 = tf.get_variable('b3', [1])
        fc3 = tf.matmul(relu2, W3) + b3
        scores = tf.abs(fc3)
        return scores

    def add_loss_and_train_op(self, scores):
        # Margin loss: candidate antecedents should score below the true antecedent by at least 1
        target_scores = tf.gather(scores, self.labels)
        scores = tf.reshape(scores, (-1, self.max_as_count))
        loss = 1 + scores - target_scores
        self.loss = tf.reduce_sum(tf.reduce_max(loss * self.deltas, 1))
        optimizer = tf.train.RMSPropOptimizer(self.config.learning_rate)
        self.train_op = optimizer.minimize(self.loss)

    def add_predict_op(self, scores):
        self.predictions = tf.argmax(
            tf.reshape(scores, (-1, self.max_as_count)), 1)

    def run_epoch(self, session, save=None, load=None):
        if not os.path.exists('./save'):
            os.makedirs('./save')
        if load:
            self.saver.restore(session, load)
        else:
            session.run(self.init_op)
        time0 = time.time()
        for epoch in range(self.config.epochs):
            time1 = time.time()
            shuffled_epoch_Rs, shuffled_epoch_HAs, shuffled_epoch_HTs, shuffled_epoch_deltas, \
                shuffled_answer_indices = self.du.get_shuffled_data_set()
            assert len(shuffled_epoch_HTs) == len(shuffled_answer_indices) == len(shuffled_epoch_deltas)
            start_ind = 0
            len_data_set = len(shuffled_epoch_Rs)
            step = 1
            time2 = time.time()
            best_loss = float('inf')
            loss = 0
            while start_ind < len_data_set:
                time3 = time.time()
                end_ind = start_ind + self.config.batch_size
                if end_ind > len_data_set:
                    end_ind = len_data_set
                    start_ind = end_ind - self.config.batch_size
                batch_Rs = shuffled_epoch_Rs[start_ind:end_ind]
                batch_As = shuffled_epoch_HAs[start_ind:end_ind]
                batch_Ts = shuffled_epoch_HTs[start_ind:end_ind]
                batch_labels = shuffled_answer_indices[start_ind:end_ind]
                batch_deltas = shuffled_epoch_deltas[start_ind:end_ind]
                batch_HAs = self.du.encode_mention_pairs(batch_Rs, batch_Ts, batch_As)
                start_ind = end_ind
                time4 = time.time()
                # Offset each label so it indexes into the flattened score vector
                batch_labels = [batch_labels[i] + self.max_as_count * i
                                for i in range(len(batch_labels))]
                feed = self.create_feed_dict(batch_HAs, batch_deltas, batch_labels)
                batch_loss, _ = session.run([self.loss, self.train_op], feed_dict=feed)
                time5 = time.time()
                loss += batch_loss
                if step % self.config.interval == 0:
                    print 'Epoch {}, Step {}, Time {:.2f}, Loss {:.2f}'.format(
                        epoch, step, time5 - time0, batch_loss)
                step += 1
            if best_loss >= loss / step:
                self.evaluation(session)
                if save is not None:
                    self.saver.save(session, save)
                else:
                    self.saver.save(session, './save/weight_{}'.format(epoch))

    def evaluation(self, session, load=None):
        if load:
            self.saver.restore(session, load)
        train_answer_indices, train_h_r_antecedents = \
            self.du.get_test_data(self.config.test_batch_size, 'train')
        feed1 = self.create_feed_dict(inputs=train_h_r_antecedents)
        predictions1 = session.run(self.predictions, feed_dict=feed1)
        test_answer_indices, test_h_r_antecedents = \
            self.du.get_test_data(self.config.test_batch_size, 'test')
        feed2 = self.create_feed_dict(inputs=test_h_r_antecedents)
        predictions2 = session.run(self.predictions, feed_dict=feed2)
        train_acc = metrics.accuracy_score(train_answer_indices, predictions1)
        test_acc = metrics.accuracy_score(test_answer_indices, predictions2)
        print '============================='
        print 'Training Accuracy: {:.4f}'.format(train_acc)
        print 'Testing Accuracy: {:.4f}'.format(test_acc)
        print '============================='
def test_getAverageHR(self):
    datautil = DataUtil('../.config/postgres.config')
    self.assertEqual(datautil.getAverageHR(0, 0, 0), 0)
    self.assertEqual(datautil.getAverageHR(0, -1, 0), 0)
    self.assertEqual(datautil.getAverageHR(45, -1, 45), 30)
    self.assertEqual(datautil.getAverageHR(45, 45, 45), 45)
dev_fname = 'dev-data-processed.json'
test_fname = 'test-data-processed.json'
modes = ['train_lm', 'test_lm', 'train_tri-an']

if __name__ == '__main__':
    if len(sys.argv) != 2 or sys.argv[1] not in modes:
        print('add one of the following arguments:', modes)
    else:
        mode = sys.argv[1]
        print('mode is', mode)
        config = Config()
        lm_config = LMConfig()
        data_util = DataUtil(config, lm_config, device)
        # define language model
        lm = LM(data_util.vocab_size, lm_config.embed_dim, lm_config.hidden_dim,
                data_util.embedding, lm_config.dropout, device).to(device)
        lm_train_util = LMTrainUtil(data_util.lm_train_iter, data_util.lm_dev_iter,
                                    lm, device, lm_config, data_util.vocab_size,
                                    data_util.TEXT)
        if mode == 'train_lm':
            # train language model
            lm_train_util.train_model()
        elif mode == 'test_lm':
import argparse
import configparser
import random

import torch

import config
from data_util import DataUtil, get_embd

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='cmod')
    parser.add_argument('-c', '--config', help='Config file path', required=True)
    cfg_parser = configparser.ConfigParser()
    args = parser.parse_args()
    cfg_parser.read(args.config)
    cfg = config.Config(cfg_parser)

    D = DataUtil(cfg)
    train_dataset = D.get_data('train')
    test_dataset = D.get_data('test')
    dev_dataset = D.get_data('dev')

    device = torch.device("cuda:0" if cfg.use_cuda() else "cpu")
    if cfg.sparse() and cfg.weight_decay() != 0:
        cfg.logger.error('Sparsity and weight decay are incompatible, pick one!')
        exit()

    torch.manual_seed(cfg.random_seed())
    random.seed(cfg.random_seed())
    if cfg.use_cuda():
    date: 2018/5/5 0005
-------------------------------------------------
    Change Activity:
        2018/5/5 0005:
-------------------------------------------------
"""
__author__ = 'Administrator'

import warnings
from unittest import TestCase

import pandas as pd

from data_helper import DataHelper
from data_util import DataUtil
from feature_integrate import *

d_h = DataHelper()
d_t = DataUtil()
path = '../data/dm/train.csv'
data = pd.read_csv(path)
fi = FeatureIntegrate()


class TestFeature(TestCase):
    def test_train_feature(self):
        """
        Ran 1 test in 3.251s
        """
        train_feature = fi.train_feature_integrate(data)
        print(train_feature.columns)
        assert True
num_epochs = 5
vocab_size = 8000
log_window = 30
val_size = 7300  # 4000  # validation size
make_model = False  # True
make_test = True
categories = ['politics', 'entertainment', 'sport', 'business']
model_path = 'models/textcnn_17-12-10_20-12-07.pkl'
train_path = 'train_ksj.json'
voca_path = 'voca.json'
val_path = 'val_ksj.json'
test_path = 'test_ksj.json'
savepath = ''

util = DataUtil(seq_length, vocab_size, batch_size, train_path, val_path, voca_path)
textcnn = TextCNN(seq_length, num_classes, vocab_size, embed_size,
                  filter_sizes, num_filters, dropout_prob)
textcnn.cuda()

# define loss & optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(textcnn.parameters(), lr=learning_rate)
num_batches = util.num_batch


def val_accuracy(data):
    # data = zip(input, label)
    textcnn.eval()
    num_correct = 0
    num_total = 0
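The val_accuracy helper above is cut off; a self-contained sketch of how such a helper is typically written in PyTorch, with a generic model and data iterable standing in for the project's TextCNN and batches:

# Sketch of a validation-accuracy helper; `model` and `data` are generic stand-ins.
import torch

def val_accuracy(model, data):
    # data is an iterable of (inputs, labels) batches
    model.eval()
    num_correct = 0
    num_total = 0
    with torch.no_grad():
        for inputs, labels in data:
            outputs = model(inputs)
            _, predicted = torch.max(outputs, dim=1)
            num_correct += (predicted == labels).sum().item()
            num_total += labels.size(0)
    model.train()
    return 100.0 * num_correct / num_total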
class Experiment:
    def __init__(self, training_epochs=50, sequence_length=20, batch_size=100,
                 learning_rate=0.001, dropout=0.2):
        # Hyper Parameters
        self.sequence_length = sequence_length
        self.embedding_size = 512
        self.hidden_size = 128
        self.num_layers = 1
        self.batch_size = batch_size
        self.learning_rate = learning_rate
        self.dropout = dropout
        self.data = DataUtil()
        self.data.load_split_data()
        print(self.data.get_dataset(self.data.TRAIN)[:5])
        self.data.build_vocab(self.data.get_dataset(self.data.TRAIN)
                              + self.data.get_dataset(self.data.TEST))
        self.model = GRURNN(self.embedding_size, self.hidden_size, self.num_layers,
                            self.data.input_lang.n_words,
                            self.data.output_lang.n_words, self.dropout)
        self.training_epochs = training_epochs
        self.epoch_start = 1
        self.use_cuda = torch.cuda.is_available()

    def as_minutes(self, s):
        m = math.floor(s / 60)
        s -= m * 60
        return '%dm %ds' % (m, s)

    def time_since(self, since, percent):
        now = time.time()
        s = now - since
        es = s / percent
        rs = es - s
        return '%s (- %s)' % (self.as_minutes(s), self.as_minutes(rs))

    def train(self, print_every=20, plot_every=100, learning_rate=0.01):
        start = time.time()
        plot_losses = []
        print_loss_total = 0
        plot_loss_total = 0
        # optimizer = optim.SGD(self.model.parameters(), lr=learning_rate)
        optimizer = optim.SGD(filter(lambda p: p.requires_grad, self.model.parameters()),
                              lr=learning_rate, momentum=0.9)
        # optimizer = optim.Adam(filter(lambda p: p.requires_grad, self.model.parameters()), lr=learning_rate)
        num_train_data = self.data.get_dataset_size(self.data.TRAIN)
        num_batches = int(np.ceil(num_train_data / float(self.batch_size)))
        log('num_batches: ' + str(num_batches))
        for epoch in range(self.epoch_start, self.training_epochs + 1):
            batch_start = time.time()
            correct = 0
            total = 0
            train_data = self.data.get_dataset(self.data.TRAIN)
            random.shuffle(train_data)
            self.model.train()
            for cnt, i in enumerate(random.sample(range(num_batches), num_batches), start=1):
                inputs, seq_lengths, targets = self.data.construct_batch(
                    self.batch_size * i, self.batch_size * (i + 1),
                    dataset=self.data.TRAIN)
                if self.use_cuda:
                    inputs = inputs.cuda()
                    targets = targets.cuda()
                optimizer.zero_grad()
                outputs = self.model(inputs, seq_lengths)
                _, predicted = torch.max(outputs.data, dim=1)
                total += targets.data.size(0)
                correct += (predicted == targets.data).sum()
                batch_train_acc = 100.0 * (predicted == targets.data).sum() / targets.data.size(0)
                # loss = F.nll_loss(outputs, targets)
                loss = F.cross_entropy(outputs, targets)
                loss.backward()
                optimizer.step()
                log("Epoch %d, batch %d / %d: train loss = %f, train accuracy = %f %%"
                    % (epoch, cnt, num_batches, loss.data[0], batch_train_acc))
                print_loss_total += loss.data[0]
                plot_loss_total += loss.data[0]
                if cnt % print_every == 0:
                    print_loss_avg = print_loss_total / print_every
                    print_loss_total = 0
                    log('Average batch loss: %s' % str(print_loss_avg))
                    log(self.time_since(batch_start, cnt * 1.0 / num_batches))
                if cnt % plot_every == 0:
                    plot_loss_avg = plot_loss_total / plot_every
                    plot_losses.append(plot_loss_avg)
                    plot_loss_total = 0
            log("epoch %s is done" % str(epoch))
            log('Train Accuracy: %f %%' % (100.0 * correct / total))
            log(self.time_since(start, epoch * 1.0 / self.training_epochs))
            # save intermediate training results
            save_path = "train_saved/epoch%s.pt" % str(epoch)
            torch.save(self.model, save_path)
            log('Model saved in file: %s' % save_path)
            # run test set after one epoch
            self.test()

    def test(self, epoch=-1):
        if epoch > 0:
            self.model = torch.load("train_saved/epoch%s.pt" % str(epoch))
            log('Model of epoch ' + str(epoch) + ' is restored.')
        self.model.eval()
        start = time.time()
        num_test_data = self.data.get_dataset_size(self.data.TEST)
        num_batches = int(np.ceil(num_test_data / float(self.batch_size)))
        log('num_batches: ' + str(num_batches))
        correct = 0
        total = 0
        loss = 0.0
        labels = []
        predictions = []
        for i in random.sample(range(num_batches), num_batches):
            inputs, seq_lengths, targets = self.data.construct_batch(
                self.batch_size * i, self.batch_size * (i + 1),
                dataset=self.data.TEST)
            if self.use_cuda:
                inputs = inputs.cuda()
                targets = targets.cuda()
            outputs = self.model(inputs, seq_lengths)
            _, predicted = torch.max(outputs.data, dim=1)
            total += targets.data.size(0)
            correct += (predicted == targets.data).sum()
            labels.extend(targets.data.numpy().tolist())
            predictions.extend(predicted.numpy().tolist())
            loss += F.cross_entropy(outputs, targets).data[0]
        log('Time used: ' + str(time.time() - start))
        log('Test loss: %f' % loss)
        log('Test Accuracy: %f %%' % (100.0 * correct / total))
        log('Test Precision: %f %%' % (100.0 * precision_score(labels, predictions, average='micro')))
        log('Test Recall: %f %%' % (100.0 * recall_score(labels, predictions, average='micro')))
        log('Test F1 Score: %f %%' % (100.0 * f1_score(labels, predictions, average='micro')))
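The test loop above reports micro-averaged precision, recall, and F1; a tiny self-contained illustration of those sklearn.metrics calls on made-up labels:

# Stand-alone illustration of the sklearn.metrics calls used above, with made-up labels.
from sklearn.metrics import precision_score, recall_score, f1_score

labels = [0, 1, 2, 2, 1, 0]
predictions = [0, 2, 2, 2, 1, 0]
print('Precision: %f %%' % (100.0 * precision_score(labels, predictions, average='micro')))
print('Recall:    %f %%' % (100.0 * recall_score(labels, predictions, average='micro')))
print('F1 Score:  %f %%' % (100.0 * f1_score(labels, predictions, average='micro')))
# With micro averaging over a single-label multiclass task, all three equal overall accuracy (5/6 here).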
    print "ERROR: need model destination filepath!"
    sys.exit(1)

if len(sys.argv) > 2:
    layer_arg = int(sys.argv[2])
else:
    layer_arg = 2

if len(sys.argv) > 3:
    ep_arg = int(sys.argv[3])
else:
    ep_arg = 20

# Read the data
print ">> Initializing data..."
reader = DataUtil(WORDVEC_FILEPATH, TAGGED_NEWS_FILEPATH)
X, Y = reader.get_data()
print X.shape
print Y.shape

# Train the model
print ">> Training model... epochs = {0}, layers = {1}".format(ep_arg, layer_arg)
nermodel = NERModel(reader)
nermodel.train(epochs=ep_arg, layers=layer_arg)

# Evaluate the model
print ">> Evaluating model..."
nermodel.evaluate()

# Save the model
parser.add_argument(
    '--varthresh',
    help='variance thresh (default 0 means take all)',
    type=float,
    default=0)
args = parser.parse_args()

rand_state = 1
n_cv = args.cv
n_iter_search = args.iter
sample_rate = args.sample
sub_sample = (False if sample_rate < 0 else True)
var_thresh = args.varthresh
scoring = 'log_loss'
verbose = 10

du = DataUtil()
du.load_data(sub_sample=sub_sample, sample_rate=sample_rate)
x_train, x_test = du.vectorize_x(
    ['brand_code', 'model_code', 'label_id_bag'],
    variance_thresh=var_thresh)
print('train set shape: ', x_train.shape)
print('test set shape: ', x_test.shape)

# xgb seems to have an issue detecting the number of columns with a sparse matrix,
# so patch the train data with an extra non-zero column
x_train_xgb = sp.hstack(
    (x_train, sp.csr_matrix(np.ones((x_train.shape[0], 1)))))
print('patching train data with non-zero column to get around xgb sparse issue')

y_train = du.get_y_train()
print('y_train shape: ', y_train.shape)
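A self-contained illustration of the scipy.sparse.hstack patch used above, appending an always-nonzero column to a CSR feature matrix:

# Appending a column of ones to a sparse CSR matrix; the identity matrix is just dummy data.
import numpy as np
import scipy.sparse as sp

x_train = sp.csr_matrix(np.eye(3))            # any sparse feature matrix
ones_col = sp.csr_matrix(np.ones((x_train.shape[0], 1)))
patched = sp.hstack((x_train, ones_col)).tocsr()
print(patched.shape)   # (3, 4) -- one extra, always-nonzero column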
__author__ = 'jdwang'
__date__ = 'create date: 2016-07-05'
__email__ = '*****@*****.**'
from data_util import DataUtil

final_test_file_path = '/home/jdwang/PycharmProjects/weiboStanceDetection/train_data/NLPCC2016_Stance_Detection_Task_A_Testdata.txt'
final_test_file_path = '/home/jdwang/PycharmProjects/weiboStanceDetection/data_processing/result/TaskA_all_testdata_15000.csv'
# Sentences sent to the classifier
final_test_classify_file_path = '/home/jdwang/PycharmProjects/weiboStanceDetection/train_data/TaskA_all_testdata_14966.csv'
# Labels produced by the classifier
clasify_result_file_path = '/home/jdwang/PycharmProjects/weiboStanceDetection/data_processing/result/cp_L_rf_1000tree_classify_label.csv'

data_util = DataUtil()
final_test_data = data_util.load_data(final_test_file_path)
print(final_test_data.head())
print(final_test_data.shape)
# quit()
# final_test_data = final_test_data[[]]

# Inspect rows whose WORDS field is null, then drop them
print(final_test_data[final_test_data['WORDS'].isnull()].shape)
print(final_test_data[final_test_data['WORDS'].isnull()])
final_test_data = final_test_data[final_test_data['WORDS'].notnull()]
data_util.save_data(final_test_data, 'result/TaskA_all_testdata_15000_A.csv')
# print(final_test_data.tail())
# print(final_test_data.sort_values(by=['ID']).tail())
quit()
final_test_classify_data = data_util.load_data(final_test_classify_file_path)
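A tiny self-contained example of the pandas null-filtering idiom above (dropping rows whose WORDS column is missing), on made-up data:

# Filtering out rows with a null column; the DataFrame contents are made up.
import pandas as pd

df = pd.DataFrame({'ID': [1, 2, 3], 'WORDS': ['好 的', None, '不 错']})
print(df[df['WORDS'].isnull()])      # rows that would be dropped
df = df[df['WORDS'].notnull()]       # keep only rows with text
print(df.shape)                      # (2, 2)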
class Reinforcer: def __init__(self): self.config = Config() self.du = DataUtil(self.config) self.sc = StockScraper(ASingleStockConfig()) self.config.ACTION_NUM = len(self.config.actions) self.memories = [] self.W1 = tf.get_variable('W1', [self.config.INPUT, self.config.M1]) self.b1 = tf.get_variable('b1', [self.config.M1]) self.W2 = tf.get_variable('W2', [self.config.M1, self.config.M2]) self.b2 = tf.get_variable('b2', [self.config.M2]) self.W3 = tf.get_variable('W3', [self.config.M2, 1]) self.b3 = tf.get_variable('b3', [1]) self.current_data = [] self.current_state = [] self.portfolio = { 'fund': 500000, 'stock_quantity': 50000, 'current_stock_price': 0, 'total': -1, 'stock_value': 0 } # self.init_op = tf.initialize_all_variables() self.init_placeholder() scores = self.batch_scoring_op() next_step_scores = self.batch_predict_op() self.add_loss_n_train_op(scores, next_step_scores) self.add_step_predict_op() self.saver = tf.train.Saver() self.init_op = tf.initialize_all_variables() def init_placeholder(self): self.states = tf.placeholder(tf.float32) self.rewards = tf.placeholder(tf.float32) self.states_next = tf.placeholder(tf.float32) def batch_scoring_op(self): x = tf.reshape(self.states, (self.config.BATCH_SIZE, self.config.INPUT)) scores = self.Q_network_op(x) return scores def add_loss_n_train_op(self, scores, next_scores): self.predict_scores = self.config.gamma * tf.reduce_max(next_scores, 1) self.viewing_scores = scores '''sarsa reward better?''' self.losses = (self.rewards + self.predict_scores - scores)**2 self.loss = tf.reduce_sum(self.losses) optimizer = tf.train.RMSPropOptimizer(self.config.lr) self.train_op = optimizer.minimize(self.loss) def add_step_predict_op(self): x = tf.reshape(self.states, (self.config.ACTION_NUM, self.config.INPUT)) scores = self.Q_network_op(x) self.prediction = tf.argmax(tf.reshape(scores, (-1, self.config.ACTION_NUM)), axis=1)[0] def Q_network_op(self, x): fc1 = tf.matmul(x, self.W1) + self.b1 tanh1 = tf.nn.tanh(fc1) tanh1 = tf.nn.dropout(tanh1, self.config.DROPOUT) fc2 = tf.matmul(tanh1, self.W2) + self.b2 tanh2 = tf.nn.tanh(fc2) tanh2 = tf.nn.dropout(tanh2, self.config.DROPOUT) scores = tf.matmul(tanh2, self.W3) + self.b3 scores = tf.squeeze(scores) return scores def batch_predict_op(self): x = tf.reshape(self.states_next, (self.config.BATCH_SIZE * self.config.ACTION_NUM, self.config.INPUT)) Q_scores = self.Q_network_op(x) Q_scores = tf.reshape(Q_scores, (self.config.BATCH_SIZE, self.config.ACTION_NUM)) return Q_scores def build_feed_dict(self, random_memories): feed = {} feed[self.states] = [m[0] for m in random_memories] states_next = [] feed[self.rewards] = [m[1] for m in random_memories] new_portfolios = [m[-1] for m in random_memories] new_datas = [m[-2] for m in random_memories] assert len(new_datas) == len(new_portfolios) for i in range(len(new_datas)): port = new_portfolios[i] data = new_datas[i] for action in self.config.actions: action = self.action_policy(action, port) port_to_be_evaluated = self.update_portfolio_after_action( port, action) print "predicting...", port_to_be_evaluated, action state_to_be_evaluated = self.du.preprocess_state( data, port_to_be_evaluated) states_next.append(state_to_be_evaluated) feed[self.states_next] = states_next return feed def action_policy(self, buy_quantity, portfolio): stock_price = portfolio['current_stock_price'] fund = portfolio['fund'] stock_quantity = portfolio['stock_quantity'] if buy_quantity > 0: if buy_quantity * stock_price > fund: quantity_max = fund / stock_price for action in 
self.config.actions[::-1]: if action <= quantity_max: buy_quantity = action return buy_quantity elif buy_quantity < 0: if -buy_quantity > stock_quantity: for action in self.config.actions: if -action <= stock_quantity: buy_quantity = action return buy_quantity else: return 0 @staticmethod def update_portfolio_after_action(portfolio, action): # print 'action', action port = copy(portfolio) if action == 0: return port else: # print 'a', port['fund'], 'b', port['current_stock_price'], 'c',action port['fund'] = port['fund'] - port['current_stock_price'] * action port['stock_quantity'] += action port['stock_value'] += port['current_stock_price'] * action return port @staticmethod def update_portfolio_after_fetch_price(portfolio, new_price): port = copy(portfolio) port['current_stock_price'] = new_price port['stock_value'] = new_price * port['stock_quantity'] port['total'] = port['stock_value'] + port['fund'] return port @staticmethod def calc_total_with_different_price(portfolio, price): return portfolio['stock_quantity'] * price + portfolio['fund'] @staticmethod # def calc_reward(new_portfolio, prev_portfolio): # return 1000.0*(new_portfolio['total'] - prev_portfolio['total']) / prev_portfolio['total'] def calc_reward(new_portfolio, prev_portfolio): '''reward compared to hold''' print "new, ", new_portfolio print "prev, ", prev_portfolio return new_portfolio[ 'total'] - Reinforcer.calc_total_with_different_price( prev_portfolio, new_portfolio['current_stock_price']) def run_epoch(self, session, save=None, load=None): if not os.path.exists('./save'): os.makedirs('./save') if load: self.saver.restore(session, load) else: session.run(self.init_op) while True: if self.portfolio['total'] == -1: init_data = self.sc.request_api() print init_data if init_data[self.config.open_price_ind]: # assert init_data[self.config.open_price_ind] == init_data[self.config.current_ind] self.portfolio['current_stock_price'] = init_data[ self.config.current_ind] self.portfolio['stock_value'] = self.portfolio[ 'stock_quantity'] * self.portfolio[ 'current_stock_price'] self.portfolio['total'] = self.portfolio[ 'stock_value'] + self.portfolio['fund'] self.current_data = init_data self.current_state = self.du.preprocess_state( init_data, self.portfolio) self.config.INPUT = len(self.current_state) else: print "market closed or stock halts" sys.exit(0) print self.config.INPUT is_exploration = random.random() assert self.portfolio['current_stock_price'] != 0 if is_exploration <= self.config.EPSILON: buy_quantity = random.choice(self.config.actions) print "random" else: candidates = [] for action in self.config.actions: action = self.action_policy(action, self.portfolio) candidate_portfolio = self.update_portfolio_after_action( self.portfolio, action) candidate_state = self.du.preprocess_state( self.current_data, candidate_portfolio) candidates.append(candidate_state) max_q_ind = sess.run(self.prediction, feed_dict={self.states: candidates}) buy_quantity = self.config.actions[max_q_ind] '''fetch!!!''' # time.sleep(self.sc.config.time_interval) new_data = self.sc.request_api() '''update my portfolio & get reward''' port_before_action = copy(self.portfolio) new_portfolio = self.update_portfolio_after_action( self.portfolio, buy_quantity) if (new_portfolio['current_stock_price'] * new_portfolio['stock_quantity'] + new_portfolio['fund']) != new_portfolio['total']: print( new_portfolio['current_stock_price'] * new_portfolio['stock_quantity'] + new_portfolio['fund']), new_portfolio['total'] print "*&&&*^*^&*^(&*&^%*&^%*&^" 
self.portfolio = new_portfolio self.current_state = self.du.preprocess_state( self.current_data, self.portfolio) new_price = new_data[self.config.current_ind] new_portfolio = self.update_portfolio_after_fetch_price( new_portfolio, new_price) assert (new_portfolio['current_stock_price'] * new_portfolio['stock_quantity'] + new_portfolio['fund']) == new_portfolio['total'] reward = self.calc_reward(new_portfolio, port_before_action) print "################### reward : ", reward, "####################" '''now current state is a state where price is old while action has been performed''' '''new_state is a state where price is new and with new portfolio, but has not made furthur action yet''' self.memories.append( (self.current_state, reward, new_data, new_portfolio)) '''update data and portfolio''' self.current_data = new_data self.portfolio = new_portfolio print "action taken: ", buy_quantity print "current portfolio: ", new_portfolio print "total: ", new_portfolio['total'] print "histroy: ", len(self.memories) print "wait for next tick ................\n" if len(self.memories) > 2 * self.config.BATCH_SIZE: random.shuffle(self.memories) batch = self.memories[:self.config.BATCH_SIZE] '''batch BS*I''' feed = self.build_feed_dict(batch) scores1, scores2, losses, loss, _ = sess.run([ self.predict_scores, self.viewing_scores, self.losses, self.loss, self.train_op ], feed_dict=feed) print loss
class Classifier:
    def __init__(self):
        self.config = Config()
        self.du = DataUtil()
        self.inv_map = {v: k for k, v in self.config.class_dict.iteritems()}

    def run_trainer(self):
        # load train set
        self.raw_sent, self.data, self.raw_labels = self.du.load_data_set()
        # load test set
        self.test_sent, self.test_data, self.test_raw_labels = self.du.load_data_set('test')
        # shuffle train set
        self.raw_sent, self.data, self.raw_labels = shuffle(
            self.raw_sent, self.data, self.raw_labels)
        # train/test classes as integers
        self.classes = self.du.convert_raw_label_to_class(
            self.raw_labels, self.config.class_dict)
        self.test_classes = self.du.convert_raw_label_to_class(
            self.test_raw_labels, self.config.class_dict)
        # convert to one-hot categories
        self.test_labels = keras.utils.to_categorical(self.test_classes,
                                                      self.du.config.n_classes)
        self.labels = keras.utils.to_categorical(self.classes,
                                                 self.du.config.n_classes)
        # compile model
        self.model = self.build_model()
        self.model.compile(
            loss=keras.losses.categorical_crossentropy,
            optimizer=keras.optimizers.RMSprop(lr=self.config.lr),
            metrics=['accuracy'])
        self.train()
        self.evaluate()
        self.model.save(self.config.final_round_model_path)

    def run_prediction(self, sentence):
        emb_sent = self.du.prepare_predict_data(sentence)
        emb_sent = emb_sent.reshape(
            [1, self.config.max_sent_len, self.config.emb_dim])
        self.config.dropout = 0
        self.model = load_model(self.config.dl_model_path)
        pred, prob = self.predict(emb_sent)
        print pred
        pred = pred[0]
        prob = prob[0]
        response = self.inv_map[pred]
        return response, prob

    def build_model(self):
        input = Input(shape=(self.config.max_sent_len, self.config.emb_dim))
        conv_output = Conv1D(self.config.n_filter,
                             kernel_size=self.config.filter_size,
                             strides=1,
                             activation="relu")(input)
        lstm_output = LSTM(self.config.lstm_dim,
                           dropout=self.config.dropout)(conv_output)
        out = Dense(self.config.n_classes,
                    activity_regularizer=l2(self.config.l2_rate),
                    activation="softmax")(lstm_output)
        model = Model(inputs=[input], outputs=[out])
        return model

    def train(self):
        check = keras.callbacks.ModelCheckpoint(self.du.config.dl_model_path,
                                                monitor='val_acc',
                                                verbose=1,
                                                save_best_only=True,
                                                save_weights_only=False,
                                                mode='auto',
                                                period=1)
        self.model.fit(self.data, self.labels,
                       batch_size=self.du.config.batch_size,
                       epochs=self.du.config.epochs,
                       verbose=1,
                       validation_split=0.1,
                       callbacks=[check])

    def predict(self, test_data):
        probs = self.model.predict(test_data)
        predictions = np.argmax(probs, axis=-1)
        return predictions, probs

    def evaluate(self):
        # self.config.dropout = 0
        # predictions, _ = self.predict(self.data)
        # comparison = (predictions == self.classes)
        # acc = np.mean(comparison)
        print self.model.evaluate(self.test_data, self.test_labels)
        self.generate_prediction_results(self.test_data, self.test_raw_labels,
                                         self.test_sent)

    def evaluate_on_model(self, model_path):
        self.test_sent, self.test_data, self.test_raw_labels = self.du.load_data_set('test')
        self.test_classes = self.du.convert_raw_label_to_class(
            self.test_raw_labels, self.config.class_dict)
        self.test_labels = keras.utils.to_categorical(self.test_classes,
                                                      self.du.config.n_classes)
        self.model = load_model(model_path)
        print self.model.evaluate(self.test_data, self.test_labels)
        self.generate_prediction_results(self.test_data, self.test_raw_labels,
                                         self.test_sent)

    def generate_prediction_results(self, data, raw_labels, raw_sents):
        with open(self.config.result_path, 'w') as f:
            predictions = self.model.predict(data)
            predictions = np.argmax(predictions, axis=-1)
            '''argsort'''
            # top_n = predictions.argsort(axis=-1, order=)
            for i in range(len(predictions)):
                sent = raw_sents[i]
                ground_truth = raw_labels[i]
                pred = self.inv_map[predictions[i]]
                res = '\t'.join([sent, ground_truth, pred]) + '\n'
                f.write(res)
        print "file written"
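The commented-out argsort line above is incomplete; a small sketch of pulling the top-n class indices per row from the probability matrix with numpy (note it should run on the probabilities returned by model.predict, not on the argmaxed predictions):

# Selecting the top-n classes per row from a probability matrix; the values are made up.
import numpy as np

probs = np.array([[0.1, 0.7, 0.2],
                  [0.5, 0.3, 0.2]])
n = 2
top_n = np.argsort(probs, axis=-1)[:, ::-1][:, :n]   # highest-probability classes first
print(top_n)   # [[1 2]
               #  [0 1]]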
WORDVEC_FILEPATH = "wordvecs.txt"
TAGGED_NEWS_FILEPATH = "news_tagged_data.txt"
SAVED_MODEL_FILEPATH = "model_blstm_150_150_ep50.h5"
NEWS_DATA_FILEPATH = "news_tagged_data.txt"
EXTRA_LOGGING = False
PRINT_BAD = True

if __name__ == "__main__":
    if len(sys.argv) > 1:
        n_samples = int(sys.argv[1])
    else:
        n_samples = sys.maxint
    reader = DataUtil(WORDVEC_FILEPATH, TAGGED_NEWS_FILEPATH)
    nermodel = NERModel(reader)
    nermodel.load(SAVED_MODEL_FILEPATH)
    with open(NEWS_DATA_FILEPATH, 'r') as f:
        cur_sentence = []
        cur_tags = []
        samples_read = 0
        total_frames = 0
        total_matched_frames = 0
        total_correct_preditions = 0
        for line in f:
            line = line.strip()
def my_print(*lists):
    """
    If the last element of the argument list is False, refuse to print.
    """
    print(DataUtil.decode(lists))
# Setup flask server
server = flask.Flask(__name__)
app = dash.Dash("Charge_Tracker",
                external_stylesheets=["https://codepen.io/chriddyp/pen/bWLwgP.css"])
cache = Cache(app.server, config={
    'CACHE_TYPE': 'filesystem',
    'CACHE_DIR': 'cache-directory'
})
TIMEOUT = 2
app.config['suppress_callback_exceptions'] = True

## GLOBAL DEFINITIONS
query_helper = DataUtil()
queries_sig = {'signames_ecg': [], 'signals': []}
queries_name = {'signames_ecg': []}
queries_name = {'signames_ecg': ['101', '103']}
queries_id = {'id': ['1']}
evnt_df = query_helper.getAllEvents()
# sig_df = query_helper.getECGSignal('50')
# x = list(range(len(sig_df.get_value(0, 'ecg'))))
# y_sig = sig_df.get_value(0, 'ecg')
# trace = []
# trace.append(go.Scatter(x=x, y=y_sig, mode='lines',
#                         marker={'size': 8, "opacity": 0.6, "line": {'width': 0.5}}))
# print("FIRST TRACE")
# print(trace)
class Experiment: def __init__(self, config, sequence_length=20, reload_data=True): # Hyper Parameters self.sequence_length = sequence_length self.hidden_size = 128 self.num_layers = 1 self.config = config self.data = DataUtil(data_dir=config.data_dir, vocab_dir=config.vocab_dir, split_by_sentence=not config.split_by_section, skip_list=config.skip_list) if not self.config.filtered: self.data.make_dir(self.config.output_dir + "/models/") if reload_data: for ds in self.config.textbook_data_sets: self.data.load_textbook_train_dev_data( config.data_dir + 'medlit/train/' + ds, config.data_dir + 'medlit/dev/' + ds) # train self.data.load_i2b2_train_data(train_base_dir=config.data_dir + '/i2b2_ehr/') # test self.data.load_test_data(ref_base_dir=config.data_dir + '/i2b2_ehr/') # dev self.data.load_test_data(ref_base_dir=config.data_dir + '/i2b2_ehr/', type='dev') else: self.data.load_split_data() self.data.make_dir(self.config.output_dir) log_file_name = strftime("log_%Y_%m_%d_%H_%M_%S", localtime()) self.logger = self.setup_logger(self.config.output_dir + '/%s.txt' % log_file_name) if exists(config.vocab_dir + "/NaturalLang.pkl") and not reload_data: print("Loading vocab") self.data.load_vocab() else: print("Building vocab") self.data.build_vocab(self.data.textbook_train_data, pretrain=False) self.model = None self.use_cuda = torch.cuda.is_available() if not self.config.filtered: if self.config.model_type == 'gru_rnn': self.model = GRURNN( self.config.embedding_size, self.hidden_size, self.data.input_lang, self.data.pretrained_embeddings, self.num_layers, self.data.input_lang.n_words, self.data.output_lang.n_words, self.config.dropout) elif self.config.model_type == 'attn_gru_rnn': self.model = AttentionGRURNN( self.config.embedding_size, self.hidden_size, self.data.input_lang, self.data.pretrained_embeddings, self.num_layers, self.data.input_lang.n_words, self.data.output_lang.n_words, self.config.dropout) elif self.config.model_type == 'cnn': self.model = CNN(self.data.input_lang.n_words, self.data.output_lang.n_words, self.config.embedding_size, self.data.input_lang, self.data.pretrained_embeddings, self.config.dropout) self.epoch_start = 1 if self.use_cuda: self.model = self.model.cuda() def setup_logger(self, log_file, level=logging.INFO): logger = logging.getLogger() logger.setLevel(level) handler = logging.FileHandler(log_file) formatter = logging.Formatter('%(asctime)s %(message)s') handler.setFormatter(formatter) logger.addHandler(handler) return logger def log(self, info): print(info) if self.logger is not None: self.logger.info(info) def as_minutes(self, s): m = math.floor(s / 60) s -= m * 60 return '%dm %ds' % (m, s) def time_since(self, since, percent): now = time.time() s = now - since es = s / percent rs = es - s return '%s (- %s)' % (self.as_minutes(s), self.as_minutes(rs)) def train(self, data_setup, save_model_dir, print_every=20, plot_every=100, learning_rate=0.001): start = time.time() plot_losses = [] print_loss_total = 0 plot_loss_total = 0 if self.config.model_type == 'cnn' and self.config.transfer_learning: self.model.output_size = self.data.output_lang.n_words if self.config.reuse_embedding_layer_only: self.model.init_conv1_layer() self.model.init_conv2_layer() self.model.init_fc_layers() if self.config.reuse_embedding_conv1_layers: self.model.init_conv2_layer() self.model.init_fc_layers() if self.use_cuda: self.model = self.model.cuda() elif self.config.transfer_learning: self.model.freeze_layer("fc1") if self.config.optimizer == 'sgd': optimizer = 
optim.SGD(filter(lambda p: p.requires_grad, self.model.parameters()), lr=learning_rate, momentum=0.9) elif self.config.optimizer == 'adam': optimizer = optim.Adam(filter(lambda p: p.requires_grad, self.model.parameters()), lr=learning_rate) self.log('data_setup:' + str(data_setup)) train_data = [] for data_set in data_setup: data_ratio = data_setup[data_set] data = self.data.get_dataset(data_set) train_data += self.data.get_data_subset(data, data_ratio) print('len train_data:', len(train_data)) print('training data examples:', train_data[:5]) if self.config.downsampling: train_data = self.data.downsampling( train_data, number_samples=self.config.downsampling_size) num_train_data = len(train_data) print('num_train_data:', num_train_data) print('train_data:', train_data[:10]) num_batches = int( np.ceil(num_train_data / float(self.config.batch_size))) self.log('num_batches: ' + str(num_batches)) if self.config.weighted_loss: loss_weight = self.data.get_label_weight(train_data) if self.use_cuda: loss_weight = loss_weight.cuda() else: loss_weight = None max_dev_acc = 0 for epoch in range(self.epoch_start, self.config.num_train_epochs + 1): batch_start = time.time() correct = 0 total = 0 random.shuffle(train_data) self.model.train() for cnt, i in enumerate(random.sample(range(num_batches), num_batches), start=1): inputs, seq_lengths, targets, batch = self.data.construct_batch( self.config.batch_size * i, self.config.batch_size * (i + 1), train_data, fixed_length=True if self.config.model_type == 'cnn' else False) if self.use_cuda: inputs = inputs.cuda() targets = targets.cuda() optimizer.zero_grad() if self.config.model_type == 'cnn': outputs = self.model(inputs) # for CNN elif self.config.model_type == 'attn_gru_rnn': outputs = self.model(inputs, self.data.input_lang, seq_lengths) else: outputs = self.model(inputs, seq_lengths) _, predicted = torch.max(outputs.data, dim=1) total += targets.data.size(0) correct += (predicted == targets.data).sum() batch_train_acc = 100.0 * ( predicted == targets.data).sum() / targets.data.size(0) loss = F.cross_entropy(outputs, targets, weight=loss_weight) loss.backward() optimizer.step() self.log( "Epoch %d, batch %d / %d: train loss = %f, train accuracy = %f %%" % (epoch, cnt, num_batches, loss.data.item(), batch_train_acc)) print_loss_total += loss.data.item() plot_loss_total += loss.data.item() if cnt % print_every == 0: print_loss_avg = print_loss_total / print_every print_loss_total = 0 self.log('Average batch loss: %s' % str(print_loss_avg)) self.log( self.time_since(batch_start, cnt * 1.0 / num_batches)) if cnt % plot_every == 0: plot_loss_avg = plot_loss_total / plot_every plot_losses.append(plot_loss_avg) plot_loss_total = 0 self.log('Epoch %d is done' % epoch) self.log('Epoch %d Train Accuracy: %f %%' % (epoch, 100.0 * correct / total)) self.log( self.time_since(start, epoch * 1.0 / self.config.num_train_epochs)) datasets = [] print("TUNING SET IS: " + str(self.config.tuning_set)) if 'ALL' in self.config.tuning_set or 'MedLit' in self.config.tuning_set: self.log("Test on MedLit Dev: ") datasets.append(self.data.TEXTBOOK_DEV) if 'ALL' in self.config.tuning_set or 'i2b2' in self.config.tuning_set: self.log("Test on i2b2 EHR Dev: ") datasets.append(self.data.i2b2_DEV) self.log("Tuning on:") self.log(datasets) dev_acc = self.test(datasets=datasets, epoch=epoch, calc_confusion_matrix=True) # save intermediate training results if dev_acc > max_dev_acc: save_path = save_model_dir + "/models/best_model.pt" torch.save(self.model, save_path) self.log('Best 
Model saved in file: %s' % save_path) max_dev_acc = dev_acc if 'i2b2' in self.config.test_set: self.log("Test on i2b2 Test:") self.test(datasets=[self.data.i2b2_TEST], epoch=epoch, print_test_results=True) save_path = save_model_dir + "/models/epoch_" + str(epoch) + ".pt" torch.save(self.model, save_path) self.log('Model saved in file: %s' % save_path) def test(self, datasets, epoch=-1, calc_confusion_matrix=True, generate_reports=True, print_test_results=False, print_examples=False): if self.model is None: self.log('Restoring model from ' + self.config.reload_model_file) if torch.cuda.is_available(): self.model = torch.load(self.config.reload_model_file) else: self.model = torch.load(self.config.reload_model_file, map_location='cpu') self.log('Model is restored') self.model.eval() start = time.time() data = [] dataset_name = '_'.join(datasets) for dataset in datasets: data.extend(self.data.get_dataset(dataset)) if self.config.downsampling: data = [] for dataset in datasets: data.extend(self.data.get_dataset(dataset)) data = self.data.downsampling(data, number_samples=500) num_test_data = len(data) self.log("num_test_data: " + str(num_test_data)) num_batches = int( np.ceil(num_test_data / float(self.config.batch_size))) self.log('num_batches: ' + str(num_batches)) correct = 0 total = 0 loss = 0.0 labels = [] predictions = [] examples = [] for i in range(num_batches): inputs, seq_lengths, targets, batch = self.data.construct_batch( self.config.batch_size * i, self.config.batch_size * (i + 1), data, fixed_length=True if self.config.model_type == 'cnn' else False) if self.use_cuda: inputs = inputs.cuda() targets = targets.cuda() if self.config.model_type == 'cnn': outputs = self.model(inputs) # for CNN elif self.config.model_type == 'attn_gru_rnn': outputs = self.model(inputs, self.data.input_lang, seq_lengths) else: outputs = self.model(inputs, seq_lengths) _, predicted = torch.max(outputs.data, dim=1) ordered = torch.sort(outputs.data) total += targets.data.size(0) correct += (predicted == targets.data).sum() labels.extend(targets.cpu().data.numpy().tolist()) predictions.extend(predicted.cpu().numpy().tolist()) loss += F.cross_entropy(outputs, targets).data.item() if print_examples or print_test_results: for k, d in enumerate(batch): examples.append([ d[0], d[1].replace('\r', ' ').replace('\n', ' ').replace('\t', ' '), d[2], d[3], str(d[4]), str(d[5]), self.data.output_lang.get_word( predicted[k].cpu().data.item()), self.data.output_lang.get_word( int(ordered[1][k][outputs.data.shape[1] - 2])), self.data.output_lang.get_word( int(ordered[1][k][outputs.data.shape[1] - 3])) ]) if print_examples: self.data.make_dir(self.config.output_dir + '/test_saved') self.log("Save examples to: " + self.config.output_dir + '/test_saved') with open( self.config.output_dir + '/test_saved/' + dataset_name + 'epoch_%d.txt' % epoch, 'w') as f: f.write( "#\tSentence\tTrue\tHeader String\tLocation\tLine\tPrediction 1\tPrediction 2\tPrediction 3\n" ) for e in examples: f.write('\t'.join(e) + '\n') self.log('Epoch %d ' % epoch + 'Time used: ' + str(time.time() - start)) self.log('Epoch %d ' % epoch + 'Test loss: %f' % loss) self.log('Epoch %d ' % epoch + 'Test Accuracy: %f %%' % (100.0 * correct / total)) self.log( 'Epoch %d ' % epoch + 'Test Precision: %f %%' % (100.0 * precision_score(labels, predictions, average='micro'))) self.log('Epoch %d ' % epoch + 'Test Recall: %f %%' % (100.0 * recall_score(labels, predictions, average='micro'))) self.log('Epoch %d ' % epoch + 'Test F1 Score: %f %%' % (100.0 * 
f1_score(labels, predictions, average='micro'))) text_labels = [self.data.output_lang.get_word(l) for l in labels] text_preds = [self.data.output_lang.get_word(l) for l in predictions] label_set = sorted(list(set(text_labels))) if calc_confusion_matrix: cm = confusion_matrix(text_labels, text_preds, labels=label_set) self.log('confusion_matrix for epoch %d: ' % epoch) header = '\t'.join(label_set) self.log(header) for i, row in enumerate(list(cm)): row = [str(num) for num in row] self.log('\t'.join([label_set[i]] + row)) np.savetxt(self.config.output_dir + '/' + dataset_name + '_confusion_matrix_epoch_%d.csv' % epoch, cm, fmt='%d', header=header, delimiter=',') self.log('Saved confusion matrix!') if generate_reports: reports = classification_report(text_labels, text_preds, labels=label_set, target_names=label_set, digits=4) self.log(reports) with open( self.config.output_dir + '/' + dataset_name + '_report_epoch_%d.txt' % epoch, 'w') as f: f.write(reports) self.log('Saved report!') if print_test_results: with open( self.config.output_dir + '/' + dataset_name + '_predictions_epoch_%d.json' % epoch, 'w') as f: json.dump(examples, f, indent=4, sort_keys=True) return 100.0 * correct / total def test_one(self, header, text): if self.model is None: self.log('Restoring model from ' + self.config.reload_model_file) if torch.cuda.is_available(): self.model = torch.load(self.config.reload_model_file) else: self.model = torch.load(self.config.reload_model_file, map_location='cpu') self.log('Model is restored') self.model.eval() if self.use_cuda: self.model = self.model.cuda() inputs, seq_lengths, targets = self.data.construct_one( header, text, fixed_length=True if self.config.model_type == 'cnn' else False) if self.use_cuda: inputs = inputs.cuda() targets = targets.cuda() if self.config.model_type == 'cnn': outputs = self.model(inputs) # for CNN elif self.config.model_type == 'attn_gru_rnn': outputs = self.model(inputs, self.data.input_lang, seq_lengths) else: outputs = self.model(inputs, seq_lengths) _, predicted = torch.max(outputs.data, dim=1) return predicted.cpu().numpy().tolist() == targets.cpu().data.numpy( ).tolist()
def main():
    data_set_names = {
        'WikipediaMedical': 'WM',
    }
    if args.data_sets == ['EHR']:
        args.textbook_data_sets = []
    else:
        args.textbook_data_sets = args.data_sets
    # if len(args.textbook_data_sets) == 0:
    #     base_dir = args.global_dir + '/' + '_'.join(args.data_sets)
    # else:
    #     base_dir = args.global_dir + '/' + '_'.join([data_set_names[ds] for ds in args.textbook_data_sets])
    data = DataUtil(data_dir=args.data_dir, vocab_dir=args.vocab_dir,
                    split_by_sentence=not args.split_by_section, skip_list=args.skip_list)
    if args.reload_data:
        # if self.config.textbook_data_ratio > 0:
        for ds in args.textbook_data_sets:
            data.load_textbook_train_dev_data(
                args.ref_data_dir + 'medlit/' + args.data_type + '/train/' + ds,
                args.ref_data_dir + 'medlit/' + args.data_type + '/dev/' + ds)
        # train
        data.load_i2b2_train_data(train_base_dir=args.ref_data_dir + 'i2b2_ehr/' + args.data_type)
        # test
        data.load_test_data(ref_base_dir=args.ref_data_dir + 'i2b2_ehr/' + args.data_type, i2b2=True)
        # dev
        data.load_test_data(ref_base_dir=args.ref_data_dir + 'i2b2_ehr/' + args.data_type, i2b2=True, type='dev')
    else:
        data.load_split_data()
    logger.info("MedLit Training data: " + str(len(data.textbook_train_data)))
    logger.info("MedLit Dev data: " + str(len(data.textbook_dev_data)))
    logger.info("i2b2 Training data: " + str(len(data.i2b2_train_data)))
    logger.info("i2b2 Dev data: " + str(len(data.i2b2_dev_data)))
    logger.info("i2b2 Test data: " + str(len(data.i2b2_test_data)))

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend, which takes care of synchronizing nodes/GPUs.
        torch.distributed.init_process_group(backend='nccl')
    logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
        device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
            args.gradient_accumulation_steps))
    args.train_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps)

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_eval:
        raise ValueError("At least one of `do_train` or `do_eval` must be True.")

    # task_name = args.task_name.lower()
    # if task_name not in processors:
    #     raise ValueError("Task not found: %s" % (task_name))
    # processor = processors[task_name]()

    num_labels = 11
    label_list = ['Allergies', 'Assessment and Plan', 'Chief Complaint', 'Examination',
                  'Family History', 'Findings', 'Medications', 'Past Medical History',
                  'Personal and Social history', 'Procedures', 'Review of Systems']
    logger.info("Num Labels: " + str(num_labels))
    logger.info("Labels: " + str(label_list))

    tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)

    train_examples = []
    num_train_steps = None
    if args.do_train:
        for data_name in args.train_data:
            if data_name == "i2b2" or data_name == "ALL":
                if args.i2b2_data_ratio != 1:
                    train_examples.extend(data.get_data_subset(data.i2b2_train_data, args.i2b2_data_ratio))
                else:
                    train_examples.extend(data.i2b2_train_data)
            if data_name == "MedLit" or data_name == "ALL":
                train_examples.extend(data.textbook_train_data)
        num_train_steps = int(
            len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps * args.num_train_epochs)
        logger.info("Combined Train data: " + str(len(train_examples)))

    # Prepare model
    model = BertForSequenceClassification.from_pretrained(
        args.bert_model,
        cache_dir=PYTORCH_PRETRAINED_BERT_CACHE / 'distributed_{}'.format(args.local_rank),
        num_labels=num_labels)
    if args.fp16:
        model.half()
    model.to(device)
    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
        model = DDP(model)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Prepare optimizer
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]
    t_total = num_train_steps
    if args.local_rank != -1:
        t_total = t_total // torch.distributed.get_world_size()
    if args.fp16:
        try:
            from apex.optimizers import FP16_Optimizer
            from apex.optimizers import FusedAdam
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
        optimizer = FusedAdam(optimizer_grouped_parameters,
                              lr=args.learning_rate,
                              bias_correction=False,
                              max_grad_norm=1.0)
        if args.loss_scale == 0:
            optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
        else:
            optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)
    else:
        optimizer = BertAdam(optimizer_grouped_parameters,
                             lr=args.learning_rate,
                             warmup=args.warmup_proportion,
                             t_total=t_total)

    global_step = 0
    nb_tr_steps = 0
    tr_loss = 0
    best_f1 = 0
    # NOTE: best_model is a reference to the live model, not a snapshot; the per-epoch
    # checkpoint saved below holds the weights at the point F1 last improved.
    best_model = model

    if args.do_train:
        train_features = convert_examples_to_features(
            train_examples, label_list, args.max_seq_length, tokenizer)
        logger.info("***** Running training *****")
        logger.info("  Num train examples = %d", len(train_examples))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_steps)
        all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long)
        train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)

        # Build the tuning (dev) set from up to 500 shuffled examples per requested source.
        dev_examples = []
        name = ""
        for data_name in args.tuning_set:
            name += data_name + "_"
            if data_name == "i2b2" or data_name == "ALL":
                random.shuffle(data.i2b2_dev_data)
                dev_examples.extend(data.i2b2_dev_data[:500])
            if data_name == "MedLit" or data_name == "ALL":
                random.shuffle(data.textbook_dev_data)
                dev_examples.extend(data.textbook_dev_data[:500])
        dev_features = convert_examples_to_features(
            dev_examples, label_list, args.max_seq_length, tokenizer)
        logger.info("  Num dev examples: " + str(len(dev_examples)))
        logger.info("EVAL on Pretrained model only: " + args.bert_model)
        run_eval(args, model, device, dev_examples, dev_features, 0, global_step, name, label_list,
                 save_results=False)

        model.train()
        for epoch in trange(int(args.num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, label_ids = batch
                loss = model(input_ids, segment_ids, input_mask, label_ids)
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps
                if args.fp16:
                    optimizer.backward(loss)
                else:
                    loss.backward()
                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    # Modify the learning rate with the linear warm-up schedule BERT uses.
                    lr_this_step = args.learning_rate * warmup_linear(global_step / t_total, args.warmup_proportion)
                    for param_group in optimizer.param_groups:
                        param_group['lr'] = lr_this_step
                    optimizer.step()
                    optimizer.zero_grad()
                    global_step += 1
            loss = tr_loss / nb_tr_steps
            f1 = run_eval(args, model, device, dev_examples, dev_features, loss, global_step, name, label_list,
                          save_results=False)
            logger.info(str(epoch) + "/" + str(args.num_train_epochs) +
                        ". loss: " + str(loss) + ", F1: " + str(f1))
            if f1 > best_f1:
                best_f1 = f1
                best_model = model
                output_model_file = os.path.join(args.output_dir, "pytorch_model" + str(epoch) + ".bin")
                logger.info("Saving best model with F1: " + str(best_f1))
                # Only save the model itself (unwrap DataParallel/DDP if present).
                model_to_save = best_model.module if hasattr(best_model, 'module') else best_model
                torch.save(model_to_save.state_dict(), output_model_file)

    # Save a trained model
    output_model_file = os.path.join(args.output_dir, "pytorch_model.bin")
    if args.do_train:
        logger.info("Saving best model with F1: " + str(best_f1))
        # Only save the model itself (unwrap DataParallel/DDP if present).
        model_to_save = best_model.module if hasattr(best_model, 'module') else best_model
        torch.save(model_to_save.state_dict(), output_model_file)
    else:
        model_state_dict = torch.load(os.path.join(args.bert_model, "pytorch_model.bin"))
        best_model = BertForSequenceClassification.from_pretrained(args.bert_model,
                                                                   state_dict=model_state_dict,
                                                                   num_labels=num_labels)
        best_model.to(device)

    if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
        loss = tr_loss / nb_tr_steps if args.do_train else None
        if "ALL" in args.test_set or "MedLit" in args.test_set:
            eval_examples = data.textbook_dev_data
            eval_features = convert_examples_to_features(
                eval_examples, label_list, args.max_seq_length, tokenizer)
            run_eval(args, best_model, device, eval_examples, eval_features, loss, global_step,
                     "medlit_dev", label_list, print_examples=True)
        if "ALL" in args.test_set or "i2b2" in args.test_set:
            eval_examples = data.i2b2_test_data
            eval_features = convert_examples_to_features(
                eval_examples, label_list, args.max_seq_length, tokenizer)
            run_eval(args, best_model, device, eval_examples, eval_features, loss, global_step,
                     "i2b2_test", label_list, print_examples=True)
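# The training loop above rescales the learning rate with warmup_linear() from
# pytorch_pretrained_bert. For readers without that package handy, this is a minimal
# sketch of the schedule it implements (linear warm-up followed by linear decay);
# treat it as an illustration of the curve's shape, not the library's exact code.
def warmup_linear_sketch(x, warmup=0.002):
    # x is the fraction of total training completed, i.e. global_step / t_total.
    if x < warmup:
        # Ramp the multiplier from 0 to 1 over the warm-up fraction of training.
        return x / warmup
    # Afterwards decay linearly from 1 toward 0 as training finishes.
    return 1.0 - x

# Example: with warmup_proportion=0.1 the multiplier peaks after 10% of the steps, and
# the effective lr is args.learning_rate * warmup_linear_sketch(global_step / t_total, 0.1).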
def __init__(self, config, sequence_length=20, reload_data=True):
    # Hyperparameters
    self.sequence_length = sequence_length
    self.hidden_size = 128
    self.num_layers = 1
    self.config = config
    self.data = DataUtil(data_dir=config.data_dir, vocab_dir=config.vocab_dir,
                         split_by_sentence=not config.split_by_section,
                         skip_list=config.skip_list)
    if not self.config.filtered:
        self.data.make_dir(self.config.output_dir + "/models/")
    if reload_data:
        for ds in self.config.textbook_data_sets:
            self.data.load_textbook_train_dev_data(
                config.data_dir + 'medlit/train/' + ds,
                config.data_dir + 'medlit/dev/' + ds)
        # train
        self.data.load_i2b2_train_data(train_base_dir=config.data_dir + '/i2b2_ehr/')
        # test
        self.data.load_test_data(ref_base_dir=config.data_dir + '/i2b2_ehr/')
        # dev
        self.data.load_test_data(ref_base_dir=config.data_dir + '/i2b2_ehr/', type='dev')
    else:
        self.data.load_split_data()
    self.data.make_dir(self.config.output_dir)
    log_file_name = strftime("log_%Y_%m_%d_%H_%M_%S", localtime())
    self.logger = self.setup_logger(self.config.output_dir + '/%s.txt' % log_file_name)
    if exists(config.vocab_dir + "/NaturalLang.pkl") and not reload_data:
        print("Loading vocab")
        self.data.load_vocab()
    else:
        print("Building vocab")
        self.data.build_vocab(self.data.textbook_train_data, pretrain=False)
    self.model = None
    self.use_cuda = torch.cuda.is_available()
    if not self.config.filtered:
        if self.config.model_type == 'gru_rnn':
            self.model = GRURNN(self.config.embedding_size, self.hidden_size,
                                self.data.input_lang, self.data.pretrained_embeddings,
                                self.num_layers, self.data.input_lang.n_words,
                                self.data.output_lang.n_words, self.config.dropout)
        elif self.config.model_type == 'attn_gru_rnn':
            self.model = AttentionGRURNN(self.config.embedding_size, self.hidden_size,
                                         self.data.input_lang, self.data.pretrained_embeddings,
                                         self.num_layers, self.data.input_lang.n_words,
                                         self.data.output_lang.n_words, self.config.dropout)
        elif self.config.model_type == 'cnn':
            self.model = CNN(self.data.input_lang.n_words, self.data.output_lang.n_words,
                             self.config.embedding_size, self.data.input_lang,
                             self.data.pretrained_embeddings, self.config.dropout)
    self.epoch_start = 1
    # Guard against config.filtered runs, where no model is constructed.
    if self.use_cuda and self.model is not None:
        self.model = self.model.cuda()
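# The constructor above relies on helpers that are not shown here, notably
# self.setup_logger() and self.data.make_dir(). The following is a hypothetical sketch of
# what setup_logger could look like, assuming it only needs to write timestamped messages
# to the run's log file and mirror them to the console; it is an assumption, not the
# author's actual helper.
import logging

def setup_logger(log_path, name='section_classifier'):
    logger = logging.getLogger(name)
    logger.setLevel(logging.INFO)
    formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
    # Write to the per-run log file created under config.output_dir.
    file_handler = logging.FileHandler(log_path)
    file_handler.setFormatter(formatter)
    logger.addHandler(file_handler)
    # Mirror the same messages to the console so progress is visible interactively.
    console_handler = logging.StreamHandler()
    console_handler.setFormatter(formatter)
    logger.addHandler(console_handler)
    return logger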