def testReader1(self):
    # Create DataReader instance
    data_reader = DataReader(self.filename, self.batch_size, self.seq_length)
    print(data_reader.vocab)
    print(data_reader.get_tensor(self.filename))
def readconfig(self, config, name, template):
    """ get this reader module configuration from config file """
    DataReader.readconfig(self, config, name, template)

    self._getopt('doublequote', config, name, template, True)
    if self.doublequote is not True:
        self.doublequote = self.doublequote == 'True'

    self._getopt('escapechar', config, name, template, None)
    if self.escapechar is not None:
        self.escapechar = self.escapechar[0]

    self._getopt('quotechar', config, name, template, '"')
    self.quotechar = self.quotechar[0]

    self._getopt('skipinitialspace', config, name, template, False)
    if self.skipinitialspace is not False:
        self.skipinitialspace = self.skipinitialspace == 'True'

    self._getopt('field_size_limit', config, name, template, -1, "mem")

    for opt in ['doublequote', 'escapechar', 'quotechar',
                'skipinitialspace', 'field_size_limit']:
        self.log.debug("reader.readconfig %s: '%s'" % (opt, self.__dict__[opt]))
def evaluate():
    places = fluid.CUDAPlace(0)
    exe = fluid.Executor(places)
    [eval_prog, feed_target_names, fetch_targets] = fluid.io.load_inference_model(
        dirname=os.path.join(config.train['checkpoint_path'], 'infer_meteor'),
        executor=exe)
    exe = fluid.ParallelExecutor(use_cuda=True, main_program=eval_prog)
    batch_size = config.train['batch_size']
    dr = DataReader()
    dr = dr.get_reader(batch_size, 'test')
    bleu_score = [0] * 5
    bleu_vec = ([1, 0, 0, 0], [0, 1, 0, 0], [0, 0, 1, 0], [0, 0, 0, 1])
    sentence_said = set()
    for l, data in enumerate(dr()):
        img, real_cap = zip(*data)
        cp = exe.run(feed={feed_target_names[0]: np.array(img, dtype='float32')},
                     fetch_list=fetch_targets)[0]
        for idx, vec in enumerate(bleu_vec):
            bleu_score[idx] += calc_bleu(cp, real_cap, vec)
        if config.evaluate['sentence_statistics']:
            for p in cp:
                p = words2sentence(filter(p))
                sentence_said.add(p)
    for i in range(len(bleu_score)):
        bleu_score[i] /= l + 1
    bleu_score[4] = sum(bleu_score[:-1]) / 4
    print('BLEU [{:.7f}, {:.7f}, {:.7f}, {:.7f}] {:.7f}'.format(*bleu_score))
    if config.evaluate['sentence_statistics']:
        print('The model said {} distinct sentences in total'.format(len(sentence_said)))
def show_summary():
    train_labels = DataReader.read_training_labels()
    test_labels = DataReader.read_test_labels()
    for label_ix in range(3):
        print('{} labels:'.format(LABEL_TO_METHANOMETER[label_ix]))
        train_counts = Counter(train_labels[:, label_ix])
        test_counts = Counter(test_labels[:, label_ix])
        print('Train -> ' + ' '.join('{} {}'.format(key, value)
                                     for key, value in train_counts.items()))
        print('Test -> ' + ' '.join('{} {}'.format(key, value)
                                    for key, value in test_counts.items()))
def __init__(self, log, db, reject, filename, input_encoding,
             table, columns, newline_escapes=None):
    """ init textreader with a newline_escapes parameter """
    DataReader.__init__(self, log, db, reject, filename,
                        input_encoding, table, columns)
    if 'newline_escapes' not in self.__dict__:
        self.newline_escapes = newline_escapes
    self.log.debug('reader.__init__: newline_escapes %s' % self.newline_escapes)
def testReader2(self):
    # Create DataReader instance
    data_reader = DataReader(self.filename, self.batch_size, self.seq_length)
    tensor = data_reader.get_tensor(self.filename)
    data_reader.generate_batches(tensor)
    x = data_reader.x_batches
    y = data_reader.y_batches
    print(x)
    print(y)
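# A minimal fixture sketch for the two reader tests above (an assumption, not
# part of the original suite): it supplies the self.filename / self.batch_size /
# self.seq_length attributes both tests rely on. The concrete values are
# hypothetical placeholders.
import unittest

class DataReaderTestCase(unittest.TestCase):
    def setUp(self):
        self.filename = 'data/input.txt'  # hypothetical corpus path
        self.batch_size = 32
        self.seq_length = 50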
def _transform_data_to_features(self):
    X_train_partials = []
    for X_train_partial in DataReader.iter_train_files_data():
        X_train_partials.append(X_train_partial)
    X_train = np.concatenate(X_train_partials, axis=0)
    rows = sum(partial.shape[0] for partial in X_train_partials)
    assert X_train.shape == (rows, DataReader.SENSOR_NUM * DataReader.SENSOR_DATA_COUNT_IN_ROW)

    train_features = self.transformer.transform(X_train)
    feature_names = np.asarray(self.transformer.get_feature_names())
    assert train_features.shape == (rows, len(feature_names))

    X_test = DataReader.read_test_data()
    assert X_test.shape == (X_test.shape[0], DataReader.SENSOR_NUM * DataReader.SENSOR_DATA_COUNT_IN_ROW)

    test_features = self.transformer.transform(X_test)
    assert test_features.shape == (test_features.shape[0], len(feature_names))

    return train_features, test_features, feature_names
def readconfig(self, config, name, template):
    """ get this reader module configuration from config file """
    DataReader.readconfig(self, config, name, template)

    # this will be called twice if templates are in use, so we
    # have to protect ourselves against removing already read
    # configurations while in second run.
    self._getopt('field_count', config, name, template, None, 'int')

    self._getopt('trailing_sep', config, name, template, False)
    if self.trailing_sep is not False:
        self.trailing_sep = self.trailing_sep == 'True'

    self.log.debug('reader.readconfig: field_count %s', self.field_count)
    self.log.debug('reader.readconfig: trailing_sep %s', self.trailing_sep)
def train(args):
    if not os.path.exists(args.model_path):
        os.mkdir(args.model_path)
    writer = SummaryWriter("log")
    torch.cuda.set_device(args.device_id)
    model = CrossModal(vocab_size=args.vocab_size,
                       pretrain_path=args.pretrain_path).cuda()
    # model = torch.nn.DataParallel(model).cuda()
    criterion = RankLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=args.learning_rate)
    step = 0
    for epoch in range(args.epochs):
        train_reader = DataReader(args.vocab_path, args.train_data_path,
                                  args.image_path, args.vocab_size,
                                  args.batch_size, is_shuffle=True)
        print("train reader load succ......")
        for train_batch in train_reader.batch_generator():
            query = torch.from_numpy(train_batch[0]).cuda()
            pos = torch.stack(train_batch[1], 0).cuda()
            neg = torch.stack(train_batch[2], 0).cuda()
            optimizer.zero_grad()
            left, right = model(query, pos, neg)
            loss = criterion(left, right).cuda()
            loss.backward()
            optimizer.step()
            if step == 0:
                writer.add_graph(model, (query, pos, neg))
            if step % 100 == 0:
                writer.add_scalar('Train/Loss', loss.item(), step)
            if step % args.eval_interval == 0:
                print('Epoch [{}/{}], Step [{}] Loss: {:.4f}'.format(
                    epoch + 1, args.epochs, step, loss.item()), flush=True)
            if step % args.save_interval == 0:
                # Save the model checkpoint
                torch.save(model.state_dict(), '%s/model.ckpt' % args.model_path)
            step += 1
def train(args):
    if not os.path.exists(args.model_path):
        os.mkdir(args.model_path)
    torch.cuda.set_device(args.device_id)
    model = CrossModal(vocab_size=args.vocab_size,
                       pretrain_path=args.pretrain_path).cuda()
    model.load_state_dict(torch.load(args.model_path + "/model.ckpt"))
    model.eval()
    train_reader = DataReader(args.vocab_path, "./data/query.txt",
                              args.image_path, args.vocab_size,
                              args.batch_size, is_shuffle=False)
    for train_batch in train_reader.extract_emb_generator():
        query = torch.from_numpy(train_batch).cuda()
        vec_list = model.query_emb(query)
        for vec in vec_list:
            print(" ".join([str(round(x, 4)) for x in vec.cpu().detach().numpy()]))
class RealDataWorker:
    def __init__(self, pipe_recv, pipe_send):
        self.pipe_recv, self.pipe_send = pipe_recv, pipe_send
        self.queue_data = multiprocessing.Queue()
        self.queue_recv = multiprocessing.Queue()
        self.queue_send = multiprocessing.Queue()
        self.thread_data = threading.Thread(target=self.data_worker)
        self.thread_recv = threading.Thread(target=self.receiver)
        self.thread_send = threading.Thread(target=self.sender)
        self.reader = DataReader()
        self.thread_data.start()
        self.thread_recv.start()
        self.thread_send.start()
        logging.info(u"RealDataWorker - init finish")

    def __del__(self):
        pass

    def receiver(self):
        logging.info(u"start receiver thread")
        while True:
            c = self.pipe_recv.recv()
            if c == "stop":
                # propagate shutdown to the other threads, then exit;
                # without the return this thread would loop forever and
                # keep the process alive after "stop"
                self.queue_recv.put(None)
                self.queue_send.put(None)
                self.queue_data.put(None)
                return
            else:
                self.queue_recv.put(c)

    def sender(self):
        logging.info(u"start sender thread")
        try:
            while True:
                to_send = self.queue_send.get()
                if to_send is None:
                    return
                self.pipe_send.send(to_send)
        except:  # the pipe may already be closed during shutdown
            return

    def data_worker(self):
        logging.info(u"start DataReader thread")
        while True:
            obj = self.queue_recv.get()
            if obj is None:
                return
            self.queue_send.put(self.reader.process(obj))
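# Hedged wiring sketch for RealDataWorker (an assumption, not part of the
# original module): the worker reads requests from one connection, writes
# results to another, and shuts down when it receives the string "stop".
# The request payload is a hypothetical placeholder for whatever
# DataReader.process() expects.
import multiprocessing

def _spawn_worker(recv_conn, send_conn):
    RealDataWorker(recv_conn, send_conn)

if __name__ == '__main__':
    req_out, req_in = multiprocessing.Pipe()   # requests: main -> worker
    res_out, res_in = multiprocessing.Pipe()   # results:  worker -> main
    proc = multiprocessing.Process(target=_spawn_worker, args=(req_in, res_out))
    proc.start()
    req_out.send({'query': 'latest'})          # hypothetical request object
    print(res_in.recv())                       # result from DataReader.process()
    req_out.send("stop")
    proc.join()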
def main(params):
    # Arguments passed down from the parser
    download_data_path = params['input_data_path']
    data_basepath = params['output_data_path']
    logs_path = params['logs_path']
    plots_path = params['plots_path']
    contour_type = params['contour_type']
    toggle_plot = params['toggle_plot']
    mini_batch_size = params['mini_batch_size']

    # Set up logging
    _setup_logging(logs_path)

    # Meat of the python program
    logging.info('Started running preprocessor for the following parameters: {}'.format(params))
    reader = DataReader(download_data_path=download_data_path,
                        data_basepath=data_basepath,
                        logs_path=logs_path,
                        plots_path=plots_path,
                        contour_type=contour_type,
                        save_plot=toggle_plot)
    images, masks, metadata = reader.load_samples(reader.sample_tuples)
    loader = DataLoader(output_dir=data_basepath,
                        images=images,
                        masks=masks,
                        metadata=metadata,
                        mini_batch_size=mini_batch_size)
    minibatches = loader.random_mini_batches()

    # If user enabled the toggle_plot to evaluate the reader and loader modules
    if toggle_plot:
        # Check out the overall view of all samples (dicoms, masks) with no shuffle and no partitioning
        logging.debug('Plotting the overall view of all (dicom, mask) samples...')
        reader.plot_samples(images, masks, metadata,
                            'data-reader_no-shuffle_batchset.jpg')
        # Check out first minibatch to see whether it matches the ones in
        # 'data-reader_no-shuffle_batchset.jpg' with same label
        logging.debug('Extracting and plotting the first minibatch to validate '
                      'DataLoader against the previous plot from DataReader...')
        for i, minibatch in enumerate(minibatches):
            if i > 1:
                break
            # minibatch_image (8,256,256), minibatch_mask (8,256,256), minibatch_metadata (8,)
            minibatch_image, minibatch_mask, minibatch_metadata = minibatch
            reader.plot_samples(minibatch_image, minibatch_mask, minibatch_metadata,
                                'data-loader_shuffled_batchset.jpg')

    logging.info('Finished running preprocessor...')
class BitmexData:
    def __init__(self, data_dir, initial_date):
        self.reader = DataReader(data_dir, initial_date)

    def _idtoprice(self, ID, symbolIdx=88, ticksize=0.01):
        price = ((100000000 * symbolIdx) - ID) * ticksize
        return price

    def _zip_orderBookL2(self, bulk_orderBookL2):
        # return form: [{side: 'sell', price: '8685', size: '11223'}]
        zip_data = []
        for tick in bulk_orderBookL2:
            side = tick['side']
            price = self._idtoprice(tick['id'])
            size = tick['size']
            zip_data.append({'side': side, 'price': price, 'size': size})
        return zip_data

    def _zip_trade(self, bulk_trade):
        zip_data = []
        for trade in bulk_trade:
            side = trade['side']
            price = trade['price']
            size = trade['size']
            zip_data.append({'side': side, 'price': price, 'size': size})
        return zip_data

    def _zip_liquid(self, bulk_liquid):
        zip_data = []
        for trade in bulk_liquid:
            side = trade['side']
            price = trade['price']
            size = trade['size']
            zip_data.append({'side': side, 'price': price, 'size': size})
        return zip_data

    def next_data(self):
        bulk_data = self.reader.next_file()
        data = {'orderBookL2': [], 'trade': [], 'liquid': []}
        for each_data in bulk_data:
            if each_data['table'] == 'orderBookL2':
                data['orderBookL2'].append(self._zip_orderBookL2(each_data['data']))
            elif each_data['table'] == 'trade':  # was each_data['trade'], which would raise KeyError
                data['trade'].append(self._zip_trade(each_data['data']))
            else:
                data['liquid'].append(self._zip_liquid(each_data['data']))
        return data
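# Worked example for _idtoprice above, using its defaults (symbolIdx=88,
# ticksize=0.01), i.e. BitMEX's orderBookL2 id-to-price convention:
#   ID = 8799131350
#   price = (100000000 * 88 - 8799131350) * 0.01
#         = (8800000000 - 8799131350) * 0.01
#         = 868650 * 0.01
#         = 8686.5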
def make_feature_transformer_pipeline(sensor_group_count, n_jobs):
    assert sensor_group_count % 60 == 0
    feature_transformers = [
        ('max', SensorTransformer(np.max)),
        ('min', SensorTransformer(np.min)),
        ('first_location_of_maximum', SensorTransformer(first_location_of_maximum)),
        ('last_location_of_maximum', SensorTransformer(last_location_of_maximum)),
        ('binned_entropy_5', SensorTransformer(binned_entropy, max_bins=5)),
        ('mean', SensorTransformer(np.mean)),
        ('median', SensorTransformer(np.median)),
        ('variance', SensorTransformer(np.var)),
        ('std', SensorTransformer(np.std)),
        ('sum_values', SensorTransformer(np.sum)),
        ('mean_change', SensorTransformer(mean_change)),
        ('mean_abs_change', SensorTransformer(mean_abs_change)),
        ('absolute_sum_of_changes', SensorTransformer(absolute_sum_of_changes)),
        ('abs_energy', SensorTransformer(abs_energy)),
        ('percentile_10', SensorTransformer(np.percentile, q=10)),
        ('percentile_20', SensorTransformer(np.percentile, q=20)),
        ('percentile_80', SensorTransformer(np.percentile, q=80)),
        ('percentile_90', SensorTransformer(np.percentile, q=90)),
        # ('fft_coefficent', SensorMultiTransformer(
        #     fft_coefficient,
        #     param=[{'coeff': coeff} for coeff in range(5)]
        # )),
        # ('cwt_coeff', SensorMultiTransformer(
        #     cwt_coefficients,
        #     param=[{'coeff': coeff, 'widths': (2, 5, 10, 20), 'w': w}
        #            for coeff in range(15) for w in (2, 5, 10, 20)]
        # ))
    ]
    sensor_names = DataReader.get_sensor_names()
    for _, feature_transformer in feature_transformers:
        feature_transformer.sensor_names = sensor_names
        feature_transformer.sensor_group_minutes_interval = sensor_group_count // 60
    return SensorPipeline([
        ('groups', SensorGroupingTransformer(
            sensor_data_count=DataReader.SENSOR_DATA_COUNT_IN_ROW,
            sensor_group_count=sensor_group_count
        )),
        ('features', SensorFeatureUnion(feature_transformers, n_jobs=n_jobs)),
    ])
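# Hedged usage sketch (the group size and job count are hypothetical; it
# assumes SensorPipeline follows the scikit-learn transform API, as the
# _transform_data_to_features snippet above suggests):
pipeline = make_feature_transformer_pipeline(sensor_group_count=600, n_jobs=4)
X = DataReader.read_test_data()
features = pipeline.transform(X)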
def update(input_to_update: pd.DataFrame):
    feature_cols = [
        'dire_score', 'radiant_score', 'duration', 'patch', 'region',
        'radiant_team_id', 'dire_team_id'
    ]
    y_cols = ['radiant_win']
    x_cols = [
        'avg_dire_score', 'avg_radiant_score', 'avg_duration', 'patch', 'region'
    ]
    x_cols += ['radiant_team_id', 'dire_team_id']
    # x_cols += [f'radiant_player_{j}' for j in range(1, 6)] + [f'dire_player_{j}' for j in range(1, 6)]
    data_reader = DataReader('../Datasets/BaseDataset/dota2_dataset.pickle',
                             feature_cols, y_cols, x_cols)
    data_reader.read_preprocessed(
        '../Datasets/BaseDataset/dota2_dataset_preprocessed.pickle')
    input_to_update = data_reader.add_observations(input_to_update)
    data_reader.write_data(
        '../Datasets/BaseDataset/dota2_dataset_preprocessed.pickle')

    radiant_wr = np.where(data_reader.preprocessed_data[y_cols])[0].shape[0] / \
        data_reader.preprocessed_data[y_cols].shape[0]
    cost_weigths = np.asarray([radiant_wr, 1. - radiant_wr])

    lr = 1e-5
    model, x, y = build_model(cost_weigths_=cost_weigths, learning_rate=lr)
    train_x = np.expand_dims(input_to_update[x_cols], axis=-1)
    train_y = np.hstack((input_to_update[y_cols], 1 - input_to_update[y_cols]))
    print(train_y)

    saver = tf.train.Saver()
    with tf.Session() as sess:
        saver.restore(sess, "model.ckpt")
        _, c = sess.run([model.optimize(), model.cost()],
                        feed_dict={x: train_x, y: train_y})
        saver.save(sess, "model.ckpt")
def main(_):
    vocab = load_vocabulary(FLAGS.data_dir)
    data_reader = DataReader(FLAGS.data_dir)
    model = Model(total_users=data_reader.total_users,
                  total_items=data_reader.total_items,
                  global_rating=data_reader.global_rating,
                  num_factors=FLAGS.num_factors,
                  img_dims=[196, 512],
                  vocab_size=len(vocab),
                  word_dim=FLAGS.word_dim,
                  lstm_dim=FLAGS.lstm_dim,
                  max_length=FLAGS.max_length,
                  dropout_rate=FLAGS.dropout_rate)
    update_rating, update_review, global_step = train_fn(model)

    saver = tf.compat.v1.train.Saver(max_to_keep=10)
    log_file = open('log.txt', 'w')
    test_step = 0
    config = tf.ConfigProto(allow_soft_placement=FLAGS.allow_soft_placement)
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        sess.run(tf.global_variables_initializer())
        for epoch in range(1, FLAGS.num_epochs + 1):
            log_info(log_file, "\nEpoch: {}/{}".format(epoch, FLAGS.num_epochs))
            count = 0
            sum_rating_loss = 0
            sum_review_loss = 0

            # Training
            for users, items, ratings in data_reader.read_train_set(FLAGS.batch_size, rating_only=True):
                count += 1
                fd = model.feed_dict(users=users, items=items, ratings=ratings, is_training=True)
                _step, _, _rating_loss = sess.run([global_step, update_rating, model.rating_loss],
                                                  feed_dict=fd)
                sum_rating_loss += _rating_loss

                review_users, review_items, _, photo_ids, reviews = get_review_data(
                    users, items, ratings, data_reader.train_review)
                img_idx = [data_reader.train_id2idx[photo_id] for photo_id in photo_ids]
                images = data_reader.train_img_features[img_idx]
                fd = model.feed_dict(users=review_users, items=review_items,
                                     images=images, reviews=reviews, is_training=True)
                _, _review_loss = sess.run([update_review, model.review_loss], feed_dict=fd)
                sum_review_loss += _review_loss

                if _step % FLAGS.display_step == 0:
                    data_reader.iter.set_postfix(rating_loss=(sum_rating_loss / count),
                                                 review_loss=(sum_review_loss / count))

            # Testing
            review_gen_corpus = defaultdict(list)
            review_ref_corpus = defaultdict(list)
            photo_bleu_scores = defaultdict(list)
            photo_rouge_scores = defaultdict(list)
            review_bleu_scores = defaultdict(list)
            review_rouge_scores = defaultdict(list)

            sess.run(model.init_metrics)
            for users, items, ratings in data_reader.read_test_set(FLAGS.batch_size, rating_only=True):
                test_step += 1
                fd = model.feed_dict(users, items, ratings)
                sess.run(model.update_metrics, feed_dict=fd)

                review_users, review_items, review_ratings, photo_ids, reviews = get_review_data(
                    users, items, ratings, data_reader.test_review)
                img_idx = [data_reader.test_id2idx[photo_id] for photo_id in photo_ids]
                images = data_reader.test_img_features[img_idx]
                fd = model.feed_dict(users=review_users, items=review_items, images=images)
                _reviews, _alphas, _betas = sess.run(
                    [model.sampled_reviews, model.alphas, model.betas], feed_dict=fd)

                gen_reviews = decode_reviews(_reviews, vocab)
                ref_reviews = [decode_reviews(batch_review_normalize(ref), vocab) for ref in reviews]
                for user, item, gen, refs in zip(review_users, review_items, gen_reviews, ref_reviews):
                    review_gen_corpus[(user, item)].append(gen)
                    review_ref_corpus[(user, item)] += refs

                    bleu_scores = compute_bleu([refs], [gen], max_order=4, smooth=True)
                    for order, score in bleu_scores.items():
                        photo_bleu_scores[order].append(score)
                    rouge_scores = rouge([gen], refs)
                    for metric, score in rouge_scores.items():
                        photo_rouge_scores[metric].append(score)

            _mae, _rmse = sess.run([model.mae, model.rmse])
            log_info(log_file, '\nRating prediction results: MAE={:.3f}, RMSE={:.3f}'.format(_mae, _rmse))

            log_info(log_file, '\nReview generation results:')
            log_info(log_file, '- Photo level: BLEU-scores = {:.2f}, {:.2f}, {:.2f}, {:.2f}'.format(
                np.array(photo_bleu_scores[1]).mean() * 100,
                np.array(photo_bleu_scores[2]).mean() * 100,
                np.array(photo_bleu_scores[3]).mean() * 100,
                np.array(photo_bleu_scores[4]).mean() * 100))

            for user_item, gen_reviews in review_gen_corpus.items():
                references = [list(ref) for ref in set(tuple(ref) for ref in review_ref_corpus[user_item])]

                user_item_bleu_scores = defaultdict(list)
                for gen in gen_reviews:
                    bleu_scores = compute_bleu([references], [gen], max_order=4, smooth=True)
                    for order, score in bleu_scores.items():
                        user_item_bleu_scores[order].append(score)
                for order, scores in user_item_bleu_scores.items():
                    review_bleu_scores[order].append(np.array(scores).mean())

                user_item_rouge_scores = defaultdict(list)
                for gen in gen_reviews:
                    rouge_scores = rouge([gen], references)
                    for metric, score in rouge_scores.items():
                        user_item_rouge_scores[metric].append(score)
                for metric, scores in user_item_rouge_scores.items():
                    review_rouge_scores[metric].append(np.array(scores).mean())

            log_info(log_file, '- Review level: BLEU-scores = {:.2f}, {:.2f}, {:.2f}, {:.2f}'.format(
                np.array(review_bleu_scores[1]).mean() * 100,
                np.array(review_bleu_scores[2]).mean() * 100,
                np.array(review_bleu_scores[3]).mean() * 100,
                np.array(review_bleu_scores[4]).mean() * 100))

            for metric in ['rouge_1', 'rouge_2', 'rouge_l']:
                log_info(log_file, '- Photo level: {} = {:.2f}, {:.2f}, {:.2f}'.format(
                    metric,
                    np.array(photo_rouge_scores['{}/p_score'.format(metric)]).mean() * 100,
                    np.array(photo_rouge_scores['{}/r_score'.format(metric)]).mean() * 100,
                    np.array(photo_rouge_scores['{}/f_score'.format(metric)]).mean() * 100))
                log_info(log_file, '- Review level: {} = {:.2f}, {:.2f}, {:.2f}'.format(
                    metric,
                    np.array(review_rouge_scores['{}/p_score'.format(metric)]).mean() * 100,
                    np.array(review_rouge_scores['{}/r_score'.format(metric)]).mean() * 100,
                    np.array(review_rouge_scores['{}/f_score'.format(metric)]).mean() * 100))

            save_path = saver.save(sess, f"tmp/model{epoch}.ckpt")
            log_info(log_file, '')
def train(args):
    if not os.path.exists(args.model_path):
        os.mkdir(args.model_path)
    # tf.reset_default_graph()
    model = CrossModel(vocab_size=args.vocab_size)
    # optimizer
    train_step = tf.contrib.opt.LazyAdamOptimizer(
        learning_rate=args.learning_rate).minimize(model.loss)
    saver = tf.train.Saver()
    loss_summary = tf.summary.scalar("train_loss", model.loss)
    init = tf.group(tf.global_variables_initializer(),
                    tf.local_variables_initializer())
    with tf.Session() as sess:
        sess.run(init)
        # variables_to_restore = slim.get_variables_to_restore()
        # restore_fn = slim.assign_from_checkpoint_fn(args.pretrain_path, variables_to_restore)
        # restore_fn(sess)
        # sess.run(tf.global_variables_initializer())
        init_variables_from_checkpoint(args.pretrain_path)
        _writer = tf.summary.FileWriter(args.logdir, sess.graph)

        # init embedding
        embedding = load_embedding(args.emb_path, args.vocab_size, 256)
        _ = sess.run(model.embedding_init,
                     feed_dict={model.embedding_in: embedding})
        print("loading pretrain emb succ.")

        # summary
        summary_op = tf.summary.merge([loss_summary])

        step = 0
        for epoch in range(args.epochs):
            train_reader = DataReader(args.vocab_path, args.train_data_path,
                                      args.image_data_path, args.vocab_size,
                                      args.batch_size, is_shuffle=True)
            print("train reader load succ.")
            for train_batch in train_reader.batch_generator():
                query, pos, neg = train_batch
                _, _loss, _summary = sess.run(
                    [train_step, model.loss, summary_op],
                    feed_dict={model.text: query,
                               model.img_pos: pos,
                               model.img_neg: neg})
                _writer.add_summary(_summary, step)
                step += 1

                # test
                sum_loss = 0.0
                iters = 0
                summary = tf.Summary()
                if step % args.eval_interval == 0:
                    print("Epochs: {}, Step: {}, Train Loss: {:.4}".format(epoch, step, _loss))
                    test_reader = DataReader(args.vocab_path, args.test_data_path,
                                             args.image_data_path, args.vocab_size,
                                             args.batch_size)
                    for test_batch in test_reader.batch_generator():
                        query, pos, neg = test_batch
                        _loss = sess.run(model.loss,
                                         feed_dict={model.text: query,
                                                    model.img_pos: pos,
                                                    model.img_neg: neg})
                        sum_loss += _loss
                        iters += 1
                    avg_loss = sum_loss / iters
                    summary.value.add(tag="test_loss", simple_value=avg_loss)
                    _writer.add_summary(summary, step)
                    print("Epochs: {}, Step: {}, Test Loss: {:.4}".format(epoch, step, avg_loss))
                if step % args.save_interval == 0:
                    save_path = saver.save(sess,
                                           "{}/model.ckpt".format(args.model_path),
                                           global_step=step)
                    print("Model save to path: {}/model.ckpt".format(args.model_path))
       'rawfilename' in config['APP'] and 'fun' in config['APP'] \
       and 'predictionfilename' in config['APP'])

data_columns = config['COVID_DATA']['datacolumns'].split(",")
base_dir_report = config['APP']['report_dir']
filereportname = config['APP']['filereportname']
rawfilename = config['APP']['rawfilename']
predictionfilename = config['APP']['predictionfilename']
funname = config['APP']['fun']

#############
#############
## PREPARE ##
reader = DataReader(config['COVID_DATA']['url'], config['APP']['reader_mode'])
dataextract = ExtractCovidData(data_columns)
fittingclass = CovidFitFunctions()
statcalc = ComputeStat()
prediction = CovidPrediction()

_idelab_ = datetime.now().strftime("%Y%m%d_%H%M%S")
_reportdir_ = path.join(base_dir_report, _idelab_)
if not path.exists(base_dir_report):
    mkdir(base_dir_report)
_reportlog_ = []
if not path.exists(_reportdir_):
    mkdir(_reportdir_)
_reportlog_.append(LOGROW.format(dt=str(datetime.now()), tx="Start Process " + _idelab_))
def train(args):
    config = ParameterConfig()
    data_reader = DataReader(args['data'], config.batch_size, config.seq_length)
    config.vocab_size = data_reader.vocab_size

    if not os.path.exists(args['model_dir']):
        os.makedirs(args['model_dir'])
    with open(os.path.join(args['model_dir'], 'config.pkl'), 'wb') as f:
        cPickle.dump(config, f)
    with open(os.path.join(args['model_dir'], 'vocab.pkl'), 'wb') as f:
        cPickle.dump((data_reader.tokens, data_reader.vocab), f)

    training_model = RNNModel(config=config)
    with tf.Session() as session:
        initializer = tf.random_uniform_initializer(-config.init_scale, config.init_scale)
        tf.initialize_all_variables().run()
        saver = tf.train.Saver(tf.all_variables())

        # Run the training epochs
        for epoch in range(config.total_max_epoch):
            current_state = session.run(training_model.initial_state)
            learning_rate_decay = config.lr_decay ** max(epoch - config.max_epoch, 0.0)
            training_model.assign_learningRate(session, config.learning_rate * learning_rate_decay)

            total_cost = 0.0
            total_seq = 0
            data_reader.reset_batch_pointer()
            for batch in range(data_reader.num_batches):
                start = time.time()
                x, y = data_reader.next_batch()
                feed_dict = {training_model.input_data: x,
                             training_model.targets: y,
                             training_model.initial_state: current_state}
                cost, current_state, _ = session.run(
                    [training_model.cost, training_model.final_state, training_model.train_op],
                    feed_dict)
                total_cost += cost
                total_seq += config.seq_length
                perplexity = np.exp(total_cost / total_seq)
                end = time.time()
                print("{}/{} (epoch {}), perplexity = {:.3f}, time/batch = {:.3f}"
                      .format(epoch * data_reader.num_batches + batch,
                              config.total_max_epoch * data_reader.num_batches,
                              epoch, perplexity, end - start))
                sys.stdout.flush()
                if ((epoch * data_reader.num_batches + batch) % 1000 == 0
                        or (epoch == config.total_max_epoch - 1
                            and batch == data_reader.num_batches - 1)):
                    checkpoint_path = os.path.join(args['model_dir'], 'model.ckpt')
                    saver.save(session, checkpoint_path,
                               global_step=epoch * data_reader.num_batches + batch)
                    print("Model saved to {}".format(checkpoint_path))
                    sys.stdout.flush()
        session.close()
import numpy as np
import os
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.bleu_score import SmoothingFunction
from paddle import fluid

import config
from reader import DataReader

index_word = DataReader().index_word
stop_tag = config.data['stop_idx']
padding_tag = config.data['padding_idx']


def filter(p):
    """ Convert a list of indices into a list of words. """
    # e.g. with index_word={5: 'a', 7: 'cat'}, stop_tag=2, padding_tag=0:
    # filter([5, 0, 7, 2, 9]) -> ['a', 'cat']  (padding skipped, cut at stop_tag)
    result = []
    for idx in p:
        if idx == stop_tag:
            break
        if idx == padding_tag:
            continue
        result.append(index_word[idx])
    return result


def calc_bleu(pred, real, weights=(0.25, 0.25, 0.25, 0.25)):
    if isinstance(pred, np.ndarray):
        if pred.dtype == 'float32':
            pred = np.rint(pred).astype('int32')
import random

import numpy as np
from paddle import fluid

import config
import evaluate
from tools import util
from tools.logger import Logger
from model.model_adaAttention_aic import ImageCaptionModel
from reader import DataReader

seed = config.train['seed']
decoder_config = config.md['decoder']
encoder_config = config.md['encoder']
batch_size = config.train['batch_size']
capacity = config.train['data_loader_capacity']
logger = Logger()
data_reader = DataReader()

random.seed(seed)
np.random.seed(seed)


def get_optimizer():
    base_lr = config.train['learning_rate']
    strategy = config.train['lr_decay_strategy']
    lr = util.get_lr(strategy, base_lr, config.data['sample_count'],
                     config.train['batch_size'])
    return fluid.optimizer.Adam(lr), lr


def training_net():
    startup_prog, train_prog = fluid.Program(), fluid.Program()
    train_prog.random_seed = 0  # must be 0, otherwise dropout misbehaves
        self._algo.train(*args)

    def eval(self):
        self._algo.eval()


if __name__ == '__main__':
    import random
    random.seed(1)

    filename = 'data_banknote_authentication.csv'
    max_depth = 3
    min_size = 10
    nCut = 20

    r = DataReader()
    r.load_csv(filename)
    r.str_column_to_float()
    r.cross_validation_split()

    # run simple tree
    a = SimpleTree()
    dtc = DecisionTreeClassifier()
    dtc.set_reader(r)
    dtc.set_algo(a)
    dtc.train(max_depth, min_size, nCut)
    dtc.eval()

    # run random forest
    a = RandomForest()
def train(args):
    if not os.path.exists(args.model_path):
        os.mkdir(args.model_path)
    tf.reset_default_graph()
    model = TextClassification(vocab_size=args.vocab_size,
                               encoder_type=args.encoder_type,
                               max_seq_len=args.max_seq_len)
    # optimizer
    train_step = tf.contrib.opt.LazyAdamOptimizer(
        learning_rate=args.learning_rate).minimize(model.loss)
    saver = tf.train.Saver()
    loss_summary = tf.summary.scalar("train_loss", model.loss)
    init = tf.group(tf.global_variables_initializer(),
                    tf.local_variables_initializer())
    with tf.Session() as sess:
        sess.run(init)
        # feeding embedding
        _writer = tf.summary.FileWriter(args.logdir, sess.graph)
        # summary
        summary_op = tf.summary.merge([loss_summary])
        step = 0
        for epoch in range(args.epochs):
            train_reader = DataReader(args.vocab_path, args.train_data_path,
                                      args.vocab_size, args.batch_size,
                                      args.max_seq_len)
            for train_batch in train_reader.batch_generator():
                text, label = train_batch
                _, _loss, _summary, _logits = sess.run(
                    [train_step, model.loss, summary_op, model.logits],
                    feed_dict={model.label_in: label, model.text_in: text})
                _writer.add_summary(_summary, step)
                step += 1

                # test
                summary = tf.Summary()
                if step % args.eval_interval == 0:
                    acc, acc_op = tf.metrics.accuracy(labels=tf.argmax(label, 1),
                                                      predictions=tf.argmax(_logits, 1))
                    sess.run(tf.local_variables_initializer())
                    _, _acc = sess.run([acc, acc_op])
                    summary.value.add(tag="train_accuracy", simple_value=_acc)
                    print("Epochs: {}, Step: {}, Train Loss: {}, Acc: {}".format(epoch, step, _loss, _acc))

                    test_reader = DataReader(args.vocab_path, args.test_data_path,
                                             args.vocab_size, args.batch_size,
                                             args.max_seq_len)
                    sum_loss = 0.0
                    sum_acc = 0.0
                    iters = 0
                    for test_batch in test_reader.batch_generator():
                        text, label = test_batch
                        _loss, _logits = sess.run(
                            [model.loss, model.logits],
                            feed_dict={model.label_in: label, model.text_in: text})
                        acc, acc_op = tf.metrics.accuracy(labels=tf.argmax(label, 1),
                                                          predictions=tf.argmax(_logits, 1))
                        sess.run(tf.local_variables_initializer())
                        _, _acc = sess.run([acc, acc_op])
                        sum_acc += _acc
                        sum_loss += _loss
                        iters += 1
                    avg_loss = sum_loss / iters
                    avg_acc = sum_acc / iters
                    summary.value.add(tag="test_accuracy", simple_value=avg_acc)
                    summary.value.add(tag="test_loss", simple_value=avg_loss)
                    _writer.add_summary(summary, step)
                    print("Epochs: {}, Step: {}, Test Loss: {}, Acc: {}".format(epoch, step, avg_loss, avg_acc))
                if step % args.save_interval == 0:
                    save_path = saver.save(sess,
                                           "{}/birnn.lm.ckpt".format(args.model_path),
                                           global_step=step)
                    print("Model save to path: {}/birnn.lm.ckpt".format(args.model_path))
class TextUI:
    __prefix_data = "./data/"
    __data_indices = [2, 11, 18, 23, 42, 44, 46, 50, 52]
    __files: List[AnyStr]

    def __init__(self):
        self.dr = DataReader()

    def find_data_files(self):
        self.__files = []
        for file in os.listdir(self.__prefix_data):
            if file.endswith(".csv") and "Signals" in file:
                if any(("{0:0>2}".format(i) in file) for i in self.__data_indices):
                    self.__files.append(file)
        self.__files.sort()

    def run(self):
        model = None
        scaler = None
        while True:
            self.find_data_files()
            print("Enter:\nt - to train the model, \ne - to test a trained model, \n"
                  "l - to load a pre-trained model\nq - to quit")
            print("IMPORTANT: Always train or load a model before testing it!\n")
            choice = input("Your choice: ")
            # string comparison must use ==, not the identity operator "is"
            if choice == "l":
                model = load_model("model.h5")
            elif choice == "t" or choice == "e":
                print("Data files:\n")
                for i, file in enumerate(self.__files):
                    print("{} - {}".format(i + 1, file))
                if choice == "t":
                    number = input("\nPlease select the file to train the model on: ")
                else:
                    number = input("\nPlease select the file to test the model on: ")
                index = int(number)
                index -= 1
                if 0 <= index < len(self.__files):
                    data = self.dr.read_set(self.__data_indices[index])
                    pp = Preprocess()
                    data, scaler = pp.clean_up(data)
                    data = pp.convert_to_supervised(data, sample_shift=0)
                    if choice == "t":
                        train, test = pp.prepare_sets(data, 0.2)
                        train_X, train_y = pp.make_input_output(train, remove_resp_from_input=True)
                        test_X, test_y = pp.make_input_output(test, remove_resp_from_input=True)
                        trainer = RespRatePredictor()
                        self.dr.plot(data)
                        model = trainer.make_network(input_shape=(train_X.shape[1], train_X.shape[2]))
                        model = trainer.fit_network(model, train_X, train_y, test_X, test_y)
                        # index is already zero-based here, so use it directly
                        # (the original indexed __data_indices[index - 1], off by one)
                        model.save("model_{0:0>2}.h5".format(self.__data_indices[index]))
                    else:
                        all_X, all_y = pp.make_input_output(data.drop("Time [s]", axis=1),
                                                            remove_resp_from_input=True)
                        predict_y = model.predict(all_X, batch_size=640)
                        # min_ = scaler.min_[1]
                        # scale_ = scaler.scale_[1]
                        # predict_y = (predict_y - min_) / scale_
                        predicted = pnd.DataFrame({"RESP_PREDICTED": predict_y.flatten()})
                        fused = pnd.concat([data, predicted], axis=1)
                        self.dr.plot(fused)
                        self.dr.plot_detail(fused)
                else:
                    continue
            else:
                break
def readTaxiData(filename):
    TaxiDataList = DataReader(filename).dataProcess
    return TaxiDataList
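# Hedged usage sketch for readTaxiData (the CSV path is a hypothetical
# placeholder; it assumes DataReader's dataProcess attribute yields a list):
if __name__ == '__main__':
    taxi_data = readTaxiData('taxi_trips.csv')
    print(len(taxi_data))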
def main(_):
    vocab = load_vocabulary(FLAGS.data_dir)
    if FLAGS.generating:
        data_reader = DataReader(FLAGS.data_dir, n_reviews=5, generating=True)
    else:
        data_reader = DataReader(FLAGS.data_dir)
    model = Model(total_users=data_reader.total_users,
                  total_items=data_reader.total_items,
                  global_rating=data_reader.global_rating,
                  num_factors=FLAGS.num_factors,
                  img_dims=[196, 512],
                  vocab_size=len(vocab),
                  word_dim=FLAGS.word_dim,
                  lstm_dim=FLAGS.lstm_dim,
                  max_length=FLAGS.max_length,
                  dropout_rate=FLAGS.dropout_rate)

    saver = tf.compat.v1.train.Saver(max_to_keep=10)
    log_file = open('log.txt', 'w')
    test_step = 0
    config = tf.ConfigProto(allow_soft_placement=FLAGS.allow_soft_placement)
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        saver.restore(sess, FLAGS.ckpt_dir)
        print('Model successfully restored')

        # Testing
        review_gen_corpus = defaultdict(list)
        review_ref_corpus = defaultdict(list)
        photo_bleu_scores = defaultdict(list)
        photo_rouge_scores = defaultdict(list)
        review_bleu_scores = defaultdict(list)
        review_rouge_scores = defaultdict(list)

        sess.run(model.init_metrics)
        for users, items, ratings in data_reader.read_real_test_set(FLAGS.batch_size, rating_only=True):
            test_step += 1
            fd = model.feed_dict(users, items, ratings)
            sess.run(model.update_metrics, feed_dict=fd)

            review_users, review_items, review_ratings, photo_ids, reviews = get_review_data(
                users, items, ratings, data_reader.real_test_review)
            img_idx = [data_reader.real_test_id2idx[photo_id] for photo_id in photo_ids]
            images = data_reader.real_test_img_features[img_idx]
            fd = model.feed_dict(users=review_users, items=review_items, images=images)
            _reviews, _alphas, _betas = sess.run(
                [model.sampled_reviews, model.alphas, model.betas], feed_dict=fd)

            gen_reviews = decode_reviews(_reviews, vocab)
            ref_reviews = [decode_reviews(batch_review_normalize(ref), vocab) for ref in reviews]
            if FLAGS.generating:
                for gen, ref in zip(gen_reviews, ref_reviews):
                    gen_str = "GENERATED:\n" + " ".join(gen)
                    ref_str = "REFERENCE:\n" + " ".join(" ".join(sentence) for sentence in ref) + "\n"
                    log_info(log_file, gen_str)
                    log_info(log_file, ref_str)

            for user, item, gen, refs in zip(review_users, review_items, gen_reviews, ref_reviews):
                review_gen_corpus[(user, item)].append(gen)
                review_ref_corpus[(user, item)] += refs

                bleu_scores = compute_bleu([refs], [gen], max_order=4, smooth=True)
                for order, score in bleu_scores.items():
                    photo_bleu_scores[order].append(score)
                rouge_scores = rouge([gen], refs)
                for metric, score in rouge_scores.items():
                    photo_rouge_scores[metric].append(score)

        _mae, _rmse = sess.run([model.mae, model.rmse])
        log_info(log_file, '\nRating prediction results: MAE={:.3f}, RMSE={:.3f}'.format(_mae, _rmse))

        log_info(log_file, '\nReview generation results:')
        log_info(log_file, '- Photo level: BLEU-scores = {:.2f}, {:.2f}, {:.2f}, {:.2f}'.format(
            np.array(photo_bleu_scores[1]).mean() * 100,
            np.array(photo_bleu_scores[2]).mean() * 100,
            np.array(photo_bleu_scores[3]).mean() * 100,
            np.array(photo_bleu_scores[4]).mean() * 100))

        for user_item, gen_reviews in review_gen_corpus.items():
            references = [list(ref) for ref in set(tuple(ref) for ref in review_ref_corpus[user_item])]

            user_item_bleu_scores = defaultdict(list)
            for gen in gen_reviews:
                bleu_scores = compute_bleu([references], [gen], max_order=4, smooth=True)
                for order, score in bleu_scores.items():
                    user_item_bleu_scores[order].append(score)
            for order, scores in user_item_bleu_scores.items():
                review_bleu_scores[order].append(np.array(scores).mean())

            user_item_rouge_scores = defaultdict(list)
            for gen in gen_reviews:
                rouge_scores = rouge([gen], references)
                for metric, score in rouge_scores.items():
                    user_item_rouge_scores[metric].append(score)
            for metric, scores in user_item_rouge_scores.items():
                review_rouge_scores[metric].append(np.array(scores).mean())

        log_info(log_file, '- Review level: BLEU-scores = {:.2f}, {:.2f}, {:.2f}, {:.2f}'.format(
            np.array(review_bleu_scores[1]).mean() * 100,
            np.array(review_bleu_scores[2]).mean() * 100,
            np.array(review_bleu_scores[3]).mean() * 100,
            np.array(review_bleu_scores[4]).mean() * 100))

        for metric in ['rouge_1', 'rouge_2', 'rouge_l']:
            log_info(log_file, '- Photo level: {} = {:.2f}, {:.2f}, {:.2f}'.format(
                metric,
                np.array(photo_rouge_scores['{}/p_score'.format(metric)]).mean() * 100,
                np.array(photo_rouge_scores['{}/r_score'.format(metric)]).mean() * 100,
                np.array(photo_rouge_scores['{}/f_score'.format(metric)]).mean() * 100))
            log_info(log_file, '- Review level: {} = {:.2f}, {:.2f}, {:.2f}'.format(
                metric,
                np.array(review_rouge_scores['{}/p_score'.format(metric)]).mean() * 100,
                np.array(review_rouge_scores['{}/r_score'.format(metric)]).mean() * 100,
                np.array(review_rouge_scores['{}/f_score'.format(metric)]).mean() * 100))
__author__ = 'Ahmed Hani Ibrahim'

from reader import DataReader
from analysis import Analyzer

file_path = './data/data_science_dataset_wuzzuf.csv'
reader = DataReader(file_path)
data = reader.read_data()

analyzer = Analyzer(data)
analyzer.trending_category()

x = 0