def main():
    # First load the model as it was trained, i.e. with the training sizes and vocab
    training_data = DataReader(training_data_filepath)
    vocab = training_data.vocab
    # Build a list of trigrams
    words = training_data.get_words()
    # Get the pretrained word vectors
    word_to_index, embed_dict = get_pretrained_word_indexes(pretrained_filepath)
    # Update word_to_index and vocabulary
    word_to_index, vocab = update_word_indexes_vocab(word_to_index, vocab)
    # Get the numpy matrix containing the pretrained word vectors,
    # with randomly initialized vectors for unknown words from the corpus
    word_embeddings = get_embeddings_matrix(word_to_index, embed_dict, WORD_EMBEDDINGS_DIMENSION)
    model = NGramLanguageModeler(len(vocab), 50, CONTEXT_SIZE, word_embeddings)
    model.load_state_dict(torch.load("AWS_model.pt"))
    test_data = DataReader(test_data_filepath, read_limit=READ_LIMIT)
    evaluate_model(model, test_data, word_to_index)
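A plausible sketch of the get_embeddings_matrix helper called above, which is not shown in the snippet: copy pretrained vectors where available and randomly initialize out-of-vocabulary words. The signature is inferred from the call site; the initialization range is an assumption.

import numpy as np

def get_embeddings_matrix(word_to_index, embed_dict, dim):
    # One row per word; unknown words start as small random vectors (assumed range).
    matrix = np.random.uniform(-0.25, 0.25, (len(word_to_index), dim)).astype(np.float32)
    for word, idx in word_to_index.items():
        if word in embed_dict:
            # Overwrite the random row with the pretrained vector.
            matrix[idx] = embed_dict[word]
    return matrix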
def main():
    train_data_reader = DataReader(FLAGS, dtype='train')
    test_data_reader = DataReader(FLAGS, dtype='test')
    with tf.Graph().as_default():
        net = Net(FLAGS)
        config = tf.ConfigProto(allow_soft_placement=True)
        config.gpu_options.allow_growth = True
        sess = tf.Session(config=config)
        init_op = tf.group(tf.global_variables_initializer(),
                           tf.local_variables_initializer())
        sess.run(init_op)
        saver = tf.train.Saver()
        if FLAGS.mode == 'train':
            do_train.run(FLAGS, sess, net, saver, train_data_reader, test_data_reader)
        else:
            ckpt = tf.train.get_checkpoint_state(FLAGS.log_path)
            if ckpt and ckpt.model_checkpoint_path:
                saver.restore(sess, ckpt.model_checkpoint_path)
                print("Model restored...")
            if FLAGS.mode == 'test':
                do_validate.run(sess, net, test_data_reader)
            else:
                do_train.run(FLAGS, sess, net, saver, train_data_reader, test_data_reader)
def main(args):
    logging.basicConfig(format='%(asctime)s %(message)s', level=logging.INFO)
    coord = tf.train.Coordinator()

    if args.mode == "train":
        with tf.compat.v1.name_scope('create_inputs'):
            data_reader = DataReader(
                data_dir=args.train_dir,
                data_list=args.train_list,
                mask_window=0.4,
                queue_size=args.batch_size * 3,
                coord=coord)
            if args.valid_list is not None:
                data_reader_valid = DataReader(
                    data_dir=args.valid_dir,
                    data_list=args.valid_list,
                    mask_window=0.4,
                    queue_size=args.batch_size * 2,
                    coord=coord)
                logging.info("Dataset size: train {}, valid {}".format(
                    data_reader.num_data, data_reader_valid.num_data))
            else:
                data_reader_valid = None
                logging.info("Dataset size: train {}".format(data_reader.num_data))
        train_fn(args, data_reader, data_reader_valid)

    elif args.mode == "valid" or args.mode == "test":
        with tf.compat.v1.name_scope('create_inputs'):
            data_reader = DataReader_test(
                data_dir=args.data_dir,
                data_list=args.data_list,
                mask_window=0.4,
                queue_size=args.batch_size * 10,
                coord=coord)
        valid_fn(args, data_reader)

    elif args.mode == "pred":
        with tf.compat.v1.name_scope('create_inputs'):
            if args.input_mseed:
                data_reader = DataReader_mseed(
                    data_dir=args.data_dir,
                    data_list=args.data_list,
                    queue_size=args.batch_size * 10,
                    coord=coord,
                    input_length=args.input_length)
            else:
                data_reader = DataReader_pred(
                    data_dir=args.data_dir,
                    data_list=args.data_list,
                    queue_size=args.batch_size * 10,
                    coord=coord,
                    input_length=args.input_length)
        pred_fn(args, data_reader, log_dir=args.output_dir)

    else:
        print("mode should be: train, valid, test, pred or debug")
    return
def train(model, config):
    """Trains the input model using the specified configuration.

    Args:
        model: tensorflow keras model
        config: instance of the configuration class
    """
    train_data = DataReader(config.train_file_path, config)
    train_batch = train_data.read_batch(train=True, num_epochs=config.num_epochs, shuffle=True)
    train_iterations = int(train_data.num_images // config.batch_size)
    if config.val_file_path:
        val_data = DataReader(config.val_file_path, config)
    learning_rate = LinearWarmUpCosineDecay(train_iterations * config.num_epochs,
                                            config.learning_rate)
    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate(0))
    epoch = 1
    epoch_loss_train = 0
    for iteration, (images, labels, weights) in enumerate(train_batch):
        loss, grads, preds = train_step(model, images, labels, weights)
        epoch_loss_train += loss
        # Update the optimizer's learning rate according to the schedule.
        optimizer.lr = learning_rate(optimizer.iterations)
        optimizer.apply_gradients(zip(grads, model.trainable_variables))
        if iteration > 0 and iteration % train_iterations == 0:
            print("Epoch {} Train loss: {}".format(epoch, epoch_loss_train / train_iterations))
            epoch_loss_train = 0
            if config.val_file_path:
                epoch_loss_val = []
                acc = []
                val_batch = val_data.read_batch(train=False, num_epochs=1)
                for images, labels, weights in val_batch:
                    loss, preds = val_step(model, images, labels)
                    epoch_loss_val.append(loss)
                    acc.append(accuracy(labels, preds, config.num_classes))
                print("Epoch {} Val loss: {}".format(
                    epoch, sum(epoch_loss_val) / len(epoch_loss_val)))
                for j in range(config.num_classes):
                    print("Epoch {} Class {} Accuracy: {}".format(
                        epoch, j, sum([val[j] for val in acc]) / len(acc)))
            model.save_weights(os.path.join(config.save_directory, 'model'), save_format='tf')
            epoch += 1
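A minimal sketch of the LinearWarmUpCosineDecay schedule the snippet above relies on; the real class is not shown. The signature is inferred from the call site (total steps, base learning rate), and the warmup_steps default is an assumption.

import math

class LinearWarmUpCosineDecay:
    """Linear warm-up followed by cosine decay to zero (assumed behavior)."""

    def __init__(self, total_steps, base_lr, warmup_steps=1000):  # warmup_steps is assumed
        self.total_steps = total_steps
        self.base_lr = base_lr
        self.warmup_steps = warmup_steps

    def __call__(self, step):
        step = float(step)
        if step < self.warmup_steps:
            # Ramp linearly from 0 to base_lr over the warm-up phase.
            return self.base_lr * step / self.warmup_steps
        # Cosine decay from base_lr toward 0 over the remaining steps.
        progress = (step - self.warmup_steps) / max(1.0, self.total_steps - self.warmup_steps)
        return self.base_lr * 0.5 * (1.0 + math.cos(math.pi * progress))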
def main():
    # Prepare dataset from csv to npz files
    # DatasetPreparation.prepare('train_preprocessed.csv', 'test_preprocessed.csv')

    # Read the dataset, create batches, and one-hot encode the targets
    batch_size = 100
    train_data = DataReader('train.npz', batch_size)
    validation_data = DataReader('validation.npz')
    test_data = np.load('test.npz')
    m = Model(train_data, validation_data)
    m.train()
    m.test(test_data)
def data_training():
    """Train using only the sample set."""
    sentences = []
    reader = DataReader(TRAIN_DATA_TYPE)
    reader.set_pos()
    start_id = reader.get_next_pic_id()
    qa = reader.get_pic_qa(start_id)
    for q in qa:
        question = q['question']
        question = question.replace('?', ' ?')
        question = question.replace(',', ' ,')
        question = question.replace('.', ' .')
        sentence = question.split(' ')
        sentences.append(sentence)
    now_id = reader.get_next_pic_id()
    i = 0
    while now_id != start_id:
        qa = reader.get_pic_qa(now_id)
        for q in qa:
            question = q['question']
            question = question.replace('?', ' ?')
            question = question.replace(',', ' ,')
            question = question.replace('.', ' .')
            sentence = question.split(' ')
            sentences.append(sentence)
        now_id = reader.get_next_pic_id()
        i = i + 1
        if i % 1000 == 0:
            print('*', end='')
    print('load data over!')
    model = gensim.models.Word2Vec(sentences, size=300, min_count=1)
    model.save(GENSIM_DATA_PATH)
def __init__(self, input_file, vocabulary_file, img_data_file, char2ix_file, output_dir,
             maxwordlength, emb_dimension, line_batch_size, sample_batch_size, neg_num,
             window_size, discard, epochs, initial_lr, seed):
    torch.manual_seed(seed)
    self.img_data = np.load(img_data_file)
    self.data = DataReader(input_file, vocabulary_file, char2ix_file, maxwordlength, discard, seed)
    dataset = Word2vecDataset(self.data, window_size, sample_batch_size, neg_num)
    self.dataloader = DataLoader(dataset, batch_size=line_batch_size,
                                 shuffle=True, num_workers=0, collate_fn=dataset.collate)
    self.output_dir = output_dir
    self.emb_size = len(self.data.word2id)
    self.char_size = len(self.data.char2id) + 1  # 5031
    self.emb_dimension = emb_dimension
    self.line_batch_size = line_batch_size
    self.epochs = epochs
    self.initial_lr = initial_lr
    self.VCWE_model = VCWEModel(self.emb_size, self.emb_dimension,
                                self.data.wordid2charid, self.char_size)
    self.use_cuda = torch.cuda.is_available()
    self.device = torch.device("cuda" if self.use_cuda else "cpu")
    self.num_train_steps = int(len(self.dataloader) * self.epochs)
    if self.use_cuda:
        self.VCWE_model.cuda()
def __init__(self, input_file, output_file, emb_dimension=300, batch_size=64,
             window_size=5, iterations=5, initial_lr=1.0, min_count=5):
    self.data = DataReader(input_file, min_count)
    dataset = Word2vecDataset(self.data, window_size)
    self.dataloader = DataLoader(dataset, batch_size=batch_size,
                                 shuffle=False, num_workers=0, collate_fn=dataset.collate)
    self.output_file_name = output_file
    self.emb_size = len(self.data.word2id)
    self.emb_dimension = emb_dimension
    self.batch_size = batch_size
    self.iterations = iterations
    self.initial_lr = initial_lr
    self.skip_gram_model = SkipGramModel(self.emb_size, self.emb_dimension)
    self.use_cuda = torch.cuda.is_available()
    self.device = torch.device("cuda" if self.use_cuda else "cpu")
    if self.use_cuda:
        print("USING CUDA")
        self.skip_gram_model.cuda()
    else:
        print("CUDA FAIL")
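A minimal sketch of the train() method this constructor sets up for, in the style of common PyTorch word2vec implementations. The batch layout (pos_u, pos_v, neg_v), the SkipGramModel.forward loss interface, and the save_embedding/id2word names are assumptions, not shown in the snippet.

def train(self):
    # SparseAdam pairs well with sparse embedding gradients in skip-gram models.
    optimizer = torch.optim.SparseAdam(self.skip_gram_model.parameters(), lr=self.initial_lr)
    for iteration in range(self.iterations):
        for pos_u, pos_v, neg_v in self.dataloader:  # assumed collate output
            pos_u = pos_u.to(self.device)
            pos_v = pos_v.to(self.device)
            neg_v = neg_v.to(self.device)
            optimizer.zero_grad()
            # Assumed: forward() returns the negative-sampling loss for the batch.
            loss = self.skip_gram_model.forward(pos_u, pos_v, neg_v)
            loss.backward()
            optimizer.step()
    # Assumed persistence helper mirroring self.output_file_name from the constructor.
    self.skip_gram_model.save_embedding(self.data.id2word, self.output_file_name)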
def test_va():
    # reader = DataReader("direcnet_pid", "../data/tblADataCGMS.csv", 5)
    # data = reader.read()
    reader = DataReader("VA2", "../data/CGMdataCSMcomplete.xlsx", 5)
    data = reader.read()
    normal, diabetic = read_patient_info()
    pids = list(normal["Patient ID"])
    t = np.arange(len(data[pids[0]][0])) * 5
    res = np.vstack((t, data[pids[0]][0]))
    np.savetxt("{}.txt".format(pids[0]), res, fmt="%.4f")
    for p in pids:
        m = map(lambda x: len(x), data[p])
        print(list(m))
    exit()
    # Everything below the exit() call is unreachable debug/plotting code kept for reference.
    plt.figure()
    tot, hyper, hypo = 0, 0, 0
    for pid in data:
        # if pid not in pids:
        #     continue
        for y in data[pid]:
            t = np.arange(len(y)) * 5
            tot += len(y)
            y = np.array(y)
            hyper += sum(y > 180)
            hypo += sum(y < 70)
            plt.plot(t, y)
    plt.hlines(70, 0, 10000)
    plt.hlines(80, 0, 10000)
    plt.hlines(180, 0, 10000)
    plt.show()
    print(hypo, hyper, tot)
def build_cnv_2_gene_training_data(self, data_dir, outcome_file, cnv_2_gene_file):
    '''
    Function: build a sample set with gene symbols as features.
    Input:
        cnv_2_gene_file: a curated mapping from array_id to gene_symbol.
    '''
    excel_obj = ExcelReader()
    data_reader_obj = DataReader()
    outcome_dict = excel_obj.get_cyto_cnv_result(outcome_file)
    cnv_df = data_reader_obj.cnv_data_reader_pipeline(data_dir)
    #### probe mapping to gene
    (array_2_gene, gene_2_array) = data_reader_obj.get_cnv_to_gene_table(cnv_2_gene_file)
    # print(array_2_gene)
    gene_cnv = data_reader_obj.build_array_to_gene(cnv_df, array_2_gene, gene_2_array)  # gene cnv
    data_df = data_reader_obj.combine_outcome_data(gene_cnv, outcome_dict)
    return data_df
def train_one_epoch(model, cfg, optimizer, lr_scheduler, loss_func, loss_metric, cuda=True):
    ann_files, img_dirs = [], []
    data_info = cfg.dataset[cfg.train_mode[0]]
    for mode in cfg.train_mode:
        data_info = cfg.dataset[mode]
        ann_files.append(data_info['ann_file'])
        img_dirs.append(data_info['img_prefix'])
    data_reader = DataReader(
        ann_files=ann_files,
        img_dirs=img_dirs,
        transform=None,
        mode='train',
        img_scale=data_info['img_scale'],
        keep_ratio=data_info['keep_ratio'],
        label_transform=cfg.dataset['label_transform'],
    )
    data_loader = DataLoader(data_reader, collate_fn=collate_fn, **cfg.data_loader)
    loss_metric.update(total_iter=len(data_loader))
    model.train()
    for step, (data, target) in enumerate(data_loader):
        # inputs = torch.stack(data)
        # targets = torch.from_numpy(np.array(target)).type(torch.LongTensor)
        if data.shape[0] == 0:
            continue
        inputs = data
        targets = target
        if cuda:
            inputs = inputs.cuda()
            targets = targets.cuda()
        if cfg.mix['type'] == 'mixup':
            alpha = cfg.mix['alpha']
            lam = np.random.beta(alpha, alpha)
            index = torch.randperm(inputs.size(0)).cuda()
            inputs = lam * inputs + (1 - lam) * inputs[index, :]
            targets_a, targets_b = targets, targets[index]
            outputs = model(inputs)
            loss = lam * loss_func(outputs, targets_a) + (1 - lam) * loss_func(outputs, targets_b)
        else:
            outputs = model(inputs)
            loss = loss_func(outputs, targets)
        # backward
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        loss_metric.update(iter=step, loss=loss)
        if step % cfg.freq_cfg['log_print'] == 0 or step == len(data_loader):
            line = loss_metric.str()
            logger.info(line)
            with open(os.path.join(cfg.work_dir, cfg.log['out_file']), 'a+') as fp:
                fp.write(line + '\n')
def __init__(self, model_load_path, artist_name, test, prime_text):
    self.sess = tf.Session()
    self.artist_name = artist_name

    print 'Process data...'
    self.data_reader = DataReader(self.artist_name)
    self.vocab = self.data_reader.get_vocab()

    print 'Init model...'
    self.model = LSTMModel(self.sess, self.vocab, c.BATCH_SIZE, c.SEQ_LEN,
                           c.CELL_SIZE, c.NUM_LAYERS, test=test)

    print 'Init variables...'
    self.saver = tf.train.Saver(max_to_keep=None)
    self.sess.run(tf.global_variables_initializer())

    # if load path specified, load a saved model
    if model_load_path is not None:
        self.saver.restore(self.sess, model_load_path)
        print 'Model restored from ' + model_load_path

    if test:
        self.test(prime_text)
    else:
        self.train()
def test_parse_slabinfo(self):
    test_str = (
        "Name Objects Objsize Space Slabs/Part/Cpu O/S O %Fr %Ef Flg\n"
        "kmalloc-1024 288 1024 294.9K 9/0/9 16 2 0 100 *\n"
        "kmalloc-128 2822 128 372.7K 64/11/27 32 0 12 96 *\n"
        "kmalloc-192 2163 192 425.9K 82/0/22 21 0 0 97 *\n"
        "kmalloc-2048 2241 2048 4.7M 138/45/7 16 3 31 96 *\n"
        "kmalloc-256 4921 256 1.3M 313/67/5 16 0 21 96 *\n"
        "kmalloc-4096 584 4096 2.3M 70/0/3 8 3 0 100 *\n"
        "kmalloc-512 2674 512 1.3M 163/6/7 16 1 3 98 *\n"
        "kmalloc-64 12904 64 843.7K 174/23/32 64 0 11 97 *\n"
        "kmalloc-8192 32 8192 262.1K 3/0/5 4 3 0 100 \n"
    )
    dut = DataReader(None)

    exist, val = dut.parse_slabinfo(test_str, tag='kmalloc-1024')
    self.assertEqual(exist, True)
    self.assertEqual(val, 288)

    exist, val = dut.parse_slabinfo(test_str, tag='kmalloc-64')
    self.assertEqual(exist, True)
    self.assertEqual(val, 12904)

    exist, val = dut.parse_slabinfo(test_str, tag='will_not_found')
    self.assertEqual(exist, False)
    self.assertEqual(val, 0)
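A minimal parse_slabinfo consistent with the test above: find the row whose first column matches the tag and return its Objects count. The actual DataReader implementation is not shown, so this is an assumed sketch pinned only by the test's expectations.

def parse_slabinfo(self, text, tag):
    for line in text.splitlines():
        fields = line.split()
        # Match the slab name in the first column exactly.
        if fields and fields[0] == tag:
            # The second column is the Objects count.
            return True, int(fields[1])
    return False, 0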
def createQuestionsDict():
    """Create the question dictionary (including the answer dictionary)."""
    reader = DataReader()
    reader.set_pos()
    dealer = DataDealer(ANSWERS_DICT_PATH)
    start_id = reader.get_next_pic_id()
    qa = reader.get_pic_qa(start_id)
    for q in qa:
        question = q['question']
        dealer.deal(question)
    now_id = reader.get_next_pic_id()
    i = 0
    while now_id != start_id:
        qa = reader.get_pic_qa(now_id)
        for q in qa:
            question = q['question']
            dealer.deal(question)
        now_id = reader.get_next_pic_id()
        i = i + 1
        if i % 1000 == 0:
            print('*', end='')
    dealer.saveData(QUESTIONS_DICT_PATH)
    print('over!')
def run(sysargs):
    if len(sysargs) < 1:
        print("Insufficient input args.")
        print("Usage:")
        print("python lstm.py <input_file_path>")
    else:
        skip_train_flag = False
        testbed = TestBed()
        if len(sysargs) == 2:
            skip_train_flag = literal_eval(sysargs[1])
            print("\nskip_train_flag:'" + str(skip_train_flag) + "'")
        input_file_path = sysargs[0]
        dr = DataReader()
        dr.read_pkl_data_at_file_path(input_file_path)
        sequences = dr.get_sequences()
        labels = dr.get_labels()
        if not skip_train_flag:
            # train
            testbed.init_model()
            testbed.train(sequences, labels)
            testbed.save_model()
        else:
            # skipping training part, load model
            testbed.load_model()
        metrics_names, score = testbed.test(sequences, labels)
        print("metrics_names:")
        print(metrics_names)
        print("score=" + str(score))
def __init__(self, input_file, antonym_file, output_file, emb_dimension=100, batch_size=32,
             window_size=5, iterations=3, initial_lr=0.001, min_count=12):
    print("Reading input file...")
    self.data = DataReader(input_file, min_count)
    dataset = Word2vecDataset(self.data, window_size)
    print("Creating data batches")
    self.dataloader = DataLoader(dataset, batch_size=batch_size,
                                 shuffle=False, num_workers=0, collate_fn=dataset.collate)
    self.antonym_file = open(antonym_file, 'r')
    self.output_file_name = output_file
    self.emb_size = len(self.data.word2id)
    self.emb_dimension = emb_dimension
    self.batch_size = batch_size
    self.iterations = iterations
    self.initial_lr = initial_lr
    self.skip_gram_model = SkipGramModel(self.emb_size, self.emb_dimension)
    self.use_cuda = torch.cuda.is_available()
    self.device = torch.device("cuda" if self.use_cuda else "cpu")
    if self.use_cuda:
        self.skip_gram_model.cuda()
def determine_iterations_per_epoch(config):
    """Determine the number of iterations per training epoch.

    Creates an instance of the DataReader class and iterates over one epoch to
    determine the number of iterations in an epoch. Required in order to
    accurately decay the learning rate.

    Args:
        config: instance of config class
    Returns:
        count: number of iterations in each epoch
    """
    if config.train_file_path:
        data = DataReader(config, config.train_file_path)
        batch = data.read_batch(current_epoch=0, num_epochs=1)
        count = 0
        if config.task == 'pretrain':
            for image, epoch in batch:
                count += 1
        else:
            for image, label, epoch in batch:
                count += 1
        return count
def eval(model, cfg, mode='val', cuda=True):
    data_info = cfg.dataset[mode]
    data_reader = DataReader(
        ann_files=[data_info['ann_file']],
        img_dirs=[data_info['img_prefix']],
        transform=None,
        mode='val',
        img_scale=data_info['img_scale'],
        keep_ratio=data_info['keep_ratio'],
    )
    data_loader = DataLoader(data_reader, collate_fn=collate_fn, **cfg.val_data_loader)
    y_true, y_pred = [], []
    model.eval()
    for step, (data, target) in tqdm(enumerate(data_loader)):
        # inputs = torch.stack(data)
        # target = torch.from_numpy(np.array(target)).type(torch.LongTensor)
        inputs = data
        targets = target
        if cuda:
            inputs = inputs.cuda()
            targets = targets.cuda()
        with torch.no_grad():
            outputs = model(inputs)
            outs = nn.functional.softmax(outputs, dim=1)
            pred = torch.argmax(outs, dim=1)
        y_true.extend(list(targets.cpu().detach().numpy()))
        y_pred.extend(list(pred.cpu().detach().numpy()))
    model.train()
    return classification_report(y_true, y_pred, output_dict=True), \
           classification_report(y_true, y_pred, output_dict=False)
def main():
    # Extract arguments
    ap = argparse.ArgumentParser()
    ap.add_argument("data", help="Data file containing bugs")
    ap.add_argument("vocabulary", help="Vocabulary file")
    ap.add_argument("-s", "--suffix", help="Model and log-file suffix")
    args = ap.parse_args()

    data = DataReader(config["data"], data_file=args.data, vocab_path=args.vocabulary)
    model = TransformerPatchingModel(config["transformer"], data.vocabulary.vocab_dim,
                                     is_pointer=config["data"]["edits"])

    # Restore model after a simple init
    tracker = Tracker(model, suffix=args.suffix)
    model(tf.zeros((1, 2), 'int32'), tf.zeros((1, 2), 'int32'),
          tf.zeros((1, 2), 'int32'), tf.zeros((0, 0), 'int32'), True)
    tracker.restore(best_only=True)

    with open("results" + ("" if args.suffix is None else "-" + args.suffix) + ".txt", "w") as f_out:
        for batch in data.batcher(mode="test", optimize_packing=False):
            pre, pre_locs = batch[:2]
            preds = model.predict(data.vocabulary, pre, pre_locs,
                                  config["data"]["beam_size"], config["data"]["max_bug_length"])
            write_completions(f_out, data.vocabulary, pre.numpy(), pre_locs.numpy(), preds)
def __init__(self, input_file, output_file, emb_dimension=500, batch_size=32,
             window_size=5, iterations=5, initial_lr=0.001, min_count=12):
    self.data = DataReader(input_file, min_count)
    dataset = PennDataset(self.data, window_size)
    self.dataloader = DataLoader(dataset, batch_size=batch_size,
                                 shuffle=False, num_workers=0, collate_fn=dataset.collate)
    self.output_file_name = output_file
    self.emb_size = len(self.data.word2id)
    self.emb_dimension = emb_dimension
    self.batch_size = batch_size
    self.iterations = iterations
    self.initial_lr = initial_lr
    self.penn_skip_gram_model = PennSkipGramModel(self.emb_size, self.emb_dimension)
    self.use_cuda = torch.cuda.is_available()
    self.device = torch.device("cuda" if self.use_cuda else "cpu")
    if self.use_cuda:
        self.penn_skip_gram_model.cuda()
def main():
    data_reader = DataReader()
    df = data_reader.get_all_data()

    # random split of data
    train_x_raw, train_y_raw, test_x_raw, test_y_raw = get_train_test_split(df)

    # set up train data
    train_tokens, train_y_raw = tokenize(train_x_raw, train_y_raw,
                                         save_missing_feature_as_string=False, remove_empty=True)
    train_x, train_y, feature_names = tokens_to_bagofwords(train_tokens, train_y_raw)

    # train model
    model = _get_nn_model_bag_of_words_simple_v2(train_x, train_y,
                                                 data_reader.get_region_labels()['Code'],
                                                 epochs=50, batch_size=64)

    # set up test data
    test_tokens, test_y_raw = tokenize(test_x_raw, test_y_raw,
                                       save_missing_feature_as_string=False, remove_empty=True)
    test_x, test_y, _ = tokens_to_bagofwords(test_tokens, test_y_raw, feature_names=feature_names)

    # evaluate model
    evaluate_model_nn(model, test_x, test_y, plot_roc=False)

    # ABOVE IS BASIC SUPERVISED LEARNING TO GENERATE MODEL
    #################################################
    # BELOW IS SEMI-SUPERVISED SELF-TRAINING TO FURTHER TRAIN MODEL

    # read unlabelled data and format it to be the same as labelled data
    unlabelled_df = data_reader.get_east_dir()
    unlabelled_df = normalize_east_dir_df(unlabelled_df)

    # set up unlabelled data as semi-supervised data (no labels available, so pass None)
    tokens, _ = tokenize(unlabelled_df, None,
                         save_missing_feature_as_string=False, remove_empty=True)
    semi_x_base, _, _ = tokens_to_bagofwords(tokens, None, feature_names=feature_names)

    # Confidence threshold to train on
    train_threshold = 0.8
    semi_train_amount = 30

    # SELF TRAIN MANY TIMES
    for i in range(semi_train_amount):
        # get predictions on unlabelled data
        pred = model.model.predict(semi_x_base)

        # convert probabilities to one-hot encoded output
        semi_y = np.zeros_like(pred)
        semi_y[np.arange(len(pred)), pred.argmax(1)] = 1

        # filter semi_x and semi_y to only include predictions above train_threshold
        semi_y = semi_y[pred.max(axis=1) > train_threshold]
        semi_x = semi_x_base[pred.max(axis=1) > train_threshold]

        # train on semi-supervised data
        model.model.fit(semi_x, semi_y, batch_size=64, epochs=100)

        # retrain on original train data
        model.model.fit(train_x, model.encoder.transform(train_y), batch_size=32, epochs=10)

        # evaluate model
        evaluate_model_nn(model, test_x, test_y, plot_roc=False)

        # remove semi data used in this iteration from future iterations
        semi_x_base = semi_x_base[~(pred.max(axis=1) > train_threshold)]
def save(artist, model_path, num_save):
    sample_save_dir = c.get_dir('../save/samples/')
    sess = tf.Session()

    print artist

    data_reader = DataReader(artist)
    vocab = data_reader.get_vocab()

    print 'Init model...'
    model = LSTMModel(sess, vocab, c.BATCH_SIZE, c.SEQ_LEN, c.CELL_SIZE, c.NUM_LAYERS, test=True)

    saver = tf.train.Saver()
    sess.run(tf.initialize_all_variables())
    saver.restore(sess, model_path)
    print 'Model restored from ' + model_path

    artist_save_dir = c.get_dir(join(sample_save_dir, artist))
    for i in xrange(num_save):
        print i
        path = join(artist_save_dir, str(i) + '.txt')
        sample = model.generate()
        processed_sample = process_sample(sample)
        with open(path, 'w') as f:
            f.write(processed_sample)
def get_img_ary(self, data_df):
    data_reader_obj = DataReader()
    # data_ary = data_reader_obj.tif_ary_reader(data_df, 'tif_path', 'cnv_outcome')
    data_ary = data_reader_obj.png_ary_reader(data_df, 'tif_path', 'cnv_outcome')
    return data_ary
def read_data(self):
    self._logger.info('Reading meta data...')
    self._reader = DataReader(self._logger)
    (self._vocab, self._vocab_size, self._dictionary, self._reverse_dictionary,
     self._unigrams, self._arts_srcs, self._srcs_ents,
     self._ents_srcs) = self._reader.read_meta_files(self._args.data)
    with open(self._args.output + '-labels-dict.pkl', 'wb') as f:
        cPickle.dump(self._reverse_dictionary, f, protocol=cPickle.HIGHEST_PROTOCOL)
    with open(self._args.output + '-vocab-dict.pkl', 'wb') as f:
        cPickle.dump(self._dictionary, f, protocol=cPickle.HIGHEST_PROTOCOL)
    self._number_of_srcs = len(set(self._srcs_ents.keys()))
    self._sample_dist()
def readData(self, path_to_data, path_to_energy):
    """Reads in weather data from a file and stores it."""
    if path_to_data is None:
        weather_reader = RandomReader(365 * 24)
    else:
        weather_reader = DataReader(path_to_data, path_to_energy)
    while weather_reader.canGetForecast():
        # forecast = list of 24 tuples of (windSpeed, sunlight, energy_needed)
        forecast = weather_reader.getForecast()

        # store raw numbers
        self.raw_data.append(copy.deepcopy(forecast[0]))
        self.energy_needed.append(forecast[0].ERCOT)
        self.energy_gained.append((self.calculate_wind_power(forecast[0].windSpeed),
                                   self.calculate_solar_power(forecast[0].sunlight),
                                   self.calculate_hydro_power()))

        # calculate features
        wind_power = 0.0
        solar_power = 0.0
        hydro_power = 0.0
        for weather_tuple in forecast:
            # convert weather to power
            wind_power += self.calculate_wind_power(weather_tuple.windSpeed)
            solar_power += self.calculate_solar_power(weather_tuple.sunlight)
            hydro_power += self.calculate_hydro_power()
        self.features.append((wind_power, solar_power, hydro_power))
        weather_reader.advanceTime()
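A hedged sketch of the calculate_wind_power helper the snippet above calls but does not show, using the standard wind power equation P = 0.5 * rho * A * Cp * v^3. All constants below are illustrative assumptions, not values from the source.

def calculate_wind_power(self, wind_speed):
    rho = 1.225    # air density at sea level, kg/m^3
    area = 5000.0  # rotor swept area in m^2 (assumed)
    cp = 0.4       # power coefficient, below the Betz limit of ~0.593 (assumed)
    # Power extracted from the wind scales with the cube of wind speed.
    return 0.5 * rho * area * cp * wind_speed ** 3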
def solver():
    parser = argparse.ArgumentParser()
    parser.add_argument("integer", type=int,
                        help="Number of clusters for k-medoids")
    args = parser.parse_args()
    clusters = args.integer

    reader = DataReader()
    data = reader.loadData()
    simMatrix, indexes = genSimilarityMatrix(data)
    M, C = kmedoids.kMedoids(simMatrix, clusters)

    fileWriter = open('data/Kmedoids_output_{}.txt'.format(clusters), 'w')
    print('medoids', file=fileWriter)
    i = 1
    for point in M:
        print('medoid of cluster ', i, ' ', indexes[point], file=fileWriter)
        i = i + 1
    print(' ', file=fileWriter)
    print('clustering result:', file=fileWriter)
    i = 1
    for label in C:
        for point_idx in C[label]:
            print('Cluster ', i, ': ', indexes[point_idx], file=fileWriter)
        i = i + 1
    fileWriter.close()
    print("Clustering Done!! No. of new clusters is {}".format(clusters))
    print("New clusters are stored in file data/Kmedoids_output_{}.txt".format(clusters))
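A minimal sketch of the genSimilarityMatrix helper used above, assuming loadData() returns a dict mapping item ids to numeric feature vectors; the real helper is not shown, and the choice of Euclidean distance is an assumption.

import numpy as np

def genSimilarityMatrix(data):
    indexes = list(data.keys())
    points = np.array([data[k] for k in indexes], dtype=float)
    # Pairwise Euclidean distances via broadcasting: (n, 1, d) - (1, n, d).
    diff = points[:, None, :] - points[None, :, :]
    simMatrix = np.sqrt((diff ** 2).sum(axis=2))
    return simMatrix, indexes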
def main(_):
    '''Loads the trained model and evaluates it on the test split.'''
    if FLAGS.load_model is None:
        print('Please specify checkpoint file to load model from')
        return -1
    if not os.path.exists(FLAGS.load_model + ".index"):
        print('Checkpoint file not found', FLAGS.load_model)
        return -1

    word_vocab, word_tensors, max_doc_length, label_tensors = \
        load_data(FLAGS.data_dir, FLAGS.max_doc_length, FLAGS.max_sen_length)

    test_reader = DataReader(word_tensors['test'], label_tensors['test'], FLAGS.batch_size)
    print('initialized test dataset reader')

    with tf.Graph().as_default(), tf.Session() as session:
        # tensorflow seed must be inside graph
        tf.set_random_seed(FLAGS.seed)
        np.random.seed(seed=FLAGS.seed)

        # build inference graph
        with tf.variable_scope("Model"):
            m = build_model(word_vocab)
            global_step = tf.Variable(0, dtype=tf.int32, name='global_step')

        saver = tf.train.Saver()
        saver.restore(session, FLAGS.load_model)
        print('Loaded model from', FLAGS.load_model, 'saved at global step', global_step.eval())

        # evaluation starts here
        count = 0
        start_time = time.time()
        result_scores = None
        for x, y in test_reader.iter():
            count += 1
            logits = session.run(m.logits, {m.input: x, m.targets: y})
            total_scores = []
            for tid, tlogits in enumerate(logits):
                scores = softmax(tlogits)
                # Collapse class probabilities to a single score with fixed weights.
                weights = np.array([0, 1, 0.5])
                scores = np.dot(scores, weights)
                total_scores.append(scores)
            total_scores = np.transpose(np.asarray(total_scores))
            if result_scores is None:
                result_scores = total_scores
            else:
                result_scores = np.vstack((result_scores, total_scores))

        save_as = '%s/scores' % (FLAGS.train_dir)
        np.savetxt(save_as, result_scores, delimiter=' ')

        time_elapsed = time.time() - start_time
        print("test samples:", count * FLAGS.batch_size,
              "time elapsed:", time_elapsed,
              "time per one batch:", time_elapsed / count)
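A numerically stable softmax consistent with how it is called above (applied to a vector of logits per example); the snippet's own softmax helper is not shown, so this is an assumed implementation.

import numpy as np

def softmax(logits):
    # Subtract the max before exponentiating to avoid overflow.
    shifted = logits - np.max(logits, axis=-1, keepdims=True)
    exp = np.exp(shifted)
    return exp / exp.sum(axis=-1, keepdims=True)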
def create_library():
    pkey_filename = '/Users/cole/.ssh/million-song-dataset.pem'
    pkey_password = keyring.get_password('SSH', pkey_filename)
    pkey = RSAKey.from_private_key_file(pkey_filename, password=pkey_password)

    ssh = SSHClient()
    ssh.set_missing_host_key_policy(AutoAddPolicy())
    ssh.connect('52.91.85.148', username='******', pkey=pkey)

    # A single A-Z list is reused for all three levels of the directory tree.
    letters = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
               'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z']

    dr = DataReader()
    dr.reset_lib()
    import shutil
    for letter in letters:
        for letter2 in letters:
            for letter3 in letters:
                with SCPClient(ssh.get_transport()) as scp:
                    print letter + letter2 + letter3
                    scp.get('/mnt/snap/data/' + letter + '/' + letter2 + '/' + letter3,
                            '/Users/cole/eclipse-workspace/EC2 File Transfer/Data/', 1)
                    dr.append_files(letter3)
                    shutil.rmtree('/Users/cole/eclipse-workspace/EC2 File Transfer/Data/' + letter3)
    scp.close()
    ssh.close()
def main():
    test = False
    heuristic = 'Centroid'

    reader = DataReader()
    data = reader.loadData()
    dataArray = reader.getDataArray()

    if test:
        clusters = [Cluster(dataPoint, data[dataPoint])
                    for dataPoint in list(data.keys())[:5]]
    else:
        clusters = [Cluster(dataPoint, data[dataPoint])
                    for dataPoint in list(data.keys())[:]]

    Cluster.generateInitialDistanceMatrix(test)
    Uni = UnionTracker(len(clusters))
    # print('')
    iteration = 0
    while Cluster.currentClusterCount() > 1:
        clsA, clsB, dist = Cluster.findMinDistance()
        mergedRC = min(clsA, clsB)
        toDelete = max(clsA, clsB)
        newIDm, newIDd, pts, factor = Cluster.mergeSimilarClusters(
            mergedRC, toDelete, iteration, dist, heuristic=heuristic)
        Uni.union(newIDd, newIDm, dist, pts, iteration)
        iteration += 1

    labels = list(data.keys())
    drawDendrogram(Uni, labels, heuristic)
def init_trainer(config, text_lines, slot_value_lines):
    hidden_dim = config.hidden_dim
    segment_begin = config.segment_begin
    segment_end = config.segment_end
    data = DataReader(text_lines, slot_value_lines, segment_begin, segment_end)

    # Create model nodes for the source and target inputs
    vocab_dim = data.vocab_dim
    sv_dim = data.sv_dim
    input_sequence, sv_pair, label_sequence, inputH, inputC = create_inputs(hidden_dim, sv_dim, vocab_dim)
    model = create_model(hidden_dim, sv_dim, vocab_dim)
    z = model(input_sequence, inputH, inputC, sv_pair)

    # cross_entropy: this is the training criterion
    ce, err = cross_entropy_with_full_softmax(z, label_sequence, sv_dim, vocab_dim)

    learning_rate = config.learning_rate
    momentum_as_time_constant = config.momentum_as_time_constant
    clipping_threshold_per_sample = config.clipping_threshold_per_sample
    lr_schedule = learning_rate_schedule(learning_rate, UnitType.sample)
    gradient_clipping_with_truncation = True
    momentum_schedule = momentum_as_time_constant_schedule(momentum_as_time_constant)

    # Instantiate the trainer object to drive the model training
    learner = momentum_sgd(z.parameters, lr_schedule, momentum_schedule,
                           gradient_clipping_threshold_per_sample=clipping_threshold_per_sample,
                           gradient_clipping_with_truncation=gradient_clipping_with_truncation)
    trainer = Trainer(z, (ce, err), learner)

    inputs = [input_sequence, sv_pair, label_sequence, inputH, inputC]
    return data, z, trainer, inputs