def read_from_txt(self, file_name_and_path, test_train_ratio=0.8, valid_train_ratio=0.75):
    """Reads a data set from a .txt file, storing it as three data sets:
    training, testing, and validation.

    Arguments:
        file_name_and_path: A string describing the file name (and relative path)
            of the .txt file to read.
        test_train_ratio: A float describing how much of the data to use for
            training and how much to use for testing.
        valid_train_ratio: A float describing how much of the training data to
            use for actual training and how much to use for validation.

    Returns:
        Nothing.
    """
    self.__data_loader = data_loader.DataLoader(file_name_and_path,
                                                test_train_ratio,
                                                valid_train_ratio)
    self.__data_loader.convert_data_to_1_hot()
    self.__data_loader.split_data()

    training_x, training_y1, training_y2 = self.__data_loader.get_training_data()
    self.__training = data_batcher.DataBatcher(training_x, training_y1, training_y2)

    testing_x, testing_y1, testing_y2 = self.__data_loader.get_testing_data()
    self.__testing = data_batcher.DataBatcher(testing_x, testing_y1, testing_y2)

    validation_x, validation_y1, validation_y2 = self.__data_loader.get_validation_data()
    self.__validation = data_batcher.DataBatcher(validation_x, validation_y1, validation_y2)

    self.__headers = self.__data_loader.get_header_data()
def dataPlot(data, median):
    n, bins, patches = plt.hist(data, 45, density=True, facecolor='g', alpha=0.75)
    plt.xlabel('interval')
    plt.ylabel('Probability')
    plt.title('Histogram')
    plt.grid(True)

    x = np.linspace(0, 40, 20)
    tmp = 1 / median
    print(tmp)
    y = tmp * np.exp(-tmp * x)
    plt.plot(x, y, '-', lw=2)
    plt.show()

    cumu_prob = 0
    cumu_x = 0
    print(n)
    for patch in patches:
        cumu_x += patch.get_width()
        cumu_prob += patch.get_width() * patch.get_height()
        print("0-{}:{}".format(cumu_x, cumu_prob))
    print(median)

    # output to file
    result = [["Distance", "Probability"]]
    cumu_x = 0
    for patch in patches:
        result.append([cumu_x, patch.get_height()])
        cumu_x += patch.get_width()
    dataloader = data_loader.DataLoader()
    dataloader.write_to_file(result, 'distance-histogram.dat', split=' ')
def main():
    output_dir = r"./output"
    if not tf.gfile.Exists(output_dir):
        tf.gfile.MakeDirs(output_dir)

    run_config = model_helper.get_configure()
    model_fn = get_model_fn()
    estimator = tf.estimator.Estimator(model_fn=model_fn, config=run_config)

    if hp.model_mode == "train":
        input_path = r"./data/train.recoder"
        train_input_fn = data_loader.file_based_input_fn_builder(
            input_file_path=input_path,
            is_training=True,
            drop_remainder=True,
            mode=hp.model_mode)
        estimator.train(input_fn=train_input_fn, max_steps=hp.train_steps)
    elif hp.model_mode == "predict":
        input_path = r"./data/test.recoder"
        result_path = r"./output/result.txt"
        vocdict = data_loader.DataLoader().voacb_list
        assert tf.gfile.Exists(input_path)
        train_input_fn = data_loader.file_based_input_fn_builder(
            input_file_path=input_path,
            is_training=False,
            drop_remainder=True,
            mode=hp.model_mode)
        with tf.gfile.Open(result_path, mode="w") as f:
            for result in estimator.predict(input_fn=train_input_fn,
                                            yield_single_examples=True,
                                            checkpoint_path=hp.predict_ckpt):
                f.write(model_helper.id2sentence(result["y_hat"], vocdict) + "\n")
def MercerTest():
    data_loader_obj = data_loader.DataLoader([10, 90], test=False)
    batch_iter = data_loader_obj.train_batch_iter(batch_size=100, num_epochs=1)

    # Fake variables for testing
    session = None
    samples_op = None
    output_dir = './exp/eval_mercer_test'
    y_samples_ph = 'y_ph'
    z_samples_ph = 'z_ph'
    num_randos = 1

    class FakeSession:
        def run(self, dummy_op, feed_dict):
            yield 1.5 * feed_dict['y_ph'][0][1] + 3 + 10 * feed_dict['z_ph'][0]

    session = FakeSession()

    if not os.path.isdir(output_dir):
        os.mkdir(output_dir)

    p0_data, p1_data = prepare_energy_data(batch_iter)
    plot_energy_data(p0_data, output_dir)
    plot_energy_data(p1_data, output_dir)
    p0_data = add_gan_data(p0_data, session, samples_op, y_samples_ph,
                           z_samples_ph, num_randos)
    p1_data = add_gan_data(p1_data, session, samples_op, y_samples_ph,
                           z_samples_ph, num_randos)
    plot_energy_data(p0_data, output_dir, step=1)
    plot_energy_data(p1_data, output_dir, step=1)
    create_histogram(p0_data, output_dir, step=1)
    create_histogram(p1_data, output_dir, step=1)
def setUp(self):
    """Sets up a DataLoader object initialised with a data set containing
    100 samples. Also creates a temporary text file from which to read the data.

    Arguments:
        Nothing.

    Returns:
        Nothing.
    """
    try:
        open_file = open("tmp.txt", 'w')
    except IOError as excep:
        print('Error writing temp file for testing')
        print(excep)
        sys.exit(2)
    open_file.write("N1\tN2\tN3\tN4\tN5\tN6\tN7\tM8\tM9\tc\n")
    for _ in range(50):
        open_file.write("0\t1\t2\t0\t1\t2\t0\t1\t2\t1\n")
    for _ in range(50):
        open_file.write("2\t1\t0\t2\t1\t0\t2\t1\t0\t0\n")
    open_file.close()

    self.dl = data_loader.DataLoader("tmp.txt", 0.8, 0.75)
    self.dl.convert_data_to_1_hot()
    self.dl.split_data()
def trainSVM(d2vModel, model_path, tag_path, input_path):
    print("Initializing data loader")
    loader = dl.DataLoader(input_path, tag_path)
    label_size = loader.tag_cnt
    X = []
    Y = []
    print("Loading data into data loader")
    for content in loader.data:
        for sub_con in content:
            text = sub_con['sentence']
            tags = sub_con['labels']
            text = re.sub(r'[{}]'.format(punction), ' ', text).split(' ')
            # Segment each non-empty chunk with jieba and flatten into a single token list
            text = [w for i in text if i != '' for w in jieba.cut(i)]
            X.append(d2vModel.infer_vector(text))
            Y.append(transferTagVec(tags))
    X = np.array(X)
    Y = np.array(Y)
    Y = Y.transpose()

    print("Beginning training of the SVCs")
    for i in range(1, label_size + 1):
        classifier = SVC(gamma='auto')
        classifier.fit(X, Y[i - 1])
        print("SVC {} training finished".format(i))
        joblib.dump(classifier, model_path + str(i) + '.model')
    print("Train process end")
def main():
    args = parse_args()
    mp.set_start_method('spawn')  # Use the 'spawn' start method for the worker processes.
    _logger = log.get_logger(__name__, args)
    _logger.info(print_args(args))

    loaders = []
    file_list = os.listdir(args.train_file)
    random.shuffle(file_list)
    for i in range(args.worker):
        loader = data_loader.DataLoader(args.train_file,
                                        args.dict_file,
                                        separate_conj_stmt=args.direction,
                                        binary=args.binary,
                                        part_no=i,
                                        part_total=args.worker,
                                        file_list=file_list,
                                        norename=args.norename,
                                        filter_abelian=args.fabelian,
                                        compatible=args.compatible)
        loaders.append(loader)
        loader.start_reader()

    # CUDA sanity check (also initialises the CUDA context in the main process).
    cuda_test = torch.cuda.is_available()
    cuda_tensor = torch.randn(10).cuda()

    net, mid_net, loss_fn = create_models(args, loaders[0], allow_resume=True)
    # Use fake modules to replace the real ones
    net = FakeModule(net)
    if mid_net is not None:
        mid_net = FakeModule(mid_net)
    for i in range(len(loss_fn)):
        loss_fn[i] = FakeModule(loss_fn[i])

    opt = get_opt(net, mid_net, loss_fn, args)
    inqueues = []
    outqueues = []
    plist = []
    for i in range(args.worker):
        recv_p, send_p = Pipe(False)
        recv_p2, send_p2 = Pipe(False)
        inqueues.append(send_p)
        outqueues.append(recv_p2)
        plist.append(Process(target=worker, args=(recv_p, send_p2, loaders[i], args, i)))
        plist[-1].start()

    _logger.warning('Training begins')
    train(inqueues, outqueues, net, mid_net, loss_fn, opt, loaders, args, _logger)
    for p in plist:
        p.terminate()
    for loader in loaders:
        loader.destruct()
    _logger.warning('Training ends')
def __init__(self, config, path, train_idx, test_idx):
    self.epochs = config.epochs
    self.test_patch_num = config.test_patch_num

    self.model_hyper = models.HyperNet(16, 112, 224, 112, 56, 28, 14, 7).cuda()
    self.model_hyper.train(True)

    self.l1_loss = torch.nn.L1Loss().cuda()

    backbone_params = list(map(id, self.model_hyper.res.parameters()))
    self.hypernet_params = filter(lambda p: id(p) not in backbone_params,
                                  self.model_hyper.parameters())
    self.lr = config.lr
    self.lrratio = config.lr_ratio
    self.weight_decay = config.weight_decay
    paras = [{'params': self.hypernet_params, 'lr': self.lr * self.lrratio},
             {'params': self.model_hyper.res.parameters(), 'lr': self.lr}]
    self.solver = torch.optim.Adam(paras, weight_decay=self.weight_decay)

    train_loader = data_loader.DataLoader(config.dataset,
                                          path,
                                          train_idx,
                                          config.patch_size,
                                          config.train_patch_num,
                                          batch_size=config.batch_size,
                                          istrain=True)
    test_loader = data_loader.DataLoader(config.dataset,
                                         path,
                                         test_idx,
                                         config.patch_size,
                                         config.test_patch_num,
                                         istrain=False)
    self.train_data = train_loader.get_data()
    self.test_data = test_loader.get_data()
def rnn_generator(split, batch_size, n_epochs=1, test=0, partition=0):
    '''Put data into a format for RNN training or prediction'''
    if test:
        dl = data_loader.DataLoader(split, test=True, local_test_data_dir='../..')
    else:
        dl = data_loader.DataLoader(split, test=False)

    for ecal, target in dl.batch_iter(partition, batch_size, n_epochs):
        flat = np.array([[x[:, :, i].flatten() for i in range(x.shape[-1])]
                         for x in ecal])
        X = np.log(1 + flat[:, :-1, :])
        Y = np.log(1 + flat[:, 1:, :])
        P = np.zeros((Y.shape[0], 2))
        P[np.arange(P.shape[0]), np.array([int(t[0]) for t in target])] = 1
        M = np.array([t[1] for t in target])
        X_dict = {'X_input': X, 'P_input': P, 'M_input': M}
        Y_dict = {'output': Y}
        yield (X_dict, Y_dict)
def data_generator(args, partition=0):
    splits = args['splits']
    batch_size = args['batch_size']
    test = args['test']
    cropped_width = args['cropped_width']
    log_data = args['log_data']  # TODO
    averaged = args['averaged_data']

    # compute normalization stddev based on width, logging, and averaging
    stddev = STDDEV_MAP[cropped_width, log_data, averaged]

    if averaged:
        data_loader._SCRATCH_DIR = data_loader._AVG_SCRATCH_DIR
        data_loader._FILENAME_REGEX = data_loader._AVG_FILENAME_REGEX

    if test:
        assert not averaged, 'averaged should not be true when testing'
        dl = data_loader.DataLoader(splits, test=True, local_test_data_dir='../..')
    else:
        dl = data_loader.DataLoader(splits, test=False)

    for ecals, targets in dl.batch_iter(partition, batch_size, num_epochs=1):
        if cropped_width < data_loader.DATA_DIM[0]:
            ecals = data_loader.truncate_ecals(ecals, (cropped_width, cropped_width))
        if log_data:
            ecals = data_loader.log_ecals(ecals)
        # normalize using the stddev computed above based on width+logging+averaging
        ecals = data_loader.normalize_ecals(ecals, TRAIN_MEAN, stddev)
        ecals = data_loader.unroll_ecals(ecals)
        # NOTE this is needed to make it fit with the model architecture
        ecals = np.expand_dims(ecals, axis=1)
        particle_types = np.array([y[0] for y in targets])
        input_energies = np.array([y[1] for y in targets])
        yield (ecals, particle_types, input_energies)
def __init__(self):
    # --------------------------------------------
    # Load the data
    # --------------------------------------------
    self.d_loader = data_loader.DataLoader()

    # Build the word dictionaries
    self.word_to_index, self.index_to_word = self.d_loader.set_word_dic()

    # Load the model
    build = test_models.BuildModel(2345,
                                   self.d_loader.embedding_dim,
                                   self.d_loader.lstm_hidden_dim)
    model = build.train_model()
    model.load_weights("./datasets/seq2seq_model.h5")
    self.encoder_model, self.decoder_model = build.predict_model()
def LabelImages():
    cfg = config.Config()  # renamed so the local does not shadow the config module
    loader = data_loader.DataLoader(cfg.DataFolder, cfg.LabelFile)
    _, _, already_labeled_filenames = loader.LoadLabeledImages()
    _, _, to_label_filenames = loader.LoadUnlabeledImages()
    to_label = set(to_label_filenames)
    already_labeled = set(already_labeled_filenames)
    print("To label count: ", len(to_label))
    print("Already labeled count:", len(already_labeled))
    # Randomness of samples is contingent on the behavior of set.pop()
    # (I haven't looked it up).
    while len(to_label) > 0:
        image_filename = to_label.pop()
        DisplayImage(image_filename)
        money, tech_points = PromptUserForAnnotation()
        WriteAnnotationToFile(image_filename, money, tech_points, kLabelingFile)
def __init__(self, file_or_dict):
    file_h5, file_json = None, None
    if type(file_or_dict) is str:
        file_ = file_or_dict
        if '.json' in file_:
            file_json = file_
        elif '.h5' in file_:
            file_h5 = file_
        else:
            raise ValueError(file_)
        self.mag_data = data_loader.DataLoader(file_json=file_json, file_h5=file_h5)
        self.data_type = 0
    elif type(file_or_dict) is dict:
        self.data_type = 1
        self.mag_data = file_or_dict
    else:
        raise ValueError(type(file_or_dict))
def _LoadRight(self):
    # Load right points and mark each of them as 'right point' so they can be
    # separated out again after binning.
    cust_attrs_to_set = {_DATASET_TYPE_CUSTOM_ATTRIBUTE_NAME: _RIGHT_DATASET}
    self._all_right_points_by_cluster_id = data_loader.DataLoader(
        _RIGHT_FILENAME,
        num_first_rows_to_skip=_NUM_FIRST_ROWS_TO_SKIP_IN_THE_DATA_FILES,
        line_separator=_DATA_FILES_LINE_SEPARATOR,
        x_column=_DATA_FILES_X_COLUMN,
        y_column=_DATA_FILES_Y_COLUMN,
        cluster_id_column=_DATA_FILES_CLUSTER_ID_COLUMN,
        cluster_ids_to_exclude={0, -1000},
        columns_separator_regex=_COLUMNS_SEPARATOR_REGEX
    ).LoadAndReturnPointsDividedByClusterId(
        point_custom_attributes=cust_attrs_to_set)
    print 'Right points are loaded. Clusters are %s' % ', '.join(
        [str(s) for s in self._all_right_points_by_cluster_id.iterkeys()])
def main():
    current_th = min_th
    rate_list_auto = []
    interval_list = []
    rate_list_lin = []
    while current_th < max_th:
        loader = data_loader.DataLoader(validation_folder)
        all_ious = []
        all_itervals = []
        while True:
            imgs, gts = loader.get_next()
            if imgs is None:
                break
            # Do the auto tracking
            pred_auto = methods.auto_select(imgs, gts, stride=current_th)
            # evaluate the system
            iou, est_interval = evaluate.evaluate_estimation_iou(pred_auto, gts)
            all_ious += iou
            all_itervals.append(est_interval)
            rate_list_auto.append(evaluate.evaluate_accuracy(iou, accuracy_th))
            interval_list.append(1. / est_interval)

            pred_lin = methods.linear_annotation(imgs, gts, stride=current_th)
            iou, est_interval = evaluate.evaluate_estimation_iou(pred_lin, gts)
            rate_list_lin.append(evaluate.evaluate_accuracy(iou, accuracy_th))

            print("Processed data point - ", len(rate_list_auto))
            visualize.visualize_video(imgs, pred_lin, pred_auto, gts)
        current_th += inter_th
        print("Evaluating for TH = ", current_th)
    pickle.dump([rate_list_lin, rate_list_auto, interval_list], open("save2f.p", "wb"))
def get_vector(self, inputs, tokenized_corpus, max_word_num, max_sequence_len):
    loader = data_loader.DataLoader(inputs)
    self.data = pd.DataFrame({'title': loader.title,
                              'context': loader.context,
                              'question': loader.question,
                              'answer_start': loader.answer_start,
                              'answer_end': loader.answer_end,
                              'answer_text': loader.answer_text})
    self.tokenizer, self.vocabulary = self.create_vocab(tokenized_corpus, max_word_num)

    # tokenization & add tokens, token indexes to columns
    nltk_tokenizer = MosesTokenizer()
    vectors = []
    for i, text_column in enumerate(['context', 'question']):
        self.data[text_column + '_tk'] = self.data[text_column].apply(
            lambda text: nltk_tokenizer.tokenize(text.replace('\n', '').strip(), escape=False))
        # token to index
        self.data[text_column + '_tk_index'] = self.tokenizer.texts_to_sequences(
            self.data[text_column + '_tk'].apply(lambda tokens: ' '.join(tokens)))
        # padding: collects the context and question vectors.
        vectors.append(pad_sequences(self.data[text_column + '_tk_index'],
                                     max_sequence_len[i]))
    return vectors
def predict_test_dataset(model,
                         fruit_label_enum=create_fruit_labels(fruits=("apple", "banana", "mix"))):
    test_spectrum_path = r"dataset/test_spectrum_after5_anal_5000.npy"
    test_labels_path = r"dataset/test_labels_after5_anal_5000.npy"
    test_data_loader = data_loader.DataLoader("test",
                                              test_spectrum_path=test_spectrum_path,
                                              test_labels_path=test_labels_path,
                                              batch_size=1,
                                              transform=transform)
    for spectrum, labels in test_data_loader.load_data():
        # convert string representation of labels to int
        labels = np.array([fruit_label_enum[label].value for label in labels])
        data_to_predict = spectrum
        amount_of_data = 1
        if transform:
            data_to_predict = np.reshape(data_to_predict, (-1, 1))
            data_to_predict = transform(data_to_predict).reshape(amount_of_data, 1, 2, -1)
        else:
            data_to_predict = torch.from_numpy(
                data_to_predict.reshape(amount_of_data, 1, 2, -1))
        for spectrum in data_to_predict:
            # if transform:
            #     spectrum = transform(spectrum).reshape(1, 1, 2, -1)
            # else:
            spectrum = spectrum.view(1, 1, 2, -1)
            # Run the spectrum through the model
            outputs = model(Variable(spectrum.float()))
            # Convert the outputs to probabilities
            outputs = torch.nn.functional.softmax(outputs, dim=1)
            # Get the prediction and its confidence (probability) by taking
            # the maximal value of the outputs
            confidence, prediction = torch.max(outputs.data, 1)
            if prediction != labels[0]:
                print("False prediction")
def test_model():
    count = 0
    video_g_net = VideoGANGenerator()
    video_g_net.load_state_dict(torch.load(MODEL_FILEPATH))
    video_g_net.eval()

    max_size = len(os.listdir('train'))
    pacman_dataloader = data_loader.DataLoader('train', min(max_size, 500000), 16, 32, 32, 4)
    clips_x, clips_y = pacman_dataloader.get_train_batch()
    clips_x = torch.tensor(np.rollaxis(clips_x, 3, 1))
    clips_y = torch.tensor(np.rollaxis(clips_y, 3, 1))

    # batch_size x noise_size x 1 x 1
    batch_size = 16
    noise_size = 100

    video_images = video_g_net(clips_x)
    save_samples(video_images, count, "test_model")
def my_generator(split, batch_size, slice_start=0, slice_end=24, n_epochs=1,
                 test=False, partition=0):
    '''Wrapper on data_loader.DataLoader to yield batches that only grab a
    particular slice & target.

    This wrapper plays nicely with Keras' model.fit_generator() functionality.

    Notes:
        1) split=[train, valid, test] where train+valid+test=100; the data can
           have an arbitrary number of partitions.
        2) partition_batch_iter(N_batch, N_epochs, partition_index); set
           N_epochs to 1 if using Keras' fit_generator.
    '''
    dl = data_loader.DataLoader(split, test=test)
    for ecal, target in dl.batch_iter(partition, batch_size, n_epochs):
        X = np.array([
            np.expand_dims(x[:, :, slice_start:slice_end + 1].mean(axis=2), axis=0)
            for x in ecal
        ])
        # save the particle type, which is y[0], not the momentum
        Y = np.array([y[0] for y in target])
        yield (X, Y)
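# A minimal usage sketch for my_generator above (not from the original source).
# It assumes a compiled Keras `model` whose input shape matches the (1, H, W)
# averaged slices yielded as X, and a [train, valid, test] split summing to 100:
#
#     train_gen = my_generator([80, 10, 10], batch_size=32, n_epochs=1, partition=0)
#     model.fit_generator(train_gen, steps_per_epoch=100, epochs=1)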
def getData():
    dataloader = data_loader.DataLoader()
    files = os.listdir(INPUT_DIR)
    files.sort()
    result = [["file", "avg", "median"]]
    for file in files[4:5]:
        count = 0
        total_interval = 0
        interval_list = []
        if file.endswith(".csv"):
            data = dataloader.get_data(os.path.join(INPUT_DIR, file))
            for idx in range(1, len(data) - 1):
                delta_t1 = datetime.datetime.strptime(data[idx][1], "%Y-%m-%d %H:%M:%S")
                delta_t2 = datetime.datetime.strptime(data[idx + 1][1], "%Y-%m-%d %H:%M:%S")
                interval = (delta_t2 - delta_t1).seconds
                interval_list.append(interval)
                if interval < 1000 and interval != 0:
                    total_interval += interval
                    count += 1
            print(file,
                  "avg:{}".format(total_interval / count),
                  "median:{}".format(np.median(interval_list)))
            result.append([file, total_interval / count, np.median(interval_list)])
    dataloader.write_to_file(result, './exp-time-interval/interval.txt', split=' ')
def getGaussainDistribute():
    dataloader = data_loader.DataLoader()
    files = os.listdir(INPUT_DIR)
    files.sort()
    count = 0
    total_interval = 0
    for file in files[:]:
        interval_list = []
        if file.endswith(".csv"):
            data = dataloader.get_data(os.path.join(INPUT_DIR, file))
            for idx in range(1, len(data) - 1):
                delta_t1 = datetime.datetime.strptime(data[idx][1], "%Y-%m-%d %H:%M:%S")
                delta_t2 = datetime.datetime.strptime(data[idx + 1][1], "%Y-%m-%d %H:%M:%S")
                interval = (delta_t2 - delta_t1).seconds
                if interval < 200 and interval != 0:
                    total_interval += interval
                    count += 1
                    interval_list.append(interval)
            print(file,
                  "avg:{}".format(total_interval / count),
                  "median:{}".format(np.median(interval_list)))
            dataPlot(interval_list, total_interval / count)
def test_DataLoaderMultipleImages(fs):
    kLabeledImage = "labeled.png"
    kUnlabeledImage = "unlabeled.png"
    kLabelPath = "labels.txt"
    kDataFolder = "./"

    images_dim = (10, 5)  # (H, W)
    Image.new("RGB", images_dim[::-1]).save(kLabeledImage)    # (W, H)
    Image.new("RGB", images_dim[::-1]).save(kUnlabeledImage)  # (W, H)

    label_writer = data_loader.LabelWriter(kLabelPath)
    label_writer.WriteLabel(kLabeledImage, 0, 0)

    loader = data_loader.DataLoader(kDataFolder, kLabelPath)
    labeled_images, _, _ = loader.LoadLabeledImages()
    unlabeled_images, _, _ = loader.LoadUnlabeledImages()

    # Test that LoadLabeledImages gave us our labeled image
    assert labeled_images[0].shape == (3, *images_dim)
    # Test that LoadUnlabeledImages gave us our unlabeled image
    assert unlabeled_images[0].shape == (3, *images_dim)
def __init__(self, glo_params, vid_params, rn):
    self.netZ = model_video_orig._netZ(glo_params.nz, vid_params.n)
    self.netZ.apply(model_video_orig.weights_init)  # init the weights of the model
    self.netZ.cuda()  # on GPU
    self.rn = rn
    self.lr = 0.01
    self.data_loader = data_loader.DataLoader()

    self.netG = model_video_orig.netG_new(glo_params.nz)
    self.netG.apply(model_video_orig.weights_init)
    self.netG.cuda()

    num_devices = torch.cuda.device_count()
    if num_devices > 1:
        print("Using " + str(num_devices) + " GPUs")
        for i in range(num_devices):
            print(torch.cuda.get_device_name(i))
        self.netG = nn.DataParallel(self.netG)

    if load:  # Load point
        self.load_weights(counter, self.rn)

    self.vis_n = 100
    # for visualize func - Igen
    fixed_noise = torch.FloatTensor(self.vis_n, glo_params.nz).normal_(0, 1)
    self.fixed_noise = fixed_noise.cuda()
    self.nag_params = glo_params
    self.vid_params = vid_params
    self.blockResnext = 101
    if VGG:
        self.dist_frame = utils.distance_metric(64, 3, glo_params.force_l2)
    elif LAP:
        self.lap_loss = lap.LapLoss(max_levels=3)
    else:
        self.dist = perceptual_loss_video._resnext_videoDistance(self.blockResnext)
def process_train_load_modeling(athletes_name):
    loader = data_loader.DataLoader()
    data_set = loader.load_merged_data(athletes_name=athletes_name)
    sub_dataframe_dict = utility.split_dataframe_by_activities(data_set)
    best_model_dict = {}

    for activity, sub_dataframe in sub_dataframe_dict.items():
        utility.SystemReminder().display_activity_modeling_start(activity)
        sub_dataframe_for_modeling = sub_dataframe[sub_dataframe['Training Stress Score®'].notnull()]
        if sub_dataframe_for_modeling.shape[0] > 20:
            general_features = utility.FeatureManager().get_common_features_among_activities()
            activity_specific_features = utility.FeatureManager().get_activity_specific_features(activity)
            # Handle columns with null
            features = [feature for feature in general_features + activity_specific_features
                        if feature in sub_dataframe.columns
                        and not sub_dataframe[feature].isnull().any()]

            def select_best_model():
                min_mae, best_model_type, best_regressor = float('inf'), '', None
                for model_class in [ModelLinearRegression, ModelNeuralNetwork,
                                    ModelRandomForest, ModelXGBoost, ModelAdaBoost]:
                    model_type = model_class.__name__[5:]
                    print('\nBuilding {}...'.format(model_type))
                    builder = model_class(sub_dataframe_for_modeling, features)
                    mae, regressor = builder.process_modeling()
                    if model_type != 'NeuralNetwork':
                        utility.save_model(athletes_name, activity, model_type, regressor)
                    if mae < min_mae:
                        min_mae, best_model_type, best_regressor = mae, model_type, regressor
                print("\n***Best model for activity '{}' is {} with mean absolute error: {}***"
                      .format(activity, best_model_type, min_mae))
                if best_regressor is not None:
                    best_model_dict[activity] = best_model_type

            select_best_model()
            utility.SystemReminder().display_activity_modeling_end(activity, True)
        else:
            utility.SystemReminder().display_activity_modeling_end(activity, False)

    utility.update_trainload_model_types(athletes_name, best_model_dict)
def test_DataLoaderSingleImage(fs):
    kImagePath = "test.png"
    kLabelPath = "labels.txt"
    kDataFolder = "./"
    kImageCrop = (0, 0, 40, 20)  # (x0, y0, x1, y1)
    kMoney = 10
    kTechPoints = 20

    # Note the format here is (W, H), which is the opposite of torch.
    image = Image.new("RGB", (80, 60))
    image.save(kImagePath)

    # Create the label file
    with open(kLabelPath, 'w+'):
        pass

    loader = data_loader.DataLoader(kDataFolder, kLabelPath, kImageCrop)
    unlabeled_images, _, _ = loader.LoadUnlabeledImages()
    unlabeled_image = unlabeled_images[0]

    label_writer = data_loader.LabelWriter(kLabelPath)
    label_writer.WriteLabel(kImagePath, kMoney, kTechPoints)
    labeled_images, labels, labeled_filenames = loader.LoadLabeledImages()
    label = labels[0]
    labeled_image = labeled_images[0]

    np.testing.assert_array_equal(unlabeled_image, labeled_image)
    # Test that we can load labels
    assert int(label["money"]) == kMoney
    assert int(label["tech_points"]) == kTechPoints
    # Test that the crop worked
    assert labeled_image.shape == (3, 20, 40)  # (C, H, W)
        self.cuda()
    else:
        self.load_state_dict(
            torch.load(path, map_location=lambda storage, loc: storage))
        self.cpu()


if __name__ == "__main__":
    import data_loader
    import os
    import utils
    import argparse

    use_cuda = torch.cuda.is_available()
    corpus = torch.load(os.path.join(const.DATAPATH, "corpus.pt"))
    dl = data_loader.DataLoader(const.DATAPATH, corpus["word2idx"], cuda=False)
    doc = dl.sample_data()[0]

    parser = argparse.ArgumentParser()
    parser.add_argument('--batch_size', type=int, default=100)
    parser.add_argument('--max_len', type=int, default=500)
    parser.add_argument('--span_len', type=int, default=4)
    parser.add_argument('--d_model', type=int, default=512)
    parser.add_argument('--pos_dim', type=int, default=20)
    parser.add_argument('--n_head', type=int, default=8)
    parser.add_argument('--rnn_hidden_size', type=int, default=128)
    parser.add_argument('--dropout', type=float, default=0.5)

    args = parser.parse_args()
    args.word_ebd_weight = corpus["wordW"]
from keras.layers import LSTM
import numpy as np
import cPickle
from keras.models import Sequential
import data_loader as dl
from keras.layers import Dense
from keras.layers import Masking
from keras.layers.wrappers import TimeDistributed
from keras.preprocessing.sequence import pad_sequences

data = dl.DataLoader()
X, Y, m = data.load()
X_pad = pad_sequences(X, maxlen=m, padding='post')
Y_pad = pad_sequences(Y, maxlen=m, padding='post')

sample_weights = np.ones((273, m))
for i in xrange(273):
    for j in xrange(m):
        if (X_pad[i][j] == np.zeros(12)).all():
            sample_weights[i][j] = 0

model = Sequential()
accuracies = dict()
for i in range(1, 200, 20):
    mask = np.zeros(12)
    model.add(Masking(mask_value=mask, input_shape=(m, 12)))
    model.add(LSTM(i, return_sequences=True, dropout_W=0.1, dropout_U=0.1))
    model.add(TimeDistributed(Dense(12, activation="softmax")))
    model.compile(optimizer='rmsprop',
    flatten = Flatten()(merged)
    drop = Dropout(0.5)(flatten)
    outputs = Dense(y_train.shape[1], activation='softmax')(drop)

    model = Model(inputs, outputs)
    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    print(model.summary())
    return model


if __name__ == "__main__":
    config = argparser()
    current = datetime.datetime.now().strftime("%Y%m%d%H%M%S")

    loader = data_loader.DataLoader(config.corpus_tk,
                                    config.trained_word_vector,
                                    config.score_corpus)
    loader.load_data()
    x_train, y_train = loader.train
    x_test, y_test = loader.test

    # build model
    model = build_model()

    # training
    hist = model.fit(x_train, y_train,
                     epochs=config.epoch,
                     batch_size=config.batch_size,
                     validation_data=(x_test, y_test),
                     verbose=2)

    # evaluation: confusion matrix & roc curve
from datetime import datetime

import data_loader
import numpy as np

dl = data_loader.DataLoader(file_json='/storage/data_2020-02-03/2020-02-03.json1')

tt0 = list(dl.values())[0][0, 0]
tt = datetime.fromtimestamp(tt0)
tt1 = datetime.now()
# Timezone seems to be correct...

last_quad = dl['SARBD02-MQUA030:I-SET']
#sarbd01_quad = dl['SARBD01-MQUA020:I-SET']

eduard_optics1 = {
    #'SARUN15.MQUA080.Q1.K1': -9.672893217266694e-01,
    #'SARUN16.MQUA080.Q1.K1': -2.443535112150988e+00,
    #'SARUN17.MQUA080.Q1.K1': +1.608546947532094e+00,
    'SARUN18.MQUA080.Q1.K1': +1.360154558769963e+00,
    'SARUN19.MQUA080.Q1.K1': -1.495693035627149e+00,
    'SARUN20.MQUA080.Q1.K1': -1.072774681910800e+00,
    'SARBD01.MQUA020.Q1.K1': -1.136049185308167e-01,
}

eduard_optics2 = {
    #'SARUN15.MQUA080.Q1.K1': -2.810125636006008e-01,
    #'SARUN16.MQUA080.Q1.K1': -1.820840559288582e+00,
    #'SARUN17.MQUA080.Q1.K1': +1.581672326954900e+00,
    run_config = tf.estimator.RunConfig(model_dir=hp.model_dir,
                                        save_checkpoints_secs=None,
                                        save_checkpoints_steps=hp.save_steps,
                                        keep_checkpoint_max=hp.max_save)
    return run_config


def id2sentence(ids_list, vocdict):
    # Convert a list of ids into a sentence.
    sentence = ""
    for i in ids_list:
        if i == 0 or i == 3:
            return sentence
        sentence += vocdict[i]
    sentence += "\n"
    return sentence


def get_batch_size():
    if hp.model_mode == "train":
        return hp.batch_size
    else:
        return hp.batch_size


if __name__ == "__main__":
    vocdict = data_loader.DataLoader().voacb_list
    a = [5, 192, 344, 23, 343, 4324, 432, 0]
    print(id2sentence(a, vocdict))