def main(args): os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu_id print('selecting gpu :', args.gpu_id) create_folders(args.path_results) list_image, list_mask = get_paths(args.path_train) df_train, df_val = generate_dataframe(path_mask=list_mask, out_size=args.size, stride=args.stride, classes=args.classes, out_directory=args.path_train) train_generator = DataGenerator(list_IDs=df_train, batch_size=args.batch, dim=(args.size, args.size), n_channels=3, n_classes=args.classes, norm=args.norm, transformations=True, shuffle=True) validation_generator = DataGenerator(list_IDs=df_val, batch_size=args.batch, dim=(args.size, args.size), n_channels=3, norm=args.norm, n_classes=args.classes) model = get_model(args=args, initial_lr=0.0001) history = start_train(args, model, train_generator, validation_generator) if args.no_plot == True: save_training_graph(history, args.path_results)
def TTA(sess, test_lists, dir_path, ckpt_dir, augment_times=1):
    probs = np.zeros((augment_times, len(test_lists), len(category_df)))

    # ckpt config
    ckpt = tf.train.get_checkpoint_state(ckpt_dir)
    if ckpt and tf.train.checkpoint_exists(ckpt.model_checkpoint_path):
        saver.restore(sess, ckpt.model_checkpoint_path)

    for t in range(augment_times):
        test_datagen = DataGenerator(augment=True, random_erasing=True, horizontal_flip=True)
        test_generator = test_datagen.flow_from_list_prediction(
            lists=test_lists, batch_size=1, image_size=336, dir_path=dir_path)
        _probs = np.zeros((len(test_lists), len(category_df)))
        print(t + 1, ": times")
        for i, v in tqdm(enumerate(test_lists)):
            inputs = next(test_generator)
            _prob = sess.run([prob], feed_dict={x: inputs, training_flag: False})
            _probs[i, :] = np.asarray(_prob)
        probs[t, :, :] = _probs

    # create pseudo_probs and predictions
    pseudo_probs = np.zeros((len(test_lists), len(category_df)))
    predictions = []
    for i in range(probs.shape[1]):  # iterate over sample indices (was enumerate of a range)
        pseudo_prob = np.mean(probs[:, i, :], axis=0)
        pseudo_probs[i, :] = pseudo_prob
        predictions.append(np.argmax(pseudo_prob))
    predictions = np.asarray(predictions)
    return pseudo_probs, predictions
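# A minimal, self-contained sketch (plain NumPy, illustrative names only) of the
# aggregation step used in TTA above: per-augmentation class probabilities are
# averaged into pseudo-probabilities and the argmax gives the final prediction.
import numpy as np

def aggregate_tta_probs(probs):
    """probs: array of shape (augment_times, n_samples, n_classes)."""
    pseudo_probs = probs.mean(axis=0)          # average over the augmentation axis
    predictions = pseudo_probs.argmax(axis=1)  # most likely class per sample
    return pseudo_probs, predictions

# example: 3 augmentations, 2 samples, 4 classes (random numbers, then normalised)
rng = np.random.default_rng(0)
p = rng.random((3, 2, 4))
pseudo, pred = aggregate_tta_probs(p / p.sum(axis=2, keepdims=True))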
def main():
    config = yaml.safe_load(open("config.yaml", 'r'))

    # =========================================== #
    # =============== PREPARE DATA ============== #
    # =========================================== #
    train_x, train_y, val_x, val_y = get_data(config=config)
    train_generator = DataGenerator(images=train_x, labels=train_y, config=config, gen_type='train')
    val_generator = DataGenerator(images=val_x, labels=val_y, config=config, gen_type='val')

    # =========================================== #
    # =============== CREATE MODEL ============== #
    # =========================================== #
    model = GoftNet(config=config)

    # =========================================== #
    # =============== TRAIN MODEL =============== #
    # =========================================== #
    model.train(train_data=train_generator, val_data=val_generator)
def main(args): os.environ["CUDA_VISIBLE_DEVICES"]=args.gpu_id print('selecting gpu :', args.gpu_id) create_folders(args.path_results) dataset_distribution_path = join(args.path_results,'dataset_distribution.csv') if os.path.exists(dataset_distribution_path): print('{} loaded!'.format(dataset_distribution_path)) dataset_distribution = pd.read_csv(dataset_distribution_path) train, val, test = dataset_distribution['train'], dataset_distribution['val'], dataset_distribution['test'] else: train, val, test = get_paths(args.path_train) dataset_distribution = list(zip(train, val ,test)) dataset_distribution = pd.DataFrame(dataset_distribution, columns = ['train', 'val' ,'test']) dataset_distribution.to_csv(dataset_distribution_path, index = False, header=True) train_generator = DataGenerator(list_IDs = train , batch_size=args.batch,dim=(args.size,args.size), n_channels=3, n_classes=args.classes,norm=args.norm, transformations=True, shuffle=True) validation_generator = DataGenerator(list_IDs = val , batch_size=args.batch, dim=(args.size,args.size), n_channels=3,norm=args.norm, n_classes=args.classes) model = get_model(args=args,initial_lr=0.0001) history = start_train(args,model,train_generator,validation_generator) if args.no_plot==True: save_training_graph(history,args.path_results)
def get_train_validation_generator(self, validation_percentage=0.15, sessions_per_batch=256, class_weights=[]):
    # return the generator for the train and optionally the one for validation (set to 0 to skip validation)
    # if sessions_per_batch == 'auto':
    #     sessions_per_batch = self._get_auto_samples_per_batch()
    self.class_weights = class_weights
    tot_sessions = int(self.train_len / self.rows_per_sample)
    #tot_batches = math.ceil(tot_sessions / sessions_per_batch)
    number_of_validation_sessions = int(tot_sessions * validation_percentage)
    number_of_train_sessions = tot_sessions - number_of_validation_sessions
    train_rows = number_of_train_sessions * self.rows_per_sample
    #batches_in_train = math.ceil(number_of_train_sessions / sessions_per_batch)
    #batches_in_val = tot_batches - batches_in_train

    print('Train generator:')
    train_gen = DataGenerator(self, pre_fit_fn=self.prefit_xy, rows_to_read=train_rows)
    #train_gen.name = 'train_gen'
    print('Validation generator:')
    val_gen = DataGenerator(self, pre_fit_fn=self.prefit_xy, skip_rows=train_rows)
    #val_gen.name = 'val_gen'
    return train_gen, val_gen
def example_generator():
    train_gen = DataGenerator(
        data_path="/Users/anqin/Desktop/GSM1865005_allC.MethylC-seq_WT_rods_rep1.tsv/data.sort.clean.bed",
        ref_fasta="/Users/anqin/Desktop/GSM1865005_allC.MethylC-seq_WT_rods_rep1.tsv/GRCm38.primary_assembly.genome.fa.gz",
        genome_size_file="./mm10.genome.size",
        epi_track_files=[
            "/Users/anqin/Desktop/GSM1865005_allC.MethylC-seq_WT_rods_rep1.tsv/AM_R2_allChr_CpG_noL.txt"
        ],
        tasks=["TARGET"],
        upsample=False)
    valid_gen = DataGenerator(
        data_path="/Users/anqin/Desktop/GSM1865005_allC.MethylC-seq_WT_rods_rep1.tsv/data_valid_sort.clean.bed",
        ref_fasta="/Users/anqin/Desktop/GSM1865005_allC.MethylC-seq_WT_rods_rep1.tsv/GRCm38.primary_assembly.genome.fa.gz",
        genome_size_file="./mm10.genome.size",
        epi_track_files=[
            "/Users/anqin/Desktop/GSM1865005_allC.MethylC-seq_WT_rods_rep1.tsv/AM_R2_allChr_CpG_noL.txt"
        ],
        tasks=["TARGET"],
        upsample=False)

    one_filter_keras_model = Sequential()
    one_filter_keras_model.add(
        Conv2D(filters=5, kernel_size=(1, 15), padding="same", input_shape=(1, 1000, 5)))
    one_filter_keras_model.add(BatchNormalization(axis=-1))
    one_filter_keras_model.add(Activation('relu'))
    one_filter_keras_model.add(MaxPooling2D(pool_size=(1, 35)))
    one_filter_keras_model.add(Flatten())
    one_filter_keras_model.add(Dense(1))
    one_filter_keras_model.add(Activation("sigmoid"))
    one_filter_keras_model.summary()
    one_filter_keras_model.compile(optimizer='adam', loss='binary_crossentropy')

    #metrics_callback = MetricsCallback(train_data=(train_X, train_Y),
    #                                   validation_data=(valid_X, valid_Y))
    #print(one_filter_keras_model.get_weights())

    history_regression = one_filter_keras_model.fit_generator(
        train_gen,
        validation_data=valid_gen,
        steps_per_epoch=500,
        validation_steps=100,
        epochs=150,
        verbose=1,
        use_multiprocessing=False,
        workers=1,
        max_queue_size=50,
        callbacks=[History()])
def main():
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('--train-txt', type=str, required=True)
    parser.add_argument('--test-txt', type=str, required=True)
    parser.add_argument('--save-dir', type=str, required=True)
    parser.add_argument('--num-bins', type=int, required=True)
    parser.add_argument('--lr', type=float, required=False, default=0.001)
    parser.add_argument('--batch-size', type=int, required=False, default=50)
    parser.add_argument('--epochs', type=int, required=False, default=10)
    parser.add_argument('--data-dir', type=str, required=True)
    parser.add_argument('--shape', type=int, required=True, nargs=3, help="height width channels")
    parser.add_argument('--message', type=str, required=True)
    args = parser.parse_args()

    data_dir = args.data_dir
    image_dir = os.path.join(data_dir, "images/")
    anno_dir = os.path.join(data_dir, "annotations/")
    train_path = args.train_txt
    test_path = args.test_txt

    # Load list of image names for train and test
    raw_train = load_dataset(train_path)
    raw_test = load_dataset(test_path)

    # Create train and test generators
    num_bins = args.num_bins
    batch_size = args.batch_size
    train_gen = DataGenerator(batch_size=batch_size, data_set=raw_train[:200],
                              image_dir=image_dir, anno_dir=anno_dir,
                              preprocess_fn=preprocess_normalize_images_bin_annos,
                              prepare_batch_fn=prepare_batch_images_and_labels)
    test_gen = DataGenerator(batch_size=batch_size, data_set=raw_test[:50],
                             image_dir=image_dir, anno_dir=anno_dir,
                             preprocess_fn=preprocess_normalize_images_bin_annos,
                             prepare_batch_fn=prepare_batch_images_and_labels)

    # Kick-off
    #name = args.name
    save_dir = args.save_dir
    epochs = args.epochs
    in_shape = args.shape
    lr = args.lr
    classes = [i for i in range(num_bins)]
    message = args.message
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    best_ckpt = "must have crashed during training :-("
    save_config(save_dir, data_dir, num_bins, lr, batch_size, epochs, in_shape, best_ckpt, message)
    car_brain = Model(in_shape, classes=classes)
    best_ckpt = car_brain.train(train_gen, test_gen, save_dir, epochs=epochs)
def train():
    # Reading train and test csv file
    train_df = pd.read_csv(os.path.join(PATH, TRAIN_CSV))
    test_df = pd.read_csv(os.path.join(PATH, TEST_CSV))
    print(f"train shape : {train_df.shape} and test shape : {test_df.shape}")

    train_generator = DataGenerator(train_df, BATCH_SIZE, input_size=INPUT_SIZE, path='', is_valid=False)
    valid_generator = DataGenerator(test_df, BATCH_SIZE * 2, input_size=INPUT_SIZE, path='', is_valid=True)

    # Initialize Model
    print("Loading Model ...")
    model = segmentation_model(input_shape=(INPUT_SIZE, INPUT_SIZE, 3))
    # print(model.summary(110))
    learning_rate = 0.001
    adam = optimizers.Adam(lr=learning_rate)
    model.compile(optimizer=adam, loss=bce_dice_loss, metrics=[IOU])

    cbks = [
        ModelCheckpoint(f"./weights/{WEIGHT_FILENAME}", monitor='val_loss', verbose=1,
                        save_best_only=True, mode='min'),
        ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3, verbose=1,
                          mode='min', min_delta=0.0001, min_lr=1e-5),
        EarlyStopping(monitor='val_loss', patience=5, verbose=1, restore_best_weights=False)
    ]

    model.fit(train_generator,
              steps_per_epoch=len(train_generator),
              epochs=50,
              verbose=1,
              callbacks=cbks,
              validation_data=valid_generator,
              validation_steps=len(valid_generator),
              shuffle=False,
              workers=multiprocessing.cpu_count())
def main(env_var):
    logger.info(env_var)
    img_size, epochs, mask_channels, mask_type, metric_sel, loss_sel, freezed_layers = _parse_env_var(env_var)
    if env_var['--VGG16']:
        if img_size is None:
            img_size = (224, 224)
        model_class = vgg16_unet.VGG16Unet
    elif env_var['--InceptionV3']:
        if img_size is None:
            img_size = (299, 299)
        model_class = inception_v3_unet.InceptionV3Unet
    else:
        return None

    image_encoding = read_csv_encoding(length=1280)
    # ids = image_encoding.keys()
    ids = balanced_ids(image_encoding)

    training_gen = DataGenerator(ids=ids[:640], img_encodings=image_encoding,
                                 mask_type=mask_type, out_dim_img=img_size,
                                 classification=env_var['--Classification'])
    validati_gen = DataGenerator(ids=ids[640:680], img_encodings=image_encoding,
                                 mask_type=mask_type, out_dim_img=img_size,
                                 classification=env_var['--Classification'])

    model = model_class(img_size=img_size, classification=env_var['--Classification'],
                        skip_connections=env_var['--SkipConnections'],
                        mask_channels=mask_channels)
    model.set_net()
    model.freeze_encoder_blocks(depth=freezed_layers)
    model.compile(loss=loss_sel, metrics=metric_sel)
    model.neural_net.summary(print_fn=logger.info)
    model.fit(training_generator=training_gen, validation_generator=validati_gen,
              epochs=epochs, ref=TIME)

    predicti_gen = DataGenerator(ids=ids[:8], img_encodings=image_encoding,
                                 mask_type=mask_type, out_dim_img=img_size,
                                 classification=env_var['--Classification'], shuffle=False)
    model.predict(pred_generator=predicti_gen)
def DoTrain(train_list, val_list):
    # parameters
    train_batchsize = 4
    val_batchsize = 1
    class_num = 2
    epochs_num = 30
    initial_epoch_num = 0

    # when train a new model -----------------------------------------
    #model = new_models.ADSNet_Plain()
    #model = new_models.ADSNet_W()
    #model = new_models.ADSNet_O()
    model = new_models.ADSNet()
    #model = new_models.StepDeep_model()
    # print(model.summary())

    dt_now = datetime.datetime.now().strftime('%Y%m%d%H%M')
    print(dt_now)
    adam = optimizers.Adam(lr=0.0001)
    model.compile(loss=weight_loss, optimizer=adam, metrics=[POD, FAR, TS, binary_acc])
    modelfilename = "%s-%s-{epoch:02d}.hdf5" % (dt_now, model.name)
    global modelrecordname
    modelrecordname = dt_now + '_' + model.name
    checkpoint = ModelCheckpoint(modelfileDir + modelfilename, monitor='val_loss',
                                 verbose=1, save_best_only=False, mode='min')

    train_gen = DataGenerator(train_list, train_batchsize, class_num, generator_type='train')
    val_gen = DataGenerator(val_list, val_batchsize, class_num, generator_type='val')
    RMAE = RecordMetricsAfterEpoch()
    model.fit_generator(
        train_gen,
        validation_data=val_gen,
        epochs=epochs_num,
        initial_epoch=initial_epoch_num,
        # use_multiprocessing=True,
        workers=3,
        max_queue_size=50,
        callbacks=[RMAE, checkpoint])
def main():
    labels = json.load(open(os.path.join('data', 'labels.json')))
    partition = {'training': None, 'validation': None}
    for x in partition.keys():
        partition[x] = [
            f for f in os.listdir(os.path.join('data', x))
            if os.path.isfile(os.path.join(os.path.join('data', x), f))
        ]
        partition[x].sort()
    print('Indices read.')

    n_classes = len({labels[x] for x in labels})
    l = {labels[x] for x in labels}
    l = {x: i for i, x in enumerate(sorted(list(l)))}
    labels = {x: l[labels[x]] for x in labels.keys()}
    json.dump(l, open('mapping.json', 'w'))
    print('Mappings written.')

    training_generator = DataGenerator(partition['training'], 'training', labels, 28, 1, n_classes, True, True)
    validation_generator = DataGenerator(partition['validation'], 'validation', labels, 28, 1, n_classes, True, True)

    model = None
    with tf.device('/cpu:0'):
        model = FullNetwork.model()
    if os.path.exists('weights.h5'):
        model.load_weights('weights.h5')
    initial_epoch = 0
    if os.path.exists('epochs.json'):
        initial_epoch = len(json.load(open('epochs.json')).keys())
    cbk = SaveCallback(model)

    parallel_model = multi_gpu_model(model, gpus=2)
    parallel_model.compile(optimizer='adadelta',
                           loss={
                               'color_model': 'mean_squared_error',
                               'clf_model': 'categorical_crossentropy'
                           },
                           metrics={
                               'color_model': 'accuracy',
                               'clf_model': 'accuracy'
                           })
    parallel_model.fit_generator(generator=training_generator,
                                 epochs=1000,
                                 verbose=1,
                                 callbacks=[cbk],
                                 validation_data=validation_generator,
                                 use_multiprocessing=True,
                                 workers=4,
                                 initial_epoch=initial_epoch)
    print('Training done.')
def example_generator():
    separate_dataset("regions_for_learning_with_head.clean.equal_size.bed", ["chr1"], "valid.bed")
    separate_dataset("regions_for_learning_with_head.clean.equal_size.bed", ["chr2", "chr19"], "test.bed")
    separate_dataset("regions_for_learning_with_head.clean.equal_size.bed", [
        "chr3", "chr4", "chr5", "chr6", "chr7", "chr8", "chr9", "chr10", "chr11",
        "chr12", "chr13", "chr14", "chr15", "chr16", "chr17", "chr18", "chr20",
        "chr21", "chr22"
    ], "train.bed")

    train_gen = DataGenerator(
        data_path="train.bed",
        ref_fasta="../GSM1865005_allC.MethylC-seq_WT_rods_rep1.tsv/GRCm38.primary_assembly.genome.fa.gz",
        genome_size_file="./mm10.genome.size",
        epi_track_files=["MethylC-seq_WT_cones_rep1_CpG.clean.plus.sorted.bw"],
        tasks=["TARGET"],
        upsample=True,
        upsample_ratio=0.3)
    valid_gen = DataGenerator(
        data_path="valid.bed",
        ref_fasta="../GSM1865005_allC.MethylC-seq_WT_rods_rep1.tsv/GRCm38.primary_assembly.genome.fa.gz",
        genome_size_file="./mm10.genome.size",
        epi_track_files=["MethylC-seq_WT_cones_rep1_CpG.clean.plus.sorted.bw"],
        tasks=["TARGET"],
        upsample=True,
        upsample_ratio=0.3)

    model = initialize_model()
    trainning_history = model.fit_generator(
        train_gen,
        validation_data=valid_gen,
        steps_per_epoch=5000,
        validation_steps=500,
        epochs=10,
        verbose=1,
        use_multiprocessing=False,
        workers=4,
        max_queue_size=50,
        callbacks=[
            History(),
            ModelCheckpoint("ATAC_peak_Classification_positive_constrain.h5",
                            monitor='val_loss', verbose=1, save_best_only=True, mode='min')
        ])
def train():
    # Reading train and test csv file
    train_df = pd.read_csv(os.path.join(PATH, TRAIN_CSV))
    test_df = pd.read_csv(os.path.join(PATH, TEST_CSV))
    train_df, test_df = str_to_list(train_df), str_to_list(test_df)
    train_df['pts'] = train_df.apply(lambda x: combine_list(x.pts_x, x.pts_y), axis=1)
    test_df['pts'] = test_df.apply(lambda x: combine_list(x.pts_x, x.pts_y), axis=1)
    train_df.pts = train_df.pts.apply(lambda x: correction(x))
    test_df.pts = test_df.pts.apply(lambda x: correction(x))
    print(f"train shape : {train_df.shape} and test shape : {test_df.shape}")

    train_generator = DataGenerator(train_df, BATCH_SIZE, path=os.path.join(PATH, TRAIN_FOLDER), is_valid=False)
    test_generator = DataGenerator(test_df, BATCH_SIZE * 2, path=os.path.join(PATH, TEST_FOLDER), is_valid=True)

    # Initialize Model
    print("Loading Model ...")
    model = KeypointModel()
    print(model.summary(110))
    learning_rate = 0.001
    adam = optimizers.Adam(lr=learning_rate)
    model.compile(optimizer=adam, loss='mae', metrics=['mse'])

    cbks = [ModelCheckpoint(f"./weights/{WEIGHT_FILENAME}", monitor='val_loss', verbose=1,
                            save_best_only=True, mode='min'),
            ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3, verbose=1,
                              mode='min', min_delta=0.0001, min_lr=1e-5),
            EarlyStopping(monitor='val_loss', patience=5, verbose=1, restore_best_weights=False)]

    model.fit_generator(
        generator=train_generator,
        steps_per_epoch=len(train_generator),
        epochs=50,
        verbose=1,
        callbacks=cbks,
        validation_data=test_generator,
        validation_steps=len(test_generator))
def prediction_and_evaluation():
    from tensorflow.python.keras.models import load_model

    model = initialize_model()
    model.load_weights("ATAC_peak_Classification_positive_constrain.h5")

    # Get predictions on the test set
    test_gen = DataGenerator(
        data_path="test.bed",
        ref_fasta="../GSM1865005_allC.MethylC-seq_WT_rods_rep1.tsv/GRCm38.primary_assembly.genome.fa.gz",
        genome_size_file="./mm10.genome.size",
        epi_track_files=["MethylC-seq_WT_cones_rep1_CpG.clean.plus.sorted.bw"],
        tasks=["TARGET"],
        upsample=False)
    model_predictions = model.predict_generator(test_gen,
                                                workers=4,
                                                use_multiprocessing=False,
                                                verbose=1)
    model_predictions_bool = model_predictions > 0.5
    test_db_observed = get_labels_from_target_files("test.bed", ["TARGET"])
    print(ClassificationResult(test_db_observed, model_predictions_bool))
def main():
    # env
    env_path = find_dotenv()
    load_dotenv(dotenv_path=env_path, verbose=True)
    processed_p = Path(os.environ.get('PATH_PROCESSED')).resolve()
    models_p = Path(os.environ.get('PATH_MODELS')).resolve()
    img_h = int(os.environ.get('IMAGE_HEIGHT'))
    img_w = int(os.environ.get('IMAGE_WIDTH'))
    batch_size = int(os.environ.get('BATCH_SIZE'))
    downsample_factor = int(os.environ.get('DOWNSAMPLE_FACTOR'))
    min_lr = float(os.environ.get('MIN_LEARNING_RATE'))
    max_lr = float(os.environ.get('MAX_LEARNING_RATE'))
    # according to how Keras' multi_gpu_model() handles mini-batches

    # logging
    logging.root.removeHandler(absl.logging._absl_handler)
    absl.logging._warn_preinit_stderr = False
    logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
    logger = logging.getLogger(__name__)
    logger.info('TensorFlow version: ' + tf.__version__)
    logger.info('Keras version: ' + tf.keras.__version__)

    # parameters
    train_p = processed_p.joinpath('train')
    assert train_p.exists()

    # generators
    logger.info('loading data')
    train_gen = DataGenerator(train_p, img_w, img_h, batch_size, downsample_factor)
    max_text_len = train_gen.max_text_len
    logger.info('alphabet: \'' + str(train_gen.alphabet) + '\'')
    logger.info('alphabet size: ' + str(len(train_gen.alphabet)))
    logger.info('max text length: ' + str(max_text_len))
    logger.info('image shape: height=' + str(img_h) + ' width=' + str(img_w))
    logger.info('batch size: ' + str(batch_size))
    logger.info('output size: ' + str(train_gen.output_size))
    logger.info('training samples: ' + str(train_gen.n))
    logger.info('train steps per epoch: ' + str(len(train_gen)))
    logger.info('min. learning-rate: ' + str(min_lr))
    logger.info('max. learning-rate: ' + str(max_lr))

    # create model
    model = OCRNet(train_gen.output_size, img_w, img_h, max_text_len)
    model.summary()

    # find best learning rate
    # initialize optimizer
    adam = Adam(lr=min_lr, beta_1=0.9, beta_2=0.999, epsilon=1e-08)

    # compile model
    # the loss calc occurs elsewhere, so use a dummy lambda func for the loss
    model.compile(loss={'ctc': lambda y_true, y_pred: y_pred}, optimizer=adam, metrics=['accuracy'])

    lrf = LRFinder(model)
    lrf.find(train_gen, min_lr, max_lr, stepsPerEpoch=len(train_gen), batchSize=batch_size)

    # plot the loss for the various learning rates and save the
    # resulting plot to disk
    if not models_p.exists():
        models_p.mkdir()
    lrf.plot_loss(models_p.joinpath('loss_plot.png'), title='loss')
    lrf.plot_loss_change(models_p.joinpath('loss_change_plot.png'), title='loss change')

    # in the config and then train the network for our full set of
    logger.info('learning rate finder complete')
    logger.info('best LR: %f' % lrf.get_best_lr())
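# Hedged sketch of why the dummy lambda loss above works (this is a common Keras
# CTC pattern, not the actual OCRNet code): the model's 'ctc' output is itself the
# CTC loss computed inside the graph, typically via a Lambda layer around
# K.ctc_batch_cost, so the compiled loss only has to pass that value through.
from tensorflow.keras import backend as K

def ctc_lambda(args):
    # args = (y_pred, labels, input_length, label_length), all Keras tensors
    y_pred, labels, input_length, label_length = args
    return K.ctc_batch_cost(labels, y_pred, input_length, label_length)

# Inside the model definition one would typically wire it up like:
#   ctc_out = Lambda(ctc_lambda, output_shape=(1,), name='ctc')(
#       [softmax_out, labels_in, input_len_in, label_len_in])
#   model.compile(loss={'ctc': lambda y_true, y_pred: y_pred}, optimizer=adam)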
def evaluate_dir_random_rotation(args):
    '''
    Evaluates an image directory, randomly rotating images on-the-fly
    '''
    images = glob.glob(os.path.join(args['image_dir'], "*.jpg"))

    # Creating test generator
    test_gen = DataGenerator(
        images,
        rotate=True,
        preprocess_function=preprocess_input,
        shuffle=True,
        show_intermediate=False,
        batch_size=args['batch_size'],
        dim=args['img_size'],
        regress=args['regress']
    )

    # Loading model
    if args['regress']:
        model = load_model(args['model_dir'], custom_objects={"angle_loss_regress": angle_loss_regress})
    else:
        model = load_model(args['model_dir'], custom_objects={"angle_loss": angle_loss})

    # Running evaluation
    out = model.evaluate(
        test_gen,
        steps=int(len(images) / args['batch_size'])
    )
    print(f"Test Loss: {out[0]} ; Angle Loss: {out[1]}")
def fitModel(model, input_size, categorical, trainDb, trainPaths, trainAge, trainGender,
             testDb, testPaths, testAge, testGender, epoch, batch_size, num_worker,
             callbacks, GPU):
    return model.fit_generator(
        DataGenerator(model, trainDb, trainPaths, trainAge, trainGender, batch_size, input_size, categorical),
        validation_data=DataGenerator(model, testDb, testPaths, testAge, testGender, batch_size, input_size, categorical),
        epochs=epoch,
        verbose=2,
        steps_per_epoch=len(trainAge) // (batch_size * GPU),
        validation_steps=len(testAge) // (batch_size * GPU),
        workers=num_worker,
        use_multiprocessing=True,
        max_queue_size=int(batch_size * 2),
        callbacks=callbacks)
def process(src, out, suffix):
    print 'compile', src, 'to', out
    chips = []
    # load the source file
    f = open(src, 'r')
    for s in f:
        s = s.strip()
        l = len(s)
        if l == 0:
            continue
        if l == 1 and (s[0] == '\n' or s[0] == '\r'):
            continue
        if s[0] == '#':
            continue
        if s[l - 1] == '\n':
            s = s[:l - 1]
        if s.startswith('CHIP['):
            chip = Chip()
            load_line(chip, s)
            chips.append(chip)
        else:
            load_line(chips[len(chips) - 1], s)
    f.close()

    g = DataGenerator(suffix)
    for chip in chips:
        #chip.show()
        compile_chip(chip, g)
    g.generate(out)

    print '-------------[Chips]--------------------'
    for chip in chips:
        print chip.name.decode('cp1251').encode('utf8')
    print '----------------------------------------'
    print 'Total chips: ', len(chips)
    print 'Data size: ', g.size
def get_test_generator(self, sessions_per_batch=256):
    # return the generator for the test
    #def prefit(Xchunk_df, index):
    #    """ Preprocess a chunk of the sequence dataset """
    #    #Xchunk_df = self._preprocess_x_df(Xchunk_df, partial=True)
    #    return Xchunk_df

    return DataGenerator(self, for_train=False)  #, pre_fit_fn=prefit)
def initialize_data():
    global gh_scraper, generator, logger
    # scraping COVID-19 data
    gh_scraper.scrape()
    reports, countries = gh_scraper.cache, gh_scraper.valid_countries
    dates = process_dates(reports)
    data = process_data(reports, countries)
    generator = DataGenerator(dates, data, countries)
def __init__(self, modelname):
    self.modelname = modelname
    params = {'dim': (29, 29),
              'batch_size': 1024,
              'n_classes': 2,
              'n_channels': 1,
              'shuffle': True}

    Config.DATAPATH = 'data/train/Fuzzy/'
    data = os.listdir(Config.DATAPATH)
    data.remove('labels.npy')
    labels = np.load(Config.DATAPATH + "labels.npy")
    data_train = data[:int(len(data) / 10 * 8)]
    data_valid = data[int(len(data) / 10 * 8):]
    self.gen_train = DataGenerator(data_train, labels, **params)
    self.gen_valid = DataGenerator(data_valid, labels, **params)

    params['shuffle'] = False
    Config.DATAPATH = 'data/test/Fuzzy/'
    data_test = os.listdir(Config.DATAPATH)
    data_test.remove('labels.npy')
    data_test = data_test[int(len(data_test) / 10 * 8):]
    labels_test = np.load(Config.DATAPATH + "labels.npy")
    self.gen_test = DataGenerator(data_test, labels_test, **params)

    self.model = tf.keras.Sequential([
        tf.keras.layers.Conv2D(32, (3, 3), activation='relu', input_shape=(29, 29, 1)),
        tf.keras.layers.MaxPooling2D((2, 2)),
        tf.keras.layers.Conv2D(64, (3, 3), activation='relu'),
        tf.keras.layers.MaxPooling2D((2, 2)),
        tf.keras.layers.Conv2D(64, (3, 3), activation='relu'),
        tf.keras.layers.Flatten(),
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dense(1, activation='sigmoid'),
    ])
    self.model.compile(
        optimizer='adam',
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )
def get_train_validation_generator(self, validation_percentage=0.15, sessions_per_batch=256, class_weights=[]):
    # return the generator for the train and optionally the one for validation (set to 0 to skip validation)
    # if sessions_per_batch == 'auto':
    #     sessions_per_batch = self._get_auto_samples_per_batch()

    def prefit(Xchunk_df, Ychunk_df, index):
        """ Preprocess a chunk of the sequence dataset """
        #Xchunk_df = self._preprocess_x_df(Xchunk_df, partial=True)
        #Ychunk_df = self._preprocess_y_df(Ychunk_df)
        if len(class_weights) > 0:
            # weight only the last interaction (clickout item) by the class_weight
            weights = np.zeros(Xchunk_df.shape[:2])
            weights[:, -1] = Ychunk_df[:, -1, :] @ class_weights
            return Xchunk_df, Ychunk_df, weights
        else:
            return Xchunk_df, Ychunk_df

    tot_sessions = int(self.train_len / self.rows_per_sample)
    #tot_batches = math.ceil(tot_sessions / sessions_per_batch)
    number_of_validation_sessions = int(tot_sessions * validation_percentage)
    number_of_train_sessions = tot_sessions - number_of_validation_sessions
    train_rows = number_of_train_sessions * self.rows_per_sample
    #batches_in_train = math.ceil(number_of_train_sessions / sessions_per_batch)
    #batches_in_val = tot_batches - batches_in_train

    print('Train generator:')
    train_gen = DataGenerator(self, pre_fit_fn=prefit, rows_to_read=train_rows)
    #train_gen.name = 'train_gen'
    print('Validation generator:')
    val_gen = DataGenerator(self, pre_fit_fn=prefit, skip_rows=train_rows)
    #val_gen.name = 'val_gen'
    return train_gen, val_gen
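# Hedged sketch of the class-weighting line in prefit() above: with one-hot
# labels, the matrix product Y[:, -1, :] @ class_weights simply looks up the
# weight of each session's true class at the last (clickout) timestep.
# The shapes and numbers below are made up for illustration.
import numpy as np

class_weights = np.array([1.0, 5.0])            # e.g. up-weight the rarer class
Ychunk = np.zeros((3, 4, 2))                    # 3 sessions, 4 timesteps, 2 classes (one-hot)
Ychunk[0, -1, 0] = 1                            # session 0 ends on class 0
Ychunk[1, -1, 1] = 1                            # session 1 ends on class 1
Ychunk[2, -1, 1] = 1

weights = np.zeros(Ychunk.shape[:2])            # one weight per session and timestep
weights[:, -1] = Ychunk[:, -1, :] @ class_weights
print(weights[:, -1])                           # -> [1. 5. 5.]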
def train(self):
    self.load_weights()
    train_gen = DataGenerator(self.ddir + '/train', self.image_size, self.batch_size, train=True)
    dev_gen = DataGenerator(self.ddir + '/dev', self.image_size, self.batch_size)
    checkpoint_callback = ModelCheckpoint(os.path.join(self.wdir, 'weights.h5'),
                                          save_best_only=True, verbose=1)
    earlystopping_callback = EarlyStopping(verbose=1, patience=5)
    callbacks = [checkpoint_callback, earlystopping_callback]
    self.vae.fit_generator(train_gen,
                           validation_data=dev_gen,
                           epochs=999,
                           shuffle='batch',
                           callbacks=callbacks,
                           verbose=1)
def load_generators(data_dir):
    # Parameters
    params = {'dim': (96, 96),
              'batch_size': 100,
              'n_classes': 2,
              'n_channels': 3,
              'shuffle': True}

    # Data
    data = pd.read_csv(data_dir + 'train_labels.csv')
    train, val = train_test_split(data, test_size=0.1, random_state=42)
    partition = {"train": list(train['id']), "validation": list(val['id'])}
    labels = dict(zip(data['id'], data['label']))
    train_dir = data_dir + "train/"

    # Generators
    train_gen = DataGenerator(partition['train'], labels, train_dir, **params)
    val_gen = DataGenerator(partition['validation'], labels, train_dir, **params)

    return train_gen, val_gen
def password_probability(self, password):
    """
    Calculate the probability of a given password.

    This works by determining the product of the individual probabilities of a
    given character conditional on the appearance of the preceding characters.

    Parameters
    ----------
    password : str
        The password whose probability is to be calculated.

    Returns
    -------
    float
        The probability of the password.
    """
    # tokenize the password
    token = self.tokenizer.texts_to_sequences([password])[0]
    x_test = DataGenerator.slide_window(token)
    x_test = np.array(x_test)
    y_test = np.array(token) - 1  # shift token ids to zero-based class indices

    # determine the probabilities of the permutations of the characters
    probabilities = self.model.predict(x_test, verbose=0)

    # multiply all of the conditional probabilities together in the password
    password_probability = 0
    for index, probability in enumerate(probabilities):
        char_probability = probability[y_test[index]]  # get the probability from the model
        password_probability += np.log(char_probability)  # use log to avoid roundoff errors

    # calculate the perplexity to account for varying password lengths
    password_length = len(password)
    password_probability /= -password_length
    password_probability = np.exp(password_probability)  # recover the raw probability

    return password_probability
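# A small NumPy check (illustrative numbers only) of the length normalisation used
# in password_probability() above: summing the log character probabilities,
# dividing by -length and exponentiating yields the per-character perplexity,
# i.e. the reciprocal of the geometric mean of the conditional probabilities.
import numpy as np

char_probs = np.array([0.2, 0.5, 0.1])                # hypothetical P(c_i | c_<i)
log_sum = np.log(char_probs).sum()
perplexity = np.exp(log_sum / -len(char_probs))
geometric_mean = char_probs.prod() ** (1 / len(char_probs))
assert np.isclose(perplexity, 1 / geometric_mean)     # two views of the same quantity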
def main():
    # env
    env_path = find_dotenv()
    load_dotenv(dotenv_path=env_path, verbose=True)
    processed_p = Path(os.environ.get('PATH_PROCESSED')).resolve()
    models_p = Path(os.environ.get('PATH_MODELS')).resolve()
    img_h = int(os.environ.get('IMAGE_HEIGHT'))
    img_w = int(os.environ.get('IMAGE_WIDTH'))
    batch_size = int(os.environ.get('BATCH_SIZE'))
    downsample_factor = int(os.environ.get('DOWNSAMPLE_FACTOR'))
    lr = float(os.environ.get('LEARNING_RATE'))

    # logging
    logging.root.removeHandler(absl.logging._absl_handler)
    absl.logging._warn_preinit_stderr = False
    logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
    logger = logging.getLogger(__name__)
    logger.info('TensorFlow version: ' + tf.__version__)
    logger.info('Keras version: ' + tf.keras.__version__)

    # parameters
    test_p = processed_p.joinpath('test')
    assert test_p.exists()

    logger.info('load data')
    test_gen = DataGenerator(test_p, img_w, img_h, batch_size, downsample_factor)
    alphabet = test_gen.alphabet
    logger.info('image shape: height=' + str(img_h) + ' width=' + str(img_w))
    logger.info('batch size: ' + str(batch_size))
    logger.info('test samples: ' + str(test_gen.n))
    logger.info('test steps per epoch: ' + str(len(test_gen)))
    logger.info('learning rate: ' + str(lr))

    # model
    checkpoint_p = models_p.joinpath('model.h5')
    assert checkpoint_p.exists()
    model = load_model(str(checkpoint_p), compile=False)
    model.summary()
    logger.info('model loaded')

    # optimizer
    adam = Adam(lr=lr, beta_1=0.9, beta_2=0.999, epsilon=1e-08)

    # the loss calc occurs elsewhere, so use a dummy lambda func for the loss
    model.compile(loss={'ctc': lambda y_true, y_pred: y_pred},
                  optimizer=adam,
                  metrics=['accuracy'])
    logger.info('model compiled')

    # test data
    score = model.evaluate_generator(generator=test_gen, steps=len(test_gen), verbose=1)
    logger.info('loss %.3f accuracy: %.3f' % (score[0], score[1]))
def encode(self):
    self.load_weights()
    for type in ['test', 'dev', 'train']:
        print('Encoding {0}'.format(type))
        dpath = os.path.join(self.ddir, type)
        spath = os.path.join(self.wdir, type + '_encodings.h5')
        gen = DataGenerator(dpath, self.image_size, self.batch_size)
        z = self.encoder.predict_generator(gen, verbose=1)
        class_dict = {v: k for k, v in gen.generator.class_indices.items()}
        labels = [class_dict[x] for x in gen.generator.classes]
        with h5py.File(spath, 'w') as f:
            f.create_dataset('encodings', data=z)
            f.create_dataset('filenames', data=np.array(gen.generator.filenames, dtype='S'))
            f.create_dataset('labels', data=np.array(labels, dtype='S'))
def gen_data():
    params = {
        'dim': (Config.NUM_ID, 2 * Config.NUM_INTVL),
        'batch_size': 64,
        'n_classes': 2,
        'n_channels': 1,
        'shuffle': True
    }
    Config.DATAPATH = 'data/test/'
    make_dataset("DoS_variation.csv")
    data = os.listdir(Config.DATAPATH)
    data.remove('labels.npy')
    data = data[int(len(data) / 10 * 8.5):]
    labels = np.load(Config.DATAPATH + "labels.npy")
    gen_test = DataGenerator(data, labels, **params)
    return gen_test
def main(_):
    # initialize settings
    settings = Settings(FLAGS.data_path, FLAGS.data_file, FLAGS.images_path,
                        FLAGS.batch_size, FLAGS.epochs, FLAGS.learning_rate,
                        FLAGS.epoch_sample_size)
    # print settings
    print(settings)
    # load, consolidate and augment data
    data_loader = DataLoader(settings)
    # data generator for train and test
    data_generator = DataGenerator(settings, data_loader)
    # setup & fit model
    DataModel(settings).fit(data_generator)
    print("Finally, done!")
def main(args):
    os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu_id
    print('selecting gpu :', args.gpu_id)
    create_folders(args)
    model = get_model(args=args, weights=None)
    model.load_weights(join(args.model_dir))
    print('model:', args.model_dir)
    model = fit_model(model=model)

    path = args.path_test
    path_image = join(path, 'images')
    path_mask = join(path, 'masks')
    list_image = glob(os.path.join(path_image, '*.tif'))
    list_mask = glob(os.path.join(path_mask, '*.tif'))
    min_dataset_values, max_dataset_values = 0, 255  #get_min_max(args.path_train)

    df_path_test = join(path, 'dataframe_test_dataset_{}_standarizate.csv'.format(args.size))
    if os.path.exists(df_path_test):
        print('{} loaded!'.format(df_path_test))
        df_test = pd.read_csv(df_path_test)
    else:
        print('{} saved!'.format(df_path_test))
        df_test = get_coordinates(paths=list_mask, out_size=(args.size, args.size),
                                  stride=0.8, classes=args.classes,
                                  stride_minor_class=0.1)
        df_test.to_csv(df_path_test, index=False, header=True)

    test_generator = DataGenerator(list_IDs=df_test, batch_size=args.batch,
                                   dim=(args.size, args.size), n_channels=3,
                                   n_classes=args.classes,
                                   min_max=(min_dataset_values, max_dataset_values),
                                   shuffle=True)
    report_metrics(model, test_generator)