def load_and_cache_examples(args, tokenizer, evaluate=False):
    """Build the train or dev ``TensorDataset``, using an on-disk feature cache.

    The cache file lives in ``args.cache_dir`` and is named after the split
    ('dev' or 'train'), the last non-empty path component of
    ``args.model_name_or_path`` and ``args.max_seq_length``.

    Args:
        args: Parsed command-line namespace. Reads ``cache_dir``,
            ``model_name_or_path``, ``max_seq_length``, ``data_dir``,
            ``model_type``, ``local_rank`` and, when present,
            ``overwrite_cache``.
        tokenizer: Tokenizer passed to ``convert_examples_to_features``; its
            ``cls_token`` and ``sep_token`` attributes are read here.
        evaluate: If True load the dev examples, otherwise the train examples.

    Returns:
        A ``TensorDataset`` of (input_ids, input_mask, segment_ids, label_ids),
        all ``torch.long``.
    """
    processor = DataProcessor()

    # Load data features from cache or dataset file
    split_name = 'dev' if evaluate else 'train'
    model_tag = list(filter(None, args.model_name_or_path.split('/'))).pop()
    cached_features_file = os.path.join(
        args.cache_dir,
        'cached_{}_{}_{}'.format(split_name, model_tag, str(args.max_seq_length)))

    # Honour --overwrite_cache; getattr keeps callers working whose namespace
    # does not define the flag.
    overwrite_cache = getattr(args, 'overwrite_cache', False)

    if os.path.exists(cached_features_file) and not overwrite_cache:
        logger.info("Loading features from cached file %s", cached_features_file)
        # NOTE(review): torch.load unpickles the file; only point cache_dir at
        # locations you trust.
        features = torch.load(cached_features_file)
    else:
        logger.info("Creating features from dataset file at %s", args.data_dir)
        label_list = processor.get_labels()
        if evaluate:
            examples = processor.get_dev_examples(args.data_dir)
        else:
            examples = processor.get_train_examples(args.data_dir)

        is_xlnet = args.model_type in ['xlnet']
        features = convert_examples_to_features(
            examples, label_list, args.max_seq_length, tokenizer,
            cls_token_at_end=bool(is_xlnet),  # xlnet has a cls token at the end
            cls_token=tokenizer.cls_token,
            sep_token=tokenizer.sep_token,
            cls_token_segment_id=2 if is_xlnet else 0,
            pad_on_left=bool(is_xlnet),  # pad on the left for xlnet
            pad_token_segment_id=4 if is_xlnet else 0)

        # Only the single-process run or local rank 0 writes the cache.
        if args.local_rank in [-1, 0]:
            logger.info("Saving features into cached file %s", cached_features_file)
            # Create the cache directory first so the save cannot fail after
            # the feature conversion has already been done.
            if args.cache_dir:
                os.makedirs(args.cache_dir, exist_ok=True)
            torch.save(features, cached_features_file)

    # Convert to Tensors and build dataset
    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
    all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.long)

    dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
    return dataset
def __init__(self, output_dir, data_dir, item):
    """Load predictions and gold test labels for one item.

    Both loads are best-effort: if the predictions file or the test data
    cannot be read, the corresponding attribute falls back to an empty value
    instead of raising.

    Args:
        output_dir: Directory containing ``<item>/predictions.npy``.
        data_dir: Directory containing the ``<item>`` data folder handed to
            ``DataProcessor``.
        item: Sub-directory name appended to both ``output_dir`` and
            ``data_dir``.
    """
    data_dir = os.path.join(data_dir, item)
    output_dir = os.path.join(output_dir, item)

    # `except Exception` keeps the best-effort fallback but, unlike a bare
    # `except:`, lets KeyboardInterrupt / SystemExit propagate.
    try:
        predictions = np.load(os.path.join(output_dir, "predictions.npy"))
        y_pred = np.array(predictions).astype(np.float32)
    except Exception:
        y_pred = []

    try:
        data_processor = DataProcessor(data_dir)
        test_examples = data_processor.get_examples("test")
        labels = test_examples["label"]
        y_true = np.array(labels).astype(np.int64)
    except Exception:
        y_true = []
        test_examples = {"text": [], "label": []}

    self.y_true = y_true
    self.y_preds = y_pred
    # The misspelled attribute is kept because other code may read it; the
    # correctly spelled alias refers to the same object.
    self.test_eamples = test_examples
    self.test_examples = test_examples
    self.num_classes = len(set(y_true))
def data_summary(data_dir="data/tsv/cpu", output_dir="output"):
    """Print and save corpus size and text-length statistics.

    Loads the "train", "dev" and "test" examples through ``DataProcessor``,
    measures each text by its whitespace-separated token count, prints the
    summary, writes it to ``<output_dir>/summary.txt`` and saves the per-text
    lengths to ``<output_dir>/length.npy``.

    Args:
        data_dir: Directory handed to ``DataProcessor``.
        output_dir: Directory receiving ``summary.txt`` and ``length.npy``;
            created if it does not exist.

    Raises:
        ValueError: If the three splits contain no texts at all.
    """
    data_processor = DataProcessor(data_dir)
    train_examples = data_processor.get_examples("train")
    dev_examples = data_processor.get_examples("dev")
    test_examples = data_processor.get_examples("test")

    texts = train_examples["text"] + dev_examples["text"] + test_examples["text"]
    if not texts:
        # np.max / np.min would fail with an opaque "zero-size array" error.
        raise ValueError("no texts found in %s" % data_dir)

    length = [len(text.split()) for text in texts]
    num_train = len(train_examples["text"])
    num_dev = len(dev_examples["text"])
    num_test = len(test_examples["text"])
    num_total = num_train + num_dev + num_test

    output = "total: %s\ntrain set: %s\ndev set:%s\ntest set:%s\n" \
             "number of tokens:%s\nmax len:%s\nmin len:%s\nmedian len:%s\n" % (
                 num_total, num_train, num_dev, num_test, sum(length),
                 np.max(length), np.min(length), np.median(length))
    print(output)

    # Touch the file system only after all statistics were computed, so a
    # failed data load cannot truncate an existing summary.
    os.makedirs(output_dir, exist_ok=True)
    with open(os.path.join(output_dir, "summary.txt"), "w") as fw:
        fw.write(output)
    np.save(os.path.join(output_dir, "length.npy"), np.array(length))
def __init__(self):
    """Set the fixed hyper-parameters and create the data processor."""
    # Layer sizes.
    self.embeddingDim = 64
    self.intermediate_dim = 32
    self.latent_dim = 100

    # Standard deviation of the noise drawn when sampling.
    self.epsilon_std = 1.

    # Training schedule.
    self.batch_size = 64
    self.epochNum = 50

    # Helper object constructed without arguments.
    self.dataProcessor = DataProcessor()
def get_correction_factor(laser_path, args):
    """Return ``LASER_WAVELENGTH`` divided by the axis value at the spectrum peak.

    The laser file is used for both ``DataProcessor`` inputs and no plot
    function is passed to either ``DataProcessor`` or ``calculate``.

    Args:
        laser_path: Path of the laser data file.
        args: Namespace providing ``multiplier`` and ``ignorecalib``.

    Returns:
        ``LASER_WAVELENGTH / axis[peak]`` where ``peak`` is the index of the
        largest spectrum value returned by ``calculate``.
    """
    processor = DataProcessor(laser_path, laser_path, None)
    axis, spectrum = calculate(processor, args.multiplier, args.ignorecalib, None)

    # Index of the strongest spectrum entry.
    peak_index = np.argmax(np.array(spectrum))
    return LASER_WAVELENGTH / axis[peak_index]
def __init__(self, tca, img_source, config=None, frame_type=0, data_proc=None):
    """Store the collaborators and build the calibration/detection modules.

    Args:
        tca: Tactical Computer Application object, stored as ``self.tca``.
        img_source: Image source, stored as ``self.img_source``.
        config: Configuration passed to both sub-modules and stored.
        frame_type: Index into ``FRAME_TYPES`` selecting the frame type.
        data_proc: Optional data processor; a new ``DataProcessor()`` is
            created when this is None.
    """
    # Statement order is kept as-is: the sub-modules below receive `self`
    # while it is still only partially initialised.
    self.img_source = img_source
    self.last_frame = None
    self.__avg_frame = None
    self.frame_type = FRAME_TYPES[frame_type]
    self.cal_data = None

    self.tca = tca  # Tactical Computer Application
    self.scm = SourceCalibrationModule(self, config)  # Source Calibration Module
    self.odm = ObjectDetectionModule(self, config)  # Object Detection Module

    self.config = config
    self.data_proc = DataProcessor() if data_proc is None else data_proc
class ImageProcessor(object):
    """Reads frames from an image source, detects objects and draws overlays.

    The image source must provide ``read()``, ``save(filename, frame)`` and a
    ``width`` attribute; these are the only members used by this class.
    """

    def __init__(self, tca, img_source, config=None, frame_type=0, data_proc=None):
        """Store the collaborators and build the calibration/detection modules.

        Args:
            tca: Tactical Computer Application object.
            img_source: Image source frames are read from.
            config: Configuration passed to both sub-modules and stored.
            frame_type: Index into ``FRAME_TYPES`` selecting the frame type.
            data_proc: Optional data processor; a new ``DataProcessor()`` is
                created when this is None.
        """
        # Statement order matters: the sub-modules receive `self` while it is
        # still only partially initialised.
        self.img_source = img_source
        self.last_frame = None
        self.__avg_frame = None
        self.frame_type = FRAME_TYPES[frame_type]
        self.cal_data = None
        # Tactical Computer Application
        self.tca = tca
        # Source Calibration Module
        self.scm = SourceCalibrationModule(self, config)
        # ObjectDetectionModule
        self.odm = ObjectDetectionModule(self, config)
        self.config = config
        if data_proc is None:
            self.data_proc = DataProcessor()
        else:
            self.data_proc = data_proc

    def process(self):
        """Read one frame, run object detection, draw overlays and return it.

        Returns:
            The annotated frame, also stored in ``self.last_frame``.
        """
        self.last_frame = self.img_source.read()
        # The first frame seeds the running average.
        if self.avg_frame is None:
            self.avg_frame = self.last_frame

        # Find objects from the image source
        self.last_frame, self.avg_frame, img_data = self.odm.findObjects(
            self.last_frame, self.avg_frame, self.frame_type)

        # Display calibration points
        if self.cal_data and self.frame_type == 'main':
            for num, cal_point in enumerate(self.cal_data.image_points, 1):
                point = (cal_point[0], cal_point[1])
                # Intensity cycles through three levels: 55, ~121.7, ~188.3.
                color_intensity = ((num - 1) % 3) / 3.0 * 200 + 55
                # Points 1-3 use the second colour channel, later points the
                # third.
                color = (0, 0, color_intensity) if num > 3 else (0, color_intensity, 0)
                cv.circle(self.last_frame, point, 5, color, thickness=-1)
                cv.circle(self.last_frame, point, 5, [0, 0, 0], thickness=2)

        # Display calibration status: a filled disc with two outline rings,
        # placed 25 px in from the right edge and 25 px down.
        cal_status_color = [0, 255, 0] if self.cal_data else [0, 0, 255]
        status_center = (self.img_source.width - 25, 25)
        cv.circle(self.last_frame, status_center, 20, [0, 0, 0], thickness=5)
        cv.circle(self.last_frame, status_center, 20, cal_status_color, thickness=-1)
        cv.circle(self.last_frame, status_center, 20, [255, 255, 255], thickness=2)

        self.data_proc.process(img_data, self)
        return self.last_frame

    @property
    def avg_frame(self):
        """Average frame used by ``process``; None until the first frame."""
        return self.__avg_frame

    @avg_frame.setter
    def avg_frame(self, frame):
        # The very first frame is converted with np.float32; later values are
        # stored unchanged.
        if self.__avg_frame is None:
            self.__avg_frame = np.float32(frame)
        else:
            self.__avg_frame = frame

    def saveFrame(self, filename=""):
        """Save the last processed frame through the image source."""
        self.img_source.save(filename, self.last_frame)

    def setFrameType(self, frame_type):
        """Select the frame type by name or by index into ``FRAME_TYPES``.

        Args:
            frame_type: Either a string contained in ``FRAME_TYPES`` or an
                index into it.

        Raises:
            ValueError: If a string is given that is not in ``FRAME_TYPES``.
        """
        # `basestring` only exists on Python 2; fall back to `str` elsewhere.
        try:
            string_types = basestring
        except NameError:
            string_types = str

        if isinstance(frame_type, string_types):
            if frame_type not in FRAME_TYPES:
                # ValueError subclasses Exception, so existing
                # `except Exception` handlers keep working.
                raise ValueError("Invalid frame type '%s'" % frame_type)
            self.frame_type = frame_type
        else:
            self.frame_type = FRAME_TYPES[frame_type]

    def __string__(self):
        # Kept for callers that invoke it directly. The attribute is
        # `img_source`; reading `image_source` raised AttributeError.
        return 'Image Processor{%r}' % (self.img_source,)

    def __str__(self):
        return self.__string__()

    def __repr__(self):
        return self.__string__()
def main():
    """Command-line entry point: parse arguments, then train and/or evaluate.

    Steps, in order:
      1. Parse the command line and refuse to train into a non-empty output
         directory unless --overwrite_output_dir is given.
      2. Optionally wait for a ptvsd debugger to attach.
      3. Select the device and, when local_rank != -1 and CUDA is allowed,
         initialise the NCCL process group.
      4. Configure logging, seed, labels, config, tokenizer and model.
      5. With --do_train: train, save model/tokenizer/args to output_dir and
         reload them from there.
      6. With --do_eval (single process or local rank 0 only): evaluate
         output_dir, or every checkpoint found below it with
         --eval_all_checkpoints.

    Returns:
        dict mapping "<metric>_<global_step>" to its value for every evaluated
        checkpoint; empty when no evaluation ran.

    Raises:
        ValueError: If output_dir exists, is non-empty, --do_train is set and
            --overwrite_output_dir is not.
    """
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--data_dir", default=None, type=str, required=True,
                        help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
    parser.add_argument("--model_type", default=None, type=str, required=True,
                        help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()))
    parser.add_argument("--model_name_or_path", default=None, type=str, required=True,
                        help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS))
    parser.add_argument("--output_dir", default=None, type=str, required=True,
                        help="The output directory where the model predictions and checkpoints will be written.")

    ## Other parameters
    parser.add_argument("--config_name", default="", type=str,
                        help="Pretrained config name or path if not the same as model_name")
    parser.add_argument("--tokenizer_name", default="", type=str,
                        help="Pretrained tokenizer name or path if not the same as model_name")
    parser.add_argument("--cache_dir", default="", type=str,
                        help="Where do you want to store the pre-trained models downloaded from s3")
    parser.add_argument("--max_seq_length", default=128, type=int,
                        help="The maximum total input sequence length after tokenization. Sequences longer "
                             "than this will be truncated, sequences shorter will be padded.")
    parser.add_argument("--do_train", action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval", action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--evaluate_during_training", action='store_true',
                        help="Rul evaluation during training at each logging step.")
    parser.add_argument("--do_lower_case", action='store_true',
                        help="Set this flag if you are using an uncased model.")

    parser.add_argument("--per_gpu_train_batch_size", default=8, type=int,
                        help="Batch size per GPU/CPU for training.")
    parser.add_argument("--per_gpu_eval_batch_size", default=8, type=int,
                        help="Batch size per GPU/CPU for evaluation.")
    parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
                        help="Number of updates steps to accumulate before performing a backward/update pass.")
    parser.add_argument("--learning_rate", default=5e-5, type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--weight_decay", default=0.0, type=float,
                        help="Weight deay if we apply some.")
    parser.add_argument("--adam_epsilon", default=1e-8, type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm", default=1.0, type=float,
                        help="Max gradient norm.")
    parser.add_argument("--num_train_epochs", default=3.0, type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--max_steps", default=-1, type=int,
                        help="If > 0: set total number of training steps to perform. Override num_train_epochs.")
    parser.add_argument("--warmup_steps", default=0, type=int,
                        help="Linear warmup over warmup_steps.")

    parser.add_argument('--logging_steps', type=int, default=50,
                        help="Log every X updates steps.")
    parser.add_argument('--save_steps', type=int, default=50,
                        help="Save checkpoint every X updates steps.")
    parser.add_argument("--eval_all_checkpoints", action='store_true',
                        help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number")
    parser.add_argument("--no_cuda", action='store_true',
                        help="Avoid using CUDA when available")
    parser.add_argument('--overwrite_output_dir', action='store_true',
                        help="Overwrite the content of the output directory")
    # NOTE(review): --overwrite_cache is parsed here but never read inside
    # this function; confirm the code that loads the cache honours it.
    parser.add_argument('--overwrite_cache', action='store_true',
                        help="Overwrite the cached training and evaluation sets")
    parser.add_argument('--seed', type=int, default=42,
                        help="random seed for initialization")

    parser.add_argument('--fp16', action='store_true',
                        help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit")
    parser.add_argument('--fp16_opt_level', type=str, default='O1',
                        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
                             "See details at https://nvidia.github.io/apex/amp.html")
    parser.add_argument("--local_rank", type=int, default=-1,
                        help="For distributed training: local_rank")
    parser.add_argument('--server_ip', type=str, default='', help="For distant debugging.")
    parser.add_argument('--server_port', type=str, default='', help="For distant debugging.")
    args = parser.parse_args()

    # Refuse to clobber an existing, non-empty output directory when training.
    if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train and not args.overwrite_output_dir:
        raise ValueError("Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(args.output_dir))

    # Setup distant debugging if needed
    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        # NOTE(review): server_port is parsed as a string and passed through
        # unchanged; confirm ptvsd accepts a string port.
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
        ptvsd.wait_for_attach()

    # Setup CUDA, GPU & distributed training
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        args.n_gpu = torch.cuda.device_count()
    else:  # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend='nccl')
        args.n_gpu = 1
    args.device = device

    # Setup logging (INFO for the single-process run or local rank 0, WARN otherwise)
    logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
                        datefmt = '%m/%d/%Y %H:%M:%S',
                        level = logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
    logger.warning("Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
                   args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16)

    # Set seed
    set_seed(args)

    # Prepare task
    processor = DataProcessor()
    label_list = processor.get_labels()
    num_labels = len(label_list)

    # Load pretrained model and tokenizer
    if args.local_rank not in [-1, 0]:
        torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab

    config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
    config = config_class.from_pretrained(args.config_name if args.config_name else args.model_name_or_path, num_labels=num_labels, finetuning_task='almond-frontend')
    tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, do_lower_case=args.do_lower_case)
    model = model_class.from_pretrained(args.model_name_or_path, from_tf=bool('.ckpt' in args.model_name_or_path), config=config)

    if args.local_rank == 0:
        torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab

    model.to(args.device)
    # Wrap the model for multi-GPU use when more than one GPU was counted above.
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    logger.info("Training/evaluation parameters %s", args)

    # Training
    if args.do_train:
        train_dataset = load_and_cache_examples(args, tokenizer, evaluate=False)
        global_step, tr_loss = train(args, train_dataset, model, tokenizer)
        logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)

    # Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained()
    if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
        # Create output directory if needed
        if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
            os.makedirs(args.output_dir)

        logger.info("Saving model checkpoint to %s", args.output_dir)
        # Save a trained model, configuration and tokenizer using `save_pretrained()`.
        # They can then be reloaded using `from_pretrained()`
        model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
        model_to_save.save_pretrained(args.output_dir)
        tokenizer.save_pretrained(args.output_dir)

        # Good practice: save your training arguments together with the trained model
        torch.save(args, os.path.join(args.output_dir, 'training_args.bin'))

        # Load a trained model and vocabulary that you have fine-tuned
        model = model_class.from_pretrained(args.output_dir)
        tokenizer = tokenizer_class.from_pretrained(args.output_dir)
        model.to(args.device)

    # Evaluation
    results = {}
    if args.do_eval and args.local_rank in [-1, 0]:
        checkpoints = [args.output_dir]
        if args.eval_all_checkpoints:
            # Every directory below output_dir that contains a weights file.
            checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME, recursive=True)))
            logging.getLogger("pytorch_transformers.modeling_utils").setLevel(logging.WARN)  # Reduce logging
        logger.info("Evaluate the following checkpoints: %s", checkpoints)
        for checkpoint in checkpoints:
            # With several checkpoints the text after the last '-' in the
            # path is used as the step suffix; with a single one the suffix
            # is empty.
            global_step = checkpoint.split('-')[-1] if len(checkpoints) > 1 else ""
            model = model_class.from_pretrained(checkpoint)
            model.to(args.device)
            result = evaluate(args, model, tokenizer, prefix=global_step)
            # Suffix every metric name with the step so results do not collide.
            result = dict((k + '_{}'.format(global_step), v) for k, v in result.items())
            results.update(result)

    return results
def main(argv):
    """Command-line entry point: compute and plot the selected spectrum.

    For ``Filter`` the notch-filter spectrum is computed and plotted. For
    ``Iod`` both the sample and the reference spectrum are computed, the
    sample is interpolated onto the reference axis and ``log(y / ref)`` is
    plotted as "OD".

    Args:
        argv: Unused. Arguments are read by ``argparse`` from ``sys.argv``.
            NOTE(review): left as-is because it is not visible here whether
            callers pass ``sys.argv`` or ``sys.argv[1:]``.

    Exits with status 2 when ``check_args_valid`` rejects the arguments.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "dataid",
        help="Chooses data to process. Either 'Iod' or 'Filter'. Required.")
    parser.add_argument(
        "-m", "--multiplier",
        help="Factor by which the number of datapoints is increased when interpolating. Defaults to 2.",
        type=int, default=2)
    parser.add_argument(
        "-s", "--save",
        help="If set saves all plots to ./plots/<data type>/.",
        action="store_true")
    parser.add_argument(
        "-d", "--display",
        help="If set displays all plots at runtime.",
        action="store_true")
    parser.add_argument(
        "-ft", "--ftonly",
        help="If set with -d or -s displays/saves only the resulting plots after fourier transformation at runtime.",
        action="store_true")
    parser.add_argument(
        "-i", "--ignorecalib",
        help="If set skips calibration.",
        action="store_true")
    args = parser.parse_args()

    if not check_args_valid(args):
        sys.exit(2)

    def intermediate_plot(plot_fkt):
        # With --ftonly the intermediate stages get no plot function.
        return None if args.ftonly else plot_fkt

    if args.dataid == "Filter":
        channel_2 = "./data/Notch Filter/Data Channel 2.dat"
        channel_0 = "./data/Notch Filter/Data Channel 0.dat"

        plot_fkt = plot_fkt_factory("filter", args.save, args.display)
        correction_factor = get_correction_factor(channel_2, args)
        processor = DataProcessor(channel_2, channel_0,
                                  intermediate_plot(plot_fkt))
        l, spectrum = calculate(processor, args.multiplier, args.ignorecalib,
                                intermediate_plot(plot_fkt), correction_factor)
        plot_fkt("spectrum-notch-filter", l, spectrum, "wavelength [nm]", "I")
    elif args.dataid == "Iod":
        iod_channel_2 = "./data/Iod/Data Channel 2.dat"
        iod_channel_0 = "./data/Iod/Data Channel 0.dat"
        ref_channel_2 = "./data/Referenz fuer Iod/Data Channel 2.dat"
        ref_channel_0 = "./data/Referenz fuer Iod/Data Channel 0.dat"

        iod_plot_fkt = plot_fkt_factory("iod", args.save, args.display)
        ref_plot_fkt = plot_fkt_factory("iod-ref", args.save, args.display)
        correction_factor = get_correction_factor(iod_channel_2, args)
        ref_correction_factor = get_correction_factor(ref_channel_2, args)

        processor = DataProcessor(iod_channel_2, iod_channel_0,
                                  intermediate_plot(iod_plot_fkt))
        ref_processor = DataProcessor(ref_channel_2, ref_channel_0,
                                      intermediate_plot(ref_plot_fkt))

        ref_l, ref_spectrum = calculate(ref_processor, args.multiplier,
                                        args.ignorecalib,
                                        intermediate_plot(ref_plot_fkt),
                                        ref_correction_factor)
        l, spectrum = calculate(processor, args.multiplier, args.ignorecalib,
                                intermediate_plot(iod_plot_fkt),
                                correction_factor)

        # Bring the ref and iod to the same x axis
        y = Utils.interpolate(ref_l, l, spectrum)
        od = np.log(y / ref_spectrum)

        ref_plot_fkt("spectrum-iod-reference", ref_l, ref_spectrum,
                     "wavelength [nm]", "I")
        # `spectrum` is sampled on `l`, not `ref_l`, so it is plotted against
        # its own axis; only the interpolated `y` lives on `ref_l`.
        iod_plot_fkt("spectrum-iod", l, spectrum, "wavelength [nm]", "I")
        iod_plot_fkt("OD", ref_l, od, "wavelength [nm]", "")
from data import DataProcessor
import pandas as pd
from keras_mlp import KerasMLP

# Input CSV: the first row is the header, the first column the index.
filename = "dataset.csv"
dataset = pd.read_csv(filename, header=0, index_col=0)

# Scale the data, then reframe it with series_to_supervised(1, 1).
processor = DataProcessor(dataset)
processor.scale()
reframed = processor.series_to_supervised(1, 1)

# Drop the reframed columns at these positions before building the model.
columns_to_drop = reframed.columns[[9, 10, 11, 12, 14, 15, 16, 17]]
reframed.drop(columns_to_drop, axis=1, inplace=True)

# Build the MLP from the remaining values.
mld = KerasMLP(values=reframed.values)
class VAE(object):
    """LSTM-based variational autoencoder (VAE) used for data augmentation."""

    def __init__(self):
        """Set the fixed hyper-parameters and create the data processor."""
        self.embeddingDim = 64
        self.batch_size = 64
        self.intermediate_dim = 32
        self.latent_dim = 100
        self.epsilon_std = 1.
        self.dataProcessor = DataProcessor()
        self.epochNum = 50

    def sampling(self, args):
        """Draw a latent sample with the reparameterisation trick.

        Args:
            args: Pair ``(z_mean, z_log_sigma)`` of tensors.

        Returns:
            ``z_mean + exp(0.5 * z_log_sigma) * epsilon`` with ``epsilon``
            drawn from a normal distribution of stddev ``self.epsilon_std``.
        """
        z_mean, z_log_sigma = args
        # Take the batch size from the tensor rather than self.batch_size so
        # a final, smaller batch (or a different predict batch) still works.
        epsilon = K.random_normal(shape=(K.shape(z_mean)[0], self.latent_dim),
                                  mean=0., stddev=self.epsilon_std)
        # The KL term in build() treats z_log_sigma as a log-variance
        # (it uses exp(z_log_sigma)), so the standard deviation is
        # exp(0.5 * z_log_sigma), not the raw value.
        return z_mean + K.exp(0.5 * z_log_sigma) * epsilon

    def build(self):
        """Create and compile the LSTM variational autoencoder.

        Sizes come from the instance attributes (``embeddingDim``,
        ``intermediate_dim``, ``latent_dim``, ``epsilon_std``) and from the
        data processor (``senMaxLen``, ``wordVocabSize``).

        Returns:
            The compiled end-to-end Keras model. Encoder and generator are
            not returned separately.

        # References
            - [Building Autoencoders in Keras](https://blog.keras.io/building-autoencoders-in-keras.html)
            - [Generating sentences from a continuous space](https://arxiv.org/abs/1511.06349)
        """
        sentence_input = Input(shape=(self.dataProcessor.senMaxLen, ), name='input')
        x = Embedding(input_dim=self.dataProcessor.wordVocabSize,
                      output_dim=self.embeddingDim,
                      input_length=self.dataProcessor.senMaxLen,
                      name='embeddingLayer',
                      mask_zero=True,
                      trainable=False,
                      embeddings_initializer=pretrainedWord2Vec)(sentence_input)

        # LSTM encoding
        h = Bidirectional(LSTM(self.intermediate_dim))(x)

        # VAE Z layer
        z_mean = Dense(self.latent_dim)(h)
        z_log_sigma = Dense(self.latent_dim)(h)

        # note that "output_shape" isn't necessary with the TensorFlow backend
        z = Lambda(self.sampling)([z_mean, z_log_sigma])

        # decoded LSTM layer
        decoder_h = LSTM(self.intermediate_dim, return_sequences=True)
        decoder_mean = LSTM(self.embeddingDim, return_sequences=True)

        h_decoded = RepeatVector(self.dataProcessor.senMaxLen)(z)
        h_decoded = decoder_h(h_decoded)

        # decoded layer
        x_decoded_mean = decoder_mean(h_decoded)

        # end-to-end autoencoder
        vae = Model(sentence_input, x_decoded_mean)

        def vae_loss(x, x_decoded_mean):
            # NOTE(review): train() fits the model with the input itself as
            # target while the decoder emits embeddingDim values per
            # timestep; confirm the target and output shapes are meant to
            # line up for this MSE.
            xent_loss = objectives.mse(x, x_decoded_mean)
            kl_loss = -0.5 * K.mean(1 + z_log_sigma - K.square(z_mean) - K.exp(z_log_sigma))
            loss = xent_loss + kl_loss
            return loss

        vae.compile(optimizer='adam', loss=vae_loss)
        return vae

    def train(self):
        """Fit the autoencoder on the concatenation of the four CSV files.

        Returns:
            None.
        """
        text1 = self.dataProcessor.generateData('task3_dev.csv')
        text2 = self.dataProcessor.generateData('task3_train.csv')
        text3 = self.dataProcessor.generateData('train.csv')
        text4 = self.dataProcessor.generateData('test.csv')
        text = np.concatenate((text1, text2, text3, text4), axis=0)

        # fit() below gets no validation data and the model is compiled
        # without metrics, so 'val_loss' / 'val_acc' never exist; the
        # callbacks monitor the training loss instead.
        # TODO: switch back to 'val_loss' once validation data is passed.
        early_stopping = EarlyStopping(monitor='loss', patience=8)
        # The filepath is run through str.format by Keras; the stray '{' in
        # the former '{vae.h5' made that formatting fail.
        checkpoint = ModelCheckpoint(filepath='vae.h5',
                                     monitor='loss',
                                     save_best_only=True,
                                     save_weights_only=True,
                                     mode='auto')

        model = self.build()
        model.fit(text, text,
                  epochs=self.epochNum,
                  batch_size=self.batch_size,
                  callbacks=[early_stopping, checkpoint])