Example #1
def load_and_cache_examples(args, tokenizer, evaluate=False):
    processor = DataProcessor()
    # Load data features from cache or dataset file
    cached_features_file = os.path.join(args.cache_dir, 'cached_{}_{}_{}'.format(
        'dev' if evaluate else 'train',
        list(filter(None, args.model_name_or_path.split('/'))).pop(),
        str(args.max_seq_length)))
    if os.path.exists(cached_features_file):
        logger.info("Loading features from cached file %s", cached_features_file)
        features = torch.load(cached_features_file)
    else:
        logger.info("Creating features from dataset file at %s", args.data_dir)
        label_list = processor.get_labels()
        examples = processor.get_dev_examples(args.data_dir) if evaluate else processor.get_train_examples(args.data_dir)
        features = convert_examples_to_features(examples, label_list, args.max_seq_length, tokenizer,
            cls_token_at_end=bool(args.model_type in ['xlnet']),            # xlnet has a cls token at the end
            cls_token=tokenizer.cls_token,
            sep_token=tokenizer.sep_token,
            cls_token_segment_id=2 if args.model_type in ['xlnet'] else 0,
            pad_on_left=bool(args.model_type in ['xlnet']),                 # pad on the left for xlnet
            pad_token_segment_id=4 if args.model_type in ['xlnet'] else 0)
        if args.local_rank in [-1, 0]:
            logger.info("Saving features into cached file %s", cached_features_file)
            torch.save(features, cached_features_file)

    # Convert to Tensors and build dataset
    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
    all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.long)

    dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
    return dataset
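
A hedged usage sketch: the returned TensorDataset plugs straight into a PyTorch DataLoader (the batch size below is arbitrary):

from torch.utils.data import DataLoader, RandomSampler

# Assumes `args` and `tokenizer` are set up as in the snippet above.
train_dataset = load_and_cache_examples(args, tokenizer, evaluate=False)
train_dataloader = DataLoader(train_dataset,
                              sampler=RandomSampler(train_dataset),
                              batch_size=32)
for batch in train_dataloader:
    input_ids, input_mask, segment_ids, label_ids = batch
    # forward the batch through the model here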
Example #2
    def __init__(self, output_dir, data_dir, item):
        data_dir = os.path.join(data_dir, item)
        output_dir = os.path.join(output_dir, item)

        try:
            predictions = np.load(os.path.join(output_dir, "predictions.npy"))
            y_pred = np.array(predictions).astype(np.float32)
            # y_pred = np.argsort(-y_pred, 1).astype(np.int64)
        except Exception:
            # Fall back to an empty prediction array if the file is missing or unreadable.
            y_pred = []

        try:
            data_processor = DataProcessor(data_dir)
            test_examples = data_processor.get_examples("test")
            labels = test_examples["label"]
            # labels = [[l] for l in labels]
            y_true = np.array(labels).astype(np.int64)
        except Exception:
            # Fall back to empty structures if the test split cannot be loaded.
            y_true = []
            test_examples = {"text": [], "label": []}

        self.y_true = y_true
        self.y_preds = y_pred
        self.test_examples = test_examples
        self.num_classes = len(set(y_true))
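
A hedged follow-up sketch for scoring what the constructor loads, assuming y_preds holds one score per class and y_true holds integer labels (both conventions are assumptions here):

import numpy as np

def accuracy(y_true, y_preds):
    # Take the highest-scoring class as the prediction and compare with the gold labels.
    if len(y_true) == 0 or len(y_preds) == 0:
        return 0.0
    predicted = np.argmax(np.asarray(y_preds), axis=1)
    return float(np.mean(predicted == np.asarray(y_true)))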
Example #3
def data_summary():
    data_dir = os.path.join("data", "tsv", "cpu")
    data_processor = DataProcessor(data_dir)

    with open("output/summary.txt", "w") as fw:
        train_examples = data_processor.get_examples("train")
        dev_examples = data_processor.get_examples("dev")
        test_examples = data_processor.get_examples("test")
        texts = train_examples["text"] + dev_examples["text"] + test_examples["text"]

        length = [len(l.split()) for l in texts]
        max_len = np.max(length)
        min_len = np.min(length)
        median_len = np.median(length)
        num_words = sum(length)
        num_train = len(train_examples["text"])
        num_dev = len(dev_examples["text"])
        num_test = len(test_examples["text"])
        num_total = num_train + num_dev + num_test

        output = "total: %s\ntrain set: %s\ndev set:%s\ntest set:%s\n" \
                 "number of tokens:%s\nmax len:%s\nmin len:%s\nmedian len:%s\n" % (
            num_total, num_train, num_dev, num_test, num_words, max_len, min_len, median_len)
        print(output)
        fw.write(output)

        length = np.array(length)
        np.save("output/length.npy", length)
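
The per-example token counts are saved to output/length.npy, so their distribution can be inspected afterwards; a short sketch, assuming matplotlib is available:

import matplotlib.pyplot as plt
import numpy as np

lengths = np.load("output/length.npy")
plt.hist(lengths, bins=50)
plt.xlabel("tokens per example")
plt.ylabel("count")
plt.savefig("output/length_hist.png")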
Example #4
    def __init__(self):

        self.embeddingDim = 64
        self.batch_size = 64
        self.intermediate_dim = 32
        self.latent_dim = 100
        self.epsilon_std = 1.
        self.dataProcessor = DataProcessor()
        self.epochNum = 50
Example #5
def get_correction_factor(laser_path, args):
    laser_processor = DataProcessor(laser_path, laser_path, None)

    laser_l, laser_spectrum = calculate(laser_processor, args.multiplier,
                                        args.ignorecalib, None)

    max_wavelength_idx = np.array(laser_spectrum).argmax()
    correction_factor = LASER_WAVELENGTH / laser_l[max_wavelength_idx]

    return correction_factor
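
A hedged usage sketch: the factor rescales a measured wavelength axis so the laser line lands on LASER_WAVELENGTH (the file paths are hypothetical; calculate is used as above):

import numpy as np

# Hypothetical calibration file path; substitute the real laser recording.
correction_factor = get_correction_factor("./data/Laser/Data Channel 2.dat", args)

processor = DataProcessor("./data/Laser/Data Channel 2.dat",
                          "./data/Laser/Data Channel 0.dat", None)
l, spectrum = calculate(processor, args.multiplier, args.ignorecalib, None)
l_corrected = np.array(l) * correction_factor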
Example #6
    def __init__(self, tca, img_source, config=None, frame_type=0, data_proc=None):
        self.img_source = img_source
        self.last_frame = None
        self.__avg_frame = None
        self.frame_type = FRAME_TYPES[frame_type]
        self.cal_data = None

        # Tactical Computer Application
        self.tca = tca
        # Source Calibration Module
        self.scm = SourceCalibrationModule(self, config)
        # ObjectDetectionModule
        self.odm = ObjectDetectionModule(self, config)

        self.config = config
        
        if data_proc is None:
            self.data_proc = DataProcessor()
        else:
            self.data_proc = data_proc
Example #7
class ImageProcessor(object):

    def __init__(self, tca, img_source, config=None, frame_type=0, data_proc=None):
        self.img_source = img_source
        self.last_frame = None
        self.__avg_frame = None
        self.frame_type = FRAME_TYPES[frame_type]
        self.cal_data = None

        # Tactical Computer Application
        self.tca = tca
        # Source Calibration Module
        self.scm = SourceCalibrationModule(self, config)
        # ObjectDetectionModule
        self.odm = ObjectDetectionModule(self, config)

        self.config = config
        
        if data_proc is None:
            self.data_proc = DataProcessor()
        else:
            self.data_proc = data_proc
    
    def process(self):
        self.last_frame = self.img_source.read()

        if self.avg_frame is None:
            self.avg_frame = self.last_frame

        # Find objects from the image source
        self.last_frame, self.avg_frame, img_data = self.odm.findObjects(
            self.last_frame, self.avg_frame, self.frame_type)

        # Display calibration points 
        if self.cal_data and self.frame_type == 'main':
            for num, cal_point in enumerate(self.cal_data.image_points, 1):
                point = (cal_point[0], cal_point[1])
                color_intensity = ((num - 1) % 3) / 3.0 * 200 + 55
                color = (0, 0, color_intensity) if num > 3 else (0, color_intensity, 0)
                cv.circle(self.last_frame, point, 5, color, thickness=-1)
                cv.circle(self.last_frame, point, 5, [0, 0, 0], thickness=2)

        # Display calibration status
        cal_status_color = [0, 255, 0] if self.cal_data else [0, 0, 255]
        cv.circle(self.last_frame, (self.img_source.width - 25, 25), 20, [0, 0, 0], thickness=5) 
        cv.circle(self.last_frame, (self.img_source.width - 25, 25), 20, cal_status_color, thickness=-1)
        cv.circle(self.last_frame, (self.img_source.width - 25, 25), 20, [255, 255, 255], thickness=2) 

        self.data_proc.process(img_data, self)

        return self.last_frame

    @property
    def avg_frame(self):
        # Average frame, read by process() when accumulating detections
        return self.__avg_frame

    @avg_frame.setter
    def avg_frame(self, frame):
        # Store the first frame as a float32 array so later updates can accumulate
        if self.__avg_frame is None:
            self.__avg_frame = np.float32(frame)
        else:
            self.__avg_frame = frame

    def saveFrame(self, filename=""):
        self.img_source.save(filename, self.last_frame)

    def setFrameType(self, frame_type):
        if isinstance(frame_type, str):
            if frame_type in FRAME_TYPES:
                self.frame_type = frame_type
            else:
                raise ValueError("Invalid frame type '%s'" % frame_type)
        else:
            self.frame_type = FRAME_TYPES[frame_type]

    def __str__(self):
        return 'Image Processor{%r}' % self.img_source

    def __repr__(self):
        return self.__str__()
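
A hedged sketch of a capture loop driving the class above (tca, img_source, and config are stand-ins for the real application objects; cv is assumed to be OpenCV imported as cv):

import cv2 as cv

proc = ImageProcessor(tca, img_source, config=config)
while True:
    frame = proc.process()
    cv.imshow("main", frame)
    if cv.waitKey(1) & 0xFF == ord('q'):
        break
cv.destroyAllWindows()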
Example #8
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--data_dir", default=None, type=str, required=True,
                        help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
    parser.add_argument("--model_type", default=None, type=str, required=True,
                        help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()))
    parser.add_argument("--model_name_or_path", default=None, type=str, required=True,
                        help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS))
    parser.add_argument("--output_dir", default=None, type=str, required=True,
                        help="The output directory where the model predictions and checkpoints will be written.")

    ## Other parameters
    parser.add_argument("--config_name", default="", type=str,
                        help="Pretrained config name or path if not the same as model_name")
    parser.add_argument("--tokenizer_name", default="", type=str,
                        help="Pretrained tokenizer name or path if not the same as model_name")
    parser.add_argument("--cache_dir", default="", type=str,
                        help="Where do you want to store the pre-trained models downloaded from s3")
    parser.add_argument("--max_seq_length", default=128, type=int,
                        help="The maximum total input sequence length after tokenization. Sequences longer "
                             "than this will be truncated, sequences shorter will be padded.")
    parser.add_argument("--do_train", action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval", action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--evaluate_during_training", action='store_true',
                        help="Rul evaluation during training at each logging step.")
    parser.add_argument("--do_lower_case", action='store_true',
                        help="Set this flag if you are using an uncased model.")

    parser.add_argument("--per_gpu_train_batch_size", default=8, type=int,
                        help="Batch size per GPU/CPU for training.")
    parser.add_argument("--per_gpu_eval_batch_size", default=8, type=int,
                        help="Batch size per GPU/CPU for evaluation.")
    parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
                        help="Number of updates steps to accumulate before performing a backward/update pass.")
    parser.add_argument("--learning_rate", default=5e-5, type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--weight_decay", default=0.0, type=float,
                        help="Weight deay if we apply some.")
    parser.add_argument("--adam_epsilon", default=1e-8, type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm", default=1.0, type=float,
                        help="Max gradient norm.")
    parser.add_argument("--num_train_epochs", default=3.0, type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--max_steps", default=-1, type=int,
                        help="If > 0: set total number of training steps to perform. Override num_train_epochs.")
    parser.add_argument("--warmup_steps", default=0, type=int,
                        help="Linear warmup over warmup_steps.")

    parser.add_argument('--logging_steps', type=int, default=50,
                        help="Log every X updates steps.")
    parser.add_argument('--save_steps', type=int, default=50,
                        help="Save checkpoint every X updates steps.")
    parser.add_argument("--eval_all_checkpoints", action='store_true',
                        help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number")
    parser.add_argument("--no_cuda", action='store_true',
                        help="Avoid using CUDA when available")
    parser.add_argument('--overwrite_output_dir', action='store_true',
                        help="Overwrite the content of the output directory")
    parser.add_argument('--overwrite_cache', action='store_true',
                        help="Overwrite the cached training and evaluation sets")
    parser.add_argument('--seed', type=int, default=42,
                        help="random seed for initialization")

    parser.add_argument('--fp16', action='store_true',
                        help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit")
    parser.add_argument('--fp16_opt_level', type=str, default='O1',
                        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
                             "See details at https://nvidia.github.io/apex/amp.html")
    parser.add_argument("--local_rank", type=int, default=-1,
                        help="For distributed training: local_rank")
    parser.add_argument('--server_ip', type=str, default='', help="For distant debugging.")
    parser.add_argument('--server_port', type=str, default='', help="For distant debugging.")
    args = parser.parse_args()

    if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train and not args.overwrite_output_dir:
        raise ValueError("Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(args.output_dir))

    # Setup distant debugging if needed
    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
        ptvsd.wait_for_attach()

    # Setup CUDA, GPU & distributed training
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        args.n_gpu = torch.cuda.device_count()
    else:  # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend='nccl')
        args.n_gpu = 1
    args.device = device

    # Setup logging
    logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
                        datefmt='%m/%d/%Y %H:%M:%S',
                        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
    logger.warning("Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
                    args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16)

    # Set seed
    set_seed(args)

    # Prepare task
    processor = DataProcessor()
    label_list = processor.get_labels()
    num_labels = len(label_list)

    # Load pretrained model and tokenizer
    if args.local_rank not in [-1, 0]:
        torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab

    config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
    config = config_class.from_pretrained(args.config_name if args.config_name else args.model_name_or_path, num_labels=num_labels, finetuning_task='almond-frontend')
    tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, do_lower_case=args.do_lower_case)
    model = model_class.from_pretrained(args.model_name_or_path, from_tf=bool('.ckpt' in args.model_name_or_path), config=config)

    if args.local_rank == 0:
        torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab

    model.to(args.device)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    logger.info("Training/evaluation parameters %s", args)


    # Training
    if args.do_train:
        train_dataset = load_and_cache_examples(args, tokenizer, evaluate=False)
        global_step, tr_loss = train(args, train_dataset, model, tokenizer)
        logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)


    # Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained()
    if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
        # Create output directory if needed
        if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
            os.makedirs(args.output_dir)

        logger.info("Saving model checkpoint to %s", args.output_dir)
        # Save a trained model, configuration and tokenizer using `save_pretrained()`.
        # They can then be reloaded using `from_pretrained()`
        model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
        model_to_save.save_pretrained(args.output_dir)
        tokenizer.save_pretrained(args.output_dir)

        # Good practice: save your training arguments together with the trained model
        torch.save(args, os.path.join(args.output_dir, 'training_args.bin'))

        # Load a trained model and vocabulary that you have fine-tuned
        model = model_class.from_pretrained(args.output_dir)
        tokenizer = tokenizer_class.from_pretrained(args.output_dir)
        model.to(args.device)


    # Evaluation
    results = {}
    if args.do_eval and args.local_rank in [-1, 0]:
        checkpoints = [args.output_dir]
        if args.eval_all_checkpoints:
            checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME, recursive=True)))
            logging.getLogger("pytorch_transformers.modeling_utils").setLevel(logging.WARN)  # Reduce logging
        logger.info("Evaluate the following checkpoints: %s", checkpoints)
        for checkpoint in checkpoints:
            global_step = checkpoint.split('-')[-1] if len(checkpoints) > 1 else ""
            model = model_class.from_pretrained(checkpoint)
            model.to(args.device)
            result = evaluate(args, model, tokenizer, prefix=global_step)
            result = dict((k + '_{}'.format(global_step), v) for k, v in result.items())
            results.update(result)

    return results
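
A hedged smoke-test sketch: argparse accepts an explicit argument list, so the required flags above can be exercised in-process (the values below are hypothetical, and parser is local to main() in the script itself):

args = parser.parse_args([
    "--data_dir", "./data",
    "--model_type", "bert",
    "--model_name_or_path", "bert-base-uncased",
    "--output_dir", "./out",
    "--do_train",
])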
Example #9
def main(argv):
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "dataid",
        help="Chooses data to process. Either 'Iod' or 'Filter'. Required.")
    parser.add_argument(
        "-m",
        "--multiplier",
        help=
        "Factor by which the number of datapoints is increased when interpolating. Defaults to 2.",
        type=int,
        default=2)
    parser.add_argument("-s",
                        "--save",
                        help="If set saves all plots to ./plots/<data type>/.",
                        action="store_true")
    parser.add_argument("-d",
                        "--display",
                        help="If set displays all plots at runtime.",
                        action="store_true")
    parser.add_argument(
        "-ft",
        "--ftonly",
        help=
        "If set with -d or -s  displays/saves only the resulting plots after fourier transformation at runtime.",
        action="store_true")
    parser.add_argument("-i",
                        "--ignorecalib",
                        help="If set skips calibration.",
                        action="store_true")

    args = parser.parse_args()

    if not check_args_valid(args):
        sys.exit(2)

    if args.dataid == "Filter":
        plot_fkt = plot_fkt_factory("filter", args.save, args.display)

        correction_factor = get_correction_factor(
            "./data/Notch Filter/Data Channel 2.dat", args)

        processor = DataProcessor("./data/Notch Filter/Data Channel 2.dat",
                                  "./data/Notch Filter/Data Channel 0.dat",
                                  None if args.ftonly else plot_fkt)

        l, spectrum = calculate(processor, args.multiplier, args.ignorecalib,
                                None if args.ftonly else plot_fkt,
                                correction_factor)

        plot_fkt("spectrum-notch-filter", l, spectrum, "wavelength [nm]", "I")

    elif args.dataid == "Iod":
        iod_plot_fkt = plot_fkt_factory("iod", args.save, args.display)
        ref_plot_fkt = plot_fkt_factory("iod-ref", args.save, args.display)

        correction_factor = get_correction_factor(
            "./data/Iod/Data Channel 2.dat", args)
        ref_correction_factor = get_correction_factor(
            "./data/Referenz fuer Iod/Data Channel 2.dat", args)

        processor = DataProcessor("./data/Iod/Data Channel 2.dat",
                                  "./data/Iod/Data Channel 0.dat",
                                  None if args.ftonly else iod_plot_fkt)

        ref_processor = DataProcessor(
            "./data/Referenz fuer Iod/Data Channel 2.dat",
            "./data/Referenz fuer Iod/Data Channel 0.dat",
            None if args.ftonly else ref_plot_fkt)

        ref_l, ref_spectrum = calculate(ref_processor, args.multiplier,
                                        args.ignorecalib,
                                        None if args.ftonly else ref_plot_fkt,
                                        ref_correction_factor)

        l, spectrum = calculate(processor, args.multiplier, args.ignorecalib,
                                None if args.ftonly else iod_plot_fkt,
                                correction_factor)

        # Bring the ref and iod to the same x axis
        y = Utils.interpolate(ref_l, l, spectrum)

        od = np.log(y / ref_spectrum)

        ref_plot_fkt("spectrum-iod-reference", ref_l, ref_spectrum,
                     "wavelength [nm]", "I")
        iod_plot_fkt("spectrum-iod", ref_l, spectrum, "wavelength [nm]", "I")
        iod_plot_fkt("OD", ref_l, od, "wavelength [nm]", "")
Example #10
from data import DataProcessor
import pandas as pd
from keras_mlp import KerasMLP

filename = "dataset.csv"
dataset = pd.read_csv(filename, header=0, index_col=0)

processor = DataProcessor(dataset)

processor.scale()
reframed = processor.series_to_supervised(1, 1)
reframed.drop(reframed.columns[[9,10,11,12,14,15,16,17]], axis=1, inplace=True)

mlp = KerasMLP(values=reframed.values)

# print(reframed.head(5))
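
series_to_supervised(1, 1) is defined elsewhere on DataProcessor; a minimal sketch of the usual pandas shift-based implementation it presumably wraps (one lag step as input, one step as output):

import pandas as pd

def series_to_supervised(df, n_in=1, n_out=1):
    # Lagged copies of the frame serve as inputs, forward shifts as outputs;
    # rows made incomplete by the shifting are dropped.
    cols = [df.shift(i) for i in range(n_in, 0, -1)]
    cols += [df.shift(-i) for i in range(0, n_out)]
    return pd.concat(cols, axis=1).dropna()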
Example #11
class VAE(object):
    """Data augmentation using an LSTM-based VAE."""
    def __init__(self):

        self.embeddingDim = 64
        self.batch_size = 64
        self.intermediate_dim = 32
        self.latent_dim = 100
        self.epsilon_std = 1.
        self.dataProcessor = DataProcessor()
        self.epochNum = 50

    def sampling(self, args):
        # Reparameterization trick: z = mu + sigma * epsilon. z_log_sigma is
        # treated as a log-variance here, matching the KL term in vae_loss below.
        z_mean, z_log_sigma = args
        epsilon = K.random_normal(shape=(self.batch_size, self.latent_dim),
                                  mean=0.,
                                  stddev=self.epsilon_std)
        return z_mean + K.exp(0.5 * z_log_sigma) * epsilon

    def build(self):
        """
        Creates an LSTM Variational Autoencoder (VAE). Returns VAE, Encoder, Generator.

        # Arguments
            input_dim: int.
            timesteps: int, input timestep dimension.
            batch_size: int.
            intermediate_dim: int, output shape of LSTM.
            latent_dim: int, latent z-layer shape.
            epsilon_std: float, z-layer sigma.


        # References
            - [Building Autoencoders in Keras](https://blog.keras.io/building-autoencoders-in-keras.html)
            - [Generating sentences from a continuous space](https://arxiv.org/abs/1511.06349)
        """
        inputs = Input(shape=(self.dataProcessor.senMaxLen, ), name='input')

        x = Embedding(input_dim=self.dataProcessor.wordVocabSize,
                      output_dim=self.embeddingDim,
                      input_length=self.dataProcessor.senMaxLen,
                      name='embeddingLayer',
                      mask_zero=True,
                      trainable=False,
                      embeddings_initializer=pretrainedWord2Vec)(inputs)

        # LSTM encoding
        h = Bidirectional(LSTM(self.intermediate_dim))(x)

        # VAE Z layer
        z_mean = Dense(self.latent_dim)(h)
        z_log_sigma = Dense(self.latent_dim)(h)

        # note that "output_shape" isn't necessary with the TensorFlow backend
        # so you could write `Lambda(sampling)([z_mean, z_log_sigma])`
        z = Lambda(self.sampling)([z_mean, z_log_sigma])

        # decoded LSTM layer
        decoder_h = LSTM(self.intermediate_dim, return_sequences=True)
        decoder_mean = LSTM(self.embeddingDim, return_sequences=True)

        h_decoded = RepeatVector(self.dataProcessor.senMaxLen)(z)
        h_decoded = decoder_h(h_decoded)

        # decoded layer
        x_decoded_mean = decoder_mean(h_decoded)

        # end-to-end autoencoder
        vae = Model(inputs, x_decoded_mean)

        def vae_loss(x, x_decoded_mean):
            # Reconstruction term (MSE here, despite the usual cross-entropy naming)
            # plus the KL divergence of the approximate posterior from the prior.
            recon_loss = objectives.mse(x, x_decoded_mean)
            kl_loss = -0.5 * K.mean(1 + z_log_sigma - K.square(z_mean) -
                                    K.exp(z_log_sigma))
            return recon_loss + kl_loss

        vae.compile(optimizer='adam', loss=vae_loss)

        return vae

    def train(self):
        """Train the VAE to reconstruct sentences from the concatenated datasets."""

        text1 = self.dataProcessor.generateData('task3_dev.csv')
        text2 = self.dataProcessor.generateData('task3_train.csv')
        text3 = self.dataProcessor.generateData('train.csv')
        text4 = self.dataProcessor.generateData('test.csv')
        text = np.concatenate((text1, text2, text3, text4), axis=0)

        early_stopping = EarlyStopping(monitor='val_loss', patience=8)
        checkpoint = ModelCheckpoint(filepath='vae.h5',
                                     monitor='val_loss',
                                     save_best_only=True,
                                     save_weights_only=True,
                                     mode='auto')
        model = self.build()
        model.fit(text,
                  text,
                  epochs=self.epochNum,
                  batch_size=self.batch_size,
                  validation_split=0.1,  # the callbacks monitor val_loss, so hold out a split
                  callbacks=[early_stopping, checkpoint])
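
A hedged end-to-end sketch, assuming the CSVs referenced in train() exist where DataProcessor expects them:

vae = VAE()
vae.train()
# build() returns only the end-to-end model; to generate new samples one would
# typically also keep references to the encoder and decoder halves.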