def home():
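    # Assumes a Flask-style view: DataReader aggregates the parsed country,
    # city, feature and prediction data, and jsonify() returns it as JSON.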
    obj = DataReader()
    obj.parse_country()
    obj.parse_city()
    obj.parse_features()
    obj.parse_prediction()
    return jsonify(obj.final_data)
def build_data_loader(args, char_dict, intent_dict):
    """[decorate samples for dataloader]
    
    Arguments:
        args {[type]} -- [description]
        char_dict {[type]} -- [description]
        intent_dict {[type]} -- [description]
    
    Returns:
        [type] -- [description]
    """
    loader_res = {}
    if args.do_train:
        train_processor = DataReader(char_dict, intent_dict, args.max_seq_len)
        train_data_generator = train_processor.prepare_data(
            data_path=args.data_dir + "train.txt",
            batch_size=args.batch_size,
            mode='train')
        loader_res["train_data_generator"] = train_data_generator
        num_train_examples = train_processor._get_num_examples()
        logger.info("Num train examples: %d" % num_train_examples)
        logger.info("Num train steps: %d" % (math.ceil(num_train_examples * 1.0 / args.batch_size) * \
                                            args.epoch // DEV_COUNT))
        if math.ceil(
                num_train_examples * 1.0 / args.batch_size) // DEV_COUNT <= 0:
            logger.error(
                "Number of train steps is less than or equal to 0, exiting.")
            exit(1)
    if args.do_eval:
        eval_processor = DataReader(char_dict, intent_dict, args.max_seq_len)
        eval_data_generator = eval_processor.prepare_data(
            data_path=args.data_dir + "eval.txt",
            batch_size=args.batch_size,
            mode='eval')
        loader_res["eval_data_generator"] = eval_data_generator
        num_eval_examples = eval_processor._get_num_examples()
        logger.info("Num eval examples: %d" % num_eval_examples)
    if args.do_test:
        test_processor = DataReader(char_dict, intent_dict, args.max_seq_len)
        test_data_generator = test_processor.prepare_data(
            data_path=args.data_dir + "test.txt",
            batch_size=args.batch_size,
            mode='test')
        loader_res["test_data_generator"] = test_data_generator
    return loader_res
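
A minimal driving sketch for the loader above (an assumption, not part of the original): it presumes an argparse-style namespace carrying the flags the function reads (do_train, do_eval, do_test, data_dir, batch_size, max_seq_len, epoch), with empty dicts standing in for whatever vocabularies DataReader expects.

from argparse import Namespace

args = Namespace(do_train=True, do_eval=True, do_test=False,
                 data_dir="data/", batch_size=32, max_seq_len=128, epoch=10)
char_dict, intent_dict = {}, {}  # placeholder vocabularies (assumed shape)
loaders = build_data_loader(args, char_dict, intent_dict)
train_generator = loaders.get("train_data_generator")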
    def pretrain_model(self, src1_path, src2_path, tgt_path, epochs):
        datareader = DataReader()
        data = datareader.read_parallel_data(self.model, src1_path, src2_path,
                                             tgt_path)
        self.seq2seq_trainer.train(
            train_data=data,
            val_data=[],
            epochs=epochs,
            pretrain=True,
        )
Example No. 4
    def init_from_config(self, config):
        # self.model = Model(config)
        self.model = Transformer(config, config.test.devices)
        self.model.build_test_model()

        sess_config = tf.ConfigProto()
        sess_config.gpu_options.allow_growth = True
        sess_config.allow_soft_placement = True
        self.sess = tf.Session(config=sess_config, graph=self.model.graph)
        # Restore model.
        self.model.saver.restore(self.sess, tf.train.latest_checkpoint(config.train.logdir))

        self.data_reader = DataReader(config)
Example No. 5
    def init_from_config(self, config):
        self.model = eval(config.model)(config, config.test.num_gpus)
        self.model.build_test_model()

        sess_config = tf.ConfigProto()
        sess_config.gpu_options.allow_growth = True
        sess_config.allow_soft_placement = True
        self.sess = tf.Session(config=sess_config, graph=self.model.graph)
        if is_debug:
            self.sess = tf_debug.LocalCLIDebugWrapperSession(self.sess)
        # Restore model.
        self.model.saver.restore(self.sess, tf.train.latest_checkpoint(config.model_dir))

        self.data_reader = DataReader(config)
Example No. 6
    def init_from_config(self, config):

        logger = logging.getLogger('')

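        # eval(config.model) looks up the model class named in the config
        # (e.g. Transformer, as in the example above) and instantiates it.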
        self.model = eval(config.model)(config, config.test.num_gpus)
        self.model.build_test_model()

        # Print the number of total parameters
        print_num_of_total_parameters()

        sess_config = tf.ConfigProto()
        sess_config.gpu_options.allow_growth = True
        sess_config.allow_soft_placement = True
        self.sess = tf.Session(config=sess_config, graph=self.model.graph)
        # Restore model.
        self.model.saver.restore(self.sess, tf.train.latest_checkpoint(config.model_dir))

        self.data_reader = DataReader(config)
Example No. 7
    def init_from_config(self, config):
        self.model = eval(config.model)(config, config.test.num_gpus)
        self.model.build_test_model()

        sess_config = tf.ConfigProto()
        sess_config.gpu_options.allow_growth = True
        sess_config.allow_soft_placement = True
        self.sess = tf.Session(config=sess_config)

        # Restore model.
        try:
            tf.train.Saver().restore(
                self.sess, tf.train.latest_checkpoint(config.model_dir))
        except tf.errors.NotFoundError:
            roll_back_to_previous_version(config)
            tf.train.Saver().restore(
                self.sess, tf.train.latest_checkpoint(config.model_dir))

        self.data_reader = DataReader(config)
Example No. 8
    def __init__(self, options):
        """Gonna need a db, and some creds."""
        log.info("Starting AG Chatter Bot.")
        self.options = options
        # Build Constructors
        self.idx2word = Database(
            host=options.redis_host, pass_=options.redis_pass, db=0
        )
        self.word2idx = Database(
            host=options.redis_host, pass_=options.redis_pass, db=1
        )
        self.dataReader = DataReader(
            self.options, self.idx2word, self.word2idx
        )
        self.model = Model(self.options)
        log.debug(options)
        log.info("Init complete.")
Example No. 9
def train(config):
    """Train a model with a config file."""
    logger = logging.getLogger('')
    data_reader = DataReader(config=config)
    model = eval(config.model)(config=config, num_gpus=config.train.num_gpus)
    model.build_train_model(test=config.train.eval_on_dev)

    sess_config = tf.ConfigProto()
    sess_config.gpu_options.allow_growth = True
    sess_config.allow_soft_placement = True

    summary_writer = tf.summary.FileWriter(config.model_dir, graph=model.graph)

    with tf.Session(config=sess_config, graph=model.graph) as sess:
        # Initialize all variables.
        sess.run(tf.global_variables_initializer())
        # Reload variables from disk.
        if tf.train.latest_checkpoint(config.model_dir):
            available_vars = available_variables(config.model_dir)
            if available_vars:
                saver = tf.train.Saver(var_list=available_vars)
                saver.restore(sess,
                              tf.train.latest_checkpoint(config.model_dir))
                for v in available_vars:
                    logger.info('Reload {} from disk.'.format(v.name))
            else:
                logger.info('Nothing to reload from disk.')
        else:
            logger.info('Nothing to reload from disk.')

        evaluator = Evaluator()
        evaluator.init_from_existed(model, sess, data_reader)

        global dev_bleu, toleration
        dev_bleu = evaluator.evaluate(
            **config.dev) if config.train.eval_on_dev else 0
        toleration = config.train.toleration

        def train_one_step(batch):
            feat_batch, target_batch, batch_size = batch
            feed_dict = expand_feed_dict({
                model.src_pls: feat_batch,
                model.dst_pls: target_batch
            })
            step, lr, loss, _ = sess.run([
                model.global_step, model.learning_rate, model.loss,
                model.train_op
            ],
                                         feed_dict=feed_dict)
            if step % config.train.summary_freq == 0:
                summary = sess.run(model.summary_op, feed_dict=feed_dict)
                summary_writer.add_summary(summary, global_step=step)
            return step, lr, loss

        def maybe_save_model():
            global dev_bleu, toleration
            new_dev_bleu = evaluator.evaluate(
                **config.dev) if config.train.eval_on_dev else dev_bleu + 1
            if new_dev_bleu >= dev_bleu:
                mp = config.model_dir + '/model_step_{}'.format(step)
                model.saver.save(sess, mp)
                logger.info('Save model in %s.' % mp)
                toleration = config.train.toleration
                dev_bleu = new_dev_bleu
            else:
                toleration -= 1

        step = 0
        for epoch in range(1, config.train.num_epochs + 1):
            for batch in data_reader.get_training_batches_with_buckets():

                # Train normal instances.
                start_time = time.time()
                step, lr, loss = train_one_step(batch)
                logger.info(
                    'epoch: {0}\tstep: {1}\tlr: {2:.6f}\tloss: {3:.4f}\ttime: {4:.4f}\tbatch_size: {5}'
                    .format(epoch, step, lr, loss,
                            time.time() - start_time, batch[2]))
                # Save model
                if config.train.save_freq > 0 and step % config.train.save_freq == 0:
                    maybe_save_model()

                if config.train.num_steps and step >= config.train.num_steps:
                    break

            # Save the model once per epoch if config.train.save_freq is less than or equal to zero
            if config.train.save_freq <= 0:
                maybe_save_model()

            # Early stop
            if toleration <= 0:
                break
        logger.info("Finish training.")
Example No. 10
def main(unused_argv):
    # prints a message if you've entered flags incorrectly
    if len(unused_argv) != 1: 
        raise Exception("Problem with flags: %s" % unused_argv)
    
    # Get hyperparameters. We only read a subset here; the rest are fed to the Model directly.
    #logging.basicConfig(level=logging.INFO)
    print('Starting Basic model')
    log_root = FLAGS.log_root
    exp_name = FLAGS.exp_name
    data_file_path = FLAGS.data_file_path
    pinyin_dict_path = FLAGS.pinyin_dict_path
    id_data_dir = FLAGS.id_data_dir
    
    n_epoch = FLAGS.n_epoch
    batch_size = FLAGS.batch_size
    seed_num = FLAGS.seed_num
    max_timesteps = FLAGS.max_timesteps
    vocab_size = FLAGS.vocab_size
    train_size = FLAGS.train_size
    load_data_and_dr = FLAGS.load_data_and_dr
    use_local = FLAGS.use_local
        
    
    # make the directory for logs
    log_root = os.path.join(log_root, exp_name)
    if not os.path.exists(log_root):
        os.makedirs(log_root)

    if use_local == 1:
        #load or save the DR class from local dir
        DR_path = os.path.join(log_root, 'DataReader.pkl')
        #load or save the id data from local dir
        id_data_path = os.path.join(log_root, 'id_data.pkl')
    else:
        #load or save the DR class from global dir
        DR_path = os.path.join(id_data_dir, 'DataReader.pkl')
        #load or save the id data from global dir
        id_data_path = os.path.join(id_data_dir, 'id_data.pkl')

    if load_data_and_dr == 1:
        with open(DR_path,'rb') as f:
            DR = pickle.load(f)
        with open(id_data_path,'rb') as f1:
            input_pinyin_data = pickle.load(f1)
            input_word_data = pickle.load(f1)
            target_data = pickle.load(f1)
    else:
        # load and make the data for training
        DR = DataReader(vocab_size=vocab_size, pinyin_dict_path=pinyin_dict_path)
        #input_data,target_data = DR.make_data_from_scratch(file_path = data_file_path,build_dictionary=True)
        input_pinyin_data, input_word_data, target_data = DR.make_data_from_dataframe(
            file_path=data_file_path, build_dictionary=True, max_rows=train_size)
        #save the DR class to local dir
        with open(DR_path,'wb') as f:
            pickle.dump(DR,f)

        #save the ids data to local dir
        with open(id_data_path,'wb') as f1:
            pickle.dump(input_pinyin_data,f1)
            pickle.dump(input_word_data,f1)
            pickle.dump(target_data,f1)
    
    # make the batch
    train_data_full = batch_generator_triple_with_length(
        input_pinyin_data, input_word_data, target_data, batch_size,
        max_timesteps, DR.word2id, DR.pinyin2id)

    # create the model
    model = SpellChecker(hps = FLAGS)

    
    # create the supervisor
    with model.graph.as_default():
        # print the variables of tensorflow
        print("Number of sets of parameters: {}".format(len(tf.trainable_variables())))
        print("Number of parameters: {}".format(
                np.sum([np.prod(v.shape.as_list()) for v in tf.trainable_variables()])))
        for v in tf.trainable_variables():
            print(v)

        sv = tf.train.Supervisor(logdir=log_root,
                                saver = model.saver,
                                summary_op=None,
                                save_model_secs=60,
                                global_step = model.global_step,
                                init_op=model.init_op) # Do not run the summary service


        # train the model 
        with sv.managed_session() as sess:
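            # Each pass of the loop below counts as half an epoch (epoch += 0.5),
            # hence the // (batch_size * 2) divisor and the n_epoch * 2 loop bound.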
            n_iter_per_epoch = len(input_pinyin_data) // (batch_size * 2)
            epoch = 0.0
            print('number of iterations per epoch: {}'.format(n_iter_per_epoch))
            print('start training...')     
            for _ in range(n_epoch * 2):
                epoch += 0.5
                avg_loss = 0.0
                print("----- Epoch {}/{} -----".format(epoch, n_epoch))
                for t in tqdm(range(1, n_iter_per_epoch + 1)):
                    batch_full = next(train_data_full)
                    src_pinyin_list,src_word_list,src_length_list,tgt_list,tgt_length_list = batch_full
                    
                    #if epoch == 0.5:
                        #print(src_list[1])
                        #print(len(src_list[1]))
                        #print(src_length_list[1])
                        #print(tgt_list[1])
                        #print(len(tgt_list[1]))
                        #print(tgt_length_list[1])
                    
                    src_pinyin_list = np.asarray(src_pinyin_list, dtype=np.int32)
                    src_word_list = np.asarray(src_word_list, dtype=np.int32)
                    src_length_list = np.asarray(src_length_list, dtype=np.int32)
                    tgt_list = np.asarray(tgt_list, dtype=np.int32)
                    #tgt_length_list = np.asarray(tgt_length_list, dtype=np.int32)
                    loss = model.train_one_step(src_pinyin_list, src_word_list,
                                                src_length_list, tgt_list, sess)
                    avg_loss += loss
                avg_loss /= n_iter_per_epoch
                print('the avg_loss is {}'.format(avg_loss))
Example No. 11
model_path = sys.argv[5]
logfile = sys.argv[6]

ckpt_file = "ckpt"

x, y_true, y, gap_w, conv3_pool, train_step, accuracy, saver = \
    inference(batch_size)

sess = tf.InteractiveSession()

# Setup summary
merged = tf.summary.merge_all()
writer = tf.summary.FileWriter(logfile, sess.graph)

# Get data reader
data_reader = DataReader(train_dataset_dir, batch_size=batch_size,
                         file_names=False)

tf.add_to_collection('x', x)
tf.add_to_collection('y', y)

tf.add_to_collection('gap_w', gap_w)
tf.add_to_collection('conv3', conv3_pool)

# Initialize variables first, then restore so the checkpoint values are not
# overwritten by the initializer.
sess.run(tf.global_variables_initializer())

ckpt = tf.train.latest_checkpoint(model_path)
if ckpt:
    saver.restore(sess, ckpt)
    print("Model loaded from file: %s" % ckpt)
Example No. 12
def train(config):
    """Train a model with a config file."""
    logger = logging.getLogger('')
    data_reader = DataReader(config=config)
    model = eval(config.model)(config=config, num_gpus=config.train.num_gpus)
    model.build_train_model(test=config.train.eval_on_dev)

    train_op, loss_op = model.get_train_op(name=None)
    global_saver = tf.train.Saver()

    sess_config = tf.ConfigProto()
    sess_config.gpu_options.allow_growth = True
    sess_config.allow_soft_placement = True

    summary_writer = tf.summary.FileWriter(config.model_dir)

    with tf.Session(config=sess_config) as sess:
        # Initialize all variables.
        sess.run(tf.global_variables_initializer())
        # Reload variables from disk.
        if tf.train.latest_checkpoint(config.model_dir):
            available_vars = available_variables(config.model_dir)
            if available_vars:
                saver = tf.train.Saver(var_list=available_vars)
                saver.restore(sess, tf.train.latest_checkpoint(config.model_dir))
                for v in available_vars:
                    logger.info('Reload {} from disk.'.format(v.name))
            else:
                logger.info('Nothing to reload from disk.')
        else:
            logger.info('Nothing to reload from disk.')

        evaluator = Evaluator()
        evaluator.init_from_existed(model, sess, data_reader)

        global dev_bleu, toleration
        dev_bleu = evaluator.evaluate(**config.dev) if config.train.eval_on_dev else 0
        toleration = config.train.toleration

        def train_one_step(batch, loss_op, train_op):
            feed_dict = expand_feed_dict({model.src_pls: batch[0], model.dst_pls: batch[1]})
            step, lr, loss, _ = sess.run(
                [model.global_step, model.learning_rate,
                 loss_op, train_op],
                feed_dict=feed_dict)
            if step % config.train.summary_freq == 0:
                summary = sess.run(model.summary_op, feed_dict=feed_dict)
                summary_writer.add_summary(summary, global_step=step)
            return step, lr, loss

        def maybe_save_model():
            global dev_bleu, toleration

            def save():
                mp = config.model_dir + '/model_step_{}'.format(step)
                global_saver.save(sess, mp)
                logger.info('Save model in %s.' % mp)

            if config.train.eval_on_dev:
                new_dev_bleu = evaluator.evaluate(**config.dev)

                summary = tf.Summary(value=[tf.Summary.Value(tag="dev_bleu",
                                                             simple_value=new_dev_bleu)])

                summary_writer.add_summary(summary, step)

                if config.train.toleration is None:
                    save()
                else:
                    if new_dev_bleu >= dev_bleu:
                        save()
                        toleration = config.train.toleration
                        dev_bleu = new_dev_bleu
                    else:
                        toleration -= 1
            else:
                save()

        try:
            step = 0
            for epoch in range(1, config.train.num_epochs+1):
                for batch in data_reader.get_training_batches(epoches=1):

                    # Train normal instances.
                    start_time = time.time()
                    step, lr, loss = train_one_step(batch, loss_op, train_op)
                    logger.info(
                        'epoch: {0}\tstep: {1}\tlr: {2:.6f}\tloss: {3:.4f}\ttime: {4:.4f}'.
                        format(epoch, step, lr, loss, time.time() - start_time))
                    # Save model
                    if config.train.save_freq > 0 \
                       and step > 0 \
                       and step % config.train.save_freq == 0:
                        maybe_save_model()

                    if config.train.num_steps is not None and step >= config.train.num_steps:
                        raise BreakLoopException("BreakLoop")

                    if toleration is not None and toleration <= 0:
                        raise BreakLoopException("BreakLoop")

                # Save the model once per epoch if config.train.save_freq is less than or equal to zero
                if config.train.save_freq <= 0:
                    maybe_save_model()
        except BreakLoopException as e:
            logger.info(e)

        logger.info("Finish training.")
Example No. 13
    col_idx = (1, 2, 3, 4, 5, 6)
    target_col = len(col_name) - 1

    # ============================================ #
    # Data location
    wd = os.path.dirname(os.path.abspath(__file__)) + '/'
    data_path = wd + 'data/'
    output_path = wd + 'output/'

    # ============================================ #
    # Read data
    data_files = os.listdir(data_path)
    for i in range(len(data_files)):
        data_files[i] = data_path + data_files[i]

    dr = DataReader(data_files, col_idx)
    ds = DataScaler()
    dp = DataParser()

    print('======== Supplying data ============')
    dr.read()

    print('======== Extracting data ============')
    # ============================================ #
    # Split data
    X = dr.data[:, :target_col]
    y = dr.data[:, target_col]
    alias = list(np.unique(y))
    y = dp.convertTextTarget(y, alias)
    #dump_result(output_path + 'accidents.csv', np.array(alias), ['accident'])
    print('Accident types: ', alias)
Example No. 14
logfile = sys.argv[6]

ckpt_file = "ckpt"

x, y_true, y, gap_w, conv3_pool, train_step, accuracy, saver = \
    inference(batch_size)

sess = tf.InteractiveSession()

# Setup summary
merged = tf.summary.merge_all()
writer = tf.summary.FileWriter(logfile, sess.graph)

# Get data reader
data_reader = DataReader(train_dataset_dir,
                         batch_size=batch_size,
                         file_names=False,
                         resize_to=(224, 224))

tf.add_to_collection('x', x)
tf.add_to_collection('y', y)

tf.add_to_collection('gap_w', gap_w)
tf.add_to_collection('conv3', conv3_pool)

# Initialize variables first, then restore so the checkpoint values are not
# overwritten by the initializer.
sess.run(tf.global_variables_initializer())

ckpt = tf.train.latest_checkpoint(model_path)
if ckpt:
    saver.restore(sess, ckpt)
    print("Model loaded from file: %s" % ckpt)
Example No. 15
def main(unused_argv):
    # prints a message if you've entered flags incorrectly
    if len(unused_argv) != 1:
        raise Exception("Problem with flags: %s" % unused_argv)

    # Get hyperparameters. We only read a subset here; the rest are fed to the Model directly.
    #logging.basicConfig(level=logging.INFO)
    print('Starting Basic model')
    log_root = FLAGS.log_root
    exp_name = FLAGS.exp_name
    data_file_path = FLAGS.data_file_path
    pinyin_dict_path = FLAGS.pinyin_dict_path
    id_data_dir = FLAGS.id_data_dir

    n_epoch = FLAGS.n_epoch
    batch_size = FLAGS.batch_size
    seed_num = FLAGS.seed_num
    max_timesteps = FLAGS.max_timesteps
    vocab_size = FLAGS.vocab_size
    train_size = FLAGS.train_size
    load_data_and_dr = FLAGS.load_data_and_dr
    use_local = FLAGS.use_local

    # make the directory for logs
    log_root = os.path.join(log_root, exp_name)
    if not os.path.exists(log_root):
        os.makedirs(log_root)

    if use_local == 1:
        #load or save the DR class from local dir
        DR_path = os.path.join(log_root, 'DataReader.pkl')
        #load or save the id data from local dir
        id_data_path = os.path.join(log_root, 'id_data.pkl')
    else:
        #load or save the DR class from global dir
        DR_path = os.path.join(id_data_dir, 'DataReader.pkl')
        #load or save the id data from global dir
        id_data_path = os.path.join(id_data_dir, 'id_data.pkl')

    if load_data_and_dr == 1:
        with open(DR_path, 'rb') as f:
            DR = pickle.load(f)
        with open(id_data_path, 'rb') as f1:
            input_pinyin_data = pickle.load(f1)
            input_word_data = pickle.load(f1)
            target_data = pickle.load(f1)
    else:
        # load and make the data for training
        DR = DataReader(vocab_size=vocab_size,
                        pinyin_dict_path=pinyin_dict_path)
        #input_data,target_data = DR.make_data_from_scratch(file_path = data_file_path,build_dictionary=True)
        input_pinyin_data, input_word_data, target_data = DR.make_data_from_dataframe(
            file_path=data_file_path,
            build_dictionary=True,
            max_rows=train_size)
        #save the DR class to local dir
        with open(DR_path, 'wb') as f:
            pickle.dump(DR, f)

        #save the ids data to local dir
        with open(id_data_path, 'wb') as f1:
            pickle.dump(input_pinyin_data, f1)
            pickle.dump(input_word_data, f1)
            pickle.dump(target_data, f1)

    # make the batch
    train_data_full = batch_generator_triple_with_length(
        input_pinyin_data, input_word_data, target_data, batch_size,
        max_timesteps, DR.word2id, DR.pinyin2id)

    # create the model
    model = SpellChecker(hps=FLAGS)
    sess = tf.Session()
    sess.run(tf.global_variables_initializer())

    n_iter_per_epoch = len(input_pinyin_data) // (batch_size * 2)
    epoch = 0.0
    print('number of iterations per epoch: {}'.format(n_iter_per_epoch))
    print('start training...')
    for _ in range(n_epoch * 2):
        epoch += 0.5
        avg_loss = 0.0
        print("----- Epoch {}/{} -----".format(epoch, n_epoch))
        for t in tqdm(range(1, n_iter_per_epoch + 1)):
            batch_full = next(train_data_full)
            src_pinyin_list, src_word_list, src_length_list, tgt_list, tgt_length_list = batch_full

            src_pinyin_list = np.asarray(src_pinyin_list, dtype=np.int32)
            src_word_list = np.asarray(src_word_list, dtype=np.int32)
            src_length_list = np.asarray(src_length_list, dtype=np.int32)
            tgt_list = np.asarray(tgt_list, dtype=np.int32)
            keep_ratio = FLAGS.keep_ratio

            #tgt_length_list = np.asarray(tgt_length_list,dtype = np.int32)
            loss = model.train_one_step(src_pinyin_list, src_word_list,
                                        src_length_list, tgt_list, keep_ratio,
                                        sess)
            avg_loss += loss
        avg_loss /= n_iter_per_epoch
        print('the avg_loss is {}'.format(avg_loss))

        if epoch == 1.5:
            print('Build model for serving...')
            model.build_model_for_serving(sess)
            print('Build model serving done!')
Example No. 16
    def init_from_frozen_graphdef(self, config):
        frozen_graph_path = os.path.join(config.model_dir,
                                         'freeze_graph_test.py')
        # If the file doesn't exist, create it.
        if not os.path.exists(frozen_graph_path):
            logging.warning(
                'The frozen graph does not exist; use \'init_from_config\' instead '
                'and create a frozen graph for next use.')
            self.init_from_config(config)
            saver = tf.train.Saver()
            save_dir = '/tmp/graph-{}'.format(os.getpid())
            os.mkdir(save_dir)
            save_path = '{}/ckpt'.format(save_dir)
            saver.save(sess=self.sess, save_path=save_path)

            with tf.Session(graph=tf.Graph()) as sess:
                clear_devices = True
                output_node_names = ['loss_sum', 'predictions']
                # We import the meta graph in the current default Graph
                saver = tf.train.import_meta_graph(save_path + '.meta',
                                                   clear_devices=clear_devices)

                # We restore the weights
                saver.restore(sess, save_path)

                # We use a built-in TF helper to export variables to constants
                output_graph_def = tf.graph_util.convert_variables_to_constants(
                    sess,  # The session is used to retrieve the weights
                    tf.get_default_graph().as_graph_def(
                    ),  # The graph_def is used to retrieve the nodes
                    output_node_names  # The output node names are used to select the useful nodes
                )

                # Finally we serialize and dump the output graph to the filesystem
                with tf.gfile.GFile(frozen_graph_path, "wb") as f:
                    f.write(output_graph_def.SerializeToString())
                    logging.info("%d ops in the final graph." %
                                 len(output_graph_def.node))

                # Remove temp files.
                os.system('rm -rf ' + save_dir)
        else:
            sess_config = tf.ConfigProto()
            sess_config.gpu_options.allow_growth = True
            sess_config.allow_soft_placement = True
            self.sess = tf.Session(config=sess_config)
            self.data_reader = DataReader(config)

            # We load the protobuf file from the disk and parse it to retrieve the
            # unserialized graph_def
            with tf.gfile.GFile(frozen_graph_path, "rb") as f:
                graph_def = tf.GraphDef()
                graph_def.ParseFromString(f.read())

            # Import the graph_def into the current default graph.
            tf.import_graph_def(graph_def)
            graph = tf.get_default_graph()
            self.model = AttrDict()

            def collect_placeholders(prefix):
                ret = []
                idx = 0
                while True:
                    try:
                        ret.append(
                            graph.get_tensor_by_name('import/{}_{}:0'.format(
                                prefix, idx)))
                        idx += 1
                    except KeyError:
                        return tuple(ret)

            self.model['src_pls'] = collect_placeholders('src_pl')
            self.model['dst_pls'] = collect_placeholders('dst_pl')
            self.model['predictions'] = graph.get_tensor_by_name(
                'import/predictions:0')
Example No. 17
    except:
        print("Please add --train or --test after py Regressor.py")
        options = None

    if options == "--train":
        r = Regressor("Random Forest", load_model=False)
        mod = Regressor("Random Forest", load_model=False)
        cv, ma, mse = r.train(mod, save=False, make_chart=False)
        print(cv, ma, mse)

    elif options == "--test":
        # e.g. "Random Forest_2017" or "Random Forest_2016"
        model_name = sys.argv[2] + " " + sys.argv[3]
        year = int(model_name.split("_")[-1])
        r = Regressor(model_name, load_model=True)
        reader = DataReader()
        df = reader.create_input_data()
        predictions = r.predict(df, year)
        print("Actual || Predicted")
        for i in range(len(predictions)):
            print(df.iloc[i]['hdi'], "||", predictions[i])

######Training Code#########

#cv_error = []
#testing_ma_error = []
#testing_mse = []
#mod = RandomForestRegressor(bootstrap=True, criterion='mae', n_estimators=100)
#mod = RandomForestRegressor()
#r = Regressor("Random Forest_2017", load_model=True)
#importances = r.model.feature_importances_
Example No. 18
    def train(self, model, save=False, make_chart=False):
        """
		Trains an input model. Makes Calculations, Charts, and Saves
		the model if necessary.

		Parameters
		----------
		model:     SKLearn Model The regression model to use
		save:      Boolean Whether or not the model should be saved
		make_chart Boolean Whether or not to make/save a chart

		Returns
		-------
		float, float, float: The Average CV Mean Squared Error, Mean Absolute Error, and Test MSE 
		"""
        #get/split data
        reader = DataReader()
        df = reader.create_input_data()
        df = self.preprocess(df)
        self.X_train, self.X_test, self.y_train, self.y_test = self.split_data(
            df)

        parameters = {
            'n_estimators': [1, 5, 10, 20, 30],
            'max_depth': [1, 5, 10]
        }
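        # Note: the grid-searched random forest below replaces self.model; the
        # `model` argument passed to train() is not actually used in this body.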
        rf = RandomForestRegressor()
        self.model = GridSearchCV(rf, parameters, cv=10)
        #train model
        self.model.fit(self.X_train, self.y_train)

        #Feature importance
        importances = self.model.best_estimator_.feature_importances_
        cols = self.X_train.columns
        for i in range(len(importances)):
            print(cols[i], importances[i])

        if save:
            joblib.dump(self.model.best_estimator_,
                        "../models/" + self.name + "_2017.joblib")

        print("------------------------")
        MSEs = cross_val_score(estimator=self.model,
                               X=self.X_train,
                               y=self.y_train,
                               scoring='neg_mean_squared_error',
                               cv=8)

        predicted = self.model.predict(self.X_test)
        print("Average CV Mean Squared Error: ", abs(np.mean(MSEs)))
        print(
            "Testing Mean Absolute Error: ",
            mean_absolute_error(self.y_test, self.model.predict(self.X_test)))
        print("Testing MSE: ", mean_squared_error(self.y_test, predicted))
        #print(self.model.feature_importances_)
        if make_chart:
            print("Generating Chart...")
            plt.style.use('dark_background')
            fig, ax = plt.subplots(nrows=1, ncols=1)
            ax.set_ylabel('HDI')
            ax.set_xlabel("Municipality Codmun ID")
            ax.set_title(self.name + ' Real vs Predicted')
            green, = ax.plot(np.arange(20),
                             self.y_test[0:100:5],
                             'g',
                             label='True')
            red, = ax.plot(np.arange(20),
                           predicted[0:100:5],
                           'r',
                           label='Predicted')
            ax.set_xticks(np.arange(20))
            x_labels = self.X_test.iloc[0:100:5]['codmun'].tolist()
            ax.set_xticklabels([str(int(y)) for y in x_labels],
                               rotation='vertical')
            plt.legend(handles=[green, red], labels=["True", "Predicted"])
            plt.tight_layout()
            fig.savefig(self.name + "_real_v_predicted")
            for x in range(0, 100, 5):
                print(predicted[x], x_labels[int(x / 5)])
            print(x_labels, predicted[0:100:5])

        return np.mean(MSEs), mean_absolute_error(
            self.y_test, self.model.predict(self.X_test)), mean_squared_error(
                self.y_test, predicted)
Example No. 19
    "pert_id": ['BRD-U41416256', 'BRD-U60236422'],
    "pert_type": ["trt_cp"],
    "cell_id": ['A375', 'HA1E', 'HELA', 'HT29', 'MCF7', 'PC3', 'YAPC'],
    "pert_idose":
    ["0.04 um", "0.12 um", "0.37 um", "1.11 um", "3.33 um", "10.0 um"]
}

# check cuda
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Use GPU: %s" % torch.cuda.is_available())

data = DataReader(drug_file, gene_file, gene_expression_file_train,
                  gene_expression_file_dev, gene_expression_file_test, filter,
                  device)
print('#Train: %d' % len(data.train_feature['drug']))
print('#Dev: %d' % len(data.dev_feature['drug']))
print('#Test: %d' % len(data.test_feature['drug']))

# model creation
model = DeepCE(drug_input_dim=drug_input_dim,
               drug_emb_dim=drug_embed_dim,
               conv_size=conv_size,
               degree=degree,
               gene_input_dim=np.shape(data.gene)[1],
               gene_emb_dim=gene_embed_dim,
               num_gene=np.shape(data.gene)[0],
               hid_dim=hid_dim,
               dropout=dropout,
Example No. 20
def train(args):
    vocab = Vocab.load(args.vocab, max_size=args.vocab_size)
    data_reader = DataReader(data_dir=args.data_dir, shuffle=True)
    preprocessor = Preprocessor(
        predict_prev=args.predict_prev,
        predict_cur=args.predict_cur,
        predict_next=args.predict_next,
        vocab=vocab, max_length=args.max_length, gpu=args.gpu)
    model = SkipThought(
        rnn_type=args.rnn_type, num_words=len(vocab),
        word_dim=args.word_dim, hidden_dim=args.hidden_dim,
        bidirectional=args.bidirectional,
        predict_prev=args.predict_prev,
        predict_cur=args.predict_cur,
        predict_next=args.predict_next)
    print(model)

    if args.pretrained is not None:
        print(f'Loading pretrained model from {args.pretrained}')
        model.load_state_dict(
            torch.load(args.pretrained,
                       map_location=lambda storage, loc: storage))
    if args.gpu > -1:
        model.cuda(args.gpu)
    optimizer = optim.Adam(model.parameters())

    summary_writer = SummaryWriter(os.path.join(args.save_dir, 'log'))

    def add_scalar_summary(name, value, step):
        summary_writer.add_scalar(tag=name, scalar_value=value,
                                  global_step=step)

    def add_text_summary(name, value, step):
        summary_writer.add_text(tag=name, text_string=value,
                                global_step=step)

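    # The Variable/volatile wrappers and loss.data[0] below follow the legacy
    # (pre-0.4) PyTorch API that this example appears to target.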
    def variable(tensor, volatile=False):
        return Variable(tensor, volatile=volatile)

    def run_train_iter(batch):
        if not model.training:
            model.train()
        src, tgt = preprocessor(batch)
        src = (variable(src[0]), src[1])
        for k in tgt:
            tgt[k] = (variable(tgt[k][0]), tgt[k][1])
        logits = model.forward(src=src, tgt=tgt)
        loss = 0
        for k in tgt:
            logits_k = logits[k]
            tgt_k = tgt[k]
            loss = loss + basic.sequence_cross_entropy(
                logits=logits_k[:-1], targets=tgt_k[0][1:],
                length=tgt_k[1] - 1)
        optimizer.zero_grad()
        loss.backward()
        clip_grad_norm(model.parameters(), max_norm=10)
        optimizer.step()
        return loss.data[0]

    def ids_to_words(ids):
        words = []
        eos_id = vocab.stoi(vocab.eos)
        for id_ in ids:
            words.append(vocab.itos(id_))
            if id_ == eos_id:
                break
        return words

    def generate_using_decoder(name, src, max_length):
        _, encoder_state = model.encoder(words=src[0], length=src[1])
        if isinstance(encoder_state, tuple):  # LSTM
            encoder_state = encoder_state[0]
        context = (encoder_state.transpose(0, 1).contiguous()
                   .view(-1, args.hidden_dim))
        batch_size = src[1].size(0)

        bos_id = vocab.stoi(vocab.bos)
        bos = Variable(src[1].new(1, batch_size).fill_(bos_id))
        decoder = model.get_decoder(name)
        prev_pred = bos
        done = torch.zeros(batch_size).byte()
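        # NOTE: 'done' is never updated inside the loop below, so the early exit
        # on done.all() never triggers and decoding always runs max_length steps.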
        hyps = []
        prev_state = context.unsqueeze(0)
        for t in range(max_length):
            if done.all():
                break
            decoder_input = prev_pred
            logit, prev_state = decoder(words=decoder_input,
                                        prev_state=prev_state)
            pred = logit.max(2)[1]
            prev_pred = pred
            hyps.append(pred.data)
        hyps = torch.cat(hyps, dim=0).transpose(0, 1).tolist()
        return hyps

    def generate(batch):
        # Greedy search
        src, tgt = preprocessor(batch)
        src = (variable(src[0]), src[1])
        for k in tgt:
            tgt[k] = (variable(tgt[k][0], volatile=True), tgt[k][1])
        batch_size = src[0].size(1)
        max_length = src[0].size(0) * 2
        generated = {}
        for k in tgt:
            generated[k] = generate_using_decoder(
                name=k, src=src, max_length=max_length)
        results = []
        for i in range(batch_size):
            res = {'src': ' '.join(ids_to_words(src[0][:src[1][i], i].data)),
                   'tgt': {},
                   'out': {}}
            for k in tgt:
                res['tgt'][k] = ' '.join(ids_to_words(tgt[k][0][1:, i].data))
                res['out'][k] = ' '.join(ids_to_words(generated[k][i]))
            results.append(res)
        return results

    def generate_synthetic_batch(real_batch):
        def sort_by_length(tgt_of_key):
            sorted_length, sort_inds = tgt_of_key[1].sort(
                dim=0, descending=True)
            return tgt_of_key[0][:, sort_inds], sorted_length

        # Forward: given prev, generate cur'
        _, tgt = preprocessor(real_batch)
        tgt_prev, tgt_prev_length = sort_by_length(tgt['prev'])
        syn_src_fw = generate_using_decoder(
            name='next',
            src=(variable(tgt_prev[1:], volatile=True),
                 tgt_prev_length - 1),
            max_length=args.max_length)
        # Backward: given next, generate cur''
        tgt_next, tgt_next_length = sort_by_length(tgt['next'])
        syn_src_bw = generate_using_decoder(
            name='prev',
            src=(variable(tgt_next[1:], volatile=True),
                 tgt_next_length - 1),
            max_length=args.max_length)
        syn_batch_fw = []
        syn_batch_bw = []
        for i in range(len(real_batch)):
            syn_src_fw_str = ' '.join(ids_to_words(syn_src_fw[i]))
            syn_src_bw_str = ' '.join(ids_to_words(syn_src_bw[i]))
            syn_batch_fw.append(
                (real_batch[i][0], syn_src_fw_str, real_batch[i][2]))
            syn_batch_bw.append(
                (real_batch[i][0], syn_src_bw_str, real_batch[i][2]))
        return syn_batch_fw, syn_batch_bw

    global_step = 0

    def print_samples():
        model.eval()
        num_samples = 2
        samples = data_reader.next_batch(size=num_samples, peek=True)
        syn_samples_fw, syn_samples_bw = generate_synthetic_batch(samples)
        gen_results = generate(samples)
        syn_gen_results_fw = generate(syn_samples_fw)
        syn_gen_results_bw = generate(syn_samples_bw)
        text_val = ''
        for i, res in enumerate(gen_results):
            text_val += f'* sample (real) #{i}\n'
            text_val += f'\t* src: {res["src"]}\n'
            for k in res['tgt']:
                tgt_k = res['tgt'][k]
                out_k = res['out'][k]
                text_val += f'\t* {k} (tgt): {tgt_k}\n'
                text_val += f'\t* {k} (out): {out_k}\n'
        for i, res in enumerate(syn_gen_results_fw):
            text_val += f'* sample (syn_fw) #{i}\n'
            text_val += f'\t* src: {res["src"]}\n'
            for k in res['tgt']:
                tgt_k = res['tgt'][k]
                out_k = res['out'][k]
                text_val += f'\t* {k} (tgt): {tgt_k}\n'
                text_val += f'\t* {k} (out): {out_k}\n'
        for i, res in enumerate(syn_gen_results_bw):
            text_val += f'* sample (syn_bw) #{i}\n'
            text_val += f'\t* src: {res["src"]}\n'
            for k in res['tgt']:
                tgt_k = res['tgt'][k]
                out_k = res['out'][k]
                text_val += f'\t* {k} (tgt): {tgt_k}\n'
                text_val += f'\t* {k} (out): {out_k}\n'
        add_text_summary('Sample', value=text_val, step=global_step)

    for epoch in range(args.max_epoch):
        data_reader.start_epoch()
        for batch in tqdm(data_reader.iterator(args.batch_size),
                          desc=f'Epoch {epoch}'):
            # Train on real batch
            real_loss = run_train_iter(batch)
            # Train on synthetic batches
            syn_batch_fw, syn_batch_bw = generate_synthetic_batch(batch)
            syn_loss_fw = run_train_iter(syn_batch_fw)
            syn_loss_bw = run_train_iter(syn_batch_bw)
            global_step += 1
            add_scalar_summary(name='real_loss', value=real_loss,
                               step=global_step)
            add_scalar_summary(name='syn_loss_fw', value=syn_loss_fw,
                               step=global_step)
            add_scalar_summary(name='syn_loss_bw', value=syn_loss_bw,
                               step=global_step)
            if global_step % args.print_every == 0:
                print_samples()
            if global_step % args.save_every == 0:
                model_filename = f'model-{global_step}.pt'
                model_path = os.path.join(args.save_dir, model_filename)
                torch.save(model.state_dict(), model_path)
                print(f'\nIter #{global_step}: '
                      f'Saved checkpoint to {model_path}')
Example No. 21
    def __init__(self, model, output_name):
        self.model = model
        self.datareader = DataReader()
        self.metrics = ErrorMetrics()
        self.output_name = output_name
Example No. 22
def train(config, num_epoch, last_pretrain_model_dir, pretrain_model_dir,
          model_dir, block_idx_enc, block_idx_dec):
    """Train a model with a config file."""
    logger = logging.getLogger('')
    config.num_blocks_enc = block_idx_enc
    config.num_blocks_dec = block_idx_dec
    # if block_idx >= 2:
    #     config.train.var_filter = 'encoder/block_' + str(block_idx - 1) + '|' + 'decoder/block_' + str(
    #         block_idx - 1) + '|' + 'encoder/src_embedding' + '|' + 'decoder/dst_embedding'
    # if block_idx >= 2:
    #     config.train.var_filter = 'encoder/block_' + str(block_idx - 1) + '|' + 'decoder/block_' + str(
    #         block_idx - 1)
    logger.info("config.num_blocks_enc=" + str(config.num_blocks_enc) +
                ",config.num_blocks_dec=" + str(config.num_blocks_dec) +
                ',config.train.var_filter=' + str(config.train.var_filter))
    """Train a model with a config file."""
    data_reader = DataReader(config=config)
    model = eval(config.model)(config=config, num_gpus=config.train.num_gpus)
    model.build_train_model(test=config.train.eval_on_dev)

    sess_config = tf.ConfigProto()
    sess_config.gpu_options.allow_growth = True
    sess_config.allow_soft_placement = True

    summary_writer = tf.summary.FileWriter(pretrain_model_dir,
                                           graph=model.graph)

    with tf.Session(config=sess_config, graph=model.graph) as sess:
        # Initialize all variables.
        sess.run(tf.global_variables_initializer())
        # Reload variables from disk.
        if tf.train.latest_checkpoint(last_pretrain_model_dir):
            available_vars = available_variables_without_global_step(
                last_pretrain_model_dir)
            # available_vars = available_variables(last_pretrain_model_dir)
            if available_vars:
                saver = tf.train.Saver(var_list=available_vars)
                saver.restore(
                    sess, tf.train.latest_checkpoint(last_pretrain_model_dir))
                for v in available_vars:
                    logger.info('Reload {} from disk.'.format(v.name))
            else:
                logger.info('Nothing to reload from disk.')
        else:
            logger.info('Nothing to reload from disk.')

        evaluator = Evaluator()
        evaluator.init_from_existed(model, sess, data_reader)

        global dev_bleu, toleration
        dev_bleu = evaluator.evaluate(
            **config.dev) if config.train.eval_on_dev else 0
        toleration = config.train.toleration

        def train_one_step(batch):
            feat_batch, target_batch = batch
            feed_dict = expand_feed_dict({
                model.src_pls: feat_batch,
                model.dst_pls: target_batch
            })
            step, lr, loss, _ = sess.run([
                model.global_step, model.learning_rate, model.loss,
                model.train_op
            ],
                                         feed_dict=feed_dict)
            if step % config.train.summary_freq == 0:
                logger.info('pretrain summary_writer...')
                summary = sess.run(model.summary_op, feed_dict=feed_dict)
                summary_writer.add_summary(summary, global_step=step)
                summary_writer.flush()
            return step, lr, loss

        def maybe_save_model(model_dir, is_save_global_step=True):
            global dev_bleu, toleration
            new_dev_bleu = evaluator.evaluate(
                **config.dev) if config.train.eval_on_dev else dev_bleu + 1
            if new_dev_bleu >= dev_bleu:
                mp = model_dir + '/pretrain_model_step_{}'.format(step)

                # model.saver.save(sess, mp)
                if is_save_global_step:
                    model.saver.save(sess, mp)
                else:
                    variables_without_global_step = global_variables_without_global_step()
                    saver = tf.train.Saver(
                        var_list=variables_without_global_step, max_to_keep=10)
                    saver.save(sess, mp)

                logger.info('Save model in %s.' % mp)
                toleration = config.train.toleration
                dev_bleu = new_dev_bleu
            else:
                toleration -= 1

        step = 0
        for epoch in range(1, num_epoch + 1):
            for batch in data_reader.get_training_batches_with_buckets():
                # Train normal instances.
                start_time = time.time()
                step, lr, loss = train_one_step(batch)
                logger.info(
                    'epoch: {0}\tstep: {1}\tlr: {2:.6f}\tloss: {3:.4f}\ttime: {4:.4f}'
                    .format(epoch, step, lr, loss,
                            time.time() - start_time))

                if config.train.num_steps and step >= config.train.num_steps:
                    break

            # Early stop
            if toleration <= 0:
                break

        maybe_save_model(pretrain_model_dir)
        if model_dir:
            maybe_save_model(model_dir, False)
        logger.info("Finish pretrain block_idx_enc=" + str(block_idx_enc) +
                    ',block_idx_dec=' + str(block_idx_dec))
Example No. 23
def analyze(data, acc):
    """Analyze the data in `data' and store quantities in the accumulator `acc'"""

    ##fir = load_amplitude_reco_weights('pulse_weights.pkl')
    #fir = load_amplitude_reco_weights('computed_weights.pkl')
    #print(len(fir), fir)

    signal_processing = cfg.cfg.get('analysis',
                                    'signal_processing',
                                    fallback='')

    if signal_processing == 'butterworth':
        butterworth = True
        print("# Using Butterworth filter for signal processing")
    else:
        butterworth = False

    lfreq_default = cfg.cfg.getfloat('analysis', 'lfreq_default', fallback=3)
    hfreq_default = cfg.cfg.getfloat('analysis', 'hfreq_default', fallback=300)
    thr_default = cfg.cfg.getfloat('analysis',
                                   'threshold_default',
                                   fallback=0.01)
    win_default = cfg.cfg.getfloat('analysis',
                                   'peak_search_window',
                                   fallback=1.e-3)
    lfreq = []
    thr = []
    hfreq = []
    win = []
    gain = []
    noise_threshold = []
    for i in range(len(data)):
        lfreq.append(
            cfg.cfg.getfloat('analysis',
                             'filter_lfreq_ch%03d' % (i + 1),
                             fallback=lfreq_default))
        hfreq.append(
            cfg.cfg.getfloat('analysis',
                             'filter_hfreq_ch%03d' % (i + 1),
                             fallback=hfreq_default))
        thr.append(
            cfg.cfg.getfloat('analysis',
                             'thr_ch%03d' % (i + 1),
                             fallback=thr_default))
        win.append(
            cfg.cfg.getfloat('analysis',
                             'peak_search_window_ch%03d' % (i + 1),
                             fallback=win_default))
        gain.append(
            cfg.cfg.getfloat('setup', 'gain_ch%03d' % (i + 1), fallback=1000))
        noise_threshold.append(
            cfg.cfg.getfloat('analysis',
                             'noise_threshold_ch%03d' % (i + 1),
                             fallback=2e-6))

    max_samples = cfg.cfg.getint('data', 'max_samples_per_file', fallback=-1)
    n_max_chunk = cfg.cfg.getint('data', 'n_max_chunk', fallback=-1)

    ## analyze independent channels
    tot_samples_read = 0
    for i, f in enumerate(data):

        print('# Processing file', f, '(%d)' % i)
        # to avoid a too big file loaded in RAM, split the reading in parts
        # and accumulate the results in acc
        #d = read_data(f, max_samples)
        h = DataReader(f, max_samples, n_max_chunk)
        n_samples_read = 0
        for d in h:
            cfg.params.sampling_freq = h.sampling_freq
            duration = len(d) / 3.6e3 / cfg.params.sampling_freq
            # skipping runs of less than 28.8 seconds
            if duration < 0.008:
                print("# skipping file/chunk %d (%d samples - %f hours)" %
                      (i, len(d), duration))
                continue
            ###print("# processing file %d (%d samples - %f hours)" % (i, len(d), duration))
            print("# Progress: %.1f%%" % (h.progress() * 100.))
            d = volt(d) / gain[i]
            suff = '_det%03d' % i
            det = i + 1

            acc.set_sampling_freq(det, h.sampling_freq)

            #compute_pulse_weights(d, 200)

            ##for j, s in enumerate(d):
            ##    if j > 50000:
            ##        break
            ##    print(i, j, s)
            ##print('\n')
            ##continue
            #import sys
            #sys.exit(0)

            # amplitude spectrum
            if butterworth:
                #TODO select freq depending on channel type
                peaks, peaks_max = filt_ana.find_peaks_2(
                    d, [lfreq[i], hfreq[i]], cfg.params.sampling_freq, win[i],
                    thr[i])
            else:
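                # NOTE: 'fir' is only defined if the commented-out weight
                # loading at the top of analyze() is re-enabled.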
                peaks, peaks_max = find_peaks(d * 1., fir)

            peaks = list(np.add(peaks, n_samples_read))
            #print(peaks[:10], '...', peaks[-10:])
            acc.add(det, 'peak', (peaks, peaks_max))

            # store peak positions and amplitudes for
            # correlation analysis
            #corr_peaks[1] = (peaks, peaks_max)

            # baseline vs time
            base, base_min = baseline(d * 1., 10000)
            base = list(np.add(base, n_samples_read))
            acc.add(det, 'baseline', (base, base_min))

            ## normalized pulse shape
            #shapes = pulse_shapes(d * 1., peaks, 1000)
            #plot_pulse_shapes(shapes, suff, det)

            ## power spectral density -- all
            #f, Pxx_den = signal.welch(d, cfg.params.sampling_freq, nperseg = 25000)
            # power spectral density -- noise only
            dn = remove_signals(d, noise_threshold[i], 10000)
            #print(dn[:10], '...', dn[-10:], len(dn), d[:10], '...', d[-10:], len(d))
            f, Pxx_den = signal.welch(dn,
                                      cfg.params.sampling_freq,
                                      nperseg=min(25000, int(len(dn) / 2)))
            acc.add(det, 'fft', (f, Pxx_den))

            ## rate FFT # FIXME: takes quite a long time
            #p = [0] * (peaks[len(peaks) - 1] + 1)
            #for el in peaks:
            #    p[el] = 1.
            ##p = np.abs(np.fft.rfft(p[:10000]))
            ##p = np.abs(np.fft.rfft(rate))
            ##f = np.linspace(0, 1/2, len(p))
            ##plot_fft_rate(f, p, suff)
            #from scipy import signal
            #f, Pxx_den = signal.periodogram(p[:10000], 1)
            #plot_fft_rate(f, Pxx_den, suff)

            acc.add_analyzed_samples(det, h.last_chunk_size)
            n_samples_read += h.last_chunk_size
            #print('-->', h.last_chunk_size, n_samples_read)

        tot_samples_read += n_samples_read

    return tot_samples_read
    target_col = len(col_name) - 1

    # ============================================ #
    # Data location
    wd = os.path.dirname(os.path.abspath(__file__)) + '/'
    data_path = wd + 'data/'
    data_path += 'prototype/'
    output_path = wd + 'output/'

    # ============================================ #
    # Read data
    data_files = os.listdir(data_path)
    for i in range(len(data_files)):
        data_files[i] = data_path + data_files[i]

    dr = DataReader(data_files, col_idx)
    ds = DataScaler()
    dp = DataParser()

    print('======== Supplying data ============')
    for file_id in range(len(data_files)):
        dr_tmp = DataReader([data_files[file_id]], col_idx)
        dr_tmp.read(delimiter='\t')
        data = dr_tmp.getData()
        data = parse_data(dp, data, col_name, target_col)
        dr.append(data)

        del data
        del dr_tmp

        print(file_id + 1, ' - ', data_files[file_id], ': ',