Example #1
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--input', '-i', help='input file (csv file)')
    parser.add_argument('--minsup', '-m', type=float, help='minimum support')
    parser.add_argument('--minconf',
                        '-c',
                        type=float,
                        help='minimum confidence')
    parser.add_argument('--numrule',
                        '-n',
                        type=int,
                        help='max number of rules output')
    args = parser.parse_args()

    db = CsvDatabase(args.input)
    Factory.setup_db(db)
    apriori = Apriori(db, args.minsup, args.minconf, args.numrule)
    frequent_itemsets = apriori.generate_frequent_itemset()
    for level in frequent_itemsets:
        print(level, ': ', len(frequent_itemsets[level]))
        for itemset in frequent_itemsets[level]:
            print(itemset)
    print('Rules:')
    for rule in apriori.generate_all_confidence_rules():
        print(rule, '(confidence: %.4f)' % (rule.confidence))
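Example #1 omits its imports and entry point. As a minimal sketch of how the same classes could be driven directly, assuming Apriori, CsvDatabase and Factory are importable (the module name apriori and the threshold values are placeholders, not from the original source):

from apriori import Apriori, CsvDatabase, Factory  # hypothetical module name

db = CsvDatabase('test_data/zero_one.csv')  # CSV path reused from Example #8
Factory.setup_db(db)

# 0.2 / 0.8 are illustrative minsup/minconf values, None = no rule limit
apriori = Apriori(db, 0.2, 0.8, None)
for rule in apriori.generate_all_confidence_rules():
    print(rule, '(confidence: %.4f)' % rule.confidence)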
Example #2
    def __init__(self,
                 fold,
                 conf,
                 data_conf,
                 cache_manager,
                 args,
                 inference=False,
                 verbose=True):
        self._args = args
        self._fold = fold
        self._conf = conf
        self._data_conf = data_conf
        self._inference = inference
        self._verbose = verbose
        self.tmp_dir = self._data_conf['tmp']

        # we save output with this folder structure:
        # output/
        #       -> tensorboard/ (tensorboard results)
        #       -> results/ (output files: images, illuminant, GT, etc...)
        #       -> checkpoint.pth.tar (checkpoint to continue training in case of failure)
        #       -> model_best.pth.tar (best checkpoint, for inference)
        self._pretrained_model = None
        if not self._inference:
            output_dir = os.path.join(self._args.outputfolder, str(self._fold))
            self._tensorboard_dir = os.path.join(output_dir, 'tensorboard')
            self._results_dir = os.path.join(output_dir, 'results')
            self._best_checkpoint_file = os.path.join(output_dir,
                                                      'model_best.pth.tar')
            self._checkpoint_file = os.path.join(output_dir,
                                                 'checkpoint.pth.tar')
            self._pretrained_model = self._args.pretrainedmodel

            # create all directories
            os.makedirs(self._tensorboard_dir, exist_ok=True)
        else:
            # for inference all results are saved under the output directory
            # (images, illuminant, GT, etc...)
            self._results_dir = self._args.outputfolder
            if isinstance(self._args.checkpointfile, list):
                self._checkpoint_file = self._args.checkpointfile[fold]
            else:
                self._checkpoint_file = self._args.checkpointfile

        self._display = Display(self._conf)
        self._factory = Factory(self._conf, self._data_conf, cache_manager,
                                self._args, verbose)
        self._cache_manager = cache_manager

        # create output directory
        os.makedirs(self._results_dir, exist_ok=True)

        os.environ['TORCH_HOME'] = os.path.join(
            os.path.dirname(os.path.realpath(__file__)), os.pardir,
            'torch_model_zoo')
Example #3
    def __init__(self, config_filename):
        # configuration
        config = Config()
        config_file = "{}/{}".format(config.config_dir, config_filename)
        config.update_config(config_file)

        # word embedding
        print("setting word embedding...")
        word_embedding = Embedding()

        word_embedding_file = "{}/word_embedding_{}.pkl".format(
            config.cache_dir, config.config_name)
        print("loading word embedding from {}...".format(word_embedding_file))
        word_embedding.load_word_embedding(word_embedding_file)

        # demo dataset
        print("setting demo dataset...")
        self.demo_dataset = Dataset(config.data_config)

        self.demo_dataset.set_word_to_index(word_embedding.word2index)

        label_mapping_file = "{}/label_mapping_{}.pkl".format(
            config.cache_dir, config.config_name)
        print("loading label mapping from {}...".format(label_mapping_file))
        self.demo_dataset.load_label_mapping(label_mapping_file)

        # model
        new_model_config = {
            "vocab_size": word_embedding.vocab_size,
            "word_dim": word_embedding.word_dim,
            "document_length": self.demo_dataset.document_length,
            "sentence_length": self.demo_dataset.sentence_length,
            "num_labels": self.demo_dataset.num_labels
        }
        config.update_model_config(new_model_config)

        model = Model(config.model_config)

        # model factory
        self.network = Factory(model)

        self.network.set_test_module()
        print("number of GPUs: {}".format(self.network.num_gpus))
        print("device: {}".format(self.network.device))

        # load model
        model_file = "{}/model_{}.pkl".format(config.cache_dir,
                                              config.config_name)
        print("loading model from {}...".format(model_file))
        self.network.load_model(model_file)

        self.network.model_to_device()
        self.network.eval_mode()
Example #4
    def test_24_hours_shifts(self):
        # 1 hour to load, 1 hour to produce
        # produce 2 every 2 hours
        # ie 24 - 1 (1 to start the machine) / 2 = 10
        machine, spec, stock = create_machine(stocking_zone_size=None)
        factory = Factory()
        factory.add_worker(Worker(working_hour = 8 * 60))
        factory.add_worker(Worker(working_hour = 8 * 60))
        factory.add_worker(Worker(working_hour = 8 * 60))
        factory.add_production_unit(machine)
        factory.run(24 * 60)

        self.assertEqual(stock.count(), 720 - 1)
Example #5
def get_factory(yaml_conf):
    yaml = load(yaml_conf)
    factory = Factory(name=yaml["name"])
    materials = create_materials(yaml)
    for production_unit in yaml["production_units"]:
        spec = create_spec(materials, production_unit)
        config = {}
        config["rate_by_minute"]= production_unit.get("rate", 1)
        factory.add_production_unit(ProductionUnit(spec=spec, config=config,  name=production_unit["name"]))

    for worker in yaml.get("workers", []):
        working_hour = worker.get("working_hour", 8) * 60
        factory.add_worker(Worker(working_hour=working_hour))
    return factory
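Judging only from the keys get_factory reads, the configuration it expects is shaped roughly like the sketch below. This is an assumption: load is taken to behave like yaml.safe_load, the unit and rate values are made up, and create_materials/create_spec consume additional keys that are not shown here.

example_conf = """
name: demo factory
production_units:
  - name: press        # hypothetical unit name
    rate: 2            # optional; defaults to 1
workers:
  - working_hour: 8    # hours; converted to minutes by get_factory
"""

factory = get_factory(example_conf)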
Example #6
    def run(self):
        parse = Parse.get()
        document = Factory.product(PLUG['edu_school_doc'])
        if '-r' in parse:
            document.refresh()
        else:
            document.test()
Example #7
    def eliminar(self):
        self.model.pedido_id = int(ARG)
        self.model.select()
        cliente = Factory().make('Cliente', self.model.cliente)
        self.model.delete()

        redirect("cliente/ver", cliente.cliente_id)
Example #8
def main():
    db = CsvDatabase('test_data/zero_one.csv')
    Factory.setup_db(db)
    runtimes = []  # store runtimes (in seconds)
    numrules = []  # store number of confident rules
    for support in np.linspace(low_support, high_support, interval):
        start_time = time.monotonic()
        apriori = Apriori(db, support, confidence, None)
        rules = apriori.generate_all_confidence_rules()
        #for rule in rules:
        #  print(rule, ': ', rule.confidence)
        end_time = time.monotonic()
        runtimes.append(end_time - start_time)
        numrule = len(rules)
        numrules.append(numrule)
        print('Support %.2f, runtime %.9f, numrules %d' %
              (support, end_time - start_time, numrule))
    graph_runtime(runtimes)
    graph_numrules(numrules)
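graph_runtime and graph_numrules are not defined in this snippet. A plausible matplotlib-based sketch of the former, purely illustrative and not the original implementation (it reuses the module-level low_support, high_support and interval globals the snippet already assumes):

import matplotlib.pyplot as plt


def graph_runtime(runtimes):
    # one runtime measurement per tested support value
    supports = np.linspace(low_support, high_support, interval)
    plt.plot(supports, runtimes, marker='o')
    plt.xlabel('minimum support')
    plt.ylabel('runtime (seconds)')
    plt.savefig('runtime.png')
    plt.close()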
Example #9
    def ver(self, pedido, denominacion):
        coleccion = pedido.producto_collection
        total = []
        for producto in coleccion:
            producto.subtotal = producto.fm * producto.precio
            total.append(producto.subtotal)

        archivo = '{}/pedido_ver.html'.format(STATIC_PATH)
        contenido = Template(archivo).render_dict(coleccion, tag="filapepro")

        cliente = Factory().make('Cliente', pedido.cliente)
        domicilio = cliente.domicilio
        diccionario = vars(pedido)
        diccionario.update(vars(domicilio))
        diccionario['denominacion'] = denominacion
        diccionario['total'] = sum(total)
        contenido = Template(base=contenido).render(diccionario)

        print(HTTP_HTML, "\n")
        print(Template(TEMPLATE_PATH).render_inner(contenido))
Example #10
def get_factory(yaml_conf):
    yaml = load(yaml_conf)
    factory = Factory(name=yaml["name"])
    materials = create_materials(yaml)
    for production_unit in yaml["production_units"]:
        spec = create_spec(materials, production_unit)
        config = {}
        config["rate_by_minute"] = production_unit.get("rate", 1)
        factory.add_production_unit(
            ProductionUnit(spec=spec,
                           config=config,
                           name=production_unit["name"]))

    for worker in yaml.get("workers", []):
        working_hour = worker.get("working_hour", 8) * 60
        factory.add_worker(Worker(working_hour=working_hour))
    return factory
Example #11
    def test_factory_add_worker(self):
        factory = Factory()
        factory.add_worker(Worker())
        self.assertEqual(len(factory.workers), 1)
Example #12
    def test_factory_is_aware_of_time(self):
        factory = Factory()
        factory.run()
        assert_that(factory.time, is_(1))
Example #13
    def test_factory_add_production_unit(self):
        factory = Factory()
        factory.add_production_unit(ProductionUnit(None))
        self.assertEqual(len(factory.production_units), 1)
Example #14
    def test_factory_add_production_unit(self):
        factory = Factory()
        factory.add_production_unit(ProductionUnit(None))
        self.assertEqual(len(factory.production_units), 1)
Example #15
class Worker():
    def __init__(self,
                 fold,
                 conf,
                 data_conf,
                 cache_manager,
                 args,
                 inference=False,
                 verbose=True):
        self._args = args
        self._fold = fold
        self._conf = conf
        self._data_conf = data_conf
        self._inference = inference
        self._verbose = verbose
        self.tmp_dir = self._data_conf['tmp']

        # we save output with this folder structure:
        # output/
        #       -> tensorboard/ (tensorboard results)
        #       -> results/ (output files: images, illuminant, GT, etc...)
        #       -> checkpoint.pth.tar (checkpoint to continue training in case of failure)
        #       -> model_best.pth.tar (best checkpoint, for inference)
        self._pretrained_model = None
        if not self._inference:
            output_dir = os.path.join(self._args.outputfolder, str(self._fold))
            self._tensorboard_dir = os.path.join(output_dir, 'tensorboard')
            self._results_dir = os.path.join(output_dir, 'results')
            self._best_checkpoint_file = os.path.join(output_dir,
                                                      'model_best.pth.tar')
            self._checkpoint_file = os.path.join(output_dir,
                                                 'checkpoint.pth.tar')
            self._pretrained_model = self._args.pretrainedmodel

            # create all directories
            os.makedirs(self._tensorboard_dir, exist_ok=True)
        else:
            # for inference all results are saved under the output directory
            # (images, illuminant, GT, etc...)
            self._results_dir = self._args.outputfolder
            if isinstance(self._args.checkpointfile, list):
                self._checkpoint_file = self._args.checkpointfile[fold]
            else:
                self._checkpoint_file = self._args.checkpointfile

        self._display = Display(self._conf)
        self._factory = Factory(self._conf, self._data_conf, cache_manager,
                                self._args, verbose)
        self._cache_manager = cache_manager

        # create output directory
        os.makedirs(self._results_dir, exist_ok=True)

        os.environ['TORCH_HOME'] = os.path.join(
            os.path.dirname(os.path.realpath(__file__)), os.pardir,
            'torch_model_zoo')

    # function used to determine the best epoch
    def _compute_best(self, best, train_stats, val_stats):
        metric = train_stats.mean_loss
        if 'choose_best_epoch_by' in self._conf:
            if self._conf['choose_best_epoch_by'] == 'mean_angular_error':
                metric = train_stats.mean_err
            elif self._conf['choose_best_epoch_by'] == 'median_angular_error':
                metric = train_stats.med_err
            elif self._conf['choose_best_epoch_by'] == 'mean_loss':
                metric = train_stats.mean_loss
            elif self._conf[
                    'choose_best_epoch_by'] == 'val_median_angular_error':
                metric = val_stats.med_err
            else:
                raise Exception('Invalid "choose_best_epoch_by" option')

        is_best = metric < best
        best = min(metric, best)

        return is_best, best

    # function to print the epoch info
    def _log_epoch(self, epoch, train_stats, val_stats):
        if self._verbose and epoch % self._conf['print_frequency_epoch'] == 0:
            print(
                'Epoch [{}]: AE (mean={:.4f} med={:.4f}) loss {:.4f} time={:.1f}'
                .format(epoch, train_stats.mean_err, train_stats.med_err,
                        train_stats.mean_loss, train_stats.time),
                end='')
            if val_stats is not None:
                print(
                    ' (val: AE (mean={:.4f} med={:.4f}) loss={:.4f} time={:.4f})\t'
                    .format(val_stats.mean_err, val_stats.med_err,
                            val_stats.mean_loss, val_stats.time),
                    end='')
            print()

        # 1. Log scalar values (scalar summary)
        info = {
            'Epoch Loss': train_stats.mean_loss,
            'Epoch Mean AE': train_stats.mean_err,
            'Epoch Median AE': train_stats.med_err
        }
        if val_stats is not None:
            info.update({
                'Epoch Loss (validation)': val_stats.mean_loss,
                'Epoch Mean AE (validation)': val_stats.mean_err,
                'Epoch Median AE (validation)': val_stats.med_err
            })

        for tag, value in info.items():
            self.logger.scalar_summary(tag, value, epoch)

    def run(self):
        args = self._args
        gpu = args.gpu
        self._conf['use_gpu'] = gpu is not None

        if self._verbose:
            if gpu is not None:
                print("Using GPU: {}".format(gpu))
            else:
                print(
                    "WARNING: You're training on the CPU, this could be slow!")

        # create transforms
        transforms = create_all_transforms(self, self._conf['transforms'])
        # copy FFCC histogram settings to conf (from transform settings)
        self._conf['log_uv_warp_histogram'] = find_loguv_warp_conf(transforms)

        # create model
        self.model = self._factory.get_model()

        # if we're evaluating instead of training:
        # 1. init the model (without training illuminants)
        # 2. load model weights
        if args.evaluate:
            self.model.initialize()
            if self._inference:
                checkpoint = self._checkpoint_file
            else:
                checkpoint = self._best_checkpoint_file

            # optionally resume from a checkpoint
            start_epoch, best, self.model = self._factory.resume_from_checkpoint(
                checkpoint, self.model, None, gpu)
        else:
            checkpoint = self._checkpoint_file

        # create validation/test transforms if defined, otherwise, the same as training
        if self._conf['transforms_valtest'] is not None:
            transforms_valtest = create_all_transforms(
                self, self._conf['transforms_valtest'])
        else:
            transforms_valtest = transforms

        if gpu is not None:
            torch.cuda.set_device(gpu)
            cudnn.benchmark = True

        if args.testfile is not None:
            # test loader
            test_dataset, test_loader, test_loader_cache = self._factory.get_loader(
                args.testfile, transforms_valtest, gpu)
            # if evaluating, copy model to GPU, evaluate and die
            if args.evaluate:
                if gpu is not None:
                    self.model = self.model.cuda(gpu)
                return self.validate(test_loader)  # we finish here!

        # if validation file is defined
        if args.valfile is not None:
            # to save memory, don't do it again if valfile==testfile
            if args.valfile == args.testfile:
                val_dataset = test_dataset
                val_loader = test_loader
                val_loader_cache = test_loader_cache
            else:
                # validation loader
                val_dataset, val_loader, val_loader_cache = self._factory.get_loader(
                    args.valfile, transforms_valtest, gpu)

        # training loader
        train_dataset, train_loader, train_loader_cache = self._factory.get_loader(
            args.trainfiles, transforms, gpu, train=True)

        # init model with the training set illuminants
        self.model.initialize(train_dataset.get_illuminants_by_sensor())

        # optionally pretrain model
        self._factory.pretrain_model(self._pretrained_model, self.model)

        # optionally resume from a checkpoint
        self.optimizer, optimizer_name = self._factory.get_optimizer(
            self.model)
        start_epoch, best, self.model = self._factory.resume_from_checkpoint(
            checkpoint, self.model, self.optimizer, gpu)

        # define loss function
        self.criterion = self._factory.get_criterion()

        # tensorboard logger
        self.logger = TensorBoardLogger(self._tensorboard_dir)

        # learning rate scheduler (if defined)
        scheduler, scheduler_name = self._factory.get_lr_scheduler(
            start_epoch, self.optimizer)

        # copy stuff to GPU
        if gpu is not None:
            self.criterion = self.criterion.cuda(gpu)
            self.model = self.model.cuda(gpu)

        # for FFCC, we reset the optimizer after some epochs
        # because they use two loss functions, ugly trick
        # TODO: fix
        reset_opt = -1
        if 'reset_optimizer_epoch' in self._conf:
            reset_opt = self._conf['reset_optimizer_epoch']

        # load data for the first time
        # we use the cache loaders, they define batch size=1
        # so that we can see the progress with tqdm
        if self._cache_manager.transforms().length > 0 and self._fold == 0:
            if self._verbose:
                print('Caching images...')
            for data in tqdm(train_loader_cache,
                             desc="Training set",
                             disable=not self._verbose):
                pass
            if args.testfile is not None:
                for data in tqdm(test_loader_cache,
                                 desc="Test set",
                                 disable=not self._verbose):
                    pass
            if args.valfile is not None and args.testfile != args.valfile:
                for data in tqdm(val_loader_cache,
                                 desc="Validation set",
                                 disable=not self._verbose):
                    pass

        # if epochs==0, we don't really want to train,
        # we only want to do the candidate selection process for our method
        if self._conf['epochs'] == 0:
            print('WARNING: Training 0 epochs')
            checkpoint = {
                'epoch': 0,
                'arch': self._conf['network']['arch'],
                'subarch': self._conf['network']['subarch'],
                'state_dict': self.model.state_dict(),
                'best': float("inf"),
                'optimizer': self.optimizer.state_dict()
            }
            self._factory.save_checkpoint(self._checkpoint_file,
                                          self._best_checkpoint_file,
                                          checkpoint,
                                          is_best=True)

        # epoch loop
        for epoch in range(start_epoch, self._conf['epochs']):
            # ugly trick for FFCC 2 losses
            if epoch == reset_opt:
                if self._verbose:
                    print('Reset optimizer and lr scheduler')
                best = float("inf")
                self.optimizer, optimizer_name = self._factory.get_optimizer(
                    self.model)
                # TODO: What if lr scheduler changes its internal API?
                if scheduler is not None:
                    scheduler.optimizer = self.optimizer

            # train for one epoch
            train_stats = self.train(train_loader, epoch)

            # validation
            val_stats = None
            if args.valfile is not None:
                _, val_stats = self.validate(val_loader, epoch)

            # compute the best training epoch
            is_best, best = self._compute_best(best, train_stats, val_stats)

            # log epoch details
            self._log_epoch(epoch, train_stats, val_stats)

            # learning rate scheduler
            if scheduler is not None:
                # TODO: hardcoded
                if scheduler_name == 'ReduceLROnPlateau':
                    scheduler.step(train_stats.mean_err)
                else:
                    scheduler.step()

            # save checkpoint!
            checkpoint = {
                'epoch': epoch + 1,
                'arch': self._conf['network']['arch'],
                'subarch': self._conf['network']['subarch'],
                'state_dict': self.model.state_dict(),
                'best': best,
                'optimizer': self.optimizer.state_dict()
            }
            self._factory.save_checkpoint(self._checkpoint_file,
                                          self._best_checkpoint_file,
                                          checkpoint, is_best)

        # get results for the best model
        start_epoch, best, self.model = self._factory.load_model(
            self._best_checkpoint_file, self.model, self.optimizer, gpu)

        # return results from best epoch
        if args.testfile is not None:
            start_time = time.time()
            results = self.validate(test_loader)
            if self._verbose:
                print(
                    'Final inference (including generation of output files) took {:.4f}'
                    .format(time.time() - start_time))
            return results
        else:
            # for some datasets, we have no validation ground truth,
            # so, no evaluation possible
            return [], EpochStats(-1, -1, -1, 0)

    # log iteration
    def _log_iteration(self, epoch, step, len_epoch, loss, err, data, output):
        real_step = epoch * len_epoch + step
        if self._conf['tensorboard_frequency'] != -1 and real_step % self._conf[
                'tensorboard_frequency'] == 0:
            # Log scalar values (scalar summary)
            info = {'Loss': loss, 'Angular Error': err}

            for tag, value in info.items():
                self.logger.scalar_summary(tag, value, real_step)

            # Log values and gradients of the parameters (histogram summary)
            for tag, value in self.model.named_parameters():
                tag = tag.replace('.', '/')
                if value.requires_grad:
                    if value.grad is None:
                        print('WARNING: variable ', tag, '.grad is None!')
                    else:
                        self.logger.histo_summary(tag,
                                                  value.data.cpu().numpy(),
                                                  real_step)
                        self.logger.histo_summary(
                            tag + '/grad',
                            value.grad.data.cpu().numpy(), real_step)

            if 'confidence' in output:
                self.logger.histo_summary(
                    'confidence',
                    output['confidence'].data.cpu().numpy().flatten(),
                    real_step)

        if self._conf[
                'tensorboard_frequency_im'] != -1 and real_step % self._conf[
                    'tensorboard_frequency_im'] == 0:
            # Log training images (image summary)
            info = self._display.get_images(data, output)

            for tag, images in info.items():
                self.logger.image_summary(tag, images, real_step)

    def train(self, train_loader, epoch):
        start_t = time.time()  # log starting time
        self.model.train()  # switch to train mode

        # angular errors and loss lists
        angular_errors = []
        loss_vec = []

        # batch loop
        for step, data in enumerate(train_loader):
            data['epoch'] = epoch  # we know what's the current epoch
            err = err_m = output = loss = None

            def closure():
                nonlocal err, err_m, output, loss

                self.optimizer.zero_grad()
                output = self.model(data)

                loss = self.criterion(output, data, self.model)
                loss.backward()
                err_m = angular_error_degrees(
                    output['illuminant'],
                    Variable(data['illuminant'],
                             requires_grad=False)).detach()
                err = err_m.sum().item() / err_m.shape[0]
                return loss

            self.optimizer.step(closure)
            angular_errors += err_m.cpu().data.tolist()
            loss_value = loss.detach().item()
            loss_vec.append(loss_value)

            self._log_iteration(epoch, step, len(train_loader), loss_value,
                                err, data, output)

        angular_errors = np.array(angular_errors)
        mean_err = angular_errors.mean()
        med_err = np.median(angular_errors)

        mean_loss = np.array(loss_vec).mean()

        t = time.time() - start_t
        return EpochStats(mean_err, med_err, mean_loss, t)

    def validate(self, val_loader, epoch=None):
        with torch.no_grad():  # don't compute gradients
            save_full_res = self._args.save_fullres
            training = epoch is not None
            start_t = time.time()
            # switch to evaluate mode
            self.model.eval()

            res = []
            angular_errors = []
            loss_vec = []

            for i, data in enumerate(val_loader):
                if training:
                    data['epoch'] = epoch
                # compute output
                output = self.model(data)

                # measure accuracy and save loss
                err = None
                if 'illuminant' in data:
                    if training:
                        loss = self.criterion(output, data, self.model)
                        loss_vec.append(loss.detach().item())
                    err = angular_error_degrees(
                        output['illuminant'],
                        Variable(data['illuminant'],
                                 requires_grad=False)).data.cpu().tolist()
                    angular_errors += err

                # When training, we don't want to save validation images
                if not training:
                    res += self._display.save_output(data, output, err,
                                                     val_loader.dataset,
                                                     self._results_dir,
                                                     save_full_res)

            # some datasets have no validation GT
            mean_err = med_err = mean_loss = -1

            if len(angular_errors) > 0:
                angular_errors = np.array(angular_errors)
                mean_err = angular_errors.mean()
                med_err = np.median(angular_errors)

            if len(loss_vec) > 0:
                mean_loss = np.array(loss_vec).mean()

            t = time.time() - start_t
            return res, EpochStats(mean_err, med_err, mean_loss, t)
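For orientation, the configuration entries this class actually reads can be collected into a sketch like the one below. Only the key names come from the code above; every value is a placeholder.

# keys read from self._conf (values are illustrative placeholders)
conf = {
    'epochs': 100,
    'print_frequency_epoch': 1,
    'tensorboard_frequency': 100,      # -1 disables scalar/histogram logging
    'tensorboard_frequency_im': -1,    # -1 disables image logging
    'transforms': [],                  # passed to create_all_transforms
    'transforms_valtest': None,        # None falls back to 'transforms'
    'network': {'arch': 'ffcc', 'subarch': 'default'},  # placeholder names
    # optional keys:
    #   'choose_best_epoch_by': one of 'mean_loss', 'mean_angular_error',
    #       'median_angular_error', 'val_median_angular_error'
    #   'reset_optimizer_epoch': epoch at which the optimizer is reset (FFCC trick)
}

# keys read from self._data_conf
data_conf = {'tmp': '/tmp'}            # placeholder path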
Example #16
def test(config_filename):
    # configuration
    config = Config()
    config_file = "{}/{}".format(config.config_dir, config_filename)
    config.update_config(config_file)

    # logger
    log_file = "{}/test_{}.txt".format(config.log_dir, config.config_name)
    logger = Logger(log_file)

    # word embedding
    logger.info("setting word embedding...")
    word_embedding = Embedding()

    word_embedding_file = "{}/word_embedding_{}.pkl".format(
        config.cache_dir, config.config_name)
    logger.info(
        "loading word embedding from {}...".format(word_embedding_file))
    word_embedding.load_word_embedding(word_embedding_file)

    logger.info("vocab_size: {}".format(word_embedding.vocab_size))
    logger.info("word_dim  : {}".format(word_embedding.word_dim))

    # testing dataset
    logger.info("setting testing dataset...")
    test_dataset = Dataset(config.data_config)

    test_dataset.set_word_to_index(word_embedding.word2index)

    label_mapping_file = "{}/label_mapping_{}.pkl".format(
        config.cache_dir, config.config_name)
    logger.info("loading label mapping from {}...".format(label_mapping_file))
    test_dataset.load_label_mapping(label_mapping_file)

    test_data_file = "{}/{}".format(config.data_dir, config.test_data_file)
    logger.info("loading data from {}...".format(test_data_file))
    test_dataset.load_data_from_file(test_data_file)
    logger.info("number of samples: {}".format(test_dataset.num_samples))

    logger.info("processing data...")
    test_dataset.process_data_from_file()

    # model
    new_model_config = {
        "vocab_size": word_embedding.vocab_size,
        "word_dim": word_embedding.word_dim,
        "document_length": test_dataset.document_length,
        "sentence_length": test_dataset.sentence_length,
        "num_labels": test_dataset.num_labels
    }
    config.update_model_config(new_model_config)

    model = Model(config.model_config)

    # metric
    metric = Metric()

    # test configuration
    logger.info("configuration: {}".format(config))

    # data loader
    test_data_loader = DataLoader(test_dataset,
                                  batch_size=config.batch_size,
                                  shuffle=False)

    # model factory
    network = Factory(model)
    network.set_test_module()
    logger.info("number of GPUs: {}".format(network.num_gpus))
    logger.info("device: {}".format(network.device))

    # load model
    model_file = "{}/model_{}.pkl".format(config.cache_dir, config.config_name)
    logger.info("loading model from {}...".format(model_file))
    network.load_model(model_file)

    network.model_to_device()

    # test
    network.eval_mode()
    test_preds = np.zeros([0, test_dataset.num_labels], dtype=int)
    test_labels = np.zeros([0, test_dataset.num_labels], dtype=int)
    for batch, data in enumerate(test_data_loader):
        sequences_ttl, sequences_cnt, labels = data
        preds = network.test(sequences_ttl, sequences_cnt)
        test_preds = np.concatenate((test_preds, preds), axis=0)
        test_labels = np.concatenate(
            (test_labels, labels.numpy().astype(int)), axis=0)

    # metrics
    ac, mp, mr, mf = metric.all_metrics(test_preds, test_labels)
    logger.info("Acc: {:.4f}".format(ac))
    logger.info("MP : {:.4f}".format(mp))
    logger.info("MR : {:.4f}".format(mr))
    logger.info("MF : {:.4f}".format(mf))
Example #17
    def test_factory_is_aware_of_time(self):
        factory = Factory()
        factory.run()
        assert_that(factory.time, is_(1))
Example #18
def train(config_filename):
    # configuration
    config = Config()
    config_file = "{}/{}".format(config.config_dir, config_filename)
    config.update_config(config_file)

    # logger
    log_file = "{}/train_{}.txt".format(config.log_dir, config.config_name)
    logger = Logger(log_file)

    # word embedding
    logger.info("setting word embedding...")
    word_embedding = Embedding()

    train_data_file = "{}/{}".format(config.data_dir, config.train_data_file)
    word_vector_file = "{}/{}".format(config.src_dir, config.word_vector_file)
    vocab_list_file = "{}/vocab_list_{}.txt".format(config.cache_dir,
                                                    config.config_name)
    word_embedding_file = "{}/word_embedding_{}.pkl".format(
        config.cache_dir, config.config_name)

    if not os.path.exists(word_embedding_file):
        logger.info("building word embedding...")
        word_embedding.build_word_embedding(train_data_file, word_vector_file,
                                            vocab_list_file,
                                            word_embedding_file)

    logger.info(
        "loading word embedding from {}...".format(word_embedding_file))
    word_embedding.load_word_embedding(word_embedding_file)

    logger.info("vocab_size: {}".format(word_embedding.vocab_size))
    logger.info("word_dim  : {}".format(word_embedding.word_dim))

    # training dataset
    logger.info("setting training dataset...")
    train_dataset = Dataset(config.data_config)

    train_dataset.set_word_to_index(word_embedding.word2index)

    train_data_file = "{}/{}".format(config.data_dir, config.train_data_file)
    logger.info("loading data from {}...".format(train_data_file))
    train_dataset.load_data_from_file(train_data_file)
    logger.info("number of samples: {}".format(train_dataset.num_samples))

    label_list_file = "{}/label_list_{}.txt".format(config.cache_dir,
                                                    config.config_name)
    label_mapping_file = "{}/label_mapping_{}.pkl".format(
        config.cache_dir, config.config_name)
    logger.info("building label mapping...")
    train_dataset.build_label_mapping(label_list_file, label_mapping_file)

    logger.info("processing data...")
    train_dataset.process_data_from_file()

    # validation dataset
    logger.info("setting validation dataset...")
    valid_dataset = Dataset(config.data_config)

    valid_dataset.set_word_to_index(word_embedding.word2index)

    label_mapping_file = "{}/label_mapping_{}.pkl".format(
        config.cache_dir, config.config_name)
    logger.info("loading label mapping from {}...".format(label_mapping_file))
    valid_dataset.load_label_mapping(label_mapping_file)

    valid_data_file = "{}/{}".format(config.data_dir, config.valid_data_file)
    logger.info("loading data from {}...".format(valid_data_file))
    valid_dataset.load_data_from_file(valid_data_file)
    logger.info("number of samples: {}".format(valid_dataset.num_samples))

    logger.info("processing data...")
    valid_dataset.process_data_from_file()

    # model
    new_model_config = {
        "vocab_size": word_embedding.vocab_size,
        "word_dim": word_embedding.word_dim,
        "document_length": train_dataset.document_length,
        "sentence_length": train_dataset.sentence_length,
        "num_labels": train_dataset.num_labels
    }
    config.update_model_config(new_model_config)

    model = Model(config.model_config)

    # metric
    metric = Metric()

    # train configuration
    logger.info("configuration: {}".format(config))

    # data loader
    train_data_loader = DataLoader(train_dataset,
                                   batch_size=config.batch_size,
                                   shuffle=True)
    valid_data_loader = DataLoader(valid_dataset,
                                   batch_size=config.batch_size,
                                   shuffle=False)

    # model factory
    network = Factory(model)
    network.set_train_module()
    logger.info("number of GPUs: {}".format(network.num_gpus))
    logger.info("device: {}".format(network.device))

    # set word embedding
    network.set_word_embedding(word_embedding.matrix)

    network.model_to_device()

    # train and validate
    max_mf = 0
    epoch_count = 0
    for epoch in range(config.num_epochs):
        logger.info("----------------------------------------")

        # train
        network.train_mode()
        for batch, data in enumerate(train_data_loader):
            sequences_ttl, sequences_cnt, labels = data
            loss = network.train(sequences_ttl, sequences_cnt, labels)
            if batch > 0 and batch % config.info_interval == 0:
                logger.info("epoch: {} | batch: {} | loss: {:.6f}".format(
                    epoch, batch, loss))

        # validate
        network.eval_mode()
        valid_preds = np.zeros([0, valid_dataset.num_labels], dtype=int)
        valid_labels = np.zeros([0, valid_dataset.num_labels], dtype=int)
        for batch, data in enumerate(valid_data_loader):
            sequences_ttl, sequences_cnt, labels = data
            preds, loss = network.validate(sequences_ttl, sequences_cnt,
                                           labels)
            valid_preds = np.concatenate((valid_preds, preds), axis=0)
            valid_labels = np.concatenate(
                (valid_labels, labels.numpy().astype(int)), axis=0)

        # metrics
        ac, mp, mr, mf = metric.all_metrics(valid_preds, valid_labels)
        logger.info("Acc: {:.4f}".format(ac))
        logger.info("MP : {:.4f}".format(mp))
        logger.info("MR : {:.4f}".format(mr))
        logger.info("MF : {:.4f}".format(mf))

        # early stop
        if mf >= max_mf:
            max_mf = mf
            epoch_count = 0
            model_file = "{}/model_{}.pkl".format(config.cache_dir,
                                                  config.config_name)
            logger.info("saving model to {}...".format(model_file))
            network.save_model(model_file)
        else:
            epoch_count += 1
            if epoch_count == config.early_stop:
                logger.info("stop training process.")
                logger.info("best epoch: {}".format(epoch - epoch_count))
                break
Example #19
class Demo(object):
    def __init__(self, config_filename):
        # configuration
        config = Config()
        config_file = "{}/{}".format(config.config_dir, config_filename)
        config.update_config(config_file)

        # word embedding
        print("setting word embedding...")
        word_embedding = Embedding()

        word_embedding_file = "{}/word_embedding_{}.pkl".format(
            config.cache_dir, config.config_name)
        print("loading word embedding from {}...".format(word_embedding_file))
        word_embedding.load_word_embedding(word_embedding_file)

        # demo dataset
        print("setting demo dataset...")
        self.demo_dataset = Dataset(config.data_config)

        self.demo_dataset.set_word_to_index(word_embedding.word2index)

        label_mapping_file = "{}/label_mapping_{}.pkl".format(
            config.cache_dir, config.config_name)
        print("loading label mapping from {}...".format(label_mapping_file))
        self.demo_dataset.load_label_mapping(label_mapping_file)

        # model
        new_model_config = {
            "vocab_size": word_embedding.vocab_size,
            "word_dim": word_embedding.word_dim,
            "document_length": self.demo_dataset.document_length,
            "sentence_length": self.demo_dataset.sentence_length,
            "num_labels": self.demo_dataset.num_labels
        }
        config.update_model_config(new_model_config)

        model = Model(config.model_config)

        # model factory
        self.network = Factory(model)

        self.network.set_test_module()
        print("number of GPUs: {}".format(self.network.num_gpus))
        print("device: {}".format(self.network.device))

        # load model
        model_file = "{}/model_{}.pkl".format(config.cache_dir,
                                              config.config_name)
        print("loading model from {}...".format(model_file))
        self.network.load_model(model_file)

        self.network.model_to_device()
        self.network.eval_mode()

    def predict(self, data_list):
        """
        data_list  : [{"title": str, "content": str}]
        result_list: [{"strategy_ids": [str]}]
        """
        self.demo_dataset.load_data_from_list(data_list)
        self.demo_dataset.process_data_from_list()

        demo_data_loader = DataLoader(self.demo_dataset,
                                      batch_size=50,
                                      shuffle=False)

        demo_preds = np.zeros([0, self.demo_dataset.num_labels], dtype=int)
        for batch, data in enumerate(demo_data_loader):
            sequences_ttl, sequences_cnt, labels = data
            preds = self.network.test(sequences_ttl, sequences_cnt)
            demo_preds = np.concatenate((demo_preds, preds), axis=0)

        result_list = []
        for index in range(self.demo_dataset.num_samples):
            strategy_ids = []
            for label in range(self.demo_dataset.num_labels):
                if demo_preds[index, label] == 1:
                    strategy_ids.append(
                        self.demo_dataset.label2strategy[label])
            result = {"strategy_ids": strategy_ids}
            result_list.append(result)
        return result_list
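The docstring of predict fixes the input and output shapes; a minimal usage sketch (the configuration file name and the sample text are made up):

demo = Demo("demo_config.json")   # hypothetical configuration file
data_list = [
    {"title": "sample title", "content": "sample content"},
]
for result in demo.predict(data_list):
    print(result["strategy_ids"])  # list of strategy ids for each input document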
Example #20
    def setUp(self):
        db = CsvDatabase(
            os.path.join(os.getcwd(), 'test_data/contact-lenses.csv'))
        Factory.setup_db(db)
Example #21
    def test_factory_add_worker(self):
        factory = Factory()
        factory.add_worker(Worker())
        self.assertEqual(len(factory.workers), 1)