Example #1
def __init__(self, _stock_num):
    self.DataManager = DataManager.DataManager(_stock_num)
    self.name = DataUtils.get_stock_num_to_name(_stock_num)
    self.num = _stock_num
    self.queue_sell = asyncio.Queue()
    self.stock_df = self.DataManager.get_dataframe()
    self.count = 0
    self.amount_foreign = 0
    self.amount_agency = 0
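
The `queue_sell` created above is an `asyncio.Queue`, which implies sell orders are produced by one coroutine and drained by another. A minimal, self-contained sketch of that consumer pattern (the order payload and the `consume_sells` name are illustrative assumptions, not part of the original):

import asyncio

async def consume_sells(queue):
    # Drain sell orders as they arrive; None is used as a shutdown sentinel.
    while True:
        order = await queue.get()
        if order is None:
            break
        print('selling', order)

async def main():
    queue = asyncio.Queue()
    consumer = asyncio.create_task(consume_sells(queue))
    await queue.put({'stock_num': '005930', 'amount': 10})
    await queue.put(None)  # signal shutdown
    await consumer

asyncio.run(main())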
Example #2
import os

import torch
import yaml
from comet_ml import Experiment

# Project-local helpers assumed from the original repo: build_model,
# DataUtils, make_save_dir, tens2np, cal_bleu.

class Solver(object):
    """Training, validation, and test driver built around DataUtils."""
    def __init__(self, args):
        super(Solver, self).__init__()
        self.args = args
        self._save_checkpoints = args.save_checkpoints
        if args.train:
            self.model_dir = make_save_dir(os.path.join(args.model_path, args.exp_name))

        self._disable_comet = args.disable_comet
        self._model_name = args.model_name
        self._cuda = not args.no_cuda
        self.load_path = args.load_path

        with open(args.config, 'r') as stream:
            config = yaml.load(stream, Loader=yaml.SafeLoader)
        self.config = config
        self.data_utils = DataUtils(config, args.train)
        self._prepare_model(config)
        self.batch_size = config['batch_size']
        self.num_epoch = config['num_epoch']
        self._print_every_step = config['print_every_step']
        self._valid_every_step = config['valid_every_step']
   
        # print('[Logging Info] Finish preparing model...')
        if self.args.test:
            self.outfile = open(os.path.join(args.pred_dir, args.model_name, args.prediction), 'w')

    def _prepare_model(self, config):
        print('[Logging] preparing model...')
        self.model = build_model(self._model_name, config['models'], self._cuda, self.data_utils)
        self.lr = config['lr']

    def load_checkpoint(self, path):
        state_dict = torch.load(path)['state_dict']
        self.model.load_state_dict(state_dict)

    def loss_compute(self, out, target, padding_idx):
        # Build a one-hot target distribution over the vocabulary and zero
        # out the padding class so pad positions contribute no loss.
        true_dist = out.data.clone()
        true_dist.fill_(0.)
        true_dist.scatter_(2, target.unsqueeze(2), 1.)
        true_dist[:, :, padding_idx] *= 0

        # Count non-padding tokens per sequence for length normalization.
        total = (target != padding_idx).sum(dim=1)

        # Per-token NLL summed over the sequence, normalized by its length,
        # then averaged over the batch.
        return (-(true_dist * out).sum(dim=2).sum(dim=1) / total).mean()

    def _run_one_step(self, batch, step=None):
        batch['step'] = step
        ## Forwarding ## 
        out = self.model(batch).transpose(0, 1).contiguous()
        target = batch['y'].long()
        if self._cuda:
            target = target.cuda()
        loss = self.loss_compute(out, target, self.data_utils.pad)
        
        return loss

    def _generate_one_step(self, batch):
        outputs = self.model.generate(batch, max_length=self.data_utils._max_len, bos_token=self.data_utils.bos)
        return outputs

    def _first_instance_in_batch(self, batch):
        for k, v in batch.items():
            if isinstance(v, torch.Tensor):
                batch[k] = v[:1]

        return batch

    def train(self):
        if not self._disable_comet:
            # logging
            COMET_PROJECT_NAME = 'weibo-baseline'
            COMET_WORKSPACE = 'timchen0618'

            self.exp = Experiment(project_name=COMET_PROJECT_NAME,
                                  workspace=COMET_WORKSPACE,
                                  auto_output_logging='simple',
                                  auto_metric_logging=None,
                                  display_summary=False,
                                 )
            self.exp.set_name(self.args.exp_name)
            self.exp.log_parameters(self.config)
            self.exp.log_parameters(self.config['models'][self._model_name])

        optim = torch.optim.Adam(self.model.parameters(), lr=self.lr, betas=(0.9, 0.98), eps=1e-9)
        print('[Logging Info] Finish loading data, start training...')

        step = 0
        losses = []
        for epoch in range(self.num_epoch):
            train_loader = self.data_utils.data_yielder()

            for batch in train_loader:
                self.model.train()
                optim.zero_grad()

                loss = self._run_one_step(batch, step)
                loss.backward()
                optim.step()
                losses.append(loss.detach())  # detach so logging doesn't keep the autograd graph alive

                if step % self._print_every_step == 0:
                    print('Logging...')
                    print('Step: %d | Loss: %f'%(step, sum(losses)/len(losses)))
                    print('Src: ', self.data_utils.id2sent(tens2np(batch['src'][0])))
                    print('length: ', batch['lengths'][0])
                    print('Tgt: ', self.data_utils.id2sent(tens2np(batch['y'][0])))
                    # print(tens2np(self._generate_one_step(self._first_instance_in_batch(batch))).shape)
                    print('Pred: ', self.data_utils.id2sent(tens2np(self._generate_one_step(self._first_instance_in_batch(batch)))))
                    if not self._disable_comet:
                        self.exp.log_metric('Train Loss', tens2np(sum(losses)/len(losses)), step=step)
                    losses = []

                if step % self._valid_every_step == self._valid_every_step - 1:
                    self.validate(step)

                step += 1

    @torch.no_grad()
    def validate(self, step):
        print('='*33)
        print('========== Validation ==========')
        print('='*33)
        fw = open(self.args.w_valid_file, 'w')

        self.model.eval()
        valid_loader = self.data_utils.data_yielder(valid=True)

        losses = []

        for batch in valid_loader:
            loss = self._run_one_step(batch, step)
            losses.append(loss)
            outputs = self._generate_one_step(batch)
            outputs = outputs.transpose(0, 1)

            # Writing sentences to hypothesis file
            for l in outputs:
                sentence = self.data_utils.id2sent(l[1:], True)
                fw.write(sentence)
                fw.write("\n")
        fw.close()

        print('Valid Loss: %4.6f'%(sum(losses)/len(losses)))

        # Calculate BLEU score and log to comet if needed
        bleus = cal_bleu(self.args.w_valid_file, self.args.w_valid_tgt_file)

        if not self._disable_comet:
            self.exp.log_metric('BLEU-1', bleus[0], step=step)
            self.exp.log_metric('BLEU-2', bleus[1], step=step)
            self.exp.log_metric('BLEU-3', bleus[2], step=step)
            self.exp.log_metric('BLEU-4', bleus[3], step=step)
            self.exp.log_metric('Valid Loss', sum(losses)/len(losses), step=step)

        if self._save_checkpoints:
            print('Saving checkpoint...')
            model_name = '%dk_%6.6f_model.pth' % (step // 1000, sum(losses) / len(losses))
            state = {'step': step, 'state_dict': self.model.state_dict()}
            torch.save(state, os.path.join(self.model_dir, model_name))

    @torch.no_grad()
    def test(self):
        print('='*30)
        print('========== Testing ==========')
        print('='*30)

        self.load_checkpoint(self.load_path)
        self.model.eval()
        test_loader = self.data_utils.data_yielder()

        for i, batch in enumerate(test_loader):
            outputs = self._generate_one_step(batch)
            outputs = outputs.transpose(0, 1)
            print('outputs', outputs.size())
            if i % 20 == 0:
                print('step %d'%i)
            for l in outputs:
                self.outfile.write(self.data_utils.id2sent(l, test=True))
                self.outfile.write('\n')

        self.outfile.close()
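
The loss_compute above is a length-normalized negative log-likelihood over per-token log-probabilities. A minimal standalone sketch (shapes and the pad index are illustrative) that cross-checks it against a directly computed masked NLL:

import torch

def loss_compute(out, target, padding_idx):
    # Same computation as Solver.loss_compute, shown standalone.
    true_dist = torch.zeros_like(out)
    true_dist.scatter_(2, target.unsqueeze(2), 1.)
    true_dist[:, :, padding_idx] *= 0
    total = (target != padding_idx).sum(dim=1)
    return (-(true_dist * out).sum(dim=2).sum(dim=1) / total).mean()

batch, seq_len, vocab, pad = 2, 5, 7, 0
out = torch.log_softmax(torch.randn(batch, seq_len, vocab), dim=2)
target = torch.randint(1, vocab, (batch, seq_len))
target[:, -1] = pad  # pretend the last position is padding

mask = (target != pad).float()
nll = -out.gather(2, target.unsqueeze(2)).squeeze(2)
expected = ((nll * mask).sum(dim=1) / mask.sum(dim=1)).mean()
print(torch.allclose(loss_compute(out, target, pad), expected))  # True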
Example #3
def get_month_list(_stock_num):
    stock_dataframe = DataUtils.get_stock_df(_stock_num)
    # The frame is ordered newest-first: index[0] is the most recent day,
    # index[-1] the earliest.
    day_end = stock_dataframe.index[0]
    day_start = stock_dataframe.index[-1]
    # 'MS' enumerates month-start dates between the two endpoints.
    return pd.date_range(day_start, day_end,
                         freq='MS').strftime('%Y.%m').tolist()
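
Note that freq='MS' only yields month starts strictly inside the window, so a mid-month day_start skips its own month. A quick illustration with made-up dates:

import pandas as pd

months = pd.date_range('2021-01-15', '2021-04-10', freq='MS').strftime('%Y.%m').tolist()
print(months)  # ['2021.02', '2021.03', '2021.04']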
Example #4
def update_raw_data():
    stock_list = DataUtils.get_stock_num_list()
    print(stock_list)
    # Use the pool as a context manager so worker processes are cleaned up.
    with mp.Pool(2) as pool:
        pool.map(parse_dataframe, stock_list)
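
Since multiprocessing spawns worker processes that re-import the module on some platforms, the call above belongs under a main guard (a generic usage note, not from the original):

if __name__ == '__main__':
    update_raw_data()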
Example #5
def get_chart_path(self):
    return DataUtils.get_chart_data_path()
Example #6
def get_raw_path(self):
    return DataUtils.get_raw_data_path()
Example #7
def __init__(self, stock_num):
    self.stock_num = stock_num
    self.raw_data_folder = DataUtils.get_raw_data_path()

Example #8
import datetime
import time

import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch.utils.tensorboard import SummaryWriter
from tqdm import tqdm

# Project-local helpers assumed from the original repo: DataUtils, MS_Pointer,
# color, logger, set_cuda_device, hardware_info_printer, epoch_info_printer.

def train(args):
    logger.info(color("Initializing Data Loader ... \n", 1))
    data_loader = DataUtils(args)
    device = set_cuda_device(args.cuda_device)

    logger.info(color("Processing Data ... \n", 1))
    train_data, valid_data, test_data = data_loader.obtain_formatted_data()
    train_data_num, valid_data_num = len(train_data), len(valid_data)

    model = MS_Pointer(args, word2index=data_loader.word2index, char2index=data_loader.char2index,
                       device=device).to(device)
    train_state = {"word2index": data_loader.word2index, "char2index": data_loader.char2index}

    optimizer = optim.Adam(model.parameters(), args.lr)
    train_state["optimizer"] = optimizer.state_dict()

    lr_scheduler = None
    if args.flag_lr_schedule:
        lr_scheduler = ReduceLROnPlateau(optimizer, 'min', factor=0.9, patience=args.lr_patience, verbose=True)
        train_state["lr_scheduler"] = lr_scheduler.state_dict()

    writer = SummaryWriter(log_dir=args.log_dir, comment="Logs For MS-Pointer Network")

    logger.info(color("Start Training \n", 1))
    train_start_time = time.time()
    
    for epoch in range(args.max_epoch):
        hardware_info_printer()

        epoch_start_time = time.time()
        train_data_batches = data_loader.get_batch_data(train_data, with_target=True, batch_size=args.batch_size,
                                                        shuffle=True, device=device)
        valid_data_batches = data_loader.get_batch_data(valid_data, with_target=True, batch_size=args.batch_size,
                                                        device=device)

        total_train_loss = 0.0
        tqdm_generator = tqdm(train_data_batches, ncols=100)
        for idx, batch in enumerate(tqdm_generator):
            batch_start_time = time.time()
            # model.train() enables training-mode behavior such as Dropout and
            # the running-statistics updates in normalization layers.
            model.train()
            optimizer.zero_grad()

            train_loss = model.get_batch_loss(batch)
            loss = train_loss["mean_loss"]
            
            if torch.isnan(loss):
                raise ValueError("\n\n\033[31m%s\033[0m\n\n" % "【CAUTION】NAN LOSS ENCOUNTERED!")

            loss.backward()
            if args.flag_clip_grad and args.grad_norm > 0:
                nn.utils.clip_grad_norm_(model.parameters(), max_norm=args.grad_norm, norm_type=2)
            optimizer.step()
            
            total_train_loss += train_loss["batch_loss"].detach().cpu().item()
            
            batch_elapsed_time = round(time.time() - batch_start_time, 2)
            
            info = color("[Train] ", 1) + "Epoch:" + color(epoch, 2) + " Batch:" + color(idx, 2) + " Loss:" + \
                   color(round(loss.detach().cpu().item(), 5), 1) + " Time:" + color(batch_elapsed_time, 2)
            tqdm_generator.set_description(desc=info, refresh=True)
            
        valid_start_time = time.time()
        total_valid_loss, mean_bleu_score, valid_tokens, valid_probs = model.validation(valid_data_batches,
                                                                                        need_pred_result=True)
        mean_valid_loss = total_valid_loss / valid_data_num if valid_data_num else 0.0
        mean_train_loss = total_train_loss / train_data_num

        writer.add_scalar(tag="scalar/train_loss", scalar_value=mean_train_loss, global_step=epoch)
        writer.add_scalar(tag="scalar/valid_loss", scalar_value=mean_valid_loss, global_step=epoch)
        writer.add_scalar(tag="scalar/valid_bleu", scalar_value=mean_blue_score, global_step=epoch)

        current_lr = optimizer.param_groups[0]['lr']
        if args.flag_lr_schedule:
            lr_scheduler.step(mean_valid_loss)

        date = datetime.datetime.now().strftime('%Y-%m-%d')
        torch.save(model.state_dict(), f"../model/model_state_{date}_{epoch}.pt")
        
        valid_elapsed_time = round(time.time() - valid_start_time, 2)
        epoch_elapsed_time = round(time.time() - epoch_start_time, 2)
        total_elapsed_time = round(time.time() - train_start_time, 2)

        epoch_info_printer(epoch=epoch, mean_loss=mean_train_loss, epoch_time=epoch_elapsed_time,
                           total_time=total_elapsed_time, lr=current_lr, train_samples=train_data_num,
                           valid_samples=valid_data_num, valid_loss=mean_valid_loss, mean_blue=mean_bleu_score,
                           valid_time=valid_elapsed_time)

    logger.info(color("Training Task Completed! \n\n", 1))
Example #9
    print()
    print(f"INPUT_DIR:\t{INPUT_DIR}")
    print(f"OUT_DIR:\t{OUT_DIR}")
    print(f"N_EPOCHS:\t{N_EPOCHS}")
    print(f"BATCH_SIZE:\t{BATCH_SIZE}")
    print()

    print(f"Will save model snapshots to {OUT_DIR}/\n")
    os.makedirs(OUT_DIR)

    # FIXME: make CLI param
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # create preprocessor for input
    TEXT = DataUtils.input_field(tokenizer=DataUtils.char_tokenizer,
                                 preprocessor=DataUtils.bigrams)
    # create preprocessor for labels
    LABEL = DataUtils.label_field()

    # load data
    train_ds, dev_ds, test_ds = DataUtils.load_datasets(data_dir=INPUT_DIR,
                                                        input_field=TEXT,
                                                        label_field=LABEL)

    # tally vocabularies for input and labels
    TEXT.build_vocab(train_ds, min_freq=1)
    LABEL.build_vocab(train_ds)
    #LABEL.vocab.freqs.most_common()
    #LABEL.vocab.stoi

    model = DANish(