def __init__(self, df, column_type, embedding_dim=5, n_layers=5,
             dim_feedforward=100, n_head=5, dropout=0.15,
             ns_exponent=0.75, share_category=False, use_pos=False,
             device='cpu'):
    self.logger = create_logger(name="BERTable")

    self.col_type = {'numerical': [], 'categorical': [], 'vector': []}
    for i, data_type in enumerate(column_type):
        self.col_type[data_type].append(i)

    self.embedding_dim = embedding_dim
    self.use_pos = use_pos
    self.device = device

    self.vocab = Vocab(df, self.col_type, share_category, ns_exponent)
    vocab_size = {
        'numerical': len(self.vocab.item2idx['numerical']),
        'categorical': len(self.vocab.item2idx['categorical'])
    }
    vector_dims = [np.shape(df[col])[1] for col in self.col_type['vector']]
    tab_len = len(column_type)

    self.model = Model(vocab_size, self.col_type, use_pos, vector_dims,
                       embedding_dim, dim_feedforward, tab_len, n_layers,
                       n_head, dropout)
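# A minimal usage sketch of the constructor above. The column-major layout
# of `df` (df[i] holding the values of column i) and the toy values are
# assumptions for illustration; the training/encoding API lives elsewhere
# in the class.
#
#   df = [
#       [0.5, 1.2, 3.3],          # a numerical column
#       ['a', 'b', 'a'],          # a categorical column
#   ]
#   bertable = BERTable(df, ['numerical', 'categorical'],
#                       embedding_dim=8, device='cpu')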
def main(config_path): warnings.filterwarnings("ignore", message="numpy.dtype size changed") warnings.filterwarnings("ignore", message="numpy.ufunc size changed") warnings.filterwarnings( module="nltk", category=UserWarning, message="\nThe hypothesis contains 0 counts of \d-gram overlaps\.", action="ignore", ) config = Box.from_yaml(config_path.open()) torch.cuda.set_device(config.train.device) logger = create_logger(name="MAIN") logger.info(f"[-] Config loaded from {config_path}") logger.info(f"[-] Experiment: {config.train.exp}") exp_path = Path( config.data.data_dir) / "exp" / config.model / config.train.exp if not exp_path.is_dir(): exp_path.mkdir(parents=True) subprocess.call(["cp", config_path, exp_path / "config.yaml"]) random.seed(config.random_seed) np.random.seed(config.random_seed) torch.manual_seed(config.random_seed) torch.cuda.manual_seed(config.random_seed) logger.info("[-] Random seed set to {}".format(config.random_seed)) logger.info(f"[*] Initialize {config.model} trainer...") T = __import__(config.model, fromlist=["trainer"]) trainer = T.trainer.Trainer(config, config.train.device) logger.info("[-] Trainer initialization completed") logger.info("[*] Start training...") trainer.train()
def main(config_path):
    warnings.filterwarnings("ignore", message="numpy.dtype size changed")
    warnings.filterwarnings("ignore", message="numpy.ufunc size changed")
    # Raw string: the message filter is a regex containing \n and \d.
    warnings.filterwarnings(
        module="nltk",
        category=UserWarning,
        message=r"\nThe hypothesis contains 0 counts of \d-gram overlaps\.",
        action="ignore",
    )

    config = Box.from_yaml(config_path.open())
    torch.cuda.set_device(config.train.device)
    logger = create_logger(name="MAIN")
    logger.info(f"[-] Config loaded from {config_path}")
    logger.info(f"[-] Experiment: {config.test.exp}")

    exp_path = (Path(config.data.data_dir) / "exp" / config.model /
                config.test.exp)

    # Fix all random seeds for reproducibility.
    random.seed(config.random_seed)
    np.random.seed(config.random_seed)
    torch.manual_seed(config.random_seed)
    torch.cuda.manual_seed(config.random_seed)
    logger.info(f"[-] Random seed set to {config.random_seed}")

    logger.info(f"[*] Initialize {config.model} tester...")
    T = __import__(config.model, fromlist=["tester"])
    tester = T.tester.Tester(config, config.train.device)
    logger.info("[-] Tester initialization completed")

    logger.info("[*] Start testing...")
    tester.test()
def __init__(self, config, vocab, device):
    self._logger = create_logger(name="MODEL")
    self._device = device
    self._logger.info("[*] Creating model.")
    self._stats = None
    self._net = Net(config, vocab)
    self._net.to(device=self._device)
    self._optim = getattr(torch.optim, config.optim)(
        filter(lambda p: p.requires_grad, self._net.parameters()),
        **config.optim_param)
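# Example of what the dynamic optimizer lookup above expects from the
# config (the values are illustrative, not the project's defaults): with
#
#   optim: Adam
#   optim_param: {lr: 0.001}
#
# the call resolves to torch.optim.Adam(trainable_params, lr=0.001),
# restricted to parameters with requires_grad=True.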
def __init__(self, config, vocab):
    super(Embedding, self).__init__()
    logger = create_logger(name="EMBED")
    UNK = vocab.convert_tokens_to_indices(["<UNK>"])[0]
    PAD = vocab.convert_tokens_to_indices(["<PAD>"])[0]
    if hasattr(config.emb, "embed_path"):
        weight = pickle.load(open(config.emb.embed_path, "rb"))
        self.model = nn.Embedding.from_pretrained(
            weight, freeze=config.emb.freeze, padding_idx=PAD)
    else:
        self.model = nn.Embedding(len(vocab), config.emb.dim,
                                  padding_idx=PAD)
        logger.info("[-] Train from scratch.")
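# A self-contained sketch of the pretrained branch above, using a toy
# 4-token vocabulary with dimension 3 (values are illustrative):
import torch
import torch.nn as nn

toy_weight = torch.randn(4, 3)                 # (len(vocab), emb_dim)
toy_emb = nn.Embedding.from_pretrained(toy_weight, freeze=True,
                                       padding_idx=0)
print(toy_emb(torch.tensor([1, 2])).shape)     # torch.Size([2, 3])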
def __init__(self, config, device):
    # Expose every test setting (e.g. exp, model_path) as an attribute.
    for k, v in config.test.items():
        setattr(self, k, v)
    self.dc_gate = config.model_param.dc_gate
    self.multi_value = config.train.multi_value
    self.sch_embed = (config.model_param.sch.type == "embed")

    nlp = spacy.load('en')
    self.tokenizer = \
        spacy.lang.en.English().Defaults().create_tokenizer(nlp)

    self.logger = create_logger(name="TEST")
    self.origin_dir = Path(config.data.data_dir)
    self.data_dir = Path(config.data.save_dir)
    self.exp_dir = self.origin_dir / "exp" / config.model / self.exp
    self.pred_dir = self.origin_dir / "prediction"
    if not self.pred_dir.exists():
        self.pred_dir.mkdir()

    self.config = config
    self.device = create_device(device)
    self.vocab = pickle.load(open(self.data_dir / "vocab.pkl", 'rb'))
    self.model = Model(config=config.model_param, vocab=self.vocab,
                       device=self.device)

    self.logger.info("[-] Reading word vector......")
    self.emb = {}
    with open(config.data.embed_path, 'r') as file:
        for line in tqdm(file,
                         total=get_num_lines(config.data.embed_path),
                         leave=False):
            data = line.strip().split(' ')
            token, emb = data[0], list(map(float, data[1:]))
            self.emb[token] = emb

    # Prefer an explicit checkpoint path; otherwise pick the best one.
    if hasattr(self, "model_path"):
        self.model.load_state(self.model_path,
                              save_device=config.train.device,
                              load_device=config.test.device)
    else:
        self.model.load_best_state(self.exp_dir / "ckpt",
                                   save_device=config.train.device,
                                   load_device=config.test.device)

    self.trim_front = [',', '.', '?', '!', ':', "'"]
    self.trim_back = ['#']
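# A sketch of what trim_front/trim_back are presumably for: dropping the
# space before tokens like "," and after tokens like "#" when joining
# predicted tokens back into text. The helper below is hypothetical; the
# real logic lives in the test routine.
def detok_sketch(tokens, trim_front, trim_back):
    out = ""
    for tok in tokens:
        if out and tok not in trim_front and out[-1] not in trim_back:
            out += " "
        out += tok
    return out

assert detok_sketch(["hello", ",", "world"], [","], ["#"]) == "hello, world"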
def __init__(self, config, device):
    for k, v in config.train.items():
        setattr(self, k, v)
    self.dc_gate = config.model_param.dc_gate
    self.sch_embed = (config.model_param.sch.type == "embed")

    self.logger = create_logger(name="TRAIN")
    self.origin_dir = Path(config.data.data_dir)
    self.data_dir = Path(config.data.save_dir)
    self.exp_dir = self.origin_dir / "exp" / config.model / self.exp

    self.config = config
    self.device = create_device(device)
    self.vocab = pickle.load(open(self.data_dir / "vocab.pkl", "rb"))
    self.model = Model(config=config.model_param, vocab=self.vocab,
                       device=self.device)
    self.__cur_epoch = 0
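# `create_device` comes from the project's utils and is not shown here; a
# plausible sketch, assuming `device` is a CUDA ordinal with a negative
# value meaning CPU:
import torch

def create_device_sketch(device):
    if torch.cuda.is_available() and device >= 0:
        return torch.device(f"cuda:{device}")
    return torch.device("cpu")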
def main(config_path):
    config = Box.from_yaml(config_path.open())
    logger = create_logger(name="MAIN")
    logger.info(f'[-] Config loaded from {config_path}')

    data_dir = Path(config.data.data_dir)
    save_dir = Path(config.data.save_dir)
    fig_dir = save_dir / "fig"
    if not fig_dir.exists():
        fig_dir.mkdir(parents=True)

    train_vocab_file = save_dir / "train_schema_vocab.pkl"
    valid_vocab_file = save_dir / "valid_schema_vocab.pkl"
    train_embed_file = save_dir / "train_schema_embed.pkl"
    valid_embed_file = save_dir / "valid_schema_embed.pkl"
    train_vocab = pickle.load(open(train_vocab_file, 'rb'))
    valid_vocab = pickle.load(open(valid_vocab_file, 'rb'))
    train_embed = pickle.load(open(train_embed_file, 'rb'))
    valid_embed = pickle.load(open(valid_embed_file, 'rb'))

    plt.rcParams.update({'font.size': 8})
    _, idx2service = train_vocab[0]
    matrix = get_matrix(train_embed[0])
    plot(matrix, idx2service, fig_dir / "train_service.pdf")
    _, idx2service = valid_vocab[0]
    matrix = get_matrix(valid_embed[0])
    plot(matrix, idx2service, fig_dir / "valid_service.pdf")

    plt.rcParams.update({'font.size': 6})
    _, idx2intent = train_vocab[1]
    matrix = get_matrix(train_embed[1])
    idx2intent = [intent for service, intent in idx2intent]
    plot(matrix, idx2intent, fig_dir / "train_intent.pdf")
    _, idx2intent = valid_vocab[1]
    matrix = get_matrix(valid_embed[1])
    idx2intent = [intent for service, intent in idx2intent]
    plot(matrix, idx2intent, fig_dir / "valid_intent.pdf")
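# `get_matrix` is defined elsewhere in this script; a plausible sketch
# that turns an (n_sentences, dim) embedding tensor into a pairwise
# cosine-similarity matrix suitable for plotting:
import torch.nn.functional as F

def get_matrix_sketch(embed):
    normed = F.normalize(embed, dim=-1)         # unit-length rows
    return (normed @ normed.t()).cpu().numpy()  # (n, n) similarities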
def main(config_path): logger = create_logger(name="DATA") config = Box.from_yaml(config_path.open()) data_dir = Path(config.data.data_dir) save_dir = Path(config.data.save_dir) train_dir = data_dir / "train" sd_train_files = list(train_dir.glob("dialogues_*.json")) md_train_files = [] valid_dir = data_dir / "dev" sd_valid_files = list(valid_dir.glob("dialogues_*.json")) md_valid_files = [] is_test = (data_dir / "test").exists() # Wait for test data release if is_test: test_dir = data_dir / "test" sd_test_files = list(test_dir.glob("dialogues_*.json")) md_test_files = [] train_files, valid_files, test_files = [], [], [] use_sd = config.data.train.single_domain use_md = config.data.train.multi_domain assert use_sd or use_md, "Please use at least one part of dataset" if use_sd: train_files += sd_train_files if use_md: train_files += md_train_files use_sd = config.data.valid.single_domain use_md = config.data.valid.multi_domain assert use_sd or use_md, "Please use at least one part of dataset" if use_sd: valid_files += sd_valid_files if use_md: valid_files += md_valid_files if is_test: use_sd = config.data.test.single_domain use_md = config.data.test.multi_domain assert use_sd or use_md, "Please use at least one part of dataset" if use_sd: test_files += sd_test_files if use_md: test_files += md_test_files train_schema_file = data_dir / "train" / "schema.json" valid_schema_file = data_dir / "dev" / "schema.json" train_schemas = json.load(open(train_schema_file)) valid_schemas = json.load(open(valid_schema_file)) train_schema_vocab_file = save_dir / "train_schema_vocab.pkl" valid_schema_vocab_file = save_dir / "valid_schema_vocab.pkl" train_schema_vocab = pickle.load(open(train_schema_vocab_file, "rb")) valid_schema_vocab = pickle.load(open(valid_schema_vocab_file, "rb")) train_dialogues = [] for f in train_files: train_dialogues.extend(json.load(open(f))) valid_dialogues = [] for f in valid_files: valid_dialogues.extend(json.load(open(f))) if is_test: test_schema_file = data_dir / "test" / "schema.json" test_schemas = json.load(open(test_schema_file)) test_schema_vocab_file = save_dir / "test_schema_vocab.pkl" test_schema_vocab = pickle.load(open(test_schema_vocab_file, "rb")) test_dialogues = [] for f in test_files: test_dialogues.extend(json.load(open(f))) # Build vocab counter = count_words(train_dialogues, train_schemas) vocab_size = config.data.vocab_size vocab = Vocab(counter, vocab_size) vocab_path = save_dir / "vocab.pkl" logger.info(f"[-] Vocab size: {vocab_size}") logger.info(f"[-] Full vocab size: {len(vocab._idx2token)}") logger.info(f"[*] Dump vocab to {vocab_path}") pickle.dump(vocab, open(vocab_path, "wb")) # Generate embeddings UNK = vocab.convert_tokens_to_indices(["<UNK>"])[0] PAD = vocab.convert_tokens_to_indices(["<PAD>"])[0] with open(config.data.embed_path, "r") as file: line = next(file) emb_dim = len(line.strip().split()) - 1 cover = 0 weight = torch.zeros(len(vocab), emb_dim) with open(config.data.embed_path, "r") as file: for line in tqdm(file, total=get_num_lines(config.data.embed_path), leave=False): data = line.strip().split(" ") token, emb = data[0], list(map(float, data[1:])) idx = vocab.convert_tokens_to_indices([token])[0] if len(emb) == emb_dim and idx != UNK: cover += 1 weight[idx] = torch.FloatTensor(emb) weight[UNK] = 0.0 weight[PAD] = 0.0 logger.info((f"[-] Coverage: {cover}/{len(vocab)} " f"({cover / len(vocab) * 100:.2f}%).")) pickle.dump(weight, open(config.model_param.emb.embed_path, "wb")) # Build dataset dataset = build_dataset(train_dialogues, 
vocab, train_schema_vocab, train_schemas) logger.info(f"[-] {len(dataset)} Examples for training") pickle.dump(dataset, open(save_dir / "train.pkl", "wb")) dataset = build_dataset(valid_dialogues, vocab, valid_schema_vocab, valid_schemas) logger.info(f"[-] {len(dataset)} Examples for validating") pickle.dump(dataset, open(save_dir / "valid.pkl", "wb")) if is_test: dataset = build_dataset(test_dialogues, vocab, test_schema_vocab, test_schemas) logger.info(f"[-] {len(dataset)} Examples for testing") pickle.dump(dataset, open(save_dir / "test.pkl", "wb")) # Convert schema desc schema_desc = pickle.load(open(save_dir / "train_schema_desc.pkl", "rb")) schema_desc = [[vocab.convert_tokens_to_indices(sent) for sent in desc] for desc in schema_desc] pickle.dump(schema_desc, open(save_dir / "train_schema_desc.pkl", "wb")) schema_desc = pickle.load(open(save_dir / "valid_schema_desc.pkl", "rb")) schema_desc = [[vocab.convert_tokens_to_indices(sent) for sent in desc] for desc in schema_desc] pickle.dump(schema_desc, open(save_dir / "valid_schema_desc.pkl", "wb")) if is_test: schema_desc = pickle.load(open(save_dir / "test_schema_desc.pkl", "rb")) schema_desc = [[ vocab.convert_tokens_to_indices(sent) for sent in desc ] for desc in schema_desc] pickle.dump(schema_desc, open(save_dir / "test_schema_desc.pkl", "wb"))
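# The vector file parsed above is a GloVe-style plain-text file, one
# "token v1 v2 ... vd" entry per line; a toy line for illustration:
toy_line = "hello 0.1 -0.2 0.3"
toy_data = toy_line.strip().split(" ")
toy_token, toy_emb = toy_data[0], list(map(float, toy_data[1:]))
assert toy_token == "hello" and len(toy_emb) == 3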
def main(config_path):
    config = Box.from_yaml(config_path.open())
    torch.cuda.set_device(config.train.device)
    logger = create_logger(name="MAIN")
    logger.info(f"[-] Config loaded from {config_path}")

    data_dir = Path(config.data.data_dir)
    save_dir = Path(config.data.save_dir)
    if not save_dir.exists():
        save_dir.mkdir()
    transfo_dir = Path(config.data.transfo_dir)
    device = create_device(config.train.device)

    tokenizer = BertTokenizer.from_pretrained(
        str(transfo_dir), do_lower_case=(not config.data.cased))
    global CLS
    global SEP
    global PAD
    CLS, SEP, PAD = tokenizer.convert_tokens_to_ids(
        ["[CLS]", "[SEP]", "[PAD]"])

    bert_config = BertConfig.from_pretrained(str(transfo_dir))
    # To extract representations from other layers
    bert_config.output_hidden_states = True
    model = BertModel(bert_config)
    model.to(device)
    model.eval()

    train_file = data_dir / "schema_dstc8+m2.2.json"
    train_vocab_file = save_dir / "train_schema_vocab.pkl"
    train_embed_file = save_dir / "train_schema_embed.pkl"
    train_desc_file = save_dir / "train_schema_desc.pkl"
    valid_file = data_dir / "dev" / "schema.json"
    valid_vocab_file = save_dir / "valid_schema_vocab.pkl"
    valid_embed_file = save_dir / "valid_schema_embed.pkl"
    valid_desc_file = save_dir / "valid_schema_desc.pkl"
    if (data_dir / "test").exists():
        test_file = data_dir / "test" / "schema.json"
        test_vocab_file = save_dir / "test_schema_vocab.pkl"
        test_embed_file = save_dir / "test_schema_embed.pkl"
        test_desc_file = save_dir / "test_schema_desc.pkl"
    else:
        test_file = None
        test_vocab_file = None
        test_embed_file = None
        test_desc_file = None

    train_schema_vocab, train_desc = extract(train_file,
                                             config.data.concat_name)
    valid_schema_vocab, valid_desc = extract(valid_file,
                                             config.data.concat_name)
    if test_file is not None:
        test_schema_vocab, test_desc = extract(test_file,
                                               config.data.concat_name)
    else:
        test_schema_vocab = test_desc = None

    pickle.dump(train_schema_vocab, open(train_vocab_file, "wb"))
    pickle.dump(valid_schema_vocab, open(valid_vocab_file, "wb"))
    if test_schema_vocab is not None:
        pickle.dump(test_schema_vocab, open(test_vocab_file, "wb"))

    layer = config.data.schema.layer
    pooling = config.data.schema.pooling

    # Embed every schema description sentence with BERT, then keep a
    # lowercased, tokenized copy of the descriptions.
    train_embed = []
    for desc in tqdm(train_desc, leave=False):
        embed = []
        for sent in tqdm(desc, leave=False):
            embed.append(
                get_rep(sent, model, tokenizer, layer, pooling, device))
        embed = torch.stack(embed)
        train_embed.append(embed)
    train_desc = [[[word.text.lower() for word in spacy_tokenizer(sent)]
                   for sent in desc] for desc in train_desc]
    pickle.dump(train_embed, open(train_embed_file, "wb"))
    pickle.dump(train_desc, open(train_desc_file, "wb"))

    valid_embed = []
    for desc in tqdm(valid_desc, leave=False):
        embed = []
        for sent in tqdm(desc, leave=False):
            embed.append(
                get_rep(sent, model, tokenizer, layer, pooling, device))
        embed = torch.stack(embed)
        valid_embed.append(embed)
    valid_desc = [[[word.text.lower() for word in spacy_tokenizer(sent)]
                   for sent in desc] for desc in valid_desc]
    pickle.dump(valid_embed, open(valid_embed_file, "wb"))
    pickle.dump(valid_desc, open(valid_desc_file, "wb"))

    if test_desc is None:
        exit()
    test_embed = []
    for desc in tqdm(test_desc, leave=False):
        embed = []
        for sent in tqdm(desc, leave=False):
            embed.append(
                get_rep(sent, model, tokenizer, layer, pooling, device))
        embed = torch.stack(embed)
        test_embed.append(embed)
    test_desc = [[[word.text.lower() for word in spacy_tokenizer(sent)]
                  for sent in desc] for desc in test_desc]
    pickle.dump(test_embed, open(test_embed_file, "wb"))
    pickle.dump(test_desc, open(test_desc_file, "wb"))
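# `get_rep` is defined elsewhere; a plausible sketch under the config
# above, where `layer` indexes the tuple exposed by output_hidden_states
# and `pooling` reduces over tokens (e.g. "mean" vs. the [CLS] vector):
import torch

def get_rep_sketch(sent, model, tokenizer, layer, pooling, device):
    ids = torch.tensor([tokenizer.encode(sent, add_special_tokens=True)],
                       device=device)
    with torch.no_grad():
        outputs = model(ids)
    hidden = outputs[2][layer][0]      # (seq_len, hidden_size)
    if pooling == "mean":
        return hidden.mean(dim=0)
    return hidden[0]                   # fall back to the [CLS] position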
from datetime import date, timedelta

import dash_core_components as dcc
import dash_html_components as html
from dash_table import DataTable
from pandas import DataFrame

# from apps.publisher.service import default_csc
from apps.service import get_table_styles
from modules.environment import PD_LOGGING_LEVEL
from modules.logger import create_logger
from modules.utils import colors

# create logger object
logger = create_logger(__name__, level=PD_LOGGING_LEVEL)

SATELLITES = [{
    'satellite': 'AMAZONIA1',
    'sensors': ['WFI']
}, {
    'satellite': 'CBERS4A',
    'sensors': ['MUX', 'WFI', 'WPM']
}, {
    'satellite': 'CBERS4',
    'sensors': ['MUX', 'AWFI', 'PAN5M', 'PAN10M']
}, {
    'satellite': 'CBERS2B',
    'sensors': ['CCD', 'WFI', 'HRC']
}, {
    'satellite': 'LANDSAT1',
    'sensors': ['MSS']
}]
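# A sketch of how a layout might consume SATELLITES, e.g. to feed a
# dcc.Dropdown (the component id and default value are hypothetical):
satellite_options = [
    {'label': s['satellite'], 'value': s['satellite']} for s in SATELLITES
]
satellite_dropdown = dcc.Dropdown(
    id='satellite-dropdown',
    options=satellite_options,
    value=SATELLITES[0]['satellite'],
)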
def __init__(self, config, device, model_path=None, use_sgd=False,
             epoch=None):
    # Expose every test setting (e.g. exp) as an attribute.
    for k, v in config.test.items():
        setattr(self, k, v)
    self.dc_gate = config.model_param.dc_gate
    self.multi_value = config.train.multi_value
    self.sch_embed = (config.model_param.sch.type == "embed")

    nlp = spacy.load("en")
    self.tokenizer = spacy.lang.en.English().Defaults().create_tokenizer(
        nlp)

    self.logger = create_logger(name="TEST")
    self.origin_dir = Path(config.data.data_dir)
    self.data_dir = Path(config.data.save_dir)
    self.exp_dir = self.origin_dir / "exp" / config.model / self.exp
    if model_path:
        self.model_path = model_path
    self.pred_dir = (self.origin_dir / "prediction" / epoch
                     if epoch else self.origin_dir / "prediction")
    if not self.pred_dir.exists():
        self.pred_dir.mkdir()

    self.config = config
    self.device = create_device(device)
    vocab_dir = self.data_dir if not use_sgd else Path("../save/")
    self.vocab = pickle.load(open(vocab_dir / "vocab.pkl", "rb"))
    self.model = Model(config=config.model_param, vocab=self.vocab,
                       device=self.device)

    self.logger.info("[-] Reading word vector......")
    self.emb = {}
    with open(config.data.embed_path, "r") as file:
        for line in tqdm(file,
                         total=get_num_lines(config.data.embed_path),
                         leave=False):
            data = line.strip().split(" ")
            token, emb = data[0], list(map(float, data[1:]))
            self.emb[token] = emb

    # Prefer an explicit checkpoint path; otherwise pick the best one.
    if hasattr(self, "model_path"):
        self.model.load_state(
            self.model_path,
            save_device=config.train.device,
            load_device=config.test.device,
        )
    else:
        self.model.load_best_state(
            self.exp_dir / "ckpt",
            save_device=config.train.device,
            load_device=config.test.device,
        )

    self.trim_front = [",", ".", "?", "!", ":", "'"]
    self.trim_back = ["#"]
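# Usage sketch for the constructor above (the paths and epoch tag are
# hypothetical; `Tester` names the class this __init__ belongs to):
#
#   tester = Tester(config, device=0,
#                   model_path=Path("exp/ckpt/epoch_9.pt"), epoch="epoch_9")
#   tester.test()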