Example #1
    def __init__(self,
                 df,
                 column_type,
                 embedding_dim=5,
                 n_layers=5,
                 dim_feedforward=100,
                 n_head=5,
                 dropout=0.15,
                 ns_exponent=0.75,
                 share_category=False,
                 use_pos=False,
                 device='cpu'):

        self.logger = create_logger(name="BERTable")

        self.col_type = {'numerical': [], 'categorical': [], 'vector': []}
        for i, data_type in enumerate(column_type):
            self.col_type[data_type].append(i)

        self.embedding_dim = embedding_dim
        self.use_pos = use_pos
        self.device = device

        self.vocab = Vocab(df, self.col_type, share_category, ns_exponent)

        vocab_size = {
            'numerical': len(self.vocab.item2idx['numerical']),
            'categorical': len(self.vocab.item2idx['categorical'])
        }

        vector_dims = [np.shape(df[col])[1] for col in self.col_type['vector']]
        tab_len = len(column_type)
        self.model = Model(vocab_size, self.col_type, use_pos, vector_dims,
                           embedding_dim, dim_feedforward, tab_len, n_layers,
                           n_head, dropout)
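
A minimal, self-contained sketch of the column-type bucketing done at the top of this constructor (the column types and indices below are hypothetical):

column_type = ["numerical", "categorical", "numerical", "vector"]
col_type = {"numerical": [], "categorical": [], "vector": []}
for i, data_type in enumerate(column_type):
    col_type[data_type].append(i)
# col_type == {'numerical': [0, 2], 'categorical': [1], 'vector': [3]}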
Example #2
def main(config_path):
    warnings.filterwarnings("ignore", message="numpy.dtype size changed")
    warnings.filterwarnings("ignore", message="numpy.ufunc size changed")
    warnings.filterwarnings(
        module="nltk",
        category=UserWarning,
        message="\nThe hypothesis contains 0 counts of \d-gram overlaps\.",
        action="ignore",
    )
    config = Box.from_yaml(config_path.open())
    torch.cuda.set_device(config.train.device)
    logger = create_logger(name="MAIN")
    logger.info(f"[-] Config loaded from {config_path}")
    logger.info(f"[-] Experiment: {config.train.exp}")

    exp_path = Path(
        config.data.data_dir) / "exp" / config.model / config.train.exp
    if not exp_path.is_dir():
        exp_path.mkdir(parents=True)
    subprocess.call(["cp", config_path, exp_path / "config.yaml"])

    random.seed(config.random_seed)
    np.random.seed(config.random_seed)
    torch.manual_seed(config.random_seed)
    torch.cuda.manual_seed(config.random_seed)
    logger.info("[-] Random seed set to {}".format(config.random_seed))

    logger.info(f"[*] Initialize {config.model} trainer...")
    T = __import__(config.model, fromlist=["trainer"])
    trainer = T.trainer.Trainer(config, config.train.device)
    logger.info("[-] Trainer initialization completed")
    logger.info("[*] Start training...")
    trainer.train()
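
The dynamic import at the end of this example can also be written with importlib; a sketch assuming config.model holds a package name (e.g. "seq2seq") that contains a trainer submodule:

import importlib

trainer_module = importlib.import_module(f"{config.model}.trainer")  # e.g. "seq2seq.trainer"
trainer = trainer_module.Trainer(config, config.train.device)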
Example #3
def main(config_path):
    warnings.filterwarnings("ignore", message="numpy.dtype size changed")
    warnings.filterwarnings("ignore", message="numpy.ufunc size changed")
    warnings.filterwarnings(
        module='nltk',
        category=UserWarning,
        message='\nThe hypothesis contains 0 counts of \\d-gram overlaps\\.',
        action='ignore')
    config = Box.from_yaml(config_path.open())
    torch.cuda.set_device(config.train.device)
    logger = create_logger(name="MAIN")
    logger.info(f'[-] Config loaded from {config_path}')
    logger.info(f'[-] Experiment: {config.test.exp}')

    exp_path = \
        Path(config.data.data_dir) / "exp" / config.model / config.test.exp

    random.seed(config.random_seed)
    np.random.seed(config.random_seed)
    torch.manual_seed(config.random_seed)
    torch.cuda.manual_seed(config.random_seed)
    logger.info('[-] Random seed set to {}'.format(config.random_seed))

    logger.info(f'[*] Initialize {config.model} tester...')
    T = __import__(config.model, fromlist=['tester'])
    tester = T.tester.Tester(config, config.train.device)
    logger.info('[-] Tester initialization completed')
    logger.info('[*] Start testing...')
    tester.test()
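
Both mains above assume a YAML config with nested fields exposed as attributes by Box; a hedged sketch of the shape they rely on, built directly from a dict (all values hypothetical):

from box import Box

config = Box({
    "model": "seq2seq",
    "random_seed": 42,
    "data": {"data_dir": "data/"},
    "train": {"exp": "baseline", "device": 0},
    "test": {"exp": "baseline", "device": 0},
})
print(config.train.device, config.test.exp)  # 0 baseline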
Example #4
    def __init__(self, config, vocab, device):
        self._logger = create_logger(name="MODEL")
        self._device = device
        self._logger.info("[*] Creating model.")
        self._stats = None

        self._net = Net(config, vocab)
        self._net.to(device=self._device)

        self._optim = getattr(torch.optim, config.optim)(
            filter(lambda p: p.requires_grad, self._net.parameters()),
            **config.optim_param)
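
A self-contained expansion of the getattr-based optimizer construction above, with a stand-in module and hypothetical config values:

import torch
import torch.nn as nn

net = nn.Linear(4, 2)                           # stand-in for Net(config, vocab)
optim_name, optim_param = "Adam", {"lr": 1e-3}  # hypothetical config.optim / config.optim_param
optimizer = getattr(torch.optim, optim_name)(
    filter(lambda p: p.requires_grad, net.parameters()), **optim_param)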
Example #5
    def __init__(self, config, vocab):
        super(Embedding, self).__init__()
        logger = create_logger(name="EMBED")
        UNK = vocab.convert_tokens_to_indices(["<UNK>"])[0]
        PAD = vocab.convert_tokens_to_indices(["<PAD>"])[0]
        if hasattr(config.emb, "embed_path"):
            weight = pickle.load(open(config.emb.embed_path, "rb"))
            self.model = nn.Embedding.from_pretrained(weight,
                                                      freeze=config.emb.freeze,
                                                      padding_idx=PAD)
        else:
            self.model = nn.Embedding(len(vocab),
                                      config.emb.dim,
                                      padding_idx=PAD)
            logger.info("[-] Train from scratch.")
Example #6
    def __init__(self, config, device):
        for k, v in config.test.items():
            setattr(self, k, v)
        self.dc_gate = config.model_param.dc_gate
        self.multi_value = config.train.multi_value
        self.sch_embed = (config.model_param.sch.type == "embed")

        nlp = spacy.load('en')
        self.tokenizer = \
            spacy.lang.en.English().Defaults().create_tokenizer(nlp)

        self.logger = create_logger(name="TEST")

        self.origin_dir = Path(config.data.data_dir)
        self.data_dir = Path(config.data.save_dir)
        self.exp_dir = self.origin_dir / "exp" / config.model / self.exp
        self.pred_dir = self.origin_dir / "prediction"
        if not self.pred_dir.exists():
            self.pred_dir.mkdir()

        self.config = config
        self.device = create_device(device)

        self.vocab = pickle.load(open(self.data_dir / "vocab.pkl", 'rb'))
        self.model = Model(config=config.model_param,
                           vocab=self.vocab,
                           device=self.device)
        self.logger.info(f"[-] Reading word vector......")
        self.emb = {}
        with open(config.data.embed_path, 'r') as file:
            for line in tqdm(file,
                             total=get_num_lines(config.data.embed_path),
                             leave=False):
                data = line.strip().split(' ')
                token, emb = data[0], list(map(float, data[1:]))
                self.emb[token] = emb

        if hasattr(self, "model_path"):
            self.model.load_state(self.model_path,
                                  save_device=config.train.device,
                                  load_device=config.test.device)
        else:
            self.model.load_best_state(self.exp_dir / "ckpt",
                                       save_device=config.train.device,
                                       load_device=config.test.device)

        self.trim_front = [',', '.', '?', '!', ':', "'"]
        self.trim_back = ['#']
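
The word-vector file read in the loop above is assumed to be GloVe-style plain text, one token per line followed by its float components; a tiny parsing sketch:

line = "the 0.418 0.24968 -0.41242"
data = line.strip().split(" ")
token, emb = data[0], list(map(float, data[1:]))
# token == "the"; emb == [0.418, 0.24968, -0.41242]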
Example #7
    def __init__(self, config, device):
        for k, v in config.train.items():
            setattr(self, k, v)
        self.dc_gate = config.model_param.dc_gate
        self.sch_embed = config.model_param.sch.type == "embed"

        self.logger = create_logger(name="TRAIN")
        self.origin_dir = Path(config.data.data_dir)
        self.data_dir = Path(config.data.save_dir)
        self.exp_dir = self.origin_dir / "exp" / config.model / self.exp

        self.config = config
        self.device = create_device(device)

        self.vocab = pickle.load(open(self.data_dir / "vocab.pkl", "rb"))
        self.model = Model(
            config=config.model_param, vocab=self.vocab, device=self.device
        )
        self.__cur_epoch = 0
Example #8
def main(config_path):
    config = Box.from_yaml(config_path.open())
    logger = create_logger(name="MAIN")
    logger.info(f'[-] Config loaded from {config_path}')

    data_dir = Path(config.data.data_dir)
    save_dir = Path(config.data.save_dir)
    fig_dir = save_dir / "fig"
    if not fig_dir.exists():
        fig_dir.mkdir(parents=True)

    train_vocab_file = save_dir / "train_schema_vocab.pkl"
    valid_vocab_file = save_dir / "valid_schema_vocab.pkl"
    train_embed_file = save_dir / "train_schema_embed.pkl"
    valid_embed_file = save_dir / "valid_schema_embed.pkl"

    train_vocab = pickle.load(open(train_vocab_file, 'rb'))
    valid_vocab = pickle.load(open(valid_vocab_file, 'rb'))
    train_embed = pickle.load(open(train_embed_file, 'rb'))
    valid_embed = pickle.load(open(valid_embed_file, 'rb'))

    plt.rcParams.update({'font.size': 8})

    _, idx2service = train_vocab[0]
    matrix = get_matrix(train_embed[0])
    plot(matrix, idx2service, fig_dir / "train_service.pdf")

    _, idx2service = valid_vocab[0]
    matrix = get_matrix(valid_embed[0])
    plot(matrix, idx2service, fig_dir / "valid_service.pdf")

    plt.rcParams.update({'font.size': 6})

    _, idx2intent = train_vocab[1]
    matrix = get_matrix(train_embed[1])
    idx2intent = [intent for service, intent in idx2intent]
    plot(matrix, idx2intent, fig_dir / "train_intent.pdf")

    _, idx2intent = valid_vocab[1]
    matrix = get_matrix(valid_embed[1])
    idx2intent = [intent for service, intent in idx2intent]
    plot(matrix, idx2intent, fig_dir / "valid_intent.pdf")
Example #9
def main(config_path):
    logger = create_logger(name="DATA")
    config = Box.from_yaml(config_path.open())
    data_dir = Path(config.data.data_dir)
    save_dir = Path(config.data.save_dir)

    train_dir = data_dir / "train"
    sd_train_files = list(train_dir.glob("dialogues_*.json"))
    md_train_files = []

    valid_dir = data_dir / "dev"
    sd_valid_files = list(valid_dir.glob("dialogues_*.json"))
    md_valid_files = []

    is_test = (data_dir / "test").exists()
    # Wait for test data release
    if is_test:
        test_dir = data_dir / "test"
        sd_test_files = list(test_dir.glob("dialogues_*.json"))
        md_test_files = []

    train_files, valid_files, test_files = [], [], []

    use_sd = config.data.train.single_domain
    use_md = config.data.train.multi_domain
    assert use_sd or use_md, "Please use at least one part of dataset"
    if use_sd:
        train_files += sd_train_files
    if use_md:
        train_files += md_train_files

    use_sd = config.data.valid.single_domain
    use_md = config.data.valid.multi_domain
    assert use_sd or use_md, "Please use at least one part of dataset"
    if use_sd:
        valid_files += sd_valid_files
    if use_md:
        valid_files += md_valid_files

    if is_test:
        use_sd = config.data.test.single_domain
        use_md = config.data.test.multi_domain
        assert use_sd or use_md, "Please use at least one part of dataset"
        if use_sd:
            test_files += sd_test_files
        if use_md:
            test_files += md_test_files

    train_schema_file = data_dir / "train" / "schema.json"
    valid_schema_file = data_dir / "dev" / "schema.json"
    train_schemas = json.load(open(train_schema_file))
    valid_schemas = json.load(open(valid_schema_file))

    train_schema_vocab_file = save_dir / "train_schema_vocab.pkl"
    valid_schema_vocab_file = save_dir / "valid_schema_vocab.pkl"
    train_schema_vocab = pickle.load(open(train_schema_vocab_file, "rb"))
    valid_schema_vocab = pickle.load(open(valid_schema_vocab_file, "rb"))

    train_dialogues = []
    for f in train_files:
        train_dialogues.extend(json.load(open(f)))

    valid_dialogues = []
    for f in valid_files:
        valid_dialogues.extend(json.load(open(f)))

    if is_test:
        test_schema_file = data_dir / "test" / "schema.json"
        test_schemas = json.load(open(test_schema_file))
        test_schema_vocab_file = save_dir / "test_schema_vocab.pkl"
        test_schema_vocab = pickle.load(open(test_schema_vocab_file, "rb"))

        test_dialogues = []
        for f in test_files:
            test_dialogues.extend(json.load(open(f)))

    # Build vocab
    counter = count_words(train_dialogues, train_schemas)
    vocab_size = config.data.vocab_size
    vocab = Vocab(counter, vocab_size)
    vocab_path = save_dir / "vocab.pkl"
    logger.info(f"[-] Vocab size: {vocab_size}")
    logger.info(f"[-] Full vocab size: {len(vocab._idx2token)}")
    logger.info(f"[*] Dump vocab to {vocab_path}")
    pickle.dump(vocab, open(vocab_path, "wb"))

    # Generate embeddings
    UNK = vocab.convert_tokens_to_indices(["<UNK>"])[0]
    PAD = vocab.convert_tokens_to_indices(["<PAD>"])[0]
    with open(config.data.embed_path, "r") as file:
        line = next(file)
        emb_dim = len(line.strip().split()) - 1
    cover = 0
    weight = torch.zeros(len(vocab), emb_dim)
    with open(config.data.embed_path, "r") as file:
        for line in tqdm(file,
                         total=get_num_lines(config.data.embed_path),
                         leave=False):
            data = line.strip().split(" ")
            token, emb = data[0], list(map(float, data[1:]))
            idx = vocab.convert_tokens_to_indices([token])[0]
            if len(emb) == emb_dim and idx != UNK:
                cover += 1
                weight[idx] = torch.FloatTensor(emb)
    weight[UNK] = 0.0
    weight[PAD] = 0.0
    logger.info((f"[-] Coverage: {cover}/{len(vocab)} "
                 f"({cover / len(vocab) * 100:.2f}%)."))
    pickle.dump(weight, open(config.model_param.emb.embed_path, "wb"))

    # Build dataset
    dataset = build_dataset(train_dialogues, vocab, train_schema_vocab,
                            train_schemas)
    logger.info(f"[-] {len(dataset)} Examples for training")
    pickle.dump(dataset, open(save_dir / "train.pkl", "wb"))
    dataset = build_dataset(valid_dialogues, vocab, valid_schema_vocab,
                            valid_schemas)
    logger.info(f"[-] {len(dataset)} Examples for validating")
    pickle.dump(dataset, open(save_dir / "valid.pkl", "wb"))
    if is_test:
        dataset = build_dataset(test_dialogues, vocab, test_schema_vocab,
                                test_schemas)
        logger.info(f"[-] {len(dataset)} Examples for testing")
        pickle.dump(dataset, open(save_dir / "test.pkl", "wb"))

    # Convert schema desc
    schema_desc = pickle.load(open(save_dir / "train_schema_desc.pkl", "rb"))
    schema_desc = [[vocab.convert_tokens_to_indices(sent) for sent in desc]
                   for desc in schema_desc]
    pickle.dump(schema_desc, open(save_dir / "train_schema_desc.pkl", "wb"))
    schema_desc = pickle.load(open(save_dir / "valid_schema_desc.pkl", "rb"))
    schema_desc = [[vocab.convert_tokens_to_indices(sent) for sent in desc]
                   for desc in schema_desc]
    pickle.dump(schema_desc, open(save_dir / "valid_schema_desc.pkl", "wb"))
    if is_test:
        schema_desc = pickle.load(open(save_dir / "test_schema_desc.pkl",
                                       "rb"))
        schema_desc = [[
            vocab.convert_tokens_to_indices(sent) for sent in desc
        ] for desc in schema_desc]
        pickle.dump(schema_desc, open(save_dir / "test_schema_desc.pkl", "wb"))
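
A hedged, self-contained sketch of the "Generate embeddings" step above: fill a (vocab_size, emb_dim) matrix with vectors for in-vocabulary tokens and leave the <UNK>/<PAD> rows at zero (the toy vocab and lines are hypothetical):

import torch

token2idx = {"<PAD>": 0, "<UNK>": 1, "the": 2}       # toy vocab
emb_dim = 3
weight = torch.zeros(len(token2idx), emb_dim)
for line in ["the 0.1 0.2 0.3", "cat 0.4 0.5 0.6"]:  # stand-in for the embedding file
    data = line.strip().split(" ")
    token, emb = data[0], list(map(float, data[1:]))
    idx = token2idx.get(token, token2idx["<UNK>"])
    if len(emb) == emb_dim and idx != token2idx["<UNK>"]:
        weight[idx] = torch.FloatTensor(emb)
# weight[2] now holds the vector for "the"; "cat" is out of vocabulary and skipped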
Example #10
def main(config_path):
    config = Box.from_yaml(config_path.open())
    torch.cuda.set_device(config.train.device)
    logger = create_logger(name="MAIN")
    logger.info(f"[-] Config loaded from {config_path}")

    data_dir = Path(config.data.data_dir)
    save_dir = Path(config.data.save_dir)
    if not save_dir.exists():
        save_dir.mkdir()
    transfo_dir = Path(config.data.transfo_dir)
    device = create_device(config.train.device)

    tokenizer = BertTokenizer.from_pretrained(
        str(transfo_dir), do_lower_case=(not config.data.cased))

    global CLS
    global SEP
    global PAD
    CLS, SEP, PAD = tokenizer.convert_tokens_to_ids(
        ["[CLS]", "[SEP]", "[PAD]"])

    bert_config = BertConfig.from_pretrained(str(transfo_dir))
    # To extract representations from other layers
    bert_config.output_hidden_states = True
    model = BertModel.from_pretrained(str(transfo_dir), config=bert_config)
    model.to(device)
    model.eval()

    train_file = data_dir / "schema_dstc8+m2.2.json"
    train_vocab_file = save_dir / "train_schema_vocab.pkl"
    train_embed_file = save_dir / "train_schema_embed.pkl"
    train_desc_file = save_dir / "train_schema_desc.pkl"
    valid_file = data_dir / "dev" / "schema.json"
    valid_vocab_file = save_dir / "valid_schema_vocab.pkl"
    valid_embed_file = save_dir / "valid_schema_embed.pkl"
    valid_desc_file = save_dir / "valid_schema_desc.pkl"
    if (data_dir / "test").exists():
        test_file = data_dir / "test" / "schema.json"
        test_vocab_file = save_dir / "test_schema_vocab.pkl"
        test_embed_file = save_dir / "test_schema_embed.pkl"
        test_desc_file = save_dir / "test_schema_desc.pkl"
    else:
        test_file = None
        test_vocab_file = None
        test_embed_file = None
        test_desc_file = None

    train_schema_vocab, train_desc = extract(train_file,
                                             config.data.concat_name)
    valid_schema_vocab, valid_desc = extract(valid_file,
                                             config.data.concat_name)
    if test_file is not None:
        test_schema_vocab, test_desc = extract(test_file,
                                               config.data.concat_name)
    else:
        test_schema_vocab = test_desc = None

    pickle.dump(train_schema_vocab, open(train_vocab_file, "wb"))
    pickle.dump(valid_schema_vocab, open(valid_vocab_file, "wb"))
    if test_schema_vocab is not None:
        pickle.dump(test_schema_vocab, open(test_vocab_file, "wb"))

    layer = config.data.schema.layer
    pooling = config.data.schema.pooling

    train_embed = []
    for desc in tqdm(train_desc, leave=False):
        embed = []
        for sent in tqdm(desc, leave=False):
            embed.append(
                get_rep(sent, model, tokenizer, layer, pooling, device))
        embed = torch.stack(embed)
        train_embed.append(embed)

    train_desc = [[[word.text.lower() for word in spacy_tokenizer(sent)]
                   for sent in desc] for desc in train_desc]

    pickle.dump(train_embed, open(train_embed_file, "wb"))
    pickle.dump(train_desc, open(train_desc_file, "wb"))

    valid_embed = []
    for desc in tqdm(valid_desc, leave=False):
        embed = []
        for sent in tqdm(desc, leave=False):
            embed.append(
                get_rep(sent, model, tokenizer, layer, pooling, device))
        embed = torch.stack(embed)
        valid_embed.append(embed)

    valid_desc = [[[word.text.lower() for word in spacy_tokenizer(sent)]
                   for sent in desc] for desc in valid_desc]

    pickle.dump(valid_embed, open(valid_embed_file, "wb"))
    pickle.dump(valid_desc, open(valid_desc_file, "wb"))

    if test_desc is None:
        exit()

    test_embed = []
    for desc in tqdm(test_desc, leave=False):
        embed = []
        for sent in tqdm(desc, leave=False):
            embed.append(
                get_rep(sent, model, tokenizer, layer, pooling, device))
        embed = torch.stack(embed)
        test_embed.append(embed)

    test_desc = [[[word.text.lower() for word in spacy_tokenizer(sent)]
                  for sent in desc] for desc in test_desc]

    pickle.dump(test_embed, open(test_embed_file, "wb"))
    pickle.dump(test_desc, open(test_desc_file, "wb"))
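
The get_rep helper is not shown above, but with output_hidden_states=True the model returns the hidden states of every layer, so a sentence representation can be pooled from any chosen layer. A hedged sketch of that pattern (the model name and pooling choice are illustrative, not this example's actual configuration):

import torch
from transformers import BertConfig, BertModel, BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
bert_config = BertConfig.from_pretrained("bert-base-uncased")
bert_config.output_hidden_states = True
model = BertModel.from_pretrained("bert-base-uncased", config=bert_config).eval()

ids = torch.tensor([tokenizer.encode("a restaurant reservation service")])
with torch.no_grad():
    outputs = model(ids)
hidden_states = outputs[-1]          # tuple: embedding layer + one tensor per encoder layer
rep = hidden_states[-2].mean(dim=1)  # e.g. mean-pool the second-to-last layer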
Example #11
from datetime import date, timedelta

import dash_core_components as dcc
import dash_html_components as html
from dash_table import DataTable
from pandas import DataFrame

# from apps.publisher.service import default_csc
from apps.service import get_table_styles
from modules.environment import PD_LOGGING_LEVEL
from modules.logger import create_logger
from modules.utils import colors

# create logger object
logger = create_logger(__name__, level=PD_LOGGING_LEVEL)

SATELLITES = [{
    'satellite': 'AMAZONIA1',
    'sensors': ['WFI']
}, {
    'satellite': 'CBERS4A',
    'sensors': ['MUX', 'WFI', 'WPM']
}, {
    'satellite': 'CBERS4',
    'sensors': ['MUX', 'AWFI', 'PAN5M', 'PAN10M']
}, {
    'satellite': 'CBERS2B',
    'sensors': ['CCD', 'WFI', 'HRC']
}, {
    'satellite': 'LANDSAT1',
    'sensors': ['MSS']
Example #12
    def __init__(self,
                 config,
                 device,
                 model_path=None,
                 use_sgd=False,
                 epoch=None):
        for k, v in config.test.items():
            setattr(self, k, v)
        self.dc_gate = config.model_param.dc_gate
        self.multi_value = config.train.multi_value
        self.sch_embed = config.model_param.sch.type == "embed"

        nlp = spacy.load("en")
        self.tokenizer = spacy.lang.en.English().Defaults().create_tokenizer(
            nlp)

        self.logger = create_logger(name="TEST")

        self.origin_dir = Path(config.data.data_dir)
        self.data_dir = Path(config.data.save_dir)
        self.exp_dir = self.origin_dir / "exp" / config.model / self.exp
        if model_path:
            self.model_path = model_path
        self.pred_dir = (self.origin_dir / "prediction" /
                         epoch if epoch else self.origin_dir / "prediction")
        if not self.pred_dir.exists():
            self.pred_dir.mkdir()

        self.config = config
        self.device = create_device(device)

        # pick the vocab directory first, then join the file name
        self.vocab = pickle.load(
            open((self.data_dir if not use_sgd else Path("../save/")) /
                 "vocab.pkl", "rb"))
        self.model = Model(config=config.model_param,
                           vocab=self.vocab,
                           device=self.device)
        self.logger.info(f"[-] Reading word vector......")
        self.emb = {}
        with open(config.data.embed_path, "r") as file:
            for line in tqdm(file,
                             total=get_num_lines(config.data.embed_path),
                             leave=False):
                data = line.strip().split(" ")
                token, emb = data[0], list(map(float, data[1:]))
                self.emb[token] = emb

        if hasattr(self, "model_path"):
            self.model.load_state(
                self.model_path,
                save_device=config.train.device,
                load_device=config.test.device,
            )
        else:
            self.model.load_best_state(
                self.exp_dir / "ckpt",
                save_device=config.train.device,
                load_device=config.test.device,
            )

        self.trim_front = [",", ".", "?", "!", ":", "'"]
        self.trim_back = ["#"]