def func(gpu_id):
    config.set_args(args.gpu_ids.split(',')[gpu_id])
    tester = Tester(Model(), cfg)
    tester.load_weights(test_model)
    # slice of the detections handled by this GPU; renamed so it no longer shadows the builtin range()
    det_range = [ranges[gpu_id], ranges[gpu_id + 1]]
    if cfg.MODEL.occluded_detection:
        return test_net_occ(tester, dets, det_range, gpu_id, d.sigmas)
    else:
        return test_net(tester, dets, det_range, gpu_id, d.sigmas)
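The worker above relies on module-level globals (args, ranges, dets, cfg, d, test_model) prepared elsewhere in the test script. A minimal sketch of how such a worker might be dispatched, assuming the detections in dets are split into one contiguous chunk per GPU (the driver below is an illustration, not the repository's own launcher):

import multiprocessing as mp

import numpy as np

num_gpus = len(args.gpu_ids.split(','))
# one boundary per GPU plus the end: func(gpu_id) processes dets[ranges[gpu_id]:ranges[gpu_id + 1]]
ranges = np.linspace(0, len(dets), num_gpus + 1).astype(int)

with mp.Pool(processes=num_gpus) as pool:
    results = pool.map(func, range(num_gpus))  # each worker evaluates its own slice on its own GPU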
Example #2
def main():
    args = set_args()

    # loading EmoContext data
    print("loading data")
    data = EMO(args)
    setattr(args, 'word_vocab_size', len(data.TEXT.vocab))
    setattr(args, 'model_time', strftime('%H:%M:%S', gmtime()))
    setattr(args, 'class_size', len(data.LABEL.vocab))
    setattr(args, 'max_word_len', data.max_word_len)
    setattr(args, 'char_vocab_size', len(data.char_vocab))
    setattr(args, 'FILTER_SIZES', [1, 3, 5])
    print(args.char_vocab_size)

    print('Vocab Size: ' + str(len(data.TEXT.vocab)))

    if args.ss_emb:
        build_sswe_vectors()

    best_model, max_test_acc, max_test_f1 = train(args, data)

    model_name_str = 'NN4EMO_' + args.name_tag

    if not os.path.exists('saved_models'):
        os.makedirs('saved_models')
    model_name = f'{model_name_str}_{args.model_time}_{max_test_acc:.4f}_{max_test_f1:.4f}.pt'
    torch.save(best_model, 'saved_models/' + model_name)

    print('training finished!')

    submission(args, model_name)
Example #3
def main():
    args = set_args()
    global logger
    logger = create_logger(__name__, to_disk=True, log_file=args.log_file)
    logger.info('~Processing SQuAD dataset~')
    train_path = os.path.join(args.data_dir, 'train-v1.1.json')
    valid_path = os.path.join(args.data_dir, 'dev-v1.1.json')
    logger.info('The path of training data: {}'.format(train_path))
    logger.info('The path of validation data: {}'.format(valid_path))
    logger.info('{}-dim word vector path: {}'.format(args.glove_dim,
                                                     args.glove))
    glove_path = args.glove
    glove_dim = args.glove_dim
    nlp = spacy.load('en', parser=False)
    set_environment(args.seed)
    logger.info('Loading glove vocab.')
    glove_vocab = load_glove_vocab(glove_path, glove_dim)
    # load data
    logger.info('Loading data vocab.')
    train_data = load_data(train_path)
    valid_data = load_data(valid_path, False)
    vocab_tag = Vocabulary.build(nlp.tagger.tag_names, neat=True)
    vocab_ner = Vocabulary.build([''] + nlp.entity.cfg[u'actions']['1'],
                                 neat=True)
    logger.info('Build vocabulary')
    vocab = build_vocab(train_data + valid_data,
                        glove_vocab,
                        sort_all=args.sort_all,
                        clean_on=True)

    meta_path = os.path.join(args.data_dir, args.meta)
    logger.info('building embedding')
    embedding = build_embedding(glove_path, vocab, glove_dim)
    meta = {
        'vocab': vocab,
        'vocab_tag': vocab_tag,
        'vocab_ner': vocab_ner,
        'embedding': embedding
    }

    # If you want to check vocab token IDs, etc., load the meta file below (squad_meta.pick).
    with open(meta_path, 'wb') as f:
        pickle.dump(meta, f)
    logger.info('started the function build_data')
    train_fout = os.path.join(args.data_dir, args.train_data)
    build_data(train_data,
               vocab,
               vocab_tag,
               vocab_ner,
               train_fout,
               True,
               thread=args.threads)
    dev_fout = os.path.join(args.data_dir, args.dev_data)
    build_data(valid_data,
               vocab,
               vocab_tag,
               vocab_ner,
               dev_fout,
               False,
               thread=args.threads)
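The comment above suggests inspecting the pickled meta file; a minimal sketch of loading it back (the path mirrors args.data_dir/args.meta, e.g. squad_meta.pick, and is an assumption here):

import os
import pickle

# Hypothetical inspection of the dumped meta file; adjust 'data' and the file name to your args.
with open(os.path.join('data', 'squad_meta.pick'), 'rb') as f:
    meta = pickle.load(f)

vocab = meta['vocab']          # the Vocabulary built by build_vocab
embedding = meta['embedding']  # the matrix built by build_embedding
print(type(vocab), type(embedding))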
Example #4
    def __init__(self, embedding_pre=None):

        super(BiLSTM_ATT, self).__init__()
        args = set_args()
        self.hidden_size = args.hidden_size
        self.tag_size = args.tag_size

        # 1. Word embedding
        if args.is_train_embedding:
            self.word_embeds = nn.Embedding(args.vocab_size, args.embed_dim)
        else:
            self.word_embeds = nn.Embedding.from_pretrained(
                torch.FloatTensor(embedding_pre), freeze=False)

        # Two kinds of position embeddings
        self.pos1_embeds = nn.Embedding(args.pos_size, args.pos_dim)
        self.pos2_embeds = nn.Embedding(args.pos_size, args.pos_dim)

        self.lstm = nn.LSTM(input_size=args.embed_dim + args.pos_dim * 2,
                            hidden_size=args.hidden_size // 2,
                            num_layers=1,
                            bidirectional=True,
                            batch_first=True)
        self.dropout_lstm = nn.Dropout(p=0.5)
        self.dropout_att = nn.Dropout(p=0.5)
        self.relation_embeds = nn.Embedding(args.tag_size, self.hidden_size)
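The constructor above only defines the layers; the repository's forward pass is not shown. A minimal sketch of what a matching forward could look like, assuming token and position index tensors of shape (batch, seq_len) and that torch is imported at module level:

    def forward(self, sentence, pos1, pos2):
        # (batch, seq_len, embed_dim + 2 * pos_dim) -- matches the LSTM input_size above
        embeds = torch.cat((self.word_embeds(sentence),
                            self.pos1_embeds(pos1),
                            self.pos2_embeds(pos2)), dim=2)
        lstm_out, _ = self.lstm(embeds)        # (batch, seq_len, hidden_size)
        lstm_out = self.dropout_lstm(lstm_out)
        return lstm_out                        # attention over relation_embeds would follow here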
Example #5
def main():
    args = set_args()
    setattr(args, 'model_time', strftime('%H:%M:%S', gmtime()))
    setattr(args, 'class_size', 4)

    # loading EmoContext data
    print("loading data")
    train_dataloader, valid_dataloader, num_train_examples = getDataLoaders(
        args)

    best_model, max_dev_f1 = train(args, train_dataloader, valid_dataloader,
                                   num_train_examples)

    if not os.path.exists('saved_models'):
        os.makedirs('saved_models')
    torch.save(best_model,
               f'saved_models/BERT_{args.model_time}_{max_dev_f1}.pt')

    print('training finished!')
Example #6
def main():
    # Set up the model training arguments
    args = set_args()

    # Set random seeds so runs are reproducible
    if args.seed:
        torch.manual_seed(args.seed)
        random.seed(args.seed)
        np.random.seed(args.seed)
    # Load the model config
    model_config = GPT2Config.from_json_file(args.config_path)

    # Instantiate GPT2LMHeadModel; without a pretrained checkpoint, training starts from scratch.
    if args.pretrained_model_path:
        model = GPT2LMHeadModel.from_pretrained(args.pretrained_model_path)
    else:
        # No pretrained model specified, so initialize the model from the config
        model = GPT2LMHeadModel(config=model_config)

    tokenizer = BertTokenizer.from_pretrained(args.vocab_path,
                                              do_lower_case=True)

    # Treat [Space] as a single unit. For example, for "我爱[Space]中国。" the raw tokenizer produces
    # ['我', '爱', '[', 'Space', ']', '中', '国', '。'];
    # after adding the special token it produces ['我', '爱', '[Space]', '中', '国', '。'].
    tokenizer.add_tokens("[Space]", special_tokens=True)

    if not os.path.exists(args.output_dir):
        os.mkdir(args.output_dir)

    # Load the training and test data
    train_data = GPT2NewsTitleDataSet(tokenizer, args.max_len,
                                      args.title_max_len, args.data_dir,
                                      "train", args.train_file_path)
    test_data = GPT2NewsTitleDataSet(tokenizer, args.max_len,
                                     args.title_max_len, args.data_dir, "test",
                                     args.test_file_path)
    # Start training
    train(model, train_data, test_data, args)
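A small sketch of the [Space] handling described in the comments above, assuming a BERT vocab file is available at the hypothetical path vocab/vocab.txt:

from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('vocab/vocab.txt', do_lower_case=True)
print(tokenizer.tokenize("我爱[Space]中国。"))  # before: '[Space]' is split into several pieces
tokenizer.add_tokens("[Space]", special_tokens=True)
print(tokenizer.tokenize("我爱[Space]中国。"))  # after: '[Space]' survives as a single token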
Example #7
    def forward(self, batch):
        doc_input, query_input,\
        doc_emb, query_emb,\
        doc_cove_low, doc_cove_high,\
        query_cove_low, query_cove_high,\
        doc_mask, query_mask,\
        doc_elmo, query_elmo = self.lexicon_encoder(batch)

        query_list, doc_list = [], []
        query_list.append(query_input)
        doc_list.append(doc_input)

        # doc encode
        if self.opt['elmo_on']:
            doc_low = self.doc_encoder_low(
                torch.cat([doc_input, doc_cove_low, doc_elmo[0]], 2), doc_mask)
        else:
            doc_low = self.doc_encoder_low(
                torch.cat([doc_input, doc_cove_low], 2), doc_mask)
        doc_low = self.dropout(doc_low)

        if self.opt['elmo_on']:
            doc_high = self.doc_encoder_high(
                torch.cat([doc_low, doc_cove_high, doc_elmo[1]], 2), doc_mask)
        else:
            doc_high = self.doc_encoder_high(
                torch.cat([doc_low, doc_cove_high], 2), doc_mask)

        doc_high = self.dropout(doc_high)
        # query
        if self.opt['elmo_on']:
            query_low = self.query_encoder_low(
                torch.cat([query_input, query_cove_low, query_elmo[0]], 2),
                query_mask)
        else:
            query_low = self.query_encoder_low(
                torch.cat([query_input, query_cove_low], 2), query_mask)
        query_low = self.dropout(query_low)
        if self.opt['elmo_on']:
            query_high = self.query_encoder_high(
                torch.cat([query_low, query_cove_high, query_elmo[1]], 2),
                query_mask)
        else:
            query_high = self.query_encoder_high(
                torch.cat([query_low, query_cove_high], 2), query_mask)
        query_high = self.dropout(query_high)

        query_mem_hiddens = self.query_understand(
            torch.cat([query_low, query_high], 2), query_mask)
        query_mem_hiddens = self.dropout(query_mem_hiddens)
        query_list = [query_low, query_high, query_mem_hiddens]
        doc_list = [doc_low, doc_high]

        query_att_input = torch.cat(
            [query_emb, query_cove_high, query_low, query_high], 2)
        doc_att_input = torch.cat([doc_emb, doc_cove_high] + doc_list, 2)
        if self.opt['elmo_on'] and self.opt['elmo_att_on']:
            idx = -2 if self.opt['elmo_self_att_on'] else -1
            doc_att_input = torch.cat([doc_att_input, doc_elmo[idx]], 2)
            query_att_input = torch.cat([query_att_input, query_elmo[idx]], 2)
        # setup logger
        args = set_args()
        logger = create_logger(__name__, to_disk=True, log_file=args.log_file)
        # logger.warning('doc_self_hiddens {}{}{}'.format(doc_self_hiddens.output_size,doc_mem_gen.output_size,query_sum_attn.output_size))
        # logger.warning('before att {}{}{}'.format(doc_att_input.shape,query_att_input.shape, query_mask.shape,query_low.shape,query_mem_hiddens.shape ))
        # before att torch.Size([64, 246, 1412])torch.Size([64, 37, 1412])torch.Size([64, 37])
        # s=ConvAtt(doc_att_input,query_att_input,0.5)
        # s=s.cuda()
        # a=s()
        doc_attn_hiddens = self.deep_attn(doc_att_input, query_att_input,
                                          query_list, query_mask)
        # logger.warning('before att {}'.format(doc_attn_hiddens.shape))
        # before att torch.Size([64, 246, 768])
        doc_attn_hiddens = self.dropout(doc_attn_hiddens)
        # doc_attn_hiddens = self.dropout(a)
        doc_mem_hiddens = self.doc_understand(
            torch.cat([doc_attn_hiddens] + doc_list, 2), doc_mask)
        doc_mem_hiddens = self.dropout(doc_mem_hiddens)
        doc_mem_inputs = torch.cat([doc_attn_hiddens] + doc_list, 2)
        if self.opt['self_attention_on']:
            doc_att = torch.cat(
                [doc_mem_inputs, doc_mem_hiddens, doc_cove_high, doc_emb], 2)
            if self.opt['elmo_on'] and self.opt['elmo_self_att_on']:
                doc_att = torch.cat([doc_att, doc_elmo[-1]], 2)

            doc_self_hiddens = self.doc_self_attn(doc_att,
                                                  doc_att,
                                                  doc_mask,
                                                  x3=doc_mem_hiddens)
            doc_mem = self.doc_mem_gen(
                torch.cat([doc_mem_hiddens, doc_self_hiddens], 2), doc_mask)
        else:
            doc_mem = doc_mem_hiddens
        query_mem = self.query_sum_attn(query_mem_hiddens, query_mask)
        start_scores, end_scores = self.decoder(doc_mem, query_mem, doc_mask)
        # logger.warning('query_mem {}'.format(query_mem.shape))
        # logger.warning('hiddens {}'.format(query_mem_hiddens.shape))

        pred_score = None
        if self.classifier is not None:
            doc_sum = self.doc_sum_attn(doc_mem, doc_mask)
            pred_score = F.sigmoid(
                self.classifier(doc_sum, query_mem, doc_mask))
        return start_scores, end_scores, pred_score
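start_scores and end_scores are per-token logits over the document. A minimal sketch of turning them into answer spans by greedy argmax (ignoring the start-before-end and max-length constraints a real decoder enforces, so purely illustrative):

import torch

def decode_span(start_scores, end_scores):
    # assumes shape (batch, doc_len) for both score tensors
    start = torch.argmax(start_scores, dim=1)
    end = torch.argmax(end_scores, dim=1)
    return list(zip(start.tolist(), end.tolist()))

Note that F.sigmoid used above is deprecated in recent PyTorch releases in favour of torch.sigmoid.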
Example #8
                    scores).contiguous(), device)

            batch_critic_loss = critic_criterion(scores, rewards)
            critic_loss += batch_critic_loss.item()
            batch_rl_loss = a2c_loss(points, logits, rewards, scores.detach())
            rl_loss += batch_rl_loss.item()
            batch_loss = batch_critic_loss + batch_rl_loss

            optimizer.zero_grad()
            batch_loss.backward()
            optimizer.step()

            if (step + 1) % opt['print_every'] == 0:
                print('step ' + str(step + 1) + '/' +
                      str(len(data.train_loader)) + ': critic loss ' +
                      str(critic_loss) + ' rl loss ' + str(rl_loss))
                critic_loss = 0
                rl_loss = 0
            if (step + 1) % opt['validate_every'] == 0:
                validate(step, extractor, abstractor, data.valid_loader,
                         device)


if __name__ == '__main__':
    opt = set_args()
    opt['mode'] = 'r'
    opt['model_time'] = strftime('%H:%M:%S', gmtime())
    data = CnnDm(opt)
    opt['vocab_size'] = len(data.vocab)
    train(opt, data)
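a2c_loss(points, logits, rewards, scores.detach()) is called above but not shown. The repository's implementation is not reproduced here; an advantage actor-critic policy loss of that shape might look roughly like the following sketch, where the shapes and meaning of points (chosen action indices) and logits (per-step action logits) are assumptions:

import torch
import torch.nn.functional as F

def a2c_loss(points, logits, rewards, baselines):
    # points: (T,) indices of chosen actions; logits: (T, num_actions);
    # rewards: (T,) returns; baselines: (T,) detached critic estimates.
    advantages = rewards - baselines
    log_probs = F.log_softmax(logits, dim=-1)
    chosen = log_probs.gather(1, points.unsqueeze(1)).squeeze(1)  # log pi of the chosen actions
    return -(chosen * advantages).mean()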
Example #9
def __init__(self):
    self.args = config.set_args()
    self.filename = self.args["yolo_comm_txt"]
    self.image_name = self.args["yolo_comm_img"]
    self.slave = CommSlave(filename=self.filename)
    self.detector = Detector()
Example #10
def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument('--gpu', type=str, dest='gpu_ids')
    parser.add_argument('--continue', dest='continue_train', action='store_true')
    parser.add_argument('--cfg', type=str, dest="cfg")
    args = parser.parse_args()

    if not args.gpu_ids:
        args.gpu_ids = str(np.argmin(mem_info()))

    if '-' in args.gpu_ids:
        gpus = args.gpu_ids.split('-')
        gpus[0] = 0 if not gpus[0].isdigit() else int(gpus[0])
        gpus[1] = len(mem_info()) if not gpus[1].isdigit() else int(gpus[1]) + 1
        args.gpu_ids = ','.join(map(lambda x: str(x), list(range(*gpus))))

    return args
args = parse_args()
cfg.set_config(args.cfg, train=True)
cfg.set_args(args.gpu_ids, args.continue_train)
random.seed(2233)

from model import Model
from tfflat.base import Trainer
from tfflat.utils import mem_info
trainer = Trainer(Model(), cfg.cfg)
trainer.train()
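The '-' branch in parse_args expands a range such as --gpu 2-4 into an explicit id list (a missing lower bound falls back to 0, a missing upper bound to the number of GPUs reported by mem_info()). The string logic in isolation behaves like this:

# pure string illustration of the range expansion; no GPUs or mem_info() needed
gpus = '2-4'.split('-')                        # ['2', '4']
gpus = [int(gpus[0]), int(gpus[1]) + 1]        # upper bound is inclusive -> [2, 5]
print(','.join(str(i) for i in range(*gpus)))  # prints '2,3,4'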



Example #11
def main():
    args = set_args()
    global logger
    start_time = time.time()
    logger = create_logger(__name__, to_disk=True,
                           log_file=args.log_file)  # ./san.log
    v2_on = args.v2_on
    version = 'v1'
    if v2_on:
        msg = '~Processing SQuAD v2.0 dataset~'
        train_path = 'train-v2.0.json'
        dev_path = 'dev-v2.0.json'
        version = 'v2'
    else:
        msg = '~Processing SQuAD dataset~'
        train_path = 'train-v1.1.json'
        dev_path = 'dev-v1.1.json'

    logger.warning(msg)
    if DEBUG_ON:
        logger.error('***DEBUGGING MODE***')
    train_path = os.path.join(
        args.data_dir, train_path)  # args.data_dir=data/, data/train-v2.0.json
    valid_path = os.path.join(args.data_dir, dev_path)  # data/dev-v2.0.json

    logger.info('The path of training data: {}'.format(train_path))
    logger.info('The path of validation data: {}'.format(valid_path))
    logger.info('{}-dim word vector path: {}'.format(
        args.embedding_dim, args.glove))  # embedding_dim=300
    # could be fasttext embedding
    emb_path = args.glove  # data/glove.840B.300d.txt
    embedding_dim = args.embedding_dim
    set_environment(args.seed)
    if args.fasttext_on:  # store_true
        logger.info('Loading fasttext vocab.')
    else:
        logger.info('Loading glove vocab.')
    # load data
    train_data = load_data(train_path, v2_on=v2_on)
    dev_data = load_data(valid_path, False, v2_on=v2_on)
    """From GLoVe to acquire tokens, to set()"""
    wemb_vocab = load_emb_vocab(emb_path,
                                embedding_dim,
                                fast_vec_format=args.fasttext_on)
    logger.info('Build vocabulary')
    """
    '--sort_all', action='store_true',
        sort the vocabulary by frequencies of all words, Otherwise consider question words first.
    """
    vocab, _, _ = build_vocab(train_data + dev_data,
                              wemb_vocab,
                              sort_all=args.sort_all,
                              clean_on=True,
                              cl_on=False)
    logger.info('Done with vocabulary collection')

    # loading ner/pos tagging vocab
    resource_path = 'resource'
    logger.info('Loading resource')

    with open(os.path.join(resource_path, 'vocab_tag.pick'), 'rb') as f:
        vocab_tag = pickle.load(f)
    with open(os.path.join(resource_path, 'vocab_ner.pick'), 'rb') as f:
        vocab_ner = pickle.load(f)

    meta_path = gen_name(args.data_dir, args.meta, version, suffix='pick')
    logger.info('building embedding')
    embedding = build_embedding(emb_path,
                                vocab,
                                embedding_dim,
                                fast_vec_format=args.fasttext_on)
    meta = {
        'vocab': vocab,
        'vocab_tag': vocab_tag,
        'vocab_ner': vocab_ner,
        'embedding': embedding
    }
    with open(meta_path, 'wb') as f:
        pickle.dump(meta, f)

    logger.info('building training data')
    train_fout = gen_name(args.data_dir, args.train_data, version)
    build_data(train_data,
               vocab,
               vocab_tag,
               vocab_ner,
               train_fout,
               True,
               NLP=NLP,
               v2_on=v2_on)

    logger.info('building dev data')
    dev_fout = gen_name(args.data_dir, args.dev_data, version)
    build_data(dev_data,
               vocab,
               vocab_tag,
               vocab_ner,
               dev_fout,
               False,
               NLP=NLP,
               v2_on=v2_on)
    end_time = time.time()
    logger.warning('It totally took {} minutes to process the data!!'.format(
        (end_time - start_time) / 60.))
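gen_name is defined elsewhere in the repository; judging only from its call sites above, it builds an output path from a directory, a base name, a version tag and a suffix. A plausible sketch (the exact naming scheme and the default suffix are assumptions):

import os

def gen_name(dir_name, base_name, version, suffix='json'):
    # e.g. gen_name('data/', 'squad_meta', 'v2', suffix='pick') -> 'data/squad_meta_v2.pick'
    return os.path.join(dir_name, '{}_{}.{}'.format(base_name, version, suffix))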
Example #12
from torch.autograd import Variable
from utils.util import *
import time
import argparse
"""
Description:

    This is a script of Gradient-based Foreground Adjustment Algorithm.
    The (x, y, scale) of each foreground object will be adjusted, guided by the model's gradient.
"""

# ========================== Constants =====================
parser = argparse.ArgumentParser(description='Inference Phase')
time = time.gmtime()
time = "-".join([str(p) for p in list(time)[:5]])
config = set_args()
test_fg = []

SAMPLE_NUM = config['sample_num']
ROUND = config['update_rd']
TOPK = config['top_k']

start_x = 0
start_y = 0
fx = [[-1, 0, 1], [1, 0, 1], [0, -1, 1], [0, 1, 1], [-1, 0, 0.95],
      [1, 0, 0.95], [0, -1, 0.95], [0, 1, 0.95], [-1, 0, 1.05], [1, 0, 1.05],
      [0, -1, 1.05], [0, 1, 1.05]]

# ======================== loading ckpt ================== #
ckpt = os.path.join(
    "checkpoints",
Example #13
def main():
    args = set_args()
    # Load the training set
    with gzip.open(args.train_data_path, 'rb') as f:
        train_features = pickle.load(f)
    
    # Load the validation set
    with gzip.open(args.dev_data_path, 'rb') as f:
        eval_features = pickle.load(f)
    
    # Total number of training steps
    num_train_steps = int(
        len(train_features) / args.train_batch_size / args.gradient_accumulation_steps * args.num_train_epochs)
    
    # Build the model
    model = Model()

    # Run on multiple GPUs when available
    if torch.cuda.is_available():
        model.cuda()

    if torch.cuda.device_count() > 1:
        args.n_gpu = torch.cuda.device_count()
        print("Let's use", torch.cuda.device_count(), "GPUs!")
        # This single line enables DataParallel
        model = nn.DataParallel(model)

    tokenizer = BertTokenizer.from_pretrained(args.vocab_file)
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]

    warmup_steps = 0.05 * num_train_steps
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=1e-8)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=warmup_steps, num_training_steps=num_train_steps)

    best_loss = None
    global_step = 0

    # Start training
    print("***** Running training *****")
    print("  Num examples = {}".format(len(train_features)))
    print("  Batch size = {}".format(args.train_batch_size))
    all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
    all_label_ids = torch.tensor([f.label for f in train_features], dtype=torch.float32)

    train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)

    model.train()
    for epoch in range(args.num_train_epochs):
        train_dataloader = DataLoader(train_data, shuffle=True, batch_size=args.train_batch_size)
        for step, batch in enumerate(train_dataloader):
            start_time = time.time()
            if torch.cuda.is_available():
                batch = tuple(t.cuda() for t in batch)
            input_ids, input_mask, segment_ids, label = batch

            logits = model(input_ids=input_ids, attention_mask=input_mask, segment_ids=segment_ids, labels=label)
            loss = loss_fct(logits, label)
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps
            s = '****Epoch: {}, step: {}, loss: {:10f}, time_cost: {:10f}'.format(epoch, step, loss, time.time() - start_time)
            rainbow(s)
            loss.backward()
            # nn.utils.clip_grad_norm_(model.parameters(), max_norm=20, norm_type=2)   # optional gradient clipping

            if (step + 1) % args.gradient_accumulation_steps == 0:
                optimizer.step()
                scheduler.step()
                optimizer.zero_grad()
                global_step += 1
            # test_loss, test_acc = evaluate(epoch, eval_features, args, model)

        # Evaluate at the end of each epoch
        test_loss, test_acc = evaluate(epoch, eval_features, args, model)
        model.train()
        if best_loss is None or best_loss > test_loss:
            best_loss = test_loss
            model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model it-self
            os.makedirs(args.save_model, exist_ok=True)

            output_model_file = os.path.join(args.save_model, "best_pytorch_model.bin")
            torch.save(model_to_save.state_dict(), output_model_file)

        # Save a trained model
        model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model it-self
        output_model_file = os.path.join(args.save_model, "epoch{}_ckpt.bin".format(epoch))
        torch.save(model_to_save.state_dict(), output_model_file)
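loss_fct is not defined inside this excerpt. Given that the labels are built with dtype=torch.float32 and the model returns raw logits, a plausible stand-in (an assumption, not the repository's actual definition) would be a binary cross-entropy objective:

import torch.nn as nn

# assumed binary relevance objective; the real loss_fct lives elsewhere in the repository
loss_fct = nn.BCEWithLogitsLoss()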
Example #14
def main():
    args = set_args()
    global logger
    start_time = time.time()
    logger = create_logger(__name__, to_disk=True, log_file=args.log_file)
    logger.warning('~Processing SQuAD dataset~')
    train_path = os.path.join(args.data_dir, 'train-v1.1.json')
    valid_path = os.path.join(args.data_dir, 'dev-v1.1.json')
    logger.info('The path of training data: {}'.format(train_path))
    logger.info('The path of validation data: {}'.format(valid_path))
    logger.info('{}-dim word vector path: {}'.format(args.embedding_dim,
                                                     args.glove))
    # could be fasttext embedding
    emb_path = args.glove
    embedding_dim = args.embedding_dim
    set_environment(args.seed)
    if args.fasttext_on:
        logger.info('Loading fasttext vocab.')
    else:
        logger.info('Loading glove vocab.')
    wemb_vocab = load_emb_vocab(emb_path,
                                embedding_dim,
                                fast_vec_format=args.fasttext_on)
    # load data
    train_data = load_data(train_path)
    valid_data = load_data(valid_path, False)

    logger.info('Build vocabulary')
    vocab, _, _ = build_vocab(train_data + valid_data,
                              wemb_vocab,
                              sort_all=args.sort_all,
                              clean_on=True,
                              cl_on=False)
    logger.info('Done with vocabulary collection')

    # loading ner/pos tagging vocab
    resource_path = 'resource'
    logger.info('Loading resource')

    with open(os.path.join(resource_path, 'vocab_tag.pick'), 'rb') as f:
        vocab_tag = pickle.load(f)
    with open(os.path.join(resource_path, 'vocab_ner.pick'), 'rb') as f:
        vocab_ner = pickle.load(f)

    meta_path = os.path.join(args.data_dir, args.meta)
    logger.info('building embedding')
    embedding = build_embedding(emb_path,
                                vocab,
                                embedding_dim,
                                fast_vec_format=args.fasttext_on)
    meta = {
        'vocab': vocab,
        'vocab_tag': vocab_tag,
        'vocab_ner': vocab_ner,
        'embedding': embedding
    }
    with open(meta_path, 'wb') as f:
        pickle.dump(meta, f)

    train_fout = os.path.join(args.data_dir, args.train_data)
    build_data(train_data, vocab, vocab_tag, vocab_ner, train_fout, True)

    dev_fout = os.path.join(args.data_dir, args.dev_data)
    build_data(valid_data, vocab, vocab_tag, vocab_ner, dev_fout, False)
    end_time = time.time()
    logger.info('It totally took {} minutes to process the data!!'.format(
        (end_time - start_time) / 60.))
Example #15
def main():
    args = set_args()
    args.datasets = args.datasets.split(',')
    global logger
    logger = create_logger(__name__, to_disk=True, log_file=args.log_file)

    all_data = []
    all_datasets = []
    for dataset_name in args.datasets:
        test_file_prefix = 'test'
        if test_mode:
            if 'marco' in dataset_name:
                train_file_prefix = 'train'
                dev_file_prefix = 'dev'
            else:
                train_file_prefix = 'dev'
                dev_file_prefix = 'dev'
        else:
            train_file_prefix = 'train'
            dev_file_prefix = 'dev'

        logger.info('Processing %s dataset' % dataset_name)
        this_data_dir = args.data_dir + dataset_name + '/'
        train_data = None
        train_path = os.path.join(this_data_dir, '%s.json' % train_file_prefix)
        logger.info('The path of training data: {}'.format(train_path))
        train_data = load_data(train_path)
        all_data += train_data

        valid_path = os.path.join(this_data_dir, '%s.json' % dev_file_prefix)
        logger.info('The path of validation data: {}'.format(valid_path))
        valid_data = load_data(valid_path, False)
        all_data += valid_data
        if args.include_test_set and 'squad' not in dataset_name and 'marco2.0' not in dataset_name:
            test_path = os.path.join(this_data_dir,
                                     '%s.json' % test_file_prefix)
            logger.info('The path of test data: {}'.format(test_path))
            test_data = load_data(test_path, False)
            all_data += test_data
            all_datasets.append((train_data, valid_data, test_data))
        else:
            all_datasets.append((train_data, valid_data))

    logger.info('{}-dim word vector path: {}'.format(args.glove_dim,
                                                     args.glove))
    glove_path = args.glove
    glove_dim = args.glove_dim
    nlp = spacy.load('en', parser=False)
    set_environment(args.seed)
    logger.info('Loading glove vocab.')
    glove_vocab = load_glove_vocab(glove_path, glove_dim)

    multitask_base_path = '../data/mtmrc/'
    with open(multitask_base_path + 'vocab_tag.pick', 'rb') as f:
        vocab_tag = pickle.load(f)
    with open(multitask_base_path + 'vocab_ner.pick', 'rb') as f:
        vocab_ner = pickle.load(f)

    logger.info('Build vocabulary ')
    vocab = build_vocab(all_data,
                        glove_vocab,
                        sort_all=args.sort_all,
                        clean_on=True,
                        args=args)
    meta_path = os.path.join(args.output_path, args.meta)
    logger.info('building embedding ')
    embedding = build_embedding(glove_path, vocab, glove_dim)
    meta = {
        'vocab': vocab,
        'vocab_tag': vocab_tag,
        'vocab_ner': vocab_ner,
        'embedding': embedding
    }
    with open(meta_path, 'wb') as f:
        pickle.dump(meta, f)
    for i, item in enumerate(all_datasets):
        dataset_name = args.datasets[i]
        if args.include_test_set and 'squad' not in dataset_name and 'marco2.0' not in dataset_name:
            train_data, valid_data, test_data = item
        else:
            train_data, valid_data = item
        print('building output file for ', dataset_name)
        train_fout = os.path.join(args.output_path,
                                  dataset_name + '_train.json')
        build_data(train_data,
                   vocab,
                   vocab_tag,
                   vocab_ner,
                   train_fout,
                   True,
                   dataset_name=dataset_name)
        dev_fout = os.path.join(args.output_path, dataset_name + '_dev.json')
        build_data(valid_data,
                   vocab,
                   vocab_tag,
                   vocab_ner,
                   dev_fout,
                   False,
                   dataset_name=dataset_name)
        if args.include_test_set and 'squad' not in dataset_name:
            test_fout = os.path.join(args.output_path,
                                     dataset_name + '_test.json')
            build_data(test_data,
                       vocab,
                       vocab_tag,
                       vocab_ner,
                       test_fout,
                       False,
                       dataset_name=dataset_name)
Example #16
def main():
    args = set_args()
    global logger
    logger = create_logger(__name__, to_disk=True, log_file=args.log_file)
    logger.info('Processing dataset')
    train_path = os.path.join(args.raw_data_dir, 'train')
    valid_path = os.path.join(args.raw_data_dir, 'dev')
    test_path = os.path.join(args.raw_data_dir, 'test')
    logger.info('The path of training data: {}'.format(train_path))
    logger.info('The path of validation data: {}'.format(valid_path))
    logger.info('The path of test data: {}'.format(test_path))
    logger.info('{}-dim word vector path: {}'.format(args.glove_dim,
                                                     args.glove))
    glove_path = args.glove
    glove_dim = args.glove_dim
    # set_environment(args.seed)

    # load data
    train_data = load_reddit_data(train_path,
                                  anc_type='section',
                                  fact_len=12,
                                  just_anc=False,
                                  is_train=True)
    valid_data = load_reddit_data(valid_path,
                                  anc_type='section',
                                  fact_len=12,
                                  just_anc=False,
                                  is_train=False)
    test_data = load_reddit_data(test_path,
                                 anc_type='section',
                                 fact_len=12,
                                 just_anc=False,
                                 is_train=False)
    logger.info('#train data: {}'.format(len(train_data)))
    logger.info('#valid data: {}'.format(len(valid_data)))
    logger.info('#test data: {}'.format(len(test_data)))
    meta_path = args.meta

    if not os.path.exists(meta_path):
        logger.info('Build vocabulary')
        vocab = build_vocab(train_data + valid_data)
        logger.info('building embedding')
        embedding = build_embedding(glove_path, vocab, glove_dim)
        logger.info('emb done')
        meta = {'vocab': vocab, 'embedding': embedding}
        with open(meta_path, 'wb') as f:
            pickle.dump(meta, f)
    else:
        with open(meta_path, 'rb') as f:
            meta = pickle.load(f)
            vocab = meta['vocab']

    train_fout = os.path.join(args.data_dir, args.train_data)
    build_data(train_data, vocab, train_fout)
    logger.info('train data done')

    dev_fout = os.path.join(args.data_dir, args.dev_data)
    build_data(valid_data, vocab, dev_fout)
    logger.info('valid data done')

    test_fout = os.path.join(args.data_dir, args.test_data)
    build_data(test_data, vocab, test_fout)
    logger.info('test data done')

    write_files(args.data_dir + '/train', train_data)
    write_files(args.data_dir + '/dev', valid_data)
    write_files(args.data_dir + '/test', test_data)
Example #17
def main():
    args = set_args()
    global logger
    start_time = time.time()
    logger = create_logger(__name__, to_disk=True, log_file=args.log_file)
    v2_on = args.v2_on
    version = 'v1'
    if v2_on:
        msg = '~Processing SQuAD v2.0 dataset~'
        # train_path = 'train-v2.0.json'
        # dev_path = 'dev-v2.0.json'

        train_path = 'msmarco_squad_train.json'
        dev_path = 'msmarco_squad_dev.json'

        version = 'v2'
    else:
        msg = '~Processing SQuAD dataset~'
        train_path = 'train-v1.1.json'
        dev_path = 'dev-v1.1.json'

    logger.warning(msg)
    if DEBUG_ON:
        logger.error('***DEBUGGING MODE***')
    train_path = os.path.join(args.data_dir, train_path)
    valid_path = os.path.join(args.data_dir, dev_path)

    logger.info('Train path is: {}'.format(train_path))

    logger.info('The path of training data: {}'.format(train_path))
    logger.info('The path of validation data: {}'.format(valid_path))
    logger.info('{}-dim word vector path: {}'.format(args.embedding_dim,
                                                     args.glove))
    # could be fasttext embedding
    emb_path = args.glove
    embedding_dim = args.embedding_dim
    set_environment(args.seed)
    if args.fasttext_on:
        logger.info('Loading fasttext vocab.')
    else:
        logger.info('Loading glove vocab.')
    # load data
    train_data = load_data(train_path, v2_on=v2_on, limit=20000)
    dev_data = load_data(valid_path, False, v2_on=v2_on, limit=500)

    wemb_vocab = load_emb_vocab(emb_path,
                                embedding_dim,
                                fast_vec_format=args.fasttext_on)
    logger.info('Build vocabulary')
    vocab, _, _ = build_vocab(train_data + dev_data,
                              wemb_vocab,
                              sort_all=args.sort_all,
                              clean_on=True,
                              cl_on=False)
    logger.info('Done with vocabulary collection')

    # loading ner/pos tagging vocab
    resource_path = 'resource'
    logger.info('Loading resource')

    with open(os.path.join(resource_path, 'vocab_tag.pick'), 'rb') as f:
        vocab_tag = pickle.load(f)
    with open(os.path.join(resource_path, 'vocab_ner.pick'), 'rb') as f:
        vocab_ner = pickle.load(f)

    meta_path = gen_name(args.data_dir, args.meta, version, suffix='pick')
    logger.info('building embedding')
    embedding = build_embedding(emb_path,
                                vocab,
                                embedding_dim,
                                fast_vec_format=args.fasttext_on)
    meta = {
        'vocab': vocab,
        'vocab_tag': vocab_tag,
        'vocab_ner': vocab_ner,
        'embedding': embedding
    }
    with open(meta_path, 'wb') as f:
        pickle.dump(meta, f)

    del meta
    del embedding
    logger.info('deleted meta and embedding')

    logger.info('building training data')
    train_fout = gen_name(args.data_dir, args.train_data, version)
    build_data(train_data,
               vocab,
               vocab_tag,
               vocab_ner,
               train_fout,
               True,
               NLP=NLP,
               v2_on=v2_on)

    logger.info('building dev data')
    dev_fout = gen_name(args.data_dir, args.dev_data, version)
    build_data(dev_data,
               vocab,
               vocab_tag,
               vocab_ner,
               dev_fout,
               False,
               NLP=NLP,
               v2_on=v2_on)
    end_time = time.time()
    logger.warning('It totally took {} minutes to process the data!!'.format(
        (end_time - start_time) / 60.))
Example #18
import time

import numpy as np
import torch

import utils
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler
import scipy
import statistics
import math
import scipy.stats as ss

tstart = time.time()
from config import set_args

########################################################################################################################


args = set_args()
args.output = './res/' + args.experiment + '_' + args.approach + '_' + str(args.note) + '.txt'


print('=' * 100)
print('Arguments =')
for arg in vars(args):
    print('\t' + arg + ':', getattr(args, arg))
print('=' * 100)

########################################################################################################################

# Seed
np.random.seed(args.seed)
torch.manual_seed(args.seed)
if torch.cuda.is_available(): torch.cuda.manual_seed(args.seed)
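For stricter reproducibility on GPU, the seeding above can be extended to every device and to cuDNN; this is standard PyTorch, at the cost of some speed:

torch.cuda.manual_seed_all(args.seed)        # seed all visible GPUs
torch.backends.cudnn.deterministic = True    # force deterministic cuDNN kernels
torch.backends.cudnn.benchmark = False       # disable autotuning, which can pick different kernels per run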
Example #19
def main():
    # Create an argument parser and read arguments from the command line
    args = set_args()
    # logger will be a global variable
    global logger
    start_time = time.time()
    logger = create_logger(__name__, to_disk=True, log_file=args.log_file)
    v2_on = args.v2_on
    if v2_on:
        msg = '~Processing SQuAD v2.0 dataset~'
        train_path = 'train-v2.0.json'
        dev_path = 'dev-v2.0.json'
        version = 'v2'
    else:
        msg = '~Processing SQuAD v1.1 dataset~'
        train_path = 'train-v1.1.json'
        dev_path = 'dev-v1.1.json'
        version = 'v1'

    logger.warning(msg)
    if DEBUG_ON:
        logger.error('***DEBUGGING MODE***')
    train_path = os.path.join(args.data_dir, train_path)
    valid_path = os.path.join(args.data_dir, dev_path)

    logger.info('The path of training data: {}'.format(train_path))
    logger.info('The path of validation data: {}'.format(valid_path))
    logger.info('{}-dim word vector path: {}'.format(args.embedding_dim, args.glove))

    # could be fasttext embedding
    emb_path = args.glove
    embedding_dim = args.embedding_dim
    set_environment(args.seed)
    if args.fasttext_on:
        logger.info('Loading fasttext vocab.')
    else:
        logger.info('Loading glove vocab.')

    # load data
    train_data = load_data(train_path, v2_on=v2_on)
    dev_data = load_data(valid_path, False, v2_on=v2_on)

    wemb_vocab = load_emb_vocab(emb_path, embedding_dim, fast_vec_format=args.fasttext_on)
    logger.info('Build vocabulary')
    vocab, _, _ = build_vocab(train_data + dev_data, wemb_vocab, sort_all=args.sort_all, clean_on=True, cl_on=False)
    logger.info('Done with vocabulary collection')

    # loading ner/pos tagging vocab
    resource_path = 'resource'
    logger.info('Loading resource')

    # what do these vocab tags and vocab ners do?
    with open(os.path.join(resource_path, 'vocab_tag.pick'), 'rb') as f:
        vocab_tag = pickle.load(f)
    with open(os.path.join(resource_path, 'vocab_ner.pick'), 'rb') as f:
        vocab_ner = pickle.load(f)

    meta_path = gen_name(args.data_dir, args.meta, version, suffix='pick')
    logger.info('building embedding')
    embedding = build_embedding(emb_path, vocab, embedding_dim, fast_vec_format=args.fasttext_on)
    meta = {'vocab': vocab, 'vocab_tag': vocab_tag, 'vocab_ner': vocab_ner, 'embedding': embedding}
    with open(meta_path, 'wb') as f:
        pickle.dump(meta, f)

    logger.info('building training data')
    train_fout = gen_name(args.data_dir, args.train_data, version)
    build_data(train_data, vocab, vocab_tag, vocab_ner, train_fout, True, NLP=NLP, v2_on=v2_on,
               bert_tokenizer=BERT_TOKENIZER)

    logger.info('building dev data')
    dev_fout = gen_name(args.data_dir, args.dev_data, version)
    build_data(dev_data, vocab, vocab_tag, vocab_ner, dev_fout, False, NLP=NLP, v2_on=v2_on,
               bert_tokenizer=BERT_TOKENIZER)
    end_time = time.time()
    logger.warning('It totally took {} minutes to process the data!!'.format((end_time - start_time) / 60.))