Beispiel #1
0
def save_label(train_files, label_file, task=2, firstline=True):
    """Collect the distinct label tokens of the training files and write them out.

    Each dataset returned by ``Tokenizer.load_file`` is iterated as pairs
    whose second item is a label string; that string is split on whitespace
    and every resulting token is added to one set.

    Args:
        train_files: Passed unchanged to ``Tokenizer.load_file``.
        label_file: Destination handed to ``TXT.write``.
        task: Forwarded to ``Tokenizer.load_file``.
        firstline: Forwarded to ``Tokenizer.load_file``.
    """
    unique_labels = {
        token
        for dataset in Tokenizer.load_file(train_files, firstline=firstline, task=task)
        for _, label in dataset
        for token in label.split()
    }
    TXT.write(unique_labels, label_file)
Beispiel #2
0
    def build_data(args):
        """Prepare output folders, vocabularies and pre-trained embeddings.

        When ``args.tl`` is falsy a fresh setup is built: the model directory
        (plus an optional timestamped sub-folder) is created, a vocabulary is
        either built from the train/dev files or loaded from
        ``args.vocab_file`` (when ``args.tokenize_type == "bpe"``), optional
        pre-trained embedding matrices are read, and the updated arguments are
        stored with ``SaveloadHP.save``.

        When ``args.tl`` is truthy (transfer learning) the arguments saved by
        a previous run are loaded from ``args.tlargs`` and their model
        directory, seq2seq file and pre-trained embeddings are copied onto
        ``args``.

        Args:
            args: Argument namespace; it is modified in place.

        Returns:
            The same ``args`` object, after modification.

        Raises:
            FileNotFoundError: If transfer learning is requested but
                ``args.tlargs`` does not exist.
        """
        if args.tl:
            print("INFO: - Use transfer learning technique")
            # Raise explicitly instead of asserting: asserts are stripped
            # under ``python -O`` and the old message evaluated to ``None``.
            if not os.path.exists(args.tlargs):
                raise FileNotFoundError("\t - There is no pre-trained argument file")
            # Load the argument file written by a previous training run.
            margs = SaveloadHP.load(args.tlargs)

            # TODO update new vocab and all other new arguments used for new training
            # 0. Read vocab
            # 1. Update schema
            # 2. Update vocab
            # 3. Use all model file directory of previous train
            args.model_dir = margs.model_dir
            args.seq2seq_file = margs.seq2seq_file
            # 4. Keep the remaining current arguments
            # add a constraint at the loading time that if fail to load any model, just skip it
            args.swd_pretrained = margs.swd_pretrained
            args.twd_pretrained = margs.twd_pretrained
            return args

        # Optional sub-folder named after the current time (minute precision).
        sub_folder = ""
        if args.timestamped_subdir:
            sub_folder = datetime.now().strftime("%Y_%m_%d_%H_%M")
        args.model_dir = os.path.join(args.model_dir, sub_folder)
        # Creates the base directory and the sub-folder in one step and
        # tolerates directories that already exist.
        os.makedirs(args.model_dir, exist_ok=True)
        args.log_file = os.path.join(args.model_dir, args.log_file)

        if args.tokenize_type != "bpe":
            s_paras = [args.wl_th, args.wcutoff]
            t_paras = [args.wl_th, args.wcutoff]
            print("INFO: - Build vocabulary...")

            tokenizer = Tokenizer(s_paras, t_paras)
            files = [args.train_file]
            if args.train_file != args.dev_file:
                files.append(args.dev_file)
            # Load datasets to build vocabulary
            data = Tokenizer.load_file(files, task=2)
            tokenizer.build(datasets=data)
            sw2i = tokenizer.sw2i
            tw2i = tokenizer.tw2i
            print("INFO: - Save vocabulary...")
            Tokenizer.save(tokenizer, os.path.join(args.model_dir, "tokenizer.vocab"))
        else:
            print("INFO: - Load vocabulary...")
            tokenizer = BPE.load(args.vocab_file)
            tokenizer.add_tokens(sys_tokens)
            # Source and target share the single loaded vocabulary.
            sw2i = tokenizer.get_vocab()
            tw2i = tokenizer.get_vocab()

        args.swd_pretrained = None
        args.twd_pretrained = None

        # Source language
        if args.swd_embfile:
            emb_reader = Embeddings(args.swd_embfile)
            scale = np.sqrt(3.0 / args.swd_dim)
            args.swd_pretrained = emb_reader.get_W(args.swd_dim, sw2i, scale)
            if args.twd_embfile == args.swd_embfile:
                # Same embedding file on both sides: reuse the open reader.
                scale = np.sqrt(3.0 / args.twd_dim)
                args.twd_pretrained = emb_reader.get_W(args.twd_dim, tw2i, scale)

        # Target language: only needed when it was not already derived from
        # the shared file above; read the *target* embedding file.
        if args.twd_embfile and args.twd_pretrained is None:
            emb_reader = Embeddings(args.twd_embfile)
            scale = np.sqrt(3.0 / args.twd_dim)
            args.twd_pretrained = emb_reader.get_W(args.twd_dim, tw2i, scale)

        # directly integrate transfer learning if no updating new words
        SaveloadHP.save(args, os.path.join(args.model_dir, args.model_args))
        return args
Beispiel #3
0
    Data2tensor.set_randseed(12345)
    device = torch.device("cpu")
    dtype = torch.long
    use_cuda = False
    filename = "/media/data/classification/datasets/yelp_review_full_csv/train.csv"
    label_file = "/media/data/classification/datasets/yelp_review_full_csv/labels.txt"
    labels_list = TXT.read(label_file, firstline=False)
    lb2id_dict = Tokenizer.list2dict(sys_tokens + labels_list)
    id2lb_dict = Tokenizer.reversed_dict(lb2id_dict)
    lb2ids = Tokenizer.lst2idx(tokenizer=Tokenizer.process_target, vocab_words=lb2id_dict,
                               unk_words=False, sos=False, eos=False)
    tokenize_type = "bpe"
    if tokenize_type != "bpe":
        # Load datasets to build vocabulary
        data = Tokenizer.load_file([filename], task=1)
        s_paras = [-1, 1]
        t_paras = [-1, 1]
        tokenizer = Tokenizer(s_paras, t_paras)
        tokenizer.build(data)
        nl2ids = Tokenizer.lst2idx(tokenizer=Tokenizer.process_nl, vocab_words=tokenizer.sw2i,
                                   unk_words=True, sos=False, eos=False)
        tokenizer.tw2i = lb2id_dict
        tokenizer.i2tw = id2lb_dict
        tg2ids = Tokenizer.lst2idx(tokenizer=Tokenizer.process_target, vocab_words=tokenizer.tw2i,
                                   unk_words=False, sos=False, eos=False)
        pad_id = tokenizer.sw2i.get(PAD, 0)
        sw_size = len(tokenizer.sw2i)
        tw_size = len(tokenizer.tw2i)
        collate_fn = Tokenizer.collate_fn(pad_id, True)
    else: