Example #1
def create_dataset(opt, SRC, TRG):

    print("creating dataset and iterator... ")

    raw_data = {
        'src': [line for line in opt.src_data],
        'trg': [line for line in opt.trg_data]
    }
    df = pd.DataFrame(raw_data, columns=["src", "trg"])

    mask = (df['src'].str.count(' ') <
            opt.max_strlen) & (df['trg'].str.count(' ') < opt.max_strlen)
    df = df.loc[mask]

    df.to_csv("translate_transformer_temp.csv", index=False)

    data_fields = [('src', SRC), ('trg', TRG)]
    train = data.TabularDataset('./translate_transformer_temp.csv',
                                format='csv',
                                fields=data_fields)


    train_iter = MyIterator(train,
                            batch_size=opt.batchsize,
                            device=torch.device('cuda'),
                            repeat=False,
                            sort_key=lambda x: (len(x.src), len(x.trg)),
                            batch_size_fn=batch_size_fn,
                            train=True,
                            shuffle=True)

    #os.remove('translate_transformer_temp.csv')

    if opt.load_weights is None:
        SRC.build_vocab(train)
        TRG.build_vocab(train)
        if opt.checkpoint > 0:
            try:
                os.mkdir("weights")
            except FileExistsError:
                print(
                    "weights folder already exists, run program with -load_weights weights to load them"
                )
                quit()
            pickle.dump(SRC, open('weights/SRC.pkl', 'wb'))
            pickle.dump(TRG, open('weights/TRG.pkl', 'wb'))

    opt.src_pad = SRC.vocab.stoi['<pad>']
    opt.trg_pad = TRG.vocab.stoi['<pad>']

    opt.train_len = get_len(train_iter)

    return train_iter
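A minimal sketch of how this version might be invoked, assuming the legacy torchtext Field API and an argparse-style opt namespace (the field options, file paths and attribute values below are illustrative assumptions, not taken from the original repository):

from argparse import Namespace
from torchtext import data  # legacy torchtext (<= 0.8) API assumed

SRC = data.Field(lower=True)
TRG = data.Field(lower=True, init_token='<sos>', eos_token='<eos>')

opt = Namespace(
    src_data=open('data/english.txt', encoding='utf-8').read().split('\n'),  # hypothetical paths
    trg_data=open('data/french.txt', encoding='utf-8').read().split('\n'),
    batchsize=1500,
    max_strlen=80,
    load_weights=None,
    checkpoint=0)

train_iter = create_dataset(opt, SRC, TRG)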
Example #2
	def create_dataset(self):

		print('Creating dataset and iterator')

		raw_data = {'src': [line for line in self.english_data], 'trg': [line for line in self.french_data]}
		df = pd.DataFrame(raw_data, columns=['src', 'trg'])

		mask = (df['src'].str.count(' ') < self.max_length) & (df['trg'].str.count(' ') < self.max_length)
		df = df.loc[mask]

		df.to_csv('data/translate_sentences.csv', index=False)

		data_fields = [('src', self.SRC), ('trg', self.TRG)]
		train = data.TabularDataset(path='data/translate_sentences.csv', format='csv', fields=data_fields)

		# The original snippet is truncated at this point; the remaining arguments
		# (attribute names assumed) follow the pattern of the other examples on this page.
		train_iter = MyIterator(train, batch_size=self.batch_size, device=self.device,
			repeat=False, sort_key=lambda x: (len(x.src), len(x.trg)),
			batch_size_fn=batch_size_fn, train=True, shuffle=True)

		return train_iter
Example #3
def create_dataset(opt, SRC, TRG):
    print("creating dataset and iterator... ")
    raw_data = {
        'src': [line for line in opt.src_data],
        'trg': [line for line in opt.trg_data]
    }
    # start building a temporary csv file here
    df = pd.DataFrame(raw_data, columns=["src", "trg"])
    mask = (df['src'].str.count(' ') <
            opt.max_strlen) & (df['trg'].str.count(' ') < opt.max_strlen)
    df = df.loc[mask]
    df.to_csv("translate_transformer_temp.csv", index=False)
    data_fields = [('src', SRC), ('trg', TRG)]
    train = data.TabularDataset('./translate_transformer_temp.csv',
                                format='csv',
                                fields=data_fields)
    train_iter = MyIterator(train,
                            batch_size=opt.batchsize,
                            device=opt.device,
                            repeat=False,
                            sort_key=lambda x: (len(x.src), len(x.trg)),
                            batch_size_fn=batch_size_fn,
                            train=True,
                            shuffle=True)
    # os.remove('translate_transformer_temp.csv')
    # the temporary csv file would be deleted here
    if not opt.premodels or os.path.exists(opt.load_weights + "/" +
                                           opt.premodels_path):  # load weights
        SRC.build_vocab(train)  # build the data vocabulary
        TRG.build_vocab(train)
        if opt.checkpoint > 0:
            if not os.path.exists(opt.load_weights):
                os.mkdir(opt.load_weights)
            else:
                print(
                    "weights folder already exists, run program with -load_weights weights to load them"
                )
            pickle.dump(SRC, open(config.weights + '/SRC.pkl', 'wb'))
            pickle.dump(TRG, open(config.weights + '/TRG.pkl', 'wb'))
    opt.src_pad = SRC.vocab.stoi['<pad>']
    opt.trg_pad = TRG.vocab.stoi['<pad>']
    return train_iter
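The SRC/TRG fields pickled above would typically be restored on a later run with -load_weights; the loading code is not shown on this page, but a minimal sketch (using the weights folder layout of Examples #1, #5 and #6) would be:

import pickle

# restore the torchtext fields saved by create_dataset
SRC = pickle.load(open('weights/SRC.pkl', 'rb'))
TRG = pickle.load(open('weights/TRG.pkl', 'rb'))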
Example #4
def create_dataset(opt, SRC, TRG):

    print("creating dataset and iterator...")

    raw_data = {
        'src': [line for line in opt.src_data],
        'trg': [line for line in opt.trg_data]
    }
    df = pd.DataFrame(raw_data, columns=["src", "trg"])

    mask = (df['src'].str.count(' ') <
            opt.max_strlen) & (df['trg'].str.count(' ') < opt.max_strlen)
    df = df.loc[mask]

    df.to_csv("translate_transformer_temp.csv", index=False)

    data_fields = [('src', SRC), ('trg', TRG)]
    train = data.TabularDataset('./translate_transformer_temp.csv',
                                format='csv',
                                fields=data_fields)

    train_iter = MyIterator(train,
                            batch_size=opt.batchsize,
                            device=opt.device,
                            repeat=False,
                            sort_key=lambda x: (len(x.src), len(x.trg)),
                            batch_size_fn=batch_size_fn,
                            train=True,
                            shuffle=True)

    os.remove('translate_transformer_temp.csv')

    if opt.load_weights is None:
        SRC.build_vocab(train)
        TRG.build_vocab(train)

    opt.src_pad = SRC.vocab.stoi['<pad>']
    opt.trg_pad = TRG.vocab.stoi['<pad>']

    opt.train_len = get_len(train_iter)

    return train_iter
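Every example above batches with MyIterator; its typical definition, in the style of the Annotated Transformer, pools examples of similar length before batching (a sketch of the usual implementation, which may differ from the exact class used in these repositories):

from torchtext import data  # legacy torchtext API assumed

class MyIterator(data.Iterator):
    def create_batches(self):
        if self.train:
            def pool(d, random_shuffler):
                # pre-sort a large pool of examples so each batch contains
                # sequences of similar length, which minimises padding
                for p in data.batch(d, self.batch_size * 100):
                    p_batch = data.batch(sorted(p, key=self.sort_key),
                                         self.batch_size, self.batch_size_fn)
                    for b in random_shuffler(list(p_batch)):
                        yield b
            self.batches = pool(self.data(), self.random_shuffler)
        else:
            self.batches = []
            for b in data.batch(self.data(), self.batch_size,
                                self.batch_size_fn):
                self.batches.append(sorted(b, key=self.sort_key))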
Example #5
def create_dataset(opt, SRC, TRG, word_emb):
    timer = Timer()
    print(f"{pd.to_datetime('today')}: creating dataset and iterator... ")
    train = generate_tabular_dataset(opt)
    valid = generate_tabular_dataset(opt, valid=True)

    if opt.nmt_model_type == 'transformer':
        if opt.use_dynamic_batch:
            train_iter, valid_iter = MyIterator.splits(
                (train, valid),
                batch_size=opt.batchsize,
                device=opt.device,
                repeat=False,
                sort_key=lambda x: (len(x.src), len(x.trg)),
                batch_size_fn=batch_size_fn,
                shuffle=True)  # batch_size_fn = dynamic batching
        else:
            train_iter, valid_iter = MyIterator.splits(
                (train, valid),
                batch_size=opt.batchsize,
                device=opt.device,
                repeat=False,
                sort_key=lambda x: (len(x.src), len(x.trg)),
                shuffle=True)
    else:
        train_iter, valid_iter = data.BucketIterator.splits(
            (train, valid),
            sort_key=lambda x: (len(x.src), len(x.trg)),
            batch_size=opt.batchsize,
            device=opt.device)

    os.remove('translate_transformer_temp.csv')

    if opt.load_weights is None:
        SRC.build_vocab(train, min_freq=2)
        TRG.build_vocab(train, min_freq=2)
        if word_emb is not None:
            SRC_vectors = embedding_to_torchtext_vocab_translator(
                SRC, word_emb)
            TRG_vectors = embedding_to_torchtext_vocab_translator(
                TRG, word_emb)
            # put word embedding vectors inside vocab class
            SRC.vocab.set_vectors(SRC.vocab.stoi, SRC_vectors, opt.d_model)
            TRG.vocab.set_vectors(TRG.vocab.stoi, TRG_vectors, opt.d_model)

        if opt.checkpoint > 0:
            try:
                os.mkdir("weights")
            except FileExistsError:
                print(
                    "weights folder already exists; deleting and recreating it in 5 seconds. "
                    "If this was unexpected, stop the program now and rerun with -load_weights weights to load them"
                )
                time.sleep(5)
                shutil.rmtree("weights")
                os.mkdir("weights")
            pickle.dump(SRC, open('weights/SRC.pkl', 'wb'))
            pickle.dump(TRG, open('weights/TRG.pkl', 'wb'))

    opt.src_pad = SRC.vocab.stoi['<pad>']  # index of the pad token, used for masking
    opt.trg_pad = TRG.vocab.stoi['<pad>']  # all sequences in a batch are padded to the same length
    opt.trg_sos = TRG.vocab.stoi['<sos>']
    opt.SRC = SRC
    opt.TRG = TRG

    opt.train_len = get_len(train_iter)
    timer.print_time('create_dataset')

    return train_iter, valid_iter, SRC, TRG
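Example #5 passes batch_size_fn to enable dynamic (token-count) batching. A typical implementation, again in the style of the Annotated Transformer, sizes batches by padded token count rather than by number of sentences (a sketch, not necessarily the exact function used here):

max_src_in_batch, max_tgt_in_batch = 0, 0

def batch_size_fn(new, count, sofar):
    # track the longest source/target seen so far in the current batch and
    # return the padded token count the batch would have if `new` is added
    global max_src_in_batch, max_tgt_in_batch
    if count == 1:
        max_src_in_batch, max_tgt_in_batch = 0, 0
    max_src_in_batch = max(max_src_in_batch, len(new.src))
    max_tgt_in_batch = max(max_tgt_in_batch, len(new.trg) + 2)  # +2 for <sos>/<eos>
    return max(count * max_src_in_batch, count * max_tgt_in_batch)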
Example #6
def create_dataset(opt, SRC, TRG):
    print("creating dataset and iterator... ")

    if opt.task == 'toy_task' or opt.task == 'e_snli_o':
        # Load in validation data
        f_in = open(opt.data_path + '/val_in.txt', 'r', encoding='utf-8')
        f_out = open(opt.data_path + '/val_out.txt', 'r', encoding='utf-8')
        in_ = [x.replace('\n', '') for x in f_in.readlines()]
        out_ = [x.replace('\n', '') for x in f_out.readlines()]

        raw_data = {'src': in_, 'trg': out_}
        df = pd.DataFrame(raw_data, columns=["src", "trg"])

        mask = (df['src'].str.count(' ') <
                opt.max_strlen) & (df['trg'].str.count(' ') < opt.max_strlen)
        df = df.loc[mask]

        df.to_csv("translate_transformer_temp.csv", index=False)
        data_fields = [('src', SRC), ('trg', TRG)]
        val = data.TabularDataset('./translate_transformer_temp.csv',
                                  format='csv',
                                  fields=data_fields,
                                  skip_header=True)
        os.remove('translate_transformer_temp.csv')

        val_iter = MyIterator(val,
                              batch_size=opt.batchsize,
                              repeat=False,
                              sort_key=lambda x: (len(x.src), len(x.trg)),
                              train=False,
                              shuffle=False)
    elif opt.task == 'e_snli_r':
        # Load in validation data
        f_in = open(opt.data_path + '/val_in.txt', 'r', encoding='utf-8')
        f_out = open(opt.data_path + '/val_out.txt', 'r', encoding='utf-8')
        if opt.label_path is None:
            raise AssertionError(
                'Need to provide a path to label data for validation checks')

        f_label = open(opt.label_path + '/val_out.txt', 'r', encoding='utf-8')

        in_ = [x.replace('\n', '') for x in f_in.readlines()]
        out_ = [x.replace('\n', '') for x in f_out.readlines()]
        labels_ = [x.replace('\n', '') for x in f_label.readlines()]
        out1, out2, out3 = [], [], []
        for o in out_:
            split = o.split(' @@SEP@@ ')
            out1.append(split[0])
            out2.append(split[1])
            out3.append(split[2])

        raw_data = {
            'src': in_,
            'trg1': out1,
            'trg2': out2,
            'trg3': out3,
            'labels': labels_
        }
        df = pd.DataFrame(raw_data,
                          columns=["src", "trg1", "trg2", "trg3", "labels"])

        mask = (df['src'].str.count(' ') < opt.max_strlen) & (df['trg1'].str.count(' ') < opt.max_strlen) & \
               (df['trg2'].str.count(' ') < opt.max_strlen) & (df['trg3'].str.count(' ') < opt.max_strlen)
        df = df.loc[mask]

        df.to_csv("translate_transformer_temp.csv", index=False)
        data_fields = [('src', SRC), ('trg1', TRG), ('trg2', TRG),
                       ('trg3', TRG), ('label', opt.classifier_TRG)]
        val = data.TabularDataset('./translate_transformer_temp.csv',
                                  format='csv',
                                  fields=data_fields,
                                  skip_header=True)
        os.remove('translate_transformer_temp.csv')

        val_iter = MyIterator(
            val,
            batch_size=opt.batchsize,
            repeat=False,
            sort_key=lambda x:
            (len(x.src), len(x.trg1), len(x.trg2), len(x.trg3)),
            train=False,
            shuffle=False)

    else:
        # cos_e
        raise NotImplementedError(
            "No implementation provided in process.py for cos-e (yet)")

    ##### TRAIN DATA #####
    raw_data = {
        'src': [line for line in opt.src_data],
        'trg': [line for line in opt.trg_data]
    }
    df = pd.DataFrame(raw_data, columns=["src", "trg"])

    mask = (df['src'].str.count(' ') <
            opt.max_strlen) & (df['trg'].str.count(' ') < opt.max_strlen)
    df = df.loc[mask]

    df.to_csv("translate_transformer_temp.csv", index=False)
    data_fields = [('src', SRC), ('trg', TRG)]
    train = data.TabularDataset('./translate_transformer_temp.csv',
                                format='csv',
                                fields=data_fields,
                                skip_header=True)
    print('desired batch size', opt.batchsize)

    train_iter = MyIterator(
        train,
        batch_size=opt.batchsize,  # device=opt.device,
        repeat=False,
        sort_key=lambda x: (len(x.src), len(x.trg)),
        train=True,
        shuffle=True)
    os.remove('translate_transformer_temp.csv')

    if opt.load_weights is None:
        if opt.checkpoint > 0:
            try:
                os.mkdir("weights")
            except FileExistsError:
                print(
                    "weights folder already exists, run program with -load_weights weights to load them"
                )
                quit()
            pickle.dump(SRC, open('weights/SRC.pkl', 'wb'))
            pickle.dump(TRG, open('weights/TRG.pkl', 'wb'))

    opt.src_pad = SRC.vocab.stoi['<pad>']
    opt.trg_pad = TRG.vocab.stoi['<pad>']

    opt.train_len = get_len(train_iter)
    print('number of train batches:', opt.train_len)
    print('number of val batches:', get_len(val_iter))
    return train_iter, val_iter
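Several of the examples record opt.train_len with get_len(train_iter). That helper is not shown on this page; a minimal sketch that simply counts the batches in one pass (an assumed behaviour) would be:

def get_len(train_iter):
    # iterate over the (non-repeating) iterator once and count the batches
    return sum(1 for _ in train_iter)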