# These examples use the legacy torchtext API (torchtext.data / torchtext.vocab);
# GrandDataset is a custom torchtext Dataset defined elsewhere in the project.
from torch.nn import init
from torchtext import data
from torchtext.vocab import Vectors


def torchLoad(config):

    TEXT = data.Field(sequential=True, fix_length=config.kwargs['padding_size'])
    LABEL = data.Field(sequential=True, use_vocab=False)

    train = GrandDataset(config.kwargs['raw_train_path'], text_field=TEXT, label_field=LABEL, config=config, test=False)
    val = GrandDataset(config.kwargs['raw_vali_path'], text_field=TEXT, label_field=LABEL, config=config, test=False)
    test = GrandDataset(config.kwargs['raw_test_path'], text_field=TEXT, label_field=None, config=config, test=True)

    cache = '../cache/'
    # Load word2vec vectors
    embedding_path = config.kwargs['embedding_path']
    print('load word2vec vectors from {}'.format(embedding_path))
    vectors = Vectors(name=embedding_path, cache=cache)
    vectors.unk_init = init.xavier_uniform_
    print('building {} vocabulary......'.format('Word'))
    TEXT.build_vocab(train, val, test, min_freq=1, vectors=vectors)


    train_iter = data.Iterator(dataset=train, batch_size=config.kwargs['batch_size'], sort=False, shuffle=True,
                               repeat=False, device=-1)
    val_iter = data.Iterator(dataset=val, batch_size=config.kwargs['batch_size'], shuffle=False, sort=False,
                             repeat=False, device=-1)

    test_iter = data.Iterator(dataset=test, batch_size=config.kwargs['batch_size'], shuffle=False, sort=False,
                              repeat=False, device=-1)

    numerical_dict = TEXT.vocab.stoi

    return train_iter, val_iter, test_iter, TEXT.vocab.vectors, numerical_dict



# torchLoad(config)
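# Hedged usage sketch (not from the original): how the values returned by torchLoad
# might be consumed. `config` is assumed to be defined as above; '<pad>' at index 1
# follows torchtext's default vocab specials, and `import torch` is assumed.
# train_iter, val_iter, test_iter, vectors, numerical_dict = torchLoad(config)
# vectors is a FloatTensor of shape (len(vocab), embedding_dim) aligned with
# numerical_dict (the token -> index mapping from TEXT.vocab.stoi), so it can
# initialise an embedding layer:
# pad_idx = numerical_dict['<pad>']
# embedding = torch.nn.Embedding.from_pretrained(vectors, freeze=False)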
def get_dataset(fix_length=100, lower=False, vectors=None):
    if vectors is not None:
        lower = True
    logging.info("预处理 csv......")
    prepare_csv()

    TEXT = Field(sequential=True,
                 fix_length=fix_length,
                 tokenize=tokenizer,
                 pad_first=True,
                 lower=lower)
    LABEL = Field(sequential=False, use_vocab=False)

    train_datafields = [("id", None), ("comment_text", TEXT), ("toxic", LABEL),
                        ("severe_toxic", LABEL), ("threat", LABEL),
                        ("obscene", LABEL), ("insult", LABEL),
                        ("identity_hate", LABEL)]

    logging.info("读取 train.csv......")
    train, val = TabularDataset.splits(path='cache',
                                       train='train.csv',
                                       validation="val.csv",
                                       format='csv',
                                       skip_header=True,
                                       fields=train_datafields)
    logging.info("读取 test.csv......")
    test = TabularDataset(path='cache/test.csv',
                          format='csv',
                          skip_header=True,
                          fields=[('id', None), ('comment_text', TEXT)])

    logging.info('Loading GloVe word vectors......')
    # vectors = GloVe(name='6B', dim=300)  # this would download the vectors
    # load local word vectors instead
    cache = '.vector_cache'
    if not os.path.exists(cache):
        os.mkdir(cache)
    vectors = Vectors(
        name='/home/sunyan/quora/input/embeddings/glove.840B.300d.txt',
        cache=cache,
        max_vectors=200000)
    vectors.unk_init = init.xavier_uniform_

    logging.info('Building vocabulary......')
    TEXT.build_vocab(train, test, max_size=20000, min_freq=50, vectors=vectors)

    print(TEXT.vocab.freqs.most_common(10))

    logging.info("预处理结束!")

    return (train, val, test), TEXT
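

# Hedged usage sketch (not from the original): turning the datasets returned by
# get_dataset into iterators; BucketIterator and Iterator come from torchtext.data,
# and the batch size, device and sort_key field below are assumptions.
# (train, val, test), TEXT = get_dataset(fix_length=100)
# train_iter, val_iter = BucketIterator.splits(
#     (train, val),
#     batch_size=64,
#     sort_key=lambda x: len(x.comment_text),
#     sort_within_batch=True,
#     device=-1)
# test_iter = Iterator(test, batch_size=64, shuffle=False, sort=False, repeat=False, device=-1)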
Example #3
def load_test_data(option):

    #======
    Text_field = data.Field(sequential=True, fix_length=option.max_text_len)
    #Label_field = data.Field(sequential=False, use_vocab=False)

    #======
    test_path = option.data_path + option.text_type + '/test_set.csv'
    #if option.aug:
    #    print('make augmentation datasets!')

    test = buildDataset(test_path,
                        text_field=Text_field,
                        label_field=None,
                        text_type=option.text_type,
                        test=True)

    #======
    cache = '.vector_cache'
    if not os.path.exists(cache):
        os.mkdir(cache)
    embedding_path = '{}/{}_{}_.txt'.format(option.embedding_path,
                                            option.text_type, option.emb_size)
    print('embedding_path:', embedding_path)

    vectors = Vectors(name=embedding_path, cache=cache)
    print('load word2vec vectors from {}'.format(embedding_path))
    vectors.unk_init = init.xavier_uniform_
    # To specify how missing Vector entries are initialized: set vectors.unk_init = init.xavier_uniform_
    # before passing the vectors to build_vocab.

    #====== build vocab
    print('building {} vocabulary......'.format(option.text_type))
    Text_field.build_vocab(test, min_freq=option.min_freq, vectors=vectors)
    print('vocabulary has been made!\n')

    #====== build Iterator
    '''
    1. For test_iter, shuffle, sort and repeat must all be set to False, otherwise torchtext will scramble the sample order.
    2. For variable-length input sequences, set sort_within_batch=True so that the data within each batch is sorted
       in descending order by sort_key (see the sketch after this function).
    '''
    print('building {} Iterator......'.format(option.text_type))
    test_iter = data.Iterator(dataset=test,
                              batch_size=option.batch_size,
                              shuffle=False,
                              sort=False,
                              repeat=False,
                              device=option.device)
    print('Iterator has been made!\n')

    return test_iter
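
# Hedged sketch (not from the original) illustrating point 2 of the note above: for
# variable-length input (no fix_length), a BucketIterator with sort_within_batch=True
# sorts each batch by sort_key in descending order, e.g. for use with pack_padded_sequence.
# The dataset and field names mirror the function above and are assumptions.
# train_iter = data.BucketIterator(dataset=train,
#                                  batch_size=option.batch_size,
#                                  sort_key=lambda x: len(x.text),
#                                  sort_within_batch=True,
#                                  shuffle=True,
#                                  repeat=False,
#                                  device=option.device)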
Example #4
def load_data(opt):
    # fix_length pads/truncates every example to opt.max_text_len
    TEXT = data.Field(sequential=True, fix_length=opt.max_text_len)  # word- or character-level tokens
    LABEL = data.Field(sequential=False, use_vocab=False)

    # load
    # word/ or article/
    train_path = opt.data_path + opt.text_type + '/train_set.csv'
    val_path = opt.data_path + opt.text_type + '/val_set.csv'
    test_path = opt.data_path + opt.text_type + '/test_set.csv'
    # the hard-coded paths below override the opt-based ones above
    train_path = 'util/word/train_set.csv'
    test_path = 'util/word/test_set.csv'
    val_path = 'util/word/val_set.csv'

    # aug for data augmentation
    if opt.aug:
        print('make augmentation datasets!')
    train = GrandDataset(train_path, text_field=TEXT, label_field=LABEL, text_type=opt.text_type, test=False,
                         aug=opt.aug)
    val = GrandDataset(val_path, text_field=TEXT, label_field=LABEL, text_type=opt.text_type, test=False)
    test = GrandDataset(test_path, text_field=TEXT, label_field=None, text_type=opt.text_type, test=True)

    cache = '.vector_cache'
    if not os.path.exists(cache):
        os.mkdir(cache)
    embedding_path = '{}/{}_{}_.txt'.format(opt.embedding_path, opt.text_type, opt.embedding_dim)

    vectors = Vectors(name=embedding_path, cache=cache)
    print('load word2vec vectors from {}'.format(embedding_path))
    vectors.unk_init = init.xavier_uniform_  # initialization for tokens not covered by the pretrained vectors

    # Build the vocab
    print('building {} vocabulary......'.format(opt.text_type))
    TEXT.build_vocab(train, val, test, min_freq=5, vectors=vectors)
    # LABEL.build_vocab(train)

    # Build the iterators
    # For test_iter, shuffle, sort and repeat must all be set to False, otherwise torchtext will scramble the sample order.
    # For variable-length input sequences, set sort_within_batch=True so that the data within each batch is sorted in descending order by sort_key.
    train_iter = data.BucketIterator(dataset=train, batch_size=opt.batch_size, shuffle=True, sort_within_batch=False,
                                     repeat=False, device=opt.device)
    # val_iter = data.BucketIterator(dataset=val, batch_size=opt.batch_size, sort_within_batch=False, repeat=False,
    #                                device=opt.device)
    # train_iter = data.Iterator(dataset=train, batch_size=opt.batch_size, train=True, repeat=False, device=opt.device)
    val_iter = data.Iterator(dataset=val, batch_size=opt.batch_size, shuffle=False, sort=False, repeat=False,
                             device=opt.device)
    test_iter = data.Iterator(dataset=test, batch_size=opt.batch_size, shuffle=False, sort=False, repeat=False,
                              device=opt.device)

    return train_iter, val_iter, test_iter, len(TEXT.vocab), TEXT.vocab.vectors
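
# Hedged usage sketch (not from the original): consuming the iterators returned by
# load_data. `opt` and the batch attribute names (text / label) depend on the fields
# defined in GrandDataset and are assumptions here.
# train_iter, val_iter, test_iter, vocab_size, vectors = load_data(opt)
# for batch in train_iter:
#     x = batch.text    # LongTensor of shape (opt.max_text_len, batch_size); batch_first is not set
#     y = batch.label   # label tensor for the batch
#     ...
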
Example #5
    # __init__ of a data-loader class; the enclosing class statement is not shown in the original snippet.
    def __init__(self,
                 data_fields,
                 train_file,
                 valid_file,
                 batch_size,
                 device,
                 skip_header,
                 delimiter,
                 pre_embeddings,
                 vector_cache,
                 min_freq=2,
                 extend_vocab=True,
                 pre_vocab_size=200000,
                 use_pre_embedding=False):
        self.x_field = Field(sequential=True,
                             tokenize=self.word_tokenize,
                             batch_first=True,
                             include_lengths=True)
        self.y_field = LabelField(batch_first=True)
        self.train_fields, self.x_var, self.y_vars = self.parse_fields(
            data_fields, self.x_field, self.y_field)

        self.train_ds = TabularDataset(
            train_file,
            fields=self.train_fields,
            skip_header=skip_header,
            format="csv",
            csv_reader_params={"delimiter": delimiter})
        self.valid_ds = TabularDataset(
            valid_file,
            fields=self.train_fields,
            skip_header=skip_header,
            format="csv",
            csv_reader_params={"delimiter": delimiter})

        self.x_field.build_vocab(self.train_ds, min_freq=min_freq)
        if use_pre_embedding:
            vectors = Vectors(pre_embeddings, vector_cache)
            if extend_vocab:
                self.extend_vocab_with_vectors(self.x_field.vocab, vectors,
                                               pre_vocab_size)
            vectors.unk_init = partial(init_unk,
                                       vocab_size=len(self.x_field.vocab))
            self.x_field.vocab.load_vectors(vectors)
        self.y_field.build_vocab(self.train_ds)

        self.train_iter, self.valid_iter = BucketIterator.splits(
            (self.train_ds, self.valid_ds),
            batch_size=batch_size,
            device=device,
            sort=False,
            sort_key=lambda sample: len(getattr(sample, self.x_var)),
            sort_within_batch=False,
            shuffle=True,
            repeat=False,
        )

        self.vocab = self.x_field.vocab
        self.vocab_size = len(self.x_field.vocab)
        self.num_labels = len(self.y_vars)
        self.num_classes = len(self.y_field.vocab)
        self.classes = list(self.y_field.vocab.stoi.values())
        self.unk_token = self.x_field.unk_token
        self.pad_token = self.x_field.pad_token
        self.unk_idx = self.x_field.vocab.stoi[self.unk_token]
        self.pad_idx = self.x_field.vocab.stoi[self.pad_token]
        self.train_wrapper = BatchWrapper(self.train_iter, self.x_var,
                                          self.y_vars)
        self.valid_wrapper = BatchWrapper(self.valid_iter, self.x_var,
                                          self.y_vars)
Example #6
test_dataset_torchtext = MyDataset("data/tnews/test.json", TEXT, LABEL, True)
for example in test_dataset_torchtext[:3]:  # each item is a single Example, not a batch
    print(example.text)


# **Embedding**

# The following shows how to use pretrained word vectors in torchtext and then pass them to a neural network model for training.

# In[25]:


# Load the pretrained word vectors
vectors = Vectors(name="data/tnews_jieba_tencent_embeddings.txt")
# Specify how missing vector entries, i.e. OOV words, are initialized
vectors.unk_init = nn.init.uniform_
# Only the training set is used to build the vocabulary here
TEXT.build_vocab(train_dataset_torchtext, vectors=vectors)
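
# A minimal sketch (not from the original notebook): copy the pretrained vectors from the
# vocabulary into an nn.Embedding layer so they can be fed to a model; freeze=False is a choice.
embedding = nn.Embedding.from_pretrained(TEXT.vocab.vectors, freeze=False)
print(embedding.weight.shape)  # (len(TEXT.vocab), embedding_dim)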


# In[26]:


# Count word frequencies
TEXT.vocab.freqs.most_common(10)


# **Iterators**
#
# * **Iterator**: builds batches while keeping the sample order unchanged, suitable for the test set (see the sketch below)
#
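
# A minimal sketch (not from the original): an Iterator over the test set that keeps the
# sample order; the batch size is an assumption, and the legacy torchtext.data namespace
# is assumed to match the rest of these examples.
from torchtext.data import Iterator

test_iter = Iterator(test_dataset_torchtext,
                     batch_size=16,
                     train=False,
                     shuffle=False,
                     sort=False,
                     repeat=False)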