import re
import string

from sklearn.datasets import fetch_20newsgroups
from fastNLP import DataSet, Instance, Vocabulary
from fastNLP.core.const import Const


def readdata():
    global target_len
    min_count = 10
    #categories = ['comp.os.ms-windows.misc', 'rec.motorcycles', 'sci.space', 'talk.politics.misc', ]
    dataset_train = fetch_20newsgroups(subset='train', data_home='../../..')
    dataset_test = fetch_20newsgroups(subset='test', data_home='../../..')

    data = dataset_train.data
    target = dataset_train.target
    target_len = len(dataset_train.target_names)
    train_data =  DataSet()
    padding = 0
    for i in range(len(data)):
        data_t = re.sub(r"\d+|\s+|/", " ", data[i])
        temp = [word.strip(string.punctuation).lower() for word in data_t.split() if word.strip(string.punctuation) != '']
        train_data.append(Instance(raw = data[i], label = int(target[i]), words = temp))
        if len(temp) > padding:
            padding = len(temp)
    train_data.apply(lambda x: x['raw'].lower(), new_field_name='raw')

    data = dataset_test.data
    target = dataset_test.target
    test_data =  DataSet()
    padding = 0
    for i in range(len(data)):
        data_t = re.sub(r"\d+|\s+|/", " ", data[i])
        temp = [word.strip(string.punctuation).lower() for word in data_t.split() if word.strip(string.punctuation) != '']
        test_data.append(Instance(raw = data[i], label = int(target[i]), words = temp))
        if len(temp) > padding:
            padding = len(temp)
    test_data.apply(lambda x: x['raw'].lower(), new_field_name='raw')

    train_data.apply(lambda x: len(x['words']), new_field_name='len')
    test_data.apply(lambda x: len(x['words']), new_field_name='len')

    vocab = Vocabulary(min_freq=min_count)
    train_data.apply(lambda x: [vocab.add(word) for word in x['words']])
    vocab.build_vocab()
    train_data.apply(lambda x: [vocab.to_index(word) for word in x['words']], new_field_name='seq')
    test_data.apply(lambda x: [vocab.to_index(word) for word in x['words']], new_field_name='seq')
    train_data.rename_field('seq', Const.INPUT)
    train_data.rename_field('len', Const.INPUT_LEN)
    train_data.rename_field('label', Const.TARGET)

    test_data.rename_field('seq', Const.INPUT)
    test_data.rename_field('len', Const.INPUT_LEN)
    test_data.rename_field('label', Const.TARGET)

    test_data.set_input(Const.INPUT, Const.INPUT_LEN)
    test_data.set_target(Const.TARGET)
    train_data.set_input(Const.INPUT, Const.INPUT_LEN)
    train_data.set_target(Const.TARGET)

    test_data, dev_data = test_data.split(0.5)
    return train_data, dev_data, test_data, vocab
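# The splits returned by readdata() can be sanity-checked by batching them with fastNLP's
# Batch and RandomSampler, mirroring the ENAS example further down this page. This is a
# minimal, hypothetical usage sketch, not part of the original snippet.
from fastNLP.core.batch import Batch
from fastNLP.core.sampler import RandomSampler

train_data, dev_data, test_data, vocab = readdata()
print(len(train_data), len(dev_data), len(test_data), len(vocab))

batch_iterator = Batch(dataset=train_data, batch_size=4, sampler=RandomSampler())
for batch_x, batch_y in batch_iterator:
    print("batch_x has: ", batch_x)  # holds Const.INPUT and Const.INPUT_LEN
    print("batch_y has: ", batch_y)  # holds Const.TARGET
    break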
Example #2
    def test_add_field_v2(self):
        ds = DataSet({"x": [3, 4]})
        ds.add_field('y', [['hello', 'world'], ['this', 'is', 'a', 'test']], is_input=True, is_target=True)
        # ds.apply(lambda x:[x['x']]*3, is_input=True, is_target=True, new_field_name='y')
        print(ds)
Example #3
    def test_add_null(self):
        # TODO test failed because 'fastNLP\core\field.py:143: RuntimeError'
        ds = DataSet()
        with self.assertRaises(RuntimeError) as RE:
            ds.add_field('test', [])
Example #4
    def test_drop(self):
        ds = DataSet({"x": [[1, 2, 3, 4]] * 40, "y": [[5, 6], [7, 8, 9, 0]] * 20})
        ds.drop(lambda ins: len(ins["y"]) < 3, inplace=True)
        self.assertEqual(len(ds), 20)
Example #5
    def test_get_target_name(self):
        ds = DataSet({"x": [[1, 2, 3, 4]] * 10, "y": [[5, 6]] * 10})
        self.assertEqual(ds.get_target_name(), [_ for _ in ds.field_arrays if ds.field_arrays[_].is_target])
Example #6
    def test_tutorial_1_data_preprocess(self):
        from fastNLP import DataSet
        data = {'raw_words': ["This is the first instance .", "Second instance .", "Third instance ."],
                'words': [['this', 'is', 'the', 'first', 'instance', '.'], ['Second', 'instance', '.'],
                          ['Third', 'instance', '.']],
                'seq_len': [6, 3, 3]}
        dataset = DataSet(data)
        # The value for each key in the dict passed in should be a list of the same length

        from fastNLP import DataSet
        from fastNLP import Instance
        dataset = DataSet()
        instance = Instance(raw_words="This is the first instance",
                            words=['this', 'is', 'the', 'first', 'instance', '.'],
                            seq_len=6)
        dataset.append(instance)

        from fastNLP import DataSet
        from fastNLP import Instance
        dataset = DataSet([
            Instance(raw_words="This is the first instance",
                     words=['this', 'is', 'the', 'first', 'instance', '.'],
                     seq_len=6),
            Instance(raw_words="Second instance .",
                     words=['Second', 'instance', '.'],
                     seq_len=3)
        ])

        from fastNLP import DataSet
        dataset = DataSet({'a': range(-5, 5), 'c': [0] * 10})

        # Without modifying dataset, create a new DataSet with the matching instances dropped
        dropped_dataset = dataset.drop(lambda ins: ins['a'] < 0, inplace=False)
        # Drop the matching instances from dataset in place
        dataset.drop(lambda ins: ins['a'] < 0)
        # Delete the third instance
        dataset.delete_instance(2)
        # Delete the field named 'a'
        dataset.delete_field('a')

        # Check whether a field named 'a' exists
        print(dataset.has_field('a'))  # or ('a' in dataset)
        # Rename the field 'c' to 'b'
        dataset.rename_field('c', 'b')
        # Length of the DataSet
        len(dataset)

        from fastNLP import DataSet
        data = {'raw_words': ["This is the first instance .", "Second instance .", "Third instance ."]}
        dataset = DataSet(data)

        # Split the sentences into words; see DataSet.apply()
        dataset.apply(lambda ins: ins['raw_words'].split(), new_field_name='words')

        # Or use DataSet.apply_field()
        dataset.apply_field(lambda sent: sent.split(), field_name='raw_words', new_field_name='words')

        # Instead of a lambda, a named function can also be passed in
        def get_words(instance):
            sentence = instance['raw_words']
            words = sentence.split()
            return words

        dataset.apply(get_words, new_field_name='words')
from fastNLP import DataSet, Vocabulary, Trainer, CrossEntropyLoss, AccuracyMetric
from sklearn.datasets import fetch_20newsgroups
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import Parameter
from fastNLP.core.const import Const as C
from fastNLP.modules import encoder

trainSet = fetch_20newsgroups(subset='train')
testSet = fetch_20newsgroups(subset='test')

testData = {"data": testSet['data'], "target": testSet['target']}
trainData = {"data": trainSet['data'], "target": trainSet['target']}
trainData = DataSet(trainData)
testData = DataSet(testData)

trainData.apply(lambda x: x['data'].lower(), new_field_name='sentence')
trainData.apply(lambda x: x['sentence'].split(),
                new_field_name='words',
                is_input=True)
vocab = Vocabulary(min_freq=2)
vocab = vocab.from_dataset(trainData, field_name='words')
#change to index
vocab.index_dataset(trainData, field_name='words', new_field_name='words')
trainData.set_target('target')
train_data, dev_data = trainData.split(0.2)


# demo LSTM version
def preprocess(task, type, length=60):
    # NOTE: Filelist (the list of data files), pre (a tokenize/pad helper) and test (the
    # target task names) are assumed to be defined elsewhere in the original script.
    Task = []
    i = 0
    index = 0
    ind = []
    raw = []
    for File in filter(lambda s: ('.' + type + '.train') in s, Filelist):
        raw1 = []
        j = 0
        with open(File, 'r', encoding='utf-8') as f:
            for line in f.readlines():
                j += 1
                if line[-3] == '-':
                    label = -1
                    onehot = [1, 0]
                else:
                    label = 1
                    onehot = [0, 1]
                raw1.append(Instance(sentence = line[:-3], label = label, onehot=onehot))
                raw.append(Instance(sentence = line[:-3], label = label, onehot=onehot))
        raw1 = DataSet(raw1)
        raw1.apply(lambda x: pre(x['sentence'], length), new_field_name='words')
        if File in [task + type + '.train' for task in test]:
            index = i
        if j <= 30:
            ind.append(i)
        i += 1
        Task.append(raw1)

    raw3 = []
    for File in filter(lambda s: s == task + type + '.dev', Filelist):
        with open(File, 'r', encoding='utf-8') as f:
            for line in f.readlines():
                if line[-3] == '-':
                    label = -1
                    onehot = [1, 0]
                else:
                    label = 1
                    onehot = [0, 1]
                raw3.append(Instance(sentence=line[:-3], label=label, onehot=onehot))
    raw3 = DataSet(raw3)
    raw3.apply(lambda x: pre(x['sentence'], length), new_field_name='words')

    raw2 = []
    for File in filter(lambda s: s == task + type + '.test', Filelist):
        raw2 = []
        with open(File, 'r', encoding='utf-8') as f:
            for line in f.readlines():
                if line[-3] == '-':
                    label = -1
                    onehot = [1, 0]
                else:
                    label = 1
                    onehot = [0, 1]
                raw2.append(Instance(sentence=line[:-3], label=label, onehot=onehot))
    raw2 = DataSet(raw2)
    raw2.apply(lambda x: pre(x['sentence'], length), new_field_name='words')

    raw = DataSet(raw)
    raw.apply(lambda x: pre(x['sentence'], length), new_field_name='words')
    vocab = Vocabulary(min_freq=2).from_dataset(raw, raw3, raw2, field_name='words')
    vocab.index_dataset(raw2, raw3, field_name='words', new_field_name='words')
    return Task, vocab, ind, index, raw3, raw2
Example #9
from fastNLP.embeddings import BertEmbedding
from fastNLP.models import BertForSequenceClassification
from fastNLP import Trainer, CrossEntropyLoss, AccuracyMetric, Adam
from fastNLP import DataSet, Instance
'''import other packages'''
import os
import csv
import torch
'''load the csv'''
file2label = {}
with open('dataset/annotations_metadata.csv') as f:
    for line in f:
        items = line.split(',')
        file2label[items[0] +
                   ".txt"] = (1 if items[4].split('\n')[0] == 'hate' else 0)
'''load the Hate-Speech data'''
train_dataset = DataSet()
dev_dataset = DataSet()
test_dataset = DataSet()
cnt = 0
length = len(os.listdir("dataset/all_files"))

for file in os.listdir("dataset/all_files"):
    with open("dataset/all_files/" + file) as f:
        cnt += 1
        raw_words = f.read()
        words = raw_words.split()
        seq_len = len(words)
        if cnt > length * 0.9:
            test_dataset.append(
                Instance(raw_words=raw_words,
                         words=words,
                         seq_len=seq_len,
                         target=file2label[file]))
def load_dataset(args):

    processors = {
        "cola": ColaProcessor,
        "mnli": MnliProcessor,
        "mrpc": MrpcProcessor,
    }

    num_labels_task = {
        "cola": 2,
        "mnli": 3,
        "mrpc": 2,
    }

    task_name = args.task_name.lower()

    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    processor = processors[task_name]()
    args.num_labels = num_labels_task[task_name]
    label_list = processor.get_labels()

    tokenizer = BertTokenizer.from_pretrained(args.bert_model)

    train_examples = None
    args.num_train_steps = None
    if args.do_train:
        train_examples = processor.get_train_examples(args.data_dir)
        args.num_train_steps = int(
            len(train_examples) / args.train_batch_size / 1 *
            args.num_train_epochs)

    # training dataset
    train_features = convert_examples_to_features(train_examples, label_list,
                                                  args.max_seq_length,
                                                  tokenizer)

    train_data = DataSet({
        "x": [f.input_ids for f in train_features],
        "segment_info": [f.segment_ids for f in train_features],
        "mask": [f.input_mask for f in train_features],
        "target": [f.label_id for f in train_features]
    })

    train_data.set_input('x', 'segment_info', 'mask')
    train_data.set_target('target')

    # dev dataset

    eval_examples = processor.get_dev_examples(args.data_dir)
    eval_features = convert_examples_to_features(eval_examples, label_list,
                                                 args.max_seq_length,
                                                 tokenizer)

    dev_data = DataSet({
        "x": [f.input_ids for f in eval_features],
        "segment_info": [f.segment_ids for f in eval_features],
        "mask": [f.input_mask for f in eval_features],
        "target": [f.label_id for f in eval_features]
    })

    dev_data.set_input('x', 'segment_info', 'mask')
    dev_data.set_target('target')

    return train_data, dev_data
class TextData():
    vocab_size = 0
    dataset_size = 0
    train_size = 0
    test_size = 0
    class_num = 4
    min_count = 10
    max_seq_len = 500
    seq_limit = 2000
    data_src = "20news"

    data_set = DataSet()
    train_set = DataSet()
    test_set = DataSet()
    dev_set = DataSet()
    vocab = None


    def __init__(self,data_src="20news",min_count=10,seq_limit=None):
        self.data_src = data_src
        self.min_count = min_count
        if seq_limit is not None:
            self.seq_limit = seq_limit

    def find_max_len(self,words):
        self.max_seq_len = max(len(words),self.max_seq_len)

    def seq_regularize(self,words):
        wlen = len(words)
        if wlen<self.max_seq_len:
            return [0]*(self.max_seq_len-wlen) + words
        else:
            return words[:self.max_seq_len]

    def fetch_20news(self,size=4):
        print("Loading 20newsgroups data and tokenize.")
        if size==20:
            train,test = get_all_20news()
        else:
            train,test = get_text_classification_datasets()
        train_input,test_input = tokenize(train.data,test.data)
        train_target = train.target
        test_target = test.target
        self.class_num = len(train.target_names)
        assert (self.class_num == len(test.target_names))

        # Building Fastnlp dataset.
        print("Building Fastnlp dataset.")
        self.train_set = DataSet({"text":train_input,"class":train_target})
        self.test_set = DataSet({"text":test_input,"class":test_target})
        
        # Building Fastnlp vocabulary...
        print("Building Fastnlp vocabulary.")
        self.vocab = Vocabulary(min_freq=self.min_count)
        self.train_set.apply(lambda x : [self.vocab.add_word(word) for word in x['text']])
        self.vocab.build_vocab()
        self.vocab.build_reverse_vocab()
        self.vocab_size = len(self.vocab)
        # Building id representation for train_set and test_set.
        print("Building id representation for train_set and test_set.")
        self.vocab.index_dataset(self.train_set,self.test_set,field_name='text',new_field_name='words')
        
        self.train_set.apply_field(lambda x : len(x),field_name='words',new_field_name='seq_len')
        self.test_set.apply_field(lambda x : len(x),field_name='words',new_field_name='seq_len')
        self.train_set.apply_field(self.find_max_len,field_name='words')

        print(self.max_seq_len)
        self.max_seq_len = min(self.max_seq_len,self.seq_limit)

        self.train_set.apply_field(self.seq_regularize,field_name='words',new_field_name='words')
        self.test_set.apply_field(self.seq_regularize,field_name='words',new_field_name='words')
        # self.train_set.apply(lambda x : text2multi_hot(x['words'],self.vocab_size),new_field_name="input")
        # self.test_set.apply(lambda x : text2multi_hot(x['words'],self.vocab_size),new_field_name='input')
        
        # Building target-vector for train_set and test_set.
        print("Building target-vector for train_set and test_set.")
        self.train_set.apply(lambda x : int(x['class']),new_field_name="target",is_target=True)
        self.test_set.apply(lambda x : int(x['class']),new_field_name="target",is_target=True)
        # self.train_set.apply(lambda x : class2target(x['class'],self.calss_num),new_field_name="target")
        # self.test_set.apply(lambda x : class2target(x['class'],self.calss_num),new_field_name="target")

    def fetch_csv(self,path=None):
        print("Not implemented now...")
        pass

    def fetch_data(self,path=None):
        if self.data_src == "20news":
            # Loading 20newsgroups data and tokenize.
            self.fetch_20news()
        elif self.data_src == "20news_all":
            self.fetch_20news(size=20)
        else:
            print("No data src...")
        
        self.train_size = self.train_set.get_length()
        self.test_size = self.test_set.get_length()
        return self.train_size,self.test_size
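# A brief, hypothetical usage sketch of the TextData class above; it assumes the helper
# functions it references (get_text_classification_datasets / get_all_20news / tokenize)
# are available in the original project.
text_data = TextData(data_src="20news", min_count=10, seq_limit=500)
train_size, test_size = text_data.fetch_data()
print("train size:", train_size, "test size:", test_size)
print("vocab size:", text_data.vocab_size, "classes:", text_data.class_num)
print(text_data.train_set[0])  # padded 'words' ids, 'seq_len' and integer 'target'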
Example #12

data_bundle = get_data()
print(data_bundle)
train_word2bpes = data_bundle.train_word2bpes
target_word2bpes = data_bundle.target_word2bpes
print(
    f"In total {len(target_word2bpes)} target words, {len(train_word2bpes)} words."
)

pad_id = data_bundle.pad_id
lg_dict = getattr(data_bundle, 'lg_dict')
lg_shifts = getattr(data_bundle, 'lg_shift')
train_lg_shifts = getattr(data_bundle, 'train_lg_shift')

train_data = DataSet()
for name, ds in data_bundle.iter_datasets():
    if 'train' in name:
        for ins in ds:
            train_data.append(ins)

train_data.add_seq_len('input')
train_data.set_input('input', 'language_ids')
train_data.set_target('target')
train_data.set_pad_val('input', pad_id)

clip_max_length(train_data, data_bundle, max_sent_len=50)

model = JointBertReverseDict(pre_name,
                             train_word2bpes,
                             target_word2bpes,
import os
import pickle
from fastNLP import Vocabulary
from fastNLP import DataSet

root_path = '/remote-home/txsun/data/glue_data'

# RTE
print('processing RTE...')
dataset = 'RTE'
data_path = os.path.join(root_path, dataset)

## Train
print('reading train file...')
train_file = os.path.join(data_path, 'train.tsv')
train_ds = DataSet.read_csv(train_file, sep='\t')
train_ds.delete_field('index')

print(train_ds[0])
print(len(train_ds))

## Dev
print('reading dev file...')
dev_file = os.path.join(data_path, 'dev.tsv')
dev_ds = DataSet.read_csv(dev_file, sep='\t')
dev_ds.delete_field('index')

print(dev_ds[0])
print(len(dev_ds))

## Test
def main():
    parser = argparse.ArgumentParser()
    # fmt: off
    parser.add_argument("--data_path", required=True, type=str, help="all of datasets pkl paths")
    # fmt: on

    options, _ = parser.parse_known_args()

    train_set, test_set = DataSet(), DataSet()

    input_dir = os.path.join(options.data_path, "joint-sighan2008/bmes")
    options.output = os.path.join(options.data_path, "total_dataset.pkl")
    print(input_dir, options.output)

    for fn in os.listdir(input_dir):
        if fn not in ["test.txt", "train-all.txt"]:
            continue
        print(fn)
        abs_fn = os.path.join(input_dir, fn)
        ds = read_file(abs_fn)
        if "test.txt" == fn:
            test_set = ds
        else:
            train_set = ds

    print(
        "num samples of total train, test: {}, {}".format(len(train_set), len(test_set))
    )

    uni_vocab = Vocabulary(min_freq=None).from_dataset(
        train_set, test_set, field_name="ori_words"
    )
    # bi_vocab = Vocabulary(min_freq=3, max_size=50000).from_dataset(train_set,test_set, field_name="bi1")
    bi_vocab = Vocabulary(min_freq=3, max_size=None).from_dataset(
        train_set, field_name="bi1", no_create_entry_dataset=[test_set]
    )
    tag_vocab = Vocabulary(min_freq=None, padding="s", unknown=None).from_dataset(
        train_set, field_name="ori_tags"
    )
    task_vocab = Vocabulary(min_freq=None, padding=None, unknown=None).from_dataset(
        train_set, field_name="task"
    )

    def to_index(dataset):
        uni_vocab.index_dataset(dataset, field_name="ori_words", new_field_name="uni")
        tag_vocab.index_dataset(dataset, field_name="ori_tags", new_field_name="tags")
        task_vocab.index_dataset(dataset, field_name="task", new_field_name="task")

        dataset.apply_field(lambda x: x[1:], field_name="bi1", new_field_name="bi2")
        dataset.apply_field(lambda x: x[:-1], field_name="bi1", new_field_name="bi1")
        bi_vocab.index_dataset(dataset, field_name="bi1", new_field_name="bi1")
        bi_vocab.index_dataset(dataset, field_name="bi2", new_field_name="bi2")

        dataset.set_input("task", "uni", "bi1", "bi2", "seq_len")
        dataset.set_target("tags")
        return dataset

    train_set = to_index(train_set)
    test_set = to_index(test_set)

    output = {}
    output["train_set"] = train_set
    output["test_set"] = test_set
    output["uni_vocab"] = uni_vocab
    output["bi_vocab"] = bi_vocab
    output["tag_vocab"] = tag_vocab
    output["task_vocab"] = task_vocab

    print(tag_vocab.word2idx)
    print(task_vocab.word2idx)

    make_sure_path_exists(os.path.dirname(options.output))

    print("Saving dataset to {}".format(os.path.abspath(options.output)))
    with open(options.output, "wb") as outfile:
        dump(output, outfile)

    print(len(task_vocab), len(tag_vocab), len(uni_vocab), len(bi_vocab))
    dic = {}
    tokens = {}

    def process(words):
        name = words[0][1:-1]
        if name not in dic:
            dic[name] = set()
            tokens[name] = 0
        tokens[name] += len(words[1:])
        dic[name].update(words[1:])

    train_set.apply_field(process, "ori_words", None)
    for name in dic.keys():
        print(name, len(dic[name]), tokens[name])

    with open(os.path.join(os.path.dirname(options.output), "oovdict.pkl"), "wb") as f:
        dump(dic, f)

    def get_max_len(ds):
        global max_len
        max_len = 0

        def find_max_len(words):
            global max_len
            if max_len < len(words):
                max_len = len(words)

        ds.apply_field(find_max_len, "ori_words", None)
        return max_len

    print(
        "train max len: {}, test max len: {}".format(
            get_max_len(train_set), get_max_len(test_set)
        )
    )
def equip_chinese_ner_with_lexicon(datasets, vocabs, embeddings, w_list, word_embedding_path=None,
                                   only_lexicon_in_train=False, word_char_mix_embedding_path=None,
                                   number_normalized=False,
                                   lattice_min_freq=1, only_train_min_freq=0):
    from fastNLP.core import Vocabulary
    def normalize_char(inp):
        result = []
        for c in inp:
            if c.isdigit():
                result.append('0')
            else:
                result.append(c)

        return result

    def normalize_bigram(inp):
        result = []
        for bi in inp:
            tmp = bi
            if tmp[0].isdigit():
                tmp = '0' + tmp[1:]
            if tmp[1].isdigit():
                tmp = tmp[0] + '0'

            result.append(tmp)
        return result

    if number_normalized == 3:
        for k, v in datasets.items():
            v.apply_field(normalize_char, 'chars', 'chars')
        vocabs['char'] = Vocabulary()
        vocabs['char'].from_dataset(datasets['train'], field_name='chars',
                                    no_create_entry_dataset=[datasets['dev'], datasets['test']])

        for k, v in datasets.items():
            v.apply_field(normalize_bigram, 'bigrams', 'bigrams')
        vocabs['bigram'] = Vocabulary()
        vocabs['bigram'].from_dataset(datasets['train'], field_name='bigrams',
                                      no_create_entry_dataset=[datasets['dev'], datasets['test']])

    if only_lexicon_in_train:
        print('only the lexicon words that appear in train will be loaded')

    def get_skip_path(chars, w_trie):
        sentence = ''.join(chars)
        result = w_trie.get_lexicon(sentence)
        # print(result)

        return result

    from V0.utils_ import Trie
    from functools import partial
    from fastNLP.core import Vocabulary
    # from fastNLP.embeddings import StaticEmbedding
    from fastNLP_module import StaticEmbedding
    from fastNLP import DataSet
    a = DataSet()
    w_trie = Trie()
    for w in w_list:
        w_trie.insert(w)

    if only_lexicon_in_train:
        lexicon_in_train = set()
        for s in datasets['train']['chars']:
            lexicon_in_s = w_trie.get_lexicon(s)
            for s, e, lexicon in lexicon_in_s:
                lexicon_in_train.add(''.join(lexicon))

        print('lexicon in train:{}'.format(len(lexicon_in_train)))
        print('i.e.: {}'.format(list(lexicon_in_train)[:10]))
        w_trie = Trie()
        for w in lexicon_in_train:
            w_trie.insert(w)

    import copy
    for k, v in datasets.items():
        v.apply_field(partial(get_skip_path, w_trie=w_trie), 'chars', 'lexicons')
        v.apply_field(copy.copy, 'chars', 'raw_chars')
        v.add_seq_len('lexicons', 'lex_num')
        v.apply_field(lambda x: list(map(lambda y: y[0], x)), 'lexicons', 'lex_s')
        v.apply_field(lambda x: list(map(lambda y: y[1], x)), 'lexicons', 'lex_e')

    if number_normalized == 1:
        for k, v in datasets.items():
            v.apply_field(normalize_char, 'chars', 'chars')
        vocabs['char'] = Vocabulary()
        vocabs['char'].from_dataset(datasets['train'], field_name='chars',
                                    no_create_entry_dataset=[datasets['dev'], datasets['test']])

    if number_normalized == 2:
        for k, v in datasets.items():
            v.apply_field(normalize_char, 'chars', 'chars')
        vocabs['char'] = Vocabulary()
        vocabs['char'].from_dataset(datasets['train'], field_name='chars',
                                    no_create_entry_dataset=[datasets['dev'], datasets['test']])

        for k, v in datasets.items():
            v.apply_field(normalize_bigram, 'bigrams', 'bigrams')
        vocabs['bigram'] = Vocabulary()
        vocabs['bigram'].from_dataset(datasets['train'], field_name='bigrams',
                                      no_create_entry_dataset=[datasets['dev'], datasets['test']])

    def concat(ins):
        chars = ins['chars']
        lexicons = ins['lexicons']
        result = chars + list(map(lambda x: x[2], lexicons))
        # print('lexicons:{}'.format(lexicons))
        # print('lex_only:{}'.format(list(filter(lambda x:x[2],lexicons))))
        # print('result:{}'.format(result))
        return result

    def get_pos_s(ins):
        lex_s = ins['lex_s']
        seq_len = ins['seq_len']
        pos_s = list(range(seq_len)) + lex_s

        return pos_s

    def get_pos_e(ins):
        lex_e = ins['lex_e']
        seq_len = ins['seq_len']
        pos_e = list(range(seq_len)) + lex_e

        return pos_e

    for k, v in datasets.items():
        v.apply(concat, new_field_name='lattice')
        v.set_input('lattice')
        v.apply(get_pos_s, new_field_name='pos_s')
        v.apply(get_pos_e, new_field_name='pos_e')
        v.set_input('pos_s', 'pos_e')

    # print(list(datasets['train'][:10]['lexicons']))
    # print(list(datasets['train'][:10]['lattice']))
    # print(list(datasets['train'][:10]['lex_s']))
    # print(list(datasets['train'][:10]['lex_e']))
    # print(list(datasets['train'][:10]['pos_s']))
    # print(list(datasets['train'][:10]['pos_e']))
    # exit(1208)

    word_vocab = Vocabulary()
    word_vocab.add_word_lst(w_list)
    vocabs['word'] = word_vocab

    lattice_vocab = Vocabulary()
    lattice_vocab.from_dataset(datasets['train'], field_name='lattice',
                               no_create_entry_dataset=[v for k, v in datasets.items() if k != 'train'])
    vocabs['lattice'] = lattice_vocab
    # for k,v in datasets.items():
    #     v.apply_field(lambda x:[ list(map(lambda x:x[0],p)) for p in x],'skips_l2r','skips_l2r_source')
    #     v.apply_field(lambda x:[ list(map(lambda x:x[1],p)) for p in x], 'skips_l2r', 'skips_l2r_word')
    #
    # for k,v in datasets.items():
    #     v.apply_field(lambda x:[ list(map(lambda x:x[0],p)) for p in x],'skips_r2l','skips_r2l_source')
    #     v.apply_field(lambda x:[ list(map(lambda x:x[1],p)) for p in x], 'skips_r2l', 'skips_r2l_word')

    # for k,v in datasets.items():
    #     v.apply_field(lambda x:list(map(len,x)), 'skips_l2r_word', 'lexicon_count')
    #     v.apply_field(lambda x:
    #                   list(map(lambda y:
    #                            list(map(lambda z:word_vocab.to_index(z),y)),x)),
    #                   'skips_l2r_word',new_field_name='skips_l2r_word')
    #
    #     v.apply_field(lambda x:list(map(len,x)), 'skips_r2l_word', 'lexicon_count_back')
    #
    #     v.apply_field(lambda x:
    #                   list(map(lambda y:
    #                            list(map(lambda z:word_vocab.to_index(z),y)),x)),
    #                   'skips_r2l_word',new_field_name='skips_r2l_word')

    if word_embedding_path is not None:
        word_embedding = StaticEmbedding(word_vocab, word_embedding_path, word_dropout=0)
        embeddings['word'] = word_embedding

    if word_char_mix_embedding_path is not None:
        lattice_embedding = StaticEmbedding(lattice_vocab, word_char_mix_embedding_path, word_dropout=0.01,
                                            min_freq=lattice_min_freq, only_train_min_freq=only_train_min_freq)
        embeddings['lattice'] = lattice_embedding

    vocabs['char'].index_dataset(*(datasets.values()),
                                 field_name='chars', new_field_name='chars')
    vocabs['bigram'].index_dataset(*(datasets.values()),
                                   field_name='bigrams', new_field_name='bigrams')
    vocabs['label'].index_dataset(*(datasets.values()),
                                  field_name='target', new_field_name='target')
    vocabs['lattice'].index_dataset(*(datasets.values()),
                                    field_name='lattice', new_field_name='lattice')

    return datasets, vocabs, embeddings
def construct_dataset(dataset):
    dataset_ = DataSet()
    for sentence, target in zip(dataset.data, dataset.target):
        instance = Instance()
        instance['raw_sentence'] = sentence
        instance['target'] = int(target)
        dataset_.append(instance)

    dataset_.apply(lambda x: re.sub('[%s]' % re.escape(string.punctuation), '',
                                    x['raw_sentence']),
                   new_field_name='sentence')  # strip punctuation
    dataset_.apply(lambda x: re.sub('[%s]' % re.escape(string.whitespace), ' ',
                                    x['sentence']),
                   new_field_name='sentence')  # replace newlines and other whitespace with spaces
    dataset_.apply(lambda x: x['sentence'].lower(),
                   new_field_name='sentence')  # convert to lowercase
    dataset_.apply_field(lambda x: x.split(),
                         field_name='sentence',
                         new_field_name='input')
    return dataset_
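# A hypothetical usage sketch: construct_dataset() only expects an object with .data and
# .target, so the 20newsgroups splits from scikit-learn can be passed in directly. It assumes
# the module-level imports used above (re, string, DataSet, Instance) are present.
from sklearn.datasets import fetch_20newsgroups

train_ds = construct_dataset(fetch_20newsgroups(subset='train'))
test_ds = construct_dataset(fetch_20newsgroups(subset='test'))
print(len(train_ds), len(test_ds))
print(train_ds[0]['input'][:10], train_ds[0]['target'])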
Example #17
    def test_from_dataset(self):
        start_char = 65
        num_samples = 10

        # 0 dim
        dataset = DataSet()
        for i in range(num_samples):
            ins = Instance(char=chr(start_char + i))
            dataset.append(ins)
        vocab = Vocabulary()
        vocab.from_dataset(dataset, field_name='char')
        for i in range(num_samples):
            self.assertEqual(vocab.to_index(chr(start_char + i)), i + 2)
        vocab.index_dataset(dataset, field_name='char')

        # 1 dim
        dataset = DataSet()
        for i in range(num_samples):
            ins = Instance(char=[chr(start_char + i)] * 6)
            dataset.append(ins)
        vocab = Vocabulary()
        vocab.from_dataset(dataset, field_name='char')
        for i in range(num_samples):
            self.assertEqual(vocab.to_index(chr(start_char + i)), i + 2)
        vocab.index_dataset(dataset, field_name='char')

        # 2 dim
        dataset = DataSet()
        for i in range(num_samples):
            ins = Instance(char=[[chr(start_char + i) for _ in range(6)]
                                 for _ in range(6)])
            dataset.append(ins)
        vocab = Vocabulary()
        vocab.from_dataset(dataset, field_name='char')
        for i in range(num_samples):
            self.assertEqual(vocab.to_index(chr(start_char + i)), i + 2)
        vocab.index_dataset(dataset, field_name='char')
Example #18
from fastNLP import Vocabulary
from fastNLP import DataSet

# Problem: fastNLP already provides a split function for dividing a dataset into a training set and a test set,
# but the standard benchmark datasets used for training usually ship with a predefined train/test split,
# and using split to divide the data randomly introduces another problem:
#       because the split is random each time, the vocabulary differs from run to run, so when a saved model is
#       loaded later for testing, the mismatched vocabulary makes the results differ dramatically.
#
# Solution: add a vocabulary save function and a vocabulary load function to Vocabulary instead of building a new
# vocabulary every time; this also reduces the cost of later runs. After generating the vocabulary once with
# save_vocab(), the next run can simply use the vocabulary loaded by load_vocab().
if __name__ == '__main__':

    data_path = "data_for_tests/tutorial_sample_dataset.csv"

    train_data = DataSet.read_csv(data_path,
                                  headers=('raw_sentence', 'label'),
                                  sep='\t')
    print('len(train_data)', len(train_data))

    # Convert all letters to lowercase
    train_data.apply(lambda x: x['raw_sentence'].lower(),
                     new_field_name='raw_sentence')

    # Convert label to int
    train_data.apply(lambda x: int(x['label']) - 1,
                     new_field_name='label_seq',
                     is_target=True)

    # Split sentences on whitespace
    def split_sent(ins):
        return ins['raw_sentence'].split()
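# The comment block above only describes save_vocab()/load_vocab(); the helpers below are a
# minimal pickle-based sketch of that idea (the names and the assumption that a built
# Vocabulary object can be pickled are mine, not part of fastNLP's public API).
import pickle

def save_vocab(vocab, path):
    # Persist the built vocabulary so later runs reuse exactly the same word-to-index mapping.
    with open(path, 'wb') as f:
        pickle.dump(vocab, f)

def load_vocab(path):
    # Restore a previously saved Vocabulary instead of rebuilding it from a random split.
    with open(path, 'rb') as f:
        return pickle.load(f)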
Example #19
def get_batch_data():
    # Load train data
    X_train, Y_train, Sources_train, Targets_train = load_train_data()

    # calc total batch count
    num_batch = len(X_train) // Hyperparams.batch_size

    i = 0
    ds_train = DataSet()
    for x in X_train:
        instance = Instance(word=Sources_train[i],
                            translated=Targets_train[i],
                            word_seq=x.tolist(),
                            translated_seq=Y_train[i].tolist())
        ds_train.append(instance)
        i = i + 1
    ds_train.set_input('word_seq', 'translated_seq')
    ds_train.set_target('translated_seq')

    # Load test data
    X_test, Y_test, Sources_test, Targets_test = load_test_data()

    # calc total batch count
    num_batch = len(X_test) // Hyperparams.batch_size

    i = 0
    ds_test = DataSet()
    for x in X_test:
        instance = Instance(word=Sources_test[i],
                            translated=Targets_test[i],
                            word_seq=x.tolist(),
                            translated_seq=Y_test[i].tolist())
        ds_test.append(instance)
        i = i + 1
    ds_test.set_input('word_seq', 'translated_seq')
    ds_test.set_target('translated_seq')

    return ds_train, ds_test
Example #20
    def test_tutorial(self):
        # Read data from the CSV into a DataSet
        sample_path = "./data_for_tests/tutorial_sample_dataset.csv"
        dataset = DataSet.read_csv(sample_path,
                                   headers=('raw_sentence', 'label'),
                                   sep='\t')
        print(len(dataset))
        print(dataset[0])

        dataset.append(Instance(raw_sentence='fake data', label='0'))
        dataset.apply(lambda x: x['raw_sentence'].lower(),
                      new_field_name='raw_sentence')
        # Convert label to int
        dataset.apply(lambda x: int(x['label']), new_field_name='label')

        # Split sentences on whitespace
        def split_sent(ins):
            return ins['raw_sentence'].split()

        dataset.apply(split_sent, new_field_name='words')
        # Add sequence-length information
        dataset.apply(lambda x: len(x['words']), new_field_name='seq_len')
        # print(len(dataset))
        # print(dataset[0])

        # Filter out instances with DataSet.drop(func)
        dataset.drop(lambda x: x['seq_len'] <= 3)
        print(len(dataset))

        # Specify which fields of the DataSet should be converted to tensors
        # set target: the gold labels used when computing the loss and evaluating the model
        dataset.set_target("label")
        # set input: the fields fed to the model's forward()
        dataset.set_input("words")

        # Split off the test and training sets
        test_data, train_data = dataset.split(0.5)
        # print(len(test_data))
        # print(len(train_data))

        # Build the vocabulary with Vocabulary.add(word)
        vocab = Vocabulary(min_freq=2)
        train_data.apply(lambda x: [vocab.add(word) for word in x['words']])
        vocab.build_vocab()

        # Index the sentences with Vocabulary.to_index(word)
        train_data.apply(
            lambda x: [vocab.to_index(word) for word in x['words']],
            new_field_name='words')
        test_data.apply(
            lambda x: [vocab.to_index(word) for word in x['words']],
            new_field_name='words')
        print(test_data[0])

        model = CNNText(embed_num=len(vocab),
                        embed_dim=50,
                        num_classes=5,
                        padding=2,
                        dropout=0.1)

        from fastNLP import Trainer
        from copy import deepcopy

        # Rename the corresponding DataSet fields to match the parameter names of the model's forward()
        train_data.rename_field('words',
                                'word_seq')  # the input field must match the forward parameter
        train_data.rename_field('label', 'label_seq')
        test_data.rename_field('words', 'word_seq')
        test_data.rename_field('label', 'label_seq')

        # Instantiate a Trainer with the model and data, then train
        copy_model = deepcopy(model)
        overfit_trainer = Trainer(train_data=test_data,
                                  model=copy_model,
                                  loss=CrossEntropyLoss(pred="output",
                                                        target="label_seq"),
                                  metrics=AccuracyMetric(pred="predict",
                                                         target="label_seq"),
                                  n_epochs=10,
                                  batch_size=4,
                                  dev_data=test_data,
                                  save_path="./save")
        overfit_trainer.train()

        trainer = Trainer(train_data=train_data,
                          model=model,
                          loss=CrossEntropyLoss(pred="output",
                                                target="label_seq"),
                          metrics=AccuracyMetric(pred="predict",
                                                 target="label_seq"),
                          n_epochs=10,
                          batch_size=4,
                          dev_data=test_data,
                          save_path="./save")
        trainer.train()
        print('Train finished!')

        # Evaluate with fastNLP's Tester
        tester = Tester(data=test_data,
                        model=model,
                        metrics=AccuracyMetric(pred="predict",
                                               target="label_seq"),
                        batch_size=4)
        acc = tester.test()
        print(acc)
Example #21
    def test_len_(self):
        ds = DataSet({"x": [[1, 2, 3, 4]] * 40, "y": [[5, 6]] * 40})
        self.assertEqual(len(ds), 40)

        ds = DataSet()
        self.assertEqual(len(ds), 0)
Example #22
def make_dataset(data):
    dataset = DataSet()
    mx = 0
    le = None
    for x, y in zip(data.data, data.target):
        xx = deal(x)
        ins = Instance(sentence=xx, label=int(y))
        if mx < len(xx.split()):
            mx = max(mx, len(xx.split()))
            le = xx
        dataset.append(ins)
    print(mx)
    dataset.apply_field(lambda x: x.split(),
                        field_name='sentence',
                        new_field_name='words')
    dataset.apply_field(lambda x: len(x),
                        field_name='words',
                        new_field_name='seq_len')

    dataset.rename_field('words', Const.INPUT)
    dataset.rename_field('seq_len', Const.INPUT_LEN)
    dataset.rename_field('label', Const.TARGET)

    dataset.set_input(Const.INPUT, Const.INPUT_LEN)
    dataset.set_target(Const.TARGET)
    return dataset
Example #23
    def test_contains(self):
        ds = DataSet({"x": [[1, 2, 3, 4]] * 40, "y": [[5, 6]] * 40})
        self.assertTrue("x" in ds)
        self.assertTrue("y" in ds)
        self.assertFalse("z" in ds)
Example #24
    def testENAS(self):
        # Read data from the CSV into a DataSet
        sample_path = "tutorials/sample_data/tutorial_sample_dataset.csv"
        dataset = DataSet.read_csv(sample_path,
                                   headers=('raw_sentence', 'label'),
                                   sep='\t')
        print(len(dataset))
        print(dataset[0])
        print(dataset[-3])

        dataset.append(Instance(raw_sentence='fake data', label='0'))
        # Convert all letters to lowercase
        dataset.apply(lambda x: x['raw_sentence'].lower(),
                      new_field_name='raw_sentence')
        # Convert label to int
        dataset.apply(lambda x: int(x['label']), new_field_name='label')

        # Split sentences on whitespace
        def split_sent(ins):
            return ins['raw_sentence'].split()

        dataset.apply(split_sent, new_field_name='words')

        # Add sequence-length information
        dataset.apply(lambda x: len(x['words']), new_field_name='seq_len')
        print(len(dataset))
        print(dataset[0])

        # Filter out instances with DataSet.drop(func)
        dataset.drop(lambda x: x['seq_len'] <= 3)
        print(len(dataset))

        # Specify which fields of the DataSet should be converted to tensors
        # set target: the gold labels used when computing the loss and evaluating the model
        dataset.set_target("label")
        # set input: the fields fed to the model's forward()
        dataset.set_input("words", "seq_len")

        # Split off the test and training sets
        test_data, train_data = dataset.split(0.5)
        print(len(test_data))
        print(len(train_data))

        # Build the vocabulary with Vocabulary.add(word)
        vocab = Vocabulary(min_freq=2)
        train_data.apply(lambda x: [vocab.add(word) for word in x['words']])
        vocab.build_vocab()

        # Index the sentences with Vocabulary.to_index(word)
        train_data.apply(
            lambda x: [vocab.to_index(word) for word in x['words']],
            new_field_name='words')
        test_data.apply(
            lambda x: [vocab.to_index(word) for word in x['words']],
            new_field_name='words')
        print(test_data[0])

        # If you need to build reinforcement-learning or GAN-style projects, you can also reuse these data-preprocessing utilities
        from fastNLP.core.batch import Batch
        from fastNLP.core.sampler import RandomSampler

        batch_iterator = Batch(dataset=train_data,
                               batch_size=2,
                               sampler=RandomSampler())
        for batch_x, batch_y in batch_iterator:
            print("batch_x has: ", batch_x)
            print("batch_y has: ", batch_y)
            break

        from fastNLP.models.enas_model import ENASModel
        from fastNLP.models.enas_controller import Controller
        model = ENASModel(embed_num=len(vocab), num_classes=5)
        controller = Controller()

        from fastNLP.models.enas_trainer import ENASTrainer
        from copy import deepcopy

        # Rename the corresponding DataSet fields to match the parameter names of the model's forward()
        train_data.rename_field('words',
                                'word_seq')  # the input field must match the forward parameter
        train_data.rename_field('label', 'label_seq')
        test_data.rename_field('words', 'word_seq')
        test_data.rename_field('label', 'label_seq')

        loss = CrossEntropyLoss(pred="output", target="label_seq")
        metric = AccuracyMetric(pred="predict", target="label_seq")

        trainer = ENASTrainer(model=model,
                              controller=controller,
                              train_data=train_data,
                              dev_data=test_data,
                              loss=CrossEntropyLoss(pred="output",
                                                    target="label_seq"),
                              metrics=AccuracyMetric(pred="predict",
                                                     target="label_seq"),
                              check_code_level=-1,
                              save_path=None,
                              batch_size=32,
                              print_every=1,
                              n_epochs=3,
                              final_epochs=1)
        trainer.train()
        print('Train finished!')

        # Use the Tester to evaluate performance on test_data
        from fastNLP import Tester

        tester = Tester(data=test_data,
                        model=model,
                        metrics=AccuracyMetric(pred="predict",
                                               target="label_seq"),
                        batch_size=4)

        acc = tester.test()
        print(acc)
Example #25
    def test_split(self):
        ds = DataSet({"x": [[1, 2, 3, 4]] * 10, "y": [[5, 6]] * 10})
        d1, d2 = ds.split(0.1)
Example #26
def prepare_fake_dataset2(*args, size=100):
    ys = np.random.randint(4, size=size, dtype=np.int64)
    data = {'y': ys}
    for arg in args:
        data[arg] = np.random.randn(size, 5)
    return DataSet(data=data)
Example #27
    def test_get_all_fields(self):
        ds = DataSet({"x": [[1, 2, 3, 4]] * 10, "y": [[5, 6]] * 10})
        ans = ds.get_all_fields()
        self.assertEqual(ans["x"].content, [[1, 2, 3, 4]] * 10)
        self.assertEqual(ans["y"].content, [[5, 6]] * 10)
Example #28
def from_raw_text_new(chars, vocabs, w_list, number_normalized=False):
    from fastNLP.core import DataSet
    from utils import get_bigrams
    bigrams = get_bigrams(chars)
    seq_len = len(chars)
    target = ['O'] * seq_len
    dataset = DataSet({
        'chars': [chars],
        'bigrams': [bigrams],
        'seq_len': [seq_len],
        'target': [target]
    })
    datasets = {'train': dataset}

    def normalize_char(inp):
        result = []
        for c in inp:
            if c.isdigit():
                result.append('0')
            else:
                result.append(c)

        return result

    def normalize_bigram(inp):
        result = []
        for bi in inp:
            tmp = bi
            if tmp[0].isdigit():
                tmp = '0' + tmp[1:]
            if tmp[1].isdigit():
                tmp = tmp[0] + '0'

            result.append(tmp)
        return result

    if number_normalized == 3:
        print('not supported, exiting!')
        exit()
        for k, v in datasets.items():
            v.apply_field(normalize_char, 'chars', 'chars')
        vocabs['char'] = Vocabulary()
        vocabs['char'].from_dataset(
            datasets['train'],
            field_name='chars',
            no_create_entry_dataset=[datasets['dev'], datasets['test']])

        for k, v in datasets.items():
            v.apply_field(normalize_bigram, 'bigrams', 'bigrams')
        vocabs['bigram'] = Vocabulary()
        vocabs['bigram'].from_dataset(
            datasets['train'],
            field_name='bigrams',
            no_create_entry_dataset=[datasets['dev'], datasets['test']])

    def get_skip_path(chars, w_trie):
        sentence = ''.join(chars)
        result = w_trie.get_lexicon(sentence)
        # print(result)

        return result

    from V0.utils_ import Trie
    from functools import partial
    from fastNLP.core import Vocabulary
    from fastNLP.embeddings import StaticEmbedding
    from fastNLP import DataSet
    a = DataSet()
    a.apply
    w_trie = Trie()
    for w in w_list:
        w_trie.insert(w)

    import copy
    for k, v in datasets.items():
        v.apply_field(partial(get_skip_path, w_trie=w_trie), 'chars',
                      'lexicons')
        v.apply_field(copy.copy, 'chars', 'raw_chars')
        v.add_seq_len('lexicons', 'lex_num')
        v.apply_field(lambda x: list(map(lambda y: y[0], x)), 'lexicons',
                      'lex_s')
        v.apply_field(lambda x: list(map(lambda y: y[1], x)), 'lexicons',
                      'lex_e')

    if number_normalized == 1:
        print('not supported, exiting!')
        exit()
        for k, v in datasets.items():
            v.apply_field(normalize_char, 'chars', 'chars')
        vocabs['char'] = Vocabulary()
        vocabs['char'].from_dataset(
            datasets['train'],
            field_name='chars',
            no_create_entry_dataset=[datasets['dev'], datasets['test']])

    if number_normalized == 2:
        print('not supported, exiting!')
        exit()
        for k, v in datasets.items():
            v.apply_field(normalize_char, 'chars', 'chars')
        vocabs['char'] = Vocabulary()
        vocabs['char'].from_dataset(
            datasets['train'],
            field_name='chars',
            no_create_entry_dataset=[datasets['dev'], datasets['test']])

        for k, v in datasets.items():
            v.apply_field(normalize_bigram, 'bigrams', 'bigrams')
        vocabs['bigram'] = Vocabulary()
        vocabs['bigram'].from_dataset(
            datasets['train'],
            field_name='bigrams',
            no_create_entry_dataset=[datasets['dev'], datasets['test']])

    def concat(ins):
        chars = ins['chars']
        lexicons = ins['lexicons']
        result = chars + list(map(lambda x: x[2], lexicons))
        # print('lexicons:{}'.format(lexicons))
        # print('lex_only:{}'.format(list(filter(lambda x:x[2],lexicons))))
        # print('result:{}'.format(result))
        return result

    def get_pos_s(ins):
        lex_s = ins['lex_s']
        seq_len = ins['seq_len']
        pos_s = list(range(seq_len)) + lex_s

        return pos_s

    def get_pos_e(ins):
        lex_e = ins['lex_e']
        seq_len = ins['seq_len']
        pos_e = list(range(seq_len)) + lex_e

        return pos_e

    for k, v in datasets.items():
        v.apply(concat, new_field_name='lattice')
        v.set_input('lattice')
        v.apply(get_pos_s, new_field_name='pos_s')
        v.apply(get_pos_e, new_field_name='pos_e')
        v.set_input('pos_s', 'pos_e')

    # print(list(datasets['train'][:10]['lexicons']))
    # print(list(datasets['train'][:10]['lattice']))
    # print(list(datasets['train'][:10]['lex_s']))
    # print(list(datasets['train'][:10]['lex_e']))
    # print(list(datasets['train'][:10]['pos_s']))
    # print(list(datasets['train'][:10]['pos_e']))
    # exit(1208)

    # for k,v in datasets.items():
    #     v.apply_field(lambda x:[ list(map(lambda x:x[0],p)) for p in x],'skips_l2r','skips_l2r_source')
    #     v.apply_field(lambda x:[ list(map(lambda x:x[1],p)) for p in x], 'skips_l2r', 'skips_l2r_word')
    #
    # for k,v in datasets.items():
    #     v.apply_field(lambda x:[ list(map(lambda x:x[0],p)) for p in x],'skips_r2l','skips_r2l_source')
    #     v.apply_field(lambda x:[ list(map(lambda x:x[1],p)) for p in x], 'skips_r2l', 'skips_r2l_word')

    # for k,v in datasets.items():
    #     v.apply_field(lambda x:list(map(len,x)), 'skips_l2r_word', 'lexicon_count')
    #     v.apply_field(lambda x:
    #                   list(map(lambda y:
    #                            list(map(lambda z:word_vocab.to_index(z),y)),x)),
    #                   'skips_l2r_word',new_field_name='skips_l2r_word')
    #
    #     v.apply_field(lambda x:list(map(len,x)), 'skips_r2l_word', 'lexicon_count_back')
    #
    #     v.apply_field(lambda x:
    #                   list(map(lambda y:
    #                            list(map(lambda z:word_vocab.to_index(z),y)),x)),
    #                   'skips_r2l_word',new_field_name='skips_r2l_word')

    vocabs['char'].index_dataset(*(datasets.values()),
                                 field_name='chars',
                                 new_field_name='chars')
    vocabs['bigram'].index_dataset(*(datasets.values()),
                                   field_name='bigrams',
                                   new_field_name='bigrams')
    vocabs['label'].index_dataset(*(datasets.values()),
                                  field_name='target',
                                  new_field_name='target')
    vocabs['lattice'].index_dataset(*(datasets.values()),
                                    field_name='lattice',
                                    new_field_name='lattice')

    return datasets, vocabs
Example #29
    def test__repr__(self):
        ds = DataSet({"x": [[1, 2, 3, 4]] * 10, "y": [[5, 6]] * 10})
        for iter in ds:
            self.assertEqual(iter.__repr__(), "{'x': [1, 2, 3, 4] type=list,\n'y': [5, 6] type=list}")
Example #30
def get_fastnlp_dataset():
    # Hyper parameters
    output_dim = 10
    SEQUENCE_LENGTH = 28
    mnist_train_length = 60000
    validation_samples = 5000
    BATCH_SIZE = 1
    transform_train_test = transforms.Compose(
        [transforms.ToTensor(),
         transforms.Normalize((0.1307, ), (0.3081, ))])
    indices = torch.randperm(mnist_train_length)
    train_indices = indices[:len(indices) - validation_samples]
    val_indices = indices[len(indices) - validation_samples:]

    train_loader = torch.utils.data.DataLoader(
        datasets.MNIST('./data',
                       train=True,
                       download=True,
                       transform=transform_train_test),
        sampler=SubsetRandomSampler(train_indices),
        batch_size=BATCH_SIZE,
        shuffle=False)
    val_loader = torch.utils.data.DataLoader(
        datasets.MNIST('./data',
                       train=True,
                       download=False,
                       transform=transform_train_test),
        sampler=SubsetRandomSampler(val_indices),
        batch_size=BATCH_SIZE,
        shuffle=False)

    test_loader = torch.utils.data.DataLoader(datasets.MNIST(
        './data', train=False, transform=transform_train_test),
                                              batch_size=BATCH_SIZE,
                                              shuffle=True)

    TRAIN_ITERS = int(
        math.ceil((mnist_train_length - validation_samples) / BATCH_SIZE))
    VAL_ITERS = int(math.ceil(validation_samples / BATCH_SIZE))
    TEST_ITERS = int(math.ceil(len(test_loader.dataset) / BATCH_SIZE))

    train_data = DataSet()
    val_data = DataSet()
    test_data = DataSet()
    for batch_idx, (x, y) in enumerate(train_loader):
        x = x[0][0].numpy()
        train_data.append(Instance(word_seq=x, target=int(y)))
    for batch_idx, (x, y) in enumerate(val_loader):
        x = x[0][0].numpy()
        val_data.append(Instance(word_seq=x, target=int(y)))
    for batch_idx, (x, y) in enumerate(test_loader):
        x = x[0][0].numpy()
        test_data.append(Instance(word_seq=x, target=int(y)))

    # Set the input (feature) and target fields
    train_data.set_input("word_seq")
    test_data.set_input("word_seq")
    val_data.set_input("word_seq")
    train_data.set_target("target")
    test_data.set_target("target")
    val_data.set_target("target")

    return train_data, val_data, test_data
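# Hypothetical usage: each MNIST image becomes an Instance whose 'word_seq' field is a
# 28x28 numpy array (one row per "time step"), with the digit label as 'target'. It assumes
# the module-level imports used above (torch, torchvision transforms/datasets,
# SubsetRandomSampler, math, DataSet, Instance) are present.
train_data, val_data, test_data = get_fastnlp_dataset()
print(len(train_data), len(val_data), len(test_data))
print(train_data[0]['word_seq'].shape, train_data[0]['target'])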