def readdata():
    global target_len
    min_count = 10
    # categories = ['comp.os.ms-windows.misc', 'rec.motorcycles', 'sci.space', 'talk.politics.misc', ]
    dataset_train = fetch_20newsgroups(subset='train', data_home='../../..')
    dataset_test = fetch_20newsgroups(subset='test', data_home='../../..')

    data = dataset_train.data
    target = dataset_train.target
    target_len = len(dataset_train.target_names)
    train_data = DataSet()
    padding = 0
    for i in range(len(data)):
        data_t = re.sub(r"\d+|\s+|/", " ", data[i])
        temp = [word.strip(string.punctuation).lower() for word in data_t.split()
                if word.strip(string.punctuation) != '']
        train_data.append(Instance(raw=data[i], label=int(target[i]), words=temp))
        if len(temp) > padding:
            padding = len(temp)
    train_data.apply(lambda x: x['raw'].lower(), new_field_name='raw')

    data = dataset_test.data
    target = dataset_test.target
    test_data = DataSet()
    padding = 0
    for i in range(len(data)):
        data_t = re.sub(r"\d+|\s+|/", " ", data[i])
        temp = [word.strip(string.punctuation).lower() for word in data_t.split()
                if word.strip(string.punctuation) != '']
        test_data.append(Instance(raw=data[i], label=int(target[i]), words=temp))
        if len(temp) > padding:
            padding = len(temp)
    test_data.apply(lambda x: x['raw'].lower(), new_field_name='raw')

    train_data.apply(lambda x: len(x['words']), new_field_name='len')
    test_data.apply(lambda x: len(x['words']), new_field_name='len')

    vocab = Vocabulary(min_freq=min_count)
    train_data.apply(lambda x: [vocab.add(word) for word in x['words']])
    vocab.build_vocab()
    train_data.apply(lambda x: [vocab.to_index(word) for word in x['words']], new_field_name='seq')
    test_data.apply(lambda x: [vocab.to_index(word) for word in x['words']], new_field_name='seq')

    train_data.rename_field('seq', Const.INPUT)
    train_data.rename_field('len', Const.INPUT_LEN)
    train_data.rename_field('label', Const.TARGET)
    test_data.rename_field('seq', Const.INPUT)
    test_data.rename_field('len', Const.INPUT_LEN)
    test_data.rename_field('label', Const.TARGET)

    test_data.set_input(Const.INPUT, Const.INPUT_LEN)
    test_data.set_target(Const.TARGET)
    train_data.set_input(Const.INPUT, Const.INPUT_LEN)
    train_data.set_target(Const.TARGET)

    test_data, dev_data = test_data.split(0.5)
    return train_data, dev_data, test_data, vocab
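# A minimal driver sketch for readdata() above (hypothetical, not part of the
# original file): the returned DataSets already carry Const.INPUT /
# Const.INPUT_LEN / Const.TARGET, so they can be handed straight to a Trainer.
# CNNText and the Trainer arguments mirror their use elsewhere in this collection.
from fastNLP import Trainer, CrossEntropyLoss, AccuracyMetric
from fastNLP.models import CNNText

train_data, dev_data, test_data, vocab = readdata()
model = CNNText(embed_num=len(vocab), embed_dim=50, num_classes=target_len)
trainer = Trainer(train_data=train_data, model=model,
                  loss=CrossEntropyLoss(),   # defaults to Const.OUTPUT / Const.TARGET
                  metrics=AccuracyMetric(),
                  n_epochs=3, batch_size=32, dev_data=dev_data)
trainer.train()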
def test_add_field_v2(self):
    ds = DataSet({"x": [3, 4]})
    ds.add_field('y', [['hello', 'world'], ['this', 'is', 'a', 'test']], is_input=True, is_target=True)
    # ds.apply(lambda x: [x['x']] * 3, is_input=True, is_target=True, new_field_name='y')
    print(ds)
def test_add_null(self):
    # TODO test failed because 'fastNLP\core\field.py:143: RuntimeError'
    ds = DataSet()
    with self.assertRaises(RuntimeError) as RE:
        ds.add_field('test', [])
def test_drop(self):
    ds = DataSet({"x": [[1, 2, 3, 4]] * 40, "y": [[5, 6], [7, 8, 9, 0]] * 20})
    ds.drop(lambda ins: len(ins["y"]) < 3, inplace=True)
    self.assertEqual(len(ds), 20)
def test_get_target_name(self):
    ds = DataSet({"x": [[1, 2, 3, 4]] * 10, "y": [[5, 6]] * 10})
    self.assertEqual(ds.get_target_name(), [_ for _ in ds.field_arrays if ds.field_arrays[_].is_target])
def test_tutorial_1_data_preprocess(self):
    from fastNLP import DataSet
    data = {'raw_words': ["This is the first instance .", "Second instance .", "Third instance ."],
            'words': [['this', 'is', 'the', 'first', 'instance', '.'],
                      ['Second', 'instance', '.'],
                      ['Third', 'instance', '.']],
            'seq_len': [6, 3, 3]}
    dataset = DataSet(data)
    # every value of the dict passed in should be a list of the same length

    from fastNLP import DataSet
    from fastNLP import Instance
    dataset = DataSet()
    instance = Instance(raw_words="This is the first instance",
                        words=['this', 'is', 'the', 'first', 'instance', '.'],
                        seq_len=6)
    dataset.append(instance)

    from fastNLP import DataSet
    from fastNLP import Instance
    dataset = DataSet([
        Instance(raw_words="This is the first instance",
                 words=['this', 'is', 'the', 'first', 'instance', '.'],
                 seq_len=6),
        Instance(raw_words="Second instance .",
                 words=['Second', 'instance', '.'],
                 seq_len=3)
    ])

    from fastNLP import DataSet
    dataset = DataSet({'a': range(-5, 5), 'c': [0] * 10})
    # leave dataset unchanged; return a new DataSet with the matching instances removed
    dropped_dataset = dataset.drop(lambda ins: ins['a'] < 0, inplace=False)
    # remove the matching instances from dataset in place
    dataset.drop(lambda ins: ins['a'] < 0)
    # delete the third instance
    dataset.delete_instance(2)
    # delete the field named 'a'
    dataset.delete_field('a')
    # check whether a field named 'a' exists
    print(dataset.has_field('a'))  # or: ('a' in dataset)
    # rename the field 'c' to 'b'
    dataset.rename_field('c', 'b')
    # length of the DataSet
    len(dataset)

    from fastNLP import DataSet
    data = {'raw_words': ["This is the first instance .", "Second instance .", "Third instance ."]}
    dataset = DataSet(data)
    # split each sentence into words; see DataSet.apply()
    dataset.apply(lambda ins: ins['raw_words'].split(), new_field_name='words')
    # or use DataSet.apply_field()
    dataset.apply_field(lambda sent: sent.split(), field_name='raw_words', new_field_name='words')

    # besides lambdas, a named function can be passed in as well
    def get_words(instance):
        sentence = instance['raw_words']
        words = sentence.split()
        return words

    dataset.apply(get_words, new_field_name='words')
from fastNLP import Trainer, CrossEntropyLoss, AccuracyMetric
from fastNLP import DataSet, Vocabulary  # used below; missing from the original import list
from sklearn.datasets import fetch_20newsgroups
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import Parameter
from fastNLP.core.const import Const as C
from fastNLP.modules import encoder

trainSet = fetch_20newsgroups(subset='train')
testSet = fetch_20newsgroups(subset='test')
testData = {"data": testSet['data'], "target": testSet['target']}
trainData = {"data": trainSet['data'], "target": trainSet['target']}
trainData = DataSet(trainData)
testData = DataSet(testData)
trainData.apply(lambda x: x['data'].lower(), new_field_name='sentence')
trainData.apply(lambda x: x['sentence'].split(), new_field_name='words', is_input=True)

vocab = Vocabulary(min_freq=2)
vocab = vocab.from_dataset(trainData, field_name='words')
# change words to indices
vocab.index_dataset(trainData, field_name='words', new_field_name='words')
trainData.set_target('target')
train_data, dev_data = trainData.split(0.2)

# demo LSTM version
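# A minimal "demo LSTM version" sketch to go with the pipeline above. This is an
# assumption about what the model could look like, not the original model: it
# embeds the indexed 'words' field, runs a single-layer LSTM, and classifies the
# 20 newsgroups from the last hidden state. Field names follow the code above
# ('words' as input, 'target' as target), so the default loss/metric keys apply.
class LSTMText(nn.Module):
    def __init__(self, vocab_size, embed_dim=128, hidden_dim=128, num_classes=20):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, embed_dim)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, num_classes)

    def forward(self, words):
        # words: (batch, seq_len) indices produced by vocab.index_dataset above
        output, _ = self.lstm(self.embed(words))
        return {C.OUTPUT: self.fc(output[:, -1, :])}  # logits from the last step

model = LSTMText(len(vocab))
trainer = Trainer(train_data=train_data, model=model,
                  loss=CrossEntropyLoss(), metrics=AccuracyMetric(),
                  dev_data=dev_data, batch_size=32, n_epochs=3)
trainer.train()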
def preprocess(task, type, length=60):
    Task = []
    i = 0
    index = 0
    ind = []
    raw = []
    for File in filter(lambda s: s.find('.' + type + '.train') != -1, Filelist):
        raw1 = []
        j = 0
        with open(File, 'r', encoding='utf-8') as f:
            for line in f.readlines():
                j += 1
                if line[-3] == '-':
                    label = -1
                    onehot = [1, 0]
                else:
                    label = 1
                    onehot = [0, 1]
                raw1.append(Instance(sentence=line[:-3], label=label, onehot=onehot))
                raw.append(Instance(sentence=line[:-3], label=label, onehot=onehot))
        raw1 = DataSet(raw1)
        raw1.apply(lambda x: pre(x['sentence'], length), new_field_name='words')
        if File in [task + type + '.train' for task in test]:
            index = i
        if j <= 30:
            ind.append(i)
        i += 1
        Task.append(raw1)

    raw3 = []
    for File in filter(lambda s: s == task + type + '.dev', Filelist):
        with open(File, 'r', encoding='utf-8') as f:
            for line in f.readlines():
                if line[-3] == '-':
                    label = -1
                    onehot = [1, 0]
                else:
                    label = 1
                    onehot = [0, 1]
                raw3.append(Instance(sentence=line[:-3], label=label, onehot=onehot))
    raw3 = DataSet(raw3)
    raw3.apply(lambda x: pre(x['sentence'], length), new_field_name='words')

    raw2 = []
    for File in filter(lambda s: s == task + type + '.test', Filelist):
        raw2 = []
        with open(File, 'r', encoding='utf-8') as f:
            for line in f.readlines():
                if line[-3] == '-':
                    label = -1
                    onehot = [1, 0]
                else:
                    label = 1
                    onehot = [0, 1]
                raw2.append(Instance(sentence=line[:-3], label=label, onehot=onehot))
    raw2 = DataSet(raw2)
    raw2.apply(lambda x: pre(x['sentence'], length), new_field_name='words')

    raw = DataSet(raw)
    raw.apply(lambda x: pre(x['sentence'], length), new_field_name='words')
    vocab = Vocabulary(min_freq=2).from_dataset(raw, raw3, raw2, field_name='words')
    vocab.index_dataset(raw2, raw3, field_name='words', new_field_name='words')
    return Task, vocab, ind, index, raw3, raw2
from fastNLP.embeddings import BertEmbedding
from fastNLP.models import BertForSequenceClassification
from fastNLP import Trainer, CrossEntropyLoss, AccuracyMetric, Adam
from fastNLP import DataSet, Instance  # used below; missing from the original import list

'''import other package'''
import os
import csv
import torch

'''load the csv'''
file2label = {}
with open('dataset/annotations_metadata.csv') as f:
    for line in f:
        items = line.split(',')
        file2label[items[0] + ".txt"] = (1 if items[4].split('\n')[0] == 'hate' else 0)

'''load the Hate-Speech data'''
train_dataset = DataSet()
dev_dataset = DataSet()
test_dataset = DataSet()
cnt = 0
length = len(os.listdir("dataset/all_files"))
for file in os.listdir("dataset/all_files"):
    with open("dataset/all_files/" + file) as f:
        cnt += 1
        raw_words = f.read()
        words = raw_words.split()
        seq_len = len(words)
        if cnt > length * 0.9:
            test_dataset.append(
                Instance(raw_words=raw_words, words=words,
                         # the original snippet is truncated mid-call; completing
                         # it with seq_len and the label from file2label is an
                         # assumption based on the fields built above
                         seq_len=seq_len, target=file2label[file]))
def load_dataset(args):
    processors = {
        "cola": ColaProcessor,
        "mnli": MnliProcessor,
        "mrpc": MrpcProcessor,
    }
    num_labels_task = {
        "cola": 2,
        "mnli": 3,
        "mrpc": 2,
    }
    task_name = args.task_name.lower()
    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))
    processor = processors[task_name]()
    args.num_labels = num_labels_task[task_name]
    label_list = processor.get_labels()
    tokenizer = BertTokenizer.from_pretrained(args.bert_model)

    train_examples = None
    args.num_train_steps = None
    if args.do_train:
        train_examples = processor.get_train_examples(args.data_dir)
        args.num_train_steps = int(
            len(train_examples) / args.train_batch_size / 1 * args.num_train_epochs)

    # training dataset
    train_features = convert_examples_to_features(train_examples, label_list, args.max_seq_length, tokenizer)
    train_data = DataSet({
        "x": [f.input_ids for f in train_features],
        "segment_info": [f.segment_ids for f in train_features],
        "mask": [f.input_mask for f in train_features],
        "target": [f.label_id for f in train_features]
    })
    train_data.set_input('x', 'segment_info', 'mask')
    train_data.set_target('target')

    # dev dataset
    eval_examples = processor.get_dev_examples(args.data_dir)
    eval_features = convert_examples_to_features(eval_examples, label_list, args.max_seq_length, tokenizer)
    dev_data = DataSet({
        "x": [f.input_ids for f in eval_features],
        "segment_info": [f.segment_ids for f in eval_features],
        "mask": [f.input_mask for f in eval_features],
        "target": [f.label_id for f in eval_features]
    })
    dev_data.set_input('x', 'segment_info', 'mask')
    dev_data.set_target('target')
    return train_data, dev_data
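# Hypothetical call site for load_dataset() (these argument names simply mirror
# the attributes the function reads; the real flag definitions live elsewhere):
import argparse

args = argparse.Namespace(task_name='mrpc', data_dir='glue_data/MRPC',
                          bert_model='bert-base-uncased', max_seq_length=128,
                          do_train=True, train_batch_size=32, num_train_epochs=3)
train_data, dev_data = load_dataset(args)
print(len(train_data), len(dev_data))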
class TextData():
    vocab_size = 0
    dataset_size = 0
    train_size = 0
    test_size = 0
    class_num = 4
    min_count = 10
    max_seq_len = 500
    seq_limit = 2000
    data_src = "20news"
    data_set = DataSet()
    train_set = DataSet()
    test_set = DataSet()
    dev_set = DataSet()
    vocab = None

    def __init__(self, data_src="20news", min_count=10, seq_limit=None):
        self.data_src = data_src
        self.min_count = min_count
        if seq_limit is not None:
            self.seq_limit = seq_limit

    def find_max_len(self, words):
        self.max_seq_len = max(len(words), self.max_seq_len)

    def seq_regularize(self, words):
        wlen = len(words)
        if wlen < self.max_seq_len:
            return [0] * (self.max_seq_len - wlen) + words
        else:
            return words[:self.max_seq_len]

    def fetch_20news(self, size=4):
        print("Loading 20newsgroups data and tokenize.")
        if size == 20:
            train, test = get_all_20news()
        else:
            train, test = get_text_classification_datasets()
        train_input, test_input = tokenize(train.data, test.data)
        train_target = train.target
        test_target = test.target
        self.class_num = len(train.target_names)
        assert (self.class_num == len(test.target_names))

        # Building Fastnlp dataset.
        print("Building Fastnlp dataset.")
        self.train_set = DataSet({"text": train_input, "class": train_target})
        self.test_set = DataSet({"text": test_input, "class": test_target})

        # Building Fastnlp vocabulary.
        print("Building Fastnlp vocabulary.")
        self.vocab = Vocabulary(min_freq=self.min_count)
        self.train_set.apply(lambda x: [self.vocab.add_word(word) for word in x['text']])
        self.vocab.build_vocab()
        self.vocab.build_reverse_vocab()
        self.vocab_size = len(self.vocab)

        # Building id-representation for train_set and test_set.
        print("Building id-presentation for train_set and test_set.")
        self.vocab.index_dataset(self.train_set, self.test_set, field_name='text', new_field_name='words')
        self.train_set.apply_field(lambda x: len(x), field_name='words', new_field_name='seq_len')
        self.test_set.apply_field(lambda x: len(x), field_name='words', new_field_name='seq_len')
        self.train_set.apply_field(self.find_max_len, field_name='words')
        print(self.max_seq_len)
        self.max_seq_len = min(self.max_seq_len, self.seq_limit)
        self.train_set.apply_field(self.seq_regularize, field_name='words', new_field_name='words')
        self.test_set.apply_field(self.seq_regularize, field_name='words', new_field_name='words')
        # self.train_set.apply(lambda x: text2multi_hot(x['words'], self.vocab_size), new_field_name="input")
        # self.test_set.apply(lambda x: text2multi_hot(x['words'], self.vocab_size), new_field_name='input')

        # Building target-vector for train_set and test_set.
        print("Building target-vector for train_set and test_set.")
        self.train_set.apply(lambda x: int(x['class']), new_field_name="target", is_target=True)
        self.test_set.apply(lambda x: int(x['class']), new_field_name="target", is_target=True)
        # self.train_set.apply(lambda x: class2target(x['class'], self.class_num), new_field_name="target")
        # self.test_set.apply(lambda x: class2target(x['class'], self.class_num), new_field_name="target")

    def fetch_csv(self, path=None):
        print("Not implemented now...")
        pass

    def fetch_data(self, path=None):
        if self.data_src == "20news":
            # Loading 20newsgroups data and tokenize.
            self.fetch_20news()
        elif self.data_src == "20news_all":
            self.fetch_20news(size=20)
        else:
            print("No data src...")
        self.train_size = self.train_set.get_length()
        self.test_size = self.test_set.get_length()
        return self.train_size, self.test_size
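# A short usage sketch for TextData (assuming the module-level helpers it calls,
# get_text_classification_datasets()/tokenize(), are importable here):
text_data = TextData(data_src="20news", min_count=10, seq_limit=500)
train_size, test_size = text_data.fetch_data()
print(train_size, test_size, text_data.vocab_size, text_data.max_seq_len)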
data_bundle = get_data()
print(data_bundle)
train_word2bpes = data_bundle.train_word2bpes
target_word2bpes = data_bundle.target_word2bpes
print(
    f"In total {len(target_word2bpes)} target words, {len(train_word2bpes)} words."
)
pad_id = data_bundle.pad_id
lg_dict = getattr(data_bundle, 'lg_dict')
lg_shifts = getattr(data_bundle, 'lg_shift')
train_lg_shifts = getattr(data_bundle, 'train_lg_shift')

train_data = DataSet()
for name, ds in data_bundle.iter_datasets():
    if 'train' in name:
        for ins in ds:
            train_data.append(ins)
train_data.add_seq_len('input')
train_data.set_input('input', 'language_ids')
train_data.set_target('target')
train_data.set_pad_val('input', pad_id)
clip_max_length(train_data, data_bundle, max_sent_len=50)

model = JointBertReverseDict(pre_name, train_word2bpes, target_word2bpes,
import os
import pickle
from fastNLP import Vocabulary
from fastNLP import DataSet

root_path = '/remote-home/txsun/data/glue_data'

# RTE
print('processing RTE...')
dataset = 'RTE'
data_path = os.path.join(root_path, dataset)

## Train
print('reading train file...')
train_file = os.path.join(data_path, 'train.tsv')
train_ds = DataSet.read_csv(train_file, sep='\t')
train_ds.delete_field('index')
print(train_ds[0])
print(len(train_ds))

## Dev
print('reading dev file...')
dev_file = os.path.join(data_path, 'dev.tsv')
dev_ds = DataSet.read_csv(dev_file, sep='\t')
dev_ds.delete_field('index')
print(dev_ds[0])
print(len(dev_ds))

## Test
def main():
    parser = argparse.ArgumentParser()
    # fmt: off
    parser.add_argument("--data_path", required=True, type=str, help="all of datasets pkl paths")
    # fmt: on
    options, _ = parser.parse_known_args()

    train_set, test_set = DataSet(), DataSet()
    input_dir = os.path.join(options.data_path, "joint-sighan2008/bmes")
    options.output = os.path.join(options.data_path, "total_dataset.pkl")
    print(input_dir, options.output)

    for fn in os.listdir(input_dir):
        if fn not in ["test.txt", "train-all.txt"]:
            continue
        print(fn)
        abs_fn = os.path.join(input_dir, fn)
        ds = read_file(abs_fn)
        if "test.txt" == fn:
            test_set = ds
        else:
            train_set = ds
    print(
        "num samples of total train, test: {}, {}".format(len(train_set), len(test_set))
    )

    uni_vocab = Vocabulary(min_freq=None).from_dataset(
        train_set, test_set, field_name="ori_words"
    )
    # bi_vocab = Vocabulary(min_freq=3, max_size=50000).from_dataset(train_set, test_set, field_name="bi1")
    bi_vocab = Vocabulary(min_freq=3, max_size=None).from_dataset(
        train_set, field_name="bi1", no_create_entry_dataset=[test_set]
    )
    tag_vocab = Vocabulary(min_freq=None, padding="s", unknown=None).from_dataset(
        train_set, field_name="ori_tags"
    )
    task_vocab = Vocabulary(min_freq=None, padding=None, unknown=None).from_dataset(
        train_set, field_name="task"
    )

    def to_index(dataset):
        uni_vocab.index_dataset(dataset, field_name="ori_words", new_field_name="uni")
        tag_vocab.index_dataset(dataset, field_name="ori_tags", new_field_name="tags")
        task_vocab.index_dataset(dataset, field_name="task", new_field_name="task")
        dataset.apply_field(lambda x: x[1:], field_name="bi1", new_field_name="bi2")
        dataset.apply_field(lambda x: x[:-1], field_name="bi1", new_field_name="bi1")
        bi_vocab.index_dataset(dataset, field_name="bi1", new_field_name="bi1")
        bi_vocab.index_dataset(dataset, field_name="bi2", new_field_name="bi2")
        dataset.set_input("task", "uni", "bi1", "bi2", "seq_len")
        dataset.set_target("tags")
        return dataset

    train_set = to_index(train_set)
    test_set = to_index(test_set)

    output = {}
    output["train_set"] = train_set
    output["test_set"] = test_set
    output["uni_vocab"] = uni_vocab
    output["bi_vocab"] = bi_vocab
    output["tag_vocab"] = tag_vocab
    output["task_vocab"] = task_vocab
    print(tag_vocab.word2idx)
    print(task_vocab.word2idx)

    make_sure_path_exists(os.path.dirname(options.output))
    print("Saving dataset to {}".format(os.path.abspath(options.output)))
    with open(options.output, "wb") as outfile:
        dump(output, outfile)
    print(len(task_vocab), len(tag_vocab), len(uni_vocab), len(bi_vocab))

    dic = {}
    tokens = {}

    def process(words):
        name = words[0][1:-1]
        if name not in dic:
            dic[name] = set()
            tokens[name] = 0
        tokens[name] += len(words[1:])
        dic[name].update(words[1:])

    train_set.apply_field(process, "ori_words", None)
    for name in dic.keys():
        print(name, len(dic[name]), tokens[name])
    with open(os.path.join(os.path.dirname(options.output), "oovdict.pkl"), "wb") as f:
        dump(dic, f)

    def get_max_len(ds):
        global max_len
        max_len = 0

        def find_max_len(words):
            global max_len
            if max_len < len(words):
                max_len = len(words)

        ds.apply_field(find_max_len, "ori_words", None)
        return max_len

    print(
        "train max len: {}, test max len: {}".format(
            get_max_len(train_set), get_max_len(test_set)
        )
    )
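# Hypothetical entry point: the script is meant to be driven from the command line,
#   python <this_script>.py --data_path /path/to/data
# and writes total_dataset.pkl and oovdict.pkl under that directory.
if __name__ == "__main__":
    main()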
def equip_chinese_ner_with_lexicon(datasets, vocabs, embeddings, w_list,
                                   word_embedding_path=None,
                                   only_lexicon_in_train=False,
                                   word_char_mix_embedding_path=None,
                                   number_normalized=False,
                                   lattice_min_freq=1,
                                   only_train_min_freq=0):
    from fastNLP.core import Vocabulary

    def normalize_char(inp):
        result = []
        for c in inp:
            if c.isdigit():
                result.append('0')
            else:
                result.append(c)
        return result

    def normalize_bigram(inp):
        result = []
        for bi in inp:
            tmp = bi
            if tmp[0].isdigit():
                tmp = '0' + tmp[:1]
            if tmp[1].isdigit():
                tmp = tmp[0] + '0'
            result.append(tmp)
        return result

    if number_normalized == 3:
        for k, v in datasets.items():
            v.apply_field(normalize_char, 'chars', 'chars')
        vocabs['char'] = Vocabulary()
        vocabs['char'].from_dataset(datasets['train'], field_name='chars',
                                    no_create_entry_dataset=[datasets['dev'], datasets['test']])

        for k, v in datasets.items():
            v.apply_field(normalize_bigram, 'bigrams', 'bigrams')
        vocabs['bigram'] = Vocabulary()
        vocabs['bigram'].from_dataset(datasets['train'], field_name='bigrams',
                                      no_create_entry_dataset=[datasets['dev'], datasets['test']])

    if only_lexicon_in_train:
        print('Loading only the lexicon words that appear in train is supported')

    def get_skip_path(chars, w_trie):
        sentence = ''.join(chars)
        result = w_trie.get_lexicon(sentence)
        # print(result)
        return result

    from V0.utils_ import Trie
    from functools import partial
    from fastNLP.core import Vocabulary
    # from fastNLP.embeddings import StaticEmbedding
    from fastNLP_module import StaticEmbedding
    from fastNLP import DataSet

    w_trie = Trie()
    for w in w_list:
        w_trie.insert(w)

    if only_lexicon_in_train:
        lexicon_in_train = set()
        for s in datasets['train']['chars']:
            lexicon_in_s = w_trie.get_lexicon(s)
            for s, e, lexicon in lexicon_in_s:
                lexicon_in_train.add(''.join(lexicon))

        print('lexicon in train:{}'.format(len(lexicon_in_train)))
        print('i.e.: {}'.format(list(lexicon_in_train)[:10]))
        w_trie = Trie()
        for w in lexicon_in_train:
            w_trie.insert(w)

    import copy
    for k, v in datasets.items():
        v.apply_field(partial(get_skip_path, w_trie=w_trie), 'chars', 'lexicons')
        v.apply_field(copy.copy, 'chars', 'raw_chars')
        v.add_seq_len('lexicons', 'lex_num')
        v.apply_field(lambda x: list(map(lambda y: y[0], x)), 'lexicons', 'lex_s')
        v.apply_field(lambda x: list(map(lambda y: y[1], x)), 'lexicons', 'lex_e')

    if number_normalized == 1:
        for k, v in datasets.items():
            v.apply_field(normalize_char, 'chars', 'chars')
        vocabs['char'] = Vocabulary()
        vocabs['char'].from_dataset(datasets['train'], field_name='chars',
                                    no_create_entry_dataset=[datasets['dev'], datasets['test']])

    if number_normalized == 2:
        for k, v in datasets.items():
            v.apply_field(normalize_char, 'chars', 'chars')
        vocabs['char'] = Vocabulary()
        vocabs['char'].from_dataset(datasets['train'], field_name='chars',
                                    no_create_entry_dataset=[datasets['dev'], datasets['test']])

        for k, v in datasets.items():
            v.apply_field(normalize_bigram, 'bigrams', 'bigrams')
        vocabs['bigram'] = Vocabulary()
        vocabs['bigram'].from_dataset(datasets['train'], field_name='bigrams',
                                      no_create_entry_dataset=[datasets['dev'], datasets['test']])

    def concat(ins):
        chars = ins['chars']
        lexicons = ins['lexicons']
        result = chars + list(map(lambda x: x[2], lexicons))
        # print('lexicons:{}'.format(lexicons))
        # print('lex_only:{}'.format(list(filter(lambda x: x[2], lexicons))))
        # print('result:{}'.format(result))
        return result

    def get_pos_s(ins):
        lex_s = ins['lex_s']
        seq_len = ins['seq_len']
        pos_s = list(range(seq_len)) + lex_s
        return pos_s

    def get_pos_e(ins):
        lex_e = ins['lex_e']
        seq_len = ins['seq_len']
        pos_e = list(range(seq_len)) + lex_e
        return pos_e

    for k, v in datasets.items():
        v.apply(concat, new_field_name='lattice')
        v.set_input('lattice')
        v.apply(get_pos_s, new_field_name='pos_s')
        v.apply(get_pos_e, new_field_name='pos_e')
        v.set_input('pos_s', 'pos_e')

    # print(list(datasets['train'][:10]['lexicons']))
    # print(list(datasets['train'][:10]['lattice']))
    # print(list(datasets['train'][:10]['lex_s']))
    # print(list(datasets['train'][:10]['lex_e']))
    # print(list(datasets['train'][:10]['pos_s']))
    # print(list(datasets['train'][:10]['pos_e']))
    # exit(1208)

    word_vocab = Vocabulary()
    word_vocab.add_word_lst(w_list)
    vocabs['word'] = word_vocab

    lattice_vocab = Vocabulary()
    lattice_vocab.from_dataset(datasets['train'], field_name='lattice',
                               no_create_entry_dataset=[v for k, v in datasets.items() if k != 'train'])
    vocabs['lattice'] = lattice_vocab

    # for k, v in datasets.items():
    #     v.apply_field(lambda x: [list(map(lambda x: x[0], p)) for p in x], 'skips_l2r', 'skips_l2r_source')
    #     v.apply_field(lambda x: [list(map(lambda x: x[1], p)) for p in x], 'skips_l2r', 'skips_l2r_word')
    #
    # for k, v in datasets.items():
    #     v.apply_field(lambda x: [list(map(lambda x: x[0], p)) for p in x], 'skips_r2l', 'skips_r2l_source')
    #     v.apply_field(lambda x: [list(map(lambda x: x[1], p)) for p in x], 'skips_r2l', 'skips_r2l_word')
    #
    # for k, v in datasets.items():
    #     v.apply_field(lambda x: list(map(len, x)), 'skips_l2r_word', 'lexicon_count')
    #     v.apply_field(lambda x: list(map(lambda y: list(map(lambda z: word_vocab.to_index(z), y)), x)),
    #                   'skips_l2r_word', new_field_name='skips_l2r_word')
    #     v.apply_field(lambda x: list(map(len, x)), 'skips_r2l_word', 'lexicon_count_back')
    #     v.apply_field(lambda x: list(map(lambda y: list(map(lambda z: word_vocab.to_index(z), y)), x)),
    #                   'skips_r2l_word', new_field_name='skips_r2l_word')

    if word_embedding_path is not None:
        word_embedding = StaticEmbedding(word_vocab, word_embedding_path, word_dropout=0)
        embeddings['word'] = word_embedding

    if word_char_mix_embedding_path is not None:
        lattice_embedding = StaticEmbedding(lattice_vocab, word_char_mix_embedding_path,
                                            word_dropout=0.01,
                                            min_freq=lattice_min_freq,
                                            only_train_min_freq=only_train_min_freq)
        embeddings['lattice'] = lattice_embedding

    vocabs['char'].index_dataset(*(datasets.values()), field_name='chars', new_field_name='chars')
    vocabs['bigram'].index_dataset(*(datasets.values()), field_name='bigrams', new_field_name='bigrams')
    vocabs['label'].index_dataset(*(datasets.values()), field_name='target', new_field_name='target')
    vocabs['lattice'].index_dataset(*(datasets.values()), field_name='lattice', new_field_name='lattice')

    return datasets, vocabs, embeddings
def construct_dataset(dataset):
    dataset_ = DataSet()
    for sentence, target in zip(dataset.data, dataset.target):
        instance = Instance()
        instance['raw_sentence'] = sentence
        instance['target'] = int(target)
        dataset_.append(instance)
    dataset_.apply(lambda x: re.sub('[%s]' % re.escape(string.punctuation), '', x['raw_sentence']),
                   new_field_name='sentence')  # strip punctuation
    dataset_.apply(lambda x: re.sub('[%s]' % re.escape(string.whitespace), ' ', x['sentence']),
                   new_field_name='sentence')  # replace spaces, newlines and other whitespace with a single space
    dataset_.apply(lambda x: x['sentence'].lower(), new_field_name='sentence')  # lower-case
    dataset_.apply_field(lambda x: x.split(), field_name='sentence', new_field_name='input')
    return dataset_
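# Sketch of how construct_dataset() might be driven (an assumption mirroring the
# sklearn loaders used elsewhere in this collection):
from sklearn.datasets import fetch_20newsgroups

train_set = construct_dataset(fetch_20newsgroups(subset='train'))
test_set = construct_dataset(fetch_20newsgroups(subset='test'))
vocab = Vocabulary(min_freq=10).from_dataset(train_set, field_name='input')
vocab.index_dataset(train_set, test_set, field_name='input', new_field_name='input')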
def test_from_dataset(self):
    start_char = 65
    num_samples = 10

    # 0 dim
    dataset = DataSet()
    for i in range(num_samples):
        ins = Instance(char=chr(start_char + i))
        dataset.append(ins)
    vocab = Vocabulary()
    vocab.from_dataset(dataset, field_name='char')
    for i in range(num_samples):
        self.assertEqual(vocab.to_index(chr(start_char + i)), i + 2)
    vocab.index_dataset(dataset, field_name='char')

    # 1 dim
    dataset = DataSet()
    for i in range(num_samples):
        ins = Instance(char=[chr(start_char + i)] * 6)
        dataset.append(ins)
    vocab = Vocabulary()
    vocab.from_dataset(dataset, field_name='char')
    for i in range(num_samples):
        self.assertEqual(vocab.to_index(chr(start_char + i)), i + 2)
    vocab.index_dataset(dataset, field_name='char')

    # 2 dim
    dataset = DataSet()
    for i in range(num_samples):
        ins = Instance(char=[[chr(start_char + i) for _ in range(6)] for _ in range(6)])
        dataset.append(ins)
    vocab = Vocabulary()
    vocab.from_dataset(dataset, field_name='char')
    for i in range(num_samples):
        self.assertEqual(vocab.to_index(chr(start_char + i)), i + 2)
    vocab.index_dataset(dataset, field_name='char')
from fastNLP import Vocabulary
from fastNLP import DataSet

# Problem: fastNLP already provides a split() function to divide a dataset into a
# training set and a test set, but the standard benchmark datasets available online
# usually come with the train/test split already made. Moreover, using split() to
# partition the data randomly introduces a further problem: because the split is
# random every time, the vocabulary differs from run to run, so when a saved model
# is loaded again for testing, the mismatched vocabulary causes wildly different
# results.
#
# Solution: add a vocabulary save function and a vocabulary load function to
# Vocabulary instead of building a fresh vocabulary every time, which also cuts
# the cost of later runs. After generating the vocabulary once with save_vocab(),
# later runs can load it directly with load_vocab().

if __name__ == '__main__':
    data_path = "data_for_tests/tutorial_sample_dataset.csv"
    train_data = DataSet.read_csv(data_path, headers=('raw_sentence', 'label'), sep='\t')
    print('len(train_data)', len(train_data))

    # lower-case every letter
    train_data.apply(lambda x: x['raw_sentence'].lower(), new_field_name='raw_sentence')

    # convert label to int
    train_data.apply(lambda x: int(x['label']) - 1, new_field_name='label_seq', is_target=True)

    # split sentences on whitespace
    def split_sent(ins):
        return ins['raw_sentence'].split()
def get_batch_data():
    # Load train data
    X_train, Y_train, Sources_train, Targets_train = load_train_data()
    # calc total batch count
    num_batch = len(X_train) // Hyperparams.batch_size

    ds_train = DataSet()
    for i, x in enumerate(X_train):
        instance = Instance(word=Sources_train[i],
                            translated=Targets_train[i],
                            word_seq=x.tolist(),
                            translated_seq=Y_train[i].tolist())
        ds_train.append(instance)
    ds_train.set_input('word_seq', 'translated_seq')
    ds_train.set_target('translated_seq')

    # Load test data
    X_test, Y_test, Sources_test, Targets_test = load_test_data()
    # calc total batch count
    num_batch = len(X_test) // Hyperparams.batch_size

    ds_test = DataSet()
    for i, x in enumerate(X_test):
        instance = Instance(word=Sources_test[i],
                            translated=Targets_test[i],
                            word_seq=x.tolist(),
                            translated_seq=Y_test[i].tolist())
        ds_test.append(instance)
    ds_test.set_input('word_seq', 'translated_seq')
    ds_test.set_target('translated_seq')

    return ds_train, ds_test
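# Iterating the returned DataSets with fastNLP's Batch (a sketch; Hyperparams is
# this module's config object):
from fastNLP.core.batch import Batch
from fastNLP.core.sampler import RandomSampler

ds_train, ds_test = get_batch_data()
for batch_x, batch_y in Batch(dataset=ds_train, batch_size=Hyperparams.batch_size,
                              sampler=RandomSampler()):
    print(batch_x['word_seq'].shape, batch_y['translated_seq'].shape)
    break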
def test_tutorial(self):
    # read data from csv into a DataSet
    sample_path = "./data_for_tests/tutorial_sample_dataset.csv"
    dataset = DataSet.read_csv(sample_path, headers=('raw_sentence', 'label'), sep='\t')
    print(len(dataset))
    print(dataset[0])

    dataset.append(Instance(raw_sentence='fake data', label='0'))
    dataset.apply(lambda x: x['raw_sentence'].lower(), new_field_name='raw_sentence')
    # convert label to int
    dataset.apply(lambda x: int(x['label']), new_field_name='label')

    # split sentences on whitespace
    def split_sent(ins):
        return ins['raw_sentence'].split()

    dataset.apply(split_sent, new_field_name='words')
    # add length information
    dataset.apply(lambda x: len(x['words']), new_field_name='seq_len')
    # print(len(dataset))
    # print(dataset[0])

    # filter out instances with DataSet.drop(func)
    dataset.drop(lambda x: x['seq_len'] <= 3)
    print(len(dataset))

    # configure which fields of the DataSet get converted to tensors
    # set target: the golden labels used when computing the loss and evaluating the model
    dataset.set_target("label")
    # set input: used in the model's forward
    dataset.set_input("words")

    # split off a test set and a training set
    test_data, train_data = dataset.split(0.5)
    # print(len(test_data))
    # print(len(train_data))

    # build the vocabulary with Vocabulary.add(word)
    vocab = Vocabulary(min_freq=2)
    train_data.apply(lambda x: [vocab.add(word) for word in x['words']])
    vocab.build_vocab()

    # index the sentences with Vocabulary.to_index(word)
    train_data.apply(
        lambda x: [vocab.to_index(word) for word in x['words']],
        new_field_name='words')
    test_data.apply(
        lambda x: [vocab.to_index(word) for word in x['words']],
        new_field_name='words')
    print(test_data[0])

    model = CNNText(embed_num=len(vocab), embed_dim=50, num_classes=5, padding=2, dropout=0.1)

    from fastNLP import Trainer
    from copy import deepcopy

    # rename the corresponding DataSet fields to match the parameter names of the model's forward
    train_data.rename_field('words', 'word_seq')  # input field matches the forward parameter
    train_data.rename_field('label', 'label_seq')
    test_data.rename_field('words', 'word_seq')
    test_data.rename_field('label', 'label_seq')

    # instantiate a Trainer with the model and data, then train
    copy_model = deepcopy(model)
    overfit_trainer = Trainer(train_data=test_data, model=copy_model,
                              loss=CrossEntropyLoss(pred="output", target="label_seq"),
                              metrics=AccuracyMetric(pred="predict", target="label_seq"),
                              n_epochs=10, batch_size=4, dev_data=test_data,
                              save_path="./save")
    overfit_trainer.train()

    trainer = Trainer(train_data=train_data, model=model,
                      loss=CrossEntropyLoss(pred="output", target="label_seq"),
                      metrics=AccuracyMetric(pred="predict", target="label_seq"),
                      n_epochs=10, batch_size=4, dev_data=test_data,
                      save_path="./save")
    trainer.train()
    print('Train finished!')

    # evaluate with fastNLP's Tester
    tester = Tester(data=test_data, model=model,
                    metrics=AccuracyMetric(pred="predict", target="label_seq"),
                    batch_size=4)
    acc = tester.test()
    print(acc)
def test_len_(self):
    ds = DataSet({"x": [[1, 2, 3, 4]] * 40, "y": [[5, 6]] * 40})
    self.assertEqual(len(ds), 40)

    ds = DataSet()
    self.assertEqual(len(ds), 0)
def make_dataset(data):
    dataset = DataSet()
    mx = 0
    le = None
    for x, y in zip(data.data, data.target):
        xx = deal(x)
        ins = Instance(sentence=xx, label=int(y))
        if mx < len(xx.split()):
            mx = max(mx, len(xx.split()))
            le = xx
        dataset.append(ins)
    print(mx)
    dataset.apply_field(lambda x: x.split(), field_name='sentence', new_field_name='words')
    dataset.apply_field(lambda x: len(x), field_name='words', new_field_name='seq_len')
    dataset.rename_field('words', Const.INPUT)
    dataset.rename_field('seq_len', Const.INPUT_LEN)
    dataset.rename_field('label', Const.TARGET)
    dataset.set_input(Const.INPUT, Const.INPUT_LEN)
    dataset.set_target(Const.TARGET)
    return dataset
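# Hypothetical driver for make_dataset(), following the 20newsgroups pattern used
# elsewhere in this collection (deal() is this module's text-cleaning helper):
from sklearn.datasets import fetch_20newsgroups

train_set = make_dataset(fetch_20newsgroups(subset='train'))
test_set = make_dataset(fetch_20newsgroups(subset='test'))
vocab = Vocabulary(min_freq=2).from_dataset(train_set, field_name=Const.INPUT)
vocab.index_dataset(train_set, test_set, field_name=Const.INPUT, new_field_name=Const.INPUT)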
def test_contains(self):
    ds = DataSet({"x": [[1, 2, 3, 4]] * 40, "y": [[5, 6]] * 40})
    self.assertTrue("x" in ds)
    self.assertTrue("y" in ds)
    self.assertFalse("z" in ds)
def testENAS(self):
    # read data from csv into a DataSet
    sample_path = "tutorials/sample_data/tutorial_sample_dataset.csv"
    dataset = DataSet.read_csv(sample_path, headers=('raw_sentence', 'label'), sep='\t')
    print(len(dataset))
    print(dataset[0])
    print(dataset[-3])

    dataset.append(Instance(raw_sentence='fake data', label='0'))
    # lower-case all the letters
    dataset.apply(lambda x: x['raw_sentence'].lower(), new_field_name='raw_sentence')
    # convert label to int
    dataset.apply(lambda x: int(x['label']), new_field_name='label')

    # split sentences on whitespace
    def split_sent(ins):
        return ins['raw_sentence'].split()

    dataset.apply(split_sent, new_field_name='words')
    # add length information
    dataset.apply(lambda x: len(x['words']), new_field_name='seq_len')
    print(len(dataset))
    print(dataset[0])

    # filter out instances with DataSet.drop(func)
    dataset.drop(lambda x: x['seq_len'] <= 3)
    print(len(dataset))

    # configure which fields of the DataSet get converted to tensors
    # set target: the golden labels used when computing the loss and evaluating the model
    dataset.set_target("label")
    # set input: used in the model's forward
    dataset.set_input("words", "seq_len")

    # split off a test set and a training set
    test_data, train_data = dataset.split(0.5)
    print(len(test_data))
    print(len(train_data))

    # build the vocabulary with Vocabulary.add(word)
    vocab = Vocabulary(min_freq=2)
    train_data.apply(lambda x: [vocab.add(word) for word in x['words']])
    vocab.build_vocab()

    # index the sentences with Vocabulary.to_index(word)
    train_data.apply(
        lambda x: [vocab.to_index(word) for word in x['words']],
        new_field_name='words')
    test_data.apply(
        lambda x: [vocab.to_index(word) for word in x['words']],
        new_field_name='words')
    print(test_data[0])

    # these preprocessing tools can also be used for projects such as
    # reinforcement learning or GANs
    from fastNLP.core.batch import Batch
    from fastNLP.core.sampler import RandomSampler

    batch_iterator = Batch(dataset=train_data, batch_size=2, sampler=RandomSampler())
    for batch_x, batch_y in batch_iterator:
        print("batch_x has: ", batch_x)
        print("batch_y has: ", batch_y)
        break

    from fastNLP.models.enas_model import ENASModel
    from fastNLP.models.enas_controller import Controller
    model = ENASModel(embed_num=len(vocab), num_classes=5)
    controller = Controller()

    from fastNLP.models.enas_trainer import ENASTrainer
    from copy import deepcopy

    # rename the corresponding DataSet fields to match the parameter names of the model's forward
    train_data.rename_field('words', 'word_seq')  # input field matches the forward parameter
    train_data.rename_field('label', 'label_seq')
    test_data.rename_field('words', 'word_seq')
    test_data.rename_field('label', 'label_seq')

    loss = CrossEntropyLoss(pred="output", target="label_seq")
    metric = AccuracyMetric(pred="predict", target="label_seq")

    trainer = ENASTrainer(model=model, controller=controller,
                          train_data=train_data, dev_data=test_data,
                          loss=CrossEntropyLoss(pred="output", target="label_seq"),
                          metrics=AccuracyMetric(pred="predict", target="label_seq"),
                          check_code_level=-1, save_path=None, batch_size=32,
                          print_every=1, n_epochs=3, final_epochs=1)
    trainer.train()
    print('Train finished!')

    # call Tester to evaluate performance on test_data
    from fastNLP import Tester
    tester = Tester(data=test_data, model=model,
                    metrics=AccuracyMetric(pred="predict", target="label_seq"),
                    batch_size=4)
    acc = tester.test()
    print(acc)
def test_split(self):
    ds = DataSet({"x": [[1, 2, 3, 4]] * 10, "y": [[5, 6]] * 10})
    d1, d2 = ds.split(0.1)
def prepare_fake_dataset2(*args, size=100):
    # use the size parameter for 'y' as well; the hard-coded 100 would break for size != 100
    ys = np.random.randint(4, size=size, dtype=np.int64)
    data = {'y': ys}
    for arg in args:
        data[arg] = np.random.randn(size, 5)
    return DataSet(data=data)
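# prepare_fake_dataset2 in use (a sketch): each positional name becomes a float
# feature field of shape (size, 5), alongside the integer 'y' field.
ds = prepare_fake_dataset2('x1', 'x2', size=100)
ds.set_input('x1', 'x2')
ds.set_target('y')
print(len(ds))  # 100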
def test_get_all_fields(self):
    ds = DataSet({"x": [[1, 2, 3, 4]] * 10, "y": [[5, 6]] * 10})
    ans = ds.get_all_fields()
    self.assertEqual(ans["x"].content, [[1, 2, 3, 4]] * 10)
    self.assertEqual(ans["y"].content, [[5, 6]] * 10)
def from_raw_text_new(chars, vocabs, w_list, number_normalized=False):
    from fastNLP.core import DataSet
    from utils import get_bigrams

    bigrams = get_bigrams(chars)
    seq_len = len(chars)
    target = ['O'] * seq_len
    dataset = DataSet({
        'chars': [chars],
        'bigrams': [bigrams],
        'seq_len': [seq_len],
        'target': [target]
    })
    datasets = {'train': dataset}

    def normalize_char(inp):
        result = []
        for c in inp:
            if c.isdigit():
                result.append('0')
            else:
                result.append(c)
        return result

    def normalize_bigram(inp):
        result = []
        for bi in inp:
            tmp = bi
            if tmp[0].isdigit():
                tmp = '0' + tmp[:1]
            if tmp[1].isdigit():
                tmp = tmp[0] + '0'
            result.append(tmp)
        return result

    if number_normalized == 3:
        print('not supported, exiting!')
        exit()
        for k, v in datasets.items():
            v.apply_field(normalize_char, 'chars', 'chars')
        vocabs['char'] = Vocabulary()
        vocabs['char'].from_dataset(
            datasets['train'], field_name='chars',
            no_create_entry_dataset=[datasets['dev'], datasets['test']])

        for k, v in datasets.items():
            v.apply_field(normalize_bigram, 'bigrams', 'bigrams')
        vocabs['bigram'] = Vocabulary()
        vocabs['bigram'].from_dataset(
            datasets['train'], field_name='bigrams',
            no_create_entry_dataset=[datasets['dev'], datasets['test']])

    def get_skip_path(chars, w_trie):
        sentence = ''.join(chars)
        result = w_trie.get_lexicon(sentence)
        # print(result)
        return result

    from V0.utils_ import Trie
    from functools import partial
    from fastNLP.core import Vocabulary
    from fastNLP.embeddings import StaticEmbedding
    from fastNLP import DataSet

    w_trie = Trie()
    for w in w_list:
        w_trie.insert(w)

    import copy
    for k, v in datasets.items():
        v.apply_field(partial(get_skip_path, w_trie=w_trie), 'chars', 'lexicons')
        v.apply_field(copy.copy, 'chars', 'raw_chars')
        v.add_seq_len('lexicons', 'lex_num')
        v.apply_field(lambda x: list(map(lambda y: y[0], x)), 'lexicons', 'lex_s')
        v.apply_field(lambda x: list(map(lambda y: y[1], x)), 'lexicons', 'lex_e')

    if number_normalized == 1:
        print('not supported, exiting!')
        exit()
        for k, v in datasets.items():
            v.apply_field(normalize_char, 'chars', 'chars')
        vocabs['char'] = Vocabulary()
        vocabs['char'].from_dataset(
            datasets['train'], field_name='chars',
            no_create_entry_dataset=[datasets['dev'], datasets['test']])

    if number_normalized == 2:
        print('not supported, exiting!')
        exit()
        for k, v in datasets.items():
            v.apply_field(normalize_char, 'chars', 'chars')
        vocabs['char'] = Vocabulary()
        vocabs['char'].from_dataset(
            datasets['train'], field_name='chars',
            no_create_entry_dataset=[datasets['dev'], datasets['test']])

        for k, v in datasets.items():
            v.apply_field(normalize_bigram, 'bigrams', 'bigrams')
        vocabs['bigram'] = Vocabulary()
        vocabs['bigram'].from_dataset(
            datasets['train'], field_name='bigrams',
            no_create_entry_dataset=[datasets['dev'], datasets['test']])

    def concat(ins):
        chars = ins['chars']
        lexicons = ins['lexicons']
        result = chars + list(map(lambda x: x[2], lexicons))
        # print('lexicons:{}'.format(lexicons))
        # print('lex_only:{}'.format(list(filter(lambda x: x[2], lexicons))))
        # print('result:{}'.format(result))
        return result

    def get_pos_s(ins):
        lex_s = ins['lex_s']
        seq_len = ins['seq_len']
        pos_s = list(range(seq_len)) + lex_s
        return pos_s

    def get_pos_e(ins):
        lex_e = ins['lex_e']
        seq_len = ins['seq_len']
        pos_e = list(range(seq_len)) + lex_e
        return pos_e

    for k, v in datasets.items():
        v.apply(concat, new_field_name='lattice')
        v.set_input('lattice')
        v.apply(get_pos_s, new_field_name='pos_s')
        v.apply(get_pos_e, new_field_name='pos_e')
        v.set_input('pos_s', 'pos_e')

    # print(list(datasets['train'][:10]['lexicons']))
    # print(list(datasets['train'][:10]['lattice']))
    # print(list(datasets['train'][:10]['lex_s']))
    # print(list(datasets['train'][:10]['lex_e']))
    # print(list(datasets['train'][:10]['pos_s']))
    # print(list(datasets['train'][:10]['pos_e']))
    # exit(1208)

    # for k, v in datasets.items():
    #     v.apply_field(lambda x: [list(map(lambda x: x[0], p)) for p in x], 'skips_l2r', 'skips_l2r_source')
    #     v.apply_field(lambda x: [list(map(lambda x: x[1], p)) for p in x], 'skips_l2r', 'skips_l2r_word')
    #
    # for k, v in datasets.items():
    #     v.apply_field(lambda x: [list(map(lambda x: x[0], p)) for p in x], 'skips_r2l', 'skips_r2l_source')
    #     v.apply_field(lambda x: [list(map(lambda x: x[1], p)) for p in x], 'skips_r2l', 'skips_r2l_word')
    #
    # for k, v in datasets.items():
    #     v.apply_field(lambda x: list(map(len, x)), 'skips_l2r_word', 'lexicon_count')
    #     v.apply_field(lambda x: list(map(lambda y: list(map(lambda z: word_vocab.to_index(z), y)), x)),
    #                   'skips_l2r_word', new_field_name='skips_l2r_word')
    #     v.apply_field(lambda x: list(map(len, x)), 'skips_r2l_word', 'lexicon_count_back')
    #     v.apply_field(lambda x: list(map(lambda y: list(map(lambda z: word_vocab.to_index(z), y)), x)),
    #                   'skips_r2l_word', new_field_name='skips_r2l_word')

    vocabs['char'].index_dataset(*(datasets.values()), field_name='chars', new_field_name='chars')
    vocabs['bigram'].index_dataset(*(datasets.values()), field_name='bigrams', new_field_name='bigrams')
    vocabs['label'].index_dataset(*(datasets.values()), field_name='target', new_field_name='target')
    vocabs['lattice'].index_dataset(*(datasets.values()), field_name='lattice', new_field_name='lattice')

    return datasets, vocabs
def test__repr__(self):
    ds = DataSet({"x": [[1, 2, 3, 4]] * 10, "y": [[5, 6]] * 10})
    for ins in ds:
        self.assertEqual(ins.__repr__(),
                         "{'x': [1, 2, 3, 4] type=list,\n'y': [5, 6] type=list}")
def get_fastnlp_dataset():
    # Hyper parameters
    output_dim = 10
    SEQUENCE_LENGTH = 28
    mnist_train_length = 60000
    validation_samples = 5000
    BATCH_SIZE = 1

    transform_train_test = transforms.Compose(
        [transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))])

    indices = torch.randperm(mnist_train_length)
    train_indices = indices[:len(indices) - validation_samples]
    val_indices = indices[len(indices) - validation_samples:]

    train_loader = torch.utils.data.DataLoader(
        datasets.MNIST('./data', train=True, download=True, transform=transform_train_test),
        sampler=SubsetRandomSampler(train_indices),
        batch_size=BATCH_SIZE, shuffle=False)
    val_loader = torch.utils.data.DataLoader(
        datasets.MNIST('./data', train=True, download=False, transform=transform_train_test),
        sampler=SubsetRandomSampler(val_indices),
        batch_size=BATCH_SIZE, shuffle=False)
    test_loader = torch.utils.data.DataLoader(
        datasets.MNIST('./data', train=False, transform=transform_train_test),
        batch_size=BATCH_SIZE, shuffle=True)

    TRAIN_ITERS = int(math.ceil((mnist_train_length - validation_samples) / BATCH_SIZE))
    VAL_ITERS = int(math.ceil(validation_samples / BATCH_SIZE))
    TEST_ITERS = int(math.ceil(len(test_loader.dataset) / BATCH_SIZE))

    train_data = DataSet()
    val_data = DataSet()
    test_data = DataSet()
    for batch_idx, (x, y) in enumerate(train_loader):
        x = x[0][0].numpy()
        train_data.append(Instance(word_seq=x, target=int(y)))
    for batch_idx, (x, y) in enumerate(val_loader):
        x = x[0][0].numpy()
        val_data.append(Instance(word_seq=x, target=int(y)))
    for batch_idx, (x, y) in enumerate(test_loader):
        x = x[0][0].numpy()
        test_data.append(Instance(word_seq=x, target=int(y)))

    # set the input fields and the target fields
    train_data.set_input("word_seq")
    test_data.set_input("word_seq")
    val_data.set_input("word_seq")
    train_data.set_target("target")
    test_data.set_target("target")
    val_data.set_target("target")

    return train_data, val_data, test_data
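# Consumption sketch (hypothetical): the MNIST DataSets above iterate like any
# other fastNLP dataset, e.g. through Batch; each 'word_seq' is a 28x28 array.
from fastNLP.core.batch import Batch
from fastNLP.core.sampler import RandomSampler

train_data, val_data, test_data = get_fastnlp_dataset()
for batch_x, batch_y in Batch(dataset=train_data, batch_size=16, sampler=RandomSampler()):
    print(batch_x['word_seq'].shape, batch_y['target'].shape)
    break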