Example #1
def get_fastnlp_dataset():
    text_train, text_test = get_text_classification_datasets()
    train_data = DataSet()
    test_data = DataSet()
    for i in range(len(text_train.data)):
        train_data.append(
            Instance(text=split_sent(text_train.data[i]),
                     target=int(text_train.target[i])))
    for i in range(len(text_test.data)):
        test_data.append(
            Instance(text=split_sent(text_test.data[i]),
                     target=int(text_test.target[i])))

    # Build the vocabulary
    vocab = Vocabulary(min_freq=5, unknown='<unk>', padding='<pad>')
    train_data.apply(lambda x: [vocab.add(word) for word in x['text']])
    vocab.build_vocab()

    # Map sentences to index sequences with the vocabulary
    train_data.apply(lambda x: [vocab.to_index(word) for word in x['text']],
                     new_field_name='word_seq')
    test_data.apply(lambda x: [vocab.to_index(word) for word in x['text']],
                    new_field_name='word_seq')

    # Mark the input and target fields
    train_data.set_input("word_seq")
    test_data.set_input("word_seq")
    train_data.set_target("target")
    test_data.set_target("target")

    return train_data, test_data, vocab
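A minimal usage sketch of the function above (the batch size is arbitrary; DataSetIter and SequentialSampler are the same fastNLP helpers used in the test examples further down):

train_data, test_data, vocab = get_fastnlp_dataset()
batch_iter = DataSetIter(train_data, batch_size=8, sampler=SequentialSampler())
for batch_x, batch_y in batch_iter:
    # batch_x holds the padded 'word_seq' indices, batch_y the 'target' labels
    print(batch_x['word_seq'].shape, batch_y['target'].shape)
    break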
def preprocess():
    train_set = DataSet()
    for i in range(len(raw_train.data)):
        train_set.append(
            Instance(sentence=raw_train.data[i],
                     target=int(raw_train.target[i])))

    train_set.apply(lambda x: x['sentence'].translate(
        str.maketrans("", "", string.punctuation)).lower(),
                    new_field_name='sentence')
    train_set.apply(lambda x: x['sentence'].split(), new_field_name='words')
    train_set.apply(lambda x: len(x['words']), new_field_name='seq_len')

    test_set = DataSet()
    for i in range(len(raw_test.data)):
        test_set.append(
            Instance(sentence=raw_test.data[i],
                     target=int(raw_test.target[i])))

    test_set.apply(lambda x: x['sentence'].translate(
        str.maketrans("", "", string.punctuation)).lower(),
                   new_field_name='sentence')
    test_set.apply(lambda x: x['sentence'].split(), new_field_name='words')
    test_set.apply(lambda x: len(x['words']), new_field_name='seq_len')

    vocab = Vocabulary(min_freq=10)
    train_set.apply(lambda x: [vocab.add(word) for word in x['words']])
    test_set.apply(lambda x: [vocab.add(word) for word in x['words']])
    vocab.build_vocab()
    vocab.index_dataset(train_set, field_name='words', new_field_name='words')
    vocab.index_dataset(test_set, field_name='words', new_field_name='words')

    return train_set, test_set, vocab
def create_dataset():
        # categories = ['alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space', 'rec.motorcycles']
        # categories = ['alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space', 'rec.motorcycles', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale']
        categories = ['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware',
                      'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball',
                      'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space',
                      'soc.religion.christian', 'talk.politics.guns',
                      'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']

        newsgroups_train = fetch_20newsgroups(subset='train', categories=categories, data_home='../../..')
        newsgroups_test = fetch_20newsgroups(subset='test', categories=categories, data_home='../../..')

        dataset = DataSet()

        for i in range(len(newsgroups_train.data)):
            if len(newsgroups_train.data[i]) <= 2000:
                dataset.append(Instance(raw_sentence=newsgroups_train.data[i], target=int(newsgroups_train.target[i])))
        for i in range(len(newsgroups_test.data)):
            if len(newsgroups_test.data[i]) <= 2000:
                dataset.append(Instance(raw_sentence=newsgroups_test.data[i], target=int(newsgroups_test.target[i])))

        dataset.apply(lambda x: x['raw_sentence'].lower(), new_field_name='sentence')
        dataset.apply(lambda x: x['sentence'].split(), new_field_name='words')
        dataset.apply(lambda x: len(x['words']), new_field_name='seq_len')

        vocab = Vocabulary(min_freq=2).from_dataset(dataset, field_name='words')
        vocab.index_dataset(dataset, field_name='words', new_field_name='words')

        dataset.set_input('words', 'seq_len')
        dataset.set_target('target')

        train_dev_data, test_data = dataset.split(0.1)
        train_data, dev_data = train_dev_data.split(0.1)

        return vocab, train_data, dev_data, test_data
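A possible follow-up sketch (batch size is arbitrary): bucket the variable-length posts by 'seq_len' and batch them, reusing the BucketSampler / DataSetIter pattern shown in later examples on this page.

vocab, train_data, dev_data, test_data = create_dataset()
sampler = BucketSampler(batch_size=32, seq_len_field_name='seq_len')
train_iter = DataSetIter(train_data, batch_size=32, sampler=sampler)
for batch_x, batch_y in train_iter:
    print(batch_x['words'].shape, batch_x['seq_len'])
    break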
Example #4
    def load(self, folder):
        fns = {
            'dev':'{}_dev.csv'.format(self.lg1_lg2),
            'test':'{}_test500.csv'.format(self.lg1_lg2),
            'train': '{}_train500_10.csv'.format(self.lg1_lg2)
        }
        target_lg = self.lg1_lg2.split('_')[0]
        data_bundle = DataBundle()
        for name, fn in fns.items():
            path = os.path.join(folder, fn)
            ds = DataSet()
            with open(path, 'r', encoding='utf-8') as f:
                for line in f:
                    line = line.strip()
                    if line:
                        parts = line.split('\t')
                        if self.lower:
                            ins = Instance(word=parts[1].lower(), definition=parts[-1].lower())
                        else:
                            ins = Instance(word=parts[1], definition=parts[-1])
                        ds.append(ins)
            data_bundle.set_dataset(ds, name=name)
        target_words = {}
        with open(os.path.join(folder, '{}.txt'.format(target_lg)), encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if line:
                    if self.lower:
                        line = line.lower()
                    target_words[line] = 1
        target_words = list(target_words.keys())

        setattr(data_bundle, 'target_words', target_words)
        return data_bundle
Example #5
 def test_list_of_numpy_to_tensor(self):
     ds = DataSet([Instance(x=np.array([1, 2]), y=np.array([3, 4])) for _ in range(2)] +
                  [Instance(x=np.array([1, 2, 3, 4]), y=np.array([3, 4, 5, 6])) for _ in range(2)])
     ds.set_input("x")
     ds.set_target("y")
     batch_iter = DataSetIter(ds, batch_size=4, sampler=SequentialSampler(), as_numpy=False)
     for x, y in batch_iter:
         print(x, y)
 def test_init(self):
     fields = {"x": [1, 2, 3], "y": [4, 5, 6]}
     ins = Instance(x=[1, 2, 3], y=[4, 5, 6])
     self.assertTrue(isinstance(ins.fields, dict))
     self.assertEqual(ins.fields, fields)
     
     ins = Instance(**fields)
     self.assertEqual(ins.fields, fields)
 def test_case_1(self):
     ds = DataSet([Instance(words=['hello', 'world']), Instance(words=['Jack'])])
     vocab = Vocabulary().from_dataset(ds, field_name='words')
     self.assertEqual(len(vocab), 5)
     embed = LSTMCharEmbedding(vocab, embed_size=60)
     x = torch.LongTensor([[2, 1, 0], [4, 3, 4]])
     y = embed(x)
     self.assertEqual(tuple(y.size()), (2, 3, 60))
def readdata():
    global target_len
    min_count = 10
    #categories = ['comp.os.ms-windows.misc', 'rec.motorcycles', 'sci.space', 'talk.politics.misc', ]
    dataset_train = fetch_20newsgroups(subset='train', data_home='../../..')
    dataset_test = fetch_20newsgroups(subset='test', data_home='../../..')

    data = dataset_train.data
    target = dataset_train.target
    target_len = len(dataset_train.target_names)
    train_data = DataSet()
    padding = 0
    for i in range(len(data)):
        data_t = re.sub(r"\d+|\s+|/", " ", data[i])
        temp = [word.strip(string.punctuation).lower() for word in data_t.split() if word.strip(string.punctuation) != '']
        train_data.append(Instance(raw=data[i], label=int(target[i]), words=temp))
        if len(temp) > padding:
            padding = len(temp)
    train_data.apply(lambda x: x['raw'].lower(), new_field_name='raw')

    data = dataset_test.data
    target = dataset_test.target
    test_data = DataSet()
    padding = 0
    for i in range(len(data)):
        data_t = re.sub(r"\d+|\s+|/", " ", data[i])
        temp = [word.strip(string.punctuation).lower() for word in data_t.split() if word.strip(string.punctuation) != '']
        test_data.append(Instance(raw=data[i], label=int(target[i]), words=temp))
        if len(temp) > padding:
            padding = len(temp)
    test_data.apply(lambda x: x['raw'].lower(), new_field_name='raw')

    train_data.apply(lambda x: len(x['words']), new_field_name='len')
    test_data.apply(lambda x: len(x['words']), new_field_name='len')

    vocab = Vocabulary(min_freq=min_count)
    train_data.apply(lambda x: [vocab.add(word) for word in x['words']])
    vocab.build_vocab()
    train_data.apply(lambda x: [vocab.to_index(word) for word in x['words']], new_field_name='seq')
    test_data.apply(lambda x: [vocab.to_index(word) for word in x['words']], new_field_name='seq')
    train_data.rename_field('seq', Const.INPUT)
    train_data.rename_field('len', Const.INPUT_LEN)
    train_data.rename_field('label', Const.TARGET)

    test_data.rename_field('seq', Const.INPUT)
    test_data.rename_field('len', Const.INPUT_LEN)
    test_data.rename_field('label', Const.TARGET)

    test_data.set_input(Const.INPUT, Const.INPUT_LEN)
    test_data.set_target(Const.TARGET)
    train_data.set_input(Const.INPUT, Const.INPUT_LEN)
    train_data.set_target(Const.TARGET)

    test_data, dev_data = test_data.split(0.5)
    return train_data, dev_data, test_data, vocab
Example #9
 def test_list_of_list_to_tensor(self):
     ds = DataSet([Instance(x=[1, 2], y=[3, 4]) for _ in range(2)] +
                  [Instance(x=[1, 2, 3, 4], y=[3, 4, 5, 6]) for _ in range(2)])
     ds.set_input("x")
     ds.set_target("y")
     batch_iter = DataSetIter(ds, batch_size=4, sampler=SequentialSampler(), as_numpy=False)
     for x, y in batch_iter:
         self.assertTrue(isinstance(x["x"], torch.Tensor))
         self.assertEqual(tuple(x["x"].shape), (4, 4))
         self.assertTrue(isinstance(y["y"], torch.Tensor))
         self.assertEqual(tuple(y["y"].shape), (4, 4))
Example #10
def prepare_fake_dataset():
    mean = np.array([-3, -3])
    cov = np.array([[1, 0], [0, 1]])
    class_A = np.random.multivariate_normal(mean, cov, size=(1000,))
    
    mean = np.array([3, 3])
    cov = np.array([[1, 0], [0, 1]])
    class_B = np.random.multivariate_normal(mean, cov, size=(1000,))
    
    data_set = DataSet([Instance(x=[float(item[0]), float(item[1])], y=[0.0]) for item in class_A] +
                       [Instance(x=[float(item[0]), float(item[1])], y=[1.0]) for item in class_B])
    return data_set
 def test_case_2(self):
     # Test that embeddings can be stacked as long as their vocabularies share the same indices
     ds = DataSet([
         Instance(words=['hello', 'world']),
         Instance(words=['hello', 'Jack'])
     ])
     vocab1 = Vocabulary().from_dataset(ds, field_name='words')
     vocab2 = Vocabulary().from_dataset(ds, field_name='words')
     self.assertEqual(len(vocab1), 5)
     cnn_embed = CNNCharEmbedding(vocab1, embed_size=60)
     lstm_embed = LSTMCharEmbedding(vocab2, embed_size=70)
     embed = StackEmbedding([cnn_embed, lstm_embed])
     x = torch.LongTensor([[2, 1, 0], [4, 3, 4]])
     y = embed(x)
     self.assertEqual(tuple(y.size()), (2, 3, 130))
Example #12
def read_dataset(path, lower, word_idx=1, def_idx=-1):
    ds = DataSet()
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if line:
                parts = line.split('\t')
                if lower:
                    ins = Instance(word=parts[word_idx].lower(),
                                   definition=parts[def_idx].lower())
                else:
                    ins = Instance(word=parts[word_idx],
                                   definition=parts[def_idx])
                ds.append(ins)
    return ds
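A quick usage sketch (the file name is a made-up placeholder; the tab-separated word/definition layout follows the loader above):

ds = read_dataset('en_dev.csv', lower=True)
print(len(ds), ds[0]['word'], ds[0]['definition'])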
def process_poems_large(file_name, sentence_len, vocab_size):
    sentences = []
    with open(file_name, "r", encoding='utf-8', ) as f:
        for line in f.readlines():
            try:
                line = line.strip()
                if line:
                    title, content = line.split(':')
                    # print(title)
                    # print(content)
                    # content = line.replace(' ', '').replace(',','').replace('。','')
                    content = content.replace(' ', '')  # keep punctuation
                    # optionally keep only five-character-verse poems:
                    # if len(content) < 6 or content[5] != ',':
                    #     continue
                    if len(content) < 20:
                        continue
                    if ':' in content or '_' in content or '(' in content or '（' in content or '《' in content or '[' in content:
                        continue
                    # truncate to sentence_len
                    if len(content) > sentence_len:
                        content = content[:sentence_len]
                    content = content + end_token
                    sentences.append(content)
            except ValueError:
                pass

    dataset = DataSet()
    # sentences = random.sample(sentences, 5000)
    for sentence in sentences:
        instance = Instance()
        instance['raw_sentence'] = sentence
        instance['target'] = sentence[1:] + sentence[-1]  # next-char target: shift left by one, repeat the last char
        dataset.append(instance)

    dataset.set_input("raw_sentence")
    dataset.set_target("target")
    
    # for iter in dataset:
    #     print(iter['raw_sentence'])
    print("dataset_size:", len(dataset))

    train_data, dev_data = dataset.split(0.2)
    train_data.rename_field("raw_sentence", "sentence")
    dev_data.rename_field("raw_sentence", "sentence")
    vocab = Vocabulary(max_size=vocab_size, min_freq=2, unknown='<unk>', padding='<pad>')

    # Build the vocabulary
    train_data.apply(lambda x: [vocab.add(word) for word in x['sentence']])
    vocab.build_vocab()

    # Index the sentences with the vocabulary
    train_data.apply(lambda x: [vocab.to_index(word) for word in x['sentence']], new_field_name='sentence')
    train_data.apply(lambda x: [vocab.to_index(word) for word in x['target']], new_field_name='target')
    dev_data.apply(lambda x: [vocab.to_index(word) for word in x['sentence']], new_field_name='sentence')
    dev_data.apply(lambda x: [vocab.to_index(word) for word in x['target']], new_field_name='target')

    print("vocabulary_size:", len(vocab))

    return train_data, dev_data, vocab
Example #14
def get_joke_data(data_path):
    data_set = DataSet()
    sample_num = 0
    sample_len = []
    if os.path.exists(data_path):
        with open(data_path, 'r', encoding='utf-8') as fin:
            for lid, line in enumerate(fin):
                joke = json.loads(line)
                if joke['support'] > 0:
                    if len(joke['content']) == 0:
                        continue
                    else:
                        instance = Instance(raw_joke=joke['content'])
                        data_set.append(instance)
                        sample_num += 1
                        sample_len.append(len(joke['content']))
    else:
        print("the data path doesn't  exit.")
    print("Got {} samples from file.".format(sample_num))
    import random
    for i in range(5):
        idx = random.randint(0, sample_num - 1)
        print("sample {}: {}".format(idx, data_set[idx]['raw_joke']))

    import matplotlib
    matplotlib.use('Agg')
    import matplotlib.pyplot as plt
    plt.hist(sample_len, bins=50, range=(0, 1000))
    plt.savefig("./examples.jpg")
    count = 0
    for i in sample_len:
        if i < 255:
            count += 1
    print(count, '/', len(sample_len))
    return data_set
Example #15
def make_dataset(data):
    dataset = DataSet()
    mx = 0
    le = None
    for x, y in zip(data.data, data.target):
        xx = deal(x)
        ins = Instance(sentence=xx, label=int(y))
        if mx < len(xx.split()):
            mx = max(mx, len(xx.split()))
            le = xx
        dataset.append(ins)
    print(mx)
    dataset.apply_field(lambda x: x.split(),
                        field_name='sentence',
                        new_field_name='words')
    dataset.apply_field(lambda x: len(x),
                        field_name='words',
                        new_field_name='seq_len')

    dataset.rename_field('words', Const.INPUT)
    dataset.rename_field('seq_len', Const.INPUT_LEN)
    dataset.rename_field('label', Const.TARGET)

    dataset.set_input(Const.INPUT, Const.INPUT_LEN)
    dataset.set_target(Const.TARGET)
    return dataset
Example #16
 def test_append(self):
     dd = DataSet()
     for _ in range(3):
         dd.append(Instance(x=[1, 2, 3, 4], y=[5, 6]))
     self.assertEqual(len(dd), 3)
     self.assertEqual(dd.field_arrays["x"].content, [[1, 2, 3, 4]] * 3)
     self.assertEqual(dd.field_arrays["y"].content, [[5, 6]] * 3)
Example #17
def make_dataset(data):
    dataset = DataSet()
    tot = 0
    for x in data:

        seq = "[CLS] " + x["raw_text"]
        seq = tokenizer.encode(seq)
        """
        seq=["[CLS]"]+word_tokenize(x["raw_text"])
        seq=tokenizer.convert_tokens_to_ids(seq)
        """
        if len(seq) > 512:
            seq = seq[:512]
            tot += 1
            # print(x["raw_text"])
            # print()

        label = int(x["label"])
        ins = Instance(origin=x["raw_text"],
                       seq=seq,
                       label=label,
                       seq_len=len(seq))
        dataset.append(ins)

    dataset.set_input("seq", "seq_len")
    dataset.set_target("label")
    print(dataset[5])
    print("number:", len(dataset), tot)
    print()
    return dataset
Example #18
def load(path):

    data = DataSet()
    _data = []

    with open(path, "r", encoding="utf-8") as fil:
        fil.readline()  # skip the header line

        for line in fil:
            try:
                tradi, verna = line.strip().split("\t")
            except ValueError:
                continue

            tradi = chinese_tokenizer(tradi)
            verna = chinese_tokenizer(verna)

            vocab.add_word_lst(tradi)
            vocab.add_word_lst(verna)

            _data.append(Instance(traditional=tradi, vernacular=verna))

    random.shuffle(_data)
    for x in _data:
        data.append(x)

    data.set_input("vernacular")
    data.set_target("traditional")
    return data
Example #19
def prepare_env():
    mean = np.array([-3, -3])
    cov = np.array([[1, 0], [0, 1]])
    class_A = np.random.multivariate_normal(mean, cov, size=(1000,))
    
    mean = np.array([3, 3])
    cov = np.array([[1, 0], [0, 1]])
    class_B = np.random.multivariate_normal(mean, cov, size=(1000,))
    
    data_set = DataSet([Instance(x=[float(item[0]), float(item[1])], y=[0.0]) for item in class_A] +
                       [Instance(x=[float(item[0]), float(item[1])], y=[1.0]) for item in class_B])
    
    data_set.set_input("x")
    data_set.set_target("y")
    model = NaiveClassifier(2, 1)
    return data_set, model
Example #20
 def load(self, path: str, bigram: bool = False) -> DataSet:
     """
     :param path: str
     :param bigram: whether to use bigram features
     :return:
     """
     dataset = DataSet()
     with open(path, 'r', encoding='utf-8') as f:
         for line in f:
             line = line.strip()
             if not line:  # skip empty lines
                 continue
             parts = line.split()
             word_lens = map(len, parts)
             chars = list(''.join(parts))
             tags = self._word_len_to_target(word_lens)
             assert len(chars) == len(tags['target'])
             dataset.append(
                 Instance(raw_chars=chars, **tags, seq_len=len(chars)))
     if len(dataset) == 0:
         raise RuntimeError(f"{path} has no valid data.")
     if bigram:
         dataset.apply_field(self._gen_bigram,
                             field_name='raw_chars',
                             new_field_name='bigrams')
     return dataset
Example #21
def construct_dataset(sentences):
    dataset = DataSet()
    for sentence in sentences:
        instance = Instance()
        instance['raw_sentence'] = sentence
        dataset.append(instance)
    return dataset
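Usage sketch with invented sentences: wrap the raw strings, then tokenize them into a 'words' field with apply, as the other examples on this page do.

dataset = construct_dataset(["the quick brown fox", "hello world"])
dataset.apply(lambda x: x['raw_sentence'].split(), new_field_name='words')
print(dataset[0])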
Example #22
def read_file(filename, processing_word=get_processing_word(lowercase=False)):
    dataset = DataSet()
    niter = 0
    with codecs.open(filename, "r", "utf-16") as f:
        words, tags = [], []
        for line in f:
            line = line.strip()
            if len(line) == 0 or line.startswith("-DOCSTART-"):
                if len(words) != 0:
                    assert len(words) > 2
                    if niter == 1:
                        print(words, tags)
                    niter += 1
                    dataset.append(
                        Instance(ori_words=words[:-1], ori_tags=tags[:-1]))
                    words, tags = [], []
            else:
                word, tag = line.split()
                word = processing_word(word)
                words.append(word)
                tags.append(tag.lower())

    dataset.apply_field(lambda x: [x[0]],
                        field_name='ori_words',
                        new_field_name='task')
    dataset.apply_field(lambda x: len(x),
                        field_name='ori_tags',
                        new_field_name='seq_len')
    dataset.apply_field(lambda x: expand(x),
                        field_name='ori_words',
                        new_field_name="bi1")
    return dataset
def read_instances_from_file(file, max_len=400, keep_case=False):
    ''' Collect instances and construct vocab '''

    dataset = DataSet()
    trimmed_sent = 0

    with open(file) as f:
        lines = f.readlines()
        for l in lines:
            l = l.strip().split('\t')
            if len(l) < 2:
                continue
            label = int(l[0])
            sent = l[1]
            if not keep_case:
                sent = sent.lower()
            word_lst = sent.split()
            if len(word_lst) > max_len:
                word_lst = word_lst[:max_len]
                trimmed_sent += 1
            if word_lst:
                dataset.append(Instance(words=word_lst, label=label))

    logger.info('Get {} instances from file {}'.format(len(dataset), file))
    if trimmed_sent:
        logger.info('{} sentences are trimmed. Max sentence length: {}.'
                    .format(trimmed_sent, max_len))

    return dataset
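A follow-up sketch (the file name is an assumption): build a vocabulary from the loaded words and index them in place, mirroring the Vocabulary.from_dataset / index_dataset pattern used elsewhere on this page.

train_set = read_instances_from_file('train.tsv')
vocab = Vocabulary(min_freq=2).from_dataset(train_set, field_name='words')
vocab.index_dataset(train_set, field_name='words', new_field_name='words')
train_set.set_input('words')
train_set.set_target('label')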
Example #24
 def test_Index2WordProcessor(self):
     vocab = Vocabulary()
     vocab.add_word_lst(["a", "b", "c", "d", "e"])
     proc = Index2WordProcessor(vocab, "tag_id", "tag")
     data_set = DataSet(
         [Instance(tag_id=[np.random.randint(0, 7) for _ in range(32)])])
     data_set = proc(data_set)
     self.assertTrue("tag" in data_set)
Example #25
    def dataset(self):
        d = DataSet()
        for key in self.data:
            for ins in self.data[key]['dataset']['chars']:
                ins = Instance(chars=ins)
                d.append(ins)

        return d
Example #26
def preprocess(batch=16):
    raw_data1 = []
    raw_data2 = []

    for i in range(len(traindata.data)):
        raw_data1.append(
            Instance(sentence=traindata.data[i],
                     label=int(traindata.target[i])))
    trainset = DataSet(raw_data1)
    trainset.apply(lambda x: pre(x['sentence']), new_field_name='words')

    for i in range(len(testdata.data)):
        raw_data2.append(
            Instance(sentence=testdata.data[i], label=int(testdata.target[i])))
    testset = DataSet(raw_data2)
    testset.apply(lambda x: pre(x['sentence']), new_field_name='words')

    global vocab
    vocab = Vocabulary(min_freq=1).from_dataset(trainset,
                                                testset,
                                                field_name='words')
    vocab.index_dataset(trainset,
                        testset,
                        field_name='words',
                        new_field_name='words')
    trainset.set_input('words')
    testset.set_input('words')

    trainset.apply(lambda x: int(x['label']),
                   new_field_name='target',
                   is_target=True)
    testset.apply(lambda x: int(x['label']),
                  new_field_name='target',
                  is_target=True)

    trainset.apply(lambda x: len(x['words']), new_field_name='seq_len')
    testset.apply(lambda x: len(x['words']), new_field_name='seq_len')

    global vocabsize
    vocabsize = len(vocab)
    sampler = BucketSampler(batch_size=batch, seq_len_field_name='seq_len')
    train_batch = Batch(batch_size=batch, dataset=trainset, sampler=sampler)
    test_batch = Batch(batch_size=batch, dataset=testset, sampler=sampler)

    return train_batch, test_batch, vocabsize
Example #27
def get_all_tang(data_path=None):
    data_set = DataSet()
    if data_path is None:
        all_tang = get_all_data()
        for sample in all_tang:
            instance = Instance(raw_sentence=sample)
            data_set.append(instance)
    else:
        if os.path.exists(data_path):
            with open(data_path, 'r', encoding='utf-8') as fin:
                for lidx, line in enumerate(fin):
                    line = line.strip()
                    if (line != "" and len(line) > 1):
                        instance = Instance(raw_sentence=line)
                        data_set.append(instance)
        else:
            print("the data path doesn't  exit.")
    return data_set
def get_dataset(data_path):
    print('Getting dataset...')

    poetry = []
    with open(data_path, 'r', encoding='utf-8') as f:
        poem = ''
        for line in f:
            if len(line) <= 1:
                ins = Instance(text=poem)
                if len(poem) > 10:
                    poetry.append(ins)
                poem = ''
            else:
                poem += line.strip('\n')
    # print(poetry[0])

    data = DataSet(data=poetry)
    print("Original data:", data[0])

    vocabulary = Vocabulary(min_freq=2, unknown='<oov>', padding='<pad>')
    vocabulary.add_word('<eos>')
    vocabulary.add_word('<START>')
    data.apply(lambda x: [vocabulary.add(char) for char in x['text']])
    vocabulary.build_vocab()
    print('pad:', vocabulary.to_index('<pad>'))
    print('Vocab size:', len(vocabulary))

    data.apply(lambda x: [vocabulary.to_index(char) for char in x['text']],
               new_field_name='text')
    data.apply(lambda x: [vocabulary.to_index('<START>')] + x['text'] +
               [vocabulary.to_index('<eos>')],
               new_field_name='text')
    data.apply(
        lambda x: x['text'][0:min(config.sequence_length, len(x['text']))],
        new_field_name='text')
    data.apply(lambda x: [vocabulary.to_index('<pad>')] *
               (config.sequence_length - len(x['text'])) + x['text'],
               new_field_name='text')
    data.apply(lambda x: x['text'][0:-1], new_field_name='input')
    data.apply(lambda x: x['text'][1:], new_field_name='target')
    data.set_input('input')
    data.set_target('target')

    # length = config.sequence_length
    # for i, d in enumerate(data):
    #     if length != len(d['text']):
    #         print("wrong!")
    # exit()

    train_data, dev_data = data.split(0.2)
    print('Train data size:', len(train_data))
    print('Dev data size:', len(dev_data))
    print("Train data:", train_data[20])
    # print("Dev data:", dev_data[0])

    return train_data, dev_data, vocabulary
Example #29
def process_data_1(embed_file, cws_train):
    embed, vocab = EmbedLoader.load_without_vocab(embed_file)
    time.sleep(1)  # check whether the result is loaded from the cache
    with open(cws_train, 'r', encoding='utf-8') as f:
        d = DataSet()
        for line in f:
            line = line.strip()
            if len(line) > 0:
                d.append(Instance(raw=line))
    return embed, vocab, d
Example #30
 def test_init_v1(self):
     # fields are 1-D lists
     ds = DataSet([Instance(x=[1, 2, 3, 4], y=[5, 6])] * 40)
     self.assertTrue("x" in ds.field_arrays and "y" in ds.field_arrays)
     self.assertEqual(ds.field_arrays["x"].content, [
         [1, 2, 3, 4],
     ] * 40)
     self.assertEqual(ds.field_arrays["y"].content, [
         [5, 6],
     ] * 40)