Code Example #1
def test():
    from fastNLP import DataSetIter, DataSet
    from fastNLP.core.utils import _move_dict_value_to_device, _get_model_device
    from my_bert_match import addWords, addWordPiece, processItem, processNum, addSeqlen
    import torch
    # 0 for not match, 1 for match
    testset = DataSet({"raw_words": ["5::five"]})
    testset.apply(addWords, new_field_name="p_words")
    testset.apply(addWordPiece, new_field_name="t_words")
    testset.apply(processItem, new_field_name="word_pieces")
    testset.apply(processNum, new_field_name="word_nums")
    testset.apply(addSeqlen, new_field_name="seq_len")
    testset.field_arrays["word_pieces"].is_input = True
    testset.field_arrays["seq_len"].is_input = True
    testset.field_arrays["word_nums"].is_input = True
    # print(testset)
    from fastNLP.io import ModelLoader
    loader = ModelLoader()
    if torch.cuda.is_available():
        model = loader.load_pytorch_model(
            "../models/bert_model_max_triple.pkl")
    else:
        model = torch.load("../models/bert_model_max_triple.pkl",
                           map_location="cpu")

    model.eval()
    test_batch = DataSetIter(batch_size=1, dataset=testset, sampler=None)
    outputs = []
    for batch_x, batch_y in test_batch:
        _move_dict_value_to_device(batch_x,
                                   batch_y,
                                   device=_get_model_device(model))
        outputs.append(
            model.forward(batch_x["word_pieces"], batch_x["word_nums"],
                          batch_x["seq_len"])['pred'])
    outputs = torch.cat(outputs)
    outputs = torch.nn.functional.softmax(outputs, dim=1)
    return outputs
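
Most of the examples below follow the same fastNLP pattern as this one: build a DataSet, derive extra fields with apply(), mark the fields the model consumes as input, then batch with DataSetIter. A minimal, self-contained sketch of that pattern (the field names and toy data are illustrative, not taken from any of the projects in this collection):

from fastNLP import DataSet, DataSetIter

# toy data: one raw text field per instance
ds = DataSet({"raw_words": ["1 2 3", "4 5"]})

# apply() runs the callable on every instance and stores the result as a new field
ds.apply(lambda ins: [int(t) for t in ins["raw_words"].split()], new_field_name="tokens")
ds.apply(lambda ins: len(ins["tokens"]), new_field_name="seq_len")

# only fields marked as input show up in batch_x below
ds.set_input("tokens", "seq_len")

for batch_x, batch_y in DataSetIter(dataset=ds, batch_size=2, sampler=None):
    # numeric list fields are padded to the longest sequence in the batch
    print(batch_x["tokens"], batch_x["seq_len"])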
Code Example #2
def is_phrase_match_BERT(phrase1, phrase2):
    """
    Determine if two phrases match
    :param phrase1: phrase1
    :param phrase2: phrase2
    """
    from fastNLP import DataSetIter, DataSet
    from fastNLP.core.utils import _move_dict_value_to_device
    from my_bert_match import addWords, addWordPiece, processItem, processNum, addSeqlen
    # label indices: 0 = not match, 1 = related, 2 = match
    testset = DataSet({"raw_words": [f"{phrase1}::{phrase2}"]})
    testset.apply(addWords, new_field_name="p_words")
    testset.apply(addWordPiece, new_field_name="t_words")
    testset.apply(processItem, new_field_name="word_pieces")
    testset.apply(processNum, new_field_name="word_nums")
    testset.apply(addSeqlen, new_field_name="seq_len")
    testset.field_arrays["word_pieces"].is_input = True
    testset.field_arrays["seq_len"].is_input = True
    testset.field_arrays["word_nums"].is_input = True
    # print(testset)
    with torch.no_grad():
        bert_model.eval()
        test_batch = DataSetIter(batch_size=1, dataset=testset, sampler=None)
        outputs = []
        for batch_x, batch_y in test_batch:
            _move_dict_value_to_device(batch_x, batch_y, device=device)
            outputs.append(bert_model.forward(batch_x["word_pieces"], batch_x["word_nums"], batch_x["seq_len"])['pred'])
        outputs = torch.cat(outputs)
        outputs = torch.nn.functional.softmax(outputs, dim=1)
        return ["Not Match", "Related", "Match"][outputs.argmax().item()]
Code Example #3
def create_dataset(data, sample_size):
    data_set = DataSet()
    data_set.add_field('raw_sentence', data.data[:sample_size])
    data_set.add_field('target', data.target[:sample_size])
    data_set.apply(lambda x: sentence_to_words(x['raw_sentence']),
                   new_field_name='word_seq')
    return data_set
Code Example #4
File: test_dataset.py  Project: shubinhuang/fastNLP
    def test_apply_cannot_modify_instance(self):
        ds = DataSet({"x": [[1, 2, 3, 4]] * 40, "y": [[5, 6]] * 40})

        def modify_inplace(instance):
            instance['words'] = 1

        with self.assertRaises(TypeError):
            ds.apply(modify_inplace)
Code Example #5
File: test_dataset.py  Project: shubinhuang/fastNLP
    def test_apply_tqdm(self):
        import time
        ds = DataSet({"x": [[1, 2, 3, 4]] * 40, "y": [[5, 6]] * 40})

        def do_nothing(ins):
            time.sleep(0.01)

        ds.apply(do_nothing, use_tqdm=True)
        ds.apply_field(do_nothing, field_name='x', use_tqdm=True)
Code Example #6
    def test_BucketSampler(self):
        sampler = BucketSampler(num_buckets=3,
                                batch_size=16,
                                seq_len_field_name="seq_len")
        data_set = DataSet({
            "x": [[0] * random.randint(1, 10)] * 10,
            "y": [[5, 6]] * 10
        })
        data_set.apply(lambda ins: len(ins["x"]), new_field_name="seq_len")
        indices = sampler(data_set)
        self.assertEqual(len(indices), 10)
Code Example #7
def produceCandidateTripleSlow(raw_phrase, Candidate_phrases, model,
                               Candidate_hpos_sub, threshold):
    """
    使用BERT判断Candidate_phrases中哪个与raw_phrase语义最接近;基于最大值方式;适用于单个处理
    """
    from fastNLP.core.utils import _move_dict_value_to_device
    from fastNLP.core.utils import _get_model_device
    from fastNLP import DataSet
    from fastNLP import DataSetIter
    from my_bert_match import addWordPiece, addSeqlen, addWords, processItem, processNum
    import torch
    import numpy as np
    p_Candidate_phrases = [
        raw_phrase + "::" + item for item in Candidate_phrases
    ]
    Candidate_dataset = DataSet({"raw_words": p_Candidate_phrases})
    Candidate_dataset.apply(addWords, new_field_name="p_words")
    Candidate_dataset.apply(addWordPiece, new_field_name="t_words")
    Candidate_dataset.apply(processItem, new_field_name="word_pieces")
    Candidate_dataset.apply(processNum, new_field_name="word_nums")
    Candidate_dataset.apply(addSeqlen, new_field_name="seq_len")
    Candidate_dataset.field_arrays["word_pieces"].is_input = True
    Candidate_dataset.field_arrays["seq_len"].is_input = True
    Candidate_dataset.field_arrays["word_nums"].is_input = True
    test_batch = DataSetIter(batch_size=10,
                             dataset=Candidate_dataset,
                             sampler=None)

    outputs = []
    for batch_x, batch_y in test_batch:
        _move_dict_value_to_device(batch_x,
                                   batch_y,
                                   device=_get_model_device(model))
        outputs.append(
            model.forward(batch_x["word_pieces"], batch_x["word_nums"],
                          batch_x["seq_len"])['pred'])
    outputs = torch.cat(outputs)
    outputs = torch.nn.functional.softmax(outputs,
                                          dim=1).cpu().detach().numpy()

    results_2 = np.array([item[2] for item in outputs])
    results_1 = np.array([item[1] for item in outputs])

    # if an exact match is already found here, return it directly
    if max(results_2) >= threshold:
        return Candidate_hpos_sub[int(
            np.argmax(results_2))], max(results_2), "2"

    if max(results_1) >= threshold:
        return Candidate_hpos_sub[int(
            np.argmax(results_1))], max(results_1), "1"

    return "None", None, "0"
Code Example #8
def get_data():
    s = ''
    dataset = DataSet()
    for line in open('../handout/tangshi.txt'):
        if (line == '\n'):
            dataset.append(Instance(raw_sentence=s, label='0'))
            #print(s)
            s = ''
        else:
            s += line.replace('\n', '')

    dataset.apply(add_end, new_field_name='raw_sentence')
    dataset.apply(split_sent, new_field_name='words')
    return dataset
Code Example #9
def preprocess(input):
    data = input.data
    target = input.target
    dataset = DataSet()
    for i in range(len(data)):
        data_tmp = data[i]
        for c in string.whitespace:
            data_tmp = data_tmp.replace(c, ' ')
        for c in string.punctuation:
            data_tmp = data_tmp.replace(c, '')
        data_tmp = data_tmp.lower().split()
        # print(data_tmp)
        dataset.append(Instance(sentence=data_tmp, target=int(target[i])))
    dataset.apply(lambda x: len(x['sentence']), new_field_name='seq_len')
    return dataset
Code Example #10
def import_data(path):
    dataset = DataSet()
    with open(path, encoding='utf-8') as f:
        content = f.readlines()[1::2]
        for c in content:
            c = re.sub("[\s+\.\!\/_,$%^*(+\"\']+|[+——!?、~@#¥%……&*()《》]+", "",
                       c)  # !,。?
            c = re.sub('[%s]' % re.escape(string.whitespace), '', c).strip()  # remove all whitespace characters
            dataset.append(Instance(raw_sentence=c))
    dataset.drop(lambda x: len(list(x['raw_sentence'])) == 0)

    def split_sent(ins):
        return list(ins['raw_sentence'])

    dataset.apply(split_sent, new_field_name='words', is_input=True)
    test_data, train_data = dataset.split(0.8)
    print(len(test_data), len(train_data))
    return test_data, train_data
Code Example #11
def get_data():
    global max_length
    dataset = DataSet()
    for i in range(58):
        f = open("../../../chinese-poetry/json/poet.tang." + str(i * 1000) + ".json", encoding='utf-8')
        setting = json.load(f)
        # print(setting)
        for line in setting:
            # keep only quatrains whose four lines each contain 12 characters
            if (len(line['paragraphs']) == 4
                    and all(len(sentence) == 12 for sentence in line['paragraphs'])):
                s = ''
                for sentence in line['paragraphs']:
                    s += Converter('zh-hans').convert(sentence)
                s += '$'
                dataset.append(Instance(raw_sentence=s))
        f.close()
        print('Has processed ' + str((i + 1) * 1000) + ' poems')

    dataset.apply(split_sent, new_field_name='words')
    train_data, test_data = dataset.split(0.2)
    return train_data, test_data
Code Example #12
def preprocess(data_in):
    data = data_in.data
    target = data_in.target
    dataset = DataSet()

    for i in range(len(data)):
        data_tmp = re.sub(r'\d+', ' ', data[i])
        for c in string.whitespace:
            data_tmp = data_tmp.replace(c, ' ')
        for c in string.punctuation:
            data_tmp = data_tmp.replace(c, '')
        data_tmp = data_tmp.lower().split()
        dataset.append(
            Instance(raw_sentence=data[i],
                     target=int(target[i]),
                     sentence=data_tmp))
    dataset.apply(lambda x: x['raw_sentence'].lower(),
                  new_field_name='raw_sentence')
    dataset.apply(lambda x: len(x['sentence']), new_field_name='seq_len')
    return dataset
Code Example #13
    def __init__(self, path=".data/yelp", dataset="yelp", batch_size=32):

        if dataset == "yelp":
            dataset = DataSet()

            for db_set in ['train']:
                text_file = os.path.join(path, 'sentiment.' + db_set + '.text')
                label_file = os.path.join(path,
                                          'sentiment.' + db_set + '.labels')
                with io.open(text_file, 'r', encoding="utf-8") as tf, io.open(
                        label_file, 'r', encoding="utf-8") as lf:
                    for text in tf:
                        label = lf.readline()
                        dataset.append(Instance(text=text, label=label))

            dataset.apply(lambda x: x['text'].lower(), new_field_name='text')
            dataset.apply(
                lambda x: ['<start>'] + x['text'].split() + ['<eos>'],
                new_field_name='words')
            dataset.drop(lambda x: len(x['words']) > 1 + 15 + 1)
            dataset.apply(lambda x: x['words'] + ['<pad>'] *
                          (17 - len(x['words'])),
                          new_field_name='words')
            dataset.apply(lambda x: int(x['label']),
                          new_field_name='label_seq',
                          is_target=True)

            _train_data, _test_data = dataset.split(0.3)

            _vocab = Vocabulary(min_freq=2)
            _train_data.apply(
                lambda x: [_vocab.add(word) for word in x['words']])
            _vocab.build_vocab()

            _train_data.apply(
                lambda x: [_vocab.to_index(word) for word in x['words']],
                new_field_name='word_seq',
                is_input=True)
            _test_data.apply(
                lambda x: [_vocab.to_index(word) for word in x['words']],
                new_field_name='word_seq',
                is_input=True)

        self.train_data = _train_data
        self.test_data = _test_data
        self.vocab = _vocab
        self.batch_size = batch_size
        self.train_iter = iter(
            Batch(dataset=self.train_data,
                  batch_size=self.batch_size,
                  sampler=SequentialSampler()))
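
The 1 + 15 + 1 and 17 above encode a fixed sequence length of <start> + at most 15 tokens + <eos>; longer sentences are dropped and shorter ones padded with <pad>. A toy illustration of that padding idiom (the sample tokens are made up):

max_len = 1 + 15 + 1                      # <start> + 15 tokens + <eos>
words = ['<start>', 'good', 'food', '<eos>']
padded = words + ['<pad>'] * (max_len - len(words))
print(len(padded))                        # 17
print(padded[:5])                         # ['<start>', 'good', 'food', '<eos>', '<pad>']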
Code Example #14
def handle_data(n_class):
    train_data = get_text_classification_datasets(n_class)
    dataset = DataSet()
    vocab = Vocabulary(min_freq=0, unknown='<unk>', padding='<pad>')
    for i in range(len(train_data.data)):
        ans = remove_punc(train_data.data[i])
        dataset.append((Instance(content=ans,
                                 target=int(train_data.target[i]))))
    dataset.apply(lambda x: x['content'].lower().split(),
                  new_field_name='words',
                  is_input=True)
    for txt in dataset:
        vocab.add_word_lst(txt['words'])
    vocab.build_vocab()
    # index the sentences with Vocabulary.to_index(word)
    dataset.apply(lambda x: [vocab.to_index(word) for word in x['words']],
                  new_field_name='index')
    dataset.set_input("index")
    dataset.set_target("target")
    tra, dev = dataset.split(0.2)
    return tra, dev, len(vocab)
Code Example #15
def read_json(path, is_low=True):
    # read the json file into a DataSet and add seq_len, post and mask fields
    with open(path, "r", encoding="utf8") as f:
        raw_data = json.load(f)
    data = []
    for d in raw_data:
        tok = d["token"] if not is_low else [w.lower() for w in d["token"]]
        pos = d["pos"]
        head = d["head"]
        deprel = d["deprel"]
        for aspect in d["aspects"]:
            asp = [aspect["term"],
                   ["urgrn"]][len(aspect["term"]) == 0]  # for Restaurants16
            fidx = aspect["from"]
            tidx = aspect["to"]
            pol = aspect["polarity"]
            data.append([tok, pos, head, deprel, asp, fidx, tidx, pol])

    fields = ["tok", "pos", "head", "deprel", "asp", "fidx", "tidx", "pol"]
    dataset = DataSet(dict(zip(fields, zip(*data))))

    def get_post(instance):
        fidx = instance["fidx"]
        tidx = instance["tidx"]
        seq_len = instance["seq_len"]
        return ([i - fidx
                 for i in range(fidx)] + [0 for _ in range(fidx, tidx)] +
                [i - tidx + 1 for i in range(tidx, seq_len)])

    def get_mask(instance):
        if not instance["fidx"] == instance["tidx"]:
            return [[0, 1][i in range(instance["fidx"], instance["tidx"])]
                    for i in range(instance["seq_len"])]
        else:
            return [1 for _ in range(instance["seq_len"])]

    dataset.apply(lambda line: len(line["tok"]), new_field_name="seq_len")
    dataset.apply(get_post, new_field_name="post")
    dataset.apply(get_mask, new_field_name="mask")
    return dataset
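
To make the post and mask fields concrete, this is what the two helpers above compute for a toy instance with six tokens and an aspect span covering positions 2 and 3 (the numbers are purely illustrative):

fidx, tidx, seq_len = 2, 4, 6             # aspect occupies tokens [2, 4)
post = ([i - fidx for i in range(fidx)]
        + [0 for _ in range(fidx, tidx)]
        + [i - tidx + 1 for i in range(tidx, seq_len)])
mask = [1 if fidx <= i < tidx else 0 for i in range(seq_len)]
print(post)  # [-2, -1, 0, 0, 1, 2]  relative distance to the aspect span
print(mask)  # [0, 0, 1, 1, 0, 0]    1 inside the aspect span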
Code Example #16
def get_fastnlp_dataset():
    text_train, text_test = get_text_classification_datasets()
    train_data = DataSet()
    test_data = DataSet()
    for i in range(len(text_train.data)):
        train_data.append(
            Instance(text=split_sent(text_train.data[i]),
                     target=int(text_train.target[i])))
    for i in range(len(text_test.data)):
        test_data.append(
            Instance(text=split_sent(text_test.data[i]),
                     target=int(text_test.target[i])))

    # build the vocabulary
    vocab = Vocabulary(min_freq=5, unknown='<unk>', padding='<pad>')
    train_data.apply(lambda x: [vocab.add(word) for word in x['text']])
    vocab.build_vocab()

    # map sentences to indices with the vocabulary
    train_data.apply(lambda x: [vocab.to_index(word) for word in x['text']],
                     new_field_name='word_seq')
    test_data.apply(lambda x: [vocab.to_index(word) for word in x['text']],
                    new_field_name='word_seq')

    # set the input and target fields
    train_data.set_input("word_seq")
    test_data.set_input("word_seq")
    train_data.set_target("target")
    test_data.set_target("target")

    return train_data, test_data, vocab
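
The manual vocab.add / build_vocab / to_index loop used above can also be written with Vocabulary.from_dataset and index_dataset, as several later examples do. A minimal sketch on toy data (the field names are illustrative):

from fastNLP import DataSet, Vocabulary

ds = DataSet({"text": [["hello", "world"], ["hello", "fastnlp"]]})
vocab = Vocabulary(min_freq=1).from_dataset(ds, field_name="text")
vocab.index_dataset(ds, field_name="text", new_field_name="word_seq")
print(ds[0]["word_seq"])   # indices assigned to "hello" and "world"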
Code Example #17
def create_dataset():
    # categories = ['alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space', 'rec.motorcycles']
    # categories = ['alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space', 'rec.motorcycles', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale']
    categories = ['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware',
                  'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball',
                  'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space',
                  'soc.religion.christian', 'talk.politics.guns',
                  'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']

    newsgroups_train = fetch_20newsgroups(subset='train', categories=categories, data_home='../../..')
    newsgroups_test = fetch_20newsgroups(subset='test', categories=categories, data_home='../../..')

    dataset = DataSet()

    for i in range(len(newsgroups_train.data)):
        if len(newsgroups_train.data[i]) <= 2000:
            dataset.append(Instance(raw_sentence=newsgroups_train.data[i], target=int(newsgroups_train.target[i])))
    for i in range(len(newsgroups_test.data)):
        if len(newsgroups_test.data[i]) <= 2000:
            dataset.append(Instance(raw_sentence=newsgroups_test.data[i], target=int(newsgroups_test.target[i])))

    dataset.apply(lambda x: x['raw_sentence'].lower(), new_field_name='sentence')
    dataset.apply(lambda x: x['sentence'].split(), new_field_name='words')
    dataset.apply(lambda x: len(x['words']), new_field_name='seq_len')

    vocab = Vocabulary(min_freq=2).from_dataset(dataset, field_name='words')
    vocab.index_dataset(dataset, field_name='words', new_field_name='words')

    dataset.set_input('words', 'seq_len')
    dataset.set_target('target')

    train_dev_data, test_data = dataset.split(0.1)
    train_data, dev_data = train_dev_data.split(0.1)

    return vocab, train_data, dev_data, test_data
Code Example #18
def get_text_classification_datasets(num=10):
  categories = target_name[:num]
  train = fetch_20newsgroups(subset='train', categories=categories, data_home='../../..')
  test = fetch_20newsgroups(subset='test', categories=categories, data_home='../../..')
  train_data, train_target = [delete_char(doc) for doc in train.data], train.target.tolist()
  test_data, test_target = [delete_char(doc) for doc in test.data], test.target.tolist()

  # transform to DataSet()
  dataset_train, dataset_test = DataSet(), DataSet()
  max_len = 0
  for i in range(len(train_data)):
    dataset_train.append(Instance(doc_words=train_data[i], target=train_target[i]))
    if max_len < len(train_data[i]):
      max_len = len(train_data[i])
  for i in range(len(test_data)):
    dataset_test.append(Instance(doc_words=test_data[i], target=test_target[i]))
    if max_len < len(test_data[i]):
      max_len = len(test_data[i])

  # preprocess

  # drop documents with 10 or fewer words
  doc_len = lambda x: len(x['doc_words']) <= 10
  dataset_train.drop(doc_len)
  
  # build vocabulary
  vocab = Vocabulary(max_size=10000, min_freq=15, unknown='<unk>')
  dataset_train.apply(lambda x: [vocab.add(word) for word in x['doc_words']])
  vocab.build_vocab()

  # index
  indexF = lambda x: [vocab.to_index(word) for word in x['doc_words']]
  dataset_train.apply(indexF, new_field_name='words')
  dataset_test.apply(indexF, new_field_name='words')

  dataset_train_list = dataset_train.split(0.1)

  return dataset_train_list[0], dataset_train_list[1], dataset_test, len(vocab), max_len
Code Example #19
File: test_dataset.py  Project: zxlzr/fastNLP
    def test_apply(self):
        ds = DataSet({"x": [[1, 2, 3, 4]] * 40, "y": [[5, 6]] * 40})
        ds.apply(lambda ins: ins["x"][::-1], new_field_name="rx")
        self.assertTrue("rx" in ds.field_arrays)
        self.assertEqual(ds.field_arrays["rx"].content[0], [4, 3, 2, 1])

        ds.apply(lambda ins: len(ins["y"]), new_field_name="y")
        self.assertEqual(ds.field_arrays["y"].content[0], 2)

        res = ds.apply(lambda ins: len(ins["x"]))
        self.assertTrue(isinstance(res, list) and len(res) > 0)
        self.assertEqual(res[0], 4)

        ds.apply(lambda ins: (len(ins["x"]), "hahaha"), new_field_name="k", ignore_type=True)
Code Example #20
def get_data(dataset):
    n = len(dataset.data)
    data_set = DataSet()
    for i in range(n):
        data_set.append(
            Instance(raw_sentence=dataset.data[i],
                     target=int(dataset.target[i])))

    data_set.apply(lambda x: x['raw_sentence'].lower(),
                   new_field_name='sentence')
    data_set.apply(lambda x: re.sub('[%s]' % re.escape(string.punctuation), '',
                                    x['sentence']),
                   new_field_name='sentence')
    data_set.apply(lambda x: re.sub('[%s]' % re.escape(string.whitespace), ' ',
                                    x['sentence']),
                   new_field_name='sentence')
    data_set.apply(lambda x: x['sentence'].split(), new_field_name='words')

    return data_set
Code Example #21
def construct_dataset(dataset):
    dataset_ = DataSet()
    for sentence, target in zip(dataset.data, dataset.target):
        instance = Instance()
        instance['raw_sentence'] = sentence
        instance['target'] = int(target)
        dataset_.append(instance)

    dataset_.apply(lambda x: re.sub('[%s]' % re.escape(string.punctuation), '',
                                    x['raw_sentence']),
                   new_field_name='sentence')  # strip punctuation
    dataset_.apply(lambda x: re.sub('[%s]' % re.escape(string.whitespace), ' ',
                                    x['sentence']),
                   new_field_name='sentence')  # replace spaces, newlines and other whitespace with a single space
    dataset_.apply(lambda x: x['sentence'].lower(),
                   new_field_name='sentence')  # convert to lower case
    dataset_.apply_field(lambda x: x.split(),
                         field_name='sentence',
                         new_field_name='input')
    return dataset_
Code Example #22
def get_dataset(raw_data):
    data_dict = {"input": raw_data.data, "target": raw_data.target}
    dataset = DataSet(data=data_dict)

    # ignore string.punctuation
    dataset.apply(lambda x: x['input'].translate(
        str.maketrans("", "", string.punctuation)),
                  new_field_name='input')
    # string.whitespace -> space
    dataset.apply(
        lambda x: re.sub('[' + string.whitespace + ']', ' ', x['input']),
        new_field_name='input')
    # lower case & split by space
    dataset.apply(lambda x: x['input'].lower().split(' '),
                  new_field_name='input')

    # mark input and target fields
    dataset.set_input('input')
    dataset.set_target('target')
    return dataset
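
Several of these examples strip punctuation with str.maketrans / translate; a quick standalone illustration of that idiom on a made-up sentence:

import string

table = str.maketrans("", "", string.punctuation)
print("Hello, world! (fastNLP demo)".translate(table).lower().split())
# ['hello', 'world', 'fastnlp', 'demo']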
Code Example #23
            kernel_sizes=kernel_sizes,
            padding=padding)
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(sum(kernel_nums), num_classes)
    
    def forward(self, words, seq_len=None):
        x = self.embed(words)  # [N,L] -> [N,L,C]
        x = self.conv_pool(x)  # [N,L,C] -> [N,C]
        x = self.dropout(x)
        x = self.fc(x)  # [N,C] -> [N, N_class]
        return {C.OUTPUT: x}
    
    def predict(self, words, seq_len=None):
        output = self(words, seq_len)
        _, predict = output[C.OUTPUT].max(dim=1)
        return {C.OUTPUT: predict}


#demo version

trainData.apply(lambda x: x['data'].lower(), new_field_name='sentence')
trainData.apply(lambda x: x['sentence'].split(), new_field_name='words', is_input=True)
vocab = Vocabulary(min_freq=2)
vocab = vocab.from_dataset(trainData, field_name='words')
# convert tokens to indices
vocab.index_dataset(trainData, field_name='words', new_field_name='words')
trainData.set_target('target')
model = CNNText((len(vocab),128), num_classes=20, padding=2, dropout=0.1)
train_data, dev_data = trainData.split(0.2)
trainer = Trainer(model=model, train_data=train_data, dev_data=dev_data, loss=CrossEntropyLoss(), metrics=AccuracyMetric(), batch_size=16)
trainer.train()
Code Example #24
    def __init__(self,
                 path='.data/sst/trees',
                 data_type='sst',
                 batch_size=32,
                 split_ratio=0.1,
                 seq_len=15,
                 min_freq=2):

        data_set = DataSet()
        if data_type == 'yelp':
            path = '.data/yelp'
            for db_set in ['train']:
                text_file = os.path.join(path, 'sentiment.' + db_set + '.text')
                label_file = os.path.join(path,
                                          'sentiment.' + db_set + '.labels')

                with io.open(text_file, 'r', encoding="utf-8") as tf, io.open(
                        label_file, 'r', encoding="utf-8") as lf:
                    for text in tf:
                        label = lf.readline()
                        data_set.append(Instance(text=text, label=label))

            data_set.apply(
                lambda x: ['<start>'] + x['text'].lower().split() + ['<eos>'],
                new_field_name='words')
            data_set.drop(lambda x: len(x['words']) > seq_len + 2)

        elif data_type == 'sst':
            path = '.data/sst/trees'
            text = data.Field(init_token='<start>',
                              eos_token='<eos>',
                              lower=True,
                              tokenize='spacy',
                              fix_length=16)
            label = data.Field(sequential=False, unk_token='<unk>')
            filter = lambda ex: len(ex.text) <= seq_len and ex.label != 'neutral'
            sst_train = datasets.SST(os.path.join(path, 'train.txt'),
                                     text,
                                     label,
                                     filter_pred=filter)
            sst_dev = datasets.SST(os.path.join(path, 'dev.txt'),
                                   text,
                                   label,
                                   filter_pred=filter)
            sst_test = datasets.SST(os.path.join(path, 'test.txt'),
                                    text,
                                    label,
                                    filter_pred=filter)
            for ex in sst_train.examples + sst_dev.examples + sst_test.examples:
                data_set.append(
                    Instance(words=ex.text,
                             label={
                                 'negative': 0,
                                 'positive': 1
                             }[ex.label]))

            data_set.apply(
                lambda x: ['<start>'] + [w.lower()
                                         for w in x['words']] + ['<eos>'],
                new_field_name='words')

        elif data_type == 'test':
            with io.open('fasttrial1.pos', 'r', encoding="utf-8") as f:
                for text in f:
                    data_set.append(Instance(text=text, label=1))
            with io.open('fasttrial1.neg', 'r', encoding="utf-8") as f:
                for text in f:
                    data_set.append(Instance(text=text, label=0))

            data_set.apply(
                lambda x: ['<start>'] + x['text'].lower().split() + ['<eos>'],
                new_field_name='words')
            data_set.drop(lambda x: len(x['words']) > seq_len + 2)

        data_set.apply(lambda x: x['words'] + ['<pad>'] *
                       (seq_len + 2 - len(x['words'])),
                       new_field_name='words')

        _train_data, _ = data_set.split(split_ratio)

        _vocab = Vocabulary(min_freq=min_freq)
        _train_data.apply(lambda x: [_vocab.add(word) for word in x['words']])
        _vocab.build_vocab()

        data_set.apply(lambda x: [_vocab.to_index(w) for w in x['words']],
                       new_field_name='word_seq',
                       is_input=True)
        data_set.apply(lambda x: x['word_seq'][1:] + [0],
                       new_field_name='dec_target',
                       is_target=True)
        data_set.apply(lambda x: int(x['label']),
                       new_field_name='label_seq',
                       is_target=True)
        _train_data, _test_data = data_set.split(split_ratio)

        self.train_data = _train_data
        self.test_data = _test_data
        self.vocab = _vocab
        self.batch_size = batch_size
        self.train_iter = iter(
            Batch(dataset=self.train_data,
                  batch_size=self.batch_size,
                  sampler=SequentialSampler()))
Code Example #25
        return {"pred": pred}


# Prepare the dataset and testset
fitlog.commit(__file__)
fitlog.add_hyper_in_file(__file__)

table = str.maketrans('', '', string.punctuation)
newsgroups_train = fetch_20newsgroups(subset='train')
dataset = DataSet()
for i in range(newsgroups_train.target.shape[0]):
    dataset.append(
        Instance(raw_sentence=newsgroups_train.data[i].replace('\n', ' '),
                 target=int(newsgroups_train.target[i])))
dataset.apply(lambda x: x['raw_sentence'].lower().translate(table),
              new_field_name='sentence')
dataset.apply_field(lambda x: x.split(),
                    field_name='sentence',
                    new_field_name='words')
dataset.apply_field(lambda x: len(x),
                    field_name='words',
                    new_field_name='seq_len')

newsgroups_test = fetch_20newsgroups(subset='test')
testset = DataSet()
for i in range(newsgroups_test.target.shape[0]):
    testset.append(
        Instance(raw_sentence=newsgroups_test.data[i].replace('\n', ' '),
                 target=int(newsgroups_test.target[i])))
testset.apply(lambda x: x['raw_sentence'].lower().translate(table),
              new_field_name='sentence')
Code Example #26
File: util.py  Project: gobbletown/PhenoBERT
def produceCandidateTriple(Candidate_hpos_sub_total, model, hpo_tree,
                           threshold):
    """
    使用BERT判断Candidate_phrases中哪个与raw_phrase语义最接近;基于最大值方式
    :param Candidate_hpos_sub_total: 输出的短语及候选HPO嵌套列表
    :param model:
    :param hpo_tree:
    :param threshold: 用作该模型输出阈值
    :return:
    """
    from fastNLP.core.utils import _move_dict_value_to_device
    from fastNLP.core.utils import _get_model_device
    from fastNLP import DataSet
    from fastNLP import DataSetIter
    from my_bert_match import addWordPiece, addSeqlen, addWords, processItem, processNum
    import torch
    import numpy as np
    p_Candidate_phrases = []
    phrase_nums_per_hpo = []
    Candidate_hpos = []
    for raw_phrase, Candidate_phrase, Candidate_hpos_sub in Candidate_hpos_sub_total:
        p_Candidate_phrases.extend(
            [raw_phrase + "::" + item for item in Candidate_phrase])
        phrase_nums_per_hpo.append(len(Candidate_phrase))
        Candidate_hpos.append(Candidate_hpos_sub)
    Candidate_dataset = DataSet({"raw_words": p_Candidate_phrases})
    Candidate_dataset.apply(addWords, new_field_name="p_words")
    Candidate_dataset.apply(addWordPiece, new_field_name="t_words")
    Candidate_dataset.apply(processItem, new_field_name="word_pieces")
    Candidate_dataset.apply(processNum, new_field_name="word_nums")
    Candidate_dataset.apply(addSeqlen, new_field_name="seq_len")
    Candidate_dataset.field_arrays["word_pieces"].is_input = True
    Candidate_dataset.field_arrays["seq_len"].is_input = True
    Candidate_dataset.field_arrays["word_nums"].is_input = True
    test_batch = DataSetIter(batch_size=128,
                             dataset=Candidate_dataset,
                             sampler=None)

    outputs = []
    for batch_x, batch_y in test_batch:
        _move_dict_value_to_device(batch_x,
                                   batch_y,
                                   device=_get_model_device(model))
        outputs.append(
            model.forward(batch_x["word_pieces"], batch_x["word_nums"],
                          batch_x["seq_len"])['pred'])
    outputs = torch.cat(outputs)
    outputs = torch.nn.functional.softmax(outputs,
                                          dim=1).cpu().detach().numpy()
    # print(outputs.size)
    results_2 = np.array([item[2] for item in outputs])
    results_1 = np.array([item[1] for item in outputs])

    # group the results by phrase
    count = 0
    index = 0
    ans = []
    for group_num in phrase_nums_per_hpo:
        g_results_2 = results_2[index:index + group_num]
        g_results_1 = results_1[index:index + group_num]
        Candidate_hpos_sub = Candidate_hpos[count]
        index += group_num
        count += 1
        # if an exact match is already found here, output it directly
        if max(g_results_2) >= threshold:
            ans.append([
                Candidate_hpos_sub[int(np.argmax(g_results_2))],
                max(g_results_2), "2"
            ])
            continue
        if max(g_results_1) >= threshold:
            ans.append([
                Candidate_hpos_sub[int(np.argmax(g_results_1))],
                max(g_results_1), "1"
            ])
            continue
        ans.append(["None", None, "0"])
    return ans
Code Example #27
def preprocess(batch=16):
    raw_data1 = []
    raw_data2 = []

    for i in range(len(traindata.data)):
        raw_data1.append(
            Instance(sentence=traindata.data[i],
                     label=int(traindata.target[i])))
    trainset = DataSet(raw_data1)
    trainset.apply(lambda x: pre(x['sentence']), new_field_name='words')

    for i in range(len(testdata.data)):
        raw_data2.append(
            Instance(sentence=testdata.data[i], label=int(testdata.target[i])))
    testset = DataSet(raw_data2)
    testset.apply(lambda x: pre(x['sentence']), new_field_name='words')

    global vocab
    vocab = Vocabulary(min_freq=1).from_dataset(trainset,
                                                testset,
                                                field_name='words')
    vocab.index_dataset(trainset,
                        testset,
                        field_name='words',
                        new_field_name='words')
    trainset.set_input('words')
    testset.set_input('words')

    trainset.apply(lambda x: int(x['label']),
                   new_field_name='target',
                   is_target=True)
    testset.apply(lambda x: int(x['label']),
                  new_field_name='target',
                  is_target=True)

    trainset.apply(lambda x: len(x['words']), new_field_name='seq_len')
    testset.apply(lambda x: len(x['words']), new_field_name='seq_len')

    global vocabsize
    vocabsize = len(vocab)
    sampler = BucketSampler(batch_size=batch, seq_len_field_name='seq_len')
    train_batch = Batch(batch_size=batch, dataset=trainset, sampler=sampler)
    test_batch = Batch(batch_size=batch, dataset=testset, sampler=sampler)

    return train_batch, test_batch, vocabsize
Code Example #28
    def preprocess():
        train_set = DataSet()
        for i in range(len(raw_train['data'])):
            di = transfer(raw_train['data'][i])
            train_set.append(
                Instance(sentence=di, target=int(raw_train['target'][i])))

        train_set.apply(lambda x: x['sentence'].lower(),
                        new_field_name='sentence')
        train_set.apply(lambda x: x['sentence'].split(),
                        new_field_name='words')
        train_set.apply(lambda x: len(x['words']), new_field_name='seq_len')

        test_set = DataSet()
        for i in range(len(raw_test['data'])):
            di = transfer(raw_test['data'][i])
            test_set.append(
                Instance(sentence=di, target=int(raw_test['target'][i])))

        test_set.apply(lambda x: x['sentence'].lower(),
                       new_field_name='sentence')
        test_set.apply(lambda x: x['sentence'].split(), new_field_name='words')
        test_set.apply(lambda x: len(x['words']), new_field_name='seq_len')

        word_dict = Vocabulary(min_freq=2)
        train_set.apply(lambda x: [word_dict.add(word) for word in x['words']])
        test_set.apply(lambda x: [word_dict.add(word) for word in x['words']])
        word_dict.build_vocab()
        word_dict.index_dataset(train_set,
                                field_name='words',
                                new_field_name='words')
        word_dict.index_dataset(test_set,
                                field_name='words',
                                new_field_name='words')

        return train_set, test_set, word_dict
Code Example #29
def get_dataset(data_path):
    print('Getting dataset...')

    poetry = []
    with open(data_path, 'r', encoding='utf-8') as f:
        poem = ''
        for line in f:
            if len(line) <= 1:
                ins = Instance(text=poem)
                if len(poem) > 10:
                    poetry.append(ins)
                poem = ''
            else:
                poem += line.strip('\n')
    # print(poetry[0])

    data = DataSet(data=poetry)
    print("Original data:", data[0])

    vocabulary = Vocabulary(min_freq=2, unknown='<oov>', padding='<pad>')
    vocabulary.add_word('<eos>')
    vocabulary.add_word('<START>')
    data.apply(lambda x: [vocabulary.add(char) for char in x['text']])
    vocabulary.build_vocab()
    print('pad:', vocabulary.to_index('<pad>'))
    print('Vocab size:', len(vocabulary))

    data.apply(lambda x: [vocabulary.to_index(char) for char in x['text']],
               new_field_name='text')
    data.apply(lambda x: [vocabulary.to_index('<START>')] + x['text'] +
               [vocabulary.to_index('<eos>')],
               new_field_name='text')
    data.apply(
        lambda x: x['text'][0:min(config.sequence_length, len(x['text']))],
        new_field_name='text')
    data.apply(lambda x: [vocabulary.to_index('<pad>')] *
               (config.sequence_length - len(x['text'])) + x['text'],
               new_field_name='text')
    data.apply(lambda x: x['text'][0:-1], new_field_name='input')
    data.apply(lambda x: x['text'][1:], new_field_name='target')
    data.set_input('input')
    data.set_target('target')

    # length = config.sequence_length
    # for i, d in enumerate(data):
    #     if length != len(d['text']):
    #         print("wrong!")
    # exit()

    train_data, dev_data = data.split(0.2)
    print('Train data size:', len(train_data))
    print('Dev data size:', len(dev_data))
    print("Train data:", train_data[20])
    # print("Dev data:", dev_data[0])

    return train_data, dev_data, vocabulary
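
The input / target fields built above implement the usual next-character language-model shift. On a toy index sequence (the values are illustrative):

text = [2, 45, 17, 9, 3]            # e.g. <START> w1 w2 w3 <eos> as indices
inp, target = text[:-1], text[1:]
print(inp)      # [2, 45, 17, 9]
print(target)   # [45, 17, 9, 3]    predict the next character at every step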
Code Example #30
def preprocess():
    train_set = DataSet()
    for i in range(len(raw_train.data)):
        train_set.append(
            Instance(sentence=raw_train.data[i],
                     target=int(raw_train.target[i])))

    train_set.apply(lambda x: x['sentence'].translate(
        str.maketrans("", "", string.punctuation)).lower(),
                    new_field_name='sentence')
    train_set.apply(lambda x: x['sentence'].split(), new_field_name='words')
    train_set.apply(lambda x: len(x['words']), new_field_name='seq_len')

    test_set = DataSet()
    for i in range(len(raw_test.data)):
        test_set.append(
            Instance(sentence=raw_test.data[i],
                     target=int(raw_test.target[i])))

    test_set.apply(lambda x: x['sentence'].translate(
        str.maketrans("", "", string.punctuation)).lower(),
                   new_field_name='sentence')
    test_set.apply(lambda x: x['sentence'].split(), new_field_name='words')
    test_set.apply(lambda x: len(x['words']), new_field_name='seq_len')

    vocab = Vocabulary(min_freq=10)
    train_set.apply(lambda x: [vocab.add(word) for word in x['words']])
    test_set.apply(lambda x: [vocab.add(word) for word in x['words']])
    vocab.build_vocab()
    vocab.index_dataset(train_set, field_name='words', new_field_name='words')
    vocab.index_dataset(test_set, field_name='words', new_field_name='words')

    return train_set, test_set, vocab