def test_sentence_level_sampler_dependent_sampling():
    """Check that dependent_sampling returns a 3-item list for the cut text."""
    sampler = SentenceLevelSampler()
    text = "写代码。多写代码。写好代码。"
    # cut_part and psubsent are defined outside this excerpt; presumably they
    # split the text into sentence pieces — confirm against their definitions.
    text_list = cut_part(text, psubsent)
    res = sampler.dependent_sampling(text_list)
    # isinstance is the idiomatic type check (PEP 8); it also accepts list
    # subclasses, which an exact type() comparison would wrongly reject.
    assert isinstance(res, list)
    assert len(res) == 3
def test_sentence_level_sampler_swap():
    """With only the "swap" type enabled, make_samples yields 2 entries."""
    swap_sampler = SentenceLevelSampler(types=["swap"])
    samples = swap_sampler.make_samples("我爱你。你爱我。NLP 很有意思。简洁最重要。")
    assert len(samples) == 2
def test_sentence_level_sampler_none_text():
    """An empty input string produces an empty dict of samples."""
    sampler = SentenceLevelSampler()
    empty_text = ""
    samples = sampler.make_samples(empty_text)
    assert samples == {}
def test_sentence_level_sampler_single_sent():
    """A single-sentence input still produces 4 sample entries."""
    sampler = SentenceLevelSampler()
    samples = sampler.make_samples("我爱你。")
    assert len(samples) == 4
def test_sentence_level_sampler_none():
    """A sampler built with an empty types list produces no samples."""
    no_type_sampler = SentenceLevelSampler([])
    two_sentences = "我爱你。你爱我。"
    samples = no_type_sampler.make_samples(two_sentences)
    assert samples == {}
def test_sentence_level_sampler():
    """Check that the default sampler returns a dict with 4 entries."""
    sampler = SentenceLevelSampler()
    text = "我爱你。你爱我。"
    res = sampler.make_samples(text)
    # isinstance is the idiomatic type check (PEP 8); it also accepts dict
    # subclasses, which an exact type() comparison would wrongly reject.
    assert isinstance(res, dict)
    assert len(res) == 4
# Package-level imports: text utilities, normalizers, samplers and helpers.
from pnlp.ptxt import Regex, Text, Length
from pnlp.pnorm import NumNorm
from pnlp.penh import TokenLevelSampler, SentenceLevelSampler
from pnlp.pmag import MagicDict
from pnlp.stopwords import StopWords, chinese_stopwords, english_stopwords
from pnlp.utils import (
    pstr,
    concurring,
    divide2int,
    generate_batches_by_num,
    generate_batches_by_size,
)

# Shared default instances, created once when this module is imported.
num_norm = NumNorm()
reg = Regex()
# NOTE(review): Reader is not imported within this excerpt; presumably it is
# imported earlier in the module — confirm.
reader = Reader()
tlsampler = TokenLevelSampler()
slsampler = SentenceLevelSampler()

# Package metadata.
__title__ = 'pnlp'
__version__ = '0.4.0'
__author__ = 'Yam'
__license__ = 'Apache-2.0'
__copyright__ = 'Copyright 2019, 2020 Yam'

# Names exported by ``from pnlp import *``.
__all__ = [
    'Reader',
    'Text',
    'Regex',
    'Length',
    'MagicDict',
    'NumNorm',
    'StopWords',
    'TokenLevelSampler',
    'SentenceLevelSampler',
]