def create_new_corpus(data_dict, corpus_vol, **kwargs):
    """Build a synthetic Corpus of documents by concatenating sampled texts.

    For each of ``corpus_vol`` documents, ``sem_nums`` distinct intents are
    drawn from ``data_dict``; one text per intent is concatenated into the
    document. Non-'noise' segments get an entity Span over their character
    range; 'noise' segments contribute text only.

    Args:
        data_dict: mapping of intent name -> iterable of candidate texts.
        corpus_vol: number of documents to generate.
        **kwargs: must contain 'sem_nums', the number of distinct intents
            per document.

    Returns:
        A new Corpus, or None when ``corpus_vol`` is falsy or ``sem_nums``
        exceeds the number of available intents (kept for backward
        compatibility with existing callers that check for None).
    """
    new_corpus = Corpus([])
    sem_nums = kwargs['sem_nums']
    intents = list(data_dict.keys())
    if not corpus_vol or sem_nums > len(intents):
        # Original implementation returned None implicitly in both
        # invalid-input cases; preserve that contract.
        return None
    # Hoist the per-intent list() conversions out of the generation loop.
    texts_by_intent = {intent: list(texts) for intent, texts in data_dict.items()}
    for _ in range(corpus_vol):
        # random.sample draws uniformly without replacement — equivalent to
        # the original add-to-set rejection loop, but bounded.
        intent_sam = set(random.sample(intents, sem_nums))
        spanset = SpanSet()
        sentences = []
        start_position = 0
        for intent in intent_sam:
            txt = random.choice(texts_by_intent[intent])
            sentences.append(txt)
            if intent != 'noise':
                # Only non-noise segments carry an entity annotation.
                spanset.append(
                    Span(start=start_position,
                         end=start_position + len(txt),
                         entity=intent))
            start_position += len(txt)
        doc = Document(text=''.join(sentences),
                       label='|'.join(intent_sam),
                       span_set=spanset)
        new_corpus.append(doc)
    return new_corpus
def test_contains__(datadir, tmpdir):
    """Membership: appended documents are in the corpus, strangers are not."""
    corpus = Corpus()
    for seq in (seq_one, seq_two):
        corpus.append(seq)
    assert seq_one in corpus
    # An empty document that was never appended must not be reported.
    stray_doc = Document("")
    assert stray_doc not in corpus
def test_write_to_file(datadir, tmpdir):
    """Serialized output must match the gold CoNLL-X file byte-for-byte."""
    corpus = Corpus()
    corpus.append(seq_one)
    corpus.append(seq_two)
    written = tmpdir / "output.conllx"
    corpus.write_to_file(written)
    expected = datadir / "output.conllx"
    assert filecmp.cmp(written, expected)
def test_getitem__(datadir, tmpdir):
    """Indexing: scalar index yields a document, an index list yields a corpus."""
    corpus = Corpus()
    corpus.append(seq_one)
    corpus.append(seq_two)
    # Scalar access returns the single stored element.
    assert corpus[0] == seq_one
    # Batch access with an index list returns an equal corpus.
    assert corpus[[0, 1]] == corpus