def test_tagged_corpus_get_all_sentences():
    """get_all_sentences() must yield the union of train, dev and test splits."""
    train = Sentence(u"I'm used in training.", use_tokenizer=True)
    dev = Sentence(u"I'm a dev sentence.", use_tokenizer=True)
    test = Sentence(u'I will be only used for testing.', use_tokenizer=True)

    corpus = TaggedCorpus([train], [dev], [test])

    # one sentence per split -> three sentences in total
    assert len(corpus.get_all_sentences()) == 3
def load_column_corpus(
        data_folder: Union[str, Path],
        column_format: Dict[int, str],
        train_file=None,
        test_file=None,
        dev_file=None,
        tag_to_biloes=None) -> TaggedCorpus:
    """
    Helper function to get a TaggedCorpus from CoNLL column-formatted task data
    such as CoNLL03 or CoNLL2000.

    :param data_folder: base folder with the task data
    :param column_format: a map specifying the column format
    :param train_file: the name of the train file
    :param test_file: the name of the test file
    :param dev_file: the name of the dev file, if None, dev data is sampled from train
    :param tag_to_biloes: whether to convert to BILOES tagging scheme
    :return: a TaggedCorpus with annotated train, dev and test data
    """
    if isinstance(data_folder, str):
        data_folder = Path(data_folder)

    if train_file is not None:
        train_file = data_folder / train_file
    if test_file is not None:
        test_file = data_folder / test_file
    if dev_file is not None:
        dev_file = data_folder / dev_file

    # automatically identify train / test / dev files
    if train_file is None:
        for file in data_folder.iterdir():
            file_name = file.name
            if file_name.endswith('.gz'):
                continue
            # '54019' excludes a known auxiliary file shipped with some tasks
            if 'train' in file_name and '54019' not in file_name:
                train_file = file
            if 'dev' in file_name:
                dev_file = file
            # CoNLL03 convention: 'testa' is the dev split, 'testb' the test split
            if 'testa' in file_name:
                dev_file = file
            if 'testb' in file_name:
                test_file = file

        # if no test file is found, take any file with 'test' in name
        if test_file is None:
            for file in data_folder.iterdir():
                file_name = file.name
                if file_name.endswith('.gz'):
                    continue
                if 'test' in file_name:
                    test_file = file

    log.info("Reading data from {}".format(data_folder))
    log.info("Train: {}".format(train_file))
    log.info("Dev: {}".format(dev_file))
    log.info("Test: {}".format(test_file))

    def make_read_column_data(f):
        # Bind the file path now; the returned thunk re-reads the file on each call.
        return lambda: NLPTaskDataFetcher.read_column_data(f, column_format)

    # get train and test data
    sentences_train: Iterable[Sentence] = make_read_column_data(train_file)

    # if either split must be sampled from train, count the train sentences once
    if test_file is None or dev_file is None:
        total_number_of_sentences = sum(1 for _ in sentences_train())
        train_indexes = set(range(0, total_number_of_sentences))

    # read in test file if exists, otherwise sample 10% of train data as test dataset
    if test_file is not None:
        sentences_test: Iterable[Sentence] = make_read_column_data(test_file)
    else:
        # NOTE(review): NLPTaskDataFetcher.__sample only resolves via name
        # mangling if this function is defined inside that class — confirm.
        test_indexes = NLPTaskDataFetcher.__sample(train_indexes, 0.1)
        # BUGFIX: keep train_indexes a set; the original rebound it to a
        # sorted list here, so sampling dev afterwards raised TypeError on
        # `train_indexes - dev_indexes`. Sort only when handing to make_sample.
        train_indexes = train_indexes - test_indexes
        sentences_test = NLPTaskDataFetcher.make_sample(sentences_train, test_indexes)
        sentences_train = NLPTaskDataFetcher.make_sample(sentences_train, sorted(train_indexes))

    # read in dev file if exists, otherwise sample 10% of train data as dev dataset
    if dev_file is not None:
        sentences_dev: Iterable[Sentence] = make_read_column_data(dev_file)
    else:
        dev_indexes = NLPTaskDataFetcher.__sample(train_indexes, 0.1)
        train_indexes = train_indexes - dev_indexes
        sentences_dev = NLPTaskDataFetcher.make_sample(sentences_train, dev_indexes)
        sentences_train = NLPTaskDataFetcher.make_sample(sentences_train, sorted(train_indexes))

    corpus = TaggedCorpus(sentences_train, sentences_dev, sentences_test, name=data_folder.name)

    if tag_to_biloes is not None:
        # convert tag scheme to iobes
        for sentence in corpus.get_all_sentences():
            sentence.convert_tag_scheme(tag_type=tag_to_biloes, target_scheme='iobes')

    return corpus