Code example #1
0
File: test_data.py  Project: rkwojdan/flair35
def test_tagged_corpus_get_all_sentences():
    """get_all_sentences() must yield every sentence from all three splits."""
    # One sentence per split, in train/dev/test order.
    splits = [
        Sentence(u"I'm used in training.", use_tokenizer=True),
        Sentence(u"I'm a dev sentence.", use_tokenizer=True),
        Sentence(u'I will be only used for testing.', use_tokenizer=True),
    ]
    corpus = TaggedCorpus([splits[0]], [splits[1]], [splits[2]])
    # Three single-sentence splits -> three sentences in total.
    assert len(corpus.get_all_sentences()) == 3
Code example #2
0
    def load_column_corpus(
            data_folder: Union[str, Path],
            column_format: Dict[int, str],
            train_file=None,
            test_file=None,
            dev_file=None,
            tag_to_biloes=None) -> TaggedCorpus:
        """
        Helper function to get a TaggedCorpus from CoNLL column-formatted task data such as CoNLL03 or CoNLL2000.

        :param data_folder: base folder with the task data
        :param column_format: a map specifying the column format
        :param train_file: the name of the train file
        :param test_file: the name of the test file, if None, test data is sampled from train
        :param dev_file: the name of the dev file, if None, dev data is sampled from train
        :param tag_to_biloes: tag type to convert to the BILOES tagging scheme (None disables conversion)
        :return: a TaggedCorpus with annotated train, dev and test data
        """

        # isinstance instead of `type(...) == str`: also accepts str subclasses
        if isinstance(data_folder, str):
            data_folder = Path(data_folder)

        # resolve explicitly given file names relative to the data folder
        if train_file is not None:
            train_file = data_folder / train_file
        if test_file is not None:
            test_file = data_folder / test_file
        if dev_file is not None:
            dev_file = data_folder / dev_file

        # automatically identify train / test / dev files
        if train_file is None:
            for file in data_folder.iterdir():
                file_name = file.name
                if file_name.endswith('.gz'):
                    continue
                # files containing '54019' are deliberately skipped
                # (presumably a dataset-specific artifact — TODO confirm)
                if 'train' in file_name and '54019' not in file_name:
                    train_file = file
                # CoNLL03 convention: 'testa' is the dev split, 'testb' the test split
                if 'dev' in file_name:
                    dev_file = file
                if 'testa' in file_name:
                    dev_file = file
                if 'testb' in file_name:
                    test_file = file

            # if no test file is found, take any file with 'test' in name
            if test_file is None:
                for file in data_folder.iterdir():
                    file_name = file.name
                    if file_name.endswith('.gz'):
                        continue
                    if 'test' in file_name:
                        test_file = file

        log.info("Reading data from {}".format(data_folder))
        log.info("Train: {}".format(train_file))
        log.info("Dev: {}".format(dev_file))
        log.info("Test: {}".format(test_file))

        def make_read_column_data(f):
            # Bind the path now and return a zero-arg callable so the
            # underlying data can be re-read for multiple passes.
            return lambda: NLPTaskDataFetcher.read_column_data(f, column_format)

        # get train and test data
        sentences_train: Iterable[Sentence] = make_read_column_data(train_file)

        # if we must sample test and/or dev from train, count the train
        # sentences once so we can draw index subsets
        if test_file is None or dev_file is None:
            total_number_of_sentences = 0
            for sentence in sentences_train():
                total_number_of_sentences += 1
            train_indexes = set(range(0, total_number_of_sentences))

        # read in test file if exists, otherwise sample 10% of train data as test dataset
        if test_file is not None:
            sentences_test: Iterable[Sentence] = make_read_column_data(test_file)
        else:
            test_indexes = NLPTaskDataFetcher.__sample(train_indexes, 0.1)
            train_indexes = sorted(train_indexes - test_indexes)
            sentences_test: Iterable[Sentence] = NLPTaskDataFetcher.make_sample(sentences_train, test_indexes)
            sentences_train = NLPTaskDataFetcher.make_sample(sentences_train, train_indexes)

        # read in dev file if exists, otherwise sample 10% of train data as dev dataset
        if dev_file is not None:
            sentences_dev: Iterable[Sentence] = make_read_column_data(dev_file)
        else:
            dev_indexes = NLPTaskDataFetcher.__sample(train_indexes, 0.1)
            # BUG FIX: when the test split was also sampled above, train_indexes
            # is already a sorted *list*, and `list - set` raises TypeError.
            # Normalize to a set before subtracting.
            train_indexes = sorted(set(train_indexes) - dev_indexes)
            sentences_dev: Iterable[Sentence] = NLPTaskDataFetcher.make_sample(sentences_train, dev_indexes)
            sentences_train = NLPTaskDataFetcher.make_sample(sentences_train, train_indexes)

        corpus = TaggedCorpus(sentences_train, sentences_dev, sentences_test, name=data_folder.name)

        if tag_to_biloes is not None:
            # convert tag scheme to iobes
            for sentence in corpus.get_all_sentences():
                sentence.convert_tag_scheme(tag_type=tag_to_biloes, target_scheme='iobes')

        return corpus