Example no. 1
    def __init__(
            self,
            data_folder: Union[str, Path],
            train_file=None,
            test_file=None,
            dev_file=None,
            in_memory: bool = True,
    ):
        """
        Instantiates a Corpus from CoNLL-U column-formatted task data such as the UD corpora.

        :param data_folder: base folder with the task data
        :param train_file: the name of the train file
        :param test_file: the name of the test file
        :param dev_file: the name of the dev file, if None, dev data is sampled from train
        :param in_memory: If set to True, keeps full dataset in memory, otherwise does disk reads
        :return: a Corpus with annotated train, dev and test data
        """

        # find train, dev and test files if not specified
        dev_file, test_file, train_file = \
            find_train_dev_test_files(data_folder, dev_file, test_file, train_file)

        # get train data
        train = UniversalDependenciesDataset(train_file, in_memory=in_memory)

        # get test data
        test = UniversalDependenciesDataset(test_file, in_memory=in_memory) if test_file is not None else None

        # get dev data
        dev = UniversalDependenciesDataset(dev_file, in_memory=in_memory) if dev_file is not None else None

        super(UniversalDependenciesCorpus, self).__init__(
            train, dev, test, name=str(data_folder)
        )
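
A minimal usage sketch, assuming a data folder that holds UD-style .conllu splits that find_train_dev_test_files can auto-detect; the folder name below is hypothetical:

    # hypothetical folder containing e.g. en_ewt-ud-train.conllu, -dev.conllu, -test.conllu
    corpus = UniversalDependenciesCorpus("resources/ud_english")
    print(corpus)  # summarizes the sizes of the train/dev/test splits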
Example no. 2
    def __init__(
        self,
        data_folder: Union[str, Path],
        column_format: Dict[int, str],
        train_file=None,
        test_file=None,
        dev_file=None,
        tag_to_bioes=None,
        column_delimiter: str = r"\s+",
        comment_symbol: Optional[str] = None,
        encoding: str = "utf-8",
        document_separator_token: Optional[str] = None,
        skip_first_line: bool = False,
        in_memory: bool = True,
    ):
        """
        Instantiates a Corpus from CoNLL column-formatted task data such as CoNLL03 or CoNLL2000.

        :param data_folder: base folder with the task data
        :param column_format: a map specifying the column format
        :param train_file: the name of the train file
        :param test_file: the name of the test file
        :param dev_file: the name of the dev file, if None, dev data is sampled from train
        :param tag_to_bioes: the tag type (e.g. "ner") to convert to the BIOES tagging scheme
        :param column_delimiter: the default is to split on whitespace, but you can override this, for instance
        with "\t" to split only on tabs
        :param comment_symbol: if set, lines that begin with this symbol are treated as comments
        :param encoding: the encoding of the data files
        :param document_separator_token: if provided, multiple sentences are read into one document object;
        the given string token indicates that a new document begins
        :param skip_first_line: set to True if your dataset has a header line
        :param in_memory: If set to True, the dataset is kept in memory as Sentence objects, otherwise does disk reads
        :return: a Corpus with annotated train, dev and test data
        """

        # find train, dev and test files if not specified
        dev_file, test_file, train_file = \
            find_train_dev_test_files(data_folder, dev_file, test_file, train_file)

        # get train data
        train = ColumnDataset(
            train_file,
            column_format,
            tag_to_bioes,
            encoding=encoding,
            comment_symbol=comment_symbol,
            column_delimiter=column_delimiter,
            in_memory=in_memory,
            document_separator_token=document_separator_token,
            skip_first_line=skip_first_line,
        )

        # read in test file if exists
        test = ColumnDataset(
            test_file,
            column_format,
            tag_to_bioes,
            encoding=encoding,
            comment_symbol=comment_symbol,
            column_delimiter=column_delimiter,
            in_memory=in_memory,
            document_separator_token=document_separator_token,
            skip_first_line=skip_first_line,
        ) if test_file is not None else None

        # read in dev file if exists
        dev = ColumnDataset(
            dev_file,
            column_format,
            tag_to_bioes,
            encoding=encoding,
            comment_symbol=comment_symbol,
            column_delimiter=column_delimiter,
            in_memory=in_memory,
            document_separator_token=document_separator_token,
            skip_first_line=skip_first_line,
        ) if dev_file is not None else None

        super(ColumnCorpus, self).__init__(train,
                                           dev,
                                           test,
                                           name=str(data_folder))
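
A minimal usage sketch, assuming a CoNLL-03-style folder; the folder name, file names and column layout below are hypothetical:

    # column 0 holds the token text, column 3 the NER tag (hypothetical layout)
    columns = {0: "text", 3: "ner"}
    corpus = ColumnCorpus(
        "resources/conll_03",
        columns,
        train_file="train.txt",
        dev_file="dev.txt",
        test_file="test.txt",
    )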
Example no. 3
    def __init__(
            self,
            data_folder: Union[str, Path],
            columns: List[int] = [0, 1, 2],
            train_file=None,
            test_file=None,
            dev_file=None,
            use_tokenizer: bool = True,
            max_tokens_per_doc=-1,
            max_chars_per_doc=-1,
            in_memory: bool = True,
            label_type: Optional[str] = None,
            autofind_splits=True,
            sample_missing_splits: bool = True,
            skip_first_line: bool = False,
            separator: str = '\t',
            encoding: str = 'utf-8'
    ):
        """
        Corpus for tasks involving pairs of sentences or paragraphs. The data files are expected to be in column format, where each line has a column
        for the first sentence/paragraph, the second sentence/paragraph and the label, respectively. The columns must be separated by a given separator (default: '\t').
        
        :param data_folder: base folder with the task data
        :param columns: List that indicates the columns for the first sentence (first entry in the list), the second sentence (second entry) and label (last entry).
                        default = [0,1,2]
        :param train_file: the name of the train file
        :param test_file: the name of the test file, if None, test data is sampled from train (if sample_missing_splits is true)
        :param dev_file: the name of the dev file, if None, dev data is sampled from train (if sample_missing_splits is true)
        :param use_tokenizer: whether or not to use the in-built tokenizer
        :param max_tokens_per_doc: If set, shortens sentences to this maximum number of tokens
        :param max_chars_per_doc: If set, shortens sentences to this maximum number of characters
        :param in_memory: If True, data will be saved as a list of flair.data.DataPair objects, otherwise we use lists of simple strings, which need less space
        :param label_type: Name of the label of the data pairs
        :param autofind_splits: If True, train/test/dev files will be automatically identified in the given data_folder
        :param sample_missing_splits: If True, a missing train/test/dev file will be sampled from the available data
        :param skip_first_line: If True, first line of data files will be ignored
        :param separator: Separator between columns in data files
        :param encoding: Encoding of data files
        
        :return: a Corpus with annotated train, dev and test data
        """

        # find train, dev and test files if not specified
        dev_file, test_file, train_file = \
            find_train_dev_test_files(data_folder, dev_file, test_file, train_file, autofind_splits=autofind_splits)

        # create DataPairDataset for train, test and dev file, if they are given

        train: FlairDataset = DataPairDataset(
            train_file,
            columns=columns,
            use_tokenizer=use_tokenizer,
            max_tokens_per_doc=max_tokens_per_doc,
            max_chars_per_doc=max_chars_per_doc,
            in_memory=in_memory,
            label_type=label_type,
            skip_first_line=skip_first_line,
            separator=separator,
            encoding=encoding
        ) if train_file is not None else None

        test: FlairDataset = DataPairDataset(
            test_file,
            columns=columns,
            use_tokenizer=use_tokenizer,
            max_tokens_per_doc=max_tokens_per_doc,
            max_chars_per_doc=max_chars_per_doc,
            in_memory=in_memory,
            label_type=label_type,
            skip_first_line=skip_first_line,
            separator=separator,
            encoding=encoding
        ) if test_file is not None else None

        dev: FlairDataset = DataPairDataset(
            dev_file,
            columns=columns,
            use_tokenizer=use_tokenizer,
            max_tokens_per_doc=max_tokens_per_doc,
            max_chars_per_doc=max_chars_per_doc,
            in_memory=in_memory,
            label_type=label_type,
            skip_first_line=skip_first_line,
            separator=separator,
            encoding=encoding
        ) if dev_file is not None else None

        super(DataPairCorpus, self).__init__(train, dev, test,
                                             sample_missing_splits=sample_missing_splits,
                                             name=str(data_folder))
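
A minimal usage sketch, assuming tab-separated files with the first sentence in column 0, the second sentence in column 1 and the label in column 2; the folder name, file name and label type below are hypothetical:

    # hypothetical textual-entailment data: premise <tab> hypothesis <tab> label
    corpus = DataPairCorpus(
        "resources/entailment",
        columns=[0, 1, 2],
        train_file="train.tsv",
        label_type="entailment",
    )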
Example no. 4
    def __init__(
        self,
        data_folder: Union[str, Path],
        train_file=None,
        test_file=None,
        dev_file=None,
        in_memory: bool = True,
        fields: Optional[Sequence[str]] = None,
        field_parsers: Optional[Dict[str, conllu._FieldParserType]] = None,
        metadata_parsers: Optional[Dict[str, conllu._MetadataParserType]] = None,
        sample_missing_splits: bool = True,
    ):
        """
        Instantiates a Corpus from CoNLL-U (Plus) column-formatted task data.

        :param data_folder: base folder with the task data
        :param train_file: the name of the train file
        :param test_file: the name of the test file
        :param dev_file: the name of the dev file, if None, dev data is sampled from train
        :param in_memory: If set to True, keeps full dataset in memory, otherwise does disk reads
        :param fields: the list of CoNLL-U(+) field names declaring the column layout, passed through to the conllu parser
        :param field_parsers: optional custom parser functions for individual fields, passed through to the conllu parser
        :param metadata_parsers: optional custom parser functions for metadata (comment) lines, passed through to the conllu parser
        :param sample_missing_splits: If True, missing dev/test splits are sampled from the train split
        :return: a Corpus with annotated train, dev and test data
        """

        # find train, dev and test files if not specified
        dev_file, test_file, train_file = find_train_dev_test_files(data_folder, dev_file, test_file, train_file)

        # get train data
        train = CoNLLUDataset(
            train_file,
            in_memory=in_memory,
            fields=fields,
            field_parsers=field_parsers,
            metadata_parsers=metadata_parsers,
        )

        # get test data
        test = (
            CoNLLUDataset(
                test_file,
                in_memory=in_memory,
                fields=fields,
                field_parsers=field_parsers,
                metadata_parsers=metadata_parsers,
            )
            if test_file is not None
            else None
        )

        # get dev data
        dev = (
            CoNLLUDataset(
                dev_file,
                in_memory=in_memory,
                fields=fields,
                field_parsers=field_parsers,
                metadata_parsers=metadata_parsers,
            )
            if dev_file is not None
            else None
        )

        super(CoNLLUCorpus, self).__init__(train, dev, test, name=str(data_folder),
                                           sample_missing_splits=sample_missing_splits)
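
A minimal usage sketch, assuming a CoNLL-U Plus file whose column layout is declared via the fields parameter; the folder name, file name and field names below are hypothetical:

    # hypothetical .conllup file with an extra "ner" column
    corpus = CoNLLUCorpus(
        "resources/conllu_ner",
        train_file="train.conllup",
        fields=["id", "form", "ner"],
    )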
Example no. 5
    def __init__(
        self,
        data_folder: Union[str, Path],
        train_file=None,
        test_file=None,
        dev_file=None,
        in_memory: bool = True,
        fields: Optional[Sequence[str]] = None,
        token_annotation_fields: Optional[Sequence[str]] = None,
        field_parsers: Optional[Dict[str, conllu._FieldParserType]] = None,
        metadata_parsers: Optional[Dict[str, conllu._MetadataParserType]] = None,
        sample_missing_splits: bool = True,
    ):
        """
        Instantiates a Corpus from CoNLL-U (Plus) column-formatted task data.

        Universal Dependencies corpora that contain multi-word tokens are not supported yet.
        Annotating flair sentences with the "deps" column is also not yet supported.
        Please consider using the "UniversalDependenciesCorpus" instead.

        :param data_folder: base folder with the task data
        :param train_file: the name of the train file
        :param test_file: the name of the test file
        :param dev_file: the name of the dev file, if None, dev data is sampled from train
        :param in_memory: If set to True, keeps full dataset in memory, otherwise does disk reads
        :param fields: the list of CoNLL-U(+) field names declaring the column layout, passed through to the conllu parser
        :param token_annotation_fields: a subset of the fields parameter whose values are attached to each Token as annotations
        :param field_parsers: optional custom parser functions for individual fields, passed through to the conllu parser
        :param metadata_parsers: optional custom parser functions for metadata (comment) lines, passed through to the conllu parser
        :param sample_missing_splits: If True, missing dev/test splits are sampled from the train split
        :return: a Corpus with annotated train, dev and test data
        """

        # find train, dev and test files if not specified
        dev_file, test_file, train_file = find_train_dev_test_files(
            data_folder, dev_file, test_file, train_file)

        # get train data
        train = CoNLLUDataset(
            train_file,
            in_memory=in_memory,
            fields=fields,
            token_annotation_fields=token_annotation_fields,
            field_parsers=field_parsers,
            metadata_parsers=metadata_parsers,
        )

        # get test data
        test = (CoNLLUDataset(
            test_file,
            in_memory=in_memory,
            fields=fields,
            token_annotation_fields=token_annotation_fields,
            field_parsers=field_parsers,
            metadata_parsers=metadata_parsers,
        ) if test_file is not None else None)

        # get dev data
        dev = (CoNLLUDataset(
            dev_file,
            in_memory=in_memory,
            fields=fields,
            token_annotation_fields=token_annotation_fields,
            field_parsers=field_parsers,
            metadata_parsers=metadata_parsers,
        ) if dev_file is not None else None)

        super(CoNLLUCorpus, self).__init__(
            train,
            dev,
            test,
            name=str(data_folder),
            sample_missing_splits=sample_missing_splits,
        )
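
In typical use, the main difference from the previous variant is token_annotation_fields, which marks a subset of the declared fields as token-level annotations. A minimal sketch with hypothetical folder, file and field names:

    corpus = CoNLLUCorpus(
        "resources/conllu_ner",
        train_file="train.conllup",
        fields=["id", "form", "ner"],
        token_annotation_fields=["ner"],  # attach the "ner" value to each Token
    )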
Example no. 6
    def __init__(
        self,
        data_folder: Union[str, Path],
        column_name_map: Dict[int, str],
        label_type: str = 'class',
        train_file=None,
        test_file=None,
        dev_file=None,
        tokenizer: Callable[[str], List[Token]] = segtok_tokenizer,
        max_tokens_per_doc=-1,
        max_chars_per_doc=-1,
        in_memory: bool = False,
        skip_header: bool = False,
        encoding: str = 'utf-8',
        **fmtparams,
    ):
        """
        Instantiates a Corpus for text classification from CSV column-formatted data.

        :param data_folder: base folder with the task data
        :param column_name_map: a column name map that indicates which column is text and which the label(s)
        :param train_file: the name of the train file
        :param test_file: the name of the test file
        :param dev_file: the name of the dev file, if None, dev data is sampled from train
        :param label_type: name of the label type (default: 'class')
        :param tokenizer: the tokenizer used to split text into Tokens (default: segtok_tokenizer)
        :param max_tokens_per_doc: If set, truncates each Sentence to a maximum number of Tokens
        :param max_chars_per_doc: If set, truncates each Sentence to a maximum number of chars
        :param in_memory: If True, keeps dataset as Sentences in memory, otherwise only keeps strings
        :param skip_header: set to True if the CSV files have a header line
        :param encoding: the encoding of the data files
        :param fmtparams: additional parameters for the CSV file reader
        :return: a Corpus with annotated train, dev and test data
        """

        # find train, dev and test files if not specified
        dev_file, test_file, train_file = \
            find_train_dev_test_files(data_folder, dev_file, test_file, train_file)

        train: FlairDataset = CSVClassificationDataset(
            train_file,
            column_name_map,
            label_type=label_type,
            tokenizer=tokenizer,
            max_tokens_per_doc=max_tokens_per_doc,
            max_chars_per_doc=max_chars_per_doc,
            in_memory=in_memory,
            skip_header=skip_header,
            encoding=encoding,
            **fmtparams,
        )

        test: FlairDataset = CSVClassificationDataset(
            test_file,
            column_name_map,
            label_type=label_type,
            tokenizer=tokenizer,
            max_tokens_per_doc=max_tokens_per_doc,
            max_chars_per_doc=max_chars_per_doc,
            in_memory=in_memory,
            skip_header=skip_header,
            encoding=encoding,
            **fmtparams,
        ) if test_file is not None else None

        dev: FlairDataset = CSVClassificationDataset(
            dev_file,
            column_name_map,
            label_type=label_type,
            tokenizer=tokenizer,
            max_tokens_per_doc=max_tokens_per_doc,
            max_chars_per_doc=max_chars_per_doc,
            in_memory=in_memory,
            skip_header=skip_header,
            encoding=encoding,
            **fmtparams,
        ) if dev_file is not None else None

        super(CSVClassificationCorpus, self).__init__(train,
                                                      dev,
                                                      test,
                                                      name=str(data_folder))
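
A minimal usage sketch, assuming comma-separated CSV files with a header row; the folder name and column layout below are hypothetical, and delimiter is forwarded to the CSV reader via **fmtparams:

    # hypothetical layout: column 0 is the label, column 1 the text
    column_name_map = {0: "label", 1: "text"}
    corpus = CSVClassificationCorpus(
        "resources/reviews_csv",
        column_name_map,
        skip_header=True,
        delimiter=",",
    )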
Example no. 7
    def __init__(
        self,
        data_folder: Union[str, Path],
        label_type: str = 'class',
        train_file=None,
        test_file=None,
        dev_file=None,
        tokenizer: Callable[[str], List[Token]] = space_tokenizer,
        truncate_to_max_tokens: int = -1,
        truncate_to_max_chars: int = -1,
        filter_if_longer_than: int = -1,
        in_memory: bool = False,
        encoding: str = 'utf-8',
    ):
        """
        Instantiates a Corpus from text classification-formatted task data.

        :param data_folder: base folder with the task data
        :param train_file: the name of the train file
        :param test_file: the name of the test file
        :param dev_file: the name of the dev file, if None, dev data is sampled from train
        :param label_type: name of the label type (default: 'class')
        :param tokenizer: the tokenizer used to split text into Tokens (default: space_tokenizer)
        :param truncate_to_max_tokens: If set, truncates each Sentence to a maximum number of Tokens
        :param truncate_to_max_chars: If set, truncates each Sentence to a maximum number of chars
        :param filter_if_longer_than: If set, filters out sentences with more tokens than this number
        :param in_memory: If True, keeps dataset as Sentences in memory, otherwise only keeps strings
        :param encoding: the encoding of the data files
        :return: a Corpus with annotated train, dev and test data
        """

        # find train, dev and test files if not specified
        dev_file, test_file, train_file = \
            find_train_dev_test_files(data_folder, dev_file, test_file, train_file)

        train: FlairDataset = ClassificationDataset(
            train_file,
            label_type=label_type,
            tokenizer=tokenizer,
            truncate_to_max_tokens=truncate_to_max_tokens,
            truncate_to_max_chars=truncate_to_max_chars,
            filter_if_longer_than=filter_if_longer_than,
            in_memory=in_memory,
            encoding=encoding,
        )

        # use test_file to create test split if available
        test: FlairDataset = ClassificationDataset(
            test_file,
            label_type=label_type,
            tokenizer=tokenizer,
            truncate_to_max_tokens=truncate_to_max_tokens,
            truncate_to_max_chars=truncate_to_max_chars,
            filter_if_longer_than=filter_if_longer_than,
            in_memory=in_memory,
            encoding=encoding,
        ) if test_file is not None else None

        # use dev_file to create dev split if available
        dev: FlairDataset = ClassificationDataset(
            dev_file,
            label_type=label_type,
            tokenizer=tokenizer,
            truncate_to_max_tokens=truncate_to_max_tokens,
            truncate_to_max_chars=truncate_to_max_chars,
            filter_if_longer_than=filter_if_longer_than,
            in_memory=in_memory,
            encoding=encoding,
        ) if dev_file is not None else None

        super(ClassificationCorpus, self).__init__(train,
                                                   dev,
                                                   test,
                                                   name=str(data_folder))
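
A minimal usage sketch, assuming FastText-formatted files where each line starts with one or more __label__<label> prefixes followed by the text; the folder name, file name and label type below are hypothetical:

    corpus = ClassificationCorpus(
        "resources/sentiment",
        train_file="train.txt",
        label_type="sentiment",
    )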