def __init__(
    self,
    data_folder: Union[str, Path],
    train_file=None,
    test_file=None,
    dev_file=None,
    in_memory: bool = True,
):
    """
    Instantiates a Corpus from CoNLL-U column-formatted task data such as the UD corpora.

    :param data_folder: base folder with the task data
    :param train_file: the name of the train file
    :param test_file: the name of the test file
    :param dev_file: the name of the dev file, if None, dev data is sampled from train
    :param in_memory: If set to True, keeps full dataset in memory, otherwise does disk reads
    :return: a Corpus with annotated train, dev and test data
    """
    # locate the three split files unless they were passed in explicitly
    dev_file, test_file, train_file = find_train_dev_test_files(
        data_folder, dev_file, test_file, train_file
    )

    # helper: build a dataset for an optional split, None when the file is absent
    def _load_split(split_file):
        if split_file is None:
            return None
        return UniversalDependenciesDataset(split_file, in_memory=in_memory)

    # train is always present after find_train_dev_test_files; dev/test may be missing
    train = UniversalDependenciesDataset(train_file, in_memory=in_memory)
    test = _load_split(test_file)
    dev = _load_split(dev_file)

    super(UniversalDependenciesCorpus, self).__init__(
        train, dev, test, name=str(data_folder)
    )
def __init__(
    self,
    data_folder: Union[str, Path],
    column_format: Dict[int, str],
    train_file=None,
    test_file=None,
    dev_file=None,
    tag_to_bioes=None,
    column_delimiter: str = r"\s+",
    comment_symbol: Optional[str] = None,
    encoding: str = "utf-8",
    document_separator_token: Optional[str] = None,
    skip_first_line: bool = False,
    in_memory: bool = True,
):
    r"""
    Instantiates a Corpus from CoNLL column-formatted task data such as CoNLL03 or CoNLL2000.

    :param data_folder: base folder with the task data
    :param column_format: a map specifying the column format
    :param train_file: the name of the train file
    :param test_file: the name of the test file
    :param dev_file: the name of the dev file, if None, dev data is sampled from train
    :param tag_to_bioes: whether to convert to BIOES tagging scheme
    :param column_delimiter: default is to split on any separator, but you can overwrite
        for instance with "\t" to split only on tabs
    :param comment_symbol: if set, lines that begin with this symbol are treated as comments
    :param encoding: encoding of the data files
    :param document_separator_token: If provided, multiple sentences are read into one object.
        Provide the string token that indicates that a new document begins
    :param skip_first_line: set to True if your dataset has a header line
    :param in_memory: If set to True, the dataset is kept in memory as Sentence objects,
        otherwise does disk reads
    :return: a Corpus with annotated train, dev and test data
    """
    # find train, dev and test files if not specified
    dev_file, test_file, train_file = find_train_dev_test_files(
        data_folder, dev_file, test_file, train_file
    )

    # single place for the many reader arguments shared by all three splits
    def _read_column_file(path):
        return ColumnDataset(
            path,
            column_format,
            tag_to_bioes,
            encoding=encoding,
            comment_symbol=comment_symbol,
            column_delimiter=column_delimiter,
            in_memory=in_memory,
            document_separator_token=document_separator_token,
            skip_first_line=skip_first_line,
        )

    # train split is guaranteed by find_train_dev_test_files; dev/test are optional
    train = _read_column_file(train_file)
    test = _read_column_file(test_file) if test_file is not None else None
    dev = _read_column_file(dev_file) if dev_file is not None else None

    super(ColumnCorpus, self).__init__(train, dev, test, name=str(data_folder))
def __init__(
    self,
    data_folder: Union[str, Path],
    columns: Optional[List[int]] = None,
    train_file=None,
    test_file=None,
    dev_file=None,
    use_tokenizer: bool = True,
    max_tokens_per_doc=-1,
    max_chars_per_doc=-1,
    in_memory: bool = True,
    label_type: str = None,
    autofind_splits=True,
    sample_missing_splits: bool = True,
    skip_first_line: bool = False,
    separator: str = '\t',
    encoding: str = 'utf-8'
):
    """
    Corpus for tasks involving pairs of sentences or paragraphs.

    The data files are expected to be in column format where each line has a column
    for the first sentence/paragraph, the second sentence/paragraph and the labels,
    respectively. The columns must be separated by a given separator (default: '\\t').

    :param data_folder: base folder with the task data
    :param columns: List that indicates the columns for the first sentence (first entry
        in the list), the second sentence (second entry) and label (last entry).
        default = [0,1,2]
    :param train_file: the name of the train file
    :param test_file: the name of the test file, if None, test data is sampled from train
        (if sample_missing_splits is true)
    :param dev_file: the name of the dev file, if None, dev data is sampled from train
        (if sample_missing_splits is true)
    :param use_tokenizer: Whether or not to use in-built tokenizer
    :param max_tokens_per_doc: If set, shortens sentences to this maximum number of tokens
    :param max_chars_per_doc: If set, shortens sentences to this maximum number of characters
    :param in_memory: If True, data will be saved in list of flair.data.DataPair objects,
        otherwise we use lists with simple strings which need less space
    :param label_type: Name of the label of the data pairs
    :param autofind_splits: If True, train/test/dev files will be automatically identified
        in the given data_folder
    :param sample_missing_splits: If True, a missing train/test/dev file will be sampled
        from the available data
    :param skip_first_line: If True, first line of data files will be ignored
    :param separator: Separator between columns in data files
    :param encoding: Encoding of data files
    :return: a Corpus with annotated train, dev and test data
    """
    # avoid a shared mutable default argument: resolve the column layout per call
    if columns is None:
        columns = [0, 1, 2]

    # find train, dev and test files if not specified
    dev_file, test_file, train_file = find_train_dev_test_files(
        data_folder, dev_file, test_file, train_file, autofind_splits=autofind_splits
    )

    # single place for the reader arguments shared by all three splits
    def _make_dataset(path) -> FlairDataset:
        return DataPairDataset(
            path,
            columns=columns,
            use_tokenizer=use_tokenizer,
            max_tokens_per_doc=max_tokens_per_doc,
            max_chars_per_doc=max_chars_per_doc,
            in_memory=in_memory,
            label_type=label_type,
            skip_first_line=skip_first_line,
            separator=separator,
            encoding=encoding,
        )

    # create DataPairDataset for train, test and dev file, if they are given
    train: FlairDataset = _make_dataset(train_file) if train_file is not None else None
    test: FlairDataset = _make_dataset(test_file) if test_file is not None else None
    dev: FlairDataset = _make_dataset(dev_file) if dev_file is not None else None

    super(DataPairCorpus, self).__init__(
        train, dev, test,
        sample_missing_splits=sample_missing_splits,
        name=str(data_folder),
    )
def __init__(
    self,
    data_folder: Union[str, Path],
    train_file=None,
    test_file=None,
    dev_file=None,
    in_memory: bool = True,
    fields: Optional[Sequence[str]] = None,
    field_parsers: Optional[Dict[str, conllu._FieldParserType]] = None,
    metadata_parsers: Optional[Dict[str, conllu._MetadataParserType]] = None,
    sample_missing_splits: bool = True,
):
    """
    Instantiates a Corpus from CoNLL-U (Plus) column-formatted task data.

    :param data_folder: base folder with the task data
    :param train_file: the name of the train file
    :param test_file: the name of the test file
    :param dev_file: the name of the dev file, if None, dev data is sampled from train
    :param in_memory: If set to True, keeps full dataset in memory, otherwise does disk reads
    :param fields: optional field names forwarded to the CoNLLUDataset reader
    :param field_parsers: optional per-field parsers forwarded to the CoNLLUDataset reader
    :param metadata_parsers: optional metadata parsers forwarded to the CoNLLUDataset reader
    :param sample_missing_splits: forwarded to the Corpus base class
    :return: a Corpus with annotated train, dev and test data
    """
    # resolve split files unless all were given explicitly
    dev_file, test_file, train_file = find_train_dev_test_files(
        data_folder, dev_file, test_file, train_file
    )

    # the same reader configuration applies to every split
    reader_kwargs = dict(
        in_memory=in_memory,
        fields=fields,
        field_parsers=field_parsers,
        metadata_parsers=metadata_parsers,
    )

    train = CoNLLUDataset(train_file, **reader_kwargs)
    test = CoNLLUDataset(test_file, **reader_kwargs) if test_file is not None else None
    dev = CoNLLUDataset(dev_file, **reader_kwargs) if dev_file is not None else None

    super(CoNLLUCorpus, self).__init__(
        train, dev, test,
        name=str(data_folder),
        sample_missing_splits=sample_missing_splits,
    )
def __init__(
    self,
    data_folder: Union[str, Path],
    train_file=None,
    test_file=None,
    dev_file=None,
    in_memory: bool = True,
    fields: Optional[Sequence[str]] = None,
    token_annotation_fields: Optional[Sequence[str]] = None,
    field_parsers: Optional[Dict[str, conllu._FieldParserType]] = None,
    metadata_parsers: Optional[Dict[str, conllu._MetadataParserType]] = None,
    sample_missing_splits: bool = True,
):
    """
    Instantiates a Corpus from CoNLL-U (Plus) column-formatted task data.

    Universal dependencies corpora that contain multi-word tokens are not supported yet.
    The annotation of flair sentences with the "deps" column is not yet supported as well.
    Please consider using the "UniversalDependenciesCorpus" instead.

    :param data_folder: base folder with the task data
    :param train_file: the name of the train file
    :param test_file: the name of the test file
    :param dev_file: the name of the dev file, if None, dev data is sampled from train
    :param in_memory: If set to True, keeps full dataset in memory, otherwise does disk reads
    :param fields: optional field names forwarded to the CoNLLUDataset reader
    :param token_annotation_fields: A subset of the fields parameter for token level annotations
    :param field_parsers: optional per-field parsers forwarded to the CoNLLUDataset reader
    :param metadata_parsers: optional metadata parsers forwarded to the CoNLLUDataset reader
    :param sample_missing_splits: forwarded to the Corpus base class
    :return: a Corpus with annotated train, dev and test data
    """
    # resolve split files unless all were given explicitly
    dev_file, test_file, train_file = find_train_dev_test_files(
        data_folder, dev_file, test_file, train_file
    )

    # identical reader configuration for every split
    reader_kwargs = dict(
        in_memory=in_memory,
        fields=fields,
        token_annotation_fields=token_annotation_fields,
        field_parsers=field_parsers,
        metadata_parsers=metadata_parsers,
    )

    train = CoNLLUDataset(train_file, **reader_kwargs)
    test = CoNLLUDataset(test_file, **reader_kwargs) if test_file is not None else None
    dev = CoNLLUDataset(dev_file, **reader_kwargs) if dev_file is not None else None

    super(CoNLLUCorpus, self).__init__(
        train, dev, test,
        name=str(data_folder),
        sample_missing_splits=sample_missing_splits,
    )
def __init__(
    self,
    data_folder: Union[str, Path],
    column_name_map: Dict[int, str],
    label_type: str = 'class',
    train_file=None,
    test_file=None,
    dev_file=None,
    tokenizer: Callable[[str], List[Token]] = segtok_tokenizer,
    max_tokens_per_doc=-1,
    max_chars_per_doc=-1,
    in_memory: bool = False,
    skip_header: bool = False,
    encoding: str = 'utf-8',
    **fmtparams,
):
    """
    Instantiates a Corpus for text classification from CSV column formatted data.

    :param data_folder: base folder with the task data
    :param column_name_map: a column name map that indicates which column is text
        and which the label(s)
    :param label_type: name of the label
    :param train_file: the name of the train file
    :param test_file: the name of the test file
    :param dev_file: the name of the dev file, if None, dev data is sampled from train
    :param tokenizer: tokenizer used to split text into Tokens (default: segtok_tokenizer)
    :param max_tokens_per_doc: If set, truncates each Sentence to a maximum number of Tokens
    :param max_chars_per_doc: If set, truncates each Sentence to a maximum number of chars
    :param in_memory: If True, keeps dataset as Sentences in memory,
        otherwise only keeps strings
    :param skip_header: set to True if your CSV files have a header line
    :param encoding: encoding of the data files
    :param fmtparams: additional parameters for the CSV file reader
    :return: a Corpus with annotated train, dev and test data
    """
    # find train, dev and test files if not specified
    dev_file, test_file, train_file = find_train_dev_test_files(
        data_folder, dev_file, test_file, train_file
    )

    # all three splits share the exact same reader configuration
    def _csv_dataset(path) -> FlairDataset:
        return CSVClassificationDataset(
            path,
            column_name_map,
            label_type=label_type,
            tokenizer=tokenizer,
            max_tokens_per_doc=max_tokens_per_doc,
            max_chars_per_doc=max_chars_per_doc,
            in_memory=in_memory,
            skip_header=skip_header,
            encoding=encoding,
            **fmtparams,
        )

    # train split is guaranteed by find_train_dev_test_files; dev/test are optional
    train: FlairDataset = _csv_dataset(train_file)
    test: FlairDataset = _csv_dataset(test_file) if test_file is not None else None
    dev: FlairDataset = _csv_dataset(dev_file) if dev_file is not None else None

    super(CSVClassificationCorpus, self).__init__(
        train, dev, test, name=str(data_folder)
    )
def __init__(
    self,
    data_folder: Union[str, Path],
    label_type: str = 'class',
    train_file=None,
    test_file=None,
    dev_file=None,
    tokenizer: Callable[[str], List[Token]] = space_tokenizer,
    truncate_to_max_tokens: int = -1,
    truncate_to_max_chars: int = -1,
    filter_if_longer_than: int = -1,
    in_memory: bool = False,
    encoding: str = 'utf-8',
):
    """
    Instantiates a Corpus from text classification-formatted task data.

    :param data_folder: base folder with the task data
    :param label_type: name of the label
    :param train_file: the name of the train file
    :param test_file: the name of the test file
    :param dev_file: the name of the dev file, if None, dev data is sampled from train
    :param tokenizer: tokenizer used to split text into Tokens (default: space_tokenizer)
    :param truncate_to_max_tokens: If set, truncates each Sentence to a maximum number of Tokens
    :param truncate_to_max_chars: If set, truncates each Sentence to a maximum number of chars
    :param filter_if_longer_than: If set, filters out sentences longer than this token count
    :param in_memory: If True, keeps dataset as Sentences in memory,
        otherwise only keeps strings
    :param encoding: encoding of the data files
    :return: a Corpus with annotated train, dev and test data
    """
    # find train, dev and test files if not specified
    dev_file, test_file, train_file = find_train_dev_test_files(
        data_folder, dev_file, test_file, train_file
    )

    # all three splits share the exact same reader configuration
    def _classification_dataset(path) -> FlairDataset:
        return ClassificationDataset(
            path,
            label_type=label_type,
            tokenizer=tokenizer,
            truncate_to_max_tokens=truncate_to_max_tokens,
            truncate_to_max_chars=truncate_to_max_chars,
            filter_if_longer_than=filter_if_longer_than,
            in_memory=in_memory,
            encoding=encoding,
        )

    # train split is guaranteed by find_train_dev_test_files
    train: FlairDataset = _classification_dataset(train_file)

    # use test_file to create test split if available
    test: FlairDataset = _classification_dataset(test_file) if test_file is not None else None

    # use dev_file to create dev split if available
    dev: FlairDataset = _classification_dataset(dev_file) if dev_file is not None else None

    super(ClassificationCorpus, self).__init__(
        train, dev, test, name=str(data_folder)
    )