Exemple #1
0
    def load_dataset(self, input_file: Optional[str] = None, input_dataset: Optional[Dataset] = None) -> None:
        """Populate ``self.dataset`` from an in-memory object or from a file.

        Resolution order:

        1. If ``input_dataset`` is given it is stored as-is. When an
           ``input_file`` is passed as well, a warning is logged and the file
           is ignored; ``self.input_file`` is left untouched.
        2. Otherwise, if ``input_file`` is given, it is passed through
           ``click.format_filename`` and loaded with
           ``helpers.load_input_file``.
        3. Otherwise the path returned by
           ``helpers.find_latest_file(self.config.PATHS.data)`` is loaded.

        In cases 2 and 3 the resolved path is recorded in ``self.input_file``.

        Args:
            input_file: Path of the file to load, or ``None``.
            input_dataset: Already-loaded dataset object, or ``None``.
        """
        logging.info('Loading dataset...')

        # An explicit object always wins over any file path.
        if input_dataset is not None:
            if input_file is not None:
                logging.warning('Both an input file and an input Dataset object were provided. Using the object.')
            # Previously this assignment was skipped when a file was also
            # given, so the "using the object" warning was not honoured.
            self.dataset = input_dataset
            return

        if input_file is None:
            self.input_file = helpers.find_latest_file(self.config.PATHS.data)
        else:
            self.input_file = click.format_filename(input_file)
        self.dataset = helpers.load_input_file(self.input_file)  # type: ignore
Exemple #2
0
    def load_model(self,
                   input_file: Optional[str] = None,
                   input_model: Optional[Model] = None) -> None:
        """Populate ``self.model`` from an in-memory object or from a file.

        Resolution order:

        1. If ``input_model`` is given it is stored as-is. When an
           ``input_file`` is passed as well, a warning is logged and the file
           is ignored; ``self.input_file`` is left untouched.
        2. Otherwise, if ``input_file`` is given, it is passed through
           ``click.format_filename`` and loaded with
           ``helpers.load_input_file``.
        3. Otherwise the path returned by
           ``helpers.find_latest_file(self.config.PATHS.models)`` is loaded.

        In cases 2 and 3 the resolved path is recorded in ``self.input_file``.

        Args:
            input_file: Path of the file to load, or ``None``.
            input_model: Already-loaded model object, or ``None``.
        """
        logging.info('Loading model...')

        # An explicit object always wins over any file path.
        if input_model is not None:
            if input_file is not None:
                logging.warning(
                    'Both an input file and an input Model object were provided. Using the object.'
                )
            # Previously this assignment was skipped when a file was also
            # given, so the "using the object" warning was not honoured.
            self.model = input_model
            return

        if input_file is None:
            self.input_file = helpers.find_latest_file(
                self.config.PATHS.models)
        else:
            self.input_file = click.format_filename(input_file)
        self.model = helpers.load_input_file(
            self.input_file)  # type: ignore
def clean_data() -> List[Tuple]:
    """Convert every configured country file into spaCy-style training tuples.

    Each entry of ``config.ALL_DATA`` is loaded with
    ``helpers.load_input_file`` (Doccano format, per the original comment) and
    every row is turned into ``(text, {'entities': [(a, b, LABEL), ...]})``,
    where ``a``/``b``/``LABEL`` are the first three items of each element of
    ``row['labels']`` and the third item is upper-cased.
    NOTE(review): ``a``/``b`` are presumably character start/end offsets --
    confirm against the Doccano export.

    Returns:
        The converted rows of *all* countries, in ``config.ALL_DATA`` order.
        An empty list is returned when ``config.ALL_DATA`` is empty.
    """
    # Accumulate across countries: the previous version rebound the result on
    # every iteration, so only the last country survived, and an empty
    # ``config.ALL_DATA`` raised UnboundLocalError at the return statement.
    spacy_training_data: List[Tuple] = []

    for country in config.ALL_DATA:
        # Read country data; Doccano format
        data = helpers.load_input_file(country)

        # Clean data to fit Spacy format
        spacy_training_data.extend(
            (row['text'], {
                'entities': [(label[0], label[1], label[2].upper())
                             for label in row['labels']]
            })
            for row in data
        )

    return spacy_training_data