Exemple #1
0
    def _get_language_data_path(
            self,
            file_service: FileService,
            run_type: RunType):
        """Return the path of the language-data pickle for ``run_type``.

        The path is ``<data path>/<run_type.to_str()>_language_data.pickle``.
        When nothing exists at that path, ``preprocess_data`` is invoked with
        the tokenize, metrics and vocabulary services, the pickles directory,
        the challenge's ``full`` data directory and the output data directory.
        Presumably that call produces the pickle; this method does not
        verify it.

        Args:
            file_service: Supplies the data, challenge and pickles paths.
            run_type: Run type whose ``to_str()`` value prefixes the file name.

        Returns:
            The language-data pickle path. It is returned even if the file is
            still absent after preprocessing.
        """
        output_data_path = file_service.get_data_path()
        language_data_path = os.path.join(
            output_data_path, f'{run_type.to_str()}_language_data.pickle')

        # Nothing to do when the pickle is already on disk.
        if os.path.exists(language_data_path):
            return language_data_path

        full_data_path = os.path.join(
            file_service.get_challenge_path(), 'full')

        # TODO: Fix download. Populating `full_data_path` from the
        # 'data/newseye' and 'data/trove' corpora (formerly done through
        # `ocr_download.combine_data`) is disabled, so the directory is handed
        # to `preprocess_data` as-is, even when it is missing or empty.
        preprocess_data(
            self._tokenize_service,
            self._metrics_service,
            self._vocabulary_service,
            file_service.get_pickles_path(),
            full_data_path,
            output_data_path)

        return language_data_path
    def _get_language_data_path(self, file_service: FileService,
                                run_type: RunType):
        """Locate the language-data pickle, preprocessing first if needed.

        The file name is ``<run_type.to_str()>_language_data.pickle`` inside
        the directory returned by ``file_service.get_data_path()``. When no
        such file exists, ``preprocess_data`` is called with the pickles path
        as training data, ``None`` as test data, the output directory, the
        tokenizer and the vocabulary service. Presumably that call creates
        the pickle; this method does not check.

        Returns:
            The pickle path, regardless of whether preprocessing ran.
        """
        data_dir = file_service.get_data_path()
        pickle_name = f'{run_type.to_str()}_language_data.pickle'
        language_data_path = os.path.join(data_dir, pickle_name)

        # Fast path: the pickle already exists.
        if os.path.exists(language_data_path):
            return language_data_path

        preprocess_data(
            file_service.get_pickles_path(),  # training data location
            None,                             # no separate test data
            data_dir,
            self._tokenize_service.tokenizer,
            self._vocabulary_service)

        return language_data_path