Example #1
0
    def _get_language_data_path(
            self,
            file_service: FileService,
            run_type: RunType):
        output_data_path = file_service.get_data_path()
        language_data_path = os.path.join(
            output_data_path, f'{run_type.to_str()}_language_data.pickle')

        if not os.path.exists(language_data_path):
            challenge_path = file_service.get_challenge_path()
            full_data_path = os.path.join(challenge_path, 'full')
            if not os.path.exists(full_data_path) or len(os.listdir(full_data_path)) == 0:
                newseye_path = os.path.join('data', 'newseye')
                trove_path = os.path.join('data', 'trove')
                # ocr_download.combine_data(challenge_path, newseye_path, trove_path)
                # TODO Fix download

            pickles_path = file_service.get_pickles_path()
            train_data_path = file_service.get_pickles_path()
            preprocess_data(
                self._tokenize_service,
                self._metrics_service,
                self._vocabulary_service,
                pickles_path,
                full_data_path,
                output_data_path)

        return language_data_path
    def __init__(self, language: Language,
                 arguments_service: PretrainedArgumentsService,
                 tokenize_service: BaseTokenizeService,
                 file_service: FileService,
                 vocabulary_service: VocabularyService, **kwargs):
        """Load the SemEval evaluation target words for `language` and map
        each one to an id (pretrained-tokenizer id or vocabulary id).

        :param language: which language's `targets.txt` to read
        :param arguments_service: supplies `include_pretrained_model`
        :param tokenize_service: used when a pretrained model is included
        :param file_service: resolves the challenge folder on disk
        :param vocabulary_service: fallback word-to-id lookup
        """
        super(SemEvalTestDataset, self).__init__()

        self._arguments_service = arguments_service

        # Targets live at <challenge>/eval/<language>/targets.txt
        targets_path = os.path.join(
            file_service.get_challenge_path(), 'eval', str(language),
            'targets.txt')

        with open(targets_path, 'r', encoding='utf-8') as targets_file:
            self._target_words = targets_file.read().splitlines()

        # Case-insensitive alphabetical order.
        self._target_words.sort(key=lambda word: word.upper())

        # English words end with POS tags (e.g. 'test_nn') — strip the
        # trailing 3 characters before looking the words up.
        lookup_words = (
            [word[:-3] for word in self._target_words]
            if language == Language.English
            else self._target_words)

        if arguments_service.include_pretrained_model:
            self._target_word_ids = [
                encoding[0]
                for encoding in tokenize_service.encode_sequences(lookup_words)]
        else:
            self._target_word_ids = [
                vocabulary_service.string_to_id(word)
                for word in lookup_words]