def _get_language_data_path(
        self,
        file_service: FileService,
        run_type: RunType):
    output_data_path = file_service.get_data_path()
    language_data_path = os.path.join(
        output_data_path, f'{run_type.to_str()}_language_data.pickle')

    if not os.path.exists(language_data_path):
        challenge_path = file_service.get_challenge_path()
        full_data_path = os.path.join(challenge_path, 'full')
        if not os.path.exists(full_data_path) or len(os.listdir(full_data_path)) == 0:
            newseye_path = os.path.join('data', 'newseye')
            trove_path = os.path.join('data', 'trove')
            # ocr_download.combine_data(challenge_path, newseye_path, trove_path)  # TODO Fix download

        pickles_path = file_service.get_pickles_path()
        preprocess_data(
            self._tokenize_service,
            self._metrics_service,
            self._vocabulary_service,
            pickles_path,
            full_data_path,
            output_data_path)

    return language_data_path

def process_file():
    if not request.json or not request.json.get('fileId'):
        requestData = str(request.json or request.form or request.data)
        return make_response('Invalid content: ' + requestData, 400)

    db = MongoInit().initialize()
    payload = {
        'fileId': request.json['fileId'],
        'user': get_current_user().id
    }

    fileService = FileService(db)
    messageService = MessageService(db)

    chunks = fileService.getChunksByFileId(payload['fileId'])
    messages = []
    if all(c.user == payload['user'] for c in chunks):
        messages = messageService.parseFileChunks(chunks)

    topMessages = []
    for message in messages[:50]:
        topMessages.append({
            'subject': message.subject,
            'sender': message.sender,
            'content': message.content,
            'date': message.date
        })

    result = {'fileId': payload['fileId'], 'messages': topMessages}
    return make_response(jsonify(result))

def create_file():
    if not request.json or not request.json.get('data'):
        requestData = str(request.json or request.form or request.data)
        return make_response('Invalid content: ' + requestData, 400)

    db = MongoInit().initialize()
    payload = {
        'fileId': request.json['fileId'],
        'data': request.json['data'],
        'position': request.json['position'],
        'user': get_current_user().id
    }

    fileService = FileService(db)
    file = fileService.processFileChunk(
        None,
        payload['fileId'],
        payload['user'],
        payload['data'],
        payload['position'])

    return make_response(jsonify({'fileId': file._id, 'position': file.position}), 201)

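# The two Flask handlers above expect a JSON body and return JSON; a hypothetical
# client-side sketch is shown below. The host, route paths, and authentication are
# not part of this section, so every URL here is a placeholder assumption.
import requests

BASE_URL = 'http://localhost:5000'  # placeholder host

# Upload one chunk of a file (served by create_file); 'fileId', 'data' and
# 'position' mirror the payload fields read by the handler.
create_response = requests.post(
    f'{BASE_URL}/files',  # placeholder route
    json={'fileId': 'abc123', 'data': '<chunk contents>', 'position': 0})
print(create_response.status_code)  # 201 on success

# Ask the server to parse the uploaded chunks (served by process_file).
process_response = requests.post(
    f'{BASE_URL}/files/process',  # placeholder route
    json={'fileId': 'abc123'})
print(process_response.json()['messages'])
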
def _get_language_data_path(self, file_service: FileService, run_type: RunType):
    output_data_path = file_service.get_data_path()
    language_data_path = os.path.join(
        output_data_path, f'{run_type.to_str()}_language_data.pickle')

    if not os.path.exists(language_data_path):
        train_data_path = file_service.get_pickles_path()
        test_data_path = None
        preprocess_data(
            train_data_path,
            test_data_path,
            output_data_path,
            self._tokenize_service.tokenizer,
            self._vocabulary_service)

    return language_data_path

def __init__(
        self,
        language: str,
        arguments_service: PostOCRArgumentsService,
        file_service: FileService,
        tokenize_service: BaseTokenizeService,
        run_type: RunType,
        **kwargs):
    super(NewsEyeDataset, self).__init__()

    self._arguments_service = arguments_service

    output_data_path = file_service.get_data_path()
    language_data_path = os.path.join(
        output_data_path, f'{run_type.to_str()}_language_data.pickle')

    if not tokenize_service.is_tokenizer_loaded():
        full_data_path = os.path.join(
            'data', 'ICDAR2019_POCR_competition_dataset',
            'ICDAR2019_POCR_competition_full_22M_without_Finnish')
        vocabulary_size = tokenize_service.vocabulary_size
        train_spm_model(full_data_path, output_data_path, language, vocabulary_size)
        tokenize_service.load_tokenizer_model()

    if not os.path.exists(language_data_path):
        train_data_path = os.path.join(
            'data', 'ICDAR2019_POCR_competition_dataset',
            'ICDAR2019_POCR_competition_training_18M_without_Finnish')
        test_data_path = os.path.join(
            'data', 'ICDAR2019_POCR_competition_dataset',
            'ICDAR2019_POCR_competition_evaluation_4M_without_Finnish')
        preprocess_data(language, train_data_path, test_data_path,
                        output_data_path, tokenize_service.tokenizer)

    with open(language_data_path, 'rb') as data_file:
        self._language_data: LanguageData = pickle.load(data_file)

def __init__(self,
             language: Language,
             arguments_service: PretrainedArgumentsService,
             tokenize_service: BaseTokenizeService,
             file_service: FileService,
             vocabulary_service: VocabularyService,
             **kwargs):
    super(SemEvalTestDataset, self).__init__()

    self._arguments_service = arguments_service

    challenge_path = file_service.get_challenge_path()
    targets_path = os.path.join(challenge_path, 'eval', str(language), 'targets.txt')

    with open(targets_path, 'r', encoding='utf-8') as targets_file:
        self._target_words = targets_file.read().splitlines()
        self._target_words.sort(key=lambda v: v.upper())

    # English target words end with POS tags (e.g. 'test_nn'), so strip the suffix
    if language == Language.English:
        target_words = [x[:-3] for x in self._target_words]
    else:
        target_words = self._target_words

    if arguments_service.include_pretrained_model:
        encodings = tokenize_service.encode_sequences(target_words)
        self._target_word_ids = [x[0] for x in encodings]
    else:
        self._target_word_ids = [
            vocabulary_service.string_to_id(target_word)
            for target_word in target_words
        ]

def post(self):
    files = request.files.getlist('files')
    file_service = FileService()
    # upload_result = file_service.uploan_photo(files)
    # In production, use the method below and change the URL prefix to
    # https://s3-ap-northeast-1.amazonaws.com/tokenpark-test/
    upload_result = file_service.uploan_photo_s3_session(files)
    flag = upload_result["flag"]
    file_path_list = upload_result["file_path_list"]
    if flag == 0:
        pass
    elif flag == 1:
        self.return_error(20001)
    elif flag == 2:
        self.return_error(20002)
    elif flag == 3:
        self.return_error(20003)
    return file_path_list

def _fetch_size(self):
    """
    Return the file size in formatted (human-readable) output.

    :return: the formatted file size
    """
    length = self._file_info.get(LENGTH)
    if not length:
        return None

    return FileService.calculate_formatted_size(length)

def _fetch_size(self):
    """
    Return the torrent's piece size in formatted output.

    :return: the formatted piece size of the torrent
    :rtype: string
    """
    info = self._metainfo.get(INFO)
    if not info:
        return None

    piece_length = info.get(PIECE_LENGTH)
    if not piece_length:
        return None

    return FileService.calculate_formatted_size(piece_length)

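# Both _fetch_size helpers above delegate to FileService.calculate_formatted_size,
# which is not shown in this section. A minimal sketch of such a helper, assuming
# it turns a raw byte count into a human-readable string, might look like this:
def calculate_formatted_size(num_bytes):
    """Hypothetical helper: format a byte count as a human-readable size string."""
    units = ['B', 'KiB', 'MiB', 'GiB', 'TiB']
    size = float(num_bytes)
    for unit in units:
        if size < 1024 or unit == units[-1]:
            return f'{size:.2f} {unit}'
        size /= 1024

# Example: calculate_formatted_size(262144) returns '256.00 KiB'
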
def __init__(
        self,
        file_service: FileService,
        device: str,
        pretrained_representations_options: PretrainedRepresentationsOptions):
    super().__init__()

    self._device = device
    self.do_not_save: bool = (
        not pretrained_representations_options.fine_tune_pretrained and
        not pretrained_representations_options.fine_tune_after_convergence)

    self._include_pretrained = pretrained_representations_options.include_pretrained_model
    self._pretrained_model_size = pretrained_representations_options.pretrained_model_size
    self._pretrained_weights = pretrained_representations_options.pretrained_weights
    self._pretrained_max_length = pretrained_representations_options.pretrained_max_length
    self._pretrained_model: PreTrainedModel = None

    self._fine_tune_pretrained = pretrained_representations_options.fine_tune_pretrained
    self._fine_tune_after_convergence = pretrained_representations_options.fine_tune_after_convergence

    self._include_fasttext_model = pretrained_representations_options.include_fasttext_model

    if self._include_pretrained and self._pretrained_model_size and self._pretrained_weights:
        if pretrained_representations_options.pretrained_model == PretrainedModel.BERT:
            self._pretrained_model = BertModel.from_pretrained(
                pretrained_representations_options.pretrained_weights)
        elif pretrained_representations_options.pretrained_model == PretrainedModel.CamemBERT:
            self._pretrained_model = CamembertModel.from_pretrained(
                pretrained_representations_options.pretrained_weights)

        if pretrained_representations_options.fine_tune_pretrained:
            self._pretrained_model.train()
        else:
            self._pretrained_model.eval()

    if self._include_fasttext_model:
        assert pretrained_representations_options.fasttext_model is not None, \
            'fast text model is not supplied when include-fasttext-model is set to true'

        data_path = file_service.get_initial_data_path()
        fasttext_path = os.path.join(
            data_path, 'fasttext', pretrained_representations_options.fasttext_model)
        assert os.path.exists(fasttext_path), \
            f'fast text model not found in {fasttext_path}'

        self._fasttext_dimension = pretrained_representations_options.fasttext_model_size
        self._fasttext_model = fasttext.load_model(fasttext_path)

def map_action(self) -> (str, bool):
    """
    Map the action to the corresponding method on FileService and invoke it.

    :return: the response returned by the mapped FileService method
    :raises BaseException: if the action does not map to a FileService method
    """
    file_service = FileService(self.file, **self.args)

    try:
        response = getattr(file_service, self.action)()
    except AttributeError:
        raise BaseException(
            "Unexpected action: '{}' does not map to controller".format(self.action))

    return response

def __init__(self,
             arguments_service: PretrainedArgumentsService,
             dataloader_service: DataLoaderService,
             loss_function: LossBase,
             optimizer: OptimizerBase,
             log_service: LogService,
             file_service: FileService,
             model: ModelBase):
    self._arguments_service = arguments_service
    self._model_path = file_service.get_checkpoints_path()
    self._optimizer_base = optimizer
    self._log_service = log_service
    self._dataloader_service = dataloader_service
    self._loss_function = loss_function
    self._model = model.to(arguments_service.device)

    self.data_loader_train: DataLoader = None
    self.data_loader_validation: DataLoader = None

    self._initial_patience = self._arguments_service.patience
    # if we are going to fine-tune after initial convergence,
    # then we set a low patience first and use the real one in
    # the second training iteration set
    if self._arguments_service.fine_tune_after_convergence:
        self._initial_patience = 5

def save_results(self, path=Defaults.output_path):
    output = 'image_id,category\n'
    for image in sorted(self._classified_images, key=lambda img: img.id):
        output += '{},{}\n'.format(image.id, image.classification)

    FileService().write_csv_file(path, output)

def load_images(self, path):
    image_data_array = FileService().read_h5_file(path)
    for image_id, image_data in enumerate(image_data_array):
        self._image_queue.enqueue(Image(image_id, image_data))

    self._image_queue.shuffle()
    self._get_next_image()

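# save_results and load_images above rely on FileService.write_csv_file and
# FileService.read_h5_file, which are not shown here. A minimal sketch, assuming
# write_csv_file persists an already-formatted CSV string and read_h5_file loads a
# single dataset with h5py (the dataset name 'images' is an assumption), could be:
import h5py


class FileService:
    def write_csv_file(self, path, content):
        # Write the pre-formatted CSV string to disk as plain text.
        with open(path, 'w', encoding='utf-8') as csv_file:
            csv_file.write(content)

    def read_h5_file(self, path, dataset_name='images'):
        # Load the dataset fully into memory as a NumPy array.
        with h5py.File(path, 'r') as h5_file:
            return h5_file[dataset_name][:]
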
from services.scrape_service import scrapeContent
from services.file_service import FileService
from services.email_service import sendMail
from services.argument_parser import ArgumentParser
from helper.url_builder import buildUrl
import numpy as np
import os

appPath = os.path.dirname(os.path.abspath(__file__))
fileService = FileService(appPath)
arguments = ArgumentParser()

url = buildUrl(arguments)
currentBikeList = scrapeContent(url)
storedBikeList = fileService.readFromFile(arguments.fileName)

newBikesExists = not np.array_equal(currentBikeList, storedBikeList)
if newBikesExists:
    newBikes = [item for item in currentBikeList if item not in storedBikeList]
    if len(newBikes):
        sendMail(arguments.email, newBikes)
    fileService.writeToFile(arguments.fileName, currentBikeList)

from services.general_service import GeneralService
from services.file_service import FileService
from services.os_service import OSService
from services.music_service import MusicService
from services.simon_says_service import SimonSaysService
from services.quiz_service import QuizService
# The following two imports are missing from the original snippet; the module
# paths are assumed to follow the same services.*_service naming pattern.
from services.internet_service import InternetService
from services.api_call_service import ApiCallService
from assistant_logic.assistant import Assistant

# Program dependencies
assistant = Assistant()
os_service = OSService()
internet_service = InternetService()
general_service = GeneralService()
simon_says_service = SimonSaysService()
quiz_service = QuizService()
file_service = FileService("./appsettings.json")
appsettings = file_service.read_appsettings()
api_call_service = ApiCallService(appsettings["WeatherApiKey"], appsettings["WolframalphaApiKey"])
music_service = MusicService(appsettings["MusixmatchApiKey"])

# Startup
print("Loading your AI personal assistant Jarvis")
assistant.speak("Loading your AI personal assistant Jarvis")
assistant.wish_me()
print("\n ############### ")
print(" ##           ## ")
print(" #  ~~     ~~  # ")
print(" #  ()     ()  # ")
print(" (      ^      ) ")

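# read_appsettings() above is indexed with three API-key entries; a minimal
# appsettings.json matching those lookups could be generated with the sketch
# below (the values are placeholders, and any keys beyond these three are unknown).
import json

sample_appsettings = {
    "WeatherApiKey": "<your-weather-api-key>",
    "WolframalphaApiKey": "<your-wolframalpha-api-key>",
    "MusixmatchApiKey": "<your-musixmatch-api-key>",
}

with open("./appsettings.json", "w", encoding="utf-8") as settings_file:
    json.dump(sample_appsettings, settings_file, indent=4)
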
def __init__(self,
             arguments_service: NERArgumentsService,
             vocabulary_service: VocabularyService,
             file_service: FileService,
             tokenize_service: BaseTokenizeService,
             data_service: DataService,
             cache_service: CacheService,
             string_process_service: StringProcessService):
    super().__init__()

    self._arguments_service = arguments_service
    self._tokenize_service = tokenize_service
    self._file_service = file_service
    self._data_service = data_service
    self._string_process_service = string_process_service

    self._entity_tag_types = arguments_service.entity_tag_types
    self._data_version = "1.3"

    self.PAD_TOKEN = '[PAD]'
    self.START_TOKEN = '[CLS]'
    self.STOP_TOKEN = '[SEP]'

    self.pad_idx = 0
    self.start_idx = 1
    self.stop_idx = 2

    data_path = file_service.get_data_path()
    language_suffix = self.get_language_suffix(arguments_service.language)

    train_cache_key = (
        f'train-hipe-data-v{self._data_version}'
        f'-limit-{arguments_service.train_dataset_limit_size}'
        f'-{arguments_service.split_type.value}'
        f'-merge-{arguments_service.merge_subwords}'
        f'-replacen-{arguments_service.replace_all_numbers}')
    validation_cache_key = (
        f'validation-hipe-data-v{self._data_version}'
        f'-limit-{arguments_service.validation_dataset_limit_size}'
        f'-{arguments_service.split_type.value}'
        f'-merge-{arguments_service.merge_subwords}'
        f'-replacen-{arguments_service.replace_all_numbers}')

    self._train_ne_collection = cache_service.get_item_from_cache(
        item_key=train_cache_key,
        callback_function=lambda: self.preprocess_data(
            os.path.join(
                data_path,
                f'HIPE-data-v{self._data_version}-train-{language_suffix}.tsv'),
            limit=arguments_service.train_dataset_limit_size))

    self._validation_ne_collection = cache_service.get_item_from_cache(
        item_key=validation_cache_key,
        callback_function=lambda: self.preprocess_data(
            os.path.join(
                data_path,
                f'HIPE-data-v{self._data_version}-dev-{language_suffix}.tsv'),
            limit=arguments_service.validation_dataset_limit_size))

    if arguments_service.evaluate:
        test_cache_key = (
            f'test-hipe-data-v{self._data_version}'
            f'-{arguments_service.split_type.value}'
            f'-merge-{arguments_service.merge_subwords}'
            f'-replacen-{arguments_service.replace_all_numbers}')
        self._test_ne_collection = cache_service.get_item_from_cache(
            item_key=test_cache_key,
            callback_function=lambda: self.preprocess_data(
                os.path.join(
                    data_path,
                    f'HIPE-data-v{self._data_version}-test-{language_suffix}.tsv')))

    self._entity_mappings = self._create_entity_mappings(
        self._train_ne_collection,
        self._validation_ne_collection)

    vocabulary_cache_key = f'char-vocabulary-{self._data_version}'
    vocabulary_data = cache_service.get_item_from_cache(
        item_key=vocabulary_cache_key,
        callback_function=lambda: self._generate_vocabulary_data(
            language_suffix, self._data_version))

    vocabulary_service.initialize_vocabulary_data(vocabulary_data)