def __create_folders(self):
    datasetAlreadyExists = True

    # initialize dataset folders
    if not os.path.isdir(FileManager.datasets['training']['url']):
        datasetAlreadyExists = False
        os.mkdir(FileManager.datasets['training']['url'])
    if not os.path.isdir(FileManager.datasets['testing']['url']):
        datasetAlreadyExists = False
        os.mkdir(FileManager.datasets['testing']['url'])

    # initialize output folders
    if not os.path.isdir(FileManager.getFeaturesFolderUrl()):
        os.mkdir(FileManager.getFeaturesFolderUrl())
    if not os.path.isdir(FileManager.getModelsFolderUrl()):
        os.mkdir(FileManager.getModelsFolderUrl())
    if not os.path.isdir(FileManager.getVocabularyFolderUrl()):
        os.mkdir(FileManager.getVocabularyFolderUrl())
    if not os.path.isdir(FileManager.getReportsFolderUrl()):
        os.mkdir(FileManager.getReportsFolderUrl())

    return datasetAlreadyExists
def updateConfigFile(self, args):
    if self._argsAreValidForUpdate(args):
        try:
            jfw = JsonFileWrapper(self._getConfigFilePath(args.name))
            actualChapter = jfw.getKey(JsonFileWrapper.CHAPTER)
            if actualChapter is not None:
                self._logger.info("class Main : update config for {}, set chapter from {} to {}"
                                  .format(args.name, actualChapter, args.chapter))
                jfw.update(JsonFileWrapper.CHAPTER, str(args.chapter))
                jfw.save()
                self._logger.info("class Main : cleaning download directory")
                FileManager.getFileManager().cleanMangaDirectory(jfw.getKey(JsonFileWrapper.NAME),
                                                                 jfw.getKey(JsonFileWrapper.CHAPTER))
            else:
                self._logger.error("class Main : can not set chapter {} for manga {} : actual chapter is {}"
                                   .format(args.chapter, args.name, actualChapter))
        except IOError as e:
            self._logger.error("class Main : ioerror {}".format(e))
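# A minimal usage sketch of updateConfigFile, not part of the original code: the
# 'name' and 'chapter' attributes on args come from the calls above, but the
# Main() constructor and the use of argparse.Namespace here are assumptions, and
# the manga name/chapter values are placeholders.
from argparse import Namespace

main = Main()  # hypothetical: the owning class is only known from its log messages
main.updateConfigFile(Namespace(name='one-piece', chapter=1050))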
def recognize_pooling_srv():
    # Load configuration
    cfg = ConfigObj(os.path.join(os.path.dirname(os.path.abspath(__file__)), 'offline_stt_api_config.ini'))
    model_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), cfg['stt']['model_path'])
    wav_storage_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), cfg['upload']['wav_storage_path'])
    wav_path_after_stt = os.path.join(os.path.dirname(os.path.abspath(__file__)), cfg['stt']['wav_path_after_stt'])
    metadata_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), cfg['upload']['metadata_path'])
    op_log_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), cfg['log']['log_storage_path'])
    stt_result_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), cfg['stt']['stt_res_storage_path'])

    # Initialize the recognizer
    rec = Recognizer(Model(model_path))

    # Main loop
    while True:
        # Scan for wav files, recognize them one by one, and move each file when done
        wav_list = fm.list_all_file_type_in_dir(r'\.wav$', wav_storage_path)
        if len(wav_list) <= 0:
            time.sleep(2)
            continue
        wav_list.sort(key=os.path.getctime)
        for i in wav_list:
            log_name = i.split('/', -1)[-1].rsplit('.', 1)[0]
            i_metadata_path = f'{metadata_path}/{log_name}.json'
            if not os.path.isfile(i_metadata_path):
                print(f'[Check needed] metadata file does not exist for wav path {i}; skipping this file')
                continue
            t_start = UtcTime.get_current_utc_time()
            print(f'[Recognition started] recognizing {i} at {t_start}')
            sentence = rec.recognize_wav_from_path(i)
            t_end = UtcTime.get_current_utc_time()
            with open(i_metadata_path, 'r') as f:
                metadata = json.load(f)

            # Write results: merge in the metadata
            sentence.update(metadata)
            # Write results: recognition fields
            sentence['key'] = log_name
            sentence['recog_start_time'] = t_start
            sentence['recog_end_time'] = t_end
            sentence['recog_time'] = t_end - t_start
            sentence['time_zone'] = str(timezone.utc)
            sentence['am_version'] = model_path.rsplit('/', 1)[-1]
            sentence['lm_version'] = model_path.rsplit('/', 1)[-1]
            sentence['hostname'] = 'stt_test'
            with open(f'{stt_result_path}/{log_name}.json', 'w') as f:
                f.write(json.dumps(sentence))
            # print(sentence)
            print(f'[Recognition finished] {t_end}: {sentence["text"]}, took {t_end - t_start} seconds')

            # Delete the metadata file
            os.remove(i_metadata_path)
            # Move the recognized wav file out of the inbox
            os.rename(i, os.path.join(wav_path_after_stt, i.rsplit("/", 1)[-1]))
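# For reference, recognize_pooling_srv expects an offline_stt_api_config.ini next to
# the script with [stt], [upload] and [log] sections. A sketch of that file follows;
# only the section and key names come from the lookups above, every value is a
# placeholder.
#
# [stt]
# model_path = models/example-model
# wav_path_after_stt = wav_done
# stt_res_storage_path = stt_results
#
# [upload]
# wav_storage_path = wav_inbox
# metadata_path = metadata
#
# [log]
# log_storage_path = logs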
def load(self):
    datasetAlreadyExists = self.__create_folders()

    # clone file sources if the dataset doesn't already exist
    if not datasetAlreadyExists:
        self.__cloneFilesSources()

    if not os.path.exists(FileManager.getDatasetCopyFileUrl()):
        # load dataset in memory
        self.__loadInMemory()
        # generate 'filtered' version
        self.__filterSources()
        # save dataset copy
        datasetCopy: dict = {'training': self.Dataset.training, 'testing': self.Dataset.testing}
        FileManager.writeFile(FileManager.getDatasetCopyFileUrl(), json.dumps(datasetCopy))
    else:
        datasetCopy = json.loads(FileManager.readFile(FileManager.getDatasetCopyFileUrl()))
        self.Dataset.training = datasetCopy['training']
        self.Dataset.testing = datasetCopy['testing']

    return self
def PUT(self, request):
    """ Create path if needed and write data to it """
    logging.debug("storage.put")
    # request.showVariables()
    response = {"headers": [], "status": "201 OK", "body": ""}

    if not request.file_name:
        response['status'] = http_code['304']
        return response

    replication = request.metadata.get("HTTP_REPLICATION", None)
    if replication == "1":
        logging.debug("REPLICATION")
        fm = FileManager(self.data_path + "/" + request.file_name,
                         request.file_name)
        if not fm.write_file(request.body):
            response['status'] = http_code['500']
            response['body'] = "Could not replicate"
            logging.debug("Could not replicate")
        return response

    client_id = request.metadata.get("HTTP_CLIENT_ID", "test:test")
    logging.debug("HTTP_CLIENT_ID: %s" % client_id)
    replicas_count = request.metadata.get("HTTP_REPLICAS_COUNT", 3)

    fm = FileManager(self.data_path + "/" + request.file_name,
                     request.file_name)
    if not fm.write_file(request.body):
        response['status'] = http_code['500']

    logging.debug("Starting to save data to disk")
    storage_replicas = make_tuple(request.metadata["HTTP_STORAGE_LIST"])
    for replica in storage_replicas:
        if replica[0] == self.port_ip[0] and replica[1] == self.port_ip[1]:
            continue
        headers = {}
        headers['file_name'] = request.file_name
        headers['data_length'] = request.content_length
        headers['etag'] = request.file_name
        headers['replicas_count'] = replicas_count
        headers['client_id'] = "%s:%s" % (request.remote_addr, request.remote_port)
        headers["replication"] = "1"
        response = self._put_request(request.file_name, replica, data=request.body, headers=headers)
        logging.debug("response: %s" % response)
        if "201" not in response["status"]:
            response['status'] = http_code['500']
            response['body'] = "Could not reach the proper number of replicas"
            return response

    logging.debug("Starting to send metadata confirmation to the proxy")
    proxy_list = make_tuple(request.metadata["HTTP_PROXY_LIST"])
    logging.debug(proxy_list)
    for proxy in proxy_list:
        logging.debug(proxy)
        headers = {}
        headers['data_length'] = request.content_length
        headers['etag'] = request.file_name
        headers['replicas_count'] = 3
        headers['client_id'] = client_id
        response = self._post_request(request.file_name, proxy, headers=headers)
        logging.debug("response: %s" % response)
        if "201" in response["status"]:
            break

    return response
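# For clarity, HTTP_STORAGE_LIST (and HTTP_PROXY_LIST) arrive as string-encoded
# tuples that make_tuple turns back into pairs indexed as replica[0]/replica[1]
# above. A sketch with placeholder addresses and ports; the pair ordering is an
# assumption based on how the loop compares against self.port_ip.
example_storage_list = "(('10.0.0.11', 8081), ('10.0.0.12', 8081), ('10.0.0.13', 8081))"
replicas = make_tuple(example_storage_list)  # expected to yield (('10.0.0.11', 8081), ...)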
def importKerasTrainedModel(self):
    self.model = load_model(FileManager.getTrainedModelFileUrl(self.type))
    return self
def exportVocabulary(self, indexes):
    FileManager.writeFile(FileManager.getVocabularyFileUrl(self.type), json.dumps(indexes))
    return self
def importVocabulary(self):
    return json.loads(FileManager.readFile(FileManager.getVocabularyFileUrl(self.type)))
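# A minimal usage sketch of the model/vocabulary helpers above, not part of the
# original code: the owning class is not shown, so FeaturesManager and the
# 'wordsindexes' type value below are hypothetical stand-ins; only the method
# names and the fact that they key off self.type come from the code.
manager = FeaturesManager(type='wordsindexes')      # hypothetical class and constructor
manager.exportVocabulary({'def': 1, 'return': 2})   # writes the vocabulary JSON for this type
indexes = manager.importVocabulary()                # reads the same JSON back
assert indexes['def'] == 1
manager.importKerasTrainedModel()                   # loads the trained Keras model for this type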
import json
import os
import sys

from tencent_cos import TencentCOSBucket
from aliyun_oss import AliyunOSSBucket
from utils import FileManager, OSSSynchronizer


if __name__ == '__main__':
    with open('config/keybrl-mines.json', 'rt', encoding='utf-8') as fp:
        cos = TencentCOSBucket(json.load(fp))
    local_file = FileManager('../dist')
    OSSSynchronizer(local_file, cos).sync_from_local_to_oss()
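# A sketch of the same entry point pointed at Aliyun OSS in the opposite direction,
# assuming AliyunOSSBucket also takes a parsed JSON config dict the way
# TencentCOSBucket does above; the config path below is a placeholder.
with open('config/example-aliyun.json', 'rt', encoding='utf-8') as fp:  # placeholder path
    oss = AliyunOSSBucket(json.load(fp))
OSSSynchronizer(FileManager('../dist'), oss).sync_from_oss_to_local()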
def main() -> None:
    """Main entry point."""
    # Parse command-line arguments
    args = parser_args()

    # Enable or disable debug mode
    if args.debug:
        logger.addHandler(debug_console_handler)
        logger.setLevel(logging.DEBUG)
        logger.debug('DEBUG mode enabled')
    else:
        logger.addHandler(normal_console_handler)
        logger.setLevel(logging.INFO)
        logger.debug('DEBUG mode disabled')

    main_config_path = args.config or default_main_config_path
    config_encoding = args.config_encoding or default_config_encoding

    # Load the main configuration file
    config = load_configs(config_path=main_config_path, validator=main_config_validator, encoding=config_encoding)
    if config is None:
        logger.error(f'Failed to load main configuration file "{main_config_path}".')
        exit(1)

    for config_item in config:
        oss_type = config_item['oss_type']
        oss_config_path = config_item['oss_config']
        local_dir = config_item['local_dir']
        direction = config_item['direction']

        # Load the OSS configuration file
        oss_config = load_configs(config_path=oss_config_path, validator=None, encoding=config_encoding)
        if oss_config is None:
            logger.error(f'Failed to load OSS configuration file "{oss_config_path}".')
            exit(1)

        if oss_type == 'tencent-cos':
            bucket = QcloudCosBucket(oss_config)
        else:
            bucket = AliyunOssBucket(oss_config)

        file_manager = FileManager(local_dir)
        oss_synchronizer = OSSSynchronizer(file_manager, bucket)

        if direction == 'local-to-remote':
            logger.info(
                f'Starting sync: {local_dir} (local) -> {oss_config.get("bucket", "Unknown Bucket")} (OSS)'
            )
            oss_synchronizer.sync_from_local_to_oss()
        else:
            logger.info(
                f'Starting sync: {oss_config.get("bucket", "Unknown Bucket")} (OSS) -> {local_dir} (local)'
            )
            oss_synchronizer.sync_from_oss_to_local()
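# For reference, main() iterates over a list of sync jobs loaded by load_configs.
# A sketch of one loaded entry; the keys and the literal values 'tencent-cos' and
# 'local-to-remote' come from the branches above, while the paths are placeholders
# and the on-disk file format is an assumption.
example_config = [
    {
        'oss_type': 'tencent-cos',             # anything else selects AliyunOssBucket
        'oss_config': 'config/tencent.json',   # placeholder path to the per-bucket config
        'local_dir': '../dist',                # local directory to synchronize
        'direction': 'local-to-remote',        # anything else syncs remote -> local
    },
]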
def __cloneFilesSources(self):
    SOURCE_URL = FileManager.datasets['source']['url']
    TRAINING_URL = FileManager.datasets['training']['url']
    TESTING_URL = FileManager.datasets['testing']['url']

    # for each directory in the '/Lang' folder ...
    languagesExamplesCounter = {}
    for languageFolder in [f for f in os.scandir(SOURCE_URL) if f.is_dir()]:
        language = str(languageFolder.name).lower()
        languagesExamplesCounter[language] = 0

        # parse only selected languages
        if language in ConfigurationManager.getLanguages():

            # prepare an empty {languageFolder.name} folder for each dataset
            if not os.path.isdir(os.path.join(TRAINING_URL, language)):
                os.mkdir(os.path.join(TRAINING_URL, language))
            if not os.path.isdir(os.path.join(TESTING_URL, language)):
                os.mkdir(os.path.join(TESTING_URL, language))

            # count examples for each language
            for exampleFolder in FileManager.getExamplesFolders(languageFolder.path):
                for _ in FileManager.getExampleFiles(exampleFolder.path):
                    languagesExamplesCounter[language] += 1

            # for this language, the total number of examples could be less than
            # {TRAINING_EXAMPLES_NUMBER}: in that case print a warning and skip it
            if languagesExamplesCounter[language] < TRAINING_EXAMPLES_NUMBER:
                print(' > [dataset] the total number of examples for the ' + language +
                      ' is less than ' + str(TRAINING_EXAMPLES_NUMBER))
                continue

            # pick the indexes of the examples that go into the training set
            indexesOfTrainingExamples = random.sample(
                range(1, languagesExamplesCounter[language]),
                TRAINING_EXAMPLES_NUMBER
            )

            # list all examples in the {languageFolder.name} folder
            exampleIndex = 0
            for exampleFolder in FileManager.getExamplesFolders(languageFolder.path):
                # list all example versions in the {exampleFolder.name} folder
                for exampleVersionFile in FileManager.getExampleFiles(exampleFolder.path):
                    exampleIndex += 1
                    # move the file to the right dataset
                    if exampleIndex in indexesOfTrainingExamples:
                        DATASET_TYPE = TRAINING_URL
                    else:
                        DATASET_TYPE = TESTING_URL
                    # prepare the destination folder
                    example = str(exampleVersionFile.name).lower()
                    exampleFolderUri = os.path.join(DATASET_TYPE, language, example)
                    os.mkdir(exampleFolderUri)
                    # copy the ORIGINAL source file content
                    originalFileUri = FileManager.getOriginalFileUrl(exampleFolderUri)
                    FileManager.createFile(originalFileUri)
                    shutil.copyfile(exampleVersionFile.path, originalFileUri)
                    # create the 'PARSED' version of the original file
                    parsedFileUri = FileManager.getParsedFileUrl(exampleFolderUri)
                    FileManager.createFile(parsedFileUri)
                    parser = Parser()
                    parser.initialize(originalFileUri, parsedFileUri)
                    parser.parse()

    return self
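# For orientation, a sketch of the dataset layout __cloneFilesSources produces; the
# language and example folder names come from the loops above, while the file names
# inside each example folder depend on FileManager.getOriginalFileUrl /
# getParsedFileUrl and are assumptions here.
#
# <training url>/
#   <language>/                # e.g. 'python'
#     <example version file>/  # one folder per example version, lower-cased file name
#       <original file>        # verbatim copy of the source file
#       <parsed file>          # output of Parser().parse()
# <testing url>/ follows the same structure.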
def __loadInMemory(self):
    TRAINING_URL = FileManager.datasets['training']['url']
    TESTING_URL = FileManager.datasets['testing']['url']

    # training
    for languageFolder in FileManager.getLanguagesFolders(TRAINING_URL):
        language = str(languageFolder.name).lower()
        self.Dataset.addLanguage('training', language)
        # example
        for exampleFolder in FileManager.getExamplesFolders(languageFolder.path):
            exampleDict: dict = {}
            # original file
            originalFileUri = FileManager.getOriginalFileUrl(exampleFolder.path)
            originalFileContent = FileManager.readFile(originalFileUri)
            exampleDict['original'] = originalFileContent
            # parsed file
            parsedFileUri = FileManager.getParsedFileUrl(exampleFolder.path)
            parsedFileContent = FileManager.readFile(parsedFileUri)
            exampleDict['parsed'] = parsedFileContent
            # save
            self.Dataset.addExample('training', language, exampleDict)

    # testing
    for languageFolder in FileManager.getLanguagesFolders(TESTING_URL):
        language = str(languageFolder.name).lower()
        self.Dataset.addLanguage('testing', language)
        # example
        for exampleFolder in FileManager.getExamplesFolders(languageFolder.path):
            exampleDict: dict = {}
            # original file
            originalFileUri = FileManager.getOriginalFileUrl(exampleFolder.path)
            originalFileContent = FileManager.readFile(originalFileUri)
            exampleDict['original'] = originalFileContent
            # parsed file
            parsedFileUri = FileManager.getParsedFileUrl(exampleFolder.path)
            parsedFileContent = FileManager.readFile(parsedFileUri)
            exampleDict['parsed'] = parsedFileContent
            # save
            self.Dataset.addExample('testing', language, exampleDict)

    return self
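# Based on the addLanguage/addExample calls above, the in-memory dataset presumably
# ends up shaped roughly as follows; the exact container types inside self.Dataset
# are an assumption, only the 'original'/'parsed' keys come from the code.
#
# self.Dataset.training == {
#     '<language>': [
#         {'original': '<raw source file content>', 'parsed': '<parsed file content>'},
#         ...
#     ],
# }
# self.Dataset.testing follows the same layout.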
def PUT(self, request):
    logging.debug("proxy.put")
    proxy_nodes = []
    storage_nodes = []
    self.get_nodes(proxy_nodes, storage_nodes)
    print("storage_node: %s" % storage_nodes)
    print("proxy_node: %s" % proxy_nodes)
    # request.showVariables()
    print("Am I leader? %s" % self.group.is_leader)

    response = {"headers": [], "status": "200 OK", "body": ""}

    replication = request.metadata.get("HTTP_REPLICATION", None)
    if replication == "1":
        logging.debug("REPLICATION")
        fm = FileManager(self.data_path + "/" + request.file_name,
                         request.file_name)
        if not fm.write_file(request.body):
            response['status'] = http_code['500']
            response['body'] = "Could not replicate"
            logging.debug("Could not replicate")
        return response

    if len(proxy_nodes) < 3:
        response['status'] = http_code['304']
        response['body'] = "Denying service: a server is down"
        print("Denying service: a server is down")
        logging.debug("Denying service: a server is down")
        return response

    if not self.group.is_leader:
        response['status'] = http_code['304']
        response['body'] = "No leader. Try to find the leader"
        logging.debug("I'm not the leader. Try to find the leader")
        return response

    if not request.file_name:
        response['status'] = http_code['400']
        response['body'] = "No valid file name found"
        logging.debug("No valid file name found")
        return response

    client_id = request.metadata.get("HTTP_CLIENT_ID", "test:test")
    # logging.debug("HTTP_CLIENT_ID: %s" % client_id)

    metadata = MetadataManager(self.data_path + "/" + request.file_name, request.file_name)
    file_exist = metadata.restore()
    if len(metadata.lock_list) > 0:
        response['status'] = http_code['403']
        response['body'] = "File is locked"
        logging.debug("file is locked")
        return response

    metadata.size = request.metadata["HTTP_DATA_LENGTH"]
    metadata.etag = request.metadata["HTTP_ETAG"]

    # Create the file on any nodes
    if not file_exist:
        write_balancer = WriteLoadBalancer(self.data_path, storage_nodes)
        balanced_storage_list = write_balancer.balance()
        if balanced_storage_list:
            storage_list = balanced_storage_list
        else:
            storage_list = storage_nodes
        logging.debug("storage_list: %s" % storage_list)
        replicas_count = request.metadata.get("HTTP_REPLICAS_COUNT", 3)
        replica_list = storage_list[:int(replicas_count)]
        logging.debug("replicas_list: %s" % replica_list)
        metadata.storage_list = replica_list
        metadata.replicas_count = replicas_count
    # Update the file on the same nodes
    else:
        replica_list = metadata.storage_list
        available_replicas = []
        for replica in metadata.storage_list:
            for node in storage_nodes:
                if replica[0] == node[0] and replica[1] == node[1]:
                    available_replicas.append((replica[0], replica[1]))
        if len(available_replicas) != len(metadata.storage_list):
            response['status'] = http_code['304']
            response['body'] = "Denying service: a server is down"
            print("Denying service: a server is down")
            logging.debug("Denying service: a server is down")
            return response

    # if len(replica_list) < 3:
    #     response['status'] = http_code['304']
    #     response['body'] = "Deny of service due server is down"
    #     return response
    # fl = FileLock(self.data_path + "/" + request.file_name)
    # while True:
    #     if not fl.is_locked():
    #         break
    #     else:
    #         logging.debug("file is locked. Someone is updating")
    #         time.sleep(1)
    # response['status'] = http_code['403']
    # response['body'] = "File is locked. Someone is updating."
    # return response

    # Cut off the replicas that are not needed
    metadata.add_lock_item(client_id, "LOCK_WRITE")
    metadata.save()
    # fl.lock()

    body = {}
    body["STORAGE_LIST"] = replica_list
    response['body'] = json.dumps(body)
    response['status'] = http_code['201']
    logging.debug("response: %s" % response)
    return response
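# A sketch of the successful proxy response produced above, assuming replica_list
# holds (host, port) pairs as in the storage handler; addresses are placeholders.
# The client is then expected to PUT the file to each node in STORAGE_LIST.
#
# status: 201
# body:   {"STORAGE_LIST": [["10.0.0.11", 8081], ["10.0.0.12", 8081], ["10.0.0.13", 8081]]}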