Example #1
    def __create_folders(self):
        datasetAlreadyExists = True

        # initialize dataset folders

        if not os.path.isdir(FileManager.datasets['training']['url']):
            datasetAlreadyExists = False
            os.mkdir(FileManager.datasets['training']['url'])

        if not os.path.isdir(FileManager.datasets['testing']['url']):
            datasetAlreadyExists = False
            os.mkdir(FileManager.datasets['testing']['url'])

        # initialize output folders

        if not os.path.isdir(FileManager.getFeaturesFolderUrl()):
            os.mkdir(FileManager.getFeaturesFolderUrl())

        if not os.path.isdir(FileManager.getModelsFolderUrl()):
            os.mkdir(FileManager.getModelsFolderUrl())

        if not os.path.isdir(FileManager.getVocabularyFolderUrl()):
            os.mkdir(FileManager.getVocabularyFolderUrl())

        if not os.path.isdir(FileManager.getReportsFolderUrl()):
            os.mkdir(FileManager.getReportsFolderUrl())

        return datasetAlreadyExists
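
A minimal sketch of the FileManager.datasets mapping this method assumes; the keys mirror the lookups above, while the concrete paths are placeholders rather than the project's real layout:

# hypothetical shape of FileManager.datasets (paths are placeholders)
datasets = {
    'source':   {'url': './data/source'},
    'training': {'url': './data/training'},
    'testing':  {'url': './data/testing'},
}
print(datasets['training']['url'])  # the lookup __create_folders performs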
Example #2
	def updateConfigFile(self, args):
		if self._argsAreValidForUpdate(args):
			try:
				jfw = JsonFileWrapper(self._getConfigFilePath(args.name))
				actualChapter = jfw.getKey(JsonFileWrapper.CHAPTER)
				if actualChapter is not None:
					self._logger.info("class Main : update config for {}, set chapter from {} to {}".format(args.name, actualChapter, args.chapter))
					jfw.update(JsonFileWrapper.CHAPTER, str(args.chapter))
					jfw.save()
					self._logger.info("class Main :cleaning donwload directory")
					FileManager.getFileManager().cleanMangaDirectory(jfw.getKey(JsonFileWrapper.NAME), jfw.getKey(JsonFileWrapper.CHAPTER))
				else:
					self._logger.error("class Main : can not set chapter {} for manga {} : actual chapter is {}"\
					.format(args.chapter, args.name, actualChapter))
			except IOError as e:
				self._logger.error("class Main : ioerror {}".format(e))
Example #3
def recognize_pooling_srv():
    # load configuration
    base_dir = os.path.dirname(os.path.abspath(__file__))
    cfg = ConfigObj(os.path.join(base_dir, 'offline_stt_api_config.ini'))
    model_path = os.path.join(base_dir, cfg['stt']['model_path'])
    wav_storage_path = os.path.join(base_dir, cfg['upload']['wav_storage_path'])
    wav_path_after_stt = os.path.join(base_dir, cfg['stt']['wav_path_after_stt'])
    metadata_path = os.path.join(base_dir, cfg['upload']['metadata_path'])
    op_log_path = os.path.join(base_dir, cfg['log']['log_storage_path'])
    stt_result_path = os.path.join(base_dir, cfg['stt']['stt_res_storage_path'])
    # initialize the recognizer
    rec = Recognizer(Model(model_path))
    # main loop
    while True:
        # scan for wav files, recognize each one, then move it when done
        wav_list = fm.list_all_file_type_in_dir(r'\.wav$', wav_storage_path)
        if not wav_list:
            time.sleep(2)
            continue
        wav_list.sort(key=os.path.getctime)
        for i in wav_list:
            log_name = os.path.basename(i).rsplit('.', 1)[0]
            i_metadata_path = f'{metadata_path}/{log_name}.json'
            if not os.path.isfile(i_metadata_path):
                print(f'[warning] metadata file missing for wav {i}; skipping recognition of this file')
                continue
            t_start = UtcTime.get_current_utc_time()
            print(f'[recognition started] {t_start} recognizing {i}')
            sentence = rec.recognize_wav_from_path(i)
            t_end = UtcTime.get_current_utc_time()
            with open(i_metadata_path, 'r') as f:
                metadata = json.load(f)
            # build the result: merge in the metadata first ...
            sentence.update(metadata)
            # ... then add the recognition fields
            sentence['key'] = log_name
            sentence['recog_start_time'] = t_start
            sentence['recog_end_time'] = t_end
            sentence['recog_time'] = t_end - t_start
            sentence['time_zone'] = str(timezone.utc)
            sentence['am_version'] = model_path.rsplit('/', 1)[-1]
            sentence['lm_version'] = model_path.rsplit('/', 1)[-1]
            sentence['hostname'] = 'stt_test'
            with open(f'{stt_result_path}/{log_name}.json', 'w') as f:
                f.write(json.dumps(sentence))
            # print(sentence)
            print(f'[recognition finished] {t_end}: {sentence["text"]}, took {t_end - t_start} seconds')
            # delete the metadata file
            os.remove(i_metadata_path)
            # move the recognized wav out of the inbox
            os.rename(i, os.path.join(wav_path_after_stt, os.path.basename(i)))
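
The fm helper is project-specific; one plausible implementation of list_all_file_type_in_dir using only the standard library (an assumption, the real module may differ):

import os
import re

def list_all_file_type_in_dir(pattern: str, directory: str) -> list:
    """Return the full paths of directory entries whose names match pattern."""
    regex = re.compile(pattern)
    return [os.path.join(directory, name)
            for name in os.listdir(directory)
            if regex.search(name)]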
Example #4
    def load(self):
        datasetAlreadyExists = self.__create_folders()

        # clone file sources if the dataset doesn't already exist
        if not datasetAlreadyExists:
            self.__cloneFilesSources()

        if not os.path.exists(FileManager.getDatasetCopyFileUrl()):
            # load dataset in memory
            self.__loadInMemory()
            # generate 'filtered' version
            self.__filterSources()
            # save dataset copy
            datasetCopy: dict = {'training': self.Dataset.training, 'testing': self.Dataset.testing}
            FileManager.writeFile(FileManager.getDatasetCopyFileUrl(), json.dumps(datasetCopy))
        else:
            datasetCopy = json.loads(FileManager.readFile(FileManager.getDatasetCopyFileUrl()))
            self.Dataset.training = datasetCopy['training']
            self.Dataset.testing = datasetCopy['testing']

        return self
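
A minimal round-trip of the dataset copy that load() persists, assuming Dataset.training and Dataset.testing are plain JSON-serializable dicts:

import json

datasetCopy = {'training': {'python': []}, 'testing': {'python': []}}
restored = json.loads(json.dumps(datasetCopy))
assert restored == datasetCopy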
Example #5
    def PUT(self, request):
        """
            Create path if needed and write data to it
        """
        logging.debug("storage.put")
        #request.showVariables()
        response = {"headers": [], "status": "201 OK", "body": ""}

        if not request.file_name:
            response['status'] = http_code['304']
            return response

        replication = request.metadata.get("HTTP_REPLICATION", None)
        if replication == "1":
            logging.debug("REPLICATION")
            fm = FileManager(self.data_path + "/" + request.file_name,
                             request.file_name)
            if not fm.write_file(request.body):
                response['status'] = http_code['500']
                response['body'] = "Could not replicate"
                logging.debug("Could not replicate")
            return response

        client_id = request.metadata.get("HTTP_CLIENT_ID", "test:test")
        logging.debug("HTTP_CLIENT_ID: %s" % client_id)

        replicas_count = request.metadata.get("HTTP_REPLICAS_COUNT", 3)
        fm = FileManager(self.data_path + "/" + request.file_name,
                         request.file_name)
        if not fm.write_file(request.body):
            response['status'] = http_code['500']

        logging.debug("Starting to save data in disk")
        storage_replicas = make_tuple(request.metadata["HTTP_STORAGE_LIST"])
        for replica in storage_replicas:
            if (replica[0] == self.port_ip[0] and
                    replica[1] == self.port_ip[1]):
                continue

            headers = {}
            headers['file_name'] = request.file_name
            headers['data_length'] = request.content_length
            headers['etag'] = request.file_name
            headers['replicas_count'] = replicas_count
            headers['client_id'] = "%s:%s" % (request.remote_addr,
                                              request.remote_port)

            headers["replication"] = "1"
            response = self._put_request(request.file_name,
                                         replica,
                                         data=request.body,
                                         headers=headers)
            logging.debug("response: %s" % response)
            if not "201" in response["status"]:
                response['status'] = http_code['500']
                response['body'] = "Could reach proper number of replicas"
                return response


        logging.debug("Starting to send metadata confirmation for proxy")
        proxy_list = make_tuple(request.metadata["HTTP_PROXY_LIST"])
        logging.debug(proxy_list)
        for proxy in proxy_list:
            logging.debug(proxy)
            headers = {}
            headers['data_length'] = request.content_length
            headers['etag'] = request.file_name
            headers['replicas_count'] = 3
            headers['client_id'] = client_id
            response = self._post_request(request.file_name, proxy,
                                          headers=headers)
            logging.debug("response: %s" % response)
            if "201" in response["status"]:
                break

        return response
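
Illustrative request metadata for the replication branch, matching the WSGI-style HTTP_* keys this handler reads (all values are placeholders):

request_metadata = {
    'HTTP_REPLICATION': '1',           # takes the local-write-only branch
    'HTTP_CLIENT_ID': '10.0.0.9:4000',
    'HTTP_REPLICAS_COUNT': 3,
    'HTTP_STORAGE_LIST': "(('10.0.0.1', 8001), ('10.0.0.2', 8001))",
}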
Example #6
    def importKerasTrainedModel(self):
        self.model = load_model(FileManager.getTrainedModelFileUrl(self.type))
        return self
Example #7
    def exportVocabulary(self, indexes):
        FileManager.writeFile(FileManager.getVocabularyFileUrl(self.type), json.dumps(indexes))
        return self
Example #8
    def importVocabulary(self):
        return json.loads(FileManager.readFile(FileManager.getVocabularyFileUrl(self.type)))
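
What exportVocabulary and importVocabulary persist, shown with json and a temporary file standing in for FileManager's vocabulary path (a sketch, not the project's API):

import json
import os
import tempfile

indexes = {'def': 0, 'class': 1, 'import': 2}
path = os.path.join(tempfile.gettempdir(), 'vocabulary-example.json')
with open(path, 'w') as f:
    f.write(json.dumps(indexes))            # what exportVocabulary writes
with open(path) as f:
    assert json.loads(f.read()) == indexes  # what importVocabulary returns
os.remove(path)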
Example #9
import json
import os
import sys

from tencent_cos import TencentCOSBucket
from aliyun_oss import AliyunOSSBucket
from utils import FileManager, OSSSynchronizer

if __name__ == '__main__':

    with open('config/keybrl-mines.json', 'rt', encoding='utf-8') as fp:
        cos = TencentCOSBucket(json.load(fp))

    local_file = FileManager('../dist')

    OSSSynchronizer(local_file, cos).sync_from_local_to_oss()
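
The reverse direction is a one-liner as well; Example #10 below shows the same synchronizer exposing sync_from_oss_to_local():

OSSSynchronizer(local_file, cos).sync_from_oss_to_local()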
Example #10
def main() -> None:
    """Main entry point."""

    # parse command-line arguments
    args = parser_args()

    # enable debug mode
    if args.debug:
        logger.addHandler(debug_console_handler)
        logger.setLevel(logging.DEBUG)
        logger.debug('DEBUG mode enabled')
    else:
        logger.addHandler(normal_console_handler)
        logger.setLevel(logging.INFO)
        logger.debug('DEBUG mode disabled')

    main_config_path = args.config or default_main_config_path
    config_encoding = args.config_encoding or default_config_encoding

    # load the main configuration file
    config = load_configs(config_path=main_config_path,
                          validator=main_config_validator,
                          encoding=config_encoding)

    if config is None:
        logger.error(f'Failed to load the main config file "{main_config_path}".')
        exit(1)

    for config_item in config:
        oss_type = config_item['oss_type']
        oss_config_path = config_item['oss_config']
        local_dir = config_item['local_dir']
        direction = config_item['direction']

        # load the OSS configuration file
        oss_config = load_configs(config_path=oss_config_path,
                                  validator=None,
                                  encoding=config_encoding)
        if oss_config is None:
            logger.error(f'Failed to load the OSS config file "{oss_config_path}".')
            exit(1)

        if oss_type == 'tencent-cos':
            bucket = QcloudCosBucket(oss_config)
        else:
            bucket = AliyunOssBucket(oss_config)

        file_manager = FileManager(local_dir)
        oss_synchronizer = OSSSynchronizer(file_manager, bucket)

        if direction == 'local-to-remote':
            logger.info(
                f'Starting sync: {local_dir} (local) -> {oss_config.get("bucket", "Unknown Bucket")} (OSS)'
            )
            oss_synchronizer.sync_from_local_to_oss()
        else:
            logger.info(
                f'Starting sync: {oss_config.get("bucket", "Unknown Bucket")} (OSS) -> {local_dir} (local)'
            )
            oss_synchronizer.sync_from_oss_to_local()
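
The shape of one main-config entry, reconstructed from the fields main() reads; the values are illustrative:

config = [
    {
        'oss_type': 'tencent-cos',        # anything else selects AliyunOssBucket
        'oss_config': 'config/cos.json',  # path to the OSS credential file
        'local_dir': '../dist',
        'direction': 'local-to-remote',   # anything else syncs OSS -> local
    },
]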
Example #11
    def __cloneFilesSources(self):
        SOURCE_URL = FileManager.datasets['source']['url']
        TRAINING_URL = FileManager.datasets['training']['url']
        TESTING_URL = FileManager.datasets['testing']['url']

        # for each directory in the '/Lang' source folder ...
        languagesExamplesCounter = {}
        for languageFolder in [f for f in os.scandir(SOURCE_URL) if f.is_dir()]:
            language = str(languageFolder.name).lower()
            languagesExamplesCounter[language] = 0
            # parse only selected languages
            if language in ConfigurationManager.getLanguages():
                # prepare an empty {language} folder in each dataset
                if not (os.path.isdir(os.path.join(TRAINING_URL, language))):
                    os.mkdir(os.path.join(TRAINING_URL, language))
                if not (os.path.isdir(os.path.join(TESTING_URL, language))):
                    os.mkdir(os.path.join(TESTING_URL, language))

                # count the examples for this language
                for exampleFolder in FileManager.getExamplesFolders(languageFolder.path):
                    for _ in FileManager.getExampleFiles(exampleFolder.path):
                        languagesExamplesCounter[language] += 1

                # skip languages with fewer than {TRAINING_EXAMPLES_NUMBER} examples
                if languagesExamplesCounter[language] < TRAINING_EXAMPLES_NUMBER:
                    print(' >  [dataset] the total number of examples for '
                          + language + ' is less than ' + str(TRAINING_EXAMPLES_NUMBER))
                    continue

                # sample which example indexes (1..count, inclusive) go to the training set
                indexesOfTrainingExamples = random.sample(
                    range(1, languagesExamplesCounter[language] + 1),
                    TRAINING_EXAMPLES_NUMBER
                )

                # list all examples in {languageFolder.name} folder
                exampleIndex = 0
                for exampleFolder in FileManager.getExamplesFolders(languageFolder.path):
                    # list all examples versions in {exampleFolder.name} folder
                    for exampleVersionFile in FileManager.getExampleFiles(exampleFolder.path):
                        exampleIndex += 1
                        # move file to right dataset
                        if exampleIndex in indexesOfTrainingExamples:
                            DATASET_TYPE = TRAINING_URL
                        else:
                            DATASET_TYPE = TESTING_URL

                        # prepare destination folder
                        example = str(exampleVersionFile.name).lower()
                        exampleFolderUri = os.path.join(DATASET_TYPE, language, example)
                        os.mkdir(exampleFolderUri)
                        # copy the ORIGINAL source file content
                        originalFileUri = FileManager.getOriginalFileUrl(exampleFolderUri)
                        FileManager.createFile(originalFileUri)
                        shutil.copyfile(exampleVersionFile.path, originalFileUri)
                        # create the 'PARSED' version of the original file
                        parsedFileUri = FileManager.getParsedFileUrl(exampleFolderUri)
                        FileManager.createFile(parsedFileUri)
                        parser = Parser()
                        parser.initialize(originalFileUri, parsedFileUri)
                        parser.parse()

        return self
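
random.sample over the inclusive 1..N index range used above; with N = 5 examples and TRAINING_EXAMPLES_NUMBER = 3:

import random

print(random.sample(range(1, 5 + 1), 3))  # e.g. [4, 1, 3]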
Example #12
    def __loadInMemory(self):
        DATASET_URLS = {
            'training': FileManager.datasets['training']['url'],
            'testing': FileManager.datasets['testing']['url'],
        }

        # the training and testing sets are loaded the same way
        for datasetType, datasetUrl in DATASET_URLS.items():
            for languageFolder in FileManager.getLanguagesFolders(datasetUrl):
                language = str(languageFolder.name).lower()
                self.Dataset.addLanguage(datasetType, language)
                # examples
                for exampleFolder in FileManager.getExamplesFolders(languageFolder.path):
                    exampleDict: dict = {}
                    # original file
                    originalFileUri = FileManager.getOriginalFileUrl(exampleFolder.path)
                    exampleDict['original'] = FileManager.readFile(originalFileUri)
                    # parsed file
                    parsedFileUri = FileManager.getParsedFileUrl(exampleFolder.path)
                    exampleDict['parsed'] = FileManager.readFile(parsedFileUri)
                    # save
                    self.Dataset.addExample(datasetType, language, exampleDict)

        return self
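
A minimal stand-in for the Dataset container these loaders populate (an assumption; the real class lives elsewhere in the project):

class Dataset:
    def __init__(self):
        self.training = {}
        self.testing = {}

    def addLanguage(self, datasetType, language):
        # register a language under 'training' or 'testing'
        getattr(self, datasetType)[language] = []

    def addExample(self, datasetType, language, exampleDict):
        # append one {'original': ..., 'parsed': ...} example
        getattr(self, datasetType)[language].append(exampleDict)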
Example #13
    def PUT(self, request):
        logging.debug("proxy.put")

        proxy_nodes = []
        storage_nodes = []
        self.get_nodes(proxy_nodes, storage_nodes)
        print("storage_node: %s" % storage_nodes)
        print("proxy_node: %s" % proxy_nodes)

        #request.showVariables()
        print("Am I leader? %s" % self.group.is_leader)
        response = {"headers": [], "status": "200 OK", "body": ""}

        replication = request.metadata.get("HTTP_REPLICATION", None)
        if replication == "1":
            logging.debug("REPLICATION")
            fm = FileManager(self.data_path + "/" + request.file_name,
                             request.file_name)
            if not fm.write_file(request.body):
                response['status'] = http_code['500']
                response['body'] = "Could not replicate"
                logging.debug("Could not replicate")
            return response

        if len(proxy_nodes) < 3:
            response['status'] = http_code['304']
            response['body'] = "Denying service: a server is down"
            print("Denying service: a server is down")
            logging.debug("Denying service: a server is down")
            return response

        if not self.group.is_leader:
            response['status'] = http_code['304']
            response['body'] = "Not the leader. Try to find the leader"
            logging.debug("I'm not the leader. Try to find the leader")
            return response

        if not request.file_name:
            response['status'] = http_code['400']
            response['body'] = "No valid file name found"
            logging.debug("No valid file name found")
            return response

        client_id = request.metadata.get("HTTP_CLIENT_ID", "test:test")
        #logging.debug("HTTP_CLIENT_ID: %s" % client_id)


        metadata = MetadataManager(self.data_path + "/" + request.file_name,
                                   request.file_name)

        file_exist = metadata.restore()
        if len(metadata.lock_list) > 0:
            response['status'] = http_code['403']
            response['body'] = "File is locked"
            logging.debug("file is locked")
            return response

        metadata.size = request.metadata["HTTP_DATA_LENGTH"]
        metadata.etag = request.metadata["HTTP_ETAG"]

        # Create File in any nodes
        if not file_exist:
            write_balancer = WriteLoadBalancer(self.data_path, storage_nodes)
            balanced_storage_list = write_balancer.balance()
            if balanced_storage_list:
                storage_list = balanced_storage_list
            else:
                storage_list = storage_nodes

            logging.debug("storage_list: %s" % storage_list)
            replicas_count = request.metadata.get("HTTP_REPLICAS_COUNT", 3)
            replica_list = storage_list[:int(replicas_count)]
            logging.debug("replicas_list: %s" % replica_list)
            metadata.storage_list = replica_list
            metadata.replicas_count = replicas_count

        # Update File in the same nodes
        else:
            replica_list = metadata.storage_list

        available_replicas = []
        for replica in metadata.storage_list:
            for node in storage_nodes:
                if replica[0] == node[0] and replica[1] == node[1]:
                    available_replicas.append((replica[0], replica[1]))

        if len(available_replicas) != len(metadata.storage_list):
            response['status'] = http_code['304']
            response['body'] = "Denying service: a replica server is down"
            print("Denying service: a replica server is down")
            logging.debug("Denying service: a replica server is down")
            return response

        #if  len(replica_list) < 3:
            #response['status'] = http_code['304']
            #response['body'] = "Deny of service due server is down"
            #return response

        #fl = FileLock(self.data_path + "/" + request.file_name)
        #while True:
            #if not fl.is_locked():
                #break
            #else:
                #logging.debug("file is locked. Someone is updating")
                #time.sleep(1)
                #response['status'] = http_code['403']
                #response['body'] = "File is locked. Someone is updating."
                #return response


        # lock the file for this client before replying
        metadata.add_lock_item(client_id, "LOCK_WRITE")
        metadata.save()
        #fl.lock()

        body = {}
        body["STORAGE_LIST"] = replica_list
        response['body'] = json.dumps(body)

        response['status'] = http_code['201']
        logging.debug("response: %s" % response)
        return response
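
An illustrative happy-path response from this handler: the body tells the client which storage replicas to write to (the exact http_code strings are assumed):

response = {
    'headers': [],
    'status': '201 Created',
    'body': '{"STORAGE_LIST": [["10.0.0.1", 8001], ["10.0.0.2", 8001]]}',
}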