def __validate_input(self):
    """Validate ``self.df``, ``self.type`` and ``self.columns``.

    On success the dataframe is exposed unchanged as ``self.new_df``.

    Raises:
        ValueError: If ``self.df`` or ``self.type`` is ``None``.
        TypeError: If ``self.df`` is not a pandas ``DataFrame``, if
            ``self.type`` is not a string, or if an entry of
            ``self.columns`` is not a string.
        ArgumentsError: If ``self.type`` is not one of the supported scaler
            names, or if a requested column is not in the dataframe.
    """
    if self.df is None:
        raise ValueError("Feature dataframe should not be of None type")
    # isinstance (rather than an exact type comparison) also accepts
    # DataFrame subclasses, which behave like dataframes for our purposes.
    if not isinstance(self.df, pd.DataFrame):
        raise TypeError(
            "Feature dataframe is not a valid dataframe.\nExpected object type: pandas.core.frame.DataFrame"
        )

    if self.type is None:
        raise ValueError("Feature type should not be of None type")
    if not isinstance(self.type, str):
        raise TypeError('Expected string value for argument "type" ')
    if self.type not in ("MinMaxScaler", "BinaryScaler", "StandardScaler"):
        raise ArgumentsError(
            f'Allowed argument for type is "MinMaxScaler" or "BinaryScaler" or "StandardScaler", got {self.type}'
        )

    if self.columns is not None:
        column_list = list(self.df.keys())
        for column in self.columns:
            if not isinstance(column, str):
                raise TypeError(f"Expected str type column, got {type(column)}")
            if column not in column_list:
                raise ArgumentsError(f"Column {column} does not exist in dataframe")

    self.new_df = self.df
def isempty(content):
    """Tell whether ``content`` counts as an "empty" value.

    Integers are empty when they equal ``0``; strings are empty when,
    compared case-insensitively, they are ``""``, ``"null"`` or ``"[]"``.

    Args:
        content: An integer or a string.

    Returns:
        ``True`` if ``content`` is considered empty, ``False`` otherwise.

    Raises:
        ArgumentsError: If ``content`` is neither an integer nor a string.
    """
    try:
        string_types = basestring  # noqa: F821 - Python 2: str and unicode
    except NameError:
        string_types = str  # Python 3 has no basestring

    if isinstance(content, int):
        return content == 0
    if isinstance(content, string_types):
        return content.lower() in ("", "null", "[]")
    raise ArgumentsError()
def speech_recognition(self, content, _attrs=None):
    """Run the configured speech-recognition command for ``dest_dir``.

    ``content`` is parsed as JSON when it is not ``None``; a non-list JSON
    value is wrapped in a list so the response carries a list.

    Args:
        content: JSON text, or ``None``.
        _attrs: Attribute mapping; must contain a non-empty ``dest_dir``,
            which is substituted into ``Config.SPEECH_RECOGNITION_CMD``.

    Returns:
        The result of ``ProcessorResponse(...)._print()`` built from the
        parsed content and a ``command`` attribute holding the command.

    Raises:
        ArgumentsError: If ``dest_dir`` is missing from ``_attrs`` or is
            empty according to ``Utils.isempty``.
    """
    if content is not None:
        content = json.loads(content)
        if not isinstance(content, list):
            content = [content]

    # A missing or empty dest_dir must be reported as ArgumentsError rather
    # than surfacing as a KeyError/TypeError from the lookup itself.
    if (not _attrs or "dest_dir" not in _attrs
            or Utils.isempty(_attrs["dest_dir"])):
        raise ArgumentsError("dest_dir Is Not Set In Attributes.")

    cmd = Config.SPEECH_RECOGNITION_CMD.format(_attrs["dest_dir"])

    # NOTE(review): logging, command execution and the sleep are assumed to
    # belong to the non-test branch, like the directory setup - confirm.
    if not self.is_test_mode:
        result_root = Config.SPEECH_RECOGNITION_RESULT_DIR
        result_dirs = [
            result_root,
            os.path.join(result_root, "voiceconflict"),
            os.path.join(result_root, "voiceresult"),
            os.path.join(result_root, "voicesence"),
        ]
        for result_dir in result_dirs:
            if not os.path.isdir(result_dir):
                # 0o755 is the octal spelling accepted by Python 2.6+ and 3.
                os.makedirs(result_dir, 0o755)
        self.logger.info("Speech Recognition Command: {0}".format(cmd))
        # NOTE(security): cmd goes through a shell, so dest_dir must come
        # from a trusted source.
        os.system(cmd)
        time.sleep(self.cmd_sleep)

    return ProcessorResponse(corrects=content, attributes={
        "command": cmd
    })._print()
def create_partition(self, content, _attrs=None):
    """Derive partition attributes from the content and create the partition.

    The first record that carries both ``start_time`` and ``area_of_job``
    supplies the values for ``recorddate``, ``date``, ``filename``,
    ``mysql_put_date``, ``area_of_job`` and ``batch``. When ``recorddate``
    is set, the command built from
    ``Config.CREATE_HIVE_PARTITIONS_COMMAND_PATTERN`` is logged and, outside
    test mode, executed and followed by a sleep of ``self.cmd_sleep``.

    Args:
        content: JSON text (a record or a list of records), or ``None``.
        _attrs: Unused by this method.

    Returns:
        The result of ``ProcessorResponse(...)._print()`` built from the
        parsed content and the derived attributes.

    Raises:
        ArgumentsError: If no record has both keys, or if any derived
            attribute is still an empty string at the end.
    """
    attrs = dict.fromkeys(
        ("recorddate", "filename", "mysql_put_date", "area_of_job",
         "date", "batch", "command"),
        "")

    if content is not None:
        content = json.loads(content)
        if not isinstance(content, list):
            content = [content]

        # NOTE(review): the raise is treated as the "no record matched" case
        # of the search (for/else semantics) - confirm against the caller.
        record = next(
            (entry for entry in content
             if "start_time" in entry and "area_of_job" in entry),
            None)
        if record is None:
            raise ArgumentsError("start_time or area_of_job is Empty.")

        start_time = record["start_time"]
        year, mon, day = Utils.timestamp_to_ymd(start_time)
        attrs["recorddate"] = Utils.timestamp_to_partiton(start_time)
        compact_date = "{0}{1}{2}".format(year, mon, day)
        attrs["date"] = compact_date
        attrs["filename"] = "{0}_hive".format(compact_date)
        attrs["mysql_put_date"] = "{0}-{1}-{2}".format(year, mon, day)
        attrs["area_of_job"] = record["area_of_job"]
        attrs["batch"] = compact_date

    if attrs["recorddate"] != "":
        cmd = Config.CREATE_HIVE_PARTITIONS_COMMAND_PATTERN.format(
            recorddate=attrs["recorddate"]).strip()
        attrs["command"] = cmd
        self.logger.info("Create Partition CMD: {0}".format(cmd))
        if not self.is_test_mode:
            os.system(cmd)
            time.sleep(self.cmd_sleep)

    # Every attribute must have been filled in by the steps above.
    if any(value == "" for value in attrs.values()):
        raise ArgumentsError("create_partition attrs value is Empty.")

    return ProcessorResponse(corrects=content, attributes=attrs)._print()
def wav2png(self, content, _attrs=None):
    """Generate waveform images for the audio files listed in ``content``.

    For non-empty content, the output directory is derived from
    ``dest_dir`` (the part after the FTP download root, re-rooted under the
    waveform root) and ``Utils.wav2png`` is invoked. Each record with a
    ``download_path`` then gets an ``id`` and, outside test mode, either a
    ``waveform`` value (prefix + base64 of the PNG contents) or a
    ``wav2png_errors`` message.

    Args:
        content: JSON text (a record or a list of records), or ``None``.
        _attrs: Attribute mapping; must contain a non-empty ``dest_dir`` and
            may override ``ftp_download_root_dir`` and ``wavform_root``.

    Returns:
        The result of ``ProcessorResponse(...)._print()`` with the records
        and a ``command`` attribute (``None`` when nothing was converted).

    Raises:
        ArgumentsError: If content is non-empty and ``dest_dir`` is missing
            from ``_attrs`` or empty according to ``Utils.isempty``.
    """
    cmd = None
    if content is not None:
        content = json.loads(content)
        if not isinstance(content, list):
            content = [content]

        if not Utils.jsonobj_isempty(content):
            # A missing or empty dest_dir must be reported as ArgumentsError
            # rather than surfacing as a KeyError/TypeError from the lookup.
            if (not _attrs or "dest_dir" not in _attrs
                    or Utils.isempty(_attrs["dest_dir"])):
                raise ArgumentsError("dest_dir Is Not Set In Attributes.")

            ftp_dw_root = Utils.use_if_set_else_default(
                "ftp_download_root_dir", _attrs, Config.FTP_DOWNLOAD_ROOT_DIR)
            wavform_root = Utils.use_if_set_else_default(
                "wavform_root", _attrs, Config.WAVFORM_ROOT_DIR)

            input_dir = _attrs["dest_dir"]
            # Keep the path below the download root and re-root it under
            # the waveform directory.
            relative_parts = input_dir.split(ftp_dw_root)[-1].split("/")
            output_dir = os.path.join(wavform_root, *relative_parts)

            (cmd, _, _output_dir) = Utils.wav2png(input_dir, output_dir,
                                                  self.is_test_mode)
            self.logger.info("WAV Transform PNG Command: {0}".format(cmd))

            for _file in content:
                if "download_path" not in _file:
                    continue
                fname = os.path.basename(_file["download_path"])
                png_path = os.path.join(_output_dir, "{0}.png".format(fname))
                self.logger.info("Get PNG Result [{0}]".format(png_path))
                if not self.is_test_mode:
                    try:
                        _content = Utils.get_file_strcontents(png_path)
                        _file["waveform"] = "{0}{1}".format(
                            Config.WAVFORM_PREFIX, base64.b64encode(_content))
                    except Exception as e:
                        # Best effort: record the failure on the item and
                        # keep processing the remaining files.
                        _file["wav2png_errors"] = str(e)
                # NOTE(review): id is assumed to be set for every record with
                # a download_path, in test mode too - confirm.
                _file["id"] = fname

    return ProcessorResponse(corrects=content, attributes={
        "command": cmd
    })._print()
def wav2png(input_dir=None, output_dir=None, is_test_mode=False):
    """Build the WAV-to-PNG command and run it unless in test mode.

    Both directories are first passed through
    ``Utils.append_suffix_not_exists(..., "/")``. The input directory must
    already exist; the output directory is created when it is not an
    existing directory.

    Args:
        input_dir: Directory holding the audio files.
        output_dir: Directory that receives the generated images.
        is_test_mode: When true, the command is built but not executed.

    Returns:
        A ``(cmd, input_dir, output_dir)`` tuple with the normalised paths.

    Raises:
        ArgumentsError: If either directory argument is ``None``.
        FileNotFoundError: If the input directory is not an existing
            directory.
    """
    if input_dir is None or output_dir is None:
        raise ArgumentsError()

    src_dir = Utils.append_suffix_not_exists(input_dir, "/")
    dst_dir = Utils.append_suffix_not_exists(output_dir, "/")

    # isdir() is already False for paths that do not exist.
    if not os.path.isdir(src_dir):
        raise FileNotFoundError(src_dir)
    if not os.path.isdir(dst_dir):
        os.makedirs(dst_dir)

    cmd = Config.WAV_TO_PNG_COMMAND.format(
        input_dir=src_dir, output_dir=dst_dir)
    if is_test_mode:
        return (cmd, src_dir, dst_dir)

    os.system(cmd)
    return (cmd, src_dir, dst_dir)
def split_flowfiles_for_stt(self, content=None, _attrs=None):
    """Split the records into chunks, one directory per chunk.

    The speech-recognition module runs as a cluster, so the data has to be
    split up before recognition. Because recognition takes a directory as
    its entry point, the audio files of each chunk are moved into their own
    ``CHUNK-<n>`` sub-directory of ``dest_dir``.

    Args:
        content: JSON text (a record or a list of records, each with a
            ``DOCUMENTPATH``), or ``None``.
        _attrs: Attribute mapping; must contain a non-empty ``dest_dir`` and
            may set ``split_group_number`` (defaults to
            ``Config.DEFAULT_SPLIT_NUMBER``).

    Returns:
        A list with one ``ProcessorResponse(...)._print()`` result per
        chunk; empty when ``content`` is ``None``.

    Raises:
        ArgumentsError: If ``dest_dir`` is missing from ``_attrs`` or is
            empty according to ``Utils.isempty``.
    """
    # A missing or empty dest_dir must be reported as ArgumentsError rather
    # than surfacing as a KeyError/TypeError from the lookup itself.
    if (not _attrs or "dest_dir" not in _attrs
            or Utils.isempty(_attrs["dest_dir"])):
        raise ArgumentsError("dest_dir Is Not Set In Attributes.")

    result = []
    if content is None:
        return result

    content = json.loads(content.strip())
    if not isinstance(content, list):
        content = [content]

    split_group_number = int(
        Utils.use_if_set_else_default("split_group_number", _attrs,
                                      Config.DEFAULT_SPLIT_NUMBER))
    dest_dir = _attrs["dest_dir"]

    content_chunks = Utils.groups(content, split_group_number)
    for i, content_chunk in enumerate(content_chunks):
        # Each chunk gets its own attribute copy pointing at its directory.
        chunk_attrs = copy.copy(_attrs)
        chunk_dir = os.path.join(dest_dir, "CHUNK-{0}".format(i + 1))
        if not os.path.isdir(chunk_dir):
            os.makedirs(chunk_dir)
        chunk_attrs["dest_dir"] = chunk_dir

        for _file in content_chunk:
            filename = os.path.basename(_file["DOCUMENTPATH"])
            _src = os.path.join(dest_dir, filename)
            _dest = os.path.join(chunk_dir, filename)
            self.logger.debug("Copy Src:[{0}] To Dest:[{1}]".format(
                _src, _dest))
            if not self.is_test_mode:
                shutil.move(_src, _dest)

        result.append(
            ProcessorResponse(corrects=content_chunk,
                              attributes=chunk_attrs)._print())
    return result