def process(self, x):
    """Sum ``ext_data[self.param]`` per day over one item or a list of items.

    An item contributes only when it has a ``'t'`` key, an ``'ext_data'``
    key, and ``self.param`` is present in ``item['ext_data']``; any other
    item is logged at debug level and skipped.

    :param x: a single item or a list of items (items are accessed like
        dicts).
    :return: dict mapping a day key to the accumulated value. The day key
        is whatever ``datesutil.timestamp_to_datetime(item['t'], '%Y%m%d')``
        returns, or ``'19700101'`` when that call raises.
    """
    # Treat a lone item as a one-element list so both input shapes share a
    # single code path (the branches previously drifted apart).
    items = x if isinstance(x, list) else [x]
    result = {}
    for item in items:
        usable = ('t' in item and 'ext_data' in item
                  and self.param in item['ext_data'])
        if not usable:
            logger.debug("SumOnParamByDay error item: " + str(item))
            continue
        try:
            date = datesutil.timestamp_to_datetime(item['t'], '%Y%m%d')
        except Exception:
            # Best effort: keep the value but file it under the fallback
            # day. Log the offending item rather than the whole input.
            logger.info('error datesutil.timestamp_to_datetime:' + str(item))
            date = '19700101'
        # Always pass the explicit 0 default so single-item and list inputs
        # are converted the same way.
        value = datautil.str2float(item['ext_data'][self.param], 0)
        result[date] = result.get(date, 0) + value
    return result
def process(self, x):
    """Add up ``ext_data[self.param]`` over one item or a list of items.

    Items without an ``"ext_data"`` key, or whose ``ext_data`` does not
    contain ``self.param``, contribute nothing.

    :param x: a single item or a list of items (items are accessed like
        dicts).
    :return: the accumulated total; ``0`` when nothing contributed.
    """
    entries = x if isinstance(x, list) else [x]
    total = 0
    for entry in entries:
        if "ext_data" not in entry:
            continue
        extra = entry["ext_data"]
        if self.param not in extra:
            continue
        total += datautil.str2float(extra[self.param], 0)
    return total
def stdin_stream(columns=None,
                 field_delimiter=',',
                 ignore_first_line=False,
                 ignore_error_line=False,
                 ignore_blank_line=True,
                 callback=None,
                 batch_count=10000,
                 batch_key=None):
    '''
    Stream delimiter-separated text rows from stdin (e.g. a hive text
    stream) and hand them to ``callback`` in batches.

    :param columns: list of ``[name, type, default]`` entries, one per
        column; may be None or shorter than the real row width. Missing
        entries are filled from the skipped first line (only when no
        columns were given and its width matches the first data row) or
        with generated ``value-<i>`` string columns. ``type`` of ``"int"``,
        ``"float"`` or ``"bool"`` converts the field via ``datautil``; any
        other type keeps the raw string. The caller's list is not modified.
    :param field_delimiter: str, column separator, ``','`` by default. A
        value starting with ``0x`` is read as a hexadecimal character code.
    :param ignore_first_line: bool, treat the first input line as a header
        instead of data. Default False.
    :param ignore_error_line: bool, silently drop rows whose field count
        differs from the first data row. Default False, in which case such
        a row raises.
    :param ignore_blank_line: bool, skip empty lines. Default True.
    :param callback: function called with a ``DataFrame`` for every batch;
        must be callable as soon as any row is delivered.
    :param batch_count: int, number of rows after which a batch is
        delivered. When it is not positive, all rows are delivered in one
        final call.
    :param batch_key: list of column names used as a grouping key, or None
        for no grouping. With a key, a batch is only cut when a key not yet
        seen in the current batch arrives, so on key-ordered input the rows
        of one key are not split across two batches.
    :return: None
    :raises Exception: on a row with an unexpected field count when
        ``ignore_error_line`` is False.
    '''
    if field_delimiter.startswith("0x"):
        field_delimiter = chr(int(field_delimiter, base=16))
    # Work on a private copy: the declared default is None (which used to
    # crash on len()), and padding below must not leak into the caller's list.
    columns = [] if columns is None else list(columns)
    data = []
    keys_set = set()
    length = 0
    count = 0
    first_parts = []
    start = time.time()
    batch_key_index = []
    column_names = []
    lines = 0
    for line in sys.stdin:
        lines += 1
        line = line.strip('\n')
        if ignore_first_line and lines == 1:
            # Remember the header; it may name the columns later on.
            if len(line) > 0:
                first_parts = line.split(field_delimiter)
            continue
        if ignore_blank_line and len(line) == 0:
            continue
        parts = line.split(field_delimiter)
        if length == 0:
            # The first data row fixes the expected width and the schema.
            length = len(parts)
            if length == len(first_parts) and len(columns) == 0:
                columns = [[part, 'str', ''] for part in first_parts]
            elif len(columns) < length:
                for i in range(len(columns), length):
                    columns.append(["value-" + str(i), 'str', ''])
            column_names = [column[0] for column in columns]
            if batch_count > 0 and batch_key is not None:
                columns_index_dict = {
                    column: index
                    for index, column in enumerate(column_names)
                }
                # Key columns that do not exist are silently dropped.
                batch_key_index = [
                    columns_index_dict[column] for column in batch_key
                    if column in columns_index_dict
                ]
        if len(parts) != length:
            if not ignore_error_line:
                error_desc = line + "\nexpect be " + str(
                    length) + " parts, actually is " + str(
                        len(parts)) + " parts"
                raise Exception(error_desc)
            continue
        parse_parts = []
        for index, part in enumerate(parts):
            column_type = columns[index][1]
            column_default = columns[index][2]
            if column_type == "int":
                parse_part = datautil.str2int(part, int(column_default))
            elif column_type == "float":
                parse_part = datautil.str2float(part, float(column_default))
            elif column_type == "bool":
                parse_part = datautil.str2bool(part, bool(column_default))
            else:
                parse_part = part
            parse_parts.append(parse_part)
        if batch_count > 0 and len(batch_key_index) > 0:
            key_item = tuple(parse_parts[key] for key in batch_key_index)
            if key_item not in keys_set:
                # Only cut a batch on a key boundary.
                if count >= batch_count:
                    logger.info('batch items len = ' + str(len(data)) +
                                " cost = " + str(time.time() - start))
                    callback(DataFrame(data, columns=column_names))
                    count -= batch_count
                    data = []
                    keys_set = set()
                    start = time.time()
                keys_set.add(key_item)
            count += 1
        else:
            if count >= batch_count > 0:
                logger.info('batch items len = ' + str(len(data)) +
                            " cost = " + str(time.time() - start))
                callback(DataFrame(data, columns=column_names))
                count -= batch_count
                data = []
                start = time.time()
            count += 1
        data.append(parse_parts)
    # Deliver whatever is left after the input ends.
    if len(data) > 0:
        logger.info('last batch items len = ' + str(len(data)) + " cost = " +
                    str(time.time() - start))
        callback(DataFrame(data, columns=column_names))
def csv(file_path,
        columns,
        field_delimiter=',',
        ignore_first_line=False,
        ignore_error_line=False,
        ignore_blank_line=True):
    '''
    Read one or more delimiter-separated text files into a single DataFrame.

    The paths to read come from ``fileutil.get_paths(file_path)`` and each
    one is opened with the built-in ``open``.
    NOTE(review): the original notes mention local, hdfs and http sources
    and comma-separated / directory inputs; that depends on
    ``fileutil.get_paths``, which is not visible here - confirm.

    :param file_path: str, the input location(s); required.
    :param columns: list of ``[name, type, default]`` entries, one per
        column; may be None, empty, or shorter than the real row width.
        Missing entries are filled from the skipped first line (only when
        no columns were given and its width matches the first data row) or
        with generated ``value-<i>`` string columns. ``type`` of ``"int"``,
        ``"float"`` or ``"bool"`` converts the field via ``datautil``; any
        other type keeps the raw string. The caller's list is not modified.
    :param field_delimiter: str, column separator, ``','`` by default. A
        value starting with ``0x`` is read as a hexadecimal character code.
    :param ignore_first_line: bool, treat the first line of every file as a
        header instead of data. Default False.
    :param ignore_error_line: bool, silently drop rows whose field count
        differs from the first data row. Default False, in which case such
        a row raises.
    :param ignore_blank_line: bool, skip empty lines. Default True.
    :return: DataFrame holding every accepted row of every file.
    :raises Exception: on a row with an unexpected field count when
        ``ignore_error_line`` is False, or when no row was read at all.
    '''
    files = fileutil.get_paths(file_path)
    if field_delimiter.startswith("0x"):
        field_delimiter = chr(int(field_delimiter, base=16))
    # Work on a private copy: None is a documented input (it used to crash
    # on len()), and padding below must not leak into the caller's list.
    columns = [] if columns is None else list(columns)
    data = []
    length = 0
    first_parts = []
    column_names = []
    for path in files:
        with open(path) as f:
            if ignore_first_line:
                # Remember the header; it may name the columns later on.
                line = f.readline().strip('\n')
                if len(line) > 0:
                    first_parts = line.split(field_delimiter)
            for line in f:
                line = line.strip('\n')
                if ignore_blank_line and not len(line):
                    continue
                parts = line.split(field_delimiter)
                if length == 0:
                    # The first data row fixes the width and the schema.
                    length = len(parts)
                    if length == len(first_parts) and len(columns) == 0:
                        columns = [[part, 'str', ''] for part in first_parts]
                    elif len(columns) < length:
                        for i in range(len(columns), length):
                            columns.append(["value-" + str(i), 'str', ''])
                    column_names = [column[0] for column in columns]
                if len(parts) != length:
                    if not ignore_error_line:
                        error_desc = line + "\nexpect be " + str(
                            length) + " parts, actually is " + str(
                                len(parts)) + " parts"
                        raise Exception(error_desc)
                    continue
                parse_parts = []
                for index, part in enumerate(parts):
                    column_type = columns[index][1]
                    column_default = columns[index][2]
                    if column_type == "int":
                        parse_part = datautil.str2int(
                            part, int(column_default))
                    elif column_type == "float":
                        parse_part = datautil.str2float(
                            part, float(column_default))
                    elif column_type == "bool":
                        parse_part = datautil.str2bool(
                            part, bool(column_default))
                    else:
                        parse_part = part
                    parse_parts.append(parse_part)
                data.append(parse_parts)
    if len(data) > 0:
        return DataFrame(data, columns=column_names)
    raise Exception("no data, please check your input path!")