class UmcWriter(object):
    """Base class for umc writers.

    Reads common writer parameters from the configuration, applies any
    writer-specific overrides and periodically collects log statistics
    for the configured umc instances.
    """

    def __init__(self, config, writerDef):
        """Initialize the writer from *config* and its *writerDef* entry.

        Common parameters come from ``common.umcpush.writer-params`` and may
        be overridden by this writer's own ``writer-params`` section.
        """
        self.config = config
        self.writerDef = writerDef
        self.writer_id = writerDef["writer-id"]

        # read common writer's params
        base_key = "common.umcpush.writer-params"
        self.params = Map(
            delay_writes=self.config.value(base_key + ".delay-writes", 0.2),
            delay_runs=self.config.value(base_key + ".delay-runs", 10),
            connection_retry_count=self.config.value(
                base_key + ".connection-retry-count", 5),
            connection_retry_interval=self.config.value(
                base_key + ".connection-retry-interval", 10),
            write_interval=self.config.value(base_key + ".write-interval", 0),
        )

        # update any value that may be overridden in this writer's
        # specific parameters
        wparams = self.param("writer-params")
        if wparams is not None:
            for k, v in wparams.items():
                k = k.replace("-", "_")
                # update only params that exist in common params; unknown
                # params are left alone as they may be used by a child class
                if self.params.get(k) is not None:
                    self.params[k] = v

    def param(self, param_name, default=None):
        """Return *param_name* from this writer's definition or *default*."""
        return self.config.value_element(self.writerDef, param_name, default)

    def read_umcdef(self, umc_id, umcconf):
        """Return this writer's definition for one umc instance.

        Returns a Map with keys ``enabled`` and ``writerDef``; when this
        writer is not referenced by the umc instance, ``enabled`` is False
        and ``writerDef`` is None.
        """
        for writer in self.config.value_element(umcconf, "writers", []):
            if writer["writer-id"] == self.writer_id:
                return Map(
                    enabled=self.config.value_element(writer, "enabled", True),
                    writerDef=writer)
        # writer definition for this umc instance has not been found
        return Map(enabled=False, writerDef=None)

    def run_task(self, GlobalContext, tdef):
        """Collect backlog and error-log statistics for every enabled umc
        instance and store them on the instance definition (``ud.log_stats``).

        Always returns True so the task scheduler keeps the task alive.
        """
        if GlobalContext.umcdefs is not None:
            for ud in GlobalContext.umcdefs:
                if not ud.enabled:
                    continue
                ud.lock.acquire()
                try:
                    log_stats = Map(backlog_total=0, errorlog_mtime=0,
                                    errorlog_size=0, errorlog_tail=[])
                    log_dir = get_umc_instance_log_dir(ud.umc_instanceid,
                                                       GlobalContext)
                    if not os.path.isdir(log_dir):
                        Msg.warn_msg("Directory %s does not exist!" % log_dir)
                    else:
                        # FIX: compile the patterns once per instance (they
                        # were rebuilt for every file), escape the instance id
                        # and the literal dots so regex metacharacters cannot
                        # distort the match.
                        iid = re.escape(ud.umc_instanceid)
                        # a log file waiting to be consumed; a maximum of
                        # 9 file groups (1-9) is supported
                        backlog_re = re.compile(
                            r"^%s_[0-9\-]+\.log\.([1-9])$" % iid)
                        # the error log of the instance
                        errorlog_re = re.compile(
                            r"^%s(_[0-9\-]+)?\.error\.out$" % iid)

                        for file in [os.path.basename(f.path)
                                     for f in scandir(log_dir)]:
                            m1 = backlog_re.match(file)
                            if m1:
                                fg_key = "backlog_group_%s" % m1.group(1)
                                if log_stats.get(fg_key) is None:
                                    log_stats[fg_key] = 1
                                else:
                                    log_stats[fg_key] += 1
                                log_stats.backlog_total += 1

                            m2 = errorlog_re.match(file)
                            if m2:
                                stat = os.stat(log_dir + "/" + file)
                                log_stats.errorlog_size = stat.st_size
                                if log_stats.errorlog_size > 0:
                                    log_stats.errorlog_mtime = stat.st_mtime
                                else:
                                    log_stats.errorlog_mtime = 0
                                # tailing the error log here takes too much
                                # time to finish; errorlog_tail stays empty

                    # update log stats
                    ud.log_stats = log_stats
                finally:
                    ud.lock.release()
        return True
class UmcReader:
    """Reads umc csv log files and converts their rows to data points.

    Reader parameters come from ``common.umcpush.reader-params`` and may be
    overridden per writer via its ``reader-params`` section.
    """

    def __init__(self, config, writer_id):
        """Read common reader params and apply overrides for *writer_id*."""
        self.config = config

        # read common reader's params
        # FIX: common-tags/common-fields get a "" default so a missing key
        # no longer raises AttributeError on .split()
        base_key = "common.umcpush.reader-params"
        self.params = Map(
            max_batchsize_rows=self.config.value(
                base_key + ".max-batchsize-rows", 50),
            max_batchsize_files=self.config.value(
                base_key + ".max-batchsize-files", 300),
            log_file_group=self.config.value(base_key + ".log-file-group", 1),
            common_tags=self.config.value(
                base_key + ".common-tags", "").split(','),
            common_fields=self.config.value(
                base_key + ".common-fields", "").split(','),
            default_timefield=self.config.value(
                base_key + ".default-timefield", "datetime"),
            default_timeformat=self.config.value(
                base_key + ".default-timeformat", "%Y-%m-%d %H:%M:%S"),
            tzoffset=utils.float_ex(
                self.config.value(base_key + ".tzoffset", 0), 0))

        # update any value that may be overridden in the writer's
        # specific reader parameters
        for writer in config.value("common.umcpush.writers"):
            if writer["writer-id"] != writer_id:
                continue
            rparams = writer["reader-params"]
            if rparams is not None:
                for k, v in rparams.items():
                    k = k.replace("-", "_")
                    if self.params.get(k):
                        self.params[k] = v
                    else:
                        # FIX: the original referenced an undefined name
                        # `key` here, raising NameError on this path
                        Msg.warn_msg(
                            "The reader param %s is invalid in %s"
                            % (k, writer_id))

    # *** reads and checks umc definition for a specific umc id
    def read_umcdef(self, umc_id, umcconf):
        """Read and validate the reader definition of one umc id.

        Returns a Map with tag/field columns, time field/format, optional
        timezone field, filter and transform expressions.
        Raises Exception when the configured time format is invalid.
        """
        # tags and fields cols of this umc definition
        tcols = [x.strip() for x in self.config.value_element(
            umcconf, "reader.tags").split(',') if x != '']
        fcols = [x.strip() for x in self.config.value_element(
            umcconf, "reader.fields").split(',') if x != '']

        # combine with common tags and fields cols; a column prefixed with
        # '!' in the umc definition suppresses the matching common column
        tcols.extend(x for x in [y.strip() for y in self.params.common_tags]
                     if x != '' and x not in tcols and '!' + x not in tcols)
        # FIX: the original checked '!' + x against tcols (copy-paste bug),
        # so "!field" suppressions were ignored for fields
        fcols.extend(x for x in [y.strip() for y in self.params.common_fields]
                     if x != '' and x not in fcols and '!' + x not in fcols)

        # remove all commented-out fields and tags
        tcols = [x for x in tcols if not x.startswith('!')]
        fcols = [x for x in fcols if not x.startswith('!')]

        # read and check time field and its format
        timeformat = self.config.value_element(
            umcconf, "reader.timeformat", self.params.default_timeformat)
        try:
            if timeformat not in ['_unix_', '_time_s_', '_time_ms_']:
                strftime(timeformat, gmtime())
        except Exception as e:
            raise Exception(
                "The time format '%s' is invalid for umc '%s': %s!"
                % (timeformat, umc_id, e))

        timefield = self.config.value_element(
            umcconf, "reader.timefield", self.params.default_timefield)
        tzfield = self.config.value_element(umcconf, "reader.tzfield", None)
        row_filter = self.config.value_element(umcconf, "reader.filter", None)

        # transformation expressions
        transform = self.config.value_element(
            umcconf, "reader.transform", None)

        return Map(tcols=tcols, fcols=fcols, timeformat=timeformat,
                   timefield=timefield, tzfield=tzfield, filter=row_filter,
                   transform=transform)
    # // read_umcdef

    def unix_time_millis(self, dt):
        """Milliseconds between naive datetime *dt* and the unix epoch."""
        return int((dt - epoch).total_seconds() * 1000)

    # retrieves the first batch of log files sorted by modified time
    def get_batch_logs(self, logDir, umc_instanceids, files_in_buffer=None):
        """Return up to max-batchsize-files log files of the configured log
        file group under *logDir*, newest first, skipping files already in
        *files_in_buffer* and instances not listed in *umc_instanceids*.
        """
        # FIX: avoid the mutable default argument; behavior is unchanged
        if files_in_buffer is None:
            files_in_buffer = []
        # FIX: raw strings for the regex patterns (the escapes were relying
        # on Python passing unknown escapes through unchanged)
        pattern = re.compile(r".+_[0-9]+.*\.log.{log_file_group}$".format(
            log_file_group=self.params.log_file_group))
        search_re = logDir + r"/[a-zA-Z0-9\._\-]+/([a-zA-Z0-9\-\._]+)"

        batch = []
        cnt = 0
        for dirname, dirnames, filenames in walk(logDir):
            m = re.match(search_re, dirname)
            if m and m.group(1) in umc_instanceids:
                for filename in filenames:
                    fullfname = os.path.join(dirname, filename)
                    if (fullfname not in files_in_buffer
                            and pattern.match(filename)):
                        cnt += 1
                        if cnt <= self.params.max_batchsize_files:
                            batch.append(fullfname)
                        if cnt > self.params.max_batchsize_files:
                            # batch is full; stop scanning this directory
                            break
        return sorted(batch, key=lambda fn: os.stat(fn).st_mtime,
                      reverse=True)
    # // get_batch_logs

    # read data points from a single log file
    def read_datapoints(self, logfilename, umcdef, create_writeitem_func):
        """Read data points from one csv log file.

        Each row is converted to (timestamp, tags, fields) and handed to
        *create_writeitem_func*; list results are accumulated and returned.
        Rows with unparsable timestamps are logged and skipped.
        """
        datapoints = []
        notags = False
        tzoffset = self.params.tzoffset
        if not umcdef.enabled:
            return datapoints

        with open(logfilename, 'r') as csvfile:
            reader = csv.DictReader(csvfile, delimiter=',')
            for row in reader:
                # remove None keys (extra columns beyond the header)
                row = {k: v for k, v in row.items() if k is not None}

                # timestamp; FIX: int() instead of the Python-2-only long()
                # (int auto-promotes on Python 2, so behavior is identical)
                try:
                    if umcdef.reader.timefield not in row:
                        raise ValueError(
                            "Cannot find time field '"
                            + umcdef.reader.timefield + "' in data row!")
                    if umcdef.reader.timeformat in ("_unix_", "_time_s_"):
                        timestamp = int(
                            row[umcdef.reader.timefield]) * 1000000000
                    elif umcdef.reader.timeformat == "_time_ms_":
                        timestamp = int(
                            row[umcdef.reader.timefield]) * 1000000
                    else:
                        # per-row timezone override when a tzfield is defined
                        if (umcdef.reader.tzfield is not None
                                and umcdef.reader.tzfield in row):
                            tzoffset = utils.float_ex(
                                row[umcdef.reader.tzfield],
                                self.params.tzoffset)
                        timestamp = (self.unix_time_millis(
                            datetime.datetime.strptime(
                                row[umcdef.reader.timefield],
                                umcdef.reader.timeformat))
                            - int(tzoffset * 60 * 60 * 1000)) * 1000000
                except Exception as e:
                    # output error and skip this row
                    Msg.err_msg(
                        "Cannot read or convert time to timestamp for %s: %s"
                        % (umcdef.umcid, str(e)))
                    continue

                # create tags and fields
                tags = {k: str(v) for k, v in row.items()
                        if k in umcdef.reader.tcols}
                fields = {k: utils.float_ex(v) for k, v in row.items()
                          if k in umcdef.reader.fcols}
                notags = (len(tags) == 0)

                # only add this row if at least one field has a value
                if any(v is not None for v in fields.values()):
                    # evaluate transformations
                    if umcdef.reader.transform is not None:
                        tags, fields = eval_transform(
                            umcdef.reader.transform, timestamp, tags, fields)
                    # only add this row if the filter holds or there is none
                    if umcdef.reader.filter is None or eval_filter(
                            umcdef.reader.filter, timestamp, tags, fields):
                        try:
                            records = create_writeitem_func(
                                umcdef, timestamp, fields, tags)
                            if records is not None and isinstance(
                                    records, list):
                                datapoints += records
                        except Exception as e:
                            Msg.err_msg(
                                "Error occured while creating data points item: %s"
                                % str(e))
            # // end reading rows
        # // end open file

        # check for no tags
        if notags and len(datapoints) > 0:
            Msg.warn_msg(
                "The definition of %s contains no tags presented in the log file %s!"
                % (umcdef.umcid, os.path.basename(logfilename)))
        return datapoints