class WCAggregatorBolt(Bolt):

    outputs = [
        Stream(fields=['info', 'wordcounts_list'], name="default"),
        Stream(fields=['info', 'keywords'], name="rec")
    ]

    def initialize(self, conf, ctx):
        self.pid = os.getpid()
        self.wcs = []
        self.count = 0

    def process(self, tup):
        info = tup.values[0]
        wordcount_dict = tup.values[1]
        self.wcs.append(wordcount_dict)
        self.count += 1

        if self.count == 10:
            self.count = 0
            global_wordcount = wcm.aggregate_wordcount_dicts(self.wcs)
            global_wordcount_dict_best = {
                k: wcm.take_firsts(v, n=3)
                for k, v in global_wordcount.items()
                if k in ["PERSON", "ORGANIZATION"]
            }
            global_wordcount_best = wcm.aggregate_subjects(
                global_wordcount_dict_best)

            self.logger.info("sending tokens to RecursiveBolt")
            for token in global_wordcount_best:
                self.emit([info, token[0]], stream="rec")

            self.emit([info, self.wcs])
class MultiBolt(Bolt):

    outputs = [
        Stream(['value'], 'word'),
        Stream(['value'], 'number'),
    ]

    def initialize(self, conf, ctx):
        self.pid = os.getpid()

    def process(self, tup):
        # relies on streamparse exposing tup.values as a namedtuple of the
        # upstream component's output fields (here a field named 'word')
        word = tup.values.word
        if isinstance(word, int):
            self.emit([word], stream='number')
        else:
            self.emit([word], stream='word')
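# The two Stream declarations above make 'word' and 'number' independently
# subscribable. A minimal wiring sketch, assuming streamparse's Topology DSL
# and spec indexing by stream name (the same idiom setup_debug_hooks below
# relies on); ItemSpout, PrintBolt and the topology name are illustrative
# stand-ins, not part of the original code.
from itertools import cycle

from streamparse import Bolt, Grouping, Spout, Topology


class ItemSpout(Spout):
    """Toy source emitting a mix of words and numbers on field 'word'."""
    outputs = ['word']

    def initialize(self, conf, ctx):
        self._items = cycle(['storm', 7, 'kafka', 42])

    def next_tuple(self):
        self.emit([next(self._items)])


class PrintBolt(Bolt):
    """Toy sink that logs which stream a value arrived on."""
    def process(self, tup):
        self.log('{} -> {!r}'.format(tup.stream, tup.values[0]))


class MultiStreamTopology(Topology):
    items = ItemSpout.spec()
    splitter = MultiBolt.spec(inputs=[items])
    # one sink per named output stream of the splitter
    words = PrintBolt.spec(name='word-sink',
                           inputs={splitter['word']: Grouping.SHUFFLE})
    numbers = PrintBolt.spec(name='number-sink',
                             inputs={splitter['number']: Grouping.SHUFFLE})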
class TDMSParseBolt(Bolt):

    # channel properties as tuple fields
    tuple_fields = ('timestamp', 'time_offset', 'time_increment', 'samples',
                    'channel_name', 'module_name', 'data')

    # group channels as output streams' names
    channels = ('FCXF-X-02-T01', 'FCXF-X-02-T02', 'FCXF-X-02-T03',
                'FCXF-X-02-T04', 'FCXF-X-03-T01', 'FCXF-X-03-T02',
                'FCXF-X-03-T03', 'FCXF-X-03-T04', 'FCXF-X-03-T05',
                'FCXF-X-03-T06', 'FCXF-X-04-T01', 'FCXF-X-04-T02',
                'FCXF-X-04-T03', 'FCXF-X-04-T04', 'FCXF-X-02-S01',
                'FCXF-X-02-S02', 'FCXF-X-02-S03', 'FCXF-X-02-S04',
                'FCXF-X-02-A01', 'FCXF-X-03-A01', 'FCXF-X-03-A02',
                'FCXF-X-04-A01', 'FCXF-X-03-S05', 'FCXF-X-03-S06',
                'FCXF-X-03-S01', 'FCXF-X-03-S02', 'FCXF-X-03-S03',
                'FCXF-X-03-S04', 'FCXF-X-04-S01', 'FCXF-X-04-S02',
                'FCXF-X-04-S03', 'FCXF-X-04-S04')

    # declare one output stream per channel
    outputs = [Stream(tuple_fields, channel) for channel in channels]

    def process(self, tup):
        data = tup.values[0]
        # decode the base64 encoded data
        decoded_data = base64.b64decode(data)
        # the TDMS payload starts with the 'TDSm' tag
        i = decoded_data.index(b'TDSm')
        data_stream = io.BytesIO(decoded_data[i:])
        tdms_file = TdmsFile(data_stream)
        for values in self._parse(tdms_file):
            # values[-3] is the channel name, which is also the stream name
            self.emit(values, stream=values[-3])

    def _parse(self, tdms_file):
        for group in tdms_file.groups():
            for channel in tdms_file.group_channels(group):
                if channel.channel in self.channels:
                    # acquire this channel's 'wf_start_time' property
                    # and get its timestamp value for JSON serialization
                    start_time = channel.property('wf_start_time')
                    timestamp = time.mktime(start_time.timetuple())
                    values = [timestamp]
                    # acquire this channel's other properties
                    others = [
                        v for k, v in channel.properties.items()
                        if k != 'wf_start_time'
                    ]
                    values.extend(others)
                    # acquire channel data
                    data = channel.data.tolist()
                    values.append(data)
                    yield values
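# A small standalone illustration of the framing logic in process() above:
# the incoming message is base64-encoded and may carry a few leading bytes
# before the actual TDMS payload, so the bolt seeks to the 'TDSm' lead-in tag
# before handing the rest to TdmsFile. The payload below is a fake, truncated
# blob used only to show the offset search.
import base64
import io

message = base64.b64encode(b'\x00\x01junk-header' + b'TDSm' + b'\x00' * 16)

decoded = base64.b64decode(message)
i = decoded.index(b'TDSm')             # TDMS segments begin with 'TDSm'
data_stream = io.BytesIO(decoded[i:])  # what would be passed to TdmsFile(...)
print(i, decoded[i:i + 4])             # -> 13 b'TDSm'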
def setup_debug_hooks(topo_class):
    inputs = []
    graph = topo_class.to_flux_dict('')

    for spec in topo_class.specs:
        for source_name, output in spec.outputs.items():
            if source_name == "error_handler_bolt":
                continue
            inputs.append(spec[source_name])

    topo_class.debug_bolt = ShellBolt.spec(command='coffee',
                                           script='node/bolts/debug.coffee',
                                           outputs=[Stream(fields=['config'])],
                                           inputs=inputs)
    topo_class.debug_bolt.name = "debug_bolt"
    Topology.add_bolt_spec(topo_class.debug_bolt, topo_class.thrift_bolts)
    topo_class.specs.append(topo_class.debug_bolt)

    directory = 'src/topology_graphs'
    if not os.path.exists(directory):
        os.makedirs(directory)

    topo_name = topo_class.__module__
    split_topo_name = topo_name.split('.')
    if len(split_topo_name) > 1:
        topo_name = split_topo_name[1]

    path = "{}/{}.json".format(directory, topo_name)
    fullpath = os.path.abspath(path)
    with open(fullpath, 'w+') as outfile:
        json.dump(graph, outfile, indent=2)
class Tokenizer(AbstractBolt):
    """Split the mail into token parts (body, attachments, etc.) and
    send these parts to other bolts."""

    outputs = [
        Stream(
            fields=['sha256_random', 'mail', 'is_filtered'],
            name='mail'),
        Stream(
            fields=['sha256_random', 'raw_mail', 'mail_type', 'is_filtered'],
            name='raw_mail'),
        Stream(
            fields=['sha256_random', 'body', 'is_filtered'],
            name='body'),
        Stream(
            fields=['sha256_random', 'network', 'is_filtered'],
            name='network'),
        Stream(
            fields=['sha256_random', 'with_attachments', 'attachments'],
            name='attachments')]

    def __getattr__(self, name):
        return self.conf[name]

    def get_persistent_path(self, filter_name):
        return os.path.join(
            self.persistent_path, "{}.dump".format(filter_name))

    def initialize(self, stormconf, context):
        super(Tokenizer, self).initialize(stormconf, context)

        self.filter_types = ("mails", "attachments", "network")
        self.load_filters()

        self.mailparser = {
            MAIL_PATH: mailparser.parse_from_file,
            MAIL_PATH_OUTLOOK: mailparser.parse_from_file_msg,
            MAIL_STRING: mailparser.parse_from_string}

    def load_filters(self):
        for i in self.filter_types:
            if getattr(self, "filter_" + i):
                path = self.get_persistent_path(i)
                try:
                    obj = load_obj(path)
                    setattr(self, "analyzed_" + i, obj)
                except (IOError, EOFError, ValueError, BadPickleGet):
                    setattr(self, "analyzed_" + i, deque(
                        maxlen=getattr(self, "maxlen_" + i)))

    def dump_filters(self):
        for i in self.filter_types:
            if getattr(self, "filter_" + i):
                path = self.get_persistent_path(i)
                dump_obj(path, getattr(self, "analyzed_" + i))
                self.log("Dumped RAM filter {!r} in {!r}".format(i, path))

    def _make_mail(self, tup):
        raw_mail = tup.values[0]
        mail_type = tup.values[5]
        rand = '_' + ''.join(random.choice('0123456789') for i in range(10))
        self.parser = self.mailparser[mail_type](raw_mail)

        # get only the main headers because their number can explode;
        # Elastic can't manage all possible headers
        mail = self.parser.mail_partial
        mail["headers"] = self.parser.headers_json

        # Data mail sources
        mail["mail_server"] = tup.values[1]
        mail["mailbox"] = tup.values[2]
        mail["priority"] = tup.values[3]
        mail["sender_ip"] = self.parser.get_server_ipaddress(tup.values[4])

        # Fingerprints of mail body
        (mail["md5"], mail["sha1"], mail["sha256"], mail["sha512"],
         mail["ssdeep"]) = fingerprints(self.parser.body.encode('utf-8'))
        sha256_rand = mail["sha256"] + rand

        if mail_type in (MAIL_PATH, MAIL_PATH_OUTLOOK):
            mail_string = raw_mail.split("/")[-1].replace(".processing", "")
            self.log("{}: {}".format(mail_string, mail["sha256"]))
            with open(raw_mail) as f:
                mail["size"] = len(f.read())
        elif mail_type in (MAIL_STRING, ):
            mail["size"] = len(raw_mail)

        # Add path to result
        if mail_type == MAIL_PATH:
            mail["mail_file"] = raw_mail.split("/")[-1].replace(
                ".processing", "")

        # Dates
        if mail.get('date'):
            mail["date"] = mail.get('date').isoformat()
        else:
            mail["date"] = datetime.datetime.utcnow().isoformat()

        mail["analisys_date"] = datetime.datetime.utcnow().isoformat()

        # Adding custom headers
        for h in tup.values[6]:
            mail["custom_" + h] = get_header(self.parser.message, h)

        # Remove attachments
        mail.pop("attachments", None)

        return sha256_rand, mail

    def process_tick(self, freq):
        """Every freq seconds the configuration is reloaded and the RAM
        filters are dumped to disk."""
        super(Tokenizer, self).process_tick(freq)
        self.dump_filters()

    def process(self, tup):
        try:
            sha256_rand, mail = self._make_mail(tup)
            sha256 = sha256_rand.split("_")[0]
            self.log("Processing started: {}".format(sha256))

            with_attachments = False
            attachments = []
            body = self.parser.body
            raw_mail = tup.values[0]
            mail_type = tup.values[5]

            # If filter network is enabled
            is_filtered_net = False
            if self.filter_network:
                if mail["sender_ip"] in self.analyzed_network:
                    is_filtered_net = True

                # Update the database of analyzed ip addresses
                self.analyzed_network.append(mail["sender_ip"])

            # If filter mails is enabled
            is_filtered_mail = False
            if self.filter_mails:
                if mail["sha1"] in self.analyzed_mails:
                    mail.pop("body", None)
                    body = six.text_type()
                    raw_mail = six.text_type()
                    is_filtered_mail = True

                # Update the database of analyzed mails
                self.analyzed_mails.append(mail["sha1"])

            if self.parser.attachments:
                with_attachments = True
                attachments = MailAttachments.withhashes(
                    self.parser.attachments)

                # If filter attachments is enabled
                if self.filter_attachments:
                    hashes = attachments.filter(self.analyzed_attachments)
                    self.analyzed_attachments.extend(hashes)

        except TypeError as e:
            self.raise_exception(e, tup)

        except UnicodeDecodeError as e:
            self.raise_exception(e, tup)
class Tokenizer(AbstractBolt):
    """Split the mail into token parts (body, attachments, etc.) and
    send these parts to other bolts."""

    outputs = [
        Stream(
            fields=['sha256_random', 'mail', 'is_filtered'],
            name='mail'),
        Stream(
            fields=['sha256_random', 'raw_mail', 'mail_type', 'is_filtered'],
            name='raw_mail'),
        Stream(
            fields=['sha256_random', 'body', 'is_filtered'],
            name='body'),
        Stream(
            fields=['sha256_random', 'network', 'is_filtered'],
            name='network'),
        Stream(
            fields=['sha256_random', 'with_attachments', 'attachments'],
            name='attachments')]

    def initialize(self, stormconf, context):
        super(Tokenizer, self).initialize(stormconf, context)

        self.mailparser = {
            MAIL_PATH: mailparser.parse_from_file,
            MAIL_PATH_OUTLOOK: mailparser.parse_from_file_msg,
            MAIL_STRING: mailparser.parse_from_string}

        self.mails_analyzed = deque(maxlen=self.conf["maxlen_mails"])
        self.network_analyzed = deque(maxlen=self.conf["maxlen_network"])
        self.attachments_analyzed = deque(
            maxlen=self.conf["maxlen_attachments"])

        self._load_filters()

    def _load_filters(self):
        self.filter_mails_enabled = self.conf["filter_mails"]
        self.filter_network_enabled = self.conf["filter_network"]
        self.filter_attachments_enabled = self.conf["filter_attachments"]

    def _make_mail(self, tup):
        raw_mail = tup.values[0]
        mail_type = tup.values[5]
        rand = '_' + ''.join(random.choice('0123456789') for i in range(10))
        self.parser = self.mailparser[mail_type](raw_mail)
        mail = self.parser.mail

        # Data mail sources
        mail["mail_server"] = tup.values[1]
        mail["mailbox"] = tup.values[2]
        mail["priority"] = tup.values[3]
        mail["sender_ip"] = self.parser.get_server_ipaddress(tup.values[4])

        # Fingerprints of mail body
        (mail["md5"], mail["sha1"], mail["sha256"], mail["sha512"],
         mail["ssdeep"]) = fingerprints(self.parser.body.encode('utf-8'))
        sha256_rand = mail["sha256"] + rand

        # Add path to result
        if mail_type == MAIL_PATH:
            mail["path_mail"] = raw_mail

        # Dates
        if mail.get('date'):
            mail["date"] = mail.get('date').isoformat()
        else:
            mail["date"] = datetime.datetime.utcnow().isoformat()

        mail["analisys_date"] = datetime.datetime.utcnow().isoformat()

        # Adding custom headers
        for h in tup.values[6]:
            mail["custom_" + h] = self.parser.message.get(h)

        # Remove attachments
        mail.pop("attachments", None)

        return sha256_rand, mail

    def process_tick(self, freq):
        """Every freq seconds the configuration is reloaded."""
        super(Tokenizer, self).process_tick(freq)
        self._load_filters()

    def process(self, tup):
        try:
            sha256_rand, mail = self._make_mail(tup)

            with_attachments = False
            attachments = []
            body = self.parser.body
            raw_mail = tup.values[0]
            mail_type = tup.values[5]

            # If filter network is enabled
            is_filtered_net = False
            if self.filter_network_enabled:
                if mail["sender_ip"] in self.network_analyzed:
                    is_filtered_net = True

                # Update the database of analyzed ip addresses
                self.network_analyzed.append(mail["sender_ip"])

            # If filter mails is enabled
            is_filtered_mail = False
            if self.filter_mails_enabled:
                if mail["sha1"] in self.mails_analyzed:
                    mail.pop("body", None)
                    body = six.text_type()
                    raw_mail = six.text_type()
                    is_filtered_mail = True

                # Update the database of analyzed mails
                self.mails_analyzed.append(mail["sha1"])

            if self.parser.attachments:
                with_attachments = True
                attachments = MailAttachments.withhashes(
                    self.parser.attachments)

                # If filter attachments is enabled
                if self.filter_attachments_enabled:
                    hashes = attachments.filter(self.attachments_analyzed)
                    self.attachments_analyzed.extend(hashes)

        except TypeError as e:
            self.raise_exception(e, tup)

        except UnicodeDecodeError as e:
            self.raise_exception(e, tup)
class Tokenizer(AbstractBolt):
    """Split the mail into token parts (body, attachments, etc.)."""

    outputs = [
        Stream(fields=['sha256_random', 'mail', 'is_filtered'],
               name='mail'),
        Stream(fields=['sha256_random', 'body', 'is_filtered'],
               name='body'),
        Stream(fields=['sha256_random', 'network', 'is_filtered'],
               name='network'),
        Stream(fields=['sha256_random', 'with_attachments', 'attachments'],
               name='attachments')
    ]

    def initialize(self, stormconf, context):
        super(Tokenizer, self).initialize(stormconf, context)

        self._parser = MailParser()
        self._mails_analyzed = deque(maxlen=self.conf["maxlen_mails"])
        self._network_analyzed = deque(maxlen=self.conf["maxlen_network"])
        self._attachments_analyzed = deque(
            maxlen=self.conf["maxlen_attachments"])

        self._load_filters()

    def _load_filters(self):
        self._filter_mails_enabled = self.conf["filter_mails"]
        self._filter_network_enabled = self.conf["filter_network"]
        self._filter_attachments_enabled = self.conf["filter_attachments"]

    @property
    def filter_mails_enabled(self):
        return self._filter_mails_enabled

    @property
    def filter_network_enabled(self):
        return self._filter_network_enabled

    @property
    def filter_attachments_enabled(self):
        return self._filter_attachments_enabled

    @property
    def parser(self):
        return self._parser

    def _make_mail(self, tup):
        raw_mail = tup.values[0]
        mail_format = tup.values[5]
        rand = '_' + ''.join(random.choice('0123456789') for i in range(10))

        # Check if kind_data is correct
        if mail_format != STRING and mail_format != PATH:
            raise InvalidMailFormat(
                "Invalid mail format {!r}. Choose {!r} or {!r}".format(
                    mail_format, STRING, PATH))

        # Parsing mail
        if mail_format == PATH:
            if os.path.exists(raw_mail):
                self.parser.parse_from_file(raw_mail)
        else:
            self.parser.parse_from_string(raw_mail)

        # Getting all parts
        mail = self.parser.parsed_mail_obj

        # Data mail sources
        mail["mail_server"] = tup.values[1]
        mail["mailbox"] = tup.values[2]
        mail["priority"] = tup.values[3]
        mail["sender_ip"] = self.parser.get_server_ipaddress(tup.values[4])

        # Fingerprints of mail body
        (mail["md5"], mail["sha1"], mail["sha256"], mail["sha512"],
         mail["ssdeep"]) = fingerprints(self.parser.body.encode('utf-8'))
        sha256_rand = mail["sha256"] + rand

        # Add path to result
        if mail_format == PATH:
            mail["path_mail"] = raw_mail

        # Dates
        if mail.get('date'):
            mail["date"] = mail.get('date').isoformat()
        else:
            mail["date"] = datetime.datetime.utcnow().isoformat()

        mail["analisys_date"] = datetime.datetime.utcnow().isoformat()

        # Remove attachments
        mail.pop("attachments", None)

        return sha256_rand, mail

    def process_tick(self, freq):
        """Every freq seconds the configuration is reloaded."""
        super(Tokenizer, self).process_tick(freq)
        self._load_filters()

    def process(self, tup):
        try:
            sha256_rand, mail = self._make_mail(tup)

            with_attachments = False
            attachments = []
            body = self.parser.body

            # If filter network is enabled
            is_filtered = False
            if self.filter_network_enabled:
                if mail["sender_ip"] in self._network_analyzed:
                    is_filtered = True

                # Update the database of analyzed ip addresses
                self._network_analyzed.append(mail["sender_ip"])

            # If filter mails is enabled
            is_filtered = False
            if self.filter_mails_enabled:
                if mail["sha1"] in self._mails_analyzed:
                    mail.pop("body", None)
                    body = six.text_type()
                    is_filtered = True

                # Update the database of analyzed mails
                self._mails_analyzed.append(mail["sha1"])

            # Emit only attachments
            raw_attach = self.parser.attachments_list
            if raw_attach:
                with_attachments = True
                attachments = MailAttachments.withhashes(raw_attach)

                # If filter attachments is enabled
                if self.filter_attachments_enabled:
                    hashes = attachments.filter(self._attachments_analyzed)
                    self._attachments_analyzed.extend(hashes)

        except TypeError as e:
            self.raise_exception(e, tup)

        except UnicodeDecodeError as e:
            self.raise_exception(e, tup)
class DetrendBolt(Bolt):
    # todo: try pyecharts

    tup_fields = ('timestamp', 'time_offset', 'time_increment', 'samples',
                  'channel_name', 'module_name', 'data')
    outputs = [Stream(fields=tup_fields, name='detrend')]

    def initialize(self, storm_conf, context):
        # String, dt1: how often to recalculate a mode value, e.g. '5s', '20ms'
        # String, dt2: how much data is used to calculate the mode value, e.g. '5s', '20ms'
        self.dt1 = pd.Timedelta(storm_conf.get('dt1', '5s'))
        self.dt2 = pd.Timedelta(storm_conf.get('dt2', '15s'))

    def process(self, tup):
        """
        step 1: accept tup and convert it to a pandas.DataFrame;
        step 2: call self._detrend() to remove the mode value from the original data;
        step 3: convert the resulting pandas.DataFrame to a list and emit it.

        :param streamparse.Tuple tup: tup.values = [timestamp, time_offset,
            time_increment, samples, channel_name, module_name, data]
        :return streamparse.Tuple tup: tup.values = [timestamp, time_offset,
            time_increment, samples, channel_name, module_name, data]
        """
        timestamp = tup.values[0]
        time_increment = tup.values[2]
        channel_name = tup.values[4]
        data = tup.values[6]

        index = pd.date_range(start=pd.Timestamp(timestamp, unit='s', tz='UTC'),
                              periods=len(data),
                              freq='{}ms'.format(int(time_increment / 0.001)))
        df = pd.DataFrame(data=data, index=index, columns=[channel_name])

        try:
            self.history = self.history.combine_first(df)
        except AttributeError:
            self.history = df

        self.freq = df.index.freq
        self.res = list(tup.values)
        self.history = self.history.asfreq(self.freq)

        for df in self._detrend():
            self.res[0] = df.index[0].timestamp()
            self.res[6] = df[channel_name].values.tolist()
            self.emit(self.res, stream='detrend')

    def _detrend(self):
        """
        remove the mode value from the original data

        :return pandas.DataFrame df: de-trended signal segments
        """
        self.log('length of history data: {}'.format(len(self.history)),
                 level='info')
        n = int(self.dt1 / self.freq)
        m = int(self.dt2 / self.freq)
        while len(self.history) >= m and self.history.head(m).notnull().all().all() \
                or len(self.history) >= 2 * m and self.history.head(m).notnull().any().all():
            mode = self.history.head(m).mode()
            df = self.history[:n] - mode.loc[0].values
            self.history = self.history[n:]
            yield df
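# The core of _detrend() above is "subtract the per-window mode from the
# signal". A toy, self-contained pandas sketch of that step (channel name and
# values are made up):
import pandas as pd

idx = pd.date_range('2021-01-01', periods=8, freq='250ms', tz='UTC')
df = pd.DataFrame({'FCXF-X-02-T01': [5.0, 5.0, 5.1, 5.0, 5.2, 5.0, 5.1, 5.0]},
                  index=idx)

mode = df.mode()                     # row 0 holds the most frequent value per column
detrended = df - mode.loc[0].values  # subtract it, as _detrend() does per window
print(detrended)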
class Tokenizer(AbstractBolt):
    """Split the mail into token parts (body, attachments, etc.)."""

    outputs = [
        Stream(fields=['sha256_random', 'mail', 'is_filtered'],
               name='mail'),
        Stream(fields=['sha256_random', 'body', 'is_filtered'],
               name='body'),
        Stream(fields=['sha256_random', 'with_attachments', 'attachments'],
               name='attachments')
    ]

    def initialize(self, stormconf, context):
        super(Tokenizer, self).initialize(stormconf, context)

        self._parser = MailParser()
        self._mails_analyzed = deque(maxlen=self.conf["maxlen_mails"])
        self._attachments_analyzed = deque(
            maxlen=self.conf["maxlen_attachments"])

        self._load_filters()

    def _load_filters(self):
        self._filter_mails_enabled = self.conf["filter_mails"]
        self._filter_attachments_enabled = self.conf["filter_attachments"]

    @property
    def filter_mails_enabled(self):
        return self._filter_mails_enabled

    @property
    def filter_attachments_enabled(self):
        return self._filter_attachments_enabled

    @property
    def parser(self):
        return self._parser

    def _filter_attachments(self):
        """Filter the attachments that are in memory, already analyzed."""
        attachments = self.parser.attachments_list
        new_attachments = []

        for i in attachments:
            if i.get("content_transfer_encoding") == "base64":
                f = fingerprints(i["payload"].decode('base64'))
            else:
                f = fingerprints(i["payload"])

            if self.filter_attachments_enabled and \
                    f[1] in self._attachments_analyzed:
                new_attachments.append({
                    "md5": f[0],
                    "sha1": f[1],
                    "sha256": f[2],
                    "sha512": f[3],
                    "ssdeep": f[4],
                    "is_filtered": True
                })
            else:
                i["is_filtered"] = False
                new_attachments.append(i)

            self._attachments_analyzed.append(f[1])

        return new_attachments

    def _make_mail(self, tup):
        raw_mail = tup.values[0]
        mail_format = tup.values[4]
        rand = '_' + ''.join(random.choice('0123456789') for i in range(10))

        # Check if kind_data is correct
        if mail_format != STRING and mail_format != PATH:
            raise InvalidMailFormat(
                "Invalid mail format '{}'. Choose '{}' or '{}'".format(
                    mail_format, STRING, PATH))

        # Parsing mail
        if mail_format == PATH:
            if os.path.exists(raw_mail):
                self.parser.parse_from_file(raw_mail)
        else:
            self.parser.parse_from_string(raw_mail)

        # Getting all parts
        mail = self.parser.parsed_mail_obj

        # Data mail sources
        mail['mail_server'] = tup.values[1]
        mail['mailbox'] = tup.values[2]
        mail['priority'] = tup.values[3]

        # Fingerprints of mail body
        (mail['md5'], mail['sha1'], mail['sha256'], mail['sha512'],
         mail['ssdeep']) = fingerprints(self.parser.body.encode('utf-8'))
        sha256_rand = mail['sha256'] + rand

        # Add path to result
        if mail_format == PATH:
            mail['path_mail'] = raw_mail

        # Dates
        if mail.get('date'):
            mail['date'] = mail.get('date').isoformat()
        else:
            mail['date'] = datetime.datetime.utcnow().isoformat()

        mail['analisys_date'] = datetime.datetime.utcnow().isoformat()

        # Remove attachments
        mail.pop("attachments", None)

        return sha256_rand, raw_mail, mail

    def process_tick(self, freq):
        """Every freq seconds the configuration is reloaded."""
        super(Tokenizer, self).process_tick(freq)
        self._load_filters()

    def process(self, tup):
        sha256_rand, raw_mail, mail = self._make_mail(tup)

        with_attachments = False
        attachments = []

        # If mail is already analyzed
        if self.filter_mails_enabled and \
                mail["sha1"] in self._mails_analyzed:
            mail.pop("body", None)
            body = ""
            is_filtered = True
        else:
            body = self.parser.body
            is_filtered = False

        # Emit mail
        self.emit([sha256_rand, mail, is_filtered], stream="mail")

        # Emit body
        self.emit([sha256_rand, body, is_filtered], stream="body")

        # Update the database of analyzed mails
        self._mails_analyzed.append(mail["sha1"])

        # Emit only attachments
        if self.parser.attachments_list:
            attachments = self._filter_attachments()
            with_attachments = True

        self.emit([sha256_rand, with_attachments, attachments],
                  stream="attachments")
class DetrendBolt(Bolt):
    tup_fields = ('timestamp', 'time_offset', 'time_increment', 'samples',
                  'channel_name', 'module_name', 'data')
    outputs = [Stream(fields=tup_fields, name='detrend')]

    def initialize(self, storm_conf, context):
        """
        receive parameters set in the topology definition via the storm_conf argument

        :param dict storm_conf: the Storm configuration for this component
        :param dict context: information about the component's place within the topology
        """
        self.min_tup_num = storm_conf.get('min_tup_num', 3)

    def process(self, tup):
        """
        step 1: receive tup and convert it to a pandas.DataFrame;
        step 2: call self._detrend() to remove the mode value from the original data;
        step 3: convert the resulting pandas.DataFrame to a list and emit it.

        :param streamparse.Tuple tup: tup.values = [timestamp, time_offset,
            time_increment, samples, channel_name, module_name, data]
        :return streamparse.Tuple tup: tup.values = [timestamp, time_offset,
            time_increment, samples, channel_name, module_name, data]
        """
        timestamp = tup.values[0]
        time_increment = tup.values[2]
        channel_name = tup.values[4]
        data = tup.values[6]

        start_time = pd.Timestamp(timestamp, unit='s', tz='UTC')
        periods = len(data)
        freq = '{}ms'.format(int(time_increment / 0.001))
        index = pd.MultiIndex.from_product(
            [[start_time],
             pd.date_range(start=start_time, periods=periods, freq=freq)],
            names=['start_time', 'timestamp'])
        df = pd.DataFrame(data=data, index=index, columns=[channel_name])

        try:
            self.history = self.history.combine_first(df)
        except AttributeError:
            self.history = df

        self.res = list(tup.values)
        for df in self._detrend():
            self.res[0] = df.index[0].timestamp()
            self.res[6] = df[channel_name].values.tolist()
            self.emit(self.res, stream='detrend')

    def _detrend(self):
        """
        remove the mode value from the original data

        :return pandas.DataFrame df: de-trended signal segments
        """
        self.log('\nlength of history data: {}'.format(len(self.history)),
                 level='info')
        start_time_index = self.history.index.get_level_values(
            level='start_time')
        unique_values = start_time_index.unique()
        self.log('\nstart time of tuples: {}'.format(unique_values))

        while len(unique_values) >= self.min_tup_num:
            mode = self.history.mode()
            df = self.history.loc[unique_values[0]] - mode.loc[0].values
            self.history.drop(index=unique_values[0], level='start_time',
                              inplace=True)
            unique_values = unique_values.delete(0)
            yield df
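# This DetrendBolt variant keys its history buffer by (start_time, timestamp)
# so whole tuples can be detrended and dropped in arrival order. A toy sketch
# of that bookkeeping outside of Storm (channel name and values are made up):
import pandas as pd


def make_block(start, values):
    ts = pd.date_range(start=start, periods=len(values), freq='250ms', tz='UTC')
    idx = pd.MultiIndex.from_product([[ts[0]], ts],
                                     names=['start_time', 'timestamp'])
    return pd.DataFrame({'FCXF-X-02-T01': values}, index=idx)


history = make_block('2021-01-01 00:00:00', [5.0, 5.0, 5.1, 5.0]).combine_first(
    make_block('2021-01-01 00:00:01', [5.2, 5.0, 5.0, 5.1]))

starts = history.index.get_level_values('start_time').unique()
mode = history.mode()                                  # per-column mode of the buffer
oldest = history.loc[starts[0]] - mode.loc[0].values   # detrend the oldest tuple
history = history.drop(index=starts[0], level='start_time')  # then discard it
print(oldest)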
class NeutralAxisBolt(Bolt):
    tup_fields = ('timestamp_min', 'timestamp_max', 'neutral_axis')
    outputs = [Stream(fields=tup_fields, name='neutral_axis')]

    def initialize(self, storm_conf, context):
        """
        receive parameters set in the topology definition via the storm_conf argument

        :param dict storm_conf: the Storm configuration for this component
        :param dict context: information about the component's place within the topology
        """
        threshold = storm_conf['threshold']
        group_freq = storm_conf['group_freq']
        height = storm_conf['height']
        height = dict(height)
        columns = height.keys()
        index = pd.MultiIndex.from_tuples([],
                                          names=['group_time', 'sample_time'])

        self.threshold = threshold
        self.group_freq = group_freq
        self.height = pd.Series(height, name='height')
        self.height.sort_values(ascending=True, inplace=True)
        self.history = pd.DataFrame(data=[], index=index, columns=columns)

    def process(self, tup):
        """
        step 1: receive tup and reorganize it into a pandas.DataFrame;
        step 2: call self._neutral_axis() to get the neutral axis height of a section;
        step 3: convert the result to a list and emit it.

        :param streamparse.Tuple tup: tup.values = [timestamp, time_offset,
            time_increment, samples, channel_name, module_name, data]
        :return streamparse.Tuple tup: tup.values = [minima_timestamp,
            maxima_timestamp, neutral_axis]
        """
        # convert tuple to pandas.DataFrame with pandas.MultiIndex
        timestamp = tup.values[0]
        time_increment = tup.values[2]
        channel_name = tup.values[4]
        data = tup.values[6]

        start = pd.Timestamp(timestamp, unit='s', tz='UTC')
        periods = len(data)
        freq = '{}ms'.format(int(time_increment / 0.001))
        sample_time = pd.date_range(start=start, periods=periods, freq=freq)
        group_time = sample_time.floor(self.group_freq)
        index = pd.MultiIndex.from_arrays([group_time, sample_time],
                                          names=['group_time', 'sample_time'])
        df = pd.DataFrame(data=data, index=index, columns=[channel_name])

        # self.history holds all history data
        self.history = self.history.combine_first(df)
        # todo: discard history data 10 minutes before the latest data

        # group self.history by the <group_time> index level
        notnull_count = self.history.groupby(
            level='group_time').apply(lambda x: x.notnull().sum().sum())
        count = self.history.columns.size * pd.Timedelta(
            self.group_freq) / pd.Timedelta(freq)
        notnull_count = notnull_count[notnull_count >= count]

        if not notnull_count.empty:
            # extract data ready for processing and discard it from the history
            data_ready = self.history.loc[notnull_count.index]
            self.history.drop(index=notnull_count.index, level='group_time',
                              inplace=True)
            res = data_ready.groupby(level='group_time').apply(
                self._neutral_axis)
            for idx in res.index:
                self.emit(res[idx], stream='neutral_axis')

    def _neutral_axis(self, df):
        self.log('---------------executing neutral_axis method---------------')
        master_channel = self.height.index[0]
        if df[master_channel].ptp() > self.threshold:
            idx_max = df[master_channel].idxmax()
            idx_min = df[master_channel].idxmin()
            ptp = df.loc[idx_max] - df.loc[idx_min]
            ptp.rename('ptp', inplace=True)
            xy = pd.concat([ptp, self.height], axis='columns')
            # deg = 1 stands for a linear fit; the lowest polynomial
            # coefficient (the y-intercept) is the height of the neutral axis
            neutral_axis = polyfit(xy['ptp'], xy['height'], 1)[-1]
            return [
                idx_min[1].timestamp(),
                idx_max[1].timestamp(),
                neutral_axis
            ]
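# The linear fit at the end of _neutral_axis() can be checked in isolation:
# plotting peak-to-peak response against sensor height, numpy.polyfit(..., 1)
# returns [slope, intercept], and the intercept is the height at which the
# response range would be zero, i.e. the neutral axis. The numbers below are
# made up and chosen to lie exactly on a line.
import numpy as np

ptp = np.array([120.0, 60.0, -30.0, -90.0])    # peak-to-peak values per sensor
height = np.array([0.20, 0.50, 0.95, 1.25])    # sensor heights (metres)

slope, neutral_axis = np.polyfit(ptp, height, 1)
print(round(neutral_axis, 3))                  # -> 0.8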