def execute(self, top_num, bolt_num, rid, tuple_batch, collector, mmp_list):
    """Fold a batch into the running counter and acknowledge the total.

    The second element of every incoming tuple payload is added to
    ``self.counter``. A single ``('result', total)`` tuple is then acked
    through the collector and the counter is reset to zero.

    Args:
        top_num: Topology identifier, forwarded unchanged to the collector.
        bolt_num: Identifier of this bolt, forwarded unchanged.
        rid: Request/batch identifier, forwarded unchanged.
        tuple_batch: Incoming batch; its ``tuple_list`` items expose the
            payload as ``.tup`` and its ``timestamp`` is reused for the
            outgoing batch.
        collector: Object whose ``ack`` method receives the result batch.
        mmp_list: Not used by this method.
    """
    result_batch = TupleBatch(tuple_batch.timestamp)

    # Accumulate one item at a time so the counter is updated exactly as
    # each payload is read.
    for item in tuple_batch.tuple_list:
        self.counter += item.tup[1]

    result_batch.add_tuple(Tuple(('result', self.counter)))
    collector.ack(top_num, bolt_num, result_batch, rid)

    # Start the next batch from a clean slate.
    self.counter = 0
def execute(self, top_num, bolt_num, rid, tuple_batch, collector, mmp_list):
    """Split every incoming line into words and emit them to the next bolt.

    Newline characters are removed from each payload, the remainder is
    split on single spaces, and each resulting piece (including empty
    strings produced by consecutive spaces) becomes its own tuple.

    Args:
        top_num: Topology identifier, forwarded unchanged to the collector.
        bolt_num: Identifier of this bolt; the batch is emitted to
            ``bolt_num + 1``.
        rid: Request/batch identifier, forwarded unchanged.
        tuple_batch: Incoming batch; each item's ``.tup`` is a string.
        collector: Object whose ``emit`` method receives the word batch.
        mmp_list: Not used by this method.
    """
    out_batch = TupleBatch(tuple_batch.timestamp)

    for incoming in tuple_batch.tuple_list:
        line = incoming.tup.replace("\n", "")
        for token in line.split(' '):
            out_batch.add_tuple(Tuple(token))

    collector.emit(top_num, bolt_num + 1, out_batch, rid)
def execute(self, top_num, bolt_num, rid, tuple_batch, collector, mmp_list):
    """Emit one ``(url, share)`` tuple per URL listed after the first field.

    Each payload is a tab-separated line. The first field is skipped and
    every remaining field is emitted with the value
    ``1 / (number_of_remaining_fields + 1)``.

    Args:
        top_num: Topology identifier, forwarded unchanged to the collector.
        bolt_num: Identifier of this bolt; the batch is emitted to
            ``bolt_num + 1``.
        rid: Request/batch identifier, forwarded unchanged.
        tuple_batch: Incoming batch; each item's ``.tup`` is a string.
        collector: Object whose ``emit`` method receives the output batch.
        mmp_list: Not used by this method.
    """
    out_batch = TupleBatch(tuple_batch.timestamp)

    for incoming in tuple_batch.tuple_list:
        fields = incoming.tup.replace("\n", "").split('\t')
        # Everything after the first field is treated as a URL.
        # NOTE(review): the first field is presumably the source page -
        # confirm against the spout's input format.
        urls = fields[1:]
        # The share is identical for every URL on the line, so compute it
        # once. The denominator is at least 1, so this cannot divide by zero.
        share = 1 / (len(urls) + 1)
        for url in urls:
            out_batch.add_tuple(Tuple((url, share)))

    collector.emit(top_num, bolt_num + 1, out_batch, rid)
def execute(self, top_num, bolt_num, rid, tuple_batch, collector, mmp_list):
    """Sum the ranks per URL for one batch and acknowledge the totals.

    Every payload is unpacked as ``(url, rank)`` and added to the entry for
    that URL in ``self.ranks``. All accumulated ``(url, total)`` pairs are
    then acked through the collector and ``self.ranks`` is cleared.

    Args:
        top_num: Topology identifier, forwarded unchanged to the collector.
        bolt_num: Identifier of this bolt, forwarded unchanged.
        rid: Request/batch identifier, forwarded unchanged.
        tuple_batch: Incoming batch; each item's ``.tup`` is a two-element
            ``(url, rank)`` sequence.
        collector: Object whose ``ack`` method receives the result batch.
        mmp_list: Not used by this method.
    """
    result_batch = TupleBatch(tuple_batch.timestamp)

    for incoming in tuple_batch.tuple_list:
        url, rank = incoming.tup
        # Single lookup with a default instead of a membership test
        # followed by a second lookup.
        self.ranks[url] = self.ranks.get(url, 0) + rank

    for url, total in self.ranks.items():
        result_batch.add_tuple(Tuple((url, total)))

    collector.ack(top_num, bolt_num, result_batch, rid)

    # Totals are per batch; drop them once they have been acked.
    self.ranks.clear()
def execute(self, top_num, bolt_num, rid, tuple_batch, collector, mmp_list):
    """Count occurrences of each word in one batch and acknowledge the counts.

    Every payload is used as a key in ``self.counts`` and its count is
    incremented by one. All accumulated ``(word, count)`` pairs are then
    acked through the collector and ``self.counts`` is cleared.

    Args:
        top_num: Topology identifier, forwarded unchanged to the collector.
        bolt_num: Identifier of this bolt, forwarded unchanged.
        rid: Request/batch identifier, forwarded unchanged.
        tuple_batch: Incoming batch; each item's ``.tup`` is a hashable
            word.
        collector: Object whose ``ack`` method receives the result batch.
        mmp_list: Not used by this method.
    """
    result_batch = TupleBatch(tuple_batch.timestamp)

    for incoming in tuple_batch.tuple_list:
        word = incoming.tup
        # Single lookup with a default instead of a membership test
        # followed by a second lookup.
        self.counts[word] = self.counts.get(word, 0) + 1

    for word, count in self.counts.items():
        result_batch.add_tuple(Tuple((word, count)))

    collector.ack(top_num, bolt_num, result_batch, rid)

    # Counts are per batch; drop them once they have been acked.
    self.counts.clear()
def start_top(self):
    """Drain the current topology's spout, emitting its tuples in batches.

    Tuples are pulled from the spout one at a time and collected into a
    ``TupleBatch``. A batch is emitted as soon as it holds
    ``CRANE_BATCH_SIZE`` tuples. When the spout returns a falsy value the
    batch in progress is emitted (even if it is empty) and reading stops.
    Afterwards the monitor thread is started.
    """
    topology = self.topology_list[self.topology_num]
    print(self.prefix, topology.name, " starting...")

    batch = TupleBatch(time.time())
    while True:
        raw = topology.spout.next_tup()
        if not raw:
            # Spout exhausted: flush whatever has been collected so far.
            self.emit(batch, self.topology_num)
            break

        batch.add_tuple(Tuple(raw))
        if len(batch.tuple_list) >= CRANE_BATCH_SIZE:
            # Batch is full: send it and start a fresh one with a new
            # timestamp.
            self.emit(batch, self.topology_num)
            batch = TupleBatch(time.time())

    print(self.prefix + 'All tuples transmitted. Spout closed down.')
    self.monitor_thread.start()
class URL(object):
    """Mutable, sequence-like view of a split URL.

    The components named by ``SplitResult._fields`` are stored in the
    instance ``__dict__`` (a ``_Dict``), so they are readable as attributes
    and through ``url[key]``. Two derived values, ``basename`` and
    ``subdomain``, are kept in sync whenever ``path`` or ``netloc`` is
    written.

    NOTE(review): ``Tuple`` is defined elsewhere. This class relies on it
    supporting ``keys * values`` (with a result accepted by ``dict()``),
    ``.index(value, default)``, ``.get(position)``, slicing, ``len()`` and
    iteration - confirm against its definition.
    """

    class _Dict(dict):
        """Instance dictionary that maintains the derived URL fields."""

        # Keys that are computed from other components and must not be
        # assigned directly through ``URL.__setitem__``.
        _extra = {'basename', 'subdomain'}

        def regular(self, key):
            """Return True if *key* is present and is not a derived field."""
            return key in self and key not in self._extra

        def __setitem__(self, key, value):
            """Store *value* and refresh the derived field tied to *key*."""
            # Compare by equality: identity checks against string or int
            # literals depend on interning and are not reliable.
            if key == 'path':
                super().__setitem__(
                    'basename', os.path.basename(value) if value else '')
            elif key == 'netloc':
                domain = value.rsplit('.', 2)
                # Only a netloc with at least three dot-separated parts
                # has a subdomain; it is everything before the last two.
                super().__setitem__(
                    'subdomain', domain[0] if len(domain) == 3 else '')
            return super().__setitem__(key, value)

    _keys = Tuple(SplitResult._fields)
    # Class-level fallbacks; the per-instance values are written by
    # ``_Dict.__setitem__``.
    basename = None
    subdomain = None

    def __init__(self, url=None, cut=None):
        """Build a URL from a string, a mapping, or a sequence of components.

        Args:
            url: A URL string (parsed with ``urlsplit``), a dict of
                component values, any other non-None value that is paired
                with the component names via ``_keys * url``, or None for
                an empty URL.
            cut: Component name or integer position. Components from this
                position onwards are set to the empty string instead of
                being taken from *url*. By default nothing is cut.
        """
        # Replacing __dict__ routes item writes through _Dict.__setitem__.
        self.__dict__ = self._Dict()
        _keys = self._keys

        if isinstance(url, str):
            data = _keys * urlsplit(url)
        elif isinstance(url, dict):
            data = url
        elif url is not None:
            data = _keys * url
        else:
            data = []

        index = len(_keys)
        if isinstance(cut, str):
            # Second argument is passed as a fallback position.
            # NOTE(review): assumes Tuple.index accepts a default - confirm.
            index = _keys.index(cut, index)
        elif isinstance(cut, int):
            index = cut

        _dict = dict(data)
        for k in _keys[:index]:
            self._set(k, _dict.get(k, ''))
        for k in _keys[index:]:
            self._set(k)

    def _set(self, key, value=''):
        """Write *value* through the ``_Dict`` so derived fields update."""
        self.__dict__[key] = value

    def _regular(self, key):
        """Return True if *key* is a stored, non-derived component."""
        return self.__dict__.regular(key)

    def _key(self, key):
        """Translate a non-string key into a component name."""
        return key if isinstance(key, str) else self._keys.get(key)

    def __getitem__(self, key):
        """Return the component selected by name or by non-string key."""
        return self.__dict__[self._key(key)]

    def __setitem__(self, key, value):
        """Assign a component; unknown and derived keys are ignored."""
        key = self._key(key)
        if self._regular(key):
            self._set(key, value)

    def __len__(self):
        """Return the number of URL components."""
        return len(self._keys)

    def __iter__(self):
        """Yield the component values in ``_keys`` order."""
        for k in self._keys:
            yield self[k]

    def __str__(self):
        """Return the URL reassembled with ``urlunsplit``."""
        return urlunsplit(self)

    def format(self, *args, **kw):
        """Apply ``str.format`` to the string form of the URL."""
        return str(self).format(*args, **kw)

    def __repr__(self):
        """Return ``ClassName(key='value', ...)`` for all components."""
        _name = self.__class__.__name__
        data = (f"{k}='{self[k]}'" for k in self._keys)
        return f"{_name}({', '.join(data)})"