Example #1
0
    def process(self, tuple):
        """Parse one Chicago-crime CSV record and emit it as JSON.

        NOTE(review): the parameter name shadows the builtin ``tuple``.
        """
        val = tuple.values[0]
        # Quote-aware CSV split: break on commas only when an even number of
        # double quotes follows (i.e. the comma is outside a quoted field).
        line = re.compile(",(?=(?:[^\"]*\"[^\"]*\")*[^\"]*$)").split(val)

        cct = ChicagoCrimeObject()
        ccl = ChicagoCrimeLocation()
        ccb = ChicagoCrimeBeat()

        # Map positional CSV columns onto the crime / location / beat objects.
        cct.id = str(line[0])
        cct.case_number = str(line[1])
        cct.date = str(line[2])
        cct.block = str(line[3])
        cct.iucr = str(line[4])
        cct.primary_type = str(line[5])
        cct.description = str(line[6])
        ccl.location_description = str(line[7])
        ccl.location = str(line[21])
        ccl.longitude = str(line[19])
        ccl.latitude = str(line[20])
        ccl.x_coordinate = str(line[15])
        ccl.y_coordinate = str(line[16])
        # Location and beat are nested into the crime object as JSON strings.
        cct.location = ccl.toJSON()
        cct.arrest = str(line[8])
        cct.domestic = str(line[9])
        ccb.beat = str(line[10])
        ccb.community_area = str(line[13])
        ccb.district = str(line[11])
        ccb.ward = str(line[12])
        cct.beat = ccb.toJSON()
        cct.fbi_code = str(line[14])
        cct.year = str(line[17])
        cct.updated_on = str(line[18])

        log.info(cct.toJSON())
        storm.emit([cct.toJSON()])
    def nextTuple(self):
        """
        Read a batch of records from Kafka and emit each message value.
        messages (m) are namedtuples with attributes:
        m.offset: message offset on topic-partition log (int)
        m.value: message (output of deserializer_class - default is raw bytes)
        """
        if self.consumer is None:
            log.debug("self.consumer is not ready yet.")
            return

        # log.debug("ExpSpout.nextTuple()")
        # time.sleep(3)  # slow down to observe the prototype
        try:
            for message in self.consumer:
                if message is not None:
                    # log.warning("offset: %s \t value: %s \t at %s", message.offset, message.value, time.time())
                    if self.counter == 0:
                        # First record ever seen: mark the start timestamp.
                        log.warning(
                            "start process 1000000 records at {0} (timestamp)".
                            format(time.time()))
                    self.counter += 1
                    # self.emit_thread.append(message.value)
                    storm.emit([message.value])
                # Progress marker every 10000 processed records.
                if self.counter % 10000 == 0:
                    log.warning(
                        "finish process {0} records at {1} (timestamp@{2})".
                        format(self.counter, time.time(),
                               socket.gethostname()))
        except Exception as inst:
            log.debug("Exception Type: %s ; Args: %s", type(inst), inst.args)
    def nextTuple(self):
        """
        Read a batch of records from Kafka and emit each message value.
        messages (m) are namedtuples with attributes:
        m.offset: message offset on topic-partition log (int)
        m.value: message (output of deserializer_class - default is raw bytes)
        """
        if self.consumer is None:
            log.debug("self.consumer is not ready yet.")
            return

        # log.debug("ExpSpout.nextTuple()")
        # time.sleep(3)  # slow down to observe the prototype
        try:
            for message in self.consumer:
                if message is not None:
                    # log.warning("offset: %s \t value: %s \t at %s", message.offset, message.value, time.time())
                    if self.counter == 0:
                        # First record: mark the start timestamp.
                        log.warning("start process 1000000 records at {0} (timestamp)".format(time.time()))
                    self.counter += 1
                    # self.emit_thread.append(message.value)
                    storm.emit([message.value])
                # Progress marker every 10000 processed records.
                if self.counter % 10000 == 0:
                    log.warning("finish process {0} records at {1} (timestamp@{2})".format(self.counter, time.time(),
                                                                                           socket.gethostname()))
        except Exception as inst:
            log.debug("Exception Type: %s ; Args: %s", type(inst), inst.args)
Example #4
0
 def process(self, tup):
     """Count the incoming word and emit (word, count).

     NOTE(review): the unconditional ``raise`` below makes everything after
     it unreachable — it looks like deliberate fault injection to exercise
     the topology's failure handling; confirm before removing.
     """
     raise ValueError('abc')
     log.debug('WordCountBolt.process() called with: %s', tup)
     word = tup.values[0]
     self._count[word] += 1
     log.debug('WordCountBolt.process() emitting: %s', [word, self._count[word]])
     storm.emit([word, self._count[word]])
Example #5
0
 def process(self, tup):
     """Tally one occurrence of the incoming word and emit (word, count)."""
     log.debug('WordCountBolt.process() called with: %s', tup)
     token = tup.values[0]
     self._count[token] += 1
     out = [token, self._count[token]]
     log.debug('WordCountBolt.process() emitting: %s', out)
     storm.emit(out)
 def process(self, tup):
     """
     Split the received CSV line into fields and pass only the fields of
     interest on to the next bolt.
     """
     if tup.is_tick_tuple():
         log.debug("tuple is tick")
     else:
         # log.warning("get tuple whose id is {0}".format(tup.id))
         line = tup.values[0]
         line = line.strip()[8:]  # remove "message:" added by fluentd
         # log.warning("SplitBolt process: %s", line.strip())
         raw_row = line.split(",")
         # Only well-formed rows (exactly 47 columns) are forwarded.
         if len(raw_row) == 47:
             storm.emit([raw_row[6], raw_row[4], raw_row[17], raw_row[18]])
             if self.counter == 0:
                 log.warning(
                     "start process 1000000 records at {0} (timestamp@{1})".format(time.time(), socket.gethostname())
                 )
             self.counter += 1
             if self.counter == 1000000:  # this won't work since more than on instance
                 log.warning(
                     "finish process 1000000 records at {0} (timestamp@{1})".format(
                         time.time(), socket.gethostname()
                     )
                 )
Example #7
0
    def nextTuple(self):
        """
        Read a batch of records from Kafka and emit each message value.
        messages (m) are namedtuples with attributes:
        m.offset: message offset on topic-partition log (int)
        m.value: message (output of deserializer_class - default is raw bytes)
        """
        if self.consumer is None:
            log.debug("self.consumer is not ready yet.")
            return

        log.debug("ExpSpout.nextTuple()")
        time.sleep(3)  # slow down to observe the prototype
        cursor = 0
        try:
            for message in self.consumer:
                cursor += 1
                if message is not None:
                    log.debug("offset: %s \t value: %s", message.offset,
                              message.value)
                    storm.emit([message.value])
                if cursor > 10000:  # cap the batch size for prototype observation
                    break
        except NoPartitionsForConsumerException:
            log.debug("NoPartitionsForConsumerException")
Example #8
0
 def process(self, tup):
     """Pair up consecutive words and emit each pair as 'second first'.

     NOTE(review): the parity guard checks the length of the raw string
     (``tup.values[0]``), not the number of words — likely intended to
     check that the word count is even; confirm against the caller.
     """
     count =0
     if len(tup.values[0])>1 and (len(tup.values[0])%2)==0:
         for word in self.get_words(tup.values[0]):
             if count ==0:
                 # Remember the first word of the pair.
                 helpcount = word
                 count+=1
             else:
                 # Second word of the pair: emit "<second> <first>".
                 word2 = word.encode('utf-8') + ' ' + helpcount.encode('utf-8')
                 count =0
                 storm.emit([word2])
Example #9
0
 def process(self, tup):
     """Maintain a bounded ranking of tuples; flush it on tick tuples.

     On tick: emit every ranked item. Otherwise: insert the tuple keyed by
     its first value and, if over capacity, evict the lowest-ranked entry.
     Uses Python 2 ``itervalues()``.
     """
     if tup.is_tick_tuple():
         for t in self.rankedItems.itervalues():
             storm.emit(t.values)
     else:
         self.rankedItems[tup.values[0]] = tup
         if len(self.rankedItems) > self.maxSize:
             # Sorted ascending by rank key; delete only the first (lowest).
             for t in sorted(self.rankedItems.itervalues(),
                             key=tup_sort_key):
                 del self.rankedItems[t.values[0]]
                 break
Example #10
0
 def process(self, tup):
     """On each tick, union the recent per-window Redis word-count zsets
     and emit the current top entries with their scores.

     Uses Python 2 ``xrange``.
     """
     if tup.is_tick_tuple():
         now = time.time()
         # Align 'now' down to the start of the current window.
         now_floor = int(math.floor(now / self.window_duration) * self.window_duration)
         first_window = int(now_floor - self.num_windows * self.window_duration)
         # Merge the last num_windows per-window zsets into one ranking.
         self.conn.zunionstore(
             'twitter_word_count',
             ['twitter_word_count:%s' % t for t in xrange(first_window, now_floor)])
         for t in self.conn.zrevrange('twitter_word_count', 0, self.maxSize, withscores=True):
             log.info('Emitting: %s', repr(t))
             storm.emit(t)
Example #11
0
 def fail(self, msg_id):
     """
     Re-emit the failed tuple under a new id made of a 'fail_' prefix plus
     the failed tuple's id, and move its cache entry to the new id.
     :param msg_id: id of failed tuple
     """
     # log.warning("fail of message #{0}".format(msg_id))
     retry_id = "fail_{0}".format(msg_id)
     payload = self.message_pool[msg_id]
     self.message_pool[retry_id] = payload
     storm.emit([payload], id=retry_id)  # emit message again
     del self.message_pool[msg_id]
 def process(self, tup):
     """Bounded ranking bolt: flush on tick, otherwise insert and evict
     the lowest-ranked entry when over ``maxSize`` (Python 2 ``itervalues``).
     """
     if tup.is_tick_tuple():
         for t in self.rankedItems.itervalues():
             storm.emit(t.values)
     else:
         self.rankedItems[tup.values[0]] = tup
         if len(self.rankedItems) > self.maxSize:
             # Ascending sort; drop only the first (lowest-ranked) item.
             for t in sorted(
                     self.rankedItems.itervalues(), key=tup_sort_key):
                 del self.rankedItems[t.values[0]]
                 break
Example #13
0
 def fail(self, msg_id):
     """
     emit message again with id which is composed with a prefix and the failed tuple.id
     :param msg_id: id of failed tuple
     """
     # log.warning("fail of message #{0}".format(msg_id))
     fail_id = "fail_{0}".format(msg_id)
     # Re-key the cached message under the retry id before re-emitting.
     fail_message = self.message_pool[msg_id]
     self.message_pool[fail_id] = fail_message
     storm.emit([fail_message], id=fail_id)  # emit message again
     del self.message_pool[msg_id]
Example #14
0
 def nextTuple(self):
     """Consume Kafka messages shaped '<digit> <text>': store the text in
     the per-user 'INSTANT<digit>' Mongo collection and emit (text, user).

     Fixes the original's mixed tab/space indentation, which is a
     SyntaxError under Python 3 (structure confirmed by the clean
     duplicate of this method elsewhere in the file).
     """
     for message in self.consumer:
         algo = message.value
         if len(algo) > 4:
             user = algo[:1]  # leading digit selects the target DB
             if user.isdigit():
                 aux = 'INSTANT' + user
                 algo = algo[2:len(algo)]  # strip "<digit> " prefix
                 if algo[0] == ' ':
                     algo = algo[1:len(algo)]
                 self.db[aux].tweet.insert_one({'tweet': algo})
                 storm.emit([algo, user])
    def run(self):
        """Emitter loop: drain queued messages, emitting each one and
        logging a progress marker every 10000 records; idle-sleep when
        the queue is empty.
        """
        while True:
            if len(self.messages) != 0:
                # log.warning(self.messages.pop(0))
                storm.emit([self.messages.pop(0)])
                self.counter += 1
                if self.counter % 10000 == 0:
                    log.warning("emit process {0} records at {1} (timestamp@{2})".format(self.counter, time.time(),
                                                                                         socket.gethostname()))

            else:
                time.sleep(0.01)
Example #16
0
 def nextTuple(self):
     """Consume Kafka messages shaped '<digit> <text>': store the text in
     the per-user 'INSTANT<digit>' Mongo collection and emit (text, user).
     """
     for message in self.consumer:
         algo = message.value
         if (len(algo) > 4):
             user = algo[:1]  # leading digit selects the target DB
             if user.isdigit():
                 aux = 'INSTANT' + user
                 algo = algo[2:len(algo)]  # strip "<digit> " prefix
                 if (algo[0] == ' '):
                     algo = algo[1:len(algo)]
                 self.db[aux].tweet.insert_one({'tweet': algo})
                 storm.emit([algo, user])
Example #17
0
 def run(self):
     """Emitter loop: drain queued messages, log every 10000 records and
     mark the time when the millionth record is emitted; idle-sleep when
     the queue is empty.
     """
     while True:
         if len(self.messages) != 0:
             # log.warning(self.messages.pop(0))
             storm.emit([self.messages.pop(0)])
             self.counter += 1
             if self.counter % 10000 == 0:
                 log.warning("#{0}".format(self.counter))
             if self.counter == 1000000:  # mark time
                 log.warning("emit process 1000000 records at {0} (timestamp)".format(time.time()))
         else:
             time.sleep(0.01)
Example #18
0
    def nextTuple(self):
        """Consume Kafka messages shaped '<digit> <text>': store the text
        in the per-user 'BOARD<digit>' .bad Mongo collection, UTF-8 encode
        it and emit (text, user).

        Fixes the original's mixed tab/space indentation, which is a
        SyntaxError under Python 3 (structure matches the parallel
        consumers elsewhere in the file).
        """
        for message in self.consumer:
            algo = message.value
            if len(algo) > 4:
                user = algo[:1]  # leading digit selects the target DB
                if user.isdigit():
                    aux = 'BOARD' + user
                    algo = algo[2:len(algo)]  # strip "<digit> " prefix
                    if algo[0] == ' ':
                        algo = algo[1:len(algo)]
                    self.db[aux].bad.insert_one({'tweet': algo})
                    algo = algo.encode('utf-8', 'ignore')
                    storm.emit([algo, user])
Example #19
0
    def nextTuple(self):
        """Emit one uniformly random sentence from ``self.sentences`` every
        0.25 seconds.

        Uses ``random.choice`` instead of indexing with ``randint`` and
        drops the long-dead commented-out sequential-index variant.
        """
        time.sleep(0.25)
        sentence = random.choice(self.sentences)
        log.debug('randomsentence emitting: %s', sentence)
        storm.emit([sentence])
Example #20
0
    def nextTuple(self):
        """Emit one uniformly random sentence from ``self.sentences`` every
        0.25 seconds.

        Uses ``random.choice`` instead of indexing with ``randint``,
        removes a stray trailing semicolon and the dead commented-out
        sequential-index variant.
        """
        time.sleep(0.25)
        sentence = random.choice(self.sentences)
        log.debug('randomsentence emitting: %s', sentence)
        storm.emit([sentence])
Example #21
0
 def nextTuple(self):
     """Consume Kafka messages shaped '<digit> <text>': store the text in
     the per-user 'BOARD<digit>' .good Mongo collection, UTF-8 encode it
     and emit (text, user).
     """
     for message in self.consumer:
         algo = message.value
         if (len(algo) > 4):
             user = algo[:1]  # leading digit selects the target DB
             if user.isdigit():
                 aux = 'BOARD' + user
                 algo = algo[2:len(algo)]  # strip "<digit> " prefix
                 if (algo[0] == ' '):
                     algo = algo[1:len(algo)]
                 self.db[aux].good.insert_one({'tweet': algo})
                 algo = algo.encode('utf-8', 'replace')
                 storm.emit([algo, user])
Example #22
0
    def nextTuple(self):
        """Consume Kafka messages shaped '<digit> <text>': store the text
        in the per-user 'BOARD<digit>' .spam Mongo collection, UTF-8 encode
        it and emit (text, user).

        Fixes the original's mixed tab/space indentation, which is a
        SyntaxError under Python 3 (structure confirmed by the clean
        duplicate of this method elsewhere in the file).
        """
        # file = open('/home/pipe/twitterintel/topology/text.txt','a')
        for message in self.consumer:
            algo = message.value
            if len(algo) > 4:
                user = algo[:1]  # leading digit selects the target DB
                if user.isdigit():
                    aux = 'BOARD' + user
                    algo = algo[2:len(algo)]  # strip "<digit> " prefix
                    if algo[0] == ' ':
                        algo = algo[1:len(algo)]
                    self.db[aux].spam.insert_one({'tweet': algo})
                    algo = algo.encode('utf-8', 'replace')
                    storm.emit([algo, user])
Example #23
0
 def nextTuple(self):
     """Consume Kafka messages shaped '<digit> <text>': store the text in
     the per-user 'BOARD<digit>' .spam Mongo collection, UTF-8 encode it
     and emit (text, user).
     """
     #file = open('/home/pipe/twitterintel/topology/text.txt','a')
     for message in self.consumer:
         algo = message.value
         if (len(algo) > 4):
             user = algo[:1]  # leading digit selects the target DB
             if user.isdigit():
                 aux = 'BOARD' + user
                 algo = algo[2:len(algo)]  # strip "<digit> " prefix
                 if (algo[0] == ' '):
                     algo = algo[1:len(algo)]
                 self.db[aux].spam.insert_one({'tweet': algo})
                 algo = algo.encode('utf-8', 'replace')
                 storm.emit([algo, user])
    def run(self):
        """Emitter loop: drain queued messages, emitting each one and
        logging a progress marker every 10000 records; idle-sleep when
        the queue is empty.
        """
        while True:
            if len(self.messages) != 0:
                # log.warning(self.messages.pop(0))
                storm.emit([self.messages.pop(0)])
                self.counter += 1
                if self.counter % 10000 == 0:
                    log.warning(
                        "emit process {0} records at {1} (timestamp@{2})".
                        format(self.counter, time.time(),
                               socket.gethostname()))

            else:
                time.sleep(0.01)
Example #25
0
	def process(self, tup):
		"""Count a hashtag, upsert (hashtag, datetime, count) into MySQL and
		emit (tag, count, date); emits "None" placeholders for "None" tags.

		The original built the SQL with %-string interpolation of the
		(untrusted) hashtag text — an injection risk — and never closed the
		DB connection. This version parameterizes the query and closes the
		connection in a finally block.
		"""
		log.debug("HashtagCountBolt.process() started with: %s", tup)
		tag = tup.values[0]
		if tag != "None":
			self._count[tag] += 1
			d = parse(tup.values[1])
			date = calendar.timegm(d.timetuple())
			db = MySQLdb.connect("localhost","root","password","twitter")
			try:
				cursor = db.cursor()
				sql = ("INSERT INTO hashtags (hashtag, datetime, count) "
				       "values (%s, %s, %s) on duplicate key update count=%s")
				cursor.execute(sql, (tag, date, self._count[tag], self._count[tag]))
				db.commit()
			finally:
				db.close()  # the original leaked the connection per tuple
			storm.emit([tag, self._count[tag], date])
		else:
			storm.emit(["None", "None", "None"])
    def nextTuple(self):
        """Drain the Kafka consumer, emitting each message decoded as ASCII.

        NOTE(review): ``msg_id`` is computed but never passed to
        ``storm.emit`` — reliable-emit appears unfinished; confirm intent.
        """
        if self.consumer is None:
            print("self.consumer is not ready yet.")
            return

        try:
            for message in self.consumer:
               if message is not None:
                    msg_id = str(self.counter)
                    #log.info(">>> MESSAGE: " + message.value.decode('ascii'))
                    storm.emit([message.value.decode('ascii')])
                    self.counter += 1
                    #log.info(">>>> COUNTER: " + self.counter)

        except Exception as inst:
            log.debug("Exception Type: %s ; Args: %s", type(inst), inst.args)
Example #27
0
    def toNextBolt(self):
        """
        Emit the accumulated uplink & downlink totals for each msisdn to
        the next bolt, then reset the accumulators.
        """
        for msisdn in self.total_uplink:
            # see if we could pass list in storm tuple: False, emit members needs to be hashable
            # so ... merge list to str
            merged = ",".join(self.total_records[msisdn])
            # log.debug("%s", [msisdn, merged])
            storm.emit([msisdn, self.total_uplink[msisdn], self.total_downlink[msisdn], merged])

        # clear accumulator
        self.total_uplink.clear()
        self.total_downlink.clear()
        self.total_records.clear()
        self.counter = 0
Example #28
0
    def nextTuple(self):
        """
        consume message from kafka
        messages (m) are named tuples with attributes:
        m.offset: message offset on topic-partition log (int)
        m.value: message (output of deserializer_class - default is raw bytes)

        Emits each message with an id (the running counter) for reliable
        processing, caching it in ``message_pool`` for fail-over. Stops
        after 1000000 records and logs start/finish timestamps.
        """
        if self.consumer is None:
            log.debug("self.consumer is not ready yet.")
            return

        # Hard stop after the benchmark's 1M records.
        if self.counter >= 1000000:
            return

        # log.debug("ExpSpout.nextTuple()")
        # time.sleep(3)  # slow down to observe the prototype
        try:
            # message = self.consumer.consume()
            for message in self.consumer:
                if message is not None:
                    # log.warning("offset: %s \t value: %s \t at %s", message.offset, message.value, time.time())
                    if self.counter == 0:
                        self.start_time = time.time()
                        log.warning(
                            "start process 1000000 records at {0} (timestamp@{1})".format(
                                time.time(), socket.gethostname()
                            )
                        )
                    msg_id = str(self.counter)
                    self.message_pool[msg_id] = message.value  # message cache for fail over
                    storm.emit([message.value], id=msg_id)
                    self.counter += 1
                    if self.counter % 10000 == 0:
                        log.warning("mark @ #{0}".format(self.counter))
                if self.counter == 1000000:  # mark time
                    self.end_time = time.time()
                    log.warning(
                        "finish process 1000000 records at {0} (timestamp@{1})".format(
                            time.time(), socket.gethostname()
                        )
                    )
                    log.warning("spend {0} seconds processing 1000000 records".format(self.end_time - self.start_time))
                # Yield control back to Storm every 100 records.
                if self.counter % 100 == 0:
                    break
        except Exception as inst:
            log.debug("Exception Type: %s ; Args: %s", type(inst), inst.args)
 def process(self, tup):
     """On each tick, union the recent per-window Redis word-count zsets
     and emit the current top entries with their scores (Python 2
     ``xrange``).
     """
     if tup.is_tick_tuple():
         now = time.time()
         # Align 'now' down to the start of the current window.
         now_floor = int(
             math.floor(now / self.window_duration) * self.window_duration)
         first_window = int(now_floor -
                            self.num_windows * self.window_duration)
         # Merge the last num_windows per-window zsets into one ranking.
         self.conn.zunionstore('twitter_word_count', [
             'twitter_word_count:%s' % t
             for t in xrange(first_window, now_floor)
         ])
         for t in self.conn.zrevrange('twitter_word_count',
                                      0,
                                      self.maxSize,
                                      withscores=True):
             log.info('Emitting: %s', repr(t))
             storm.emit(t)
Example #30
0
    def nextTuple(self):
        """
        consume message from kafka
        messages (m) are named tuples with attributes:
        m.offset: message offset on topic-partition log (int)
        m.value: message (output of deserializer_class - default is raw bytes)

        Emits each message with an id (the running counter) for reliable
        processing, caching it in ``message_pool`` for fail-over. Stops
        after 1000000 records and logs start/finish timestamps.
        """
        if self.consumer is None:
            log.debug("self.consumer is not ready yet.")
            return

        # Hard stop after the benchmark's 1M records.
        if self.counter >= 1000000:
            return

        # log.debug("ExpSpout.nextTuple()")
        # time.sleep(3)  # slow down to observe the prototype
        try:
            # message = self.consumer.consume()
            for message in self.consumer:
                if message is not None:
                    # log.warning("offset: %s \t value: %s \t at %s", message.offset, message.value, time.time())
                    if self.counter == 0:
                        self.start_time = time.time()
                        log.warning(
                            "start process 1000000 records at {0} (timestamp@{1})"
                            .format(time.time(), socket.gethostname()))
                    msg_id = str(self.counter)
                    self.message_pool[
                        msg_id] = message.value  # message cache for fail over
                    storm.emit([message.value], id=msg_id)
                    self.counter += 1
                    if self.counter % 10000 == 0:
                        log.warning("mark @ #{0}".format(self.counter))
                if self.counter == 1000000:  # mark time
                    self.end_time = time.time()
                    log.warning(
                        "finish process 1000000 records at {0} (timestamp@{1})"
                        .format(time.time(), socket.gethostname()))
                    log.warning(
                        "spend {0} seconds processing 1000000 records".format(
                            self.end_time - self.start_time))
                # Yield control back to Storm every 100 records.
                if self.counter % 100 == 0:
                    break
        except Exception as inst:
            log.debug("Exception Type: %s ; Args: %s", type(inst), inst.args)
    def process(self, tup):
        """Persist the incoming message to storage (best-effort), then
        forward the same (content_type, delivery_tag, msg_body) tuple to
        the next bolt regardless of storage success.
        """
        log.debug('MsgLogStorageBolt.process() called with: %s', tup)
        content_type = tup.values[0]
        delivery_tag = tup.values[1]
        msg_body = tup.values[2]

        msg = message.Message().createFromJSON(msg_body)
        try:
            self.storage.saveMsg(msg)
        except Exception as e:
            # Best-effort: a storage failure is logged but not fatal.
            log.debug('Message failed to be stored')
            log.debug('Error: %s', e)
        else:
            #channel.basic_ack(delivery_tag=method.delivery_tag)
            pass

        # Emit same message to next bolt
        storm.emit([content_type, delivery_tag, msg_body])
Example #32
0
    def toNextBolt(self):
        """
        Emit the accumulated uplink & downlink totals for each msisdn to
        the next bolt, then reset the accumulators.
        """
        for msisdn in self.total_uplink:
            # see if we could pass list in storm tuple: False, emit members needs to be hashable
            # so ... merge list to str
            merged = ",".join(self.total_records[msisdn])
            # log.debug("%s", [msisdn, merged])
            storm.emit([
                msisdn, self.total_uplink[msisdn], self.total_downlink[msisdn],
                merged
            ])

        # clear accumulator
        self.total_uplink.clear()
        self.total_downlink.clear()
        self.total_records.clear()
        self.counter = 0
    def nextTuple(self):
        """Poll one message from RabbitMQ via basic_get and emit
        (content_type, delivery_tag, body); sleep and return None when the
        queue is empty. Python 2 only (uses a print statement).
        """
        # if self._index == len(self.sentences):
        #    # This is just a demo; keep sleeping and returning None after we run
        #    # out of data. We can't just sleep forever or Storm will hang.
        #    time.sleep(1)
        #    return None
        # time.sleep(0.25);
        # sentence = self.sentences[random.randint(0, len(self.sentences) - 1)]
        # sentence = self.sentences[self._index]
        # self._index += 1
        # log.debug('rabbitmq_spout emitting: %s', sentence)
        # for word in sentence.split(' '):
        #    storm.emit([word])
        # Initialize our timers and loop until external influence stops us

        if self.conn_broker.is_open:
            # Call basic get which returns the 3 frame types
            method, header, body = self.channel.basic_get(queue=self.QUEUE_NAME)
            print method, header, body

            # It can be empty if the queue is empty so don't do anything
            if not method:
                time.sleep(self.SLEEP_TIME)
                return None
            if method.NAME == "Basic.GetEmpty":
                # No need to pound rabbit, sleep for a while. If you want messages as
                # fast as you can get them, use Basic.Consume
                time.sleep(self.SLEEP_TIME)
                return None
                ##storm.emit(['content_type', 'delivery_tag', 'msg_body'])

                # We have data
            else:
                # print "Basic.GetOk %s delivery-tag %i: %s" % (header.content_type,
                #                                              method.delivery_tag,
                #                                              body)
                storm.emit([header.content_type, method.delivery_tag, body])
                # Acknowledge the receipt of the data
                # TODO: Do it at the end of processing,
                # here it is done at the end of delivering it to the following bolt
                self.channel.basic_ack(delivery_tag=method.delivery_tag)
    def nextTuple(self):
        """Poll the confluent-kafka consumer forever, emitting each valid
        message value; log partition EOFs and raise on other Kafka errors.

        NOTE(review): ``storm.emit(msg.value())`` is not wrapped in a list,
        unlike every other emit in this file — verify against the storm
        multilang API whether a bare value is accepted here.
        """
        if self.consumer is None:
            print("self.consumer is not ready yet.")
            return

        while True:
            msg = self.consumer.poll(timeout=1.0)
            if msg is None:
                continue

            if msg.error():
                # End-of-partition is informational; anything else is fatal.
                if msg.error().code() == KafkaError._PARTITION_EOF:
                    log.info('%% %s [%d] reached end of offset %d\n' %
                             (msg.topic(), msg.partition(), msg.offset()))
                elif msg.error():
                    raise KafkaException(msg.error())
            else:
                print('%% %s [%d] at offset %d with key %s:\n' %
                      (msg.topic(), msg.partition(), msg.offset(),
                       str(msg.key())))
                storm.emit(msg.value())
Example #35
0
 def process(self, tup):
     """Bounded ranking bolt (Python 2 ``itervalues``/``iteritems``).

     On tick: emit all items in descending rank order. Otherwise: insert
     the tuple, evict the lowest-ranked entry when over ``maxSize``, then
     purge entries whose count (values[1]) has dropped to zero.
     """
     if tup.is_tick_tuple():
         for t in sorted(
                 self.rankedItems.itervalues(),
                 key=tup_sort_key,
                 reverse=True):
             log.info('Emitting: %s', repr(t.values))
             storm.emit(t.values)
     else:
         self.rankedItems[tup.values[0]] = tup
         if len(self.rankedItems) > self.maxSize:
             # Ascending sort; drop only the first (lowest-ranked) item.
             for t in sorted(
                     self.rankedItems.itervalues(),
                     key=tup_sort_key):
                 del self.rankedItems[t.values[0]]
                 break
         # Collect keys first: can't delete while iterating the dict.
         zero_keys = set(
             k for k, v in self.rankedItems.iteritems()
             if v.values[1] == 0)
         for k in zero_keys:
             del self.rankedItems[k]
Example #36
0
 def process(self, tup):
     """Pair words from values[0] into "first second" bigrams and emit each
     with values[1].

     NOTE(review): the loop does ``for index in words`` and then indexes
     ``words[index]`` — that only works if ``get_words`` returns a mapping
     (or this is a bug for plain lists); confirm against ``get_words``.
     """
     if(len(tup.values[0])>1):
         count =0
         words = self.get_words(tup.values[0].encode('utf-8','ignore'))
         if len(words)>=2 and (len(words)%2)==0:
             # Even number of words: pair them strictly two at a time.
             for index in words:
                 if count ==0:
                     helpcount = words[index]
                     count+=1
                 else:
                     word2 = helpcount + ' ' + words[index]
                     count =0
                     storm.emit([word2,tup.values[1]])
         elif(len(words)>2):
             # Odd count: additionally pair the last word with its predecessor.
             for index in words:
                 if words[len(words)-1] == words[index]:
                     word2 = words[len(words)-2] + ' ' + words[index]
                     storm.emit([word2,tup.values[1]])
                 if count ==0:
                     helpcount = words[index]
                     count+=1
                 else:
                     word2 = helpcount + ' ' + words[index]
                     count =0
                     storm.emit([word2,tup.values[1]])
Example #37
0
 def process(self, tup):
     """Pair words from values[0] into "first second" bigrams and emit each
     with values[1].

     NOTE(review): the loop does ``for index in words`` and then indexes
     ``words[index]`` — that only works if ``get_words`` returns a mapping
     (or this is a bug for plain lists); confirm against ``get_words``.
     """
     if (len(tup.values[0]) > 1):
         count = 0
         words = self.get_words(tup.values[0].encode('utf-8', 'ignore'))
         if len(words) >= 2 and (len(words) % 2) == 0:
             # Even number of words: pair them strictly two at a time.
             for index in words:
                 if count == 0:
                     helpcount = words[index]
                     count += 1
                 else:
                     word2 = helpcount + ' ' + words[index]
                     count = 0
                     storm.emit([word2, tup.values[1]])
         elif (len(words) > 2):
             # Odd count: additionally pair the last word with its predecessor.
             for index in words:
                 if words[len(words) - 1] == words[index]:
                     word2 = words[len(words) - 2] + ' ' + words[index]
                     storm.emit([word2, tup.values[1]])
                 if count == 0:
                     helpcount = words[index]
                     count += 1
                 else:
                     word2 = helpcount + ' ' + words[index]
                     count = 0
                     storm.emit([word2, tup.values[1]])
 def process(self, tup):
     """
     Split the received CSV line into fields and pass only the fields of
     interest on to the next bolt.
     """
     if tup.is_tick_tuple():
         log.debug("tuple is tick")
     else:
         line = tup.values[0]
         line = line.strip()[8:]  # remove "message:" added by fluentd
         # log.warning("SplitBolt process: %s", line.strip())
         raw_row = line.split(",")
         # Only well-formed rows (exactly 47 columns) are forwarded.
         if len(raw_row) == 47:
             storm.emit([raw_row[6], raw_row[4], raw_row[17], raw_row[18]])
             if self.counter == 0:
                 log.warning(
                     "start process 1000000 records at {0} (timestamp)".
                     format(time.time()))
             self.counter += 1
             if self.counter == 1000000:  # this won't work since more than on instance
                 log.warning(
                     "finish process 1000000 records at {0} (timestamp)".
                     format(time.time()))
Example #39
0
    def nextTuple(self):
        """
        Read a batch of records from Kafka and emit each message value.
        messages (m) are namedtuples with attributes:
        m.offset: message offset on topic-partition log (int)
        m.value: message (output of deserializer_class - default is raw bytes)
        """
        if self.consumer is None:
            log.debug("self.consumer is not ready yet.")
            return

        log.debug("ExpSpout.nextTuple()")
        time.sleep(3)  # slow down to observe the prototype
        cursor = 0
        try:
            for message in self.consumer:
                cursor += 1
                if message is not None:
                    log.debug("offset: %s \t value: %s", message.offset, message.value)
                    storm.emit([message.value])
                if cursor > 10000:  # cap the batch size for prototype observation
                    break
        except NoPartitionsForConsumerException:
            log.debug("NoPartitionsForConsumerException")
Example #40
0
	def process(self, tup):
		"""Extract hashtags (with created_at) from a tweet dict and emit
		(tag, date) pairs; emit a ("None", "None") placeholder when no
		hashtag is available or a tag cannot be ASCII-decoded.

		Changes vs. original: ``'entities' in t`` replaces the Python-2-only
		``dict.has_key``; the bare ``except:`` is narrowed to
		``except Exception``; the two identical no-hashtag branches are
		merged; the emit is hoisted out of the try so a failed decode emits
		exactly one placeholder.
		"""
		log.debug('SplitHashtagBolt.process() started with: %s', tup)
		t = tup.values[0]
		if 'entities' in t and t['entities']['hashtags']:
			for i in t['entities']['hashtags']:
				try:
					tag = str(i['text'].decode("ascii"))
					date = t['created_at']
				except Exception:
					# Non-ASCII (or malformed) tag: fall back to placeholders.
					tag = "None"
					date = "None"
				storm.emit([tag, date])
		else:
			# No entities key, or an empty hashtag list.
			storm.emit(["None", "None"])
Example #41
0
 def nextTuple(self):
     """Drain the Kafka consumer: store every message value in the
     BOARD.bad Mongo collection and emit it.
     """
     for message in self.consumer:
         algo = message.value
         self.db.BOARD.bad.insert_one({'tweet':algo})
         storm.emit([algo])
Example #42
0
 def process(self, tup):
     """Split the incoming value into words and emit each word.

     Drops the original's unused local ``word2 = word.encode('utf-8')``;
     the emitted value was always the plain ``word``.
     """
     for word in self.get_words(tup.values[0]):
         storm.emit([word])
Example #43
0
 def emitCurrentWindowCounts(self):
     """Emit every (word, count) pair of the window just closed, advancing
     the sliding-window counter (Python 2 ``iteritems``).
     """
     counts = self.counter.getCountsThenAdvanceWindow()
     for k, v in counts.iteritems():
         storm.emit([k, v])
Example #44
0
 def nextTuple(self):
     """Every 0.25 s pick a random sentence, log it, and emit it."""
     time.sleep(0.25)
     idx = random.randint(0, len(self.sentences) - 1)
     chosen = self.sentences[idx]
     self.log.debug('randomsentence emitting: %s', chosen)
     storm.emit([chosen])
 def process(self, tup):
     """Split the sentence on single spaces and emit one tuple per token."""
     for token in tup.values[0].split(" "):
         storm.emit([token])
Example #46
0
 def emitCurrentWindowCounts(self):
     """Emit every (word, count) pair of the window just closed, advancing
     the sliding-window counter (Python 2 ``iteritems``).
     """
     counts = self.counter.getCountsThenAdvanceWindow()
     for k, v in counts.iteritems():
         storm.emit([k, v])
Example #47
0
 def process(self, tup):
     """Split the sentence on single spaces and emit each word."""
     log.debug('SplitSentenceBolt.process() called with: %s', tup)
     words = tup.values[0].split(" ")
     for word in words:
       log.debug('SplitSentenceBolt.process() emitting: %s', word)
       storm.emit([word])
Example #48
0
 def process(self, tup):
     """Split the sentence on single spaces and emit each word."""
     log.debug('SplitSentenceBolt.process() called with: %s', tup)
     words = tup.values[0].split(" ")
     for word in words:
       log.debug('SplitSentenceBolt.process() emitting: %s', word)
       storm.emit([word])
Example #49
0
 def process(self, tup):
     """Emit each UTF-8-encoded word of values[0] paired with values[1]."""
     tail = tup.values[1]
     for token in self.get_words(tup.values[0]):
         storm.emit([token.encode('utf-8', 'replace'), tail])
Example #50
0
 def process(self, tup):
     """Tokenize values[0] and emit one tuple per word."""
     tokens = self._get_words(tup.values[0])
     for token in tokens:
         storm.emit([token])
Example #51
0
 def process(self, tup):
     """Emit each word of values[0] together with values[1]."""
     extra = tup.values[1]
     for token in self.get_words(tup.values[0]):
         storm.emit([token, extra])
Example #52
0
 def process(self, tup):
     """Bump the per-word counter and emit the word with its new count."""
     token = tup.values[0]
     new_count = self._count[token] + 1
     self._count[token] = new_count
     storm.emit([token, new_count])
Example #53
0
    def process(self, tup):
        """Encode each word as UTF-8 (replacing undecodable characters) and
        emit it together with values[1].

        Fixes the original's mixed tab/space indentation, which is a
        SyntaxError under Python 3.
        """
        for word in self.get_words(tup.values[0]):
            word = word.encode('utf-8', 'replace')
            storm.emit([word, tup.values[1]])
Example #54
0
 def nextTuple(self):
     """Block until a tweet is available on the queue, emit it, then mark
     the queue task done.
     """
     tweet = self.queue.get()
     storm.emit([tweet])
     self.queue.task_done()
 def nextTuple(self):
     """Sleep briefly, then emit one randomly selected sentence."""
     time.sleep(0.25)
     pool = self.sentences
     storm.emit([pool[random.randint(0, len(pool) - 1)]])
    def nextTuple(self):
        """Every 0.25 s pick a random sentence, log it, and emit it."""
        time.sleep(0.25)
        sentence = self.sentences[random.randint(0, len(self.sentences) - 1)]

        log.debug("RandomSentence nextTuple emitting %s", sentence)
        storm.emit([sentence])
Example #57
0
 def emitCurrentWindowCounts(self):
     """Emit each (word, count) of the closed window, also publishing
     "<word> <count>" to a Kafka topic (Python 2 ``iteritems``).
     """
     counts = self.counter.getCountsThenAdvanceWindow()
     for k, v in counts.iteritems():
         # Mirror the pair to Kafka as a single "<word> <count>" string.
         word2 = k.encode('utf-8')+ ' '+ str(v)
         self.producer.send(self.topic,word2)
         storm.emit([k, v])
Example #58
0
 def process(self, tup):
     """Increment the running count for the received word and emit
     (word, count).
     """
     word = tup.values[0]
     self._count[word] += 1
     storm.emit([word, self._count[word]])
Example #59
0
	def nextTuple(self):
		"""Pick a random hashtag from self.t and emit it with the current
		timestamp.
		"""
		tag = self.t[random.randint(0, len(self.t) - 1)]
		date = datetime.datetime.now()
		log.debug('hashtagspout emitting: %s', tag)
		storm.emit([tag, date])