Example #1
class Producer(object):
	
	def __init__(self, addr):
		self.client = KafkaClient(addr)
		self.producer = KeyedProducer(self.client)

	def produce_msgs(self, source_symbol):
		#price_field = random.randint(800,1400)
		msg_cnt = 0

		datagenerator = DataGenerator()

		function_options = {
			0:datagenerator.click_event,
			1:datagenerator.view_event,
			2:datagenerator.bid_event,
			3:datagenerator.hover_event,
			4:datagenerator.load_event
		}

		while True:
			#time_field = datetime.now().strftime("%Y%m%d %H%M%S")
			#price_field += random.randint(-10, 10)/10.0
			#volume_field = random.randint(1, 1000)
			#str_fmt = "{};{};{};{}"
			#message_info = str_fmt.format(source_symbol, time_field, price_field, volume_field)
			num = random.randint(0, 4)
			message_info = function_options[num]()

			print json.dumps(message_info)

			self.producer.send_messages('test_adability', source_symbol, message_info)
			msg_cnt += 1
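
Example #1 loops forever, drawing one of five synthetic ad events per iteration and keying every message by source_symbol. A minimal driver sketch, assuming the old kafka-python imports used throughout this listing and a placeholder broker address (both are assumptions, as is the external DataGenerator dependency):

if __name__ == "__main__":
    # Hypothetical entry point: "localhost:9092" and "group1" are placeholders.
    producer = Producer("localhost:9092")
    producer.produce_msgs("group1")
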
Example #2
class Producer(object):

    def __init__(self, addr):
        self.client = SimpleClient(addr)
        self.producer = KeyedProducer(self.client)
        self.sess = requests.Session()
        adapter = requests.adapters.HTTPAdapter(max_retries=5)
        self.sess.mount('http://', adapter)
        self.sess.mount('https://', adapter)

    def produce_msgs(self, topic, source_symbol, last_record_set):
        self.record_set = set()
        count = 0
        try:
            for item in self.r["data"]:
                self.record_set.add(item["payment_id"])
                count += 1
                if not item["payment_id"] in last_record_set:
                    message_info = "{}\n".format(json.dumps(item))
                    self.producer.send_messages(topic, source_symbol, message_info)
                    # print message_info
                    # print count
        except:
            k = 1

    def get_venmo(self, limit=300, page="https://venmo.com/api/v5/public?"):
        try:
            self.r = self.sess.get(page + "&limit={}".format(limit)).json()
        except:
            self.r = ""
Example #3
class KafkaLoggingHandler(logging.Handler):

    def __init__(self, hosts_list, topic, key=None):
        logging.Handler.__init__(self)
        self.kafka_client = KafkaClient(hosts_list)
        self.key = key
        self.kafka_topic_name = topic
        if not key:
            self.producer = SimpleProducer(self.kafka_client)
        else:
            self.producer = KeyedProducer(self.kafka_client)

    def emit(self, record):
        # drop kafka logging to avoid infinite recursion
        if record.name == 'kafka':
            return
        try:
            # use default formatting
            msg = self.format(record)
            if isinstance(msg, unicode):
                msg = msg.encode("utf-8")
            # produce message
            if not self.key:
                self.producer.send_messages(self.kafka_topic_name, msg)
            else:
                self.producer.send(self.kafka_topic_name, self.key, msg)
        except (KeyboardInterrupt, SystemExit):
            raise
        except:
            self.handleError(record)

    def close(self):
        self.producer.stop()
        logging.Handler.close(self)
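
Because KafkaLoggingHandler subclasses logging.Handler, it plugs into the standard logging machinery. A short usage sketch; the host list and topic are assumptions:

import logging

handler = KafkaLoggingHandler("localhost:9092", "app_logs")
handler.setFormatter(logging.Formatter("%(asctime)s %(levelname)s %(message)s"))
logger = logging.getLogger("myapp")
logger.addHandler(handler)
logger.info("shipped to Kafka")  # emit() formats this record and produces it
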
Example #4
class Producer(object):

	def __init__(self, addr):
		self.client = KafkaClient(addr)
		self.producer = KeyedProducer(self.client)
		self.artist_id = []
		self.artwork_id = []

	def load_ids(self):
		artwork_path = "/home/ubuntu/Insight/dataset/Artsy/artwork_id.txt"
		artist_path = "/home/ubuntu/Insight/dataset/Artsy/artist_id.txt"
		with open(artwork_path) as f1:
			for line in f1:
				if line != "":
					self.artwork_id.append(line.strip())
		with open(artist_path) as f2:
			for line in f2:
				if line != "":
					self.artist_id.append(line.strip())


	def produce_msgs(self, source_symbol):
		msg_cnt = 0
		while True:
			time_field = datetime.now().strftime("%Y%m%d %H%M%S")
			user_field = random.choice(self.artist_id)
			art_field = random.choice(self.artwork_id)
			str_fmt = "{};{};{};{};{}"
			message_info = str_fmt.format(source_symbol,time_field,user_field,"pin",art_field)
			# print message_info
			self.producer.send_messages('pin_activity', source_symbol, message_info)
			msg_cnt += 1
Example #5
class Producer(object):
    def __init__(self, addr):
        self.client = KafkaClient(addr)
        self.producer = KeyedProducer(self.client)

    def produce_msgs(self, source_symbol):
        #price_field = random.randint(800,1400)
        msg_cnt = 0

        datagenerator = DataGenerator()

        function_options = {
            0: datagenerator.click_event,
            1: datagenerator.view_event,
            2: datagenerator.bid_event,
            3: datagenerator.hover_event,
            4: datagenerator.load_event
        }

        while True:
            #time_field = datetime.now().strftime("%Y%m%d %H%M%S")
            #price_field += random.randint(-10, 10)/10.0
            #volume_field = random.randint(1, 1000)
            #str_fmt = "{};{};{};{}"
            #message_info = str_fmt.format(source_symbol, time_field, price_field, volume_field)
            num = random.randint(0, 4)
            message_info = function_options[num]()

            print json.dumps(message_info)

            self.producer.send_messages('test_adability', source_symbol,
                                        message_info)
            msg_cnt += 1
Example #6
class Producer(object):
    def __init__(self, addr):
        self.client = KafkaClient(addr)
        self.producer = KeyedProducer(self.client)
        self.artist_id = []
        self.artwork_id = []

    def load_ids(self):
        artwork_path = "/home/ubuntu/Insight/dataset/Artsy/artwork_id.txt"
        artist_path = "/home/ubuntu/Insight/dataset/Artsy/artist_id.txt"
        with open(artwork_path) as f1:
            for line in f1:
                if line != "":
                    self.artwork_id.append(line.strip())
        with open(artist_path) as f2:
            for line in f2:
                if line != "":
                    self.artist_id.append(line.strip())

    def produce_msgs(self, source_symbol):
        msg_cnt = 0
        while True:
            time_field = datetime.now().strftime("%Y%m%d %H%M%S")
            user_field = random.choice(self.artist_id)
            art_field = random.choice(self.artwork_id)
            str_fmt = "{};{};{};{};{}"
            message_info = str_fmt.format(source_symbol, time_field,
                                          user_field, "pin", art_field)
            # print message_info
            self.producer.send_messages('pin_activity', source_symbol,
                                        message_info)
            msg_cnt += 1
Example #7
def process(time, lines):
    """match user with bidder
    Input:
    lines: (ts string, uid string, topic vector)
    """
    print("========= %s =========" % str(time))
    sqlContext = getSqlContextInstance(lines.context)
    # calculate user-product correlation table
    runningWindow = lines.map(lambda (k, v): ((k[0], str(time)), v))\
        .reduceByKey(lambda x, y: x + y)\
        .map(lambda (x, u): [(x, y, float(u.dot(v) / (norm(u) * norm(v)))) for (y, v) in bv.value])\
        .flatMap(lambda x: x)\
        .filter(lambda (x, y, s): s > .97)
    rowRDD = runningWindow.map(
        lambda x: Row(uid=x[0][0], pid=x[1], score=x[2], ts=x[0][1]))
    #    print(rowRDD.take(10))
    print("========= %d =========" % rowRDD.count())
    if (rowRDD.count() > 0):
        client = SimpleClient(KAFKA_NODE)
        producer = KeyedProducer(client)
        for row in rowRDD.collect():
            line = '{ "tick" :"' + str(time.isoformat()) + '",'
            line += '  "uid"  :"' + str(row['uid']) + '",'
            line += ' "score":"' + str(row['score']) + '",'
            line += '  "pid":' + str(row['pid']) + '}'
            #	    print(line)
            producer.send_messages(KAFKA_TOPIC, str(hash(line)), line)
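
process(time, lines) takes a batch time plus an RDD, which matches the two-argument callback form accepted by DStream.foreachRDD in Spark Streaming. A hedged wiring sketch; the stream construction is an assumption, while KAFKA_NODE and KAFKA_TOPIC are the constants the function already references:

from pyspark.streaming.kafka import KafkaUtils

# Assumes ssc is an existing StreamingContext and the upstream decode/map
# steps yield the (key, vector) pairs that process() expects.
stream = KafkaUtils.createDirectStream(ssc, [KAFKA_TOPIC],
                                       {"metadata.broker.list": KAFKA_NODE})
stream.foreachRDD(process)
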
Example #8
class Producer(object):

  def __init__(self, addr):
    self.client = KafkaClient(addr)
    self.producer = KeyedProducer(self.client)
    self.zipcode = []
    self.complaint = []

  def load_ids(self):
    zipcode_path = "/home/ubuntu/repos/project311/kafka/zipcodes.txt"
    complaint_path = "/home/ubuntu/repos/project311/kafka/complaint_type.txt"
    with open(zipcode_path, 'r') as f1:
      for line in f1:
        if line != "":
            self.zipcode.append(line.strip())
    with open(complaint_path) as f2:
      for line in f2:
        if line != "":
          self.complaint.append(line.strip())

  def produce_msgs(self, source_symbol):
    msg_cnt = 0
    while True:
      time_field = datetime.now().strftime("%Y%m%d%H%M%S")
      zipcode_field = random.choice(self.zipcode)
      complaint_field = random.choice(self.complaint)
      str_fmt = "{};{};{};{}"
      message_info = str_fmt.format(source_symbol, time_field, zipcode_field, complaint_field)
      print message_info
      self.producer.send_messages('complaints', source_symbol, message_info)
      msg_cnt += 1
Example #9
def process(time, lines):
    """match user with bidder
    Input:
    lines: (ts string, uid string, {pid:score} dict)
    """
    print("========= %s =========" % str(time))
    sqlContext = getSqlContextInstance(lines.context)
    # calculate user-product correlation table
    rowRDD=lines.map(lambda x: ( x['uid'], matchBids(x['score']) ))\
                .map(lambda x:Row(uid=x[0],pid=x[1][0],price=x[1][1]))
    print("========= %d =========" % rowRDD.count())
    if (rowRDD.count() > 0):
        #   send to kafka
        client = SimpleClient(KAFKA_NODE)
        producer = KeyedProducer(client)
        for row in rowRDD.collect():
            line = '{ "pid" :"' + str(row['pid']) + '",'
            line += '  "uid"  :"' + str(row['uid']) + '",'
            line += ' "price":"' + str(row['price']) + '",'
            line += '  "ts":' + str(time) + '}'
            #	    print(line)
            producer.send_messages(KAFKA_TOPIC, str(hash(line)), line)


#   save to cassandra
        rowRDD.map(lambda x:Row(pid=x['pid'],ts=str(time),price=x['price']))\
              .toDF().write\
              .format("org.apache.spark.sql.cassandra")\
              .options(table='winningbid10s', keyspace='ad_flow')\
              .save(mode="append")
Example #10
def process(time, lines):
    """1. select user to push ads
       2. save user-product corr table to cassandra
       3. match user with bidder
       4. save bid winner to cassandra 
    Input:
    lines: (ts string, uid string, topic vector)
    """
    print("========= %s =========" % str(time))
    sqlContext=getSqlContextInstance(lines.context)
    # calculate user-product correlation table  
    # lines1s = lines.map(lambda x: ((x['uid'], roundTime(parser.parse(x['tick']), 1).isoformat()), np.asarray([1] + [float(i) for i in x['topic']])))
    # lines1s = lines.map(lambda x: (x[0], 1))
    runningWindow = lines.map(lambda (k, v): ((k[0], time.isoformat()), v))\
        .reduceByKey(lambda x, y: x + y)\
        .map(lambda (x, u): [(x, y, float(u.dot(v) / (norm(u) * norm(v)))) for (y, v) in bv.value])\
        .flatMap(lambda x: x)\
        .filter(lambda (x, y, s): s > .97)
    rowRDD = runningWindow.map(lambda x: Row(uid=x[0][0], pid=x[1], score=x[2], ts=x[0][1]))
    # print(rowRDD.take(10))
    # saveRDD(sqlContext, rowRDD, keyspaceName='ad_flow', tableName='records1s')
    print("========= %d =========" % rowRDD.count())
    # save corr table to cassandra
    if (rowRDD.count() > 0):
        client = SimpleClient(KAFKA_NODE)
        producer = KeyedProducer(client)
        for row in rowRDD.collect():
            line = '{ "tick" :"' + str(time.isoformat()) + '",'
            line += '  "uid"  :"' + str(row['uid']) + '",'
            line += ' "score":"' + str(row['score']) + '",'
            line += '  "pid":' + str(row['pid']) + '}'
            # print(line)
            producer.send_messages(KAFKA_TOPIC, str(hash(line)), line)
Example #11
class KafkaLoggingHandler(logging.Handler):
    def __init__(self, hosts="", topic="", partition=0):
        logging.Handler.__init__(self)
        self.kafkaClient = KafkaClient(hosts)
        self.topic = topic
        self.partition = partition
        self.producer = KeyedProducer(
            self.kafkaClient, async=False, req_acks=KeyedProducer.ACK_AFTER_LOCAL_WRITE, ack_timeout=200
        )

    def emit(self, record):
        # drop kafka logging to avoid infinite recursion
        if record.name == "kafka":
            return
        try:
            # use default formatting
            msg = self.format(record)
            # produce message
            self.producer.send_messages(self.topic + record.name, self.partition, msg)
        except:
            import traceback

            ei = sys.exc_info()
            traceback.print_exception(ei[0], ei[1], ei[2], None, sys.stderr)
            del ei

    def close(self):
        self.producer.stop()
        logging.Handler.close(self)
Example #12
def process(time, lines):
    """Calculate user-product corr table and select ad-push events
    Input:
    lines: (ts string, uid string, topic vector)
    """
    print("========= %s =========" % str(time))
    sqlContext = getSqlContextInstance(lines.context)
    #   calculate user-product correlation table
    runningWindow = lines.map(lambda (k, v): ((k[1], time.isoformat()), v))\
        .reduceByKey(lambda x, y: x + y)\
        .map(lambda (x, u): (x, [(pid, score) for (pid, score) in ((y, float(u.dot(v) / (norm(u) * norm(v)))) for (y, v) in bv.value) if score > .90]))\
        .filter(lambda (k, v): v != [])
    rowRDD = runningWindow.map(
        lambda x: Row(uid=x[0][0], score=x[1], ts=x[0][1]))
    #   print(rowRDD.take(10))
    #   saveRDD(sqlContext, rowRDD, keyspaceName='ad_flow', tableName='records1s')
    print("========= %d =========" % rowRDD.count())
    #   save corr table to cassandra
    if (rowRDD.count() > 0):
        client = SimpleClient(KAFKA_NODE)
        producer = KeyedProducer(client)
        for row in rowRDD.collect():
            line = '{ "timeStamp" :"' + str(time) + '",'
            line += '  "uid"  :"' + str(row['uid']) + '",'
            line += ' "score":"' + json.dumps(dict(row['score'])) + '}'
            #	    print(line)
            producer.send_messages(KAFKA_TOPIC, str(hash(line)), line)
Example #13
class Producer(object):
    def __init__(self, addr):
        self.client = KafkaClient(addr)
        self.producer = KeyedProducer(self.client)


    def produce_deal_urls(self, api_url=''):
        ''' Constantly produce deal urls for consumers to crawl '''
        # TODO - Find total deals per category
        
        # TODO - Calculate number of pages to crawl
        
        # TODO - Produce categories and page range for consumers
        # {category_slug; start_page; end_page}
        
        

    def produce_msgs(self, source_symbol):
        price_field = random.randint(800,1400)
        msg_cnt = 0
        while True:
            time_field = datetime.now().strftime("%Y%m%d %H%M%S")
            price_field += random.randint(-10, 10)/10.0
            volume_field = random.randint(1, 1000)
            str_fmt = "{};{};{};{}"
            message_info = str_fmt.format(source_symbol,
                                          time_field,
                                          price_field,
                                          volume_field)
            print message_info
            self.producer.send_messages('price_data_part4', source_symbol, message_info)
            msg_cnt += 1
Example #14
class Producer(object):
    def __init__(self, addr):
        self.client = SimpleClient(addr)
        self.producer = KeyedProducer(self.client)

    def produce_messages(self, data):
        #timestamp = 1473613200 # 1:00 est
        #while timestamp <= 1473624000:
        while True:
            rows = np.random.randint(0, len(data) - 1, size=num_plays_persec)
            sampled_data = data.iloc[rows]
            curr_time = datetime.datetime.now()
            #create timestamp for camus to partition
            timestamp = datetime.datetime.strftime(curr_time,
                                                   '%Y-%m-%d_%H:%M:%S')
            #create epoch timstamp for custom partitioning
            raw_timestamp = convert_datetime_to_est(curr_time)
            epoch = int(time.mktime(raw_timestamp.timetuple()))
            for idx, row in sampled_data.iterrows():
                json_data = {
                    'timestamp': timestamp,
                    'epoch_timestamp': epoch,
                    'player_id': row.player_id,
                    'player_name': row.player_name,
                    'position': row.position,
                    'yards': row.yards,
                    'touchdown': row.touchdown
                }
                message_info = json.dumps(json_data)
                keystring = 'QA' if row.position == 'QB' else row.position
                key = b'{}'.format(keystring)
                self.producer.send_messages('nfl_plays', key, message_info)
Example #15
def process(time, lines):
    """ Processing tweets 
    Input:
    lines: (ts string, uid string, state string, tweet vector)
    Output:
    Json: (ts string, uid string, topicVec vector)
    """
    print("========= %s =========" % str(time))
    sqlContext = getSqlContextInstance(lines.context)
    #    rowRDD=lines.map(lambda x: (x['timeStamp'], x['userId'], getMeanVector(x['tweet'])))\
    #                .filter(lambda (time, uid, vec): vec!=[])\
    #                .map(lambda x:Row(timestamp=x[0], uid=x[1], topicVec=x[2]))
    rowRDD=lines.map(lambda x: [((x['timeStamp'], x['userId']), word2vec(item)) for item in x['tweet'] if isInVolcabulary(item)] )\
                .flatMap(lambda x:x)\
                .filter(lambda (k, vec): vec!=[])\
                .reduceByKey(lambda x,y:x+y)\
                .map(lambda x:Row(timestamp=x[0][0], uid=x[0][1], topicVec=x[1]))

    #    print(rowRDD.take(10))
    print("========= %d =========" % rowRDD.count())
    #   save corr table to cassandra
    if (rowRDD.count() > 0):
        client = SimpleClient(kafkaNodeBC.value)
        producer = KeyedProducer(client)
        for row in rowRDD.collect():
            line = '{ "timestamp" :"' + str(row['timestamp']) + '",'
            line += '  "uid"  :"' + str(row['uid']) + '",'
            line += ' "topicVec":' + json.dumps(
                [float(i) for i in row['topicVec']]) + '}'
            #	    print(line)
            producer.send_messages(outgoingTopic, str(hash(line)), line)
Example #16
def unfollow_producer(users: List[Tuple[str]], photos: Deque[Tuple[str, str]],
                      tags: List[Tuple[str]], locations: List[Tuple[str, str]],
                      producer: KeyedProducer) -> Dict[str, str]:
    """
    Produce unfollow events to Kafka

    Arguments:
        users: List of users who can produce an event
        photos: Queue of recent photos and their usernames
        tags: List of company names
        locations: List of possible global lat/long coordinates
        producer: Kafka producer object to post messages

    Returns:
        Kafka message
    """
    followee, follower = random.choice(users)[0], random.choice(users)[0]
    created_time, partition_date = get_datetime()
    record = {
        "follower_username": follower,
        "followed_username": followee,
        "created_time": created_time,
        "partition_date": partition_date,
        "event": "unfollow"
    }
    producer.send_messages("unfollow", bytes(followee, 'utf-8'),
                           json.dumps(record).encode('utf-8'))
    return record
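
The event producers in this listing (create_user_producer, create_photo_producer, comment_producer, unfollow_producer) share one signature, so a driver can dispatch among them at random. A hedged sketch; the broker address and the users/photos/tags/locations collections are assumptions:

import random

client = SimpleClient("localhost:9092")
producer = KeyedProducer(client)
event_fns = [create_user_producer, create_photo_producer,
             comment_producer, unfollow_producer]
while True:
    fn = random.choice(event_fns)
    # May return None, e.g. a comment event before any photo exists.
    record = fn(users, photos, tags, locations, producer)
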
Example #17
class KafkaLfProducer(object):
    def __init__(self, addr, conf_file, start_house_id, end_house_id, house_status):
        self.parser = SafeConfigParser()
        self.parser.read(conf_file)
        install_dir = self.parser.get('smw_tool', 'INSTALL_DIR')
        zipdb_file = self.parser.get('smw_tool', 'ZIP_DB_FILE') 

        self.client = KafkaClient(addr)
        self.producer = KeyedProducer(self.client, async=True, batch_send_every_n=500,batch_send=True)
        self.meterReader = MeterLfReader(start_house_id,
                                         end_house_id,
                                         house_status,
                                         install_dir + "/data/low_freq/", 
                                         install_dir + "/" + zipdb_file)

    def produce_msgs(self, source_symbol):
        msg_cnt = 0

        while not self.meterReader.houseSentDone():
            (isLf, msg) = self.meterReader.getRecord()

            if msg_cnt % 500000 == 0:
                print "Sent " + str(msg_cnt) + " messages to Kafka"

            if isLf:
                self.producer.send_messages('smw_batch_lf2', source_symbol, msg)
            else:
                self.producer.send_messages('smw_batch_hf2', source_symbol, msg)

            msg_cnt += 1

        print "Sent Total " + str(msg_cnt) + " messages to Kafka"
        self.meterReader.writeHouseStatus()
Example #18
class Producer(object):
    def __init__(self, addr):
        self.client = SimpleClient(addr)
        self.producer = KeyedProducer(self.client)
        self.sess = requests.Session()
        adapter = requests.adapters.HTTPAdapter(max_retries=5)
        self.sess.mount('http://', adapter)
        self.sess.mount('https://', adapter)

    def produce_msgs(self, topic, source_symbol, last_record_set):
        self.record_set = set()
        count = 0
        try:
            for item in self.r["data"]:
                self.record_set.add(item["payment_id"])
                count += 1
                if not item["payment_id"] in last_record_set:
                    message_info = "{}\n".format(json.dumps(item))
                    self.producer.send_messages(topic, source_symbol,
                                                message_info)

#                print message_info
#                print count
        except:
            k = 1

    def get_venmo(self, limit=300, page="https://venmo.com/api/v5/public?"):
        try:
            self.r = self.sess.get(page + "&limit={}".format(limit)).json()
        except:
            self.r = ""
Example #19
def create_user_producer(users: List[Tuple[str]],
                         photos: Deque[Tuple[str, str]],
                         tags: List[Tuple[str]],
                         locations: List[Tuple[str, str]],
                         producer: KeyedProducer) -> Dict[str, str]:
    """
    Produce create-user events to Kafka

    Arguments:
        users: List of users who can produce an event
        photos: Queue of recent photos and their usernames
        tags: List of company names
        locations: List of possible global lat/long coordinates
        producer: Kafka producer object to post messages

    Returns:
        Kafka message
    """
    username, full_name = fake_user()
    created_time, partition_date = get_datetime()

    record = {
        "username": username,
        "full_name": full_name,
        "created_time": created_time,
        "partition_date": partition_date,
        "event": "create-user"
    }
    producer.send_messages("create-user",
                           bytes(username, 'utf-8'),
                           json.dumps(record).encode('utf-8'))
    users.append((username,))
    return record
Example #20
class Producer(object):

    def __init__(self, addr):
        self.client = KafkaClient(addr)
        self.producer = KeyedProducer(self.client,async=True,\
                                      batch_send_every_n=500,batch_send=False)
        self.min_steps = 1
        self.max_steps = 3
        self.max_users_each_thread = 12000

    def produce_msgs(self, source_symbol):
        msg_cnt = 0
        while True:
            start_uuid = (int(source_symbol) - 1) * self.max_users_each_thread
            stop_uuid =  (int(source_symbol) * self.max_users_each_thread) - 1
            uuid = random.sample(range(start_uuid,stop_uuid), 9)
            for uid in uuid:
                timestamp = datetime.now(timezone('US/Pacific')).\
                                        strftime('%Y-%m-%d %H:%M:%S')
                steps = random.randint(1,10)
                json_msg= {'source':source_symbol,'uuid':uid, 
                           'timestamp':timestamp, 'steps': steps}
                json_encoded = json.dumps(json_msg)
                self.producer.send_messages('steps_data_part4', source_symbol,\
                                             json_encoded)
                print json_encoded
                msg_cnt += 1
Example #21
class KafkaLoggingHandler(logging.Handler):

    def __init__(self, hosts_list, topic, key=None):
        logging.Handler.__init__(self)
        self.kafka_client = KafkaClient(hosts_list)
        self.key = key
        self.kafka_topic_name = topic
        if not key:
            self.producer = SimpleProducer(self.kafka_client)
        else:
            self.producer = KeyedProducer(self.kafka_client)

    def emit(self, record):
        # drop kafka logging to avoid infinite recursion
        if record.name == 'kafka':
            return
        try:
            # use default formatting
            msg = self.format(record)
            # produce message
            if not self.key:
                self.producer.send_messages(self.kafka_topic_name, msg)
            else:
                self.producer.send(self.kafka_topic_name, self.key, msg)
        except (KeyboardInterrupt, SystemExit):
            raise
        except:
            self.handleError(record)

    def close(self):
        self.producer.stop()
        logging.Handler.close(self)
Example #22
class Producer(object):
    def __init__(self, addr):
        self.client = KafkaClient(addr)
        self.producer = KeyedProducer(self.client)

    def produce_msgs(self, source_symbol, file_source):
        hd = open(file_source)
        for line in hd:
            print line
            self.producer.send_messages('datatest', source_symbol, line)
Example #23
    def run(self, delay=0.1):
        client = KafkaClient("localhost:9092")
        producer = KeyedProducer(client)

        import numpy as np

        for photoid in TESTPHOTOIDS:
            producer.send_messages('flickr-photoid','%d'%np.random.randint(0,20) ,photoid)
            print "Sending PhotoID: %s"%photoid

            time.sleep(delay)
Example #24
class Producer(object):

    def __init__(self, addr):
        self.client = KafkaClient(addr)
        self.producer = KeyedProducer(self.client)

    def produce_msgs(self, source_symbol, file_source):
        hd = open(file_source)
        for line in hd:
            print line
            self.producer.send_messages('datatest', source_symbol, line)
Example #25
class Producer(object):
    def __init__(self, addr):
        self.client = SimpleClient(addr)
        self.producer = KeyedProducer(self.client)

    def produce_msgs(self, source_symbol, file_to_use):
        file_obj = open(file_to_use, 'r')
        msg_cnt = 0
        while True:
            message_info = file_obj.next()
            print message_info
            self.producer.send_messages('venmo2', source_symbol, message_info)
            msg_cnt += 1
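
One caveat: file_obj.next() raises StopIteration once the file is exhausted, so this loop ends with an uncaught exception rather than a clean stop. A variant method body that stops cleanly at end of file (a sketch, not the original behavior):

    def produce_msgs(self, source_symbol, file_to_use):
        with open(file_to_use, 'r') as file_obj:
            for message_info in file_obj:  # iteration ends cleanly at EOF
                self.producer.send_messages('venmo2', source_symbol, message_info)
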
Example #26
class KafkaLoggingHandler(logging.Handler):
    """
    Use kafka to send msg to elk platform
    """

    def __init__(self, hosts_list, topic,
                 timeout_secs=DEFAULT_SOCKET_TIMEOUT_SECONDS, **kwargs):
        logging.Handler.__init__(self)

        self.kafka_client = KafkaClient(hosts_list, timeout=timeout_secs)
        self.key = kwargs.get("key", None)
        self.kafka_topic_name = topic

        if not self.key:
            self.producer = SimpleProducer(self.kafka_client, **kwargs)
        else:
            self.producer = KeyedProducer(self.kafka_client, **kwargs)

    def emit(self, record):
        """
        emit record
        :param record:
        :return:
        """
        # drop kafka logging to avoid infinite recursion
        if record.name == 'kafka':
            return
        try:
            # use default formatting
            msg = self.format(record)
            msg = msg.encode("utf-8")
            # produce message
            if not self.key:
                self.producer.send_messages(self.kafka_topic_name, msg)
            else:
                self.producer.send_messages(
                    self.kafka_topic_name, self.key, msg)
        except (KeyboardInterrupt, SystemExit):
            raise
        except BaseException:
            self.handleError(record)

    def close(self):
        """
        close the client
        :return:
        """
        if self.producer is not None:
            self.producer.stop()
        logging.Handler.close(self)
Example #27
class Producer(object):

	def __init__(self, addr):
		self.client = SimpleClient(addr)
		self.producer = KeyedProducer(self.client)

	def stream_science_posts(self, key):
		r = requests.session()
		header = {"User-Agent": "anisotropix Science"}
		s = r.get('https://www.reddit.com/r/science/new/.json?limit=100', stream=True, headers=header)
		for post in s.iter_lines():
			if post:
				self.producer.send_messages('Science_posts',key,  post)
				print (post)
Example #28
class Producer(object):

    def __init__(self, addr):
        self.client = SimpleClient(addr)
        self.producer = KeyedProducer(self.client)

    def produce_msgs(self, source_symbol, file_to_use):
        file_obj = open(file_to_use, 'r')
        msg_cnt = 0
        while True:
            message_info = file_obj.next()
            print message_info
            self.producer.send_messages('venmo2', source_symbol, message_info)
            msg_cnt += 1
Example #29
class Producer(object):
    def __init__(self, addr):
        self.client = SimpleClient(addr)
        self.producer = KeyedProducer(self.client)

    def stream_science_posts(self, key):
        r = requests.session()
        header = {"User-Agent": "anisotropix Science"}
        s = r.get('https://www.reddit.com/r/science/new/.json?limit=100',
                  stream=True,
                  headers=header)
        for post in s.iter_lines():
            if post:
                self.producer.send_messages('Science_posts', key, post)
                print(post)
Example #30
class Producer(object):

    def __init__(self, addr):
        self.client = KafkaClient(addr)
        self.producer = KeyedProducer(self.client)

    def produce_msgs(self, source_symbol):
        msg_cnt = 0
        while True:
            artwork_path = "loc.txt"
            with open(artwork_path) as f1:
                for line in f1:
                    if line.strip():
                        print line.strip()
                        self.producer.send_messages('post_geo_activity', source_symbol,line.strip())
                        msg_cnt += 1
Example #31
def write():
    k_client = KafkaClient(KAFKA_URL)
    p = KeyedProducer(k_client,
                      async=False,
                      req_acks=KeyedProducer.ACK_AFTER_LOCAL_WRITE,
                      ack_timeout=2000)
    messages = []
    for i in xrange(NUM_MESSAGES):
        message = json.dumps({'msg': 'X' * SIZE_MSG})
        messages.append(message)
        if len(messages) >= 500:
            key = int(time.time() * 1000)
            p.send_messages(KAFKA_TOPIC, str(key), *messages)
            messages = []
    key = int(time.time() * 1000)
    p.send_messages(KAFKA_TOPIC, str(key), *messages)
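
write() batches 500 messages per request and flushes the remainder at the end, amortizing per-request overhead. A hedged timing driver, assuming NUM_MESSAGES, SIZE_MSG, and the other constants are defined as in Example #49 below:

import time

start = time.time()
write()
print("sent %d messages of ~%d bytes in %.2fs"
      % (NUM_MESSAGES, SIZE_MSG, time.time() - start))
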
Example #32
class Producer(object):

    def __init__(self, addr):
        self.client = SimpleClient(addr)
        self.producer = KeyedProducer(self.client)

        # The 1999 KDDCup network traffic dataset
        self.data_file = open('/home/ubuntu/opt/realtimeAnomalies/src/main/test/kddcup.testdata.unlabeled', 'r')
        self.mem_data = []
        for record in self.data_file:
            self.mem_data.append(record)

    def produce_msgs(self, source_symbol):
        random.seed()
        while True:
            idx = random.randint(0, len(self.mem_data) - 1)
            str_fmt = "{}"
            message_content = str_fmt.format(self.mem_data[idx])
            self.producer.send_messages('traffic_data', source_symbol, message_content)
Example #33
class Producer(object):
    def __init__(self, addr):
        self.client = SimpleClient(addr)
        self.producer = KeyedProducer(self.client)
        self.timezone = timezone('EST')

    def name_generator(self):
        return ''.join(
            random.choice('0123456789ABCDEFGHIJKLMNOPQRSTUVWZ')
            for i in range(random.randint(3, 9)))

    def item_generator(self):
        global item_lists
        return random.choice(item_lists)

    def produce_msgs(self):
        msg_cnt = 0
        auction_id = 0
        while True:
            auction_id += 1
            #Create time EST time
            create_time = datetime.now(
                self.timezone).strftime("%Y-%m-%d %H:%M:%S")
            #Auctioner ID
            auctioner_id = random.randint(0, 100000)
            #Expiry: 2 hours to 4 days
            auction_type = random.randint(2, 96)
            #Starting price: 1 cent to $100
            starting_price = random.uniform(0.01, 100.0)
            #Auctioner name generator
            auctioner_name = self.name_generator()
            #Item generator
            item = self.item_generator()

            str_fmt = "{};{};{};{};{};{};{}"
            message_info = str_fmt.format(auction_id, create_time,
                                          auctioner_id, auction_type,
                                          round(starting_price,
                                                2), auctioner_name, item)
            print message_info
            self.producer.send_messages('auctions', str(random.randint(0, 4)),
                                        message_info)
            msg_cnt += 1
Example #34
class Producer(object):

    def __init__(self, addr):
        self.client = KafkaClient(addr)
        self.producer = KeyedProducer(self.client)

    def open_save(self, fileName):
        # Note: this truncates fileName and returns a *closed* file handle;
        # produce_msgs below re-opens the log file itself in append mode.
        log_file = open(fileName, "w")
        log_file.close()
        return log_file

    def create_topic(self, topic):
        script = "/usr/local/kafka/bin/kafka-topics.sh"
        os.system("{} --create --zookeeper localhost:2181 --topic {} --partitions {} --replication-factor 2".format(script, topic, "4"))
        return "topic {} created".format(topic)

    def produce_msgs(self, source_symbol, topic):
        server_topics = self.client.topic_partitions
        if topic not in server_topics:
          self.create_topic(topic)
        price_field = random.randint(800,1400)
        cities = ["Barcelona", "Philadelphia", "Honolulu",
                  "Atlanta", "Miami", "Chicago", "SF", "LA", "NYC",
                  "Houston", "Paris", "London", "Tokyo"]
        msg_cnt = 0
        log_file = open("input1/{}.csv".format(topic), "a")
        while True:
            time_field = datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S.%f')
            location_field = random.choice(cities)
            price_field += random.randint(-10, 10)/10.0
            str_fmt = "{},{},{},{}"
            message_info = str_fmt.format(source_symbol,
                                          time_field,
                                          location_field,
                                          price_field)
            print message_info
            log_file.write("{}\n".format(message_info))
            self.producer.send_messages(topic, source_symbol, message_info)
            msg_cnt += 1
            if msg_cnt > 200000:
                log_file.close()
                self.producer.stop()
                break
Example #35
class Producer(object):
    def __init__(self, addr):
        self.client = SimpleClient(addr)
        self.producer = KeyedProducer(self.client)

    def produce_msgs(self, source_symbol):
        price_field = random.randint(800, 1400)
        msg_cnt = 0
        while True:
            time_field = datetime.now().strftime("%Y%m%d %H%M%S")
            price_field += random.randint(-10, 10) / 10.0
            volume_field = random.randint(1, 1000)
            str_fmt = "{};{};{};{}"
            message_info = str_fmt.format(source_symbol, time_field,
                                          price_field, volume_field)
            print message_info
            self.producer.send_messages('price_data_part4', source_symbol,
                                        message_info)
            msg_cnt += 1
Example #36
class Producer(object):

    # Initialization for the class with address
    def __init__(self, addr):

        self.client = SimpleClient(addr)
        self.producer = KeyedProducer(self.client)
        self.topic = 'ajay_test_topic'

    # Main method for simulation
    def produce_msgs(self, source_symbol):

        # Generate some random data
        price_field = random.randint(800, 1400)
        # Count the messages in the tunnel
        msg_cnt = 0

        # Loop for the feilds
        while True:

            # Get a random time value
            time_field = datetime.now().strftime("%Y%m%d %H%M%S")
            # Get a random price value
            price_field += random.randint(-10, 10) / 10.0
            # Get a random volume feild
            volume_field = random.randint(1, 1000)

            # Format your string
            str_fmt = "{};{};{};{}"

            # Create the message
            message_info = str_fmt.format(source_symbol, time_field,
                                          price_field, volume_field)

            # Print for debug
            print message_info

            # Send the message
            self.producer.send_messages(self.topic, source_symbol,
                                        message_info)

            # Messages count
            msg_cnt += 1
Example #37
class Producer(object):
    def __init__(self, addr=None):
        self.isNone = True
        if addr is not None:
            self.client = SimpleClient(addr)
            self.producer = KeyedProducer(self.client)
            self.isNone = False

    def produce_msgs(self, source_symbol):
        random = Random(0)
        msg_cnt = 0
        start = 50
        for i in range(100):  #for observation groups 13 through 13+range
            #time.sleep(10) #waits between observation groups
            for x in range(
                    3000
            ):  #1500 means about 1000 per obs because there are 4 producers
                time.sleep(
                    0.00001
                )  # 0.2 waits this many seconds before producing another message about 1000 each obs each 5  min
                self.observationgroup_field = random.randint(
                    start + i, start + i)
                self.observationorder_field = random.randint(1, 6)
                self.frequency_field = random.random() * 10000
                self.snr_field = random.random() * 100
                self.driftrate_field = random.random() - random.random()
                self.uncorrectedfrequency_field = random.random(
                ) - random.random() + self.frequency_field
                str_fmt = "{};{};{};{};{};{};{}"
                message_info = str_fmt.format(
                    source_symbol, self.observationgroup_field,
                    self.observationorder_field, self.frequency_field,
                    self.snr_field, self.driftrate_field,
                    self.uncorrectedfrequency_field)
                if not self.isNone:
                    self.producer.send_messages('gbthits', source_symbol,
                                                message_info)
                else:
                    break
                msg_cnt += 1
            if self.isNone:
                break
Example #38
class Producer(object):
    def __init__(self, addr):
        self.client = SimpleClient(addr)
        self.producer = KeyedProducer(self.client)

    def produce_msgs(self, source_symbol):
        price_field = random.randint(800,1400)
        msg_cnt = 0
        while True:
            time_field = datetime.now().strftime("%Y%m%d %H%M%S")
            price_field += random.randint(-10, 10)/10.0
            volume_field = random.randint(1, 1000)
            str_fmt = "{};{};{};{}"
            message_info = str_fmt.format(source_symbol,
                                          time_field,
                                          price_field,
                                          volume_field)
            print message_info
            self.producer.send_messages('price_data_part4', source_symbol, message_info)
            msg_cnt += 1
Example #39
class KafkaLoggingHandler(logging.Handler):

    def __init__(self, hosts_list, topic, **kwargs):
        logging.Handler.__init__(self)

        self.kafka_client = SimpleClient(hosts_list)
        self.key = kwargs.get("key", None)
        self.kafka_topic_name = topic

        if not self.key:
            self.producer = SimpleProducer(self.kafka_client, **kwargs)
        else:
            self.producer = KeyedProducer(self.kafka_client, **kwargs)

    def emit(self, record):
        # drop kafka logging to avoid infinite recursion
        if record.name == 'kafka':
            return
        try:
            # use default formatting
            msg = self.format(record)
            if isinstance(msg, unicode):
                msg = msg.encode("utf-8")

            # produce message
            if not self.key:
                self.producer.send_messages(self.kafka_topic_name, msg)
            else:
                self.producer.send_messages(self.kafka_topic_name, self.key,
                                            msg)
        except (KeyboardInterrupt, SystemExit):
            raise
        except Exception:
            self.handleError(record)

    def close(self):
        if self.producer is not None:
            self.producer.stop()
        logging.Handler.close(self)
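
This variant accepts the key through **kwargs and then uses the three-argument send_messages(topic, key, msg) path. A keyed usage sketch; the host, topic, and key bytes are assumptions:

import logging

handler = KafkaLoggingHandler("localhost:9092", "app_logs", key=b"service-1")
logging.getLogger("service").addHandler(handler)
logging.getLogger("service").error("routed by key to a single partition")
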
Example #40
class Producer(object):
    def __init__(self, addr, group_id):
        self.client = SimpleClient(addr)
        self.producer = KeyedProducer(self.client)
        self.group_id = group_id

    def produce_msgs(self, source_file):
        with open(source_file, 'r') as f:
            lines = f.readlines()
        start_time = datetime.now()
        num_lines = 0
        line_inx = 0
        max_inx = len(lines)

        while line_inx < max_inx:
            token = lines[line_inx].strip().split()
            line_inx = line_inx % max_inx

            if token[2] != 'NaN':
                for num in range(1000):
                    user_id = "user_%s_%s" % (self.group_id, num)
                    event_time = (
                        start_time +
                        timedelta(0, num_lines)).strftime('%Y-%m-%d %H:%M:%S')
                    hr = int(token[2]) + randint(0, 4) - 2
                    msg = {'id': user_id, 'time': event_time, 'hr': hr}
                    json_msg = json.dumps(msg)
                    print json_msg
                    self.producer.send_messages('sensor', str(self.group_id),
                                                json_msg)
                line_inx += 1
                num_lines += 1
                line_inx = line_inx % max_inx
                time.sleep(2)

            line_inx += 1
            num_lines += 1
            line_inx = line_inx % max_inx
Example #41
class Producer(object):
    '''
    Messages are sent to a single Kafka topic ('Friendsquare1' below) as a JSON-formatted string.
    '''
    def __init__(self, addr, userslist, venueslist):
        self.client = SimpleClient(addr)
        self.producer = KeyedProducer(self.client)
        self.userslist = userslist[0:500000]
        self.venueslist = venueslist[0:250000]

    def produce_msgs(self, partitionkey):
        new_time = datetime.now()
        msg_cnt = 0
        while True:
            if ((msg_cnt % 4000) != 0):
                userid = int(random.choice(self.userslist))
                venueid = int(random.choice(self.venueslist))
            else:
                userid = int(random.choice(self.userslist[0:500000]))
                venueid = int(random.choice(self.venueslist[0:250000]))
            rating = random.randint(0, 5)
            randomdelta = np.random.normal(3, 3, 1)[0]
            new_time += timedelta(seconds=randomdelta)
            created_time = new_time.strftime("%Y-%m-%d %H:%M:%S")
            message_info = {
                'partitionkey': partitionkey,
                'userid': userid,
                'venueid': venueid,
                'created_at': created_time,
                'rating': rating
            }
            msg_info = json.dumps(message_info)
            print message_info
            self.producer.send_messages('Friendsquare1', partitionkey,
                                        msg_info)
            msg_cnt += 1
            time.sleep(0.01)
Example #42
def create_photo_producer(users: List[Tuple[str]], photos: Deque[Tuple[str,
                                                                       str]],
                          tags: List[Tuple[str,
                                           str]], locations: List[Tuple[str,
                                                                        str]],
                          producer: KeyedProducer) -> Dict[str, str]:
    """
    Produce photo-upload events to Kafka

    Arguments:
        users: List of users who can produce an event
        photos: Queue of recent photos and their usernames
        tags: List of company names
        locations: List of possible global lat/long coordinates
        producer: Kafka producer object to post messages

    Returns:
        Kafka message
    """
    user = random.choice(users)[0]
    tag, link = random.choice(tags)
    latitude, longitude = random.choice(locations)
    created_time, partition_date = get_datetime()
    record = {
        "username": user,
        "tags": tag,
        "photo_link": link,
        "created_time": created_time,
        "partition_date": partition_date,
        "latitude": latitude,
        "longitude": longitude,
        "event": "photo-upload"
    }
    producer.send_messages('photo-upload', bytes(user, 'utf-8'),
                           json.dumps(record).encode('utf-8'))
    photos.append((created_time, user))
    return record
Example #43
def comment_producer(users: List[Tuple[str]], photos: Deque[Tuple[str, str]],
                     tags: List[Tuple[str]], locations: List[Tuple[str, str]],
                     producer: KeyedProducer) -> Optional[Dict[str, str]]:
    """
    Produce comment events to Kafka

    Arguments:
        users: List of users who can produce an event
        photos: Queue of recent photos and their usernames
        tags: List of company names
        locations: List of possible global lat/long coordinates
        producer: Kafka producer object to post messages

    Returns:
        Kafka message
    """
    if not photos:
        return None
    follower = random.choice(users)[0]
    photo, followee = random.choice(photos)
    text = get_text()
    created_time, partition_date = get_datetime()

    if not all([photo, follower, followee]):
        return None
    record = {
        "follower_username": follower,
        "followed_username": followee,
        "photo_id": photo,
        "text": text,
        "created_time": created_time,
        "partition_date": partition_date,
        "event": "comment"
    }
    producer.send_messages("comment", bytes(followee, 'utf-8'),
                           json.dumps(record).encode('utf-8'))
    return record
Example #44
class Producer(object):
    def __init__(self, addr):
        self.client = SimpleClient(addr)
        self.producer = KeyedProducer(self.client)

    def produce_msgs(self, source_symbol):
        price_field = random.randint(800, 1400)
        msg_cnt = 0
        category_product = [('furniture', 'cat bed'),
                            ('dog food', 'purina dog biscuits'),
                            ('cat food', 'fancy feast 8oz'),
                            ('cleaning', 'roomba')]

        while True:

            rand_datetime = radar.random_datetime(start=datetime(year=2016,
                                                                 month=5,
                                                                 day=24),
                                                  stop=datetime(year=2017,
                                                                month=2,
                                                                day=1))
            time_field = rand_datetime.strftime("%Y%m%d %H%M%S")
            price_field += random.randint(-10, 10) / 10.0
            product_cat_listid = random.randint(0, 2)
            customer_field = random.randint(1, 10000)
            product_field = category_product[product_cat_listid][1]
            category_field = category_product[product_cat_listid][0]
            volume_field = random.randint(1, 10)
            str_fmt = "{};{};{};{};{};{};{}"
            message_info = str_fmt.format(source_symbol, time_field,
                                          price_field, volume_field,
                                          customer_field, product_field,
                                          category_field)
            print message_info
            self.producer.send_messages('transactiondata', source_symbol,
                                        message_info)
            msg_cnt += 1
Example #45
class KafkaLoggingHandler(logging.Handler):

    def __init__(self, hosts_list, topic, timeout_secs=DEFAULT_SOCKET_TIMEOUT_SECONDS, **kwargs):
        logging.Handler.__init__(self)

        self.kafka_client = KafkaClient(hosts_list, timeout=timeout_secs)
        self.key = kwargs.get("key", None)
        self.kafka_topic_name = topic

        if not self.key:
            self.producer = SimpleProducer(self.kafka_client, **kwargs)
        else:
            self.producer = KeyedProducer(self.kafka_client, **kwargs)
        self.addFilter(KafkaLoggingFilter())

    def emit(self, record):
        try:
            # use default formatting
            msg = self.format(record)
            if isinstance(msg, unicode):
                msg = msg.encode("utf-8")

            # produce message
            if not self.key:
                self.producer.send_messages(self.kafka_topic_name, msg)
            else:
                self.producer.send_messages(self.kafka_topic_name, self.key, msg)
        except (KeyboardInterrupt, SystemExit):
            raise
        except:
            self.handleError(record)

    def close(self):
        if self.producer is not None:
            self.producer.stop()
        logging.Handler.close(self)
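
This handler registers a KafkaLoggingFilter instead of checking record.name inside emit(). The filter class is not shown in this listing; a minimal sketch of what it plausibly does, mirroring the record.name == 'kafka' recursion guard the other handlers use (an assumption, not the original class):

import logging

class KafkaLoggingFilter(logging.Filter):
    def filter(self, record):
        # Drop records emitted by the kafka client itself, so logging
        # through Kafka cannot recurse.
        return not record.name.startswith('kafka')
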
Example #46
class KafkaHandler(logging.Handler):
    """
    publish message to kafka
    """
    def __init__(self, topic, producer_type=ProducerType.SIMPLE,\
            host_port="127.0.0.1:9092", **producer_opts):

        logging.Handler.__init__(self)
        self.topic = topic
        self.host_port = host_port
        if producer_type == ProducerType.SIMPLE:
            self.producer = SimpleProducer(KafkaClient(host_port),\
                    **producer_opts)
        else:
            self.producer = KeyedProducer(KafkaClient(host_port),\
                    **producer_opts)

    def emit(self, record):
        try:
            response = self.producer.send_messages(self.topic,\
                    self.format(record))
        except:
            raise
Example #48
    def test_switch_leader_keyed_producer(self):
        topic = self.topic

        producer = KeyedProducer(self.client, async=False)

        # Send 10 random messages
        for _ in range(10):
            key = random_string(3)
            msg = random_string(10)
            producer.send_messages(topic, key, msg)

        # kill leader for partition 0
        self._kill_leader(topic, 0)

        recovered = False
        started = time.time()
        timeout = 60
        while not recovered and (time.time() - started) < timeout:
            try:
                key = random_string(3)
                msg = random_string(10)
                producer.send_messages(topic, key, msg)
                if producer.partitioners[kafka_bytestring(topic)].partition(key) == 0:
                    recovered = True
            except (FailedPayloadsError, ConnectionError):
                logging.debug("caught exception sending message -- will retry")
                continue

        # Verify we successfully sent the message
        self.assertTrue(recovered)

        # send some more messages just to make sure no more exceptions
        for _ in range(10):
            key = random_string(3)
            msg = random_string(10)
            producer.send_messages(topic, key, msg)
Example #49
from kafka.client import SimpleClient as KafkaClient
from kafka.producer import KeyedProducer

import json
import time


KAFKA_URL = '192.168.10.6:9092'
KAFKA_GROUP = 'kafka_python_perf'
KAFKA_TOPIC = 'raw-events'

NUM_MESSAGES = 10
SIZE_MSG = 369

k_client = KafkaClient(KAFKA_URL)
p = KeyedProducer(k_client,
                  async=False,
                  req_acks=KeyedProducer.ACK_AFTER_LOCAL_WRITE,
                  ack_timeout=2000)
messages = []
while 1:
    for i in xrange(NUM_MESSAGES):
        message = json.dumps({'msg': 'X' * SIZE_MSG})
        messages.append(message)
        if len(messages) >= 500:
            key = int(time.time() * 1000)
            p.send_messages(KAFKA_TOPIC, str(key), *messages)
            messages = []
            print("wrote 500")
    time.sleep(1)
Example #50
# Select a random piece of news from the collected set of tweets and send it as a Kafka message

DATADIR="/home/ubuntu/synthetic_twitter/"
KAFKA_NODE="ec2-54-215-247-116.us-west-1.compute.amazonaws.com"
KAFKA_TOPIC="twitter"
os.chdir(DATADIR)
files = glob.glob("*.archive")

#select a random company
datafile = random.choice(files)
datafile = DATADIR + datafile
#select a random line in data file for news
#R(3.4.2) (Waterman's "Reservoir Algorithm")
data = open(datafile, "r")
line = next(data)
for num, nextline in enumerate(data):
    if random.randrange(num + 2): continue
    line = nextline

data.close()

#add "Synthetic Twitter" as news outlet
line = line.rstrip().replace('}', ', "newsoutlet":"Synthetic Twitter"}')
#add timestamp
line = line.replace('}', ',"newstime":"' + time.strftime("%c") + '"}')

#Create producer and send message
client = KafkaClient(KAFKA_NODE)
producer = KeyedProducer(client)
producer.send_messages('twitter', str(hash(line) % 2), line)
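
The single pass over the data file is Algorithm R (reservoir sampling with a reservoir of one): the k-th line replaces the kept line with probability 1/k, which leaves every line equally likely to survive. The same idea in isolation, as a standalone sketch:

import random

def random_line(path):
    with open(path) as f:
        choice = next(f)
        for k, line in enumerate(f, start=2):
            if random.randrange(k) == 0:  # keep the k-th line with probability 1/k
                choice = line
    return choice
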
Example #52
class Producer(object):

    def __init__(self, addr):
        self.client = KafkaClient(addr)
        self.producer = KeyedProducer(self.client)

    def open_save(self, fileName):
        log_file = open(fileName, "w")
        log_file.close()
        return log_file

    def produce_msgs(self, source_symbol, topic, items):
      print datetime.now()

      sample = []
      saved = checkCassandra()

      for item in items:
          listing = []
          log_file = open("justsoldonebay.csv", "a")

          if item.sellingStatus.sellingState == "EndedWithSales":
              gender = get_category(item.primaryCategory.categoryId)
              shoe = get_shoe(item.title.lower())

              if item.itemId in saved:
                print item.itemId, item.title
              else:

                zip_code2 = ""
                try:
                    zip_code = item.postalCode
                    try:
                        int(zip_code)
                    except:
                        zip_code = 0
                        # save it elsewhere
                        zip_code2 = item.postalCode
                except: 
                    zip_code = 0

                try:
                    location = item.location
                except: 
                    location = "0"

                try:
                    gallery = item.galleryURL
                except:
                    gallery = "NA"

                
                price = item.sellingStatus.convertedCurrentPrice.value
                pprice = "${}{}".format(price, item.sellingStatus.convertedCurrentPrice._currencyId)  # display string (not used below)
                #print item.listingInfo.startTime, item.itemId, shoe, price, zip_code, gender #, item.title
                start = str(item.listingInfo.startTime)
                end = str(item.listingInfo.endTime)
                listing = [start, end, item.viewItemURL,
                           item.itemId, shoe, price, zip_code, gender, gallery,
                           location, item.title, zip_code2]
                sample.append(listing)


      print "done building q"
      msg_cnt = 0
      while True:
        str_fmt = "{};{};{};{};{};{};{};{};{};{};{};{}"
        x = sample[msg_cnt]
        message_info = str_fmt.format(x[0], x[1], x[2],
                                      x[3], x[4], x[5],
                                      x[6], x[7], x[8],
                                      x[9], x[10], x[11])
        print message_info
        log_file.write(str(len(x)) +" "+message_info+ "\n")
        self.producer.send_messages('justsoldonebay', "1", message_info)
        msg_cnt += 1
        if msg_cnt == len(sample):
          break
class Producer(object):

    def __init__(self, addr):
        self.client = KafkaClient(addr)
        self.producer = KeyedProducer(self.client)

    def produce_house_centric_msgs(self, source_symbol, topic):
        # Declare variables
        timing_list = []
    
        # Read throughput timing distribution
        with open('data/throughput/throughput_timing.txt', 'rU') as infile:
            for line in infile :
                timing_list.append(line.strip())

        # Run Forever
        while True:
            random_historical_data = random.randint(0, 85)  # Pick random historical data - there are 86 time points
            
            zipcode_sales_dict = parse_prefilter_data('data/tri_zipcode_sales/%s.txt'%random_historical_data)
            zipcode_price_dict = parse_prefilter_data('data/tri_zipcode_price/%s.txt'%random_historical_data)
        
            random_zipcode_sales_list = build_random_zipcode_list(zipcode_sales_dict)
            distribution_zipcode_sales_list = build_distribution_based_zipcode_list(zipcode_sales_dict)
        
            t_end = time.time() + 60 * 60 # every hour
        
            while time.time() < t_end:
                time_field = datetime.now().strftime("%Y%m%d-%H%M%S")
                user_id_field = 0
                random_number = random.randint(0, (len(random_zipcode_sales_list)-1)) 
                user_zipcode = random_zipcode_sales_list[random_number]
                house_field = emit_random_zipcode_price(distribution_zipcode_sales_list, zipcode_price_dict)
                str_fmt = """{{"timestamp":"{}","user":{{"id":"{}","zipcode":"{}"}},"house":{}}}"""
                message_info = str_fmt.format(time_field, user_id_field, user_zipcode, house_field)

                print message_info
                self.producer.send_messages(topic, source_symbol, message_info)

                if float(timing_list[random_historical_data]) != 0:
                    time.sleep(float(timing_list[random_historical_data]))

    def produce_user_centric_msgs(self, source_symbol, topic):
        user_id_list = ('1', '2')
        user_zipcode_list = ('10461', '07304')
        house_zipcode_list = ('10545', '07304')
        price_list = ('315789', '299679')

        # Run Forever
        while True:
            i = random.randint(0, 1)  # Pick random user

            t_end = time.time() + 6 * 60 * 60 # stick with this user for 6 hours
    
            while time.time() < t_end:
                time_field = datetime.now().strftime("%Y%m%d-%H%M%S")
                user_id_field = user_id_list[i]
                user_zipcode = user_zipcode_list[i]
                # jitter the base price between -30% and +50%
                base_price = float(price_list[i])
                another_random_number = random.randint(int(base_price * 0.7), int(base_price * 1.5))
                house_field = '{"zipcode":"%s","price":"%s"}' % (house_zipcode_list[i], another_random_number)
                str_fmt = """{{"timestamp":"{}","user":{{"id":"{}","zipcode":"{}"}},"house":{}}}"""
                message_info = str_fmt.format(time_field, user_id_field, user_zipcode, house_field)

                print message_info
                self.producer.send_messages(topic, source_symbol, message_info)

                time.sleep(30) # send message every 30 seconds
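One note on the str_fmt templates above: the doubled braces exist only to escape str.format. A sketch of an alternative (the build_message helper is hypothetical, not from the original class) that assembles the same payload with json.dumps and so guarantees valid JSON:

import json
from datetime import datetime

def build_message(user_id, user_zipcode, house):
    # house is a dict here rather than a pre-rendered JSON string
    payload = {
        "timestamp": datetime.now().strftime("%Y%m%d-%H%M%S"),
        "user": {"id": str(user_id), "zipcode": user_zipcode},
        "house": house,
    }
    return json.dumps(payload)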
Example n. 54
0
class Producer(object):
    
    _msg_cnt = 0
    def __init__(self, addr):
        print "Trying connection..."
        self.client = KafkaClient(addr)
        self.producer = KeyedProducer(self.client)
        print "Made connection with host: {}".format(addr)
        self._last_update = datetime.utcnow() # For latest deals
        self._more_pages = 10
        self._chunk_size = 10

    def produce_deal_urls(self, url, topic, partition_key, max_deals_per_page=100, initial_visit=True):
        ''' Constantly produce deal urls for consumers to crawl '''
        if not initial_visit: # Build the URL for an incremental crawl
            # only request deals updated since the last visit to this category
            checked_last = self._last_update.strftime("%Y-%m-%dT%H:%M:%S%Z")
            url = '{};updated_after={}'.format(url, checked_last)
        req = self.fetch_request(url)
        if req.ok:
            # Calculate number of pages to crawl
            # Max 100 per page, crawl total//100
            try:
                total_category = req.json()['query']['total'] 
                if total_category > 0:
                    num_pages_to_fetch = ((total_category / max_deals_per_page) + 1)
                    '''
                        Produce category page ranges for consumers.
                        Crawl extra pages to account for the API changing between
                        visits; pages with no deals are filtered out by the consumer.
                        Recommended approaches to choosing a partition count:
                        1. max(t/p, t/c) partitions, where
                            - t: required throughput
                            - p: production speed per partition
                            - c: consumption speed per partition
                        2. Rule of thumb - at most ~100 * b * r partitions per cluster
                            - b: # of brokers in the cluster
                            - r: replication factor
                        Message format: {category_slug; start_page; end_page}
                    '''
                    total_pages = range(1, num_pages_to_fetch + self._more_pages)
                    page_chunks = list(self.yield_chunks(total_pages, self._chunk_size))
                    for chunk in page_chunks:
                        time_stamp = datetime.now().strftime("%Y-%m-%dT%H:%M:%S%Z")
                        msg = '{} => {} => {}'.format(time_stamp, 
                                                      url,
                                                      chunk)
                        print msg
                        self.producer.send_messages(topic, str(partition_key), msg)
                        self.__class__._msg_cnt += 1
                self._last_update = datetime.utcnow()
            except simplejson.scanner.JSONDecodeError:
                pass
            
    def produce_deal_full_data(self):
        time_stamp = datetime.now().strftime("%Y-%m-%dT%H:%M:%S%Z")
        pass
    
    def get_total_msg_prod(self):
        ''' Returns how many messages all instances of producer sent '''
        return self.__class__._msg_cnt
    
    def fetch_request(self, url):
        ''' Return url to endpoint '''
        return rq.get(url)
    
    def yield_chunks(self, int_list, num):
        ''' Yield successive chunks of size num from lists '''
        for idx in xrange(0, len(int_list), num):
            yield int_list[idx:idx+num]
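To make the two sizing rules from the docstring concrete, a quick back-of-the-envelope calculation (every throughput figure below is an illustrative assumption, not a measurement from this project):

t = 100.0  # required throughput, MB/s
p = 10.0   # production speed per partition, MB/s
c = 20.0   # consumption speed per partition, MB/s
print(int(max(t / p, t / c)))  # -> 10 partitions for this topic

b = 3  # brokers in the cluster
r = 2  # replication factor
print(100 * b * r)  # -> ~600 partitions as a rough per-cluster ceiling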
class Producer(object):

    def __init__(self, addr):
        self.client = KafkaClient(addr)
        self.producer = KeyedProducer(self.client)
        self.minLat = 40.708751 
        self.maxLat = 40.802895
        self.minLong = -74.025879
        self.maxLong = -73.930435
        self.counter_start = 0
        # self.locs = [(40.75280785, -73.97546422),(40.73988115,-73.98711691),(40.76105171, -73.96962834),\
        #                 (40.75790096,-73.97578395),(40.75833353,-74.00436092),(40.74496999,-73.97087089),\
        #                 (40.76088942,-73.97008963),(40.75494802,-73.96084512),(40.73754566,-73.98306014),\
        #                 (40.76804075,-73.98086881),(40.73795777,-73.97972054),(40.75311322,-73.99081106),\
        #                 (40.76445038,-73.9693873),(40.75204099,-73.99041951),(40.75705723,-73.98304045),\
        #                 (40.74984862,-73.98108846),(40.73641334,-73.99263483),(40.74022644,-73.97511118),\
        #                 (40.74081696,-73.99869147),(40.75155827,-73.97809876),(40.7979499,-73.93799602),\
        #                 (40.78487376,-73.9488285),(40.78891306,-73.96322338),(40.80932537,-73.95927604),\
        #                 (40.79512142,-73.97732225),(40.78566559,-73.94358666),(40.80024399,-73.96799964),\
        #                 (40.78788311,-73.97040765),(40.80434947,-73.93874699),(40.80183406,-73.96247845),\
        #                 (40.80595751,-73.95441724),(40.80650874,-73.96646741),(40.7931067,-73.9413598),\
        #                 (40.81627861,-73.95581725),(40.80999546,-73.96029616),(40.81289571,-73.95471676),\
        #                 (40.81689372,-73.93035378),(40.81309684,-73.92121306), (40.8096491,-73.93651239)]
        self.available = [[8,9,10,17,18,19,20,21],[8,9,10,11,16,17,18,19,20],[8,9,15,16,17,18],
                          [14,15,16,17,18,19,20,21,22],[10,11,12,22,23,0],[12,13,14,15,16],
                          [19,20,21,22,23,0],[0,1,2],[8,11,13,15,17,19,21,22,23],[8,2],
                          [0,1,2,3,4,5,6,7,8,9,10,12],[13,14,15,16,17,18,19,20,21,22,23]]

    def produce_msgs(self, name):
        # location_index = random.randint(0, len(self.locs))               
        # latitude = self.locs[location_index][0]
        # longitude = self.locs[location_index][1]
        lat_frac = random.random()
        long_frac = random.random()               
        latitude = lat_frac*self.minLat + (1-lat_frac)*self.maxLat
        longitude = long_frac*self.minLong + (1-long_frac)*self.maxLong
        schedule = self.available[random.randint(0,len(self.available)-1)]
        steps = self.counter_start
        while True:
            direction = random.randint(0, 4)  # 0/2 nudge latitude, 1/3 nudge longitude, 4 stays put
            t = datetime.now().strftime("%Y%m%d %H%M%S")
            hr = datetime.now().hour
            if hr == 0:
                steps = self.counter_start
            availability = hr in schedule
            if direction == 0:
                steps +=1
                if latitude >= self.maxLat:
                    latitude = latitude - 0.00001124152
                else:
                    latitude = latitude + 0.00001124152
            elif direction == 1:
                steps +=1
                if longitude >= self.maxLong:
                    longitude = longitude - 0.00001124152
                else:
                    longitude = longitude + 0.00001124152
            elif direction == 2:
                steps +=1
                if latitude <= self.minLat:
                    latitude = latitude + 0.00001124152
                else:
                    latitude = latitude - 0.00001124152
            elif direction == 3:
                steps += 1
                if longitude <= self.minLong:  # was: latitude <= self.minLong
                    longitude = longitude + 0.00001124152
                else:
                    longitude = longitude - 0.00001124152
            else:
                pass
            
            str_fmt = "{};{};{};{};{};{}"
            message_info = str_fmt.format("user_"+str(name),
                                          t,
                                          latitude,
                                          longitude,
                                          availability,
                                          steps)
            print message_info
            self.producer.send_messages('b', name, message_info)
            time.sleep(0.1)
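For a sense of scale (an aside, not original code): one degree of latitude spans roughly 111.32 km, so each 0.00001124152-degree step moves the simulated user about 1.25 m.

step_deg = 0.00001124152
meters_per_degree_lat = 111320.0
print(step_deg * meters_per_degree_lat)  # ~1.25 m per step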