def process(time, lines):
    """match user with bidder
    Input:
        lines: (ts string, uid string, topic vector)
    """
    print("========= %s =========" % str(time))
    sqlContext = getSqlContextInstance(lines.context)
    # calculate user-product correlation table
    runningWindow = lines.map(lambda (k, v): ((k[0], str(time)), v))\
        .reduceByKey(lambda x, y: x + y)\
        .map(lambda (x, u): [(x, y, float(u.dot(v) / (norm(u) * norm(v)))) for (y, v) in bv.value])\
        .flatMap(lambda x: x)\
        .filter(lambda (x, y, s): s > .97)
    rowRDD = runningWindow.map(
        lambda x: Row(uid=x[0][0], pid=x[1], score=x[2], ts=x[0][1]))
    # print(rowRDD.take(10))
    print("========= %d =========" % rowRDD.count())
    if (rowRDD.count() > 0):
        client = SimpleClient(KAFKA_NODE)
        producer = KeyedProducer(client)
        for row in rowRDD.collect():
            line = '{ "tick" :"' + str(time.isoformat()) + '",'
            line += ' "uid" :"' + str(row['uid']) + '",'
            line += ' "score":"' + str(row['score']) + '",'
            line += ' "pid":' + str(row['pid']) + '}'
            # print(line)
            producer.send_messages(KAFKA_TOPIC, str(hash(line)), line)
class Producer(object):

    def __init__(self, addr):
        self.client = KafkaClient(addr)
        self.producer = KeyedProducer(self.client)

    def produce_msgs(self, source_symbol):
        #price_field = random.randint(800,1400)
        msg_cnt = 0
        datagenerator = DataGenerator()
        function_options = {
            0: datagenerator.click_event,
            1: datagenerator.view_event,
            2: datagenerator.bid_event,
            3: datagenerator.hover_event,
            4: datagenerator.load_event
        }
        while True:
            #time_field = datetime.now().strftime("%Y%m%d %H%M%S")
            #price_field += random.randint(-10, 10)/10.0
            #volume_field = random.randint(1, 1000)
            #str_fmt = "{};{};{};{}"
            #message_info = str_fmt.format(source_symbol, time_field, price_field, volume_field)
            num = random.randint(0, 4)
            message_info = function_options[num]()
            print json.dumps(message_info)
            self.producer.send_messages('test_adability', source_symbol, message_info)
            msg_cnt += 1
class Producer(object):

    def __init__(self, addr):
        self.client = KafkaClient(addr)
        self.producer = KeyedProducer(self.client)
        self.zipcode = []
        self.complaint = []

    def load_ids(self):
        zipcode_path = "/home/ubuntu/repos/project311/kafka/zipcodes.txt"
        complaint_path = "/home/ubuntu/repos/project311/kafka/complaint_type.txt"
        with open(zipcode_path, 'r') as f1:
            for line in f1:
                if line != "":
                    self.zipcode.append(line.strip())
        with open(complaint_path) as f2:
            for line in f2:
                if line != "":
                    self.complaint.append(line.strip())

    def produce_msgs(self, source_symbol):
        msg_cnt = 0
        while True:
            time_field = datetime.now().strftime("%Y%m%d%H%M%S")
            zipcode_field = random.choice(self.zipcode)
            complaint_field = random.choice(self.complaint)
            str_fmt = "{};{};{};{}"
            message_info = str_fmt.format(source_symbol, time_field, zipcode_field, complaint_field)
            print message_info
            self.producer.send_messages('complaints', source_symbol, message_info)
            msg_cnt += 1
def process(time, lines):
    """match user with bidder
    Input:
        lines: (ts string, uid string, {pid:score} dict)
    """
    print("========= %s =========" % str(time))
    sqlContext = getSqlContextInstance(lines.context)
    # calculate user-product correlation table
    rowRDD = lines.map(lambda x: (x['uid'], matchBids(x['score'])))\
        .map(lambda x: Row(uid=x[0], pid=x[1][0], price=x[1][1]))
    print("========= %d =========" % rowRDD.count())
    if (rowRDD.count() > 0):
        # send to kafka
        client = SimpleClient(KAFKA_NODE)
        producer = KeyedProducer(client)
        for row in rowRDD.collect():
            line = '{ "pid" :"' + str(row['pid']) + '",'
            line += ' "uid" :"' + str(row['uid']) + '",'
            line += ' "price":"' + str(row['price']) + '",'
            line += ' "ts":"' + str(time) + '"}'  # quote the timestamp so the emitted string is valid JSON
            # print(line)
            producer.send_messages(KAFKA_TOPIC, str(hash(line)), line)
        # save to cassandra
        rowRDD.map(lambda x: Row(pid=x['pid'], ts=str(time), price=x['price']))\
            .toDF().write\
            .format("org.apache.spark.sql.cassandra")\
            .options(table='winningbid10s', keyspace='ad_flow')\
            .save(mode="append")
def process(time, lines):
    """Calculate user-product corr table and select ad-push events
    Input:
        lines: (ts string, uid string, topic vector)
    """
    print("========= %s =========" % str(time))
    sqlContext = getSqlContextInstance(lines.context)
    # calculate user-product correlation table
    runningWindow = lines.map(lambda (k, v): ((k[1], time.isoformat()), v))\
        .reduceByKey(lambda x, y: x + y)\
        .map(lambda (x, u): (x, [(pid, score) for (pid, score) in
                                 ((y, float(u.dot(v) / (norm(u) * norm(v)))) for (y, v) in bv.value)
                                 if score > .90]))\
        .filter(lambda (k, v): v != [])
    rowRDD = runningWindow.map(
        lambda x: Row(uid=x[0][0], score=x[1], ts=x[0][1]))
    # print(rowRDD.take(10))
    # saveRDD(sqlContext, rowRDD, keyspaceName='ad_flow', tableName='records1s')
    print("========= %d =========" % rowRDD.count())
    # save corr table to cassandra
    if (rowRDD.count() > 0):
        client = SimpleClient(KAFKA_NODE)
        producer = KeyedProducer(client)
        for row in rowRDD.collect():
            line = '{ "timeStamp" :"' + str(time) + '",'
            line += ' "uid" :"' + str(row['uid']) + '",'
            line += ' "score":' + json.dumps(dict(row['score'])) + '}'  # the dict is already JSON-encoded; no extra quote
            # print(line)
            producer.send_messages(KAFKA_TOPIC, str(hash(line)), line)
class Producer(object):

    def __init__(self, addr):
        self.client = KafkaClient(addr)
        self.producer = KeyedProducer(self.client)
        self.artist_id = []
        self.artwork_id = []

    def load_ids(self):
        artwork_path = "/home/ubuntu/Insight/dataset/Artsy/artwork_id.txt"
        artist_path = "/home/ubuntu/Insight/dataset/Artsy/artist_id.txt"
        with open(artwork_path) as f1:
            for line in f1:
                if line != "":
                    self.artwork_id.append(line.strip())
        with open(artist_path) as f2:
            for line in f2:
                if line != "":
                    self.artist_id.append(line.strip())

    def produce_msgs(self, source_symbol):
        msg_cnt = 0
        while True:
            time_field = datetime.now().strftime("%Y%m%d %H%M%S")
            user_field = random.choice(self.artist_id)
            art_field = random.choice(self.artwork_id)
            str_fmt = "{};{};{};{};{}"
            message_info = str_fmt.format(source_symbol, time_field, user_field, "pin", art_field)
            # print message_info
            self.producer.send_messages('pin_activity', source_symbol, message_info)
            msg_cnt += 1
def process(time, lines):
    """ Processing tweets
    Input:
        lines: (ts string, uid string, state string, tweet vector)
    Output:
        Json: (ts string, uid string, topicVec vector)
    """
    print("========= %s =========" % str(time))
    sqlContext = getSqlContextInstance(lines.context)
    # rowRDD=lines.map(lambda x: (x['timeStamp'], x['userId'], getMeanVector(x['tweet'])))\
    #     .filter(lambda (time, uid, vec): vec!=[])\
    #     .map(lambda x:Row(timestamp=x[0], uid=x[1], topicVec=x[2]))
    rowRDD = lines.map(lambda x: [((x['timeStamp'], x['userId']), word2vec(item))
                                  for item in x['tweet'] if isInVolcabulary(item)])\
        .flatMap(lambda x: x)\
        .filter(lambda (k, vec): vec != [])\
        .reduceByKey(lambda x, y: x + y)\
        .map(lambda x: Row(timestamp=x[0][0], uid=x[0][1], topicVec=x[1]))
    # print(rowRDD.take(10))
    print("========= %d =========" % rowRDD.count())
    # save corr table to cassandra
    if (rowRDD.count() > 0):
        client = SimpleClient(kafkaNodeBC.value)
        producer = KeyedProducer(client)
        for row in rowRDD.collect():
            line = '{ "timestamp" :"' + str(row['timestamp']) + '",'
            line += ' "uid" :"' + str(row['uid']) + '",'
            line += ' "topicVec":' + json.dumps([float(i) for i in row['topicVec']]) + '}'
            # print(line)
            producer.send_messages(outgoingTopic, str(hash(line)), line)
def process(time, lines):
    """1. select user to push ads
    2. save user-product corr table to cassandra
    3. match user with bidder
    4. save bid winner to cassandra
    Input:
        lines: (ts string, uid string, topic vector)
    """
    print("========= %s =========" % str(time))
    sqlContext = getSqlContextInstance(lines.context)
    # calculate user-product correlation table
    #lines1s=lines.map(lambda x: ( (x['uid'], roundTime(parser.parse(x['tick']),1).isoformat()), np.asarray([1]+[float(i) for i in x['topic']])))\
    # lines1s=lines.map(lambda x: ( x[0], 1))\
    runningWindow = lines.map(lambda (k, v): ((k[0], time.isoformat()), v))\
        .reduceByKey(lambda x, y: x + y)\
        .map(lambda (x, u): [(x, y, float(u.dot(v) / (norm(u) * norm(v)))) for (y, v) in bv.value])\
        .flatMap(lambda x: x)\
        .filter(lambda (x, y, s): s > .97)
    rowRDD = runningWindow.map(lambda x: Row(uid=x[0][0], pid=x[1], score=x[2], ts=x[0][1]))
    # print(rowRDD.take(10))
    # saveRDD(sqlContext, rowRDD, keyspaceName='ad_flow', tableName='records1s')
    print("========= %d =========" % rowRDD.count())
    # save corr table to cassandra
    if (rowRDD.count() > 0):
        client = SimpleClient(KAFKA_NODE)
        producer = KeyedProducer(client)
        for row in rowRDD.collect():
            line = '{ "tick" :"' + str(time.isoformat()) + '",'
            line += ' "uid" :"' + str(row['uid']) + '",'
            line += ' "score":"' + str(row['score']) + '",'
            line += ' "pid":' + str(row['pid']) + '}'
            # print(line)
            producer.send_messages(KAFKA_TOPIC, str(hash(line)), line)
class Producer(object):

    def __init__(self, addr):
        self.client = SimpleClient(addr)
        self.producer = KeyedProducer(self.client)
        self.sess = requests.Session()
        adapter = requests.adapters.HTTPAdapter(max_retries=5)
        self.sess.mount('http://', adapter)
        self.sess.mount('https://', adapter)

    def produce_msgs(self, topic, source_symbol, last_record_set):
        self.record_set = set()
        count = 0
        try:
            for item in self.r["data"]:
                self.record_set.add(item["payment_id"])
                count += 1
                if not item["payment_id"] in last_record_set:
                    message_info = "{}\n".format(json.dumps(item))
                    self.producer.send_messages(topic, source_symbol, message_info)
                    # print message_info
            # print count
        except:
            k = 1

    def get_venmo(self, limit=300, page="https://venmo.com/api/v5/public?"):
        try:
            self.r = self.sess.get(page + "&limit={}".format(limit)).json()
        except:
            self.r = ""
class KafkaLoggingHandler(logging.Handler):

    def __init__(self, hosts="", topic="", partition=0):
        logging.Handler.__init__(self)
        self.kafkaClient = KafkaClient(hosts)
        self.topic = topic
        self.partition = partition
        self.producer = KeyedProducer(
            self.kafkaClient,
            async=False,
            req_acks=KeyedProducer.ACK_AFTER_LOCAL_WRITE,
            ack_timeout=200
        )

    def emit(self, record):
        # drop kafka logging to avoid infinite recursion
        if record.name == "kafka":
            return
        try:
            # use default formatting
            msg = self.format(record)
            # produce message
            self.producer.send_messages(self.topic + record.name, self.partition, msg)
        except:
            import traceback
            ei = sys.exc_info()
            traceback.print_exception(ei[0], ei[1], ei[2], None, sys.stderr)
            del ei

    def close(self):
        self.producer.stop()
        logging.Handler.close(self)
class Producer(object):

    def __init__(self, addr):
        self.client = KafkaClient(addr)
        self.producer = KeyedProducer(self.client, async=True,
                                      batch_send_every_n=500, batch_send=False)
        self.min_steps = 1
        self.max_steps = 3
        self.max_users_each_thread = 12000

    def produce_msgs(self, source_symbol):
        msg_cnt = 0
        while True:
            start_uuid = (int(source_symbol) - 1) * self.max_users_each_thread
            stop_uuid = (int(source_symbol) * self.max_users_each_thread) - 1
            uuid = random.sample(range(start_uuid, stop_uuid), 9)
            for uid in uuid:
                timestamp = datetime.now(timezone('US/Pacific')).strftime('%Y-%m-%d %H:%M:%S')
                steps = random.randint(1, 10)
                json_msg = {'source': source_symbol, 'uuid': uid,
                            'timestamp': timestamp, 'steps': steps}
                json_encoded = json.dumps(json_msg)
                self.producer.send_messages('steps_data_part4', source_symbol, json_encoded)
                print json_encoded
                msg_cnt += 1
class Producer(object):

    def __init__(self, addr):
        self.client = KafkaClient(addr)
        self.producer = KeyedProducer(self.client)

    def produce_deal_urls(self, api_url=''):
        '''
        Constantly produce deal urls for consumers to crawl
        '''
        # TODO - Find total deals per category
        # TODO - Calculate number of pages to crawl
        # TODO - Produce categories and page range for consumers
        #        {category_slug; start_page; end_page}

    def produce_msgs(self, source_symbol):
        price_field = random.randint(800, 1400)
        msg_cnt = 0
        while True:
            time_field = datetime.now().strftime("%Y%m%d %H%M%S")
            price_field += random.randint(-10, 10)/10.0
            volume_field = random.randint(1, 1000)
            str_fmt = "{};{};{};{}"
            message_info = str_fmt.format(source_symbol, time_field, price_field, volume_field)
            print message_info
            self.producer.send_messages('price_data_part4', source_symbol, message_info)
            msg_cnt += 1
class KafkaLfProducer(object):

    def __init__(self, addr, conf_file, start_house_id, end_house_id, house_status):
        self.parser = SafeConfigParser()
        self.parser.read(conf_file)
        install_dir = self.parser.get('smw_tool', 'INSTALL_DIR')
        zipdb_file = self.parser.get('smw_tool', 'ZIP_DB_FILE')
        self.client = KafkaClient(addr)
        self.producer = KeyedProducer(self.client, async=True,
                                      batch_send_every_n=500, batch_send=True)
        self.meterReader = MeterLfReader(start_house_id,
                                         end_house_id,
                                         house_status,
                                         install_dir + "/data/low_freq/",
                                         install_dir + "/" + zipdb_file)

    def produce_msgs(self, source_symbol):
        msg_cnt = 0
        while not self.meterReader.houseSentDone():
            (isLf, msg) = self.meterReader.getRecord()
            if msg_cnt % 500000 == 0:
                print "Sent " + str(msg_cnt) + " messages to Kafka"
            if isLf:
                self.producer.send_messages('smw_batch_lf2', source_symbol, msg)
            else:
                self.producer.send_messages('smw_batch_hf2', source_symbol, msg)
            msg_cnt += 1
        print "Sent Total " + str(msg_cnt) + " messages to Kafka"
        self.meterReader.writeHouseStatus()
def unfollow_producer(users: List[Tuple[str]],
                      photos: Deque[Tuple[str, str]],
                      tags: List[Tuple[str]],
                      locations: List[Tuple[str, str]],
                      producer: KeyedProducer) -> Dict[str, str]:
    """
    Produce unfollow events to Kafka
    Arguments:
        users: List of users who can produce an event
        photos: Queue of recent photos and their usernames
        tags: List of company names
        locations: List of possible global lat/long coordinates
        producer: Kafka producer object to post messages
    Returns:
        Kafka message
    """
    followee, follower = random.choice(users)[0], random.choice(users)[0]
    created_time, partition_date = get_datetime()
    record = {
        "follower_username": follower,
        "followed_username": followee,
        "created_time": created_time,
        "partition_date": partition_date,
        "event": "unfollow"
    }
    producer.send_messages("unfollow",
                           bytes(followee, 'utf-8'),
                           json.dumps(record).encode('utf-8'))
    return record
class Producer(object):

    def __init__(self, addr):
        self.client = SimpleClient(addr)
        self.producer = KeyedProducer(self.client)

    def produce_messages(self, data):
        #timestamp = 1473613200 # 1:00 est
        #while timestamp <= 1473624000:
        while True:
            rows = np.random.randint(0, len(data) - 1, size=num_plays_persec)
            sampled_data = data.iloc[rows]
            curr_time = datetime.datetime.now()
            # create timestamp for camus to partition
            timestamp = datetime.datetime.strftime(curr_time, '%Y-%m-%d_%H:%M:%S')
            # create epoch timestamp for custom partitioning
            raw_timestamp = convert_datetime_to_est(curr_time)
            epoch = int(time.mktime(raw_timestamp.timetuple()))
            for idx, row in sampled_data.iterrows():
                json_data = {
                    'timestamp': timestamp,
                    'epoch_timestamp': epoch,
                    'player_id': row.player_id,
                    'player_name': row.player_name,
                    'position': row.position,
                    'yards': row.yards,
                    'touchdown': row.touchdown
                }
                message_info = json.dumps(json_data)
                keystring = 'QA' if row.position == 'QB' else row.position
                key = b'{}'.format(keystring)
                self.producer.send_messages('nfl_plays', key, message_info)
def create_user_producer(users: List[Tuple[str]],
                         photos: Deque[Tuple[str, str]],
                         tags: List[Tuple[str]],
                         locations: List[Tuple[str, str]],
                         producer: KeyedProducer) -> Dict[str, str]:
    """
    Produce create-user events to Kafka
    Arguments:
        users: List of users who can produce an event
        photos: Queue of recent photos and their usernames
        tags: List of company names
        locations: List of possible global lat/long coordinates
        producer: Kafka producer object to post messages
    Returns:
        Kafka message
    """
    username, full_name = fake_user()
    created_time, partition_date = get_datetime()
    record = {
        "username": username,
        "full_name": full_name,
        "created_time": created_time,
        "partition_date": partition_date,
        "event": "create-user"
    }
    producer.send_messages("create-user",
                           bytes(username, 'utf-8'),
                           json.dumps(record).encode('utf-8'))
    users.append((username,))
    return record
class Producer(object):

    def __init__(self, addr):
        self.client = KafkaClient(addr)
        self.producer = KeyedProducer(self.client)

    def produce_msgs(self, source_symbol, file_source):
        hd = open(file_source)
        for line in hd:
            print line
            self.producer.send_messages('datatest', source_symbol, line)
def __init__(self, hosts_list, topic, **kwargs):
    logging.Handler.__init__(self)
    self.kafka_client = SimpleClient(hosts_list)
    self.key = kwargs.get("key", None)
    self.kafka_topic_name = topic
    if not self.key:
        self.producer = SimpleProducer(self.kafka_client, **kwargs)
    else:
        self.producer = KeyedProducer(self.kafka_client, **kwargs)
def __init__(self, topic, producer_type=ProducerType.SIMPLE,
             host_port="127.0.0.1:9092", **producer_opts):
    self.topic = topic
    self.host_port = host_port
    if producer_type == ProducerType.SIMPLE:
        self.producer = SimpleProducer(KafkaClient(host_port), **producer_opts)
    else:
        self.producer = KeyedProducer(KafkaClient(host_port), **producer_opts)
def __init__(self, host_list, topic, **kwargs):
    logging.Handler.__init__(self)
    self.kafka_client = SimpleClient(host_list)
    self.key = kwargs.get("key", None)
    self.kafka_topic_name = topic
    if not self.key:
        self.producer = SimpleProducer(self.kafka_client, **kwargs)
    else:
        self.producer = KeyedProducer(self.kafka_client, **kwargs)
def __init__(self, hosts_list, topic, timeout_secs=DEFAULT_SOCKET_TIMEOUT_SECONDS, **kwargs):
    logging.Handler.__init__(self)
    self.kafka_client = KafkaClient(hosts_list, timeout=timeout_secs)
    self.key = kwargs.get("key", None)
    self.kafka_topic_name = topic
    if not self.key:
        self.producer = SimpleProducer(self.kafka_client, **kwargs)
    else:
        self.producer = KeyedProducer(self.kafka_client, **kwargs)
def run(self, delay=0.1):
    client = KafkaClient("localhost:9092")
    producer = KeyedProducer(client)
    import numpy as np
    for photoid in TESTPHOTOIDS:
        producer.send_messages('flickr-photoid', '%d' % np.random.randint(0, 20), photoid)
        print "Sending PhotoID: %s" % photoid
        time.sleep(delay)
def __init__(self, hosts_list, topic, timeout_secs=DEFAULT_SOCKET_TIMEOUT_SECONDS, **kwargs):
    logging.Handler.__init__(self)
    self.kafka_client = KafkaClient(hosts_list, timeout=timeout_secs)
    self.key = kwargs.get("key", None)
    self.kafka_topic_name = topic
    if not self.key:
        self.producer = SimpleProducer(self.kafka_client, **kwargs)
    else:
        self.producer = KeyedProducer(self.kafka_client, **kwargs)
    self.addFilter(KafkaLoggingFilter())
class Producer(object):

    def __init__(self, addr):
        self.client = SimpleClient(addr)
        self.producer = KeyedProducer(self.client)

    def produce_msgs(self, source_symbol, file_to_use):
        file_obj = open(file_to_use, 'r')
        msg_cnt = 0
        while True:
            message_info = file_obj.next()
            print message_info
            self.producer.send_messages('venmo2', source_symbol, message_info)
            msg_cnt += 1
def __init__(self, addr):
    self.timezone = timezone('EST')
    self.host = 'ec2-34-192-152-48.compute-1.amazonaws.com'
    self.auction_db = 'auctiontable'
    self.bid_db = 'bidtable'
    self.client = SimpleClient(addr)
    self.producer = KeyedProducer(self.client)
    self.active_auctions = []
    self.conn_auction_db = None
    self.conn_bid_db = None
    self.auction_table = None
    self.bid_table = None
    self.connected_auction_db = False
    self.connected_bid_db = False
def offsetCommit():
    global users
    checkUserPartitionMapping()
    kafkaClient = KafkaClient(kafkaHost, timeout=None)
    producer = KeyedProducer(kafkaClient, async=False,
                             req_acks=UserProducer.ACK_AFTER_LOCAL_WRITE,
                             ack_timeout=200)
    for partition in partitions:
        encodedMessage = simplejson.dumps({'turtleName': turtleName,
                                           'user': '',
                                           'operation': 'offsetCommit'})
        print producer.send(kafkaTopic, partition, encodedMessage)
    producer.stop(1)
    kafkaClient.close()
class Producer(object):

    def __init__(self, addr):
        self.client = SimpleClient(addr)
        self.producer = KeyedProducer(self.client)

    def stream_science_posts(self, key):
        r = requests.session()
        header = {"User-Agent": "anisotropix Science"}
        s = r.get('https://www.reddit.com/r/science/new/.json?limit=100',
                  stream=True, headers=header)  # stream=True, timeout=2
        for post in s.iter_lines():
            if post:
                self.producer.send_messages('Science_posts', key, post)
                print(post)
def write():
    k_client = KafkaClient(KAFKA_URL)
    p = KeyedProducer(k_client,
                      async=False,
                      req_acks=KeyedProducer.ACK_AFTER_LOCAL_WRITE,
                      ack_timeout=2000)
    messages = []
    for i in xrange(NUM_MESSAGES):
        message = json.dumps({'msg': 'X' * SIZE_MSG})
        messages.append(message)
        if len(messages) >= 500:
            key = int(time.time() * 1000)
            p.send_messages(KAFKA_TOPIC, str(key), *messages)
            messages = []
    key = int(time.time() * 1000)
    p.send_messages(KAFKA_TOPIC, str(key), *messages)
def __init__(self, addr):
    self.client = KafkaClient(addr)
    self.producer = KeyedProducer(self.client)
    self.minLat = 40.708751
    self.maxLat = 40.802895
    self.minLong = -74.025879
    self.maxLong = -73.930435
    self.counter_start = 0
    # self.locs = [(40.75280785, -73.97546422), (40.73988115, -73.98711691), (40.76105171, -73.96962834),
    #              (40.75790096, -73.97578395), (40.75833353, -74.00436092), (40.74496999, -73.97087089),
    #              (40.76088942, -73.97008963), (40.75494802, -73.96084512), (40.73754566, -73.98306014),
    #              (40.76804075, -73.98086881), (40.73795777, -73.97972054), (40.75311322, -73.99081106),
    #              (40.76445038, -73.9693873), (40.75204099, -73.99041951), (40.75705723, -73.98304045),
    #              (40.74984862, -73.98108846), (40.73641334, -73.99263483), (40.74022644, -73.97511118),
    #              (40.74081696, -73.99869147), (40.75155827, -73.97809876), (40.7979499, -73.93799602),
    #              (40.78487376, -73.9488285), (40.78891306, -73.96322338), (40.80932537, -73.95927604),
    #              (40.79512142, -73.97732225), (40.78566559, -73.94358666), (40.80024399, -73.96799964),
    #              (40.78788311, -73.97040765), (40.80434947, -73.93874699), (40.80183406, -73.96247845),
    #              (40.80595751, -73.95441724), (40.80650874, -73.96646741), (40.7931067, -73.9413598),
    #              (40.81627861, -73.95581725), (40.80999546, -73.96029616), (40.81289571, -73.95471676),
    #              (40.81689372, -73.93035378), (40.81309684, -73.92121306), (40.8096491, -73.93651239)]
    self.available = [[8, 9, 10, 17, 18, 19, 20, 21], [8, 9, 10, 11, 16, 17, 18, 19, 20], [8, 9, 15, 16, 17, 18],
                      [14, 15, 16, 17, 18, 19, 20, 21, 22], [10, 11, 12, 22, 23, 00], [12, 13, 14, 15, 16],
                      [19, 20, 21, 22, 23, 00], [00, 01, 02], [8, 11, 13, 15, 17, 19, 21, 22, 23], [8, 2],
                      [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12], [13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23]]
class Producer(object):

    def __init__(self, addr):
        self.client = KafkaClient(addr)
        self.producer = KeyedProducer(self.client)

    def produce_msgs(self, source_symbol):
        msg_cnt = 0
        while True:
            artwork_path = "loc.txt"
            with open(artwork_path) as f1:
                for line in f1:
                    if line.strip():
                        print line.strip()
                        self.producer.send_messages('post_geo_activity', source_symbol, line.strip())
                        msg_cnt += 1
def main(servers: List[str]) -> None:
    """
    Main Method
    Arguments:
        servers: List of Zookeeper Kafka Host IPs
    """
    mysql_session = pymysql.connect(**MYSQL_CONF)
    users = query_for_users(mysql_session)
    photos: Deque = deque([], maxlen=3000)
    tags = query_for_tags(mysql_session)
    locations = query_for_locations(mysql_session)
    simple_client = SimpleClient(servers)
    producer = KeyedProducer(simple_client)
    events = [
        comment_producer,
        #create_user_producer,
        follow_producer,
        like_producer,
        create_photo_producer,
        unfollow_producer
    ]
    while True:
        event = generate_random_events(events)
        print(event(users, photos, tags, locations, producer))
        time.sleep(0.02)
def __init__(self, addr):
    print "Trying connection..."
    self.client = KafkaClient(addr)
    self.producer = KeyedProducer(self.client)
    print "Made connection with host: {}".format(addr)
    self._last_update = datetime.utcnow()
    # For latest deals
    self._more_pages = 10
    self._chunk_size = 10
class Producer(object):

    def __init__(self, addr):
        self.client = KafkaClient(addr)
        self.producer = KeyedProducer(self.client)

    def open_save(self, fileName):
        log_file = open(fileName, "w")
        log_file.close()
        return log_file

    def create_topic(self, topic):
        script = "/usr/local/kafka/bin/kafka-topics.sh"
        os.system("{} --create --zookeeper localhost:2181 --topic {} --partitions {} --replication-factor 2".format(script, topic, "4"))
        return "topic {} created".format(topic)

    def produce_msgs(self, source_symbol, topic):
        server_topics = self.client.topic_partitions
        if topic not in server_topics:
            self.create_topic(topic)
        price_field = random.randint(800, 1400)
        cities = ["Barcelona", "Philadelphia", "Honolulu", "Atlanta", "Miami",
                  "Chicago", "SF", "LA", "NYC", "Houston", "Paris", "London", "Tokyo"]
        msg_cnt = 0
        log_file = open("input1/{}.csv".format(topic), "a")
        while True:
            time_field = datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S.%f')
            location_field = random.choice(cities)
            price_field += random.randint(-10, 10)/10.0
            str_fmt = "{},{},{},{}"
            message_info = str_fmt.format(source_symbol, time_field, location_field, price_field)
            print message_info
            log_file.write("{}\n".format(message_info))
            self.producer.send_messages(topic, source_symbol, message_info)
            msg_cnt += 1
            if msg_cnt > 200000:
                log_file.close()
                self.producer.stop()
                break
class Producer(object):

    def __init__(self, addr):
        self.client = SimpleClient(addr)
        self.producer = KeyedProducer(self.client)
        # The 1999 KDDCup network traffic dataset
        self.data_file = open('/home/ubuntu/opt/realtimeAnomalies/src/main/test/kddcup.testdata.unlabeled', 'r')
        self.mem_data = []
        for record in self.data_file:
            self.mem_data.append(record)

    def produce_msgs(self, source_symbol):
        random.seed()
        while True:
            idx = random.randint(0, len(self.mem_data) - 1)
            str_fmt = "{}"
            message_content = str_fmt.format(self.mem_data[idx])
            self.producer.send_messages('traffic_data', source_symbol, message_content)
class Producer(object):

    def __init__(self, addr):
        self.client = SimpleClient(addr)
        self.producer = KeyedProducer(self.client)

    def produce_msgs(self, source_symbol):
        price_field = random.randint(800, 1400)
        msg_cnt = 0
        while True:
            time_field = datetime.now().strftime("%Y%m%d %H%M%S")
            price_field += random.randint(-10, 10) / 10.0
            volume_field = random.randint(1, 1000)
            str_fmt = "{};{};{};{}"
            message_info = str_fmt.format(source_symbol, time_field, price_field, volume_field)
            print message_info
            self.producer.send_messages('price_data_part4', source_symbol, message_info)
            msg_cnt += 1
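# --- Usage sketch (not part of the original snippet; the command-line layout below is an assumption) ---
# Minimal driver showing how a Producer like the one above is typically started:
# the broker address and a partition-key symbol are taken from sys.argv.
if __name__ == "__main__":
    import sys
    args = sys.argv
    ip_addr = str(args[1])        # Kafka broker address, e.g. "localhost:9092" (assumed)
    partition_key = str(args[2])  # key the KeyedProducer uses to choose a partition (assumed)
    prod = Producer(ip_addr)
    prod.produce_msgs(partition_key)  # loops forever, emitting one message per iteration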
class Producer(object):

    def __init__(self, addr):
        self.client = SimpleClient(addr)
        self.producer = KeyedProducer(self.client)
        self.timezone = timezone('EST')

    def name_generator(self):
        return ''.join(
            random.choice('0123456789ABCDEFGHIJKLMNOPQRSTUVWZ')
            for i in range(random.randint(3, 9)))

    def item_generator(self):
        global item_lists
        return random.choice(item_lists)

    def produce_msgs(self):
        msg_cnt = 0
        auction_id = 0
        while True:
            auction_id += 1
            # Create time: EST time
            create_time = datetime.now(self.timezone).strftime("%Y-%m-%d %H:%M:%S")
            # Auctioner ID
            auctioner_id = random.randint(0, 100000)
            # Expiry: 2 hours to 4 days
            auction_type = random.randint(2, 96)
            # Starting price: 1 cent to $100
            starting_price = random.uniform(0.01, 100.0)
            # Auctioner name generator
            auctioner_name = self.name_generator()
            # Item generator
            item = self.item_generator()
            str_fmt = "{};{};{};{};{};{};{}"
            message_info = str_fmt.format(auction_id, create_time, auctioner_id,
                                          auction_type, round(starting_price, 2),
                                          auctioner_name, item)
            print message_info
            self.producer.send_messages('auctions', str(random.randint(0, 4)), message_info)
            msg_cnt += 1
class Producer(object):

    # Initialization for the class with address
    def __init__(self, addr):
        self.client = SimpleClient(addr)
        self.producer = KeyedProducer(self.client)
        self.topic = 'ajay_test_topic'

    # Main method for simulation
    def produce_msgs(self, source_symbol):
        # Generate some random data
        price_field = random.randint(800, 1400)
        # Count the messages in the tunnel
        msg_cnt = 0
        # Loop for the fields
        while True:
            # Get a random time value
            time_field = datetime.now().strftime("%Y%m%d %H%M%S")
            # Get a random price value
            price_field += random.randint(-10, 10) / 10.0
            # Get a random volume field
            volume_field = random.randint(1, 1000)
            # Format your string
            str_fmt = "{};{};{};{}"
            # Create the message
            message_info = str_fmt.format(source_symbol, time_field, price_field, volume_field)
            # Print for debug
            print message_info
            # Send the message
            self.producer.send_messages(self.topic, source_symbol, message_info)
            # Messages count
            msg_cnt += 1
class Producer(object):

    def __init__(self, addr=None):
        self.isNone = True
        if addr is not None:
            self.client = SimpleClient(addr)
            self.producer = KeyedProducer(self.client)
            self.isNone = False

    def produce_msgs(self, source_symbol):
        random = Random(0)
        msg_cnt = 0
        start = 50
        for i in range(100):  # for observation groups 13 through 13+range
            #time.sleep(10)  # waits between observation groups
            for x in range(3000):  # 1500 means about 1000 per obs because there are 4 producers
                time.sleep(0.00001)  # 0.2 waits this many seconds before producing another message, about 1000 each obs each 5 min
                self.observationgroup_field = random.randint(start + i, start + i)
                self.observationorder_field = random.randint(1, 6)
                self.frequency_field = random.random() * 10000
                self.snr_field = random.random() * 100
                self.driftrate_field = random.random() - random.random()
                self.uncorrectedfrequency_field = random.random() - random.random() + self.frequency_field
                str_fmt = "{};{};{};{};{};{};{}"
                message_info = str_fmt.format(
                    source_symbol,
                    self.observationgroup_field,
                    self.observationorder_field,
                    self.frequency_field,
                    self.snr_field,
                    self.driftrate_field,
                    self.uncorrectedfrequency_field)
                if not self.isNone:
                    self.producer.send_messages('gbthits', source_symbol, message_info)
                else:
                    break
                msg_cnt += 1
            if self.isNone:
                break
class KafkaLoggingHandler(logging.Handler):

    def __init__(self, hosts_list, topic, key=None):
        logging.Handler.__init__(self)
        self.kafka_client = KafkaClient(hosts_list)
        self.key = key
        self.kafka_topic_name = topic
        if not key:
            self.producer = SimpleProducer(self.kafka_client)
        else:
            self.producer = KeyedProducer(self.kafka_client)

    def emit(self, record):
        # drop kafka logging to avoid infinite recursion
        if record.name == 'kafka':
            return
        try:
            # use default formatting
            msg = self.format(record)
            # produce message
            if not self.key:
                self.producer.send_messages(self.kafka_topic_name, msg)
            else:
                self.producer.send(self.kafka_topic_name, self.key, msg)
        except (KeyboardInterrupt, SystemExit):
            raise
        except:
            self.handleError(record)

    def close(self):
        self.producer.stop()
        logging.Handler.close(self)
class KafkaLoggingHandler(logging.Handler):

    def __init__(self, hosts_list, topic, key=None):
        logging.Handler.__init__(self)
        self.kafka_client = KafkaClient(hosts_list)
        self.key = key
        self.kafka_topic_name = topic
        if not key:
            self.producer = SimpleProducer(self.kafka_client)
        else:
            self.producer = KeyedProducer(self.kafka_client)

    def emit(self, record):
        # drop kafka logging to avoid infinite recursion
        if record.name == 'kafka':
            return
        try:
            # use default formatting
            msg = self.format(record)
            if isinstance(msg, unicode):
                msg = msg.encode("utf-8")
            # produce message
            if not self.key:
                self.producer.send_messages(self.kafka_topic_name, msg)
            else:
                self.producer.send(self.kafka_topic_name, self.key, msg)
        except (KeyboardInterrupt, SystemExit):
            raise
        except:
            self.handleError(record)

    def close(self):
        self.producer.stop()
        logging.Handler.close(self)
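# --- Usage sketch (assumption, not from the original source) ---
# Minimal example of wiring the KafkaLoggingHandler above into the standard
# logging module; the broker address, topic name, and logger name are placeholders.
import logging

logger = logging.getLogger("my_app")                          # hypothetical logger name
logger.setLevel(logging.INFO)
handler = KafkaLoggingHandler("localhost:9092", "app_logs")   # assumed broker and topic
handler.setFormatter(logging.Formatter("%(asctime)s %(levelname)s %(message)s"))
logger.addHandler(handler)
logger.info("log records are now forwarded to Kafka")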
class Producer(object):

    def __init__(self, addr, group_id):
        self.client = SimpleClient(addr)
        self.producer = KeyedProducer(self.client)
        self.group_id = group_id

    def produce_msgs(self, source_file):
        with open(source_file, 'r') as f:
            lines = f.readlines()
        start_time = datetime.now()
        num_lines = 0
        line_inx = 0
        max_inx = len(lines)
        while line_inx < max_inx:
            token = lines[line_inx].strip().split()
            line_inx = line_inx % max_inx
            if token[2] != 'NaN':
                for num in range(1000):
                    user_id = "user_%s_%s" % (self.group_id, num)
                    event_time = (start_time + timedelta(0, num_lines)).strftime('%Y-%m-%d %H:%M:%S')
                    hr = int(token[2]) + randint(0, 4) - 2
                    msg = {'id': user_id, 'time': event_time, 'hr': hr}
                    json_msg = json.dumps(msg)
                    print json_msg
                    self.producer.send_messages('sensor', str(self.group_id), json_msg)
                line_inx += 1
                num_lines += 1
                line_inx = line_inx % max_inx
                time.sleep(2)
            line_inx += 1
            num_lines += 1
            line_inx = line_inx % max_inx
class KafkaHandler(logging.Handler):
    """ publish message to kafka """

    def __init__(self, topic, producer_type=ProducerType.SIMPLE,
                 host_port="127.0.0.1:9092", **producer_opts):
        logging.Handler.__init__(self)  # initialize base Handler state (lock, level, formatter)
        self.topic = topic
        self.host_port = host_port
        if producer_type == ProducerType.SIMPLE:
            self.producer = SimpleProducer(KafkaClient(host_port), **producer_opts)
        else:
            self.producer = KeyedProducer(KafkaClient(host_port), **producer_opts)

    def emit(self, record):
        try:
            response = self.producer.send_messages(self.topic, self.format(record))
        except:
            raise
def test_switch_leader_keyed_producer(self):
    topic = self.topic
    producer = KeyedProducer(self.client, async=False)

    # Send 10 random messages
    for _ in range(10):
        key = random_string(3)
        msg = random_string(10)
        producer.send_messages(topic, key, msg)

    # kill leader for partition 0
    self._kill_leader(topic, 0)

    recovered = False
    started = time.time()
    timeout = 60
    while not recovered and (time.time() - started) < timeout:
        try:
            key = random_string(3)
            msg = random_string(10)
            producer.send_messages(topic, key, msg)
            if producer.partitioners[kafka_bytestring(topic)].partition(key) == 0:
                recovered = True
        except (FailedPayloadsError, ConnectionError):
            logging.debug("caught exception sending message -- will retry")
            continue

    # Verify we successfully sent the message
    self.assertTrue(recovered)

    # send some more messages just to make sure no more exceptions
    for _ in range(10):
        key = random_string(3)
        msg = random_string(10)
        producer.send_messages(topic, key, msg)