def process(time, lines): """Calculate user-product corr table and select ad-push events Input: lines: (ts string, uid string, topic vector) """ print("========= %s =========" % str(time)) sqlContext = getSqlContextInstance(lines.context) # calculate user-product correlation table runningWindow=lines.map(lambda (k, v): ( (k[1], time.isoformat()), v ))\ .reduceByKey(lambda x,y: x+y)\ .map(lambda (x,u): (x, [(pid, score) for (pid, score) in ( (y, float(u.dot(v)/(norm(u)*norm(v)))) for (y,v) in bv.value ) if score>.90]))\ .filter(lambda (k, v): v!=[]) rowRDD = runningWindow.map( lambda x: Row(uid=x[0][0], score=x[1], ts=x[0][1])) # print(rowRDD.take(10)) # saveRDD(sqlContext, rowRDD, keyspaceName='ad_flow', tableName='records1s') print("========= %d =========" % rowRDD.count()) # save corr table to cassandra if (rowRDD.count() > 0): client = SimpleClient(KAFKA_NODE) producer = KeyedProducer(client) for row in rowRDD.collect(): line = '{ "timeStamp" :"' + str(time) + '",' line += ' "uid" :"' + str(row['uid']) + '",' line += ' "score":"' + json.dumps(dict(row['score'])) + '}' # print(line) producer.send_messages(KAFKA_TOPIC, str(hash(line)), line)
def get_simple_kafka_client(client_id=GENERIC_KAFKA_CLIENT_ID):
    # this uses the old SimpleClient because we are using the old
    # SimpleProducer interface
    return SimpleClient(
        hosts=settings.KAFKA_BROKERS,
        client_id=client_id,
        timeout=30,  # seconds
    )
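# A minimal usage sketch for the factory above; SimpleProducer belongs to the
# same legacy kafka-python API as SimpleClient. The client id and topic name
# here are assumptions.
from kafka import SimpleProducer

client = get_simple_kafka_client(client_id='example-writer')
producer = SimpleProducer(client)
producer.send_messages('example-topic', b'hello')  # hypothetical topic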
def get_values_kafak(self, groupName, topicName):
    kafka_values = dict()
    broker = SimpleClient(kafka_conn)
    zk = KazooClient(hosts=zookeepers_conn, read_only=True)
    zk.start()
    # initialize defaults so the return values are defined even when
    # topicName is empty
    logsize = 0
    latest_offset = 0
    lag = 0
    if topicName:
        logsize = 0
        partitions = broker.get_partition_ids_for_topic(topicName)
        responses = broker.send_offset_fetch_request(
            groupName,
            [OffsetFetchRequestPayload(topicName, p) for p in partitions],
            fail_on_error=True)
        latest_offset = 0
        for res in responses:
            if topicName != "test":
                latest_offset += res[2]
        for partition in partitions:
            log = "/consumers/%s/offsets/%s/%s" % (groupName, topicName, partition)
            if zk.exists(log):
                data, stat = zk.get(log)
                logsize += int(data)
        lag = latest_offset - logsize
    broker.close()
    zk.stop()
    zk.close()
    kafka_values['offset'] = latest_offset
    kafka_values['logsize'] = logsize
    kafka_values['lag'] = lag
    return kafka_values
def process(time, lines): """ Processing tweets Input: lines: (ts string, uid string, state string, tweet vector) Output: Json: (ts string, uid string, topicVec vector) """ print("========= %s =========" % str(time)) sqlContext = getSqlContextInstance(lines.context) # rowRDD=lines.map(lambda x: (x['timeStamp'], x['userId'], getMeanVector(x['tweet'])))\ # .filter(lambda (time, uid, vec): vec!=[])\ # .map(lambda x:Row(timestamp=x[0], uid=x[1], topicVec=x[2])) rowRDD=lines.map(lambda x: [((x['timeStamp'], x['userId']), word2vec(item)) for item in x['tweet'] if isInVolcabulary(item)] )\ .flatMap(lambda x:x)\ .filter(lambda (k, vec): vec!=[])\ .reduceByKey(lambda x,y:x+y)\ .map(lambda x:Row(timestamp=x[0][0], uid=x[0][1], topicVec=x[1])) # print(rowRDD.take(10)) print("========= %d =========" % rowRDD.count()) # save corr table to cassandra if (rowRDD.count() > 0): client = SimpleClient(kafkaNodeBC.value) producer = KeyedProducer(client) for row in rowRDD.collect(): line = '{ "timestamp" :"' + str(row['timestamp']) + '",' line += ' "uid" :"' + str(row['uid']) + '",' line += ' "topicVec":' + json.dumps( [float(i) for i in row['topicVec']]) + '}' # print(line) producer.send_messages(outgoingTopic, str(hash(line)), line)
def process(time, lines): """1. select user to push ads 2. save user-product corr table to cassandra 3. match user with bidder 4. save bid winner to cassandra Input: lines: (ts string, uid string, topic vector) """ print("========= %s =========" % str(time)) sqlContext=getSqlContextInstance(lines.context) # calculate user-product correlation table #lines1s=lines.map(lambda x: ( (x['uid'], roundTime(parser.parse(x['tick']),1).isoformat()), np.asarray([1]+[float(i) for i in x['topic']])))\ # lines1s=lines.map(lambda x: ( x[0], 1))\ runningWindow=lines.map(lambda (k, v): ( (k[0], time.isoformat()), v ))\ .reduceByKey(lambda x,y: x+y)\ .map(lambda (x,u): [(x, y, float(u.dot(v)/(norm(u)*norm(v)))) for (y,v) in bv.value])\ .flatMap(lambda x:x)\ .filter(lambda (x,y,s):s>.97) rowRDD=runningWindow.map(lambda x:Row(uid=x[0][0],pid=x[1],score=x[2],ts=x[0][1])) # print(rowRDD.take(10)) # saveRDD(sqlContext, rowRDD, keyspaceName='ad_flow', tableName='records1s') print("========= %d =========" % rowRDD.count()) # save corr table to cassandra if (rowRDD.count()>0): client = SimpleClient(KAFKA_NODE) producer = KeyedProducer(client) for row in rowRDD.collect(): line = '{ "tick" :"' + str(time.isoformat()) + '",' line+= ' "uid" :"' + str(row['uid']) + '",' line+= ' "score":"' + str(row['score'])+'",' line+= ' "pid":' + str(row['pid'])+ '}' # print(line) producer.send_messages(KAFKA_TOPIC, str(hash(line)), line)
def process(time, lines): """match user with bidder Input: lines: (ts string, uid string, topic vector) """ print("========= %s =========" % str(time)) sqlContext = getSqlContextInstance(lines.context) # calculate user-product correlation table runningWindow=lines.map(lambda (k, v): ( (k[0], str(time)), v ))\ .reduceByKey(lambda x,y: x+y)\ .map(lambda (x,u): [(x, y, float(u.dot(v)/(norm(u)*norm(v)))) for (y,v) in bv.value])\ .flatMap(lambda x:x)\ .filter(lambda (x,y,s):s>.97) rowRDD = runningWindow.map( lambda x: Row(uid=x[0][0], pid=x[1], score=x[2], ts=x[0][1])) # print(rowRDD.take(10)) print("========= %d =========" % rowRDD.count()) if (rowRDD.count() > 0): client = SimpleClient(KAFKA_NODE) producer = KeyedProducer(client) for row in rowRDD.collect(): line = '{ "tick" :"' + str(time.isoformat()) + '",' line += ' "uid" :"' + str(row['uid']) + '",' line += ' "score":"' + str(row['score']) + '",' line += ' "pid":' + str(row['pid']) + '}' # print(line) producer.send_messages(KAFKA_TOPIC, str(hash(line)), line)
def setup_kafka(self, settings):
    """Setup kafka connection and idle signal.
    This should be called after the spider has set its crawler object.

    :param settings: The current Scrapy settings being used
    :type settings: scrapy.settings.Settings
    """
    if not hasattr(self, 'topic') or not self.topic:
        self.topic = '%s-starturls' % self.name

    hosts = settings.get('SCRAPY_KAFKA_HOSTS', 'localhost:9092')
    consumer_group = settings.get('SCRAPY_KAFKA_SPIDER_CONSUMER_GROUP',
                                  'scrapy-kafka')
    _kafka = SimpleClient(hosts)
    # wait at most 1sec for more messages. Otherwise continue
    self.consumer = SimpleConsumer(_kafka, consumer_group, self.topic,
                                   auto_commit=True, iter_timeout=1.0)

    # idle signal is called when the spider has no requests left,
    # that's when we will schedule new requests from kafka topic
    self.crawler.signals.connect(self.spider_idle, signal=signals.spider_idle)
    self.crawler.signals.connect(self.item_scraped, signal=signals.item_scraped)
    logger.info("Reading URLs from kafka topic '%s'" % self.topic)
def from_crawler(cls, crawler, *args, **kwargs):
    spider = super(ListeningKafkaSpider, cls).from_crawler(crawler, *args, **kwargs)
    if not hasattr(spider, 'topic') or not spider.topic:
        spider.topic = '%s-starturls' % spider.name

    hosts = crawler.settings.get('SCRAPY_KAFKA_HOSTS', 'localhost:9092')
    consumer_group = crawler.settings.get(
        'SCRAPY_KAFKA_SPIDER_CONSUMER_GROUP', 'scrapy-kafka')
    _kafka = SimpleClient(hosts)
    # wait at most 1sec for more messages. Otherwise continue
    spider.consumer = SimpleConsumer(_kafka, consumer_group, spider.topic,
                                     auto_commit=True, iter_timeout=1.0)

    # idle signal is called when the spider has no requests left,
    # that's when we will schedule new requests from kafka topic
    crawler.signals.connect(spider.spider_idle, signal=signals.spider_idle)
    crawler.signals.connect(spider.item_scraped, signal=signals.item_scraped)
    logger.info("Reading URLs from kafka topic '%s'" % spider.topic)
    return spider
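# Neither spider snippet shows what runs on the idle signal. A hedged sketch
# of the usual scrapy-kafka pattern: on spider_idle, pull one message from the
# consumer and schedule it as a request. The method names and message format
# are assumptions.
from scrapy.exceptions import DontCloseSpider

def next_request(self):
    # get_message() returns None once iter_timeout expires with no messages
    message = self.consumer.get_message(True)
    if message is None:
        return None
    url = message.message.value.decode('utf-8')
    return self.make_requests_from_url(url)

def schedule_next_request(self):
    req = self.next_request()
    if req:
        self.crawler.engine.crawl(req, spider=self)

def spider_idle(self):
    # poll kafka for more start URLs and keep the spider alive
    self.schedule_next_request()
    raise DontCloseSpider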
def __init__(self, addr):
    self.client = SimpleClient(addr)
    self.producer = KeyedProducer(self.client)
    self.sess = requests.Session()
    adapter = requests.adapters.HTTPAdapter(max_retries=5)
    self.sess.mount('http://', adapter)
    self.sess.mount('https://', adapter)
def process(time, lines): """match user with bidder Input: lines: (ts string, uid string, {pid:score} dict) """ print("========= %s =========" % str(time)) sqlContext = getSqlContextInstance(lines.context) # calculate user-product correlation table rowRDD=lines.map(lambda x: ( x['uid'], matchBids(x['score']) ))\ .map(lambda x:Row(uid=x[0],pid=x[1][0],price=x[1][1])) print("========= %d =========" % rowRDD.count()) if (rowRDD.count() > 0): # send to kafka client = SimpleClient(KAFKA_NODE) producer = KeyedProducer(client) for row in rowRDD.collect(): line = '{ "pid" :"' + str(row['pid']) + '",' line += ' "uid" :"' + str(row['uid']) + '",' line += ' "price":"' + str(row['price']) + '",' line += ' "ts":' + str(time) + '}' # print(line) producer.send_messages(KAFKA_TOPIC, str(hash(line)), line) # save to cassandra rowRDD.map(lambda x:Row(pid=x['pid'],ts=str(time),price=x['price']))\ .toDF().write\ .format("org.apache.spark.sql.cassandra")\ .options(table='winningbid10s', keyspace='ad_flow')\ .save(mode="append")
def main(servers: List[str]) -> None:
    """ Main Method
    Arguments:
        servers: List of Zookeeper Kafka Host IPs
    """
    mysql_session = pymysql.connect(**MYSQL_CONF)
    users = query_for_users(mysql_session)
    photos: Deque = deque([], maxlen=3000)
    tags = query_for_tags(mysql_session)
    locations = query_for_locations(mysql_session)
    simple_client = SimpleClient(servers)
    producer = KeyedProducer(simple_client)
    events = [
        comment_producer,
        # create_user_producer,
        follow_producer,
        like_producer,
        create_photo_producer,
        unfollow_producer
    ]
    while True:
        event = generate_random_events(events)
        print(event(users, photos, tags, locations, producer))
        time.sleep(0.02)
def __init__(self, addr):
    self.client = SimpleClient(addr)
    # self.producer = KeyedProducer(self.client)
    self.producer = KafkaProducer(
        bootstrap_servers=addr + ":9092",
        value_serializer=lambda v: v.encode('utf-8'),
        acks=0,
        linger_ms=500)
def __init__(self, bootstrapservers):
    '''
    Constructor
    '''
    self.Servers = bootstrapservers
    self.Producer = KafkaProducer(bootstrap_servers=self.Servers, retries=5)
    self.client = SimpleClient(bootstrapservers)
def __init__(self, addr):
    self.client = SimpleClient(addr)
    self.producer = KeyedProducer(self.client)
    # The 1999 KDDCup network traffic dataset
    self.data_file = open(
        '/home/ubuntu/opt/realtimeAnomalies/src/main/test/kddcup.testdata.unlabeled',
        'r')
    self.mem_data = []
    for record in self.data_file:
        self.mem_data.append(record)
def from_settings(cls, settings):
    """
    :param settings: the current Scrapy settings
    :type settings: scrapy.settings.Settings
    :rtype: A :class:`~KafkaPipeline` instance
    """
    k_hosts = settings.get('SCRAPY_KAFKA_HOSTS', '127.0.0.1:9092')
    topic = settings.get('SCRAPY_KAFKA_ITEM_PIPELINE_TOPIC', 'data-topic')
    client = SimpleClient(k_hosts)
    producer = SimpleProducer(client)
    return cls(producer, topic)
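# A hedged sketch of the pipeline class this factory constructs, modeled on
# the scrapy-kafka KafkaPipeline; the exact body is an assumption.
from scrapy.utils.serialize import ScrapyJSONEncoder

class KafkaPipeline(object):
    def __init__(self, producer, topic):
        self.producer = producer
        self.topic = topic
        self.encoder = ScrapyJSONEncoder()

    def process_item(self, item, spider):
        # serialize the scraped item and publish it to the kafka topic
        msg = self.encoder.encode(dict(item))
        self.producer.send_messages(self.topic, msg.encode('utf-8'))
        return item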
def getLogLines(self):
    # poll kafka and push each message out to connected socketio clients
    while not thread_stop_event.isSet():
        client = SimpleClient('127.0.0.1:9092')
        consumer = SimpleConsumer(client, "my-producer", "alarm")
        for message in consumer:
            print message.message.value
            socketio.emit('newmessage',
                          {'message': message.message.value},
                          namespace='/test')
        sleep(self.delay)
def __init__(self, host_list, topic, **kwargs):
    logging.Handler.__init__(self)
    self.kafka_client = SimpleClient(host_list)
    # pop "key" so it is not forwarded to the producer constructor
    self.key = kwargs.pop("key", None)
    self.kafka_topic_name = topic
    if not self.key:
        self.producer = SimpleProducer(self.kafka_client, **kwargs)
    else:
        self.producer = KeyedProducer(self.kafka_client, **kwargs)
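# The handler's emit() is not shown. A minimal sketch of the usual pattern
# for a kafka logging handler: format the record and publish it, keyed or
# unkeyed depending on how the handler was constructed. This body is an
# assumption, not the library's implementation.
def emit(self, record):
    # skip kafka's own log records to avoid an infinite feedback loop
    if record.name.startswith('kafka'):
        return
    try:
        msg = self.format(record)
        if self.key is None:
            self.producer.send_messages(self.kafka_topic_name,
                                        msg.encode('utf-8'))
        else:
            self.producer.send_messages(self.kafka_topic_name,
                                        self.key, msg.encode('utf-8'))
    except Exception:
        self.handleError(record)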
def __init__(self, addr):
    self.timezone = timezone('EST')
    self.host = 'ec2-34-192-152-48.compute-1.amazonaws.com'
    self.auction_db = 'auctiontable'
    self.bid_db = 'bidtable'
    self.client = SimpleClient(addr)
    self.producer = KeyedProducer(self.client)
    self.active_auctions = []
    self.conn_auction_db = None
    self.conn_bid_db = None
    self.auction_table = None
    self.bid_table = None
    self.connected_auction_db = False
    self.connected_bid_db = False
def process(spouts):
    '''
    Returns a named tuple of type PartitionsSummary.
    '''
    results = []
    total_depth = 0
    total_delta = 0
    brokers = []
    for s in spouts:
        for p in s.partitions:
            try:
                k = SimpleClient([p['broker']['host'] + ":" + str(p['broker']['port'])])
            except socket.gaierror as e:
                raise ProcessorError('Failed to contact Kafka broker %s (%s)' %
                                     (p['broker']['host'], str(e)))

            earliest_off = OffsetRequestPayload(p['topic'], p['partition'], -2, 1)
            latest_off = OffsetRequestPayload(p['topic'], p['partition'], -1, 1)
            earliest = k.send_offset_request([earliest_off])[0].offsets[0]
            latest = k.send_offset_request([latest_off])[0].offsets[0]
            current = p['offset']

            brokers.append(p['broker']['host'])
            total_depth = total_depth + (latest - earliest)
            total_delta = total_delta + (latest - current)
            results.append(PartitionState._make([
                p['broker']['host'],
                p['topic'],
                p['partition'],
                earliest,
                latest,
                latest - earliest,
                s.id,
                current,
                latest - current]))
    return PartitionsSummary(total_depth=total_depth,
                             total_delta=total_delta,
                             num_partitions=len(results),
                             num_brokers=len(set(brokers)),
                             partitions=tuple(results))
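# A standalone usage sketch of the earliest/latest offset lookup used above,
# outside the spout loop. The -2 and -1 sentinels request the oldest and
# newest offsets respectively; broker address and topic are assumptions.
from kafka import SimpleClient
from kafka.structs import OffsetRequestPayload

client = SimpleClient(['localhost:9092'])
earliest = client.send_offset_request(
    [OffsetRequestPayload('example-topic', 0, -2, 1)])[0].offsets[0]
latest = client.send_offset_request(
    [OffsetRequestPayload('example-topic', 0, -1, 1)])[0].offsets[0]
print('partition 0 depth: %d' % (latest - earliest))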
def get_offset(group, topicname):
    try:
        kafka_client = KafkaClient(kafka_conn, timeout=30)
    except Exception as e:
        print "Error, cannot connect kafka broker."
        sys.exit(1)
    else:
        kafka_topics = kafka_client.topics
    finally:
        kafka_client.close()
    try:
        zookeeper_client = KazooClient(hosts=zookeepers_conn, read_only=True,
                                       timeout=30)
        zookeeper_client.start()
    except Exception as e:
        print "Error, cannot connect zookeeper server."
        sys.exit(1)

    offset_total = 0
    logsize_total = 0
    broker = SimpleClient(kafka_conn)
    # partition_path = 'consumers/%s/offsets/%s' % (group, topicname)
    partitions = broker.get_partition_ids_for_topic(topicname)
    kafka_consumer = KafkaConsumer(bootstrap_servers=kafka_conn)
    for partition in partitions:
        base_path = 'consumers/%s/%s/%s/%s' % (group, '%s', topicname, partition)
        owner_path, offset_path = base_path % 'owners', base_path % 'offsets'
        offset = zookeeper_client.get('/' + offset_path)[0]
        offset_total += int(offset)
        # logsize_num = kafka_consumer.get_partition_offsets(topicname, partition, -1, 1)[0]
        # logsize_total += int(logsize_num)
    return offset_total, logsize_total
def __init__(self, addr):
    self.client = SimpleClient(addr)
    self.producer = KeyedProducer(self.client)
    self.topic = 'ajay_test_topic'
def __init__(self, addr, group_id):
    self.client = SimpleClient(addr)
    self.producer = KeyedProducer(self.client)
    self.group_id = group_id
def configure(self, **configs):
    """Configure the consumer instance

    Configuration settings can be passed to constructor,
    otherwise defaults will be used:

    Keyword Arguments:
        bootstrap_servers (list): List of initial broker nodes the consumer
            should contact to bootstrap initial cluster metadata. This does
            not have to be the full node list. It just needs to have at
            least one broker that will respond to a Metadata API Request.
        client_id (str): a unique name for this client. Defaults to
            'kafka.consumer.kafka'.
        group_id (str): the name of the consumer group to join,
            Offsets are fetched / committed to this group name.
        fetch_message_max_bytes (int, optional): Maximum bytes for each
            topic/partition fetch request. Defaults to 1024*1024.
        fetch_min_bytes (int, optional): Minimum amount of data the server
            should return for a fetch request, otherwise wait up to
            fetch_wait_max_ms for more data to accumulate. Defaults to 1.
        fetch_wait_max_ms (int, optional): Maximum time for the server to
            block waiting for fetch_min_bytes messages to accumulate.
            Defaults to 100.
        refresh_leader_backoff_ms (int, optional): Milliseconds to backoff
            when refreshing metadata on errors (subject to random jitter).
            Defaults to 200.
        socket_timeout_ms (int, optional): TCP socket timeout in
            milliseconds. Defaults to 30*1000.
        auto_offset_reset (str, optional): A policy for resetting offsets on
            OffsetOutOfRange errors. 'smallest' will move to the oldest
            available message, 'largest' will move to the most recent. Any
            other value will raise the exception. Defaults to 'largest'.
        deserializer_class (callable, optional): Any callable that takes a
            raw message value and returns a deserialized value. Defaults to
            lambda msg: msg.
        auto_commit_enable (bool, optional): Enabling auto-commit will cause
            the KafkaConsumer to periodically commit offsets without an
            explicit call to commit(). Defaults to False.
        auto_commit_interval_ms (int, optional): If auto_commit_enabled,
            the milliseconds between automatic offset commits. Defaults
            to 60 * 1000.
        auto_commit_interval_messages (int, optional): If auto_commit_enabled,
            a number of messages consumed between automatic offset commits.
            Defaults to None (disabled).
        consumer_timeout_ms (int, optional): number of milliseconds to throw
            a timeout exception to the consumer if no message is available
            for consumption. Defaults to -1 (dont throw exception).

    Configuration parameters are described in more detail at
    http://kafka.apache.org/documentation.html#highlevelconsumerapi
    """
    configs = self._deprecate_configs(**configs)
    self._config = {}
    for key in self.DEFAULT_CONFIG:
        self._config[key] = configs.pop(key, self.DEFAULT_CONFIG[key])

    if configs:
        raise KafkaConfigurationError('Unknown configuration key(s): ' +
                                      str(list(configs.keys())))

    if self._config['auto_commit_enable']:
        if not self._config['group_id']:
            raise KafkaConfigurationError(
                'KafkaConsumer configured to auto-commit '
                'without required consumer group (group_id)')

    # Check auto-commit configuration
    if self._config['auto_commit_enable']:
        logger.info("Configuring consumer to auto-commit offsets")
        self._reset_auto_commit()

    if not self._config['bootstrap_servers']:
        raise KafkaConfigurationError(
            'bootstrap_servers required to configure KafkaConsumer')

    reporters = [self._config['metrics_reporter']()] if \
        self._config['metrics_reporter'] else []
    metrics = Metrics(reporters=reporters)
    self.metrics = KafkaConsumerMetrics(metrics)

    self._client = SimpleClient(
        self._config['bootstrap_servers'],
        client_id=self._config['client_id'],
        timeout=(self._config['socket_timeout_ms'] / 1000.0),
        metrics=metrics,
    )
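# A short usage sketch of this legacy consumer, assuming the surrounding
# class is the old kafka.consumer.kafka.KafkaConsumer whose configure() is
# shown above; topic and broker names are assumptions.
consumer = KafkaConsumer('example-topic',
                         group_id='example-group',
                         bootstrap_servers=['localhost:9092'],
                         auto_commit_enable=True)
for message in consumer:
    # each message carries topic, partition, offset, key and value
    print("%s:%d:%d %s" % (message.topic, message.partition,
                           message.offset, message.value))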
def __init__(self, addr):
    self.client = SimpleClient(addr)
    self.producer = KeyedProducer(self.client)
    self.timezone = timezone('EST')
from tweepy import StreamListener
from tweepy import OAuthHandler
from tweepy import Stream
from kafka import KafkaProducer
from kafka.client import SimpleClient
from kafka.consumer import SimpleConsumer
from kafka.producer import SimpleProducer
import json
import configure

client = SimpleClient("localhost:9092")
producer = SimpleProducer(client)

consumer_key = configure.consumer_key
consumer_secret = configure.consumer_secret
access_token = configure.access_token
access_token_secret = configure.access_secret


class StdOutListener(StreamListener):
    """ A listener handles tweets that are received from the stream.
    This is a basic listener that just prints received tweets to stdout.
    """
    def on_data(self, data):
        data = json.loads(data)
        # print(data)
        user_id = data['user']['id_str']
        producer.send_messages('covid', (user_id).encode('utf-8'))
        print(user_id)
        return True

    def on_error(self, status):
        # minimal completion (assumed): log the status code and keep
        # the stream alive
        print(status)
        return True
def getBid():
    return float('{0:.03f}'.format(np.random.pareto(.3) % 64 + 1))


def getJsonLine(i):
    timestamp = getTimeStamp()
    pid = getPid(i)
    bidPrice = getBid()
    line = '{ "timeStamp" :"' + timestamp + '",'
    line += ' "productId" :"' + pid + '",'
    line += ' "bidPrice":' + str(bidPrice) + '}'
    return line


if __name__ == "__main__":
    if len(sys.argv) != 3:
        print("Usage: produceBids.py <node> <topic>", file=sys.stderr)
        exit(-1)
    kafkaNode, kafkaTopic = sys.argv[1:]

    # set up the kafka producer
    client = SimpleClient(kafkaNode)
    producer = KeyedProducer(client)

    # for i in xrange(NumOfProduct):
    while True:
        line = getJsonLine(-1)
        time.sleep(.01)
        print(line)
        producer.send_messages(kafkaTopic, str(hash(line) % 100), line)
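# getTimeStamp and getPid are referenced above but not defined in this
# snippet. A minimal sketch, assuming a random pid is drawn when i == -1;
# both helpers and NumOfProduct are assumptions.
import datetime
import random

NumOfProduct = 1000  # assumed product catalog size

def getTimeStamp():
    # ISO-formatted wall-clock time for the bid event
    return datetime.datetime.now().isoformat()

def getPid(i):
    # fixed pid when an index is given, random pid otherwise
    if i >= 0:
        return str(i)
    return str(random.randint(0, NumOfProduct - 1))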
from kafka import SimpleClient
from kazoo.client import KazooClient
from kafka.consumer import base, SimpleConsumer
from kafka.structs import (OffsetRequestPayload, OffsetCommitRequestPayload,
                           OffsetFetchRequestPayload)
import sys

# zookeepers="10.10.217.152:2182"
zookeepers = "127.0.0.1:2182"
kafka = "127.0.0.1:9092"
group = "consumer-group"

if __name__ == '__main__':
    broker = SimpleClient(kafka)
    lags = {}
    zk = KazooClient(hosts=zookeepers, read_only=True)
    zk.start()
    logsize = 0
    # topics=zk.get_children("/consumers/%s/owners" % (group) )
    topic = sys.argv[1]
    data_need = sys.argv[2]
    # for topic in topics:
    if topic:
        logsize = 0
        # print topic
        partitions = broker.get_partition_ids_for_topic(topic)
        # print partitions
        # SimpleConsumer takes (client, group, topic); the high-level
        # KafkaConsumer does not accept these positional arguments
        consumer = SimpleConsumer(broker, group, str(topic))
        responses = broker.send_offset_fetch_request(
            group,
            [OffsetFetchRequestPayload(topic, p) for p in partitions],
            fail_on_error=True)
from tweepy.streaming import StreamListener
from tweepy import OAuthHandler
from tweepy import Stream
from kafka import KafkaProducer, KafkaConsumer
from kafka.client import SimpleClient
from kafka.consumer import SimpleConsumer
from kafka.producer import SimpleProducer
import json

client = SimpleClient("localhost:8080")
producer = SimpleProducer(client)

consumer_key = ""
consumer_secret = ""
access_token = ""
access_secret = ""


def main():
    '''
    main function initiates a kafka consumer and initializes the tweetdata
    database. The consumer consumes tweets from the producer, extracts
    features, cleanses the tweet text, calculates sentiments and loads the
    data into a postgres database
    '''
    # set-up a Kafka consumer
    consumer = KafkaConsumer('movies')
    tweets, conn, dbcur = initialize(db_name="tweetdata")
    for msg in consumer:
        output = []
        output.append(json.loads(msg.value))
        print output
        print '\n'

    # Function to extract features from tweets
def __init__(self, addr):
    self.client = SimpleClient(addr)
    self.producer = KeyedProducer(self.client)
        return well_stat, (wellID, str(reform_event_data))


def days_between(d1, d2):
    d1 = datetime.datetime.strptime(d1, "%Y-%m-%d %H:%M:%S")
    d2 = datetime.datetime.strptime(d2, "%Y-%m-%d %H:%M:%S")
    # note: .seconds only covers the sub-day remainder; use total_seconds()
    # if spans longer than a day matter
    return abs((d2 - d1).seconds)


if __name__ == "__main__":
    args = sys.argv
    ip_address = str(args[1])
    partition_key = str(args[2])
    client = SimpleClient(ip_address)
    producer = KeyedProducer(client)

    # --------------------------------
    # Defining data variables
    # --------------------------------
    dt = datetime.datetime(2017, 10, 9, 1, 59, 59)  # Defining a starting date and time
    max_step = 600      # max time step size
    numOfWells = 50000  # Number of wells

    # --------------------------------
    # Generating time series data
    # --------------------------------
    event_types = ['on', 'off']
    completion_types = ['BP', 'ESP']  # diff from 'completion_type' (no s)