def process(time, lines):
    """Calculate user-product corr table and select ad-push events
    Input:
    lines: (ts string, uid string, topic vector)
    """
    print("========= %s =========" % str(time))
    sqlContext = getSqlContextInstance(lines.context)
    #   calculate user-product correlation table
    runningWindow = lines.map(lambda (k, v): ((k[1], time.isoformat()), v))\
        .reduceByKey(lambda x, y: x + y)\
        .map(lambda (x, u): (x, [(pid, score) for (pid, score) in
                                 ((y, float(u.dot(v)/(norm(u)*norm(v)))) for (y, v) in bv.value)
                                 if score > .90]))\
        .filter(lambda (k, v): v != [])
    rowRDD = runningWindow.map(
        lambda x: Row(uid=x[0][0], score=x[1], ts=x[0][1]))
    #   print(rowRDD.take(10))
    #   saveRDD(sqlContext, rowRDD, keyspaceName='ad_flow', tableName='records1s')
    print("========= %d =========" % rowRDD.count())
    #   save corr table to cassandra
    if (rowRDD.count() > 0):
        client = SimpleClient(KAFKA_NODE)
        producer = KeyedProducer(client)
        for row in rowRDD.collect():
            line = '{ "timeStamp" :"' + str(time) + '",'
            line += '  "uid"  :"' + str(row['uid']) + '",'
            line += ' "score":"' + json.dumps(dict(row['score'])) + '}'
            #	    print(line)
            producer.send_messages(KAFKA_TOPIC, str(hash(line)), line)
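The hand-concatenated JSON above is easy to get wrong (quoting and commas). A minimal alternative sketch, assuming the same row fields as above, builds the payload as a dict and serializes it in one call:

import json

def build_payload(time, row):
    # Hedged sketch: same fields as the string concatenation above,
    # serialized via json.dumps to avoid manual quoting mistakes.
    payload = {
        "timeStamp": str(time),
        "uid": str(row['uid']),
        "score": dict(row['score']),
    }
    return json.dumps(payload)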
Example 2
def get_simple_kafka_client(client_id=GENERIC_KAFKA_CLIENT_ID):
    # this uses the old SimpleClient because we are using the old SimpleProducer interface
    return SimpleClient(
        hosts=settings.KAFKA_BROKERS,
        client_id=client_id,
        timeout=30,  # seconds
    )
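A short usage sketch for the helper above, assuming the legacy SimpleProducer interface mentioned in the comment (the client id, topic name and message are illustrative):

from kafka import SimpleProducer  # legacy kafka-python producer

# Hedged usage sketch: build a client via the helper, then publish one message.
client = get_simple_kafka_client(client_id='example-client')  # illustrative client id
producer = SimpleProducer(client)
producer.send_messages('example-topic', b'hello from the legacy producer')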
    def get_values_kafka(self, groupName, topicName):
        kafka_values = dict()
        broker = SimpleClient(kafka_conn)
        zk = KazooClient(hosts=zookeepers_conn, read_only=True)
        zk.start()
        logsize = 0
        latest_offset = 0
        lag = 0
        if topicName:
            partitions = broker.get_partition_ids_for_topic(topicName)
            responses = broker.send_offset_fetch_request(
                groupName,
                [OffsetFetchRequestPayload(topicName, p) for p in partitions],
                fail_on_error=True)
            for res in responses:
                if topicName != "test":
                    latest_offset += res[2]
            # sum the offsets recorded in ZooKeeper across all partitions
            for partition in partitions:
                log = "/consumers/%s/offsets/%s/%s" % (groupName, topicName, partition)
                if zk.exists(log):
                    data, stat = zk.get(log)
                    logsize += int(data)
            lag = latest_offset - logsize
        broker.close()
        zk.stop()
        zk.close()
        kafka_values['offset'] = latest_offset
        kafka_values['logsize'] = logsize
        kafka_values['lag'] = lag
        return kafka_values
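A hedged companion sketch: the same two per-partition numbers used above (the offset returned by send_offset_fetch_request and the offset stored in ZooKeeper) collected partition by partition instead of summed, so the difference can be inspected individually. Function and field names are illustrative.

def get_partition_offsets(broker, zk, groupName, topicName):
    # Hedged sketch: per-partition view of the quantities the aggregate
    # version sums. Response fields follow OffsetFetchResponsePayload:
    # (topic, partition, offset, metadata, error).
    per_partition = {}
    partitions = broker.get_partition_ids_for_topic(topicName)
    responses = broker.send_offset_fetch_request(
        groupName,
        [OffsetFetchRequestPayload(topicName, p) for p in partitions],
        fail_on_error=True)
    fetched = {res[1]: res[2] for res in responses}
    for partition in partitions:
        path = "/consumers/%s/offsets/%s/%s" % (groupName, topicName, partition)
        zk_offset = int(zk.get(path)[0]) if zk.exists(path) else 0
        per_partition[partition] = {
            'offset': fetched.get(partition, 0),
            'zk_offset': zk_offset,
            'diff': fetched.get(partition, 0) - zk_offset,
        }
    return per_partition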
Example 4
def process(time, lines):
    """ Processing tweets 
    Input:
    lines: (ts string, uid string, state string, tweet vector)
    Output:
    Json: (ts string, uid string, topicVec vector)
    """
    print("========= %s =========" % str(time))
    sqlContext = getSqlContextInstance(lines.context)
    #    rowRDD=lines.map(lambda x: (x['timeStamp'], x['userId'], getMeanVector(x['tweet'])))\
    #                .filter(lambda (time, uid, vec): vec!=[])\
    #                .map(lambda x:Row(timestamp=x[0], uid=x[1], topicVec=x[2]))
    rowRDD=lines.map(lambda x: [((x['timeStamp'], x['userId']), word2vec(item)) for item in x['tweet'] if isInVolcabulary(item)] )\
                .flatMap(lambda x:x)\
                .filter(lambda (k, vec): vec!=[])\
                .reduceByKey(lambda x,y:x+y)\
                .map(lambda x:Row(timestamp=x[0][0], uid=x[0][1], topicVec=x[1]))

    #    print(rowRDD.take(10))
    print("========= %d =========" % rowRDD.count())
    #   save corr table to cassandra
    if (rowRDD.count() > 0):
        client = SimpleClient(kafkaNodeBC.value)
        producer = KeyedProducer(client)
        for row in rowRDD.collect():
            line = '{ "timestamp" :"' + str(row['timestamp']) + '",'
            line += '  "uid"  :"' + str(row['uid']) + '",'
            line += ' "topicVec":' + json.dumps(
                [float(i) for i in row['topicVec']]) + '}'
            #	    print(line)
            producer.send_messages(outgoingTopic, str(hash(line)), line)
def process(time, lines):
    """1. select user to push ads
       2. save user-product corr table to cassandra
       3. match user with bidder
       4. save bid winner to cassandra 
    Input:
    lines: (ts string, uid string, topic vector)
    """
    print("========= %s =========" % str(time))
    sqlContext=getSqlContextInstance(lines.context)
    # calculate user-product correlation table  
    #lines1s=lines.map(lambda x: ( (x['uid'], roundTime(parser.parse(x['tick']),1).isoformat()), np.asarray([1]+[float(i) for i in x['topic']])))\
#    lines1s=lines.map(lambda x: ( x[0], 1))\
    runningWindow = lines.map(lambda (k, v): ((k[0], time.isoformat()), v))\
        .reduceByKey(lambda x, y: x + y)\
        .map(lambda (x, u): [(x, y, float(u.dot(v)/(norm(u)*norm(v)))) for (y, v) in bv.value])\
        .flatMap(lambda x: x)\
        .filter(lambda (x, y, s): s > .97)
    rowRDD = runningWindow.map(
        lambda x: Row(uid=x[0][0], pid=x[1], score=x[2], ts=x[0][1]))
    #    print(rowRDD.take(10))
    #    saveRDD(sqlContext, rowRDD, keyspaceName='ad_flow', tableName='records1s')
    print("========= %d =========" % rowRDD.count())
    # save corr table to cassandra
    if (rowRDD.count() > 0):
        client = SimpleClient(KAFKA_NODE)
        producer = KeyedProducer(client)
        for row in rowRDD.collect():
            line = '{ "tick" :"' + str(time.isoformat()) + '",'
            line += '  "uid"  :"' + str(row['uid']) + '",'
            line += ' "score":"' + str(row['score']) + '",'
            line += '  "pid":' + str(row['pid']) + '}'
            #	    print(line)
            producer.send_messages(KAFKA_TOPIC, str(hash(line)), line)
def process(time, lines):
    """match user with bidder
    Input:
    lines: (ts string, uid string, topic vector)
    """
    print("========= %s =========" % str(time))
    sqlContext = getSqlContextInstance(lines.context)
    # calculate user-product correlation table
    runningWindow = lines.map(lambda (k, v): ((k[0], str(time)), v))\
        .reduceByKey(lambda x, y: x + y)\
        .map(lambda (x, u): [(x, y, float(u.dot(v)/(norm(u)*norm(v)))) for (y, v) in bv.value])\
        .flatMap(lambda x: x)\
        .filter(lambda (x, y, s): s > .97)
    rowRDD = runningWindow.map(
        lambda x: Row(uid=x[0][0], pid=x[1], score=x[2], ts=x[0][1]))
    #    print(rowRDD.take(10))
    print("========= %d =========" % rowRDD.count())
    if (rowRDD.count() > 0):
        client = SimpleClient(KAFKA_NODE)
        producer = KeyedProducer(client)
        for row in rowRDD.collect():
            line = '{ "tick" :"' + str(time.isoformat()) + '",'
            line += '  "uid"  :"' + str(row['uid']) + '",'
            line += ' "score":"' + str(row['score']) + '",'
            line += '  "pid":' + str(row['pid']) + '}'
            #	    print(line)
            producer.send_messages(KAFKA_TOPIC, str(hash(line)), line)
Example 7
    def setup_kafka(self, settings):
        """Setup redis connection and idle signal.
        This should be called after the spider has set its crawler object.
        :param settings: The current Scrapy settings being used
        :type settings: scrapy.settings.Settings
        """
        if not hasattr(self, 'topic') or not self.topic:
            self.topic = '%s-starturls' % self.name

        hosts = settings.get('SCRAPY_KAFKA_HOSTS', 'localhost:9092')
        consumer_group = settings.get('SCRAPY_KAFKA_SPIDER_CONSUMER_GROUP',
                                      'scrapy-kafka')
        _kafka = SimpleClient(hosts)
        # wait at most 1sec for more messages. Otherwise continue
        self.consumer = SimpleConsumer(_kafka,
                                       consumer_group,
                                       self.topic,
                                       auto_commit=True,
                                       iter_timeout=1.0)
        # idle signal is called when the spider has no requests left,
        # that's when we will schedule new requests from kafka topic
        self.crawler.signals.connect(self.spider_idle,
                                     signal=signals.spider_idle)
        self.crawler.signals.connect(self.item_scraped,
                                     signal=signals.item_scraped)
        logger.info("Reading URLs from kafka topic '%s'" % self.kafka_topic)
Example 8
    def from_crawler(cls, crawler, *args, **kwargs):
        spider = super(ListeningKafkaSpider,
                       cls).from_crawler(crawler, *args, **kwargs)

        if not hasattr(spider, 'topic') or not spider.topic:
            spider.topic = '%s-starturls' % spider.name

        hosts = crawler.settings.get('SCRAPY_KAFKA_HOSTS', 'localhost:9092')
        consumer_group = crawler.settings.get(
            'SCRAPY_KAFKA_SPIDER_CONSUMER_GROUP', 'scrapy-kafka')
        _kafka = SimpleClient(hosts)
        # wait at most 1sec for more messages. Otherwise continue
        spider.consumer = SimpleConsumer(_kafka,
                                         consumer_group,
                                         spider.topic,
                                         auto_commit=True,
                                         iter_timeout=1.0)
        # idle signal is called when the spider has no requests left,
        # that's when we will schedule new requests from kafka topic
        crawler.signals.connect(spider.spider_idle, signal=signals.spider_idle)
        crawler.signals.connect(spider.item_scraped,
                                signal=signals.item_scraped)
        logger.info("Reading URLs from kafka topic '%s'" % spider.kafka_topic)

        return spider
 def __init__(self, addr):
     self.client = SimpleClient(addr)
     self.producer = KeyedProducer(self.client)
     self.sess = requests.Session()
     adapter = requests.adapters.HTTPAdapter(max_retries=5)
     self.sess.mount('http://', adapter)
     self.sess.mount('https://', adapter)
Example 10
def process(time, lines):
    """match user with bidder
    Input:
    lines: (ts string, uid string, {pid:score} dict)
    """
    print("========= %s =========" % str(time))
    sqlContext = getSqlContextInstance(lines.context)
    # calculate user-product correlation table
    rowRDD=lines.map(lambda x: ( x['uid'], matchBids(x['score']) ))\
                .map(lambda x:Row(uid=x[0],pid=x[1][0],price=x[1][1]))
    print("========= %d =========" % rowRDD.count())
    if (rowRDD.count() > 0):
        #   send to kafka
        client = SimpleClient(KAFKA_NODE)
        producer = KeyedProducer(client)
        for row in rowRDD.collect():
            line = '{ "pid" :"' + str(row['pid']) + '",'
            line += '  "uid"  :"' + str(row['uid']) + '",'
            line += ' "price":"' + str(row['price']) + '",'
            line += '  "ts":' + str(time) + '}'
            #	    print(line)
            producer.send_messages(KAFKA_TOPIC, str(hash(line)), line)


        #   save to cassandra
        rowRDD.map(lambda x:Row(pid=x['pid'],ts=str(time),price=x['price']))\
              .toDF().write\
              .format("org.apache.spark.sql.cassandra")\
              .options(table='winningbid10s', keyspace='ad_flow')\
              .save(mode="append")
Example 11
def main(servers: List[str]) -> None:
    """
    Main Method

    Arguments:
        servers: List of Zookeeper Kafka Host IPs
    """
    mysql_session = pymysql.connect(**MYSQL_CONF)

    users = query_for_users(mysql_session)
    photos: Deque = deque([], maxlen=3000)
    tags = query_for_tags(mysql_session)
    locations = query_for_locations(mysql_session)

    simple_client = SimpleClient(servers)
    producer = KeyedProducer(simple_client)

    events = [
        comment_producer,
        #create_user_producer,
        follow_producer,
        like_producer,
        create_photo_producer,
        unfollow_producer
    ]

    while True:
        event = generate_random_events(events)
        print(event(users, photos, tags, locations, producer))
        time.sleep(0.02)
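generate_random_events is not defined in this snippet; a minimal hypothetical stand-in consistent with how it is called above (it returns one of the event-producing callables):

import random

def generate_random_events(events):
    # Hypothetical sketch: pick one of the event-producing callables at random.
    return random.choice(events)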
Example 12
 def __init__(self, addr):
     self.client = SimpleClient(addr)
     #self.producer  = KeyedProducer(self.client)
     self.producer = KafkaProducer(
         bootstrap_servers=addr + ":9092",
         value_serializer=lambda v: v.encode('utf-8'),
         acks=0,
         linger_ms=500)
Example 13
 def __init__(self, bootstrapservers):
     '''
     Constructor
     '''
     self.Servers = bootstrapservers
     self.Producer = KafkaProducer(bootstrap_servers=self.Servers,
                                   retries=5)
     self.client = SimpleClient(bootstrapservers)
Example 14
    def __init__(self, addr):
        self.client = SimpleClient(addr)
        self.producer = KeyedProducer(self.client)

        # The 1999 KDDCup network traffic dataset
        self.data_file = open('/home/ubuntu/opt/realtimeAnomalies/src/main/test/kddcup.testdata.unlabeled', 'r')
        self.mem_data = []
        for record in self.data_file:
            self.mem_data.append(record)
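The constructor above only loads the KDDCup records into memory. A hedged sketch of the send loop such a producer class might expose (method name, topic and key are illustrative, not the original project's code):

    def produce_msgs(self, topic='kddcup-traffic', key='network'):
        # Hedged sketch: replay the in-memory KDDCup records to Kafka,
        # using the KeyedProducer created in __init__.
        for record in self.mem_data:
            self.producer.send_messages(topic, key, record.strip())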
Example 15
    def from_settings(cls, settings):
        """
        :param settings: the current Scrapy settings
        :type settings: scrapy.settings.Settings
        :rtype: A :class:`~KafkaPipeline` instance
        """
        k_hosts = settings.get('SCRAPY_KAFKA_HOSTS', '127.0.0.1:9092')
        topic = settings.get('SCRAPY_KAFKA_ITEM_PIPELINE_TOPIC', 'data-topic')
        client = SimpleClient(k_hosts)
        producer = SimpleProducer(client)
        return cls(producer, topic)
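from_settings above only builds the producer and topic; a Scrapy item pipeline also needs a process_item hook. A hedged sketch of what the rest of such a KafkaPipeline might look like (the JSON serialization choice is an assumption, not the original project's code):

    def __init__(self, producer, topic):
        self.producer = producer
        self.topic = topic

    def process_item(self, item, spider):
        # Hedged sketch: serialize the scraped item and publish it to Kafka.
        # Assumes `import json` at module level (not shown in this fragment).
        msg = json.dumps(dict(item))
        self.producer.send_messages(self.topic, msg)
        return item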
    def getLogLines(self):
        #infinite loop of magical random numbers
        while not thread_stop_event.isSet():
            client = SimpleClient('127.0.0.1:9092')
            consumer = SimpleConsumer(client, "my-producer", "alarm")

            for message in consumer:
                print message.message.value
                socketio.emit('newmessage', {'message': message.message.value},
                              namespace='/test')
                sleep(self.delay)
Example 17
    def __init__(self, host_list, topic, **kwargs):
        logging.Handler.__init__(self)

        self.kafka_client = SimpleClient(host_list)
        self.key = kwargs.get("key", None)
        self.kafka_topic_name = topic

        if not self.key:
            self.producer = SimpleProducer(self.kafka_client, **kwargs)
        else:
            self.producer = KeyedProducer(self.kafka_client, **kwargs)
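The constructor above only wires up a producer; the logging.Handler subclass it belongs to also needs an emit method. A hedged sketch of what that could look like, reusing self.producer, self.key and self.kafka_topic_name from the constructor (error handling here is a choice, not the original handler's behaviour):

    def emit(self, record):
        # Hedged sketch: format the log record and publish it to Kafka,
        # keyed or unkeyed depending on how the producer was built above.
        try:
            msg = self.format(record)
            if self.key is None:
                self.producer.send_messages(self.kafka_topic_name, msg)
            else:
                self.producer.send_messages(self.kafka_topic_name, self.key, msg)
        except Exception:
            self.handleError(record)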
Example 18
 def __init__(self, addr):
     self.timezone = timezone('EST')
     self.host = 'ec2-34-192-152-48.compute-1.amazonaws.com'
     self.auction_db = 'auctiontable'
     self.bid_db = 'bidtable'
     self.client = SimpleClient(addr)
     self.producer = KeyedProducer(self.client)
     self.active_auctions = []
     self.conn_auction_db = None
     self.conn_bid_db = None
     self.auction_table = None
     self.bid_table = None
     self.connected_auction_db = False
     self.connected_bid_db = False
Example 19
def process(spouts):
    '''
    Returns a named tuple of type PartitionsSummary.
    '''
    results = []
    total_depth = 0
    total_delta = 0
    brokers = []
    for s in spouts:
        for p in s.partitions:
            try:
                k = SimpleClient([p['broker']['host'] + ":" + str(p['broker']['port'])])
            except socket.gaierror as e:
                raise ProcessorError('Failed to contact Kafka broker %s (%s)' % (p['broker']['host'], str(e)))

            earliest_off = OffsetRequestPayload(p['topic'], p['partition'], -2, 1)
            latest_off = OffsetRequestPayload(p['topic'], p['partition'], -1, 1)

            earliest = k.send_offset_request([earliest_off])[0].offsets[0]
            latest = k.send_offset_request([latest_off])[0].offsets[0]
            current = p['offset']

            brokers.append(p['broker']['host'])
            total_depth = total_depth + (latest - earliest)
            total_delta = total_delta + (latest - current)

            results.append(PartitionState._make([
                p['broker']['host'],
                p['topic'],
                p['partition'],
                earliest,
                latest,
                latest - earliest,
                s.id,
                current,
                latest - current]))
    return PartitionsSummary(total_depth=total_depth,
                             total_delta=total_delta,
                             num_partitions=len(results),
                             num_brokers=len(set(brokers)),
                             partitions=tuple(results))
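PartitionState and PartitionsSummary are used above but not defined in this snippet. The definitions below are inferred from how the tuples are built, so treat the field names as an assumption:

from collections import namedtuple

# Hedged sketch: definitions consistent with the PartitionState._make(...)
# call and the keyword arguments passed to PartitionsSummary above; the real
# project may name the fields differently.
PartitionState = namedtuple('PartitionState', [
    'broker', 'topic', 'partition', 'earliest', 'latest',
    'depth', 'spout', 'current', 'delta'])

PartitionsSummary = namedtuple('PartitionsSummary', [
    'total_depth', 'total_delta', 'num_partitions', 'num_brokers', 'partitions'])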
Example 20
def get_offset(group, topicname):
    try:
        kafka_client = KafkaClient(kafka_conn, timeout=30)
    except Exception as e:
        print "Error, cannot connect kafka broker."
        sys.exit(1)
    else:
        kafka_topics = kafka_client.topics
    finally:
        kafka_client.close()

    try:
        zookeeper_client = KazooClient(hosts=zookeepers_conn,
                                       read_only=True,
                                       timeout=30)
        zookeeper_client.start()
    except Exception as e:
        print "Error, cannot connect zookeeper server."
        sys.exit(1)

    offset_total = 0
    logsize_total = 0
    broker = SimpleClient(kafka_conn)
    # partition_path = 'consumers/%s/offsets/%s' % (group, topicname)
    partitions = broker.get_partition_ids_for_topic(topicname)
    kafka_consumer = KafkaConsumer(bootstrap_servers=kafka_conn)

    for partition in partitions:
        base_path = 'consumers/%s/%s/%s/%s' % (group, '%s', topicname,
                                               partition)
        owner_path, offset_path = base_path % 'owners', base_path % 'offsets'
        offset = zookeeper_client.get('/' + offset_path)[0]
        offset_total += int(offset)

        # logsize_num = kafka_consumer.get_partition_offsets(topicname, partition, -1, 1)[0]
        # logsize_total += int(logsize_num)

    return offset_total, logsize_total
Example 21
    def __init__(self, addr):

        self.client = SimpleClient(addr)
        self.producer = KeyedProducer(self.client)
        self.topic = 'ajay_test_topic'
 def __init__(self, addr, group_id):
     self.client = SimpleClient(addr)
     self.producer = KeyedProducer(self.client)
     self.group_id = group_id
Example 23
    def configure(self, **configs):
        """Configure the consumer instance

        Configuration settings can be passed to constructor,
        otherwise defaults will be used:

        Keyword Arguments:
            bootstrap_servers (list): List of initial broker nodes the consumer
                should contact to bootstrap initial cluster metadata.  This does
                not have to be the full node list.  It just needs to have at
                least one broker that will respond to a Metadata API Request.
            client_id (str): a unique name for this client.  Defaults to
                'kafka.consumer.kafka'.
            group_id (str): the name of the consumer group to join,
                Offsets are fetched / committed to this group name.
            fetch_message_max_bytes (int, optional): Maximum bytes for each
                topic/partition fetch request.  Defaults to 1024*1024.
            fetch_min_bytes (int, optional): Minimum amount of data the server
                should return for a fetch request, otherwise wait up to
                fetch_wait_max_ms for more data to accumulate.  Defaults to 1.
            fetch_wait_max_ms (int, optional): Maximum time for the server to
                block waiting for fetch_min_bytes messages to accumulate.
                Defaults to 100.
            refresh_leader_backoff_ms (int, optional): Milliseconds to backoff
                when refreshing metadata on errors (subject to random jitter).
                Defaults to 200.
            socket_timeout_ms (int, optional): TCP socket timeout in
                milliseconds.  Defaults to 30*1000.
            auto_offset_reset (str, optional): A policy for resetting offsets on
                OffsetOutOfRange errors. 'smallest' will move to the oldest
                available message, 'largest' will move to the most recent.  Any
                other value will raise an exception.  Defaults to 'largest'.
            deserializer_class (callable, optional):  Any callable that takes a
                raw message value and returns a deserialized value.  Defaults to
                 lambda msg: msg.
            auto_commit_enable (bool, optional): Enabling auto-commit will cause
                the KafkaConsumer to periodically commit offsets without an
                explicit call to commit().  Defaults to False.
            auto_commit_interval_ms (int, optional):  If auto_commit_enabled,
                the milliseconds between automatic offset commits.  Defaults to
                60 * 1000.
            auto_commit_interval_messages (int, optional): If
                auto_commit_enabled, a number of messages consumed between
                automatic offset commits.  Defaults to None (disabled).
            consumer_timeout_ms (int, optional): number of millisecond to throw
                a timeout exception to the consumer if no message is available
                for consumption.  Defaults to -1 (don't throw an exception).

        Configuration parameters are described in more detail at
        http://kafka.apache.org/documentation.html#highlevelconsumerapi
        """
        configs = self._deprecate_configs(**configs)
        self._config = {}
        for key in self.DEFAULT_CONFIG:
            self._config[key] = configs.pop(key, self.DEFAULT_CONFIG[key])

        if configs:
            raise KafkaConfigurationError('Unknown configuration key(s): ' +
                                          str(list(configs.keys())))

        if self._config['auto_commit_enable']:
            if not self._config['group_id']:
                raise KafkaConfigurationError(
                    'KafkaConsumer configured to auto-commit '
                    'without required consumer group (group_id)')

        # Check auto-commit configuration
        if self._config['auto_commit_enable']:
            logger.info("Configuring consumer to auto-commit offsets")
            self._reset_auto_commit()

        if not self._config['bootstrap_servers']:
            raise KafkaConfigurationError(
                'bootstrap_servers required to configure KafkaConsumer')

        reporters = [self._config['metrics_reporter']()] if \
            self._config['metrics_reporter'] else []
        metrics = Metrics(reporters=reporters)
        self.metrics = KafkaConsumerMetrics(metrics)

        self._client = SimpleClient(
            self._config['bootstrap_servers'],
            client_id=self._config['client_id'],
            timeout=(self._config['socket_timeout_ms'] / 1000.0),
            metrics=metrics,
        )
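A short usage sketch for the legacy consumer configured above, assuming the constructor forwards keyword arguments to configure() as the docstring states (topic name and broker address are illustrative):

# Hedged usage sketch for the legacy kafka-python KafkaConsumer API.
consumer = KafkaConsumer(
    'example-topic',                       # illustrative topic
    bootstrap_servers=['localhost:9092'],  # illustrative broker address
    group_id='example-group',
    auto_commit_enable=True,
    auto_offset_reset='smallest',
)
for message in consumer:
    print(message)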
Example 24
 def __init__(self, addr):
     self.client = SimpleClient(addr)
     self.producer = KeyedProducer(self.client)
     self.timezone = timezone('EST')
from tweepy import StreamListener
from tweepy import OAuthHandler
from tweepy import Stream
from kafka import KafkaProducer
from kafka.client import SimpleClient
from kafka.consumer import SimpleConsumer
from kafka.producer import SimpleProducer
import json
import configure

client = SimpleClient("localhost:9092")
producer = SimpleProducer(client)
consumer_key = configure.consumer_key
consumer_secret = configure.consumer_secret
access_token = configure.access_token
access_token_secret = configure.access_secret


class StdOutListener(StreamListener):
    """ A listener handles tweets that are received from the stream.
    This is a basic listener that just prints received tweets to stdout.
    """
    def on_data(self, data):
        data = json.loads(data)
        #print(data)
        user_id = data['user']['id_str']
        producer.send_messages('covid', (user_id).encode('utf-8'))
        print(user_id)
        return True

    def on_error(self, status):
        # log the error status returned by the Twitter stream
        print(status)
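The listener above is never attached to a stream in this snippet. A hedged sketch of the usual wiring, using the credentials loaded from configure above (the track term is illustrative, chosen to match the 'covid' topic used in on_data):

if __name__ == '__main__':
    # Hedged sketch: authenticate with tweepy and stream into StdOutListener.
    listener = StdOutListener()
    auth = OAuthHandler(consumer_key, consumer_secret)
    auth.set_access_token(access_token, access_token_secret)
    stream = Stream(auth, listener)
    stream.filter(track=['covid'])  # illustrative filter term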
Example 26
def getBid():
    return float('{0:.03f}'.format(np.random.pareto(.3) % 64 + 1))


def getJsonLine(i):
    timestamp = getTimeStamp()
    pid = getPid(i)
    bidPrice = getBid()

    line = '{ "timeStamp" :"' + timestamp + '",'
    line += '  "productId"  :"' + pid + '",'
    line += '  "bidPrice":' + str(bidPrice) + '}'

    return line
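getTimeStamp and getPid are referenced above but elided from this snippet; the stand-ins below are purely hypothetical so the example can run (an ISO-8601 UTC timestamp and a product id returned as a string):

# Hypothetical stand-ins for helpers not shown in this snippet.
import datetime
import random

def getTimeStamp():
    # assumed format: ISO-8601 UTC timestamp
    return datetime.datetime.utcnow().isoformat()

def getPid(i):
    # assumed behaviour: a fixed product id for i >= 0, otherwise a random one;
    # the id range is an assumption.
    if i >= 0:
        return str(i)
    return str(random.randint(0, 99))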


if __name__ == "__main__":
    if len(sys.argv) != 3:
        print("Usage: produceBids.py <node> <topic>", file=sys.stderr)
        exit(-1)
    kafkaNode, kafkaTopic = sys.argv[1:]
    #   load date
    client = SimpleClient(kafkaNode)
    producer = KeyedProducer(client)
    #    for i in xrange(NumOfProduct):
    while True:
        line = getJsonLine(-1)
        time.sleep(.01)
        print(line)
        producer.send_messages(kafkaTopic, str(hash(line) % 100), line)
Example 27
from kazoo.client import KazooClient
from kafka.consumer import base
from kafka.structs import (OffsetRequestPayload, OffsetCommitRequestPayload,
                           OffsetFetchRequestPayload)
import sys

# zookeepers="10.10.217.152:2182"
zookeepers = "127.0.0.1:2182"

kafka = "127.0.0.1:9092"

group = "consumer-group"

if __name__ == '__main__':

    broker = SimpleClient(kafka)
    lags = {}
    zk = KazooClient(hosts=zookeepers, read_only=True)
    zk.start()
    logsize = 0
    #    topics=zk.get_children("/consumers/%s/owners" % (group) )
    topic = sys.argv[1]
    data_need = sys.argv[2]
    #    for topic in topics:
    if topic:
        logsize = 0
        #	print topic
        partitions = broker.get_partition_ids_for_topic(topic)
        #	print partitions
        consumer = KafkaConsumer(broker, group, str(topic))
        responses = broker.send_offset_fetch_request(
            group, [OffsetFetchRequestPayload(topic, p) for p in partitions],
            fail_on_error=True)
Example 28
from tweepy.streaming import StreamListener
from tweepy import OAuthHandler
from tweepy import Stream
from kafka import KafkaProducer
from kafka.client import SimpleClient
from kafka.consumer import SimpleConsumer
from kafka.producer import SimpleProducer

client = SimpleClient("localhost:8080")
producer = SimpleProducer(client)
consumer_key = ""
consumer_secret = ""
access_token = ""
access_secret = ""

def main():
    '''
    main function: initiates a Kafka consumer and initializes the tweetdata database.
    The consumer consumes tweets from the producer, extracts features, cleanses the
    tweet text, calculates sentiments and loads the data into a Postgres database.
    '''
    # set up a Kafka consumer
    consumer = KafkaConsumer('movies')
    tweets, conn, dbcur = initialize(db_name="tweetdata")
    for msg in consumer:
        output = []
        output.append(json.loads(msg.value))
        print output
        print '\n'

        # Function to extract features from tweets
 def __init__(self, addr):
     self.client = SimpleClient(addr)
     self.producer = KeyedProducer(self.client)
Example 30
    return well_stat, (wellID, str(reform_event_data))


def days_between(d1, d2):
    # Note: timedelta.seconds only covers the sub-day part of the difference,
    # so this returns the seconds component rather than a count of whole days.
    d1 = datetime.strptime(d1, "%Y-%m-%d %H:%M:%S")
    d2 = datetime.strptime(d2, "%Y-%m-%d %H:%M:%S")
    return abs((d2 - d1).seconds)


if __name__ == "__main__":

    args = sys.argv
    ip_address = str(args[1])
    partition_key = str(args[2])

    client = SimpleClient(ip_address)
    producer = KeyedProducer(client)

    # --------------------------------
    # Defining data variables
    # --------------------------------
    dt = datetime.datetime(2017, 10, 9, 1, 59,
                           59)  # Defining a starting date and time
    max_step = 600  # max time step size
    numOfWells = 50000  # Number of wells

    # --------------------------------
    # Generating time series data
    # --------------------------------
    event_types = ['on', 'off']
    completion_types = ['BP', 'ESP']  # diff from 'completion_type' (no s)