def process(time, lines):
    """1. Select users to push ads to.
       2. Save the user-product correlation table to Cassandra.
       3. Match users with bidders.
       4. Save the bid winners to Cassandra.
    Input:
    lines: (ts string, uid string, topic vector)
    """
    print("========= %s =========" % str(time))
    sqlContext = getSqlContextInstance(lines.context)
    # calculate the user-product correlation table
    runningWindow = lines.map(lambda (k, v): ((k[0], time.isoformat()), v))\
        .reduceByKey(lambda x, y: x + y)\
        .map(lambda (x, u): [(x, y, float(u.dot(v) / (norm(u) * norm(v)))) for (y, v) in bv.value])\
        .flatMap(lambda x: x)\
        .filter(lambda (x, y, s): s > .97)
    rowRDD = runningWindow.map(lambda x: Row(uid=x[0][0], pid=x[1], score=x[2], ts=x[0][1]))
    # saveRDD(sqlContext, rowRDD, keyspaceName='ad_flow', tableName='records1s')
    print("========= %d =========" % rowRDD.count())
    # send the correlation table to Kafka
    if rowRDD.count() > 0:
        client = SimpleClient(KAFKA_NODE)
        producer = KeyedProducer(client)
        for row in rowRDD.collect():
            line = '{ "tick" :"' + str(time.isoformat()) + '",'
            line += '  "uid"  :"' + str(row['uid']) + '",'
            line += ' "score":"' + str(row['score']) + '",'
            line += '  "pid":' + str(row['pid']) + '}'
            producer.send_messages(KAFKA_TOPIC, str(hash(line)), line)
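The process functions in these snippets take a batch time plus an RDD and read globals like bv, KAFKA_NODE, and KAFKA_TOPIC, which suggests they are driven from a Spark Streaming DStream via foreachRDD. The actual driver is not shown here; the following is only a minimal wiring sketch, with the broker address, topic name, and batch interval as assumed placeholders.

# Minimal driver sketch (assumed broker/topic/interval; bv, KAFKA_NODE, KAFKA_TOPIC defined as above).
import json
import numpy as np
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils

sc = SparkContext(appName="ad-flow")
ssc = StreamingContext(sc, 1)  # 1-second micro-batches (assumption)

# Each Kafka record's value is a JSON string; key each user vector by (uid, ts) so k[0] is the uid.
stream = KafkaUtils.createDirectStream(ssc, ["user_topics"],
                                       {"metadata.broker.list": "localhost:9092"})
lines = stream.map(lambda kv: json.loads(kv[1]))\
              .map(lambda x: ((x['uid'], x['ts']), np.asarray([float(i) for i in x['topic']])))

lines.foreachRDD(process)  # calls process(time, rdd) for every batch
ssc.start()
ssc.awaitTermination()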
def process(time, lines):
    """Calculate the user-product correlation table and select ad-push events.
    Input:
    lines: (ts string, uid string, topic vector)
    """
    print("========= %s =========" % str(time))
    sqlContext = getSqlContextInstance(lines.context)
    # calculate the user-product correlation table
    runningWindow = lines.map(lambda (k, v): ((k[1], time.isoformat()), v))\
        .reduceByKey(lambda x, y: x + y)\
        .map(lambda (x, u): (x, [(pid, score) for (pid, score) in ((y, float(u.dot(v) / (norm(u) * norm(v)))) for (y, v) in bv.value) if score > .90]))\
        .filter(lambda (k, v): v != [])
    rowRDD = runningWindow.map(
        lambda x: Row(uid=x[0][0], score=x[1], ts=x[0][1]))
    # saveRDD(sqlContext, rowRDD, keyspaceName='ad_flow', tableName='records1s')
    print("========= %d =========" % rowRDD.count())
    # send the per-user (pid, score) lists to Kafka
    if rowRDD.count() > 0:
        client = SimpleClient(KAFKA_NODE)
        producer = KeyedProducer(client)
        for row in rowRDD.collect():
            line = '{ "timeStamp" :"' + str(time) + '",'
            line += '  "uid"  :"' + str(row['uid']) + '",'
            line += ' "score":' + json.dumps(dict(row['score'])) + '}'
            producer.send_messages(KAFKA_TOPIC, str(hash(line)), line)
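Hand-concatenating JSON, as in the loops above, is easy to get wrong (quoting, commas, braces). A small alternative sketch: build the message as a dict and let json.dumps handle the serialization. This is not the original code, just an equivalent construction.

# Equivalent message construction with json.dumps (sketch, not the original code).
import json

def build_message(time, row):
    payload = {
        "timeStamp": str(time),
        "uid": str(row['uid']),
        "score": dict(row['score']),  # list of (pid, score) pairs -> {pid: score}
    }
    return json.dumps(payload)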
    def __init__(self, addr):
        self.client = SimpleClient(addr)
        self.producer = KeyedProducer(self.client)
        self.sess = requests.Session()
        adapter = requests.adapters.HTTPAdapter(max_retries=5)
        self.sess.mount('http://', adapter)
        self.sess.mount('https://', adapter)
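The class this constructor belongs to is not shown; it pairs a Kafka producer with a retrying HTTP session. A plausible companion method (the name produce_from_url and the payload handling are assumptions) would poll an endpoint through the session and forward each record to Kafka:

    # Hypothetical companion method, not part of the original class.
    def produce_from_url(self, url, topic):
        import json
        resp = self.sess.get(url, timeout=5)  # connection errors retried up to 5 times by the adapter
        resp.raise_for_status()
        for record in resp.json():
            msg = json.dumps(record)
            self.producer.send_messages(topic, str(hash(msg)), msg)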
def process(time, lines):
    """Match users with bidders.
    Input:
    lines: (ts string, uid string, topic vector)
    """
    print("========= %s =========" % str(time))
    sqlContext = getSqlContextInstance(lines.context)
    # calculate the user-product correlation table
    runningWindow = lines.map(lambda (k, v): ((k[0], str(time)), v))\
        .reduceByKey(lambda x, y: x + y)\
        .map(lambda (x, u): [(x, y, float(u.dot(v) / (norm(u) * norm(v)))) for (y, v) in bv.value])\
        .flatMap(lambda x: x)\
        .filter(lambda (x, y, s): s > .97)
    rowRDD = runningWindow.map(
        lambda x: Row(uid=x[0][0], pid=x[1], score=x[2], ts=x[0][1]))
    print("========= %d =========" % rowRDD.count())
    if rowRDD.count() > 0:
        client = SimpleClient(KAFKA_NODE)
        producer = KeyedProducer(client)
        for row in rowRDD.collect():
            line = '{ "tick" :"' + str(time.isoformat()) + '",'
            line += '  "uid"  :"' + str(row['uid']) + '",'
            line += ' "score":"' + str(row['score']) + '",'
            line += '  "pid":' + str(row['pid']) + '}'
            producer.send_messages(KAFKA_TOPIC, str(hash(line)), line)
Example 5
def process(time, lines):
    """Match users with bidders.
    Input:
    lines: (ts string, uid string, {pid: score} dict)
    """
    print("========= %s =========" % str(time))
    sqlContext = getSqlContextInstance(lines.context)
    # match each user's product scores against the bidders
    rowRDD = lines.map(lambda x: (x['uid'], matchBids(x['score'])))\
                  .map(lambda x: Row(uid=x[0], pid=x[1][0], price=x[1][1]))
    print("========= %d =========" % rowRDD.count())
    if rowRDD.count() > 0:
        # send winning bids to Kafka
        client = SimpleClient(KAFKA_NODE)
        producer = KeyedProducer(client)
        for row in rowRDD.collect():
            line = '{ "pid" :"' + str(row['pid']) + '",'
            line += '  "uid"  :"' + str(row['uid']) + '",'
            line += ' "price":"' + str(row['price']) + '",'
            line += '  "ts":"' + str(time) + '"}'
            producer.send_messages(KAFKA_TOPIC, str(hash(line)), line)

        # save winning bids to Cassandra
        rowRDD.map(lambda x: Row(pid=x['pid'], ts=str(time), price=x['price']))\
              .toDF().write\
              .format("org.apache.spark.sql.cassandra")\
              .options(table='winningbid10s', keyspace='ad_flow')\
              .save(mode="append")
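matchBids is called above but not included in this snippet. A minimal sketch of what such a helper could look like follows; the bidder table and the pricing rule here are assumptions, not the original logic.

# Hypothetical matchBids: pick the product whose bidder pays the most for this user's scores.
# 'bids' maps pid -> bid price; in the original it would come from a broadcast variable or a database.
bids = {"101": 0.50, "205": 0.35, "330": 0.80}

def matchBids(scores):
    """scores: {pid: score} dict; returns (winning pid, price charged)."""
    candidates = [(pid, bids[pid] * score) for pid, score in scores.items() if pid in bids]
    if not candidates:
        return (None, 0.0)
    pid, price = max(candidates, key=lambda t: t[1])
    return (pid, price)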
Example 6
def process(time, lines):
    """Process tweets.
    Input:
    lines: (ts string, uid string, state string, tweet vector)
    Output:
    JSON: (ts string, uid string, topicVec vector)
    """
    print("========= %s =========" % str(time))
    sqlContext = getSqlContextInstance(lines.context)
    # rowRDD = lines.map(lambda x: (x['timeStamp'], x['userId'], getMeanVector(x['tweet'])))\
    #               .filter(lambda (time, uid, vec): vec != [])\
    #               .map(lambda x: Row(timestamp=x[0], uid=x[1], topicVec=x[2]))
    rowRDD = lines.map(lambda x: [((x['timeStamp'], x['userId']), word2vec(item)) for item in x['tweet'] if isInVolcabulary(item)])\
                  .flatMap(lambda x: x)\
                  .filter(lambda (k, vec): vec != [])\
                  .reduceByKey(lambda x, y: x + y)\
                  .map(lambda x: Row(timestamp=x[0][0], uid=x[0][1], topicVec=x[1]))

    print("========= %d =========" % rowRDD.count())
    # send per-user topic vectors to Kafka
    if rowRDD.count() > 0:
        client = SimpleClient(kafkaNodeBC.value)
        producer = KeyedProducer(client)
        for row in rowRDD.collect():
            line = '{ "timestamp" :"' + str(row['timestamp']) + '",'
            line += '  "uid"  :"' + str(row['uid']) + '",'
            line += ' "topicVec":' + json.dumps(
                [float(i) for i in row['topicVec']]) + '}'
            producer.send_messages(outgoingTopic, str(hash(line)), line)
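The word2vec and isInVolcabulary helpers used above are defined elsewhere in the project. A minimal sketch, assuming a pre-trained gensim KeyedVectors model (the model path is only a placeholder):

# Sketch of the two lookup helpers, assuming a gensim word-vector model.
from gensim.models import KeyedVectors

model = KeyedVectors.load_word2vec_format("/path/to/vectors.bin", binary=True)

def isInVolcabulary(word):
    # True if the token has a vector in the model's vocabulary
    return word in model

def word2vec(word):
    # Return the token's embedding as a NumPy array, so reduceByKey can sum vectors elementwise
    return model[word]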
Example 7
def main(servers: List[str]) -> None:
    """
    Main Method

    Arguments:
        servers: List of Zookeeper Kafka Host IPs
    """
    mysql_session = pymysql.connect(**MYSQL_CONF)

    users = query_for_users(mysql_session)
    photos: Deque = deque([], maxlen=3000)
    tags = query_for_tags(mysql_session)
    locations = query_for_locations(mysql_session)

    simple_client = SimpleClient(servers)
    producer = KeyedProducer(simple_client)

    events = [
        comment_producer,
        #create_user_producer,
        follow_producer,
        like_producer,
        create_photo_producer,
        unfollow_producer
    ]

    while True:
        event = generate_random_events(events)
        print(event(users, photos, tags, locations, producer))
        time.sleep(0.02)
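The event producers registered in the list above (comment_producer, like_producer, and so on) and generate_random_events are defined elsewhere. As a rough sketch, each producer presumably draws random entities and publishes one JSON event; the topic name and payload fields below are assumptions, not the real schema.

# Hypothetical shape of generate_random_events and of one event producer.
import json
import random
import time


def generate_random_events(events):
    # Pick one of the registered producer callables at random.
    return random.choice(events)


def like_producer(users, photos, tags, locations, producer):
    # Assumed topic name and payload fields; the real schema is not shown here.
    event = {
        "event": "like",
        "user_id": random.choice(users),
        "photo_id": random.choice(photos) if photos else None,
        "created_time": int(time.time()),
    }
    line = json.dumps(event)
    producer.send_messages("events", str(event["user_id"]), line)
    return line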
Example 8
    def __init__(self, addr):
        self.client = SimpleClient(addr)
        self.producer = KeyedProducer(self.client)

        # The 1999 KDDCup network traffic dataset
        self.data_file = open('/home/ubuntu/opt/realtimeAnomalies/src/main/test/kddcup.testdata.unlabeled', 'r')
        self.mem_data = []
        for record in self.data_file:
            self.mem_data.append(record)
Example 9
    def __init__(self, hosts_list, topic, key=None):
        logging.Handler.__init__(self)
        self.kafka_client = KafkaClient(hosts_list)
        self.key = key
        self.kafka_topic_name = topic
        if not key:
            self.producer = SimpleProducer(self.kafka_client)
        else:
            self.producer = KeyedProducer(self.kafka_client)
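This logging handler only shows its constructor; the emit method it needs is not included here. A minimal sketch of one, formatting the record and routing it through whichever producer was created:

    # Sketch of the missing emit(): send each formatted log record to the Kafka topic.
    def emit(self, record):
        if record.name.startswith('kafka'):
            return  # avoid recursing into kafka-python's own log messages
        try:
            msg = self.format(record)
            if self.key is None:
                self.producer.send_messages(self.kafka_topic_name, msg)
            else:
                self.producer.send_messages(self.kafka_topic_name, self.key, msg)
        except Exception:
            self.handleError(record)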
Example 10
    def __init__(self, topic, producer_type=ProducerType.SIMPLE,
                 host_port="127.0.0.1:9092", **producer_opts):

        self.topic = topic
        self.host_port = host_port
        if producer_type == ProducerType.SIMPLE:
            self.producer = SimpleProducer(KafkaClient(host_port),
                                           **producer_opts)
        else:
            self.producer = KeyedProducer(KafkaClient(host_port),
                                          **producer_opts)
Example 11
    def __init__(self, host_list, topic, **kwargs):
        logging.Handler.__init__(self)

        self.kafka_client = SimpleClient(host_list)
        self.key = kwargs.get("key", None)
        self.kafka_topic_name = topic

        if not self.key:
            self.producer = SimpleProducer(self.kafka_client, **kwargs)
        else:
            self.producer = KeyedProducer(self.kafka_client, **kwargs)
Example 12
    def __init__(self, hosts_list, topic, timeout_secs=DEFAULT_SOCKET_TIMEOUT_SECONDS, **kwargs):
        logging.Handler.__init__(self)

        self.kafka_client = KafkaClient(hosts_list, timeout=timeout_secs)
        self.key = kwargs.get("key", None)
        self.kafka_topic_name = topic

        if not self.key:
            self.producer = SimpleProducer(self.kafka_client, **kwargs)
        else:
            self.producer = KeyedProducer(self.kafka_client, **kwargs)
        self.addFilter(KafkaLoggingFilter())
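A short usage sketch for handlers like the two above; KafkaLoggingHandler is a placeholder name for the class these constructors belong to, and the broker/topic values are assumptions.

# Hypothetical usage: attach the handler so application logs flow into Kafka.
import logging

logger = logging.getLogger("my_app")
logger.setLevel(logging.INFO)
logger.addHandler(KafkaLoggingHandler("localhost:9092", "app_logs"))
logger.info("service started")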
Example 13
    def __init__(self, addr):
        self.timezone = timezone('EST')
        self.host = 'ec2-34-192-152-48.compute-1.amazonaws.com'
        self.auction_db = 'auctiontable'
        self.bid_db = 'bidtable'
        self.client = SimpleClient(addr)
        self.producer = KeyedProducer(self.client)
        self.active_auctions = []
        self.conn_auction_db = None
        self.conn_bid_db = None
        self.auction_table = None
        self.bid_table = None
        self.connected_auction_db = False
        self.connected_bid_db = False
Example 14
    def __init__(self, addr):

        self.client = SimpleClient(addr)
        self.producer = KeyedProducer(self.client)
        self.topic = 'ajay_test_topic'
Example 15

    def __init__(self, addr, group_id):
        self.client = SimpleClient(addr)
        self.producer = KeyedProducer(self.client)
        self.group_id = group_id
Example 16
    def __init__(self, addr=None):
        self.isNone = True
        if addr is not None:
            self.client = SimpleClient(addr)
            self.producer = KeyedProducer(self.client)
            self.isNone = False
Example 17
    def __init__(self, addr):
        self.client = SimpleClient(addr)
        self.producer = KeyedProducer(self.client)
        self.timezone = timezone('EST')
Example 18
    def __init__(self, addr):
        self.client = KafkaClient(addr)
        self.producer = KeyedProducer(self.client)
Example 19
    def __init__(self, addr):
        self.client = KafkaClient(addr)
        self.producer = KeyedProducer(self.client)
        self.artist_id = []
        self.artwork_id = []
Example 20

# select a random piece of news from the collected set of tweets and send it as a Kafka message
import glob
import os
import random
import time

from kafka import KafkaClient, KeyedProducer

DATADIR = "/home/ubuntu/synthetic_twitter/"
KAFKA_NODE = "ec2-54-215-247-116.us-west-1.compute.amazonaws.com"
KAFKA_TOPIC = "twitter"
os.chdir(DATADIR)
files = glob.glob("*.archive")

#select a random company
datafile = random.choice(files)
datafile = DATADIR + datafile
# select a random line from the data file for the news item
# Algorithm R (Waterman's reservoir sampling, Knuth 3.4.2) with a reservoir of size 1
data = open(datafile, "r")
line = next(data)
for num, nextline in enumerate(data):
    if random.randrange(num + 2): continue
    line = nextline

data.close()

#add "Synthetic Twitter" as news outlet
line = line.rstrip().replace('}', ', "newsoutlet":"Synthetic Twitter"}')
#add timestamp
line = line.replace('}', ',"newstime":"' + time.strftime("%c") + '"}')

# create the producer and send the message
client = KafkaClient(KAFKA_NODE)
producer = KeyedProducer(client)
producer.send_messages(KAFKA_TOPIC, str(hash(line) % 2), line)
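The loop above is reservoir sampling with a reservoir of size one, so every line of the file ends up equally likely to be chosen without knowing the file length in advance. The same idea extends to keeping k lines; the sketch below is not part of the original script.

# Algorithm R generalized to a reservoir of k items (sketch).
import random

def reservoir_sample(iterable, k):
    reservoir = []
    for i, item in enumerate(iterable):
        if i < k:
            reservoir.append(item)
        else:
            j = random.randrange(i + 1)  # item i+1 is kept with probability k/(i+1)
            if j < k:
                reservoir[j] = item
    return reservoir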
Example 21
    def __init__(self, addr, userslist, venueslist):
        self.client = SimpleClient(addr)
        self.producer = KeyedProducer(self.client)
        self.userslist = userslist[0:500000]
        self.venueslist = venueslist[0:250000]
Example 22

    def __init__(self, addr):
        self.client = SimpleClient(addr)
        self.producer = KeyedProducer(self.client)
Example 23
    def __init__(self, addr):
        # set up the connection to Kafka in order to send messages
        self.client = SimpleClient(addr)
        self.producer = KeyedProducer(self.client)