def test_ts(self):
    kafka = KafkaClient(config.get("kafka.host1") + "," + config.get("kafka.host2"))
    # consumer = SimpleConsumer(kafka, "my-group112", "test")
    consumer = SimpleConsumer(kafka, self.GROUP_NAME, self.KAFKA_TOPIC,
                              fetch_size_bytes=3000000,
                              buffer_size=2000000000,
                              max_buffer_size=2000000000)
    while True:
        print("HELLO")
        # Prepare data for insert and copy to S3
        # data_str = StringIO()
        count = 0
        # last_offset = 2
        consumer.seek(2, 0)
        for message in consumer.get_messages(count=100, block=False, timeout=0.1):
            count += 1
            print(message.message.value)
            # # Write tweets to StringIO
            # self.write_to_data_str(message, data_str)
        # # Store batch tweets to S3
        # self.write_to_s3(data_str, last_offset)
        if count != 100:
            break
def load_data_to_s3(self, photo_data):
    data_str = StringIO()
    ordering = ["pid", "yymmddhh", "word", "url"]
    for data in photo_data:
        row_arr = []
        for field in ordering:
            val = data[field] if data[field] else None
            if val is None:
                row_arr.append('\N')
            else:
                row_arr.append(unicode(val))
        data_str.write('\007'.join(row_arr).encode('utf-8') + '\n')

    # Copy data for load to S3
    s3_connection = S3Connection(config.get('S3.access_key'), config.get('S3.secret'))
    bucket = s3_connection.get_bucket(config.get('S3.bucket'), validate=False)
    s3_file = Key(bucket)
    s3_file.key = self.S3_KEY
    data_str.seek(0)
    s3_file.set_contents_from_file(data_str)
def https_open(self, req):
    ca_certs = config.get('http.ca_certs_file', DEFAULT_CA_CERTS)
    if config.get('http.verify_server_certificates', True) and os.path.exists(ca_certs):
        frags = urlparse.urlparse(req.get_full_url())
        ssl.get_server_certificate((frags.hostname, frags.port or 443), ca_certs=ca_certs)
    return self.do_open(httplib.HTTPSConnection, req)
def get_socket(self):
    '''
    Creates and connects a new socket, or returns an existing one if this
    method was called previously.

    Returns a (protocol, socket) tuple, where protocol is either 'tcp' or
    'udp'. If the returned socket is None, the operation failed and details
    were logged.
    '''
    if self.sock is not None:
        return (self.proto, self.sock)

    proto = config.get('statsd.protocol', 'udp')
    self.proto = proto
    self.host = config.get('statsd.host', None)
    self.port = config.get('statsd.port', 8125)

    if self.host is None or self.port is None:
        return (self.proto, None)

    if (self.next_retry is not None) and (self.next_retry > time.time()):
        return (self.proto, None)

    if proto == 'udp':
        self.sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
        log.debug('Created udp statsd socket')
        return (proto, self.sock)

    if proto == 'tcp':
        if self.host is None or not isinstance(self.port, int):
            log.error('Invalid TCP statsd config: host=%r port=%r', self.host, self.port)
            self.sock = None
        else:
            try:
                self.sock = socket.create_connection(address=(self.host, self.port), timeout=4.0)
                log.debug('Connected tcp statsd socket to %s:%i', self.host, self.port)
                # A successful connection resets the retry backoff to 0.5 seconds
                self.next_retry = None
                self.backoff = 0.5
            except socket.error:
                log.exception('Cannot open tcp stats socket %s:%i', self.host, self.port)
                self.sock = None
                # Every time a connection fails, we add 25% of the backoff value
                # We cap this at max_backoff so that we guarantee retries after
                # some period of time
                if self.backoff > self.max_backoff:
                    self.backoff = self.max_backoff
                log.warning('Unable to connect to statsd, not trying again for %.03f seconds', self.backoff)
                self.next_retry = (time.time() + self.backoff)
                self.backoff *= 1.25
        return (proto, self.sock)

    log.warning('Unknown protocol configured for statsd socket: %s', proto)
    return (proto, None)
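A minimal caller sketch for `get_socket` above, assuming the surrounding statsd client object also exposes `host`, `port`, and `sock`; the `send_counter` helper and the metric payload format are hypothetical additions, not part of the original code:

import socket

def send_counter(client, metric, value=1):
    # Hypothetical caller; `client` is assumed to expose get_socket(), host, port and sock.
    proto, sock = client.get_socket()
    if sock is None:
        return False  # statsd unreachable or still in retry backoff; details already logged
    payload = ('%s:%d|c' % (metric, value)).encode('utf-8')
    try:
        if proto == 'udp':
            sock.sendto(payload, (client.host, client.port))
        else:  # 'tcp' sockets are already connected
            sock.sendall(payload + b'\n')
        return True
    except socket.error:
        client.sock = None  # drop the socket so the next call reconnects
        return False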
def get_list_of_events(self, date, event_types):
    data = {}
    eventbrite = EventBriteRest(
        rest_endpoint=config.get("eventbrite.endpoint_rest"),
        token=config.get("eventbrite.token"))
    start_time = (date + timedelta(days=1)).strftime('%Y-%m-%dT%H:%M:%SZ')
    end_time = (date + timedelta(days=2)).strftime('%Y-%m-%dT%H:%M:%SZ')

    # Search for each event type
    for event_type in event_types:
        result = eventbrite.search_events_by(event_type, start_time, end_time, 1)

        # Determine the number of events returned from the API
        event_count = result["pagination"]["object_count"]
        page_count = result["pagination"]["page_count"]

        # If there is data
        if event_count > 0:
            data[event_type] = []

            # Get data from each page
            for i in xrange(1, page_count + 1):
                result = eventbrite.search_events_by(event_type, start_time, end_time, i)

                # Append data into array
                for event in result["events"]:
                    data[event_type].append(event)
    return data
def check_stream(self):
    api = TwitterAPI(consumer_key=config.get("twitter.consumer_key"),
                     consumer_secret=config.get("twitter.consumer_secret"),
                     access_token_key=config.get("twitter.access_token"),
                     access_token_secret=config.get("twitter.access_token_secret"))
    while True:
        tweeter_stream = api.request('statuses/filter', {'locations': "-123.66,32.54,-113.77,39.57,-93.82,24.32,-65.08,47.84"})
        # tweeter_stream = api.request('statuses/filter', {'locations': self.get_geo_str()})
        # print(self.get_geo_str())
        start_time = time.time()
        # print("len")
        # print((tweeter_stream.text))
        # print((tweeter_stream.stream))

        # Stream data
        for tweet in tweeter_stream:
            # Break out of the for loop at the specified interval to query a new set of geo coordinates
            if time.time() > start_time + self.REFRESH_INTERVAL:
                print("breaktime")
                break
            # Publish tweets to Kafka
            print(tweet)
def get_conference_pictures(self, keyword):
    json_data = []

    # Cassandra initialization
    cluster = Cluster([config.get("cassandra.host1"), config.get("cassandra.host2")])
    session = cluster.connect('insight')

    # Instagram initialization
    instagram_api = InstagramAPI(client_id=config.get("instagram.client_id"),
                                 client_secret=config.get("instagram.client_secret"))

    yymmdd = datetime.utcnow().strftime('%y') + datetime.utcnow().strftime('%m') + datetime.utcnow().strftime('%d')
    rows = session.execute(self.TOP_10_QUERY % (yymmdd))
    for (yymmdd, count, word) in rows:
        img_arr = []
        popular_media = instagram_api.media_popular(count=20)
        for media in popular_media:
            img_arr.append(media.images['standard_resolution'].url)
        json_data.append({"word": word, "count": count, "pic_url": img_arr})
    return json_data
def __init__(self): self.conn_opts = dict( host=config.get("redshift_db.host"), port=config.get("redshift_db.port"), user=config.get("redshift_db.user"), password=config.get("redshift_db.password"), database=config.get("redshift_db.db") )
def __init__(self): self.conn_opts = dict( host=config.get("mysql_db.host"), port=config.get("mysql_db.port"), user=config.get("mysql_db.user"), passwd=config.get("mysql_db.password"), db=config.get("mysql_db.db") )
def run(self):
    cluster = Cluster([config.get("cassandra.host1"), config.get("cassandra.host2")])
    session = cluster.connect('insight')
    print(session.execute("""describe tables"""))
def clean_s3_files(self):
    s3_connection = S3Connection(config.get('S3.access_key'), config.get('S3.secret'))
    bucket = s3_connection.get_bucket(config.get('S3.bucket'), validate=False)
    for key in bucket.list(self.S3_KEY):
        bucket.delete_key(key)
def https_open(self, req):
    ca_certs = config.get('http.ca_certs_file', DEFAULT_CA_CERTS)
    if config.get('http.verify_server_certificates', True) and os.path.exists(ca_certs):
        frags = urlparse(req.get_full_url())
        ssl.get_server_certificate((frags.hostname, frags.port or 443), ca_certs=ca_certs)
    return self.do_open(http.client.HTTPSConnection, req)
def test_insta(self, options=None):
    api = InstagramAPI(client_id=config.get("instagram.client_id"),
                       client_secret=config.get("instagram.client_secret"))
    popular_media = api.media_popular(count=20)
    for media in popular_media:
        print(media.images['standard_resolution'].url)
def sendmail(mailto, subject, message, subtype='html', charset='utf-8',
             smtpconfig=None, attachments={}, use_starttls=False, **headers):
    '''
    Send an email to the given address. Additional SMTP headers may be
    specified as keyword arguments.
    '''
    if not smtpconfig:
        # we support both smtp and mail for legacy reasons
        # smtp is the correct usage.
        smtpconfig = config.get('smtp') or config.get('mail')

    # mailto arg is explicit to ensure that it's always set, but it's processed
    # mostly the same way as all other headers
    headers['To'] = _string_or_list(mailto)

    msg = MIMEMultipart('alternative')
    msg['Subject'] = subject
    for key, value in six.iteritems(headers):
        for val in _string_or_list(value):
            msg.add_header(key, val)
    text = MIMEText(message, subtype, charset)
    msg.attach(text)

    # Add attachments
    for file_name, file_payload in attachments.items():
        part = MIMEBase('application', 'octet-stream')
        part.set_payload(file_payload.encode(charset))
        Encoders.encode_base64(part)
        part.add_header('Content-Disposition', 'attachment; filename="%s"' % file_name)
        msg.attach(part)

    if 'From' not in msg:
        msg['From'] = smtpconfig.get('from')
    mailfrom = msg['From']
    assert isinstance(mailfrom, six.string_types)

    recipients = []
    for toheader in ('To', 'CC', 'BCC'):
        recipients += msg.get_all(toheader, [])
    if 'BCC' in msg:
        del msg['BCC']

    smtp = smtplib.SMTP(smtpconfig.get('host'), smtpconfig.get('port'))
    if smtpconfig.get('username', None) is not None and smtpconfig.get('password', None) is not None:
        if use_starttls:
            smtp.ehlo()
            smtp.starttls()
            smtp.ehlo()
        smtp.login(smtpconfig.get('username'), smtpconfig.get('password'))
    smtp.sendmail(mailfrom, recipients, msg.as_string())
    smtp.quit()
    log.info('Sent email to %s (Subject: %s)', recipients, subject)
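A hedged usage example for the attachment-capable `sendmail` variant above; the addresses, subject, and CSV payload are made up, and it assumes `attachments` maps file names to already-read string payloads, as the `attachments.items()` loop implies:

# Hypothetical values throughout; only the keyword names come from the function signature.
report_csv = "date,events\n2015-02-04,42\n"
sendmail(
    mailto=['ops@example.com', 'data@example.com'],
    subject='Nightly batch report',
    message='<p>See attached CSV.</p>',
    attachments={'report.csv': report_csv},
    use_starttls=True,            # upgrades the SMTP session before login
    CC='oncall@example.com',      # extra headers pass through **headers
)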
def get_updates(self, timestamp):
    start_time = time.time()
    d_time = datetime.fromtimestamp(long(timestamp.encode("utf-8")) / 1000)
    timestamp = convert_time_to_utc(d_time)

    # Cassandra initialization
    cluster = Cluster([config.get("cassandra.host1"), config.get("cassandra.host2")])
    session = cluster.connect('insight')

    t_yymmddhh = datetime.utcnow().strftime('%y') + datetime.utcnow().strftime('%m') + datetime.utcnow().strftime('%d') + datetime.utcnow().strftime('%H')
    print(self.QUERY_REAL_TIME_TWEETS % (t_yymmddhh, timestamp))
    tweeted_words = session.execute(self.QUERY_REAL_TIME_TWEETS % (t_yymmddhh, timestamp))
    # tweeted_words = session.execute(self.QUERY_REAL_TIME_TWEETS % "15020416")

    tweet_data = {}
    for (yymmddhh, timestamp, lat, lng, data) in tweeted_words:
        print("data")
        lat = round(float(lat), 2)
        lng = round(float(lng), 2)
        location = str(lat) + "," + str(lng)

        # Initialise the record for this location the first time it is seen
        if location not in tweet_data:
            tweet_data[location] = {"words": [], "tweets": 0}

        # Get words from the cassandra column
        word_count_pairs = [pair for pair in data.split(":")]
        for pair in word_count_pairs:
            data = pair.split(",")
            word = data[0].encode("utf-8")
            count = int(data[1].encode("utf-8"))
            if word == "OVERALL_CNT":
                tweet_data[location]["tweets"] += count
                continue
            tweet_data[location]["words"].append({"word": word, "count": count})

    # Delete misc data
    if "0.0,0.0" in tweet_data:
        tweet_data.pop("0.0,0.0")

    print(tweet_data)
    print("Realtime STREAMING API exec time for Instance is " + str(time.time() - start_time))
    return tweet_data
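The parsing loop above implies that the Cassandra `data` column packs counts as colon-separated `word,count` pairs with a special `OVERALL_CNT` entry; a toy illustration of that assumed format (the words and numbers are invented):

# Illustrative only: the packed-string format is inferred from the parsing loop above.
data = "python,5:kafka,3:OVERALL_CNT,8"
tweets, words = 0, []
for pair in data.split(":"):
    word, count = pair.split(",")
    if word == "OVERALL_CNT":
        tweets += int(count)
    else:
        words.append({"word": word, "count": int(count)})
# tweets == 8, words == [{'word': 'python', 'count': 5}, {'word': 'kafka', 'count': 3}]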
def run(self, options=None):
    # try:
    # Create table if it doesn't exist in the database
    if self.REDSHIFT.if_table_exists(self.TABLE_NAME) is False:
        self.REDSHIFT.execute(self.CREATE_TRACKING_TABLE)

    kafka = KafkaClient(config.get("kafka.host1") + "," + config.get("kafka.host2"))
    consumer = SimpleConsumer(kafka, self.GROUP_NAME, self.KAFKA_TOPIC,
                              fetch_size_bytes=3000000,
                              buffer_size=2000000000,
                              max_buffer_size=2000000000)

    while True:
        # Prepare data for insert and copy to S3
        data_str = StringIO()
        csv_str = StringIO()
        count = 0

        # Get offset from previous read
        s3_last_offset = self.get_s3_offset()
        last_offset = self.REDSHIFT.select(self.GET_OFFSET_QUERY)[0][0]
        last_offset = last_offset if last_offset else 0

        # Resolve difference in offset (s3 offset does not carry over from day to day)
        if s3_last_offset > last_offset:
            last_offset = s3_last_offset
            self.REDSHIFT.execute(self.UPDATE_OFFSET_QUERY % (self.GROUP_NAME, self.PARTITION, last_offset))

        print(last_offset)

        # Read from offset
        consumer.seek(last_offset, 0)
        for message in consumer.get_messages(count=self.BATCH_SIZE, block=False, timeout=5):
            # Write tweets to StringIO
            self.write_to_data_str(message, data_str, csv_str)
            count += 1
            last_offset += 1

        # Store batch tweets to S3
        self.write_to_s3(data_str, csv_str, last_offset)

        # Track Kafka offset
        self.REDSHIFT.execute(self.UPDATE_OFFSET_QUERY % (self.GROUP_NAME, self.PARTITION, last_offset))

        if count != self.BATCH_SIZE:
            break
def get_and_load_words_to_s3(self, i):
    data_str = StringIO()
    # ordering = ["wid", "words", "latitude", "longitude", "count", "created_at"]
    timestamp = datetime.utcnow().strftime('%y') + datetime.utcnow().strftime('%m') + datetime.utcnow().strftime('%d') + datetime.utcnow().strftime('%H')

    tweeted_words = self.CASSANDRA.execute(self.QUERY_TODAYS_TWEET % timestamp)
    # tweeted_words = self.CASSANDRA.execute(self.QUERY_TODAYS_TWEET % '15013006')

    for (yymmddhh, location, string, data, timestamp) in tweeted_words:
        loc = location.split(",")
        lat = float(loc[0])
        lng = float(loc[1])
        # word_count_pairs = [pair for pair in string.split(":")]
        for word, count in data.iteritems():
            i += 1
            row_arr = []
            word = word.encode("utf-8")
            if word == "OVERALL_CNT":
                continue
            row_arr = [unicode(i), unicode(word), unicode(lat), unicode(lng),
                       unicode(count), unicode(datetime.utcnow())]
            data_str.write('\007'.join(row_arr).encode('utf-8') + '\n')

    # Copy data for load to S3
    s3_connection = S3Connection(config.get('S3.access_key'), config.get('S3.secret'))
    bucket = s3_connection.get_bucket(config.get('S3.bucket'), validate=False)
    s3_file = Key(bucket)
    s3_file.key = self.S3_KEY
    data_str.seek(0)
    s3_file.set_contents_from_file(data_str)
def test_s3(self, options=None):
    s3_key = "eventbrite/" + self.TABLE_NAME + "_" + str(calendar.timegm(datetime.utcnow().timetuple())) + ".txt"
    s3_connection = S3Connection(config.get('S3.access_key'), config.get('S3.secret'))
    bucket = s3_connection.get_bucket(config.get('S3.bucket'), validate=False)
    line_buff = StringIO()
    line_buff.write("Hello World MORE AND MORE STUFF")
    s3_file = Key(bucket)
    s3_file.key = s3_key
    line_buff.seek(0)
    s3_file.set_contents_from_file(line_buff)
def fetch_photos_for_conference(self, word_data, last_id):
    bag_of_words = []
    photo_data = []
    flickr = FlickrRest(rest_endpoint=config.get("flickr.endpoint_rest"),
                        api_key=config.get("flickr.api_key"))
    yymmddhh = datetime.utcnow().strftime('%y') + datetime.utcnow().strftime('%m') + datetime.utcnow().strftime('%d') + datetime.utcnow().strftime('%H')

    db_data = self.REDSHIFT.select(self.QUERY_RS_EVENTS)
    for row in db_data:
        if row["latitude"] is None and row["longitude"] is None:
            continue
        location = ("%.2f" % row["latitude"]) + "," + ("%.2f" % row["longitude"])
        if location not in word_data:
            continue
        for word, count in word_data[location].iteritems():
            if word == "OVERALL_CNT" or word in bag_of_words:
                continue
            if len(word) == 0:
                continue
            for url in flickr.get_photos_by_keyword(word):
                # Increment last_id
                last_id += 1
                data = {}
                data["pid"] = last_id
                data["yymmddhh"] = yymmddhh
                data["word"] = word
                data["url"] = url
                photo_data.append(data)
            bag_of_words.append(word)
    return photo_data
def load_data_to_s3(self, data):
    # Prepare data for insert and copy to S3
    data_str = StringIO()
    ordering = ["eb_id", "url", "logo_url", "event_name", "event_type", "start_time_utc",
                "end_time_utc", "ev_created_at", "ev_updated_at", "capacity", "online_event",
                "venue_id", "venue_name", "latitude", "longitude", "category", "created_at"]

    # Get all data from all given categories
    for key in data:
        for event in data[key]:
            event_data = {}
            event_data["eb_id"] = event[unicode("id")].encode("utf-8") if event[unicode("id")] else 0
            event_data["url"] = event[unicode("url")].encode("utf-8") if event[unicode("url")] else None
            event_data["logo_url"] = event[unicode("logo_url")].encode("utf-8") if event[unicode("logo_url")] else None
            event_data["event_name"] = (event[unicode("name")][unicode("text")].strip().encode("utf-8")
                                        if event[unicode("name")] and event[unicode("name")][unicode("text")] else None)
            event_data["event_type"] = (event[unicode("format")][unicode("name_localized")].encode("utf-8")
                                        if event[unicode("format")] and unicode("name_localized") in event[unicode("format")] else None)
            event_data["start_time_utc"] = datetime.strptime(event[unicode("start")][unicode("utc")], '%Y-%m-%dT%H:%M:%SZ').strftime('%Y-%m-%d %H:%M:%S')
            event_data["end_time_utc"] = datetime.strptime(event[unicode("end")][unicode("utc")], '%Y-%m-%dT%H:%M:%SZ').strftime('%Y-%m-%d %H:%M:%S')
            event_data["ev_created_at"] = datetime.strptime(event[unicode("created")], '%Y-%m-%dT%H:%M:%SZ').strftime('%Y-%m-%d %H:%M:%S')
            event_data["ev_updated_at"] = datetime.strptime(event[unicode("changed")], '%Y-%m-%dT%H:%M:%SZ').strftime('%Y-%m-%d %H:%M:%S')
            event_data["capacity"] = int(event[unicode("capacity")]) if event[unicode("capacity")] else 0
            event_data["online_event"] = event[unicode("online_event")].encode("utf-8") if event[unicode("online_event")] else False
            event_data["venue_id"] = int(event[unicode("venue_id")]) if event[unicode("venue_id")] else -1
            event_data["venue_name"] = (event[unicode("venue")][unicode("name")].encode("utf-8")
                                        if event[unicode("venue")] and unicode("name") in event[unicode("venue")] and event[unicode("venue")][unicode("name")] else None)
            event_data["latitude"] = (float(event[unicode("venue")][unicode("latitude")])
                                      if event[unicode("venue")] and unicode("latitude") in event[unicode("venue")] else 0)
            event_data["longitude"] = (float(event[unicode("venue")][unicode("longitude")])
                                       if event[unicode("venue")] and unicode("longitude") in event[unicode("venue")] else 0)
            event_data["category"] = key.encode("utf-8")
            event_data["created_at"] = datetime.utcnow()

            row_arr = []
            for field in ordering:
                val = event_data[field] if event_data[field] else None
                if val is None:
                    row_arr.append('\N')
                else:
                    row_arr.append(unicode(val))
            data_str.write('\007'.join(row_arr).encode('utf-8') + '\n')

    # Copy data for load to S3
    s3_connection = S3Connection(config.get('S3.access_key'), config.get('S3.secret'))
    bucket = s3_connection.get_bucket(config.get('S3.bucket'), validate=False)
    s3_file = Key(bucket)
    s3_file.key = self.S3_KEY
    data_str.seek(0)
    s3_file.set_contents_from_file(data_str)
def write_to_s3(self, data_str, csv_str, last_offset):
    mmddyy = datetime.utcnow().strftime('%m') + datetime.utcnow().strftime('%d') + datetime.utcnow().strftime('%y')

    # Copy data for load to S3
    s3_connection = S3Connection(config.get('S3.access_key'), config.get('S3.secret'))
    bucket = s3_connection.get_bucket(config.get('S3.bucket'), validate=False)
    s3_file = Key(bucket)

    s3_file.key = self.KAFKA_TOPIC + "/" + mmddyy + "/" + self.S3_KEY + "_offset_" + str(last_offset) + "_" + str(calendar.timegm(datetime.utcnow().timetuple())) + ".txt"
    data_str.seek(0)
    s3_file.set_contents_from_file(data_str)

    s3_file.key = self.KAFKA_TOPIC + "/" + mmddyy + "/" + self.S3_KEY + "_offset_" + str(last_offset) + "_" + str(calendar.timegm(datetime.utcnow().timetuple())) + ".csv"
    csv_str.seek(0)
    s3_file.set_contents_from_file(csv_str)

    s3_connection.close()
def sendmail(mailto, subject, message, subtype='html', charset='utf-8',
             smtpconfig=None, **headers):
    '''
    Send an email to the given address. Additional SMTP headers may be
    specified as keyword arguments.
    '''
    if not smtpconfig:
        # we support both smtp and mail for legacy reasons
        # smtp is the correct usage.
        smtpconfig = config.get('smtp') or config.get('mail')

    # mailto arg is explicit to ensure that it's always set, but it's processed
    # mostly the same way as all other headers
    headers['To'] = _string_or_list(mailto)

    msg = MIMEMultipart('alternative')
    msg['Subject'] = subject
    for key, value in six.iteritems(headers):
        for val in _string_or_list(value):
            msg.add_header(key, val)
    text = MIMEText(message, subtype, charset)
    msg.attach(text)

    if 'From' not in msg:
        msg['From'] = smtpconfig.get('from')
    mailfrom = msg['From']
    assert isinstance(mailfrom, six.string_types)

    recipients = []
    for toheader in ('To', 'CC', 'BCC'):
        recipients += msg.get_all(toheader, [])
    if 'BCC' in msg:
        del msg['BCC']

    smtp = smtplib.SMTP(smtpconfig.get('host'), smtpconfig.get('port'))
    if smtpconfig.get('username', None) is not None and smtpconfig.get('password', None) is not None:
        smtp.login(smtpconfig.get('username'), smtpconfig.get('password'))
    smtp.sendmail(mailfrom, recipients, msg.as_string())
    smtp.quit()
    log.info('Sent email to %s (Subject: %s)', recipients, subject)
def serve_web():
    parse_command_line()
    logger.info('App starting up')
    app = make_app()
    app.listen(config.get('server.port'))
    ioloop.IOLoop.current().start()
def test_ts(self):
    kafka = KafkaClient(config.get("kafka.url"))
    producer = SimpleProducer(kafka)
    api = TwitterAPI(consumer_key=config.get("twitter.consumer_key"),
                     consumer_secret=config.get("twitter.consumer_secret"),
                     access_token_key=config.get("twitter.access_token"),
                     access_token_secret=config.get("twitter.access_token_secret"))
    tweeter_stream = api.request('statuses/filter', {'locations': '-122.75,36.8,-121.75,37.8,-74,40,-73,41'})

    # Stream data
    for tweet in tweeter_stream:
        producer.send_messages("test", json.dumps(tweet))
        break
def get_todays_tweet(self):
    word_data, sorted_by_loc = {}, []
    timestamp = datetime.utcnow().strftime('%y') + datetime.utcnow().strftime('%m') + datetime.utcnow().strftime('%d') + datetime.utcnow().strftime('%H')
    cassandra = Cluster([config.get("cassandra.host1"),
                         config.get("cassandra.host2"),
                         config.get("cassandra.host3")]).connect('insight')

    tweeted_words = cassandra.execute(self.QUERY_TODAYS_TWEET % timestamp)
    # tweeted_words = self.CASSANDRA.execute(self.QUERY_TODAYS_TWEET % '15020221')

    # for (yymmddhh, timestamp, data, lat, lng) in tweeted_words:
    for (yymmddhh, location, string, data, timestamp) in tweeted_words:
        if location not in word_data:
            word_data[location] = {}
        for word, count in data.iteritems():
            word = word.encode("utf-8")
            if word not in word_data[location]:
                word_data[location][word] = count
            else:
                word_data[location][word] += count

    for location in word_data:
        # Sort by word count in descending order
        sorted_by_loc = sorted(word_data[location].items(), key=operator.itemgetter(1))
        sorted_by_loc.reverse()

        # Re-enter sorted data
        word_data[location] = {}
        for word_pair in sorted_by_loc:
            word = word_pair[0]
            count = word_pair[1]
            word_data[location][word] = count
    return word_data
def get_s3_offset(self):
    # Get s3 bucket
    s3_connection = S3Connection(config.get('S3.access_key'), config.get('S3.secret'))
    s3_bucket = s3_connection.get_bucket(config.get('S3.bucket'), validate=False)

    mmddyy = datetime.utcnow().strftime('%m') + datetime.utcnow().strftime('%d') + datetime.utcnow().strftime('%y')
    s3_key = self.KAFKA_TOPIC + "/" + mmddyy + "/"
    offset = [int(key.name.split("_")[2]) for key in s3_bucket.list(prefix=s3_key) if ".txt" in key.name]
    offset = offset if len(offset) else [0]
    s3_connection.close()
    return max(offset)
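Note that `key.name.split("_")[2]` only yields the offset when `self.S3_KEY` itself contains no underscores, given the key layout produced by `write_to_s3`; a small sketch of that assumption with purely illustrative values:

# Key layout produced by write_to_s3 (topic, date, and offset values illustrative):
#   "<KAFKA_TOPIC>/<mmddyy>/<S3_KEY>_offset_<last_offset>_<epoch>.txt"
name = "tweets/020415/batch_offset_1200_1423072800.txt"
parts = name.split("_")       # ["tweets/020415/batch", "offset", "1200", "1423072800.txt"]
offset = int(parts[2])        # 1200 -- valid only while the base key has no "_" of its own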
def select_s3(self, s3_key, select_sql, batch_size=None):
    s3_connection = S3Connection(config.get('S3.access_key'), config.get('S3.secret'))
    bucket = s3_connection.get_bucket(config.get('S3.bucket'), validate=False)
    fp = StringIO()

    conn = self.get_conn()
    cursor = conn.cursor()
    cursor.execute(select_sql)
    ordering = map(lambda c: c[0], cursor.description)

    row_count = 0
    last_row = []
    for row in cursor:
        row_arr = []
        for val in row:
            if val is None:
                row_arr.append('\N')
            else:
                row_arr.append(unicode(val))
        str_row = (self.COL_DELIMITER.join(row_arr).replace(self.ROW_DELIMITER, '') + self.ROW_DELIMITER).encode('utf-8')
        fp.write(str_row)
        row_count = row_count + 1
        last_row = row
        if batch_size is not None and row_count >= batch_size:
            break

    s3_file = Key(bucket)
    s3_file.key = s3_key
    fp.seek(0)
    s3_file.set_contents_from_file(fp)
    conn.close()

    if batch_size is None:
        return None, None
    else:
        return row_count, dict(zip(ordering, last_row))
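A sketch of how the batched mode of `select_s3` might be driven; `db` is a hypothetical instance of this class, and the table name, resume column, and key naming are invented for illustration:

# Hypothetical driver: export a table to S3 in fixed-size batches, resuming from
# the last row returned by each call.
last_id = 0
batch_no = 0
while True:
    sql = "SELECT * FROM tweets WHERE id > %d ORDER BY id" % last_id
    key = "exports/tweets_batch_%05d.txt" % batch_no
    row_count, last_row = db.select_s3(key, sql, batch_size=50000)
    if not row_count:
        break                        # nothing was exported this round
    last_id = last_row["id"]         # resume point comes from the returned row dict
    batch_no += 1
    if row_count < 50000:
        break                        # short batch: the table is drained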
def create_db():
    connect = None
    try:
        connect = engine.connect()
        connect.execute('CREATE DATABASE IF NOT EXISTS ' + config.get('database.database'))
    finally:
        if connect:
            connect.close()
def drop_db():
    connect = None
    try:
        connect = engine.connect()
        connect.execute('DROP DATABASE IF EXISTS ' + config.get('database.database'))
    finally:
        if connect:
            connect.close()
def get_sentry_client():
    global client
    if client:
        return client
    dsn = config.get("sentry.url", None)
    if not dsn:
        return
    client = raven.Client(dsn=dsn)
    return client
def error_email(**kwargs):
    if kwargs.get('table_name', None) is None or kwargs.get('error', None) is None:
        return
    if config.get('debug.enabled') is False:
        subj_str = "Job Failure - %s" % kwargs.get('table_name')
        mail_str = "Job %s failed with error: %s" % (kwargs.get('table_name'), kwargs.get('error'))
        mail_str = mail_str + ' || Stack trace: %s' % kwargs.get('trace', None)
        mail.sendmail("*****@*****.**", subj_str, mail_str)
def test_sendmail_with_other_smtpconfig(self, mock_SMTP):
    mock_SMTP_instance = mock_SMTP.return_value
    mailto = '*****@*****.**'
    subject = 'This is another subject'
    message = 'This is another message'
    mail.sendmail(mailto, subject, message, smtpconfig=config.get('othersmtp'))
    args, kwargs = mock_SMTP_instance.sendmail.call_args
    from_header = config.get('othersmtp.from')
    self.assertEqual(from_header, args[0])
    self.assertIn(mailto, args[1])
    self.assertIn('To: %s' % mailto, args[2])
    self.assertIn('From: %s' % from_header, args[2])
    self.assertIn('Subject: %s' % subject, args[2])
    self.assertIn('Content-Type: text/html', args[2])
def drop_database(env='development', database_type='write'):
    """Drop the database."""
    os.environ['CLAY_CONFIG'] = './config/%s.yaml' % env
    db = config.get('database')
    db_info = db[database_type][0]
    db_name = db_info['dbname']
    print('Deleting %s database' % db_name)
    os.system('psql -q template1 -c "DROP DATABASE IF EXISTS %s";' % db_name)
def load_data_to_s3(self, stats_data):
    data_str = StringIO()
    ordering = ["tid", "eb_id", "event_name", "event_type", "latitude", "longitude",
                "start_time_utc", "end_time_utc", "tweets", "created_at"]

    for data in stats_data:
        row_arr = []
        for field in ordering:
            val = data[field] if data[field] else None
            if val is None:
                row_arr.append('\N')
            else:
                row_arr.append(unicode(val))
        for data_type in ["word", "count"]:
            for i in xrange(10):
                key = data_type + str(i + 1)
                if key not in data:
                    row_arr.append(unicode('\N'))
                else:
                    row_arr.append(unicode(data[key]))
        data_str.write('\007'.join(row_arr).encode('utf-8') + '\n')

    # Copy data for load to S3
    s3_connection = S3Connection(config.get('S3.access_key'), config.get('S3.secret'))
    bucket = s3_connection.get_bucket(config.get('S3.bucket'), validate=False)
    s3_file = Key(bucket)
    s3_file.key = self.S3_KEY
    data_str.seek(0)
    s3_file.set_contents_from_file(data_str)
def sendmail(mailto, subject, message, subtype='html', charset='utf-8',
             smtpconfig=None, **headers):
    '''
    Send an email to the given address. Additional SMTP headers may be
    specified as keyword arguments.
    '''
    if not smtpconfig:
        # we support both smtp and mail for legacy reasons
        # smtp is the correct usage.
        smtpconfig = config.get('smtp') or config.get('mail')

    # mailto arg is explicit to ensure that it's always set, but it's processed
    # mostly the same way as all other headers
    headers['To'] = _string_or_list(mailto)

    msg = MIMEMultipart('alternative')
    msg['Subject'] = subject
    for key, value in headers.iteritems():
        for val in _string_or_list(value):
            msg.add_header(key, val)
    text = MIMEText(message, subtype, charset)
    msg.attach(text)

    if 'From' not in msg:
        msg['From'] = smtpconfig.get('from')
    mailfrom = msg['From']
    assert isinstance(mailfrom, basestring)

    recipients = []
    for toheader in ('To', 'CC', 'BCC'):
        recipients += msg.get_all(toheader, [])
    if 'BCC' in msg:
        del msg['BCC']

    smtp = smtplib.SMTP(smtpconfig.get('host'), smtpconfig.get('port'))
    if smtpconfig.get('username', None) is not None and smtpconfig.get('password', None) is not None:
        smtp.login(smtpconfig.get('username'), smtpconfig.get('password'))
    smtp.sendmail(mailfrom, recipients, msg.as_string())
    smtp.quit()
    log.info('Sent email to %s (Subject: %s)', recipients, subject)
def devserver():
    if not config.get('debug.enabled', False):
        sys.stderr.write('This server must be run in development mode, set debug.enabled in your config and try again\n')
        return -1

    for modulename in config.get('views'):
        log.debug('Loading views from %s' % modulename)
        __import__(modulename)

    conf = config.get('debug.server')
    log.warning('DEVELOPMENT MODE')
    log.info('Listening on %s:%i' % (conf['host'], conf['port']))

    kwargs = {
        'use_reloader': True,
        'use_debugger': True,
        'use_evalex': True,
        'threaded': False,
        'processes': 1,
    }
    kwargs.update(config.get('debug.werkzeug', {}))
    werkzeug.serving.run_simple(conf['host'], conf['port'], application, **kwargs)
def test(self):
    start_time = time.time()

    # Cassandra initialization
    cluster = Cluster([config.get("cassandra.host1"), config.get("cassandra.host2")])
    session = cluster.connect('insight')

    timestamp = datetime.utcnow().strftime('%y') + datetime.utcnow().strftime('%m') + datetime.utcnow().strftime('%d') + datetime.utcnow().strftime('%H')
    tweeted_words = session.execute(self.QUERY_TWEETS % timestamp)
    tweeted_words = session.execute(self.QUERY_TEST)

    cnt = 0
    for (id, counter) in tweeted_words:
        print(counter)

    print(cnt)
    print("Realtime API exec time for Instance is " + str(time.time() - start_time))
    return "super"
def tweets(ncaaf_or_nfl):
    """Find all tweets relevant to NCAA football games in a given week."""
    sportsdata_key = config.get('sportsdata.key')
    access_token_key = config.get('twitter.access_token_key')
    access_token_secret = config.get('twitter.access_token_secret')
    consumer_key = config.get('twitter.consumer_key')
    consumer_secret = config.get('twitter.consumer_secret')

    if ncaaf_or_nfl not in ('ncaaf', 'nfl'):
        abort(400)
    try:
        request_info = request.json
    except AttributeError:
        abort(400)
    if not request_info:
        abort(400)

    year = request_info.get('year', None)
    week = request_info.get('week', None)
    if not (year and week):
        abort(400)

    htags = sportsdata.sportsdatareq(week, year, sportsdata_key, ncaaf_or_nfl)
    return twitter.fetchsamples(htags, access_token_key, access_token_secret, consumer_key, consumer_secret)
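A hedged example of exercising this handler over HTTP; the host and route prefix are assumptions (the URL rule is not shown above), while the `ncaaf` path segment and the `year`/`week` JSON fields come from the code:

import requests

# Hypothetical URL; only the trailing 'ncaaf' segment and the JSON fields are taken
# from the handler itself.
resp = requests.post(
    'http://localhost:8080/tweets/ncaaf',
    json={'year': 2014, 'week': 5},
)
print(resp.status_code, resp.text)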
def run_migrations_offline():
    """Run migrations in 'offline' mode.

    This configures the context with just a URL and not an Engine, though an
    Engine is acceptable here as well. By skipping the Engine creation we
    don't even need a DBAPI to be available.

    Calls to context.execute() here emit the given string to the script output.
    """
    # This is special to Clay
    url = clay_config.get("database")['sqlalchemy.url']
    context.configure(url=url)
    with context.begin_transaction():
        context.run_migrations()