Example #1
	def get_twitter_media(self, entities, tw_id):
		"""
		Extracts media data from a tweet's entities. Called from the infinite collector loop.
		If a URL points to an Instagram photo (not exposed in 'media'), its data is pushed to the queue:instagram Redis key.
		"""
		if 'media' in entities:
			for item in entities['media']:
				q = 'INSERT INTO media(tweet_id, url) VALUES ("{}", "{}");'.format(tw_id, item['media_url_https'])
				exec_mysql(q, self.mysql)
		if 'urls' in entities:
			for url in entities['urls']:
				if 'instagram.com' in url['expanded_url']:
					self.redis.rpush('queue:instagram', jdumps([tw_id, url['expanded_url']]))
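A minimal usage sketch for the method above; the collector instance and the tweet id are illustrative, and the entities payload mirrors the shape returned by the classic Twitter API:

	# Hypothetical entities dict in the shape the Twitter API returns (illustrative values)
	entities = {
		'media': [{'media_url_https': 'https://pbs.twimg.com/media/abc.jpg'}],
		'urls': [{'expanded_url': 'https://instagram.com/p/xyz/'}],
	}
	collector.get_twitter_media(entities, '662837632225181696')	# collector is an assumed instance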
Example #2
	def run(self):
		"""
		Infinite loop running the Twitter collector.
		Follows the usual error-handling recommendations so that it keeps looping forever.
		"""
		while True:
			try:
				stream = self.tw_api.request('statuses/filter', {'locations':TW_LOCATIONS}).get_iterator()
				for item in stream:
					if 'coordinates' in item and item['coordinates']:
						if self.city_polygon and self.city_polygon.disjoint(Point(item['coordinates']['coordinates'][0],item['coordinates']['coordinates'][1])):
							continue
						q = 'INSERT IGNORE INTO tweets(id, text, lat, lng, tstamp, user, network, iscopy) VALUES ("{}", "{}", {}, {}, "{}", {}, 1, {});'.format(
							item['id_str'], 
							escape_string(item['text'].encode('utf-8', 'replace')),
							item['coordinates']['coordinates'][1],
							item['coordinates']['coordinates'][0],
							datetime.strptime(item['created_at'][4:], '%b %d %H:%M:%S +0000 %Y'),
							item['user']['id_str'],
							int('Instagram' in item['source'])
							)
						exec_mysql(q, self.mysql)
						#if 'Instagram' not in item['source']:
						#	warning(u"Twitter data source: {}".format(item['source']))
						message = {
							'id':item['id_str'], 
							'lat':item['coordinates']['coordinates'][1], 
						'lng':item['coordinates']['coordinates'][0], 
							'tstamp': int(mktime(datetime.strptime(item['created_at'][4:], '%b %d %H:%M:%S +0000 %Y').timetuple())), 
							'network': 1
						}
						self.redis.hmset("message:{}".format(message['id']), message)
						self.redis.expire("message:{}".format(message['id']), int(TIME_SLIDING_WINDOW))
						self.redis.set('statistics:tw_last', datetime.now().strftime('%H:%M:%S %d %b %Y'))
						self.get_twitter_media(item['entities'], item['id_str'])
					elif 'disconnect' in item:
						event = item['disconnect']
						if event['code'] in [2,5,6,7]:
							raise Exception(event['reason'])
						else:
							break
			except TwitterRequestError as e:
				if e.status_code < 500:
					raise
				else:
					error(e)
			except TwitterConnectionError as e:
				error(e)
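The INSERT above is assembled with string formatting plus escape_string. As a hedged alternative, the same insert inside the loop can be written with driver-side parameter binding; exec_mysql's internals are not shown, so a DB-API connection named conn (MySQLdb/PyMySQL style) is assumed here:

	# Sketch: the same insert with parameter binding; conn is an assumed DB-API connection
	q = ('INSERT IGNORE INTO tweets(id, text, lat, lng, tstamp, user, network, iscopy) '
		'VALUES (%s, %s, %s, %s, %s, %s, 1, %s)')
	params = (item['id_str'], item['text'],
		item['coordinates']['coordinates'][1], item['coordinates']['coordinates'][0],
		datetime.strptime(item['created_at'][4:], '%b %d %H:%M:%S +0000 %Y'),
		item['user']['id_str'], int('Instagram' in item['source']))
	cursor = conn.cursor()
	cursor.execute(q, params)	# the driver escapes values, so escape_string is not needed
	conn.commit()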
Example #3
	def backup(self):
		"""
		Dumps the event to MySQL long-term storage; used for events that are not under evaluation.
		"""
		if self.verification is None:
			ver = 'NULL'
		else:
			ver = int(self.verification)
		if self.validity is None:
			val = 'NULL'
		else:
			val = int(self.validity)
		msg_string = self.pack()
		q = '''INSERT INTO events(id, start, end, msgs, description, dumps, verification, validity) VALUES ("{}", "{}", "{}", {}, "{}", "{}", {}, {}) ON DUPLICATE KEY UPDATE `start`=VALUES(`start`), `end`=VALUES(`end`), `msgs`=VALUES(`msgs`), `description`=VALUES(`description`), `dumps`=VALUES(`dumps`), `verification`=VALUES(`verification`), `validity`=VALUES(`validity`);'''.format(self.id, self.start, self.end, len(self.messages.keys()), escape_string(', '.join([x.encode('utf-8') for x in self.cores[2]])), escape_string(msg_string), ver, val)
		exec_mysql(q, self.mysql)
		self.redis.delete("event:{}".format(self.id))
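The NULL-versus-int coercion for verification and validity could be factored into a small helper; a sketch (this helper is not in the original code):

	def sql_nullable_int(value):
		"""Render an optional flag as a SQL literal: 'NULL' when unset, else 0/1."""
		return 'NULL' if value is None else int(value)

	# usage: ver = sql_nullable_int(self.verification); val = sql_nullable_int(self.validity)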
Example #4
	def get_ig_data(self, data, medialist):
		"""
		Parses data collected from the Instagram search API endpoint.
		Looks for coordinates, text, and other attributes of every message.
		Dumps each message to MySQL and to Redis with expiration, and updates statistics.
		The medialist list is used to minimize overlapping between multiple locations.
		"""
		for item in data['data']:
			if item['id'] in medialist:
				continue
			medialist.append(item['id'])
			try:
				text = item['caption']['text']
			except (KeyError, TypeError):
				text = ''
			try:
				lat = item['location']['latitude']
				lng = item['location']['longitude'] 
				user = item['user']['id']
				media_url = item['images']['standard_resolution']['url']
			except (KeyError, TypeError):
				pass
			else:
				if self.city_polygon and self.city_polygon.disjoint(Point(lng,lat)):
					continue
				q = '''INSERT IGNORE INTO tweets(id, text, lat, lng, tstamp, user, network, iscopy) VALUES ("{}", "{}", {}, {}, "{}", {}, 2, 0);'''.format(
					item['id'], 
					escape_string(text.encode('utf-8', 'replace')),
					lat, lng,
					datetime.fromtimestamp(int(item['created_time'])),
					user)
				exec_mysql(q, self.mysql)
				message = {
					'id':item['id'], 
					'lat':lat, 
					'lng':lng, 
					'tstamp': int(mktime(datetime.fromtimestamp(int(item['created_time'])).timetuple())), 
					'network':2
				}
				self.redis.hmset("message:{}".format(message['id']), message)
				self.redis.expire("message:{}".format(message['id']), int(TIME_SLIDING_WINDOW))
				q = 'INSERT IGNORE INTO media(tweet_id, url) VALUES ("{}", "{}");'.format(
					item['id'], media_url)
				exec_mysql(q, self.mysql)
		return medialist
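A minimal usage sketch; the payload below follows the shape of the Instagram search endpoint response, with illustrative values, and collector is an assumed instance:

	data = {'data': [{
		'id': '1234567890_42',
		'caption': {'text': 'sunset'},
		'location': {'latitude': 55.75, 'longitude': 37.61},
		'user': {'id': '42'},
		'images': {'standard_resolution': {'url': 'https://scontent.cdninstagram.com/t/abc.jpg'}},
		'created_time': '1435766400',
	}]}
	medialist = collector.get_ig_data(data, medialist=[])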
Example #5
	def run(self):
		"""
		Infinite loop: waits for a new item in the queue:instagram Redis list, pops it, and processes it.
		Sleeps for 2 seconds, then starts over.
		"""
		while True:
			data = jloads(self.redis.blpop('queue:instagram')[1])
			try:
				url = 'https://api.instagram.com/v1/media/shortcode/{}?access_token={}'.format(data[1].split('/')[4], IG_ACCESS_TOKEN)
				photo_data = get(url, stream=False, timeout=10)
			except (IndexError, ConnectionError, ProtocolError, ReadTimeout, ReadTimeoutError, SSLError, ssl_SSLError, soc_error, SysCallError) as e:
				error(e)
			else:
				if photo_data.ok:
					link = photo_data.json()['data']['images']['standard_resolution']['url']
					q = 'INSERT INTO media(tweet_id, url) VALUES ("{}", "{}");'.format(data[0], link)
					exec_mysql(q, self.mysql)
			sleep(2)
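This loop consumes the items that get_twitter_media (Example #1) pushes; since data[1].split('/')[4] extracts the shortcode, the URL must carry it in the fifth slash-separated segment. A sketch of enqueueing a test item (the jdumps alias and the redis_db connection are assumed to match the ones used elsewhere in this code):

	# Enqueue a test item; split('/')[4] later yields 'xyz' as the media shortcode
	from json import dumps as jdumps	# assumed to match the jdumps alias above
	redis_db.rpush('queue:instagram', jdumps(['662837632225181696', 'https://instagram.com/p/xyz/']))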
Example #6
	def restore(self, event_id):
		"""
		Restores an event from the MySQL events table using the event_id parameter.

		Args:
			event_id (str): unique event identifier
		"""
		q = '''SELECT dumps FROM events WHERE id="{}"'''.format(event_id)
		event_data = exec_mysql(q, self.mysql)[0][0]['dumps']
		self.unpack(event_data)
Example #7
	def push_msg(self, message):
		"""
		Processes every single message as if it came from a real Collector.
		Currently each message is:
		(1) sent to Redis with a lifetime of 1 hour (TIME_SLIDING_WINDOW from settings);
		(2) dumped into the MySQL data table.
		"""
		redis_message = {
			'id':message['id'], 
			'lat':message['lat'], 
			'lng': message['lng'], 
			'tstamp': int(mktime(message['tstamp'].timetuple())), 
			'network': message['network']
		}
		self.redis.hmset("message:{}".format(message['id']), redis_message)
		self.redis.expire("message:{}".format(message['id']), int(TIME_SLIDING_WINDOW/self.fast_forward_ratio))
		message['text'] = escape_string(message['text'])
		q = 'INSERT IGNORE INTO tweets(id, text, lat, lng, tstamp, user, network, iscopy) VALUES ("{id}", "{text}", {lat}, {lng}, "{tstamp}", {user}, "{network}", {iscopy});'.format(**message)
		exec_mysql(q, self.mysql)
		self.log_process(message)
Example #8
def callback_dispatcher(bot, update):
	global CONTEXT
	global redis_con
	query = update.callback_query
	user = str(query.from_user.id)
	verific_dict = {'teach.real':1, 'teach.fake':0}
	if user in CONTEXT:
		if CONTEXT[user]['command'] == 'teach':
			q = 'SELECT dumps FROM events WHERE id = "{}";'.format(CONTEXT[user]['event'].id)
			data = unpackb(exec_mysql(q, mysql_con)[0][0]['dumps'])
			if query.data in ['teach.real', 'teach.fake']:
				data['verification'] = int(verific_dict[query.data])
				data = packb(data)
				q = '''UPDATE events SET dumps = "{}", verification = {} WHERE id = "{}";'''.format(escape_string(data), verific_dict[query.data], CONTEXT[user]['event'].id)
				exec_mysql(q, mysql_con)
				del CONTEXT[user]['event']
				del CONTEXT[user]['event_limits']
				bot.answerCallbackQuery(query.id, text=TEXTS['ok'])
				publish_event(bot, user)
			elif query.data == 'teach.prev_msgs':
				publish_event(bot, user, CONTEXT[user]['event_limits'][0]-1, False)
			elif query.data == 'teach.next_msgs':
				publish_event(bot, user, CONTEXT[user]['event_limits'][1]+1, True)
			elif query.data == 'teach.finish':
				bot.editMessageText(text=TEXTS['teach.finish'], chat_id=CONTEXT[user]['chat'], message_id=CONTEXT[user]['message'])
				del CONTEXT[user]
			else:
				bot.answerCallbackQuery(query.id, text=TEXTS['unknown.command'])
		elif CONTEXT[user]['command'] == 'adduser':
			if query.data == 'adduser.cancel':
				del CONTEXT[user]
			elif query.data in ('adduser.admin', 'adduser.tester'):
				group = query.data[8:]+'s'
				redis_con.sadd(group, CONTEXT[user]['userid'])
				bot.editMessageText(text=TEXTS['contact.added'].format(CONTEXT[user]['username'], group), chat_id=CONTEXT[user]['chat'], message_id=CONTEXT[user]['message'])
			else:
				bot.answerCallbackQuery(query.id, text=TEXTS['unknown.command'])
		else:
			bot.answerCallbackQuery(query.id, text=TEXTS['unknown.command'])
	else:
		bot.answerCallbackQuery(query.id, text=TEXTS['no.context'])
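The (bot, update) signature matches pre-v12 python-telegram-bot callback handlers; a sketch of wiring the dispatcher, assuming that library (the token is a placeholder):

	from telegram.ext import Updater, CallbackQueryHandler
	updater = Updater(token='BOT_TOKEN')
	updater.dispatcher.add_handler(CallbackQueryHandler(callback_dispatcher))
	updater.start_polling()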
Example #9
	def get_reference_data(self, time):
		"""
		Loads historical reference data from MySQL.
		Returns a list of MySQL result rows.
		Args:
			time (datetime): timestamp of interest; 90% of the window is taken from the past and 10% from the future
		"""
		lower_bound = time - timedelta(seconds = TIME_SLIDING_WINDOW * 0.9)
		upper_bound = time + timedelta(seconds = TIME_SLIDING_WINDOW * 0.1)
		lower_bound = lower_bound.time().second + lower_bound.time().minute * 60 + lower_bound.time().hour * 3600
		upper_bound = upper_bound.time().second + upper_bound.time().minute * 60 + upper_bound.time().hour * 3600

		if lower_bound < upper_bound:
			q = '''SELECT tstamp, lat, lng, network FROM ref_data WHERE `second` BETWEEN {} AND {};'''.format(lower_bound, upper_bound)
			data = exec_mysql(q, self.mysql)[0]
		else:
			q = '''SELECT tstamp, lat, lng, network FROM ref_data WHERE `second` BETWEEN {} AND 86400;'''.format(lower_bound)
			data = exec_mysql(q, self.mysql)[0]
			q = '''SELECT DATE_ADD(tstamp,INTERVAL -1 DAY) AS tstamp, lat, lng, network FROM ref_data WHERE `second` BETWEEN 0 AND {};'''.format(upper_bound)
			data += exec_mysql(q, self.mysql)[0]
		return data
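A worked example of the midnight wraparound, assuming TIME_SLIDING_WINDOW = 3600 (one hour):

	from datetime import datetime, timedelta
	TIME_SLIDING_WINDOW = 3600	# assumed value for illustration
	t = datetime(2015, 7, 1, 0, 30)
	lb = t - timedelta(seconds=TIME_SLIDING_WINDOW * 0.9)	# 23:36:00 -> 84960 s
	ub = t + timedelta(seconds=TIME_SLIDING_WINDOW * 0.1)	# 00:36:00 -> 2160 s
	# 84960 > 2160, so the window crosses midnight and the second branch stitches
	# together [84960, 86400] from one day and [0, 2160] from the next.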
Example #10
	def get_media_data(self, ids=None):
		"""
		Loads MySQL data for media using existing message ids and adds it to the self.media attribute.

		Args:
			ids (List[str]): list of message ids to load. If not provided, all ids from self.messages are used.
		"""
		if not ids:
			ids = [x['id'] for x in self.messages.values()]
		q = '''SELECT * FROM media WHERE tweet_id in ({});'''.format(','.join(['"'+str(x)+'"' for x in ids]))
		data = exec_mysql(q, self.mysql)[0]
		for item in data:
			self.media[item['id']] = item
Example #11
	def daily_maintenance(self):
		"""
		Daily maintenance: retrains the event classifier and rebuilds the ref_data reference table.
		"""
		# Updating classifier
		self.classifier = build_event_classifier(classifier_type="adaboost", balanced=True)

		# Creating new reference data table
		exec_mysql('TRUNCATE ref_data;', self.mysql)

		if self.use_real_reference:
			exec_mysql('''INSERT INTO ref_data SELECT lat, lng, network, DATE(tstamp) as tstamp, TIME_TO_SEC(TIME(tstamp)) AS `second` FROM tweets WHERE DATE(tstamp) BETWEEN '{}' AND '{}' ORDER BY `second` ASC;'''.format((datetime.now() - timedelta(days=self.ref_days)).strftime('%Y-%m-%d'), (datetime.now() - timedelta(days=1)).strftime('%Y-%m-%d')), self.mysql)
		else:
			d = exec_mysql('SELECT tstamp FROM tweets_origins ORDER BY tstamp DESC LIMIT 1;', self.mysql)
			max_date = d[0][0]['tstamp'] - timedelta(days=1)
			exec_mysql('''INSERT INTO ref_data SELECT lat, lng, network, DATE(tstamp) as tstamp, TIME_TO_SEC(TIME(tstamp)) AS `second` FROM tweets_origins WHERE DATE(tstamp) BETWEEN '{}' AND '{}' ORDER BY `second` ASC;'''.format((max_date - timedelta(days=self.ref_days-1)).strftime('%Y-%m-%d'), max_date.strftime('%Y-%m-%d')), self.mysql)
		self.last_maintenance = datetime.now()
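The INSERT ... SELECT above implies a particular ref_data layout; a sketch of the schema inferred from the selected column list (the actual DDL is not shown in the source, so this is an assumption):

	# Assumed ref_data schema, inferred from the INSERT ... SELECT column list (not in the source)
	exec_mysql('''CREATE TABLE IF NOT EXISTS ref_data (
		lat FLOAT,
		lng FLOAT,
		network TINYINT,
		tstamp DATE,
		`second` INT,
		KEY (`second`)
	);''', mysql_con)	# mysql_con is an assumed connection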
Example #12
	def __init__(self, mysql_con, redis_con, dataset=None, fast_forward_ratio = 1, start_timeout = 10, run_on_init = True, truncate_on_init = True):
		"""
		dataset (List[Dict]): tweets database dump, collected using PySQLPool. Available fields:
			id, text, lat, lng, tstamp, user, network, iscopy
		fast_forward_ratio (float): emulation speed-up factor (e.g. 2 replays the dump twice as fast)
		start_timeout (int): seconds to wait before the first message is published
		run_on_init (bool): start the emulation immediately after construction
		truncate_on_init (bool): clear MySQL tables and Redis keys before the emulation starts
		"""
		self.redis = redis_con
		self.mysql = mysql_con

		# Loading default dataset
		if not dataset:
			if truncate_on_init and not run_on_init:
				q = '''SELECT * FROM tweets_origins LIMIT 1;'''
			else:
				q = '''SELECT * FROM tweets_origins WHERE tstamp >= '2015-07-01 12:00:00' AND tstamp <= '2015-07-02 12:00:00';'''
			dataset = exec_mysql(q, self.mysql)[0]

		# Recalculating publish timestamp for messages, according to current time
		self.fast_forward_ratio = fast_forward_ratio
		self.raw_data = sorted(list(dataset), key=lambda x: x['tstamp'], reverse=False)
		self.old_init = self.raw_data[0]['tstamp']
		self.new_init = datetime.now() + timedelta(seconds = start_timeout)
		for i in range(len(self.raw_data)):
			seconds2add = (self.raw_data[i]['tstamp'] - self.old_init).total_seconds()/fast_forward_ratio
			self.raw_data[i]['pub_tstamp'] = self.new_init + timedelta(seconds = seconds2add)
	
		# These vars are required for logging and writing status updates
		self.new_end = self.raw_data[-1]['pub_tstamp']
		self.duration = (self.new_end - self.new_init).total_seconds()
		self.total_msgs = len(self.raw_data)
		self.i = 0
		self.rotator = ('\\', '|', '/', '-')
		self.previous_out = ''

		# Starting emulation (turned on by default)
		if truncate_on_init:
			self.truncate_db()
		if run_on_init:
			self.run()
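A worked example of the publish-timestamp recalculation above:

	from datetime import datetime, timedelta
	fast_forward_ratio, start_timeout = 2, 10	# illustration values
	new_init = datetime.now() + timedelta(seconds=start_timeout)
	seconds2add = 60 / fast_forward_ratio	# message sent 60 s into the original dump
	pub_tstamp = new_init + timedelta(seconds=seconds2add)	# published 40 s from now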
Example #13
	def truncate_db(self):
		"""
		Clears the database before the emulation starts.
		Truncates the tweets, events, and event_msgs tables in MySQL;
		deletes all "message:*", "event:*", and "dumped:*" keys from Redis.
		"""
		exec_mysql('TRUNCATE event_msgs;', self.mysql)
		exec_mysql('TRUNCATE events;', self.mysql)
		exec_mysql('TRUNCATE tweets;', self.mysql)
		try:
			self.redis.delete(*self.redis.keys('message:*'))
		except ResponseError:
			pass
		try:
			self.redis.delete(*self.redis.keys('event:*'))
		except ResponseError:
			pass
		try:
			self.redis.delete(*self.redis.keys('dumped:*'))
		except ResponseError:
			pass
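redis-py raises ResponseError when delete() is called with zero keys, which happens here whenever a pattern matches nothing; that is what the try/except blocks absorb. An equivalent guard-based sketch of the same cleanup inside the method:

	# Guard against empty key lists instead of catching ResponseError
	for pattern in ('message:*', 'event:*', 'dumped:*'):
		keys = self.redis.keys(pattern)
		if keys:
			self.redis.delete(*keys)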
Example #14
	if not args.fastforward:
		args.fastforward = 1
	if not args.timeout:
		args.timeout = 0
	if args.action == 'truncate':
		q = '''SELECT * FROM tweets_origins LIMIT 1;'''
	elif not args.period:
		q = '''SELECT * FROM tweets_origins;'''
	elif args.period == "hour":
		q = '''SELECT * FROM tweets_origins WHERE tstamp >= '2015-06-22 19:00:00' AND tstamp <= '2015-06-23 20:00:00';'''
	elif args.period == "day":
		q = '''SELECT * FROM tweets_origins WHERE tstamp >= '2015-06-22 00:00:00' AND tstamp <= '2015-06-23 00:00:00';'''
	elif args.period == "week":
		q = '''SELECT * FROM tweets_origins WHERE tstamp >= '2015-06-29 00:00:00' AND tstamp <= '2015-07-06 00:00:00';'''
	elif args.period == "month":
		q = '''SELECT * FROM tweets_origins WHERE tstamp >= '2015-06-21 00:00:00' AND tstamp <= '2015-07-18 00:00:00';'''
	if args.action:
		from settings import REDIS_HOST, REDIS_PORT, REDIS_DB
		from redis import StrictRedis
		from utilities import get_mysql_con
		redis_db = StrictRedis(host=REDIS_HOST, port=REDIS_PORT, db=REDIS_DB)
		mysql_db = get_mysql_con()
		dataset = exec_mysql(q, mysql_db)[0]
		if args.action == 'run':
			emulator = CollectorEmulator(mysql_db, redis_db, dataset=dataset, fast_forward_ratio=args.fastforward, start_timeout=args.timeout, run_on_init = True, truncate_on_init = False)
		elif args.action == 'runclean':
			emulator = CollectorEmulator(mysql_db, redis_db, dataset=dataset, fast_forward_ratio=args.fastforward, start_timeout=args.timeout, run_on_init = True, truncate_on_init = True)
		elif args.action == 'truncate':
			emulator = CollectorEmulator(mysql_db, redis_db, dataset=dataset, fast_forward_ratio=args.fastforward, start_timeout=args.timeout, run_on_init = False, truncate_on_init = True)
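The args object used above presumably comes from argparse; a minimal parser sketch consistent with the attributes referenced (the actual parser is not shown in the source):

	import argparse
	parser = argparse.ArgumentParser(description='Tweet collector emulator')	# assumed description
	parser.add_argument('--action', choices=['run', 'runclean', 'truncate'])
	parser.add_argument('--period', choices=['hour', 'day', 'week', 'month'])
	parser.add_argument('--fastforward', type=float)
	parser.add_argument('--timeout', type=int)
	args = parser.parse_args()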

Example #15
	def get_media_data(self):
		"""
		Attaches a media URL to every message in self.messages that has one in the media table.
		"""
		q = '''SELECT * FROM media WHERE tweet_id in ({});'''.format(','.join(['"'+str(x)+'"' for x in self.messages.keys()]))
		data = exec_mysql(q, self.mysql)[0]
		for item in data:
			self.messages[item['tweet_id']]['media'] = item['url']
Example #16
	def get_messages_data(self):
		"""
		Loads the full MySQL rows for the ids in self.messages and merges them into the stored messages.
		"""
		q = '''SELECT * FROM tweets WHERE id in ({});'''.format(','.join(['"'+str(x)+'"' for x in self.messages.keys()]))
		data = exec_mysql(q, self.mysql)[0]
		for item in data:
			self.messages[item['id']].update(item)
Example #17
def get_random_event():
	q = 'SELECT * FROM events WHERE verification IS NULL AND description != "" ORDER BY end DESC LIMIT 5;'
	data = exec_mysql(q, mysql_con)[0]
	data = sample(data,1)[0]
	event = EventLight(start=data['start'], end=data['end'], validity=data['validity'], description=data['description'], dump=data['dumps'], mysql_con=mysql_con)
	return event
Example #18
	def get_vk_data(self, data, medialist):
		"""
		Parses data collected from a VK API stored-function endpoint.
		Looks for coordinates, text, and other attributes of every message.
		Dumps each message to MySQL and to Redis with expiration, and updates statistics.
		The medialist list is used to minimize overlapping between multiple locations.
		Stored function:
		---
		var places = API.places.getCheckins({
			"latitude":Args.lat,
			"longitude":Args.lng,
			"count":100,
			"timestamp":Args.from_time
		});

		if(places["items"].length == 0){
			return {"wall":[], "places":places};
		}
		var wallIds = [];
		var i = 0;
		while(i < places["items"].length){
			wallIds = wallIds + [places["items"][i]["id"]];
			i = i + 1;
		}
		var walls = API.wall.getById({"posts": wallIds});
		return {"wall":walls, "places":places};
		---
		"""
		if isinstance(data, bool) or 'response' not in data or 'places' not in data['response'] or 'items' not in data['response']['places']:
			return medialist
		try:
			wall_posts = {'{}_{}'.format(x['from_id'], x['id']): x for x in data['response']['wall']}
		except (KeyError, TypeError):
			# Without the wall posts the check-ins cannot be matched, so skip this batch
			return medialist
		for item in data['response']['places']['items']:
			if item['id'] in medialist or item['id'] not in wall_posts:
				continue
			medialist.append(item['id'])
			lat = None
			lng = None
			if item['latitude'] > 0 and item['longitude'] > 0:
				lat = item['latitude']
				lng = item['longitude']
				iscopy = 0
			elif 'geo' in wall_posts[item['id']]:
				coordinates = wall_posts[item['id']]['geo']['coordinates'].split(' ')
				lat = float(coordinates[0])
				lng = float(coordinates[1])
				iscopy = 1
			else:
				continue
			if 'text' in item:
				text = item['text']
			else:
				text = ''
			if lat and lng:
				if self.city_polygon and self.city_polygon.disjoint(Point(lng,lat)):
					continue
				q = 'INSERT IGNORE INTO tweets(id, text, lat, lng, tstamp, user, network, iscopy) VALUES ("{}", "{}", {}, {}, "{}", {}, 3, {});'.format(
					item['id'], 
					escape_string(text.encode('utf-8', 'replace')),
					lat,
					lng,
					datetime.fromtimestamp(int(item['date'])),
					item['user_id'],
					iscopy
					)
				exec_mysql(q, self.mysql)
				message = {
					'id':item['id'],
					'lat':lat,
					'lng':lng,
					'tstamp': int(mktime(datetime.fromtimestamp(int(item['date'])).timetuple())),
					'network':3, 
					}
				self.redis.hmset("message:{}".format(message['id']), message)
				self.redis.expire("message:{}".format(message['id']), int(TIME_SLIDING_WINDOW))
				if 'attachments' in wall_posts[item['id']] and 'photo' in wall_posts[item['id']]['attachments'][0] and 'photo_807' in wall_posts[item['id']]['attachments'][0]['photo']:
					q = 'INSERT INTO media(tweet_id, url) VALUES ("{}", "{}");'.format(
						item['id'], wall_posts[item['id']]['attachments'][0]['photo']['photo_807'])
					exec_mysql(q, self.mysql)
		return medialist
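The VKScript function in the docstring is a stored procedure invoked through VK's execute mechanism; a hedged sketch of the call (the procedure name getCheckinsWithWalls, the API version, and the exact parameter names are assumptions):

	# Sketch: invoking the stored procedure via VK's execute API; names are illustrative
	from requests import get
	resp = get('https://api.vk.com/method/execute.getCheckinsWithWalls', params={
		'lat': 55.75, 'lng': 37.61, 'from_time': 1435766400,
		'access_token': VK_ACCESS_TOKEN,	# assumed settings constant
		'v': '5.52',
	}, timeout=10)
	if resp.ok:
		medialist = collector.get_vk_data(resp.json(), [])	# collector is an assumed instance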