def detect_gender(tweet):
    """
    Detect the gender of the tweet's author.

    :param tweet: the tweet object
    :return: nothing, the tweet object will be updated in place
    """
    orig_name = tweet['user']['name']
    # remove non-ASCII characters before classification
    name = re.sub(r'[^\x00-\x7f]', r'', orig_name)
    gender = 'NA'
    first_name = ''  # default, so the debug log below never hits an unbound name
    try:
        first_name = extract_first_name(name)
        global gp
        gender = gp.classify(first_name)
    except IndexError:
        # the first name is most likely empty here, probably because
        # the original name contained only non-standard characters
        pass
    except Exception:
        log.exception("Unable to detect gender based on first name")
    tweet['user']['gender'] = gender
    log.debug(f"Tweet[{tweet['id']}] Name: {orig_name} - "
              f"First name: {first_name} - Gender: {gender}")
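# `extract_first_name` and `gp` are defined elsewhere; `gp` is assumed to be a
# pre-loaded classifier exposing classify(). A minimal sketch of what
# `extract_first_name` is assumed to do (hypothetical implementation; the
# IndexError it can raise is the one handled above):
def extract_first_name(name):
    # first whitespace-separated token; raises IndexError on an empty name
    return name.split()[0]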
def on_exception(self, exception):
    log.exception(f"Exception from Twitter streaming: {exception}")
    # fetch the stream once so the check and the restart see the same object
    stream = self._stream()
    if stream and not stream.running:
        # TODO: fix this issue properly
        time.sleep(2)
        stream.start()
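# The fixed time.sleep(2) above is a stop-gap (see the TODO); a hedged sketch
# of an exponential-backoff restart as one alternative (assumed helper, not
# the project's current behaviour; relies only on the stream's own
# running/start() contract used above):
def _restart_with_backoff(stream, max_attempts=5):
    delay = 2
    for _ in range(max_attempts):
        time.sleep(delay)
        stream.start()
        if stream.running:
            return True
        delay *= 2  # double the wait before the next attempt
    return False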
def run(self):
    interval = 60  # seconds
    while not self.is_stopped():
        try:
            free = _get_percent_free_space()
            log.info(f"Disk usage: {free}% free")
            threshold = config.DISK_FREE_THRESHOLD
            if free < threshold:
                log.warn(f"Disk space left {free:.2f}% is below threshold of {threshold:.2f}%")
                # first pass: drop data older than a week
                clean_old_data(days=7)
                free = _get_percent_free_space()
                # if that was not enough, force-remove old tweets in batches
                while free < threshold:
                    log.warn(f"Disk space left {free:.2f}% is below threshold of {threshold:.2f}%")
                    force_clean_old_tweets(count=1000)
                    free = _get_percent_free_space()
                log.info(f"Finished cleaning disk space [{free}% free]")
        except Exception:
            log.exception("Error during disk usage monitoring")
        finally:
            if self.is_stopped():
                return
            self.sleep(interval)
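# `_get_percent_free_space` is referenced above but not shown; a minimal
# sketch using the standard library (assumed behaviour: returns the
# percentage of free disk space as a float):
import shutil

def _get_percent_free_space(path='/'):
    usage = shutil.disk_usage(path)
    return usage.free / usage.total * 100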
def start_worker():
    q = connect_to_message_queue()
    log.info("Started listening for messages")
    while True:
        try:
            tweet = q.pop()
            process(tweet)
        except Exception:
            log.exception("Error during execution")
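# The message queue is assumed to expose pop()/push() (push() is used by
# on_data below). A minimal Redis-backed sketch of that contract
# (hypothetical names and backend, not the project's actual implementation):
import json
import redis

class TweetQueue:
    def __init__(self, name='tweets', host='localhost', port=6379):
        self._redis = redis.Redis(host=host, port=port)
        self._name = name

    def push(self, tweet):
        # serialise the tweet dict and append it to the list
        self._redis.rpush(self._name, json.dumps(tweet))

    def pop(self):
        # block until a tweet is available, then deserialise it
        _, raw = self._redis.blpop(self._name)
        return json.loads(raw)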
def run(self):
    # start watching for changes in a different thread
    from threading import Thread
    worker = Thread(target=self._watch)
    worker.start()

    # wait until being stopped
    self.sleep()

    # stop the MongoDB change stream and clean up
    try:
        if self._cursor:
            self._cursor.close()
        worker.join()
    except Exception:
        log.exception("Error while stopping the thread")
def tweets():
    """
    Return processed tweets from the database, newest first.

    The optional `limit` query parameter caps the number of results
    returned (0 means no limit).
    """
    try:
        limit = int(request.args.get('limit', 0))
        result = db.result.find()\
            .sort('$natural', pymongo.DESCENDING)\
            .limit(limit)
        return jsonify(result=[
            json.loads(json.dumps(item, indent=4, default=json_util.default))
            for item in result
        ])
    except Exception:
        log.exception("Unable to retrieve tweets")
        return Response("Unable to retrieve tweets", status=500)
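# The json.dumps(..., default=json_util.default) round-trip above converts
# BSON-specific types (ObjectId, datetime) into JSON-safe values before
# jsonify(). The same idea as a small standalone helper (hypothetical name,
# not used elsewhere in this module):
def to_json_safe(doc):
    return json.loads(json.dumps(doc, default=json_util.default))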
def search_tweets():
    try:
        req = request.get_json(force=True)
        keyword = req.get('keyword')
        limit = int(req.get('limit', 1000))
        if not keyword:
            return Response("Keyword is not specified", status=400)
        api = _get_api()
        result = (process(tweet._json) for tweet in tweepy.Cursor(
            api.search, q=keyword, count=limit).items(limit))
        return jsonify(result=[
            json.loads(json.dumps(item, indent=4, default=json_util.default))
            for item in result
        ])
    except Exception:
        log.exception("Unable to search tweets")
        return Response("Unable to search tweets", status=500)
def on_data(self, raw_data):
    try:
        data = json.loads(raw_data)
        # the data is not always a tweet; it can also be a message from
        # the Twitter system itself
        if 'limit' in data:
            return self.on_limit(data)
        elif 'text' not in data:
            log.warn(f"Unknown message type: {data}")
            # TODO: what to do with unknown messages?
            db.unknown.insert(data)
            return
        if config.OPERATION_MODE == 'normal':
            process(data)
        elif config.OPERATION_MODE == 'mq':
            QUEUE.push(data)
    except Exception:
        log.exception("Exception while processing tweet")
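# `on_limit` handles Twitter's rate-limit notices, which arrive as
# {"limit": {"track": <number of undelivered tweets>}}; a minimal sketch of
# what it is assumed to do (hypothetical body, only logging the count):
def on_limit(self, data):
    track = data.get('limit', {}).get('track')
    log.warn(f"Rate limited by Twitter: {track} tweets were not delivered")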
def filter_tweets_by_date():
    """
    Retrieve tweets between a start and an end date.
    """
    try:
        req = request.get_json(force=True)
        start_date = req.get('start_date')
        end_date = req.get('end_date')
        try:
            # ensure that the specified time is interpreted as UTC +0000,
            # then convert to milliseconds to match `timestamp_ms`
            date_format = '%d/%m/%Y %z'
            start_date = datetime.strptime(start_date + " +0000",
                                           date_format).timestamp() * 1000
            end_date = datetime.strptime(end_date + " +0000",
                                         date_format).timestamp() * 1000
            if start_date >= end_date:
                raise Exception(
                    "End date is equal to or earlier than start date")
        except Exception:
            log.exception("Date(s) are invalid")
            return Response("Date(s) are invalid", status=400)

        result = db.result.find(
            {'timestamp_ms': {
                '$gte': start_date,
                '$lt': end_date
            }})
        return jsonify(result=[
            json.loads(json.dumps(item, indent=4, default=json_util.default))
            for item in result
        ])
    except Exception:
        log.exception("Unable to filter tweets")
        return Response("Unable to filter tweets", status=500)
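# Example request body accepted by this endpoint (dates use the
# day/month/year '%d/%m/%Y' format and are interpreted as UTC;
# the values shown are illustrative):
# {"start_date": "01/01/2020", "end_date": "31/01/2020"}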
def init():
    """
    Initialize the sentiment analyser.

    This method must be called before any usage of this module.
    """
    try:
        log.info("Loading pre-trained classifier for sentiment analysis")
        load_classifier()
    except FileNotFoundError:
        log.info("No pre-trained classifier found. Will train a new one")
        train_and_test_classifier()
        try:
            load_classifier()
        except Exception:
            log.exception(
                "Unable to load the newly trained classifier. "
                "Sentiment analysis will not work")
            return
    except Exception:
        log.exception(
            "Unable to load the classifier. Sentiment analysis will not work")
        return
    log.info("Loaded pre-trained classifier for sentiment analysis")
def _watch(self):
    while not self.is_stopped():
        log.debug(f"Start watching for changes on {self._collection.name}")
        try:
            self._cursor = self._collection.watch(self._pipeline)
            while not self.is_stopped():
                try:
                    doc = next(self._cursor)
                    if doc and callable(self._on_change):
                        self._on_change()
                except StopIteration:
                    break
        except pymongo.errors.OperationFailure:
            log.exception("Operation failed. Updates will not be watched")
            self.stop()
            break
        except pymongo.errors.PyMongoError:
            log.exception("Error while watching for updates")
        except Exception:
            log.exception("Unexpected error while watching for updates")
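# Note: collection.watch() requires MongoDB to run as a replica set (or a
# sharded cluster); on a standalone server it raises OperationFailure, which
# is why that case permanently stops the watcher above instead of retrying.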
def export_tweets():
    """
    Export all results as a CSV file, starting from the latest tweet.

    The optional `limit` query parameter caps the number of results
    exported (0 means no limit).
    """

    @copy_current_request_context
    def generate():
        import io, csv
        output = io.StringIO()
        writer = csv.DictWriter(output,
                                dialect='unix',
                                fieldnames=[
                                    "id",
                                    "timestamp",
                                    "text",
                                    "hashtags",
                                    "reply_to",
                                    "mentions",
                                    "keywords",
                                    "gender",
                                    "longitude",
                                    "latitude",
                                    "city",
                                    "state",
                                    "country_code",
                                    "sentiment",
                                ])
        writer.writeheader()
        output.seek(0)
        yield output.getvalue()
        output.truncate(0)

        # helper functions to retrieve coordinates
        get_long = lambda tweet: safe_get(
            safe_get_dict(tweet, ['coordinates', 'coordinates'], default=[]),
            0, '')
        get_lat = lambda tweet: safe_get(
            safe_get_dict(tweet, ['coordinates', 'coordinates'], default=[]),
            1, '')

        # the number of results to export
        limit = int(request.args.get('limit', 0))
        for tweet in db.result.find()\
                .sort('$natural', pymongo.DESCENDING)\
                .limit(limit):
            try:
                writer.writerow({
                    'id': tweet['id'],
                    'timestamp': tweet['timestamp'],
                    'text': tweet['text'],
                    'hashtags': ','.join(tweet['entities'].get('hashtags', [])),
                    'reply_to': tweet.get('reply_to', ''),
                    'mentions': ','.join(
                        i['screen_name']
                        for i in tweet['entities']['user_mentions']),
                    'keywords': ','.join(tweet.get('keywords') or []),
                    'gender': tweet.get('gender', ''),
                    'longitude': get_long(tweet),
                    'latitude': get_lat(tweet),
                    'city': safe_get_dict(tweet, ['place', 'city'], ''),
                    'state': safe_get_dict(tweet, ['place', 'state'], ''),
                    'country_code': safe_get_dict(tweet, ['place', 'country_code'], ''),
                    'sentiment': tweet.get('sentiment', ''),
                })
                yield output.getvalue()
            except Exception:
                log.exception(f"Error during exporting tweet [{tweet['id']}]")
            finally:
                output.seek(0)
                output.truncate(0)

    try:
        response = Response(stream_with_context(generate()),
                            mimetype='text/csv')
        response.headers['Content-Disposition'] = \
            'attachment; filename=result.csv'
        return response
    except Exception:
        log.exception("Unable to export tweets")
        return Response("Unable to export tweets", status=500)
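# `safe_get` and `safe_get_dict` are used throughout this section but defined
# elsewhere; a sketch of their assumed semantics (indexing into a sequence /
# walking nested dict keys, falling back to a default):
def safe_get(seq, index, default=None):
    try:
        return seq[index]
    except (IndexError, TypeError):
        return default

def safe_get_dict(d, keys, default=None):
    for key in keys:
        if not isinstance(d, dict) or key not in d:
            return default
        d = d[key]
    return d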
def process(tweet):
    """
    Perform processing of tweet data.

    :param tweet: the tweet object
    :return: a new tweet object
    """
    try:
        log.debug(f"Processing tweet[{tweet['id']}]")

        # extract the full text from the tweet
        if tweet.get('truncated') and tweet.get('extended_tweet'):
            full_text = tweet['extended_tweet']['full_text']
            tweet['text'] = full_text

        # tweets obtained via the Search API do not have a `timestamp_ms`
        # field by default, so we derive it from `created_at`
        if not tweet.get('timestamp_ms'):
            datetime_format = '%a %b %d %H:%M:%S %z %Y'
            tweet['timestamp_ms'] = datetime.strptime(
                tweet['created_at'], datetime_format).timestamp() * 1000

        hash_tags = [i['text'] for i in tweet['entities'].get('hashtags', [])]
        user_mentions = [{
            'screen_name': i['screen_name'],
            'name': i['name'],
            'id': i['id_str'],
        } for i in tweet['entities'].get('user_mentions', [])]

        # more complex processing steps
        gender.detect_gender(tweet)
        location.detect_location(tweet)
        sentiment.classify(tweet)

        t = {
            'id': tweet['id_str'],
            'user': {
                'id': tweet['user']['id_str'],
                'name': tweet['user']['name'],
                'screen_name': tweet['user']['screen_name'],
                'gender': tweet['user']['gender'],
            },
            'timestamp': tweet['created_at'],
            'timestamp_ms': int(tweet['timestamp_ms']),
            'text': tweet['text'],
            'coordinates': tweet.get('coordinates'),
            'place': {
                'city': safe_get_dict(tweet, ['place', 'city']),
                'state': safe_get_dict(tweet, ['place', 'state']),
                'country_code': safe_get_dict(tweet, ['place', 'country_code']),
            },
            'keywords': [],  # TODO
            'reply_to': tweet.get('in_reply_to_screen_name'),
            'entities': {
                'hashtags': hash_tags,
                'user_mentions': user_mentions,
            },
            'sentiment': tweet.get('sentiment'),
        }
        db.result.insert(t)
        return t  # return the new tweet object, as the docstring promises
    except pymongo.errors.DuplicateKeyError:
        log.warn(f"Tweet [{tweet['id']}] already exists in database")
    except Exception:
        log.exception(f"Error during processing tweet [{tweet['id']}]")
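# Note: the DuplicateKeyError branch above implies a unique index on `id`;
# such an index would be created elsewhere, e.g. (assumed, not shown in
# this module):
# db.result.create_index([('id', pymongo.ASCENDING)], unique=True)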
import pymongo
from datetime import datetime
from flask_pymongo import PyMongo
from hatespeech.api.app import app
from hatespeech.api.logging2 import log
from hatespeech.config import config

try:
    mongo = PyMongo(app)
    db = None
    with app.app_context():
        log.info("Establishing database connection")
        db = mongo.db
        log.info(f"Connected to database: {db.client.server_info()}")
except Exception as e:
    log.exception(e)


@app.route('/db/recreate')
def recreate_db():
    """
    Recreate the database.
    """
    from script import script

    # table for storing categories of hate words
    db.category.drop()
    db.category.create_index([('name', pymongo.ASCENDING)], unique=True)

    # table for storing hate words