def run(self):
    interval = 60  # seconds
    while not self.is_stopped():
        try:
            free = _get_percent_free_space()
            log.info(f"Disk usage: {free}% free")

            threshold = config.DISK_FREE_THRESHOLD
            if free < threshold:
                log.warn(f"Disk space left {free:.2f}% is below threshold of {threshold:.2f}%")
                clean_old_data(days=7)
                free = _get_percent_free_space()

                while free < threshold:
                    log.warn(f"Disk space left {free:.2f}% is below threshold of {threshold:.2f}%")
                    force_clean_old_tweets(count=1000)
                    free = _get_percent_free_space()

                log.info(f"Finished cleaning disk space [{free}% free]")
        except Exception:
            log.exception("Error during disk usage monitoring")
        finally:
            if self.is_stopped():
                return
            self.sleep(interval)
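# NOTE: _get_percent_free_space() is called by the monitor above but is not
# shown in this listing. A minimal sketch using the standard library's
# shutil.disk_usage; the `path` default is an assumption -- point it at the
# volume holding the database.
import shutil

def _get_percent_free_space(path='/'):
    usage = shutil.disk_usage(path)
    return usage.free / usage.total * 100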
def stop(self):
    """
    Pause the streaming process.
    """
    log.info("Stopping the stream")

    # NOTE: this will stop after the next tweet arrives
    self.running = False
def start(self):
    """
    Start or resume the streaming if it's been stopped before.
    """
    log.info("Start listening for tweet data")

    # NOTE: `async` became a reserved keyword in Python 3.7; tweepy 3.7+
    # renamed this parameter to `is_async`
    self.filter(track=self._keywords.data(),
                languages=['en'],
                stall_warnings=True,
                is_async=True)
def start_worker():
    q = connect_to_message_queue()
    log.info("Started listening for messages")
    while True:
        try:
            tweet = q.pop()
            process(tweet)
        except Exception:
            log.exception("Error during execution")
def force_clean_old_tweets(count=1000):
    """
    Force-delete a number of the oldest tweets to free up space.

    :param count: the number of tweets to delete
    """
    docs = db.result.find({}, ('_id',)) \
        .sort('timestamp_ms', pymongo.ASCENDING) \
        .limit(count)
    selector = {'_id': {'$in': [doc['_id'] for doc in docs]}}
    result = db.result.delete_many(selector)
    log.info(f"Deleted {result.deleted_count} tweet records")
def clean_old_data(days=7):
    """
    Delete tweet data older than the given number of days (7 by default).
    """
    now = int(datetime.now().timestamp() * 1000)
    date_range = days * 86400 * 1000  # days -> milliseconds
    date_threshold = now - date_range
    result = db.result.delete_many({'timestamp_ms': {'$lt': date_threshold}})
    log.info(f"Deleted {result.deleted_count} tweet records")
    return '', http.HTTPStatus.NO_CONTENT
def connect_to_message_queue(*args, **kwargs):
    """
    Create a connection to a message queue according to the application config.

    :return: the corresponding queue connection object
    """
    from hatespeech.api.logging2 import log
    from hatespeech.config import config

    log.info(f"Connecting to queue with config: args {args}, kwargs {kwargs}")
    if config.MESSAGE_QUEUE_TYPE == 'redis':
        url = config.REDIS_URL
        return RedisQueue(url, *args, key=config.REDIS_QUEUE_KEY, **kwargs)
    else:
        # NOTE: the original error message referenced config.QUEUE_TYPE,
        # which does not match the key checked above
        raise RuntimeError(f"Unknown message queue type: {config.MESSAGE_QUEUE_TYPE}")
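# NOTE: RedisQueue is instantiated above but not defined in this listing.
# A minimal sketch over a Redis list using redis-py, with the constructor
# inferred from the call site; the pop naming matches start_worker() above,
# everything else is an assumption.
import redis

class RedisQueue:
    def __init__(self, url, key='queue'):
        self._redis = redis.Redis.from_url(url)
        self._key = key

    def push(self, item):
        self._redis.rpush(self._key, item)

    def pop(self):
        # BLPOP blocks until an item is available and returns a (key, value) pair
        _, value = self._redis.blpop(self._key)
        return value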
def detect_location(tweet):
    """
    Detect the location where the tweet is posted.

    :param tweet: the tweet object
    :return: nothing, the tweet object will be updated inline
    """
    if tweet['place']:
        # https://developer.twitter.com/en/docs/tweets/data-dictionary/overview/geo-objects#place-dictionary
        if tweet['place']['place_type'] == 'city':
            if tweet['place']['country'] == 'United States':
                tweet['place']['city'] = tweet['place']['name']
                tweet['place']['state'] = tweet['place']['full_name'].split(',')[1].strip()
            else:
                # NOTE: other countries have different formats to parse
                # some have full_name as <city, state>, while others are <city, country>
                # at the moment, we have to assume that the 2nd element of full_name is the state
                tweet['place']['city'] = tweet['place']['name']
                tweet['place']['state'] = tweet['place']['full_name'].split(',')[1].strip()
        elif tweet['place']['place_type'] == 'admin':
            tweet['place']['city'] = None
            tweet['place']['state'] = tweet['place']['name']
            # TODO: convert state name to state code
        elif tweet['place']['place_type'] == 'country':
            # only country info is available, nothing else we can do here
            tweet['place']['city'] = None
            tweet['place']['state'] = None
        else:
            log.warn(f"Not handling the place with unknown type: {tweet['place']}")

        # If the exact location of the tweet is not available, try to produce approximate
        # coordinates by calculating the central point of the bounding box of the given place
        if not tweet['coordinates']:
            avg_long, avg_lat = [
                mean(lst)
                for lst in zip(*tweet['place']['bounding_box']['coordinates'][0])
            ]
            tweet['coordinates'] = {
                'type': 'Point',
                'coordinates': [avg_long, avg_lat],
                'generated': True,
            }

        log.info(
            f"Detected location of tweet {tweet['id']} - coordinates: {tweet['coordinates']} - place: {tweet['place']}"
        )
    elif tweet['coordinates']:
        log.info(f"Received tweet with coordinates: {tweet['coordinates']}")
        long, lat = tweet['coordinates']['coordinates']
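# NOTE: a small worked example of the bounding-box centroid computation above,
# with a hypothetical box; Twitter bounding boxes are GeoJSON-style lists of
# [longitude, latitude] corner pairs.
from statistics import mean

bounding_box = [[-74.0, 40.5], [-74.0, 41.0], [-73.5, 41.0], [-73.5, 40.5]]

# zip(*...) regroups the four corners into one longitude tuple and one latitude
# tuple, so the per-axis means give the center of the box
avg_long, avg_lat = [mean(lst) for lst in zip(*bounding_box)]
print(avg_long, avg_lat)  # -73.75 40.75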
def _delete_hate_word():
    """Delete a hate word from the list."""
    try:
        req = request.get_json(force=True)
        if not req:
            return Response("Supplied data format is malformed", status=400)

        result = db.hateword.delete_one({'word': req['word']})
        if result.deleted_count == 1:
            log.info(f"Deleted hate word [{req['word']}]")
            return ""
        else:
            return Response(f"Unable to delete hate word [{req['word']}]", 400)
    except Exception as e:
        # NOTE: pass a string to Response; a raw exception object is not a valid body
        return Response(str(e), status=500)
def login():
    if not session.get('username'):
        req = request.get_json(force=True)
        username = req.get('username')
        password = req.get('password')
        if not username or not password:
            return Response('Input format is invalid', status=400)

        user = db.user.find_one({'username': username, 'password': password})
        if user:
            session['username'] = username
            log.info(f"User [{username}] has just logged in")
            return 'Successful'
        else:
            return Response('Username or password is incorrect', status=401)
    else:
        return 'User already logged in'
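# NOTE: a quick usage sketch for the login endpoint; the base URL and the
# `/login` route path are assumptions, since the route decorators are not
# shown in this listing. The payload shape matches what login() reads.
import requests

resp = requests.post('http://localhost:5000/login',
                     json={'username': 'alice', 'password': 'secret'})
print(resp.status_code, resp.text)  # 200 'Successful' on correct credentials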
def recreate_db():
    """
    Recreate the database.
    """
    import pymongo
    from script import script

    # table for storing categories of hate words
    db.category.drop()
    db.category.create_index([('name', pymongo.ASCENDING)], unique=True)

    # table for storing hate words
    db.hateword.drop()
    db.hateword.create_index([('word', pymongo.ASCENDING)], unique=True)
    script.populate_hateword_data()

    # table for storing tweets
    # NOTE: we do not use this table anymore
    db.tweet.drop()

    # table for storing processed tweets
    db.result.drop()
    db.create_collection(
        'result',
        capped=True,
        size=_get_tweet_collection_size(),
        autoIndexId=False,
    )
    db.result.create_index([('id', pymongo.ASCENDING)], unique=True)
    log.info(
        f"Storing tweets in a collection of max size {db.result.options()['size']/(2**20):.2f}MB"
    )

    # table for storing user info
    db.user.drop()
    db.user.create_index([('username', pymongo.ASCENDING)], unique=True)
    script.populate_user_data()

    log.info("Recreated database successfully")
    return '', http.HTTPStatus.NO_CONTENT
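# NOTE: _get_tweet_collection_size() is referenced above but not shown in this
# listing. A plausible sketch, assuming the capped collection is sized as a
# configurable fraction of the disk; both the config key and the strategy are
# assumptions.
import shutil

def _get_tweet_collection_size():
    total = shutil.disk_usage('/').total
    return int(total * config.TWEET_COLLECTION_DISK_FRACTION)  # e.g. 0.5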
def _set_hate_word():
    """Add a new hate word to the list or update an existing one."""
    try:
        req = request.get_json(force=True)
        if not req:
            return Response("Supplied data format is malformed", status=400)

        hateword = _normalize(req['word'])
        categories = [_normalize(i) for i in req.get('category', [])]
        similar_words = [_normalize(i) for i in req.get('similar_to', [])]
        obj = {
            'word': hateword,
            'category': categories,
            'similar_to': similar_words,
        }

        # add/update the word in db
        result = db.hateword.replace_one({'word': hateword}, obj, upsert=True)

        # add new categories if there are any
        for cate in categories:
            db.category.update_one({'name': cate},
                                   {'$setOnInsert': {'name': cate}},
                                   upsert=True)

        # check result
        if result.modified_count == 1:
            log.info(f"Updated hate word [{hateword}]: {obj}")
            return ""
        elif result.upserted_id is not None:
            log.info(f"Added new hate word [{hateword}]: {obj}")
            return ""
        else:
            raise RuntimeError("Unknown error")
    except Exception as e:
        # NOTE: pass a string to Response; a raw exception object is not a valid body
        return Response(str(e), status=500)
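# NOTE: example request payload for this endpoint, matching the fields read
# above; the route path is not shown in this listing, so `/hateword` is an
# assumption.
#
#   POST /hateword
#   {
#       "word": "example",
#       "category": ["profanity"],
#       "similar_to": ["examples"]
#   }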
def init():
    """
    Initialize the sentiment analyser.

    This method must be called before any usage of this module.
    """
    try:
        log.info("Loading pre-trained classifier for sentiment analysis")
        load_classifier()
    except FileNotFoundError:
        log.info("Pre-trained classifier not found. Will train a new one")
        train_and_test_classifier()
        try:
            load_classifier()
        except Exception:
            log.exception("Unable to train and test new classifier. Sentiment analysis will not work")
            return
    except Exception:
        log.exception("Unable to load the classifier. Sentiment analysis will not work")
        return

    log.info("Loaded pre-trained classifier for sentiment analysis")
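# NOTE: load_classifier() is not shown here. A minimal sketch that mirrors how
# train_and_test_classifier() persists its artifacts below; keeping them in
# module-level globals is an assumption about the original design.
import pickle

def load_classifier():
    global classifier, extract_features
    with open(MODEL_PATH, 'rb') as f:
        classifier = pickle.load(f)
    with open(FEATURE_FUNC_PATH, 'rb') as f:
        extract_features = pickle.load(f)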
    Start the server with Gunicorn.
    """
    from gunicorn.app.base import Application

    class FlaskApplication(Application):
        def init(self, parser, opts, args):
            return {}

        def load(self):
            init_app()
            return app

    application = FlaskApplication()
    return application.run()


@manager.command
def worker():
    init_worker()

    from hatespeech.api.worker import start_worker
    start_worker()


if __name__ == '__main__':
    try:
        manager.run()
    except (KeyboardInterrupt, SystemExit):
        log.info("Stopping the app")
        teardown_app()
def train_and_test_classifier():
    """
    Train the Naive Bayes classifier and perform preliminary testing on it.

    The model and feature-extracting function will be persisted to file.
    """
    # load the dataset and keep the necessary columns
    data = pd.read_csv(DATASET_PATH)
    data = data[['text', 'sentiment']]

    # split train-test data and arrange respective classes
    train, test = train_test_split(data, test_size=0.1)
    train_pos = train[train['sentiment'] == 'Positive']['text']
    train_neg = train[train['sentiment'] == 'Negative']['text']
    train_neu = train[train['sentiment'] == 'Neutral']['text']
    test_pos = test[test['sentiment'] == 'Positive']['text']
    test_neg = test[test['sentiment'] == 'Negative']['text']
    test_neu = test[test['sentiment'] == 'Neutral']['text']

    log.info(f"Train data: {len(train_pos)} positive, {len(train_neg)} negative, {len(train_neu)} neutral")
    log.info(f"Test data: {len(test_pos)} positive, {len(test_neg)} negative, {len(test_neu)} neutral")

    # text pre-processing
    tweets = [(_preprocess_text(row.text), row.sentiment)
              for index, row in train.iterrows()]

    # extract word features
    word_features = _get_word_features(_get_words_in_tweets(tweets))
    extract_features = partial(_extract_features, word_features=word_features)
    log.info(f"{len(word_features)} feature(s): {word_features[:1000]}")

    # persist the feature extraction function to file
    with open(FEATURE_FUNC_PATH, 'wb') as f:
        pickle.dump(extract_features, f)

    # train the classifier
    training_set = nltk.classify.apply_features(extract_features, tweets)
    classifier = nltk.NaiveBayesClassifier.train(training_set)
    classifier.show_most_informative_features(20)

    # persist the classifier to file
    with open(MODEL_PATH, 'wb') as f:
        pickle.dump(classifier, f)

    # preliminary testing: count correct predictions per class
    neg_cnt = 0
    pos_cnt = 0
    neu_cnt = 0
    for obj in test_neg:
        res = classifier.classify(extract_features(obj.split()))
        if res == 'Negative':
            neg_cnt += 1
    for obj in test_neu:
        res = classifier.classify(extract_features(obj.split()))
        if res == 'Neutral':
            neu_cnt += 1
    for obj in test_pos:
        res = classifier.classify(extract_features(obj.split()))
        if res == 'Positive':
            pos_cnt += 1

    log.info(f"Negative: {neg_cnt}/{len(test_neg)} [{neg_cnt/len(test_neg)*100.0}%]")
    log.info(f"Neutral: {neu_cnt}/{len(test_neu)} [{neu_cnt/len(test_neu)*100.0}%]")
    log.info(f"Positive: {pos_cnt}/{len(test_pos)} [{pos_cnt/len(test_pos)*100.0}%]")
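# NOTE: _get_words_in_tweets, _get_word_features, and _extract_features are not
# shown in this listing. A sketch in the style of the classic NLTK
# document-features pattern, matching the signatures used above; the details
# are assumptions.
import nltk

def _get_words_in_tweets(tweets):
    # flatten the (words, sentiment) pairs into a single word list
    all_words = []
    for words, sentiment in tweets:
        all_words.extend(words)
    return all_words

def _get_word_features(wordlist):
    # vocabulary ordered by frequency, most common words first
    return [word for word, count in nltk.FreqDist(wordlist).most_common()]

def _extract_features(document, word_features):
    # boolean bag-of-words features, one per vocabulary word
    document_words = set(document)
    return {f'contains({word})': (word in document_words)
            for word in word_features}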
def _on_hateword_updated(self):
    log.info("Hatewords were just updated")
    if self.running:
        self.stop()
        self.start()
def on_limit(self, track):
    # NOTE: tweepy passes the number of undelivered tweets since the connection opened
    log.info(f"Tracking info: {track}")
def on_connect(self):
    log.info("Connected to Twitter streaming API")
def logout():
    # capture the username before clearing the session; the original referenced
    # an undefined `username` variable here
    username = session.pop('username', None)
    log.info(f"User [{username}] has just logged out")
    return 'Logged out'
import re

from hatespeech.api.logging2 import log
from gender_predictor import GenderPredictor

# ============================================================================

# setup the gender predictor
log.info("Setting up gender predictor")
gp = GenderPredictor()
gp.train_and_test()

# ============================================================================


def detect_gender(tweet):
    """
    Detect the gender of the tweet's author.

    :param tweet: the tweet object
    :return: nothing, the tweet object will be updated inline
    """
    orig_name = tweet['user']['name']
    name = re.sub(r'[^\x00-\x7f]', r'', orig_name)  # remove non-ASCII characters

    gender = 'NA'
    try:
        first_name = extract_first_name(name)
        global gp
        gender = gp.classify(first_name)
    except IndexError:
        pass  # assumed: fall through and keep the 'NA' default (listing truncated here)
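# NOTE: extract_first_name() is not shown in this listing. A sketch assuming it
# takes the first whitespace-separated token, which would raise the IndexError
# handled above when the cleaned name is empty.
def extract_first_name(name):
    return name.strip().split()[0]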
def monitor_disk_usage():
    """
    Start monitoring disk usage in a separate thread.
    """
    start_thread(_MonitorDiskUsageThread())
    log.info("Started monitoring disk usage")
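# NOTE: neither the thread base class providing is_stopped()/sleep() nor
# start_thread() is shown in this listing. A common sketch of that pattern
# built on threading.Event; this is an assumption about the original design.
import threading

class StoppableThread(threading.Thread):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self._stop_event = threading.Event()

    def is_stopped(self):
        return self._stop_event.is_set()

    def sleep(self, seconds):
        # returns early if stop() is called during the wait
        self._stop_event.wait(seconds)

    def stop(self):
        self._stop_event.set()

def start_thread(thread):
    # daemon threads do not block interpreter shutdown
    thread.daemon = True
    thread.start()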
import http

import pymongo
from datetime import datetime
from flask_pymongo import PyMongo

from hatespeech.api.app import app
from hatespeech.api.logging2 import log
from hatespeech.config import config

try:
    mongo = PyMongo(app)
    db = None
    with app.app_context():
        log.info("Establishing database connection")
        db = mongo.db
        log.info(f"Connected to database: {db.client.server_info()}")
except Exception as e:
    log.exception(e)


@app.route('/db/recreate')
def recreate_db():
    """
    Recreate the database.
    """
    import pymongo
    from script import script

    # table for storing categories of hate words
    db.category.drop()
    db.category.create_index([('name', pymongo.ASCENDING)], unique=True)