Example #1
    def run(self):
        interval = 60   # seconds
        while not self.is_stopped():
            try:
                free = _get_percent_free_space()
                log.info(f"Disk usage: {free}% free")

                threshold = config.DISK_FREE_THRESHOLD
                if free < threshold:
                    log.warn(f"Disk space left {free:.2f}% is below threshold of {threshold:.2f}%")
                    clean_old_data(days=7)

                    free = _get_percent_free_space()
                    while free < threshold:
                        log.warn(f"Disk space left {free:.2f}% is below threshold of {threshold:.2f}%")
                        force_clean_old_tweets(count=1000)
                        free = _get_percent_free_space()

                    log.info(f"Finished cleaning disk space [{free}% free]")
            except Exception:
                log.exception("Error during disk usage monitoring")
            finally:
                if self.is_stopped():
                    return
                self.sleep(interval)
    def stop(self):
        """
        Pause the streaming process.
        """
        log.info("Stopping the stream")
        # NOTE: this will stop after the next tweet arrives
        self.running = False

    def start(self):
        """
        Start or resume the streaming if it's been stopped before.
        """
        log.info("Start listening for tweet data")
        self.filter(track=self._keywords.data(),
                    languages=['en'],
                    stall_warnings=True,
                    is_async=True)  # `async` is a reserved keyword since Python 3.7
def start_worker():
    q = connect_to_message_queue()
    log.info("Started listening for messages")
    while True:
        try:
            tweet = q.pop()
            process(tweet)
        except Exception:
            log.exception("Error while processing a message")
def force_clean_old_tweets(count=1000):
    """
    Force delete a number of old tweets to free up more space.
    :param count:   the number of tweets to delete
    """
    docs = db.result.find({}, ('_id',)) \
        .sort('timestamp_ms', pymongo.ASCENDING) \
        .limit(count)
    selector = {'_id': {'$in': [doc['_id'] for doc in docs]}}
    result = db.result.delete_many(selector)

    log.info(f"Deleted {result.deleted_count} tweet records")
def clean_old_data(days=7):
    """
    Delete tweet data older than the given number of days (7 by default).
    """
    now = int(datetime.now().timestamp() * 1000)
    date_range = days * 86400 * 1000
    date_threshold = now - date_range

    result = db.result.delete_many({'timestamp_ms': {'$lt': date_threshold}})

    log.info(f"Deleted {result.deleted_count} tweet records")
    return '', http.HTTPStatus.NO_CONTENT
Example #7
def connect_to_message_queue(*args, **kwargs):
    """
    Create a connection to a message queue according to the application config.
    :return:    the corresponding queue connection object
    """
    from hatespeech.api.logging2 import log
    from hatespeech.config import config

    log.info(f"Connecting to queue with config: args {args}, kwargs {kwargs}")
    if config.MESSAGE_QUEUE_TYPE == 'redis':
        url = config.REDIS_URL
        return RedisQueue(url, key=config.REDIS_QUEUE_KEY, *args, **kwargs)
    else:
        raise RuntimeError(f"Unknown message queue type: {config.MESSAGE_QUEUE_TYPE}")
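# NOTE: `RedisQueue` itself is not shown in this listing. Below is a minimal
# sketch of a compatible implementation over a Redis list, assuming redis-py
# and JSON-encoded payloads:
import json
import redis

class RedisQueue:
    """FIFO queue backed by a Redis list (sketch, not the original class)."""

    def __init__(self, url, key):
        self._redis = redis.Redis.from_url(url)
        self._key = key

    def push(self, item):
        # append a JSON-encoded item to the tail of the list
        self._redis.rpush(self._key, json.dumps(item))

    def pop(self):
        # block until an item is available, then take it from the head
        _, raw = self._redis.blpop(self._key)
        return json.loads(raw)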
def detect_location(tweet):
    """
    Detect the location where the tweet is posted.
    :param tweet:   the tweet object
    :return:        nothing, the tweet object will be updated inline
    """
    if tweet['place']:
        # https://developer.twitter.com/en/docs/tweets/data-dictionary/overview/geo-objects#place-dictionary
        if tweet['place']['place_type'] == 'city':
            # NOTE: US places have full_name as "<city>, <state>"; other
            # countries use different formats (often "<city>, <country>"),
            # but for now we assume the 2nd element of full_name is the state
            tweet['place']['city'] = tweet['place']['name']
            tweet['place']['state'] = tweet['place']['full_name'].split(',')[1].strip()
        elif tweet['place']['place_type'] == 'admin':
            tweet['place']['city'] = None
            tweet['place']['state'] = tweet['place']['name']
            # TODO: convert state name to state code
        elif tweet['place']['place_type'] == 'country':
            # only country info is available, nothing else we can do here
            tweet['place']['city'] = None
            tweet['place']['state'] = None
        else:
            log.warn(
                f"Not handling the place with unknown type: {tweet['place']}")

        # If the exact location of the tweet is not available, try to produce an approximate coordinates
        # by calculating the central point of the bounding box of the given place
        if not tweet['coordinates']:
            avg_long, avg_lat = [
                mean(lst) for lst in zip(
                    *tweet['place']['bounding_box']['coordinates'][0])
            ]
            tweet['coordinates'] = {
                'type': 'Point',
                'coordinates': [avg_long, avg_lat],
                'generated': True,
            }
        log.info(
            f"Detected location of tweet {tweet['id']} - coordinates: {tweet['coordinates']} - place: {tweet['place']}"
        )
    elif tweet['coordinates']:
        log.info(f"Received tweet with coordinates: {tweet['coordinates']}")
        long, lat = tweet['coordinates']['coordinates']
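# For illustration, how `detect_location` behaves on a hypothetical US city
# place with no exact coordinates (Twitter bounding boxes are GeoJSON
# polygons, so the first ring holds the four corners):
tweet = {
    'id': 1,
    'coordinates': None,
    'place': {
        'place_type': 'city',
        'country': 'United States',
        'name': 'Austin',
        'full_name': 'Austin, TX',
        'bounding_box': {
            'coordinates': [[
                [-97.93, 30.13], [-97.93, 30.52],
                [-97.57, 30.52], [-97.57, 30.13],
            ]],
        },
    },
}
detect_location(tweet)
# tweet['place']['city']  -> 'Austin'
# tweet['place']['state'] -> 'TX'
# tweet['coordinates']    -> {'type': 'Point',
#                             'coordinates': [-97.75, 30.325], 'generated': True}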
def _delete_hate_word():
    """Delete a hate word from the list."""
    try:
        req = request.get_json(force=True)
        if not req:
            return Response("Supplied data format is malformed", status=400)

        result = db.hateword.delete_one({'word': req['word']})

        if result.deleted_count == 1:
            log.info(f"Deleted hate word [{req['word']}]")
            return ""
        else:
            return Response(f"Unable to delete hate word [{req['word']}]", 400)
    except Exception as e:
        return Response(str(e), status=500)
Example #10
def login():
    if not session.get('username'):
        req = request.get_json(force=True)
        username = req.get('username')
        password = req.get('password')

        if not username or not password:
            return Response('Input format is invalid', status=400)

        user = db.user.find_one({'username': username, 'password': password})
        if user:
            session['username'] = username
            log.info(f"User [{username}] has just logged in")
            return 'Successful'
        else:
            return Response('Username or password is incorrect', status=401)
    else:
        return 'User already logged in'
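# NOTE: the login example above stores and compares plaintext passwords. A
# hardened variant (an assumption, not part of the original code; it reuses
# the surrounding `db`, `session`, `username`, and `password` names) would
# hash at registration and verify at login, e.g. with Werkzeug's helpers:
from werkzeug.security import check_password_hash, generate_password_hash

# at registration time: store a salted hash, never the raw password
db.user.insert_one({
    'username': username,
    'password': generate_password_hash(password),
})

# at login time: look the user up by username only, then verify the hash
user = db.user.find_one({'username': username})
if user and check_password_hash(user['password'], password):
    session['username'] = username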
def recreate_db():
    """
    Recreate the database.
    """
    import pymongo
    from script import script

    # table for storing categories of hate words
    db.category.drop()
    db.category.create_index([('name', pymongo.ASCENDING)], unique=True)

    # table for storing hate words
    db.hateword.drop()
    db.hateword.create_index([('word', pymongo.ASCENDING)], unique=True)
    script.populate_hateword_data()

    # table for storing tweets
    # NOTE: we do not use this table anymore
    db.tweet.drop()

    # table for storing processed tweets
    db.result.drop()
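    # NOTE: autoIndexId was deprecated in MongoDB 3.2 and removed in 4.0,
    # so recent servers will reject the option below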
    db.create_collection(
        'result',
        capped=True,
        size=_get_tweet_collection_size(),
        autoIndexId=False,
    )
    db.result.create_index([('id', pymongo.ASCENDING)], unique=True)
    log.info(
        f"Storing tweets in a collection of max size {db.result.options()['size']/(2**20):.2f}MB"
    )

    # table for storing user info
    db.user.drop()
    db.user.create_index([('username', pymongo.ASCENDING)], unique=True)
    script.populate_user_data()

    log.info("Recreated database successfully")
    return '', http.HTTPStatus.NO_CONTENT
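# NOTE: `_get_tweet_collection_size` is referenced above but not shown. A
# plausible sketch, assuming the cap is configured in megabytes (the config
# key below is hypothetical):
def _get_tweet_collection_size():
    """Return the capped-collection size in bytes (sketch)."""
    return config.TWEET_COLLECTION_SIZE_MB * 2**20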
def _set_hate_word():
    """Add a new hate word to the list or update existing one."""
    try:
        req = request.get_json(force=True)
        if not req:
            return Response("Supplied data format is malformed", status=400)

        hateword = _normalize(req['word'])
        categories = [_normalize(i) for i in req.get('category', [])]
        similar_words = [_normalize(i) for i in req.get('similar_to', [])]

        obj = {
            'word': hateword,
            'category': categories,
            'similar_to': similar_words,
        }

        # add/update the word in db
        result = db.hateword.replace_one({'word': hateword}, obj, upsert=True)

        # add new categories if there are any
        for cate in categories:
            db.category.update_one({'name': cate},
                                   {'$setOnInsert': {
                                       'name': cate
                                   }},
                                   upsert=True)

        # check the result; matched_count also covers replacements that left
        # the document unchanged (modified_count would be 0 in that case)
        if result.upserted_id is not None:
            log.info(f"Added new hate word [{hateword}]: {obj}")
            return ""
        elif result.matched_count == 1:
            log.info(f"Updated hate word [{hateword}]: {obj}")
            return ""
        else:
            raise RuntimeError("Unknown error")
    except Exception as e:
        return Response(str(e), status=500)
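# Example client-side call for the handler above; the route path is not shown
# in this listing, so the URL here is a placeholder:
import requests

resp = requests.post('http://localhost:5000/hateword', json={
    'word': 'example-word',
    'category': ['example-category'],
    'similar_to': ['example-synonym'],
})
print(resp.status_code)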
def init():
    """
    Initialize the sentiment analyser.
    This method must be called before any usage of this module.
    """
    try:
        log.info("Loading pre-trained classifier for sentiment analysis")
        load_classifier()
    except FileNotFoundError:
        log.info("Not found pre-trained classifier. Will train a new one")
        train_and_test_classifier()
        try:
            load_classifier()
        except Exception:
            log.exception(
                "Unable to load the newly trained classifier. Sentiment analysis will not work"
            )
            return
    except Exception:
        log.exception(
            "Unable to load the classifier. Sentiment analysis will not work")
        return

    log.info("Loaded pre-trained classifier for sentiment analysis")
Example #14
    Start the server with Gunicorn.
    """
    from gunicorn.app.base import Application

    class FlaskApplication(Application):
        def init(self, parser, opts, args):
            return {}

        def load(self):
            init_app()
            return app

    application = FlaskApplication()
    return application.run()


@manager.command
def worker():
    init_worker()

    from hatespeech.api.worker import start_worker
    start_worker()


if __name__ == '__main__':
    try:
        manager.run()
    except (KeyboardInterrupt, SystemExit):
        log.info("Stopping the app")
        teardown_app()
def train_and_test_classifier():
    """
    Train the Naive Bayes classifier and perform preliminary testing on it.
    The model and feature extracting function will be persisted to file.
    """
    # load the dataset and keep the necessary columns
    data = pd.read_csv(DATASET_PATH)
    data = data[['text', 'sentiment']]

    # split train-test data and arrange respective classes
    train, test = train_test_split(data, test_size=0.1)

    train_pos = train[train['sentiment'] == 'Positive']
    train_pos = train_pos['text']
    train_neg = train[train['sentiment'] == 'Negative']
    train_neg = train_neg['text']
    train_neu = train[train['sentiment'] == 'Neutral']
    train_neu = train_neu['text']

    test_pos = test[test['sentiment'] == 'Positive']
    test_pos = test_pos['text']
    test_neg = test[test['sentiment'] == 'Negative']
    test_neg = test_neg['text']
    test_neu = test[test['sentiment'] == 'Neutral']
    test_neu = test_neu['text']

    log.info(
        f"Train data: {len(train_pos)} positive, {len(train_neg)} negative, {len(train_neu)} neutral"
    )
    log.info(
        f"Test data: {len(test_pos)} positive, {len(test_neg)} negative, {len(test_neu)} neutral"
    )

    # text pre-processing
    tweets = [(_preprocess_text(row.text), row.sentiment)
              for index, row in train.iterrows()]

    # extract word features
    word_features = _get_word_features(_get_words_in_tweets(tweets))
    extract_features = partial(_extract_features, word_features=word_features)
    log.info(f"{len(word_features)} feature(s): {word_features[:1000]}")

    # persist the feature extraction function to file
    with open(FEATURE_FUNC_PATH, 'wb') as f:
        pickle.dump(extract_features, f)

    # train the classifier
    training_set = nltk.classify.apply_features(extract_features, tweets)
    classifier = nltk.NaiveBayesClassifier.train(training_set)
    classifier.show_most_informative_features(20)

    # persist the classifier to file
    with open(MODEL_PATH, 'wb') as f:
        pickle.dump(classifier, f)

    neg_cnt = 0
    pos_cnt = 0
    neu_cnt = 0
    for obj in test_neg:
        res = classifier.classify(extract_features(obj.split()))
        if res == 'Negative':
            neg_cnt += 1
    for obj in test_neu:
        res = classifier.classify(extract_features(obj.split()))
        if res == 'Neutral':
            neu_cnt += 1
    for obj in test_pos:
        res = classifier.classify(extract_features(obj.split()))
        if res == 'Positive':
            pos_cnt += 1

    log.info(
        f"Negative: {neg_cnt}/{len(test_neg)} [{neg_cnt/len(test_neg)*100.0}%]"
    )
    log.info(
        f"Neutral: {neu_cnt}/{len(test_neu)} [{neu_cnt/len(test_neu)*100.0}%]")
    log.info(
        f"Positive: {pos_cnt}/{len(test_pos)} [{pos_cnt/len(test_pos)*100.0}%]"
    )
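# NOTE: the helpers `_get_words_in_tweets`, `_get_word_features`, and
# `_extract_features` are not shown. A sketch following the classic NLTK
# tweet-sentiment recipe, assuming `_preprocess_text` returns a token list:
import nltk

def _get_words_in_tweets(tweets):
    """Flatten (tokens, sentiment) pairs into a single list of tokens."""
    return [word for tokens, _sentiment in tweets for word in tokens]

def _get_word_features(wordlist):
    """Vocabulary ordered by descending frequency."""
    return [word for word, _count in nltk.FreqDist(wordlist).most_common()]

def _extract_features(document, word_features):
    """Map a token list to binary 'contains(word)' features."""
    document_words = set(document)
    return {f'contains({word})': (word in document_words)
            for word in word_features}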
    def _on_hateword_updated(self):
        log.info("Hatewords were just updated")
        if self.running:
            self.stop()
            self.start()

    def on_limit(self, track):
        log.info(f"Tracking info: {track}")

    def on_connect(self):
        log.info("Connected to Twitter streaming API")
Example #19
def logout():
    username = session.pop('username', None)
    log.info(f"User [{username}] has just logged out")
    return 'Logged out'
Example #20
import re
from hatespeech.api.logging2 import log
from gender_predictor import GenderPredictor

# ============================================================================

# setup the gender predictor
log.info("Setting up gender predictor")
gp = GenderPredictor()
gp.train_and_test()

# ============================================================================


def detect_gender(tweet):
    """
    Detect the gender of the tweet's author.
    :param tweet:   the tweet object
    :return:        nothing, the tweet object will be updated inline
    """
    orig_name = tweet['user']['name']
    name = re.sub(r'[^\x00-\x7f]', r'',
                  orig_name)  # remove non-ASCII characters
    gender = 'NA'

    try:
        first_name = extract_first_name(name)

        global gp
        gender = gp.classify(first_name)
    except IndexError:
        # no first name could be extracted; keep the default 'NA'
        pass
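# NOTE: `extract_first_name` is not part of this example; a minimal sketch
# consistent with the IndexError handling above (the lookup raises when the
# name contains no alphabetic token):
def extract_first_name(name):
    """Return the first alphabetic token of a display name (sketch)."""
    return re.findall(r'[A-Za-z]+', name)[0]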
Example #21
def monitor_disk_usage():
    """
    Start monitoring disk usage in a separate thread.
    """
    start_thread(_MonitorDiskUsageThread())
    log.info("Started monitoring disk usage")
import http
import pymongo
from datetime import datetime
from flask_pymongo import PyMongo
from hatespeech.api.app import app
from hatespeech.api.logging2 import log
from hatespeech.config import config

try:
    mongo = PyMongo(app)
    db = None
    with app.app_context():
        log.info(f"Establishing database connection")
        db = mongo.db
        log.info(f"Connected to database: {db.client.server_info()}")
except Exception as e:
    log.exception(e)

