# Example #1
0
# -*- coding: utf-8 -*-
import io
from collections import Counter
from datetime import datetime, timedelta

import falcon
import ujson as json

from streamer import get_db, get_keywords
from hortiradar import tokenizeRawTweetText, admins, users


# `tweets` attribute of the object returned by streamer.get_db()
# (presumably a database collection of stored tweets -- TODO confirm).
tweets = get_db().tweets
# Result of streamer.get_keywords(), fetched once when the module is imported.
KEYWORDS = get_keywords()
# strptime format used to parse the 'start' and 'end' request parameters.
time_format = "%Y-%m-%d-%H-%M-%S"

# Load the stop-word list (the "-nl" in the file name suggests Dutch) once at
# import time.
# io.open decodes UTF-8 while reading, so every line is already text on both
# Python 2 and Python 3; calling .decode() on lines read from a text-mode
# file raises AttributeError on Python 3.
with io.open("data/stoplist-nl.txt", encoding="utf-8") as f:
    # Kept as a dict (word -> 1) so existing membership tests and lookups
    # against stop_words keep working unchanged.
    stop_words = {line.strip(): 1 for line in f}  # stop words to filter out in word cloud


def get_dates(req, resp, resource, params):
    """Parse the 'start' and 'end' datetime parameters.

    The (req, resp, resource, params) signature looks like a Falcon "before"
    hook -- TODO confirm how this function is registered.

    Reads the optional 'start' and 'end' parameters from ``req`` and parses
    them with ``time_format``. A missing or empty value falls back to a very
    wide bound: 2001-01-01 for 'start' and 3001-01-01 for 'end'. When both
    values parse, the resulting datetimes are stored in ``params["start"]``
    and ``params["end"]``; if either fails to parse, neither key is set.
    """
    try:
        start = req.get_param("start")
        start = datetime.strptime(start, time_format) if start else datetime(2001, 1, 1)
        end = req.get_param("end")
        end = datetime.strptime(end, time_format) if end else datetime(3001, 1, 1)
        params["start"] = start
        params["end"] = end
    except ValueError:
        # strptime rejected one of the supplied values; describe the expected format.
        # NOTE(review): the visible code only builds `msg`. If nothing after
        # this line raises or reports it, the parse error is silently dropped
        # -- confirm against the complete file.
        msg = "Invalid datetime format string, use: %s" % time_format
# Example #2
0
from streamer import get_db, get_keywords, find_keywords_and_groups

from hortiradar import tokenizeRawTweetText

# `tweets` attribute of the object returned by streamer.get_db(); the script
# calls find() and update_one() on it (presumably a MongoDB collection --
# TODO confirm).
tweets = get_db().tweets
# Keyword data from streamer.get_keywords(), passed to find_keywords_and_groups.
keywords = get_keywords()


# Recompute the keyword annotations of every stored tweet: tokenize its text,
# match the tokens against the keyword data, and write the results back onto
# the same document.
for doc in tweets.find():
    text = doc["tweet"]["text"]
    kws, groups = find_keywords_and_groups(tokenizeRawTweetText(text), keywords)
    annotations = {
        "keywords": kws,
        "groups": groups,
        "num_keywords": len(kws),
    }
    # Address the document by its _id so only this tweet is updated.
    tweets.update_one({"_id": doc["_id"]}, {"$set": annotations})