def collect_todays_tweets(entry):
    """Roll the tweets stored in ``<entry>-latest`` up into one summary document.

    Indexes whose name contains "-latest" or "median" are skipped, as is any
    topic whose "-latest" index does not exist. Otherwise every tweet in the
    "-latest" index is read, a per-hour tweet count and a word-frequency
    counter are built, and a single summary document is added to ``entry``
    under the id ``last_id(entry) + 1``. The "-latest" index is then deleted
    and re-created so collection starts from an empty index.

    Args:
        entry: Name of the topic index to summarise.

    Returns:
        None.
    """
    # Only base topic indexes are summarised; helper indexes are ignored.
    if "-latest" in entry or "median" in entry:
        return

    latest_index = entry + "-latest"
    entry_total = elastic_utils.last_id(entry)
    if elastic_utils.check_index_exists(latest_index) is not True:
        return

    total = elastic_utils.last_id(latest_index)
    day_res = elastic_utils.iterate_search(
        latest_index,
        query={
            "query": {"match_all": {}},
            "sort": [{"last_time": {"order": "desc"}}],
        })

    word_counter = Counter()
    hour_break_dict = {}
    dateobj = None
    created_at = None
    for tweet in day_res:
        tweeted_at = datetime.strptime(tweet["_source"]["created"],
                                       '%Y-%m-%d %H:%M:%S')
        # The values of the last tweet iterated end up in the summary.
        dateobj = tweeted_at.strftime("%Y-%m-%d")
        created_at = tweeted_at.strftime("%Y-%m-%dT%H:%M:%S")

        hour = str(tweeted_at.hour)
        hour_break_dict[hour] = hour_break_dict.get(hour, 0) + 1

        words = preprocessor.filter_multiple(str(tweet["_source"]["text"]),
                                             ats=True,
                                             hashtags=True,
                                             stopwords=True,
                                             stemming=False,
                                             urls=True,
                                             singles=True)
        word_counter.update(words)

    # Write the summary and drop the source index once, after every tweet has
    # been read. Doing this inside the loop rewrote the same document for each
    # tweet and deleted the index while it was still being iterated.
    if created_at is not None:
        freq_obj = {
            "hour_breakdown": hour_break_dict,
            "words": json.dumps(word_counter.most_common(400)),
            "total": total,
            "date": dateobj,
            "last_time": created_at
        }
        elastic_utils.add_entry(entry, entry_total + 1, freq_obj)
        elastic_utils.delete_index(latest_index)

    try:
        elastic_utils.create_index(latest_index)
    except Exception:
        # Best effort: when no tweets were read the index was never deleted,
        # so creating it again may be rejected.
        print(
            "Todays index already exists! This is an exception, but it's probably ok"
        )
def oldtweets(request):
    """Render the old-tweets page and, on a valid POST, backfill an index.

    For a valid submission the date range ``start_date`` .. ``end_date``
    (inclusive) is walked one day at a time: the tweets for each day are
    fetched with ``collect_tweets`` and passed to ``aggregate`` together with
    the target index name. The result of the last ``aggregate`` call is handed
    to the template as ``words``; it stays an empty list when nothing was
    processed.

    Args:
        request: The incoming Django request.

    Returns:
        The rendered ``fyp/oldtweets/index.html`` response with a fresh,
        unbound form.
    """
    words = []
    if request.POST:
        completed_form = forms.OldTweetsForm(request.POST)
        if completed_form.is_valid():
            cleaned = completed_form.cleaned_data
            index_name = cleaned['index_name']
            search_query = cleaned['query_search']
            start_date = cleaned['start_date']
            one_day = datetime.timedelta(days=1)
            # Exclusive upper bound, so the chosen end date itself is included.
            end_date = cleaned['end_date'] + one_day
            # A falsy result (None or False) is treated as "index missing".
            if not elastic_utils.check_index_exists(index_name):
                elastic_utils.create_index(index_name)
            # "<" rather than "!=": a start date after the end date must not
            # loop forever.
            while start_date < end_date:
                tweets = collect_tweets(search_query, start_date,
                                        start_date + one_day)
                words = aggregate(tweets, index_name, start_date)
                start_date += one_day

    es_index_form = forms.OldTweetsForm()
    return render(request, "fyp/oldtweets/index.html", {
        "oldtweetsform": es_index_form,
        "words": words
    })
Beispiel #3
0
 def test_list_all_index(self):
     """Check that the index listing names both ``test`` and ``list_all``.

     The listing is fetched once, before the two indexes are deleted; the
     final assertion is made against that same earlier listing.
     """
     es.create_index("list_all")
     listing = es.list_all_indexes()
     for quoted_name in ("'test'", "'list_all'"):
         self.assertIn(quoted_name, listing)
     for index_name in ("list_all", "test"):
         es.delete_index(index_name)
     self.assertIn("{}", listing)
 def test_list_all_index(self):
     """Check that the index listing contains ``test`` and ``list_all``.

     Both indexes are deleted afterwards, with a one-second pause after the
     listing is fetched and again after the deletions.
     """
     es.create_index("list_all")
     listing = es.list_all_indexes()
     time.sleep(1)
     for expected in ('test', 'list_all'):
         self.assertIn(expected, listing)
     for index_name in ("list_all", "test"):
         es.delete_index(index_name)
     time.sleep(1)
 def test_search_index(self):
     """Index one document, search the index and check the first hit."""
     # The document has to exist before it can be searched for.
     es.create_index("searching")
     es.add_entry(index_name="searching", id=1, body={"name": "test"})
     time.sleep(1)
     hits = es.search_index(index_name="searching")['hits']['hits']
     print(hits[0]['_source'])
     self.assertIn('test', hits[0]['_source']['name'])
     es.delete_index("searching")
def aggregate_words(user_id, status):
    """Store one incoming tweet in the "-latest" index of its matching category.

    The user's categories are checked in turn; the first one whose name occurs
    in the lower-cased tweet text or in the lower-cased ``name`` field decides
    the target index ``<category>-latest``. A tweet that matches no category
    goes to ``unknown-latest``. The index is created, and the tweet is added
    under the id ``last_id(topic) + 1``.

    Args:
        user_id: Id used to look up the user's ``TwitterCat`` rows.
        status: Mapping describing the tweet; ``text`` and ``created`` are
            read here, ``name`` is read when present.

    Returns:
        None.
    """
    text = status['text'].lower()
    # ``name`` may be absent or None; treat both as "nothing to match against".
    name = (status.get('name') or '').lower()

    topic = "unknown-latest"
    for entry in TwitterCat.objects.filter(user_id=user_id):
        category = str(entry.category_name)
        # Test both fields separately: ``x in (text or name)`` only ever
        # searched the text, because ``or`` returns its first truthy operand.
        if category in text or category in name:
            print(status['created'])
            topic = entry.category_name + "-latest"
            break

    elastic_utils.create_index(topic)
    next_id = elastic_utils.last_id(topic) + 1
    elastic_utils.add_entry(topic, next_id, status)
Beispiel #7
0
def twittercat_list(request,
                    template_name='fyp/Category/twittercat_list.html'):
    """List the current user's categories with tweet and notification counts.

    For every category the tracked notifications are looked up (and deleted
    when the category name contains "whois" or "story"), the category's index
    and its "-latest" index are created, and two attributes are attached to
    the category object: ``count`` (entries in the "-latest" index) and
    ``tracked`` (number of tracked notifications).
    """
    categories = TwitterCat.objects.filter(user=request.user)
    for category in categories:
        name = category.category_name
        latest_name = name + "-latest"
        tracked = NotificationTracked.objects.filter(topic=name)
        # One delete per matching marker, in this order.
        for marker in ("whois", "story"):
            if marker in name:
                tracked.delete()
        elastic_utils.create_index(name)
        elastic_utils.create_index(latest_name)
        category.count = elastic_utils.count_entries(latest_name)["count"]
        category.tracked = len(tracked)
    return render(request, template_name, {'object_list': categories})
Beispiel #8
0
def twitteruser_suggest(
        request, template_name='fyp/twitteruser/twitteruser_suggest.html'):
    """Suggest categories from users' tweets, or save the chosen suggestions.

    Two POST forms are handled:

    * ``twitteruser-form``: the tweets of every selected user are collected
      and cleaned with the preprocessor, and the 25 most common remaining
      words longer than two characters are rendered as ``suggestions``.
    * ``suggestcat-form``: every selected category is stripped of the
      characters ``( ) ' ,``, saved as a ``TwitterCat`` for the current user,
      given an index, and queued for old-tweet collection.

    Any other request renders the page with the user's ``TwitterUser`` rows.
    """
    if request.method == 'POST':
        if 'twitteruser-form' in request.POST:
            all_tweets = []
            for user in request.POST.getlist('suggest-user'):
                all_tweets.extend(
                    collect_user_tweets.get_all_users_tweets(user))

            count_word_frequency = Counter()
            for tweet in all_tweets:
                text = preprocessor.preprocess(str(tweet.text))
                text = preprocessor.remove_stop_words(text)
                text = preprocessor.remove_ats(text)
                text = preprocessor.remove_hashtags(text)
                text = preprocessor.remove_urls(text)
                # Only words longer than two characters are counted.
                count_word_frequency.update(
                    word for word in text if len(word) > 2)

            suggestions = count_word_frequency.most_common(25)
            print(suggestions)
            return render(request, template_name, {
                'suggestions': suggestions,
                'object_list': TwitterUser.objects.filter(user=request.user)
            })
        if 'suggestcat-form' in request.POST:
            print(request.POST)
            for category in request.POST.getlist('suggest-category'):
                category = ''.join(c for c in category if c not in '()\',')
                if not category:
                    # Nothing left after sanitising: an empty string is not a
                    # usable category or index name.
                    continue
                entry = TwitterCat(user=request.user, category_name=category)
                entry.save()
                elastic_utils.create_index(category)
                collect_old_tweets.delay(category, 30)

    return render(request, template_name, {
        'object_list': TwitterUser.objects.filter(user=request.user)
    })
Beispiel #9
0
def twittercat_create(request,
                      template_name='fyp/Category/twittercat_form.html'):
    """Create a category for the current user from ``TwitterCatForm``.

    An unbound or invalid form is rendered back to the user. A valid form is
    saved with ``request.user`` as its owner and the user is redirected to the
    category list. Unless the POST data contains ``job``, the category's
    index and its "-latest" index are created as well.
    """
    form = TwitterCatForm(request.POST or None)
    # Validate before saving: calling save() on a form with errors raises
    # ValueError, which turned every invalid submission into a server error
    # instead of re-rendering the form with its messages.
    if not form.is_valid():
        return render(request, template_name, {'form': form})

    category = form.save(commit=False)
    category.user = request.user
    category.save()
    form.save_m2m()  # needed after commit=False to persist many-to-many data

    if 'job' not in request.POST:
        print("in here?")
        name = form.cleaned_data['category_name']
        elastic_utils.create_index(name + "-latest")
        elastic_utils.create_index(name)
    return redirect('fyp_webapp:twittercat_list')
def _normalise_term(term):
    """Return *term* with "." and "," spelled out as "dot" and "comma"."""
    # NOTE(review): presumably because these characters are awkward in
    # Elasticsearch field names - confirm.
    return term.replace(".", "dot").replace(",", "comma")


def _parse_word_counts(raw_words):
    """Yield ``(term, count_text)`` pairs from a stored ``words`` value.

    The value is flattened to text and split on the ``"], ["`` separators
    between pairs, exactly as before. Fragments without a count (for example
    from an empty word list) are skipped instead of raising IndexError.
    """
    # NOTE(review): this text-based parsing strips backslashes, so escaped
    # non-ASCII terms lose their escape prefix - consider json.loads once the
    # stored format is confirmed.
    flattened = json.dumps(raw_words)
    flattened = flattened.replace("\"[", "").replace("]\"", "")
    for fragment in flattened.split("], ["):
        for junk in ("[", "\"", "\\", "']", "]"):
            fragment = fragment.replace(junk, "")
        parts = fragment.split(", ")
        if len(parts) < 2:
            continue
        yield parts[0], parts[1]


def get_median(entry):
    """Compute tweet-volume medians for one index and store them.

    Every document in ``entry`` is read, newest ``date`` first. For each
    document with an hour breakdown:

    * each stored word count is collected per term (scaled up to a 24-hour
      estimate when fewer than 24 hours were recorded); the counts of the
      first processed document are also kept separately as ``yesterday_res``;
    * the day total (scaled the same way), the median tweets per hour and the
      derived per-minute estimate are recorded.

    The medians over all processed documents, the per-word data returned by
    ``add_zeros`` and ``yesterday_res`` are written to ``<entry>-median`` via
    ``elastic_utils.add_entry_median``.

    Args:
        entry: Name of the index to analyse.

    Returns:
        None.
    """
    day_breakdown = []
    hour_breakdown = []
    minute_breakdown = []
    latest_words = {}
    yesterday_res = {}

    day_res = elastic_utils.iterate_search(
        entry,
        query={
            "query": {"match_all": {}},
            "sort": [{"date": {"order": "desc"}}],
        })

    # Number of documents processed so far; 0 marks the most recent one.
    day = 0
    for latest in day_res:
        try:
            hours = latest["_source"]["hour_breakdown"]
        except (KeyError, TypeError):
            continue
        # Skip days recorded as having no tweets, and empty breakdowns, which
        # would otherwise divide by zero below.
        if hours == "No Tweets" or not hours:
            continue
        hour_count = len(hours)

        # Per-word counts for this day.
        for term, raw_total in _parse_word_counts(latest['_source']['words']):
            print(entry)
            total = int(raw_total)
            if hour_count < 24:
                # Extrapolate a partial day to a full 24 hours.
                total = int((total / hour_count) * 24)
            # Normalise BEFORE the lookup: checking the raw term first meant
            # terms containing "." or "," were never found, and their list
            # was reset on every document.
            key = _normalise_term(term)
            latest_words.setdefault(key, []).append(total)
            if day == 0:
                yesterday_res.setdefault(key, []).append(total)

        # Tweet volume over time for this day.
        day_total = latest["_source"]["total"]
        if hour_count != 24:
            # This is to combat when all entries aren't collected.
            day_total = (day_total / hour_count) * 24
        day_breakdown.append(day_total)

        hour_med = statistics.median(hours.values())
        hour_breakdown.append(hour_med)
        minute_breakdown.append(hour_med / 60)
        day += 1

    count = elastic_utils.count_entries(entry)
    word_stats = add_zeros(latest_words, count)
    totals_array = word_stats[0]
    standard_dev = word_stats[1]

    # statistics.median raises on empty input, so fall back to 0.
    day_median = statistics.median(day_breakdown) if day_breakdown else 0
    minute_median = (statistics.median(minute_breakdown)
                     if minute_breakdown else 0)
    hour_median = statistics.median(hour_breakdown) if hour_breakdown else 0
    five_min_median = minute_median * 5

    es_obj = {
        "index": entry,
        "day_median": day_median,
        "minute_median": minute_median,
        "hour_median": hour_median,
        "five_minute_median": five_min_median,
        "day_words_median": totals_array,
        "yesterday_res": yesterday_res,
        "standard_dev": standard_dev
    }
    median_index = entry + "-median"
    if "-median" not in entry:
        # A falsy result (None or False) is treated as "index missing".
        if not elastic_utils.check_index_exists(median_index):
            elastic_utils.create_index(median_index)
    elastic_utils.add_entry_median(median_index, es_obj)
 def setUp(self):
     """Create the ``test`` index used by the tests."""
     # Presumably run by the test framework before each test method
     # (assumes a unittest-style TestCase) - confirm against the class.
     es.create_index("test")
 def test_create_index(self):
     """Check that creating ``testcase`` reports the new index name."""
     creation_result = es.create_index("testcase")  # An index is made
     self.assertIn(" 'index': 'testcase'", creation_result)
     time.sleep(1)
     es.delete_index("testcase")