def collect_todays_tweets(entry):
    """Roll today's tweets for a topic into one summary document.

    Reads every tweet in the ``<entry>-latest`` index, builds an hourly
    count and a word-frequency table, appends the summary to the main
    ``entry`` index, then drops and recreates ``<entry>-latest`` so the
    next day starts empty.
    """
    word_counter = Counter()
    hour_break_dict = {}
    # Only operate on base topic indexes, never on derived ones.
    if "-latest" not in entry and "median" not in entry:
        # We first need to collect all of today's tweets.
        entry_total = elastic_utils.last_id(entry)
        if elastic_utils.check_index_exists(entry + "-latest") is True:
            total = elastic_utils.last_id(entry + "-latest")
            day_res = elastic_utils.iterate_search(
                entry + "-latest",
                query={
                    "query": {"match_all": {}},
                    "sort": [{"last_time": {"order": "desc"}}]
                })
            for hit in day_res:
                time_of_tweet = hit["_source"]["created"]
                datetime_object = datetime.strptime(
                    time_of_tweet, '%Y-%m-%d %H:%M:%S')
                # The last hit seen supplies date/last_time for the summary.
                dateobj = datetime_object.strftime("%Y-%m-%d")
                created_at = datetime_object.strftime("%Y-%m-%dT%H:%M:%S")
                # BUG FIX: the old Counter.update(str(hour)) counted the
                # *characters* of the hour string and its result was never
                # read; the real per-hour tally lives in hour_break_dict.
                hour_key = str(datetime_object.hour)
                hour_break_dict[hour_key] = hour_break_dict.get(hour_key, 0) + 1
                words = preprocessor.filter_multiple(
                    str(hit["_source"]["text"]), ats=True, hashtags=True,
                    stopwords=True, stemming=False, urls=True, singles=True)
                word_counter.update(words)
            # NOTE(review): if the -latest index exists but holds no tweets,
            # dateobj/created_at are unbound here — same as the original.
            freq_obj = {
                "hour_breakdown": hour_break_dict,
                "words": json.dumps(word_counter.most_common(400)),
                "total": total,
                "date": dateobj,
                "last_time": created_at
            }
            elastic_utils.add_entry(entry, entry_total + 1, freq_obj)
            elastic_utils.delete_index(entry + "-latest")
            try:
                elastic_utils.create_index(entry + "-latest")
            except Exception:  # narrowed from a bare except; failure is tolerable
                print(
                    "Todays index already exists! This is an exception, but it's probably ok"
                )
def oldtweets(request):
    """Django view: back-fill historic tweets one day at a time.

    On a valid POST, collects tweets matching the query for every day in
    the requested range and aggregates them into the chosen index.
    Always re-renders the form; ``words`` holds the last day's aggregate.
    """
    words = []
    if request.POST:
        completed_form = forms.OldTweetsForm(request.POST)
        if completed_form.is_valid():
            cleaned = completed_form.clean()
            index_name = cleaned['index_name']
            search_query = cleaned['query_search']
            start_date = cleaned['start_date']
            # The loop bound is exclusive: push it one day past the end.
            end_date = cleaned['end_date'] + datetime.timedelta(days=1)
            # BUG FIX: check_index_exists returns a boolean (compared with
            # `is True` / `== False` elsewhere in this file), so the old
            # `is None` test never fired and the index was never created.
            if not elastic_utils.check_index_exists(index_name):
                elastic_utils.create_index(index_name)
            while start_date != end_date:
                tweets = collect_tweets(
                    search_query, start_date,
                    start_date + datetime.timedelta(days=1))
                words = aggregate(tweets, index_name, start_date)
                start_date += datetime.timedelta(days=1)
    es_index_form = forms.OldTweetsForm()
    return render(request, "fyp/oldtweets/index.html", {
        "oldtweetsform": es_index_form,
        "words": words
    })
def test_list_all_index(self):
    """list_all_indexes reports every index that currently exists."""
    es.create_index("list_all")
    time.sleep(1)  # let the new index become visible before listing
    res = es.list_all_indexes()
    self.assertIn("'test'", res)
    self.assertIn("'list_all'", res)
    # BUG FIX: the old final assertIn("{}", res) re-checked the listing
    # captured *before* the deletes, so it could never reflect them.
    es.delete_index("list_all")
    es.delete_index("test")
    time.sleep(1)  # let the deletes settle before the next test's setUp
def test_list_all_index(self):
    """list_all_indexes reports every index that currently exists."""
    es.create_index("list_all")
    # BUG FIX: the sleep must come before the listing, not after it,
    # otherwise the freshly created index may not be visible yet.
    time.sleep(1)
    res = es.list_all_indexes()
    self.assertIn('test', res)
    self.assertIn('list_all', res)
    es.delete_index("list_all")
    es.delete_index("test")
    time.sleep(1)  # let the deletes settle before the next test's setUp
def test_search_index(self):
    """Documents added to an index should come back from search_index."""
    # Add the entry first, along with the index.
    doc = {"name": "test"}
    es.create_index("searching")
    es.add_entry(index_name="searching", id=1, body=doc)
    time.sleep(1)  # allow the new document to become searchable
    res = es.search_index(index_name="searching")
    # (removed the debug print of the hit source)
    self.assertIn('test', res['hits']['hits'][0]['_source']['name'])
    es.delete_index("searching")
def aggregate_words(user_id, status):
    """The aggregate_words task adds Tweets to Elasticsearch live from the
    Celery Queue, routing each tweet into its matching topic's -latest index.

    Unmatched tweets land in "unknown-latest".

    BUG FIX: the original tested ``keyword in (text or name)``. Because
    ``or`` returns its first truthy operand, the author name was never
    inspected — both fields are now checked.
    """
    topic = "unknown-latest"
    for entry in TwitterCat.objects.filter(user_id=user_id):
        keyword = str(entry.category_name)
        if keyword in status['text'].lower() or keyword in status['name'].lower():
            print(status['created'])
            topic = entry.category_name + "-latest"
            break
    # Called unconditionally, as the original did on both match paths.
    # Presumably a no-op when the index already exists — TODO confirm.
    elastic_utils.create_index(topic)
    next_id = elastic_utils.last_id(topic) + 1  # avoid shadowing builtin `id`
    elastic_utils.add_entry(topic, next_id, status)
def twittercat_list(request, template_name='fyp/Category/twittercat_list.html'):
    # Django view: list the current user's Twitter categories, annotating
    # each with its live ("-latest") entry count and notification count.
    cat = TwitterCat.objects.filter(user=request.user)
    data = {}
    for entry in cat:
        # Notifications recorded for this topic (lazy queryset).
        tracked = NotificationTracked.objects.filter(topic=entry.category_name)
        # "whois"/"story" topics never keep notifications — purge them.
        # NOTE(review): because the queryset is lazy, len(tracked) below
        # re-evaluates *after* delete() and reads 0 for these topics —
        # confirm this is the intended display value.
        if "whois" in entry.category_name:
            tracked.delete()
        if "story" in entry.category_name:
            tracked.delete()
        # Make sure both the main and the rolling "-latest" index exist.
        elastic_utils.create_index(entry.category_name)
        elastic_utils.create_index(entry.category_name + "-latest")
        count = elastic_utils.count_entries(entry.category_name + "-latest")["count"]
        # Annotations consumed by the template; not persisted to the DB.
        entry.count = count
        entry.tracked = len(tracked)
    data['object_list'] = cat
    return render(request, template_name, data)
def twitteruser_suggest(
        request, template_name='fyp/twitteruser/twitteruser_suggest.html'):
    """Django view: suggest categories from selected users' tweet words,
    or create the categories the user picked from those suggestions.

    Removed a dead ``all_text`` accumulator and a redundant list copy of
    ``text`` before the Counter update; behavior is otherwise unchanged.
    """
    if request.method == 'POST':
        if 'twitteruser-form' in request.POST:
            # Gather every selected user's tweets in one list.
            all_tweets = []
            user_list = request.POST.getlist('suggest-user')
            for user in user_list:
                all_tweets.extend(
                    collect_user_tweets.get_all_users_tweets(user))
            count_word_frequency = Counter()
            for tweet in all_tweets:
                text = preprocessor.preprocess(str(tweet.text))
                text = preprocessor.remove_stop_words(text)
                text = preprocessor.remove_ats(text)
                text = preprocessor.remove_hashtags(text)
                text = preprocessor.remove_urls(text)
                # Drop very short tokens before counting.
                text = [i for i in text if len(i) > 2]
                count_word_frequency.update(text)
            suggestions = count_word_frequency.most_common(25)
            cat = TwitterUser.objects.filter(user=request.user)
            return render(request, template_name, {
                'suggestions': suggestions,
                'object_list': cat
            })
        if 'suggestcat-form' in request.POST:
            category_list = request.POST.getlist('suggest-category')
            for category in category_list:
                # The picked value arrives as a repr-like string; strip
                # the "('...',)" punctuation down to the bare word.
                category = ''.join(c for c in category if c not in '()\',')
                entry = TwitterCat(user=request.user, category_name=category)
                entry.save()
                elastic_utils.create_index(category)
                collect_old_tweets.delay(category, 30)
    # Fall-through for GET and after category creation.
    cat = TwitterUser.objects.filter(user=request.user)
    data = {}
    data['object_list'] = cat
    return render(request, template_name, data)
def twittercat_create(request, template_name='fyp/Category/twittercat_form.html'):
    """Django view: create a Twitter category for the current user.

    BUG FIX: the original called ``form.save(commit=False)`` *before*
    ``form.is_valid()``; ModelForm.save raises ValueError on an invalid
    bound form, so bad input produced a 500 instead of re-rendering the
    form. Validation now happens first, and the duplicated job/non-job
    save paths are merged.
    """
    form = TwitterCatForm(request.POST or None)
    if form.is_valid():
        # Attach the requesting user before persisting.
        instance = form.save(commit=False)
        instance.user = request.user
        instance.save()
        # The 'job' submission skips index creation, as before.
        if 'job' not in request.POST:
            topic = form.cleaned_data['category_name'] + "-latest"
            elastic_utils.create_index(topic)
            elastic_utils.create_index(form.cleaned_data['category_name'])
            # collect_old_tweets.delay(form.cleaned_data['category_name'], 30)
        return redirect('fyp_webapp:twittercat_list')
    return render(request, template_name, {'form': form})
def get_median(entry):
    """Calculate daily/hourly/minute medians and word baselines for a topic.

    Walks every stored daily summary for ``entry``, rebuilds per-word daily
    totals from the serialized ``words`` field, scales partial days up to a
    full 24 hours, then writes the medians plus yesterday's word counts into
    the ``<entry>-median`` index.
    """
    day_breakdown = []     # tweets per (scaled) day
    hour_breakdown = []    # per-day median of tweets per hour
    minute_breakdown = []  # hour median / 60, one value per day
    latest_words = {}      # word -> list of daily totals
    day_res = elastic_utils.iterate_search(entry, query={
        "query": {"match_all": {}},
        "sort": [{"date": {"order": "desc"}}]
    })
    # Iterate through entries by date, newest first.
    day = 0
    yesterday_res = {}  # word totals for the most recent day only
    for latest in day_res:
        try:
            hours = latest["_source"]["hour_breakdown"]
        except KeyError:  # narrowed from bare except: summary had no tweets
            continue
        # (The old `hours != "No Tweets"` guard was dead: the except clause
        # always `continue`s, so it is dropped here.)
        # The words field is a JSON-ish string of [word, count] pairs;
        # strip the serialization noise and split into individual pairs.
        latest_ent = json.dumps(latest['_source']['words'])
        latest_ent = latest_ent.replace("\"[", "")
        latest_ent = latest_ent.replace("]\"", "")
        latest_ent = latest_ent.split("], [")
        for data in latest_ent:
            data = data.replace("[", "")
            data = data.replace("\"", "")
            data = data.replace("\\", "")
            data = data.replace("[\'", "")
            data = data.replace("\']", "")
            data = data.replace("]", "")
            term = data.split(", ")[0]
            total = int(data.split(", ")[1])
            # Scale a partial day's count up to a full 24 hours.
            if len(hours) < 24:
                total = (total / int(len(hours))) * 24
            # Normalize characters Elasticsearch field names reject.
            # BUG FIX: the original membership test used the *unnormalized*
            # term while the stored keys were normalized, so dotted/comma'd
            # words always missed and had their list reset every day.
            if "." in term:
                term = term.replace(".", "dot")
            elif "," in term:
                term = term.replace(",", "comma")
            latest_words.setdefault(term, []).append(int(total))
            # `day is 0` replaced with `==`: identity tests on ints are
            # an implementation accident, not guaranteed.
            if day == 0:
                yesterday_res.setdefault(term, []).append(int(total))
        # Per-day totals; scale up days with missing hours to 24h.
        if len(hours) == 24:  # `is 24` replaced for the same reason
            day_breakdown.append(latest["_source"]["total"])
        else:
            day_breakdown.append((latest["_source"]["total"] / len(hours)) * 24)
        # Median of the hourly counts captured for this day.
        todays_hours = sorted(hours.values())
        hour_med = statistics.median(todays_hours)
        hour_breakdown.append(hour_med)
        minute_breakdown.append(hour_med / 60)  # minute-level estimate
        day += 1
    # Now calculate the overall medians.
    day_breakdown.sort()
    minute_breakdown.sort()
    hour_breakdown.sort()
    five_min_median = 0
    count = elastic_utils.count_entries(entry)
    # add_zeros returns (padded totals, standard deviation).
    totals_array, standard_dev = add_zeros(latest_words, count)
    day_median = statistics.median(day_breakdown) if day_breakdown else 0
    if minute_breakdown:
        minute_median = statistics.median(minute_breakdown)
        five_min_median = minute_median * 5
    else:
        minute_median = 0
    hour_median = statistics.median(hour_breakdown) if hour_breakdown else 0
    es_obj = {
        "index": entry,
        "day_median": day_median,
        "minute_median": minute_median,
        "hour_median": hour_median,
        "five_minute_median": five_min_median,
        "day_words_median": totals_array,
        "yesterday_res": yesterday_res,
        "standard_dev": standard_dev
    }
    # Never derive a median index from another median index.
    if "-median" not in entry:
        if not elastic_utils.check_index_exists(entry + "-median"):
            elastic_utils.create_index(entry + "-median")
        elastic_utils.add_entry_median(entry + "-median", es_obj)
def setUp(self):
    # Every test starts from a fresh baseline index named "test".
    es.create_index("test")
def test_create_index(self):
    """create_index should acknowledge the new index in its response."""
    response = es.create_index("testcase")
    # The acknowledgement string names the index that was made.
    self.assertIn(" 'index': 'testcase'", response)
    time.sleep(1)  # let the cluster settle before cleanup
    es.delete_index("testcase")