def execute_all_term_functions(self, index, number_word_frequency_results=10):
    """Scan every document in *index* once, computing both the longest
    preprocessed tweet and the most common non-stopword terms.

    Args:
        index: Name of the Elasticsearch index to iterate over.
        number_word_frequency_results: How many (term, count) pairs to
            include in the result (default 10).

    Returns:
        dict with keys:
            "word_frequency": list of (term, count) tuples, most common first.
            "max_sentence_size": token length of the longest preprocessed tweet.
    """
    current_max_sentence_size = 0
    count_word_frequency = Counter()
    res = es.iterate_search(index_name=index)
    for entry in res:
        # Preprocess once per document — the original called
        # preprocessor.preprocess() twice on the same text (once for the
        # length check, once for term counting). Debug print removed.
        tokens = preprocessor.preprocess(entry['_source']['text'])
        # Step 1. Track the longest tokenized tweet seen so far.
        if len(tokens) > current_max_sentence_size:
            current_max_sentence_size = len(tokens)
        # Step 2. Count term frequencies, skipping stopwords.
        terms_all = [term for term in tokens if term not in stop]
        count_word_frequency.update(terms_all)
    # Renamed from `dict`, which shadowed the builtin.
    results = {
        "word_frequency":
            count_word_frequency.most_common(number_word_frequency_results),
        "max_sentence_size": current_max_sentence_size,
    }
    return results

def max_tweet_sentence_size(self, filename):
    # TODO need to add new function to support elasticsearch first
    return -1

def count_word_frequency(self, filename):
    # TODO need to add new function to support elasticsearch first
    return -1

def most_common_words(self, num_results, filename):
    # TODO need to add new function to support elasticsearch first
    return -1
def test():
    """Collect cleaned tweet texts from the configured topic index and
    hand them to the LDA test runner."""
    texts = []
    res = elastic_utils.iterate_search(
        index_name=cfg.twitter_credentials['topic'])
    for hit in res:
        cleaned = preprocessor.preprocess(hit['_source']['text'])
        cleaned = preprocessor.remove_stop_words(cleaned)  # strip stop words
        cleaned = preprocessor.remove_urls(cleaned)        # strip urls
        cleaned = preprocessor.remove_ats(cleaned)         # strip @usernames
        # strip hashtags — TODO this might be useful, revisit
        cleaned = preprocessor.remove_hashtags(cleaned)
        texts.append(cleaned)
    doc_2_vec = testlda.run(texts)
def count_words(number_word_frequency_results, list_in_question):
    """Count term frequencies across a collection of raw texts.

    Args:
        number_word_frequency_results: How many (term, count) pairs to return.
        list_in_question: Iterable of raw text strings to preprocess and count.

    Returns:
        List of (term, count) tuples, most common first.
    """
    # Ensure the NLTK stopword corpus is present (no-op when already cached).
    nltk.download('stopwords')
    punctuation = list(string.punctuation)
    # Set gives O(1) membership tests; the original used a list, costing
    # O(len(stop)) per token. Extra tokens mirror the original list exactly.
    stop = set(stopwords.words('english') + punctuation + [
        'rt', 'via', '…', 'I', '’', 'The', '!'
    ])
    count_word_frequency = Counter()
    for entry in list_in_question:
        # Debug prints removed — they dumped every raw entry (and its type)
        # to stdout on each iteration.
        terms_all = [
            term for term in preprocessor.preprocess(entry) if term not in stop
        ]
        count_word_frequency.update(terms_all)
    return count_word_frequency.most_common(number_word_frequency_results)
def twitteruser_suggest(
        request, template_name='fyp/twitteruser/twitteruser_suggest.html'):
    """Django view: suggest topic categories from selected users' tweets,
    or persist the categories the user picked from those suggestions.

    POST 'twitteruser-form': fetch all tweets of each selected user, clean
    the text, and render the 25 most frequent terms as suggestions.
    POST 'suggestcat-form': save each chosen category, create a matching
    Elasticsearch index, and dispatch a 30-day historical tweet collection
    task; then fall through to the default render.
    Any other request renders the user's TwitterUser list.
    """
    if request.method == 'POST':
        if 'twitteruser-form' in request.POST:
            all_tweets = []
            all_text = []
            user_list = request.POST.getlist('suggest-user')
            # Gather every tweet from every selected user.
            for user in user_list:
                all_tweets.extend(
                    collect_user_tweets.get_all_users_tweets(user))
            count_word_frequency = Counter()
            for tweet in all_tweets:
                # Clean each tweet: tokenize, then strip stop words,
                # @mentions, hashtags, and URLs.
                text = preprocessor.preprocess(str(tweet.text))
                text = preprocessor.remove_stop_words(text)
                text = preprocessor.remove_ats(text)
                text = preprocessor.remove_hashtags(text)
                text = preprocessor.remove_urls(text)
                # Drop very short tokens (2 chars or fewer).
                text = [i for i in text if len(i) > 2]
                all_text.extend(text)
                terms_all = [term for term in text]
                count_word_frequency.update(terms_all)
            # The 25 most frequent terms become the suggested categories.
            suggestions = count_word_frequency.most_common(25)
            print(suggestions)
            cat = TwitterUser.objects.filter(user=request.user)
            data = {}
            data['object_list'] = cat
            return render(request, template_name, {
                'suggestions': suggestions,
                'object_list': data['object_list']
            })
        if 'suggestcat-form' in request.POST:
            print(request.POST)
            category_list = request.POST.getlist('suggest-category')
            for category in category_list:
                # Strip tuple-repr artifacts ( ) ' , from the posted value.
                category = ''.join(c for c in category if c not in '()\',')
                entry = TwitterCat(user=request.user, category_name=category)
                entry.save()
                # Each new category gets its own index plus an async
                # 30-day backfill (celery task).
                elastic_utils.create_index(category)
                collect_old_tweets.delay(category, 30)
    # Default render path (GET, or after suggestcat-form processing).
    cat = TwitterUser.objects.filter(user=request.user)
    data = {}
    data['object_list'] = cat
    return render(request, template_name, data)
def word_cloud(id, topic):
    """Build word-cloud data for a user's Twitter categories.

    For each TwitterCat owned by the user, stem the category name, run a
    prefix query against the topic index, and record the hit total.
    Progress is pushed to the task backend after every category.

    Returns:
        Tuple of (list of stemmed category names, JSON string mapping each
        name to its hit total).
    """
    totals = {}
    names = []
    for record in TwitterCat.objects.filter(user_id=id):
        # Tokenize and stem the category name, then strip the list-repr
        # characters so it can be embedded in the query string.
        stemmed = preprocessor.porter_stemming(
            preprocessor.preprocess(record.category_name))
        stemmed = ''.join(c for c in stemmed if c not in '[]\'')
        res = elastic_utils.search_index(
            topic,
            query='{"query":{"query_string":{"fields":["text"],"query":"%s*"}}}'
            % str(stemmed))
        totals[stemmed] = res['hits']['total']
        names.append(stemmed)
        # Report incremental progress after each category.
        current_task.update_state(state='PROGRESS',
                                  meta={
                                      'current_categories': names,
                                      'current_results': totals
                                  })
    return (names, json.dumps(totals))