def collect_todays_tweets(entry):
    """Collects todays tweets for every topic."""
    count_word_frequency = Counter()
    word_counter = Counter()
    hour_break_dict = {}
    if ("-latest") not in entry:
        if ("median") not in entry:
            # we frst need to collect all todays tweets
            entry_total = elastic_utils.last_id(entry)
            if elastic_utils.check_index_exists(entry + "-latest") is True:
                total = elastic_utils.last_id(entry + "-latest")
                day_res = elastic_utils.iterate_search(entry + "-latest",
                                                       query={
                                                           "query": {
                                                               "match_all": {}
                                                           },
                                                           "sort": [{
                                                               "last_time": {
                                                                   "order":
                                                                   "desc"
                                                               }
                                                           }]
                                                       })
                for test in day_res:
                    time_of_tweet = test["_source"]["created"]
                    datetime_object = datetime.strptime(
                        time_of_tweet, '%Y-%m-%d %H:%M:%S')
                    dateobj = datetime_object.strftime("%Y-%m-%d")
                    created_at = datetime_object.strftime("%Y-%m-%dT%H:%M:%S")
                    # Wrap the hour in a list so Counter counts it as one
                    # token instead of counting its individual characters.
                    count_word_frequency.update([str(datetime_object.hour)])
                    if str(datetime_object.hour) in hour_break_dict:
                        hour_break_dict[str(datetime_object.hour)] += 1
                    else:
                        hour_break_dict[str(datetime_object.hour)] = 1

                    words = preprocessor.filter_multiple(str(
                        test["_source"]["text"]),
                                                         ats=True,
                                                         hashtags=True,
                                                         stopwords=True,
                                                         stemming=False,
                                                         urls=True,
                                                         singles=True)
                    word_counter.update(words)
                # Write a single daily summary once all of today's tweets
                # have been counted, then drop the "-latest" index so it can
                # be recreated fresh below.
                if hour_break_dict:
                    freq_obj = {
                        "hour_breakdown": hour_break_dict,
                        "words": json.dumps(word_counter.most_common(400)),
                        "total": total,
                        "date": dateobj,
                        "last_time": created_at
                    }
                    elastic_utils.add_entry(entry, entry_total + 1, freq_obj)
                    elastic_utils.delete_index(entry + "-latest")
                try:
                    elastic_utils.create_index(entry + "-latest")
                except Exception:
                    print(
                        "Today's index already exists! This is an exception, but it's probably ok"
                    )
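
# A minimal usage sketch (assuming the elastic_utils.list_all_indexes helper
# that check_index() below relies on): roll up today's tweets for every
# index; the function itself skips the derived "-latest" and "median"
# indexes.
for entry in elastic_utils.list_all_indexes():
    collect_todays_tweets(entry)
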
def execute_all_term_functions(self, index, number_word_frequency_results=10):
    current_max_sentence_size = 0
    count_word_frequency = Counter()
    res = es.iterate_search(index_name=index)
    for entry in res:
        # Step 1. Track the max sentence size as we go.
        current_tweet = preprocessor.preprocess(entry['_source']['text'])
        if len(current_tweet) > current_max_sentence_size:
            current_max_sentence_size = len(current_tweet)

        # Step 2. Count the word frequencies, skipping stop words.
        terms_all = [term for term in current_tweet if term not in stop]
        # Update the counter
        count_word_frequency.update(terms_all)
    result = {
        "word_frequency":
        count_word_frequency.most_common(number_word_frequency_results),
        "max_sentence_size": current_max_sentence_size
    }
    return result


def max_tweet_sentence_size(self, filename):
    # TODO: need to add a new function to support elasticsearch first
    return -1

def count_word_frequency(self, filename):
    return -1

def most_common_words(self, num_results, filename):
    # TODO: need to add a new function to support elasticsearch first
    return -1
def test():
    texts = []
    res = elastic_utils.iterate_search(
        index_name=cfg.twitter_credentials['topic'])
    for i in res:
        processed_text = preprocessor.preprocess(i['_source']['text'])
        processed_text = preprocessor.remove_stop_words(processed_text)  # remove stop words
        processed_text = preprocessor.remove_urls(processed_text)  # remove URLs
        processed_text = preprocessor.remove_ats(processed_text)  # remove @username mentions
        processed_text = preprocessor.remove_hashtags(processed_text)  # remove hashtags  # TODO: these might be useful
        texts.append(processed_text)
    doc_2_vec = testlda.run(texts)
    return doc_2_vec
def setup_charts(cat):
    """Sets up the data for the charts on the front end."""
    tot = len(cat)
    entries_arrays = []
    i = 0
    for mod in cat:
        current_entry = []
        current_entry.append(mod)
        res = elastic_utils.iterate_search(index_name=mod,
                                           query={
                                               "query": {
                                                   "match_all": {}
                                               },
                                               "sort": [{
                                                   "last_time": {
                                                       "order": "desc"
                                                   }
                                               }],
                                               "size": 20,
                                           })
        i += 1
        for entry in res:
            current_entry.append(entry["_source"]["total"])
        if i != tot:
            current_task.update_state(state='PROGRESS',
                                      meta={
                                          'current_percentage':
                                          (i / tot) * 100,
                                          'current_entry': mod,
                                          "chart_data": entries_arrays
                                      })
        else:
            current_task.update_state(state='PROGRESS',
                                      meta={
                                          'current_percentage':
                                          (i / tot) * 100,
                                          'current_entry': mod,
                                          "chart_data": entries_arrays,
                                          "latest_chart_data": current_entry,
                                          "test": 'Finished'
                                      })

        entries_arrays.append(current_entry)
    print("task finished.")
    return entries_arrays
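
# For reference, each element of the returned entries_arrays is the topic
# name followed by up to 20 recent totals (newest first, given the last_time
# sort), e.g. ["python", 523, 498, 510, ...] -- the values here are
# illustrative only.
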
def run_tf_idf(n_clusters, n_init, verbose):
    max_features = 10000

    # Create TF-IDF of texts
    tfidf = TfidfVectorizer(max_df=0.95,
                            min_df=2,
                            stop_words='english',
                            max_features=max_features)
    texts = []
    res = elastic_utils.iterate_search(
        index_name=cfg.twitter_credentials['topic'])
    for i in res:
        texts.append(i['_source']['text'])

    tfidf_vector = tfidf.fit_transform(texts)

    # Use the caller-supplied parameters rather than hard-coded values.
    km = KMeans(n_clusters=n_clusters, init='k-means++', n_init=n_init,
                verbose=verbose)
    km.fit(tfidf_vector)
    result = {"model": km, "texts": texts}
    pickle.dump(result, open("save.p", "wb"))
    return result
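
# A minimal sketch of consuming the pickled result. Note that the fitted
# TfidfVectorizer is not saved, so new texts cannot be vectorised from this
# file alone; only the stored texts and their cluster labels are available.
import pickle

with open("save.p", "rb") as f:
    saved = pickle.load(f)
km, texts = saved["model"], saved["texts"]
for text, label in list(zip(texts, km.labels_))[:5]:
    print(label, text[:60])
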
def check_index():
    """Check index is the main algorithm. It will detect trends in real time. This task runs every 5 minutes."""
    index = elastic_utils.list_all_indexes()
    ts = datetime.now() - timedelta(minutes=5)
    total_count = 0
    for entry in index:
        word_counter = Counter()
        if ("-latest") not in entry:
            if ("median") not in entry:
                if elastic_utils.check_index_exists(entry + "-latest") is True:
                    total = elastic_utils.last_id(entry + "-latest")
                    day_res = elastic_utils.iterate_search(
                        entry + "-latest",
                        query={
                            "query": {
                                "match_all": {}
                            },
                            "sort": [{
                                "created.keyword": {
                                    "order": "desc"
                                }
                            }]
                        })
                    total_in_five = 0
                    tweet_list = []
                    name = []
                    for item in day_res:
                        time_of_tweet = item["_source"]["created"]
                        datetime_object = datetime.strptime(
                            time_of_tweet, '%Y-%m-%d %H:%M:%S')
                        if datetime_object > ts:
                            if name.count(item["_source"]["name"]) < 3:
                                name.append(item["_source"]["name"])
                                tweet_list.append(str(item["_source"]["text"]))
                                total_in_five += 1
                                words = preprocessor.filter_multiple(
                                    str(item["_source"]["text"]),
                                    ats=True,
                                    hashtags=True,
                                    stopwords=True,
                                    stemming=False,
                                    urls=True,
                                    singles=True)
                                # Deduplicate terms within a tweet before counting.
                                word_counter.update(set(words))
                            else:
                                break  #stop iterating through every entry. This will save a lot of time.
                    res = elastic_utils.iterate_search(entry + "-median")
                    potential_keywords = []
                    for median in res:
                        breakdown = median["_source"]["five_minute_median"]
                        if (total_in_five == 0):
                            total_five_ratio = 0
                        elif (breakdown == 0):
                            total_five_ratio = 0
                        elif (breakdown < 1):
                            total_five_ratio = 1
                        else:
                            total_five_ratio = total_in_five / breakdown
                        if (total_five_ratio > 2.0):
                            potential_keywords.append(
                                (entry, total_five_ratio, entry, "Monthly"))
                    # There is one median document per topic, so `median`
                    # still refers to it after the loop above.
                    yesterdays_res = median["_source"]["yesterday_res"]

                    for key, value in word_counter.items():
                        current_word = value
                        if (current_word > 5):
                            if key in yesterdays_res:
                                # Scale yesterday's daily count down to an
                                # expected five-minute rate.
                                expected_in_five = (
                                    (yesterdays_res[key][0] / 24) / 60) * 5
                                current_word_ratio = current_word / expected_in_five
                                if key == entry:
                                    if (current_word_ratio > 2.5):
                                        potential_keywords.append(
                                            (entry, current_word_ratio, key,
                                             "Yesterday"))
                                        continue
                                elif (current_word_ratio > 2.0):
                                    potential_keywords.append(
                                        (entry, current_word_ratio, key,
                                         "Yesterday"))
                        existing_words = median["_source"]["day_words_median"]
                        existing_dev = median["_source"]["standard_dev"]

                        if (current_word > 5):
                            if key in existing_words:
                                existing_val = existing_words[key]
                                existing_val = ((existing_val / 24) / 60) * 5
                                standard_dev_5_mins = (
                                    (existing_dev[key] / 24) / 60) * 5
                                compared_to_monthly_ratio = current_word / existing_val
                                # Flag words more than two standard deviations
                                # above the expected five-minute count.
                                threshold = existing_val + 2 * standard_dev_5_mins
                                if (current_word > threshold):
                                    potential_keywords.append(
                                        (entry, current_word - threshold, key,
                                         "Deviation"))
                                if (compared_to_monthly_ratio > 1.9):
                                    potential_keywords.append(
                                        (entry, compared_to_monthly_ratio, key,
                                         "Monthly"))
                        if (current_word > 6 and key not in existing_words
                                and key not in yesterdays_res):
                            potential_keywords.append(
                                (entry, current_word, key, "No Entries"))
                    notification = check_percentage(entry, tweet_list,
                                                    potential_keywords)

                    if "total" in notification:
                        print("--------")
                        print(notification)
                        print(notification["total"])
                        total_count += notification["total"]
    data = json.dumps({'job': total_count})
    Group('notifications').send({'text': data})
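
# A worked example of the rate conversion check_index() relies on (numbers
# are illustrative, not from the source): a word counted 576 times yesterday
# is expected (576 / 24 / 60) * 5 = 2 times per five-minute window, so
# seeing it 5 times in the last window gives a ratio of 2.5, which clears
# the 2.0 trending threshold used above.
daily_count = 576
expected_in_five = (daily_count / 24 / 60) * 5  # 2.0 expected occurrences
current_count = 5
print(current_count / expected_in_five)  # 2.5 -> flagged as trending
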
def elastic_info(index_list):
    """Displays statistics from the topics."""
    final_res = []
    current_entry = 0
    all_entries = []
    for entry in index_list:
        index_dict = {}
        all_entries.append(entry)
        index_dict["name"] = {}
        index_dict["current_entry"] = entry
        if current_entry == 0:
            current_task.update_state(state='PROGRESS',
                                      meta={
                                          'current_percentage': 0,
                                          "current_entry": entry
                                      })

        res = elastic_utils.search_index(entry,
                                         query={
                                             "query": {
                                                 "match_all": {}
                                             },
                                             "sort": [{
                                                 "date": {
                                                     "order": "desc"
                                                 }
                                             }],
                                             "size": 10
                                         })

        current_array = []
        for current in res["hits"]["hits"]:
            test = {}
            test["date"] = current["_source"]["date"]
            test["total"] = current["_source"]["total"]
            test["last_collected"] = current["_source"]["last_time"]
            current_array.append(test)
        index_dict["name"]["current"] = current_array

        median_array = []
        res_median = elastic_utils.iterate_search(entry + "-median")
        for median in res_median:
            med = {}
            med["day_median"] = median["_source"]["day_median"]
            med["hour_median"] = median["_source"]["hour_median"]
            med["minute_median"] = median["_source"]["minute_median"]
            median_array.append(med)
        index_dict["name"]["median"] = median_array
        res_latest = elastic_utils.search_index(entry + "-latest",
                                                query={
                                                    "query": {
                                                        "match_all": {}
                                                    },
                                                    "sort": [{
                                                        "created.keyword": {
                                                            "order": "desc"
                                                        }
                                                    }],
                                                    "size": 5
                                                })

        latest_array = []
        for item in res_latest["hits"]["hits"]:
            cur_entry = {}
            cur_entry["created"] = item["_source"]["created"]
            cur_entry["text"] = item["_source"]["text"]
            cur_entry["image"] = item["_source"]["profile_picture"]
            cur_entry["name"] = item["_source"]["name"]
            latest_array.append(cur_entry)

        index_dict["name"]["latest"] = latest_array

        all_entries.append(latest_array)
        if current_entry != 0:
            current_task.update_state(
                state='PROGRESS',
                meta={
                    'current_percentage':
                    (current_entry / len(index_list)) * 100,
                    'current_entry': entry,
                    'final_res': final_res
                })
        current_entry += 1
        final_res.append(index_dict)
    print(len(final_res))
    return final_res
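
# For reference, a sketch (field names taken from the code above) of one
# element of the list elastic_info() returns:
#
# {
#     "current_entry": "<topic>",
#     "name": {
#         "current": [{"date": ..., "total": ..., "last_collected": ...}],
#         "median": [{"day_median": ..., "hour_median": ..., "minute_median": ...}],
#         "latest": [{"created": ..., "text": ..., "image": ..., "name": ...}]
#     }
# }
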
def get_median(entry):
    """Calculates the median for every topic."""
    # Now get yesterdays entries
    #I need to keep track of the value for words over each day, and also need day/hour breakdowns for each entry.
    day_breakdown = []
    hour_breakdown = []
    minute_breakdown = []
    latest_words = {}

    day_res = elastic_utils.iterate_search(entry,
                                           query={
                                               "query": {
                                                   "match_all": {}
                                               },
                                               "sort": [{
                                                   "date": {
                                                       "order": "desc"
                                                   }
                                               }]
                                           })

    ##iterate through entries by date.
    day = 0
    yesterday_res = {}
    for latest in day_res:
        try:
            hours = latest["_source"]["hour_breakdown"]
        except KeyError:
            hours = "No Tweets"
            continue
        #This is a words setup.
        if (hours != "No Tweets"):
            latest_ent = json.dumps(latest['_source']['words'])
            latest_ent = latest_ent.replace("\"[", "")
            latest_ent = latest_ent.replace("]\"", "")
            latest_ent = (latest_ent.split("], ["))

            for data in latest_ent:
                data = data.replace("[", "")
                data = data.replace("\"", "")
                data = data.replace("\\", "")
                data = data.replace("[\'", "")
                data = data.replace("\']", "")
                data = data.replace("]", "")
                terms_all = [data.split(", ")[0]]
                total = [data.split(", ")[1]]
                if len(hours) < 24:
                    # Scale up partially collected days to a 24-hour estimate.
                    total[0] = (int(total[0]) / int(len(hours))) * 24
                # Normalise the key before any dictionary lookups so that
                # membership tests and inserts use the same spelling.
                if "." in terms_all[0]:
                    terms_all[0] = terms_all[0].replace(".", "dot")
                elif "," in terms_all[0]:
                    terms_all[0] = terms_all[0].replace(",", "comma")
                latest_words.setdefault(terms_all[0], []).append(int(total[0]))
                if day == 0:
                    yesterday_res.setdefault(terms_all[0],
                                             []).append(int(total[0]))

        #Now dealing with the breakdown over time
            if len(hours) == 24:
                day_breakdown.append(latest["_source"]["total"])
            else:
                day_b = (
                    (latest["_source"]["total"] / len(hours)) * 24
                )  # Compensates for days where not every hour was collected.
                day_breakdown.append(day_b)
            todays_hours = []  # A list of all the hours captured for this day's total.
            for test in hours:
                todays_hours.append(hours[test])
            todays_hours.sort()
            hour_med = statistics.median(
                todays_hours
            )  # gets the median for the hours for the specific day
            minute_estimate = hour_med / 60  # divide by 60 to get a minutes median
            hour_breakdown.append(hour_med)
            minute_breakdown.append(minute_estimate)
            day += 1

    #Now to calculate setup.
    day_breakdown.sort()
    minute_breakdown.sort()
    hour_breakdown.sort()
    five_min_median = 0
    count = elastic_utils.count_entries(entry)
    totals_array, standard_dev = add_zeros(latest_words, count)

    five_min_word_breakdown = {}

    if (len(day_breakdown) != 0):
        day_median = statistics.median(day_breakdown)
    else:
        day_median = 0
    if (len(minute_breakdown) != 0):
        minute_median = statistics.median(minute_breakdown)
        five_min_median = minute_median * 5
    else:
        minute_median = 0
    if (len(hour_breakdown) != 0):
        hour_median = statistics.median(hour_breakdown)
    else:
        hour_median = 0
    es_obj = {
        "index": entry,
        "day_median": day_median,
        "minute_median": minute_median,
        "hour_median": hour_median,
        "five_minute_median": five_min_median,
        "day_words_median": totals_array,
        "yesterday_res": yesterday_res,
        "standard_dev": standard_dev
    }
    if "-median" not in entry:
        if elastic_utils.check_index_exists(entry + "-median") == False:
            elastic_utils.create_index(entry + "-median")
    elastic_utils.add_entry_median(entry + "-median", es_obj)
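
# add_zeros() is defined elsewhere in the project. From its use above, it
# plausibly pads each word's list of daily counts with zeros up to `count`
# days and returns per-word medians and standard deviations. A minimal
# sketch of that assumed contract (the name and behaviour here are an
# assumption, not the project's actual helper):
import statistics


def add_zeros_sketch(latest_words, count):
    medians, deviations = {}, {}
    for word, totals in latest_words.items():
        padded = totals + [0] * (count - len(totals))  # one slot per day
        medians[word] = statistics.median(padded)
        deviations[word] = statistics.pstdev(padded)
    return medians, deviations
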
def run_nmf(n_samples, n_features, n_components, n_top_words):
    texts = []
    res = elastic_utils.iterate_search(
        index_name=cfg.twitter_credentials['topic'])
    for i in res:
        texts.append(i['_source']['text'])

    # Use tf-idf features for NMF.
    print("Extracting tf-idf features for NMF...")
    tfidf_vectorizer = TfidfVectorizer(max_df=0.95,
                                       min_df=2,
                                       max_features=n_features,
                                       stop_words='english')
    t0 = time()
    tfidf = tfidf_vectorizer.fit_transform(texts)
    print("done in %0.3fs." % (time() - t0))

    # Use tf (raw term count) features for LDA.
    print("Extracting tf features for LDA...")
    tf_vectorizer = CountVectorizer(max_df=0.95,
                                    min_df=2,
                                    max_features=n_features,
                                    stop_words='english')
    t0 = time()
    tf = tf_vectorizer.fit_transform(texts)
    print("done in %0.3fs." % (time() - t0))
    print()

    # Fit the NMF model
    print("Fitting the NMF model (Frobenius norm) with tf-idf features, "
          "n_samples=%d and n_features=%d..." % (n_samples, n_features))
    t0 = time()
    nmf = NMF(n_components=n_components, random_state=1, alpha=.1,
              l1_ratio=.5).fit(tfidf)
    print("done in %0.3fs." % (time() - t0))

    print("\nTopics in NMF model (Frobenius norm):")
    tfidf_feature_names = tfidf_vectorizer.get_feature_names()
    print_top_words(nmf, tfidf_feature_names, n_top_words)

    # Fit the NMF model
    print(
        "Fitting the NMF model (generalized Kullback-Leibler divergence) with "
        "tf-idf features, n_samples=%d and n_features=%d..." %
        (n_samples, n_features))
    t0 = time()
    nmf = NMF(n_components=n_components,
              random_state=1,
              beta_loss='kullback-leibler',
              solver='mu',
              max_iter=1000,
              alpha=.1,
              l1_ratio=.5).fit(tfidf)
    print("done in %0.3fs." % (time() - t0))

    print("\nTopics in NMF model (generalized Kullback-Leibler divergence):")
    tfidf_feature_names = tfidf_vectorizer.get_feature_names()
    print_top_words(nmf, tfidf_feature_names, n_top_words)

    # The NMF model was fitted on the tf-idf features, so use the matching
    # vocabulary and representation here (the raw `tf` counts above were
    # extracted for an LDA model that this snippet never fits).
    categories = print_top_words(nmf, tfidf_feature_names, n_top_words)
    predict = nmf.transform(tfidf)
    result = {"predictions": predict, "text": texts, "categories": categories}
    return result
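
# A minimal sketch of consuming run_nmf()'s result (this assumes, as the
# code implies, that print_top_words returns per-component category labels):
import numpy as np

result = run_nmf(n_samples=2000, n_features=1000, n_components=10,
                 n_top_words=20)
dominant = np.argmax(result["predictions"], axis=1)  # strongest topic per text
for text, topic in list(zip(result["text"], dominant))[:5]:
    print(topic, text[:60])
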
def timeline(request):
    if request.POST:
        print(request.POST)
        answer = request.POST['dropdown']
        cat = TwitterCat.objects.filter(category_name=answer)
    else:
        cat = TwitterCat.objects.filter(user=request.user)

    # Both branches fetch the same data for the first matching category.
    name = ""
    for mod in cat:
        res = elastic_utils.iterate_search(index_name=mod.category_name,
                                           query={
                                               "size": 20,
                                               "query": {
                                                   "match_all": {}
                                               },
                                               "sort": [{
                                                   "date": {
                                                       "order": "desc"
                                                   }
                                               }],
                                           })
        med = elastic_utils.search_index(index_name=mod.category_name +
                                         "-median")
        name = mod.category_name
        break

    cat = TwitterCat.objects.filter(user=request.user)
    data = {}
    i = 0
    for entry in res:
        temp_data = {}
        for hour in entry["_source"]["hour_breakdown"]:
            temp_data[int(hour)] = (entry["_source"]["hour_breakdown"][hour])
        data[entry["_source"]["date"]] = temp_data
        i += 1
        if i == 20:
            break

    day_median = med["hits"]["hits"][0]["_source"]["day_median"]
    hour_median = med["hits"]["hits"][0]["_source"]["hour_median"]
    minute_median = med["hits"]["hits"][0]["_source"]["minute_median"]
    hour_med_tresh = round(hour_median * 2, 2)
    minute_med_tresh = round(minute_median * 2, 2)
    day_med_tresh = round(day_median * 1.5, 2)

    print(hour_med_tresh)
    return render(
        request, "fyp/timeline/index.html", {
            "data": data,
            "name": name,
            "cats": cat,
            "hour_med_tresh": hour_med_tresh,
            "minute_med_tresh": minute_med_tresh,
            "day_med_tresh": day_med_tresh
        })
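
# For reference, the template receives `data` shaped as
# {"<date>": {<hour int>: <tweet count>, ...}, ...} alongside the three
# *_med_tresh threshold values derived from the stored medians, e.g.
# data = {"2019-03-01": {0: 12, 1: 7}, "2019-03-02": {0: 9}}  (illustrative).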