def find_top_k_entities(tweets, entity_key, k):
    '''
    Find the K most frequently occurring entities.

    Inputs:
        tweets: a list of tweets
        entity_key: a pair ("hashtags", "text"),
            ("user_mentions", "screen_name"), etc.
        k: integer

    Returns: list of entity, count pairs
    '''
    entity_type, value_key = entity_key
    values = []
    for tweet in tweets:
        for entity in tweet['entities'][entity_type]:
            values.append(entity[value_key])
    entity_list = [value.lower() for value in values]
    return find_top_k(entity_list, k)

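# Every submission in this section delegates the counting step to find_top_k,
# which is defined elsewhere in the assignment. The sketch below is a guess at
# its contract, inferred from the call sites and docstrings in this section:
# count occurrences and return the k most common (item, count) pairs, or all
# pairs when k < 0. Tie-breaking and the exact sort order of the real helper
# may differ.
from collections import Counter

def find_top_k(items, k):
    '''
    Return (item, count) pairs for the k most frequent items in the list;
    if k < 0, return counts for all items in non-increasing order by count.
    '''
    counts = Counter(items)
    if k < 0:
        return sorted(counts.items(), key=lambda pair: pair[1], reverse=True)
    return counts.most_common(k)

# Quick check of the sketched behavior on a flat list of entity values:
# find_top_k(["python", "data", "python", "tweets", "python", "data"], 2)
# would yield [("python", 3), ("data", 2)].
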
def find_top_k_ngrams_by_month(tweets, n, stop_words, stop_prefixes, k):
    '''
    Find the top k n-grams for each month.

    Inputs:
        tweets: list of tweet dictionaries
        n: integer
        stop_words: a set of strings to ignore
        stop_prefixes: a set of strings. Words with a prefix that
            appears in this list should be ignored.
        k: integer

    Returns: sorted list of pairs. Each pair has the form:
        ((year, month), (sorted top-k n-grams for that month with their counts))
    '''
    time_dict = {}
    result = []
    # Group the tweets by (year, month).
    for tweet in tweets:
        year_month = grab_year_month(tweet['created_at'])
        if year_month not in time_dict:
            time_dict[year_month] = []
        time_dict[year_month].append(tweet)
    # Compute the top-k n-grams for each month's tweets.
    for date, tweet_list in time_dict.items():
        ngrams = ngram_helper(tweet_list, n, stop_words, stop_prefixes)
        result.append((date, find_top_k(ngrams, k)))
    result.sort()
    return result

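# find_top_k_ngrams_by_month above groups tweets with grab_year_month, which
# is not shown in this section. A minimal sketch follows, assuming the tweets
# carry Twitter's standard created_at string (e.g. "Wed Oct 10 20:19:24 +0000
# 2018"); the real helper may slice the string instead of parsing it.
from datetime import datetime

def grab_year_month(created_at):
    '''
    Return a (year, month) pair extracted from a created_at timestamp string.
    '''
    parsed = datetime.strptime(created_at, "%a %b %d %H:%M:%S %z %Y")
    return (parsed.year, parsed.month)
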
def find_top_k_entities(tweets, entity_key, value_key, k):
    '''
    Find the K most frequently occurring entities.

    Inputs:
        tweets: a list of tweets
        entity_key: a string ("hashtags", "user_mentions", etc.)
        value_key: string (appropriate value depends on the entity type)
        k: integer

    Returns: list of entity, count pairs sorted in non-decreasing
        order by count.
    '''
    entities = helper_1_to_3(tweets, entity_key, value_key)
    return find_top_k(entities, k)

def find_top_k_ngrams(tweets, n, k):
    '''
    Find the k most frequently occurring n-grams.

    Inputs:
        tweets: a list of tweets
        n: integer
        k: integer

    Returns: list of n-gram, count pairs
    '''
    final_list = pre_process_tweets(tweets, n)
    return find_top_k(final_list, k)

def find_top_k_ngrams(tweets, n, k):
    '''
    Find the k most frequently occurring n-grams.

    Inputs:
        tweets: a list of tweets
        n: integer
        k: integer

    Returns: list of key/value pairs
    '''
    return find_top_k(make_n_grams(tweets, n), k)

def find_top_k_ngrams(tweets, n, k):
    '''
    Find the k most frequently occurring n-grams.

    Inputs:
        tweets: a list of tweets
        n: integer
        k: integer

    Returns: list of n-gram, count pairs
    '''
    # big_ngram_array: list of n-gram tuples drawn from all the tweets
    big_ngram_array = make_big_ngram_array(tweets, n)
    return find_top_k(big_ngram_array, k)

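# The find_top_k_ngrams variants above call differently named helpers
# (pre_process_tweets, make_n_grams, make_big_ngram_array, find_ngrams) that
# appear to play the same role: flatten the tweets into one list of n-gram
# tuples. One hedged sketch is shown under the name make_big_ngram_array; it
# assumes each tweet stores its text under the "text" key and that whitespace
# tokenization is close enough. The course's actual pre-processing (case
# folding, punctuation stripping) is not reproduced here.
def make_big_ngram_array(tweets, n):
    '''
    Return a flat list of n-gram tuples drawn from every tweet's text.
    '''
    ngrams = []
    for tweet in tweets:
        words = tweet["text"].split()
        for i in range(len(words) - n + 1):
            ngrams.append(tuple(words[i:i + n]))
    return ngrams
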
def find_top_k_ngrams(tweets, n, k):
    '''
    Find the k most frequently occurring n-grams.

    Inputs:
        tweets: a list of tweets
        n: integer
        k: integer

    Returns: list of key/value pairs
    '''
    ngrams = find_ngrams(tweets, n)
    return find_top_k(ngrams, k)

def find_top_k_entities(tweets, entity_key, k):
    '''
    Find the K most frequently occurring entities.

    Inputs:
        tweets: a list of tweets
        entity_key: a pair ("hashtags", "text"),
            ("user_mentions", "screen_name"), etc.
        k: integer

    Returns: list of entity, count pairs
    '''
    # entity_array holds all the hashtags or screen names
    entity_array = make_entity_array(tweets, entity_key)
    return find_top_k(entity_array, k)

def find_top_k_ngrams(tweets, n, stop_words, stop_prefixes, k):
    '''
    Find the k most frequently occurring n-grams or, if k < 0, count
    occurrences of all n-grams.

    Inputs:
        tweets: a list of tweets
        n: integer
        k: integer
        stop_words: a set of strings to ignore
        stop_prefixes: a set of strings. Words with a prefix that
            appears in this list should be ignored.

    Returns: list of key/value pairs sorted in non-increasing
        order by value.
    '''
    ngrams = ngram_helper(tweets, n, stop_words, stop_prefixes)
    return find_top_k(ngrams, k)

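# ngram_helper, used here and in find_top_k_ngrams_by_month, is also outside
# this section. The sketch below is one plausible reading of its contract:
# drop stop words and words starting with a stop prefix before forming the
# n-gram tuples. The "text" key and whitespace tokenization are assumptions,
# as in the make_big_ngram_array sketch above.
def ngram_helper(tweets, n, stop_words, stop_prefixes):
    '''
    Return n-gram tuples from the tweets' text, skipping stop words and
    words that begin with any of the stop prefixes.
    '''
    ngrams = []
    for tweet in tweets:
        words = [w for w in tweet["text"].split()
                 if w not in stop_words
                 and not any(w.startswith(p) for p in stop_prefixes)]
        for i in range(len(words) - n + 1):
            ngrams.append(tuple(words[i:i + n]))
    return ngrams
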
def find_top_k_entities(tweets, entity_key, k):
    '''
    Find the K most frequently occurring entities.

    Inputs:
        tweets: a list of tweets
        entity_key: a pair ("hashtags", "text"),
            ("user_mentions", "screen_name"), etc.
        k: integer

    Returns: list of entity, count pairs
    '''
    entities = hashtager(tweets, entity_key)
    return find_top_k(entities, k)

def find_top_k_entities(tweets, entity_key, k):
    '''
    Find the K most frequently occurring entities.

    Inputs:
        tweets: a list of tweets
        entity_key: a pair ("hashtags", "text"),
            ("user_mentions", "screen_name"), etc.
        k: integer

    Returns: list of entity, count pairs
    '''
    # Extract the list of desired entity values for the given key and subkey.
    # The extra list assignment is kept to make the logic easier to follow.
    input_entities_list = extract_entities_list(tweets, entity_key)
    return find_top_k(input_entities_list, k)

def find_top_k_entities(tweets, entity_key, k):
    '''
    Find the K most frequently occurring entities.

    Inputs:
        tweets: a list of tweets
        entity_key: a pair ("hashtags", "text"),
            ("user_mentions", "screen_name"), etc.
        k: integer

    Returns: list of entity, count pairs
    '''
    good_tweets = clean_tweets(tweets, entity_key)
    top_k = find_top_k(good_tweets, k)
    return top_k


def find_min_count_entities(tweets, entity_key, min_count):
    '''
    Find the entities that occur at least min_count times.

    Inputs: