def find_min_count(items, min_count):
    '''
    Find the items that occur at least min_count times

    Inputs:
        items: a list of items (must be hashable/comparable)
        min_count: integer

    Returns: sorted list of (item, count) tuples
    '''
    # Runs the helper function for you (DO NOT MODIFY)
    item_counts = count_items(items)

    # Filtering the sorted pairs keeps the output in sorted order and
    # replaces the original's fragile index-walking while-loop; it also
    # handles the empty-input case with no special-casing.
    return [(item, count)
            for item, count in sort_count_pairs(item_counts)
            if count >= min_count]
def find_top_k(tokens, k):
    '''
    Find the k most frequently occuring tokens

    Inputs:
        tokens: list of tokens (must be immutable)
        k: a non-negative integer

    Returns: list of the top k tokens ordered by count.
    '''
    # Error checking
    if k < 0:
        raise ValueError("In find_top_k, k must be a non-negative integer")

    counts = count_tokens(tokens)

    # No tokens means nothing to rank.
    if not tokens:
        return []

    # Rank the (token, count) pairs, then keep only the tokens.
    ranked = sort_count_pairs(list(counts.items()))
    return [token for token, _ in ranked][:k]
def find_frequent(items, k):
    '''
    Find items where the number of times the item occurs is at least
    1/k * len(items).

    Input:
        items: a list of items (must be hashable/comparable)
        k: integer

    Returns: sorted list of tuples
    '''
    counter = {}
    for item in items:
        # Invariant: never track more than k - 1 candidates.
        if len(counter) > k - 1:
            raise ValueError("The number of elements stored in counter"
                             + " should not exceed (k-1)=" + str(k - 1))

        counter[item] = counter.get(item, 0) + 1

        # Capacity exceeded: decrement every counter and drop the zeros.
        # The item just added (count 1) is removed again here, which is
        # exactly the Misra-Gries decrement step.
        if len(counter) > k - 1:
            for key in list(counter):
                counter[key] -= 1
                if counter[key] == 0:
                    del counter[key]

    return sort_count_pairs(counter.items())
def find_top_k(items, k):
    '''
    Find the K most frequently occurring items

    Inputs:
        items: list of items (must be hashable/comparable)
        k: a non-negative integer

    Returns: sorted list of the top K tuples
    '''
    # Error checking (DO NOT MODIFY)
    if k < 0:
        raise ValueError("In find_top_k, k must be a non-negative integer")

    # Runs the helper function for you (DO NOT MODIFY)
    item_counts = count_items(items)

    # Rank the pairs; slicing covers both the "fewer than k pairs"
    # case and the normal case in one expression.
    ranked = sort_count_pairs(item_counts)
    return ranked[:k]
def find_min_count(tokens, min_count):
    '''
    Find the tokens that occur *at least* min_count times

    Inputs:
        tokens: a list of tokens (must be immutable)
        min_count: a non-negative integer

    Returns: set of tokens
    '''
    # Error checking
    if min_count < 0:
        raise ValueError("min_count must be a non-negative integer")

    counts = count_tokens(tokens)
    # The result is an unordered set, so the original's sort of the
    # pairs was wasted O(n log n) work; filter the counts directly.
    return {token for token, count in counts.items() if count >= min_count}
def find_min_count(items, min_count):
    '''
    Find the items that occur at least min_count times

    Inputs:
        items: a list of items (must be hashable/comparable)
        min_count: integer

    Returns: sorted list of (item, count) tuples
    '''
    # Runs the helper function for you (DO NOT MODIFY)
    item_counts = count_items(items)

    # Use the counts the helper already computed instead of re-counting
    # every item by hand; filtering the sorted pairs keeps the result
    # in sorted order.
    return [(item, count)
            for item, count in sort_count_pairs(item_counts)
            if count >= min_count]
def find_top_k_entities(tweets, entity_key, value_key, k):
    '''
    Find the K most frequently occuring entitites

    Inputs:
        tweets: a list of tweets
        entity_key: a string ("hashtags", "user_mentions", etc)
        value_key: string (appropriate value depends on the entity type)
        k: integer

    Returns: list of entity, count pairs sorted in non-decreasing order
        by count.
    '''
    # Count occurrences of the requested entity type across the tweets.
    entity_counts = counter(tweets, entity_key, value_key)

    # Sort the (entity, count) pairs and keep only the K most frequent.
    ranked = sort_count_pairs(entity_counts.items())
    return ranked[:k]
def find_frequent(items, k):
    '''
    Find items where the number of times the item occurs is at least
    1/k * len(items).

    Input:
        items: a list of items (must be hashable/comparable)
        k: integer

    Returns: sorted list of tuples
    '''
    counter = {}
    for item in items:
        if item in counter:
            # Tracked candidate: bump its counter.
            counter[item] += 1
        elif len(counter) < k - 1:
            # Untracked candidate with spare capacity: start tracking.
            counter[item] = 1
        else:
            # No capacity: decrement all counters, dropping zeros.
            decr_and_remove(counter)

    # Invariant check. The original re-tested this same condition once
    # per item in a second loop; the condition does not vary, so one
    # check after the loop is equivalent. (Removed unused
    # keys_to_remove variable as well.)
    if len(counter) > k - 1:
        raise ValueError("The number of elements stored in counter"
                         + " should not exceed (k-1)=" + str(k - 1))

    return sort_count_pairs(counter.items())
def compare_tuple_lists(actual, params, recreate_msg):
    '''
    Do a test, check the result, report an error, if necessary.
    '''
    print("Actual:", actual)
    print()
    print("Expected:", params["expected"])

    # check the type
    check_tuple_list(actual, recreate_msg)

    expected = params["expected"]
    # Guard clause: matching results mean the test succeeded.
    if actual == expected:
        return

    # Mismatch: report the most informative failure we can find.
    if len(actual) != len(expected):
        msg = ("Length of actual result ({}) does not match "
               "the length of the expected result ({}).\n{}")
        pytest.fail(msg.format(len(actual), len(expected), recreate_msg))

    if sort_count_pairs(actual) == expected:
        msg = "Actual result is not sorted properly.\n{}"
        pytest.fail(msg.format(recreate_msg))

    for i, actual_val in enumerate(actual):
        if actual_val != expected[i]:
            msg = ("At index {}:"
                   " Actual result ({}) does not match"
                   " Expected result ({}).\n{}")
            pytest.fail(
                msg.format(i, actual_val, expected[i], recreate_msg))
def find_top_k(items, k):
    '''
    Find the K most frequently occuring items

    Inputs:
        items: a list of items
        k: integer

    Returns: sorted list of K tuples
    '''
    # Tally how many times each distinct value occurs.
    tallies = {}
    for entry in items:
        tallies[entry] = tallies.get(entry, 0) + 1

    # Sort the (value, count) pairs and keep the first K.
    return sort_count_pairs(tallies.items())[:k]
def find_top_k(items, k):
    '''
    Find the K most frequently occuring items

    Inputs:
        items: a list of items
        k: integer

    Returns: sorted list of K tuples
    '''
    # Tally occurrences of each distinct item.
    k_library = {}
    for i in items:
        item = k_library.get(i, 0)
        k_library[i] = item + 1

    # Sort pairs in non-increasing order by count and keep the top K.
    # (Removed the unreachable code that followed this return; it also
    # contained a latent bug -- `items_dict.get[values, 0]` used
    # brackets instead of a call and would have raised TypeError.)
    highest_to_lowest = sort_count_pairs(list(k_library.items()))
    return highest_to_lowest[0:k]
def find_frequent(items, k):
    '''
    Find items where the number of times the item occurs is at least
    fraction * len(items).

    Input:
        items: list of items
        k: integer

    Returns: sorted list of tuples
    '''
    counters = {}
    for item in items:
        if item in counters:
            # Already a candidate: bump its counter.
            counters[item] += 1
        elif len(counters) + 1 <= k - 1:
            # Room for a new candidate.
            counters[item] = 1
        else:
            # No room: decrement every counter and evict any candidate
            # whose counter reaches zero (Misra-Gries step).
            evicted = [key for key, count in counters.items()
                       if count == 1]
            for key in counters:
                counters[key] -= 1
            for key in evicted:
                del counters[key]

    return sort_count_pairs(counters.items())
def helper(test_description):
    '''
    Do a test, check the result, report an error, if necessary.
    '''
    task = test_description["task"]

    # load the tweets from the file
    try:
        tweet_filename = os.path.join(BASE_DIR,
                                      test_description["tweet_filename"])
        # Context manager closes the handle even if json.load raises
        # (the original `json.load(open(...))` leaked the file object).
        with open(tweet_filename) as f:
            tweets = json.load(f)
    except OSError as e:
        pytest.fail("{}".format(e))

    expected = get_expected(test_description)
    if expected is None:
        pytest.fail("Could not open expected result file:" +
                    test_description["expected_filename"] + ":")

    try:
        actual = task_to_fn[task](tweets,
                                  test_description["arg1"],
                                  test_description["arg2"])
    except Exception as e:
        pytest.fail("{}".format(e))

    if not check_type(actual):
        s = ("Actual result has the wrong type."
             " The correct type is list of pairs "
             "(that is, tuples of length 2)")
        pytest.fail(s)

    if actual != expected:
        if len(actual) != len(expected):
            s = ("Length of actual result ({}) does not match "
                 "the length of the expected result ({})")
            pytest.fail(s.format(len(actual), len(expected)))

        if sort_count_pairs(actual) == expected:
            pytest.fail("Actual result is not sorted properly.")

        for i, actual_val in enumerate(actual):
            if actual_val != expected[i]:
                # Fixed the failure message: "match" and "expected" ran
                # together ("matchexpected") in the original strings.
                s = ("Actual result at index {} ({}) does not match "
                     "expected result ({}) at index {}.")
                pytest.fail(s.format(i, actual_val, expected[i], i))

    # Test succeeded if you get to here
    return
def find_top_n(items, n):
    '''
    Find the N most frequently occuring items.

    Inputs:
        items: a list of items
        n: integer

    Returns: sorted list of N tuples
    '''
    # Tally occurrences of each distinct item.
    tallies = {}
    for entry in items:
        if entry in tallies:
            tallies[entry] += 1
        else:
            tallies[entry] = 1

    # Rank the (item, count) pairs and keep the first N.
    ranked = sort_count_pairs(tallies.items())
    return ranked[:n]
def helper(test_description):
    '''
    Run one test case described by test_description, compare the actual
    result against the expected result, and fail with a diagnostic
    message if they differ.
    '''
    task = test_description["task"]

    # load the tweets from the file
    try:
        tweet_filename = os.path.join(BASE_DIR,
                                      test_description["tweet_filename"])
        # Context manager closes the handle even if json.load raises
        # (the original `json.load(open(...))` leaked the file object).
        with open(tweet_filename) as f:
            tweets = json.load(f)
    except OSError as e:
        pytest.fail("{}".format(e))

    expected = get_expected(test_description)
    # `is None` is the idiomatic identity check (was `== None`).
    if expected is None:
        pytest.fail("Could not open expected result file:" +
                    test_description["expected_filename"] + ":")

    try:
        if task in ["task1", "task2", "task3"]:
            actual = task_to_fn[task](tweets,
                                      test_description["entity_type"],
                                      test_description["value_key"],
                                      test_description["arg3"])
        else:
            assert task in ["task4", "task5", "task6", "task7"]
            stop_words = STOP_WORDS.get(test_description["stop_words_key"],
                                        set([]))
            stop_prefix = STOP_PREFIXES.get(
                test_description["stop_prefix_key"], set([]))
            actual = task_to_fn[task](tweets,
                                      test_description["n"],
                                      stop_words,
                                      stop_prefix,
                                      test_description["arg4"])
    except Exception as e:
        pytest.fail("{}".format(e))

    if not check_type(actual):
        s = "Actual result has the wrong type. The correct type is list of pairs (that is, tuples of length 2)"
        pytest.fail(s)

    if actual != expected:
        if len(actual) != len(expected):
            s = "Length of actual result ({}) does not match the length of the expected result ({})"
            pytest.fail(s.format(len(actual), len(expected)))

        if sort_count_pairs(actual) == expected:
            pytest.fail("Actual result is not sorted properly.")

        for i in range(len(actual)):
            if actual[i] != expected[i]:
                s = "Actual result at index {} ({}) does not match expected result ({}) at index {}."
                pytest.fail(s.format(i, actual[i], expected[i], i))

    # Test succeeded if you get to here
    return
def find_frequent_entities(tweets, entity_key, value_key, k):
    '''
    Find entities where the number of times the specific entity occurs
    is at least fraction * the number of entities in across the tweets.

    Input:
        tweets: a list of tweets
        entity_key: a string ("hashtags", "user_mentions", etc)
        value_key: string (appropriate value depends on the entity type)
        k: integer

    Returns: list of entity, count pairs sorted in non-decreasing order
        by count.
    '''
    # Pull out the entity values to be counted.
    values = tweet_value_finder(tweets, entity_key, value_key)

    # Misra-Gries style counting with at most k - 1 live counters.
    freq = {}
    for value in values:
        if value in freq:
            # Known candidate: bump its counter.
            freq[value] += 1
        elif len(freq) < k - 1:
            # Unknown candidate with spare capacity: start tracking it.
            freq[value] = 1
        else:
            # Unknown candidate and no capacity: decrement every
            # counter, forgetting those that fall to zero.
            freq = {key: count - 1 for key, count in freq.items()
                    if count - 1 >= 1}

    return sort_count_pairs(freq.items())
def find_frequent_ngrams(tweets, n, stop_words, stop_prefixes, k):
    '''
    Find frequently occurring n-grams

    Inputs:
        tweets: a list of tweets
        n: integer
        stop_words: a set of strings to ignore
        stop_prefixes: a set of strings. Words w/ a prefix that
            appears in this list should be ignored.
        k: integer

    Returns: list of key/value pairs sorted in non-increasing order
        by value.
    '''
    # Build the candidate n-grams first.
    candidates = ngram_generator(tweets, n, stop_words, stop_prefixes)

    # Misra-Gries counting: at most k - 1 counters live at any time.
    counts = {}
    for gram in candidates:
        if gram in counts:
            # Tracked n-gram: bump its counter.
            counts[gram] += 1
        elif len(counts) < k - 1:
            # Untracked n-gram with spare capacity: start at 1.
            counts[gram] = 1
        else:
            # Over capacity: decrement all counters, dropping zeros.
            counts = {key: val - 1 for key, val in counts.items()
                      if val - 1 >= 1}

    return sort_count_pairs(counts.items())
def find_frequent(items, k):
    '''
    Find items where the number of times the item occurs is at least
    1/k * len(items).

    Input:
        items: list of items
        k: integer

    Returns: sorted list of tuples

    Algorithm (Misra-Gries): keep at most K - 1 counters. If an item is
    tracked, increment it; if not and a counter slot is free, start it
    at one; otherwise decrement every counter and drop any that reach
    zero.
    '''
    counters = {}
    for element in items:
        if element in counters:
            counters[element] += 1
        elif len(counters) < k - 1:
            counters[element] = 1
        else:
            # Rebuild the dict with every count decremented, keeping
            # only candidates that stay positive.
            counters = {key: count - 1
                        for key, count in counters.items()
                        if count - 1 > 0}

    return sort_count_pairs(counters.items())
def make_k_list(items):
    '''
    returns a list of counts for each item in items

    Inputs:
        items: list of items to be counted

    Returns:
        k_list: list of counts for each item in list as a list of tuples
    '''
    # Tally occurrences per distinct item.
    tallies = {}
    for entry in items:
        tallies[entry] = tallies.get(entry, 0) + 1

    # Convert to (item, count) pairs and sort them.
    return sort_count_pairs(tallies.items())
def calc_tf(docs):
    '''
    Calculates TF scores per document for a corpus of documents.

    Inputs:
        docs: a list of lists (must be hashable/comparable)

    Returns: a list of dictionaries where each dictionary is a document
        containing the token as key and TF score as value
    '''
    tf_full = []
    for doc in docs:
        if not doc:
            # Empty document: empty score table.
            tf_full.append({})
            continue

        # Pairs come back sorted by count, so the first pair holds the
        # maximum count in the document.
        pairs = sort_count_pairs(count_tokens(doc))
        max_count = pairs[0][1]
        # Augmented term frequency: 0.5 + 0.5 * (count / max count).
        tf_full.append({token: 0.5 + 0.5 * (count / max_count)
                        for token, count in pairs})

    return tf_full
def find_min_count(items, min_count):
    '''
    Find the items that occur at least min_count times

    Inputs:
        items: a list of items
        min_count: integer

    Returns: sorted list of tuples
    '''
    # Count occurrences of each distinct item.
    tallies = {}
    for entry in items:
        tallies[entry] = tallies.get(entry, 0) + 1

    # Keep only the pairs that meet the threshold, then sort.
    qualified = [(entry, count) for entry, count in tallies.items()
                 if count >= min_count]
    return sort_count_pairs(qualified)
def ngram_counter(tweets, n, stop_words, stop_prefixes):
    '''
    Find n-grams and their associated counts.

    Inputs:
        tweets: a list of tweets
        n: integer
        stop_words: a set of strings to ignore
        stop_prefixes: a set of strings. Words w/ a prefix that
            appears in this list should be ignored.

    Returns: list of key/value pairs sorted in non-increasing order
        by value.
    '''
    ngram_counts = {}
    for tweet in tweets:
        # Clean the tweet text before extracting n-grams.
        words = preprocess_tweet(tweet, stop_words, stop_prefixes)
        # Slide a window of length n across the word list; the range is
        # empty when the tweet has fewer than n words.
        for start in range(len(words) - n + 1):
            gram = tuple(words[start:start + n])
            ngram_counts[gram] = ngram_counts.get(gram, 0) + 1

    return sort_count_pairs(ngram_counts.items())
def find_frequent_6(items, k):
    '''
    Find items where the number of times the item occurs is at least
    1/k * len(items).

    Input:
        items: list of items
        k: integer

    Returns: sorted list of tuples
    '''
    # N is never used below -- presumably left over from a draft.
    N = len(items)
    # items_dict accumulates a count for every distinct item.
    items_dict = {z : 0 for z in items}
    # new_dict appears to play the role of the Misra-Gries candidate
    # set; only its keys matter (values stay 0).
    new_dict = {}
    # overall_dict is only ever iterated for its keys (distinct items).
    overall_dict = {z : 0 for z in items}
    for a in items:
        if a in new_dict:
            items_dict[a] += 1
        if a not in new_dict:
            # NOTE(review): the item is inserted unconditionally, so
            # once new_dict holds k - 1 candidates, adding another makes
            # len(new_dict) == k and NEITHER branch below fires -- the
            # candidate set can grow past k - 1 with no decrement. This
            # looks like a deviation from standard Misra-Gries; confirm
            # against the expected test output.
            new_dict[a] = 0
            if len(new_dict) < k - 1:
                items_dict[a] += 1
            if len(new_dict) == k - 1:
                items_dict[a] += 1
                # Decrement the count of every tracked candidate
                # (including the one just added, whose net change is 0).
                for b in new_dict:
                    items_dict[b] -= 1
                # Evict candidates whose count fell to zero.
                for b in overall_dict:
                    if b in new_dict:
                        if items_dict[b] == 0:
                            del new_dict[b]
    # Drop items that ended with a zero count before sorting.
    for a in overall_dict:
        if items_dict[a] == 0:
            del items_dict[a]
    l = items_dict.items()
    return sort_count_pairs(l)
def find_frequent(items, k):
    '''
    Find items where the number of times the item occurs is at least
    fraction * len(items).

    Input:
        items: list of items
        k: integer

    Returns: sorted list of tuples
    '''
    # Misra-Gries summary: at most k - 1 counters at any time.
    summary = {}
    for element in items:
        if element in summary:
            # Tracked element: bump its counter.
            summary[element] += 1
        elif len(summary) < k - 1:
            # Untracked element with room to spare: start at 1.
            summary[element] = 1
        else:
            # Untracked element, no room: decrement every counter and
            # drop those that reach zero.
            summary = {key: count - 1 for key, count in summary.items()
                       if count - 1 >= 1}

    return sort_count_pairs(summary.items())
def find_top_k(items, k):
    '''
    Find the K most frequently occurring items

    Inputs:
        items: list of items (must be hashable/comparable)
        k: a non-negative integer

    Returns: sorted list of the top K tuples
    '''
    # Error checking (DO NOT MODIFY)
    if k < 0:
        raise ValueError("In find_top_k, k must be a non-negative integer")

    # Runs the helper function for you (DO NOT MODIFY)
    item_counts = count_items(items)

    # Sort the helper's counts directly instead of re-counting every
    # item with a manual dictionary pass (the original ignored
    # item_counts and duplicated its work).
    return sort_count_pairs(item_counts)[:k]
def find_min_count(items, min_count):
    '''
    Find the items that occur at least min_count times

    Inputs:
        items: a list of items
        min_count: integer

    Returns: sorted list of tuples
    '''
    # Tally the number of occurrences of each distinct item.
    occurrences = {}
    for entry in items:
        occurrences[entry] = occurrences.get(entry, 0) + 1

    # Keep only the pairs that meet the threshold, then sort.
    meeting_threshold = [(key, count)
                         for key, count in occurrences.items()
                         if count >= min_count]
    return sort_count_pairs(meeting_threshold)
def find_min_count_entities(tweets, entity_key, value_key, min_count):
    '''
    Find the entitites that occur at least min_count times.

    Inputs:
        tweets: a list of tweets
        entity_key: a string ("hashtags", "user_mentions", etc)
        value_key: string (appropriate value depends on the entity type)
        min_count: integer

    Returns: list of entity, count pairs sorted in non-decreasing order
        by count.
    '''
    # Count how often each entity value appears across the tweets.
    entity_counts = counter(tweets, entity_key, value_key)

    # Drop entities below the threshold, then sort by count.
    qualified = [pair for pair in entity_counts.items()
                 if pair[1] >= min_count]
    return sort_count_pairs(qualified)
Inputs: tokens: list of tokens (must be hashable/comparable) k: a non-negative integer Returns: sorted list of the top k tuples ''' # Error checking (DO NOT MODIFY) err_msg = "In find_top_k, k must be a non-negative integer" assert k >= 0, err_msg <<<<<<< HEAD counted = count_tokens(tokens) sorted_tokens = sort_count_pairs(counted) return sorted_tokens[:k] ======= # Your code for Task 1.2 goes here # Replace return value with an appropriate value return [] >>>>>>> 772604324d9e6fbf8e76ecb03659b77d09955de0 def find_min_count(tokens, min_count): ''' Find the tokens that occur at least min_count times Inputs: tokens: a list of tokens (must be hashable/comparable) min_count: integer