Exemple #1
0
def find_top_k_entities(tweets, entity_key, k):
    '''
    Find the K most frequently occurring entities.

    Inputs:
        tweets: a list of tweets; each tweet is read as
          tweet['entities'][<entity type>], which must yield a
          sequence of entity dictionaries
        entity_key: a pair ("hashtags", "text"),
          ("user_mentions", "screen_name"), etc.  The first item
          selects the entity type, the second the field to count.
        k: integer, passed through to find_top_k

    Returns: the result of find_top_k on the lowercased entity values
      (expected to be a list of entity, count pairs)
    '''
    entity_type, value_field = entity_key

    # Flatten every tweet's entities into one list.  Values are
    # lowercased so that counting is case-insensitive.
    entity_list = [
        entity[value_field].lower()
        for tweet in tweets
        for entity in tweet['entities'][entity_type]
    ]

    return find_top_k(entity_list, k)
Exemple #2
0
def find_top_k_ngrams_by_month(tweets, n, stop_words, stop_prefixes, k):
    '''
    Compute the top k n-grams separately for each month found in tweets.

    Inputs:
        tweets: list of tweet dictionaries (each must have a
          'created_at' entry)
        n: integer, forwarded to ngram_helper
        stop_words: a set of strings to ignore (forwarded to
          ngram_helper)
        stop_prefixes: a set of strings.  Words w/ a prefix that
          appears in this set should be ignored (forwarded to
          ngram_helper).
        k: integer, forwarded to find_top_k

    Returns: sorted list of pairs.  Each pair has the form:
        (key from grab_year_month, find_top_k result for that key)
      where the key is expected to be a (year, month) tuple.
    '''
    # Bucket the tweets by the key grab_year_month derives from their
    # creation timestamp.
    tweets_by_month = {}
    for tweet in tweets:
        month_key = grab_year_month(tweet['created_at'])
        tweets_by_month.setdefault(month_key, []).append(tweet)

    # Rank the n-grams of each bucket independently.
    monthly_results = [
        (
            month_key,
            find_top_k(
                ngram_helper(month_tweets, n, stop_words, stop_prefixes), k
            ),
        )
        for month_key, month_tweets in tweets_by_month.items()
    ]

    # Keys are unique, so ordering the pairs orders them by key.
    return sorted(monthly_results)
Exemple #3
0
def find_top_k_entities(tweets, entity_key, value_key, k):
    '''
    Find the K most frequently occurring entities.

    Inputs:
        tweets: a list of tweets
        entity_key: a string ("hashtags", "user_mentions", etc)
        value_key: string (appropriate value depends on the entity type)
        k: integer

    Returns: the result of find_top_k applied to the values gathered by
      helper_1_to_3 (expected to be a list of entity, count pairs; the
      ordering is whatever find_top_k produces).
    '''
    return find_top_k(helper_1_to_3(tweets, entity_key, value_key), k)
Exemple #4
0
def find_top_k_ngrams(tweets, n, k):
    '''
    Find the k most frequently occurring n-grams.

    Inputs:
        tweets: a list of tweets
        n: integer, forwarded to pre_process_tweets
        k: integer, forwarded to find_top_k

    Returns: the result of find_top_k on the output of
      pre_process_tweets (expected to be a list of ngram/value pairs)
    '''
    return find_top_k(pre_process_tweets(tweets, n), k)
Exemple #5
0
def find_top_k_ngrams(tweets, n, k):
    '''
    Find k most frequently occurring n-grams.

    Inputs:
        tweets: a list of tweets
        n: integer, forwarded to make_n_grams
        k: integer, forwarded to find_top_k

    Returns: the result of find_top_k on the output of make_n_grams
      (expected to be a list of key/value pairs)
    '''
    ngrams = make_n_grams(tweets, n)
    return find_top_k(ngrams, k)
Exemple #6
0
def find_top_k_ngrams(tweets, n, k):
    '''
    Find the k most frequently occurring n-grams.

    Inputs:
        tweets: a list of tweets
        n: integer, forwarded to make_big_ngram_array
        k: integer, forwarded to find_top_k

    Returns: the result of find_top_k on the output of
      make_big_ngram_array (expected to be a list of ngram/value pairs)
    '''
    return find_top_k(make_big_ngram_array(tweets, n), k)
Exemple #7
0
def find_top_k_ngrams(tweets, n, k):
    '''
    Find the k most frequently occurring n-grams.

    Inputs:
        tweets: a list of tweets
        n: integer, forwarded to find_ngrams
        k: integer, forwarded to find_top_k

    Returns: the result of find_top_k on the output of find_ngrams
      (expected to be a list of key/value pairs)
    '''
    return find_top_k(find_ngrams(tweets, n), k)
Exemple #8
0
def find_top_k_entities(tweets, entity_key, k):
    '''
    Find the K most frequently occurring entities.

    Inputs:
        tweets: a list of tweets
        entity_key: a pair ("hashtags", "text"),
          ("user_mentions", "screen_name"), etc
        k: integer, forwarded to find_top_k

    Returns: the result of find_top_k on the output of
      make_entity_array (expected to be a list of entity, count pairs)
    '''
    return find_top_k(make_entity_array(tweets, entity_key), k)
Exemple #9
0
def find_top_k_ngrams(tweets, n, stop_words, stop_prefixes, k):
    '''
    Find the k most frequently occurring n-grams.

    Inputs:
        tweets: a list of tweets
        n: integer, forwarded to ngram_helper
        stop_words: a set of strings to ignore (forwarded to
          ngram_helper)
        stop_prefixes: a set of strings.  Words w/ a prefix that
          appears in this set should be ignored (forwarded to
          ngram_helper).
        k: integer, forwarded to find_top_k.  NOTE(review): the
          original spec says k < 0 means "count occurrences of all
          n-grams"; that case is not handled here, so it presumably
          lives in find_top_k -- confirm.

    Returns: the result of find_top_k on the output of ngram_helper
      (expected to be a list of key/value pairs sorted in
      non-increasing order by value)
    '''
    return find_top_k(ngram_helper(tweets, n, stop_words, stop_prefixes), k)
Exemple #10
0
def find_top_k_entities(tweets, entity_key, k):
    '''
    Find the K most frequently occurring entities.

    Inputs:
        tweets: a list of tweets
        entity_key: a pair ("hashtags", "text"),
          ("user_mentions", "screen_name"), etc.
        k: integer, forwarded to find_top_k

    Returns: the result of find_top_k on the output of hashtager
      (expected to be a list of entity, count pairs)
    '''
    # hashtager is called for whatever entity type entity_key selects,
    # so the local name is kept generic rather than hashtag-specific.
    entity_values = hashtager(tweets, entity_key)

    return find_top_k(entity_values, k)
Exemple #11
0
def find_top_k_entities(tweets, entity_key, k):
    '''
    Find the K most frequently occurring entities.

    Inputs:
        tweets: a list of tweets
        entity_key: a pair ("hashtags", "text"),
          ("user_mentions", "screen_name"), etc.
        k: integer, forwarded to find_top_k

    Returns: the result of find_top_k on the output of
      extract_entities_list (expected to be a list of entity, count
      pairs)
    '''
    # Extract the list of desired entities for the given key/subkey pair.
    input_entities_list = extract_entities_list(tweets, entity_key)

    return find_top_k(input_entities_list, k)
Exemple #12
0
>>>>>>> 772604324d9e6fbf8e76ecb03659b77d09955de0
def find_top_k_entities(tweets, entity_key, k):
    '''
    Find the K most frequently occurring entities.

    Inputs:
        tweets: a list of tweets
        entity_key: a pair ("hashtags", "text"),
          ("user_mentions", "screen_name"), etc
        k: integer, forwarded to find_top_k

    Returns: the result of find_top_k on the output of clean_tweets
      (expected to be a list of entity, count pairs)
    '''
    # Merge conflict resolved in favor of the implemented (HEAD) side;
    # the other side was the unimplemented template returning None.
    good_tweets = clean_tweets(tweets, entity_key)
    return find_top_k(good_tweets, k)


def find_min_count_entities(tweets, entity_key, min_count):
    '''
<<<<<<< HEAD
     Find the entitites that occur at least min_count times.

     Inputs: