Example #1
from operator import itemgetter

from nltk.metrics import BigramAssocMeasures
from nltk.probability import FreqDist, ConditionalFreqDist

import token_helpers  # project-local module providing tokenize_simple


def high_words(posids, negids, cutoff, score_fn=BigramAssocMeasures.chi_sq, min_score=5):
    """
    Scores every token in the positive and negative reviews with a chi-squared
    association measure and returns the 10,000 most informative words.
    """
    # Note: score_fn and min_score are accepted but not used in the body below.
    word_fd = FreqDist()
    label_word_fd = ConditionalFreqDist()
    pos = 0
    neg = 0
    # Count token frequencies overall and per label, skipping the review at the cutoff position
    for review in posids:
        pos += 1
        if pos != cutoff:
            for word in review['text'].split(' '):
                word_fd.update(token_helpers.tokenize_simple(word))
                label_word_fd['pos'].update(token_helpers.tokenize_simple(word))

    for review in negids:
        neg += 1
        if neg != cutoff:
            for word in review['text'].split(' '):
                word_fd.update(token_helpers.tokenize_simple(word))
                label_word_fd['neg'].update(token_helpers.tokenize_simple(word))

    pos_word_count = label_word_fd['pos'].N()
    neg_word_count = label_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    # Chi-squared score of each word against both labels; higher means more informative
    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(label_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(label_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score
    best = sorted(word_scores.items(), key=itemgetter(1), reverse=True)[:10000]
    bestwords = set(w for w, s in best)
    return bestwords
    
    """
Example #2
from nltk import pos_tag

import token_helpers  # project-local module providing tokenize_simple

counter = 0  # module-level call counter assumed by the function below


def tokenize_advanced(text, weight):
    """
    Tokenizes a string and returns it as a bag of words, including bigrams.

    Parameters
    ----------
    text : str
        A string of raw text to be tokenized
    weight : int
        The weight to give to each bigram

    Returns
    -------
    list of str
        One string for each token in the document, in the original order,
        followed by each "<previous token> <adjective>" bigram repeated
        `weight` times
    """
    global counter
    counter += 1
    print(counter)  # simple progress indicator
    tokens = token_helpers.tokenize_simple(text)
    tagged_tokens = pos_tag(tokens, tagset="universal")
    result = []
    previous = None
    # Will be replaced later with internal functions from a pos.py file
    for index, t in enumerate(tagged_tokens):
        # Pair every adjective with the token that precedes it
        if t[1] == "ADJ" and index != 0:
            previous = tagged_tokens[index - 1]
            result.append(previous[0] + " " + t[0])
    for bigram in result:
        # Repeat each bigram `weight` times so it counts more heavily downstream
        tokens = tokens + ([bigram] * weight)
    return tokens
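A hypothetical call (the sentence is made up) showing the effect of weight: any "<previous token> <adjective>" pair found by the tagger is appended after the plain tokens, repeated weight times.

bag = tokenize_advanced("The pasta was tasty but the service was slow", weight=2)
print(bag)
# Plain tokens come first; a bigram such as "was tasty" (if "tasty" is tagged ADJ)
# then appears twice because weight=2.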
Example #3
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer

import token_helpers  # project-local module providing tokenize_simple


def get_count_vect(data):
    """
    Takes JSON review data and returns a document-term matrix of all the tokens
    in the review text and how many times they are used (with stopwords removed).

    Parameters
    ----------
    data : list of dicts
        The yelp data to be analyzed

    Returns
    -------
    scipy.sparse matrix
        A document-term matrix of all tokens and their frequencies
    """
    tokens = []
    count_vect = CountVectorizer(stop_words=stopwords.words("english"))
    for review in data:
        tokens += token_helpers.tokenize_simple(review["text"])
    # Each token is passed to the vectorizer as its own one-word "document"
    return count_vect.fit_transform(tokens)
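A quick sketch with made-up reviews, again assuming token_helpers is importable. Because each token is vectorized as its own one-word document, the matrix has one row per token and one column per vocabulary term.

reviews = [{"text": "Great food and friendly staff"},
           {"text": "The food was cold and the staff was rude"}]
dtm = get_count_vect(reviews)
print(dtm.shape)  # (number of tokens passed in, size of the learned vocabulary)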
Example #4
from nltk.corpus import stopwords
from nltk.probability import FreqDist

import token_helpers  # project-local module providing tokenize_simple and remove_stopwords_inner


def get_freq_dist(data):
    """
    Takes JSON review data and returns its text as a frequency distribution.

    Parameters
    ----------
    data : list of dicts
        The yelp data to be analyzed

    Returns
    -------
    nltk.FreqDist
        The frequency distribution of all tokens within the reviews
        of the yelp data
    """
    tokens = []
    for review in data:
        tokens += token_helpers.tokenize_simple(review["text"])
    # Drop standard English stopwords plus domain-specific high-frequency words
    tokens = token_helpers.remove_stopwords_inner(
        tokens,
        stopwords=stopwords.words("english")
        + ["time", "would", "got", "i'm", "-", "food", "like", "really", "service"],
    )
    return FreqDist(tokens)
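A quick sketch with made-up reviews, assuming token_helpers is importable; most_common lists the top tokens remaining after stopword removal.

reviews = [{"text": "The staff was friendly and the prices were fair"},
           {"text": "Friendly staff, fair prices, quick seating"}]
fdist = get_freq_dist(reviews)
print(fdist.most_common(5))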