def freq_dist(input, filtering_functions=[], plot=False, limit=None, return_counts=False):
    """Takes a list of words (hashtags, keywords, anything) and plots a frequency distribution.

       filtering_functions is an ORDERED list of functions applied to the raw input list before the freq dist is built.
       That is, each item in input is run through f1, f2, ..., fn where filtering_functions = [f1, ..., fn].

       limit truncates the freq dist to the limit most common items.

       return_counts determines whether a list of (word, count) tuples is returned,
          or just a list of the limit most used words.
    """
    for f in filtering_functions + [str.lower, str.strip]:
        input = map(f, input)

    nltk_fdist = FreqDist(list(input))

    if plot:  # use NLTK's built-in plotting function before converting the data structure
        nltk_fdist.plot(limit) if limit else nltk_fdist.plot()

    fdist = sorted(nltk_fdist.items(), key=lambda x: (-x[1], x[0]))  # sort by count, alphabetically for ties
    fdist = fdist[0:limit] if limit else fdist                       # apply limit
    fdist = [i[0] for i in fdist] if not return_counts else fdist    # drop counts if desired

    return fdist
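
# A minimal usage sketch for freq_dist above (hypothetical data; FreqDist is assumed to be
# imported from nltk, as in the rest of these examples):
hashtags = ["#NLP ", "#nlp", "#Python", "#python ", "#data"]
top = freq_dist(hashtags, filtering_functions=[lambda s: s.lstrip("#")], limit=2, return_counts=True)
# -> [('nlp', 2), ('python', 2)]
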
def process_tweets(hashtag, addl_stops=[]):
    count=0
    good_count=0
    words_to_plot=[]
    #Iterate through all chunked files with relevant hashtag
    for fname in os.listdir(os.getcwd()):
        if fname.startswith(hashtag):
            with open(fname,'r') as data_file:
                data=data_file.read()
                # Parse raw string since json.load() approach wasn't working
                data=data.split("\n\x00,")
            for tweet in data:
                count+=1
        
                # Tweets have a well-defined structure, so we can parse them 
                # manually (even though the JSON approach would be cleaner)
                text=tweet[tweet.find("text\":")+7:tweet.find(",\"source\"")-1]
                
                # Skip tweets that contain Unicode
                if text.find(r'\u') >= 0:  # the raw JSON still contains \u escape sequences
                    continue
                else:
                    good_count+=1
                    # Tokenize and count word frequency, ignoring case
                    words = word_tokenize(text)
                    clean_words= [w.lower() for w in words if not w.lower() in set(stops+addl_stops)]
                    words_to_plot=words_to_plot+clean_words             
    
    #Create frequency histogram of 50 most common words and print summary of activity 
    fdist=FreqDist(words_to_plot)
    fdist.plot(50)
    print "for "+hashtag+' we collected '+str(count)+' tweets out of which '+str(good_count)+" will be analyzed"
    return words_to_plot
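
# Hypothetical call for process_tweets above: it expects chunked tweet-capture files in the
# current directory whose names start with the hashtag, plus a module-level `stops` list,
# as in the original script.
warriors_words = process_tweets("warriors", addl_stops=["rt", "https", "co"])
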
Example 3
def thong_ke_nganh_nghe_ung_vien(nganh_nghe):
    from matplotlib import pyplot as bieudo
    # normalize the job-category names and collect them into a list / set
    list_nganh_nghe = []
    for nghe in nganh_nghe:
        list_nganh_nghe.append(
            nghe.replace('nhân viên kinh doanh', "Kinh doanh").replace(
                "nhân viên tư vấn",
                "Tư vấn").replace("Kinh doanh Kinh doanh bất động sản",
                                  "Kinh doanh").
            replace("Kinh doanh  bất động sản", "Bất động sản").replace(
                "Kinh doanh Kinh doanh",
                "Kinh doanh").strip())  # append each normalized category to the list

    set_list_nganh_nghe = set(list_nganh_nghe)  # drop duplicate categories
    for nghe in set_list_nganh_nghe:
        print(nghe)
    print("----------------------------------------------------------------")
    print('Number of job postings per category: ')
    for i in set_list_nganh_nghe:
        count = list_nganh_nghe.count(i)
        if count > 100:
            print(str(i).strip(), ":", count)

    print(
        "-------------------------Job-category statistics--------------------------------"
    )
    print("Total number of job categories: ", len(set_list_nganh_nghe))

    fdist_nganh_nghe = FreqDist(list_nganh_nghe)  # frequency of each category
    fdist_nganh_nghe.plot(20)  # draw the chart
Example 4
class Article:
    def __init__(self, text):
        self.text = text

    def tokenize(self):
        self.tokens = self.txt_to_tokens(self.text)
        self.freq = FreqDist(self.tokens)

    def print_frequencies(self, n=20):
        for key, value in self.freq.most_common(n):
            print(key, ": ", value)

    def plot_frequencies(self):
        self.freq.plot(20, cumulative=False)

    @staticmethod
    def txt_to_tokens(text):
        """Split text and filter words containing only letters"""

        # keep only purely alphabetic tokens
        tokens = [x for x in text.lower().split() if x.isalpha()]
        nltk.download("stopwords")
        sr = stopwords.words("english")
        # throw out stop words
        return [x for x in tokens if x not in sr]
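
# A short usage sketch of the Article class above (the sample text is made up; nltk,
# FreqDist and stopwords are assumed to be imported as in the snippet's module):
article = Article("The quick brown fox jumps over the lazy dog because the dog sleeps")
article.tokenize()
article.print_frequencies(5)
article.plot_frequencies()
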
Example 5
def visualize_words(
    text,
    tokenized_stop_words,
    cleanup=False,
    color=None
):  # https://www.datacamp.com/community/tutorials/wordcloud-python
    """Generate both wordcloud and frequency distribution visualizations when passed a list of text strings and a
    list of stopwords to remove. Both visualizations make use of wordcloud's ability to process text, so that the
    word statistics correspond. Show these figures side by side."""
    plt.figure(figsize=(18, 6), dpi=80)

    processed_text = process_text(text, tokenized_stop_words,
                                  clean=cleanup)  # Tokenize & remove stopwords
    fdist = FreqDist(processed_text)

    plt.subplot(1, 2, 1)
    wordcloud = WordCloud(
        background_color="white").generate_from_frequencies(fdist)

    if color == "blue":
        wordcloud = wordcloud.recolor(color_func=blue_color_func)
    elif color == "red":
        wordcloud = wordcloud.recolor(color_func=red_color_func)
    elif color == "yellow":
        wordcloud = wordcloud.recolor(color_func=yellow_color_func)
    elif color == "purple":
        wordcloud = wordcloud.recolor(color_func=purple_color_func)

    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis("off")

    plt.subplot(1, 2, 2)
    fdist.plot(30, cumulative=False)
    plt.show()
    print(fdist.most_common(30))
Example 6
def plot_freq_dist(texts, n_gram=1, num_words=25):
    """
    Create a plot of the frequency distribution of the most common terms found in the documents.

    Args:
        texts: string of text to analyze
        n_gram: n-gram size; defaults to 1 (unigrams), use 2 for bigrams
        num_words: number of words to show; defaults to 25
    """

    temp = texts.split(' ')  # tokenize texts
    if n_gram == 2:
        temp = bigrams(temp)  # create a list of bigrams

    fdist = FreqDist(temp)  # FreqDist object

    # set up plot
    plt.figure(figsize=(17, 7))
    plt.rc('xtick', labelsize=15)
    plt.rc('ytick', labelsize=15)
    plt.xlabel('', fontsize=18)
    plt.ylabel('', fontsize=18)

    # plot data
    fdist.plot(num_words)
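
# Illustrative calls for plot_freq_dist above (made-up text; matplotlib.pyplot as plt and
# nltk's bigrams are assumed to be imported, as the function itself requires):
sample = "the cat sat on the mat and the cat slept on the mat"
plot_freq_dist(sample, n_gram=1, num_words=10)  # unigram frequencies
plot_freq_dist(sample, n_gram=2, num_words=10)  # bigram frequencies
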
def wordFreqDist(num, toxic_wordsSW):
    from nltk import FreqDist
    # Creating the word frequency distribution
    wordFreqDist = FreqDist(toxic_wordsSW)
    # Plotting the word frequency distribution
    wordFreqDist.plot(num)
    return wordFreqDist
Example 8
def word_count(document):
    words = get_words(document["content"])
    stemmer = EnglishStemmer()
    words = [stemmer.stem(word) for word in words]
    fdist = FreqDist(words)
    for word, frequency in fdist.most_common(50):
        print(u'{};{}'.format(word, frequency))

    fdist.plot(30, cumulative=False)
Example 9
    def plot_frequency_distribution(self, n):
        """
        @brief      Plots a frequency distribution plot.

        @param      self  The object
        @param      n     The desired number of most frequent words
        """
        assert (isinstance(n, int))
        fd = FreqDist(self.tokenized_text)
        fd.plot(n, cumulative=True)
Example 10
def words_length_distribution(text):
    words = del_stopwords(text)
    plt.ion()
    fig = plt.figure(figsize=(10, 4))
    fig.suptitle("WORDS LENGTH DISTRIBUTION")
    plt.gcf().subplots_adjust(bottom=0.15)
    fdist = FreqDist(len(token) for token in words)
    fdist.plot(30, cumulative=False)
    fig.savefig('wldis.png', bbox_inches="tight")
    return fdist.most_common(10)
Example 11
def words_distribution(text):
    words = lemmas(text)
    plt.ion()
    fig = plt.figure(figsize=(10, 4))
    fig.suptitle("WORDS DISTRIBUTION")
    plt.gcf().subplots_adjust(bottom=0.15)
    fdist = FreqDist(words)
    fdist.plot(30, cumulative=False)
    fig.savefig('wdis.png', bbox_inches="tight")
    return fdist.most_common(10)
Example 12
def main():
    # frequency distribution for text1
    fd_text1 = FreqDist(book.text1)

    print(str.format('Frequency distribution object: {}', fd_text1))

    print(str.format(
        '50 most common words: {}', fd_text1.most_common(50)
    ))

    fd_text1.plot(50, cumulative=True)
Example 13
def calc_freq_dist(tokens):
    stop_words = set(stopwords.words('english'))  # build the stopword set once
    cleaned_tokens = [t for t in tokens if t.isalpha() and t not in stop_words]

    #print(set(tokens) - set(cleaned_tokens))
    freq = FreqDist(cleaned_tokens)
    # for key,val in freq.items():
    #     print(str(key) + ':' + str(val))
    freq.plot(20, cumulative=False)
Example 14
def NAICS_word_freq():
    part1 = pd.read_csv('data/Part 1.csv', low_memory=False)
    labels_str = ''
    labels = list(set(
        part1['NAICS.display-label']))  # verified already to be all strings
    for item in labels:
        labels_str = item + " " + labels_str

    words = labels_str.split(" ")
    freqDist = FreqDist(words)
    freqDist.plot(10)
def plot_freqdist_from_series(pd_series,
                              tokenizer_obj=default_tk,
                              stop_words_list=gen_stop_words,
                              title='Term Frequency distribution',
                              num_terms=20,
                              figsize=(10, 10),
                              ngram_number=1,
                              lower_case=True):
    """Function that takes in a Pandas Series or column of a DataFrame and plots the Frequency Distribution
    of termns within that list of documents.
    Args:
    pd_series - either a standalone Pandas Series object or a dataframe column, e.g. df.job_description
    tokenizer_obj - (obj) a tokenizer object, normally of the NLTK variety
    num_terms - (int) how many of the top terms to plot on the Freq Dist, default 20
    stop_words - (list of str) list of stop words to exclude from final corpus
    figsize - (tuple of 2 integers) size of matplotlib plot, default is (10,10)
    ngram_numer - (int) what size ngrams to use, expects 1, 2 or 3. Default is 1.
                Values outside that list will just return the default. 
    lower_case - (bool) whether to return all words lowercased or not
    
    
    Plot of the Frequency Distribution of the words in the corpus, using NLTK's built in FreqDist function.

    Returns:
    f_dist_dict - (dict) ngrams as keys; frequency as value
    """
    all_text_lst = []
    for string in pd_series.tolist():
        output_txt = ''
        tokenized_str = tokenizer_obj.tokenize(string)
        for word in tokenized_str:
            if ((word.lower() not in stop_words_list)
                    and (word not in stop_words_list)):
                if lower_case:
                    output_txt += word.lower() + ' '
                else:
                    output_txt += word + ' '
            else:
                continue
        ngram_list = list(
            nltk.ngrams(output_txt.split(' ')[:-1], n=ngram_number))
        for ngram in ngram_list:
            all_text_lst.append(ngram)

    f_dist = FreqDist(all_text_lst)

    f_dist_dict = dict(f_dist)

    plt.figure(figsize=figsize)
    plt.title(title)
    f_dist.plot(num_terms)
    plt.show()

    return f_dist_dict
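
# Hypothetical call for plot_freqdist_from_series above, passing an explicit tokenizer and
# stop-word list so the sketch does not depend on the module-level default_tk / gen_stop_words:
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import TreebankWordTokenizer

jobs = pd.Series([
    "Senior data scientist with Python and SQL",
    "Data analyst familiar with SQL and dashboards",
])
bigram_counts = plot_freqdist_from_series(
    jobs,
    tokenizer_obj=TreebankWordTokenizer(),
    stop_words_list=stopwords.words('english'),
    num_terms=10,
    ngram_number=2,
)
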
Example 16
 def generar_grafico2(self, lista_datos):
     import nltk
     from nltk import FreqDist
     lista_unica = ""
     for respuesta_encuesta in lista_datos:
         for respuesta_pregunta in respuesta_encuesta:
             for palabra in respuesta_pregunta:
                 lista_unica += palabra + " "
     tokens = nltk.word_tokenize(lista_unica)
     fdist = FreqDist(tokens)
     print(fdist.keys())
     print(fdist.values())
     fdist.plot(30, cumulative=False)
Example 17
    def fdistByGroup(self, values, title, dataset):

        for index, row in values.iterrows():
            fdist1 = FreqDist(row['tokens'])
            plt.ion()
            fdist1.plot(25,
                        cumulative=False,
                        title=(title + " (" + index + ") " + dataset))
            plt.tight_layout()

            plt.savefig(self.mkFileName("wfdist" + dataset + " " + index))
            plt.ioff()
            plt.close()
Example 18
    def fdistCumulative(self, values, title, dataset):

        allContents = []
        for wordList in values['tokens']:
            allContents += wordList

        fdist1 = FreqDist(allContents)

        plt.ion()
        fdist1.plot(25, cumulative=False, title=title + " " + dataset)
        plt.tight_layout()
        plt.savefig(self.mkFileName("wfdist" + dataset))
        plt.ioff()
        plt.close()
def text_show(words_lists):
    """
    Text analysis
    """
    freq = FreqDist(words_lists)
    for key, val in freq.items():
        print(str(key) + ':' + str(val))
    # line-chart visualization
    freq.plot(20, cumulative=False)
    # word-cloud visualization
    words = ' '.join(words_lists)
    wc = WordCloud().generate(words)
    plt.imshow(wc, interpolation='bilinear')
    plt.axis("off")
    plt.show()
Example 20
def create_freqdist_plot(artist, songs):
    wordlist = []
    for song in songs:
        try:
            s = get_lyrics(get_artist("eminem"), get_song(song))
            s = s.split("\n")
            s = "".join(s)
            s = s.split(" ")
            wordlist.extend(s)
            print("Got lyrics for ", song)
        except:
            print("Error getting  lyrics for ", song)
            continue
    plt.figure(figsize=(19, 10))
    freqDist = FreqDist(wordlist)
    freqDist.plot(50)
Example 21
def analyze(inputfile):

    file = open(inputfile, "rt")

    text = file.read()
    file.close()

    # split into words
    tokens = word_tokenize(text)
    # convert to lower case
    tokens = [w.lower() for w in tokens]
    # remove punctuation from each word
    table = str.maketrans('', '', string.punctuation)
    stripped = [w.translate(table) for w in tokens]
    # remove remaining tokens that are not alphabetic
    words = [word for word in stripped if word.isalpha()]
    # filter out stop words
    stop_words = set(stopwords.words('english'))
    words = [w for w in words if not w in stop_words]
    junk_words = ['nt']
    words = [w for w in words if not w in junk_words]
    print(words[:100])

    freqDist = FreqDist(words)
    words = list(freqDist.keys())

    freqDist.plot(50)
Example 22
def program(pages):
    final_list = []
    cleaned_list = []
    for p in pages:
        link = f"https://www.indeed.com/jobs?q={job}&l={city}{p}"
        page = requests.get(link)
        document = page.text
        soup = BeautifulSoup(document, 'html.parser')
        description = soup.find_all("div", {'class': 'summary'})

        for i in description:
            text = i.text.strip()
            final_list.append(text)

    # Tokenize and clean once, after all pages have been collected, so postings
    # from earlier pages are not counted more than once.
    for f in final_list:
        tokens = tokenizer.tokenize(f)
        clean = [w for w in tokens if w not in stop_words]
        lowercase = [w.lower() for w in clean]
        filtered_words = [
            x for x in lowercase if x not in job and x not in city
        ]
        cleaned_list += filtered_words

    fdist1 = FreqDist(cleaned_list)
    stop = timeit.default_timer()
    print('Time: ', stop - start)
    return fdist1.plot(
        20,
        title=
        f"Top 20 keywords for {job} jobs on indeed.com\nCity: {city}\nnumber of postings={len(pages)*10}"
    )
Example 23
def freq(tokens, n=None):
    '''
    This function takes a list of tokens and returns a list of the top n most
    frequent tokens (the top 20 if n is not given).

    It also plots a cumulative frequency distribution of the top 50 tokens.
    '''
    fdist2 = FreqDist(tokens)
    fdist2.plot(50, cumulative=True)
    if n is None:
        n = 20
    top = fdist2.most_common(n)  # (token, count) pairs, most frequent first
    print(top)
    return [token for token, count in top]
Example 24
def getFrequency(moviename, subfilepath):
    file = open(subfilepath, "rt")
    text = file.read()
    file.close()

    # split into words
    tokenized = word_tokenize(text)
    # remove uppercase words, usually sounds
    tokens = []
    for token in tokenized:
        if token.isupper():
            print(token)
        else:
            tokens.append(token)
    # convert to lower case
    tokens = [w.lower() for w in tokens]
    # remove punctuation from each word
    table = str.maketrans('', '', string.punctuation)
    stripped = [w.translate(table) for w in tokens]
    # remove remaining tokens that are not alphabetic
    words = [word for word in stripped if word.isalpha()]
    # filter out stop words
    stop_words = set(stopwords.words('english'))
    words = [w for w in words if not w in stop_words]
    #until I implement my own tokenizer, exclude contractions
    # junk_words = ['nt', 'na', 'gon', 'won']
    junk_words = ['nt', 'na', 'gon', 'won', 'got', 'get', 'go', 'la']
    words = [w for w in words if not w in junk_words]
    # test print first 100 words
    # print(words[:100])

    freqDist = FreqDist(words)
    words = list(freqDist.keys())

    # print(freqDist.plot(10))

    fig = plt.figure(figsize=(10, 4))
    plt.gcf().subplots_adjust(bottom=0.15)  # to avoid x-ticks cut-off
    plt.xlabel('words', fontsize=18)
    plt.ylabel('times said', fontsize=16)
    fdist = FreqDist(freqDist)
    fdist.plot(10,
               cumulative=False,
               title="Most Frequently Used Words in " + moviename)
    plt.show()
    # fig.suptitle('test title', fontsize=20)
    fig.savefig(moviename + '.png', bbox_inches="tight")
Example 25
def main():
    obj = TweetFilter()
    with open("NBA_Warriors.txt", "r") as myFile:
        data = myFile.read().replace('\n',' ')
        data = unicode(data, 'utf-8')

    # This tokenizes each of the word in data
    tokenz = word_tokenize(data)

    # This passes the tokenz to the filter function
    newTokenz = obj.filter(tokenz)

    # Run a frequency distribution on the entire word list
    fdist1 = FreqDist(newTokenz)

    # Plots the top 30 words
    fdist1.plot(30, cumulative=False)
Example 26
def q5(cleaned_corpus_tokens, x):

    # Construct a frequency distribution over the lowercased tokens in the document
    #fd_doc_tokens = ...

    fd_doc_tokens = FreqDist(cleaned_corpus_tokens)

    # Find the top x most frequently used tokens in the document
    #top_tokens = ...
    top_tokens = fd_doc_tokens.most_common(x)

    # Produce a plot showing the top x tokens and their frequencies
    #...
    fd_doc_tokens.plot(x)

    # Return the top x most frequently used tokens
    return top_tokens
Example 27
def fenci(data):

    data = re.compile(r'\s+').sub('', data)
    data = re.compile(r'\!\[.*?\]\(.*?\)').sub('', data)

    seg_list = jieba.cut(data)

    # keyword extraction based on the TF-IDF algorithm
    tags = jieba.analyse.extract_tags(data, topK=50)
    print(','.join(tags))

    # keyword extraction based on the TextRank algorithm
    tags2 = jieba.analyse.textrank(data, topK=50)
    print(','.join(tags2))


    fdist = FreqDist([seg for seg in seg_list])
    fdist.plot(50)
Example 28
def q3(corpus, list_of_files, x):
    corpus_tokens = []

    # Get a list of all tokens in the corpus
    corpus_tokens = get_corpus_tokens(corpus, list_of_files)

    # Construct a frequency distribution over the lowercased tokens in the document
    #fd_doc_tokens = ...
    fd_doc_tokens = FreqDist(corpus_tokens)

    # Find the top x most frequently used tokens in the document
    #top_tokens = ...
    top_tokens = fd_doc_tokens.most_common(x)

    # Produce a plot showing the top x tokens and their frequencies
    #...
    fd_doc_tokens.plot(x)

    # Return the top x most frequently used tokens
    return top_tokens
Example 29
def plot(tokens: Iterable[str], *, first_n: int=None, omit: Iterable[str]=None):
    omit = list(omit) if omit is not None else []  # copy so the caller's list is not mutated
    omit.extend(stopwords.words("english"))

    def is_clean(s: str):
        return all([
            len(s) > 2, 
            s.isalpha(), 
            s not in omit, 
        ])

    clean_tokens = filter(is_clean, tokens)

    freq = FreqDist(clean_tokens)

    if first_n is None:
        freq.plot()
    else:
        freq.plot(first_n)
Example 30
def lemmas_distribution_rus(dist):
    dict_file = codecs.open('literature/processed_vocabulary',encoding='utf-8')
    dict_text = dict_file.readlines()
    dict_file.close()
    dict_dict = {}
    import pymorphy2
    morph = pymorphy2.MorphAnalyzer()
    from collections import defaultdict
    lemmas_dist = defaultdict(int)    
    for line in dict_text:
        line_list = line.split(':')
        dict_dict[line_list[0]] = line_list[1]
    for word in dist.keys():
        if word in dict_dict:
            lemmas_dist[dict_dict[word]] += 1
        else:
            p = morph.parse(word)
            if len(p) > 0:
                print(word)
                print(p[0].normal_form)
                lemmas_dist[p[0].normal_form] += 1
                print(lemmas_dist[p[0].normal_form])
    lemmas_dist = FreqDist(lemmas_dist)
    lemmas_dist.plot(100)
Example 31
class CountFreq(object):
    def __init__(self,  *args, **kwargs):
        self.txt_file = codecs.open('new1.txt', encoding='utf-8')
        self.stop_words = stopwords.words('english')
        self.clean_words = []
        self.loose_words = loose_words

    def clean_text(self):
        '''
        this method will clean all the data in new1.txt as well as transfer the data from a text file to
        a tokenized format that will be readily available for nltk to work with.
        :return: sanitized and tokenized words.
        '''
        stop = self.stop_words
        text = self.txt_file
        for lines in text:
            clean_words = [word for word in lines.lower().split() if word not in stop]
            self.clean_words.append(clean_words)
        self.clean_words = [val for sublist in self.clean_words for val in sublist]
        return self.clean_words

    def word_freq(self):
        '''
        Single-word frequency without any context. The distribution keeps only nouns (POS tag
        'NN') that are not in self.loose_words; graph_freq() then plots the 100 most common.
        :return: the frequency distribution object.
        '''
        classified_text = pt(self.clean_words)
        noun_descriptor = [word for word, pos in classified_text if pos == 'NN']
        revised_noun_descriptor = [word for word in noun_descriptor if word not in self.loose_words]
        self.fdist = FreqDist(revised_noun_descriptor)
        return self.fdist

    def graph_freq(self, cumulative):
        '''
        :param cumulative: Boolean; when True the cumulative counts are plotted, producing a
        diminishing-returns style curve.
        :return: a matplotlib graph of the 100 most frequent nouns
        '''

        return self.fdist.plot(100, cumulative=cumulative)
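
# Possible usage of the CountFreq class above (assumes new1.txt, the module-level loose_words
# list, and the POS-tagger alias `pt` exist, as in the original project):
cf = CountFreq()
cf.clean_text()
cf.word_freq()
cf.graph_freq(cumulative=False)
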
Example 32
File: kd.py Project: dmml/NLTK
def stem(word):
    regexp = r'^(.*?)(ing|ly|ed|ious|ies|ive|es|s|ment)?$'
    stem, suffix = re.findall(regexp, word)[0]
    return stem


def lexical_diversity(text):
    return len(text) / len(set(text))

nostop_title = lemma(remove_stopwords(text_title))
nltk.Text(nostop_title).collocations()
# Frequency distribution of text
fdist_title = FreqDist(nostop_title)
fdist_title.most_common(50)
fdist_title.max()
fdist_title.plot(50, cumulative=True)#plot
fdist_title.plot(50)
total_words = len(set(nostop_title))
print("The total number of words in title of KD is: " + str(total_words))
avg_words = fdist_title.N()/total_words
print("Each word appears in title of KD is: " + str(int(avg_words)))


# process for text
f = open('kdtext.txt', encoding="latin-1")
raw_text = f.read()
# type
type(raw_text)
tokens = word_tokenize(raw_text)
type(tokens)
len(tokens)
Example 33
import json

from textstat.textstat import textstat

from nltk import FreqDist
from matplotlib.pyplot import *

filename = 'bieber-raw-test.json'
READ = 'rb'
TEXT=1
stopwords = open('stopwords',READ).read().splitlines()
tweets = json.load(open(filename,READ))
#Identify retweets

words = ' '.join([tweet['text'] for tweet in tweets]).split()

fdist = FreqDist(words)

fdist.plot(20)
tight_layout()
len(all_words)

without_stopwords = [w for w in all_words if not w in stop_words]

len(without_stopwords)

words_len = [w for w in without_stopwords if not len(w) < 5]

len(words_len)

freq_dist = FreqDist([word for word in words_len])

plt.figure(figsize=(12, 6))
plt.title(f'Frequency Distribution (Insincere Questions, Top 50 words)')
freq_dist.plot(50, marker='|', markersize=20)
plt.show()

from wordcloud import WordCloud

wordcloud = WordCloud(
    background_color='White').generate_from_frequencies(freq_dist)
plt.figure(figsize=(12, 6))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()
"""## Splitting dataset into train and test"""

from sklearn.model_selection import train_test_split
train_X, test_X, train_y, test_y = train_test_split(X,
                                                    y,
Example 35
## MAYBE USE THIS?
# remove small words
# elected not to use this
finder2.apply_ngram_filter(lambda w1, w2: len(w1) < 2)
scored = finder2.score_ngrams(bigram_measures.raw_freq)
for bscore in scored[:20]:
    print(bscore)

# need to stem, but really only want to stem "horse" and "horses"

# First list the top 50 words by frequency (normalized by the length of the document)
bbDist = FreqDist(sbbWords)
bbDist2 = DictionaryProbDist(bbDist, normalize=True)
bbDist2.prob('black')
bbDist2.prob('horse')
bbDist.plot(50)
# need to normalize the second number by dividing it by len(sbbWords)
bbItems = bbDist.most_common(50)
# Show the normalized probability
for item in bbItems:
    print(item)

# King of the Wind Frequency Distribution
kwDist = FreqDist(skwWords)
kwDist2 = DictionaryProbDist(kwDist, normalize=True)
kwDist2.prob('said')
kwDist2.prob('agba')
kwDist.plot(50)
# need to normalize the second number by dividing it by len(skwWords)
kwItems = kwDist.most_common(50)
for item in kwItems:
    print(item)

def plot_frequency_distribution(text, number_of_words):
    freq_dist = FreqDist(text)
    freq_dist.plot(number_of_words)
    plot_freqdist_freq(freq_dist, number_of_words)
Example 37
print(count_vect.vocabulary_.get(u'algorithm'))

#text classification algorithm
clf = SGDClassifier().fit(X_train_tfidf, train_labels)
# clf = svm.SVC().fit(X_train_tfidf, train_labels)
# clf = svm.SVC(kernel='linear', probability=True, class_weight='auto').fit(X_train_tfidf, train_labels)

#convert the test data to tf-idf vectors
X_new_counts = count_vect.transform(test_gotg)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)

#predict labels for the test data
predicted = clf.predict(X_new_tfidf)

#print the predicted label for each test document
score_arr = []
for category in predicted:
    print(category)
    score_arr.append(category)

#check accuracy
X_old_counts = count_vect.transform(train_sentences)
X_old_tfidf = tfidf_transformer.transform(X_old_counts)
predicted_train = clf.predict(X_old_tfidf)
# print('Accuracy:')
# print(np.mean(predicted == test_labels))
score_fd = FreqDist(score_arr)
score_fd.plot(cumulative=False)
# for doc, category in zip(docs_new, predicted):
#   print('%r,%s' % (doc, predicted))
Example 38
# We can find the FIRST position of given word:
myText.index('about')

# Frequency distribution
from nltk import FreqDist

fdist1 = FreqDist(text1)

vocabulary = fdist1.keys()

frequencies = fdist1.values()

fdist1['whale']

fdist1.plot(20)

fdist1.plot(20, cumulative = True)   

# List comprehension
# Counting the number of characters in each word in a text
[len(w) for w in text1]

# Bigram function returns a list of bigrams
from nltk import bigrams, trigrams

bigrams(myText2)

trigrams(myText2)

bigramsText1 = list(bigrams(text1))  # bigramsText1[0] is the tuple containing the first bigram (bigrams() returns a generator in NLTK 3)
Example 39
from nltk.corpus import brown
brown.words()

# Find the frequency of each word in a text
fd = FreqDist(brown.words())

# Find the most frequent words in a text:
# http://stackoverflow.com/questions/268272/getting-key-with-maximum-value-in-dictionary
import operator
max(fd.items(), key=operator.itemgetter(1))
sorted(fd.items(), key=operator.itemgetter(1), reverse=True)[:10]
# Or use the wrapper function
fd.most_common(10)

# plot the most frequent words
fd.plot(10)
fd.plot(10, cumulative=True)

# See the words that occur only once (these words are called hapaxes)
fd.hapaxes()

# Count all the words
len(text1)
# count unique words
len(set(text1))
# count unique words, irrespective of word case
len(set(w.lower() for w in text1))


# Find the words that are more than 15 characters long
words = set(brown.words())
Example 40
def freq(remstop):
    fdist2 = FreqDist(remstop)
    x = fdist2.most_common(20)  # top 20 (token, count) pairs
    fdist2.plot(50, cumulative=True)
Example 41
#!/usr/bin/python
# coding: utf-8

# 2013/03/20

from nltk import FreqDist

# NLTK 2-era cheat sheet; in NLTK 3, FreqDist subclasses collections.Counter,
# fdist.inc(sample) becomes fdist[sample] += 1, and keys() is no longer sorted by frequency.
fdist = FreqDist(samples)  # build a frequency distribution from the data in samples
fdist.inc(sample)  # increment the count of the given sample by 1 (NLTK 2 only)
fdist['data']  # number of occurrences of the given item
fdist.freq('data')  # relative frequency of the given item
fdist.N()  # total number of samples
fdist.keys()  # samples, sorted by frequency in NLTK 2 (use most_common() in NLTK 3)
for sample in fdist:  # iterate over the samples in frequency order (NLTK 2)
    pass
fdist.max()  # the sample with the highest count
fdist.tabulate()  # display the frequency distribution as a table
fdist.plot()  # plot the frequency distribution
fdist.plot(cumulative=True)  # cumulative plot
fdist1 < fdist2  # test whether samples in fdist1 occur less frequently than in fdist2


Example 42
def _expt0_answerhist():
  hist = answerhist()

  fdist = FreqDist(hist)
  fdist.plot(50)
Example 43
        # soup = BeautifulSoup(open(path))
        # chapter = soup.findAll(text=True)[0]
        file = open(path)
        chapter = file.read()
        chapter_tuple = (chapter, 'real')

        words = [ w.lower() for w in word_tokenize(chapter) ]

        real_chapters.append(chapter_tuple)
        real_words.extend(words)

word_total  = len(real_words)
harry_total = real_words.count('harry')

fd = FreqDist(real_words)
fd.plot(26)

# filtered_real_words = [ w.lower() for w in real_words if w.isalpha() ]
filtered_real_words = [ w for w in real_words if w.isalpha() and w not in stop ]

Rowling = filtered_real_words

fd  = FreqDist(filtered_real_words)
fd.plot(26)

file = open('ao_hp_stories.jl')

ao_chapters = []
ao_words    = []
AO3         = []
AO3_normed  = []
Example 44
preprocessedStory = preprocess(storytext)
tokens = nltk.word_tokenize(preprocessedStory)
print(tokens[0:20])
len(tokens)


stop = stopwords.words('english')
remstop = [i for i in tokens if i not in stop]
remstop[0:20]
len(remstop)

# 5810 tokens --> 2670 without stopwords
fdist2 = FreqDist(remstop)
print(fdist2)
fdist2.most_common()[:20]
fdist2.plot(50, cumulative=True)


# Turn it into an nltk text object
SpeechText = nltk.Text(tokens)
SpeechText.concordance('america', lines=47)
SpeechText.concordance('negro', lines=38)
SpeechText.concordance('nation', lines=38)
SpeechText.concordance('white', lines=38)
SpeechText.concordance('negroes', lines=38)
SpeechText.concordance('struggle', lines=38)
SpeechText.concordance('justice', lines=38)
SpeechText.concordance('problems', lines=38)
SpeechText.concordance('freedom', lines=38)
SpeechText.concordance('rights', lines=38)
Example 45
def create_dist(nltk_text, stopwords):
    dist = FreqDist(w.lower() for w in nltk_text if len(w)>=3 and w.isalnum() and w.lower() not in stopwords)
    dist.plot(50)
    print "Number of wordforms"+str(len(dist))
    return dist
Example 46
from nltk.corpus import words
from nltk import ConditionalFreqDist as CFreqDist , FreqDist

cfd = CFreqDist([(w[0] , len(w)) for w in words.words()])
cfd.plot()
fd = FreqDist([w[0] for w in words.words()])
fd.plot()
Example 47
# Word and sentence tokenization
tokenized_sentences = sent_tokenize(webtext.raw(file_path))
#tokenized_words = reduce(operator.concat, [word_tokenize(s) for s in tokenized_sentences])

tokenizer = RegexpTokenizer(r'\w+')

stop = stopwords.words('english') + list(string.punctuation)
raw_tokens = tokenizer.tokenize(webtext.raw(file_path).lower())
tokens = [i for i in raw_tokens if i not in stop]

# Convert to nltk text
text = Text(tokens)

# Freq dist
fdist = FreqDist(text)
fdist.plot(num_of_words_to_plot, cumulative = False)
scarlet_commons = [word for word, counts in fdist.most_common(num_of_words_compare)]
print('Most common words for Study in Scarlet:\n', fdist.most_common(num_of_words_to_plot), '\n')

# Moby Dick frequencies
moby_raw_text = gutenberg.raw(moby_file_name)
moby_tokens = tokenizer.tokenize(moby_raw_text.lower())
moby_text = Text([w for w in moby_tokens if w not in stop])
fdist_moby = FreqDist(moby_text)
moby_commons = [word for word, counts in fdist_moby.most_common(num_of_words_compare)]
print('Most common words for Moby Dick:\n', fdist_moby.most_common(num_of_words_compare))

# Frequencies comparison
## In scarlet but not in moby
diff_scarlet_vs_moby = [word for word in scarlet_commons if word not in moby_commons]
print('=============\nIn Study in scarlet, but not in Moby Dick: ', ', '.join(diff_scarlet_vs_moby))
Example 48
File: dsc.py Project: dmml/NLTK

# lemma
def lemma(text):
    lmtzr = WordNetLemmatizer()
    return [lmtzr.lemmatize(w) for w in text]

nostop_title = lemma(remove_stopwords(text_title))
# check the collocations of text
nostop_title = nltk.Text(nostop_title)
nostop_title.collocations()
fdist_title = FreqDist(nostop_title)  # Frequency distribution of text
fdist_title.most_common(50)  # most common 50
fdist_title['science']  # return count of a given word
fdist_title.max()  # max counts
fdist_title.plot(50, cumulative=True)  # plot
fdist_title.plot(50)
fdist_title.tabulate(50)  # tabulate
total_words = len(set(nostop_title))
print("The total number of words in title of dsc is: " + str(total_words))
avg_words = fdist_title.N() / total_words
print("Each word appears in title of dsc is: " + str(int(avg_words)))

# bigrams, trigrams
from nltk import bigrams
from nltk import trigrams
word_pair = list(bigrams(nostop_title))
word_triple = list(trigrams(nostop_title))
bigrams_title = FreqDist(word_pair)
trigrams_title = FreqDist(word_triple)
bigrams_title.most_common(50)
Example 49
word_len = [len(w) for w in text1]
print(word_len)





# Example	Description
# fdist = FreqDist(samples)	create a frequency distribution containing the given samples
# fdist[sample] += 1	increment the count for this sample
# fdist['monstrous']	count of the number of times a given sample occurred
# fdist.freq('monstrous')	frequency of a given sample
# fdist.N()	total number of samples
# fdist.most_common(n)	the n most common samples and their frequencies
# for sample in fdist:	iterate over the samples
# fdist.max()	sample with the greatest count
# fdist.tabulate()	tabulate the frequency distribution
# fdist.plot()	graphical plot of the frequency distribution
# fdist.plot(cumulative=True)	cumulative plot of the frequency distribution
# fdist1 |= fdist2	update fdist1 with counts from fdist2
# fdist1 < fdist2	test if samples in fdist1 occur less frequently than in fdist2

fdlist = FreqDist(len(w) for w in text1)
print(dict(fdlist))
print(fdlist.most_common(3))
print(fdlist.max())
print(fdlist[2])
fdlist.tabulate()
fdlist.plot()
fdlist.plot(cumulative=True)
Example 50
import matplotlib
import string

exclude = set(string.punctuation)

with open("YT_Comment_Output.txt", "rb") as f:
	lines = [line.rstrip() for line in f]
	splits = [line.split() for line in lines]
	some_upper = [item for sublist in splits for item in sublist]
	#replace BOM w known stopword
	BOM_gone = [word.replace('\xef\xbb\xbf', 'i') for word in some_upper]
	punct_gone = []
	for word in BOM_gone: 		
		punct_gone.append(''.join(ch for ch in word if ch not in exclude))
	YT_comment_words = [word.lower() for word in punct_gone]

with open('stopwords.txt', 'rb') as f:
    stopwords = [line.rstrip() for line in f]

print YT_comment_words[:10]
print stopwords[:10]

filtered_words = [w for w in YT_comment_words if not w in stopwords]

print filtered_words[:10]

fd = FreqDist(filtered_words)
print fd.values()[:10]
print fd
fd.plot(30)
# Getting the frequency distribution of individual words in the reviews
fd = FreqDist()
for word in cleaned_reviews:
    fd[word] += 1
    
# Examining the top 5 most frequent words
fd.most_common(5)


# In[ ]:


# Plotting the top 50 most frequent words
plt.figure(figsize = (10, 5))
fd.plot(50)
plt.show()


# ### Observations
# Of the 50 most frequent words across customer reviews, six reveal food preferences: **chicken, biryani, veg, pizza, rice, paneer**. The only negative word in the top 50 is "bad".
# 
# Factors contributing to restaurant experience are mentioned in the following (descending) order of frequency: place > taste > service > time > ambience > staff > quality > delivery > menu > quantity > friendly.
# 
# Now let us repeat the analysis on a bi-gram level. Bi-grams are pairs of words which can provide better context than individual words.

# In[ ]:


# Generating bigrams from the reviews
bigrams = bigrams(cleaned_reviews)
# Alternatively, the same counts can be obtained with collections.Counter
from collections import Counter
print(Counter(fd).most_common(5))  # [(',', 6750), ('the', 3120), ('to', 2775), ('.', 2741), ('and', 2739)]
# Jane Austen's Persuasion contains 98,171 tokens and 6,141 unique word types; the most common token is the comma, followed by the word "the".
# If you gather statistics over a large corpus and record each word's count in a table sorted from most to least frequent, a clear relationship between word frequency and word rank emerges.

# In fact, Zipf showed that this relationship can be expressed mathematically: for any given word, f * r = k (a constant);
# f is the word's frequency, r is its rank in the sorted list, and k is a constant.
# A more elaborate form is f * r = 1 / log(N), where N is the total number of words.
# For example, the fifth most frequent word should occur roughly twice as often as the tenth. In the NLP literature this relationship is usually called "Zipf's Law".

# Even if the mathematical relationship described by Zipf's law is not perfectly accurate, it is still useful for characterizing how words are distributed in human language: low-rank words occur very often,
# somewhat higher-rank words occur less often, and very high-rank words hardly occur at all; the corresponding log-log relationship (Figure 1) makes this scaling in our corpus easy to see.
##################################################################
## Plotting Zipf's law with NLTK
import matplotlib.pyplot as plt
fd = FreqDist(gutenberg.words(gutenberg.fileids()))  # count every token in the Gutenberg corpus
print(fd)  # <FreqDist with 51156 samples and 2621613 outcomes>; 51,156 unique word types, 2,621,613 tokens
fd.plot(50, title='hello', cumulative=True)  # cumulative plot
fd.plot(50)  # counts of the top 50 tokens

## The plain matplotlib approach, for comparison with the above; loglog() can be used
freqs = []  # list to hold the count of each of the top-ranked words
for word, count in fd.most_common(500): freqs.append(count)  # collect the counts of the 500 highest-ranked words
plt.subplot(2, 1, 1); plt.plot(range(500), freqs)
plt.subplot(2, 1, 2); plt.loglog(range(500), freqs)  # show the rank-frequency relationship on a log-log plot
plt.xlabel('rank(r)', fontsize=14, fontweight='bold')
plt.ylabel('frequency(f)', fontsize=14, fontweight='bold')
plt.grid(True)
plt.show()
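
# A small numeric check of the f * r = k relationship described above (added for illustration;
# it reuses the `fd` FreqDist over the Gutenberg corpus built earlier): the products below
# should stay in the same order of magnitude if Zipf's law roughly holds.
for rank, (word, count) in enumerate(fd.most_common(10), start=1):
    print(word, count, rank, count * rank)
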
Example 53
# We can find the FIRST position of given word:
myText.index('about')

# Frequency distribution
from nltk import FreqDist

fdist1 = FreqDist(text1)

vocabulary = fdist1.keys()

frequencies = fdist1.values()

fdist1['whale']

fdist1.plot(20)

fdist1.plot(20, cumulative=True)

# Finding the really long words (using Pythons list comprehension):
V = set(text1)

[w for w in V if len(w) > 15]

# Note that the variable w in the list comprehension is just a dummy. The following gives the same result
[whatever for whatever in V if len(whatever) > 15]

# Finding the long words (more than seven letters) that appear more than seven times
[w for w in V if len(w) > 7 and fdist1[w] > 7]

# Counting the number of characters in each word in a text
Example 54
data = (('woman', ('this', 'bought')),
        ('man', ('this', 'looked')))

c = CFD(data)

# returns a FreqDist object
print(c['woman'])

# the two statements below give the same value here (a count of 1 vs. a relative frequency of 1.0)
print(c['woman'][('this', 'bought')])
print(c['woman'].freq(('this', 'bought')))

# accessing a condition that does not exist
print(c['a'])

# feed the same data to a plain FreqDist (FD) for comparison
f = FD(data)
print(f)

# initialization result when no data is provided
print(CFD())

#===============================================================================
# Test plot
#===============================================================================

import matplotlib
import matplotlib.pyplot as plt

f.plot(cumulative=True)