Example #1
0
def ch03_42_wordnet_semantic_index():
    import nltk
    from nltk.corpus import webtext
    from nltk.corpus import wordnet as wn
    postings = []
    docids = {}
    for (pos, fileid) in enumerate(webtext.fileids()):
        docids[pos] = fileid
        words = webtext.words(fileid)
        for wpos, word in enumerate(words):
            # always index the surface form of the word
            postings.append((word.lower(), (pos, wpos)))
            try:
                # also index the offset of the word's first synset and of that
                # synset's first hypernym (offset() is a method in NLTK 3)
                offset = wn.synsets(word)[0].offset()
                postings.append((offset, (pos, wpos)))
                poffset = wn.synsets(word)[0].hypernyms()[0].offset()
                postings.append((poffset, (pos, wpos)))
            except IndexError:
                # the word has no synsets or no hypernyms
                continue
    index = nltk.Index(postings)
    query = "canine"
    qpostings = []
    qpostings.extend([(pos, wpos) for (pos, wpos) in index[query]])
    try:
        offset = wn.synsets(query)[0].offset()
        qpostings.extend([(pos, wpos) for (pos, wpos) in index[offset]])
    except IndexError:
        pass
    for (pos, wpos) in qpostings:
        left = webtext.words(docids[pos])[wpos - 4:wpos]
        right = webtext.words(docids[pos])[wpos:wpos + 4]
        print(left, right)
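For orientation, here is a small hedged check of what the synset-offset keys used by this index look like (exact synset names, offsets and hypernyms depend on the installed WordNet version):

from nltk.corpus import wordnet as wn

syn = wn.synsets('canine')[0]
print(syn.name(), syn.offset())               # first synset of the query term and its offset
print([h.offset() for h in syn.hypernyms()])  # hypernym offsets, which the index also stores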
Example #3
0
def save_other_grams():
    HIGH_FREQ_UNI = 0.01
    HIGH_FREQ_BI = 0.02
    HIGH_FREQ_TRI = 0.02
    other_corpus_unigrams = [
        w.lower()
        for w in (gutenberg.words() + brown.words() + webtext.words())
    ]
    other_corpus_freq_unigrams = high_freq(other_corpus_unigrams,
                                           HIGH_FREQ_UNI)
    output = open('unigrams_data.pkl', 'wb')
    dump(other_corpus_freq_unigrams, output, -1)
    output.close()

    other_corpus_bigrams = nltk.bigrams(other_corpus_unigrams)
    other_corpus_freq_bigrams = high_freq(other_corpus_bigrams, HIGH_FREQ_BI)
    output = open('bigrams_data.pkl', 'wb')
    dump(other_corpus_freq_bigrams, output, -1)
    output.close()

    other_corpus_trigrams = nltk.trigrams(other_corpus_unigrams)
    other_corpus_freq_trigrams = high_freq(other_corpus_trigrams,
                                           HIGH_FREQ_TRI)
    output = open('trigrams_data.pkl', 'wb')
    dump(other_corpus_freq_trigrams, output, -1)
    output.close()
Example #4
0
    def add_known_words(self):
        """Add known words to the spellchecker from external and internal files"""
        # adding known words file if given - these words will not count as misspelled
        if self.known_words_file_paths:
            for known_words_file_path in self.known_words_file_paths:
                self.spellchecker.word_frequency.load_text_file(
                    known_words_file_path)

        # adding the KNOWN_WORDS to the spellchecker recognized words.
        self.spellchecker.word_frequency.load_words(KNOWN_WORDS)

        if self.expand_dictionary:
            # nltk (the Natural Language Toolkit) is a large package containing several corpora.
            # To use it we need to download some of its word lists; we will use the
            # reasonably sized "brown" and "webtext" corpora.
            # To avoid an SSL download error we disable SSL certificate verification.
            try:
                _create_unverified_https_context = ssl._create_unverified_context
            except AttributeError:
                pass
            else:
                ssl._create_default_https_context = _create_unverified_https_context

            # downloading "brown" and "webtext" sets from nltk.
            click.secho(
                "Downloading expanded dictionary, this may take a minute...",
                fg='yellow')
            nltk.download('brown')
            nltk.download('webtext')

            # adding nltk's word set to spellchecker.
            self.spellchecker.word_frequency.load_words(brown.words())
            self.spellchecker.word_frequency.load_words(webtext.words())
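For readers without the surrounding class, a minimal standalone sketch of the same idea, assuming the spellchecker object is a pyspellchecker SpellChecker (which matches the word_frequency.load_words calls above) and that the NLTK corpora can be downloaded:

import nltk
from nltk.corpus import brown, webtext
from spellchecker import SpellChecker  # pyspellchecker package

nltk.download('brown')
nltk.download('webtext')

spell = SpellChecker()
# treat every token of the two corpora as a known word
spell.word_frequency.load_words(brown.words())
spell.word_frequency.load_words(webtext.words())

print(spell.unknown(['hello', 'wrld']))  # words still considered misspelled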
Example #5
0
def freq(arquivo):
    palavras = webtext.words(arquivo)
    that = nltk.FreqDist([s for s in token(palavras) if s == 'that'])
    #that.plot(cumulative = True)
    the = nltk.FreqDist([s for s in token(palavras) if s == 'the'])
    #the.plot(cumulative = True)
    return print(f'The file {arquivo} has {that["that"]} "that" and {the["the"]} "the".')
Example #6
0
def main():
    text = webtext.words("grail.txt")
    # Your code

    for words in text:
        words = words.lower()
        if len(words) == word_length:
            if words_with_wordlength.has_key(words):
                value = words_with_wordlength[words]
                words_with_wordlength[words] = value + 1
            else:
                words_with_wordlength[words] = 1
        if words[0] == letter:
            if words_with_letter.has_key(words):
                value = words_with_letter[words]
                words_with_letter[words] = value + 1
            else:
                words_with_letter[words] = 1

    sorted_words_wordlength = sorted(words_with_wordlength.items(), key=operator.itemgetter(1), reverse=True)

    print "Most frequent word with", word_length, "letters:"
    print sorted_words_wordlength[0][0], ":", sorted_words_wordlength[0][1]

    sorted_words_letter = sorted(words_with_letter.items(), key=operator.itemgetter(1), reverse=True)

    print "Most frequent word starting with", letter, ":"
    print sorted_words_letter[0][0], ":", sorted_words_letter[0][1]

    for words in sorted_words_wordlength:
        if words[0][0] == letter:
            print "Most frequent word with", word_length, "letters and starting with", letter + ":"
            print words[0], ":", words[1]
            break
Example #7
0
def generateSentence():
    corpus = random.randint(0,3)
    if corpus == 0:
        text = brown.words()
    elif corpus == 1:
        text = gutenberg.words()
    elif corpus == 2:
        text = webtext.words()
    elif corpus == 3:
        text = movie_reviews.words()
    tweetString = ''
    lengthOfTweet = random.randint(0,20)
    len(text)
    firstRun = True
    blank = ' '
    startOfWord = ''
    startOfWordIndex = 0
    startingWord = random.randint(0, (len(text) - 40))
    punctuation = [".", ",", '"', ";", ":", "?", "!", ")", "(", "*", "[", "]", "‘", "“", "#"]

    for x in xrange(startingWord, len(text)):
        startOfWord = text[x]
        if startOfWord ==".":
                startOfWordIndex = x
                break

    for x in xrange(startOfWordIndex + 1, startOfWordIndex+lengthOfTweet):
        if text[x] in punctuation:
            tweetString = tweetString + text[x]

        elif text[x] not in punctuation:
            tweetString = tweetString + blank + text[x]
    return tweetString
Example #8
0
def demo(scorer=None, compare_scorer=None):
    """Finds bigram collocations in the files of the WebText corpus."""
    from nltk.metrics import BigramAssocMeasures, spearman_correlation, ranks_from_scores

    if scorer is None:
        scorer = BigramAssocMeasures.likelihood_ratio
    if compare_scorer is None:
        compare_scorer = BigramAssocMeasures.raw_freq

    from nltk.corpus import stopwords, webtext

    ignored_words = stopwords.words('english')
    word_filter = lambda w: len(w) < 3 or w.lower() in ignored_words

    for file in webtext.fileids():
        words = [word.lower()
                 for word in webtext.words(file)]

        cf = BigramCollocationFinder.from_words(words)
        cf.apply_freq_filter(3)
        cf.apply_word_filter(word_filter)

        print(file)
        print('\t', [' '.join(tup) for tup in cf.nbest(scorer, 15)])
        print('\t Correlation to %s: %0.4f' % (compare_scorer.__name__,
                                               spearman_correlation(
                                                   ranks_from_scores(cf.score_ngrams(scorer)),
                                                   ranks_from_scores(cf.score_ngrams(compare_scorer)))))
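As a hedged usage note, the demo accepts any other BigramAssocMeasures scorer; for example, ranking by pointwise mutual information instead of the likelihood ratio:

from nltk.metrics import BigramAssocMeasures

demo(scorer=BigramAssocMeasures.pmi)  # compare PMI-ranked collocations against the raw-frequency baseline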
Example #9
0
def frequency(filter, arq):
    for p in filter:
        print(f'File {arq}'
              f' and frequency of the word {p}:'
              f' {nltk.FreqDist(webtext.words(arq))[p]}')

    return webtext.words(arq)
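The loop above rebuilds the frequency distribution once per query word; a minimal alternative sketch with the same behaviour (the name frequency_once is hypothetical, and nltk and webtext are assumed imported as above) computes the distribution once and reuses it:

def frequency_once(words_of_interest, fileid):
    # build the distribution a single time and look each word up in it
    fdist = nltk.FreqDist(webtext.words(fileid))
    for w in words_of_interest:
        print(f'File {fileid}: frequency of the word {w}: {fdist[w]}')
    return webtext.words(fileid)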
Example #10
0
def demo(scorer=None, compare_scorer=None):
    """Finds bigram collocations in the files of the WebText corpus."""
    from nltk.metrics import BigramAssocMeasures, spearman_correlation, ranks_from_scores

    if scorer is None:
        scorer = BigramAssocMeasures.likelihood_ratio
    if compare_scorer is None:
        compare_scorer = BigramAssocMeasures.raw_freq

    from nltk.corpus import stopwords, webtext

    ignored_words = stopwords.words('english')
    word_filter = lambda w: len(w) < 3 or w.lower() in ignored_words

    for file in webtext.fileids():
        words = [word.lower() for word in webtext.words(file)]

        cf = BigramCollocationFinder.from_words(words)
        cf.apply_freq_filter(3)
        cf.apply_word_filter(word_filter)

        print(file)
        print('\t', [' '.join(tup) for tup in cf.nbest(scorer, 15)])
        print('\t Correlation to %s: %0.4f' %
              (compare_scorer.__name__,
               spearman_correlation(
                   ranks_from_scores(cf.score_ngrams(scorer)),
                   ranks_from_scores(cf.score_ngrams(compare_scorer)))))
def raw_word_generator():
    from nltk.corpus import webtext, reuters, brown, gutenberg

    return (w.lower() for w in itertools.chain(
        brown.words(),
        webtext.words(),
        reuters.words(),
        gutenberg.words(),
    ) if w.isalnum())
Example #12
0
def get_bigrams(filelocation, ratio):
    '''BigramCollocationFinder constructs two frequency distributions: one for each word, 
    and another for bigrams.'''
    words = [w.lower() for w in webtext.words(filelocation)]
    stopset = set(stopwords.words('english'))
    filter_stops = lambda w: len(w) < 3 or w in stopset
    bcf = BigramCollocationFinder.from_words(words)
    bcf.apply_word_filter(filter_stops)
    return bcf.nbest(BigramAssocMeasures.likelihood_ratio, ratio)
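A hedged usage sketch of the function above (it relies on the surrounding imports of webtext, stopwords, BigramCollocationFinder and BigramAssocMeasures; 'grail.txt' is one of the webtext fileids, and the second argument is passed straight to nbest, so it is really the number of bigrams to return):

print(get_bigrams('grail.txt', 10))  # ten highest-scoring bigram collocations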
Example #13
0
def get_trigrams(filelocation, ratio):
    '''In addition to BigramCollocationFinder, there's also TrigramCollocationFinder, which 
    finds triplets instead of pairs.'''
    words = [w.lower() for w in webtext.words(filelocation)]
    stopset = set(stopwords.words('english'))
    filter_stops = lambda w: len(w) < 3 or w in stopset
    tcf = TrigramCollocationFinder.from_words(words)
    tcf.apply_word_filter(filter_stops)
    tcf.apply_freq_filter(3)
    return tcf.nbest(TrigramAssocMeasures.likelihood_ratio, ratio)
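And the trigram counterpart, under the same assumptions:

print(get_trigrams('singles.txt', 10))  # ten highest-scoring trigram collocations from the personals ads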
Example #14
0
def load_data(folder_name):
    filter_list = ['âˇ','ŕş','đžń','é','ĺ','é','ę','ŕť','đžđťđžđłđ','ľŕš','şá','řşů','ç','żŕ','ŕ','î','ŕž','ď','ďż','ŕż','ă','ŕˇ','łŕ','ŕľ','ąŕ','l','ŕ','ŕś','','ŕľ','á','ŕž','ŕ','ů','ř','ŕš','đˇđ','őťőľö','őľő','â','ôźőťőłőąőľőť','đľ','ä','đ','ő','ö','ń','đťđ','đž','post','date','nbsp','cc','âśăłă','âťăšr','âšâś','âťăšr','âšâś','âśâąâ','âśâąâ','âťasian','âťasian','âś','âśăłă','nfsâ','â','nov','com','oct','octn','theâ','aimăš','maniăšre','cm','http','a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u','v','w','x','y','z']
    path_name_lists = glob.glob(ROOT_DIR + folder_name)
    results = []
    # Loading the data
    for path_name in path_name_lists:
        words = [w.lower() for w in webtext.words(path_name)]
        filter_words = [word for word in words if word not in filter_list and word.isalpha()]
        results.extend(filter_words)
        print(len(results))
    return results
def TestSim_irrelevant_corpus():
    '''
    Webtext: Wine
    '''
    wine = ' '.join(webtext.words('wine.txt'))
    tokens = tokenize(wine)
    tokens = [
        tokens[i * 100:(i + 1) * 100] for i in range(int(len(tokens) / 100))
    ]
    vecs = [text2Vec(' '.join(token))[0] for token in tokens]
    print(len(vecs))
    print(vecs[0])
Example #16
0
def webtext():
    from nltk.corpus import webtext as webtext
    from nltk.corpus import nps_chat

    # list comprehension version
    file_ids = [fileid for fileid in webtext.fileids()]
    chat_file_ids = [fileid for fileid in nps_chat.fileids()]

    pirates = webtext.raw('pirates.txt')
    pirates_words = len(webtext.words('pirates.txt'))
    pirates_sents = len(webtext.sents('pirates.txt'))
    uniqs = len(set([w.lower() for w in webtext.words('pirates.txt')]))

    lexical_diversity = lexical_div(uniqs, pirates_words)

    # import nltk.book as book
    # text1 = book.text1
    # pirates = webtext.raw('pirates.txt')

    return render_template('webtext.html',
                           file_ids=file_ids,
                           chat_file_ids=chat_file_ids,
                           pirates=pirates)
Example #17
0
def fun_2_1():
    from nltk.util import ngrams
    from nltk.corpus import alpino

    # A unigram is a single token. The following code generates unigrams
    # for the Alpino corpus.
    print alpino.words()
    unigrams = ngrams(alpino.words(), 1)
    for i in unigrams:
        # print i
        pass

    # Another example: generating quadgrams (fourgrams) from the Alpino corpus.
    quadgrams = ngrams(alpino.words(), 4)
    for i in quadgrams:
        # print i
        pass

    # A bigram is a pair of tokens. To find bigrams in a text, first lowercase
    # the words, build the text as a list of lowercase tokens, and then create
    # a BigramCollocationFinder instance. BigramAssocMeasures, found in the
    # nltk.metrics package, can be used to score bigrams in the text.
    from nltk.collocations import BigramCollocationFinder
    from nltk.corpus import webtext
    from nltk.metrics import BigramAssocMeasures
    tokens = [t.lower() for t in webtext.words('grail.txt')]
    words = BigramCollocationFinder.from_words(tokens)
    print words.nbest(BigramAssocMeasures.likelihood_ratio, 10)

    # Here we can add a word filter that removes stopwords and punctuation.
    from nltk.corpus import stopwords
    set1 = set(stopwords.words('english'))
    stops_filter = lambda w: len(w) < 3 or w in set1
    words.apply_word_filter(stops_filter)
    print words.nbest(BigramAssocMeasures.likelihood_ratio, 10)

    # The number of bigrams requested can be changed to any other value.
    # Another way to generate bigrams from text is to use a collocation finder
    # over a tokenized string, as the following code shows.
    import nltk
    text1 = "Hardwork is the key to success. Never give up!"
    word = nltk.tokenize.wordpunct_tokenize(text1)
    finder = BigramCollocationFinder.from_words(word)
    bigram_measures = nltk.collocations.BigramAssocMeasures()
    value = finder.score_ngrams(bigram_measures.raw_freq)
    print sorted(bigram for bigram, score in value)

    # To generate fourgrams and their frequencies, the following code can be used.
    text = "Hello how are you doing ? I hope you find the book interesting"
    tokens = nltk.wordpunct_tokenize(text)
    fourgrams = nltk.collocations.QuadgramCollocationFinder.from_words(tokens)
    for fourgram, freq in fourgrams.ngram_fd.items():
        print(fourgram, freq)
def _english_word_frequencies():
    """
    Get frequencies of english words based on four corpora:
    Gutenberg Corpus, Web and Chat Text, Brown Corpus, Reuters Corpus.
    
    Returns:
        tuple: Frequencies of words based on Gutenberg, Web and Chat Text, 
               Brown and Reuters corpora, respectively
    """
    gutenberg_freqs = FreqDist(gutenberg.words())
    webtext_freqs = FreqDist(webtext.words())
    brown_freqs = FreqDist(brown.words())
    reuters_freqs = FreqDist(reuters.words())
    
    return gutenberg_freqs, webtext_freqs, brown_freqs, reuters_freqs
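A hedged usage sketch (assuming FreqDist and the four corpora are imported as the function requires):

gutenberg_freqs, webtext_freqs, brown_freqs, reuters_freqs = _english_word_frequencies()
# raw (unnormalised) counts of the same word in each corpus
print(gutenberg_freqs['the'], webtext_freqs['the'], brown_freqs['the'], reuters_freqs['the'])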
def extract_bigrams(file_toanalyze, word_length, num_of_bigrams):
    # get the list of words from the file
    words_list = [word.lower() for word in webtext.words(file_toanalyze)]

    # construct a finder object to find the best bigrams
    finder = BigramCollocationFinder.from_words(words_list)

    # create a noise filtering handler
    noise_handler = filter_word_noise(word_length)

    # apply the noise filtering handler
    finder.apply_word_filter(noise_handler)

    # actually find the desired number of bigrams
    list_of_bigrams = finder.nbest(BigramAssocMeasures.likelihood_ratio,
                                   num_of_bigrams)

    # return the list of bigrams
    return list_of_bigrams
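filter_word_noise is defined elsewhere in that project; a plausible sketch of such a predicate factory (hypothetical, assuming it is meant to drop stopwords and words shorter than word_length, in the form apply_word_filter expects) is:

from nltk.corpus import stopwords

def filter_word_noise(word_length):
    stopset = set(stopwords.words('english'))
    # return a predicate: True means the word is filtered out
    return lambda w: len(w) < word_length or w.lower() in stopset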
Example #20
0
def main():
  # store word lengths
  brown_word_lens = []
  web_word_lens = []
  inaugural_word_lens = []
  gutenberg_word_lens = []
  genesis_word_lens = []

  for file in gutenberg.fileids():
    for word in gutenberg.words(file):
      gutenberg_word_lens.append(len(word))

  for file in brown.fileids():
    for word in brown.words(file):
      brown_word_lens.append(len(word))

  for file in webtext.fileids():
    for word in webtext.words(file):
      web_word_lens.append(len(word))

  for file in inaugural.fileids():
    for word in inaugural.words(file):
      inaugural_word_lens.append(len(word))

  for file in genesis.fileids():
    for word in genesis.words(file):
      genesis_word_lens.append(len(word))
  with open("wordlens.txt", 'w') as f:
    sys.stdout = f
    f.write("GENESIS, INAUGURAL, WEBTEXT, BROWN, GUTENBERG\n")
    for i in xrange(max(len(genesis_word_lens), len(inaugural_word_lens), 
        len(web_word_lens), len(brown_word_lens),
        len(gutenberg_word_lens))):
      for corpus in [genesis_word_lens, inaugural_word_lens,
          web_word_lens, brown_word_lens, gutenberg_word_lens]:
        if(i >= len(corpus)):
          f.write(",")
        else:
          f.write(str(corpus[i]) + ",")
      f.write("\n")
Example #21
0
def demo(scorer_bam=None,
         compare_scorer_bam=None,
         scorer_tam=None,
         compare_scorer_tam=None):
    if scorer_bam is None:
        scorer_bam = BigramAssocMeasures.likelihood_ratio
    if compare_scorer_bam is None:
        compare_scorer_bam = BigramAssocMeasures.raw_freq

    if scorer_tam is None:
        scorer_tam = TrigramAssocMeasures.likelihood_ratio
    if compare_scorer_tam is None:
        compare_scorer_tam = BigramAssocMeasures.raw_freq

    regex = '^[A-Za-z]+$'  # regular expression matching purely alphabetic (English) tokens
    str_regex = re.compile(regex)
    for file in webtext.fileids():  # process the corpus file by file
        words_list = []
        for word in webtext.words(file):
            if not str_regex.match(word):  # skip tokens that are not plain English words
                continue
            words_list.append(word)

        # collect bigram collocations (window size 3; range(3, 4) yields only 3)
        for window_size in range(3, 4):
            bcf = BigramCollocationFinder.from_words(words_list, window_size)
            bcf.apply_freq_filter(window_size)

            for item in bcf.nbest(scorer_bam, 1000):
                get_collocation(item)  # record the collocated words
        # collect trigram collocations
        for window_size in range(3, 4):
            tcf = TrigramCollocationFinder.from_words(words_list, window_size)
            tcf.apply_freq_filter(window_size)
            # tcf.apply_word_filter(word_filter)
            #corr = spearman_correlation(ranks_from_scores(tcf.score_ngrams(scorer)),
            #                          ranks_from_scores(tcf.score_ngrams(compare_scorer)))
            for item in tcf.nbest(scorer_tam, 1000):
                get_collocation(item)
import nltk, matplotlib
from nltk.corpus import webtext
print(webtext.fileids())

fileid = 'singles.txt'
wbt_words = webtext.words(fileid)
fdist = nltk.FreqDist(wbt_words)

print('Most frequent token "', fdist.max(), '" count: ', fdist[fdist.max()])
print('Total number of tokens in the corpus: ', fdist.N())
print('The 10 most common words in the corpus are:')
print(fdist.most_common(10))
print('Frequency distribution of the personals ads:')
fdist.tabulate()
fdist.plot(cumulative=True)
Example #24
0
from nltk import pos_tag, sent_tokenize, word_tokenize
from nltk.corpus import stopwords, brown, words, webtext
from nltk.stem import WordNetLemmatizer
from collections import defaultdict
import string

LEX_ASIAN = {"Chinese", "Japanese", "Korean", "China", "Japan", "Korea"}
LEX_EUROPEAN = {"French", "France", "Spain", "Spanish"}
WORD_SET = set(webtext.words()) | set(list(string.punctuation))


def load_dataset(filepath):
    """
    Returns a list of docs from the given filepath.

    Parameters:
    filepath -- (str)

    Returns:
    -------
    list of filenames: (str)
    """

    file_list = []
    with open(filepath) as f:
        for line in f:
            file_list.append(line.strip())
    return file_list


def get_feature_dict(text):
Example #25
0
text1 = Text(gutenberg.words('melville-moby_dick.txt'))
print("text1:", text1.name)

text2 = Text(gutenberg.words('austen-sense.txt'))
print("text2:", text2.name)

text3 = Text([str(w) for w in genesis.words('english-kjv.txt')], name="The Book of Genesis")
print("text3:", text3.name)

text4 = Text(inaugural.words(), name="Inaugural Address Corpus")
print("text4:", text4.name)

text5 = Text(nps_chat.words(), name="Chat Corpus")
print("text5:", text5.name)

text6 = Text(webtext.words('grail.txt'), name="Monty Python and the Holy Grail")
print("text6:", text6.name)

text7 = Text(treebank.words(), name="Wall Street Journal")
print("text7:", text7.name)

text8 = Text(webtext.words('singles.txt'), name="Personals Corpus")
print("text8:", text8.name)

text9 = Text(gutenberg.words('chesterton-thursday.txt'))
print("text9:", text9.name)

def texts():
    print("text1:", text1.name)
    print("text2:", text2.name)
    print("text3:", text3.name)
# In[9]:

finder.nbest(bigram_measures.pmi, 10) 


# In[10]:

from nltk.corpus import webtext
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures


# In[11]:

words = [w.lower() for w in webtext.words('grail.txt')]


# In[12]:

bcf = BigramCollocationFinder.from_words(words)


# In[13]:

bcf.nbest(BigramAssocMeasures.likelihood_ratio, 4)


# Eliminating Stopwords

# In[14]:
Example #27
0
from nltk.corpus import webtext
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures

words = [w.lower() for w in webtext.words('grail.txt')]
bcf = BigramCollocationFinder.from_words(words)
print(bcf.nbest(BigramAssocMeasures.likelihood_ratio, 4))

# remove punctuation and stopwords
from nltk.corpus import stopwords

stopset = set(stopwords.words('english'))
filter_stops = lambda w: len(w) < 3 or w in stopset
bcf.apply_word_filter(filter_stops)
print(bcf.nbest(BigramAssocMeasures.likelihood_ratio, 4))

# trigrams
from nltk.collocations import TrigramCollocationFinder
from nltk.metrics import TrigramAssocMeasures
words = [w.lower() for w in webtext.words('singles.txt')]
tcf = TrigramCollocationFinder.from_words(words)
tcf.apply_word_filter(filter_stops)
tcf.apply_freq_filter(3)
print(tcf.nbest(TrigramAssocMeasures.likelihood_ratio, 4))
Example #28
0
File: A3_1c.py Project: billbos/CL
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#PCL I, Exercise 5, HS15
#Task 3.1c
#Author: Bill Bosshard
#Student ID: 12-933-255
#Author: Lukas Vollenweider
#Student ID: 13-751-888

import nltk

from nltk.book import *
from nltk.corpus import webtext

text6 = webtext.words("grail.txt")

def findVerbs(text):
	verbs = []
	for word in set(text):
		if len(word) >= 4 and word[-3:] == "ing":
			verbs.append(word.lower())
	return sorted(verbs)

if __name__ == "__main__":
	print findVerbs(text6)
Example #29
0
text1 = Text(gutenberg.words("melville-moby_dick.txt"))
print("text1:", text1.name)

text2 = Text(gutenberg.words("austen-sense.txt"))
print("text2:", text2.name)

text3 = Text(genesis.words("english-kjv.txt"), name="The Book of Genesis")
print("text3:", text3.name)

text4 = Text(inaugural.words(), name="Inaugural Address Corpus")
print("text4:", text4.name)

text5 = Text(nps_chat.words(), name="Chat Corpus")
print("text5:", text5.name)

text6 = Text(webtext.words("grail.txt"), name="Monty Python and the Holy Grail")
print("text6:", text6.name)

text7 = Text(treebank.words(), name="Wall Street Journal")
print("text7:", text7.name)

text8 = Text(webtext.words("singles.txt"), name="Personals Corpus")
print("text8:", text8.name)

text9 = Text(gutenberg.words("chesterton-thursday.txt"))
print("text9:", text9.name)


def texts():
    print("text1:", text1.name)
    print("text2:", text2.name)
Example #30
0
import nltk
from nltk.collocations import BigramCollocationFinder
from nltk.corpus import webtext
from nltk.metrics import BigramAssocMeasures

tokens = [t.lower() for t in webtext.words('grail.txt')]
words = BigramCollocationFinder.from_words(tokens)
print(words.nbest(BigramAssocMeasures.likelihood_ratio, 10))
Example #31
0
from nltk.corpus import webtext
from nltk.corpus import stopwords

# ### Corpora

# Each corpus consists of one or more files of text. To list the files of a corpus, e.g. the wordnet corpus imported above, run:

# In[3]:

print(wordnet.fileids())

# To get the list of words inside a corpus we use the .words() method:

# In[4]:

print(webtext.words())

# ### Wordnet (OMW) => Synset basics

# Synsets are WordNet instances grouping synonymous words that express the same concept.

# In[5]:

syn = wordnet.synsets('fantasma', lang='ita')
print(syn)

# In[6]:

print("NAME: ", syn[0].name())
print("DEFINITION: ", syn[0].definition())
print("EXAMPLES: ", syn[0].examples())
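A hedged continuation of the synset example (it assumes the Open Multilingual WordNet data for Italian is installed, as the lang='ita' lookup above already requires): the synonymous lemmas can be read back in either language.

print("LEMMAS (eng): ", syn[0].lemma_names())
print("LEMMAS (ita): ", syn[0].lemma_names('ita'))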
Example #32
0
def text8():
    text = Text(webtext.words('singles.txt'), name="Personals Corpus")
    print("text8:", text.name)
    return text
Example #33
0
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#PCL I, Exercise 5, HS15
#Task 3.2
#Author: Bill Bosshard
#Student ID: 12-933-255
#Author: Lukas Vollenweider
#Student ID: 13-751-888

import nltk
from nltk.corpus import webtext
text6 = webtext.words('grail.txt')

def long_words(text):
    return sorted([word.lower() for word in set(text) if len(word) >= 7 and word[-3:] == "ing"])

def tuples(text):
    return [(word, len(word)) for word in set(text)]

def trigrams(text):
    return [(text[i-2], text[i-1], text[i]) for i in range(2, len(text))]

print "long_words: ",long_words(text6)[:10]
print "tuples: ", tuples(text6)[:10]
print "trigrams: ", trigrams(text6)[:10]
Example #34
0
def main():
  # store word lengths
  brown_common_freq = []
  web_common_freq = []
  inaugural_common_freq = []
  gutenberg_common_freq = []
  genesis_common_freq = []

  common = ["the", "be", "to", "of", "and", "a", "in", "that", "have",
            "i", "it", "for", "not", "on", "with", "he", "as", "you",
            "do", "at", "this", "but", "his", "by", "from", "they",
            "we", "say", "her", "she", "or", "an", "will", "my", "one",
            "all", "would", "there", "their", "what", "so", "up", "out",
            "if", "about", "who", "get", "which", "go", "me", "when",
            "make", "can", "like", "time", "no", "just", "him", "know",
            "take", "people", "into", "year", "your", "good", "some",
            "could", "them", "see", "other", "than", "then", "now", "look",
            "only", "come", "its", "over", "think", "also", "back", "after",
            "use", "two", "how", "our", "work", "first", "well", "way",
            "even", "new", "want", "because", "any", "these", "give", "day",
            "most", "us"]
  common.sort()

  for file in gutenberg.fileids():
    total_words = len(gutenberg.words(file))
    total_common = 0
    for word in gutenberg.words(file):
      if word.lower() in common:
        total_common += 1
    gutenberg_common_freq.append(float(total_common)/total_words)

  for file in brown.fileids():
    total_words = len(brown.words(file))
    total_common = 0
    for word in brown.words(file):
      if word.lower() in common:
        total_common += 1
    brown_common_freq.append(float(total_common)/total_words)

  for file in webtext.fileids():
    total_words = len(webtext.words(file))
    total_common = 0
    for word in webtext.words(file):
      if word.lower() in common:
        total_common += 1
    web_common_freq.append(float(total_common)/total_words)

  for file in inaugural.fileids():
    total_words = len(inaugural.words(file))
    total_common = 0
    for word in inaugural.words(file):
      if word.lower() in common:
        total_common += 1
    inaugural_common_freq.append(float(total_common)/total_words)

  for file in genesis.fileids():
    total_words = len(genesis.words(file))
    total_common = 0
    for word in genesis.words(file):
      if word.lower() in common:
        total_common += 1
    genesis_common_freq.append(float(total_common)/total_words)

  with open("common-words.txt", 'w') as f:
    sys.stdout = f
    f.write("GENESIS, INAUGURAL, WEBTEXT, BROWN, GUTENBERG\n")
    for i in xrange(max(len(genesis_common_freq), len(inaugural_common_freq),
                        len(web_common_freq), len(brown_common_freq),
                        len(gutenberg_common_freq))):
      for corpus in [genesis_common_freq, inaugural_common_freq,
                     web_common_freq, brown_common_freq, gutenberg_common_freq]:
        if i >= len(corpus):
          f.write(",")
        else:
          f.write(str(round(corpus[i], 5)) + ",")
      f.write("\n")
Example #35
0
    print(e)

for fileid in gutenberg.fileids():
    num_chars = len(gutenberg.raw(fileid))
    num_words = len(gutenberg.words(fileid))
    num_sents = len(gutenberg.sents(fileid))
    num_vocab = len(set([w.lower() for w in gutenberg.words(fileid)]))

    print(int(num_chars / num_words), int(num_words / num_sents),
          int(num_words / num_vocab), fileid)

from nltk.corpus import webtext
import nltk
for fileid in webtext.fileids():
    print(fileid, webtext.raw(fileid)[:65])
    print(type(webtext.words(fileid)))

from nltk.corpus import brown

brown.categories()

t = brown.words(categories='news')
print(t)

fdist = nltk.FreqDist([w.lower() for w in t])
fdist1 = nltk.FreqDist(t)
print(type(fdist))
print(fdist)
print(fdist1['May'])

for f in fdist.items():
Example #36
0
from nltk.corpus import brown, webtext

# Brown corpus
print('Categories:', list(brown.categories()))
print('Brown sample text:\n\t', ' '.join(brown.words(categories='adventure')[:50]))

# Webtext corpus
print()
print('Categories:', webtext.fileids())
print('Webtext sample text:\n\t', ' '.join(webtext.words('firefox.txt')[:50]))
Example #37
0
def main():
  # store FreqDists
  # index is the length of the word, 0 is for all words
  samples = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"

  brown_letters = FreqDist()
  web_letters = FreqDist()
  inaugural_letters = FreqDist()
  gutenberg_letters = FreqDist()
  genesis_letters = FreqDist()

  for file in gutenberg.fileids():
    for word in gutenberg.words(file):
      for character in word:
        if character in string.ascii_letters:
            gutenberg_letters[character.upper()] += 1

  for file in brown.fileids():
    for word in brown.words(file):
      for character in word:
        if character in string.ascii_letters:
            brown_letters[character.upper()] += 1

  for file in webtext.fileids():
    for word in webtext.words(file):
      for character in word:
        if character in string.ascii_letters:
            web_letters[character.upper()] += 1

  for file in inaugural.fileids():
    for word in inaugural.words(file):
      for character in word:
        if character in string.ascii_letters:
            inaugural_letters[character.upper()] += 1

  for file in genesis.fileids():
    for word in genesis.words(file):
      for character in word:
        if character in string.ascii_letters:
            genesis_letters[character.upper()] += 1

  with open("genesis-letter-freq.txt",'w') as f:
    sys.stdout = f
    f.write("GENESIS\n")
    for let in samples:
        print(str(genesis_letters[let]))
  
  with open("gutenberg-letter-freq.txt", 'w') as f:
    sys.stdout = f
    f.write("GUTENBERG\n")
    for let in samples:
        print(str(gutenberg_letters[let]))
  with open("webtext-letter-freq.txt", 'w') as f:
    sys.stdout = f
    f.write("WEBTEXT\n")
    for let in samples:
        print(str(web_letters[let]))
  with open("inaugural-letter-freq.txt", 'w') as f:
    sys.stdout = f

    f.write("INAUGURAL\n")
    for let in samples:
        print(str(inaugural_letters[let]))
  with open("brown-letter-freq.txt", 'w') as f:
    sys.stdout = f

    f.write("BROWN\n")
    for let in samples:
        print(str(brown_letters[let]))
  
  with open("letter-freq.txt", 'w') as f:
    corpora = [gutenberg_letters, web_letters, inaugural_letters,
        brown_letters, genesis_letters]
    f.write("GUTENBERG,WEBTEXT,INAUGURAL,BROWN,GENESIS\n")
    for let in samples:
      for corpus in corpora:
        f.write(str(corpus[let]) + ",")
      f.write("\n")
def main_process(api,text, token_key, token_key_secret):
  #print 'text',text;
  # Used when tokenizing words
  sentence_re = r'''(?x)      # set flag to allow verbose regexps
        ([A-Z])(\.[A-Z])+\.?  # abbreviations, e.g. U.S.A.
      | \w+(-\w+)*            # words with optional internal hyphens
      | \$?\d+(\.\d+)?%?      # currency and percentages, e.g. $12.40, 82%
      | \.\.\.                # ellipsis
      | [][.,;"'?():-_`]      # these are separate tokens
  '''

  
  #stemmer = nltk.stem.porter.PorterStemmer()

  #Taken from Su Nam Kim Paper...
  grammar = r"""
      NBAR:
          {<NN.*|JJ>*<NN.*>}  # Nouns and Adjectives, terminated with Nouns
          
      NP:
          {<NBAR>}
          {<NBAR><IN><NBAR>}  # Above, connected with in/of/etc...
  """
  chunker = nltk.RegexpParser(grammar)

  toks = nltk.regexp_tokenize(text, sentence_re)

  postoks = nltk.tag.pos_tag(toks)

  #print postoks

  #print 'sbl0';
  #print text;
  #print toks;
  #print postoks;
  tree = chunker.parse(postoks)
  #print 'sbl0';
  #from nltk.corpus import stopwords
  #stopwords = stopwords.words('english')

  terms = get_terms(tree)

  key_words = []
  key_words_list = []

  for term in terms:
    key_words = []
    for word in term:
      key_words.append(word)
      print word,
    key_words_list.append(key_words)
    print

  #get user instance on Twitter
  #api = twitter.Api(consumer_key='TuLPEoqwSkiVreWEODQ6tA',consumer_secret='LnPHHrMOiPVX5PlObJKryROYtdC3475Xq0WJ2tHlJHM',access_token_key=access_token['oauth_token'],access_token_secret=access_token['oauth_token_secret'])
  #api = twitter.Api(consumer_key='TuLPEoqwSkiVreWEODQ6tA',consumer_secret='LnPHHrMOiPVX5PlObJKryROYtdC3475Xq0WJ2tHlJHM',access_token_key=token_key,access_token_secret=token_key_secret)

  keyword_recommenders = {}

  people_interests = {}

  followers = api.GetFollowers()

  keywords_count_all = []
  keywords_count = []

  person_count = {}

  if len(followers)  == 0:
    print "No followers!";
    sys.exit(1)
  #print 'sbl1';
  classifier = train_classifier(make_full_dict)
  for person in followers:
    timeline = api.GetUserTimeline(person.id)
    keywords_count_all = []
    for status in timeline:
      keywords_count = []
      coin_count = 0
      for term in key_words_list:
        tmp_count = 0
        for word in term:
          tmp_count = tmp_count + status.text.lower().count(word)
        if tmp_count > 0:
          coin_count = coin_count + 1
        keywords_count.append(tmp_count)
      for k_count in keywords_count:
        if k_count > 0:
          k_count = k_count + coin_count
      if sum(keywords_count) > 0:
        sentiment = guess_sentiment(status.text, classifier)
        print sentiment;
        if sentiment == 'neg' and sentiment_enable == 1:
          continue
      if keywords_count_all == []:
        keywords_count_all = keywords_count
      else:
        for i in range(0, len(keywords_count_all)):
          keywords_count_all[i] = keywords_count_all[i] + keywords_count[i]
    person_count[person.name] = keywords_count_all
  #print 'sbl2';
  for i in range(0,len(key_words_list)):
    term = key_words_list[i]
    key_word = ''
    for word in term:
      if key_word == '':
        key_word = key_word + word
      else:
        key_word = key_word + ' ' + word
    recommenders_weight = {}
    for person in followers:
      #print i,len(person_count[person.name]);
      #print person.name;
      if person_count[person.name]:
        recommenders_weight[person.name] = person_count[person.name][i]

    recommenders_weight = sorted(recommenders_weight.iteritems(), key=lambda d:d[1], reverse = True)
    j = 0
    recommenders_weight_sort = {}
    for pair in recommenders_weight:
      j = j + 1
      if j > 3 or pair[1] == 0:
        break
      recommenders_weight_sort[pair[0]] = [pair[1]]
    if len(recommenders_weight_sort) > 0:
      keyword_recommenders[key_word] = recommenders_weight_sort

  followers = sorted(followers, key = lambda follower: follower.followers_count, reverse = True)
  connectors = []
  connectors.append(followers[0])
  if len(followers) > 1:
    connectors.append(followers[1])
  print "connector:"
  for connector in connectors:
    print connector.name;

  replyNum = {}
  replys = api.GetReplies()
  for reply in replys:
    if reply.in_reply_to_screen_name != None:
      reply.in_reply_to_screen_name = api.GetUsersSearch(reply.in_reply_to_screen_name)[0].name
      if reply.in_reply_to_screen_name in replyNum:
        replyNum[reply.in_reply_to_screen_name] = replyNum[reply.in_reply_to_screen_name] + 1
      else:
        replyNum[reply.in_reply_to_screen_name] = 1

  connectors_info = {}

  if len(keyword_recommenders) > 0:
    for key in keyword_recommenders:
      closeness = {}
      for person in keyword_recommenders[key]:
        if person in replyNum:
          closeness[person] = replyNum[person]
        else:
          closeness[person] = 0
      closeness = sorted(closeness.iteritems(), key=lambda d:d[1], reverse = True)
      j = 0
      for pair in closeness:
        j = j + 1
        keyword_recommenders[key][pair[0]].append(j)

  closeness = {}
  for connector in connectors:
    if connector.name in replyNum:
      closeness[connector.name] = replyNum[connector.name]
    else:
      closeness[connector.name] = 0
  closeness = sorted(closeness.iteritems(), key=lambda d:d[1], reverse = True)
  j = 0
  #print 'closeness:',closeness;
  for pair in closeness:
    j = j + 1
    for connector in connectors:
      if connector.name == pair[0]:
        connectors_info[pair[0]] = [connector.followers_count, j]
        break


#def extract_keyword():
  corpus = webtext.words()
  corpus_length = len(corpus)

  people_interest_words = {}

  for person in followers:
    timeline = api.GetUserTimeline(person.id)
    status_all = ''
    for status in timeline:
      if status_all == '':
        status_all = status.text
      else:
        status_all = status_all + '. ' + status.text
    status_all = status_all.lower()

    timeline_length = len(status_all)

    toks = nltk.regexp_tokenize(status_all, sentence_re)
    postoks = nltk.tag.pos_tag(toks)
    tree = chunker.parse(postoks)
    terms = get_terms(tree)

    interest_words_prob = {}
    interest_words = []
    for term in terms:
      interest_word = ''
      for word in term:
        if interest_word == '':
          interest_word = word
        else:
          interest_word = interest_word + ' ' + word
      if len(interest_word) > 1:
        interest_words.append(interest_word)
    for phrase in interest_words:
      status_count = status_all.count(phrase)
      if corpus_enable == 1:
        corpus_count = corpus.count(phrase)
      else:
        corpus_count = 1
      if corpus_count == 0:
        corpus_count = 1;
      interest_words_prob[phrase] = status_count * corpus_length / corpus_count / timeline_length

    interest_words_prob = sorted(interest_words_prob.iteritems(), key=lambda d:d[1], reverse = True)
    j = 0
    tmp = []
    for pair in interest_words_prob:
      j = j + 1
      if j > 4:
        break
      tmp.append(pair[0])
    people_interest_words[person.name] = tmp

  #print people_interest_words

  keys = keyword_recommenders
  people = people_interest_words


  #visualisation
  #print 'mavens';
  fig_num = 0
  for k in keys.keys():
    fig_num = fig_num + 1
  if connectors_info.keys():
    fig_num = fig_num + 1
  max_volumn = 2
  max_row = fig_num / max_volumn + fig_num % max_volumn
  #mavens
  i = 0
  for k in keys.keys():
    #if keys[k]:
    #print 'exist mavens';
    i = i + 1
    G = nx.Graph()
    #print G
    node_size_list = []
    node_color_list = []
    nodes_list = []
    G.add_node(k)
    nodes_list.append(k)
    #print G.node
    node_size_list.append(5000)
    node_color_list.append('r')
    for name in keys[k].keys():
      G.add_node(name)
      nodes_list.append(name)
      G.add_edge(k,name,weight = (1.0 / keys[k][name][1])*0.5)
      node_size_list.append(keys[k][name][0]*1500)
      node_color_list.append('y')
      
      if name in people.keys():
        for itrst in people[name]:
          if itrst != k and itrst != name:
            if itrst in nodes_list:
              continue
            else:
              G.add_node(itrst)
              nodes_list.append(itrst)
              G.add_edge(name,itrst,weight=3)
              node_size_list.append(1500)
              node_color_list.append('c')
      
    plt.subplot(max_row,max_volumn,i)
    plt.title('maven')
    #for i in range(len(node_size_list)):
    #  node_size_list[i] = node_size_list[i] / sum(node_size_list) * 100

    #nx.draw_networkx(G,pos=nx.spring_layout(G),with_labels=True,nodelist=nodes_list,node_size=node_size_list,node_color=node_color_list,tick_labels=False)
    nx.draw(G,pos=nx.spring_layout(G),title='mavens', with_labels=True,nodelist=nodes_list,node_size=node_size_list,node_color=node_color_list,tick_labels=False)
    #plt.show()
  #print 'connectors';
  #connectors
  user = api.VerifyCredentials()
  username = user.name

  if connectors_info.keys():
    #print 'connectors_info_keys()';
    G = nx.Graph()
    i = i + 1
    node_size_list = []
    node_color_list = []
    nodes_list = []
    G.add_node(username)
    nodes_list.append(username)
    node_size_list.append(5000)
    node_color_list.append('r')
    for c in connectors_info.keys():
      #print c;
      G.add_node(c)
      nodes_list.append(c)
      G.add_edge(c,username,weight = (1.0 / connectors_info[c][1])*500)
      node_size_list.append(connectors_info[c][0] * 700)
      node_color_list.append('y')
    #plt.subplot(1,fig_num,i)
    plt.subplot(max_row,max_volumn,i)
    plt.title('Connectors')

    #for i in range(len(node_size_list)):
    #  node_size_list[i] = node_size_list[i] / sum(node_size_list) * 100

    nx.draw(G,pos=nx.spring_layout(G),with_labels=True,nodelist=nodes_list,node_size=node_size_list,node_color=node_color_list)
  
  plt.show()
from nltk.corpus import stopwords
from nltk.corpus import webtext
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures
stops = set(stopwords.words('english'))
stops_filter = lambda w: len(w) < 3 or w in stops
tokens=[t.lower() for t in webtext.words('grail.txt')]
words=BigramCollocationFinder.from_words(tokens)
words.apply_word_filter(stops_filter)
print(words.nbest(BigramAssocMeasures.likelihood_ratio, 10))
Example #40
0
text1 = Text(gutenberg.words('melville-moby_dick.txt'))
print("text1:", text1.name)

text2 = Text(gutenberg.words('austen-sense.txt'))
print("text2:", text2.name)

text3 = Text(genesis.words('english-kjv.txt'), name="The Book of Genesis")
print("text3:", text3.name)

text4 = Text(inaugural.words(), name="Inaugural Address Corpus")
print("text4:", text4.name)

text5 = Text(nps_chat.words(), name="Chat Corpus")
print("text5:", text5.name)

text6 = Text(webtext.words('grail.txt'),
             name="Monty Python and the Holy Grail")
print("text6:", text6.name)

text7 = Text(treebank.words(), name="Wall Street Journal")
print("text7:", text7.name)

text8 = Text(webtext.words('singles.txt'), name="Personals Corpus")
print("text8:", text8.name)

text9 = Text(gutenberg.words('chesterton-thursday.txt'))
print("text9:", text9.name)


def texts():
    print("text1:", text1.name)
from nltk.corpus import webtext
from nltk.corpus import nps_chat
from nltk.corpus import brown
from nltk.corpus import reuters
from nltk.corpus import inaugural
from nltk import word_tokenize
from collections import Counter
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
import pandas as pd

# Pick out the first of these texts — Emma by Jane Austen — and give it a short name, gutenberg_raw
gutenberg_raw = gutenberg.raw("austen-emma.txt")

# Pick out the words from webtext corpus and give it a short name, webtext_words
webtext_words = webtext.words()
print(webtext_words)

# Pick out the text from np_chat corpus and name it as nps_chat_raw
nps_chat_raw = nps_chat.raw()

# Pick out the text from brown corpus and name it as brown_raw
brown_raw = brown.raw()
print(brown_raw)

# Pick out the text from reuters corpus and name it as reuters_words
reuters_words = reuters.words()
print(reuters_words)

# Pick out the text from inaugural corpus and name it as inaugural_words
inaugural_words = inaugural.words()
Example #42
0
from nltk.corpus import webtext
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures
from nltk.corpus import stopwords
textwords = [w.lower() for w in webtext.words('pirates.txt')]
finder = BigramCollocationFinder.from_words(textwords)
finder.nbest(BigramAssocMeasures.likelihood_ratio, 10)
ignored_words = set(stopwords.words('english'))
filterstops = lambda w: len(w) < 3 or w in ignored_words
finder.apply_word_filter(filterstops)
finder.nbest(BigramAssocMeasures.likelihood_ratio, 10)
finder.nbest(BigramAssocMeasures.likelihood_ratio, 15)
with open("D:/Python/Consumer Complaints/Consumer_Complaints_CreditCard.csv", 'r') as file:
  complaints = list(csv.reader(file))
  file.close()

compClean = []
for i in range(len(complaints)):
    tokens = re.sub("[^A-Za-z0-9()'.]+", " ", complaints[i][5])
    tokens = re.sub('!', ".", tokens)
    compClean.append(tokens)


from nltk.corpus import webtext
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures
words = [w.lower() for w in webtext.words('D:/Python/Consumer Complaints/complaintsDump.txt')]
bcf = BigramCollocationFinder.from_words(words)

#from nltk.collocations import TrigramCollocationFinder
#from nltk.metrics import TrigramAssocMeasures
#tcf = TrigramCollocationFinder.from_words(words)
#tcf.nbest(TrigramAssocMeasures.likelihood_ratio, 4)

from nltk.corpus import stopwords
stopset = set(stopwords.words('english'))
filter_stops = lambda w: len(w) < 3 or w in stopset
bcf.apply_word_filter(filter_stops)
collocations = bcf.nbest(BigramAssocMeasures.likelihood_ratio, 50)

newText = 'a credit card is issued to me'
tokens = re.sub(" ".join(collocations[1]), "-".join(collocations[1]), newText)
'''
from nltk.corpus import stopwords,webtext
from nltk.collocations import BigramCollocationFinder,BigramAssocMeasures
from nltk.probability import FreqDist
from nltk.book import text1
from pip._vendor.distlib.resources import finder
from nltk.metrics import BigramAssocMeasures, spearman_correlation, ranks_from_scores
from nltk.corpus import stopwords, webtext

scorer = BigramAssocMeasures.likelihood_ratio
compare_scorer = BigramAssocMeasures.raw_freq
ignored_words = stopwords.words('english')
word_filter = lambda w: len(w) < 3 or w.lower() in ignored_words
for file in webtext.fileids():
    words = [word.lower()
             for word in webtext.words(file)]
    cf = BigramCollocationFinder.from_words(words)
    cf.apply_freq_filter(3)
    cf.apply_word_filter(word_filter)
    print(file)
    print('\t', [' '.join(tup) for tup in cf.nbest(scorer, 15)])
    print('\t Correlation to %s: %0.4f' % (compare_scorer.__name__,
                                            spearman_correlation(
                                            ranks_from_scores(cf.score_ngrams(scorer)),
                                            ranks_from_scores(cf.score_ngrams(compare_scorer)))))
        
'''
#from nltk.util import bigrams
bigram_measures=BigramAssocMeasures()
trigram_measure=BigramAssocMeasures()
finder = BigramCollocationFinder.from_words(webtext.words("grail.txt"))  # pass the corpus words, not the filename string
Example #45
0
text1 = Text(gutenberg.words('melville-moby_dick.txt'))
print "text1:", text1.name

text2 = Text(gutenberg.words('austen-sense.txt'))
print "text2:", text2.name

text3 = Text(genesis.words('english-kjv.txt'), name="The Book of Genesis")
print "text3:", text3.name

text4 = Text(inaugural.words(), name="Inaugural Address Corpus")
print "text4:", text4.name

text5 = Text(nps_chat.words(), name="Chat Corpus")
print "text5:", text5.name

text6 = Text(webtext.words('grail.txt'),
             name="Monty Python and the Holy Grail")
print "text6:", text6.name

text7 = Text(treebank.words(), name="Wall Street Journal")
print "text7:", text7.name

text8 = Text(webtext.words('singles.txt'), name="Personals Corpus")
print "text8:", text8.name

text9 = Text(gutenberg.words('chesterton-thursday.txt'))
print "text9:", text9.name


def texts():
    print "text1:", text1.name
Example #46
0
    
macbethRaw = gutenberg.raw('shakespeare-macbeth.txt')
macbethWords = gutenberg.words('shakespeare-macbeth.txt')
macbethSents = gutenberg.sents('shakespeare-macbeth.txt')

longestLen = max([len(s) for s in macbethSents])
longestSents = [s for s in macbethSents if len(s) == longestLen]


from nltk.corpus import webtext
webtext.fileids()
for fileid in webtext.fileids():
    print fileid, webtext.raw(fileid)[:65], '...'
    
webtext.raw('pirates.txt').lower().count('jack')   
pirates = nltk.Text(webtext.words('pirates.txt'))


from nltk.corpus import brown
brown.categories()
brown.words(categories = 'news')
brown.words(fileids = ['cg22'])
brown.words(fileids = ['cg22','ca16']) # Concatenates the two corpora into one.

from nltk.corpus import brown
newsText = brown.words(categories = 'news')
fdist = nltk.FreqDist([w.lower() for w in newsText])
modals = ['can','could','may','might','must','will']
for m in modals:
    print m + ':', fdist[m], 
Example #47
0
            dict_tmp[ele] = dictLen[ele]
    return dict_tmp


# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Start Program
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

# import the arguments
length = sys.argv[1]
char = sys.argv[2]

print ""
print 'START------------------------------------------------------'
print "Task 5 (length=" + length + ", character='" + char + "')"
print '-----------------------------------------------------------'

text = webtext.words('grail.txt')

print "Most frequent word with " + str(length) + " letters:"
printresult(dictLen(text, length))

print "Most frequent word starting with '" + char + "':"
printresult(dictChar(text, char))

print "Most frequent word with " + str(
    length) + " letters and starting with '" + char + "':"
printresult(dictLenChar(dictLen(text, length), char))
print 'END------------------------------------ lvthiessen | rolben'
print ""
Example #48
0
#Importing data
import nltk
from nltk.corpus import webtext
from nltk.probability import FreqDist
from nltk.corpus import stopwords
import string
nltk.download('webtext')
wt_sentences = webtext.sents('firefox.txt')
wt_words = webtext.words('firefox.txt')
print(len(wt_sentences))
print(len(wt_words))
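The imports of FreqDist, stopwords and string above are not used yet; a hedged continuation sketch (it additionally assumes the stopwords corpus has been downloaded) counting the most frequent content words:

stops = set(stopwords.words('english')) | set(string.punctuation)
filtered_words = [w.lower() for w in wt_words if w.lower() not in stops]
fdist = FreqDist(filtered_words)
print(fdist.most_common(10))  # ten most frequent content words in firefox.txt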
Example #49
0
import re
from random import shuffle
from nltk.corpus import webtext
from nltk.corpus import nps_chat

from gensim.models.doc2vec import LabeledSentence, Doc2Vec

gendered_terms = [
    r'\bhe\b', r'\bhes', r'\bshe\b', r'\bshes\b', r'\bhis\b', r'\bher\b',
    r'\bbro\b', r'\bman\b', r'\bsir\b', r'\bdude\b', r'\bgirl\b', r'\bgirls\b',
    r'\blady\b', r'\bgurl\b', r'\bhims\b', r'\bhers\b', r'\bhisself\b',
    r'\bherself\b', r'\bman\b', r'\bwoman\b'
]

dictionary_words = {}
for x in nps_chat.words() + webtext.words():
    dictionary_words[x] = True

print(len(dictionary_words))


class LabeledLineSentence(object):
    def __init__(self, messages_dic, is_sample=True):
        self.documents = []
        self.messages_dic = messages_dic
        self.is_sample = is_sample

    def __iter__(self):
        for user in self.messages_dic:
            if self.is_sample:
                for i in range(200):
Example #50
0
import nltk
from nltk.corpus import webtext
from nltk.corpus import nps_chat
from nltk.corpus import brown 

# for fileid in webtext.fileids():
# 	print fileid, webtext.raw(fileid)[:65]

# for fileId in nps_chat.fileids():
# 	print fileId

pirates = webtext.raw('pirates.txt')
pirates_char = len(webtext.raw('pirates.txt'))
pirates_words = len(webtext.words('pirates.txt'))
pirates_sents = len(webtext.sents('pirates.txt'))
print 'pirates_char: ', pirates_char, 'pirates_words: ', pirates_words, 'pirates_sents: ', pirates_sents, 'avg char per word: ', int(pirates_char/pirates_words), 'avg words per sentence: ', int(pirates_words/pirates_sents)

uniqs = len(set([w.lower() for w in webtext.words('pirates.txt')]))

def lexical_div(un, total):
	return total/un

print 'lexical diversity: ', lexical_div(uniqs, pirates_words)

# brown_categories = brown.categories()
# for genre in brown_categories:
# 	print genre

news_text = brown.words(categories='news')
fdist = nltk.FreqDist([w.lower() for w in news_text])
# modal verbs