Example #1
def demo(scorer=None, compare_scorer=None):
    """Finds trigram collocations in the files of the WebText corpus."""
    from nltk.metrics import BigramAssocMeasures, spearman_correlation, ranks_from_scores

    if scorer is None:
        scorer = BigramAssocMeasures.likelihood_ratio
    if compare_scorer is None:
        compare_scorer = BigramAssocMeasures.raw_freq

    from nltk.corpus import stopwords, webtext

    ignored_words = stopwords.words('english')
    word_filter = lambda w: len(w) < 3 or w.lower() in ignored_words

    for file in webtext.fileids():
        words = [word.lower() for word in webtext.words(file)]

        cf = BigramCollocationFinder.from_words(words)
        cf.apply_freq_filter(3)
        cf.apply_word_filter(word_filter)

        print(file)
        print('\t', [' '.join(tup) for tup in cf.nbest(scorer, 15)])
        print('\t Correlation to %s: %0.4f' %
              (compare_scorer.__name__,
               spearman_correlation(
                   ranks_from_scores(cf.score_ngrams(scorer)),
                   ranks_from_scores(cf.score_ngrams(compare_scorer)))))
Example #2
def ch03_42_wordnet_semantic_index():
    import nltk
    from nltk.corpus import webtext
    from nltk.corpus import wordnet as wn
    postings = []
    docids = {}
    for (pos, fileid) in enumerate(webtext.fileids()):
        docids[pos] = fileid
        wpos = 0
        words = webtext.words(fileid)
        for word in words:
            try:
                postings.append((word.lower(), (pos, wpos)))
                offset = wn.synsets(word)[0].offset()
                postings.append((offset, (pos, wpos)))
                poffset = wn.synsets(word)[0].hypernyms()[0].offset()
                postings.append((poffset, (pos, wpos)))
            except IndexError:
                continue
            wpos = wpos + 1
    index = nltk.Index(postings)
    query = "canine"
    qpostings = []
    qpostings.extend([(pos, wpos) for (pos, wpos) in index[query]])
    try:
        offset = wn.synsets(query)[0].offset()
        qpostings.extend([(pos, wpos) for (pos, wpos) in index[offset]])
    except IndexError:
        pass
    for (pos, wpos) in qpostings:
        left = webtext.words(docids[pos])[wpos - 4:wpos]
        right = webtext.words(docids[pos])[wpos:wpos + 4]
        print(left, right)
Example #3
def demo(scorer=None, compare_scorer=None):
    """Finds bigram collocations in the files of the WebText corpus."""
    from nltk.metrics import BigramAssocMeasures, spearman_correlation, ranks_from_scores

    if scorer is None:
        scorer = BigramAssocMeasures.likelihood_ratio
    if compare_scorer is None:
        compare_scorer = BigramAssocMeasures.raw_freq

    from nltk.corpus import stopwords, webtext

    ignored_words = stopwords.words('english')
    word_filter = lambda w: len(w) < 3 or w.lower() in ignored_words

    for file in webtext.fileids():
        words = [word.lower()
                 for word in webtext.words(file)]

        cf = BigramCollocationFinder.from_words(words)
        cf.apply_freq_filter(3)
        cf.apply_word_filter(word_filter)

        print(file)
        print('\t', [' '.join(tup) for tup in cf.nbest(scorer, 15)])
        print('\t Correlation to %s: %0.4f' % (compare_scorer.__name__,
                                               spearman_correlation(
                                                   ranks_from_scores(cf.score_ngrams(scorer)),
                                                   ranks_from_scores(cf.score_ngrams(compare_scorer)))))
Example #4
def ch03_42_wordnet_semantic_index():
  import nltk
  from nltk.corpus import webtext
  from nltk.corpus import wordnet as wn
  postings = []
  docids = {}
  for (pos, fileid) in enumerate(webtext.fileids()):
    docids[pos] = fileid
    wpos = 0
    words = webtext.words(fileid)
    for word in words:
      try:
        postings.append((word.lower(), (pos, wpos)))
        offset = wn.synsets(word)[0].offset()
        postings.append((offset, (pos, wpos)))
        poffset = wn.synsets(word)[0].hypernyms()[0].offset()
        postings.append((poffset, (pos, wpos)))
      except IndexError:
        continue
      wpos = wpos + 1
  index = nltk.Index(postings)
  query = "canine"
  qpostings = []
  qpostings.extend([(pos, wpos) for (pos, wpos) in index[query]])
  try:
    offset = wn.synsets(query)[0].offset()
    qpostings.extend([(pos, wpos) for (pos, wpos) in index[offset]])
  except IndexError:
    pass
  for (pos, wpos) in qpostings:
    left = webtext.words(docids[pos])[wpos-4:wpos]
    right = webtext.words(docids[pos])[wpos:wpos+4]
    print(left, right)
Example #5
def process_webtext():
    print('webtext')
    from nltk.corpus import webtext
    count = 0
    word = 'bank'
    sen1 = 'depository_financial_institution.n.01'
    sen2 = 'bank.n.01'
    file_name = 'data/bank_webtext_tmp.txt'
    for f in webtext.fileids():
        sents = webtext.sents(f)
        for i in range(len(sents)):
            sent = sents[i]
            if word in sent:
                appendToFile(file_name, sentToStr(sent, '0'))
                count = count + 1
                print(count)
Example #6
def main():
  # store word lengths
  brown_word_lens = []
  web_word_lens = []
  inaugural_word_lens = []
  gutenberg_word_lens = []
  genesis_word_lens = []

  for file in gutenberg.fileids():
    for word in gutenberg.words(file):
      gutenberg_word_lens.append(len(word))

  for file in brown.fileids():
    for word in brown.words(file):
      brown_word_lens.append(len(word))

  for file in webtext.fileids():
    for word in webtext.words(file):
      web_word_lens.append(len(word))

  for file in inaugural.fileids():
    for word in inaugural.words(file):
      inaugural_word_lens.append(len(word))

  for file in genesis.fileids():
    for word in genesis.words(file):
      genesis_word_lens.append(len(word))
  with open("wordlens.txt", 'w') as f:
    sys.stdout = f
    f.write("GENESIS, INAUGURAL, WEBTEXT, BROWN, GUTENBERG\n")
    for i in range(max(len(genesis_word_lens), len(inaugural_word_lens),
        len(web_word_lens), len(brown_word_lens),
        len(gutenberg_word_lens))):
      for corpus in [genesis_word_lens, inaugural_word_lens,
          web_word_lens, brown_word_lens, gutenberg_word_lens]:
        if i >= len(corpus):
          f.write(",")
        else:
          f.write(str(corpus[i]) + ",")
      f.write("\n")
Example #7
def demo(scorer_bam=None,
         compare_scorer_bam=None,
         scorer_tam=None,
         compare_scorer_tam=None):
    if scorer_bam is None:
        scorer_bam = BigramAssocMeasures.likelihood_ratio
    if compare_scorer_bam is None:
        compare_scorer_bam = BigramAssocMeasures.raw_freq

    if scorer_tam is None:
        scorer_tam = TrigramAssocMeasures.likelihood_ratio
    if compare_scorer_tam is None:
        compare_scorer_tam = BigramAssocMeasures.raw_freq

    regex = '^[A-Za-z]+$'  # regular expression matching English words
    str_regex = re.compile(regex)
    for file in webtext.fileids():  # process the corpus file by file
        words_list = []
        for word in webtext.words(file):
            if not str_regex.match(word):  # skip tokens that are not purely alphabetic
                continue
            words_list.append(word)

        # extract bigram collocations, window sizes 3, 4, 5
        for window_size in range(3, 4):
            bcf = BigramCollocationFinder.from_words(words_list, window_size)
            bcf.apply_freq_filter(window_size)

            for item in bcf.nbest(scorer_bam, 1000):
                get_collocation(item)  # collect the collocation words
        # extract trigram collocations
        for window_size in range(3, 4):
            tcf = TrigramCollocationFinder.from_words(words_list, window_size)
            tcf.apply_freq_filter(window_size)
            # tcf.apply_word_filter(word_filter)
            #corr = spearman_correlation(ranks_from_scores(tcf.score_ngrams(scorer)),
            #                          ranks_from_scores(tcf.score_ngrams(compare_scorer)))
            for item in tcf.nbest(scorer_tam, 1000):
                get_collocation(item)
Example #8
def webtext():
    from nltk.corpus import webtext as webtext
    from nltk.corpus import nps_chat

    # list comprehension version
    file_ids = [fileid for fileid in webtext.fileids()]
    chat_file_ids = [fileid for fileid in nps_chat.fileids()]

    pirates = webtext.raw('pirates.txt')
    pirates_words = len(webtext.words('pirates.txt'))
    pirates_sents = len(webtext.sents('pirates.txt'))
    uniqs = len(set([w.lower() for w in webtext.words('pirates.txt')]))

    lexical_diversity = lexical_div(uniqs, pirates_words)

    # import nltk.book as book
    # text1 = book.text1
    # pirates = webtext.raw('pirates.txt')

    return render_template('webtext.html',
                           file_ids=file_ids,
                           chat_file_ids=chat_file_ids,
                           pirates=pirates)
Example #9
def exercise_webtext():
    # print the file names of the webtext corpus
    for file_id in webtext.fileids():
        print(file_id)
Example #10
File: NLP.py Project: Toma-L/NLP
for fileid in gutenberg.fileids():
    num_chars = len(gutenberg.raw(fileid))
    num_words = len(gutenberg.words(fileid))
    num_sents = len(gutenberg.sents(fileid))
    num_vocab = len(set([w.lower() for w in gutenberg.words(fileid)]))
    print(int(num_chars/num_words), int(num_words/num_sents), int(num_words/num_vocab))  # avg word length, avg sentence length, lexical diversity

macbeth_sentences = gutenberg.sents('shakespeare-macbeth.txt')
macbeth_sentences #load sentences of Macbeth
macbeth_sentences[1037]
longest_len = max([len(s) for s in macbeth_sentences])
[s for s in macbeth_sentences if len(s) == longest_len] #find longest sentence

from nltk.corpus import webtext
for fileid in webtext.fileids():
    print (fileid, webtext.raw(fileid)[:65], '...')

from nltk.corpus import nps_chat
chatroom = nps_chat.posts('10-19-20s_706posts.xml')
chatroom[123]

from nltk.corpus import brown
brown.categories()
brown.words(categories = 'news') 
brown.words(fileids = ['cg22'])

from nltk.corpus import brown
news_text = brown.words(categories = 'news')
fdist = nltk.FreqDist([w.lower() for w in news_text])
modals = ['can', 'could', 'may', 'might', 'must', 'will']
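# A hedged sketch of the likely next step here (the same loop appears verbatim in a
# later example on this page): print how often each modal occurs in the news text.
for m in modals:
    print(m + ':', fdist[m])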
Example #11
from nltk.corpus import webtext
from nltk.corpus import nps_chat

#webtext overview
for idx in webtext.fileids():
    print('file={} , words={}'.format(idx, len(webtext.words(idx))))
Example #12
def main():
  # store word lengths
  brown_common_freq = []
  web_common_freq = []
  inaugural_common_freq = []
  gutenberg_common_freq = []
  genesis_common_freq = []

  common = ["the", "be", "to", "of", "and", "a", "in", "that", "have",
            "i", "it", "for", "not", "on", "with", "he", "as", "you",
            "do", "at", "this", "but", "his", "by", "from", "they",
            "we", "say", "her", "she", "or", "an", "will", "my", "one",
            "all", "would", "there", "their", "what", "so", "up", "out",
            "if", "about", "who", "get", "which", "go", "me", "when",
            "make", "can", "like", "time", "no", "just", "him", "know",
            "take", "people", "into", "year", "your", "good", "some",
            "could", "them", "see", "other", "than", "then", "now", "look",
            "only", "come", "its", "over", "think", "also", "back", "after",
            "use", "two", "how", "our", "work", "first", "well", "way",
            "even", "new", "want", "because", "any", "these", "give", "day",
            "most", "us"]
  common.sort()

  for file in gutenberg.fileids():
    total_words = len(gutenberg.words(file))
    total_common = 0
    for word in gutenberg.words(file):
      if word.lower() in common:
        total_common += 1
    gutenberg_common_freq.append(float(total_common)/total_words)

  for file in brown.fileids():
    total_words = len(brown.words(file))
    total_common = 0
    for word in brown.words(file):
      if word.lower() in common:
        total_common += 1
    brown_common_freq.append(float(total_common)/total_words)

  for file in webtext.fileids():
    total_words = len(webtext.words(file))
    total_common = 0
    for word in webtext.words(file):
      if word.lower() in common:
        total_common += 1
    web_common_freq.append(float(total_common)/total_words)

  for file in inaugural.fileids():
    total_words = len(inaugural.words(file))
    total_common = 0
    for word in inaugural.words(file):
      if word.lower() in common:
        total_common += 1
    inaugural_common_freq.append(float(total_common)/total_words)

  for file in genesis.fileids():
    total_words = len(genesis.words(file))
    total_common = 0
    for word in genesis.words(file):
      if word.lower() in common:
        total_common += 1
    genesis_common_freq.append(float(total_common)/total_words)

  with open("common-words.txt", 'w') as f:
    sys.stdout = f
    f.write("GENESIS, INAUGURAL, WEBTEXT, BROWN, GUTENBERG\n")
    for i in range(max(len(genesis_common_freq), len(inaugural_common_freq),
                        len(web_common_freq), len(brown_common_freq),
                        len(gutenberg_common_freq))):
      for corpus in [genesis_common_freq, inaugural_common_freq,
                     web_common_freq, brown_common_freq, gutenberg_common_freq]:
        if i >= len(corpus):
          f.write(",")
        else:
          f.write(str(round(corpus[i], 5)) + ",")
      f.write("\n")
Example #13
# Use the Brown corpus reader nltk.corpus.brown.words() or the Web text corpus reader nltk.corpus.webtext.words() to
# access some sample text in two different genres.
from nltk.corpus import brown
from nltk.corpus import webtext

brown_genres = brown.categories()
print(brown_genres)
print(brown.sents(categories=brown_genres[-1]))

webtext_genres = webtext.fileids()
print(webtext_genres)
print(webtext.words(webtext_genres[1]))
Example #14
inaugural.words(fileids='1933-Roosevelt.txt')


# # WEBTEXT CORPUS

# In[5]:


from nltk.corpus import webtext


# In[6]:


webtext.fileids()


# In[7]:


webtext.words(fileids='pirates.txt')[:10]


# In[16]:


k= webtext.fileids()


# In[46]:
Example #15
# Read a specific document sentence by sentence.

n = 5

sentence = nltk.corpus.gutenberg.sents('austen-emma.txt')
for i in range(n):
    print(sentence[i])
print("Number of sentences = ", len(sentence))

import nltk
from nltk.corpus import webtext
nltk.download('punkt')
nltk.download('webtext')

# Look up the file IDs of the webtext corpus.
textId = webtext.fileids()
print(textId)

text = """
Natural language processing (NLP) is a subfield of computer science, information engineering, 
and artificial intelligence concerned with the interactions between computers and human (natural) languages, 
in particular how to program computers to process and analyze large amounts of natural language data. 
Challenges in natural language processing frequently involve speech recognition, natural language understanding, 
and natural language generation.
"""

sent_tok = nltk.sent_tokenize(text) # document -> sentences; splits the text above into two sentences (separated by '.')
print(len(sent_tok))

sent_tok[1] # the second sentence (index 1)
len(sent_tok) # the number of sentences
Example #16
import nltk
from nltk.corpus import gutenberg

fileids = gutenberg.fileids()
emma = gutenberg.words("austen-emma.txt")
emmaTxt = nltk.Text(gutenberg.words('austen-emma.txt'))
print(fileids)
print(emma, len(emma), len(set(emma)))
print(emmaTxt, len(emmaTxt), len(set(emmaTxt)))


from nltk.corpus import webtext

fileids = webtext.fileids()
print("webtext fileids", fileids)


from nltk.corpus import brown

fileids = brown.fileids()
print("brown fileids", fileids)

news_text = brown.words(categories="news")
fdist = nltk.FreqDist([w.lower() for w in news_text])
modals = ["can", "could", "may", "mignt", "must", "will"]
for m in modals:
    print "modals," + m + ":", fdist[m]

#-----------------------

from nltk.corpus import words
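# Note: 'chat' below comes from earlier in the original file and is assumed to be a
# token list, e.g. chat = nps_chat.words() (hypothetical binding, not shown here).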
words_only_digits = [w for w in chat if w.isdigit()]
words_lower_unique = len(set([w.lower() for w in chat if w.isalpha() ])) #true vocab
#HERE: TO SOLVE SOME CHAPTER 1 EXERCISES
#---------------------------------
##CHAPTER 2:Accessing Text Corpora
import nltk
#print(nltk.corpus.gutenberg.fileids()) #prints filenames for nltk.gutenberg
emma = nltk.corpus.gutenberg.words('austen-emma.txt') #select text
#print(len(emma))
emma = nltk.Text(emma)  #to use previous functions as with nltk.book txts
print(emma.concordance('surprise'))
print(' '.join(emma[20:50])) #LIST to STRING - comes out as text

#examples of corpus available in nltk
from nltk.corpus import webtext  #less formal text
print(webtext.fileids())  #filenames

from nltk.corpus import nps_chat #predators
print(nps_chat.fileids()) 

from nltk.corpus import brown #brown uni various texts
print(brown.fileids())

from nltk.corpus import reuters
print(reuters.fileids())

from nltk.corpus import inaugural
print(inaugural.fileids())
#page 72 for a variety of corpus functionality commands

Example #18
def get_webtext_raw():
    for fileid in webtext.fileids():
        print(fileid, webtext.raw(fileid)[:65], '...')
Example #19
def webtext():
    from nltk.corpus import webtext

    for fileid in webtext.fileids():
        print(fileid, webtext.raw(fileid)[:65], '...')
Example #20
def main():
  # store one FreqDist of letter counts per corpus
  samples = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"

  brown_letters = FreqDist()
  web_letters = FreqDist()
  inaugural_letters = FreqDist()
  gutenberg_letters = FreqDist()
  genesis_letters = FreqDist()

  for file in gutenberg.fileids():
    for word in gutenberg.words(file):
      for character in word:
        if character in string.ascii_letters:
            gutenberg_letters[character.upper()] += 1

  for file in brown.fileids():
    for word in brown.words(file):
      for character in word:
        if character in string.ascii_letters:
            brown_letters[character.upper()] += 1

  for file in webtext.fileids():
    for word in webtext.words(file):
      for character in word:
        if character in string.ascii_letters:
            web_letters[character.upper()] += 1

  for file in inaugural.fileids():
    for word in inaugural.words(file):
      for character in word:
        if character in string.ascii_letters:
            inaugural_letters[character.upper()] += 1

  for file in genesis.fileids():
    for word in genesis.words(file):
      for character in word:
        if character in string.ascii_letters:
            genesis_letters[character.upper()] += 1

  with open("genesis-letter-freq.txt",'w') as f:
    sys.stdout = f
    f.write("GENESIS\n")
    for let in samples:
        print(str(genesis_letters[let]))
  
  with open("gutenberg-letter-freq.txt", 'w') as f:
    sys.stdout = f
    f.write("GUTENBERG\n")
    for let in samples:
        print(str(gutenberg_letters[let]))
  with open("webtext-letter-freq.txt", 'w') as f:
    sys.stdout = f
    f.write("WEBTEXT\n")
    for let in samples:
        print(str(web_letters[let]))
  with open("inaugural-letter-freq.txt", 'w') as f:
    sys.stdout = f

    f.write("INAUGURAL\n")
    for let in samples:
        print(str(inaugural_letters[let]))
  with open("brown-letter-freq.txt", 'w') as f:
    sys.stdout = f

    f.write("BROWN\n")
    for let in samples:
        print(str(brown_letters[let]))
  
  with open("letter-freq.txt", 'w') as f:
    corpora = [gutenberg_letters, web_letters, inaugural_letters,
        brown_letters, genesis_letters]
    f.write("GUTENBERG,WEBTEXT,INAUGURAL,BROWN,GENESIS\n")
    for let in samples:
      for corpus in corpora:
        f.write(str(corpus[let]) + ",")
      f.write("\n")
Example #21
import nltk
nltk.download()


'''
Tokenization

What:  Separate text into units such as sentences or words
Why:   Gives structure to previously unstructured text
Notes: Relatively easy with English language text, not easy with some languages
'''

# "corpus" = collection of documents
# "corpora" = plural form of corpus
from nltk.corpus import webtext
webtext.fileids()

# wine reviews corpus
text = webtext.raw('wine.txt')
text[:500]

# tokenize into sentences
sentences = [sent for sent in nltk.sent_tokenize(text)]
sentences[:10]

# tokenize into words
tokens = [word for word in nltk.word_tokenize(text)]
tokens[:100]

# only keep tokens that start with a letter (using regular expressions)
import re
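# The original snippet is cut off here; a minimal sketch of the filtering step the
# comment above describes (keep only tokens that start with a letter), assuming the
# 'tokens' list built earlier. The name 'clean_tokens' is only illustrative.
clean_tokens = [t for t in tokens if re.match(r'^[A-Za-z]', t)]
clean_tokens[:100]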
'''
College is so hectic,I'm tired
'''


# In[9]:


#importing library from nltk.corpus
from nltk.corpus import webtext


# In[10]:


webtext.fileids()


# In[11]:


webtext.words('pirates.txt')[:20]


# In[12]:


#printing each file id
file_ids = webtext.fileids()
for file in file_ids:
    print(file)
Example #23
from nltk.corpus import brown, webtext

# Brown corpus
print('Categories:', list(brown.categories()))
print('Brown sample text:\n\t', ' '.join(brown.words(categories='adventure')[:50]))

# Webtext corpus
print()
print('Categories:', webtext.fileids())
print('Webtext sample text:\n\t', ' '.join(webtext.words('firefox.txt')[:50]))
Example #24
    def __init__(self):
        self.number_id = 28
        self.source_id = "webtext"
        self.titles = [name for name in webtext.fileids()]
        self.data = [webtext.raw(name) for name in self.titles]
Example #25
if __name__ == '__main__':
    inputs1 = [\
               'Spread the peanut butter.',\
               'spread the Peanut butter with the knife.',\
               'spread the Peanut butter on the bread.',\
               'get two Slices. of bread.',\
               'get a knife.'\
               ]
    inputs2 = [\
                'spread the peanut butter',\
                'spread the peanut butter',\
                'get a knife.'\
              ]

    # runtime tests
    ff = webtext.fileids()[0]
    #the sentences we want to sample from
    ffs = webtext.raw(ff)
    ffx = unicodedata.normalize('NFKD', ffs).encode('ascii',
                                                    'ignore').split('.')
    ffs = []
    for entry in ffx:
        if len(entry) < 10:
            ffs.append(entry)

    ffs = [x for x in ffs if x != '']
    final = []
    for i in range(1):
        this_round = [[]]
        for size in [1000]:
            inputs = numpy.random.choice(ffs, size=size)
@author: Ritwik Gupta
"""

#20/12/19

from nltk.corpus import brown
brown.categories()
print(brown.words(categories='hobbies')[0:5])

from nltk.corpus import inaugural
inaugural.fileids()
inaugural.words(fileids='1933-Roosevelt.txt')[0:10]

from nltk.corpus import webtext
d1 = {}
for i in webtext.fileids():
    d1[i] = webtext.words(fileids=i)[:20]

#Downloaded the MASC data
import nltk
with open('tweets1.txt', 'r') as f:
    text = f.read().strip()
    text1 = text.split()
    text2 = nltk.Text(text1)
    text2.concordance("good", 1)

#Project Gutenberg
from urllib import request
url = "http://www.gutenberg.org/files/2554/2554-0.txt"
response = request.urlopen(url)
raw = response.read().decode('utf-8')
of external corpora.

NLTK corpus readers. The modules in this package provide functions that can be
used to read corpus files in a variety of formats. These functions can be used
to read both the corpus files that are distributed in the NLTK corpus package,
and corpus files that are part of external corpora.

Common Structures for Text Corpora: The simplest kind of corpus is a collection of 
isolated texts with no particular organization; some corpora are structured 
into categories like genre (Brown Corpus); some categorizations overlap, such as
topic categories (Reuters Corpus); other corpora represent language use over
time (Inaugural Address Corpus).
 
1- Gutenberg Corpus
2- Web and Chat Text
3- Brown Corpus
4- Reuters Corpus
5- Inaugural Address Corpus
6- Annotated Text Corpora
"""

from nltk.corpus import gutenberg, webtext, brown, reuters, inaugural

print "Gutenberg FileIds   :", gutenberg.fileids()
print "Webtext FileIds     :", webtext.fileids()
print "Brown FileIds       :", brown.fileids()
print "Brown Categories    :", brown.categories()
print "Reuters FileIds     :", reuters.fileids()
print "Reuters Categories  :", reuters.categories()
print "Inaugural FileIds   :", inaugural.fileids()
Example #28
        word_sim = {}
        for i in range(self.unique_word):
            vet_w2 = self.w1[i]
            theta_sum = np.dot(vet_w1, vet_w2)
            theta_den = np.linalg.norm(vet_w1) * np.linalg.norm(vet_w2)  # cosine similarity: product of the norms
            theta = theta_sum / theta_den

            word = self.index_word[i]
            word_sim[word] = theta
        sort_word = sorted(word_sim.items(),
                           key=lambda kv: kv[1],
                           reverse=True)

        for word, sim in sort_word[:top_n]:
            print(word, sim)


fx = webtext.raw(webtext.fileids()[0])
corpus = fx[:1000]
print(corpus)
settings = {"train": {"window": 2, "epoch": 3000, "lr": 0.01}}
w2 = word2vec(settings)
pre_pr = w2.pre_process(corpus, ispara=False)
# print(corpus)
training_Data = w2.gen_training_data(pre_pr)
w2.train(training_Data)
t_word = "phoenix"
print(w2.word_vec(t_word))
w2.vec_sim(t_word, 5)
# print(training_Data.size*training_Data.itemsize)
# Importing modules with datasets within nltk.corpus
from nltk.corpus import gutenberg
from nltk.corpus import webtext
from nltk.corpus import nps_chat
from nltk.corpus import brown
from nltk.corpus import reuters

# Printing the list of all dataset names in each module
print('Printing the file IDs for each module...\n')
print('gutenberg:\n', gutenberg.fileids())
print('webtext:\n', webtext.fileids())
print('nps_chat:\n', nps_chat.fileids())
print('brown:\n', brown.fileids())
print('reuters:\n', reuters.fileids())

# Printing the categories of each module
# NOTE: gutenberg, webtext and nps_text do not have "categories"
print('Printing the categories for each module, if available...\n')
print('brown:\n', brown.categories())
print('reuters:\n', reuters.categories())

# Accessing the corpora
# NOTE: TXT files can be accessed through "raw" to get the full files
print('Accessing the sample files...')
print('gutenberg:\n', gutenberg.raw("austen-emma.txt"))

# Accessing sentences of a sample file
print('Getting a list of sentences...')
print('List of sentences from austen-emma.txt:\n', gutenberg.sents("austen-emma.txt"))
print('List of sentences from a chat:\n', nps_chat.posts("10-19-20s_706posts.xml"))
Example #30
def fun04():
    """fun04"""
    for fileid in webtext.fileids():
        print(fileid, webtext.raw(fileid)[:50])
Example #31
def webtext_example():
    #Get raw text for the webtext corpus
    for fileid in webtext.fileids():
        raw_text = webtext.raw(fileid)
        for i, line in enumerate(raw_text.split('\n')):
            print('[{}] {} : {}'.format(fileid, i, line))
import nltk, matplotlib
from nltk.corpus import webtext
print(webtext.fileids())

fileid = 'singles.txt'
wbt_words = webtext.words(fileid)
fdist = nltk.FreqDist(wbt_words)

print('Most frequent token "', fdist.max(), '" count: ', fdist[fdist.max()])
print('Total number of tokens in the corpus: ', fdist.N())
print('The 10 most common words in the corpus are:')
print(fdist.most_common(10))
print('Frequency distribution of the personals ads')
fdist.tabulate()
fdist.plot(cumulative=True)
Example #33
# -*- coding: utf-8 -*-
import matplotlib
matplotlib.use('TkAgg')
import nltk 
'''
☼ Use the Brown corpus reader nltk.corpus.brown.words() or the Web text corpus
reader nltk.corpus.webtext.words() to access some sample text in two different genres.
'''

from nltk.corpus import brown,webtext
romance_text = brown.words(categories='romance')
print(brown.categories())
print(webtext.fileids())
print(webtext.words('firefox.txt'))
Example #34
def fun04():
    """fun04"""
    for fileid in webtext.fileids():
        print(fileid, webtext.raw(fileid)[:50])
Example #35
#Find the length of the longest sentence
longest_len = max([len(s) for s in macbeth_sentences])

#Save the longest sentence(s)
longest_sent = [s for s in macbeth_sentences if len(s) == longest_len]

#********************************************************************************************************
#                                        Web and Chat Text
#********************************************************************************************************

'''
Web Texts
'''
from nltk.corpus import webtext
for fileid in webtext.fileids():
    print(fileid, webtext.raw(fileid)[:65])
'''
Chats
'''
from nltk.corpus import nps_chat
chatroom = nps_chat.posts('10-19-20s_706posts.xml')
chatroom[123]


#********************************************************************************************************
#                                        Brown Corpus
#********************************************************************************************************

from nltk.corpus import brown
Example #36
def fun3():
    from nltk.corpus import webtext
    for fileid in webtext.fileids():
        print(fileid, webtext.raw(fileid)[:65])
Example #37
def exercise_webtext():
    # print the file names of the webtext corpus
    for file_id in webtext.fileids():
        print(file_id)
Example #38
#!/usr/bin/env python
# coding: utf-8

# In[6]:

from nltk.corpus import webtext

# In[7]:

webtext.fileids()

# In[8]:

print(webtext.words(fileids='pirates.txt'))

# In[9]:

for file in webtext.fileids():
    print(webtext.words(fileids=file)[:20])
def webtext_example():
    from nltk.corpus import webtext
    for fileid in webtext.fileids():
        print(fileid, webtext.raw(fileid)[:65], '...')
Example #40
def print_private():
    from nltk.corpus import webtext
    for fileid in webtext.fileids():
        print(fileid, webtext.raw(fileid)[:65])
Example #41
          round(word_count / vocab_count), fileid)
macbeth_sentences = gutenberg.sents(
    'shakespeare-macbeth.txt'
)  # Notes: Display the longest sentence from the macbeth text and its length
print("\n112th Macbeth sentence: ", macbeth_sentences[111])
print("\nNumber of sentences in Macbeth: ", len(macbeth_sentences))
longest_length = max(len(s) for s in macbeth_sentences)
longest_sentence = [
    sentence for sentence in macbeth_sentences
    if len(sentence) == longest_length
]
print("\nLength of longest sentence in Macbeth: ", longest_length)
print("\nLongest sentence in Macbeth: ", longest_sentence)

from nltk.corpus import webtext  # Notes: Web and Chat Corpus. Gutenberg contains formal literature, it is important to consider less formal language as well.
for fileids in webtext.fileids():  # Notes: From the web and chat corpus print the fileids with first 65 characters
    print(fileids, webtext.raw(fileids)[:65], "\n")
from nltk.corpus import nps_chat  # Notes: Instant messaging chat session corpus. Contains over 10,000 'posts'
chatroom = nps_chat.posts(
    '10-19-20s_706posts.xml')  # Notes: format = dd_mm-age_numberofposts.xml
print(chatroom[123])

# Notes: Brown corpus and its categories
# Notes: Reuters corpus
# Notes: Inaugural Address Corpus
# Notes: Annotated Text Corpora
# Notes: Corpora in other languages
# Notes: Loading your own corpus

print(
    "\n--- 2.2 Conditional Frequency Distributions ---\n")