Example #1
def multi_words_xpn(self):
    # gw is the corpus reader (e.g. nltk.corpus.genesis) imported elsewhere in this project
    mwes = []
    bigram_measures = nltk.collocations.BigramAssocMeasures()
    finder = nltk.collocations.BigramCollocationFinder.from_words(gw.words('english-web.txt'))
    finder.apply_freq_filter(self.FILTERING_NUM)
    mwes.append(finder.nbest(bigram_measures.pmi, self.N_GRAM_NUM))
    trigram_measures = nltk.collocations.TrigramAssocMeasures()
    finder = nltk.collocations.TrigramCollocationFinder.from_words(gw.words('english-web.txt'))
    finder.apply_freq_filter(self.FILTERING_NUM)
    # score the trigrams with the trigram measure, not the bigram one
    mwes.append(finder.nbest(trigram_measures.pmi, self.N_GRAM_NUM))
    return mwes
Example #2
def create_example_data():
    import nltk
    try:
        nltk.data.find('corpora/genesis')
        from nltk.corpus import genesis as dataset
    except LookupError:
        # the Genesis corpus is not installed yet; downloading it
        # requires Internet access
        nltk.download('genesis')
        from nltk.corpus import genesis as dataset

    languages = [
        "finnish", "german", "portuguese", "english", "french", "swedish"
    ]

    corpus_words = {
        "finnish": list(dataset.words('finnish.txt')),
        "german": list(dataset.words('german.txt')),
        "portuguese": list(dataset.words('portuguese.txt')),
        "english": list(dataset.words('english-web.txt')),
        "french": list(dataset.words('french.txt')),
        "swedish": list(dataset.words('swedish.txt'))
    }
    return corpus_words
Example #3
from nltk.corpus import brown, webtext, inaugural, gutenberg, genesis


def main():
  # store word lengths, one list per corpus
  brown_word_lens = []
  web_word_lens = []
  inaugural_word_lens = []
  gutenberg_word_lens = []
  genesis_word_lens = []

  for file in gutenberg.fileids():
    for word in gutenberg.words(file):
      gutenberg_word_lens.append(len(word))

  for file in brown.fileids():
    for word in brown.words(file):
      brown_word_lens.append(len(word))

  for file in webtext.fileids():
    for word in webtext.words(file):
      web_word_lens.append(len(word))

  for file in inaugural.fileids():
    for word in inaugural.words(file):
      inaugural_word_lens.append(len(word))

  for file in genesis.fileids():
    for word in genesis.words(file):
      genesis_word_lens.append(len(word))
  with open("wordlens.txt", 'w') as f:
    sys.stdout = f
    f.write("GENESIS, INAUGURAL, WEBTEXT, BROWN, GUTENBERG\n")
    for i in xrange(max(len(genesis_word_lens), len(inaugural_word_lens), 
        len(web_word_lens), len(brown_word_lens),
        len(gutenberg_word_lens))):
      for corpus in [genesis_word_lens, inaugural_word_lens,
          web_word_lens, brown_word_lens, gutenberg_word_lens]:
        if i >= len(corpus):
          f.write(",")
        else:
          f.write(str(corpus[i]) + ",")
      f.write("\n")
Example #4
from nltk.corpus import brown, webtext, inaugural, gutenberg, genesis


def main():
  # store the proportion of common words per file, one list per corpus
  brown_common_freq = []
  web_common_freq = []
  inaugural_common_freq = []
  gutenberg_common_freq = []
  genesis_common_freq = []

  common = ["the", "be", "to", "of", "and", "a", "in", "that", "have",
            "i", "it", "for", "not", "on", "with", "he", "as", "you",
            "do", "at", "this", "but", "his", "by", "from", "they",
            "we", "say", "her", "she", "or", "an", "will", "my", "one",
            "all", "would", "there", "their", "what", "so", "up", "out",
            "if", "about", "who", "get", "which", "go", "me", "when",
            "make", "can", "like", "time", "no", "just", "him", "know",
            "take", "people", "into", "year", "your", "good", "some",
            "could", "them", "see", "other", "than", "then", "now", "look",
            "only", "come", "its", "over", "think", "also", "back", "after",
            "use", "two", "how", "our", "work", "first", "well", "way",
            "even", "new", "want", "because", "any", "these", "give", "day",
            "most", "us"]
  common = frozenset(common)  # use a set for fast membership tests

  for file in gutenberg.fileids():
    total_words = len(gutenberg.words(file))
    total_common = 0
    for word in gutenberg.words(file):
      if word.lower() in common:
        total_common += 1
    gutenberg_common_freq.append(float(total_common)/total_words)

  for file in brown.fileids():
    total_words = len(brown.words(file))
    total_common = 0
    for word in brown.words(file):
      if word.lower() in common:
        total_common += 1
    brown_common_freq.append(float(total_common)/total_words)

  for file in webtext.fileids():
    total_words = len(webtext.words(file))
    total_common = 0
    for word in webtext.words(file):
      if word.lower() in common:
        total_common += 1
    web_common_freq.append(float(total_common)/total_words)

  for file in inaugural.fileids():
    total_words = len(inaugural.words(file))
    total_common = 0
    for word in inaugural.words(file):
      if word.lower() in common:
        total_common += 1
    inaugural_common_freq.append(float(total_common)/total_words)

  for file in genesis.fileids():
    total_words = len(genesis.words(file))
    total_common = 0
    for word in genesis.words(file):
      if word.lower() in common:
        total_common += 1
    genesis_common_freq.append(float(total_common)/total_words)

  with open("common-words.txt", 'w') as f:
    sys.stdout = f
    f.write("GENESIS, INAUGURAL, WEBTEXT, BROWN, GUTENBERG\n")
    for i in xrange(max(len(genesis_common_freq), len(inaugural_common_freq),
                        len(web_common_freq), len(brown_common_freq),
                        len(gutenberg_common_freq))):
      for corpus in [genesis_common_freq, inaugural_common_freq,
                     web_common_freq, brown_common_freq, gutenberg_common_freq]:
        if i >= len(corpus):
          f.write(",")
        else:
          f.write(str(round(corpus[i], 5)) + ",")
      f.write("\n")
Example #5
File: 30.py Project: bmw9t/nltk
# ◑ Use the Porter Stemmer to normalize some tokenized text, calling the stemmer on each word. Do the same thing with the Lancaster Stemmer and see if you observe any differences.

import nltk
from nltk.corpus import genesis

text = genesis.words()
porter = nltk.PorterStemmer()
lancaster = nltk.LancasterStemmer()


for word in text:
	print(word)
	print("porter: " + porter.stem(word))
	print("lancaster: " + lancaster.stem(word))
Example #6
from nltk.corpus import genesis, machado, mac_morpho
from nltk.text import Text
from nltk.util import bigrams
from nltk.misc import babelize_shell

print "*** Introductory Examples for the NLTK Book ***"
print "Loading ptext1, ... and psent1, ..."
print "Type the name of the text or sentence to view it."
print "Type: 'texts()' or 'sents()' to list the materials."

ptext1 = Text(machado.words('romance/marm05.txt'),
              name="Memórias Póstumas de Brás Cubas (1881)")
print "ptext1:", ptext1.name.decode('latin-1')

ptext2 = Text(machado.words('romance/marm08.txt'), name="Dom Casmurro (1899)")
print "ptext2:", ptext2.name.decode('latin-1')

ptext3 = Text(genesis.words('portuguese.txt'), name="Gênesis")
print "ptext3:", ptext3.name.decode('latin-1')

ptext4 = Text(mac_morpho.words('mu94se01.txt'),
              name="Folha de Sau Paulo (1994)")
print "ptext4:", ptext4.name.decode('latin-1')


def texts():
    print "ptext1:", ptext1.name.decode('latin-1')
    print "ptext2:", ptext2.name.decode('latin-1')
    print "ptext3:", ptext3.name.decode('latin-1')
    print "ptext4:", ptext4.name.decode('latin-1')


psent1 = "o amor da glória era a coisa mais verdadeiramente humana que há no homem , e , conseqüentemente , a sua mais genuína feição .".split(
from __future__ import division


def lexical_diversity(text):
    return len(text) / len(set(text))


def lexical_diversity_multiline(text):
    word_count = len(text)
    vocab_size = len(set(text))
    diversity_score = vocab_size / word_count
    return diversity_score


from nltk.corpus import genesis
kjv = genesis.words('english-kjv.txt')
lexical_diversity_multiline(kjv)

# In[72]:


def plural(word):
    if word.endswith('y'):
        return word[:-1] + 'ies'
    elif word[-1] in 'sx' or word[-2:] in ['sh', 'ch']:
        return word + 'es'
    elif word.endswith('an'):
        return word[:-2] + 'en'
    else:
        return word + 's'
Example #8
#Q20
from nltk import FreqDist

words = ['a', 'b', 'c', 'a', 'b', 'b', 'c', 'd', 'b']

fd = FreqDist(words)
# most_common() with no argument returns every sample,
# ordered by decreasing frequency
answer = [w for w, count in fd.most_common()]
print(answer)

#['b', 'a', 'c', 'd']

#Q21
from nltk.corpus import genesis

print(set(genesis.words()).difference(['writing', 'another', 'random', 'sentence']))

#Yes, I am able to do that.

#Q22
from operator import itemgetter

words = ['this', 'is', 'my', 'list', 'of', 'words']

sorted(words, key=itemgetter(1))
# ['of', 'this', 'list', 'words', 'is', 'my']

sorted(words, key=itemgetter(-1))
# ['of', 'this', 'is', 'words', 'list', 'my']

#operator.itemgetter(n) constructs a callable that takes an indexable object (e.g. a list, tuple, or string) and fetches its n-th element; here it picks the n-th character of each word as the sort key.
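
A quick illustration of the callable it builds (reusing the itemgetter imported above):

second = itemgetter(1)
second('words')          # 'o' -- the element at index 1
second(['a', 'b', 'c'])  # 'b'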
Example #9
File: book.py Project: sp00/nltk
from nltk.corpus import gutenberg, genesis, inaugural, nps_chat, webtext, treebank
from nltk.text import Text
from nltk.probability import FreqDist
from nltk.util import bigrams
from nltk.misc import babelize_shell

print "*** Introductory Examples for the NLTK Book ***"
print "Loading text1, ..., text9 and sent1, ..., sent9"
print "Type the name of the text or sentence to view it."
print "Type: 'texts()' or 'sents()' to list the materials."

text1 = Text(gutenberg.words('melville-moby_dick.txt'))
print "text1:", text1.name

text2 = Text(gutenberg.words('austen-sense.txt'))
print "text2:", text2.name

text3 = Text([str(w) for w in genesis.words('english-kjv.txt')],
             name="The Book of Genesis")
print "text3:", text3.name

text4 = Text(inaugural.words(), name="Inaugural Address Corpus")
print "text4:", text4.name

text5 = Text(nps_chat.words(), name="Chat Corpus")
print "text5:", text5.name

text6 = Text(webtext.words('grail.txt'),
             name="Monty Python and the Holy Grail")
print "text6:", text6.name

text7 = Text(treebank.words(), name="Wall Street Journal")
print "text7:", text7.name
Example #10
from nltk.corpus import (gutenberg, genesis, inaugural,
                         nps_chat, webtext, treebank, wordnet)
from nltk.text import Text

print("*** Introductory Examples for the NLTK Book ***")
print("Loading text1, ..., text9 and sent1, ..., sent9")
print("Type the name of the text or sentence to view it.")
print("Type: 'texts()' or 'sents()' to list the materials.")

text1 = Text(gutenberg.words('melville-moby_dick.txt'))
print("text1:", text1.name)

text2 = Text(gutenberg.words('austen-sense.txt'))
print("text2:", text2.name)

text3 = Text(genesis.words('english-kjv.txt'), name="The Book of Genesis")
print("text3:", text3.name)

text4 = Text(inaugural.words(), name="Inaugural Address Corpus")
print("text4:", text4.name)

text5 = Text(nps_chat.words(), name="Chat Corpus")
print("text5:", text5.name)

text6 = Text(webtext.words('grail.txt'), name="Monty Python and the Holy Grail")
print("text6:", text6.name)

text7 = Text(treebank.words(), name="Wall Street Journal")
print("text7:", text7.name)

text8 = Text(webtext.words('singles.txt'), name="Personals Corpus")
Example #11
import os
import re
import sqlite3

from nltk.corpus import abc, brown, genesis, movie_reviews, treebank
from nltk.corpus import words as words_list  # assuming words_list is NLTK's "words" wordlist corpus

conn = sqlite3.connect(os.path.join(os.path.dirname(os.path.realpath(__file__)), "wofkov_db.sqlite"))
c = conn.cursor()

with open('wofkov_db_schema.sql', 'r') as sql:
    commands = sql.read().split(';')
    for command in commands:
        c.execute(command)
    
print "Building clean words list..."
words = [w.lower() for w in brown.words() if re.match("[a-zA-Z'-]+", w.strip()) and w.strip() not in ("''", "'")]
words.extend([w.lower() for w in treebank.words() if re.match("[a-zA-Z'-]+", w.strip()) and w.strip() not in ("''", "'")])
words.extend([w.lower() for w in words_list.words() if re.match("[a-zA-Z'-]+", w.strip()) and w.strip() not in ("''", "'")])
words.extend([w.lower() for w in abc.words() if re.match("[a-zA-Z'-]+", w.strip()) and w.strip() not in ("''", "'")])
words.extend([w.lower() for w in movie_reviews.words() if re.match("[a-zA-Z'-]+", w.strip()) and w.strip() not in ("''", "'")])
words.extend([w.lower() for w in genesis.words() if re.match("[a-zA-Z'-]+", w.strip()) and w.strip() not in ("''", "'")])

print "Building clean sentences list"
sentences = []
for s in brown.sents():
    sentences.append(' '.join(w.lower() for w in s if re.match("[a-zA-Z'-]+", w.strip()) and w.strip() not in ("''", "'")))
for s in treebank.sents():
    sentences.append(' '.join(w.lower() for w in s if re.match("[a-zA-Z'-]+", w.strip()) and w.strip() not in ("''", "'")))
for s in abc.sents():
    sentences.append(' '.join(w.lower() for w in s if re.match("[a-zA-Z'-]+", w.strip()) and w.strip() not in ("''", "'")))
for s in movie_reviews.sents():
    sentences.append(' '.join(w.lower() for w in s if re.match("[a-zA-Z'-]+", w.strip()) and w.strip() not in ("''", "'")))
for s in genesis.sents():
    sentences.append(' '.join(w.lower() for w in s if re.match("[a-zA-Z'-]+", w.strip()) and w.strip() not in ("''", "'")))

    
Example #12
# ◑ Write a function that takes a text and a vocabulary as its arguments and returns the set of words that appear in the text but not in the vocabulary. Both arguments can be represented as lists of strings. Can you do this in a single line, using set.difference()?

from nltk.corpus import genesis

print(set(genesis.words()).difference(['this', 'is', 'my', 'vocabulary', 'lookee']))
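
The same one-liner wrapped as the function the exercise asks for (a minimal sketch; the name unknown_words is just illustrative):

def unknown_words(text, vocab):
    # the set of words that appear in the text but not in the vocabulary
    return set(text).difference(vocab)

print(unknown_words(genesis.words(), ['this', 'is', 'my', 'vocabulary', 'lookee']))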
Example #13
from __future__ import division


def lexical_diversity(my_text_data):
    word_count = len(my_text_data)
    vocab_size = len(set(my_text_data))
    diversity_score = vocab_size / word_count
    return diversity_score


t = "This is a test"
lexical_diversity(t)  # on a bare string this measures character diversity, not word diversity

from nltk.corpus import genesis

lexical_diversity(genesis.words('english-kjv.txt'))

#%%
# WordNet
#  Let's find synonyms

from nltk.corpus import wordnet as wn

wn.synsets('motorcar')
wn.synset('car.n.01').lemma_names()

wn.synsets('dish')
#%%
#Word Hierarchy

motorcar = wn.synset('car.n.01')
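
From the synset, the hierarchy itself can be explored (a short sketch continuing from motorcar):

motorcar.hypernyms()   # [Synset('motor_vehicle.n.01')] -- the more general class
motorcar.hyponyms()    # more specific kinds of car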
Example #14
from nltk.corpus import genesis, machado, mac_morpho
from nltk.text import Text
from nltk.probability import FreqDist
from nltk.util import bigrams
from nltk.misc import babelize_shell

print "*** Introductory Examples for the NLTK Book ***"
print "Loading ptext1, ... and psent1, ..."
print "Type the name of the text or sentence to view it."
print "Type: 'texts()' or 'sents()' to list the materials."

ptext1 = Text(machado.words("romance/marm05.txt"), name="Memórias Póstumas de Brás Cubas (1881)")
print "ptext1:", ptext1.name.decode("latin-1")

ptext2 = Text(machado.words("romance/marm08.txt"), name="Dom Casmurro (1899)")
print "ptext2:", ptext2.name.decode("latin-1")

ptext3 = Text(genesis.words("portuguese.txt"), name="Gênesis")
print "ptext3:", ptext3.name.decode("latin-1")

ptext4 = Text(mac_morpho.words("mu94se01.txt"), name="Folha de Sao Paulo (1994)")
print "ptext4:", ptext4.name.decode("latin-1")


def texts():
    print "ptext1:", ptext1.name.decode("latin-1")
    print "ptext2:", ptext2.name.decode("latin-1")
    print "ptext3:", ptext3.name.decode("latin-1")
    print "ptext4:", ptext4.name.decode("latin-1")


psent1 = "o amor da glória era a coisa mais verdadeiramente humana que há no homem , e , conseqüentemente , a sua mais genuína feição .".split()
psent2 = "Não consultes dicionários .".split()
Example #15
# ◑ Write a function that takes a text and a vocabulary as its arguments and returns the set of words that appear in the text but not in the vocabulary. Both arguments can be represented as lists of strings. Can you do this in a single line, using set.difference()?

from nltk.corpus import genesis

print(
    set(genesis.words()).difference(
        ['this', 'is', 'my', 'vocabulary', 'lookee']))
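
The same difference against a real vocabulary, here NLTK's English wordlist (a sketch; assumes the 'words' corpus is downloaded), surfaces tokens absent from a standard dictionary:

from nltk.corpus import words as english_vocab

oov = set(w.lower() for w in genesis.words('english-kjv.txt')).difference(
    w.lower() for w in english_vocab.words())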
Example #16
from nltk.probability import FreqDist
from nltk.corpus import brown, webtext, inaugural, gutenberg, genesis
import string


def main():
  # count letter frequencies, one FreqDist per corpus
  samples = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"

  brown_letters = FreqDist()
  web_letters = FreqDist()
  inaugural_letters = FreqDist()
  gutenberg_letters = FreqDist()
  genesis_letters = FreqDist()

  for file in gutenberg.fileids():
    for word in gutenberg.words(file):
      for character in word:
        if character in string.ascii_letters:
          gutenberg_letters[character.upper()] += 1

  for file in brown.fileids():
    for word in brown.words(file):
      for character in word:
        if character in string.ascii_letters:
          brown_letters[character.upper()] += 1

  for file in webtext.fileids():
    for word in webtext.words(file):
      for character in word:
        if character in string.ascii_letters:
          web_letters[character.upper()] += 1

  for file in inaugural.fileids():
    for word in inaugural.words(file):
      for character in word:
        if character in string.ascii_letters:
          inaugural_letters[character.upper()] += 1

  for file in genesis.fileids():
    for word in genesis.words(file):
      for character in word:
        if character in string.ascii_letters:
          genesis_letters[character.upper()] += 1

  # one count per line, A through Z, written per corpus
  with open("genesis-letter-freq.txt", 'w') as f:
    f.write("GENESIS\n")
    for let in samples:
      f.write(str(genesis_letters[let]) + "\n")

  with open("gutenberg-letter-freq.txt", 'w') as f:
    f.write("GUTENBERG\n")
    for let in samples:
      f.write(str(gutenberg_letters[let]) + "\n")

  with open("webtext-letter-freq.txt", 'w') as f:
    f.write("WEBTEXT\n")
    for let in samples:
      f.write(str(web_letters[let]) + "\n")

  with open("inaugural-letter-freq.txt", 'w') as f:
    f.write("INAUGURAL\n")
    for let in samples:
      f.write(str(inaugural_letters[let]) + "\n")

  with open("brown-letter-freq.txt", 'w') as f:
    f.write("BROWN\n")
    for let in samples:
      f.write(str(brown_letters[let]) + "\n")

  # combined CSV: one row per letter, one column per corpus
  with open("letter-freq.txt", 'w') as f:
    corpora = [gutenberg_letters, web_letters, inaugural_letters,
               brown_letters, genesis_letters]
    f.write("GUTENBERG,WEBTEXT,INAUGURAL,BROWN,GENESIS\n")
    for let in samples:
      for corpus in corpora:
        f.write(str(corpus[let]) + ",")
      f.write("\n")
Example #17
from nltk.corpus import gutenberg, genesis, inaugural, nps_chat, webtext, treebank
from nltk.text import Text
from nltk.probability import FreqDist
from nltk.util import bigrams
from nltk.misc import babelize_shell

print "*** Introductory Examples for the NLTK Book ***"
print "Loading text1, ..., text9 and sent1, ..., sent9"
print "Type the name of the text or sentence to view it."
print "Type: 'texts()' or 'sents()' to list the materials."

text1 = Text(gutenberg.words('melville-moby_dick.txt'))
print "text1:", text1.name

text2 = Text(gutenberg.words('austen-sense.txt'))
print "text2:", text2.name

text3 = Text([str(w) for w in genesis.words('english-kjv.txt')], name="The Book of Genesis")
print "text3:", text3.name

text4 = Text(inaugural.words(), name="Inaugural Address Corpus")
print "text4:", text4.name

text5 = Text(nps_chat.words(), name="Chat Corpus")
print "text5:", text5.name

text6 = Text(webtext.words('grail.txt'), name="Monty Python and the Holy Grail")
print "text6:", text6.name

text7 = Text(treebank.words(), name="Wall Street Journal")
print "text7:", text7.name

text8 = Text(webtext.words('singles.txt'), name="Personals Corpus")
Example #18
File: pt.py Project: ciju/yql_hash
from nltk.corpus import genesis, machado, mac_morpho
from nltk.text import Text
from nltk.probability import FreqDist
from nltk.util import bigrams
from nltk.misc import babelize_shell

print "*** Introductory Examples for the NLTK Book ***"
print "Loading ptext1, ... and psent1, ..."
print "Type the name of the text or sentence to view it."
print "Type: 'texts()' or 'sents()' to list the materials."

ptext1 = Text(machado.words('romance/marm05.txt'), name="Memórias Póstumas de Brás Cubas (1881)")
print "ptext1:", ptext1.name.decode('latin-1')

ptext2 = Text(machado.words('romance/marm08.txt'), name="Dom Casmurro (1899)")
print "ptext2:", ptext2.name.decode('latin-1')

ptext3 = Text(genesis.words('portuguese.txt'), name="Gênesis")
print "ptext3:", ptext3.name.decode('latin-1')

ptext4 = Text(mac_morpho.words('mu94se01.txt'), name="Folha de Sao Paulo (1994)")
print "ptext4:", ptext4.name.decode('latin-1')

def texts():
    print "ptext1:", ptext1.name.decode('latin-1')
    print "ptext2:", ptext2.name.decode('latin-1')
    print "ptext3:", ptext3.name.decode('latin-1')
    print "ptext4:", ptext4.name.decode('latin-1')

psent1 = "o amor da glória era a coisa mais verdadeiramente humana que há no homem , e , conseqüentemente , a sua mais genuína feição .".split()
psent2 = "Não consultes dicionários .".split()
psent3 = "No princípio, criou Deus os céus e a terra.".split()
psent4 = "A Cáritas acredita que outros cubanos devem chegar ao Brasil .".split()
Example #19

# cdf: presumably a ConditionalFreqDist built in an earlier, elided cell
cdf.plot()
#%%
from __future__ import division

def lexical_diversity(my_text_data):
    word_count = len(my_text_data)
    vocab_size = len(set(my_text_data))
    diversity_score = vocab_size / word_count
    return diversity_score


t = "This is a test"
lexical_diversity(t)  # on a bare string this measures character diversity, not word diversity

from nltk.corpus import genesis
lexical_diversity(genesis.words('english-kjv.txt'))

#%%
# WordNet
#  Let's find synonyms

from nltk.corpus import wordnet as wn

wn.synsets('motorcar')
wn.synset('car.n.01').lemma_names()

wn.synsets('dish')
#%%
#Word Hierarchy

motorcar = wn.synset('car.n.01')
Example #20
import sys
import time

from nltk.corpus import genesis
# textutils and maxdiv come from the surrounding project; their exact
# import paths are not shown in this snippet.

# MIN_LEN and MAX_LEN are module constants defined earlier (elided here)
NUM_INTERVALS = 10

if __name__ == '__main__':

    if len(sys.argv) < 2:
        print(
            'Usage: {} <word2vec-model> [<min-len = {}> [<max-len = {}> [<num-intervals = {}>]]]'
            .format(sys.argv[0], MIN_LEN, MAX_LEN, NUM_INTERVALS))
        exit()

    model = sys.argv[1]
    minLen = int(sys.argv[2]) if len(sys.argv) > 2 else MIN_LEN
    maxLen = int(sys.argv[3]) if len(sys.argv) > 3 else MAX_LEN
    numIntervals = int(sys.argv[4]) if len(sys.argv) > 4 else NUM_INTERVALS

    text = genesis.words(fileids='english-kjv.txt')
    feat = textutils.text2mat(text, model)

    start = time.time()
    intervals = maxdiv(feat,
                       method='gaussian_cov',
                       mode='TS',
                       extint_min_len=minLen,
                       extint_max_len=maxLen,
                       num_intervals=numIntervals)
    stop = time.time()
    print(
        'The search for anomalous paragraphs in a text of {} words took {} seconds.'
        .format(len(text), stop - start))

    textutils.printDetectedParagraphs(text, intervals)
Example #21
from nltk.corpus import gutenberg, genesis, inaugural, nps_chat, webtext, treebank
from nltk.text import Text
from nltk.probability import FreqDist
from nltk.util import bigrams
from nltk.misc import babelize_shell

print("*** Introductory Examples for the NLTK Book ***")
print("Loading text1, ..., text9 and sent1, ..., sent9")
print("Type the name of the text or sentence to view it.")
print("Type: 'texts()' or 'sents()' to list the materials.")

text1 = Text(gutenberg.words('melville-moby_dick.txt'))
print("text1:", text1.name)

text2 = Text(gutenberg.words('austen-sense.txt'))
print("text2:", text2.name)

text3 = Text([str(w) for w in genesis.words('english-kjv.txt')], name="The Book of Genesis")
print("text3:", text3.name)

text4 = Text(inaugural.words(), name="Inaugural Address Corpus")
print("text4:", text4.name)

text5 = Text(nps_chat.words(), name="Chat Corpus")
print("text5:", text5.name)

text6 = Text(webtext.words('grail.txt'), name="Monty Python and the Holy Grail")
print("text6:", text6.name)

text7 = Text(treebank.words(), name="Wall Street Journal")
print("text7:", text7.name)

text8 = Text(webtext.words('singles.txt'), name="Personals Corpus")
Example #22
from nltk.corpus import genesis
from nltk.text import Text


def text3():
    text = Text(genesis.words('english-kjv.txt'), name="The Book of Genesis")
    print("text3:", text.name)
    return text
Example #23
from nltk.corpus import gutenberg, genesis, inaugural, nps_chat, webtext, treebank
from nltk.text import Text
from nltk.probability import FreqDist
from nltk.util import bigrams

print("*** Introductory Examples for the NLTK Book ***")
print("Loading text1, ..., text9 and sent1, ..., sent9")
print("Type the name of the text or sentence to view it.")
print("Type: 'texts()' or 'sents()' to list the materials.")

text1 = Text(gutenberg.words("melville-moby_dick.txt"))
print("text1:", text1.name)

text2 = Text(gutenberg.words("austen-sense.txt"))
print("text2:", text2.name)

text3 = Text(genesis.words("english-kjv.txt"), name="The Book of Genesis")
print("text3:", text3.name)

text4 = Text(inaugural.words(), name="Inaugural Address Corpus")
print("text4:", text4.name)

text5 = Text(nps_chat.words(), name="Chat Corpus")
print("text5:", text5.name)

text6 = Text(webtext.words("grail.txt"), name="Monty Python and the Holy Grail")
print("text6:", text6.name)

text7 = Text(treebank.words(), name="Wall Street Journal")
print("text7:", text7.name)

text8 = Text(webtext.words("singles.txt"), name="Personals Corpus")
Example #24
        # excerpt from a concordance-style class: print the aligned
        # left/right context around each occurrence of the key
        for i in self._index[key]:
            lcontext = ' '.join(self._text[i - wc:i])
            rcontext = ' '.join(self._text[i:i + wc])
            ldisplay = '{:>{width}}'.format(lcontext[-width:], width=width)
            rdisplay = '{:{width}}'.format(rcontext[:width], width=width)
            print(ldisplay, rdisplay)

    def _stem(self, word):
        return self._stemmer.stem(word).lower()


import nltk
from nltk.corpus import genesis
from nltk.corpus import wordnet as wn

porter = nltk.PorterStemmer()
grail = nltk.corpus.webtext.words('grail.txt')


text = genesis.words()


def sem_index(text):
    word_with_syns = []
    # iterate over every word in the text
    for word in text:
        # synsets are equal to all the synsets for the word
        synsets = wn.synsets(word)
        syns_indices = []
        # for every synset in the synset grouping
        for synset in synsets:
            # set the index number equal to its offset
            sem_index_num = synset.offset()
            syns_indices += [sem_index_num]
        if syns_indices: