Example #1
def multi_words_xpn(self):
    # gw is the corpus reader (e.g. nltk.corpus.genesis) imported elsewhere in this project
    mwes = []
    bigram_measures = nltk.collocations.BigramAssocMeasures()
    finder = nltk.collocations.BigramCollocationFinder.from_words(gw.words('english-web.txt'))
    finder.apply_freq_filter(self.FILTERING_NUM)
    mwes.append(finder.nbest(bigram_measures.pmi, self.N_GRAM_NUM))
    trigram_measures = nltk.collocations.TrigramAssocMeasures()
    finder = nltk.collocations.TrigramCollocationFinder.from_words(gw.words('english-web.txt'))
    finder.apply_freq_filter(self.FILTERING_NUM)
    # score the trigrams with the trigram measure, not the bigram one
    mwes.append(finder.nbest(trigram_measures.pmi, self.N_GRAM_NUM))
    return mwes
Example #2
def create_example_data():
    import nltk
    try:
        nltk.data.find('corpora/genesis')
        from nltk.corpus import genesis as dataset
    except LookupError:
        # the Genesis corpus is not installed yet; downloading it
        # requires Internet access
        nltk.download('genesis')
        from nltk.corpus import genesis as dataset

    languages = [
        "finnish", "german", "portuguese", "english", "french", "swedish"
    ]

    corpus_words = {
        "finnish": list(dataset.words('finnish.txt')),
        "german": list(dataset.words('german.txt')),
        "portuguese": list(dataset.words('portuguese.txt')),
        "english": list(dataset.words('english-web.txt')),
        "french": list(dataset.words('french.txt')),
        "swedish": list(dataset.words('swedish.txt'))
    }
    return corpus_words
Example #3
from nltk.corpus import brown, webtext, inaugural, gutenberg, genesis


def main():
  # store word lengths, one list per corpus
  brown_word_lens = []
  web_word_lens = []
  inaugural_word_lens = []
  gutenberg_word_lens = []
  genesis_word_lens = []

  for file in gutenberg.fileids():
    for word in gutenberg.words(file):
      gutenberg_word_lens.append(len(word))

  for file in brown.fileids():
    for word in brown.words(file):
      brown_word_lens.append(len(word))

  for file in webtext.fileids():
    for word in webtext.words(file):
      web_word_lens.append(len(word))

  for file in inaugural.fileids():
    for word in inaugural.words(file):
      inaugural_word_lens.append(len(word))

  for file in genesis.fileids():
    for word in genesis.words(file):
      genesis_word_lens.append(len(word))
  with open("wordlens.txt", 'w') as f:
    sys.stdout = f
    f.write("GENESIS, INAUGURAL, WEBTEXT, BROWN, GUTENBERG\n")
    for i in xrange(max(len(genesis_word_lens), len(inaugural_word_lens), 
        len(web_word_lens), len(brown_word_lens),
        len(gutenberg_word_lens))):
      for corpus in [genesis_word_lens, inaugural_word_lens,
          web_word_lens, brown_word_lens, gutenberg_word_lens]:
        if i >= len(corpus):
          f.write(",")
        else:
          f.write(str(corpus[i]) + ",")
      f.write("\n")
Example #4
from nltk.corpus import brown, webtext, inaugural, gutenberg, genesis


def main():
  # store the proportion of common words per file, one list per corpus
  brown_common_freq = []
  web_common_freq = []
  inaugural_common_freq = []
  gutenberg_common_freq = []
  genesis_common_freq = []

  common = ["the", "be", "to", "of", "and", "a", "in", "that", "have",
            "i", "it", "for", "not", "on", "with", "he", "as", "you",
            "do", "at", "this", "but", "his", "by", "from", "they",
            "we", "say", "her", "she", "or", "an", "will", "my", "one",
            "all", "would", "there", "their", "what", "so", "up", "out",
            "if", "about", "who", "get", "which", "go", "me", "when",
            "make", "can", "like", "time", "no", "just", "him", "know",
            "take", "people", "into", "year", "your", "good", "some",
            "could", "them", "see", "other", "than", "then", "now", "look",
            "only", "come", "its", "over", "think", "also", "back", "after",
            "use", "two", "how", "our", "work", "first", "well", "way",
            "even", "new", "want", "because", "any", "these", "give", "day",
            "most", "us"]
  common = frozenset(common)  # use a set for fast membership tests

  for file in gutenberg.fileids():
    total_words = len(gutenberg.words(file))
    total_common = 0
    for word in gutenberg.words(file):
      if word.lower() in common:
        total_common += 1
    gutenberg_common_freq.append(float(total_common)/total_words)

  for file in brown.fileids():
    total_words = len(brown.words(file))
    total_common = 0
    for word in brown.words(file):
      if word.lower() in common:
        total_common += 1
    brown_common_freq.append(float(total_common)/total_words)

  for file in webtext.fileids():
    total_words = len(webtext.words(file))
    total_common = 0
    for word in webtext.words(file):
      if word.lower() in common:
        total_common += 1
    web_common_freq.append(float(total_common)/total_words)

  for file in inaugural.fileids():
    total_words = len(inaugural.words(file))
    total_common = 0
    for word in inaugural.words(file):
      if word.lower() in common:
        total_common += 1
    inaugural_common_freq.append(float(total_common)/total_words)

  for file in genesis.fileids():
    total_words = len(genesis.words(file))
    total_common = 0
    for word in genesis.words(file):
      if word.lower() in common:
        total_common += 1
    genesis_common_freq.append(float(total_common)/total_words)

  with open("common-words.txt", 'w') as f:
    sys.stdout = f
    f.write("GENESIS, INAUGURAL, WEBTEXT, BROWN, GUTENBERG\n")
    for i in xrange(max(len(genesis_common_freq), len(inaugural_common_freq),
                        len(web_common_freq), len(brown_common_freq),
                        len(gutenberg_common_freq))):
      for corpus in [genesis_common_freq, inaugural_common_freq,
                     web_common_freq, brown_common_freq, gutenberg_common_freq]:
        if i >= len(corpus):
          f.write(",")
        else:
          f.write(str(round(corpus[i], 5)) + ",")
      f.write("\n")
Example #5
File: 30.py Project: bmw9t/nltk
# ◑ Use the Porter Stemmer to normalize some tokenized text, calling the stemmer on each word. Do the same thing with the Lancaster Stemmer and see if you observe any differences.

import nltk
from nltk.corpus import genesis

text = genesis.words()
porter = nltk.PorterStemmer()
lancaster = nltk.LancasterStemmer()


for word in text:
	print(word)
	print("porter: " + porter.stem(word))
	print("lancaster: " + lancaster.stem(word))
Example #6
from nltk.corpus import genesis, machado, mac_morpho
from nltk.text import Text
from nltk.util import bigrams
from nltk.misc import babelize_shell

print "*** Introductory Examples for the NLTK Book ***"
print "Loading ptext1, ... and psent1, ..."
print "Type the name of the text or sentence to view it."
print "Type: 'texts()' or 'sents()' to list the materials."

ptext1 = Text(machado.words('romance/marm05.txt'),
              name="Memórias Póstumas de Brás Cubas (1881)")
print "ptext1:", ptext1.name.decode('latin-1')

ptext2 = Text(machado.words('romance/marm08.txt'), name="Dom Casmurro (1899)")
print "ptext2:", ptext2.name.decode('latin-1')

ptext3 = Text(genesis.words('portuguese.txt'), name="Gênesis")
print "ptext3:", ptext3.name.decode('latin-1')

ptext4 = Text(mac_morpho.words('mu94se01.txt'),
              name="Folha de Sau Paulo (1994)")
print "ptext4:", ptext4.name.decode('latin-1')


def texts():
    print "ptext1:", ptext1.name.decode('latin-1')
    print "ptext2:", ptext2.name.decode('latin-1')
    print "ptext3:", ptext3.name.decode('latin-1')
    print "ptext4:", ptext4.name.decode('latin-1')


psent1 = "o amor da glória era a coisa mais verdadeiramente humana que há no homem , e , conseqüentemente , a sua mais genuína feição .".split(
from __future__ import division


def lexical_diversity(text):
    return len(text) / len(set(text))


def lexical_diversity_multiline(text):
    word_count = len(text)
    vocab_size = len(set(text))
    diversity_score = vocab_size / word_count
    return diversity_score


from nltk.corpus import genesis
kjv = genesis.words('english-kjv.txt')
lexical_diversity_multiline(kjv)

# In[72]:


def plural(word):
    if word.endswith('y'):
        return word[:-1] + 'ies'
    elif word[-1] in 'sx' or word[-2:] in ['sh', 'ch']:
        return word + 'es'
    elif word.endswith('an'):
        return word[:-2] + 'en'
    else:
        return word + 's'
Example #8
#Q20
from nltk import FreqDist

words = ['a', 'b', 'c', 'a', 'b', 'b', 'c', 'd', 'b']

fd = FreqDist(words)
# most_common() with no argument returns every sample,
# ordered by decreasing frequency
answer = [w for w, count in fd.most_common()]
print(answer)

#['b', 'a', 'c', 'd']

#Q21
from nltk.corpus import genesis

print(set(genesis.words()).difference(['writing', 'another', 'random', 'sentence']))

#Yes, I am able to do that.

#Q22
from operator import itemgetter

words = ['this', 'is', 'my', 'list', 'of', 'words']

sorted(words, key=itemgetter(1))
# ['of', 'this', 'list', 'words', 'is', 'my']

sorted(words, key=itemgetter(-1))
# ['of', 'this', 'is', 'words', 'list', 'my']

#operator.itemgetter(n) constructs a callable that takes an indexable object (e.g. a list, tuple, or string) and fetches its n-th element; here it picks the n-th character of each word as the sort key.
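
A quick illustration of the callable it builds (reusing the itemgetter imported above):

second = itemgetter(1)
second('words')          # 'o' -- the element at index 1
second(['a', 'b', 'c'])  # 'b'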
Example #9
File: book.py Project: sp00/nltk
from nltk.corpus import gutenberg, genesis, inaugural, nps_chat, webtext, treebank
from nltk.text import Text
from nltk.probability import FreqDist
from nltk.util import bigrams
from nltk.misc import babelize_shell

print "*** Introductory Examples for the NLTK Book ***"
print "Loading text1, ..., text9 and sent1, ..., sent9"
print "Type the name of the text or sentence to view it."
print "Type: 'texts()' or 'sents()' to list the materials."

text1 = Text(gutenberg.words('melville-moby_dick.txt'))
print "text1:", text1.name

text2 = Text(gutenberg.words('austen-sense.txt'))
print "text2:", text2.name

text3 = Text([str(w) for w in genesis.words('english-kjv.txt')],
             name="The Book of Genesis")
print "text3:", text3.name

text4 = Text(inaugural.words(), name="Inaugural Address Corpus")
print "text4:", text4.name

text5 = Text(nps_chat.words(), name="Chat Corpus")
print "text5:", text5.name

text6 = Text(webtext.words('grail.txt'),
             name="Monty Python and the Holy Grail")
print "text6:", text6.name

text7 = Text(treebank.words(), name="Wall Street Journal")
print "text7:", text7.name
Example #10
from nltk.corpus import (gutenberg, genesis, inaugural,
                         nps_chat, webtext, treebank, wordnet)
from nltk.text import Text

print("*** Introductory Examples for the NLTK Book ***")
print("Loading text1, ..., text9 and sent1, ..., sent9")
print("Type the name of the text or sentence to view it.")
print("Type: 'texts()' or 'sents()' to list the materials.")

text1 = Text(gutenberg.words('melville-moby_dick.txt'))
print("text1:", text1.name)

text2 = Text(gutenberg.words('austen-sense.txt'))
print("text2:", text2.name)

text3 = Text(genesis.words('english-kjv.txt'), name="The Book of Genesis")
print("text3:", text3.name)

text4 = Text(inaugural.words(), name="Inaugural Address Corpus")
print("text4:", text4.name)

text5 = Text(nps_chat.words(), name="Chat Corpus")
print("text5:", text5.name)

text6 = Text(webtext.words('grail.txt'), name="Monty Python and the Holy Grail")
print("text6:", text6.name)

text7 = Text(treebank.words(), name="Wall Street Journal")
print("text7:", text7.name)

text8 = Text(webtext.words('singles.txt'), name="Personals Corpus")
Example #11
import os
import re
import sqlite3

from nltk.corpus import abc, brown, genesis, movie_reviews, treebank
from nltk.corpus import words as words_list  # assuming words_list is NLTK's "words" wordlist corpus

conn = sqlite3.connect(os.path.join(os.path.dirname(os.path.realpath(__file__)), "wofkov_db.sqlite"))
c = conn.cursor()

with open('wofkov_db_schema.sql', 'r') as sql:
    commands = sql.read().split(';')
    for command in commands:
        c.execute(command)
    
print "Building clean words list..."
words = [w.lower() for w in brown.words() if re.match("[a-zA-Z'-]+", w.strip()) and w.strip() not in ("''", "'")]
words.extend([w.lower() for w in treebank.words() if re.match("[a-zA-Z'-]+", w.strip()) and w.strip() not in ("''", "'")])
words.extend([w.lower() for w in words_list.words() if re.match("[a-zA-Z'-]+", w.strip()) and w.strip() not in ("''", "'")])
words.extend([w.lower() for w in abc.words() if re.match("[a-zA-Z'-]+", w.strip()) and w.strip() not in ("''", "'")])
words.extend([w.lower() for w in movie_reviews.words() if re.match("[a-zA-Z'-]+", w.strip()) and w.strip() not in ("''", "'")])
words.extend([w.lower() for w in genesis.words() if re.match("[a-zA-Z'-]+", w.strip()) and w.strip() not in ("''", "'")])

print "Building clean sentences list"
sentences = []
for s in brown.sents():
    sentences.append(' '.join(w.lower() for w in s if re.match("[a-zA-Z'-]+", w.strip()) and w.strip() not in ("''", "'")))
for s in treebank.sents():
    sentences.append(' '.join(w.lower() for w in s if re.match("[a-zA-Z'-]+", w.strip()) and w.strip() not in ("''", "'")))
for s in abc.sents():
    sentences.append(' '.join(w.lower() for w in s if re.match("[a-zA-Z'-]+", w.strip()) and w.strip() not in ("''", "'")))
for s in movie_reviews.sents():
    sentences.append(' '.join(w.lower() for w in s if re.match("[a-zA-Z'-]+", w.strip()) and w.strip() not in ("''", "'")))
for s in genesis.sents():
    sentences.append(' '.join(w.lower() for w in s if re.match("[a-zA-Z'-]+", w.strip()) and w.strip() not in ("''", "'")))

    
Example #12
# ◑ Write a function that takes a text and a vocabulary as its arguments and returns the set of words that appear in the text but not in the vocabulary. Both arguments can be represented as lists of strings. Can you do this in a single line, using set.difference()?

from nltk.corpus import genesis

print(set(genesis.words()).difference(['this', 'is', 'my', 'vocabulary', 'lookee']))
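
The same one-liner wrapped as the function the exercise asks for (a minimal sketch; the name unknown_words is just illustrative):

def unknown_words(text, vocab):
    # the set of words that appear in the text but not in the vocabulary
    return set(text).difference(vocab)

print(unknown_words(genesis.words(), ['this', 'is', 'my', 'vocabulary', 'lookee']))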
Example #13
from __future__ import division


def lexical_diversity(my_text_data):
    word_count = len(my_text_data)
    vocab_size = len(set(my_text_data))
    diversity_score = vocab_size / word_count
    return diversity_score


t = "This is a test"
lexical_diversity(t)  # on a bare string this measures character diversity, not word diversity

from nltk.corpus import genesis

lexical_diversity(genesis.words('english-kjv.txt'))

#%%
# WordNet
#  Let's find synonyms

from nltk.corpus import wordnet as wn

wn.synsets('motorcar')
wn.synset('car.n.01').lemma_names()

wn.synsets('dish')
#%%
#Word Hierarchy

motorcar = wn.synset('car.n.01')
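
From the synset, the hierarchy itself can be explored (a short sketch continuing from motorcar):

motorcar.hypernyms()   # [Synset('motor_vehicle.n.01')] -- the more general class
motorcar.hyponyms()    # more specific kinds of car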
Example #14
from nltk.corpus import genesis, machado, mac_morpho
from nltk.text import Text
from nltk.probability import FreqDist
from nltk.util import bigrams
from nltk.misc import babelize_shell

print "*** Introductory Examples for the NLTK Book ***"
print "Loading ptext1, ... and psent1, ..."
print "Type the name of the text or sentence to view it."
print "Type: 'texts()' or 'sents()' to list the materials."

ptext1 = Text(machado.words("romance/marm05.txt"), name="Memórias Póstumas de Brás Cubas (1881)")
print "ptext1:", ptext1.name.decode("latin-1")

ptext2 = Text(machado.words("romance/marm08.txt"), name="Dom Casmurro (1899)")
print "ptext2:", ptext2.name.decode("latin-1")

ptext3 = Text(genesis.words("portuguese.txt"), name="Gênesis")
print "ptext3:", ptext3.name.decode("latin-1")

ptext4 = Text(mac_morpho.words("mu94se01.txt"), name="Folha de Sao Paulo (1994)")
print "ptext4:", ptext4.name.decode("latin-1")


def texts():
    print "ptext1:", ptext1.name.decode("latin-1")
    print "ptext2:", ptext2.name.decode("latin-1")
    print "ptext3:", ptext3.name.decode("latin-1")
    print "ptext4:", ptext4.name.decode("latin-1")


psent1 = "o amor da glória era a coisa mais verdadeiramente humana que há no homem , e , conseqüentemente , a sua mais genuína feição .".split()
psent2 = "Não consultes dicionários .".split()
Example #15
# ◑ Write a function that takes a text and a vocabulary as its arguments and returns the set of words that appear in the text but not in the vocabulary. Both arguments can be represented as lists of strings. Can you do this in a single line, using set.difference()?

from nltk.corpus import genesis

print(
    set(genesis.words()).difference(
        ['this', 'is', 'my', 'vocabulary', 'lookee']))
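
The same difference against a real vocabulary, here NLTK's English wordlist (a sketch; assumes the 'words' corpus is downloaded), surfaces tokens absent from a standard dictionary:

from nltk.corpus import words as english_vocab

oov = set(w.lower() for w in genesis.words('english-kjv.txt')).difference(
    w.lower() for w in english_vocab.words())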
Example #16
from nltk.probability import FreqDist
from nltk.corpus import brown, webtext, inaugural, gutenberg, genesis
import string


def main():
  # count letter frequencies, one FreqDist per corpus
  samples = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"

  brown_letters = FreqDist()
  web_letters = FreqDist()
  inaugural_letters = FreqDist()
  gutenberg_letters = FreqDist()
  genesis_letters = FreqDist()

  for file in gutenberg.fileids():
    for word in gutenberg.words(file):
      for character in word:
        if character in string.ascii_letters:
          gutenberg_letters[character.upper()] += 1

  for file in brown.fileids():
    for word in brown.words(file):
      for character in word:
        if character in string.ascii_letters:
          brown_letters[character.upper()] += 1

  for file in webtext.fileids():
    for word in webtext.words(file):
      for character in word:
        if character in string.ascii_letters:
          web_letters[character.upper()] += 1

  for file in inaugural.fileids():
    for word in inaugural.words(file):
      for character in word:
        if character in string.ascii_letters:
          inaugural_letters[character.upper()] += 1

  for file in genesis.fileids():
    for word in genesis.words(file):
      for character in word:
        if character in string.ascii_letters:
          genesis_letters[character.upper()] += 1

  # one count per line, A through Z, written per corpus
  with open("genesis-letter-freq.txt", 'w') as f:
    f.write("GENESIS\n")
    for let in samples:
      f.write(str(genesis_letters[let]) + "\n")

  with open("gutenberg-letter-freq.txt", 'w') as f:
    f.write("GUTENBERG\n")
    for let in samples:
      f.write(str(gutenberg_letters[let]) + "\n")

  with open("webtext-letter-freq.txt", 'w') as f:
    f.write("WEBTEXT\n")
    for let in samples:
      f.write(str(web_letters[let]) + "\n")

  with open("inaugural-letter-freq.txt", 'w') as f:
    f.write("INAUGURAL\n")
    for let in samples:
      f.write(str(inaugural_letters[let]) + "\n")

  with open("brown-letter-freq.txt", 'w') as f:
    f.write("BROWN\n")
    for let in samples:
      f.write(str(brown_letters[let]) + "\n")

  # combined CSV: one row per letter, one column per corpus
  with open("letter-freq.txt", 'w') as f:
    corpora = [gutenberg_letters, web_letters, inaugural_letters,
               brown_letters, genesis_letters]
    f.write("GUTENBERG,WEBTEXT,INAUGURAL,BROWN,GENESIS\n")
    for let in samples:
      for corpus in corpora:
        f.write(str(corpus[let]) + ",")
      f.write("\n")
Example #17
from nltk.corpus import gutenberg, genesis, inaugural, nps_chat, webtext, treebank
from nltk.text import Text
from nltk.probability import FreqDist
from nltk.util import bigrams
from nltk.misc import babelize_shell

print "*** Introductory Examples for the NLTK Book ***"
print "Loading text1, ..., text9 and sent1, ..., sent9"
print "Type the name of the text or sentence to view it."
print "Type: 'texts()' or 'sents()' to list the materials."

text1 = Text(gutenberg.words('melville-moby_dick.txt'))
print "text1:", text1.name

text2 = Text(gutenberg.words('austen-sense.txt'))
print "text2:", text2.name

text3 = Text([str(w) for w in genesis.words('english-kjv.txt')], name="The Book of Genesis")
print "text3:", text3.name

text4 = Text(inaugural.words(), name="Inaugural Address Corpus")
print "text4:", text4.name

text5 = Text(nps_chat.words(), name="Chat Corpus")
print "text5:", text5.name

text6 = Text(webtext.words('grail.txt'), name="Monty Python and the Holy Grail")
print "text6:", text6.name

text7 = Text(treebank.words(), name="Wall Street Journal")
print "text7:", text7.name

text8 = Text(webtext.words('singles.txt'), name="Personals Corpus")
Example #18
File: pt.py Project: ciju/yql_hash
from nltk.corpus import genesis, machado, mac_morpho
from nltk.text import Text
from nltk.probability import FreqDist
from nltk.util import bigrams
from nltk.misc import babelize_shell

print "*** Introductory Examples for the NLTK Book ***"
print "Loading ptext1, ... and psent1, ..."
print "Type the name of the text or sentence to view it."
print "Type: 'texts()' or 'sents()' to list the materials."

ptext1 = Text(machado.words('romance/marm05.txt'), name="Memórias Póstumas de Brás Cubas (1881)")
print "ptext1:", ptext1.name.decode('latin-1')

ptext2 = Text(machado.words('romance/marm08.txt'), name="Dom Casmurro (1899)")
print "ptext2:", ptext2.name.decode('latin-1')

ptext3 = Text(genesis.words('portuguese.txt'), name="Gênesis")
print "ptext3:", ptext3.name.decode('latin-1')

ptext4 = Text(mac_morpho.words('mu94se01.txt'), name="Folha de Sao Paulo (1994)")
print "ptext4:", ptext4.name.decode('latin-1')

def texts():
    print "ptext1:", ptext1.name.decode('latin-1')
    print "ptext2:", ptext2.name.decode('latin-1')
    print "ptext3:", ptext3.name.decode('latin-1')
    print "ptext4:", ptext4.name.decode('latin-1')

psent1 = "o amor da glória era a coisa mais verdadeiramente humana que há no homem , e , conseqüentemente , a sua mais genuína feição .".split()
psent2 = "Não consultes dicionários .".split()
psent3 = "No princípio, criou Deus os céus e a terra.".split()
psent4 = "A Cáritas acredita que outros cubanos devem chegar ao Brasil .".split()
Example #19

# cdf: presumably a ConditionalFreqDist built in an earlier, elided cell
cdf.plot()
#%%
from __future__ import division

def lexical_diversity(my_text_data):
    word_count = len(my_text_data)
    vocab_size = len(set(my_text_data))
    diversity_score = vocab_size / word_count
    return diversity_score


t = "This is a test"
lexical_diversity(t)  # on a bare string this measures character diversity, not word diversity

from nltk.corpus import genesis
lexical_diversity(genesis.words('english-kjv.txt'))

#%%
# WordNet
#  Let's find synonyms

from nltk.corpus import wordnet as wn

wn.synsets('motorcar')
wn.synset('car.n.01').lemma_names()

wn.synsets('dish')
#%%
#Word Hierarchy

motorcar = wn.synset('car.n.01')
Example #20
import sys
import time

from nltk.corpus import genesis
# textutils and maxdiv come from the surrounding project; their exact
# import paths are not shown in this snippet.

# MIN_LEN and MAX_LEN are module constants defined earlier (elided here)
NUM_INTERVALS = 10

if __name__ == '__main__':

    if len(sys.argv) < 2:
        print(
            'Usage: {} <word2vec-model> [<min-len = {}> [<max-len = {}> [<num-intervals = {}>]]]'
            .format(sys.argv[0], MIN_LEN, MAX_LEN, NUM_INTERVALS))
        exit()

    model = sys.argv[1]
    minLen = int(sys.argv[2]) if len(sys.argv) > 2 else MIN_LEN
    maxLen = int(sys.argv[3]) if len(sys.argv) > 3 else MAX_LEN
    numIntervals = int(sys.argv[4]) if len(sys.argv) > 4 else NUM_INTERVALS

    text = genesis.words(fileids='english-kjv.txt')
    feat = textutils.text2mat(text, model)

    start = time.time()
    intervals = maxdiv(feat,
                       method='gaussian_cov',
                       mode='TS',
                       extint_min_len=minLen,
                       extint_max_len=maxLen,
                       num_intervals=numIntervals)
    stop = time.time()
    print(
        'The search for anomalous paragraphs in a text of {} words took {} seconds.'
        .format(len(text), stop - start))

    textutils.printDetectedParagraphs(text, intervals)
Example #21
from nltk.corpus import gutenberg, genesis, inaugural, nps_chat, webtext, treebank
from nltk.text import Text
from nltk.probability import FreqDist
from nltk.util import bigrams
from nltk.misc import babelize_shell

print("*** Introductory Examples for the NLTK Book ***")
print("Loading text1, ..., text9 and sent1, ..., sent9")
print("Type the name of the text or sentence to view it.")
print("Type: 'texts()' or 'sents()' to list the materials.")

text1 = Text(gutenberg.words('melville-moby_dick.txt'))
print("text1:", text1.name)

text2 = Text(gutenberg.words('austen-sense.txt'))
print("text2:", text2.name)

text3 = Text([str(w) for w in genesis.words('english-kjv.txt')], name="The Book of Genesis")
print("text3:", text3.name)

text4 = Text(inaugural.words(), name="Inaugural Address Corpus")
print("text4:", text4.name)

text5 = Text(nps_chat.words(), name="Chat Corpus")
print("text5:", text5.name)

text6 = Text(webtext.words('grail.txt'), name="Monty Python and the Holy Grail")
print("text6:", text6.name)

text7 = Text(treebank.words(), name="Wall Street Journal")
print("text7:", text7.name)

text8 = Text(webtext.words('singles.txt'), name="Personals Corpus")
Example #22
from nltk.corpus import genesis
from nltk.text import Text


def text3():
    text = Text(genesis.words('english-kjv.txt'), name="The Book of Genesis")
    print("text3:", text.name)
    return text
Example #23
from nltk.corpus import gutenberg, genesis, inaugural, nps_chat, webtext, treebank
from nltk.text import Text
from nltk.probability import FreqDist
from nltk.util import bigrams

print("*** Introductory Examples for the NLTK Book ***")
print("Loading text1, ..., text9 and sent1, ..., sent9")
print("Type the name of the text or sentence to view it.")
print("Type: 'texts()' or 'sents()' to list the materials.")

text1 = Text(gutenberg.words("melville-moby_dick.txt"))
print("text1:", text1.name)

text2 = Text(gutenberg.words("austen-sense.txt"))
print("text2:", text2.name)

text3 = Text(genesis.words("english-kjv.txt"), name="The Book of Genesis")
print("text3:", text3.name)

text4 = Text(inaugural.words(), name="Inaugural Address Corpus")
print("text4:", text4.name)

text5 = Text(nps_chat.words(), name="Chat Corpus")
print("text5:", text5.name)

text6 = Text(webtext.words("grail.txt"), name="Monty Python and the Holy Grail")
print("text6:", text6.name)

text7 = Text(treebank.words(), name="Wall Street Journal")
print("text7:", text7.name)

text8 = Text(webtext.words("singles.txt"), name="Personals Corpus")
Example #24
        # excerpt from a concordance-style class: print the aligned
        # left/right context around each occurrence of the key
        for i in self._index[key]:
            lcontext = ' '.join(self._text[i - wc:i])
            rcontext = ' '.join(self._text[i:i + wc])
            ldisplay = '{:>{width}}'.format(lcontext[-width:], width=width)
            rdisplay = '{:{width}}'.format(rcontext[:width], width=width)
            print(ldisplay, rdisplay)

    def _stem(self, word):
        return self._stemmer.stem(word).lower()


import nltk
from nltk.corpus import genesis
from nltk.corpus import wordnet as wn

porter = nltk.PorterStemmer()
grail = nltk.corpus.webtext.words('grail.txt')


text = genesis.words()


def sem_index(text):
    word_with_syns = []
    # iterate over every word in the text
    for word in text:
        # synsets are equal to all the synsets for the word
        synsets = wn.synsets(word)
        syns_indices = []
        # for every synset in the synset grouping
        for synset in synsets:
            # set the index number equal to its offset
            sem_index_num = synset.offset()
            syns_indices += [sem_index_num]
        if syns_indices: