def multi_words_xpn(self):
    """Collect the best bigram and trigram collocations (ranked by PMI)
    from the English web text.

    Returns a two-element list: [best bigrams, best trigrams], each a list
    of word tuples. N-grams occurring fewer than self.FILTERING_NUM times
    are dropped, and at most self.N_GRAM_NUM entries are kept per order.
    """
    mwes = []

    bigram_measures = nltk.collocations.BigramAssocMeasures()
    finder = nltk.collocations.BigramCollocationFinder.from_words(gw.words('english-web.txt'))
    finder.apply_freq_filter(self.FILTERING_NUM)
    mwes.append(finder.nbest(bigram_measures.pmi, self.N_GRAM_NUM))

    trigram_measures = nltk.collocations.TrigramAssocMeasures()
    finder = nltk.collocations.TrigramCollocationFinder.from_words(gw.words('english-web.txt'))
    finder.apply_freq_filter(self.FILTERING_NUM)
    # BUG FIX: the trigram finder must be scored with the trigram measure;
    # the original passed bigram_measures.pmi here.
    mwes.append(finder.nbest(trigram_measures.pmi, self.N_GRAM_NUM))

    return mwes
def create_example_data():
    """Return word lists from the NLTK Genesis toy corpus for six languages.

    Downloads the corpus on first use (requires Internet access); later
    calls find it in the local NLTK data directory.

    Returns:
        dict mapping language name -> list of word tokens.
    """
    import nltk
    try:
        # nltk.data.find() raises LookupError when the corpus is missing,
        # so the former bare `except:` (which hid unrelated errors) is
        # narrowed to exactly that case.
        nltk.data.find('genesis')
        from nltk.corpus import genesis as dataset
    except LookupError:
        nltk.download('genesis')
        from nltk.corpus import genesis as dataset
    corpus_words = {
        "finnish": list(dataset.words('finnish.txt')),
        "german": list(dataset.words('german.txt')),
        "portuguese": list(dataset.words('portuguese.txt')),
        "english": list(dataset.words('english-web.txt')),
        "french": list(dataset.words('french.txt')),
        "swedish": list(dataset.words('swedish.txt'))
    }
    return corpus_words
def _word_lengths(corpus):
    """Length of every token across all files of an NLTK corpus reader."""
    return [len(word) for fileid in corpus.fileids() for word in corpus.words(fileid)]


def main():
    """Dump per-token word lengths of five corpora as CSV columns.

    Writes wordlens.txt with one column per corpus (header order below);
    columns are padded with empty cells once a shorter corpus runs out of
    tokens, so every row has exactly five comma-terminated fields.
    """
    # Column order matches the header line written below.
    columns = [_word_lengths(corpus)
               for corpus in (genesis, inaugural, webtext, brown, gutenberg)]
    with open("wordlens.txt", 'w') as f:
        # NOTE(review): the original rebound sys.stdout to f and never
        # restored it; output now goes through f.write explicitly.
        f.write("GENESIS, INAUGURAL, WEBTEXT, BROWN, GUTENBERG\n")
        for i in range(max(len(column) for column in columns)):
            for column in columns:
                if i < len(column):
                    f.write(str(column[i]) + ",")
                else:
                    f.write(",")
            f.write("\n")
# The ~100 most common English words, held in a frozenset for O(1)
# membership tests (the original kept a sorted list and linearly scanned
# it for every token).
COMMON_WORDS = frozenset([
    "the", "be", "to", "of", "and", "a", "in", "that", "have", "i", "it",
    "for", "not", "on", "with", "he", "as", "you", "do", "at", "this",
    "but", "his", "by", "from", "they", "we", "say", "her", "she", "or",
    "an", "will", "my", "one", "all", "would", "there", "their", "what",
    "so", "up", "out", "if", "about", "who", "get", "which", "go", "me",
    "when", "make", "can", "like", "time", "no", "just", "him", "know",
    "take", "people", "into", "year", "your", "good", "some", "could",
    "them", "see", "other", "than", "then", "now", "look", "only", "come",
    "its", "over", "think", "also", "back", "after", "use", "two", "how",
    "our", "work", "first", "well", "way", "even", "new", "want",
    "because", "any", "these", "give", "day", "most", "us"])


def _common_word_freqs(corpus):
    """Per-file fraction of tokens that are common English words."""
    freqs = []
    for fileid in corpus.fileids():
        tokens = corpus.words(fileid)
        hits = sum(1 for token in tokens if token.lower() in COMMON_WORDS)
        freqs.append(float(hits) / len(tokens))
    return freqs


def main():
    """Write per-file common-word frequencies of five corpora as CSV columns.

    Output goes to common-words.txt, one column per corpus in the header
    order; shorter columns are padded with empty cells, and each frequency
    is rounded to five decimal places.
    """
    columns = [_common_word_freqs(corpus)
               for corpus in (genesis, inaugural, webtext, brown, gutenberg)]
    with open("common-words.txt", 'w') as f:
        # NOTE(review): the original rebound sys.stdout to f and never
        # restored it; output now goes through f.write explicitly.
        f.write("GENESIS, INAUGURAL, WEBTEXT, BROWN, GUTENBERG\n")
        for i in range(max(len(column) for column in columns)):
            for column in columns:
                if i < len(column):
                    f.write(str(round(column[i], 5)) + ",")
                else:
                    f.write(",")
            f.write("\n")
# ◑ Exercise: normalize the tokenized Genesis text with the Porter stemmer,
# then with the Lancaster stemmer, and compare the two outputs word by word.
import nltk
from nltk.corpus import genesis

text = genesis.words()
porter = nltk.PorterStemmer()
lancaster = nltk.LancasterStemmer()

# For each token, print the raw word followed by both stems.
stemmers = (("porter: ", porter), ("lancaster: ", lancaster))
for word in text:
    print(word)
    for label, stemmer in stemmers:
        print(label + stemmer.stem(word))
# -- Python 2 interactive loader for the NLTK book's Portuguese examples. --
# NOTE(review): Text, machado, genesis and mac_morpho are imported elsewhere
# in the original file; this chunk is truncated mid-statement at the final
# `psent1 = ... .split(` call, so the closing parenthesis is missing.
from nltk.util import bigrams
from nltk.misc import babelize_shell

print "*** Introductory Examples for the NLTK Book ***"
print "Loading ptext1, ... and psent1, ..."
print "Type the name of the text or sentence to view it."
print "Type: 'texts()' or 'sents()' to list the materials."

ptext1 = Text(machado.words('romance/marm05.txt'), name="Memórias Póstumas de Brás Cubas (1881)")
print "ptext1:", ptext1.name.decode('latin-1')
ptext2 = Text(machado.words('romance/marm08.txt'), name="Dom Casmurro (1899)")
print "ptext2:", ptext2.name.decode('latin-1')
ptext3 = Text(genesis.words('portuguese.txt'), name="Gênesis")
print "ptext3:", ptext3.name.decode('latin-1')
# NOTE(review): "Sau Paulo" is presumably a typo for "Sao Paulo" (a sibling
# copy of this loader spells it "Sao") -- confirm before changing the string.
ptext4 = Text(mac_morpho.words('mu94se01.txt'), name="Folha de Sau Paulo (1994)")
print "ptext4:", ptext4.name.decode('latin-1')

def texts():
    # Reprint the names of the four loaded Portuguese texts.
    print "ptext1:", ptext1.name.decode('latin-1')
    print "ptext2:", ptext2.name.decode('latin-1')
    print "ptext3:", ptext3.name.decode('latin-1')
    print "ptext4:", ptext4.name.decode('latin-1')

# Truncated in the source: the close of this .split( call is missing.
psent1 = "o amor da glória era a coisa mais verdadeiramente humana que há no homem , e , conseqüentemente , a sua mais genuína feição .".split(
from __future__ import division

def lexical_diversity(text):
    """Average occurrences per distinct token (total tokens / types)."""
    return len(text) / len(set(text))

def lexical_diversity_multiline(text):
    """Distinct-token ratio (types / total tokens) -- the reciprocal measure."""
    total = len(text)
    distinct = len(set(text))
    return distinct / total

from nltk.corpus import genesis

kjv = genesis.words('english-kjv.txt')
lexical_diversity_multiline(kjv)

# In[72]:

def plural(word):
    """Naive English pluralisation of a single noun."""
    if word.endswith('y'):
        return word[:-1] + 'ies'
    if word[-1] in 'sx' or word[-2:] in ('sh', 'ch'):
        return word + 'es'
    if word.endswith('an'):
        return word[:-2] + 'en'
    return word + 's'
# Q20: list the distinct words ranked by decreasing frequency.
words = ['a', 'b', 'c', 'a', 'b', 'b', 'c', 'd', 'b']
fd = FreqDist(words)
length = len(set(fd))
answer = [token for token, _count in fd.most_common(length)]
print(answer)
# -> ['b', 'a', 'c', 'd']

# Q21: words of Genesis absent from a tiny vocabulary.
from nltk.corpus import genesis
print(set(genesis.words()).difference(['writing', 'another', 'random', 'sentence']))
# Yes, I am able to do that.

# Q22: sort strings by their second / last character.
from operator import itemgetter
words = ['this', 'is', 'my', 'list', 'of', 'words']
sorted(words, key=itemgetter(1))   # ['of', 'this', 'list', 'words', 'is', 'my']
sorted(words, key=itemgetter(-1))  # ['of', 'this', 'is', 'words', 'list', 'my']
# itemgetter(n) builds a callable that fetches the n-th element of whatever
# indexable object it is applied to.
# -- Python 2 loader for the NLTK book's English example texts (text1-text7). --
# NOTE(review): Text, gutenberg, genesis, inaugural, nps_chat, webtext and
# treebank must be imported earlier in the original file.
from nltk.probability import FreqDist
from nltk.util import bigrams
from nltk.misc import babelize_shell

print "*** Introductory Examples for the NLTK Book ***"
print "Loading text1, ..., text9 and sent1, ..., sent9"
print "Type the name of the text or sentence to view it."
print "Type: 'texts()' or 'sents()' to list the materials."

text1 = Text(gutenberg.words('melville-moby_dick.txt'))
print "text1:", text1.name
text2 = Text(gutenberg.words('austen-sense.txt'))
print "text2:", text2.name
# Genesis tokens are coerced via str() before building the Text.
text3 = Text([str(w) for w in genesis.words('english-kjv.txt')], name="The Book of Genesis")
print "text3:", text3.name
text4 = Text(inaugural.words(), name="Inaugural Address Corpus")
print "text4:", text4.name
text5 = Text(nps_chat.words(), name="Chat Corpus")
print "text5:", text5.name
text6 = Text(webtext.words('grail.txt'), name="Monty Python and the Holy Grail")
print "text6:", text6.name
text7 = Text(treebank.words(), name="Wall Street Journal")
print "text7:", text7.name
from nltk.corpus import (gutenberg, genesis, inaugural, nps_chat, webtext, treebank, wordnet)
from nltk.text import Text

# Banner printed when the example texts are loaded interactively.
for banner_line in ("*** Introductory Examples for the NLTK Book ***",
                    "Loading text1, ..., text9 and sent1, ..., sent9",
                    "Type the name of the text or sentence to view it.",
                    "Type: 'texts()' or 'sents()' to list the materials."):
    print(banner_line)

def _announce(label, text):
    """Print a loaded text's name and hand the text back unchanged."""
    print(label, text.name)
    return text

text1 = _announce("text1:", Text(gutenberg.words('melville-moby_dick.txt')))
text2 = _announce("text2:", Text(gutenberg.words('austen-sense.txt')))
text3 = _announce("text3:", Text(genesis.words('english-kjv.txt'), name="The Book of Genesis"))
text4 = _announce("text4:", Text(inaugural.words(), name="Inaugural Address Corpus"))
text5 = _announce("text5:", Text(nps_chat.words(), name="Chat Corpus"))
text6 = _announce("text6:", Text(webtext.words('grail.txt'), name="Monty Python and the Holy Grail"))
text7 = _announce("text7:", Text(treebank.words(), name="Wall Street Journal"))
# text8 is loaded without being announced, as in the original.
text8 = Text(webtext.words('singles.txt'), name="Personals Corpus")
# -- Python 2 script: build the word/sentence lists for the wofkov database. --
# NOTE(review): sqlite3, os, re, brown, treebank, words_list, abc,
# movie_reviews and genesis are imported elsewhere in the original file.
conn = sqlite3.connect(os.path.join(os.path.dirname(os.path.realpath(__file__)), "wofkov_db.sqlite"))
c = conn.cursor()

# Execute the schema one statement at a time (naive split on ';' -- would
# break if any statement embedded a literal semicolon).
with open('wofkov_db_schema.sql', 'r') as sql:
    commands = sql.read().split(';')
    for command in commands:
        c.execute(command)

print "Building clean words list..."
# Keep lower-cased tokens that look like words. Note re.match only anchors
# at the start of the token, so trailing punctuation still passes --
# presumably intentional; confirm before tightening the pattern.
words = [w.lower() for w in brown.words() if re.match("[a-zA-Z'-]+", w.strip()) and w.strip() not in ("''", "'")]
words.extend([w.lower() for w in treebank.words() if re.match("[a-zA-Z'-]+", w.strip()) and w.strip() not in ("''", "'")])
words.extend([w.lower() for w in words_list.words() if re.match("[a-zA-Z'-]+", w.strip()) and w.strip() not in ("''", "'")])
words.extend([w.lower() for w in abc.words() if re.match("[a-zA-Z'-]+", w.strip()) and w.strip() not in ("''", "'")])
words.extend([w.lower() for w in movie_reviews.words() if re.match("[a-zA-Z'-]+", w.strip()) and w.strip() not in ("''", "'")])
words.extend([w.lower() for w in genesis.words() if re.match("[a-zA-Z'-]+", w.strip()) and w.strip() not in ("''", "'")])

print "Building clean sentences list"
# Same token filter, applied sentence by sentence; words_list has no
# sentence reader, so it is absent from this second pass.
sentences = []
for s in brown.sents():
    sentences.append(' '.join(w.lower() for w in s if re.match("[a-zA-Z'-]+", w.strip()) and w.strip() not in ("''", "'")))
for s in treebank.sents():
    sentences.append(' '.join(w.lower() for w in s if re.match("[a-zA-Z'-]+", w.strip()) and w.strip() not in ("''", "'")))
for s in abc.sents():
    sentences.append(' '.join(w.lower() for w in s if re.match("[a-zA-Z'-]+", w.strip()) and w.strip() not in ("''", "'")))
for s in movie_reviews.sents():
    sentences.append(' '.join(w.lower() for w in s if re.match("[a-zA-Z'-]+", w.strip()) and w.strip() not in ("''", "'")))
for s in genesis.sents():
    sentences.append(' '.join(w.lower() for w in s if re.match("[a-zA-Z'-]+", w.strip()) and w.strip() not in ("''", "'")))
# ◑ Exercise: the set of words in a text that are missing from a vocabulary,
# computable in a single expression via set.difference().
from nltk.corpus import genesis

vocabulary = ['this', 'is', 'my', 'vocabulary', 'lookee']
print(set(genesis.words()).difference(vocabulary))
from __future__ import division

def lexical_diversity(my_text_data):
    """Distinct-token ratio of a text: vocabulary size over word count."""
    n_tokens = len(my_text_data)
    n_types = len(set(my_text_data))
    return n_types / n_tokens

t = "This is a test"
lexical_diversity(t)

from nltk.corpus import genesis
lexical_diversity(genesis.words('english-kjv.txt'))

#%%
# WordNet: synonym-set lookups.
from nltk.corpus import wordnet as wn

wn.synsets('motorcar')
wn.synset('car.n.01').lemma_names()
wn.synsets('dish')

#%%
# Word hierarchy.
motorcar = wn.synset('car.n.01')
# -- Python 2 interactive loader for the NLTK book's Portuguese examples. --
# NOTE(review): Text, machado, genesis and mac_morpho are imported elsewhere
# in the original file.
from nltk.probability import FreqDist
from nltk.util import bigrams
from nltk.misc import babelize_shell

print "*** Introductory Examples for the NLTK Book ***"
print "Loading ptext1, ... and psent1, ..."
print "Type the name of the text or sentence to view it."
print "Type: 'texts()' or 'sents()' to list the materials."

ptext1 = Text(machado.words("romance/marm05.txt"), name="Memórias Póstumas de Brás Cubas (1881)")
print "ptext1:", ptext1.name.decode("latin-1")
ptext2 = Text(machado.words("romance/marm08.txt"), name="Dom Casmurro (1899)")
print "ptext2:", ptext2.name.decode("latin-1")
ptext3 = Text(genesis.words("portuguese.txt"), name="Gênesis")
print "ptext3:", ptext3.name.decode("latin-1")
ptext4 = Text(mac_morpho.words("mu94se01.txt"), name="Folha de Sao Paulo (1994)")
print "ptext4:", ptext4.name.decode("latin-1")

def texts():
    # Reprint the names of the four loaded Portuguese texts.
    print "ptext1:", ptext1.name.decode("latin-1")
    print "ptext2:", ptext2.name.decode("latin-1")
    print "ptext3:", ptext3.name.decode("latin-1")
    print "ptext4:", ptext4.name.decode("latin-1")

# Example Portuguese sentences, pre-tokenized by whitespace.
psent1 = "o amor da glória era a coisa mais verdadeiramente humana que há no homem , e , conseqüentemente , a sua mais genuína feição .".split()
psent2 = "Não consultes dicionários .".split()
# ◑ Exercise: report every word of a text absent from a vocabulary; the whole
# computation is a single set.difference() call.
from nltk.corpus import genesis

unknown = set(genesis.words()).difference(
    ['this', 'is', 'my', 'vocabulary', 'lookee'])
print(unknown)
def _letter_counts(corpus):
    """FreqDist of upper-cased ASCII letters over every file of a corpus."""
    counts = FreqDist()
    for fileid in corpus.fileids():
        for word in corpus.words(fileid):
            for character in word:
                # string.ascii_letters replaces the Python-2-only and
                # locale-dependent string.letters.
                if character in string.ascii_letters:
                    counts[character.upper()] += 1
    return counts


def main():
    """Count A-Z letter frequencies in five corpora and write them to files.

    Writes one file per corpus (a header line followed by one count per
    letter, A through Z), plus a combined letter-freq.txt CSV with one
    column per corpus.
    """
    samples = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    gutenberg_letters = _letter_counts(gutenberg)
    brown_letters = _letter_counts(brown)
    web_letters = _letter_counts(webtext)
    inaugural_letters = _letter_counts(inaugural)
    genesis_letters = _letter_counts(genesis)

    # NOTE(review): the original rebound sys.stdout to each file (and never
    # restored it) so that print() landed in the file; output now goes
    # through f.write explicitly, one "<count>\n" line per letter as before.
    per_corpus = [
        ("genesis-letter-freq.txt", "GENESIS", genesis_letters),
        ("gutenberg-letter-freq.txt", "GUTENBERG", gutenberg_letters),
        ("webtext-letter-freq.txt", "WEBTEXT", web_letters),
        ("inaugural-letter-freq.txt", "INAUGURAL", inaugural_letters),
        ("brown-letter-freq.txt", "BROWN", brown_letters),
    ]
    for filename, header, counts in per_corpus:
        with open(filename, 'w') as f:
            f.write(header + "\n")
            for letter in samples:
                f.write(str(counts[letter]) + "\n")

    with open("letter-freq.txt", 'w') as f:
        ordered = [gutenberg_letters, web_letters, inaugural_letters,
                   brown_letters, genesis_letters]
        f.write("GUTENBERG,WEBTEXT,INAUGURAL,BROWN,GENESIS\n")
        for letter in samples:
            for counts in ordered:
                f.write(str(counts[letter]) + ",")
            f.write("\n")
# -- Python 2 loader for the NLTK book's English example texts (text1-text8). --
# NOTE(review): Text, gutenberg, genesis, inaugural, nps_chat, webtext and
# treebank must be imported earlier in the original file.
from nltk.probability import FreqDist
from nltk.util import bigrams
from nltk.misc import babelize_shell

print "*** Introductory Examples for the NLTK Book ***"
print "Loading text1, ..., text9 and sent1, ..., sent9"
print "Type the name of the text or sentence to view it."
print "Type: 'texts()' or 'sents()' to list the materials."

text1 = Text(gutenberg.words('melville-moby_dick.txt'))
print "text1:", text1.name
text2 = Text(gutenberg.words('austen-sense.txt'))
print "text2:", text2.name
# Genesis tokens are coerced via str() before building the Text.
text3 = Text([str(w) for w in genesis.words('english-kjv.txt')], name="The Book of Genesis")
print "text3:", text3.name
text4 = Text(inaugural.words(), name="Inaugural Address Corpus")
print "text4:", text4.name
text5 = Text(nps_chat.words(), name="Chat Corpus")
print "text5:", text5.name
text6 = Text(webtext.words('grail.txt'), name="Monty Python and the Holy Grail")
print "text6:", text6.name
text7 = Text(treebank.words(), name="Wall Street Journal")
print "text7:", text7.name
# text8 is loaded without an announcement line.
text8 = Text(webtext.words('singles.txt'), name="Personals Corpus")
from nltk.probability import FreqDist from nltk.util import bigrams from nltk.misc import babelize_shell print "*** Introductory Examples for the NLTK Book ***" print "Loading ptext1, ... and psent1, ..." print "Type the name of the text or sentence to view it." print "Type: 'texts()' or 'sents()' to list the materials." ptext1 = Text(machado.words('romance/marm05.txt'), name="Memórias Póstumas de Brás Cubas (1881)") print "ptext1:", ptext1.name.decode('latin-1') ptext2 = Text(machado.words('romance/marm08.txt'), name="Dom Casmurro (1899)") print "ptext2:", ptext2.name.decode('latin-1') ptext3 = Text(genesis.words('portuguese.txt'), name="Gênesis") print "ptext3:", ptext3.name.decode('latin-1') ptext4 = Text(mac_morpho.words('mu94se01.txt'), name="Folha de Sau Paulo (1994)") print "ptext4:", ptext4.name.decode('latin-1') def texts(): print "ptext1:", ptext1.name.decode('latin-1') print "ptext2:", ptext2.name.decode('latin-1') print "ptext3:", ptext3.name.decode('latin-1') print "ptext4:", ptext4.name.decode('latin-1') psent1 = "o amor da glória era a coisa mais verdadeiramente humana que há no homem , e , conseqüentemente , a sua mais genuína feição .".split() psent2 = "Não consultes dicionários .".split() psent3 = "No princípio, criou Deus os céus e a terra.".split() psent4 = "A Cáritas acredita que outros cubanos devem chegar ao Brasil .".split()
cdf.plot() #%% from __future__ import division def lexical_diversity(my_text_data): word_count = len(my_text_data) vocab_size = len(set(my_text_data)) diversity_score = vocab_size / word_count return diversity_score t="This is a test" lexical_diversity(t) from nltk.corpus import genesis lexical_diversity(genesis.words('english-kjv.txt')) #%% # WordNet # Let's find synonyms from nltk.corpus import wordnet as wn wn.synsets('motorcar') wn.synset('car.n.01').lemma_names() wn.synsets('dish') #%% #Word Hierarchy motorcar=wn.synset('car.n.01')
NUM_INTERVALS = 10

if __name__ == '__main__':
    args = sys.argv
    if len(args) < 2:
        print(
            'Usage: {} <word2vec-model> [<min-len = {}> [<max-len = {}> [<num-intervals = {}>]]]'
            .format(args[0], MIN_LEN, MAX_LEN, NUM_INTERVALS))
        exit()

    model = args[1]
    # Optional positional arguments fall back to the module-level defaults.
    defaults = (MIN_LEN, MAX_LEN, NUM_INTERVALS)
    minLen, maxLen, numIntervals = (
        int(args[i]) if len(args) > i else default
        for i, default in enumerate(defaults, start=2))

    text = genesis.words(fileids='english-kjv.txt')
    feat = textutils.text2mat(text, model)

    # Time the interval search itself, excluding feature extraction.
    start = time.time()
    intervals = maxdiv(feat, method='gaussian_cov', mode='TS',
                       extint_min_len=minLen, extint_max_len=maxLen,
                       num_intervals=numIntervals)
    stop = time.time()

    print(
        'The search for anomalous paragraphs in a text of {} words took {} seconds.'
        .format(len(text), stop - start))
    textutils.printDetectedParagraphs(text, intervals)
from nltk.probability import FreqDist
from nltk.util import bigrams
from nltk.misc import babelize_shell

print("*** Introductory Examples for the NLTK Book ***")
print("Loading text1, ..., text9 and sent1, ..., sent9")
print("Type the name of the text or sentence to view it.")
print("Type: 'texts()' or 'sents()' to list the materials.")

# Load the example texts first; corpus construction itself produces no
# output, so announcing them afterwards is observably identical.
text1 = Text(gutenberg.words('melville-moby_dick.txt'))
text2 = Text(gutenberg.words('austen-sense.txt'))
text3 = Text([str(w) for w in genesis.words('english-kjv.txt')], name="The Book of Genesis")
text4 = Text(inaugural.words(), name="Inaugural Address Corpus")
text5 = Text(nps_chat.words(), name="Chat Corpus")
text6 = Text(webtext.words('grail.txt'), name="Monty Python and the Holy Grail")
text7 = Text(treebank.words(), name="Wall Street Journal")

for label, loaded in zip(("text1:", "text2:", "text3:", "text4:",
                          "text5:", "text6:", "text7:"),
                         (text1, text2, text3, text4, text5, text6, text7)):
    print(label, loaded.name)

# text8 is loaded without an announcement line, as in the original.
text8 = Text(webtext.words('singles.txt'), name="Personals Corpus")
def text3():
    """Build the Genesis (KJV) Text, announce it, and return it."""
    result = Text(genesis.words('english-kjv.txt'), name="The Book of Genesis")
    print("text3:", result.name)
    return result
from nltk.text import Text
from nltk.probability import FreqDist
from nltk.util import bigrams

# Banner printed when the example texts are loaded interactively.
for banner_line in ("*** Introductory Examples for the NLTK Book ***",
                    "Loading text1, ..., text9 and sent1, ..., sent9",
                    "Type the name of the text or sentence to view it.",
                    "Type: 'texts()' or 'sents()' to list the materials."):
    print(banner_line)

def _report(label, text):
    """Print a loaded text's name and hand the text back unchanged."""
    print(label, text.name)
    return text

text1 = _report("text1:", Text(gutenberg.words("melville-moby_dick.txt")))
text2 = _report("text2:", Text(gutenberg.words("austen-sense.txt")))
text3 = _report("text3:", Text(genesis.words("english-kjv.txt"), name="The Book of Genesis"))
text4 = _report("text4:", Text(inaugural.words(), name="Inaugural Address Corpus"))
text5 = _report("text5:", Text(nps_chat.words(), name="Chat Corpus"))
text6 = _report("text6:", Text(webtext.words("grail.txt"), name="Monty Python and the Holy Grail"))
text7 = _report("text7:", Text(treebank.words(), name="Wall Street Journal"))
# text8 is loaded without being announced, as in the original.
text8 = Text(webtext.words("singles.txt"), name="Personals Corpus")
# NOTE(review): this chunk starts inside a concordance-style method whose
# `def` line is outside the visible source (self, key, wc and width belong
# to that enclosing method), and it ends truncated inside sem_index().
# The indentation below is a best-effort reconstruction.
        for i in self._index[key]:
            # wc words of context each side, clipped/padded to `width` chars;
            # left context is right-aligned, right context left-aligned.
            lcontext = ' '.join(self._text[i - wc:i])
            rcontext = ' '.join(self._text[i:i + wc])
            ldisplay = '{:>{width}}'.format(lcontext[-width:], width=width)
            rdisplay = '{:{width}}'.format(rcontext[:width], width=width)
            print(ldisplay, rdisplay)

    def _stem(self, word):
        # Lower-cased stem of `word` using this instance's stemmer.
        return self._stemmer.stem(word).lower()

porter = nltk.PorterStemmer()
grail = nltk.corpus.webtext.words('grail.txt')
text = genesis.words()

def sem_index(text):
    # Collect, per word, the WordNet offsets of all its synsets.
    word_with_syns = []
    # iterate over every word in the text
    for word in text:
        # all the synsets this word participates in
        synsets = wn.synsets(word)
        syns_indices = []
        for synset in synsets:
            # a synset's offset serves as its stable numeric index
            sem_index_num = synset.offset()
            syns_indices += [sem_index_num]
        if syns_indices:
# NOTE(review): the source is truncated here, mid-function.