Python wordsの例、nltk.corpus.cmudict.words Pythonの例

コード例 #1

0

ファイルを表示

def initial_consonants():
    """Yield the leading consonant cluster of each cmudict word.

    For every word returned by ``cmudict.words()``, characters are collected
    from the start of the word for as long as they are ASCII letters other
    than the lowercase vowels ``aeiou``. When the first other character is
    reached the collected run is yielded, provided it is non-empty.

    A word made up only of such consonant letters never reaches a
    terminating character, so nothing is yielded for it.
    """
    for entry in cmudict.words():
        cluster = []
        for ch in entry:
            if ch in 'aeiou' or ch not in string.ascii_letters:
                # The first vowel or non-letter closes the cluster.
                if cluster:
                    yield ''.join(cluster)
                break
            cluster.append(ch)

コード例 #2

0

ファイルを表示

def final_consonants():
    """Yield the trailing consonant cluster of each cmudict word.

    Each word returned by ``cmudict.words()`` is scanned from its last
    character backwards. ASCII letters other than the lowercase vowels
    ``aeiou`` are collected until some other character is met; the collected
    run is then yielded in its original left-to-right order, provided it is
    non-empty.

    A word made up only of such consonant letters yields nothing.
    """
    for entry in cmudict.words():
        tail = []
        for ch in reversed(entry):
            if ch in 'aeiou' or ch not in string.ascii_letters:
                if tail:
                    # Collected back-to-front, so flip before yielding.
                    yield ''.join(reversed(tail))
                break
            tail.append(ch)

コード例 #3

0

ファイルを表示

ファイル: hpylm.py プロジェクト: ginesiametlle/hpylm

 def __init__(self, order):
     """Initialise a model of the given order.

     Args:
         order: order of the model. An order of 1 backs off to a
             ``G0Uniform`` distribution; any other order backs off to a
             new ``HPYLM`` of order ``order - 1``.
     """
     self.order = order
     # Prior distribution over parameters (initially discount=0.8, strength=1.0).
     self.prior = PYPrior(0.8, 1.0)
     # Back-off distribution. The uniform base is sized from the cmudict
     # word list plus 2.
     # NOTE(review): the "+ 2" presumably accounts for two extra symbols
     # (e.g. boundary markers) -- confirm.
     self.backoff = (
         G0Uniform(len(cmudict.words()) + 2)
         if order == 1
         else HPYLM(order - 1)
     )
     # Maps each context to its corresponding Pitman-Yor Process.
     self.u2pyp = {}

コード例 #4

0

ファイルを表示

def CreatePickle(AlgQuiet=False):
    """Build the per-word syllable-count table and pickle it to disk.

    Every word from ``cmudict.words()`` is lower-cased and looked up in
    ``GzzCMUDict``. The first time a word is seen, the sorted list of its
    syllable counts (one count per pronunciation) is stored in the
    module-level ``GdcSyllableCount``. The table is then dumped to
    ``cmusyllables.pickle`` in the current working directory.

    Args:
        AlgQuiet: when true, suppress the progress messages.

    Raises:
        IOError: if building the table or writing the pickle file fails.
    """

    def SyllableCount(AszWord):
        """Return the syllable count of every pronunciation of ``AszWord``.

        A phoneme is counted as a syllable when its last character is a
        digit. The result has one entry per pronunciation.
        """
        # http://groups.google.com/group/nltk-users/msg/81e70cb6704dc01e?pli=1
        return [
            len([y for y in x if isdigit(y[-1])])
            for x in GzzCMUDict[AszWord.lower()]
        ]

    try:
        for LszLine in cmudict.words():
            LszWord = LszLine.split(' ')[0].lower()
            LliSyllableList = SyllableCount(LszWord)

            if LszWord not in GdcSyllableCount:
                GdcSyllableCount[LszWord] = sorted(LliSyllableList)
                if not AlgQuiet:
                    print("%-20s added %s" % (LszWord, LliSyllableList))
            elif not AlgQuiet:
                print("  -Word (%s) found twice. First count was %s, second was %s" % (
                    LszWord, GdcSyllableCount[LszWord], LliSyllableList))
    except Exception:
        # Callers are only promised IOError; KeyboardInterrupt/SystemExit
        # are deliberately no longer swallowed here.
        print("An error was encountered processing the file.")
        raise IOError("error while processing the cmudict word list")

    try:
        #-----
        # Now write the dictionary away to a new pickle file.
        # Protocol -1 selects the highest (binary) pickle protocol, so the
        # file must be opened in binary mode; ``with`` guarantees it is
        # flushed and closed.
        with open('cmusyllables.pickle', 'wb') as LhaOutputFile:
            if not AlgQuiet:
                print("Finished processing input file\n\nNow dumping pickle file\n")
            pickle.dump(GdcSyllableCount, LhaOutputFile, -1)

        if not AlgQuiet:
            print("Pickle file cmusyllables.pickle has been created.")
    except Exception:
        print("An error was encountered writing the pickle file.")
        raise IOError("error while writing cmusyllables.pickle")

コード例 #5

0

ファイルを表示

ファイル: utils.py プロジェクト: infinitin/the_poetry_plagiarist

def get_pronunciations(word):
    """Return the ``dictionary`` entry for ``word``, or for its closest match.

    A direct lookup in ``dictionary`` is tried first. If that raises
    ``KeyError``, the single closest word from ``cmudict.words()`` (as chosen
    by ``get_close_matches``) is looked up instead.
    """
    try:
        return dictionary[word]
    except KeyError:
        # Unknown word: fall through to fuzzy matching below.
        pass

    nearest = get_close_matches(word, cmudict.words(), 1)
    # NOTE(review): ``nearest[0]`` raises IndexError when no word is close
    # enough to match.
    # Other options to make the fallback more accurate:
    #   - break many-syllable words into likely part-words
    #   - try all combos (stress/syllables only)
    #   - add words from Shakespeare sonnets / limericks, or add manually
    # It could also be biased towards fewer changes near the end of the word,
    # for the sake of rhyme only.
    return dictionary[nearest[0]]

コード例 #6

0

ファイルを表示

ファイル: syllables_buildpickle.py プロジェクト: chmullig/datascience-aes

def CreatePickle(AlgQuiet=False):
    """Build the per-word syllable-count table and pickle it to disk.

    Every word from ``cmudict.words()`` is lower-cased and looked up in
    ``GzzCMUDict``. The first time a word is seen, the sorted list of its
    syllable counts (one count per pronunciation) is stored in the
    module-level ``GdcSyllableCount``. The table is then dumped to
    ``cmusyllables.pickle`` in the current working directory.

    Args:
        AlgQuiet: when true, suppress the progress messages.

    Raises:
        IOError: if building the table or writing the pickle file fails.
    """

    def SyllableCount(AszWord):
        """Return the syllable count of every pronunciation of ``AszWord``.

        A phoneme is counted as a syllable when its last character is a
        digit. The result has one entry per pronunciation.
        """
        # http://groups.google.com/group/nltk-users/msg/81e70cb6704dc01e?pli=1
        return [len([y for y in x if isdigit(y[-1])]) for x in GzzCMUDict[AszWord.lower()]]

    try:
        for LszLine in cmudict.words():
            LszWord = LszLine.split(' ')[0].lower()
            LliSyllableList = SyllableCount(LszWord)

            if LszWord not in GdcSyllableCount:
                GdcSyllableCount[LszWord] = sorted(LliSyllableList)
                if not AlgQuiet:
                    print("%-20s added %s" % (LszWord, LliSyllableList))
            elif not AlgQuiet:
                print("  -Word (%s) found twice. First count was %s, second was %s" % (LszWord, GdcSyllableCount[LszWord], LliSyllableList))
    except Exception:
        # Callers are only promised IOError; KeyboardInterrupt/SystemExit
        # are deliberately no longer swallowed here.
        print("An error was encountered processing the file.")
        raise IOError("error while processing the cmudict word list")

    try:
        #-----
        # Now write the dictionary away to a new pickle file.
        # Protocol -1 selects the highest (binary) pickle protocol, so the
        # file must be opened in binary mode; ``with`` guarantees it is
        # flushed and closed.
        with open('cmusyllables.pickle', 'wb') as LhaOutputFile:
            if not AlgQuiet:
                print("Finished processing input file\n\nNow dumping pickle file\n")
            pickle.dump(GdcSyllableCount, LhaOutputFile, -1)

        if not AlgQuiet:
            print("Pickle file cmusyllables.pickle has been created.")
    except Exception:
        print("An error was encountered writing the pickle file.")
        raise IOError("error while writing cmusyllables.pickle")

コード例 #7

0

ファイルを表示

def vowels():
    """Yield the leading and trailing vowel runs of each cmudict word.

    For every word returned by ``cmudict.words()`` the run of lowercase
    vowels (``aeiou``) at the start of the word is yielded first, then the
    run at the end of the word (in its original left-to-right order).

    A run is yielded only when it is non-empty and is terminated by a
    non-vowel character; a word consisting solely of vowels yields nothing.
    """

    def vowel_prefix(chars):
        # Vowels at the front of ``chars``, up to the first non-vowel.
        # Returns '' when ``chars`` runs out before a non-vowel is seen.
        run = ''
        for ch in chars:
            if ch not in 'aeiou':
                return run
            run += ch
        return ''

    for entry in cmudict.words():
        head = vowel_prefix(entry)
        if head:
            yield head

        # Scan the reversed word, then flip the run back.
        tail = vowel_prefix(entry[::-1])
        if tail:
            yield tail[::-1]

コード例 #8

0

ファイルを表示

ファイル: halloweenpoet.py プロジェクト: Jeuro/itcc

import os.path
import pickle
import random
import sys
from collections import Counter, defaultdict
from curses.ascii import isdigit
from itertools import islice
from nltk.corpus import cmudict
from nltk.tokenize import RegexpTokenizer


# Uppercase vowel letters and uppercase consonant letters of the alphabet.
VOWELS = "AEIOU"
CONSONANTS = "BCDFGHJKLMNPQRSTVWXYZ"
# Horizontal rule made of dashes.
# NOTE(review): presumably used as a separator when printing -- confirm.
BORDER = "-----------------------------------"
# CMU Pronouncing Dictionary data, loaded once when the module is imported:
# ``phonedict`` is the result of cmudict.dict(), ``cmuwords`` of cmudict.words().
phonedict = cmudict.dict()
cmuwords = cmudict.words()


class TextHandler:
    """Holds the input files and word lists and builds the reverse matrix.

    NOTE(review): ``read_scary_words``, ``load_rhyme_dict``, ``merge_text``
    and ``create_reverse_matrix`` are called below but are not defined in the
    visible part of this class.
    """

    def __init__(self, order, files, rhyme_file, scary_file):
        """Store ``order`` and ``files`` and load the two word resources.

        ``scary_file`` is passed to ``read_scary_words`` first, then
        ``rhyme_file`` to ``load_rhyme_dict``.
        """
        self.order, self.files = order, files
        self.scary_words = self.read_scary_words(scary_file)
        self.rhyme_dict = self.load_rhyme_dict(rhyme_file)

    def get_matrix(self):
        """Merge the files, tokenize the text and return the reverse matrix.

        Tokens are runs of word characters and apostrophes.
        """
        merged = self.merge_text(self.files)
        tokens = RegexpTokenizer(r"[\w\']+").tokenize(merged)
        return self.create_reverse_matrix(tokens)

コード例 #9

0

ファイルを表示

# A sample logging configuration. The only tangible logging
# performed by this configuration is to send an email to
# the site admins on every HTTP 500 error.
# See http://docs.djangoproject.com/en/dev/topics/logging for
# more details on how to customize your logging configuration.
LOGGING = {
    'version': 1,
    'disable_existing_loggers': False,
    'handlers': {
        # Single handler, 'mail_admins': Django's AdminEmailHandler at
        # level ERROR.
        'mail_admins': {
            'level': 'ERROR',
            'class': 'django.utils.log.AdminEmailHandler'
        }
    },
    'loggers': {
        # 'django.request' records at level ERROR go to 'mail_admins' and
        # are also propagated to parent loggers.
        'django.request': {
            'handlers': ['mail_admins'],
            'level': 'ERROR',
            'propagate': True,
        },
    }
}

from nltk.corpus import cmudict

# Word list of the CMU Pronouncing Dictionary, loaded when this module is
# imported.
WORDS = cmudict.words()

# Star-import placed last: any name defined in ``localsettings`` replaces the
# value assigned above in this module.
from localsettings import *

コード例 #10

0

ファイルを表示

def getWordSyllablesLessOrEq(syllableNum):
    """Return a random cmudict word with at most ``syllableNum`` syllables.

    Words are drawn uniformly from ``cmudict.words()`` until one is found
    whose first pronunciation in ``d`` has a ``countSyllables`` value not
    greater than ``syllableNum``.

    NOTE(review): if no word satisfies the limit this loops forever.
    """
    # Fetch the word list once: the original re-requested it from the corpus
    # reader on every retry, which is loop-invariant work.
    words = cmudict.words()
    word = random.choice(words)
    while countSyllables(d[word][0]) > syllableNum:
        word = random.choice(words)
    return word

コード例 #11

0

ファイルを表示

ファイル: max_match_algorithm.py プロジェクト: nisgarg86/natural_language_processing

'''
This Python code implements the max-match algorithm: it segments/tokenizes
words from a sentence written without any spaces or delimiters.
It uses a predefined dictionary of words and a greedy approach: it tries to
match the longest possible word at the start of the string and, if no word
matches, treats a single character as a word.
It works very well on Chinese, not so well on English.
'''

__author__ = 'nishant'

from nltk.corpus import cmudict

# Dictionary used for matching: the cmudict word list, loaded at import time.
WORD_LIST = cmudict.words()

def max_match(string):
    """Greedily split ``string`` into words found in ``WORD_LIST``.

    Prefixes of ``string`` are tried from longest to shortest; on the first
    prefix found in ``WORD_LIST`` the result is that prefix followed by the
    segmentation of the remainder. An empty string gives an empty list.

    NOTE(review): the end of this function is not visible here -- the
    fallback branch computes ``first_word``/``remaining_string`` but the
    statement that uses them is cut off.
    NOTE(review): ``in WORD_LIST`` is evaluated once per candidate prefix;
    if WORD_LIST is a plain list each test is a linear scan -- a set would
    be cheaper.
    """

    length = len(string)
    # base condition of recursion
    if length == 0:
        return []

    # i = 0 tries the whole string, i = length - 1 tries just the first character
    for i in range(length):
        first_word = string[:length-i]
        remaining_string = string[length-i:]

        if first_word in WORD_LIST:
            return [first_word] + max_match(remaining_string)

    # if no word matches, we consider the first character as a single word and apply max match recursively on remaining string
    first_word = string[0]
    remaining_string = string[1:]

コード例 #12

0

ファイルを表示

ファイル: textPreproc.py プロジェクト: kenpachiii/clean-crowdsourced-text

# NOTE(review): this is an excerpt from the middle of a script; ``tokens``,
# ``num2words``, ``spell``, ``formatter``, ``brown``, ``Counter`` and ``io``
# are defined or imported outside the visible part.
print "Convert numbers into words..."
# Map every all-digit token to the value num2words returns for it, then
# substitute those tokens in the token stream.
numbers = [str(x) for x in tokens if x.isdigit()]
words = [num2words(num) for num in numbers]
numwords = dict(zip(numbers, words))
tokens_nw = [numwords[x] if x in numwords else x for x in tokens]

#==============================================================================
# Normalise pronunciation to US English and correct misspellings.
# Get the brown and cmudict corpora and count information.
# Will use brown corpus for correction instead of cmudict as the latter
# does not provide frequency information.
#==============================================================================
print "Load corpora..."
dict_brown = Counter(brown.words())
dict_cmu = Counter(cmudict.words())
# some additional known strings
dict_mine = Counter(
    ["mrs", "miss", "mr", "dr", "prof", "dr.", "prof.", "\n", "lt.", '"'])
# Adding the counters and taking the set keeps just the distinct known strings.
full_dict = set(dict_brown + dict_cmu + dict_mine)

print "Correct spelling and US grammar..."
# Tokens already in full_dict pass through; all others go through spell()
# together with the brown counts.
tokens_sp = [x if x in full_dict else spell(x, dict_brown) for x in tokens_nw]
text_sp = ' '.join(word for word in tokens_sp)

# Format text
text_clean = formatter(text_sp)

print "Write cleaned text to 'cleaned_text.txt'..."
with io.open('./cleaned_text.txt', 'w', encoding='utf-8') as fid:
    fid.write(text_clean)

コード例 #13

0

ファイルを表示

ファイル: scansion.py プロジェクト: jrladd/scansion

#!/usr/bin/env python

#----------------#
# All lines in EEBO: ./scansion.py ../../data/eebo_tcp_MARCH_2015/just_lines/
#----------------#

from nltk.corpus import cmudict
from nltk.corpus import stopwords
import sys, glob, codecs
import Levenshtein as lev
from collections import defaultdict

# First and second command-line arguments.
# NOTE(review): the usage line in the file header passes only one argument,
# but two are read here -- confirm the intended invocation.
i = sys.argv[1]
output = sys.argv[2]
# CMU Pronouncing Dictionary data, loaded once at start-up:
# ``prondict`` from cmudict.dict(), ``cmuwords`` from cmudict.words().
prondict = cmudict.dict()
cmuwords = cmudict.words()


def just_stress(
    word
):  #Find a word in cmudict and return the numerical stresses for that word
    # NOTE(review): the remainder of this function is not visible here; only
    # the start of the multi-pronunciation branch can be described.
    # Raises KeyError if ``word`` is not a key of ``prondict``.
    prons = prondict[word]
    stress = []
    if len(
            prons
    ) > 1:  #For one-syllable words, prefer a 0 over a 1 (if one of prondict's options is a 0)
        possibles = []
        for s in prons:
            # Join the digit characters found in each element of this
            # pronunciation into a single string.
            possible_stress = ''.join([
                ''.join([char for char in syllable if char.isdigit()])
                for syllable in s

コード例 #14

0

ファイルを表示

ファイル: test_minimum_phonetic_distance_calculator.py プロジェクト: swigder/phonetic_distance

 def test_closest_word_krypton(self):
     """Closest words to "KRYPTON" among the 'c'..'d' slice of cmudict."""

     def three_way(x, y):
         # 0 when equal, 1 when x is greater, -1 otherwise.
         if x == y:
             return 0
         return 1 if x > y else -1

     words = cmudict.words()
     c_start = binary_search(words, 'c', three_way)
     c_end = binary_search(words, 'd', three_way)
     candidates = words[c_start:c_end]
     assert self.calculator.closest_words("KRYPTON", candidates) == ['crippen', 'crypto']

コード例 #15

0

ファイルを表示

ファイル: poetry.py プロジェクト: Jeuro/itcc

import pickle
import sys
from collections import Counter, defaultdict
from curses.ascii import isdigit
from itertools import islice

from nltk.corpus import cmudict
from nltk.tokenize import RegexpTokenizer


# Uppercase vowel letters and uppercase consonant letters of the alphabet.
VOWELS = "AEIOU"
CONSONANTS = "BCDFGHJKLMNPQRSTVWXYZ"

# The first command-line argument is converted to an int; the text to process
# is read in full from standard input.
order = int(sys.argv[1])
content_text = sys.stdin.read()
# Raw string: '\w' in a plain string literal is an invalid escape sequence
# (the pattern value itself is unchanged).
tokenizer = RegexpTokenizer(r'\w+')
# CMU Pronouncing Dictionary data, loaded once when the script starts:
# ``phonedict`` from cmudict.dict(), ``wordsdict`` from cmudict.words().
phonedict = cmudict.dict()
wordsdict = cmudict.words()


def window_generator(seq, n=2):
    """Yield successive overlapping windows of ``n`` items from ``seq``.

    Each window is a tuple; consecutive windows are shifted by one item.
    If ``seq`` holds fewer than ``n`` items, nothing is yielded.
    """
    source = iter(seq)
    window = tuple(islice(source, n))
    if len(window) != n:
        # The source ran out before a full window could be filled.
        return
    yield window
    for item in source:
        # Slide by one: drop the oldest item, append the newest.
        window = window[1:] + (item,)
        yield window


def matching_phonemes(words, mode):
    # NOTE(review): only the opening lines of this function are visible here;
    # the loop body is cut off and ``mode`` is not used in the visible part.
    phonemes = []
    for word in words:

コード例 #16

0

ファイルを表示

# from lyricsFetch import lyricsFetch
from __future__ import division
import re
from nltk.corpus import cmudict
import time
import numpy as np
import os

# CMU Pronouncing Dictionary data, loaded once at import time:
# ``cmud`` from cmudict.dict(), ``cmuw`` from cmudict.words().
cmud = cmudict.dict()
cmuw = cmudict.words()
# Two-letter vowel phoneme symbols.
# NOTE(review): these look like cmudict vowel phonemes without their trailing
# stress digit -- confirm against how the list is used.
vowels = [
    'AA', 'AH', 'AW', 'EH', 'EY', 'IH', 'OW', 'UH', 'AE', 'AO', 'AY', 'ER',
    'IY', 'OY', 'UW'
]
# Both start empty.
# NOTE(review): presumably filled as caches elsewhere in the file -- confirm.
fixed = {}
rhyme2words = {}


def get_lyric_ngrams(artistCount, songCount, category='rock', ngram=3):
    # NOTE(review): the rest of this function is not visible here; ``ngram``
    # is not used in the visible part.
    # NOTE(review): ``category`` is ignored -- 'rock' is hard-coded in the
    # call below. Also, the ``lyricsFetch`` import at the top of the file is
    # commented out, so this name must come from somewhere else -- confirm.
    lf = lyricsFetch('rock', artistCount, songCount)

    lyrics = []
    # NOTE(review): this collects artistCount - 1 lyric sets, not artistCount
    # -- confirm that is intended.
    for i in range(artistCount - 1):
        # Element [2] of each fetched set is kept as the lyric text.
        lyrics.append(lf.getNextLyricSet()[2])

    allLines = []
    for l in lyrics:
        # Collapse double newlines, then split into individual lines.
        lines_str = re.sub('\n\n', '\n', l)
        lines = lines_str.split('\n')
        # Insert a space before ',', '!', '?' and '...' so they stand apart
        # from the preceding word.
        lines = [re.sub(r'([,!?]|\.{3})', r' \1', x) for x in lines]
        for li in lines:

コード例 #17

0

ファイルを表示

ファイル: 02-12.py プロジェクト: shuxinzhang/nltk-learning

# -*- coding: utf-8 -*-
import matplotlib
matplotlib.use('TkAgg')
import nltk
'''
◑ The CMU Pronouncing Dictionary contains multiple pronunciations
for certain words.  How many distinct words does it contain?  What fraction
of words in this dictionary have more than one possible pronunciation?
'''

from nltk.corpus import cmudict
# cmudict.words() has one item per pronunciation entry, so a word with several
# pronunciations appears several times in it.
prondict = cmudict.words()
distinct_words = set(prondict)
print(len(prondict))  # total number of entries
print(len(distinct_words))  # number of distinct words

# Collect the words that occur more than once in the entry list.
seen = set()
multi_pron = set()
for word in prondict:
    if word in seen:
        multi_pron.add(word)
    else:
        seen.add(word)

# Fraction of distinct words with more than one pronunciation. The previous
# formula, (entries - distinct) / entries, measured the share of repeated
# entries instead, which is a different quantity.
print(len(multi_pron) * 1.0 / len(distinct_words))

コード例 #18

0

ファイルを表示

ファイル: l16_corpus-cmudict.py プロジェクト: coder352/shellscript

#!/usr/bin/python3
# coding: utf-8
from nltk.corpus import cmudict  # holds English pronunciation rules
##################################################################
## Quick look
print(cmudict.fileids())  # ['cmudict']
print(type(cmudict))  # <class 'nltk.corpus.reader.cmudict.CMUDictCorpusReader'>
print(len(cmudict.words()))  # 133737 English words
print(cmudict.words()[:5])  # ['a', 'a.', 'a', 'a42128', 'aaa']
##################################################################
## entries()
# Use the imported ``cmudict`` directly: the bare name ``nltk`` is never
# imported in this script, so ``nltk.corpus.cmudict`` raised NameError.
entries = cmudict.entries()
print(len(entries))  # 133737
print(entries[:5])  # [('a', ['AH0']), ('a.', ['EY1']), ('a', ['EY1']), ('a42128', ['EY1', 'F', 'AO1', 'R', 'T', 'UW1', 'W', 'AH1', 'N', 'T', 'UW1', 'EY1', 'T']), ('aaa', ['T', 'R', 'IH2', 'P', 'AH0', 'L', 'EY1'])]
for entry in entries[42371:42374]:
    print(entry)
# ('fir', ['F', 'ER1'])
# ('fire', ['F', 'AY1', 'ER0'])
# ('fire', ['F', 'AY1', 'R'])
##################################################################
## dict()
prondict = cmudict.dict()
print(prondict['fire'])  # [['F', 'AY1', 'ER0'], ['F', 'AY1', 'R']]
# 'blog' is not in the dictionary, so a plain lookup raises KeyError. The
# error is caught and shown so that the rest of the script still runs.
try:
    print(prondict['blog'])
except KeyError as err:
    print('KeyError:', err)  # KeyError: 'blog'
prondict['blog'] = [['B', 'L', 'AA1', 'G']]  # add the entry ourselves
print(prondict['blog'])  # [['B', 'L', 'AA1', 'G']]

コード例 #19

0

ファイルを表示

ファイル: rhyme-working-2-2-syl.py プロジェクト: Floor-VDB/Rhyme

def getrhymepart(word):
    """Return the rhyming tail of the first pronunciation of ``word``.

    The first pronunciation in ``arpabet[word]`` is scanned from its last
    phoneme backwards for a phoneme equal to an entry of ``vowels``. The
    result is a new list starting at that phoneme and running to the end,
    with the last character of its first element removed.
    Example from the original author: cakedays => EY1 + S.

    Returns ``None`` implicitly when no phoneme matches an entry of
    ``vowels``.
    """
    pron = arpabet[word][0]
    for pos in range(len(pron) - 1, -1, -1):
        if any(str(pron[pos]) == str(v) for v in vowels):
            # Slicing copies, so the dictionary entry itself is not modified.
            tail = pron[pos:]
            tail[0] = tail[0][:-1]
            return tail


dividedscheme = [[2, 1], [2]]

# For every word for which nsyl() returns [2], append to "2-2-syl.txt" each
# pair (word1, word2) whose rhyme parts are equal, where word2 also has
# nsyl() == [2].
# NOTE(review): this is an excerpt -- ``text_file``, ``rhymecount`` and
# ``nsyl`` must already be defined before this loop, because the first
# iteration prints ``rhymecount`` and closes ``text_file`` before reopening it.
# NOTE(review): the file opened in the last iteration is not closed in the
# visible code, and cmudict.words() is requested again for every outer word.
for word1 in cmudict.words():
    if nsyl(word1) == [2]:
        # Report the pair count of the previous word, then start a new one.
        print(rhymecount)
        print(word1)
        text_file.close()
        rhymecount = 0
        text_file = open("2-2-syl.txt", "a")
        rhymepart1 = getrhymepart(word1)
        for word2 in cmudict.words():
            if nsyl(word2) == [2]:
                rhymepart2 = getrhymepart(word2)
                if rhymepart1 == rhymepart2:
                    # Each pair is written as two lines followed by a blank line.
                    n = text_file.write(word1 + "\n" + word2 + "\n" + "\n")
                    rhymecount = rhymecount + 1

コード例 #20

0

ファイルを表示

ファイル: phonetic_distance.py プロジェクト: swigder/phonetic_distance

def closest_word_for_letter(target, first_letter, similar_phone_substitution_cost):
    """Return ``closest_word`` for ``target`` among one letter's cmudict words.

    ``first_letter`` is lower-cased. ``binary_search`` is then used on the
    cmudict word list to find the positions of that letter and of the
    character with the next code point; the slice between the two positions
    is the candidate list passed to ``closest_word``.
    """

    def three_way(x, y):
        # 0 when equal, 1 when x is greater, -1 otherwise.
        if x == y:
            return 0
        return 1 if x > y else -1

    first_letter = first_letter.lower()
    words = cmudict.words()
    lower = binary_search(words, first_letter, three_way)
    following_letter = chr(ord(first_letter) + 1)
    upper = binary_search(words, following_letter, three_way)
    candidates = words[lower:upper]
    return closest_word(target, candidates, similar_phone_substitution_cost)

コード例 #21

0

ファイルを表示

#!/usr/bin/python3
# coding: utf-8
from nltk.corpus import cmudict  # holds English pronunciation rules
##################################################################
## Quick look
print(cmudict.fileids())  # ['cmudict']
print(
    type(cmudict))  # <class 'nltk.corpus.reader.cmudict.CMUDictCorpusReader'>
print(len(cmudict.words()))  # 133737 English words
print(cmudict.words()[:5])  # ['a', 'a.', 'a', 'a42128', 'aaa']
##################################################################
## entries()
# Use the imported ``cmudict`` directly: the bare name ``nltk`` is never
# imported in this script, so ``nltk.corpus.cmudict`` raised NameError.
entries = cmudict.entries()
print(len(entries))  # 133737
print(
    entries[:5]
)  # [('a', ['AH0']), ('a.', ['EY1']), ('a', ['EY1']), ('a42128', ['EY1', 'F', 'AO1', 'R', 'T', 'UW1', 'W', 'AH1', 'N', 'T', 'UW1', 'EY1', 'T']), ('aaa', ['T', 'R', 'IH2', 'P', 'AH0', 'L', 'EY1'])]
for entry in entries[42371:42374]:
    print(entry)
# ('fir', ['F', 'ER1'])
# ('fire', ['F', 'AY1', 'ER0'])
# ('fire', ['F', 'AY1', 'R'])
##################################################################
## dict()
prondict = cmudict.dict()
print(prondict['fire'])  # [['F', 'AY1', 'ER0'], ['F', 'AY1', 'R']]
# 'blog' is not in the dictionary, so a plain lookup raises KeyError. The
# error is caught and shown so that the rest of the script still runs.
try:
    print(prondict['blog'])
except KeyError as err:
    print('KeyError:', err)  # KeyError: 'blog'
prondict['blog'] = [['B', 'L', 'AA1', 'G']]  # add the entry ourselves

コード例 #22

0

ファイルを表示

ファイル: corpus.py プロジェクト: ginesiametlle/hpylm

 def _build_vocabulary(self):
     """Register every cmudict word that the vocabulary does not yet contain.

     A new word receives the next free id (the current length of
     ``id2word``) in ``word2id`` and is appended to ``id2word``.
     """
     for token in cmudict.words():
         if self.contains(token):
             # Already registered (cmudict lists a word once per entry).
             continue
         self.word2id[token] = len(self.id2word)
         self.id2word.append(token)