def exercise6():
    """Demonstrate exercise 6 (parts b, c and f) on a two-paragraph sample text.

    For each part this prints the part label, the pattern together with a
    short description, and then calls ``nltk.re_show`` so that the sample
    text is displayed with every match marked.
    """
    # NOTE(review): the two paragraphs are joined with '\r' (carriage return)
    # rather than '\n'; kept exactly as written -- confirm this is intended.
    regexText = 'One day, his horse ran away. The neighbors came to express their concern: "Oh, that\'s too bad. How are you going to work the fields now?" The farmer replied: "Good thing, Bad thing, Who knows?"'
    regexText = regexText + '\rIn a few days, his horse came back and brought another horse with her. Now, the neighbors were glad: "Oh, how lucky! Now you can do twice as much work as before!" The farmer replied: "Good thing, Bad thing, Who knows?"'

    print("part b")
    print("[A-Z][a-z]*", "matches patterns start with an upper case, following an optional lower case.")
    nltk.re_show(r'[A-Z][a-z]*', regexText)
    print("part c")
    print("p[aeiou]{,2}t", "matches patterns start with p, end with t, and may contain 2 vowels (aeiou) in between position 0 to 2")
    nltk.re_show(r'p[aeiou]{,2}t', regexText)
    print("part f")
    # Raw string: same text as before, without relying on Python passing the
    # unknown escapes \w and \s through unchanged.
    print(r"\w+|[^\w\s]+", "matches patterns of non-space characters")
    # re_show() prints the marked-up text itself and returns None, so wrapping
    # it in print() only appended a stray "None" line.
    nltk.re_show(r'\w+|[^\w\s]+', regexText)
Exemple #2
0
 def re_show(self, regexp, string, left='{', right='}'):
     """Mark the non-overlapping matches of ``regexp`` found in ``string``.

     Thin wrapper: forwards its arguments to ``nltk.re_show`` and returns
     whatever that call returns.

     Example:
         nltk.re_show('[a-zA-Z]+', '12fFdsDFDS3rtG4')  # 12{fFdsDFDS}3{rtG}4
     """
     return nltk.re_show(regexp, string, left=left, right=right)
Exemple #3
0
def exercise6():
    """Show the matches of the exercise 6 patterns (parts b, c, f) in SimpleText.

    For every part, prints the part label, calls ``nltk.re_show`` with '{'
    and '}' as match markers, prints the call's return value when it is
    truthy, and finishes with an empty line.
    """
    # Raw strings keep the regex backslashes (\w, \s) literal instead of
    # relying on Python passing unknown escape sequences through unchanged.
    patterns = {'part b': r'[A-Z][a-z]*',
                'part c': r'p[aeiou]{,2}t',
                'part f': r'\w+|[^\w\s]+'}
    for part, pattern in patterns.items():
        print(part+' :')
        result = nltk.re_show(pattern, SimpleText, '{', '}')
        # Only echo the return value if re_show actually handed one back.
        if result:
            print(result)
        print()
Exemple #4
0
def exercise6():
    """Print each exercise 6 part label (b, c, f), then show that part's matches in SimpleText."""
    print("part b")
    nltk.re_show(r'[A-Z][a-z]*', SimpleText)
    print("part c")
    nltk.re_show(r'p[aeiou]{,2}t', SimpleText)
    print("part f")
    # Raw string so that \w and \s are not treated as (invalid) string escapes.
    nltk.re_show(r'\w+|[^\w\s]+', SimpleText)
def question4():
    """Mark every single determiner (a, an, the) in SimpleText.

    Task: write a regular expression matching a single determiner, assuming
    that a, an and the are the only determiners, and show its entire output
    on SimpleText (a string provided in the template document) using
    nltk.re_show().

    The pattern accepts either letter case for each character and is wrapped
    in word boundaries; matches are displayed between '[ ' and ' ]'.

    Returns whatever ``nltk.re_show`` returns.
    """
    determiner_pattern = r'\b([Aa][Nn]?|[Tt][Hh][Ee])\b'
    return nltk.re_show(determiner_pattern, SimpleText, left='[ ', right=' ]')
# Step for slice
# 'exponentiation'[2:11:2]
# 'exponentiation'[12:2:-2]
# 'exponentiation'[12::-2]
# 'exponentiation'[:5:-2]
# 'exponentiation'[::2]
# 'exponentiation'[4::]

# Ex5
# 'monty'[::-1] gives you 'ytnom'
# because the start and end indices are left at their defaults
# while the step is -1, so the string is walked backwards

# Ex6
# To test a pattern, show its matches like this.
# (The original pattern r'[[a-zA-Z]+]+' had stray brackets: it required a
# literal ']' after the letters, so nothing in 'october2009' was marked.)
nltk.re_show(r'[a-zA-Z]+', 'october2009')
# [a-zA-Z]+ = one or more letters
nltk.re.findall(r'[a-zA-Z]+', 'October2009')
# returns ['October']
# [A-Z][a-z]* = one uppercase letter followed by zero or more lowercase
# letters (the * is the Kleene closure)
# p[aeiou]{,2}t = p, then up to 2 vowels, then t
nltk.re.findall(r'p[aeiou]{,2}t', 'paella, pat, pout')
# returns ['pat', 'pout']
# \d+(\.\d+)? = an integer, optionally followed by a decimal point and more digits
nltk.re.findall(r'\d+(\.\d+)?', '0.99, 2345, 234.5')
# This gives us ['.99', '', '.5']
# Remember that parentheses, besides defining operator scope,
# have a second function: they select the substring to be extracted,
# so findall returns only the optional fractional part here
# ([^aeiou][aeiou][^aeiou])* = zero or more repetitions of: any character other than a vowel,
# then a vowel, then any character other than a vowel
Exemple #7
0
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Jul  6 14:58:22 2018

@author: vpapg
"""

# Write regular expressions to match the following classes of strings:
#
#       a. A single determiner (assume that a, an, and the are the only determiners).
#       b. An arithmetic expression using integers, addition, and multiplication, such as 2*3+8.

import nltk

s = "A heart that's full up like a landfill \n A job that slowly kills you \n Bruises that won't heal \n You look so tired, unhappy \n Bring down the government \n They don't, they don't speak for us \n I'll take a quiet life \n A handshake of carbon monoxide"

# Part a: show every whitespace-separated token, marking it when the whole
# token is a determiner.  The set [aA] replaces the original [a|A], where '|'
# was a literal member of the set and therefore also let "|" and "|n" match.
s_list = s.split()
for element in s_list:
    nltk.re_show(r'^([aA]n?|the)$', element)

print()
# Part b: an integer followed by any number of "<operator><integer>" pairs.
# The original pattern put everything inside one character class, so it
# accepted any run made of digits and the characters + - * / ( ) | ? rather
# than a well-formed expression.
s = "107+5*4/2-9"
nltk.re_show(r'\d+(?:[-+*/]\d+)*', s)
Exemple #8
0
def q_seven():
    """Mark determiners (a, an, the) and the operators + - * / in a fixed sample sentence.

    Returns whatever ``nltk.re_show`` returns.
    """
    pattern = r'(\ba\b|\ban\b|\bthe\b|\+|-|\*|/)'
    sample = '"2 + 1" says the deer. Now this does not make a single sense.'
    return nltk.re_show(pattern, sample)
re_show(regexp, string, left='{', right='}')
    Return a string with markers surrounding the matched substrings.
    Search str for substrings matching ``regexp`` and wrap the matches
    with braces.  This is convenient for learning about regular expressions.
    
    :param regexp: The regular expression.
    :type regexp: str
    :param string: The string being matched.
    :type string: str
    :param left: The left delimiter (printed before the matched substring)
    :type left: str
    :param right: The right delimiter (printed after the matched substring)
    :type right: str
    :rtype: str

>>> nltk.re_show("(oats|eat)", sent)
Mares {eat} {oats} and does {eat} {oats}, and little lambs {eat} ivy; a kid'll {eat} ivy too, wouldn't you?
>>> re.findall("(eat)", sent)
['eat', 'eat', 'eat', 'eat']
>>> re.findall("(eat|oats)", sent)
['eat', 'oats', 'eat', 'oats', 'eat', 'eat']
>>> from nltk.tokenize import WordPunctTokenizer
>>> from nltk.stem import PorterStemmer
>>> t = WordPunctTokenizer()
>>> s = PorterStemmer()
>>> t.tokenize(sent)
['Mares', 'eat', 'oats', 'and', 'does', 'eat', 'oats', ',', 'and', 'little', 'lambs', 'eat', 'ivy', ';', 'a', 'kid', "'", 'll', 'eat', 'ivy', 'too', ',', 'wouldn', "'", 't', 'you', '?']
>>> s.stem(sent)
"Mares eat oats and does eat oats, and little lambs eat ivy; a kid'll eat ivy too, wouldn't you?"
>>> [s.stem(x) for x in t.tokenize(sent)]
['Mare', 'eat', 'oat', 'and', 'doe', 'eat', 'oat', ',', 'and', 'littl', 'lamb', 'eat', 'ivi', ';', 'a', 'kid', "'", 'll', 'eat', 'ivi', 'too', ',', 'wouldn', "'", 't', 'you', '?']
Exemple #10
0
"""

[a-zA-Z]+
    # any amount of letters, caps or not
[A-Z][a-z]*
    #
p[aeiou]{,2}t
    # p followed by up to two vowels and then a t, e.g. pt, pot, pout, poot -- but not pooot
\d+(\.\d+)?
    # IDs a decimal, like 12.222 (multiple groups/matches: 12, .222, and 12.222)
([^aeiou][aeiou][^aeiou])*
    # finds any sub-match of 3 chars thats: not a vowel(consonant or digit or punc), a vowel, then not a vowel again. Doesn't allow double-use of chars, like in !ababababab => [!ab]a[bab]a[bab]a...
\w+|[^\w\s]+
    # \w => equivalent to [a-zA-Z0-9_]
    # matches a run of word characters, or else a run of characters that are neither word characters nor whitespace (e.g. punctuation)


"""

from nltk import re_show  # re_show(pattern, string)

p1 = "[a-zA-Z]+"  # any alphabetical chunk, any length
s1 = "abcdefg123"
# re_show() prints the marked-up string itself and returns None, so it is
# called directly; print(re_show(...)) only added a stray "None" line.
re_show(p1, s1)

p2 = "[A-Z][a-z]*"
s2 = "Cassady"
# ...
#
print 'nationality'[:-5]
print 'undo'[:-2]
print 'preheat'[3:]
'''
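6. Describe the class of strings matched by the following regular expressions:
a. [a-zA-Z]+ matches a string of one or more upper- or lower-case letters
b. [A-Z][a-z]* matches one uppercase letter followed by zero or more lowercase letters
c. p[aeiou]{,2}t matches p, then at most two vowels, then t
d. \d+(\.\d+)? matches one or more digits, optionally followed by a decimal point and one or more digits
e. ([^aeiou][aeiou][^aeiou])* matches zero or more repetitions of
                                a non-vowel, a vowel, and a non-vowel
f. \w+|[^\w\s]+ matches a run of word characters, or a run of one or more characters that are neither word characters nor whitespace
Test your answers using nltk.re_show()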
'''
import nltk, re
nltk.re_show(r"[a-zA-Z]+", 'fd3ffdadZdd')
nltk.re_show(r"[A-Z][a-z]*", 'fd3ffdadZdd ab')
nltk.re_show(r"p[aeiou]{,2}t", 'fd3paatdZdd ab')
nltk.re_show(r"\d+(\.\d+)?", 'abd12.32')
nltk.re_show(r"[^aeiou][aeiou][^aeiou]", 'aefadaea be')
nltk.re_show(r"\w+|[^\w\s]+", ', aefadaea be')
'''
17. 格式化字符串%6s 与%-6s 用来显示长度大于6个字符的字符串时,
会发生什么
'''
fdist = nltk.FreqDist(
    ['dogrrrrr', 'cat', 'dog', 'cat', 'dog', 'snake1', 'dog', 'cat'])
for word in fdist:
    print '%6s' % word  #右对齐,超出部分向右延伸

for word in fdist:
Exemple #12
0
    stem, suffix = re.findall(regexp, word)[0]
    return stem

raw = """DENNIS: Listen, strange woman lying in ponds distributing swords
is no basis for a system of government. Supreme executive power derives from
a mandate from the masses, not from some farcical aquatic ceremony."""
tokens = word_tokenize(raw)
print([stem(t) for t in tokens])
print("-" * 40)

print("Searching Tokenized Text")
from nltk.corpus import gutenberg, nps_chat
moby = nltk.Text(gutenberg.words('melville-moby_dick.txt'))
moby.findall(r'<a>(<.*>)<man>')
chat = nltk.Text(nps_chat.words())
chat.findall(r'<.*> <.*> <bro>')
chat.findall(r'<l.*>{3,}')
print("-" * 40)

nltk.re_show('kaa', ' '.join(rotokas_words))
nltk.app.nemo()
print("-" * 40)

from nltk.corpus import brown
hobbies_learned = nltk.Text(brown.words(categories=['hobbies', 'learned']))
hobbies_learned.findall(r'<\w*> <and> <other> <\w*s>')
print("-" * 40)

hobbies_learned.findall(r'<as> <\w*> <as> <\w*>')
print("-" * 40)
Exemple #13
0
monty = 'Monty Python'
if monty[::-1]=='nohtyP ytnoM':
    print('This statement is definitely true.')

# 6 Describe the class of strings matched by the following regular expressions.
# [a-zA-Z]+
# [A-Z][a-z]*
# p[aeiou]{,2}t
# \d+(\.\d+)?
# ([^aeiou][aeiou][^aeiou])*
# \w+|[^\w\s]+
# Test your answers using nltk.re_show().

# Each pair is (pattern, sample sentence).  The patterns are raw strings so
# that \d, \w and \s reach the regex engine without relying on Python passing
# unknown escape sequences through unchanged.
exercise6_cases = [
    (r'[a-zA-Z]+',
     'Matches all groups of characters, but not the number 3.'),
    (r'[A-Z][a-z]*',
     'Matches Matches, Aitor, Joaquin, or Abba, but not chainsaw, or nyu.'),
    (r'p[aeiou]{,2}t',
     'Matches pt., Sengupta., put, paet, piit, or pot.'),
    (r'\d+(\.\d+)?',
     'Matches numbers with decimals or not, like 0.1, 1000, or 1000.1.'),
    (r'([^aeiou][aeiou][^aeiou])*',
     'Matches ?a_-ex+ih, tiptaptok, or (iptakto) (and some other unexpected shit).'),
    (r'\w+|[^\w\s]+',
     'Matches groups of characters and of non-(characters or spaces).'),
]
# Print the pattern, then let re_show display its matches in the sample.
for pattern, sample in exercise6_cases:
    print(pattern)
    nltk.re_show(pattern, sample)

# 7 Write regular expressions to match the following classes of strings:
#   A single determiner (assume that a, an, and the are the only determiners).
#   An arithmetic expression using integers, addition, and multiplication, such as 2*3+8.
Exemple #14
0
def exercise7():
    """Part a: mark the determiners a/A, an/An and the/The in SimpleText.

    Calls ``nltk.re_show`` with '{' and '}' as markers and prints the call's
    return value when it is truthy.
    """
    determiner_pattern = "\\b[Aa][n]?\\b|\\b[Tt][h][e]\\b"
    marked = nltk.re_show(determiner_pattern, SimpleText, '{', '}')
    if not marked:
        return
    print(marked)
Exemple #15
0
import nltk
nltk.download()

from nltk.corpus import PlaintextCorpusReader
from nltk import FreqDist

FreqDist()
import nltk
from nltk.corpus import inaugural
cfd = nltk.ConditionalFreqDist(
    (target, fileid[:4])
    for fileid in inaugural.fileids()
    for w in inaugural.words(fileid)
    for target in ['america', 'citizen']
    if w.lower().startswith(target))

data = cfd['america']

nltk.re_show()
#        )

        
print '''
Your Turn (1).

nltk.app.nemo() opens a GUI for experimenting with regexps.

But first, try out nltk.re_show(regexp, string) - shows matches for 'regexp' in 'string'
'''

sentence = "A man, a plan, a canal, panama."
nltk.re_show(
                r"[Aa] "    
                r"([\w]+)"                      # substring captures are IGNORED by re_show()
                r","
                ,
                sentence
                )                               # {A man,} {a plan,} {a canal,} panama.
    
print ""
print "To try 'Finding (and Replacing) Nemo, run `nltk.app.nemo()`"
#nltk.app.nemo()


print '''
Your Turn (2)

Looking for "as x as y" - analogous to the "x and other ys" example
'''
Exemple #17
0
re.split(r'-', 'fun-ny')[0]

#3
#Yes, if we [:-3] and the word only contains 2 characters

#4.
monty = 'There is a monty python.'
monty[1:11:2]
monty[11:1:-2]

#5.
monty[::-1]

#6.
monty = 'There is a 100% monty pi Python.'
nltk.re_show(r'[a-zA-Z]+', monty)
nltk.re_show(r'[A-Z][a-z]*', monty)  #select a capitalized word
nltk.re_show(r'p[aeiou]{,2}t', monty)
nltk.re_show(r'([^aeiou][aeiou][^aeiou])*', monty)
nltk.re_show(r'\w+|[^\w\s]+', monty)

#7.
x = r'^(the|a|an)$'
nltk.re_show(x, 'the apple is big')
nltk.re_show(x, 'the')

#8.

#9.

#10.
Exemple #18
0
# Ex 1
import nltk
import re

text = "The quick brown fox jumps over the lazy dog 123456789 ,./[](){}?#¬`$%^&*"
dec = "33.333"

# nltk.re_show() prints the marked-up text itself and returns None, so each
# pattern is shown with a direct call; storing the return value and printing
# it only produced a "None" line after each result.
nltk.re_show(r'[a-zA-Z]+', text)

nltk.re_show(r'[A-Z][a-z]*', text)

nltk.re_show(r'p[aeiou]{,2}t', text)

nltk.re_show(r'\d+(\.\d+)?', text)
nltk.re_show(r'\d+(\.\d+)?', dec)

nltk.re_show(r'([^aeiou][aeiou][^aeiou])*', text)

# Typo fix: the original pattern began with '/w+' (a slash followed by one or
# more literal "w" characters) instead of the word-character class '\w+'.
nltk.re_show(r'\w+|[^\w\s]+', text)
'''
- Expression 1: [a-zA-Z]+
Matches one or more upper case or lower case alphabets and disregards numerical characters

- Expression 2: [A-Z][a-z]*
Matches to a string beginning with an upper case ASCII character followed by lower case characters
Exemple #19
0
def exercise7():
    """Part a: print the label, then mark the whole words a, an and the (lowercase) in SimpleText."""
    determiner_pattern = r'\b(a|an|the)\b'
    print("part a")
    nltk.re_show(determiner_pattern, SimpleText)
def exercise7():
    """Exercise 7 (write regular expressions for given string classes), part a.

    Prints the part label with the task description, then lets
    ``nltk.re_show`` display SimpleText with each lowercase whole word
    a / an / the marked.
    """
    label = "part a"
    task = "A single determiner (assume that a, an, and the are the only determiners)"
    print(label, task)
    nltk.re_show(r'\b(a|an|the)\b', SimpleText)
Exemple #21
0
    return stem


raw = """DENNIS: Listen,
strange women lying in ponds distributing swords
is no basis for a system of government.
Supreme executive power derives from a mandate from the masses, 
not from some farcical aquatic ceremony."""

tokens = word_tokenize(raw)
print(tokens)
print([stem(t) for t in tokens])

# re_show is a display helper: it marks the parts of the text that match the regular expression
regexp = r'[ing|ly|ed|ious|ies|ive|es|s|ment]$'
nltk.re_show(regexp, raw)
regexp = r'(ing)$'
nltk.re_show(regexp, raw)
regexp = r'[ing]$'
nltk.re_show(regexp, raw)
regexp = r'ing$'
# Author's note: the regex conventions used with re.findall() cannot be used
# here; the basic regex conventions are needed.
# NOTE(review): it is not clear from this snippet which difference is meant -- confirm.
nltk.re_show(regexp, raw)
# p.109, Table 3-3: basic regular-expression metacharacters; p.120, Table 3-4: regular-expression symbols
# See also "Speech and Language Processing" (2nd edition), p.18 -- title translated from the Chinese edition, TODO confirm
nltk.re_show('^[D|s|i|S|n]', raw)  # '^' marks the start of a line
nltk.re_show('^[DsiSn]', raw)  # inside '[]' the members are alternatives whether or not '|' is written
nltk.re_show('[s|.|,]$', raw)  # '$' marks the end of a line
nltk.re_show('ing|tive', raw)  # '|' is a disjunction of the given strings
nltk.re_show('(ing|tive)', raw)  # '()' delimits the scope of an operator
nltk.re_show('(s){1,2}', raw)  # '{}' gives the number of repetitions
def exercise25():
    """Display the matches of the exercise 25 pattern in SimpleText."""
    # Exercise: are you able to write a regular expression that tokenizes text
    # so that the word don't becomes do and n't?  Explain why this regular
    # expression won't work: n't|\w+.
    contraction_pattern = r"n't|\w+"
    nltk.re_show(contraction_pattern, SimpleText)
Exemple #23
0
# describe which string classes correspond to the following regular expression. [a-zA-Z]+. Check results using nltk.re_show ()

# string classes: A character in the range of "a" to "z" or "A" to "Z" with one repetition or more
import nltk, re
words = [
    'hello', 'John', 'random', 'INSANITY', 'guess', 'name', 'true', 'please',
    'underestimated', '12', 'test!'
]
string = 'hello John random INSANITY guess name true please underestimated 12 test!'
reg = '[a-zA-Z]+'
# Words that contain at least one match of the pattern.
print([w for w in words if re.search(reg, w)])
# re_show() prints the marked-up sentence itself and returns None, so it is
# called directly; print(nltk.re_show(...)) only added a "None" line.
nltk.re_show(reg, string)

# describe which string classes correspond to the following regular expression. [A-Z][a-z] *. Check results using nltk.re_show ()

# string classes: A character in the range of “A” to “Z” and a character in the range of “a” to “z” without or more repetitions
import nltk, re
words = [
    'hello', 'John', 'random', 'INSANITY', 'guess', 'name', 'true', 'please',
    'underestimated', '12', 'test!'
]
string = 'hello John random INSANITY guess name true please underestimated 12 test!'
reg = '^[A-Z][a-z]*$'
# Words that consist entirely of one capital followed by lowercase letters.
print([w for w in words if re.search(reg, w)])
# re_show() prints the marked-up sentence itself and returns None, so it is
# called directly; print(nltk.re_show(...)) only added a "None" line.
# Because the pattern is anchored with ^ and $, it is tested against the
# sentence as a whole here, which starts with lowercase 'hello', so nothing
# is marked.
nltk.re_show(reg, string)

# describe which string classes correspond to the following regular expression. \d+(\.\d+). Check results using nltk.re_show ()

# string classes: any number with one repetition or more and an optional group containing the character "." And any number with one repetition or more
import nltk, re
words = [
Exemple #24
0
print nltk.tokenwrap(compress(w) for w in english_uhdr[:75])


cv_word_pairs=[(cv,w) for w in nltk.corpus.toolbox.words('rotokas.dic')
                      for cv in re.findall(r'[ptksvr][aeiou]', w)]
cv_index=nltk.Index(cv_word_pairs)
print cv_index['su']


print re.findall(r'^(.*?)(ing|ly|ed|ious|ies|ive|es|s|ment)$', 'processes')


from nltk.corpus import gutenberg
moby=nltk.Text (gutenberg.words('melville-moby_dick.txt'))
print moby.findall(r"<a>(<.*>)<man>")

chat = nltk.Text (nltk.corpus.nps_chat.words())
print chat.findall(r"<l.*>{3,}")

nltk.re_show(r'a', 'banana')

nltk.app.nemo()

from nltk.corpus import brown
hobbies_learned = nltk.Text(brown.words(categories=['hobbies', 'learned']))
print hobbies_learned.findall(r"<\w*> <and> <other> <\w*s>")
print hobbies_learned.findall(r"<as> <\w*> <as> <\w*s>")



from __future__ import division
len(text3) / len(set(text3))

text3.count("smote")

100 * text4.count('a') / len(text4)

# Question-2 - Regular Expressions
import nltk
print("\nRegular Expressions\n")

print(
    "\n[a-zA-Z]+, It will match one or more uppercase and lowercase ascii letters\n"
)
t1 = "This book contains information obtained from authentic and highly regarded sources."
nltk.re_show('[a-zA-Z]+', t1)

print(
    "\n[A-Z][a-z]*, It will match zero or more ascii letters, due to use of '*'\n"
)
t2 = "This book contains information obtained from authentic and highly regarded sources."
nltk.re_show('[A-Z][a-z]*', t2)

print("\np[aeiou]{,2}t, It will match 'p' followed by 2 vowels and a 't'\n")
t3 = "You are the real phantom of arabia, living in tunisia"
nltk.re_show('p[aeiou]{,2}t', t3)

print(
    "\n\d+(\.\d+)?, It will match currency and percentages, e.g. $12.40, 82%, after a dot as well\n"
)
t4 = "That U.S.A. poster-print costs $12.40..."
Exemple #26
0
@license: Apache Licence 
@contact: [email protected]
@site: 
@software: PyCharm
@file: D8.py
@time: 2017/12/28 0028 上午 11:59
"""
# 搜索已分词文本
import nltk
from nltk.book import gutenberg, nps_chat

moby = nltk.Text(gutenberg.words('melville-moby_dick.txt'))
moby.findall(r'<a><.*><man>')

chat = nltk.Text(nps_chat.words())
chat.findall(r'<.*><.*><bro>')
chat.findall(r'<l.*>{3,}')

nltk.app.nemo()
nltk.re_show(r'\d', 'we are good friend, and you? how old are you? 25')
import re
re.search(r'\d', 'we are good friend, and you? how old are you? 25')

from nltk.corpus import brown
# `categories` takes a flat list of category names; the original nested
# ['learned'] inside the list instead of passing the plain string 'learned'.
hobbies_learend = nltk.Text(brown.words(categories=['hobbies', 'learned']))
# Look for four-token sequences of the form "<x> and other <ys>".
hobbies_learend.findall(r'<\w*><and><other><\w*s>')

# as x as y
x_y = nltk.Text(brown.words(categories=['fiction', 'romance']))
x_y.findall(r'<as><\w*><as><\w*>')
'''
Exercise 3.23

Stumped me for a bit
'''
print __doc__

import nltk, re
pattern = r"(\w+)(n't)|(\w+)?"
sentence = "I don't think so, that wouldn't be right"
nltk.re_show(pattern, sentence)
print re.findall(pattern, sentence)


print '''
I think the problem with the original regex was that the '\w+' was greedy?
Didn't really know how to fix it like with the "*?" operator
My fix SEEMS to work...
The output format is probably wrong...
'''
Exemple #28
0
#2
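# Handy uses of regular expressions in NLP: the following common patterns can be used to extract and filter text strings:
#a. [a-zA-Z]+           a string of letters
#
#b. [A-Z][a-z]*         an initial capital followed by any number of lowercase letters
#
#c. p[aeiou]{,2}t        starts with p, ends with t, with at most 2 vowels in between
#
#d. \d+(\.\d+)?          an integer or a decimal number
#
#e. ([^aeiou][aeiou][^aeiou])*   zero or more repetitions of: a non-vowel, a vowel, a non-vowel
#
#f. \w+|[^\w\s]+          \w matches any word character including the underscore, equivalent to [A-Za-z0-9_]; the alternative matches runs of punctuation, i.e. characters that are neither whitespace nor word characters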
import nltk
nltk.re_show(r'[a-zA-Z]+','asdb123')
nltk.re_show(r'[A-Z][a-z]*','asDb123')
nltk.re_show(r'p[aeiou]{,2}t','apaetioo2')
nltk.re_show(r'\d+(\.\d+)?','adf12.34')
nltk.re_show(r'([^aeiou][aeiou][^aeiou])*','papppipe')
nltk.re_show(r'\w+|[^\w\s]+','papppipe?,...')




#3. 格式化字符串
'%6s' %'dog'
'%6s' %'sdasdasdsds'
'%-6s' %'sdasdasdsds'
'%-6s' %'dog'
Exemple #29
0
def q_twentythree():
    pattern = r"n't|(\w+)?"
    sent = "I really don't know."
    res = nltk.re_show(pattern, sent)
    print 'When used r"n\'t|(\w+)?" : ', res
    print """
# [A-Z][a-z]*
# p[aeiou]{,2}t
# \d+(\.\d+)?
# ([^aeiou][aeiou][^aeiou])*
# \w+|[^\w\s]+
# Test your answers using nltk.re_show().

# In[222]:

import nltk, re, pprint
from nltk import word_tokenize

# In[231]:

'[a-zA-Z]+ : strings containing one or more letters (capital or not)'
nltk.re_show(r'[a-zA-Z]+', 'Whole World')

# In[233]:

'[A-Z][a-z]* : one capital letter and zero or more lowercase letters'
nltk.re_show(r'[A-Z][a-z]*', 'Whole world')

# In[234]:

'p[aeiou]{,2}t : starts with p, followed by 0 up to 2 vowels (aeiou), end with t '
nltk.re_show(r'p[aeiou]{,2}t', 'This is a pet which likes to party')

# In[240]:

'\d+(\.\d+)? : an integer or decimal number'
nltk.re_show(r'\d+(\.\d+)?', 'the result is 0.24 or maybe 65')
Exemple #31
0
# Describe the class of strings matched by the following regular expressions.
#
# [a-zA-Z]+
# [A-Z][a-z]*
# p[aeiou]{,2}t
# \d+(.\d+)?
# ([^aeiou][aeiou][^aeiou])*
# \w+|[^\w\s]+
# Test your answers using nltk.re_show().

import nltk

# Raw strings keep \d, \w and \s literal for the regex engine.
# NOTE(review): the dot in \d+(.\d+)? is unescaped, as in the exercise text
# quoted in the header comment; the usual form is \. -- confirm.
reg_ex = [
    r'[a-zA-Z]+', r'[A-Z][a-z]*', r'p[aeiou]{,2}t', r'\d+(.\d+)?',
    r'([^aeiou][aeiou][^aeiou])*', r'\w+|[^\w\s]+'
]
sample = 'hello World THis is natural language tool kit. let\'s go 1 2 3..'
# Show the matches of every pattern, in order, on the same sample sentence.
for pattern in reg_ex:
    nltk.re_show(pattern, sample)
Exemple #32
0
x = fdist.most_common(5)
y = [char for (char, count) in fdist.most_common()]
n = len(y)
s = 'Monty Python'
print(s[1:5])

word_list = [w for w in nltk.corpus.words.words('en') if w.islower()]
z = [w for w in word_list if re.search('ed$', w)]

integers = [1, 2, 3, 4, 5, 6, 7, 8, 9]
n = sum(1 for n in integers if n > 5)
m = sum(integers)
p = sum(n for n in integers if n > 5)

s = '2009-12-31'
p = re.compile(r'^[0-9]{4}|[0-9]{2}|[0-9]{2}$')
f = re.findall(p, s)
i = [int(c) for c in f]

moby = nltk.Text(gutenberg.words('melville-moby_dick.txt'))
m = moby.findall(r'<a> (<.*>) <man>')
print(m)
"""

s = 'now is the time for some serious marketing'
p = re.compile(r's')
ss = re.findall(p, s)
sss = nltk.re_show(r's', s)

stop = 1
Exemple #33
0
#    a. [a-zA-Z]+
#    b. [A-Z][a-z]*
#    c. p[aeiou]{,2}t
#    d. \d+(\.\d+)?
#    e. ([^aeiou][aeiou][^aeiou])*
#    f. \w+|[^\w\s]+

#Test your answers using nltk.re_show().


import nltk

s = '1 day I\'m going to grow wings. A chemical reaction, hysterical and useless! Let down...'

# Each part prints its label and then shows the pattern's matches.  The
# patterns are raw strings so that \d, \w and \s are not read as (invalid)
# string escape sequences.
print('(a)')
nltk.re_show(r'[a-zA-Z]+',s)

print('\n(b)')
nltk.re_show(r'[A-Z][a-z]*',s)

print('\n(c)')
nltk.re_show(r'p[aeiou]{,2}t','pilot poet pit paet poetry ' + s)

print('\n(d)')
nltk.re_show(r'\d+(\.\d+)?','12.143dsa ' +s)

print('\n(e)')
# NOTE(review): uses + where the exercise statement has *; kept as written,
# presumably to avoid marking empty matches -- confirm.
nltk.re_show(r'([^aeiou][aeiou][^aeiou])+',s)

print('\n(f)')
nltk.re_show(r'\w+|[^\w\s]+',s)
Exemple #34
0
import nltk
import re

# Describe the class of strings matched by the following regular expressions.
print("Regular Expression Validator")
print(
    "**************************************************************************"
)
print("[a-zA-Z]+")
print("Alphabet at least and more than at one time")
print("Example : ")
nltk.re_show(r'[a-zA-Z]+', 'a abc aBcd ABcd ABCD a1234 12A34 aB1234')

print(
    "**************************************************************************"
)
print("[A-Z][a-z]*")
print(
    "Start with upper case after that lower case is coming but lower cases can be omitted"
)
print("Example : ")
nltk.re_show(r'[A-Z][a-z]*', 'a abc aBcd ABcd ABCD a1234 12A34 aB1234')

print(
    "**************************************************************************"
)
print("p[aeiou]{,2}t")
print(
    "start with ‘p’ and end with ‘t’ between them 0 to 2 vowels(aeiou) can be inserted"
)
print("Example : ")
#        r"<l.*>"            # all tokens starting with "l"
#        r"{3,}"             # 3 or more occurrences in a row
#        )

print '''
Your Turn (1).

nltk.app.nemo() opens a GUI for experimenting with regexps.

But first, try out nltk.re_show(regexp, string) - shows matches for 'regexp' in 'string'
'''

sentence = "A man, a plan, a canal, panama."
nltk.re_show(
    r"[Aa] "
    r"([\w]+)"  # substring captures are IGNORED by re_show()
    r",",
    sentence)  # {A man,} {a plan,} {a canal,} panama.

print ""
print "To try 'Finding (and Replacing) Nemo, run `nltk.app.nemo()`"
#nltk.app.nemo()

print '''
Your Turn (2)

Looking for "as x as y" - analogous to the "x and other ys" example
'''

from nltk.corpus import brown
hobbies_learned = nltk.Text(brown.words(categories=['hobbies', 'learned']))
Exemple #36
0
print(monty[7:len(monty):-2])

#5 What happens if you ask the interpreter to evaluate monty[::-1]? Explain why this is a reasonable result.
print(monty[::-1])
#reverses the string completely

#6 Describe the class of strings matched by the following regular expressions.
# [a-zA-Z]+ - one or more instances of a-z or A-Z
# [A-Z][a-z]* - one of A-Z followed by zero or more instances of a-z
# p[aeiou]{,2}t - p followed by zero to two vowels from [aeiou] followed by t
# \d+(\.\d+)? - one or more decimal digits, optionally followed by . and one or more decimal digits
# ([^aeiou][aeiou][^aeiou])* - zero or more repetitions of: a non-vowel character, then a vowel, then a non-vowel character
# \w+|[^\w\s]+ - either one or more word characters, or one or more characters that are neither word characters nor whitespace
# Test your answers using nltk.re_show().
# nltk.re_show() prints the marked-up text itself and returns None, so it is
# called directly; wrapping each call in print() only added a "None" line
# after every result.
nltk.re_show(r'[a-zA-Z]+',
             "This matches all the words in a string not numbers 0998")
nltk.re_show(r'[A-Z][a-z]*', "This matches Title Case stuff")
nltk.re_show(r'p[aeiou]{,2}t', "Matches pout peit not parrot post paaaat")
nltk.re_show(r'\d+(\.\d+)?', "Match 98.3232 .98 879 f9889")
nltk.re_show(r'([^aeiou][aeiou][^aeiou])*', "This will match")
nltk.re_show(r'([^aeiou][aeiou][^aeiou])+', "This will match")
nltk.re_show(r'\w+|[^\w\s]+', "98 90734 23 this matches what")

#7 Write regular expressions to match the following classes of strings:
# A single determiner (assume that a, an, and the are the only determiners).
# An arithmetic expression using integers, addition, and multiplication, such as 2*3+8.
test_string = "This is a ball. This is the tree. Lets eat an apple"
# Word boundaries mark only whole-word determiners.  The original pattern
# (a\s|an\s|the\s) also matched the tail of any word ending in "a", "an" or
# "the", included the trailing whitespace in the match, and missed a
# determiner followed by punctuation or the end of the text.
# re_show() prints its result and returns None, so it is not wrapped in print().
nltk.re_show(r'\b(a|an|the)\b', test_string)
test_exp = "2+3 2*4-6 2+8-1 6+7-8-9*5"
# An integer followed by any number of "<operator><integer>" pairs.  The
# original pattern could match the empty string, so empty matches were marked
# throughout the text.
nltk.re_show(r'\d+(?:[-+*]\d+)*', test_exp)