Code example #1
# Example from the apodeixi project; ApodeixiError and StringUtils are project-specific helpers.
from nltk.tokenize import SExprTokenizer

def _strip_parenthesis(parent_trace, txt):
    if type(txt) != str:
        raise ApodeixiError(
            parent_trace,
            "Encountered problem removing comments in parenthesis: expected a string, "
            + "but instead was given a '" + str(type(txt)) + "'",
            data={"invalid input": str(txt)})
    stripped_txt = StringUtils().strip(txt)
    # Remove text within parentheses, if any, using the natural language tool nltk.tokenize.SExprTokenizer
    sexpr = SExprTokenizer(strict=False)
    sexpr_tokens = sexpr.tokenize(stripped_txt)
    # Keep only the tokens that are not parenthesized expressions
    parenthesis_free_tokens = [
        t for t in sexpr_tokens if ')' not in t and '(' not in t
    ]
    parenthesis_free_txt = ' '.join(parenthesis_free_tokens)
    return parenthesis_free_txt
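The function above leans on project-specific helpers (ApodeixiError, StringUtils). A minimal, self-contained sketch of the same parenthesis-stripping idea using only NLTK (the function name and sample string below are illustrative, not from the original project):

from nltk.tokenize import SExprTokenizer

def strip_parenthesized_comments(txt):
    # Split into parenthesized expressions and plain whitespace-separated tokens,
    # then keep only the plain tokens.
    tokens = SExprTokenizer(strict=False).tokenize(txt.strip())
    return ' '.join(t for t in tokens if '(' not in t and ')' not in t)

print(strip_parenthesized_comments("budget 2022 (draft, do not circulate) v3"))
# prints: budget 2022 v3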
Code example #2
def runFile(file):
    """ String -> Value
        Runs the given multi-function-call Racket program. """
    # stripComments, run, and topLevelEnv are helpers defined elsewhere in the same project.
    file = '\n'.join(stripComments(file.splitlines()))
    # Split the source into top-level s-expressions and evaluate them one by one,
    # threading the environment through each call.
    rexpList = SExprTokenizer().tokenize(file)
    output = []
    env = topLevelEnv
    for rexp in rexpList:
        out, env = run(rexp, env=env, returnEnv=True)
        output.append(out)
        print(output[-1])
    return output
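The helpers stripComments, run, and topLevelEnv are defined elsewhere in the same project; the NLTK part is only the splitting of the source into top-level s-expressions. A minimal sketch of that step (the two-line Racket program is made up):

from nltk.tokenize import SExprTokenizer

program = "(define x 3)\n(+ x 4)"
# Each top-level s-expression becomes one token; whitespace between them is dropped.
print(SExprTokenizer().tokenize(program))
# ['(define x 3)', '(+ x 4)']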
Code example #3
File: racyon.py  Project: kailanichu/racyhton
def runFile(file):
    # (unused helper kept from the original source)
    def cntChar(char, str):
        return str.count(char)

    # Drop comment lines (lines whose first non-blank character is "#") before tokenizing.
    file = file.splitlines()
    file = [line for line in file if line.lstrip() and line.lstrip()[0] != "#"]
    file = '\n'.join(file)
    # Split the source into top-level s-expressions; runRexp is defined elsewhere in the project.
    rexpList = SExprTokenizer().tokenize(file)
    output = []
    for rexp in rexpList:
        output.append(runRexp(rexp))
        print(output[-1])
    return output
Code example #4
from nltk.tokenize import SExprTokenizer

def tokenize(query):
    # Append a sentinel "()" expression to the query, tokenize, and drop the sentinel again.
    a_query = query + " ()"
    a_list = SExprTokenizer().tokenize(a_query)
    return a_list[:-1]
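For illustration, here is how the wrapper above behaves on a small made-up query:

print(tokenize("(likes alice bob) (hates carol dave)"))
# ['(likes alice bob)', '(hates carol dave)']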
Code example #5
# Let us say we want to extract all words beginning with an uppercase character
from nltk.tokenize import RegexpTokenizer

tokenizer = RegexpTokenizer(r'[A-Z]\w*\S+')
print(tokenizer.tokenize(s))   # s is the sample sentence defined in an earlier cell of this notebook

# #### SExprTokenizer: tokenizes parenthesized expressions in a string

# In[11]:

from nltk.tokenize import SExprTokenizer

# In[12]:

s = '?(a(b c)d)ef(g(h(i)))'
print("Sentence: " + s)
print("\nSExprTokenizer...")
print(SExprTokenizer().tokenize(s))
print("\n")

# #### TreebankWordTokenizer is the standard tokenizer tool and does a decent job

# In[13]:

#TreebankWordTokenizer
from nltk.tokenize import TreebankWordTokenizer

# In[14]:

s = "Good muffins cost $3.80 in New York. Dr. Ram Please buy me two of them. Thanks."
print("Sentence: " + s)
print("\nTreebankWordTokenizer...")
print(TreebankWordTokenizer().tokenize(s))
Code example #6
# 8. Write a Python NLTK program that will read a given text through each line and look for sentences. Print each sentence and divide two sentences with "==============".
import nltk
from nltk.tokenize import SExprTokenizer

text = '''
Mr. Smith waited for the train. The train was late.
Mary and Samantha took the bus. I looked for Mary and
Samantha at the bus station.
'''
print("\nOriginal text:")
print(text)
sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
print('\n==============\n'.join(sent_detector.tokenize(text.strip())))

# 9. Write a Python NLTK program to find parenthesized expressions in a given string and divide the string into a sequence of substrings.
text = '(a b (c d)) e f (g)'
print("\nOriginal Tweet:")
print(text)
print(SExprTokenizer().tokenize(text))
text = '(a b) (c d) e (f g)'
print("\nOriginal Tweet:")
print(text)
print(SExprTokenizer().tokenize(text))
text = '[(a b (c d)) e f (g)]'
print("\nOriginal Tweet:")
print(text)
print(SExprTokenizer().tokenize(text))
text = '{a b {c d}} e f {g}'
print("\nOriginal text:")
print(text)
# Braces are not the default delimiters, so tell SExprTokenizer to treat them as parentheses.
print(SExprTokenizer(parens='{}').tokenize(text))