def _strip_parenthesis(parent_trace, txt):
    if not isinstance(txt, str):
        raise ApodeixiError(parent_trace,
                            "Encountered problem removing comments in parenthesis: expected a string, "
                            + "but instead was given a '" + str(type(txt)) + "'",
                            data={"invalid input": str(txt)})
    stripped_txt = StringUtils().strip(txt)
    # Remove text within parentheses, if any, using the natural language tool
    # nltk.tokenize.SExprTokenizer
    sexpr = SExprTokenizer(strict=False)
    sexpr_tokens = sexpr.tokenize(stripped_txt)
    parenthesis_free_tokens = [t for t in sexpr_tokens if ')' not in t and '(' not in t]
    parenthesis_free_txt = ' '.join(parenthesis_free_tokens)
    return parenthesis_free_txt
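# A minimal sketch of the tokenize-then-filter idea above, using only NLTK.
# ApodeixiError and StringUtils are project-specific helpers, so this
# hypothetical strip_parens_demo works on a plain string instead: with
# strict=False, every token containing a parenthesis is dropped and the
# survivors are rejoined.
from nltk.tokenize import SExprTokenizer

def strip_parens_demo(txt):
    tokens = SExprTokenizer(strict=False).tokenize(txt.strip())
    return ' '.join(t for t in tokens if '(' not in t and ')' not in t)

print(strip_parens_demo("budget line (see appendix B) for Q3"))
# -> 'budget line for Q3'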
def runFile(file):
    """ String -> Value
        Runs the given multi function call racket program.
    """
    file = '\n'.join(stripComments(file.splitlines()))
    rexpList = SExprTokenizer().tokenize(file)
    output = []
    env = topLevelEnv
    for rexp in rexpList:
        # Thread the environment through so definitions persist across expressions
        out, env = run(rexp, env=env, returnEnv=True)
        output.append(out)
        print(output[-1])
    return output
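# The stripComments helper above is not shown in this snippet. A minimal
# sketch, assuming line comments run from a marker to end of line (';' as in
# Racket -- the real helper and its comment marker may differ):
def stripComments(lines, marker=";"):
    # Drop everything from the comment marker to the end of each line
    return [line.split(marker, 1)[0] for line in lines]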
def runFile(file):
    lines = file.splitlines()
    # Drop comment lines, i.e. those whose first non-blank character is '#'
    lines = [line for line in lines if line.lstrip() and line.lstrip()[0] != "#"]
    file = '\n'.join(lines)
    rexpList = SExprTokenizer().tokenize(file)
    output = []
    for rexp in rexpList:
        output.append(runRexp(rexp))
        print(output[-1])
    return output
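# In both runners the key step is the same: SExprTokenizer splits the whole
# program text into one token per top-level expression, which can then be
# evaluated in order.
from nltk.tokenize import SExprTokenizer

program = "(define x 3) (+ x 4)"
print(SExprTokenizer().tokenize(program))
# -> ['(define x 3)', '(+ x 4)']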
def tokenize(query):
    # Append a sentinel '()' expression, tokenize, then drop the sentinel's
    # token so only the query's own tokens are returned
    a_query = query + " ()"
    a_list = SExprTokenizer().tokenize(a_query)
    return a_list[:-1]
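# What the sentinel does in practice: the appended '()' comes back as the
# final token, and slicing it off leaves exactly the query's own tokens.
from nltk.tokenize import SExprTokenizer

print(SExprTokenizer().tokenize("(foo bar) baz ()"))
# -> ['(foo bar)', 'baz', '()']
print(SExprTokenizer().tokenize("(foo bar) baz ()")[:-1])
# -> ['(foo bar)', 'baz']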
# Let us say we want to extract all words beginning with an uppercase character
from nltk.tokenize import RegexpTokenizer

tokenizer = RegexpTokenizer(r'[A-Z]\w*\S+')
print(tokenizer.tokenize(s))

# #### SExprTokenizer : Tokenizes parenthesized expressions in a string

# In[11]:

from nltk.tokenize import SExprTokenizer

# In[12]:

s = '?(a(b c)d)ef(g(h(i)))'
print("Sentence: " + s)
print("\nSExprTokenizer...")
print(SExprTokenizer().tokenize(s))
print("\n")

# #### TreebankWordTokenizer is the standard tokenizer and does a decent job

# In[13]:

# TreebankWordTokenizer
from nltk.tokenize import TreebankWordTokenizer

# In[14]:

s = "Good muffins cost $3.80 in New York. Dr. Ram Please buy me two of them. Thanks."
print("Sentence: " + s)
print("\nTreebankWordTokenizer...")
print(TreebankWordTokenizer().tokenize(s))
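# By default SExprTokenizer is strict and raises ValueError on unbalanced
# parentheses; strict=False tolerates them, emitting stray parts as plain
# tokens (this example is adapted from the NLTK documentation):

# In[15]:

print(SExprTokenizer(strict=False).tokenize('c) d) e (f (g'))
# -> ['c', ')', 'd', ')', 'e', '(f (g']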
# 8. Write a Python NLTK program that will read a given text through each line
#    and look for sentences. Print each sentence and divide two sentences with
#    "==============".

import nltk

text = '''
Mr. Smith waited for the train. The train was late.
Mary and Samantha took the bus. I looked for Mary and
Samantha at the bus station.
'''
print("\nOriginal Text:")
print(text)
sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
print('\n==============\n'.join(sent_detector.tokenize(text.strip())))

# 9. Write a Python NLTK program to find parenthesized expressions in a given
#    string and divide the string into a sequence of substrings.

from nltk.tokenize import SExprTokenizer

text = '(a b (c d)) e f (g)'
print("\nOriginal Text:")
print(text)
print(SExprTokenizer().tokenize(text))

text = '(a b) (c d) e (f g)'
print("\nOriginal Text:")
print(text)
print(SExprTokenizer().tokenize(text))

text = '[(a b (c d)) e f (g)]'
print("\nOriginal Text:")
print(text)
print(SExprTokenizer().tokenize(text))

text = '{a b {c d}} e f {g}'
print("\nOriginal Text:")
print(text)
print(SExprTokenizer().tokenize(text))
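# Note that SExprTokenizer only treats '(' and ')' as delimiters by default,
# so the bracket and brace inputs above are not grouped. Its parens argument
# selects a different delimiter pair:
text = '{a b {c d}} e f {g}'
print(SExprTokenizer(parens='{}').tokenize(text))
# -> ['{a b {c d}}', 'e', 'f', '{g}']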