from nltk.corpus.reader import WordListCorpusReader
import nltk

# print(nltk.data.find('corpora/cookbook'))
# print(nltk.data.find('corpora/cookbook/wordlist.txt'))

d = nltk.data.find('corpora/cookbook')
reader = WordListCorpusReader(d, ['wordlist.txt'])
print(reader.words())
print(reader.fileids())
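# The snippet above assumes an NLTK data directory that already contains
# corpora/cookbook/wordlist.txt. A minimal, hypothetical setup sketch (the
# directory layout matches what nltk.data.find() expects; the words written
# below are placeholders, not the real wordlist):
import os

cookbook_dir = os.path.join(os.path.expanduser('~/nltk_data'), 'corpora', 'cookbook')
os.makedirs(cookbook_dir, exist_ok=True)
with open(os.path.join(cookbook_dir, 'wordlist.txt'), 'w') as f:
    f.write('\n'.join(['nltk', 'corpus', 'corpora', 'wordnet']))
# ~/nltk_data is on nltk.data.path by default, so the lookup above should now resolve.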
def read_emails(self, path):
    # Get all files in the directory
    files = [f for f in listdir(path) if isfile(join(path, f))]
    try:
        files.remove('.DS_Store')
    except ValueError:
        pass
    reader = WordListCorpusReader(path, files)
    cleaner = Cleaner()
    emails = list()
    # Create an Email object out of each email file and append it to the list
    for file_id in reader.fileids():
        with open(join(path, file_id), 'r') as current_file:
            cleaned_contents = cleaner.clean_file(current_file.read())
            split_email_header, split_email_body, split_email_file_id = self.divide(
                cleaned_contents, file_id)
            emails.append(
                Email(split_email_header, split_email_body, split_email_file_id))
    # Return list of Email objects
    return emails
def find_info_type(self):
    type_list = []     # list of all types ('abstract', 'speaker', ...)
    content_list = []  # list with the matching content
    reader = WordListCorpusReader(self.folder_name, [self.file_name])
    all_words = reader.words()
    # Is the mail a proper one?
    if all_words == []:
        return ([], [])
    # Append the first tag of the mail, e.g. <0.1....>
    type_list.append("")
    content_list.append(all_words[0])
    for w in all_words[1:]:
        # Search for a pattern like "Abstract: ..."
        type = re.search(r'^(\w+)(:)', w)
        # Use the group functionality to split the topic from the content
        if type is not None:
            type_list.append(type.group(1))
            content = re.search(r'^(\w+:)(.*)', w)
            content_list.append(content.group(2))
        # Not the best way to re-attach content that was split on newlines, but it works
        elif len(content_list) > 0:
            last_element = content_list[-1]
            extra_content = w
            last_element = last_element + "\n" + extra_content
            content_list[-1] = last_element
    # If type_list[0] is 'abstract' then content_list[0] will hold the abstract content
    return (type_list, content_list)
def __init__(self, punctuation_marks: str, corpus_dir: str, corpus_files: list):
    reader = WordListCorpusReader(corpus_dir, corpus_files)
    self.vi_dict = set(reader.words())
    # Add the punctuation marks to the dictionary and treat them as correctly spelled
    self.vi_dict.update(list(punctuation_marks))
    # Add some special tokens (units and abbreviations)
    self.vi_dict.update(
        ['m', 'g', 'gt', 'kg', 'km', 'mm', 'cm', 'c', 'f', 't'])
    self.re_d = re.compile(r'\d')
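# A hypothetical companion method for the class above (not part of the
# original source) showing how vi_dict and re_d might be used to flag
# out-of-dictionary tokens; the lowercasing is an assumption about how the
# wordlist is stored:
def is_known(self, token: str) -> bool:
    # Treat tokens containing digits as acceptable rather than spell-checking them
    if self.re_d.search(token):
        return True
    return token.lower() in self.vi_dict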
def read_emails(path):
    files = [f for f in listdir(path) if isfile(join(path, f))]
    try:
        files.remove('.DS_Store')
    except ValueError:
        pass
    reader = WordListCorpusReader(path, files)
    text = clean(reader.raw())
    emails = split_emails(text, reader.fileids())
    return emails
def addingCorpus():
    path = os.path.expanduser('~/nltk_data')
    if not os.path.exists(path):
        os.mkdir(path)
    print(os.path.exists(path))
    print(nltk.data.path)
    print(path in nltk.data.path)
    nltk.data.load('corpora/cookbook/cookbook.txt', format='raw')
    reader = WordListCorpusReader(os.path.join(path, 'corpora', 'cookbook'),
                                  ['wordlist.txt'])
    print(reader.words())
def tokenize_file(file, corpus_root, english_stops):
    # Tokenize the input file, count words and characters, and remove stopwords
    tokenizer = RegexpTokenizer(r'\w+')
    total_chars = 0
    word_count = 0
    wordlist = []
    reader = WordListCorpusReader(corpus_root, file)
    chunks = reader.words()
    for chunk in chunks:
        total_chars += len(chunk)
        word_tokens = tokenizer.tokenize(chunk)
        word_count += len(word_tokens)
        for word in word_tokens:
            wordlist.append(word)
    stopsout = [word for word in wordlist if word.lower() not in english_stops]
    return wordlist, stopsout, word_count, total_chars
def main():
    reader = WordListCorpusReader(path, ['banbagsfb.txt'])
    pages = line_tokenize(reader.raw())
    thispage = pages[4]

    # The easiest way to deal with strings in Python that contain escape
    # characters and quotes is to triple double-quote the string (""") and
    # prefix it with r. For example:
    #
    #     my_str = r"""This string would "really "suck"" to write if I didn't
    #     know how to tell Python to parse it as "raw" text with the 'r'
    #     character and triple " quotes. Especially since I want \n to show
    #     up as a backslash followed by n. I don't want \0 to be the null
    #     byte either!"""
    #
    # The r means "take escape characters as literal". The triple double-quotes
    # (""") prevent single-quotes, double-quotes, and double double-quotes from
    # prematurely ending the string.

    m = re.search(r"(\d)", thispage)
    thisitem = m.group(0)
    m = re.search(r"(\d\d\D\d\d)", thispage)
    thisdate = m.group(0)
    starturl = thispage.find('http')
    endurl = thispage.find(' ', starturl) - 2
    thisurl = thispage[starturl:endurl]
    soup = BeautifulSoup(thispage)
    newpage = ''.join(soup.findAll(text=True))
    html = replace_all(newpage, reps)
    html = html[11:len(html)]
    postdate = html[0:5]
    posttext = html[5:len(html)]
    print "post date = " + postdate
    print "post text = " + posttext


def replace_all(txt, reps):
    for i, j in reps.iteritems():
        txt = txt.replace(i, j)
    return txt


if __name__ == "__main__":
    main()
def __init__(self, config_file):
    try:
        self.config = ConfigParser.RawConfigParser()
        self.config.optionxform = str
        self.config.read(config_file)
        tokenizers = self.config.get('post_training_corpus', 'regex_file')
        self.config_tokenizer = json.load(open(tokenizers, "r"))
        self.isWordList = self.config.getboolean('postaggers', 'isWordList')
        self.wordlist = self.config.items('postaggers.wordlist')
        self.training_portion = self.config.getfloat(
            'post_training_corpus', 'training_portion')
        self.taggers_path = self.config.get('postaggers', 'save_to')
        self.max_ngrams = self.config.getint('postaggers', 'max_ngrams')
        self.tagger_extension_file = self.config.get(
            'postaggers', 'ext_file')
        corpus = []
        for key, corpus_file in self.config.items(
                'post_training_corpus.corpus'):
            print "Generate model from file:", corpus_file
            corpus.append(corpus_file)
        self.corpusReader = ConllChunkCorpusReader(
            self.config.get('post_training_corpus', 'corpora'),
            corpus, ('NP', 'PP', 'VP', 'AP'))
        self.corpusSents = self.corpusReader.tagged_sents()
        self.wordListReader = WordListCorpusReader(
            self.config.get('post_training_corpus', 'wordlist_path'),
            r'.*\.txt')
        self.regex_list = []
        for key in self.config_tokenizer.keys():
            if self.config_tokenizer[key]['isolate'] == "True":
                regex = self.config_tokenizer[key]['regex'].encode(
                    'utf-8').decode('utf-8')
                post = self.config_tokenizer[key]['post']
                self.regex_list.append((regex, post))
        # logging.info(self.regex_list)
    except Exception, e:
        print "Error :", str(e)
        pdb.set_trace()
import nltk
from nltk.corpus.reader import WordListCorpusReader
from nltk.stem.porter import *
from nltk.stem import WordNetLemmatizer

reader = WordListCorpusReader('', ['computerscience.txt'])
words = [nltk.word_tokenize(i) for i in reader.words()]

stemmer = PorterStemmer()
lemmer = WordNetLemmatizer()

stemmed = [[stemmer.stem(y) for y in i] for i in words]
lemmed = [[lemmer.lemmatize(y) for y in i] for i in words]
print(stemmed)
import nltk
from nltk.corpus import brown
from nltk.corpus.reader import WordListCorpusReader
from nltk.stem.porter import *
from nltk.stem import WordNetLemmatizer

x = nltk.data.load('files/big.txt', format='text')

reader = WordListCorpusReader('files/', ['computerscience.txt'])
cs_text = reader.raw()
cs_words = nltk.word_tokenize(cs_text)
print(cs_words)

stemmer = PorterStemmer()
wnl = WordNetLemmatizer()
for word in cs_words:
    print(stemmer.stem(word))
    print(wnl.lemmatize(word))
import os.path

path = os.path.expanduser('~/nltk_data')
if not os.path.exists(path):
    os.mkdir(path)
os.path.exists(path)

import nltk.data
# path in nltk.data.path
print path
''' note that this should be a path in the Git_Workspace on D:\ '''

''' load a sample wordlist '''
# import nltk.data
nltk.data.load('corpora/cookbook/GL_Sequent.txt', format='raw')
# 'nltk\n'

from nltk.corpus.reader import WordListCorpusReader
reader = WordListCorpusReader(path + '/corpora/cookbook/', ['GL_Sequent.txt'])
reader.words()

''' reading a tagged corpus '''
from nltk.corpus.reader import TaggedCorpusReader
reader = TaggedCorpusReader(path + '/corpora/cookbook/', r'.*\.pos')
reader.words()
reader.tagged_words()
reader.sents()
reader.tagged_sents()
reader.paras()
reader.tagged_paras()

''' different Tokenizer - works? '''
from nltk.tokenize import SpaceTokenizer
reader = TaggedCorpusReader(path + '/corpora/cookbook/', r'.*\.pos',
                            word_tokenizer=SpaceTokenizer())
from nltk.corpus import brown
from nltk.corpus.reader import WordListCorpusReader
from nltk.stem.porter import *
from nltk.stem import WordNetLemmatizer
import re
from os import listdir
from os.path import isfile, join

wnl = WordNetLemmatizer()
stemmer = PorterStemmer()

tagged_data_filepath = "/Users/aledjackson/Documents/University/Second Year/Modules/Natural Language Processing/Assignments/assignment1/training"
untagged_data_filepath = "/Users/aledjackson/Documents/University/Second Year/Modules/Natural Language Processing/Assignments/assignment1/seminar_testdata/test_untagged"
general_data_filepath = "/Users/aledjackson/Documents/University/Second Year/Modules/Natural Language Processing/Assignments/assignment1/Data"

l_names = WordListCorpusReader(general_data_filepath, ["names.family"]).words()

file_names = [
    f for f in listdir(untagged_data_filepath)
    if isfile(join(untagged_data_filepath, f))
]
file_names = file_names[1:]

reader = WordListCorpusReader(untagged_data_filepath, [file_names[0]])
corpus = reader.raw()
words = reader.words()


def get_tags_by_name(corpus, name):
    return re.findall(r"<" + name + r">.+</" + name + r">", corpus)
def __init__(self):
    '''
    Constructor for the BE06 word list corpus.

    @note: Initially the constructor tries to load the corpus from a .pkl
    file. If this has not been created yet, a new instance is built by
    iterating through all files for BE06.
    '''
    try:
        # Attempt to open the .pkl file and load it.
        input = open("./Corpus/BE06/BE06.pkl", 'rb')
        reader = load(input)
        input.close()
    except IOError as e:
        filelist = []
        words = []
        # Find all .txt files in the /BE06 directory
        for files in os.listdir("./Corpus/BE06"):
            if files.endswith(".txt"):
                filelist.append(files)
        if len(filelist) == 500:
            # Iterate through the whole list of files
            for name in filelist:
                f = open("./Corpus/BE06/" + name)
                lines = f.readlines()
                # Read each line in the file, tokenize it into words, and
                # remove all punctuation
                for line in lines:
                    tmp1 = nltk.sent_tokenize(line)
                    for lin in tmp1:
                        tmp = nltk.word_tokenize(lin)
                        for word in tmp:
                            for c in string.punctuation:
                                word = word.replace(c, "")
                            words.append(word)
                f.close()
            # Write the word list to the output file.
            a = open("./Corpus/BE06/finalcorpa.txt", "wb")
            for word in words:
                if word not in ".,;!?\"":
                    a.write(word + '\n')
            a.close()
            # Create the NLTK corpus and save a pickled copy for later use
            reader = WordListCorpusReader('./Corpus/BE06', ['finalcorpa.txt'])
            output = open("./Corpus/BE06/BE06.pkl", 'wb')
            dump(reader, output, -1)
            output.close()
        else:
            reader = WordListCorpusReader('./Corpus/BE06', ['finalcorpa.txt'])
            output = open("./Corpus/BE06/BE06.pkl", 'wb')
            dump(reader, output, -1)
            output.close()
    # Store the corpus
    self.corpa = reader
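# A hypothetical usage sketch for the constructor above (the class name
# BE06Corpus is an assumption; only its __init__ is shown in the original):
#
#     corpus = BE06Corpus()
#     print(corpus.corpa.words()[:20])   # first few words of the BE06 wordlist
#
# On the first run this walks the 500 BE06 .txt files and pickles the
# resulting WordListCorpusReader; later runs simply unpickle BE06.pkl.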
import nltk.data
from nltk.corpus.reader import WordListCorpusReader
from nltk.corpus import names
from nltk.corpus.reader import TaggedCorpusReader
from nltk.tokenize import SpaceTokenizer
from nltk.corpus import treebank

wordlist = WordListCorpusReader("C:/nltk_data/corpora/cookbook", ['wordlist'])
print(wordlist.words())
print(wordlist.fileids())

print(names.fileids())
print(len(names.words('male.txt')))

reader = TaggedCorpusReader("C:/nltk_data/corpora/treebank/tagged",
                            r'.*\.pos',
                            word_tokenizer=SpaceTokenizer(),
                            tagset='en-brown')
print(reader.words('wsj_0001.pos'))
print(reader.tagged_words('wsj_0001.pos'))
print(reader.tagged_sents('wsj_0001.pos'))
print(reader.tagged_paras('wsj_0001.pos'))
print(reader.fileids())
print("\n")
print(reader.tagged_words('wsj_0001.pos', tagset='universal'))
print(treebank.tagged_words())
# Create a corpus reader with all the files
reader = PlaintextCorpusReader('.', files)

# Set up a translation table mapping punctuation to the empty string
table = str.maketrans('', '', string.punctuation)

# Get a list of English stopwords without punctuation
english_stops = set(stopwords.words('english'))
english_stops_nopunct = {
    stopword.translate(table) for stopword in english_stops
}

# Load the insect wordlist of stems
insect_words = WordListCorpusReader('.', ['wordlists/insect-wordstems.txt'])

# A list to hold the frequency data
freq_data = []

count = 1
# Read each file in turn
for file in files:
    text = reader.raw(file)
    print(f'{count}: TOKENISING {file}')
    # Tokenise and normalise to lowercase
    tokens = word_tokenize(text.lower())
    # Remove all punctuation marks
    n_score = ret.prob("negative")
    if max(p_score, n_score) <= cutoff:
        return "neutral"
    if p_score > n_score:
        return "positive"
    elif n_score > p_score:
        return "negative"
    else:
        return "neutral"


reader = WordListCorpusReader('/path/to/sentiment/files',
                              ['positive.txt', 'negative.txt'])
pos_feats = [(dict([(word, True)]), 'positive')
             for word in reader.words('positive.txt')]
neg_feats = [(dict([(word, True)]), 'negative')
             for word in reader.words('negative.txt')]
train_feats = pos_feats + neg_feats
classifier = NaiveBayesClassifier.train(train_feats)

t = Twitter(auth=OAuth("TOKEN", "TOKEN_KEY", "CON_SECRET", "CON_SECRET_KEY"))

connection = pymongo.Connection()
db = connection.twitter
mentions = db.mentions

screen_names = ["YOUR_ACCOUNT", "YOUR_OTHER_ACCOUNT"]
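# A minimal sketch of how the trained classifier above might be queried for a
# piece of tweet text. The classify_text helper and the whitespace
# tokenisation are assumptions for illustration, not part of the original
# script:
def classify_text(text, cutoff=0.5):
    # Build the same single-word featureset shape used for training
    feats = dict((word, True) for word in text.lower().split())
    ret = classifier.prob_classify(feats)
    p_score = ret.prob("positive")
    n_score = ret.prob("negative")
    if max(p_score, n_score) <= cutoff:
        return "neutral"
    return "positive" if p_score > n_score else "negative"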
        # print(fileName)
        counter = 0
        docWords = corpus.words(fileName)
        for word in docWords:
            # print(word)
            w = word.lower()
            if w in wordSet:
                # I could also use the fd.inc approach here and it's probably
                # better, just showing another option.
                print(w + " is in " + fileName)
                counter += 1
        billCounts.append(counter)
    return billCounts


from nltk.corpus.reader import WordListCorpusReader

path = "/Volumes/Optibay-1TB/Dropbox/Content_Wilker/Gonzalez_Project/Gonzalez_Keywords"
reader = WordListCorpusReader(path, ['crime.txt'])

# Make an nltk word list
crime = reader.words()
crime = [word.lower().strip() for word in crime]
crimeSet = set([w.lower() for w in crime])

crimeCount = make_count(billsCorpora, crimeSet)
fd = count_stems(billsCorpora)

counter = 0
# Let's look at 200 of the most popular items and their counts.
# You could use the csv writer methods or this, which is kind of hacky.
mywordlist = numpy.asarray([billsCorpora.fileids(), crimeCount])
mywordlist[0][1]  # name
x = nltk.data.load('big.txt', format='auto')
'''
reader = WordListCorpusReader('', ['wordlist.txt', 'wordlist2.txt'])
print(reader.words())
print(reader.fileids())

stemmer = PorterStemmer()
print(stemmer.stem('running'))

wnl = WordNetLemmatizer()
print(wnl.lemmatize('dogs'))
'''

csReader = WordListCorpusReader('', ['computerscience.txt'])
wnl = WordNetLemmatizer()
stemmer = PorterStemmer()

### Concatenate the list of words from the reader, then tokenize
csWords = nltk.word_tokenize(' '.join(csReader.words()))
print(type(csWords))

for word in csWords:
    print("%s,%s" % (wnl.lemmatize(word), stemmer.stem(word)))

inputList = ['16/12/2016']
for inputString in inputList:
    print(re.findall(r'(.*?)[\s\-\\](.*?)[\s\-\\](.*?)', inputString))
#!/usr/bin/env python
# encoding: utf-8
"""
wordNet.py

Created by Aaron Erlich on 2013-02-13.
"""

import sys
import os
import nltk
from nltk.corpus.reader import WordListCorpusReader

path = ''  # insert your path
# path = "/Volumes/Optibay-1TB/Dropbox/Content_Wilker/Gonzalez_Project/Gonzalez_Keywords"
reader = WordListCorpusReader(path, ['crime.txt'])

# Make an nltk word list
crime = reader.words()
crime = [word.lower().strip() for word in crime]

from nltk.corpus import wordnet

# Lemmas are the distinct meanings of a word and all of each meaning's
# possible morphologies. We see that lots of the student's words have both
# noun and verb meanings. Which does he care about?
# These words are polysemous: they have similar but different meanings.
for word in crime:
    print word
    print wordnet.synsets(word)
    print "\n"
    raw_input("Hit Enter")

[synset.lemma_names for synset in wordnet.synsets("stealing")]
onlyfilessbsa1 = [
    f for f in listdir(corpora + '/golden_test_subset_a')
    if isfile(join(corpora + '/golden_test_subset_a', f))
]
onlyfilessbsa2 = [
    f for f in listdir(corpora + '/golden_tagged_subset_a')
    if isfile(join(corpora + '/golden_tagged_subset_a', f))
]

testc = nltk.corpus.reader.plaintext.PlaintextCorpusReader(
    corpora + '/golden_test_subset_a', onlyfilessbsa1)
tagdc = nltk.corpus.reader.plaintext.PlaintextCorpusReader(
    corpora + '/golden_tagged_subset_a', onlyfilessbsa2)

# Getting the named-entity corpora
names = WordListCorpusReader(
    nepath, ['male.txt', 'female.txt', 'family.txt'])  # lists of names, from canvas
titles = WordListCorpusReader(nepath, ['titles.txt'])  # list of common titles
orgsuffs = WordListCorpusReader(
    nepath, ['orgsuff.txt'])  # list of organisation suffixes
daymonths = WordListCorpusReader(nepath, ['daymonths.txt'])  # list of days and months

# Extracting named entities from the tagged data
# Regex patterns to match each tag
pattern1 = r'<ENAMEX TYPE="PERSON">(.*?)<\/ENAMEX>'
pattern2 = r'<ENAMEX TYPE="LOCATION">(.*?)<\/ENAMEX>'
pattern3 = r'<ENAMEX TYPE="ORGANIZATION">(.*?)<\/ENAMEX>'

# Find every example in the data and store each group in a set
people = set(re.findall(pattern1, trainingcorpus.raw()))
import os
import re
import sys
import json

import nltk.test
import abbreviations
import portuguese_tagger_processor
from sentilex import sentiLexPairRdd
from nltk.corpus.reader import WordListCorpusReader

__output_path = "result.json"

stopwords = nltk.corpus.stopwords.words('portuguese')

reader = WordListCorpusReader('.', ['symbols.txt'])
symbols = reader.words()

reader = WordListCorpusReader('.', ['positive_emoticons.txt'])
positive_emoticons = reader.words()

reader = WordListCorpusReader('.', ['negative_emoticons.txt'])
negative_emoticons = reader.words()

tweet_tokenizer = portuguese_tagger_processor.get_tweet_tokenizer()
tagger = portuguese_tagger_processor.get_tagger()

json_result = []
tweet_dict = {}


def count_positive_emoticons(tokens):
    counter = 0
    for emoticon in positive_emoticons:
        if emoticon in tokens:
            counter += 1
    return counter
def word_normalizer(word):
    return filter(lambda word: word not in 'the,"inisarefromwithonfor1234567890asbywasretrieved.andof:;''()).{}[]-to\'&#^%160/20102011201220132009ateditwikipedia].isbn\x80\x93),.&#\xe0\xe1has\xd0\xd1\xb0worldthisthat|&201420072006200520042003200220012000+=-', word)


# Feature extractor
# Frequency count
def geo_features(word):
    return {'any_word': word}


# Initialize constants
NLTK_HOME = '/home/administrator/nltk_data'
l_list = []

# Cleaning, tokenizing, normalizing
# Read the corpus
state_reader = WordListCorpusReader(NLTK_HOME, ['state_files.txt'])
city_reader = WordListCorpusReader(NLTK_HOME, ['city_files.txt'])

train_file = '/app/ai/train_file.txt'
test_results_file = '/app/ai/test_city_results_file.txt'

# Store the URLs in a list
urls = ([(url, 'city') for url in city_reader.words()] +
        [(url, 'state') for url in state_reader.words()])

for url in list(urls):
    # Remove HTML tags after reading the URL
    raw = nltk.clean_html(urlopen(url[0]).read())
    print 'Finished cleaning html for ', url[0]
    # Compute the frequency distribution of the words
def open_places_wordlist():
    path = '/Users/tim/mycode/time/wordlists/'
    wordlist = 'ga_gazetteer_wordlist.txt'
    reader = WordListCorpusReader(path, [wordlist])
    return reader
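# A small usage sketch for the helper above (the output depends on the local
# gazetteer file, so this is illustrative only):
places_reader = open_places_wordlist()
print(places_reader.fileids())     # ['ga_gazetteer_wordlist.txt']
print(places_reader.words()[:10])  # first ten place names in the gazetteer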
    WP = 'WP'
    # 35. Possessive wh-pronoun
    WP_ = 'WP$'
    # 36. Wh-adverb
    WRB = 'WRB'

    @staticmethod
    def nounish(word, pos):
        # nltk apparently defaults to 'NN' for smileys :) so special-case those
        return pos in (POS.NN, POS.NNS, POS.NNP, POS.NNPS) and \
            any(c.isalpha() for c in word)


mass_noun_corpora = WordListCorpusReader('wordlist/massnoun', r'[a-z]+')
mass_nouns = mass_noun_corpora.words()

QUANTITY_POS_TAGS = frozenset((
    POS.JJ,
    POS.VBN,
    POS.VBP,
    POS.NN,
    POS.NNP,
    POS.RB,
    POS.RBR,
    POS.RBS,
))

bad_words_corpora = WordListCorpusReader('wordlist/shutterstock-bad-words',
                                         r'[a-z]{2,3}')
bad_words_en = bad_words_corpora.words('en')
    user='******',
    passwd='Webrowse@123',
    db='article')
cur = db.cursor()
'''

dataset = load_files(
    '/home/soumen/projects/scikit-learn/doc/tutorial/text_analytics/data/languages/paragraphs'
)

# Read an article
file_id_argv = open(sys.argv[1])
file_id = file_id_argv.read()
file_list = file_id.split('\n')
file_list.pop(-1)

italian_stopwords = WordListCorpusReader('.', ['stop-words-it-en.txt'])


def language_detection(text):
    """Description here"""
    count_vect = CountVectorizer()
    X_train_counts = count_vect.fit_transform(dataset.data)
    tfidf_transformer = TfidfTransformer()
    X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
    X_test_counts = count_vect.transform(text)
    X_test_tfidf = tfidf_transformer.transform(X_test_counts)
    clf = MultinomialNB().fit(X_train_tfidf, dataset.target)
    predicted = clf.predict(X_test_tfidf)
# from nltk.stem import WordNetLemmatizer
# wnl = WordNetLemmatizer()
# print(wnl.lemmatize('monsters'))

'''
In each of the cases above we have handled one word at a time. Now print the
stemmed and lemmatized versions of all the words in the document
computerscience.txt. Here is an overview of what you need to do:

1. Load the file into a reader [Hint: reader = WordListCorpusReader( ... )]
2. Use word_tokenize from nltk.tokenize to convert the text into words.
3. Loop through the text [Hint: use the for statement].
4. Lemmatize and stem each word.
5. Look at the difference between the two; notice how the lemmatizer makes
   mistakes in some cases - can you identify why and propose a solution?
'''

from nltk.corpus.reader import WordListCorpusReader

tokens = []
reader = WordListCorpusReader('./', ['computerscience.txt'])
for count, ele in enumerate(reader.words()):
    print(count, "\b:", ele, "\n")
    tokens += nltk.word_tokenize(ele)
print(tokens)

from nltk.stem.porter import *
stemmer = PorterStemmer()

from nltk.stem import WordNetLemmatizer
wnl = WordNetLemmatizer()

for token in tokens:
    print(token)
    print(wnl.lemmatize(token))
    print(stemmer.stem(token))
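# One common reason the lemmatizer "makes mistakes" in step 5 above is that
# WordNetLemmatizer treats every token as a noun by default. A minimal sketch
# of one possible fix (not part of the original exercise; the to_wordnet_pos
# helper is an illustrative assumption) is to pass a part-of-speech hint
# derived from nltk.pos_tag:
import nltk
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

wnl = WordNetLemmatizer()


def to_wordnet_pos(treebank_tag):
    # Map Penn Treebank tags to the WordNet POS constants the lemmatizer expects
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    if treebank_tag.startswith('V'):
        return wordnet.VERB
    if treebank_tag.startswith('R'):
        return wordnet.ADV
    return wordnet.NOUN


for token, tag in nltk.pos_tag(['running', 'better', 'computers']):
    print(token, wnl.lemmatize(token, pos=to_wordnet_pos(tag)))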
########## WORDLIST CORPUS READER ###############

# Basic corpus reader
from nltk.corpus.reader import WordListCorpusReader
# List of a few thousand names organized by gender
from nltk.corpus import names
# List of English words
from nltk.corpus import words

nltkDir = "C:\\Users\\Matrix\\AppData\\Roaming\\nltk_data\\corpora\\cookbook\\"
#nltkFile = "mywords.txt"
#source = nltkDir + nltkFile

### One-file WordListCorpusReader
reader = WordListCorpusReader(nltkDir, ['wordlist.txt'])
print reader.words()
print reader.fileids()

### Multi-file WordListCorpusReader
# To get the names of the files in the corpus, use the "fileids" command
names.fileids()
print len(names.words('female.txt'))
print len(names.words('male.txt'))

words.fileids()
print len(words.words('en-basic'))
print len(words.words('en'))

### Chunked Corpus Reader
import nltk
from nltk import load_parser
from nltk.corpus.reader import WordListCorpusReader

reader = WordListCorpusReader('', ['words.txt'])
words = [nltk.word_tokenize(i) for i in reader.words()]

cp = load_parser('grammar.fcfg', trace=1)

# from nltk.corpus import treebank
# from nltk.tag import DefaultTagger
# train_set = treebank.tagged_sents()[:4000]
# test_set = treebank.tagged_sents()[2000:]

# from nltk.tag import UnigramTagger
# unigramTagger = UnigramTagger(train_set)

# from nltk.tag import BigramTagger, TrigramTagger
# bigramTagger = BigramTagger(train_set, cutoff=2)
# trigramTagger = TrigramTagger(train_set, cutoff=3)

# def backoff_tagger(train_sents, tagger_classes, backoff=None):
#     for cls in tagger_classes:
#         backoff = cls(train_sents, backoff=backoff)
#     return backoff

# tagger = backoff_tagger(train_set, [UnigramTagger, BigramTagger, TrigramTagger],
#                         backoff=DefaultTagger('NN'))

# for sentence in words:
#     print(tagger.tag(sentence))

for sentence in words:
    print(sentence)
    for tree in cp.parse(sentence):
        print(tree)