def POS_tagging(corpus):
    train_text = state_union.raw("2005-GWBush.txt")
    sample_text = corpus
    # print(train_text)
    custom_sentence_tokenizer = PunktSentenceTokenizer(train_text)
    # textfile = open("POS_tagged", 'w')
    # textfile.write(train_text)
    # textfile.write("\n\n\n\n\n\n\n\n\n\n")
    # print(custom_sentence_tokenizer)
    tokenized = custom_sentence_tokenizer.tokenize(sample_text)
    tuples_list = []

    def process_content():
        try:
            for i in tokenized:
                words = nltk.word_tokenize(i)
                tagged = nltk.pos_tag(words)
                for w in tagged:
                    tuples_list.append(w)
        except Exception as e:
            c = 0
            # print(str(e))

    process_content()
    return tuples_list
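# Usage sketch (not part of the original snippet): POS_tagging above assumes that
# nltk, state_union and PunktSentenceTokenizer are already imported at module level.
import nltk
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer

tagged_pairs = POS_tagging("The quick brown fox jumps over the lazy dog.")
print(tagged_pairs[:5])  # first few (word, tag) tuples, e.g. ('The', 'DT')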
def main():
    training_text = state_union.raw('2005-GWBush.txt')
    sample_text = state_union.raw('2006-GWBush.txt')
    custom_sent_tokenizer = PunktSentenceTokenizer(training_text)
    tokenized = custom_sent_tokenizer.tokenize(sample_text)
    choice = 0
    while choice < 5:
        choice = int(input(
            "1 for named_chunks. This provides some information about proper nouns.\n"
            "2 for process_chunks. This tells you if a noun phrase followed by an adverb occurs.\n"
            "3 for process_content. This just prints stuff.\n"
            "4 for..."))
        if choice == 1:
            named_chunks(text_trained_tokenized(sample_text, training_text))
        elif choice == 2:
            process_chunks(text_trained_tokenized(sample_text, training_text))
        elif choice == 3:
            process_content(text_trained_tokenized(sample_text, training_text))
        elif choice == 4:
            print("try again!")
def name_ent_recog(post):
    train_text = state_union.raw("2005-GWBush.txt")
    sample_text = post
    custom_sent_tokenizer = PunktSentenceTokenizer(train_text)
    tokenized = custom_sent_tokenizer.tokenize(sample_text)
    namedEnt = []
    try:
        for i in tokenized:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            namedEnt.append(nltk.ne_chunk(tagged))
    except Exception as e:
        print(str(e))
    return namedEnt
def main(argv):
    print("main")
    # namedEnts = named_ents("Bill went to the White House. He saw the President of the United States. Then he went to O'hare International Airport. He flew to The Democratic Republic of Congo. He will not go back to the White House any time soon. The President of the United States is disappointed by this.")
    # print(namedEnts)
    f = open("north_korea.txt")
    text = f.read()
    # print(text)
    johnson = state_union.raw("1968-Johnson.txt")
    ent_list = text_ents(johnson)
    ent_freq = nltk.FreqDist(ent_list)
    print(ent_freq.most_common())
    print(ent_freq)
    print(list(ent_freq.values()))
    print(list(ent_freq.keys()))
def POS_tagging(corpus):
    train_text = state_union.raw("2005-GWBush.txt")
    sample_text = ""
    for i in corpus:
        sample_text = sample_text + i + " "
    tuples_list = []

    def process_content():
        try:
            words = nltk.word_tokenize(sample_text)
            tagged = nltk.pos_tag(words)
            for w in tagged:
                tuples_list.append(w)
        except Exception as e:
            print(str(e))

    process_content()
    return tuples_list
def name_ent_recog(post):
    train_text = state_union.raw("2005-GWBush.txt")
    sample_text = post
    custom_sent_tokenizer = PunktSentenceTokenizer(train_text)
    tokenized = custom_sent_tokenizer.tokenize(sample_text)
    namedEnt = []
    try:
        for i in tokenized:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            namedEnt.append(nltk.ne_chunk(tagged))
            # chunkGram = r"""Chunk: {<RB.?>*<VB.?>*<NNP.?>*<NN>?}"""
            # # chunkGram = r"""Chunk: {<.*>+}
            # #                        }<VB.?|IN|DT>+{"""
            # chunkParser = nltk.RegexpParser(chunkGram)
            # chunked = chunkParser.parse(tagged)
            # print(chunked)
            # # print(tagged)
    except Exception as e:
        print(str(e))
    return namedEnt
# Imports needed to run this example on its own.
from nltk import pos_tag
from nltk.tokenize import word_tokenize, PunktSentenceTokenizer
from nltk.corpus import state_union

example_text = "This is an example text to test the NLTK parts of speech tagging"
words = word_tokenize(example_text)
print(words)

for word in words:
    # word_list = [word]
    tokenized_word = word_tokenize(word)  # returns a list of words from a sentence; a single word is wrapped in a list
    # print(tokenized_word)
    print(pos_tag(tokenized_word))  # the pos_tag function takes a list of words as input

train_text = state_union.raw("2005-GWBush.txt")  # state_union is a text corpus; we use the raw text of one of its files as training data
sample_text = state_union.raw("2006-GWBush.txt")  # use another text file as sample data
custom_sentence_tokenizer = PunktSentenceTokenizer(train_text)  # initialize PunktSentenceTokenizer with the training data
tokenized = custom_sentence_tokenizer.tokenize(sample_text)  # tokenize the sample data into sentences

def process_content():
    try:
        for i in tokenized:
            words = word_tokenize(i)  # tokenize words from each sentence
            tagged = pos_tag(words)   # get the tags for the words
            print(tagged)
    except Exception as e:
        print(str(e))
import nltk
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer
# PunktSentenceTokenizer is an unsupervised ML tokenizer.
# It comes pre-trained and it can be retrained too.

train = state_union.raw('2005-GWBush.txt')
sample = state_union.raw('2006-GWBush.txt')
# training and testing data
custom_sent_tokenizer = PunktSentenceTokenizer(train)
tokenized = custom_sent_tokenizer.tokenize(sample)

def process_content():
    try:
        for i in tokenized:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            print(tagged)
    except Exception as e:
        print(str(e))

process_content()
# pos_tag creates tuples of each word and its tag
'''
POS tag list
CC    coordinating conjunction
CD    cardinal digit
import nltk
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer

trainText = state_union.raw("project.txt")
sampleText = state_union.raw("project.txt")

custom_sent_tokenizer = PunktSentenceTokenizer(trainText)
tokenized = custom_sent_tokenizer.tokenize(sampleText)

def process_content():
    try:
        for i in tokenized:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            print(tagged)
    except Exception as e:
        print(str(e))

process_content()
#!/usr/bin/env python
import nltk
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer  # unsupervised tokenizer

train_text = state_union.raw('2005-GWBush.txt')
# print(train_text)
test_text = state_union.raw('2006-GWBush.txt')

custom_sent_token = PunktSentenceTokenizer(train_text)
tokenized = custom_sent_token.tokenize(test_text)
# print(tokenized)
# print(type(tokenized))

def chunk():
    try:
        for i in tokenized:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            regexp = r"""Chunk: {<RB.?>*<VB.?>*<NNP>+<NN>?}
                                }<VB.?|IN|DT|TO>+{"""
            parser = nltk.RegexpParser(regexp)
TO    to                                go 'to' the store
UH    interjection                      errrrrrrrm
VB    verb, base form                   take
VBD   verb, past tense                  took
VBG   verb, gerund/present participle   taking
VBN   verb, past participle             taken
VBP   verb, sing. present, non-3d       take
VBZ   verb, 3rd person sing. present    takes
WDT   wh-determiner                     which
WP    wh-pronoun                        who, what
WP$   possessive wh-pronoun             whose
WRB   wh-adverb                         where, when
"""

# retrieving the corpus
train_text = state_union.raw('2005-GWBush.txt')
text = state_union.raw('2006-GWBush.txt')

# training the sentence tokenizer (unsupervised)
tokenizer = PunktSentenceTokenizer(train_text)
sentence = tokenizer.tokenize(text)

# tagging the tokens by word-tokenizing each sentence, then using a regular expression to chunk the tokens
try:
    for s in sentence:
        token = word_tokenize(s)
        pos = pos_tag(token)
        print(pos)
        chunkreg = r"""Chunk: {<RB.?>*<VB.?>*<NNP>+<NN>?}
                              }<VB.?|IN|DT|TO>+{"""
        chunkParser = nltk.RegexpParser(chunkreg)
        chunked = chunkParser.parse(pos)
# -*- coding: utf-8 -*-
"""
Created on Sun Jan  7 17:46:46 2018

@author: noelg
"""
import nltk
# print(nltk.__file__)

from nltk.tokenize import sent_tokenize, PunktSentenceTokenizer
from nltk.corpus import state_union

sample = state_union.raw("2005-GWBush.txt")

tok = sent_tokenize(sample)

for x in range(5):
    print(tok[x])
'''
This program is for chunking with nltk
'''
import nltk
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer

train_text = state_union.raw("")
sample_text = state_union.raw("")

custom_sent_tokenize = PunktSentenceTokenizer(train_text)
tokenized = custom_sent_tokenize.tokenize(sample_text)

def process_content():
    for i in tokenized:
        words = nltk.word_tokenize(i)
        tagged = nltk.pos_tag(words)
        chunkgram = r"""chunk: {<RB.?>*<VB.?>*<NNP>+<NN>?}"""
        chunkParser = nltk.RegexpParser(chunkgram)
        chunked = chunkParser.parse(tagged)
        chunked.draw()
import nltk
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer

train_text = state_union.raw('2005-GWBush.txt')
sample_text = state_union.raw('2005-GWBush.txt')

custom_sent_tokenizer = PunktSentenceTokenizer(train_text)
tokenized = custom_sent_tokenizer.tokenize(sample_text)

def process_content():
    try:
        for i in tokenized:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            # look for zero or more adverbs, then zero or more verbs, then one or more
            # proper nouns, then at most one noun -- this defines the chunk grammar
            chunkGram = r'''Chunk: {<RB.?>*<VB.?>*<NNP>+<NN>?}'''
            # create a parser based on the chunk grammar
            chunkParser = nltk.RegexpParser(chunkGram)
            # pass the tagged words to the parser to get the chunks
            chunked = chunkParser.parse(tagged)
            print(chunked)
            chunked.draw()
    except:
        pass

process_content()
import nltk
import nltk.data
from nltk.corpus import state_union

text = state_union.raw("2006-GWBush.txt")

custom_sentence_tokenizer = nltk.data.load("tokenizers/punkt/english.pickle")
tokenized = custom_sentence_tokenizer.tokenize(text)
# print(tokenized)

def process_content():
    try:
        for i in tokenized:
            word = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(word)
            chunkGram = r"""Chunk: {<RB.?>*<VB.?>*<NNP>+<NN>?}"""
            chunkParser = nltk.RegexpParser(chunkGram)
            chunked = chunkParser.parse(tagged)
            # chunked.draw()

            # for getting both chunks and non-chunks
            for subtree in chunked.subtrees():
                print(subtree)

            # for filtering only chunks
            for subtree in chunked.subtrees(filter=lambda t: t.label() == "Chunk"):
                print(subtree)
import nltk
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer

# The training and sample texts must be loaded before the tokenizer is built.
train_text = state_union.raw("2017Balkon.txt")
sample_text = state_union.raw("2018Balkon.txt")

custom_sent_tokenizer = PunktSentenceTokenizer(train_text)
tokenized = custom_sent_tokenizer.tokenize(sample_text)

def process_content():
    try:
        for i in tokenized[:5]:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            print(tagged)
    except Exception as e:
        print(str(e))

process_content()
def __init__(self, train_text, sample_text):
    self.train_text = state_union.raw(train_text)
    self.sample_text = state_union.raw(sample_text)
    self.custom_sent_tokenizer = PunktSentenceTokenizer(self.train_text)
    self.tokenized = self.custom_sent_tokenizer.tokenize(self.sample_text)
import nltk
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer

train_text = state_union.raw("2005-GWBush.txt")
sample_text = state_union.raw("2006-GWBush.txt")

custom_sent_tokenizer = PunktSentenceTokenizer(train_text)
tokenized = custom_sent_tokenizer.tokenize(sample_text)

def process_content():
    try:
        for i in tokenized:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            chunkGram = r"""Chunk: {<RB.?>*<VB.?>*<NNP>+<NN>?}"""  # regular expression
            chunkParser = nltk.RegexpParser(chunkGram)
            chunked = chunkParser.parse(tagged)
            print(chunked)
            for subtree in chunked.subtrees(filter=lambda t: t.label() == 'Chunk'):
                print(subtree)
            chunked.draw()
    except Exception as e:
        print(str(e))
import nltk
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer

train_text = state_union.raw(
    "C:/Users/Anurag/Desktop/NLP_EXamples/Chunking/train.txt")
sample_text = state_union.raw(
    "C:/Users/Anurag/Desktop/NLP_EXamples/Chunking/test.txt")

custom_sent_tokenizer = PunktSentenceTokenizer(train_text)
tokenized = custom_sent_tokenizer.tokenize(sample_text)

def process_content():
    try:
        for i in tokenized:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            chunkGram = r"""Chunk: {<RB.?>*<VB.?>*<NNP>+<NN>?}"""
            chunkParser = nltk.RegexpParser(chunkGram)
            chunked = chunkParser.parse(tagged)
            print(chunked)
            for subtree in chunked.subtrees(filter=lambda t: t.label() == 'Chunk'):
                print(subtree)
            chunked.draw()
    except Exception as e:
        print(str(e))
from nltk.corpus import state_union
# from nltk.corpus import PunktSentenceTokenizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer  # this gives the stem of the word to help "normalize" text
from nltk.stem import WordNetLemmatizer  # this is like stemming, but gives a complete word or synonym
from nltk.corpus import wordnet, movie_reviews  # movie_reviews are 1000 positive and 1000 negative movie reviews
import random  # this is to randomize the movie reviews, as the first 1000 are positive and the other 1000 negative
import pickle

my_text = """The World Wide Web, or simply Web, is a way of accessing information over the medium of the Internet. It is an information-sharing model that is built on top of the Internet. The Web uses the HTTP protocol, only one of the languages spoken over the Internet, to transmit data. Web services, which use HTTP to allow applications to communicate in order to exchange business logic, use the Web to share information. The Web also utilizes browsers, such as Internet Explorer or Firefox, to access Web documents called Web pages that are linked to each other via hyperlinks. Web documents also contain graphics, sounds, text and video. The Web is just one of the ways that information can be disseminated over the Internet. The Internet, not the Web, is also used for e-mail, which relies on SMTP, Usenet news groups, instant messaging and FTP. So the Web is just a portion of the Internet, albeit a large portion, but the two terms are not synonymous and should not be confused."""

address = state_union.raw('2006-GWBush.txt')

def stem_text(text):
    """Reduces the text to its stems and removes the stop words."""
    tokenized_text = word_tokenize(text)
    # this is a list comp that filters the stopwords from the tokenized text
    stopped_text = [word for word in tokenized_text if word not in stopwords.words('english')]  # note 'english' in stopwords
    stemmed_list = []
    # this gives the stem of the word to help "normalize" text
    ps = PorterStemmer()
    for word in stopped_text:
        x = ps.stem(word)
        stemmed_list.append(x)
    print('text has been stemmed')
    return stemmed_list
import nltk
from nltk.corpus import state_union as su
from nltk.tokenize import PunktSentenceTokenizer as pst

train_text = su.raw("2005-GWBush.txt")
sample_text = su.raw("2006-GWBush.txt")

custom_sent_tokenizer = pst(train_text)
tokenized = custom_sent_tokenizer.tokenize(sample_text)

def process_content():
    try:
        for i in tokenized:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            namedEnt = nltk.ne_chunk(tagged, binary=True)
            namedEnt.draw()
    except Exception as e:
        print(str(e))

process_content()
from nltk.corpus import state_union
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

total_word_freq = {}
word_freq_per_speech = {}
word_num_per_speech = {}
total_word_num = 0

en_stopwords = stopwords.words('english')

for fileid in state_union.fileids():
    word_freq_per_speech[fileid] = {}
    word_num = 0
    sample = state_union.raw(fileid)
    words = word_tokenize(sample)
    for word in words:
        lower_word = word.lower()
        if lower_word not in en_stopwords and lower_word.isalpha():
            word_num += 1
            if lower_word not in total_word_freq.keys():
                total_word_freq[lower_word] = 1
            else:
                total_word_freq[lower_word] += 1
            if lower_word not in word_freq_per_speech[fileid].keys():
                word_freq_per_speech[fileid][lower_word] = 1
            else:
                word_freq_per_speech[fileid][lower_word] += 1
    # print(fileid, word_num)
    word_num_per_speech[fileid] = word_num
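# Sketch (not from the original snippet): the same corpus-wide totals can be built
# with collections.Counter; shown only as an illustrative alternative to the manual
# dictionary bookkeeping above, reusing en_stopwords and the imports from that snippet.
from collections import Counter

total_word_freq_alt = Counter(
    word.lower()
    for fileid in state_union.fileids()
    for word in word_tokenize(state_union.raw(fileid))
    if word.lower().isalpha() and word.lower() not in en_stopwords
)
print(total_word_freq_alt.most_common(10))  # ten most frequent non-stopword tokens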
words = ['dogs', 'cars', 'feet', 'people']
for word in words:
    print(wnl.lemmatize(word))

print(wnl.lemmatize('fantasized', 'v'))

"""## Corpus

- Large collection of text
- Spoken material on which linguistic analysis is based
"""

import nltk
nltk.download('state_union')

from nltk.corpus import state_union

dataset = state_union.raw('2001-GWBush-1.txt')

"""## Wordnet

- Lexical database of the English language
- It groups English words into synonyms and antonyms
- Also provides short usage examples for words
"""

from nltk.corpus import wordnet

syns = wordnet.synsets('program')
print(syns)
print(syns[0].lemmas())
print(syns[0].lemmas()[0].name())  # getting the name of it
print(syns[0].lemmas()[1].name())
import nltk
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer

train_text = state_union.raw("margot.txt")
sample_text = state_union.raw("gal_gadot.txt")

custom_sent_tokenizer = PunktSentenceTokenizer(train_text)
tokenized = custom_sent_tokenizer.tokenize(sample_text)

try:
    for w in tokenized:
        words = nltk.word_tokenize(w)
        tagged = nltk.pos_tag(words)
        print(tagged)
        nameEnt = nltk.ne_chunk(tagged)
except Exception as e:
    print(str(e))
import nltk
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer

training_text = state_union.raw("2005-GWBush.txt")
input_text = state_union.raw("2006-GWBush.txt")

cust_tokenizer = PunktSentenceTokenizer(training_text)
tokenized = cust_tokenizer.tokenize(input_text)

def do_chunking():
    try:
        for i in tokenized:
            words = nltk.word_tokenize(i)
            tags = nltk.pos_tag(words)
            chunkPattern = r"""Chunk: {<RB.?>*<VB.?>*<NNP>+<NN>?}"""
            chunkParser = nltk.RegexpParser(chunkPattern)
            chunkedData = chunkParser.parse(tags)
            print(chunkedData)
            # chunkedData.draw()
    except Exception as e:
        print(str(e))

do_chunking()
import nltk
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer

sample_text = state_union.raw('2006-GWBush.txt')

custom_sen_tok = PunktSentenceTokenizer(sample_text)
tokenizer = custom_sen_tok.tokenize(sample_text)

def process_content():
    try:
        for i in tokenizer[5:]:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            nameEnt = nltk.ne_chunk(tagged)
            nameEnt.draw()
    except Exception as e:
        print(str(e))

process_content()
print('Argument List:', str(sys.argv))

phrase = sys.argv[1]
corpora = sys.argv[2]
corpus = []

# Check corpus
if corpora == "gutenberg":
    titles = gutenberg.fileids()
    for title in titles:
        corpus.append(gutenberg.raw(title))
elif corpora == "state_union":
    titles = state_union.fileids()
    for title in titles:
        corpus.append(state_union.raw(title))
else:
    print("Choose from gutenberg or state_union")
    exit(0)

vectorizer = TfidfVectorizer(min_df=1, stop_words="english")
X = vectorizer.fit_transform(corpus)
XA = X.toarray()
# print(vectorizer.vocabulary_)
print('The dimensions of the TF.IDF matrix: ')
print(XA.shape)
print('TF.IDF computation for the ' + corpora + ' corpus is completed\n')

dict = vectorizer.vocabulary_
# Copyright warning: the owner of the code is Gulcheera Academy (Khosiyat Sabirova).
# This code can be used by anyone for free, but the name "Gulcheera Academy" must be acknowledged.

# Named Entity Recognition with NLTK

# nltk packages are imported
import nltk
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer

# create variables to store raw data, in text format, provided by the corpus of the nltk package
example_4Tagging1 = state_union.raw("2005-GWBush.txt")
example_4Tagging2 = state_union.raw("2006-GWBush.txt")

def namedChunk(sample_text, train_text):
    tokenized_trained = PunktSentenceTokenizer(train_text)
    tokenized = tokenized_trained.tokenize(sample_text)
    try:
        for lexUnit in tokenized[5:]:
            words = nltk.word_tokenize(lexUnit)
            taggedUnit = nltk.pos_tag(words)
            namedChunk = nltk.ne_chunk(taggedUnit, binary=True)
            # namedChunk.draw()
    except Exception as skip:
        print(str(skip))

# print the result
namedChunk(example_4Tagging1, example_4Tagging2)
TO    to                                go 'to' the store
UH    interjection                      errrrrrrrm
VB    verb, base form                   take
VBD   verb, past tense                  took
VBG   verb, gerund/present participle   taking
VBN   verb, past participle             taken
VBP   verb, sing. present, non-3d       take
VBZ   verb, 3rd person sing. present    takes
WDT   wh-determiner                     which
WP    wh-pronoun                        who, what
WP$   possessive wh-pronoun             whose
WRB   wh-adverb                         where, when
'''

psng1 = state_union.raw("STSint.testinput.headlines.sent1.txt")
psng2 = state_union.raw("STSint.testinput.headlines.sent2.txt")

custom = PunktSentenceTokenizer(psng1)
token = custom.tokenize(psng2)

def proses():
    try:
        for i in token:
            words = nltk.word_tokenize(i)
            tag = nltk.pos_tag(words)
            # print(tag)
            chunkGram = r"""chunk: {<RB.?>*<VB.?>*<NNP><NN>?}"""
            chunkParser = nltk.RegexpParser(chunkGram)
# filtered_text = []
# stop_words_list = []
# for j in words:
#     if j not in stop_words:
#         filtered_text.append(j)
#     elif j in stop_words:
#         stop_words_list.append(j)
# print("stopwords:", stop_words_list)
#
# ps = PorterStemmer()
#
# for k in filtered_text:
#     print(k, ps.stem(k))

train_text = state_union.raw("2005-GWBush.txt")
sample_text = (
    "Barack Obama went to China yesterday. He lives in Grand Hyatt Beijing. This is a superb hotel."
)  # my_text.raw("txt.txt")

custom_sent_tokenizer = PunktSentenceTokenizer(train_text)
tokenized = custom_sent_tokenizer.tokenize(sample_text)

def process_content():
    try:
        for i in tokenized:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            # RECOGNITION
def getPresFromSpeech(speech_id):
    # e.g. 2001-GWBush-1.txt
    words = speech_id.split('.')
    if len(words) > 0:
        single_words = words[0].split('-')
        if len(single_words) > 0:
            for word in single_words:
                if word.isalpha():
                    return word
    return ""

all_words = {}
for speech_id in state_union.fileids():
    text = state_union.raw(speech_id)
    words = word_tokenize(text)
    for word in words:
        if word not in all_words.keys():
            all_words[word] = 1
        else:
            all_words[word] += 1

sent_len = []
word_len = []
pres_list = []
pres_sent_total = {}
pres_word_total = {}
pres_char_total = {}
pres_uniq_word = {}
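# Quick illustrative check of getPresFromSpeech (not part of the original snippet):
# the first purely alphabetic piece of the fileid is returned as the president's name.
print(getPresFromSpeech("2001-GWBush-1.txt"))  # -> "GWBush"
print(getPresFromSpeech("1968-Johnson.txt"))   # -> "Johnson"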
import nltk
import os
import string
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords, state_union
from nltk.stem import PorterStemmer
from nltk.probability import FreqDist

# get input text
dirpath = os.getcwd() + "/Job Summary.txt"
data = state_union.raw(dirpath)

# initialize utilities
lemma = nltk.wordnet.WordNetLemmatizer()
ps = PorterStemmer()
stop_words = set(stopwords.words("english"))
mystop_words = [
    "\'ll", "position", "work", "job", "role", "year", "valley", "skill",
    "day", "summary", "must", "salary", 'ready', 'great',
#!/usr/bin/env python
"""
Chunking
"""
__author__ = "Manan Kalra"
__email__ = "*****@*****.**"

import nltk
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer

sample_text = state_union.raw("sample.txt")

custom_sent_tokenizer = PunktSentenceTokenizer()
tokenized = custom_sent_tokenizer.tokenize(sample_text)

def tag():
    try:
        for i in tokenized:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            # print(tagged)
            chunk = r"""Chunk: {<RB.?>*<VB.?>*<NNP>+<NN>?}"""
            chunk_parser = nltk.RegexpParser(chunk)
            chunked = chunk_parser.parse(tagged)
            # print(chunked)
            chunked.draw()
    except Exception as e:
        print(str(e))
def process_content(filePath, sub):
    train_text = state_union.raw(filePath)
    sample_text = state_union.raw(filePath)
    custom_sent_tokenizer = PunktSentenceTokenizer(train_text)
    tokenized = custom_sent_tokenizer.tokenize(sample_text)
    try:
        bigList = []
        c = 1
        with open("mytext.txt", 'w') as ft1:
            for i in tokenized:
                words = nltk.word_tokenize(i)
                tagged = nltk.pos_tag(words)
                print(tagged)
                for j in tagged:
                    bigList.append(j)
                ques = "Q" + str(c) + ")"
                ft1.write(ques)
                verbList = []
                for x in tagged:
                    if ((x[1] == 'VBZ') or (x[1] == 'VB') or (x[1] == 'VBP')
                            or (x[1] == 'VBG') or (x[1] == 'VBN')) or (x[1] == 'VBD'):
                        verbList.append(x[0])
                print(verbList)
                # ft1.write("Verbs are:")
                s = str(verbList).strip('[]')
                # ft1.write(s)
                finalListV = []
                for w in verbList:
                    constrain = w
                    curs.execute("select * from action_verbs where verbs=%s", (constrain, ))
                    data = curs.fetchall()
                    # print("data")
                    # print(data)
                    for j in data:
                        finalListV.append(str(j[0]))
                        level = j[1]
                print("finalListV1")
                print(finalListV)
                finalListV = list(map(int, finalListV))
                print(finalListV)
                nounList = []
                for x in tagged:
                    if ((x[1] == 'NNP') or (x[1] == 'NN') or (x[1] == 'NNS')):
                        nounList.append(x[0])
                # print("Nouns are:")
                print(nounList)
                s1 = str(nounList).strip('[]')
                finalListN = []
                for n in nounList:
                    keyword = n
                    # write if dept is cse, taking from user input
                    curs.execute(
                        "select * from jkeywordsc ,levels where jkeywordsc.co=levels.cno and jkey=%s and dept=%s",
                        (keyword, sub, ))
                    data = curs.fetchall()
                    for j in data:
                        finalListN.append(str(j[4]))
                # print("finalListN")
                print(finalListN)
                stats = dict(Counter(finalListN))
                # maxval = max(dict.iteritems(), key=operator.itemgetter(1))[1]
                # print(keys = [k for k, v in Counter.items() if v == maxval])
                match = max(stats.items(), key=operator.itemgetter(1))[0]
                # for k, v in newList.items():
                #     print(k, v)
                match = int(match)
                if match in finalListV:
                    ft1.write("Accepted at level:" + str(match) + " (" + level + ") of Bloom's Taxonomy")
                else:
                    ft1.write("Rejected, levels not satisfied")
                ft1.write("\n")
                c = c + 1
        # print(finalListN)
        return ("mytext.txt")
    except Exception as e:
        print(str(e))
from nltk.corpus import state_union as su
from nltk.tokenize import PunktSentenceTokenizer, word_tokenize
from nltk import pos_tag, RegexpParser

train_text = su.raw("2005-GWBush.txt")   # training our tokenizer using the 2005 speech of GW Bush
sample_text = su.raw("2006-GWBush.txt")  # testing our tokenizer model on the 2006 speech of GW Bush

custom_sent_tokenizer = PunktSentenceTokenizer(train_text)
tokenized = custom_sent_tokenizer.tokenize(sample_text)

def process_content():
    try:
        for i in tokenized:
            words = word_tokenize(i)
            tagged = pos_tag(words)
            # chunking and chinking of our data
            chunkGram = r"""Chunk: {<.*>+}
                                   }<VB.?|IN|DT>+{"""
            chunkParser = RegexpParser(chunkGram)
            chunked = chunkParser.parse(tagged)
            print(chunked)
            chunked.draw()
    except Exception as e:
        print(str(e))

process_content()
# -*- coding: utf-8 -*-
"""
Created on Tue Jul 16 11:30:44 2019

@author: Cosimo

chunking
"""
import nltk
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer

train = state_union.raw("2005-GWBush.txt")
sample = state_union.raw("2006-GWBush.txt")

custom_sent_tok = PunktSentenceTokenizer(train)
tokenized = custom_sent_tok.tokenize(sample)

def process_content():
    try:
        for i in tokenized:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            chunkGram = r"""Chunk: {<RB.?>*<NNP><NN>?} """
            chunkParser = nltk.RegexpParser(chunkGram)
            chunked = chunkParser.parse(tagged)
            print(chunked)
def buildhtml(tokenized_sentence, sentence_count):
    html = ""
    starting_div = ("<div class=\"panel panel-primary\"> <div class=\"panel-heading\"> Sentence "
                    + str(sentence_count) + "</div><div class=\"panel-body\">")
    ending_div = "</div></div>"
    html += starting_div
    try:
        for token in tokenized_sentence:
            words = nltk.word_tokenize(token)
            tagged = nltk.pos_tag(words)
            for word in tagged:
                if word[1] in tagdict:
                    html += ("<a href=\"#\" data-toggle=\"tooltip\" title=\""
                             + tagdict[word[1]][0] + "\">" + word[0] + "</a>")
        html += ending_div
        return html
    except Exception as e:
        print(str(e))

text = state_union.raw("/Users/ponrajuganesh/Desktop/data.txt")
sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
tagdict = nltk.data.load("help/tagsets/" + "upenn_tagset" + ".pickle")

count = 0
fulldiv = ""
for sentence in sent_detector.tokenize(text):
    count += 1
    custom_sent_tokenizer = PunktSentenceTokenizer()
    fulldiv += buildhtml(custom_sent_tokenizer.tokenize(sentence), count)

print(fulldiv)
import nltk
from nltk.tokenize import PunktSentenceTokenizer
from nltk.corpus import state_union

train_text = state_union.raw('2005-GWBush.txt')
sample_text = state_union.raw('2006-GWBush.txt')

custom_sent_tokeniser = PunktSentenceTokenizer(train_text)
tokenized = custom_sent_tokeniser.tokenize(sample_text)

def process_content():
    try:
        for i in tokenized:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            namedEntity = nltk.ne_chunk(tagged, binary=False)
            namedEntity.draw()
    except Exception as e:
        print(str(e))

process_content()
G_F = gutenberg.fileids()
dir(gutenberg)  # it has raw, words and sents as methods

for field in G_F:
    num_chars = len(gutenberg.raw(field))
    num_words = len(gutenberg.words(field))
    num_sents = len(gutenberg.sents(field))
    num_vocab = len(set(w.lower() for w in gutenberg.words(field)))
    print('# chars', num_chars, '# words', num_words, '# sentences', num_sents,
          '# vocab', num_vocab, '-- name of field', field)

# ----------------------------------------------------------------------------------------------------------------------
import nltk
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer

Text1 = state_union.raw("2005-GWBush.txt")
Text2 = state_union.raw("2006-GWBush.txt")

ST = PunktSentenceTokenizer(Text1)
Tok = ST.tokenize(Text1)

for i in Tok:
    words = nltk.word_tokenize(i)
    tag = nltk.pos_tag(words)
    print(tag)
from os import path
import nltk
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer
import sys
from termcolor import *
import termcolor
import textblob
from textblob import TextBlob
from textblob.translate import Translator

# Training for identifying verbs, nouns etc.
train_text = state_union.raw("2005-GWBush.txt")
custom_sent_tokenizer = PunktSentenceTokenizer(train_text)

# Color codes corresponding to tags for verbs, nouns etc.
TagCodes = {'CC': 6, 'CD': 1, 'DT': 6, 'EX': 6, 'FW': 6, 'IN': 6, 'JJ': 0, 'JJR': 0, 'JJS': 0,
            'LS': 2, 'MD': 2, 'NN': 1, 'NNS': 1, 'NNP': 2, 'NNPS': 2, 'PDT': 6, 'POS': 6,
            'PRP': 5, 'PRP$': 5, 'RB': 4, 'RBR': 4, 'RBS': 4, 'RP': 4, 'TO': 7, 'UH': 2,
            'VB': 3, 'VBD': 3, 'VBG': 3, 'VBN': 3, 'VBP': 3, 'VBZ': 3,
            'WDT': 6, 'WP': 5, 'WP$': 5, 'WRB': 5}
ColorCodes = {0: 'grey', 1: 'red', 2: 'green', 3: 'yellow', 4: 'blue', 5: 'magenta', 6: 'cyan', 7: 'white'}

# Each language is assigned a short code for translation
LanguageCodes = {'afrikaans': 'af', 'albanian': 'sq', 'arabic': 'ar', 'armenian': 'hy', 'azerbaijani': 'az',
                 'basque': 'eu', 'belarusian': 'be', 'bengali': 'bn', 'bosnian': 'bs', 'bulgarian': 'bg',
                 'catalan': 'ca', 'cebuano': 'ceb', 'chichewa': 'ny', 'chinese-simplified': 'zh-CN',
                 'chinese-traditional': 'zh-TW', 'croatian': 'hr', 'czech': 'cs', 'danish': 'da', 'dutch': 'nl',
                 'english': 'en', 'esperanto': 'eo', 'estonian': 'et', 'filipino': 'tl', 'finnish': 'fi',
                 'french': 'fr', 'galician': 'gl', 'georgian': 'ka', 'german': 'de', 'greek': 'el',
                 'gujarati': 'gu', 'haitian-creole': 'ht', 'hausa': 'ha', 'hebrew': 'iw', 'hindi': 'hi',
                 'hmong': 'hmn', 'hungarian': 'hu', 'icelandic': 'is', 'igbo': 'ig', 'indonesian': 'id',
                 'irish': 'ga', 'italian': 'it', 'japanese': 'ja', 'javanese': 'jw', 'kannada': 'kn',
                 'kazakh': 'kk', 'khmer': 'km', 'korean': 'ko', 'lao': 'lo', 'latin': 'la', 'latvian': 'lv',
                 'lithuanian': 'lt', 'macedonian': 'mk', 'malagasy': 'mg', 'malay': 'ms', 'malayalam': 'ml',
                 'maltese': 'mt', 'maori': 'mi', 'marathi': 'mr', 'mongolian': 'mn', 'burmese': 'my',
                 'nepali': 'ne', 'norwegian': 'no', 'persian': 'fa', 'polish': 'pl', 'portuguese': 'pt',
                 'punjabi': 'ma', 'romanian': 'ro', 'russian': 'ru', 'serbian': 'sr', 'sesotho': 'st',
                 'sinhala': 'si', 'slovak': 'sk', 'slovenian': 'sl', 'somali': 'so', 'spanish': 'es',
                 'sudanese': 'su', 'swahili': 'sw', 'swedish': 'sv', 'tajik': 'tg', 'tamil': 'ta',
                 'telugu': 'te', 'thai': 'th', 'turkish': 'tr', 'ukrainian': 'uk', 'urdu': 'ur',
                 'uzbek': 'uz', 'vietnamese': 'vi', 'welsh': 'cy', 'yiddish': 'yi', 'yoruba': 'yo', 'zulu': 'zu'}

# Tags corresponding to verbs, nouns etc.
'''
POS tag list:
import nltk
from nltk.tokenize import PunktSentenceTokenizer
from nltk.corpus import state_union
from nltk.stem import WordNetLemmatizer

content = state_union.raw('2006-GWBush.txt')

tokenizer = PunktSentenceTokenizer()
tokenised = tokenizer.tokenize(content)

def process_content():
    try:
        for i in tokenised:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            print(tagged)

            # chunking and chinking
            chunk_gram = r'''Chunk: {<.*>+}
                                    }<VB.?|IN|DT|TO>+{'''
            # }<put chinking content inside this>{ and {<put chunking content inside this>}
            chunk_parser = nltk.RegexpParser(chunk_gram)
            chunked = chunk_parser.parse(tagged)
            print(chunked)
            # chunked.draw()

            # find named entities
            named_entity = nltk.ne_chunk(tagged, binary=True)
            # named_entity.draw()
import nltk
# To run this example, we need to make use of 'maxent_ne_chunker' and 'words', so download them first:
# nltk.download('maxent_ne_chunker') and nltk.download('words')
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer

train_text = state_union.raw("1973-Nixon.txt")
sample_text = state_union.raw("1974-Nixon.txt")

custom_sent_tokenizer = PunktSentenceTokenizer(train_text)
tokenized = custom_sent_tokenizer.tokenize(sample_text)

def process_content():
    try:
        for i in tokenized:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            namedEntity = nltk.ne_chunk(tagged, binary=True)
            namedEntity.draw()
    except Exception as e:
        print(str(e))

process_content()
import nltk
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer

train_text = state_union.raw("a.txt")
sample_text = state_union.raw("b.txt")

custom_sent_tokeniser = PunktSentenceTokenizer(train_text)
tokenized = custom_sent_tokeniser.tokenize(sample_text)

def content_process():
    try:
        for i in tokenized:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            # The binary option just classifies everything as a named entity and doesn't
            # break it down into organisation, money and so on.
            namedEnt = nltk.ne_chunk(tagged, binary=True)
            namedEnt.draw()
            # Named entity types: Organization, Person, Location, Date, Time, Money, Percent, Facility, GPE
    except Exception as e:
        print(str(e))

content_process()
# -*- coding: utf-8 -*-
"""
Created on Thu Nov 19 09:15:11 2015

@author: nilakant
"""
import nltk
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer  # unsupervised tokenizer

train_text = state_union.raw("2006-GWBush.txt")
sample_text = state_union.raw("2006-GWBush.txt")

custom_sent_tokenizer = PunktSentenceTokenizer(train_text)
tokenized = custom_sent_tokenizer.tokenize(sample_text)

def process_content():
    try:
        for i in tokenized:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            chunkGram = r"""Chunk: {<.*>+}
                                   }<VB.?|IN|DT|TO>+{"""
            chunkParser = nltk.RegexpParser(chunkGram)
            chunked = chunkParser.parse(tagged)
            chunked.draw()
    return entity_names

def extract_entities(taggedText):
    '''
    Create map with entities and their counts
    :param taggedText: parsed text (output of ne chunker) in tree form
    :return: dict of entities and their freq counts
    '''
    entity_names = []
    for tree in taggedText:
        entity_names.extend(extract_entity_names(tree))
    return entity_names

# get year and words for each file
extracted = [(state_union.raw(fileid), int(fileid[:4])) for fileid in state_union.fileids()]
docs, years = zip(*extracted)

# break text down into sentences, tokens
tokens = [nltk.word_tokenize(text) for text in docs]
sents = [nltk.sent_tokenize(text.replace("\n", " ")) for text in docs]
senttokens = [[nltk.word_tokenize(sent) for sent in entry] for entry in sents]

# get counts of unique words and plot over time
unique = [len(set(words)) for words in tokens]
plt.scatter(years, unique)
plt.show()

# get unique/total ratio
ratios = [(float(len(set(words))) / float(len(words))) for words in tokens]
plt.scatter(years, ratios)
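# The fragment above opens with the tail of extract_entity_names, whose body is not shown.
# A common shape for such a helper over nltk.ne_chunk(..., binary=True) output (subtrees
# labelled "NE") is sketched below; this is an assumption, not the original definition.
def extract_entity_names_sketch(tree):
    entity_names = []
    if hasattr(tree, 'label') and tree.label() == 'NE':
        # join the (word, tag) leaves of a named-entity subtree into one string
        entity_names.append(' '.join(child[0] for child in tree))
    else:
        # recurse into any child subtrees
        for child in tree:
            if hasattr(child, 'label'):
                entity_names.extend(extract_entity_names_sketch(child))
    return entity_names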