Example #1
import nltk
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer

def POS_tagging(corpus):
    train_text = state_union.raw("2005-GWBush.txt")
    sample_text = corpus
    #print(train_text)
    custom_sentence_tokenizer = PunktSentenceTokenizer(train_text)

    # textfile = open("POS_tagged",'w')
    # textfile.write(train_text)
    # textfile.write("\n\n\n\n\n\n\n\n\n\n")
    # print(custom_sentence_tokenizer)

    tokenized = custom_sentence_tokenizer.tokenize(sample_text)
    tuples_list = []
    def process_content():
        try:
            for i in tokenized:
                words = nltk.word_tokenize(i)
                tagged = nltk.pos_tag(words)
                for w in tagged:
                    tuples_list.append(w)
        except Exception as e:
            print(str(e))
    process_content()
    return tuples_list
def main():
    training_text = state_union.raw('2005-GWBush.txt')
    sample_text = state_union.raw('2006-GWBush.txt')
    custom_sent_tokenizer = PunktSentenceTokenizer(training_text)
    tokenized = custom_sent_tokenizer.tokenize(sample_text)

    choice = 0
    while choice < 5:
        choice = int(input(
            "1 for named_chunks (prints information about proper nouns),\n"
            "2 for process_chunks (reports whether a noun phrase followed by an adverb occurs),\n"
            "3 for process_content (prints the tagged sentences),\n"
            "4 for... (enter 5 or higher to exit): "))
        if choice == 1:
            named_chunks(text_trained_tokenized(sample_text, training_text))
        elif choice == 2:
            process_chunks(text_trained_tokenized(sample_text, training_text))
        elif choice == 3:
            process_content(text_trained_tokenized(sample_text, training_text))
        elif choice == 4:
            print("Try again.")
Example #3
import nltk
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer

def name_ent_recog(post):
    train_text = state_union.raw("2005-GWBush.txt")
    sample_text = post
    custom_sent_tokenizer = PunktSentenceTokenizer(train_text)
    tokenized = custom_sent_tokenizer.tokenize(sample_text)
    namedEnt = []
    try:
        for i in tokenized:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            namedEnt.append(nltk.ne_chunk(tagged))
    except Exception as e:
        print(str(e))
    return namedEnt
def main(argv):
    print("main")
    # namedEnts = named_ents("Bill went to the White House. He saw the President of the United States. Then he went to O'Hare International Airport. He flew to The Democratic Republic of Congo. He will not go back to the White House any time soon. The President of the United States is disappointed by this.")
    # print(namedEnts)
    f = open("north_korea.txt")
    text = f.read()
    # print(text)
    johnson = state_union.raw("1968-Johnson.txt")
    ent_list = text_ents(johnson)
    ent_freq = nltk.FreqDist(ent_list)
    print(ent_freq.most_common())
    print(ent_freq)
    print(list(ent_freq.values()))
    print(list(ent_freq.keys()))
def POS_tagging(corpus):
    train_text = state_union.raw("2005-GWBush.txt")
    sample_text = ""
    for i in corpus:
        sample_text = sample_text+i+" "
    tuples_list = []
    def process_content():
        try:
            words = nltk.word_tokenize(sample_text)
            tagged = nltk.pos_tag(words)
            for w in tagged:
                tuples_list.append(w)
        except Exception as e:
            print(str(e))
    process_content()
    return tuples_list
def name_ent_recog(post):
    train_text = state_union.raw("2005-GWBush.txt")
    sample_text = post
    custom_sent_tokenizer = PunktSentenceTokenizer(train_text)
    tokenized = custom_sent_tokenizer.tokenize(sample_text)
    namedEnt = []
    try:
        for i in tokenized:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            namedEnt.append(nltk.ne_chunk(tagged))
            # chunkGram = r"""Chunk: {<RB.?>*<VB.?>*<NNP.?>*<NN>?}"""
            # # chunkGram = r"""Chunk: {<.*>+}
            # #                     }<VB.?|IN|DT>+{"""
            # chunkParser = nltk.RegexpParser(chunkGram)
            # chunked = chunkParser.parse(tagged)
            # print(chunked)
            # #print(tagged)
    except Exception as e:
        print(str(e))
    return namedEnt
import nltk
from nltk.corpus import state_union
from nltk.tokenize import word_tokenize, PunktSentenceTokenizer
from nltk import pos_tag

example_text = "This is an example text to test the NLTK parts of speech tagging"

words = word_tokenize(example_text)

print(words)



for word in words:
    # word_list = [word]
    tokenized_word = word_tokenize(word) # returns a list of words from a sentence, or wraps a single word in a list
    # print(tokenized_word)
    print(pos_tag(tokenized_word)) # the pos_tag function takes a list of words as input


train_text = state_union.raw("2005-GWBush.txt") # state union is a text corpus. we just use the raw format of a text file from that corpus as training data
sample_text = state_union.raw("2006-GWBush.txt") # use another text file as a sample data

custom_sentence_tokenizer = PunktSentenceTokenizer(train_text) # initialize PunktSentenceTokenizer with training data

tokenized = custom_sentence_tokenizer.tokenize(sample_text) # tokenize sample data. It is a sentence tokenizer

def process_content():
    try:
        for i in tokenized:
            words = word_tokenize(i) # tokenize words from sentences
            tagged = pos_tag(words) # getting the tags from words
            print(tagged)
    except Exception as e:
        print(str(e))
Example #8
import nltk
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer
#PunktSentenceTokenizer is an unsupervised ml tokenizer
#It comes trained and it can be retrained too

train = state_union.raw('2005-GWBush.txt')
sample = state_union.raw('2006-GWBush.txt')

#training and testing data

custom_sent_tokenizer = PunktSentenceTokenizer(train)
tokenized = custom_sent_tokenizer.tokenize(sample)

def process_content():
    try:
        for i in tokenized:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            print(tagged)
    except Exception as e:
        print(str(e))

process_content()

#pos_tag creates tuples of each word and its tag

'''
POS tag list (excerpt)
CC  coordinating conjunction
CD  cardinal digit
'''
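For reference, a quick standalone illustration of the (word, tag) tuples that nltk.pos_tag returns; the sentence and the tags shown in the comment are only illustrative, and the exact tags depend on the tagger model:

import nltk

print(nltk.pos_tag(nltk.word_tokenize("The quick brown fox jumps over the lazy dog")))
# something like: [('The', 'DT'), ('quick', 'JJ'), ('brown', 'NN'), ('fox', 'NN'), ('jumps', 'VBZ'), ...]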
import nltk
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer

trainText = state_union.raw("project.txt")
sampleText = state_union.raw("project.txt")

custom_sent_tokenizer = PunktSentenceTokenizer(trainText)

tokenized = custom_sent_tokenizer.tokenize(sampleText)


def process_content():
    try:
        for i in tokenized:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            print(tagged)

    except Exception as e:
        print(str(e))


process_content()
Example #10
#!/usr/bin/env python

import nltk
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer #unsupervised tokenizer


train_text = state_union.raw('2005-GWBush.txt')

#print train_text

test_text = state_union.raw('2006-GWBush.txt')

custom_sent_token = PunktSentenceTokenizer(train_text)

tokenized = custom_sent_token.tokenize(test_text)

#print tokenized
#print type(tokenized)

def chunk():
	try:
		for i in tokenized:
			words = nltk.word_tokenize(i)
			tagged = nltk.pos_tag(words)

			regexp = r"""Chunk: {<RB.?>*<VB.?>*<NNP>+<NN>?} 
								}<VB.?|IN|DT|TO>+{"""

			parser = nltk.RegexpParser(regexp)
			chunked = parser.parse(tagged)
			print(chunked)

	except Exception as e:
		print(str(e))

"""
POS tag list (excerpt)
TO	to	go 'to' the store
UH	interjection	errrrrrrrm
VB	verb, base form	take
VBD	verb, past tense	took
VBG	verb, gerund/present participle	taking
VBN	verb, past participle	taken
VBP	verb, sing. present, non-3rd	take
VBZ	verb, 3rd person sing. present	takes
WDT	wh-determiner	which
WP	wh-pronoun	who, what
WP$	possessive wh-pronoun	whose
WRB	wh-adverb	where, when
"""

import nltk
from nltk.corpus import state_union
from nltk.tokenize import word_tokenize, PunktSentenceTokenizer
from nltk import pos_tag

# retrieving the corpus
train_text = state_union.raw('2005-GWBush.txt')
text = state_union.raw('2006-GWBush.txt')

# training the sentence tokenizer (unsupervised)
tokenizer = PunktSentenceTokenizer(train_text)
sentence = tokenizer.tokenize(text)

# tag the tokens by word-tokenizing each sentence, then use a regular expression to chunk them
try:
    for s in sentence:
        token = word_tokenize(s)
        pos = pos_tag(token)
        print(pos)
        chunkreg = r"""Chunk: {<RB.?>*<VB.?>*<NNP>+<NN>?}}<VB.?|IN|DT|TO>+{"""
        chunkParser = nltk.RegexpParser(chunkreg)
        chunked = chunkParser.parse(pos)
        print(chunked)
except Exception as e:
    print(str(e))
Example #12
# -*- coding: utf-8 -*-
"""
Created on Sun Jan  7 17:46:46 2018

@author: noelg
"""

import nltk

#print(nltk.__file__)

from nltk.tokenize import sent_tokenize, PunktSentenceTokenizer
from nltk.corpus import state_union

sample = state_union.raw("2005-GWBush.txt")

tok = sent_tokenize(sample)

for x in range(5):
    print(tok[x])
Example #13
'''
This program is for chunking with nltk
'''
import nltk
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer

train_text = state_union.raw("")
sample_text = sate_union.raw("")

custom_sent_tokenize = PunktSentenceTokenizer(train_text)

tokenized = custom_sent_tokenize.tokenize(sample_text)


def process_content():
    for i in tokenized:
        words = nltk.word_tokenize(i)
        tagged = nltk.pos_tag(words)

        chunkgram = r"""chunk: {<RB.?>*<VB.?>*<NNP>+<NN>?}"""
        chunkParser = nltk.RegexpParser(chunkgram)
        chunked = chunkParser.parse(tagged)

        chunked.draw()
Example #14
import nltk
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer

train_text = state_union.raw('2005-GWBush.txt')
sample_text = state_union.raw('2005-GWBush.txt')

custom_sent_tokenizer = PunktSentenceTokenizer(train_text)
tokenized = custom_sent_tokenizer.tokenize(sample_text)

def process_content():
    try:
        for i in tokenized:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            # look for zero or more adverbs, zero or more verbs, one or more
            # proper nouns, and optionally a noun - this defines the chunk grammar
            chunkGram = r'''Chunk: {<RB.?>*<VB.?>*<NNP>+<NN>?}'''
            
            # create a parser based on the chunkgram
            chunkParser = nltk.RegexpParser(chunkGram)

            # pass the tagged words to the parser to get the chunks
            chunked = chunkParser.parse(tagged)
            print(chunked)
            chunked.draw()
    except Exception as e:
        print(str(e))


process_content()
Example #15
import nltk
import nltk.data
from nltk.corpus import state_union

text = state_union.raw("2006-GWBush.txt")

custom_sentence_tokenizer = nltk.data.load("tokenizers/punkt/english.pickle")

tokenized = custom_sentence_tokenizer.tokenize(text)


#print(tokenized)
def process_content():
    try:
        for i in tokenized:
            word = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(word)
            chunkGram = r"""Chunk: {<RB.?>*<VB.?>*<NNP>+<NN>?}"""
            chunkParser = nltk.RegexpParser(chunkGram)
            chunked = chunkParser.parse(tagged)
            #chunked.draw()

            # for getting both chunks and non-chunks
            for subtree in chunked.subtrees():
                print(subtree)

            # for filtering only chunks
            for subtree in chunked.subtrees(
                    filter=lambda t: t.label() == "Chunk"):
                print(subtree)

    except Exception as e:
        print(str(e))
import nltk
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer

train_text = state_union.raw("2017Balkon.txt")
sample_text = state_union.raw("2018Balkon.txt")
custom_sent_tokenizer = PunktSentenceTokenizer(train_text)
tokenized = custom_sent_tokenizer.tokenize(sample_text)
def process_content():
    try:
        for i in tokenized[:5]:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            print(tagged)

    except Exception as e:
        print(str(e))


process_content()
Example #17
 def __init__(self, train_text, sample_text):
     self.train_text = state_union.raw(train_text)
     self.sample_text = state_union.raw(sample_text)
     self.custom_sent_tokenizer = PunktSentenceTokenizer(self.train_text)
     self.tokenized = self.custom_sent_tokenizer.tokenize(self.sample_text)
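This example shows only the constructor; the enclosing class and its other methods are not included. A minimal sketch of how such a class might be completed and used (the class name TaggedSpeech and the tag_sentences method are hypothetical, not part of the original snippet):

import nltk
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer

class TaggedSpeech:  # hypothetical wrapper around the constructor shown above
    def __init__(self, train_text, sample_text):
        self.train_text = state_union.raw(train_text)
        self.sample_text = state_union.raw(sample_text)
        self.custom_sent_tokenizer = PunktSentenceTokenizer(self.train_text)
        self.tokenized = self.custom_sent_tokenizer.tokenize(self.sample_text)

    def tag_sentences(self):
        # POS-tag every sentence of the sample text
        return [nltk.pos_tag(nltk.word_tokenize(sent)) for sent in self.tokenized]

# usage (assumed): speech = TaggedSpeech("2005-GWBush.txt", "2006-GWBush.txt")
#                  print(speech.tag_sentences()[0])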
Example #18
import nltk
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer

train_text = state_union.raw("2005-GWBush.txt")
sample_text = state_union.raw("2006-GWBush.txt")

custom_sent_tokenizer = PunktSentenceTokenizer(train_text)

tokenized = custom_sent_tokenizer.tokenize(sample_text)


def process_content():
    try:
        for i in tokenized:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            chunkGram = r"""Chunk: {<RB.?>*<VB.?>*<NNP>+<NN>?}"""  #regular expression
            chunkParser = nltk.RegexpParser(chunkGram)
            chunked = chunkParser.parse(tagged)

            print(chunked)
            for subtree in chunked.subtrees(
                    filter=lambda t: t.label() == 'Chunk'):
                print(subtree)

            chunked.draw()

    except Exception as e:
        print(str(e))
Example #19
import nltk
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer

train_text = state_union.raw(
    "C:/Users/Anurag/Desktop/NLP_EXamples/Chunking/train.txt")
sample_text = state_union.raw(
    "C:/Users/Anurag/Desktop/NLP_EXamples/Chunking/test.txt")

custom_sent_tokenizer = PunktSentenceTokenizer(train_text)

tokenized = custom_sent_tokenizer.tokenize(sample_text)


def process_content():
    try:
        for i in tokenized:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            chunkGram = r"""Chunk: {<RB.?>*<VB.?>*<NNP>+<NN>?}"""
            chunkParser = nltk.RegexpParser(chunkGram)
            chunked = chunkParser.parse(tagged)

            print(chunked)
            for subtree in chunked.subtrees(
                    filter=lambda t: t.label() == 'Chunk'):
                print(subtree)

            chunked.draw()

    except Exception as e:
        print(str(e))
Example #20
from nltk.corpus import state_union
from nltk.tokenize import word_tokenize
#from nltk.tokenize import PunktSentenceTokenizer
from nltk.stem import PorterStemmer     #this gives the stem of the word to help "normalize" text
from nltk.stem import WordNetLemmatizer #this is like stemming, but gives a complete word or synonym
from nltk.corpus import wordnet, movie_reviews #movie_reviews are 1000 positive and 1000 negative movie reviews
from nltk.corpus import stopwords
import random #this is to randomize the movie reviews, as the first 1000 are positive and the other 1000 negative
import pickle





my_text = """The World Wide Web, or simply Web, is a way of accessing information over the medium of the Internet. It is an information-sharing model that is built on top of the Internet. The Web uses the HTTP protocol, only one of the languages spoken over the Internet, to transmit data. Web services, which use HTTP to allow applications to communicate in order to exchange business logic, use the the Web to share information. The Web also utilizes browsers, such as Internet Explorer or Firefox, to access Web documents called Web pages that are linked to each other via hyperlinks. Web documents also contain graphics, sounds, text and video.
The Web is just one of the ways that information can be disseminated over the Internet. The Internet, not the Web, is also used for e-mail, which relies on SMTP, Usenet news groups, instant messaging and FTP. So the Web is just a portion of the Internet, albeit a large portion, but the two terms are not synonymous and should not be confused."""

address = state_union.raw('2006-GWBush.txt')


def stem_text (text):
    """reduces the text to its stems and removes the stop words"""
    tokenized_text = word_tokenize(text)
    #this is a list comp that filters the stopwords from  tokenized text
    stopped_text = [word for word in tokenized_text if word not in stopwords.words('english')] #note english in stopwords
    stemmed_list =[]
    #this gives the stem of the word to help "normalize" text
    ps = PorterStemmer()
    for word in stopped_text:
        x = ps.stem(word)
        stemmed_list.append(x)
    print('text has been stemmed')
    return stemmed_list
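A quick way to exercise stem_text on the sample paragraph defined above (a usage sketch, not part of the original snippet):

stemmed = stem_text(my_text)
print(stemmed[:20])  # first 20 stems of the Web/Internet paragraph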
Example #21
import nltk
from nltk.corpus import state_union as su
from nltk.tokenize import PunktSentenceTokenizer as pst

train_text = su.raw("2005-GWBush.txt")
sample_text = su.raw("2006-GWBush.txt")

custom_sent_tokenizer = pst(train_text)

tokenized = custom_sent_tokenizer.tokenize(sample_text)


def process_content():
    try:
        for i in tokenized:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)

            namedEnt = nltk.ne_chunk(tagged, binary=True)

            namedEnt.draw()

    except Exception as e:
        print(str(e))


process_content()
from nltk.corpus import state_union
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

total_word_freq = {}
word_freq_per_speech = {}
word_num_per_speech = {}

total_word_num = 0

en_stopwords = stopwords.words('english')

for fileid in state_union.fileids():
    word_freq_per_speech[fileid] = {}
    word_num = 0
    sample = state_union.raw(fileid)
    words = word_tokenize(sample)
    for word in words:
        lower_word = word.lower()
        if lower_word not in en_stopwords and lower_word.isalpha():
            word_num += 1
            if lower_word not in total_word_freq.keys():
                total_word_freq[lower_word] = 1
            else:
                total_word_freq[lower_word]+=1
            if lower_word not in word_freq_per_speech[fileid].keys():
                word_freq_per_speech[fileid][lower_word] = 1
            else:
                word_freq_per_speech[fileid][lower_word]+=1
    #print fileid, word_num
    word_num_per_speech[fileid] = word_num
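Once the loop above has populated total_word_freq, the most frequent non-stopword terms can be pulled out with an ordinary sort (a small follow-up sketch, not part of the original snippet):

top20 = sorted(total_word_freq.items(), key=lambda kv: kv[1], reverse=True)[:20]
print(top20)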
Example #23
from nltk.stem import WordNetLemmatizer

wnl = WordNetLemmatizer()

words = ['dogs', 'cars', 'feet', 'people']
for word in words:
    print(wnl.lemmatize(word))

print(wnl.lemmatize('fantasized', 'v'))
"""## Corpus
- Large collection of text
- spoken material on which liguistic analysis is based
"""

import nltk

nltk.download('state_union')
from nltk.corpus import state_union

dataset = state_union.raw('2001-GWBush-1.txt')
"""## Wordnet
- Lexical database in english language
- It group english words in antonyms and synonyms
- Also provides short examples and words
"""

from nltk.corpus import wordnet

syns = wordnet.synsets('program')
print(syns)

print(syns[0].lemmas())
print(syns[0].lemmas()[0].name())  # Getting the name of it
print(syns[0].lemmas()[1].name())
import nltk
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer

train_text = state_union.raw("margot.txt")
sample_text = state_union.raw("gal_gadot.txt")

custom_sent_tokenizer = PunktSentenceTokenizer(train_text)

tokenized = custom_sent_tokenizer.tokenize(sample_text)

try:
    for w in tokenized:
        words = nltk.word_tokenize(w)
        tagged = nltk.pos_tag(words)
        print(tagged)

        nameEnt = nltk.ne_chunk(tagged)
        
except Exception as e:
    print(str(e))
Example #25
import nltk
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer

training_text = state_union.raw("2005-GWBush.txt")
input_text = state_union.raw("2006-GWBush.txt")

cust_tokenizer = PunktSentenceTokenizer(training_text)

tokenized = cust_tokenizer.tokenize(input_text)


def do_chunking():
    try:
        for i in tokenized:
            words = nltk.word_tokenize(i)
            tags = nltk.pos_tag(words)
            chunkPattern = r"""Chunk: {<RB.?>*<VB.?>*<NNP>+<NN>?}"""

            chunkParser = nltk.RegexpParser(chunkPattern)
            chunkedData = chunkParser.parse(tags)

            print(chunkedData)
            # chunkedData.draw()
    except Exception as e:
        print(str(e))


do_chunking()
Example #26
import nltk
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer

sample_text = state_union.raw('2006-GWBush.txt')
custom_sen_tok = PunktSentenceTokenizer(sample_text)
tokenizer = custom_sen_tok.tokenize(sample_text)


def process_content():
    try:
        for i in tokenizer[5:]:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)

            nameEnt = nltk.ne_chunk(tagged)
            nameEnt.draw()

    except Exception as e:
        print(str(e))


process_content()
Example #27
import sys
from nltk.corpus import gutenberg, state_union
from sklearn.feature_extraction.text import TfidfVectorizer

print('Argument List:', str(sys.argv))

phrase = sys.argv[1]
corpora = sys.argv[2]
corpus = []

#Check corpus
if corpora == "gutenberg":
    titles = gutenberg.fileids()
    for title in titles:
        corpus.append(gutenberg.raw(title))

elif corpora == "state_union":
    titles = state_union.fileids()
    for title in titles:
        corpus.append(state_union.raw(title))
else:
    print "Choose from gutenberg or state_union"
    exit(0)

vectorizer = TfidfVectorizer(min_df=1, stop_words="english")
X = vectorizer.fit_transform(corpus)

XA = X.toarray()
# print(vectorizer.vocabulary_)
print('The dimensions of the TF.IDF matrix: ')
print(XA.shape)

print('TF.IDF computation for the ' + corpora + ' corpus is completed\n')

vocabulary = vectorizer.vocabulary_  # avoid shadowing the built-in dict
#Copyright Warning: Owner of the code is Gulcheera Academy (Khosiyat Sabirova)
#This code can be used by anyone for free, but the name "Gulcheera Academy" must be acknowledged
#Named Entity Recognition with NLTK

#nltk packages are imported
import nltk
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer

example_4Tagging1 = state_union.raw(
    "2005-GWBush.txt"
)  #create a variable to store a raw data which is in text format provided by the corpus of nltk package
example_4Tagging2 = state_union.raw("2006-GWBush.txt")


def namedChunk(sample_text, train_text):
    tokenized_trained = PunktSentenceTokenizer(train_text)
    tokenized = tokenized_trained.tokenize(sample_text)
    try:
        for lexUnit in tokenized[5:]:
            words = nltk.word_tokenize(lexUnit)
            taggedUnit = nltk.pos_tag(words)
            namedChunk = nltk.ne_chunk(taggedUnit, binary=True)
            #namedChunk.draw()
    except Exception as skip:
        print(str(skip))


#run the named-entity chunker on the two speeches (drawing is commented out inside the function)
namedChunk(example_4Tagging1, example_4Tagging2)
Example #29
'''
POS tag list (excerpt)
TO      to  go 'to' the store
UH      interjection  errrrrrrrm
VB      verb, base form  take
VBD     verb, past tense  took
VBG     verb, gerund/present participle  taking
VBN     verb, past participle  taken
VBP     verb, sing. present, non-3rd  take
VBZ     verb, 3rd person sing. present  takes
WDT     wh-determiner  which
WP      wh-pronoun  who, what
WP$     possessive wh-pronoun  whose
WRB     wh-adverb  where, when
'''

import nltk
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer

psng1 = state_union.raw("STSint.testinput.headlines.sent1.txt")
psng2 = state_union.raw("STSint.testinput.headlines.sent2.txt")

custom = PunktSentenceTokenizer(psng1)
token = custom.tokenize(psng2)


def proses():
    try:
        for i in token:
            words = nltk.word_tokenize(i)
            tag = nltk.pos_tag(words)
            #print (tag)

            chunkGram = r"""chunk : {<RB.?>*<VB.?>*<NNP><NN>?}"""
            chunkParser = nltk.RegexpParser(chunkGram)
            chunked = chunkParser.parse(tag)
            print(chunked)
    except Exception as e:
        print(str(e))
Example #30
#filtered_text = []
#stop_words_list = []
#for j in words:
#    if j not in stop_words :
#        filtered_text.append(j)
#    elif j in stop_words :
#        stop_words_list.append(j)
#print("stopwords:" ,stop_words_list)
#
#ps = PorterStemmer()
#
#
#for k in filtered_text :
#    print(k,ps.stem(k))

train_text = state_union.raw("2005-GWBush.txt")
sample_text = (
    "Barack Obama went to China yesterday. He lives in Grand Hyatt Beijing. This is a superb hotel."
)  #my_text.raw("txt.txt")
custom_sent_tokenizer = PunktSentenceTokenizer(train_text)
tokenized = custom_sent_tokenizer.tokenize(sample_text)


def process_content():
    try:
        for i in tokenized:
            words = nltk.word_tokenize(i)

            tagged = nltk.pos_tag(words)

            #RECOGNITION (assumed continuation): named-entity chunking on the tagged words
            namedEnt = nltk.ne_chunk(tagged)
            print(namedEnt)
    except Exception as e:
        print(str(e))
Example #31
def getPresFromSpeech(speech_id):
    # 2001-GWBush-1.txt
    words = speech_id.split('.')

    if len(words) > 0:
        single_words = words[0].split('-')
        if len(single_words) > 0:
            for word in single_words:
                if word.isalpha():
                    return word
    return ""

all_words = {}
for speech_id in state_union.fileids():
    text = state_union.raw(speech_id)
    words = word_tokenize(text)
    for word in words:
        if word not in all_words.keys():
            all_words[word] = 1
        else:
            all_words[word] += 1

sent_len = []
word_len = []

pres_list = []
pres_sent_total = {}
pres_word_total = {}
pres_char_total = {}
pres_uniq_word = {}
import nltk
import os
import string
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords, state_union
from nltk.stem import PorterStemmer
from nltk.probability import FreqDist

#get input text
dirpath = os.getcwd() + "/Job Summary.txt"
data = state_union.raw(dirpath)

#initialize utilities
lemma = nltk.wordnet.WordNetLemmatizer()
ps = PorterStemmer()
stop_words = set(stopwords.words("english"))
mystop_words = [
    "\'ll",
    "position",
    "work",
    "job",
    "role",
    "year",
    "valley",
    "skill",
    "day",
    "summary",
    "must",
    "salary",
    'ready',
    'great',
]
#!/usr/bin/env python
"""
Chunking
"""

__author__ = "Manan Kalra"
__email__ = "*****@*****.**"

import nltk
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer

sample_text = state_union.raw("sample.txt")

custom_sent_tokenizer = PunktSentenceTokenizer()
tokenized = custom_sent_tokenizer.tokenize(sample_text)


def tag():
    try:
        for i in tokenized:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            # print(tagged)
            chunk = r"""Chunk: {<RB.?>*<VB.?>*<NNP>+<NN>?}"""
            chunk_parser = nltk.RegexpParser(chunk)
            chunked = chunk_parser.parse(tagged)
            # print(chunked)
            chunked.draw()
    except Exception as e:
        print(str(e))
Example #34
import operator
from collections import Counter

def process_content(filePath, sub):
    # `curs` used below is a database cursor assumed to be created elsewhere in the original module
    train_text = state_union.raw(filePath)
    sample_text = state_union.raw(filePath)
    custom_sent_tokenizer = PunktSentenceTokenizer(train_text)
    tokenized = custom_sent_tokenizer.tokenize(sample_text)

    try:
        bigList = []
        c = 1
        with open("mytext.txt", 'w') as ft1:
            for i in tokenized:
                words = nltk.word_tokenize(i)
                tagged = nltk.pos_tag(words)
                print(tagged)
                for j in tagged:
                    bigList.append(j)
                ques = "Q" + str(c) + ")"
                ft1.write(ques)
                verbList = []
                for x in tagged:
                    if ((x[1] == 'VBZ') or (x[1] == 'VB') or (x[1] == 'VBP') or
                        (x[1] == 'VBG') or (x[1] == 'VBN')) or (x[1] == 'VBD'):
                        verbList.append(x[0])
                print(verbList)
                #ft1.write("Verbs are:")
                s = str(verbList).strip('[]')
                #ft1.write(s)
                finalListV = []
                for w in verbList:
                    constrain = w
                    curs.execute("select * from action_verbs where verbs=%s",
                                 (constrain, ))
                    data = curs.fetchall()
                    #print("data")
                    #print(data)
                    for j in data:
                        finalListV.append(str(j[0]))
                        level = j[1]
                print("finalListV1")
                print(finalListV)
                finalListV = list(map(int, finalListV))
                print(finalListV)
                nounList = []
                for x in tagged:
                    if ((x[1] == 'NNP') or (x[1] == 'NN') or (x[1] == 'NNS')):
                        nounList.append(x[0])
                #print("Nouns are:")
                print(nounList)
                s1 = str(nounList).strip('[]')
                finalListN = []
                for n in nounList:
                    keyword = n
                    # the dept value is taken from user input
                    curs.execute(
                        "select * from jkeywordsc ,levels  where jkeywordsc.co=levels.cno and jkey=%s and dept=%s",
                        (
                            keyword,
                            sub,
                        ))
                    data = curs.fetchall()
                    for j in data:
                        finalListN.append(str(j[4]))
                #print("finalListN")
                print(finalListN)

                stats = dict(Counter(finalListN))
                #maxval = max(dict.iteritems(), key=operator.itemgetter(1))[1]
                #print(keys = [k for k, v in Counter.items() if v == maxval])

                match = max(stats.items(), key=operator.itemgetter(1))[0]
                #for k, v in newList.items():
                #print(k, v)
                match = int(match)
                if match in finalListV:
                    ft1.write("Accepted at level:" + str(match) + "(" + level +
                              ")" + "of Blooom's Taxonomy")
                else:
                    ft1.write("Rejected,Levels not satisfied")
                ft1.write("\n")

                c = c + 1
                #print(finalListN)
        return ("mytext.txt")

    except Exception as e:
        print(str(e))
from nltk.corpus import state_union as su
from nltk.tokenize import PunktSentenceTokenizer, word_tokenize
from nltk import pos_tag, RegexpParser

train_text = su.raw(
    "2005-GWBush.txt")  # training our tokenizer using 2005 speech of GW Bush
sample_text = su.raw(
    "2006-GWBush.txt")  # testing our tokenizer model on 2006 speech of GW Bush
custom_sent_tokenizer = PunktSentenceTokenizer(train_text)
tokenized = custom_sent_tokenizer.tokenize(sample_text)


def process_content():
    try:
        for i in tokenized:
            words = word_tokenize(i)
            tagged = pos_tag(words)
            chunkGram = r"""Chunk: {<.*>+} }<VB.?|IN|DT>+{"""  # chunking and chinking of our data
            chunkParser = RegexpParser(chunkGram)
            chunked = chunkParser.parse(tagged)
            print(chunked)
            chunked.draw()
    except Exception as e:
        print(str(e))


process_content()
Example #36
# -*- coding: utf-8 -*-
"""
Created on Tue Jul 16 11:30:44 2019

@author: Cosimo

chunking
"""

import nltk

from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer

train= state_union.raw("2005-GWBush.txt")
sample= state_union.raw("2006-GWBush.txt")

custom_sent_tok= PunktSentenceTokenizer(train)
tokenized=custom_sent_tok.tokenize(sample)

def process_content():
    try:
        for i in tokenized:
            words= nltk.word_tokenize(i)
            tagged= nltk.pos_tag(words)
            chunkGram= r"""Chunk: {<RB.?>*<NNP><NN>?} """
            chunkParser= nltk.RegexpParser(chunkGram)
            chunked=chunkParser.parse(tagged)
            
            print(chunked)
            
Example #37
import nltk
import nltk.data
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer

def buildhtml(tokenized_sentence, sentence_count):
	html = ""
	starting_div = "<div class=\"panel panel-primary\"> <div class=\"panel-heading\"> Sentence "+ str(sentence_count) +"</div><div class=\"panel-body\">"
	ending_div = "</div></div>"
	html += starting_div
	try:
		for token in tokenized_sentence:
			words = nltk.word_tokenize(token)
			tagged = nltk.pos_tag(words)
			for word in tagged:
				if word[1] in tagdict:
					html += "<a href=\"#\" data-toggle=\"tooltip\" title=\"" + tagdict[word[1]][0] + "\">" + word[0] + "</a>"
		html += ending_div
		return html
	except Exception as e:
		print(str(e))

text = state_union.raw("/Users/ponrajuganesh/Desktop/data.txt") 
sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')

tagdict = nltk.data.load("help/tagsets/" + "upenn_tagset" + ".pickle")
count = 0
fulldiv = ""
for sentence in sent_detector.tokenize(text):
	count += 1
	custom_sent_tokenizer = PunktSentenceTokenizer()
	fulldiv += buildhtml(custom_sent_tokenizer.tokenize(sentence), count)

print(fulldiv)
import nltk
from nltk.tokenize import PunktSentenceTokenizer
from nltk.corpus import state_union

train_text = state_union.raw('2005-GWBush.txt')
sample_text = state_union.raw('2006-GWBush.txt')

custom_sent_tokeniser = PunktSentenceTokenizer(train_text)
tokenized = custom_sent_tokeniser.tokenize(sample_text)

def process_content():
	try:
		for i in tokenized:
			words = nltk.word_tokenize(i)
			tagged = nltk.pos_tag(words)
			namedEntity = nltk.ne_chunk(tagged, binary=False)
			namedEntity.draw()
	except Exception as e:
		print(str(e))

process_content()
Example #39
from nltk.corpus import gutenberg

G_F = gutenberg.fileids()

dir(gutenberg)
# it has raw, words and sents as methods

for field in G_F:
    num_chars = len(gutenberg.raw(field))
    num_words = len(gutenberg.words(field))
    num_sents = len(gutenberg.sents(field))
    num_vocab = len(set(w.lower() for w in gutenberg.words(field)))
    print('# Chars', num_chars, '# words', num_words, '# sentens', num_sents,
          '# vocabs', num_vocab, '-- name of fields', field)

# ----------------------------------------------------------------------------------------------------------------------

import nltk
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer

Text1 = state_union.raw("2005-GWBush.txt")
Text2 = state_union.raw("2006-GWBush.txt")

ST = PunktSentenceTokenizer(Text1)

Tok = ST.tokenize(Text1)

for i in Tok:
    words = nltk.word_tokenize(i)
    tag = nltk.pos_tag(words)
    print(tag)
Example #40
from os import path

import nltk
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer

import sys
from termcolor import *
import termcolor

import textblob
from textblob import TextBlob
from textblob.translate import Translator

#Training for then identifying verbs, nouns etc
train_text = state_union.raw("2005-GWBush.txt")
custom_sent_tokenizer = PunktSentenceTokenizer(train_text)

#Color Codes corresponding to Tags for Verbs, Nouns etc
TagCodes = {'CC': 6, 'CD': 1, 'DT': 6, 'EX': 6, 'FW': 6, 'IN': 6, 'JJ': 0, 'JJR': 0, 'JJS': 0, 'LS': 2, 'MD': 2, 'NN': 1, 'NNS': 1, 'NNP': 2, 'NNPS': 2, 'PDT': 6, 'POS': 6, 'PRP': 5, 'PRP$': 5, 'RB': 4, 'RBR': 4, 'RBS': 4, 'RP': 4, 'TO': 7, 'UH': 2, 'VB': 3, 'VBD': 3, 'VBG': 3, 'VBN': 3, 'VBP': 3, 'VBZ': 3, 'WDT': 6, 'WP': 5, 'WP$': 5, 'WRB': 5};

ColorCodes = {0: 'grey', 1: 'red', 2: 'green', 3: 'yellow', 4: 'blue', 5: 'magenta', 6: 'cyan', 7: 'white'}

#Each language is assigned a short code for translation
LanguageCodes = {'afrikaans' : 'af','albanian' : 'sq','arabic' : 'ar','armenian' : 'hy','azerbaijani' : 'az','basque' : 'eu','belarusian' : 'be','bengali' :'bn','bosnian' : 'bs','bulgarian' : 'bg','catalan' : 'ca','cebuano' : 'ceb','chichewa' : 'ny','chinese-simplified' : 'zh-CN','chinese-traditional' : 'zh-TW','croatian' : 'hr','czech' : 'cs','danish' : 'da','dutch' : 'nl','english' : 'en','esperanto' : 'eo','estonian' : 'et','filipino' : 'tl','finnish' : 'fi','french' : 'fr','galician' : 'gl','georgian' : 'ka','german' : 'de','greek' : 'el','gujarati' : 'gu','haitian-creole' : 'ht','hausa' : 'ha','hebrew' : 'iw','hindi' : 'hi','hmong' : 'hmn','hungarian' : 'hu','icelandic' : 'is','igbo' : 'ig','indonesian' : 'id','irish' : 'ga','italian' : 'it','japanese' : 'ja','javanese' : 'jw','kannada' :'kn','kazakh' : 'kk','khmer' : 'km','korean' : 'ko','lao' : 'lo','latin' : 'la','latvian' : 'lv','lithuanian' : 'lt','macedonian' : 'mk','malagasy' : 'mg','malay' : 'ms','malayalam' : 'ml','maltese' : 'mt','maori' : 'mi','marathi' : 'mr','mongolian' :'mn','burmese' : 'my','nepali' : 'ne','norwegian' : 'no','persian' : 'fa','polish' : 'pl','portuguese' : 'pt','punjabi' : 'ma','romanian' : 'ro','russian' : 'ru','serbian' : 'sr','sesotho' : 'st','sinhala' : 'si','slovak' : 'sk','slovenian' :'sl','somali' : 'so','spanish' : 'es','sudanese' : 'su','swahili' : 'sw','swedish' : 'sv','tajik' : 'tg','tamil' : 'ta','telugu' : 'te','thai' : 'th','turkish' : 'tr','ukrainian' : 'uk','urdu' : 'ur','uzbek' : 'uz','vietnamese' : 'vi','welsh' : 'cy','yiddish' : 'yi','yoruba' : 'yo','zulu' : 'zu'}


#Tags corresponding to Verbs, Nouns etc
'''
POS tag list:
'''
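The remainder of this example is not shown; a minimal sketch of how the TagCodes and ColorCodes tables above might be used to colour-code a POS-tagged sentence with termcolor (the print_colored helper is hypothetical, not the original code):

import nltk
from termcolor import colored

def print_colored(sentence):
    # tag each word and print it in the colour mapped to its POS tag
    for word, tag in nltk.pos_tag(nltk.word_tokenize(sentence)):
        print(colored(word, ColorCodes[TagCodes.get(tag, 7)]), end=' ')
    print()

print_colored("The quick brown fox jumps over the lazy dog")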
Example #41
import nltk
from nltk.tokenize import PunktSentenceTokenizer
from nltk.corpus import state_union
from nltk.stem import WordNetLemmatizer

content = state_union.raw('2006-GWBush.txt')
tokenizer = PunktSentenceTokenizer()
tokenised = tokenizer.tokenize(content)


def process_content():
    try:
        for i in tokenised:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)

            print(tagged)

            # chunking and chinking
            chunk_gram = r'''Chunk: {<.*>+}
                                    }<VB.?|IN|DT|TO>+{'''
            # }<put chinking content inside this>{ and {<chunking content inside this>}
            chunk_parser = nltk.RegexpParser(chunk_gram)
            chunked = chunk_parser.parse(tagged)
            print(chunked)
            #chunked.draw()

            # find named entity
            named_entity = nltk.ne_chunk(tagged, binary=True)
            #named_entity.draw()
import nltk

## To run this example we need the 'maxent_ne_chunker' and 'words' resources, so download them first:
## nltk.download('maxent_ne_chunker'); nltk.download('words')

from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer

train_text = state_union.raw("1973-Nixon.txt")
sample_text = state_union.raw("1974-Nixon.txt")

custom_sent_tokenizer = PunktSentenceTokenizer(train_text)

tokenized = custom_sent_tokenizer.tokenize(sample_text)


def process_content():
    try:
        for i in tokenized:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)

            namedEntity = nltk.ne_chunk(tagged, binary=True)

            namedEntity.draw()

    except Exception as e:
        print(str(e))


process_content()
import nltk
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer

train_text = state_union.raw("a.txt")
sample_text = state_union.raw("b.txt")

custom_sent_tokeniser = PunktSentenceTokenizer(train_text)
tokenized = custom_sent_tokeniser.tokenize(sample_text)


def content_process():
    try:
        for i in tokenized:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)

            namedEnt = nltk.ne_chunk(tagged, binary=True)
            #With binary=True everything is simply marked as a named entity, without classifying it into organisation, money and so on
            namedEnt.draw()

            #Named Entity Types: - Organisation, Person, Location, Date, Time, Money, Percent, Facility, GPE

    except Exception as e:
        print(str(e))


content_process()
Example #44
# -*- coding: utf-8 -*-
"""
Created on Thu Nov 19 09:15:11 2015

@author: nilakant
"""


import nltk
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer
#unsupervised tokenizer
train_text = state_union.raw("2006-GWBush.txt")
sample_text = state_union.raw("2006-GWBush.txt")

custom_sent_tokenizer = PunktSentenceTokenizer(train_text)
tokenized = custom_sent_tokenizer.tokenize(sample_text)

def process_content():
    try:
        for i in tokenized:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            
            chunkGram = r"""Chunk: {<.*>+}
                                    }<VB.?|IN|DT|TO>+{"""

            chunkParser = nltk.RegexpParser(chunkGram)
            chunked = chunkParser.parse(tagged)
            chunked.draw()

    except Exception as e:
        print(str(e))
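The extract_entity_names helper that extract_entities relies on was cut off in this example (only its trailing return statement survived). A minimal sketch of the usual recursive version, assuming the chunker was run with binary=True so that entity subtrees carry the 'NE' label:

def extract_entity_names(tree):
    '''Recursively collect entity name strings from an ne_chunk tree (assumed helper).'''
    entity_names = []
    if hasattr(tree, 'label') and tree.label() == 'NE':
        entity_names.append(' '.join(child[0] for child in tree))
    else:
        for child in tree:
            if isinstance(child, nltk.Tree):
                entity_names.extend(extract_entity_names(child))
    return entity_names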

def extract_entities(taggedText):
    '''
    Create map with entity and their counts
    :param taggedText: Parsed text (output of ne chunker) in tree form
    :return: dict of entities and their freq counts
    '''
    entity_names = []
    for tree in taggedText:
        entity_names.extend(extract_entity_names(tree))
    return entity_names


import matplotlib.pyplot as plt

#get year and words for each file
extracted= [(state_union.raw(fileid), int(fileid[:4])) for fileid in state_union.fileids()]
docs, years = zip(*extracted)

#break text down into sentences, tokens
tokens = [nltk.word_tokenize(text) for text in docs]
sents = [nltk.sent_tokenize(text.replace("\n", " ")) for text in docs]
senttokens = [[nltk.word_tokenize(sent) for sent in entry] for entry in sents]

#get counts of unique words and plot over time
unique = [len(set(words)) for words in tokens]
plt.scatter(years, unique)
plt.show()

#get unique/total ratio
ratios = [(float(len(set(words)))/float(len(words))) for words in tokens]
plt.scatter(years, ratios)