generated from github/welcome-to-github
-
Notifications
You must be signed in to change notification settings - Fork 0
/
NLP Learning.py
164 lines (134 loc) · 7.14 KB
/
NLP Learning.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
# -*- coding: utf-8 -*-
"""
Created on Wed Sep 25 13:45:12 2019
@author: Deepak Gupta
"""
import nltk
import os
import nltk.corpus
os.chdir("E:\\PYTHON NOTES\\file")
#Load Text data
nltk.corpus.gutenberg.fileids()
hmlet = nltk.corpus.gutenberg.words("shakespeare-hamlet.txt")
for word in hmlet[:500]:
print(word,sep=" ",end=" ")
from nltk.tokenize import word_tokenize
al= """The Tragedie of Hamlet by William Shakespeare 1599 ] Actus Primus . Scoena Prima . Enter Barnardo and Francisco two Centinels . Barnardo . Who ' s there ? Fran . Nay answer me : Stand & vnfold your selfe Bar . Long liue the King Fran . Barnardo ? Bar . He Fran . You come most carefully vpon your houre Bar . ' Tis now strook twelue , get thee to bed Francisco Fran . For this releefe much thankes : ' Tis bitter cold , And I am sicke at heart Barn . Haue you had quiet Guard ? Fran . Not a Mouse stirring Barn . Well , goodnight . If you do meet Horatio and Marcellus , the Riuals of my Watch , bid them make hast . Enter Horatio and Marcellus . Fran . I thinke I heare them . Stand : who ' s there ? Hor . Friends to this ground Mar . And Leige - men to the Dane Fran . Giue you good night Mar . O farwel honest Soldier , who hath relieu ' d you ? Fra . Barnardo ha ' s my place : giue you goodnight . Exit Fran . Mar . Holla Barnardo Bar . Say , what is Horatio there ? Hor . A peece of him Bar . Welcome Horatio , welcome good Marcellus Mar . What , ha ' s this thing appear ' d againe to night Bar . I haue seene nothing Mar . Horatio saies , ' tis but our Fantasie , And will not let beleefe take hold of him Touching this dreaded sight , twice seene of vs , Therefore I haue intreated him along With vs , to watch the minutes of this Night , That if againe this Apparition come , He may approue our eyes , and speake to it Hor . Tush , tush , ' twill not appeare Bar . Sit downe a - while , And let vs once againe assaile your eares , That are so fortified against our Story , What we two Nights haue seene Hor . Well , sit we downe , And let vs heare Barnardo speake of this Barn . Last night of all , When yond same Starre that ' s Westward from the Pole Had made his course t ' illume that part of Heauen Where now it burnes , Marcellus and my selfe , The Bell then beating one Mar . Peace , breake thee of : Enter the Ghost . Looke where it comes againe Barn . In the same figure , like the King that ' s dead Mar . Thou art a Scholler ; speake to it Horatio Barn . Lookes it not like the King ? Marke it Horatio Hora . Most like : It harrowes me with fear & wonder Barn . It would be spoke too Mar . Question it Horatio Hor . What art """
type(al)
al_token=word_tokenize(al)
len(al_token)
from nltk.probability import FreqDist
freqdist=FreqDist()
for word in al_token:
freqdist[word.lower()]+=1
freqdist
fdist_top10=freqdist.most_common(10)
from nltk.tokenize import blankline_tokenize
al_blank=blankline_tokenize(al)
len(al_blank)
from nltk.util import bigrams,trigrams,ngrams
strings="Marke it Horatio Hora . Most like : It harrowes me with fear & wonder Barn . It would be spoke too Mar . Question it Horatio Hor . What art "
qutoes_token=nltk.word_tokenize(strings)
qutoes_bigrams=list(nltk.bigrams(qutoes_token))
qutoes_trigrams=list(nltk.trigrams(qutoes_token))
qutoes_ngrams=list(nltk.ngrams(qutoes_token,5))
#stemmer
from nltk.stem import PorterStemmer
ps= PorterStemmer()
ps.stem("speaking")
word_to_strem=["give","given","gave"]
for word in word_to_strem:
print(word+ ":"+ps.stem(word))
from nltk.stem import LancasterStemmer
lan=LancasterStemmer()
for word in word_to_strem:
print(word+ ":" +lan.stem(word))
from nltk.stem import wordnet
from nltk.stem import WordNetLemmatizer
word_len=WordNetLemmatizer()
for word in word_to_strem:
print(word+ ":" +word_len.lemmatize(word))
from nltk.corpus import stopwords
stop = set(stopwords.words("english"))
len(stop)
import re
punctuation=re.compile(r'[-.?!,:;()|0-9]')
post_punctuation=[]
for word in al_token:
words=punctuation.sub("",word)
if len(words)>0:
post_punctuation.append(words)
len(post_punctuation)
#pos tag & discription
sent="you are one and only my best friend in this city or in the world"
sent_token=word_tokenize(sent)
for token in sent_token:
print(nltk.pos_tag([token]))
#enetity regonation
from nltk import ne_chunk
ne_send ="The US Precident Stay in the White house"
sent_token=word_tokenize(ne_send)
ne_tag=nltk.pos_tag(sent_token)
ne_nrr=ne_chunk(ne_tag)
new="The cat ate the little mouse who was after fresh cheese"
new_token=nltk.pos_tag(word_tokenize(new))
grammer_np=r"NP: {<DT>?<JJ>*<NN>}"
chunk_praser=nltk.RegexpParser(grammer_np)
chunk_result=chunk_praser.parse(new_token)
import pandas as pd
import numpy as np
import sklearn
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import movie_reviews
movie_reviews.categories()
pos_rev=movie_reviews.fileids("pos")
neg_rev=movie_reviews.fileids("neg")
rev=nltk.corpus.movie_reviews.words('pos/cv565_29572.txt')
rev_list=[]
for rev in neg_rev:
rev_text_neg=rev=nltk.corpus.movie_reviews.words(rev)
review_one_string=" ".join(rev_text_neg)
review_one_string=review_one_string.replace(",",",")
review_one_string=review_one_string.replace(".",".")
review_one_string=review_one_string.replace("\' ","'")
review_one_string=review_one_string.replace(" \'","'")
rev_list.append(review_one_string)
len(rev_list)
for rev in pos_rev:
rev_text_neg=rev=nltk.corpus.movie_reviews.words(rev)
review_one_string=" ".join(rev_text_neg)
review_one_string=review_one_string.replace(",",",")
review_one_string=review_one_string.replace(".",".")
review_one_string=review_one_string.replace("\' ","'")
review_one_string=review_one_string.replace(" \'","'")
rev_list.append(review_one_string)
neg_target=np.zeros((1000,),dtype=np.int)
pos_target=np.ones((1000,),dtype=np.int)
target_list=[]
for neg_tar in neg_target:
target_list.append(neg_tar)
for pos_tar in pos_target:
target_list.append(pos_tar)
len(target_list)
y=pd.Series(target_list)
from sklearn.feature_extraction.text import CountVectorizer
count_vec=CountVectorizer(lowercase=True,stop_words="english",min_df=2)
x_count_vec=count_vec.fit_transform(rev_list)
x_count_vec.shape
x_name=count_vec.get_feature_names()
x_name
x_count_vec=pd.DataFrame(x_count_vec.toarray(),columns=x_name)
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import confusion_matrix
x_train_cv,x_test_cv,y_train_cv,y_test_cv=train_test_split(x_count_vec,y,test_size=0.25,random_state=5)
from sklearn.naive_bayes import GaussianNB
gnb=GaussianNB()
y_pred_nb=gnb.fit(x_train_cv,y_train_cv).predict(x_test_cv)
from sklearn.naive_bayes import MultinomialNB
clf_vc=MultinomialNB()
clf_vc.fit(x_train_cv,y_train_cv)
y_pred_cv=clf_vc.predict(x_test_cv)
type(y_pred_cv)
print(metrics.accuracy_score(y_test_cv,y_pred_cv))
score_clf_cv=confusion_matrix(y_test_cv,y_pred_cv)