-
Notifications
You must be signed in to change notification settings - Fork 0
/
miniproject.py
43 lines (30 loc) · 1.03 KB
/
miniproject.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import nltk
from nltk.corpus import indian
from nltk.tag import tnt
import string
# Name of the POS-tagged Hindi file inside NLTK's `indian` corpus.
tagged_set = 'hindi.pos'
word_set = indian.sents(tagged_set)

# Print every sentence with a running 1-based index. `count` ends up holding
# the total number of sentences and stays 0 if the corpus is empty.
count = 0
for count, sen in enumerate(word_set, 1):
    # Glue the tokens back into one line: tokens that start with an
    # apostrophe or consist of ASCII punctuation attach to the previous
    # token, every other token is preceded by a single space.
    sen = "".join(
        tok if tok.startswith("'") or tok in string.punctuation else " " + tok
        for tok in sen
    ).strip()
    print(count, sen)
print('Total sentences in the tagged file are', count)

# 90/10 split: the first `train_rows` sentences train the tagger and all
# remaining sentences are held out for evaluation.
train_perc = .9
train_rows = int(train_perc * count)
# Number of held-out sentences (assumes tagged_sents() yields the same number
# of sentences as sents() for this file).
test_rows = count - train_rows
print('Sentences to be trained', train_rows, 'Sentences to be tested against', test_rows)

data = indian.tagged_sents(tagged_set)
train_data = data[:train_rows]
# Start exactly where the training slice ends so that no sentence is dropped
# between the two sets.
test_data = data[train_rows:]

pos_tagger = tnt.TnT()
pos_tagger.train(train_data)

# Newer NLTK releases deprecate evaluate() in favour of accuracy(); use
# whichever this installation provides and actually report the score.
score_fn = getattr(pos_tagger, "accuracy", None) or pos_tagger.evaluate
print('Tagger accuracy on the held-out sentences', score_fn(test_data))

sentence_to_be_tagged = " नई दिल्ली ।"
def tagthesentence(sentence):
    """Tokenize *sentence*, tag it with the trained tagger and return the result.

    The sentence is split with ``nltk.word_tokenize`` and the tokens are passed
    to the module-level ``pos_tagger``. Both the token list and the tagged
    output are printed before the tagged output is returned.

    Args:
        sentence: Raw text to tokenize and tag.

    Returns:
        Whatever ``pos_tagger.tag`` returns for the tokens (for NLTK taggers
        this is expected to be a list of ``(token, tag)`` pairs).
    """
    tokens = nltk.word_tokenize(sentence)
    print(tokens)
    # Tag once and reuse the result; running the tagger a second time just to
    # produce the return value doubles the work for identical output.
    tagged = pos_tagger.tag(tokens)
    print(tagged)
    return tagged