Code Example #1
from nltk.corpus import indian
from nltk.tokenize import sent_tokenize

# Read the raw Marathi .pos file and split it into sentences with the
# (English-trained) Punkt sentence tokenizer.
sample_text = indian.raw("marathi.pos")
marathi_text = sent_tokenize(sample_text)

print("Below is marathi text: (this is not proper way of reading text)")
print(marathi_text[0:5])
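As the print statement above admits, running sent_tokenize over the raw file is not the proper way to read this corpus. A minimal sketch of the corpus-native access instead:

from nltk.corpus import indian

# The corpus reader already knows the sentence boundaries in marathi.pos,
# so sents() returns each sentence as a list of words, no extra tokenizer needed.
marathi_sents = indian.sents("marathi.pos")
print(marathi_sents[0:5])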
Code Example #2
# -*- coding: utf-8 -*-
"""
Created on Mon Oct 12 11:11:06 2015

@author: suppu
"""
from nltk.corpus import indian
'''
Let us generate a file containing sentences in Indian languages. The file is
generated from the Indian languages corpus available in NLTK.
'''
print "Number of charachetrs is:"
for f in indian.fileids():
    print f
    print len(indian.raw(f))
print "No of words in each language are:"
for f in indian.fileids():
    print f
    print len(indian.words(f))
print "Number of sentences in each language:"
for f in indian.fileids():
    print f
    print len(indian.sents(f))
'''POS-tagged data for Hindi
'''
hindi_sent = indian.sents("hindi.pos")
# file() is Python 2 only; open() with UTF-8 handles the Devanagari text in Python 3.
hsent = open("hws.txt", 'w', encoding='utf-8')
for i in hindi_sent:
    hsent.write(" ".join(i) + "\n")  # one sentence per line
hsent.close()
hpos = indian.tagged_sents("hindi.pos")
hpossent = open("hpossent.txt", 'w', encoding='utf-8')
hpossent.seek(0)  # a no-op immediately after opening in write mode
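The snippet stops right after opening hpossent.txt. A minimal sketch of how the tagged sentences might be written out; the word/TAG line format here is an assumption, not taken from the original project:

for sent in hpos:
    # Each tagged sentence is a list of (word, tag) pairs; join them as word/TAG tokens.
    hpossent.write(" ".join("%s/%s" % (word, tag) for word, tag in sent) + "\n")
hpossent.close()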
Code Example #3
File: Lab2_mycorpus.py Project: sherly75/PythonNLTK
#This is for my corpus (indian)

import nltk
from nltk.corpus import indian
import matplotlib  # needed for cfd.plot(); the 'as cdf' alias served no purpose

print(indian.raw())
print(indian.fileids())
print(indian.sents())

word1 = 'country'
word2 = 'city'
# Count, per corpus file (keyed by the first four characters of its name),
# how many words start with each target. Note: the targets are English, so
# over these Indian-language files the counts will most likely be zero.
cfd = nltk.ConditionalFreqDist((target, fileid[:4])
                               for fileid in indian.fileids()
                               for w in indian.words(fileid)
                               for target in [word1, word2]
                               if w.lower().startswith(target))
cfd.plot()
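For comparison, the same conditional-frequency pattern on an English corpus, where string targets like these actually match. This variant uses nltk.corpus.inaugural with the targets 'america' and 'citizen' as in the NLTK book; it is not part of the original project:

import nltk
from nltk.corpus import inaugural

# Condition on the target word, sample on the year prefix of each file name
# (e.g. '1789-Washington.txt' -> '1789').
cfd = nltk.ConditionalFreqDist((target, fileid[:4])
                               for fileid in inaugural.fileids()
                               for w in inaugural.words(fileid)
                               for target in ['america', 'citizen']
                               if w.lower().startswith(target))
cfd.plot()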
Code Example #4
#nltk.download()

from nltk.corpus import indian

print("Files of Indian languages:-")
# check files for each language in NLTK
print(indian.fileids())
print()

print("Language details :-")
# report the number of characters, words and sentences in each language
for f in indian.fileids():
    print("Language :-", f)
    print(
        "     No of Characters",
        len(indian.raw(f)),
    )
    print("     No of words :-", len(indian.words(f)))
    print("     No of Sentences :-", len(indian.sents(f)))
print()

print("Checking raw sentences of languages:-")
# print(indian.raw('bangla.pos'))
# print(indian.raw('hindi.pos'))
# print(indian.raw('marathi.pos'))
# print(indian.raw('telugu.pos'))

print("Printing & writing the sentences to a file,  from Marathi language")
sentencesMarathi = open("marathiSentences.txt", "w")
# This will print sentence as a list of words
for sentence in indian.sents('marathi.pos'):
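Since marathi.pos is a POS-tagged file, the corpus reader can also return the annotations directly. A small sketch of that tagged access, not part of the original example:

from nltk.corpus import indian

# (word, tag) pairs and fully tagged sentences from the Marathi file.
print(indian.tagged_words('marathi.pos')[:10])
print(indian.tagged_sents('marathi.pos')[0])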
Code Example #5
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
print(lemmatizer.lemmatize("better", pos="a"))

# In[18]:

import nltk
print(nltk.__file__)

# In[19]:

# OpenCV is unrelated to NLTK here; this just shows where the installed module lives.
import cv2
print(cv2.__file__)

# In[24]:

from nltk.corpus import indian
from nltk.tokenize import sent_tokenize
sample = indian.raw("hindi.pos")
t = sent_tokenize(sample)
print(t[:4])

# In[26]:

from nltk.corpus import wordnet
syns = wordnet.synsets("program")
print(syns)

# In[ ]:
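The last WordNet cell only prints the list of synsets. A brief follow-up sketch, not from the original notebook, of the usual next steps with the first synset:

from nltk.corpus import wordnet

syns = wordnet.synsets("program")
# Lemma name, dictionary definition, and usage examples of the first synset.
print(syns[0].lemmas()[0].name())
print(syns[0].definition())
print(syns[0].examples())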