# Sentence-split the Marathi corpus file with NLTK's generic tokenizer and
# show the first five results.
from nltk.corpus import indian
from nltk.tokenize import sent_tokenize

# Contents of the Marathi POS file as returned by indian.raw().
raw_marathi = indian.raw("marathi.pos")

# sent_tokenize() is called without a language argument, exactly as before.
marathi_sentences = sent_tokenize(raw_marathi)

print("Below is marathi text: (this is not proper way of reading text)")
print(marathi_sentences[:5])
# -*- coding: utf-8 -*-
"""
Created on Mon Oct 12 11:11:06 2015

@author: suppu
"""
from nltk.corpus import indian

# Let us generate a file having sentences in Indian languages.
# The file is generated from the Indian languages corpus available.


def _print_counts(heading, reader):
    """Print ``heading``, then every corpus file id followed by ``len(reader(fileid))``.

    ``reader`` is one of the corpus accessors (``indian.raw``,
    ``indian.words``, ``indian.sents``) and is called once per file id.
    """
    print(heading)
    for fileid in indian.fileids():
        print(fileid)
        print(len(reader(fileid)))


# Same three reports, in the same order and with the same headings as before.
_print_counts("Number of charachetrs is:", indian.raw)
_print_counts("No of words in each language are:", indian.words)
_print_counts("Number of sentences in each language:", indian.sents)

# POS for Hindi
hindi_sent = indian.sents("hindi.pos")

# Write one sentence per line. The previous version wrote the joined words
# with no separator, so consecutive sentences ran together. It also used the
# Python 2 ``file()`` builtin and never closed the handle.
# UTF-8 is set explicitly because the text is not ASCII.
with open("hws.txt", "w", encoding="utf-8") as hsent:
    for words in hindi_sent:
        hsent.write(" ".join(words) + "\n")

hpos = indian.tagged_sents("hindi.pos")

# NOTE(review): this handle is deliberately left open. The script appears to
# continue past this point and presumably writes ``hpos`` to it; confirm the
# later code closes it.
hpossent = open("hpossent.txt", "w", encoding="utf-8")
hpossent.seek(0)
# This is for my corpus (indian)
import nltk
from nltk.corpus import indian
import matplotlib as cdf

print(indian.raw())
print(indian.fileids())
print(indian.sents())

import matplotlib

word1 = 'country'
word2 = 'city'


def _prefix_hits(prefixes):
    """Yield ``(prefix, fileid[:4])`` for each corpus word starting with a prefix.

    Every word of every file in the corpus is lower-cased and checked against
    each entry of ``prefixes`` in order. One pair is produced per match.
    """
    for fileid in indian.fileids():
        # The first four characters of the file id are used as the sample.
        short_id = fileid[:4]
        for token in indian.words(fileid):
            lowered = token.lower()
            for prefix in prefixes:
                if lowered.startswith(prefix):
                    yield prefix, short_id


# Conditions are the target prefixes; samples are the shortened file ids.
cfd = nltk.ConditionalFreqDist(_prefix_hits([word1, word2]))
cfd.plot()
# nltk.download()
from nltk.corpus import indian

print("Files of Indian languages:-")
# Check the files available for each language in NLTK.
print(indian.fileids())
print()

print("Language details :-")
# Report the number of characters, words and sentences in each language file.
for f in indian.fileids():
    print("Language :-", f)
    print(" No of Characters", len(indian.raw(f)))
    print(" No of words :-", len(indian.words(f)))
    print(" No of Sentences :-", len(indian.sents(f)))
print()

print("Checking raw sentences of languages:-")
# Uncomment to dump the raw text of a language file:
# print(indian.raw('bangla.pos'))
# print(indian.raw('hindi.pos'))
# print(indian.raw('marathi.pos'))
# print(indian.raw('telugu.pos'))

print("Printing & writing the sentences to a file, from Marathi language")
# The context manager guarantees the file is closed. UTF-8 is set explicitly
# so writing Marathi text does not depend on the platform default encoding.
with open("marathiSentences.txt", "w", encoding="utf-8") as sentencesMarathi:
    # This will print each sentence as a list of words.
    for sentence in indian.sents('marathi.pos'):
        # NOTE(review): the original loop body was not visible. Printing the
        # word list and writing one space-joined sentence per line is
        # reconstructed from the message and comment above. Confirm against
        # the intended output format.
        print(sentence)
        sentencesMarathi.write(" ".join(sentence) + "\n")
from nltk.stem import WordNetLemmatizer

# Lemmatize "better" with the adjective part-of-speech tag and print the result.
lemmatizer = WordNetLemmatizer()
print(lemmatizer.lemmatize("better", pos="a"))


# In[18]:

# Print the path the nltk package was imported from.
import nltk

print(nltk.__file__)


# In[19]:

# Print the path the cv2 package was imported from.
import cv2

print(cv2.__file__)


# In[24]:

# Sentence-split the Hindi corpus file and show the first four results.
from nltk.corpus import indian
from nltk.tokenize import sent_tokenize

hindi_raw = indian.raw("hindi.pos")
hindi_sentences = sent_tokenize(hindi_raw)
print(hindi_sentences[:4])


# In[26]:

# Look up the WordNet synsets for the word "program".
from nltk.corpus import wordnet

program_synsets = wordnet.synsets("program")
print(program_synsets)


# In[ ]: