def data_preparation(self):
    """
    Loads one of the Brown (news), BNC News, or Indian corpora so it can
    later be split into train and test sets.

    Returns:
    --------
    sentences (list): Sentences without POS-tags
    tagged_sentences (list): Sentences with POS-tags
    """
    if self.corpus == 'brown':
        tagged_sentences = brown.tagged_sents(categories='news')
        sentences = brown.sents(categories='news')
    elif self.corpus == 'bnc':
        root = find('corpora/bnc')
        bncnews = TaggedCorpusReader(root, 'bnc-news-wtp.txt',
                                     tagset='en-claws')
        if self.tagset is None:
            tagged_sentences = bncnews.tagged_sents()
        elif self.tagset == 'universal':
            tagged_sentences = bncnews.tagged_sents(tagset=self.tagset)
        sentences = bncnews.sents()
    elif self.corpus == 'indian':
        if self.lang in ['telugu', 'hindi', 'marathi', 'bangla']:
            tagged_sentences = indian.tagged_sents(f'{self.lang}.pos')
            sentences = indian.sents(f'{self.lang}.pos')
        else:
            print('Language not part of the Indian corpus.')
    return sentences, tagged_sentences
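# A minimal usage sketch (an assumption, not part of the original source):
# `data_preparation` only reads `self.corpus`, `self.tagset`, and `self.lang`,
# so a plain namespace object is enough to drive it when the function is
# available at module level as shown above. The 90/10 split mirrors the other
# snippets in this collection. Assumes `from nltk.corpus import indian` is in scope.
from types import SimpleNamespace

loader = SimpleNamespace(corpus='indian', tagset=None, lang='hindi')
sentences, tagged_sentences = data_preparation(loader)
split = int(0.9 * len(tagged_sentences))
train_sents, test_sents = tagged_sentences[:split], tagged_sentences[split:]
print(len(train_sents), 'training sentences,', len(test_sents), 'test sentences')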
def train():
    taggedSet = "hindi.pos"
    wordSet = indian.sents(taggedSet)
    count = 0
    print(wordSet[0])
    for sen in wordSet:
        count = count + 1
        sen = "".join([
            " " + i if not i.startswith("'") and i not in string.punctuation
            else i for i in sen
        ]).strip()
        print(count, sen, "sentences")
    print("Total sentences in the tagged file are", count)

    trainPerc = 0.9
    trainRows = int(trainPerc * count)
    testRows = trainRows + 1

    data = indian.tagged_sents(taggedSet)
    train_data = data[:trainRows]
    test_data = data[testRows:]
    print("Training dataset length: ", len(train_data))
    print("Testing dataset length: ", len(test_data))

    pos_tagger = tnt.TnT()
    pos_tagger.train(train_data)
    print("Accuracy: ", pos_tagger.evaluate(test_data))
    return pos_tagger
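# A short usage sketch (an assumption, not from the original source): once
# `train()` returns the TnT tagger, it can tag an unseen Hindi sentence, as
# other snippets in this collection do. Assumes `import nltk` and the `punkt`
# tokenizer data are available; the sample sentence is borrowed from a later snippet.
hindi_tagger = train()
sample = "पूर्ण प्रतिबंध हटाओ : इराक"
print(hindi_tagger.tag(nltk.word_tokenize(sample)))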
def bangla_chunk():
    # Parts-of-speech tagging part
    tagged_set = 'bangla.pos'  # the pre-tagged Indian corpus data is stored in bangla.pos
    word_set = indian.sents(tagged_set)  # read the Bengali sentences from the corpus into word_set
    count = 0
    '''
    Count all sentences present in the corpus with a for loop.
    startswith() checks whether a token starts with an apostrophe (').
    The training percentage is set to 0.96 since the dataset is small.
    '''
    for sen in word_set:
        count = count + 1
        sen = "".join([
            " " + i if not i.startswith("'") and i not in string.punctuation
            else i for i in sen
        ]).strip()
        print(count, sen)
    print('Total sentences in the tagged files are', count)

    train_perc = .96
    train_rows = int(train_perc * count)
    test_rows = train_rows + 1
    print("Sentences to be trained", train_rows,
          "Sentences to be tested against", test_rows)
    data = indian.tagged_sents(tagged_set)
    train_data = data[:train_rows]
    test_data = data[test_rows:]
    '''
    Now tokenize and check the parts of speech.
    '''
    pos_tagger = tnt.TnT()
    pos_tagger.train(train_data)
    pos_tagger.evaluate(test_data)

    sentence = "আমি ভাত খাই নাই অনেক দিন হল । বিজেআইটি একটি কম্পানি , কি কম্পানি সেটা জানার দরকার নাই । "
    tokenized = nltk.word_tokenize(sentence)
    words = pos_tagger.tag(tokenized)
    '''
    RB.? = any form of RB
    NNP = the tags we require
    '''
    chunkGram = r"""Chunk: {<.*>+}
                           }<VB.?|IN|DT|TO|NN>+{"""
    chunkParser = nltk.RegexpParser(chunkGram)  # use a regex parser
    chunked = chunkParser.parse(words)
    chunked.draw()
def bangla_chunk():
    # Parts-of-speech tagging part
    tagged_set = 'bangla.pos'  # the pre-tagged Indian corpus data is stored in bangla.pos
    word_set = indian.sents(tagged_set)  # read the Bengali sentences from the corpus into word_set
    count = 0
    '''
    Count all sentences present in the corpus with a for loop.
    startswith() checks whether a token starts with an apostrophe (').
    The training percentage is set to 0.96 since the dataset is small.
    '''
    for sen in word_set:
        count = count + 1
        sen = "".join([
            " " + i if not i.startswith("'") and i not in string.punctuation
            else i for i in sen
        ]).strip()
        print(count, sen)
    print('Total sentences in the tagged files are', count)

    train_perc = .96
    train_rows = int(train_perc * count)
    test_rows = train_rows + 1
    print("Sentences to be trained", train_rows,
          "Sentences to be tested against", test_rows)
    data = indian.tagged_sents(tagged_set)
    train_data = data[:train_rows]
    test_data = data[test_rows:]
    '''
    Now tokenize and check the parts of speech.
    '''
    pos_tagger = tnt.TnT()
    pos_tagger.train(train_data)
    pos_tagger.evaluate(test_data)

    sentence = "আমি ভাত খাই নাই অনেক দিন হল । বিজেআইটি একটি কম্পানি , কি কম্পানি সেটা জানার দরকার নাই । মানিক ভাইয়ের কি হইল আবার রোকেয়া কাবিলার জন্য চিল্লাইতেছে "
    tokenized = nltk.word_tokenize(sentence)
    words = pos_tagger.tag(tokenized)
    namedEnt = nltk.ne_chunk(words)
    namedEnt.draw()
def bangla_pos_tagger():
    tagged_set = 'bangla.pos'  # the pre-tagged Indian corpus data is stored in bangla.pos
    word_set = indian.sents(tagged_set)  # read the Bengali sentences from the corpus into word_set
    count = 0
    '''
    Count all sentences present in the corpus with a for loop.
    startswith() checks whether a token starts with an apostrophe (').
    The training percentage is set to 0.96 since the dataset is small.
    '''
    for sen in word_set:
        count = count + 1
        sen = "".join([
            " " + i if not i.startswith("'") and i not in string.punctuation
            else i for i in sen
        ]).strip()
        print(count, sen)
    print('Total sentences in the tagged files are', count)

    train_perc = .96
    train_rows = int(train_perc * count)
    test_rows = train_rows + 1
    print("Sentences to be trained", train_rows,
          "Sentences to be tested against", test_rows)
    data = indian.tagged_sents(tagged_set)
    train_data = data[:train_rows]
    test_data = data[test_rows:]
    '''
    Now tokenize and check the parts of speech.
    '''
    pos_tagger = tnt.TnT()
    pos_tagger.train(train_data)
    pos_tagger.evaluate(test_data)

    sentence = "আমি ভাত খাই নাই অনেক দিন হল ।"
    tokenized = nltk.word_tokenize(sentence)
    print(pos_tagger.tag(tokenized))
import nltk
from nltk.corpus import indian
from nltk.tag import tnt
import string

nltk.download('punkt')
nltk.download('indian')  # the Indian POS-tagged corpus is needed below

tagged_set = 'hindi.pos'
word_set = indian.sents(tagged_set)
count = 0
for sen in word_set:
    count = count + 1
    sen = "".join([
        " " + i if not i.startswith("'") and i not in string.punctuation
        else i for i in sen
    ]).strip()
    print(sen)
print(count)

train_perc = .9
train_rows = int(train_perc * count)
test_rows = train_rows + 1
print(train_rows, test_rows)

data = indian.tagged_sents(tagged_set)
train_data = data[:train_rows]
test_data = data[test_rows:]
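# The script above stops after the split; a likely continuation (a sketch
# mirroring the TnT snippets elsewhere in this collection, not the original
# author's code) trains and scores a TnT tagger on the two slices:
pos_tagger = tnt.TnT()
pos_tagger.train(train_data)
print("Accuracy:", pos_tagger.evaluate(test_data))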
# -*- coding: utf-8 -*-
"""
Created on Mon Oct 12 14:39:14 2015

@author: suppu
"""
import nltk
import nltk.data
from nltk.corpus import indian

word_to_be_tagged = u"पूर्ण प्रतिबंध हटाओ : इराक"
hindi_sents = indian.sents("hindi.pos")
train_data = indian.tagged_sents('hindi.pos')[:300]
test_data = indian.tagged_sents('hindi.pos')[301:]

from nltk.tag import tnt

tnt_pos_tagger = tnt.TnT()
tnt_pos_tagger.train(train_data)
tnt_pos_tagger.evaluate(test_data)

tagged = tnt_pos_tagger.tag(nltk.word_tokenize(word_to_be_tagged))
out = open("ooutput.txt", 'w')
for i in tagged:
    out.write(i[0])
    out.write(" " + i[1] + "\n")
out.close()
'''
OUTPUT (ooutput.txt):
पूर्ण JJ
...
'''
#!/usr/bin/python
import os

from nltk.corpus import indian

import txt2tajson

# can also choose from: marathi, bangla, telugu, hindi
lang = "hindi"

if not os.path.exists("txt/" + lang):
    os.mkdir("txt/" + lang)

sents = indian.sents(lang + ".pos")

# arbitrarily put 10 sentences per document.
num = 0
for i in range(0, len(sents), 10):
    with open("txt/" + lang + "/" + str(i), "w") as out:
        for sent in sents[i:i + 10]:
            out.write(" ".join(sent) + "\n")
    num += 1

print("Wrote {} text files to {}".format(num, "txt/" + lang))

# Now convert txt to tajson.
txt2tajson.convert("txt/" + lang, "tajson/" + lang)

print("Now run:\n  $ ./scripts/buildindex.sh data/tajson/hindi/ data/index_hindi")
fmt = '.2f' if normalize else 'd'
thresh = cm.max() / 2.
for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
    plt.text(j, i, format(cm[i, j], fmt),
             horizontalalignment="center",
             color="white" if cm[i, j] > thresh else "black")

plt.tight_layout()
plt.ylabel('True label')
plt.xlabel('Predicted label')

marathi_sent = indian.sents('marathi_pos_rad_3NOV17.pos')
mpos = indian.tagged_sents('marathi_pos_rad_3NOV17.pos')
mp = shuffle(mpos)
size = int(len(marathi_sent) * 0.8)

tags = [
    tag for (word, tag) in indian.tagged_words('marathi_pos_rad_3NOV17.pos')
]
print(np.unique(tags))
#print("no. of tags=", len(nltk.FreqDist(tags)))
defaultTag = nltk.FreqDist(tags).max()
#print(defaultTag)

train_sents = mp[:size]
#print(len(train_sents))
test_sents = mp[size:]
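# The fragment above stops after splitting `mp`; a plausible continuation
# (a sketch, not the original author's code) is the usual NLTK backoff chain,
# with `defaultTag` computed above as the fallback tag:
t0 = nltk.DefaultTagger(defaultTag)
t1 = nltk.UnigramTagger(train_sents, backoff=t0)
t2 = nltk.BigramTagger(train_sents, backoff=t1)
print("Backoff tagger accuracy:", t2.evaluate(test_sents))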
import nltk
from nltk.corpus import indian

data = indian.tagged_sents()
test_data = indian.sents()

data_m = []
test_data_m = []
with open('IIIT_data') as f:
    fh = f.readlines()

lis = []
lis1 = []
for i in fh:
    if i == '\n':
        data_m.append(lis)
        test_data_m.append(lis1)
        lis = []
        lis1 = []
    else:
        temp = (i.strip()).split('\t')
        lis.append((temp[0], temp[1]))
        lis1.append(temp[0])

print(len(test_data_m))
tot_data = data_m
test_data = test_data_m
train_size = int(len(tot_data) / 10) * 9

import random
import numpy as np

training_data = []
def get_list_of_sentences():
    sentences = indian.sents('bangla.pos')
    return sentences
from nltk.corpus import indian
'''
Generate files containing sentences in Indian languages.
The files are generated from the Indian-languages corpus available in NLTK.
'''
print("Number of characters is:")
for f in indian.fileids():
    print(f)
    print(len(indian.raw(f)))

print("Number of words in each language:")
for f in indian.fileids():
    print(f)
    print(len(indian.words(f)))

print("Number of sentences in each language:")
for f in indian.fileids():
    print(f)
    print(len(indian.sents(f)))

'''POS for Hindi'''
hindi_sent = indian.sents("hindi.pos")
hsent = open("hws.txt", 'w')
for i in hindi_sent:
    hsent.write(" ".join(i) + "\n")
hsent.close()

hpos = indian.tagged_sents("hindi.pos")
hpossent = open("hpossent.txt", 'w')
for i in hpos:
    for j in i:
        hpossent.write(j[0] + " " + j[1] + "\n")
hpossent.close()
'''
POS for Bangla
'''
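# The snippet is cut off at the "POS for Bangla" marker; a continuation in the
# same spirit as the Hindi block above (an assumption, with hypothetical output
# filenames bws.txt and bpossent.txt, not the original code) might be:
bangla_sent = indian.sents("bangla.pos")
bsent = open("bws.txt", 'w')
for i in bangla_sent:
    bsent.write(" ".join(i) + "\n")
bsent.close()

bpos = indian.tagged_sents("bangla.pos")
bpossent = open("bpossent.txt", 'w')
for i in bpos:
    for j in i:
        bpossent.write(j[0] + " " + j[1] + "\n")
bpossent.close()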
print("Files of Indian languages:-") # check files for each languare in NLTK print(indian.fileids()) print() print("Language details :-") # find no. of characters in each language for f in indian.fileids(): print("Language :-", f) print( " No of Characters", len(indian.raw(f)), ) print(" No of words :-", len(indian.words(f))) print(" No of Sentences :-", len(indian.sents(f))) print() print("Checking raw sentences of languages:-") # print(indian.raw(indian.raw('bangla.pos')) # print(indian.raw(indian.raw('hindi.pos')) # print(indian.raw(indian.raw('marathi.pos')) # print(indian.raw(indian.raw('telugu.pos')) print("Printing & writing the sentences to a file, from Marathi language") sentencesMarathi = open("marathiSentences.txt", "w") # This will print sentence as a list of words for sentence in indian.sents('marathi.pos'): #print(sentence) sentencesMarathi.write(" ".join(sentence))
# This is for my corpus (indian)
import nltk
import matplotlib
from nltk.corpus import indian

print(indian.raw())
print(indian.fileids())
print(indian.sents())

word1 = 'country'
word2 = 'city'
cfd = nltk.ConditionalFreqDist(
    (target, fileid[:4])
    for fileid in indian.fileids()
    for w in indian.words(fileid)
    for target in [word1, word2]
    if w.lower().startswith(target))
cfd.plot()
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
INFORMATION RETRIEVAL IN INDIAN LANGUAGE (HINDI)

@author: narayanashanmukhavenkat
"""
import numpy as np
from nltk.corpus import indian, stopwords
from gensim import corpora, models, similarities, matutils
from gensim.models import lsimodel, nmf

documents = indian.sents("hindi.pos")

# Read the Hindi stopword list and filter it out of every sentence.
with open("hindisw.txt", 'r') as temp:
    stop_words = temp.read().split()

texts = [[word for word in document if word not in stop_words]
         for document in documents]

dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]
corpora.MmCorpus.serialize('/tmp/ir.mm', corpus)

lsi = models.LsiModel(corpus, num_topics=43, id2word=dictionary)
index = similarities.MatrixSimilarity(lsi[corpus])
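# A short query sketch (an assumption, not part of the original script): project
# a tokenized Hindi query into the LSI space and rank corpus sentences by cosine
# similarity. The query string is borrowed from another snippet in this collection.
query = "पूर्ण प्रतिबंध हटाओ : इराक".split()
vec_bow = dictionary.doc2bow([word for word in query if word not in stop_words])
vec_lsi = lsi[vec_bow]
sims = sorted(enumerate(index[vec_lsi]), key=lambda item: -item[1])
for doc_id, score in sims[:5]:
    print(score, " ".join(documents[doc_id]))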