def pos_tag(query):
    train_data = indian.tagged_sents('hindi.pos')
    tnt_pos_tagger = tnt.TnT()
    tnt_pos_tagger.train(train_data)
    # test_data = ("भारतीय संस्कृति में नारी के सम्मान को बहुत महत्व दिया गया है।")
    a = tnt_pos_tagger.tag(nltk.word_tokenize(query))
    return a
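# Illustrative usage sketch (not part of the original snippet): assumes nltk,
# nltk.corpus.indian and nltk.tag.tnt are imported as in the surrounding code.
# Note that each call to pos_tag() retrains the TnT tagger on hindi.pos, so it
# is simple but slow for repeated use.
if __name__ == "__main__":
    sample = "भारतीय संस्कृति में नारी के सम्मान को बहुत महत्व दिया गया है।"
    for word, tag in pos_tag(sample):
        print(word, tag)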
    'English: NPS Chat Corpus (simplified)': lambda: nps_chat.tagged_posts(simplify_tags=True),
    'English: Wall Street Journal Corpus': lambda: treebank.tagged_sents(),
    'English: Wall Street Journal Corpus (simplified)': lambda: treebank.tagged_sents(simplify_tags=True),
    'Chinese: Sinica Corpus': lambda: sinica_treebank.tagged_sents(),
    'Chinese: Sinica Corpus (simplified)': lambda: sinica_treebank.tagged_sents(simplify_tags=True),
    'Dutch: Alpino Corpus': lambda: alpino.tagged_sents(),
    'Dutch: Alpino Corpus (simplified)': lambda: alpino.tagged_sents(simplify_tags=True),
    'Hindi: Indian Languages Corpus': lambda: indian.tagged_sents(files='hindi.pos'),
    'Hindi: Indian Languages Corpus (simplified)': lambda: indian.tagged_sents(files='hindi.pos', simplify_tags=True),
    'Portuguese: Floresta Corpus (Portugal)': lambda: floresta.tagged_sents(),
    'Portuguese: Floresta Corpus (Portugal, simplified)': lambda: floresta.tagged_sents(simplify_tags=True),
    'Portuguese: MAC-MORPHO Corpus (Brazil)': lambda: mac_morpho.tagged_sents(),
    'Portuguese: MAC-MORPHO Corpus (Brazil, simplified)': lambda: mac_morpho.tagged_sents(simplify_tags=True),
    'Spanish: CESS-ESP Corpus (simplified)': lambda: cess_esp.tagged_sents(simplify_tags=True),
}

class ConcordanceSearchView(object):
from nltk import UnigramTagger
from nltk.corpus import brown, indian
from nltk.tag import tnt
from nltk.tokenize import sent_tokenize, word_tokenize
import nltk
import sys

reload(sys)
sys.setdefaultencoding("utf-8")  # Python 2 only: force UTF-8 as the default encoding

# english tags
train_sents = brown.tagged_sents()

# hindi tags
train_data_hindi = indian.tagged_sents('hindi.pos')[:-1]  # used for training
tnt_pos_tagger = tnt.TnT()
tnt_pos_tagger.train(train_data_hindi)


def english_tag(eng_tweet):
    word_features = []
    eng_tweet = nltk.word_tokenize(eng_tweet)
    for i, j in nltk.pos_tag(eng_tweet):
        if j in ['JJ', 'JJR', 'JJS', 'RB', 'RBR', 'RBS']:
            word_features.append(i)
    rating = 0
def train_hmm_pos_tagger():
    # Consider tagged Hindi sentences of NLTK as training data.
    train_data = indian.tagged_sents('hindi.pos')
    # Define a dictionary for each count table
    word_tag_dict = {}
    tag_freq_dict = {}
    tag_tag_dict = {}
    tag_freq_dict['<S>'] = len(train_data)
    tag_freq_dict['</S>'] = len(train_data)
    # Calculate frequency counts for each table
    for sentence in train_data:
        prev_tag = '<S>'
        for word_tag_pair in sentence:
            if word_tag_pair not in word_tag_dict:
                word_tag_dict[word_tag_pair] = 1
            else:
                word_tag_dict[word_tag_pair] += 1
            current_tag = word_tag_pair[1]
            if current_tag not in tag_freq_dict:
                tag_freq_dict[current_tag] = 1
            else:
                tag_freq_dict[current_tag] += 1
            tag_tag_pair = (prev_tag, current_tag)
            if tag_tag_pair not in tag_tag_dict:
                tag_tag_dict[tag_tag_pair] = 1
            else:
                tag_tag_dict[tag_tag_pair] += 1
            prev_tag = current_tag
        current_tag = '</S>'
        tag_tag_pair = (prev_tag, current_tag)
        if tag_tag_pair not in tag_tag_dict:
            tag_tag_dict[tag_tag_pair] = 1
        else:
            tag_tag_dict[tag_tag_pair] += 1

    test_data = indian.tagged_sents('hindi.pos')
    sentence = test_data[0]
    prev_tag = '<S>'
    for pair in sentence:
        word = pair[0]
        max_prob = -100000000000000
        for current_tag in tag_freq_dict.keys():
            word_tag_pair = (word, current_tag)
            if word_tag_pair not in word_tag_dict:
                word_tag_prob = math.log(1.0 / tag_freq_dict[current_tag])
            else:
                word_tag_prob = math.log(float(word_tag_dict[word_tag_pair]) / float(tag_freq_dict[current_tag]))
            prev_cur_tag_pair = (prev_tag, current_tag)
            if prev_cur_tag_pair not in tag_tag_dict:
                prev_cur_tag_prob = math.log(1.0 / tag_freq_dict[prev_tag])
            else:
                prev_cur_tag_prob = math.log(float(tag_tag_dict[prev_cur_tag_pair]) / float(tag_freq_dict[prev_tag]))
            max_freq = 1.0
            for future_tag in tag_freq_dict.keys():
                cur_future_tag_pair = (current_tag, future_tag)
                if cur_future_tag_pair in tag_tag_dict:
                    if tag_tag_dict[cur_future_tag_pair] > max_freq:
                        max_freq = tag_tag_dict[cur_future_tag_pair]
            cur_future_tag_prob = math.log(float(max_freq) / float(tag_freq_dict[current_tag]))
            total_prob = word_tag_prob + prev_cur_tag_prob + cur_future_tag_prob
            if total_prob > max_prob:
                max_prob = total_prob
                tag_assigned = current_tag
        prev_tag = tag_assigned
        print(word)
        print("Original tag : ", pair[1])
        print("Tag assigned : ", tag_assigned)
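# Hedged alternative sketch (not part of the original function): NLTK also ships
# a supervised HMM trainer that learns comparable emission/transition counts from
# the same hindi.pos data, which can serve as a cross-check for the hand-rolled
# counts above. Assumes math is not needed here but indian and hmm are importable.
from nltk.corpus import indian
from nltk.tag import hmm

hmm_train_data = indian.tagged_sents('hindi.pos')
hmm_tagger = hmm.HiddenMarkovModelTrainer().train_supervised(hmm_train_data)
# Tag the words of the first training sentence to compare with the code above.
print(hmm_tagger.tag([word for word, _ in hmm_train_data[0]]))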
"English: Brown Corpus (Religion, simplified)": lambda: brown.tagged_sents(categories="religion", tagset="simple"), "English: Brown Corpus (Learned, simplified)": lambda: brown.tagged_sents(categories="learned", tagset="simple"), "English: Brown Corpus (Science Fiction, simplified)": lambda: brown.tagged_sents( categories="science_fiction", tagset="simple" ), "English: Brown Corpus (Romance, simplified)": lambda: brown.tagged_sents(categories="romance", tagset="simple"), "English: Brown Corpus (Humor, simplified)": lambda: brown.tagged_sents(categories="humor", tagset="simple"), "English: NPS Chat Corpus": lambda: nps_chat.tagged_posts(), "English: NPS Chat Corpus (simplified)": lambda: nps_chat.tagged_posts(tagset="simple"), "English: Wall Street Journal Corpus": lambda: treebank.tagged_sents(), "English: Wall Street Journal Corpus (simplified)": lambda: treebank.tagged_sents(tagset="simple"), "Chinese: Sinica Corpus": lambda: sinica_treebank.tagged_sents(), "Chinese: Sinica Corpus (simplified)": lambda: sinica_treebank.tagged_sents(tagset="simple"), "Dutch: Alpino Corpus": lambda: alpino.tagged_sents(), "Dutch: Alpino Corpus (simplified)": lambda: alpino.tagged_sents(tagset="simple"), "Hindi: Indian Languages Corpus": lambda: indian.tagged_sents(files="hindi.pos"), "Hindi: Indian Languages Corpus (simplified)": lambda: indian.tagged_sents(files="hindi.pos", tagset="simple"), "Portuguese: Floresta Corpus (Portugal)": lambda: floresta.tagged_sents(), "Portuguese: Floresta Corpus (Portugal, simplified)": lambda: floresta.tagged_sents(tagset="simple"), "Portuguese: MAC-MORPHO Corpus (Brazil)": lambda: mac_morpho.tagged_sents(), "Portuguese: MAC-MORPHO Corpus (Brazil, simplified)": lambda: mac_morpho.tagged_sents(tagset="simple"), "Spanish: CESS-ESP Corpus (simplified)": lambda: cess_esp.tagged_sents(tagset="simple"), } class ConcordanceSearchView(object): _BACKGROUND_COLOUR = "#FFF" # white # Colour of highlighted results _HIGHLIGHT_WORD_COLOUR = "#F00" # red _HIGHLIGHT_WORD_TAG = "HL_WRD_TAG"
#!/usr/bin/python
# -*- coding: utf-8 -*-
import nltk
from nltk.corpus import indian
from nltk.tag import tnt
from tabulate import tabulate

# Note: 'konkani.pos' is not part of the standard NLTK indian corpus download;
# the tagged file has to be placed in the corpus directory for this to run.
train_data = indian.tagged_sents('konkani.pos')
test_str = "सुनापरांत पयलें पान,सुनापरांत खेळ"

tnt_tagger = tnt.TnT()
tnt_tagger.train(train_data)
tagged = tnt_tagger.tag(nltk.word_tokenize(test_str))
print(tabulate(tagged))
def hindi_model():
    train_data = indian.tagged_sents('hindi.pos')
    tnt_pos_tagger = tnt.TnT()
    tnt_pos_tagger.train(train_data)
    return tnt_pos_tagger
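# Illustrative usage sketch (assumption, not from the original snippet): the
# tagger returned by hindi_model() can be trained once and reused for many
# sentences. Assumes nltk and nltk.corpus.indian are imported as elsewhere.
tagger = hindi_model()
tokens = nltk.word_tokenize("नई दिल्ली भारत की राजधानी है ।")
print(tagger.tag(tokens))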
"English: NPS Chat Corpus (simplified)": lambda: nps_chat.tagged_posts( tagset="universal" ), "English: Wall Street Journal Corpus": lambda: treebank.tagged_sents(), "English: Wall Street Journal Corpus (simplified)": lambda: treebank.tagged_sents( tagset="universal" ), "Chinese: Sinica Corpus": lambda: sinica_treebank.tagged_sents(), "Chinese: Sinica Corpus (simplified)": lambda: sinica_treebank.tagged_sents( tagset="universal" ), "Dutch: Alpino Corpus": lambda: alpino.tagged_sents(), "Dutch: Alpino Corpus (simplified)": lambda: alpino.tagged_sents( tagset="universal" ), "Hindi: Indian Languages Corpus": lambda: indian.tagged_sents(files="hindi.pos"), "Hindi: Indian Languages Corpus (simplified)": lambda: indian.tagged_sents( files="hindi.pos", tagset="universal" ), "Portuguese: Floresta Corpus (Portugal)": lambda: floresta.tagged_sents(), "Portuguese: Floresta Corpus (Portugal, simplified)": lambda: floresta.tagged_sents( tagset="universal" ), "Portuguese: MAC-MORPHO Corpus (Brazil)": lambda: mac_morpho.tagged_sents(), "Portuguese: MAC-MORPHO Corpus (Brazil, simplified)": lambda: mac_morpho.tagged_sents( tagset="universal" ), "Spanish: CESS-ESP Corpus (simplified)": lambda: cess_esp.tagged_sents( tagset="universal" ), }
        len(indian.raw(f)),
    )
    print(" No of words :-", len(indian.words(f)))
    print(" No of Sentences :-", len(indian.sents(f)))
    print()

print("Checking raw sentences of languages:-")
# print(indian.raw('bangla.pos'))
# print(indian.raw('hindi.pos'))
# print(indian.raw('marathi.pos'))
# print(indian.raw('telugu.pos'))

print("Printing & writing the sentences to a file, from Marathi language")
sentencesMarathi = open("marathiSentences.txt", "w")
# This will write each sentence as a space-joined list of words
for sentence in indian.sents('marathi.pos'):
    # print(sentence)
    sentencesMarathi.write(" ".join(sentence))

print("Printing & writing words to a file, from Bangla language")
wordsBangla = open("hindiWords.txt", "w")
# This will print and write each word of the corpus
for word in indian.words('bangla.pos'):
    print("===>", word, end=" ")
    wordsBangla.write(word + " ")

# Checking POS-tagged sentences for a particular language
for taggedSents in indian.tagged_sents('hindi.pos'):
    print(taggedSents)
    # print(end="")
    'English: NPS Chat Corpus (simplified)': lambda: nps_chat.tagged_posts(tagset='universal'),
    'English: Wall Street Journal Corpus': lambda: treebank.tagged_sents(),
    'English: Wall Street Journal Corpus (simplified)': lambda: treebank.tagged_sents(tagset='universal'),
    'Chinese: Sinica Corpus': lambda: sinica_treebank.tagged_sents(),
    'Chinese: Sinica Corpus (simplified)': lambda: sinica_treebank.tagged_sents(tagset='universal'),
    'Dutch: Alpino Corpus': lambda: alpino.tagged_sents(),
    'Dutch: Alpino Corpus (simplified)': lambda: alpino.tagged_sents(tagset='universal'),
    'Hindi: Indian Languages Corpus': lambda: indian.tagged_sents(files='hindi.pos'),
    'Hindi: Indian Languages Corpus (simplified)': lambda: indian.tagged_sents(files='hindi.pos', tagset='universal'),
    'Portuguese: Floresta Corpus (Portugal)': lambda: floresta.tagged_sents(),
    'Portuguese: Floresta Corpus (Portugal, simplified)': lambda: floresta.tagged_sents(tagset='universal'),
    'Portuguese: MAC-MORPHO Corpus (Brazil)': lambda: mac_morpho.tagged_sents(),
    'Portuguese: MAC-MORPHO Corpus (Brazil, simplified)': lambda: mac_morpho.tagged_sents(tagset='universal'),
    'Spanish: CESS-ESP Corpus (simplified)': lambda: cess_esp.tagged_sents(tagset='universal'),
}
"English: NPS Chat Corpus (simplified)": lambda: nps_chat.tagged_posts(tagset="universal"), "English: Wall Street Journal Corpus": lambda: treebank.tagged_sents(), "English: Wall Street Journal Corpus (simplified)": lambda: treebank.tagged_sents(tagset="universal"), "Chinese: Sinica Corpus": lambda: sinica_treebank.tagged_sents(), "Chinese: Sinica Corpus (simplified)": lambda: sinica_treebank.tagged_sents(tagset="universal"), "Dutch: Alpino Corpus": lambda: alpino.tagged_sents(), "Dutch: Alpino Corpus (simplified)": lambda: alpino.tagged_sents(tagset="universal"), "Hindi: Indian Languages Corpus": lambda: indian.tagged_sents(files="hindi.pos"), "Hindi: Indian Languages Corpus (simplified)": lambda: indian.tagged_sents(files="hindi.pos", tagset="universal"), "Portuguese: Floresta Corpus (Portugal)": lambda: floresta.tagged_sents(), "Portuguese: Floresta Corpus (Portugal, simplified)": lambda: floresta.tagged_sents(tagset="universal"), "Portuguese: MAC-MORPHO Corpus (Brazil)": lambda: mac_morpho.tagged_sents(), "Portuguese: MAC-MORPHO Corpus (Brazil, simplified)": lambda: mac_morpho.tagged_sents(tagset="universal"), "Spanish: CESS-ESP Corpus (simplified)": lambda: cess_esp.tagged_sents(tagset="universal"), }
sen = "".join([ " " + i if not i.startswith("'") and i not in string.punctuation else i for i in sen ]).strip() print(count, sen) print('Total sentences in the tagged file are', count) train_perc = .9 train_rows = int(train_perc * count) test_rows = train_rows + 1 print('Sentences to be trained', train_rows, 'Sentences to be tested against', test_rows) data = indian.tagged_sents(tagged_set) train_data = data[:train_rows] test_data = data[test_rows:] pos_tagger = tnt.TnT() pos_tagger.train(train_data) pos_tagger.evaluate(test_data) sentence_to_be_tagged = " नई दिल्ली ।" def tagthesentence(sentence): tokenizedSentence = nltk.word_tokenize(sentence) print(tokenizedSentence) print(pos_tagger.tag(tokenizedSentence)) return pos_tagger.tag(tokenizedSentence)
def nepali_model():
    train_data = indian.tagged_sents('<path/to/nepali.pos>')
    tnt_pos_tagger = tnt.TnT()
    tnt_pos_tagger.train(train_data)
    return tnt_pos_tagger
    'English: NPS Chat Corpus (simplified)': lambda: nps_chat.tagged_posts(tagset='simple'),
    'English: Wall Street Journal Corpus': lambda: treebank.tagged_sents(),
    'English: Wall Street Journal Corpus (simplified)': lambda: treebank.tagged_sents(tagset='simple'),
    'Chinese: Sinica Corpus': lambda: sinica_treebank.tagged_sents(),
    'Chinese: Sinica Corpus (simplified)': lambda: sinica_treebank.tagged_sents(tagset='simple'),
    'Dutch: Alpino Corpus': lambda: alpino.tagged_sents(),
    'Dutch: Alpino Corpus (simplified)': lambda: alpino.tagged_sents(tagset='simple'),
    'Hindi: Indian Languages Corpus': lambda: indian.tagged_sents(files='hindi.pos'),
    'Hindi: Indian Languages Corpus (simplified)': lambda: indian.tagged_sents(files='hindi.pos', tagset='simple'),
    'Portuguese: Floresta Corpus (Portugal)': lambda: floresta.tagged_sents(),
    'Portuguese: Floresta Corpus (Portugal, simplified)': lambda: floresta.tagged_sents(tagset='simple'),
    'Portuguese: MAC-MORPHO Corpus (Brazil)': lambda: mac_morpho.tagged_sents(),
    'Portuguese: MAC-MORPHO Corpus (Brazil, simplified)': lambda: mac_morpho.tagged_sents(tagset='simple'),
    'Spanish: CESS-ESP Corpus (simplified)': lambda: cess_esp.tagged_sents(tagset='simple'),
}

class ConcordanceSearchView(object):
from nltk.corpus import indian
from nltk.tag import tnt
from nltk.tokenize import sent_tokenize
import nltk
import json
import io

train_data = indian.tagged_sents('hindi.pos')
tnt_pos_tagger = tnt.TnT()
tnt_pos_tagger.train(train_data)

test_data = ("भारतीय संस्कृति में नारी के सम्मान को बहुत महत्व दिया गया है। नारी शिक्षा कहा गया है जंहा स्त्रियों की पूजा होती है वंहा देवता निवास करते हैं । प्राचीन काल से ही नारी को ‘गृह देवी’ या ‘गृह लक्ष्मी’ कहा जाता है ।")
ts = test_data.split("।")
# s = sent_tokenize(test_data)
# a = tnt_pos_tagger.tag(nltk.word_tokenize(test_data))

with io.open('op.txt', 'w', encoding='utf8') as json_file:
    json.dump(ts, json_file, ensure_ascii=False)
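# Illustrative continuation sketch (assumption, not in the original snippet):
# tag each "।"-delimited sentence with the trained TnT tagger before writing,
# mirroring the commented-out tnt_pos_tagger.tag(...) line above. The output
# filename 'tagged_op.txt' is an arbitrary placeholder.
tagged_sentences = [tnt_pos_tagger.tag(nltk.word_tokenize(s)) for s in ts if s.strip()]
with io.open('tagged_op.txt', 'w', encoding='utf8') as out_file:
    json.dump(tagged_sentences, out_file, ensure_ascii=False)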
    'English: NPS Chat Corpus (simplified)': lambda: nps_chat.tagged_posts(tagset='universal'),
    'English: Wall Street Journal Corpus': lambda: treebank.tagged_sents(),
    'English: Wall Street Journal Corpus (simplified)': lambda: treebank.tagged_sents(tagset='universal'),
    'Chinese: Sinica Corpus': lambda: sinica_treebank.tagged_sents(),
    'Chinese: Sinica Corpus (simplified)': lambda: sinica_treebank.tagged_sents(tagset='universal'),
    'Dutch: Alpino Corpus': lambda: alpino.tagged_sents(),
    'Dutch: Alpino Corpus (simplified)': lambda: alpino.tagged_sents(tagset='universal'),
    'Hindi: Indian Languages Corpus': lambda: indian.tagged_sents(files='hindi.pos'),
    'Hindi: Indian Languages Corpus (simplified)': lambda: indian.tagged_sents(files='hindi.pos', tagset='universal'),
    'Portuguese: Floresta Corpus (Portugal)': lambda: floresta.tagged_sents(),
    'Portuguese: Floresta Corpus (Portugal, simplified)': lambda: floresta.tagged_sents(tagset='universal'),
    'Portuguese: MAC-MORPHO Corpus (Brazil)': lambda: mac_morpho.tagged_sents(),
    'Portuguese: MAC-MORPHO Corpus (Brazil, simplified)': lambda: mac_morpho.tagged_sents(tagset='universal'),
    'Spanish: CESS-ESP Corpus (simplified)': lambda: cess_esp.tagged_sents(tagset='universal'),
}

class ConcordanceSearchView(object):
def __init__(self, lang='en'):
    self.lang = lang
    self.stopwords = None
    self.stemmer = None
    self.sentiment_analyzer = None
    self.text_processor = None
    INDIC_NLP_RESOURCES = r"../model/indic_nlp_resources/"
    common.set_resources_path(INDIC_NLP_RESOURCES)
    self.pos_tagger = None
    if lang == 'hi':
        self.ht = HindiTokenizer.Tokenizer()
        self.sentiment_analyzer = load_learner(path="../model/hi-sentiment")
        self.stopwords = [x.strip() for x in open("../data/stopwords.txt").readlines()]
        other_exclusions = ["#ff", "ff", "rt"]
        self.stopwords.extend(other_exclusions)
        self.stemmer = None
        self.text_processor = TextPreProcessor(
            # terms that will be normalized
            normalize=['url', 'email', 'percent', 'money', 'phone', 'user',
                       'time', 'url', 'date', 'number'],
            # terms that will be annotated
            annotate={"hashtag", "allcaps", "elongated", "repeated",
                      'emphasis', 'censored'},
            fix_html=True,  # fix HTML tokens
        )
        loader.load()
        train_data = indian.tagged_sents('hindi.pos')
        self.tnt_pos_tagger = tnt.TnT()
        self.tnt_pos_tagger.train(train_data)
    if lang == 'en':
        self.sentiment_analyzer = VS()
        self.stopwords = nltk.corpus.stopwords.words("english")
        other_exclusions = ["#ff", "ff", "rt"]
        self.stopwords.extend(other_exclusions)
        self.stemmer = PorterStemmer()
        self.text_processor = TextPreProcessor(
            # terms that will be normalized
            normalize=['url', 'email', 'percent', 'money', 'phone', 'user',
                       'time', 'url', 'date', 'number'],
            # terms that will be annotated
            annotate={"hashtag", "allcaps", "elongated", "repeated",
                      'emphasis', 'censored'},
            fix_html=True,  # fix HTML tokens
            # corpus from which the word statistics are going to be used
            # for word segmentation
            segmenter="twitter",
            # corpus from which the word statistics are going to be used
            # for spell correction
            corrector="twitter",
            unpack_hashtags=True,  # perform word segmentation on hashtags
            unpack_contractions=True,  # Unpack contractions (can't -> can not)
            spell_correct_elong=False,  # spell correction for elongated words
            # select a tokenizer. You can use SocialTokenizer, or pass your own;
            # the tokenizer should take a string as input and return a list of tokens
            tokenizer=SocialTokenizer(lowercase=True).tokenize,
            # list of dictionaries for replacing tokens extracted from the text
            # with other expressions. You can pass more than one dictionary.
            dicts=[emoticons, slang]
        )