Code Example #1
File: test.py Project: tilakpatidar/unl-enconverter
import nltk
from nltk.corpus import indian
from nltk.tag import tnt

def pos_tag(query):
	# Train a TnT tagger on NLTK's tagged Hindi corpus, then tag the query.
	train_data = indian.tagged_sents('hindi.pos')
	tnt_pos_tagger = tnt.TnT()
	tnt_pos_tagger.train(train_data)
	# test_data = "भारतीय संस्कृति में नारी के सम्मान को बहुत महत्व दिया गया है।"
	return tnt_pos_tagger.tag(nltk.word_tokenize(query))
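A minimal usage sketch, assuming the imports above and that the required data has been fetched (e.g. with nltk.download('indian') and nltk.download('punkt')):

# Illustrative call; the exact tags depend on the trained model.
print(pos_tag("भारतीय संस्कृति में नारी के सम्मान को बहुत महत्व दिया गया है।"))
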
Code Example #2
File: concordance_app.py Project: Bakuchi/naivebayes
            'English: NPS Chat Corpus (simplified)':
                lambda: nps_chat.tagged_posts(simplify_tags=True),
            'English: Wall Street Journal Corpus':
                lambda: treebank.tagged_sents(),
            'English: Wall Street Journal Corpus (simplified)':
                lambda: treebank.tagged_sents(simplify_tags=True),
            'Chinese: Sinica Corpus':
                lambda: sinica_treebank.tagged_sents(),
            'Chinese: Sinica Corpus (simplified)':
                lambda: sinica_treebank.tagged_sents(simplify_tags=True),
            'Dutch: Alpino Corpus':
                lambda: alpino.tagged_sents(),
            'Dutch: Alpino Corpus (simplified)':
                lambda: alpino.tagged_sents(simplify_tags=True),
            'Hindi: Indian Languages Corpus':
                lambda: indian.tagged_sents(files='hindi.pos'),
            'Hindi: Indian Languages Corpus (simplified)':
                lambda: indian.tagged_sents(files='hindi.pos', simplify_tags=True),
            'Portuguese: Floresta Corpus (Portugal)':
                lambda: floresta.tagged_sents(),
            'Portuguese: Floresta Corpus (Portugal, simplified)':
                lambda: floresta.tagged_sents(simplify_tags=True),
            'Portuguese: MAC-MORPHO Corpus (Brazil)':
                lambda: mac_morpho.tagged_sents(),
            'Portuguese: MAC-MORPHO Corpus (Brazil, simplified)':
                lambda: mac_morpho.tagged_sents(simplify_tags=True),
            'Spanish: CESS-ESP Corpus (simplified)':
                lambda: cess_esp.tagged_sents(simplify_tags=True),
           }

class ConcordanceSearchView(object):
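This menu dictionary (which recurs with small variations in several examples below) maps each display name to a zero-argument lambda, so a corpus is only loaded when the user actually selects it. A minimal sketch of the same deferred-loading pattern (the _CORPORA name is illustrative):

from nltk.corpus import treebank

_CORPORA = {
    'English: Wall Street Journal Corpus': lambda: treebank.tagged_sents(),
}

# Nothing is read from disk until the lambda is invoked.
sentences = _CORPORA['English: Wall Street Journal Corpus']()
print(sentences[0])
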
Code Example #3
import sys
reload(sys)
sys.setdefaultencoding("utf-8")  # Python 2 workaround for non-ASCII text

import nltk
from nltk import UnigramTagger
from nltk.corpus import brown, indian
from nltk.tag import tnt
from nltk.tokenize import sent_tokenize, word_tokenize

# English training data
train_sents = brown.tagged_sents()

# Hindi training data, used for training the TnT tagger
train_data_hindi = indian.tagged_sents('hindi.pos')[:-1]

tnt_pos_tagger = tnt.TnT()
tnt_pos_tagger.train(train_data_hindi)


def english_tag(eng_tweet):
    # Collect adjectives and adverbs as candidate sentiment features.
    word_features = []
    eng_tweet = nltk.word_tokenize(eng_tweet)
    for i, j in nltk.pos_tag(eng_tweet):
        if j in ['JJ', 'JJR', 'JJS', 'RB', 'RBR', 'RBS']:
            word_features.append(i)

    rating = 0
Code Example #4
import math
from nltk.corpus import indian

def train_hmm_pos_tagger():

    # Use NLTK's tagged Hindi sentences as training data.
    train_data = indian.tagged_sents('hindi.pos')

    # Frequency tables for emissions, tags, and tag transitions.
    word_tag_dict = {}
    tag_freq_dict = {}
    tag_tag_dict = {}

    tag_freq_dict['<S>'] = len(train_data)
    tag_freq_dict['</S>'] = len(train_data)

    # Count frequencies over the training corpus.
    for sentence in train_data:
        prev_tag = '<S>'
        for word_tag_pair in sentence:
            if word_tag_pair not in word_tag_dict:
                word_tag_dict[word_tag_pair] = 1
            else:
                word_tag_dict[word_tag_pair] += 1

            current_tag = word_tag_pair[1]
            if current_tag not in tag_freq_dict:
                tag_freq_dict[current_tag] = 1
            else:
                tag_freq_dict[current_tag] += 1

            tag_tag_pair = (prev_tag, current_tag)
            if tag_tag_pair not in tag_tag_dict:
                tag_tag_dict[tag_tag_pair] = 1
            else:
                tag_tag_dict[tag_tag_pair] += 1

            prev_tag = current_tag

        current_tag = '</S>'
        tag_tag_pair = (prev_tag, current_tag)
        if tag_tag_pair not in tag_tag_dict:
            tag_tag_dict[tag_tag_pair] = 1
        else:
            tag_tag_dict[tag_tag_pair] += 1

    # Note: the "test" sentence is drawn from the same corpus used for training.
    test_data = indian.tagged_sents('hindi.pos')
    sentence = test_data[0]

    prev_tag = '<S>'
    for pair in sentence:
        word = pair[0]
        max_prob = float('-inf')
        tag_assigned = None
        for current_tag in tag_freq_dict.keys():
            # Emission probability P(word | tag), with a crude fallback of
            # 1/count(tag) for unseen (word, tag) pairs.
            word_tag_pair = (word, current_tag)
            if word_tag_pair not in word_tag_dict:
                word_tag_prob = math.log(1.0 / tag_freq_dict[current_tag])
            else:
                word_tag_prob = math.log(float(word_tag_dict[word_tag_pair]) / float(tag_freq_dict[current_tag]))

            # Transition probability P(current_tag | prev_tag).
            prev_cur_tag_pair = (prev_tag, current_tag)
            if prev_cur_tag_pair not in tag_tag_dict:
                prev_cur_tag_prob = math.log(1.0 / tag_freq_dict[prev_tag])
            else:
                prev_cur_tag_prob = math.log(float(tag_tag_dict[prev_cur_tag_pair]) / float(tag_freq_dict[prev_tag]))

            # Best-case probability of transitioning onward from current_tag.
            max_freq = 1.0
            for future_tag in tag_freq_dict.keys():
                cur_future_tag_pair = (current_tag, future_tag)
                if cur_future_tag_pair in tag_tag_dict:
                    if tag_tag_dict[cur_future_tag_pair] > max_freq:
                        max_freq = tag_tag_dict[cur_future_tag_pair]

            cur_future_tag_prob = math.log(float(max_freq) / float(tag_freq_dict[current_tag]))
            total_prob = word_tag_prob + prev_cur_tag_prob + cur_future_tag_prob

            if total_prob > max_prob:
                max_prob = total_prob
                tag_assigned = current_tag

        prev_tag = tag_assigned

        print word
        print "Original tag : ",
        print pair[1]
        print "Tag assigned : ",
        print tag_assigned
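The per-word argmax above is a greedy approximation rather than full Viterbi decoding over the sentence. NLTK also ships a supervised HMM trainer; a minimal sketch on the same corpus (assuming the same imports plus nltk.tag.hmm):

from nltk.tag import hmm

trainer = hmm.HiddenMarkovModelTrainer()
hmm_tagger = trainer.train_supervised(indian.tagged_sents('hindi.pos'))
print(hmm_tagger.tag(['भारतीय', 'संस्कृति']))
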
Code Example #5
    "English: Brown Corpus (Religion, simplified)": lambda: brown.tagged_sents(categories="religion", tagset="simple"),
    "English: Brown Corpus (Learned, simplified)": lambda: brown.tagged_sents(categories="learned", tagset="simple"),
    "English: Brown Corpus (Science Fiction, simplified)": lambda: brown.tagged_sents(
        categories="science_fiction", tagset="simple"
    ),
    "English: Brown Corpus (Romance, simplified)": lambda: brown.tagged_sents(categories="romance", tagset="simple"),
    "English: Brown Corpus (Humor, simplified)": lambda: brown.tagged_sents(categories="humor", tagset="simple"),
    "English: NPS Chat Corpus": lambda: nps_chat.tagged_posts(),
    "English: NPS Chat Corpus (simplified)": lambda: nps_chat.tagged_posts(tagset="simple"),
    "English: Wall Street Journal Corpus": lambda: treebank.tagged_sents(),
    "English: Wall Street Journal Corpus (simplified)": lambda: treebank.tagged_sents(tagset="simple"),
    "Chinese: Sinica Corpus": lambda: sinica_treebank.tagged_sents(),
    "Chinese: Sinica Corpus (simplified)": lambda: sinica_treebank.tagged_sents(tagset="simple"),
    "Dutch: Alpino Corpus": lambda: alpino.tagged_sents(),
    "Dutch: Alpino Corpus (simplified)": lambda: alpino.tagged_sents(tagset="simple"),
    "Hindi: Indian Languages Corpus": lambda: indian.tagged_sents(files="hindi.pos"),
    "Hindi: Indian Languages Corpus (simplified)": lambda: indian.tagged_sents(files="hindi.pos", tagset="simple"),
    "Portuguese: Floresta Corpus (Portugal)": lambda: floresta.tagged_sents(),
    "Portuguese: Floresta Corpus (Portugal, simplified)": lambda: floresta.tagged_sents(tagset="simple"),
    "Portuguese: MAC-MORPHO Corpus (Brazil)": lambda: mac_morpho.tagged_sents(),
    "Portuguese: MAC-MORPHO Corpus (Brazil, simplified)": lambda: mac_morpho.tagged_sents(tagset="simple"),
    "Spanish: CESS-ESP Corpus (simplified)": lambda: cess_esp.tagged_sents(tagset="simple"),
}


class ConcordanceSearchView(object):
    _BACKGROUND_COLOUR = "#FFF"  # white

    # Colour of highlighted results
    _HIGHLIGHT_WORD_COLOUR = "#F00"  # red
    _HIGHLIGHT_WORD_TAG = "HL_WRD_TAG"
Code Example #6
File: posfile-test.py Project: asifbalgar/kok-nltk
#!/usr/bin/python
# -*- coding: utf-8 -*-

import nltk
from nltk.corpus import indian
from nltk.tag import tnt
from tabulate import tabulate


# konkani.pos is a tagged file supplied by this project, read through
# NLTK's "indian" corpus reader.
train_data = indian.tagged_sents('konkani.pos')
# Decode the byte-string literal using the source file's declared encoding.
test_str = "सुनापरांत पयलें पान,सुनापरांत खेळ".decode('utf-8')
tnt_tagger = tnt.TnT()
tnt_tagger.train(train_data)
tagged = tnt_tagger.tag(nltk.word_tokenize(test_str))

print tabulate(tagged)
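Note that konkani.pos is not among the files shipped with the stock NLTK indian corpus; you can list what is actually installed with fileids() (a quick check, output shown for the stock corpus):

from nltk.corpus import indian
print(indian.fileids())  # e.g. ['bangla.pos', 'hindi.pos', 'marathi.pos', 'telugu.pos']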

Code Example #7
from nltk.corpus import indian
from nltk.tag import tnt


def hindi_model():
    train_data = indian.tagged_sents('hindi.pos')
    tnt_pos_tagger = tnt.TnT()
    tnt_pos_tagger.train(train_data)
    return tnt_pos_tagger
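A minimal usage sketch (the sample sentence is illustrative):

import nltk

tagger = hindi_model()
print(tagger.tag(nltk.word_tokenize("भारतीय संस्कृति में नारी के सम्मान को बहुत महत्व दिया गया है।")))
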
Code Example #8
    "English: NPS Chat Corpus (simplified)": lambda: nps_chat.tagged_posts(
        tagset="universal"
    ),
    "English: Wall Street Journal Corpus": lambda: treebank.tagged_sents(),
    "English: Wall Street Journal Corpus (simplified)": lambda: treebank.tagged_sents(
        tagset="universal"
    ),
    "Chinese: Sinica Corpus": lambda: sinica_treebank.tagged_sents(),
    "Chinese: Sinica Corpus (simplified)": lambda: sinica_treebank.tagged_sents(
        tagset="universal"
    ),
    "Dutch: Alpino Corpus": lambda: alpino.tagged_sents(),
    "Dutch: Alpino Corpus (simplified)": lambda: alpino.tagged_sents(
        tagset="universal"
    ),
    "Hindi: Indian Languages Corpus": lambda: indian.tagged_sents(files="hindi.pos"),
    "Hindi: Indian Languages Corpus (simplified)": lambda: indian.tagged_sents(
        files="hindi.pos", tagset="universal"
    ),
    "Portuguese: Floresta Corpus (Portugal)": lambda: floresta.tagged_sents(),
    "Portuguese: Floresta Corpus (Portugal, simplified)": lambda: floresta.tagged_sents(
        tagset="universal"
    ),
    "Portuguese: MAC-MORPHO Corpus (Brazil)": lambda: mac_morpho.tagged_sents(),
    "Portuguese: MAC-MORPHO Corpus (Brazil, simplified)": lambda: mac_morpho.tagged_sents(
        tagset="universal"
    ),
    "Spanish: CESS-ESP Corpus (simplified)": lambda: cess_esp.tagged_sents(
        tagset="universal"
    ),
}
Code Example #9
        len(indian.raw(f)),
    )
    print("     No of words :-", len(indian.words(f)))
    print("     No of Sentences :-", len(indian.sents(f)))
print()

print("Checking raw sentences of languages:-")
# print(indian.raw(indian.raw('bangla.pos'))
# print(indian.raw(indian.raw('hindi.pos'))
# print(indian.raw(indian.raw('marathi.pos'))
# print(indian.raw(indian.raw('telugu.pos'))

print("Printing & writing the sentences to a file,  from Marathi language")
sentencesMarathi = open("marathiSentences.txt", "w")
# This will print sentence as a list of words
for sentence in indian.sents('marathi.pos'):
    #print(sentence)
    sentencesMarathi.write(" ".join(sentence))

print("Printing & writing words to a file, from Bangla language")
wordsBangla = open("hindiWords.txt", "w")
# This will print sentence as a list of words
for word in indian.words('bangla.pos'):
    print("===>", word, end=" ")
    wordsBangla.write(" ".join(word))

# checking POS tagged sentences for particular language
for taggedSents in indian.tagged_sents('hindi.pos'):
    print(taggedSents)
    #print(end="")
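If (word, tag) pairs are wanted without sentence boundaries, the same corpus reader also exposes tagged_words; a minimal sketch:

for word, tag in indian.tagged_words('hindi.pos')[:10]:
    print(word, tag)
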
Code Example #10
    'English: NPS Chat Corpus (simplified)':
    lambda: nps_chat.tagged_posts(tagset='universal'),
    'English: Wall Street Journal Corpus':
    lambda: treebank.tagged_sents(),
    'English: Wall Street Journal Corpus (simplified)':
    lambda: treebank.tagged_sents(tagset='universal'),
    'Chinese: Sinica Corpus':
    lambda: sinica_treebank.tagged_sents(),
    'Chinese: Sinica Corpus (simplified)':
    lambda: sinica_treebank.tagged_sents(tagset='universal'),
    'Dutch: Alpino Corpus':
    lambda: alpino.tagged_sents(),
    'Dutch: Alpino Corpus (simplified)':
    lambda: alpino.tagged_sents(tagset='universal'),
    'Hindi: Indian Languages Corpus':
    lambda: indian.tagged_sents(files='hindi.pos'),
    'Hindi: Indian Languages Corpus (simplified)':
    lambda: indian.tagged_sents(files='hindi.pos', tagset='universal'),
    'Portuguese: Floresta Corpus (Portugal)':
    lambda: floresta.tagged_sents(),
    'Portuguese: Floresta Corpus (Portugal, simplified)':
    lambda: floresta.tagged_sents(tagset='universal'),
    'Portuguese: MAC-MORPHO Corpus (Brazil)':
    lambda: mac_morpho.tagged_sents(),
    'Portuguese: MAC-MORPHO Corpus (Brazil, simplified)':
    lambda: mac_morpho.tagged_sents(tagset='universal'),
    'Spanish: CESS-ESP Corpus (simplified)':
    lambda: cess_esp.tagged_sents(tagset='universal'),
}

Code Example #11
    "English: NPS Chat Corpus (simplified)":
    lambda: nps_chat.tagged_posts(tagset="universal"),
    "English: Wall Street Journal Corpus":
    lambda: treebank.tagged_sents(),
    "English: Wall Street Journal Corpus (simplified)":
    lambda: treebank.tagged_sents(tagset="universal"),
    "Chinese: Sinica Corpus":
    lambda: sinica_treebank.tagged_sents(),
    "Chinese: Sinica Corpus (simplified)":
    lambda: sinica_treebank.tagged_sents(tagset="universal"),
    "Dutch: Alpino Corpus":
    lambda: alpino.tagged_sents(),
    "Dutch: Alpino Corpus (simplified)":
    lambda: alpino.tagged_sents(tagset="universal"),
    "Hindi: Indian Languages Corpus":
    lambda: indian.tagged_sents(files="hindi.pos"),
    "Hindi: Indian Languages Corpus (simplified)":
    lambda: indian.tagged_sents(files="hindi.pos", tagset="universal"),
    "Portuguese: Floresta Corpus (Portugal)":
    lambda: floresta.tagged_sents(),
    "Portuguese: Floresta Corpus (Portugal, simplified)":
    lambda: floresta.tagged_sents(tagset="universal"),
    "Portuguese: MAC-MORPHO Corpus (Brazil)":
    lambda: mac_morpho.tagged_sents(),
    "Portuguese: MAC-MORPHO Corpus (Brazil, simplified)":
    lambda: mac_morpho.tagged_sents(tagset="universal"),
    "Spanish: CESS-ESP Corpus (simplified)":
    lambda: cess_esp.tagged_sents(tagset="universal"),
}

Code Example #12
    sen = "".join([
        " " + i if not i.startswith("'") and i not in string.punctuation else i
        for i in sen
    ]).strip()
    print(count, sen)
print('Total sentences in the tagged file are', count)

train_perc = .9

train_rows = int(train_perc * count)
test_rows = count - train_rows

print('Sentences to be trained', train_rows, 'Sentences to be tested against',
      test_rows)

data = indian.tagged_sents(tagged_set)
train_data = data[:train_rows]
test_data = data[train_rows:]

pos_tagger = tnt.TnT()
pos_tagger.train(train_data)
print('Accuracy:', pos_tagger.evaluate(test_data))

sentence_to_be_tagged = " नई दिल्ली ।"


def tagthesentence(sentence):
    tokenizedSentence = nltk.word_tokenize(sentence)
    print(tokenizedSentence)
    # Tag once and reuse the result instead of tagging twice.
    tagged = pos_tagger.tag(tokenizedSentence)
    print(tagged)
    return tagged
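A quick usage sketch with the sentence defined above:

tagthesentence(sentence_to_be_tagged)
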
Code Example #13
def nepali_model():
    train_data = indian.tagged_sents('<path/to/nepali.pos>')
    tnt_pos_tagger = tnt.TnT()
    tnt_pos_tagger.train(train_data)
    return tnt_pos_tagger
Code Example #14
File: concordance_app.py Project: Arttii/TextBlob
            'English: NPS Chat Corpus (simplified)':
                lambda: nps_chat.tagged_posts(tagset='simple'),
            'English: Wall Street Journal Corpus':
                lambda: treebank.tagged_sents(),
            'English: Wall Street Journal Corpus (simplified)':
                lambda: treebank.tagged_sents(tagset='simple'),
            'Chinese: Sinica Corpus':
                lambda: sinica_treebank.tagged_sents(),
            'Chinese: Sinica Corpus (simplified)':
                lambda: sinica_treebank.tagged_sents(tagset='simple'),
            'Dutch: Alpino Corpus':
                lambda: alpino.tagged_sents(),
            'Dutch: Alpino Corpus (simplified)':
                lambda: alpino.tagged_sents(tagset='simple'),
            'Hindi: Indian Languages Corpus':
                lambda: indian.tagged_sents(files='hindi.pos'),
            'Hindi: Indian Languages Corpus (simplified)':
                lambda: indian.tagged_sents(files='hindi.pos', tagset='simple'),
            'Portuguese: Floresta Corpus (Portugal)':
                lambda: floresta.tagged_sents(),
            'Portuguese: Floresta Corpus (Portugal, simplified)':
                lambda: floresta.tagged_sents(tagset='simple'),
            'Portuguese: MAC-MORPHO Corpus (Brazil)':
                lambda: mac_morpho.tagged_sents(),
            'Portuguese: MAC-MORPHO Corpus (Brazil, simplified)':
                lambda: mac_morpho.tagged_sents(tagset='simple'),
            'Spanish: CESS-ESP Corpus (simplified)':
                lambda: cess_esp.tagged_sents(tagset='simple'),
           }

class ConcordanceSearchView(object):
Code Example #15
from nltk.corpus import indian
from nltk.tag import tnt
from nltk.tokenize import sent_tokenize
import nltk
import json
import io
train_data = indian.tagged_sents('hindi.pos')
tnt_pos_tagger = tnt.TnT()
tnt_pos_tagger.train(train_data) 
test_data = ("भारतीय संस्कृति में नारी के सम्मान को बहुत महत्व दिया गया है। नारी शिक्षा कहा गया है जंहा स्त्रियों की पूजा होती है वंहा देवता निवास करते हैं । प्राचीन काल से ही नारी को ‘गृह देवी’ या ‘गृह लक्ष्मी’ कहा जाता है ।")
# Split on the danda (।), the Devanagari sentence delimiter.
ts = test_data.split("।")
# s = sent_tokenize(test_data)

# a = tnt_pos_tagger.tag(nltk.word_tokenize(test_data))
with io.open('op.txt', 'w', encoding='utf8') as json_file:
    json.dump(ts, json_file, ensure_ascii=False)
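As written, the script saves only the raw sentence splits to op.txt; if the goal is to persist the POS tags, a sketch along these lines would dump the tagged tokens instead (op_tagged.txt is a hypothetical output name):

# Assumption: tag each danda-split sentence and save the tagged pairs.
tagged = [tnt_pos_tagger.tag(nltk.word_tokenize(s)) for s in ts if s.strip()]
with io.open('op_tagged.txt', 'w', encoding='utf8') as out_file:
    json.dump(tagged, out_file, ensure_ascii=False)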

Code Example #16
            'English: NPS Chat Corpus (simplified)':
                lambda: nps_chat.tagged_posts(tagset='universal'),
            'English: Wall Street Journal Corpus':
                lambda: treebank.tagged_sents(),
            'English: Wall Street Journal Corpus (simplified)':
                lambda: treebank.tagged_sents(tagset='universal'),
            'Chinese: Sinica Corpus':
                lambda: sinica_treebank.tagged_sents(),
            'Chinese: Sinica Corpus (simplified)':
                lambda: sinica_treebank.tagged_sents(tagset='universal'),
            'Dutch: Alpino Corpus':
                lambda: alpino.tagged_sents(),
            'Dutch: Alpino Corpus (simplified)':
                lambda: alpino.tagged_sents(tagset='universal'),
            'Hindi: Indian Languages Corpus':
                lambda: indian.tagged_sents(files='hindi.pos'),
            'Hindi: Indian Languages Corpus (simplified)':
                lambda: indian.tagged_sents(files='hindi.pos', tagset='universal'),
            'Portuguese: Floresta Corpus (Portugal)':
                lambda: floresta.tagged_sents(),
            'Portuguese: Floresta Corpus (Portugal, simplified)':
                lambda: floresta.tagged_sents(tagset='universal'),
            'Portuguese: MAC-MORPHO Corpus (Brazil)':
                lambda: mac_morpho.tagged_sents(),
            'Portuguese: MAC-MORPHO Corpus (Brazil, simplified)':
                lambda: mac_morpho.tagged_sents(tagset='universal'),
            'Spanish: CESS-ESP Corpus (simplified)':
                lambda: cess_esp.tagged_sents(tagset='universal'),
           }

class ConcordanceSearchView(object):
Code Example #17
    def __init__(self,lang='en'):
        self.lang = lang
        self.stopwords = None
        self.stemmer = None
        self.sentiment_analyzer = None
        self.text_processor = None
        INDIC_NLP_RESOURCES = r"../model/indic_nlp_resources/"
        common.set_resources_path(INDIC_NLP_RESOURCES)
        self.pos_tagger = None

        if lang == 'hi':
            self.ht = HindiTokenizer.Tokenizer()
            self.sentiment_analyzer = load_learner(path="../model/hi-sentiment")
            self.stopwords = [x.strip() for x in open("../data/stopwords.txt").readlines()]
            other_exclusions = ["#ff", "ff", "rt"]
            self.stopwords.extend(other_exclusions)
            self.stemmer = None
            self.text_processor = TextPreProcessor(
                # terms that will be normalized
                normalize=['url', 'email', 'percent', 'money', 'phone', 'user',
                    'time', 'date', 'number'],
                # terms that will be annotated
                annotate={"hashtag", "allcaps", "elongated", "repeated",
                    'emphasis', 'censored'},
                fix_html=True,  # fix HTML tokens
            )
            loader.load()
            train_data = indian.tagged_sents('hindi.pos')
            self.tnt_pos_tagger = tnt.TnT()
            self.tnt_pos_tagger.train(train_data)

        if lang == 'en':
            self.sentiment_analyzer = VS()
            self.stopwords = nltk.corpus.stopwords.words("english")
            other_exclusions = ["#ff", "ff", "rt"]
            self.stopwords.extend(other_exclusions)
            self.stemmer = PorterStemmer()
            self.text_processor = TextPreProcessor(
                # terms that will be normalized
                normalize=['url', 'email', 'percent', 'money', 'phone', 'user',
                    'time', 'date', 'number'],
                # terms that will be annotated
                annotate={"hashtag", "allcaps", "elongated", "repeated",
                    'emphasis', 'censored'},
                fix_html=True,  # fix HTML tokens

                # corpus from which the word statistics are going to be used 
                # for word segmentation 
                segmenter="twitter", 

                # corpus from which the word statistics are going to be used 
                # for spell correction
                corrector="twitter", 

                unpack_hashtags=True,  # perform word segmentation on hashtags
                unpack_contractions=True,  # Unpack contractions (can't -> can not)
                spell_correct_elong=False,  # spell correction for elongated words

                # select a tokenizer. You can use SocialTokenizer, or pass your own
                # the tokenizer, should take as input a string and return a list of tokens
                tokenizer=SocialTokenizer(lowercase=True).tokenize,

                # list of dictionaries, for replacing tokens extracted from the text,
                # with other expressions. You can pass more than one dictionaries.
                dicts=[emoticons, slang]
            )
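A hedged usage sketch for the English branch; the enclosing class name is not shown in this fragment, so Preprocessor below is a stand-in, while pre_process_doc is ekphrasis's standard entry point:

proc = Preprocessor(lang='en')  # hypothetical class name; substitute the real one
print(proc.text_processor.pre_process_doc("RT @user: can't wait for the new season!!! https://example.com"))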