def NER_HINDINBC(): reader = TaggedCorpusReader('/python27/POS_9/', r'.*\.pos') f1 = reader.fileids() print "The Files of Corpus are:", f1 sents = reader.tagged_sents() sentn = reader.sents() #words=sentn.split() ls = len(sents) #lw=len(words) print "Length of Corpus Is:", ls #print "The Words are:",lw size1 = int(ls * 0.3) test_sents = sents[:size1] train_sents = sents[size1:] nbc_tagger = ClassifierBasedPOSTagger(train=train_sents) test = nbc_tagger.evaluate(test_sents) print "The Test Result is:", test #THE GIVEN INPUT given_sent = "नीतीश कुमार द्वारा भाजपा के साथ हाथ मिलाने से वहां का पूरा राजनीतिक परिदृश्य ही बदल गया है मगर शरद यादव इससे खुश नहीं हैं".decode( 'utf-8') gsw = given_sent.split() tag_gs = nbc_tagger.tag(gsw) print "GIVEN SENT TAG:", tag_gs ftag_gs = " ".join(list(itertools.chain(*tag_gs))) print "And its flattened Version is:", ftag_gs
def load_corpus_reviews(self, begin, end):
    """Load the tagged review corpus from data/*.pos and return the pair
    (positive_sents[begin:end], negative_sents[begin:end]).

    NOTE(review): fileids()[1] / fileids()[0] assumes a fixed two-file
    corpus where index 1 is the positive file and index 0 the negative
    one -- confirm against the data/ directory layout.
    """
    corpus = TaggedCorpusReader('data/', r'.*\.pos')
    file_list = corpus.fileids()
    positive = corpus.tagged_sents(file_list[1])
    negative = corpus.tagged_sents(file_list[0])
    return (positive[begin:end], negative[begin:end])
def generate_corpus_from_segmented_reports(self):
    """Load the segmented-reports corpus and, for each report file, keep
    only the sentence span running from the 'DISCENTE' marker line up to
    and including the 'UNCATEGORIZED' marker line.

    Returns (report_slices, topics).  Python 2 code (str.decode on the
    topic markers).
    """
    env = ReportEnviroments()
    corpus = TaggedCorpusReader(
        env.segmented_reports_corpus_path,
        '.*',
        sent_tokenizer=LineTokenizer(blanklines='discard'),
        encoding='utf-8')
    topics = ['DISCENTE', 'DOCENTE', 'INFRAESTRUTURA', 'UNCATEGORIZED']
    # Each marker is a whole one-word "sentence" in the tokenized report.
    first_marker = [topics[0].decode('utf-8')]
    last_marker = [topics[-1].decode('utf-8')]
    report_slices = []
    for fileid in corpus.fileids():
        sentences = corpus.sents(fileids=fileid)
        start = sentences.index(first_marker)
        stop = sentences.index(last_marker) + 1
        report_slices.append(sentences[start:stop])
    return report_slices, topics
def generate_corpus_from_segmented_reports(self):
    """Build the segmented-reports corpus and trim every report down to the
    block of sentences between its 'DISCENTE' marker and its
    'UNCATEGORIZED' marker (both inclusive).

    Returns (trimmed_reports, topics).  Python 2 code.
    """
    environment = ReportEnviroments()
    reader = TaggedCorpusReader(environment.segmented_reports_corpus_path,
                                '.*',
                                sent_tokenizer=LineTokenizer(blanklines='discard'),
                                encoding='utf-8')
    # One (lazy) sentence list per file in the corpus.
    all_reports = [reader.sents(fileids=name) for name in reader.fileids()]
    topics = ['DISCENTE', 'DOCENTE', 'INFRAESTRUTURA', 'UNCATEGORIZED']
    head = [topics[0].decode('utf-8')]
    tail = [topics[-1].decode('utf-8')]
    trimmed = [report[report.index(head):report.index(tail) + 1]
               for report in all_reports]
    return trimmed, topics
def read_sentences_corpus(reader=None):
    """Return (pos_sents, neg_sents), the tagged sentences of the positive
    and negative review files.

    :param reader: optional pre-built corpus reader exposing ``fileids()``
        and ``tagged_sents(fileid)``.  When omitted, a TaggedCorpusReader
        over ../data/*.pos is created.

    Bug fix: the original unconditionally rebuilt the reader, silently
    discarding a caller-supplied one (its own comment noted the override).
    """
    if reader is None:
        # Default corpus: the tagged sentence files in ../data/*.pos,
        # the basis of the training and test sets.
        reader = TaggedCorpusReader('../data/', r'.*\.pos')
    # NOTE(review): positional fileids (index 1 = positive, index 0 =
    # negative) assume a fixed two-file layout -- confirm against ../data/.
    pos_fileids = reader.fileids()[1]
    neg_fileids = reader.fileids()[0]
    pos_sents = reader.tagged_sents(pos_fileids)
    neg_sents = reader.tagged_sents(neg_fileids)
    return (pos_sents, neg_sents)
class Classifier:
    """Keyword-based Naive Bayes document classifier over '#'-separated
    tagged corpora.

    Files whose names begin with a lowercase-letter prefix are used; the
    prefix itself (e.g. 'sales') is taken as the class label.
    """

    def __init__(self, root, keyWords, devRoot):
        self.__root__ = root              # training corpus directory
        self.__keyWords__ = keyWords      # keywords driving the features
        self.__corpus__ = None
        self.__classifier__ = None
        self.__dev_corpus__ = None
        self.__dev_root__ = devRoot       # development corpus directory

    def initClassifier(self):
        """Open the training and development corpora ('#' separates word/tag)."""
        self.__corpus__ = TaggedCorpusReader(self.__root__, '.*\.txt', sep='#')
        self.__dev_corpus__ = TaggedCorpusReader(self.__dev_root__, '.*\.txt', sep='#')

    def separateSentence(self):
        """Return a chunk parser grouping all tokens into NPs, split at PU tags."""
        grammar = r"""
        NP: {<.*>+}
        }<PU>{
        """
        return nltk.RegexpParser(grammar)

    def separateParagraphByReg(self, parag):
        """Split *parag* at ASCII comma or CJK full stop.

        :param parag: the paragraph before segmentation
        :type parag: string
        :return: a list of sentence fragments
        """
        splitter = re.compile(',|。')
        return splitter.split(parag)

    def updateFeatures(self, src, dest):
        """Merge feature dict *src* into *dest*: True booleans are copied,
        integer counts are accumulated."""
        for name, value in src.items():
            kind = type(value).__name__
            if kind == 'bool' and value:
                dest[name] = value
            elif kind == 'int':
                dest[name] = dest.get(name, 0) + value

    def training(self):
        """Train the Naive Bayes classifier from the training corpus.

        The leading lowercase prefix of each file name is the class label;
        files without such a prefix are skipped.
        """
        trainSet = []
        for fileid in self.__corpus__.fileids():
            label_match = re.match(r"[a-z]+", fileid)
            if label_match is None:
                continue  # skip the non training data
            features = {}
            for sent in self.__corpus__.tagged_sents(fileid):
                tree = self.separateSentence().parse(sent)
                for subtree in tree.subtrees(lambda t: t.label() == 'NP'):
                    self.updateFeatures(
                        self.salespersonFeature(list(subtree)), features)
            print(features)
            trainSet.append((features, label_match.group(0)))
        self.__classifier__ = nltk.NaiveBayesClassifier.train(trainSet)

    def salespersonFeature(self, sent):
        """Count/presence features for every keyword over the words of *sent*
        (a list of (word, tag) pairs)."""
        words = [w for (w, t) in sent]
        features = {}
        for keyword in self.__keyWords__:
            features["count(%s)" % keyword] = words.count(keyword)
            features["has(%s)" % keyword] = (keyword in words)
        return features

    def distinguishSalesFromTagfile(self, tagfile):
        """Classify one tagged file from the training corpus."""
        feas = {}
        for sent in self.__corpus__.tagged_sents(tagfile):
            tree = self.separateSentence().parse(sent)
            for subtree in tree.subtrees(lambda t: t.label() == 'NP'):
                self.updateFeatures(self.salespersonFeature(list(subtree)), feas)
        return self.__classifier__.classify(feas)

    def testClassifierAccuracy(self):
        """Accuracy of the trained classifier on the development corpus."""
        testFea = []
        for fileid in self.__dev_corpus__.fileids():
            label_match = re.match(r"[a-z]+", fileid)
            if label_match is None:
                continue  # skip the non testing data
            features = {}
            for sent in self.__dev_corpus__.tagged_sents(fileid):
                tree = self.separateSentence().parse(sent)
                for subtree in tree.subtrees(lambda t: t.label() == 'NP'):
                    self.updateFeatures(
                        self.salespersonFeature(list(subtree)), features)
            testFea.append((features, label_match.group(0)))
        return nltk.classify.accuracy(self.__classifier__, testFea)
# Demonstration of NLTK corpus readers: word lists, the names corpus,
# and a space-tokenized tagged corpus (Brown tagset).
import nltk.data
from nltk.corpus.reader import WordListCorpusReader
from nltk.corpus import names
from nltk.corpus.reader import TaggedCorpusReader
from nltk.tokenize import SpaceTokenizer
from nltk.corpus import treebank

# A plain word-list corpus read from disk.
wordlist = WordListCorpusReader("C:/nltk_data/corpora/cookbook", ['wordlist'])
print(wordlist.words())
print(wordlist.fileids())

# The built-in 'names' corpus.
print(names.fileids())
print(len(names.words('male.txt')))

# A tagged corpus whose words are separated by single spaces.
reader = TaggedCorpusReader("C:/nltk_data/corpora/treebank/tagged",
                            r'.*\.pos',
                            word_tokenizer=SpaceTokenizer(),
                            tagset='en-brown')
print(reader.words('wsj_0001.pos'))
print(reader.tagged_words('wsj_0001.pos'))
print(reader.tagged_sents('wsj_0001.pos'))
print(reader.tagged_paras('wsj_0001.pos'))
print(reader.fileids())
print("\n")
# Map the corpus tags onto the universal tagset for comparison.
print(reader.tagged_words('wsj_0001.pos', tagset='universal'))
print(treebank.tagged_words())
class Classifier:
    """Naive Bayes classifier for '#'-separated tagged files, using keyword
    count/presence features extracted from NP chunks.

    The lowercase prefix of a file name serves as its class label.
    """

    def __init__(self, root, keyWords, devRoot):
        self.__root__ = root          # training corpus directory
        self.__keyWords__ = keyWords  # feature keywords
        self.__corpus__ = None
        self.__classifier__ = None
        self.__dev_corpus__ = None
        self.__dev_root__ = devRoot   # development corpus directory

    def initClassifier(self):
        """Create readers for the training and development corpora."""
        self.__corpus__ = TaggedCorpusReader(self.__root__, '.*\.txt', sep='#')
        self.__dev_corpus__ = TaggedCorpusReader(self.__dev_root__, '.*\.txt', sep='#')

    def separateSentence(self):
        """Return a chunker that lumps every token run into an NP and
        splits the chunks at PU (punctuation) tags."""
        return nltk.RegexpParser(r"""
        NP: {<.*>+}
        }<PU>{
        """)

    def separateParagraphByReg(self, parag):
        """Split *parag* on ',' or '。' and return the pieces.

        :param parag: the paragraph before segmentation
        :type parag: string
        :return: a list of sentences
        """
        return re.compile(',|。').split(parag)

    def updateFeatures(self, src, dest):
        """Fold feature dict *src* into *dest*: set booleans are copied,
        integer counts are summed."""
        for key, val in src.items():
            # Exact-type checks: bools must not fall into the int branch.
            if type(val) is bool and val:
                dest[key] = val
            elif type(val) is int:
                if key in dest:
                    dest[key] += val
                else:
                    dest[key] = val

    def _collectFeatures(self, corpus, fileid):
        """Accumulate keyword features over every NP chunk in *fileid*."""
        features = {}
        for sent in corpus.tagged_sents(fileid):
            chunked = self.separateSentence().parse(sent)
            for np in chunked.subtrees(lambda t: t.label() == 'NP'):
                self.updateFeatures(self.salespersonFeature(list(np)), features)
        return features

    def training(self):
        """Build the Naive Bayes model.  Files without a lowercase-prefix
        label are skipped."""
        trainSet = []
        for fileid in self.__corpus__.fileids():
            if re.match(r"[a-z]+", fileid) is None:
                continue  # skip the non training data
            features = self._collectFeatures(self.__corpus__, fileid)
            print(features)
            trainSet.append((features, re.match(r"[a-z]+", fileid).group(0)))
        self.__classifier__ = nltk.NaiveBayesClassifier.train(trainSet)

    def salespersonFeature(self, sent):
        """Return count(...) and has(...) features for every keyword over
        the words of *sent* (a list of (word, tag) pairs)."""
        tokens = [word for (word, tag) in sent]
        feats = {}
        for kw in self.__keyWords__:
            feats["count(%s)" % kw] = tokens.count(kw)
            feats["has(%s)" % kw] = (kw in tokens)
        return feats

    def distinguishSalesFromTagfile(self, tagfile):
        """Label a single tagged file with the trained classifier."""
        return self.__classifier__.classify(
            self._collectFeatures(self.__corpus__, tagfile))

    def testClassifierAccuracy(self):
        """Evaluate the trained classifier on the dev corpus; return accuracy."""
        testFea = []
        for fileid in self.__dev_corpus__.fileids():
            match = re.match(r"[a-z]+", fileid)
            if match is None:
                continue  # skip the non testing data
            testFea.append((self._collectFeatures(self.__dev_corpus__, fileid),
                            match.group(0)))
        return nltk.classify.accuracy(self.__classifier__, testFea)
def NER_HINDI(): reader = TaggedCorpusReader('/python27/POS_9/', r'.*\.pos') f1 = reader.fileids() print "The Files of Corpus are:", f1 sents = reader.tagged_sents() sentn = reader.sents() #words=sentn.split() ls = len(sents) #lw=len(words) print "Length of Corpus Is:", ls #print "The Words are:",lw size1 = int(ls * 0.3) test_sents = sents[:size1] train_sents = sents[size1:] hmm_tagger = nltk.HiddenMarkovModelTagger.train(train_sents) test = hmm_tagger.test(test_sents) #THE GIVEN INPUT given_sent = "नीतीश कुमार द्वारा भाजपा के साथ हाथ मिलाने से वहां का पूरा राजनीतिक परिदृश्य ही बदल गया है मगर शरद यादव इससे खुश नहीं हैं".decode( 'utf-8') gsw = given_sent.split() tag_gs = hmm_tagger.tag(gsw) print "GIVEN SENT TAG:", tag_gs ftag_gs = " ".join(list(itertools.chain(*tag_gs))) print "And its flattened Version is:", ftag_gs #INPUT FROM FILE with open('HINDIHMMNER1.dill', 'wb') as f: dill.dump(hmm_tagger, f) with open('HINDIHMMNER1.dill', 'rb') as f: hmm_tagger1 = dill.load(f) test_tags = [ tag for sent in reader.sents() for (word, tag) in hmm_tagger1.tag(sent) ] gold_tags = [tag for (word, tag) in reader.tagged_words()] ltesttag = len(test_tags) lgtags = len(gold_tags) print "Test Tag Len:", ltesttag print "Gold Tag Len:", lgtags cm = nltk.ConfusionMatrix(gold_tags, test_tags) print(cm.pretty_format(sort_by_count=True, show_percents=False, truncate=5)) labels = set('NA GPE PERS DATE ORG'.split() ) #THE TAG SETS AS GENERATED IN CONFUSION MATRIX true_positives = Counter() false_negatives = Counter() false_positives = Counter() for i in labels: for j in labels: if i == j: true_positives[i] += cm[i, j] else: false_negatives[i] += cm[i, j] false_positives[j] += cm[i, j] print "TP:", sum(true_positives.values()), true_positives print "FN:", sum(false_negatives.values()), false_negatives print "FP:", sum(false_positives.values()), false_positives print for i in sorted(labels): if true_positives[i] == 0: fscore = 0 else: precision = true_positives[i] / float(true_positives[i] + 
false_positives[i]) recall = true_positives[i] / float(true_positives[i] + false_negatives[i]) fscore = 2 * (precision * recall) / float(precision + recall) fscore1 = fscore * 100 print "TAG:", i, "FMEASURE:", fscore1