Code Example #1
import itertools

from nltk.corpus.reader import TaggedCorpusReader
from nltk.tag.sequential import ClassifierBasedPOSTagger


def NER_HINDINBC():
    # Read every .pos file in the directory as one tagged Hindi corpus
    reader = TaggedCorpusReader('/python27/POS_9/', r'.*\.pos')
    f1 = reader.fileids()
    print "The Files of Corpus are:", f1
    sents = reader.tagged_sents()
    sentn = reader.sents()
    #words=sentn.split()
    ls = len(sents)
    #lw=len(words)
    print "Length of Corpus Is:", ls
    #print "The Words are:",lw
    # Hold out the first 30% of sentences for testing; train on the rest
    size1 = int(ls * 0.3)
    test_sents = sents[:size1]
    train_sents = sents[size1:]
    # ClassifierBasedPOSTagger trains a Naive Bayes classifier by default
    nbc_tagger = ClassifierBasedPOSTagger(train=train_sents)
    test = nbc_tagger.evaluate(test_sents)
    print "The Test Result is:", test
    #THE GIVEN INPUT
    given_sent = "नीतीश कुमार द्वारा भाजपा के साथ हाथ मिलाने से वहां का पूरा राजनीतिक परिदृश्‍य ही बदल गया है मगर शरद यादव इससे खुश नहीं हैं".decode(
        'utf-8')
    gsw = given_sent.split()
    tag_gs = nbc_tagger.tag(gsw)
    print "GIVEN SENT TAG:", tag_gs
    ftag_gs = " ".join(list(itertools.chain(*tag_gs)))
    print "And its flattened Version is:", ftag_gs
Code Example #2
    def load_corpus_reviews(self, begin, end):
        #reader = LazyCorpusLoader()
        reader = TaggedCorpusReader('data/', r'.*\.pos')

        # fileids() is returned in sorted order; this indexing assumes exactly
        # two files, with the negative-review file sorting first
        pos_fileids = reader.fileids()[1]
        neg_fileids = reader.fileids()[0]

        pos_sents = reader.tagged_sents(pos_fileids)
        neg_sents = reader.tagged_sents(neg_fileids)

        return (pos_sents[begin:end], neg_sents[begin:end])
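
A minimal usage sketch for the method above (the instance name `loader`, the slice bounds, and the follow-on train/test split are all illustrative):

# Hypothetical: 'loader' is an instance of the enclosing class
pos_sents, neg_sents = loader.load_corpus_reviews(0, 1000)
cut = int(len(pos_sents) * 0.75)  # 75/25 train/test split, for illustration
train_set = list(pos_sents[:cut]) + list(neg_sents[:cut])
test_set = list(pos_sents[cut:]) + list(neg_sents[cut:])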
Code Example #3
    def generate_corpus_from_segmented_reports(self):
        env = ReportEnviroments()
        # One file per report; every non-blank line is treated as a sentence
        new_corpus_of_segmented_reports = TaggedCorpusReader(
            env.segmented_reports_corpus_path,
            '.*',
            sent_tokenizer=LineTokenizer(blanklines='discard'),
            encoding='utf-8')
        raw_segmented_reports = []
        for fileid in new_corpus_of_segmented_reports.fileids():
            raw_segmented_reports.append(
                new_corpus_of_segmented_reports.sents(fileids=fileid))
        cut_of_segmented_reports = []
        topics = ['DISCENTE', 'DOCENTE', 'INFRAESTRUTURA', 'UNCATEGORIZED']
        # Keep the span from the first topic heading ('DISCENTE') through the
        # last one ('UNCATEGORIZED'), inclusive; each heading is expected to
        # appear as a one-word sentence of its own
        for report in raw_segmented_reports:
            start = report.index([topics[0].decode('utf-8')])
            stop = report.index([topics[-1].decode('utf-8')]) + 1
            cut_of_segmented_reports.append(report[start:stop])
        return cut_of_segmented_reports, topics
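
The .index() calls above require each topic heading to sit on a line of its own, so that the LineTokenizer surfaces it as a single-word sentence. A sketch of the report layout this implies (the file name and body lines are invented):

# relatorio_01.txt (hypothetical contents):
#   DISCENTE
#   ... student-related comments, one per line ...
#   DOCENTE
#   ... faculty-related comments ...
#   INFRAESTRUTURA
#   ... infrastructure comments ...
#   UNCATEGORIZED
#   ... everything else ...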
Code Example #4
from nltk.corpus.reader import TaggedCorpusReader


def read_sentences_corpus(reader=None):
    '''
    Create a corpus reader over the files in ../data/*.pos.
    These files contain tagged sentences and are the basis of the
    training and test sets.
    '''
    if reader is None:  # build a default reader only when none is passed in
        reader = TaggedCorpusReader('../data/', r'.*\.pos')

    # fileids() is returned in sorted order; this indexing assumes exactly
    # two files, with the negative file sorting first
    pos_fileids = reader.fileids()[1]
    neg_fileids = reader.fileids()[0]

    pos_sents = reader.tagged_sents(pos_fileids)
    neg_sents = reader.tagged_sents(neg_fileids)

    #pos_sents = [[(word.lower(),tag) for word,tag in sent if word not in stopwords.words('english')] for sent in pos_sents ]
    #neg_sents = [[(word.lower(),tag) for word,tag in sent if word not in stopwords.words('english')] for sent in neg_sents ]

    return (pos_sents, neg_sents)
Code Example #5
File: NaiveBayesForNLP.py Project: richzw/CodeHome
import re

import nltk
from nltk.corpus.reader import TaggedCorpusReader


class Classifier:
    def __init__(self, root, keyWords, devRoot):
        self.__root__ = root
        self.__keyWords__ = keyWords
        self.__corpus__ = None
        self.__classifier__ = None
        self.__dev_corpus__ = None
        self.__dev_root__ = devRoot

    def initClassifier(self):
        self.__corpus__ = TaggedCorpusReader(self.__root__, r'.*\.txt', sep='#')
        self.__dev_corpus__ = TaggedCorpusReader(self.__dev_root__,
                                                 r'.*\.txt',
                                                 sep='#')

    def separateSentence(self):
        # Chunk runs of tokens into NP, then chink out punctuation (PU) tokens
        grammar = r"""
        NP:
            {<.*>+}
            }<PU>{
        """
        return nltk.RegexpParser(grammar)

    def separateParagraphByReg(self, parag):
        '''
        :return: a list of sentences separated by (,|。) in this paragraph
        :param parag: the paragraph before segmentation
        :type parag: string
        '''
        grammar = re.compile(',|。')
        return grammar.split(parag)

    def updateFeatures(self, src, dest):
        # Merge a feature dict into dest: keep true booleans, sum the counts
        for key, val in src.items():
            if isinstance(val, bool) and val:
                dest[key] = val
            elif isinstance(val, int):
                if key in dest:
                    dest[key] += val
                else:
                    dest[key] = val

    def training(self):
        trainSet = []
        for file in self.__corpus__.fileids():
            trainingData = re.match(r"[a-z]+", file)
            if trainingData is None:
                continue  # skip files that are not training data
            sentences = self.__corpus__.tagged_sents(file)
            features = {}
            for sent in sentences:
                tree = self.separateSentence().parse(sent)
                for subtree in tree.subtrees(lambda t: t.label() == 'NP'):
                    subfea = self.salespersonFeature(
                        list(subtree))  # [(word, tag)]
                    self.updateFeatures(subfea, features)
            print(features)
            # the lowercase prefix of the filename serves as the class label
            trainSet.append((features, trainingData.group(0)))
        self.__classifier__ = nltk.NaiveBayesClassifier.train(trainSet)

    def salespersonFeature(self, sent):
        features = {}
        words = [word for (word, tag) in sent]
        for w in self.__keyWords__:
            features["count(%s)" % w] = words.count(w)
            features["has(%s)" % w] = (w in words)
        return features

    def distinguishSalesFromTagfile(self, tagfile):
        sents = self.__corpus__.tagged_sents(tagfile)
        feas = {}
        for sent in sents:
            tree = self.separateSentence().parse(sent)
            for subtree in tree.subtrees(lambda t: t.label() == 'NP'):
                subfea = self.salespersonFeature(list(subtree))
                self.updateFeatures(subfea, feas)
        return self.__classifier__.classify(feas)

    def testClassifierAccuracy(self):
        testFea = []
        for file in self.__dev_corpus__.fileids():
            testingData = re.match(r"[a-z]+", file)
            if testingData is None:
                continue  # skip files that are not test data
            sentences = self.__dev_corpus__.tagged_sents(file)
            features = {}
            for sent in sentences:
                tree = self.separateSentence().parse(sent)
                for subtree in tree.subtrees(lambda t: t.label() == 'NP'):
                    subfea = self.salespersonFeature(list(subtree))
                    self.updateFeatures(subfea, features)
            testFea.append((features, testingData.group(0)))
        return nltk.classify.accuracy(self.__classifier__, testFea)
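
A minimal usage sketch for the class above (the paths, keywords, and file name are hypothetical; the corpus files are word#TAG text, and the lowercase prefix of each filename is its label):

# Hypothetical paths and keywords, for illustration only
clf = Classifier(root='corpus/train/',
                 keyWords=[u'销售', u'客户', u'价格'],
                 devRoot='corpus/dev/')
clf.initClassifier()
clf.training()
print(clf.testClassifierAccuracy())
print(clf.distinguishSalesFromTagfile('sales_001.txt'))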
Code Example #6
File: CustomCorpora.py Project: AbhideepRND/NLTK
import nltk.data
from nltk.corpus.reader import WordListCorpusReader
from nltk.corpus import names
from nltk.corpus.reader import TaggedCorpusReader
from nltk.tokenize import SpaceTokenizer
from nltk.corpus import treebank

wordlist = WordListCorpusReader("C:/nltk_data/corpora/cookbook", ['wordlist'])
print(wordlist.words())
print(wordlist.fileids())

print(names.fileids())
print(len(names.words('male.txt')))

reader = TaggedCorpusReader("C:/nltk_data/corpora/treebank/tagged",
                            r'.*\.pos',
                            word_tokenizer=SpaceTokenizer(),
                            tagset='en-brown')
print(reader.words('wsj_0001.pos'))
print(reader.tagged_words('wsj_0001.pos'))
print(reader.tagged_sents('wsj_0001.pos'))
print(reader.tagged_paras('wsj_0001.pos'))
print(reader.fileids())

print("\n")
print(reader.tagged_words('wsj_0001.pos', tagset='universal'))

print(treebank.tagged_words())
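
A note on the tagset='en-brown' argument above: it declares the corpus's native tagset so that read methods called with tagset='universal' (as in the second tagged_words() call) can map the Brown-style tags onto NLTK's simplified universal tagset (NOUN, VERB, ...); without it there is no mapping to apply.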
Code Example #7
File: NERHINDI1.py Project: subhabangalore/ML-Codes
import itertools
from collections import Counter

import dill
import nltk
from nltk.corpus.reader import TaggedCorpusReader


def NER_HINDI():
    # Read every .pos file in the directory as one tagged Hindi corpus
    reader = TaggedCorpusReader('/python27/POS_9/', r'.*\.pos')
    f1 = reader.fileids()
    print "The Files of Corpus are:", f1
    sents = reader.tagged_sents()
    sentn = reader.sents()
    #words=sentn.split()
    ls = len(sents)
    #lw=len(words)
    print "Length of Corpus Is:", ls
    #print "The Words are:",lw
    # Hold out the first 30% of sentences for testing; train on the rest
    size1 = int(ls * 0.3)
    test_sents = sents[:size1]
    train_sents = sents[size1:]
    hmm_tagger = nltk.HiddenMarkovModelTagger.train(train_sents)
    hmm_tagger.test(test_sents)  # prints its accuracy figures directly
    #THE GIVEN INPUT
    given_sent = "नीतीश कुमार द्वारा भाजपा के साथ हाथ मिलाने से वहां का पूरा राजनीतिक परिदृश्‍य ही बदल गया है मगर शरद यादव इससे खुश नहीं हैं".decode(
        'utf-8')
    gsw = given_sent.split()
    tag_gs = hmm_tagger.tag(gsw)
    print "GIVEN SENT TAG:", tag_gs
    ftag_gs = " ".join(list(itertools.chain(*tag_gs)))
    print "And its flattened Version is:", ftag_gs
    #PERSIST THE TRAINED TAGGER WITH dill, THEN RELOAD IT
    with open('HINDIHMMNER1.dill', 'wb') as f:
        dill.dump(hmm_tagger, f)
    with open('HINDIHMMNER1.dill', 'rb') as f:
        hmm_tagger1 = dill.load(f)

    # Re-tag the whole corpus (training sentences included) with the reloaded
    # tagger and collect the predicted and gold tag sequences
    test_tags = [
        tag for sent in reader.sents() for (word, tag) in hmm_tagger1.tag(sent)
    ]
    gold_tags = [tag for (word, tag) in reader.tagged_words()]
    ltesttag = len(test_tags)
    lgtags = len(gold_tags)
    print "Test Tag Len:", ltesttag
    print "Gold Tag Len:", lgtags
    cm = nltk.ConfusionMatrix(gold_tags, test_tags)
    print(cm.pretty_format(sort_by_count=True, show_percents=False,
                           truncate=5))
    labels = set('NA GPE PERS DATE ORG'.split())  #THE TAG SETS AS GENERATED IN CONFUSION MATRIX
    true_positives = Counter()
    false_negatives = Counter()
    false_positives = Counter()
    for i in labels:
        for j in labels:
            if i == j:
                true_positives[i] += cm[i, j]
            else:
                false_negatives[i] += cm[i, j]
                false_positives[j] += cm[i, j]
    print "TP:", sum(true_positives.values()), true_positives
    print "FN:", sum(false_negatives.values()), false_negatives
    print "FP:", sum(false_positives.values()), false_positives
    print

    for i in sorted(labels):
        if true_positives[i] == 0:
            fscore = 0
        else:
            precision = true_positives[i] / float(true_positives[i] +
                                                  false_positives[i])
            recall = true_positives[i] / float(true_positives[i] +
                                               false_negatives[i])
            fscore = 2 * (precision * recall) / float(precision + recall)
        # report a percentage for every label, including zero-score ones
        print "TAG:", i, "FMEASURE:", fscore * 100