def cltk_pos_cv(full_training_set, local_dir_rel, counter):
    local_dir = os.path.expanduser(local_dir_rel)
    # redirect stdout to a per-fold log file
    stdout_old = sys.stdout
    sys.stdout = open(os.path.join(local_dir, 'test_%d.out' % counter), 'w')
    # read POS corpora
    print("local_dir", local_dir)
    train_reader = TaggedCorpusReader(local_dir, 'train_%d.pos' % counter)
    train_sents = train_reader.tagged_sents()
    test_reader = TaggedCorpusReader(local_dir, 'test_%d.pos' % counter)
    test_sents = test_reader.tagged_sents()
    print('Loop #' + str(counter))
    sys.stdout.flush()
    # make crf tagger
    crf_tagger = CRFTagger()
    crf_tagger.train(train_sents, 'model.crf.tagger')
    #crf_tagger = UnigramTagger(train_sents)
    # evaluate crf tagger
    crf_accuracy = crf_tagger.evaluate(test_sents)
    print('crf:', crf_accuracy)
    # restore stdout
    sys.stdout = stdout_old
def __init__(self, corpusroot, corpusname):
    # use a custom wordlist corpus via WordListCorpusReader
    #wordlist = WordListCorpusReader(corpus_root, ['wordlist.txt'])
    # use a custom wordlist corpus via PlaintextCorpusReader
    #wordlist = PlaintextCorpusReader(corpus_root, 'wordlist.txt')
    reader = TaggedCorpusReader(corpusroot, corpusname)
    # note: the test slice overlaps the training data here
    self.reader_train = reader.tagged_sents()
    self.test_sent = reader.tagged_sents()[1000:]
def get_brill_tagger(self):
    train_data = TaggedCorpusReader('.', 'tagged_input_sentences.txt', sep="/")
    traindata = list(train_data.tagged_sents())
    postag = load('taggers/maxent_treebank_pos_tagger/english.pickle')
    # contextual templates: surrounding POS tags and words at various offsets
    templates = [
        brill.Template(brill.Pos([-1])),
        brill.Template(brill.Pos([1])),
        brill.Template(brill.Pos([-2])),
        brill.Template(brill.Pos([2])),
        brill.Template(brill.Pos([-2, -1])),
        brill.Template(brill.Pos([1, 2])),
        brill.Template(brill.Pos([-3, -2, -1])),
        brill.Template(brill.Pos([1, 2, 3])),
        brill.Template(brill.Pos([-1]), brill.Pos([1])),
        brill.Template(brill.Word([-1])),
        brill.Template(brill.Word([1])),
        brill.Template(brill.Word([-2])),
        brill.Template(brill.Word([2])),
        brill.Template(brill.Word([-2, -1])),
        brill.Template(brill.Word([1, 2])),
        brill.Template(brill.Word([-3, -2, -1])),
        brill.Template(brill.Word([1, 2, 3])),
        brill.Template(brill.Word([-1]), brill.Word([1])),
    ]
    trainer = BrillTaggerTrainer(postag, templates=templates, trace=3)
    brill_tagger = trainer.train(traindata, max_rules=10)
    return brill_tagger
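# A minimal usage sketch for get_brill_tagger() above. The enclosing class
# name `TaggerFactory` is hypothetical; the call only assumes that
# tagged_input_sentences.txt and the pickled treebank tagger are available.
brill_tagger = TaggerFactory().get_brill_tagger()
print(brill_tagger.tag('The quick brown fox jumps over the lazy dog'.split()))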
def make_pos_model(model_type):
    now = time.time()
    reader = TaggedCorpusReader('.', 'greek_training_set.pos')
    train_sents = reader.tagged_sents()
    if model_type == 'unigram':
        tagger = UnigramTagger(train_sents)
        file = 'unigram.pickle'
    elif model_type == 'bigram':
        tagger = BigramTagger(train_sents)
        file = 'bigram.pickle'
    elif model_type == 'trigram':
        tagger = TrigramTagger(train_sents)
        file = 'trigram.pickle'
    elif model_type == 'backoff':
        tagger1 = UnigramTagger(train_sents)
        tagger2 = BigramTagger(train_sents, backoff=tagger1)
        tagger = TrigramTagger(train_sents, backoff=tagger2)
        file = '123grambackoff.pickle'
    elif model_type == 'tnt':
        tagger = tnt.TnT()
        tagger.train(train_sents)
        file = 'tnt.pickle'
    else:
        print('Invalid model_type.')
    _dir = os.path.expanduser('~/greek_models_cltk/taggers/pos')
    path = os.path.join(_dir, file)
    with open(path, 'wb') as f:
        pickle.dump(tagger, f)
    print('Completed training {0} model in {1} seconds to {2}.'.format(
        model_type, time.time() - now, path))
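# Illustrative driver for make_pos_model(); assumes greek_training_set.pos is
# in the working directory and ~/greek_models_cltk/taggers/pos exists.
for model_type in ('unigram', 'bigram', 'trigram', 'backoff', 'tnt'):
    make_pos_model(model_type)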
def NER_HINDINBC():
    reader = TaggedCorpusReader('/python27/POS_9/', r'.*\.pos')
    f1 = reader.fileids()
    print "The Files of Corpus are:", f1
    sents = reader.tagged_sents()
    sentn = reader.sents()
    #words = sentn.split()
    ls = len(sents)
    #lw = len(words)
    print "Length of Corpus Is:", ls
    #print "The Words are:", lw
    # hold out the first 30% of sentences for testing
    size1 = int(ls * 0.3)
    test_sents = sents[:size1]
    train_sents = sents[size1:]
    nbc_tagger = ClassifierBasedPOSTagger(train=train_sents)
    test = nbc_tagger.evaluate(test_sents)
    print "The Test Result is:", test
    # THE GIVEN INPUT
    given_sent = "नीतीश कुमार द्वारा भाजपा के साथ हाथ मिलाने से वहां का पूरा राजनीतिक परिदृश्य ही बदल गया है मगर शरद यादव इससे खुश नहीं हैं".decode('utf-8')
    gsw = given_sent.split()
    tag_gs = nbc_tagger.tag(gsw)
    print "GIVEN SENT TAG:", tag_gs
    ftag_gs = " ".join(list(itertools.chain(*tag_gs)))
    print "And its flattened Version is:", ftag_gs
def __init__(self, corpusroot, corpusname):
    # use a custom wordlist corpus via WordListCorpusReader
    #wordlist = WordListCorpusReader(corpus_root, ['wordlist.txt'])
    # use a custom wordlist corpus via PlaintextCorpusReader
    #wordlist = PlaintextCorpusReader(corpus_root, 'wordlist.txt')
    #nltk_old = [(3, 0, 1)]
    #nltk_current = [tuple([int(x) for x in nltk.__version__.split('.')])]
    reader = TaggedCorpusReader(corpusroot, corpusname)
    # 80/20 train/test split
    splitratio = 0.8
    self.reader_train = reader.tagged_sents()[:int(len(reader.tagged_sents()) * splitratio)]
    self.test_sent = reader.tagged_sents()[int(len(reader.tagged_sents()) * splitratio):]
    print "split test ratio: ", int(len(reader.tagged_sents()) * splitratio), "\n"
    print "reader_train len: ", len(self.reader_train)
    print "test_sent len: ", len(self.test_sent)
def load_corpus_reviews(self, begin, end):
    #reader = LazyCorpusLoader()
    reader = TaggedCorpusReader('data/', r'.*\.pos')
    pos_fileids = reader.fileids()[1]
    neg_fileids = reader.fileids()[0]
    pos_sents = reader.tagged_sents(pos_fileids)
    neg_sents = reader.tagged_sents(neg_fileids)
    return (pos_sents[begin:end], neg_sents[begin:end])
def read(self, file_path):
    logger.info('Reading instances from file %s', file_path)
    # one token per line, tab-separated word/tag, blank line between sentences
    reader = TaggedCorpusReader(*os.path.split(file_path), sep='\t',
                                word_tokenizer=RegexpTokenizer(r'\n', gaps=True),
                                sent_tokenizer=BlanklineTokenizer(),
                                para_block_reader=lambda s: [s.read()])
    return Dataset([self.text_to_instance(*tuple(zip(*tagged_sent)))
                    for tagged_sent in reader.tagged_sents()])
def make_morpho_model(language, model_type, feature, train_file, test_file=None):
    test_file = train_file if test_file is None else test_file
    reader_train = TaggedCorpusReader('.', train_file)
    reader_test = TaggedCorpusReader('.', test_file)
    train_sents = reader_train.tagged_sents()
    test_sents = reader_test.tagged_sents()
    verify_tagged_corpus(reader_train)
    verify_tagged_corpus(reader_test)
    tagger = train_tagger(language, model_type, feature, train_sents)
    acc = tagger.evaluate(test_sents)
    # kappa: accuracy normalized against a chance baseline
    baseline = compute_baseline(reader_test.tagged_words())
    kappa = (acc - baseline) / (1 - baseline)
    cm = conf_matrix(tagger, reader_test.words(), reader_test.tagged_words())
    return (tagger, acc, kappa, cm)
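# compute_baseline() is not defined in this snippet; a minimal sketch under
# the assumption that the baseline used for kappa is majority-tag accuracy.
from collections import Counter

def compute_baseline(tagged_words):
    # accuracy of always guessing the corpus's single most frequent tag
    tags = [tag for _, tag in tagged_words]
    return Counter(tags).most_common(1)[0][1] / len(tags)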
def read_sentences_corpus(reader=None):
    #reader = LazyCorpusLoader()  # would be overridden below anyway
    # create a corpus reader over the files in ../data/*.pos; these files
    # contain tagged sentences and are the basis of the training/test sets
    reader = TaggedCorpusReader('../data/', r'.*\.pos')
    pos_fileids = reader.fileids()[1]
    neg_fileids = reader.fileids()[0]
    pos_sents = reader.tagged_sents(pos_fileids)
    neg_sents = reader.tagged_sents(neg_fileids)
    #pos_sents = [[(word.lower(), tag) for word, tag in sent if word not in stopwords.words('english')] for sent in pos_sents]
    #neg_sents = [[(word.lower(), tag) for word, tag in sent if word not in stopwords.words('english')] for sent in neg_sents]
    return (pos_sents, neg_sents)
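# Example call, assuming ../data/ holds exactly the two .pos files the
# function expects (negative reviews first, positive second in fileids()).
pos_sents, neg_sents = read_sentences_corpus()
print(len(pos_sents), len(neg_sents))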
class CorpusParser:
    def __init__(self, root, fileids='.*', encoding='utf8'):
        """
        Reads all the files in root.
        :param root: Directory.
        :param fileids: List of files that have to be read. '.*' if all files have to be parsed.
        :param encoding: File encoding
        """
        self._reader = TaggedCorpusReader(root, fileids, encoding=encoding)

    def words(self):
        """
        Returns all the words in the corpora.
        :return: List of words.
        """
        return self._reader.words()

    def tagged_words(self):
        """
        Returns all words of the corpora with their corresponding tag.
        :return: List of tuples (word, tag)
        """
        return self._reader.tagged_words()

    def sentences(self):
        """
        Returns a list of all sentences.
        :return: List of lists of words. Each list represents a sentence, with a list of its words in it.
        """
        return self._reader.sents()

    def tagged_sentences(self):
        """
        Returns a list of all sentences with the tag of each word.
        :return: List of lists of tuples. Each sentence is a list with all its members being tuples (word, tag).
        """
        return self._reader.tagged_sents()
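# Illustrative usage of CorpusParser; the corpus directory and file pattern
# are assumptions, not part of the original snippet.
parser = CorpusParser('corpora/my_tagged_corpus', r'.*\.pos')
print(parser.tagged_words()[:10])
print(len(parser.tagged_sentences()))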
def __init__(self, corpus_path, corpus_files):
    """
    Construct a Treebank object

    :param corpus_path: path to corpus files
    :param corpus_files: list of filenames for corpus text
    """
    msg("Importing treebank...")
    # get a corpus reader object for our corpus using NLTK
    treebank = TaggedCorpusReader(corpus_path, corpus_files)
    # get all sentences from corpus in a tagged format
    self.tagged_sents = treebank.tagged_sents()
    # get all sentences from corpus in an untagged format
    self.sents = treebank.sents()
    msg("done!\n")
# Brill tagger parameters
max_rules = 300
min_score = 3

# Training parameters
development_size = 5110
train = .85

# Read data from development.sdx
data = TaggedCorpusReader('.', r'.*\.sdx', sep='|',
                          sent_tokenizer=BlanklineTokenizer())

# Get the list of tagged sentences
tagged_data = data.tagged_sents()

# Lower words and return as a list
tagged_data_list = [[t for t in sent] for sent in tagged_data]
tagged_data_list = [[(w.lower(), t) for (w, t) in s] for s in tagged_data_list]
## print "Data is read!"

# Randomize training and evaluation set
random.seed(len(tagged_data_list))
random.shuffle(tagged_data_list)
cutoff = int(development_size * train)

# Training set
training_data = tagged_data_list[:cutoff]
def setUp(self):
    reader = TaggedCorpusReader('./corpora/oe', 'oe_train.pos')
    os.system('mkdir -p taggers/oe/pos')
    self.sents = reader.tagged_sents()
def split_10fold(full_training_set, local_dir_rel):
    print("full_training_set", full_training_set)
    crf_accuracies = []

    with open(full_training_set) as f:
        training_set_string = f.read()

    pos_set = training_set_string.split('\n\n')  # mk into a list
    sentence_count = len(pos_set)  # 3473
    tenth = math.ceil(int(sentence_count) / int(10))

    random.seed(0)
    random.shuffle(pos_set)

    def chunks(l, n):
        """Yield successive n-sized chunks from l.
        http://stackoverflow.com/a/312464
        """
        for i in range(0, len(l), n):
            yield l[i:i + n]

    # a list of 10 lists
    ten_parts = list(chunks(pos_set, tenth))  # a list of 10 lists with ~347 sentences each

    #for counter in list(range(10)):
    for counter, part in list(enumerate(ten_parts)):
        # map test list to part of given loop
        test_set = [item.rstrip() for item in ten_parts[counter] if len(item) > 0]  # or: test_set = part
        if counter == 1:
            print(len(test_set[993]), len(test_set[994]), len(test_set[995]), len(test_set[996]))

        # filter out this loop's test index
        training_set_lists = [x for x in ten_parts if x is not ten_parts[counter]]

        # next concatenate the list together into 1 file ( http://stackoverflow.com/a/952952 )
        training_set = [item.rstrip() for sublist in training_set_lists for item in sublist if len(item) > 0]

        # save shuffled tests to file (as NLTK trainers expect)
        #local_dir_rel = '~/cltk_data/user_data'
        local_dir = os.path.expanduser(local_dir_rel)
        if not os.path.isdir(local_dir):
            os.makedirs(local_dir)

        test_path = os.path.join(local_dir, 'test_%d.pos' % counter)
        with open(test_path, 'w') as f:
            f.write('\n\n'.join(test_set))

        # also write a plain-text version of the test sentences
        test_reader = TaggedCorpusReader(local_dir, 'test_%d.pos' % counter)
        test_sents = test_reader.tagged_sents()
        test_sents_tex = []
        for test_sent in test_sents:
            test_sents_tex.append(' '.join([token for token, tag in test_sent]))

        test_text_path = os.path.join(local_dir, 'test_%d.txt' % counter)
        with open(test_text_path, 'w') as f:
            f.write('\n'.join(test_sents_tex))

        test_path = os.path.join(local_dir, 'test_%d.pos' % counter)
        with open(test_path, 'w') as f:
            f.write('\n'.join(test_set))

        train_path = os.path.join(local_dir, 'train_%d.pos' % counter)
        with open(train_path, 'w') as f:
            f.write('\n'.join(training_set))
storedModel = "/var/log/Terminology/pos_model_tnt.bin" else: storedModel = "/var/log/Terminology/pos_model_brill.bin" if os.path.isfile(storedModel): Service.logger.debug("Loading stored POS tagger model from %s" % storedModel) modelFile = open(storedModel, "rb") try: pos_tagger = cPickle.load(modelFile) except Exception, e: Servide.logger.debug("Exception while loading pickled POS model!") Service.logger.debug(Service.traceback.format_exc()) modelFile.close() else: autodesk = TaggedCorpusReader(adskCorpusRoot, '.*', encoding='utf-8') train_sents = autodesk.tagged_sents() + treebank.tagged_sents() # Use TnT tagger on request if useTnTTagger: if __debug_on__: Service.logger.debug("Using TnT POS tagger...") unk_tagger = DefaultTagger('NN') pos_tagger = tnt.TnT(unk=unk_tagger, Trained=True) pos_tagger.train(train_sents) # Use Brill tagger by default else: if __debug_on__: Service.logger.debug("Using Brill POS tagger...") def backoff_tagger(tagged_sents, tagger_classes, backoff=None):
def trainPOSTagger(useTnTTagger):
    global __debug_on__
    global pos_tagger
    global adskCorpusRoot
    # Train TnT/Brill POS tagger using own training data + treebank data from
    # NLTK. Tested that using treebank data improves results.
    autodesk = TaggedCorpusReader(adskCorpusRoot, '.*', encoding='utf-8')
    train_sents = autodesk.tagged_sents() + treebank.tagged_sents()
    # Use TnT tagger on request
    if useTnTTagger:
        if __debug_on__:
            Service.logger.debug("Using TnT POS tagger...")
        unk_tagger = DefaultTagger('NN')
        pos_tagger = tnt.TnT(unk=unk_tagger, Trained=True)
        pos_tagger.train(train_sents)
    # Use Brill tagger by default
    else:
        if __debug_on__:
            Service.logger.debug("Using Brill POS tagger...")

        def backoff_tagger(tagged_sents, tagger_classes, backoff=None):
            if not backoff:
                backoff = tagger_classes[0](tagged_sents)
                del tagger_classes[0]
            for cls in tagger_classes:
                tagger = cls(tagged_sents, backoff=backoff)
                backoff = tagger
            return backoff

        word_patterns = [
            (r'^-?[0-9]+(.[0-9]+)?$', 'CD'),
            (r'.*ould$', 'MD'),
            (r'.*ing$', 'VBG'),
            (r'.*ed$', 'VBD'),
            (r'.*ness$', 'NN'),
            (r'.*ment$', 'NN'),
            (r'.*ful$', 'JJ'),
            (r'.*ious$', 'JJ'),
            (r'.*ble$', 'JJ'),
            (r'.*ic$', 'JJ'),
            (r'.*ive$', 'JJ'),
            (r'.*ic$', 'JJ'),
            (r'.*est$', 'JJ'),
            (r'^a$', 'PREP'),
        ]

        raubt_tagger = backoff_tagger(
            train_sents,
            [nltk.tag.AffixTagger, nltk.tag.UnigramTagger,
             nltk.tag.BigramTagger, nltk.tag.TrigramTagger],
            backoff=nltk.tag.RegexpTagger(word_patterns))

        # templates for the pre-NLTK-3 brill trainer API
        templates = [
            brill.SymmetricProximateTokensTemplate(brill.ProximateTagsRule, (1, 1)),
            brill.SymmetricProximateTokensTemplate(brill.ProximateTagsRule, (2, 2)),
            brill.SymmetricProximateTokensTemplate(brill.ProximateTagsRule, (1, 2)),
            brill.SymmetricProximateTokensTemplate(brill.ProximateTagsRule, (1, 3)),
            brill.SymmetricProximateTokensTemplate(brill.ProximateWordsRule, (1, 1)),
            brill.SymmetricProximateTokensTemplate(brill.ProximateWordsRule, (2, 2)),
            brill.SymmetricProximateTokensTemplate(brill.ProximateWordsRule, (1, 2)),
            brill.SymmetricProximateTokensTemplate(brill.ProximateWordsRule, (1, 3)),
            brill.ProximateTokensTemplate(brill.ProximateTagsRule, (-1, -1), (1, 1)),
            brill.ProximateTokensTemplate(brill.ProximateWordsRule, (-1, -1), (1, 1)),
        ]
        trainer = brill.FastBrillTaggerTrainer(raubt_tagger, templates)
        pos_tagger = trainer.train(train_sents, max_rules=200, min_score=3)
class Classifier:
    def __init__(self, root, keyWords, devRoot):
        self.__root__ = root
        self.__keyWords__ = keyWords
        self.__corpus__ = None
        self.__classifier__ = None
        self.__dev_corpus__ = None
        self.__dev_root__ = devRoot

    def initClassifier(self):
        self.__corpus__ = TaggedCorpusReader(self.__root__, r'.*\.txt', sep='#')
        self.__dev_corpus__ = TaggedCorpusReader(self.__dev_root__, r'.*\.txt', sep='#')

    def separateSentence(self):
        # chunk everything into NP, then chink on punctuation (PU)
        grammer = r"""
        NP: {<.*>+}
            }<PU>{
        """
        return nltk.RegexpParser(grammer)

    def separateParagraphByReg(self, parag):
        '''
        :return: a list of sentences separated by (,|。) in this paragraph
        :param parag: the paragraph before segmentation
        :type parag: string
        '''
        grammer = re.compile(',|。')
        return grammer.split(parag)

    def updateFeatures(self, src, dest):
        for key, val in src.items():
            if type(val).__name__ == 'bool' and val:
                dest[key] = val
            elif type(val).__name__ == 'int':
                if key in dest:
                    dest[key] += val
                else:
                    dest[key] = val

    def training(self):
        trainSet = []
        for file in self.__corpus__.fileids():
            trainingData = re.match(r"[a-z]+", file)
            if trainingData is None:
                continue  # skip the non-training data
            sentences = self.__corpus__.tagged_sents(file)
            features = {}
            for sent in sentences:
                tree = self.separateSentence().parse(sent)
                for subtree in tree.subtrees(lambda t: t.label() == 'NP'):
                    subfea = self.salespersonFeature(list(subtree))  # [(word, tag)]
                    self.updateFeatures(subfea, features)
            print(features)
            trainSet.append((features, re.match(r"[a-z]+", file).group(0)))
        self.__classifier__ = nltk.NaiveBayesClassifier.train(trainSet)

    def salespersonFeature(self, sent):
        features = {}
        words = [word for (word, tag) in sent]
        for w in self.__keyWords__:
            features["count(%s)" % w] = words.count(w)
            features["has(%s)" % w] = (w in words)
        return features

    def distinguishSalesFromTagfile(self, tagfile):
        sents = self.__corpus__.tagged_sents(tagfile)
        feas = {}
        for sent in sents:
            tree = self.separateSentence().parse(sent)
            for subtree in tree.subtrees(lambda t: t.label() == 'NP'):
                subfea = self.salespersonFeature(list(subtree))
                self.updateFeatures(subfea, feas)
        return self.__classifier__.classify(feas)

    def testClassifierAccuracy(self):
        testFea = []
        for file in self.__dev_corpus__.fileids():
            trainingData = re.match(r"[a-z]+", file)
            if trainingData is None:
                continue  # skip the non-testing data
            sentences = self.__dev_corpus__.tagged_sents(file)
            features = {}
            for sent in sentences:
                tree = self.separateSentence().parse(sent)
                for subtree in tree.subtrees(lambda t: t.label() == 'NP'):
                    subfea = self.salespersonFeature(list(subtree))
                    self.updateFeatures(subfea, features)
            testFea.append((features, re.match(r"[a-z]+", file).group(0)))
        return nltk.classify.accuracy(self.__classifier__, testFea)
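# Hypothetical driver for the Classifier above; the directory layout and
# keyword list are assumptions (the corpus is '#'-separated tagged text with
# PU punctuation tags, so keywords would be corpus-specific terms).
clf = Classifier('data/train', ['keyword1', 'keyword2'], 'data/dev')
clf.initClassifier()
clf.training()
print(clf.testClassifierAccuracy())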
# tagged_sentences = nltk.corpus.brown.tagged_sents()
from nltk.corpus.reader import TaggedCorpusReader

reader = TaggedCorpusReader('/Users/lucasrosenblatt/nltk_data/corpora/oldenglish',
                            'taggedOEnpnounsDone.pos')
tagged_sentences = reader.tagged_sents()
print(tagged_sentences[0])
print("Tagged sentences: ", len(tagged_sentences))


def features(sentence, index):
    """ sentence: [w1, w2, ...], index: the index of the word """
    return {
        'word': sentence[index],
        'is_first': index == 0,
        'is_last': index == len(sentence) - 1,
        'is_capitalized': sentence[index][0].upper() == sentence[index][0],
        'is_all_caps': sentence[index].upper() == sentence[index],
        'is_all_lower': sentence[index].lower() == sentence[index],
        'prefix-1': sentence[index][0],
        'prefix-2': sentence[index][:2],
        'prefix-3': sentence[index][:3],
        'suffix-1': sentence[index][-1],
        'suffix-2': sentence[index][-2:],
        'suffix-3': sentence[index][-3:],
        'prev_word': '' if index == 0 else sentence[index - 1],
        'next_word': '' if index == len(sentence) - 1 else sentence[index + 1],
        'has_hyphen': '-' in sentence[index],
        'is_numeric': sentence[index].isdigit(),
        'capitals_inside': sentence[index][1:].lower() != sentence[index][1:],
    }


import pprint
pprint.pprint(features(['This', 'is', 'a', 'sentence'], 2))
def cltk_pos_cv(full_training_set, local_dir_rel):
    print("full_training_set", full_training_set)

    unigram_accuracies = []
    bigram_accuracies = []
    trigram_accuracies = []
    backoff_accuracies = []
    tnt_accuracies = []

    with open(full_training_set) as f:
        training_set_string = f.read()

    pos_set = training_set_string.split('\n\n')  # mk into a list
    sentence_count = len(pos_set)  # 3473
    tenth = math.ceil(int(sentence_count) / int(10))

    random.seed(0)
    random.shuffle(pos_set)

    def chunks(l, n):
        """Yield successive n-sized chunks from l.
        http://stackoverflow.com/a/312464
        """
        for i in range(0, len(l), n):
            yield l[i:i + n]

    # a list of 10 lists
    ten_parts = list(chunks(pos_set, tenth))  # a list of 10 lists with ~347 sentences each

    #for counter in list(range(10)):
    for counter, part in list(enumerate(ten_parts)):
        # map test list to part of given loop
        test_set = ten_parts[counter]  # or: test_set = part

        # filter out this loop's test index
        training_set_lists = [x for x in ten_parts if x is not ten_parts[counter]]

        # next concatenate the list together into 1 file ( http://stackoverflow.com/a/952952 )
        training_set = [item for sublist in training_set_lists for item in sublist]

        # save shuffled tests to file (as NLTK trainers expect)
        #local_dir_rel = '~/cltk_data/user_data'
        local_dir = os.path.expanduser(local_dir_rel)
        if not os.path.isdir(local_dir):
            os.makedirs(local_dir)

        test_path = os.path.join(local_dir, 'test.pos')
        with open(test_path, 'w') as f:
            f.write('\n\n'.join(test_set))

        train_path = os.path.join(local_dir, 'train.pos')
        with open(train_path, 'w') as f:
            f.write('\n\n'.join(training_set))

        # read POS corpora
        print("local_dir", local_dir)
        train_reader = TaggedCorpusReader(local_dir, 'train.pos')
        train_sents = train_reader.tagged_sents()
        test_reader = TaggedCorpusReader(local_dir, 'test.pos')
        test_sents = test_reader.tagged_sents()
        print('Loop #' + str(counter))

        # make and evaluate unigram tagger
        unigram_tagger = UnigramTagger(train_sents)
        unigram_accuracy = unigram_tagger.evaluate(test_sents)
        unigram_accuracies.append(unigram_accuracy)
        print('Unigram:', unigram_accuracy)

        # make and evaluate bigram tagger
        bigram_tagger = BigramTagger(train_sents)
        bigram_accuracy = bigram_tagger.evaluate(test_sents)
        bigram_accuracies.append(bigram_accuracy)
        print('Bigram:', bigram_accuracy)

        # make and evaluate trigram tagger
        trigram_tagger = TrigramTagger(train_sents)
        trigram_accuracy = trigram_tagger.evaluate(test_sents)
        trigram_accuracies.append(trigram_accuracy)
        print('Trigram:', trigram_accuracy)

        # make and evaluate 1, 2, 3-gram backoff tagger
        tagger1 = UnigramTagger(train_sents)
        tagger2 = BigramTagger(train_sents, backoff=tagger1)
        tagger3 = TrigramTagger(train_sents, backoff=tagger2)
        backoff_accuracy = tagger3.evaluate(test_sents)
        backoff_accuracies.append(backoff_accuracy)
        print('1, 2, 3-gram backoff:', backoff_accuracy)

        # make and evaluate tnt tagger
        tnt_tagger = tnt.TnT()
        tnt_tagger.train(train_sents)
        tnt_accuracy = tnt_tagger.evaluate(test_sents)
        tnt_accuracies.append(tnt_accuracy)
        print('TnT:', tnt_accuracy)

    final_accuracies_list = []
    mean_accuracy_unigram = mean(unigram_accuracies)
    standard_deviation_unigram = stdev(unigram_accuracies)
    uni = {'unigram': {'mean': mean_accuracy_unigram, 'sd': standard_deviation_unigram}}
    final_accuracies_list.append(uni)

    mean_accuracy_bigram = mean(bigram_accuracies)
    standard_deviation_bigram = stdev(bigram_accuracies)
    bi = {'bigram': {'mean': mean_accuracy_bigram, 'sd': standard_deviation_bigram}}
    final_accuracies_list.append(bi)

    mean_accuracy_trigram = mean(trigram_accuracies)
    standard_deviation_trigram = stdev(trigram_accuracies)
    tri = {'trigram': {'mean': mean_accuracy_trigram, 'sd': standard_deviation_trigram}}
    final_accuracies_list.append(tri)

    mean_accuracy_backoff = mean(backoff_accuracies)
    standard_deviation_backoff = stdev(backoff_accuracies)
    back = {'1, 2, 3-gram backoff': {'mean': mean_accuracy_backoff, 'sd': standard_deviation_backoff}}
    final_accuracies_list.append(back)

    mean_accuracy_tnt = mean(tnt_accuracies)
    standard_deviation_tnt = stdev(tnt_accuracies)
    tnt_score = {'tnt': {'mean': mean_accuracy_tnt, 'sd': standard_deviation_tnt}}
    final_accuracies_list.append(tnt_score)

    final_dict = {}
    for x in final_accuracies_list:
        final_dict.update(x)
    return final_dict
from nltk.corpus.reader import TaggedCorpusReader
from nltk.tokenize import SpaceTokenizer
import nltk

d = nltk.data.find('corpora/cookbook')
reader = TaggedCorpusReader(d, r'.*\.pos')
print(reader.words())
print(reader.tagged_words())
print(reader.sents())
print(reader.tagged_sents())
print(reader.paras())
print(reader.tagged_paras())

# custom tokenizer
reader = TaggedCorpusReader(d, r'.*\.pos', word_tokenizer=SpaceTokenizer())
print(reader.sents())
print(reader.tagged_sents())

# universal tagset
reader = TaggedCorpusReader(d, r'.*\.pos', word_tokenizer=SpaceTokenizer(),
                            tagset='en-brown')
print(reader.tagged_sents(tagset='universal'))

# NLTK tagged corpora
from nltk.corpus import treebank
print(treebank.tagged_words())
print(treebank.tagged_words(tagset='universal'))
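# For reference, TaggedCorpusReader's default separator is '/', so a
# compatible cookbook .pos file would hold lines of word/tag pairs such as:
#
#   The/AT expense/NN and/CC time/NN involved/VBN are/BER astronomical/JJ ./.
#
# (sample line shown for illustration; the actual corpus file is assumed)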
# # Brill Tagger

# In[11]:

from nltk.wsd import lesk
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
import tkinter
from nltk.tag import brill, brill_trainer
from nltk.tag.brill_trainer import BrillTaggerTrainer
from nltk.data import load
from nltk.corpus.reader import TaggedCorpusReader

train_data = TaggedCorpusReader('.', 'tagged_input_sentences.txt', sep="/")
traindata = list(train_data.tagged_sents())
postag = load('taggers/maxent_treebank_pos_tagger/english.pickle')
templates = [
    brill.Template(brill.Pos([-1])),
    brill.Template(brill.Pos([1])),
    brill.Template(brill.Pos([-2])),
    brill.Template(brill.Pos([2])),
    brill.Template(brill.Pos([-2, -1])),
    brill.Template(brill.Pos([1, 2])),
    brill.Template(brill.Pos([-3, -2, -1])),
    brill.Template(brill.Pos([1, 2, 3])),
    brill.Template(brill.Pos([-1]), brill.Pos([1])),
    brill.Template(brill.Word([-1])),
    brill.Template(brill.Word([1])),
    brill.Template(brill.Word([-2])),
    brill.Template(brill.Word([2])),
    brill.Template(brill.Word([-2, -1])),
    brill.Template(brill.Word([1, 2])),
    brill.Template(brill.Word([-3, -2, -1])),
    brill.Template(brill.Word([1, 2, 3])),
    brill.Template(brill.Word([-1]), brill.Word([1])),
]
trainer = BrillTaggerTrainer(postag, templates=templates, trace=3)
brill_tagger = trainer.train(traindata, max_rules=10)
########## TAGGED CORPUS READER ###############
from nltk.corpus.reader import TaggedCorpusReader

root = "C:\\Users\\Matrix\\AppData\\Roaming\\nltk_data\\corpora\\cookbook\\"
file = "brown.pos"
source = root + file

# Using a regex to match all files with extension .pos
reader = TaggedCorpusReader(root, r'.*\.pos')
print reader.words()
print reader.tagged_words()
print reader.sents()
print reader.tagged_sents()
print reader.paras()
print reader.tagged_paras()

# TaggedCorpusReader uses a default word tokenizer, but we can customize it
from nltk.tokenize import SpaceTokenizer
reader = TaggedCorpusReader(root, r'.*\.pos', word_tokenizer=SpaceTokenizer())
print reader.words()

# Customizing TaggedCorpusReader's sentence tokenizer
from nltk.tokenize import LineTokenizer
reader = TaggedCorpusReader(root, r'.*\.pos', sent_tokenizer=LineTokenizer())
print reader.words()

# Customizing TaggedCorpusReader's paragraph block reader
# Customizing TaggedCorpusReader's tag separator - Pg 57
import sys
from nltk.corpus.reader import TaggedCorpusReader
from nltk.tokenize import LineTokenizer

filename = sys.argv[1]
without_extension = filename.split('.')
file_address = filename.split('/')
directory = file_address[:-1]
directory_address = '/'.join('{}'.format(x) for x in directory) + '/'

# one sentence per line, '|'-separated word/tag pairs
corpus_reader = TaggedCorpusReader(directory_address, [filename],
                                   sent_tokenizer=LineTokenizer(), sep='|')
corpus = corpus_reader.tagged_sents()

new_tags_only = open(without_extension[0] + '_tag_sets.' + without_extension[1], 'a+')
count = 1
for each in corpus:
    new_tags_only.write(' '.join('{}'.format(x[1]) for x in each))
    new_tags_only.write('\n')
    print(count)
    count += 1

print(without_extension[1] + "Tag extracting finished")
new_tags_only.close()
import nltk
from nltk.tag import UnigramTagger
from nltk.corpus.reader import TaggedCorpusReader
from nltk.tokenize import PunktWordTokenizer
from nltk import RegexpParser
from nltk.corpus import stopwords
from nltk.tokenize.regexp import WhitespaceTokenizer

global corpus, sent_tags, tagger

# corpus = TaggedCorpusReader('/root/adail/python/names', r'.*\.txt', word_tokenizer=PunktWordTokenizer(), sep="_")  # path on Linux
corpus = TaggedCorpusReader('C:/Users/jose.adail/workspace/TextProcessor/names',
                            r'.*\.txt', word_tokenizer=WhitespaceTokenizer(), sep="_")
name_tags = corpus.tagged_sents()  # receives the sentences marked with POS tags
tagger = UnigramTagger(name_tags)  # the UnigramTagger is trained on these tagged sentences


class RegexpReplacer(object):
    def __init__(self):
        self.replacement_patterns = [(r"'", ''), (r'#', 'hash'), (r'no', 'no_'),
                                     (r'not', 'not_'), (r'RT ', ''), (r'rs[rs]+', 'rs'),
                                     (r'ha[ha]+', 'haha'), (r's[s]+', 'sxs'),
                                     (r'r[r]+', 'rxr'), (r'a[a]+', 'aqa'),
                                     (r'e[e]+', 'eqe'), (r'o[o]+', 'oqo'),
                                     (r'tt', 'tqt'), (r'ff', 'fqf'),
                                     (r'dd', 'dqd'), (r'mm', 'mqm'),
def cltk_pos_cv(full_training_set, local_dir_rel):
    print("full_training_set", full_training_set)
    crf_accuracies = []

    with open(full_training_set) as f:
        training_set_string = f.read()

    pos_set = training_set_string.split('\n\n')  # mk into a list
    sentence_count = len(pos_set)  # 3473
    tenth = math.ceil(int(sentence_count) / int(10))

    random.seed(0)
    random.shuffle(pos_set)

    def chunks(l, n):
        """Yield successive n-sized chunks from l.
        http://stackoverflow.com/a/312464
        """
        for i in range(0, len(l), n):
            yield l[i:i + n]

    # a list of 10 lists
    ten_parts = list(chunks(pos_set, tenth))  # a list of 10 lists with ~347 sentences each

    #for counter in list(range(10)):
    for counter, part in list(enumerate(ten_parts)):
        # map test list to part of given loop
        test_set = ten_parts[counter]  # or: test_set = part

        # filter out this loop's test index
        training_set_lists = [x for x in ten_parts if x is not ten_parts[counter]]

        # next concatenate the list together into 1 file ( http://stackoverflow.com/a/952952 )
        training_set = [item for sublist in training_set_lists for item in sublist]

        # save shuffled tests to file (as NLTK trainers expect)
        #local_dir_rel = '~/cltk_data/user_data'
        local_dir = os.path.expanduser(local_dir_rel)
        if not os.path.isdir(local_dir):
            os.makedirs(local_dir)

        test_path = os.path.join(local_dir, 'test.pos')
        with open(test_path, 'w') as f:
            f.write('\n\n'.join(test_set))

        train_path = os.path.join(local_dir, 'train.pos')
        with open(train_path, 'w') as f:
            f.write('\n\n'.join(training_set))

        # read POS corpora
        print("local_dir", local_dir)
        train_reader = TaggedCorpusReader(local_dir, 'train.pos')
        train_sents = train_reader.tagged_sents()
        test_reader = TaggedCorpusReader(local_dir, 'test.pos')
        test_sents = test_reader.tagged_sents()
        print('Loop #' + str(counter))

        # make crf tagger
        crf_tagger = CRFTagger()
        crf_tagger.train(train_sents, 'model.crf.tagger')

        # evaluate crf tagger
        crf_accuracy = crf_tagger.evaluate(test_sents)
        crf_accuracies.append(crf_accuracy)
        print('crf:', crf_accuracy)
        #if counter > 0: break

    final_accuracies_list = []
    mean_accuracy_crf = mean(crf_accuracies)
    standard_deviation_crf = stdev(crf_accuracies)
    uni = {'crf': {'mean': mean_accuracy_crf, 'sd': standard_deviation_crf}}
    final_accuracies_list.append(uni)

    final_dict = {}
    for x in final_accuracies_list:
        final_dict.update(x)
    return final_dict
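# Illustrative call, assuming a blank-line-separated .pos training file and a
# writable working directory (both names are placeholders).
results = cltk_pos_cv('full_training_set.pos', '~/cltk_data/user_data')
print(results)  # e.g. {'crf': {'mean': ..., 'sd': ...}}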
import nltk
from nltk.tag import RegexpTagger
from nltk.corpus.reader import TaggedCorpusReader

reader = TaggedCorpusReader('corpus', 'tagged_corpus')
train = reader.tagged_sents()

# backoff chain: regexp -> bigram -> unigram -> default
tagger0 = nltk.DefaultTagger('n')
tagger1 = nltk.UnigramTagger(train, backoff=tagger0)
tagger2 = nltk.BigramTagger(train, backoff=tagger1)

patterns = [
    (r'^\d+((.|,)\d+)?\.?$', 'NC'),
    (r'^.*\$$', '$'),
    (r'R\$\d+((.|,)\d+)?\.?$', 'NC$'),
    (r'^(R|r)eais$', '$'),
    (r'^(D|d)(o|ó)lares', '$'),
]
tagger3 = RegexpTagger(patterns, backoff=tagger2)


def tag(sent):
    result = tagger3.tag(sent.split())
    return result
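# Example call; the sentence is illustrative and the resulting tags depend
# entirely on the training corpus in corpus/tagged_corpus.
print(tag('O livro custa 25,90 reais'))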
import nltk.data
from nltk.corpus.reader import WordListCorpusReader
from nltk.corpus import names
from nltk.corpus.reader import TaggedCorpusReader
from nltk.tokenize import SpaceTokenizer
from nltk.corpus import treebank

wordlist = WordListCorpusReader("C:/nltk_data/corpora/cookbook", ['wordlist'])
print(wordlist.words())
print(wordlist.fileids())
print(names.fileids())
print(len(names.words('male.txt')))

reader = TaggedCorpusReader("C:/nltk_data/corpora/treebank/tagged", r'.*\.pos',
                            word_tokenizer=SpaceTokenizer(), tagset='en-brown')
print(reader.words('wsj_0001.pos'))
print(reader.tagged_words('wsj_0001.pos'))
print(reader.tagged_sents('wsj_0001.pos'))
print(reader.tagged_paras('wsj_0001.pos'))
print(reader.fileids())
print("\n")
print(reader.tagged_words('wsj_0001.pos', tagset='universal'))
print(treebank.tagged_words())
from nltk.corpus.reader import TaggedCorpusReader
from nltk import DefaultTagger, UnigramTagger, BigramTagger, TrigramTagger
from nltk.probability import FreqDist
from numpy import mean

# for k-fold validation, not working though
# cross-fold validation is just brute forced...
#from sklearn.model_selection import KFold
#import numpy as np

mypath = "C:/Users/Lauren Shin/Documents/LING 111/.final project"
EstonianCorpus = TaggedCorpusReader(mypath, "estonianCaps.txt", encoding="latin-1")
sentences = EstonianCorpus.tagged_sents()

# baseline: tag everything with the most frequent tag in the corpus
tags = [tag for _, tag in EstonianCorpus.tagged_words()]
mostFrequent = FreqDist(tags).max()
default = DefaultTagger(mostFrequent)

# cross validation
#kf = KFold(n_splits=3)
#
## turns the data into a 2d array
#X = np.array(sentences)
## creates a 1d array with same length/number of rows as X
#y = np.arange(0, len(sentences), 1)
#
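# A minimal hand-rolled 3-fold cross validation, sketching what the
# commented-out KFold code above appears to intend; the choice of a unigram
# tagger backed off to `default` is an assumption.
fold = len(sentences) // 3
scores = []
for i in range(3):
    test = sentences[i * fold:(i + 1) * fold]
    train = sentences[:i * fold] + sentences[(i + 1) * fold:]
    tagger = UnigramTagger(train, backoff=default)
    scores.append(tagger.evaluate(test))
print(mean(scores))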
def NER_HINDI():
    reader = TaggedCorpusReader('/python27/POS_9/', r'.*\.pos')
    f1 = reader.fileids()
    print "The Files of Corpus are:", f1
    sents = reader.tagged_sents()
    sentn = reader.sents()
    #words = sentn.split()
    ls = len(sents)
    #lw = len(words)
    print "Length of Corpus Is:", ls
    #print "The Words are:", lw
    # hold out the first 30% of sentences for testing
    size1 = int(ls * 0.3)
    test_sents = sents[:size1]
    train_sents = sents[size1:]
    hmm_tagger = nltk.HiddenMarkovModelTagger.train(train_sents)
    test = hmm_tagger.test(test_sents)
    # THE GIVEN INPUT
    given_sent = "नीतीश कुमार द्वारा भाजपा के साथ हाथ मिलाने से वहां का पूरा राजनीतिक परिदृश्य ही बदल गया है मगर शरद यादव इससे खुश नहीं हैं".decode('utf-8')
    gsw = given_sent.split()
    tag_gs = hmm_tagger.tag(gsw)
    print "GIVEN SENT TAG:", tag_gs
    ftag_gs = " ".join(list(itertools.chain(*tag_gs)))
    print "And its flattened Version is:", ftag_gs
    # INPUT FROM FILE: persist and reload the trained tagger with dill
    with open('HINDIHMMNER1.dill', 'wb') as f:
        dill.dump(hmm_tagger, f)
    with open('HINDIHMMNER1.dill', 'rb') as f:
        hmm_tagger1 = dill.load(f)
    test_tags = [tag for sent in reader.sents()
                 for (word, tag) in hmm_tagger1.tag(sent)]
    gold_tags = [tag for (word, tag) in reader.tagged_words()]
    ltesttag = len(test_tags)
    lgtags = len(gold_tags)
    print "Test Tag Len:", ltesttag
    print "Gold Tag Len:", lgtags
    cm = nltk.ConfusionMatrix(gold_tags, test_tags)
    print(cm.pretty_format(sort_by_count=True, show_percents=False, truncate=5))
    labels = set('NA GPE PERS DATE ORG'.split())  # THE TAG SETS AS GENERATED IN CONFUSION MATRIX
    true_positives = Counter()
    false_negatives = Counter()
    false_positives = Counter()
    for i in labels:
        for j in labels:
            if i == j:
                true_positives[i] += cm[i, j]
            else:
                false_negatives[i] += cm[i, j]
                false_positives[j] += cm[i, j]
    print "TP:", sum(true_positives.values()), true_positives
    print "FN:", sum(false_negatives.values()), false_negatives
    print "FP:", sum(false_positives.values()), false_positives
    print
    for i in sorted(labels):
        if true_positives[i] == 0:
            fscore = 0
        else:
            precision = true_positives[i] / float(true_positives[i] + false_positives[i])
            recall = true_positives[i] / float(true_positives[i] + false_negatives[i])
            fscore = 2 * (precision * recall) / float(precision + recall)
        fscore1 = fscore * 100
        print "TAG:", i, "FMEASURE:", fscore1