def initializeConll(self):
    from nltk.corpus import conll2000
    self.test_sents = conll2000.chunked_sents('test.txt', chunk_types=['NP'])
    self.train_sents = conll2000.chunked_sents('train.txt', chunk_types=['NP'])
    self.NPChunker = ChunkParser(self.train_sents)

def main():
    # Train on the CoNLL-2000 chunking corpus
    test_sents = conll2000.chunked_sents('test.txt', chunk_types=['NP'])
    train_sents = conll2000.chunked_sents('train.txt', chunk_types=['NP'])
    chunker = ConsecutiveNPChunker(train_sents)
    print(chunker.evaluate(test_sents))

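# NOTE: ConsecutiveNPChunker is called in several of these snippets but never
# defined in this section. The sketch below is an assumption, not the original
# authors' code: it follows the classifier-based chunker from chapter 7 of the
# NLTK book, with nltk.NaiveBayesClassifier standing in for the book's
# megam-backed Maxent trainer and a deliberately minimal npchunk_features.
# (Snippets that pass extra arguments, e.g. a feature function or a classifier,
# clearly use locally extended variants of this class.)
import nltk


def npchunk_features(sentence, i, history):
    # Minimal feature extractor: just the current POS tag.
    word, pos = sentence[i]
    return {"pos": pos}


class ConsecutiveNPChunkTagger(nltk.TaggerI):
    def __init__(self, train_sents):
        train_set = []
        for tagged_sent in train_sents:
            untagged_sent = nltk.tag.untag(tagged_sent)
            history = []
            for i, (word, tag) in enumerate(tagged_sent):
                featureset = npchunk_features(untagged_sent, i, history)
                train_set.append((featureset, tag))
                history.append(tag)
        self.classifier = nltk.NaiveBayesClassifier.train(train_set)

    def tag(self, sentence):
        # Tag left to right, feeding earlier decisions back in as history
        history = []
        for i, word in enumerate(sentence):
            featureset = npchunk_features(sentence, i, history)
            tag = self.classifier.classify(featureset)
            history.append(tag)
        return list(zip(sentence, history))


class ConsecutiveNPChunker(nltk.ChunkParserI):
    def __init__(self, train_sents):
        # Convert each chunk tree into ((word, pos), chunktag) training pairs
        tagged_sents = [[((w, t), c) for (w, t, c) in nltk.chunk.tree2conlltags(sent)]
                        for sent in train_sents]
        self.tagger = ConsecutiveNPChunkTagger(tagged_sents)

    def parse(self, sentence):
        tagged_sent = self.tagger.tag(sentence)
        conlltags = [(w, t, c) for ((w, t), c) in tagged_sent]
        return nltk.chunk.conlltags2tree(conlltags)
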
def train_ai():
    print("Training...")
    train_sents = conll2000.chunked_sents('train.txt', chunk_types=['NP', 'VP', 'PP'])
    test_sents = conll2000.chunked_sents('test.txt', chunk_types=['NP', 'VP', 'PP'])
    return BigramChunker(train_sents)

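# NOTE: BigramChunker is likewise referenced throughout but not defined here.
# A minimal sketch, assuming it matches the NLTK book's version, which learns
# a mapping from POS-tag sequences to IOB chunk tags:
import nltk


class BigramChunker(nltk.ChunkParserI):
    def __init__(self, train_sents):
        # Train a bigram tagger over (pos_tag, chunk_tag) sequences
        train_data = [[(t, c) for w, t, c in nltk.chunk.tree2conlltags(sent)]
                      for sent in train_sents]
        self.tagger = nltk.BigramTagger(train_data)

    def parse(self, sentence):
        # Tag the POS sequence, then rebuild a chunk tree from the IOB triples
        pos_tags = [pos for (word, pos) in sentence]
        tagged_pos_tags = self.tagger.tag(pos_tags)
        chunktags = [chunktag for (pos, chunktag) in tagged_pos_tags]
        conlltags = [(word, pos, chunktag)
                     for ((word, pos), chunktag) in zip(sentence, chunktags)]
        return nltk.chunk.conlltags2tree(conlltags)
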
def exercise3(): print("part a") test_sents = conll2000.chunked_sents('train.txt')[:99] grammar = r""" NP: {<DT>?<JJ>*<NN>} {<VBD>?<IN>?<JJ>*<NNS>} """ cp = nltk.RegexpParser(grammar) print(cp.evaluate(test_sents)) print("part b") test_sents = "Many little dogs barked at cats" cp = nltk.RegexpParser("") test_sents = conll2000.chunked_sents('test.txt', chunk_types=['NP']) print("Baseline with no chunks : ", cp.evaluate(test_sents)) grammar = r"NP: {<[CDJNP].*>+}" cp = nltk.RegexpParser(grammar) print("IOB tag evaluation: ", cp.evaluate(test_sents)) print("part c") test_sents = conll2000.chunked_sents('train.txt')[:99] grammar = r""" NP: {<DT>?<JJ>*<NN>} {<VBD>?<IN>?<JJ>*<NNS>} {<[CDJNP].*>+} """ cp = nltk.RegexpParser(grammar) print(cp.evaluate(test_sents))
def get_noun_phrases_and_named_entities_data(data):
    train_sents = conll2000.chunked_sents('train.txt', chunk_types=['NP'])
    test_sents = conll2000.chunked_sents('test.txt', chunk_types=['NP'])
    chunker = BigramChunker.BigramChunker(train_sents + test_sents)

    # POS-tag every sentence in the input data
    tagged_data = []
    for sent in data:
        tokens = nltk.word_tokenize(sent)
        tagged = nltk.pos_tag(tokens)
        tagged_data.append(tagged)

    # Collect noun phrases as IOB triples
    noun_phrases = []
    for tagged_sent in tagged_data:
        tree = chunker.parse(tagged_sent)
        noun_phrases += nltk.chunk.tree2conlltags(tree)

    # Collect named entities as IOB triples
    named_entities = []
    for tagged_sent in tagged_data:
        tree = nltk.chunk.ne_chunk(tagged_sent)
        named_entities += nltk.chunk.tree2conlltags(tree)

    # Pair each token with the index of its sentence
    words = []
    cnt = 0
    for sent in data:
        cnt += 1
        tokens = nltk.word_tokenize(sent)
        for token in tokens:
            words.append((token, cnt))

    return (words, noun_phrases, named_entities)

def exercise3():
    # Carry out the following evaluation tasks for the chunker developed in question 2

    # set variables
    chunk_types = ['NP', 'NNS']  # 'JJ', 'NNS', 'VBD', 'IN'
    train_sents = conll2000.chunked_sents('train.txt', chunk_types=chunk_types)
    test_sents = conll2000.chunked_sents('test.txt', chunk_types=chunk_types)

    # establish a baseline with the trivial chunk parser cp that creates no chunks
    cp = nltk.RegexpParser("")
    print("Baseline with no chunks", cp.evaluate(test_sents))

    # tags beginning with letters that are characteristic of noun phrase tags
    # (e.g. CD, DT, and JJ)
    grammar = r"NP: {<[CDJNP].*>+}"
    cp = nltk.RegexpParser(grammar)
    print("IOB tag evaluation", cp.evaluate(test_sents))

    # UnigramChunker
    unigram_chunker = UnigramChunker(train_sents)
    print("UnigramChunker", unigram_chunker.evaluate(test_sents))

    # BigramChunker
    bigram_chunker = BigramChunker(train_sents)
    print("BigramChunker", bigram_chunker.evaluate(test_sents))

    # ConsecutiveNPChunker
    ngram_chunker = ConsecutiveNPChunker(train_sents)
    print("ConsecutiveNPChunker", ngram_chunker.evaluate(test_sents))

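# NOTE: UnigramChunker is presumably the same pattern with a unigram tagger,
# again assuming the NLTK book's implementation; it maps each POS tag to its
# most frequent IOB chunk tag in the training data:
import nltk


class UnigramChunker(nltk.ChunkParserI):
    def __init__(self, train_sents):
        train_data = [[(t, c) for w, t, c in nltk.chunk.tree2conlltags(sent)]
                      for sent in train_sents]
        self.tagger = nltk.UnigramTagger(train_data)

    def parse(self, sentence):
        pos_tags = [pos for (word, pos) in sentence]
        tagged_pos_tags = self.tagger.tag(pos_tags)
        chunktags = [chunktag for (pos, chunktag) in tagged_pos_tags]
        conlltags = [(word, pos, chunktag)
                     for ((word, pos), chunktag) in zip(sentence, chunktags)]
        return nltk.chunk.conlltags2tree(conlltags)
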
def __init__(self):
    train_sents = conll2000.chunked_sents('train.txt', chunk_types=['NP'])
    ctagged_sents = [[((w, t), c) for (w, t, c) in nltk.chunk.tree2conlltags(sent)]
                     for sent in train_sents]
    test_sents = conll2000.chunked_sents('test.txt', chunk_types=['NP'])
    self._test_sents = [[((w, t), c) for (w, t, c) in nltk.chunk.tree2conlltags(sent)]
                        for sent in test_sents]
    self._tagger = ClassifierBasedTagger(train=ctagged_sents,
                                         feature_detector=npchunk_features)

def evaluate():
    text = '''
    he PRP B-NP
    accepted VBD B-VP
    the DT B-NP
    position NN I-NP
    of IN B-PP
    vice NN B-NP
    chairman NN I-NP
    of IN B-PP
    Carlyle NNP B-NP
    Group NNP I-NP
    , , O
    a DT B-NP
    merchant NN I-NP
    banking NN I-NP
    concern NN I-NP
    . . O
    '''
    nltk.chunk.conllstr2tree(text, chunk_types=['NP']).draw()
    print conll2000.chunked_sents('train.txt')[99]
    print conll2000.chunked_sents('train.txt', chunk_types=['NP'])[99]

def main():
    train_sents = (nltk.chunk.tree2conlltags(s)
                   for s in conll2000.chunked_sents('train.txt', chunk_types=['NP']))
    # test_sents = (nltk.chunk.tree2conlltags(s)
    #               for s in conll2000.chunked_sents('test.txt', chunk_types=['NP']))
    test_sents = conll2000.chunked_sents('test.txt', chunk_types=['NP'])

    fd = np_tags_fd(train_sents)
    print_frequencies(fd, num_results=50)
    # pattern = regex_generator(fd)
    # print pattern
    # pattern = r"NP: {<NN>}"

    print nltk.RegexpParser("").evaluate(test_sents)
    print ''

    pattern_book = r"NP: {<[CDJNP].*>+}"
    print nltk.RegexpParser(pattern_book).evaluate(test_sents)
    print ''

    pattern_modified = r"NP: {<(\$)>?<[CDJNP].*>+}"
    print nltk.RegexpParser(pattern_modified).evaluate(test_sents)
    print ''

    pattern_modified = r"""NP: {<(\$)>?<[CDJNP].*>+}
                               {<W(P|DT)>}"""
    print nltk.RegexpParser(pattern_modified).evaluate(test_sents)

def chunk_with_unigram_tagger():
    # use a unigram tagger to find the IOB tag given its POS tag
    from nltk.corpus import conll2000
    test_sents = conll2000.chunked_sents("test.txt", chunk_types=["NP"])
    train_sents = conll2000.chunked_sents("train.txt", chunk_types=["NP"])
    unigram_chunker = UnigramChunker(train_sents)
    print unigram_chunker.evaluate(test_sents)
    postags = sorted(set(pos for sent in train_sents
                         for (word, pos) in sent.leaves()))
    print unigram_chunker.tagger.tag(postags)

def chunker_sample7():
    """Sample code for a classifier-based chunker."""
    train_sents = conll2000.chunked_sents("train.txt", chunk_types=["NP"])
    test_sents = conll2000.chunked_sents("test.txt", chunk_types=["NP"])
    tagged_sents = [[((w, t), c) for (w, t, c) in nltk.chunk.tree2conlltags(sent)]
                    for sent in train_sents]
    chunker = ClassifierChunker(tagged_sents)
    print(chunker.evaluate(test_sents))

def chunked_sents():
    print(conll2000.chunked_sents('train.txt')[99])
    # (S
    #   (PP Over/IN)
    #   (NP a/DT cup/NN)
    #   (PP of/IN)
    #   (NP coffee/NN)
    #   ,/,
    #   (NP Mr./NNP Stone/NNP)
    #   (VP told/VBD)
    #   (NP his/PRP$ story/NN)
    #   ./.)
    print(conll2000.chunked_sents('train.txt', chunk_types=['NP'])[99])

def _load_data():
    try:
        train_set = conll2000.chunked_sents('train.txt')
        test_set = conll2000.chunked_sents('test.txt')
    except Exception:
        if license_prompt('CONLL2000 data set', 'http://www.nltk.org/nltk_data/') is False:
            sys.exit(0)
        nltk.download('conll2000')
        train_set = conll2000.chunked_sents('train.txt')
        test_set = conll2000.chunked_sents('test.txt')
    train_data = [list(zip(*nltk.chunk.tree2conlltags(sent))) for sent in train_set]
    test_data = [list(zip(*nltk.chunk.tree2conlltags(sent))) for sent in test_set]
    return train_data, test_data

def main(convert_func=None):
    train_sents = conll2000.chunked_sents('train.txt', chunk_types=['NP'])
    test_sents = conll2000.chunked_sents('test.txt', chunk_types=['NP'])

    if convert_func:
        # transform the sentence
        print "convert leaf nodes"
        test_sents = [convert_leaf_node(sent, convert_func) for sent in test_sents]

    print "train..."
    chunker = ConsecutiveNPChunker(train_sents)

    print "evaluate..."
    print(chunker.evaluate(test_sents))

def chunker(sent):
    # Example inputs:
    # a = [("I", "PRP"), ("hear", "VBP"), ("Jerusalem", "NNP"), ("bells", "NNS"), ("ringing", "VBG")]
    # input_sent = "Rockwell said the agreement calls for it to supply 200 additional so-called shipsets for the planes."
    text = nltk.word_tokenize(sent)
    a = nltk.pos_tag(text)
    phrases = []
    tup = ()

    NP_sents = conll2000.chunked_sents('train.txt', chunk_types=['NP'])
    VP_sents = conll2000.chunked_sents('train.txt', chunk_types=['VP'])

    class ChunkParser(nltk.ChunkParserI):
        def __init__(self, train_sents):
            train_data = [[(t, c) for w, t, c in nltk.chunk.tree2conlltags(sent)]
                          for sent in train_sents]
            self.tagger = nltk.TrigramTagger(train_data)

        def parse(self, sentence):
            pos_tags = [pos for (word, pos) in sentence]
            tagged_pos_tags = self.tagger.tag(pos_tags)
            chunktags = [chunktag for (pos, chunktag) in tagged_pos_tags]
            conlltags = [(word, pos, chunktag)
                         for ((word, pos), chunktag) in zip(sentence, chunktags)]
            return nltk.chunk.util.conlltags2tree(conlltags)

    NPChunker = ChunkParser(NP_sents)
    VPChunker = ChunkParser(VP_sents)

    # Collect NP chunks: chunk subtrees are Trees, leaf tokens are plain tuples
    parsed_sent = NPChunker.parse(a)
    for i in parsed_sent:
        if type(i) != type(tup):
            l = []
            for t in tuple(i):
                l.append(t[0])
            phrases.append({"NP": " ".join(l)})

    # Collect VP chunks the same way
    parsed_sent = VPChunker.parse(a)
    for i in parsed_sent:
        if type(i) != type(tup):
            l = []
            for t in tuple(i):
                l.append(t[0])
            phrases.append({"VP": " ".join(l)})

    return phrases

def main():
    # Train on the CoNLL-2000 chunking corpus
    test_sents = conll2000.chunked_sents('test.txt', chunk_types=['NP'])
    train_sents = conll2000.chunked_sents('train.txt', chunk_types=['NP'])

    # chunker = UnigramChunker(train_sents)
    chunker = BigramChunker(train_sents)
    print(chunker.evaluate(test_sents))
    # ChunkParse score:
    #     IOB Accuracy: 92.9%%
    #     Precision:    79.9%%
    #     Recall:       86.8%%
    #     F-Measure:    83.2%%

    postags = sorted(set(pos for sent in train_sents
                         for (word, pos) in sent.leaves()))
    print(chunker.tagger.tag(postags))

def exercise3(): print "Exercise - 3" grammar1 = r""" NP: {<DT>?<JJ><NNS>} {<CD><NNS>} """ test_sents = conll2000.chunked_sents('test.txt', chunk_types=['NP'])[:100] cp1 = nltk.RegexpParser(grammar1) res1 = cp1.evaluate(test_sents) print "Statistics data for custom chunker" print res1 print cp2 = nltk.RegexpParser("") res2 = cp2.evaluate(test_sents) print "Statistics data for baseline chunker" print res2 print grammar3 = r""" NP: {<DT>?<JJ><NNS>} {<CD><NNS>} {<DT><NN>} """ cp3 = nltk.RegexpParser(grammar3) res3 = cp3.evaluate(test_sents) print "Statistics data for custom chunker with added regular expression: {<DT><NN>}" print res3 print
def regexp_parser_sample5():
    grammar = r"NP: {<[CDJNP].*>+}"
    cp = nltk.RegexpParser(grammar)
    # Load the NP chunks from the training text
    test_sents = conll2000.chunked_sents("train.txt", chunk_types=["NP"])
    print(cp.evaluate(test_sents))

def simple_np_bgram(documents):
    bgram = BigramChunker(conll2000.chunked_sents('train.txt'))
    for doc in documents:
        buf = []
        for sent in pos.preprocess(doc):
            buf.append(bgram.parse(sent))
        yield buf

def train_parser():
    """Train and evaluate simple chunkers."""

    # A simple chunker that extracts NNP (proper nouns)
    def mySimpleChunker():
        grammar = 'NP: {<NNP>+}'
        return nltk.RegexpParser(grammar)

    # Extracts nothing; only checks that the pipeline runs
    def test_nothing(data):
        cp = nltk.RegexpParser("")
        print(cp.evaluate(data))

    # Test the mySimpleChunker() function
    def test_mySimpleChunker(data):
        schunker = mySimpleChunker()
        print(schunker.evaluate(data))

    datasets = [
        conll2000.chunked_sents('test.txt', chunk_types=['NP']),
        treebank_chunk.chunked_sents(),
    ]

    # Compute chunker accuracy on the first 50 IOB-annotated sentences
    for dataset in datasets:
        test_nothing(dataset[:50])
        print('---------------------')
        test_mySimpleChunker(dataset[:50])
        print()

def __init__(self):
    super().__init__()
    nltk.download("conll2000")
    nltk.download("averaged_perceptron_tagger")
    data = conll2000.chunked_sents()
    train_data = data[:10900]
    self.model = ClassifierChunkParser(train_data)

def __init__(self): try: tagger = cPickle.load(open("nerdb_tagger.pkl")) except IOError: print "failed to load nerdb_tagger, recreating..." train_sents = conll2000.tagged_sents() + brown.tagged_sents() tagger = nltk.DefaultTagger("NN") tagger = nltk.UnigramTagger(train_sents, backoff=tagger) tagger = nltk.BigramTagger(train_sents, backoff=tagger) tagger = nltk.TrigramTagger(train_sents, backoff=tagger) cPickle.dump(tagger, open("nerdb_tagger.pkl", "w")) print "done" try: chunker = cPickle.load(open("nerdb_chunker.pkl")) except IOError: print "failed to load nerdb_chunker, recreating..." train_sents = conll2000.chunked_sents() chunker = ConsecutiveNPChunker(tagger, train_sents) cPickle.dump(chunker, open("nerdb_chunker.pkl", "w")) print "done" self.chunker = chunker self.people = [line.strip().split(" ", 1) for line in open("actors_index.txt").readlines()] self.people += [line.strip().split(" ", 1) for line in open("actresses_index.txt").readlines()] self.movies = [line.strip().split(" ", 1) for line in open("title_index.txt").readlines()] self.entity_types = {"PERSON": self.people, "MOVIE": self.movies} self.numbers = eval(open("numbers.txt").read())
def classifier_based_parser(input_dict):
    train_sents = conll2000.chunked_sents('train.txt', chunk_types=['NP'])
    classifier_based_chunker = ClassifierBasedChunkParser(train_sents)
    return {'chunker': {'object': classifier_based_chunker,
                        'function': 'parse'}}

def get_noun_phrases_and_named_entities(file_name, start_index, end_index):
    sentences = conll2000.sents(file_name)
    noun_phrase_sentences = conll2000.chunked_sents(file_name, chunk_types=['NP'])
    pos_tagged_sentences = conll2000.tagged_sents(file_name)

    sentences = sentences[start_index:end_index]
    pos_tagged_sentences = pos_tagged_sentences[start_index:end_index]
    noun_phrase_sentences = noun_phrase_sentences[start_index:end_index]

    # Extracting mentions.
    words = []
    cnt = 0
    for sent in sentences:
        cnt += 1
        for word in sent:
            words.append((word, cnt))

    noun_phrases = []
    for sent in noun_phrase_sentences:
        noun_phrases += nltk.chunk.tree2conlltags(sent)

    named_entities = []
    for tagged_sent in pos_tagged_sentences:
        tree = nltk.chunk.ne_chunk(tagged_sent)
        named_entities += nltk.chunk.tree2conlltags(tree)

    return (words, noun_phrases, named_entities)

def evaluate_chunker():
    from nltk.corpus import conll2000

    # baseline
    cp = nltk.RegexpParser("")
    test_sents = conll2000.chunked_sents("test.txt", chunk_types=["NP"])
    print cp.evaluate(test_sents)

    # naive tagger: look for all tags characteristic of an NP chunk
    grammar = r"NP: {<[CDJNP].*>+}"
    cp1 = nltk.RegexpParser(grammar)
    print cp1.evaluate(test_sents)

def __init__(self):
    try:
        self.unigram_chunker = cPickle.load(open('chunker.pkl', 'r'))
    except (EOFError, IOError):
        train_sents = conll2000.chunked_sents('train.txt', chunk_types=['NP'])
        unigram_chunker = ConsecutiveNPChunker(train_sents)
        f = open('chunker.pkl', 'wb')
        cPickle.dump(unigram_chunker, f, -1)
        f.close()
        # keep the freshly trained chunker; the original never assigned it
        self.unigram_chunker = unigram_chunker

def train_chunker(filesDir):
    # Create chunked sentences in the CoNLL format.
    train_sents = conll2000.chunked_sents('train_locations.txt', chunk_types=['Loc'])
    # Train the chunker with the NaiveBayesClassifier.
    chunker = ConsecutiveNPChunker(train_sents, combine_features,
                                   nltk.NaiveBayesClassifier)
    return chunker

def _build_training_sents(self):
    # Intended to randomly select a corpus from the provided list and build
    # the training sentences the chunkers will use; the random choice is
    # currently disabled in favour of conll2000.
    corpuses = [(conll2000, 'train.txt'), (conll2002, 'esp.train')]
    # trainer = random.choice(corpuses)
    # train_sents = trainer[0].chunked_sents(trainer[1], chunk_types=['NP'])
    train_sents = conll2000.chunked_sents('train.txt', chunk_types=['NP'])
    return train_sents

def drawParse(text):
    sentences = posTagging(text)
    # test_sents = conll2000.chunked_sents('test.txt', chunk_types=['NP'])
    train_sents = conll2000.chunked_sents("train.txt", chunk_types=["NP"])
    chunker = ChunkParser(train_sents)
    for s in sentences:
        chunker.parse(s).draw()

def simple_np_ugram(documents):
    """String sentences get split up into a data structure."""
    ugram = UnigramChunker(conll2000.chunked_sents('train.txt'))
    for doc in documents:
        buf = []
        for sent in pos.preprocess(doc):
            buf.append(ugram.parse(sent))
        yield buf

def chunking():
    train_sents = conll2000.chunked_sents('train.txt')
    train_data = [[w for w, t, c in nltk.chunk.tree2conlltags(sent)]
                  for sent in train_sents]
    train_label = [[c for w, t, c in nltk.chunk.tree2conlltags(sent)]
                   for sent in train_sents]
    # now append chunking to the front of each group/string
    return train_data, train_label

def __init__(self, stopWordPath=False, megamPath=False):
    '''
    Initialise the class. The method initialises:
      a. self.texts : a list of the texts contained in the corpus
      b. self.IDS : the id of each text
      c. self.stopWords : a list of any additional stopwords
      d. self.Lemmatizer : a defaultdict used to stem words; it returns 'n'
         for each token, except for verbs, for which it returns 'v'
    ---------------------------
    Keyword arguments:
      i.  stopWordPath : a file path from which to load extra stopwords
      ii. megamPath : a path to the megam binary used to train the chunker
    '''
    # a container for the texts in the corpus
    self.texts = []
    # input files
    self.in_files = []
    # ids of the texts
    self.IDS = []
    # stopwords
    self.stopWords = []
    if stopWordPath:
        # a path to a text file containing a list of stopwords
        self.stopWordPath = stopWordPath
        self.stopWords = [l.strip() for l in open(self.stopWordPath).readlines()]
    # initialise the dict for stemming
    self.Lemmatizer = collections.defaultdict(nounDict)
    self.Lemmatizer['v'] = 'v'
    # try to load a trained chunker
    trainPath = pkg_resources.resource_filename('MyLanguageCorpus', '')
    if os.path.exists('%s/trainedChunker.pkl' % trainPath):
        fin = open('%s/trainedChunker.pkl' % trainPath, 'rb')
        self.chunker = pickle.load(fin)
        fin.close()
    else:
        train_sents = conll2000.chunked_sents('train.txt', chunk_types=['NP'])
        print("Training chunker...")
        if 'MEGAM' in os.environ:
            MEGAM = os.environ['MEGAM']
        elif megamPath:
            MEGAM = megamPath
            os.environ['MEGAM'] = megamPath
        chunker = ConsecutiveNPChunker(train_sents)
        fout = open('%s/trainedChunker.pkl' % trainPath, 'wb')
        pickle.dump(chunker, fout)
        fout.close()
        self.chunker = chunker

def simpleEvaluation():
    cp = nltk.RegexpParser("")
    test_sents = conll2000.chunked_sents('test.txt', chunk_types=['NP'])
    print cp.evaluate(test_sents)

    grammar = r"NP: {<[CDJNP].*>+}"
    cp = nltk.RegexpParser(grammar)
    print cp.evaluate(test_sents)

def train_unigram(fichero):
    corpus_comida = conll2000.chunked_sents(fichero, chunk_types=['COMIDA', 'CANTIDAD'])
    print(corpus_comida)
    train_data = [[(w, c) for w, t, c in nltk.chunk.tree2conlltags(sent)]
                  for sent in corpus_comida]
    print(train_data)
    tagger = nltk.UnigramTagger(train_data)
    return tagger

def __init__(self, POS):
    '''
    @param POS: the POS tagger is passed through
    '''
    train_sents = conll2000.chunked_sents()
    train_data = [[(t, c) for w, t, c in tree2conlltags(sent)]
                  for sent in train_sents]
    self.T = nltk.TrigramTagger(train_data)
    self.Tagger = POS
    self.tmp = []

def ch07_07_chunker_eval():
    from nltk.corpus import conll2000
    grammar = r"""
      NP: {<NN.*>}
          {<DT> <NN> <JJ> <NN>}
          {<DT> <JJ>* <NN.*>}
          {<POS> <JJ>* <NN>}
          {<NNP> <CC> <NNP>}
    """
    cp = nltk.RegexpParser(grammar)
    test_sents = conll2000.chunked_sents("test.txt", chunk_types=["NP"])
    print cp.evaluate(test_sents)

def load_nltk_chunked_sentences(cls):
    """
    Load CoNLL-2000 chunk-tagged sentences and convert them from the
    nltk.tree format.

    Returns:
        A list of lists where each inner list is [(pos, chunk_tag), ...],
        representing one sentence.
    """
    train_sents = [
        [(pos_tag, chunk_tag)
         for word, pos_tag, chunk_tag in nltk.chunk.tree2conlltags(sentence)]
        for sentence in conll2000.chunked_sents()
    ]
    return cls(train_sents)

def ch07_13c_better_chunker():
    # can be improved with more patterns from the top of the previous method
    from nltk.corpus import conll2000
    grammar = r"""
      NP: {<DT> <JJ> <NN.*>}
          {<DT> <NN.*>}
          {<JJ> <NN.*>}
          {<NN.*>+}
    """
    cp = nltk.RegexpParser(grammar)
    test_sents = conll2000.chunked_sents("test.txt", chunk_types=["NP"])
    print cp.evaluate(test_sents)

def run_on_corpus(self, corpus):
    """Write sentences to a temporary file as strings of words, run
    TagChunk on the file and retrieve the tagged results, then delete
    the file.
    """
    # Check if the corpus consists of Sentences or MultiSentences, and
    # get a single list of Sentences either way
    sentences = []
    if corpus[0].__class__ == text.sentence.MultiSentence:
        for multisentence in corpus:
            # Collect the Sentence objects from each MultiSentence
            sentences.extend(multisentence.sentences)
    else:
        sentences = corpus

    train_sents = conll2000.chunked_sents('train.txt',
                                          chunk_types=['NP', 'VP', 'PP'])
    unigram_chunker = UnigramChunker(train_sents)

    # Build a "word_POS_IOB" string for each sentence
    strings_BIO = []
    for sentence in sentences:
        sentence_text = ' '.join(sentence.tokens)
        tags = [t[1] for t in nltk.pos_tag(sentence_text.split())]
        sentence_arr = sentence_text.split(" ")
        bio = unigram_chunker.tagger.tag(tags)
        temp = ""
        for i in range(0, len(sentence_arr)):
            if str(bio[i][1]) == "O":
                temp += sentence_arr[i] + "_" + bio[i][0] + "_B-" + str(bio[i][1]) + " "
            else:
                temp += sentence_arr[i] + "_" + bio[i][0] + "_" + str(bio[i][1]) + " "
        temp = temp[:-1]
        strings_BIO.append(temp)

    # Process each sentence
    for sentence, string_BIO in zip(sentences, strings_BIO):
        pos_tags, chunks = self.process_BIO_string(string_BIO)
        sentence.add_token_tags(pos_tags, name='pos_tags', annotator='chunker')
        sentence.add_span_tags(chunks, name='chunks', annotator='chunker')

def ch07_03_develop_grammar_with_chunkparser():
    # nltk.app.chunkparser()
    from nltk.corpus import conll2000
    grammar = r"""
      NP: {<NN.*>}
          {<DT> <NN> <JJ> <NN>}
          {<DT> <JJ>* <NN.*>}
          {<POS> <JJ>* <NN>}
          {<NNP> <CC> <NNP>}
    """
    cp = nltk.RegexpParser(grammar)
    for sentence in conll2000.chunked_sents("train.txt", chunk_types=["NP"]):
        print cp.parse(sentence)

def process_file(input_file):
    file_text = ''
    with open(input_file, 'r') as content_file:
        file_text = content_file.read().decode('utf-8')
    # sentences = nltk.sent_tokenize(file_text)
    sentences = file_text.split('\n')
    noun_phrases = []

    train_sents = conll2000.chunked_sents('train.txt', chunk_types=['NP'])
    test_sents = conll2000.chunked_sents('test.txt', chunk_types=['NP'])
    chunker = BigramChunker(train_sents)
    print(chunker.evaluate(test_sents))

    for sent in sentences:
        if not sent:
            continue
        tokens = nltk.word_tokenize(sent)
        if len(tokens) > 0:
            tagged = nltk.pos_tag(tokens)
            chunked = chunk_np(tagged)
            # chunked = chunker.parse(tagged)
            # chunked.draw()
            utils.traverse(chunked)

def ch07_13a_tag_seqs_for_np():
    from nltk.corpus import conll2000
    train_sents = conll2000.chunked_sents("train.txt", chunk_types=["NP"])
    fdist = nltk.FreqDist()
    tagseq = []
    for sent in train_sents:
        for word, postag, iobtag in nltk.chunk.tree2conlltags(sent):
            if iobtag == "B-NP":
                # a new NP starts: record the finished tag sequence
                fdist.inc(" ".join(tagseq))
                tagseq = []
                tagseq.append(postag)
            elif iobtag == "O":
                continue
            else:
                tagseq.append(postag)
    for tagseq in fdist.keys():
        print tagseq, fdist[tagseq]

def __init__(self, **kwargs):
    super(StoryApp, self).__init__(**kwargs)
    self.model = models.Model()
    self.parser = strategy.Parser(
        stemmer=nltk.PorterStemmer(),
        sentence_tokenizer=nltk.data.load('tokenizers/punkt/english.pickle'),
        # todo: analyze more chunk types
        # possible chunk types:
        #   NP (noun phrase)
        #   VP (such as 'has already delivered')
        #   PP (such as 'because of')
        chunker=chunkers.UnigramChunker(
            conll2000.chunked_sents('train.txt', chunk_types=['NP'])),
    )
    self.entity_resolver = strategy.EntityResolutionStrategy()
    self.window = None

def test_chunker(filesDir, classifier):
    # Create chunked sentences in the CoNLL format.
    test_sents = conll2000.chunked_sents('test_locations.txt', chunk_types=['Loc'])
    print classifier.evaluate(test_sents)

    text = conll2000.raw('test_data_normal.txt')
    location_list = []
    sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    for sent in sent_tokenizer.tokenize(text):
        for chunk in nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(sent))):
            if hasattr(chunk, 'node'):
                if chunk.node == "GPE":
                    location = ' '.join(c[0] for c in chunk.leaves())
                    location_list.append(location)
    print location_list

def __init__(self, binary=False, extract_noun_phrases=False, first_sentence_weight=1):
    self.columns = FEATURE_COLUMNS
    self.binary = binary
    self.extract_noun_phrases = extract_noun_phrases
    self.first_sentence_weight = first_sentence_weight

    train_sents = conll2000.chunked_sents('train.txt', chunk_types=['NP'])
    self.chunker = ChunkParser(train_sents)

    opts = ''
    if binary:
        opts += 'binary '
    if extract_noun_phrases:
        opts += 'extract_noun_phrases '
    if first_sentence_weight > 1:
        opts += 'upweight_first_sentence '
    self.name = opts

def run(q_id):
    train_sents = conll2000.chunked_sents('train.txt')
    unigram_chunker = UnigramChunker(train_sents)
    import init

    # Get the documents and tag them; tagging produces this format:
    # [("the", "DT"), ("little", "JJ"), ("yellow", "JJ"), ("dog", "NN"),
    #  ("barked", "VBD"), ("at", "IN"), ("the", "DT"), ("cat", "NN"), (".", ".")]
    topdoc = init.get_corpus(q_id)
    doc_nums = topdoc.keys()
    answers = []
    for key in doc_nums:
        doc_text = topdoc[key]
        docnum = key
        doc_text = clean_punctuation(doc_text)
        doc_text = doc_text.split()
        tagged = pos_tag(doc_text)
        chunked = unigram_chunker.parse2(tagged)
        flatten = chunked.pos()
        numbered = enumerate(flatten)

        # Group consecutive tokens that share the same phrase tag
        currentTag = ''
        words = []
        for i, v in numbered:
            ((word, tag), phrasetag) = v
            if currentTag == '':
                currentTag = phrasetag
            if currentTag == phrasetag:
                words.append(word)
            else:
                answers.append((' '.join(words), docnum, i - len(words),
                                currentTag, q_id))
                currentTag = phrasetag
                words = [word]
        answers.append((' '.join(words), docnum, i - len(words),
                        currentTag, q_id))
    return answers

def demo():
    """
    A demonstration for the C{RegexpChunkParser} class. A single text is
    parsed with four different chunk parsers, using a variety of rules
    and strategies.
    """
    from nltk import chunk, Tree

    text = """\
[ the/DT little/JJ cat/NN ] sat/VBD on/IN [ the/DT mat/NN ] ./.
[ John/NNP ] saw/VBD [the/DT cats/NNS] [the/DT dog/NN] chased/VBD ./.
[ John/NNP ] thinks/VBZ [ Mary/NN ] saw/VBD [ the/DT cat/NN ] sit/VB on/IN [ the/DT mat/NN ]./.
"""

    print '*' * 75
    print 'Evaluation text:'
    print text
    print '*' * 75
    print

    grammar = r"""
    NP:                    # NP stage
      {<DT>?<JJ>*<NN>}     # chunk determiners, adjectives and nouns
      {<NNP>+}             # chunk proper nouns
    """
    cp = chunk.RegexpParser(grammar)
    chunk.demo_eval(cp, text)

    grammar = r"""
    NP:
      {<.*>}               # start by chunking each tag
      }<[\.VI].*>+{        # unchunk any verbs, prepositions or periods
      <DT|JJ>{}<NN.*>      # merge det/adj with nouns
    """
    cp = chunk.RegexpParser(grammar)
    chunk.demo_eval(cp, text)

    grammar = r"""
    NP: {<DT>?<JJ>*<NN>}   # chunk determiners, adjectives and nouns
    VP: {<TO>?<VB.*>}      # VP = verb words
    """
    cp = chunk.RegexpParser(grammar)
    chunk.demo_eval(cp, text)

    grammar = r"""
    NP: {<.*>*}            # start by chunking everything
        }<[\.VI].*>+{      # chink any verbs, prepositions or periods
        <.*>}{<DT>         # separate on determiners
    PP: {<IN><NP>}         # PP = preposition + noun phrase
    VP: {<VB.*><NP|PP>*}   # VP = verb words + NPs and PPs
    """
    cp = chunk.RegexpParser(grammar)
    chunk.demo_eval(cp, text)

    # Evaluation
    from nltk.corpus import conll2000

    print
    print "Demonstration of empty grammar:"
    cp = chunk.RegexpParser("")
    print chunk.accuracy(cp, conll2000.chunked_sents('test.txt', chunk_types=('NP',)))

    print
    print "Demonstration of accuracy evaluation using CoNLL tags:"
    grammar = r"""
    NP:
      {<.*>}               # start by chunking each tag
      }<[\.VI].*>+{        # unchunk any verbs, prepositions or periods
      <DT|JJ>{}<NN.*>      # merge det/adj with nouns
    """
    cp = chunk.RegexpParser(grammar)
    print chunk.accuracy(cp, conll2000.chunked_sents('test.txt')[:5])

    print
    print "Demonstration of tagged token input"
    grammar = r"""
    NP: {<.*>*}            # start by chunking everything
        }<[\.VI].*>+{      # chink any verbs, prepositions or periods
        <.*>}{<DT>         # separate on determiners
    PP: {<IN><NP>}         # PP = preposition + noun phrase
    VP: {<VB.*><NP|PP>*}   # VP = verb words + NPs and PPs
    """
    cp = chunk.RegexpParser(grammar)
    print cp.parse([("the", "DT"), ("little", "JJ"), ("cat", "NN"),
                    ("sat", "VBD"), ("on", "IN"), ("the", "DT"),
                    ("mat", "NN"), (".", ".")])