from nltk.corpus import ptb


def get_ptb_data(w2id):
    """Return every word in the PTB corpus as an id, mapping unknown words to <unk>."""
    all_words = []
    for item in ptb.fileids():
        all_words.extend(map(str.lower, ptb.words(item)))

    all_words_id = []
    for w in all_words:
        word_id = w2id.get(w)
        if word_id is None:
            word_id = w2id.get("<unk>")
        all_words_id.append(word_id)
    return all_words_id
def parse_wsj():
    from nltk.corpus import ptb
    import logging

    logger = logging.getLogger('root')
    all_lines = []
    for wsj_folder in range(0, 25):
        for file_number in range(0, 100):
            wsj_file = 'wsj/%02d/wsj_%02d%02d.mrg' % (wsj_folder, wsj_folder, file_number)
            logger.info('opening %s' % wsj_file)
            try:
                line = [word.lower() for word in ptb.words(wsj_file)]
                all_lines.append(line)
            except IOError:
                logger.info('not found')
    return all_lines
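# A minimal usage sketch for the two helpers above, assuming the word-to-id mapping
# comes from chainer's PTB vocabulary (as in the PCFG-building example further down);
# any dict mapping words to ids that contains '<unk>' would work the same way.
import chainer

w2id = chainer.datasets.get_ptb_words_vocabulary()  # {word: id}, includes '<unk>'
word_ids = get_ptb_data(w2id)   # flat list of word ids over every PTB file
wsj_lines = parse_wsj()         # one lowercased token list per WSJ .mrg file
print(len(word_ids), len(wsj_lines))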
def test_category_words(self):
    self.assertEqual(
        ptb.words(categories=['humor', 'fiction'])[:6],
        ['Thirty-three', 'Scotty', 'did', 'not', 'go', 'back'],
    )
def test_words(self):
    self.assertEqual(
        ptb.words('WSJ/00/WSJ_0003.MRG')[:7],
        ['A', 'form', 'of', 'asbestos', 'once', 'used', '*'],
    )
import os
import pickle
import time

import chainer
import nltk
from nltk import FreqDist
from nltk.corpus import ptb
from nltk.grammar import Nonterminal, induce_pcfg
from nltk.parse import ShiftReduceParser, ViterbiParser

# preprocess_nt, is_number and is_key are project helpers assumed to be defined
# elsewhere: strip functional tags from a nonterminal, test for numeric tokens,
# and test dict membership, respectively.


def main(test=False):
    """
    Builds a big, dumb PTB CFG, a ShiftReduceParser, and a ViterbiParser, and
    serializes them all to disk for future use.

    The ViterbiParser runs in cubic time and gives the most likely parse.
    The ShiftReduceParser runs in linear time and gives a single parse.

    https://stackoverflow.com/questions/7056996/how-do-i-get-a-set-of-grammar-rules-from-penn-treebank-using-python-nltk
    https://groups.google.com/forum/#!topic/nltk-users/_LXtbIekLvc
    https://www.nltk.org/_modules/nltk/grammar.html
    """
    vocabulary = chainer.datasets.get_ptb_words_vocabulary()
    freq_thresh = 0  # arbitrary
    word_freqs = FreqDist(ptb.words())

    if not os.path.isfile('parsers/grammar.pkl'):
        productions = []
        add_dict = {}

        # use the entire treebank's parsed sentences to generate the CFG
        for i, tree in enumerate(ptb.parsed_sents()):
            # is it a good idea to combine this with my preprocessing?
            tree.collapse_unary(collapsePOS=False)
            tree.chomsky_normal_form(horzMarkov=2)

            # preprocess all productions by removing all tags
            for production in tree.productions():
                # remove all tags from the LHS (only keep the primary tag)
                production._lhs = preprocess_nt(production._lhs)

                rhs = []
                for item in production._rhs:
                    # remove all tags from the Nonterminals on the RHS
                    if isinstance(item, nltk.grammar.Nonterminal):
                        rhs.append(preprocess_nt(item))
                    # replace numbers with N
                    elif is_number(item):
                        rhs.append('N')
                    # items not in the dictionary are replaced with <unk>
                    # (the dictionary is lowercased)
                    elif not is_key(vocabulary, item.lower()):
                        rhs.append('<unk>')
                    # replace infrequent words with <unk>
                    elif word_freqs[item] < freq_thresh:
                        rhs.append('<unk>')
                    # lowercase all entries in the grammar
                    else:
                        rhs.append(item.lower())
                production._rhs = tuple(rhs)

                # keep each unique production once
                if not is_key(add_dict, repr(production)):
                    add_dict[repr(production)] = True
                    productions.append(production)

        print('** {} productions found! **'.format(len(productions)))
        grammar = induce_pcfg(Nonterminal('S'), productions)

        with open('parsers/grammar.pkl', 'wb') as f:
            f.write(pickle.dumps(grammar))

    if not os.path.isfile('parsers/viterbi_parser.pkl'):
        with open('parsers/grammar.pkl', 'rb') as f:
            grammar = pickle.load(f)
        viterbi_parser = ViterbiParser(grammar, trace=0)  # cubic time
        with open('parsers/viterbi_parser.pkl', 'wb') as f:
            f.write(pickle.dumps(viterbi_parser))

    if not os.path.isfile('parsers/shift_reduce_parser.pkl'):
        with open('parsers/grammar.pkl', 'rb') as f:
            grammar = pickle.load(f)
        shift_reduce_parser = ShiftReduceParser(grammar, trace=0)  # linear time
        with open('parsers/shift_reduce_parser.pkl', 'wb') as f:
            f.write(pickle.dumps(shift_reduce_parser))

    with open('data/ptb.train.txt', 'r') as f:
        data = f.readlines()

    if test:
        # reload from disk so both parsers are defined even when the pickles
        # already existed and the branches above were skipped
        with open('parsers/viterbi_parser.pkl', 'rb') as f:
            viterbi_parser = pickle.load(f)
        with open('parsers/shift_reduce_parser.pkl', 'rb') as f:
            shift_reduce_parser = pickle.load(f)

        for sample in [1, 23, 20330, 20332, 443]:
            t1 = time.time()
            viterbi_parser.parse_one(data[sample].split())
            t2 = time.time()
            print('viterbi = {:.2f} sec for {} words'.format(
                t2 - t1, len(data[sample].split())))

            t1 = time.time()
            shift_reduce_parser.parse_one(data[sample].split())
            t2 = time.time()
            print('shift reduce = {:.2f} sec for {} words'.format(
                t2 - t1, len(data[sample].split())))
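# A minimal sketch of reusing the artifacts that main() serializes, assuming the
# pickles exist under parsers/; the sample sentence is illustrative and must be
# preprocessed the same way as the grammar (lowercased, numbers as 'N',
# out-of-vocabulary words as '<unk>'), otherwise the parser rejects uncovered tokens.
import pickle

with open('parsers/viterbi_parser.pkl', 'rb') as f:
    viterbi_parser = pickle.load(f)

tokens = 'the <unk> bought N shares'.split()
tree = viterbi_parser.parse_one(tokens)  # most likely parse, or None if none found
if tree is not None:
    tree.pretty_print()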
import os
import re

from nltk.corpus import ptb

singleWords = set([","])
punc = set([":", "#", "-LRB-", "CD", "TO", "``", "S", "''", "$", "NP", "IN", "-RRB-", ",", "."])
punc2 = set([":", "#", "``", "''", "$", ",", "."])
i = 0

# Read all files, then go through every line in each file, using a stack to extract
# the grammar rules: for each ')' pop from the stack until a '(' is found, generate
# a rule, and push the last item back onto the stack for the next rule.
for o in os.listdir('wsj'):
    if os.path.isdir(os.path.join('wsj', o)):
        for filename in os.listdir(os.path.join('wsj', o)):
            i = i + 1
            print(i)
            if filename.endswith(".mrg"):
                path = 'wsj/' + o + '/' + filename
                singleWords = set(ptb.words(path)) | singleWords
                pranthesis = Stack()  # Stack is assumed to be defined elsewhere in the project
                words = Stack()
                f = open(path, 'r')
                content = f.readlines()
                allWords = []
                for line in content:
                    for l in line.strip().split():
                        # break each line into words, '(' and ')' with a regular expression
                        result = re.findall(r"[)]|[(]|[^\)\(]*", l.strip())
                        for r in result:
                            if r != "":
                                allWords.append(r)
                buffer = []
                for word in allWords:
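# The snippet above walks the raw bracketing by hand with a stack; a sketch of the
# same rule extraction using nltk's own treebank reader (the file id is illustrative,
# and its casing depends on how the corpus was installed):
from nltk.corpus import ptb

for tree in ptb.parsed_sents('wsj/00/wsj_0001.mrg'):
    for production in tree.productions():
        print(production)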
print(docs[0].docno)
print(docs[0].doctype)
print(docs[0].date_time)
print(docs[0].headline)
print(docs[0].text)  # doctest: +ELLIPSIS

# parsed corpora
print(treebank.fileids())  # doctest: +ELLIPSIS
print(treebank.words('wsj_0003.mrg'))
print(treebank.tagged_words('wsj_0003.mrg'))
print(treebank.parsed_sents('wsj_0003.mrg')[0])  # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE

# nltk.download('ptb')
print(ptb.fileids())  # doctest: +SKIP
# download the corpus from here: https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/corpora/treebank.zip
# then extract it and place it at the following location: .../nltk_data/corpora/ptb/
# (see the path-check sketch after this block)
print(ptb.words('treebank/combined/wsj_0003.mrg'))  # doctest: +SKIP
print(ptb.tagged_words('treebank/combined/wsj_0003.mrg'))  # doctest: +SKIP
# print(ptb.categories())  # doctest: +SKIP
# print(ptb.fileids('news'))  # doctest: +SKIP
# print(ptb.words(categories=['humor', 'fiction']))  # doctest: +SKIP

# nltk.download('sinica_treebank')
print(sinica_treebank.sents())  # doctest: +SKIP
print(sinica_treebank.parsed_sents()[25])  # doctest: +SKIP

# nltk.download('conll2007')
print(conll2007.sents('esp.train')[0])  # doctest: +SKIP
print(conll2007.parsed_sents('esp.train')[0])  # doctest: +SKIP
print(conll2007.parsed_sents('esp.train')[0].tree())  # doctest: +SKIP

# for tree in ycoe.parsed_sents('cocuraC')[:4]:
#     print(tree)  # doctest: +SKIP

# word lists and lexicons
print(words.fileids())
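# As referenced above: a small sanity check, assuming the downloaded treebank files
# were placed under .../nltk_data/corpora/ptb/; nltk.data.find() raises LookupError
# if the corpus is not where NLTK expects it.
import nltk

print(nltk.data.find('corpora/ptb'))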