def get_ptb_data(w2id):
    """Flatten the PTB corpus into a list of word ids, mapping OOV words to <unk>."""
    from nltk.corpus import ptb

    all_words = []
    for item in ptb.fileids():
        all_words.extend(word.lower() for word in ptb.words(item))

    all_words_id = []
    for w in all_words:
        word_id = w2id.get(w)
        if word_id is None:
            word_id = w2id.get("<unk>")
        all_words_id.append(word_id)
    return all_words_id

def parse_wsj():
    """Read every WSJ .mrg file in the PTB corpus and return one lowercased word list per file."""
    from nltk.corpus import ptb
    import logging

    logger = logging.getLogger('root')

    all_lines = []

    for wsj_folder in range(0, 25):
        for file_number in range(0, 100):
            wsj_file = 'wsj/%02d/wsj_%02d%02d.mrg' % (wsj_folder, wsj_folder, file_number)
            logger.info('opening %s', wsj_file)
            try:
                line = [word.lower() for word in ptb.words(wsj_file)]
                all_lines.append(line)
            except IOError:
                logger.info('%s not found', wsj_file)

    return all_lines
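
For context, a minimal usage sketch for the two helpers above (assuming the full PTB corpus is installed for NLTK; the chainer vocabulary is just one way to get a word-to-id mapping that contains "<unk>", mirroring its use in Example #9 below, and any dict with that shape works):

# Usage sketch (assumption: chainer's PTB vocabulary supplies the word -> id
# mapping, including "<unk>"; substitute your own vocabulary if preferred).
import chainer

w2id = chainer.datasets.get_ptb_words_vocabulary()

word_ids = get_ptb_data(w2id)   # flat list of word ids over the whole corpus
wsj_lines = parse_wsj()         # one lowercased word list per WSJ .mrg file
print(len(word_ids), len(wsj_lines))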
Example #3
 def test_category_words(self):
     self.assertEqual(
         ptb.words(categories=['humor', 'fiction'])[:6],
         ['Thirty-three', 'Scotty', 'did', 'not', 'go', 'back'],
     )
Example #4
 def test_words(self):
     self.assertEqual(
         ptb.words('WSJ/00/WSJ_0003.MRG')[:7],
         ['A', 'form', 'of', 'asbestos', 'once', 'used', '*'],
     )
Example #7
 def test_words(self):
     self.assertEqual(
         ptb.words("WSJ/00/WSJ_0003.MRG")[:7],
         ["A", "form", "of", "asbestos", "once", "used", "*"],
     )
Example #8
 def test_category_words(self):
     self.assertEqual(
         ptb.words(categories=["humor", "fiction"])[:6],
         ["Thirty-three", "Scotty", "did", "not", "go", "back"],
     )
Example #9
def main(test=False):
    """
    makes a big dumb PTB CFG, a ShiftReduceParser, and a ViterbiParser, and
    serializes them all to disk for future use.

    The ViterbiParser runs in cubic time and gives the most likely parse.
    The ShiftReduceParser runs in linear time and gives a single parse.

    https://stackoverflow.com/questions/7056996/how-do-i-get-a-set-of-grammar-rules-from-penn-treebank-using-python-nltk
    https://groups.google.com/forum/#!topic/nltk-users/_LXtbIekLvc
    https://www.nltk.org/_modules/nltk/grammar.html
    """
    vocabulary = chainer.datasets.get_ptb_words_vocabulary()
    freq_thresh = 0  # ARBITRARY: 0 disables the infrequent-word filter below
    word_freqs = FreqDist(ptb.words())

    if not os.path.isfile('parsers/grammar.pkl'):

        productions = []
        add_dict = {}

        # use the entire treebank's parsed sentences to generate the CFG
        for i, tree in enumerate(ptb.parsed_sents()):

            # is it a good idea to combine this with my preprocessing?
            tree.collapse_unary(collapsePOS=False)
            tree.chomsky_normal_form(horzMarkov=2)

            # preprocess all productions by removing all tags
            these_productions = tree.productions()
            for production in these_productions:

                # remove all tags from the LHS (only keep primary tag)
                production._lhs = preprocess_nt(production._lhs)

                rhs = []
                for item in production._rhs:

                    # remove all tags from the Nonterminals on the RHS
                    if type(item) == nltk.grammar.Nonterminal:
                        rhs.append(preprocess_nt(item))

                    # replace numbers with N
                    elif is_number(item):
                        rhs.append('N')

                    # items not in dictionary replaced with <unk>
                    # dictionary requires lower
                    elif not is_key(vocabulary, item.lower()):
                        rhs.append('<unk>')

                    # replace infrequent words with <unk>
                    elif word_freqs[item] < freq_thresh:
                        rhs.append('<unk>')

                    # lowercase all entries in the grammar
                    else:
                        rhs.append(item.lower())

                production._rhs = tuple(rhs)

                if not is_key(add_dict, str(production)):
                    add_dict[str(production)] = True
                    productions.append(production)

        print('** {} productions found! **'.format(len(productions)))
        grammar = induce_pcfg(Nonterminal('S'), productions)

        with open('parsers/grammar.pkl', 'wb') as f:
            pickle.dump(grammar, f)

    if not os.path.isfile('parsers/viterbi_parser.pkl'):
        with open('parsers/grammar.pkl', 'rb') as f:
            grammar = pickle.load(f)
        viterbi_parser = ViterbiParser(grammar, trace=0)  # cubic time

        with open('parsers/viterbi_parser.pkl', 'wb') as f:
            pickle.dump(viterbi_parser, f)
    else:
        with open('parsers/viterbi_parser.pkl', 'rb') as f:
            viterbi_parser = pickle.load(f)

    if not os.path.isfile('parsers/shift_reduce_parser.pkl'):
        with open('parsers/grammar.pkl', 'rb') as f:
            grammar = pickle.load(f)
        shift_reduce_parser = ShiftReduceParser(grammar, trace=0)  # linear time

        with open('parsers/shift_reduce_parser.pkl', 'wb') as f:
            pickle.dump(shift_reduce_parser, f)
    else:
        with open('parsers/shift_reduce_parser.pkl', 'rb') as f:
            shift_reduce_parser = pickle.load(f)

    with open('data/ptb.train.txt', 'r') as f:
        data = f.readlines()

    if test:
        for sample in [1, 23, 20330, 20332, 443]:

            t1 = time.time()
            viterbi_parser.parse_one(data[sample].split())
            t2 = time.time()
            print('viterbi      = {:.2f} sec for {} words'.format(
                t2-t1, len(data[sample].split())))

            t1 = time.time()
            shift_reduce_parser.parse_one(data[sample].split())
            t2 = time.time()
            print('shift reduce = {:.2f} sec for {} words'.format(
                t2-t1, len(data[sample].split())))
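
Since the docstring above only describes the pipeline in prose, here is a hedged, minimal sketch of the same grammar-induction and parsing steps on NLTK's freely available 10% treebank sample. The slice size and the omission of the author's vocabulary/<unk> preprocessing and serialization are simplifications for illustration, not the original setup:

# Sketch: induce a PCFG from NLTK's bundled treebank sample and parse one sentence.
# Assumes nltk.download('treebank') has been run.
import nltk
from nltk import Nonterminal, induce_pcfg
from nltk.corpus import treebank
from nltk.parse import ViterbiParser

productions = []
for tree in treebank.parsed_sents()[:200]:        # small slice to keep it manageable
    tree.collapse_unary(collapsePOS=False)
    tree.chomsky_normal_form(horzMarkov=2)
    productions.extend(tree.productions())

grammar = induce_pcfg(Nonterminal('S'), productions)
parser = ViterbiParser(grammar, trace=0)          # exact but cubic-time

tokens = treebank.sents()[0]                      # a sentence the grammar has seen
for parse in parser.parse(tokens):
    print(parse)
    break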
Example #10
 def test_category_words(self):
     self.assertEqual(
         ptb.words(categories=["humor", "fiction"])[:6], ["Thirty-three", "Scotty", "did", "not", "go", "back"]
     )
Example #11
 def test_words(self):
     self.assertEqual(ptb.words("WSJ/00/WSJ_0003.MRG")[:7], ["A", "form", "of", "asbestos", "once", "used", "*"])
Example #12
singleWords=set([","])
punc=set([":","#","-LRB-","CD","TO","``","S","''","$","NP","IN","-RRB-",",","."])
punc2=set([":","#","``","''","$",",","."])
i=0
# Read all files, then go through every line in each file, using a stack to extract the grammar rules:
# for each ')' pop from the stack until '(' is reached, emit a rule, and push the rule's left-hand side back onto the stack for the enclosing rule.

for o in os.listdir('wsj'):
    if (os.path.isdir(os.path.join('wsj',o))):
        for filename in os.listdir(os.path.join('wsj',o)):
            i = i + 1
            print(i)
            if filename.endswith(".mrg"):
                path='wsj/'+o+'/'+filename
              
                singleWords=set(ptb.words(path))| singleWords
                pranthesis=Stack()
                words=Stack()
                f=open(path,'r')
                content=f.readlines()
                allWords=[]
                for line in content:
                    for l in line.strip().split():
                        # use a regular expression to break each line into tokens: words, '(' and ')'
                        result=re.findall("[)]|[(]|[^\)\(]*",l.strip())
                        for r in result:
                            if r !="":
                                allWords.append(r)

                buffer=[]
                for word in allWords:
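
The example above cuts off mid-loop. As a separate illustration of the stack idea described in the comment, here is a hedged, self-contained sketch; the helper `extract_rules` and the sample tree string are assumptions for illustration, not a reconstruction of the truncated code:

# Sketch of the stack approach: push tokens until ')' is seen, then pop back to
# the matching '(', emit a "LHS -> RHS" rule, and push the LHS back so it can
# appear on the right-hand side of the enclosing rule.
import re

def extract_rules(tree_string):
    tokens = re.findall(r"[()]|[^()\s]+", tree_string)
    stack, rules = [], []
    for tok in tokens:
        if tok != ')':
            stack.append(tok)
            continue
        popped = []
        while stack and stack[-1] != '(':
            popped.append(stack.pop())
        if stack and stack[-1] == '(':
            stack.pop()                  # discard the matching '('
        popped.reverse()                 # restore original order: LHS, then children
        lhs, rhs = popped[0], popped[1:]
        rules.append('%s -> %s' % (lhs, ' '.join(rhs)))
        stack.append(lhs)                # the constituent becomes a child of its parent
    return rules

print(extract_rules("(S (NP (DT the) (NN cat)) (VP (VBZ sleeps)))"))
# ['DT -> the', 'NN -> cat', 'NP -> DT NN', 'VBZ -> sleeps', 'VP -> VBZ', 'S -> NP VP']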
Example #13
print(docs[0].docno)
print(docs[0].doctype)
print(docs[0].date_time)
print(docs[0].headline)
print(docs[0].text)  # doctest: +ELLIPSIS
# parsed corpora
print(treebank.fileids())  # doctest: +ELLIPSIS
print(treebank.words('wsj_0003.mrg'))
print(treebank.tagged_words('wsj_0003.mrg'))
print(treebank.parsed_sents('wsj_0003.mrg')[0])  # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
# nltk.download('ptb')
print(ptb.fileids())  # doctest: +SKIP
# download the corpus from here: https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/corpora/treebank.zip
# then extract and place to the following location: .../nltk_data/corpora/ptb/
print(ptb.words('treebank/combined/wsj_0003.mrg'))  # doctest: +SKIP
print(ptb.tagged_words('treebank/combined/wsj_0003.mrg'))  # doctest: +SKIP
# print(ptb.categories())  # doctest: +SKIP
# print(ptb.fileids('news'))  # doctest: +SKIP
# print(ptb.words(categories=['humor', 'fiction']))  # doctest: +SKIP
# nltk.download('sinica_treebank')
print(sinica_treebank.sents())  # doctest: +SKIP
print(sinica_treebank.parsed_sents()[25])  # doctest: +SKIP
# nltk.download('conll2007')
print(conll2007.sents('esp.train')[0])  # doctest: +SKIP
print(conll2007.parsed_sents('esp.train')[0])  # doctest: +SKIP
print(conll2007.parsed_sents('esp.train')[0].tree())  # doctest: +SKIP
# for tree in ycoe.parsed_sents('cocuraC')[:4]:
#     print(tree)  # doctest: +SKIP
# word lists and lexicons
print(words.fileids())