Example 1
import sys

from nltk.corpus.reader import BracketParseCorpusReader

# PARSER_DATA_DIR is a module-level constant defined elsewhere in the original script.
def read_brackets(constitfile):
    sys.stderr.write("\nReading constituents from " + constitfile + " ...\n")
    reader = BracketParseCorpusReader(PARSER_DATA_DIR + "rnng/", constitfile)
    parses = reader.parsed_sents()
    return parses
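
# A minimal usage sketch; "dev.trees" is a hypothetical file name under
# PARSER_DATA_DIR + "rnng/".
parses = read_brackets("dev.trees")
print(len(parses), parses[0].leaves())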
Example 2
import unicodedata

from nltk.corpus.reader import BracketParseCorpusReader


def parse_trees(corpus_dir, fileid):
    # reader = BracketParseCorpusReader('/home/lnn/Documents/ability/cranfield_testdata/upenn_transfer/new_ctb', fileid)
    reader = BracketParseCorpusReader(corpus_dir, fileid)
    tree = reader.parsed_sents()
    return tree
def load_reader_and_fileids(lang, data_type):
    assert data_type in ('train', 'val', 'test', 'wsj10')
    def filter_trees(tree, data_type):
        def _is_control(char):
            """Checks whether `chars` is a control character."""
            # These are technically control characters but we count them as whitespace
            # characters.
            if char == "\t" or char == "\n" or char == "\r":
                return False
            cat = unicodedata.category(char)
            if cat.startswith("C"):
                return True
            return False
        
        sent = tree.leaves()
        # The caller passes 'wsj10' for the length-10 WSJ subset (see below).
        if data_type == 'wsj10' and len(sent) > 10: return False
        if data_type != 'wsj10' and len(sent) > 128: return False
        try:
            for c in ' '.join(sent):
                cp=ord(c)
                if cp == 0 or cp == 0xfffd or _is_control(c):
                    return False
            return True
        except Exception:
            return False

    def filt_id(fileids, lang):
        assert lang in ('en', 'fr', 'zh')
        train_file_ids, valid_file_ids, test_file_ids = [], [], []
        for id in fileids:
            prefix=id.split('.')[0]
            if lang=='en':
                if 'WSJ/22/WSJ_2200' <= prefix <= 'WSJ/22/WSJ_2299':
                    valid_file_ids.append(id)
                elif 'WSJ/23/WSJ_2300' <= prefix <= 'WSJ/23/WSJ_2399':
                    test_file_ids.append(id)
                else:
                    train_file_ids.append(id)        
            elif lang=='zh':
                if '0886' <= prefix <= '0931' or '1148' <= prefix <= '1151':
                    valid_file_ids.append(id)
                elif '0816' <= prefix <= '0885' or '1137' <= prefix <='1147':
                    test_file_ids.append(id)
                else:
                    train_file_ids.append(id)        
            else:
                if prefix in ('flmf3_12500_12999co','flmf7ab2ep','flmf7ad1co','flmf7ae1ep'):
                    valid_file_ids.append(id) 
                elif prefix in ('flmf3_12000_12499ep','flmf7aa1ep','flmf7aa2ep','flmf7ab1co'):
                    test_file_ids.append(id)
                else:
                    train_file_ids.append(id)
        return train_file_ids,valid_file_ids,test_file_ids

    assert lang in ('en','zh','fr','il','jp','sp','ca','sw','de')
    # treebank_dir is a module-level constant defined elsewhere in the original script.
    lang_dir = treebank_dir + '/' + lang
    reader = BracketParseCorpusReader(lang_dir, '.*')
    fileids = reader.fileids()
    if data_type == 'wsj10':
        return [t for t in reader.parsed_sents(fileids) if filter_trees(t, data_type)]
    train_file_ids = []
    valid_file_ids = []
    test_file_ids = []
    if lang in ('en', 'zh', 'fr'):
        train_file_ids, valid_file_ids, test_file_ids = filt_id(fileids, lang)
        train_trees = reader.parsed_sents(train_file_ids)
        val_trees = reader.parsed_sents(valid_file_ids)
        test_trees = reader.parsed_sents(test_file_ids)
    else:
        # Initialise so a treebank lacking train/val/test files fails
        # gracefully below instead of raising NameError.
        train_trees, val_trees, test_trees = [], [], []
        for fid in fileids:
            if 'train' in fid:
                train_trees = reader.parsed_sents(fid)
            elif 'val' in fid:
                val_trees = reader.parsed_sents(fid)
            elif 'test' in fid:
                test_trees = reader.parsed_sents(fid)
    if data_type == 'train':
        train_trees = [t for t in train_trees if filter_trees(t, data_type)]
        print(f'train:{len(train_trees)}')
        return train_trees
    elif data_type == 'val':
        val_trees = [t for t in val_trees if filter_trees(t, data_type)]
        print(f'val:{len(val_trees)}')
        return val_trees
    else:
        test_trees = [t for t in test_trees if filter_trees(t, data_type)]
        print(f'test:{len(test_trees)}')
        return test_trees
Example 4
print(sents[1:20])  # `sents` is defined in the preceding section of the tutorial

# 1.9. Loading your own corpus
from nltk.corpus import PlaintextCorpusReader

corpus_root = '/Temp/delete'
wordlists = PlaintextCorpusReader(corpus_root, '.*')
print(wordlists.fileids())
print(wordlists.words('blake-poems.txt'))

from nltk.corpus import BracketParseCorpusReader

corpus_root = r'C:\nltk_data\corpora\treebank\combined'
# The 'combined' directory is flat, so a bare file-name pattern suffices
# (use r'.*/wsj_.*\.mrg' for the sectioned treebank layout).
file_pattern = r'wsj_.*\.mrg'
ptb = BracketParseCorpusReader(corpus_root, file_pattern)
print(ptb)
print(ptb.fileids())
print(len(ptb.sents()))
print(ptb.sents(fileids='wsj_0199.mrg')[1])

# 2. Conditional frequency distributions: a collection of frequency distributions,
#    one per "condition". Each (condition, word) pair counts the frequency of the
#    word under that condition.
# 2.1. Conditions and events
# 2.2. Counting words by genre
import nltk
from nltk.corpus import brown

cfd = nltk.ConditionalFreqDist((genre, word) for genre in brown.categories()
                               for word in brown.words(categories=genre))
genre_word = [(genre, word) for genre in ['news', 'romance']
              for word in brown.words(categories=genre)]
print(genre_word)
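
# To make the (condition, word) idea concrete: tabulate modal-verb counts for
# the two genres (the word list is chosen just for illustration).
modals = ['can', 'could', 'may', 'might', 'must', 'will']
cfd.tabulate(conditions=['news', 'romance'], samples=modals)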
Example 5
import re
from os.path import join

from nltk.corpus.reader import (BracketParseCorpusReader,
                                NombankCorpusReader,
                                PropbankCorpusReader)
from nltk.data import FileSystemPathPointer

# corpus_root, treebank_root, treebank_file_pattern, propbank_root,
# propbank_file and propbank_verbs_file are defined earlier in the original script.
nombank_root = join(corpus_root, 'nombank.1.0')
nombank_file = 'nombank.1.0_sorted'
nombank_nouns_file = 'nombank.1.0.words'

frame_file_pattern = r'frames/.*\.xml'


def fileid_xform_function(filename):
    result = re.sub(r'^wsj/', '', filename)
    # result = re.sub(r'^wsj/\d\d/', '', filename)
    # result = re.sub(r'\.mrg$', '', result)
    return result


treebank = BracketParseCorpusReader(root=treebank_root,
                                    fileids=treebank_file_pattern,
                                    tagset='wsj',
                                    encoding='ascii')

propbank = PropbankCorpusReader(root=FileSystemPathPointer(propbank_root),
                                propfile=propbank_file,
                                framefiles=frame_file_pattern,
                                verbsfile=propbank_verbs_file,
                                parse_fileid_xform=fileid_xform_function,
                                parse_corpus=treebank)

nombank = NombankCorpusReader(root=FileSystemPathPointer(nombank_root),
                              nomfile=nombank_file,
                              framefiles=frame_file_pattern,
                              nounsfile=nombank_nouns_file,
                              parse_fileid_xform=fileid_xform_function,
                              parse_corpus=treebank)
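
# A minimal sketch of what the wired-up readers allow: each PropBank instance
# can resolve its gold tree through parse_corpus (assumes the corpora are
# actually present at the paths above).
inst = propbank.instances()[0]
print(inst.fileid, inst.sentnum, inst.predicate)
print(inst.tree)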
    tags.append("EOS")

    while sentence[i] != "" or len(sentence) <= 3:
        tags.append(get_next_tag(pos_dist, tags[i]))
        sentence.append(get_next_word(t2w_dist, tags[i+1]))
        i += 1

    return (sentence, tags)



# Import and parse the corpus

corpus_root = './corpus_clean/'
corpus = BracketParseCorpusReader(corpus_root, ".*")

tagged_sentences = corpus.tagged_sents()
ngram_input = []
pos_input = []
legal_tags = ["EOS","$","#", "GW", "CC", "CD", "DT", "EX", "FW", "IN", "JJ","JJR","JJS","LS","MD",
             "NN","NNS","NNP",'NNPS','PDT','POS','PRP','PRP$','RB','RBR','RBS','RP','TO', "UH",'VB',
             'VBD',"VBG","VBN","VBP","VBZ","WDT","WP","WP$","WRB", "\"", "\'", ",", ".", "AFX"]

single_letter_words = ["a", "i", ",", ".", "!", "?", "\'", "\"", ":", ';', '0', '1', '2', "3", '4',
                       '5', "6", '7', '8', "9", "=", "&", "#", '/', '>', "$", '<', '+', '%',]

# tags_removed = ["-NONE-","SYM", "CODE", "ADD", "HYPH","-LSB-", "-RSB-",":", "NFP", "XX", "-LRB-", "-RRB-"]

# Remove -NONE- and SYM tags from the training data and create a list of tokens and a list of tags.
for sentence in tagged_sentences:
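    # (The loop body is cut off in this excerpt; the filtering below is a
    # hypothetical completion based on the comment above, not the author's code.)
    for word, tag in sentence:
        if tag in ("-NONE-", "SYM") or tag not in legal_tags:
            continue
        if len(word) == 1 and word.lower() not in single_letter_words:
            continue
        ngram_input.append(word.lower())
        pos_input.append(tag)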
Example 7
    h.close()

    vocab = [i[0] for i in vocab]
    return vocab


if __name__ == '__main__':
    TRAIN_FILE = 'data/wsj_2-21'
    TEST_FILE = 'data/wsj_23'
    DEV_FILE = 'data/wsj_24'
    SECTIONS = [(2, 21), (23, 23), (24, 24)]
    MAXLEN = 50

    wsj = '/data/penn_tb_3.0/TREEBANK_3/PARSED/MRG/WSJ/'
    file_pattern = r".*/WSJ_.*\.MRG"
    ptb = BracketParseCorpusReader(wsj, file_pattern)
    print('Gathered %d files...' % len(ptb.fileids()))

    print('Generating vocabulary...')
    vocab = get_vocab()
    print('Done.')

    print('Preprocessing all sections...')
    for fn, sections in zip([TRAIN_FILE, TEST_FILE, DEV_FILE], SECTIONS):
        print('Preprocessing %s...' % fn)
        h = open(fn, 'wt')
        for section in range(sections[0], sections[1] + 1):
            fileids = [
                i for i in ptb.fileids()
                if i.startswith(str(section).zfill(2))
            ]
Example 8
import nltk
import random

from nltk.corpus import BracketParseCorpusReader
from nltk import induce_pcfg

treebank = BracketParseCorpusReader(
    "resources/",
    "skladnica_no_heads.txt",
)

productions = []
for item in treebank.fileids()[:2]:
    for tree in treebank.parsed_sents(item):
        #tree.draw()
        productions += tree.productions()

grammar = induce_pcfg(nltk.Nonterminal('wypowiedzenie:'), productions)
print(grammar.start())

# Get the feature "types" appearing in nonterminal labels.


def get_type(lhs):
    """Return the bracketed '[...]' part of a label string, or None if absent."""
    left = lhs.find("[")
    right = lhs.rfind("]")

    if left == -1:
        return None
    return lhs[left:right + 1]
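
# A sketch of applying get_type across the induced grammar; bracketed feature
# annotations like '[...]' are an assumption about this treebank's labels.
types = set()
for prod in grammar.productions():
    t = get_type(str(prod.lhs()))
    if t is not None:
        types.add(t)
print(sorted(types))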
Example 9
    "seemed": "os-seem",
    "seems": "os-seem",
    "seeming": "os-seem",
    "sound": "os-sound",
    "sounded": "os-sound",
    "sounds": "os-sound",
    "sounding": "os-sound",
    "ask": "os-ask",
    "asked": "os-ask",
    "asks": "os-ask",
    "asking": "os-ask",
}

# corpus_root and file_pattern are defined earlier in the original script.
ptb = BracketParseCorpusReader(
    corpus_root,
    file_pattern,
    #encoding='utf-8'
    encoding='iso-8859-1')
ptbS = ptb.parsed_sents()


def getClauseHead(st):
    clauseHead = ''
    # Delete disfluencies at the S level. Iterate over indices in reverse so
    # deletions do not shift unvisited positions; note that `del daughter` on
    # the loop variable would not remove anything from the tree.
    for i in reversed(range(len(st))):
        if st[i].label() in [
                'EDITED', 'RS', 'PRN', '-DFL-', 'CONJP', 'ADVP'
        ]:  # including "not only ..."
            del st[i]
    #print(joinLeaves(st), st.label())
    if st[0].label()[:2] == 'WH' and joinLeaves(st[0]).lower() == '0':
Example 10
import re

from nltk.corpus.reader import BracketParseCorpusReader


def CorpusPTBReader(ptb_data_path):
    ptb_sent_file = open("total_ptb.txt", "w")

    file_pattern = r".*/.*\.mrg"

    ptb = BracketParseCorpusReader(ptb_data_path, file_pattern)
    #print (ptb.fileids())
    #print ((ptb.sents()))
    #ptb.sents(fileids= 'brown/cf/cf01.mrg')[0]
    count = 0
    for sent in ptb.sents():
        '''sent = ""
        for word in sent:
            if "\\" in word or "e_s" in word or "n_s" in word:
                continue
            else:
                sent += word + " "
        out = sent[:-1]'''
        if len(sent) < 7: continue
        out = ' '.join(sent)
        out = out.lower()
        #        print(len(sent), out)

        # `Parser` and `grammar` are defined elsewhere in the original project;
        # the parse result is used below to locate numeric spans in the sentence.
        parser = Parser(grammar, 'all')
        temp_result = parser.parse(out)
        sub_sent = []
        start_index = 0
        for num_info in temp_result[1]:
            sub_sent.append(out[start_index:num_info[1]])
            sub_sent.append("NUM" + (str(num_info[2] - num_info[1])))
            start_index = num_info[2]
        sub_sent.append(out[start_index:])
        final_out = ''.join(sub_sent)

        # Strip empty-element traces, escape sequences and stray hyphens left
        # over from the PTB token stream.
        final_out = re.sub(r'\*\-NUM\d ', '', final_out)
        final_out = re.sub(r'e_s ', '', final_out)
        final_out = re.sub(r'n_s ', '', final_out)
        final_out = re.sub(r'e_s', '', final_out)
        final_out = re.sub(r'n_s', '', final_out)
        final_out = re.sub(r'\\. ', '', final_out)
        final_out = re.sub(r'\\.', '', final_out)
        final_out = re.sub(r'\*. ', '', final_out)
        final_out = re.sub(r'\*.', '', final_out)
        final_out = re.sub(r'-. ', '', final_out)
        final_out = re.sub(r'-.', '', final_out)
        #final_out = re.sub(r'\**.\* ', '', final_out)
        #final_out = re.sub(r'\**.\*', '', final_out)
        final_out = re.sub(r'\*{,3}.\*.. ', '', final_out)
        final_out = re.sub(r'\*{,3}.\*. ', '', final_out)
        final_out = re.sub(r'\*.. ', '', final_out)
        final_out = re.sub(r'\*..', '', final_out)
        final_out = re.sub(r'\* ', '', final_out)
        #final_out = re.sub(r'\*', '', final_out)
        final_out = re.sub(r'- ', '', final_out)
        final_out = re.sub(r'-', '', final_out)
        final_out = re.sub(r'; ; ', '; ', final_out)
        final_out = final_out[:-1]
        ptb_sent_file.write(final_out)
        ptb_sent_file.write("\n")
        #print(final_out)
        count += 1
        #if count == 10000: break
        #if count > 10: break
    ptb_sent_file.close()
    print(count)
Example 11
from nltk.corpus.reader import BracketParseCorpusReader


def parse_trees(fileid):
    reader = BracketParseCorpusReader('/home/lnn/Downloads/ctb_test', fileid)
    tree = reader.parsed_sents()
    return tree
        if "NN" == word5[1] or "NNS" == word5[1] or "NNP" == word5[
                1] or "NNPS" == word5[1]:
            word1 = taggedList[i]
            word2 = taggedList[i + 1]
            word3 = taggedList[i + 2]
            word4 = taggedList[i + 3]
            if "JJ" == word1[1] and "JJ" == word2[1] and "JJ" == word3[
                    1] and "JJ" == word4[1]:
                result.append((word1, word2, word3, word4, word5))
    return result


# Using nltk to parse and load Penn Treebank Corpus
ptb_root = "treebank_3/parsed/mrg/"
ptb_fileid = r".*\.mrg"

ptb = BracketParseCorpusReader(ptb_root, ptb_fileid)

ptb_tagged = ptb.tagged_words()

# Extract 2-seq adjectives from corpus
ptb_adj2 = get_prenominal_adj2(ptb_tagged)
#ptb_adj3 = get_prenominal_adj3(ptb_tagged)
#ptb_adj4 = get_prenominal_adj4(ptb_tagged)

# Write to file
with open('adj_ptb.txt', 'w') as fp:
    #fp.write('\n'.join('{}/{} {}/{} {}/{}'.format(x[0][0],x[0][1],x[1][0],x[1][1],x[2][0],x[2][1]) for x in ptb_adj2))
    fp.write('\n'.join('{} {}'.format(x[0][0], x[1][0]) for x in ptb_adj2))
    #fp.write('\n'.join('{}/{} {}/{} {}/{} {}/{}'.format(x[0][0],x[0][1],x[1][0],x[1][1],x[2][0],x[2][1],x[3][0],x[3][1]) for x in ptb_adj3))
    #fp.write('\n'.join('{}/{} {}/{} {}/{} {}/{} {}/{}'.format(x[0][0],x[0][1],x[1][0],x[1][1],x[2][0],x[2][1],x[3][0],x[3][1],x[4][0],x[4][1]) for x in ptb_adj4))
Example 13
from nltk.corpus import propbank, BracketParseCorpusReader
from nltk.classify.naivebayes import NaiveBayesClassifier

wsj_root = r'/Users/maxlikely/data/penn_treebank/treebank-3/parsed/mrg/wsj'
file_pattern = r".*/wsj_.*\.mrg"

my_treebank = BracketParseCorpusReader(wsj_root, file_pattern)


def get_tree(instance):
    '''Helper function for loading the gold parse of a PropBank instance.'''
    # instance.fileid looks like 'wsj_0200.mrg'; characters 4:6 give the
    # two-digit section directory, yielding e.g. '02/wsj_0200.mrg'.
    fileloc = '%s/%s' % (instance.fileid[4:6], instance.fileid)
    tree = my_treebank.parsed_sents(fileids=fileloc)[instance.sentnum]
    return tree
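
# A usage sketch (assumes the treebank files are actually present at wsj_root):
inst = propbank.instances()[0]
print(inst.roleset, get_tree(inst).leaves()[:10])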