def __init__(self, root="udhr"): fileids = find_corpus_fileids(root, r"(?!README|\.).*") super().__init__( root, [fileid for fileid in fileids if fileid not in self.SKIP], encoding=self.ENCODINGS, )
def ctb_clear():
    ctb_dir = path.join(home_dir, 'normal_ctb_test')
    # Match files from any CTB genre: newswire (nw), broadcast news (bn),
    # magazine (mz), broadcast conversation (bc), and weblog (wb).
    reg = '(.*nw)*(.*bn)*(.*mz)*(.*bc)*(.*wb)*'
    # reg = '.*dev'
    ctb_dir = FileSystemPathPointer(ctb_dir)
    fileids = find_corpus_fileids(root=ctb_dir, regexp=reg)
    for fid in fileids:
        # Copy each file, dropping the <S>/</S> sentence markers.
        with open('normal_ctb_test/' + fid, mode='r') as f1, \
                open('for_clearnlp/' + fid, mode='w') as f2:
            for line in f1:
                if line.find('<S>') >= 0 or line.find('</S>') >= 0:
                    continue
                f2.write(line)
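# A quick sanity check of the genre pattern (the fileids below are
# hypothetical stand-ins). find_corpus_fileids anchors the regexp at both
# ends in current NLTK, so the starred groups must consume the whole name;
# in effect only names ending in one of the genre suffixes survive.
import re

pattern = re.compile(r'(.*nw)*(.*bn)*(.*mz)*(.*bc)*(.*wb)*$')
for name in ['chtb_0001.nw', 'chtb_1119.bn', 'chtb_3095.dep', 'README']:
    print(name, bool(pattern.match(name)))  # True, True, False, False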
def static_dp():
    ctb_dir = path.join(home_dir, 'for_clearnlp')
    # reg = '(.*nw)*(.*bn)*(.*mz)*(.*bc)*(.*wb)*'
    reg = '(.*dep)*'
    ctb_dir = FileSystemPathPointer(ctb_dir)
    fileids = find_corpus_fileids(root=ctb_dir, regexp=reg)
    ct = 0
    for fid in fileids:
        # Count blank lines; CoNLL-style dependency files typically use one
        # per sentence boundary, so this tallies sentences.
        with open('for_clearnlp/' + fid, mode='r') as f2:
            for line in f2:
                if line == '\n':
                    ct += 1
    print(ct)
def read_knbc(train_file, test_file, reference_file):
    root = nltk.data.find('corpora/knbc/corpus1')
    # Keep only the per-sentence files, identified by the numeric
    # digit-digit-digits-digits pattern in their names.
    fileids = [
        f
        for f in find_corpus_fileids(FileSystemPathPointer(root), ".*")
        if re.search(r"\d-\d-\d+-\d+", f)
    ]
    knbc = LazyCorpusLoader(
        'knbc/corpus1',
        KNBCorpusReader,
        sorted(fileids, key=_knbc_fileids_sort),
        encoding='euc-jp',
    )
    sentences = knbc.sents()
    # First 4000 sentences for training; the rest for test and reference
    # (note that the [4000:-1] slice drops the final sentence).
    write_train(sentences[0:4000], train_file)
    write_test(sentences[4000:-1], test_file)
    write_reference(sentences[4000:-1], reference_file)
def demo():
    import nltk
    from nltk.corpus.util import LazyCorpusLoader

    root = nltk.data.find("corpora/knbc/corpus1")
    # Keep only the per-sentence files, identified by the numeric pattern
    # in their names.
    fileids = [
        f
        for f in find_corpus_fileids(FileSystemPathPointer(root), ".*")
        if re.search(r"\d-\d-\d+-\d+", f)
    ]

    def _knbc_fileids_sort(x):
        # Sort on the numeric components so e.g. "10" follows "2".
        cells = x.split("-")
        return (cells[0], int(cells[1]), int(cells[2]), int(cells[3]))

    knbc = LazyCorpusLoader(
        "knbc/corpus1",
        KNBCorpusReader,
        sorted(fileids, key=_knbc_fileids_sort),
        encoding="euc-jp",
    )

    print(knbc.fileids()[:10])
    print("".join(knbc.words()[:100]))
    print("\n\n".join(str(tree) for tree in knbc.parsed_sents()[:2]))

    # Override morphs2str so trees render each morpheme as surface(POS).
    knbc.morphs2str = lambda morphs: "/".join(
        "{}({})".format(m[0], m[1].split(" ")[2])
        for m in morphs
        if m[0] != "EOS"
    ).encode("utf-8")

    print("\n\n".join("%s" % tree for tree in knbc.parsed_sents()[:2]))

    print(
        "\n".join(
            " ".join("{}/{}".format(w[0], w[1].split(" ")[2]) for w in sent)
            for sent in knbc.tagged_sents()[0:2]
        )
    )
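# Sketch of why the custom sort key matters (the fileid shapes here are
# hypothetical stand-ins): a plain string sort puts "10" before "2", while
# splitting on "-" and converting to int sorts numerically.
ids = ["A-1-10-2", "A-1-2-1"]
print(sorted(ids))  # ['A-1-10-2', 'A-1-2-1'] -- lexicographic
print(sorted(ids, key=lambda x: (x.split("-")[0],
                                 *[int(c) for c in x.split("-")[1:]])))
# ['A-1-2-1', 'A-1-10-2'] -- numeric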
import re

home_dir = path.join(path.dirname(__file__), './')

# ctb_dir = '/home/lnn/Downloads/ctb_test'
ctb_dir = '/home/lnn/Downloads/ctb_paper/origin/out_paper'
# ctb_dir = '/home/lnn/Documents/ability/cranfield_testdata/upenn_transfer/normal_ctb_test'
# ctb_dir = '/home/nana/Documents/pycharmforlinux/upenn_transfer/normal_ctb_test_v1'
ctb_path = path.join(ctb_dir, 'ctb.secondtest.clean')
counts = 0
fc = open(ctb_path, mode='w', encoding='utf-8')

# reg = 'chtb_3095.bn'
reg = '(.*nw)*(.*bc)*(.*mz)*(.*bn)*(.*wb)*'
# reg = '(.*nw)*(.*mz)*'
ctb_dir = FileSystemPathPointer(ctb_dir)
fileids = find_corpus_fileids(root=ctb_dir, regexp=reg)

OTHER = [']', '[', ')', '(', '<', '/', '>']


def strQ2B(ustring):
    """Convert full-width characters to half-width."""
    rstring = ""
    for uchar in ustring:
        inside_code = ord(uchar)
        if inside_code == 12288:
            # The full-width space (U+3000) maps directly to an ASCII space.
            inside_code = 32
        elif 65281 <= inside_code <= 65374:
            # Full-width forms (U+FF01..U+FF5E) map to ASCII by a fixed offset.
            inside_code -= 65248
        rstring += chr(inside_code)
    return rstring
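# Worked example for strQ2B: full-width letters, digits, punctuation, and
# the ideographic space all collapse to their ASCII counterparts.
print(strQ2B('ＡＢＣ１２３，　ｈｅｌｌｏ'))  # -> 'ABC123, hello'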
def rules(normal_save_dir, mmbroken_dir, other_broken_dir, phrases_dir, value_error_dir):
    ctb_dir = '/home/lnn/Downloads/ctb_paper/origin/all_data'
    # ctb_dir = '/home/lnn/Downloads/ctb_bracket'
    # ctb_dir = home_dir
    # ctb_dir = path.join(home_dir, 'ctb_test')
    # reg = 'chtb_0040.nw'
    reg = '(.*nw)*(.*bn)*(.*mz)*(.*bc)*(.*wb)*'
    ctb_dir = FileSystemPathPointer(ctb_dir)
    fileids = find_corpus_fileids(root=ctb_dir, regexp=reg)
    # Tallies: normal, other-broken, value-error, and mm-broken trees.
    statis = [0, 0, 0, 0]
    sum_broken_phrases = {}
    sum_mmbrokens = {}
    for fid in fileids:
        print(fid)
        normal_trees, mmbrokens, mmbroken_trees, other_brokens, broken_phrases, value_error, mmtext = analysis_v2(
            ctb_dir, fid)
        # break
        statis[0] += len(normal_trees)
        statis[1] += len(other_brokens)
        statis[2] += len(value_error)
        statis[3] += len(mmbroken_trees)
        # f = open('mmtext.txt', mode='a')
        # f.write('{}: \n'.format(fid))
        # for line in mmtext:
        #     f.write(' '.join(mm_out(line[0])) + '\n')
        #     f.write(' '.join(mm_out(line[1])) + '\n')
        #     f.write(' '.join(mm_out(line[2])) + '\n')
        #     f.write(' '.join(mm_out(line[3])) + '\n')
        #     f.write('\n')
        # f.write('\n\n')
        # f.close()
        # Merge this file's counts into the running totals.
        for k, v in broken_phrases.items():
            sum_broken_phrases[k] = sum_broken_phrases.get(k, 0) + v
        for k, v in mmbrokens.items():
            sum_mmbrokens[k] = sum_mmbrokens.get(k, 0) + v
        # Write each category of tree back out in the <S>...</S> format.
        if len(value_error) > 0:
            f = open(value_error_dir + '/' + fid, mode='w')
            for i in value_error:
                f.write('<S>\n')
                f.write('( {})\n'.format(i))
                f.write('</S>\n')
            f.close()
        if len(normal_trees) > 0:
            f = open(normal_save_dir + '/' + fid, mode='w')
            for i in normal_trees:
                f.write('<S>\n')
                f.write('( {})\n'.format(i))
                f.write('</S>\n')
            f.close()
        if len(mmbroken_trees) > 0:
            f = open(mmbroken_dir + '/' + fid, mode='w')
            for i in mmbroken_trees:
                f.write('<S>\n')
                f.write('( {})\n'.format(i))
                f.write('</S>\n')
            f.close()
        if len(other_brokens) > 0:
            f = open(other_broken_dir + '/' + fid, mode='w')
            for i in other_brokens:
                f.write('<S>\n')
                f.write('( {})\n'.format(i))
                f.write('</S>\n')
            f.close()
    if len(sum_broken_phrases) > 0:
        f = open(phrases_dir + '/broken_phrases.txt', mode='w')
        for k, v in sum_broken_phrases.items():
            f.write('{} {}\n'.format(k, v))
        f.close()
    if len(sum_mmbrokens) > 0:
        f = open(mmbroken_dir + '/mmbrokens.txt', mode='w')
        for k, v in sum_mmbrokens.items():
            f.write('{} {}\n'.format(k, v))
        f.close()
    print(statis)
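# The two accumulation loops above are the classic dict-merge idiom;
# collections.Counter expresses the same merge more compactly. A minimal
# sketch with stand-in per-file counts (not part of the original script):
from collections import Counter

totals = Counter()
for per_file in ({'NP': 2}, {'NP': 1, 'VP': 3}):
    totals.update(per_file)
print(dict(totals))  # {'NP': 3, 'VP': 3}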