Beispiel #1
0
def demo():
    """Demonstrate the KNB corpus reader: print sample fileids, raw words,
    parse trees, and POS-tagged sentences from the EUC-JP encoded corpus.

    Relies on module-level names (``find_corpus_fileids``,
    ``FileSystemPathPointer``, ``KNBCorpusReader``, ``re``) imported
    elsewhere in this file.
    """

    import nltk
    from nltk.corpus.util import LazyCorpusLoader

    root = nltk.data.find('corpora/knbc/corpus1')

    # Keep only fileids that match the d-d-nnnn-nnnn KNBC naming scheme.
    selected = []
    for fid in find_corpus_fileids(FileSystemPathPointer(root), ".*"):
        if re.search(r"\d\-\d\-[\d]+\-[\d]+", fid):
            selected.append(fid)

    def _sort_key(fileid):
        # Order numerically on the dash-separated id components; the first
        # component is compared as a string.
        cells = fileid.split('-')
        return (cells[0], int(cells[1]), int(cells[2]), int(cells[3]))

    knbc = LazyCorpusLoader(
        'knbc/corpus1',
        KNBCorpusReader,
        sorted(selected, key=_sort_key),
        encoding='euc-jp',
    )

    print(knbc.fileids()[:10])
    print(''.join(knbc.words()[:100]))

    print('\n\n'.join(str(tree) for tree in knbc.parsed_sents()[:2]))

    # Override morph formatting: "surface(pos)" joined by '/', EOS dropped.
    def _format_morphs(morphs):
        parts = [
            "%s(%s)" % (m[0], m[1].split(' ')[2])
            for m in morphs
            if m[0] != 'EOS'
        ]
        return '/'.join(parts).encode('utf-8')

    knbc.morphs2str = _format_morphs

    print('\n\n'.join('%s' % tree for tree in knbc.parsed_sents()[:2]))

    # First two sentences rendered as word/pos tokens, one sentence per line.
    lines = []
    for sent in knbc.tagged_sents()[0:2]:
        lines.append(' '.join("%s/%s" % (w[0], w[1].split(' ')[2]) for w in sent))
    print('\n'.join(lines))
Beispiel #2
0
 def __init__(self, root='udhr'):
     """Collect UDHR fileids under *root* (skipping README and dotfiles via
     the regexp, plus anything listed in ``self.SKIP``) and delegate to the
     base reader with the per-file encodings from ``self.ENCODINGS``."""
     all_fileids = find_corpus_fileids(root, r'(?!README|\.).*')
     kept = [f for f in all_fileids if f not in self.SKIP]
     super(UdhrCorpusReader, self).__init__(root, kept, encoding=self.ENCODINGS)
Beispiel #3
0
def demo():
    """Demonstrate the KNB corpus reader: print sample fileids, raw words,
    parse trees, and POS-tagged sentences.

    NOTE(review): depends on module-level names ``find_corpus_fileids``,
    ``FileSystemPathPointer``, ``KNBCorpusReader`` and ``re`` -- presumably
    imported elsewhere in this file; not visible in this chunk.
    """

    import nltk
    from nltk.corpus.util import LazyCorpusLoader

    # Locate the installed corpus and keep only fileids matching the
    # d-d-nnnn-nnnn KNBC naming scheme.
    root = nltk.data.find('corpora/knbc/corpus1')
    fileids = [f for f in find_corpus_fileids(FileSystemPathPointer(root), ".*")
               if re.search(r"\d\-\d\-[\d]+\-[\d]+", f)]

    def _knbc_fileids_sort(x):
        # Sort key: numeric ordering on the dash-separated id components
        # (the leading component is compared as a string).
        cells = x.split('-')
        return (cells[0], int(cells[1]), int(cells[2]), int(cells[3]))

    # The corpus text is EUC-JP encoded; loading is deferred until first use.
    knbc = LazyCorpusLoader('knbc/corpus1', KNBCorpusReader,
                            sorted(fileids, key=_knbc_fileids_sort), encoding='euc-jp')

    print(knbc.fileids()[:10])
    print(''.join(knbc.words()[:100]))

    print('\n\n'.join(str(tree) for tree in knbc.parsed_sents()[:2]))

    # Custom morph formatter: "surface(pos)" joined with '/', skipping EOS.
    # NOTE(review): the trailing .encode('utf-8') returns bytes -- looks like
    # Python 2 era code; under Python 3 this would print a bytes repr. Confirm.
    knbc.morphs2str = lambda morphs: '/'.join(
        "%s(%s)" % (m[0], m[1].split(' ')[2]) for m in morphs if m[0] != 'EOS'
    ).encode('utf-8')

    print('\n\n'.join('%s' % tree for tree in knbc.parsed_sents()[:2]))

    # Print the first two sentences as word/pos tokens, one sentence per line.
    print(
        '\n'.join(
            ' '.join("%s/%s" % (w[0], w[1].split(' ')[2]) for w in sent)
            for sent in knbc.tagged_sents()[0:2]
        )
    )
Beispiel #4
0
 def __init__(self, root='udhr'):
     """Initialize the UDHR corpus reader rooted at *root*.

     The regexp excludes README and dot-prefixed files; fileids listed
     in ``self.SKIP`` are filtered out, and per-file encodings come
     from ``self.ENCODINGS``.
     """
     fileids = find_corpus_fileids(root, r'(?!README|\.).*')
     super(UdhrCorpusReader, self).__init__(
         root,
         [fileid for fileid in fileids if fileid not in self.SKIP],
         encoding=self.ENCODINGS
     )
Beispiel #5
0
 def __init__(self, root="udhr"):
     """Set up the UDHR reader: take every fileid under *root* except
     README/dotfiles and those in ``self.SKIP``, with encodings taken
     from ``self.ENCODINGS``."""
     candidates = find_corpus_fileids(root, r"(?!README|\.).*")
     selected = [fid for fid in candidates if fid not in self.SKIP]
     super().__init__(root, selected, encoding=self.ENCODINGS)
def read_knbc(train_file, test_file, reference_file):
    """Split KNB corpus sentences into train/test/reference output files.

    Sentences 0-3999 go to *train_file*; the remainder -- except the very
    last sentence, per the original ``[4000:-1]`` slice -- go to both
    *test_file* and *reference_file*.  Depends on module-level helpers
    (``_knbc_fileids_sort``, ``write_train``, ``write_test``,
    ``write_reference``) defined elsewhere in this file.
    """
    root = nltk.data.find('corpora/knbc/corpus1')

    # Keep only fileids matching the d-d-nnnn-nnnn KNBC naming scheme.
    matching = [
        fid
        for fid in find_corpus_fileids(FileSystemPathPointer(root), ".*")
        if re.search(r"\d\-\d\-[\d]+\-[\d]+", fid)
    ]

    # The corpus text is EUC-JP encoded; loading is lazy.
    knbc = LazyCorpusLoader(
        'knbc/corpus1',
        KNBCorpusReader,
        sorted(matching, key=_knbc_fileids_sort),
        encoding='euc-jp',
    )

    sentences = knbc.sents()

    write_train(sentences[0:4000], train_file)
    write_test(sentences[4000:-1], test_file)
    write_reference(sentences[4000:-1], reference_file)
Beispiel #7
0
def ctb_clear():
    """Copy each matching CTB file from ``normal_ctb_test/`` into
    ``for_clearnlp/``, dropping the ``<S>``/``</S>`` sentence-marker lines.

    Output fileids mirror the input fileids.  Relies on module-level
    ``path``, ``home_dir``, ``FileSystemPathPointer`` and
    ``find_corpus_fileids``.
    """
    ctb_dir = path.join(home_dir, 'normal_ctb_test')
    # Select the nw/bn/mz/bc/wb genre files.
    reg = '(.*nw)*(.*bn)*(.*mz)*(.*bc)*(.*wb)*'
    # reg='.*dev'
    ctb_dir = FileSystemPathPointer(ctb_dir)
    fileids = find_corpus_fileids(root=ctb_dir, regexp=reg)
    for fid in fileids:
        # Context managers guarantee both handles are closed even if an
        # exception occurs mid-copy (the original leaked them on error).
        with open('normal_ctb_test/' + fid, mode='r') as src, \
                open('for_clearnlp/' + fid, mode='w') as dst:
            # Iterate the file directly instead of materializing readlines().
            for line in src:
                if '<S>' in line or '</S>' in line:
                    continue  # skip sentence-boundary markers
                dst.write(line)
Beispiel #8
0
def static_dp():
    """Count blank lines (sentence separators) across all ``*dep`` files in
    ``for_clearnlp/`` and print the total.

    Relies on module-level ``path``, ``home_dir``, ``FileSystemPathPointer``
    and ``find_corpus_fileids``.
    """
    ctb_dir = path.join(home_dir, 'for_clearnlp')
    # reg = '(.*nw)*(.*bn)*(.*mz)*(.*bc)*(.*wb)*'
    reg = '(.*dep)*'
    ctb_dir = FileSystemPathPointer(ctb_dir)
    fileids = find_corpus_fileids(root=ctb_dir, regexp=reg)
    ct = 0
    for fid in fileids:
        # 'with' closes the handle even if reading fails (the original could
        # leak it), and iterating the file avoids materializing readlines().
        with open('for_clearnlp/' + fid, mode='r') as fh:
            ct += sum(1 for line in fh if line == '\n')
    print(ct)
def read_knbc(train_file, test_file, reference_file):
    """Write KNB corpus sentences out as train/test/reference files.

    Sentences 0-3999 go to *train_file*; sentences from 4000 onward --
    except the very last one (note the ``[4000:-1]`` slice; possibly an
    unintended off-by-one, confirm against callers) -- go to both
    *test_file* and *reference_file*.

    NOTE(review): depends on module-level ``nltk``, ``re``,
    ``find_corpus_fileids``, ``FileSystemPathPointer``, ``KNBCorpusReader``,
    ``_knbc_fileids_sort``, ``write_train``, ``write_test`` and
    ``write_reference`` -- not visible in this chunk.
    """

    # Keep only fileids matching the d-d-nnnn-nnnn KNBC naming scheme.
    root = nltk.data.find('corpora/knbc/corpus1')
    fileids = [
        f for f in find_corpus_fileids(FileSystemPathPointer(root), ".*")
        if re.search(r"\d\-\d\-[\d]+\-[\d]+", f)
    ]

    # The corpus text is EUC-JP encoded; loading is lazy.
    knbc = LazyCorpusLoader('knbc/corpus1',
                            KNBCorpusReader,
                            sorted(fileids, key=_knbc_fileids_sort),
                            encoding='euc-jp')

    sentences = knbc.sents()

    write_train(sentences[0:4000], train_file)
    write_test(sentences[4000:-1], test_file)
    write_reference(sentences[4000:-1], reference_file)
Beispiel #10
0
def demo():
    """Demonstrate the KNB corpus reader: print sample fileids, raw words,
    parse trees, and POS-tagged sentences.

    NOTE(review): depends on module-level names ``find_corpus_fileids``,
    ``FileSystemPathPointer``, ``KNBCorpusReader`` and ``re`` -- presumably
    imported elsewhere in this file; not visible in this chunk.
    """

    import nltk
    from nltk.corpus.util import LazyCorpusLoader

    # Locate the installed corpus and keep only fileids matching the
    # d-d-nnnn-nnnn KNBC naming scheme.
    root = nltk.data.find("corpora/knbc/corpus1")
    fileids = [
        f for f in find_corpus_fileids(FileSystemPathPointer(root), ".*")
        if re.search(r"\d\-\d\-[\d]+\-[\d]+", f)
    ]

    def _knbc_fileids_sort(x):
        # Sort key: numeric ordering on the dash-separated id components
        # (the leading component is compared as a string).
        cells = x.split("-")
        return (cells[0], int(cells[1]), int(cells[2]), int(cells[3]))

    # The corpus text is EUC-JP encoded; loading is deferred until first use.
    knbc = LazyCorpusLoader(
        "knbc/corpus1",
        KNBCorpusReader,
        sorted(fileids, key=_knbc_fileids_sort),
        encoding="euc-jp",
    )

    print(knbc.fileids()[:10])
    print("".join(knbc.words()[:100]))

    print("\n\n".join(str(tree) for tree in knbc.parsed_sents()[:2]))

    # Custom morph formatter: "surface(pos)" joined with '/', skipping EOS.
    # NOTE(review): the trailing .encode("utf-8") returns bytes -- looks like
    # Python 2 era code; under Python 3 this would print a bytes repr. Confirm.
    knbc.morphs2str = lambda morphs: "/".join("{}({})".format(
        m[0], m[1].split(" ")[2]) for m in morphs
                                              if m[0] != "EOS").encode("utf-8")

    print("\n\n".join("%s" % tree for tree in knbc.parsed_sents()[:2]))

    # Print the first two sentences as word/pos tokens, one sentence per line.
    print("\n".join(" ".join("{}/{}".format(w[0], w[1].split(" ")[2])
                             for w in sent)
                    for sent in knbc.tagged_sents()[0:2]))
Beispiel #11
0
# Module-level setup: locate the CTB data directory, open the cleaned output
# file, and enumerate the fileids to process.
# NOTE(review): opening ``fc`` at import time is a module-level side effect;
# the handle is presumably closed later in code not visible in this chunk.
home_dir = path.join(path.dirname(__file__), './')
import re
# ctb_dir = '/home/lnn/Downloads/ctb_test'
ctb_dir = '/home/lnn/Downloads/ctb_paper/origin/out_paper'
# ctb_dir = '/home/lnn/Documents/ability/cranfield_testdata/upenn_transfer/normal_ctb_test'
# ctb_dir = '/home/nana/Documents/pycharmforlinux/upenn_transfer/normal_ctb_test_v1'

ctb_path = path.join(ctb_dir, 'ctb.secondtest.clean')
counts = 0
fc = open(ctb_path, mode='w', encoding='utf-8')
# reg = 'chtb_3095.bn'
# Select the nw/bc/mz/bn/wb genre files.
reg = '(.*nw)*(.*bc)*(.*mz)*(.*bn)*(.*wb)*'
# reg = '(.*nw)*(.*mz)*'
ctb_dir = FileSystemPathPointer(ctb_dir)
fileids = find_corpus_fileids(root=ctb_dir, regexp=reg)

# Bracket/markup characters treated specially downstream.
OTHER = [']', '[', ')', '(', '<', '/', '>']

def strQ2B(ustring):
    """Convert full-width (zenkaku) characters in *ustring* to half-width.

    NOTE(review): the body appears truncated in this chunk -- no
    ``return rstring`` is visible before the next snippet boundary;
    confirm against the full source.
    """
    rstring = ""
    for uchar in ustring:
        inside_code = ord(uchar)
        if inside_code == 12288:  # full-width space (U+3000) maps directly to ASCII space
            inside_code = 32
        elif (inside_code >= 65281 and inside_code <= 65374):  # full-width forms (non-space) shift down by a fixed offset
            inside_code -= 65248

        rstring += chr(inside_code)
Beispiel #12
0
def _merge_counts(total, new):
    """Accumulate the label->count mapping *new* into *total* in place."""
    for key, val in new.items():
        total[key] = total.get(key, 0) + val


def _write_tree_file(filepath, trees):
    """Write each tree to *filepath*, wrapped in <S>...</S> markers."""
    # 'with' guarantees the handle is closed even if a write fails.
    with open(filepath, mode='w') as f:
        for tree in trees:
            f.write('<S>\n')
            f.write('( {})\n'.format(str(tree)))
            f.write('</S>\n')


def _write_count_file(filepath, counts):
    """Write 'label count' lines for each entry of *counts* to *filepath*."""
    with open(filepath, mode='w') as f:
        for key, val in counts.items():
            f.write('{} {}\n'.format(key, val))


def rules(normal_save_dir, mmbroken_dir, other_broken_dir, phrases_dir,
          value_error_dir):
    """Run ``analysis_v2`` over every matching CTB fileid and bucket the
    resulting trees by status.

    Per-fileid tree files go to *normal_save_dir*, *mmbroken_dir*,
    *other_broken_dir* and *value_error_dir*; aggregated counts go to
    ``<phrases_dir>/broken_phrases.txt`` and ``<mmbroken_dir>/mmbrokens.txt``.
    Finally prints ``[normal, other-broken, value-error, mm-broken]`` totals.

    Relies on module-level ``FileSystemPathPointer``, ``find_corpus_fileids``
    and ``analysis_v2``.
    """
    ctb_dir = '/home/lnn/Downloads/ctb_paper/origin/all_data'
    # Select the nw/bn/mz/bc/wb genre files.
    reg = '(.*nw)*(.*bn)*(.*mz)*(.*bc)*(.*wb)*'
    ctb_dir = FileSystemPathPointer(ctb_dir)
    fileids = find_corpus_fileids(root=ctb_dir, regexp=reg)
    statis = [0, 0, 0, 0]  # [normal, other-broken, value-error, mm-broken]
    sum_broken_phrases = {}
    sum_mmbrokens = {}
    for fid in fileids:
        print(fid)
        (normal_trees, mmbrokens, mmbroken_trees, other_brokens,
         broken_phrases, value_error, mmtext) = analysis_v2(ctb_dir, fid)
        statis[0] += len(normal_trees)
        statis[1] += len(other_brokens)
        statis[2] += len(value_error)
        statis[3] += len(mmbroken_trees)
        _merge_counts(sum_broken_phrases, broken_phrases)
        _merge_counts(sum_mmbrokens, mmbrokens)
        if len(value_error) > 0:
            _write_tree_file(value_error_dir + '/' + fid, value_error)
        if len(normal_trees) > 0:
            _write_tree_file(normal_save_dir + '/' + fid, normal_trees)
        if len(mmbroken_trees) > 0:
            _write_tree_file(mmbroken_dir + '/' + fid, mmbroken_trees)
        if len(other_brokens) > 0:
            _write_tree_file(other_broken_dir + '/' + fid, other_brokens)

    if len(sum_broken_phrases) > 0:
        _write_count_file(phrases_dir + '/broken_phrases.txt',
                          sum_broken_phrases)
    if len(sum_mmbrokens) > 0:
        _write_count_file(mmbroken_dir + '/mmbrokens.txt', sum_mmbrokens)

    print(statis)