def demo():
    import re

    import nltk
    from nltk.corpus.reader.knbc import KNBCorpusReader
    from nltk.corpus.reader.util import find_corpus_fileids
    from nltk.corpus.util import LazyCorpusLoader
    from nltk.data import FileSystemPathPointer

    # Locate the corpus under nltk_data and keep only the fileids that
    # follow the KNBC digit-dash numbering pattern.
    root = nltk.data.find('corpora/knbc/corpus1')
    fileids = [f for f in find_corpus_fileids(FileSystemPathPointer(root), ".*")
               if re.search(r"\d\-\d\-[\d]+\-[\d]+", f)]

    def _knbc_fileids_sort(x):
        cells = x.split('-')
        return (cells[0], int(cells[1]), int(cells[2]), int(cells[3]))

    knbc = LazyCorpusLoader('knbc/corpus1', KNBCorpusReader,
                            sorted(fileids, key=_knbc_fileids_sort),
                            encoding='euc-jp')

    print(knbc.fileids()[:10])
    print(''.join(knbc.words()[:100]))

    print('\n\n'.join('%s' % tree for tree in knbc.parsed_sents()[:2]))

    # Override the reader's morpheme formatter so tree leaves show
    # surface(POS) pairs. (A trailing .encode('utf-8') from the Python 2
    # original is dropped here: under Python 3 it made this return bytes.)
    knbc.morphs2str = lambda morphs: '/'.join(
        "%s(%s)" % (m[0], m[1].split(' ')[2]) for m in morphs if m[0] != 'EOS')

    print('\n\n'.join('%s' % tree for tree in knbc.parsed_sents()[:2]))

    print('\n'.join(
        ' '.join("%s/%s" % (w[0], w[1].split(' ')[2]) for w in sent)
        for sent in knbc.tagged_sents()[0:2]))
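# A minimal runner for the demo above (assumes the KNBC corpus is already
# unpacked under nltk_data/corpora/knbc; adjust the path if yours differs):
if __name__ == '__main__':
    demo()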
def parse_wsj(processes=8):
    from multiprocessing import Pool

    from nltk.corpus.reader import CategorizedBracketParseCorpusReader
    from nltk.corpus.util import LazyCorpusLoader

    # Penn Treebank v3: WSJ portions
    ptb = LazyCorpusLoader(
        'ptb', CategorizedBracketParseCorpusReader,
        r'wsj/\d\d/wsj_\d\d\d\d.mrg', cat_file='allcats.txt', tagset='wsj')

    # Pair each gold parse with its tagged sentence, keeping the fileid and
    # sentence index so the work items can be ordered deterministically.
    fileids = ptb.fileids()
    params = []
    for f in fileids:
        corpus = zip(ptb.parsed_sents(f), ptb.tagged_sents(f))
        for i, (parsed, tagged) in enumerate(corpus):
            params.append((f, i, parsed, tagged))

    p = Pool(processes)
    p.starmap(get_best_parse, sorted(params, key=lambda x: (x[0], x[1])))
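# parse_wsj() fans its work items out to get_best_parse, which is not
# defined in the snippet above. A placeholder with the signature that
# Pool.starmap expects might look like this; the body is an assumption,
# not the original implementation:
def get_best_parse(fileid, index, parsed, tagged):
    # `parsed` is the gold-standard Tree and `tagged` its (word, tag) pairs;
    # a real implementation would presumably parse the tagged tokens and
    # score the result against the gold tree.
    print(fileid, index, len(tagged))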
def treebank_chunk_tagger_demo():
    from nltk.corpus.util import LazyCorpusLoader
    from nltk.corpus.reader import PlaintextCorpusReader
    from nltk_contrib.coref.util import TreebankChunkTaggerCorpusReader

    # Wrap the plaintext State of the Union corpus with the chunking reader.
    state_union = LazyCorpusLoader(
        'state_union', PlaintextCorpusReader, r'(?!\.svn).*\.txt')
    state_union = TreebankChunkTaggerCorpusReader(state_union)

    print('Treebank chunker demo...')
    print('Chunked sentences:')
    for sent in state_union.chunked_sents()[500:505]:
        print(sent)
        print()
    print()
    print('Parsed sentences:')
    for tree in state_union.parsed_sents()[500:505]:
        print(tree)
        print()
    print()
if __name__ == '__main__':
    fileids = icepahc.fileids()  # leave uncommented for whole corpus use
    # fileids = ['1350.bandamennM.nar-sag.psd']  # For debug use only
    c = Converter()  # Creates instance of Converter class
    total_sents = 0
    file_num = 1
    # f = open('homilia_conllu.conllu', 'w+')

    '''
    Prints the dependency graph data in CoNLL-U format
    '''
    for fileid in fileids:
        error_num = 0
        start = time.time()
        file_sents = 0
        # print('\nProcessing file: {0}...'.format(fileid))
        for tree in icepahc.parsed_sents(fileid):
            treeID = fileid + '_' + str(file_sents + 1) + '_' + str(total_sents + 1)
            try:
                dep = c.create_dependency_graph(str(tree))
                # dep_c = dep.to_conllU()
                # print(dep_c)
                # f.write('# sent_id =')
                # f.write(treeID)
                # f.write('\n')
                # print('# sent_id =', treeID)
                # f.write(dep.to_conllU())
                # print(dep.to_conllU())
            except Exception:  # bare except narrowed to Exception
                error_num += 1
            file_sents += 1
            total_sents += 1
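# The fragment above uses an `icepahc` loader, a Converter class, and the
# time module without defining or importing them. Assuming the loader is
# built like the one in the get_tree.py snippet at the end of this section
# (the data path and the Converter import path are guesses that will vary
# per project layout), the missing setup would look roughly like:
import time

from nltk.corpus.util import LazyCorpusLoader
from lib.reader import IcePaHCFormatReader  # project-local reader
from lib.converter import Converter         # hypothetical import path

icepahc = LazyCorpusLoader(
    'icecorpus/psd/', IcePaHCFormatReader, r'.*\.psd',
    cat_pattern=r'.*(nar|rel|sci|bio|law)\-.*')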
#! /usr/bin/python
# -*- coding: utf-8 -*-

import re

import nltk
import util
from knbc import *
from nltk.corpus.reader.util import find_corpus_fileids
from nltk.corpus.util import LazyCorpusLoader
from nltk.data import FileSystemPathPointer

root = nltk.data.find('corpora/KNBC_v1.0_090925/corpus1')
fileids = [f for f in find_corpus_fileids(FileSystemPathPointer(root), ".*")
           if re.search(r"\d\-\d\-[\d]+\-[\d]+", f)]


def _knbc_fileids_sort(x):
    cells = x.split('-')
    return (cells[0], int(cells[1]), int(cells[2]), int(cells[3]))


knbc = LazyCorpusLoader('KNBC_v1.0_090925/corpus1', KNBCorpusReader,
                        sorted(fileids, key=_knbc_fileids_sort),
                        encoding='euc-jp')

# print(knbc.fileids())
# print('\n'.join(''.join(sent) for sent in knbc.words()))

print('\n\n'.join('%s' % tree for tree in knbc.parsed_sents()[0:2]))
print(type(knbc.parsed_sents()[0]))

# print('\n'.join(' '.join("%s/%s" % (w[0], w[1][2]) for w in sent)
#                 for sent in knbc.tagged_words()[0:20]))
#!/usr/bin/env python
# encoding: utf-8
# Sample script that reads the KNBC corpus with NLTK

import re

import nltk
from nltk_jp import *
from nltk.corpus.reader import *
from nltk.corpus.reader.util import find_corpus_fileids
from nltk.corpus.util import LazyCorpusLoader
from nltk.data import FileSystemPathPointer


def _knbc_fileids_sort(x):
    cells = x.split('-')
    return (cells[0], int(cells[1]), int(cells[2]), int(cells[3]))


# Load the corpus
root = nltk.data.find('corpora/knbc/corpus1')
fileids = [f for f in find_corpus_fileids(FileSystemPathPointer(root), ".*")
           if re.search(r"\d\-\d\-[\d]+\-[\d]+", f)]
knbc = LazyCorpusLoader('knbc/corpus1', KNBCorpusReader,
                        sorted(fileids, key=_knbc_fileids_sort),
                        encoding='euc-jp')

# print("fileids :", knbc.fileids())
print("words :", pp(knbc.words()[:10]))
print("parsed_sents :", str(knbc.parsed_sents()[0]))
print("tagged_words :", pp(knbc.tagged_words()[:5]))
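# nltk_jp is a helper module from a Japanese NLTK tutorial and supplies the
# pp() pretty-printer used above. If it is unavailable, a rough stand-in
# (assumption: pp only joins items into a display string) could be:
def pp(obj):
    return ' '.join(str(x) for x in obj)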
def main():
    IcePaHC_DIR = '../psd/corpora/icepahc-v0.9/psd'
    FIXED_IcePaHC_DIR = '../psd/corpora/icepahc-v0.9/psd_fix'

    fix_annotation_errors(IcePaHC_DIR, FIXED_IcePaHC_DIR)
    run_pre(FIXED_IcePaHC_DIR)

    path.extend(['..'])
    ICEPAHC = LazyCorpusLoader(
        'icepahc-v0.9/psd_fix/', IcePaHCFormatReader,
        r'.*\.psd', cat_pattern=r'.*(nar|rel|sci|bio|law)\-.*'
    )
    fileids = ICEPAHC.fileids()  # leave uncommented for whole corpus use
    # fileids = ['1150.homiliubok.rel-ser.psd']  # For debug use only
    # fileids = ['2008.mamma.nar-fic.psd', '2008.ofsi.nar-sag.psd']  # For debug use only

    # Instance of Converter class
    c = Converter(auto_tags='corpus')
    # c = Converter()
    total_sents = 0
    file_num = 1

    # OUTPUT_DIR = '../testing/CoNLLU_output/'
    OUTPUT_DIR = '../IcePaHC-CoNLLU/'
    if not os.path.isdir(OUTPUT_DIR):
        os.mkdir(OUTPUT_DIR)

    # f = open('ofsi_conllu.conllu', 'w+')

    '''
    Prints the dependency graph data in CoNLL-U format
    '''
    for fileid in fileids:
        OUT_FILE_NAME = re.sub(r'\.psd', '.conllu', fileid)
        OUT_FILE_PATH = OUTPUT_DIR + OUT_FILE_NAME
        OUT_FILE = open(OUT_FILE_PATH, 'w+')
        # file_length = len(ICEPAHC.parsed_sents(fileid))
        error_num = 0
        start = time.time()
        file_sents = 0
        # print('\nProcessing file: {0}...'.format(fileid))
        tree_counter = 0
        tag_dict = tagged_corpus(ICEPAHC.parsed_sents(fileid))
        c.set_tag_dict(tag_dict)
        to_join = []
        try:
            for tree in ICEPAHC.parsed_sents(fileid):
                # Catch error in corpus where punctuation tokens are missing
                tree = fix_IcePaHC_tree_errors(tree)
                # UniversalDependencyGraph object created from tree
                dep = c.create_dependency_graph(tree)
                # Sentences split between clauses are joined together and the
                # output written to file
                if dep.get_by_address(len(dep.nodes) - 1)['word'] \
                        not in {'.', ':', '?', '!', 'kafli', 'kapítuli'} \
                        and len(dep.nodes) != 1:
                    to_join.append(dep)
                elif len(to_join) == 0:
                    # Write out dependency graphs that don't need to be joined
                    dep_c = c.add_space_after(dep).to_conllU()
                    sent_id = (re.sub(r'\.psd', '', fileid).upper() + ','
                               + str(file_sents + 1) + '.' + str(total_sents + 1))
                    sent_id_line = '# sent_id = ' + sent_id + '\n'
                    text_line = dep.plain_text() + '\n'
                    # icepahc_id_line = str(dep.original_ID_plain_text(corpus_name='IcePaHC')) + '\n'
                    icepahc_id_line = str(dep.original_ID_plain_text()) + '\n'
                    OUT_FILE.write(sent_id_line)
                    OUT_FILE.write(icepahc_id_line)
                    OUT_FILE.write(text_line)
                    OUT_FILE.write(dep_c)
                    file_sents += 1
                    total_sents += 1
                else:
                    # Write out joined dependency graphs
                    to_join.append(dep)
                    dep = c.add_space_after(c.join_graphs(to_join))
                    dep_c = dep.to_conllU()
                    sent_id = (re.sub(r'\.psd', '', fileid).upper() + ','
                               + str(file_sents + 1) + '.' + str(total_sents + 1))
                    sent_id_line = '# sent_id = ' + sent_id + '\n'
                    text_line = dep.plain_text() + '\n'
                    # icepahc_id_line = str(dep.original_ID_plain_text(corpus_name='IcePaHC')) + '\n'
                    icepahc_id_line = str(dep.original_ID_plain_text()) + '\n'
                    OUT_FILE.write(sent_id_line)
                    OUT_FILE.write(icepahc_id_line)
                    OUT_FILE.write(text_line)
                    OUT_FILE.write(dep_c)
                    file_sents += 1
                    total_sents += 1
                    to_join = []
                tree_counter += 1
        except Exception as ex:
            print('ERROR', '# sent_id =', sent_id)
            print(tree.corpus_id)
            print(tree)
            print('Failure - {0}. Arguments:\n{1!r}'.format(type(ex).__name__, ex.args))
            raise  # NOTE: while this re-raise is active, the line below never runs
            error_num += 1
        run_post_file(OUT_FILE_PATH)
        end = time.time()
        duration = '%.2f' % float(end - start)
        # if error_num > 0:
        print('\t'.join([str(i) for i in [file_num, fileid, tree_counter,
                                          file_sents, error_num,
                                          str(duration) + ' sec']]))
        file_num += 1
Usage:
    python3 get_tree.py 2008.OFSI.NAR-SAG,.13

Prints tree .13 from the 2008.ofsi.nar-sag.psd file.
"""

import os
import re
from sys import argv

import nltk  # requires nltk >= 3.4.5
from nltk.corpus.util import LazyCorpusLoader
from nltk.data import path

from lib.reader import IcePaHCFormatReader

path.extend(['../testing/'])

ICEPAHC = LazyCorpusLoader(
    'icecorpus/psd/', IcePaHCFormatReader,
    r'.*\.psd', cat_pattern=r'.*(nar|rel|sci|bio|law)\-.*'
)

INPUT_ID = argv[1]
file_id = INPUT_ID.split(',')[0].lower() + '.psd'
tree_num = INPUT_ID.split(',')[1]

for tree in ICEPAHC.parsed_sents(file_id):
    if tree.corpus_id_num == tree_num:
        print(tree)
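# The loop above prints nothing when no tree carries the requested number.
# A variant that reports a miss (assuming corpus_id_num values are unique
# per file, so the search can stop at the first hit):
matched = False
for tree in ICEPAHC.parsed_sents(file_id):
    if tree.corpus_id_num == tree_num:
        print(tree)
        matched = True
        break
if not matched:
    print('No tree numbered {0} found in {1}'.format(tree_num, file_id))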