コード例 #1
def LoadFile(filepath, corpusname):
    """Insert every word of a treebank XML file into the ``corpus`` table.

    The file is parsed with ``ET``; for each ``word`` element found under
    each ``sentence`` child of the root, the ``form`` and ``lemma``
    attributes are upper-cased and passed through the Beta Code converter,
    and one row is inserted per word.

    NOTE(review): this relies on a cursor named ``cur`` defined outside this
    function (presumably a DB-API cursor using ``?`` placeholders -- confirm).
    Nothing here commits the transaction; that is left to the caller.

    :param filepath: path of the XML file to parse.
    :param corpusname: value stored in the ``corpusname`` column of each row.
    """
    tree = ET.parse(filepath)
    root = tree.getroot()
    c = t.beta2unicodeTrie()
    # The statement text is constant, so build it once outside the loops.
    sqlcmd = (
        'insert into corpus '
        '(sentenceid,wordid,form,lemma,postag,head,relation,corpusname) '
        'values (?,?,?,?,?,?,?,?)'
    )
    for sentence in root.findall('sentence'):
        sentence_id = sentence.get("id")
        for word in sentence.findall('word'):
            # convert() returns a pair; only the converted text is stored.
            uform, _ = c.convert(word.get('form').upper())
            # The last character of the lemma is dropped before conversion
            # (presumably a trailing homograph index -- TODO confirm).
            ulemma, _ = c.convert(word.get('lemma')[:-1].upper())
            cur.execute(sqlcmd, (
                sentence_id,
                word.get('id'),
                uform,
                ulemma,
                word.get('postag'),
                word.get('head'),
                word.get('relation'),
                corpusname,
            ))
コード例 #2
ファイル: parse_corpus.py プロジェクト: ramatevish/greekify
def convert_to_unicode(text):
    """Convert a string of Beta Code to Unicode, token by token.

    See http://en.wikipedia.org/wiki/Beta_code for the encoding. The text is
    split on single spaces, each token is upper-cased and passed to the
    converter, and the converted tokens are joined back with single spaces.

    :param text: Beta Code string whose tokens are separated by single spaces.
    :return: the converted tokens joined by single spaces.
    """
    # One converter object is shared by all tokens.
    converter = beta2unicode.beta2unicodeTrie()
    converted_tokens = []
    for word in text.split(' '):
        # convert() returns a pair; only the converted text is kept.
        unicode_, _ = converter.convert(word.upper())
        converted_tokens.append(unicode_)
    # Joining once replaces the repeated "+=" and the trailing-space strip.
    return u" ".join(converted_tokens)
コード例 #3
def CleanFile(filepath,unicode_file_name):
    """Convert the forms and lemmata of a treebank XML file from Beta Code.

    Walks every ``word`` element under each ``sentence`` child of the root
    and, unless its ``relation`` attribute is ``AuxK`` or ``AuxX``, passes
    the upper-cased ``form`` and ``lemma`` through the converter.

    NOTE(review): in the code visible here the converted values are not
    stored or written anywhere, and ``unicode_file_name`` is never used --
    the function looks truncated; confirm against the full source.

    :param filepath: path of the XML file to parse.
    :param unicode_file_name: not used by the visible code.
    """
    root = ET.parse(filepath).getroot()
    converter = t.beta2unicodeTrie()
    punctuation = [u'.', u',', u';', u':']  # not referenced in the visible code
    cleaned = u""  # not referenced in the visible code
    skipped_relations = ('AuxK', 'AuxX')
    for sentence in root.findall('sentence'):
        for word in sentence.findall('word'):
            form = word.get('form')
            lemma = word.get('lemma')
            if word.get('relation') in skipped_relations:
                continue
            # convert() returns a pair; the second element is discarded.
            uform, _ = converter.convert(form.upper())
            ulemma, _ = converter.convert(lemma.upper())
コード例 #4
# Take the data from the Perseus Hopper Treebank XML file of the Iliad
# and turn it into something that NLTK can parse.

import xml.etree.cElementTree as ET
import beta2unicode as t
# Build `output`: one line per sentence of space-separated "token/tag"
# pairs, where the token is the converted word form.
tree = ET.parse("Iliad.xml")
root = tree.getroot()
c = t.beta2unicodeTrie()
punct = [u'.',u',',u';',u':']
sentences = root.findall('sentence')
output = u""
for sentence in sentences:
    words = sentence.findall('word')
    for word in words:
        form = word.get('form')
        lemma = word.get('lemma')  # fetched but not converted
        # convert() returns a pair; only the converted text `a` is used.
        a, b = c.convert(form.upper())
        if a in punct:
            # Punctuation is tagged with itself.
            output += u"%s/%s " % (a,a)
        else:
            # Every other token is tagged with its `relation` attribute.
            output += u"%s/%s " % (a,word.get('relation'))
    # End each sentence with a CRLF line break.
    output += "\r\n"
		# I was originally going to convert the lemmata to Unicode as well,
		# but I don't really have the patience to futz around with it.
		#a, b = c.convert(lemma.upper())
コード例 #5

if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description="Convert the CATSS LXXM text to unicode")
    subs = parser.add_subparsers(dest='command')
    # Download
    parser_dl = subs.add_parser("download", help="Download the files")
    # Patch
    parser_diff = subs.add_parser("patch", help="Apply corrections")
    # Convert
    parser_conv = subs.add_parser("convert",
                                  help="Convert from betacode to unicode")
    # Rename
    parser_ren = subs.add_parser("rename", help="Rename files")
    # All
    parser_all = subs.add_parser("all", help="Complete all actions")
    args = parser.parse_args()

    if args.command == "download" or args.command == "all":
    # Apply corrections so unicode conversion will work
    if args.command == "patch" or args.command == "all":
        subprocess.call("patch -p1 < lxxm-corrections.patch", shell=True)
    if args.command == "convert" or args.command == "all":
        t = beta2unicode.beta2unicodeTrie()
        for text in texts:
            convert_file(text, t)
    if args.command == "rename" or args.command == "all":