def LoadFile(filepath, corpusname):
	"""Load the words of a treebank XML file into the ``corpus`` table.

	Parses ``filepath`` and visits every ``word`` element inside each
	``sentence`` element that is a direct child of the document root. The
	word's ``form`` and ``lemma`` attributes are upper-cased and run through
	the beta2unicode trie converter; one row per word is inserted, and a
	single commit is issued after all rows have been sent.

	Relies on the module-level names ``ET``, ``t``, ``cur`` and ``conn``
	(presumably a DB-API cursor and connection; they are defined outside
	this function).

	:param filepath: path of the XML file to parse.
	:param corpusname: value stored in the ``corpusname`` column of every row.
	"""
	tree = ET.parse(filepath)
	root = tree.getroot()
	c = t.beta2unicodeTrie()

	# The statement text never changes, so build it once instead of per word.
	sqlcmd = ''' insert into corpus (sentenceid,wordid,form,lemma,postag,head,relation,corpusname) 
								values (?,?,?,?,?,?,?,?)
				'''

	for sentence in root.findall('sentence'):
		sentence_id = sentence.get("id")
		for word in sentence.findall('word'):
			form = word.get('form')
			lemma = word.get('lemma')

			# convert() returns a pair; only its first item is stored.
			uform, _ = c.convert(form.upper())
			# The last character of the lemma is dropped before conversion.
			# NOTE(review): presumably a trailing sense/homonym marker - confirm.
			ulemma, _ = c.convert(lemma[:-1].upper())

			row = (
				sentence_id,
				word.get('id'),
				uform,
				ulemma,
				word.get('postag'),
				word.get('head'),
				word.get('relation'),
				corpusname,
			)
			cur.execute(sqlcmd, row)

	conn.commit()
# Example #2
# 0
def convert_to_unicode(text):
    '''
    Convert a string of Beta Code (see http://en.wikipedia.org/wiki/Beta_code)
    to Unicode, one token at a time.

    The text is split on single spaces, each token is upper-cased and passed
    to a beta2unicode trie converter, and the converted tokens are joined
    back together with single spaces.

    :param text: the Beta Code string to convert
    :return: the converted string
    '''
    converter = beta2unicode.beta2unicodeTrie()

    # convert() returns a pair; only its first item (the converted text) is kept.
    pairs = (converter.convert(word.upper()) for word in text.split(' '))

    # A single join replaces repeated "+=" followed by trimming the trailing space.
    return u" ".join([unicode_ for unicode_, _ in pairs])
def CleanFile(filepath,unicode_file_name):
	"""Rewrite a treebank XML file with converted ``form``/``lemma`` values.

	Parses ``filepath`` and, for every ``word`` element inside each
	``sentence`` element that is a direct child of the root, replaces the
	``form`` and ``lemma`` attributes with the output of the beta2unicode
	trie converter applied to their upper-cased values. Words whose
	``relation`` is ``AuxK`` or ``AuxX`` are left untouched. The modified
	tree is written to ``unicode_file_name`` as UTF-8.

	Relies on the module-level names ``ET`` and ``t``.

	:param filepath: path of the XML file to read.
	:param unicode_file_name: path the converted XML is written to.
	"""
	tree = ET.parse(filepath)
	root = tree.getroot()
	c = t.beta2unicodeTrie()

	for sentence in root.findall('sentence'):
		for word in sentence.findall('word'):
			# NOTE(review): AuxK/AuxX look like punctuation relations - confirm.
			if word.get('relation') in ('AuxK','AuxX'):
				continue

			# convert() returns a pair; only its first item is written back.
			uform, _ = c.convert(word.get('form').upper())
			ulemma, _ = c.convert(word.get('lemma').upper())

			word.set('form',uform)
			word.set('lemma',ulemma)

	tree.write(unicode_file_name,encoding='UTF-8')
'''
	Take the data from the Iliad XML file from Perseus Hopper Treebank
	and make it into something that looks like NLTK can parse
'''

import xml.etree.cElementTree as ET
import beta2unicode as t
# Build one line of "token/tag" pairs per sentence of Iliad.xml.
# Each word's form is upper-cased and converted with the beta2unicode trie.
# Punctuation tokens are tagged with themselves; every other token is tagged
# with the word's 'relation' attribute. Lines end with "\r\n".
tree = ET.parse("Iliad.xml")
root = tree.getroot()
c = t.beta2unicodeTrie()
punct = [u'.',u',',u';',u':']
sentences = root.findall('sentence')

# Collect the pieces and join once instead of growing a string with "+=".
pieces = []
for sentence in sentences:
	words = sentence.findall('word')
	for word in words:
		form = word.get('form')
		# convert() returns a pair; only the first item (a) is used.
		a, b = c.convert(form.upper())
		if a in punct:
			pieces.append(u"%s/%s " % (a,a))
		else:
			pieces.append(u"%s/%s " % (a,word.get('relation')))
	pieces.append("\r\n")
output = u"".join(pieces)

# I was originally going to convert the lemmata to unicode as well
# (storing 'unicode_form' / 'unicode_lemma' attributes on each word),
# but I don't really have the patience to futz around with it.
    o.close()

if __name__ == '__main__':
    # Command-line entry point: pick one pipeline step by sub-command, or
    # "all" to run every step in order (download, patch, convert, rename).
    parser = argparse.ArgumentParser(
        description="Convert the CATSS LXXM text to unicode")
    subs = parser.add_subparsers(dest='command')

    # Sub-command name -> help text, registered in display order.
    for name, summary in (
            ("download", "Download the files"),
            ("patch", "Apply corrections"),
            ("convert", "Convert from betacode to unicode"),
            ("rename", "Rename files"),
            ("all", "Complete all actions")):
        subs.add_parser(name, help=summary)

    args = parser.parse_args()
    run_everything = args.command == "all"

    if run_everything or args.command == "download":
        download_lxxm()

    # Apply corrections so unicode conversion will work
    if run_everything or args.command == "patch":
        subprocess.call("patch -p1 < lxxm-corrections.patch", shell=True)

    if run_everything or args.command == "convert":
        # One converter instance is shared by every text.
        t = beta2unicode.beta2unicodeTrie()
        for text in texts:
            convert_file(text, t)

    if run_everything or args.command == "rename":
        rename()