def LoadFile(filepath, corpusname):
    """Parse a Perseus treebank XML file and insert every word into the
    `corpus` table, converting the Beta Code form and lemma to Unicode.
    Assumes a module-level sqlite3 connection `conn` and cursor `cur`."""
    tree = ET.parse(filepath)
    root = tree.getroot()
    c = t.beta2unicodeTrie()
    sentences = root.findall('sentence')
    for sentence in sentences:
        sentence_id = sentence.get("id")  # added post-success
        words = sentence.findall('word')
        for word in words:
            word_id = word.get('id')
            form = word.get('form')
            lemma = word.get('lemma')
            postag = word.get('postag')
            head = word.get('head')
            relation = word.get('relation')
            uform, b = c.convert(form.upper())
            # the lemma's final character is dropped before conversion
            ulemma, d = c.convert(lemma[:-1].upper())
            sqlcmd = '''
                insert into corpus
                    (sentenceid, wordid, form, lemma, postag, head, relation, corpusname)
                values (?,?,?,?,?,?,?,?)
            '''
            cur.execute(sqlcmd, (sentence_id, word_id, uform, ulemma,
                                 postag, head, relation, corpusname))
    conn.commit()
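# LoadFile assumes a module-level sqlite3 connection and cursor and an
# existing `corpus` table.  A minimal sketch of that setup (the database
# filename and column types are assumptions, not from the original):
import sqlite3
import xml.etree.ElementTree as ET
import beta2unicode as t

conn = sqlite3.connect('treebank.db')  # hypothetical filename
cur = conn.cursor()
cur.execute('''
    create table if not exists corpus (
        sentenceid text, wordid text, form text, lemma text,
        postag text, head text, relation text, corpusname text
    )
''')
conn.commit()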
def convert_to_unicode(text):
    '''
    Given a string of Beta Code (see http://en.wikipedia.org/wiki/Beta_code),
    tokenize it and convert it to Unicode.

    :param text:
    '''
    # tokenize text on spaces
    tokens = text.split(' ')
    # create converter object
    converter = beta2unicode.beta2unicodeTrie()
    # iterate over tokens, capitalize and convert them, adding each
    # unicode translation to the output string
    converted = u""
    for word in tokens:
        unicode_, _ = converter.convert(word.upper())
        converted += unicode_ + " "
    # drop the trailing space
    converted = converted[:-1]
    return converted
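# Hypothetical usage (the Beta Code below is Iliad 1.1; with the standard
# beta2unicode tables it should convert to roughly u'μῆνιν ἄειδε θεά'):
#
#   >>> convert_to_unicode("mh=nin a)/eide qea/")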
def CleanFile(filepath, unicode_file_name):
    """Convert the Beta Code form and lemma attributes of a treebank XML
    file to Unicode, skipping punctuation tokens (relations AuxK and AuxX),
    and write the result to unicode_file_name."""
    tree = ET.parse(filepath)
    root = tree.getroot()
    c = t.beta2unicodeTrie()
    sentences = root.findall('sentence')
    for sentence in sentences:
        words = sentence.findall('word')
        for word in words:
            form = word.get('form')
            lemma = word.get('lemma')
            if word.get('relation') not in ('AuxK', 'AuxX'):
                uform, b = c.convert(form.upper())
                ulemma, d = c.convert(lemma.upper())
                word.set('form', uform)
                word.set('lemma', ulemma)
    tree.write(unicode_file_name, encoding='UTF-8')
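# Hypothetical usage, assuming the Perseus treebank file sits next to this
# script; a Unicode copy is written alongside it:
#
#   CleanFile("Iliad.xml", "Iliad-unicode.xml")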
'''
Take the data from the Iliad XML file from the Perseus Hopper Treebank and
turn it into something that NLTK can parse
'''
import xml.etree.cElementTree as ET
import beta2unicode as t

tree = ET.parse("Iliad.xml")
root = tree.getroot()
c = t.beta2unicodeTrie()
punct = [u'.', u',', u';', u':']
sentences = root.findall('sentence')
output = u""
for sentence in sentences:
    words = sentence.findall('word')
    for word in words:
        form = word.get('form')
        lemma = word.get('lemma')
        a, b = c.convert(form.upper())
        # tag punctuation with itself, everything else with its relation
        if a in punct:
            output += u"%s/%s " % (a, a)
        else:
            output += u"%s/%s " % (a, word.get('relation'))
    output += "\r\n"

'''
I was originally going to convert the lemmata to unicode as well, but I
don't really have the patience to futz around with it
'''
# word.set('unicode_form', a)
# a, b = c.convert(lemma.upper())
# word.set('unicode_lemma', a)
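# The loop above builds `output` but never writes it anywhere.  A minimal
# sketch of saving it to disk (the filename is an assumption); the word/TAG
# format matches what NLTK's TaggedCorpusReader expects:
import codecs

with codecs.open("iliad_tagged.txt", "w", encoding="utf-8") as tagged_file:
    tagged_file.write(output)

# e.g.:
#   from nltk.corpus.reader import TaggedCorpusReader
#   corpus = TaggedCorpusReader(".", ["iliad_tagged.txt"])
#   corpus.tagged_words()[:10]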
    o.close()  # last line of the preceding function (body not shown here)


if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description="Convert the CATSS LXXM text to unicode")
    subs = parser.add_subparsers(dest='command')

    # Download
    parser_dl = subs.add_parser("download", help="Download the files")
    # Patch
    parser_diff = subs.add_parser("patch", help="Apply corrections")
    # Convert
    parser_conv = subs.add_parser("convert",
                                  help="Convert from betacode to unicode")
    # Rename
    parser_ren = subs.add_parser("rename", help="Rename files")
    # All
    parser_all = subs.add_parser("all", help="Complete all actions")

    args = parser.parse_args()

    if args.command == "download" or args.command == "all":
        download_lxxm()

    # Apply corrections so unicode conversion will work
    if args.command == "patch" or args.command == "all":
        subprocess.call("patch -p1 < lxxm-corrections.patch", shell=True)

    if args.command == "convert" or args.command == "all":
        t = beta2unicode.beta2unicodeTrie()
        for text in texts:
            convert_file(text, t)

    if args.command == "rename" or args.command == "all":
        rename()
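# Hypothetical invocation, assuming the script is saved as lxxm.py, that
# argparse, subprocess, and beta2unicode are imported at the top of the
# file, and that download_lxxm(), convert_file(), rename(), and the `texts`
# list are defined earlier:
#
#   $ python lxxm.py download
#   $ python lxxm.py patch
#   $ python lxxm.py convert
#   $ python lxxm.py rename
#   $ python lxxm.py all    # run every step in order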