Python beta2unicodeTrie Beispiele

Programmiersprache: Python

Namespace / Paketname: beta2unicode

Methode / Funktion: beta2unicodeTrie

Beispiele auf hotexamples.com: 5

Python beta2unicodeTrie – 5 Beispiele gefunden. Dies sind die am besten bewerteten Python-Beispiele für beta2unicode.beta2unicodeTrie, die aus Open-Source-Projekten extrahiert wurden. Sie können die Beispiele bewerten, um deren Qualität zu verbessern.

Beispiel #1

Datei anzeigen

Datei: CleanLoadSentences.py Projekt: swasheck/greek_import

def LoadFile(filepath, corpusname):
    """Insert every word of an XML treebank file into the ``corpus`` table.

    The file at *filepath* is parsed, and for each ``word`` element under
    each top-level ``sentence`` element one row is inserted through the
    module-level cursor ``cur``. The ``form`` attribute and the ``lemma``
    attribute (minus its last character) are upper-cased and passed through
    ``beta2unicodeTrie.convert`` before insertion. The transaction is
    committed once, after all rows have been executed, via the module-level
    connection ``conn``.

    :param filepath: path of the XML file to parse.
    :param corpusname: value stored in the ``corpusname`` column of every row.
    """
    root = ET.parse(filepath).getroot()
    converter = t.beta2unicodeTrie()

    # The statement text never changes, so build it once instead of once per
    # word. The literal is kept identical to the statement used so far.
    insert_sql = (
        " insert into corpus (sentenceid,wordid,form,lemma,postag,head,relation,corpusname) \n"
        "\t\t\t\t\t\t\t\tvalues (?,?,?,?,?,?,?,?)\n"
        "\t\t\t\t"
    )

    for sentence in root.findall('sentence'):
        sentence_id = sentence.get("id")
        for word in sentence.findall('word'):
            # convert() returns a pair; only its first element is stored.
            uform, _ = converter.convert(word.get('form').upper())
            # NOTE(review): the last character of the lemma is dropped before
            # conversion -- presumably a trailing marker; confirm with the data.
            ulemma, _ = converter.convert(word.get('lemma')[:-1].upper())

            cur.execute(insert_sql, (
                sentence_id,
                word.get('id'),
                uform,
                ulemma,
                word.get('postag'),
                word.get('head'),
                word.get('relation'),
                corpusname,
            ))

    conn.commit()

Beispiel #2

Datei anzeigen

Datei: parse_corpus.py Projekt: ramatevish/greekify

def convert_to_unicode(text):
    '''
    Given a string of Beta Code (see http://en.wikipedia.org/wiki/Beta_code),
    tokenize it on single spaces and convert each token to Unicode.

    Every token is upper-cased before it is handed to the converter, and the
    converted tokens are joined back together with single spaces.

    :param text: the Beta Code string to convert
    :return: the converted tokens separated by single spaces
    '''
    # create converter object
    converter = beta2unicode.beta2unicodeTrie()

    # convert() returns a pair; only its first element (the converted text)
    # is kept. Joining once avoids repeated string concatenation and the
    # trailing-space strip that went with it.
    return u" ".join(
        converter.convert(word.upper())[0] for word in text.split(' ')
    )

Beispiel #3

Datei anzeigen

Datei: CleanSentences.py Projekt: swasheck/greek_import

def CleanFile(filepath,unicode_file_name):
    """Rewrite the ``form`` and ``lemma`` attributes of an XML file in place.

    Parses *filepath*, and for every ``word`` element under each top-level
    ``sentence`` element whose ``relation`` attribute is neither ``AuxK`` nor
    ``AuxX``, replaces ``form`` and ``lemma`` with the result of passing
    their upper-cased values through ``beta2unicodeTrie.convert``. The
    modified tree is then written to *unicode_file_name* as UTF-8.

    :param filepath: path of the XML file to read.
    :param unicode_file_name: path the converted tree is written to.
    """
    skipped_relations = ('AuxK', 'AuxX')
    document = ET.parse(filepath)
    converter = t.beta2unicodeTrie()

    for sentence_node in document.getroot().findall('sentence'):
        for word_node in sentence_node.findall('word'):
            raw_form = word_node.get('form')
            raw_lemma = word_node.get('lemma')
            if word_node.get('relation') in skipped_relations:
                continue

            # convert() returns a pair; only the first element is used.
            converted_form = converter.convert(raw_form.upper())[0]
            converted_lemma = converter.convert(raw_lemma.upper())[0]

            word_node.set('form', converted_form)
            word_node.set('lemma', converted_lemma)

    document.write(unicode_file_name, encoding='UTF-8')

Beispiel #4

Datei anzeigen

Datei: 1 - Create Tagged Sentences.py Projekt: swasheck/IliadBrillTrainer

'''
    Take the data from the Iliad XML file from Perseus Hopper Treebank
    and make it into something that looks like NLTK can parse
'''

try:
    import xml.etree.cElementTree as ET
except ImportError:
    # cElementTree was removed in Python 3.9; ElementTree is its replacement.
    import xml.etree.ElementTree as ET
import beta2unicode as t

tree = ET.parse("Iliad.xml")
root = tree.getroot()
c = t.beta2unicodeTrie()
punct = [u'.', u',', u';', u':']
sentences = root.findall('sentence')

# Build one "token/tag " entry per word and one CRLF per sentence, then join
# once at the end instead of growing the string piece by piece.
pieces = []
for sentence in sentences:
    for word in sentence.findall('word'):
        # convert() returns a pair; only the converted text is needed here.
        a, _ = c.convert(word.get('form').upper())
        # Punctuation is tagged with itself, every other token with the
        # word's 'relation' attribute.
        tag = a if a in punct else word.get('relation')
        pieces.append(u"%s/%s " % (a, tag))
    pieces.append(u"\r\n")
output = u"".join(pieces)

# The lemmata are deliberately left unconverted: the original author planned
# to convert them to unicode as well but did not pursue it.

Beispiel #5

Datei anzeigen

Datei: lxxm-convert.py Projekt: Jaden-J/biblical-studies

    o.close()

if __name__ == '__main__':
    # Command-line entry point: each sub-command runs one stage of the
    # pipeline and "all" runs every stage in order.
    parser = argparse.ArgumentParser(
        description="Convert the CATSS LXXM text to unicode")
    subs = parser.add_subparsers(dest='command')

    # Sub-commands in registration order: (name, help text).
    for command_name, command_help in (
        ("download", "Download the files"),
        ("patch", "Apply corrections"),
        ("convert", "Convert from betacode to unicode"),
        ("rename", "Rename files"),
        ("all", "Complete all actions"),
    ):
        subs.add_parser(command_name, help=command_help)

    args = parser.parse_args()
    requested = args.command
    run_everything = requested == "all"

    if run_everything or requested == "download":
        download_lxxm()

    # Apply corrections so unicode conversion will work
    if run_everything or requested == "patch":
        subprocess.call("patch -p1 < lxxm-corrections.patch", shell=True)

    if run_everything or requested == "convert":
        trie = beta2unicode.beta2unicodeTrie()
        for text in texts:
            convert_file(text, trie)

    if run_everything or requested == "rename":
        rename()