def LoadFile(filepath, corpusname):
    """Parse a Perseus treebank XML file and insert every word into the
    `corpus` table, converting the Beta Code form and lemma to Unicode.
    Assumes a module-level sqlite3 connection `conn` and cursor `cur`."""
    tree = ET.parse(filepath)
    root = tree.getroot()
    c = t.beta2unicodeTrie()
    sentences = root.findall('sentence')
    for sentence in sentences:
        sentence_id = sentence.get("id")  # added post-success
        words = sentence.findall('word')
        for word in words:
            word_id = word.get('id')
            form = word.get('form')
            lemma = word.get('lemma')
            postag = word.get('postag')
            head = word.get('head')
            relation = word.get('relation')
            uform, b = c.convert(form.upper())
            # the lemma's final character is dropped before conversion
            ulemma, d = c.convert(lemma[:-1].upper())
            sqlcmd = '''
                insert into corpus
                    (sentenceid, wordid, form, lemma, postag, head, relation, corpusname)
                values (?,?,?,?,?,?,?,?)
            '''
            cur.execute(sqlcmd, (sentence_id, word_id, uform, ulemma,
                                 postag, head, relation, corpusname))
    conn.commit()
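# LoadFile assumes a module-level sqlite3 connection and cursor and an
# existing `corpus` table.  A minimal sketch of that setup (the database
# filename and column types are assumptions, not from the original):
import sqlite3
import xml.etree.ElementTree as ET
import beta2unicode as t

conn = sqlite3.connect('treebank.db')  # hypothetical filename
cur = conn.cursor()
cur.execute('''
    create table if not exists corpus (
        sentenceid text, wordid text, form text, lemma text,
        postag text, head text, relation text, corpusname text
    )
''')
conn.commit()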
def convert_to_unicode(text):
    '''
    Given a string of Beta Code (see http://en.wikipedia.org/wiki/Beta_code),
    tokenize it and convert it to Unicode.

    :param text:
    '''
    # tokenize text on spaces
    tokens = text.split(' ')
    # create converter object
    converter = beta2unicode.beta2unicodeTrie()
    # iterate over tokens, capitalize and convert them, adding each
    # unicode translation to the output string
    converted = u""
    for word in tokens:
        unicode_, _ = converter.convert(word.upper())
        converted += unicode_ + " "
    # drop the trailing space
    converted = converted[:-1]
    return converted
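# Hypothetical usage (the Beta Code below is Iliad 1.1; with the standard
# beta2unicode tables it should convert to roughly u'μῆνιν ἄειδε θεά'):
#
#   >>> convert_to_unicode("mh=nin a)/eide qea/")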
def CleanFile(filepath, unicode_file_name):
    """Convert the Beta Code form and lemma attributes of a treebank XML
    file to Unicode, skipping punctuation tokens (relations AuxK and AuxX),
    and write the result to unicode_file_name."""
    tree = ET.parse(filepath)
    root = tree.getroot()
    c = t.beta2unicodeTrie()
    sentences = root.findall('sentence')
    for sentence in sentences:
        words = sentence.findall('word')
        for word in words:
            form = word.get('form')
            lemma = word.get('lemma')
            if word.get('relation') not in ('AuxK', 'AuxX'):
                uform, b = c.convert(form.upper())
                ulemma, d = c.convert(lemma.upper())
                word.set('form', uform)
                word.set('lemma', ulemma)
    tree.write(unicode_file_name, encoding='UTF-8')
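# Hypothetical usage, assuming the Perseus treebank file sits next to this
# script; a Unicode copy is written alongside it:
#
#   CleanFile("Iliad.xml", "Iliad-unicode.xml")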
'''
Take the data from the Iliad XML file from the Perseus Hopper Treebank and
turn it into something that NLTK can parse
'''
import xml.etree.cElementTree as ET
import beta2unicode as t

tree = ET.parse("Iliad.xml")
root = tree.getroot()
c = t.beta2unicodeTrie()
punct = [u'.', u',', u';', u':']
sentences = root.findall('sentence')
output = u""
for sentence in sentences:
    words = sentence.findall('word')
    for word in words:
        form = word.get('form')
        lemma = word.get('lemma')
        a, b = c.convert(form.upper())
        # tag punctuation with itself, everything else with its relation
        if a in punct:
            output += u"%s/%s " % (a, a)
        else:
            output += u"%s/%s " % (a, word.get('relation'))
    output += "\r\n"

'''
I was originally going to convert the lemmata to unicode as well, but I
don't really have the patience to futz around with it
'''
# word.set('unicode_form', a)
# a, b = c.convert(lemma.upper())
# word.set('unicode_lemma', a)
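# The loop above builds `output` but never writes it anywhere.  A minimal
# sketch of saving it to disk (the filename is an assumption); the word/TAG
# format matches what NLTK's TaggedCorpusReader expects:
import codecs

with codecs.open("iliad_tagged.txt", "w", encoding="utf-8") as tagged_file:
    tagged_file.write(output)

# e.g.:
#   from nltk.corpus.reader import TaggedCorpusReader
#   corpus = TaggedCorpusReader(".", ["iliad_tagged.txt"])
#   corpus.tagged_words()[:10]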
    o.close()  # last line of the preceding function (body not shown here)


if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description="Convert the CATSS LXXM text to unicode")
    subs = parser.add_subparsers(dest='command')

    # Download
    parser_dl = subs.add_parser("download", help="Download the files")
    # Patch
    parser_diff = subs.add_parser("patch", help="Apply corrections")
    # Convert
    parser_conv = subs.add_parser("convert",
                                  help="Convert from betacode to unicode")
    # Rename
    parser_ren = subs.add_parser("rename", help="Rename files")
    # All
    parser_all = subs.add_parser("all", help="Complete all actions")

    args = parser.parse_args()

    if args.command == "download" or args.command == "all":
        download_lxxm()

    # Apply corrections so unicode conversion will work
    if args.command == "patch" or args.command == "all":
        subprocess.call("patch -p1 < lxxm-corrections.patch", shell=True)

    if args.command == "convert" or args.command == "all":
        t = beta2unicode.beta2unicodeTrie()
        for text in texts:
            convert_file(text, t)

    if args.command == "rename" or args.command == "all":
        rename()
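# Hypothetical invocation, assuming the script is saved as lxxm.py, that
# argparse, subprocess, and beta2unicode are imported at the top of the
# file, and that download_lxxm(), convert_file(), rename(), and the `texts`
# list are defined earlier:
#
#   $ python lxxm.py download
#   $ python lxxm.py patch
#   $ python lxxm.py convert
#   $ python lxxm.py rename
#   $ python lxxm.py all    # run every step in order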