def find_lang(source, xml):
    '''
    Finds and returns the iso 639-3 code for an xml file in the given source folder.

    Three places are tried in order and the first hit is returned:
      1. a root attribute whose name ends in 'lang': the last three characters
         of its value, lower-cased. Values shorter than three characters or
         ending in 'und' are ignored.
      2. the kindOf attribute of a FORM element directly under an S or M child
         of the root: the three characters that follow '-txt-'.
      3. the xml file name: the text after the first '_' with the last four
         characters (presumably '.xml') removed and underscores turned into
         spaces, looked up by name through pycountry.

    source is concatenated directly with xml to build the path, so it has to
    end with a path separator.
    '''
    root = process.clean_up(ElementTree.parse(source + xml).getroot())

    #1. The lang attribute of the root element.
    for attr_name, attr_value in root.attrib.items():
        if not attr_name.endswith('lang'):
            continue
        if len(attr_value) < 3 or attr_value.endswith('und'):
            continue
        return attr_value[-3:].lower()

    #2. The kindOf attribute of the forms.
    for child in root:
        if child.tag not in ("S", "M"):
            continue
        for form in child.findall("FORM"):
            kind = form.attrib.get('kindOf')
            if kind is None:
                continue
            marker_at = kind.find('-txt-')
            if marker_at != -1:
                #Skip the 5 characters of '-txt-' and take the next 3.
                return kind[marker_at + 5:marker_at + 8]

    #3. The language name in the xml file name.
    lang_name = xml[xml.find('_') + 1:-4].replace('_', ' ')
    #Hard coded one language that was causing trouble. look_up_lang function
    #could be used here also, which is a more general solution, but it would
    #create unnecessary traffic.
    if lang_name == "Xaracuu":
        return "ane"
    return pycountry.languages.get(name=lang_name).alpha_3.lower()
if __name__ == "__main__":
    #Folder that is listed for input files, and the two destination folders
    #handed to divide_phonemes.
    src = "Processed/"
    txt_dest = "label/"
    wav_dest = "wav/"
    #Files whose lower-cased name contains any of these substrings are skipped.
    skip = [
        "ortho", "west_uvean", "wallisian", "tiri", "bwatoo", "cemuhi",
        "numee", "laz", "paici", "maore_comorian", "wayana",
        "ngazidja_comorian", "araki", "wetamut", "yucuna", "ajie",
        "xaracuu", "xaragure", "dehu", "nelemwa", "nemi"
    ]
    #Running sum of calc_time results (presumably seconds, since it is divided
    #by 60 and reported as minutes below).
    time = 0

    logger.info("Creating phoneme files...")
    for fname in os.listdir(src):
        lowered = fname.lower()
        if any(name in lowered for name in skip):
            continue

        divide_phonemes(fname, src, txt_dest, wav_dest)

        #The matching recording xml is named after the part of the file name
        #that precedes '_Processed'.
        stem = fname[:fname.find('_Processed')]
        recording = ElementTree.parse('Recordings_xml/' + stem + '.xml')
        time += calc_time(clean_up(recording.getroot()))

    #Disabled: writing the total to a file.
    #with open(f'{dest}total_audio.txt', 'w') as outf:
    #    outf.write(f'Total audio in minutes: {time/60} mins')

    minutes = str(time / 60)
    logger.info('Phoneme files created. Total audio in minutes:' + minutes +
                ' mins.')
def create_set(source, dest, xml):
    '''
    Creates character sets (phono, ortho, undetermined) and audio info for a
    given xml from the source folder, into the specific language's folder in
    dest.

    source and dest are used as plain string prefixes (source + xml and
    f"{dest}{lang}/"), so both have to end with a path separator. The language
    folder is created when it is missing, and the three <lang>_phono.txt,
    <lang>_ortho.txt and <lang>_undet.txt files are opened in binary append
    mode.

    Raises AttributeError when the xml has no S and no W elements and the root
    has no FORM child either.
    '''
    tree = ElementTree.parse(source + xml)
    root = process.clean_up(tree.getroot())
    speakers = {'Total time': 0, f'{root.tag}': 0}
    written = {}
    lang = find_lang(source, xml)
    path = f"{dest}{lang}/"
    #exist_ok avoids the race between checking for the folder and creating it.
    os.makedirs(path, exist_ok=True)
    create_written(written, path)
    create_audio_info(speakers, lang, root, path)

    with open(f'{path}{lang}_phono.txt', 'ab') as phonof, \
            open(f'{path}{lang}_ortho.txt', 'ab') as orthof, \
            open(f'{path}{lang}_undet.txt', 'ab') as undetf:
        sents = root.findall("S")
        #W elements only matter when the file has no sentences.
        words = [] if sents else root.findall("W")

        #Three different processes for three different main formats of the xml files.
        if sents:
            for sent in sents:
                lines = []
                kinds = []
                process.process_sent(xml, sent, lines, kinds, get_info=False)
                update_audio_info(speakers, sent)
                update_files(written, lines, kinds, phonof, orthof, undetf)
        elif words:
            for word in words:
                forms = word.findall("FORM")
                #Words without any FORM are skipped entirely (an older TRANSL
                #fallback for them is disabled).
                if not forms:
                    continue
                lines = []
                kinds = []
                #For each form, add the line to lines, and find the kind for
                #the same index and add it to kinds.
                #NOTE(review): nesting was reconstructed from a source whose
                #indentation was lost; confirm that update_kinds belongs
                #inside the text check.
                for i, form in enumerate(forms):
                    if form.text is not None:
                        line = process.strip_punc(form.text, after_info=False)
                        process.add_to_list(lines, line, i)
                        process.update_kinds(form, lines, kinds, i)
                update_audio_info(speakers, word)
                update_files(written, lines, kinds, phonof, orthof, undetf)
        else:
            #Neither sentences nor words: the root itself carries the form.
            form = root.find("FORM")
            lines = [process.strip_punc(form.text, after_info=False)]
            kinds = []
            process.update_kinds(form, lines, kinds, 0)
            update_audio_info(speakers, root)
            update_files(written, lines, kinds, phonof, orthof, undetf)

    write_audio_info(speakers, lang, path)
    process.remove_empty_files(path, report=False)