Esempio n. 1
0
def find_lang(source, xml):
    """Find and return the ISO 639-3 code for an xml file in the given source folder.

    Three strategies are tried in order and the first one that yields a code
    wins:

    1. An attribute of the root element whose name ends with ``lang``: the
       last three characters of its value, unless they spell ``und``.
    2. The ``kindOf`` attribute of a ``FORM`` element directly under a
       top-level ``S`` or ``M`` element: the (up to) three characters that
       follow the ``-txt-`` marker.
    3. The file name: the part between the first underscore and the last four
       characters (presumably the ``.xml`` extension), with underscores
       turned into spaces, looked up as a language name in pycountry.

    Args:
        source: Folder prefix. It is concatenated directly with ``xml``, so
            it has to end with a path separator.
        xml: Name of the xml file inside ``source``.

    Returns:
        The language code, lower-cased.
    """
    tree = ElementTree.parse(source + xml)
    root = process.clean_up(tree.getroot())

    # Strategy 1: the lang attribute of the root element.
    for key, value in root.attrib.items():
        if key.endswith('lang') and len(value) >= 3:
            code = value[-3:].lower()
            # Compare after lower-casing so that an upper-case "UND" is
            # rejected just like "und" instead of being returned as a code.
            if code != 'und':
                return code

    # Strategy 2: the kindOf attribute of a FORM element.
    marker = '-txt-'
    for child in root:
        if child.tag in ("S", "M"):
            for form in child.findall("FORM"):
                kind = form.attrib.get('kindOf')
                if kind is None:
                    continue
                pos = kind.find(marker)
                if pos != -1:
                    start = pos + len(marker)
                    # Lower-cased for consistency with the other strategies.
                    return kind[start:start + 3].lower()

    # Strategy 3: derive the language name from the xml file name.
    lang_name = xml[xml.find('_') + 1:-4].replace('_', ' ')

    # Hard coded one language that was causing trouble. The look_up_lang
    # function could be used here also, which is a more general solution, but
    # it would create unnecessary traffic.
    if lang_name == "Xaracuu":
        return "ane"

    # NOTE(review): if pycountry has no language with this name, get() may
    # return None and the attribute access below then raises AttributeError
    # -- confirm against the installed pycountry version.
    return pycountry.languages.get(name=lang_name).alpha_3.lower()
if __name__ == "__main__":
    # Input folder and the two output folders handed to divide_phonemes.
    src = "Processed/"
    txt_dest = "label/"
    wav_dest = "wav/"
    # A file is left out when its lower-cased name contains any of these
    # fragments.
    skip = [
        "ortho", "west_uvean", "wallisian", "tiri", "bwatoo", "cemuhi",
        "numee", "laz", "paici", "maore_comorian", "wayana",
        "ngazidja_comorian", "araki", "wetamut", "yucuna", "ajie",
        "xaracuu", "xaragure", "dehu", "nelemwa", "nemi",
    ]

    # Running sum of calc_time() over all processed recordings; divided by 60
    # below to report minutes.
    time = 0

    logger.info("Creating phoneme files...")

    for file in os.listdir(src):
        lowered = file.lower()
        if any(fragment in lowered for fragment in skip):
            continue

        divide_phonemes(file, src, txt_dest, wav_dest)

        # The matching recording xml carries the file name up to the
        # '_Processed' marker.
        stem = file[:file.find('_Processed')]
        recording = ElementTree.parse('Recordings_xml/' + stem + '.xml')
        time += calc_time(clean_up(recording.getroot()))

    # Disabled: writing the total to a file.
    # with open(f'{dest}total_audio.txt', 'w') as outf:
    #     outf.write(f'Total audio in minutes: {time/60} mins')

    minutes = str(time / 60)
    logger.info('Phoneme files created. Total audio in minutes:' + minutes +
                ' mins.')
Esempio n. 3
0
def create_set(source, dest, xml):
    """Create character sets and audio info for one xml file.

    The xml file is read from ``source``; output goes into the folder
    ``{dest}{lang}/``, where ``lang`` is whatever ``find_lang`` returns for
    the file. Three files are opened there in binary append mode
    (``{lang}_phono.txt``, ``{lang}_ortho.txt`` and ``{lang}_undet.txt``) and
    passed to ``update_files`` together with the lines and kinds extracted
    from the document.

    Three document layouts are handled, in this order of precedence:

    * the root has ``S`` children: each one is processed with
      ``process.process_sent``;
    * the root has ``W`` children: every ``FORM`` of each word with non-empty
      text contributes one line;
    * otherwise: the root's own ``FORM`` element, if any, contributes one
      line.

    Args:
        source: Folder prefix of the xml file. It is concatenated directly
            with ``xml``, so it has to end with a path separator.
        dest: Folder prefix under which the per-language folder is created;
            also concatenated directly.
        xml: Name of the xml file inside ``source``.
    """
    tree = ElementTree.parse(source + xml)
    root = process.clean_up(tree.getroot())

    speakers = {'Total time': 0, f'{root.tag}': 0}
    written = {}

    lang = find_lang(source, xml)
    path = f"{dest}{lang}/"

    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(path, exist_ok=True)

    create_written(written, path)
    create_audio_info(speakers, lang, root, path)

    with open(f'{path}{lang}_phono.txt', 'ab') as phonof, \
            open(f'{path}{lang}_ortho.txt', 'ab') as orthof, \
            open(f'{path}{lang}_undet.txt', 'ab') as undetf:
        sents = root.findall("S")
        # Words are only looked at when the document has no sentences.
        words = [] if sents else root.findall("W")

        # Three different processes for three different main formats of the
        # xml files.
        if sents:
            for sent in sents:
                lines = []
                kinds = []
                process.process_sent(xml, sent, lines, kinds, get_info=False)
                update_audio_info(speakers, sent)
                update_files(written, lines, kinds, phonof, orthof, undetf)

        elif words:
            for word in words:
                forms = word.findall("FORM")
                # Words without any FORM are skipped entirely.
                if not forms:
                    continue

                lines = []
                kinds = []
                # For each form, add the line to lines, and find the kind for
                # the same index and add it to kinds.
                for i, form in enumerate(forms):
                    if form.text is not None:
                        line = process.strip_punc(form.text,
                                                  after_info=False)
                        process.add_to_list(lines, line, i)
                        process.update_kinds(form, lines, kinds, i)
                update_audio_info(speakers, word)
                update_files(written, lines, kinds, phonof, orthof, undetf)

        else:
            form = root.find("FORM")
            # Guard against a missing FORM or empty text the same way the
            # word branch does, instead of failing on a None dereference.
            if form is not None:
                lines = []
                kinds = []
                if form.text is not None:
                    lines.append(
                        process.strip_punc(form.text, after_info=False))
                    process.update_kinds(form, lines, kinds, 0)
                update_audio_info(speakers, root)
                update_files(written, lines, kinds, phonof, orthof, undetf)

    write_audio_info(speakers, lang, path)
    process.remove_empty_files(path, report=False)