Code example #1
def process(corpus_path, output_datadir):
    common_utils.make_sure_path_exists(output_datadir)
    nlp = spacy.load('de_core_news_sm')

    # Common voice has repetitions and the text is not normalized
    # we cache text normalizations since they can be slow
    normalize_cache = {}

    # we first load the entire corpus text into memory, sort by ID and then write it out into Kaldis data_dir format
    corpus = {}

    print('Loading', corpus_path + validated_filename)
    with open(corpus_path + validated_filename) as corpus_path_in:
        for line in corpus_path_in:
            split = line.split('\t')
            #        print(split)

            #myid = split[0]
            filename = split[1]
            text = split[2]

            #       print(filename)
            m = re.match(r'[^0-9]*([0-9]+)[^0-9]*mp3', filename)

            # only proceed if we can parse the sequence num from the filename
            if m:
                seq_num = int(m.group(1))

                myid = "%.10d" % seq_num

                spk = myid

                if text not in normalize_cache:
                    normalized_text = normalize_sentences.normalize(nlp, text)
                    normalize_cache[text] = normalized_text
                else:
                    normalized_text = normalize_cache[text]

                #print(myid, filename, text, normalized_text)

                corpus[myid] = (filename, normalized_text)

    print('done loading common voice tsv!')
    print('Now writing out to', output_datadir, 'in Kaldi format!')

    with open(output_datadir + 'wav.scp', 'w') as wav_scp, open(
            output_datadir + 'utt2spk',
            'w') as utt2spk, open(output_datadir + 'text', 'w') as text_out:
        for myid in sorted(corpus.keys()):
            spk = myid
            fullid = spk + '_' + myid
            filename, normalized_text = corpus[myid]

            wav_scp.write(fullid + ' ' + wav_scp_template.replace(
                "$filepath", corpus_path + 'clips/' + filename) + '\n')
            utt2spk.write(fullid + ' ' + spk + '\n')
            text_out.write(fullid + ' ' + normalized_text + '\n')

    print('done!')
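
The excerpt above relies on two module-level globals, validated_filename and wav_scp_template, that are defined elsewhere in the file. A minimal sketch of plausible definitions (assumptions for illustration only; the actual values in the source may differ):

# Hypothetical definitions, not taken from the original source:
validated_filename = 'validated.tsv'  # Common Voice's curated transcript list
wav_scp_template = 'ffmpeg -i $filepath -f wav -acodec pcm_s16le -ar 16000 -ac 1 - |'
# $filepath is substituted per utterance (see wav_scp.write above); the trailing
# '|' tells Kaldi to treat the wav.scp entry as a command pipe that decodes the
# mp3 to 16 kHz mono 16-bit wav on the fly.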
Code example #2
def writeKaldiDataFolder(dest_dir, utts, filter_fileid_list=None):
    ''' Exports the internal representation utts for all utterances into KALDIs corpus description format '''
    # Kaldi format, files: text,wav.scp,utt2spk,spk2gender

    # File: text
    # List of:
    # recording-id transcription

    # File: wav.scp
    # List of:
    # recording-id extended-filename

    # File: utt2spk
    # List of:
    # utteranceid speaker

    # File: spk2gender
    # List of:
    # speakerid gender

    #All files need to be sorted by key value!

    make_sure_path_exists(dest_dir)

    with open(dest_dir + 'wav.scp',
              'w') as wavscp, open(dest_dir + 'utt2spk', 'w') as utt2spk, open(
                  dest_dir + 'spk2gender',
                  'w') as spk2gender, codecs.open(dest_dir + 'text', 'w',
                                                  'utf-8') as text:
        speaker2gender = {}

        #sort by kaldi id
        utts = sorted(utts, key=lambda utt: utt['kaldi_id'])

        for utt in utts:
            kaldi_base_id = utt['kaldi_id']
            transcription = ' '.join(utt['clean_sentence_tokens'])

            for fileid, mic in zip(utt['fileids'], 'abcdefgh'):
                if fileid != 'missing':
                    if filter_fileid_list is not None:
                        if filter_fileid_list != mic:
                            continue
                    kaldi_id = kaldi_base_id + '_' + mic
                    text.write(kaldi_id + ' ' + transcription + '\n')
                    wavscp.write(kaldi_id + ' ' + fileid + '\n')
                    utt2spk.write(kaldi_id + ' ' + utt['speakerid'] + '\n')
                    speaker2gender[utt['speakerid']] = utt['gender']

        #sort by speaker
        speaker2gender = collections.OrderedDict(
            sorted(speaker2gender.items(), key=lambda x: x[0]))

        for speaker, gender in iter(speaker2gender.items()):
            spk2gender.write(speaker + ' ' +
                             ('f' if gender == 'female' else 'm') + '\n')
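
For reference, the four files described in the comment block end up containing one space-separated record per line, roughly like this (IDs and values below are invented for illustration):

text:        2014-03-17-13-03-33_a heute ist schoenes wetter
wav.scp:     2014-03-17-13-03-33_a /path/to/wav/2014-03-17-13-03-33_a.wav
utt2spk:     2014-03-17-13-03-33_a spk0042
spk2gender:  spk0042 f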
Code example #3
def writeKaldiDataFolder(dest_dir, utts, postfix, wavextension):
    ''' Exports the internal representation utts for all utterances into KALDIs corpus description format '''
    # Kaldi format, files: text,wav.scp,utt2spk,spk2gender

    # File: text
    # List of:
    # recording-id transcription

    # File: wav.scp
    # List of:
    # recording-id extended-filename

    # File: utt2spk
    # List of:
    # utteranceid speaker
    
    # File: spk2gender
    # List of:
    # speakerid gender

    #All files need to be sorted by key value!

    make_sure_path_exists(dest_dir)

    with open(dest_dir+'wav.scp','w') as wavscp, open(dest_dir+'utt2spk','w') as utt2spk, open(dest_dir+'spk2gender','w') as spk2gender, codecs.open(dest_dir+'text','w','utf-8') as text:
        speaker2gender = {}
        
        #sort by kaldi id
        utts = sorted(utts,key=lambda utt:utt['kaldi_id'])

        for utt in utts:
            kaldi_id = utt['kaldi_id']
            transcription = ' '.join(utt['clean_sentence_tokens'])

            text.write(kaldi_id+' '+transcription+'\n')
            wavscp.write(kaldi_id+' '+utt['fileid']+postfix+wavextension+'\n')
            utt2spk.write(kaldi_id+' '+utt['speakerid']+'\n')
            speaker2gender[utt['speakerid']] = utt['gender']

        #sort by speaker
        speaker2gender = collections.OrderedDict(sorted(speaker2gender.items(), key=lambda x: x[0]))

        for speaker,gender in speaker2gender.items():
            spk2gender.write(speaker+' '+('f' if gender=='female' else 'm')+'\n')
Code example #4
def main():
    main_page_html = requests.get(ROOT_URL)
    main_soup = BeautifulSoup(main_page_html.text, 'html.parser')
    pages_sidebar_soup = main_soup.find("div", {"id": "pages-2"})
    all_links = [
        x.find('a')['href'] for x in pages_sidebar_soup.find_all('li')
        if x.find('a').get_text() != 'About'
    ]
    print('Links scraped! Number of episodes:')
    print(len(all_links))
    make_sure_path_exists(output_dir)
    for current_url in all_links:
        current_bowl = BeautifulSoup(
            requests.get(current_url).text, 'html.parser')
        current_main_text = current_bowl.find('div', {'class': 'entrytext'})
        current_title = current_bowl.find('h2', {'class': 'title'}).get_text()
        current_title = current_title.replace('\xa0', ' ')  # normalize non-breaking spaces from the HTML
        current_all_paragraphs = current_main_text.find_all('p')
        current_all_paragraphs = [
            x.get_text().strip() for x in current_all_paragraphs
        ]
        current_all_paragraphs = [
            x for x in current_all_paragraphs if len(x) > 0
        ]
        while ('Written by' in current_all_paragraphs[-1]
               or 'Teleplay: ' in current_all_paragraphs[-1]
               or 'Story: ' in current_all_paragraphs[-1]):
            print('Removing line:')
            print(current_all_paragraphs[-1])
            current_all_paragraphs = current_all_paragraphs[:-1]

        print('------------')
        print('Description of the first scene')
        print(current_all_paragraphs[0])
        filename = current_title.split('-')[0].lower().strip().replace(
            ' ', '_').replace('/', '_') + '.txt'
        with open(os.path.join(output_dir, filename), 'w',
                  encoding='utf8') as f:
            f.write('>> ' + current_title + '\n')
            f.write('\n'.join(current_all_paragraphs))
        print('Document written:')
        print(current_title)
    print('Whew! Finished.')
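
To make the filename rule near the end concrete, here is a worked example with a made-up episode title:

# Worked example of the filename derivation above (the title is invented):
current_title = 'Ep. 1x01 - Pilot / The Gift'
filename = current_title.split('-')[0].lower().strip().replace(' ', '_').replace('/', '_') + '.txt'
print(filename)  # -> ep._1x01.txt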
Code example #5
def main():
    usage_str = 'Input dir (the JSON files), The JSON file containing each user\'s contributions, Output dir'
    if len(sys.argv) != 4:
        print(usage_str)
        return
    input_dir = sys.argv[1]
    contribs_filename = sys.argv[2]
    output_dir = sys.argv[3]

    contribs_dict = json.load(open(contribs_filename, 'r'))
    starting_day = 12
    starting_month = 12
    all_filenames = os.listdir(input_dir)
    all_filenames = [
        x for x in all_filenames if int(x.split('_')[2]) >= starting_day
        or int(x.split('_')[1]) != starting_month
    ]
    all_jsons = dict()
    for x in all_filenames:
        print(x)
        with open(os.path.join(input_dir, x), 'r') as f:
            current_json = json.load(f)
            current_username = current_json['name_field']
            if current_username in contribs_dict:
                current_json['contrib_count'] = len(
                    contribs_dict[current_username])
            else:
                current_json['contrib_count'] = 0
            all_jsons[x] = current_json
    with_username_removed = {
        x: remove_username_field(all_jsons[x])
        for x in all_jsons
    }
    make_sure_path_exists(output_dir)
    for x in with_username_removed:
        with open(
                os.path.join(
                    output_dir, 'anon_' +
                    hashlib.md5(x.encode('utf8')).hexdigest()) + '.json',
                'w') as f:
            json.dump(with_username_removed[x], f)
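
The anonymization step keeps the JSON content but replaces the original file name with an MD5-based one. For a hypothetical input name, the output file name is derived like this:

# 'form_12_14_0042.json' is a made-up example; the hash covers the full original name.
import hashlib
name = 'form_12_14_0042.json'
print('anon_' + hashlib.md5(name.encode('utf8')).hexdigest() + '.json')
# -> anon_<32 hex characters>.json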
Code example #6
def main():
    if len(sys.argv) != 4:
        print('Input file 1 (experienced), Input file 2 (novice), output dir.')
        return

    experienced_file = sys.argv[1]
    novice_file = sys.argv[2]
    output_dir = sys.argv[3]

    experienced_usernames = open(experienced_file, 'r').readlines()
    novice_usernames = open(novice_file, 'r').readlines()

    experienced_hash_to_username, experienced_str = hash_usernames(
        experienced_usernames)
    novice_hash_to_username, novice_str = hash_usernames(novice_usernames)

    e_hashes = set(experienced_hash_to_username.keys())
    n_hashes = set(novice_hash_to_username.keys())

    print(
        'Intersection of the two hash sets (alternatively known as "Are we f****d, or not?"):'
    )
    print(e_hashes.intersection(n_hashes))

    make_sure_path_exists(output_dir)

    with open(os.path.join(output_dir, 'experienced_user_and_hash.txt'),
              'w') as f:
        f.write(experienced_str)
    with open(os.path.join(output_dir, 'novice_user_and_hash.txt'), 'w') as f:
        f.write(novice_str)

    all_hash_to_username = experienced_hash_to_username.copy()
    all_hash_to_username.update(novice_hash_to_username)

    with open(os.path.join(output_dir, 'token_to_name_map.json'), 'w') as f:
        json.dump(all_hash_to_username, f)
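
hash_usernames itself is not shown in this excerpt. Judging only from how its two return values are used above (a hash-to-username dict plus a writable string), a stand-in with the same interface could look like the sketch below; this is an assumption, not the original implementation.

import hashlib

def hash_usernames(usernames):
    # Stand-in only: maps each username to an MD5 token and builds a
    # printable "username hash" listing. The real helper may differ.
    hash_to_username = {}
    lines = []
    for name in usernames:
        name = name.strip()
        if not name:
            continue
        token = hashlib.md5(name.encode('utf8')).hexdigest()
        hash_to_username[token] = name
        lines.append(name + ' ' + token)
    return hash_to_username, '\n'.join(lines) + '\n'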
Code example #7
                  ', because it already exists')


if not os.path.exists('run.sh'):
    print(
        'You have to run this python script from the base dir, where run.sh is located. WARNING: aborting.'
    )
    sys.exit('wrong working directory')

data_lex_dir = 'data/lexicon/'
data_local_dir = 'data/local/'
data_local_dict_dir = 'data/local/dict/'

print('Creating local data directories...')
print('Creating directory if necessary:', data_lex_dir)
make_sure_path_exists(data_lex_dir)
print('Creating directory if necessary:', data_local_dir)
make_sure_path_exists(data_local_dir)
print('Creating directory if necessary:', data_local_dict_dir)
make_sure_path_exists(data_local_dict_dir)

#if exp and mfcc don't exist locally, create them as link to some other directory on a larger disk
if not os.path.exists('exp/') and not os.path.exists('mfcc/'):
    default_dir = '/srv/data/speech/tuda_kaldi_de/'

    myinput = ''
    while (myinput != 'y' and myinput != 'n'):
        myinput = input(
            'Do you want to symlink big data directories (features, models, wavs) to another path than the current directory? (y/n) '
        )
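
The y/n prompt above decides whether the large exp/ and mfcc/ directories live on the current disk or are symlinked to a bigger one. A hypothetical continuation (not taken from the original script) could look like this, reusing the symlink_file helper whose tail appears at the top of the snippet:

    # Hypothetical continuation, for illustration only:
    if myinput == 'y':
        # put the large directories on the big disk and link them into the cwd
        for subdir in ('exp', 'mfcc'):
            make_sure_path_exists(default_dir + subdir)
            symlink_file(default_dir + subdir, subdir)
    else:
        make_sure_path_exists('exp/')
        make_sure_path_exists('mfcc/')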
Code example #8
    try:
        os.symlink(file1, file2)
    except OSError as e:
        if e.errno == errno.EEXIST:
            print('Omitted symlink', file1, '->', file2,
                  ', because it already exists')


if not os.path.exists('run.sh'):
    print(
        'You have to run this python script from the base dir, where run.sh is located. WARNING: aborting.'
    )
    sys.exit('wrong working directory')

print('Creating data dir(s)...')
make_sure_path_exists('data/lexicon/')
make_sure_path_exists('data/local/')
#make_sure_path_exists('data/wav/')
make_sure_path_exists('data/local/dict/')
#make_sure_path_exists('data/local/lang/')
#make_sure_path_exists('data/local/lm/')
#make_sure_path_exists('data/local/lm/3gram-mincount/')

#if exp and mfcc don't exist locally, create them as link to some other directory on a larger disk
if not os.path.exists('exp/') and not os.path.exists('mfcc/'):
    default_dir = '/srv/data/speech/tuda_kaldi_de/'
    data_dir = raw_input(
        'Where do you want to store mfcc vectors and models (exp)? It should point to some largish disk. default: '
        + default_dir + ' : ')
    if data_dir == '':
        data_dir = default_dir
Code example #9
def save_dict(output_dir_dict, out_dict, filename='col_dict.json'):
    output_dir_dict = add_slash_to_dir(output_dir_dict)
    output_name_dict = output_dir_dict + filename
    make_sure_path_exists(output_dir_dict)
    with open(output_name_dict, mode='w') as f:
        json.dump(out_dict, f)
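
A minimal usage sketch, assuming add_slash_to_dir simply appends a trailing '/' when it is missing:

# Hypothetical call: writes stats/col_dict.json, creating stats/ first if needed.
save_dict('stats', {'title': 0, 'year': 1})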
Code example #10
# limitations under the License.

# import argparse
import common_utils
import codecs
# import traceback
# import datetime
# import maryclient
# import StringIO
# import os
# import errno

# from bs4 import BeautifulSoup
#
from common_utils import make_sure_path_exists
# import collections
# import itertools

if __name__ == '__main__':

    wavscp = codecs.open('../data/local/train_trans.txt', 'r', 'utf-8')
    datadir = '../data/train/'
    make_sure_path_exists(datadir)
    with codecs.open(datadir + 'text', 'w', 'utf-8') as train_text:
        for wav in wavscp:
            try:
                train_text.write(wav)
            except Exception as err:
                print('Error in file, omitting', wav)
                print(err)
Code example #11
                    '{} sox {} -r 16k -t wav -c 1 -b 16 -e signed - |\n'.
                    format(uttid, wavdir + wav))


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description=
        'Prepares the German data from the M-ailabs corpus (http://www.m-ailabs.bayern/en/the-mailabs-speech-dataset/) for Kaldi.'
    )
    parser.add_argument(
        '-i',
        '--inputcorpus',
        dest='inputcorpus',
        help=
        'Path to the M-ailabs data (download here: http://www.m-ailabs.bayern/en/the-mailabs-speech-dataset/)',
        type=str,
        default='data/wav/m_ailabs/de_DE/')
    parser.add_argument('-o',
                        '--outputfolder',
                        dest='outputfolder',
                        help='Export to this Kaldi folder.',
                        type=str,
                        default='data/m_ailabs_train')
    args = parser.parse_args()

    common_utils.make_sure_path_exists(args.outputfolder)

    create_kaldi_datadir(args.outputfolder, args.inputcorpus)

    #subprocess.call('utils/fix_data_dir.sh {}'.format(odir), shell=True)
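
Assuming the excerpt lives in a standalone script (the file name below is made up), an invocation matching the argparse setup above would look like:

# Example invocation; the script name is hypothetical, the arguments match the parser above:
#   python prepare_m_ailabs_data.py --inputcorpus data/wav/m_ailabs/de_DE/ --outputfolder data/m_ailabs_train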
Code example #12
import os
import sys
import errno
from common_utils import make_sure_path_exists  # project helper, see code example #10

def symlink_file(file1,file2):
    try:
        os.symlink(file1, file2)
    except OSError, e:
        if e.errno == errno.EEXIST:
            print 'Omitted symlink', file1, '->', file2, ', because it already exists'        

if not os.path.exists('run.sh'):
    print 'You have to run this python script from the base dir, where run.sh is located. WARNING: aborting.'
    sys.exit('wrong working directory')

print 'Creating data dir(s)...'
make_sure_path_exists('data/lexicon/')
make_sure_path_exists('data/local/')
#make_sure_path_exists('data/wav/')
make_sure_path_exists('data/local/dict/')
#make_sure_path_exists('data/local/lang/')
#make_sure_path_exists('data/local/lm/')
#make_sure_path_exists('data/local/lm/3gram-mincount/')

#if exp and mfcc don't exist locally, create them as link to some other directory on a larger disk
if not os.path.exists('exp/') and not os.path.exists('mfcc/'):
    default_dir = '/srv/data/speech/tuda_kaldi_de/'
    data_dir = raw_input('Where do you want to store mfcc vectors and models (exp)? It should point to some largish disk. default: ' + default_dir + ' : ') 
    if data_dir == '':
        data_dir = default_dir

    if data_dir.endswith('/'):