def process(corpus_path, output_datadir):
    common_utils.make_sure_path_exists(output_datadir)

    nlp = spacy.load('de_core_news_sm')

    # Common Voice has repetitions and the text is not normalized;
    # we cache text normalizations since they can be slow.
    normalize_cache = {}

    # We first load the entire corpus text into memory, sort by ID and then
    # write it out in Kaldi's data_dir format.
    corpus = {}

    print('Loading', corpus_path + validated_filename)

    with open(corpus_path + validated_filename) as corpus_path_in:
        for line in corpus_path_in:
            split = line.split('\t')
            filename = split[1]
            text = split[2]

            m = re.match(r'[^0-9]*([0-9]+)[^0-9]*mp3', filename)

            # Only proceed if we can parse the sequence number from the filename.
            if m:
                seq_num = int(m.group(1))
                myid = "%.10d" % seq_num

                if text not in normalize_cache:
                    normalized_text = normalize_sentences.normalize(nlp, text)
                    normalize_cache[text] = normalized_text
                else:
                    normalized_text = normalize_cache[text]

                corpus[myid] = (filename, normalized_text)

    print('Done loading the Common Voice tsv!')
    print('Now writing out to', output_datadir, 'in Kaldi format!')

    with open(output_datadir + 'wav.scp', 'w') as wav_scp, \
            open(output_datadir + 'utt2spk', 'w') as utt2spk, \
            open(output_datadir + 'text', 'w') as text_out:
        for myid in sorted(corpus.keys()):
            spk = myid
            fullid = spk + '_' + myid
            filename, normalized_text = corpus[myid]
            wav_scp.write(fullid + ' ' + wav_scp_template.replace(
                "$filepath", corpus_path + 'clips/' + filename) + '\n')
            utt2spk.write(fullid + ' ' + spk + '\n')
            text_out.write(fullid + ' ' + normalized_text + '\n')

    print('Done!')
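# A minimal usage sketch for process(). The module-level globals
# validated_filename and wav_scp_template are referenced above but not defined
# in this snippet; the values below are illustrative assumptions, following
# Common Voice's validated.tsv naming and the sox pipe style used elsewhere in
# this repo to decode the mp3 input on the fly.
if __name__ == '__main__':
    validated_filename = 'validated.tsv'  # assumed Common Voice index file
    wav_scp_template = 'sox $filepath -r 16k -t wav -c 1 -b 16 -e signed - |'  # assumed pipe
    process('data/wav/common_voice_de/', 'data/cv_train/')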
def writeKaldiDataFolder(dest_dir, utts, filter_fileid_list=None):
    '''Exports the internal representation utts for all utterances into
    Kaldi's corpus description format.'''

    # Kaldi format, files: text, wav.scp, utt2spk, spk2gender
    #
    # File: text
    #   List of: recording-id transcription
    # File: wav.scp
    #   List of: recording-id extended-filename
    # File: utt2spk
    #   List of: utterance-id speaker
    # File: spk2gender
    #   List of: speaker-id gender
    #
    # All files need to be sorted by key value!

    make_sure_path_exists(dest_dir)

    with open(dest_dir + 'wav.scp', 'w') as wavscp, \
            open(dest_dir + 'utt2spk', 'w') as utt2spk, \
            open(dest_dir + 'spk2gender', 'w') as spk2gender, \
            codecs.open(dest_dir + 'text', 'w', 'utf-8') as text:

        speaker2gender = {}

        # Sort by Kaldi id.
        utts = sorted(utts, key=lambda utt: utt['kaldi_id'])

        for utt in utts:
            kaldi_base_id = utt['kaldi_id']
            transcription = ' '.join(utt['clean_sentence_tokens'])
            for fileid, mic in zip(utt['fileids'], 'abcdefgh'):
                if fileid != 'missing':
                    # Optionally restrict the export to a single microphone id.
                    if filter_fileid_list is not None:
                        if filter_fileid_list != mic:
                            continue
                    kaldi_id = kaldi_base_id + '_' + mic
                    text.write(kaldi_id + ' ' + transcription + '\n')
                    wavscp.write(kaldi_id + ' ' + fileid + '\n')
                    utt2spk.write(kaldi_id + ' ' + utt['speakerid'] + '\n')
                    speaker2gender[utt['speakerid']] = utt['gender']

        # Sort by speaker.
        speaker2gender = collections.OrderedDict(
            sorted(speaker2gender.items(), key=lambda x: x[0]))
        for speaker, gender in speaker2gender.items():
            spk2gender.write(speaker + ' ' +
                            ('f' if gender == 'female' else 'm') + '\n')
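# An illustrative sketch (values are assumptions, not from the source) of the
# internal utts representation this function expects: one entry per utterance,
# with one file id per microphone channel and 'missing' marking absent
# recordings. Note that dest_dir must end with a slash, since file names are
# appended by plain string concatenation.
example_utts = [{
    'kaldi_id': '0000000001',
    'clean_sentence_tokens': ['guten', 'morgen'],
    'fileids': ['/data/wav/rec1_a.wav', 'missing'],  # zipped against 'abcdefgh'
    'speakerid': 'spk001',
    'gender': 'female',
}]
# writeKaldiDataFolder('data/train/', example_utts, filter_fileid_list='a')
# would export only the entries for microphone channel 'a'.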
def writeKaldiDataFolder(dest_dir, utts, postfix, wavextension):
    '''Exports the internal representation utts for all utterances into
    Kaldi's corpus description format.'''

    # Kaldi format, files: text, wav.scp, utt2spk, spk2gender
    # (same layout as above: each file is a sorted list of "key value" lines,
    # and all files need to be sorted by key value!)

    make_sure_path_exists(dest_dir)

    with open(dest_dir + 'wav.scp', 'w') as wavscp, \
            open(dest_dir + 'utt2spk', 'w') as utt2spk, \
            open(dest_dir + 'spk2gender', 'w') as spk2gender, \
            codecs.open(dest_dir + 'text', 'w', 'utf-8') as text:

        speaker2gender = {}

        # Sort by Kaldi id.
        utts = sorted(utts, key=lambda utt: utt['kaldi_id'])

        for utt in utts:
            kaldi_id = utt['kaldi_id']
            transcription = ' '.join(utt['clean_sentence_tokens'])
            text.write(kaldi_id + ' ' + transcription + '\n')
            wavscp.write(kaldi_id + ' ' + utt['fileid'] + postfix + wavextension + '\n')
            utt2spk.write(kaldi_id + ' ' + utt['speakerid'] + '\n')
            speaker2gender[utt['speakerid']] = utt['gender']

        # Sort by speaker.
        speaker2gender = collections.OrderedDict(
            sorted(speaker2gender.items(), key=lambda x: x[0]))
        for speaker, gender in speaker2gender.items():
            spk2gender.write(speaker + ' ' + ('f' if gender == 'female' else 'm') + '\n')
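# A quick illustration of how this variant composes each wav.scp entry from
# fileid + postfix + wavextension; the concrete values below are assumptions,
# not from the source.
example_entry = 'utt0001' + ' ' + 'data/wav/rec1' + '_16k' + '.wav'
assert example_entry == 'utt0001 data/wav/rec1_16k.wav'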
def main():
    main_page_html = requests.get(ROOT_URL)
    main_soup = BeautifulSoup(main_page_html.text, 'html.parser')
    pages_sidebar_soup = main_soup.find("div", {"id": "pages-2"})
    all_links = [
        x.find('a')['href'] for x in pages_sidebar_soup.find_all('li')
        if x.find('a').get_text() != 'About'
    ]

    print('Links scraped! Number of episodes:')
    print(len(all_links))

    make_sure_path_exists(output_dir)

    for current_url in all_links:
        current_bowl = BeautifulSoup(
            requests.get(current_url).text, 'html.parser')
        current_main_text = current_bowl.find('div', {'class': 'entrytext'})
        current_title = current_bowl.find('h2', {'class': 'title'}).get_text()
        # Replace non-breaking spaces with regular spaces.
        current_title = current_title.replace('\xa0', ' ')

        current_all_paragraphs = current_main_text.find_all('p')
        current_all_paragraphs = [
            x.get_text().strip() for x in current_all_paragraphs
        ]
        current_all_paragraphs = [
            x for x in current_all_paragraphs if len(x) > 0
        ]

        # Strip trailing writing-credit lines from the transcript.
        while ('Written by' in current_all_paragraphs[-1]
               or 'Teleplay: ' in current_all_paragraphs[-1]
               or 'Story: ' in current_all_paragraphs[-1]):
            print('Removing line:')
            print(current_all_paragraphs[-1])
            current_all_paragraphs = current_all_paragraphs[:-1]

        print('------------')
        print('Description of the first scene:')
        print(current_all_paragraphs[0])

        filename = current_title.split('-')[0].lower().strip().replace(
            ' ', '_').replace('/', '_') + '.txt'

        with open(os.path.join(output_dir, filename), 'w',
                  encoding='utf8') as f:
            f.write('>> ' + current_title + '\n')
            f.write('\n'.join(current_all_paragraphs))

        print('Document written:')
        print(current_title)

    print('Whew! Finished.')
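# A worked example of the filename derivation above; the episode title is an
# illustrative assumption, not taken from the scraped site.
example_title = 'The One Where It All Began - Season 1'
example_filename = example_title.split('-')[0].lower().strip().replace(
    ' ', '_').replace('/', '_') + '.txt'
assert example_filename == 'the_one_where_it_all_began.txt'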
def main():
    usage_str = ('Input dir (the JSON files), the JSON file containing each '
                 "user's contributions, output dir")
    if len(sys.argv) != 4:
        print(usage_str)
        return

    input_dir = sys.argv[1]
    contribs_filename = sys.argv[2]
    output_dir = sys.argv[3]

    contribs_dict = json.load(open(contribs_filename, 'r'))

    starting_day = 12
    starting_month = 12

    # Keep only files on/after the starting date (month and day are parsed
    # from the underscore-separated filename).
    all_filenames = os.listdir(input_dir)
    all_filenames = [
        x for x in all_filenames
        if int(x.split('_')[2]) >= starting_day
        or int(x.split('_')[1]) != starting_month
    ]

    all_jsons = dict()
    for x in all_filenames:
        print(x)
        with open(os.path.join(input_dir, x), 'r') as f:
            current_json = json.load(f)
        current_username = current_json['name_field']
        if current_username in contribs_dict:
            current_json['contrib_count'] = len(contribs_dict[current_username])
        else:
            current_json['contrib_count'] = 0
        all_jsons[x] = current_json

    with_username_removed = {
        x: remove_username_field(all_jsons[x]) for x in all_jsons
    }

    make_sure_path_exists(output_dir)
    for x in with_username_removed:
        anon_name = 'anon_' + hashlib.md5(x.encode('utf8')).hexdigest() + '.json'
        with open(os.path.join(output_dir, anon_name), 'w') as f:
            json.dump(with_username_removed[x], f)
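# remove_username_field() is used above but not defined in this snippet.
# A minimal sketch of what it presumably does, given its name and the
# anonymization context (an assumption, not the source implementation):
def remove_username_field(json_obj):
    # Return a copy of the record without the identifying 'name_field' key.
    return {k: v for k, v in json_obj.items() if k != 'name_field'}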
def main():
    if len(sys.argv) != 4:
        print('Input file 1 (experienced), input file 2 (novice), output dir.')
        return

    experienced_file = sys.argv[1]
    novice_file = sys.argv[2]
    output_dir = sys.argv[3]

    experienced_usernames = open(experienced_file, 'r').readlines()
    novice_usernames = open(novice_file, 'r').readlines()

    experienced_hash_to_username, experienced_str = hash_usernames(
        experienced_usernames)
    novice_hash_to_username, novice_str = hash_usernames(novice_usernames)

    # A non-empty intersection would mean hash collisions across the two
    # user groups, which would break the anonymization.
    e_hashes = set(experienced_hash_to_username.keys())
    n_hashes = set(novice_hash_to_username.keys())
    print('Intersection of the two hash sets '
          '(alternatively known as "Are we f****d, or not?"):')
    print(e_hashes.intersection(n_hashes))

    make_sure_path_exists(output_dir)
    with open(os.path.join(output_dir, 'experienced_user_and_hash.txt'), 'w') as f:
        f.write(experienced_str)
    with open(os.path.join(output_dir, 'novice_user_and_hash.txt'), 'w') as f:
        f.write(novice_str)

    all_hash_to_username = experienced_hash_to_username.copy()
    all_hash_to_username.update(novice_hash_to_username)
    with open(os.path.join(output_dir, 'token_to_name_map.json'), 'w') as f:
        json.dump(all_hash_to_username, f)
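# hash_usernames() is used above but not defined in this snippet. A minimal
# sketch of a plausible implementation (an assumption, not the source code):
# md5-hash each username and return both the hash->username map and a
# printable "username hash" listing.
import hashlib

def hash_usernames(usernames):
    hash_to_username = {}
    lines = []
    for username in usernames:
        username = username.strip()
        token = hashlib.md5(username.encode('utf8')).hexdigest()
        hash_to_username[token] = username
        lines.append(username + ' ' + token)
    return hash_to_username, '\n'.join(lines) + '\n'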
def symlink_file(file1, file2):
    try:
        os.symlink(file1, file2)
    except OSError as e:
        if e.errno == errno.EEXIST:
            print('Omitted symlink', file1, '->', file2,
                  ', because it already exists')


if not os.path.exists('run.sh'):
    print('You have to run this python script from the base dir, '
          'where run.sh is located. WARNING: aborting.')
    sys.exit('wrong working directory')

data_lex_dir = 'data/lexicon/'
data_local_dir = 'data/local/'
data_local_dict_dir = 'data/local/dict/'

print('Creating local data directories...')
print('Creating directory if necessary:', data_lex_dir)
make_sure_path_exists(data_lex_dir)
print('Creating directory if necessary:', data_local_dir)
make_sure_path_exists(data_local_dir)
print('Creating directory if necessary:', data_local_dict_dir)
make_sure_path_exists(data_local_dict_dir)

# If exp and mfcc don't exist locally, create them as links to some other
# directory on a larger disk.
if not os.path.exists('exp/') and not os.path.exists('mfcc/'):
    default_dir = '/srv/data/speech/tuda_kaldi_de/'
    myinput = ''
    while myinput != 'y' and myinput != 'n':
        myinput = input(
            'Do you want to symlink big data directories (features, models, '
            'wavs) to another path than the current directory? (y/n) ')
try:
    os.symlink(file1, file2)
except OSError as e:
    if e.errno == errno.EEXIST:
        print('Omitted symlink', file1, '->', file2,
              ', because it already exists')


if not os.path.exists('run.sh'):
    print('You have to run this python script from the base dir, '
          'where run.sh is located. WARNING: aborting.')
    sys.exit('wrong working directory')

print('Creating data dir(s)...')
make_sure_path_exists('data/lexicon/')
make_sure_path_exists('data/local/')
#make_sure_path_exists('data/wav/')
make_sure_path_exists('data/local/dict/')
#make_sure_path_exists('data/local/lang/')
#make_sure_path_exists('data/local/lm/')
#make_sure_path_exists('data/local/lm/3gram-mincount/')

# If exp and mfcc don't exist locally, create them as links to some other
# directory on a larger disk.
if not os.path.exists('exp/') and not os.path.exists('mfcc/'):
    default_dir = '/srv/data/speech/tuda_kaldi_de/'
    data_dir = input(
        'Where do you want to store mfcc vectors and models (exp)? It should '
        'point to some largish disk. default: ' + default_dir + ' : ')
    if data_dir == '':
        data_dir = default_dir
def save_dict(output_dir_dict, out_dict, filename='col_dict.json'):
    output_dir_dict = add_slash_to_dir(output_dir_dict)
    output_name_dict = output_dir_dict + filename
    make_sure_path_exists(output_dir_dict)
    with open(output_name_dict, mode='w') as f:
        json.dump(out_dict, f)
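# save_dict() leans on two small helpers defined elsewhere in the project.
# Minimal sketches under assumed semantics, inferred from their names and
# how they are called above:
import errno
import os

def add_slash_to_dir(path):
    # Ensure the directory path ends with a '/'.
    return path if path.endswith('/') else path + '/'

def make_sure_path_exists(path):
    # Create the directory tree, tolerating the case where it already exists.
    try:
        os.makedirs(path)
    except OSError as e:
        if e.errno != errno.EEXIST:
            raise

# Example: save_dict('out/dicts', {'a': 1}) writes out/dicts/col_dict.json.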
# limitations under the License.
#

import codecs

from common_utils import make_sure_path_exists

if __name__ == '__main__':
    wavscp = codecs.open('../data/local/train_trans.txt', 'r', 'utf-8')
    datadir = '../data/train/'

    make_sure_path_exists(datadir)

    with codecs.open(datadir + 'text', 'w', 'utf-8') as train_text:
        for wav in wavscp:
            try:
                train_text.write(wav)
            except Exception as err:
                print('Error in file, omitting', wav)
                print(err)
            '{} sox {} -r 16k -t wav -c 1 -b 16 -e signed - |\n'.format(
                uttid, wavdir + wav))


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description='Prepares the German data from the M-ailabs corpus '
        '(http://www.m-ailabs.bayern/en/the-mailabs-speech-dataset/) for Kaldi.')
    parser.add_argument(
        '-i', '--inputcorpus', dest='inputcorpus',
        help='Path to the M-ailabs data (download here: '
        'http://www.m-ailabs.bayern/en/the-mailabs-speech-dataset/)',
        type=str, default='data/wav/m_ailabs/de_DE/')
    parser.add_argument(
        '-o', '--outputfolder', dest='outputfolder',
        help='Export to this Kaldi folder.',
        type=str, default='data/m_ailabs_train')

    args = parser.parse_args()

    common_utils.make_sure_path_exists(args.outputfolder)
    create_kaldi_datadir(args.outputfolder, args.inputcorpus)
    #subprocess.call('utils/fix_data_dir.sh {}'.format(odir), shell=True)
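# For reference, the wav.scp fragment at the top of this snippet emits sox
# pipe entries of the following shape (uttid and path are illustrative
# assumptions), letting Kaldi resample to 16 kHz mono 16-bit signed PCM on
# the fly:
#
#   mailabs_0001 sox data/wav/m_ailabs/de_DE/some_book/wavs/sample.wav -r 16k -t wav -c 1 -b 16 -e signed - |
#
# Typical invocation, assuming this file is saved as prepare_m_ailabs.py:
#
#   python prepare_m_ailabs.py -i data/wav/m_ailabs/de_DE/ -o data/m_ailabs_train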
import errno
import os
import sys


def symlink_file(file1, file2):
    try:
        os.symlink(file1, file2)
    except OSError as e:
        if e.errno == errno.EEXIST:
            print 'Omitted symlink', file1, '->', file2, ', because it already exists'


if not os.path.exists('run.sh'):
    print 'You have to run this python script from the base dir, where run.sh is located. WARNING: aborting.'
    sys.exit('wrong working directory')

print 'Creating data dir(s)...'
make_sure_path_exists('data/lexicon/')
make_sure_path_exists('data/local/')
#make_sure_path_exists('data/wav/')
make_sure_path_exists('data/local/dict/')
#make_sure_path_exists('data/local/lang/')
#make_sure_path_exists('data/local/lm/')
#make_sure_path_exists('data/local/lm/3gram-mincount/')

# If exp and mfcc don't exist locally, create them as links to some other
# directory on a larger disk.
if not os.path.exists('exp/') and not os.path.exists('mfcc/'):
    default_dir = '/srv/data/speech/tuda_kaldi_de/'
    data_dir = raw_input('Where do you want to store mfcc vectors and models (exp)? It should point to some largish disk. default: ' + default_dir + ' : ')
    if data_dir == '':
        data_dir = default_dir
    if data_dir.endswith('/'):