def main():
    """Compute feature statistics over the training bin files.

    Parses CLI arguments, configures logging, defines the fixed
    feature-extraction parameters, then delegates to ``calc_stats`` with the
    sorted list of ``*/VAD/*.bin`` files found under ``--bindir``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--bindir", required=True, type=str,
                        help="name of the dir of bin files")
    parser.add_argument("--stats", required=True, type=str,
                        help="filename of hdf5 format")
    parser.add_argument("--spklist", required=True, type=str,
                        help="list of speakers")
    parser.add_argument("--verbose", default=1, type=int,
                        help="log message level")
    args = parser.parse_args()

    # set log level (single shared format string instead of three copies)
    log_fmt = '%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s'
    log_datefmt = '%m/%d/%Y %I:%M:%S'
    if args.verbose == 1:
        logging.basicConfig(level=logging.INFO,
                            format=log_fmt, datefmt=log_datefmt)
    elif args.verbose > 1:
        logging.basicConfig(level=logging.DEBUG,
                            format=log_fmt, datefmt=log_datefmt)
    else:
        logging.basicConfig(level=logging.WARN,
                            format=log_fmt, datefmt=log_datefmt)
        # logging.warn is a deprecated alias; use logging.warning
        logging.warning("logging is disabled.")

    # show arguments (lazy %-style args instead of eager string formatting)
    for key, value in vars(args).items():
        logging.info("%s = %s", key, str(value))

    # define feat param here
    feat_param = {
        'fs': 22050,
        'shift_ms': 5,
        'length_ms': 25,
        'fftl': 1024,
        'n_mels': 80,
        'mcep_dim': 34,
        'mcep_alpha': 0.455,
        'feat_dim': 1064,
    }

    # read speakers
    spk_list = read_txt(args.spklist)

    # read file list, for training data only
    # file_list = sorted(find_files(args.bindir, "[12]*.bin"))
    file_list = sorted(find_files(args.bindir, "*/VAD/*.bin"))
    logging.info("number of utterances = %d", len(file_list))

    # calculate statistics
    if not os.path.exists(os.path.dirname(args.stats)):
        os.makedirs(os.path.dirname(args.stats))
    calc_stats(file_list, feat_param, spk_list, args)
def main():
    """Convert utterances of a source speaker into a target speaker's voice.

    Loads the trained model and normalization statistics described by the
    ``architecture*.json`` in ``--logdir``, converts every test/validation
    utterance of ``--src`` toward ``--trg``, writes the latent codes and
    converted features as raw binary files, and optionally launches MCD
    evaluation and waveform synthesis as subcommands.
    """
    parser = argparse.ArgumentParser(description="Conversion.")
    parser.add_argument("--logdir", required=True, type=str,
                        help="path of log directory")
    parser.add_argument("--checkpoint", default=None, type=str,
                        help="path of checkpoint")
    parser.add_argument("--src", default=None, required=True, type=str,
                        help="source speaker")
    parser.add_argument("--trg", default=None, required=True, type=str,
                        help="target speaker")
    parser.add_argument("--type", default='test', type=str,
                        help="test or valid (default is test)")
    parser.add_argument("--input_feat", required=True, type=str,
                        help="input feature type")
    parser.add_argument("--output_feat", required=True, type=str,
                        help="output feature type")
    parser.add_argument("--mcd", action='store_true',
                        help="calculate mcd or not")
    parser.add_argument("--syn", action='store_true',
                        help="synthesize voice or not")
    args = parser.parse_args()

    # make exp directory
    output_dir = get_default_logdir_output(args)
    tf.gfile.MakeDirs(output_dir)

    # set log level: log to a file in the experiment dir and echo to console
    fmt = '%(asctime)s %(message)s'
    datefmt = '%m/%d/%Y %I:%M:%S'
    logFormatter = logging.Formatter(fmt, datefmt=datefmt)
    logging.basicConfig(
        level=logging.INFO,
        filename=os.path.join(output_dir, 'exp.log'),
        format=fmt,
        datefmt=datefmt,
    )
    consoleHandler = logging.StreamHandler()
    consoleHandler.setFormatter(logFormatter)
    logging.getLogger().addHandler(consoleHandler)
    logging.info('====================')
    logging.info('Conversion start')
    logging.info(args)

    # Load architecture
    arch = tf.gfile.Glob(os.path.join(
        args.logdir, 'architecture*.json'))[0]  # should only be 1 file
    with open(arch) as fp:
        arch = json.load(fp)

    # Load the model module
    module = import_module(arch['model_module'], package=None)
    MODEL = getattr(module, arch['model'])

    input_feat = args.input_feat
    input_feat_dim = arch['feat_param']['dim'][input_feat]
    output_feat = args.output_feat

    # read speakers
    spk_list = read_txt(arch['spklist'])

    # Load statistics, normalize and NCHW
    normalizers = {}
    for k in arch['normalizer']:
        normalizers[k] = {}
        for norm_type in arch['normalizer'][k]['type']:
            if norm_type == 'minmax':
                normalizer = MinMaxScaler(
                    xmax=read_hdf5(arch['stats'], '/max/' + k),
                    xmin=read_hdf5(arch['stats'], '/min/' + k),
                )
            elif norm_type == 'meanvar':
                normalizer = StandardScaler(
                    mu=read_hdf5(arch['stats'], '/mean/' + k),
                    std=read_hdf5(arch['stats'], '/scale/' + k),
                )
            else:
                # BUGFIX: an unrecognized type previously reused the
                # normalizer from the prior iteration (or raised NameError
                # on the very first pass); fail loudly instead.
                raise ValueError('unsupported normalizer type: {}'.format(norm_type))
            normalizers[k][norm_type] = normalizer

    # Define placeholders
    x_pl = tf.placeholder(tf.float32, [None, input_feat_dim])
    yh_pl = tf.placeholder(dtype=tf.int64, shape=[1, ])
    # broadcast the single target-speaker id over all frames of the input
    yh = yh_pl * tf.ones(shape=[tf.shape(x_pl)[0], ], dtype=tf.int64)
    yh = tf.expand_dims(yh, 0)

    # Define model
    model = MODEL(arch, normalizers)
    z, _ = model.encode(x_pl, input_feat)
    xh = model.decode(z, yh, output_feat)

    # make directories for output
    tf.gfile.MakeDirs(os.path.join(output_dir, 'latent'))
    tf.gfile.MakeDirs(
        os.path.join(output_dir, 'converted-{}'.format(output_feat)))

    # Define session
    with tf.Session() as sess:
        # define saver
        saver = tf.train.Saver()

        # load checkpoint
        if args.checkpoint is None:
            load(
                saver,
                sess,
                args.logdir,
            )
        else:
            _, ckpt = os.path.split(args.checkpoint)
            load(saver, sess, args.logdir, ckpt=ckpt)

        # get feature list, either validation set or test set
        if args.type == 'test':
            files = tf.gfile.Glob(
                arch['conversion']['test_file_pattern'].format(args.src))
        elif args.type == 'valid':
            files = []
            for p in arch['training']['valid_file_pattern']:
                files.extend(tf.gfile.Glob(p.replace('*', args.src)))
        files = sorted(files)

        # conversion
        for f in files:
            basename = os.path.split(f)[-1]
            path_to_latent = os.path.join(
                output_dir, 'latent',
                '{}-{}-{}'.format(args.src, args.trg, basename))
            path_to_cvt = os.path.join(
                output_dir, 'converted-{}'.format(output_feat),
                '{}-{}-{}'.format(args.src, args.trg, basename))
            logging.info(basename)

            # load source features
            src_data = Whole_feature_reader(f, arch['feat_param'])

            latent, cvt = sess.run(
                [z, xh],
                feed_dict={
                    yh_pl: np.asarray([spk_list.index(args.trg)]),
                    x_pl: src_data[input_feat]
                })

            # save bin (tobytes: ndarray.tostring was removed in NumPy 1.23)
            with open(path_to_latent, 'wb') as fp:
                fp.write(latent.tobytes())
            with open(path_to_cvt, 'wb') as fp:
                fp.write(cvt.tobytes())

    # optionally calculate MCD
    # NOTE(review): command built by string concatenation; args containing
    # spaces would break it — acceptable for trusted CLI args, but flagged.
    if args.mcd:
        cmd = "python ./mcd_calculate.py" + \
              " --type " + args.type + \
              " --logdir " + output_dir + \
              " --input_feat " + input_feat + \
              " --output_feat " + output_feat
        print(cmd)
        os.system(cmd)

    # optionally synthesize waveform
    if args.syn:
        cmd = "python ./synthesize.py" + \
              " --type " + args.type + \
              " --logdir " + output_dir + \
              " --input_feat " + input_feat + \
              " --output_feat " + output_feat
        print(cmd)
        os.system(cmd)
def main():
    """Compute per-speaker sp/mcc/f0 statistics over binary feature files.

    When more than 20000 bin files are found under ``--bindir``, the list is
    subsampled to about 10000 utterances per VAD condition (the third-to-last
    path component), spread evenly across speakers (second-to-last component).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--bindir", required=True, type=str,
                        help="name of the dir of bin files")
    parser.add_argument("--stats", required=True, type=str,
                        help="filename of hdf5 format")
    parser.add_argument("--spklist", required=True, type=str,
                        help="list of speakers")
    parser.add_argument("--verbose", default=1, type=int,
                        help="log message level")
    args = parser.parse_args()

    # set log level (single shared format string instead of three copies)
    log_fmt = '%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s'
    log_datefmt = '%m/%d/%Y %I:%M:%S'
    if args.verbose == 1:
        logging.basicConfig(level=logging.INFO,
                            format=log_fmt, datefmt=log_datefmt)
    elif args.verbose > 1:
        logging.basicConfig(level=logging.DEBUG,
                            format=log_fmt, datefmt=log_datefmt)
    else:
        logging.basicConfig(level=logging.WARN,
                            format=log_fmt, datefmt=log_datefmt)
        # logging.warn is a deprecated alias; use logging.warning
        logging.warning("logging is disabled.")

    # show arguments
    for key, value in vars(args).items():
        logging.info("%s = %s", key, str(value))

    # define feat param here
    feat_param = {
        'fs': 22050,
        'shift_ms': 5,
        'length_ms': 25,
        'fftl': 1024,
        'n_mels': 80,
        'mcep_dim': 34,
        'mcep_alpha': 0.455,
        'feat_dim': 1064,
    }

    # read speakers
    spk_list = read_txt(args.spklist)

    # read file list, for training data only
    # file_list = sorted(find_files(args.bindir, "[12]*.bin"))
    file_list = find_files(args.bindir, "*.bin")
    cuted_file_list = list()
    total_count = 0
    if len(file_list) > 20000:
        # cap at roughly 10000 utterances per VAD condition, evenly per speaker
        max_num = math.ceil(10000 / len(spk_list))
        # contain VAD and noVAD
        count_dict = dict()
        for fname in file_list:
            is_vad = fname.split("/")[-3].strip()
            spk = fname.split("/")[-2].strip()
            if is_vad not in count_dict:
                count_dict[is_vad] = dict()
            # BUGFIX: was '> max_num', which admitted max_num + 1 files per
            # speaker; '>=' keeps exactly max_num
            if spk in count_dict[is_vad] and count_dict[is_vad][spk] >= max_num:
                continue
            count_dict[is_vad][spk] = count_dict[is_vad].get(spk, 0) + 1
            cuted_file_list.append(fname)
            total_count += 1
    else:
        cuted_file_list = file_list
    file_list = sorted(cuted_file_list)

    logging.info("number of utterances = %d", len(file_list))

    # calculate statistics
    if not os.path.exists(os.path.dirname(args.stats)):
        os.makedirs(os.path.dirname(args.stats))
    # calc_stats(file_list, feat_param, spk_list, args)
    calc_sp_stats(file_list, feat_param, spk_list, args)
    calc_mcc_stats(file_list, feat_param, spk_list, args)
    calc_f0_stats(file_list, feat_param, spk_list, args)
def main():
    """Extract WORLD acoustic features from waveforms in parallel processes.

    Reads the per-speaker F0 search range from ``<confdir>/<spk>.f0``, splits
    the wav file list into ``--n_jobs`` chunks, and runs
    ``world_feature_extract`` on each chunk in its own process.
    """
    parser = argparse.ArgumentParser(
        description="making feature file argsurations.")
    parser.add_argument(
        "--waveforms", required=True, type=str,
        help="directory or list of filename of input wavfile")
    parser.add_argument(
        "--bindir", required=True, type=str,
        help="directory to save bin")
    parser.add_argument(
        "--confdir", required=True, type=str,
        help="configuration directory")
    parser.add_argument(
        "--overwrite", default=False, type=strtobool,
        help="if set true, overwrite the exist feature files")
    parser.add_argument(
        "--n_jobs", default=12, type=int,
        help="number of parallel jobs")
    parser.add_argument(
        "--verbose", default=1, type=int,
        help="log message level")
    args = parser.parse_args()

    # set log level (single shared format string instead of three copies)
    log_fmt = '%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s'
    log_datefmt = '%m/%d/%Y %I:%M:%S'
    if args.verbose == 1:
        logging.basicConfig(level=logging.INFO,
                            format=log_fmt, datefmt=log_datefmt)
    elif args.verbose > 1:
        logging.basicConfig(level=logging.DEBUG,
                            format=log_fmt, datefmt=log_datefmt)
    else:
        logging.basicConfig(level=logging.WARN,
                            format=log_fmt, datefmt=log_datefmt)
        # logging.warn is a deprecated alias; use logging.warning
        logging.warning("logging is disabled.")

    # show arguments
    for key, value in vars(args).items():
        logging.info("%s = %s", key, str(value))

    # read list: --waveforms is either a directory of wavs or a list file
    if os.path.isdir(args.waveforms):
        file_list = sorted(find_files(args.waveforms, "*.wav"))
    else:
        file_list = read_txt(args.waveforms)
    logging.info("number of utterances = %d", len(file_list))

    # read speaker list
    spk_list = read_txt(os.path.join(args.confdir, 'spk.list'))

    # read f0 max/min of the speaker, and define feature extractor
    feat_param_list = []
    for s in spk_list:
        # each <spk>.f0 holds "f0min f0max" as two space-separated integers
        with open(os.path.join(args.confdir, s + '.f0'), 'r') as f:
            f0min, f0max = [int(f0) for f0 in f.read().split(' ')]
        feat_param_list.append({
            'fs': 22050,
            'shift_ms': 5,
            'length_ms': 25,
            'fftl': 1024,
            'n_mels': 80,
            'mcep_dim': 34,
            'mcep_alpha': 0.455,
            'f0min': f0min,
            'f0max': f0max,
            'highpass_cutoff': 70,
        })

    # create file folders
    filepath_create(file_list, args.bindir)

    # divide list into n_jobs roughly equal chunks
    file_lists = np.array_split(file_list, args.n_jobs)
    file_lists = [f_list.tolist() for f_list in file_lists]

    # multi processing
    processes = []
    target_fn = world_feature_extract
    for f in file_lists:
        p = mp.Process(target=target_fn,
                       args=(f, spk_list, feat_param_list, args,))
        p.start()
        processes.append(p)

    # wait for all processes
    for p in processes:
        p.join()