def align_bowtie2(task):
    for ref_order in range(1, task.ref_num + 1):
        logger.info('Running Bowtie2 alignment for ref #%d.' % ref_order)
        aligner_cwd = task.path.joinpath(task.id, 'alignment', 'bowtie2')
        ref_index_path = str(
            aligner_cwd.joinpath('%s_ref_%d' % (task.id, ref_order)))
        if task.dehost is not None:
            filtered_R1 = str(
                task.path.joinpath(task.id, 'reads',
                                   task.id + '_host_removed_R1.fastq.gz'))
            filtered_R2 = str(
                task.path.joinpath(task.id, 'reads',
                                   task.id + '_host_removed_R2.fastq.gz'))
        else:
            filtered_R1 = str(
                task.path.joinpath(task.id, 'reads',
                                   task.id + '_R1.fastq.gz'))
            filtered_R2 = str(
                task.path.joinpath(task.id, 'reads',
                                   task.id + '_R2.fastq.gz'))
        reads_cmd = ['-1', filtered_R1, '-2', filtered_R2]
        thread_cmd = ['-p', str(task.threads)]
        output_cmd = ['-S', '%s_ref_%d.sam' % (task.id, ref_order)]
        # bowtie2 substitutes 1/2 for the literal % in the --un-conc-gz template
        other_cmd = [
            '--very-sensitive-local', '--un-conc-gz',
            '%s_ref_%d_unmapped_R%%.fastq.gz' % (task.id, ref_order)
        ]
        aln_cmd = ['bowtie2', '-x', ref_index_path
                   ] + reads_cmd + output_cmd + thread_cmd + other_cmd
        logger.info('CMD: ' + ' '.join(aln_cmd))
        utils.write_log_file(task.path.joinpath(task.id),
                             'CMD: ' + ' '.join(aln_cmd))
        bt2_run = subprocess.run(aln_cmd, cwd=aligner_cwd,
                                 capture_output=True)
        print(bt2_run.stdout.decode(encoding='utf-8'))
        print(bt2_run.stderr.decode(encoding='utf-8'))
def run_fastp(task):
    logger.info('Running fastp to filter reads.')
    original_R1 = task.path.joinpath(task.id, 'reads', 'original',
                                     task.id + '_R1.fastq.gz')
    original_R2 = task.path.joinpath(task.id, 'reads', 'original',
                                     task.id + '_R2.fastq.gz')
    filtered_R1 = task.path.joinpath(task.id, 'reads',
                                     task.id + '_R1.fastq.gz')
    filtered_R2 = task.path.joinpath(task.id, 'reads',
                                     task.id + '_R2.fastq.gz')
    report_json = task.path.joinpath(task.id, 'reads', 'fastp.json')
    report_html = task.path.joinpath(task.id, 'reads', 'fastp.html')
    fastp_cmd = [
        'fastp', '-i', str(original_R1), '-I', str(original_R2),
        '-o', str(filtered_R1), '-O', str(filtered_R2)
    ]
    reports_cmd = ['-j', str(report_json), '-h', str(report_html)]
    # trim the same number of bases from the head (-f/-F) and tail (-t/-T)
    # of both mates, and run with the task's thread count (-w)
    parameter_cmd = [
        '-f', str(task.global_trimming), '-t', str(task.global_trimming),
        '-F', str(task.global_trimming), '-T', str(task.global_trimming),
        '-w', str(task.threads)
    ]
    logger.info('CMD: ' + ' '.join(fastp_cmd + reports_cmd + parameter_cmd))
    utils.write_log_file(
        task.path.joinpath(task.id),
        'CMD: ' + ' '.join(fastp_cmd + reports_cmd + parameter_cmd))
    fastp_run = subprocess.run(fastp_cmd + reports_cmd + parameter_cmd,
                               capture_output=True)
    print(fastp_run.stderr.decode(encoding='utf-8'))
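# The pipeline functions in this file rely on a small `utils` module for
# logging and report files whose implementation is not shown here. The
# following is a minimal sketch, assuming the call signatures used above and
# below (write_log_file(task_dir, message), build_text_file(path, text),
# build_json_file(path, data)) — an illustration, not the actual module.
import json
import time
from pathlib import Path


def write_log_file(task_dir, message):
    # Append a timestamped line to the task's log file (hypothetical name).
    log_path = Path(task_dir).joinpath('task.log')
    with open(log_path, 'a') as f:
        f.write('%s %s\n' % (time.strftime('%Y-%m-%d %H:%M:%S'), message))


def build_text_file(path, text):
    # Write plain text, creating or overwriting the file.
    with open(path, 'w') as f:
        f.write(text)


def build_json_file(path, data):
    # Serialize a dict as pretty-printed JSON.
    with open(path, 'w') as f:
        json.dump(data, f, indent=4)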
def align_flagstat(task, aligners):
    stats_dict = {'mapped_rate': {}}
    for aligner in aligners:
        logger.info('Analyzing BAM files from %s.' % aligner)
        aligner_cwd = task.path.joinpath(task.id, 'alignment', aligner)
        stats_dict['mapped_rate'][aligner] = {}
        for ref_order in range(1, task.ref_num + 1):
            flagstat_cmd = [
                'samtools', 'flagstat', '-@', str(task.threads),
                '%s_ref_%d.sorted.bam' % (task.id, ref_order)
            ]
            logger.info('CMD: ' + ' '.join(flagstat_cmd))
            utils.write_log_file(task.path.joinpath(task.id),
                                 'CMD: ' + ' '.join(flagstat_cmd))
            flagstat_run = subprocess.run(flagstat_cmd,
                                          cwd=aligner_cwd,
                                          capture_output=True)
            stats_text = flagstat_run.stdout.decode(encoding='utf-8')
            stats_list = stats_text.split('\n')
            utils.build_text_file(
                aligner_cwd.joinpath('flagstat_ref_%d.txt' % ref_order),
                stats_text)
            # the fifth line of flagstat output reads e.g.
            # "42 + 0 mapped (97.67% : N/A)"; take "97.67%" without the "("
            mapped_rate = stats_list[4].split(' ')[4][1:]
            stats_dict['mapped_rate'][aligner][ref_order] = mapped_rate
    utils.build_json_file(
        task.path.joinpath(task.id, 'alignment', 'flagstat.json'),
        stats_dict)
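# Positional parsing of flagstat output is brittle across samtools versions.
# A regex-based alternative (a sketch, not part of the original pipeline)
# recovers the mapped percentage regardless of the surrounding counts:
import re


def parse_mapped_rate(flagstat_text):
    # Matches lines such as "42 + 0 mapped (97.67% : N/A)"; returns "97.67%",
    # or None when samtools reports "N/A" (zero reads).
    match = re.search(r'mapped \((\d+\.?\d*%)', flagstat_text)
    return match.group(1) if match else None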
def bam_sort_n_index(task, aligner):
    for ref_order in range(1, task.ref_num + 1):
        logger.info('Sorting & indexing BAM file for aln #%d.' % ref_order)
        aligner_cwd = task.path.joinpath(task.id, 'alignment', aligner)
        # sorting
        sorting_cmd = [
            'samtools', 'sort', '-@', str(task.threads),
            '%s_ref_%d.sam' % (task.id, ref_order), '-o',
            '%s_ref_%d.sorted.bam' % (task.id, ref_order)
        ]
        logger.info('CMD: ' + ' '.join(sorting_cmd))
        utils.write_log_file(task.path.joinpath(task.id),
                             'CMD: ' + ' '.join(sorting_cmd))
        sorting_run = subprocess.run(sorting_cmd,
                                     cwd=aligner_cwd,
                                     capture_output=True)
        print(sorting_run.stdout.decode(encoding='utf-8'))
        print(sorting_run.stderr.decode(encoding='utf-8'))
        # indexing
        indexing_cmd = [
            'samtools', 'index', '-@', str(task.threads),
            '%s_ref_%d.sorted.bam' % (task.id, ref_order)
        ]
        logger.info('CMD: ' + ' '.join(indexing_cmd))
        utils.write_log_file(task.path.joinpath(task.id),
                             'CMD: ' + ' '.join(indexing_cmd))
        index_run = subprocess.run(indexing_cmd,
                                   cwd=aligner_cwd,
                                   capture_output=True)
        print(index_run.stdout.decode(encoding='utf-8'))
        print(index_run.stderr.decode(encoding='utf-8'))
        # remove the SAM file to release disk space
        os.remove(aligner_cwd.joinpath('%s_ref_%d.sam' %
                                       (task.id, ref_order)))
def ref_index(task, aligner):
    for ref_order in range(1, task.ref_num + 1):
        logger.info('Building %s ref index for ref #%d.' %
                    (aligner, ref_order))
        aligner_cwd = task.path.joinpath(task.id, 'alignment', aligner)
        ref_fasta_path = task.path.joinpath(
            task.id, 'reference', '%s_ref_%d.fasta' % (task.id, ref_order))
        Path.mkdir(aligner_cwd, parents=True, exist_ok=True)
        shutil.copy2(ref_fasta_path, aligner_cwd)
        if aligner == 'bowtie2':
            index_cmd = [
                'bowtie2-build', '--threads', str(task.threads),
                '%s_ref_%d.fasta' % (task.id, ref_order),
                '%s_ref_%d' % (task.id, ref_order)
            ]
        elif aligner == 'bwa':
            index_cmd = [
                'bwa', 'index', '-p', '%s_ref_%d' % (task.id, ref_order),
                '%s_ref_%d.fasta' % (task.id, ref_order)
            ]
        else:
            # guard against an unbound index_cmd for unsupported aligners
            raise ValueError('Unsupported aligner: %s' % aligner)
        logger.info('CMD: ' + ' '.join(index_cmd))
        utils.write_log_file(task.path.joinpath(task.id),
                             'CMD: ' + ' '.join(index_cmd))
        ref_index_run = subprocess.run(index_cmd,
                                       cwd=aligner_cwd,
                                       capture_output=True)
        print(ref_index_run.stdout.decode(encoding='utf-8'))
        print(ref_index_run.stderr.decode(encoding='utf-8'))
def get_total_graphs(self):
    time_1 = datetime.now()
    for graph_idx in tqdm(self.graph_id_list):
        data = torch.load(
            os.path.join(self.data_processed_path,
                         '{}_{}.pt'.format(self.name, graph_idx)))
        self.total_graph[graph_idx] = data
    write_log_file(
        self.log_path,
        "Loaded and appended {} graphs, time = {}".format(
            len(self.graph_id_list), datetime.now() - time_1))
def align_bwa(task):
    for ref_order in range(1, task.ref_num + 1):
        logger.info('Running BWA alignment for ref #%d.' % ref_order)
        aligner_cwd = task.path.joinpath(task.id, 'alignment', 'bwa')
        ref_index_path = str(
            aligner_cwd.joinpath('%s_ref_%d' % (task.id, ref_order)))
        if task.dehost is not None:
            filtered_R1 = str(
                task.path.joinpath(task.id, 'reads',
                                   task.id + '_host_removed_R1.fastq.gz'))
            filtered_R2 = str(
                task.path.joinpath(task.id, 'reads',
                                   task.id + '_host_removed_R2.fastq.gz'))
        else:
            filtered_R1 = str(
                task.path.joinpath(task.id, 'reads',
                                   task.id + '_R1.fastq.gz'))
            filtered_R2 = str(
                task.path.joinpath(task.id, 'reads',
                                   task.id + '_R2.fastq.gz'))
        reads_cmd = [filtered_R1, filtered_R2]
        thread_cmd = ['-t', str(task.threads)]
        output_cmd = ['-o', '%s_ref_%d.sam' % (task.id, ref_order)]
        aln_cmd = ['bwa', 'mem'] + thread_cmd + [ref_index_path
                                                 ] + reads_cmd + output_cmd
        logger.info('CMD: ' + ' '.join(aln_cmd))
        utils.write_log_file(task.path.joinpath(task.id),
                             'CMD: ' + ' '.join(aln_cmd))
        bwa_run = subprocess.run(aln_cmd, cwd=aligner_cwd,
                                 capture_output=True)
        print(bwa_run.stdout.decode(encoding='utf-8'))
        print(bwa_run.stderr.decode(encoding='utf-8'))
def align_coverage_stat(task, aligners):
    cov_dict = {}
    for aligner in aligners:
        cov_dict[aligner] = {}
        logger.info('Analyzing coverage stats from %s BAM files.' % aligner)
        aligner_cwd = task.path.joinpath(task.id, 'alignment', aligner)
        for ref_order in range(1, task.ref_num + 1):
            cov_dict[aligner][ref_order] = {}
            coverage_cmd = [
                'samtools', 'coverage',
                '%s_ref_%d.sorted.bam' % (task.id, ref_order)
            ]
            logger.info('CMD: ' + ' '.join(coverage_cmd))
            utils.write_log_file(task.path.joinpath(task.id),
                                 'CMD: ' + ' '.join(coverage_cmd))
            coverage_run = subprocess.run(coverage_cmd,
                                          cwd=aligner_cwd,
                                          capture_output=True)
            stats_text = coverage_run.stdout.decode(encoding='utf-8')
            # samtools coverage prints a tab-separated header row followed
            # by one row of values per reference sequence
            titles = stats_text.split('\n')[0].split('\t')
            stats = stats_text.split('\n')[1].split('\t')
            for i in range(len(titles)):
                cov_dict[aligner][ref_order][titles[i]] = stats[i]
    utils.build_json_file(
        task.path.joinpath(task.id, 'alignment', 'coverage_stat.json'),
        cov_dict)
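# These alignment helpers are presumably orchestrated by the
# reads_alignment.run() entry point invoked from main() below, which is not
# part of this excerpt. A minimal sketch of such a driver, assuming the Task
# fields used above (task.alns lists the requested aligners) — illustrative
# only, not the actual module:
def run(task):
    for aligner in task.alns:
        ref_index(task, aligner)
        if aligner == 'bowtie2':
            align_bowtie2(task)
        elif aligner == 'bwa':
            align_bwa(task)
        bam_sort_n_index(task, aligner)
    align_flagstat(task, task.alns)
    align_coverage_stat(task, task.alns)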
def variant_calling_lofreq(task):
    logger.info('Starting variant calling by LoFreq.')
    thread_cmd = ['call-parallel', '--pp-threads', str(task.threads)]
    other_cmd = [
        '--call-indels', '-N', '-B', '-q', '20', '-Q', '20', '-m', '20'
    ]
    for aligner in task.alns:
        logger.info('Running VC for %s output.' % aligner)
        aln_data_cwd = task.path.joinpath(task.id, 'alignment', aligner)
        for ref_order in range(1, task.ref_num + 1):
            aln_input_name = '%s_ref_%d.sorted.bam' % (task.id, ref_order)
            aln_indelqual_name = '%s_ref_%d.indelqual.sorted.bam' % (
                task.id, ref_order)
            ref_name = '%s_ref_%d.fasta' % (task.id, ref_order)
            # index the reference
            faidx_cmd = ['lofreq', 'faidx', ref_name]
            logger.info('CMD: ' + ' '.join(faidx_cmd))
            utils.write_log_file(task.path.joinpath(task.id),
                                 'CMD: ' + ' '.join(faidx_cmd))
            faidx_run = subprocess.run(faidx_cmd,
                                       cwd=aln_data_cwd,
                                       capture_output=True)
            print(faidx_run.stdout.decode(encoding='utf-8'))
            print(faidx_run.stderr.decode(encoding='utf-8'))
            # insert indel qualities (needed before calling indels)
            indelqual_cmd = [
                'lofreq', 'indelqual', '--dindel', '--ref', ref_name,
                '--out', aln_indelqual_name, aln_input_name
            ]
            indelqual_run = subprocess.run(indelqual_cmd,
                                           cwd=aln_data_cwd,
                                           capture_output=True)
            print(indelqual_run.stdout.decode(encoding='utf-8'))
            print(indelqual_run.stderr.decode(encoding='utf-8'))
            # index the indelqual-ed BAM
            indelqual_index_cmd = ['samtools', 'index', aln_indelqual_name]
            indelqual_index_run = subprocess.run(indelqual_index_cmd,
                                                 cwd=aln_data_cwd,
                                                 capture_output=True)
            print(indelqual_index_run.stdout.decode(encoding='utf-8'))
            print(indelqual_index_run.stderr.decode(encoding='utf-8'))
            # variant calling
            ref_cmd = ['-f', ref_name]
            output_cmd = [
                '-o',
                '%s_%s_ref_%d_lofreq.vcf' % (task.id, aligner, ref_order)
            ]
            vc_cmd = ['lofreq'] + thread_cmd + ref_cmd + output_cmd + \
                other_cmd + [aln_indelqual_name]
            logger.info('CMD: ' + ' '.join(vc_cmd))
            utils.write_log_file(task.path.joinpath(task.id),
                                 'CMD: ' + ' '.join(vc_cmd))
            vc_run = subprocess.run(vc_cmd,
                                    cwd=aln_data_cwd,
                                    capture_output=True)
            print(vc_run.stdout.decode(encoding='utf-8'))
            print(vc_run.stderr.decode(encoding='utf-8'))
def test(self, iter_no):
    write_log_file(self.log_path, "Start testing ...")
    test_query_ids = self.text_data.split_ids['test']
    success = {1: 0, 5: 0, 10: 0}
    total_test_scores = []
    test_start = datetime.now()
    for test_chunk in chunk(test_query_ids, 100):
        one_chunk_scores = []
        for i, query_id in enumerate(test_chunk):
            rank_ids, one_row_scores = self.retrieve_rank(
                query_id, test_chunk, self.text_data, self.code_data)
            one_chunk_scores.append(one_row_scores)
            for k in success.keys():
                if query_id in rank_ids[:k]:
                    success[k] += 1
        total_test_scores.append(one_chunk_scores)
    write_log_file(
        self.log_path,
        "\n&Testing Iteration {}: for {} queries finished. Time elapsed = {}."
        .format(iter_no, len(test_query_ids), datetime.now() - test_start))
    all_mrr = []
    for i in range(len(total_test_scores)):
        one_chunk_square_score = np.vstack(total_test_scores[i])
        assert one_chunk_square_score.shape[0] == \
            one_chunk_square_score.shape[1], "Every chunk must be square"
        mrr_array = self.calculate_square_mrr(one_chunk_square_score)
        all_mrr.extend(mrr_array)
    mrr = np.array(all_mrr).mean()
    self.test_iter.append(iter_no)
    self.test_mrr.append(mrr)
    write_log_file(self.log_path,
                   "&Testing Iteration {}: MRR = &{}&".format(iter_no, mrr))
    for k, v in success.items():
        value = v * 1.0 / len(test_query_ids)
        write_log_file(
            self.log_path,
            "&Testing Iteration {}: S@{}@ = &{}&".format(iter_no, k, value))
        if k == 1:
            self.test_s1.append(value)
        elif k == 5:
            self.test_s5.append(value)
        elif k == 10:
            self.test_s10.append(value)
        else:
            print('cannot find !')
    write_log_file(
        self.log_path,
        "S@1, S@5, S@10\n{}, {}, {}".format(self.test_s1[-1],
                                            self.test_s5[-1],
                                            self.test_s10[-1]))
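# `chunk` and `calculate_square_mrr` are used above but defined elsewhere.
# Plausible standalone sketches, assuming each chunk's score matrix holds
# query-code similarities with the true pair on the diagonal (the assert
# above guarantees the matrix is square) — illustrative, not the original:
import numpy as np


def chunk(id_list, size):
    # Yield successive fixed-size slices of id_list.
    for start in range(0, len(id_list), size):
        yield id_list[start:start + size]


def calculate_square_mrr(square_scores):
    # Reciprocal rank of the diagonal score within each row, descending.
    mrrs = []
    for i in range(square_scores.shape[0]):
        row = square_scores[i]
        rank = 1 + int(np.sum(row > row[i]))  # strictly higher scores outrank
        mrrs.append(1.0 / rank)
    return mrrs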
def __init__(self, args):
    self.args = args
    self.dataset_dir = args.data_dir
    if self.args.only_test:
        self.sig = os.path.join(
            args.log_dir,
            "OnlyText_" + datetime.now().strftime("%Y-%m-%d@%H:%M:%S"))
    else:
        self.sig = os.path.join(
            args.log_dir, datetime.now().strftime("%Y-%m-%d@%H:%M:%S"))
    os.mkdir(self.sig)
    # args_file_name and arguments are module-level globals
    self.log_path = os.path.join(self.sig,
                                 'log_{}.txt'.format(args_file_name))
    self.best_model_path = os.path.join(self.sig, 'best_model.pt')
    table_draw = arguments_to_tables(args=arguments)
    write_log_file(self.log_path, str(table_draw))
    self.train_batch_size = args.train_batch_size
    self.valid_batch_size = args.valid_batch_size
    self.max_iteration = args.max_iter
    self.margin = args.margin
    self.device = torch.device(
        'cuda' if torch.cuda.is_available() else 'cpu')
    write_log_file(self.log_path, "\n****CPU or GPU: " + str(self.device))
    max_number_edge_types = 3
    if self.args.conv.lower() in ['rgcn', 'cg', 'nnconv']:
        self.model = GraphMatchNetwork(
            node_init_dims=300,
            arguments=args,
            device=self.device,
            max_number_of_edges=max_number_edge_types).to(self.device)
    else:
        raise NotImplementedError
    write_log_file(self.log_path, str(self.model))
    self.optimizer = torch.optim.Adam(self.model.parameters(), lr=args.lr)
    write_log_file(self.log_path, "Init Reading Code Graphs ... ")
    self.code_data = ProcessedDataset(name='code',
                                      root=self.dataset_dir,
                                      log_path=self.log_path)
    write_log_file(self.log_path, "Init Reading Text Graphs ... ")
    self.text_data = ProcessedDataset(name='text',
                                      root=self.dataset_dir,
                                      log_path=self.log_path)
    # lists for plotting and record keeping (initialized empty)
    self.train_iter, self.train_smooth_loss, self.valid_iter, \
        self.valid_loss, self.test_iter, self.test_mrr, \
        self.test_s1, self.test_s5, self.test_s10 = ([] for _ in range(9))
def __init__(self, name, root, log_path):
    self.name = name
    self.data_processed_path = os.path.join(
        root, '{}_processed'.format(self.name))
    self.graph_id_file = os.path.join(root,
                                      '{}_graph_ids.pt'.format(self.name))
    self.graph_id_list = torch.load(self.graph_id_file)
    self.log_path = log_path
    self._check_whether_all_graph_ids_files_exist()
    self.total_graph = {}
    # self.get_total_graphs()
    # Split into train, validation, and test sets
    if os.path.exists(os.path.join(root, 'split.json')):
        with open(os.path.join(root, 'split.json'), 'rb') as f:
            self.split_ids = json.loads(f.read())
    else:
        raise NotImplementedError
    write_log_file(
        self.log_path,
        "Train={}\nValid={}\nTest={}".format(len(self.split_ids['train']),
                                             len(self.split_ids['valid']),
                                             len(self.split_ids['test'])))
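# fit() below consumes ProcessedDataset.triple_train_batch as a generator of
# (positive code ids, text ids, negative code ids) triples. The real
# implementation is not in this excerpt; a plausible method-style sketch,
# assuming paired code/text graphs share an id and negatives are sampled at
# random from the training split (an assumption, for illustration only):
import random


def triple_train_batch(self, batch_size):
    train_ids = self.split_ids['train']
    while True:  # endless generator; fit() calls next() each iteration
        pos_ids = random.sample(train_ids, batch_size)
        text_ids = list(pos_ids)  # hypothetical: pairs share the same id
        neg_ids = [random.choice(train_ids) for _ in pos_ids]
        yield pos_ids, text_ids, neg_ids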
def variant_calling_varscan2(task):
    logger.info('Starting variant calling by VarScan2.')
    mpileup_cmd = ['samtools', 'mpileup', '-B']
    mpileup2cns_cmd = ['varscan', 'mpileup2cns']
    output_cmd = ['--output-vcf', '1']
    other_cmd = ['--min-avg-qual', '20', '--P-value', '0.01']
    for aligner in task.alns:
        logger.info('Running VC for %s output.' % aligner)
        aln_data_cwd = task.path.joinpath(task.id, 'alignment', aligner)
        for ref_order in range(1, task.ref_num + 1):
            aln_input_cmd = [
                str(
                    aln_data_cwd.joinpath('%s_ref_%d.sorted.bam' %
                                          (task.id, ref_order)))
            ]
            ref_path = aln_data_cwd.joinpath('%s_ref_%d.fasta' %
                                             (task.id, ref_order))
            ref_cmd = ['-f', str(ref_path)]
            output_path = str(
                aln_data_cwd.joinpath('%s_%s_ref_%d_varscan.vcf' %
                                      (task.id, aligner, ref_order)))
            # run samtools mpileup and pipe its output into VarScan2
            samtools_cmd = mpileup_cmd + ref_cmd + aln_input_cmd
            logger.info('CMD: ' + ' '.join(samtools_cmd))
            utils.write_log_file(task.path.joinpath(task.id),
                                 'CMD: ' + ' '.join(samtools_cmd))
            samtools_run = subprocess.run(samtools_cmd,
                                          cwd=aln_data_cwd,
                                          capture_output=True)
            varscan2_cmd = mpileup2cns_cmd + other_cmd + output_cmd
            logger.info('CMD: ' + ' '.join(varscan2_cmd))
            utils.write_log_file(task.path.joinpath(task.id),
                                 'CMD: ' + ' '.join(varscan2_cmd))
            vc_run = subprocess.run(varscan2_cmd,
                                    cwd=aln_data_cwd,
                                    input=samtools_run.stdout,
                                    capture_output=True)
            utils.build_text_file(output_path,
                                  vc_run.stdout.decode(encoding='utf-8'))
            print(vc_run.stderr.decode(encoding='utf-8'))
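# Buffering the whole mpileup output in memory (as the two subprocess.run
# calls above do) can be costly for large BAMs. A streaming alternative
# using subprocess.Popen — a sketch, equivalent in spirit, not part of the
# original pipeline:
import subprocess


def pipe_mpileup_to_varscan(samtools_cmd, varscan2_cmd, output_path, cwd):
    with open(output_path, 'w') as vcf_out:
        mpileup = subprocess.Popen(samtools_cmd, cwd=cwd,
                                   stdout=subprocess.PIPE)
        varscan = subprocess.Popen(varscan2_cmd, cwd=cwd,
                                   stdin=mpileup.stdout, stdout=vcf_out)
        mpileup.stdout.close()  # let samtools get SIGPIPE if varscan exits
        varscan.wait()
        mpileup.wait()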
def fit(self):
    best_val_loss = 1e10
    all_loss = []
    code_train_batch = self.code_data.triple_train_batch(
        self.train_batch_size)
    time_1 = datetime.now()
    for iteration in range(self.max_iteration):
        self.model.train()
        # score (positive code, text, negative code) triples
        pos_code_graph_id_list, text_graph_id_list, neg_code_graph_id_list = \
            next(code_train_batch)  # the batcher is a generator
        pos_code_batch = self.code_data.get_batch_graph(
            pos_code_graph_id_list)
        text_batch = self.text_data.get_batch_graph(text_graph_id_list)
        neg_code_batch = self.code_data.get_batch_graph(
            neg_code_graph_id_list)
        pos_pred = self.model(pos_code_batch,
                              text_batch).reshape(-1, 1)  # [batch, 1]
        neg_pred = self.model(neg_code_batch, text_batch).reshape(-1, 1)
        # margin ranking (hinge) loss with a small positive floor
        loss = (self.margin - pos_pred + neg_pred).clamp(min=1e-6).mean()
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        # store a detached scalar so the computation graph can be freed
        all_loss.append(loss.detach().cpu())
        # print
        if iteration % self.args.print_interval == 0 and iteration > 0:
            self.train_iter.append(iteration)
            smooth_loss = torch.stack(all_loss).mean()
            self.train_smooth_loss.append(smooth_loss)
            write_log_file(
                self.log_path,
                '@Train Iter {}: mean smooth loss = @{}@, time = {}.'.format(
                    iteration, smooth_loss, datetime.now() - time_1))
            all_loss = []
            time_1 = datetime.now()
        # validation
        if (iteration % self.args.valid_interval == 0
                and iteration >= self.args.val_start) or iteration == 0:
            s_time = datetime.now()
            loss = self.validation()
            self.valid_iter.append(iteration)
            self.valid_loss.append(loss.cpu().detach())
            end_time = datetime.now()
            if loss < best_val_loss:
                write_log_file(
                    self.log_path,
                    '#Valid Iter {}: loss = #{}# (Decrease) < Best loss = {}. Saving best model..., time elapsed = {}.'
                    .format(iteration, loss, best_val_loss,
                            end_time - s_time))
                best_val_loss = loss
                torch.save(self.model.state_dict(), self.best_model_path)
            else:
                write_log_file(
                    self.log_path,
                    '#Valid Iter {}: loss = #{}# (Increase). Best val loss = {}, time elapsed = {}.'
                    .format(iteration, loss, best_val_loss,
                            end_time - s_time))
        # test at iteration 0 only, as a quick check that the code runs
        if iteration == 0:
            self.test(iter_no=iteration)
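# The hinge above is essentially PyTorch's built-in margin ranking loss
# (modulo the 1e-6 floor instead of 0). A quick check of the equivalence:
import torch
import torch.nn.functional as F

pos = torch.tensor([0.9, 0.2])
neg = torch.tensor([0.1, 0.4])
margin = 0.5
manual = (margin - pos + neg).clamp(min=0).mean()
builtin = F.margin_ranking_loss(pos, neg, torch.ones_like(pos),
                                margin=margin)
assert torch.allclose(manual, builtin)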
def main_body():
    '''Main body of this file'''
    parser = argparse.ArgumentParser()
    # Configurations: read noisyspeech_synthesizer.cfg and gather inputs
    parser.add_argument(
        '--cfg',
        default='noisyspeech_synthesizer.cfg',
        help='Read noisyspeech_synthesizer.cfg for all the details')
    parser.add_argument('--cfg_str', type=str, default='noisy_speech')
    args = parser.parse_args()

    params = dict()
    params['args'] = args
    cfgpath = os.path.join(os.path.dirname(__file__), args.cfg)
    assert os.path.exists(cfgpath), f'No configuration file as [{cfgpath}]'
    cfg = CP.ConfigParser()
    cfg._interpolation = CP.ExtendedInterpolation()
    cfg.read(cfgpath)
    params['cfg'] = cfg._sections[args.cfg_str]
    cfg = params['cfg']

    clean_dir = os.path.join(os.path.dirname(__file__), 'CleanSpeech')
    if cfg['speech_dir'] != 'None':
        clean_dir = cfg['speech_dir']
    if not os.path.exists(clean_dir):
        assert False, ('Clean speech data is required')

    noise_dir = os.path.join(os.path.dirname(__file__), 'Noise')
    if cfg['noise_dir'] != 'None':
        noise_dir = cfg['noise_dir']
    if not os.path.exists(noise_dir):
        assert False, ('Noise data is required')

    params['fs'] = int(cfg['sampling_rate'])
    params['audioformat'] = cfg['audioformat']
    params['audio_length'] = float(cfg['audio_length'])
    params['silence_length'] = float(cfg['silence_length'])
    params['total_hours'] = float(cfg['total_hours'])

    if cfg['fileindex_start'] != 'None' and cfg['fileindex_end'] != 'None':
        params['num_files'] = int(cfg['fileindex_end']) - int(
            cfg['fileindex_start'])
        params['fileindex_start'] = int(cfg['fileindex_start'])
        params['fileindex_end'] = int(cfg['fileindex_end'])
    else:
        params['num_files'] = int(
            (params['total_hours'] * 60 * 60) / params['audio_length'])
        params['fileindex_start'] = 0
        params['fileindex_end'] = params['num_files']

    print('Number of files to be synthesized:', params['num_files'])
    params['is_test_set'] = utils.str2bool(cfg['is_test_set'])
    params['clean_activity_threshold'] = float(cfg['clean_activity_threshold'])
    params['noise_activity_threshold'] = float(cfg['noise_activity_threshold'])
    params['snr_lower'] = int(cfg['snr_lower'])
    params['snr_upper'] = int(cfg['snr_upper'])
    params['randomize_snr'] = utils.str2bool(cfg['randomize_snr'])
    params['target_level_lower'] = int(cfg['target_level_lower'])
    params['target_level_upper'] = int(cfg['target_level_upper'])

    if 'snr' in cfg.keys():
        params['snr'] = int(cfg['snr'])
    else:
        params['snr'] = int((params['snr_lower'] + params['snr_upper']) / 2)

    params['noisyspeech_dir'] = utils.get_dir(cfg, 'noisy_destination',
                                              'noisy')
    params['clean_proc_dir'] = utils.get_dir(cfg, 'clean_destination',
                                             'clean')
    params['noise_proc_dir'] = utils.get_dir(cfg, 'noise_destination',
                                             'noise')

    if 'speech_csv' in cfg.keys() and cfg['speech_csv'] != 'None':
        cleanfilenames = pd.read_csv(cfg['speech_csv'])
        cleanfilenames = cleanfilenames['filename']
    else:
        cleanfilenames = glob.glob(
            os.path.join(clean_dir, params['audioformat']))
    params['cleanfilenames'] = cleanfilenames
    shuffle(params['cleanfilenames'])
    params['num_cleanfiles'] = len(params['cleanfilenames'])

    # If there are .wav files in the noise_dir directory, use those.
    # If not, the noise files are organized into subdirectories by type,
    # so get the names of the non-excluded subdirectories.
    if 'noise_csv' in cfg.keys() and cfg['noise_csv'] != 'None':
        noisefilenames = pd.read_csv(cfg['noise_csv'])
        noisefilenames = noisefilenames['filename']
    else:
        noisefilenames = glob.glob(
            os.path.join(noise_dir, params['audioformat']))
    if len(noisefilenames) != 0:
        shuffle(noisefilenames)
        params['noisefilenames'] = noisefilenames
    else:
        noisedirs = glob.glob(os.path.join(noise_dir, '*'))
        if cfg['noise_types_excluded'] != 'None':
            dirstoexclude = cfg['noise_types_excluded'].split(',')
            for dirs in dirstoexclude:
                noisedirs.remove(dirs)
        shuffle(noisedirs)
        params['noisedirs'] = noisedirs

    # Call main_gen() to generate audio
    clean_source_files, clean_clipped_files, clean_low_activity_files, \
        noise_source_files, noise_clipped_files, noise_low_activity_files = \
        main_gen(params)

    # Create log directory if needed, and write log files of clipped and
    # low activity files
    log_dir = utils.get_dir(cfg, 'log_dir', 'Logs')
    utils.write_log_file(log_dir, 'source_files.csv',
                         clean_source_files + noise_source_files)
    utils.write_log_file(log_dir, 'clipped_files.csv',
                         clean_clipped_files + noise_clipped_files)
    utils.write_log_file(log_dir, 'low_activity_files.csv',
                         clean_low_activity_files + noise_low_activity_files)

    # Compute and print stats about the percentage of clipped and
    # low activity files
    total_clean = len(clean_source_files) + len(clean_clipped_files) + \
        len(clean_low_activity_files)
    total_noise = len(noise_source_files) + len(noise_clipped_files) + \
        len(noise_low_activity_files)
    pct_clean_clipped = round(len(clean_clipped_files) / total_clean * 100, 1)
    pct_noise_clipped = round(len(noise_clipped_files) / total_noise * 100, 1)
    pct_clean_low_activity = round(
        len(clean_low_activity_files) / total_clean * 100, 1)
    pct_noise_low_activity = round(
        len(noise_low_activity_files) / total_noise * 100, 1)

    print("Of the " + str(total_clean) + " clean speech files analyzed, " +
          str(pct_clean_clipped) + "% had clipping, and " +
          str(pct_clean_low_activity) + "% had low activity " + "(below " +
          str(params['clean_activity_threshold'] * 100) +
          "% active percentage)")
    print("Of the " + str(total_noise) + " noise files analyzed, " +
          str(pct_noise_clipped) + "% had clipping, and " +
          str(pct_noise_low_activity) + "% had low activity " + "(below " +
          str(params['noise_activity_threshold'] * 100) +
          "% active percentage)")
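# The DNS-style synthesizer scripts in this file use a different `utils`
# module from the pipeline code earlier: here write_log_file takes
# (log_dir, file_name, rows) and writes a CSV, and str2bool/get_dir parse
# the .cfg section. A minimal sketch under those assumptions — illustrative,
# not the actual helpers:
import csv
import os


def str2bool(value):
    # Interpret common truthy strings from the .cfg file.
    return str(value).lower() in ('yes', 'true', 't', '1')


def get_dir(cfg, key, default_name):
    # Resolve a destination directory from the config, creating it if needed.
    dir_path = cfg[key] if cfg.get(key, 'None') != 'None' else default_name
    os.makedirs(dir_path, exist_ok=True)
    return dir_path


def write_log_file(log_dir, file_name, rows):
    # Write one list entry per CSV row.
    os.makedirs(log_dir, exist_ok=True)
    with open(os.path.join(log_dir, file_name), 'w', newline='') as f:
        writer = csv.writer(f)
        for row in rows:
            writer.writerow([row])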
def main_body():
    '''Main body of this file'''
    parser = argparse.ArgumentParser()
    # Configurations: read noisyspeech_synthesizer.cfg and gather inputs
    parser.add_argument(
        '--cfg',
        default='noisyspeech_synthesizer.cfg',
        help='Read noisyspeech_synthesizer.cfg for all the details')
    parser.add_argument('--cfg_str', type=str, default='noisy_speech')
    args = parser.parse_args()

    params = dict()
    params['args'] = args
    cfgpath = os.path.join(os.path.dirname(__file__), args.cfg)
    assert os.path.exists(cfgpath), f'No configuration file as [{cfgpath}]'
    cfg = CP.ConfigParser()
    cfg._interpolation = CP.ExtendedInterpolation()
    cfg.read(cfgpath)
    params['cfg'] = cfg._sections[args.cfg_str]
    cfg = params['cfg']

    clean_dir = os.path.join(os.path.dirname(__file__), 'CleanSpeech')
    if cfg['speech_dir'] != 'None':
        clean_dir = cfg['speech_dir']
    if not os.path.exists(clean_dir):
        assert False, ('Clean speech data is required')

    noise_dir = os.path.join(os.path.dirname(__file__), 'Noise')
    if cfg['noise_dir'] != 'None':
        noise_dir = cfg['noise_dir']
    if not os.path.exists(noise_dir):
        assert False, ('Noise data is required')

    params['fs'] = int(cfg['sampling_rate'])
    params['audioformat'] = cfg['audioformat']
    params['audio_length'] = float(cfg['audio_length'])
    params['silence_length'] = float(cfg['silence_length'])
    params['total_hours'] = float(cfg['total_hours'])

    if cfg['fileindex_start'] != 'None' and cfg['fileindex_end'] != 'None':
        params['fileindex_start'] = int(cfg['fileindex_start'])
        params['fileindex_end'] = int(cfg['fileindex_end'])
        params['num_files'] = params['fileindex_end'] - \
            params['fileindex_start']
    else:
        params['num_files'] = int(
            (params['total_hours'] * 60 * 60) / params['audio_length'])

    print('Number of files to be synthesized:', params['num_files'])
    params['is_test_set'] = utils.str2bool(cfg['is_test_set'])
    params['clean_activity_threshold'] = float(cfg['clean_activity_threshold'])
    params['noise_activity_threshold'] = float(cfg['noise_activity_threshold'])
    params['snr_lower'] = int(cfg['snr_lower'])
    params['snr_upper'] = int(cfg['snr_upper'])
    params['randomize_snr'] = utils.str2bool(cfg['randomize_snr'])
    params['target_level_lower'] = int(cfg['target_level_lower'])
    params['target_level_upper'] = int(cfg['target_level_upper'])

    if 'snr' in cfg.keys():
        params['snr'] = int(cfg['snr'])
    else:
        params['snr'] = int((params['snr_lower'] + params['snr_upper']) / 2)

    params['noisyspeech_dir'] = utils.get_dir(cfg, 'noisy_destination',
                                              'noisy')
    params['clean_proc_dir'] = utils.get_dir(cfg, 'clean_destination',
                                             'clean')
    params['noise_proc_dir'] = utils.get_dir(cfg, 'noise_destination',
                                             'noise')

    if 'speech_csv' in cfg.keys() and cfg['speech_csv'] != 'None':
        cleanfilenames = pd.read_csv(cfg['speech_csv'])
        cleanfilenames = cleanfilenames['filename']
    else:
        cleanfilenames = glob.glob(
            os.path.join(clean_dir, params['audioformat']))
    params['cleanfilenames'] = cleanfilenames
    shuffle(params['cleanfilenames'])
    params['num_cleanfiles'] = len(params['cleanfilenames'])

    params['noisefilenames'] = glob.glob(
        os.path.join(noise_dir, params['audioformat']))
    shuffle(params['noisefilenames'])

    # Invoke multiple processes and fan out calls to main_gen()
    global clean_counter, noise_counter
    clean_counter = multiprocessing.Value('i', 0)
    noise_counter = multiprocessing.Value('i', 0)
    multi_pool = multiprocessing.Pool(processes=PROCESSES,
                                      initializer=init,
                                      initargs=(clean_counter,
                                                noise_counter))
    fileindices = range(params['num_files'])
    output_lists = multi_pool.starmap(main_gen,
                                      zip(repeat(params), fileindices))

    flat_output_lists = []
    num_lists = 6
    for i in range(num_lists):
        flat_output_lists.append(extract_list(output_lists, i))

    # Create log directory if needed, and write log files of clipped and
    # low activity files
    log_dir = utils.get_dir(cfg, 'log_dir', 'Logs')
    utils.write_log_file(log_dir, 'source_files.csv',
                         flat_output_lists[0] + flat_output_lists[3])
    utils.write_log_file(log_dir, 'clipped_files.csv',
                         flat_output_lists[1] + flat_output_lists[4])
    utils.write_log_file(log_dir, 'low_activity_files.csv',
                         flat_output_lists[2] + flat_output_lists[5])

    # Compute and print stats about the percentage of clipped and
    # low activity files
    total_clean = len(flat_output_lists[0]) + len(flat_output_lists[1]) + \
        len(flat_output_lists[2])
    total_noise = len(flat_output_lists[3]) + len(flat_output_lists[4]) + \
        len(flat_output_lists[5])
    pct_clean_clipped = round(
        len(flat_output_lists[1]) / total_clean * 100, 1)
    pct_noise_clipped = round(
        len(flat_output_lists[4]) / total_noise * 100, 1)
    pct_clean_low_activity = round(
        len(flat_output_lists[2]) / total_clean * 100, 1)
    pct_noise_low_activity = round(
        len(flat_output_lists[5]) / total_noise * 100, 1)

    print("Of the " + str(total_clean) + " clean speech files analyzed, " +
          str(pct_clean_clipped) + "% had clipping, and " +
          str(pct_clean_low_activity) + "% had low activity " + "(below " +
          str(params['clean_activity_threshold'] * 100) +
          "% active percentage)")
    print("Of the " + str(total_noise) + " noise files analyzed, " +
          str(pct_noise_clipped) + "% had clipping, and " +
          str(pct_noise_low_activity) + "% had low activity " + "(below " +
          str(params['noise_activity_threshold'] * 100) +
          "% active percentage)")
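# `init` and `extract_list` are referenced above but not defined in this
# excerpt. Plausible sketches, assuming each worker returns a 6-tuple of
# lists and the pool initializer shares the two counters as globals
# (illustrative only):
def init(c_counter, n_counter):
    # Make the shared multiprocessing counters visible in worker processes.
    global clean_counter, noise_counter
    clean_counter = c_counter
    noise_counter = n_counter


def extract_list(output_lists, index):
    # Flatten the index-th list from every worker's result tuple.
    return [item for one_output in output_lists for item in one_output[index]]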
def main():
    task = Task()
    task.conda_pkgs = [
        'fastp', 'samtools', 'bcftools', 'bowtie2', 'bwa', 'varscan',
        'lofreq', 'spades.py', 'blastn', 'makeblastdb'
    ]
    check_deps(task)
    task.path = Path.cwd().joinpath('tasks')
    task.name = args.prefix
    task.id = ''
    task.ref = args.ref
    task.with_ref = False
    task.ex_r1 = args.r1
    task.ex_r2 = args.r2
    task.threads = str(args.threads)
    task.alns = args.alns.split(',')
    task.global_trimming = str(args.trimming)
    task.dehost = args.remove_host
    task.spades_mem = str(args.spades_mem)
    task.spades_mode = args.spades_mode
    task.vc_threshold = '0.7'
    task.ref_num = 0
    task.min_vc_score = args.min_vc_score

    if args.test is not None:
        task.name = 'test_run'
        task.ex_r1 = Path.cwd().joinpath('test_data', 'AdV_R1.fastq.gz')
        task.ex_r2 = Path.cwd().joinpath('test_data', 'AdV_R2.fastq.gz')
        if args.test == 'ref':
            task.ref = Path.cwd().joinpath('test_data', 'AC_000008.1.fasta')
        elif args.test == 'multi_ref':
            task.ref = Path.cwd().joinpath('test_data',
                                           'adv_multi_ref.fasta')
        elif args.test == 'denovo':
            task.dehost = 'human'
            task.ref = None

    logger.info('Checking reference.')
    if task.ref is not None:
        if check_ref_file(task):
            task.with_ref = True
        else:
            logger.error('Input reference not found. Exiting pipeline.')
            sys.exit()
    else:
        logger.info('Input reference not provided. Will go de novo.')

    if task.with_ref == False:
        logger.info('Checking RVDB.')
        if utils.setup_rvdb() == -1:
            logger.error('RVDB setup error. Exiting pipeline.')
            sys.exit()

    if task.dehost is not None:
        if utils.setup_genomes(task.dehost) == -1:
            logger.error('Host genome not found. Exiting pipeline.')
            sys.exit()

    logger.info('Checking reads files.')
    if check_reads_file(task) != -1:
        task.id = "%s_%s" % (task.name,
                             time.strftime("%Y%m%d%H%M", time.localtime()))
        logger.info('Creating new task %s.' % task.id)
        Path.mkdir(task.path.joinpath(task.id), parents=True)
        logger.info('Starting pipeline.')
        utils.write_log_file(task.path.joinpath(task.id),
                             'Starting pipeline.')
        # main pipeline
        reads_preprocess.run(task)
        reference_prepare.run(task)
        reads_alignment.run(task)
        variant_calling.run(task)
        logger.info('Pipeline finished.')
        utils.write_log_file(task.path.joinpath(task.id),
                             'Pipeline finished.')
        # report generator
        summary_generator.run(task)
        report_generator.run(task)
    else:
        logger.error('Reads not found. Exiting pipeline.')
        sys.exit()
def remove_host(task):
    logger.info('Removing host genome.')
    dehost_meta = {'genome': '', 'remove_percentage': ''}
    host_remove_cwd = task.path.joinpath(task.id, 'reads')
    Path.mkdir(host_remove_cwd, parents=True, exist_ok=True)
    if task.dehost == 'dog':
        dehost_meta['genome'] = 'Dog (Dog10K_Boxer_Tasha, GCF_000002285.5)'
        genome_path = '/app/genomes/' + 'dog10k'
    elif task.dehost == 'human':
        dehost_meta['genome'] = 'Human (GRCh38.p13, GCF_000001405.39)'
        genome_path = '/app/genomes/' + 'grch38'
    elif task.dehost == 'vero':
        dehost_meta['genome'] = 'Vero (Vero_WHO_p1.0, GCF_015252025.1)'
        genome_path = '/app/genomes/' + 'vero'
    elif task.dehost == 'chicken':
        dehost_meta['genome'] = 'Chicken (GRCg6a, GCF_000002315.6)'
        genome_path = '/app/genomes/' + 'grcg6a'
    elif task.dehost == 'rhesus_monkey':
        dehost_meta['genome'] = 'Rhesus monkey (Mmul_10, GCF_003339765.1)'
        genome_path = '/app/genomes/' + 'mmul_10'
    else:
        # guard against an unbound genome_path for unsupported hosts
        logger.error('Unknown host genome: %s' % task.dehost)
        return
    # bowtie2 substitutes 1/2 for the literal % in the --un-conc-gz template
    unconc_reads_out = task.id + '_host_removed_R%.fastq.gz'
    mapped_reads_out = 'host_mapped.sam'
    align_cmd = [
        'bowtie2', '-p', str(task.threads), '-x', str(genome_path),
        '-1',
        str(task.path.joinpath(task.id, 'reads',
                               task.id + '_R1.fastq.gz')),
        '-2',
        str(task.path.joinpath(task.id, 'reads',
                               task.id + '_R2.fastq.gz')),
        '-S', str(mapped_reads_out),
        '--very-sensitive-local', '--un-conc-gz', str(unconc_reads_out)
    ]
    logger.info('CMD: ' + ' '.join(align_cmd))
    utils.write_log_file(task.path.joinpath(task.id),
                         'CMD: ' + ' '.join(align_cmd))
    cmd_run = subprocess.run(align_cmd,
                             cwd=host_remove_cwd,
                             capture_output=True)
    # print(cmd_run.stdout.decode(encoding='utf-8'))
    print(cmd_run.stderr.decode(encoding='utf-8'))
    # build meta
    logger.info('Analyzing BAM file from host-mapped reads.')
    # sorting
    sorting_cmd = [
        'samtools', 'sort', '-@', str(task.threads), 'host_mapped.sam',
        '-o', 'host_mapped.sorted.bam'
    ]
    logger.info('CMD: ' + ' '.join(sorting_cmd))
    utils.write_log_file(task.path.joinpath(task.id),
                         'CMD: ' + ' '.join(sorting_cmd))
    sorting_run = subprocess.run(sorting_cmd,
                                 cwd=host_remove_cwd,
                                 capture_output=True)
    print(sorting_run.stdout.decode(encoding='utf-8'))
    print(sorting_run.stderr.decode(encoding='utf-8'))
    # flagstat
    flagstat_cmd = [
        'samtools', 'flagstat', '-@', str(task.threads),
        'host_mapped.sorted.bam'
    ]
    logger.info('CMD: ' + ' '.join(flagstat_cmd))
    utils.write_log_file(task.path.joinpath(task.id),
                         'CMD: ' + ' '.join(flagstat_cmd))
    flagstat_run = subprocess.run(flagstat_cmd,
                                  cwd=host_remove_cwd,
                                  capture_output=True)
    stats_text = flagstat_run.stdout.decode(encoding='utf-8')
    stats_list = stats_text.split('\n')
    utils.build_text_file(host_remove_cwd.joinpath('flagstat.txt'),
                          stats_text)
    # percentage of reads mapped to the host genome
    mapped_rate = stats_list[4].split(' ')[4][1:]
    dehost_meta['remove_percentage'] = mapped_rate
    utils.build_json_file(host_remove_cwd.joinpath('dehost_meta.json'),
                          dehost_meta)
    # remove SAM and host BAM files to release disk space
    os.remove(host_remove_cwd.joinpath(mapped_reads_out))
    os.remove(host_remove_cwd.joinpath('host_mapped.sorted.bam'))
pc_clean_sr_passed = round(
    clean_sr_results_list.count('True') / total_clips * 100, 1)
pc_noise_sr_passed = round(
    noise_sr_results_list.count('True') / total_clips * 100, 1)
pc_noisy_sr_passed = round(
    noisy_sr_results_list.count('True') / total_clips * 100, 1)
pc_clean_clipping_passed = round(
    clean_clipping_results_list.count('True') / total_clips * 100, 1)
pc_noise_clipping_passed = round(
    noise_clipping_results_list.count('True') / total_clips * 100, 1)
pc_noisy_clipping_passed = round(
    noisy_clipping_results_list.count('True') / total_clips * 100, 1)

print('% clips that passed SNR test:', pc_snr_passed)
print('% clean clips that passed Normalization tests:',
      pc_clean_norm_passed)
print('% noise clips that passed Normalization tests:',
      pc_noise_norm_passed)
print('% noisy clips that passed Normalization tests:',
      pc_noisy_norm_passed)
print('% clean clips that passed Sampling Rate tests:', pc_clean_sr_passed)
print('% noise clips that passed Sampling Rate tests:', pc_noise_sr_passed)
print('% noisy clips that passed Sampling Rate tests:', pc_noisy_sr_passed)
print('% clean clips that passed Clipping tests:',
      pc_clean_clipping_passed)
print('% noise clips that passed Clipping tests:',
      pc_noise_clipping_passed)
print('% noisy clips that passed Clipping tests:',
      pc_noisy_clipping_passed)

log_dir = utils.get_dir(cfg, 'unit_tests_log_dir', 'Unit_tests_logs')
if not os.path.exists(log_dir):
    log_dir = os.path.join(os.path.dirname(__file__), 'Unit_tests_logs')
    os.makedirs(log_dir)
utils.write_log_file(
    log_dir, 'unit_test_results.csv',
    [noisy_filenames_list, clean_filenames_list, noise_filenames_list,
     snr_results_list, clean_norm_results_list, noise_norm_results_list,
     noisy_norm_results_list, clean_sr_results_list, noise_sr_results_list,
     noisy_sr_results_list, clean_clipping_results_list,
     noise_clipping_results_list, noisy_clipping_results_list])
def main_body():
    '''Main body of this file'''
    parser = argparse.ArgumentParser()
    # Configurations: read noisyspeech_synthesizer.cfg and gather inputs
    parser.add_argument(
        '--cfg',
        default='pdns_synthesizer_icassp2022.cfg',
        help='Read noisyspeech_synthesizer.cfg for all the details')
    parser.add_argument('--cfg_str', type=str, default='noisy_speech')
    args = parser.parse_args()

    params = dict()
    params['args'] = args
    cfgpath = os.path.join(args.cfg)
    # os.path.join(os.path.dirname(__file__), args.cfg)
    assert os.path.exists(cfgpath), f'No configuration file as [{cfgpath}]'
    cfg = CP.ConfigParser()
    cfg._interpolation = CP.ExtendedInterpolation()
    cfg.read(cfgpath)
    params['cfg'] = cfg._sections[args.cfg_str]
    cfg = params['cfg']

    clean_dir = os.path.join('datasets/clean')
    if cfg['speech_dir'] != 'None':
        clean_dir = cfg['speech_dir']
    if not os.path.exists(clean_dir):
        assert False, ('Clean speech data is required')
    # speech_dir2, spkid_csv, and rir_dir are assumed to be set in the cfg;
    # the references below fail otherwise
    if cfg['speech_dir2'] != 'None':
        clean_dir2 = cfg['speech_dir2']
    if cfg['spkid_csv'] != 'None':
        spkid_csv = cfg['spkid_csv']
    if not os.path.exists(clean_dir2):
        assert False, ('Clean speech2 data is required')
    if cfg['rir_dir'] != 'None':
        rir_dir = cfg['rir_dir']
    if cfg['noise_dir'] != 'None':
        noise_dir = cfg['noise_dir']
    if not os.path.exists(noise_dir):
        assert False, ('Noise data is required')

    print(clean_dir)
    print(clean_dir2)
    print(noise_dir)
    print(spkid_csv)
    print(rir_dir)

    params['fs'] = int(cfg['sampling_rate'])
    params['audioformat'] = cfg['audioformat']
    params['audio_length'] = float(cfg['audio_length'])
    params['silence_length'] = float(cfg['silence_length'])
    params['total_hours'] = float(cfg['total_hours'])

    # clean singing speech
    params['clean_singing'] = str(cfg['clean_singing'])
    params['singing_choice'] = int(cfg['singing_choice'])

    # rir
    params['rir_choice'] = int(cfg['rir_choice'])
    params['lower_t60'] = float(cfg['lower_t60'])
    params['upper_t60'] = float(cfg['upper_t60'])
    params['rir_table_csv'] = str(cfg['rir_table_csv'])
    params['clean_speech_t60_csv'] = str(cfg['clean_speech_t60_csv'])

    if cfg['fileindex_start'] != 'None' and cfg['fileindex_end'] != 'None':
        params['num_files'] = int(cfg['fileindex_end']) - int(
            cfg['fileindex_start'])
        params['fileindex_start'] = int(cfg['fileindex_start'])
        params['fileindex_end'] = int(cfg['fileindex_end'])
    else:
        params['num_files'] = int(
            (params['total_hours'] * 60 * 60) / params['audio_length'])
        params['fileindex_start'] = 0
        params['fileindex_end'] = int(params['num_files'])

    print('Number of files to be synthesized:', params['num_files'])
    params['is_test_set'] = utils.str2bool(cfg['is_test_set'])
    params['clean_activity_threshold'] = float(cfg['clean_activity_threshold'])
    params['noise_activity_threshold'] = float(cfg['noise_activity_threshold'])
    params['snr_lower'] = int(cfg['snr_lower'])
    params['snr_upper'] = int(cfg['snr_upper'])
    params['randomize_snr'] = utils.str2bool(cfg['randomize_snr'])
    params['target_level_lower'] = int(cfg['target_level_lower'])
    params['target_level_upper'] = int(cfg['target_level_upper'])

    if 'snr' in cfg.keys():
        params['snr'] = int(cfg['snr'])
    else:
        params['snr'] = int((params['snr_lower'] + params['snr_upper']) / 2)

    params['noisyspeech_dir'] = utils.get_dir(cfg, 'noisy_destination',
                                              'noisy')
    params['clean_proc_dir'] = utils.get_dir(cfg, 'clean_destination',
                                             'clean')
    params['noise_proc_dir'] = utils.get_dir(cfg, 'noise_destination',
                                             'noise')

    if 'speech_csv' in cfg.keys() and cfg['speech_csv'] != 'None':
        cleanfilenames = pd.read_csv(cfg['speech_csv'])
        cleanfilenames = cleanfilenames['filename']
    else:
        cleanfilenames = []
        for path in Path(cfg['speech_dir']).rglob('*.wav'):
            cleanfilenames.append(str(path.resolve()))

    # split speakers into primary and secondary sets from the speaker-id CSV;
    # this overrides any file list collected above
    selected_primary = []
    selected_secondary = []
    with open(spkid_csv, 'r') as file:
        my_reader = csv.reader(file, delimiter=',')
        for row in my_reader:
            if row[1] == 'primary':
                selected_primary.append(row)
            elif row[1] == 'secondary':
                selected_secondary.append(row)
    cleanfilenames = [row[0] for row in selected_primary]
    cleanfilenames2 = [row[0] for row in selected_secondary]
    params['cleanfilenames'] = cleanfilenames
    shuffle(cleanfilenames2)
    params['cleanfilenames2'] = cleanfilenames2

    rirfilenames = []
    for path in Path(cfg['rir_dir']).rglob('*.wav'):
        rirfilenames.append(str(path.resolve()))
    shuffle(rirfilenames)
    params['myrir'] = rirfilenames

    if 'noise_csv' in cfg.keys() and cfg['noise_csv'] != 'None':
        noisefilenames = pd.read_csv(cfg['noise_csv'])
        noisefilenames = noisefilenames['filename']
    else:
        noisefilenames = glob.glob(
            os.path.join(noise_dir, params['audioformat']))
    if len(noisefilenames) != 0:
        shuffle(noisefilenames)
        params['noisefilenames'] = noisefilenames
    else:
        noisedirs = glob.glob(os.path.join(noise_dir, '*'))
        if cfg['noise_types_excluded'] != 'None':
            dirstoexclude = cfg['noise_types_excluded'].split(',')
            for dirs in dirstoexclude:
                noisedirs.remove(dirs)
        shuffle(noisedirs)
        params['noisedirs'] = noisedirs

    # Call main_gen() to generate audio
    clean_source_files, clean_clipped_files, clean_low_activity_files, \
        noise_source_files, noise_clipped_files, noise_low_activity_files = \
        main_gen(params)

    # Create log directory if needed, and write log files of clipped and
    # low activity files
    log_dir = utils.get_dir(cfg, 'log_dir', 'Logs')
    utils.write_log_file(log_dir, 'source_files.csv',
                         clean_source_files + noise_source_files)
    utils.write_log_file(log_dir, 'clipped_files.csv',
                         clean_clipped_files + noise_clipped_files)
    utils.write_log_file(log_dir, 'low_activity_files.csv',
                         clean_low_activity_files + noise_low_activity_files)

    # Compute and print stats about the percentage of clipped and
    # low activity files
    total_clean = len(clean_source_files) + len(clean_clipped_files) + \
        len(clean_low_activity_files)
    total_noise = len(noise_source_files) + len(noise_clipped_files) + \
        len(noise_low_activity_files)
    pct_clean_clipped = round(len(clean_clipped_files) / total_clean * 100, 1)
    pct_noise_clipped = round(len(noise_clipped_files) / total_noise * 100, 1)
    pct_clean_low_activity = round(
        len(clean_low_activity_files) / total_clean * 100, 1)
    pct_noise_low_activity = round(
        len(noise_low_activity_files) / total_noise * 100, 1)

    print("Of the " + str(total_clean) + " clean speech files analyzed, " +
          str(pct_clean_clipped) + "% had clipping, and " +
          str(pct_clean_low_activity) + "% had low activity " + "(below " +
          str(params['clean_activity_threshold'] * 100) +
          "% active percentage)")
    print("Of the " + str(total_noise) + " noise files analyzed, " +
          str(pct_noise_clipped) + "% had clipping, and " +
          str(pct_noise_low_activity) + "% had low activity " + "(below " +
          str(params['noise_activity_threshold'] * 100) +
          "% active percentage)")
def main_body():
    '''Main body of this file'''
    parser = argparse.ArgumentParser()
    # Configurations: read noisyspeech_synthesizer.cfg and gather inputs
    parser.add_argument(
        '--cfg',
        default='noisyspeech_synthesizer.cfg',
        help='Read noisyspeech_synthesizer.cfg for all the details')
    parser.add_argument('--cfg_str', type=str, default='noisy_speech')
    args = parser.parse_args()

    params = dict()
    params['args'] = args
    cfgpath = os.path.join(os.path.dirname(__file__), args.cfg)
    assert os.path.exists(cfgpath), f'No configuration file as [{cfgpath}]'
    cfg = CP.ConfigParser()
    cfg._interpolation = CP.ExtendedInterpolation()
    cfg.read(cfgpath)
    params['cfg'] = cfg._sections[args.cfg_str]
    cfg = params['cfg']

    clean_dir = os.path.join(os.path.dirname(__file__), 'datasets/clean')
    if cfg['speech_dir'] != 'None':
        clean_dir = cfg['speech_dir']
    if not os.path.exists(clean_dir):
        assert False, ('Clean speech data is required')

    noise_dir = os.path.join(os.path.dirname(__file__), 'datasets/noise')
    if cfg['noise_dir'] != 'None':
        noise_dir = cfg['noise_dir']
    if not os.path.exists(noise_dir):
        assert False, ('Noise data is required')

    params['fs'] = int(cfg['sampling_rate'])
    params['audioformat'] = cfg['audioformat']
    params['audio_length'] = float(cfg['audio_length'])
    params['silence_length'] = float(cfg['silence_length'])
    params['total_hours'] = float(cfg['total_hours'])

    # clean singing speech
    params['use_singing_data'] = int(cfg['use_singing_data'])
    params['clean_singing'] = str(cfg['clean_singing'])
    params['singing_choice'] = int(cfg['singing_choice'])

    # clean emotional speech
    params['use_emotion_data'] = int(cfg['use_emotion_data'])
    params['clean_emotion'] = str(cfg['clean_emotion'])

    # clean mandarin speech
    params['use_mandarin_data'] = int(cfg['use_mandarin_data'])
    params['clean_mandarin'] = str(cfg['clean_mandarin'])

    # rir
    params['rir_choice'] = int(cfg['rir_choice'])
    params['lower_t60'] = float(cfg['lower_t60'])
    params['upper_t60'] = float(cfg['upper_t60'])
    params['rir_table_csv'] = str(cfg['rir_table_csv'])
    params['clean_speech_t60_csv'] = str(cfg['clean_speech_t60_csv'])

    if cfg['fileindex_start'] != 'None' and cfg['fileindex_end'] != 'None':
        params['num_files'] = int(cfg['fileindex_end']) - int(
            cfg['fileindex_start'])
        params['fileindex_start'] = int(cfg['fileindex_start'])
        params['fileindex_end'] = int(cfg['fileindex_end'])
    else:
        params['num_files'] = int(
            (params['total_hours'] * 60 * 60) / params['audio_length'])
        params['fileindex_start'] = 0
        params['fileindex_end'] = params['num_files']

    print('Number of files to be synthesized:', params['num_files'])
    params['is_test_set'] = utils.str2bool(cfg['is_test_set'])
    params['clean_activity_threshold'] = float(cfg['clean_activity_threshold'])
    params['noise_activity_threshold'] = float(cfg['noise_activity_threshold'])
    params['snr_lower'] = int(cfg['snr_lower'])
    params['snr_upper'] = int(cfg['snr_upper'])
    params['randomize_snr'] = utils.str2bool(cfg['randomize_snr'])
    params['target_level_lower'] = int(cfg['target_level_lower'])
    params['target_level_upper'] = int(cfg['target_level_upper'])

    if 'snr' in cfg.keys():
        params['snr'] = int(cfg['snr'])
    else:
        params['snr'] = int((params['snr_lower'] + params['snr_upper']) / 2)

    params['noisyspeech_dir'] = utils.get_dir(cfg, 'noisy_destination',
                                              'noisy')
    params['clean_proc_dir'] = utils.get_dir(cfg, 'clean_destination',
                                             'clean')
    params['noise_proc_dir'] = utils.get_dir(cfg, 'noise_destination',
                                             'noise')

    if 'speech_csv' in cfg.keys() and cfg['speech_csv'] != 'None':
        cleanfilenames = pd.read_csv(cfg['speech_csv'])
        cleanfilenames = cleanfilenames['filename']
    else:
        # cleanfilenames = glob.glob(os.path.join(clean_dir, params['audioformat']))
        cleanfilenames = []
        for path in Path(clean_dir).rglob('*.wav'):
            cleanfilenames.append(str(path.resolve()))
        shuffle(cleanfilenames)

    # add singing voice to clean speech
    if params['use_singing_data'] == 1:
        all_singing = []
        for path in Path(params['clean_singing']).rglob('*.wav'):
            all_singing.append(str(path.resolve()))
        if params['singing_choice'] == 1:  # male speakers
            mysinging = [
                s for s in all_singing
                if ("male" in s and "female" not in s)
            ]
        elif params['singing_choice'] == 2:  # female speakers
            mysinging = [s for s in all_singing if "female" in s]
        else:  # singing_choice == 3 or default: both male and female
            mysinging = all_singing
        shuffle(mysinging)
        if mysinging is not None:
            all_cleanfiles = cleanfilenames + mysinging
    else:
        all_cleanfiles = cleanfilenames

    # add emotion data to clean speech
    if params['use_emotion_data'] == 1:
        all_emotion = []
        for path in Path(params['clean_emotion']).rglob('*.wav'):
            all_emotion.append(str(path.resolve()))
        shuffle(all_emotion)
        if all_emotion is not None:
            all_cleanfiles = all_cleanfiles + all_emotion
    else:
        print('NOT using emotion data for training!')

    # add mandarin data to clean speech
    if params['use_mandarin_data'] == 1:
        all_mandarin = []
        for path in Path(params['clean_mandarin']).rglob('*.wav'):
            all_mandarin.append(str(path.resolve()))
        shuffle(all_mandarin)
        if all_mandarin is not None:
            all_cleanfiles = all_cleanfiles + all_mandarin
    else:
        print('NOT using non-english (Mandarin) data for training!')

    params['cleanfilenames'] = all_cleanfiles
    params['num_cleanfiles'] = len(params['cleanfilenames'])

    # If there are .wav files in the noise_dir directory, use those.
    # If not, the noise files are organized into subdirectories by type,
    # so get the names of the non-excluded subdirectories.
    if 'noise_csv' in cfg.keys() and cfg['noise_csv'] != 'None':
        noisefilenames = pd.read_csv(cfg['noise_csv'])
        noisefilenames = noisefilenames['filename']
    else:
        noisefilenames = glob.glob(
            os.path.join(noise_dir, params['audioformat']))
    if len(noisefilenames) != 0:
        shuffle(noisefilenames)
        params['noisefilenames'] = noisefilenames
    else:
        noisedirs = glob.glob(os.path.join(noise_dir, '*'))
        if cfg['noise_types_excluded'] != 'None':
            dirstoexclude = cfg['noise_types_excluded'].split(',')
            for dirs in dirstoexclude:
                noisedirs.remove(dirs)
        shuffle(noisedirs)
        params['noisedirs'] = noisedirs

    # rir: read the RIR table and keep entries whose T60 is in range
    temp = pd.read_csv(
        params['rir_table_csv'],
        skiprows=[1],
        sep=',',
        header=None,
        names=['wavfile', 'channel', 'T60_WB', 'C50_WB', 'isRealRIR'])
    rir_wav = temp['wavfile'][1:]  # 115413 IRs in total
    rir_channel = temp['channel'][1:]
    rir_t60 = temp['T60_WB'][1:]
    rir_isreal = temp['isRealRIR'][1:]

    rir_wav2 = [w.replace('\\', '/') for w in rir_wav]
    rir_channel2 = [w for w in rir_channel]
    rir_t60_2 = [w for w in rir_t60]
    rir_isreal2 = [w for w in rir_isreal]

    lower_t60 = params['lower_t60']
    upper_t60 = params['upper_t60']

    if params['rir_choice'] == 1:  # real IRs only (3076)
        candidate_indices = [i for i, x in enumerate(rir_isreal2)
                             if x == "1"]
    elif params['rir_choice'] == 2:  # synthetic IRs only (112337)
        candidate_indices = [i for i, x in enumerate(rir_isreal2)
                             if x == "0"]
    else:  # rir_choice == 3 or default: both real and synthetic
        candidate_indices = [i for i, x in enumerate(rir_isreal2)]

    chosen_i = []
    for i in candidate_indices:
        if lower_t60 <= float(rir_t60_2[i]) <= upper_t60:
            chosen_i.append(i)

    myrir = [rir_wav2[i] for i in chosen_i]
    mychannel = [rir_channel2[i] for i in chosen_i]
    myt60 = [rir_t60_2[i] for i in chosen_i]

    params['myrir'] = myrir
    params['mychannel'] = mychannel
    params['myt60'] = myt60

    # Call main_gen() to generate audio
    clean_source_files, clean_clipped_files, clean_low_activity_files, \
        noise_source_files, noise_clipped_files, noise_low_activity_files = \
        main_gen(params)

    # Create log directory if needed, and write log files of clipped and
    # low activity files
    log_dir = utils.get_dir(cfg, 'log_dir', 'Logs')
    utils.write_log_file(log_dir, 'source_files.csv',
                         clean_source_files + noise_source_files)
    utils.write_log_file(log_dir, 'clipped_files.csv',
                         clean_clipped_files + noise_clipped_files)
    utils.write_log_file(log_dir, 'low_activity_files.csv',
                         clean_low_activity_files + noise_low_activity_files)

    # Compute and print stats about the percentage of clipped and
    # low activity files
    total_clean = len(clean_source_files) + len(clean_clipped_files) + \
        len(clean_low_activity_files)
    total_noise = len(noise_source_files) + len(noise_clipped_files) + \
        len(noise_low_activity_files)
    pct_clean_clipped = round(len(clean_clipped_files) / total_clean * 100, 1)
    pct_noise_clipped = round(len(noise_clipped_files) / total_noise * 100, 1)
    pct_clean_low_activity = round(
        len(clean_low_activity_files) / total_clean * 100, 1)
    pct_noise_low_activity = round(
        len(noise_low_activity_files) / total_noise * 100, 1)

    print("Of the " + str(total_clean) + " clean speech files analyzed, " +
          str(pct_clean_clipped) + "% had clipping, and " +
          str(pct_clean_low_activity) + "% had low activity " + "(below " +
          str(params['clean_activity_threshold'] * 100) +
          "% active percentage)")
    print("Of the " + str(total_noise) + " noise files analyzed, " +
          str(pct_noise_clipped) + "% had clipping, and " +
          str(pct_noise_low_activity) + "% had low activity " + "(below " +
          str(params['noise_activity_threshold'] * 100) +
          "% active percentage)")
if __name__ == '__main__':
    all_time_1 = datetime.now()
    trainer = Trainer(arguments)
    if arguments.only_test:
        trainer.load_model(arguments.model_path)
    else:
        trainer.fit()
        trainer.load_model(trainer.best_model_path)
    # restart the clock so the final elapsed time covers testing only
    all_time_1 = datetime.now()
    write_log_file(
        trainer.log_path,
        "Finished loading the model; starting test at {}.".format(
            all_time_1))
    trainer.test(iter_no=trainer.max_iteration + 1)
    write_log_file(
        trainer.log_path,
        "\nAll Finished using ({})\n".format(datetime.now() - all_time_1))