def backup_files(self, backup_subfolder):
    backup_path = os.path.join(self.backup_folder, backup_subfolder)
    if not os.path.isfile(
            get_backup_file_path(backup_path, self.scene_folder,
                                 self.SCENE_OBJECTS_FILE)):
        pbar = ProgressBar([self.SCENE_OBJECTS_FILE],
                           title="backup " + self.SCENE_OBJECTS_FILE)
        backup_file(backup_path, self.scene_folder, self.SCENE_OBJECTS_FILE,
                    pbar=pbar)
def combine_wavs_batch(audio_paths, method, **kwargs):
    audio_paths.sort()

    method = method.lower()
    if method == "librosa":
        fn = partial(split_on_silence_with_librosa, **kwargs)
    elif method == "pydub":
        fn = partial(split_on_silence_with_pydub, **kwargs)
    else:
        raise ValueError("unknown split method: {}".format(method))

    parallel_run(fn, audio_paths, desc="Split on silence", parallel=False)

    audio_path = audio_paths[0]
    spl = os.path.basename(audio_path).split('.', 1)
    prefix = os.path.dirname(audio_path) + "/" + spl[0] + "."
    in_ext = audio_path.rsplit(".", 1)[1]

    data = load_json(config.alignment_path, encoding="utf8")

    # wavs, paths and silence are expected as module-level globals (pydub
    # AudioSegments and their source paths). For each segment, greedily append
    # the following segments until the combined clip would exceed 15 seconds,
    # inserting a silence gap between segments.
    for i in range(len(wavs) - 1):
        if len(wavs[i]) > 15000:
            continue
        if not paths[i] in data:
            continue

        total_len = len(wavs[i])
        filename = prefix + str(i).zfill(4) + "."
        asr = data[paths[i]] + " "
        concated = wavs[i]

        for j in range(i + 1, len(wavs)):
            total_len += len(wavs[j])
            total_len += 400  # account for the inserted silence
            if total_len > 15000:
                break
            if not paths[j] in data:
                break
            filename = filename + str(j).zfill(4) + "."
            asr = asr + data[paths[j]] + " "
            concated = concated + silence + wavs[j]

        final_fn = filename + "wav"
        data[final_fn] = asr
        concated.export(final_fn, format="wav")
        print(final_fn + " | " + str(len(concated)))

    if os.path.exists(config.alignment_path):
        backup_file(config.alignment_path)

    write_json(config.alignment_path, data)
    get_durations(data.keys(), print_detail=False)
    return 0
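# combine_wavs_batch above reads the module-level globals wavs, paths and silence
# (plus the parsed config) instead of taking them as arguments. A minimal sketch,
# assuming pydub and the "_s??.wav" split-file naming used later in this file, of
# how those globals might be prepared; the paths and the 400 ms gap here are
# illustrative assumptions, not part of the original code.
from glob import glob
from pydub import AudioSegment

paths = sorted(glob("audio/example_s??.wav"))      # hypothetical split-file layout
wavs = [AudioSegment.from_wav(p) for p in paths]   # segments to be concatenated
silence = AudioSegment.silent(duration=400)        # 400 ms gap between segments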
def get_output_base_path(load_path, eval_dirname="eval"):
    if not os.path.isdir(load_path):
        base_dir = os.path.dirname(load_path)
    else:
        base_dir = load_path
    base_dir = os.path.join(base_dir, eval_dirname)

    if os.path.exists(base_dir):
        backup_file(base_dir)
    makedirs(base_dir)

    m = re.compile(r'.*?\.ckpt\-([0-9]+)').match(load_path)
    base_path = os.path.join(
        base_dir, 'eval-%d' % int(m.group(1)) if m else 'eval')
    return base_path
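# A short usage sketch for get_output_base_path; the checkpoint path below is
# hypothetical, assuming the usual TensorFlow "<name>.ckpt-<step>" naming.
base_path = get_output_base_path("logs/son/model.ckpt-20000")
# -> "logs/son/eval/eval-20000"; the eval/ directory is created, and backed up
#    first if it already exists.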
def _roll_back(change, settings):
    update_id = change['update_id']
    tmp_dir = path.join('/tmp', update_id)
    logger.info("Sanitizing [{}]".format(tmp_dir))
    if os.path.exists(tmp_dir):
        shutil.rmtree(tmp_dir)
    os.mkdir(tmp_dir)

    origin_zip = path.join(utils.sanitize_path(settings['backup_dir']),
                           change['change_id'] + '.zip')
    logger.info("Start unzip...")
    utils.unzip_file(tmp_dir, origin_zip)
    logger.info("Unzip finished.")

    pack_file = path.join(tmp_dir, "package-info.json")
    logger.info("reading package-info.json from [{}]".format(pack_file))
    with open(pack_file, 'r') as f:
        content = f.read()
    package = json.loads(content)

    logger.info("creating rollback change...")
    roll_back_change = create_roll_back_change(change, package)
    logger.info("creation of rollback change finished")

    logger.info("start backing up files...")
    backup_location = utils.backup_file(roll_back_change, settings)
    logger.info("back up files finished")

    for file in roll_back_change['files_to_update']:
        logger.info("Performing rollback for file [{}]...".format(file))
        utils.do_single_file_move(tmp_dir, file, backup_location)
        logger.info("Performing rollback for file finished. [{}]".format(file))

    return roll_back_change
def __install_update(change, settings):
    logger.info("Backing up files for change [{}]".format(change['change_id']))
    backup_tmp_dir = utils.backup_file(change, settings)
    logger.info("backed up file is in [{}]".format(backup_tmp_dir))

    tmp_loc = '/tmp/' + change['change_id']
    logger.info("sanitizing [{}]".format(tmp_loc))
    if os.path.exists(tmp_loc):
        shutil.rmtree(tmp_loc)
    os.mkdir(tmp_loc)

    logger.info("Unzipping file...")
    utils.unzip_file(
        tmp_loc,
        os.path.join(utils.sanitize_path(settings['download_dir']),
                     change['update_id'], change['package_name']))
    logger.info("Unzip finished.")

    for file in change['files_to_update']:
        logger.info("Apply change for file [{}]".format(file))
        utils.do_single_file_move(tmp_loc, file, backup_tmp_dir)

    logger.info("cleaning [{}]".format(tmp_loc))
    shutil.rmtree(tmp_loc)
    logger.info("cleaning [{}]".format(backup_tmp_dir))
    shutil.rmtree(backup_tmp_dir)
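# Both __install_update and _roll_back read the same keys from the change dict.
# A hypothetical example of the shape they expect; the values are made up.
change = {
    "change_id": "chg-0001",          # names the install /tmp dir and the <change_id>.zip backup
    "update_id": "upd-0001",          # names the rollback /tmp dir and the download subdirectory
    "package_name": "app-1.2.3.zip",  # package file inside the download directory
    "files_to_update": ["bin/app", "conf/app.yaml"],
}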
    return results


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--audio_pattern', required=True)
    parser.add_argument('--recognition_filename', default="recognition.json")
    parser.add_argument('--sample_rate', default=16000, type=int)
    parser.add_argument('--pre_silence_length', default=1, type=int)
    parser.add_argument('--post_silence_length', default=1, type=int)
    parser.add_argument('--max_duration', default=60, type=int)
    config, unparsed = parser.parse_known_args()

    audio_dir = os.path.dirname(config.audio_pattern)

    for tmp_path in glob(os.path.join(audio_dir, "*.tmp.*")):
        remove_file(tmp_path)

    paths = glob(config.audio_pattern)
    paths.sort()

    results = text_recognition_batch(paths, config)

    base_dir = os.path.dirname(audio_dir)
    recognition_path = \
        os.path.join(base_dir, config.recognition_filename)

    if os.path.exists(recognition_path):
        backup_file(recognition_path)

    write_json(recognition_path, results)
        results.update(item)

    found_count = sum([type(value) == str for value in results.values()])
    # Multiply by 100 so the "%" in the message matches the printed value.
    print(" [*] # found: {:.5f}% ({}/{})".format(
        100. * len(results) / len(data), len(results), len(data)))
    print(" [*] # exact match: {:.5f}% ({}/{})".format(
        100. * found_count / len(items), found_count, len(items)))

    return results


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--recognition_path', required=True)
    parser.add_argument('--alignment_filename', default="alignment.json")
    parser.add_argument('--score_threshold', default=0.4, type=float)
    parser.add_argument('--recognition_encoding', default='utf-8')
    config, unparsed = parser.parse_known_args()

    results = align_text_batch(config)

    base_dir = os.path.dirname(config.recognition_path)
    alignment_path = \
        os.path.join(base_dir, config.alignment_filename)

    if os.path.exists(alignment_path):
        backup_file(alignment_path)

    write_json(alignment_path, results)
    duration = get_durations(results.keys(), print_detail=False)
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('--audio_pattern', required=True)
    parser.add_argument('--alignment_path', required=True)
    parser.add_argument('--out_ext', default='wav')
    parser.add_argument('--method', choices=['librosa', 'pydub'], required=True)
    config = parser.parse_args()

    data = load_json(config.alignment_path, encoding="utf8")
    audio_paths = glob(config.audio_pattern)

    for path in audio_paths:
        single_path = path.replace('.wav', '_s??.wav')
        single_paths = glob(single_path)
        combine_wavs_batch(
            single_paths, config.method,
            out_ext=config.out_ext,
        )
        # clear the module-level lists consumed by combine_wavs_batch
        wavs = []
        paths = []

    if os.path.exists(config.alignment_path):
        backup_file(config.alignment_path)

    write_json(config.alignment_path, data)
    get_durations(data.keys(), print_detail=False)
def backup_files(self, backup_path, dry_mode=False, pbar=None):
    backup_file(backup_path, self.folder, self.dbf_file_name,
                dry_mode=dry_mode, pbar=pbar)
    backup_file(backup_path, self.folder, self.shp_file_name,
                dry_mode=dry_mode, pbar=pbar)
    backup_file(backup_path, self.folder, self.shx_file_name,
                dry_mode=dry_mode, pbar=pbar)
    self.backup_file(backup_path, dry_mode=dry_mode, pbar=pbar)
def backup_file(self, backup_path, dry_mode=False, pbar=None):
    if self.optimization_in_progress:
        return
    backup_file(backup_path, self.folder, self.model_file,
                dry_mode=dry_mode, pbar=pbar)
def backup_file(self, backup_path, dry_mode=False, pbar=None):
    backup_file(backup_path, self.folder, self.definition_file,
                dry_mode=dry_mode, pbar=pbar)
def main():
    # data downloaded from ftp://ftp.ncbi.nih.gov/pub/taxonomy/
    args = parse_args()

    nodes_df = load_nodes(args.nodes_file)
    names_df = load_names(args.names_file)
    df = nodes_df.merge(names_df, on='tax_id')
    df = df[['tax_id', 'parent_tax_id', 'rank', 'name_txt']]
    df.reset_index(drop=True, inplace=True)
    logging.info('# of tax ids: {0}'.format(df.shape[0]))

    # log summary info about the dataframe
    print('=' * 50)
    df.info(verbose=True, memory_usage="deep")
    print('=' * 50)

    # force to use global variable TAXONOMY_DICT because map doesn't allow
    # passing extra args easily
    # TAXONOMY_DICT: a dict with tax_id as the key and each record as a value.
    # example tuple items:
    # (1,
    #  {'parent_tax_id': 1, 'name_txt': 'root',
    #   'rank': 'no rank', 'tax_id': 1}
    # )
    # (16,
    #  {'parent_tax_id': 32011, 'name_txt': 'Methylophilus',
    #   'rank': 'genus', 'tax_id': 16}
    # )
    global TAXONOMY_DICT
    logging.info('generating TAXONOMY_DICT...')
    TAXONOMY_DICT = dict(zip(df.tax_id.values, df.to_dict('records')))

    ncpus = multiprocessing.cpu_count()
    logging.info('found {0} cpus, and will use all of them to find lineages '
                 'for all tax ids'.format(ncpus))
    pool = multiprocessing.Pool(ncpus)
    # takes about 18G memory
    lineages_dd = pool.map(find_lineage, df.tax_id.values)
    pool.close()

    logging.info('generating a dictionary of lineages information...')
    dd_for_df = dict(zip(range(len(lineages_dd)), lineages_dd))

    logging.info('generating lineages_df...')
    lineages_df = pd.DataFrame.from_dict(dd_for_df, orient='index')
    lineages_df.sort_values('tax_id', inplace=True)
    # # alternatively, but less useful, sort by ranks
    # lineages_df.sort_values(['superkingdom',
    #                          'phylum',
    #                          'class',
    #                          'order',
    #                          'family',
    #                          'genus',
    #                          'species'], inplace=True)

    lineages_csv_output = os.path.join('{0}.csv.gz'.format(args.output_prefix))
    backup_file(lineages_csv_output)
    logging.info("writing lineages to {0}".format(lineages_csv_output))
    with open(lineages_csv_output, 'wb') as opf:
        # make sure the name and timestamp are not gzipped, (like gzip -n)
        opf_gz = gzip.GzipFile(
            filename='',     # empty string because fileobj is given
            mode='wb',       # wb doesn't seem to work sometimes
            compresslevel=9,
            fileobj=opf,
            mtime=0,         # an optional numeric timestamp, set to be deterministic
        )

        cols = [
            'tax_id',
            'superkingdom', 'phylum', 'class', 'order', 'family',
            'genus', 'species'
        ]
        other_cols = sorted(
            [__ for __ in lineages_df.columns if __ not in cols])
        output_cols = cols + other_cols
        lineages_df.to_csv(opf_gz, index=False, columns=output_cols)
        opf_gz.close()
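# The gzipped CSV written by main() can be read back directly with pandas, which
# infers gzip compression from the .gz extension. "lineages.csv.gz" is a
# hypothetical output_prefix-based name used only for illustration.
import pandas as pd

lineages = pd.read_csv("lineages.csv.gz")
print(lineages[["tax_id", "superkingdom", "phylum", "genus", "species"]].head())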