def download(self):
    """Download the configured sketch files into ``self.sketch_path``.

    Files already present locally are skipped. Returns the list of local
    paths to the (now available) sketch database files.
    """
    self.pl.logger.info(f'Initiating download to: {self.sketch_path}')
    self.sketch_path.mkdir(parents=True, exist_ok=True)

    db_paths = []
    # Iterate key/value pairs directly instead of indexing self.sketch_files
    # by key inside the loop (single lookup, idiomatic)
    for sketch, file in self.sketch_files.items():
        file_path = self.sketch_path / file
        if not file_path.exists():
            self.pl.logger.info(f'Downloading {file} from {self.base_url}')
            # wget writes directly to the target path
            run_cmd(f'wget {self.base_url + file} -O {file_path}')
        else:
            self.pl.logger.info(f'File exists: {file_path}')
        # Reuse the computed path instead of rebuilding `self.sketch_path / file`
        db_paths.append(file_path)

    self.pl.logger.info(
        'Downloads complete, use `sketchy db-list` to view local sketches.'
    )
    return db_paths
def mashdist(fasta, index, output, kmer_size, sketch_size):
    """
    Experimental: compute a population graph with NetView and Mash

    Sketches `fasta` with Mash, computes the pairwise distance of the sketch
    against itself, and writes the resulting distance matrix to `output`
    as tab-delimited text.

    :param fasta: file listing / multi-FASTA input passed to `mash sketch -l`
    :param index: lineage index file passed to LineageIndex
    :param output: path for the tab-delimited distance matrix
    :param kmer_size: Mash k-mer size (-k)
    :param sketch_size: Mash sketch size (-s)
    """
    # NOTE(review): the original bound this to an unused local `li`; the call
    # is kept in case the constructor validates/loads the index — confirm
    # whether it can be removed entirely.
    LineageIndex(index_file=index)

    with tempfile.TemporaryDirectory() as dirname:
        dirpath = Path(dirname)
        run_cmd(
            f'mash sketch -l -k {kmer_size} -s {sketch_size} '
            f'{fasta} -o {dirpath / "lineage"}'
        )
        # shell=True required for the output redirection
        run_cmd(
            f'mash dist {dirpath / "lineage.msh"} {dirpath / "lineage.msh"}'
            f' > {dirpath / "lineage.tsv"}', shell=True
        )

        df = read_mash_pairwise(dirpath / "lineage.tsv")

        # One row of distances per genome; sort=False preserves file order
        matrix = np.stack(
            [
                genome_data.dist.values
                for _, genome_data in df.groupby('genome2', sort=False)
            ],
            axis=0,
        )
        np.savetxt(output, matrix, fmt="%.12f", delimiter="\t")
def run(
    self,
    ranks: int = 10,
    limit: int = None,
    stable: int = 1000,
    threads: int = 4,
    palette: str = 'YlGnBu',
    image_format: str = 'pdf'
) -> None:
    """Run the Sketchy streaming pipeline on `self.fastx`.

    Streams reads into `sketchy-rs compute` (sum of shared hashes), then
    `sketchy-rs evaluate`, and plots the feature evaluations.

    :param ranks: number of ranked sum-of-shared-hashes matches per read
    :param limit: read limit; None processes all reads
    :param stable: stability breakpoint for evaluation
    :param threads: threads for Mash
    :param palette: color palette for the evaluation plot
    :param image_format: output image format (e.g. 'pdf', 'png')
    """
    sketch, features, keys = self.get_sketch_files()

    if limit is not None:
        # FASTQ records span 4 lines, FASTA records 2. BUG FIX: the original
        # expression `limit * 4 if is_fastq(...) else 2` parsed as
        # `(limit * 4) if fastq else 2`, capping FASTA input at a single
        # sequence regardless of `limit`.
        lines_per_record = 4 if is_fastq(self.fastx) else 2
        limit_pipe = f'| head -{limit * lines_per_record}'
    else:
        limit_pipe = ''

    self.logger.info(f'Sketch database: {sketch}')
    self.logger.info(f'Consensus ranks: {ranks}')
    self.logger.info(f'Read limit: {"all" if limit is None else limit}')
    self.logger.info(f'Stability breakpoint: {stable}')
    self.logger.info(f'Threads for Mash: {threads}')

    command_ssh = f'cat {self.fastx} {limit_pipe}' \
                  f' | sketchy-rs compute -r {ranks} -s {sketch}' \
                  f' -t {threads} -p {1 if self.verbose else 0}' \
                  f' > {self.outdir / self.prefix}.ssh.tsv'

    command_sssh = f'cat {self.outdir / self.prefix}.ssh.tsv' \
                   f' | sketchy-rs evaluate -f {features} -s {stable}' \
                   f' > {self.outdir / self.prefix}.sssh.tsv'

    self.logger.info('Computing sum of shared hashes...')
    run_cmd(command_ssh, shell=True)
    self.logger.info('Evaluating sum of shared hashes...')
    run_cmd(command_sssh, shell=True)

    eve = Evaluation(
        sssh=Path(f"{self.outdir / self.prefix}.sssh.tsv"),
        ssh=Path(f'{self.outdir / self.prefix}.ssh.tsv'),
        index=features,
        key=keys,
        stable=stable,
        verbose=self.verbose
    )

    eve.plot_feature_evaluations(
        plot_file=Path(f'{self.outdir / self.prefix}.{image_format}'),
        break_file=Path(f'{self.outdir / self.prefix}.data.tsv'),
        color=palette,
        break_point=True,
    )
def _parse_read_stats(self, read_file):
    """Extract length and start time of the last read in `read_file`.

    Reads the final 4 lines (one FASTQ record) of the file and searches
    the header line for a `start_time=` timestamp.

    :param read_file: path to a FASTQ file
    :returns: tuple of (sequence length, start-time string or '-')
    """
    # last read
    read = run_cmd(
        f'tail -n 4 {read_file}', shell=True
    )
    lines = read.decode("utf-8").split('\n')
    try:
        header = lines[0]
        seq = lines[1]
        try:
            timestr = re.search(self.start_time_regex, header)
            if timestr:
                time = timestr.group(1).strip().replace('start_time=', '')
                dtime = dateutil.parser.parse(time)
            else:
                dtime = '-'
        except IndexError:
            # regex matched but capture group 1 was absent
            dtime = '-'
    except (KeyError, IndexError):
        # BUG FIX: short or empty tail output raises IndexError on
        # lines[0]/lines[1], which the original `except KeyError`
        # never caught — the fallback path was unreachable.
        self.logger.info(f'Could not detect last read in: {read_file}')
        dtime, seq = '-', ''
    return len(seq), str(dtime)
def sketch(
    fdir: Path,
    name: Path = 'sketchy',
    k: int = 15,
    size: int = 1000,
    glob: str = "*.fasta",
    ncpu: int = 4
) -> Path:
    """ Sketch a collection of FASTA files

    :param fdir: directory containing the FASTA files
    :param name: output sketch name (str or Path accepted)
    :param k: Mash k-mer size (-k)
    :param size: Mash sketch size (-s)
    :param glob: filename pattern selecting input files
    :param ncpu: threads for Mash (-p)
    :returns: path to the produced sketch with `.msh` suffix
    """
    # BUG FIX: the default `name` is a str, which has no `.with_suffix`;
    # coerce to Path so the default (and any str argument) works.
    name = Path(name)
    run_cmd(
        f'mash sketch -p {ncpu} -s {size}'
        f' -k {k} -o {name} {fdir}{os.sep}{glob}',
        shell=True
    )
    return name.with_suffix('.msh')
def _get_total_reads(self, fastq: Path) -> int:
    """Count reads in `fastq` as its line count divided by 4.

    :param fastq: path to an uncompressed FASTQ file
    :returns: total number of reads
    :raises: re-raises any failure after logging it
    """
    try:
        out = run_cmd(f'wc -l {fastq}', shell=True)
        return int(out.decode("utf-8").strip('\n').split()[0]) // 4
    except Exception:
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
        # are not intercepted; the error is still logged and re-raised.
        self.logger.debug('Error in getting total read count.')
        self.logger.info(f'Could not detect total read number in {fastq}')
        raise
def cut_read(nread, mode, fastq, tmpdir):
    """Slice reads from `fastq` up to / at read index `nread` (0-based).

    :param nread: 0-based read index
    :param mode: 'cumulative' writes the first nread+1 reads,
                 'single' writes only read number nread+1
    :param fastq: input FASTQ path
    :param tmpdir: directory (Path) for the output slice
    :returns: path to the written FASTQ slice
    :raises ValueError: on an unrecognized mode
    """
    n = 4 * (nread + 1)  # FASTQ records span 4 lines
    if mode == "cumulative":
        fpath = tmpdir / f'reads_{n}.fq'
        run_cmd(
            f'head -n {n} {fastq} > {fpath}', shell=True
        )
    elif mode == "single":
        fpath = tmpdir / f'read_{n // 4}.fq'
        run_cmd(
            f'head -n {n} {fastq} | tail -4 > {fpath}', shell=True
        )
    else:
        # BUG FIX: a bare `raise` with no active exception produces
        # `RuntimeError: No active exception to re-raise`; raise an
        # informative error instead.
        raise ValueError(f'Unknown read slicing mode: {mode}')
    return fpath
def check_rust_dependencies(self):
    """ Check dependency versions for Rust pipeline

    Verifies that `rustc`, `mash` and `sketchy-rs` are on $PATH, logs
    their versions, and exits the program if a tool is missing or its
    version output cannot be parsed.
    """
    # (command, output parser, display name) for each required tool
    checks = (
        ('rustc --version',
         lambda out: out.decode('utf-8').split()[1].strip(), 'Rustc'),
        ('mash --version',
         lambda out: out.decode('utf-8').strip(), 'Mash'),
        ('sketchy-rs --version',
         lambda out: out.decode('utf-8').split()[1].strip(), 'Sketchy Rust'),
    )
    for cmd, parse, display in checks:
        tool = cmd.split()[0]
        try:
            version = parse(run_cmd(cmd))
            self.logger.info(f'{display} version: {version}')
        except FileNotFoundError:
            self.logger.info(f'Failed to run Sketchy: no `{tool}` in $PATH.')
            exit(1)
        except (IndexError, KeyError):
            # BUG FIX: `split()[1]` on unexpected output raises IndexError,
            # not the KeyError the original caught — parse failures would
            # have crashed instead of being reported.
            self.logger.info(f'Failed to parse version of: {tool}')
            exit(1)
def mash_dist(file, mashdb, ncpu=4, sort_by='shared'):
    """ Compute MASH distance by simply calling MASH

    :param file: query file compared against the sketch database
    :param mashdb: Mash sketch database
    :param ncpu: threads passed to `mash dist -p`
    :param sort_by: 'shared' (descending) or 'dist' (ascending)
    :return: DataFrame of distance results, sorted by `sort_by`
    """
    raw = run_cmd(
        f'mash dist -p {ncpu} {mashdb} {file}', shell=True
    )

    column_names = ["id", 'file', 'dist', "p-value", "shared"]
    df = pandas.read_csv(
        StringIO(raw.decode("utf-8")),
        sep='\t',
        header=None,
        names=column_names,
        index_col=False,
    )

    # 'shared' arrives as "hits/total"; keep only the hit count as int
    hit_parts = pandas.DataFrame(
        df.shared.str.split('/').tolist(), columns=['shared', 'total']
    )
    df['shared'] = hit_parts['shared'].astype(int)
    df['dist'] = df['dist'].astype(float)

    if sort_by == 'shared':
        return df.sort_values(by=sort_by, ascending=False)
    if sort_by == 'dist':
        return df.sort_values(by=sort_by, ascending=True)
    raise ValueError(
        'MASH distance must be sorted by one of: shared, dist'
    )
def merge(sketch, features, key, prefix, index_column, mash_column, verbose):
    """ Merge sketch and feature data by common indices

    Extracts sequence identifiers from the Mash sketch (`mash info -t`),
    merges them with a tab-delimited feature table on
    `index_column`/`mash_column`, optionally attaches a key table, and
    writes the merged feature index to `{prefix}.tsv`.
    """
    pl = PoreLogger(level=logging.INFO if verbose else logging.ERROR).logger

    pl.info(f'Extracting data from sketch: {sketch}')
    run_cmd(f'mash info -t {sketch} > {prefix}.mashinfo', shell=True)

    pl.info(f'Reading and converting data indices from sketch')
    # Keep only the filename stem of each sketch entry as its identifier
    converters = {'id': lambda x: Path(x).stem}
    mash_info = pandas.read_csv(
        f'{prefix}.mashinfo',
        sep='\t',
        header=None,
        skiprows=1,
        index_col=0,
        engine='c',
        usecols=[2],
        names=['id'],
        converters=converters,
    )

    pl.info(f'Assigning sequential indices to index column: `idx`')
    mash_info['idx'] = list(range(len(mash_info)))  # simpler than a comprehension
    mash_info['ids'] = mash_info.index.tolist()
    nsketch = len(mash_info)

    pl.info(f'Ordered merge on column {index_column} with feature file {features}')
    d = pandas.read_csv(features, sep='\t')
    ndata = len(d)

    # NOTE: leftover debug print(...) statements removed
    mash_info = d.merge(
        mash_info, left_on=index_column, right_on=mash_column, how='inner'
    )
    pl.info('Merged data and sketch information')

    # Resolve column-name collisions introduced by the merge
    if 'idx_y' in mash_info.columns:
        mash_info = mash_info.drop(columns="idx_x")
        mash_info = mash_info.rename(columns={'idx_y': 'idx'})
    if 'ids_y' in mash_info.columns:
        mash_info = mash_info.drop(columns=["ids_y"])
    if "ids_x" in mash_info.columns:
        mash_info = mash_info.drop(columns=["ids_x"])

    # Restore sketch order and use the sequential index as the frame index
    mash_info = mash_info.sort_values('idx')
    mash_info.index = mash_info['idx']
    mash_info = mash_info.drop(columns='idx')

    if key is not None:
        key_table = pandas.read_csv(key, sep='\t', header=0)
        mash_info = mash_info.merge(key_table, left_on='ids', right_on='uuid')
        mash_info.drop(columns=['uuid', 'fasta'], inplace=True)
        mash_info.rename(columns={'id': 'key'}, inplace=True)

    pl.info(f'Writing merged feature index to: {prefix}.tsv')
    mash_info.to_csv(
        f'{prefix}.tsv', sep='\t', header=True, index=True,
    )

    pl.info(f'Merged sketch data ({nsketch}) and feature data ({ndata})')
    pl.info(f'Final sketch and feature size is {len(mash_info)}')
    # BUG FIX: the removed-feature count was computed as
    # len(mash_info) - ndata, which is negative whenever features were
    # actually dropped by the inner merge.
    pl.info(f'Removed features not present in sketch: {ndata - len(mash_info)}')

    pl.info(f'Removing temporary file {prefix}.mashinfo')
    os.remove(f'{prefix}.mashinfo')
def download_blob(self, bucket_name, archive_file):
    """Fetch `archive_file` from a Google Cloud Storage bucket via wget."""
    url = f'https://storage.googleapis.com/{bucket_name}/{archive_file.name}'
    target = self.sketch_path / archive_file
    run_cmd(f'wget -q {url} -O {target}')