Beispiel #1
0
    def download(self):

        """ Fetch all configured sketch files into the local sketch path

        Skips files that already exist locally; returns the list of
        local database paths.
        """

        self.pl.logger.info(f'Initiating  download to: {self.sketch_path}')

        # Make sure the target directory exists before any download starts
        self.sketch_path.mkdir(parents=True, exist_ok=True)

        db_paths = []
        for file in self.sketch_files.values():
            target = self.sketch_path / file
            if target.exists():
                self.pl.logger.info(f'File exists: {target}')
            else:
                self.pl.logger.info(f'Downloading {file} from {self.base_url}')
                run_cmd(f'wget {self.base_url + file} -O {target}')
            db_paths.append(target)

        self.pl.logger.info(
            f'Downloads complete, use `sketchy db-list` to view local sketches.'
        )

        return db_paths
Beispiel #2
0
def mashdist(fasta, index, output, kmer_size, sketch_size):
    """ Experimental: compute a population graph with NetView and Mash """

    # Load the lineage index (side effects on construction, value unused here)
    li = LineageIndex(index_file=index)

    with tempfile.TemporaryDirectory() as dirname:
        workdir = Path(dirname)

        # Sketch all sequences listed in the FASTA input
        run_cmd(f'mash sketch -l -k {kmer_size} -s {sketch_size} '
                f'{fasta} -o {workdir / "lineage"}')

        sketch_file = workdir / "lineage.msh"
        dist_file = workdir / "lineage.tsv"

        # All-vs-all distances of the sketch against itself
        run_cmd(
            f'mash dist {sketch_file} {sketch_file}'
            f' > {dist_file}',
            shell=True)

        df = read_mash_pairwise(dist_file)

    # One row of pairwise distances per genome, in first-seen order
    matrix = np.stack(
        [group.dist.values for _, group in df.groupby('genome2', sort=False)],
        axis=0
    )

    np.savetxt(output, matrix, fmt="%.12f", delimiter="\t")
Beispiel #3
0
    def run(
        self,
        ranks: int = 10,
        limit: int = None,
        stable: int = 1000,
        threads: int = 4,
        palette: str = 'YlGnBu',
        image_format: str = 'pdf'
    ) -> None:

        """ Run the Sketchy Rust pipeline: compute, evaluate and plot

        :param ranks: number of top-ranking sums of shared hashes per read
        :param limit: restrict computation to the first `limit` reads
        :param stable: stability breakpoint (reads) passed to evaluation
        :param threads: threads for the compute stage
        :param palette: color palette for the evaluation plots
        :param image_format: output format of the evaluation plot
        """

        sketch, features, keys = self.get_sketch_files()

        if limit is not None:
            # FASTQ records span 4 lines, FASTA records 2. The multiplier
            # must apply to `limit` in both cases; the original expression
            # `limit* 4 if is_fastq(...) else 2` bound the conditional last,
            # truncating FASTA input to 2 lines regardless of `limit`.
            lines_per_read = 4 if is_fastq(self.fastx) else 2
            limit_pipe = f'| head -{limit * lines_per_read}'
        else:
            limit_pipe = ''

        self.logger.info(f'Sketch database: {sketch}')
        self.logger.info(f'Consensus ranks: {ranks}')
        self.logger.info(f'Read limit: {"all" if limit is None else limit}')
        self.logger.info(f'Stability breakpoint: {stable}')
        self.logger.info(f'Threads for Mash: {threads}')

        # Stream reads into the Rust compute stage; writes sums of shared hashes
        command_ssh = f'cat {self.fastx} {limit_pipe}' \
            f' | sketchy-rs compute -r {ranks} -s {sketch}' \
            f' -t {threads} -p {1 if self.verbose else 0}' \
            f' > {self.outdir / self.prefix}.ssh.tsv'

        # Evaluate the computed sums of shared hashes against the feature index
        command_sssh = f'cat {self.outdir / self.prefix}.ssh.tsv' \
            f' | sketchy-rs evaluate -f {features} -s {stable}' \
            f' > {self.outdir / self.prefix}.sssh.tsv'

        self.logger.info('Computing sum of shared hashes...')
        run_cmd(command_ssh, shell=True)

        self.logger.info('Evaluating sum of shared hashes...')
        run_cmd(command_sssh, shell=True)

        eve = Evaluation(
            sssh=Path(f"{self.outdir / self.prefix}.sssh.tsv"),
            ssh=Path(f'{self.outdir / self.prefix}.ssh.tsv'),
            index=features,
            key=keys,
            stable=stable,
            verbose=self.verbose
        )

        eve.plot_feature_evaluations(
            plot_file=Path(f'{self.outdir / self.prefix}.{image_format}'),
            break_file=Path(f'{self.outdir / self.prefix}.data.tsv'),
            color=palette, break_point=True,
        )
Beispiel #4
0
    def _parse_read_stats(self, read_file):

        """ Extract length and start time of the last read in a FASTQ file

        :param read_file: path to a FASTQ file
        :return: tuple of (sequence length, start time string or '-')
        """

        # Grab the last record (4 lines) of the file
        read = run_cmd(
            f'tail -n 4 {read_file}', shell=True
        )

        lines = read.decode("utf-8").split('\n')

        try:
            header = lines[0]
            seq = lines[1]

            try:
                timestr = re.search(self.start_time_regex, header)
                if timestr:
                    time = timestr.group(1).strip().replace('start_time=', '')
                    dtime = dateutil.parser.parse(time)
                else:
                    dtime = '-'
            except IndexError:
                # Pattern matched but has no capture group 1
                dtime = '-'

        except (IndexError, KeyError):
            # List indexing raises IndexError, never KeyError — the original
            # `except KeyError` could not trigger, so a short/empty tail
            # output would crash the caller instead of being reported.
            self.logger.info(f'Could not detect last read in: {read_file}')
            dtime, seq = '-', ''

        return len(seq), str(dtime)
Beispiel #5
0
    def sketch(
        fdir: Path,
        name: Path = Path('sketchy'),
        k: int = 15,
        size: int = 1000,
        glob: str = "*.fasta",
        ncpu: int = 4
    ) -> Path:

        """ Sketch a collection of FASTA files

        :param fdir: directory containing the FASTA files
        :param name: output sketch name prefix
        :param k: k-mer size for the sketch
        :param size: sketch size
        :param glob: glob pattern selecting files inside `fdir`
        :param ncpu: parallel sketching threads
        :return: path to the resulting `.msh` sketch file
        """

        # Coerce to Path: the original default was the *string* 'sketchy',
        # which made `name.with_suffix` fail with AttributeError; this also
        # keeps accepting string arguments from callers.
        name = Path(name)

        run_cmd(
            f'mash sketch -p {ncpu} -s {size}'
            f' -k {k} -o {name} {fdir}{os.sep}{glob}',
            shell=True
        )

        return name.with_suffix('.msh')
Beispiel #6
0
    def _get_total_reads(self, fastq: Path) -> int:

        """ Count reads in a FASTQ file as line count // 4

        :param fastq: path to the FASTQ file
        :return: total number of reads
        :raises: re-raises any failure of the count after logging
        """
        # Note: the original annotation `int or None` evaluates to just `int`;
        # the function always raises on failure, so `int` is accurate.

        try:
            out = run_cmd(f'wc -l {fastq}', shell=True)
            return int(out.decode("utf-8").strip('\n').split()[0]) // 4
        except Exception:
            # A bare `except:` would also trap SystemExit/KeyboardInterrupt
            self.logger.debug('Error in getting total read count.')
            self.logger.info(f'Could not detect total read number in {fastq}')
            raise
Beispiel #7
0
    def cut_read(nread, mode, fastq, tmpdir):

        """ Slice a FASTQ file up to (cumulative) or at (single) read `nread`

        :param nread: 0-based read index
        :param mode: 'cumulative' for all reads up to `nread`,
            'single' for exactly that read
        :param fastq: path to the FASTQ file
        :param tmpdir: directory (Path) for the sliced output file
        :return: path to the sliced FASTQ file
        :raises ValueError: on an unknown mode
        """

        # FASTQ records span 4 lines; nread is 0-based
        n = 4 * (nread + 1)

        if mode == "cumulative":
            fpath = tmpdir / f'reads_{n}.fq'
            run_cmd(
                f'head -n {n} {fastq} > {fpath}', shell=True
            )
        elif mode == "single":
            fpath = tmpdir / f'read_{n // 4}.fq'
            run_cmd(
                f'head -n {n} {fastq} | tail -4 > {fpath}', shell=True
            )
        else:
            # A bare `raise` outside an except clause produces a confusing
            # RuntimeError ('No active exception to re-raise'); raise an
            # explicit, catchable error instead.
            raise ValueError('Mode must be one of: cumulative, single')

        return fpath
Beispiel #8
0
    def check_rust_dependencies(self):

        """ Check dependency versions for Rust pipeline

        Logs the detected version of each tool and exits with status 1
        when a tool is missing from $PATH or its version output cannot
        be parsed.
        """

        self._check_dependency(
            'rustc --version', 'rustc', label='Rustc version',
            parse=lambda out: out.split()[1].strip(),
        )
        self._check_dependency(
            'mash --version', 'mash', label='Mash version',
            parse=lambda out: out.strip(),
        )
        self._check_dependency(
            'sketchy-rs --version', 'sketchy-rs', label='Sketchy Rust version',
            parse=lambda out: out.split()[1].strip(),
        )

    def _check_dependency(self, command, name, label, parse):

        """ Run a version command, log the parsed version, exit(1) on failure

        :param command: shell command printing the tool version
        :param name: tool name used in failure messages
        :param label: label used in the success log message
        :param parse: callable extracting the version from decoded output
        """

        try:
            output = run_cmd(command)
            version = parse(output.decode('utf-8'))
            self.logger.info(f'{label}: {version}')
        except FileNotFoundError:
            self.logger.info(f'Failed to run Sketchy: no `{name}` in $PATH.')
            exit(1)
        except (IndexError, KeyError):
            # `str.split()[1]` raises IndexError on short output — the
            # original `except KeyError` could never trigger.
            self.logger.info(f'Failed to parse version of: {name}')
            exit(1)
Beispiel #9
0
    def mash_dist(file, mashdb, ncpu=4, sort_by='shared'):

        """ Compute MASH distance by simply calling MASH

        :param file:
        :param mashdb:
        :param ncpu:
        :param sort_by:
        :return:
        """

        output = run_cmd(
            f'mash dist -p {ncpu} {mashdb} {file}', shell=True
        )

        # Mash emits tab-separated rows: id, file, dist, p-value, shared
        df = pandas.read_csv(
            StringIO(output.decode("utf-8")), sep='\t', header=None,
            names=[
                "id", 'file', 'dist', "p-value", "shared"
            ], index_col=False
        )

        # 'shared' arrives as "x/y"; keep only the numerator as an integer
        split_shared = df.shared.str.split('/', expand=True)
        df.shared = split_shared[0].astype(int)
        df.dist = df.dist.astype(float)

        if sort_by == 'shared':
            return df.sort_values(by=sort_by, ascending=False)
        if sort_by == 'dist':
            return df.sort_values(by=sort_by, ascending=True)

        raise ValueError(
            'MASH distance must be sorted by one of: shared, dist'
        )
Beispiel #10
0
def merge(sketch, features, key, prefix, index_column, mash_column, verbose):

    """ Merge sketch and feature data by common indices

    Writes the merged index to `{prefix}.tsv` and removes the temporary
    `{prefix}.mashinfo` file extracted from the sketch.
    """

    pl = PoreLogger(level=logging.INFO if verbose else logging.ERROR).logger

    pl.info(f'Extracting data from sketch: {sketch}')
    run_cmd(f'mash info -t {sketch} > {prefix}.mashinfo', shell=True)

    pl.info(f'Reading and converting data indices from sketch')
    # Column 2 of `mash info -t` holds the genome file path; reduce to stem
    converters = {'id': lambda x: Path(x).stem}
    mash_info = pandas.read_csv(
        f'{prefix}.mashinfo',
        sep='\t',
        header=None,
        skiprows=1,
        index_col=0,
        engine='c',
        usecols=[2],
        names=['id'],
        converters=converters,
    )

    pl.info(f'Assigning sequential indices to index column: `idx`')
    mash_info['idx'] = list(range(len(mash_info)))
    mash_info['ids'] = mash_info.index.tolist()

    nsketch = len(mash_info)

    pl.info(f'Ordered merge on column {index_column} with feature file {features}')
    d = pandas.read_csv(features, sep='\t')

    ndata = len(d)

    # (leftover debug prints of the two frames removed)
    mash_info = d.merge(
        mash_info, left_on=index_column, right_on=mash_column, how='inner'
    )
    pl.info('Merged data and sketch information')

    # Resolve suffixed duplicates created when both frames carry idx/ids
    if 'idx_y' in mash_info.columns:
        mash_info = mash_info.drop(columns="idx_x")
        mash_info = mash_info.rename(columns={'idx_y': 'idx'})

    if 'ids_y' in mash_info.columns:
        mash_info = mash_info.drop(columns=["ids_y"])

    if "ids_x" in mash_info.columns:
        mash_info = mash_info.drop(columns=["ids_x"])

    # Restore sketch order and promote the sequential index to the frame index
    mash_info = mash_info.sort_values('idx')
    mash_info.index = mash_info['idx']
    mash_info = mash_info.drop(columns='idx')

    if key is not None:
        # Replace genome UUIDs with human-readable keys from the key table
        key_table = pandas.read_csv(
            key, sep='\t', header=0
        )
        mash_info = mash_info.merge(
            key_table, left_on='ids', right_on='uuid'
        )
        mash_info.drop(columns=['uuid', 'fasta'], inplace=True)
        mash_info.rename(columns={'id': 'key'}, inplace=True)

    pl.info(f'Writing merged feature index to: {prefix}.tsv')
    mash_info.to_csv(
        f'{prefix}.tsv',
        sep='\t',
        header=True,
        index=True,
    )

    pl.info(f'Merged sketch data ({nsketch}) and feature data ({ndata})')
    pl.info(f'Final sketch and feature size is {len(mash_info)}')
    # The original computed the negative of the removed count
    pl.info(f'Removed features not present in sketch: {ndata - len(mash_info)}')
    pl.info(f'Removing temporary file {prefix}.mashinfo')
    os.remove(f'{prefix}.mashinfo')
Beispiel #11
0
    def download_blob(self, bucket_name, archive_file):

        """ Quietly fetch an archive from a Google Cloud Storage bucket """

        source = f'https://storage.googleapis.com/{bucket_name}/{archive_file.name}'
        destination = self.sketch_path / archive_file
        run_cmd(f'wget -q {source} -O {destination}')