def genome(aphidicola):
    """Yield a prepared ``Genome`` together with the log handler around it.

    The FASTA file is looked up inside ``aphidicola.path``.  The genome is
    built and its sketch, contig, assembly-size and unknown-base steps are
    run while the ``TestHandler`` is active.

    :param aphidicola: object exposing a ``path`` attribute for the directory
        holding the FASTA file
    :returns: generator yielding a ``(genome, handler)`` tuple
    """
    log_handler = TestHandler()
    fasta_name = (
        "GCA_000521565.1_Buchnera_aphidicola_G002_"
        "Myzus_persicae_Complete_Genome.fasta"
    )
    fasta_path = os.path.join(aphidicola.path, fasta_name)
    # NOTE(review): the yield is kept inside the handler context so the
    # handler is still active while the consumer runs -- confirm this matches
    # the original layout.
    with log_handler:
        prepared = Genome(fasta_path, assembly_summary)
        prepared.sketch()
        prepared.get_contigs()
        prepared.get_assembly_size()
        prepared.get_unknowns()
        yield prepared, log_handler
def genome(ctx, path, metadata):
    """
    Get information about a single genome.
    """
    # The docstring is left untouched: if this is registered as a click
    # command (decorators are not visible here), it doubles as the help text.
    # The genome is always constructed, even when nothing is printed.
    single_genome = Genome(path, ctx.assembly_summary)
    if not metadata:
        return
    click.echo(single_genome.metadata)
def __init__(self, path, max_unknowns=200, contigs=3.0, assembly_size=3.0,
             mash=3.0, assembly_summary=None, processes=1):
    """Represents a collection of genomes in `path`

    :param path: Path to the directory of related genomes you wish to analyze.
    :param max_unknowns: Number of allowable unknown bases, i.e. not [ATCG]
    :param contigs: Acceptable deviations from median number of contigs
    :param assembly_size: Acceptable deviations from median assembly size
    :param mash: Acceptable deviations from median MASH distances
    :param assembly_summary: a pandas DataFrame with assembly summary
        information
    :param processes: stored as ``self.ncpus``
    """
    join = os.path.join

    # Thresholds and other constructor arguments.
    self.max_unknowns = max_unknowns
    self.contigs = contigs
    self.assembly_size = assembly_size
    self.mash = mash
    self.assembly_summary = assembly_summary
    self.deviation_values = [max_unknowns, contigs, assembly_size, mash]
    self.ncpus = processes

    # Identity of the collection and its logger.
    self.path = os.path.abspath(path)
    self.name = os.path.basename(os.path.normpath(path))
    self.log = logbook.Logger(self.name)

    # Output layout: <path>/qc holds threshold-independent files, while
    # <path>/qc/<label> holds results specific to the chosen thresholds.
    qc_dir = join(self.path, "qc")
    self.qc_dir = qc_dir
    self.label = '-'.join(str(value) for value in self.deviation_values)
    results_dir = join(qc_dir, self.label)
    self.qc_results_dir = results_dir
    self.passed_dir = join(results_dir, "passed")
    self.stats_path = join(qc_dir, 'stats.csv')
    self.nw_path = join(qc_dir, 'tree.nw')
    self.dmx_path = join(qc_dir, 'dmx.csv')
    self.failed_path = join(results_dir, "failed.csv")
    self.tree_img = join(results_dir, "tree.svg")
    self.summary_path = join(results_dir, "summary.txt")
    self.allowed_path = join(results_dir, "allowed.p")
    self.paste_file = join(qc_dir, 'all.msh')

    # Figure out if defining these as None is necessary
    self.tree = self.stats = self.dmx = None

    # Pick up results left on disk by a previous run, when present.
    if os.path.isfile(self.stats_path):
        self.stats = pd.read_csv(self.stats_path, index_col=0)
    if os.path.isfile(self.nw_path):
        self.tree = Tree(self.nw_path, 1)
    if os.path.isfile(self.failed_path):
        # Only defined when failed.csv exists.
        self.failed_report = pd.read_csv(self.failed_path, index_col=0)
    if os.path.isfile(self.dmx_path):
        try:
            self.dmx = pd.read_csv(self.dmx_path, index_col=0, sep="\t")
        except pd.errors.EmptyDataError:
            # An empty matrix file is logged and self.dmx stays None.
            self.log.exception()
        else:
            self.log.info("Distance matrix read succesfully")

    self.metadata_path = join(qc_dir, self.name + "_metadata.csv")
    try:
        self.metadata_df = pd.read_csv(self.metadata_path,
                                       index_col="accession")
    except FileNotFoundError:
        self.metadata_df = pd.DataFrame(columns=["accession"])

    # Per-criterion bookkeeping; the tolerance values pair up with
    # deviation_values in the same order as the criteria names.
    self.criteria = ["unknowns", "contigs", "assembly_size", "distance"]
    self.tolerance = dict(zip(self.criteria, self.deviation_values))
    self.passed = self.stats
    self.failed = {}
    self.med_abs_devs = {}
    self.dev_refs = {}
    self.allowed = {"unknowns": max_unknowns}
    self.colors = dict(
        unknowns="red",
        contigs="green",
        distance="purple",
        assembly_size="orange",
    )

    self.genomes = [
        Genome.Genome(fasta, self.assembly_summary)
        for fasta in self.genome_paths
    ]
    self.assess_tree()
def genome(path, metadata):
    """ Get information about a single genome."""
    # The docstring is left untouched: if this is registered as a click
    # command (decorators are not visible here), it doubles as the help text.
    # The genome is always constructed, even when nothing is printed.
    single_genome = Genome(path)
    if not metadata:
        return
    click.echo(single_genome.metadata)
def ecoli_genome(genbank):
    """Yield a ``Genome`` for the Escherichia coli Ecol_542 FASTA file.

    :param genbank: object exposing a ``root`` attribute; the file is looked
        up in its ``Escherichia_coli`` subdirectory
    :returns: generator yielding a single ``Genome``
    """
    fasta_path = os.path.join(
        genbank.root,
        "Escherichia_coli",
        "GCA_002012025.1_Escherichia_coli_Ecol_542_Complete_Genome.fasta",
    )
    yield Genome(fasta_path, assembly_summary)