コード例 #1
0
ファイル: Genome_test.py プロジェクト: tfursten/GenBankQC
def genome(aphidicola):
    """Yield a measured Genome for the Buchnera aphidicola test FASTA.

    The FASTA file is looked up under ``aphidicola.path``. The Genome is
    constructed and all of its measurements are run while a ``TestHandler``
    is active, so whatever that handler captures during those calls is
    available to the consumer.

    :param aphidicola: object exposing a ``path`` attribute that points at
        the directory holding the FASTA file
    :return: generator yielding a single ``(genome, handler)`` tuple
    """
    handler = TestHandler()
    fasta_name = ("GCA_000521565.1_Buchnera_aphidicola_G002_"
                  "Myzus_persicae_Complete_Genome.fasta")
    fasta_path = os.path.join(aphidicola.path, fasta_name)
    with handler:
        measured = Genome(fasta_path, assembly_summary)
        # Run every measurement up front so the consumer receives a fully
        # populated object; the call order is kept as-is.
        measured.sketch()
        measured.get_contigs()
        measured.get_assembly_size()
        measured.get_unknowns()
        # Yield inside the ``with`` so the handler stays active while the
        # consumer uses the genome.
        yield measured, handler
コード例 #2
0
def genome(ctx, path, metadata):
    """
    Get information about a single genome.
    """
    # NOTE(review): this looks like a click command, in which case the
    # docstring above is rendered as help text -- keep its wording stable.
    # The Genome is built from ``path`` and ``ctx.assembly_summary`` whether
    # or not anything is printed.
    single_genome = Genome(path, ctx.assembly_summary)
    if not metadata:
        return
    # Only the ``metadata`` attribute is echoed, and only when requested.
    click.echo(single_genome.metadata)
コード例 #3
0
    def __init__(self,
                 path,
                 max_unknowns=200,
                 contigs=3.0,
                 assembly_size=3.0,
                 mash=3.0,
                 assembly_summary=None,
                 processes=1):
        """Represents a collection of genomes in `path`

        Stores the filtering thresholds, derives every output location under
        ``<path>/qc``, loads any results already present on disk, builds one
        ``Genome.Genome`` per entry of ``self.genome_paths`` and finally
        calls ``self.assess_tree()``.

        :param path: Path to the directory of related genomes you wish to analyze.
        :param max_unknowns: Number of allowable unknown bases, i.e. not [ATCG]
        :param contigs: Acceptable deviations from median number of contigs
        :param assembly_size: Acceptable deviations from median assembly size
        :param mash: Acceptable deviations from median MASH distances
        :param assembly_summary: a pandas DataFrame with assembly summary information
        :param processes: Stored unchanged as ``self.ncpus``
        """
        # --- thresholds and run configuration ------------------------------
        self.max_unknowns = max_unknowns
        self.contigs = contigs
        self.assembly_size = assembly_size
        self.mash = mash
        self.assembly_summary = assembly_summary
        self.deviation_values = [max_unknowns, contigs, assembly_size, mash]
        self.ncpus = processes

        # --- identity ------------------------------------------------------
        self.path = os.path.abspath(path)
        # The name comes from the path as given (normalised), not from the
        # absolute path computed above.
        self.name = os.path.basename(os.path.normpath(path))
        self.log = logbook.Logger(self.name)

        # --- output locations ----------------------------------------------
        # Files directly under qc/ do not depend on the thresholds.
        qc_dir = os.path.join(self.path, "qc")
        self.qc_dir = qc_dir
        self.stats_path = os.path.join(qc_dir, 'stats.csv')
        self.nw_path = os.path.join(qc_dir, 'tree.nw')
        self.dmx_path = os.path.join(qc_dir, 'dmx.csv')
        self.paste_file = os.path.join(qc_dir, 'all.msh')

        # Threshold-specific results live in a sub-directory labelled with
        # the four threshold values joined by dashes.
        self.label = '-'.join(str(value) for value in self.deviation_values)
        results_dir = os.path.join(qc_dir, self.label)
        self.qc_results_dir = results_dir
        self.passed_dir = os.path.join(results_dir, "passed")
        self.failed_path = os.path.join(results_dir, "failed.csv")
        self.tree_img = os.path.join(results_dir, "tree.svg")
        self.summary_path = os.path.join(results_dir, "summary.txt")
        self.allowed_path = os.path.join(results_dir, "allowed.p")

        # --- previously computed results, when present ----------------------
        # TODO: figure out if defining these as None is necessary
        self.tree = None
        self.dmx = None
        self.stats = (pd.read_csv(self.stats_path, index_col=0)
                      if os.path.isfile(self.stats_path) else None)
        if os.path.isfile(self.nw_path):
            self.tree = Tree(self.nw_path, 1)
        # ``failed_report`` is only defined when failed.csv already exists.
        if os.path.isfile(self.failed_path):
            self.failed_report = pd.read_csv(self.failed_path, index_col=0)
        if os.path.isfile(self.dmx_path):
            try:
                self.dmx = pd.read_csv(self.dmx_path, index_col=0, sep="\t")
            except pd.errors.EmptyDataError:
                # An empty matrix file is logged and ``dmx`` stays None.
                self.log.exception()
            else:
                self.log.info("Distance matrix read succesfully")

        metadata_name = "{}_metadata.csv".format(self.name)
        self.metadata_path = os.path.join(qc_dir, metadata_name)
        try:
            self.metadata_df = pd.read_csv(self.metadata_path,
                                           index_col="accession")
        except FileNotFoundError:
            # No metadata yet: start from an empty frame with the column.
            self.metadata_df = pd.DataFrame(columns=["accession"])

        # --- filtering state -------------------------------------------------
        self.criteria = ["unknowns", "contigs", "assembly_size", "distance"]
        self.tolerance = dict(
            unknowns=max_unknowns,
            contigs=contigs,
            assembly_size=assembly_size,
            distance=mash,
        )
        self.passed = self.stats
        self.failed = {}
        self.med_abs_devs = {}
        self.dev_refs = {}
        self.allowed = {"unknowns": max_unknowns}
        self.colors = dict(
            unknowns="red",
            contigs="green",
            distance="purple",
            assembly_size="orange",
        )

        # --- genomes ---------------------------------------------------------
        # ``genome_paths`` is provided elsewhere on the class.
        self.genomes = [
            Genome.Genome(fasta, self.assembly_summary)
            for fasta in self.genome_paths
        ]
        self.assess_tree()
コード例 #4
0
ファイル: __main__.py プロジェクト: wangdi2014/GenBankQC
def genome(path, metadata):
    """ Get information about a single genome."""
    # NOTE(review): this looks like a click command, in which case the
    # docstring above is rendered as help text -- keep its wording stable.
    # The Genome is built from ``path`` whether or not anything is printed.
    single_genome = Genome(path)
    if not metadata:
        return
    # Only the ``metadata`` attribute is echoed, and only when requested.
    click.echo(single_genome.metadata)
コード例 #5
0
ファイル: genome_test.py プロジェクト: wangdi2014/GenBankQC
def ecoli_genome(genbank):
    """Yield a Genome for the Escherichia coli Ecol_542 test FASTA.

    The file is looked up in the ``Escherichia_coli`` directory under
    ``genbank.root``.

    :param genbank: object exposing a ``root`` attribute
    :return: generator yielding a single ``Genome``
    """
    fasta_name = (
        "GCA_002012025.1_Escherichia_coli_Ecol_542_Complete_Genome.fasta"
    )
    fasta_path = os.path.join(genbank.root, "Escherichia_coli", fasta_name)
    yield Genome(fasta_path, assembly_summary)