Ejemplo n.º 1
0
 def __init__(self, samples, name, base_dir=None):
     """Build a cluster from a list of samples and set up all its helpers.

     Parameters:
         samples:  the sample objects making up this cluster. Each one must
                   expose `short_name`, `used` and `pool`.
         name:     the name of the cluster, also used for the default
                   directory.
         base_dir: optional directory for the cluster's files. When empty
                   or None, `<illumitag.view_dir>clusters/<name>/` is used.

     Raises:
         AssertionError: if two used samples share the same `short_name`.
     """
     # Save samples #
     self.name = name
     self.samples, self.children = samples, samples
     # Check names are unique #
     # Raised explicitly (not via the `assert` statement) so the check still
     # runs under `python -O`, and so the message names the offenders.
     names = [s.short_name for s in samples if s.used]
     seen, duplicates = set(), set()
     for short_name in names:
         if short_name in seen:
             duplicates.add(short_name)
         seen.add(short_name)
     if duplicates:
         raise AssertionError(
             "Sample short names are not unique: %s"
             % ', '.join(sorted(str(d) for d in duplicates)))
     # Figure out pools (each one once, ordered by `id_name`) #
     self.pools = sorted(set(s.pool for s in self.samples),
                         key=lambda pool: pool.id_name)
     # Load them #
     for pool in self.pools:
         pool.load()
     # Dir #
     if base_dir:
         self.base_dir = base_dir
     else:
         self.base_dir = illumitag.view_dir + "clusters/" + self.name + '/'
     self.p = AutoPaths(self.base_dir, self.all_paths)
     # Runner #
     self.runner = ClusterRunner(self)
     # FASTA #
     self.reads = FASTA(self.p.all_reads_fasta)
     # OTU picking #
     self.otu_uparse = UparseOTUs(self)
     self.otu_uclust = UclustOTUs(self)
     self.otu_cdhit = CdhitOTUs(self)
     # Reporting #
     self.reporter = ClusterReporter(self)
Ejemplo n.º 2
0
 def load(self):
     """Run the deferred part of the initialisation and return `self`.

     Loads every pool and every sample, then creates the path helper, the
     runner, the reads FASTA, the three OTU pickers (UPARSE being the
     preferred one, exposed as `self.otus`) and both reporting objects.
     Finally sets `self.loaded` to True.
     """
     # Pools first, then samples #
     for pool in self.pools:
         pool.load()
     for sample in self.samples:
         sample.load()
     # Automatic paths inside the base directory #
     self.p = AutoPaths(self.base_dir, self.all_paths)
     # The cluster is "a project" when it holds exactly the samples of the
     # project that its first sample's pool belongs to #
     own_samples = set(self.samples)
     candidate = self.first.pool.project
     self.project = candidate if own_samples == set(candidate.samples) else None
     # Runner #
     self.runner = ClusterRunner(self)
     # Combined reads file #
     self.reads = FASTA(self.p.all_reads_fasta)
     # One OTU picker per algorithm #
     self.otu_uparse = UparseOTUs(self)
     self.otu_uclust = UclustOTUs(self)
     self.otu_cdhit = CdhitOTUs(self)
     # UPARSE is the preferred picker #
     self.otus = self.otu_uparse
     # Simple reporting, then the full report #
     self.reporter = ClusterReporter(self)
     self.report = ClusterReport(self)
     # Mark as loaded and hand back the instance for chaining #
     self.loaded = True
     return self
Ejemplo n.º 3
0
class Cluster(object):
    """Analyzes a group of samples.

    A cluster wraps a list of sample objects and exposes them through the
    sequence protocol (iteration, `len()`, indexing). On construction it
    also loads the pools the samples belong to and creates the helper
    objects used to run the pipeline, pick OTUs and report on them.
    """

    all_paths = """
    /reads/all_reads.fasta
    /otus/
    /logs/
    /metadata.csv
    """

    def __repr__(self):
        return '<%s object "%s" with %i samples>' % (
            self.__class__.__name__, self.name, len(self.samples))

    def __iter__(self):
        return iter(self.samples)

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, key):
        return self.samples[key]

    @property
    def first(self):
        """The first child (sample) of the cluster."""
        return self.children[0]

    @property
    def count_seq(self):
        """The sum of `len(sample)` over all the samples."""
        return sum(len(sample) for sample in self)

    def __init__(self, samples, name, base_dir=None):
        """Build a cluster from a list of samples and set up its helpers.

        Parameters:
            samples:  the sample objects making up this cluster. Each one
                      must expose `short_name`, `used` and `pool`.
            name:     the name of the cluster, also used for the default
                      directory.
            base_dir: optional directory for the cluster's files. When
                      empty or None, `<illumitag.view_dir>clusters/<name>/`
                      is used.

        Raises:
            AssertionError: if two used samples share a `short_name`.
        """
        # Save samples #
        self.name = name
        self.samples, self.children = samples, samples
        # Check names are unique #
        # Raised explicitly (not via the `assert` statement) so the check
        # still runs under `python -O` and the message names the offenders.
        names = [s.short_name for s in samples if s.used]
        seen, duplicates = set(), set()
        for short_name in names:
            if short_name in seen:
                duplicates.add(short_name)
            seen.add(short_name)
        if duplicates:
            raise AssertionError(
                "Sample short names are not unique: %s"
                % ', '.join(sorted(str(d) for d in duplicates)))
        # Figure out pools (each one once, ordered by `id_name`) #
        self.pools = sorted(set(s.pool for s in self.samples),
                            key=lambda pool: pool.id_name)
        # Load them #
        for pool in self.pools:
            pool.load()
        # Dir #
        if base_dir:
            self.base_dir = base_dir
        else:
            self.base_dir = illumitag.view_dir + "clusters/" + self.name + '/'
        self.p = AutoPaths(self.base_dir, self.all_paths)
        # Runner #
        self.runner = ClusterRunner(self)
        # FASTA #
        self.reads = FASTA(self.p.all_reads_fasta)
        # OTU picking #
        self.otu_uparse = UparseOTUs(self)
        self.otu_uclust = UclustOTUs(self)
        self.otu_cdhit = CdhitOTUs(self)
        # Reporting #
        self.reporter = ClusterReporter(self)

    def run(self, *args, **kwargs):
        """Forward all arguments to `self.runner.run`."""
        self.runner.run(*args, **kwargs)

    def run_slurm(self, *args, **kwargs):
        """Forward all arguments to `self.runner.run_slurm`."""
        self.runner.run_slurm(*args, **kwargs)

    def process_samples(self):
        """Call `process()` on every sample, showing a progress bar."""
        for sample in tqdm(self):
            sample.process()

    def combine_reads(self):
        """Concatenate the FASTA file of every sample into `self.reads`.

        The files are copied byte for byte, in sample order, into the path
        given by `str(self.reads)`, which is overwritten. The copy is done
        in Python rather than through a `cat ... > ...` shell command so
        that paths containing spaces or shell metacharacters work, and so
        that a cluster without samples yields an empty file instead of a
        `cat` process waiting on standard input.
        """
        paths = [sample.fasta.path for sample in self]
        with open(str(self.reads), 'wb') as combined:
            for path in paths:
                with open(path, 'rb') as handle:
                    shutil.copyfileobj(handle, combined)

    def set_size(self, length):
        """Trim all sequences to a specific length starting from the end.

        Reads shorter than `length` are dropped, all others are replaced by
        their last `length` positions. The result is written to a temporary
        FASTA which then takes the place of `self.reads`.
        """
        # NOTE(review): with standard slicing `read[-0:]` is the whole read,
        # so `length=0` would leave the reads untrimmed - confirm callers
        # never pass zero.
        self.size_trimmed = FASTA(new_temp_path())

        def trim_iterator(reads):
            for read in reads:
                if len(read) >= length:
                    yield read[-length:]

        self.size_trimmed.write(trim_iterator(self.reads))
        self.size_trimmed.close()
        # Replace the original reads by the trimmed ones #
        self.reads.remove()
        shutil.move(self.size_trimmed, self.reads)

    def run_uparse(self):
        """Run the UPARSE OTU picker."""
        self.otu_uparse.run()

    @property
    def metadata(self):
        """A DataFrame with one row per sample (its `info`), indexed by
        the samples' `short_name`."""
        return pandas.DataFrame([s.info for s in self],
                                index=[s.short_name for s in self])

    def export_metadata(self):
        """Write the metadata table, tab separated and UTF-8 encoded, to
        the cluster's metadata path."""
        self.metadata.to_csv(self.p.metadata, sep='\t', encoding='utf-8')
Ejemplo n.º 4
0
class Cluster(object):
    """Analyzes a group of samples.

    A cluster wraps a list of sample objects and exposes them through the
    sequence protocol (iteration, `len()`, indexing by position, by sample
    number or by short name). The constructor is cheap; the helper objects
    are only created once `load()` is called.
    """

    all_paths = """
    /reads/all_reads.fasta
    /otus/
    /logs/
    /report/report.pdf
    /metadata.csv
    """

    # `basestring` only exists on Python 2; fall back to `str` elsewhere so
    # that item lookup does not fail with a NameError on Python 3.
    try:
        _string_types = basestring
    except NameError:
        _string_types = str

    def __repr__(self):
        return '<%s object "%s" with %i samples>' % (
            self.__class__.__name__, self.name, len(self.samples))

    def __iter__(self):
        return iter(self.samples)

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, key):
        """Look up a child by short name, by number or by position.

        * A string selects the first child whose `short_name` equals the
          lower-cased key.
        * An integer selects the first child whose `num` equals the key,
          provided the first child has a `num` attribute.
        * Anything else (including integers when the children have no
          `num`) is used directly as an index into `self.children`.

        Raises:
            IndexError: if no child matches a string or number key.
        """
        if isinstance(key, self._string_types):
            return [c for c in self.children if c.short_name == key.lower()][0]
        if isinstance(key, int) and hasattr(self.first, 'num'):
            return [c for c in self.children if c.num == key][0]
        return self.children[key]

    @property
    def first(self):
        """The first child (sample) of the cluster."""
        return self.children[0]

    @property
    def count_seq(self):
        """The sum of `len(sample)` over all the samples."""
        return sum(len(sample) for sample in self)

    def __init__(self, samples, name, base_dir=None):
        """Record the samples and work out the pools and the directory.

        Parameters:
            samples:  the sample objects making up this cluster. Each one
                      must expose `short_name`, `used` and `pool`.
            name:     the name of the cluster, also used for the default
                      directory.
            base_dir: optional directory for the cluster's files. When
                      empty or None, `<illumitag.view_dir>clusters/<name>/`
                      is used.

        Raises:
            AssertionError: if two used samples share a `short_name`.
        """
        # Save samples #
        self.name = name
        self.samples, self.children = samples, samples
        # Check names are unique #
        # Raised explicitly (not via the `assert` statement) so the check
        # still runs under `python -O` and the message names the offenders.
        names = [s.short_name for s in samples if s.used]
        seen, duplicates = set(), set()
        for short_name in names:
            if short_name in seen:
                duplicates.add(short_name)
            seen.add(short_name)
        if duplicates:
            raise AssertionError(
                "Sample short names are not unique: %s"
                % ', '.join(sorted(str(d) for d in duplicates)))
        # Figure out pools (each one once, ordered by `id_name`) #
        self.pools = sorted(set(s.pool for s in self.samples),
                            key=lambda pool: pool.id_name)
        # Directory #
        if base_dir:
            self.base_dir = base_dir
        else:
            self.base_dir = illumitag.view_dir + "clusters/" + self.name + '/'
        # Loaded #
        self.loaded = False

    def load(self):
        """A second __init__ that is delayed and called only if needed.

        Loads the pools and samples, creates the path helper, the runner,
        the reads FASTA, the OTU pickers and the reporting objects, sets
        `self.loaded` to True and returns `self`.
        """
        # Load the pools and samples #
        for pool in self.pools:
            pool.load()
        for sample in self.samples:
            sample.load()
        # Dir #
        self.p = AutoPaths(self.base_dir, self.all_paths)
        # Figure out if it's a project: it is when the cluster holds exactly
        # the samples of the project owning the first sample's pool #
        if set(self.samples) == set(self.first.pool.project.samples):
            self.project = self.first.pool.project
        else:
            self.project = None
        # Runner #
        self.runner = ClusterRunner(self)
        # FASTA #
        self.reads = FASTA(self.p.all_reads_fasta)
        # OTU picking #
        self.otu_uparse = UparseOTUs(self)
        self.otu_uclust = UclustOTUs(self)
        self.otu_cdhit = CdhitOTUs(self)
        # Preferred #
        self.otus = self.otu_uparse
        # Simple reporting #
        self.reporter = ClusterReporter(self)
        # Full report #
        self.report = ClusterReport(self)
        # Loaded #
        self.loaded = True
        # Return self for convenience #
        return self

    def run(self, *args, **kwargs):
        """Forward all arguments to `self.runner.run`."""
        self.runner.run(*args, **kwargs)

    def run_slurm(self, *args, **kwargs):
        """Forward all arguments to `self.runner.run_slurm`."""
        self.runner.run_slurm(*args, **kwargs)

    def process_samples(self):
        """Call `process()` on every sample, showing a progress bar."""
        for sample in tqdm(self):
            sample.process()

    def combine_reads(self):
        """This is the first function you should call. It combines the
        reads of all the samples of this cluster into one big FASTA file.

        The files are copied byte for byte, in sample order, into the path
        given by `str(self.reads)`, which is overwritten. The copy is done
        in Python rather than through a `cat ... > ...` shell command so
        that paths containing spaces or shell metacharacters work, and so
        that a cluster without samples yields an empty file instead of a
        `cat` process waiting on standard input.

        Returns:
            `self.reads`, the combined reads file.
        """
        paths = [sample.fasta.path for sample in self]
        with open(str(self.reads), 'wb') as combined:
            for path in paths:
                with open(path, 'rb') as handle:
                    shutil.copyfileobj(handle, combined)
        return self.reads

    def set_size(self, length):
        """Trim all sequences to a specific length starting from the end.

        Reads shorter than `length` are dropped, all others are replaced by
        their last `length` positions. The result is written to a temporary
        FASTA which then takes the place of `self.reads`.
        """
        # NOTE(review): with standard slicing `read[-0:]` is the whole read,
        # so `length=0` would leave the reads untrimmed - confirm callers
        # never pass zero.
        self.size_trimmed = FASTA(new_temp_path())

        def trim_iterator(reads):
            for read in reads:
                if len(read) >= length:
                    yield read[-length:]

        self.size_trimmed.write(trim_iterator(self.reads))
        self.size_trimmed.close()
        # Replace the original reads by the trimmed ones #
        self.reads.remove()
        shutil.move(self.size_trimmed, self.reads)

    def run_uparse(self):
        """Run the UPARSE OTU picker."""
        self.otu_uparse.run()

    @property
    def metadata(self):
        """A DataFrame with one row per sample (its `info`), indexed by
        the samples' `short_name`."""
        return pandas.DataFrame([s.info for s in self],
                                index=[s.short_name for s in self])

    def export_metadata(self):
        """Write the metadata table, tab separated and UTF-8 encoded, to
        the cluster's metadata path."""
        self.metadata.to_csv(self.p.metadata, sep='\t', encoding='utf-8')