Esempio n. 1
0
    def __init__(self, fasta_path, parent, database='silvamod', base_dir=None):
        """Prepare paths, tables, graphs and statistics for a taxonomy run.

        :param fasta_path: Path of the sequences to classify; wrapped in a FASTA object.
        :param parent: Owning object; must expose `samples` and, when no
                       `base_dir` is given, `p.crest_dir`.
        :param database: Key looked up in the `databases` lookup.
        :param base_dir: Output directory; defaults to `parent.p.crest_dir`.
        """
        # The owner is reachable under two names #
        self.otu = parent
        self.parent = parent
        # Samples are shared with the owner #
        self.samples = self.parent.samples
        # Input sequences #
        self.fasta = FASTA(fasta_path)
        # Reference database and its location #
        self.database = database
        self.database_path = databases[database]
        # An explicit directory wins over the parent's default #
        self.base_dir = base_dir if base_dir is not None else self.parent.p.crest_dir
        self.p = AutoPaths(self.base_dir, self.all_paths)
        # One graph instance per class exported by the plots module #
        graphs = []
        for cls_name in plots.__all__:
            graph_cls = getattr(plots, cls_name)
            graphs.append(graph_cls(self))
        self.graphs = graphs
        # OTU tables (raw and normalized), both tab-delimited #
        self.otu_csv = CSVTable(self.p.otu_csv, d='\t')
        self.otu_csv_norm = CSVTable(self.p.otu_csv_norm, d='\t')
        # Centers file after filtering #
        self.centers = FASTA(self.p.centers)
        # Composition tables at several levels #
        self.comp_phyla = CompositionPhyla(self, self.p.comp_phyla)
        self.comp_tips = CompositionTips(self, self.p.comp_tips)
        self.comp_order = CompositionOrder(self, self.p.comp_order)
        self.comp_class = CompositionClass(self, self.p.comp_class)
        # Statistics helper #
        self.stats = StatsOnTaxonomy(self)
Esempio n. 2
0
 def __init__(self, samples, name, base_dir=None):
     """Assemble a cluster from `samples` and create its helper objects.

     :param samples: Sample objects; each must expose `short_name`, `used` and `pool`.
     :param name: Cluster name, also used to build the default directory.
     :param base_dir: Output directory; when falsy, a directory under
                      `illumitag.view_dir` named after the cluster is used.
     """
     # Identity and members #
     self.name = name
     self.samples = samples
     self.children = samples
     # Short names of the used samples must not collide #
     used_names = [sample.short_name for sample in samples if sample.used]
     assert len(set(used_names)) == len(used_names)
     # Distinct pools, ordered by their id_name #
     distinct_pools = list(set([sample.pool for sample in self.samples]))
     distinct_pools.sort(key=lambda pool: pool.id_name)
     self.pools = distinct_pools
     # Every pool gets loaded #
     for pool in self.pools:
         pool.load()
     # Output directory #
     self.base_dir = base_dir if base_dir else illumitag.view_dir + "clusters/" + self.name + '/'
     self.p = AutoPaths(self.base_dir, self.all_paths)
     # Object that executes the pipeline #
     self.runner = ClusterRunner(self)
     # All reads of the cluster in one FASTA #
     self.reads = FASTA(self.p.all_reads_fasta)
     # The three OTU picking strategies #
     self.otu_uparse = UparseOTUs(self)
     self.otu_uclust = UclustOTUs(self)
     self.otu_cdhit = CdhitOTUs(self)
     # Report generation #
     self.reporter = ClusterReporter(self)
Esempio n. 3
0
 def __init__(self, json_path, out_dir):
     """Build a sample from its JSON description.

     :param json_path: Path of the JSON file describing the sample.
     :param out_dir: Directory prefix under which the sample directory is placed.
     :raises Exception: When one of the files listed under 'files' does not exist.
     """
     # Where things come from and where they go #
     self.out_dir = out_dir
     self.json_path = FilePath(json_path)
     # Read the description #
     with open(json_path) as handle:
         self.info = json.load(handle)
     info = self.info
     # Run and project #
     self.run_num = info['run_num']
     self.project_short_name = info['project']
     self.project_long_name = info['project_name']
     # Sample identity #
     self.num = info['sample_num']
     self.short_name = info['sample']
     self.long_name = info['sample_name']
     self.name = 'run%i_sample%i' % (self.run_num, self.num)
     self.group = info['group']
     self.id_name = "run%03d-sample%02d" % (self.run_num, self.num)
     # Every listed input file has to exist on disk #
     self.sff_files_info = info['files']
     for entry in self.sff_files_info:
         if os.path.exists(entry['path']):
             continue
         raise Exception("No file at %s" % entry['path'])
     # Automatic paths #
     self.base_dir = self.out_dir + self.id_name + '/'
     self.p = AutoPaths(self.base_dir, self.all_paths)
     # Link the JSON into the sample directory #
     self.p.info_json.link_from(self.json_path, safe=True)
     # The sample acts as its own pool and parent #
     self.pool = self
     self.parent = self
     # Placeholder values for attributes other code may read #
     self.bar_len = 0
     self.gziped = False
     self.used = True
     # Primer pattern #
     self.primer_regex = re.compile(info['primer'])
     # Raw files #
     self.raw_fasta = FASTA(self.p.raw_fasta)
     self.raw_fastq = FASTQ(self.p.raw_fastq)
     # Cleaned reads and their renamed version #
     self.reads = FASTA(self.p.reads_fasta)
     self.fasta = FASTA(self.p.renamed)
     # FASTQ version of the reads #
     self.fastq = FASTQ(self.p.reads_fastq)
     # Hack for cdhit: `renamed` points at the FASTQ #
     self.renamed = self.fastq
     # Pre-denoised special case: switched off by the trailing `and False`,
     # but the 'predenoised' key is still looked up #
     if info['predenoised'] and False:
         self.sff_files_info = []
         self.reads.link_from(info['predenoised'], safe=True)
Esempio n. 4
0
 def __init__(self, fasta_path, base_dir, parent, verbose=False):
     """Record the input FASTA and declare the files kept in `base_dir`.

     :param fasta_path: Path of the input sequences; wrapped in a FASTA object.
     :param base_dir: Directory handed to AutoPaths.
     :param parent: Owning object, stored as-is.
     :param verbose: Flag stored on the instance.
     """
     # Inputs #
     self.fasta = FASTA(fasta_path)
     self.parent = parent
     self.verbose = verbose
     # Automatic paths #
     self.base_dir = base_dir
     self.p = AutoPaths(self.base_dir, self.all_paths)
     # Size-annotated FASTA files, each named after its path attribute #
     for attr in ('derep_cluster', 'cluster_99', 'positive', 'negative'):
         setattr(self, attr, SizesFASTA(getattr(self.p, attr)))
     # Plain FASTA for the subsample #
     self.subsampled = FASTA(self.p.subsampled)
Esempio n. 5
0
    def set_size(self, length):
        """Keep only the last `length` positions of every read.

        Reads shorter than `length` are dropped. The result is written to a
        temporary FASTA which then takes the place of `self.reads`.
        """
        def keep_tail(records):
            # Emit the trailing slice of every record that is long enough #
            for record in records:
                if len(record) >= length:
                    yield record[-length:]

        # Write the trimmed reads to a temporary file #
        self.size_trimmed = FASTA(new_temp_path())
        self.size_trimmed.write(keep_tail(self.reads))
        self.size_trimmed.close()
        # Swap it in place of the original reads #
        self.reads.remove()
        shutil.move(self.size_trimmed, self.reads)
Esempio n. 6
0
 def __init__(self, parent):
     """Declare the alignment, tree and distance files kept under the parent's `unifrac_dir`.

     :param parent: Owning object; must expose `tax` and `p.unifrac_dir`.
     """
     # The owner is reachable under two names #
     self.stat = parent
     self.parent = parent
     self.tax = parent.tax
     # Automatic paths #
     self.p = AutoPaths(self.parent.p.unifrac_dir, self.all_paths)
     # One alignment file per aligner #
     for tool in ('clustalo', 'pynast', 'mothur'):
         setattr(self, tool + '_aligned', FASTA(getattr(self.p, tool + '_align')))
     # One tree file per tree builder #
     for tool in ('raxml', 'fasttree'):
         setattr(self, tool + '_tree', FilePath(getattr(self.p, tool + '_tree')))
     # Distance matrix #
     self.distances_csv = CSVTable(self.p.distances_csv)
     # NMDS graph fed with the distance table directly #
     self.nmds = NMDS(self, self.distances_csv, calc_distance=False)
Esempio n. 7
0
 def __init__(self, fasta, base_dir):
     """Store the input sequences and attach a simple composition object.

     :param fasta: A FASTA object, or anything FASTA() accepts.
     :param base_dir: Directory prefix for the automatic paths and the composition.
     """
     # Accept both ready-made FASTA objects and raw paths #
     if not isinstance(fasta, FASTA):
         fasta = FASTA(fasta)
     self.fasta = fasta
     # Automatic paths #
     self.base_dir = base_dir
     self.p = AutoPaths(self.base_dir, self.all_paths)
     # Imported inside the method rather than at module level #
     from illumitag.clustering.composition import SimpleComposition
     composition_dir = self.base_dir + 'comp_' + self.short_name + '/'
     self.composition = SimpleComposition(self, composition_dir)
Esempio n. 8
0
 def __init__(self, parent):
     """Attach the group to `parent` and declare its read files.

     :param parent: Owning object; must expose `samples`, `pool`, `cls`
                    and `p.groups_dir`.
     """
     # The owner is reachable under two names #
     self.parent = parent
     self.assemble_group = parent
     self.samples = parent.samples
     self.pool = self.parent.pool
     # Automatic paths #
     self.base_dir = parent.p.groups_dir + self.short_name + '/'
     self.p = AutoPaths(self.base_dir, self.all_paths)
     # The read class is chosen by the parent #
     read_cls = self.parent.cls
     self.orig_reads = read_cls(self.p.orig_fastq, samples=self.samples)
     self.n_filtered = self.parent.cls(self.p.n_filtered, samples=self.samples)
     # Extra files, only when the parent compares equal to 'assembled' #
     # NOTE(review): parent is an object compared against a string; verify its __eq__
     if self.parent == 'assembled':
         self.qual_filtered = FASTQ(self.p.qual_filtered, samples=self.samples)
         self.len_filtered = FASTQ(self.p.len_filtered_fastq, samples=self.samples)
         self.trimmed_barcodes = FASTA(self.p.trimmed_barcodes)
     # Hook for subclasses #
     self.load()
Esempio n. 9
0
 def __init__(self, cluster):
     """Declare the files and taxonomies of this OTU picking run.

     :param cluster: Owning cluster; must expose `samples`, `reads`
                     and `p.otus_dir`.
     """
     # The owner is reachable under two names #
     self.cluster = cluster
     self.parent = cluster
     # Shared with the owner #
     self.samples = self.parent.samples
     # Automatic paths #
     self.base_dir = self.parent.p.otus_dir + self.short_name + '/'
     self.p = AutoPaths(self.base_dir, self.all_paths)
     # The reads are the owner's FASTA #
     self.reads = self.parent.reads
     # Unfiltered outputs #
     self.all_otus = FilePath(self.p.all_otus)
     self.all_centers = FASTA(self.p.all_centers)
     # Filtered outputs, addressed by explicit file name #
     self.otus = FilePath(self.base_dir + "otus.txt")
     self.centers = FASTA(self.base_dir + "centers.fasta")
     # One taxonomy per reference database #
     silva = CrestTaxonomy(self.centers, self, 'silvamod', self.p.silva)
     self.taxonomy_silva = silva
     self.taxonomy_fw = CrestTaxonomy(self.centers, self, 'freshwater', self.p.fw_dir)
     # The silvamod one is the default #
     self.taxonomy = self.taxonomy_silva
Esempio n. 10
0
 def __init__(self, cluster):
     """Declare the files and taxonomies of this OTU picking run.

     :param cluster: Owning cluster; must expose `samples` and `p.otus_dir`.
     """
     # The owner is reachable under two names #
     self.cluster = cluster
     self.parent = cluster
     # Shared with the owner #
     self.samples = self.parent.samples
     # Automatic paths #
     self.base_dir = self.parent.p.otus_dir + self.short_name + '/'
     self.p = AutoPaths(self.base_dir, self.all_paths)
     # Here the reads are a FASTQ owned by this object #
     self.reads = FASTQ(self.p.all_reads)
     # Outputs #
     self.cdhit_clusters = FilePath(self.p.clstr)
     self.cdhit_centers = FASTA(self.p.clusters_dir + "OTU")
     self.centers = FASTA(self.p.centers)
     # One taxonomy per reference database #
     silva = CrestTaxonomy(self.centers, self, 'silvamod', self.p.silva)
     self.taxonomy_silva = silva
     freshwater = CrestTaxonomy(self.centers, self, 'freshwater', self.p.fw_dir)
     self.taxonomy_fw = freshwater
     # The silvamod one is the default #
     self.taxonomy = self.taxonomy_silva
Esempio n. 11
0
 def load(self):
     """Set up paths, trim distances and read files for this sample.

     Does nothing when the sample's info carries a truthy 'dummy' entry.
     """
     # Dummy samples are left untouched #
     if self.info.get('dummy'):
         return
     # Automatic paths inside the pool's samples directory #
     self.base_dir = self.pool.p.samples_dir + self.bar_name + '/'
     self.p = AutoPaths(self.base_dir, self.all_paths)
     self.path = str(self.p.orig_fastq)
     # Trim distances are taken from the pool's samples collection #
     self.trim_fwd = self.pool.samples.trim_fwd
     self.trim_rev = self.pool.samples.trim_rev
     # Read files #
     for attr in ('trimmed', 'renamed'):
         setattr(self, attr, FASTQ(getattr(self.p, attr)))
     self.fasta = FASTA(self.p.reads_fasta)
Esempio n. 12
0
 def __init__(self, cluster):
     """Declare the files, taxonomies and source tracking of this OTU picking run.

     :param cluster: Owning cluster; must expose `samples`, `reads`
                     and `p.otus_dir`.
     """
     # The owner is reachable under two names #
     self.cluster = cluster
     self.parent = cluster
     # Shared with the owner #
     self.samples = self.parent.samples
     # Automatic paths #
     self.base_dir = self.parent.p.otus_dir + self.short_name + '/'
     self.p = AutoPaths(self.base_dir, self.all_paths)
     # The reads are the owner's FASTA #
     self.reads = self.parent.reads
     # Size-annotated intermediates, each named after its path attribute #
     for attr in ('derep', 'sorted'):
         setattr(self, attr, SizesFASTA(getattr(self.p, attr)))
     # Final outputs #
     self.centers = FASTA(self.p.centers)
     self.readmap = UClusterFile(self.p.readmap)
     # One taxonomy per reference #
     self.taxonomy_silva = CrestTaxonomy(self.centers, self, 'silvamod', self.p.silva_dir)
     self.taxonomy_fw = CrestTaxonomy(self.centers, self, 'freshwater', self.p.fw_dir)
     # NOTE(review): the attribute is spelled 'rpd' although it holds an RdpTaxonomy
     self.taxonomy_rpd = RdpTaxonomy(self.centers, self)
     # The silvamod one is the default #
     self.taxonomy = self.taxonomy_silva
     # Source tracking #
     self.seqenv = Seqenv(self)
Esempio n. 13
0
 def __init__(self, parent, base_dir, lower_bound, upper_bound):
     """Describe one size fraction and the analyses attached to it.

     :param parent: Owning object, stored as `parent` and `fractions`.
     :param base_dir: Directory handed to AutoPaths.
     :param lower_bound: Stored as-is.
     :param upper_bound: Stored as-is.
     """
     # The owner is reachable under two names #
     self.parent = parent
     self.fractions = parent
     # Automatic paths #
     self.base_dir = base_dir
     self.p = AutoPaths(self.base_dir, self.all_paths)
     # Limits of the fraction #
     self.lower_bound, self.upper_bound = lower_bound, upper_bound
     # Reads of this fraction #
     self.reads = FASTA(self.p.reads_fasta)
     # Chimera checks, reference based and de novo #
     reads_path = self.p.reads
     self.refere = UchimeRef(reads_path, self.p.refere_dir, self)
     self.denovo = UchimeDenovo(self.p.reads, self.p.denovo_dir, self)
     # Classification with both tools #
     self.rdp = SimpleRdpTaxonomy(self.reads, self.p.rdp_dir)
     self.crest = SimpleCrestTaxonomy(self.reads, self.p.crest_dir)
Esempio n. 14
0
 def __init__(self, fasta_path, parent, base_dir=None):
     """Prepare paths, graphs, table and statistics for a taxonomy run.

     :param fasta_path: Path of the sequences to classify; wrapped in a FASTA object.
     :param parent: Owning object; must expose `samples` and, when no
                    `base_dir` is given, `p.rdp_dir`.
     :param base_dir: Output directory; defaults to `parent.p.rdp_dir`.
     """
     # The owner is reachable under two names #
     self.otu = parent
     self.parent = parent
     # Shared with the owner #
     self.samples = self.parent.samples
     # Input sequences #
     self.fasta = FASTA(fasta_path)
     # An explicit directory wins over the parent's default #
     self.base_dir = base_dir if base_dir is not None else self.parent.p.rdp_dir
     self.p = AutoPaths(self.base_dir, self.all_paths)
     # One graph instance per class exported by the plots module #
     graphs = []
     for cls_name in plots.__all__:
         graph_cls = getattr(plots, cls_name)
         graphs.append(graph_cls(self))
     self.graphs = graphs
     # OTU table #
     self.otu_csv = CSVTable(self.p.otu_csv)
     # Composition tables are disabled here #
     #self.comp_phyla = CompositionPhyla(self, self.p.comp_phyla)
     #self.comp_tips = CompositionTips(self, self.p.comp_tips)
     # Statistics helper #
     self.stats = StatsOnTaxonomy(self)
Esempio n. 15
0
 def __init__(self, path, parent):
     """Declare the files kept under the parent's `quality_dir`.

     :param path: Path of the untrimmed FASTQ input.
     :param parent: Owning pool; must expose `samples` and `p.quality_dir`.
     """
     # The owner is reachable under two names #
     self.parent = parent
     self.pool = parent
     self.samples = parent.samples
     # Automatic paths #
     self.base_dir = parent.p.quality_dir + '/'
     self.p = AutoPaths(self.base_dir, self.all_paths)
     # Input and the subset of used reads #
     self.untrimmed = FASTQ(path, samples=self.samples)
     self.only_used = FASTA(self.p.only_used, samples=self.samples)
     # Plain FASTA files: trimmed reads, then QIIME and mothur outputs #
     for attr in ('trimmed', 'qiime_fasta', 'mothur_fasta'):
         setattr(self, attr, FASTA(getattr(self.p, attr)))
     # Remaining mothur files #
     self.mothur_qual = QualFile(self.p.mothur_qual)
     self.mothur_groups = GroupFile(self.p.mothur_groups)
     # Trim distances are taken from the pool's samples collection #
     self.trim_fwd = self.pool.samples.trim_fwd
     self.trim_rev = self.pool.samples.trim_rev
Esempio n. 16
0
class Cluster(object):
    """A named group of samples that is processed, clustered and reported on together."""

    all_paths = """
    /reads/all_reads.fasta
    /otus/
    /logs/
    /metadata.csv
    """

    def __init__(self, samples, name, base_dir=None):
        """Assemble a cluster from `samples` and create its helper objects.

        :param samples: Sample objects; each must expose `short_name`, `used` and `pool`.
        :param name: Cluster name, also used to build the default directory.
        :param base_dir: Output directory; when falsy, a directory under
                         `illumitag.view_dir` named after the cluster is used.
        """
        # Identity and members #
        self.name = name
        self.samples = samples
        self.children = samples
        # Short names of the used samples must not collide #
        used_names = [sample.short_name for sample in samples if sample.used]
        assert len(set(used_names)) == len(used_names)
        # Distinct pools, ordered by their id_name #
        distinct_pools = list(set([sample.pool for sample in self.samples]))
        distinct_pools.sort(key=lambda pool: pool.id_name)
        self.pools = distinct_pools
        # Every pool gets loaded #
        for pool in self.pools:
            pool.load()
        # Output directory #
        self.base_dir = base_dir if base_dir else illumitag.view_dir + "clusters/" + self.name + '/'
        self.p = AutoPaths(self.base_dir, self.all_paths)
        # Object that executes the pipeline #
        self.runner = ClusterRunner(self)
        # All reads of the cluster in one FASTA #
        self.reads = FASTA(self.p.all_reads_fasta)
        # The three OTU picking strategies #
        self.otu_uparse = UparseOTUs(self)
        self.otu_uclust = UclustOTUs(self)
        self.otu_cdhit = CdhitOTUs(self)
        # Report generation #
        self.reporter = ClusterReporter(self)

    # ------------------------- Container protocol ------------------------- #
    def __repr__(self):
        template = '<%s object "%s" with %i samples>'
        return template % (self.__class__.__name__, self.name, len(self.samples))

    def __len__(self):
        return len(self.samples)

    def __iter__(self):
        return iter(self.samples)

    def __getitem__(self, key):
        return self.samples[key]

    # ----------------------------- Properties ----------------------------- #
    @property
    def first(self):
        """The first child of the cluster."""
        return self.children[0]

    @property
    def count_seq(self):
        """Sum of the lengths of all samples."""
        total = 0
        for sample in self:
            total += len(sample)
        return total

    @property
    def metadata(self):
        """A DataFrame with one row per sample, built from each sample's `info`."""
        rows = [sample.info for sample in self]
        labels = [sample.short_name for sample in self]
        return pandas.DataFrame(rows, index=labels)

    # ------------------------------ Actions ------------------------------- #
    def run(self, *args, **kwargs):
        """Forward everything to the runner."""
        self.runner.run(*args, **kwargs)

    def run_slurm(self, *args, **kwargs):
        """Forward everything to the runner's SLURM entry point."""
        self.runner.run_slurm(*args, **kwargs)

    def run_uparse(self):
        """Run the UPARSE OTU picking."""
        self.otu_uparse.run()

    def process_samples(self):
        """Call `process()` on every sample, with a progress bar."""
        for sample in tqdm(self):
            sample.process()

    def combine_reads(self):
        """Concatenate the FASTA of every sample into `self.reads` with `cat`."""
        sources = ' '.join([sample.fasta.path for sample in self])
        shell_output('cat %s > %s' % (sources, self.reads))

    def set_size(self, length):
        """Keep only the last `length` positions of every read.

        Reads shorter than `length` are dropped. The result is written to a
        temporary FASTA which then takes the place of `self.reads`.
        """
        def keep_tail(records):
            # Emit the trailing slice of every record that is long enough #
            for record in records:
                if len(record) >= length:
                    yield record[-length:]

        # Write the trimmed reads to a temporary file #
        self.size_trimmed = FASTA(new_temp_path())
        self.size_trimmed.write(keep_tail(self.reads))
        self.size_trimmed.close()
        # Swap it in place of the original reads #
        self.reads.remove()
        shutil.move(self.size_trimmed, self.reads)

    def export_metadata(self):
        """Write the metadata table as tab-separated UTF-8 text."""
        self.metadata.to_csv(self.p.metadata, sep='\t', encoding='utf-8')
Esempio n. 17
0
class Pyrosample(object):
    """A Pyrosample is a legacy object for the few 454 samples we still have and that we need to compare against the new Illumina technology."""

    all_paths = """
    /info.json
    /reads.fasta
    /renamed.fasta
    /raw/raw.sff
    /raw/raw.fastq
    /raw/raw.fasta
    /raw/raw.qual
    /raw/manifest.txt
    /fastq/reads.fastq
    """

    def __repr__(self):
        return '<%s object "%s">' % (self.__class__.__name__, self.id_name)

    def __init__(self, json_path, out_dir):
        """Build a sample from its JSON description.

        :param json_path: Path of the JSON file describing the sample.
        :param out_dir: Directory prefix under which the sample directory is placed.
        :raises Exception: When one of the files listed under 'files' does not exist.
        """
        # Attributes #
        self.out_dir = out_dir
        self.json_path = FilePath(json_path)
        # Parse #
        with open(json_path) as handle:
            self.info = json.load(handle)
        # Basic #
        self.run_num = self.info['run_num']
        self.project_short_name = self.info['project']
        self.project_long_name = self.info['project_name']
        # Own attributes #
        self.num = self.info['sample_num']
        self.short_name = self.info['sample']
        self.long_name = self.info['sample_name']
        self.name = 'run%i_sample%i' % (self.run_num, self.num)
        self.group = self.info['group']
        self.id_name = "run%03d-sample%02d" % (self.run_num, self.num)
        # SFF files: every listed file has to exist on disk #
        self.sff_files_info = self.info['files']
        for f in self.sff_files_info:
            if not os.path.exists(f['path']):
                raise Exception("No file at %s" % f['path'])
        # Automatic paths #
        self.base_dir = self.out_dir + self.id_name + '/'
        self.p = AutoPaths(self.base_dir, self.all_paths)
        # Make an alias to the json #
        self.p.info_json.link_from(self.json_path, safe=True)
        # Pool dummy: the sample acts as its own pool and parent #
        self.pool, self.parent = self, self
        # Other dummy variables #
        self.bar_len = 0
        self.gziped = False
        self.used = True
        # Primer #
        self.primer_regex = re.compile(self.info['primer'])
        # Raw files #
        self.raw_fasta = FASTA(self.p.raw_fasta)
        self.raw_fastq = FASTQ(self.p.raw_fastq)
        # Standard FASTA #
        self.reads = FASTA(self.p.reads_fasta)
        self.fasta = FASTA(self.p.renamed)
        # Special FASTQ #
        self.fastq = FASTQ(self.p.reads_fastq)
        # A shameless hack for cdhit to work #
        self.renamed = self.fastq
        # Pre-denoised special case: deliberately switched off by the
        # trailing `and False`; the 'predenoised' key is still looked up #
        if self.info['predenoised'] and False:
            self.sff_files_info = []
            self.reads.link_from(self.info['predenoised'], safe=True)

    def load(self):
        """Nothing to load; present so the sample can stand in for a pool."""
        pass

    def extract(self):
        """Dump sequences, qualities and manifest from the SFF file, then build the raw FASTQ."""
        # Call extraction #
        shell_output('sffinfo -s %s > %s' % (self.p.raw_sff, self.p.raw_fasta))
        shell_output('sffinfo -q %s > %s' % (self.p.raw_sff, self.p.raw_qual))
        shell_output('sffinfo -m %s > %s' % (self.p.raw_sff, self.p.manifest))
        # Convert #
        sh.fasta_to_fastq(self.p.raw_fasta, self.p.raw_qual, self.p.raw_fastq)

    def clean_iterator(self,
                       reads,
                       minlength=400,
                       threshold=21,
                       windowsize=20):
        """Yield the reads that pass every filter, primer removed and reverse complemented.

        :param reads: Iterable of reads carrying "phred_quality" letter annotations.
        :param minlength: Reads shorter than this are dropped, before and after
                          quality truncation.
        :param threshold: A read is truncated at the first window whose averaged
                          quality falls below this value.
        :param windowsize: Window width handed to `moving_average`.
        """
        for read in reads:
            # Length #
            if len(read) < minlength: continue
            # Primer: reads without a match are dropped #
            match = self.primer_regex.search(str(read.seq))
            if not match: continue
            # PHRED score: cut at the first bad window #
            scores = read.letter_annotations["phred_quality"]
            averaged = moving_average(scores, windowsize)
            discard = False
            for i, value in enumerate(averaged):
                if value < threshold:
                    read = read[:i + windowsize - 1]
                    if len(read) < minlength: discard = True
                    break
            if discard: continue
            # Undetermined bases #
            if 'N' in read: continue
            # Remove primer: keep what follows the end of the match #
            read = read[match.end():]
            # Flip them because 454 reads the other end #
            read = read.reverse_complement()
            # Return #
            yield read

    def clean(self, **kwargs):
        """Write the cleaned raw reads to `self.reads`; kwargs go to `clean_iterator`."""
        self.reads.write(self.clean_iterator(self.raw_fastq, **kwargs))

    def _loss_percent(self):
        """Return the percentage of raw reads that are absent after cleaning.

        Returns 0.0 when there are no raw reads, to avoid dividing by zero.
        """
        before = len(self.raw_fastq)
        after = len(self.reads)
        if before == 0: return 0.0
        # float() forces true division under Python 2 as well #
        return 100 * (1 - float(after) / before)

    def report_loss(self):
        """Print the read counts before and after cleaning and the percentage lost."""
        # Single-argument print calls behave the same on Python 2 and 3 #
        print("Before cleaning: %i" % len(self.raw_fastq))
        print("After cleaning: %i" % len(self.reads))
        print("Loss: %.2f%%" % self._loss_percent())

    def process(self):
        """Write a copy of the reads, renamed after the sample, to `self.fasta`."""
        self.reads.rename_with_num(self.name + '_read', new_path=self.fasta)

    def make_fastq(self, **kwargs):
        """In some special cases we want the FASTQ"""
        self.fastq.write(self.clean_iterator(self.raw_fastq, **kwargs))
        self.fastq.rename_with_num(self.name + '_read')
        print("make_fastq for sample %s completed" % self.id_name)
Esempio n. 18
0
# Internal modules #
import illumitag
from illumitag.fasta.single import FASTA

# Third party modules #
import sh

# Constants #
# NOTE(review): `os` is used below but no `import os` is visible in this
# fragment -- confirm that it is imported somewhere above.
# Every reference file is addressed relative to the user's home directory #
home = os.environ['HOME'] + '/'
silvamod_path = home + 'glob/16s/silvamod.fasta'
amplified_path = home + 'glob/16s/silvamod_v3_v4.fasta'
aligned_path = home + 'glob/16s/silvamod_v3_v4.align'

# Objects #
# Each of the three paths wrapped in a FASTA object, created at import time #
silvamod = FASTA(silvamod_path)
amplified = FASTA(amplified_path)
aligned = FASTA(aligned_path)


###############################################################################
def amplify():
    """A function to parse the silvamod 16S database and find the primers within
    the full-length sequences to determine the probable length of our amplified
    region."""
    primers = illumitag.pools[0].primers
    bar_len = illumitag.pools[0].bar_len
    counts = {
        'success': 0,
        'only_fwd': 0,
        'only_rev': 0,
Esempio n. 19
0
class PrimerGroup(object):
    """A set of sequences that share one primer outcome (and one assembly outcome)."""

    all_paths = """
    /orig.fastq
    /n_filtered.fastq
    /qual_filtered.fastq
    /len_filtered.fastq
    /trimmed_barcodes.fasta
    """

    def __init__(self, parent):
        """Attach the group to `parent` and declare its read files.

        :param parent: Owning object; must expose `samples`, `pool`, `cls`
                       and `p.groups_dir`.
        """
        # The owner is reachable under two names #
        self.parent = parent
        self.assemble_group = parent
        self.samples = parent.samples
        self.pool = self.parent.pool
        # Automatic paths #
        self.base_dir = parent.p.groups_dir + self.short_name + '/'
        self.p = AutoPaths(self.base_dir, self.all_paths)
        # The read class is chosen by the parent #
        self.orig_reads = self.parent.cls(self.p.orig_fastq, samples=self.samples)
        self.n_filtered = self.parent.cls(self.p.n_filtered, samples=self.samples)
        # Extra files, only when the parent compares equal to 'assembled' #
        # NOTE(review): parent is an object compared against a string; verify its __eq__
        if self.parent == 'assembled':
            self.qual_filtered = FASTQ(self.p.qual_filtered, samples=self.samples)
            self.len_filtered = FASTQ(self.p.len_filtered_fastq, samples=self.samples)
            self.trimmed_barcodes = FASTA(self.p.trimmed_barcodes)
        # Hook for subclasses #
        self.load()

    def __repr__(self):
        return '<%s object of %s>' % (self.__class__.__name__, self.parent)

    def __len__(self):
        return len(self.orig_reads)

    def load(self):
        """Hook called at the end of __init__; does nothing here."""
        pass

    # The next three simply forward to the original reads file #
    def create(self):
        self.orig_reads.create()

    def add_read(self, read):
        self.orig_reads.add_read(read)

    def close(self):
        self.orig_reads.close()

    def n_filter(self):
        """Called from AssembleGroup.discard_reads_with_n"""
        def without_n(records):
            # Pass on only the records that contain no 'N' #
            for record in records:
                if 'N' not in record:
                    yield record
        self.n_filtered.write(without_n(self.orig_reads))

    def qual_filter(self):
        """Called from Assemble.quality_filter"""
        def good_quality(records, threshold=5, windowsize=10):
            # Drop a record as soon as one averaged window is below the threshold #
            for record in records:
                qualities = record.letter_annotations["phred_quality"]
                averaged = moving_average(qualities, windowsize)
                too_low = [value < threshold for value in averaged]
                if not any(too_low):
                    yield record
        self.qual_filtered.write(good_quality(self.n_filtered))

    def len_filter(self):
        """Called from Assemble.length_filter"""
        def long_enough(records, min_length=400):
            # Pass on only the records of at least min_length #
            for record in records:
                if len(record) >= min_length:
                    yield record
        self.len_filtered.write(long_enough(self.qual_filtered))

    def trim_bc(self):
        """Called from Assemble.trim_barcodes"""
        # Without barcodes a plain format conversion is enough #
        if self.pool.bar_len == 0:
            self.len_filtered.to_fasta(self.trimmed_barcodes)
            return

        def without_barcodes(records):
            # Cut bar_len positions off both ends of every record #
            for record in records:
                yield record[self.pool.bar_len:-self.pool.bar_len]
        self.trimmed_barcodes.write(without_barcodes(self.len_filtered))
Esempio n. 20
0
 def load(self):
     """Point the group at the unassembled directory and declare its FASTA files."""
     # Reads of this group are plain FASTA #
     self.cls = FASTA
     # Automatic paths inside the outcome's unassembled directory #
     self.base_dir = self.outcome.p.unassembled_dir
     auto_paths = AutoPaths(self.base_dir, self.all_paths)
     self.p = auto_paths
     self.path = auto_paths.orig_fasta
     # Flipped reads, built with the samples and primers of this object #
     self.flipped_reads = FASTA(auto_paths.flipped, self.samples, self.primers)
Esempio n. 21
0
# Make fraction graph #
# Plots the last graph attached to the project #
proj.graphs[-1].plot()

# Get statistics #
# NOTE(review): bare attribute access with no assignment -- presumably meant
# to be evaluated in an interactive session; confirm.
proj.reporter.fraction_discarded

# Get clustering values #
# The unpacking requires the project to span exactly two distinct runs #
r1, r2 = list(set([p.run for p in proj]))
r1.parse_report_xml()
r2.parse_report_xml()
# Ratio of the 'DensityPF' to the 'DensityRaw' value of the forward read, per run #
print float(r1.report_stats['fwd']['DensityPF']) / float(r1.report_stats['fwd']['DensityRaw'])
print float(r2.report_stats['fwd']['DensityPF']) / float(r2.report_stats['fwd']['DensityRaw'])

# Check below 400 bp sequences #
folder = DirectoryPath(illumitag.projects['evaluation'].base_dir + "below_400/")
over = FASTA(folder + "reads.fasta")
def over_iterator(reads, max_length=400):
    """Yield only the reads whose length is at most `max_length` (inclusive)."""
    for read in reads:
        if len(read) <= max_length: yield read
# Collect the short reads of every pool into one file #
# NOTE(review): `pools` is not defined in this fragment -- confirm where it comes from.
over.create()
for pool in pools: over.add_iterator(over_iterator(pool.good_barcodes.assembled.good_primers.qual_filtered))
over.close()
over.graphs[-1].plot()
# Classify the short reads with CREST and plot the composition #
crest = SimpleCrestTaxonomy(over, folder)
crest.assign()
crest.composition.graph.plot()
# Same with RDP, using the same folder #
rdp = SimpleRdpTaxonomy(over, folder)
rdp.assign()
rdp.composition.graph.plot()
Esempio n. 22
0
class UclustOTUs(OTUs):
    """Will use uclust via the qiime wrapper to create OTU clusters from a given FASTA file
    http://qiime.org/scripts/pick_otus.html"""

    short_name = 'uclust'
    title = 'UCLUST-QIIME denovo picking'

    all_paths = """
    /clusters/clusters.uc
    /clusters/qiime.log
    /clusters/all_otus.txt
    /clusters/all_centers.fasta
    /centers.fasta
    /otus.txt
    /taxonomy_silva/
    /taxonomy_fw/
    /graphs/
    """

    def __repr__(self): return '<%s object of %s>' % (self.__class__.__name__, self.parent)

    def __init__(self, cluster):
        """Declare the files and taxonomies of this OTU picking run.

        :param cluster: Owning cluster; must expose `samples`, `reads`
                        and `p.otus_dir`.
        """
        # Save parent #
        self.cluster, self.parent = cluster, cluster
        # Inherited #
        self.samples = self.parent.samples
        # Paths #
        self.base_dir = self.parent.p.otus_dir + self.short_name + '/'
        self.p = AutoPaths(self.base_dir, self.all_paths)
        # Main FASTA file #
        self.reads = self.parent.reads
        # Files #
        self.all_otus = FilePath(self.p.all_otus)
        self.all_centers = FASTA(self.p.all_centers)
        self.otus = FilePath(self.base_dir + "otus.txt")
        self.centers = FASTA(self.base_dir + "centers.fasta")
        # Taxonomy #
        self.taxonomy_silva = CrestTaxonomy(self.centers, self, 'silvamod', self.p.silva)
        self.taxonomy_fw = CrestTaxonomy(self.centers, self, 'freshwater', self.p.fw_dir)
        # Preferred one #
        self.taxonomy = self.taxonomy_silva

    def run(self):
        """Pick OTUs with QIIME's uclust wrapper, then drop the singleton OTUs.

        Produces the unfiltered OTU map and centers as well as the filtered
        `otus` and `centers` files. Sets `self.otus_to_keep` to the list of
        OTU names that survive the filtering.
        """
        # Clean #
        shutil.rmtree(self.p.clusters_dir)
        # Run command #
        pick_otus = sh.Command('pick_otus.py')
        pick_otus('-m', 'uclust', '-i', self.reads, '-o', self.p.clusters_dir)
        # Move into place #
        base_name = self.p.clusters_dir + self.reads.prefix
        shutil.move(base_name + '_otus.txt', self.all_otus)
        shutil.move(base_name + '_otus.log', self.p.qiime_log)
        shutil.move(base_name + '_clusters.uc', self.p.clusters_uc)
        # Remove OTUs that are only one read: a kept line has the OTU name
        # plus at least two reads #
        def filter_singletons(f):
            for line in f:
                line = line.split()
                if len(line) > 2: yield '\t'.join(line) + '\n'
        self.otus.writelines(filter_singletons(self.all_otus))
        # Create the centers file that is missing #
        pick_rep = sh.Command('pick_rep_set.py')
        pick_rep('-i', self.all_otus, '-f', self.reads, '-o', self.all_centers)
        # Remake the centers file without the filtered OTUs #
        self.otus_to_keep = [line.split()[0] for line in self.otus]
        # A set makes the per-sequence membership test constant time;
        # the public attribute stays a list #
        wanted = set(self.otus_to_keep)
        def filter_otus(f):
            for seq in f:
                if seq.id in wanted: yield seq
        self.centers.write(filter_otus(self.all_centers))

    @property_cached
    def cluster_counts_table(self):
        """Create the unfiltered OTU table.

        Returns an integer DataFrame built from an {otu: {sample: count}}
        mapping, with missing cells set to 0 and the columns in natural order.
        Raises IndexError for a read name that matches neither naming scheme.
        """
        # Put results in a dict of dicts #
        result = defaultdict(lambda: defaultdict(int))
        # Compile both read-name patterns once, outside the loops #
        pooled_read = re.compile(r"run([0-9]+)_pool([0-9]+)_sample([0-9]+)_read([0-9]+)")
        unpooled_read = re.compile(r"run([0-9]+)_sample([0-9]+)_read([0-9]+)")
        # Loop #
        for line in self.otus:
            # Parse the line #
            contents = line.split()
            otu, reads = contents[0], contents[1:]
            # Parse the hits #
            for r in reads:
                nums = pooled_read.findall(r)
                if nums:
                    # Pool and sample numbers are 1-based in the read name #
                    run_num, pool_num, sample_num, read_num = map(int, nums[0])
                    sample = illumitag.runs[run_num][pool_num-1][sample_num-1]
                    name = sample.short_name
                else:
                    # Reads without a pool come from presamples or pyrosamples #
                    nums = unpooled_read.findall(r)
                    run_num, sample_num, read_num = map(int, nums[0])
                    sample = [s for s in illumitag.presamples+illumitag.pyrosamples if s.run_num==run_num and s.num==sample_num][0]
                    name = sample.short_name
                # Count #
                result[otu][name] += 1
        # Return #
        result = pandas.DataFrame(result)
        result = result.fillna(0)
        result = result.astype(int)
        # reindex(columns=...) replaces reindex_axis(..., axis=1), which newer
        # pandas versions no longer provide #
        result = result.reindex(columns=sorted(result.columns, key=natural_sort))
        return result
Esempio n. 23
0
class UparseOTUs(OTUs):
    """Will use uparse to create OTU clusters from a given FASTA file
    http://www.nature.com/doifinder/10.1038/nmeth.2604"""

    short_name = 'uparse'
    title = 'UPARSE denovo picking'

    all_paths = """
    /derep.fasta
    /sorted.fasta
    /centers.fasta
    /readmap.uc
    /taxonomy_silva/
    /taxonomy_fw/
    /taxonomy_rdp/
    /graphs/
    /seqenv/
    """

    def __repr__(self): return '<%s object of %s>' % (self.__class__.__name__, self.parent)

    def __init__(self, cluster):
        """Declare the files, taxonomies and source tracking of this OTU picking run.

        :param cluster: Owning cluster; must expose `samples`, `reads`
                        and `p.otus_dir`.
        """
        # Save parent #
        self.cluster, self.parent = cluster, cluster
        # Inherited #
        self.samples = self.parent.samples
        # Paths #
        self.base_dir = self.parent.p.otus_dir + self.short_name + '/'
        self.p = AutoPaths(self.base_dir, self.all_paths)
        # Main FASTA file #
        self.reads = self.parent.reads
        # Files #
        self.derep = SizesFASTA(self.p.derep)
        self.sorted = SizesFASTA(self.p.sorted)
        self.centers = FASTA(self.p.centers)
        self.readmap = UClusterFile(self.p.readmap)
        # Taxonomy #
        self.taxonomy_silva = CrestTaxonomy(self.centers, self, 'silvamod', self.p.silva_dir)
        self.taxonomy_fw = CrestTaxonomy(self.centers, self, 'freshwater', self.p.fw_dir)
        # NOTE(review): spelled 'rpd' although it holds an RdpTaxonomy; the
        # name is kept because other code may rely on it #
        self.taxonomy_rpd = RdpTaxonomy(self.centers, self)
        # Preferred one #
        self.taxonomy = self.taxonomy_silva
        # Source tracking #
        self.seqenv = Seqenv(self)

    def run(self, threshold=3.0):
        """Run the usearch7 pipeline: dereplicate, sort, cluster, map back.

        :param threshold: OTU radius in percent, passed as `-otu_radius_pct`.
                          The identity used for mapping the reads back is
                          derived from it as (100 - threshold) / 100.
        """
        # Dereplicate #
        sh.usearch7("--derep_fulllength", self.reads, '-output', self.derep, '-sizeout')
        # Order by size and kill singletons #
        sh.usearch7("--sortbysize", self.derep, '-output', self.sorted, '-minsize', 2)
        # Compute the centers #
        sh.usearch7("--cluster_otus", self.sorted, '-otus', self.centers, '-otu_radius_pct', threshold)
        # Rename the centers #
        self.centers.rename_with_num('OTU_')
        # Map the reads back to the centers #
        # 100.0 forces true division: with an integer threshold Python 2
        # would otherwise truncate the identity to 0 #
        identity = (100.0 - threshold) / 100
        sh.usearch7("-usearch_global", self.reads, '-db', self.centers, '-strand', 'plus', '-id', identity, '-uc', self.readmap)

    def checks(self):
        """Assert that the dereplicated file and the read map are as long as the reads."""
        assert len(self.reads) == len(self.derep)
        assert len(self.reads) == len(self.readmap)

    @property_cached
    def cluster_counts_table(self):
        """Parse that custom output for creating the unfiltered OTU table.

        Returns an integer DataFrame built from the read map's
        `otu_sample_counts`, with missing cells set to 0 and the columns in
        natural order.
        """
        result = pandas.DataFrame(self.readmap.otu_sample_counts)
        result = result.fillna(0)
        result = result.astype(int)
        # reindex(columns=...) replaces reindex_axis(..., axis=1), which newer
        # pandas versions no longer provide #
        result = result.reindex(columns=sorted(result.columns, key=natural_sort))
        return result
Esempio n. 24
0
class CdhitOTUs(OTUs):
    """Will use cd-hit to create OTU clusters from a given FASTQ file
    http://weizhong-lab.ucsd.edu/cd-hit-otu/

    The reads of every sample in the cluster are concatenated into a single
    FASTQ file, the cd-hit-otu script is run on it, and the resulting
    cluster file is parsed to build the OTU table."""

    short_name = 'cdhit'
    title = 'CD-HIT Illumina OTU picking'

    all_paths = """
    /all_reads.fastq
    /clusters/OTU.nr2nd.clstr
    /centers.fasta
    /otus.txt
    /taxonomy_silva/
    /taxonomy_fw/
    /graphs/
    """

    # Read names as they appear in the cluster file. Compiled once here
    # instead of on every line, and written as raw strings so that `\.` is
    # not an invalid string escape sequence.
    _pooled_read = re.compile(
        r">run([0-9]+)_pool([0-9]+)_sample([0-9]+)_read([0-9]+)\.\.\.")
    _unpooled_read = re.compile(
        r">run([0-9]+)_sample([0-9]+)_read([0-9]+)\.\.\.")

    def __repr__(self):
        return '<%s object of %s>' % (self.__class__.__name__, self.parent)

    def __init__(self, cluster):
        """:param cluster: The parent object. It must provide `samples` and
                           `p.otus_dir`, and is iterated over in `run()`."""
        # Save parent #
        self.cluster, self.parent = cluster, cluster
        # Inherited #
        self.samples = self.parent.samples
        # Paths #
        self.base_dir = self.parent.p.otus_dir + self.short_name + '/'
        self.p = AutoPaths(self.base_dir, self.all_paths)
        # Main reads file here FASTQ #
        self.reads = FASTQ(self.p.all_reads)
        # Files #
        self.cdhit_clusters = FilePath(self.p.clstr)
        self.cdhit_centers = FASTA(self.p.clusters_dir + "OTU")
        self.centers = FASTA(self.p.centers)
        # Taxonomy #
        # NOTE(review): this uses `self.p.silva` while the line below uses
        # `self.p.fw_dir` -- confirm both resolve to the intended directories.
        self.taxonomy_silva = CrestTaxonomy(self.centers, self, 'silvamod',
                                            self.p.silva)
        self.taxonomy_fw = CrestTaxonomy(self.centers, self, 'freshwater',
                                         self.p.fw_dir)
        # Preferred one #
        self.taxonomy = self.taxonomy_silva

    def run(self):
        """Concatenate the reads, run the cd-hit script and write the
        renamed centers file."""
        # Combine reads but in fastq format this time #
        paths = [sample.renamed for sample in self.cluster]
        shell_output('cat %s > %s' % (' '.join(paths), self.reads))
        # Clean #
        shutil.rmtree(self.p.clusters_dir)
        # Run command #
        cdhit = sh.Command(cdhit_script)
        cdhit('-i', self.reads, '-o', self.p.clusters_dir, '-p',
              TmpFile.from_string('[ACTG]'))
        # Create the centers file with good names #
        self.cdhit_centers.rename_with_num('OTU_', self.centers)

    @property_cached
    def cluster_counts_table(self):
        """Create the unfiltered OTU table.

        The cluster file is read line by line. A line starting with `>`
        opens a new cluster, and every other line is expected to contain one
        read name from which the originating sample is looked up. The result
        is a DataFrame with one column per OTU and one row per sample short
        name, holding read counts.

        A member line matching neither read name pattern raises an
        IndexError.
        """
        # Put results in a dict of dicts #
        result = defaultdict(lambda: defaultdict(int))
        # Loop #
        for line in self.cdhit_clusters:
            # Cluster header: the second token is used as the OTU number #
            if line.startswith('>'):
                otu = "OTU_%s" % line.split()[1]
                continue
            nums = self._pooled_read.findall(line)
            if nums:
                # Read coming from a pooled run #
                run_num, pool_num, sample_num, _ = map(int, nums[0])
                sample = illumitag.runs[run_num][pool_num - 1][sample_num - 1]
            else:
                # Otherwise it has to be a read without pool information #
                nums = self._unpooled_read.findall(line)
                run_num, sample_num, _ = map(int, nums[0])
                sample = [
                    s for s in illumitag.presamples + illumitag.pyrosamples
                    if s.run_num == run_num and s.num == sample_num
                ][0]
            # Count #
            result[otu][sample.short_name] += 1
        # Convert: outer keys (OTUs) become columns, inner keys become rows #
        result = pandas.DataFrame(result)
        result = result.fillna(0)
        result = result.astype(int)
        # `reindex_axis` was removed in pandas 1.0; `reindex(columns=...)`
        # is the equivalent call in old and new versions.
        result = result.reindex(
            columns=sorted(result.columns, key=natural_sort))
        # Note: no filtering is applied here, single-read OTUs are kept #
        return result
Esempio n. 25
0
# Built-in modules #
import os, shutil, glob

# Internal modules #
from illumitag.common.autopaths import AutoPaths, FilePath
from illumitag.common.cache import property_cached
from illumitag.common.csv_tables import CSVTable
from illumitag.clustering.statistics.nmds import NMDS

# Third party modules #
import sh, pandas
from illumitag.fasta.single import FASTA

# Constants #
# The current user's home directory, with a trailing slash so that relative
# paths can simply be appended to it.
home = '%s/' % os.environ['HOME']
# Silva reference sequences stored under the home directory
# (presumably the reference used for the alignment step -- TODO confirm).
reference = FASTA(
    home + 'glob/16s/silva/v111/rep_set_aligned/97_Silva_111_rep_set.fasta'
)


###############################################################################
class Unifrac(object):
    """A class to compute the Unifrac algorithm producing a distance matrix
    from a bunch of different samples and their reads.

    Step 1. Make an alignment of all the OTU centers against a reference.
    One can use:
        * clustalo
        * PyNAST
        * mothur <- fastest
        * SINA

    Step 2. From the alignment produced make a tree.
Esempio n. 26
0
class QualityReads(object):
    """A set of sequences determined to be quality controlled.

    Starting from the FASTQ file at `path`, the reads are first restricted
    to the samples that are marked as used, then stripped of their primers,
    and can finally be exported in the layouts written by
    `make_mothur_output` and `make_qiime_output`."""

    all_paths = """
    /mothur_reads.fasta
    /mothur_reads.qual
    /mothur_groups.tsv
    /qiime_reads.fasta
    /only_used_samples.fasta
    /trimmed.fasta
    """

    def __repr__(self):
        return '<%s object of %s>' % (self.__class__.__name__, self.parent)

    def __len__(self):
        """The length reported by the trimmed FASTA file."""
        return len(self.trimmed)

    def __init__(self, path, parent):
        """:param path: Location of the FASTQ file holding the input reads.
        :param parent: The owning pool. It must provide `samples`,
                       `p.quality_dir` and, for `make_qiime_output`,
                       `bar_len`."""
        # Save parent #
        self.parent, self.pool = parent, parent
        self.samples = parent.samples
        # Auto paths #
        self.base_dir = parent.p.quality_dir + '/'
        self.p = AutoPaths(self.base_dir, self.all_paths)
        # Files #
        self.untrimmed = FASTQ(path, samples=self.samples)
        self.only_used = FASTA(self.p.only_used, samples=self.samples)
        self.trimmed = FASTA(self.p.trimmed)
        # Qiime output #
        self.qiime_fasta = FASTA(self.p.qiime_fasta)
        # Mothur #
        self.mothur_fasta = FASTA(self.p.mothur_fasta)
        self.mothur_qual = QualFile(self.p.mothur_qual)
        self.mothur_groups = GroupFile(self.p.mothur_groups)
        # Primer size #
        self.trim_fwd = self.pool.samples.trim_fwd
        self.trim_rev = self.pool.samples.trim_rev

    def _without_primers(self, read):
        """Return `read` with `trim_fwd` items removed from its start and
        `trim_rev` items removed from its end."""
        # A plain `read[fwd:-rev]` turns into `read[fwd:0]` when rev is zero,
        # which is always empty. Use an open-ended slice in that case.
        end = -self.trim_rev if self.trim_rev else None
        return read[self.trim_fwd:end]

    def filter_unused(self):
        """Write to `only_used` the reads whose sample is marked as used."""
        def no_unused_iterator(reads):
            for r in reads.parse_barcodes():
                if r.first.sample.used: yield r.read

        self.only_used.write(no_unused_iterator(self.untrimmed))

    def trim_primers(self):
        """Write to `trimmed` every read of `only_used` without primers."""
        self.trimmed.write(
            self._without_primers(read) for read in self.only_used)

    def make_mothur_output(self):
        """Link the trimmed reads as the mothur FASTA file and write the
        groups file, one `read id <TAB> sample short name` line per read."""
        # Trimmed fasta #
        self.mothur_fasta.link_from(self.trimmed.path)
        # The groups file #
        self.mothur_groups.create()
        try:
            for r in self.only_used.parse_barcodes():
                sample_name = r.first.sample.short_name
                read_name = '%s\t%s\n' % (r.read.id, sample_name)
                self.mothur_groups.handle.write(read_name)
        finally:
            # Close the file even if parsing or writing fails half way #
            self.mothur_groups.close()

    def make_qiime_output(self):
        """Write the qiime FASTA file. Every read is renamed to
        `<sample>_<running number> <original id>`, gets a description
        holding its barcode sequence, and has its primers removed."""
        # Counter: how many reads were seen so far for each sample #
        counter = defaultdict(int)
        # The context manager closes the file even when a record fails #
        with open(self.qiime_fasta.path, 'w') as handle:
            # Prepare fasta writer #
            writer = FastaWriter(handle, wrap=0)
            writer.write_header()
            # Do it #
            for r in self.only_used.parse_barcodes():
                sample_name = r.first.sample.short_name
                counter[sample_name] += 1
                r.read.id = '%s_%i %s' % (sample_name, counter[sample_name],
                                          r.read.id)
                bar_seq = r.read.seq[0:self.pool.bar_len]
                r.read.description = "orig_bc=%s new_bc=%s bc_diffs=0" % (
                    bar_seq, bar_seq)
                writer.write_record(self._without_primers(r.read))
            # Close #
            writer.write_footer()
Esempio n. 27
0
 def __init__(self, json_path, out_dir):
     """Load the description of one sample from a JSON file and set up the
     paths, files and helper objects that belong to it.

     :param json_path: Path of the JSON file describing the sample.
     :param out_dir: Directory under which the sample gets its own
                     sub-directory. It is concatenated as is, so it is
                     expected to end with a slash.
     """
     # Attributes #
     self.out_dir = out_dir
     self.json_path = FilePath(json_path)
     # Parse #
     with open(json_path) as handle:
         self.info = json.load(handle)
     # Basic #
     self.account = self.info['uppmax_id']
     self.run_num = self.info['run_num']
     self.run_label = self.info['run_id']
     self.project_short_name = self.info['project']
     self.project_long_name = self.info['project_name']
     self.fwd_name = self.info['forward_reads']
     self.rev_name = self.info['reverse_reads']
     # Own attributes #
     self.num = self.info['sample_num']
     self.label = self.info['sample_id']
     self.short_name = self.info['sample']
     self.long_name = self.info['sample_name']
     self.name = 'run%i_sample%i' % (self.run_num, self.num)
     self.group = self.info['group']
     self.id_name = "run%03d-sample%02d" % (self.run_num, self.num)
     self.fwd_mid = self.info['forward_mid']
     # The reverse MID used to be read from the 'forward_mid' key, which
     # looks like a copy-paste slip. Prefer a 'reverse_mid' entry and fall
     # back to the previous value when the JSON does not provide one.
     # NOTE(review): confirm the key name against the JSON files in use.
     self.rev_mid = self.info.get('reverse_mid', self.fwd_mid)
     # Automatic paths #
     self.base_dir = self.out_dir + self.id_name + '/'
     self.p = AutoPaths(self.base_dir, self.all_paths)
     # Special #
     self.primers = TwoPrimers(self)
     # Samples dummy: a single entry describing this very sample #
     self.info['samples'] = [{
         "name": self.short_name,
         "used": 1,
         "group": self.group,
         "dummy": 1,
         "num": self.num,
         "fwd": "",
         "rev": ""
     }]
     self.samples = Samples(self)
     self.samples.load()
     # Pool dummy: this object acts as its own pool and parent #
     self.pool, self.parent = self, self
     # Files #
     self.fwd_path = home + "ILLUMITAG/INBOX/%s/%s/%s" % (
         self.run_label, self.label, self.fwd_name)
     self.rev_path = home + "ILLUMITAG/INBOX/%s/%s/%s" % (
         self.run_label, self.label, self.rev_name)
     self.gziped = self.fwd_path.endswith('gz')
     self.fwd = FASTQ(self.p.fwd)
     self.rev = FASTQ(self.p.rev)
     self.fastq = PairedFASTQ(self.fwd.path, self.rev.path, self)
     # Barcode length #
     self.bar_len = 0
     # Make an alias to the json #
     self.p.info_json.link_from(self.json_path, safe=True)
     # Assembly files as children #
     self.assembled = Assembled(self)
     self.unassembled = Unassembled(self)
     self.children = (self.assembled, self.unassembled)
     self.first = self.assembled
     # Graphs: one instance of every plot class listed in the module #
     self.graphs = [
         getattr(outcome_plots, cls_name)(self)
         for cls_name in outcome_plots.__all__
     ]
     # Runner #
     self.runner = PresampleRunner(self)
     # Final #
     self.trimmed = FASTQ(self.p.trimmed)
     self.renamed = FASTQ(self.p.renamed)
     self.fasta = FASTA(self.p.reads_fasta)