Example #1
def combine_rerun_with_orig(self):
    """Special case for when a sample with a low read count was rerun in another pool.
    Run this just before the combine_reads() method of the associated cluster.
    This method is called on the rerun sample, not the original."""
    # Check we have a rerun #
    if self.info.get('rerun') is None: return False
    # Check we are processed #
    assert self.fasta.count > 0
    # Get the original sample #
    run, pool, num = self.info['rerun']['run'], self.info['rerun']['pool'], self.info['rerun']['num']
    orig_sample    = illumitag.runs[run][pool-1][num-1]
    merged         = FASTA(orig_sample.base_dir + 'rerun_merged.fasta')
    # Check we don't merge twice #
    assert orig_sample.count == orig_sample.fasta.count
    # Merge the original and rerun reads #
    merged.create()
    merged.add(orig_sample.fasta)
    merged.add(self.fasta)
    merged.close()
    merged.rename_with_num(orig_sample.name + '_read', orig_sample.fasta)
    merged.remove()
    # Check the merge took effect #
    orig_sample.fasta = FASTA(orig_sample.fasta.path)
    assert orig_sample.count < orig_sample.fasta.count
    return True
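
For context, here is a minimal sketch of the call order the docstring describes: fold every rerun back into its original sample, then combine reads at the cluster level. The `cluster` object and the loop are assumptions for illustration, not part of the original code.

# Hypothetical driver (names assumed for illustration):
for sample in cluster.samples:
    sample.combine_rerun_with_orig()  # returns False and does nothing if the sample is not a rerun
cluster.combine_reads()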
Example #2
class UparseOTUs(OTUs):
    """Will use uparse to create OTU clusters from a given FASTA file
    http://www.nature.com/doifinder/10.1038/nmeth.2604"""

    short_name = 'uparse'
    title = 'UPARSE denovo picking'
    article = "http://www.nature.com/doifinder/10.1038/nmeth.2604"
    version = uparse_version
    threshold = 3.0

    all_paths = """
    /derep.fasta
    /sorted.fasta
    /centers.fasta
    /readmap.uc
    /taxonomy_silva/
    /taxonomy_fw/
    /taxonomy_unite/
    /taxonomy_rdp/
    /graphs/
    /seqenv/
    """

    def __repr__(self): return '<%s object of %s>' % (self.__class__.__name__, self.parent)
    def __len__(self): return 0

    def __init__(self, cluster):
        # Save parent #
        self.cluster, self.parent = cluster, cluster
        # Inherited #
        self.samples = self.parent.samples
        # Paths #
        self.base_dir = self.parent.p.otus_dir + self.short_name + '/'
        self.p = AutoPaths(self.base_dir, self.all_paths)
        # Main FASTA file #
        self.reads = self.parent.reads
        # Files #
        self.derep = SizesFASTA(self.p.derep)
        self.sorted = SizesFASTA(self.p.sorted)
        self.centers = FASTA(self.p.centers)
        self.readmap = UClusterFile(self.p.readmap)
        # Taxonomy #
        self.taxonomy_silva = CrestTaxonomy(self.centers, self, 'silvamod',   self.p.silva_dir)
        self.taxonomy_fw    = CrestTaxonomy(self.centers, self, 'freshwater', self.p.fw_dir)
        self.taxonomy_unite = CrestTaxonomy(self.centers, self, 'unite',      self.p.unite_dir)
        self.taxonomy_rdp   = RdpTaxonomy(self.centers, self)
        # Preferred one #
        self.taxonomy = self.taxonomy_silva
        # Source tracking #
        self.seqenv = Seqenv(self)

    def run(self, threshold=None):
        # Optional threshold #
        if threshold is None: threshold = self.threshold
        identity = (100 - threshold) / 100.0  # float division even if an int threshold is passed (Python 2)
        # Dereplicate (the 32-bit usearch7 build runs out of memory, so we use fasta_make_unique instead) #
        if False: sh.usearch7("--derep_fulllength", self.reads, '-output', self.derep, '-sizeout')
        sh.fasta_make_unique(self.reads, self.derep)
        # Order by size and kill singletons #
        sh.usearch7("--sortbysize", self.derep, '-output', self.sorted, '-minsize', 2)
        # Compute the centers #
        sh.usearch7("--cluster_otus", self.sorted, '-otus', self.centers, '-otu_radius_pct', threshold)
        # Rename the centers #
        self.centers.rename_with_num('OTU-')
        # Map the reads back to the centers #
        sh.usearch7("-usearch_global", self.reads, '-db', self.centers, '-strand', 'plus', '-id', identity, '-uc', self.readmap)

    def checks(self):
        assert len(self.reads) == len(self.derep)
        assert len(self.reads) == len(self.readmap)

    @property_cached
    def cluster_counts_table(self):
        """Parse that custom output for creating the unfiltered OTU table"""
        result = pandas.DataFrame(self.readmap.otu_sample_counts)
        result = result.fillna(0)
        result = result.astype(int)
        result = result.reindex_axis(sorted(result.columns, key=natural_sort), axis=1)
        return result
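
A hypothetical usage sketch for orientation; the `cluster` argument is whatever object `__init__` above expects, and the call order follows the method definitions (this is an assumption, not part of the original snippet).

# Hypothetical usage (names assumed for illustration):
otus = UparseOTUs(cluster)
otus.run()       # default threshold of 3.0% gives a mapping identity of (100 - 3.0) / 100.0 = 0.97
otus.checks()    # every input read should appear in both derep and readmap
table = otus.cluster_counts_table  # unfiltered OTU-by-sample counts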
Example #3
class Pyrosample(object):
    """A Pyrosample is a legacy object for the few 454 samples we still have and that we need to compare against the new Illumina technology."""

    all_paths = """
    /info.json
    /reads.fasta
    /renamed.fasta
    /raw/raw.sff
    /raw/raw.fastq
    /raw/raw.fasta
    /raw/raw.qual
    /raw/manifest.txt
    /fastq/reads.fastq
    """

    kind = "pyrosample"

    def __repr__(self): return '<%s object "%s">' % (self.__class__.__name__, self.id_name)

    def __init__(self, json_path, out_dir):
        # Attributes #
        self.out_dir = out_dir
        self.json_path = FilePath(json_path)
        # Parse #
        self.info = load_json_path(self.json_path)
        # Basic #
        self.account = "/dev/null"
        self.run_num = self.info['run_num']
        self.run_label = "pyrosample_run_%i" % self.run_num
        self.project_short_name = self.info['project']
        self.project_long_name = self.info['project_name']
        # Own attributes #
        self.num = self.info['sample_num']
        self.short_name = self.info['sample']
        self.long_name = self.info['sample_name']
        self.name = 'run%i_sample%i' % (self.run_num, self.num)
        self.group = self.info['group']
        self.id_name = "run%03d-sample%02d" % (self.run_num, self.num)
        # Hard coded attributes #
        self.machine = "454 GS FLX Titanium"
        # SFF files #
        self.sff_files_info = self.info['files']
        # Pool dummy #
        self.pool, self.parent = self, self
        # Other dummy variables #
        self.bar_len = 0
        self.gzipped = False
        self.used = True
        # Loaded #
        self.loaded = False

    def load(self):
        """A second __init__ that is delayed and called only if needed"""
        # Check files are there #
        for f in self.sff_files_info:
            if not os.path.exists(f['path']): raise Exception("No file at %s" % f['path'])
        # Automatic paths #
        self.base_dir = self.out_dir + self.id_name + '/'
        self.p = AutoPaths(self.base_dir, self.all_paths)
        # Make an alias to the json #
        self.p.info_json.link_from(self.json_path, safe=True)
        # Primer #
        self.primer_regex = re.compile(self.info['primer'])
        # Raw files #
        self.raw_fasta = FASTA(self.p.raw_fasta)
        self.raw_fastq = FASTQ(self.p.raw_fastq)
        # Standard FASTA #
        self.reads = FASTA(self.p.reads_fasta)
        self.fasta = FASTA(self.p.renamed)
        # Special FASTQ #
        self.fastq = FASTQ(self.p.reads_fastq)
        # A shameless hack for cdhit to work #
        self.renamed = self.fastq
        # Pre-denoised special case (branch intentionally disabled with `and False`) #
        if self.info['predenoised'] and False:
            self.sff_files_info = []
            self.reads.link_from(self.info['predenoised'], safe=True)
        # Special submission attributes #
        self.sra = PyroSampleSRA(self)
        # Loaded #
        self.loaded = True
        # Return self for convenience #
        return self

    @property
    def mate(self):
        if 'mate' not in self.info: return False
        run_num = self.info['mate']['run']
        pool_num = self.info['mate']['pool']
        barcode_num = self.info['mate']['num']
        return illumitag.runs[run_num][pool_num-1][barcode_num-1]

    def extract(self):
        # Call extraction #
        shell_output('sffinfo -s %s > %s' % (self.p.raw_sff, self.p.raw_fasta))
        shell_output('sffinfo -q %s > %s' % (self.p.raw_sff, self.p.raw_qual))
        shell_output('sffinfo -m %s > %s' % (self.p.raw_sff, self.p.manifest))
        # Convert #
        sh.fasta_to_fastq(self.p.raw_fasta, self.p.raw_qual, self.p.raw_fastq)

    def clean_iterator(self, reads, minlength=400, threshold=21, windowsize=20):
        for read in reads:
            # Length #
            if len(read) < minlength: continue
            # Primer #
            match = self.primer_regex.search(str(read.seq))
            if not match: continue
            # PHRED score #
            scores = read.letter_annotations["phred_quality"]
            averaged = moving_average(scores, windowsize)
            discard = False
            for i,value in enumerate(averaged):
                if value < threshold:
                    read = read[:i+windowsize-1]
                    if len(read) < minlength: discard = True
                    break
            if discard: continue
            # Undetermined bases #
            if 'N' in read: continue
            # Remove primer #
            read = read[match.end():]
            # Flip them because 454 reads the other end #
            read = read.reverse_complement()
            # Return #
            yield read

    def clean(self, **kwargs):
        self.reads.write(self.clean_iterator(self.raw_fastq, **kwargs))

    def report_loss(self):
        print "Before cleaning: %i" % len(self.raw_fastq)
        print "After cleaning: %i" % len(self.reads)
        print "Loss: %.2f%%" % (100 * (1 - (len(self.raw_fastq)/len(self.reads))))

    def process(self):
        self.reads.rename_with_num(self.name + '_read', new_path=self.fasta)

    def make_fastq(self, **kwargs):
        """In some special cases we want the FASTQ"""
        self.fastq.write(self.clean_iterator(self.raw_fastq, **kwargs))
        self.fastq.rename_with_num(self.name + '_read')
        print "make_fastq for sample %s completed" % self.id_name
Example #4
class CdhitOTUs(OTUs):
    """Will use cd-hit to create OTU clusters from a given FASTQ file
    http://weizhong-lab.ucsd.edu/cd-hit-otu/"""

    short_name = "cdhit"
    title = "CD-HIT Illumina OTU picking"

    all_paths = """
    /all_reads.fastq
    /clusters/OTU.nr2nd.clstr
    /centers.fasta
    /otus.txt
    /taxonomy_silva/
    /taxonomy_fw/
    /graphs/
    """

    def __repr__(self):
        return "<%s object of %s>" % (self.__class__.__name__, self.parent)

    def __init__(self, cluster):
        # Save parent #
        self.cluster, self.parent = cluster, cluster
        # Inherited #
        self.samples = self.parent.samples
        # Paths #
        self.base_dir = self.parent.p.otus_dir + self.short_name + "/"
        self.p = AutoPaths(self.base_dir, self.all_paths)
        # Main reads file here FASTQ #
        self.reads = FASTQ(self.p.all_reads)
        # Files #
        self.cdhit_clusters = FilePath(self.p.clstr)
        self.cdhit_centers = FASTA(self.p.clusters_dir + "OTU")
        self.centers = FASTA(self.p.centers)
        # Taxonomy #
        self.taxonomy_silva = CrestTaxonomy(self.centers, self, "silvamod", self.p.silva)
        self.taxonomy_fw = CrestTaxonomy(self.centers, self, "freshwater", self.p.fw_dir)
        # Preferred one #
        self.taxonomy = self.taxonomy_silva

    def run(self):
        # Combine reads but in fastq format this time #
        paths = [sample.renamed for sample in self.cluster]
        shell_output("cat %s > %s" % (" ".join(paths), self.reads))
        # Clean #
        shutil.rmtree(self.p.clusters_dir)
        # Run command #
        cdhit = sh.Command(cdhit_script)
        cdhit("-i", self.reads, "-o", self.p.clusters_dir, "-p", TmpFile.from_string("[ACTG]"))
        # Create the centers file with good names #
        self.cdhit_centers.rename_with_num("OTU-", self.centers)

    @property_cached
    def cluster_counts_table(self):
        """Create the unfiltered OTU table"""
        # Put results in a dict of dicts #
        result = defaultdict(lambda: defaultdict(int))
        # Loop #
        for line in self.cdhit_clusters:
            if line.startswith(">"):
                otu = "OTU-%s" % line.split()[1]
                continue
            nums = re.findall(">run([0-9]+)_pool([0-9]+)_sample([0-9]+)_read([0-9]+)\.\.\.", line)
            if nums:
                run_num, pool_num, sample_num, read_num = map(int, nums[0])
                sample = illumitag.runs[run_num][pool_num - 1][sample_num - 1]
                name = sample.short_name
            else:
                nums = re.findall(">run([0-9]+)_sample([0-9]+)_read([0-9]+)\.\.\.", line)
                run_num, sample_num, read_num = map(int, nums[0])
                sample = [
                    s
                    for s in illumitag.presamples + illumitag.pyrosamples
                    if s.run_num == run_num and s.num == sample_num
                ][0]
                name = sample.short_name
            # Count #
            result[otu][name] += 1
        # Convert to a dataframe #
        result = pandas.DataFrame(result)
        result = result.fillna(0)
        result = result.astype(int)
        result = result.reindex_axis(sorted(result.columns, key=natural_sort), axis=1)
        # Return (NB: singleton OTUs are not removed here; this table is unfiltered) #
        return result
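
For reference, the `.clstr` member lines parsed above look roughly like the excerpt below; the format is inferred from the regexes, so treat it as illustrative. The counting logic can also be isolated into a small standalone function, sketched here under the same assumptions.

# Illustrative .clstr excerpt (inferred from the regexes above):
#   >Cluster 0
#   0    253nt, >run1_pool2_sample3_read1... *
#   1    250nt, >run1_sample7_read42... at +/98.80%

import re
from collections import defaultdict

def count_reads_per_cluster(clstr_lines):
    """Count member reads per CD-HIT cluster, keyed by (otu, read name prefix)."""
    counts = defaultdict(int)
    otu = None
    for line in clstr_lines:
        if line.startswith(">"):
            otu = "OTU-%s" % line.split()[1]  # e.g. ">Cluster 12" -> "OTU-12"
            continue
        match = re.search(r">(\w+?)_read\d+\.\.\.", line)
        if match and otu is not None:
            counts[(otu, match.group(1))] += 1  # prefix, e.g. "run1_pool2_sample3"
    return counts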