Example #1
0
class Pyrosample(object):
    """A Pyrosample is a legacy object for the few 454 samples we still have and that we need to compare against the new Illumina technology."""

    all_paths = """
    /info.json
    /reads.fasta
    /renamed.fasta
    /raw/raw.sff
    /raw/raw.fastq
    /raw/raw.fasta
    /raw/raw.qual
    /raw/manifest.txt
    /fastq/reads.fastq
    """

    kind = "pyrosample"

    def __repr__(self): return '<%s object "%s">' % (self.__class__.__name__, self.id_name)

    def __init__(self, json_path, out_dir):
        # Attributes #
        self.out_dir = out_dir
        self.json_path = FilePath(json_path)
        # Parse #
        self.info = load_json_path(self.json_path)
        # Basic #
        self.account = "/dev/null"
        self.run_num = self.info['run_num']
        self.run_label = "pyrosample_run_%i" % self.run_num
        self.project_short_name = self.info['project']
        self.project_long_name = self.info['project_name']
        # Own attributes #
        self.num = self.info['sample_num']
        self.short_name = self.info['sample']
        self.long_name = self.info['sample_name']
        self.name = 'run%i_sample%i' % (self.run_num, self.num)
        self.group = self.info['group']
        self.id_name = "run%03d-sample%02d" % (self.run_num, self.num)
        # Hard coded attributes #
        self.machine = "454 GS FLX Titanium"
        # SFF files #
        self.sff_files_info = self.info['files']
        # Pool dummy #
        self.pool, self.parent = self, self
        # Other dummy variables #
        self.bar_len = 0
        self.gzipped = False
        self.used = True
        # Loaded #
        self.loaded = False

    def load(self):
        """A second __init__ that is delayed and called only if needed"""
        # Check files are there #
        for f in self.sff_files_info:
            if not os.path.exists(f['path']): raise Exception("No file at %s" % f['path'])
        # Automatic paths #
        self.base_dir = self.out_dir + self.id_name + '/'
        self.p = AutoPaths(self.base_dir, self.all_paths)
        # Make an alias to the json #
        self.p.info_json.link_from(self.json_path, safe=True)
        # Primer #
        self.primer_regex = re.compile(self.info['primer'])
        # Raw files #
        self.raw_fasta = FASTA(self.p.raw_fasta)
        self.raw_fastq = FASTQ(self.p.raw_fastq)
        # Standard FASTA #
        self.reads = FASTA(self.p.reads_fasta)
        self.fasta = FASTA(self.p.renamed)
        # Special FASTQ #
        self.fastq = FASTQ(self.p.reads_fastq)
        # A shameless hack for cdhit to work #
        self.renamed = self.fastq
        # Pre-denoised special case #
        if self.info['predenoised'] and False:
            self.sff_files_info = []
            self.reads.link_from(self.info['predenoised'], safe=True)
        # Special submission attributes #
        self.sra = PyroSampleSRA(self)
        # Loaded #
        self.loaded = True
        # Return self for convenience #
        return self

    @property
    def mate(self):
        if not 'mate' in self.info: return False
        run_num = self.info['mate']['run']
        pool_num = self.info['mate']['pool']
        barcode_num = self.info['mate']['num']
        return illumitag.runs[run_num][pool_num-1][barcode_num-1]

    def extract(self):
        # Call extraction #
        shell_output('sffinfo -s %s > %s' % (self.p.raw_sff, self.p.raw_fasta))
        shell_output('sffinfo -q %s > %s' % (self.p.raw_sff, self.p.raw_qual))
        shell_output('sffinfo -m %s > %s' % (self.p.raw_sff, self.p.manifest))
        # Convert #
        sh.fasta_to_fastq(self.p.raw_fasta, self.p.raw_qual, self.p.raw_fastq)

    def clean_iterator(self, reads, minlength=400, threshold=21, windowsize=20):
        for read in reads:
            # Length #
            if len(read) < minlength: continue
            # Primer #
            match = self.primer_regex.search(str(read.seq))
            if not match: continue
            # PHRED score #
            scores = read.letter_annotations["phred_quality"]
            averaged = moving_average(scores, windowsize)
            discard = False
            for i,value in enumerate(averaged):
                if value < threshold:
                    read = read[:i+windowsize-1]
                    if len(read) < minlength: discard = True
                    break
            if discard: continue
            # Undetermined bases #
            if 'N' in read: continue
            # Remove primer #
            read = read[match.end():]
            # Flip them because 454 reads the other end #
            read = read.reverse_complement()
            # Return #
            yield read

    def clean(self, **kwargs):
        self.reads.write(self.clean_iterator(self.raw_fastq, **kwargs))

    def report_loss(self):
        print "Before cleaning: %i" % len(self.raw_fastq)
        print "After cleaning: %i" % len(self.reads)
        print "Loss: %.2f%%" % (100 * (1 - (len(self.raw_fastq)/len(self.reads))))

    def process(self):
        self.reads.rename_with_num(self.name + '_read', new_path=self.fasta)

    def make_fastq(self, **kwargs):
        """In some special cases we want the FASTQ"""
        self.fastq.write(self.clean_iterator(self.raw_fastq, **kwargs))
        self.fastq.rename_with_num(self.name + '_read')
        print "make_fastq for sample %s completed" % self.id_name
Example #2
0
class QualityReads(object):
    """A set of sequences determined to be quality controlled"""

    all_paths = """
    /mothur_reads.fasta
    /mothur_reads.qual
    /mothur_groups.tsv
    /qiime_reads.fasta
    /only_used_samples.fasta
    /trimmed.fasta
    """

    def __repr__(self): return '<%s object of %s>' % (self.__class__.__name__, self.parent)
    def __len__(self): return len(self.trimmed)

    def __init__(self, path, parent):
        # Save parent #
        self.parent, self.pool = parent, parent
        self.samples = parent.samples
        # Auto paths #
        self.base_dir = parent.p.quality_dir + '/'
        self.p = AutoPaths(self.base_dir, self.all_paths)
        # Files #
        self.untrimmed = BarcodedFASTQ(path, samples=self.samples)
        self.only_used = BarcodedFASTA(self.p.only_used, samples=self.samples)
        self.trimmed = FASTA(self.p.trimmed)
        # Qiime output #
        self.qiime_fasta = FASTA(self.p.qiime_fasta)
        # Mothur #
        self.mothur_fasta = FASTA(self.p.mothur_fasta)
        self.mothur_qual = QualFile(self.p.mothur_qual)
        self.mothur_groups = FilePath(self.p.mothur_groups)
        # Primer size #
        self.trim_fwd = self.pool.samples.trim_fwd
        self.trim_rev = self.pool.samples.trim_rev

    def filter_unused(self):
        def no_unused_iterator(reads):
            for r in reads.parse_barcodes():
                if r.first.sample.used: yield r.read
        self.only_used.write(no_unused_iterator(self.untrimmed))

    def trim_primers(self):
        def no_primers_iterator(reads):
            for read in reads:
                yield read[self.trim_fwd:-self.trim_rev]
        self.trimmed.write(no_primers_iterator(self.only_used))

    def make_mothur_output(self):
        # Trimmed fasta #
        self.mothur_fasta.link_from(self.trimmed.path)
        # The groups file #
        self.mothur_groups.create()
        for r in self.only_used.parse_barcodes():
            sample_name = r.first.sample.short_name
            read_name = '%s\t%s\n' % (r.read.id, sample_name)
            self.mothur_groups.handle.write(read_name)
        self.mothur_groups.close()

    def make_qiime_output(self):
        # Prepare fasta writer #
        handle = open(self.qiime_fasta.path, 'w')
        writer = FastaWriter(handle, wrap=0)
        writer.write_header()
        # Counter #
        counter = defaultdict(int)
        # Do it #
        for r in self.only_used.parse_barcodes():
            sample_name = r.first.sample.short_name
            counter[sample_name] += 1
            r.read.id = '%s_%i %s' % (sample_name, counter[sample_name], r.read.id)
            bar_seq = r.read.seq[0:self.pool.bar_len]
            r.read.description = "orig_bc=%s new_bc=%s bc_diffs=0" % (bar_seq, bar_seq)
            writer.write_record(r.read[self.trim_fwd:-self.trim_rev])
        # Close #
        writer.write_footer()
        handle.close()