class Pyrosample(object):
    """A legacy sample object for the few 454 (pyrosequencing) samples that
    still exist and that need to be compared against the newer Illumina
    technology.

    The constructor only parses the JSON description. Everything that touches
    the file system is delayed until `load()` is called."""

    all_paths = """
    /info.json
    /reads.fasta
    /renamed.fasta
    /raw/raw.sff
    /raw/raw.fastq
    /raw/raw.fasta
    /raw/raw.qual
    /raw/manifest.txt
    /fastq/reads.fastq
    """

    kind = "pyrosample"

    def __repr__(self):
        return '<%s object "%s">' % (self.__class__.__name__, self.id_name)

    def __init__(self, json_path, out_dir):
        """Parse the JSON file at `json_path` describing the sample.

        `out_dir` is the directory under which the sample's own directory
        will be placed once `load()` is called."""
        # Attributes #
        self.out_dir = out_dir
        self.json_path = FilePath(json_path)
        # Parse #
        self.info = load_json_path(self.json_path)
        # Basic #
        self.account = "/dev/null"
        self.run_num = self.info['run_num']
        self.run_label = "pyrosample_run_%i" % self.run_num
        self.project_short_name = self.info['project']
        self.project_long_name = self.info['project_name']
        # Own attributes #
        self.num = self.info['sample_num']
        self.short_name = self.info['sample']
        self.long_name = self.info['sample_name']
        self.name = 'run%i_sample%i' % (self.run_num, self.num)
        self.group = self.info['group']
        self.id_name = "run%03d-sample%02d" % (self.run_num, self.num)
        # Hard coded attributes #
        self.machine = "454 GS FLX Titanium"
        # SFF files #
        self.sff_files_info = self.info['files']
        # Pool dummy: the sample acts as its own pool and parent #
        self.pool, self.parent = self, self
        # Other dummy variables #
        self.bar_len = 0
        self.gzipped = False
        self.used = True
        # Loaded #
        self.loaded = False

    def load(self):
        """A second __init__ that is delayed and called only if needed.

        Raises an Exception if one of the files listed in the JSON is
        missing. Returns self for convenience."""
        # Check files are there #
        for f in self.sff_files_info:
            if not os.path.exists(f['path']):
                raise Exception("No file at %s" % f['path'])
        # Automatic paths (out_dir is concatenated as is) #
        self.base_dir = self.out_dir + self.id_name + '/'
        self.p = AutoPaths(self.base_dir, self.all_paths)
        # Make an alias to the json #
        self.p.info_json.link_from(self.json_path, safe=True)
        # Primer #
        self.primer_regex = re.compile(self.info['primer'])
        # Raw files #
        self.raw_fasta = FASTA(self.p.raw_fasta)
        self.raw_fastq = FASTQ(self.p.raw_fastq)
        # Standard FASTA #
        self.reads = FASTA(self.p.reads_fasta)
        self.fasta = FASTA(self.p.renamed)
        # Special FASTQ #
        self.fastq = FASTQ(self.p.reads_fastq)
        # A shameless hack for cdhit to work #
        self.renamed = self.fastq
        # Pre-denoised special case (switched off by the `and False`) #
        if self.info['predenoised'] and False:
            self.sff_files_info = []
            self.reads.link_from(self.info['predenoised'], safe=True)
        # Special submission attributes #
        self.sra = PyroSampleSRA(self)
        # Loaded #
        self.loaded = True
        # Return self for convenience #
        return self

    @property
    def mate(self):
        """The matching sample looked up in `illumitag.runs`, or False when
        the JSON has no 'mate' entry."""
        if 'mate' not in self.info: return False
        run_num = self.info['mate']['run']
        pool_num = self.info['mate']['pool']
        barcode_num = self.info['mate']['num']
        # The pool and barcode numbers in the JSON are one-based #
        return illumitag.runs[run_num][pool_num-1][barcode_num-1]

    def extract(self):
        """Run `sffinfo` three times on the raw SFF file and then build the
        raw FASTQ from the resulting FASTA and QUAL files."""
        # Call extraction #
        shell_output('sffinfo -s %s > %s' % (self.p.raw_sff, self.p.raw_fasta))
        shell_output('sffinfo -q %s > %s' % (self.p.raw_sff, self.p.raw_qual))
        shell_output('sffinfo -m %s > %s' % (self.p.raw_sff, self.p.manifest))
        # Convert #
        sh.fasta_to_fastq(self.p.raw_fasta, self.p.raw_qual, self.p.raw_fastq)

    def clean_iterator(self, reads, minlength=400, threshold=21, windowsize=20):
        """Yield only the reads that pass all filters, trimmed and flipped.

        * `reads`: an iterable of records that have a `seq` attribute and a
          "phred_quality" entry in `letter_annotations`.
        * `minlength`: reads shorter than this, before or after the quality
          trimming, are dropped.
        * `threshold`: the read is cut at the first averaged quality value
          below this number.
        * `windowsize`: passed on to `moving_average`."""
        for read in reads:
            # Length #
            if len(read) < minlength: continue
            # Primer #
            match = self.primer_regex.search(str(read.seq))
            if not match: continue
            # PHRED score #
            scores = read.letter_annotations["phred_quality"]
            averaged = moving_average(scores, windowsize)
            discard = False
            for i, value in enumerate(averaged):
                if value < threshold:
                    # Cut at the first bad window, nothing after it is kept #
                    read = read[:i+windowsize-1]
                    if len(read) < minlength: discard = True
                    break
            if discard: continue
            # Undetermined bases #
            if 'N' in read: continue
            # Remove primer (match offsets were taken before the trimming) #
            read = read[match.end():]
            # Flip them because 454 reads the other end #
            read = read.reverse_complement()
            # Return #
            yield read

    def clean(self, **kwargs):
        """Write the cleaned raw reads to `self.reads`. Keyword arguments
        are forwarded to `clean_iterator`."""
        self.reads.write(self.clean_iterator(self.raw_fastq, **kwargs))

    def report_loss(self):
        """Print the number of reads before and after cleaning as well as
        the percentage of reads that were lost."""
        # Count only once, each len() call may have to go through the file #
        before = len(self.raw_fastq)
        after = len(self.reads)
        # Lost fraction is relative to the raw count. Float math avoids the
        # integer truncation and the guard avoids a division by zero #
        loss = 100.0 * (before - after) / before if before else 0.0
        # Parenthesised single argument prints the same on python 2 and 3 #
        print("Before cleaning: %i" % before)
        print("After cleaning: %i" % after)
        print("Loss: %.2f%%" % loss)

    def process(self):
        """Rename the cleaned reads with a numbered prefix derived from the
        sample name and place the result at `self.fasta`."""
        self.reads.rename_with_num(self.name + '_read', new_path=self.fasta)

    def make_fastq(self, **kwargs):
        """In some special cases we want the FASTQ. Keyword arguments are
        forwarded to `clean_iterator`."""
        self.fastq.write(self.clean_iterator(self.raw_fastq, **kwargs))
        self.fastq.rename_with_num(self.name + '_read')
        print("make_fastq for sample %s completed" % self.id_name)
class QualityReads(object):
    """A set of sequences determined to be quality controlled.

    Starting from the barcoded FASTQ at `path`, the methods of this object
    produce a FASTA with only the used samples, a primer-trimmed FASTA, and
    input files for mothur and qiime."""

    all_paths = """
    /mothur_reads.fasta
    /mothur_reads.qual
    /mothur_groups.tsv
    /qiime_reads.fasta
    /only_used_samples.fasta
    /trimmed.fasta
    """

    def __repr__(self):
        return '<%s object of %s>' % (self.__class__.__name__, self.parent)

    def __len__(self):
        return len(self.trimmed)

    def __init__(self, path, parent):
        """`path` is the location of the untrimmed barcoded FASTQ and
        `parent` is the owning pool. The pool must provide `samples`,
        `p.quality_dir` and, for the qiime output, `bar_len`."""
        # Save parent #
        self.parent, self.pool = parent, parent
        self.samples = parent.samples
        # Auto paths #
        self.base_dir = parent.p.quality_dir + '/'
        self.p = AutoPaths(self.base_dir, self.all_paths)
        # Files #
        self.untrimmed = BarcodedFASTQ(path, samples=self.samples)
        self.only_used = BarcodedFASTA(self.p.only_used, samples=self.samples)
        self.trimmed = FASTA(self.p.trimmed)
        # Qiime output #
        self.qiime_fasta = FASTA(self.p.qiime_fasta)
        # Mothur #
        self.mothur_fasta = FASTA(self.p.mothur_fasta)
        self.mothur_qual = QualFile(self.p.mothur_qual)
        self.mothur_groups = FilePath(self.p.mothur_groups)
        # Primer size #
        self.trim_fwd = self.pool.samples.trim_fwd
        self.trim_rev = self.pool.samples.trim_rev

    def _strip_primers(self, read):
        """Return `read` without its first `trim_fwd` and last `trim_rev`
        positions."""
        # A plain `-self.trim_rev` would be `-0 == 0` when there is nothing
        # to trim at the end, and the slice would then come out empty #
        end = -self.trim_rev if self.trim_rev else None
        return read[self.trim_fwd:end]

    def filter_unused(self):
        """Write to `self.only_used` the reads whose first barcode belongs
        to a sample flagged as used."""
        def no_unused_iterator(reads):
            for r in reads.parse_barcodes():
                if r.first.sample.used: yield r.read
        self.only_used.write(no_unused_iterator(self.untrimmed))

    def trim_primers(self):
        """Write to `self.trimmed` every read of `self.only_used` with both
        primer regions cut off."""
        def no_primers_iterator(reads):
            for read in reads: yield self._strip_primers(read)
        self.trimmed.write(no_primers_iterator(self.only_used))

    def make_mothur_output(self):
        """Link the trimmed FASTA for mothur and write the groups file
        containing one "read id <tab> sample short name" line per read."""
        # Trimmed fasta #
        self.mothur_fasta.link_from(self.trimmed.path)
        # The groups file #
        self.mothur_groups.create()
        try:
            for r in self.only_used.parse_barcodes():
                sample_name = r.first.sample.short_name
                read_name = '%s\t%s\n' % (r.read.id, sample_name)
                self.mothur_groups.handle.write(read_name)
        finally:
            # Close even if a read could not be parsed or written #
            self.mothur_groups.close()

    def make_qiime_output(self):
        """Write the FASTA for qiime. Every read is renamed to
        "<sample>_<counter> <original id>", gets a description containing
        the barcode sequence, and has its primer regions cut off."""
        # Counter #
        counter = defaultdict(int)
        # The `with` block closes the file also when an error occurs #
        with open(self.qiime_fasta.path, 'w') as handle:
            # Prepare fasta writer #
            writer = FastaWriter(handle, wrap=0)
            writer.write_header()
            # Do it #
            for r in self.only_used.parse_barcodes():
                sample_name = r.first.sample.short_name
                counter[sample_name] += 1
                r.read.id = '%s_%i %s' % (sample_name, counter[sample_name], r.read.id)
                # The barcode is taken from the read before it is trimmed #
                bar_seq = r.read.seq[0:self.pool.bar_len]
                r.read.description = "orig_bc=%s new_bc=%s bc_diffs=0" % (bar_seq, bar_seq)
                writer.write_record(self._strip_primers(r.read))
            # Close #
            writer.write_footer()