def __init__(self, fwd, rev, pools):
    """Group several pools that share one forward/reverse FASTQ file pair."""
    # Keep the pools; samples and primers come from the first one #
    self.pools = pools
    first_pool = pools[0]
    self.samples = first_pool.samples
    self.primers = first_pool.primers
    # Wrap the two raw read files #
    self.fwd, self.rev = FASTQ(fwd), FASTQ(rev)
    # And both of them together as a pair #
    self.pair = PairedFASTQ(fwd, rev, self)
def __init__(self, fwd_path, rev_path, parent):
    """Pair of forward/reverse FASTQ files belonging to *parent* (a pool-like object).

    Exposes the parent's samples and primers, and records whether the raw
    files are gzip-compressed.
    """
    # Basic #
    self.fwd_path = fwd_path
    self.rev_path = rev_path
    # File objects #
    self.fwd = FASTQ(fwd_path)
    self.rev = FASTQ(rev_path)
    # Extra #
    self.pool, self.parent = parent, parent
    self.samples = parent.samples
    self.primers = parent.primers
    # `endswith` already returns a bool, so the original
    # `True if ... else False` ternary was redundant.
    # Only the forward path is checked — assumes both files are
    # compressed the same way (TODO confirm).
    self.gziped = self.fwd_path.endswith('gz')
def load(self):
    """Delayed second initialization: set up paths and file objects.

    Safe to call only after the parent pool's own paths exist.
    """
    # Special case for dummy samples #
    # Dummy samples (see the 'dummy' key in the info dict) have no files of
    # their own, so there is nothing to set up.
    if self.info.get('dummy'): return
    # Paths #
    self.base_dir = self.pool.p.samples_dir + self.bar_name + '/'
    self.p = AutoPaths(self.base_dir, self.all_paths)
    self.path = str(self.p.orig_fastq)
    # Distances #
    # Number of bases to cut at each end; taken from the pool's samples
    # collection so all samples of a pool trim identically.
    self.trim_fwd = self.pool.samples.trim_fwd
    self.trim_rev = self.pool.samples.trim_rev
    # Files #
    self.trimmed = FASTQ(self.p.trimmed)
    self.renamed = FASTQ(self.p.renamed)
    self.fasta = FASTA(self.p.reads_fasta)
def __init__(self, json_path, out_dir):
    """Build a legacy 454 sample from its JSON description file.

    Raises an Exception as soon as one of the listed SFF files is missing.
    """
    # Attributes #
    self.out_dir = out_dir
    self.json_path = FilePath(json_path)
    # Parse #
    with open(json_path) as handle: self.info = json.load(handle)
    # Basic #
    self.run_num = self.info['run_num']
    self.project_short_name = self.info['project']
    self.project_long_name = self.info['project_name']
    # Own attributes #
    self.num = self.info['sample_num']
    self.short_name = self.info['sample']
    self.long_name = self.info['sample_name']
    self.name = 'run%i_sample%i' % (self.run_num, self.num)
    self.group = self.info['group']
    self.id_name = "run%03d-sample%02d" % (self.run_num, self.num)
    # SFF files #
    # Fail fast if any declared raw file is absent on disk.
    self.sff_files_info = self.info['files']
    for f in self.sff_files_info:
        if not os.path.exists(f['path']): raise Exception("No file at %s" % f['path'])
    # Automatic paths #
    self.base_dir = self.out_dir + self.id_name + '/'
    self.p = AutoPaths(self.base_dir, self.all_paths)
    # Make an alias to the json #
    self.p.info_json.link_from(self.json_path, safe=True)
    # Pool dummy #
    # This object stands in for its own pool/parent so it can be used where
    # pool-like objects are expected.
    self.pool, self.parent = self, self
    # Other dummy variables #
    self.bar_len = 0
    self.gziped = False
    self.used = True
    # Primer #
    self.primer_regex = re.compile(self.info['primer'])
    # Raw files #
    self.raw_fasta = FASTA(self.p.raw_fasta)
    self.raw_fastq = FASTQ(self.p.raw_fastq)
    # Standard FASTA #
    self.reads = FASTA(self.p.reads_fasta)
    self.fasta = FASTA(self.p.renamed)
    # Special FASTQ #
    self.fastq = FASTQ(self.p.reads_fastq)
    # A shameless hack for cdhit to work #
    self.renamed = self.fastq
    # Pre-denoised special case #
    # NOTE(review): the `and False` deliberately disables this branch; the
    # dead code is kept on purpose, presumably for a future re-enable.
    if self.info['predenoised'] and False:
        self.sff_files_info = []
        self.reads.link_from(self.info['predenoised'], safe=True)
def __init__(self, parent):
    """Group of reads sharing a primer outcome, attached to *parent*."""
    # Save parent #
    self.parent, self.assemble_group = parent, parent
    self.samples = parent.samples
    self.pool = self.parent.pool
    # Auto paths #
    self.base_dir = parent.p.groups_dir + self.short_name + '/'
    self.p = AutoPaths(self.base_dir, self.all_paths)
    # More #
    # The parent decides the file class (FASTQ or FASTA) via `cls`.
    self.orig_reads = self.parent.cls(self.p.orig_fastq, samples=self.samples)
    self.n_filtered = self.parent.cls(self.p.n_filtered, samples=self.samples)
    # Quality filtered #
    # NOTE(review): comparing the parent object to the string 'assembled'
    # relies on a custom __eq__ defined on the parent class — confirm.
    if self.parent == 'assembled':
        self.qual_filtered = FASTQ(self.p.qual_filtered, samples=self.samples)
        self.len_filtered = FASTQ(self.p.len_filtered_fastq, samples=self.samples)
        self.trimmed_barcodes = FASTA(self.p.trimmed_barcodes)
    # Further #
    self.load()
def __init__(self, path):
    """Wrap *path* and derive the optional raw-output file locations."""
    # Basic #
    self.path = path
    self.name = os.path.basename(path)
    # Optional raw output #
    # The original rebound the `path` parameter here, shadowing the
    # constructor argument; a distinct name keeps both readable.
    raw_prefix = illumitag.view_dir + 'pyrosamples/raw/' + self.prefix
    self.raw_fasta_path = raw_prefix + ".fasta"
    self.raw_qual_path = raw_prefix + ".qual"
    self.fastq = FASTQ(raw_prefix + ".fastq")
def load(self): """A second __init__ that is delayed, solves some circular references""" # Automatic paths # self.base_dir = self.out_dir + self.id_name + '/' self.p = AutoPaths(self.base_dir, self.all_paths) # Make an alias to the json # self.p.info_json.link_from(self.json_path, safe=True) # Children # self.samples.load() # Raw file pairs # self.fwd_path = home + "ILLUMITAG/INBOX/%s/%s/%s" % (self.run.label, self.label, self.info['forward_reads']) self.rev_path = home + "ILLUMITAG/INBOX/%s/%s/%s" % (self.run.label, self.label, self.info['reverse_reads']) self.fwd = FASTQ(self.fwd_path) self.rev = FASTQ(self.rev_path) self.fastq = PairedFASTQ(self.fwd.path, self.rev.path, self) # Barcode length # self.bar_len = self.samples.bar_len # Make Outcomes # self.no_barcodes = NoBarcode(self) self.one_barcodes = OneBarcode(self) self.same_barcodes = SameBarcode(self) self.bad_barcodes = BadBarcode(self) self.good_barcodes = GoodBarcode(self) self.outcomes = (self.good_barcodes, self.no_barcodes, self.one_barcodes, self.same_barcodes, self.bad_barcodes) self.children = self.outcomes # The good reads # self.quality_reads = QualityReads(self.good_barcodes.assembled.good_primers.len_filtered, self) self.fractions = Fractions(self) # Runner # self.runner = PoolRunner(self) # Graphs # self.graphs = [getattr(pool_plots, cls_name)(self) for cls_name in pool_plots.__all__] # Loaded # self.loaded = True # Return self for convenience # return self
def __init__(self, path, parent):
    """Quality-read collection living under *parent*'s quality directory."""
    # Save parent #
    self.parent, self.pool = parent, parent
    self.samples = parent.samples
    # Auto paths #
    self.base_dir = parent.p.quality_dir + '/'
    self.p = AutoPaths(self.base_dir, self.all_paths)
    # Files #
    self.untrimmed = FASTQ(path, samples=self.samples)
    self.only_used = FASTA(self.p.only_used, samples=self.samples)
    self.trimmed = FASTA(self.p.trimmed)
    # Qiime output #
    self.qiime_fasta = FASTA(self.p.qiime_fasta)
    # Mothur #
    self.mothur_fasta = FASTA(self.p.mothur_fasta)
    self.mothur_qual = QualFile(self.p.mothur_qual)
    self.mothur_groups = GroupFile(self.p.mothur_groups)
    # Primer size #
    # Trim distances mirror those of the pool's samples.
    self.trim_fwd = self.pool.samples.trim_fwd
    self.trim_rev = self.pool.samples.trim_rev
def __init__(self, cluster):
    """OTU processing object attached to a *cluster* parent."""
    # Save parent #
    self.cluster, self.parent = cluster, cluster
    # Inherited #
    self.samples = self.parent.samples
    # Paths #
    self.base_dir = self.parent.p.otus_dir + self.short_name + '/'
    self.p = AutoPaths(self.base_dir, self.all_paths)
    # Main reads file here FASTQ #
    self.reads = FASTQ(self.p.all_reads)
    # Files #
    # cd-hit produces a .clstr membership file plus the cluster centers.
    self.cdhit_clusters = FilePath(self.p.clstr)
    self.cdhit_centers = FASTA(self.p.clusters_dir + "OTU")
    self.centers = FASTA(self.p.centers)
    # Taxonomy #
    # Two CREST assignments over the centers: the general silvamod database
    # and a freshwater-specific one.
    self.taxonomy_silva = CrestTaxonomy(self.centers, self, 'silvamod', self.p.silva)
    self.taxonomy_fw = CrestTaxonomy(self.centers, self, 'freshwater', self.p.fw_dir)
    # Preferred one #
    self.taxonomy = self.taxonomy_silva
class PrimerGroup(object):
    """A bunch of sequences all having the same type of primer outcome
    (and assembly outcome).

    Provides a chain of filters, each writing a new file from the previous
    stage: orig -> n_filtered -> qual_filtered -> len_filtered ->
    trimmed_barcodes.
    """

    all_paths = """
    /orig.fastq
    /n_filtered.fastq
    /qual_filtered.fastq
    /len_filtered.fastq
    /trimmed_barcodes.fasta
    """

    def __repr__(self): return '<%s object of %s>' % (self.__class__.__name__, self.parent)
    def __len__(self): return len(self.orig_reads)

    # Thin delegations to the original reads file #
    def create(self): self.orig_reads.create()
    def add_read(self, read): self.orig_reads.add_read(read)
    def close(self): self.orig_reads.close()

    def __init__(self, parent):
        # Save parent #
        self.parent, self.assemble_group = parent, parent
        self.samples = parent.samples
        self.pool = self.parent.pool
        # Auto paths #
        self.base_dir = parent.p.groups_dir + self.short_name + '/'
        self.p = AutoPaths(self.base_dir, self.all_paths)
        # More #
        # The parent decides the file class (FASTQ or FASTA) via `cls`.
        self.orig_reads = self.parent.cls(self.p.orig_fastq, samples=self.samples)
        self.n_filtered = self.parent.cls(self.p.n_filtered, samples=self.samples)
        # Quality filtered #
        # NOTE(review): comparing the parent object to the string 'assembled'
        # relies on a custom __eq__ defined on the parent class — confirm.
        if self.parent == 'assembled':
            self.qual_filtered = FASTQ(self.p.qual_filtered, samples=self.samples)
            self.len_filtered = FASTQ(self.p.len_filtered_fastq, samples=self.samples)
            self.trimmed_barcodes = FASTA(self.p.trimmed_barcodes)
        # Further #
        self.load()

    def load(self): pass

    def n_filter(self):
        """Called from AssembleGroup.discard_reads_with_n"""
        # Drop any read containing an undetermined base.
        def no_n_iterator(reads):
            for read in reads:
                if 'N' in read: continue
                yield read
        self.n_filtered.write(no_n_iterator(self.orig_reads))

    def qual_filter(self):
        """Called from Assemble.quality_filter"""
        # Drop a read if any sliding-window average of its PHRED scores
        # falls below the threshold.
        def good_qual_iterator(reads, threshold=5, windowsize=10):
            for read in reads:
                averaged = moving_average(read.letter_annotations["phred_quality"], windowsize)
                if any([value < threshold for value in averaged]): continue
                yield read
        self.qual_filtered.write(good_qual_iterator(self.n_filtered))

    def len_filter(self):
        """Called from Assemble.length_filter"""
        # Drop reads shorter than min_length bases.
        def good_len_iterator(reads, min_length=400):
            for read in reads:
                if len(read) < min_length: continue
                yield read
        self.len_filtered.write(good_len_iterator(self.qual_filtered))

    def trim_bc(self):
        """Called from Assemble.trim_barcodes"""
        # Cut bar_len bases off both ends of every read.
        def no_barcodes_iterator(reads):
            for read in reads:
                yield read[self.pool.bar_len:-self.pool.bar_len]
        # With no barcodes a plain format conversion is enough (and the
        # slice above would be wrong for bar_len == 0, as -0 empties it).
        if self.pool.bar_len == 0:
            self.len_filtered.to_fasta(self.trimmed_barcodes)
        else:
            self.trimmed_barcodes.write(no_barcodes_iterator(self.len_filtered))
def load(self):
    """Delayed init for the assembled outcome: reads stay in FASTQ form."""
    # Files at this stage are FASTQ #
    self.cls = FASTQ
    self.base_dir = self.outcome.p.assembled_dir
    self.p = AutoPaths(self.base_dir, self.all_paths)
    self.path = self.p.orig_fastq
    # Reads whose orientation was flipped #
    self.flipped_reads = FASTQ(self.p.flipped, self.samples, self.primers)
def __init__(self, json_path, out_dir):
    """Build a Presample from its JSON description file.

    A Presample is demultiplexed upstream by Illumina MIDs, so it acts as
    pool, parent and sample all at once (hence the dummy entries below).
    """
    # Attributes #
    self.out_dir = out_dir
    self.json_path = FilePath(json_path)
    # Parse #
    with open(json_path) as handle: self.info = json.load(handle)
    # Basic #
    self.account = self.info['uppmax_id']
    self.run_num = self.info['run_num']
    self.run_label = self.info['run_id']
    self.project_short_name = self.info['project']
    self.project_long_name = self.info['project_name']
    self.fwd_name = self.info['forward_reads']
    self.rev_name = self.info['reverse_reads']
    # Own attributes #
    self.num = self.info['sample_num']
    self.label = self.info['sample_id']
    self.short_name = self.info['sample']
    self.long_name = self.info['sample_name']
    self.name = 'run%i_sample%i' % (self.run_num, self.num)
    self.group = self.info['group']
    self.id_name = "run%03d-sample%02d" % (self.run_num, self.num)
    self.fwd_mid = self.info['forward_mid']
    # BUG FIX: the original read 'forward_mid' twice (copy-paste error).
    # Use 'reverse_mid' when present; fall back to the old behavior for
    # JSON files that lack the key.
    self.rev_mid = self.info.get('reverse_mid', self.info['forward_mid'])
    # Automatic paths #
    self.base_dir = self.out_dir + self.id_name + '/'
    self.p = AutoPaths(self.base_dir, self.all_paths)
    # Special #
    self.primers = TwoPrimers(self)
    # Samples dummy #
    # One synthetic sample entry so the Samples machinery still works.
    self.info['samples'] = [{"name": self.short_name, "used": 1, "group": self.group,
                             "dummy": 1, "num": self.num, "fwd": "", "rev": ""}]
    self.samples = Samples(self)
    self.samples.load()
    # Pool dummy #
    self.pool, self.parent = self, self
    # Files #
    self.fwd_path = home + "ILLUMITAG/INBOX/%s/%s/%s" % (self.run_label, self.label, self.fwd_name)
    self.rev_path = home + "ILLUMITAG/INBOX/%s/%s/%s" % (self.run_label, self.label, self.rev_name)
    # endswith already returns a bool (original had a redundant ternary) #
    self.gziped = self.fwd_path.endswith('gz')
    self.fwd = FASTQ(self.p.fwd)
    self.rev = FASTQ(self.p.rev)
    self.fastq = PairedFASTQ(self.fwd.path, self.rev.path, self)
    # Barcode length #
    # Zero: demultiplexing already happened in the Illumina pipeline.
    self.bar_len = 0
    # Make an alias to the json #
    self.p.info_json.link_from(self.json_path, safe=True)
    # Assembly files as children #
    self.assembled = Assembled(self)
    self.unassembled = Unassembled(self)
    self.children = (self.assembled, self.unassembled)
    self.first = self.assembled
    # Graphs #
    self.graphs = [getattr(outcome_plots, cls_name)(self) for cls_name in outcome_plots.__all__]
    # Runner #
    self.runner = PresampleRunner(self)
    # Final #
    self.trimmed = FASTQ(self.p.trimmed)
    self.renamed = FASTQ(self.p.renamed)
    self.fasta = FASTA(self.p.reads_fasta)
class Presample(BarcodeGroup):
    """A Presample is a clumsy name for a new type of barcoded-sequence files.
    As we updated the lab protocol, sample are not multiplexed with our
    traditional 50 barcodes anymore, but with Illumina specific MIDs.
    The demultiplexing thus happens in their pipeline and we are left with
    one sample per file. This object is a bit like a *Pool*, a *BarcodeGroup*
    and a *Sample*."""

    all_paths = """
    /logs/
    /graphs/
    /info.json
    /fwd.fastq
    /rev.fastq
    /quality/trimmed.fastq
    /quality/renamed.fastq
    /quality/reads.fasta
    /assembled/
    /unassembled/
    /fastqc/
    """

    def __repr__(self): return '<%s object "%s">' % (self.__class__.__name__, self.id_name)
    def __str__(self): return self.id_name
    def __iter__(self): return iter(self.children)
    def __len__(self): return self.count
    def __getitem__(self, key): return self.samples[key]

    @property
    def seq_len(self):
        # Length of the first forward read; taken as representative.
        return len(self.fwd.first_read)

    def __init__(self, json_path, out_dir):
        """Build the Presample from its JSON description file."""
        # Attributes #
        self.out_dir = out_dir
        self.json_path = FilePath(json_path)
        # Parse #
        with open(json_path) as handle: self.info = json.load(handle)
        # Basic #
        self.account = self.info['uppmax_id']
        self.run_num = self.info['run_num']
        self.run_label = self.info['run_id']
        self.project_short_name = self.info['project']
        self.project_long_name = self.info['project_name']
        self.fwd_name = self.info['forward_reads']
        self.rev_name = self.info['reverse_reads']
        # Own attributes #
        self.num = self.info['sample_num']
        self.label = self.info['sample_id']
        self.short_name = self.info['sample']
        self.long_name = self.info['sample_name']
        self.name = 'run%i_sample%i' % (self.run_num, self.num)
        self.group = self.info['group']
        self.id_name = "run%03d-sample%02d" % (self.run_num, self.num)
        self.fwd_mid = self.info['forward_mid']
        # BUG FIX: the original read 'forward_mid' twice (copy-paste error).
        # Use 'reverse_mid' when present; fall back to the old behavior for
        # JSON files that lack the key.
        self.rev_mid = self.info.get('reverse_mid', self.info['forward_mid'])
        # Automatic paths #
        self.base_dir = self.out_dir + self.id_name + '/'
        self.p = AutoPaths(self.base_dir, self.all_paths)
        # Special #
        self.primers = TwoPrimers(self)
        # Samples dummy #
        # One synthetic sample entry so the Samples machinery still works.
        self.info['samples'] = [{"name": self.short_name, "used": 1, "group": self.group,
                                 "dummy": 1, "num": self.num, "fwd": "", "rev": ""}]
        self.samples = Samples(self)
        self.samples.load()
        # Pool dummy #
        self.pool, self.parent = self, self
        # Files #
        self.fwd_path = home + "ILLUMITAG/INBOX/%s/%s/%s" % (self.run_label, self.label, self.fwd_name)
        self.rev_path = home + "ILLUMITAG/INBOX/%s/%s/%s" % (self.run_label, self.label, self.rev_name)
        # endswith already returns a bool (original had a redundant ternary) #
        self.gziped = self.fwd_path.endswith('gz')
        self.fwd = FASTQ(self.p.fwd)
        self.rev = FASTQ(self.p.rev)
        self.fastq = PairedFASTQ(self.fwd.path, self.rev.path, self)
        # Barcode length #
        # Zero: demultiplexing already happened in the Illumina pipeline.
        self.bar_len = 0
        # Make an alias to the json #
        self.p.info_json.link_from(self.json_path, safe=True)
        # Assembly files as children #
        self.assembled = Assembled(self)
        self.unassembled = Unassembled(self)
        self.children = (self.assembled, self.unassembled)
        self.first = self.assembled
        # Graphs #
        self.graphs = [getattr(outcome_plots, cls_name)(self) for cls_name in outcome_plots.__all__]
        # Runner #
        self.runner = PresampleRunner(self)
        # Final #
        self.trimmed = FASTQ(self.p.trimmed)
        self.renamed = FASTQ(self.p.renamed)
        self.fasta = FASTA(self.p.reads_fasta)

    def load(self): pass

    def uncompress(self):
        """Decompress the raw gzipped inputs into the working directory."""
        shell_output('gunzip -c %s > %s' % (self.fwd_path, self.fwd))
        shell_output('gunzip -c %s > %s' % (self.rev_path, self.rev))

    def presample_fastqc(self):
        """Run FastQC on the paired reads."""
        self.fastq.fastqc(self.p.fastqc_dir)

    def process(self):
        """Trim primers, renumber the reads and convert to FASTA."""
        def no_primers_iterator(reads):
            # Cut the forward primer at the start and the reverse at the end.
            for read in reads:
                yield read[self.primers.fwd_len:-self.primers.rev_len]
        reads = self.assembled.good_primers.len_filtered
        self.trimmed.write(no_primers_iterator(reads))
        self.trimmed.rename_with_num(self.name + '_read', self.renamed)
        self.renamed.to_fasta(self.fasta)

    def make_mothur_output(self): pass
    def make_qiime_output(self): pass

    def make_presample_plots(self):
        """Render every graph attached to this presample."""
        for graph in self.graphs: graph.plot()
class Sample(FASTQ):
    """All sequences with the same barcode pair grouped together"""

    all_paths = """
    /orig.fastq
    /trimmed.fastq
    /renamed.fastq
    /reads.fasta
    """

    def __repr__(self): return '<%s object "%s">' % (self.__class__.__name__, self.name)
    def __str__(self): return self.bar_name

    def __init__(self, info, parent):
        """*info* is one sample entry of the pool's JSON; *parent* a pool-like object."""
        # Save attributes #
        self.info = info
        self.parent = parent
        self.pool = parent.pool
        # Basic #
        self.short_name = info['name']
        self.group_name = info['group']
        self.num = int(info['num'])
        self.used = bool(info['used'])
        self.fwd_str = info['fwd']
        self.rev_str = info['rev']
        # Other #
        self.bar_name = 'barcode%i' % self.num
        self.name = 'run%i_pool%i_sample%i' % (self.pool.run_num, self.pool.num, self.num)

    def load(self):
        """Delayed second initialization: set up paths and file objects."""
        # Special case for dummy samples #
        if self.info.get('dummy'): return
        # Paths #
        self.base_dir = self.pool.p.samples_dir + self.bar_name + '/'
        self.p = AutoPaths(self.base_dir, self.all_paths)
        self.path = str(self.p.orig_fastq)
        # Distances #
        self.trim_fwd = self.pool.samples.trim_fwd
        self.trim_rev = self.pool.samples.trim_rev
        # Files #
        self.trimmed = FASTQ(self.p.trimmed)
        self.renamed = FASTQ(self.p.renamed)
        self.fasta = FASTA(self.p.reads_fasta)

    def process(self):
        """Trim primers, renumber the reads and convert to FASTA."""
        # NOTE(review): the slice below assumes trim_rev > 0; a value of 0
        # would yield empty reads (-0 slicing) — confirm upstream guarantees.
        def no_primers_iterator(reads):
            for read in reads:
                yield read[self.trim_fwd:-self.trim_rev]
        self.trimmed.write(no_primers_iterator(self))
        self.trimmed.rename_with_num(self.name + '_read', self.renamed)
        self.renamed.to_fasta(self.fasta)

    @property
    def json(self):
        """Regenerate the JSON string from the object including extra info"""
        result = OrderedDict([(k, self.info[k]) for k in ('name', 'used', 'group', 'num', 'fwd', 'rev')])
        result = json.dumps(result)
        # extra_metadata is set elsewhere, not in __init__ — presumably by the
        # samples collection; verify before relying on it here.
        if self.extra_metadata:
            # Splice the extra dict in and make NaN valid JSON (null).
            result = result[:-1] + ',' + json.dumps(self.extra_metadata, indent=4)[1:]
            result = re.compile(r'\bNaN\b').sub('null', result)
        return result

    @property
    def count_raw_reads(self):
        """The number of reads the sample originally had right after barcode
        processing and before any other quality filtering"""
        return self.pool.good_barcodes.breakdown[self.bar_name]
class Pyrosample(object):
    """A Pyrosample is a legacy object for the few 454 samples we still have
    and that we need to compare against the new Illumina technology."""

    all_paths = """
    /info.json
    /reads.fasta
    /renamed.fasta
    /raw/raw.sff
    /raw/raw.fastq
    /raw/raw.fasta
    /raw/raw.qual
    /raw/manifest.txt
    /fastq/reads.fastq
    """

    def __repr__(self): return '<%s object "%s">' % (self.__class__.__name__, self.id_name)

    def __init__(self, json_path, out_dir):
        """Build the legacy 454 sample from its JSON description file.

        Raises an Exception as soon as one of the listed SFF files is missing.
        """
        # Attributes #
        self.out_dir = out_dir
        self.json_path = FilePath(json_path)
        # Parse #
        with open(json_path) as handle: self.info = json.load(handle)
        # Basic #
        self.run_num = self.info['run_num']
        self.project_short_name = self.info['project']
        self.project_long_name = self.info['project_name']
        # Own attributes #
        self.num = self.info['sample_num']
        self.short_name = self.info['sample']
        self.long_name = self.info['sample_name']
        self.name = 'run%i_sample%i' % (self.run_num, self.num)
        self.group = self.info['group']
        self.id_name = "run%03d-sample%02d" % (self.run_num, self.num)
        # SFF files #
        # Fail fast if any declared raw file is absent on disk.
        self.sff_files_info = self.info['files']
        for f in self.sff_files_info:
            if not os.path.exists(f['path']): raise Exception("No file at %s" % f['path'])
        # Automatic paths #
        self.base_dir = self.out_dir + self.id_name + '/'
        self.p = AutoPaths(self.base_dir, self.all_paths)
        # Make an alias to the json #
        self.p.info_json.link_from(self.json_path, safe=True)
        # Pool dummy #
        self.pool, self.parent = self, self
        # Other dummy variables #
        self.bar_len = 0
        self.gziped = False
        self.used = True
        # Primer #
        self.primer_regex = re.compile(self.info['primer'])
        # Raw files #
        self.raw_fasta = FASTA(self.p.raw_fasta)
        self.raw_fastq = FASTQ(self.p.raw_fastq)
        # Standard FASTA #
        self.reads = FASTA(self.p.reads_fasta)
        self.fasta = FASTA(self.p.renamed)
        # Special FASTQ #
        self.fastq = FASTQ(self.p.reads_fastq)
        # A shameless hack for cdhit to work #
        self.renamed = self.fastq
        # Pre-denoised special case #
        # NOTE(review): `and False` deliberately disables this branch; kept
        # as-is, presumably for a future re-enable.
        if self.info['predenoised'] and False:
            self.sff_files_info = []
            self.reads.link_from(self.info['predenoised'], safe=True)

    def load(self): pass

    def extract(self):
        """Extract FASTA/qual/manifest from the SFF and build a FASTQ."""
        # Call extraction #
        shell_output('sffinfo -s %s > %s' % (self.p.raw_sff, self.p.raw_fasta))
        shell_output('sffinfo -q %s > %s' % (self.p.raw_sff, self.p.raw_qual))
        shell_output('sffinfo -m %s > %s' % (self.p.raw_sff, self.p.manifest))
        # Convert #
        sh.fasta_to_fastq(self.p.raw_fasta, self.p.raw_qual, self.p.raw_fastq)

    def clean_iterator(self, reads, minlength=400, threshold=21, windowsize=20):
        """Yield cleaned reads: length-checked, primer-matched, quality-trimmed,
        N-free, primer-stripped and reverse-complemented (454 reads the other end)."""
        for read in reads:
            # Length #
            if len(read) < minlength: continue
            # Primer #
            match = self.primer_regex.search(str(read.seq))
            if not match: continue
            # PHRED score #
            # Truncate at the first window whose average quality drops below
            # the threshold; discard if the truncated read becomes too short.
            scores = read.letter_annotations["phred_quality"]
            averaged = moving_average(scores, windowsize)
            discard = False
            for i, value in enumerate(averaged):
                if value < threshold:
                    read = read[:i + windowsize - 1]
                    if len(read) < minlength: discard = True
                    break
            if discard: continue
            # Undetermined bases #
            if 'N' in read: continue
            # Remove primer #
            read = read[match.end():]
            # Flip them because 454 reads the other end #
            read = read.reverse_complement()
            # Return #
            yield read

    def clean(self, **kwargs):
        """Write the cleaned reads to the standard FASTA file."""
        self.reads.write(self.clean_iterator(self.raw_fastq, **kwargs))

    def report_loss(self):
        """Print how many reads the cleaning step removed, as a percentage."""
        before, after = len(self.raw_fastq), len(self.reads)
        print("Before cleaning: %i" % before)
        print("After cleaning: %i" % after)
        # BUG FIX: the original computed 100 * (1 - before/after): the ratio
        # was inverted and, under Python 2, truncated by integer division,
        # reporting a zero or negative loss. Loss is 1 - after/before.
        print("Loss: %.2f%%" % (100.0 * (1 - after / float(before))))

    def process(self):
        """Renumber the cleaned reads into the final FASTA."""
        self.reads.rename_with_num(self.name + '_read', new_path=self.fasta)

    def make_fastq(self, **kwargs):
        """In some special cases we want the FASTQ"""
        self.fastq.write(self.clean_iterator(self.raw_fastq, **kwargs))
        self.fastq.rename_with_num(self.name + '_read')
        print("make_fastq for sample %s completed" % self.id_name)