def __init__(self, fasta_path, parent, database='silvamod', base_dir=None):
    # Parent #
    self.otu, self.parent = parent, parent
    # Inherited #
    self.samples = self.parent.samples
    # FASTA #
    self.fasta = FASTA(fasta_path)
    # The database to use #
    self.database = database
    self.database_path = databases[database]
    # Dir #
    if base_dir is None: self.base_dir = self.parent.p.crest_dir
    else:                self.base_dir = base_dir
    self.p = AutoPaths(self.base_dir, self.all_paths)
    # Graphs #
    self.graphs = [getattr(plots, cls_name)(self) for cls_name in plots.__all__]
    # OTU table #
    self.otu_csv      = CSVTable(self.p.otu_csv,      d='\t')
    self.otu_csv_norm = CSVTable(self.p.otu_csv_norm, d='\t')
    # Filtered centers file #
    self.centers = FASTA(self.p.centers)
    # Composition tables #
    self.comp_phyla = CompositionPhyla(self, self.p.comp_phyla)
    self.comp_tips  = CompositionTips(self,  self.p.comp_tips)
    self.comp_order = CompositionOrder(self, self.p.comp_order)
    self.comp_class = CompositionClass(self, self.p.comp_class)
    # Stats #
    self.stats = StatsOnTaxonomy(self)
def __init__(self, fasta_path, base_dir, parent, verbose=False):
    # Base #
    self.fasta   = FASTA(fasta_path)
    self.parent  = parent
    self.verbose = verbose
    # Auto paths #
    self.base_dir = base_dir
    self.p = AutoPaths(self.base_dir, self.all_paths)
    # Files #
    self.derep_cluster = SizesFASTA(self.p.derep_cluster)
    self.cluster_99    = SizesFASTA(self.p.cluster_99)
    self.positive      = SizesFASTA(self.p.positive)
    self.negative      = SizesFASTA(self.p.negative)
    self.subsampled    = FASTA(self.p.subsampled)
def __init__(self, parent):
    # Save parent #
    self.stat, self.parent = parent, parent
    self.tax = parent.tax
    # Paths #
    self.p = AutoPaths(self.parent.p.unifrac_dir, self.all_paths)
    # Files #
    self.clustalo_aligned = FASTA(self.p.clustalo_align)
    self.pynast_aligned   = FASTA(self.p.pynast_align)
    self.mothur_aligned   = FASTA(self.p.mothur_align)
    self.raxml_tree       = FilePath(self.p.raxml_tree)
    self.fasttree_tree    = FilePath(self.p.fasttree_tree)
    self.distances_csv    = CSVTable(self.p.distances_csv)
    # Graphs #
    self.nmds = NMDS(self, self.distances_csv, calc_distance=False)
def __init__(self, fasta, base_dir):
    # Base params #
    self.fasta = fasta if isinstance(fasta, FASTA) else FASTA(fasta)
    self.base_dir = base_dir
    self.p = AutoPaths(self.base_dir, self.all_paths)
    # Extra simple composition #
    from illumitag.clustering.composition import SimpleComposition
    self.composition = SimpleComposition(self, self.base_dir + 'comp_' + self.short_name + '/')
def load(self):
    # Special case for dummy samples #
    if self.info.get('dummy'): return
    # Paths #
    self.base_dir = self.pool.p.samples_dir + self.bar_name + '/'
    self.p = AutoPaths(self.base_dir, self.all_paths)
    self.path = str(self.p.orig_fastq)
    # Distances #
    self.trim_fwd = self.pool.samples.trim_fwd
    self.trim_rev = self.pool.samples.trim_rev
    # Files #
    self.trimmed = FASTQ(self.p.trimmed)
    self.renamed = FASTQ(self.p.renamed)
    self.fasta   = FASTA(self.p.reads_fasta)
def __init__(self, parent, base_dir, lower_bound, upper_bound):
    # Save parent #
    self.parent, self.fractions = parent, parent
    # Auto paths #
    self.base_dir = base_dir
    self.p = AutoPaths(self.base_dir, self.all_paths)
    # Bounds #
    self.lower_bound = lower_bound
    self.upper_bound = upper_bound
    # Size fractions #
    self.reads  = FASTA(self.p.reads_fasta)
    self.refere = UchimeRef(self.p.reads, self.p.refere_dir, self)
    self.denovo = UchimeDenovo(self.p.reads, self.p.denovo_dir, self)
    # Classification #
    self.rdp   = SimpleRdpTaxonomy(self.reads, self.p.rdp_dir)
    self.crest = SimpleCrestTaxonomy(self.reads, self.p.crest_dir)
def __init__(self, fasta_path, parent, base_dir=None):
    # Parent #
    self.otu, self.parent = parent, parent
    # Inherited #
    self.samples = self.parent.samples
    # FASTA #
    self.fasta = FASTA(fasta_path)
    # Dir #
    if base_dir is None: self.base_dir = self.parent.p.rdp_dir
    else:                self.base_dir = base_dir
    self.p = AutoPaths(self.base_dir, self.all_paths)
    # Graphs #
    self.graphs = [getattr(plots, cls_name)(self) for cls_name in plots.__all__]
    # Tables #
    self.otu_csv = CSVTable(self.p.otu_csv)
    # Composition tables #
    #self.comp_phyla = CompositionPhyla(self, self.p.comp_phyla)
    #self.comp_tips = CompositionTips(self, self.p.comp_tips)
    # Stats #
    self.stats = StatsOnTaxonomy(self)
class Cluster(object):
    """Analyzes a group of samples."""

    all_paths = """
    /reads/all_reads.fasta
    /otus/
    /logs/
    /metadata.csv
    """

    def __repr__(self): return '<%s object "%s" with %i samples>' % \
        (self.__class__.__name__, self.name, len(self.samples))
    def __iter__(self): return iter(self.samples)
    def __len__(self): return len(self.samples)
    def __getitem__(self, key): return self.samples[key]

    @property
    def first(self): return self.children[0]

    @property
    def count_seq(self): return sum([len(sample) for sample in self])

    def __init__(self, samples, name, base_dir=None):
        # Save samples #
        self.name = name
        self.samples, self.children = samples, samples
        # Check names are unique #
        names = [s.short_name for s in samples if s.used]
        assert len(names) == len(set(names))
        # Figure out pools #
        self.pools = list(set([s.pool for s in self.samples]))
        self.pools.sort(key=lambda x: x.id_name)
        # Load them #
        for p in self.pools: p.load()
        # Dir #
        if base_dir: self.base_dir = base_dir
        else:        self.base_dir = illumitag.view_dir + "clusters/" + self.name + '/'
        self.p = AutoPaths(self.base_dir, self.all_paths)
        # Runner #
        self.runner = ClusterRunner(self)
        # FASTA #
        self.reads = FASTA(self.p.all_reads_fasta)
        # OTU picking #
        self.otu_uparse = UparseOTUs(self)
        self.otu_uclust = UclustOTUs(self)
        self.otu_cdhit  = CdhitOTUs(self)
        # Reporting #
        self.reporter = ClusterReporter(self)

    def run(self, *args, **kwargs): self.runner.run(*args, **kwargs)
    def run_slurm(self, *args, **kwargs): self.runner.run_slurm(*args, **kwargs)

    def process_samples(self):
        for sample in tqdm(self): sample.process()

    def combine_reads(self):
        paths = [sample.fasta.path for sample in self]
        shell_output('cat %s > %s' % (' '.join(paths), self.reads))

    def set_size(self, length):
        """Trim all sequences to a specific length starting from the end."""
        self.size_trimmed = FASTA(new_temp_path())
        def trim_iterator(reads):
            for read in reads:
                if len(read) < length: continue
                yield read[-length:]
        self.size_trimmed.write(trim_iterator(self.reads))
        self.size_trimmed.close()
        # Replace it #
        self.reads.remove()
        shutil.move(self.size_trimmed, self.reads)

    def run_uparse(self): self.otu_uparse.run()

    @property
    def metadata(self):
        return pandas.DataFrame([s.info for s in self], index=[s.short_name for s in self])

    def export_metadata(self):
        self.metadata.to_csv(self.p.metadata, sep='\t', encoding='utf-8')
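###############################################################################
# A hedged usage sketch of the typical flow through a Cluster. The sample
# selection below is illustrative and 'demo' is a hypothetical cluster name;
# any list of processed Sample objects with unique short_names will do
# (the assert in __init__ enforces the uniqueness).
samples = [s for s in illumitag.pools[0].samples if s.used]  # illustrative
cluster = Cluster(samples, 'demo')  # paths land under view_dir/clusters/demo/
cluster.process_samples()           # per-sample processing, with a progress bar
cluster.combine_reads()             # cat every sample FASTA into cluster.reads
cluster.otu_uparse.run()            # one of the three OTU picking strategies
cluster.export_metadata()           # writes the tab-separated metadata table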
class Pyrosample(object):
    """A Pyrosample is a legacy object for the few 454 samples we still have
    and that we need to compare against the new Illumina technology."""

    all_paths = """
    /info.json
    /reads.fasta
    /renamed.fasta
    /raw/raw.sff
    /raw/raw.fastq
    /raw/raw.fasta
    /raw/raw.qual
    /raw/manifest.txt
    /fastq/reads.fastq
    """

    def __repr__(self): return '<%s object "%s">' % (self.__class__.__name__, self.id_name)

    def __init__(self, json_path, out_dir):
        # Attributes #
        self.out_dir = out_dir
        self.json_path = FilePath(json_path)
        # Parse #
        with open(json_path) as handle: self.info = json.load(handle)
        # Basic #
        self.run_num = self.info['run_num']
        self.project_short_name = self.info['project']
        self.project_long_name = self.info['project_name']
        # Own attributes #
        self.num = self.info['sample_num']
        self.short_name = self.info['sample']
        self.long_name = self.info['sample_name']
        self.name = 'run%i_sample%i' % (self.run_num, self.num)
        self.group = self.info['group']
        self.id_name = "run%03d-sample%02d" % (self.run_num, self.num)
        # SFF files #
        self.sff_files_info = self.info['files']
        for f in self.sff_files_info:
            if not os.path.exists(f['path']): raise Exception("No file at %s" % f['path'])
        # Automatic paths #
        self.base_dir = self.out_dir + self.id_name + '/'
        self.p = AutoPaths(self.base_dir, self.all_paths)
        # Make an alias to the json #
        self.p.info_json.link_from(self.json_path, safe=True)
        # Pool dummy #
        self.pool, self.parent = self, self
        # Other dummy variables #
        self.bar_len = 0
        self.gziped = False
        self.used = True
        # Primer #
        self.primer_regex = re.compile(self.info['primer'])
        # Raw files #
        self.raw_fasta = FASTA(self.p.raw_fasta)
        self.raw_fastq = FASTQ(self.p.raw_fastq)
        # Standard FASTA #
        self.reads = FASTA(self.p.reads_fasta)
        self.fasta = FASTA(self.p.renamed)
        # Special FASTQ #
        self.fastq = FASTQ(self.p.reads_fastq)
        # A shameless hack for cdhit to work #
        self.renamed = self.fastq
        # Pre-denoised special case #
        if self.info['predenoised'] and False:
            self.sff_files_info = []
            self.reads.link_from(self.info['predenoised'], safe=True)

    def load(self): pass

    def extract(self):
        # Call extraction #
        shell_output('sffinfo -s %s > %s' % (self.p.raw_sff, self.p.raw_fasta))
        shell_output('sffinfo -q %s > %s' % (self.p.raw_sff, self.p.raw_qual))
        shell_output('sffinfo -m %s > %s' % (self.p.raw_sff, self.p.manifest))
        # Convert #
        sh.fasta_to_fastq(self.p.raw_fasta, self.p.raw_qual, self.p.raw_fastq)

    def clean_iterator(self, reads, minlength=400, threshold=21, windowsize=20):
        for read in reads:
            # Length #
            if len(read) < minlength: continue
            # Primer #
            match = self.primer_regex.search(str(read.seq))
            if not match: continue
            # PHRED score #
            scores = read.letter_annotations["phred_quality"]
            averaged = moving_average(scores, windowsize)
            discard = False
            for i, value in enumerate(averaged):
                if value < threshold:
                    read = read[:i + windowsize - 1]
                    if len(read) < minlength: discard = True
                    break
            if discard: continue
            # Undetermined bases #
            if 'N' in read: continue
            # Remove primer #
            read = read[match.end():]
            # Flip them because 454 reads the other end #
            read = read.reverse_complement()
            # Return #
            yield read

    def clean(self, **kwargs):
        self.reads.write(self.clean_iterator(self.raw_fastq, **kwargs))

    def report_loss(self):
        print "Before cleaning: %i" % len(self.raw_fastq)
        print "After cleaning: %i" % len(self.reads)
        # Float division, and after/before rather than before/after,
        # otherwise the loss fraction comes out inverted #
        print "Loss: %.2f%%" % (100 * (1 - len(self.reads) / float(len(self.raw_fastq))))

    def process(self):
        self.reads.rename_with_num(self.name + '_read', new_path=self.fasta)

    def make_fastq(self, **kwargs):
        """In some special cases we want the FASTQ."""
        self.fastq.write(self.clean_iterator(self.raw_fastq, **kwargs))
        self.fastq.rename_with_num(self.name + '_read')
        print "make_fastq for sample %s completed" % self.id_name
# Built-in modules #
import os

# Internal modules #
import illumitag
from illumitag.fasta.single import FASTA

# Third party modules #
import sh

# Constants #
home = os.environ['HOME'] + '/'
silvamod_path  = home + 'glob/16s/silvamod.fasta'
amplified_path = home + 'glob/16s/silvamod_v3_v4.fasta'
aligned_path   = home + 'glob/16s/silvamod_v3_v4.align'

# Objects #
silvamod  = FASTA(silvamod_path)
amplified = FASTA(amplified_path)
aligned   = FASTA(aligned_path)

###############################################################################
def amplify():
    """A function to parse the silvamod 16S database and find the primers within
    the full-length sequences to determine the probable length of our amplified
    region."""
    primers = illumitag.pools[0].primers
    bar_len = illumitag.pools[0].bar_len
    counts = {
        'success':  0,
        'only_fwd': 0,
        'only_rev': 0,
class PrimerGroup(object):
    """A bunch of sequences all having the same type of primer outcome
    (and assembly outcome)."""

    all_paths = """
    /orig.fastq
    /n_filtered.fastq
    /qual_filtered.fastq
    /len_filtered.fastq
    /trimmed_barcodes.fasta
    """

    def __repr__(self): return '<%s object of %s>' % (self.__class__.__name__, self.parent)
    def __len__(self): return len(self.orig_reads)

    def create(self): self.orig_reads.create()
    def add_read(self, read): self.orig_reads.add_read(read)
    def close(self): self.orig_reads.close()

    def __init__(self, parent):
        # Save parent #
        self.parent, self.assemble_group = parent, parent
        self.samples = parent.samples
        self.pool = self.parent.pool
        # Auto paths #
        self.base_dir = parent.p.groups_dir + self.short_name + '/'
        self.p = AutoPaths(self.base_dir, self.all_paths)
        # More #
        self.orig_reads = self.parent.cls(self.p.orig_fastq, samples=self.samples)
        self.n_filtered = self.parent.cls(self.p.n_filtered, samples=self.samples)
        # Quality filtered #
        if self.parent == 'assembled':
            self.qual_filtered = FASTQ(self.p.qual_filtered, samples=self.samples)
            self.len_filtered = FASTQ(self.p.len_filtered_fastq, samples=self.samples)
            self.trimmed_barcodes = FASTA(self.p.trimmed_barcodes)
        # Further #
        self.load()

    def load(self): pass

    def n_filter(self):
        """Called from AssembleGroup.discard_reads_with_n"""
        def no_n_iterator(reads):
            for read in reads:
                if 'N' in read: continue
                yield read
        self.n_filtered.write(no_n_iterator(self.orig_reads))

    def qual_filter(self):
        """Called from Assemble.quality_filter"""
        def good_qual_iterator(reads, threshold=5, windowsize=10):
            for read in reads:
                averaged = moving_average(read.letter_annotations["phred_quality"], windowsize)
                if any([value < threshold for value in averaged]): continue
                yield read
        self.qual_filtered.write(good_qual_iterator(self.n_filtered))

    def len_filter(self):
        """Called from Assemble.length_filter"""
        def good_len_iterator(reads, min_length=400):
            for read in reads:
                if len(read) < min_length: continue
                yield read
        self.len_filtered.write(good_len_iterator(self.qual_filtered))

    def trim_bc(self):
        """Called from Assemble.trim_barcodes"""
        def no_barcodes_iterator(reads):
            for read in reads:
                yield read[self.pool.bar_len:-self.pool.bar_len]
        if self.pool.bar_len == 0: self.len_filtered.to_fasta(self.trimmed_barcodes)
        else: self.trimmed_barcodes.write(no_barcodes_iterator(self.len_filtered))
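###############################################################################
# A sketch of the order in which these filters run. In the real pipeline the
# parent assembly object drives them; the good_primers attribute path is the
# one used by the evaluation script further down and is otherwise assumed.
group = pool.good_barcodes.assembled.good_primers  # an assumed PrimerGroup
group.n_filter()     # orig.fastq       -> n_filtered.fastq    (drop reads containing N)
group.qual_filter()  # n_filtered.fastq -> qual_filtered.fastq (windowed PHRED >= 5)
group.len_filter()   # qual_filtered    -> len_filtered.fastq  (keep reads >= 400 bp)
group.trim_bc()      # len_filtered     -> trimmed_barcodes.fasta (strip barcodes)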
def load(self):
    self.cls = FASTA
    self.base_dir = self.outcome.p.unassembled_dir
    self.p = AutoPaths(self.base_dir, self.all_paths)
    self.path = self.p.orig_fasta
    self.flipped_reads = FASTA(self.p.flipped, self.samples, self.primers)
# Make fraction graph #
proj.graphs[-1].plot()

# Get statistics #
proj.reporter.fraction_discarded

# Get clustering values #
r1, r2 = list(set([p.run for p in proj]))
r1.parse_report_xml()
r2.parse_report_xml()
print float(r1.report_stats['fwd']['DensityPF']) / float(r1.report_stats['fwd']['DensityRaw'])
print float(r2.report_stats['fwd']['DensityPF']) / float(r2.report_stats['fwd']['DensityRaw'])

# Check below 400 bp sequences #
folder = DirectoryPath(illumitag.projects['evaluation'].base_dir + "below_400/")
over = FASTA(folder + "reads.fasta")
def over_iterator(reads, max_length=400):
    for read in reads:
        if len(read) <= max_length: yield read
over.create()
for pool in pools:
    over.add_iterator(over_iterator(pool.good_barcodes.assembled.good_primers.qual_filtered))
over.close()
over.graphs[-1].plot()
crest = SimpleCrestTaxonomy(over, folder)
crest.assign()
crest.composition.graph.plot()
rdp = SimpleRdpTaxonomy(over, folder)
rdp.assign()
rdp.composition.graph.plot()
class UclustOTUs(OTUs):
    """Will use uclust via the QIIME wrapper to create OTU clusters
    from a given FASTA file:
    http://qiime.org/scripts/pick_otus.html"""

    short_name = 'uclust'
    title = 'UCLUST-QIIME denovo picking'

    all_paths = """
    /clusters/clusters.uc
    /clusters/qiime.log
    /clusters/all_otus.txt
    /clusters/all_centers.fasta
    /centers.fasta
    /otus.txt
    /taxonomy_silva/
    /taxonomy_fw/
    /graphs/
    """

    def __repr__(self): return '<%s object of %s>' % (self.__class__.__name__, self.parent)

    def __init__(self, cluster):
        # Save parent #
        self.cluster, self.parent = cluster, cluster
        # Inherited #
        self.samples = self.parent.samples
        # Paths #
        self.base_dir = self.parent.p.otus_dir + self.short_name + '/'
        self.p = AutoPaths(self.base_dir, self.all_paths)
        # Main FASTA file #
        self.reads = self.parent.reads
        # Files #
        self.all_otus = FilePath(self.p.all_otus)
        self.all_centers = FASTA(self.p.all_centers)
        self.otus = FilePath(self.base_dir + "otus.txt")
        self.centers = FASTA(self.base_dir + "centers.fasta")
        # Taxonomy #
        self.taxonomy_silva = CrestTaxonomy(self.centers, self, 'silvamod', self.p.silva)
        self.taxonomy_fw = CrestTaxonomy(self.centers, self, 'freshwater', self.p.fw_dir)
        # Preferred one #
        self.taxonomy = self.taxonomy_silva

    def run(self):
        # Clean #
        shutil.rmtree(self.p.clusters_dir)
        # Run command #
        pick_otus = sh.Command('pick_otus.py')
        pick_otus('-m', 'uclust', '-i', self.reads, '-o', self.p.clusters_dir)
        # Move into place #
        base_name = self.p.clusters_dir + self.reads.prefix
        shutil.move(base_name + '_otus.txt', self.all_otus)
        shutil.move(base_name + '_otus.log', self.p.qiime_log)
        shutil.move(base_name + '_clusters.uc', self.p.clusters_uc)
        # Remove OTUs that are only one read #
        def filter_singletons(f):
            for line in f:
                line = line.split()
                if len(line) > 2: yield '\t'.join(line) + '\n'
        self.otus.writelines(filter_singletons(self.all_otus))
        # Create the centers file that is missing #
        pick_rep = sh.Command('pick_rep_set.py')
        pick_rep('-i', self.all_otus, '-f', self.reads, '-o', self.all_centers)
        # Remake the centers file without the filtered OTUs #
        self.otus_to_keep = [line.split()[0] for line in self.otus]
        def filter_otus(f):
            for seq in f:
                if seq.id in self.otus_to_keep: yield seq
        self.centers.write(filter_otus(self.all_centers))

    @property_cached
    def cluster_counts_table(self):
        """Create the unfiltered OTU table."""
        # Put results in a dict of dicts #
        result = defaultdict(lambda: defaultdict(int))
        # Loop #
        for line in self.otus:
            # Parse the line #
            contents = line.split()
            otu, reads = contents[0], contents[1:]
            # Parse the hits #
            for r in reads:
                nums = re.findall("run([0-9]+)_pool([0-9]+)_sample([0-9]+)_read([0-9]+)", r)
                if nums:
                    run_num, pool_num, sample_num, read_num = map(int, nums[0])
                    sample = illumitag.runs[run_num][pool_num-1][sample_num-1]
                    name = sample.short_name
                else:
                    nums = re.findall("run([0-9]+)_sample([0-9]+)_read([0-9]+)", r)
                    run_num, sample_num, read_num = map(int, nums[0])
                    sample = [s for s in illumitag.presamples + illumitag.pyrosamples
                              if s.run_num == run_num and s.num == sample_num][0]
                    name = sample.short_name
                # Count #
                result[otu][name] += 1
        # Return #
        result = pandas.DataFrame(result)
        result = result.fillna(0)
        result = result.astype(int)
        result = result.reindex_axis(sorted(result.columns, key=natural_sort), axis=1)
        return result
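###############################################################################
# A hedged usage sketch: run() shells out to QIIME 1's pick_otus.py and
# pick_rep_set.py, so both must be on the PATH for it to work.
otus = cluster.otu_uclust          # built by Cluster.__init__ above
otus.run()                         # pick OTUs, drop singletons, extract centers
table = otus.cluster_counts_table  # pandas DataFrame of per-sample OTU counts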
class UparseOTUs(OTUs):
    """Will use UPARSE to create OTU clusters from a given FASTA file:
    http://www.nature.com/doifinder/10.1038/nmeth.2604"""

    short_name = 'uparse'
    title = 'UPARSE denovo picking'

    all_paths = """
    /derep.fasta
    /sorted.fasta
    /centers.fasta
    /readmap.uc
    /taxonomy_silva/
    /taxonomy_fw/
    /taxonomy_rdp/
    /graphs/
    /seqenv/
    """

    def __repr__(self): return '<%s object of %s>' % (self.__class__.__name__, self.parent)

    def __init__(self, cluster):
        # Save parent #
        self.cluster, self.parent = cluster, cluster
        # Inherited #
        self.samples = self.parent.samples
        # Paths #
        self.base_dir = self.parent.p.otus_dir + self.short_name + '/'
        self.p = AutoPaths(self.base_dir, self.all_paths)
        # Main FASTA file #
        self.reads = self.parent.reads
        # Files #
        self.derep = SizesFASTA(self.p.derep)
        self.sorted = SizesFASTA(self.p.sorted)
        self.centers = FASTA(self.p.centers)
        self.readmap = UClusterFile(self.p.readmap)
        # Taxonomy #
        self.taxonomy_silva = CrestTaxonomy(self.centers, self, 'silvamod', self.p.silva_dir)
        self.taxonomy_fw = CrestTaxonomy(self.centers, self, 'freshwater', self.p.fw_dir)
        self.taxonomy_rpd = RdpTaxonomy(self.centers, self)
        # Preferred one #
        self.taxonomy = self.taxonomy_silva
        # Source tracking #
        self.seqenv = Seqenv(self)

    def run(self, threshold=3.0):
        # Dereplicate #
        sh.usearch7("--derep_fulllength", self.reads, '-output', self.derep, '-sizeout')
        # Order by size and kill singletons #
        sh.usearch7("--sortbysize", self.derep, '-output', self.sorted, '-minsize', 2)
        # Compute the centers #
        sh.usearch7("--cluster_otus", self.sorted, '-otus', self.centers, '-otu_radius_pct', threshold)
        # Rename the centers #
        self.centers.rename_with_num('OTU_')
        # Map the reads back to the centers #
        identity = (100 - threshold) / 100
        sh.usearch7("-usearch_global", self.reads, '-db', self.centers, '-strand', 'plus',
                    '-id', identity, '-uc', self.readmap)

    def checks(self):
        assert len(self.reads) == len(self.derep)
        assert len(self.reads) == len(self.readmap)

    @property_cached
    def cluster_counts_table(self):
        """Parse that custom output for creating the unfiltered OTU table."""
        result = pandas.DataFrame(self.readmap.otu_sample_counts)
        result = result.fillna(0)
        result = result.astype(int)
        result = result.reindex_axis(sorted(result.columns, key=natural_sort), axis=1)
        return result
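###############################################################################
# run() follows the canonical UPARSE steps: dereplicate, sort and discard
# singletons, cluster at a 3% radius, then map reads back at the matching
# identity, (100 - 3) / 100 = 0.97. A hedged usage sketch:
u = cluster.otu_uparse           # built by Cluster.__init__ above
u.run(threshold=3.0)             # requires the usearch7 binary on the PATH
u.checks()                       # dereplication and mapping must cover every read
counts = u.cluster_counts_table  # OTU x sample counts as a pandas DataFrame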
class CdhitOTUs(OTUs):
    """Will use cd-hit to create OTU clusters from a given FASTQ file:
    http://weizhong-lab.ucsd.edu/cd-hit-otu/"""

    short_name = 'cdhit'
    title = 'CD-HIT Illumina OTU picking'

    all_paths = """
    /all_reads.fastq
    /clusters/OTU.nr2nd.clstr
    /centers.fasta
    /otus.txt
    /taxonomy_silva/
    /taxonomy_fw/
    /graphs/
    """

    def __repr__(self): return '<%s object of %s>' % (self.__class__.__name__, self.parent)

    def __init__(self, cluster):
        # Save parent #
        self.cluster, self.parent = cluster, cluster
        # Inherited #
        self.samples = self.parent.samples
        # Paths #
        self.base_dir = self.parent.p.otus_dir + self.short_name + '/'
        self.p = AutoPaths(self.base_dir, self.all_paths)
        # Main reads file here FASTQ #
        self.reads = FASTQ(self.p.all_reads)
        # Files #
        self.cdhit_clusters = FilePath(self.p.clstr)
        self.cdhit_centers = FASTA(self.p.clusters_dir + "OTU")
        self.centers = FASTA(self.p.centers)
        # Taxonomy #
        self.taxonomy_silva = CrestTaxonomy(self.centers, self, 'silvamod', self.p.silva)
        self.taxonomy_fw = CrestTaxonomy(self.centers, self, 'freshwater', self.p.fw_dir)
        # Preferred one #
        self.taxonomy = self.taxonomy_silva

    def run(self):
        # Combine reads but in fastq format this time #
        paths = [sample.renamed for sample in self.cluster]
        shell_output('cat %s > %s' % (' '.join(paths), self.reads))
        # Clean #
        shutil.rmtree(self.p.clusters_dir)
        # Run command #
        cdhit = sh.Command(cdhit_script)
        cdhit('-i', self.reads, '-o', self.p.clusters_dir, '-p', TmpFile.from_string('[ACTG]'))
        # Create the centers file with good names #
        self.cdhit_centers.rename_with_num('OTU_', self.centers)

    @property_cached
    def cluster_counts_table(self):
        """Create the unfiltered OTU table."""
        # Put results in a dict of dicts #
        result = defaultdict(lambda: defaultdict(int))
        # Loop #
        for line in self.cdhit_clusters:
            if line.startswith('>'):
                otu = "OTU_%s" % line.split()[1]
                continue
            nums = re.findall(">run([0-9]+)_pool([0-9]+)_sample([0-9]+)_read([0-9]+)\.\.\.", line)
            if nums:
                run_num, pool_num, sample_num, read_num = map(int, nums[0])
                sample = illumitag.runs[run_num][pool_num - 1][sample_num - 1]
                name = sample.short_name
            else:
                nums = re.findall(">run([0-9]+)_sample([0-9]+)_read([0-9]+)\.\.\.", line)
                run_num, sample_num, read_num = map(int, nums[0])
                sample = [s for s in illumitag.presamples + illumitag.pyrosamples
                          if s.run_num == run_num and s.num == sample_num][0]
                name = sample.short_name
            # Count #
            result[otu][name] += 1
        # Return #
        result = pandas.DataFrame(result)
        result = result.fillna(0)
        result = result.astype(int)
        result = result.reindex_axis(sorted(result.columns, key=natural_sort), axis=1)
        # Remove OTUs that are only one read #
        return result
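###############################################################################
# For reference, cluster_counts_table walks CD-HIT's .clstr output; a sketch
# of the lines it expects (read names and lengths illustrative):
#
#   >Cluster 0
#   0   402nt, >run1_pool1_sample1_read1... *
#   1   401nt, >run1_pool1_sample2_read7... at +/99.50%
#
# '>' header lines give the OTU id (here "OTU_0"); member lines are matched
# with the run/pool/sample regexes and tallied under the sample's short_name.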
# Built-in modules #
import os, shutil, glob

# Internal modules #
from illumitag.common.autopaths import AutoPaths, FilePath
from illumitag.common.cache import property_cached
from illumitag.common.csv_tables import CSVTable
from illumitag.clustering.statistics.nmds import NMDS
from illumitag.fasta.single import FASTA

# Third party modules #
import sh, pandas

# Constants #
home = os.environ['HOME'] + '/'
reference = FASTA(home + 'glob/16s/silva/v111/rep_set_aligned/97_Silva_111_rep_set.fasta')

###############################################################################
class Unifrac(object):
    """A class to compute the UniFrac algorithm producing a distance matrix
    from a bunch of different samples and their reads.

    Step 1. Make an alignment of all the OTU centers against a reference.
    One can use:
        * clustalo
        * PyNAST
        * mothur <- fastest
        * SINA

    Step 2. From the alignment produced make a tree."""
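###############################################################################
# Step 2 is not shown in this excerpt. A minimal sketch of one way to build
# the tree with FastTree through sh (assuming a FastTree binary on the PATH,
# which the fasttree_tree file path elsewhere in this module suggests):
def make_fasttree(alignment, tree):
    # FastTree reads a nucleotide alignment (-nt, GTR model) and writes a
    # Newick tree on standard output, which sh redirects into the tree file.
    sh.FastTree('-nt', '-gtr', str(alignment), _out=str(tree))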
class QualityReads(object):
    """A set of sequences determined to be quality controlled."""

    all_paths = """
    /mothur_reads.fasta
    /mothur_reads.qual
    /mothur_groups.tsv
    /qiime_reads.fasta
    /only_used_samples.fasta
    /trimmed.fasta
    """

    def __repr__(self): return '<%s object of %s>' % (self.__class__.__name__, self.parent)
    def __len__(self): return len(self.trimmed)

    def __init__(self, path, parent):
        # Save parent #
        self.parent, self.pool = parent, parent
        self.samples = parent.samples
        # Auto paths #
        self.base_dir = parent.p.quality_dir + '/'
        self.p = AutoPaths(self.base_dir, self.all_paths)
        # Files #
        self.untrimmed = FASTQ(path, samples=self.samples)
        self.only_used = FASTA(self.p.only_used, samples=self.samples)
        self.trimmed = FASTA(self.p.trimmed)
        # Qiime output #
        self.qiime_fasta = FASTA(self.p.qiime_fasta)
        # Mothur #
        self.mothur_fasta = FASTA(self.p.mothur_fasta)
        self.mothur_qual = QualFile(self.p.mothur_qual)
        self.mothur_groups = GroupFile(self.p.mothur_groups)
        # Primer size #
        self.trim_fwd = self.pool.samples.trim_fwd
        self.trim_rev = self.pool.samples.trim_rev

    def filter_unused(self):
        def no_unused_iterator(reads):
            for r in reads.parse_barcodes():
                if r.first.sample.used: yield r.read
        self.only_used.write(no_unused_iterator(self.untrimmed))

    def trim_primers(self):
        def no_primers_iterator(reads):
            for read in reads:
                yield read[self.trim_fwd:-self.trim_rev]
        self.trimmed.write(no_primers_iterator(self.only_used))

    def make_mothur_output(self):
        # Trimmed fasta #
        self.mothur_fasta.link_from(self.trimmed.path)
        # The groups file #
        self.mothur_groups.create()
        for r in self.only_used.parse_barcodes():
            sample_name = r.first.sample.short_name
            read_name = '%s\t%s\n' % (r.read.id, sample_name)
            self.mothur_groups.handle.write(read_name)
        self.mothur_groups.close()

    def make_qiime_output(self):
        # Prepare fasta writer #
        handle = open(self.qiime_fasta.path, 'w')
        writer = FastaWriter(handle, wrap=0)
        writer.write_header()
        # Counter #
        counter = defaultdict(int)
        # Do it #
        for r in self.only_used.parse_barcodes():
            sample_name = r.first.sample.short_name
            counter[sample_name] += 1
            r.read.id = '%s_%i %s' % (sample_name, counter[sample_name], r.read.id)
            bar_seq = r.read.seq[0:self.pool.bar_len]
            r.read.description = "orig_bc=%s new_bc=%s bc_diffs=0" % (bar_seq, bar_seq)
            writer.write_record(r.read[self.trim_fwd:-self.trim_rev])
        # Close #
        writer.write_footer()
        handle.close()
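###############################################################################
# make_qiime_output relabels reads in the format QIIME's downstream scripts
# expect from split_libraries.py; roughly, with an illustrative sample name,
# counter and barcode (the original read id is kept after the new label):
#
#   >soil1_1 original_read_id orig_bc=ACGTACGT new_bc=ACGTACGT bc_diffs=0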
def __init__(self, json_path, out_dir):
    # Attributes #
    self.out_dir = out_dir
    self.json_path = FilePath(json_path)
    # Parse #
    with open(json_path) as handle: self.info = json.load(handle)
    # Basic #
    self.account = self.info['uppmax_id']
    self.run_num = self.info['run_num']
    self.run_label = self.info['run_id']
    self.project_short_name = self.info['project']
    self.project_long_name = self.info['project_name']
    self.fwd_name = self.info['forward_reads']
    self.rev_name = self.info['reverse_reads']
    # Own attributes #
    self.num = self.info['sample_num']
    self.label = self.info['sample_id']
    self.short_name = self.info['sample']
    self.long_name = self.info['sample_name']
    self.name = 'run%i_sample%i' % (self.run_num, self.num)
    self.group = self.info['group']
    self.id_name = "run%03d-sample%02d" % (self.run_num, self.num)
    self.fwd_mid = self.info['forward_mid']
    self.rev_mid = self.info['reverse_mid']  # assumes a 'reverse_mid' key; the original read 'forward_mid' here, a likely copy-paste slip
    # Automatic paths #
    self.base_dir = self.out_dir + self.id_name + '/'
    self.p = AutoPaths(self.base_dir, self.all_paths)
    # Special #
    self.primers = TwoPrimers(self)
    # Samples dummy #
    self.info['samples'] = [{"name": self.short_name, "used": 1, "group": self.group,
                             "dummy": 1, "num": self.num, "fwd": "", "rev": ""}]
    self.samples = Samples(self)
    self.samples.load()
    # Pool dummy #
    self.pool, self.parent = self, self
    # Files #
    self.fwd_path = home + "ILLUMITAG/INBOX/%s/%s/%s" % (self.run_label, self.label, self.fwd_name)
    self.rev_path = home + "ILLUMITAG/INBOX/%s/%s/%s" % (self.run_label, self.label, self.rev_name)
    self.gziped = True if self.fwd_path.endswith('gz') else False
    self.fwd = FASTQ(self.p.fwd)
    self.rev = FASTQ(self.p.rev)
    self.fastq = PairedFASTQ(self.fwd.path, self.rev.path, self)
    # Barcode length #
    self.bar_len = 0
    # Make an alias to the json #
    self.p.info_json.link_from(self.json_path, safe=True)
    # Assembly files as children #
    self.assembled = Assembled(self)
    self.unassembled = Unassembled(self)
    self.children = (self.assembled, self.unassembled)
    self.first = self.assembled
    # Graphs #
    self.graphs = [getattr(outcome_plots, cls_name)(self) for cls_name in outcome_plots.__all__]
    # Runner #
    self.runner = PresampleRunner(self)
    # Final #
    self.trimmed = FASTQ(self.p.trimmed)
    self.renamed = FASTQ(self.p.renamed)
    self.fasta = FASTA(self.p.reads_fasta)