def make_file_for_primer_3(gff_file, ref_file, names_file, output_file, start, end):
    # Check for a tmp directory
    if len(glob.glob("./tmp")) == 0:
        call(["mkdir", "tmp"])
    gff_file = list(annotation.read_annotations(gff_file))
    print "\nReading in the reference file\n"
    seq_dict = dict(io.read_sequences(ref_file))
    names_file = open(names_file).readlines()
    config = open("Software/primer_config.txt").readlines()
    with open("tmp/regions_" + output_file, 'w') as out_f:
        for name in names_file:
            sname = name.strip("\n ")
            found = False
            for line in gff_file:
                gff_name = line.attr.get("Name", "No_name")
                peak = line.attr.get("id", "No_id")
                if sname in gff_name.split("/"):
                    out_f.write("SEQUENCE_ID=" + gff_name.replace("/", "_") + "_" + peak + "\n")
                    # Shift the peak region by the requested start/end offsets
                    # (originally hardcoded to move peaks 30 bases proximal)
                    out_f.write("SEQUENCE_TEMPLATE=" + line.shifted(start, end).get_seq(seq_dict) + "\n")
                    found = True
                    for cline in config:
                        out_f.write(cline.strip("\n") + "\n")
                    out_f.write("=" + "\n")
            if not found:
                print "Could not find the gene " + sname + " in the reference gff file\n"

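# Hedged usage sketch for make_file_for_primer_3 above; every path here is
# hypothetical. start/end are passed straight through to line.shifted(), as in
# the older variant further down that hardcodes shifted(-100, 0).
def _example_make_file_for_primer_3():
    make_file_for_primer_3("peaks.gff", "reference.fa", "gene_names.txt",
                           "my_run.txt", start=-100, end=0)
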
def run(self):
    if self.output is not None:
        out_file = open(self.output, 'wb')
    else:
        out_file = sys.stdout
    annotation.write_gff3_header(out_file)
    for filename in self.filenames:
        for item in annotation.read_annotations(filename):
            if not selection.matches(self.select, [item.type]):
                continue
            if 'ID' not in item.attr and 'locus_tag' in item.attr:
                item.attr['ID'] = item.attr['locus_tag']
            if 'color' not in item.attr:
                if item.type == 'CDS':
                    item.attr['color'] = '#008800'
                if item.type == 'rRNA':
                    item.attr['color'] = '#bb0000'
                if item.type == 'tRNA':
                    item.attr['color'] = '#bb00bb'
                if item.type == 'misc_feature':
                    item.attr['color'] = '#8888ff'
            print >> out_file, item.as_gff()
    if self.output is not None:
        out_file.close()

def set_annotations(self, filenames):
    f = self.open('reference.gff', 'wb')
    annotation.write_gff3_header(f)
    for filename in filenames:
        for feature in annotation.read_annotations(filename):
            print >> f, feature.as_gff()
    f.close()

def set_annotations(self, filenames):
    f = self.open('reference.gff', 'wb')
    print >> f, '##gff-version 3'
    for filename in filenames:
        for feature in annotation.read_annotations(filename):
            print >> f, feature.as_gff()
    f.close()

def build_snpeff(self):
    jar = io.find_jar('snpEff.jar')
    with open(self/'snpeff.config', 'wb') as f:
        print >> f, 'data_dir = snpeff'
        print >> f, 'genomes : ' + self.name
        print >> f, self.name + '.genome : ' + self.name
    snpwork = io.Workspace(self/'snpeff', must_exist=False)
    snpwork_genome = io.Workspace(snpwork/self.name, must_exist=False)
    snpwork_genomes = io.Workspace(snpwork/'genomes', must_exist=False)
    annotations = self.annotations_filename()
    assert annotations
    with open(snpwork_genome/'genes.gff', 'wb') as f:
        for record in annotation.read_annotations(annotations):
            if record.end <= record.start:
                continue
            if not record.attr:
                record.attr['attributes'] = 'none'
            print >> f, record.as_gff()
    with open(snpwork_genomes/(self.name+'.fa'), 'wb') as f:
        for name, seq in io.read_sequences(self.reference_fasta_filename()):
            io.write_fasta(f, name, seq)
    io.execute('java -jar JAR build NAME -gff3 -c CONFIG',
        JAR=jar, NAME=self.name, CONFIG=self/'snpeff.config')

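# For illustration (comments only): with a hypothetical self.name of
# "mygenome", the snpeff.config written above contains exactly
#
#   data_dir = snpeff
#   genomes : mygenome
#   mygenome.genome : mygenome
#
# and the final io.execute() call expands to roughly
# "java -jar snpEff.jar build mygenome -gff3 -c <output_dir>/snpeff.config".
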
def coding_regions(self):
    coding_regions = {}
    annotations = list(annotation.read_annotations(join(self.dirname, 'reference.gff')))
    annotation.link_up_annotations(annotations)
    for item in annotations:
        if item.type == "CDS":
            [mrna] = item.parents
            [gene] = mrna.parents
            name = gene.get_id()
            if name not in coding_regions:
                coding_regions[name] = []
            coding_regions[name].append(item)
    for gene in annotations:
        if gene.type != "gene":
            continue
        name = gene.get_id()
        if name not in coding_regions:
            coding_region = gene.three_prime()
            coding_region.attr = {"ID": name}
        else:
            items = coding_regions[name]
            coding_region = annotation.Annotation(
                type="coding_region",
                seqid=items[0].seqid,
                strand=items[0].strand,
                start=min(item2.start for item2 in items),
                end=max(item2.end for item2 in items),
                attr={"ID": name},
                )
        coding_regions[name] = coding_region
    return coding_regions

def run(self):
    only = set(item.lower() for item in self.only)
    exclude = set(item.lower() for item in self.exclude)
    if self.output is not None:
        out_file = open(self.output, 'wb')
    else:
        out_file = sys.stdout
    print >> out_file, '##gff-version 3'
    for filename in self.filenames:
        for item in annotation.read_annotations(filename):
            if only and item.type.lower() not in only:
                continue
            if item.type.lower() in exclude:
                continue
            if 'ID' not in item.attr and 'locus_tag' in item.attr:
                item.attr['ID'] = item.attr['locus_tag']
            if 'color' not in item.attr:
                if item.type == 'CDS':
                    item.attr['color'] = '#008800'
                if item.type == 'rRNA':
                    item.attr['color'] = '#bb0000'
                if item.type == 'tRNA':
                    item.attr['color'] = '#bb00bb'
                if item.type == 'misc_feature':
                    item.attr['color'] = '#8888ff'
            print >> out_file, item.as_gff()
    if self.output is not None:
        out_file.close()

def run(self):
    extractions = []
    for item in self.genes.split(','):
        extraction = item.split('/')
        assert len(extraction) == 4
        extractions.append(extraction)
    rename = {}
    if self.rename:
        for item in self.rename.split(','):
            old, new = item.split('=')
            rename[old] = new
    work = self.get_workspace()
    with workspace.tempspace() as temp:
        items = list(annotation.read_annotations(self.annotation))
        for item in items:
            item.seqid = rename.get(item.seqid, item.seqid)
        annotation.write_gff3(temp/'temp.gff', get_genes(items, extractions, self.log))
        del items
        with open(temp/'temp.fa', 'wb') as f:
            for name, seq in io.read_sequences(self.genome):
                name = name.split()[0]
                name = rename.get(name, name)
                io.write_fasta(f, name, seq)
        reference_directory.Make_tt_reference(
            self.output_dir,
            filenames = [ temp/'temp.fa', temp/'temp.gff' ],
            index = self.index,
            ).run()

def _describe_peaks(self, r):
    workspace = io.Workspace(self.output_dir, must_exist=False)
    counts = io.read_grouped_table(workspace/("expression", "peakwise", "counts.csv"))["Count"]
    peak_counts = collections.defaultdict(int)
    read_counts = collections.defaultdict(int)
    total = 0
    for item in annotation.read_annotations(workspace/("peaks", "relation-child.gff")):
        peak_counts[item.attr.get("Relation", "None")] += 1
        read_counts[item.attr.get("Relation", "None")] += sum(
            int(c) for c in counts[item.get_id()].values())
        total += 1
    total_reads = sum(read_counts.values())
    r.write("<p>\n")
    r.write("%d peaks\n" % total)
    for name, desc in [
            ("3'UTR", "in a 3' UTR"),
            ("Exon", "otherwise in an exon"),
            ("Downstrand", "otherwise downstrand of a non-coding RNA"),
            ("Intron", "otherwise in an intron"),
            ("Antisense", "otherwise antisense to a gene"),
            ("None", "couldn't be related to annotated genes"),
            ]:
        r.write("<br/>%d peaks and %.1f%% of reads %s\n" % (
            peak_counts[name], read_counts[name]*100.0/total_reads, desc))
    r.write("</p>\n")

def make_file_for_primer_3(gff_file, ref_file, names_file, output_file):
    # Check for a tmp directory
    if len(glob.glob("./tmp")) == 0:
        call(["mkdir", "tmp"])
    gff_file = list(annotation.read_annotations(gff_file))
    print "\n Reading in the reference file"
    seq_dict = dict(io.read_sequences(ref_file))
    names_file = open(names_file).readlines()
    config = open("primer_config.txt").readlines()
    with open("tmp/regions_" + output_file, 'w') as out_f:
        for name in names_file:
            sname = name.strip("\n")
            found = False
            for line in gff_file:
                gff_name = line.attr.get("Name", "No_name")
                peak = line.attr.get("id", "No_id")
                if sname in gff_name.split("/"):
                    out_f.write("SEQUENCE_ID=" + gff_name.replace("/", "_") + "_" + peak + "\n")
                    out_f.write("SEQUENCE_TEMPLATE=" + line.shifted(-100, 0).get_seq(seq_dict) + "\n")
                    found = True
                    for cline in config:
                        out_f.write(cline.strip("\n") + "\n")
                    out_f.write("=" + "\n")
            if not found:
                print "Could not find the gene " + sname + " in the gff file"

def run(self):
    extractions = []
    for item in self.genes.split(','):
        extraction = item.split('/')
        assert len(extraction) == 4
        extractions.append(extraction)
    rename = {}
    if self.rename:
        for item in self.rename.split(','):
            old, new = item.split('=')
            rename[old] = new
    work = self.get_workspace()
    with workspace.tempspace() as temp:
        items = list(annotation.read_annotations(self.annotation))
        for item in items:
            item.seqid = rename.get(item.seqid, item.seqid)
        annotation.write_gff3(temp/'temp.gff', get_genes(items, extractions, self.log))
        del items
        with open(temp/'temp.fa', 'wb') as f:
            for name, seq in io.read_sequences(self.genome):
                name = name.split()[0]
                name = rename.get(name, name)
                io.write_fasta(f, name, seq)
        reference_directory.Make_tt_reference(
            self.output_dir,
            filenames = [ temp/'temp.fa', temp/'temp.gff' ] + self.extra,
            index = self.index,
            shrimp = self.shrimp,
            bowtie = self.bowtie,
            star = self.star,
            ).run()

def run(self): only = set(item.lower() for item in self.only) exclude = set(item.lower() for item in self.exclude) if self.output is not None: out_file = open(self.output, 'wb') else: out_file = sys.stdout print >> out_file, '##gff-version 3' for filename in self.filenames: for item in annotation.read_annotations(filename): if only and item.type.lower() not in only: continue if item.type.lower() in exclude: continue if 'ID' not in item.attr and 'locus_tag' in item.attr: item.attr['ID'] = item.attr['locus_tag'] if 'color' not in item.attr: if item.type == 'CDS': item.attr['color'] = '#008800' if item.type == 'rRNA': item.attr['color'] = '#bb0000' if item.type == 'tRNA': item.attr['color'] = '#bb00bb' if item.type == 'misc_feature': item.attr['color'] = '#8888ff' print >> out_file, item.as_gff() if self.output is not None: out_file.close()
def run(self):
    features_parent = [
        _Related_feature(item, item.start, item.end, [])
        for item in annotation.read_annotations(self.parent)
        if selection.matches(self.select_parent, [item.type])
        ]
    features_child = [
        _Related_feature(item, item.start, item.end, [])
        for item in annotation.read_annotations(self.child)
        if selection.matches(self.select_child, [item.type])
        ]
    index = {}
    for item in features_child:
        if item.feature.seqid not in index:
            index[item.feature.seqid] = span_index.Span_index()
        index[item.feature.seqid].insert(item)
    for value in index.values():
        value.prepare()
    for item_1 in features_parent:
        if item_1.feature.strand == 1:
            start = item_1.start - self.upstrand
            end = item_1.end + self.downstrand
        elif item_1.feature.strand == -1:
            start = item_1.start - self.downstrand
            end = item_1.end + self.upstrand
        else:
            start = item_1.start - max(self.upstrand, self.downstrand)
            end = item_1.end + max(self.upstrand, self.downstrand)
        if item_1.feature.seqid in index:
            for item_2 in index[item_1.feature.seqid].get(start, end):
                item_1.relations.append(item_2)
                item_2.relations.append(item_1)
    for item in features_parent:
        item.modify_with_relations(self.use, self.to_child, self.to_parent)
    with open(self.prefix + '-parent.gff', 'wb') as f:
        annotation.write_gff3_header(f)
        for item in features_parent:
            print >> f, item.feature.as_gff()
    with open(self.prefix + '-child.gff', 'wb') as f:
        annotation.write_gff3_header(f)
        for item in features_child:
            print >> f, item.feature.as_gff()

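# Hedged sketch of querying the per-chromosome span index built in run()
# above; only calls already used in this file are assumed (Span_index(),
# insert(), prepare(), get(start, end)).
def _example_span_query(index, seqid, start, end):
    # Returns the child features overlapping [start, end) on one chromosome,
    # or an empty list when that chromosome has no indexed features.
    if seqid not in index:
        return []
    return list(index[seqid].get(start, end))
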
def index(filename, type=None, modify=lambda item: item, name=lambda item: item.get_id()):
    result = {}
    for item in annotation.read_annotations(filename):
        if type is not None and item.type != type:
            continue
        item = modify(item)
        assert name(item) not in result
        result[name(item)] = item
    return result

def index(filename, type=None, modify=lambda item: item, name=lambda item: item.get_id()):
    result = collections.OrderedDict()
    for item in annotation.read_annotations(filename):
        if type is not None and item.type != type:
            continue
        item = modify(item)
        assert name(item) not in result
        result[name(item)] = item
    return result

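# Usage sketch for index() above; "genes.gff" is a hypothetical path, and the
# name= override assumes features carry a locus_tag attribute.
def _example_index_usage():
    genes_by_id = index("genes.gff", type="gene")
    cds_by_locus = index("genes.gff", type="CDS",
        name=lambda item: item.attr.get("locus_tag", item.get_id()))
    return genes_by_id, cds_by_locus
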
def run(self):
    assert self.change_strand in STRAND_CHANGE, 'Unknown way to change strand.'
    strand_changer = STRAND_CHANGE[self.change_strand]
    shift_start_absolute, shift_start_proportion = decode_shift(self.shift_start)
    shift_end_absolute, shift_end_proportion = decode_shift(self.shift_end)
    renames = []
    if self.rename:
        for item in self.rename.split(','):
            new, old = item.split('=')
            if new != old:
                renames.append((new, old))
    out_file = open(self.prefix + '.gff', 'wb')
    annotation.write_gff3_header(out_file)
    for filename in self.filenames:
        for item in annotation.read_annotations(filename):
            if not selection.matches(self.select, [item.type]):
                continue
            if self.type:
                item.type = self.type
            length = item.end - item.start
            shift_start = int(math.floor(0.5 + shift_start_absolute + shift_start_proportion*length))
            shift_end = int(math.floor(0.5 + shift_end_absolute + shift_end_proportion*length))
            if item.strand == 1:
                item.start += shift_start
                item.end += shift_end
            elif item.strand == -1:
                item.end -= shift_start
                item.start -= shift_end
            item.start = max(0, item.start)  # IGV complains
            item.strand = strand_changer[item.strand]
            old_attr = item.attr.copy()
            for new, old in renames:
                if old in item.attr:
                    del item.attr[old]
            for new, old in renames:
                if old in old_attr:
                    item.attr[new] = old_attr[old]
            print >> out_file, item.as_gff()
    out_file.close()

def run(self):
    annotations = []
    for filename in self.filenames:
        for item in annotation.read_annotations(filename):
            if not selection.matches(self.select, [item.type]):
                continue
            if self.type:
                item.type = self.type
            annotations.append(item)
    annotations.sort(key=lambda item: (item.seqid, item.strand, item.start))
    group = []
    groups = []
    def emit():
        if not group:
            return
        groups.append(group[:])
        del group[:]
    seqid = None
    strand = None
    end = 0
    for item in annotations:
        if item.seqid != seqid or item.strand != strand or item.start >= end:
            emit()
            seqid = item.seqid
            strand = item.strand
            end = item.end - self.overlap
        group.append(item)
        end = max(item.end - self.overlap, end)
    emit()
    out_file = open(self.prefix + '.gff', 'wb')
    annotation.write_gff3_header(out_file)
    for group in groups:
        item = annotation.Annotation()
        item.source = group[0].source
        item.type = join_descriptions(item2.type for item2 in group)
        item.seqid = group[0].seqid
        item.strand = group[0].strand
        item.start = min(item2.start for item2 in group)
        item.end = max(item2.end for item2 in group)
        item.score = None
        item.phase = None
        item.attr = {}
        for item2 in group:
            for key in item2.attr:
                if key in item.attr:
                    continue
                item.attr[key] = join_descriptions(
                    item3.attr[key] for item3 in group if key in item3.attr)
        print >> out_file, item.as_gff()
    out_file.close()

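# Worked example of the grouping rule above (comments only): with
# self.overlap == 0, features [100,200) and [150,300) merge, because the
# running end is 200 when the second feature starts at 150 (150 < 200), giving
# one merged annotation spanning [100,300). A positive overlap requires
# features to overlap by more than that many bases before merging; a negative
# overlap also merges features separated by a gap smaller than its magnitude.
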
def run(self):
    assert self.release
    assert self.species
    assert self.assembly
    assert self.dna
    extractions = []
    for item in self.genes.split(','):
        extraction = item.split('/')
        assert len(extraction) == 4
        extractions.append(extraction)
    rename = {}
    if self.rename:
        for item in self.rename.split(','):
            old, new = item.split('=')
            rename[old] = new
    work = self.get_workspace()
    ensembl = workspace.Workspace(work/'ensembl')
    genome_filename = self.species + "." + self.assembly + "." + self.dna + ".fa.gz"
    genome_url = ("rsync://ftp.ensembl.org/ensembl/pub/release-" + self.release +
        "/fasta/" + self.species.lower() + "/dna/" + genome_filename)
    gff_filename = self.species + "." + self.assembly + "." + self.release + ".gff3.gz"
    gff_url = ("rsync://ftp.ensembl.org/ensembl/pub/release-" + self.release +
        "/gff3/" + self.species.lower() + "/" + gff_filename)
    if self.download:
        self.log.log("Fetching " + genome_url + "\n")
        io.execute(['rsync', '-aP', genome_url, ensembl/genome_filename])
        self.log.log("Fetching " + gff_url + "\n")
        io.execute(['rsync', '-aP', gff_url, ensembl/gff_filename])
    with workspace.tempspace() as temp:
        items = list(annotation.read_annotations(ensembl/gff_filename))
        for item in items:
            item.seqid = rename.get(item.seqid, item.seqid)
        annotation.write_gff3(temp/'temp.gff', get_genes(items, extractions, self.log))
        del items
        with open(temp/'temp.fa', 'wb') as f:
            for name, seq in io.read_sequences(ensembl/genome_filename):
                name = name.split()[0]
                name = rename.get(name, name)
                io.write_fasta(f, name, seq)
        reference_directory.Make_tt_reference(
            self.output_dir,
            filenames = [ temp/'temp.fa', temp/'temp.gff' ],
            index = self.index,
            ).run()

def run(self):
    f = self.begin_output()
    for filename in self.filenames:
        info = io.get_file_info(filename)
        any = False
        name = os.path.splitext(os.path.split(filename)[1])[0]
        if info.matches('sequences'):
            total = 0
            total_length = 0
            for seq in io.read_sequences(filename, qualities=True):
                total += 1
                total_length += len(seq[1])
            print >> f, grace.datum(name, 'sequences', total)
            print >> f, grace.datum(name, 'total bases', total_length)
            if total:
                print >> f, grace.datum(name, 'average length', float(total_length)/total)
            print >> f
            any = True
        if info.matches('annotations'):
            total = 0
            counts = {}
            for item in annotation.read_annotations(filename, "/"):
                total += 1
                counts[item.type] = counts.get(item.type, 0) + 1
            print >> f, grace.datum(name, 'features', total)
            for key in sorted(counts):
                print >> f, grace.datum(name, key + ' features', counts[key])
            print >> f
            any = True
        if info.matches('type-vcf'):
            reader_f = io.open_possibly_compressed_file(filename)
            reader = vcf.Reader(reader_f)
            n = 0
            for item in reader:
                n += 1
            print >> f, grace.datum(name, 'variants', n)
            any = True
        if not any:
            raise grace.Error('Don\'t know what to do with ' + filename)
    self.end_output(f)

def run(self):
    f = self.begin_output()
    for filename in self.filenames:
        any = False
        name = os.path.splitext(os.path.split(filename)[1])[0]
        try:
            iterator = io.read_sequences(filename, qualities=True)
        except grace.Error:
            iterator = None
        if iterator is not None:
            total = 0
            total_length = 0
            for seq in io.read_sequences(filename, qualities=True):
                total += 1
                total_length += len(seq[1])
            print >> f, grace.datum(name, 'sequences', total)
            print >> f, grace.datum(name, 'average length', float(total_length)/total)
            print >> f
            any = True
        try:
            iterator = annotation.read_annotations(filename)
        except grace.Error:
            iterator = None
        if iterator:
            total = 0
            counts = {}
            for item in iterator:
                total += 1
                counts[item.type] = counts.get(item.type, 0) + 1
            print >> f, grace.datum(name, 'features', total)
            for key in sorted(counts):
                print >> f, grace.datum(name, key + ' features', counts[key])
            print >> f
            any = True
        if not any:
            raise grace.Error(filename + ' is neither a sequence file nor an annotation file that nesoni can read.')
    self.end_output(f)

def run(self):
    f = self.begin_output()
    for filename in self.filenames:
        info = io.get_file_info(filename)
        any = False
        name = os.path.splitext(os.path.split(filename)[1])[0]
        if info.matches('sequences'):
            total = 0
            total_length = 0
            for seq in io.read_sequences(filename, qualities=True):
                total += 1
                total_length += len(seq[1])
            print >> f, grace.datum(name, 'sequences', total)
            print >> f, grace.datum(name, 'total bases', total_length)
            if total:
                print >> f, grace.datum(name, 'average length', float(total_length)/total)
            print >> f
            any = True
        if info.matches('annotations'):
            total = 0
            counts = {}
            for item in annotation.read_annotations(filename):
                total += 1
                counts[item.type] = counts.get(item.type, 0) + 1
            print >> f, grace.datum(name, 'features', total)
            for key in sorted(counts):
                print >> f, grace.datum(name, key + ' features', counts[key])
            print >> f
            any = True
        if info.matches('type-vcf'):
            reader_f = io.open_possibly_compressed_file(filename)
            reader = vcf.Reader(reader_f)
            n = 0
            for item in reader:
                n += 1
            print >> f, grace.datum(name, 'variants', n)
            any = True
        if not any:
            raise grace.Error('Don\'t know what to do with ' + filename)
    self.end_output(f)

def run(self):
    assert self.change_strand in STRAND_CHANGE, 'Unknown way to change strand.'
    strand_changer = STRAND_CHANGE[self.change_strand]
    out_file = open(self.prefix + '.gff', 'wb')
    annotation.write_gff3_header(out_file)
    for filename in self.filenames:
        for item in annotation.read_annotations(filename):
            if not selection.matches(self.select, [item.type]):
                continue
            if item.strand == 1:
                item.start += self.shift_start
                item.end += self.shift_end
            elif item.strand == -1:
                item.end -= self.shift_start
                item.start -= self.shift_end
            item.strand = strand_changer[item.strand]
            print >> out_file, item.as_gff()
    out_file.close()

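# Worked example of the strand-aware shift above (comments only, values
# hypothetical): with shift_start=10 and shift_end=5, a forward-strand feature
# spanning [100,200) becomes [110,205), while a reverse-strand feature
# spanning [100,200) becomes [95,190), since its 5' end is item.end.
# An unstranded feature is left unshifted.
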
def run(self):
    work = self.get_workspace()
    work.update_param(remove=['tail_tools_reference_version'])
    nesoni.Make_reference(
        self.output_dir,
        filenames = self.filenames,
        snpeff = False,
        cs = 'ifavailable' if self.index else False,
        ls = False,
        bowtie = 'ifavailable' if self.index else False,
        ).run()
    annotations = list(annotation.read_annotations(work/'reference.gff'))
    annotation.link_up_annotations(annotations)
    exon_index = span_index.index_annotations([
        item for item in annotations if item.type == "exon" ])
    mrna_end_index = span_index.index_annotations([
        item.three_prime() for item in annotations if item.type == "mRNA" ])
    mrna_utrs = []
    gene_utrs = []
    for gene in annotations:
        if gene.type != 'gene':
            continue
        mrnas = [ item for item in gene.children if item.type == 'mRNA' ]
        assert mrnas, "Gene without any mRNAs: " + gene.get_id()
        gene.attr['color'] = '#880088'
        gene.start = min(item.start for item in mrnas)
        gene.end = max(item.end for item in mrnas)
        gene.attr["max_extension"] = str(_max_extension(gene, exon_index, mrna_end_index))
        gene_utr_5primes = []
        for mrna in mrnas:
            assert mrna.strand == gene.strand, mrna
            assert mrna.seqid == gene.seqid, mrna
            mrna.attr["max_extension"] = str(_max_extension(mrna, exon_index, mrna_end_index))
            cdss = [ item for item in mrna.children if item.type == 'CDS' ]
            exons = [ item for item in mrna.children if item.type == 'exon' ]
            if not exons:
                continue
            # link_up_annotations sorts children, so final is really final
            for item in exons[:-1]:
                item.attr["max_extension"] = "0"
            exons[-1].attr["max_extension"] = mrna.attr["max_extension"]
            if not cdss:
                continue
            mrna_utr_5primes = []
            if gene.strand >= 0:
                cds_3prime = max(item.end for item in cdss)
                for item in exons:
                    if item.end >= cds_3prime:
                        mrna_utr_5primes.append(max(item.start, cds_3prime))
            else:
                cds_3prime = min(item.start for item in cdss)
                for item in exons:
                    if item.start <= cds_3prime:
                        mrna_utr_5primes.append(min(item.end, cds_3prime))
            if mrna.strand >= 0:
                utr_start = min(mrna_utr_5primes) if mrna_utr_5primes else mrna.end
                utr_end = max(utr_start+1, mrna.end)
                gene_utr_5primes.append(utr_start)
            else:
                utr_end = max(mrna_utr_5primes) if mrna_utr_5primes else mrna.start
                utr_start = min(mrna.start, utr_end-1)
                gene_utr_5primes.append(utr_end)
            attr = mrna.attr.copy()
            attr['Parent'] = attr['ID']
            attr['ID'] = attr['ID'] + '-3UTR'
            attr['color'] = '#008888'
            utr = annotation.Annotation(
                source = 'tt',
                type = 'three_prime_utr',
                seqid = mrna.seqid,
                strand = mrna.strand,
                start = utr_start,
                end = utr_end,
                attr = attr,
                )
            max_ext = _max_extension(utr, exon_index, mrna_end_index)
            utr.attr["max_extension"] = str(max_ext)
            # Only include if there is an annotated 3' UTR or the end is not
            # in the middle of some other isoform's exon
            if utr_end - utr_start + max_ext > 1:
                mrna_utrs.append(utr)
        if gene.strand >= 0:
            utr_start = max(gene_utr_5primes) if gene_utr_5primes else gene.end
            utr_end = max(utr_start+1, gene.end)
        else:
            utr_end = min(gene_utr_5primes) if gene_utr_5primes else gene.start
            utr_start = min(gene.start, utr_end-1)
        attr = gene.attr.copy()
        attr['Parent'] = attr['ID']
        attr['ID'] = attr['ID'] + '-3UTR'
        attr['color'] = '#008888'
        utr = annotation.Annotation(
            source = 'tt',
            type = 'three_prime_utr',
            seqid = gene.seqid,
            strand = gene.strand,
            start = utr_start,
            end = utr_end,
            attr = attr,
            )
        utr.attr["max_extension"] = str(_max_extension(utr, exon_index, mrna_end_index))
        gene_utrs.append(utr)
    annotation.write_gff3(work/'reference.gff', annotations + mrna_utrs)
    annotation.write_gff3(work/'utr.gff', gene_utrs)
    work.update_param(tail_tools_reference_version=work.VERSION)

def run(self):
    assert self.extension is not None, '--extension must be specified'
    #workspace = self.get_workspace()
    workspace = working_directory.Working(self.working_dir, must_exist=True)
    if self.annotations is None:
        reference = workspace.get_reference()
        annotations_filename = reference.annotations_filename()
    else:
        annotations_filename = self.annotations
    types = [ item.lower() for item in self.types.split(',') ]
    parts = self.parts or self.types
    parts = [ item.lower() for item in parts.split(',') ]
    all_annotations = list(annotation.read_annotations(annotations_filename))
    annotation.link_up_annotations(all_annotations)
    for item in all_annotations:
        item.primary = None
    annotations = [
        item for item in all_annotations
        if item.type.lower() in types ]
    part_annotations = []
    seen = set()
    queue = [ (item, item) for item in annotations ]
    while queue:
        primary, item = queue.pop()
        if item.type.lower() in parts:
            assert item.primary is None, "Feature with multiple parents"
            item.primary = primary
            key = (id(primary), item.start, item.end, item.seqid, item.strand)
            # Ignore duplicate exons (many isoforms will have the same exons)
            if key not in seen:
                seen.add(key)
                part_annotations.append(item)
        queue.extend((primary, item2) for item2 in item.children)
    del seen
    del all_annotations
    self.log.log('%d annotations\n' % len(annotations))
    self.log.log('%d part annotations\n' % len(part_annotations))
    #assert annotations, 'No annotations of specified types in file'
    for item in part_annotations:
        this_extension = self.extension
        if "max_extension" in item.attr:
            this_extension = min(this_extension, int(item.attr["max_extension"]))
        if item.strand >= 0:
            item.tail_pos = item.end
            item.end += this_extension
        else:
            item.tail_pos = item.start
            item.start -= this_extension
    for item in annotations:
        item.hits = []  # [ (tail_length, adaptor_bases) ]
    index = span_index.index_annotations(part_annotations)
    for alignment in sam.Bam_reader(workspace/'alignments_filtered_sorted.bam'):
        if alignment.is_unmapped or alignment.is_secondary or alignment.is_supplementary:
            continue
        start = alignment.reference_start
        end = alignment.reference_end
        alignment_length = end - start
        strand = -1 if alignment.flag & sam.FLAG_REVERSE else 1
        fragment_feature = annotation.Annotation(
            seqid=alignment.reference_name,
            start=start,
            end=end,
            strand=strand)
        if strand >= 0:
            tail_pos = end
        else:
            tail_pos = start
        tail_length = 0
        adaptor_bases = 0
        for item in alignment.extra:
            if item.startswith('AN:i:'):
                tail_length = int(item[5:])
            elif item.startswith('AD:i:'):
                adaptor_bases = int(item[5:])
        hits = index.get(fragment_feature, same_strand=True)
        if hits:
            # Nearest by tail_pos; failing that, by id to ensure a
            # deterministic choice
            gene = min(hits, key=lambda gene: (abs(tail_pos - gene.tail_pos), gene.primary.get_id()))
            gene.primary.hits.append((tail_length, adaptor_bases))
    for item in annotations:
        del item.parents
        del item.children
        del item.primary
    f = io.open_possibly_compressed_writer(self.prefix + '.pickle.gz')
    pickle.dump((workspace.name, workspace.get_tags(), annotations), f, pickle.HIGHEST_PROTOCOL)
    f.close()

def run(self):
    #assert not self.utr_only or self.utrs, '--utrs-only yes but no --utrs given'

    # Reference genome
    #chromosome_lengths = reference_directory.Reference(self.reference, must_exist=True).get_lengths()
    chromosomes = collections.OrderedDict(io.read_sequences(self.reference))

    def get_interpeak_seq(peaks):
        start = min(item.transcription_stop for item in peaks)
        end = max(item.transcription_stop for item in peaks)
        if end - start > self.max_seq:
            return ''
        if peaks[0].strand >= 0:
            return chromosomes[peaks[0].seqid][start:end]
        else:
            return bio.reverse_complement(chromosomes[peaks[0].seqid][start:end])

    def get_prepeak_seq(gene, peaks):
        if gene.strand >= 0:
            start = gene.utr_pos
            end = min(item.transcription_stop for item in peaks)
            if end - start > self.max_seq:
                return ''
            return chromosomes[gene.seqid][start:end]
        else:
            start = max(item.transcription_stop for item in peaks)
            end = gene.utr_pos
            if end - start > self.max_seq:
                return ''
            return bio.reverse_complement(chromosomes[gene.seqid][start:end])

    # Normalization files
    if self.norm_file:
        norm_file = self.norm_file
    else:
        nesoni.Norm_from_counts(self.prefix + '-norm', self.counts).run()
        norm_file = self.prefix + '-norm.csv'
    norms = io.read_grouped_table(norm_file, [('All', str)])['All']
    pair_norm_names = []
    pair_norms = []
    for i in xrange(len(norms)):
        pair_norm_names.append(norms.keys()[i] + '-peak1')
        pair_norms.append(norms.values()[i])
    for i in xrange(len(norms)):
        pair_norm_names.append(norms.keys()[i] + '-peak2')
        pair_norms.append(norms.values()[i])
    io.write_grouped_csv(
        self.prefix + '-pairs-norm.csv',
        [('All', io.named_list_type(pair_norm_names)(pair_norms))],
        comments=['#Normalization'],
        )

    # Read data
    annotations = list(annotation.read_annotations(self.parents))
    if self.utrs:
        utrs = list(annotation.read_annotations(self.utrs))
    else:
        utrs = []
    children = list(annotation.read_annotations(self.children))
    count_table = io.read_grouped_table(self.counts, [
        ('Count', int),
        ('Tail_count', int),
        ('Tail', _float_or_none),
        ('Proportion', _float_or_none),
        ('Annotation', str)])
    counts = count_table['Count']
    tail_counts = count_table['Tail_count']
    proportions = count_table['Proportion']
    tails = count_table['Tail']

    samples = counts.value_type().keys()
    sample_tags = {}
    for line in count_table.comments:
        if line.startswith('#sampleTags='):
            parts = line[len('#sampleTags='):].split(',')
            assert parts[0] not in sample_tags
            sample_tags[parts[0]] = parts

    for item in children:
        item.weight = sum(
            counts[item.get_id()][name] * float(norms[name]['Normalizing.multiplier'])
            for name in samples)

    parents = []
    id_to_parent = {}
    for item in annotations:
        if item.type != self.parent_type:
            continue
        assert item.get_id() not in id_to_parent, 'Duplicate id in parent file: ' + item.get_id()
        parents.append(item)
        id_to_parent[item.get_id()] = item
        item.children = []
        #item.cds = [ ]

        # Default utr
        if item.strand >= 0:
            item.utr_pos = item.end
        else:
            item.utr_pos = item.start

        if 'three_prime_UTR_start' in item.attr:
            if item.strand >= 0:
                item.utr_pos = int(item.attr['three_prime_UTR_start']) - 1
            else:
                item.utr_pos = int(item.attr['three_prime_UTR_start'])

    for item in utrs:
        assert item.attr['Parent'] in id_to_parent, 'Unknown gene ' + item.attr['Parent']
        id_to_parent[item.attr['Parent']].utr_pos = (item.start if item.strand >= 0 else item.end)

    for item in children:
        # End of transcription, 0-based, ie between-positions based
        item.transcription_stop = item.end if item.strand >= 0 else item.start
        if 'Parent' in item.attr and item.attr.get("Relation") != "Antisense":
            for item_parent in item.attr['Parent'].split(','):
                parent = id_to_parent[item_parent]
                parent.children.append(item)

    for item in parents:
        item.children.sort(key=_annotation_sorter)
        relevant = list(item.children)
        if self.utr_only:
            #if item.strand <= 0:
            #    relative_utr_start = item.end - int(item.attr['three_prime_UTR_start'])
            #else:
            #    relative_utr_start = int(item.attr['three_prime_UTR_start'])-1 - item.start
            #
            #def relative_start(peak):
            #    return item.end-peak.end if item.strand < 0 else peak.start-item.start
            #relevant = [ peak for peak in relevant if relative_start(peak) >= relative_utr_start ]
            #relevant = [
            #    peak for peak in relevant
            #    if (peak.end >= item.utr_pos if item.strand >= 0 else peak.start <= item.utr_pos)
            #    ]
            relevant = [ peak for peak in relevant if peak.attr.get("Relation") == "3'UTR" ]
        if self.top:
            relevant.sort(key=lambda peak: peak.weight, reverse=True)
            relevant = relevant[:self.top]
        relevant.sort(key=_annotation_sorter)
        item.relevant_children = relevant

    # JSON output
    j_data = {}
    j_genes = j_data['genes'] = {}
    j_genes['__comment__'] = 'start is 0-based'
    j_genes['name'] = []
    j_genes['chromosome'] = []
    j_genes['strand'] = []
    j_genes['start'] = []
    j_genes['utr'] = []
    j_genes['end'] = []
    j_genes['gene'] = []
    j_genes['product'] = []
    j_genes['peaks'] = []
    j_genes['relevant_peaks'] = []
    #j_genes['cds'] = [ ]
    #j_genes['cds_start'] = [ ]
    #j_genes['cds_end'] = [ ]
    for item in parents:
        j_genes['name'].append(item.get_id())
        j_genes['chromosome'].append(item.seqid)
        j_genes['strand'].append(item.strand)
        j_genes['start'].append(item.start)
        j_genes['utr'].append(item.utr_pos)
        j_genes['end'].append(item.end)
        j_genes['gene'].append(item.attr.get('Name', item.attr.get('gene', '')))
        j_genes['product'].append(item.attr.get('Product', item.attr.get('product', '')))
        j_genes['peaks'].append([ item2.get_id() for item2 in item.children ])
        j_genes['relevant_peaks'].append([ item2.get_id() for item2 in item.relevant_children ])
        #j_genes['cds'].append( item.cds )
        #j_genes['cds_start'].append( item.cds_start )
        #j_genes['cds_end'].append( item.cds_end )

    j_peaks = j_data['peaks'] = {}
    j_peaks['__comment__'] = 'start is 0-based'
    j_peaks['name'] = []
    j_peaks['chromosome'] = []
    j_peaks['strand'] = []
    j_peaks['start'] = []
    j_peaks['end'] = []
    j_peaks['parents'] = []
    j_peaks['counts'] = []
    j_peaks['tail_lengths'] = []
    j_peaks['proportion_tailed'] = []
    for item in children:
        j_peaks['name'].append(item.get_id())
        j_peaks['chromosome'].append(item.seqid)
        j_peaks['strand'].append(item.strand)
        j_peaks['start'].append(item.start)
        j_peaks['end'].append(item.end)
        j_peaks['parents'].append(item.attr['Parent'].split(',') if 'Parent' in item.attr else [])
        j_peaks['counts'].append(counts[item.get_id()].values())
        j_peaks['tail_lengths'].append(count_table['Tail'][item.get_id()].values())
        j_peaks['proportion_tailed'].append(count_table['Proportion'][item.get_id()].values())

    j_samples = j_data['samples'] = {}
    j_samples['name'] = []
    j_samples['tags'] = []
    j_samples['normalizing_multiplier'] = []
    for name in samples:
        j_samples['name'].append(name)
        j_samples['tags'].append(sample_tags[name])
        j_samples['normalizing_multiplier'].append(float(norms[name]['Normalizing.multiplier']))

    j_chromosomes = j_data['chromosomes'] = {}
    j_chromosomes['name'] = []
    j_chromosomes['length'] = []
    for name, seq in chromosomes.iteritems():
        j_chromosomes['name'].append(name)
        j_chromosomes['length'].append(len(seq))

    with open(self.prefix + '.json', 'wb') as f:
        json.dump(j_data, f)

    # Output paired peak file
    output_comments = ['#Counts']
    output_samples = []
    for item in samples:
        output_samples.append(item + '-peak1')
        output_comments.append('#sampleTags=' + ','.join(
            [item + '-peak1', 'peak1'] + sample_tags.get(item, [])))
    for item in samples:
        output_samples.append(item + '-peak2')
        output_comments.append('#sampleTags=' + ','.join(
            [item + '-peak2', 'peak2'] + sample_tags.get(item, [])))

    output_names = []
    output_counts = []
    output_tail_counts = []
    output_proportions = []
    output_tails = []
    output_annotation_fields = [
        'gene', 'product', 'biotype', 'mean_tail_1', 'mean_tail_2',
        'chromosome', 'strand', 'transcription_stops' ]  #, 'interpeak_seq', ]
    output_annotations = []

    for item in parents:
        peaks = item.relevant_children
        for i in xrange(len(peaks) - 1):
            for j in xrange(i + 1, len(peaks)):
                id_i = peaks[i].get_id()
                id_j = peaks[j].get_id()
                id_pair = item.get_id() + '-' + id_i + '-' + id_j
                output_names.append(id_pair)

                row = []
                row.extend(counts[id_i].values())
                row.extend(counts[id_j].values())
                output_counts.append(filter(_text, row))

                row = []
                row.extend(tail_counts[id_i].values())
                row.extend(tail_counts[id_j].values())
                output_tail_counts.append(filter(_text, row))

                row = []
                row.extend(proportions[id_i].values())
                row.extend(proportions[id_j].values())
                output_proportions.append(filter(_text, row))

                row = []
                row.extend(tails[id_i].values())
                row.extend(tails[id_j].values())
                output_tails.append(filter(_text, row))

                output_annotations.append([
                    item.attr.get('Name', item.attr.get('gene', '')),
                    item.attr.get('Product', item.attr.get('product', '')),
                    item.attr.get('Biotype', ''),
                    count_table['Annotation'][id_i]['mean-tail'],
                    count_table['Annotation'][id_j]['mean-tail'],
                    item.seqid,
                    str(item.strand),
                    '%d, %d' % (peaks[i].transcription_stop, peaks[j].transcription_stop),
                    #get_interpeak_seq([peaks[i],peaks[j]]),
                    ])

    #output_count_table = io.named_matrix_type(output_names,output_samples)(output_counts)
    io.write_grouped_csv(
        self.prefix + '-pairs.csv',
        [
            ('Count', io.named_matrix_type(output_names, output_samples)(output_counts)),
            ('Tail_count', io.named_matrix_type(output_names, output_samples)(output_tail_counts)),
            ('Proportion', io.named_matrix_type(output_names, output_samples)(output_proportions)),
            ('Tail', io.named_matrix_type(output_names, output_samples)(output_tails)),
            ('Annotation', io.named_matrix_type(output_names, output_annotation_fields)(output_annotations)),
        ],
        comments=output_comments,
        )

def run(self):
    annotations = []
    for filename in self.filenames:
        for item in annotation.read_annotations(filename):
            if not selection.matches(self.select, [item.type]):
                continue
            if self.type:
                item.type = self.type
            annotations.append(item)
    annotations.sort(key=lambda item: (item.type, item.seqid, item.strand, item.start))
    group = []
    groups = []
    def emit():
        if not group:
            return
        groups.append(group[:])
        del group[:]
    type = None
    seqid = None
    strand = None
    end = 0
    for item in annotations:
        if item.type != type or item.seqid != seqid or item.strand != strand or item.start >= end:
            emit()
            type = item.type
            seqid = item.seqid
            strand = item.strand
            end = item.end - self.overlap
        group.append(item)
        end = max(item.end - self.overlap, end)
    emit()
    items = []
    id_map = {}
    for group in groups:
        item = annotation.Annotation()
        item.source = group[0].source
        item.type = group[0].type
        item.seqid = group[0].seqid
        item.strand = group[0].strand
        item.start = min(item2.start for item2 in group)
        item.end = max(item2.end for item2 in group)
        item.score = None
        item.phase = None
        item.attr = {}
        for item2 in group:
            for key in item2.attr:
                if key in item.attr:
                    continue
                item.attr[key] = join_descriptions(
                    [ item3.attr[key] for item3 in group if key in item3.attr ],
                    self.joiner)
        item.parents = []
        for item2 in group:
            if 'ID' in item2.attr:
                assert item2.attr['ID'] not in id_map, 'Duplicate ID: ' + item2.attr['ID']
                id_map[item2.attr['ID']] = item.attr['ID']
            if 'Parent' in item2.attr:
                item.parents.append(item2.attr['Parent'])
        items.append(item)
    for item in items:
        if item.parents:
            item.attr['Parent'] = join_descriptions(
                [ id_map.get(parent, parent) for parent in item.parents ], ',')
    with open(self.prefix + '.gff', 'wb') as out_file:
        annotation.write_gff3_header(out_file)
        for item in items:
            print >> out_file, item.as_gff()

def count_run(
        min_score, min_size, max_size, filter_mode, equalize, types, locii,
        qualifiers, use_strand, merge_filename, limit, output_prefix, filenames, log):
    if filter_mode == 'poly':
        use_bam_filename = 'alignments.bam'
        use_only_top = True
        use_only_monogamous = False
        expect_multiple_alignments = True
    elif filter_mode == 'mono':
        use_bam_filename = 'alignments.bam'
        use_only_top = True
        use_only_monogamous = True
        expect_multiple_alignments = True
    else:
        assert filter_mode == 'existing', 'Unrecognized filtering mode'
        use_bam_filename = 'alignments_filtered.bam'
        use_only_top = False
        use_only_monogamous = False
        expect_multiple_alignments = False

    types = types.lower().split(',')
    qualifiers = qualifiers.split(',')

    if locii:
        locii = locii.lower().split(',')
    else:
        locii = None

    assert use_strand is not None, 'You must now explicitly specify --strand'
    assert use_strand in ('pool', 'forward', 'reverse', 'both'), "Can't understand --strand specification."

    from Bio import Seq, SeqIO

    annotation_filenames = []
    bam_filenames = []
    for arg in filenames:
        if annotation.is_annotation_file(arg):
            annotation_filenames.append(arg)
        else:
            bam_filenames.append(arg)

    n_samples = len(bam_filenames)
    titles = bam_filenames[:]
    for i in xrange(len(bam_filenames)):
        if os.path.isdir(bam_filenames[i]):
            titles[i] = os.path.basename(bam_filenames[i])
            if not annotation_filenames:
                working = working_directory.Working(bam_filenames[i])
                reference_filename = working.get_reference().annotations_filename()
                if reference_filename is not None:
                    annotation_filenames.append(reference_filename)
            bam_filenames[i] = os.path.join(bam_filenames[i], use_bam_filename)

    assert bam_filenames, 'No reference alignments given'

    merge = {}
    merge_qualifiers = {}
    if merge_filename is not None:
        # First line gives qualifiers,
        # remaining lines give <qualifier> <qualifier...> <gene> <transcript> <transcript...>
        f = open(merge_filename, 'rU')
        qualifiers = f.readline().rstrip('\n').split('\t')
        for line in f:
            parts = line.rstrip('\n').split('\t')
            if not parts:
                continue
            for name in parts[len(qualifiers)+1:]:
                assert name not in merge, 'Duplicate feature name in merge file'
                merge[name] = parts[len(qualifiers)]
                merge_qualifiers[name] = parts[:len(qualifiers)]
        f.close()

    genes = {}          # reference name -> gene index
    feature_names = {}  # feature_name -> number of occurrences
    features = []
    n_features = 0

    chromosome_length = {}
    for filename in bam_filenames:
        headers = sam.bam_headers(filename)
        for line in headers.split('\n'):
            if not line:
                continue
            parts = line.split('\t')
            if parts[0] != '@SQ':
                continue
            name = None
            length = None
            for part in parts[1:]:
                if part.startswith('SN:'):
                    name = part[3:]
                if part.startswith('LN:'):
                    length = int(part[3:])
            assert name is not None and length is not None
            if name in chromosome_length:
                assert chromosome_length[name] == length
            else:
                chromosome_length[name] = length

    for name in chromosome_length:
        genes[name] = span_index.Span_index()

    if annotation_filenames:
        assert not merge, 'Merging not supported with annotation files'
        for filename in annotation_filenames:
            for feature in annotation.read_annotations(filename):
                if feature.type.lower() not in types:
                    continue
                if (locii is not None and
                        ('locus_tag' not in feature.attr or
                         feature.attr['locus_tag'].lower() not in locii)):
                    continue
                f = Feature(n_samples)
                f.name = feature.get_id()
                if feature.type.lower() != 'cds' and len(types) > 1:
                    f.name = feature.type + ':' + f.name
                feature_names[f.name] = feature_names.get(f.name, 0) + 1
                if feature_names[f.name] > 1:
                    f.name += '/%d' % feature_names[f.name]
                f.qualifiers = [ feature.attr.get(item, '') for item in qualifiers ]
                f.length = feature.end - feature.start
                assert feature.seqid in genes, 'Annotation for sequence that is not in BAM files'
                genes[feature.seqid].insert(Span_entry(feature.start, feature.end, feature.strand or 1, f))
                features.append(f)
    else:
        # Sequences as features
        log.log('No annotation files given or found, using sequences as features\n')
        name_feature = {}  # (merged)name -> feature
        for name in chromosome_length:
            merged_name = merge.get(name, name)
            if merged_name not in name_feature:
                f = Feature(n_samples)
                f.name = merged_name
                f.length = chromosome_length[name]
                f.qualifiers = merge_qualifiers.get(name, ('',)*len(qualifiers))
                n_features += 1
                name_feature[merged_name] = f
                features.append(f)
            else:
                f = name_feature[merged_name]
                f.length = max(f.length, chromosome_length[name])  #...
            genes[name].insert(Span_entry(0, chromosome_length[name], 1, f))

    log.log('%d features\n\n' % len(features))

    for name in genes:
        genes[name].prepare()

    n_fragments = [0] * n_samples
    n_fragments_aligned = [0] * n_samples
    n_low_score = [0] * n_samples
    n_something = [0] * n_samples
    n_multiple = [0] * n_samples
    n_span = [0] * n_samples

    for i in xrange(n_samples):
        for read_name, fragment_alignments, unmapped in sam.bam_iter_fragments(
                bam_filenames[i], 'Counting sample %d of %d' % (i+1, n_samples)):
            n_fragments[i] += 1
            if not fragment_alignments:
                continue
            n_fragments_aligned[i] += 1

            feature_hits = []  # [ [ (feature, strand) ] ]

            # Use only top scoring alignments
            fragment_scores = [ sum(al.get_AS() for al in item) for item in fragment_alignments ]
            best_score = max(fragment_scores)
            if min_score is not None and best_score < min_score:
                n_low_score[i] += 1
                continue
            if use_only_top:
                cutoff = max(best_score, min_score)
            else:
                cutoff = min_score
            fragment_alignments = [
                item
                for item, score in zip(fragment_alignments, fragment_scores)
                if score >= cutoff ]

            for alignments in fragment_alignments:
                strand = -1 if alignments[0].flag & sam.FLAG_REVERSE else 1
                start = min(item.pos-1 for item in alignments)
                end = max(item.pos+item.length-1 for item in alignments)
                length = end - start
                if min_size is not None and length < min_size:
                    continue
                if max_size is not None and length > max_size:
                    continue
                rname = alignments[0].rname
                strand = -1 if alignments[0].flag & sam.FLAG_REVERSE else 1
                assert alignments[0].rname in genes, 'Alignment refers to sequence not present in GENBANK file'
                this_feature_hits = []
                for item in genes[rname].get(start, end):
                    rel_strand = strand * item.strand
                    key = (item.feature, rel_strand)
                    if key in this_feature_hits:
                        continue
                    this_feature_hits.append(key)
                    if not use_only_monogamous or len(fragment_alignments) == 1:
                        item.feature.count[rel_strand][i] += 1
                if this_feature_hits:
                    feature_hits.append(this_feature_hits)
                if len(this_feature_hits) > 1:
                    for a in this_feature_hits:
                        for b in this_feature_hits:
                            if a[0] is b[0]:
                                continue
                            a[0].common[(a[1], b[1])][b[0]] += 1

            if len(feature_hits) > 0:
                n_something[i] += 1
            #else:
            #    print fragment_alignments
            #    print genes[fragment_alignments[0][0].rname].indexes
            #    print

            if len(feature_hits) > 1:
                n_multiple[i] += 1
                for j in xrange(len(feature_hits)):
                    for k in xrange(len(feature_hits)):
                        if j == k:
                            continue
                        for a in feature_hits[j]:
                            for b in feature_hits[k]:
                                if a[0] is b[0]:
                                    continue
                                a[0].ambiguous[(a[1], b[1])][b[0]] += 1

            if any(len(item) > 1 for item in feature_hits):
                n_span[i] += 1

            if limit is not None and n_fragments[i] >= limit:
                break
        grace.status('')

        #log.log('%s\n' % titles[i])
        #log.log('%20s fragments\n' % grace.pretty_number(n_fragments[i]))
        #log.log('%20s fragments aligned to the reference\n' % grace.pretty_number(n_fragments_aligned[i]))
        #if n_low_score[i]:
        #    log.log('%20s had too low an alignment score, discarded\n' % grace.pretty_number(n_low_score[i]))
        #log.log('%20s aligned to an annotated gene\n' % grace.pretty_number(n_something[i]))
        #if expect_multiple_alignments or n_multiple[i]:
        #    log.log('%20s aligned to multiple genes\n' % grace.pretty_number(n_multiple[i]))
        #log.log('%20s had an alignment that spanned multiple genes\n' % grace.pretty_number(n_span[i]))
        #log.log('\n')

        log.datum(titles[i], 'fragments', n_fragments[i])
        log.datum(titles[i], 'fragments aligned to the reference', n_fragments_aligned[i])
        if n_low_score[i]:
            log.datum(titles[i], 'had too low an alignment score, discarded', n_low_score[i])
        log.datum(titles[i], 'aligned to an annotated gene', n_something[i])
        if expect_multiple_alignments or n_multiple[i]:
            log.datum(titles[i], 'aligned to multiple genes', n_multiple[i])
        log.datum(titles[i], 'had an alignment that spanned multiple genes', n_span[i])
        log.log('\n')

    strandedness = []
    for feature in features:
        n_forward = sum(feature.count[1])
        n_reverse = sum(feature.count[-1])
        if n_forward + n_reverse < 5:
            continue
        strandedness.append((n_forward - n_reverse) * 100.0 / (n_forward + n_reverse))
    strandedness = sum(strandedness) / len(strandedness)
    log.log('Strand specificity: %.0f%%\n'
            '  (~ -100%% reverse strand, ~ 0%% non-specific, ~ 100%% forward strand\n'
            '  Average over all features with at least 5 hits.)\n' % strandedness)

    # Note: these lambdas close over the loop variable "feature" below;
    # the f argument is unused.
    if use_strand == 'pool':
        getters = [
            lambda f: (feature.name,
                       add_lists(feature.count[1], feature.count[-1]),
                       add_defdicts(feature.common[(1,1)], feature.common[(1,-1)],
                                    feature.common[(-1,1)], feature.common[(-1,-1)]),
                       add_defdicts(feature.ambiguous[(1,1)], feature.ambiguous[(1,-1)],
                                    feature.ambiguous[(-1,1)], feature.ambiguous[(-1,-1)]))
            ]
    elif use_strand == 'forward':
        getters = [ lambda f: (feature.name, feature.count[1], feature.common[(1,1)], feature.ambiguous[(1,1)]) ]
    elif use_strand == 'reverse':
        getters = [ lambda f: (feature.name, feature.count[-1], feature.common[(-1,-1)], feature.ambiguous[(-1,-1)]) ]
    elif use_strand == 'both':
        getters = [
            lambda f: (feature.name, feature.count[1], feature.common[(1,1)], feature.ambiguous[(1,1)]),
            lambda f: (feature.name + 'r', feature.count[-1], feature.common[(-1,-1)], feature.ambiguous[(-1,-1)])
            ]

    total_hits = [0] * n_samples
    for feature in features:
        for getter in getters:
            total_hits = add_lists(total_hits, getter(feature)[1])

    if equalize:
        min_hits = min(total_hits)
        p = [ float(min_hits)/item for item in total_hits ]
        total_hits = [ min_hits ] * n_samples

    f = open(output_prefix + '.txt', 'wb')
    #log.attach(open(output_prefix + '_log.txt', 'wb'))

    print >> f, tab_encode(
        [ 'Feature' ] +
        titles +
        [ 'RPKM ' + item for item in titles ] +
        [ 'Length' ] +
        qualifiers +
        [ 'On same fragment' ] +
        ([ 'Ambiguous alignment' ] if expect_multiple_alignments else [ ])
        )

    for feature in features:
        for getter in getters:
            feature_name, count, common, ambiguous = getter(feature)
            if equalize:
                count = [ subsample(count[i], p[i]) for i in xrange(n_samples) ]
            rpkm = [ count[i] * 1.0e9 / feature.length / total_hits[i] for i in xrange(n_samples) ]
            common_str = ' '.join(
                '%dx%s' % (item[1], item[0])
                for item in sorted(common.items(), key=lambda item: item[1], reverse=True))
            ambiguous_str = ' '.join(
                '%dx%s' % (item[1], item[0])
                for item in sorted(ambiguous.items(), key=lambda item: item[1], reverse=True))
            print >> f, tab_encode(
                [ feature_name ] +
                [ str(item) for item in count ] +
                [ '%.2f' % item for item in rpkm ] +
                [ str(feature.length) ] +
                list(feature.qualifiers) +
                [ common_str ] +
                ([ ambiguous_str ] if expect_multiple_alignments else [ ])
                )
    f.close()

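# Sanity check of the RPKM formula used above, with hypothetical numbers:
# a feature of length 2000 bases with count 50 in a sample whose total_hits
# is 1e7 gives 50 * 1.0e9 / 2000 / 1e7 = 2.5 reads per kilobase per million.
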
def run(self):
    work = self.get_workspace()
    work.update_param(remove=['tail_tools_reference_version'])
    nesoni.Make_reference(
        self.output_dir,
        filenames = self.filenames,
        snpeff = False,
        cs = 'ifavailable' if self.index else False,
        ls = False,
        bowtie = 'ifavailable' if self.index else False,
        ).run()
    annotations = list(annotation.read_annotations(work/'reference.gff'))
    annotation.link_up_annotations(annotations)
    with open(work/'utr.gff', 'wb') as f:
        annotation.write_gff3_header(f)
        for gene in annotations:
            if gene.type != 'gene':
                continue
            mrnas = [ item for item in gene.children if item.type == 'mRNA' ]
            utr_5primes = []
            for mrna in mrnas:
                cdss = [ item for item in mrna.children if item.type == 'CDS' ]
                exons = [ item for item in mrna.children if item.type == 'exon' ]
                if not cdss or not exons:
                    continue
                if gene.strand >= 0:
                    cds_3prime = max(item.end for item in cdss)
                    for item in exons:
                        if item.end > cds_3prime:
                            utr_5primes.append(max(item.start, cds_3prime))
                else:
                    cds_3prime = min(item.start for item in cdss)
                    for item in exons:
                        if item.start < cds_3prime:
                            utr_5primes.append(min(item.end, cds_3prime))
            if gene.strand >= 0:
                utr_start = max(utr_5primes) if utr_5primes else gene.end
                utr_end = max(utr_start+1, gene.end)
            else:
                utr_end = min(utr_5primes) if utr_5primes else gene.start
                utr_start = min(gene.start, utr_end-1)
            attr = gene.attr.copy()
            attr['Parent'] = attr['ID']
            attr['ID'] = attr['ID'] + '-3UTR'
            thing = annotation.Annotation(
                source = 'tt',
                type = 'three_prime_utr',
                seqid = gene.seqid,
                strand = gene.strand,
                start = utr_start,
                end = utr_end,
                attr = attr,
                )
            print >> f, thing.as_gff()
    work.update_param(tail_tools_reference_version=work.VERSION)

def run(self):
    items = list(annotation.read_annotations(self.parent))
    annotation.link_up_annotations(items)
    for item in items:
        assert len(item.parents) <= 1

    genes = [ item for item in items if item.type == "gene" ]
    downstrand_genes = [ _extend(_three_prime(item), self.extension) for item in genes ]
    exons = [ item for item in items if item.type == "exon" ]
    utrs = [ _extend(item, self.extension) for item in items if item.type == "three_prime_utr" ]

    gene_index = span_index.index_annotations(genes)
    downstrand_gene_index = span_index.index_annotations(downstrand_genes)
    exon_index = span_index.index_annotations(exons)
    utr_index = span_index.index_annotations(utrs)

    peaks = [ ]
    for peak in annotation.read_annotations(self.child):
        if float(peak.attr.get("mean_tail","0.0")) < self.min_tail: continue
        peaks.append(peak)

    for peak in peaks:
        # Query is final base in genome before poly(A) starts
        query = peak.three_prime().shifted(-1,0)

        hit_to = "3'UTR"
        hits = [ item.parents[0].parents[0] for item in utr_index.get(query, True) ]
        if not hits:
            hit_to = "Exon"
            hits = [ item.parents[0].parents[0] for item in exon_index.get(query, True) ]
        # For non-coding RNAs, which don't have a 3' UTR
        if not hits:
            hit_to = "Downstrand"
            hits = downstrand_gene_index.get(query, True)
        if not hits:
            hit_to = "Intron"
            hits = gene_index.get(query, True)
        antisense_hits = gene_index.get(query.reversed(), True)
        if not hits:
            hit_to = "Antisense"
            hits = antisense_hits

        if hits:
            peak.attr["Parent"] = join_descriptions([ item.get_id() for item in hits ], ",")
            peak.attr["Relation"] = hit_to
            peak.attr["Name"] = join_descriptions([ item.attr.get("Name","") for item in hits ])
            peak.attr["Product"] = hit_to + " " + join_descriptions([ item.attr.get("Product","") for item in hits ])
            peak.attr["Biotype"] = join_descriptions([ item.attr.get("Biotype","") for item in hits ])

        if antisense_hits:
            peak.attr["Antisense_parent"] = join_descriptions([ item.get_id() for item in antisense_hits ], ",")
            peak.attr["Antisense_name"] = join_descriptions([ item.attr.get("Name","") for item in antisense_hits ])
            peak.attr["Antisense_product"] = "Antisense " + join_descriptions([ item.attr.get("Product","") for item in antisense_hits ])
            peak.attr["Antisense_biotype"] = join_descriptions([ item.attr.get("Biotype","") for item in antisense_hits ])

    annotation.write_gff3(self.prefix+"-parent.gff", genes) #Hmm
    annotation.write_gff3(self.prefix+"-child.gff", peaks)
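# Aside: the classification above is a priority cascade. A minimal sketch of
# the same control flow in isolation (assumes indexes with the span_index
# interface used above; the real code also maps UTR/exon hits up to their
# gene via .parents, omitted here):
def _classify(query, utr_index, exon_index, downstrand_gene_index, gene_index):
    for hit_to, index in [
            ("3'UTR", utr_index),
            ("Exon", exon_index),                    # non-coding RNAs lack a 3' UTR
            ("Downstrand", downstrand_gene_index),   # within `extension` past a gene's 3' end
            ("Intron", gene_index),                  # inside the gene, no exon hit
            ]:
        hits = index.get(query, True)
        if hits:
            return hit_to, hits
    # Last resort: opposite strand only
    return "Antisense", gene_index.get(query.reversed(), True)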
def run(self):
    assert self.extension is not None, '--extension must be specified'

    #workspace = self.get_workspace()
    workspace = working_directory.Working(self.working_dir, must_exist=True)
    if self.annotations is None:
        reference = workspace.get_reference()
        annotations_filename = reference.annotations_filename()
    else:
        annotations_filename = self.annotations

    types = [ item.lower() for item in self.types.split(',') ]

    parts = self.parts or self.types
    parts = [ item.lower() for item in parts.split(',') ]

    all_annotations = list(annotation.read_annotations(annotations_filename))
    annotation.link_up_annotations(all_annotations)
    for item in all_annotations:
        item.primary = None

    annotations = [ item for item in all_annotations if item.type.lower() in types ]

    # Walk down from each selected annotation, collecting its "part" features.
    part_annotations = [ ]
    seen = set()
    queue = [ (item,item) for item in annotations ]
    while queue:
        primary, item = queue.pop()
        if item.type.lower() in parts:
            assert item.primary is None, "Feature with multiple parents"
            item.primary = primary
            key = (id(primary),item.start,item.end,item.seqid,item.strand)
            # Ignore duplicate exons (many isoforms will have the same exons)
            if key not in seen:
                seen.add(key)
                part_annotations.append(item)
        queue.extend( (primary, item2) for item2 in item.children )

    del seen
    del all_annotations

    self.log.log('%d annotations\n' % len(annotations))
    self.log.log('%d part annotations\n' % len(part_annotations))
    #assert annotations, 'No annotations of specified types in file'

    for item in part_annotations:
        this_extension = self.extension
        if "max_extension" in item.attr:
            this_extension = min(this_extension, int(item.attr["max_extension"]))
        if item.strand >= 0:
            item.tail_pos = item.end
            item.end += this_extension
        else:
            item.tail_pos = item.start
            item.start -= this_extension

    for item in annotations:
        item.hits = [] # [ (tail_length, adaptor_bases) ]

    index = span_index.index_annotations(part_annotations)

    for alignment in sam.Bam_reader(workspace/'alignments_filtered_sorted.bam'):
        if alignment.is_unmapped or alignment.is_secondary or alignment.is_supplementary:
            continue

        start = alignment.reference_start
        end = alignment.reference_end
        alignment_length = end-start
        strand = -1 if alignment.flag&sam.FLAG_REVERSE else 1

        fragment_feature = annotation.Annotation(
            seqid = alignment.reference_name,
            start = start,
            end = end,
            strand = strand,
            )

        if strand >= 0:
            tail_pos = end
        else:
            tail_pos = start

        tail_length = 0
        adaptor_bases = 0
        for item in alignment.extra:
            if item.startswith('AN:i:'):
                tail_length = int(item[5:])
            elif item.startswith('AD:i:'):
                adaptor_bases = int(item[5:])

        hits = index.get(fragment_feature, same_strand=True)
        if hits:
            gene = min(hits, key=lambda gene: (abs(tail_pos - gene.tail_pos), gene.primary.get_id()))
            # Nearest by tail_pos
            # failing that, by id to ensure a deterministic choice
            gene.primary.hits.append( (tail_length,adaptor_bases) )

    for item in annotations:
        del item.parents
        del item.children
        del item.primary

    f = io.open_possibly_compressed_writer(self.prefix + '.pickle.gz')
    pickle.dump((workspace.name, workspace.get_tags(), annotations), f, pickle.HIGHEST_PROTOCOL)
    f.close()
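# Aside: a worked instance of the assignment rule above (values made up).
# Among overlapping candidates, take the feature whose annotated 3' end
# (tail_pos) is nearest the fragment's tail, breaking ties by ID so the
# choice is deterministic across runs.
def _example_nearest():
    candidates = [ ('geneA', 1200), ('geneB', 980) ]   # (id, tail_pos)
    tail_pos = 1000
    # distances are 200 and 20, so geneB wins
    return min(candidates, key=lambda c: (abs(tail_pos - c[1]), c[0]))   # -> ('geneB', 980)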
def run(self):
    items = list(annotation.read_annotations(self.parent))
    annotation.link_up_annotations(items)
    for item in items:
        assert len(item.parents) <= 1

    genes = [ item for item in items if item.type == "gene" ]
    downstrand_genes = [ _extend(_three_prime(item), self.extension) for item in genes ]
    exons = [ item for item in items if item.type == "exon" ]
    utrs = [ _extend(item, self.extension) for item in items if item.type == "three_prime_utr" ]

    gene_index = span_index.index_annotations(genes)
    downstrand_gene_index = span_index.index_annotations(downstrand_genes)
    exon_index = span_index.index_annotations(exons)
    utr_index = span_index.index_annotations(utrs)

    peaks = list(annotation.read_annotations(self.child))

    for peak in peaks:
        # Query is final base in genome before poly(A) starts
        query = peak.three_prime().shifted(-1,0)

        hit_to = "3'UTR"
        hits = [ item.parents[0].parents[0] for item in utr_index.get(query, True) ]
        if not hits:
            hit_to = "Exon"
            hits = [ item.parents[0].parents[0] for item in exon_index.get(query, True) ]
        # For non-coding RNAs, which don't have a 3' UTR
        if not hits:
            hit_to = "Downstrand"
            hits = downstrand_gene_index.get(query, True)
        if not hits:
            hit_to = "Intron"
            hits = gene_index.get(query, True)
        antisense_hits = gene_index.get(query.reversed(), True)
        if not hits:
            hit_to = "Antisense"
            hits = antisense_hits

        if hits:
            peak.attr["Parent"] = join_descriptions([ item.get_id() for item in hits ], ",")
            peak.attr["Relation"] = hit_to
            peak.attr["Name"] = join_descriptions([ item.attr.get("Name","") for item in hits ])
            peak.attr["Product"] = hit_to + " " + join_descriptions([ item.attr.get("Product","") for item in hits ])
            peak.attr["Biotype"] = join_descriptions([ item.attr.get("Biotype","") for item in hits ])

        if antisense_hits:
            peak.attr["Antisense_parent"] = join_descriptions([ item.get_id() for item in antisense_hits ], ",")
            peak.attr["Antisense_name"] = join_descriptions([ item.attr.get("Name","") for item in antisense_hits ])
            peak.attr["Antisense_product"] = "Antisense " + join_descriptions([ item.attr.get("Product","") for item in antisense_hits ])
            peak.attr["Antisense_biotype"] = join_descriptions([ item.attr.get("Biotype","") for item in antisense_hits ])

    annotation.write_gff3(self.prefix+"-parent.gff", genes) #Hmm
    annotation.write_gff3(self.prefix+"-child.gff", peaks)
def run(self):
    names = [ sample.output_dir for sample in self.samples ]
    #os.path.splitext(os.path.split(item)[1])[0]
    #for item in self.reads
    #]

    reference = reference_directory.Reference(self.reference, must_exist=True)

    workspace = io.Workspace(self.output_dir, must_exist=False)
    samplespace = io.Workspace(workspace/'samples', must_exist=False)
    plotspace = io.Workspace(workspace/'plots', must_exist=False)
    expressionspace = io.Workspace(workspace/'expression', must_exist=False)
    testspace = io.Workspace(workspace/'test', must_exist=False)
    testspace_dedup = io.Workspace(workspace/'test-dedup', must_exist=False)

    file_prefix = self.file_prefix
    if file_prefix and not file_prefix.endswith('-'):
        file_prefix += '-'

    #dirs = [
    #    workspace/item
    #    for item in names
    #]

    samples = [ ]
    for sample in self.samples:
        samples.append(sample(
            samplespace / sample.output_dir,
            reference = self.reference,
            ))

    dirs = [ item.output_dir for item in samples ]
    polya_dirs = [ item + '-polyA' for item in dirs ]
    interleaved = [ item2 for item in zip(dirs,polya_dirs) for item2 in item ]

    clipper_logs = [ join(item.output_dir, 'clipped_reads_log.txt') for item in samples ]
    filter_logs = [ join(item.output_dir, 'filter_log.txt') for item in samples ]
    filter_polya_logs = [ join(item.output_dir + '-polyA', 'filter_log.txt') for item in samples ]
    #filter_logs = [ item.get_filter_action().log_filename() for item in samples ]
    #filter_polya_logs = [ item.get_polya_filter_action().log_filename() for item in samples ]

    analyse_template = tail_lengths.Analyse_tail_counts(
        working_dirs = dirs,
        saturation = 0,
        extension = self.extension,
        annotations = reference/'reference.gff',
        types = 'gene',
        )

    with nesoni.Stage() as stage:
        for item in samples:
            item.process_make(stage)

    nesoni.Norm_from_samples(
        workspace/'norm',
        working_dirs = dirs,
        ).make()

    def writer():
        for row in io.read_table(workspace/'norm.csv'):
            row['Name'] = row['Name']+'-polyA'
            yield row
    io.write_csv(workspace/'norm-polyA.csv', writer(), comments=['Normalization'])

    with nesoni.Stage() as stage:
        if self.include_plots:
            for plot_name, directories, norm_filename in [
                    ('all', dirs, workspace/'norm.csv'),
                    ('polyA', polya_dirs, workspace/'norm-polyA.csv'),
                    ]:
                nesoni.IGV_plots(
                    plotspace/plot_name,
                    working_dirs = directories,
                    label_prefix = plot_name+' ',
                    raw = True,
                    norm = True,
                    genome = reference.get_genome_filename(),
                    norm_file = norm_filename,
                    #delete_igv = False,
                    ).process_make(stage)

        analyse_gene_counts_0 = analyse_template(
            output_dir = expressionspace/'genewise',
            saturation = 0,
            extension = self.extension,
            title = 'Genewise expression - ' + self.title,
            file_prefix = file_prefix+'genewise-',
            )
        analyse_gene_counts_0.process_make(stage)

        analyse_gene_counts_1 = analyse_template(
            output_dir = expressionspace/'genewise-dedup',
            saturation = 1,
            title = 'Genewise expression with read deduplication - ' + self.title,
            file_prefix = file_prefix+'genewise-dedup-',
            )
        analyse_gene_counts_1.process_make(stage)

        stage.process(self._run_peaks,
            workspace = workspace,
            expressionspace = expressionspace,
            reference = reference,
            polya_dirs = polya_dirs,
            analyse_template = analyse_template,
            file_prefix = file_prefix,
            )

    with nesoni.Stage() as stage:
        for test in self.tests:
            test(
                output_dir = testspace/test.output_dir,
                analysis = self.output_dir,
                ).process_make(stage)
            test(
                output_dir = testspace_dedup/test.output_dir,
                analysis = self.output_dir,
                dedup = True,
                ).process_make(stage)

    #===============================================
    # Report
    #===============================================

    r = reporting.Reporter(os.path.join(self.output_dir, 'report'), self.title, self.file_prefix)

    r.heading('Alignment to reference')

    r.report_logs('alignment-statistics',
        #[ workspace/'stats.txt' ] +
        clipper_logs +
        filter_logs +
        #filter_polya_logs +
        [ expressionspace/('genewise','aggregate-tail-counts_log.txt') ],
        filter=lambda sample, field: (
            field not in [
                'fragments','fragments aligned to the reference','reads kept',
                'average depth of coverage, ambiguous',
                'average depth of coverage, unambiguous',
                ]
            ),
        )

    if self.include_plots:
        r.heading('IGV plots')
        r.p('These files show the depth of coverage. They can be viewed with the IGV genome browser.')

        genome_files = [ ]
        if self.include_genome:
            genome_files.append(reference.get_genome_filename())
            genome_dir = reference.get_genome_dir()
            base = os.path.split(genome_dir)[1]
            for filename in os.listdir(genome_dir):
                genome_files.append((
                    os.path.join(genome_dir, filename),
                    os.path.join(base, filename),
                    ))

        r.p(r.tar('igv-plots',
            genome_files +
            glob.glob(plotspace/'*.tdf')
            ))

    if self.include_bams:
        r.heading('BAM files')
        r.p('These BAM files contain the alignments of reads to the reference sequences.')
        r.p('Reads with a poly(A) tail have an \'AN\' attribute giving the length of non-templated poly(A) sequence. '
            'Tail-tools only treats a read as having a tail if this length is at least 4.')

        bam_files = [ ]
        for name in names:
            bam_files.append( (samplespace/(name,'alignments_filtered_sorted.bam'), name+'.bam') )
            bam_files.append( (samplespace/(name,'alignments_filtered_sorted.bam.bai'), name+'.bam.bai') )
        r.p(r.tar('bam-files', bam_files))

    r.heading('Genewise expression')
    io.symbolic_link(source=expressionspace/('genewise','report'), link_name=r.workspace/'genewise')
    r.p('<a href="genewise/index.html">→ Genewise expression</a>')
    io.symbolic_link(source=expressionspace/('genewise-dedup','report'), link_name=r.workspace/'genewise-dedup')
    r.p('<a href="genewise-dedup/index.html">→ Genewise expression with read deduplication</a>')

    r.heading('Peakwise expression')

    web.Geneview_webapp(r.workspace/'view').run()

    peak_filename = expressionspace/('peakwise','features-with-data.gff')
    n_peaks = len(list(annotation.read_annotations(peak_filename)))
    r.p('%d peaks called (%d poly(A) reads were required to call a peak).'
        % (n_peaks, self.peak_min_depth))
    r.p(r.get(peak_filename, name='peaks.gff') + ' - peaks called')

    #if self.groups:
    #    r.subheading('Peak shift between groups')
    #    r.p(r.get(workspace/('peak-shift','grouped.csv')) + ' - genes with a potential peak shift')
    #    r.get(workspace/('peak-shift','grouped.json'))
    #
    #r.subheading('Peak shift between samples')
    #r.p(r.get(workspace/('peak-shift','individual.csv')) + ' - genes with a potential peak shift')
    #r.get(workspace/('peak-shift','individual.json'))

    io.symbolic_link(source=expressionspace/('peakwise','report'), link_name=r.workspace/'peakwise')
    r.p('<a href="peakwise/index.html">→ Peakwise expression</a>')
    io.symbolic_link(source=expressionspace/('peakwise-dedup','report'), link_name=r.workspace/'peakwise-dedup')
    r.p('<a href="peakwise-dedup/index.html">→ Peakwise expression with read deduplication</a>')

    if self.tests:
        r.heading('Differential tests')
        for test in self.tests:
            io.symbolic_link(source=testspace/test.output_dir, link_name=r.workspace/('test-'+test.output_dir))
            io.symbolic_link(source=testspace_dedup/test.output_dir, link_name=r.workspace/('test-dedup-'+test.output_dir))
            r.p('<a href="test-%s">→ %s</a> '
                ' <a href="test-dedup-%s" style="font-size: 66%%">[→ Deduplicated version]</a>'
                % (test.output_dir, test.get_title(), test.output_dir))

    r.heading('Gene viewers')
    r.p('Having identified interesting genes from heatmaps and differential tests above, '
        'these viewers allow specific genes to be examined in detail.')

    if self.groups:
        r.p('<a href="view.html?json=%sgrouped.json">→ Gene viewer, grouped samples</a>' % r.file_prefix)
    r.p('<a href="view.html?json=%sindividual.json">→ Gene viewer, individual samples</a>' % r.file_prefix)

    r.write('<p/><hr>\n')

    r.p('Note: Use deduplicated versions with care. '
        'They may possibly provide more significant results; however, they are less quantitative. '
        'Read deduplication involves throwing away a large amount of data, much of which will not be a technical artifact. '
        'Deduplicated versions might best be viewed as a check on data quality.')

    r.p('This set of genes was used in the analysis:')
    r.p(r.get(reference/'reference.gff') + ' - Reference annotations in GFF3 format')
    r.p(r.get(reference/'utr.gff') + ' - 3\' UTR regions')

    r.p('tail-tools version '+tail_tools.VERSION)
    r.p('nesoni version '+nesoni.VERSION)
    #r.p('SHRiMP version '+grace.get_shrimp_2_version())

    r.close()
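# Aside: a hedged sketch of the staging pattern used throughout the pipeline
# above. As used in this codebase, nesoni.Stage appears to run queued actions
# in parallel and wait for all of them when the with-block exits. The _Noop
# class below is illustrative only, not part of tail-tools.
import nesoni

class _Noop(object):
    def run(self):
        pass
    def process_make(self, stage):
        stage.process(self.run)   # queue self.run for parallel execution

with nesoni.Stage() as stage:
    for action in [ _Noop(), _Noop() ]:
        action.process_make(stage)
# Reaching here, all queued actions have completed.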
def run(self):
    work = self.get_workspace()
    work.update_param(remove=['tail_tools_reference_version'])

    nesoni.Make_reference(
        self.output_dir,
        filenames = self.filenames,
        snpeff = False,
        cs = 'ifavailable' if self.index else False,
        ls = False,
        bowtie = 'ifavailable' if self.index else False,
        ).run()

    annotations = list(annotation.read_annotations(work/'reference.gff'))
    annotation.link_up_annotations(annotations)

    exon_index = span_index.index_annotations(
        [ item for item in annotations if item.type == "exon" ])
    mrna_end_index = span_index.index_annotations(
        [ item.three_prime() for item in annotations if item.type == "mRNA" ])

    mrna_utrs = [ ]
    gene_utrs = [ ]

    for gene in annotations:
        if gene.type != 'gene': continue

        mrnas = [ item for item in gene.children if item.type == 'mRNA' ]
        assert mrnas, "Gene without any mRNAs: " + gene.get_id()

        gene.attr['color'] = '#880088'
        gene.start = min(item.start for item in mrnas)
        gene.end = max(item.end for item in mrnas)
        gene.attr["max_extension"] = str(_max_extension(gene, exon_index, mrna_end_index))

        gene_utr_5primes = [ ]

        for mrna in mrnas:
            assert mrna.strand == gene.strand, mrna
            assert mrna.seqid == gene.seqid, mrna

            mrna.attr["max_extension"] = str(_max_extension(mrna, exon_index, mrna_end_index))

            cdss = [ item for item in mrna.children if item.type == 'CDS' ]
            exons = [ item for item in mrna.children if item.type == 'exon' ]

            if not exons: continue

            #link up annotations sorts children, so final is really final
            for item in exons[:-1]:
                item.attr["max_extension"] = "0"
            exons[-1].attr["max_extension"] = mrna.attr["max_extension"]

            if not cdss: continue

            mrna_utr_5primes = [ ]

            if gene.strand >= 0:
                cds_3prime = max(item.end for item in cdss)
                for item in exons:
                    if item.end >= cds_3prime:
                        mrna_utr_5primes.append(max(item.start, cds_3prime))
            else:
                cds_3prime = min(item.start for item in cdss)
                for item in exons:
                    if item.start <= cds_3prime:
                        mrna_utr_5primes.append(min(item.end, cds_3prime))

            if mrna.strand >= 0:
                utr_start = min(mrna_utr_5primes) if mrna_utr_5primes else mrna.end
                utr_end = max(utr_start+1, mrna.end)
                gene_utr_5primes.append(utr_start)
            else:
                utr_end = max(mrna_utr_5primes) if mrna_utr_5primes else mrna.start
                utr_start = min(mrna.start, utr_end-1)
                gene_utr_5primes.append(utr_end)

            attr = mrna.attr.copy()
            attr['Parent'] = attr['ID']
            attr['ID'] = attr['ID'] + '-3UTR'
            attr['color'] = '#008888'
            utr = annotation.Annotation(
                source = 'tt',
                type = 'three_prime_utr',
                seqid = mrna.seqid,
                strand = mrna.strand,
                start = utr_start,
                end = utr_end,
                attr = attr,
                )
            max_ext = _max_extension(utr, exon_index, mrna_end_index)
            utr.attr["max_extension"] = str(max_ext)
            #Only include if there is an annotated 3' UTR or end is not in the middle of some other isoform's exon
            if utr_end - utr_start + max_ext > 1:
                mrna_utrs.append(utr)

        if gene.strand >= 0:
            utr_start = max(gene_utr_5primes) if gene_utr_5primes else gene.end
            utr_end = max(utr_start+1, gene.end)
        else:
            utr_end = min(gene_utr_5primes) if gene_utr_5primes else gene.start
            utr_start = min(gene.start, utr_end-1)

        attr = gene.attr.copy()
        attr['Parent'] = attr['ID']
        attr['ID'] = attr['ID'] + '-3UTR'
        attr['color'] = '#008888'
        utr = annotation.Annotation(
            source = 'tt',
            type = 'three_prime_utr',
            seqid = gene.seqid,
            strand = gene.strand,
            start = utr_start,
            end = utr_end,
            attr = attr,
            )
        utr.attr["max_extension"] = str(_max_extension(utr, exon_index, mrna_end_index))
        gene_utrs.append(utr)

    annotation.write_gff3(work/'reference.gff', annotations + mrna_utrs)
    annotation.write_gff3(work/'utr.gff', gene_utrs)

    work.update_param(tail_tools_reference_version=work.VERSION)
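# Aside: the "max_extension" attribute written above is consumed later when
# counting tails (see the tail-counting run() elsewhere in this file), which
# caps the requested downstrand extension per feature, roughly:
#     this_extension = min(requested_extension, int(item.attr["max_extension"]))
# so a feature is never extended into the next annotated exon or mRNA end.
# (_max_extension itself is a helper defined elsewhere in this module.)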
def run(self):
    #assert not self.utr_only or self.utrs, '--utrs-only yes but no --utrs given'

    # Reference genome
    #chromosome_lengths = reference_directory.Reference(self.reference, must_exist=True).get_lengths()
    chromosomes = collections.OrderedDict(io.read_sequences(self.reference))

    def get_interpeak_seq(peaks):
        start = min(item.transcription_stop for item in peaks)
        end = max(item.transcription_stop for item in peaks)
        if end-start > self.max_seq: return ''
        if peaks[0].strand >= 0:
            return chromosomes[peaks[0].seqid][start:end]
        else:
            return bio.reverse_complement(chromosomes[peaks[0].seqid][start:end])

    def get_prepeak_seq(gene, peaks):
        if gene.strand >= 0:
            start = gene.utr_pos
            end = min(item.transcription_stop for item in peaks)
            if end-start > self.max_seq: return ''
            return chromosomes[gene.seqid][start:end]
        else:
            start = max(item.transcription_stop for item in peaks)
            end = gene.utr_pos
            if end-start > self.max_seq: return ''
            return bio.reverse_complement(chromosomes[gene.seqid][start:end])

    # Normalization files
    if self.norm_file:
        norm_file = self.norm_file
    else:
        nesoni.Norm_from_counts(self.prefix+'-norm', self.counts).run()
        norm_file = self.prefix+'-norm.csv'

    norms = io.read_grouped_table(norm_file, [('All',str)])['All']
    pair_norm_names = [ ]
    pair_norms = [ ]
    for i in xrange(len(norms)):
        pair_norm_names.append(norms.keys()[i]+'-peak1')
        pair_norms.append(norms.values()[i])
    for i in xrange(len(norms)):
        pair_norm_names.append(norms.keys()[i]+'-peak2')
        pair_norms.append(norms.values()[i])
    io.write_grouped_csv(
        self.prefix+'-pairs-norm.csv',
        [('All',io.named_list_type(pair_norm_names)(pair_norms))],
        comments=['#Normalization'],
        )

    # Read data
    annotations = list(annotation.read_annotations(self.parents))
    if self.utrs:
        utrs = list(annotation.read_annotations(self.utrs))
    else:
        utrs = [ ]
    children = list(annotation.read_annotations(self.children))

    count_table = io.read_grouped_table(self.counts, [
        ('Count',int),
        ('Tail_count',int),
        ('Tail',_float_or_none),
        ('Proportion',_float_or_none),
        ('Annotation',str),
        ])
    counts = count_table['Count']
    tail_counts = count_table['Tail_count']
    proportions = count_table['Proportion']
    tails = count_table['Tail']

    samples = counts.value_type().keys()
    sample_tags = { }
    for line in count_table.comments:
        if line.startswith('#sampleTags='):
            parts = line[len('#sampleTags='):].split(',')
            assert parts[0] not in sample_tags
            sample_tags[parts[0]] = parts

    for item in children:
        item.weight = sum(
            counts[item.get_id()][name] * float(norms[name]['Normalizing.multiplier'])
            for name in samples
            )

    parents = [ ]
    id_to_parent = { }
    for item in annotations:
        if item.type != self.parent_type: continue
        assert item.get_id() not in id_to_parent, 'Duplicate id in parent file: '+item.get_id()
        parents.append(item)
        id_to_parent[item.get_id()] = item
        item.children = [ ]
        #item.cds = [ ]

        # Default utr
        if item.strand >= 0:
            item.utr_pos = item.end
        else:
            item.utr_pos = item.start

        if 'three_prime_UTR_start' in item.attr:
            if item.strand >= 0:
                item.utr_pos = int(item.attr['three_prime_UTR_start'])-1
            else:
                item.utr_pos = int(item.attr['three_prime_UTR_start'])

    for item in utrs:
        assert item.attr['Parent'] in id_to_parent, 'Unknown gene '+item.attr['Parent']
        id_to_parent[item.attr['Parent']].utr_pos = (item.start if item.strand >= 0 else item.end)

    for item in children:
        item.transcription_stop = item.end if item.strand >= 0 else item.start #End of transcription, 0-based, ie between-positions based
        if 'Parent' in item.attr:
            for item_parent in item.attr['Parent'].split(','):
                parent = id_to_parent[item_parent]
                parent.children.append(item)

    for item in parents:
        item.children.sort(key=_annotation_sorter)

        relevant = list(item.children)
        if self.utr_only:
            #if item.strand <= 0:
            #    relative_utr_start = item.end - int(item.attr['three_prime_UTR_start'])
            #else:
            #    relative_utr_start = int(item.attr['three_prime_UTR_start'])-1 - item.start
            #
            #def relative_start(peak):
            #    return item.end-peak.end if item.strand < 0 else peak.start-item.start
            #relevant = [ peak for peak in relevant if relative_start(peak) >= relative_utr_start ]
            relevant = [
                peak for peak in relevant
                if (peak.end >= item.utr_pos if item.strand >= 0 else peak.start <= item.utr_pos)
                ]
        if self.top:
            relevant.sort(key=lambda peak: peak.weight, reverse=True)
            relevant = relevant[:self.top]
        relevant.sort(key=_annotation_sorter)
        item.relevant_children = relevant

    # JSON output
    j_data = { }
    j_genes = j_data['genes'] = { }
    j_genes['__comment__'] = 'start is 0-based'
    j_genes['name'] = [ ]
    j_genes['chromosome'] = [ ]
    j_genes['strand'] = [ ]
    j_genes['start'] = [ ]
    j_genes['utr'] = [ ]
    j_genes['end'] = [ ]
    j_genes['gene'] = [ ]
    j_genes['product'] = [ ]
    j_genes['peaks'] = [ ]
    j_genes['relevant_peaks'] = [ ]
    #j_genes['cds'] = [ ]
    #j_genes['cds_start'] = [ ]
    #j_genes['cds_end'] = [ ]

    for item in parents:
        j_genes['name'].append( item.get_id() )
        j_genes['chromosome'].append( item.seqid )
        j_genes['strand'].append( item.strand )
        j_genes['start'].append( item.start )
        j_genes['utr'].append( item.utr_pos )
        j_genes['end'].append( item.end )
        j_genes['gene'].append( item.attr.get('Name',item.attr.get('gene','')) )
        j_genes['product'].append( item.attr.get('Product',item.attr.get('product','')) )
        j_genes['peaks'].append( [ item2.get_id() for item2 in item.children ] )
        j_genes['relevant_peaks'].append( [ item2.get_id() for item2 in item.relevant_children ] )
        #j_genes['cds'].append( item.cds )
        #j_genes['cds_start'].append( item.cds_start )
        #j_genes['cds_end'].append( item.cds_end )

    j_peaks = j_data['peaks'] = { }
    j_peaks['__comment__'] = 'start is 0-based'
    j_peaks['name'] = [ ]
    j_peaks['chromosome'] = [ ]
    j_peaks['strand'] = [ ]
    j_peaks['start'] = [ ]
    j_peaks['end'] = [ ]
    j_peaks['parents'] = [ ]
    j_peaks['counts'] = [ ]
    j_peaks['tail_lengths'] = [ ]
    j_peaks['proportion_tailed'] = [ ]
    for item in children:
        j_peaks['name'].append( item.get_id() )
        j_peaks['chromosome'].append( item.seqid )
        j_peaks['strand'].append( item.strand )
        j_peaks['start'].append( item.start )
        j_peaks['end'].append( item.end )
        j_peaks['parents'].append( item.attr['Parent'].split(',') if 'Parent' in item.attr else [ ] )
        j_peaks['counts'].append( counts[item.get_id()].values() )
        j_peaks['tail_lengths'].append( count_table['Tail'][item.get_id()].values() )
        j_peaks['proportion_tailed'].append( count_table['Proportion'][item.get_id()].values() )

    j_samples = j_data['samples'] = { }
    j_samples['name'] = [ ]
    j_samples['tags'] = [ ]
    j_samples['normalizing_multiplier'] = [ ]
    for name in samples:
        j_samples['name'].append(name)
        j_samples['tags'].append(sample_tags[name])
        j_samples['normalizing_multiplier'].append(float(norms[name]['Normalizing.multiplier']))

    j_chromosomes = j_data['chromosomes'] = { }
    j_chromosomes['name'] = [ ]
    j_chromosomes['length'] = [ ]
    for name, seq in chromosomes.iteritems():
        j_chromosomes['name'].append(name)
        j_chromosomes['length'].append(len(seq))

    with open(self.prefix + '.json','wb') as f:
        json.dump(j_data, f)

    # Output paired peak file
    output_comments = [ '#Counts' ]
    output_samples = [ ]
    for item in samples:
        output_samples.append(item+'-peak1')
        output_comments.append('#sampleTags=' + ','.join([item+'-peak1','peak1']+sample_tags.get(item,[])))
    for item in samples:
        output_samples.append(item+'-peak2')
        output_comments.append('#sampleTags=' + ','.join([item+'-peak2','peak2']+sample_tags.get(item,[])))

    output_names = [ ]
    output_counts = [ ]
    output_tail_counts = [ ]
    output_proportions = [ ]
    output_tails = [ ]
    output_annotation_fields = [ 'gene', 'product', 'mean_tail_1', 'mean_tail_2',
                                 'chromosome', 'strand', 'transcription_stops' ] #, 'interpeak_seq', ]
    output_annotations = [ ]

    for item in parents:
        peaks = item.relevant_children
        for i in xrange(len(peaks)-1):
            for j in xrange(i+1, len(peaks)):
                id_i = peaks[i].get_id()
                id_j = peaks[j].get_id()
                id_pair = item.get_id() + '-'+id_i+'-'+id_j
                output_names.append(id_pair)

                row = [ ]
                row.extend(counts[id_i].values())
                row.extend(counts[id_j].values())
                output_counts.append(filter(_text,row))

                row = [ ]
                row.extend(tail_counts[id_i].values())
                row.extend(tail_counts[id_j].values())
                output_tail_counts.append(filter(_text,row))

                row = [ ]
                row.extend(proportions[id_i].values())
                row.extend(proportions[id_j].values())
                output_proportions.append(filter(_text,row))

                row = [ ]
                row.extend(tails[id_i].values())
                row.extend(tails[id_j].values())
                output_tails.append(filter(_text,row))

                output_annotations.append([
                    item.attr.get('Name',item.attr.get('gene','')),
                    item.attr.get('Product',item.attr.get('product','')),
                    count_table['Annotation'][id_i]['mean-tail'],
                    count_table['Annotation'][id_j]['mean-tail'],
                    item.seqid,
                    str(item.strand),
                    '%d, %d' % (peaks[i].transcription_stop, peaks[j].transcription_stop),
                    #get_interpeak_seq([peaks[i],peaks[j]]),
                    ])

    #output_count_table = io.named_matrix_type(output_names,output_samples)(output_counts)
    io.write_grouped_csv(
        self.prefix + '-pairs.csv',
        [
            ('Count',io.named_matrix_type(output_names,output_samples)(output_counts)),
            ('Tail_count',io.named_matrix_type(output_names,output_samples)(output_tail_counts)),
            ('Proportion',io.named_matrix_type(output_names,output_samples)(output_proportions)),
            ('Tail',io.named_matrix_type(output_names,output_samples)(output_tails)),
            ('Annotation',io.named_matrix_type(output_names,output_annotation_fields)(output_annotations)),
        ],
        comments=output_comments,
        )

    # Chi Sq tests
    #
    #for id in relation:
    #    peaks = relation[id]
    #    if len(peaks) < 2: continue
    #
    #mats = [ ]
    #genes = [ ]
    #products = [ ]
    #mean_tails = [ ]
    #prop_tails = [ ]
    #
    #peak_names = [ ]
    #chromosome_names = [ ]
    #strands = [ ]
    #transcription_stops = [ ]
    #interpeak_seqs = [ ]
    #prepeak_seqs = [ ]
    #
    #for parent in parents:
    #    id = parent.get_id()
    #    peaks = parent.relevant_children
    #    if len(peaks) < 2: continue
    #
    #    matrix = [ ]
    #    for item in peaks:
    #        matrix.append(counts[item.get_id()].values())
    #
    #    mats.append(
    #        runr.R_literal(id) + ' = ' +
    #        runr.R_literal(matrix)
    #        )
    #
    #    genes.append(parent.attr.get('Name',parent.attr.get('gene','')))
    #    products.append(parent.attr.get('Product',parent.attr.get('product','')))
    #
    #    def format_mean(s):
    #        if s == 'NA': return 'NA'
    #        return '%.1f' % float(s)
    #    mean_tails.append(', '.join( format_mean(count_table['Annotation'][item.get_id()]['mean-tail']) for item in peaks ))
    #
    #    def format_prop(s):
    #        if s == 'NA': return 'NA'
    #        return '%.2f' % float(s)
    #    prop_tails.append(', '.join( format_prop(count_table['Annotation'][item.get_id()]['proportion-with-tail']) for item in peaks ))
    #
    #    peak_names.append(', '.join(item.get_id() for item in peaks))
    #    chromosome_names.append(parent.seqid)
    #    strands.append(parent.strand)
    #    transcription_stops.append(', '.join(str(item.transcription_stop) for item in peaks))
    #    interpeak_seqs.append(get_interpeak_seq(peaks))
    #    prepeak_seqs.append(get_prepeak_seq(parent,peaks))
    #
    #    #if len(mats) >= 10: break
    #
    #text = 'cat("Loading data into R+\n")\n'
    #text += 'data <- list(\n' + ',\n'.join(mats) + ')\n'
    #text += CHISQ
    #
    #runr.run_script(text,
    #    OUTPUT_FILENAME=self.prefix+'.csv',
    #    GENES = genes,
    #    PRODUCTS = products,
    #    MEAN_TAILS = mean_tails,
    #    PROP_TAILS = prop_tails,
    #    PEAK_NAMES = peak_names,
    #    CHROMOSOME_NAMES = chromosome_names,
    #    STRANDS = strands,
    #    TRANSCRIPTION_STOPS = transcription_stops,
    #    INTERPEAK_SEQS = interpeak_seqs,
    #    PREPEAK_SEQS = prepeak_seqs,
    #    )
    #
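# Aside: the nested i < j loops above enumerate every unordered pair of
# relevant peaks per gene exactly once; an equivalent standard-library
# formulation (illustrative only):
import itertools

def _peak_pairs(peaks):
    # yields (peaks[i], peaks[j]) for all i < j, in the same order as the loops above
    return itertools.combinations(peaks, 2)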
def run(self):
    assert self.extension is not None, '--extension must be specified'

    #workspace = self.get_workspace()
    workspace = working_directory.Working(self.working_dir, must_exist=True)
    if self.annotations is None:
        reference = workspace.get_reference()
        annotations_filename = reference.annotations_filename()
    else:
        annotations_filename = self.annotations

    types = [ item.lower() for item in self.types.split(',') ]

    annotations = [
        item
        for item in annotation.read_annotations(annotations_filename)
        if item.type.lower() in types
        ]

    self.log.log('%d annotations\n' % len(annotations))

    assert annotations, 'No annotations of specified types in file'

    index = { }

    for item in annotations:
        if item.strand >= 0:
            item.tail_pos = item.end
            item.end += self.extension
        else:
            item.tail_pos = item.start
            item.start -= self.extension

        if item.seqid not in index:
            index[item.seqid] = span_index.Span_index()
        index[item.seqid].insert(item)

        item.hits = [] # [ (rel_start, rel_end, tail_length, adaptor_bases) ]

    for item in index.itervalues():
        item.prepare()

    for read_name, fragment_alignments, unmapped in sam.bam_iter_fragments(workspace/'alignments_filtered.bam'):
        for fragment in fragment_alignments:
            start = min(item.pos-1 for item in fragment)
            end = max(item.pos+item.length-1 for item in fragment)
            alignment_length = end-start
            strand = -1 if fragment[0].flag&sam.FLAG_REVERSE else 1

            if strand >= 0:
                tail_pos = end
            else:
                tail_pos = start

            tail_length = 0
            adaptor_bases = 0
            for item in fragment[0].extra:
                if item.startswith('AN:i:'):
                    tail_length = int(item[5:])
                elif item.startswith('AD:i:'):
                    adaptor_bases = int(item[5:])

            if fragment[0].rname in index:
                hits = [
                    gene
                    for gene in index[fragment[0].rname].get(start,end)
                    if gene.strand == strand
                    ]
                if hits:
                    gene = min(hits, key=lambda gene: (abs(tail_pos - gene.tail_pos), gene.get_id()))
                    # Nearest by tail_pos
                    # failing that, by id to ensure a deterministic choice

                    if strand > 0:
                        rel_start = start - gene.start
                        rel_end = end - gene.start
                    else:
                        rel_start = gene.end - end
                        rel_end = gene.end - start

                    gene.hits.append( (rel_start,rel_end,tail_length,adaptor_bases) )

    f = io.open_possibly_compressed_writer(self.prefix + '.pickle.gz')
    pickle.dump((workspace.name, workspace.get_tags(), annotations), f, pickle.HIGHEST_PROTOCOL)
    f.close()
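# Aside: a worked instance of the reverse-strand coordinate flip above
# (values made up). rel_start and rel_end are measured from the gene's own
# 5' end regardless of genomic strand.
def _example_relative_coords():
    gene_start, gene_end = 100, 500   # reverse-strand gene span
    start, end = 450, 480             # fragment span in genome coordinates
    rel_start = gene_end - end        # -> 20
    rel_end = gene_end - start        # -> 50
    return rel_start, rel_end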