def run(self):
    # Merge a FASTA file and its matching QUAL file into FASTQ.
    # Assumes single-line records: each record is a '>' header line
    # followed by one line of sequence / one line of quality scores.
    fa = io.open_possibly_compressed_file(self.fasta_file)
    fq = io.open_possibly_compressed_file(self.qual_file)
    out_file = self.begin_output()
    while True:
        a1 = fa.readline()
        if not a1:
            break
        a1 = a1.strip()
        a2 = fa.readline().strip()
        q1 = fq.readline().strip()
        q2 = fq.readline().strip()
        assert a1.startswith('>')
        assert a1 == q1, 'Headers in FASTA and QUAL files do not match'
        print >> out_file, '@' + a1[1:]
        print >> out_file, a2
        print >> out_file, '+'
        print >> out_file, ''.join(
            chr(33 + max(0, int(item)))
            for item in q2.split())
    self.end_output(out_file)
    fa.close()
    fq.close()
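# A minimal sketch (not part of the original source) of the quality encoding
# used above: numeric QUAL scores become Phred+33 characters, with negative
# scores clamped to zero.
def qual_line_to_fastq(qual_line):
    return ''.join(chr(33 + max(0, int(q))) for q in qual_line.split())

# For example, qual_line_to_fastq('40 40 30 2') == 'II?#'.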
def begin_input(self):
    from nesoni import io
    if self.input is not None:
        return io.open_possibly_compressed_file(self.input)
    else:
        return sys.stdin
def read_gff(filename, joiner=None):
    f = io.open_possibly_compressed_file(filename)
    for line in f:
        line = line.rstrip()
        if line == '##FASTA':
            break
        if not line or line.startswith('#'):
            continue
        parts = line.split('\t')
        assert len(parts) >= 8, parts
        # Be nice, ignore spaces at start or end, eg in seqid
        parts = [item.strip() for item in parts]
        result = Annotation()
        result.seqid = parts[0]
        result.source = parts[1]
        result.type = parts[2]
        # Convert 1-based inclusive GFF coordinates to 0-based half-open.
        result.start = int(parts[3]) - 1
        result.end = int(parts[4])
        result.score = None if parts[5] == '.' else float(parts[5])
        result.strand = strand_from_gff[parts[6]]
        result.phase = None if parts[7] == '.' else int(parts[7])
        result.attr = {} if len(parts) < 9 else split_keyvals(parts[8], joiner)
        yield result
    f.close()
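# A hedged usage sketch (not in the original source): stream features from a
# possibly-gzipped GFF3 file and tally feature types. The filename
# 'annotations.gff.gz' is a placeholder.
counts = {}
for feature in read_gff('annotations.gff.gz'):
    counts[feature.type] = counts.get(feature.type, 0) + 1
for type_name in sorted(counts):
    print type_name, counts[type_name]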
def run(self):
    workspace = self.get_workspace()
    reference = reference_directory.Reference(self.reference, must_exist=True)
    reader_f = io.open_possibly_compressed_file(self.vcf)
    reader = vcf.Reader(reader_f)

    # Bucket VCF records by chromosome, sorted by position.
    variants = collections.defaultdict(list)
    for record in reader:
        variants[record.CHROM].append(record)
    reader_f.close()
    for chrom in variants:
        variants[chrom].sort(key=lambda item: item.POS)

    # One output FASTA per sample, truncated up front.
    filenames = [workspace / (item + '.fa') for item in reader.samples]
    for filename in filenames:
        with open(filename, 'wb'):
            pass

    for name, seq in io.read_sequences(reference.reference_fasta_filename()):
        for i, sample in enumerate(reader.samples):
            revised = []
            pos = 0
            for variant in variants[name]:
                gt = variant.samples[i].data.GT
                if gt is None:
                    continue
                assert gt.isdigit(), 'Unsupported genotype (can only use haploid genotypes): ' + gt
                gt_number = int(gt)
                if gt_number == 0:
                    var_seq = variant.REF
                else:
                    var_seq = str(variant.ALT[gt_number - 1])
                assert re.match('[ACGTN]*$', var_seq), 'Unsupported variant type: ' + var_seq
                new_pos = variant.POS - 1
                assert new_pos >= pos, 'Variants overlap.'
                revised.append(seq[pos:new_pos])
                pos = new_pos
                revised.append(var_seq)
                assert seq[pos:pos + len(variant.REF)].upper() == variant.REF, \
                    'REF column in VCF does not match reference sequence'
                pos += len(variant.REF)
            revised.append(seq[pos:])

            with open(filenames[i], 'ab') as f:
                io.write_fasta(f, name, ''.join(revised))
        del variants[name]

    assert not variants, 'Chromosome names in VCF not in reference: ' + ' '.join(variants)
def read_annotations(filename, joiner=None):
    # Sniff the file format from the first kilobyte.
    f = io.open_possibly_compressed_file(filename)
    peek = f.read(1024)
    f.close()
    if peek.startswith('LOCUS'):
        return read_genbank(filename)
    elif peek.startswith('##gff') or peek.split('\n')[0].count('\t') in (7, 8):
        return read_gff(filename, joiner)
    else:
        raise grace.Error('Not an annotation file.')
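# A hedged sketch (not in the original source): read_annotations() accepts
# either GenBank or GFF3 input and yields the same Annotation stream, so
# downstream code need not care about the format. 'reference.gbk' is a
# placeholder filename.
for item in read_annotations('reference.gbk'):
    print item.seqid, item.type, item.start, item.end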
def get_object(self, path, plain_text=False):
    from nesoni import io
    f = io.open_possibly_compressed_file(self._object_filename(path))
    if plain_text:
        data = f.read()
        try:
            result = json.loads(data)
        except ValueError:
            #Older versions used repr instead of json.dump
            result = eval(data)
    else:
        result = cPickle.load(f)
    f.close()
    return result
def run(self):
    f = self.begin_output()
    for filename in self.filenames:
        info = io.get_file_info(filename)
        any = False
        name = os.path.splitext(os.path.split(filename)[1])[0]
        if info.matches('sequences'):
            total = 0
            total_length = 0
            for seq in io.read_sequences(filename, qualities=True):
                total += 1
                total_length += len(seq[1])
            print >> f, grace.datum(name, 'sequences', total)
            print >> f, grace.datum(name, 'total bases', total_length)
            if total:
                print >> f, grace.datum(name, 'average length', float(total_length) / total)
            print >> f
            any = True
        if info.matches('annotations'):
            total = 0
            counts = {}
            for item in annotation.read_annotations(filename, "/"):
                total += 1
                counts[item.type] = counts.get(item.type, 0) + 1
            print >> f, grace.datum(name, 'features', total)
            for key in sorted(counts):
                print >> f, grace.datum(name, key + ' features', counts[key])
            print >> f
            any = True
        if info.matches('type-vcf'):
            reader_f = io.open_possibly_compressed_file(filename)
            reader = vcf.Reader(reader_f)
            n = 0
            for item in reader:
                n += 1
            print >> f, grace.datum(name, 'variants', n)
            any = True
        if not any:
            raise grace.Error('Don\'t know what to do with ' + filename)
    self.end_output(f)
def __init__(self, filename):
    assert os.path.exists(filename), filename + ' does not exist'
    if is_bam(filename):
        # BAM is binary: stream it through "samtools view" and read the
        # resulting SAM text from the pipe.
        self.process = io.run([
            'samtools', 'view', io.abspath(filename),
        ])
        ## Godawful hack
        #self.process.stdout = io.process_buffer(self.process.stdout)
        self.file = self.process.stdout
    else:
        self.process = None
        self.file = io.open_possibly_compressed_file(filename)
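# A hedged usage sketch (not in the original source; the enclosing class is
# not shown above, so the name Bam_reader is an assumption). The constructor
# exposes a uniform .file handle whether the input is BAM (via samtools) or
# plain/compressed SAM:
reader = Bam_reader('alignments.bam')
for line in reader.file:
    if not line.startswith('@'):
        print line.rstrip('\n').split('\t')[0]  # read name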
def read_genbank(filename):
    from Bio import SeqIO
    f = io.open_possibly_compressed_file(filename)
    id_counter = 0
    for record in SeqIO.parse(f, 'genbank'):
        name = record.id
        if name == '' or name == 'unknown':
            name = record.name

        # Walk the feature tree depth-first, yielding one flat
        # Annotation per feature.
        for root_feature in record.features:
            todo = [root_feature]
            while todo:
                feature = todo.pop()
                result = Annotation()
                result.seqid = name
                result.source = 'genbank-file'
                result.type = feature.type
                result.start = feature.location.nofuzzy_start
                result.end = feature.location.nofuzzy_end
                result.score = None
                result.strand = feature.strand
                result.phase = 0 #FIXME
                result.attr = {}
                for key in feature.qualifiers:
                    result.attr[key] = ', '.join(feature.qualifiers[key])
                yield result
                if 'ID' not in result.attr:
                    id_counter += 1
                    result.attr['ID'] = '%d' % id_counter
                for sub_feature in feature.sub_features:
                    # Tag each child with its parent's ID (GFF3-style
                    # Parent linkage).
                    sub_feature.qualifiers['Parent'] = [result.attr['ID']]
                todo.extend(feature.sub_features[::-1])
    f.close()
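# A hedged usage sketch (not in the original source). Synthetic IDs are
# assigned to a record only *after* it has been yielded, so consume the whole
# generator before relying on attr['ID']; 'reference.gbk' is a placeholder
# filename.
features = list(read_genbank('reference.gbk'))
by_id = dict((feat.attr['ID'], feat) for feat in features)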
def run(self):
    reader_f = io.open_possibly_compressed_file(self.vcf)
    reader = vcf.Reader(reader_f)

    tags = {}
    for item in reader.metadata.get('sampleTags', []):
        parts = item.split(',')
        tags[parts[0]] = parts

    assert 'reference' not in reader.samples, 'Can\'t have a sample called reference, sorry.'
    samples = ['reference'] + reader.samples

    for sample in samples:
        if sample not in tags:
            tags[sample] = [sample, 'all']

    samples = selection.select_and_sort(
        self.select, self.sort, samples, lambda sample: tags[sample])

    required = [i for i, sample in enumerate(samples)
                if selection.matches(self.require, tags[sample])]

    sample_number = dict((b, a) for a, b in enumerate(reader.samples))

    items = []
    for record in reader:
        variants = get_variants(record)
        genotypes = []
        counts = []
        qualities = []
        for sample in samples:
            if sample == 'reference':
                genotypes.append([0])
                counts.append([1])
                qualities.append(float('inf'))
            else:
                genotypes.append(get_genotype(record.samples[sample_number[sample]]))
                counts.append(get_variant_counts(record.samples[sample_number[sample]]))
                qualities.append(record.samples[sample_number[sample]].data.GQ)

        # Only output when there are at least two genotypes
        any_interesting = False
        for i in xrange(len(genotypes)):
            for j in xrange(i):
                if (genotypes[i] is not None and
                        genotypes[j] is not None and
                        not genotypes_equal(genotypes[i], genotypes[j])):
                    any_interesting = True
                    break
            if any_interesting:
                break
        if not any_interesting:
            continue

        if any(genotypes[i] is None for i in required):
            continue

        if self.only_snps and any(
                genotype is not None and any(len(variants[i]) != 1 for i in genotype)
                for genotype in genotypes):
            continue

        snpeff = snpeff_describe(record.INFO.get('EFF', ''))
        if not any(selection.matches(self.snpeff_filter, item[1])
                   for item in (snpeff or [('', [])])):
            continue

        items.append(_Nway_record(
            variants=variants, genotypes=genotypes, counts=counts,
            qualities=qualities, snpeff=snpeff, record=record))

    self.log.log('%d variants\n\n' % len(items))

    if self.as_ == 'table':
        self._write_table(samples, items)
    elif self.as_ == 'nexus':
        self._write_nexus(samples, items)
    elif self.as_ == 'splitstree':
        self._write_nexus(samples, items)
        io.execute(
            'SplitsTree +g -i INPUT -x COMMAND',
            no_display=True,
            INPUT=self.prefix + '.nex',
            COMMAND='UPDATE; '
                    'SAVE FILE=\'%s.nex\' REPLACE=yes; '
                    'EXPORTGRAPHICS format=svg file=\'%s.svg\' REPLACE=yes '
                    'TITLE=\'NeighborNet from %d variants\'; '
                    'QUIT'
                    % (self.prefix, self.prefix, len(items)),
        )
    elif self.as_ == 'vcf':
        self._write_vcf(samples, items, reader)
    else:
        raise grace.Error('Unknown output format: ' + self.as_)
def nway_main(gbk_filename, use_indels, use_reference,
              give_evidence, give_consequences,
              require_all, require_bisect, full_output, format,
              working_dirs, split_a, split_b, f=sys.stdout):
    assert working_dirs, "Need at least one working directory."
    workspaces = [working_directory.Working(dirname, must_exist=True)
                  for dirname in working_dirs]
    reference = workspaces[0].get_reference()
    #if not annotation_filename:
    #    annotation_filename = reference.annotations_filename() #May still be None

    if use_reference:
        names = ["reference"]
        evidence_start = 1
    else:
        names = []
        evidence_start = 0
    names.extend(norm_name(item) for item in working_dirs)

    references = io.read_sequences(reference.reference_fasta_filename())

    annotations = {}
    if gbk_filename:
        from Bio import SeqIO
        for record in SeqIO.parse(io.open_possibly_compressed_file(gbk_filename), "genbank"):
            sequence = record.seq.tostring()
            features = [item for item in record.features if item.type != "source"]
            features.sort(key=lambda item: item.location.nofuzzy_start)
            annotations[sequence] = features

    iterator = reader(working_dirs, references, use_reference, annotations)

    if not use_indels:
        iterator = itertools.ifilter(has_no_indels, iterator)

    if require_all or require_bisect or format == "counts":
        iterator = itertools.ifilter(fully_unambiguous, iterator)

    if require_bisect:
        iterator = itertools.ifilter(is_binary_partition, iterator)

    if not require_bisect:
        if full_output:
            iterator = itertools.ifilter(not_boring_insertion, iterator)
        else:
            iterator = itertools.ifilter(is_interesting, iterator)

    if split_a or split_b:
        assert len(names) == len(set(names)), "Two samples with the same name"
        try:
            split_a = [names.index(norm_name(item)) for item in split_a]
            split_b = [names.index(norm_name(item)) for item in split_b]
        except ValueError:
            raise grace.Error("Sample to be split is not amongst samples given")
        iterator = itertools.ifilter(is_split(split_a, split_b), iterator)

    #if limit:
    #    iterator = itertools.islice(iterator, limit)

    if format == "table":
        line = "Reference\tPosition\tChange type"
        line += "\t" + "\t".join(names)
        if give_evidence:
            line += "\t" + "\t".join(names[evidence_start:])
        if give_consequences:
            line += "\t" + "\t".join(names[evidence_start:])
        if annotations:
            line += "\tAnnotations"
        print >> f, line
        for calls in iterator:
            line = "%s\t%d\t%s\t%s" % (
                calls.ref_name,
                calls.ref_pos + 1,
                change_type(calls),
                "\t".join(item.consensus for item in calls.calls),
            )
            if give_evidence:
                line += "\t" + "\t".join(item.evidence for item in calls.calls[evidence_start:])
            if give_consequences:
                line += "\t" + "\t".join(item.consequences for item in calls.calls[evidence_start:])
            if annotations:
                line += "\t" + describe_features(calls.features)
            print >> f, line

    elif format == "compact":
        for line in transpose_strings(names):
            print >> f, line
        print >> f
        for calls in iterator:
            if calls.is_insertion:
                footer = "%12d.5 %s" % (calls.ref_pos, calls.ref_name)
            else:
                footer = "%12d %s" % (calls.ref_pos + 1, calls.ref_name)
            t = transpose_strings([item.consensus for item in calls.calls], "-", 1)
            top = t[0] + " " + footer
            if give_consequences:
                consequences = []
                for call in calls.calls:
                    if call.consequences:
                        for item in call.consequences.split(", "):
                            item = " ".join(item.split()[:3])
                            if item not in consequences:
                                consequences.append(item)
                if consequences:
                    top += " " + " / ".join(sorted(consequences))
            top += " " + describe_features(calls.features)
            print >> f, top
            for line in t[1:]:
                print >> f, line

    elif format == "nexus":
        buckets = [[] for name in names]
        for calls in iterator:
            for i, char in enumerate(partition_string(calls)):
                buckets[i].append(char)

        print >> f, "#NEXUS"
        print >> f, "begin taxa;"
        print >> f, "dimensions ntax=%d;" % len(names)
        print >> f, "taxlabels"
        for name in names:
            print >> f, name
        print >> f, ";"
        print >> f, "end;"

        print >> f, "begin characters;"
        print >> f, "dimensions nchar=%d;" % len(buckets[0])
        print >> f, 'format datatype=STANDARD symbols="ACGT-0123456789" missing=N;'
        print >> f, "matrix"
        for name, bucket in itertools.izip(names, buckets):
            print >> f, name, "".join(bucket)
        print >> f, ";"
        print >> f, "end;"

    elif format == "counts":
        for line in transpose_strings(names):
            print >> f, line
        print >> f
        counts = {}
        for calls in iterator:
            count_str = partition_string(calls)
            if count_str not in counts:
                counts[count_str] = 1
            else:
                counts[count_str] += 1
        for count_str in sorted(counts, key=lambda x: (counts[x], x), reverse=True):
            print >> f, "%s %d" % (transpose_strings(count_str)[0], counts[count_str])

    else:
        raise grace.Error("Unknown output format: " + format)
def run(self):
    workspace = self.get_workspace()

    # Collect the GFF header, noting sequence lengths from
    # ##sequence-region directives.
    header = ["##gff-version 3\n"]
    lengths = {}
    with io.open_possibly_compressed_file(self.features) as f:
        f.next()
        for line in f:
            if not line.startswith("#"):
                break
            if line.startswith("##gff-version"):
                continue
            header.append(line)
            parts = line.strip().split()
            if parts[0] == "##sequence-region":
                lengths[parts[1]] = int(parts[3])
    header = "".join(header)

    items = list(annotation.read_gff(self.features, "/"))
    annotation.link_up_annotations(items)
    for item in items:
        assert len(item.parents) < 2
        # Strip Ensembl-style "type:" prefixes from IDs.
        if "ID" in item.attr:
            item.attr["ID"] = item.attr["ID"].split(":")[1]
        if "Parent" in item.attr:
            item.attr["Parent"] = item.attr["Parent"].split(":")[1]
        if item.parents:
            item.parent = item.parents[0]

    def well_supported(item):
        if self.support is None:
            return True
        level = item.attr.get("transcript_support_level", "NA").split()[0]
        if not level.isdigit():
            return False
        return int(level) <= self.support

    exons = [item for item in items
             if item.type == "exon" and well_supported(item.parent)]
    exon_index = span_index.index_annotations(exons)

    utrs = []
    extended_utrs = []
    utr_parts = []
    exons_kept = []
    cds_kept = []
    transcripts_kept = []
    for item in items:
        this_exons = [item2 for item2 in item.children if item2.type == "exon"]
        if this_exons and well_supported(item):
            transcripts_kept.append(item)
            exons_kept.extend(this_exons)
            cds_kept.extend([item2 for item2 in item.children if item2.type == "CDS"])

        if self.gene_level:
            utr_bits = [item3
                        for item2 in item.children if well_supported(item2)
                        for item3 in item2.children if item3.type == self.what]
        else:
            if not well_supported(item):
                continue
            utr_bits = [item2 for item2 in item.children if item2.type == self.what]

        if not utr_bits:
            continue

        utr = utr_bits[0].copy()
        for item2 in utr_bits[1:]:
            utr = utr.span_with(item2)

        gene = item if self.gene_level else item.parent
        utr.attr = dict(
            ID=item.get_id(),
            Name=item.attr["Name"],
            gene_id=gene.get_id(),
            gene=gene.attr["Name"],
            description=gene.attr.get("description", ""),
            biotype=item.attr["biotype"])

        # Work out how far the UTR can be extended without running off the
        # chromosome or into the next same-strand exon, up to 10kb.
        max_extension = 10000
        if item.strand < 0:
            max_extension = min(max_extension, utr.start)
        else:
            max_extension = min(max_extension, lengths[utr.seqid] - utr.end)
        assert max_extension >= 0, utr

        end = utr.three_prime()
        for hit in exon_index.get(end.shifted(0, max_extension), same_strand=True):
            #if hit.parent.get_id() == item.get_id():
            #    continue
            rel = hit.relative_to(end).start
            if rel >= 0:
                max_extension = min(max_extension, rel)

        extended_utr = utr.shifted(0, max_extension)
        extended_utr.start = max(extended_utr.start, 0)
        utr.attr["max_extension"] = str(max_extension)

        utrs.append(utr)
        extended_utrs.append(extended_utr)
        for item2 in utr_bits:
            part = item2.copy()
            part.attr = dict(Parent=item.get_id())
            part.type = "part"
            utr_parts.append(part)

    write_gff3(workspace/"utr.gff", utrs, header)
    write_gff3(workspace/"utr_extended.gff", extended_utrs, header)
    write_gff3(workspace/"utr_part.gff", utr_parts, header)
    write_gff3(workspace/"transcript.gff", transcripts_kept, header)
    write_gff3(workspace/"exon.gff", exons_kept, header)
    write_gff3(workspace/"cds.gff", cds_kept, header)
def run(self):
    references = {}
    for filename in self.reference_filenames:
        for name, seq in io.read_sequences(filename):
            references[name] = seq

    tail_lengths = {}
    adaptor_bases = {}
    for filename in self.clips:
        with io.open_possibly_compressed_file(filename) as f:
            for line in f:
                if line.startswith('#'):
                    continue
                parts = line.rstrip('\n').split('\t')
                name = parts[0].split()[0]
                tail_lengths[name] = int(parts[3]) - int(parts[2])
                adaptor_bases[name] = int(parts[6])

    in_file = self.begin_input()
    out_file = self.begin_output()

    assert self.prop_a >= 0.0 and self.prop_a <= 1.0
    a_score = 1 - self.prop_a
    non_a_score = -self.prop_a

    for line in in_file:
        line = line.rstrip()
        if line.startswith('@'):
            print >> out_file, line
            continue

        al = Alignment(line)
        if al.flag & FLAG_UNMAPPED:
            continue

        #ref = references[al.rname]

        reverse = al.flag & FLAG_REVERSE
        if reverse:
            read_bases = rev_comp(al.seq)
            read_qual = al.qual[::-1]
            cigar = cigar_decode(al.cigar)[::-1]
        else:
            read_bases = al.seq
            read_qual = al.qual
            cigar = cigar_decode(al.cigar)

        n_tail = tail_lengths[al.qname]

        #if reverse:
        #    if al.pos-1-n_tail < 0: continue #TODO: handle tail extending beyond end of reference
        #    bases_ref = rev_comp(ref[al.pos-1-n_tail:al.pos-1])
        #else:
        #    if al.pos-1+al.length+n_tail > len(ref): continue #TODO: handle tail extending beyond end of reference
        #    bases_ref = ref[al.pos-1+al.length:al.pos-1+al.length+n_tail].upper() #upper was missing for a long time. Bug!
        #
        #extension = 0
        #while extension < n_tail and bases_ref[extension] == 'A':
        #    extension += 1

        if reverse:
            feat = annotation.Annotation(al.rname, start=al.pos-1-n_tail, end=al.pos-1, strand=-1)
        else:
            feat = annotation.Annotation(al.rname, start=al.pos-1+al.length, end=al.pos-1+al.length+n_tail, strand=1)
        bases_ref = feat.get_seq(references).upper()

        # Allow up to 60% mismatch on As
        # Treat soft clipping as insertion for simplicity
        cigar = cigar.replace('S', 'I')
        assert 'H' not in cigar, "Can't handle hard clipping"

        extension = 0
        best_score = 0.0
        score = 0.0

        # Soft clipping treated as a mismatch
        i = len(cigar) - 1
        while i >= 0 and cigar[i] in 'I':
            score += non_a_score
            i -= 1

        for i in xrange(n_tail):
            if bases_ref[i] == 'A':
                score += a_score
            else:
                score += non_a_score
            if score >= best_score:
                extension = i + 1
                best_score = score

        #print >> sys.stderr, reverse!=0, n_tail, extension, bases_ref

        if n_tail - extension > 0:
            al.extra.append('AN:i:%d' % (n_tail - extension))
            al.extra.append('AG:i:%d' % extension)
        if adaptor_bases[al.qname]:
            al.extra.append('AD:i:%d' % adaptor_bases[al.qname])
        if n_tail - extension >= self.tail:
            #if reverse:
            #    tail_refpos = al.pos-extension
            #else:
            #    tail_refpos = al.pos+al.length+extension-1
            #al.extra.append('AA:i:%d'%tail_refpos)
            al.extra.append('AA:i:1')

        cigar += 'M' * extension
        read_bases += 'N' * extension #Since mispriming is so common (and loading the original sequence here would be a pain)
        read_qual += chr(33+20) * extension #Arbitrarily give quality 20
        al.length += extension
        if reverse:
            al.pos -= extension
            al.seq = rev_comp(read_bases)
            al.qual = read_qual[::-1]
            al.cigar = cigar_encode(cigar[::-1])
        else:
            al.seq = read_bases
            al.qual = read_qual
            al.cigar = cigar_encode(cigar)

        print >> out_file, al

    self.end_output(out_file)
    self.end_input(in_file)
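# A hedged, self-contained sketch (not in the original source) of the scoring
# rule above: each reference 'A' past the alignment scores (1 - prop_a),
# anything else scores -prop_a, and the chosen extension is the longest
# prefix whose running score is maximal. Any nonzero extension chosen this
# way contains at least a prop_a fraction of A's. The default prop_a here is
# illustrative; the tool takes it from self.prop_a.
def best_a_extension(bases_ref, prop_a=0.6):
    a_score = 1.0 - prop_a
    non_a_score = -prop_a
    extension = 0
    best_score = 0.0
    score = 0.0
    for i, base in enumerate(bases_ref):
        score += a_score if base == 'A' else non_a_score
        if score >= best_score:
            extension = i + 1
            best_score = score
    return extension

# For example, best_a_extension('AAGAA') == 5, best_a_extension('AAGG') == 2,
# and best_a_extension('G') == 0.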
def run(self):
    assert len(self.pickles) > 0, "No samples to count."

    work = self.get_workspace()

    data = []
    names = []
    sample_tags = []

    old = grace.status("Loading pickles")

    max_length = 1
    for i, item in enumerate(self.pickles):
        grace.status("Loading " + os.path.basename(item))
        f = io.open_possibly_compressed_file(item)
        name, tags, datum = pickle.load(f)
        f.close()
        data.append(datum)
        names.append(name)
        sample_tags.append(tags)

        try:
            max_length = max(max_length, max(
                item[0] #tail_length
                for feature in datum
                for item in feature.hits) + 1)
        except ValueError:
            pass

        if i == 0:
            annotations = datum

    grace.status(old)

    self.log.log("Maximum tail length %d\n" % max_length)

    for i in xrange(len(names)):
        n_alignments = 0
        for feature in data[i]:
            feature.total_count = len(feature.hits)
            feature.tail_counts = [0] * max_length
            n_alignments += feature.total_count
            for tail_length, adaptor_bases in feature.hits:
                if adaptor_bases >= self.adaptor:
                    feature.tail_counts[tail_length] += 1
            del feature.hits
        self.log.datum(names[i], 'Alignments to features', n_alignments)

    counts = [] # [feature][sample](total_count, [taillength])
    for item in data:
        assert len(item) == len(data[0])
    for row in itertools.izip(*data):
        this_counts = [(item.total_count, item.tail_counts) for item in row]
        counts.append(this_counts)

    n_features = len(counts)
    n_samples = len(data)

    sample_n = [[0]*n_samples for i in xrange(n_features)]          # [feature][sample] Total count
    sample_n_tail = [[0]*n_samples for i in xrange(n_features)]     # [feature][sample] Polya count
    sample_prop = [[None]*n_samples for i in xrange(n_features)]    # [feature][sample] Proportion of reads with tail (deprecated)
    sample_tail = [[None]*n_samples for i in xrange(n_features)]    # [feature][sample] Mean tail length in each sample
    sample_sd_tail = [[None]*n_samples for i in xrange(n_features)] # [feature][sample] Std dev tail length in each sample
    sample_total_tail = [[0]*n_samples for i in xrange(n_features)]

    sample_quantile_tail = collections.OrderedDict(
        (item, [[None]*n_samples for i in xrange(n_features)])
        for item in [25, 50, 75, 100])

    overall_n = [0]*n_features        # [feature] Overall count
    overall_prop = [None]*n_features  # [feature] Overall proportion with tail
    overall_tail = [None]*n_features  # [feature] Overall mean tail length
    overall_n_tail = [0]*n_features   # [feature] Overall polya count

    for i, row in enumerate(counts):
        for j, (this_this_n, item) in enumerate(row):
            sample_n[i][j] = this_this_n
            sample_n_tail[i][j] = sum(item[self.tail:])
            sample_total_tail[i][j] = sum(item[k]*k for k in xrange(self.tail, max_length))

            if sample_n[i][j] >= 1:
                sample_prop[i][j] = float(sample_n_tail[i][j]) / sample_n[i][j]

            if sample_n_tail[i][j] >= 1:
                sample_tail[i][j] = float(sample_total_tail[i][j]) / sample_n_tail[i][j]

                for quantile in sample_quantile_tail:
                    counter = sample_n_tail[i][j] * quantile / 100.0
                    for k in xrange(self.tail, max_length):
                        counter -= item[k]
                        if counter <= 0:
                            break
                    sample_quantile_tail[quantile][i][j] = k

            if sample_n_tail[i][j] >= 2:
                sample_sd_tail[i][j] = math.sqrt(
                    float(sum(item[k]*((k-sample_tail[i][j])**2)
                              for k in xrange(self.tail, max_length)))
                    / (sample_n_tail[i][j]-1))

        overall_n[i] = sum(sample_n[i])
        overall_n_tail[i] = sum(sample_n_tail[i])
        if overall_n[i] >= 1:
            overall_prop[i] = float(sum(sample_n_tail[i])) / overall_n[i]
        if overall_n_tail[i] >= 1:
            overall_tail[i] = float(sum(sample_total_tail[i])) / overall_n_tail[i]

    for i, name in enumerate(names):
        this_total = sum(item[i] for item in sample_total_tail)
        this_n = sum(item[i] for item in sample_n_tail)
        if this_n:
            self.log.datum(name, 'Average poly-A tail', float(this_total)/this_n)

    for i, name in enumerate(names):
        this_total = sum(item[i] for item in sample_n_tail)
        this_n = sum(item[i] for item in sample_n)
        if this_n:
            self.log.datum(name, 'Average proportion of reads with tail', float(this_total)/this_n)

    with open(work/'features-with-data.gff', 'wb') as f:
        annotation.write_gff3_header(f)
        for i, item in enumerate(annotations):
            item.attr['reads'] = str(overall_n[i])
            item.attr['reads_with_tail'] = str(overall_n_tail[i])
            item.attr['mean_tail'] = '%.1f' % overall_tail[i] if overall_tail[i] else 'NA'
            item.attr['proportion_with_tail'] = '%.2f' % overall_prop[i] if overall_prop[i] else 'NA'

            if overall_tail[i] is None:
                item.attr['color'] = '#444444'
            else:
                a = (overall_tail[i]-self.tail) / max(1, max_length-self.tail)
                item.attr['color'] = '#%02x%02x%02x' % (
                    int(a*255), int((1-abs(a*2-1))*255), 255-int(a*255))
            #item.attr['color'] = ...
            print >> f, item.as_gff()

    comments = ['#Counts'] + [
        '#sampleTags=' + ','.join(tags)
        for tags in sample_tags
    ] + [
        '"Tail_count" group is number of reads with tail',
        '"Tail" group is mean tail per sample',
        '"Proportion" group is proportion of reads with tail',
    ]

    have_biotype = any("Biotype" in item.attr for item in annotations)
    have_parent = any("Parent" in item.attr for item in annotations)
    have_relation = any("Relation" in item.attr for item in annotations)
    have_antisense = any("Antisense_parent" in item.attr for item in annotations)

    def counts_iter():
        for i in xrange(n_features):
            row = collections.OrderedDict()
            row['Feature'] = annotations[i].get_id()
            for j in xrange(n_samples):
                row[('Count', names[j])] = '%d' % sample_n[i][j]
            row[('Annotation', 'Length')] = annotations[i].end - annotations[i].start
            row[('Annotation', 'gene')] = annotations[i].attr.get('Name', '')
            row[('Annotation', 'product')] = annotations[i].attr.get('Product', '')
            if have_biotype:
                row[('Annotation', 'biotype')] = annotations[i].attr.get('Biotype', '')
            if have_parent:
                row[('Annotation', 'parent')] = annotations[i].attr.get('Parent', '')
            if have_relation:
                row[('Annotation', 'relation')] = annotations[i].attr.get('Relation', '')
            if have_antisense:
                row[('Annotation', 'antisense_gene')] = annotations[i].attr.get('Antisense_name', '')
                row[('Annotation', 'antisense_product')] = annotations[i].attr.get('Antisense_product', '')
                row[('Annotation', 'antisense_biotype')] = annotations[i].attr.get('Antisense_biotype', '')
                row[('Annotation', 'antisense_parent')] = annotations[i].attr.get('Antisense_parent', '')
            row[('Annotation', 'chromosome')] = str(annotations[i].seqid)
            row[('Annotation', 'strand')] = str(annotations[i].strand)
            row[('Annotation', 'start')] = str(annotations[i].start+1)
            row[('Annotation', 'end')] = str(annotations[i].end)
            row[('Annotation', 'reads')] = str(overall_n[i])
            row[('Annotation', 'reads-with-tail')] = str(overall_n_tail[i])
            row[('Annotation', 'mean-tail')] = str_na(overall_tail[i])
            row[('Annotation', 'proportion-with-tail')] = str_na(overall_prop[i])
            for j in xrange(n_samples):
                row[('Tail_count', names[j])] = '%d' % sample_n_tail[i][j]
            for j in xrange(n_samples):
                row[('Tail', names[j])] = str_na(sample_tail[i][j])
            for j in xrange(n_samples):
                row[('Tail_sd', names[j])] = str_na(sample_sd_tail[i][j])
            for quantile in sample_quantile_tail:
                for j in xrange(n_samples):
                    row[('Tail_quantile_%d' % quantile, names[j])] = str_na(
                        sample_quantile_tail[quantile][i][j])
            for j in xrange(len(names)):
                row[('Proportion', names[j])] = str_na(sample_prop[i][j])
            yield row

    io.write_csv(work/'counts.csv', counts_iter(), comments=comments)

    def write_csv_matrix(filename, matrix):
        def emitter():
            for i in xrange(n_features):
                row = collections.OrderedDict()
                row["Feature"] = annotations[i].get_id()
                for j in xrange(n_samples):
                    row[names[j]] = str_na(matrix[i][j])
                yield row
        io.write_csv(filename, emitter())

    write_csv_matrix(work/'read_count.csv', sample_n)
    write_csv_matrix(work/'tail_count.csv', sample_n_tail)
    write_csv_matrix(work/'tail.csv', sample_tail)
    write_csv_matrix(work/'tail_sd.csv', sample_sd_tail)
    for quantile in sample_quantile_tail:
        write_csv_matrix(work/('tail_quantile_%d.csv' % quantile),
                         sample_quantile_tail[quantile])

    #def raw_columns():
    #    for i in xrange(n_samples):
    #        row = collections.OrderedDict()
    #        row['Sample'] = names[i]
    #        for j in xrange(max_length):
    #            row['length-%d' % j] = str(i*max_length+j+1) #For R+, so 1 based
    #        yield row
    #io.write_csv(work/'raw-columns.csv', raw_columns())
    #
    ##Somewhat inefficient
    #def raw():
    #    for i in xrange(n_features):
    #        row = collections.OrderedDict()
    #        row['Feature'] = annotations[i].get_id()
    #        for j in xrange(n_samples):
    #            for k in xrange(max_length):
    #                row['%d %s' % (k,names[j])] = str( counts[i][j][1][k] )
    #        yield row
    #io.write_csv(work/'raw.csv', raw())

    def pooled():
        for i in xrange(n_features):
            row = collections.OrderedDict()
            row['Feature'] = annotations[i].get_id()
            for j in xrange(max_length):
                row[str(j)] = str(sum(counts[i][k][1][j] for k in xrange(n_samples)))
            yield row
    io.write_csv(work/'pooled.csv', pooled())
def main(args):
    genbank_filename, args = grace.get_option_value(args, '--gbk', str, None)
    use_indels, args = grace.get_option_value(args, '--indels', grace.as_bool, True)
    use_reference, args = grace.get_option_value(args, '--reference', grace.as_bool, True)
    give_evidence, args = grace.get_option_value(args, '--evidence', grace.as_bool, True)
    give_consequences, args = grace.get_option_value(args, '--consequences', grace.as_bool, True)
    require_all, args = grace.get_option_value(args, '--require-all', grace.as_bool, False)
    require_bisect, args = grace.get_option_value(args, '--require-bisect', grace.as_bool, False)
    full_output, args = grace.get_option_value(args, '--full', grace.as_bool, False)
    format, args = grace.get_option_value(args, '--as', str, 'table')

    # Secret option!
    limit, args = grace.get_option_value(args, '--limit', int, None)

    grace.expect_no_further_options(args)

    if len(args) < 1:
        sys.stderr.write(USAGE)
        return 1

    working_dirs = []
    split_a = []
    split_b = []
    def default(args):
        working_dirs.extend(args)
    def splitting(args):
        split_a.extend(args)
    def splitting_from(args):
        split_b.extend(args)

    grace.execute(args, {
        'splitting': splitting,
        'from': splitting_from,
    }, default)

    if use_reference:
        names = ['reference']
        evidence_start = 1
    else:
        names = []
        evidence_start = 0
    names.extend(norm_name(item) for item in working_dirs)

    references = io.read_sequences(os.path.join(working_dirs[0], 'reference.fa'))

    annotations = {}
    if genbank_filename:
        from Bio import SeqIO
        for record in SeqIO.parse(io.open_possibly_compressed_file(genbank_filename), 'genbank'):
            sequence = record.seq.tostring()
            features = [item for item in record.features if item.type != 'source']
            features.sort(key=lambda item: item.location.nofuzzy_start)
            annotations[sequence] = features

    iterator = reader(working_dirs, references, use_reference, annotations)

    if not use_indels:
        iterator = itertools.ifilter(has_no_indels, iterator)

    if require_all or require_bisect or format == 'counts':
        iterator = itertools.ifilter(fully_unambiguous, iterator)

    if require_bisect:
        iterator = itertools.ifilter(is_binary_partition, iterator)

    if not require_bisect:
        if full_output:
            iterator = itertools.ifilter(not_boring_insertion, iterator)
        else:
            iterator = itertools.ifilter(is_interesting, iterator)

    if split_a or split_b:
        assert len(names) == len(set(names)), 'Two samples with the same name'
        try:
            split_a = [names.index(norm_name(item)) for item in split_a]
            split_b = [names.index(norm_name(item)) for item in split_b]
        except ValueError:
            raise grace.Error('Sample to be split is not amongst samples given')
        iterator = itertools.ifilter(is_split(split_a, split_b), iterator)

    if limit:
        iterator = itertools.islice(iterator, limit)

    if format == 'table':
        line = 'Reference\tPosition\tChange type'
        line += '\t' + '\t'.join(names)
        if give_evidence:
            line += '\t' + '\t'.join(names[evidence_start:])
        if give_consequences:
            line += '\t' + '\t'.join(names[evidence_start:])
        if annotations:
            line += '\tAnnotations'
        print line
        for calls in iterator:
            line = '%s\t%d\t%s\t%s' % (
                calls.ref_name,
                calls.ref_pos+1,
                change_type(calls),
                '\t'.join(item.consensus for item in calls.calls))
            if give_evidence:
                line += '\t' + '\t'.join(item.evidence for item in calls.calls[evidence_start:])
            if give_consequences:
                line += '\t' + '\t'.join(item.consequences for item in calls.calls[evidence_start:])
            if annotations:
                line += '\t' + describe_features(calls.features)
            print line

    elif format == 'compact':
        for line in transpose_strings(names):
            print line
        print
        for calls in iterator:
            if calls.is_insertion:
                footer = '%12d.5 %s' % (calls.ref_pos, calls.ref_name)
            else:
                footer = '%12d %s' % (calls.ref_pos+1, calls.ref_name)
            t = transpose_strings([item.consensus for item in calls.calls], '-', 1)
            top = t[0] + ' ' + footer
            if give_consequences:
                consequences = []
                for call in calls.calls:
                    if call.consequences:
                        for item in call.consequences.split(', '):
                            item = ' '.join(item.split()[:3])
                            if item not in consequences:
                                consequences.append(item)
                if consequences:
                    top += ' ' + ' / '.join(sorted(consequences))
            top += ' ' + describe_features(calls.features)
            print top
            for line in t[1:]:
                print line

    elif format == 'nexus':
        buckets = [[] for name in names]
        for calls in iterator:
            for i, char in enumerate(partition_string(calls)):
                buckets[i].append(char)

        print '#NEXUS'
        print 'begin taxa;'
        print 'dimensions ntax=%d;' % len(names)
        print 'taxlabels'
        for name in names:
            print name
        print ';'
        print 'end;'

        print 'begin characters;'
        print 'dimensions nchar=%d;' % len(buckets[0])
        print 'format datatype=STANDARD symbols="ACGT-0123456789" missing=N;'
        print 'matrix'
        for name, bucket in itertools.izip(names, buckets):
            print name, ''.join(bucket)
        print ';'
        print 'end;'

    elif format == 'counts':
        for line in transpose_strings(names):
            print line
        print
        counts = {}
        for calls in iterator:
            count_str = partition_string(calls)
            if count_str not in counts:
                counts[count_str] = 1
            else:
                counts[count_str] += 1
        for count_str in sorted(counts, key=lambda x: (counts[x], x), reverse=True):
            print '%s %d' % (transpose_strings(count_str)[0], counts[count_str])

    else:
        raise grace.Error('Unknown output format: ' + format)
def main(args):
    default_transl_table, args = grace.get_option_value(args, '--transl_table', int, 11)
    use_coverage, args = grace.get_flag(args, '--use-coverage')
    coverage_cutoff, args = grace.get_option_value(args, '--coverage-cutoff', float, 0.1)
    tabular, args = grace.get_flag(args, '--tabular')
    noheader, args = grace.get_flag(args, '--noheader')
    verbose, args = grace.get_flag(args, '--verbose')
    bandwidth, args = grace.get_option_value(args, '--band', int, 20)
    grace.expect_no_further_options(args)

    if len(args) != 2:
        print USAGE
        return 1

    genbank_filename = args[0]
    alignment_filename = args[1]
    if os.path.isdir(alignment_filename):
        alignment_filename = os.path.join(alignment_filename, 'alignment.maf')
    working_dir = os.path.split(alignment_filename)[0]

    alignments = load_alignments(alignment_filename)

    summaries = []
    details = []

    if not noheader:
        fields = 'Sequence\tLocus tag\tOld length (aa)\tNew length (aa)\tAmino acid changes\t'
        if use_coverage:
            fields += 'Unambiguous coverage vs expected\t\tAmbiguous coverage vs expected\t\tAmbiguous percent with any hits\t'
        fields += 'Gene\tProduct'
        if tabular:
            fields += '\tChanges of note'
        print fields

    for record in SeqIO.parse(io.open_possibly_compressed_file(genbank_filename), 'genbank'):
        sequence = record.seq.tostring()

        for name, seq1, seq2, alignment in alignments:
            if seq1 == sequence:
                break
        else:
            raise grace.Error('Genbank record %s sequence not identical to any reference sequence' % record.id)

        if use_coverage:
            depth = get_graph(working_dir, name, 'depth')
            ambiguous_depth = get_graph(working_dir, name, 'ambiguous-depth')
            median_depth = numpy.median(depth)
            median_ambiguous_depth = numpy.median(ambiguous_depth)
            ambiguous_factor = float(median_ambiguous_depth) / median_depth
            depth_expect = expected_depth(name, sequence, depth, ambiguous_depth)

        for feature in record.features:
            if feature.type != 'CDS':
                continue

            if 'locus_tag' not in feature.qualifiers:
                locus_tag = '%d..%d' % (feature.location.nofuzzy_start+1, feature.location.nofuzzy_end)
            else:
                locus_tag = feature.qualifiers['locus_tag'][0]

            if 'transl_table' in feature.qualifiers:
                transl_table_no = int(feature.qualifiers['transl_table'][0])
            else:
                assert default_transl_table is not None, 'No /transl_table for CDS, and default transl_table not given'
                transl_table_no = default_transl_table

            transl_table = CodonTable.ambiguous_dna_by_id[transl_table_no]
            start_codons = transl_table.start_codons

            try:
                feature_alignment = alignment_from_feature(sequence, feature)
            except Weird_alignment:
                warn('%s has a location I could not handle, skipping, sorry' % locus_tag)
                continue

            dna = []
            new_dna = []
            shifts = []
            for i in xrange(feature_alignment.end2):
                p1 = feature_alignment.back_project(i, left=False)
                p2 = feature_alignment.back_project(i+1, left=True)
                assert abs(p2-p1) < 2
                dna.append(sequence_slice(sequence, p1, p2))

                p1a = alignment.project(p1, left=False)
                p2a = alignment.project(p2, left=False) #Hmm

                diff = (p2-p1) - (p2a-p1a)
                #if diff:
                #    if diff%3:
                #        frame_shift = True
                #    else:
                #        frame_preserving_shift = True
                new_dna.append(sequence_slice(seq2, p1a, p2a))

                if diff:
                    shifts.append((i, dna[-1], new_dna[-1]))

            dna = ''.join(dna)
            new_dna = ''.join(new_dna)

            # This usually indicates a CDS truncated at the start,
            # in which case it will probably fail some way or other down the line.
            if 'codon_start' in feature.qualifiers:
                codon_start = int(feature.qualifiers['codon_start'][0]) - 1
            else:
                codon_start = 0
            dna = dna[codon_start:]
            new_dna = new_dna[codon_start:]

            if len(dna) % 3 != 0:
                warn(locus_tag + ' length not a multiple of 3')
            #assert len(new_dna) % 3 == 0

            protein = Seq.Seq(dna).translate(table=transl_table_no).tostring()
            # http://en.wikipedia.org/wiki/Start_codon is always translated to M
            protein = 'M' + protein[1:]

            if dna[:3] not in start_codons:
                warn(locus_tag + ' has unknown start codon: ' + dna[:3])

            original_lacks_stop_codon = not protein.endswith('*')
            if original_lacks_stop_codon:
                warn(locus_tag + ' lacks end codon')
            original_stops_before_end = '*' in protein[:-1]
            if original_stops_before_end:
                warn(locus_tag + ' contains stop codon before end')

            if 'translation' in feature.qualifiers:
                expect = feature.qualifiers['translation'][0]
                if protein[:-1] != expect:
                    warn(locus_tag + ' translation given in feature does not match translation from DNA')

            new_protein = Seq.Seq(new_dna).translate(table=transl_table_no).tostring()
            new_protein = 'M' + new_protein[1:]

            # If end codon changed, find new end
            # Don't bother if there are unknown amino acids or
            # the original protein lacks a stop codon
            if 'X' not in new_protein and '*' not in new_protein and not original_lacks_stop_codon:
                #This is very inefficient
                i = feature_alignment.end2
                while True:
                    p1 = feature_alignment.back_project(i, left=False)
                    p2 = feature_alignment.back_project(i+1, left=True)
                    p1a = alignment.project(p1, left=False)
                    p2a = alignment.project(p2, left=False) #Hmm
                    if p1a < 0 or p2a < 0 or p1a > len(seq2) or p2a > len(seq2):
                        break
                    new_dna += sequence_slice(seq2, p1a, p2a)
                    new_protein = Seq.Seq(new_dna).translate(table=transl_table_no).tostring()
                    new_protein = 'M' + new_protein[1:]
                    if 'X' in new_protein or '*' in new_protein:
                        break
                    i += 1

            # Is the protein shorter?
            # Don't bother checking if the original protein has extra stop codons
            if '*' in new_protein and not original_stops_before_end:
                new_protein = new_protein[:new_protein.index('*')+1]

            # If indels occurred, do an alignment
            # Don't bother otherwise
            if shifts:
                # Penalize gaps with cost 2 (vs 1 for mismatch)
                # If lengths don't match, pad with spaces (won't match longer seq),
                # aligner prefers mismatch to gaps
                #result = pairwise2.align.globalxs(protein     + ' '*max(0,len(new_protein)-len(protein)),
                #                                  new_protein + ' '*max(0,len(protein)-len(new_protein)),
                #                                  -2.001,-2.000)[0]
                # 2.001 : very slightly prefer contiguous gaps. Also much faster!
                result = band_limited_align(protein + ' '*max(0, len(new_protein)-len(protein)),
                                            new_protein + ' '*max(0, len(protein)-len(new_protein)),
                                            bandwidth)
                protein_ali = result[0]
                new_protein_ali = result[1]
            else:
                protein_ali = protein
                new_protein_ali = new_protein

            diffs = []
            j = 0
            k = 0
            for i in xrange(min(len(new_protein_ali), len(protein_ali))):
                if (protein_ali[i] != ' ' and new_protein_ali[i] != ' ' and
                        (protein_ali[i] == '-' or new_protein_ali[i] == '-' or
                         not bio.might_be_same_amino(protein_ali[i], new_protein_ali[i]))):
                    diffs.append((i, j, k))
                if protein_ali[i] != '-':
                    j += 1
                if new_protein_ali[i] != '-':
                    k += 1

            diff_start = (not bio.might_be_same_base(new_dna[0], dna[0]) or
                          not bio.might_be_same_base(new_dna[1], dna[1]) or
                          not bio.might_be_same_base(new_dna[2], dna[2]))

            interesting_coverage = False
            if use_coverage:
                cds_depth = depth[feature_alignment.start1:feature_alignment.end1] #/ median_depth
                if not feature_alignment.forward1:
                    cds_depth = cds_depth[::-1]
                cds_ambiguous_depth = ambiguous_depth[feature_alignment.start1:feature_alignment.end1] #/ median_ambiguous_depth
                if not feature_alignment.forward1:
                    cds_ambiguous_depth = cds_ambiguous_depth[::-1]

                cds_depth_expect = depth_expect[feature_alignment.start1:feature_alignment.end1]
                if not feature_alignment.forward1:
                    cds_depth_expect = cds_depth_expect[::-1]

                #cds_average_depth_ratio = numpy.average(depth[feature_alignment.start1:feature_alignment.end1]) / median_depth
                #cds_average_ambiguous_depth_ratio = numpy.average(ambiguous_depth[feature_alignment.start1:feature_alignment.end1]) / median_ambiguous_depth
                #line += '%.1f\t' % cds_average_depth_ratio
                #line += '%.1f\t' % cds_average_ambiguous_depth_ratio
                #line += '%.1f..%.1f\t' % (numpy.minimum.reduce(cds_depth)/median_depth, numpy.maximum.reduce(cds_depth)/median_depth)
                #line += '%.1f+/-%.1f\t' % (numpy.average(cds_depth)/median_depth, numpy.var(cds_depth)**0.5/median_depth)
                #line += '%.1f..%.1f\t' % (numpy.minimum.reduce(cds_ambiguous_depth)/median_ambiguous_depth, numpy.maximum.reduce(cds_ambiguous_depth)/median_ambiguous_depth)

                avg_expect = numpy.average(cds_depth_expect)
                if avg_expect > 0.0:
                    cds_avg_depth = numpy.average(cds_depth) / avg_expect
                    cds_avg_ambiguous_depth = numpy.average(cds_ambiguous_depth) / avg_expect / ambiguous_factor

                strange = ((cds_depth >= cds_depth_expect*1.5) |
                           (cds_ambiguous_depth <= cds_depth_expect*(0.5*ambiguous_factor)))
                interesting_coverage = numpy.average(strange) >= coverage_cutoff

            if interesting_coverage or diffs or diff_start or shifts or len(new_protein) != len(protein):
                line = (name + '\t' + locus_tag + '\t' +
                        '%d\t' % (len(protein)-1) +
                        '%d\t' % (len(new_protein)-1) +
                        '%d\t' % len(diffs))
                if use_coverage:
                    if avg_expect <= 0.0:
                        line += '\t\t\t'
                    else:
                        line += '%.1f\t' % cds_avg_depth + graphlet(cds_depth, cds_depth_expect) + '\t'
                        line += '%.1f\t' % cds_avg_ambiguous_depth + graphlet(cds_ambiguous_depth, cds_depth_expect*ambiguous_factor) + '\t'
                        line += '%.1f%%\t' % (numpy.average(cds_ambiguous_depth > 0.0)*100.0)
                line += ('%s\t' % feature.qualifiers.get('gene', [''])[0] +
                         '%s' % feature.qualifiers.get('product', [''])[0])

                notes = []
                if use_coverage and 'X' in new_protein:
                    xs = new_protein.count('X')
                    if xs == len(new_protein)-1: #First is M, so len-1
                        notes.append('\ No consensus')
                    else:
                        notes.append('\ No consensus for %d aa' % new_protein.count('X'))
                if len(new_protein) < len(protein):
                    notes.append('\ Shorter by %d aa' % (len(protein)-len(new_protein)))
                if len(new_protein) > len(protein):
                    notes.append('\ Longer by %d aa' % (len(new_protein)-len(protein)))
                if
diff_start: notes.append('\ Start changed: %s -> %s' % (dna[:3], new_dna[:3])) if new_dna[:3] not in start_codons: notes.append(' No longer a start codon!') if shifts: notes.append('\ Indels:') for pos, old, new in shifts: notes.append(' base %5d / codon %5d %s -> %s' % (pos+1,(pos//3)+1,old,new or '-')) if diffs: if verbose: notes.append('\ Amino acid changes:') for i, j, k in diffs: notes.append(' codon %5d %s->%s (%s->%s)' % ( j+1, protein_ali[i], new_protein_ali[i], dna[j*3:j*3+3] if protein_ali[i] != '-' else '-', new_dna[k*3:k*3+3] if new_protein_ali[i] != '-' else '-' )) #if len(new_protein) > len(protein): # print 'New protein is longer:', new_protein[len(protein):] #if len(new_protein) < len(protein): # print 'New protein is shorter:', protein[len(new_protein):] #print protein #print new_protein if tabular: print line + '\t' + ' '.join([ ' '.join(note.strip().split()) for note in notes ]) else: print line for note in notes: print '\t' + note return 0
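# A note on the space-padding above: band_limited_align only explores cells
# within `bandwidth` of the main diagonal, so both proteins are first padded
# to equal length with spaces, which match nothing and so cannot create
# spurious alignment. A minimal, cost-only sketch of the banded dynamic
# programming idea, under assumed scoring (match 0, mismatch 1, gap 2) --
# the real band_limited_align also returns the two gapped strings, which
# this sketch omits:
def banded_distance_sketch(a, b, bandwidth):
    INF = float('inf')
    costs = { (0,0): 0.0 }
    for i in xrange(len(a)+1):
        for j in xrange(max(0, i-bandwidth), min(len(b), i+bandwidth)+1):
            if i == 0 and j == 0: continue
            best = INF
            if i > 0 and j > 0:
                best = min(best, costs.get((i-1,j-1), INF) + (0.0 if a[i-1] == b[j-1] else 1.0))
            if i > 0:
                best = min(best, costs.get((i-1,j), INF) + 2.0)  # gap in b
            if j > 0:
                best = min(best, costs.get((i,j-1), INF) + 2.0)  # gap in a
            costs[(i,j)] = best
    return costs[(len(a), len(b))]

# e.g. banded_distance_sketch('MKV*', 'MRV*', 20) == 1.0  (one substitution)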
def run(self):
    if self.dirichlet:
        assert self.ploidy == 1, 'Dirichlet mode is not available for ploidy > 1'

    reader_f = io.open_possibly_compressed_file(self.vcf)
    reader = vcf.Reader(reader_f)

    writer = vcf.Writer(open(self.prefix + '.vcf','wb'), reader)

    #print dir(reader)
    #print reader.formats
    #print
    #print reader.infos
    #print

    n = 0
    n_kept = 0

    for record in reader:
        n += 1
        variants = get_variants(record)
        any = False  # note: shadows the builtin any(); used here as a plain flag
        for sample in record.samples:
            self._modify_sample(variants, sample)
            any = any or (sample.data.GT != self._blank_gt() and
                          sample.data.GT != self._reference_gt())

            #print call.sample
            #for key in call.data._fields:
            #    print key, getattr(call.data,key), reader.formats[key].desc
            #
            #counts = [ call.data.RO ]
            #if isinstance(call.data.QA,list):
            #    counts.extend(call.data.QA)
            #else:
            #    counts.append(call.data.QA)
            #print variants, counts
            #
            #
            #if self.min_gq is not None and call.data.GQ < self.min_gq:
            #    call.data = call.data._replace(GT='.')
            #    print call.data
            #else:
            #    any = True

        if self.dirichlet:
            record.QUAL = min(MAX_QUALITY, sum(sample.data.GQ for sample in record.samples))

        if any:
            writer.write_record(record)
            n_kept += 1

    writer.close()
    reader_f.close()

    self.log.datum('variants','input', n)
    self.log.datum('variants','kept', n_kept)

    index_vcf(self.prefix+'.vcf')
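# The reader/writer round trip above, in isolation, for orientation -- a
# minimal sketch assuming PyVCF's vcf.Reader(fileobj) and
# vcf.Writer(fileobj, template_reader) API as used in run(); the min_qual
# threshold is a hypothetical stand-in for the per-sample genotype logic:
import vcf

def filter_vcf_sketch(in_path, out_path, min_qual):
    reader_f = open(in_path, 'rb')
    reader = vcf.Reader(reader_f)
    writer = vcf.Writer(open(out_path, 'wb'), reader)  # header copied from reader
    for record in reader:
        if record.QUAL is not None and record.QUAL >= min_qual:
            writer.write_record(record)
    writer.close()
    reader_f.close()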
def nway_main(gbk_filename, use_indels, use_reference, give_evidence, give_consequences,
              require_all, require_bisect, full_output, format,
              working_dirs, split_a, split_b, f=sys.stdout):
    assert working_dirs, 'Need at least one working directory.'
    workspaces = [ working_directory.Working(dirname, must_exist=True) for dirname in working_dirs ]
    reference = workspaces[0].get_reference()
    #if not annotation_filename:
    #    annotation_filename = reference.annotations_filename() #May still be None

    if use_reference:
        names = ['reference']
        evidence_start = 1
    else:
        names = [ ]
        evidence_start = 0
    names.extend( norm_name(item) for item in working_dirs )

    references = io.read_sequences(reference.reference_fasta_filename())

    annotations = { }
    if gbk_filename:
        from Bio import SeqIO
        for record in SeqIO.parse(io.open_possibly_compressed_file(gbk_filename),'genbank'):
            sequence = record.seq.tostring()
            features = [ item for item in record.features if item.type != 'source' ]
            features.sort(key=lambda item: item.location.nofuzzy_start)
            annotations[sequence] = features

    iterator = reader(working_dirs, references, use_reference, annotations)

    if not use_indels:
        iterator = itertools.ifilter(has_no_indels, iterator)

    if require_all or require_bisect or format == 'counts':
        iterator = itertools.ifilter(fully_unambiguous, iterator)

    if require_bisect:
        iterator = itertools.ifilter(is_binary_partition, iterator)

    if not require_bisect:
        if full_output:
            iterator = itertools.ifilter(not_boring_insertion, iterator)
        else:
            iterator = itertools.ifilter(is_interesting, iterator)

    if split_a or split_b:
        assert len(names) == len(set(names)), 'Two samples with the same name'
        try:
            split_a = [ names.index(norm_name(item)) for item in split_a ]
            split_b = [ names.index(norm_name(item)) for item in split_b ]
        except ValueError:
            raise grace.Error('Sample to be split is not amongst samples given')
        iterator = itertools.ifilter(is_split(split_a, split_b), iterator)

    #if limit:
    #    iterator = itertools.islice(iterator, limit)

    if format == 'table':
        line = 'Reference\tPosition\tChange type'
        line += '\t' + '\t'.join(names)
        if give_evidence:
            line += '\t' + '\t'.join(names[evidence_start:])
        if give_consequences:
            line += '\t' + '\t'.join(names[evidence_start:])
        if annotations:
            line += '\tAnnotations'
        print >> f, line
        for calls in iterator:
            line = '%s\t%d\t%s\t%s' % (
                calls.ref_name,
                calls.ref_pos+1,
                change_type(calls),
                '\t'.join(item.consensus for item in calls.calls))
            if give_evidence:
                line += '\t' + '\t'.join(item.evidence for item in calls.calls[evidence_start:])
            if give_consequences:
                line += '\t' + '\t'.join(item.consequences for item in calls.calls[evidence_start:])
            if annotations:
                line += '\t' + describe_features(calls.features)
            print >> f, line

    elif format == 'compact':
        for line in transpose_strings(names):
            print >> f, line
        print >> f
        for calls in iterator:
            if calls.is_insertion:
                footer = '%12d.5 %s' % (calls.ref_pos, calls.ref_name)
            else:
                footer = '%12d %s' % (calls.ref_pos+1, calls.ref_name)

            t = transpose_strings([ item.consensus for item in calls.calls ], '-', 1)
            top = t[0] + ' ' + footer
            if give_consequences:
                consequences = [ ]
                for call in calls.calls:
                    if call.consequences:
                        for item in call.consequences.split(', '):
                            item = ' '.join(item.split()[:3])
                            if item not in consequences:
                                consequences.append(item)
                if consequences:
                    top += ' ' + ' / '.join(sorted(consequences))
            top += ' ' + describe_features(calls.features)
            print >> f, top
            for line in t[1:]:
                print >> f, line

    elif format == 'nexus':
        buckets = [ [ ] for name in names ]
        for calls in iterator:
            for i, char in enumerate(partition_string(calls)):
                buckets[i].append(char)

        print >> f, '#NEXUS'
        print >> f, 'begin taxa;'
        print >> f, 'dimensions ntax=%d;' % len(names)
        print >> f, 'taxlabels'
        for name in names:
            print >> f, name
        print >> f, ';'
        print >> f, 'end;'

        print >> f, 'begin characters;'
        print >> f, 'dimensions nchar=%d;' % len(buckets[0])
        print >> f, 'format datatype=STANDARD symbols="ACGT-0123456789" missing=N;'
        print >> f, 'matrix'
        for name, bucket in itertools.izip(names, buckets):
            print >> f, name, ''.join(bucket)
        print >> f, ';'
        print >> f, 'end;'

    elif format == 'counts':
        for line in transpose_strings(names):
            print >> f, line
        print >> f

        counts = { }
        for calls in iterator:
            count_str = partition_string(calls)
            if count_str not in counts:
                counts[count_str] = 1
            else:
                counts[count_str] += 1

        for count_str in sorted(counts, key=lambda x: (counts[x], x), reverse=True):
            print >> f, '%s %d' % (transpose_strings(count_str)[0], counts[count_str])

    else:
        raise grace.Error('Unknown output format: ' + format)
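# The 'compact' and 'counts' formats print sample names vertically via
# transpose_strings, which is defined elsewhere in nesoni. A minimal sketch
# of the idea (pad names to equal length, then emit one line per column) --
# the real helper's extra arguments (seen above as '-', 1) presumably
# control the pad character and spacing, which this sketch ignores:
def transpose_strings_sketch(strings, pad=' '):
    height = max(len(item) for item in strings)
    padded = [ item + pad*(height-len(item)) for item in strings ]
    return [ ''.join(item[i] for item in padded) for i in xrange(height) ]

# e.g. transpose_strings_sketch(['abc','de']) == ['ad', 'be', 'c ']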
def run(self):
    workspace = self.get_workspace()

    header = ["##gff-version 3\n"]
    lengths = {}
    with io.open_possibly_compressed_file(self.features) as f:
        f.next()
        for line in f:
            if not line.startswith("#"):
                break
            if line.startswith("##gff-version"):
                continue
            header.append(line)
            parts = line.strip().split()
            if parts[0] == "##sequence-region":
                lengths[parts[1]] = int(parts[3])
    header = "".join(header)

    items = list(annotation.read_gff(self.features, "/"))
    annotation.link_up_annotations(items)
    for item in items:
        assert len(item.parents) < 2
        if "ID" in item.attr:
            item.attr["ID"] = item.attr["ID"].split(":")[1]
        if "Parent" in item.attr:
            item.attr["Parent"] = item.attr["Parent"].split(":")[1]
        if item.parents:
            item.parent = item.parents[0]

    def well_supported(item):
        if self.support is None:
            return True
        level = item.attr.get("transcript_support_level", "NA").split()[0]
        if not level.isdigit():
            return False
        return int(level) <= self.support

    exons = [ item for item in items if item.type == "exon" and well_supported(item.parent) ]
    exon_index = span_index.index_annotations(exons)

    utrs = []
    extended_utrs = []
    utr_parts = []
    exons_kept = []
    cds_kept = []
    transcripts_kept = []
    for item in items:
        this_exons = [ item2 for item2 in item.children if item2.type == "exon" ]
        if this_exons and well_supported(item):
            transcripts_kept.append(item)
            exons_kept.extend(this_exons)
            cds_kept.extend([ item2 for item2 in item.children if item2.type == "CDS" ])

        if self.gene_level:
            utr_bits = [
                item3
                for item2 in item.children if well_supported(item2)
                for item3 in item2.children if item3.type == self.what ]
        else:
            if not well_supported(item):
                continue
            utr_bits = [ item2 for item2 in item.children if item2.type == self.what ]

        if not utr_bits:
            continue

        utr = utr_bits[0].copy()
        for item2 in utr_bits[1:]:
            utr = utr.span_with(item2)

        gene = item if self.gene_level else item.parent
        utr.attr = dict(
            ID=item.get_id(),
            Name=item.attr["Name"],
            gene_id=gene.get_id(),
            gene=gene.attr["Name"],
            description=gene.attr.get("description", ""),
            biotype=item.attr["biotype"])

        max_extension = 10000
        if item.strand < 0:
            max_extension = min(max_extension, utr.start)
        else:
            max_extension = min(max_extension, lengths[utr.seqid] - utr.end)
        assert max_extension >= 0, utr

        end = utr.three_prime()
        for hit in exon_index.get(end.shifted(0, max_extension), same_strand=True):
            #if hit.parent.get_id() == item.get_id():
            #    continue
            rel = hit.relative_to(end).start
            if rel >= 0:
                max_extension = min(max_extension, rel)

        extended_utr = utr.shifted(0, max_extension)
        extended_utr.start = max(extended_utr.start, 0)
        utr.attr["max_extension"] = str(max_extension)

        utrs.append(utr)
        extended_utrs.append(extended_utr)
        for item2 in utr_bits:
            part = item2.copy()
            part.attr = dict(Parent=item.get_id())
            part.type = "part"
            utr_parts.append(part)

    write_gff3(workspace / "utr.gff", utrs, header)
    write_gff3(workspace / "utr_extended.gff", extended_utrs, header)
    write_gff3(workspace / "utr_part.gff", utr_parts, header)
    write_gff3(workspace / "transcript.gff", transcripts_kept, header)
    write_gff3(workspace / "exon.gff", exons_kept, header)
    write_gff3(workspace / "cds.gff", cds_kept, header)
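# The extension rule above, stated on its own: a 3' UTR may be extended
# downstream by up to 10,000 bases, but never past the end of its chromosome
# and never into the next same-strand exon. A minimal sketch over plain
# coordinates (hypothetical names; downstream_starts holds each same-strand
# exon start as a distance from the UTR's 3' end):
def allowed_extension_sketch(utr_end, chrom_length, downstream_starts, cap=10000):
    max_extension = min(cap, chrom_length - utr_end)  # stay on the chromosome
    for rel in downstream_starts:
        if rel >= 0:  # only exons at or beyond the UTR end can block extension
            max_extension = min(max_extension, rel)
    return max_extension

# e.g. allowed_extension_sketch(500, 2000, [300, 1200]) == 300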
def run(self):
    references = { }
    for filename in self.reference_filenames:
        for name, seq in io.read_sequences(filename):
            references[name] = seq

    tail_lengths = { }
    adaptor_bases = { }
    for filename in self.clips:
        with io.open_possibly_compressed_file(filename) as f:
            for line in f:
                if line.startswith('#'):
                    continue
                parts = line.rstrip('\n').split('\t')
                name = parts[0].split()[0]
                tail_lengths[name] = int(parts[3])-int(parts[2])
                adaptor_bases[name] = int(parts[6])

    in_file = self.begin_input()
    out_file = self.begin_output()

    for line in in_file:
        line = line.rstrip()
        if line.startswith('@'):
            print >> out_file, line
            continue

        al = Alignment(line)

        if al.flag & FLAG_UNMAPPED:
            continue

        ref = references[al.rname]

        reverse = al.flag & FLAG_REVERSE
        if reverse:
            read_bases = rev_comp(al.seq)
            read_qual = al.qual[::-1]
            cigar = cigar_decode(al.cigar)[::-1]
        else:
            read_bases = al.seq
            read_qual = al.qual
            cigar = cigar_decode(al.cigar)

        n_tail = tail_lengths[al.qname]

        if reverse:
            if al.pos-1-n_tail < 0:
                continue #TODO: handle tail extending beyond end of reference
            bases_ref = rev_comp(ref[al.pos-1-n_tail:al.pos-1])
        else:
            if al.pos-1+al.length+n_tail > len(ref):
                continue #TODO: handle tail extending beyond end of reference
            bases_ref = ref[al.pos-1+al.length:al.pos-1+al.length+n_tail]

        extension = 0
        while extension < n_tail and bases_ref[extension] == 'A':
            extension += 1

        if n_tail-extension > 0:
            al.extra.append('AN:i:%d' % (n_tail-extension))
        if adaptor_bases[al.qname]:
            al.extra.append('AD:i:%d' % adaptor_bases[al.qname])
        if n_tail-extension >= self.tail:
            #if reverse:
            #    tail_refpos = al.pos-extension
            #else:
            #    tail_refpos = al.pos+al.length+extension-1
            #al.extra.append('AA:i:%d'%tail_refpos)
            al.extra.append('AA:i:1')

        cigar += 'M' * extension
        read_bases += 'A' * extension
        read_qual += chr(33+20) * extension #Arbitrarily give quality 20 (phred+33 encoding)
        al.length += extension

        if reverse:
            al.pos -= extension
            al.seq = rev_comp(read_bases)
            al.qual = read_qual[::-1]
            al.cigar = cigar_encode(cigar[::-1])
        else:
            al.seq = read_bases
            al.qual = read_qual
            al.cigar = cigar_encode(cigar)

        print >> out_file, al

    self.end_output(out_file)
    self.end_input(in_file)
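# The tail call above hinges on one loop: of the n_tail bases clipped off the
# read, count how many are explained by genomic 'A's immediately after the
# alignment; those are re-absorbed into the alignment, and only the remainder
# is called poly(A). That loop in isolation (forward strand only,
# hypothetical names; ref is the plain reference sequence):
def genomic_a_extension_sketch(ref, aligned_end, n_tail):
    # aligned_end is the 0-based position just past the aligned bases.
    bases_ref = ref[aligned_end : aligned_end + n_tail]
    extension = 0
    while extension < len(bases_ref) and bases_ref[extension] == 'A':
        extension += 1
    return extension  # the remaining tail length is n_tail - extension

# e.g. genomic_a_extension_sketch('CCAAAG', 2, 4) == 3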
def run(self):
    work = self.get_workspace()

    data = [ ]
    names = [ ]
    sample_tags = [ ]
    for item in self.pickles:
        f = io.open_possibly_compressed_file(item)
        name, tags, datum = pickle.load(f)
        f.close()
        data.append(datum)
        names.append(name)
        sample_tags.append(tags)

    annotations = data[0]

    all_lengths = [
        item[2] #tail_length
        for sample in data
        for feature in sample
        #for rel_start,rel_end,tail_length in feature.hits
        for item in feature.hits
        ]
    if all_lengths:
        max_length = max(all_lengths)+1
    else:
        max_length = 1
    del all_lengths

    for i, sample in enumerate(data):
        n_alignments = 0
        n_duplicates = 0
        n_good = 0
        for feature in sample:
            feature.tail_counts = [ 0.0 ] * max_length

            buckets = collections.defaultdict(list)
            for item in feature.hits:
                rel_start,rel_end,tail_length = item[:3]
                buckets[ (rel_start,rel_end) ].append(tail_length)

            for item in buckets.values():
                n_alignments += len(item)
                n_good += 1
                if self.saturation < 1 or len(item) <= self.saturation:
                    weight = 1.0
                else:
                    weight = float(self.saturation) / len(item)
                    n_duplicates += len(item)
                for item2 in item:
                    feature.tail_counts[item2] += weight

        self.log.datum(names[i], 'Alignments to features', n_alignments)
        if self.saturation >= 1:
            self.log.datum(names[i], 'Proportion of alignments with duplicate start and end position',
                           float(n_duplicates)/max(1,n_alignments))
            self.log.datum(names[i], 'Alignments to features after deduplication', n_good)

    counts = [ ]  # [feature][sample][taillength]

    for item in data:
        assert len(item) == len(data[0])
    for row in itertools.izip(*data):
        this_counts = [ item.tail_counts for item in row ]
        counts.append(this_counts)

    sample_n = [ ]          # [feature][sample]  Total count
    sample_n_tail = [ ]     # [feature][sample]  Polya count
    sample_prop = [ ]       # [feature][sample]  Proportion of reads with tail
    sample_tail = [ ]       # [feature][sample]  Mean tail length in each sample
    sample_total_tail = [ ]

    overall_n = [ ]
    overall_prop = [ ]      # [feature]  Overall proportion with tail
    overall_tail = [ ]      # [feature]  Overall mean tail length
    overall_n_tail = [ ]    # [feature]  Overall polya count
    overall_total_tail = [ ]

    for row in counts:
        this_n = [ ]
        this_n_tail = [ ]
        this_prop = [ ]
        this_tail = [ ]
        this_total_tail = [ ]
        for item in row:
            this_this_n = sum(item)
            this_n.append( this_this_n )

            this_this_n_tail = sum(item[self.tail:])
            this_n_tail.append( this_this_n_tail )

            this_this_total_tail = sum( item[i]*i for i in xrange(self.tail,max_length) )
            this_total_tail.append( this_this_total_tail )

            if this_this_n < 1:
                this_prop.append(None)
            else:
                this_prop.append(float(this_this_n_tail)/this_this_n)

            if this_this_n_tail < 1:
                this_tail.append(None)
            else:
                this_tail.append(this_this_total_tail/this_this_n_tail)

        sample_n.append(this_n)
        sample_n_tail.append(this_n_tail)
        sample_prop.append(this_prop)
        sample_tail.append(this_tail)
        sample_total_tail.append(this_total_tail)

        overall_n.append(sum(this_n))
        overall_n_tail.append(sum(this_n_tail))
        overall_total_tail.append(sum(this_total_tail))
        if sum(this_n) < 1:
            overall_prop.append(None)
        else:
            overall_prop.append(float(sum(this_n_tail))/sum(this_n))
        if sum(this_n_tail) < 1:
            overall_tail.append(None)
        else:
            overall_tail.append(float(sum(this_total_tail))/sum(this_n_tail))

    for i, name in enumerate(names):
        this_total = sum( item[i] for item in sample_total_tail )
        this_n = sum( item[i] for item in sample_n_tail )
        if this_n:
            self.log.datum(name, 'Average poly-A tail', float(this_total)/this_n)

    for i, name in enumerate(names):
        this_total = sum( item[i] for item in sample_n_tail )
        this_n = sum( item[i] for item in sample_n )
        if this_n:
            self.log.datum(name, 'Average proportion of reads with tail', float(this_total)/this_n)

    #max_length = max(max(len(item) for item in row) for row in counts)
    #
    #for row in counts:
    #    for item in row:
    #        while len(item) < max_length:
    #            item.append(0)

    with open(work/'features-with-data.gff','wb') as f:
        annotation.write_gff3_header(f)
        for i, item in enumerate(annotations):
            item.attr['reads'] = str(overall_n[i])
            item.attr['reads_with_tail'] = str(overall_n_tail[i])
            item.attr['mean_tail'] = '%.1f'%overall_tail[i] if overall_tail[i] else 'NA'
            item.attr['proportion_with_tail'] = '%.2f'%overall_prop[i] if overall_prop[i] else 'NA'

            if overall_tail[i] is None:
                item.attr['color'] = '#444444'
            else:
                a = (overall_tail[i]-self.tail)/max(1,max_length-self.tail)
                item.attr['color'] = '#%02x%02x%02x' % (int(a*255),int((1-abs(a*2-1))*255),255-int(a*255))
            #item.attr['color'] = ...

            print >> f, item.as_gff()

    comments = [ '#Counts' ] + [
        '#sampleTags='+','.join(tags)
        for tags in sample_tags
        ] + [
        '"Tail_count" group is number of reads with tail',
        '"Tail" group is mean tail per sample',
        '"Proportion" group is proportion of reads with tail',
        ]

    def counts_iter():
        for i in xrange(len(counts)):
            row = collections.OrderedDict()
            row['Feature'] = annotations[i].get_id()
            for j in xrange(len(names)):
                row[('Count',names[j])] = '%d' % sample_n[i][j]

            row[('Annotation','Length')] = annotations[i].end - annotations[i].start
            row[('Annotation','gene')] = annotations[i].attr.get('Name','')
            row[('Annotation','product')] = annotations[i].attr.get('Product','')
            #row[('Annotation','Strand')] = str(annotations[i].strand)
            row[('Annotation','reads')] = str(overall_n[i])
            row[('Annotation','reads-with-tail')] = str(overall_n_tail[i])
            row[('Annotation','mean-tail')] = str(overall_tail[i]) if overall_tail[i] is not None else 'NA'
            row[('Annotation','proportion-with-tail')] = str(overall_prop[i]) if overall_prop[i] is not None else 'NA'

            for j in xrange(len(names)):
                row[('Tail_count',names[j])] = '%d' % sample_n_tail[i][j]

            for j in xrange(len(names)):
                row[('Tail',names[j])] = str(sample_tail[i][j]) if sample_tail[i][j] is not None else 'NA'

            for j in xrange(len(names)):
                row[('Proportion',names[j])] = str(sample_prop[i][j]) if sample_prop[i][j] is not None else 'NA'

            yield row
    io.write_csv(work/'counts.csv', counts_iter(), comments=comments)

    def raw_columns():
        for i in xrange(len(names)):
            row = collections.OrderedDict()
            row['Sample'] = names[i]
            for j in xrange(max_length):
                row['length-%d' % j] = str(i*max_length+j+1) #For R+, so 1 based
            yield row
    io.write_csv(work/'raw-columns.csv', raw_columns())

    #Somewhat inefficient
    def raw():
        for i in xrange(len(counts)):
            row = collections.OrderedDict()
            row['Feature'] = annotations[i].get_id()
            for j in xrange(len(names)):
                for k in xrange(max_length):
                    row['%d %s' % (k,names[j])] = str( counts[i][j][k] )
            yield row
    io.write_csv(work/'raw.csv', raw())

    def pooled():
        for i in xrange(len(counts)):
            row = collections.OrderedDict()
            row['Feature'] = annotations[i].get_id()
            for j in xrange(max_length):
                row[str(j)] = str( sum( counts[i][k][j] for k in xrange(len(names)) ) )
            yield row
    io.write_csv(work/'pooled.csv', pooled())
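# The GFF color attribute above maps mean tail length onto a blue -> green ->
# red ramp with '#%02x%02x%02x'. The same mapping in isolation (hypothetical
# name; clamping added here so out-of-range inputs cannot produce invalid
# colors, which the original relies on its inputs to avoid):
def tail_color_sketch(mean_tail, min_tail, max_length):
    a = float(mean_tail - min_tail) / max(1, max_length - min_tail)
    a = min(1.0, max(0.0, a))
    red = int(a*255)
    green = int((1 - abs(a*2 - 1)) * 255)  # peaks at the midpoint of the ramp
    blue = 255 - int(a*255)
    return '#%02x%02x%02x' % (red, green, blue)

# e.g. tail_color_sketch(0, 0, 100) == '#0000ff' (shortest tails plot blue)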