def main(args): default_transl_table, args = grace.get_option_value(args, '--transl_table', int, 11) use_coverage, args = grace.get_flag(args, '--use-coverage') coverage_cutoff, args = grace.get_option_value(args, '--coverage-cutoff', float, 0.1) tabular, args = grace.get_flag(args, '--tabular') noheader, args = grace.get_flag(args, '--noheader') verbose, args = grace.get_flag(args, '--verbose') bandwidth, args = grace.get_option_value(args, '--band', int, 20) grace.expect_no_further_options(args) if len(args) != 2: print USAGE return 1 genbank_filename = args[0] alignment_filename = args[1] if os.path.isdir(alignment_filename): alignment_filename = os.path.join(alignment_filename, 'alignment.maf') working_dir = os.path.split(alignment_filename)[0] alignments = load_alignments(alignment_filename) summaries = [ ] details = [ ] if not noheader: fields = 'Sequence\tLocus tag\tOld length (aa)\tNew length (aa)\tAmino acid changes\t' if use_coverage: fields += 'Unambiguous coverage vs expected\t\tAmbiguous coverage vs expected\t\tAmbiguous percent with any hits\t' fields += 'Gene\tProduct' if tabular: fields += '\tChanges of note' print fields for record in SeqIO.parse(io.open_possibly_compressed_file(genbank_filename),'genbank'): sequence = record.seq.tostring() for name, seq1, seq2, alignment in alignments: if seq1 == sequence: break else: raise grace.Error('Genbank record %s sequence not identical to any reference sequence' % record.id) if use_coverage: depth = get_graph(working_dir, name, 'depth') ambiguous_depth = get_graph(working_dir, name, 'ambiguous-depth') median_depth = numpy.median(depth) median_ambiguous_depth = numpy.median(ambiguous_depth) ambiguous_factor = float(median_ambiguous_depth) / median_depth depth_expect = expected_depth(name, sequence, depth, ambiguous_depth) for feature in record.features: if feature.type != 'CDS': continue if 'locus_tag' not in feature.qualifiers: locus_tag = '%d..%d' % (feature.location.nofuzzy_start+1,feature.location.nofuzzy_end) else: locus_tag = feature.qualifiers['locus_tag'][0] if 'transl_table' in feature.qualifiers: transl_table_no = int(feature.qualifiers['transl_table'][0]) else: assert default_transl_table is not None, 'No /transl_table for CDS, and default transl_table not given' transl_table_no = default_transl_table transl_table = CodonTable.ambiguous_dna_by_id[transl_table_no] start_codons = transl_table.start_codons try: feature_alignment = alignment_from_feature(sequence, feature) except Weird_alignment: warn('%s has a location I could not handle, skipping, sorry' % locus_tag) continue dna = [ ] new_dna = [ ] shifts = [ ] for i in xrange(feature_alignment.end2): p1 = feature_alignment.back_project(i, left=False) p2 = feature_alignment.back_project(i+1, left=True) assert abs(p2-p1) < 2 dna.append( sequence_slice(sequence,p1,p2) ) p1a = alignment.project(p1, left=False) p2a = alignment.project(p2, left=False) #Hmm diff = (p2-p1)-(p2a-p1a) #if diff: # if diff%3: # frame_shift = True # else: # frame_preserving_shift = True new_dna.append( sequence_slice(seq2,p1a,p2a) ) if diff: shifts.append((i,dna[-1],new_dna[-1])) dna = ''.join(dna) new_dna = ''.join(new_dna) # This usually indicated a CDS truncated at the start? # in which case, will probably fail some way or other down the line. if 'codon_start' in feature.qualifiers: codon_start = int(feature.qualifiers['codon_start'][0]) - 1 else: codon_start = 0 dna = dna[codon_start:] new_dna = new_dna[codon_start:] if len(dna) % 3 != 0: warn(locus_tag + ' length not a multiple of 3') #assert len(new_dna) % 3 == 0 protein = Seq.Seq(dna).translate(table=transl_table_no).tostring() # http://en.wikipedia.org/wiki/Start_codon is always translated to M protein = 'M' + protein[1:] if dna[:3] not in start_codons: warn(locus_tag + ' has unknown start codon: ' + dna[:3]) original_lacks_stop_codon = not protein.endswith('*') if original_lacks_stop_codon: warn(locus_tag + ' lacks end codon') original_stops_before_end = '*' in protein[:-1] if original_stops_before_end: warn(locus_tag + ' contains stop codon before end') if 'translation' in feature.qualifiers: expect = feature.qualifiers['translation'][0] if protein[:-1] != expect: warn(locus_tag + ' translation given in feature does not match translation from DNA') new_protein = Seq.Seq(new_dna).translate(table=transl_table_no).tostring() new_protein = 'M' + new_protein[1:] # If end codon changed, find new end # Don't bother if there are unknown amino acids or # the original protein lacks a stop codon if 'X' not in new_protein and '*' not in new_protein and not original_lacks_stop_codon: #This is very inefficient i = feature_alignment.end2 while True: p1 = feature_alignment.back_project(i, left=False) p2 = feature_alignment.back_project(i+1, left=True) p1a = alignment.project(p1, left=False) p2a = alignment.project(p2, left=False) #Hmm if p1a < 0 or p2a < 0 or p1a > len(seq2) or p2a > len(seq2): break new_dna += sequence_slice(seq2,p1a,p2a) new_protein = Seq.Seq(new_dna).translate(table=transl_table_no).tostring() new_protein = 'M' + new_protein[1:] if 'X' in new_protein or '*' in new_protein: break i += 1 # Is the protein shorter? # Don't bother checking if the original protein has extra stop codons if '*' in new_protein and not original_stops_before_end: new_protein = new_protein[:new_protein.index('*')+1] # If indels occurred, do an alignment # Don't bother otherwise if shifts: # Penalize gaps with cost 2 (vs 1 for mismatch) # If lengths don't match, pad with spaces (won't match longer seq), # aligner prefers mismatch to gaps #result = pairwise2.align.globalxs(protein + ' '*max(0,len(new_protein)-len(protein)), # new_protein + ' '*max(0,len(protein)-len(new_protein)), # -2.001,-2.000)[0] # 2.001 : very slightly prefer contiguous gaps. Also much faster! result = band_limited_align(protein + ' '*max(0,len(new_protein)-len(protein)), new_protein + ' '*max(0,len(protein)-len(new_protein)), bandwidth) protein_ali = result[0] new_protein_ali = result[1] else: protein_ali = protein new_protein_ali = new_protein diffs = [ ] j = 0 k = 0 for i in xrange(min(len(new_protein_ali),len(protein_ali))): if protein_ali[i] != ' ' and new_protein_ali[i] != ' ' and ( protein_ali[i] == '-' or new_protein_ali[i] == '-' or not bio.might_be_same_amino(protein_ali[i], new_protein_ali[i]) ): diffs.append((i,j,k)) if protein_ali[i] != '-': j += 1 if new_protein_ali[i] != '-': k += 1 diff_start = not bio.might_be_same_base(new_dna[0],dna[0]) or \ not bio.might_be_same_base(new_dna[1],dna[1]) or \ not bio.might_be_same_base(new_dna[2],dna[2]) interesting_coverage = False if use_coverage: cds_depth = depth[feature_alignment.start1:feature_alignment.end1] #/ median_depth if not feature_alignment.forward1: cds_depth = cds_depth[::-1] cds_ambiguous_depth = ambiguous_depth[feature_alignment.start1:feature_alignment.end1] #/ median_ambiguous_depth if not feature_alignment.forward1: cds_ambiguous_depth = cds_ambiguous_depth[::-1] cds_depth_expect = depth_expect[feature_alignment.start1:feature_alignment.end1] if not feature_alignment.forward1: cds_depth_expect = cds_depth_expect[::-1] #cds_average_depth_ratio = numpy.average(depth[feature_alignment.start1:feature_alignment.end1]) / median_depth #cds_average_ambiguous_depth_ratio = numpy.average(ambiguous_depth[feature_alignment.start1:feature_alignment.end1]) / median_ambiguous_depth #line += '%.1f\t' % cds_average_depth_ratio #line += '%.1f\t' % cds_average_ambiguous_depth_ratio #line += '%.1f..%.1f\t' % (numpy.minimum.reduce(cds_depth)/median_depth, numpy.maximum.reduce(cds_depth)/median_depth) #line += '%.1f+/-%.1f\t' % (numpy.average(cds_depth)/median_depth, numpy.var(cds_depth)**0.5/median_depth) #line += '%.1f..%.1f\t' % (numpy.minimum.reduce(cds_ambiguous_depth)/median_ambiguous_depth, numpy.maximum.reduce(cds_ambiguous_depth)/median_ambiguous_depth) avg_expect = numpy.average(cds_depth_expect) if avg_expect > 0.0: cds_avg_depth = numpy.average(cds_depth)/avg_expect cds_avg_ambiguous_depth = numpy.average(cds_ambiguous_depth)/avg_expect/ambiguous_factor strange = ( (cds_depth >= cds_depth_expect*1.5) | (cds_ambiguous_depth <= cds_depth_expect*(0.5*ambiguous_factor)) ) interesting_coverage = numpy.average(strange) >= coverage_cutoff if interesting_coverage or diffs or diff_start or shifts or len(new_protein) != len(protein): line = name + '\t' + locus_tag + '\t' + \ '%d\t' % (len(protein)-1) + \ '%d\t' % (len(new_protein)-1) + \ '%d\t' % len(diffs) if use_coverage: if avg_expect <= 0.0: line += '\t\t\t' else: line += '%.1f\t' % (cds_avg_depth) + graphlet(cds_depth, cds_depth_expect)+'\t' line += '%.1f\t' % (cds_avg_ambiguous_depth) + graphlet(cds_ambiguous_depth, cds_depth_expect*ambiguous_factor)+'\t' line += '%.1f%%\t' % (numpy.average(cds_ambiguous_depth > 0.0)*100.0) line += '%s\t' % feature.qualifiers.get('gene',[''])[0] + \ '%s' % feature.qualifiers.get('product',[''])[0] notes = [ ] if use_coverage and 'X' in new_protein: xs = new_protein.count('X') if xs == len(new_protein)-1: #First is M, so len-1 notes.append('\ No consensus') else: notes.append('\ No consensus for %d aa' % (new_protein.count('X'))) if len(new_protein) < len(protein): notes.append('\ Shorter by %d aa' % (len(protein)-len(new_protein))) if len(new_protein) > len(protein): notes.append('\ Longer by %d aa' % (len(new_protein)-len(protein))) if diff_start: notes.append('\ Start changed: %s -> %s' % (dna[:3], new_dna[:3])) if new_dna[:3] not in start_codons: notes.append(' No longer a start codon!') if shifts: notes.append('\ Indels:') for pos, old, new in shifts: notes.append(' base %5d / codon %5d %s -> %s' % (pos+1,(pos//3)+1,old,new or '-')) if diffs: if verbose: notes.append('\ Amino acid changes:') for i, j, k in diffs: notes.append(' codon %5d %s->%s (%s->%s)' % ( j+1, protein_ali[i], new_protein_ali[i], dna[j*3:j*3+3] if protein_ali[i] != '-' else '-', new_dna[k*3:k*3+3] if new_protein_ali[i] != '-' else '-' )) #if len(new_protein) > len(protein): # print 'New protein is longer:', new_protein[len(protein):] #if len(new_protein) < len(protein): # print 'New protein is shorter:', protein[len(new_protein):] #print protein #print new_protein if tabular: print line + '\t' + ' '.join([ ' '.join(note.strip().split()) for note in notes ]) else: print line for note in notes: print '\t' + note return 0
def main(args): grace.require_shrimp_1() n_cpus = grace.how_many_cpus() solid, args = grace.get_flag(args, '--solid') verbose, args = grace.get_flag(args, '--verbose') threshold, args = grace.get_option_value(args, '--threshold', str, '68%') stride, args = grace.get_option_value(args, '--stride', int, 1) max_shrimps, args = grace.get_option_value(args, '--cpus', int, n_cpus) batch_size, args = grace.get_option_value(args, '--batch-size', int, 5000000) input_reference_filenames = [ ] reads_filenames = [ ] shrimp_options = [ '-h', threshold ] if threshold.endswith('%'): threshold = -float(threshold[:-1])/100.0 else: threshold = int(threshold) output_dir = [ ] #As list so can write to from function. Gah. def front_command(args): grace.expect_no_further_options(args) if len(args) < 1: return output_dir.append(args[0]) input_reference_filenames.extend( [ os.path.abspath(filename) for filename in args[1:] ]) def reads_command(args): grace.expect_no_further_options(args) reads_filenames.extend([ [ os.path.abspath(filename) ] for filename in args]) def pairs_command(args): grace.expect_no_further_options(args) assert len(args) == 2, 'Expected exactly two files in "pairs"' reads_filenames.append([ os.path.abspath(filename) for filename in args ]) def shrimp_options_command(args): shrimp_options.extend(args) grace.execute(args, { 'reads': reads_command, '--reads': reads_command, 'pairs': pairs_command, 'shrimp-options': shrimp_options_command, '--shrimp-options': shrimp_options_command, }, front_command) if not output_dir: print >> sys.stderr, USAGE % n_cpus return 1 output_dir = output_dir[0] assert input_reference_filenames, 'No reference files given' assert reads_filenames, 'No read files given' for filename in itertools.chain(input_reference_filenames, *reads_filenames): assert os.path.exists(filename), '%s does not exist' % filename if not os.path.isdir(output_dir): os.mkdir(output_dir) if solid: shrimp = 'rmapper-cs' else: shrimp = 'rmapper-ls' reference_filename = os.path.join(output_dir,'reference.fa') reference_file = open(reference_filename,'wb') total_reference_sequences = 0 total_reference_bases = 0 for input_reference_filename in input_reference_filenames: for name, sequence in io.read_sequences(input_reference_filename): #Don't retain any comment name = name.split()[0] io.write_fasta(reference_file, name, sequence) total_reference_sequences += 1 total_reference_bases += len(sequence) reference_file.close() print '%s base%s in %s reference sequence%s' % ( grace.pretty_number(total_reference_bases), 's' if total_reference_bases != 1 else '', grace.pretty_number(total_reference_sequences), 's' if total_reference_sequences != 1 else '') assert total_reference_bases, 'Reference sequence file is empty' config = { 'references' : input_reference_filenames, 'reads' : reads_filenames, 'stride' : stride, 'solid': solid, 'threshold': threshold, } config_file = open(os.path.join(output_dir, 'config.txt'), 'wb') pprint.pprint(config, config_file) config_file.close() output_filename = os.path.join(output_dir, 'shrimp_hits.txt.gz') output_file = gzip.open(output_filename, 'wb') unmapped_filename = os.path.join(output_dir, 'unmapped.fa.gz') unmapped_file = gzip.open(unmapped_filename, 'wb') dirty_filenames = set() dirty_filenames.add(output_filename) dirty_filenames.add(unmapped_filename) #warn_low_threshold = True try: #Cleanup temporary files N = [0] def do_shrimp(read_set): my_number = N[0] N[0] += 1 tempname = os.path.join(output_dir,'temp%d-%d.fa' % (os.getpid(),my_number)) tempname_out = os.path.join(output_dir,'temp%d-%d.txt' % (os.getpid(),my_number)) dirty_filenames.add(tempname) dirty_filenames.add(tempname_out) f = open(tempname,'wb') for read_name, read_seq in read_set: print >> f, '>' + read_name print >> f, read_seq f.close() command = shrimp + ' ' + ' '.join(shrimp_options) + ' ' + \ tempname + ' ' + reference_filename + ' >' + tempname_out if not verbose: command += ' 2>/dev/null' #f = os.popen(command, 'r') child_pid = os.spawnl(os.P_NOWAIT,'/bin/sh','/bin/sh','-c',command) #print 'SHRiMP %d running' % my_number def finalize(): exit_status = os.waitpid(child_pid, 0)[1] assert exit_status == 0, 'Shrimp indicated an error' hits = { } # read_name -> [ hit line ] f = open(tempname_out,'rb') for line in f: if line.startswith('>'): read_name = line.split(None,1)[0][1:] if read_name not in hits: hits[read_name] = [ ] hits[read_name].append(line) f.close() for read_name, read_seq in read_set: if read_name in hits: for hit in hits[read_name]: output_file.write(hit) else: print >> unmapped_file, '>' + read_name print >> unmapped_file, read_seq output_file.flush() unmapped_file.flush() os.unlink(tempname) dirty_filenames.remove(tempname) os.unlink(tempname_out) dirty_filenames.remove(tempname_out) #print 'SHRiMP %d finished' % my_number return finalize shrimps = [ ] reader = iter_reads(config) read_count = 0 while True: read_set = [ ] read_set_bases = 0 #Read name should not include comment cruft # - SHRIMP passes this through # - might stuff up identification of pairs for read_name, read_seq in reader: read_name = read_name.split()[0] read_set.append((read_name, read_seq)) read_set_bases += len(read_seq) #if warn_low_threshold and len(read_seq)*7 < threshold: #Require 70% exact match # sys.stderr.write('\n*** WARNING: Short reads, consider reducing --threshold ***\n\n') # warn_low_threshold = False read_count += 1 if read_set_bases >= batch_size: break if not read_set: break if len(shrimps) >= max_shrimps: shrimps.pop(0)() shrimps.append( do_shrimp(read_set) ) grace.status('SHRiMPing %s' % grace.pretty_number(read_count)) while shrimps: grace.status('Waiting for SHRiMPs to finish %d ' % len(shrimps) ) shrimps.pop(0)() grace.status('') output_file.close() dirty_filenames.remove(output_filename) unmapped_file.close() dirty_filenames.remove(unmapped_filename) return 0 finally: for filename in dirty_filenames: if os.path.exists(filename): os.unlink(filename)
def main(args): grace.require_shrimp_1() n_cpus = grace.how_many_cpus() solid, args = grace.get_flag(args, '--solid') verbose, args = grace.get_flag(args, '--verbose') threshold, args = grace.get_option_value(args, '--threshold', str, '68%') stride, args = grace.get_option_value(args, '--stride', int, 1) max_shrimps, args = grace.get_option_value(args, '--cpus', int, n_cpus) batch_size, args = grace.get_option_value(args, '--batch-size', int, 5000000) input_reference_filenames = [] reads_filenames = [] shrimp_options = ['-h', threshold] if threshold.endswith('%'): threshold = -float(threshold[:-1]) / 100.0 else: threshold = int(threshold) output_dir = [] #As list so can write to from function. Gah. def front_command(args): grace.expect_no_further_options(args) if len(args) < 1: return output_dir.append(args[0]) input_reference_filenames.extend( [os.path.abspath(filename) for filename in args[1:]]) def reads_command(args): grace.expect_no_further_options(args) reads_filenames.extend([[os.path.abspath(filename)] for filename in args]) def pairs_command(args): grace.expect_no_further_options(args) assert len(args) == 2, 'Expected exactly two files in "pairs"' reads_filenames.append( [os.path.abspath(filename) for filename in args]) def shrimp_options_command(args): shrimp_options.extend(args) grace.execute( args, { 'reads': reads_command, '--reads': reads_command, 'pairs': pairs_command, 'shrimp-options': shrimp_options_command, '--shrimp-options': shrimp_options_command, }, front_command) if not output_dir: print >> sys.stderr, USAGE % n_cpus return 1 output_dir = output_dir[0] assert input_reference_filenames, 'No reference files given' assert reads_filenames, 'No read files given' for filename in itertools.chain(input_reference_filenames, *reads_filenames): assert os.path.exists(filename), '%s does not exist' % filename if not os.path.isdir(output_dir): os.mkdir(output_dir) if solid: shrimp = 'rmapper-cs' else: shrimp = 'rmapper-ls' reference_filename = os.path.join(output_dir, 'reference.fa') reference_file = open(reference_filename, 'wb') total_reference_sequences = 0 total_reference_bases = 0 for input_reference_filename in input_reference_filenames: for name, sequence in io.read_sequences(input_reference_filename): #Don't retain any comment name = name.split()[0] io.write_fasta(reference_file, name, sequence) total_reference_sequences += 1 total_reference_bases += len(sequence) reference_file.close() print '%s base%s in %s reference sequence%s' % ( grace.pretty_number(total_reference_bases), 's' if total_reference_bases != 1 else '', grace.pretty_number(total_reference_sequences), 's' if total_reference_sequences != 1 else '') assert total_reference_bases, 'Reference sequence file is empty' config = { 'references': input_reference_filenames, 'reads': reads_filenames, 'stride': stride, 'solid': solid, 'threshold': threshold, } config_file = open(os.path.join(output_dir, 'config.txt'), 'wb') pprint.pprint(config, config_file) config_file.close() output_filename = os.path.join(output_dir, 'shrimp_hits.txt.gz') output_file = gzip.open(output_filename, 'wb') unmapped_filename = os.path.join(output_dir, 'unmapped.fa.gz') unmapped_file = gzip.open(unmapped_filename, 'wb') dirty_filenames = set() dirty_filenames.add(output_filename) dirty_filenames.add(unmapped_filename) #warn_low_threshold = True try: #Cleanup temporary files N = [0] def do_shrimp(read_set): my_number = N[0] N[0] += 1 tempname = os.path.join(output_dir, 'temp%d-%d.fa' % (os.getpid(), my_number)) tempname_out = os.path.join( output_dir, 'temp%d-%d.txt' % (os.getpid(), my_number)) dirty_filenames.add(tempname) dirty_filenames.add(tempname_out) f = open(tempname, 'wb') for read_name, read_seq in read_set: print >> f, '>' + read_name print >> f, read_seq f.close() command = shrimp + ' ' + ' '.join(shrimp_options) + ' ' + \ tempname + ' ' + reference_filename + ' >' + tempname_out if not verbose: command += ' 2>/dev/null' #f = os.popen(command, 'r') child_pid = os.spawnl(os.P_NOWAIT, '/bin/sh', '/bin/sh', '-c', command) #print 'SHRiMP %d running' % my_number def finalize(): exit_status = os.waitpid(child_pid, 0)[1] assert exit_status == 0, 'Shrimp indicated an error' hits = {} # read_name -> [ hit line ] f = open(tempname_out, 'rb') for line in f: if line.startswith('>'): read_name = line.split(None, 1)[0][1:] if read_name not in hits: hits[read_name] = [] hits[read_name].append(line) f.close() for read_name, read_seq in read_set: if read_name in hits: for hit in hits[read_name]: output_file.write(hit) else: print >> unmapped_file, '>' + read_name print >> unmapped_file, read_seq output_file.flush() unmapped_file.flush() os.unlink(tempname) dirty_filenames.remove(tempname) os.unlink(tempname_out) dirty_filenames.remove(tempname_out) #print 'SHRiMP %d finished' % my_number return finalize shrimps = [] reader = iter_reads(config) read_count = 0 while True: read_set = [] read_set_bases = 0 #Read name should not include comment cruft # - SHRIMP passes this through # - might stuff up identification of pairs for read_name, read_seq in reader: read_name = read_name.split()[0] read_set.append((read_name, read_seq)) read_set_bases += len(read_seq) #if warn_low_threshold and len(read_seq)*7 < threshold: #Require 70% exact match # sys.stderr.write('\n*** WARNING: Short reads, consider reducing --threshold ***\n\n') # warn_low_threshold = False read_count += 1 if read_set_bases >= batch_size: break if not read_set: break if len(shrimps) >= max_shrimps: shrimps.pop(0)() shrimps.append(do_shrimp(read_set)) grace.status('SHRiMPing %s' % grace.pretty_number(read_count)) while shrimps: grace.status('Waiting for SHRiMPs to finish %d ' % len(shrimps)) shrimps.pop(0)() grace.status('') output_file.close() dirty_filenames.remove(output_filename) unmapped_file.close() dirty_filenames.remove(unmapped_filename) return 0 finally: for filename in dirty_filenames: if os.path.exists(filename): os.unlink(filename)
def main(args): default_transl_table, args = grace.get_option_value( args, '--transl_table', int, 11) use_coverage, args = grace.get_flag(args, '--use-coverage') coverage_cutoff, args = grace.get_option_value(args, '--coverage-cutoff', float, 0.1) tabular, args = grace.get_flag(args, '--tabular') noheader, args = grace.get_flag(args, '--noheader') verbose, args = grace.get_flag(args, '--verbose') bandwidth, args = grace.get_option_value(args, '--band', int, 20) grace.expect_no_further_options(args) if len(args) != 2: print USAGE return 1 genbank_filename = args[0] alignment_filename = args[1] if os.path.isdir(alignment_filename): alignment_filename = os.path.join(alignment_filename, 'alignment.maf') working_dir = os.path.split(alignment_filename)[0] alignments = load_alignments(alignment_filename) summaries = [] details = [] if not noheader: fields = 'Sequence\tLocus tag\tOld length (aa)\tNew length (aa)\tAmino acid changes\t' if use_coverage: fields += 'Unambiguous coverage vs expected\t\tAmbiguous coverage vs expected\t\tAmbiguous percent with any hits\t' fields += 'Gene\tProduct' if tabular: fields += '\tChanges of note' print fields for record in SeqIO.parse( io.open_possibly_compressed_file(genbank_filename), 'genbank'): sequence = record.seq.tostring() for name, seq1, seq2, alignment in alignments: if seq1 == sequence: break else: raise grace.Error( 'Genbank record %s sequence not identical to any reference sequence' % record.id) if use_coverage: depth = get_graph(working_dir, name, 'depth') ambiguous_depth = get_graph(working_dir, name, 'ambiguous-depth') median_depth = numpy.median(depth) median_ambiguous_depth = numpy.median(ambiguous_depth) ambiguous_factor = float(median_ambiguous_depth) / median_depth depth_expect = expected_depth(name, sequence, depth, ambiguous_depth) for feature in record.features: if feature.type != 'CDS': continue if 'locus_tag' not in feature.qualifiers: locus_tag = '%d..%d' % (feature.location.nofuzzy_start + 1, feature.location.nofuzzy_end) else: locus_tag = feature.qualifiers['locus_tag'][0] if 'transl_table' in feature.qualifiers: transl_table_no = int(feature.qualifiers['transl_table'][0]) else: assert default_transl_table is not None, 'No /transl_table for CDS, and default transl_table not given' transl_table_no = default_transl_table transl_table = CodonTable.ambiguous_dna_by_id[transl_table_no] start_codons = transl_table.start_codons try: feature_alignment = alignment_from_feature(sequence, feature) except Weird_alignment: warn('%s has a location I could not handle, skipping, sorry' % locus_tag) continue dna = [] new_dna = [] shifts = [] for i in xrange(feature_alignment.end2): p1 = feature_alignment.back_project(i, left=False) p2 = feature_alignment.back_project(i + 1, left=True) assert abs(p2 - p1) < 2 dna.append(sequence_slice(sequence, p1, p2)) p1a = alignment.project(p1, left=False) p2a = alignment.project(p2, left=False) #Hmm diff = (p2 - p1) - (p2a - p1a) #if diff: # if diff%3: # frame_shift = True # else: # frame_preserving_shift = True new_dna.append(sequence_slice(seq2, p1a, p2a)) if diff: shifts.append((i, dna[-1], new_dna[-1])) dna = ''.join(dna) new_dna = ''.join(new_dna) # This usually indicated a CDS truncated at the start? # in which case, will probably fail some way or other down the line. if 'codon_start' in feature.qualifiers: codon_start = int(feature.qualifiers['codon_start'][0]) - 1 else: codon_start = 0 dna = dna[codon_start:] new_dna = new_dna[codon_start:] if len(dna) % 3 != 0: warn(locus_tag + ' length not a multiple of 3') #assert len(new_dna) % 3 == 0 protein = Seq.Seq(dna).translate(table=transl_table_no).tostring() # http://en.wikipedia.org/wiki/Start_codon is always translated to M protein = 'M' + protein[1:] if dna[:3] not in start_codons: warn(locus_tag + ' has unknown start codon: ' + dna[:3]) original_lacks_stop_codon = not protein.endswith('*') if original_lacks_stop_codon: warn(locus_tag + ' lacks end codon') original_stops_before_end = '*' in protein[:-1] if original_stops_before_end: warn(locus_tag + ' contains stop codon before end') if 'translation' in feature.qualifiers: expect = feature.qualifiers['translation'][0] if protein[:-1] != expect: warn( locus_tag + ' translation given in feature does not match translation from DNA' ) new_protein = Seq.Seq(new_dna).translate( table=transl_table_no).tostring() new_protein = 'M' + new_protein[1:] # If end codon changed, find new end # Don't bother if there are unknown amino acids or # the original protein lacks a stop codon if 'X' not in new_protein and '*' not in new_protein and not original_lacks_stop_codon: #This is very inefficient i = feature_alignment.end2 while True: p1 = feature_alignment.back_project(i, left=False) p2 = feature_alignment.back_project(i + 1, left=True) p1a = alignment.project(p1, left=False) p2a = alignment.project(p2, left=False) #Hmm if p1a < 0 or p2a < 0 or p1a > len(seq2) or p2a > len( seq2): break new_dna += sequence_slice(seq2, p1a, p2a) new_protein = Seq.Seq(new_dna).translate( table=transl_table_no).tostring() new_protein = 'M' + new_protein[1:] if 'X' in new_protein or '*' in new_protein: break i += 1 # Is the protein shorter? # Don't bother checking if the original protein has extra stop codons if '*' in new_protein and not original_stops_before_end: new_protein = new_protein[:new_protein.index('*') + 1] # If indels occurred, do an alignment # Don't bother otherwise if shifts: # Penalize gaps with cost 2 (vs 1 for mismatch) # If lengths don't match, pad with spaces (won't match longer seq), # aligner prefers mismatch to gaps #result = pairwise2.align.globalxs(protein + ' '*max(0,len(new_protein)-len(protein)), # new_protein + ' '*max(0,len(protein)-len(new_protein)), # -2.001,-2.000)[0] # 2.001 : very slightly prefer contiguous gaps. Also much faster! result = band_limited_align( protein + ' ' * max(0, len(new_protein) - len(protein)), new_protein + ' ' * max(0, len(protein) - len(new_protein)), bandwidth) protein_ali = result[0] new_protein_ali = result[1] else: protein_ali = protein new_protein_ali = new_protein diffs = [] j = 0 k = 0 for i in xrange(min(len(new_protein_ali), len(protein_ali))): if protein_ali[i] != ' ' and new_protein_ali[i] != ' ' and ( protein_ali[i] == '-' or new_protein_ali[i] == '-' or not bio.might_be_same_amino(protein_ali[i], new_protein_ali[i])): diffs.append((i, j, k)) if protein_ali[i] != '-': j += 1 if new_protein_ali[i] != '-': k += 1 diff_start = not bio.might_be_same_base(new_dna[0],dna[0]) or \ not bio.might_be_same_base(new_dna[1],dna[1]) or \ not bio.might_be_same_base(new_dna[2],dna[2]) interesting_coverage = False if use_coverage: cds_depth = depth[feature_alignment.start1: feature_alignment.end1] #/ median_depth if not feature_alignment.forward1: cds_depth = cds_depth[::-1] cds_ambiguous_depth = ambiguous_depth[ feature_alignment.start1: feature_alignment.end1] #/ median_ambiguous_depth if not feature_alignment.forward1: cds_ambiguous_depth = cds_ambiguous_depth[::-1] cds_depth_expect = depth_expect[feature_alignment. start1:feature_alignment.end1] if not feature_alignment.forward1: cds_depth_expect = cds_depth_expect[::-1] #cds_average_depth_ratio = numpy.average(depth[feature_alignment.start1:feature_alignment.end1]) / median_depth #cds_average_ambiguous_depth_ratio = numpy.average(ambiguous_depth[feature_alignment.start1:feature_alignment.end1]) / median_ambiguous_depth #line += '%.1f\t' % cds_average_depth_ratio #line += '%.1f\t' % cds_average_ambiguous_depth_ratio #line += '%.1f..%.1f\t' % (numpy.minimum.reduce(cds_depth)/median_depth, numpy.maximum.reduce(cds_depth)/median_depth) #line += '%.1f+/-%.1f\t' % (numpy.average(cds_depth)/median_depth, numpy.var(cds_depth)**0.5/median_depth) #line += '%.1f..%.1f\t' % (numpy.minimum.reduce(cds_ambiguous_depth)/median_ambiguous_depth, numpy.maximum.reduce(cds_ambiguous_depth)/median_ambiguous_depth) avg_expect = numpy.average(cds_depth_expect) if avg_expect > 0.0: cds_avg_depth = numpy.average(cds_depth) / avg_expect cds_avg_ambiguous_depth = numpy.average( cds_ambiguous_depth) / avg_expect / ambiguous_factor strange = ((cds_depth >= cds_depth_expect * 1.5) | (cds_ambiguous_depth <= cds_depth_expect * (0.5 * ambiguous_factor))) interesting_coverage = numpy.average( strange) >= coverage_cutoff if interesting_coverage or diffs or diff_start or shifts or len( new_protein) != len(protein): line = name + '\t' + locus_tag + '\t' + \ '%d\t' % (len(protein)-1) + \ '%d\t' % (len(new_protein)-1) + \ '%d\t' % len(diffs) if use_coverage: if avg_expect <= 0.0: line += '\t\t\t' else: line += '%.1f\t' % (cds_avg_depth) + graphlet( cds_depth, cds_depth_expect) + '\t' line += '%.1f\t' % ( cds_avg_ambiguous_depth) + graphlet( cds_ambiguous_depth, cds_depth_expect * ambiguous_factor) + '\t' line += '%.1f%%\t' % ( numpy.average(cds_ambiguous_depth > 0.0) * 100.0) line += '%s\t' % feature.qualifiers.get('gene',[''])[0] + \ '%s' % feature.qualifiers.get('product',[''])[0] notes = [] if use_coverage and 'X' in new_protein: xs = new_protein.count('X') if xs == len(new_protein) - 1: #First is M, so len-1 notes.append('\ No consensus') else: notes.append('\ No consensus for %d aa' % (new_protein.count('X'))) if len(new_protein) < len(protein): notes.append('\ Shorter by %d aa' % (len(protein) - len(new_protein))) if len(new_protein) > len(protein): notes.append('\ Longer by %d aa' % (len(new_protein) - len(protein))) if diff_start: notes.append('\ Start changed: %s -> %s' % (dna[:3], new_dna[:3])) if new_dna[:3] not in start_codons: notes.append(' No longer a start codon!') if shifts: notes.append('\ Indels:') for pos, old, new in shifts: notes.append(' base %5d / codon %5d %s -> %s' % (pos + 1, (pos // 3) + 1, old, new or '-')) if diffs: if verbose: notes.append('\ Amino acid changes:') for i, j, k in diffs: notes.append( ' codon %5d %s->%s (%s->%s)' % (j + 1, protein_ali[i], new_protein_ali[i], dna[j * 3:j * 3 + 3] if protein_ali[i] != '-' else '-', new_dna[k * 3:k * 3 + 3] if new_protein_ali[i] != '-' else '-')) #if len(new_protein) > len(protein): # print 'New protein is longer:', new_protein[len(protein):] #if len(new_protein) < len(protein): # print 'New protein is shorter:', protein[len(new_protein):] #print protein #print new_protein if tabular: print line + '\t' + ' '.join( [' '.join(note.strip().split()) for note in notes]) else: print line for note in notes: print '\t' + note return 0