コード例 #1
0
def main(args):
    """Report how the CDS features of a GenBank file change under an alignment.

    Command-line entry point.

    args -- list of argument strings.  The options --transl_table,
            --use-coverage, --coverage-cutoff, --tabular, --noheader,
            --verbose and --band are extracted first; exactly two
            positional arguments must remain: the GenBank file name and
            the alignment file name (or a directory, in which case
            'alignment.maf' inside it is used).

    For every CDS feature whose translated protein, start codon, length or
    (with --use-coverage) coverage looks different, one tab-separated line
    is printed, followed by explanatory notes (on the same line when
    --tabular is given, otherwise on following tab-indented lines).

    Returns 1 after printing USAGE when the positional argument count is
    wrong, otherwise 0.  Raises grace.Error when a GenBank record's sequence
    is not identical to the first sequence of any loaded alignment.
    """
    # Each helper returns the parsed value together with the remaining args.
    default_transl_table, args = grace.get_option_value(args, '--transl_table', int, 11)
    use_coverage, args = grace.get_flag(args, '--use-coverage')
    coverage_cutoff, args = grace.get_option_value(args, '--coverage-cutoff', float, 0.1)
    tabular, args = grace.get_flag(args, '--tabular')
    noheader, args = grace.get_flag(args, '--noheader')
    verbose, args = grace.get_flag(args, '--verbose')
    bandwidth, args = grace.get_option_value(args, '--band', int, 20)
    grace.expect_no_further_options(args)

    if len(args) != 2:
        print USAGE
        return 1
    
    genbank_filename = args[0]
    alignment_filename = args[1]
    
    # A directory may be given in place of the alignment file itself.
    if os.path.isdir(alignment_filename):
        alignment_filename = os.path.join(alignment_filename, 'alignment.maf')
    
    # Coverage graphs (see get_graph calls below) are looked up relative to
    # the directory holding the alignment file.
    working_dir = os.path.split(alignment_filename)[0]
    
    alignments = load_alignments(alignment_filename)
    
    # NOTE(review): summaries and details are never used again in this function.
    summaries = [ ]
    details = [ ]
    
    # Column header; the coverage columns only exist with --use-coverage and
    # the notes column only with --tabular.
    if not noheader:
        fields = 'Sequence\tLocus tag\tOld length (aa)\tNew length (aa)\tAmino acid changes\t'
        if use_coverage: fields += 'Unambiguous coverage vs expected\t\tAmbiguous coverage vs expected\t\tAmbiguous percent with any hits\t'
        fields += 'Gene\tProduct'
        if tabular: fields += '\tChanges of note'
        print fields
    
    for record in SeqIO.parse(io.open_possibly_compressed_file(genbank_filename),'genbank'):
        sequence = record.seq.tostring()
    
        # Find the alignment whose first sequence is exactly this record's
        # sequence.  name, seq1, seq2 and alignment stay bound to the matching
        # entry after the break and are used for the rest of the record.
        for name, seq1, seq2, alignment in alignments:
            if seq1 == sequence: break
        else:
            raise grace.Error('Genbank record %s sequence not identical to any reference sequence' % record.id)
             
        if use_coverage:       
            depth = get_graph(working_dir, name, 'depth')
            ambiguous_depth = get_graph(working_dir, name, 'ambiguous-depth')
            median_depth = numpy.median(depth)
            median_ambiguous_depth = numpy.median(ambiguous_depth)
            # Ratio of median ambiguous depth to median unambiguous depth; used
            # below to rescale the expected depth for the ambiguous graph.
            # NOTE(review): median_depth of zero is not guarded against here.
            ambiguous_factor = float(median_ambiguous_depth) / median_depth
            depth_expect = expected_depth(name, sequence, depth, ambiguous_depth)
            
        
        for feature in record.features:
            if feature.type != 'CDS': continue
            
            # Fall back to a 1-based "start..end" label when there is no locus tag.
            if 'locus_tag' not in feature.qualifiers:
                locus_tag = '%d..%d' % (feature.location.nofuzzy_start+1,feature.location.nofuzzy_end)
            else:
                locus_tag = feature.qualifiers['locus_tag'][0]
            
            # A per-feature /transl_table overrides the --transl_table default.
            if 'transl_table' in feature.qualifiers:
                transl_table_no = int(feature.qualifiers['transl_table'][0])
            else:
                assert default_transl_table is not None, 'No /transl_table for CDS, and default transl_table not given'
                transl_table_no = default_transl_table
            
            transl_table = CodonTable.ambiguous_dna_by_id[transl_table_no]
            start_codons = transl_table.start_codons
            
            try:
                feature_alignment = alignment_from_feature(sequence, feature)
            except Weird_alignment:
                warn('%s has a location I could not handle, skipping, sorry' % locus_tag)
                continue
            
            # Collect the feature's DNA one position at a time, both from the
            # original sequence (dna) and from seq2 via the alignment (new_dna).
            # presumably back_project maps feature coordinates to reference
            # coordinates and project maps reference coordinates onto seq2 --
            # confirm against their definitions.
            dna = [ ]
            new_dna = [ ]
            shifts = [ ]
            for i in xrange(feature_alignment.end2):
                p1 = feature_alignment.back_project(i, left=False)
                p2 = feature_alignment.back_project(i+1, left=True)
                assert abs(p2-p1) < 2
                dna.append( sequence_slice(sequence,p1,p2) )
                
                p1a = alignment.project(p1, left=False)
                p2a = alignment.project(p2, left=False) #Hmm
                
                # Non-zero when the projected span differs in length from the
                # original span, i.e. this position is recorded as an indel.
                diff = (p2-p1)-(p2a-p1a)
                #if diff:
                #    if diff%3:
                #        frame_shift = True
                #    else:
                #        frame_preserving_shift = True
                new_dna.append( sequence_slice(seq2,p1a,p2a) )
                
                if diff:
                    shifts.append((i,dna[-1],new_dna[-1]))
                
            dna = ''.join(dna)
            new_dna = ''.join(new_dna)
            
            # This usually indicated a CDS truncated at the start?
            # in which case, will probably fail some way or other down the line.
            # /codon_start is 1-based in the qualifier; convert to an offset.
            if 'codon_start' in feature.qualifiers:
                codon_start = int(feature.qualifiers['codon_start'][0]) - 1
            else:
                codon_start = 0
            dna = dna[codon_start:]
            new_dna = new_dna[codon_start:]
            
            if len(dna) % 3 != 0:
                warn(locus_tag + ' length not a multiple of 3')
            #assert len(new_dna) % 3 == 0
            
            protein = Seq.Seq(dna).translate(table=transl_table_no).tostring()            
            # http://en.wikipedia.org/wiki/Start_codon is always translated to M
            protein = 'M' + protein[1:]
            
            if dna[:3] not in start_codons:
                warn(locus_tag + ' has unknown start codon: ' + dna[:3])
                                    
            # Sanity warnings about the original annotation; the two flags are
            # also used below to decide how to treat the new protein.
            original_lacks_stop_codon = not protein.endswith('*')                 
            if original_lacks_stop_codon:
                warn(locus_tag + ' lacks end codon')
            original_stops_before_end = '*' in protein[:-1] 
            if original_stops_before_end:
                warn(locus_tag + ' contains stop codon before end')
                            
            # Compare against the annotated translation, ignoring our final
            # character (normally the stop codon '*').
            if 'translation' in feature.qualifiers:
                expect = feature.qualifiers['translation'][0]
                if protein[:-1] != expect:
                    warn(locus_tag + ' translation given in feature does not match translation from DNA')                
        
            new_protein = Seq.Seq(new_dna).translate(table=transl_table_no).tostring()            
            new_protein = 'M' + new_protein[1:]
        
            # If end codon changed, find new end                
            # Don't bother if there are unknown amino acids or 
            # the original protein lacks a stop codon
            if 'X' not in new_protein and '*' not in new_protein and not original_lacks_stop_codon:
                #This is very inefficient
                # Keep extending new_dna one feature position at a time past the
                # annotated end, re-translating the whole thing each time, until
                # a stop or unknown residue appears or we run off seq2.
                i = feature_alignment.end2
                while True:
                    p1 = feature_alignment.back_project(i, left=False)
                    p2 = feature_alignment.back_project(i+1, left=True)
                    p1a = alignment.project(p1, left=False)
                    p2a = alignment.project(p2, left=False) #Hmm
                    if p1a < 0 or p2a < 0 or p1a > len(seq2) or p2a > len(seq2):
                        break
                        
                    new_dna += sequence_slice(seq2,p1a,p2a)                        
                    new_protein = Seq.Seq(new_dna).translate(table=transl_table_no).tostring()            
                    new_protein = 'M' + new_protein[1:]
                    if 'X' in new_protein or '*' in new_protein: break
                    
                    i += 1
            
            # Is the protein shorter?
            # Don't bother checking if the original protein has extra stop codons
            # Truncate the new protein just after its first stop codon.
            if '*' in new_protein and not original_stops_before_end:
                new_protein = new_protein[:new_protein.index('*')+1] 
        
            # If indels occurred, do an alignment
            # Don't bother otherwise
            if shifts:
                # Penalize gaps with cost 2 (vs 1 for mismatch)
                # If lengths don't match, pad with spaces (won't match longer seq),
                # aligner prefers mismatch to gaps
                
                #result = pairwise2.align.globalxs(protein      + ' '*max(0,len(new_protein)-len(protein)), 
                #                                  new_protein  + ' '*max(0,len(protein)-len(new_protein)), 
                #                                  -2.001,-2.000)[0]
                # 2.001 : very slightly prefer contiguous gaps. Also much faster!
        
                result = band_limited_align(protein      + ' '*max(0,len(new_protein)-len(protein)), 
                                            new_protein  + ' '*max(0,len(protein)-len(new_protein)), 
                                            bandwidth)
                
                
                protein_ali = result[0]
                new_protein_ali = result[1]
            else:
                protein_ali = protein
                new_protein_ali = new_protein
        
            # Walk the two aligned proteins column by column.  i is the column,
            # j and k count non-gap residues seen so far in the old and new
            # protein respectively.  A column is a difference when neither side
            # is padding (' ') and either side is a gap ('-') or the residues
            # cannot be the same amino acid.
            diffs = [ ]
            j = 0
            k = 0
            for i in xrange(min(len(new_protein_ali),len(protein_ali))):
                if protein_ali[i] != ' ' and new_protein_ali[i] != ' ' and (
                      protein_ali[i] == '-' or 
                      new_protein_ali[i] == '-' or 
                      not bio.might_be_same_amino(protein_ali[i], new_protein_ali[i]) ):
                    diffs.append((i,j,k))
                if protein_ali[i] != '-': 
                    j += 1
                if new_protein_ali[i] != '-': 
                    k += 1
        
            # True when any of the first three bases definitely changed.
            # NOTE(review): indexes [0..2] directly, so dna and new_dna are
            # assumed to hold at least three bases each.
            diff_start = not bio.might_be_same_base(new_dna[0],dna[0]) or \
                         not bio.might_be_same_base(new_dna[1],dna[1]) or \
                         not bio.might_be_same_base(new_dna[2],dna[2]) 
        
            interesting_coverage = False
            if use_coverage:
                # Slice the per-base graphs down to this feature, reversing them
                # when the feature alignment is not forward.
                cds_depth = depth[feature_alignment.start1:feature_alignment.end1] #/ median_depth
                if not feature_alignment.forward1: cds_depth = cds_depth[::-1]
                cds_ambiguous_depth = ambiguous_depth[feature_alignment.start1:feature_alignment.end1] #/ median_ambiguous_depth
                if not feature_alignment.forward1: cds_ambiguous_depth = cds_ambiguous_depth[::-1]
                
                cds_depth_expect = depth_expect[feature_alignment.start1:feature_alignment.end1]
                if not feature_alignment.forward1: cds_depth_expect = cds_depth_expect[::-1]
                
                #cds_average_depth_ratio = numpy.average(depth[feature_alignment.start1:feature_alignment.end1]) / median_depth 
                #cds_average_ambiguous_depth_ratio = numpy.average(ambiguous_depth[feature_alignment.start1:feature_alignment.end1]) / median_ambiguous_depth                        
                #line += '%.1f\t' % cds_average_depth_ratio 
                #line += '%.1f\t' % cds_average_ambiguous_depth_ratio
                
                #line += '%.1f..%.1f\t' % (numpy.minimum.reduce(cds_depth)/median_depth, numpy.maximum.reduce(cds_depth)/median_depth) 
                #line += '%.1f+/-%.1f\t' % (numpy.average(cds_depth)/median_depth, numpy.var(cds_depth)**0.5/median_depth) 
                #line += '%.1f..%.1f\t' % (numpy.minimum.reduce(cds_ambiguous_depth)/median_ambiguous_depth, numpy.maximum.reduce(cds_ambiguous_depth)/median_ambiguous_depth)
                
                # The two averages are only defined when avg_expect > 0; the
                # reporting code below re-checks avg_expect before using them.
                avg_expect = numpy.average(cds_depth_expect)
                if avg_expect > 0.0:
                    cds_avg_depth = numpy.average(cds_depth)/avg_expect
                    cds_avg_ambiguous_depth = numpy.average(cds_ambiguous_depth)/avg_expect/ambiguous_factor
                
                # Per-position flags: unambiguous depth at least 1.5x expected,
                # or ambiguous depth at most half of its (rescaled) expectation.
                strange = (
                    (cds_depth >= cds_depth_expect*1.5) |
                    (cds_ambiguous_depth <= cds_depth_expect*(0.5*ambiguous_factor))
                )
                
                # Coverage is interesting when the flagged fraction reaches
                # --coverage-cutoff.
                interesting_coverage = numpy.average(strange) >= coverage_cutoff
                     

            # Only features with something to report produce output.
            if interesting_coverage or diffs or diff_start or shifts or len(new_protein) != len(protein):
                # Lengths are reported minus one (see the "First is M, so len-1"
                # style of counting used further down).
                line = name + '\t' + locus_tag + '\t' + \
                      '%d\t' % (len(protein)-1) + \
                      '%d\t' % (len(new_protein)-1) + \
                      '%d\t' % len(diffs)
                

                if use_coverage:
                    if avg_expect <= 0.0:
                        # Keep the column count stable when ratios are undefined.
                        line += '\t\t\t'
                    else:
                        line += '%.1f\t' % (cds_avg_depth) + graphlet(cds_depth, cds_depth_expect)+'\t' 
                        line += '%.1f\t' % (cds_avg_ambiguous_depth) + graphlet(cds_ambiguous_depth, cds_depth_expect*ambiguous_factor)+'\t'
                        line += '%.1f%%\t' % (numpy.average(cds_ambiguous_depth > 0.0)*100.0)
                
                line += '%s\t' % feature.qualifiers.get('gene',[''])[0] + \
                        '%s' % feature.qualifiers.get('product',[''])[0]
                
                # Human-readable notes; top-level notes start with a backslash,
                # detail lines are indented with spaces.
                notes = [ ]
                
                if use_coverage and 'X' in new_protein:
                    xs = new_protein.count('X')
                    if xs == len(new_protein)-1: #First is M, so len-1
                        notes.append('\ No consensus')
                    else:
                        notes.append('\ No consensus for %d aa' % (new_protein.count('X')))
                                   
                if len(new_protein) < len(protein):
                    notes.append('\ Shorter by %d aa' % (len(protein)-len(new_protein)))
        
                if len(new_protein) > len(protein):
                    notes.append('\ Longer by %d aa' % (len(new_protein)-len(protein)))
                
                if diff_start:
                    notes.append('\ Start changed: %s -> %s' % (dna[:3], new_dna[:3]))
                    if new_dna[:3] not in start_codons:
                        notes.append('  No longer a start codon!')
                        
                if shifts:
                    notes.append('\ Indels:')
                
                    # Positions are reported 1-based, both as base and as codon.
                    for pos, old, new in shifts:
                        notes.append('    base %5d / codon %5d   %s -> %s' % (pos+1,(pos//3)+1,old,new or '-'))
                    
                # Individual amino acid changes are only listed with --verbose;
                # the count is always present in the main line.
                if diffs:
                    if verbose:
                        notes.append('\ Amino acid changes:')
                        for i, j, k in diffs:
                            notes.append('    codon %5d   %s->%s   (%s->%s)' % (
                                j+1, 
                                protein_ali[i], 
                                new_protein_ali[i], 
                                dna[j*3:j*3+3] if protein_ali[i] != '-' else '-', 
                                new_dna[k*3:k*3+3] if new_protein_ali[i] != '-' else '-'
                            ))
                
                #if len(new_protein) > len(protein):
                #    print 'New protein is longer:', new_protein[len(protein):]
                #if len(new_protein) < len(protein):
                #    print 'New protein is shorter:', protein[len(new_protein):]
                #print protein
                #print new_protein
                
                if tabular:
                    # One line per feature: notes are whitespace-normalised and
                    # appended as a final column.
                    print line + '\t' + ' '.join([ ' '.join(note.strip().split()) for note in notes ])
                else:
                    print line
                    for note in notes:
                        print '\t' + note
    return 0
コード例 #2
0
ファイル: shrimp.py プロジェクト: Puneet-Shivanand/nesoni
def main(args):
    grace.require_shrimp_1()

    n_cpus = grace.how_many_cpus()
        
    solid, args = grace.get_flag(args, '--solid')
    verbose, args = grace.get_flag(args, '--verbose')

    threshold, args = grace.get_option_value(args, '--threshold', str, '68%')
    
    stride, args = grace.get_option_value(args, '--stride', int, 1)
    max_shrimps, args = grace.get_option_value(args, '--cpus', int, n_cpus)
    batch_size, args = grace.get_option_value(args, '--batch-size', int, 5000000)
        
    input_reference_filenames = [ ]
    reads_filenames = [ ]
    
    shrimp_options = [ '-h', threshold ]
    if threshold.endswith('%'):
        threshold = -float(threshold[:-1])/100.0
    else:
        threshold = int(threshold)
    
    output_dir = [ ]  #As list so can write to from function. Gah.
    
    def front_command(args):
        grace.expect_no_further_options(args)
        
        if len(args) < 1:
            return
        
        output_dir.append(args[0])        
        input_reference_filenames.extend(
            [ os.path.abspath(filename) for filename in args[1:] ])
    def reads_command(args):
        grace.expect_no_further_options(args)
        reads_filenames.extend([ [ os.path.abspath(filename) ] for filename in args])
    def pairs_command(args):
        grace.expect_no_further_options(args)
        assert len(args) == 2, 'Expected exactly two files in "pairs"'
        reads_filenames.append([ os.path.abspath(filename) for filename in args ])
    def shrimp_options_command(args):
        shrimp_options.extend(args)
    
    grace.execute(args, {
        'reads': reads_command,
        '--reads': reads_command,
        'pairs': pairs_command,
        'shrimp-options': shrimp_options_command,
        '--shrimp-options': shrimp_options_command,
    }, front_command)
    
    
    if not output_dir:
        print >> sys.stderr, USAGE % n_cpus
        return 1
    
    output_dir = output_dir[0]
    
    assert input_reference_filenames, 'No reference files given'
    assert reads_filenames, 'No read files given'
    
    for filename in itertools.chain(input_reference_filenames, *reads_filenames):
        assert os.path.exists(filename), '%s does not exist' % filename

    if not os.path.isdir(output_dir):
        os.mkdir(output_dir)
    
    if solid:
        shrimp = 'rmapper-cs'
    else:
        shrimp = 'rmapper-ls'
    
    
    reference_filename = os.path.join(output_dir,'reference.fa')
    reference_file = open(reference_filename,'wb')
    total_reference_sequences = 0
    total_reference_bases = 0
    for input_reference_filename in input_reference_filenames:
        for name, sequence in io.read_sequences(input_reference_filename):
            #Don't retain any comment
            name = name.split()[0]
            io.write_fasta(reference_file, name, sequence)
            
            total_reference_sequences += 1
            total_reference_bases += len(sequence)
            
    reference_file.close()
    
    print '%s base%s in %s reference sequence%s' % (
        grace.pretty_number(total_reference_bases), 's' if total_reference_bases != 1 else '',
        grace.pretty_number(total_reference_sequences), 's' if total_reference_sequences != 1 else '')
    
    assert total_reference_bases, 'Reference sequence file is empty' 
    
    config = {
        'references' : input_reference_filenames,
        'reads' : reads_filenames,
        'stride' : stride,
        'solid': solid,
        'threshold': threshold,
    }
    config_file = open(os.path.join(output_dir, 'config.txt'), 'wb')
    pprint.pprint(config, config_file)
    config_file.close()
    
    output_filename = os.path.join(output_dir, 'shrimp_hits.txt.gz')
    output_file = gzip.open(output_filename, 'wb')
    
    unmapped_filename = os.path.join(output_dir, 'unmapped.fa.gz')
    unmapped_file = gzip.open(unmapped_filename, 'wb')
    
    dirty_filenames = set()
    dirty_filenames.add(output_filename)
    dirty_filenames.add(unmapped_filename)
    
    #warn_low_threshold = True
    
    try: #Cleanup temporary files
        
        N = [0]
        def do_shrimp(read_set):
            my_number = N[0]
            N[0] += 1
            
            tempname = os.path.join(output_dir,'temp%d-%d.fa' % (os.getpid(),my_number))
            tempname_out = os.path.join(output_dir,'temp%d-%d.txt' % (os.getpid(),my_number))
            
            dirty_filenames.add(tempname)
            dirty_filenames.add(tempname_out)
            
            f = open(tempname,'wb')
            for read_name, read_seq in read_set:
                print >> f, '>' + read_name
                print >> f, read_seq
            f.close()
        
            command = shrimp + ' ' + ' '.join(shrimp_options) + ' ' + \
                      tempname + ' ' + reference_filename + ' >' + tempname_out
            if not verbose:
                command += ' 2>/dev/null'
            #f = os.popen(command, 'r')
            child_pid = os.spawnl(os.P_NOWAIT,'/bin/sh','/bin/sh','-c',command)
            #print 'SHRiMP %d running' % my_number
            
            def finalize():
                exit_status = os.waitpid(child_pid, 0)[1]
                assert exit_status == 0, 'Shrimp indicated an error'
                
                hits = { } # read_name -> [ hit line ]
                
                f = open(tempname_out,'rb')
                for line in f:
                    if line.startswith('>'):
                        read_name = line.split(None,1)[0][1:]
                        if read_name not in hits:
                            hits[read_name] = [ ]
                        hits[read_name].append(line)
                f.close()
                                
                for read_name, read_seq in read_set:
                    if read_name in hits:
                        for hit in hits[read_name]:
                            output_file.write(hit)
                    else:
                        print >> unmapped_file, '>' + read_name
                        print >> unmapped_file, read_seq

                output_file.flush()
                unmapped_file.flush()
        
                os.unlink(tempname)
                dirty_filenames.remove(tempname)
                os.unlink(tempname_out)
                dirty_filenames.remove(tempname_out)
                #print 'SHRiMP %d finished' % my_number
            return finalize
        
        
        shrimps = [ ]
        
        reader = iter_reads(config)
        read_count = 0
        
        while True:
            read_set = [ ]
            read_set_bases = 0

            #Read name should not include comment cruft
            # - SHRIMP passes this through
            # - might stuff up identification of pairs
            
            for read_name, read_seq in reader:
                read_name = read_name.split()[0]                
                read_set.append((read_name, read_seq))
                read_set_bases += len(read_seq)
                
                #if warn_low_threshold and len(read_seq)*7 < threshold: #Require 70% exact match
                #    sys.stderr.write('\n*** WARNING: Short reads, consider reducing --threshold ***\n\n')                    
                #    warn_low_threshold = False
            
                read_count += 1
                if read_set_bases >= batch_size: break
                
            if not read_set: break
        
            if len(shrimps) >= max_shrimps:
                shrimps.pop(0)()
            shrimps.append( do_shrimp(read_set) )
            
            grace.status('SHRiMPing %s' % grace.pretty_number(read_count))
        
        while shrimps:
            grace.status('Waiting for SHRiMPs to finish %d ' % len(shrimps) )
            shrimps.pop(0)()
        
        grace.status('')
        
        output_file.close()
        dirty_filenames.remove(output_filename)
        unmapped_file.close()
        dirty_filenames.remove(unmapped_filename)
        
        return 0

    finally:
        for filename in dirty_filenames:
            if os.path.exists(filename):
                os.unlink(filename)
コード例 #3
0
ファイル: shrimp.py プロジェクト: promodel/nesoni
def main(args):
    grace.require_shrimp_1()

    n_cpus = grace.how_many_cpus()

    solid, args = grace.get_flag(args, '--solid')
    verbose, args = grace.get_flag(args, '--verbose')

    threshold, args = grace.get_option_value(args, '--threshold', str, '68%')

    stride, args = grace.get_option_value(args, '--stride', int, 1)
    max_shrimps, args = grace.get_option_value(args, '--cpus', int, n_cpus)
    batch_size, args = grace.get_option_value(args, '--batch-size', int,
                                              5000000)

    input_reference_filenames = []
    reads_filenames = []

    shrimp_options = ['-h', threshold]
    if threshold.endswith('%'):
        threshold = -float(threshold[:-1]) / 100.0
    else:
        threshold = int(threshold)

    output_dir = []  #As list so can write to from function. Gah.

    def front_command(args):
        grace.expect_no_further_options(args)

        if len(args) < 1:
            return

        output_dir.append(args[0])
        input_reference_filenames.extend(
            [os.path.abspath(filename) for filename in args[1:]])

    def reads_command(args):
        grace.expect_no_further_options(args)
        reads_filenames.extend([[os.path.abspath(filename)]
                                for filename in args])

    def pairs_command(args):
        grace.expect_no_further_options(args)
        assert len(args) == 2, 'Expected exactly two files in "pairs"'
        reads_filenames.append(
            [os.path.abspath(filename) for filename in args])

    def shrimp_options_command(args):
        shrimp_options.extend(args)

    grace.execute(
        args, {
            'reads': reads_command,
            '--reads': reads_command,
            'pairs': pairs_command,
            'shrimp-options': shrimp_options_command,
            '--shrimp-options': shrimp_options_command,
        }, front_command)

    if not output_dir:
        print >> sys.stderr, USAGE % n_cpus
        return 1

    output_dir = output_dir[0]

    assert input_reference_filenames, 'No reference files given'
    assert reads_filenames, 'No read files given'

    for filename in itertools.chain(input_reference_filenames,
                                    *reads_filenames):
        assert os.path.exists(filename), '%s does not exist' % filename

    if not os.path.isdir(output_dir):
        os.mkdir(output_dir)

    if solid:
        shrimp = 'rmapper-cs'
    else:
        shrimp = 'rmapper-ls'

    reference_filename = os.path.join(output_dir, 'reference.fa')
    reference_file = open(reference_filename, 'wb')
    total_reference_sequences = 0
    total_reference_bases = 0
    for input_reference_filename in input_reference_filenames:
        for name, sequence in io.read_sequences(input_reference_filename):
            #Don't retain any comment
            name = name.split()[0]
            io.write_fasta(reference_file, name, sequence)

            total_reference_sequences += 1
            total_reference_bases += len(sequence)

    reference_file.close()

    print '%s base%s in %s reference sequence%s' % (
        grace.pretty_number(total_reference_bases),
        's' if total_reference_bases != 1 else '',
        grace.pretty_number(total_reference_sequences),
        's' if total_reference_sequences != 1 else '')

    assert total_reference_bases, 'Reference sequence file is empty'

    config = {
        'references': input_reference_filenames,
        'reads': reads_filenames,
        'stride': stride,
        'solid': solid,
        'threshold': threshold,
    }
    config_file = open(os.path.join(output_dir, 'config.txt'), 'wb')
    pprint.pprint(config, config_file)
    config_file.close()

    output_filename = os.path.join(output_dir, 'shrimp_hits.txt.gz')
    output_file = gzip.open(output_filename, 'wb')

    unmapped_filename = os.path.join(output_dir, 'unmapped.fa.gz')
    unmapped_file = gzip.open(unmapped_filename, 'wb')

    dirty_filenames = set()
    dirty_filenames.add(output_filename)
    dirty_filenames.add(unmapped_filename)

    #warn_low_threshold = True

    try:  #Cleanup temporary files

        N = [0]

        def do_shrimp(read_set):
            my_number = N[0]
            N[0] += 1

            tempname = os.path.join(output_dir,
                                    'temp%d-%d.fa' % (os.getpid(), my_number))
            tempname_out = os.path.join(
                output_dir, 'temp%d-%d.txt' % (os.getpid(), my_number))

            dirty_filenames.add(tempname)
            dirty_filenames.add(tempname_out)

            f = open(tempname, 'wb')
            for read_name, read_seq in read_set:
                print >> f, '>' + read_name
                print >> f, read_seq
            f.close()

            command = shrimp + ' ' + ' '.join(shrimp_options) + ' ' + \
                      tempname + ' ' + reference_filename + ' >' + tempname_out
            if not verbose:
                command += ' 2>/dev/null'
            #f = os.popen(command, 'r')
            child_pid = os.spawnl(os.P_NOWAIT, '/bin/sh', '/bin/sh', '-c',
                                  command)

            #print 'SHRiMP %d running' % my_number

            def finalize():
                exit_status = os.waitpid(child_pid, 0)[1]
                assert exit_status == 0, 'Shrimp indicated an error'

                hits = {}  # read_name -> [ hit line ]

                f = open(tempname_out, 'rb')
                for line in f:
                    if line.startswith('>'):
                        read_name = line.split(None, 1)[0][1:]
                        if read_name not in hits:
                            hits[read_name] = []
                        hits[read_name].append(line)
                f.close()

                for read_name, read_seq in read_set:
                    if read_name in hits:
                        for hit in hits[read_name]:
                            output_file.write(hit)
                    else:
                        print >> unmapped_file, '>' + read_name
                        print >> unmapped_file, read_seq

                output_file.flush()
                unmapped_file.flush()

                os.unlink(tempname)
                dirty_filenames.remove(tempname)
                os.unlink(tempname_out)
                dirty_filenames.remove(tempname_out)
                #print 'SHRiMP %d finished' % my_number

            return finalize

        shrimps = []

        reader = iter_reads(config)
        read_count = 0

        while True:
            read_set = []
            read_set_bases = 0

            #Read name should not include comment cruft
            # - SHRIMP passes this through
            # - might stuff up identification of pairs

            for read_name, read_seq in reader:
                read_name = read_name.split()[0]
                read_set.append((read_name, read_seq))
                read_set_bases += len(read_seq)

                #if warn_low_threshold and len(read_seq)*7 < threshold: #Require 70% exact match
                #    sys.stderr.write('\n*** WARNING: Short reads, consider reducing --threshold ***\n\n')
                #    warn_low_threshold = False

                read_count += 1
                if read_set_bases >= batch_size: break

            if not read_set: break

            if len(shrimps) >= max_shrimps:
                shrimps.pop(0)()
            shrimps.append(do_shrimp(read_set))

            grace.status('SHRiMPing %s' % grace.pretty_number(read_count))

        while shrimps:
            grace.status('Waiting for SHRiMPs to finish %d ' % len(shrimps))
            shrimps.pop(0)()

        grace.status('')

        output_file.close()
        dirty_filenames.remove(output_filename)
        unmapped_file.close()
        dirty_filenames.remove(unmapped_filename)

        return 0

    finally:
        for filename in dirty_filenames:
            if os.path.exists(filename):
                os.unlink(filename)
コード例 #4
0
ファイル: consequences.py プロジェクト: promodel/nesoni
def main(args):
    default_transl_table, args = grace.get_option_value(
        args, '--transl_table', int, 11)
    use_coverage, args = grace.get_flag(args, '--use-coverage')
    coverage_cutoff, args = grace.get_option_value(args, '--coverage-cutoff',
                                                   float, 0.1)
    tabular, args = grace.get_flag(args, '--tabular')
    noheader, args = grace.get_flag(args, '--noheader')
    verbose, args = grace.get_flag(args, '--verbose')
    bandwidth, args = grace.get_option_value(args, '--band', int, 20)
    grace.expect_no_further_options(args)

    if len(args) != 2:
        print USAGE
        return 1

    genbank_filename = args[0]
    alignment_filename = args[1]

    if os.path.isdir(alignment_filename):
        alignment_filename = os.path.join(alignment_filename, 'alignment.maf')

    working_dir = os.path.split(alignment_filename)[0]

    alignments = load_alignments(alignment_filename)

    summaries = []
    details = []

    if not noheader:
        fields = 'Sequence\tLocus tag\tOld length (aa)\tNew length (aa)\tAmino acid changes\t'
        if use_coverage:
            fields += 'Unambiguous coverage vs expected\t\tAmbiguous coverage vs expected\t\tAmbiguous percent with any hits\t'
        fields += 'Gene\tProduct'
        if tabular: fields += '\tChanges of note'
        print fields

    for record in SeqIO.parse(
            io.open_possibly_compressed_file(genbank_filename), 'genbank'):
        sequence = record.seq.tostring()

        for name, seq1, seq2, alignment in alignments:
            if seq1 == sequence: break
        else:
            raise grace.Error(
                'Genbank record %s sequence not identical to any reference sequence'
                % record.id)

        if use_coverage:
            depth = get_graph(working_dir, name, 'depth')
            ambiguous_depth = get_graph(working_dir, name, 'ambiguous-depth')
            median_depth = numpy.median(depth)
            median_ambiguous_depth = numpy.median(ambiguous_depth)
            ambiguous_factor = float(median_ambiguous_depth) / median_depth
            depth_expect = expected_depth(name, sequence, depth,
                                          ambiguous_depth)

        for feature in record.features:
            if feature.type != 'CDS': continue

            if 'locus_tag' not in feature.qualifiers:
                locus_tag = '%d..%d' % (feature.location.nofuzzy_start + 1,
                                        feature.location.nofuzzy_end)
            else:
                locus_tag = feature.qualifiers['locus_tag'][0]

            if 'transl_table' in feature.qualifiers:
                transl_table_no = int(feature.qualifiers['transl_table'][0])
            else:
                assert default_transl_table is not None, 'No /transl_table for CDS, and default transl_table not given'
                transl_table_no = default_transl_table

            transl_table = CodonTable.ambiguous_dna_by_id[transl_table_no]
            start_codons = transl_table.start_codons

            try:
                feature_alignment = alignment_from_feature(sequence, feature)
            except Weird_alignment:
                warn('%s has a location I could not handle, skipping, sorry' %
                     locus_tag)
                continue

            dna = []
            new_dna = []
            shifts = []
            for i in xrange(feature_alignment.end2):
                p1 = feature_alignment.back_project(i, left=False)
                p2 = feature_alignment.back_project(i + 1, left=True)
                assert abs(p2 - p1) < 2
                dna.append(sequence_slice(sequence, p1, p2))

                p1a = alignment.project(p1, left=False)
                p2a = alignment.project(p2, left=False)  #Hmm

                diff = (p2 - p1) - (p2a - p1a)
                #if diff:
                #    if diff%3:
                #        frame_shift = True
                #    else:
                #        frame_preserving_shift = True
                new_dna.append(sequence_slice(seq2, p1a, p2a))

                if diff:
                    shifts.append((i, dna[-1], new_dna[-1]))

            dna = ''.join(dna)
            new_dna = ''.join(new_dna)

            # This usually indicated a CDS truncated at the start?
            # in which case, will probably fail some way or other down the line.
            if 'codon_start' in feature.qualifiers:
                codon_start = int(feature.qualifiers['codon_start'][0]) - 1
            else:
                codon_start = 0
            dna = dna[codon_start:]
            new_dna = new_dna[codon_start:]

            if len(dna) % 3 != 0:
                warn(locus_tag + ' length not a multiple of 3')
            #assert len(new_dna) % 3 == 0

            protein = Seq.Seq(dna).translate(table=transl_table_no).tostring()
            # http://en.wikipedia.org/wiki/Start_codon is always translated to M
            protein = 'M' + protein[1:]

            if dna[:3] not in start_codons:
                warn(locus_tag + ' has unknown start codon: ' + dna[:3])

            original_lacks_stop_codon = not protein.endswith('*')
            if original_lacks_stop_codon:
                warn(locus_tag + ' lacks end codon')
            original_stops_before_end = '*' in protein[:-1]
            if original_stops_before_end:
                warn(locus_tag + ' contains stop codon before end')

            if 'translation' in feature.qualifiers:
                expect = feature.qualifiers['translation'][0]
                if protein[:-1] != expect:
                    warn(
                        locus_tag +
                        ' translation given in feature does not match translation from DNA'
                    )

            new_protein = Seq.Seq(new_dna).translate(
                table=transl_table_no).tostring()
            new_protein = 'M' + new_protein[1:]

            # If end codon changed, find new end
            # Don't bother if there are unknown amino acids or
            # the original protein lacks a stop codon
            if 'X' not in new_protein and '*' not in new_protein and not original_lacks_stop_codon:
                #This is very inefficient
                i = feature_alignment.end2
                while True:
                    p1 = feature_alignment.back_project(i, left=False)
                    p2 = feature_alignment.back_project(i + 1, left=True)
                    p1a = alignment.project(p1, left=False)
                    p2a = alignment.project(p2, left=False)  #Hmm
                    if p1a < 0 or p2a < 0 or p1a > len(seq2) or p2a > len(
                            seq2):
                        break

                    new_dna += sequence_slice(seq2, p1a, p2a)
                    new_protein = Seq.Seq(new_dna).translate(
                        table=transl_table_no).tostring()
                    new_protein = 'M' + new_protein[1:]
                    if 'X' in new_protein or '*' in new_protein: break

                    i += 1

            # Is the protein shorter?
            # Don't bother checking if the original protein has extra stop codons
            if '*' in new_protein and not original_stops_before_end:
                new_protein = new_protein[:new_protein.index('*') + 1]

            # If indels occurred, do an alignment
            # Don't bother otherwise
            if shifts:
                # Penalize gaps with cost 2 (vs 1 for mismatch)
                # If lengths don't match, pad with spaces (won't match longer seq),
                # aligner prefers mismatch to gaps

                #result = pairwise2.align.globalxs(protein      + ' '*max(0,len(new_protein)-len(protein)),
                #                                  new_protein  + ' '*max(0,len(protein)-len(new_protein)),
                #                                  -2.001,-2.000)[0]
                # 2.001 : very slightly prefer contiguous gaps. Also much faster!

                result = band_limited_align(
                    protein + ' ' * max(0,
                                        len(new_protein) - len(protein)),
                    new_protein + ' ' * max(0,
                                            len(protein) - len(new_protein)),
                    bandwidth)

                protein_ali = result[0]
                new_protein_ali = result[1]
            else:
                protein_ali = protein
                new_protein_ali = new_protein

            diffs = []
            j = 0
            k = 0
            for i in xrange(min(len(new_protein_ali), len(protein_ali))):
                if protein_ali[i] != ' ' and new_protein_ali[i] != ' ' and (
                        protein_ali[i] == '-' or new_protein_ali[i] == '-'
                        or not bio.might_be_same_amino(protein_ali[i],
                                                       new_protein_ali[i])):
                    diffs.append((i, j, k))
                if protein_ali[i] != '-':
                    j += 1
                if new_protein_ali[i] != '-':
                    k += 1

            diff_start = not bio.might_be_same_base(new_dna[0],dna[0]) or \
                         not bio.might_be_same_base(new_dna[1],dna[1]) or \
                         not bio.might_be_same_base(new_dna[2],dna[2])

            interesting_coverage = False
            if use_coverage:
                cds_depth = depth[feature_alignment.start1:
                                  feature_alignment.end1]  #/ median_depth
                if not feature_alignment.forward1: cds_depth = cds_depth[::-1]
                cds_ambiguous_depth = ambiguous_depth[
                    feature_alignment.start1:
                    feature_alignment.end1]  #/ median_ambiguous_depth
                if not feature_alignment.forward1:
                    cds_ambiguous_depth = cds_ambiguous_depth[::-1]

                cds_depth_expect = depth_expect[feature_alignment.
                                                start1:feature_alignment.end1]
                if not feature_alignment.forward1:
                    cds_depth_expect = cds_depth_expect[::-1]

                #cds_average_depth_ratio = numpy.average(depth[feature_alignment.start1:feature_alignment.end1]) / median_depth
                #cds_average_ambiguous_depth_ratio = numpy.average(ambiguous_depth[feature_alignment.start1:feature_alignment.end1]) / median_ambiguous_depth
                #line += '%.1f\t' % cds_average_depth_ratio
                #line += '%.1f\t' % cds_average_ambiguous_depth_ratio

                #line += '%.1f..%.1f\t' % (numpy.minimum.reduce(cds_depth)/median_depth, numpy.maximum.reduce(cds_depth)/median_depth)
                #line += '%.1f+/-%.1f\t' % (numpy.average(cds_depth)/median_depth, numpy.var(cds_depth)**0.5/median_depth)
                #line += '%.1f..%.1f\t' % (numpy.minimum.reduce(cds_ambiguous_depth)/median_ambiguous_depth, numpy.maximum.reduce(cds_ambiguous_depth)/median_ambiguous_depth)

                avg_expect = numpy.average(cds_depth_expect)
                if avg_expect > 0.0:
                    cds_avg_depth = numpy.average(cds_depth) / avg_expect
                    cds_avg_ambiguous_depth = numpy.average(
                        cds_ambiguous_depth) / avg_expect / ambiguous_factor

                strange = ((cds_depth >= cds_depth_expect * 1.5) |
                           (cds_ambiguous_depth <= cds_depth_expect *
                            (0.5 * ambiguous_factor)))

                interesting_coverage = numpy.average(
                    strange) >= coverage_cutoff

            if interesting_coverage or diffs or diff_start or shifts or len(
                    new_protein) != len(protein):
                line = name + '\t' + locus_tag + '\t' + \
                      '%d\t' % (len(protein)-1) + \
                      '%d\t' % (len(new_protein)-1) + \
                      '%d\t' % len(diffs)

                if use_coverage:
                    if avg_expect <= 0.0:
                        line += '\t\t\t'
                    else:
                        line += '%.1f\t' % (cds_avg_depth) + graphlet(
                            cds_depth, cds_depth_expect) + '\t'
                        line += '%.1f\t' % (
                            cds_avg_ambiguous_depth) + graphlet(
                                cds_ambiguous_depth,
                                cds_depth_expect * ambiguous_factor) + '\t'
                        line += '%.1f%%\t' % (
                            numpy.average(cds_ambiguous_depth > 0.0) * 100.0)

                line += '%s\t' % feature.qualifiers.get('gene',[''])[0] + \
                        '%s' % feature.qualifiers.get('product',[''])[0]

                notes = []

                if use_coverage and 'X' in new_protein:
                    xs = new_protein.count('X')
                    if xs == len(new_protein) - 1:  #First is M, so len-1
                        notes.append('\ No consensus')
                    else:
                        notes.append('\ No consensus for %d aa' %
                                     (new_protein.count('X')))

                if len(new_protein) < len(protein):
                    notes.append('\ Shorter by %d aa' %
                                 (len(protein) - len(new_protein)))

                if len(new_protein) > len(protein):
                    notes.append('\ Longer by %d aa' %
                                 (len(new_protein) - len(protein)))

                if diff_start:
                    notes.append('\ Start changed: %s -> %s' %
                                 (dna[:3], new_dna[:3]))
                    if new_dna[:3] not in start_codons:
                        notes.append('  No longer a start codon!')

                if shifts:
                    notes.append('\ Indels:')

                    for pos, old, new in shifts:
                        notes.append('    base %5d / codon %5d   %s -> %s' %
                                     (pos + 1,
                                      (pos // 3) + 1, old, new or '-'))

                if diffs:
                    if verbose:
                        notes.append('\ Amino acid changes:')
                        for i, j, k in diffs:
                            notes.append(
                                '    codon %5d   %s->%s   (%s->%s)' %
                                (j + 1, protein_ali[i], new_protein_ali[i],
                                 dna[j * 3:j * 3 + 3] if protein_ali[i] != '-'
                                 else '-', new_dna[k * 3:k * 3 + 3]
                                 if new_protein_ali[i] != '-' else '-'))

                #if len(new_protein) > len(protein):
                #    print 'New protein is longer:', new_protein[len(protein):]
                #if len(new_protein) < len(protein):
                #    print 'New protein is shorter:', protein[len(new_protein):]
                #print protein
                #print new_protein

                if tabular:
                    print line + '\t' + ' '.join(
                        [' '.join(note.strip().split()) for note in notes])
                else:
                    print line
                    for note in notes:
                        print '\t' + note
    return 0