Exemple #1
0
def main(args):
    """Cut each input sequence into overlapping windows, written as FASTA.

    Options:
        --size N    window length (default 200)
        --stride N  distance between successive window starts (default 50)

    The remaining arguments are sequence files. Windows start at
    ``stride - size``, so the first windows are clipped at the start of the
    sequence, and the last ones are clipped at its end. Each record is named
    ``<name>:<start>..<end>`` using 1-based inclusive coordinates, followed
    by the original description, and is written to standard output.

    Returns 1 after printing USAGE when no files are given, otherwise 0.
    """
    size, args = grace.get_option_value(args, '--size', int, 200)
    stride, args = grace.get_option_value(args, '--stride', int, 50)
    grace.expect_no_further_options(args)

    if not args:
        print(USAGE)
        return 1

    for filename in args:
        for full_name, seq in io.read_sequences(filename):
            # Separate the identifier from any free-text description.
            pieces = full_name.split(None, 1)
            ident = pieces[0]
            desc = (' ' + pieces[1]) if len(pieces) > 1 else ''
            seq_len = len(seq)

            for offset in xrange(stride - size, seq_len, stride):
                # Clamp both ends of the window to the sequence.
                start = max(0, min(seq_len, offset))
                end = max(0, min(seq_len, offset + size))
                header = '%s:%d..%d' % (ident, start + 1, end) + desc
                io.write_fasta(sys.stdout, header, seq[start:end])

    return 0
Exemple #2
0
def main(args):
    """Write overlapping windows of every input sequence to stdout as FASTA.

    ``--size`` (default 200) is the window length and ``--stride``
    (default 50) the step between windows. All other arguments are sequence
    files. Output records are named ``<name>:<start>..<end>`` (1-based,
    inclusive) with the original description appended.

    Returns 1 after printing USAGE when no files are given, otherwise 0.
    """
    size, args = grace.get_option_value(args, '--size', int, 200)
    stride, args = grace.get_option_value(args, '--stride', int, 50)
    grace.expect_no_further_options(args)

    if not args:
        print(USAGE)
        return 1

    for filename in args:
        for record_name, seq in io.read_sequences(filename):
            words = record_name.split(None, 1)
            label = words[0]
            desc = ''
            if len(words) > 1:
                desc = ' ' + words[1]

            for start, end in _window_bounds(len(seq), size, stride):
                io.write_fasta(
                    sys.stdout,
                    '%s:%d..%d' % (label, start + 1, end) + desc,
                    seq[start:end]
                )

    return 0


def _window_bounds(length, size, stride):
    """Yield clamped (start, end) slice bounds for each window position."""
    for offset in xrange(stride - size, length, stride):
        yield (max(0, min(length, offset)),
               max(0, min(length, offset + size)))
Exemple #3
0
def normalize(args):
    """Run normalize_files on every depth userplot shared by the directories.

    Options:
        --min-depth N   forwarded to normalize_files (default 5)

    The remaining arguments are two or more directories. Each is scanned for
    files ending in '-depth.userplot', excluding the '-ambiguous-depth' and
    '-pairspan-depth' variants. All directories must contain the same set of
    file names. normalize_files is then called once per file with the list
    of directories, the file name minus its '-depth.userplot' suffix, and
    min_depth.

    Raises:
        grace.Help_shown: fewer than two directories were given (the help
            text is printed first).
        grace.Error: an argument is not a directory, or the directories do
            not hold the same userplots.
    """
    depth_suffix = '-depth.userplot'
    # These also end in '-depth.userplot' but are different kinds of plot.
    excluded_suffixes = ('-ambiguous-depth.userplot',
                         '-pairspan-depth.userplot')

    min_depth, args = grace.get_option_value(args, '--min-depth', int, 5)
    grace.expect_no_further_options(args)

    if len(args) < 2:
        print(NORMALIZE_HELP)
        raise grace.Help_shown()

    dirnames = args

    listings = []
    for dirname in dirnames:
        # Explicit check rather than assert, so it survives "python -O".
        if not os.path.isdir(dirname):
            raise grace.Error(dirname + ' is not a directory')

        listings.append(
            sorted(item for item in os.listdir(dirname)
                   if item.endswith(depth_suffix)
                   and not item.endswith(excluded_suffixes)))

    for dirname, listing in zip(dirnames[1:], listings[1:]):
        if listing != listings[0]:
            raise grace.Error('Userplots in %s differ from those in %s' %
                              (dirname, dirnames[0]))

    for filename in listings[0]:
        normalize_files(dirnames, filename[:-len(depth_suffix)], min_depth)
Exemple #4
0
def normalize(args):
    """Normalize the unambiguous depth userplots common to several directories.

    ``--min-depth`` (int, default 5) is passed through to normalize_files.
    At least two directory arguments are required. Every directory must hold
    the same set of '*-depth.userplot' files, ignoring the
    '-ambiguous-depth' and '-pairspan-depth' variants. normalize_files is
    called once per plot with the directory list, the plot name with
    '-depth.userplot' stripped, and min_depth.

    Raises:
        grace.Help_shown: after printing NORMALIZE_HELP when fewer than two
            directories are given.
        grace.Error: when an argument is not a directory or the sets of
            userplots differ between directories.
    """
    min_depth, args = grace.get_option_value(args, '--min-depth', int, 5)
    grace.expect_no_further_options(args)

    if len(args) < 2:
        print(NORMALIZE_HELP)
        raise grace.Help_shown()

    dirnames = args
    wanted_suffix = '-depth.userplot'
    unwanted_suffixes = ('-ambiguous-depth.userplot', '-pairspan-depth.userplot')

    def depth_plots(dirname):
        # Raise rather than assert: assertions are removed under "python -O".
        if not os.path.isdir(dirname):
            raise grace.Error(dirname + ' is not a directory')
        return sorted(
            item for item in os.listdir(dirname)
            if item.endswith(wanted_suffix)
            and not item.endswith(unwanted_suffixes)
        )

    filenames = [ depth_plots(dirname) for dirname in dirnames ]

    for i in xrange(1, len(dirnames)):
        if filenames[i] != filenames[0]:
            raise grace.Error('Userplots in %s differ from those in %s' % (dirnames[i], dirnames[0]))

    for filename in filenames[0]:
        # Strip the suffix to recover the plot's base name.
        normalize_files(dirnames, filename[:-len(wanted_suffix)], min_depth)
Exemple #5
0
 def scaffold(args):
     """Record a custom scaffold built from the contig numbers in args.

     Each argument is converted with int() and followed by a gap entry.
     Unless --circular is set, the final gap is dropped so the scaffold ends
     on a contig. The result is appended to `scaffolds` (a list defined
     outside this block) under the name 'custom_scaffold_<n>', where n is
     the new length of that list.
     """
     circular, args = grace.get_option_value(args, '--circular', grace.as_bool, False)

     layout = []
     for contig_id in args:
         layout.extend([('contig', int(contig_id)), ('gap', None)])

     if not circular:
         # A linear scaffold has no gap after its last contig.
         del layout[-1:]

     scaffolds.append(('custom_scaffold_%d' % (len(scaffolds) + 1), layout))
Exemple #6
0
def test_power_main(args):
    """Run the power-test R scripts and the count test on their output.

    Options (all optional): --m, --n, --reps, --count (ints), --dispersion,
    --log-fold (floats). The first remaining argument is the output prefix;
    any arguments after it are handed to grace.execute with an 'of' handler
    that collects extra options for test_counts_main.

    Raises grace.Help_shown (after printing TEST_POWER_HELP to stderr) when
    no output prefix is given.

    NOTE: both templates are filled from locals(), so the local variable
    names in this function are part of the template interface. Several
    locals (m, n, reps, count, dispersion, log_fold, the *_literal names,
    claimed_fdr) are not referenced anywhere else here and are presumably
    consumed only by the templates -- do not rename or remove them without
    checking POWER_TEMPLATE and POWER_REPORT_TEMPLATE.
    """
    m, args = grace.get_option_value(args, '--m', int, 10)
    n, args = grace.get_option_value(args, '--n', int, 1000)
    reps, args = grace.get_option_value(args, '--reps', int, 2)
    count, args = grace.get_option_value(args, '--count', int, 100)
    dispersion, args = grace.get_option_value(args, '--dispersion', float, 0.1)
    log_fold, args = grace.get_option_value(args, '--log-fold', float, 1.0)

    if len(args) < 1:
        print >> sys.stderr, TEST_POWER_HELP
        raise grace.Help_shown()

    output_prefix, args = args[0], args[1:]

    # Arguments routed to the 'of' handler are forwarded verbatim to
    # test_counts_main below.
    options = [ ]
    def of(args):
        options.extend(args)
    grace.execute(args, {'of': of})

    filename = output_prefix + '-input.txt'
    filename_literal = R_literal(filename)
    log_filename_literal = R_literal(output_prefix + '-info.txt')

    # First script; presumably writes `filename`, which is then passed to
    # test_counts_main -- confirm against POWER_TEMPLATE.
    run_script(POWER_TEMPLATE % locals())

    claimed_fdr = test_counts_main([ output_prefix, filename, 'Experimental' ] + options)

    output_filename_literal = R_literal(output_prefix + '.txt')

    # Second script; sees every local defined above, including claimed_fdr.
    run_script(POWER_REPORT_TEMPLATE % locals())
Exemple #7
0
def debias(args):
    """Write expected-depth and bias-corrected userplots for each directory.

    ``--radius`` (int, default 2) is passed to expected_depth and becomes
    part of the output file names. Every other argument is a directory
    containing 'reference.fa'.

    For each reference sequence, and for both the plain and pairspan depth
    plots, the unambiguous and ambiguous userplots are read (the pair is
    skipped if either file is missing). Three files are written next to
    them: '<plot>-expected-<radius>.userplot' and
    '<plot>-corrected-<radius>.userplot' for the unambiguous plot, and
    '<ambiguous plot>-corrected-<radius>.userplot'.
    """
    import numpy

    radius, args = grace.get_option_value(args, '--radius', int, 2)

    suffix_pairs = [
        ('-depth', '-ambiguous-depth'),
        ('-pairspan-depth', '-ambiguous-pairspan-depth'),
    ]
    output_suffix = '-%d.userplot' % radius

    for dir_name in args:
        reference_path = os.path.join(dir_name, 'reference.fa')
        for name, seq in io.read_sequences(reference_path):
            root = grace.filesystem_friendly_name(name)

            def plot_path(tail):
                return os.path.join(dir_name, root + tail)

            for suffix, ambig_suffix in suffix_pairs:
                full_name = plot_path(suffix + '.userplot')
                full_ambig_name = plot_path(ambig_suffix + '.userplot')
                if not (os.path.exists(full_name)
                        and os.path.exists(full_ambig_name)):
                    continue

                print('%s %s %s' % (dir_name, root, output_suffix))

                depths = numpy.array(read_unstranded_userplot(full_name))
                ambig_depths = numpy.array(
                    read_unstranded_userplot(full_ambig_name))
                expect = expected_depth(root, seq, depths, ambig_depths,
                                        radius)

                write_unstranded_userplot(
                    plot_path(suffix + '-expected' + output_suffix), expect)

                # Rescale so the corrected depth keeps the expected median.
                median_expect = numpy.median(expect)

                corrected = depths / expect * median_expect
                # Positions with low expected depth are blanked out.
                corrected[expect <= 5.0] = 0.0
                write_unstranded_userplot(
                    plot_path(suffix + '-corrected' + output_suffix),
                    corrected)

                ambig_corrected = ambig_depths / expect * median_expect
                ambig_corrected[expect <= 0.0] = 0.0
                write_unstranded_userplot(
                    plot_path(ambig_suffix + '-corrected' + output_suffix),
                    ambig_corrected)
Exemple #8
0
def plot(args):
    """Plot one or more files of numbers, one value per line, with pylab.

    ``--log`` (bool, default False) draws with a logarithmic y axis.
    Each remaining argument is either ``FILE`` or ``FILE~~LABEL``. The text
    before the first '~~' is the path, and the text after it is the legend
    label (the path itself when no '~~' is present). Values are plotted
    against 1-based positions.

    With more than one file a legend is shown and the y range is set from
    the largest value seen.
    """
    log_it, args = grace.get_option_value(args, '--log', grace.as_bool, False)

    grace.expect_no_further_options(args)

    import numpy
    import pylab

    pylab.rcParams['axes.formatter.limits'] = [-20, 20]

    pylab.figure(figsize=(10, 4))

    if log_it:
        draw = pylab.semilogy
    else:
        draw = pylab.plot

    maximum = 0
    for filename in args:
        parts = filename.split('~~', 1)

        # Close the file even when a line is not a valid number.
        f = open(parts[0], 'rb')
        try:
            data = numpy.array([float(line.strip()) for line in f])
        finally:
            f.close()

        maximum = max(maximum, numpy.maximum.reduce(data))

        draw(numpy.arange(1, len(data) + 1), data, label=parts[-1])

    if len(args) > 1:
        pylab.legend()

        if log_it:
            pylab.ylim((1, maximum**1.2))
        else:
            pylab.ylim((0, maximum * 1.2))

    pylab.show()
Exemple #9
0
def debias(args):
    """Produce expected and bias-corrected depth userplots per directory.

    Option ``--radius`` (int, default 2) is forwarded to expected_depth and
    embedded in the output names. The remaining arguments are directories
    that each contain 'reference.fa'.

    For every reference sequence, the plain and pairspan depth plots are
    processed when both the unambiguous and the ambiguous userplot exist.
    Outputs are '<root><suffix>-expected-<radius>.userplot',
    '<root><suffix>-corrected-<radius>.userplot' and
    '<root><ambiguous suffix>-corrected-<radius>.userplot'.
    """
    import numpy

    radius, args = grace.get_option_value(args, '--radius', int, 2)

    plot_kinds = (
        ('-depth', '-ambiguous-depth'),
        ('-pairspan-depth', '-ambiguous-pairspan-depth'),
    )

    for dir_name in args:
        for name, seq in io.read_sequences(os.path.join(dir_name,'reference.fa')):
            for suffix, ambig_suffix in plot_kinds:
                root = grace.filesystem_friendly_name(name)
                stem = os.path.join(dir_name, root + suffix)
                ambig_stem = os.path.join(dir_name, root + ambig_suffix)

                full_name = stem + '.userplot'
                full_ambig_name = ambig_stem + '.userplot'
                if not os.path.exists(full_name):
                    continue
                if not os.path.exists(full_ambig_name):
                    continue

                output_suffix = '-%d.userplot' % radius

                print('%s %s %s' % (dir_name, root, output_suffix))

                depths = numpy.array(read_unstranded_userplot(full_name))
                ambig_depths = numpy.array(read_unstranded_userplot(full_ambig_name))
                expect = expected_depth(root, seq, depths, ambig_depths, radius)

                write_unstranded_userplot(stem + '-expected' + output_suffix, expect)

                # Scale by the median so corrected depths stay comparable
                # in magnitude to the expected ones.
                scale = numpy.median(expect)

                corrected = depths / expect * scale
                corrected[expect <= 5.0] = 0.0
                write_unstranded_userplot(stem + '-corrected' + output_suffix, corrected)

                ambig_corrected = ambig_depths / expect * scale
                ambig_corrected[expect <= 0.0] = 0.0
                write_unstranded_userplot(ambig_stem + '-corrected' + output_suffix, ambig_corrected)
Exemple #10
0
def plot(args):
    """Display line plots of numeric files (one number per line).

    Options:
        --log   use a logarithmic y axis (default off)

    Each remaining argument names a file, optionally followed by '~~' and a
    legend label; without '~~' the file name is the label. The x axis is the
    1-based line index. When several files are given a legend is added and
    the y limits are derived from the overall maximum value.
    """
    log_it, args = grace.get_option_value(args, '--log', grace.as_bool, False)

    grace.expect_no_further_options(args)

    import numpy
    import pylab

    pylab.rcParams['axes.formatter.limits'] = [ -20, 20 ]

    pylab.figure(figsize=(10,4))

    maximum = 0
    for filename in args:
        parts = filename.split('~~', 1)
        data = [ ]
        f = open(parts[0],'rb')
        try:
            # try/finally so a malformed line cannot leak the open handle.
            for line in f:
                data.append(float(line.strip()))
        finally:
            f.close()

        data = numpy.array(data)
        positions = numpy.arange(1, len(data)+1)

        maximum = max(maximum, numpy.maximum.reduce(data))

        if log_it:
            pylab.semilogy(positions, data, label=parts[-1])
        else:
            pylab.plot(positions, data, label=parts[-1])

    if len(args) > 1:
        pylab.legend()

        if log_it:
            pylab.ylim( (1, maximum**1.2) )
        else:
            pylab.ylim( (0, maximum*1.2) )

    pylab.show()
Exemple #11
0
def old_main(args):
    """Count or list consensus differences for every two-way split of strains.

    Options (all ints used as flags):
        --indels     include insertions/deletions (default 1)
        --reference  include the reference as a strain (default 1)
        --list       list each difference instead of printing counts
                     (default 0)
        --fasta      hand the differences to do_fasta_output and stop
                     (default 0); this forces --indels off

    The remaining arguments are working directories. 'reference.fa' is read
    from the first one, and '<sequence>-evidence.txt' from each. From every
    evidence line, tab-separated field 5 is used as the substitution call,
    field 4 as the insertion call, field 2 as the substitution evidence and
    field 1 as the insertion evidence.

    Each item returned by find_interesting is unpacked as
    (refname, position, change_type, values, has_ambiguous, evidence), with
    one entry in `values` per name. An item counts toward a partition when
    it has no ambiguity and all names on each side share one value.

    Returns 1 on a usage error or an old-style evidence file, otherwise 0.
    """
    use_indels, args = grace.get_option_value(args, '--indels', int, 1)
    use_reference, args = grace.get_option_value(args, '--reference', int, 1)
    make_list, args = grace.get_option_value(args, '--list', int, 0)
    fasta_output, args = grace.get_option_value(args, '--fasta', int, 0)
    grace.expect_no_further_options(args)

    if len(args) < 1:
        sys.stderr.write(USAGE)
        return 1

    if fasta_output and use_indels:
        sys.stderr.write('Indels will not be included in FASTA output\n')
        use_indels = 0

    working_dirs = args

    names = ['reference'] + working_dirs

    # ref_name -> one per-position list for each entry of `names`, in order.
    # Entry 0 is derived from the reference sequence itself.
    substitution_calls = {}
    insertion_calls = {}
    substitution_evidence = {}
    insertion_evidence = {}

    for name, sequence in io.read_sequences(os.path.join(working_dirs[0], 'reference.fa')):
        substitution_calls[name] = [list(sequence.upper())]
        insertion_calls[name] = [['-'] * len(sequence)]
        substitution_evidence[name] = [[''] * len(sequence)]
        insertion_evidence[name] = [[''] * len(sequence)]

    for working_dir in working_dirs:
        for name in substitution_calls:
            filename = os.path.join(working_dir, grace.filesystem_friendly_name(name) + '-evidence.txt')

            this_substitution_calls = []
            this_insertion_calls = []
            this_substitution_evidence = []
            this_insertion_evidence = []

            f = open(filename, 'rb')
            try:
                # Closed in "finally" so the early return cannot leak it.
                header = f.readline()
                if header.count('\t') != 5:
                    sys.stderr.write('Old style evidence file. Please re-run nesoni consensus.\n')
                    return 1

                for line in f:
                    fields = line.rstrip('\n').split('\t')
                    this_substitution_calls.append(fields[5])
                    this_insertion_calls.append(fields[4])
                    this_substitution_evidence.append(fields[2])
                    this_insertion_evidence.append(fields[1])
            finally:
                f.close()

            substitution_calls[name].append(this_substitution_calls)
            insertion_calls[name].append(this_insertion_calls)
            substitution_evidence[name].append(this_substitution_evidence)
            insertion_evidence[name].append(this_insertion_evidence)

    if not use_reference:
        # Drop the reference entry everywhere so indices still match names.
        names.pop(0)
        for name in substitution_calls:
            substitution_calls[name].pop(0)
            insertion_calls[name].pop(0)
            substitution_evidence[name].pop(0)
            insertion_evidence[name].pop(0)

    interesting = find_interesting('substitution', substitution_calls, substitution_evidence)
    if use_indels:
        interesting.extend(find_interesting('insertion-before', insertion_calls, insertion_evidence))
    else:
        # Without indels, discard any site where some strain has a deletion.
        interesting = [item for item in interesting if '-' not in item[3]]

    interesting.sort()

    if fasta_output:
        do_fasta_output(names, interesting)
        return 0

    total_differences = 0

    if make_list:
        print('\t'.join(['Partition', 'Sequence', 'Position in reference', 'Change type'] + names + names))

    # Each odd bitmask below 2**len(names) - 1 describes one split of the
    # names into two non-empty groups; bit j set puts names[j] in set1.
    # Using only odd masks keeps names[0] in set1, so no split is repeated.
    for i in xrange(1, (1 << len(names)) - 1, 2):
        set1 = []
        set2 = []
        for j in xrange(len(names)):
            if i & (1 << j):
                set1.append(j)
            else:
                set2.append(j)

        partition_label = (', '.join(names[k] for k in set1) + '   vs   ' +
                           ', '.join(names[k] for k in set2))

        if make_list:
            print('')
            print(partition_label)
            print('')

        n = 0
        for refname, position, change_type, values, has_ambiguous, evidence in interesting:
            # Skip if *any* ambiguity
            if has_ambiguous:
                continue

            # Each side of the split must agree within itself.
            if any(values[k] != values[set1[0]] for k in set1[1:]) or \
               any(values[k] != values[set2[0]] for k in set2[1:]):
                continue

            if make_list:
                if change_type == 'substitution' and '-' in values:
                    change_type = 'deletion'
                print(_old_main_row(refname, position, change_type, values, evidence))

            n += 1

        total_differences += n

        if not make_list:
            print(partition_label + ': %d differences' % n)

    if not make_list:
        print('')
        print('Total: %d' % total_differences)

    if make_list:
        print('')
        print('Ignored')
        print('')

    n_multiway = 0
    n_ambiguous = 0
    for refname, position, change_type, values, has_ambiguous, evidence in interesting:
        confusing = False
        if has_ambiguous:
            n_ambiguous += 1
            confusing = True
        elif len(set(values)) > 2:
            n_multiway += 1
            confusing = True

        if make_list and confusing:
            print(_old_main_row(refname, position, change_type, values, evidence))

    if not make_list:
        print('')
        print('Ambiguities ignored: %d' % n_ambiguous)
        print('Multi-way changes ignored: %d' % n_multiway)

    # Internal consistency check: every interesting site is either counted
    # in exactly one partition or ignored.
    assert total_differences + n_ambiguous + n_multiway == len(interesting)

    return 0


def _old_main_row(refname, position, change_type, values, evidence):
    """Format one tab-separated listing row (empty leading Partition cell)."""
    return ('\t%s\t%d\t%s\t' % (refname, position, change_type) +
            '\t'.join(values) + '\t' + '\t'.join(evidence))
Exemple #12
0
def report_main(args):
    """Write an HTML report (index.html) plus a zip of supporting files.

    Options:
        --title TEXT           default 'Report'
        --short NAME           name of the zip and its staging
                               sub-directory (default 'files')
        --show-refalign BOOL   include the 'Reference alignment' section
                               (default True)

    The first remaining argument is the output directory, created if it is
    missing. The rest is passed to grace.execute with the handlers
    reference, clips, aligns, extra, file and count_log, which collect their
    arguments into lists.

    Reference files and per-alignment reports/userplots are staged in
    <output_dir>/<short_name>, zipped to <output_dir>/<short_name>.zip, and
    the staging directory is then emptied and removed. Files given to
    'file' are copied directly into the output directory.

    Raises grace.Error if the zip command fails.

    NOTE: HEAD, TAIL and several strings here are filled from locals(), so
    the local variable names in this function are part of the template
    interface -- do not rename them.
    """
    title, args = grace.get_option_value(args, '--title', str, 'Report')
    short_name, args = grace.get_option_value(args, '--short', str, 'files')
    show_refalign, args = grace.get_option_value(args, '--show-refalign', grace.as_bool, True)

    output_dir, args = args[0], args[1:]

    reference_filenames = [ ]
    clip_filenames = [ ]
    align_dirs = [ ]
    count_log_filenames = [ ]
    extra_items = [ ]
    extra_files = [ ]
    # The handlers are passed to grace.execute as a list of functions;
    # presumably their names are the command words, so `file` keeps its
    # name even though it shadows the builtin -- confirm in grace.execute.
    def file(args): extra_files.append((args[0], ' '.join(args[1:])))
    def extra(args): extra_items.extend(args)
    def reference(args): reference_filenames.extend(args)
    def clips(args): clip_filenames.extend(args)
    def aligns(args): align_dirs.extend(args)
    def count_log(args): count_log_filenames.extend(args)

    grace.execute(args, [reference, clips, aligns, extra, file, count_log])

    if not os.path.isdir(output_dir):
        os.mkdir(output_dir)

    # Start from an empty staging directory.
    file_dir = join(output_dir, short_name)
    if not os.path.isdir(file_dir): os.mkdir(file_dir)
    for item in os.listdir(file_dir):
        os.unlink(join(file_dir, item))

    for filename in reference_filenames:
        io.copy_file(filename, join(file_dir, os.path.basename(filename)))

    for filename, desc in extra_files:
        io.copy_file(filename, join(output_dir, os.path.basename(filename)))

    pairs = False
    for directory in align_dirs:
        name = os.path.basename(directory)
        io.copy_file(join(directory,'report.txt'), join(file_dir, name + '-report.txt'))
        for extension in [
            '-depth.userplot',
            '-ambiguous-depth.userplot',
            '-pairspan-depth.userplot',
            '-ambiguous-pairspan-depth.userplot',
        ]:
            # Exclude names that only match because a longer extension
            # ends with this one.
            filenames = [ item for item in os.listdir(directory)
                          if item.endswith(extension)
                          and not item.endswith('-ambiguous'+extension)
                          and not item.endswith('-pairspan'+extension) ]
            for filename in filenames:
                if len(filenames) == 1:
                    dest = name + extension
                else:
                    dest = name + '-' + filename
                io.copy_file(join(directory,filename), join(file_dir, dest))

                if 'pairspan' in extension: pairs = True

    today = datetime.date.today().strftime('%e %B %Y')

    f = open(join(output_dir, 'index.html'),'wb')
    try:
        print >> f, HEAD % locals()

        section(f, 'Results')

        for item in extra_items:
            p(f, item)

        for filename, desc in extra_files:
            name = os.path.basename(filename)
            p(f, '<a href="%(name)s">%(name)s</a> - %(desc)s' % locals())

        p(f, '<a href="%(short_name)s.zip">%(short_name)s.zip</a>' % locals())

        for filename in reference_filenames:
            bullet(f, os.path.basename(filename) + ' - reference')

        bullet(f, '...-report.txt - report on SNPs and indels found')

        p(f,'Different kinds of userplot:')
        bullet(f,'...-depth.userplot - depth of coverage of unambiguously aligned reads')
        bullet(f,'...-ambiguous-depth.userplot - depth of coverage, including reads that hit multiple locations')
        if pairs:
            bullet(f,'...-pairspan-depth.userplot - depth, including the space between reads in read-pairs')
            bullet(f,'...-ambiguous-pairspan-depth.userplot - as above, but including reads that hit multiple locations')

        if clip_filenames:
            section(f, 'Read clipping')

            for filename in clip_filenames:
                assert filename.endswith('_log.txt')
                name = os.path.basename(filename[:-8])
                text = extract(filename, lambda line: line.startswith('Fragments:') or line.startswith('Single reads') or line.startswith('Pairs'))
                subsection(f, name)
                pre(f, text)
                end_subsection(f)

        if count_log_filenames:
            section(f, 'Counting alignments to genes')

            for filename in count_log_filenames:
                log_file = open(filename,'rb')
                try:
                    pre(f, log_file.read())
                finally:
                    log_file.close()

        if align_dirs and show_refalign:
            section(f, 'Reference alignment')
            for directory in align_dirs:
                name = os.path.basename(directory)
                text = extract(join(directory, 'consensus_log.txt'),
                               lambda line: 'reads/pairs' in line or 'unmapped' in line)
                text = text.replace('(discarded)','')
                text = text.replace('reads/pairs kept', 'aligned unambiguously')
                subsection(f, name)
                pre(f, text)
                end_subsection(f)

        print >> f, TAIL % locals()
    finally:
        f.close()

    zip_filename = join(output_dir, short_name + '.zip')
    if os.path.exists(zip_filename): os.unlink(zip_filename)

    # Run the command outside of any assert: under "python -O" an asserted
    # os.system() call would never execute and no zip would be produced.
    # "&&" stops zip from running in the wrong directory if cd fails.
    # NOTE(review): output_dir and short_name are interpolated into a shell
    # command unquoted; names containing spaces or shell metacharacters
    # will break or be interpreted by the shell.
    status = os.system('cd %(output_dir)s && zip %(short_name)s.zip %(short_name)s/* ' % locals())
    if status != 0:
        raise grace.Error('Failed to create %s' % zip_filename)

    # The staged copies now live in the zip; remove the staging directory.
    for item in os.listdir(file_dir):
        os.unlink(join(file_dir, item))
    os.rmdir(file_dir)
Exemple #13
0
def main(args):
    """Align reads to reference sequences by running SHRiMP in batches.

    Flags: --solid (use 'rmapper-cs' instead of 'rmapper-ls'), --verbose
    (do not discard the child's stderr).
    Options: --threshold (string, default '68%', passed to SHRiMP as -h),
    --stride (int, default 1; only recorded in config.txt here), --cpus
    (int, default grace.how_many_cpus(); maximum number of outstanding
    SHRiMP processes), --batch-size (int, default 5000000; bases per batch).

    Positional arguments before the first command are the output directory
    followed by reference files. The commands 'reads'/'--reads', 'pairs'
    and 'shrimp-options'/'--shrimp-options' add read files, a read-file
    pair, and extra SHRiMP arguments respectively.

    Files written in the output directory: reference.fa, config.txt,
    shrimp_hits.txt.gz and unmapped.fa.gz. If anything fails part-way, the
    two gzip outputs and any temporary batch files are deleted.

    Returns 1 after printing USAGE when no output directory is given,
    otherwise 0. Several preconditions are checked with assert.
    """
    # Presumably verifies that SHRiMP version 1 is available -- see grace.
    grace.require_shrimp_1()

    n_cpus = grace.how_many_cpus()

    solid, args = grace.get_flag(args, '--solid')
    verbose, args = grace.get_flag(args, '--verbose')

    threshold, args = grace.get_option_value(args, '--threshold', str, '68%')

    stride, args = grace.get_option_value(args, '--stride', int, 1)
    max_shrimps, args = grace.get_option_value(args, '--cpus', int, n_cpus)
    batch_size, args = grace.get_option_value(args, '--batch-size', int,
                                              5000000)

    input_reference_filenames = []
    # List of lists: one-element lists for single read files, two-element
    # lists for pairs.
    reads_filenames = []

    # SHRiMP receives the threshold exactly as typed. The converted value
    # below (percent -> negative fraction, otherwise int) is only stored in
    # config.txt as far as this function is concerned.
    shrimp_options = ['-h', threshold]
    if threshold.endswith('%'):
        threshold = -float(threshold[:-1]) / 100.0
    else:
        threshold = int(threshold)

    output_dir = []  #As list so can write to from function. Gah.

    def front_command(args):
        # Handles the arguments that precede the first command word:
        # output directory first, then reference files.
        grace.expect_no_further_options(args)

        if len(args) < 1:
            return

        output_dir.append(args[0])
        input_reference_filenames.extend(
            [os.path.abspath(filename) for filename in args[1:]])

    def reads_command(args):
        # Each file becomes its own single-element read set.
        grace.expect_no_further_options(args)
        reads_filenames.extend([[os.path.abspath(filename)]
                                for filename in args])

    def pairs_command(args):
        # The two files are kept together as one read set.
        grace.expect_no_further_options(args)
        assert len(args) == 2, 'Expected exactly two files in "pairs"'
        reads_filenames.append(
            [os.path.abspath(filename) for filename in args])

    def shrimp_options_command(args):
        # Extra arguments are appended verbatim to the SHRiMP command line.
        shrimp_options.extend(args)

    grace.execute(
        args, {
            'reads': reads_command,
            '--reads': reads_command,
            'pairs': pairs_command,
            'shrimp-options': shrimp_options_command,
            '--shrimp-options': shrimp_options_command,
        }, front_command)

    if not output_dir:
        print >> sys.stderr, USAGE % n_cpus
        return 1

    output_dir = output_dir[0]

    # NOTE(review): these input checks use assert, so they disappear under
    # "python -O".
    assert input_reference_filenames, 'No reference files given'
    assert reads_filenames, 'No read files given'

    # chain(...) flattens the read sets so every individual file is checked.
    for filename in itertools.chain(input_reference_filenames,
                                    *reads_filenames):
        assert os.path.exists(filename), '%s does not exist' % filename

    if not os.path.isdir(output_dir):
        os.mkdir(output_dir)

    if solid:
        shrimp = 'rmapper-cs'
    else:
        shrimp = 'rmapper-ls'

    # Merge all reference files into a single reference.fa, counting
    # sequences and bases along the way.
    reference_filename = os.path.join(output_dir, 'reference.fa')
    reference_file = open(reference_filename, 'wb')
    total_reference_sequences = 0
    total_reference_bases = 0
    for input_reference_filename in input_reference_filenames:
        for name, sequence in io.read_sequences(input_reference_filename):
            #Don't retain any comment
            name = name.split()[0]
            io.write_fasta(reference_file, name, sequence)

            total_reference_sequences += 1
            total_reference_bases += len(sequence)

    reference_file.close()

    print '%s base%s in %s reference sequence%s' % (
        grace.pretty_number(total_reference_bases),
        's' if total_reference_bases != 1 else '',
        grace.pretty_number(total_reference_sequences),
        's' if total_reference_sequences != 1 else '')

    assert total_reference_bases, 'Reference sequence file is empty'

    # Record the run parameters; iter_reads() below also takes this dict.
    config = {
        'references': input_reference_filenames,
        'reads': reads_filenames,
        'stride': stride,
        'solid': solid,
        'threshold': threshold,
    }
    config_file = open(os.path.join(output_dir, 'config.txt'), 'wb')
    pprint.pprint(config, config_file)
    config_file.close()

    output_filename = os.path.join(output_dir, 'shrimp_hits.txt.gz')
    output_file = gzip.open(output_filename, 'wb')

    unmapped_filename = os.path.join(output_dir, 'unmapped.fa.gz')
    unmapped_file = gzip.open(unmapped_filename, 'wb')

    # Files to delete if we do not finish cleanly. Entries are removed from
    # the set once the corresponding file is complete (outputs) or has
    # already been deleted (temporaries).
    dirty_filenames = set()
    dirty_filenames.add(output_filename)
    dirty_filenames.add(unmapped_filename)

    #warn_low_threshold = True

    try:  #Cleanup temporary files

        # Batch counter, held in a list so the nested function can update it.
        N = [0]

        def do_shrimp(read_set):
            # Start SHRiMP on one batch without waiting for it, and return
            # a function that waits for the child and merges its output.
            my_number = N[0]
            N[0] += 1

            tempname = os.path.join(output_dir,
                                    'temp%d-%d.fa' % (os.getpid(), my_number))
            tempname_out = os.path.join(
                output_dir, 'temp%d-%d.txt' % (os.getpid(), my_number))

            dirty_filenames.add(tempname)
            dirty_filenames.add(tempname_out)

            f = open(tempname, 'wb')
            for read_name, read_seq in read_set:
                print >> f, '>' + read_name
                print >> f, read_seq
            f.close()

            # NOTE(review): the command is assembled by string concatenation
            # and run through /bin/sh; file names and shrimp options are not
            # quoted.
            command = shrimp + ' ' + ' '.join(shrimp_options) + ' ' + \
                      tempname + ' ' + reference_filename + ' >' + tempname_out
            if not verbose:
                command += ' 2>/dev/null'
            #f = os.popen(command, 'r')
            child_pid = os.spawnl(os.P_NOWAIT, '/bin/sh', '/bin/sh', '-c',
                                  command)

            #print 'SHRiMP %d running' % my_number

            def finalize():
                # Blocks until this batch's child process has exited.
                exit_status = os.waitpid(child_pid, 0)[1]
                assert exit_status == 0, 'Shrimp indicated an error'

                hits = {}  # read_name -> [ hit line ]

                f = open(tempname_out, 'rb')
                for line in f:
                    # Only lines starting with '>' are kept as hits; the
                    # read name is the first word without the '>'.
                    if line.startswith('>'):
                        read_name = line.split(None, 1)[0][1:]
                        if read_name not in hits:
                            hits[read_name] = []
                        hits[read_name].append(line)
                f.close()

                # Emit results in the original read order; reads without
                # any hit go to the unmapped file instead.
                for read_name, read_seq in read_set:
                    if read_name in hits:
                        for hit in hits[read_name]:
                            output_file.write(hit)
                    else:
                        print >> unmapped_file, '>' + read_name
                        print >> unmapped_file, read_seq

                output_file.flush()
                unmapped_file.flush()

                os.unlink(tempname)
                dirty_filenames.remove(tempname)
                os.unlink(tempname_out)
                dirty_filenames.remove(tempname_out)
                #print 'SHRiMP %d finished' % my_number

            return finalize

        # Pending finalize functions, oldest first.
        shrimps = []

        reader = iter_reads(config)
        read_count = 0

        while True:
            read_set = []
            read_set_bases = 0

            #Read name should not include comment cruft
            # - SHRIMP passes this through
            # - might stuff up identification of pairs

            # Take reads from the shared iterator until the batch holds at
            # least batch_size bases (or the reads run out).
            for read_name, read_seq in reader:
                read_name = read_name.split()[0]
                read_set.append((read_name, read_seq))
                read_set_bases += len(read_seq)

                #if warn_low_threshold and len(read_seq)*7 < threshold: #Require 70% exact match
                #    sys.stderr.write('\n*** WARNING: Short reads, consider reducing --threshold ***\n\n')
                #    warn_low_threshold = False

                read_count += 1
                if read_set_bases >= batch_size: break

            if not read_set: break

            # Keep at most max_shrimps batches outstanding: wait for the
            # oldest one before starting another.
            if len(shrimps) >= max_shrimps:
                shrimps.pop(0)()
            shrimps.append(do_shrimp(read_set))

            grace.status('SHRiMPing %s' % grace.pretty_number(read_count))

        # Finalize the remaining batches in the order they were started.
        while shrimps:
            grace.status('Waiting for SHRiMPs to finish %d ' % len(shrimps))
            shrimps.pop(0)()

        grace.status('')

        # Success: the outputs are complete, so take them off the
        # delete-on-exit list.
        output_file.close()
        dirty_filenames.remove(output_filename)
        unmapped_file.close()
        dirty_filenames.remove(unmapped_filename)

        return 0

    finally:
        # Remove whatever is still marked dirty. After a clean run the set
        # is empty. NOTE(review): on failure the gzip handles are not closed
        # here and already-started child processes are not waited for.
        for filename in dirty_filenames:
            if os.path.exists(filename):
                os.unlink(filename)
Exemple #14
0
def main(args):
    # Report how each CDS annotated in a GenBank file differs between a
    # reference sequence and the sequence it has been aligned against.
    #
    # args: command-line arguments. The options --transl_table,
    #   --use-coverage, --coverage-cutoff, --tabular, --noheader, --verbose
    #   and --band are consumed first; exactly two positional arguments must
    #   remain: the GenBank filename and the alignment filename (or a
    #   directory, in which case 'alignment.maf' inside it is used).
    # Returns 1 after printing USAGE when the positional argument count is
    #   wrong, otherwise 0 once every record has been processed.
    # Raises grace.Error when a GenBank record's sequence is not equal to the
    #   first sequence of any loaded alignment.
    # Output goes to stdout via print: an optional header line, then one
    #   tab-separated line per reported CDS, followed by indented note lines
    #   (or, with --tabular, the notes joined onto the same line).
    default_transl_table, args = grace.get_option_value(
        args, '--transl_table', int, 11)
    use_coverage, args = grace.get_flag(args, '--use-coverage')
    coverage_cutoff, args = grace.get_option_value(args, '--coverage-cutoff',
                                                   float, 0.1)
    tabular, args = grace.get_flag(args, '--tabular')
    noheader, args = grace.get_flag(args, '--noheader')
    verbose, args = grace.get_flag(args, '--verbose')
    bandwidth, args = grace.get_option_value(args, '--band', int, 20)
    grace.expect_no_further_options(args)

    if len(args) != 2:
        print USAGE
        return 1

    genbank_filename = args[0]
    alignment_filename = args[1]

    # A directory may be given in place of the alignment file itself
    if os.path.isdir(alignment_filename):
        alignment_filename = os.path.join(alignment_filename, 'alignment.maf')

    # Coverage graphs (see get_graph calls below) are looked up next to the
    # alignment file
    working_dir = os.path.split(alignment_filename)[0]

    alignments = load_alignments(alignment_filename)

    # NOTE(review): summaries and details are never read again in this function
    summaries = []
    details = []

    if not noheader:
        fields = 'Sequence\tLocus tag\tOld length (aa)\tNew length (aa)\tAmino acid changes\t'
        if use_coverage:
            # The doubled tabs leave an empty header cell over each graphlet
            # column emitted further down
            fields += 'Unambiguous coverage vs expected\t\tAmbiguous coverage vs expected\t\tAmbiguous percent with any hits\t'
        fields += 'Gene\tProduct'
        if tabular: fields += '\tChanges of note'
        print fields

    for record in SeqIO.parse(
            io.open_possibly_compressed_file(genbank_filename), 'genbank'):
        sequence = record.seq.tostring()

        # Find the alignment whose first sequence is exactly this record's
        # sequence; the loop variables (name, seq1, seq2, alignment) stay
        # bound to that match after the break. The else clause runs only when
        # no alignment matched.
        for name, seq1, seq2, alignment in alignments:
            if seq1 == sequence: break
        else:
            raise grace.Error(
                'Genbank record %s sequence not identical to any reference sequence'
                % record.id)

        if use_coverage:
            depth = get_graph(working_dir, name, 'depth')
            ambiguous_depth = get_graph(working_dir, name, 'ambiguous-depth')
            median_depth = numpy.median(depth)
            median_ambiguous_depth = numpy.median(ambiguous_depth)
            # Ratio used below to rescale the expected depth for the
            # ambiguous-depth comparisons
            ambiguous_factor = float(median_ambiguous_depth) / median_depth
            depth_expect = expected_depth(name, sequence, depth,
                                          ambiguous_depth)

        for feature in record.features:
            if feature.type != 'CDS': continue

            # Fall back to 1-based coordinates when the CDS has no locus tag
            if 'locus_tag' not in feature.qualifiers:
                locus_tag = '%d..%d' % (feature.location.nofuzzy_start + 1,
                                        feature.location.nofuzzy_end)
            else:
                locus_tag = feature.qualifiers['locus_tag'][0]

            # A /transl_table qualifier on the feature overrides the
            # --transl_table option
            if 'transl_table' in feature.qualifiers:
                transl_table_no = int(feature.qualifiers['transl_table'][0])
            else:
                assert default_transl_table is not None, 'No /transl_table for CDS, and default transl_table not given'
                transl_table_no = default_transl_table

            transl_table = CodonTable.ambiguous_dna_by_id[transl_table_no]
            start_codons = transl_table.start_codons

            try:
                feature_alignment = alignment_from_feature(sequence, feature)
            except Weird_alignment:
                warn('%s has a location I could not handle, skipping, sorry' %
                     locus_tag)
                continue

            # Build the original and new DNA one feature position at a time.
            # NOTE(review): back_project/project are defined elsewhere;
            # presumably they map feature coordinates to reference coordinates
            # and reference coordinates to the aligned sequence - confirm.
            dna = []
            new_dna = []
            shifts = []
            for i in xrange(feature_alignment.end2):
                p1 = feature_alignment.back_project(i, left=False)
                p2 = feature_alignment.back_project(i + 1, left=True)
                assert abs(p2 - p1) < 2
                dna.append(sequence_slice(sequence, p1, p2))

                p1a = alignment.project(p1, left=False)
                p2a = alignment.project(p2, left=False)  #Hmm

                # Non-zero when the span [p1,p2) and its projection [p1a,p2a)
                # differ in length
                diff = (p2 - p1) - (p2a - p1a)
                #if diff:
                #    if diff%3:
                #        frame_shift = True
                #    else:
                #        frame_preserving_shift = True
                new_dna.append(sequence_slice(seq2, p1a, p2a))

                if diff:
                    # Remember (feature position, old bases, new bases) for
                    # the "Indels" note
                    shifts.append((i, dna[-1], new_dna[-1]))

            dna = ''.join(dna)
            new_dna = ''.join(new_dna)

            # This usually indicated a CDS truncated at the start?
            # in which case, will probably fail some way or other down the line.
            # /codon_start is 1-based in the qualifier; converted here to a
            # 0-based offset
            if 'codon_start' in feature.qualifiers:
                codon_start = int(feature.qualifiers['codon_start'][0]) - 1
            else:
                codon_start = 0
            dna = dna[codon_start:]
            new_dna = new_dna[codon_start:]

            if len(dna) % 3 != 0:
                warn(locus_tag + ' length not a multiple of 3')
            #assert len(new_dna) % 3 == 0

            protein = Seq.Seq(dna).translate(table=transl_table_no).tostring()
            # http://en.wikipedia.org/wiki/Start_codon is always translated to M
            # so the first residue is overwritten unconditionally
            protein = 'M' + protein[1:]

            if dna[:3] not in start_codons:
                warn(locus_tag + ' has unknown start codon: ' + dna[:3])

            original_lacks_stop_codon = not protein.endswith('*')
            if original_lacks_stop_codon:
                warn(locus_tag + ' lacks end codon')
            original_stops_before_end = '*' in protein[:-1]
            if original_stops_before_end:
                warn(locus_tag + ' contains stop codon before end')

            if 'translation' in feature.qualifiers:
                expect = feature.qualifiers['translation'][0]
                # protein[:-1] drops the last residue (the stop, when present)
                # before comparing with the annotated translation
                if protein[:-1] != expect:
                    warn(
                        locus_tag +
                        ' translation given in feature does not match translation from DNA'
                    )

            new_protein = Seq.Seq(new_dna).translate(
                table=transl_table_no).tostring()
            new_protein = 'M' + new_protein[1:]

            # If end codon changed, find new end
            # Don't bother if there are unknown amino acids or
            # the original protein lacks a stop codon
            if 'X' not in new_protein and '*' not in new_protein and not original_lacks_stop_codon:
                #This is very inefficient
                # (all of new_dna is re-translated after every appended slice)
                i = feature_alignment.end2
                while True:
                    p1 = feature_alignment.back_project(i, left=False)
                    p2 = feature_alignment.back_project(i + 1, left=True)
                    p1a = alignment.project(p1, left=False)
                    p2a = alignment.project(p2, left=False)  #Hmm
                    # Stop extending once the projection leaves seq2
                    if p1a < 0 or p2a < 0 or p1a > len(seq2) or p2a > len(
                            seq2):
                        break

                    new_dna += sequence_slice(seq2, p1a, p2a)
                    new_protein = Seq.Seq(new_dna).translate(
                        table=transl_table_no).tostring()
                    new_protein = 'M' + new_protein[1:]
                    if 'X' in new_protein or '*' in new_protein: break

                    i += 1

            # Is the protein shorter?
            # Don't bother checking if the original protein has extra stop codons
            if '*' in new_protein and not original_stops_before_end:
                # Keep everything up to and including the first stop
                new_protein = new_protein[:new_protein.index('*') + 1]

            # If indels occurred, do an alignment
            # Don't bother otherwise
            if shifts:
                # Penalize gaps with cost 2 (vs 1 for mismatch)
                # If lengths don't match, pad with spaces (won't match longer seq),
                # aligner prefers mismatch to gaps

                #result = pairwise2.align.globalxs(protein      + ' '*max(0,len(new_protein)-len(protein)),
                #                                  new_protein  + ' '*max(0,len(protein)-len(new_protein)),
                #                                  -2.001,-2.000)[0]
                # 2.001 : very slightly prefer contiguous gaps. Also much faster!

                result = band_limited_align(
                    protein + ' ' * max(0,
                                        len(new_protein) - len(protein)),
                    new_protein + ' ' * max(0,
                                            len(protein) - len(new_protein)),
                    bandwidth)

                protein_ali = result[0]
                new_protein_ali = result[1]
            else:
                protein_ali = protein
                new_protein_ali = new_protein

            # Collect differing alignment columns as (i, j, k): i is the
            # column, j and k count the non-gap characters seen so far in the
            # old and new protein respectively. Columns where either side is
            # padding (' ') are not reported.
            diffs = []
            j = 0
            k = 0
            for i in xrange(min(len(new_protein_ali), len(protein_ali))):
                if protein_ali[i] != ' ' and new_protein_ali[i] != ' ' and (
                        protein_ali[i] == '-' or new_protein_ali[i] == '-'
                        or not bio.might_be_same_amino(protein_ali[i],
                                                       new_protein_ali[i])):
                    diffs.append((i, j, k))
                if protein_ali[i] != '-':
                    j += 1
                if new_protein_ali[i] != '-':
                    k += 1

            # True when any of the first three bases may differ.
            # NOTE(review): indexes [0..2] directly, so this raises IndexError
            # if dna or new_dna holds fewer than three bases.
            diff_start = not bio.might_be_same_base(new_dna[0],dna[0]) or \
                         not bio.might_be_same_base(new_dna[1],dna[1]) or \
                         not bio.might_be_same_base(new_dna[2],dna[2])

            interesting_coverage = False
            if use_coverage:
                # Slice each per-position graph to the feature's extent and
                # reverse it when the feature is not on the forward strand
                cds_depth = depth[feature_alignment.start1:
                                  feature_alignment.end1]  #/ median_depth
                if not feature_alignment.forward1: cds_depth = cds_depth[::-1]
                cds_ambiguous_depth = ambiguous_depth[
                    feature_alignment.start1:
                    feature_alignment.end1]  #/ median_ambiguous_depth
                if not feature_alignment.forward1:
                    cds_ambiguous_depth = cds_ambiguous_depth[::-1]

                cds_depth_expect = depth_expect[feature_alignment.
                                                start1:feature_alignment.end1]
                if not feature_alignment.forward1:
                    cds_depth_expect = cds_depth_expect[::-1]

                #cds_average_depth_ratio = numpy.average(depth[feature_alignment.start1:feature_alignment.end1]) / median_depth
                #cds_average_ambiguous_depth_ratio = numpy.average(ambiguous_depth[feature_alignment.start1:feature_alignment.end1]) / median_ambiguous_depth
                #line += '%.1f\t' % cds_average_depth_ratio
                #line += '%.1f\t' % cds_average_ambiguous_depth_ratio

                #line += '%.1f..%.1f\t' % (numpy.minimum.reduce(cds_depth)/median_depth, numpy.maximum.reduce(cds_depth)/median_depth)
                #line += '%.1f+/-%.1f\t' % (numpy.average(cds_depth)/median_depth, numpy.var(cds_depth)**0.5/median_depth)
                #line += '%.1f..%.1f\t' % (numpy.minimum.reduce(cds_ambiguous_depth)/median_ambiguous_depth, numpy.maximum.reduce(cds_ambiguous_depth)/median_ambiguous_depth)

                # cds_avg_depth / cds_avg_ambiguous_depth are only assigned
                # when avg_expect is positive; the output code below checks
                # the same condition before reading them
                avg_expect = numpy.average(cds_depth_expect)
                if avg_expect > 0.0:
                    cds_avg_depth = numpy.average(cds_depth) / avg_expect
                    cds_avg_ambiguous_depth = numpy.average(
                        cds_ambiguous_depth) / avg_expect / ambiguous_factor

                # Positions whose depth is at least 1.5x the expectation, or
                # whose ambiguous depth is at most half the rescaled
                # expectation. (Presumably an elementwise boolean numpy
                # array - confirm the type returned by get_graph.)
                strange = ((cds_depth >= cds_depth_expect * 1.5) |
                           (cds_ambiguous_depth <= cds_depth_expect *
                            (0.5 * ambiguous_factor)))

                # Flag the CDS when the fraction of such positions reaches
                # --coverage-cutoff
                interesting_coverage = numpy.average(
                    strange) >= coverage_cutoff

            # Only CDSs with something to report produce output
            if interesting_coverage or diffs or diff_start or shifts or len(
                    new_protein) != len(protein):
                # Lengths are reported as len-1 (matches the header's "(aa)"
                # columns; presumably to exclude the trailing stop)
                line = name + '\t' + locus_tag + '\t' + \
                      '%d\t' % (len(protein)-1) + \
                      '%d\t' % (len(new_protein)-1) + \
                      '%d\t' % len(diffs)

                if use_coverage:
                    if avg_expect <= 0.0:
                        line += '\t\t\t'
                    else:
                        line += '%.1f\t' % (cds_avg_depth) + graphlet(
                            cds_depth, cds_depth_expect) + '\t'
                        line += '%.1f\t' % (
                            cds_avg_ambiguous_depth) + graphlet(
                                cds_ambiguous_depth,
                                cds_depth_expect * ambiguous_factor) + '\t'
                        line += '%.1f%%\t' % (
                            numpy.average(cds_ambiguous_depth > 0.0) * 100.0)

                line += '%s\t' % feature.qualifiers.get('gene',[''])[0] + \
                        '%s' % feature.qualifiers.get('product',[''])[0]

                # Human-readable notes; top-level notes start with a backslash
                # and a space, detail lines are indented with spaces
                notes = []

                if use_coverage and 'X' in new_protein:
                    xs = new_protein.count('X')
                    if xs == len(new_protein) - 1:  #First is M, so len-1
                        notes.append('\ No consensus')
                    else:
                        notes.append('\ No consensus for %d aa' %
                                     (new_protein.count('X')))

                if len(new_protein) < len(protein):
                    notes.append('\ Shorter by %d aa' %
                                 (len(protein) - len(new_protein)))

                if len(new_protein) > len(protein):
                    notes.append('\ Longer by %d aa' %
                                 (len(new_protein) - len(protein)))

                if diff_start:
                    notes.append('\ Start changed: %s -> %s' %
                                 (dna[:3], new_dna[:3]))
                    if new_dna[:3] not in start_codons:
                        notes.append('  No longer a start codon!')

                if shifts:
                    notes.append('\ Indels:')

                    for pos, old, new in shifts:
                        notes.append('    base %5d / codon %5d   %s -> %s' %
                                     (pos + 1,
                                      (pos // 3) + 1, old, new or '-'))

                # Individual amino acid changes are listed only with --verbose;
                # the count is always present in the main line
                if diffs:
                    if verbose:
                        notes.append('\ Amino acid changes:')
                        for i, j, k in diffs:
                            notes.append(
                                '    codon %5d   %s->%s   (%s->%s)' %
                                (j + 1, protein_ali[i], new_protein_ali[i],
                                 dna[j * 3:j * 3 + 3] if protein_ali[i] != '-'
                                 else '-', new_dna[k * 3:k * 3 + 3]
                                 if new_protein_ali[i] != '-' else '-'))

                #if len(new_protein) > len(protein):
                #    print 'New protein is longer:', new_protein[len(protein):]
                #if len(new_protein) < len(protein):
                #    print 'New protein is shorter:', protein[len(new_protein):]
                #print protein
                #print new_protein

                if tabular:
                    # Collapse runs of whitespace in every note and append
                    # them all as one extra column
                    print line + '\t' + ' '.join(
                        [' '.join(note.strip().split()) for note in notes])
                else:
                    print line
                    for note in notes:
                        print '\t' + note
    return 0
Exemple #15
0
def main(args):
    title1, args = grace.get_option_value(args, "--title1", str, None)
    title2, args = grace.get_option_value(args, "--title2", str, None)
    grace.expect_no_further_options(args)

    if len(args) != 3:
        print >> sys.stderr, USAGE
        return 1

    working_dir1 = args[0]
    working_dir2 = args[1]
    cutoff = float(args[2])

    sequence_names = [name for name, sequence in io.read_sequences(os.path.join(working_dir1, "reference.fa"))]

    if title1 is None:
        title1 = working_dir1
    if title2 is None:
        title2 = working_dir2

    n = 1
    while significance([("A", n)], [("T", n)], 1.0) > cutoff:
        n += 1

    print "%g\tsignificance cutoff" % cutoff
    print "%d\tdepth required to call substitution (greater if there are errors in the reads)" % n

    print "Sequence\tPosition in reference\tChange type\tReference\t%s\t%s\tp-value (no correction for multiple testing)\t%s\t%s" % (
        title1,
        title2,
        title1,
        title2,
    )

    for sequence_name in sequence_names:
        filename1 = os.path.join(working_dir1, grace.filesystem_friendly_name(sequence_name) + "-evidence.txt")
        filename2 = os.path.join(working_dir2, grace.filesystem_friendly_name(sequence_name) + "-evidence.txt")

        for (pos1, ins1, sub1, ref1, conins1, consub1), (pos2, ins2, sub2, ref2, conins2, consub2) in itertools.izip(
            read_file(filename1), read_file(filename2)
        ):
            assert pos1 == pos2 and ref1 == ref2

            if pos1 % 1000 == 0:
                grace.status("Testing %s %d" % (sequence_name, pos1))

            dec_ins1 = io.decode_evidence(ins1)
            dec_ins2 = io.decode_evidence(ins2)
            if dec_ins1 and dec_ins2:
                sig = significance(io.decode_evidence(ins1), io.decode_evidence(ins2), cutoff)
                if sig is not None and sig <= cutoff:
                    grace.status("")
                    print "%s\t%d\t%s\t\t%s\t%s\t%g\t%s\t%s" % (
                        sequence_name,
                        pos1,
                        "insertion-before",
                        ins1,
                        ins2,
                        sig,
                        conins1,
                        conins2,
                    )

            dec_sub1 = io.decode_evidence(sub1)
            dec_sub2 = io.decode_evidence(sub2)
            if dec_sub1 and dec_sub2:
                sig = significance(dec_sub1, dec_sub2, cutoff)
                if sig is not None and sig <= cutoff:
                    if dec_sub1[0][0] == "-" or dec_sub2[0][0] == "-":
                        what = "deletion"
                    elif dec_sub1[0][0] != dec_sub2[0][0]:
                        what = "substitution"
                    else:
                        what = "different mix"
                    grace.status("")
                    print "%s\t%d\t%s\t%s\t%s\t%s\t%g\t%s\t%s" % (
                        sequence_name,
                        pos1,
                        what,
                        ref1,
                        sub1,
                        sub2,
                        sig,
                        consub1,
                        consub2,
                    )

    grace.status("")
    return 0
Exemple #16
0
def pastiche(args):
    if len(args) < 4:
        print USAGE
        return 1

    mask_only, args = grace.get_option_value(args, '--mask', grace.as_bool,
                                             False)
    min_leftover, args = grace.get_option_value(args, '--min-leftover', int,
                                                20)

    output_dir, args = args[0], args[1:]

    #, ref_filename, contig_filenames = args[0], args[1], args[2:]

    ref_filenames = []
    contig_filenames = []
    grace.execute(args,
                  {'contigs': lambda args: contig_filenames.extend(args)},
                  lambda args: ref_filenames.extend(args))

    assert ref_filenames, 'No reference sequences given'
    assert contig_filenames, 'No contig sequences given'

    contigs = dict([(name.split()[0], seq) for filename in contig_filenames
                    for name, seq in io.read_sequences(filename)])
    dir_contigs = {}
    for name in contigs:
        dir_contigs[name + '+'] = contigs[name]
        dir_contigs[name + '-'] = bio.reverse_complement(contigs[name])

    dir_contigs_used = {}
    for name in dir_contigs:
        dir_contigs_used[name] = [False] * len(dir_contigs[name])

    workspace = io.Workspace(output_dir)
    temp_prefix = workspace._object_filename('temp-pastiche')

    out_f = workspace.open('pastiche.fa', 'wb')

    for ref_filename in ref_filenames:
        for ref_name, ref_seq in io.read_sequences(ref_filename):
            ref_name = ref_name.split()[0]

            grace.status(ref_name)

            f = open(temp_prefix + '.fa', 'wb')
            io.write_fasta(f, 'ref', ref_seq)
            f.close()

            scores = [-1] * (len(ref_seq) * 2)
            strings = ['N', ''] * (len(ref_seq))
            contexts = [None for i in xrange(len(ref_seq) * 2)]

            #MAXSCORE = len(ref_seq)+1
            #for i in xrange(len(ref_seq)):
            #    if ref_seq[i].upper() != 'N':
            #        strings[i*2] = ref_seq[i]
            #        scores[i*2] = MAXSCORE
            #for i in xrange(len(ref_seq)-1):
            #    if ref_seq[i].upper() != 'N' and ref_seq[i+1].upper() != 'N':
            #        scores[i*2+1] = MAXSCORE

            if mask_only:
                for i in xrange(len(ref_seq)):
                    strings[i * 2] = ref_seq[i].lower()

            def put(position, dir_contig_name, start, end, score):
                if scores[position] < score:
                    scores[position] = score
                    strings[position] = dir_contigs[dir_contig_name][start:end]
                    contexts[position] = (dir_contig_name, start, end, score)

            for contig_filename in contig_filenames:
                execute([
                    'nucmer',
                    '--prefix',
                    temp_prefix,
                    #'--maxmatch', #Very slow
                    '--nosimplify',
                    '--minmatch',
                    '9',
                    '--mincluster',
                    '50',
                    #'--maxgap', '1000',
                    #'--breaklen', '1000', # Increasing this reduces Ns, but is slow
                    #'--diagfactor', '1.0',
                    temp_prefix + '.fa',
                    contig_filename
                ])

                for contig_name, contig_seq in io.read_sequences(
                        contig_filename):
                    contig_name = contig_name.split()[0]
                    grace.status(ref_name + ' vs ' + contig_name)
                    p = run([
                        'show-aligns', temp_prefix + '.delta', 'ref',
                        contig_name
                    ],
                            stderr=subprocess.PIPE)

                    alignments = []

                    while True:
                        line = p.stdout.readline()
                        if not line: break
                        if not line.startswith('-- BEGIN'):
                            continue

                        parts = line.split()

                        ref_start = int(parts[5])
                        ref_end = int(parts[7])
                        query_start = int(parts[10])
                        query_end = int(parts[12])

                        #assert ref_start < ref_end
                        #ref_start -= 1 #Zero based coordinates

                        al_ref = []
                        al_query = []

                        while True:
                            block = []
                            end = False
                            while True:
                                line = p.stdout.readline()
                                if line.startswith('--   END'):
                                    end = True
                                    break
                                if line == '\n':
                                    if block:
                                        break
                                    else:
                                        continue
                                block.append(line)

                            if end: break

                            al_ref.append(block[0].split()[1])
                            al_query.append(block[1].split()[1])

                        al_ref = ''.join(al_ref)
                        al_query = ''.join(al_query)

                        if ref_start > ref_end:
                            al_ref = bio.reverse_complement(al_ref)
                            al_query = bio.reverse_complement(al_query)
                            ref_start, ref_end = ref_end, ref_start
                            query_start, query_end = query_end, query_start

                        if query_start > query_end:
                            dir_contig_name = contig_name + '-'
                            query_start = len(contig_seq) + 1 - query_start
                            query_end = len(contig_seq) + 1 - query_end
                        else:
                            dir_contig_name = contig_name + '+'

                        ref_start -= 1  #Zero based coordinates
                        query_start -= 1

                        #print al_ref
                        #print al_query

                        #Pretty dumb scoring scheme
                        al_score = 0
                        for i in xrange(len(al_ref)):
                            if al_ref[i] == al_query[i]:
                                al_score += 1
                            #else:
                            #    al_score -= 1

                        #Pastiche alignment over reference
                        ref_pos = ref_start
                        query_pos = query_start
                        al_pos = 0
                        while al_pos < len(al_ref):
                            assert al_ref[al_pos] != '.'
                            if al_query[al_pos] == '.':
                                put(ref_pos * 2, dir_contig_name, query_pos,
                                    query_pos, al_score)
                            else:
                                assert al_query[al_pos].lower() == dir_contigs[
                                    dir_contig_name][query_pos].lower()
                                put(ref_pos * 2, dir_contig_name, query_pos,
                                    query_pos + 1, al_score)
                                query_pos += 1
                            al_pos += 1

                            al_pos_end = al_pos
                            query_pos_end = query_pos
                            while al_pos_end < len(
                                    al_ref) and al_ref[al_pos_end] == '.':
                                al_pos_end += 1
                                query_pos_end += 1
                            #put(ref_pos*2+1, al_query[al_pos:al_pos_end], al_score)
                            assert al_query[al_pos:al_pos_end].lower(
                            ) == dir_contigs[dir_contig_name][
                                query_pos:query_pos_end].lower()
                            put(ref_pos * 2 + 1, dir_contig_name, query_pos,
                                query_pos_end, al_score)
                            al_pos = al_pos_end
                            query_pos = query_pos_end
                            ref_pos += 1

                    p.wait()

            grace.status(ref_name)

            result = ''.join(strings)
            io.write_fasta(out_f, ref_name, result)

            for context in contexts:
                if context is None: continue
                name, start, end, score = context
                for i in xrange(start, end):
                    dir_contigs_used[name][i] = True

            #Interpolation
            #result = [ ]
            #i = 0
            #while i < len(ref_seq):
            #    if strings[i*2].upper() != 'N':
            #        result.append(strings[i*2])
            #        result.append(strings[i*2+1])
            #        i += 1
            #        continue
            #
            #    j = i
            #    while strings[j*2].upper() == 'N':
            #        j += 1
            #
            #    grace.status('')
            #    print >> sys.stderr, 'interpolating', i+1,'..',j
            #
            #    window = 20 #!!!!!!!!!!!
            #    left_contexts = collections.defaultdict(lambda:0)
            #    for i1 in xrange(max(0,i-window),i):
            #        for context_name, context_start, context_end, context_score in contexts[i1*2]:
            #            key = (context_name, context_end + i - i1)
            #            left_contexts[key] = max(left_contexts[key],context_score)
            #
            #    right_contexts = collections.defaultdict(lambda:0)
            #    for j1 in xrange(j,min(j+window,len(ref_seq))):
            #        for context_name, context_start, context_end, context_score in contexts[j1*2]:
            #            key = (context_name, context_start + j - j1)
            #            right_contexts[key] = max(left_contexts[key],context_score)
            #
            #    #print >> sys.stderr, left_contexts
            #    #print >> sys.stderr, right_contexts
            #
            #    options = [ ]
            #
            #    for (left_name, left_pos), left_score in left_contexts.items():
            #        for (right_name, right_pos), right_score in right_contexts.items():
            #            if left_name != right_name: continue
            #            if right_pos < left_pos: continue
            #
            #            if right_pos-left_pos > (j-i) * 4.0 + 10: continue   #!!!!!!!!!!!!!!!!!!!!!!1
            #            if right_pos-left_pos < (j-i) * 0.25 - 10: continue
            #
            #            score = float(min(right_pos-left_pos,j-i))/max(right_pos-left_pos,j-i)
            #            score *= left_score + right_score
            #            #print >> sys.stderr, left_name, right_pos-left_pos, j-i, score
            #            options.append( (score, left_name, left_pos, right_pos) )
            #
            #    if options:
            #        best = max(options, key=lambda option: option[0])
            #        print >> sys.stderr, '->', best
            #        result.append( dir_contigs[best[1]][best[2]:best[3]].lower() )
            #    else:
            #        print >> sys.stderr, '-> no good interpolation'
            #        result.append( ref_seq[i:j] )
            #
            #    i = j
            #
            #result = ''.join(result)
            #io.write_fasta(sys.stdout, ref_name, result)

            #print >> sys.stderr, len(result), result.count('N')
            #for pos, size in N_runs:
            #    out_size = len(''.join( strings[pos*2:pos*2+2] ))
            #    print >> sys.stderr, pos, size, '->', out_size

    out_f.close()

    grace.status('')

    #for name, seq in io.read_sequences(ref_filename):
    #    result = pastiche(seq, contigs_filename)
    #    io.write_fasta(sys.stdout, name, result)

    leftover_f = workspace.open('leftovers.fa', 'wb')

    for name in sorted(contigs):
        used = [
            (a or b)
            for a, b in zip(dir_contigs_used[name +
                                             '+'], dir_contigs_used[name +
                                                                    '-'][::-1])
        ]

        i = 0
        while i < len(used):
            j = i
            while j < len(used) and not used[j]:
                j += 1
            if j - i > min_leftover:
                if i == 0 and j == len(used):
                    out_name = name
                else:
                    out_name = name + ':%d..%d' % (i + 1, j)
                io.write_fasta(leftover_f, out_name, contigs[name][i:j])

            i = j + 1

    leftover_f.close()

    for suffix in ['.fa', '.delta']:
        os.unlink(temp_prefix + suffix)
Exemple #17
0
def batch_main(args):
    options = Options()
    
    options.references = [ ]
    
    options.clip_options = [ ]
    options.shrimp_options = [ ]
    options.do_consensus = True
    options.consensus_options = [ ]
    options.samples = [ ]
    
    options.do_count = False
    options.count_options = [ ]
    options.tests = [ ]
    
    options.report_options = [ ]

    default_nesoni = sys.executable + ' ' + sys.argv[0]
    options.nesoni, args = grace.get_option_value(args, '--nesoni', str, default_nesoni)    
    options.pypy_nesoni, args = grace.get_option_value(args, '--pypy-nesoni', str, options.nesoni)

    options.prefix, args = grace.get_option_value(args,'--input-prefix', str, None)
    options.submit, args = grace.get_option_value(args,'--submit', str, '%')
    assert '%' in options.submit, 'Bad submit pattern'
    
    options.damp, args = grace.get_option_value(args, '--damp-run', grace.as_bool, False)
    
    options.run, args = grace.get_option_value(args, '--run', int, None)
    
    def absolutize(filename):
        if options.prefix is not None:
            return options.prefix + filename
        else:
            return io.abspath(filename)
    def path_param(filenames, damp=False):
        if damp:
           filenames = [ item+'~~first:10000' for item in filenames ]
    
        return ' '.join(absolutize(filename) for filename in filenames)
       
    def default(args):
        grace.expect_no_further_options(args)
        if len(args) != 1:
            print >> sys.stderr, BATCH_HELP % default_nesoni
            raise grace.Help_shown()
        options.dirname = args[0]

    def reference(args):
        grace.expect_no_further_options(args)
        options.references.extend(args)

    def do_clip(args):
        options.clip_options.extend(args)

    def do_shrimp(args):
        options.shrimp_options.extend(args)
    
    def do_consensus(args):
        if args == ['no']:
            options.do_consensus = False
        else:
            options.consensus_options.extend(args)

    def sample(args):
        sample = Options()
        sample.imported = False
        sample.reads = [ ]
        sample.pairs = [ ]
        sample.interleaved = [ ]
        options.samples.append(sample)        
        def default(args):
            assert len(args) == 1, 'Expected a sample name in "sample:"'
            sample.name = args[0]
        def reads(args):
            grace.expect_no_further_options(args)
            sample.reads.extend(args)
        def pairs(args):
            grace.expect_no_further_options(args)
            assert len(args) == 2, 'Expected exactly two files in "pairs:"'
            sample.pairs.append(args)
        def interleaved(args):
            grace.expect_no_further_options(args)
            sample.interleaved.extend(args)
        grace.execute(args, [reads,pairs,interleaved], default)
        assert sample.reads or sample.pairs or sample.interleaved, 'No reads for sample'

    def import_(args):
        grace.expect_no_further_options(args)
        for item in args:
            sample = Options()
            options.samples.append(sample)
            sample.imported = True
            sample.clip_dest = None
            sample.align_dest = absolutize(item)

    def do_count(args):
        options.do_count = True
        options.count_options.extend(args)

    def do_test_counts(args):
        assert len(args) > 1, 'Incorrect parameters for test-counts'
        test = Options()
        options.tests.append(test)
        test.args = args
    
    def do_report(args):
        options.report_options.extend(args)

    grace.execute(args, [
        reference,
        do_clip,
        do_shrimp,
        do_consensus,
        sample,
        import_,
        do_count,
        do_test_counts,
        do_report,
    ], default)

    if options.damp:
        options.dirname += '-damp'
    
    if options.tests:
        options.do_count = True
    
    batch = Batch(options.dirname, options.submit) 

    for sample in options.samples:
        if sample.imported: continue
    
        # CLIP ===========================================
        batch.require_dir('clip')
        
        sample.clip_dest = join('clip', sample.name)
        command = (
            options.pypy_nesoni + 
            ' clip: ' +
            sample.clip_dest
        )
        if options.clip_options:
            command += ' ' + quote_param(options.clip_options)         
        if sample.reads:
            command += ' reads: ' + path_param(sample.reads, options.damp)
        for pair in sample.pairs:
            command += ' pairs: ' + path_param(pair, options.damp)
        if sample.interleaved:
            command += ' interleaved: ' + path_param(sample.interleaved, options.damp)
        
        sample.has_pairs = bool(sample.pairs) or bool(sample.interleaved)
        sample.clip_state = batch.target(
            sample.clip_dest,
            [],
            command
        )

        # ALIGN ==========================================
        batch.require_dir('align')
        
        sample.align_dest = join('align', sample.name) 
        command = options.pypy_nesoni + ' samshrimp: ' + sample.align_dest
        command += ' ' + path_param(options.references)
        command += ' reads: ' + sample.clip_dest + '_single.fq.gz'
        if sample.has_pairs:
            command += ' interleaved: ' + sample.clip_dest + '_paired.fq.gz'
        command += ' ' + quote_param(options.shrimp_options) 
        
        sample.align_state = batch.target(
            sample.align_dest,
            [ sample.clip_state ],
            command
        )
        
        # CONSENSUS =======================================
        if options.do_consensus:
            command = (
                options.pypy_nesoni + 
                ' samconsensus: ' + 
                sample.align_dest +
                ' ' + quote_param(options.consensus_options)
            )
            
            sample.consensus_state = batch.target(
                join(sample.align_dest, 'consensus'),
                [ sample.align_state ],
                command 
            )

        
    batch.virtual_target(
        'clip',
        [ sample.clip_state for sample in options.samples if not sample.imported ]
    )
    batch.virtual_target(
        'align',
        [ sample.align_state for sample in options.samples if not sample.imported ]
    )
    
    # COUNT ==========================================
    if options.do_count:
        command = options.pypy_nesoni + ' samcount: counts ' + quote_param(options.count_options)
        command += ' ' + ' '.join( sample.align_dest for sample in options.samples )
        
        options.counts_state = batch.target(
            'count',
            [ (sample.consensus_state if options.do_consensus else sample.align_state) for sample in options.samples if not sample.imported ],
                # count: --filter existing can depend on consensus
            command
        )
        batch.virtual_target('count', [ options.counts_state ])
        
        command = options.pypy_nesoni + ' plot-counts: scatter-plots counts.txt'
        options.plot_state = batch.target(
            'plot',
            [ options.counts_state ],
            command
        )
        

    # TEST ============================================
    for test in options.tests:
        batch.require_dir('test')
        test.dest = join('test', test.args[0])
        param = test.args[1:]
        
        command = options.pypy_nesoni + ' test-counts: ' + test.dest + ' counts.txt'
        command += ' ' + quote_param(param)
        
        if options.damp: 
            command += ' --min-count 1'
        
        test.state = batch.target(
            test.dest,
            [ options.counts_state ],
            command
        )
    
    if options.tests:
        command1 = 'rm -f differential-expression-tests.zip'
        command2 = (
            'zip -j differential-expression-tests.zip ' +
            ' '.join( test.dest + '*' for test in options.tests )
        )
        
        options.edger_zip_state = batch.target(
            'differential-expression-tests',
            [ test.state for test in options.tests ],
            command1,
            command2, 
        )
        
    
    # REPORT ===========================================

    command = options.pypy_nesoni + ' report: report ' + quote_param(options.report_options)
    command += ' reference: ' + path_param(options.references)
    command += ' clips: ' + ' '.join( sample.clip_dest+'_log.txt' for sample in options.samples if sample.clip_dest is not None )
    if options.do_consensus:
        command += ' aligns: ' + ' '.join( sample.align_dest for sample in options.samples )
    if options.do_count:
        command += ' count-log: counts_log.txt'

    if options.do_count:
        command += ' file: counts.txt \'Table of raw counts, RPKMs, and statistics on alignments spanning multiple genes.\''
        command += ' file: scatter-plots-count.png \'Pairwise scatter plots of number of reads aligning to each gene.\''
        command += ' file: scatter-plots-RPKM.png \'Pairwise scatter plots of RPKM values.\''
    
    if options.tests:
        command += ' file: differential-expression-tests.zip \'Differential gene expression analysis\''     
    
    options.report_state = batch.target(
        'report',
        batch.all[:], #Meh
        command
    )
    batch.virtual_target('report', [ options.report_state ])
    batch.virtual_target(
        'view', 
        [ options.report_state ],
        'firefox -no-remote report/index.html'
    )
        
    batch.close()
    
    if options.run is None:
        print
        print 'Now type:'
        print
        print 'make -C %s' % pipes.quote(options.dirname)
        print
    else:
        command = 'make -C %s -j %d' % (pipes.quote(options.dirname), options.run)
        print
        print command
        print
        assert 0 == os.system(command)
Exemple #18
0
def main(args):
    mincov, args = grace.get_option_value(args, '--mincov', int, 1)
    maxdiff, args = grace.get_option_value(args, '--maxdiff', int, 16)
    minsize, args = grace.get_option_value(args, '--minsize', int, 200)
    what, args = grace.get_option_value(args, '--what', as_core_or_unique,
                                        'core')
    is_core = (what == 'core')

    grace.expect_no_further_options(args)

    if len(args) < 2:
        print >> sys.stderr, HELP
        raise grace.Help_shown()

    output_dir, working_dirs = args[0], args[1:]

    assert not path.exists(path.join(output_dir, 'reference.fa')), \
        'Output directory not given'

    if not path.exists(output_dir):
        os.mkdir(output_dir)

    for name, seq in io.read_sequences(
            path.join(working_dirs[0], 'reference.fa')):
        print name
        friendly_name = grace.filesystem_friendly_name(name)

        good = [True] * len(seq)

        for working_dir in working_dirs:
            if is_core:
                suffix = '-depth.userplot'
            else:
                suffix = '-ambiguous-depth.userplot'
            data = trivia.read_unstranded_userplot(
                os.path.join(working_dir, friendly_name + suffix))
            assert len(seq) == len(data)
            for i in xrange(len(seq)):
                if good[i]:
                    if is_core:
                        good[i] = data[i] >= mincov
                    else:
                        good[i] = data[i] < mincov

        #Close holes
        start = -maxdiff - 1
        n_holes = 0
        for i in xrange(len(seq)):
            if good[i]:
                if 0 < i - start <= maxdiff:
                    for j in xrange(start, i):
                        good[j] = True
                    n_holes += 1
                start = i + 1
        print 'Closed', grace.pretty_number(n_holes), 'holes'

        f = open(path.join(output_dir, '%s-%s.fa' % (friendly_name, what)),
                 'wb')
        io.write_fasta(
            f, name,
            ''.join([(seq[i] if good[i] else 'N') for i in xrange(len(seq))]))
        f.close()

        f = open(
            path.join(output_dir, '%s-%s_masked.fa' % (friendly_name, what)),
            'wb')
        io.write_fasta(
            f, name, ''.join([(seq[i] if good[i] else seq[i].lower())
                              for i in xrange(len(seq))]))
        f.close()

        f_good = open(
            path.join(output_dir, '%s-%s_parts.fa' % (friendly_name, what)),
            'wb')
        f_nongood = open(
            path.join(output_dir, '%s-non%s_parts.fa' % (friendly_name, what)),
            'wb')
        start = 0
        n_good = [0]
        n_good_bases = [0]

        def emit(i):
            if i - start < minsize: return
            if good[start]:
                n_good[0] += 1
                n_good_bases[0] += i - start
            io.write_fasta(f_good if good[start] else f_nongood,
                           '%s:%d..%d' % (name, start + 1, i), seq[start:i])

        for i in xrange(1, len(seq)):
            if good[i] != good[start]:
                emit(i)
                start = i
        emit(len(seq))
        f_nongood.close()
        f_good.close()

        print grace.pretty_number(
            sum(good)), 'bases are ' + what + ', of', grace.pretty_number(
                len(seq)), 'in reference sequence'
        print grace.pretty_number(
            n_good[0]), 'parts at least', grace.pretty_number(
                minsize), 'bases long with', grace.pretty_number(
                    n_good_bases[0]), 'total bases'

        print
Exemple #19
0
def report_main(args):
    """Build an HTML report directory with a zip of supporting files.

    Options: --title (default 'Report'), --short (default 'files'; name of
    the sub-directory and zip holding the collected files) and
    --show-refalign (default True). The first remaining argument is the
    output directory; the rest is passed to grace.execute with the nested
    handlers reference, clips, aligns, extra, file and count_log.

    NOTE(review): grace.execute presumably maps each handler's function
    name to its section keyword, and HEAD/TAIL are formatted with
    locals(), so neither the nested function names nor the local variable
    names (e.g. title, today, short_name) should be changed without
    checking those templates.

    Steps: copy reference files and per-alignment report/userplot files
    into <output_dir>/<short_name>, copy "file:" items into output_dir,
    write index.html, zip the <short_name> directory and finally delete
    it.

    Raises:
      IndexError      if no output directory argument is present.
      AssertionError  if a clip log name does not end in '_log.txt' or the
                      zip command exits with a non-zero status.
    """
    # Local import: only needed to quote the zip shell command below.
    import pipes

    title, args = grace.get_option_value(args, '--title', str, 'Report')
    short_name, args = grace.get_option_value(args, '--short', str, 'files')
    show_refalign, args = grace.get_option_value(args, '--show-refalign',
                                                 grace.as_bool, True)

    output_dir, args = args[0], args[1:]

    reference_filenames = []
    clip_filenames = []
    align_dirs = []
    count_log_filenames = []
    extra_items = []
    extra_files = []

    def file(args):
        # First argument is the filename, the remaining words form its
        # description.
        extra_files.append((args[0], ' '.join(args[1:])))

    def extra(args):
        extra_items.extend(args)

    def reference(args):
        reference_filenames.extend(args)

    def clips(args):
        clip_filenames.extend(args)

    def aligns(args):
        align_dirs.extend(args)

    def count_log(args):
        count_log_filenames.extend(args)

    grace.execute(args, [reference, clips, aligns, extra, file, count_log])

    if not os.path.isdir(output_dir):
        os.mkdir(output_dir)

    # Start from an empty staging directory for the files to be zipped.
    file_dir = join(output_dir, short_name)
    if not os.path.isdir(file_dir): os.mkdir(file_dir)
    for item in os.listdir(file_dir):
        os.unlink(join(file_dir, item))

    for filename in reference_filenames:
        io.copy_file(filename, join(file_dir, os.path.basename(filename)))

    # "file:" items sit next to index.html rather than inside the zip.
    for filename, desc in extra_files:
        io.copy_file(filename, join(output_dir, os.path.basename(filename)))

    pairs = False
    for directory in align_dirs:
        name = os.path.basename(directory)
        io.copy_file(join(directory, 'report.txt'),
                     join(file_dir, name + '-report.txt'))
        for extension in [
                '-depth.userplot',
                '-ambiguous-depth.userplot',
                '-pairspan-depth.userplot',
                '-ambiguous-pairspan-depth.userplot',
        ]:
            # The shorter extensions are suffixes of the longer ones, so
            # exclude files that really belong to a longer extension.
            filenames = [
                item for item in os.listdir(directory)
                if item.endswith(extension)
                and not item.endswith('-ambiguous' + extension)
                and not item.endswith('-pairspan' + extension)
            ]
            for filename in filenames:
                # A single plot is named after the alignment alone; several
                # plots keep their own filename as a suffix.
                if len(filenames) == 1:
                    dest = name + extension
                else:
                    dest = name + '-' + filename
                io.copy_file(join(directory, filename), join(file_dir, dest))

                if 'pairspan' in extension: pairs = True

    # NOTE(review): '%e' is not a standard C strftime directive on every
    # platform -- confirm the platforms this needs to run on.
    today = datetime.date.today().strftime('%e %B %Y')

    f = open(join(output_dir, 'index.html'), 'wb')
    print >> f, HEAD % locals()

    section(f, 'Results')

    for item in extra_items:
        p(f, item)

    for filename, desc in extra_files:
        name = os.path.basename(filename)
        p(f, '<a href="%(name)s">%(name)s</a> - %(desc)s' % locals())

    p(f, '<a href="%(short_name)s.zip">%(short_name)s.zip</a>' % locals())

    for filename in reference_filenames:
        bullet(f, os.path.basename(filename) + ' - reference')

    bullet(f, '...-report.txt - report on SNPs and indels found')

    p(f, 'Different kinds of userplot:')
    bullet(
        f,
        '...-depth.userplot - depth of coverage of unambiguously aligned reads'
    )
    bullet(
        f,
        '...-ambiguous-depth.userplot - depth of coverage, including reads that hit multiple locations'
    )
    if pairs:
        bullet(
            f,
            '...-pairspan-depth.userplot - depth, including the space between reads in read-pairs'
        )
        bullet(
            f,
            '...-ambiguous-pairspan-depth.userplot - as above, but including reads that hit multiple locations'
        )

    if clip_filenames:
        section(f, 'Read clipping')

        for filename in clip_filenames:
            assert filename.endswith('_log.txt')
            # Strip the 8-character '_log.txt' suffix to get the sample name.
            name = os.path.basename(filename[:-8])
            text = extract(
                filename,
                lambda line: line.startswith('Fragments:')
                or line.startswith('Single reads')
                or line.startswith('Pairs'))
            subsection(f, name)
            pre(f, text)
            end_subsection(f)

    if count_log_filenames:
        section(f, 'Counting alignments to genes')

        for filename in count_log_filenames:
            # Close the log explicitly instead of leaving the handle to the
            # garbage collector.
            log_file = open(filename, 'rb')
            try:
                log_text = log_file.read()
            finally:
                log_file.close()
            pre(f, log_text)

    if align_dirs and show_refalign:
        section(f, 'Reference alignment')
        for directory in align_dirs:
            name = os.path.basename(directory)
            text = extract(
                join(directory, 'consensus_log.txt'),
                lambda line: 'reads/pairs' in line or 'unmapped' in line)
            text = text.replace('(discarded)', '')
            text = text.replace('reads/pairs kept', 'aligned unambiguously')
            subsection(f, name)
            pre(f, text)
            end_subsection(f)

    print >> f, TAIL % locals()
    f.close()

    zip_filename = join(output_dir, short_name + '.zip')
    if os.path.exists(zip_filename): os.unlink(zip_filename)

    # Quote the user-supplied path components so spaces or shell
    # metacharacters cannot break the command, and use '&&' so zip never
    # runs in the wrong directory when cd fails. The glob stays unquoted.
    quoted_short_name = pipes.quote(short_name)
    zip_command = 'cd %s && zip %s.zip %s/*' % (
        pipes.quote(output_dir), quoted_short_name, quoted_short_name)

    # Run outside of an assert statement: asserts are removed under
    # "python -O", which would silently skip creating the zip.
    # AssertionError is still raised on failure, as before.
    status = os.system(zip_command)
    if status != 0:
        raise AssertionError(
            'Command failed with status %d: %s' % (status, zip_command))

    # The staging directory is only needed to build the zip.
    for item in os.listdir(file_dir):
        os.unlink(join(file_dir, item))
    os.rmdir(file_dir)
Exemple #20
0
def main(args):
    title1, args = grace.get_option_value(args, '--title1', str, None)
    title2, args = grace.get_option_value(args, '--title2', str, None)
    grace.expect_no_further_options(args)

    if len(args) != 3:
        print >> sys.stderr, USAGE
        return 1

    working_dir1 = args[0]
    working_dir2 = args[1]
    cutoff = float(args[2])

    sequence_names = [
        name for name, sequence in io.read_sequences(
            os.path.join(working_dir1, 'reference.fa'))
    ]

    if title1 is None:
        title1 = working_dir1
    if title2 is None:
        title2 = working_dir2

    n = 1
    while significance([('A', n)], [('T', n)], 1.0) > cutoff:
        n += 1

    print '%g\tsignificance cutoff' % cutoff
    print '%d\tdepth required to call substitution (greater if there are errors in the reads)' % n

    print 'Sequence\tPosition in reference\tChange type\tReference\t%s\t%s\tp-value (no correction for multiple testing)\t%s\t%s' % (
        title1, title2, title1, title2)

    for sequence_name in sequence_names:
        filename1 = os.path.join(
            working_dir1,
            grace.filesystem_friendly_name(sequence_name) + '-evidence.txt')
        filename2 = os.path.join(
            working_dir2,
            grace.filesystem_friendly_name(sequence_name) + '-evidence.txt')

        for (pos1, ins1, sub1, ref1, conins1,
             consub1), (pos2, ins2, sub2, ref2, conins2,
                        consub2) in itertools.izip(read_file(filename1),
                                                   read_file(filename2)):
            assert pos1 == pos2 and ref1 == ref2

            if pos1 % 1000 == 0:
                grace.status('Testing %s %d' % (sequence_name, pos1))

            dec_ins1 = io.decode_evidence(ins1)
            dec_ins2 = io.decode_evidence(ins2)
            if dec_ins1 and dec_ins2:
                sig = significance(io.decode_evidence(ins1),
                                   io.decode_evidence(ins2), cutoff)
                if sig is not None and sig <= cutoff:
                    grace.status('')
                    print '%s\t%d\t%s\t\t%s\t%s\t%g\t%s\t%s' % (
                        sequence_name, pos1, 'insertion-before', ins1, ins2,
                        sig, conins1, conins2)

            dec_sub1 = io.decode_evidence(sub1)
            dec_sub2 = io.decode_evidence(sub2)
            if dec_sub1 and dec_sub2:
                sig = significance(dec_sub1, dec_sub2, cutoff)
                if sig is not None and sig <= cutoff:
                    if dec_sub1[0][0] == '-' or dec_sub2[0][0] == '-':
                        what = 'deletion'
                    elif dec_sub1[0][0] != dec_sub2[0][0]:
                        what = 'substitution'
                    else:
                        what = 'different mix'
                    grace.status('')
                    print '%s\t%d\t%s\t%s\t%s\t%s\t%g\t%s\t%s' % (
                        sequence_name, pos1, what, ref1, sub1, sub2, sig,
                        consub1, consub2)

    grace.status('')
    return 0
Exemple #21
0
def main(args):
    genbank_filename, args = grace.get_option_value(args,'--gbk',str,None)
    use_indels, args = grace.get_option_value(args,'--indels',grace.as_bool,True)
    use_reference, args = grace.get_option_value(args,'--reference',grace.as_bool,True)
    give_evidence, args = grace.get_option_value(args,'--evidence',grace.as_bool,True)
    give_consequences, args = grace.get_option_value(args,'--consequences',grace.as_bool,True)
    require_all, args = grace.get_option_value(args,'--require-all',grace.as_bool,False)
    require_bisect, args = grace.get_option_value(args,'--require-bisect',grace.as_bool,False)
    full_output, args = grace.get_option_value(args,'--full',grace.as_bool,False)
    format, args = grace.get_option_value(args,'--as',str,'table')
    
    # Secret option!
    limit, args = grace.get_option_value(args,'--limit',int,None)
    
    grace.expect_no_further_options(args)

    if len(args) < 1:
        sys.stderr.write(USAGE)
        return 1

    working_dirs = [ ]
    split_a = [ ]
    split_b = [ ]
    def default(args):
        working_dirs.extend(args)
    def splitting(args):
        split_a.extend(args)
    def splitting_from(args):
        split_b.extend(args)
        
    grace.execute(args, {
        'splitting' : splitting,
        'from' : splitting_from 
    }, default
    )
    
    if use_reference:
        names = ['reference']
        evidence_start = 1
    else:
        names = [ ]
        evidence_start = 0
        
    names.extend( norm_name(item) for item in  working_dirs )
        
    references = io.read_sequences(os.path.join(working_dirs[0], 'reference.fa'))
    
    annotations = { }
    if genbank_filename:
        from Bio import SeqIO
        for record in SeqIO.parse(io.open_possibly_compressed_file(genbank_filename),'genbank'):
            sequence = record.seq.tostring()
            features = [ item for item in record.features if item.type != 'source' ]
            features.sort(key=lambda item: item.location.nofuzzy_start)
            annotations[sequence] = features
    
    iterator = reader(working_dirs, references, use_reference, annotations)
    
    if not use_indels:
        iterator = itertools.ifilter(has_no_indels, iterator)

    if require_all or require_bisect or format == 'counts':
        iterator = itertools.ifilter(fully_unambiguous, iterator)
    
    if require_bisect:
        iterator = itertools.ifilter(is_binary_partition, iterator)

    if not require_bisect:
        if full_output:
            iterator = itertools.ifilter(not_boring_insertion, iterator)
        else:
            iterator = itertools.ifilter(is_interesting, iterator)

    if split_a or split_b:
        assert len(names) == len(set(names)), 'Two samples with the same name'
        try:
            split_a = [ names.index(norm_name(item)) for item in split_a ]
            split_b = [ names.index(norm_name(item)) for item in split_b ]
        except ValueError:
            raise grace.Error('Sample to be split is not amongst samples given')
        iterator = itertools.ifilter(is_split(split_a, split_b), iterator)

    if limit:
        iterator = itertools.islice(iterator, limit)
    
    if format == 'table':
        line = 'Reference\tPosition\tChange type'
        line +=  '\t' + '\t'.join(names)
        if give_evidence:
            line += '\t' + '\t'.join(names[evidence_start:])
        if give_consequences:
            line += '\t' + '\t'.join(names[evidence_start:])
        if annotations:
            line += '\tAnnotations'
        print line
        for calls in iterator:
            line = '%s\t%d\t%s\t%s' % (
                calls.ref_name, 
                calls.ref_pos+1, 
                change_type(calls), 
                '\t'.join(item.consensus for item in calls.calls))
            if give_evidence:
                line += '\t' + '\t'.join(item.evidence for item in calls.calls[evidence_start:])
            if give_consequences:
                line += '\t' + '\t'.join(item.consequences for item in calls.calls[evidence_start:])
            if annotations:
                line += '\t' + describe_features(calls.features)
            print line

    elif format == 'compact':
        for line in transpose_strings(names):
            print line
        print
        
        for calls in iterator:
            if calls.is_insertion:
                footer = '%12d.5 %s' % (calls.ref_pos, calls.ref_name)
            else: 
                footer = '%12d   %s' % (calls.ref_pos+1, calls.ref_name)
            
            t = transpose_strings([ item.consensus for item in calls.calls ], '-', 1)
            top = t[0] + ' ' + footer
            if give_consequences:
                consequences = [ ]
                for call in calls.calls:
                    if call.consequences:
                        for item in call.consequences.split(', '):
                            item = ' '.join(item.split()[:3])
                            if item not in consequences: consequences.append(item)
                        
                if consequences:
                    top += '  ' + ' / '.join(sorted(consequences))
            top += '  ' + describe_features(calls.features)
            print top
            for line in t[1:]:
                print line            
    
    elif format == 'nexus':
        buckets = [ [ ] for name in names ]
        for calls in iterator:
            for i, char in enumerate(partition_string(calls)):
                buckets[i].append(char)
        
        print '#NEXUS'
        print 'begin taxa;'
        print 'dimensions ntax=%d;' % len(names)
        print 'taxlabels'
        for name in names:
            print name
        print ';'
        print 'end;'

        print 'begin characters;'
        print 'dimensions nchar=%d;' % len(buckets[0])
        print 'format datatype=STANDARD symbols="ACGT-0123456789" missing=N;'
        print 'matrix'
        for name, bucket in itertools.izip(names, buckets):
            print name, ''.join(bucket)
        print ';'
        print 'end;'
    
    elif format == 'counts':
        for line in transpose_strings(names):
            print line
        print

        counts = { }
        for calls in iterator:
            count_str = partition_string(calls)
            if count_str not in counts:
                counts[count_str] = 1
            else:
                counts[count_str] += 1
        
        for count_str in sorted(counts, key=lambda x: (counts[x], x), reverse=True):
            print '%s   %d' % (transpose_strings(count_str)[0], counts[count_str])
    
    else:
        raise grace.Error('Unknown output format: ' + format)
Exemple #22
0
def main(args):
    grace.require_shrimp_1()

    n_cpus = grace.how_many_cpus()
        
    solid, args = grace.get_flag(args, '--solid')
    verbose, args = grace.get_flag(args, '--verbose')

    threshold, args = grace.get_option_value(args, '--threshold', str, '68%')
    
    stride, args = grace.get_option_value(args, '--stride', int, 1)
    max_shrimps, args = grace.get_option_value(args, '--cpus', int, n_cpus)
    batch_size, args = grace.get_option_value(args, '--batch-size', int, 5000000)
        
    input_reference_filenames = [ ]
    reads_filenames = [ ]
    
    shrimp_options = [ '-h', threshold ]
    if threshold.endswith('%'):
        threshold = -float(threshold[:-1])/100.0
    else:
        threshold = int(threshold)
    
    output_dir = [ ]  #As list so can write to from function. Gah.
    
    def front_command(args):
        grace.expect_no_further_options(args)
        
        if len(args) < 1:
            return
        
        output_dir.append(args[0])        
        input_reference_filenames.extend(
            [ os.path.abspath(filename) for filename in args[1:] ])
    def reads_command(args):
        grace.expect_no_further_options(args)
        reads_filenames.extend([ [ os.path.abspath(filename) ] for filename in args])
    def pairs_command(args):
        grace.expect_no_further_options(args)
        assert len(args) == 2, 'Expected exactly two files in "pairs"'
        reads_filenames.append([ os.path.abspath(filename) for filename in args ])
    def shrimp_options_command(args):
        shrimp_options.extend(args)
    
    grace.execute(args, {
        'reads': reads_command,
        '--reads': reads_command,
        'pairs': pairs_command,
        'shrimp-options': shrimp_options_command,
        '--shrimp-options': shrimp_options_command,
    }, front_command)
    
    
    if not output_dir:
        print >> sys.stderr, USAGE % n_cpus
        return 1
    
    output_dir = output_dir[0]
    
    assert input_reference_filenames, 'No reference files given'
    assert reads_filenames, 'No read files given'
    
    for filename in itertools.chain(input_reference_filenames, *reads_filenames):
        assert os.path.exists(filename), '%s does not exist' % filename

    if not os.path.isdir(output_dir):
        os.mkdir(output_dir)
    
    if solid:
        shrimp = 'rmapper-cs'
    else:
        shrimp = 'rmapper-ls'
    
    
    reference_filename = os.path.join(output_dir,'reference.fa')
    reference_file = open(reference_filename,'wb')
    total_reference_sequences = 0
    total_reference_bases = 0
    for input_reference_filename in input_reference_filenames:
        for name, sequence in io.read_sequences(input_reference_filename):
            #Don't retain any comment
            name = name.split()[0]
            io.write_fasta(reference_file, name, sequence)
            
            total_reference_sequences += 1
            total_reference_bases += len(sequence)
            
    reference_file.close()
    
    print '%s base%s in %s reference sequence%s' % (
        grace.pretty_number(total_reference_bases), 's' if total_reference_bases != 1 else '',
        grace.pretty_number(total_reference_sequences), 's' if total_reference_sequences != 1 else '')
    
    assert total_reference_bases, 'Reference sequence file is empty' 
    
    config = {
        'references' : input_reference_filenames,
        'reads' : reads_filenames,
        'stride' : stride,
        'solid': solid,
        'threshold': threshold,
    }
    config_file = open(os.path.join(output_dir, 'config.txt'), 'wb')
    pprint.pprint(config, config_file)
    config_file.close()
    
    output_filename = os.path.join(output_dir, 'shrimp_hits.txt.gz')
    output_file = gzip.open(output_filename, 'wb')
    
    unmapped_filename = os.path.join(output_dir, 'unmapped.fa.gz')
    unmapped_file = gzip.open(unmapped_filename, 'wb')
    
    dirty_filenames = set()
    dirty_filenames.add(output_filename)
    dirty_filenames.add(unmapped_filename)
    
    #warn_low_threshold = True
    
    try: #Cleanup temporary files
        
        N = [0]
        def do_shrimp(read_set):
            my_number = N[0]
            N[0] += 1
            
            tempname = os.path.join(output_dir,'temp%d-%d.fa' % (os.getpid(),my_number))
            tempname_out = os.path.join(output_dir,'temp%d-%d.txt' % (os.getpid(),my_number))
            
            dirty_filenames.add(tempname)
            dirty_filenames.add(tempname_out)
            
            f = open(tempname,'wb')
            for read_name, read_seq in read_set:
                print >> f, '>' + read_name
                print >> f, read_seq
            f.close()
        
            command = shrimp + ' ' + ' '.join(shrimp_options) + ' ' + \
                      tempname + ' ' + reference_filename + ' >' + tempname_out
            if not verbose:
                command += ' 2>/dev/null'
            #f = os.popen(command, 'r')
            child_pid = os.spawnl(os.P_NOWAIT,'/bin/sh','/bin/sh','-c',command)
            #print 'SHRiMP %d running' % my_number
            
            def finalize():
                exit_status = os.waitpid(child_pid, 0)[1]
                assert exit_status == 0, 'Shrimp indicated an error'
                
                hits = { } # read_name -> [ hit line ]
                
                f = open(tempname_out,'rb')
                for line in f:
                    if line.startswith('>'):
                        read_name = line.split(None,1)[0][1:]
                        if read_name not in hits:
                            hits[read_name] = [ ]
                        hits[read_name].append(line)
                f.close()
                                
                for read_name, read_seq in read_set:
                    if read_name in hits:
                        for hit in hits[read_name]:
                            output_file.write(hit)
                    else:
                        print >> unmapped_file, '>' + read_name
                        print >> unmapped_file, read_seq

                output_file.flush()
                unmapped_file.flush()
        
                os.unlink(tempname)
                dirty_filenames.remove(tempname)
                os.unlink(tempname_out)
                dirty_filenames.remove(tempname_out)
                #print 'SHRiMP %d finished' % my_number
            return finalize
        
        
        shrimps = [ ]
        
        reader = iter_reads(config)
        read_count = 0
        
        while True:
            read_set = [ ]
            read_set_bases = 0

            #Read name should not include comment cruft
            # - SHRIMP passes this through
            # - might stuff up identification of pairs
            
            for read_name, read_seq in reader:
                read_name = read_name.split()[0]                
                read_set.append((read_name, read_seq))
                read_set_bases += len(read_seq)
                
                #if warn_low_threshold and len(read_seq)*7 < threshold: #Require 70% exact match
                #    sys.stderr.write('\n*** WARNING: Short reads, consider reducing --threshold ***\n\n')                    
                #    warn_low_threshold = False
            
                read_count += 1
                if read_set_bases >= batch_size: break
                
            if not read_set: break
        
            if len(shrimps) >= max_shrimps:
                shrimps.pop(0)()
            shrimps.append( do_shrimp(read_set) )
            
            grace.status('SHRiMPing %s' % grace.pretty_number(read_count))
        
        while shrimps:
            grace.status('Waiting for SHRiMPs to finish %d ' % len(shrimps) )
            shrimps.pop(0)()
        
        grace.status('')
        
        output_file.close()
        dirty_filenames.remove(output_filename)
        unmapped_file.close()
        dirty_filenames.remove(unmapped_filename)
        
        return 0

    finally:
        for filename in dirty_filenames:
            if os.path.exists(filename):
                os.unlink(filename)
def main(args):
    """Report CDS features whose protein changes between reference and new sequence.

    Options read from ``args``: ``--transl_table`` (default 11),
    ``--use-coverage``, ``--coverage-cutoff`` (default 0.1), ``--tabular``,
    ``--noheader``, ``--verbose`` and ``--band`` (default 20).  Exactly two
    positional arguments must remain: a GenBank file and an alignment file.
    If the second is a directory, ``alignment.maf`` inside it is used.

    For every CDS feature of every GenBank record, the reference DNA and the
    corresponding DNA of the aligned sequence are extracted and translated.
    A line is printed for each CDS that shows amino acid differences, a
    changed start codon, indels, a length change or (with
    ``--use-coverage``) unusual coverage.  Notes follow on separate lines,
    or in an extra column when ``--tabular`` is given.

    Returns 1 after printing usage when the argument count is wrong,
    otherwise 0.  Raises ``grace.Error`` when a GenBank record's sequence is
    not identical to any reference sequence in the alignment.
    """
    default_transl_table, args = grace.get_option_value(args, '--transl_table', int, 11)
    use_coverage, args = grace.get_flag(args, '--use-coverage')
    coverage_cutoff, args = grace.get_option_value(args, '--coverage-cutoff', float, 0.1)
    tabular, args = grace.get_flag(args, '--tabular')
    noheader, args = grace.get_flag(args, '--noheader')
    verbose, args = grace.get_flag(args, '--verbose')
    bandwidth, args = grace.get_option_value(args, '--band', int, 20)
    grace.expect_no_further_options(args)

    if len(args) != 2:
        print USAGE
        return 1
    
    genbank_filename = args[0]
    alignment_filename = args[1]
    
    if os.path.isdir(alignment_filename):
        alignment_filename = os.path.join(alignment_filename, 'alignment.maf')
    
    # Depth graphs (used with --use-coverage) are looked up next to the alignment file
    working_dir = os.path.split(alignment_filename)[0]
    
    alignments = load_alignments(alignment_filename)
    
    # NOTE(review): summaries and details are never used below
    summaries = [ ]
    details = [ ]
    
    if not noheader:
        fields = 'Sequence\tLocus tag\tOld length (aa)\tNew length (aa)\tAmino acid changes\t'
        if use_coverage: fields += 'Unambiguous coverage vs expected\t\tAmbiguous coverage vs expected\t\tAmbiguous percent with any hits\t'
        fields += 'Gene\tProduct'
        if tabular: fields += '\tChanges of note'
        print fields
    
    for record in SeqIO.parse(io.open_possibly_compressed_file(genbank_filename),'genbank'):
        sequence = record.seq.tostring()
    
        # Find the alignment whose first sequence is this record. On break,
        # name, seq1, seq2 and alignment stay bound to the matching entry.
        for name, seq1, seq2, alignment in alignments:
            if seq1 == sequence: break
        else:
            raise grace.Error('Genbank record %s sequence not identical to any reference sequence' % record.id)
             
        if use_coverage:       
            depth = get_graph(working_dir, name, 'depth')
            ambiguous_depth = get_graph(working_dir, name, 'ambiguous-depth')
            median_depth = numpy.median(depth)
            median_ambiguous_depth = numpy.median(ambiguous_depth)
            # Ratio used below to rescale expected depth for the ambiguous graph
            ambiguous_factor = float(median_ambiguous_depth) / median_depth
            depth_expect = expected_depth(name, sequence, depth, ambiguous_depth)
            
        
        for feature in record.features:
            if feature.type != 'CDS': continue
            
            # Fall back to the 1-based coordinate range when there is no locus tag
            if 'locus_tag' not in feature.qualifiers:
                locus_tag = '%d..%d' % (feature.location.nofuzzy_start+1,feature.location.nofuzzy_end)
            else:
                locus_tag = feature.qualifiers['locus_tag'][0]
            
            if 'transl_table' in feature.qualifiers:
                transl_table_no = int(feature.qualifiers['transl_table'][0])
            else:
                assert default_transl_table is not None, 'No /transl_table for CDS, and default transl_table not given'
                transl_table_no = default_transl_table
            
            transl_table = CodonTable.ambiguous_dna_by_id[transl_table_no]
            start_codons = transl_table.start_codons
            
            try:
                feature_alignment = alignment_from_feature(sequence, feature)
            except Weird_alignment:
                warn('%s has a location I could not handle, skipping, sorry' % locus_tag)
                continue
            
            # Walk the feature one position at a time, collecting the reference
            # base (dna) and the aligned new-sequence slice (new_dna).
            # NOTE(review): presumably back_project maps feature coordinates to
            # reference coordinates and project maps reference coordinates to
            # new-sequence coordinates -- confirm against their definitions.
            dna = [ ]
            new_dna = [ ]
            shifts = [ ]  # (feature position, old slice, new slice) wherever lengths differ
            for i in xrange(feature_alignment.end2):
                p1 = feature_alignment.back_project(i, left=False)
                p2 = feature_alignment.back_project(i+1, left=True)
                assert abs(p2-p1) < 2
                dna.append( sequence_slice(sequence,p1,p2) )
                
                p1a = alignment.project(p1, left=False)
                p2a = alignment.project(p2, left=False) #Hmm
                
                # Non-zero when the new-sequence span differs in length from the reference span
                diff = (p2-p1)-(p2a-p1a)
                #if diff:
                #    if diff%3:
                #        frame_shift = True
                #    else:
                #        frame_preserving_shift = True
                new_dna.append( sequence_slice(seq2,p1a,p2a) )
                
                if diff:
                    shifts.append((i,dna[-1],new_dna[-1]))
                
            dna = ''.join(dna)
            new_dna = ''.join(new_dna)
            
            # This usually indicated a CDS truncated at the start?
            # in which case, will probably fail some way or other down the line.
            if 'codon_start' in feature.qualifiers:
                codon_start = int(feature.qualifiers['codon_start'][0]) - 1
            else:
                codon_start = 0
            dna = dna[codon_start:]
            new_dna = new_dna[codon_start:]
            
            if len(dna) % 3 != 0:
                warn(locus_tag + ' length not a multiple of 3')
            #assert len(new_dna) % 3 == 0
            
            protein = Seq.Seq(dna).translate(table=transl_table_no).tostring()            
            # http://en.wikipedia.org/wiki/Start_codon is always translated to M
            protein = 'M' + protein[1:]
            
            if dna[:3] not in start_codons:
                warn(locus_tag + ' has unknown start codon: ' + dna[:3])
                                    
            original_lacks_stop_codon = not protein.endswith('*')                 
            if original_lacks_stop_codon:
                warn(locus_tag + ' lacks end codon')
            original_stops_before_end = '*' in protein[:-1] 
            if original_stops_before_end:
                warn(locus_tag + ' contains stop codon before end')
                            
            if 'translation' in feature.qualifiers:
                expect = feature.qualifiers['translation'][0]
                # protein[:-1] drops the final character (the stop, when present)
                if protein[:-1] != expect:
                    warn(locus_tag + ' translation given in feature does not match translation from DNA')                
        
            new_protein = Seq.Seq(new_dna).translate(table=transl_table_no).tostring()            
            new_protein = 'M' + new_protein[1:]
        
            # If end codon changed, find new end                
            # Don't bother if there are unknown amino acids or 
            # the original protein lacks a stop codon
            if 'X' not in new_protein and '*' not in new_protein and not original_lacks_stop_codon:
                #This is very inefficient
                # Extend new_dna one position at a time past the feature's end,
                # re-translating each time, until a stop or unknown residue
                # appears or the projection falls outside seq2.
                i = feature_alignment.end2
                while True:
                    p1 = feature_alignment.back_project(i, left=False)
                    p2 = feature_alignment.back_project(i+1, left=True)
                    p1a = alignment.project(p1, left=False)
                    p2a = alignment.project(p2, left=False) #Hmm
                    if p1a < 0 or p2a < 0 or p1a > len(seq2) or p2a > len(seq2):
                        break
                        
                    new_dna += sequence_slice(seq2,p1a,p2a)                        
                    new_protein = Seq.Seq(new_dna).translate(table=transl_table_no).tostring()            
                    new_protein = 'M' + new_protein[1:]
                    if 'X' in new_protein or '*' in new_protein: break
                    
                    i += 1
            
            # Is the protein shorter?
            # Don't bother checking if the original protein has extra stop codons
            if '*' in new_protein and not original_stops_before_end:
                # Truncate just after the first stop
                new_protein = new_protein[:new_protein.index('*')+1] 
        
            # If indels occurred, do an alignment
            # Don't bother otherwise
            if shifts:
                # Penalize gaps with cost 2 (vs 1 for mismatch)
                # If lengths don't match, pad with spaces (won't match longer seq),
                # aligner prefers mismatch to gaps
                
                #result = pairwise2.align.globalxs(protein      + ' '*max(0,len(new_protein)-len(protein)), 
                #                                  new_protein  + ' '*max(0,len(protein)-len(new_protein)), 
                #                                  -2.001,-2.000)[0]
                # 2.001 : very slightly prefer contiguous gaps. Also much faster!
        
                result = band_limited_align(protein      + ' '*max(0,len(new_protein)-len(protein)), 
                                            new_protein  + ' '*max(0,len(protein)-len(new_protein)), 
                                            bandwidth)
                
                
                protein_ali = result[0]
                new_protein_ali = result[1]
            else:
                protein_ali = protein
                new_protein_ali = new_protein
        
            # Collect differing alignment columns as (column, old index, new index).
            # j and k count the non-gap characters seen so far in the old and
            # new protein; columns containing padding spaces are not counted as differences.
            diffs = [ ]
            j = 0
            k = 0
            for i in xrange(min(len(new_protein_ali),len(protein_ali))):
                if protein_ali[i] != ' ' and new_protein_ali[i] != ' ' and (
                      protein_ali[i] == '-' or 
                      new_protein_ali[i] == '-' or 
                      not bio.might_be_same_amino(protein_ali[i], new_protein_ali[i]) ):
                    diffs.append((i,j,k))
                if protein_ali[i] != '-': 
                    j += 1
                if new_protein_ali[i] != '-': 
                    k += 1
        
            # True when any of the first three bases may differ.
            # NOTE(review): indexes [0..2] directly, so this raises IndexError
            # if dna or new_dna is shorter than three bases.
            diff_start = not bio.might_be_same_base(new_dna[0],dna[0]) or \
                         not bio.might_be_same_base(new_dna[1],dna[1]) or \
                         not bio.might_be_same_base(new_dna[2],dna[2]) 
        
            interesting_coverage = False
            if use_coverage:
                # Slice the per-base graphs to this CDS, reversed for
                # features that are not forward on the reference
                cds_depth = depth[feature_alignment.start1:feature_alignment.end1] #/ median_depth
                if not feature_alignment.forward1: cds_depth = cds_depth[::-1]
                cds_ambiguous_depth = ambiguous_depth[feature_alignment.start1:feature_alignment.end1] #/ median_ambiguous_depth
                if not feature_alignment.forward1: cds_ambiguous_depth = cds_ambiguous_depth[::-1]
                
                cds_depth_expect = depth_expect[feature_alignment.start1:feature_alignment.end1]
                if not feature_alignment.forward1: cds_depth_expect = cds_depth_expect[::-1]
                
                #cds_average_depth_ratio = numpy.average(depth[feature_alignment.start1:feature_alignment.end1]) / median_depth 
                #cds_average_ambiguous_depth_ratio = numpy.average(ambiguous_depth[feature_alignment.start1:feature_alignment.end1]) / median_ambiguous_depth                        
                #line += '%.1f\t' % cds_average_depth_ratio 
                #line += '%.1f\t' % cds_average_ambiguous_depth_ratio
                
                #line += '%.1f..%.1f\t' % (numpy.minimum.reduce(cds_depth)/median_depth, numpy.maximum.reduce(cds_depth)/median_depth) 
                #line += '%.1f+/-%.1f\t' % (numpy.average(cds_depth)/median_depth, numpy.var(cds_depth)**0.5/median_depth) 
                #line += '%.1f..%.1f\t' % (numpy.minimum.reduce(cds_ambiguous_depth)/median_ambiguous_depth, numpy.maximum.reduce(cds_ambiguous_depth)/median_ambiguous_depth)
                
                # The averages are only defined when avg_expect > 0; the
                # output code below checks avg_expect again before using them.
                avg_expect = numpy.average(cds_depth_expect)
                if avg_expect > 0.0:
                    cds_avg_depth = numpy.average(cds_depth)/avg_expect
                    cds_avg_ambiguous_depth = numpy.average(cds_ambiguous_depth)/avg_expect/ambiguous_factor
                
                # Per-position flags: unambiguous depth at least 1.5x expected,
                # or ambiguous depth at most half of its scaled expectation
                strange = (
                    (cds_depth >= cds_depth_expect*1.5) |
                    (cds_ambiguous_depth <= cds_depth_expect*(0.5*ambiguous_factor))
                )
                
                # Interesting when the flagged fraction reaches --coverage-cutoff
                interesting_coverage = numpy.average(strange) >= coverage_cutoff
                     

            if interesting_coverage or diffs or diff_start or shifts or len(new_protein) != len(protein):
                # Lengths are reported minus one, as in the header's "(aa)" columns
                line = name + '\t' + locus_tag + '\t' + \
                      '%d\t' % (len(protein)-1) + \
                      '%d\t' % (len(new_protein)-1) + \
                      '%d\t' % len(diffs)
                

                if use_coverage:
                    if avg_expect <= 0.0:
                        line += '\t\t\t'
                    else:
                        line += '%.1f\t' % (cds_avg_depth) + graphlet(cds_depth, cds_depth_expect)+'\t' 
                        line += '%.1f\t' % (cds_avg_ambiguous_depth) + graphlet(cds_ambiguous_depth, cds_depth_expect*ambiguous_factor)+'\t'
                        line += '%.1f%%\t' % (numpy.average(cds_ambiguous_depth > 0.0)*100.0)
                
                line += '%s\t' % feature.qualifiers.get('gene',[''])[0] + \
                        '%s' % feature.qualifiers.get('product',[''])[0]
                
                notes = [ ]
                
                if use_coverage and 'X' in new_protein:
                    xs = new_protein.count('X')
                    if xs == len(new_protein)-1: #First is M, so len-1
                        notes.append('\ No consensus')
                    else:
                        notes.append('\ No consensus for %d aa' % (new_protein.count('X')))
                                   
                if len(new_protein) < len(protein):
                    notes.append('\ Shorter by %d aa' % (len(protein)-len(new_protein)))
        
                if len(new_protein) > len(protein):
                    notes.append('\ Longer by %d aa' % (len(new_protein)-len(protein)))
                
                if diff_start:
                    notes.append('\ Start changed: %s -> %s' % (dna[:3], new_dna[:3]))
                    if new_dna[:3] not in start_codons:
                        notes.append('  No longer a start codon!')
                        
                if shifts:
                    notes.append('\ Indels:')
                
                    for pos, old, new in shifts:
                        notes.append('    base %5d / codon %5d   %s -> %s' % (pos+1,(pos//3)+1,old,new or '-'))
                    
                # Individual amino acid changes are listed only with --verbose
                if diffs:
                    if verbose:
                        notes.append('\ Amino acid changes:')
                        for i, j, k in diffs:
                            notes.append('    codon %5d   %s->%s   (%s->%s)' % (
                                j+1, 
                                protein_ali[i], 
                                new_protein_ali[i], 
                                dna[j*3:j*3+3] if protein_ali[i] != '-' else '-', 
                                new_dna[k*3:k*3+3] if new_protein_ali[i] != '-' else '-'
                            ))
                
                #if len(new_protein) > len(protein):
                #    print 'New protein is longer:', new_protein[len(protein):]
                #if len(new_protein) < len(protein):
                #    print 'New protein is shorter:', protein[len(new_protein):]
                #print protein
                #print new_protein
                
                if tabular:
                    # One row per CDS: notes are whitespace-normalised and joined into a final column
                    print line + '\t' + ' '.join([ ' '.join(note.strip().split()) for note in notes ])
                else:
                    print line
                    for note in notes:
                        print '\t' + note
    return 0
Exemple #24
0
def fill_scaffolds(args):
    max_filler_length, args = grace.get_option_value(args, '--max-filler', int, 4000)
    
    if len(args) < 2:
        print USAGE
        return 1
    
    (output_dir, graph_dir), args = args[:2], args[2:]

    scaffolds = [ ]
    
    def scaffold(args):
        circular, args = grace.get_option_value(args, '--circular', grace.as_bool, False)
        
        scaffold = [ ]
        for item in args:
            scaffold.append( ('contig', int(item)) )
            scaffold.append( ('gap', None) )
        
        if not circular: scaffold = scaffold[:-1]
        
        name = 'custom_scaffold_%d' % (len(scaffolds)+1)
        scaffolds.append( (name, scaffold) )
            
    grace.execute(args, [scaffold])
    
    custom_scaffolds = (len(scaffolds) != 0)    
    
    sequences = dict( 
        (a.split()[0], b.upper()) 
          for a,b in 
            io.read_sequences(os.path.join(
              graph_dir, '454AllContigs.fna')))
    
    sequence_names = sorted(sequences)
    sequence_ids = dict(zip(sequence_names, xrange(1,len(sequence_names)+1)))
    
    contexts = { }
    context_names = { }
    context_depths = { }
    for i in xrange(1,len(sequence_names)+1):
        seq = sequences[sequence_names[i-1]]
        contexts[ i ] = seq
        context_names[ i ] = sequence_names[i-1]+'-fwd'
        contexts[ -i ] = bio.reverse_complement(seq)
        context_names[ -i ] = sequence_names[i-1]+'-rev'
    
    links = collections.defaultdict(list)
    
    for line in open(
      os.path.join(graph_dir, '454ContigGraph.txt'),
      'rU'):
        parts = line.rstrip('\n').split('\t')
        
        if parts[0].isdigit():
            seq = sequence_ids[parts[1]]
            context_depths[ seq] = float(parts[3])
            context_depths[-seq] = float(parts[3])
        
        if parts[0] == 'C':    
            name1 = 'contig%05d' % int(parts[1])
            dir1 = {"3'" : 1, "5'" : -1 }[parts[2]]
            name2 = 'contig%05d' % int(parts[3])
            dir2 = {"5'" : 1, "3'" : -1 }[parts[4]]
            depth = int(parts[5])
            #print name1, dir1, name2, dir2, depth
            
            links[ sequence_ids[name1] * dir1 ].append( (depth, sequence_ids[name2] * dir2) )
            links[ sequence_ids[name2] * -dir2 ].append( (depth, sequence_ids[name1] * -dir1) )
    
        if parts[0] == 'S' and not custom_scaffolds:  
            name = 'scaffold%05d' % int(parts[2])  
            components = parts[3].split(';')
            scaffold = [ ]
            for component in components:
                a,b = component.split(':')
                if a == 'gap':
                    scaffold.append( ('gap',int(b)) )
                else:
                    strand = { '+': +1, '-': -1 }[ b ]
                    scaffold.append( ('contig', sequence_ids['contig%05d'%int(a)] * strand) )
            scaffolds.append( (name, scaffold) )
    
    
    
    #paths = { }
    #
    #todo = [ ]
    #for i in contexts:
    #    for depth_left, neg_left in links[-i]:
    #        left = -neg_left
    #        for depth_right, right in links[i]:
    #            todo.append( ( max(-depth_left,-depth_right,-context_depths[i]), left, right, (i,)) )
    #
    #heapq.heapify(todo)
    #while todo:
    #    score, source, dest, path = heapq.heappop(todo)
    #    if (source,dest) in paths: continue
    #    
    #    paths[(source,dest)] = path
    #    
    #    if len(contexts[dest]) > max_filler_length: continue
    #    
    #    for depth, next in links[dest]:
    #        heapq.heappush(todo,
    #            ( max(score,-depth,-context_depths[dest]), source, next, path+(dest,))
    #        )
    
    
    path_source_dest = collections.defaultdict(dict) # source -> dest -> next
    path_dest_source = collections.defaultdict(dict) # dest -> source -> next
    
    
    # Use links, in order to depth of coverage, to construct paths between contigs
    # Thus: paths have maximum minimum depth
    #       subsections of paths also have this property
    
    todo = [ ]
    for i in contexts:    
        for depth_link, right in links[i]:
            todo.append( ( depth_link, i, right) )
    todo.sort(reverse=True)
    for score, left, right in todo:
        if right in path_source_dest[left]: continue
        
        sources = [(left,right)]
        if len(contexts[left]) <= max_filler_length:
            sources += path_dest_source[left].items()
        destinations = [right]
        if len(contexts[right]) <= max_filler_length:
            destinations += path_source_dest[right].keys()
        
        for source, next in sources:
            for dest in destinations:
                if dest in path_source_dest[source]: continue
                path_source_dest[source][dest] = next
                path_dest_source[dest][source] = next
    
    
    workspace = io.Workspace(output_dir)
    scaffold_f = workspace.open('scaffolds.fa','wb')
    
    #comments = [ ]
    features = [ ]
    
    used = set()
    previous_total = 0
    
    for i, (name, scaffold) in enumerate(scaffolds):
        result = '' # Inefficient. Meh.
        n_filled = 0
        n_failed = 0
        for j, item in enumerate(scaffold):
            if item[0] == 'contig':
                result += contexts[item[1]]
                used.add(abs(item[1]))
            else:
                left = scaffold[j-1]
                right = scaffold[ (j+1) % len(scaffold) ] #If gap at end, assume circular
                assert left[0] == 'contig'
                assert right[0] == 'contig'
                
                gap_start = len(result)
    
                can_fill = right[1] in path_source_dest[left[1]]
                if can_fill:
                    n = 0
                    k = path_source_dest[left[1]][right[1]]
                    while k != right[1]:
                        n += len(contexts[k])
                        result += contexts[k].lower()
                        used.add(abs(k))
                        
                        k = path_source_dest[k][right[1]]
                    
                    n_filled += 1
                        
                    if item[1] is not None and max(n,item[1]) > min(n,item[1])*4:
                        print >> sys.stderr, 'Warning: gap size changed from %d to %d in scaffold %d' % (item[1],n,i+1)
                else:
                    n_failed += 1
                    
                    #print >> sys.stderr, 'Warning: No path to fill a gap in scaffold %d' % (i+1)
                    result += 'n' * (9 if item[1] is None else item[1])
    
                gap_end = len(result)
                
                #features.append( '%s\t%s\t%s\t%d\t%d\t%s\t%s\t%s\t%s' % (
                #    'all-scaffolds',
                #    'fill-scaffolds',
                #    'gap',
                #    previous_total + gap_start+1,
                #    previous_total + max(gap_end, gap_start+1), #Allow for zeroed out gaps. Hmm.
                #    '.', #score
                #    '+', #strand
                #    '.', #frame
                #    '' #properties
                #))
                features.append( '%s\t%s\t%s\t%d\t%d\t%s\t%s\t%s\t%s' % (
                    name,
                    'fill-scaffolds',
                    'gap',
                    gap_start+1,
                    max(gap_end, gap_start+1), #Allow for zeroed out gaps. Hmm.
                    '.', #score
                    '+', #strand
                    '.', #frame
                    '' #properties
                ))
                    
    
        io.write_fasta(scaffold_f, name, result)
        previous_total += len(result)
        #comments.append('##sequence-region    %s %d %d' % (name, 1, len(result)))
        print >> sys.stderr, 'Scaffold%05d: %d gaps filled, %d could not be filled' % (i+1, n_filled, n_failed)
    
    scaffold_f.close()
    
    gff_f = workspace.open('scaffolds.gff', 'wb')
    #print >>gff_f, '##gff-version    3'
    #for comment in comments:
    #    print >>gff_f, comment
    for feature in features:
        print >>gff_f, feature
    gff_f.close()
    
    
    leftovers_f = workspace.open('leftovers.fa', 'wb')
    for name in sequence_names:
        if sequence_ids[name] not in used:
            io.write_fasta(leftovers_f, name, sequences[name])
    leftovers_f.close()
    
    ends = { }
    for i, (name, scaffold) in enumerate(scaffolds):
        if scaffold[-1][0] == 'gap': continue
        ends[ '%s start' % name ] = scaffold[-1][1]
        ends[ '%s end  ' % name ] = -scaffold[0][1] 
    
    for end1 in sorted(ends):
        options = [ end2 for end2 in ends if -ends[end2] in path_source_dest[ends[end1]] ]
        if len(options) == 1:
            print >> sys.stderr, 'Note: from', end1, 'only', options[0], 'is reachable'
Exemple #25
0
def main(args):
    mincov, args = grace.get_option_value(args, '--mincov', int, 1) 
    maxdiff, args = grace.get_option_value(args, '--maxdiff', int, 16) 
    minsize, args = grace.get_option_value(args, '--minsize', int, 200)
    what, args = grace.get_option_value(args, '--what', as_core_or_unique, 'core')    
    is_core = (what == 'core') 

    grace.expect_no_further_options(args)
    
    if len(args) < 2:
        print >> sys.stderr, HELP
        raise grace.Help_shown()
    
    output_dir, working_dirs = args[0], args[1:]
    
    assert not path.exists(path.join(output_dir, 'reference.fa')), \
        'Output directory not given'
    
    if not path.exists(output_dir):
        os.mkdir(output_dir)
    
    for name, seq in io.read_sequences(path.join(working_dirs[0],'reference.fa')):
        print name
        friendly_name = grace.filesystem_friendly_name(name)
        
        good = [ True ] * len(seq)
        
        for working_dir in working_dirs:
            if is_core:
               suffix = '-depth.userplot'
            else:
               suffix = '-ambiguous-depth.userplot'
            data = trivia.read_unstranded_userplot(
                os.path.join(working_dir, friendly_name+suffix)
            )
            assert len(seq) == len(data)
            for i in xrange(len(seq)):
               if good[i]:
                   if is_core:
                       good[i] = data[i] >= mincov
                   else:
                       good[i] = data[i] < mincov

        #Close holes
        start = -maxdiff-1
        n_holes = 0
        for i in xrange(len(seq)):
            if good[i]:
                 if 0 < i-start <= maxdiff:
                     for j in xrange(start,i): good[j] = True
                     n_holes += 1
                 start = i+1
        print 'Closed', grace.pretty_number(n_holes), 'holes'
        
        
        f = open(path.join(output_dir, '%s-%s.fa' % (friendly_name,what)), 'wb')
        io.write_fasta(f, name,
            ''.join([ (seq[i] if good[i] else 'N')
                      for i in xrange(len(seq)) ])
        )
        f.close()

        f = open(path.join(output_dir, '%s-%s_masked.fa' % (friendly_name,what)), 'wb')
        io.write_fasta(f, name,
            ''.join([ (seq[i] if good[i] else seq[i].lower())
                      for i in xrange(len(seq)) ])
        )
        f.close()

        f_good = open(path.join(output_dir, '%s-%s_parts.fa' % (friendly_name,what)), 'wb')
        f_nongood = open(path.join(output_dir, '%s-non%s_parts.fa' % (friendly_name,what)), 'wb')
        start = 0
        n_good = [0]
        n_good_bases = [0]    
        def emit(i):
            if i-start < minsize: return
            if good[start]:
                n_good[0] += 1
                n_good_bases[0] += i-start
            io.write_fasta(
                f_good if good[start] else f_nongood,
                '%s:%d..%d' % (name, start+1,i),
                seq[start:i]
            )
        for i in xrange(1,len(seq)):
            if good[i] != good[start]:
                emit(i)
                start = i
        emit(len(seq))
        f_nongood.close()
        f_good.close()
        
        print grace.pretty_number(sum(good)), 'bases are '+what+', of', grace.pretty_number(len(seq)), 'in reference sequence'
        print grace.pretty_number(n_good[0]), 'parts at least', grace.pretty_number(minsize), 'bases long with', grace.pretty_number(n_good_bases[0]), 'total bases'

        print
Exemple #26
0
def pastiche(args):
    if len(args) < 4:
        print USAGE
        return 1

    mask_only, args = grace.get_option_value(args, '--mask', grace.as_bool, False)
    min_leftover, args = grace.get_option_value(args, '--min-leftover', int, 20)
        
    output_dir, args = args[0], args[1:]
    
    #, ref_filename, contig_filenames = args[0], args[1], args[2:]
    
    ref_filenames = [ ]
    contig_filenames = [ ]
    grace.execute(args, {
        'contigs' : lambda args: contig_filenames.extend(args)
    }, lambda args: ref_filenames.extend(args))
    
    assert ref_filenames, 'No reference sequences given'
    assert contig_filenames, 'No contig sequences given'
    
    contigs = dict([ 
                 (name.split()[0], seq) 
                 for filename in contig_filenames 
                 for name, seq in io.read_sequences(filename) 
              ])
    dir_contigs = { }
    for name in contigs:
        dir_contigs[name + '+'] = contigs[name]
        dir_contigs[name + '-'] = bio.reverse_complement(contigs[name])
    
    dir_contigs_used = { }
    for name in dir_contigs:
        dir_contigs_used[name] = [ False ] * len(dir_contigs[name])


    workspace = io.Workspace(output_dir)
    temp_prefix = workspace._object_filename('temp-pastiche')
    
    out_f = workspace.open('pastiche.fa', 'wb')
    
    for ref_filename in ref_filenames:
      for ref_name, ref_seq in io.read_sequences(ref_filename):
        ref_name = ref_name.split()[0]
        
        grace.status(ref_name)
        
        f = open(temp_prefix + '.fa','wb')
        io.write_fasta(f, 'ref', ref_seq)
        f.close()
    
        scores = [ -1 ] * (len(ref_seq)*2)
        strings = [ 'N', '' ] * (len(ref_seq))
        contexts = [ None for i in xrange(len(ref_seq)*2) ]
        
        #MAXSCORE = len(ref_seq)+1
        #for i in xrange(len(ref_seq)):
        #    if ref_seq[i].upper() != 'N':
        #        strings[i*2] = ref_seq[i]
        #        scores[i*2] = MAXSCORE
        #for i in xrange(len(ref_seq)-1):
        #    if ref_seq[i].upper() != 'N' and ref_seq[i+1].upper() != 'N':
        #        scores[i*2+1] = MAXSCORE

        if mask_only:        
            for i in xrange(len(ref_seq)):
                strings[i*2] = ref_seq[i].lower()
        
        
        def put(position, dir_contig_name, start, end, score):
            if scores[position] < score:
                scores[position] = score
                strings[position] = dir_contigs[dir_contig_name][start:end]
                contexts[position] = (dir_contig_name, start, end, score)

        for contig_filename in contig_filenames:
            execute(['nucmer',
                     '--prefix', temp_prefix,
                     #'--maxmatch', #Very slow
                     '--nosimplify',
                     '--minmatch', '9',
                     '--mincluster', '50',
                     #'--maxgap', '1000',
                     #'--breaklen', '1000', # Increasing this reduces Ns, but is slow
                     #'--diagfactor', '1.0',
                     temp_prefix+'.fa',
                     contig_filename])
            
            for contig_name, contig_seq in io.read_sequences(contig_filename):
                contig_name = contig_name.split()[0]
                grace.status(ref_name + ' vs ' + contig_name)
                p = run(['show-aligns', temp_prefix+'.delta', 'ref', contig_name],
                        stderr=subprocess.PIPE)
                
                alignments = [ ]
                
                while True:
                    line = p.stdout.readline()
                    if not line: break
                    if not line.startswith('-- BEGIN'):
                        continue
                    
                    parts = line.split()
                    
                    ref_start = int(parts[5])
                    ref_end = int(parts[7])
                    query_start = int(parts[10])
                    query_end = int(parts[12])
                    
                    #assert ref_start < ref_end
                    #ref_start -= 1 #Zero based coordinates
                    
                    al_ref = [ ]
                    al_query = [ ]
                    
                    while True:
                        block = [ ]
                        end = False
                        while True:
                            line = p.stdout.readline()
                            if line.startswith('--   END'): 
                                end = True
                                break
                            if line == '\n':
                                if block: 
                                    break
                                else:
                                    continue
                            block.append(line)
                        
                        if end: break
                        
                        al_ref.append(block[0].split()[1])
                        al_query.append(block[1].split()[1])
                        
                    al_ref = ''.join(al_ref)
                    al_query = ''.join(al_query)            
                    
                    if ref_start > ref_end:
                       al_ref = bio.reverse_complement(al_ref)
                       al_query = bio.reverse_complement(al_query)
                       ref_start, ref_end = ref_end, ref_start
                       query_start, query_end = query_end, query_start
                    
                    if query_start > query_end:
                       dir_contig_name = contig_name + '-'
                       query_start = len(contig_seq)+1-query_start
                       query_end = len(contig_seq)+1-query_end
                    else:
                       dir_contig_name = contig_name + '+'
                       
                    ref_start -= 1 #Zero based coordinates
                    query_start -= 1
                    
                    #print al_ref
                    #print al_query
                    
                    #Pretty dumb scoring scheme
                    al_score = 0
                    for i in xrange(len(al_ref)):
                        if al_ref[i] == al_query[i]:
                            al_score += 1
                        #else:
                        #    al_score -= 1
                    
                    #Pastiche alignment over reference
                    ref_pos = ref_start
                    query_pos = query_start
                    al_pos = 0
                    while al_pos < len(al_ref):
                        assert al_ref[al_pos] != '.'                
                        if al_query[al_pos] == '.':
                            put(ref_pos*2, dir_contig_name, query_pos, query_pos, al_score)
                        else:
                            assert al_query[al_pos].lower() == dir_contigs[dir_contig_name][query_pos].lower()
                            put(ref_pos*2, dir_contig_name, query_pos, query_pos+1, al_score)
                            query_pos += 1
                        al_pos += 1
                        
                        al_pos_end = al_pos
                        query_pos_end = query_pos
                        while al_pos_end < len(al_ref) and al_ref[al_pos_end] == '.':
                            al_pos_end += 1
                            query_pos_end += 1
                        #put(ref_pos*2+1, al_query[al_pos:al_pos_end], al_score)
                        assert al_query[al_pos:al_pos_end].lower() == dir_contigs[dir_contig_name][query_pos:query_pos_end].lower() 
                        put(ref_pos*2+1, dir_contig_name, query_pos,query_pos_end, al_score)
                        al_pos = al_pos_end
                        query_pos = query_pos_end
                        ref_pos += 1
                    
                    
                p.wait()
            
        grace.status(ref_name)
        
        result = ''.join(strings)    
        io.write_fasta(out_f, ref_name, result)
        
        
        for context in contexts:
            if context is None: continue
            name,start,end,score = context
            for i in xrange(start,end):
                dir_contigs_used[name][i] = True
        
        
        #Interpolation
        #result = [ ]
        #i = 0
        #while i < len(ref_seq):
        #    if strings[i*2].upper() != 'N':
        #        result.append(strings[i*2])
        #        result.append(strings[i*2+1])
        #        i += 1
        #        continue
        #    
        #    j = i
        #    while strings[j*2].upper() == 'N':
        #        j += 1
        #    
        #    grace.status('')
        #    print >> sys.stderr, 'interpolating', i+1,'..',j
        #    
        #    window = 20 #!!!!!!!!!!!
        #    left_contexts = collections.defaultdict(lambda:0)
        #    for i1 in xrange(max(0,i-window),i):
        #        for context_name, context_start, context_end, context_score in contexts[i1*2]:
        #            key = (context_name, context_end + i - i1)
        #            left_contexts[key] = max(left_contexts[key],context_score)
        #        
        #    right_contexts = collections.defaultdict(lambda:0)
        #    for j1 in xrange(j,min(j+window,len(ref_seq))):
        #        for context_name, context_start, context_end, context_score in contexts[j1*2]:
        #            key = (context_name, context_start + j - j1)
        #            right_contexts[key] = max(left_contexts[key],context_score)
        #    
        #    #print >> sys.stderr, left_contexts
        #    #print >> sys.stderr, right_contexts
        #    
        #    options = [ ]
        #    
        #    for (left_name, left_pos), left_score in left_contexts.items():
        #        for (right_name, right_pos), right_score in right_contexts.items():
        #            if left_name != right_name: continue
        #            if right_pos < left_pos: continue
        #            
        #            if right_pos-left_pos > (j-i) * 4.0 + 10: continue   #!!!!!!!!!!!!!!!!!!!!!!1
        #            if right_pos-left_pos < (j-i) * 0.25 - 10: continue
        #            
        #            score = float(min(right_pos-left_pos,j-i))/max(right_pos-left_pos,j-i)                  
        #            score *= left_score + right_score
        #            #print >> sys.stderr, left_name, right_pos-left_pos, j-i, score
        #            options.append( (score, left_name, left_pos, right_pos) )
        #    
        #    if options:
        #        best = max(options, key=lambda option: option[0])
        #        print >> sys.stderr, '->', best
        #        result.append( dir_contigs[best[1]][best[2]:best[3]].lower() )
        #    else:
        #        print >> sys.stderr, '-> no good interpolation'
        #        result.append( ref_seq[i:j] )
        #    
        #    i = j
        #
        #result = ''.join(result)    
        #io.write_fasta(sys.stdout, ref_name, result)
        
        
        #print >> sys.stderr, len(result), result.count('N')
        #for pos, size in N_runs:
        #    out_size = len(''.join( strings[pos*2:pos*2+2] ))
        #    print >> sys.stderr, pos, size, '->', out_size        
    
    out_f.close()
    
    grace.status('')
    
    #for name, seq in io.read_sequences(ref_filename):
    #    result = pastiche(seq, contigs_filename)
    #    io.write_fasta(sys.stdout, name, result)
    
    
    leftover_f = workspace.open('leftovers.fa','wb')

    for name in sorted(contigs):
        used = [ (a or b) for a,b in zip(dir_contigs_used[name+'+'],dir_contigs_used[name+'-'][::-1]) ]

        i = 0
        while i < len(used):
            j = i
            while j < len(used) and not used[j]: 
                j += 1
            if j-i > min_leftover:
                if i == 0 and j == len(used):
                    out_name = name
                else:
                    out_name = name + ':%d..%d' % (i+1,j)
                io.write_fasta(leftover_f, out_name, contigs[name][i:j])
            
            i = j+1        

    leftover_f.close()

    for suffix in ['.fa', '.delta']:
        os.unlink(temp_prefix + suffix)