Exemple #1
0
def main(args):
    size, args = grace.get_option_value(args, '--size', int, 200)
    stride, args = grace.get_option_value(args, '--stride', int, 50)
    grace.expect_no_further_options(args)

    if not args:
        print USAGE
        return 1

    for filename in args:
        for name, seq in io.read_sequences(filename):
            name_parts = name.split(None, 1)
            name = name_parts[0]
            if len(name_parts) > 1:
                desc = ' ' + name_parts[1]
            else:
                desc = ''

            for i in xrange(-size + stride, len(seq), stride):
                start = max(0, min(len(seq), i))
                end = max(0, min(len(seq), i + size))
                io.write_fasta(sys.stdout,
                               '%s:%d..%d' % (name, start + 1, end) + desc,
                               seq[start:end])

    return 0
Exemple #2
0
 def emit(i):
     if i - start < minsize: return
     if good[start]:
         n_good[0] += 1
         n_good_bases[0] += i - start
     io.write_fasta(f_good if good[start] else f_nongood,
                    '%s:%d..%d' % (name, start + 1, i), seq[start:i])
Exemple #3
0
    def run(self):
        extractions = [ ]
        for item in self.genes.split(','):
            extraction = item.split('/')
            assert len(extraction) == 4
            extractions.append(extraction)
            
        rename = { }
        if self.rename:
            for item in self.rename.split(','):
                old,new = item.split('=')
                rename[old] = new

        work = self.get_workspace()        
        
        with workspace.tempspace() as temp:
            items = list(annotation.read_annotations(self.annotation))
            for item in items:
                item.seqid = rename.get(item.seqid, item.seqid)
            annotation.write_gff3(temp/'temp.gff', get_genes(items, extractions, self.log))
            del items
            
            with open(temp/'temp.fa','wb') as f:
                for name,seq in io.read_sequences(self.genome):
                    name = name.split()[0]
                    name = rename.get(name,name)
                    io.write_fasta(f, name, seq)
            
            reference_directory.Make_tt_reference(
                self.output_dir,
                filenames = [ temp/'temp.fa', temp/'temp.gff' ],
                index = self.index,
                ).run()
    def run(self):
        extractions = [ ]
        for item in self.genes.split(','):
            extraction = item.split('/')
            assert len(extraction) == 4
            extractions.append(extraction)
            
        rename = { }
        if self.rename:
            for item in self.rename.split(','):
                old,new = item.split('=')
                rename[old] = new

        work = self.get_workspace()        
        
        with workspace.tempspace() as temp:
            items = list(annotation.read_annotations(self.annotation))
            for item in items:
                item.seqid = rename.get(item.seqid, item.seqid)
            annotation.write_gff3(temp/'temp.gff', get_genes(items, extractions, self.log))
            del items
            
            with open(temp/'temp.fa','wb') as f:
                for name,seq in io.read_sequences(self.genome):
                    name = name.split()[0]
                    name = rename.get(name,name)
                    io.write_fasta(f, name, seq)
            
            reference_directory.Make_tt_reference(
                self.output_dir,
                filenames = [ temp/'temp.fa', temp/'temp.gff' ] + self.extra,
                index = self.index, shrimp = self.shrimp, 
                bowtie = self.bowtie, star = self.star
                ).run()
Exemple #5
0
def main(args):
    size, args = grace.get_option_value(args,'--size',int,200)
    stride, args = grace.get_option_value(args,'--stride',int,50)
    grace.expect_no_further_options(args)
    
    if not args:
        print USAGE
        return 1
    
    for filename in args:
        for name, seq in io.read_sequences(filename):
            name_parts = name.split(None, 1)
            name = name_parts[0]
            if len(name_parts) > 1:
               desc = ' ' + name_parts[1]
            else:
               desc = ''
            
            for i in xrange(-size+stride,len(seq),stride):
                start = max(0,min(len(seq),i))
                end = max(0,min(len(seq), i+size))
                io.write_fasta(
                    sys.stdout,
                    '%s:%d..%d' % (name,start+1,end) + desc,
                    seq[start:end]
                )
    
    return 0
 def build_snpeff(self):
     jar = io.find_jar('snpEff.jar')
     
     with open(self/'snpeff.config','wb') as f:
         print >> f, 'data_dir = snpeff'
         print >> f, 'genomes : ' + self.name
         print >> f, self.name + '.genome : ' + self.name 
     
     snpwork = io.Workspace(self/'snpeff',must_exist=False)
     snpwork_genome = io.Workspace(snpwork/self.name,must_exist=False)
     snpwork_genomes = io.Workspace(snpwork/'genomes',must_exist=False)
     
     annotations = self.annotations_filename()
     assert annotations
     with open(snpwork_genome/'genes.gff','wb') as f:
         for record in annotation.read_annotations(annotations):
             if record.end <= record.start: continue
             if not record.attr:
                 record.attr['attributes'] = 'none'
             print >> f, record.as_gff()
     
     with open(snpwork_genomes/(self.name+'.fa'),'wb') as f:
         for name, seq in io.read_sequences(self.reference_fasta_filename()):
             io.write_fasta(f, name, seq)
             
     io.execute('java -jar JAR build NAME -gff3 -c CONFIG',
         JAR=jar, NAME=self.name, CONFIG=self/'snpeff.config')
Exemple #7
0
 def emit(i):
     if i-start < minsize: return
     if good[start]:
         n_good[0] += 1
         n_good_bases[0] += i-start
     io.write_fasta(
         f_good if good[start] else f_nongood,
         '%s:%d..%d' % (name, start+1,i),
         seq[start:i]
     )
Exemple #8
0
    def run(self):
        workspace = self.get_workspace()

        reference = reference_directory.Reference(self.reference,
                                                  must_exist=True)

        reader_f = io.open_possibly_compressed_file(self.vcf)
        reader = vcf.Reader(reader_f)
        variants = collections.defaultdict(list)
        for record in reader:
            variants[record.CHROM].append(record)
        reader_f.close()

        for chrom in variants:
            variants[chrom].sort(key=lambda item: item.POS)

        filenames = [workspace / (item + '.fa') for item in reader.samples]
        for filename in filenames:
            with open(filename, 'wb'):
                pass

        for name, seq in io.read_sequences(
                reference.reference_fasta_filename()):
            for i, sample in enumerate(reader.samples):
                revised = []
                pos = 0
                for variant in variants[name]:
                    gt = variant.samples[i].data.GT
                    if gt is None: continue
                    assert gt.isdigit(
                    ), 'Unsupported genotype (can only use haploid genotypes): ' + gt
                    gt_number = int(gt)
                    if gt_number == 0:
                        var_seq = variant.REF
                    else:
                        var_seq = str(variant.ALT[gt_number - 1])
                        assert re.match(
                            '[ACGTN]*$',
                            var_seq), 'Unsupported variant type: ' + var_seq
                    new_pos = variant.POS - 1
                    assert new_pos >= pos, 'Variants overlap.'
                    revised.append(seq[pos:new_pos])
                    pos = new_pos
                    revised.append(var_seq)
                    assert seq[pos:pos + len(variant.REF)].upper(
                    ) == variant.REF, 'REF column in VCF does not match reference sequence'
                    pos += len(variant.REF)
                revised.append(seq[pos:])

                with open(filenames[i], 'ab') as f:
                    io.write_fasta(f, name, ''.join(revised))

            del variants[name]
        assert not variants, 'Chromosome names in VCF not in reference: ' + ' '.join(
            variants)
Exemple #9
0
    def run(self):
        assert self.release
        assert self.species
        assert self.assembly
        assert self.dna
        
        extractions = [ ]
        for item in self.genes.split(','):
            extraction = item.split('/')
            assert len(extraction) == 4
            extractions.append(extraction)
            
        rename = { }
        if self.rename:
            for item in self.rename.split(','):
                old,new = item.split('=')
                rename[old] = new

        work = self.get_workspace()        
        ensembl = workspace.Workspace(work/'ensembl')
        
        genome_filename = self.species+"."+self.assembly+"."+self.dna+".fa.gz"
        genome_url = "rsync://ftp.ensembl.org/ensembl/pub/release-"+self.release+"/fasta/"+self.species.lower()+"/dna/"+genome_filename
        
        gff_filename = self.species+"."+self.assembly+"."+self.release+".gff3.gz"
        gff_url = "rsync://ftp.ensembl.org/ensembl/pub/release-"+self.release+"/gff3/"+self.species.lower()+"/"+gff_filename
        
        
        if self.download:
            self.log.log("Fetching "+genome_url+"\n")
            io.execute(['rsync','-aP',genome_url, ensembl/genome_filename])
            self.log.log("Fetching "+gff_url+"\n")
            io.execute(['rsync','-aP',gff_url, ensembl/gff_filename])
        
        with workspace.tempspace() as temp:
            items = list(annotation.read_annotations(ensembl/gff_filename))
            for item in items:
                item.seqid = rename.get(item.seqid, item.seqid)
            annotation.write_gff3(temp/'temp.gff', get_genes(items, extractions, self.log))
            del items
            
            with open(temp/'temp.fa','wb') as f:
                for name,seq in io.read_sequences(ensembl/genome_filename):
                    name = name.split()[0]
                    name = rename.get(name,name)
                    io.write_fasta(f, name, seq)
            
            reference_directory.Make_tt_reference(
                self.output_dir,
                filenames = [ temp/'temp.fa', temp/'temp.gff' ],
                index = self.index,
                ).run()
Exemple #10
0
def do_fasta_output(names, interesting):
    for i, name in enumerate(names):
        sequence = [ ]
        #for refname in substitution_calls:
        #    sequence.extend(substitution_calls[refname][i])
        #for i in xrange(len(sequence)):
        #    if sequence[i] not in 'ACGT':
        #        sequence[i] = 'N'

        for refname, position, change_type, values, has_ambiguous, evidence in interesting:
            if not has_ambiguous and change_type == 'substitution':
                sequence.append(values[i]) 
            
        io.write_fasta(sys.stdout, name + ' variable sites with consensus in all', ''.join(sequence))
Exemple #11
0
def do_fasta_output(names, interesting):
    for i, name in enumerate(names):
        sequence = []
        # for refname in substitution_calls:
        #    sequence.extend(substitution_calls[refname][i])
        # for i in xrange(len(sequence)):
        #    if sequence[i] not in 'ACGT':
        #        sequence[i] = 'N'

        for refname, position, change_type, values, has_ambiguous, evidence in interesting:
            if not has_ambiguous and change_type == "substitution":
                sequence.append(values[i])

        io.write_fasta(sys.stdout, name + " variable sites with consensus in all", "".join(sequence))
Exemple #12
0
    def run(self):
        workspace = self.get_workspace()
        
        reference = reference_directory.Reference(self.reference, must_exist=True)
        
        reader_f = io.open_possibly_compressed_file(self.vcf)
        reader = vcf.Reader(reader_f)
        variants = collections.defaultdict(list)
        for record in reader:
            variants[record.CHROM].append(record)
        reader_f.close()
        
        for chrom in variants:
            variants[chrom].sort(key=lambda item: item.POS)
        
        filenames = [ workspace/(item+'.fa') for item in reader.samples ]
        for filename in filenames:
            with open(filename,'wb'): pass
        
        for name, seq in io.read_sequences(reference.reference_fasta_filename()):
            for i, sample in enumerate(reader.samples):            
                revised = [ ]
                pos = 0
                for variant in variants[name]:
                    gt = variant.samples[i].data.GT
                    if gt is None: continue
                    assert gt.isdigit(), 'Unsupported genotype (can only use haploid genotypes): '+gt
                    gt_number = int(gt)
                    if gt_number == 0:
                        var_seq = variant.REF
                    else:
                        var_seq = str(variant.ALT[gt_number-1])
                        assert re.match('[ACGTN]*$', var_seq), 'Unsupported variant type: '+var_seq
                    new_pos = variant.POS-1
                    assert new_pos >= pos, 'Variants overlap.'
                    revised.append(seq[pos:new_pos])
                    pos = new_pos
                    revised.append(var_seq)
                    assert seq[pos:pos+len(variant.REF)].upper() == variant.REF, 'REF column in VCF does not match reference sequence'
                    pos += len(variant.REF)
                revised.append(seq[pos:])
                            
                with open(filenames[i],'ab') as f:
                    io.write_fasta(f, name, ''.join(revised))

            del variants[name]        
        assert not variants, 'Chromosome names in VCF not in reference: '+' '.join(variants)
Exemple #13
0
    def run(self):    
        f = self.begin_output()
    
        for filename in self.filenames:
            for name, seq in io.read_sequences(filename):
                name_parts = name.split(None, 1)
                name = name_parts[0]
                for i in xrange(-self.size+self.stride,len(seq),self.stride):
                    start = max(0,min(len(seq),i))
                    end = max(0,min(len(seq), i+self.size))
                    shred_name = '%s:%d..%d' % (name,start+1,end)
                    shred_seq = seq
                    if self.quality:
                        io.write_fastq(f, shred_name, seq[start:end], chr(33+self.quality)*(end-start))
                    else:
                        io.write_fasta(f, shred_name, seq[start:end])

        self.end_output(f)
Exemple #14
0
    def run(self):
        f = self.begin_output()

        for filename in self.filenames:
            for name, seq in io.read_sequences(filename):
                name_parts = name.split(None, 1)
                name = name_parts[0]
                for i in xrange(-self.size + self.stride, len(seq),
                                self.stride):
                    start = max(0, min(len(seq), i))
                    end = max(0, min(len(seq), i + self.size))
                    shred_name = '%s:%d..%d' % (name, start + 1, end)
                    shred_seq = seq
                    if self.quality:
                        io.write_fastq(f, shred_name, seq[start:end],
                                       chr(33 + self.quality) * (end - start))
                    else:
                        io.write_fasta(f, shred_name, seq[start:end])

        self.end_output(f)
    def set_sequences(self, filenames):        
        reference_genbank_filename = self / 'reference.gbk'
        reference_filename = self / 'reference.fa'

        reference_genbank_file = open(reference_genbank_filename,'wb')
        any_genbank = [ False ]

        def genbank_callback(name, record):
            """ Make a copy of any genbank files passed in. """
            from Bio import SeqIO
            
            SeqIO.write([record], reference_genbank_file, 'genbank')
            
            f = open(self / (grace.filesystem_friendly_name(name) + '.gbk'), 'wb')
            SeqIO.write([record], f, 'genbank')
            f.close()
            
            any_genbank[0] = True
        
        lengths = [ ]
        seen = set()
        f = open(reference_filename, 'wb')
        for filename in filenames:
            for name, seq in io.read_sequences(filename, genbank_callback=genbank_callback):
                name = name.split()[0]
                assert name not in seen, 'Duplicate chromosome name: ' + name
                seen.add(name)
                lengths.append( (name, len(seq)) )
                io.write_fasta(f, name, seq)
        f.close()        
        self.set_object(lengths, 'reference-lengths.pickle.gz')
        
        reference_genbank_file.close()
        if not any_genbank[0]:
            os.unlink(reference_genbank_filename)
            
        # Create an index of the reference sequences for samtools
        io.execute([
            'samtools', 'faidx', reference_filename
        ])
Exemple #16
0
def main(args):
    mincov, args = grace.get_option_value(args, '--mincov', int, 1)
    maxdiff, args = grace.get_option_value(args, '--maxdiff', int, 16)
    minsize, args = grace.get_option_value(args, '--minsize', int, 200)
    what, args = grace.get_option_value(args, '--what', as_core_or_unique,
                                        'core')
    is_core = (what == 'core')

    grace.expect_no_further_options(args)

    if len(args) < 2:
        print >> sys.stderr, HELP
        raise grace.Help_shown()

    output_dir, working_dirs = args[0], args[1:]

    assert not path.exists(path.join(output_dir, 'reference.fa')), \
        'Output directory not given'

    if not path.exists(output_dir):
        os.mkdir(output_dir)

    for name, seq in io.read_sequences(
            path.join(working_dirs[0], 'reference.fa')):
        print name
        friendly_name = grace.filesystem_friendly_name(name)

        good = [True] * len(seq)

        for working_dir in working_dirs:
            if is_core:
                suffix = '-depth.userplot'
            else:
                suffix = '-ambiguous-depth.userplot'
            data = trivia.read_unstranded_userplot(
                os.path.join(working_dir, friendly_name + suffix))
            assert len(seq) == len(data)
            for i in xrange(len(seq)):
                if good[i]:
                    if is_core:
                        good[i] = data[i] >= mincov
                    else:
                        good[i] = data[i] < mincov

        #Close holes
        start = -maxdiff - 1
        n_holes = 0
        for i in xrange(len(seq)):
            if good[i]:
                if 0 < i - start <= maxdiff:
                    for j in xrange(start, i):
                        good[j] = True
                    n_holes += 1
                start = i + 1
        print 'Closed', grace.pretty_number(n_holes), 'holes'

        f = open(path.join(output_dir, '%s-%s.fa' % (friendly_name, what)),
                 'wb')
        io.write_fasta(
            f, name,
            ''.join([(seq[i] if good[i] else 'N') for i in xrange(len(seq))]))
        f.close()

        f = open(
            path.join(output_dir, '%s-%s_masked.fa' % (friendly_name, what)),
            'wb')
        io.write_fasta(
            f, name, ''.join([(seq[i] if good[i] else seq[i].lower())
                              for i in xrange(len(seq))]))
        f.close()

        f_good = open(
            path.join(output_dir, '%s-%s_parts.fa' % (friendly_name, what)),
            'wb')
        f_nongood = open(
            path.join(output_dir, '%s-non%s_parts.fa' % (friendly_name, what)),
            'wb')
        start = 0
        n_good = [0]
        n_good_bases = [0]

        def emit(i):
            if i - start < minsize: return
            if good[start]:
                n_good[0] += 1
                n_good_bases[0] += i - start
            io.write_fasta(f_good if good[start] else f_nongood,
                           '%s:%d..%d' % (name, start + 1, i), seq[start:i])

        for i in xrange(1, len(seq)):
            if good[i] != good[start]:
                emit(i)
                start = i
        emit(len(seq))
        f_nongood.close()
        f_good.close()

        print grace.pretty_number(
            sum(good)), 'bases are ' + what + ', of', grace.pretty_number(
                len(seq)), 'in reference sequence'
        print grace.pretty_number(
            n_good[0]), 'parts at least', grace.pretty_number(
                minsize), 'bases long with', grace.pretty_number(
                    n_good_bases[0]), 'total bases'

        print


# python2.6 modify_sequence.py data/velvet_test_reference.fa >data/velvet_test_reference_modified.fa

import sys, random

from nesoni import io

for name, seq in io.read_sequences(sys.argv[1]):
    j = 0    
    for i in xrange(0,len(seq)-100,100):
        original = seq[i]
        if j%3 == 0:
            while True:
                new = random.choice('ACGT')
                if new != original: break
            seq = seq[:i] + new + seq[i+1:]
        elif j%3 == 1:
            n = (j // 3) % 9 + 1
            seq = seq[:i] + seq[i+n:]
        else:
            n = (j // 3) % 9 + 1
            seq = seq[:i] + ''.join( random.choice('ACGT') for k in xrange(n) ) + seq[i:]
        j += 1
    io.write_fasta(sys.stdout, name, seq)
Exemple #18
0
    def run(self):
        #mincov, args = grace.get_option_value(args, '--mincov', int, 1)
        #maxdiff, args = grace.get_option_value(args, '--maxdiff', int, 16)
        #minsize, args = grace.get_option_value(args, '--minsize', int, 200)
        #what, args = grace.get_option_value(args, '--what', as_core_or_unique, 'core')
        #is_core = (what == 'core')
        #
        #grace.expect_no_further_options(args)
        #
        #if len(args) < 2:
        #    print >> sys.stderr, HELP
        #    raise grace.Help_shown()
        #
        #output_dir, working_dirs = args[0], args[1:]
        #
        ##assert not path.exists(path.join(output_dir, 'reference.fa')), \
        #assert not path.exists(path.join(output_dir, 'parameters')), \
        #        'Output directory not given'
        #
        #if not path.exists(output_dir):
        #    os.mkdir(output_dir)

        assert self.what in (
            'core',
            'unique'), 'Expected --what to be either "core" or "unique".'
        is_core = (self.what == 'core')

        workspace = self.get_workspace()

        for name, seq in io.read_sequences(
                working_directory.Working(self.working_dirs[0]).get_reference(
                ).reference_fasta_filename()):
            self.log.log(name + '\n')
            friendly_name = grace.filesystem_friendly_name(name)

            good = [True] * len(seq)

            for working_dir in self.working_dirs:
                if is_core:
                    suffix = '-depth.userplot'
                else:
                    suffix = '-ambiguous-depth.userplot'
                data = trivia.read_unstranded_userplot(
                    os.path.join(working_dir, friendly_name + suffix))
                assert len(seq) == len(data)
                for i in xrange(len(seq)):
                    if good[i]:
                        if is_core:
                            good[i] = data[i] >= self.mincov
                        else:
                            good[i] = data[i] < self.mincov

            #Close holes
            start = -self.maxdiff - 1
            n_holes = 0
            for i in xrange(len(seq)):
                if good[i]:
                    if 0 < i - start <= self.maxdiff:
                        for j in xrange(start, i):
                            good[j] = True
                        n_holes += 1
                    start = i + 1
            self.log.log('Closed ' + grace.pretty_number(n_holes) + ' holes\n')

            f = open(workspace / ('%s-%s.fa' % (friendly_name, self.what)),
                     'wb')
            io.write_fasta(
                f, name, ''.join([(seq[i] if good[i] else 'N')
                                  for i in xrange(len(seq))]))
            f.close()

            f = open(
                workspace / ('%s-%s_masked.fa' % (friendly_name, self.what)),
                'wb')
            io.write_fasta(
                f, name, ''.join([(seq[i] if good[i] else seq[i].lower())
                                  for i in xrange(len(seq))]))
            f.close()

            f_good = open(
                workspace / ('%s-%s_parts.fa' % (friendly_name, self.what)),
                'wb')
            f_nongood = open(
                workspace / ('%s-non%s_parts.fa' % (friendly_name, self.what)),
                'wb')
            start = 0
            n_good = [0]
            n_good_bases = [0]

            def emit(i):
                if i - start < self.minsize: return
                if good[start]:
                    n_good[0] += 1
                    n_good_bases[0] += i - start
                io.write_fasta(f_good if good[start] else f_nongood,
                               '%s:%d..%d' % (name, start + 1, i),
                               seq[start:i])

            for i in xrange(1, len(seq)):
                if good[i] != good[start]:
                    emit(i)
                    start = i
            emit(len(seq))
            f_nongood.close()
            f_good.close()

            self.log.log(
                grace.pretty_number(sum(good)) + ' bases are ' + self.what +
                ', of ' + grace.pretty_number(len(seq)) +
                ' in reference sequence\n')
            self.log.log(
                grace.pretty_number(n_good[0]) + ' parts at least ' +
                grace.pretty_number(self.minsize) + ' bases long with ' +
                grace.pretty_number(n_good_bases[0]) + ' total bases\n')
            self.log.log('\n')
Exemple #19
0
def pastiche(args):
    if len(args) < 4:
        print USAGE
        return 1

    mask_only, args = grace.get_option_value(args, '--mask', grace.as_bool,
                                             False)
    min_leftover, args = grace.get_option_value(args, '--min-leftover', int,
                                                20)

    output_dir, args = args[0], args[1:]

    #, ref_filename, contig_filenames = args[0], args[1], args[2:]

    ref_filenames = []
    contig_filenames = []
    grace.execute(args,
                  {'contigs': lambda args: contig_filenames.extend(args)},
                  lambda args: ref_filenames.extend(args))

    assert ref_filenames, 'No reference sequences given'
    assert contig_filenames, 'No contig sequences given'

    contigs = dict([(name.split()[0], seq) for filename in contig_filenames
                    for name, seq in io.read_sequences(filename)])
    dir_contigs = {}
    for name in contigs:
        dir_contigs[name + '+'] = contigs[name]
        dir_contigs[name + '-'] = bio.reverse_complement(contigs[name])

    dir_contigs_used = {}
    for name in dir_contigs:
        dir_contigs_used[name] = [False] * len(dir_contigs[name])

    workspace = io.Workspace(output_dir)
    temp_prefix = workspace._object_filename('temp-pastiche')

    out_f = workspace.open('pastiche.fa', 'wb')

    for ref_filename in ref_filenames:
        for ref_name, ref_seq in io.read_sequences(ref_filename):
            ref_name = ref_name.split()[0]

            grace.status(ref_name)

            f = open(temp_prefix + '.fa', 'wb')
            io.write_fasta(f, 'ref', ref_seq)
            f.close()

            scores = [-1] * (len(ref_seq) * 2)
            strings = ['N', ''] * (len(ref_seq))
            contexts = [None for i in xrange(len(ref_seq) * 2)]

            #MAXSCORE = len(ref_seq)+1
            #for i in xrange(len(ref_seq)):
            #    if ref_seq[i].upper() != 'N':
            #        strings[i*2] = ref_seq[i]
            #        scores[i*2] = MAXSCORE
            #for i in xrange(len(ref_seq)-1):
            #    if ref_seq[i].upper() != 'N' and ref_seq[i+1].upper() != 'N':
            #        scores[i*2+1] = MAXSCORE

            if mask_only:
                for i in xrange(len(ref_seq)):
                    strings[i * 2] = ref_seq[i].lower()

            def put(position, dir_contig_name, start, end, score):
                if scores[position] < score:
                    scores[position] = score
                    strings[position] = dir_contigs[dir_contig_name][start:end]
                    contexts[position] = (dir_contig_name, start, end, score)

            for contig_filename in contig_filenames:
                execute([
                    'nucmer',
                    '--prefix',
                    temp_prefix,
                    #'--maxmatch', #Very slow
                    '--nosimplify',
                    '--minmatch',
                    '9',
                    '--mincluster',
                    '50',
                    #'--maxgap', '1000',
                    #'--breaklen', '1000', # Increasing this reduces Ns, but is slow
                    #'--diagfactor', '1.0',
                    temp_prefix + '.fa',
                    contig_filename
                ])

                for contig_name, contig_seq in io.read_sequences(
                        contig_filename):
                    contig_name = contig_name.split()[0]
                    grace.status(ref_name + ' vs ' + contig_name)
                    p = run([
                        'show-aligns', temp_prefix + '.delta', 'ref',
                        contig_name
                    ],
                            stderr=subprocess.PIPE)

                    alignments = []

                    while True:
                        line = p.stdout.readline()
                        if not line: break
                        if not line.startswith('-- BEGIN'):
                            continue

                        parts = line.split()

                        ref_start = int(parts[5])
                        ref_end = int(parts[7])
                        query_start = int(parts[10])
                        query_end = int(parts[12])

                        #assert ref_start < ref_end
                        #ref_start -= 1 #Zero based coordinates

                        al_ref = []
                        al_query = []

                        while True:
                            block = []
                            end = False
                            while True:
                                line = p.stdout.readline()
                                if line.startswith('--   END'):
                                    end = True
                                    break
                                if line == '\n':
                                    if block:
                                        break
                                    else:
                                        continue
                                block.append(line)

                            if end: break

                            al_ref.append(block[0].split()[1])
                            al_query.append(block[1].split()[1])

                        al_ref = ''.join(al_ref)
                        al_query = ''.join(al_query)

                        if ref_start > ref_end:
                            al_ref = bio.reverse_complement(al_ref)
                            al_query = bio.reverse_complement(al_query)
                            ref_start, ref_end = ref_end, ref_start
                            query_start, query_end = query_end, query_start

                        if query_start > query_end:
                            dir_contig_name = contig_name + '-'
                            query_start = len(contig_seq) + 1 - query_start
                            query_end = len(contig_seq) + 1 - query_end
                        else:
                            dir_contig_name = contig_name + '+'

                        ref_start -= 1  #Zero based coordinates
                        query_start -= 1

                        #print al_ref
                        #print al_query

                        #Pretty dumb scoring scheme
                        al_score = 0
                        for i in xrange(len(al_ref)):
                            if al_ref[i] == al_query[i]:
                                al_score += 1
                            #else:
                            #    al_score -= 1

                        #Pastiche alignment over reference
                        ref_pos = ref_start
                        query_pos = query_start
                        al_pos = 0
                        while al_pos < len(al_ref):
                            assert al_ref[al_pos] != '.'
                            if al_query[al_pos] == '.':
                                put(ref_pos * 2, dir_contig_name, query_pos,
                                    query_pos, al_score)
                            else:
                                assert al_query[al_pos].lower() == dir_contigs[
                                    dir_contig_name][query_pos].lower()
                                put(ref_pos * 2, dir_contig_name, query_pos,
                                    query_pos + 1, al_score)
                                query_pos += 1
                            al_pos += 1

                            al_pos_end = al_pos
                            query_pos_end = query_pos
                            while al_pos_end < len(
                                    al_ref) and al_ref[al_pos_end] == '.':
                                al_pos_end += 1
                                query_pos_end += 1
                            #put(ref_pos*2+1, al_query[al_pos:al_pos_end], al_score)
                            assert al_query[al_pos:al_pos_end].lower(
                            ) == dir_contigs[dir_contig_name][
                                query_pos:query_pos_end].lower()
                            put(ref_pos * 2 + 1, dir_contig_name, query_pos,
                                query_pos_end, al_score)
                            al_pos = al_pos_end
                            query_pos = query_pos_end
                            ref_pos += 1

                    p.wait()

            grace.status(ref_name)

            result = ''.join(strings)
            io.write_fasta(out_f, ref_name, result)

            for context in contexts:
                if context is None: continue
                name, start, end, score = context
                for i in xrange(start, end):
                    dir_contigs_used[name][i] = True

            #Interpolation
            #result = [ ]
            #i = 0
            #while i < len(ref_seq):
            #    if strings[i*2].upper() != 'N':
            #        result.append(strings[i*2])
            #        result.append(strings[i*2+1])
            #        i += 1
            #        continue
            #
            #    j = i
            #    while strings[j*2].upper() == 'N':
            #        j += 1
            #
            #    grace.status('')
            #    print >> sys.stderr, 'interpolating', i+1,'..',j
            #
            #    window = 20 #!!!!!!!!!!!
            #    left_contexts = collections.defaultdict(lambda:0)
            #    for i1 in xrange(max(0,i-window),i):
            #        for context_name, context_start, context_end, context_score in contexts[i1*2]:
            #            key = (context_name, context_end + i - i1)
            #            left_contexts[key] = max(left_contexts[key],context_score)
            #
            #    right_contexts = collections.defaultdict(lambda:0)
            #    for j1 in xrange(j,min(j+window,len(ref_seq))):
            #        for context_name, context_start, context_end, context_score in contexts[j1*2]:
            #            key = (context_name, context_start + j - j1)
            #            right_contexts[key] = max(left_contexts[key],context_score)
            #
            #    #print >> sys.stderr, left_contexts
            #    #print >> sys.stderr, right_contexts
            #
            #    options = [ ]
            #
            #    for (left_name, left_pos), left_score in left_contexts.items():
            #        for (right_name, right_pos), right_score in right_contexts.items():
            #            if left_name != right_name: continue
            #            if right_pos < left_pos: continue
            #
            #            if right_pos-left_pos > (j-i) * 4.0 + 10: continue   #!!!!!!!!!!!!!!!!!!!!!!1
            #            if right_pos-left_pos < (j-i) * 0.25 - 10: continue
            #
            #            score = float(min(right_pos-left_pos,j-i))/max(right_pos-left_pos,j-i)
            #            score *= left_score + right_score
            #            #print >> sys.stderr, left_name, right_pos-left_pos, j-i, score
            #            options.append( (score, left_name, left_pos, right_pos) )
            #
            #    if options:
            #        best = max(options, key=lambda option: option[0])
            #        print >> sys.stderr, '->', best
            #        result.append( dir_contigs[best[1]][best[2]:best[3]].lower() )
            #    else:
            #        print >> sys.stderr, '-> no good interpolation'
            #        result.append( ref_seq[i:j] )
            #
            #    i = j
            #
            #result = ''.join(result)
            #io.write_fasta(sys.stdout, ref_name, result)

            #print >> sys.stderr, len(result), result.count('N')
            #for pos, size in N_runs:
            #    out_size = len(''.join( strings[pos*2:pos*2+2] ))
            #    print >> sys.stderr, pos, size, '->', out_size

    out_f.close()

    grace.status('')

    #for name, seq in io.read_sequences(ref_filename):
    #    result = pastiche(seq, contigs_filename)
    #    io.write_fasta(sys.stdout, name, result)

    leftover_f = workspace.open('leftovers.fa', 'wb')

    for name in sorted(contigs):
        used = [
            (a or b)
            for a, b in zip(dir_contigs_used[name +
                                             '+'], dir_contigs_used[name +
                                                                    '-'][::-1])
        ]

        i = 0
        while i < len(used):
            j = i
            while j < len(used) and not used[j]:
                j += 1
            if j - i > min_leftover:
                if i == 0 and j == len(used):
                    out_name = name
                else:
                    out_name = name + ':%d..%d' % (i + 1, j)
                io.write_fasta(leftover_f, out_name, contigs[name][i:j])

            i = j + 1

    leftover_f.close()

    for suffix in ['.fa', '.delta']:
        os.unlink(temp_prefix + suffix)
Exemple #20
0
def fill_scaffolds(args):
    max_filler_length, args = grace.get_option_value(args, '--max-filler', int, 4000)
    
    if len(args) < 2:
        print USAGE
        return 1
    
    (output_dir, graph_dir), args = args[:2], args[2:]

    scaffolds = [ ]
    
    def scaffold(args):
        circular, args = grace.get_option_value(args, '--circular', grace.as_bool, False)
        
        scaffold = [ ]
        for item in args:
            scaffold.append( ('contig', int(item)) )
            scaffold.append( ('gap', None) )
        
        if not circular: scaffold = scaffold[:-1]
        
        name = 'custom_scaffold_%d' % (len(scaffolds)+1)
        scaffolds.append( (name, scaffold) )
            
    grace.execute(args, [scaffold])
    
    custom_scaffolds = (len(scaffolds) != 0)    
    
    sequences = dict( 
        (a.split()[0], b.upper()) 
          for a,b in 
            io.read_sequences(os.path.join(
              graph_dir, '454AllContigs.fna')))
    
    sequence_names = sorted(sequences)
    sequence_ids = dict(zip(sequence_names, xrange(1,len(sequence_names)+1)))
    
    contexts = { }
    context_names = { }
    context_depths = { }
    for i in xrange(1,len(sequence_names)+1):
        seq = sequences[sequence_names[i-1]]
        contexts[ i ] = seq
        context_names[ i ] = sequence_names[i-1]+'-fwd'
        contexts[ -i ] = bio.reverse_complement(seq)
        context_names[ -i ] = sequence_names[i-1]+'-rev'
    
    links = collections.defaultdict(list)
    
    for line in open(
      os.path.join(graph_dir, '454ContigGraph.txt'),
      'rU'):
        parts = line.rstrip('\n').split('\t')
        
        if parts[0].isdigit():
            seq = sequence_ids[parts[1]]
            context_depths[ seq] = float(parts[3])
            context_depths[-seq] = float(parts[3])
        
        if parts[0] == 'C':    
            name1 = 'contig%05d' % int(parts[1])
            dir1 = {"3'" : 1, "5'" : -1 }[parts[2]]
            name2 = 'contig%05d' % int(parts[3])
            dir2 = {"5'" : 1, "3'" : -1 }[parts[4]]
            depth = int(parts[5])
            #print name1, dir1, name2, dir2, depth
            
            links[ sequence_ids[name1] * dir1 ].append( (depth, sequence_ids[name2] * dir2) )
            links[ sequence_ids[name2] * -dir2 ].append( (depth, sequence_ids[name1] * -dir1) )
    
        if parts[0] == 'S' and not custom_scaffolds:  
            name = 'scaffold%05d' % int(parts[2])  
            components = parts[3].split(';')
            scaffold = [ ]
            for component in components:
                a,b = component.split(':')
                if a == 'gap':
                    scaffold.append( ('gap',int(b)) )
                else:
                    strand = { '+': +1, '-': -1 }[ b ]
                    scaffold.append( ('contig', sequence_ids['contig%05d'%int(a)] * strand) )
            scaffolds.append( (name, scaffold) )
    
    
    
    #paths = { }
    #
    #todo = [ ]
    #for i in contexts:
    #    for depth_left, neg_left in links[-i]:
    #        left = -neg_left
    #        for depth_right, right in links[i]:
    #            todo.append( ( max(-depth_left,-depth_right,-context_depths[i]), left, right, (i,)) )
    #
    #heapq.heapify(todo)
    #while todo:
    #    score, source, dest, path = heapq.heappop(todo)
    #    if (source,dest) in paths: continue
    #    
    #    paths[(source,dest)] = path
    #    
    #    if len(contexts[dest]) > max_filler_length: continue
    #    
    #    for depth, next in links[dest]:
    #        heapq.heappush(todo,
    #            ( max(score,-depth,-context_depths[dest]), source, next, path+(dest,))
    #        )
    
    
    path_source_dest = collections.defaultdict(dict) # source -> dest -> next
    path_dest_source = collections.defaultdict(dict) # dest -> source -> next
    
    
    # Use links, in order to depth of coverage, to construct paths between contigs
    # Thus: paths have maximum minimum depth
    #       subsections of paths also have this property
    
    todo = [ ]
    for i in contexts:    
        for depth_link, right in links[i]:
            todo.append( ( depth_link, i, right) )
    todo.sort(reverse=True)
    for score, left, right in todo:
        if right in path_source_dest[left]: continue
        
        sources = [(left,right)]
        if len(contexts[left]) <= max_filler_length:
            sources += path_dest_source[left].items()
        destinations = [right]
        if len(contexts[right]) <= max_filler_length:
            destinations += path_source_dest[right].keys()
        
        for source, next in sources:
            for dest in destinations:
                if dest in path_source_dest[source]: continue
                path_source_dest[source][dest] = next
                path_dest_source[dest][source] = next
    
    
    workspace = io.Workspace(output_dir)
    scaffold_f = workspace.open('scaffolds.fa','wb')
    
    #comments = [ ]
    features = [ ]
    
    used = set()
    previous_total = 0
    
    for i, (name, scaffold) in enumerate(scaffolds):
        result = '' # Inefficient. Meh.
        n_filled = 0
        n_failed = 0
        for j, item in enumerate(scaffold):
            if item[0] == 'contig':
                result += contexts[item[1]]
                used.add(abs(item[1]))
            else:
                left = scaffold[j-1]
                right = scaffold[ (j+1) % len(scaffold) ] #If gap at end, assume circular
                assert left[0] == 'contig'
                assert right[0] == 'contig'
                
                gap_start = len(result)
    
                can_fill = right[1] in path_source_dest[left[1]]
                if can_fill:
                    n = 0
                    k = path_source_dest[left[1]][right[1]]
                    while k != right[1]:
                        n += len(contexts[k])
                        result += contexts[k].lower()
                        used.add(abs(k))
                        
                        k = path_source_dest[k][right[1]]
                    
                    n_filled += 1
                        
                    if item[1] is not None and max(n,item[1]) > min(n,item[1])*4:
                        print >> sys.stderr, 'Warning: gap size changed from %d to %d in scaffold %d' % (item[1],n,i+1)
                else:
                    n_failed += 1
                    
                    #print >> sys.stderr, 'Warning: No path to fill a gap in scaffold %d' % (i+1)
                    result += 'n' * (9 if item[1] is None else item[1])
    
                gap_end = len(result)
                
                #features.append( '%s\t%s\t%s\t%d\t%d\t%s\t%s\t%s\t%s' % (
                #    'all-scaffolds',
                #    'fill-scaffolds',
                #    'gap',
                #    previous_total + gap_start+1,
                #    previous_total + max(gap_end, gap_start+1), #Allow for zeroed out gaps. Hmm.
                #    '.', #score
                #    '+', #strand
                #    '.', #frame
                #    '' #properties
                #))
                features.append( '%s\t%s\t%s\t%d\t%d\t%s\t%s\t%s\t%s' % (
                    name,
                    'fill-scaffolds',
                    'gap',
                    gap_start+1,
                    max(gap_end, gap_start+1), #Allow for zeroed out gaps. Hmm.
                    '.', #score
                    '+', #strand
                    '.', #frame
                    '' #properties
                ))
                    
    
        io.write_fasta(scaffold_f, name, result)
        previous_total += len(result)
        #comments.append('##sequence-region    %s %d %d' % (name, 1, len(result)))
        print >> sys.stderr, 'Scaffold%05d: %d gaps filled, %d could not be filled' % (i+1, n_filled, n_failed)
    
    scaffold_f.close()
    
    gff_f = workspace.open('scaffolds.gff', 'wb')
    #print >>gff_f, '##gff-version    3'
    #for comment in comments:
    #    print >>gff_f, comment
    for feature in features:
        print >>gff_f, feature
    gff_f.close()
    
    
    leftovers_f = workspace.open('leftovers.fa', 'wb')
    for name in sequence_names:
        if sequence_ids[name] not in used:
            io.write_fasta(leftovers_f, name, sequences[name])
    leftovers_f.close()
    
    ends = { }
    for i, (name, scaffold) in enumerate(scaffolds):
        if scaffold[-1][0] == 'gap': continue
        ends[ '%s start' % name ] = scaffold[-1][1]
        ends[ '%s end  ' % name ] = -scaffold[0][1] 
    
    for end1 in sorted(ends):
        options = [ end2 for end2 in ends if -ends[end2] in path_source_dest[ends[end1]] ]
        if len(options) == 1:
            print >> sys.stderr, 'Note: from', end1, 'only', options[0], 'is reachable'
Exemple #21
0
 def run(self):
     for filename in self.filenames:
         for name, seq in io.read_sequences(filename):
             if self.only and name.split()[0] not in self.only: continue
             io.write_fasta(sys.stdout, name, seq)
Exemple #22
0
    def run(self):
        #mincov, args = grace.get_option_value(args, '--mincov', int, 1) 
        #maxdiff, args = grace.get_option_value(args, '--maxdiff', int, 16) 
        #minsize, args = grace.get_option_value(args, '--minsize', int, 200)
        #what, args = grace.get_option_value(args, '--what', as_core_or_unique, 'core')    
        #is_core = (what == 'core') 
        #
        #grace.expect_no_further_options(args)
        #
        #if len(args) < 2:
        #    print >> sys.stderr, HELP
        #    raise grace.Help_shown()
        #
        #output_dir, working_dirs = args[0], args[1:]
        #
        ##assert not path.exists(path.join(output_dir, 'reference.fa')), \
        #assert not path.exists(path.join(output_dir, 'parameters')), \
        #        'Output directory not given'
        #
        #if not path.exists(output_dir):
        #    os.mkdir(output_dir)

        assert self.what in ('core','unique'), 'Expected --what to be either "core" or "unique".'
        is_core = (self.what == 'core') 
        
        workspace = self.get_workspace()
        
        for name, seq in io.read_sequences(working_directory.Working(self.working_dirs[0]).get_reference().reference_fasta_filename()):
            self.log.log(name + '\n')
            friendly_name = grace.filesystem_friendly_name(name)
            
            good = [ True ] * len(seq)
            
            for working_dir in self.working_dirs:
                if is_core:
                   suffix = '-depth.userplot'
                else:
                   suffix = '-ambiguous-depth.userplot'
                data = trivia.read_unstranded_userplot(
                    os.path.join(working_dir, friendly_name+suffix)
                )
                assert len(seq) == len(data)
                for i in xrange(len(seq)):
                   if good[i]:
                       if is_core:
                           good[i] = data[i] >= self.mincov
                       else:
                           good[i] = data[i] < self.mincov
    
            #Close holes
            start = -self.maxdiff-1
            n_holes = 0
            for i in xrange(len(seq)):
                if good[i]:
                     if 0 < i-start <= self.maxdiff:
                         for j in xrange(start,i): good[j] = True
                         n_holes += 1
                     start = i+1
            self.log.log('Closed '+grace.pretty_number(n_holes)+' holes\n')
            
            
            f = open( workspace/('%s-%s.fa' % (friendly_name,self.what)), 'wb')
            io.write_fasta(f, name,
                ''.join([ (seq[i] if good[i] else 'N')
                          for i in xrange(len(seq)) ])
            )
            f.close()
    
            f = open( workspace/('%s-%s_masked.fa' % (friendly_name,self.what)), 'wb')
            io.write_fasta(f, name,
                ''.join([ (seq[i] if good[i] else seq[i].lower())
                          for i in xrange(len(seq)) ])
            )
            f.close()
    
            f_good = open( workspace/('%s-%s_parts.fa' % (friendly_name,self.what)), 'wb')
            f_nongood = open( workspace/('%s-non%s_parts.fa' % (friendly_name,self.what)), 'wb')
            start = 0
            n_good = [0]
            n_good_bases = [0]    
            def emit(i):
                if i-start < self.minsize: return
                if good[start]:
                    n_good[0] += 1
                    n_good_bases[0] += i-start
                io.write_fasta(
                    f_good if good[start] else f_nongood,
                    '%s:%d..%d' % (name, start+1,i),
                    seq[start:i]
                )
            for i in xrange(1,len(seq)):
                if good[i] != good[start]:
                    emit(i)
                    start = i
            emit(len(seq))
            f_nongood.close()
            f_good.close()
            
            self.log.log(grace.pretty_number(sum(good))+' bases are '+self.what+', of '+grace.pretty_number(len(seq))+' in reference sequence\n')
            self.log.log(grace.pretty_number(n_good[0])+' parts at least '+grace.pretty_number(self.minsize)+' bases long with '+grace.pretty_number(n_good_bases[0])+' total bases\n')
            self.log.log('\n')
Exemple #23
0
    def run(self):
        workspace = self.get_workspace()
        
        read_length = 100
        left = rand_seq(read_length-1)
        while True:
            flank = rand_seq(1)
            if flank != self.ref[:1]: break
        left += flank
        
        right = rand_seq(read_length-1)
        while True:
            flank = rand_seq(1)
            if flank != self.ref[-1:]: break
        right = flank+right
        
        i = 0
        
        variants_used = [ ]
        
        with open(workspace/'reads.fq','wb') as f:
            for i, variant in enumerate(self.variants):
                if 'x' in variant:
                    variant, count = variant.split('x')
                    count = int(count)
                else:
                    count = 10
                variants_used.append( (variant,count) )
                seq = left+variant+right
                for j in xrange(count):
                    pos = len(variant)+random.randrange(read_length-len(variant))
                    read = seq[pos:pos+read_length]
                    if random.randrange(2):
                        read = bio.reverse_complement(read)
                    i += 1
                    io.write_fastq(f,'read_%s_%d' % (variant,i),read,chr(64+30)*len(read))

        reference = left+self.ref+right
        primary_variant = left+variants_used[0][0]+right

        with open(workspace/'reference.fa','wb') as f:
            io.write_fasta(f,'chr1',reference)
        
        legion.remake_needed()
        
        self.analysis(
            workspace/'sample',
            workspace/'reference.fa',
            reads = [ workspace/'reads.fq' ],
            ).run()
        
        self.freebayes(
            workspace/'freebayes',
            workspace/'sample',
            ).run()
        
        self.vcf_filter(
            workspace/'filtered',
            workspace/'freebayes.vcf',
            ).run()
        
        Vcf_patch(
            workspace/'patch',
            workspace/('sample','reference'),
            workspace/'filtered.vcf'
            ).run()
        
        patched = io.read_sequences(workspace/('patch','sample.fa')).next()[1]
        
        masked = io.read_sequences(workspace/('sample','consensus_masked.fa')).next()[1].upper()
        
        with open(workspace/'freebayes.vcf','rU') as f:
            reader = vcf.Reader(f)
            raw_count = len(list(reader))
        
        with open(workspace/'filtered.vcf','rU') as f:
            reader =  vcf.Reader(f)
            filtered_count = len(list(vcf.Reader(open(workspace/'filtered.vcf','rU'))))
        
        with open(workspace/('sample','report.txt'),'rb') as f:
            nesoni_count = len(f.readlines()) - 1

        self.log.log('\n')
        self.log.datum(workspace.name,'changes found by "nesoni consensus:"', nesoni_count)
        self.log.datum(workspace.name,'is correctly patched by "nesoni consensus:"', masked == primary_variant)
        self.log.log('\n')
        self.log.datum(workspace.name,'raw variants', raw_count)
        self.log.datum(workspace.name,'variants after filtering', filtered_count)
        self.log.datum(workspace.name,'is correctly patched by VCF pipeline', patched == primary_variant)
        self.log.log('\n')
Exemple #24
0
def main(args):
    grace.require_shrimp_1()

    n_cpus = grace.how_many_cpus()
        
    solid, args = grace.get_flag(args, '--solid')
    verbose, args = grace.get_flag(args, '--verbose')

    threshold, args = grace.get_option_value(args, '--threshold', str, '68%')
    
    stride, args = grace.get_option_value(args, '--stride', int, 1)
    max_shrimps, args = grace.get_option_value(args, '--cpus', int, n_cpus)
    batch_size, args = grace.get_option_value(args, '--batch-size', int, 5000000)
        
    input_reference_filenames = [ ]
    reads_filenames = [ ]
    
    shrimp_options = [ '-h', threshold ]
    if threshold.endswith('%'):
        threshold = -float(threshold[:-1])/100.0
    else:
        threshold = int(threshold)
    
    output_dir = [ ]  #As list so can write to from function. Gah.
    
    def front_command(args):
        grace.expect_no_further_options(args)
        
        if len(args) < 1:
            return
        
        output_dir.append(args[0])        
        input_reference_filenames.extend(
            [ os.path.abspath(filename) for filename in args[1:] ])
    def reads_command(args):
        grace.expect_no_further_options(args)
        reads_filenames.extend([ [ os.path.abspath(filename) ] for filename in args])
    def pairs_command(args):
        grace.expect_no_further_options(args)
        assert len(args) == 2, 'Expected exactly two files in "pairs"'
        reads_filenames.append([ os.path.abspath(filename) for filename in args ])
    def shrimp_options_command(args):
        shrimp_options.extend(args)
    
    grace.execute(args, {
        'reads': reads_command,
        '--reads': reads_command,
        'pairs': pairs_command,
        'shrimp-options': shrimp_options_command,
        '--shrimp-options': shrimp_options_command,
    }, front_command)
    
    
    if not output_dir:
        print >> sys.stderr, USAGE % n_cpus
        return 1
    
    output_dir = output_dir[0]
    
    assert input_reference_filenames, 'No reference files given'
    assert reads_filenames, 'No read files given'
    
    for filename in itertools.chain(input_reference_filenames, *reads_filenames):
        assert os.path.exists(filename), '%s does not exist' % filename

    if not os.path.isdir(output_dir):
        os.mkdir(output_dir)
    
    if solid:
        shrimp = 'rmapper-cs'
    else:
        shrimp = 'rmapper-ls'
    
    
    reference_filename = os.path.join(output_dir,'reference.fa')
    reference_file = open(reference_filename,'wb')
    total_reference_sequences = 0
    total_reference_bases = 0
    for input_reference_filename in input_reference_filenames:
        for name, sequence in io.read_sequences(input_reference_filename):
            #Don't retain any comment
            name = name.split()[0]
            io.write_fasta(reference_file, name, sequence)
            
            total_reference_sequences += 1
            total_reference_bases += len(sequence)
            
    reference_file.close()
    
    print '%s base%s in %s reference sequence%s' % (
        grace.pretty_number(total_reference_bases), 's' if total_reference_bases != 1 else '',
        grace.pretty_number(total_reference_sequences), 's' if total_reference_sequences != 1 else '')
    
    assert total_reference_bases, 'Reference sequence file is empty' 
    
    config = {
        'references' : input_reference_filenames,
        'reads' : reads_filenames,
        'stride' : stride,
        'solid': solid,
        'threshold': threshold,
    }
    config_file = open(os.path.join(output_dir, 'config.txt'), 'wb')
    pprint.pprint(config, config_file)
    config_file.close()
    
    output_filename = os.path.join(output_dir, 'shrimp_hits.txt.gz')
    output_file = gzip.open(output_filename, 'wb')
    
    unmapped_filename = os.path.join(output_dir, 'unmapped.fa.gz')
    unmapped_file = gzip.open(unmapped_filename, 'wb')
    
    dirty_filenames = set()
    dirty_filenames.add(output_filename)
    dirty_filenames.add(unmapped_filename)
    
    #warn_low_threshold = True
    
    try: #Cleanup temporary files
        
        N = [0]
        def do_shrimp(read_set):
            my_number = N[0]
            N[0] += 1
            
            tempname = os.path.join(output_dir,'temp%d-%d.fa' % (os.getpid(),my_number))
            tempname_out = os.path.join(output_dir,'temp%d-%d.txt' % (os.getpid(),my_number))
            
            dirty_filenames.add(tempname)
            dirty_filenames.add(tempname_out)
            
            f = open(tempname,'wb')
            for read_name, read_seq in read_set:
                print >> f, '>' + read_name
                print >> f, read_seq
            f.close()
        
            command = shrimp + ' ' + ' '.join(shrimp_options) + ' ' + \
                      tempname + ' ' + reference_filename + ' >' + tempname_out
            if not verbose:
                command += ' 2>/dev/null'
            #f = os.popen(command, 'r')
            child_pid = os.spawnl(os.P_NOWAIT,'/bin/sh','/bin/sh','-c',command)
            #print 'SHRiMP %d running' % my_number
            
            def finalize():
                exit_status = os.waitpid(child_pid, 0)[1]
                assert exit_status == 0, 'Shrimp indicated an error'
                
                hits = { } # read_name -> [ hit line ]
                
                f = open(tempname_out,'rb')
                for line in f:
                    if line.startswith('>'):
                        read_name = line.split(None,1)[0][1:]
                        if read_name not in hits:
                            hits[read_name] = [ ]
                        hits[read_name].append(line)
                f.close()
                                
                for read_name, read_seq in read_set:
                    if read_name in hits:
                        for hit in hits[read_name]:
                            output_file.write(hit)
                    else:
                        print >> unmapped_file, '>' + read_name
                        print >> unmapped_file, read_seq

                output_file.flush()
                unmapped_file.flush()
        
                os.unlink(tempname)
                dirty_filenames.remove(tempname)
                os.unlink(tempname_out)
                dirty_filenames.remove(tempname_out)
                #print 'SHRiMP %d finished' % my_number
            return finalize
        
        
        shrimps = [ ]
        
        reader = iter_reads(config)
        read_count = 0
        
        while True:
            read_set = [ ]
            read_set_bases = 0

            #Read name should not include comment cruft
            # - SHRIMP passes this through
            # - might stuff up identification of pairs
            
            for read_name, read_seq in reader:
                read_name = read_name.split()[0]                
                read_set.append((read_name, read_seq))
                read_set_bases += len(read_seq)
                
                #if warn_low_threshold and len(read_seq)*7 < threshold: #Require 70% exact match
                #    sys.stderr.write('\n*** WARNING: Short reads, consider reducing --threshold ***\n\n')                    
                #    warn_low_threshold = False
            
                read_count += 1
                if read_set_bases >= batch_size: break
                
            if not read_set: break
        
            if len(shrimps) >= max_shrimps:
                shrimps.pop(0)()
            shrimps.append( do_shrimp(read_set) )
            
            grace.status('SHRiMPing %s' % grace.pretty_number(read_count))
        
        while shrimps:
            grace.status('Waiting for SHRiMPs to finish %d ' % len(shrimps) )
            shrimps.pop(0)()
        
        grace.status('')
        
        output_file.close()
        dirty_filenames.remove(output_filename)
        unmapped_file.close()
        dirty_filenames.remove(unmapped_filename)
        
        return 0

    finally:
        for filename in dirty_filenames:
            if os.path.exists(filename):
                os.unlink(filename)
Exemple #25
0
def main(args):
    grace.require_shrimp_1()

    n_cpus = grace.how_many_cpus()

    solid, args = grace.get_flag(args, '--solid')
    verbose, args = grace.get_flag(args, '--verbose')

    threshold, args = grace.get_option_value(args, '--threshold', str, '68%')

    stride, args = grace.get_option_value(args, '--stride', int, 1)
    max_shrimps, args = grace.get_option_value(args, '--cpus', int, n_cpus)
    batch_size, args = grace.get_option_value(args, '--batch-size', int,
                                              5000000)

    input_reference_filenames = []
    reads_filenames = []

    shrimp_options = ['-h', threshold]
    if threshold.endswith('%'):
        threshold = -float(threshold[:-1]) / 100.0
    else:
        threshold = int(threshold)

    output_dir = []  #As list so can write to from function. Gah.

    def front_command(args):
        grace.expect_no_further_options(args)

        if len(args) < 1:
            return

        output_dir.append(args[0])
        input_reference_filenames.extend(
            [os.path.abspath(filename) for filename in args[1:]])

    def reads_command(args):
        grace.expect_no_further_options(args)
        reads_filenames.extend([[os.path.abspath(filename)]
                                for filename in args])

    def pairs_command(args):
        grace.expect_no_further_options(args)
        assert len(args) == 2, 'Expected exactly two files in "pairs"'
        reads_filenames.append(
            [os.path.abspath(filename) for filename in args])

    def shrimp_options_command(args):
        shrimp_options.extend(args)

    grace.execute(
        args, {
            'reads': reads_command,
            '--reads': reads_command,
            'pairs': pairs_command,
            'shrimp-options': shrimp_options_command,
            '--shrimp-options': shrimp_options_command,
        }, front_command)

    if not output_dir:
        print >> sys.stderr, USAGE % n_cpus
        return 1

    output_dir = output_dir[0]

    assert input_reference_filenames, 'No reference files given'
    assert reads_filenames, 'No read files given'

    for filename in itertools.chain(input_reference_filenames,
                                    *reads_filenames):
        assert os.path.exists(filename), '%s does not exist' % filename

    if not os.path.isdir(output_dir):
        os.mkdir(output_dir)

    if solid:
        shrimp = 'rmapper-cs'
    else:
        shrimp = 'rmapper-ls'

    reference_filename = os.path.join(output_dir, 'reference.fa')
    reference_file = open(reference_filename, 'wb')
    total_reference_sequences = 0
    total_reference_bases = 0
    for input_reference_filename in input_reference_filenames:
        for name, sequence in io.read_sequences(input_reference_filename):
            #Don't retain any comment
            name = name.split()[0]
            io.write_fasta(reference_file, name, sequence)

            total_reference_sequences += 1
            total_reference_bases += len(sequence)

    reference_file.close()

    print '%s base%s in %s reference sequence%s' % (
        grace.pretty_number(total_reference_bases),
        's' if total_reference_bases != 1 else '',
        grace.pretty_number(total_reference_sequences),
        's' if total_reference_sequences != 1 else '')

    assert total_reference_bases, 'Reference sequence file is empty'

    config = {
        'references': input_reference_filenames,
        'reads': reads_filenames,
        'stride': stride,
        'solid': solid,
        'threshold': threshold,
    }
    config_file = open(os.path.join(output_dir, 'config.txt'), 'wb')
    pprint.pprint(config, config_file)
    config_file.close()

    output_filename = os.path.join(output_dir, 'shrimp_hits.txt.gz')
    output_file = gzip.open(output_filename, 'wb')

    unmapped_filename = os.path.join(output_dir, 'unmapped.fa.gz')
    unmapped_file = gzip.open(unmapped_filename, 'wb')

    dirty_filenames = set()
    dirty_filenames.add(output_filename)
    dirty_filenames.add(unmapped_filename)

    #warn_low_threshold = True

    try:  #Cleanup temporary files

        N = [0]

        def do_shrimp(read_set):
            my_number = N[0]
            N[0] += 1

            tempname = os.path.join(output_dir,
                                    'temp%d-%d.fa' % (os.getpid(), my_number))
            tempname_out = os.path.join(
                output_dir, 'temp%d-%d.txt' % (os.getpid(), my_number))

            dirty_filenames.add(tempname)
            dirty_filenames.add(tempname_out)

            f = open(tempname, 'wb')
            for read_name, read_seq in read_set:
                print >> f, '>' + read_name
                print >> f, read_seq
            f.close()

            command = shrimp + ' ' + ' '.join(shrimp_options) + ' ' + \
                      tempname + ' ' + reference_filename + ' >' + tempname_out
            if not verbose:
                command += ' 2>/dev/null'
            #f = os.popen(command, 'r')
            child_pid = os.spawnl(os.P_NOWAIT, '/bin/sh', '/bin/sh', '-c',
                                  command)

            #print 'SHRiMP %d running' % my_number

            def finalize():
                exit_status = os.waitpid(child_pid, 0)[1]
                assert exit_status == 0, 'Shrimp indicated an error'

                hits = {}  # read_name -> [ hit line ]

                f = open(tempname_out, 'rb')
                for line in f:
                    if line.startswith('>'):
                        read_name = line.split(None, 1)[0][1:]
                        if read_name not in hits:
                            hits[read_name] = []
                        hits[read_name].append(line)
                f.close()

                for read_name, read_seq in read_set:
                    if read_name in hits:
                        for hit in hits[read_name]:
                            output_file.write(hit)
                    else:
                        print >> unmapped_file, '>' + read_name
                        print >> unmapped_file, read_seq

                output_file.flush()
                unmapped_file.flush()

                os.unlink(tempname)
                dirty_filenames.remove(tempname)
                os.unlink(tempname_out)
                dirty_filenames.remove(tempname_out)
                #print 'SHRiMP %d finished' % my_number

            return finalize

        shrimps = []

        reader = iter_reads(config)
        read_count = 0

        while True:
            read_set = []
            read_set_bases = 0

            #Read name should not include comment cruft
            # - SHRIMP passes this through
            # - might stuff up identification of pairs

            for read_name, read_seq in reader:
                read_name = read_name.split()[0]
                read_set.append((read_name, read_seq))
                read_set_bases += len(read_seq)

                #if warn_low_threshold and len(read_seq)*7 < threshold: #Require 70% exact match
                #    sys.stderr.write('\n*** WARNING: Short reads, consider reducing --threshold ***\n\n')
                #    warn_low_threshold = False

                read_count += 1
                if read_set_bases >= batch_size: break

            if not read_set: break

            if len(shrimps) >= max_shrimps:
                shrimps.pop(0)()
            shrimps.append(do_shrimp(read_set))

            grace.status('SHRiMPing %s' % grace.pretty_number(read_count))

        while shrimps:
            grace.status('Waiting for SHRiMPs to finish %d ' % len(shrimps))
            shrimps.pop(0)()

        grace.status('')

        output_file.close()
        dirty_filenames.remove(output_filename)
        unmapped_file.close()
        dirty_filenames.remove(unmapped_filename)

        return 0

    finally:
        for filename in dirty_filenames:
            if os.path.exists(filename):
                os.unlink(filename)
Exemple #26
0
# python2.6 modify_sequence.py data/velvet_test_reference.fa >data/velvet_test_reference_modified.fa

import sys, random

from nesoni import io

for name, seq in io.read_sequences(sys.argv[1]):
    j = 0
    for i in xrange(0, len(seq) - 100, 100):
        original = seq[i]
        if j % 3 == 0:
            while True:
                new = random.choice('ACGT')
                if new != original: break
            seq = seq[:i] + new + seq[i + 1:]
        elif j % 3 == 1:
            n = (j // 3) % 9 + 1
            seq = seq[:i] + seq[i + n:]
        else:
            n = (j // 3) % 9 + 1
            seq = seq[:i] + ''.join(random.choice('ACGT')
                                    for k in xrange(n)) + seq[i:]
        j += 1
    io.write_fasta(sys.stdout, name, seq)
Exemple #27
0
 def run(self):
     for filename in self.filenames:
         for name, seq in io.read_sequences(filename):
             if self.only and name.split()[0] not in self.only: continue
             io.write_fasta(sys.stdout, name, seq) 
Exemple #28
0
    def run(self):
        workspace = self.get_workspace()

        read_length = 100
        left = rand_seq(read_length - 1)
        while True:
            flank = rand_seq(1)
            if flank != self.ref[:1]: break
        left += flank

        right = rand_seq(read_length - 1)
        while True:
            flank = rand_seq(1)
            if flank != self.ref[-1:]: break
        right = flank + right

        i = 0

        variants_used = []

        with open(workspace / 'reads.fq', 'wb') as f:
            for i, variant in enumerate(self.variants):
                if 'x' in variant:
                    variant, count = variant.split('x')
                    count = int(count)
                else:
                    count = 10
                variants_used.append((variant, count))
                seq = left + variant + right
                for j in xrange(count):
                    pos = len(variant) + random.randrange(read_length -
                                                          len(variant))
                    read = seq[pos:pos + read_length]
                    if random.randrange(2):
                        read = bio.reverse_complement(read)
                    i += 1
                    io.write_fastq(f, 'read_%s_%d' % (variant, i), read,
                                   chr(64 + 30) * len(read))

        reference = left + self.ref + right
        primary_variant = left + variants_used[0][0] + right

        with open(workspace / 'reference.fa', 'wb') as f:
            io.write_fasta(f, 'chr1', reference)

        legion.remake_needed()

        self.analysis(
            workspace / 'sample',
            workspace / 'reference.fa',
            reads=[workspace / 'reads.fq'],
        ).run()

        self.freebayes(
            workspace / 'freebayes',
            workspace / 'sample',
        ).run()

        self.vcf_filter(
            workspace / 'filtered',
            workspace / 'freebayes.vcf',
        ).run()

        Vcf_patch(workspace / 'patch', workspace / ('sample', 'reference'),
                  workspace / 'filtered.vcf').run()

        patched = io.read_sequences(workspace /
                                    ('patch', 'sample.fa')).next()[1]

        masked = io.read_sequences(
            workspace / ('sample', 'consensus_masked.fa')).next()[1].upper()

        with open(workspace / 'freebayes.vcf', 'rU') as f:
            reader = vcf.Reader(f)
            raw_count = len(list(reader))

        with open(workspace / 'filtered.vcf', 'rU') as f:
            reader = vcf.Reader(f)
            filtered_count = len(
                list(vcf.Reader(open(workspace / 'filtered.vcf', 'rU'))))

        with open(workspace / ('sample', 'report.txt'), 'rb') as f:
            nesoni_count = len(f.readlines()) - 1

        self.log.log('\n')
        self.log.datum(workspace.name, 'changes found by "nesoni consensus:"',
                       nesoni_count)
        self.log.datum(workspace.name,
                       'is correctly patched by "nesoni consensus:"',
                       masked == primary_variant)
        self.log.log('\n')
        self.log.datum(workspace.name, 'raw variants', raw_count)
        self.log.datum(workspace.name, 'variants after filtering',
                       filtered_count)
        self.log.datum(workspace.name, 'is correctly patched by VCF pipeline',
                       patched == primary_variant)
        self.log.log('\n')
Exemple #29
0
def main(args):
    mincov, args = grace.get_option_value(args, '--mincov', int, 1) 
    maxdiff, args = grace.get_option_value(args, '--maxdiff', int, 16) 
    minsize, args = grace.get_option_value(args, '--minsize', int, 200)
    what, args = grace.get_option_value(args, '--what', as_core_or_unique, 'core')    
    is_core = (what == 'core') 

    grace.expect_no_further_options(args)
    
    if len(args) < 2:
        print >> sys.stderr, HELP
        raise grace.Help_shown()
    
    output_dir, working_dirs = args[0], args[1:]
    
    assert not path.exists(path.join(output_dir, 'reference.fa')), \
        'Output directory not given'
    
    if not path.exists(output_dir):
        os.mkdir(output_dir)
    
    for name, seq in io.read_sequences(path.join(working_dirs[0],'reference.fa')):
        print name
        friendly_name = grace.filesystem_friendly_name(name)
        
        good = [ True ] * len(seq)
        
        for working_dir in working_dirs:
            if is_core:
               suffix = '-depth.userplot'
            else:
               suffix = '-ambiguous-depth.userplot'
            data = trivia.read_unstranded_userplot(
                os.path.join(working_dir, friendly_name+suffix)
            )
            assert len(seq) == len(data)
            for i in xrange(len(seq)):
               if good[i]:
                   if is_core:
                       good[i] = data[i] >= mincov
                   else:
                       good[i] = data[i] < mincov

        #Close holes
        start = -maxdiff-1
        n_holes = 0
        for i in xrange(len(seq)):
            if good[i]:
                 if 0 < i-start <= maxdiff:
                     for j in xrange(start,i): good[j] = True
                     n_holes += 1
                 start = i+1
        print 'Closed', grace.pretty_number(n_holes), 'holes'
        
        
        f = open(path.join(output_dir, '%s-%s.fa' % (friendly_name,what)), 'wb')
        io.write_fasta(f, name,
            ''.join([ (seq[i] if good[i] else 'N')
                      for i in xrange(len(seq)) ])
        )
        f.close()

        f = open(path.join(output_dir, '%s-%s_masked.fa' % (friendly_name,what)), 'wb')
        io.write_fasta(f, name,
            ''.join([ (seq[i] if good[i] else seq[i].lower())
                      for i in xrange(len(seq)) ])
        )
        f.close()

        f_good = open(path.join(output_dir, '%s-%s_parts.fa' % (friendly_name,what)), 'wb')
        f_nongood = open(path.join(output_dir, '%s-non%s_parts.fa' % (friendly_name,what)), 'wb')
        start = 0
        n_good = [0]
        n_good_bases = [0]    
        def emit(i):
            if i-start < minsize: return
            if good[start]:
                n_good[0] += 1
                n_good_bases[0] += i-start
            io.write_fasta(
                f_good if good[start] else f_nongood,
                '%s:%d..%d' % (name, start+1,i),
                seq[start:i]
            )
        for i in xrange(1,len(seq)):
            if good[i] != good[start]:
                emit(i)
                start = i
        emit(len(seq))
        f_nongood.close()
        f_good.close()
        
        print grace.pretty_number(sum(good)), 'bases are '+what+', of', grace.pretty_number(len(seq)), 'in reference sequence'
        print grace.pretty_number(n_good[0]), 'parts at least', grace.pretty_number(minsize), 'bases long with', grace.pretty_number(n_good_bases[0]), 'total bases'

        print
Exemple #30
0
def pastiche(args):
    if len(args) < 4:
        print USAGE
        return 1

    mask_only, args = grace.get_option_value(args, '--mask', grace.as_bool, False)
    min_leftover, args = grace.get_option_value(args, '--min-leftover', int, 20)
        
    output_dir, args = args[0], args[1:]
    
    #, ref_filename, contig_filenames = args[0], args[1], args[2:]
    
    ref_filenames = [ ]
    contig_filenames = [ ]
    grace.execute(args, {
        'contigs' : lambda args: contig_filenames.extend(args)
    }, lambda args: ref_filenames.extend(args))
    
    assert ref_filenames, 'No reference sequences given'
    assert contig_filenames, 'No contig sequences given'
    
    contigs = dict([ 
                 (name.split()[0], seq) 
                 for filename in contig_filenames 
                 for name, seq in io.read_sequences(filename) 
              ])
    dir_contigs = { }
    for name in contigs:
        dir_contigs[name + '+'] = contigs[name]
        dir_contigs[name + '-'] = bio.reverse_complement(contigs[name])
    
    dir_contigs_used = { }
    for name in dir_contigs:
        dir_contigs_used[name] = [ False ] * len(dir_contigs[name])


    workspace = io.Workspace(output_dir)
    temp_prefix = workspace._object_filename('temp-pastiche')
    
    out_f = workspace.open('pastiche.fa', 'wb')
    
    for ref_filename in ref_filenames:
      for ref_name, ref_seq in io.read_sequences(ref_filename):
        ref_name = ref_name.split()[0]
        
        grace.status(ref_name)
        
        f = open(temp_prefix + '.fa','wb')
        io.write_fasta(f, 'ref', ref_seq)
        f.close()
    
        scores = [ -1 ] * (len(ref_seq)*2)
        strings = [ 'N', '' ] * (len(ref_seq))
        contexts = [ None for i in xrange(len(ref_seq)*2) ]
        
        #MAXSCORE = len(ref_seq)+1
        #for i in xrange(len(ref_seq)):
        #    if ref_seq[i].upper() != 'N':
        #        strings[i*2] = ref_seq[i]
        #        scores[i*2] = MAXSCORE
        #for i in xrange(len(ref_seq)-1):
        #    if ref_seq[i].upper() != 'N' and ref_seq[i+1].upper() != 'N':
        #        scores[i*2+1] = MAXSCORE

        if mask_only:        
            for i in xrange(len(ref_seq)):
                strings[i*2] = ref_seq[i].lower()
        
        
        def put(position, dir_contig_name, start, end, score):
            if scores[position] < score:
                scores[position] = score
                strings[position] = dir_contigs[dir_contig_name][start:end]
                contexts[position] = (dir_contig_name, start, end, score)

        for contig_filename in contig_filenames:
            execute(['nucmer',
                     '--prefix', temp_prefix,
                     #'--maxmatch', #Very slow
                     '--nosimplify',
                     '--minmatch', '9',
                     '--mincluster', '50',
                     #'--maxgap', '1000',
                     #'--breaklen', '1000', # Increasing this reduces Ns, but is slow
                     #'--diagfactor', '1.0',
                     temp_prefix+'.fa',
                     contig_filename])
            
            for contig_name, contig_seq in io.read_sequences(contig_filename):
                contig_name = contig_name.split()[0]
                grace.status(ref_name + ' vs ' + contig_name)
                p = run(['show-aligns', temp_prefix+'.delta', 'ref', contig_name],
                        stderr=subprocess.PIPE)
                
                alignments = [ ]
                
                while True:
                    line = p.stdout.readline()
                    if not line: break
                    if not line.startswith('-- BEGIN'):
                        continue
                    
                    parts = line.split()
                    
                    ref_start = int(parts[5])
                    ref_end = int(parts[7])
                    query_start = int(parts[10])
                    query_end = int(parts[12])
                    
                    #assert ref_start < ref_end
                    #ref_start -= 1 #Zero based coordinates
                    
                    al_ref = [ ]
                    al_query = [ ]
                    
                    while True:
                        block = [ ]
                        end = False
                        while True:
                            line = p.stdout.readline()
                            if line.startswith('--   END'): 
                                end = True
                                break
                            if line == '\n':
                                if block: 
                                    break
                                else:
                                    continue
                            block.append(line)
                        
                        if end: break
                        
                        al_ref.append(block[0].split()[1])
                        al_query.append(block[1].split()[1])
                        
                    al_ref = ''.join(al_ref)
                    al_query = ''.join(al_query)            
                    
                    if ref_start > ref_end:
                       al_ref = bio.reverse_complement(al_ref)
                       al_query = bio.reverse_complement(al_query)
                       ref_start, ref_end = ref_end, ref_start
                       query_start, query_end = query_end, query_start
                    
                    if query_start > query_end:
                       dir_contig_name = contig_name + '-'
                       query_start = len(contig_seq)+1-query_start
                       query_end = len(contig_seq)+1-query_end
                    else:
                       dir_contig_name = contig_name + '+'
                       
                    ref_start -= 1 #Zero based coordinates
                    query_start -= 1
                    
                    #print al_ref
                    #print al_query
                    
                    #Pretty dumb scoring scheme
                    al_score = 0
                    for i in xrange(len(al_ref)):
                        if al_ref[i] == al_query[i]:
                            al_score += 1
                        #else:
                        #    al_score -= 1
                    
                    #Pastiche alignment over reference
                    ref_pos = ref_start
                    query_pos = query_start
                    al_pos = 0
                    while al_pos < len(al_ref):
                        assert al_ref[al_pos] != '.'                
                        if al_query[al_pos] == '.':
                            put(ref_pos*2, dir_contig_name, query_pos, query_pos, al_score)
                        else:
                            assert al_query[al_pos].lower() == dir_contigs[dir_contig_name][query_pos].lower()
                            put(ref_pos*2, dir_contig_name, query_pos, query_pos+1, al_score)
                            query_pos += 1
                        al_pos += 1
                        
                        al_pos_end = al_pos
                        query_pos_end = query_pos
                        while al_pos_end < len(al_ref) and al_ref[al_pos_end] == '.':
                            al_pos_end += 1
                            query_pos_end += 1
                        #put(ref_pos*2+1, al_query[al_pos:al_pos_end], al_score)
                        assert al_query[al_pos:al_pos_end].lower() == dir_contigs[dir_contig_name][query_pos:query_pos_end].lower() 
                        put(ref_pos*2+1, dir_contig_name, query_pos,query_pos_end, al_score)
                        al_pos = al_pos_end
                        query_pos = query_pos_end
                        ref_pos += 1
                    
                    
                p.wait()
            
        grace.status(ref_name)
        
        result = ''.join(strings)    
        io.write_fasta(out_f, ref_name, result)
        
        
        for context in contexts:
            if context is None: continue
            name,start,end,score = context
            for i in xrange(start,end):
                dir_contigs_used[name][i] = True
        
        
        #Interpolation
        #result = [ ]
        #i = 0
        #while i < len(ref_seq):
        #    if strings[i*2].upper() != 'N':
        #        result.append(strings[i*2])
        #        result.append(strings[i*2+1])
        #        i += 1
        #        continue
        #    
        #    j = i
        #    while strings[j*2].upper() == 'N':
        #        j += 1
        #    
        #    grace.status('')
        #    print >> sys.stderr, 'interpolating', i+1,'..',j
        #    
        #    window = 20 #!!!!!!!!!!!
        #    left_contexts = collections.defaultdict(lambda:0)
        #    for i1 in xrange(max(0,i-window),i):
        #        for context_name, context_start, context_end, context_score in contexts[i1*2]:
        #            key = (context_name, context_end + i - i1)
        #            left_contexts[key] = max(left_contexts[key],context_score)
        #        
        #    right_contexts = collections.defaultdict(lambda:0)
        #    for j1 in xrange(j,min(j+window,len(ref_seq))):
        #        for context_name, context_start, context_end, context_score in contexts[j1*2]:
        #            key = (context_name, context_start + j - j1)
        #            right_contexts[key] = max(left_contexts[key],context_score)
        #    
        #    #print >> sys.stderr, left_contexts
        #    #print >> sys.stderr, right_contexts
        #    
        #    options = [ ]
        #    
        #    for (left_name, left_pos), left_score in left_contexts.items():
        #        for (right_name, right_pos), right_score in right_contexts.items():
        #            if left_name != right_name: continue
        #            if right_pos < left_pos: continue
        #            
        #            if right_pos-left_pos > (j-i) * 4.0 + 10: continue   #!!!!!!!!!!!!!!!!!!!!!!1
        #            if right_pos-left_pos < (j-i) * 0.25 - 10: continue
        #            
        #            score = float(min(right_pos-left_pos,j-i))/max(right_pos-left_pos,j-i)                  
        #            score *= left_score + right_score
        #            #print >> sys.stderr, left_name, right_pos-left_pos, j-i, score
        #            options.append( (score, left_name, left_pos, right_pos) )
        #    
        #    if options:
        #        best = max(options, key=lambda option: option[0])
        #        print >> sys.stderr, '->', best
        #        result.append( dir_contigs[best[1]][best[2]:best[3]].lower() )
        #    else:
        #        print >> sys.stderr, '-> no good interpolation'
        #        result.append( ref_seq[i:j] )
        #    
        #    i = j
        #
        #result = ''.join(result)    
        #io.write_fasta(sys.stdout, ref_name, result)
        
        
        #print >> sys.stderr, len(result), result.count('N')
        #for pos, size in N_runs:
        #    out_size = len(''.join( strings[pos*2:pos*2+2] ))
        #    print >> sys.stderr, pos, size, '->', out_size        
    
    out_f.close()
    
    grace.status('')
    
    #for name, seq in io.read_sequences(ref_filename):
    #    result = pastiche(seq, contigs_filename)
    #    io.write_fasta(sys.stdout, name, result)
    
    
    leftover_f = workspace.open('leftovers.fa','wb')

    for name in sorted(contigs):
        used = [ (a or b) for a,b in zip(dir_contigs_used[name+'+'],dir_contigs_used[name+'-'][::-1]) ]

        i = 0
        while i < len(used):
            j = i
            while j < len(used) and not used[j]: 
                j += 1
            if j-i > min_leftover:
                if i == 0 and j == len(used):
                    out_name = name
                else:
                    out_name = name + ':%d..%d' % (i+1,j)
                io.write_fasta(leftover_f, out_name, contigs[name][i:j])
            
            i = j+1        

    leftover_f.close()

    for suffix in ['.fa', '.delta']:
        os.unlink(temp_prefix + suffix)