Example #1
    def run(self):
        f = self.begin_output()

        for filename in self.filenames:
            any = False

            name = os.path.splitext(os.path.split(filename)[1])[0]

            try:
                iterator = io.read_sequences(filename, qualities=True)
            except grace.Error:
                iterator = None

            if iterator is not None:
                total = 0
                total_length = 0
                for seq in io.read_sequences(filename, qualities=True):
                    total += 1
                    total_length += len(seq[1])
                print >> f, grace.datum(name, 'sequences', total)
                print >> f, grace.datum(name, 'average length',
                                        float(total_length) / total)
                print >> f
                any = True

            try:
                iterator = annotation.read_annotations(filename)
            except grace.Error:
                iterator = None

            if iterator:
                total = 0
                counts = {}
                for item in iterator:
                    total += 1
                    counts[item.type] = counts.get(item.type, 0) + 1

                print >> f, grace.datum(name, 'features', total)
                for key in sorted(counts):
                    print >> f, grace.datum(name, key + ' features',
                                            counts[key])
                print >> f
                any = True

            if not any:
                raise grace.Error(
                    filename +
                    ' is neither a sequence file nor an annotation file that nesoni can read.'
                )

        self.end_output(f)
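
With qualities=True each record is handled as a tuple, which is why the length is taken from seq[1]. A self-contained sketch of the counting, with dummy tuples standing in for io.read_sequences output (the tuple shapes are an assumption based on how the snippets index them):

    records = [('read1', 'ACGT', 'IIII'), ('read2', 'ACGTAC', 'IIIIII')]
    total = 0
    total_length = 0
    for seq in records:
        total += 1
        total_length += len(seq[1])   # seq[1] is the bases, whatever the tuple arity
    print(total, float(total_length) / total)   # 2 records, average length 5.0
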
Example #2
def main(args):
    size, args = grace.get_option_value(args, '--size', int, 200)
    stride, args = grace.get_option_value(args, '--stride', int, 50)
    grace.expect_no_further_options(args)

    if not args:
        print USAGE
        return 1

    for filename in args:
        for name, seq in io.read_sequences(filename):
            name_parts = name.split(None, 1)
            name = name_parts[0]
            if len(name_parts) > 1:
                desc = ' ' + name_parts[1]
            else:
                desc = ''

            for i in xrange(-size + stride, len(seq), stride):
                start = max(0, min(len(seq), i))
                end = max(0, min(len(seq), i + size))
                io.write_fasta(sys.stdout,
                               '%s:%d..%d' % (name, start + 1, end) + desc,
                               seq[start:end])

    return 0
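
The range starts at -size + stride so the first windows are short and anchored at the sequence start, and the max/min clamping trims windows at both ends. A self-contained sketch with toy values in place of the 200/50 defaults:

    size, stride = 10, 4
    seq = 'ACGTACGTACGTACGTACGT'   # 20 bases of dummy sequence
    for i in range(-size + stride, len(seq), stride):
        start = max(0, min(len(seq), i))   # clamp to [0, len(seq)]
        end = max(0, min(len(seq), i + size))
        print('%d..%d %s' % (start + 1, end, seq[start:end]))
    # windows overlap by size - stride bases and shrink at both edges
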
Example #3
    def run(self):
        extractions = [ ]
        for item in self.genes.split(','):
            extraction = item.split('/')
            assert len(extraction) == 4
            extractions.append(extraction)
            
        rename = { }
        if self.rename:
            for item in self.rename.split(','):
                old,new = item.split('=')
                rename[old] = new

        work = self.get_workspace()        
        
        with workspace.tempspace() as temp:
            items = list(annotation.read_annotations(self.annotation))
            for item in items:
                item.seqid = rename.get(item.seqid, item.seqid)
            annotation.write_gff3(temp/'temp.gff', get_genes(items, extractions, self.log))
            del items
            
            with open(temp/'temp.fa','wb') as f:
                for name,seq in io.read_sequences(self.genome):
                    name = name.split()[0]
                    name = rename.get(name,name)
                    io.write_fasta(f, name, seq)
            
            reference_directory.Make_tt_reference(
                self.output_dir,
                filenames = [ temp/'temp.fa', temp/'temp.gff' ],
                index = self.index,
                ).run()
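
self.rename is parsed as comma-separated old=new pairs and then applied with dict.get, so unknown names pass through unchanged. A minimal sketch with hypothetical chromosome names:

    rename = {}
    for item in 'chr1=1,chrMT=MT'.split(','):
        old, new = item.split('=')
        rename[old] = new
    print(rename.get('chr1', 'chr1'), rename.get('chr9', 'chr9'))   # 1 chr9
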
Example #4
def make_file_for_primer_3 (gff_file, ref_file, names_file, output_file):
    # check for a tmp directory
    if len(glob.glob("./tmp")) == 0:
        call (["mkdir", "tmp"])

    gff_file = list(annotation.read_annotations(gff_file))
    print "\n Reading in the reference file"
    seq_dict = dict(io.read_sequences(ref_file))
    names_file = open(names_file).readlines()
    config  = open("primer_config.txt").readlines()

    with open("tmp/regions_" + output_file, 'w') as out_f:
        for name in names_file:
            sname = name.strip("\n")
            found = False
            for line in gff_file:
                gff_name = line.attr.get ("Name", "No_name")
                peak = line.attr.get ("id", "No_id")
                if sname in gff_name.split("/"):
                    out_f.write ("SEQUENCE_ID="+ gff_name.replace("/", "_") + "_" + peak + "\n")
                    out_f.write("SEQUENCE_TEMPLATE=" + line.shifted(-100, 0).get_seq(seq_dict) + "\n")
                    found = True
                    for cline in config:
                        out_f.write(cline.strip("\n")  + "\n")
                    out_f.write("="  + "\n")
            if not found:
                print "Could not find the gene " + sname + " in the gff file"
Example #5
def main(args):
    size, args = grace.get_option_value(args,'--size',int,200)
    stride, args = grace.get_option_value(args,'--stride',int,50)
    grace.expect_no_further_options(args)
    
    if not args:
        print USAGE
        return 1
    
    for filename in args:
        for name, seq in io.read_sequences(filename):
            name_parts = name.split(None, 1)
            name = name_parts[0]
            if len(name_parts) > 1:
               desc = ' ' + name_parts[1]
            else:
               desc = ''
            
            for i in xrange(-size+stride,len(seq),stride):
                start = max(0,min(len(seq),i))
                end = max(0,min(len(seq), i+size))
                io.write_fasta(
                    sys.stdout,
                    '%s:%d..%d' % (name,start+1,end) + desc,
                    seq[start:end]
                )
    
    return 0
Example #6
 def build_snpeff(self):
     jar = io.find_jar('snpEff.jar')
     
     with open(self/'snpeff.config','wb') as f:
         print >> f, 'data_dir = snpeff'
         print >> f, 'genomes : ' + self.name
         print >> f, self.name + '.genome : ' + self.name 
     
     snpwork = io.Workspace(self/'snpeff',must_exist=False)
     snpwork_genome = io.Workspace(snpwork/self.name,must_exist=False)
     snpwork_genomes = io.Workspace(snpwork/'genomes',must_exist=False)
     
     annotations = self.annotations_filename()
     assert annotations
     with open(snpwork_genome/'genes.gff','wb') as f:
         for record in annotation.read_annotations(annotations):
             if record.end <= record.start: continue
             if not record.attr:
                 record.attr['attributes'] = 'none'
             print >> f, record.as_gff()
     
     with open(snpwork_genomes/(self.name+'.fa'),'wb') as f:
         for name, seq in io.read_sequences(self.reference_fasta_filename()):
             io.write_fasta(f, name, seq)
             
     io.execute('java -jar JAR build NAME -gff3 -c CONFIG',
         JAR=jar, NAME=self.name, CONFIG=self/'snpeff.config')
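
The three print statements produce a minimal snpEff configuration; for a hypothetical genome name mygenome the generated snpeff.config would read:

    data_dir = snpeff
    genomes : mygenome
    mygenome.genome : mygenome
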
Example #7
def make_file_for_primer_3(gff_file, ref_file, names_file, output_file, start,
                           end):
    # check for a tmp directory
    if len(glob.glob("./tmp")) == 0:
        call(["mkdir", "tmp"])

    gff_file = list(annotation.read_annotations(gff_file))
    print "\nReading in the reference file\n"
    seq_dict = dict(io.read_sequences(ref_file))

    names_file = open(names_file).readlines()
    config = open("Software/primer_config.txt").readlines()

    with open("tmp/regions_" + output_file, 'w') as out_f:
        for name in names_file:
            sname = name.strip("\n ")
            found = False
            for line in gff_file:
                gff_name = line.attr.get("Name", "No_name")
                peak = line.attr.get("id", "No_id")
                if sname in gff_name.split("/"):
                    out_f.write("SEQUENCE_ID=" + gff_name.replace("/", "_") +
                                "_" + peak + "\n")
                    # Move the peaks 30 bases proximal
                    out_f.write("SEQUENCE_TEMPLATE=" +
                                line.shifted(start, end).get_seq(seq_dict) +
                                "\n")
                    found = True
                    for cline in config:
                        out_f.write(cline.strip("\n") + "\n")
                    out_f.write("=" + "\n")
            if not found:
                print "Could not find the gene " + sname + " in the reference gff file\n"
Example #8
    def run(self):
        extractions = [ ]
        for item in self.genes.split(','):
            extraction = item.split('/')
            assert len(extraction) == 4
            extractions.append(extraction)
            
        rename = { }
        if self.rename:
            for item in self.rename.split(','):
                old,new = item.split('=')
                rename[old] = new

        work = self.get_workspace()        
        
        with workspace.tempspace() as temp:
            items = list(annotation.read_annotations(self.annotation))
            for item in items:
                item.seqid = rename.get(item.seqid, item.seqid)
            annotation.write_gff3(temp/'temp.gff', get_genes(items, extractions, self.log))
            del items
            
            with open(temp/'temp.fa','wb') as f:
                for name,seq in io.read_sequences(self.genome):
                    name = name.split()[0]
                    name = rename.get(name,name)
                    io.write_fasta(f, name, seq)
            
            reference_directory.Make_tt_reference(
                self.output_dir,
                filenames = [ temp/'temp.fa', temp/'temp.gff' ] + self.extra,
                index = self.index, shrimp = self.shrimp, 
                bowtie = self.bowtie, star = self.star
                ).run()
Example #9
    def run(self):
        seqs = []
        seen = 0
        for filename in self.filenames:
            for seq in io.read_sequences(filename, qualities=True):
                seen += 1
                if seen % 100000 == 0:
                    grace.status('Scanned ' + grace.pretty_number(seen))

                if len(seqs) < self.n:
                    seqs.append(seq)
                # Reservoir sampling: replace a random slot with probability n/seen.
                elif random.random() * seen < self.n:
                    seqs[random.randrange(self.n)] = seq
        grace.status('')

        print >> sys.stderr, 'Sampled', grace.pretty_number(
            len(seqs)), 'of', grace.pretty_number(seen), 'sequences'

        if not seqs: return
        qualities = len(seqs[0]) == 3   # records are (name, seq, qual) or (name, seq)
        if qualities:
            for name, seq, qual in seqs:
                io.write_fastq(sys.stdout, name, seq, qual)
        else:
            for name, seq in seqs:
                io.write_fasta(sys.stdout, name, seq)
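
This is reservoir sampling: the first n records fill the reservoir, then each later record displaces a random slot with probability n/seen, leaving a uniform sample. The same idea in a self-contained sketch:

    import random

    def reservoir_sample(items, n):
        sample = []
        seen = 0
        for item in items:
            seen += 1
            if len(sample) < n:
                sample.append(item)
            elif random.random() * seen < n:
                # replace a random slot with probability n/seen
                sample[random.randrange(n)] = item
        return sample

    print(reservoir_sample(range(1000), 5))   # five items, each kept with equal probability
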
Example #10
    def run(self):
        f = self.begin_output()
    
        for filename in self.filenames:
            any = False
            
            name = os.path.splitext(os.path.split(filename)[1])[0]
            
            try:
               iterator = io.read_sequences(filename, qualities=True)
            except grace.Error:
               iterator = None
               
            if iterator is not None:
                total = 0
                total_length = 0
                for seq in io.read_sequences(filename, qualities=True):
                    total += 1
                    total_length += len(seq[1])
                print >> f, grace.datum(name, 'sequences', total)
                print >> f, grace.datum(name, 'average length', float(total_length)/total)
                print >> f
                any = True
            
            try:
                iterator = annotation.read_annotations(filename)
            except grace.Error:
                iterator = None
                
            if iterator:
                total = 0
                counts = { }
                for item in iterator:
                    total += 1
                    counts[item.type] = counts.get(item.type,0)+1
                                
                print >> f, grace.datum(name, 'features', total)
                for key in sorted(counts):
                    print >> f, grace.datum(name, key + ' features', counts[key])
                print >> f
                any = True
            
            if not any:
                raise grace.Error(filename + ' is neither a sequence file nor an annotation file that nesoni can read.')

        self.end_output(f)
Example #11
    def run(self):
        base = os.path.split(self.prefix)[1]
        
        annotations = [ ]
        sequences = [ ]
        
        for filename in self.filenames:
            any = False
            if io.is_sequence_file(filename):
                sequences.append(filename)
                any = True
            if annotation.is_annotation_file(filename):
                annotations.append(filename)
                any = True
            assert any, 'File is neither a recognized sequence file nor an annotation file'

        cytoband_filename = os.path.join(self.prefix,base+'_cytoband.txt')
        property_filename = os.path.join(self.prefix,'property.txt')
        gff_filename = os.path.join(self.prefix,base+'.gff')
        output_filenames = [ cytoband_filename, property_filename, gff_filename ] 

        if not os.path.exists(self.prefix):
            os.mkdir(self.prefix)
            
        f = open(property_filename,'wb')
        print >> f, 'ordered=true'
        print >> f, 'id=%s' % base
        print >> f, 'name=%s' % (self.name or base)
        print >> f, 'cytobandFile=%s_cytoband.txt' % base
        print >> f, 'geneFile=%s.gff' % base
        print >> f, 'sequenceLocation=%s' % base
        f.close()
        
        trivia.As_gff(output=gff_filename,
               filenames=annotations,
               exclude=[ 'gene', 'source' ]
        ).run()
        
        f_cyt = open(cytoband_filename,'wb')
        for filename in sequences:
            for name, seq in io.read_sequences(filename):
                assert '/' not in name
                f = open(os.path.join(self.prefix, name + '.txt'), 'wb')
                f.write(seq)
                f.close()
                print >> f_cyt, '%s\t0\t%d' % (name, len(seq))
        f_cyt.close()
        
        genome_filename = self.prefix + '.genome'
        if os.path.exists(genome_filename):
            os.unlink(genome_filename)
        io.execute(
            ['zip', '-j', io.abspath(genome_filename)] +
            [ io.abspath(item) for item in output_filenames ]
        )
        for filename in output_filenames:
            if os.path.exists(filename):
                os.unlink(filename)
Example #12
 def get_lengths(self):
     #Legacy working directory
     if not self.object_exists('reference-lengths.pickle.gz'):
         lengths = [ ]
         for name, seq in io.read_sequences(self.reference_fasta_filename()):
             name = name.split()[0]
             lengths.append( (name, len(seq)) )
         self.set_object(lengths, 'reference-lengths.pickle.gz')
             
     return self.get_object('reference-lengths.pickle.gz')
Example #13
 def convert(filename):
     info = io.get_file_info(filename)
     ok = selection.matches('type-fastq:[compression-none/compression-gzip/compression-bzip2]', info)
     if ok:
         return filename            
     result_name = tempname()
     with open(result_name,'wb') as f:
         for name, seq, qual in io.read_sequences(filename, qualities='required'):
             io.write_fastq(f, name, seq, qual)
     return result_name
Example #14
    def run(self):
        workspace = self.get_workspace()

        reference = reference_directory.Reference(self.reference,
                                                  must_exist=True)

        reader_f = io.open_possibly_compressed_file(self.vcf)
        reader = vcf.Reader(reader_f)
        variants = collections.defaultdict(list)
        for record in reader:
            variants[record.CHROM].append(record)
        reader_f.close()

        for chrom in variants:
            variants[chrom].sort(key=lambda item: item.POS)

        filenames = [workspace / (item + '.fa') for item in reader.samples]
        for filename in filenames:
            with open(filename, 'wb'):
                pass

        for name, seq in io.read_sequences(
                reference.reference_fasta_filename()):
            for i, sample in enumerate(reader.samples):
                revised = []
                pos = 0
                for variant in variants[name]:
                    gt = variant.samples[i].data.GT
                    if gt is None: continue
                    assert gt.isdigit(), \
                        'Unsupported genotype (can only use haploid genotypes): ' + gt
                    gt_number = int(gt)
                    if gt_number == 0:
                        var_seq = variant.REF
                    else:
                        var_seq = str(variant.ALT[gt_number - 1])
                        assert re.match('[ACGTN]*$', var_seq), \
                            'Unsupported variant type: ' + var_seq
                    new_pos = variant.POS - 1
                    assert new_pos >= pos, 'Variants overlap.'
                    revised.append(seq[pos:new_pos])
                    pos = new_pos
                    revised.append(var_seq)
                    assert seq[pos:pos + len(variant.REF)].upper() == variant.REF, \
                        'REF column in VCF does not match reference sequence'
                    pos += len(variant.REF)
                revised.append(seq[pos:])

                with open(filenames[i], 'ab') as f:
                    io.write_fasta(f, name, ''.join(revised))

            del variants[name]
        assert not variants, \
            'Chromosome names in VCF not in reference: ' + ' '.join(variants)
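
The splice loop cuts each REF span out of the reference and pastes in the chosen allele, checking that the reference actually contains REF at POS. A toy version with plain tuples in place of VCF records (POS is 1-based as in the VCF column):

    seq = 'AAACCCGGGTTT'
    variants = [(4, 'CCC', 'C'), (10, 'T', 'TTTT')]   # (POS, REF, ALT)
    revised = []
    pos = 0
    for vpos, ref, alt in variants:
        new_pos = vpos - 1
        assert seq[new_pos:new_pos + len(ref)] == ref, 'REF mismatch'
        revised.append(seq[pos:new_pos])   # unchanged stretch before the variant
        revised.append(alt)                # the substituted allele
        pos = new_pos + len(ref)           # skip over the replaced REF span
    revised.append(seq[pos:])
    print(''.join(revised))   # AAACGGGTTTTTT
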
Example #15
    def run(self):
        assert self.release
        assert self.species
        assert self.assembly
        assert self.dna
        
        extractions = [ ]
        for item in self.genes.split(','):
            extraction = item.split('/')
            assert len(extraction) == 4
            extractions.append(extraction)
            
        rename = { }
        if self.rename:
            for item in self.rename.split(','):
                old,new = item.split('=')
                rename[old] = new

        work = self.get_workspace()        
        ensembl = workspace.Workspace(work/'ensembl')
        
        genome_filename = self.species+"."+self.assembly+"."+self.dna+".fa.gz"
        genome_url = "rsync://ftp.ensembl.org/ensembl/pub/release-"+self.release+"/fasta/"+self.species.lower()+"/dna/"+genome_filename
        
        gff_filename = self.species+"."+self.assembly+"."+self.release+".gff3.gz"
        gff_url = "rsync://ftp.ensembl.org/ensembl/pub/release-"+self.release+"/gff3/"+self.species.lower()+"/"+gff_filename
        
        
        if self.download:
            self.log.log("Fetching "+genome_url+"\n")
            io.execute(['rsync','-aP',genome_url, ensembl/genome_filename])
            self.log.log("Fetching "+gff_url+"\n")
            io.execute(['rsync','-aP',gff_url, ensembl/gff_filename])
        
        with workspace.tempspace() as temp:
            items = list(annotation.read_annotations(ensembl/gff_filename))
            for item in items:
                item.seqid = rename.get(item.seqid, item.seqid)
            annotation.write_gff3(temp/'temp.gff', get_genes(items, extractions, self.log))
            del items
            
            with open(temp/'temp.fa','wb') as f:
                for name,seq in io.read_sequences(ensembl/genome_filename):
                    name = name.split()[0]
                    name = rename.get(name,name)
                    io.write_fasta(f, name, seq)
            
            reference_directory.Make_tt_reference(
                self.output_dir,
                filenames = [ temp/'temp.fa', temp/'temp.gff' ],
                index = self.index,
                ).run()
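
The concatenations build standard Ensembl rsync paths. With hypothetical values release='98', species='Danio_rerio', assembly='GRCz11' and dna='dna_sm.toplevel' the two URLs would be:

    rsync://ftp.ensembl.org/ensembl/pub/release-98/fasta/danio_rerio/dna/Danio_rerio.GRCz11.dna_sm.toplevel.fa.gz
    rsync://ftp.ensembl.org/ensembl/pub/release-98/gff3/danio_rerio/Danio_rerio.GRCz11.98.gff3.gz
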
Example #16
 def convert(filename):
     info = io.get_file_info(filename)
     ok = selection.matches(
         'type-fastq:[compression-none/compression-gzip/compression-bzip2]',
         info)
     if ok:
         return filename
     result_name = tempname()
     with open(result_name, 'wb') as f:
         for name, seq, qual in io.read_sequences(
                 filename, qualities='required'):
             io.write_fastq(f, name, seq, qual)
     return result_name
Example #17
    def run(self):
        f = self.begin_output()

        for filename in self.filenames:
            info = io.get_file_info(filename)

            any = False

            name = os.path.splitext(os.path.split(filename)[1])[0]

            if info.matches('sequences'):
                total = 0
                total_length = 0
                for seq in io.read_sequences(filename, qualities=True):
                    total += 1
                    total_length += len(seq[1])
                print >> f, grace.datum(name, 'sequences', total)
                print >> f, grace.datum(name, 'total bases', total_length)
                if total:
                    print >> f, grace.datum(name, 'average length',
                                            float(total_length) / total)
                print >> f
                any = True

            if info.matches('annotations'):
                total = 0
                counts = {}
                for item in annotation.read_annotations(filename, "/"):
                    total += 1
                    counts[item.type] = counts.get(item.type, 0) + 1

                print >> f, grace.datum(name, 'features', total)
                for key in sorted(counts):
                    print >> f, grace.datum(name, key + ' features',
                                            counts[key])
                print >> f
                any = True

            if info.matches('type-vcf'):
                reader_f = io.open_possibly_compressed_file(filename)
                reader = vcf.Reader(reader_f)
                n = 0
                for item in reader:
                    n += 1
                print >> f, grace.datum(name, 'variants', n)
                any = True

            if not any:
                raise grace.Error('Don\'t know what to do with ' + filename)

        self.end_output(f)
Example #18
def debias(args):
    import numpy

    radius, args = grace.get_option_value(args, '--radius', int, 2)

    dirs = args

    for dir_name in dirs:
        for name, seq in io.read_sequences(
                os.path.join(dir_name, 'reference.fa')):
            for suffix, ambig_suffix in [
                ('-depth', '-ambiguous-depth'),
                ('-pairspan-depth', '-ambiguous-pairspan-depth'),
            ]:
                root = grace.filesystem_friendly_name(name)
                full_name = os.path.join(dir_name, root + suffix + '.userplot')
                full_ambig_name = os.path.join(
                    dir_name, root + ambig_suffix + '.userplot')
                if not os.path.exists(full_name): continue
                if not os.path.exists(full_ambig_name): continue

                output_suffix = '-%d.userplot' % radius

                print dir_name, root, output_suffix

                depths = numpy.array(read_unstranded_userplot(full_name))
                ambig_depths = numpy.array(
                    read_unstranded_userplot(full_ambig_name))
                expect = expected_depth(root, seq, depths, ambig_depths,
                                        radius)

                write_unstranded_userplot(
                    os.path.join(dir_name,
                                 root + suffix + '-expected' + output_suffix),
                    expect)

                corrected = depths / expect * numpy.median(expect)
                corrected[expect <= 5.0] = 0.0
                write_unstranded_userplot(
                    os.path.join(dir_name,
                                 root + suffix + '-corrected' + output_suffix),
                    corrected)

                ambig_corrected = ambig_depths / expect * numpy.median(expect)
                ambig_corrected[expect <= 0.0] = 0.0
                write_unstranded_userplot(
                    os.path.join(
                        dir_name,
                        root + ambig_suffix + '-corrected' + output_suffix),
                    ambig_corrected)
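
The correction divides observed depth by the locally expected depth and rescales by the median of the expected values, zeroing positions whose expected depth is too small to trust. A small numpy sketch:

    import numpy
    depths = numpy.array([10.0, 20.0, 40.0])
    expect = numpy.array([10.0, 20.0, 20.0])
    corrected = depths / expect * numpy.median(expect)
    corrected[expect <= 5.0] = 0.0   # too little expected coverage to correct
    print(corrected)   # [20. 20. 40.] -- typically-biased positions map to the median scale
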
Example #19
    def run(self):
        f = self.begin_output()
    
        for filename in self.filenames:
            info = io.get_file_info(filename)
            
            any = False
            
            name = os.path.splitext(os.path.split(filename)[1])[0]
            
            if info.matches('sequences'):
                total = 0
                total_length = 0
                for seq in io.read_sequences(filename, qualities=True):
                    total += 1
                    total_length += len(seq[1])
                print >> f, grace.datum(name, 'sequences', total)
                print >> f, grace.datum(name, 'total bases', total_length)
                if total:
                    print >> f, grace.datum(name, 'average length', float(total_length)/total)
                print >> f
                any = True
            
            if info.matches('annotations'):
                total = 0
                counts = { }
                for item in annotation.read_annotations(filename):
                    total += 1
                    counts[item.type] = counts.get(item.type,0)+1
                                
                print >> f, grace.datum(name, 'features', total)
                for key in sorted(counts):
                    print >> f, grace.datum(name, key + ' features', counts[key])
                print >> f
                any = True
            
            if info.matches('type-vcf'):
                reader_f = io.open_possibly_compressed_file(filename)
                reader = vcf.Reader(reader_f)
                n = 0
                for item in reader:
                    n += 1
                print >> f, grace.datum(name, 'variants', n)
                any = True
            
            if not any:
                raise grace.Error('Don\'t know what to do with ' + filename)

        self.end_output(f)
Example #20
def iter_reads(config, qualities=False):
    if 'stride' not in config:
        raise grace.Error('Please re-run nesoni shrimp, output format has changed')

    stride = config['stride']
    for reads_filename_set in config['reads']:
        if config['solid']:
            reader = [ io.read_solid(filename) for filename in reads_filename_set ]
        else:
            reader = [ io.read_sequences(filename, qualities) for filename in reads_filename_set ]
        reader = itertools.izip(*reader)
        
        for i, items in enumerate(reader):
            if i % stride == 0: 
               for item in items: 
                   yield item
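
izip yields one tuple per position across the parallel read files, and the stride test keeps every stride-th tuple. The same behaviour with plain lists standing in for the readers:

    left = ['L0', 'L1', 'L2', 'L3', 'L4', 'L5']
    right = ['R0', 'R1', 'R2', 'R3', 'R4', 'R5']
    stride = 3
    for i, items in enumerate(zip(left, right)):
        if i % stride == 0:
            for item in items:
                print(item)   # L0 R0 L3 R3
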
Example #21
    def run(self):
        workspace = self.get_workspace()
        
        reference = reference_directory.Reference(self.reference, must_exist=True)
        
        reader_f = io.open_possibly_compressed_file(self.vcf)
        reader = vcf.Reader(reader_f)
        variants = collections.defaultdict(list)
        for record in reader:
            variants[record.CHROM].append(record)
        reader_f.close()
        
        for chrom in variants:
            variants[chrom].sort(key=lambda item: item.POS)
        
        filenames = [ workspace/(item+'.fa') for item in reader.samples ]
        for filename in filenames:
            with open(filename,'wb'): pass
        
        for name, seq in io.read_sequences(reference.reference_fasta_filename()):
            for i, sample in enumerate(reader.samples):            
                revised = [ ]
                pos = 0
                for variant in variants[name]:
                    gt = variant.samples[i].data.GT
                    if gt is None: continue
                    assert gt.isdigit(), 'Unsupported genotype (can only use haploid genotypes): '+gt
                    gt_number = int(gt)
                    if gt_number == 0:
                        var_seq = variant.REF
                    else:
                        var_seq = str(variant.ALT[gt_number-1])
                        assert re.match('[ACGTN]*$', var_seq), 'Unsupported variant type: '+var_seq
                    new_pos = variant.POS-1
                    assert new_pos >= pos, 'Variants overlap.'
                    revised.append(seq[pos:new_pos])
                    pos = new_pos
                    revised.append(var_seq)
                    assert seq[pos:pos+len(variant.REF)].upper() == variant.REF, 'REF column in VCF does not match reference sequence'
                    pos += len(variant.REF)
                revised.append(seq[pos:])
                            
                with open(filenames[i],'ab') as f:
                    io.write_fasta(f, name, ''.join(revised))

            del variants[name]        
        assert not variants, 'Chromosome names in VCF not in reference: '+' '.join(variants)
Example #22
    def run(self):    
        f = self.begin_output()
    
        for filename in self.filenames:
            for name, seq in io.read_sequences(filename):
                name_parts = name.split(None, 1)
                name = name_parts[0]
                for i in xrange(-self.size+self.stride,len(seq),self.stride):
                    start = max(0,min(len(seq),i))
                    end = max(0,min(len(seq), i+self.size))
                    shred_name = '%s:%d..%d' % (name,start+1,end)
                    shred_seq = seq
                    if self.quality:
                        io.write_fastq(f, shred_name, seq[start:end], chr(33+self.quality)*(end-start))
                    else:
                        io.write_fasta(f, shred_name, seq[start:end])

        self.end_output(f)
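
chr(33 + self.quality) encodes the constant quality as one phred+33 character per base, repeated to match the shred length:

    quality = 30
    print(chr(33 + quality) * 5)   # ????? (phred+33 encoding of Q30)
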
Example #23
    def run(self):
        min_quality = chr(33+self.quality)

        with io.open_possibly_compressed_writer(self.prefix+'.csfastq.gz') as out_file:        
            n = 0
            n_discarded = 0
            n_clipped = 0
            total_before = 0
            total_clipped = 0
            for filename in self.filenames:
                for name, seq, qual in io.read_sequences(filename, qualities='required'):
                    
                    score = 0
                    start = 0
                    for i in xrange(len(seq)-1):
                        if qual[i] >= min_quality:
                            if seq[i+1] == '0':
                                score += 1
                            else:
                                score = max(0, score-4)
                                if not score: start = i+2
                    
                    n += 1
                    total_before += len(seq)
                
                    if start > self.length+1:
                        if start < len(seq):
                            n_clipped += 1
                            total_clipped += len(seq)-start
                        
                        print >> out_file, '@'+name
                        print >> out_file, seq[:start]
                        print >> out_file, '+'
                        print >> out_file, qual[:start-1]
                    else:
                        n_discarded += 1
        
        self.log.datum(self.sample,'reads',n)
        if n:
            self.log.datum(self.sample,'mean length before poly-A clipping',float(total_before)/n)
        self.log.datum(self.sample,'reads discarded as too short after poly-A clipping',n_discarded)
        self.log.datum(self.sample,'reads poly-A clipped and kept',n_clipped)
        if n_clipped:
            self.log.datum(self.sample,'mean length clipped',float(total_clipped)/n_clipped)
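
The scoring loop scans the colorspace read: a high-quality '0' colour (same base as the one before) extends a candidate poly-A run, any other colour costs 4 points, and whenever the score drops to zero the clip point advances past the current colour. A toy trace with made-up thresholds and read:

    quality, length = 20, 5
    min_quality = chr(33 + quality)
    seq = 'T0120000000000'    # primer base plus 13 colours (made up)
    qual = 'I' * 13           # one high quality per colour
    score = 0
    start = 0
    for i in range(len(seq) - 1):
        if qual[i] >= min_quality:
            if seq[i + 1] == '0':
                score += 1
            else:
                score = max(0, score - 4)
                if not score:
                    start = i + 2
    print(start, seq[:start])   # 4 T012: the read would be clipped before the poly-A run
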
Example #24
def debias(args):
    import numpy

    radius, args = grace.get_option_value(args, '--radius', int, 2) 

    dirs = args
    
    for dir_name in dirs:
        for name, seq in io.read_sequences(os.path.join(dir_name,'reference.fa')):
            for suffix, ambig_suffix in [
                ('-depth', '-ambiguous-depth'),
                ('-pairspan-depth', '-ambiguous-pairspan-depth'),
            ]:
                root = grace.filesystem_friendly_name(name)
                full_name = os.path.join(dir_name, root + suffix + '.userplot')
                full_ambig_name = os.path.join(dir_name, root + ambig_suffix + '.userplot')
                if not os.path.exists(full_name): continue
                if not os.path.exists(full_ambig_name): continue
                
                output_suffix = '-%d.userplot' % radius 

                print dir_name, root, output_suffix
                
                depths = numpy.array( read_unstranded_userplot(full_name) )
                ambig_depths = numpy.array( read_unstranded_userplot(full_ambig_name) )
                expect = expected_depth(root, seq, depths, ambig_depths, radius)
                
                write_unstranded_userplot(
                    os.path.join(dir_name, root + suffix + '-expected' + output_suffix),
                    expect) 
                
                corrected = depths / expect * numpy.median(expect)
                corrected[expect <= 5.0] = 0.0
                write_unstranded_userplot(
                    os.path.join(dir_name, root + suffix + '-corrected' + output_suffix),
                    corrected)                 
                
                ambig_corrected = ambig_depths / expect * numpy.median(expect)
                ambig_corrected[expect <= 0.0] = 0.0
                write_unstranded_userplot(
                    os.path.join(dir_name, root + ambig_suffix + '-corrected' + output_suffix),
                    ambig_corrected)                 
Example #25
    def run(self):
        f = self.begin_output()

        for filename in self.filenames:
            for name, seq in io.read_sequences(filename):
                name_parts = name.split(None, 1)
                name = name_parts[0]
                for i in xrange(-self.size + self.stride, len(seq),
                                self.stride):
                    start = max(0, min(len(seq), i))
                    end = max(0, min(len(seq), i + self.size))
                    shred_name = '%s:%d..%d' % (name, start + 1, end)
                    shred_seq = seq
                    if self.quality:
                        io.write_fastq(f, shred_name, seq[start:end],
                                       chr(33 + self.quality) * (end - start))
                    else:
                        io.write_fasta(f, shred_name, seq[start:end])

        self.end_output(f)
Example #26
    def set_sequences(self, filenames):        
        reference_genbank_filename = self / 'reference.gbk'
        reference_filename = self / 'reference.fa'

        reference_genbank_file = open(reference_genbank_filename,'wb')
        any_genbank = [ False ]

        def genbank_callback(name, record):
            """ Make a copy of any genbank files passed in. """
            from Bio import SeqIO
            
            SeqIO.write([record], reference_genbank_file, 'genbank')
            
            f = open(self / (grace.filesystem_friendly_name(name) + '.gbk'), 'wb')
            SeqIO.write([record], f, 'genbank')
            f.close()
            
            any_genbank[0] = True
        
        lengths = [ ]
        seen = set()
        f = open(reference_filename, 'wb')
        for filename in filenames:
            for name, seq in io.read_sequences(filename, genbank_callback=genbank_callback):
                name = name.split()[0]
                assert name not in seen, 'Duplicate chromosome name: ' + name
                seen.add(name)
                lengths.append( (name, len(seq)) )
                io.write_fasta(f, name, seq)
        f.close()        
        self.set_object(lengths, 'reference-lengths.pickle.gz')
        
        reference_genbank_file.close()
        if not any_genbank[0]:
            os.unlink(reference_genbank_filename)
            
        # Create an index of the reference sequences for samtools
        io.execute([
            'samtools', 'faidx', reference_filename
        ])
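
any_genbank is a one-element list because a Python 2 closure cannot rebind an outer local (there is no nonlocal); mutating the list's element works around that:

    any_genbank = [False]
    def genbank_callback():
        any_genbank[0] = True   # mutates the list seen by the enclosing scope
    genbank_callback()
    print(any_genbank[0])   # True
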
Example #27
def iter_reads(config, qualities=False):
    if 'stride' not in config:
        raise grace.Error(
            'Please re-run nesoni shrimp, output format has changed')

    stride = config['stride']
    for reads_filename_set in config['reads']:
        if config['solid']:
            reader = [
                io.read_solid(filename) for filename in reads_filename_set
            ]
        else:
            reader = [
                io.read_sequences(filename, qualities)
                for filename in reads_filename_set
            ]
        reader = itertools.izip(*reader)

        for i, items in enumerate(reader):
            if i % stride == 0:
                for item in items:
                    yield item
Example #28
 def run(self):
     seqs = [ ]
     seen = 0
     for filename in self.filenames:
         for seq in io.read_sequences(filename, qualities=True):
             seen += 1
             if seen % 100000 == 0: grace.status('Scanned '+grace.pretty_number(seen))
             
             if len(seqs) < self.n:
                 seqs.append(seq)
             # Reservoir sampling: replace a random slot with probability n/seen.
             elif random.random() * seen < self.n:
                 seqs[ random.randrange(self.n) ] = seq
     grace.status('')
     
     print >> sys.stderr, 'Sampled', grace.pretty_number(len(seqs)), 'of', grace.pretty_number(seen), 'sequences'
 
     if not seqs: return
     qualities = len(seqs[0]) == 3   # records are (name, seq, qual) or (name, seq)
     if qualities:
         for name, seq, qual in seqs:
             io.write_fastq(sys.stdout, name, seq, qual)
     else:
         for name, seq in seqs:
             io.write_fasta(sys.stdout, name, seq)
Example #29
    def run(self):
        assert self.reads or self.pairs or self.interleaved, 'No reads given'

        io.check_name_uniqueness(self.reads, self.pairs, self.interleaved)

        working = self.get_workspace()
        working.setup_reference(self.references, bowtie=True)
        working.update_param(snp_cost=2.0)
        reference = working.get_reference()

        log_file = open(self.log_filename(), 'wb')

        with workspace.tempspace(dir=working.working_dir) as temp:
            n = [0]

            def tempname():
                n[0] += 1
                return temp / ('%d.fq' % n[0])

            def convert(filename):
                info = io.get_file_info(filename)
                ok = selection.matches(
                    'type-fastq:[compression-none/compression-gzip/compression-bzip2]',
                    info)
                if ok:
                    return filename
                result_name = tempname()
                with open(result_name, 'wb') as f:
                    for name, seq, qual in io.read_sequences(
                            filename, qualities='required'):
                        io.write_fastq(f, name, seq, qual)
                return result_name

            ones = []
            twos = []
            singles = []

            for pair in self.pairs:
                assert len(pair) == 2, 'Need two files in each "pair:" section.'
                ones.append(convert(pair[0]))
                twos.append(convert(pair[1]))

            for item in self.interleaved:
                left_name = tempname()
                right_name = tempname()
                ones.append(left_name)
                twos.append(right_name)
                with open(left_name,'wb') as left, \
                     open(right_name,'wb') as right:
                    reader = io.read_sequences(item, qualities='required')
                    while True:
                        try:
                            name, seq, qual = reader.next()
                        except StopIteration:
                            break
                        io.write_fastq(left, name, seq, qual)

                        try:
                            name, seq, qual = reader.next()
                        except StopIteration:
                            raise grace.Error(
                                'Interleaved file contains odd number of sequences'
                            )
                        io.write_fastq(right, name, seq, qual)

            for item in self.reads:
                singles.append(convert(item))

            cores = min(self.cores, legion.coordinator().get_cores())

            command = ([
                'bowtie2',
                '--threads',
                str(cores),
                '--rg-id',
                '1',
                '--rg',
                'SM:' + working.name,
            ] + self.bowtie_options +
                       ['-x', reference.get_bowtie_index_prefix()])
            commands = []
            if ones:
                commands.append(command +
                                ['-1', ','.join(ones), '-2', ','.join(twos)])
            if singles:
                commands.append(command + ['-U', ','.join(singles)])

            temp_bam_name = temp / 'temp.bam'

            with io.pipe_to(['samtools', 'view', '-S', '-b', '-'],
                            stdout=open(temp_bam_name, 'wb'),
                            stderr=log_file) as f:
                header_sent = False
                for command in commands:
                    self.log.log('Running:\n' + ' '.join(command) + '\n')
                    with io.pipe_from(command, stderr=log_file,
                                      cores=cores) as f_out:
                        for line in f_out:
                            if not header_sent or not line.startswith('@'):
                                f.write(line)
                    header_sent = True

            #io.execute([
            #    'samtools', 'sort', '-n', temp_bam_name, working/'alignments'
            #    ])

            sam.sort_bam(temp_bam_name,
                         working / 'alignments',
                         by_name=True,
                         cores=self.cores)

        log_file.close()
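
When several bowtie2 runs are piped into a single samtools view, only the first run's '@' header lines are forwarded; later runs contribute alignment lines only. The merging rule in isolation (strings stand in for tab-separated SAM lines):

    runs = [['@HD VN:1.6', 'read1 ...'], ['@HD VN:1.6', 'read2 ...']]
    merged = []
    header_sent = False
    for run in runs:
        for line in run:
            if not header_sent or not line.startswith('@'):
                merged.append(line)
        header_sent = True   # suppress headers from every later run
    print(merged)   # one header, then read1 and read2
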
Example #30
def fill_scaffolds(args):
    max_filler_length, args = grace.get_option_value(args, '--max-filler', int, 4000)
    
    if len(args) < 2:
        print USAGE
        return 1
    
    (output_dir, graph_dir), args = args[:2], args[2:]

    scaffolds = [ ]
    
    def scaffold(args):
        circular, args = grace.get_option_value(args, '--circular', grace.as_bool, False)
        
        scaffold = [ ]
        for item in args:
            scaffold.append( ('contig', int(item)) )
            scaffold.append( ('gap', None) )
        
        if not circular: scaffold = scaffold[:-1]
        
        name = 'custom_scaffold_%d' % (len(scaffolds)+1)
        scaffolds.append( (name, scaffold) )
            
    grace.execute(args, [scaffold])
    
    custom_scaffolds = (len(scaffolds) != 0)    
    
    sequences = dict( 
        (a.split()[0], b.upper()) 
          for a,b in 
            io.read_sequences(os.path.join(
              graph_dir, '454AllContigs.fna')))
    
    sequence_names = sorted(sequences)
    sequence_ids = dict(zip(sequence_names, xrange(1,len(sequence_names)+1)))
    
    contexts = { }
    context_names = { }
    context_depths = { }
    for i in xrange(1,len(sequence_names)+1):
        seq = sequences[sequence_names[i-1]]
        contexts[ i ] = seq
        context_names[ i ] = sequence_names[i-1]+'-fwd'
        contexts[ -i ] = bio.reverse_complement(seq)
        context_names[ -i ] = sequence_names[i-1]+'-rev'
    
    links = collections.defaultdict(list)
    
    for line in open(
      os.path.join(graph_dir, '454ContigGraph.txt'),
      'rU'):
        parts = line.rstrip('\n').split('\t')
        
        if parts[0].isdigit():
            seq = sequence_ids[parts[1]]
            context_depths[ seq] = float(parts[3])
            context_depths[-seq] = float(parts[3])
        
        if parts[0] == 'C':    
            name1 = 'contig%05d' % int(parts[1])
            dir1 = {"3'" : 1, "5'" : -1 }[parts[2]]
            name2 = 'contig%05d' % int(parts[3])
            dir2 = {"5'" : 1, "3'" : -1 }[parts[4]]
            depth = int(parts[5])
            #print name1, dir1, name2, dir2, depth
            
            links[ sequence_ids[name1] * dir1 ].append( (depth, sequence_ids[name2] * dir2) )
            links[ sequence_ids[name2] * -dir2 ].append( (depth, sequence_ids[name1] * -dir1) )
    
        if parts[0] == 'S' and not custom_scaffolds:  
            name = 'scaffold%05d' % int(parts[2])  
            components = parts[3].split(';')
            scaffold = [ ]
            for component in components:
                a,b = component.split(':')
                if a == 'gap':
                    scaffold.append( ('gap',int(b)) )
                else:
                    strand = { '+': +1, '-': -1 }[ b ]
                    scaffold.append( ('contig', sequence_ids['contig%05d'%int(a)] * strand) )
            scaffolds.append( (name, scaffold) )
    
    
    
    #paths = { }
    #
    #todo = [ ]
    #for i in contexts:
    #    for depth_left, neg_left in links[-i]:
    #        left = -neg_left
    #        for depth_right, right in links[i]:
    #            todo.append( ( max(-depth_left,-depth_right,-context_depths[i]), left, right, (i,)) )
    #
    #heapq.heapify(todo)
    #while todo:
    #    score, source, dest, path = heapq.heappop(todo)
    #    if (source,dest) in paths: continue
    #    
    #    paths[(source,dest)] = path
    #    
    #    if len(contexts[dest]) > max_filler_length: continue
    #    
    #    for depth, next in links[dest]:
    #        heapq.heappush(todo,
    #            ( max(score,-depth,-context_depths[dest]), source, next, path+(dest,))
    #        )
    
    
    path_source_dest = collections.defaultdict(dict) # source -> dest -> next
    path_dest_source = collections.defaultdict(dict) # dest -> source -> next
    
    
    # Use links, in order of depth of coverage, to construct paths between contigs
    # Thus: paths have maximum minimum depth
    #       subsections of paths also have this property
    
    todo = [ ]
    for i in contexts:    
        for depth_link, right in links[i]:
            todo.append( ( depth_link, i, right) )
    todo.sort(reverse=True)
    for score, left, right in todo:
        if right in path_source_dest[left]: continue
        
        sources = [(left,right)]
        if len(contexts[left]) <= max_filler_length:
            sources += path_dest_source[left].items()
        destinations = [right]
        if len(contexts[right]) <= max_filler_length:
            destinations += path_source_dest[right].keys()
        
        for source, next in sources:
            for dest in destinations:
                if dest in path_source_dest[source]: continue
                path_source_dest[source][dest] = next
                path_dest_source[dest][source] = next
    
    
    workspace = io.Workspace(output_dir)
    scaffold_f = workspace.open('scaffolds.fa','wb')
    
    #comments = [ ]
    features = [ ]
    
    used = set()
    previous_total = 0
    
    for i, (name, scaffold) in enumerate(scaffolds):
        result = '' # Inefficient. Meh.
        n_filled = 0
        n_failed = 0
        for j, item in enumerate(scaffold):
            if item[0] == 'contig':
                result += contexts[item[1]]
                used.add(abs(item[1]))
            else:
                left = scaffold[j-1]
                right = scaffold[ (j+1) % len(scaffold) ] #If gap at end, assume circular
                assert left[0] == 'contig'
                assert right[0] == 'contig'
                
                gap_start = len(result)
    
                can_fill = right[1] in path_source_dest[left[1]]
                if can_fill:
                    n = 0
                    k = path_source_dest[left[1]][right[1]]
                    while k != right[1]:
                        n += len(contexts[k])
                        result += contexts[k].lower()
                        used.add(abs(k))
                        
                        k = path_source_dest[k][right[1]]
                    
                    n_filled += 1
                        
                    if item[1] is not None and max(n,item[1]) > min(n,item[1])*4:
                        print >> sys.stderr, 'Warning: gap size changed from %d to %d in scaffold %d' % (item[1],n,i+1)
                else:
                    n_failed += 1
                    
                    #print >> sys.stderr, 'Warning: No path to fill a gap in scaffold %d' % (i+1)
                    result += 'n' * (9 if item[1] is None else item[1])
    
                gap_end = len(result)
                
                #features.append( '%s\t%s\t%s\t%d\t%d\t%s\t%s\t%s\t%s' % (
                #    'all-scaffolds',
                #    'fill-scaffolds',
                #    'gap',
                #    previous_total + gap_start+1,
                #    previous_total + max(gap_end, gap_start+1), #Allow for zeroed out gaps. Hmm.
                #    '.', #score
                #    '+', #strand
                #    '.', #frame
                #    '' #properties
                #))
                features.append( '%s\t%s\t%s\t%d\t%d\t%s\t%s\t%s\t%s' % (
                    name,
                    'fill-scaffolds',
                    'gap',
                    gap_start+1,
                    max(gap_end, gap_start+1), #Allow for zeroed out gaps. Hmm.
                    '.', #score
                    '+', #strand
                    '.', #frame
                    '' #properties
                ))
                    
    
        io.write_fasta(scaffold_f, name, result)
        previous_total += len(result)
        #comments.append('##sequence-region    %s %d %d' % (name, 1, len(result)))
        print >> sys.stderr, 'Scaffold%05d: %d gaps filled, %d could not be filled' % (i+1, n_filled, n_failed)
    
    scaffold_f.close()
    
    gff_f = workspace.open('scaffolds.gff', 'wb')
    #print >>gff_f, '##gff-version    3'
    #for comment in comments:
    #    print >>gff_f, comment
    for feature in features:
        print >>gff_f, feature
    gff_f.close()
    
    
    leftovers_f = workspace.open('leftovers.fa', 'wb')
    for name in sequence_names:
        if sequence_ids[name] not in used:
            io.write_fasta(leftovers_f, name, sequences[name])
    leftovers_f.close()
    
    ends = { }
    for i, (name, scaffold) in enumerate(scaffolds):
        if scaffold[-1][0] == 'gap': continue
        ends[ '%s start' % name ] = scaffold[-1][1]
        ends[ '%s end  ' % name ] = -scaffold[0][1] 
    
    for end1 in sorted(ends):
        options = [ end2 for end2 in ends if -ends[end2] in path_source_dest[ends[end1]] ]
        if len(options) == 1:
            print >> sys.stderr, 'Note: from', end1, 'only', options[0], 'is reachable'
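
Once the path tables are built, filling a gap walks the stored next-hop pointers from the left contig until the right contig is reached, as in the while loop above. A tiny sketch with a hypothetical recorded path A -> B -> C -> D:

    path_source_dest = {'A': {'D': 'B'}, 'B': {'D': 'C'}, 'C': {'D': 'D'}}
    k = path_source_dest['A']['D']
    hops = []
    while k != 'D':
        hops.append(k)                 # intermediate contig used as gap filler
        k = path_source_dest[k]['D']
    print(hops)   # ['B', 'C']
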
Example #31
    def run(self):
        #mincov, args = grace.get_option_value(args, '--mincov', int, 1)
        #maxdiff, args = grace.get_option_value(args, '--maxdiff', int, 16)
        #minsize, args = grace.get_option_value(args, '--minsize', int, 200)
        #what, args = grace.get_option_value(args, '--what', as_core_or_unique, 'core')
        #is_core = (what == 'core')
        #
        #grace.expect_no_further_options(args)
        #
        #if len(args) < 2:
        #    print >> sys.stderr, HELP
        #    raise grace.Help_shown()
        #
        #output_dir, working_dirs = args[0], args[1:]
        #
        ##assert not path.exists(path.join(output_dir, 'reference.fa')), \
        #assert not path.exists(path.join(output_dir, 'parameters')), \
        #        'Output directory not given'
        #
        #if not path.exists(output_dir):
        #    os.mkdir(output_dir)

        assert self.what in ('core', 'unique'), \
            'Expected --what to be either "core" or "unique".'
        is_core = (self.what == 'core')

        workspace = self.get_workspace()

        reference = working_directory.Working(
            self.working_dirs[0]).get_reference()
        for name, seq in io.read_sequences(reference.reference_fasta_filename()):
            self.log.log(name + '\n')
            friendly_name = grace.filesystem_friendly_name(name)

            good = [True] * len(seq)

            for working_dir in self.working_dirs:
                if is_core:
                    suffix = '-depth.userplot'
                else:
                    suffix = '-ambiguous-depth.userplot'
                data = trivia.read_unstranded_userplot(
                    os.path.join(working_dir, friendly_name + suffix))
                assert len(seq) == len(data)
                for i in xrange(len(seq)):
                    if good[i]:
                        if is_core:
                            good[i] = data[i] >= self.mincov
                        else:
                            good[i] = data[i] < self.mincov

            #Close holes
            start = -self.maxdiff - 1
            n_holes = 0
            for i in xrange(len(seq)):
                if good[i]:
                    if 0 < i - start <= self.maxdiff:
                        for j in xrange(start, i):
                            good[j] = True
                        n_holes += 1
                    start = i + 1
            self.log.log('Closed ' + grace.pretty_number(n_holes) + ' holes\n')

            f = open(workspace / ('%s-%s.fa' % (friendly_name, self.what)),
                     'wb')
            io.write_fasta(
                f, name, ''.join([(seq[i] if good[i] else 'N')
                                  for i in xrange(len(seq))]))
            f.close()

            f = open(
                workspace / ('%s-%s_masked.fa' % (friendly_name, self.what)),
                'wb')
            io.write_fasta(
                f, name, ''.join([(seq[i] if good[i] else seq[i].lower())
                                  for i in xrange(len(seq))]))
            f.close()

            f_good = open(
                workspace / ('%s-%s_parts.fa' % (friendly_name, self.what)),
                'wb')
            f_nongood = open(
                workspace / ('%s-non%s_parts.fa' % (friendly_name, self.what)),
                'wb')
            start = 0
            n_good = [0]
            n_good_bases = [0]

            def emit(i):
                if i - start < self.minsize: return
                if good[start]:
                    n_good[0] += 1
                    n_good_bases[0] += i - start
                io.write_fasta(f_good if good[start] else f_nongood,
                               '%s:%d..%d' % (name, start + 1, i),
                               seq[start:i])

            for i in xrange(1, len(seq)):
                if good[i] != good[start]:
                    emit(i)
                    start = i
            emit(len(seq))
            f_nongood.close()
            f_good.close()

            self.log.log(
                grace.pretty_number(sum(good)) + ' bases are ' + self.what +
                ', of ' + grace.pretty_number(len(seq)) +
                ' in reference sequence\n')
            self.log.log(
                grace.pretty_number(n_good[0]) + ' parts at least ' +
                grace.pretty_number(self.minsize) + ' bases long with ' +
                grace.pretty_number(n_good_bases[0]) + ' total bases\n')
            self.log.log('\n')
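
# A standalone sketch of the hole-closing pass above: runs of False in
# `good` no longer than `maxdiff`, flanked by True on both sides, are
# flipped to True. It mirrors the loop in run() over a plain list, so it
# can be tested in isolation.
def close_holes(good, maxdiff):
    good = list(good)
    n_holes = 0
    start = -maxdiff - 1          # just past the last True seen
    for i in xrange(len(good)):
        if good[i]:
            if 0 < i - start <= maxdiff:
                for j in xrange(start, i):
                    good[j] = True
                n_holes += 1
            start = i + 1
    return good, n_holes

# A 2-base hole is closed, a 4-base hole is left open (maxdiff=3):
print close_holes([True, False, False, True,
                   False, False, False, False, True], 3)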
Ejemplo n.º 34
0
    def run(self):
        grace.require_shrimp_2()
        grace.require_samtools()
        assert self.references, 'No reference sequences given'
        assert self.reads or self.pairs or self.interleaved, 'No reads given'
        for pair in self.pairs:
            assert len(pair) == 2, 'Two files required in each pair: section'

        io.check_name_uniqueness(self.reads, self.pairs, self.interleaved)

        read_sets = []
        for item in self.reads:
            read_sets.append(([item], False))
        for item in self.pairs:
            read_sets.append((item, True))
        for item in self.interleaved:
            read_sets.append(([item], True))

        #Create working directory

        workspace = self.get_workspace()
        workspace.setup_reference(self.references)
        workspace.update_param(snp_cost=25)
        reference = workspace.get_reference()
        reference_filename = reference.reference_fasta_filename()

        cores = min(self.cores, legion.coordinator().get_cores())

        default_options = {
            '-E': None,
            '-T': None,
            '-N': str(cores),
            '-n': '2',
            '-w': '200%',
            '-p': 'opp-in',
            '-I': '0,500',
            '-X': None,
        }

        if self.sam_unaligned:
            default_options['--sam-unaligned'] = None

        if self.half_paired:
            default_options['--half-paired'] = None
        else:
            default_options['--no-half-paired'] = None

        cutoff = '55%'  #Default changed in SHRiMP 2.0.2
        if '-h' in self.shrimp_options:
            cutoff = self.shrimp_options[self.shrimp_options.index('-h') + 1]

        #Run shrimp

        bam_filename = io.abspath(self.output_dir, 'alignments.bam')
        bam_prefix = io.abspath(self.output_dir, 'alignments')
        bam_sorted_prefix = io.abspath(self.output_dir, 'alignments_sorted')

        temp_filename = io.abspath(self.output_dir, 'temp.bam')

        log_filename = io.abspath(self.output_dir, 'shrimp_log.txt')
        log_file = open(log_filename, 'wb')

        sam_eater = sam.Bam_writer(temp_filename)

        sam_header_sent = [False]
        n_seen = [0]

        def eat(f):
            for line in f:
                if line.startswith('@'):
                    if sam_header_sent[0]: continue
                else:
                    n_seen[0] += 1
                    if n_seen[0] % 100000 == 0:
                        grace.status('%s alignments produced' %
                                     grace.pretty_number(n_seen[0]))
                sam_eater.write_raw(line)
            sam_header_sent[0] = True

        def remove_pair_options(options):
            for flag in ['-p', '-I']:
                while flag in options:
                    pos = options.index(flag)
                    options = options[:pos] + options[pos + 2:]
            for flag in ['--half-paired']:
                while flag in options:
                    pos = options.index(flag)
                    options = options[:pos] + options[pos + 1:]
            return options

        for i, (filenames, is_paired) in enumerate(read_sets):
            options = self.shrimp_options[:]

            has_qualities = all(
                len(io.read_sequences(filename, qualities=True).next()) ==
                3  #A little ugly
                for filename in filenames)
            if has_qualities:
                options.append('--fastq')

            if len(filenames) == 1:
                reads_parameters = [filenames[0]]
            else:
                reads_parameters = ['-1', filenames[0], '-2', filenames[1]]

            if '--qv-offset' not in self.shrimp_options:
                #guesses = [ ]
                #for filename in filenames:
                #    guesses.append(io.guess_quality_offset(filename))
                #assert len(set(guesses)) == 1, 'Conflicting quality offset guesses, please specify --qv-offset manually.'
                #default_options['--qv-offset'] = str(guesses[0])
                default_options['--qv-offset'] = str(
                    io.guess_quality_offset(*filenames))

            default_options['--read-group'] = '%s,%s' % (
                workspace.name.replace(',',
                                       '_'), workspace.name.replace(',', '_'))
            for flag in default_options:
                if flag not in options:
                    options.append(flag)
                    if default_options[flag] is not None:
                        options.append(default_options[flag])

            if not is_paired:
                options = remove_pair_options(options)

            grace.status('')

            full_param = reference.shrimp_command(self.cs,
                                                  options + reads_parameters)

            print >> sys.stderr, 'Running', ' '.join(full_param)

            with io.pipe_from(full_param, stderr=log_file, cores=cores) as f:
                eat(f)

        log_file.close()

        sam_eater.close()

        grace.status('Sort')

        #io.execute([
        #    'samtools', 'sort', '-n', temp_filename, bam_prefix
        #])
        sam.sort_bam(temp_filename, bam_prefix, by_name=True, cores=self.cores)

        os.unlink(temp_filename)

        grace.status('')
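
# A small sketch of the option handling above: defaults are appended
# only when the user has not already supplied the flag, exactly as the
# loop over default_options does in run(). The flag values here are
# made-up examples in SHRiMP's style.
def merge_defaults(options, defaults):
    options = options[:]
    for flag in defaults:
        if flag not in options:
            options.append(flag)
            if defaults[flag] is not None:
                options.append(defaults[flag])
    return options

# -n is kept as given; -w and the valueless -E are filled in:
print merge_defaults(['-n', '4'], {'-n': '2', '-w': '200%', '-E': None})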
Ejemplo n.º 35
0
    def run(self):
        references = {}
        for filename in self.reference_filenames:
            for name, seq in io.read_sequences(filename):
                references[name] = seq

        tail_lengths = {}
        adaptor_bases = {}
        for filename in self.clips:
            with io.open_possibly_compressed_file(filename) as f:
                for line in f:
                    if line.startswith('#'): continue
                    parts = line.rstrip('\n').split('\t')
                    name = parts[0].split()[0]
                    tail_lengths[name] = int(parts[3]) - int(parts[2])
                    adaptor_bases[name] = int(parts[6])

        in_file = self.begin_input()
        out_file = self.begin_output()

        assert self.prop_a >= 0.0 and self.prop_a <= 1.0
        a_score = 1 - self.prop_a
        non_a_score = -self.prop_a

        for line in in_file:
            line = line.rstrip()
            if line.startswith('@'):
                print >> out_file, line
                continue

            al = Alignment(line)

            if al.flag & FLAG_UNMAPPED:
                continue

            #ref = references[al.rname]

            reverse = al.flag & FLAG_REVERSE
            if reverse:
                read_bases = rev_comp(al.seq)
                read_qual = al.qual[::-1]
                cigar = cigar_decode(al.cigar)[::-1]
            else:
                read_bases = al.seq
                read_qual = al.qual
                cigar = cigar_decode(al.cigar)

            n_tail = tail_lengths[al.qname]

            #if reverse:
            #    if al.pos-1-n_tail < 0: continue #TODO: handle tail extending beyond end of reference
            #    bases_ref = rev_comp(ref[al.pos-1-n_tail:al.pos-1])
            #else:
            #    if al.pos-1+al.length+n_tail > len(ref): continue #TODO: handle tail extending beyond end of reference
            #    bases_ref = ref[al.pos-1+al.length:al.pos-1+al.length+n_tail] .upper()#upper was missing for a long time. Bug!
            #
            #extension = 0
            #while extension < n_tail and bases_ref[extension] == 'A':
            #    extension += 1

            if reverse:
                feat = annotation.Annotation(al.rname,
                                             start=al.pos - 1 - n_tail,
                                             end=al.pos - 1,
                                             strand=-1)
            else:
                feat = annotation.Annotation(al.rname,
                                             start=al.pos - 1 + al.length,
                                             end=al.pos - 1 + al.length +
                                             n_tail,
                                             strand=1)
            bases_ref = feat.get_seq(references).upper()

            # Allow up to 60% mismatch on As
            # Treat soft clipping as insertion for simplicity
            cigar = cigar.replace("S", "I")
            assert "H" not in cigar, "Can't handle hard clipping"

            extension = 0
            best_score = 0.0
            score = 0.0

            # Soft clipping treated as a mismatch
            i = len(cigar) - 1
            while i >= 0 and cigar[i] in "I":
                score += non_a_score
                i -= 1

            for i in xrange(n_tail):
                if bases_ref[i] == "A":
                    score += a_score
                else:
                    score += non_a_score

                if score >= best_score:
                    extension = i + 1
                    best_score = score
            #print >> sys.stderr, reverse!=0, n_tail, extension, bases_ref

            if n_tail - extension > 0:
                al.extra.append('AN:i:%d' % (n_tail - extension))
                al.extra.append('AG:i:%d' % (extension))
            if adaptor_bases[al.qname]:
                al.extra.append('AD:i:%d' % adaptor_bases[al.qname])
            if n_tail - extension >= self.tail:
                #if reverse:
                #    tail_refpos = al.pos-extension
                #else:
                #    tail_refpos = al.pos+al.length+extension-1
                #al.extra.append('AA:i:%d'%tail_refpos)
                al.extra.append('AA:i:1')

            cigar += 'M' * extension
            read_bases += 'N' * extension  #Since mispriming is so common (and loading the original sequence here would be a pain)
            read_qual += chr(33 + 20) * extension  #Arbitrarily give quality 20
            al.length += extension
            if reverse:
                al.pos -= extension
                al.seq = rev_comp(read_bases)
                al.qual = read_qual[::-1]
                al.cigar = cigar_encode(cigar[::-1])
            else:
                al.seq = read_bases
                al.qual = read_qual
                al.cigar = cigar_encode(cigar)

            print >> out_file, al

        self.end_output(out_file)
        self.end_input(in_file)
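
# An isolated sketch of the tail-extension scoring above: each reference
# base that is an A earns (1 - prop_a) and anything else costs prop_a,
# and the extension is the longest prefix whose running score ties or
# beats the best so far. prop_a = 0.6 gives the "up to 60% mismatch"
# behaviour noted in the comments (the soft-clip penalty applied in
# run() before the scan is omitted here).
def tail_extension(bases_ref, prop_a=0.6):
    a_score = 1.0 - prop_a
    non_a_score = -prop_a
    extension = 0
    best_score = 0.0
    score = 0.0
    for i in xrange(len(bases_ref)):
        score += a_score if bases_ref[i] == 'A' else non_a_score
        if score >= best_score:
            extension = i + 1
            best_score = score
    return extension

print tail_extension('AAGAA')   # the single G is absorbed -> 5
print tail_extension('AAGGG')   # the trailing Gs are excluded -> 2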
Ejemplo n.º 37
0
    def run(self):
        grace.require_shrimp_2()
        grace.require_samtools()
        assert self.references, "No reference sequences given"
        assert self.reads or self.pairs or self.interleaved, "No reads given"
        for pair in self.pairs:
            assert len(pair) == 2, "Two files required in each pair: section"

        io.check_name_uniqueness(self.reads, self.pairs, self.interleaved)

        read_sets = []
        for item in self.reads:
            read_sets.append(([item], False))
        for item in self.pairs:
            read_sets.append((item, True))
        for item in self.interleaved:
            read_sets.append(([item], True))

        # Create working directory

        workspace = self.get_workspace()
        workspace.setup_reference(self.references)
        workspace.update_param(snp_cost=25)
        reference = workspace.get_reference()
        reference_filename = reference.reference_fasta_filename()

        cores = min(self.cores, legion.coordinator().get_cores())

        default_options = {
            "-E": None,
            "-T": None,
            "-N": str(cores),
            "-n": "2",
            "-w": "200%",
            "-p": "opp-in",
            "-I": "0,500",
            "-X": None,
        }

        if self.sam_unaligned:
            default_options["--sam-unaligned"] = None

        if self.half_paired:
            default_options["--half-paired"] = None
        else:
            default_options["--no-half-paired"] = None

        cutoff = "55%"  # Default changed in SHRiMP 2.0.2
        if "-h" in self.shrimp_options:
            cutoff = self.shrimp_options[self.shrimp_options.index("-h") + 1]

        # Run shrimp

        bam_filename = io.abspath(self.output_dir, "alignments.bam")
        bam_prefix = io.abspath(self.output_dir, "alignments")
        bam_sorted_prefix = io.abspath(self.output_dir, "alignments_sorted")

        temp_filename = io.abspath(self.output_dir, "temp.bam")

        log_filename = io.abspath(self.output_dir, "shrimp_log.txt")
        log_file = open(log_filename, "wb")

        sam_eater = sam.Bam_writer(temp_filename)

        sam_header_sent = [False]
        n_seen = [0]

        def eat(f):
            for line in f:
                if line.startswith("@"):
                    if sam_header_sent[0]:
                        continue
                else:
                    n_seen[0] += 1
                    if n_seen[0] % 100000 == 0:
                        grace.status("%s alignments produced" % grace.pretty_number(n_seen[0]))
                sam_eater.write_raw(line)
            sam_header_sent[0] = True

        def remove_pair_options(options):
            for flag in ["-p", "-I"]:
                while flag in options:
                    pos = options.index(flag)
                    options = options[:pos] + options[pos + 2 :]
            for flag in ["--half-paired"]:
                while flag in options:
                    pos = options.index(flag)
                    options = options[:pos] + options[pos + 1 :]
            return options

        for i, (filenames, is_paired) in enumerate(read_sets):
            options = self.shrimp_options[:]

            has_qualities = all(
                len(io.read_sequences(filename, qualities=True).next()) == 3 for filename in filenames  # A little ugly
            )
            if has_qualities:
                options.append("--fastq")

            if len(filenames) == 1:
                reads_parameters = [filenames[0]]
            else:
                reads_parameters = ["-1", filenames[0], "-2", filenames[1]]

            if "--qv-offset" not in self.shrimp_options:
                guesses = []
                for filename in filenames:
                    guesses.append(io.guess_quality_offset(filename))
                assert (
                    len(set(guesses)) == 1
                ), "Conflicting quality offset guesses, please specify --qv-offset manually."
                default_options["--qv-offset"] = str(guesses[0])

            default_options["--read-group"] = "%s,%s" % (
                workspace.name.replace(",", "_"),
                workspace.name.replace(",", "_"),
            )
            for flag in default_options:
                if flag not in options:
                    options.append(flag)
                    if default_options[flag] is not None:
                        options.append(default_options[flag])

            if not is_paired:
                options = remove_pair_options(options)

            grace.status("")

            full_param = reference.shrimp_command(self.cs, options + reads_parameters)

            print >>sys.stderr, "Running", " ".join(full_param)

            with io.pipe_from(full_param, stderr=log_file, cores=cores) as f:
                eat(f)

        log_file.close()

        sam_eater.close()

        grace.status("Sort")

        io.execute(["samtools", "sort", "-n", temp_filename, bam_prefix])

        os.unlink(temp_filename)

        grace.status("")
Ejemplo n.º 38
0
 def run(self):
     for filename in self.filenames:
         for name, seq in io.read_sequences(filename):
             if self.only and name.split()[0] not in self.only: continue
             io.write_fasta(sys.stdout, name, seq) 
Ejemplo n.º 39
0
 def run(self):
     references = { }
     for filename in self.reference_filenames:
         print >> sys.stderr, 'Load', filename
         for name, seq in io.read_sequences(filename):
             references[name] = seq
     
     reads = { }
     for filename in self.reads:
         print >> sys.stderr, 'Load', filename
         for name, seq, qual in io.read_sequences(filename, qualities='required'):
             reads[name] = (seq, qual)
     
     print >> sys.stderr, 'Begin'
     
     in_file = self.begin_input()
     out_file = self.begin_output()
     
     for line in in_file:
         line = line.rstrip()
         if line.startswith('@'):
             print >> out_file, line
             continue
         
         al = Alignment(line)
         
         if al.flag & FLAG_UNMAPPED:
             continue
         
         reverse = al.flag & FLAG_REVERSE
         if reverse:
             read_bases = rev_comp(al.seq)
             read_qual = al.qual[::-1]
             cigar = cigar_decode(al.cigar)[::-1]
         else:
             read_bases = al.seq
             read_qual = al.qual
             cigar = cigar_decode(al.cigar)
         
         al.extra = [ item for item in al.extra
                      if not item.startswith('CQ:Z:') and 
                         not item.startswith('CS:Z:') ] + [
                      'CQ:Z:'+reads[al.qname][1],
                      'CS:Z:'+reads[al.qname][0],
                    ]
         
         ref = references[al.rname]
         
         seq_tail = reads[al.qname][0][ len(al.seq)+1: ]
         qual_tail = reads[al.qname][1][ len(al.seq): ]
         n_tail = len(seq_tail)
         
         if reverse:
             if al.pos-1-n_tail < 0: continue #TODO: handle tail extending beyond end of reference
             bases_ref = rev_comp(ref[al.pos-1-n_tail:al.pos-1+1])    
         else:
             if al.pos-1+al.length+n_tail > len(ref): continue #TODO: handle tail extending beyond end of reference
             bases_ref = ref[al.pos-1+al.length-1:al.pos-1+al.length+n_tail]
                 
         seq_ref = solid_encode( bases_ref )
         
         basic_score = alignment_score(qual_tail, seq_tail, seq_ref, self.quality)
         
         if n_tail:
             tail_score, tail_pos = max(
                 (alignment_score(
                     qual_tail, seq_tail,
                     solid_encode(bases_ref[:1+i] + 'A'*(n_tail-i)), 
                     self.quality)[0],
                  i)
                 for i in xrange(n_tail+1)
             )
             
             baseline = max(0, alignment_score(
                 qual_tail[:tail_pos], seq_tail[:tail_pos], seq_ref[:tail_pos], self.quality)[0])
     
             if tail_score >= baseline + self.tail:
                 #Record position of end of transcript in 'AA' (1-based position)
                 if reverse:
                     tail_refpos = al.pos - tail_pos
                 else:
                     tail_refpos = al.pos+al.length + tail_pos - 1 
                 al.extra.append('AA:i:%d' % tail_refpos)
             
             #Record sequence's poly(A) tail length in AN: from the end of correspondence with the reference sequence to the end of good quality As
             estimated_tail_length = alignment_score(qual_tail,seq_tail,solid_encode(bases_ref[:1+tail_pos]+'A'*(n_tail-tail_pos)),self.quality)[1] - tail_pos
             if estimated_tail_length > 0:
                 al.extra.append('AN:i:%d' % estimated_tail_length)
             
             if tail_pos:    
                 read_bases += solid_decode(read_bases[-1], seq_tail[:tail_pos])
                 read_qual += qual_tail[:tail_pos]
                 cigar += 'M'*tail_pos
                 al.length += tail_pos
                 if reverse:
                     al.pos -= tail_pos
                     al.seq = rev_comp(read_bases)
                     al.qual = read_qual[::-1]
                     al.cigar = cigar_encode(cigar[::-1])
                 else: 
                     al.seq = read_bases
                     al.qual = read_qual
                     al.cigar = cigar_encode(cigar)
         
         print >> out_file, al
     
     self.end_output(out_file)
     self.end_input(in_file)
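
# A schematic sketch of the tail-placement search above: every split
# point i is tried, with the reference up to i followed by a pure-A run,
# and the highest-scoring split wins. A plain match count stands in for
# nesoni's alignment_score(), which also weighs base qualities and works
# in colorspace (hence the bases_ref[:1+i] offset in the real code).
def best_tail_split(seq_tail, bases_ref):
    def score(target):
        return sum(1 for a, b in zip(seq_tail, target) if a == b)
    return max(
        (score(bases_ref[:i] + 'A' * (len(seq_tail) - i)), i)
        for i in xrange(len(seq_tail) + 1))

# The tail matches the reference for 3 bases, then becomes poly(A):
print best_tail_split('TGCAAAA', 'TGCGTTT')   # -> (7, 3)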
Ejemplo n.º 40
0
    def run(self):
        workspace = self.get_workspace()
        
        read_length = 100
        left = rand_seq(read_length-1)
        while True:
            flank = rand_seq(1)
            if flank != self.ref[:1]: break
        left += flank
        
        right = rand_seq(read_length-1)
        while True:
            flank = rand_seq(1)
            if flank != self.ref[-1:]: break
        right = flank+right
        
        i = 0
        
        variants_used = [ ]
        
        with open(workspace/'reads.fq','wb') as f:
            for i, variant in enumerate(self.variants):
                if 'x' in variant:
                    variant, count = variant.split('x')
                    count = int(count)
                else:
                    count = 10
                variants_used.append( (variant,count) )
                seq = left+variant+right
                for j in xrange(count):
                    pos = len(variant)+random.randrange(read_length-len(variant))
                    read = seq[pos:pos+read_length]
                    if random.randrange(2):
                        read = bio.reverse_complement(read)
                    i += 1
                    io.write_fastq(f,'read_%s_%d' % (variant,i),read,chr(64+30)*len(read))

        reference = left+self.ref+right
        primary_variant = left+variants_used[0][0]+right

        with open(workspace/'reference.fa','wb') as f:
            io.write_fasta(f,'chr1',reference)
        
        legion.remake_needed()
        
        self.analysis(
            workspace/'sample',
            workspace/'reference.fa',
            reads = [ workspace/'reads.fq' ],
            ).run()
        
        self.freebayes(
            workspace/'freebayes',
            workspace/'sample',
            ).run()
        
        self.vcf_filter(
            workspace/'filtered',
            workspace/'freebayes.vcf',
            ).run()
        
        Vcf_patch(
            workspace/'patch',
            workspace/('sample','reference'),
            workspace/'filtered.vcf'
            ).run()
        
        patched = io.read_sequences(workspace/('patch','sample.fa')).next()[1]
        
        masked = io.read_sequences(workspace/('sample','consensus_masked.fa')).next()[1].upper()
        
        with open(workspace/'freebayes.vcf','rU') as f:
            reader = vcf.Reader(f)
            raw_count = len(list(reader))
        
        with open(workspace/'filtered.vcf','rU') as f:
            reader = vcf.Reader(f)
            filtered_count = len(list(reader))
        
        with open(workspace/('sample','report.txt'),'rb') as f:
            nesoni_count = len(f.readlines()) - 1

        self.log.log('\n')
        self.log.datum(workspace.name,'changes found by "nesoni consensus:"', nesoni_count)
        self.log.datum(workspace.name,'is correctly patched by "nesoni consensus:"', masked == primary_variant)
        self.log.log('\n')
        self.log.datum(workspace.name,'raw variants', raw_count)
        self.log.datum(workspace.name,'variants after filtering', filtered_count)
        self.log.datum(workspace.name,'is correctly patched by VCF pipeline', patched == primary_variant)
        self.log.log('\n')
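
# A cut-down sketch of the read simulation above: reads of fixed length
# are placed so that each one fully contains the variant, and about half
# are reverse-complemented, as in the loop that writes reads.fq. The
# flanks are assumed to be read_length bases long, as built in run().
import random

def rev_comp(seq):
    comp = {'A': 'T', 'T': 'A', 'G': 'C', 'C': 'G'}
    return ''.join(comp[c] for c in reversed(seq))

def simulate_reads(left, variant, right, count, read_length=100):
    seq = left + variant + right
    reads = []
    for _ in xrange(count):
        pos = len(variant) + random.randrange(read_length - len(variant))
        read = seq[pos:pos + read_length]
        if random.randrange(2):
            read = rev_comp(read)
        reads.append(read)
    return reads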
Ejemplo n.º 41
0
    def run(self):
        log = self.log

        #quality_cutoff, args = grace.get_option_value(args, '--quality', int, 10)
        #qoffset, args = grace.get_option_value(args, '--qoffset', int, None)
        #clip_ambiguous, args = grace.get_option_value(args, '--clip-ambiguous', grace.as_bool, True)
        #length_cutoff, args = grace.get_option_value(args, '--length', int, 24)
        #adaptor_cutoff, args = grace.get_option_value(args, '--match', int, 10)
        #max_error, args = grace.get_option_value(args, '--max-errors', int, 1)
        #adaptor_set, args = grace.get_option_value(args, '--adaptors', str, 'truseq-adapter,truseq-srna,genomic,multiplexing,pe,srna')
        #disallow_homopolymers, args = grace.get_option_value(args, '--homopolymers', grace.as_bool, False)
        #reverse_complement, args = grace.get_option_value(args, '--revcom', grace.as_bool, False)
        #trim_start, args = grace.get_option_value(args, '--trim-start', int, 0)
        #trim_end, args = grace.get_option_value(args, '--trim-end', int, 0)
        #output_fasta, args = grace.get_option_value(args, '--fasta', grace.as_bool, False)
        #use_gzip, args = grace.get_option_value(args, '--gzip', grace.as_bool, True)
        #output_rejects, args = grace.get_option_value(args, '--rejects', grace.as_bool, False)
        #grace.expect_no_further_options(args)

        prefix = self.prefix
        log_name = os.path.split(prefix)[1]

        quality_cutoff = self.quality
        qoffset = self.qoffset
        clip_ambiguous = self.clip_ambiguous
        length_cutoff = self.length
        adaptor_cutoff = self.match
        max_error = self.max_errors
        adaptor_set = self.adaptors
        disallow_homopolymers = self.homopolymers
        reverse_complement = self.revcom
        trim_start = self.trim_start
        trim_end = self.trim_end
        output_fasta = self.fasta
        use_gzip = self.gzip
        output_rejects = self.rejects

        iterators = []
        filenames = []
        any_paired = False

        for filename in self.reads:
            filenames.append(filename)
            iterators.append(
                itertools.izip(io.read_sequences(filename, qualities=True)))

        for pair_filenames in self.pairs:
            assert len(pair_filenames
                       ) == 2, 'Expected a pair of files for "pairs" section.'
            filenames.extend(pair_filenames)
            any_paired = True
            iterators.append(
                itertools.izip(
                    io.read_sequences(pair_filenames[0], qualities=True),
                    io.read_sequences(pair_filenames[1], qualities=True)))

        for filename in self.interleaved:
            filenames.extend(filename)
            any_paired = True
            iterators.append(
                deinterleave(io.read_sequences(filename, qualities=True)))

        fragment_reads = (2 if any_paired else 1)
        read_in_fragment_names = ['read-1', 'read-2'
                                  ] if any_paired else ['read']

        assert iterators, 'Nothing to clip'

        if qoffset is None:
            guesses = [
                io.guess_quality_offset(filename) for filename in filenames
            ]
            assert len(
                set(guesses)
            ) == 1, 'Conflicting quality offset guesses, please specify manually.'
            qoffset = guesses[0]
            log.log('FASTQ offset seems to be %d\n' % qoffset)

        quality_cutoff_char = chr(qoffset + quality_cutoff)

        #log.log('Minimum quality:        %d (%s)\n' % (quality_cutoff, quality_cutoff_char))
        #log.log('Clip ambiguous bases:   %s\n' % (grace.describe_bool(clip_ambiguous)))
        #log.log('Minimum adaptor match:  %d bases, %d errors\n' % (adaptor_cutoff, max_error))
        #log.log('Minimum length:         %d bases\n' % length_cutoff)

        adaptor_seqs = []
        adaptor_names = []
        if adaptor_set and adaptor_set.lower() != 'none':
            for item in adaptor_set.split(','):
                item = item.strip().lower() + ' '
                any = False
                for line in ADAPTORS.strip().split('\n'):
                    if line.startswith('#'): continue
                    if not line.lower().startswith(item): continue
                    any = True
                    name, seq = line.rsplit(None, 1)
                    seq = seq.replace('U', 'T')

                    #if seq in adaptor_seqs: print 'Dup', name
                    adaptor_seqs.append(seq)
                    adaptor_names.append(name)
                    adaptor_seqs.append(bio.reverse_complement(seq))
                    adaptor_names.append(name)
                if not any:
                    raise grace.Error('Unknown adaptor set: ' + item)

        matcher = Matcher(adaptor_seqs, adaptor_names, max_error)

        start_clips = [
            collections.defaultdict(list) for i in xrange(fragment_reads)
        ]
        end_clips = [
            collections.defaultdict(list) for i in xrange(fragment_reads)
        ]

        if output_fasta:
            write_sequence = io.write_fasta_single_line
        else:
            write_sequence = io.write_fastq

        f_single = io.open_possibly_compressed_writer(
            self.reads_output_filenames()[0])
        if fragment_reads == 2:
            f_paired = io.open_possibly_compressed_writer(
                self.interleaved_output_filenames()[0])
        if output_rejects:
            f_reject = io.open_possibly_compressed_writer(
                self.rejects_output_filenames()[0])

        n_single = 0
        n_paired = 0

        n_in_single = 0
        n_in_paired = 0
        total_in_length = [0] * fragment_reads

        n_out = [0] * fragment_reads
        n_q_clipped = [0] * fragment_reads
        n_a_clipped = [0] * fragment_reads
        n_homopolymers = [0] * fragment_reads
        total_out_length = [0] * fragment_reads

        #log.attach(open(prefix + '_log.txt', 'wb'))

        for iterator in iterators:
            for fragment in iterator:
                if (n_in_single + n_in_paired) % 10000 == 0:
                    grace.status(
                        'Clipping fragment %s' %
                        grace.pretty_number(n_in_single + n_in_paired))

                if len(fragment) == 1:
                    n_in_single += 1
                else:
                    n_in_paired += 1

                graduates = []
                rejects = []
                for i, (name, seq, qual) in enumerate(fragment):
                    name = name.split()[0]
                    seq = seq.upper()
                    total_in_length[i] += len(seq)

                    start = trim_start
                    best_start = 0
                    best_len = 0
                    for j in xrange(len(seq) - trim_end):
                        if qual[j] < quality_cutoff_char or \
                           (clip_ambiguous and seq[j] not in 'ACGT'):
                            if best_len < j - start:
                                best_start = start
                                best_len = j - start
                            start = j + 1
                    j = len(seq) - trim_end
                    if best_len < j - start:
                        best_start = start
                        best_len = j - start

                    clipped_seq = seq[best_start:best_start + best_len]
                    clipped_qual = qual[best_start:best_start + best_len]
                    if len(clipped_seq) < length_cutoff:
                        n_q_clipped[i] += 1
                        rejects.append((name, seq, qual, 'quality'))
                        continue

                    match = matcher.match(clipped_seq)
                    if match and match[0] >= adaptor_cutoff:
                        clipped_seq = clipped_seq[match[0]:]
                        clipped_qual = clipped_qual[match[0]:]
                        start_clips[i][match[0]].append(match[1][0])
                        if len(clipped_seq) < length_cutoff:
                            n_a_clipped[i] += 1
                            rejects.append((name, seq, qual, 'adaptor'))
                            continue

                    match = matcher.match(bio.reverse_complement(clipped_seq))
                    if match and match[0] >= adaptor_cutoff:
                        clipped_seq = clipped_seq[:len(clipped_seq) - match[0]]
                        clipped_qual = clipped_qual[:len(clipped_qual) -
                                                    match[0]]
                        end_clips[i][match[0]].append(match[1][0])
                        if len(clipped_seq) < length_cutoff:
                            n_a_clipped[i] += 1
                            rejects.append((name, seq, qual, 'adaptor'))
                            continue

                    if disallow_homopolymers and len(set(clipped_seq)) <= 1:
                        n_homopolymers[i] += 1
                        rejects.append((name, seq, qual, 'homopolymer'))
                        continue

                    graduates.append((name, clipped_seq, clipped_qual))
                    n_out[i] += 1
                    total_out_length[i] += len(clipped_seq)

                if output_rejects:
                    for name, seq, qual, reason in rejects:
                        write_sequence(f_reject, name + ' ' + reason, seq,
                                       qual)

                if graduates:
                    if reverse_complement:
                        graduates = [(name, bio.reverse_complement(seq),
                                      qual[::-1])
                                     for name, seq, qual in graduates]

                    if len(graduates) == 1:
                        this_f = f_single
                        n_single += 1
                    else:
                        assert len(graduates) == 2
                        this_f = f_paired
                        n_paired += 1

                    for name, seq, qual in graduates:
                        write_sequence(this_f, name, seq, qual)

        grace.status('')

        if output_rejects:
            f_reject.close()
        if fragment_reads == 2:
            f_paired.close()
        f_single.close()

        def summarize_clips(name, location, clips):
            total = 0
            for i in clips:
                total += len(clips[i])
            log.datum(log_name, name + ' adaptors clipped at ' + location,
                      total)

            if not clips:
                return

            for i in xrange(min(clips), max(clips) + 1):
                item = clips[i]
                log.quietly_log('%3d bases: %10d ' % (i, len(item)))
                if item:
                    avg_errors = float(sum(item2[0]
                                           for item2 in item)) / len(item)
                    log.quietly_log(' avg errors: %5.2f  ' % avg_errors)

                    counts = collections.defaultdict(int)
                    for item2 in item:
                        counts[item2[1]] += 1
                    #print counts
                    for no in sorted(counts,
                                     key=lambda item2: counts[item2],
                                     reverse=True)[:2]:
                        log.quietly_log('%dx%s ' %
                                        (counts[no], matcher.names[no]))
                    if len(counts) > 2: log.quietly_log('...')

                log.quietly_log('\n')
            log.quietly_log('\n')

        if n_in_paired:
            log.datum(log_name, 'read-pairs', n_in_paired)
        if n_in_single:
            log.datum(log_name, 'single reads', n_in_single)

        for i in xrange(fragment_reads):
            if start_clips:
                summarize_clips(read_in_fragment_names[i], 'start',
                                start_clips[i])

            if end_clips:
                summarize_clips(read_in_fragment_names[i], 'end', end_clips[i])

            prefix = read_in_fragment_names[i]

            log.datum(log_name, prefix + ' too short after quality clip',
                      n_q_clipped[i])
            log.datum(log_name, prefix + ' too short after adaptor clip',
                      n_a_clipped[i])
            if disallow_homopolymers:
                log.datum(log_name, prefix + ' homopolymers',
                          n_homopolymers[i])
            if fragment_reads > 1:
                log.datum(log_name, prefix + ' kept', n_out[i])
            log.datum(log_name, prefix + ' average input length',
                      float(total_in_length[i]) / (n_in_single + n_in_paired))
            if n_out[i]:
                log.datum(log_name, prefix + ' average output length',
                          float(total_out_length[i]) / n_out[i])

        if fragment_reads == 2:
            log.datum(log_name, 'pairs kept after clipping', n_paired)
        log.datum(log_name, 'reads kept after clipping', n_single)
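
# An isolated sketch of the quality clip above: a single scan finds the
# longest run of bases at or above the cutoff (ambiguous bases also end
# a run), keeping the best window seen. The trim_start/trim_end handling
# from run() is omitted for brevity.
def quality_clip(seq, qual, cutoff_char, clip_ambiguous=True):
    start = 0
    best_start = 0
    best_len = 0
    for j in xrange(len(seq)):
        if qual[j] < cutoff_char or \
           (clip_ambiguous and seq[j] not in 'ACGT'):
            if best_len < j - start:
                best_start = start
                best_len = j - start
            start = j + 1
    if best_len < len(seq) - start:
        best_start = start
        best_len = len(seq) - start
    return seq[best_start:best_start + best_len]

# chr(33 + 10) is the phred-10 cutoff character at FASTQ offset 33:
print quality_clip('ACGTNACGTACGT', 'IIII!IIIIIIII', chr(33 + 10))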
Ejemplo n.º 42
0
def main(args):
    title1, args = grace.get_option_value(args, "--title1", str, None)
    title2, args = grace.get_option_value(args, "--title2", str, None)
    grace.expect_no_further_options(args)

    if len(args) != 3:
        print >> sys.stderr, USAGE
        return 1

    working_dir1 = args[0]
    working_dir2 = args[1]
    cutoff = float(args[2])

    sequence_names = [name for name, sequence in io.read_sequences(os.path.join(working_dir1, "reference.fa"))]

    if title1 is None:
        title1 = working_dir1
    if title2 is None:
        title2 = working_dir2

    n = 1
    while significance([("A", n)], [("T", n)], 1.0) > cutoff:
        n += 1

    print "%g\tsignificance cutoff" % cutoff
    print "%d\tdepth required to call substitution (greater if there are errors in the reads)" % n

    print "Sequence\tPosition in reference\tChange type\tReference\t%s\t%s\tp-value (no correction for multiple testing)\t%s\t%s" % (
        title1,
        title2,
        title1,
        title2,
    )

    for sequence_name in sequence_names:
        filename1 = os.path.join(working_dir1, grace.filesystem_friendly_name(sequence_name) + "-evidence.txt")
        filename2 = os.path.join(working_dir2, grace.filesystem_friendly_name(sequence_name) + "-evidence.txt")

        for (pos1, ins1, sub1, ref1, conins1, consub1), (pos2, ins2, sub2, ref2, conins2, consub2) in itertools.izip(
            read_file(filename1), read_file(filename2)
        ):
            assert pos1 == pos2 and ref1 == ref2

            if pos1 % 1000 == 0:
                grace.status("Testing %s %d" % (sequence_name, pos1))

            dec_ins1 = io.decode_evidence(ins1)
            dec_ins2 = io.decode_evidence(ins2)
            if dec_ins1 and dec_ins2:
                sig = significance(dec_ins1, dec_ins2, cutoff)
                if sig is not None and sig <= cutoff:
                    grace.status("")
                    print "%s\t%d\t%s\t\t%s\t%s\t%g\t%s\t%s" % (
                        sequence_name,
                        pos1,
                        "insertion-before",
                        ins1,
                        ins2,
                        sig,
                        conins1,
                        conins2,
                    )

            dec_sub1 = io.decode_evidence(sub1)
            dec_sub2 = io.decode_evidence(sub2)
            if dec_sub1 and dec_sub2:
                sig = significance(dec_sub1, dec_sub2, cutoff)
                if sig is not None and sig <= cutoff:
                    if dec_sub1[0][0] == "-" or dec_sub2[0][0] == "-":
                        what = "deletion"
                    elif dec_sub1[0][0] != dec_sub2[0][0]:
                        what = "substitution"
                    else:
                        what = "different mix"
                    grace.status("")
                    print "%s\t%d\t%s\t%s\t%s\t%s\t%g\t%s\t%s" % (
                        sequence_name,
                        pos1,
                        what,
                        ref1,
                        sub1,
                        sub2,
                        sig,
                        consub1,
                        consub2,
                    )

    grace.status("")
    return 0
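
# A sketch of the depth calculation above, with scipy's Fisher exact
# test standing in for nesoni's significance() (an assumption -- the
# real function works on decoded evidence lists and a cutoff). It asks
# how deep pure-A versus pure-T coverage must be before the difference
# can reach significance at all.
from scipy.stats import fisher_exact

def depth_required(cutoff):
    n = 1
    while fisher_exact([[n, 0], [0, n]])[1] > cutoff:
        n += 1
    return n

print depth_required(0.05)   # 4: 4x"A" vs 4x"T" gives p ~ 0.029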
Ejemplo n.º 43
0
def nway_main(gbk_filename, use_indels, use_reference, give_evidence, give_consequences,
              require_all, require_bisect, full_output, format, working_dirs, split_a, split_b, f=sys.stdout):
    assert working_dirs, 'Need at least one working directory.'
    workspaces = [ working_directory.Working(dirname, must_exist=True) for dirname in working_dirs ]
    reference = workspaces[0].get_reference()
    #if not annotation_filename:
    #    annotation_filename = reference.annotations_filename() #May still be None
    
    if use_reference:
        names = ['reference']
        evidence_start = 1
    else:
        names = [ ]
        evidence_start = 0
        
    names.extend( norm_name(item) for item in  working_dirs )
    
    references = io.read_sequences(reference.reference_fasta_filename())
    
    annotations = { }
    if gbk_filename:
        from Bio import SeqIO
        for record in SeqIO.parse(io.open_possibly_compressed_file(gbk_filename),'genbank'):
            sequence = record.seq.tostring()
            features = [ item for item in record.features if item.type != 'source' ]
            features.sort(key=lambda item: item.location.nofuzzy_start)
            annotations[sequence] = features
    
    iterator = reader(working_dirs, references, use_reference, annotations)
    
    if not use_indels:
        iterator = itertools.ifilter(has_no_indels, iterator)

    if require_all or require_bisect or format == 'counts':
        iterator = itertools.ifilter(fully_unambiguous, iterator)
    
    if require_bisect:
        iterator = itertools.ifilter(is_binary_partition, iterator)

    if not require_bisect:
        if full_output:
            iterator = itertools.ifilter(not_boring_insertion, iterator)
        else:
            iterator = itertools.ifilter(is_interesting, iterator)

    if split_a or split_b:
        assert len(names) == len(set(names)), 'Two samples with the same name'
        try:
            split_a = [ names.index(norm_name(item)) for item in split_a ]
            split_b = [ names.index(norm_name(item)) for item in split_b ]
        except ValueError:
            raise grace.Error('Sample to be split is not amongst samples given')
        iterator = itertools.ifilter(is_split(split_a, split_b), iterator)

    #if limit:
    #    iterator = itertools.islice(iterator, limit)
    
    if format == 'table':
        line = 'Reference\tPosition\tChange type'
        line +=  '\t' + '\t'.join(names)
        if give_evidence:
            line += '\t' + '\t'.join(names[evidence_start:])
        if give_consequences:
            line += '\t' + '\t'.join(names[evidence_start:])
        if annotations:
            line += '\tAnnotations'
        print >> f, line
        for calls in iterator:
            line = '%s\t%d\t%s\t%s' % (
                calls.ref_name, 
                calls.ref_pos+1, 
                change_type(calls), 
                '\t'.join(item.consensus for item in calls.calls))
            if give_evidence:
                line += '\t' + '\t'.join(item.evidence for item in calls.calls[evidence_start:])
            if give_consequences:
                line += '\t' + '\t'.join(item.consequences for item in calls.calls[evidence_start:])
            if annotations:
                line += '\t' + describe_features(calls.features)
            print >> f, line

    elif format == 'compact':
        for line in transpose_strings(names):
            print >> f, line
        print >> f
        
        for calls in iterator:
            if calls.is_insertion:
                footer = '%12d.5 %s' % (calls.ref_pos, calls.ref_name)
            else: 
                footer = '%12d   %s' % (calls.ref_pos+1, calls.ref_name)
            
            t = transpose_strings([ item.consensus for item in calls.calls ], '-', 1)
            top = t[0] + ' ' + footer
            if give_consequences:
                consequences = [ ]
                for call in calls.calls:
                    if call.consequences:
                        for item in call.consequences.split(', '):
                            item = ' '.join(item.split()[:3])
                            if item not in consequences: consequences.append(item)
                        
                if consequences:
                    top += '  ' + ' / '.join(sorted(consequences))
            top += '  ' + describe_features(calls.features)
            print >> f, top
            for line in t[1:]:
                print >> f, line            
    
    elif format == 'nexus':
        buckets = [ [ ] for name in names ]
        for calls in iterator:
            for i, char in enumerate(partition_string(calls)):
                buckets[i].append(char)
        
        print >> f, '#NEXUS'
        print >> f, 'begin taxa;'
        print >> f, 'dimensions ntax=%d;' % len(names)
        print >> f, 'taxlabels'
        for name in names:
            print >> f, name
        print >> f, ';'
        print >> f, 'end;'

        print >> f, 'begin characters;'
        print >> f, 'dimensions nchar=%d;' % len(buckets[0])
        print >> f, 'format datatype=STANDARD symbols="ACGT-0123456789" missing=N;'
        print >> f, 'matrix'
        for name, bucket in itertools.izip(names, buckets):
            print >> f, name, ''.join(bucket)
        print >> f, ';'
        print >> f, 'end;'
    
    elif format == 'counts':
        for line in transpose_strings(names):
            print >> f, line
        print >> f

        counts = { }
        for calls in iterator:
            count_str = partition_string(calls)
            if count_str not in counts:
                counts[count_str] = 1
            else:
                counts[count_str] += 1
        
        for count_str in sorted(counts, key=lambda x: (counts[x], x), reverse=True):
            print >> f, '%s   %d' % (transpose_strings(count_str)[0], counts[count_str])
    
    else:
        raise grace.Error('Unknown output format: ' + format)
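
# A minimal sketch of the NEXUS branch above as a standalone function,
# writing the same taxa and characters blocks for a made-up
# name -> character-list mapping, so the block layout can be checked
# without running the whole caller.
import sys

def write_nexus(f, names, buckets):
    print >> f, '#NEXUS'
    print >> f, 'begin taxa;'
    print >> f, 'dimensions ntax=%d;' % len(names)
    print >> f, 'taxlabels'
    for name in names:
        print >> f, name
    print >> f, ';'
    print >> f, 'end;'
    print >> f, 'begin characters;'
    print >> f, 'dimensions nchar=%d;' % len(buckets[0])
    print >> f, 'format datatype=STANDARD symbols="ACGT-0123456789" missing=N;'
    print >> f, 'matrix'
    for name, bucket in zip(names, buckets):
        print >> f, name, ''.join(bucket)
    print >> f, ';'
    print >> f, 'end;'

write_nexus(sys.stdout, ['sampleA', 'sampleB'], [list('ACGT'), list('ACGA')])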
Ejemplo n.º 44
0
def recombination(args):
    grace.expect_no_further_options(args)
    if len(args) != 2:
        print >> sys.stderr, USAGE
        raise grace.Help_shown()

    working_dir, seq_name = args

    references = dict(io.read_sequences(os.path.join(working_dir, 'reference.fa')))
    
    depth = { }
    prefixes = { }
    suffixes = { }
    for name in references:
        depth[name] = numpy.zeros(len(references[name]), 'int64')
        prefixes[name] = [ [] for base in references[name] ]
        suffixes[name] = [ [] for base in references[name] ]
    def register_divergence(hit):
        if not hit.query_forward:
            hit = hit.reversed()
        
        margin = 20
        
        if hit.target_end - hit.target_start < 20: 
            return False
        
        depth[hit.target_name][hit.target_start : hit.target_end] += 1

        any = False
        
        if hit.query_end <= len(hit.query_seq)-margin: # and hit.target_end < len(hit.target_seq):
            suffixes[hit.target_name][hit.target_end-1].append( hit.query_seq[hit.query_end:] )
            any = True
        
        if hit.query_start >= margin: # and hit.target_start > 0:
            prefixes[hit.target_name][hit.target_start].append( hit.query_seq[:hit.query_start] )
            any = True
        
        return any

    n = 0
    for (read_name, read_seq), hits in shrimp.iter_read_hits(working_dir):
        # Skip reads containing Ns
        if 'N' in read_seq: continue
    
        for line in hits:
            register_divergence(alignment_from_shrimp(line, references, read_name, read_seq))
        
        n += 1
        #if n > 100000:
        #    break
            
        if n%10000 == 0:
            grace.status('Processing read %s' % grace.pretty_number(n))

    grace.status('')

    
    def show_items(items):
        original_length = len(items)
        cut = 0
        while len(items) > 80:
            cut += 1
            items = [ item for item in items if item[0] >= cut ]
        for item in items:
            print item[1]
        if len(items) < original_length:
            print '(and %d more occurring %d times or less)' % (original_length-len(items), cut-1) 
    
    def score(items):
        if not items: return 1.0
        return float(sum( item[0] * item[0] for item in items )) / (sum( item[0] for item in items )**2)
    
    def summarize_prefixes(seqs, pad):
        seqs = sorted(seqs, key=lambda seq: seq[::-1])
        
        cut = 100
        while True:
            items = [ ] 
            for (seq, iterator) in itertools.groupby(seqs, key = lambda x: x[-cut:]):
                ss = list(iterator)
                anylong = any( item != seq for item in ss )            
                n = len(ss)
                items.append( (n, ('%'+str(pad)+'s')%(('...' if anylong else '') + seq) + ' x %d' % n) )
            
            if score(items) >= 1.0/20: break
            cut -= 1
        
        show_items(items)
        
    def summarize_suffixes(seqs, pad):
        seqs = sorted(seqs)
        
        cut = 100
        while True:
            items = [ ] 
            for (seq, iterator) in itertools.groupby(seqs, key = lambda x: x[:cut]):
                ss = list(iterator)            
                anylong = any( item != seq for item in ss )            
                n = len(ss)
                items.append( (n, ('%'+str(pad)+'s')%('%d x '%n) + seq + ('...' if anylong else '')) )
            
            if score(items) >= 1.0/20: break
            cut -= 1
        show_items(items)
    
    print 'Position        Depth        Changed prefixes             Changed suffixes'
    print '                          Count    % of depth       Count    % of depth'
    for i in xrange(len(references[seq_name])):
        print '%8d   %10d %9d  %11s       %9d  %11s' % (
            i+1,
            depth[seq_name][i],
            len(prefixes[seq_name][i]),
            '%.3f%%' % (len(prefixes[seq_name][i])*100.0/depth[seq_name][i]) if prefixes[seq_name][i] else '',  
            len(suffixes[seq_name][i]),
            '%.3f%%' % (len(suffixes[seq_name][i])*100.0/depth[seq_name][i]) if suffixes[seq_name][i] else '')
        #summarize_suffixes(suffixes[name][i], references[name][i+1:], references[name], suffix_depth[name][i])

    print
    print 'Details'
    print
    for i in xrange(len(references[seq_name])):
        print '%-80s*' % ('Base %d' % (i+1))
        
        print pad_slice(references[seq_name], i-80,i+1+80)        
        summarize_prefixes(prefixes[seq_name][i], 80)
        summarize_suffixes(suffixes[seq_name][i], 81)
        print
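
    # pad_slice() is used above but defined elsewhere in the original module;
    # a minimal sketch of the behaviour assumed here: slice a string, padding
    # with spaces wherever the requested range runs off either end, so that
    # base i always lands in the same printed column.
    def pad_slice(seq, start, end):
        return (' ' * max(0, -start) +
                seq[max(0, start):min(len(seq), end)] +
                ' ' * max(0, end - len(seq)))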
Ejemplo n.º 45
0
    def run(self):
        workspace = self.get_workspace()

        read_length = 100
        left = rand_seq(read_length - 1)
        while True:
            flank = rand_seq(1)
            if flank != self.ref[:1]: break
        left += flank

        right = rand_seq(read_length - 1)
        while True:
            flank = rand_seq(1)
            if flank != self.ref[-1:]: break
        right = flank + right

        i = 0

        variants_used = []

        with open(workspace / 'reads.fq', 'wb') as f:
            for variant in self.variants:
                if 'x' in variant:
                    variant, count = variant.split('x')
                    count = int(count)
                else:
                    count = 10
                variants_used.append((variant, count))
                seq = left + variant + right
                for j in xrange(count):
                    pos = len(variant) + random.randrange(read_length -
                                                          len(variant))
                    read = seq[pos:pos + read_length]
                    if random.randrange(2):
                        read = bio.reverse_complement(read)
                    i += 1
                    io.write_fastq(f, 'read_%s_%d' % (variant, i), read,
                                   chr(64 + 30) * len(read))

        reference = left + self.ref + right
        primary_variant = left + variants_used[0][0] + right

        with open(workspace / 'reference.fa', 'wb') as f:
            io.write_fasta(f, 'chr1', reference)

        legion.remake_needed()

        self.analysis(
            workspace / 'sample',
            workspace / 'reference.fa',
            reads=[workspace / 'reads.fq'],
        ).run()

        self.freebayes(
            workspace / 'freebayes',
            workspace / 'sample',
        ).run()

        self.vcf_filter(
            workspace / 'filtered',
            workspace / 'freebayes.vcf',
        ).run()

        Vcf_patch(workspace / 'patch', workspace / ('sample', 'reference'),
                  workspace / 'filtered.vcf').run()

        patched = io.read_sequences(workspace /
                                    ('patch', 'sample.fa')).next()[1]

        masked = io.read_sequences(
            workspace / ('sample', 'consensus_masked.fa')).next()[1].upper()

        with open(workspace / 'freebayes.vcf', 'rU') as f:
            reader = vcf.Reader(f)
            raw_count = len(list(reader))

        with open(workspace / 'filtered.vcf', 'rU') as f:
            reader = vcf.Reader(f)
            filtered_count = len(list(reader))

        with open(workspace / ('sample', 'report.txt'), 'rb') as f:
            nesoni_count = len(f.readlines()) - 1

        self.log.log('\n')
        self.log.datum(workspace.name, 'changes found by "nesoni consensus:"',
                       nesoni_count)
        self.log.datum(workspace.name,
                       'is correctly patched by "nesoni consensus:"',
                       masked == primary_variant)
        self.log.log('\n')
        self.log.datum(workspace.name, 'raw variants', raw_count)
        self.log.datum(workspace.name, 'variants after filtering',
                       filtered_count)
        self.log.datum(workspace.name, 'is correctly patched by VCF pipeline',
                       patched == primary_variant)
        self.log.log('\n')
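
# rand_seq() is used above but defined elsewhere in the original test module;
# a minimal sketch of the assumed behaviour, a uniformly random DNA sequence
# of length n:
import random

def rand_seq(n):
    return ''.join(random.choice('ACGT') for j in xrange(n))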
Ejemplo n.º 46
0
    def run(self):
        #mincov, args = grace.get_option_value(args, '--mincov', int, 1) 
        #maxdiff, args = grace.get_option_value(args, '--maxdiff', int, 16) 
        #minsize, args = grace.get_option_value(args, '--minsize', int, 200)
        #what, args = grace.get_option_value(args, '--what', as_core_or_unique, 'core')    
        #is_core = (what == 'core') 
        #
        #grace.expect_no_further_options(args)
        #
        #if len(args) < 2:
        #    print >> sys.stderr, HELP
        #    raise grace.Help_shown()
        #
        #output_dir, working_dirs = args[0], args[1:]
        #
        ##assert not path.exists(path.join(output_dir, 'reference.fa')), \
        #assert not path.exists(path.join(output_dir, 'parameters')), \
        #        'Output directory not given'
        #
        #if not path.exists(output_dir):
        #    os.mkdir(output_dir)

        assert self.what in ('core','unique'), 'Expected --what to be either "core" or "unique".'
        is_core = (self.what == 'core') 
        
        workspace = self.get_workspace()
        
        reference = working_directory.Working(self.working_dirs[0]).get_reference()
        for name, seq in io.read_sequences(reference.reference_fasta_filename()):
            self.log.log(name + '\n')
            friendly_name = grace.filesystem_friendly_name(name)
            
            good = [ True ] * len(seq)
            
            for working_dir in self.working_dirs:
                if is_core:
                    suffix = '-depth.userplot'
                else:
                    suffix = '-ambiguous-depth.userplot'
                data = trivia.read_unstranded_userplot(
                    os.path.join(working_dir, friendly_name+suffix)
                )
                assert len(seq) == len(data)
                for i in xrange(len(seq)):
                    if good[i]:
                        if is_core:
                            good[i] = data[i] >= self.mincov
                        else:
                            good[i] = data[i] < self.mincov
    
            #Close holes
            start = -self.maxdiff-1
            n_holes = 0
            for i in xrange(len(seq)):
                if good[i]:
                    if 0 < i-start <= self.maxdiff:
                        for j in xrange(start,i): good[j] = True
                        n_holes += 1
                    start = i+1
            self.log.log('Closed '+grace.pretty_number(n_holes)+' holes\n')

            f = open( workspace/('%s-%s.fa' % (friendly_name,self.what)), 'wb')
            io.write_fasta(f, name,
                ''.join([ (seq[i] if good[i] else 'N')
                          for i in xrange(len(seq)) ])
            )
            f.close()
    
            f = open( workspace/('%s-%s_masked.fa' % (friendly_name,self.what)), 'wb')
            io.write_fasta(f, name,
                ''.join([ (seq[i] if good[i] else seq[i].lower())
                          for i in xrange(len(seq)) ])
            )
            f.close()
    
            f_good = open( workspace/('%s-%s_parts.fa' % (friendly_name,self.what)), 'wb')
            f_nongood = open( workspace/('%s-non%s_parts.fa' % (friendly_name,self.what)), 'wb')
            start = 0
            n_good = [0]
            n_good_bases = [0]    
            def emit(i):
                if i-start < self.minsize: return
                if good[start]:
                    n_good[0] += 1
                    n_good_bases[0] += i-start
                io.write_fasta(
                    f_good if good[start] else f_nongood,
                    '%s:%d..%d' % (name, start+1,i),
                    seq[start:i]
                )
            for i in xrange(1,len(seq)):
                if good[i] != good[start]:
                    emit(i)
                    start = i
            emit(len(seq))
            f_nongood.close()
            f_good.close()
            
            self.log.log(grace.pretty_number(sum(good))+' bases are '+self.what+', of '+grace.pretty_number(len(seq))+' in reference sequence\n')
            self.log.log(grace.pretty_number(n_good[0])+' parts at least '+grace.pretty_number(self.minsize)+' bases long with '+grace.pretty_number(n_good_bases[0])+' total bases\n')
            self.log.log('\n')
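
# A small standalone demo (not from the original) of the hole-closing step
# above: any run of at most maxdiff "bad" positions flanked by "good"
# positions is flipped to good, so short dips in depth do not fragment the
# core/unique regions.
def close_holes(good, maxdiff):
    good = list(good)
    start = -maxdiff - 1
    for i in xrange(len(good)):
        if good[i]:
            if 0 < i - start <= maxdiff:
                for j in xrange(start, i):
                    good[j] = True
            start = i + 1
    return good

# e.g. with maxdiff=2, [T,T,F,F,T,F,F,F,T] becomes [T,T,T,T,T,F,F,F,T]:
# the two-position hole is closed, the three-position hole is kept.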
Ejemplo n.º 47
0
    def run(self):
        #assert not self.utr_only or self.utrs, '--utrs-only yes but no --utrs given'
        
        # Reference genome
        
        #chromosome_lengths = reference_directory.Reference(self.reference, must_exist=True).get_lengths()
        chromosomes = collections.OrderedDict(io.read_sequences(self.reference))

        def get_interpeak_seq(peaks):
            start = min(item.transcription_stop for item in peaks)
            end = max(item.transcription_stop for item in peaks)
            if end-start > self.max_seq: return ''
            if peaks[0].strand >= 0:
                return chromosomes[peaks[0].seqid][start:end]
            else:
                return bio.reverse_complement(chromosomes[peaks[0].seqid][start:end])

        def get_prepeak_seq(gene,peaks):
            if gene.strand >= 0:
                start = gene.utr_pos
                end = min(item.transcription_stop for item in peaks)
                if end-start > self.max_seq: return ''
                return chromosomes[gene.seqid][start:end]
            else:
                start = max(item.transcription_stop for item in peaks)
                end = gene.utr_pos
                if end-start > self.max_seq: return ''
                return bio.reverse_complement(chromosomes[gene.seqid][start:end])
        
        # Normalization files
        
        if self.norm_file:
            norm_file = self.norm_file
        else:
            nesoni.Norm_from_counts(self.prefix+'-norm', self.counts).run()
            norm_file = self.prefix+'-norm.csv'

        norms = io.read_grouped_table(norm_file, [('All',str)])['All']
        pair_norm_names = [ ]
        pair_norms = [ ]
        for i in xrange(len(norms)):
            pair_norm_names.append(norms.keys()[i]+'-peak1')
            pair_norms.append(norms.values()[i])
        for i in xrange(len(norms)):
            pair_norm_names.append(norms.keys()[i]+'-peak2')
            pair_norms.append(norms.values()[i])
        io.write_grouped_csv(
            self.prefix+'-pairs-norm.csv',
            [('All',io.named_list_type(pair_norm_names)(pair_norms))],
            comments=['#Normalization'],
            )


        # Read data
        
        annotations = list(annotation.read_annotations(self.parents))
        if self.utrs:
            utrs = list(annotation.read_annotations(self.utrs))
        else:
            utrs = [ ]
        children = list(annotation.read_annotations(self.children))
        
        count_table = io.read_grouped_table(self.counts, [
            ('Count',int),
            ('Tail_count',int),
            ('Tail',_float_or_none),
            ('Proportion',_float_or_none),
            ('Annotation',str)
            ])
        counts = count_table['Count']
        tail_counts = count_table['Tail_count']
        proportions = count_table['Proportion']
        tails = count_table['Tail']
        
        samples = counts.value_type().keys()
        sample_tags = { }
        for line in count_table.comments:
            if line.startswith('#sampleTags='):
                parts = line[len('#sampleTags='):].split(',')
                assert parts[0] not in sample_tags
                sample_tags[parts[0]] = parts
        
        for item in children:
            item.weight = sum( counts[item.get_id()][name] * float(norms[name]['Normalizing.multiplier']) for name in samples )
        
        parents = [ ]
        id_to_parent = { }
        for item in annotations:
            if item.type != self.parent_type: continue
            assert item.get_id() not in id_to_parent, 'Duplicate id in parent file: '+item.get_id()
            parents.append(item)
            id_to_parent[item.get_id()] = item
            item.children = [ ]
            #item.cds = [ ]
            
            # Default utr
            if item.strand >= 0:
                item.utr_pos = item.end
            else:
                item.utr_pos = item.start

            if 'three_prime_UTR_start' in item.attr:
                if item.strand >= 0:
                    item.utr_pos = int(item.attr['three_prime_UTR_start'])-1
                else:
                    item.utr_pos = int(item.attr['three_prime_UTR_start'])
        
        for item in utrs:
            assert item.attr['Parent'] in id_to_parent, 'Unknown gene '+item.attr['Parent']    
            id_to_parent[item.attr['Parent']].utr_pos = (item.start if item.strand >= 0 else item.end)


        for item in children:
            item.transcription_stop = item.end if item.strand >= 0 else item.start #End of transcription, 0-based, i.e. a between-positions coordinate
            
            if 'Parent' in item.attr:
                for item_parent in item.attr['Parent'].split(','):
                    parent = id_to_parent[item_parent]
                    parent.children.append(item)
                    

        for item in parents:
            item.children.sort(key=_annotation_sorter)
            
            relevant = list(item.children)
            if self.utr_only:
                #if item.strand <= 0:
                #    relative_utr_start = item.end - int(item.attr['three_prime_UTR_start'])
                #else:
                #    relative_utr_start = int(item.attr['three_prime_UTR_start'])-1 - item.start
                #
                #def relative_start(peak):
                #    return item.end-peak.end if item.strand < 0 else peak.start-item.start
                #relevant = [ peak for peak in relevant if relative_start(peak) >= relative_utr_start ]
                relevant = [ 
                    peak for peak in relevant 
                    if (peak.end >= item.utr_pos if item.strand >= 0 else peak.start <= item.utr_pos)
                    ]
                    
            if self.top:
                relevant.sort(key=lambda peak:peak.weight, reverse=True)
                relevant = relevant[:self.top]
            relevant.sort(key=_annotation_sorter)
            item.relevant_children = relevant
        
        
        
        # JSON output
        
        j_data = { }
        j_genes = j_data['genes'] = { }
        
        j_genes['__comment__'] = 'start is 0-based'
        j_genes['name'] = [ ]
        j_genes['chromosome'] = [ ]
        j_genes['strand'] = [ ]
        j_genes['start'] = [ ]
        j_genes['utr'] = [ ]
        j_genes['end'] = [ ]
        j_genes['gene'] = [ ]
        j_genes['product'] = [ ]
        j_genes['peaks'] = [ ]
        j_genes['relevant_peaks'] = [ ]
        #j_genes['cds'] = [ ]
        #j_genes['cds_start'] = [ ]
        #j_genes['cds_end'] = [ ]
        for item in parents:
            j_genes['name'].append( item.get_id() )
            j_genes['chromosome'].append( item.seqid )
            j_genes['strand'].append( item.strand )
            j_genes['start'].append( item.start )
            j_genes['utr'].append( item.utr_pos )
            j_genes['end'].append( item.end )
            j_genes['gene'].append( item.attr.get('Name',item.attr.get('gene','')) )
            j_genes['product'].append( item.attr.get('Product',item.attr.get('product','')) )
            j_genes['peaks'].append( [ item2.get_id() for item2 in item.children ] )
            j_genes['relevant_peaks'].append( [ item2.get_id() for item2 in item.relevant_children ] )
            #j_genes['cds'].append( item.cds )
            #j_genes['cds_start'].append( item.cds_start )
            #j_genes['cds_end'].append( item.cds_end )
        
        j_peaks = j_data['peaks'] = { }
        j_peaks['__comment__'] = 'start is 0-based'
        j_peaks['name'] = [ ]
        j_peaks['chromosome'] = [ ]
        j_peaks['strand'] = [ ]
        j_peaks['start'] = [ ]
        j_peaks['end'] = [ ]
        j_peaks['parents'] = [ ]
        j_peaks['counts'] = [ ]
        j_peaks['tail_lengths'] = [ ]
        j_peaks['proportion_tailed'] = [ ]
        for item in children:
            j_peaks['name'].append( item.get_id() )
            j_peaks['chromosome'].append( item.seqid )
            j_peaks['strand'].append( item.strand )
            j_peaks['start'].append( item.start )
            j_peaks['end'].append( item.end )
            j_peaks['parents'].append( item.attr['Parent'].split(',') if 'Parent' in item.attr else [ ])
            j_peaks['counts'].append( counts[item.get_id()].values() )
            j_peaks['tail_lengths'].append( count_table['Tail'][item.get_id()].values() )
            j_peaks['proportion_tailed'].append( count_table['Proportion'][item.get_id()].values() )
        
        j_samples = j_data['samples'] = { }
        j_samples['name'] = [ ]
        j_samples['tags'] = [ ]
        j_samples['normalizing_multiplier'] = [ ]
        for name in samples:
            j_samples['name'].append(name)
            j_samples['tags'].append(sample_tags[name])
            j_samples['normalizing_multiplier'].append(float(norms[name]['Normalizing.multiplier']))
        
        j_chromosomes = j_data['chromosomes'] = { }
        j_chromosomes['name'] = [ ]
        j_chromosomes['length'] = [ ]
        for name, seq in chromosomes.iteritems():
            j_chromosomes['name'].append(name)
            j_chromosomes['length'].append(len(seq))        
        
        with open(self.prefix + '.json','wb') as f:
            json.dump(j_data, f)
        
        
        # Output paired peak file
        
        output_comments = [ '#Counts' ]
        output_samples = [ ]
        for item in samples:
            output_samples.append(item+'-peak1')
            output_comments.append('#sampleTags=' + ','.join([item+'-peak1','peak1']+sample_tags.get(item,[])))
        for item in samples:
            output_samples.append(item+'-peak2')
            output_comments.append('#sampleTags=' + ','.join([item+'-peak2','peak2']+sample_tags.get(item,[])))
        
        output_names = [ ]
        output_counts = [ ]
        output_tail_counts = [ ]
        output_proportions = [ ]
        output_tails = [ ]
        output_annotation_fields = [ 'gene', 'product', 'mean_tail_1', 'mean_tail_2', 'chromosome', 'strand', 
                                     'transcription_stops' ] #, 'interpeak_seq', ]
        output_annotations = [ ]
            
        for item in parents:
            peaks = item.relevant_children
            for i in xrange(len(peaks)-1):
                for j in xrange(i+1, len(peaks)):
                    id_i = peaks[i].get_id()
                    id_j = peaks[j].get_id()
                    id_pair = item.get_id() + '-'+id_i+'-'+id_j
                    output_names.append(id_pair)
                    
                    row = [ ]
                    row.extend(counts[id_i].values())
                    row.extend(counts[id_j].values())
                    output_counts.append(filter(_text,row))
                    
                    row = [ ]
                    row.extend(tail_counts[id_i].values())
                    row.extend(tail_counts[id_j].values())
                    output_tail_counts.append(filter(_text,row))

                    row = [ ]
                    row.extend(proportions[id_i].values())
                    row.extend(proportions[id_j].values())
                    output_proportions.append(filter(_text,row))

                    row = [ ]
                    row.extend(tails[id_i].values())
                    row.extend(tails[id_j].values())
                    output_tails.append(filter(_text,row))
                    
                    output_annotations.append([
                        item.attr.get('Name',item.attr.get('gene','')),
                        item.attr.get('Product',item.attr.get('product','')),
                        count_table['Annotation'][id_i]['mean-tail'],
                        count_table['Annotation'][id_j]['mean-tail'],
                        
                        item.seqid,
                        str(item.strand),
                        '%d, %d' % (peaks[i].transcription_stop,peaks[j].transcription_stop),
                        #get_interpeak_seq([peaks[i],peaks[j]]),
                        ])
        
        #output_count_table = io.named_matrix_type(output_names,output_samples)(output_counts)
        io.write_grouped_csv(
            self.prefix + '-pairs.csv',
            [ 
                ('Count',io.named_matrix_type(output_names,output_samples)(output_counts)),
                ('Tail_count',io.named_matrix_type(output_names,output_samples)(output_tail_counts)),
                ('Proportion',io.named_matrix_type(output_names,output_samples)(output_proportions)),
                ('Tail',io.named_matrix_type(output_names,output_samples)(output_tails)),
                ('Annotation',io.named_matrix_type(output_names,output_annotation_fields)(output_annotations)),
                ],
            comments=output_comments,
            )
                        
#        # Chi Sq tests
#        
#        #for id in relation:
#        #    peaks = relation[id]
#        #    if len(peaks) < 2: continue     
#        
#        mats = [ ]   
#        genes = [ ]
#        products = [ ]
#        mean_tails = [ ]
#        prop_tails = [ ]
#        
#        peak_names = [ ]
#        chromosome_names = [ ]
#        strands = [ ]
#        transcription_stops = [ ]
#        interpeak_seqs = [ ]
#        prepeak_seqs = [ ]
#        
#        for parent in parents:
#            id = parent.get_id()
#            peaks = parent.relevant_children
#            if len(peaks) < 2: continue
#            
#            matrix = [ ]
#            for item in peaks:
#                matrix.append(counts[item.get_id()].values())
#            
#            mats.append(
#                runr.R_literal(id) + ' = ' + 
#                runr.R_literal(matrix)
#                )
#            
#            genes.append(parent.attr.get('Name',parent.attr.get('gene','')))
#            products.append(parent.attr.get('Product',parent.attr.get('product','')))
#            
#            def format_mean(s):
#                if s == 'NA': return 'NA'
#                return '%.1f' % float(s)
#            mean_tails.append(', '.join( format_mean(count_table['Annotation'][item.get_id()]['mean-tail']) for item in peaks ))
#            
#            def format_prop(s):
#                if s == 'NA': return 'NA'
#                return '%.2f' % float(s)
#            prop_tails.append(', '.join( format_prop(count_table['Annotation'][item.get_id()]['proportion-with-tail']) for item in peaks ))
#            
#            peak_names.append(', '.join(item.get_id() for item in peaks))
#            chromosome_names.append(parent.seqid)
#            strands.append(parent.strand)
#            transcription_stops.append(', '.join(str(item.transcription_stop) for item in peaks))
#            interpeak_seqs.append(get_interpeak_seq(peaks))
#            prepeak_seqs.append(get_prepeak_seq(parent,peaks))
#            
#            #if len(mats) >= 10: break
#        
#        text = 'cat("Loading data into R+\n")\n'
#        text += 'data <- list(\n' + ',\n'.join(mats) + ')\n'        
#        text += CHISQ
#        
#        runr.run_script(text,
#            OUTPUT_FILENAME=self.prefix+'.csv',
#            GENES = genes,
#            PRODUCTS = products,
#            MEAN_TAILS = mean_tails,
#            PROP_TAILS = prop_tails,
#            PEAK_NAMES = peak_names,
#            CHROMOSOME_NAMES = chromosome_names,
#            STRANDS = strands,
#            TRANSCRIPTION_STOPS = transcription_stops,
#            INTERPEAK_SEQS = interpeak_seqs,
#            PREPEAK_SEQS = prepeak_seqs,
#            )
#        
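
# _float_or_none, _text and _annotation_sorter are used above but defined
# elsewhere in the original module; minimal sketches of plausible
# definitions (assumptions, not the original code):
def _float_or_none(text):
    # Parse a float, treating 'NA'/empty count-table fields as missing.
    if text in ('', 'NA'): return None
    return float(text)

def _text(value):
    # Render a value for CSV output; missing values become empty strings.
    if value is None: return ''
    return str(value)

def _annotation_sorter(item):
    # Order features along the direction of transcription.
    return item.start if item.strand >= 0 else -item.end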
Ejemplo n.º 48
0
    def run(self):
        """
        
        <sequence> <poly-A> <adaptor> <anything>
        
        """
        clip_quality = chr(33+self.clip_quality)
        ignore_quality = chr(33+self.ignore_quality)
        
        with io.open_possibly_compressed_writer(self.prefix+'.fastq.gz') as out_file, \
             io.open_possibly_compressed_writer(self.prefix+'.clips.gz') as out_clips_file:
            print >> out_clips_file, '#Read\tread length\tpoly-A start\tpoly-A end\tpoly-A start, ignoring adaptor\tpoly-A end, ignoring adaptor\tadaptor bases matched'
             
            n = 0
            n_discarded = 0
            n_clipped = 0
            total_before = 0
            total_clipped = 0

            for filename in self.filenames:
                for name, seq, qual in io.read_sequences(filename, qualities='required'):
                    # "Good quality" sequence ends at the first low quality base
                    #good_quality_end = 0
                    #while good_quality_end < len(seq) and qual[good_quality_end] >= clip_quality:
                    #    good_quality_end += 1
                    
                    goodness_score = 0
                    best_goodness_score = 0
                    good_quality_end = 0
                    i = 0
                    while True:
                        if goodness_score > best_goodness_score:
                            best_goodness_score = goodness_score
                            good_quality_end = i
                        
                        if i >= len(seq):
                            break
                            
                        if qual[i] >= clip_quality:
                            goodness_score += 1
                        else:
                            goodness_score -= 9
                        i += 1
                            
                    
                    best_score = 0
                    best_a_start = good_quality_end
                    best_a_end = good_quality_end
                    best_adaptor_bases = 0
                    best_aonly_score = 0
                    best_aonly_start = good_quality_end
                    best_aonly_end = good_quality_end
                    
                    # Consider each possible start position for the poly(A)
                    for a_start in xrange(good_quality_end):
                        if a_start and seq[a_start-1] == 'A': continue
                        
                        # Consider each possible end position for the poly(A)
                        a_end = a_start
                        aonly_score = 0
                        while True:
                            if aonly_score > best_aonly_score:
                                best_aonly_score = aonly_score
                                best_aonly_start = a_start
                                best_aonly_end = a_end
                            
                            
                            # The poly(A) should be followed by adaptor,
                            # at least until the end of good quality sequence.
                            # However if there is evidence of the adaptor beyond
                            # the end of good quality, we still want to know that,
                            # and count it towards the number of adaptor bases present.
                            score = aonly_score
                            adaptor_bases = 0
                            i = a_end
                            while True:
                                if (score > best_score and 
                                    (i >= good_quality_end or i >= a_end+len(self.adaptor))):
                                    best_score = score
                                    best_a_start = a_start
                                    best_a_end = a_end
                                    best_adaptor_bases = adaptor_bases
                            
                                if i >= a_end+len(self.adaptor) or i >= len(seq):
                                    break
                            
                                if qual[i] >= ignore_quality:
                                    if seq[i] == self.adaptor[i-a_end]:
                                        score += 1
                                        adaptor_bases += 1
                                    else:
                                        score -= 4
                                i += 1
                                
                            #if a_end >= len(seq): break
                            
                            # poly(A) tail only within good quality region.
                            if a_end >= good_quality_end: break
                            
                            
                            if qual[a_end] >= ignore_quality:
                                if seq[a_end] == 'A':
                                    aonly_score += 1
                                else:
                                    aonly_score -= 4
                                    if aonly_score <= 0: break
                            a_end += 1
                        
                    a_start = best_a_start
                    a_end = best_a_end
                    adaptor_bases = best_adaptor_bases
                    aonly_start = best_aonly_start
                    aonly_end = best_aonly_end                    
                        
                    if self.debug: # and a_end == a_start and a_end < len(seq)-10:        
                        print name
                        print ''.join( 
                            'I' if item<ignore_quality 
                            else ('C' if item<clip_quality else ' ') 
                            for item in qual )
                        print '-' * good_quality_end
                        print seq
                        print ' '*a_start + 'A'*(a_end-a_start) + self.adaptor + ".%d %d"%(adaptor_bases,best_score)
                        #print ' '*aonly_start + 'A'*(aonly_end-aonly_start) + "."
                        print 
                        sys.stdout.flush()

                    n += 1
                    total_before += len(seq)

                    # 0 - sequence name
                    # 1 - sequence length
                    # 2 - poly(A) start
                    # 3 - poly(A) end
                    # (4 - best run of As start, for debugging the need to detect adaptor seq)
                    # (5 - best run of As end)
                    # 6 - number of adaptor bases matched
                    print >> out_clips_file, '%s\t%d\t%d\t%d\t%d\t%d\t%d' % (name, len(seq) , a_start, a_end, aonly_start, aonly_end,  adaptor_bases)
                    
                    if a_start > self.length:
                        if a_start < len(seq):
                            n_clipped += 1
                            total_clipped += a_start
                    
                        print >> out_file, '@'+name
                        print >> out_file, seq[:a_start]
                        print >> out_file, '+'
                        print >> out_file, qual[:a_start]
                    else:
                        n_discarded += 1
                    
                    if n%10000 == 0: 
                        grace.status('Clip-runs ' + self.sample + ' ' + grace.pretty_number(n)) # + ' (' + grace.pretty_number(len(dstates)) + ' dstates)')
        
        grace.status('')
        
        self.log.datum(self.sample,'reads',n)
        if n:
            self.log.datum(self.sample,'mean length before poly-A/adaptor clipping',float(total_before)/n)
        self.log.datum(self.sample,'reads discarded as too short after poly-A/adaptor clipping',n_discarded)
        self.log.datum(self.sample,'reads poly-A/adaptor clipped and kept',n_clipped)
        if n_clipped:
            self.log.datum(self.sample,'mean length clipped',float(total_clipped)/n_clipped)
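
# A standalone illustration (not from the original) of the "good quality
# end" scan above: each base scores +1 if at or above the clip quality and
# -9 otherwise, and the read is notionally clipped wherever the running
# score peaks, so one low-quality base is only retained if at least nine
# good bases follow it.
def good_quality_end(qual, clip_quality):
    goodness = best = end = 0
    for i in xrange(len(qual)):
        if qual[i] >= clip_quality:
            goodness += 1
        else:
            goodness -= 9
        if goodness > best:
            best = goodness
            end = i + 1
    return end

# e.g. good_quality_end('IIII#' + 'I'*10, '5') == 15  (bad base absorbed)
#      good_quality_end('IIII#' + 'I'*9,  '5') == 4   (clipped before it)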
Ejemplo n.º 49
0
    def run(self):
        references = {}
        for filename in self.reference_filenames:
            print >> sys.stderr, 'Load', filename
            for name, seq in io.read_sequences(filename):
                references[name] = seq

        reads = {}
        for filename in self.reads:
            print >> sys.stderr, 'Load', filename
            for name, seq, qual in io.read_sequences(filename,
                                                     qualities='required'):
                reads[name] = (seq, qual)

        print >> sys.stderr, 'Begin'

        in_file = self.begin_input()
        out_file = self.begin_output()

        for line in in_file:
            line = line.rstrip()
            if line.startswith('@'):
                print >> out_file, line
                continue

            al = Alignment(line)

            if al.flag & FLAG_UNMAPPED:
                continue

            reverse = al.flag & FLAG_REVERSE
            if reverse:
                read_bases = rev_comp(al.seq)
                read_qual = al.qual[::-1]
                cigar = cigar_decode(al.cigar)[::-1]
            else:
                read_bases = al.seq
                read_qual = al.qual
                cigar = cigar_decode(al.cigar)

            al.extra = [
                item for item in al.extra if not item.startswith('CQ:Z:')
                and not item.startswith('CS:Z:')
            ] + [
                'CQ:Z:' + reads[al.qname][1],
                'CS:Z:' + reads[al.qname][0],
            ]

            ref = references[al.rname]

            seq_tail = reads[al.qname][0][len(al.seq) + 1:]
            qual_tail = reads[al.qname][1][len(al.seq):]
            n_tail = len(seq_tail)

            if reverse:
                if al.pos - 1 - n_tail < 0:
                    continue  #TODO: handle tail extending beyond end of reference
                bases_ref = rev_comp(ref[al.pos - 1 - n_tail:al.pos - 1 + 1])
            else:
                if al.pos - 1 + al.length + n_tail > len(ref):
                    continue  #TODO: handle tail extending beyond end of reference
                bases_ref = ref[al.pos - 1 + al.length - 1:al.pos - 1 +
                                al.length + n_tail]

            seq_ref = solid_encode(bases_ref)

            basic_score = alignment_score(qual_tail, seq_tail, seq_ref,
                                          self.quality)

            if n_tail:
                tail_score, tail_pos = max(
                    (alignment_score(
                        qual_tail, seq_tail,
                        solid_encode(bases_ref[:1 + i] + 'A' * (n_tail - i)),
                        self.quality)[0], i)
                    for i in xrange(n_tail + 1))

                baseline = max(
                    0,
                    alignment_score(qual_tail[:tail_pos], seq_tail[:tail_pos],
                                    seq_ref[:tail_pos], self.quality)[0])

                if tail_score >= baseline + self.tail:
                    #Record position of end of transcript in 'AA' (1-based position)
                    if reverse:
                        tail_refpos = al.pos - tail_pos
                    else:
                        tail_refpos = al.pos + al.length + tail_pos - 1
                    al.extra.append('AA:i:%d' % tail_refpos)

                #Record sequence's poly(A) tail length in AN: from the end of correspondence with the reference sequence to the end of good quality As
                estimated_tail_length = alignment_score(
                    qual_tail, seq_tail,
                    solid_encode(bases_ref[:1 + tail_pos] + 'A' *
                                 (n_tail - tail_pos)),
                    self.quality)[1] - tail_pos
                if estimated_tail_length > 0:
                    al.extra.append('AN:i:%d' % estimated_tail_length)

                if tail_pos:
                    read_bases += solid_decode(read_bases[-1],
                                               seq_tail[:tail_pos])
                    read_qual += qual_tail[:tail_pos]
                    cigar += 'M' * tail_pos
                    al.length += tail_pos
                    if reverse:
                        al.pos -= tail_pos
                        al.seq = rev_comp(read_bases)
                        al.qual = read_qual[::-1]
                        al.cigar = cigar_encode(cigar[::-1])
                    else:
                        al.seq = read_bases
                        al.qual = read_qual
                        al.cigar = cigar_encode(cigar)

            print >> out_file, al

        self.end_output(out_file)
        self.end_input(in_file)
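
# solid_encode() and solid_decode() are used above but defined elsewhere; a
# minimal sketch of the assumed SOLiD colour-space convention, in which the
# colour of each adjacent base pair is the XOR of the base indices
# (A=0, C=1, G=2, T=3):
_BASES = 'ACGT'

def solid_encode(bases):
    # One colour digit per adjacent pair of bases.
    return ''.join(str(_BASES.index(a) ^ _BASES.index(b))
                   for a, b in zip(bases[:-1], bases[1:]))

def solid_decode(previous_base, colors):
    # Walk forward from a known base, decoding one base per colour.
    result = ''
    for c in colors:
        previous_base = _BASES[_BASES.index(previous_base) ^ int(c)]
        result += previous_base
    return result

# solid_encode('ACGT') == '131'; solid_decode('A', '131') == 'CGT'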
Ejemplo n.º 50
0
# python2.6 modify_sequence.py data/velvet_test_reference.fa >data/velvet_test_reference_modified.fa

import sys, random

from nesoni import io

for name, seq in io.read_sequences(sys.argv[1]):
    j = 0
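    # Every 100 bases, cycle through three mutation types:
    #   j % 3 == 0: substitute one base,
    #   j % 3 == 1: delete 1-9 bases,
    #   j % 3 == 2: insert 1-9 random bases.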
    for i in xrange(0, len(seq) - 100, 100):
        original = seq[i]
        if j % 3 == 0:
            while True:
                new = random.choice('ACGT')
                if new != original: break
            seq = seq[:i] + new + seq[i + 1:]
        elif j % 3 == 1:
            n = (j // 3) % 9 + 1
            seq = seq[:i] + seq[i + n:]
        else:
            n = (j // 3) % 9 + 1
            seq = seq[:i] + ''.join(random.choice('ACGT')
                                    for k in xrange(n)) + seq[i:]
        j += 1
    io.write_fasta(sys.stdout, name, seq)
Ejemplo n.º 51
0
def pastiche(args):
    if len(args) < 4:
        print USAGE
        return 1

    mask_only, args = grace.get_option_value(args, '--mask', grace.as_bool,
                                             False)
    min_leftover, args = grace.get_option_value(args, '--min-leftover', int,
                                                20)

    output_dir, args = args[0], args[1:]

    #, ref_filename, contig_filenames = args[0], args[1], args[2:]

    ref_filenames = []
    contig_filenames = []
    grace.execute(args,
                  {'contigs': lambda args: contig_filenames.extend(args)},
                  lambda args: ref_filenames.extend(args))

    assert ref_filenames, 'No reference sequences given'
    assert contig_filenames, 'No contig sequences given'

    contigs = dict([(name.split()[0], seq) for filename in contig_filenames
                    for name, seq in io.read_sequences(filename)])
    dir_contigs = {}
    for name in contigs:
        dir_contigs[name + '+'] = contigs[name]
        dir_contigs[name + '-'] = bio.reverse_complement(contigs[name])

    dir_contigs_used = {}
    for name in dir_contigs:
        dir_contigs_used[name] = [False] * len(dir_contigs[name])

    workspace = io.Workspace(output_dir)
    temp_prefix = workspace._object_filename('temp-pastiche')

    out_f = workspace.open('pastiche.fa', 'wb')

    for ref_filename in ref_filenames:
        for ref_name, ref_seq in io.read_sequences(ref_filename):
            ref_name = ref_name.split()[0]

            grace.status(ref_name)

            f = open(temp_prefix + '.fa', 'wb')
            io.write_fasta(f, 'ref', ref_seq)
            f.close()

            scores = [-1] * (len(ref_seq) * 2)
            strings = ['N', ''] * (len(ref_seq))
            contexts = [None for i in xrange(len(ref_seq) * 2)]

            #MAXSCORE = len(ref_seq)+1
            #for i in xrange(len(ref_seq)):
            #    if ref_seq[i].upper() != 'N':
            #        strings[i*2] = ref_seq[i]
            #        scores[i*2] = MAXSCORE
            #for i in xrange(len(ref_seq)-1):
            #    if ref_seq[i].upper() != 'N' and ref_seq[i+1].upper() != 'N':
            #        scores[i*2+1] = MAXSCORE

            if mask_only:
                for i in xrange(len(ref_seq)):
                    strings[i * 2] = ref_seq[i].lower()

            def put(position, dir_contig_name, start, end, score):
                if scores[position] < score:
                    scores[position] = score
                    strings[position] = dir_contigs[dir_contig_name][start:end]
                    contexts[position] = (dir_contig_name, start, end, score)

            for contig_filename in contig_filenames:
                execute([
                    'nucmer',
                    '--prefix',
                    temp_prefix,
                    #'--maxmatch', #Very slow
                    '--nosimplify',
                    '--minmatch',
                    '9',
                    '--mincluster',
                    '50',
                    #'--maxgap', '1000',
                    #'--breaklen', '1000', # Increasing this reduces Ns, but is slow
                    #'--diagfactor', '1.0',
                    temp_prefix + '.fa',
                    contig_filename
                ])

                for contig_name, contig_seq in io.read_sequences(
                        contig_filename):
                    contig_name = contig_name.split()[0]
                    grace.status(ref_name + ' vs ' + contig_name)
                    p = run([
                        'show-aligns', temp_prefix + '.delta', 'ref',
                        contig_name
                    ],
                            stderr=subprocess.PIPE)

                    alignments = []

                    while True:
                        line = p.stdout.readline()
                        if not line: break
                        if not line.startswith('-- BEGIN'):
                            continue

                        parts = line.split()

                        ref_start = int(parts[5])
                        ref_end = int(parts[7])
                        query_start = int(parts[10])
                        query_end = int(parts[12])

                        #assert ref_start < ref_end
                        #ref_start -= 1 #Zero based coordinates

                        al_ref = []
                        al_query = []

                        while True:
                            block = []
                            end = False
                            while True:
                                line = p.stdout.readline()
                                if line.startswith('--   END'):
                                    end = True
                                    break
                                if line == '\n':
                                    if block:
                                        break
                                    else:
                                        continue
                                block.append(line)

                            if end: break

                            al_ref.append(block[0].split()[1])
                            al_query.append(block[1].split()[1])

                        al_ref = ''.join(al_ref)
                        al_query = ''.join(al_query)

                        if ref_start > ref_end:
                            al_ref = bio.reverse_complement(al_ref)
                            al_query = bio.reverse_complement(al_query)
                            ref_start, ref_end = ref_end, ref_start
                            query_start, query_end = query_end, query_start

                        if query_start > query_end:
                            dir_contig_name = contig_name + '-'
                            query_start = len(contig_seq) + 1 - query_start
                            query_end = len(contig_seq) + 1 - query_end
                        else:
                            dir_contig_name = contig_name + '+'

                        ref_start -= 1  #Zero based coordinates
                        query_start -= 1

                        #print al_ref
                        #print al_query

                        #Pretty dumb scoring scheme
                        al_score = 0
                        for i in xrange(len(al_ref)):
                            if al_ref[i] == al_query[i]:
                                al_score += 1
                            #else:
                            #    al_score -= 1

                        #Pastiche alignment over reference
                        ref_pos = ref_start
                        query_pos = query_start
                        al_pos = 0
                        while al_pos < len(al_ref):
                            assert al_ref[al_pos] != '.'
                            if al_query[al_pos] == '.':
                                put(ref_pos * 2, dir_contig_name, query_pos,
                                    query_pos, al_score)
                            else:
                                assert al_query[al_pos].lower() == dir_contigs[
                                    dir_contig_name][query_pos].lower()
                                put(ref_pos * 2, dir_contig_name, query_pos,
                                    query_pos + 1, al_score)
                                query_pos += 1
                            al_pos += 1

                            al_pos_end = al_pos
                            query_pos_end = query_pos
                            while al_pos_end < len(
                                    al_ref) and al_ref[al_pos_end] == '.':
                                al_pos_end += 1
                                query_pos_end += 1
                            #put(ref_pos*2+1, al_query[al_pos:al_pos_end], al_score)
                            assert al_query[al_pos:al_pos_end].lower(
                            ) == dir_contigs[dir_contig_name][
                                query_pos:query_pos_end].lower()
                            put(ref_pos * 2 + 1, dir_contig_name, query_pos,
                                query_pos_end, al_score)
                            al_pos = al_pos_end
                            query_pos = query_pos_end
                            ref_pos += 1

                    p.wait()

            grace.status(ref_name)

            result = ''.join(strings)
            io.write_fasta(out_f, ref_name, result)

            for context in contexts:
                if context is None: continue
                name, start, end, score = context
                for i in xrange(start, end):
                    dir_contigs_used[name][i] = True

            #Interpolation
            #result = [ ]
            #i = 0
            #while i < len(ref_seq):
            #    if strings[i*2].upper() != 'N':
            #        result.append(strings[i*2])
            #        result.append(strings[i*2+1])
            #        i += 1
            #        continue
            #
            #    j = i
            #    while strings[j*2].upper() == 'N':
            #        j += 1
            #
            #    grace.status('')
            #    print >> sys.stderr, 'interpolating', i+1,'..',j
            #
            #    window = 20 #!!!!!!!!!!!
            #    left_contexts = collections.defaultdict(lambda:0)
            #    for i1 in xrange(max(0,i-window),i):
            #        for context_name, context_start, context_end, context_score in contexts[i1*2]:
            #            key = (context_name, context_end + i - i1)
            #            left_contexts[key] = max(left_contexts[key],context_score)
            #
            #    right_contexts = collections.defaultdict(lambda:0)
            #    for j1 in xrange(j,min(j+window,len(ref_seq))):
            #        for context_name, context_start, context_end, context_score in contexts[j1*2]:
            #            key = (context_name, context_start + j - j1)
            #            right_contexts[key] = max(left_contexts[key],context_score)
            #
            #    #print >> sys.stderr, left_contexts
            #    #print >> sys.stderr, right_contexts
            #
            #    options = [ ]
            #
            #    for (left_name, left_pos), left_score in left_contexts.items():
            #        for (right_name, right_pos), right_score in right_contexts.items():
            #            if left_name != right_name: continue
            #            if right_pos < left_pos: continue
            #
            #            if right_pos-left_pos > (j-i) * 4.0 + 10: continue   #!!!!!!!!!!!!!!!!!!!!!!1
            #            if right_pos-left_pos < (j-i) * 0.25 - 10: continue
            #
            #            score = float(min(right_pos-left_pos,j-i))/max(right_pos-left_pos,j-i)
            #            score *= left_score + right_score
            #            #print >> sys.stderr, left_name, right_pos-left_pos, j-i, score
            #            options.append( (score, left_name, left_pos, right_pos) )
            #
            #    if options:
            #        best = max(options, key=lambda option: option[0])
            #        print >> sys.stderr, '->', best
            #        result.append( dir_contigs[best[1]][best[2]:best[3]].lower() )
            #    else:
            #        print >> sys.stderr, '-> no good interpolation'
            #        result.append( ref_seq[i:j] )
            #
            #    i = j
            #
            #result = ''.join(result)
            #io.write_fasta(sys.stdout, ref_name, result)

            #print >> sys.stderr, len(result), result.count('N')
            #for pos, size in N_runs:
            #    out_size = len(''.join( strings[pos*2:pos*2+2] ))
            #    print >> sys.stderr, pos, size, '->', out_size

    out_f.close()

    grace.status('')

    #for name, seq in io.read_sequences(ref_filename):
    #    result = pastiche(seq, contigs_filename)
    #    io.write_fasta(sys.stdout, name, result)

    leftover_f = workspace.open('leftovers.fa', 'wb')

    for name in sorted(contigs):
        used = [
            (a or b)
            for a, b in zip(dir_contigs_used[name +
                                             '+'], dir_contigs_used[name +
                                                                    '-'][::-1])
        ]

        i = 0
        while i < len(used):
            j = i
            while j < len(used) and not used[j]:
                j += 1
            if j - i > min_leftover:
                if i == 0 and j == len(used):
                    out_name = name
                else:
                    out_name = name + ':%d..%d' % (i + 1, j)
                io.write_fasta(leftover_f, out_name, contigs[name][i:j])

            i = j + 1

    leftover_f.close()

    for suffix in ['.fa', '.delta']:
        os.unlink(temp_prefix + suffix)
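
# A sketch (not from the original) of the data layout used above: scores,
# strings and contexts each have 2*len(ref_seq) slots, where slot 2*i holds
# the best-scoring replacement for reference base i and slot 2*i+1 holds any
# insertion between bases i and i+1. put() keeps whichever aligned contig
# piece scored highest for each slot, and ''.join(strings) flattens the
# pastiche:
#
#     slot:   0       1        2       3        4       ...
#     holds:  base 0  ins 0-1  base 1  ins 1-2  base 2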
Ejemplo n.º 52
0
    def run(self):
        assert self.reads or self.pairs or self.interleaved, 'No reads given'
    
        io.check_name_uniqueness(self.reads, self.pairs, self.interleaved)
        
        working = self.get_workspace()
        working.setup_reference(self.references, bowtie=True)
        working.update_param(snp_cost=2.0)        
        reference = working.get_reference()
        
        log_file = open(self.log_filename(),'wb')
              
        with workspace.tempspace(dir=working.working_dir) as temp:
            n = [ 0 ]
            def tempname():
                n[0] += 1
                return temp/('%d.fq'%n[0])
            def convert(filename):
                info = io.get_file_info(filename)
                ok = selection.matches('type-fastq:[compression-none/compression-gzip/compression-bzip2]', info)
                if ok:
                    return filename            
                result_name = tempname()
                with open(result_name,'wb') as f:
                    for name, seq, qual in io.read_sequences(filename, qualities='required'):
                        io.write_fastq(f, name, seq, qual)
                return result_name
            
            ones = [ ]
            twos = [ ]
            singles = [ ]
            
            for pair in self.pairs:
                assert len(pair) == 2, 'Need two files in each "pair:" section.'
                ones.append(convert(pair[0]))
                twos.append(convert(pair[1]))
            
            for item in self.interleaved:
                left_name = tempname()
                right_name = tempname()
                ones.append(left_name)
                twos.append(right_name)
                with open(left_name,'wb') as left, \
                     open(right_name,'wb') as right:
                    reader = io.read_sequences(item, qualities='required')
                    while True:
                        try:
                            name, seq, qual = reader.next()
                        except StopIteration:
                            break
                        io.write_fastq(left, name,seq,qual)
                        
                        try:
                            name, seq, qual = reader.next()
                        except StopIteration:
                            raise grace.Error('Interleaved file contains odd number of sequences')
                        io.write_fastq(right, name,seq,qual)
            
            for item in self.reads:
                singles.append(convert(item))

            cores = min(self.cores, legion.coordinator().get_cores())

            command = (
                [ 'bowtie2', 
                    '--threads', str(cores),
                    '--rg-id', '1',
                    '--rg', 'SM:'+working.name,                    
                    ] + 
                self.bowtie_options + 
                [ '-x', reference.get_bowtie_index_prefix() ]
                )
            commands = [ ]
            if ones:
                commands.append(command + [ '-1', ','.join(ones), '-2', ','.join(twos) ])
            if singles:
                commands.append(command + [ '-U', ','.join(singles) ])
            
            temp_bam_name = temp/'temp.bam'

            with io.pipe_to(
                     ['samtools', 'view', '-S', '-b', '-'],
                     stdout=open(temp_bam_name,'wb'),
                     stderr=log_file
                     ) as f:
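                # Merge several bowtie2 invocations into one BAM: let the SAM
                # header through only from the first command, then forward
                # just the alignment records from the later ones.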
                header_sent = False
                for command in commands:
                    self.log.log('Running:\n' + ' '.join(command) + '\n')            
                    with io.pipe_from(
                        command,
                        stderr=log_file,
                        cores=cores
                        ) as f_out:
                        for line in f_out:
                            if not header_sent or not line.startswith('@'):
                                f.write(line)
                    header_sent = True

            #io.execute([
            #    'samtools', 'sort', '-n', temp_bam_name, working/'alignments'
            #    ])
            
            sam.sort_bam(temp_bam_name, working/'alignments', by_name=True, cores=self.cores)
            
        log_file.close()
Ejemplo n.º 53
0
def main(args):
    grace.require_shrimp_1()

    n_cpus = grace.how_many_cpus()

    solid, args = grace.get_flag(args, '--solid')
    verbose, args = grace.get_flag(args, '--verbose')

    threshold, args = grace.get_option_value(args, '--threshold', str, '68%')

    stride, args = grace.get_option_value(args, '--stride', int, 1)
    max_shrimps, args = grace.get_option_value(args, '--cpus', int, n_cpus)
    batch_size, args = grace.get_option_value(args, '--batch-size', int,
                                              5000000)

    input_reference_filenames = []
    reads_filenames = []

    shrimp_options = ['-h', threshold]
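    # A threshold ending in '%' is taken as a fraction of the maximum possible
    # score and stored negated, so later code can tell it apart from an
    # absolute score threshold (presumably matching SHRiMP's -h convention).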
    if threshold.endswith('%'):
        threshold = -float(threshold[:-1]) / 100.0
    else:
        threshold = int(threshold)

    output_dir = []  #As list so can write to from function. Gah.

    def front_command(args):
        grace.expect_no_further_options(args)

        if len(args) < 1:
            return

        output_dir.append(args[0])
        input_reference_filenames.extend(
            [os.path.abspath(filename) for filename in args[1:]])

    def reads_command(args):
        grace.expect_no_further_options(args)
        reads_filenames.extend([[os.path.abspath(filename)]
                                for filename in args])

    def pairs_command(args):
        grace.expect_no_further_options(args)
        assert len(args) == 2, 'Expected exactly two files in "pairs"'
        reads_filenames.append(
            [os.path.abspath(filename) for filename in args])

    def shrimp_options_command(args):
        shrimp_options.extend(args)

    grace.execute(
        args, {
            'reads': reads_command,
            '--reads': reads_command,
            'pairs': pairs_command,
            'shrimp-options': shrimp_options_command,
            '--shrimp-options': shrimp_options_command,
        }, front_command)

    if not output_dir:
        print >> sys.stderr, USAGE % n_cpus
        return 1

    output_dir = output_dir[0]

    assert input_reference_filenames, 'No reference files given'
    assert reads_filenames, 'No read files given'

    for filename in itertools.chain(input_reference_filenames,
                                    *reads_filenames):
        assert os.path.exists(filename), '%s does not exist' % filename

    if not os.path.isdir(output_dir):
        os.mkdir(output_dir)

    if solid:
        shrimp = 'rmapper-cs'
    else:
        shrimp = 'rmapper-ls'

    reference_filename = os.path.join(output_dir, 'reference.fa')
    reference_file = open(reference_filename, 'wb')
    total_reference_sequences = 0
    total_reference_bases = 0
    for input_reference_filename in input_reference_filenames:
        for name, sequence in io.read_sequences(input_reference_filename):
            # Drop any description after the first word of the name
            name = name.split()[0]
            io.write_fasta(reference_file, name, sequence)

            total_reference_sequences += 1
            total_reference_bases += len(sequence)

    reference_file.close()

    print '%s base%s in %s reference sequence%s' % (
        grace.pretty_number(total_reference_bases),
        's' if total_reference_bases != 1 else '',
        grace.pretty_number(total_reference_sequences),
        's' if total_reference_sequences != 1 else '')

    assert total_reference_bases, 'Reference sequence file is empty'

    config = {
        'references': input_reference_filenames,
        'reads': reads_filenames,
        'stride': stride,
        'solid': solid,
        'threshold': threshold,
    }
    config_file = open(os.path.join(output_dir, 'config.txt'), 'wb')
    pprint.pprint(config, config_file)
    config_file.close()

    output_filename = os.path.join(output_dir, 'shrimp_hits.txt.gz')
    output_file = gzip.open(output_filename, 'wb')

    unmapped_filename = os.path.join(output_dir, 'unmapped.fa.gz')
    unmapped_file = gzip.open(unmapped_filename, 'wb')

    dirty_filenames = set()
    dirty_filenames.add(output_filename)
    dirty_filenames.add(unmapped_filename)


    try:  # Temporary files are cleaned up in the finally clause below

        N = [0]

        def do_shrimp(read_set):
            my_number = N[0]
            N[0] += 1

            tempname = os.path.join(output_dir,
                                    'temp%d-%d.fa' % (os.getpid(), my_number))
            tempname_out = os.path.join(
                output_dir, 'temp%d-%d.txt' % (os.getpid(), my_number))

            dirty_filenames.add(tempname)
            dirty_filenames.add(tempname_out)

            f = open(tempname, 'wb')
            for read_name, read_seq in read_set:
                print >> f, '>' + read_name
                print >> f, read_seq
            f.close()

            command = shrimp + ' ' + ' '.join(shrimp_options) + ' ' + \
                      tempname + ' ' + reference_filename + ' >' + tempname_out
            if not verbose:
                command += ' 2>/dev/null'
            child_pid = os.spawnl(os.P_NOWAIT, '/bin/sh', '/bin/sh', '-c',
                                  command)

            def finalize():
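                # Deferred completion: called later, when this SHRiMP's
                # slot is needed, to wait for the child and collect output.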
                exit_status = os.waitpid(child_pid, 0)[1]
                assert exit_status == 0, 'SHRiMP indicated an error'

                hits = {}  # read_name -> [ hit line ]

                f = open(tempname_out, 'rb')
                for line in f:
                    if line.startswith('>'):
                        read_name = line.split(None, 1)[0][1:]
                        if read_name not in hits:
                            hits[read_name] = []
                        hits[read_name].append(line)
                f.close()

                for read_name, read_seq in read_set:
                    if read_name in hits:
                        for hit in hits[read_name]:
                            output_file.write(hit)
                    else:
                        print >> unmapped_file, '>' + read_name
                        print >> unmapped_file, read_seq

                output_file.flush()
                unmapped_file.flush()

                os.unlink(tempname)
                dirty_filenames.remove(tempname)
                os.unlink(tempname_out)
                dirty_filenames.remove(tempname_out)

            return finalize

        shrimps = []

        reader = iter_reads(config)
        read_count = 0

        while True:
            read_set = []
            read_set_bases = 0

            # Read names must not include trailing comment text:
            # SHRiMP passes it through unchanged, which can break
            # identification of read pairs.

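            # Accumulate reads until the batch reaches batch_size bases.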
            for read_name, read_seq in reader:
                read_name = read_name.split()[0]
                read_set.append((read_name, read_seq))
                read_set_bases += len(read_seq)

                read_count += 1
                if read_set_bases >= batch_size: break

            if not read_set: break

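            # Bound the number of concurrent SHRiMP processes: wait for
            # the oldest to finish before launching another.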
            if len(shrimps) >= max_shrimps:
                shrimps.pop(0)()
            shrimps.append(do_shrimp(read_set))

            grace.status('SHRiMPing %s' % grace.pretty_number(read_count))

        while shrimps:
            grace.status('Waiting for SHRiMPs to finish %d ' % len(shrimps))
            shrimps.pop(0)()

        grace.status('')

        output_file.close()
        dirty_filenames.remove(output_filename)
        unmapped_file.close()
        dirty_filenames.remove(unmapped_filename)

        return 0

    finally:
        for filename in dirty_filenames:
            if os.path.exists(filename):
                os.unlink(filename)
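do_shrimp above starts a SHRiMP child and returns a finalize callable that waits for it and collects its output; the main loop keeps at most max_shrimps of these in flight, always retiring the oldest first. The same bounded-pool pattern as a standalone sketch (run_bounded is a hypothetical name, not part of nesoni):

def run_bounded(starts, max_running):
    # starts: iterable of zero-argument callables; each launches a child
    # process and returns a finalize() callable that waits for it.
    running = []
    for start in starts:
        if len(running) >= max_running:
            running.pop(0)()    # block on the oldest job first
        running.append(start())
    while running:
        running.pop(0)()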
Example #54
def main(args):
    mincov, args = grace.get_option_value(args, '--mincov', int, 1) 
    maxdiff, args = grace.get_option_value(args, '--maxdiff', int, 16) 
    minsize, args = grace.get_option_value(args, '--minsize', int, 200)
    what, args = grace.get_option_value(args, '--what', as_core_or_unique, 'core')    
    is_core = (what == 'core') 

    grace.expect_no_further_options(args)
    
    if len(args) < 2:
        print >> sys.stderr, HELP
        raise grace.Help_shown()
    
    output_dir, working_dirs = args[0], args[1:]
    
    assert not path.exists(path.join(output_dir, 'reference.fa')), \
        'Output directory appears to be a working directory; give a fresh output directory first'
    
    if not path.exists(output_dir):
        os.mkdir(output_dir)
    
    for name, seq in io.read_sequences(path.join(working_dirs[0],'reference.fa')):
        print name
        friendly_name = grace.filesystem_friendly_name(name)
        
        good = [ True ] * len(seq)
        
        for working_dir in working_dirs:
            if is_core:
                suffix = '-depth.userplot'
            else:
                suffix = '-ambiguous-depth.userplot'
            data = trivia.read_unstranded_userplot(
                os.path.join(working_dir, friendly_name+suffix)
            )
            assert len(seq) == len(data)
            for i in xrange(len(seq)):
                if good[i]:
                    if is_core:
                        good[i] = data[i] >= mincov
                    else:
                        good[i] = data[i] < mincov

        # Close holes: flip runs of at most maxdiff bad bases that are
        # flanked by good bases.
        start = -maxdiff-1
        n_holes = 0
        for i in xrange(len(seq)):
            if good[i]:
                if 0 < i-start <= maxdiff:
                    for j in xrange(start,i): good[j] = True
                    n_holes += 1
                start = i+1
        print 'Closed', grace.pretty_number(n_holes), 'holes'
        
        
        f = open(path.join(output_dir, '%s-%s.fa' % (friendly_name,what)), 'wb')
        io.write_fasta(f, name,
            ''.join([ (seq[i] if good[i] else 'N')
                      for i in xrange(len(seq)) ])
        )
        f.close()

        f = open(path.join(output_dir, '%s-%s_masked.fa' % (friendly_name,what)), 'wb')
        io.write_fasta(f, name,
            ''.join([ (seq[i] if good[i] else seq[i].lower())
                      for i in xrange(len(seq)) ])
        )
        f.close()

        f_good = open(path.join(output_dir, '%s-%s_parts.fa' % (friendly_name,what)), 'wb')
        f_nongood = open(path.join(output_dir, '%s-non%s_parts.fa' % (friendly_name,what)), 'wb')
        start = 0
        n_good = [0]
        n_good_bases = [0]    
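        # Write the run [start, i) to the good or non-good file,
        # skipping runs shorter than minsize.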
        def emit(i):
            if i-start < minsize: return
            if good[start]:
                n_good[0] += 1
                n_good_bases[0] += i-start
            io.write_fasta(
                f_good if good[start] else f_nongood,
                '%s:%d..%d' % (name, start+1,i),
                seq[start:i]
            )
        for i in xrange(1,len(seq)):
            if good[i] != good[start]:
                emit(i)
                start = i
        emit(len(seq))
        f_nongood.close()
        f_good.close()
        
        print grace.pretty_number(sum(good)), 'bases are '+what+', of', grace.pretty_number(len(seq)), 'in reference sequence'
        print grace.pretty_number(n_good[0]), 'parts at least', grace.pretty_number(minsize), 'bases long with', grace.pretty_number(n_good_bases[0]), 'total bases'

        print
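The hole-closing pass above can be factored into a standalone function over the boolean coverage mask. A minimal sketch mirroring the loop in this example (close_holes is a hypothetical name, not part of nesoni):

def close_holes(good, maxdiff):
    # Flip runs of at most maxdiff consecutive False values to True when
    # they are flanked by True values on both sides.
    good = list(good)
    start = -maxdiff-1    # sentinel: never close a hole at the very start
    n_holes = 0
    for i in xrange(len(good)):
        if good[i]:
            if 0 < i-start <= maxdiff:
                for j in xrange(start, i):
                    good[j] = True
                n_holes += 1
            start = i+1
    return good, n_holes

For example, close_holes([True, False, False, True], 2) returns ([True, True, True, True], 1), while with maxdiff=1 the two-base hole is left open.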
Example #55
    def run(self):
        """
        
        <sequence> <poly-A> <adaptor> <anything>
        
        """
        clip_quality = chr(33+self.clip_quality)
        
        with io.open_possibly_compressed_writer(self.prefix+'.fastq.gz') as out_file, \
             io.open_possibly_compressed_writer(self.prefix+'.clips.gz') as out_clips_file:
            print >> out_clips_file, '#Read\tread length\tpoly-A start\tpoly-A end\tpoly-A start, ignoring adaptor\tpoly-A end, ignoring adaptor\tadaptor bases matched'
             
            n = 0
            n_discarded = 0
            n_clipped = 0
            total_before = 0
            total_clipped = 0

            for filename in self.filenames:
                for name, seq, qual in io.read_sequences(filename, qualities='required'):
                    # "Good quality" sequence ends at the first low quality base
                    #good_quality_end = 0
                    #while good_quality_end < len(seq) and qual[good_quality_end] >= clip_quality:
                    #    good_quality_end += 1
                    
                    goodness_score = 0
                    best_goodness_score = 0
                    good_quality_end = 0
                    i = 0
                    while True:
                        if goodness_score > best_goodness_score:
                            best_goodness_score = goodness_score
                            good_quality_end = i
                        
                        if i >= len(seq):
                            break
                            
                        if qual[i] >= clip_quality:
                            goodness_score += 1
                        else:
                            goodness_score -= 9
                        i += 1
                            
                    
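                    # Track the best combined poly(A)+adaptor score seen
                    # so far; min_score-1 means no call has been made yet.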
                    best_score = self.min_score-1
                    best_a_start = good_quality_end
                    best_a_end = good_quality_end
                    best_adaptor_bases = 0
                    best_aonly_score = 0
                    best_aonly_start = good_quality_end
                    best_aonly_end = good_quality_end
                    
                    # Consider each possible start position for the poly(A)
                    for a_start in xrange(len(seq)):
                        if a_start and seq[a_start-1] == 'A': continue
                        
                        # Consider each possible end position for the poly(A)
                        a_end = a_start
                        aonly_score = 0
                        while True:
                            if aonly_score > best_aonly_score:
                                best_aonly_score = aonly_score
                                best_aonly_start = a_start
                                best_aonly_end = a_end
                            
                            
                            # The poly(A) should be followed by adaptor
                            # sequence. If there is evidence of the adaptor
                            # beyond the end of good quality, we still want
                            # to know that, and count it towards the number
                            # of adaptor bases present.
                            score = aonly_score
                            adaptor_bases = 0
                            i = a_end
                            abort_score = best_score-len(self.adaptor)
                            abort_i = min(len(seq), a_end+len(self.adaptor))
                            while score >= abort_score:
                                if score > best_score:
                                    best_score = score
                                    best_a_start = a_start
                                    best_a_end = a_end
                                    best_adaptor_bases = adaptor_bases
                            
                                if i >= abort_i:
                                    break
                            
                                if seq[i] == self.adaptor[i-a_end]:
                                    score += 1
                                    adaptor_bases += 1
                                else:
                                    score -= 4
                                i += 1
                                
                            if a_end >= len(seq): break

                            if seq[a_end] == 'A':
                                aonly_score += 1
                            else:
                                aonly_score -= 4

                            a_end += 1
                    
                    # The search may find a tail starting after the end of
                    # good quality sequence; don't call a tail in that case,
                    # just clip at the end of good quality.
                    if best_a_start > good_quality_end:
                        best_a_start = good_quality_end
                        best_a_end = good_quality_end
                        best_adaptor_bases = 0
                        best_score = 0

                    a_start = best_a_start
                    a_end = best_a_end
                    adaptor_bases = best_adaptor_bases
                    aonly_start = best_aonly_start
                    aonly_end = best_aonly_end                    
                        
                    if self.debug:
                        print name
                        print ''.join( 
                            ('C' if item<clip_quality else ' ') 
                            for item in qual )
                        print '-' * good_quality_end
                        print seq
                        print ' '*a_start + 'A'*(a_end-a_start) + self.adaptor + ".%d %d"%(adaptor_bases,best_score)
                        print 
                        sys.stdout.flush()

                    n += 1
                    total_before += len(seq)

                    # 0 - sequence name
                    # 1 - sequence length
                    # 2 - poly(A) start
                    # 3 - poly(A) end
                    # (4 - best run of As start, for debugging the need to detect adaptor seq)
                    # (5 - best run of As end)
                    # 6 - number of adaptor bases matched
                    print >> out_clips_file, '%s\t%d\t%d\t%d\t%d\t%d\t%d' % (name, len(seq) , a_start, a_end, aonly_start, aonly_end,  adaptor_bases)
                    
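                    # Keep the read if at least self.length bases remain
                    # after clipping; count it as clipped only if bases
                    # were actually removed.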
                    if a_start >= self.length:
                        if a_start < len(seq):
                            n_clipped += 1
                            total_clipped += a_start
                    
                        print >> out_file, '@'+name
                        print >> out_file, seq[:a_start]
                        print >> out_file, '+'
                        print >> out_file, qual[:a_start]
                    else:
                        n_discarded += 1
                    
                    if n%10000 == 0: 
                        grace.status('Clip-runs ' + self.sample + ' ' + grace.pretty_number(n))
                    
                    # Option to do a quick subsample
                    if self.only and self.only <= n:
                        break
        
        grace.status('')
        
        self.log.datum(self.sample,'reads',n)
        if n:
            self.log.datum(self.sample,'mean length before poly-A/adaptor clipping',float(total_before)/n)
        self.log.datum(self.sample,'reads discarded as too short after poly-A/adaptor clipping',n_discarded)
        self.log.datum(self.sample,'reads poly-A/adaptor clipped and kept',n_clipped)
        if n_clipped:
            self.log.datum(self.sample,'mean length clipped',float(total_clipped)/n_clipped)
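The quality scan at the top of this example does not stop at the first low quality base; it picks the prefix maximizing a running score of +1 per base at or above clip_quality and -9 per base below it. The same scan as a standalone sketch, using the weights hard-coded above (find_good_quality_end is a hypothetical name, not part of nesoni):

def find_good_quality_end(qual, clip_quality):
    # qual is a phred+33 quality string; clip_quality a single character.
    # Score +1 for each base at or above clip_quality, -9 for each base
    # below it, and return the prefix length with the highest score.
    score = 0
    best_score = 0
    best_end = 0
    for i in xrange(len(qual)):
        if qual[i] >= clip_quality:
            score += 1
        else:
            score -= 9
        if score > best_score:
            best_score = score
            best_end = i+1
    return best_end

For example, find_good_quality_end('IIII####', chr(33+20)) returns 4: the four high quality bases are kept and the low quality tail is dropped.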