Example 1
    def begin_output(self):
        from nesoni import io

        if self.output is not None:
            return io.open_possibly_compressed_writer(self.output)
        else:
            return sys.stdout
Example 2
    def begin_output(self):
        from nesoni import io

        if self.output is not None:
            return io.open_possibly_compressed_writer(self.output)
        else:
            return sys.stdout
Example 3
def write_gff3(filename, items):
    # IGV likes to index large GFFs, and needs them to be sorted for this
    items = sorted(items, key=lambda item: (item.seqid, item.start))
    
    with io.open_possibly_compressed_writer(filename) as f:
        write_gff3_header(f)
        for item in items:
            print >> f, item.as_gff()
Example 4
def write_gff3(filename, items, sort=True):
    # IGV likes to index large GFFs, and needs them to be sorted for this
    if sort:
        items = sorted(items, key=lambda item: (item.seqid, item.start))

    with io.open_possibly_compressed_writer(filename) as f:
        write_gff3_header(f)
        for item in items:
            print >> f, item.as_gff()
Example 5
    def run(self):
        min_quality = chr(33+self.quality)

        with io.open_possibly_compressed_writer(self.prefix+'.csfastq.gz') as out_file:        
            n = 0
            n_discarded = 0
            n_clipped = 0
            total_before = 0
            total_clipped = 0
            for filename in self.filenames:
                for name, seq, qual in io.read_sequences(filename, qualities='required'):
                    
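                    # Find where the trailing poly(A) run begins. In SOLiD colour space
                    # a '0' call means "same base as the previous one", so high-quality
                    # 0s grow the score while other colours shrink it, and 'start' is
                    # pushed forward whenever the evidence for a tail drops back to zero.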
                    score = 0
                    start = 0
                    for i in xrange(len(seq)-1):
                        if qual[i] >= min_quality:
                            if seq[i+1] == '0':
                                score += 1
                            else:
                                score = max(0, score-4)
                                if not score: start = i+2
                    
                    n += 1
                    total_before += len(seq)
                
                    if start > self.length+1:
                        if start < len(seq):
                            n_clipped += 1
                            total_clipped += len(seq)-start
                        
                        print >> out_file, '@'+name
                        print >> out_file, seq[:start]
                        print >> out_file, '+'
                        print >> out_file, qual[:start-1]
                    else:
                        n_discarded += 1
        
        self.log.datum(self.sample,'reads',n)
        if n:
            self.log.datum(self.sample,'mean length before poly-A clipping',float(total_before)/n)
        self.log.datum(self.sample,'reads discarded as too short after poly-A clipping',n_discarded)
        self.log.datum(self.sample,'reads poly-A clipped and kept',n_clipped)
        if n_clipped:
            self.log.datum(self.sample,'mean length clipped',float(total_clipped)/n_clipped)
Example 6
    def run(self):
        min_quality = chr(33+self.quality)

        with io.open_possibly_compressed_writer(self.prefix+'.csfastq.gz') as out_file:        
            n = 0
            n_discarded = 0
            n_clipped = 0
            total_before = 0
            total_clipped = 0
            for filename in self.filenames:
                for name, seq, qual in io.read_sequences(filename, qualities='required'):
                    
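                    # Find where the trailing poly(A) run begins. In SOLiD colour space
                    # a '0' call means "same base as the previous one", so high-quality
                    # 0s grow the score while other colours shrink it, and 'start' is
                    # pushed forward whenever the evidence for a tail drops back to zero.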
                    score = 0
                    start = 0
                    for i in xrange(len(seq)-1):
                        if qual[i] >= min_quality:
                            if seq[i+1] == '0':
                                score += 1
                            else:
                                score = max(0, score-4)
                                if not score: start = i+2
                    
                    n += 1
                    total_before += len(seq)
                
                    if start > self.length+1:
                        if start < len(seq):
                            n_clipped += 1
                            total_clipped += len(seq)-start
                        
                        print >> out_file, '@'+name
                        print >> out_file, seq[:start]
                        print >> out_file, '+'
                        print >> out_file, qual[:start-1]
                    else:
                        n_discarded += 1
        
        self.log.datum(self.sample,'reads',n)
        if n:
            self.log.datum(self.sample,'mean length before poly-A clipping',float(total_before)/n)
        self.log.datum(self.sample,'reads discarded as too short after poly-A clipping',n_discarded)
        self.log.datum(self.sample,'reads poly-A clipped and kept',n_clipped)
        if n_clipped:
            self.log.datum(self.sample,'mean length clipped',float(total_clipped)/n_clipped)
Example 7
def write_gff3(filename, items, header):
    with io.open_possibly_compressed_writer(filename) as f:
        f.write(header)
        for item in items:
            print >> f, item.as_gff()
Example 8
     def run(self):
         assert self.extension is not None, '--extension must be specified'
     
         #workspace = self.get_workspace()
         workspace = working_directory.Working(self.working_dir, must_exist=True)
         if self.annotations == None:
             reference = workspace.get_reference()
             annotations_filename = reference.annotations_filename()
         else:
             annotations_filename = self.annotations
         
         types = [ item.lower() for item in self.types.split(',') ]
         
         parts = self.parts or self.types 
         parts = [ item.lower() for item in parts.split(',') ]
         
         
         all_annotations = list(annotation.read_annotations(annotations_filename))
         annotation.link_up_annotations(all_annotations)
         for item in all_annotations: 
             item.primary = None
     
         annotations = [ 
             item 
             for item in all_annotations
             if item.type.lower() in types
         ]
         
         part_annotations = [ ]
         seen = set()
         queue = [ (item,item) for item in annotations ]
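         # Walk the children of each selected feature, keeping those whose type is in
         # 'parts' and linking each back to its top-level feature via item.primary.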
         while queue:
             primary, item = queue.pop()
             if item.type.lower() in parts:
                 assert item.primary is None, "Feature with multiple parents"
                 item.primary = primary
                 key = (id(primary),item.start,item.end,item.seqid,item.strand)
                 # Ignore duplicate exons (many isoforms will have the same exons)
                 if key not in seen:
                     seen.add(key)
                     part_annotations.append(item)
             queue.extend( (primary, item2) for item2 in item.children )
         
         del seen
         del all_annotations
         
         self.log.log('%d annotations\n' % len(annotations))
         self.log.log('%d part annotations\n' % len(part_annotations))
         
         #assert annotations, 'No annotations of specified types in file'
         
         for item in part_annotations:
             this_extension = self.extension
             if "max_extension" in item.attr:
                 this_extension = min(this_extension,int(item.attr["max_extension"]))
                 
             if item.strand >= 0:
                 item.tail_pos = item.end
                 item.end += this_extension
             else:
                 item.tail_pos = item.start
                 item.start -= this_extension
         
         for item in annotations:    
             item.hits = [] # [ (tail_length, adaptor_bases) ]
         
         index = span_index.index_annotations(part_annotations)
         
         for alignment in sam.Bam_reader(workspace/'alignments_filtered_sorted.bam'):
             if alignment.is_unmapped or alignment.is_secondary or alignment.is_supplementary:
                 continue
        
             start = alignment.reference_start
             end = alignment.reference_end
             alignment_length = end-start
             strand = -1 if alignment.flag&sam.FLAG_REVERSE else 1
             fragment_feature = annotation.Annotation(
                 seqid=alignment.reference_name,
                 start=start,
                 end=end,
                 strand=strand
                 )
             
             if strand >= 0:
                 tail_pos = end
             else:
                 tail_pos = start
             
             tail_length = 0
             adaptor_bases = 0
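             # Tail length and adaptor-base count are read back from the custom
             # AN:i: and AD:i: tags attached to each alignment.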
             for item in alignment.extra:
                 if item.startswith('AN:i:'):
                     tail_length = int(item[5:])
                 elif item.startswith('AD:i:'):
                     adaptor_bases = int(item[5:])
             
             hits = index.get(fragment_feature, same_strand=True)
             if hits:
                 gene = min(hits, key=lambda gene: 
                     (abs(tail_pos - gene.tail_pos), gene.primary.get_id()))
                     # Nearest by tail_pos
                     # failing that, by id to ensure a deterministic choice
                 
                 gene.primary.hits.append( (tail_length,adaptor_bases) )

         for item in annotations:
             del item.parents
             del item.children
             del item.primary

         f = io.open_possibly_compressed_writer(self.prefix + '.pickle.gz')
         pickle.dump((workspace.name, workspace.get_tags(), annotations), f, pickle.HIGHEST_PROTOCOL)
         f.close()
Example 9
    def run(self):
        """
        
        <sequence> <poly-A> <adaptor> <anything>
        
        """
        clip_quality = chr(33+self.clip_quality)
        ignore_quality = chr(33+self.ignore_quality)
        
        with io.open_possibly_compressed_writer(self.prefix+'.fastq.gz') as out_file, \
             io.open_possibly_compressed_writer(self.prefix+'.clips.gz') as out_clips_file:
            print >> out_clips_file, '#Read\tread length\tpoly-A start\tpoly-A end\tpoly-A start, ignoring adaptor\tpoly-A end, ignoring adaptor\tadaptor bases matched'
             
            n = 0
            n_discarded = 0
            n_clipped = 0
            total_before = 0
            total_clipped = 0

            for filename in self.filenames:
                for name, seq, qual in io.read_sequences(filename, qualities='required'):
                    # "Good quality" sequence ends at the first low quality base
                    #good_quality_end = 0
                    #while good_quality_end < len(seq) and qual[good_quality_end] >= clip_quality:
                    #    good_quality_end += 1
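                    # Instead, score each prefix of the read (+1 for a good-quality
                    # base, -9 for a bad one) and set good_quality_end to the prefix
                    # with the best score, so an occasional bad base can be spanned.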
                    
                    goodness_score = 0
                    best_goodness_score = 0
                    good_quality_end = 0
                    i = 0
                    while True:
                        if goodness_score > best_goodness_score:
                            best_goodness_score = goodness_score
                            good_quality_end = i
                        
                        if i >= len(seq):
                            break
                            
                        if qual[i] >= clip_quality:
                            goodness_score += 1
                        else:
                            goodness_score -= 9
                        i += 1
                            
                    
                    best_score = 0
                    best_a_start = good_quality_end
                    best_a_end = good_quality_end
                    best_adaptor_bases = 0
                    best_aonly_score = 0
                    best_aonly_start = good_quality_end
                    best_aonly_end = good_quality_end
                    
                    # Consider each possible start position for the poly(A)
                    for a_start in xrange(good_quality_end):
                        if a_start and seq[a_start-1] == 'A': continue
                        
                        # Consider each possible end position for the poly(A)
                        a_end = a_start
                        aonly_score = 0
                        while True:
                            if aonly_score > best_aonly_score:
                                best_aonly_score = aonly_score
                                best_aonly_start = a_start
                                best_aonly_end = a_end
                            
                            
                            # The poly(A) should be followed by adaptor,
                            # at least until the end of good quality sequence.
                            # However if there is evidence of the adaptor beyond
                            # the end of good quality, we still want to know that,
                            # and count it towards the number of adaptor bases present.
                            score = aonly_score
                            adaptor_bases = 0
                            i = a_end
                            while True:
                                if (score > best_score and 
                                    (i >= good_quality_end or i >= a_end+len(self.adaptor))):
                                    best_score = score
                                    best_a_start = a_start
                                    best_a_end = a_end
                                    best_adaptor_bases = adaptor_bases
                            
                                if i >= a_end+len(self.adaptor) or i >= len(seq):
                                    break
                            
                                if qual[i] >= ignore_quality:
                                    if seq[i] == self.adaptor[i-a_end]:
                                        score += 1
                                        adaptor_bases += 1
                                    else:
                                        score -= 4
                                i += 1
                                
                            #if a_end >= len(seq): break
                            
                            # poly(A) tail only within good quality region.
                            if a_end >= good_quality_end: break
                            
                            
                            if qual[a_end] >= ignore_quality:
                                if seq[a_end] == 'A':
                                    aonly_score += 1
                                else:
                                    aonly_score -= 4
                                    if aonly_score <= 0: break
                            a_end += 1
                        
                    a_start = best_a_start
                    a_end = best_a_end
                    adaptor_bases = best_adaptor_bases
                    aonly_start = best_aonly_start
                    aonly_end = best_aonly_end                    
                        
                    if self.debug: # and a_end == a_start and a_end < len(seq)-10:        
                        print name
                        print ''.join( 
                            'I' if item<ignore_quality 
                            else ('C' if item<clip_quality else ' ') 
                            for item in qual )
                        print '-' * good_quality_end
                        print seq
                        print ' '*a_start + 'A'*(a_end-a_start) + self.adaptor + ".%d %d"%(adaptor_bases,best_score)
                        #print ' '*aonly_start + 'A'*(aonly_end-aonly_start) + "."
                        print 
                        sys.stdout.flush()

                    n += 1
                    total_before += len(seq)

                    # 0 - sequence name
                    # 1 - sequence length
                    # 2 - poly(A) start
                    # 3 - poly(A) end
                    # (4 - best run of As start, for debugging the need to detect adaptor seq)
                    # (5 - best run of As end)
                    # 6 - number of adaptor bases matched
                    print >> out_clips_file, '%s\t%d\t%d\t%d\t%d\t%d\t%d' % (name, len(seq) , a_start, a_end, aonly_start, aonly_end,  adaptor_bases)
                    
                    if a_start > self.length:
                        if a_start < len(seq):
                            n_clipped += 1
                            total_clipped += a_start
                    
                        print >> out_file, '@'+name
                        print >> out_file, seq[:a_start]
                        print >> out_file, '+'
                        print >> out_file, qual[:a_start]
                    else:
                        n_discarded += 1
                    
                    if n%10000 == 0: 
                        grace.status('Clip-runs ' + self.sample + ' ' + grace.pretty_number(n)) # + ' (' + grace.pretty_number(len(dstates)) + ' dstates)')
        
        grace.status('')
        
        self.log.datum(self.sample,'reads',n)
        if n:
            self.log.datum(self.sample,'mean length before poly-A/adaptor clipping',float(total_before)/n)
        self.log.datum(self.sample,'reads discarded as too short after poly-A/adaptor clipping',n_discarded)
        self.log.datum(self.sample,'reads poly-A/adaptor clipped and kept',n_clipped)
        if n_clipped:
            self.log.datum(self.sample,'mean length clipped',float(total_clipped)/n_clipped)
Example 10
def write_gff3(filename, items):
    with io.open_possibly_compressed_writer(filename) as f:
        write_gff3_header(f)
        for item in items:
            print >> f, item.as_gff()
Example 11
    def run(self):
        log = self.log

        #quality_cutoff, args = grace.get_option_value(args, '--quality', int, 10)
        #qoffset, args = grace.get_option_value(args, '--qoffset', int, None)
        #clip_ambiguous, args = grace.get_option_value(args, '--clip-ambiguous', grace.as_bool, True)
        #length_cutoff, args = grace.get_option_value(args, '--length', int, 24)
        #adaptor_cutoff, args = grace.get_option_value(args, '--match', int, 10)
        #max_error, args = grace.get_option_value(args, '--max-errors', int, 1)
        #adaptor_set, args = grace.get_option_value(args, '--adaptors', str, 'truseq-adapter,truseq-srna,genomic,multiplexing,pe,srna')
        #disallow_homopolymers, args = grace.get_option_value(args, '--homopolymers', grace.as_bool, False)
        #reverse_complement, args = grace.get_option_value(args, '--revcom', grace.as_bool, False)
        #trim_start, args = grace.get_option_value(args, '--trim-start', int, 0)
        #trim_end, args = grace.get_option_value(args, '--trim-end', int, 0)
        #output_fasta, args = grace.get_option_value(args, '--fasta', grace.as_bool, False)
        #use_gzip, args = grace.get_option_value(args, '--gzip', grace.as_bool, True)
        #output_rejects, args = grace.get_option_value(args, '--rejects', grace.as_bool, False)
        #grace.expect_no_further_options(args)

        prefix = self.prefix
        log_name = os.path.split(prefix)[1]

        quality_cutoff = self.quality
        qoffset = self.qoffset
        clip_ambiguous = self.clip_ambiguous
        length_cutoff = self.length
        adaptor_cutoff = self.match
        max_error = self.max_errors
        adaptor_set = self.adaptors
        disallow_homopolymers = self.homopolymers
        reverse_complement = self.revcom
        trim_start = self.trim_start
        trim_end = self.trim_end
        output_fasta = self.fasta
        use_gzip = self.gzip
        output_rejects = self.rejects

        iterators = []
        filenames = []
        any_paired = False

        for filename in self.reads:
            filenames.append(filename)
            iterators.append(
                itertools.izip(io.read_sequences(filename, qualities=True)))

        for pair_filenames in self.pairs:
            assert len(pair_filenames
                       ) == 2, 'Expected a pair of files for "pairs" section.'
            filenames.extend(pair_filenames)
            any_paired = True
            iterators.append(
                itertools.izip(
                    io.read_sequences(pair_filenames[0], qualities=True),
                    io.read_sequences(pair_filenames[1], qualities=True)))

        for filename in self.interleaved:
            filenames.append(filename)
            any_paired = True
            iterators.append(
                deinterleave(io.read_sequences(filename, qualities=True)))

        fragment_reads = (2 if any_paired else 1)
        read_in_fragment_names = ['read-1', 'read-2'
                                  ] if any_paired else ['read']

        assert iterators, 'Nothing to clip'

        if qoffset is None:
            guesses = [
                io.guess_quality_offset(filename) for filename in filenames
            ]
            assert len(
                set(guesses)
            ) == 1, 'Conflicting quality offset guesses, please specify manually.'
            qoffset = guesses[0]
            log.log('FASTQ offset seems to be %d\n' % qoffset)

        quality_cutoff_char = chr(qoffset + quality_cutoff)

        #log.log('Minimum quality:        %d (%s)\n' % (quality_cutoff, quality_cutoff_char))
        #log.log('Clip ambiguous bases:   %s\n' % (grace.describe_bool(clip_ambiguous)))
        #log.log('Minimum adaptor match:  %d bases, %d errors\n' % (adaptor_cutoff, max_error))
        #log.log('Minimum length:         %d bases\n' % length_cutoff)

        adaptor_seqs = []
        adaptor_names = []
        if adaptor_set and adaptor_set.lower() != 'none':
            for item in adaptor_set.split(','):
                item = item.strip().lower() + ' '
                any = False
                for line in ADAPTORS.strip().split('\n'):
                    if line.startswith('#'): continue
                    if not line.lower().startswith(item): continue
                    any = True
                    name, seq = line.rsplit(None, 1)
                    seq = seq.replace('U', 'T')

                    #if seq in adaptor_seqs: print 'Dup', name
                    adaptor_seqs.append(seq)
                    adaptor_names.append(name)
                    adaptor_seqs.append(bio.reverse_complement(seq))
                    adaptor_names.append(name)
                if not any:
                    raise grace.Error('Unknown adaptor set: ' + item)

        matcher = Matcher(adaptor_seqs, adaptor_names, max_error)

        start_clips = [
            collections.defaultdict(list) for i in xrange(fragment_reads)
        ]
        end_clips = [
            collections.defaultdict(list) for i in xrange(fragment_reads)
        ]

        if output_fasta:
            write_sequence = io.write_fasta_single_line
        else:
            write_sequence = io.write_fastq

        f_single = io.open_possibly_compressed_writer(
            self.reads_output_filenames()[0])
        if fragment_reads == 2:
            f_paired = io.open_possibly_compressed_writer(
                self.interleaved_output_filenames()[0])
        if output_rejects:
            f_reject = io.open_possibly_compressed_writer(
                self.rejects_output_filenames()[0])

        n_single = 0
        n_paired = 0

        n_in_single = 0
        n_in_paired = 0
        total_in_length = [0] * fragment_reads

        n_out = [0] * fragment_reads
        n_q_clipped = [0] * fragment_reads
        n_a_clipped = [0] * fragment_reads
        n_homopolymers = [0] * fragment_reads
        total_out_length = [0] * fragment_reads

        #log.attach(open(prefix + '_log.txt', 'wb'))

        for iterator in iterators:
            for fragment in iterator:
                if (n_in_single + n_in_paired) % 10000 == 0:
                    grace.status(
                        'Clipping fragment %s' %
                        grace.pretty_number(n_in_single + n_in_paired))

                if len(fragment) == 1:
                    n_in_single += 1
                else:
                    n_in_paired += 1

                graduates = []
                rejects = []
                for i, (name, seq, qual) in enumerate(fragment):
                    name = name.split()[0]
                    seq = seq.upper()
                    total_in_length[i] += len(seq)

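                    # Quality clip: keep the longest stretch of bases, within the
                    # trim_start/trim_end window, that are all at or above the quality
                    # cutoff and (if clip_ambiguous) contain only unambiguous A/C/G/T.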
                    start = trim_start
                    best_start = 0
                    best_len = 0
                    for j in xrange(len(seq) - trim_end):
                        if qual[j] < quality_cutoff_char or \
                           (clip_ambiguous and seq[j] not in 'ACGT'):
                            if best_len < j - start:
                                best_start = start
                                best_len = j - start
                            start = j + 1
                    j = len(seq) - trim_end
                    if best_len < j - start:
                        best_start = start
                        best_len = j - start

                    clipped_seq = seq[best_start:best_start + best_len]
                    clipped_qual = qual[best_start:best_start + best_len]
                    if len(clipped_seq) < length_cutoff:
                        n_q_clipped[i] += 1
                        rejects.append((name, seq, qual, 'quality'))
                        continue

                    match = matcher.match(clipped_seq)
                    if match and match[0] >= adaptor_cutoff:
                        clipped_seq = clipped_seq[match[0]:]
                        clipped_qual = clipped_qual[match[0]:]
                        start_clips[i][match[0]].append(match[1][0])
                        if len(clipped_seq) < length_cutoff:
                            n_a_clipped[i] += 1
                            rejects.append((name, seq, qual, 'adaptor'))
                            continue

                    match = matcher.match(bio.reverse_complement(clipped_seq))
                    if match and match[0] >= adaptor_cutoff:
                        clipped_seq = clipped_seq[:len(clipped_seq) - match[0]]
                        clipped_qual = clipped_qual[:len(clipped_qual) -
                                                    match[0]]
                        end_clips[i][match[0]].append(match[1][0])
                        if len(clipped_seq) < length_cutoff:
                            n_a_clipped[i] += 1
                            rejects.append((name, seq, qual, 'adaptor'))
                            continue

                    if disallow_homopolymers and len(set(clipped_seq)) <= 1:
                        n_homopolymers[i] += 1
                        rejects.append((name, seq, qual, 'homopolymer'))
                        continue

                    graduates.append((name, clipped_seq, clipped_qual))
                    n_out[i] += 1
                    total_out_length[i] += len(clipped_seq)

                if output_rejects:
                    for name, seq, qual, reason in rejects:
                        write_sequence(f_reject, name + ' ' + reason, seq,
                                       qual)

                if graduates:
                    if reverse_complement:
                        graduates = [(name, bio.reverse_complement(seq),
                                      qual[::-1])
                                     for name, seq, qual in graduates]

                    if len(graduates) == 1:
                        this_f = f_single
                        n_single += 1
                    else:
                        assert len(graduates) == 2
                        this_f = f_paired
                        n_paired += 1

                    for name, seq, qual in graduates:
                        write_sequence(this_f, name, seq, qual)

        grace.status('')

        if output_rejects:
            f_reject.close()
        if fragment_reads == 2:
            f_paired.close()
        f_single.close()

        def summarize_clips(name, location, clips):
            total = 0
            for i in clips:
                total += len(clips[i])
            log.datum(log_name, name + ' adaptors clipped at ' + location,
                      total)

            if not clips:
                return

            for i in xrange(min(clips), max(clips) + 1):
                item = clips[i]
                log.quietly_log('%3d bases: %10d ' % (i, len(item)))
                if item:
                    avg_errors = float(sum(item2[0]
                                           for item2 in item)) / len(item)
                    log.quietly_log(' avg errors: %5.2f  ' % avg_errors)

                    counts = collections.defaultdict(int)
                    for item2 in item:
                        counts[item2[1]] += 1
                    #print counts
                    for no in sorted(counts,
                                     key=lambda item2: counts[item2],
                                     reverse=True)[:2]:
                        log.quietly_log('%dx%s ' %
                                        (counts[no], matcher.names[no]))
                    if len(counts) > 2: log.quietly_log('...')

                log.quietly_log('\n')
            log.quietly_log('\n')

        if n_in_paired:
            log.datum(log_name, 'read-pairs', n_in_paired)
        if n_in_single:
            log.datum(log_name, 'single reads', n_in_single)

        for i in xrange(fragment_reads):
            if start_clips:
                summarize_clips(read_in_fragment_names[i], 'start',
                                start_clips[i])

            if end_clips:
                summarize_clips(read_in_fragment_names[i], 'end', end_clips[i])

                prefix = read_in_fragment_names[i]

            log.datum(log_name, prefix + ' too short after quality clip',
                      n_q_clipped[i])
            log.datum(log_name, prefix + ' too short after adaptor clip',
                      n_a_clipped[i])
            if disallow_homopolymers:
                log.datum(log_name, prefix + ' homopolymers',
                          n_homopolymers[i])
            if fragment_reads > 1:
                log.datum(log_name, prefix + ' kept', n_out[i])
            log.datum(log_name, prefix + ' average input length',
                      float(total_in_length[i]) / (n_in_single + n_in_paired))
            if n_out[i]:
                log.datum(log_name, prefix + ' average output length',
                          float(total_out_length[i]) / n_out[i])

        if fragment_reads == 2:
            log.datum(log_name, 'pairs kept after clipping', n_paired)
        log.datum(log_name, 'reads kept after clipping', n_single)
Example 12
    def run(self):
        """
        
        <sequence> <poly-A> <adaptor> <anything>
        
        """
        clip_quality = chr(33+self.clip_quality)
        #ignore_quality = chr(33+self.ignore_quality)
        
        with io.open_possibly_compressed_writer(self.prefix+'.fastq.gz') as out_file, \
             io.open_possibly_compressed_writer(self.prefix+'.clips.gz') as out_clips_file:
            print >> out_clips_file, '#Read\tread length\tpoly-A start\tpoly-A end\tpoly-A start, ignoring adaptor\tpoly-A end, ignoring adaptor\tadaptor bases matched'
             
            n = 0
            n_discarded = 0
            n_clipped = 0
            total_before = 0
            total_clipped = 0

            for filename in self.filenames:
                for name, seq, qual in io.read_sequences(filename, qualities='required'):
                    # "Good quality" sequence ends at the first low quality base
                    #good_quality_end = 0
                    #while good_quality_end < len(seq) and qual[good_quality_end] >= clip_quality:
                    #    good_quality_end += 1
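                    # Instead, score each prefix of the read (+1 for a good-quality
                    # base, -9 for a bad one) and set good_quality_end to the prefix
                    # with the best score, so an occasional bad base can be spanned.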
                    
                    goodness_score = 0
                    best_goodness_score = 0
                    good_quality_end = 0
                    i = 0
                    while True:
                        if goodness_score > best_goodness_score:
                            best_goodness_score = goodness_score
                            good_quality_end = i
                        
                        if i >= len(seq):
                            break
                            
                        if qual[i] >= clip_quality:
                            goodness_score += 1
                        else:
                            goodness_score -= 9
                        i += 1
                            
                    
                    best_score = self.min_score-1
                    best_a_start = good_quality_end
                    best_a_end = good_quality_end
                    best_adaptor_bases = 0
                    best_aonly_score = 0
                    best_aonly_start = good_quality_end
                    best_aonly_end = good_quality_end
                    
                    # Consider each possible start position for the poly(A)
                    for a_start in xrange(len(seq)):
                        if a_start and seq[a_start-1] == 'A': continue
                        
                        # Consider each possible end position for the poly(A)
                        a_end = a_start
                        aonly_score = 0
                        while True:
                            if aonly_score > best_aonly_score:
                                best_aonly_score = aonly_score
                                best_aonly_start = a_start
                                best_aonly_end = a_end
                            
                            
                            # The poly(A) should be followed by adaptor,
                            ## at least until the end of good quality sequence.
                            # However if there is evidence of the adaptor beyond
                            # the end of good quality, we still want to know that,
                            # and count it towards the number of adaptor bases present.
                            score = aonly_score
                            adaptor_bases = 0
                            i = a_end
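                            # Early-exit bound: matching the remaining adaptor bases can
                            # add at most len(self.adaptor) to the score, so once the
                            # running score drops below best_score - len(adaptor) this
                            # candidate cannot beat the current best and the loop stops.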
                            abort_score = best_score-len(self.adaptor)
                            abort_i = min(len(seq), a_end+len(self.adaptor))
                            while score >= abort_score:
                                #if (score > best_score and 
                                #    (i >= good_quality_end or i >= a_end+len(self.adaptor))):
                                if score > best_score:
                                    best_score = score
                                    best_a_start = a_start
                                    best_a_end = a_end
                                    best_adaptor_bases = adaptor_bases
                            
                                if i >= abort_i:
                                    break
                            
                                if seq[i] == self.adaptor[i-a_end]:
                                    score += 1
                                    adaptor_bases += 1
                                else:
                                    score -= 4
                                i += 1
                                
                            #if a_end >= len(seq): break
                            
                            # Modified 2018-03-21
                            # poly(A) tail only within good quality region.
                            #if a_end >= good_quality_end: break
                            #if qual[a_end] >= ignore_quality:
                            #    if seq[a_end] == 'A':
                            #        aonly_score += 1
                            #    else:
                            #        aonly_score -= 4
                            #        if aonly_score <= 0: break

                            if a_end >= len(seq): break

                            if seq[a_end] == 'A':
                                aonly_score += 1
                            else: #if qual[a_end] >= ignore_quality:
                                aonly_score -= 4
                            #else:
                            #    aonly_score -= 1                       

                            a_end += 1
                    
                    # 2018-03-21 
                    # Look for tail starting after good quality,
                    # however don't call a tail if starts after good quality 
                    if best_a_start > good_quality_end:
                        best_a_start = good_quality_end
                        best_a_end = good_quality_end
                        best_adaptor_bases = 0
                        best_score = 0

                    a_start = best_a_start
                    a_end = best_a_end
                    adaptor_bases = best_adaptor_bases
                    aonly_start = best_aonly_start
                    aonly_end = best_aonly_end                    
                        
                    if self.debug: # and a_end == a_start and a_end < len(seq)-10:        
                        print name
                        print ''.join( 
                            ('C' if item<clip_quality else ' ') 
                            for item in qual )
                        print '-' * good_quality_end
                        print seq
                        print ' '*a_start + 'A'*(a_end-a_start) + self.adaptor + ".%d %d"%(adaptor_bases,best_score)
                        #print ' '*aonly_start + 'A'*(aonly_end-aonly_start) + "."
                        print 
                        sys.stdout.flush()

                    n += 1
                    total_before += len(seq)

                    # 0 - sequence name
                    # 1 - sequence length
                    # 2 - poly(A) start
                    # 3 - poly(A) end
                    # (4 - best run of As start, for debugging the need to detect adaptor seq)
                    # (5 - best run of As end)
                    # 6 - number of adaptor bases matched
                    print >> out_clips_file, '%s\t%d\t%d\t%d\t%d\t%d\t%d' % (name, len(seq) , a_start, a_end, aonly_start, aonly_end,  adaptor_bases)
                    
                    if a_start >= self.length:
                        if a_start < len(seq):
                            n_clipped += 1
                            total_clipped += a_start
                    
                        print >> out_file, '@'+name
                        print >> out_file, seq[:a_start]
                        print >> out_file, '+'
                        print >> out_file, qual[:a_start]
                    else:
                        n_discarded += 1
                    
                    if n%10000 == 0: 
                        grace.status('Clip-runs ' + self.sample + ' ' + grace.pretty_number(n)) # + ' (' + grace.pretty_number(len(dstates)) + ' dstates)')
                    
                    # Option to do a quick subsample
                    if self.only and self.only <= n:
                        break
        
        grace.status('')
        
        self.log.datum(self.sample,'reads',n)
        if n:
            self.log.datum(self.sample,'mean length before poly-A/adaptor clipping',float(total_before)/n)
        self.log.datum(self.sample,'reads discarded as too short after poly-A/adaptor clipping',n_discarded)
        self.log.datum(self.sample,'reads poly-A/adaptor clipped and kept',n_clipped)
        if n_clipped:
            self.log.datum(self.sample,'mean length clipped',float(total_clipped)/n_clipped)
Example 13
    def run(self):
        log = self.log
        
        #quality_cutoff, args = grace.get_option_value(args, '--quality', int, 10)
        #qoffset, args = grace.get_option_value(args, '--qoffset', int, None)
        #clip_ambiguous, args = grace.get_option_value(args, '--clip-ambiguous', grace.as_bool, True)
        #length_cutoff, args = grace.get_option_value(args, '--length', int, 24)
        #adaptor_cutoff, args = grace.get_option_value(args, '--match', int, 10)
        #max_error, args = grace.get_option_value(args, '--max-errors', int, 1)
        #adaptor_set, args = grace.get_option_value(args, '--adaptors', str, 'truseq-adapter,truseq-srna,genomic,multiplexing,pe,srna')
        #disallow_homopolymers, args = grace.get_option_value(args, '--homopolymers', grace.as_bool, False)
        #reverse_complement, args = grace.get_option_value(args, '--revcom', grace.as_bool, False)
        #trim_start, args = grace.get_option_value(args, '--trim-start', int, 0)
        #trim_end, args = grace.get_option_value(args, '--trim-end', int, 0)
        #output_fasta, args = grace.get_option_value(args, '--fasta', grace.as_bool, False)
        #use_gzip, args = grace.get_option_value(args, '--gzip', grace.as_bool, True)
        #output_rejects, args = grace.get_option_value(args, '--rejects', grace.as_bool, False)
        #grace.expect_no_further_options(args)
        
        prefix = self.prefix
        log_name = os.path.split(prefix)[1]
        
        quality_cutoff = self.quality
        qoffset = self.qoffset
        clip_ambiguous = self.clip_ambiguous
        length_cutoff = self.length
        adaptor_cutoff = self.match
        max_error = self.max_errors
        disallow_homopolymers = self.homopolymers
        reverse_complement = self.revcom
        trim_start = self.trim_start
        trim_end = self.trim_end
        output_fasta = self.fasta
        use_gzip = self.gzip
        output_rejects = self.rejects
    
        iterators = [ ]        
        filenames = [ ]
        any_paired = False
        
        for filename in self.reads:
            filenames.append(filename)
            iterators.append(itertools.izip(
                 io.read_sequences(filename, qualities=True)
            ))
        
        for pair_filenames in self.pairs:
            assert len(pair_filenames) == 2, 'Expected a pair of files for "pairs" section.'
            filenames.extend(pair_filenames)
            any_paired = True
            iterators.append(itertools.izip(
                io.read_sequences(pair_filenames[0], qualities=True),
                io.read_sequences(pair_filenames[1], qualities=True)
            ))
        
        for filename in self.interleaved:
            filenames.append(filename)
            any_paired = True
            iterators.append(deinterleave(
                io.read_sequences(filename, qualities=True)
            ))
        
        fragment_reads = (2 if any_paired else 1)
        read_in_fragment_names = [ 'read-1', 'read-2' ] if any_paired else [ 'read' ]
        
        assert iterators, 'Nothing to clip'
        
        io.check_name_uniqueness(self.reads, self.pairs, self.interleaved)
    
        if qoffset is None:
            guesses = [ io.guess_quality_offset(filename) for filename in filenames ]
            assert len(set(guesses)) == 1, 'Conflicting quality offset guesses, please specify manually.'
            qoffset = guesses[0]
            log.log('FASTQ offset seems to be %d\n' % qoffset)    
    
        quality_cutoff_char = chr(qoffset + quality_cutoff)
        
        #log.log('Minimum quality:        %d (%s)\n' % (quality_cutoff, quality_cutoff_char))
        #log.log('Clip ambiguous bases:   %s\n' % (grace.describe_bool(clip_ambiguous)))
        #log.log('Minimum adaptor match:  %d bases, %d errors\n' % (adaptor_cutoff, max_error))
        #log.log('Minimum length:         %d bases\n' % length_cutoff)
        
        adaptor_seqs = [ ]
        adaptor_names = [ ]
        if self.adaptor_clip:
            if self.adaptor_file:
                adaptor_iter = io.read_sequences(self.adaptor_file)
            else:
                adaptor_iter = ADAPTORS
            for name, seq in adaptor_iter:
                seq = seq.upper().replace('U','T')
                adaptor_seqs.append(seq)
                adaptor_names.append(name)
                adaptor_seqs.append(bio.reverse_complement(seq))
                adaptor_names.append(name)

        matcher = Matcher(adaptor_seqs, adaptor_names, max_error)
        
        start_clips = [ collections.defaultdict(list) for i in xrange(fragment_reads) ]
        end_clips = [ collections.defaultdict(list) for i in xrange(fragment_reads) ]
    
        if output_fasta:
            write_sequence = io.write_fasta_single_line
        else:
            write_sequence = io.write_fastq
    
        f_single = io.open_possibly_compressed_writer(self.reads_output_filenames()[0])
        if fragment_reads == 2:
            names = self.pairs_output_filenames()[0] if self.out_separate else self.interleaved_output_filenames()
            f_paired = map(io.open_possibly_compressed_writer, names)
        if output_rejects:
            f_reject = io.open_possibly_compressed_writer(self.rejects_output_filenames()[0])
        
        n_single = 0
        n_paired = 0
        
        n_in_single = 0
        n_in_paired = 0
        total_in_length = [ 0 ] * fragment_reads
        
        n_out = [ 0 ] * fragment_reads
        n_q_clipped = [ 0 ] * fragment_reads
        n_a_clipped = [ 0 ] * fragment_reads
        n_homopolymers = [ 0 ] * fragment_reads
        total_out_length = [ 0 ] * fragment_reads
        
        #log.attach(open(prefix + '_log.txt', 'wb'))
        
        for iterator in iterators:
          for fragment in iterator:
            if (n_in_single+n_in_paired) % 10000 == 0:
                grace.status('Clipping fragment %s' % grace.pretty_number(n_in_single+n_in_paired))
        
            if len(fragment) == 1:
                n_in_single += 1
            else:
                n_in_paired += 1
            
            graduates = [ ]
            rejects = [ ]
            for i, (name, seq, qual) in enumerate(fragment):
                seq = seq.upper()
                total_in_length[i] += len(seq)
                
                if self.trim_to:
                    seq = seq[:self.trim_to]
                    qual = qual[:self.trim_to]
                
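                # Quality clip: keep the longest stretch of bases, within the
                # trim_start/trim_end window, that are all at or above the quality
                # cutoff and (if clip_ambiguous) contain only unambiguous A/C/G/T.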
                start = trim_start
                best_start = 0
                best_len = 0
                for j in xrange(len(seq)-trim_end):
                    if qual[j] < quality_cutoff_char or \
                       (clip_ambiguous and seq[j] not in 'ACGT'):
                        if best_len < j-start:
                            best_start = start
                            best_len = j-start
                        start = j + 1
                j = len(seq)-trim_end
                if best_len < j-start:
                    best_start = start
                    best_len = j-start
        
                clipped_seq = seq[best_start:best_start+best_len]
                clipped_qual = qual[best_start:best_start+best_len]
                if len(clipped_seq) < length_cutoff:
                    n_q_clipped[i] += 1
                    rejects.append( (name,seq,qual,'quality') ) 
                    continue
        
                match = matcher.match(clipped_seq)
                if match and match[0] >= adaptor_cutoff:
                    clipped_seq = clipped_seq[match[0]:]
                    clipped_qual = clipped_qual[match[0]:]
                    start_clips[i][match[0]].append( match[1][0] )
                    if len(clipped_seq) < length_cutoff:
                        n_a_clipped[i] += 1 
                        rejects.append( (name,seq,qual,'adaptor') ) 
                        continue
            
                match = matcher.match(bio.reverse_complement(clipped_seq))
                if match and match[0] >= adaptor_cutoff:
                    clipped_seq = clipped_seq[: len(clipped_seq)-match[0] ]    
                    clipped_qual = clipped_qual[: len(clipped_qual)-match[0] ]    
                    end_clips[i][match[0]].append( match[1][0] )
                    if len(clipped_seq) < length_cutoff:
                        n_a_clipped[i] += 1 
                        rejects.append( (name,seq,qual,'adaptor') ) 
                        continue
    
                if disallow_homopolymers and len(set(clipped_seq)) <= 1:
                    n_homopolymers[i] += 1
                    rejects.append( (name,seq,qual,'homopolymer') ) 
                    continue
        
                graduates.append( (name, clipped_seq, clipped_qual) )
                n_out[i] += 1
                total_out_length[i] += len(clipped_seq)
    
            if output_rejects:
                for name,seq,qual,reason in rejects:
                    write_sequence(f_reject, name + ' ' + reason, seq, qual)
             
            if graduates:
                if reverse_complement:
                    graduates = [
                        (name, bio.reverse_complement(seq), qual[::-1])
                        for name, seq, qual in graduates
                    ]
            
                if len(graduates) == 1:
                    n_single += 1

                    (name, seq, qual) = graduates[0]
                    write_sequence(f_single, name, seq, qual)
                else:
                    assert len(graduates) == 2
                    n_paired += 1

                    # Write the pair to an interleaved file or separate l/r files
                    for (lr,(name, seq, qual)) in enumerate(graduates):
                        write_sequence(f_paired[lr%len(f_paired)], name, seq, qual)
                
        
        grace.status('')
        
        if output_rejects:
            f_reject.close()
        if fragment_reads == 2:
            map(lambda f: f.close(), f_paired)
        f_single.close()
        
        def summarize_clips(name, location, clips):
            total = 0
            for i in clips:
                total += len(clips[i])
            log.datum(log_name, name + ' adaptors clipped at ' + location, total) 
            
            if not clips:
                return
    
            for i in xrange(min(clips), max(clips)+1):
                item = clips[i]
                log.quietly_log('%3d bases: %10d ' % (i, len(item)))
                if item:
                    avg_errors = float(sum( item2[0] for item2 in item )) / len(item)
                    log.quietly_log(' avg errors: %5.2f  ' % avg_errors)
                    
                    counts = collections.defaultdict(int)
                    for item2 in item: counts[item2[1]] += 1
                    #print counts
                    for no in sorted(counts,key=lambda item2:counts[item2],reverse=True)[:2]:
                        log.quietly_log('%dx%s ' % (counts[no], matcher.names[no]))
                    if len(counts) > 2: log.quietly_log('...')
                    
                log.quietly_log('\n')
            log.quietly_log('\n')


        if n_in_paired:
            log.datum(log_name,'read-pairs', n_in_paired)                      
        if n_in_single:
            log.datum(log_name,'single reads', n_in_single)                      
        
        for i in xrange(fragment_reads):
            if start_clips:
                summarize_clips(read_in_fragment_names[i], 'start', start_clips[i])
        
            if end_clips:
                summarize_clips(read_in_fragment_names[i], 'end', end_clips[i])

                prefix = read_in_fragment_names[i]
                
            log.datum(log_name, prefix + ' too short after quality clip', n_q_clipped[i])
            log.datum(log_name, prefix + ' too short after adaptor clip', n_a_clipped[i])
            if disallow_homopolymers:
                log.datum(log_name, prefix + ' homopolymers', n_homopolymers[i])
            if fragment_reads > 1:
                log.datum(log_name, prefix + ' kept', n_out[i])
            log.datum(log_name, prefix + ' average input length',  float(total_in_length[i]) / (n_in_single+n_in_paired))                     
            if n_out[i]:
                log.datum(log_name, prefix + ' average output length', float(total_out_length[i]) / n_out[i])                     
        
        if fragment_reads == 2:
            log.datum(log_name,'pairs kept after clipping', n_paired)                      
        log.datum(log_name, 'reads kept after clipping', n_single)
Example 14
    def run(self):
        """
        
        <sequence> <poly-A> <adaptor> <anything>
        
        """
        min_quality = chr(33+self.quality)
        
        with io.open_possibly_compressed_writer(self.prefix+'.fastq.gz') as out_file, \
             io.open_possibly_compressed_writer(self.prefix+'.clips.gz') as out_clips_file:
            print >> out_clips_file, '#Read\tread length\tpoly-A start\tpoly-A end\tpoly-A start, ignoring adaptor\tpoly-A end, ignoring adaptor\tadaptor bases matched'
             
            n = 0
            n_discarded = 0
            n_clipped = 0
            total_before = 0
            total_clipped = 0

            for filename in self.filenames:
                for name, seq, qual in io.read_sequences(filename, qualities='required'):
                    best_score = 0
                    best_a_start = len(seq)
                    best_a_end = len(seq)
                    best_adaptor_bases = 0
                    best_aonly_score = 0
                    best_aonly_start = len(seq)
                    best_aonly_end = len(seq)
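                    # Consider each possible poly(A) start (and, in the inner loop,
                    # each possible end), keeping whichever gives the best-scoring
                    # match to the adaptor expected to follow the tail.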
                    for a_start in xrange(len(seq)):
                        if a_start and seq[a_start-1] == 'A': continue
                        
                        a_end = a_start
                        aonly_score = 0
                        while True:
                            if aonly_score > best_aonly_score:
                                best_aonly_score = aonly_score
                                best_aonly_start = a_start
                                best_aonly_end = a_end
                                            
                            score = aonly_score
                            adaptor_bases = 0
                            for i in xrange(a_end,min(a_end+len(self.adaptor),len(seq))):
                                if qual[i] >= min_quality:
                                    if seq[i] == self.adaptor[i-a_end]:
                                        score += 1
                                        adaptor_bases += 1
                                    else:
                                        score -= 4
                            if score > best_score:
                                best_score = score
                                best_a_start = a_start
                                best_a_end = a_end
                                best_adaptor_bases = adaptor_bases
                        
                            if a_end >= len(seq): break
                            if qual[a_end] >= min_quality:
                                if seq[a_end] == 'A':
                                    aonly_score += 1
                                else:
                                    aonly_score -= 4
                                    if aonly_score <= 0: break
                            a_end += 1
                        
                    a_start = best_a_start
                    a_end = best_a_end
                    adaptor_bases = best_adaptor_bases
                    aonly_start = best_aonly_start
                    aonly_end = best_aonly_end                    
                        
                    if self.debug: # and a_end == a_start and a_end < len(seq)-10:        
                        print name
                        print ''.join( 'X' if item<min_quality else ' ' for item in qual )
                        print seq
                        print ' '*a_start + 'A'*(a_end-a_start) + self.adaptor
                        print ' '*aonly_start + 'A'*(aonly_end-aonly_start)
                        print 
                        sys.stdout.flush()

                    n += 1
                    total_before += len(seq)

                    # 0 - sequence name
                    # 1 - sequence length
                    # 2 - poly(A) start
                    # 3 - poly(A) end
                    # (4 - best run of As start, for debugging the need to detect adaptor seq)
                    # (5 - best run of As end)
                    # 6 - number of adaptor bases matched
                    print >> out_clips_file, '%s\t%d\t%d\t%d\t%d\t%d\t%d' % (name, len(seq), a_start, a_end, aonly_start, aonly_end, adaptor_bases)
                    
                    if a_start > self.length:
                        if a_start < len(seq):
                            n_clipped += 1
                            total_clipped += a_start
                    
                        print >> out_file, '@'+name
                        print >> out_file, seq[:a_start]
                        print >> out_file, '+'
                        print >> out_file, qual[:a_start]
                    else:
                        n_discarded += 1
                    
                    if n%10000 == 0: 
                        grace.status('Clip-runs ' + self.sample + ' ' + grace.pretty_number(n))
        
        grace.status('')
        
        self.log.datum(self.sample,'reads',n)
        if n:
            self.log.datum(self.sample,'mean length before poly-A/adaptor clipping',float(total_before)/n)
        self.log.datum(self.sample,'reads discarded as too short after poly-A/adaptor clipping',n_discarded)
        self.log.datum(self.sample,'reads poly-A/adaptor clipped and kept',n_clipped)
        if n_clipped:
            self.log.datum(self.sample,'mean length clipped',float(total_clipped)/n_clipped)
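
The scoring scan in this example is easiest to follow in isolation. Below is a minimal standalone sketch of the same idea (a hypothetical helper, not part of the example above; adaptor is the adaptor sequence and min_quality the quality cutoff character, with the same +1/-4 scoring):

    def find_poly_a_clip(seq, qual, adaptor, min_quality):
        # Hypothetical standalone version of the scan above: return
        # (a_start, a_end, adaptor_bases) for the best-scoring poly(A) run
        # (seq[a_start:a_end]) followed by adaptor sequence.
        best_score = 0
        best_a_start = best_a_end = len(seq)
        best_adaptor_bases = 0
        for a_start in range(len(seq)):
            # Only consider the first base of each A-run as a candidate start.
            if a_start and seq[a_start-1] == 'A':
                continue
            a_end = a_start
            aonly_score = 0
            while True:
                # Score = high-quality As in the run so far, plus adaptor bases
                # matched after it; high-quality mismatches cost 4 points each.
                score = aonly_score
                adaptor_bases = 0
                for i in range(a_end, min(a_end+len(adaptor), len(seq))):
                    if qual[i] >= min_quality:
                        if seq[i] == adaptor[i-a_end]:
                            score += 1
                            adaptor_bases += 1
                        else:
                            score -= 4
                if score > best_score:
                    best_score = score
                    best_a_start = a_start
                    best_a_end = a_end
                    best_adaptor_bases = adaptor_bases
                if a_end >= len(seq):
                    break
                if qual[a_end] >= min_quality:
                    if seq[a_end] == 'A':
                        aonly_score += 1
                    else:
                        aonly_score -= 4
                        if aonly_score <= 0:
                            break
                a_end += 1
        return best_a_start, best_a_end, best_adaptor_bases

As in the example, a read would then be truncated to seq[:a_start] and kept only if the clipped length exceeds the minimum.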
Example n. 15
0
    def run(self):
        assert self.extension is not None, '--extension must be specified'

        #workspace = self.get_workspace()
        workspace = working_directory.Working(self.working_dir,
                                              must_exist=True)
        if self.annotations is None:
            reference = workspace.get_reference()
            annotations_filename = reference.annotations_filename()
        else:
            annotations_filename = self.annotations

        types = [item.lower() for item in self.types.split(',')]

        parts = self.parts or self.types
        parts = [item.lower() for item in parts.split(',')]

        all_annotations = list(
            annotation.read_annotations(annotations_filename))
        annotation.link_up_annotations(all_annotations)
        for item in all_annotations:
            item.primary = None

        annotations = [
            item for item in all_annotations if item.type.lower() in types
        ]

        part_annotations = []
        seen = set()
        queue = [(item, item) for item in annotations]
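        # Depth-first walk of each selected feature and its descendants, collecting
        # the parts (e.g. exons) whose type was requested and pointing each part
        # back at its top-level "primary" feature.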
        while queue:
            primary, item = queue.pop()
            if item.type.lower() in parts:
                assert item.primary is None, "Feature with multiple parents"
                item.primary = primary
                key = (id(primary), item.start, item.end, item.seqid,
                       item.strand)
                # Ignore duplicate exons (many isoforms will have the same exons)
                if key not in seen:
                    seen.add(key)
                    part_annotations.append(item)
            queue.extend((primary, item2) for item2 in item.children)

        del seen
        del all_annotations

        self.log.log('%d annotations\n' % len(annotations))
        self.log.log('%d part annotations\n' % len(part_annotations))

        #assert annotations, 'No annotations of specified types in file'

        for item in part_annotations:
            this_extension = self.extension
            if "max_extension" in item.attr:
                this_extension = min(this_extension,
                                     int(item.attr["max_extension"]))

            if item.strand >= 0:
                item.tail_pos = item.end
                item.end += this_extension
            else:
                item.tail_pos = item.start
                item.start -= this_extension

        for item in annotations:
            item.hits = []  # [ (tail_length, adaptor_bases) ]

        index = span_index.index_annotations(part_annotations)

        for alignment in sam.Bam_reader(workspace /
                                        'alignments_filtered_sorted.bam'):
            if alignment.is_unmapped or alignment.is_secondary or alignment.is_supplementary:
                continue

            start = alignment.reference_start
            end = alignment.reference_end
            alignment_length = end - start
            strand = -1 if alignment.flag & sam.FLAG_REVERSE else 1
            fragment_feature = annotation.Annotation(
                seqid=alignment.reference_name,
                start=start,
                end=end,
                strand=strand)

            if strand >= 0:
                tail_pos = end
            else:
                tail_pos = start

            tail_length = 0
            adaptor_bases = 0
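            # AN:i: and AD:i: are custom tags (presumably attached by the upstream
            # poly(A)/adaptor clipping step) giving the observed tail length and the
            # number of adaptor bases matched after it.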
            for item in alignment.extra:
                if item.startswith('AN:i:'):
                    tail_length = int(item[5:])
                elif item.startswith('AD:i:'):
                    adaptor_bases = int(item[5:])

            hits = index.get(fragment_feature, same_strand=True)
            if hits:
                gene = min(
                    hits,
                    key=lambda gene:
                    (abs(tail_pos - gene.tail_pos), gene.primary.get_id()))
                # Nearest by tail_pos
                # failing that, by id to ensure a deterministic choice

                gene.primary.hits.append((tail_length, adaptor_bases))

        for item in annotations:
            del item.parents
            del item.children
            del item.primary

        f = io.open_possibly_compressed_writer(self.prefix + '.pickle.gz')
        pickle.dump((workspace.name, workspace.get_tags(), annotations), f,
                    pickle.HIGHEST_PROTOCOL)
        f.close()
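
The gzipped pickle written above can be read back for downstream reporting. A minimal sketch, assuming the '.pickle.gz' output really is gzip-compressed and that the annotation classes it references are importable at unpickling time:

    import gzip
    import pickle

    def load_tail_pickle(path):
        # Read back (sample name, tags, annotations) as written above; each
        # annotation carries the hits list of (tail_length, adaptor_bases) tuples.
        with gzip.open(path, 'rb') as f:
            return pickle.load(f)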
Example n. 16
0
     def run(self):
         assert self.extension is not None, '--extension must be specified'
     
         #workspace = self.get_workspace()
         workspace = working_directory.Working(self.working_dir, must_exist=True)
         if self.annotations is None:
             reference = workspace.get_reference()
             annotations_filename = reference.annotations_filename()
         else:
             annotations_filename = self.annotations
         
         types = [ item.lower() for item in self.types.split(',') ]
     
         annotations = [ 
             item 
             for item in annotation.read_annotations(annotations_filename)
             if item.type.lower() in types
         ]
         
         self.log.log('%d annotations\n' % len(annotations))
         
         assert annotations, 'No annotations of specified types in file'
         
         index = { }
         
         for item in annotations:
             if item.strand >= 0:
                 item.tail_pos = item.end
                 item.end += self.extension
             else:
                 item.tail_pos = item.start
                 item.start -= self.extension
         
             if item.seqid not in index:
                 index[item.seqid] = span_index.Span_index()
             index[item.seqid].insert(item)
             
             item.hits = [] # [ (rel_start, rel_end, tail_length, adaptor_bases) ]
         
         for item in index.itervalues(): 
             item.prepare()
         
         for read_name, fragment_alignments, unmapped in sam.bam_iter_fragments(workspace/'alignments_filtered.bam'):
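             # Assign each aligned fragment to the nearest same-strand gene (nearest
             # by tail position) and record its relative extent and tail statistics.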
             for fragment in fragment_alignments:
                 start = min(item.pos-1 for item in fragment)
                 end = max(item.pos+item.length-1 for item in fragment)
                 alignment_length = end-start
                 strand = -1 if fragment[0].flag&sam.FLAG_REVERSE else 1
                 
                 if strand >= 0:
                     tail_pos = end
                 else:
                     tail_pos = start
                 
                 tail_length = 0
                 adaptor_bases = 0
                 for item in fragment[0].extra:
                     if item.startswith('AN:i:'):
                         tail_length = int(item[5:])
                     elif item.startswith('AD:i:'):
                         adaptor_bases = int(item[5:])
                 
                 if fragment[0].rname in index:
                     hits = [ 
                         gene
                         for gene in index[fragment[0].rname].get(start,end)
                         if gene.strand == strand
                         ]                         
                     if hits:
                         gene = min(hits, key=lambda gene: (abs(tail_pos - gene.tail_pos), gene.get_id()))
                             # Nearest by tail_pos
                             # failing that, by id to ensure a deterministic choice
                             
                         if strand > 0:
                             rel_start = start - gene.start
                             rel_end = end - gene.start
                         else:
                             rel_start = gene.end - end
                             rel_end = gene.end - start
                         
                         gene.hits.append( (rel_start,rel_end,tail_length,adaptor_bases) )

         f = io.open_possibly_compressed_writer(self.prefix + '.pickle.gz')
         pickle.dump((workspace.name, workspace.get_tags(), annotations), f, pickle.HIGHEST_PROTOCOL)
         f.close()
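
A hypothetical downstream step, not part of the examples above: once each gene's hits list has been filled in as in Example n. 16, per-gene tail statistics can be summarized, for instance:

    def mean_tail_length(gene, min_adaptor_bases=0):
        # gene.hits holds (rel_start, rel_end, tail_length, adaptor_bases) tuples;
        # optionally require some adaptor evidence before counting a read's tail.
        tails = [hit[2] for hit in gene.hits if hit[3] >= min_adaptor_bases]
        if not tails:
            return None
        return float(sum(tails)) / len(tails)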