Example #1
    def setup(self):
        grace.status('Load depths')
        self.sample_names = [ os.path.split(dirname)[1] for dirname in self.working_dirs ]
        self.workspaces = [ working_directory.Working(dirname, must_exist=True) for dirname in self.working_dirs ]

        self.depths = [ item.get_depths() for item in self.workspaces ]
        #self.depths = list(legion.imap(lambda item: item.get_object('depths.pickle.gz'), self.workspaces, local=True))

        self.any_pairs = any(item.param['any_pairs'] for item in self.workspaces)
        grace.status('')

        lengths = self.workspaces[0].get_reference().get_lengths()
        self.chromosome_names = [ name for name, length in lengths ]
        self.lengths = dict(lengths)

        self.processes = [ ]
Example #2
    def run(self):
        workspace = working_directory.Working(self.output_dir)        
        workspace.setup_reference(self.reference)
        workspace.update_param(snp_cost = self.snp_cost)
        
        #assert os.path.exists(self.reference), 'Reference file does not exist'
        #reference_filename = workspace._object_filename('reference.fa')
        #if os.path.exists(reference_filename):
        #   os.unlink(reference_filename)
        #os.symlink(os.path.relpath(self.reference, self.output_dir), reference_filename)
        
        bam_filename = io.abspath(self.output_dir, 'alignments.bam')
        bam_prefix = io.abspath(self.output_dir, 'alignments')

        if sam.is_bam(self.input):
            sort_input_filename = self.input
            temp_filename = None
        else:
            temp_filename = io.abspath(self.output_dir, 'temp.bam')
            sort_input_filename = temp_filename
            writer = io.Pipe_writer(temp_filename, ['samtools', 'view', '-S', '-b', '-'])
            f = open(self.input, 'rb')
            while True:
                data = f.read(1<<20)
                if not data: break
                writer.write(data)
            writer.close()
            f.close()
        
        grace.status('Sort')
        
        #io.execute([
        #    'samtools', 'sort', '-n', sort_input_filename, bam_prefix
        #])
        sam.sort_bam(sort_input_filename, bam_prefix, by_name=True)
        
        if temp_filename is not None:
            os.unlink(temp_filename)
        
        grace.status('')
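For reference, the SAM-to-BAM conversion and name sort above can be reproduced outside nesoni with plain subprocess calls. This is only a sketch: it assumes the pre-1.0 samtools command line ("samtools sort <in.bam> <out.prefix>") that the commented-out call uses, and the file names are hypothetical.

import subprocess

def sam_to_name_sorted_bam(sam_path, bam_prefix):
    unsorted = bam_prefix + '.unsorted.bam'
    # Convert SAM to BAM by piping the file through "samtools view".
    with open(sam_path, 'rb') as f_in, open(unsorted, 'wb') as f_out:
        subprocess.check_call(['samtools', 'view', '-S', '-b', '-'],
                              stdin=f_in, stdout=f_out)
    # Sort by read name; old-style samtools writes <bam_prefix>.bam.
    subprocess.check_call(['samtools', 'sort', '-n', unsorted, bam_prefix])

# sam_to_name_sorted_bam('input.sam', 'alignments')   # hypothetical paths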
Example #3
    def run(self):
        title1 = self.title1
        title2 = self.title2

        working1 = working_directory.Working(self.working_dir1)
        working2 = working_directory.Working(self.working_dir2)

        cutoff = self.cutoff

        sequence_names = [
            name for name, length in working1.get_reference().get_lengths()
        ]

        if title1 is None:
            title1 = working1.name
        if title2 is None:
            title2 = working2.name

        n = 1
        while significance([('A', n)], [('T', n)], 1.0) > cutoff:
            n += 1

        f = open(self.prefix + '.txt', 'wb')
        print >> f, '%g\tsignificance cutoff' % cutoff
        print >> f, '%d\tdepth required to call substitution (greater if there are errors in the reads)' % n

        print >> f, 'Sequence\tPosition in reference\tChange type\tReference\t%s\t%s\tp-value (no correction for multiple testing)\t%s\t%s' % (
            title1, title2, title1, title2)

        for sequence_name in sequence_names:
            filename1 = working1 / (
                grace.filesystem_friendly_name(sequence_name) +
                '-evidence.txt')
            filename2 = working2 / (
                grace.filesystem_friendly_name(sequence_name) +
                '-evidence.txt')

            for (pos1, ins1, sub1, ref1, conins1,
                 consub1), (pos2, ins2, sub2, ref2, conins2,
                            consub2) in itertools.izip(read_file(filename1),
                                                       read_file(filename2)):
                assert pos1 == pos2 and ref1 == ref2

                if pos1 % 1000 == 0:
                    grace.status('Testing %s %d' % (sequence_name, pos1))

                dec_ins1 = io.decode_evidence(ins1)
                dec_ins2 = io.decode_evidence(ins2)
                if dec_ins1 and dec_ins2:
                    sig = significance(dec_ins1, dec_ins2, cutoff)
                    if sig is not None and sig <= cutoff:
                        print >> f, '%s\t%d\t%s\t\t%s\t%s\t%g\t%s\t%s' % (
                            sequence_name, pos1, 'insertion-before', ins1,
                            ins2, sig, conins1, conins2)
                        f.flush()

                dec_sub1 = io.decode_evidence(sub1)
                dec_sub2 = io.decode_evidence(sub2)
                if dec_sub1 and dec_sub2:
                    sig = significance(dec_sub1, dec_sub2, cutoff)
                    if sig is not None and sig <= cutoff:
                        if dec_sub1[0][0] == '-' or dec_sub2[0][0] == '-':
                            what = 'deletion'
                        elif dec_sub1[0][0] != dec_sub2[0][0]:
                            what = 'substitution'
                        else:
                            what = 'different mix'
                        print >> f, '%s\t%d\t%s\t%s\t%s\t%s\t%g\t%s\t%s' % (
                            sequence_name, pos1, what, ref1, sub1, sub2, sig,
                            consub1, consub2)
                        f.flush()

        f.close()

        grace.status('')
        return 0
Example #4
def nway_main(gbk_filename, use_indels, use_reference, give_evidence, give_consequences,
              require_all, require_bisect, full_output, format, working_dirs, split_a, split_b, f=sys.stdout):
    assert working_dirs, 'Need at least one working directory.'
    workspaces = [ working_directory.Working(dirname, must_exist=True) for dirname in working_dirs ]
    reference = workspaces[0].get_reference()
    #if not annotation_filename:
    #    annotation_filename = reference.annotations_filename() #May still be None
    
    if use_reference:
        names = ['reference']
        evidence_start = 1
    else:
        names = [ ]
        evidence_start = 0
        
    names.extend( norm_name(item) for item in working_dirs )
    
    references = io.read_sequences(reference.reference_fasta_filename())
    
    annotations = { }
    if gbk_filename:
        from Bio import SeqIO
        for record in SeqIO.parse(io.open_possibly_compressed_file(gbk_filename),'genbank'):
            sequence = record.seq.tostring()
            features = [ item for item in record.features if item.type != 'source' ]
            features.sort(key=lambda item: item.location.nofuzzy_start)
            annotations[sequence] = features
    
    iterator = reader(working_dirs, references, use_reference, annotations)
    
    if not use_indels:
        iterator = itertools.ifilter(has_no_indels, iterator)

    if require_all or require_bisect or format == 'counts':
        iterator = itertools.ifilter(fully_unambiguous, iterator)
    
    if require_bisect:
        iterator = itertools.ifilter(is_binary_partition, iterator)

    if not require_bisect:
        if full_output:
            iterator = itertools.ifilter(not_boring_insertion, iterator)
        else:
            iterator = itertools.ifilter(is_interesting, iterator)

    if split_a or split_b:
        assert len(names) == len(set(names)), 'Two samples with the same name'
        try:
            split_a = [ names.index(norm_name(item)) for item in split_a ]
            split_b = [ names.index(norm_name(item)) for item in split_b ]
        except ValueError:
            raise grace.Error('Sample to be split is not amongst samples given')
        iterator = itertools.ifilter(is_split(split_a, split_b), iterator)

    #if limit:
    #    iterator = itertools.islice(iterator, limit)
    
    if format == 'table':
        line = 'Reference\tPosition\tChange type'
        line +=  '\t' + '\t'.join(names)
        if give_evidence:
            line += '\t' + '\t'.join(names[evidence_start:])
        if give_consequences:
            line += '\t' + '\t'.join(names[evidence_start:])
        if annotations:
            line += '\tAnnotations'
        print >> f, line
        for calls in iterator:
            line = '%s\t%d\t%s\t%s' % (
                calls.ref_name, 
                calls.ref_pos+1, 
                change_type(calls), 
                '\t'.join(item.consensus for item in calls.calls))
            if give_evidence:
                line += '\t' + '\t'.join(item.evidence for item in calls.calls[evidence_start:])
            if give_consequences:
                line += '\t' + '\t'.join(item.consequences for item in calls.calls[evidence_start:])
            if annotations:
                line += '\t' + describe_features(calls.features)
            print >> f, line

    elif format == 'compact':
        for line in transpose_strings(names):
            print >> f, line
        print >> f
        
        for calls in iterator:
            if calls.is_insertion:
                footer = '%12d.5 %s' % (calls.ref_pos, calls.ref_name)
            else: 
                footer = '%12d   %s' % (calls.ref_pos+1, calls.ref_name)
            
            t = transpose_strings([ item.consensus for item in calls.calls ], '-', 1)
            top = t[0] + ' ' + footer
            if give_consequences:
                consequences = [ ]
                for call in calls.calls:
                    if call.consequences:
                        for item in call.consequences.split(', '):
                            item = ' '.join(item.split()[:3])
                            if item not in consequences: consequences.append(item)
                        
                if consequences:
                    top += '  ' + ' / '.join(sorted(consequences))
            top += '  ' + describe_features(calls.features)
            print >> f, top
            for line in t[1:]:
                print >> f, line            
    
    elif format == 'nexus':
        buckets = [ [ ] for name in names ]
        for calls in iterator:
            for i, char in enumerate(partition_string(calls)):
                buckets[i].append(char)
        
        print >> f, '#NEXUS'
        print >> f, 'begin taxa;'
        print >> f, 'dimensions ntax=%d;' % len(names)
        print >> f, 'taxlabels'
        for name in names:
            print >> f, name
        print >> f, ';'
        print >> f, 'end;'

        print >> f, 'begin characters;'
        print >> f, 'dimensions nchar=%d;' % len(buckets[0])
        print >> f, 'format datatype=STANDARD symbols="ACGT-0123456789" missing=N;'
        print >> f, 'matrix'
        for name, bucket in itertools.izip(names, buckets):
            print >> f, name, ''.join(bucket)
        print >> f, ';'
        print >> f, 'end;'
    
    elif format == 'counts':
        for line in transpose_strings(names):
            print >> f, line
        print >> f

        counts = { }
        for calls in iterator:
            count_str = partition_string(calls)
            if count_str not in counts:
                counts[count_str] = 1
            else:
                counts[count_str] += 1
        
        for count_str in sorted(counts, key=lambda x: (counts[x], x), reverse=True):
            print >> f, '%s   %d' % (transpose_strings(count_str)[0], counts[count_str])
    
    else:
        raise grace.Error('Unknown output format: ' + format)
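A hedged usage sketch for nway_main, based only on the signature and the format branches above; the working directory names are hypothetical and the argument values are illustrative, not defaults.

import sys

nway_main(
    gbk_filename=None,          # optionally a GenBank file, parsed as above
    use_indels=True,
    use_reference=True,
    give_evidence=True,
    give_consequences=False,
    require_all=False,
    require_bisect=False,
    full_output=False,
    format='table',             # 'table', 'compact', 'nexus' or 'counts'
    working_dirs=['sample1', 'sample2'],   # hypothetical nesoni working directories
    split_a=[],
    split_b=[],
    f=sys.stdout,
)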
Example #5
    def run(self):
        #mincov, args = grace.get_option_value(args, '--mincov', int, 1)
        #maxdiff, args = grace.get_option_value(args, '--maxdiff', int, 16)
        #minsize, args = grace.get_option_value(args, '--minsize', int, 200)
        #what, args = grace.get_option_value(args, '--what', as_core_or_unique, 'core')
        #is_core = (what == 'core')
        #
        #grace.expect_no_further_options(args)
        #
        #if len(args) < 2:
        #    print >> sys.stderr, HELP
        #    raise grace.Help_shown()
        #
        #output_dir, working_dirs = args[0], args[1:]
        #
        ##assert not path.exists(path.join(output_dir, 'reference.fa')), \
        #assert not path.exists(path.join(output_dir, 'parameters')), \
        #        'Output directory not given'
        #
        #if not path.exists(output_dir):
        #    os.mkdir(output_dir)

        assert self.what in (
            'core',
            'unique'), 'Expected --what to be either "core" or "unique".'
        is_core = (self.what == 'core')

        workspace = self.get_workspace()

        for name, seq in io.read_sequences(
                working_directory.Working(self.working_dirs[0]).get_reference(
                ).reference_fasta_filename()):
            self.log.log(name + '\n')
            friendly_name = grace.filesystem_friendly_name(name)

            good = [True] * len(seq)

            for working_dir in self.working_dirs:
                if is_core:
                    suffix = '-depth.userplot'
                else:
                    suffix = '-ambiguous-depth.userplot'
                data = trivia.read_unstranded_userplot(
                    os.path.join(working_dir, friendly_name + suffix))
                assert len(seq) == len(data)
                for i in xrange(len(seq)):
                    if good[i]:
                        if is_core:
                            good[i] = data[i] >= self.mincov
                        else:
                            good[i] = data[i] < self.mincov

            #Close holes
            start = -self.maxdiff - 1
            n_holes = 0
            for i in xrange(len(seq)):
                if good[i]:
                    if 0 < i - start <= self.maxdiff:
                        for j in xrange(start, i):
                            good[j] = True
                        n_holes += 1
                    start = i + 1
            self.log.log('Closed ' + grace.pretty_number(n_holes) + ' holes\n')

            f = open(workspace / ('%s-%s.fa' % (friendly_name, self.what)),
                     'wb')
            io.write_fasta(
                f, name, ''.join([(seq[i] if good[i] else 'N')
                                  for i in xrange(len(seq))]))
            f.close()

            f = open(
                workspace / ('%s-%s_masked.fa' % (friendly_name, self.what)),
                'wb')
            io.write_fasta(
                f, name, ''.join([(seq[i] if good[i] else seq[i].lower())
                                  for i in xrange(len(seq))]))
            f.close()

            f_good = open(
                workspace / ('%s-%s_parts.fa' % (friendly_name, self.what)),
                'wb')
            f_nongood = open(
                workspace / ('%s-non%s_parts.fa' % (friendly_name, self.what)),
                'wb')
            start = 0
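            # One-element lists so that emit() below can update these
            # counts (Python 2 has no "nonlocal" statement).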
            n_good = [0]
            n_good_bases = [0]

            def emit(i):
                if i - start < self.minsize: return
                if good[start]:
                    n_good[0] += 1
                    n_good_bases[0] += i - start
                io.write_fasta(f_good if good[start] else f_nongood,
                               '%s:%d..%d' % (name, start + 1, i),
                               seq[start:i])

            for i in xrange(1, len(seq)):
                if good[i] != good[start]:
                    emit(i)
                    start = i
            emit(len(seq))
            f_nongood.close()
            f_good.close()

            self.log.log(
                grace.pretty_number(sum(good)) + ' bases are ' + self.what +
                ', of ' + grace.pretty_number(len(seq)) +
                ' in reference sequence\n')
            self.log.log(
                grace.pretty_number(n_good[0]) + ' parts at least ' +
                grace.pretty_number(self.minsize) + ' bases long with ' +
                grace.pretty_number(n_good_bases[0]) + ' total bases\n')
            self.log.log('\n')
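The hole-closing step above is self-contained enough to be worth isolating. The following standalone sketch (not part of the library) mirrors that loop: any run of non-good positions no longer than max_diff that is flanked by good positions on both sides is marked good.

def close_holes(good, max_diff):
    good = list(good)
    start = -max_diff - 1      # position just past the previous good base
    n_holes = 0
    for i in xrange(len(good)):
        if good[i]:
            if 0 < i - start <= max_diff:
                # The gap since the last good base is small: fill it in.
                for j in xrange(start, i):
                    good[j] = True
                n_holes += 1
            start = i + 1
    return good, n_holes

# close_holes([True, False, False, True], 2) -> ([True, True, True, True], 1)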
Example #6
    def run(self):
        bams = []
        reference = None
        reference2 = None

        extra = []

        for sample in self.samples:
            if sam.is_bam(sample):
                bams.append(sample)
            elif os.path.isdir(sample):
                working = working_directory.Working(sample, True)
                bams.append(working.get_filtered_sorted_bam())
                extra.append('##sampleTags=' + ','.join(working.get_tags()))
                if reference2 is None:
                    reference2 = working.get_reference(
                    ).reference_fasta_filename()
            elif io.is_sequence_file(sample):
                assert reference is None, 'Only one reference FASTA file allowed.'
                reference = sample

        if reference is None:
            reference = reference2
        if reference is None:
            raise grace.Error('No reference FASTA file given.')

        with nesoni.Stage() as stage:
            tempspace = stage.enter(workspace.tempspace())
            if self.depth_limit:
                with nesoni.Stage() as stage2:
                    for i in xrange(len(bams)):
                        sam.Bam_depth_limit(
                            tempspace / ('%d' % i),
                            bams[i],
                            depth=self.depth_limit).process_make(stage2)
                        bams[i] = tempspace / ('%d.bam' % i)

            # FreeBayes claims to handle multiple bams, but it doesn't actually work
            if len(bams) > 1:
                sam.Bam_merge(tempspace / 'merged', bams=bams,
                              index=False).run()
                bams = [tempspace / 'merged.bam']

            command = [
                'freebayes',
                '-f',
                reference,
                '--ploidy',
                str(self.ploidy),
                '--pvar',
                str(self.pvar),
            ] + self.freebayes_options + bams

            self.log.log('Running: ' + ' '.join(command) + '\n')

            f_out = stage.enter(open(self.prefix + '.vcf', 'wb'))
            f_in = stage.enter(io.pipe_from(command))
            done_extra = False
            for line in f_in:
                if not done_extra and not line.startswith('##'):
                    for extra_line in extra:
                        f_out.write(extra_line + '\n')
                    done_extra = True
                f_out.write(line)

        index_vcf(self.prefix + '.vcf')
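Roughly, the step above amounts to the following FreeBayes invocation, shown here as a plain subprocess call. This is a sketch only: the depth limiting, BAM merging and nesoni Stage/pipe handling are omitted, and the file names and parameter values are hypothetical.

import subprocess

command = [
    'freebayes',
    '-f', 'reference.fa',      # hypothetical reference FASTA
    '--ploidy', '1',
    '--pvar', '0.0001',
    'merged.bam',              # hypothetical (merged) BAM file
]
with open('calls.vcf', 'wb') as f_out:
    subprocess.check_call(command, stdout=f_out)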
Example #7
def count_run(min_score, min_size, max_size, filter_mode, equalize, types,
              locii, qualifiers, use_strand, merge_filename, limit,
              output_prefix, filenames, log):

    if filter_mode == 'poly':
        use_bam_filename = 'alignments.bam'
        use_only_top = True
        use_only_monogamous = False
        expect_multiple_alignments = True
    elif filter_mode == 'mono':
        use_bam_filename = 'alignments.bam'
        use_only_top = True
        use_only_monogamous = True
        expect_multiple_alignments = True
    else:
        assert filter_mode == 'existing', 'Unrecognized filtering mode'
        use_bam_filename = 'alignments_filtered.bam'
        use_only_top = False
        use_only_monogamous = False
        expect_multiple_alignments = False

    types = types.lower().split(',')

    qualifiers = qualifiers.split(',')

    if locii:
        locii = locii.lower().split(',')
    else:
        locii = None

    assert use_strand is not None, 'You must now explicitly specify --strand'
    assert use_strand in ('pool', 'forward', 'reverse',
                          'both'), "Can't understand --strand specification."

    from Bio import Seq, SeqIO

    annotation_filenames = []
    bam_filenames = []
    for arg in filenames:
        if annotation.is_annotation_file(arg):
            annotation_filenames.append(arg)
        else:
            bam_filenames.append(arg)

    n_samples = len(bam_filenames)
    titles = bam_filenames[:]
    for i in xrange(len(bam_filenames)):
        if os.path.isdir(bam_filenames[i]):
            titles[i] = os.path.basename(bam_filenames[i])
            if not annotation_filenames:
                working = working_directory.Working(bam_filenames[i])
                reference_filename = working.get_reference(
                ).annotations_filename()
                if reference_filename is not None:
                    annotation_filenames.append(reference_filename)

            bam_filenames[i] = os.path.join(bam_filenames[i], use_bam_filename)

    assert bam_filenames, 'No reference alignments given'

    merge = {}
    merge_qualifiers = {}
    if merge_filename is not None:
        #First line gives qualifiers
        #remaining lines give <qualifier> <qualifier...> <gene> <transcript> <transcript...>

        f = open(merge_filename, 'rU')
        qualifiers = f.readline().rstrip('\n').split('\t')
        for line in f:
            parts = line.rstrip('\n').split('\t')
            if not parts: continue
            for name in parts[len(qualifiers) + 1:]:
                assert name not in merge, 'Duplicate feature name in merge file'
                merge[name] = parts[len(qualifiers)]
                merge_qualifiers[name] = parts[:len(qualifiers)]
        f.close()

    genes = {}  # reference name -> gene index

    feature_names = {}  # feature_name -> number of occurrences

    features = []

    n_features = 0

    chromosome_length = {}
    for filename in bam_filenames:
        headers = sam.bam_headers(filename)
        for line in headers.split('\n'):
            if not line: continue
            parts = line.split('\t')
            if parts[0] != '@SQ': continue

            name = None
            length = None
            for part in parts[1:]:
                if part.startswith('SN:'): name = part[3:]
                if part.startswith('LN:'): length = int(part[3:])
            assert name is not None and length is not None

            if name in chromosome_length:
                assert chromosome_length[name] == length
            else:
                chromosome_length[name] = length

    for name in chromosome_length:
        genes[name] = span_index.Span_index()

    if annotation_filenames:
        assert not merge, 'Merging not supported with annotation files'

        for filename in annotation_filenames:
            for feature in annotation.read_annotations(filename):
                if feature.type.lower() not in types: continue

                if (locii is not None and
                    ('locus_tag' not in feature.attr
                     or feature.attr['locus_tag'].lower() not in locii)):
                    continue

                f = Feature(n_samples)
                f.name = feature.get_id()
                if feature.type.lower() != 'cds' and len(types) > 1:
                    f.name = feature.type + ':' + f.name

                feature_names[f.name] = feature_names.get(f.name, 0) + 1
                if feature_names[f.name] > 1:
                    f.name += '/%d' % feature_names[f.name]

                f.qualifiers = [
                    feature.attr.get(item, '') for item in qualifiers
                ]

                f.length = feature.end - feature.start

                assert feature.seqid in genes, 'Annotation for sequence that is not in BAM files'
                genes[feature.seqid].insert(
                    Span_entry(feature.start, feature.end, feature.strand or 1,
                               f))
                features.append(f)

    else:
        # Sequences as features
        log.log(
            'No annotation files given or found, using sequences as features\n'
        )

        name_feature = {}  # (merged)name -> feature

        for name in chromosome_length:
            merged_name = merge.get(name, name)

            if merged_name not in name_feature:
                f = Feature(n_samples)
                f.name = merged_name
                f.length = chromosome_length[name]
                f.qualifiers = merge_qualifiers.get(name,
                                                    ('', ) * len(qualifiers))
                n_features += 1
                name_feature[merged_name] = f
                features.append(f)
            else:
                f = name_feature[merged_name]
                f.length = max(f.length, chromosome_length[name])  #...

            genes[name].insert(Span_entry(0, chromosome_length[name], 1, f))

    log.log('%d features\n\n' % len(features))

    for name in genes:
        genes[name].prepare()

    n_fragments = [0] * n_samples
    n_fragments_aligned = [0] * n_samples
    n_low_score = [0] * n_samples
    n_something = [0] * n_samples
    n_multiple = [0] * n_samples
    n_span = [0] * n_samples

    for i in xrange(n_samples):
        for read_name, fragment_alignments, unmapped in sam.bam_iter_fragments(
                bam_filenames[i],
                'Counting sample %d of %d' % (i + 1, n_samples)):
            n_fragments[i] += 1

            if not fragment_alignments:
                continue

            n_fragments_aligned[i] += 1

            feature_hits = []  # [ [ (feature, strand) ] ]

            # Use only top scoring alignments
            fragment_scores = [
                sum(al.get_AS() for al in item) for item in fragment_alignments
            ]

            best_score = max(fragment_scores)

            if min_score is not None and best_score < min_score:
                n_low_score[i] += 1
                continue

            if use_only_top:
                cutoff = max(best_score, min_score)
            else:
                cutoff = min_score
            fragment_alignments = [
                item
                for item, score in zip(fragment_alignments, fragment_scores)
                if score >= cutoff
            ]

            for alignments in fragment_alignments:
                strand = -1 if alignments[0].flag & sam.FLAG_REVERSE else 1

                start = min(item.pos - 1 for item in alignments)
                end = max(item.pos + item.length - 1 for item in alignments)
                length = end - start
                if min_size is not None and length < min_size: continue
                if max_size is not None and length > max_size: continue

                rname = alignments[0].rname
                strand = -1 if alignments[0].flag & sam.FLAG_REVERSE else 1
                assert rname in genes, 'Alignment refers to sequence not present in GENBANK file'

                this_feature_hits = []
                for item in genes[rname].get(start, end):
                    rel_strand = strand * item.strand
                    key = (item.feature, rel_strand)
                    if key in this_feature_hits: continue
                    this_feature_hits.append(key)
                    if not use_only_monogamous or len(
                            fragment_alignments) == 1:
                        item.feature.count[rel_strand][i] += 1

                if this_feature_hits:
                    feature_hits.append(this_feature_hits)

                if len(this_feature_hits) > 1:
                    for a in this_feature_hits:
                        for b in this_feature_hits:
                            if a[0] is b[0]: continue
                            a[0].common[(a[1], b[1])][b[0]] += 1

            if len(feature_hits) > 0:
                n_something[i] += 1
            #else:
            #    print fragment_alignments
            #    print genes[fragment_alignments[0][0].rname].indexes
            #    print

            if len(feature_hits) > 1:
                n_multiple[i] += 1
                for j in xrange(len(feature_hits)):
                    for k in xrange(len(feature_hits)):
                        if j == k: continue
                        for a in feature_hits[j]:
                            for b in feature_hits[k]:
                                if a[0] is b[0]: continue
                                a[0].ambiguous[(a[1], b[1])][b[0]] += 1

            if any(len(item) > 1 for item in feature_hits): n_span[i] += 1

            if limit is not None and n_fragments[i] >= limit: break

        grace.status('')

        #log.log('%s\n' % titles[i])
        #log.log('%20s fragments\n' % grace.pretty_number(n_fragments[i]))
        #log.log('%20s fragments aligned to the reference\n' % grace.pretty_number(n_fragments_aligned[i]))
        #if n_low_score[i]:
        #    log.log('%20s had too low an alignment score, discarded\n' % grace.pretty_number(n_low_score[i]))
        #log.log('%20s aligned to an annotated gene\n' % grace.pretty_number(n_something[i]))
        #if expect_multiple_alignments or n_multiple[i]:
        #    log.log('%20s aligned to multiple genes\n' % grace.pretty_number(n_multiple[i]))
        #log.log('%20s had an alignment that spanned multiple genes\n' % grace.pretty_number(n_span[i]))
        #log.log('\n')

        log.datum(titles[i], 'fragments', n_fragments[i])
        log.datum(titles[i], 'fragments aligned to the reference',
                  n_fragments_aligned[i])
        if n_low_score[i]:
            log.datum(titles[i], 'had too low an alignment score, discarded',
                      n_low_score[i])
        log.datum(titles[i], 'aligned to an annotated gene', n_something[i])
        if expect_multiple_alignments or n_multiple[i]:
            log.datum(titles[i], 'aligned to multiple genes', n_multiple[i])
        log.datum(titles[i], 'had an alignment that spanned multiple genes',
                  n_span[i])
        log.log('\n')

    strandedness = []
    for feature in features:
        n_forward = sum(feature.count[1])
        n_reverse = sum(feature.count[-1])
        if n_forward + n_reverse < 5: continue
        strandedness.append(
            (n_forward - n_reverse) * 100.0 / (n_forward + n_reverse))
    strandedness = sum(strandedness) / len(strandedness)
    log.log(
        'Strand specificity: %.0f%%\n'
        '  (~ -100%% reverse strand, ~ 0%% non-specific, ~ 100%% forward strand\n'
        '   Average over all features with at least 5 hits.)\n' % strandedness)

    if use_strand == 'pool':
        getters = [
            lambda feature: (
                feature.name,
                add_lists(feature.count[1], feature.count[-1]),
                add_defdicts(feature.common[(1, 1)], feature.common[(1, -1)],
                             feature.common[(-1, 1)], feature.common[(-1, -1)]),
                add_defdicts(feature.ambiguous[(1, 1)], feature.ambiguous[(1, -1)],
                             feature.ambiguous[(-1, 1)], feature.ambiguous[(-1, -1)]))
        ]
    elif use_strand == 'forward':
        getters = [
            lambda feature: (feature.name, feature.count[1],
                             feature.common[(1, 1)], feature.ambiguous[(1, 1)])
        ]
    elif use_strand == 'reverse':
        getters = [
            lambda feature: (feature.name, feature.count[-1],
                             feature.common[(-1, -1)], feature.ambiguous[(-1, -1)])
        ]
    elif use_strand == 'both':
        getters = [
            lambda feature: (feature.name, feature.count[1],
                             feature.common[(1, 1)], feature.ambiguous[(1, 1)]),
            lambda feature: (feature.name + 'r', feature.count[-1],
                             feature.common[(-1, -1)], feature.ambiguous[(-1, -1)])
        ]

    total_hits = [0] * n_samples
    for feature in features:
        for getter in getters:
            total_hits = add_lists(total_hits, getter(feature)[1])

    if equalize:
        min_hits = min(total_hits)
        p = [float(min_hits) / item for item in total_hits]
        total_hits = [min_hits] * n_samples

    f = open(output_prefix + '.txt', 'wb')
    #log.attach(open(output_prefix + '_log.txt', 'wb'))

    print >> f, tab_encode(
        ['Feature'] + titles + ['RPKM ' + item for item in titles] +
        ['Length'] + qualifiers + ['On same fragment'] +
        (['Ambiguous alignment'] if expect_multiple_alignments else []))

    for feature in features:
        for getter in getters:
            feature_name, count, common, ambiguous = getter(feature)

            if equalize:
                count = [subsample(count[i], p[i]) for i in xrange(n_samples)]

            rpkm = [
                count[i] * 1.0e9 / feature.length / total_hits[i]
                for i in xrange(n_samples)
            ]

            common_str = ' '.join(
                '%dx%s' % (item[1], item[0]) for item in sorted(
                    common.items(), key=lambda item: item[1], reverse=True))
            ambiguous_str = ' '.join(
                '%dx%s' % (item[1], item[0]) for item in sorted(
                    ambiguous.items(), key=lambda item: item[1], reverse=True))

            print >> f, tab_encode(
                [feature_name] + [str(item) for item in count] +
                ['%.2f' % item for item in rpkm] + [str(feature.length)] +
                list(feature.qualifiers) + [common_str] +
                ([ambiguous_str] if expect_multiple_alignments else []))

    f.close()
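A hedged usage sketch for count_run, based only on the parameters and assertions in the function above. The paths are hypothetical, and my_log stands in for a nesoni-style logger providing the log() and datum() methods used in the body.

count_run(
    min_score=None,
    min_size=None,
    max_size=None,
    filter_mode='existing',     # 'poly', 'mono' or 'existing'
    equalize=False,
    types='CDS',
    locii=None,
    qualifiers='locus_tag,product',
    use_strand='pool',          # 'pool', 'forward', 'reverse' or 'both'
    merge_filename=None,
    limit=None,
    output_prefix='counts',
    filenames=['sample1', 'sample2'],   # working directories and/or annotation files
    log=my_log,
)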
Example #8
    def run(self):
        assert self.reads, 'No read files given.'
        colorspace = [ io.is_colorspace(item) for item in self.reads ]
        assert len(set(colorspace)) == 1, 'Mixture of colorspace and basespace reads is not currently supported.'
        colorspace = colorspace[0]
        
        #polya_dir = self.get_polya_dir()
    
        working = working_directory.Working(self.output_dir, must_exist=False)
        working.set_reference(self.reference)
        reference = working.get_reference()
        
        #polya_working = working_directory.Working(polya_dir, must_exist=False)
        #polya_working.set_reference(self.reference)
        
        clipped_prefix = working/'clipped_reads'
        clipped_filename = clipped_prefix+('.csfastq.gz' if colorspace else '.fastq.gz')
        
        raw_filename = working/'alignments_raw.sam.gz'
        extended_filename = working/'alignments_extended.sam.gz'
        
        #polya_filename = working/'alignments_filtered_polyA.sam.gz'

        if colorspace:
            self.clip_runs_colorspace(
                filenames=self.reads,
                prefix=clipped_prefix,
                sample=working.name,
            ).make()
        else:
            self.clip_runs_basespace(
                filenames=self.reads,
                prefix=clipped_prefix,
                sample=working.name,
            ).make()        

        cores = min(nesoni.coordinator().get_cores(), 8)
        
        if colorspace:
            nesoni.Execute(
                command = reference.shrimp_command(cs=colorspace, parameters=[ clipped_filename ]) + [ '--qv-offset', '33' ],
                execution_options = [ '-N', str(cores) ],
                output=raw_filename,
                cores=cores,
                prefix=working/'run_alignment'
                ).make()
        
        else:
            nesoni.Execute(
                command = [ 
                    'bowtie2', 
                    '--rg-id', '1',
                    '--rg', 'SM:'+working.name,
                    '--sensitive-local',
                    '-k', '10', #Up to 10 alignments per read
                    '-x', reference.get_bowtie_index_prefix(),
                    '-U', clipped_filename,
                    ],
                execution_options = [ '--threads', str(cores) ],
                output=raw_filename,
                cores=cores,
                prefix=working/'run_alignment'
                ).make()
                
        if colorspace:
            extend_sam.Extend_sam_colorspace(
                input=raw_filename,
                output=extended_filename,
                reads=self.reads,
                reference_filenames=[ reference.reference_fasta_filename() ],
            ).make()
        else:    
            extend_sam.Extend_sam_basespace(
                input=raw_filename,
                output=extended_filename,
                clips=[ clipped_prefix+'.clips.gz' ],
                reference_filenames=[ reference.reference_fasta_filename() ],
                prop_a = self.extension_prop_a
            ).make()
        
        nesoni.Import(
            input=extended_filename,
            output_dir=self.output_dir,
            reference=[ self.reference ],
        ).make()
        
        self.get_filter_action().make()

        #Tail_only(
        #    input=working/'alignments_filtered.bam',
        #    output=polya_filename,
        #).make()
        
        #nesoni.Import(
        #    input=polya_filename,
        #    output_dir=polya_dir,
        #    reference=[ self.reference ],
        #).make()
        
        # This shouldn't actually filter out any alignments.
        # We do it to produce depth of coverage plots
        # and position-sorted BAM files.
        #self.get_polya_filter_action().make()
        
        nesoni.Tag(self.output_dir, tags=self.tags).make()
        #nesoni.Tag(polya_dir, tags=self.tags).make()
        
        if self.delete_files:
            # Delete unneeded files
            os.unlink(clipped_prefix+'.state')
            os.unlink(clipped_filename)
            os.unlink(working/'alignments.bam')
            os.unlink(working/'alignments_filtered.bam')
            os.unlink(working/'run_alignment.state')
            os.unlink(raw_filename)
            os.unlink(extended_filename)
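For the basespace branch, the command handed to nesoni.Execute above corresponds roughly to the following direct bowtie2 call. A sketch only: the index prefix, read file and sample name are hypothetical, and nesoni's output/state handling is not reproduced (here the SAM output simply goes to a file).

import subprocess

with open('alignments_raw.sam', 'wb') as f_out:
    subprocess.check_call([
        'bowtie2',
        '--rg-id', '1',
        '--rg', 'SM:sample1',            # hypothetical sample name
        '--sensitive-local',
        '-k', '10',                      # up to 10 alignments per read
        '--threads', '8',
        '-x', 'reference/bowtie_index',  # hypothetical bowtie2 index prefix
        '-U', 'clipped_reads.fastq.gz',  # hypothetical clipped reads
    ], stdout=f_out)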
Example #9
    def run(self):
        assert self.extension is not None, '--extension must be specified'

        # Also allow simply the analyse-polya-batch directory
        working_dirs = []
        for item in self.working_dirs:
            state_filename = os.path.join(item, 'analyse-polya-batch.state')
            if not os.path.exists(state_filename):
                working_dirs.append(item)
            else:
                with open(state_filename, 'rb') as f:
                    state = pickle.load(f)

                for sample in state.samples:
                    working_dirs.append(
                        os.path.join(item, 'samples', sample.output_dir))

        work = self.get_workspace()

        if self.reuse:
            pickle_workspace = workspace.Workspace(
                os.path.join(self.reuse, 'pickles'))
        else:
            pickle_workspace = workspace.Workspace(work / 'pickles')
        plot_workspace = workspace.Workspace(work / 'plots')

        pickle_filenames = []

        file_prefix = self.file_prefix
        if file_prefix and not file_prefix.endswith('-'):
            file_prefix += '-'

        with nesoni.Stage() as stage:
            for dir in working_dirs:
                working = working_directory.Working(dir, must_exist=True)
                pickle_filenames.append(pickle_workspace / working.name +
                                        '.pickle.gz')
                if self.reuse: continue
                Tail_count(
                    pickle_workspace / working.name,
                    working_dir=dir,
                    annotations=self.annotations,
                    types=self.types,
                    parts=self.parts,
                    extension=self.extension,
                ).process_make(stage)

        assert len(set(pickle_filenames)) == len(
            pickle_filenames), "Duplicate sample name."

        with nesoni.Stage() as stage:
            Aggregate_tail_counts(output_dir=self.output_dir,
                                  pickles=pickle_filenames,
                                  tail=self.tail,
                                  adaptor=self.adaptor).process_make(stage)

        nesoni.Norm_from_counts(
            prefix=work / 'norm',
            counts_filename=work / 'counts.csv',
        ).make()

        similarity = nesoni.Similarity(
            prefix=plot_workspace / 'similarity',
            counts=work / 'counts.csv',
        )

        plot_pooleds = [
            Plot_pooled(
                prefix=plot_workspace / 'pooled-heatmap',
                aggregate=self.output_dir,
                #min_tails = min_tails,
                min_tails=1,
                top=100,
            )
            #for min_tails in (20,50,100,200,500,1000,2000)
        ]

        #plot_comparisons = [
        #    Plot_comparison(
        #        prefix = plot_workspace/('comparison-min-tails-%d-min-span-%.1f' % (min_tails,min_span)),
        #        aggregate = self.output_dir,
        #        min_tails = min_tails,
        #        min_span = min_span,
        #        )
        #    for min_tails in [50,100,200,500]
        #    for min_span in [2,4,8,10,15,20,25,30]
        #    ]
        #
        heatmaps = [
            nesoni.Heatmap(
                prefix=plot_workspace / ('heatmap-min-fold-%.1f' % fold),
                counts=work / 'counts.csv',
                norm_file=work / 'norm.csv',
                min_span=math.log(fold) / math.log(2.0),
            ) for fold in [1.5, 2.0, 4.0, 6.0, 8.0, 10.0, 20.0, 30.0, 40.0]
        ]

        with nesoni.Stage() as stage:
            similarity.process_make(stage)
            for action in plot_pooleds + heatmaps:  #+ plot_comparisons:
                action.process_make(stage)

        r = reporting.Reporter(
            work / 'report',
            self.title,
            file_prefix,
            style=web.style(),
        )

        similarity.report(r)

        r.heading('Poly(A) tail length distribution')

        r.p('This plot shows the distribution of lengths of poly(A) tail sequence in top expressed features. '
            'Its main purpose is to assess data quality. '
            'If the plot has many bright spots there may be many identical reads, possibly due to non-random digestion.'
            )

        r.p('Only reads with a poly(A) sequence of four or more bases are used.'
            )

        for heatmap in plot_pooleds:
            r.report_heatmap(heatmap)

        r.heading('Heatmaps')

        r.p('Genes were selected based '
            'on there being at least some fold change difference between '
            'some pair of samples.')

        for heatmap in heatmaps:
            r.report_heatmap(heatmap)

        #r.heading('Average poly(A) tail length and its relation to expression levels')
        #
        #r.p(
        #    'Only reads with a poly(A) sequence of four or more bases was included in the averages.'
        #    )
        #
        #r.p(
        #    'Genes were selected based on there being at least a certain number of reads with poly(A) sequence in <i>each</i> sample (min-tails), '
        #    'and on there being at least some amount of difference in average tail length between samples (min-span).'
        #    )
        #
        #for heatmap in plot_comparisons:
        #    r.report_heatmap(heatmap)

        r.close()
Example #10
    def run(self):
        assert self.extension is not None, '--extension must be specified'

        #workspace = self.get_workspace()
        workspace = working_directory.Working(self.working_dir,
                                              must_exist=True)
        if self.annotations is None:
            reference = workspace.get_reference()
            annotations_filename = reference.annotations_filename()
        else:
            annotations_filename = self.annotations

        types = [item.lower() for item in self.types.split(',')]

        parts = self.parts or self.types
        parts = [item.lower() for item in parts.split(',')]

        all_annotations = list(
            annotation.read_annotations(annotations_filename))
        annotation.link_up_annotations(all_annotations)
        for item in all_annotations:
            item.primary = None

        annotations = [
            item for item in all_annotations if item.type.lower() in types
        ]

        part_annotations = []
        seen = set()
        queue = [(item, item) for item in annotations]
        while queue:
            primary, item = queue.pop()
            if item.type.lower() in parts:
                assert item.primary is None, "Feature with multiple parents"
                item.primary = primary
                key = (id(primary), item.start, item.end, item.seqid,
                       item.strand)
                # Ignore duplicate exons (many isoforms will have the same exons)
                if key not in seen:
                    seen.add(key)
                    part_annotations.append(item)
            queue.extend((primary, item2) for item2 in item.children)

        del seen
        del all_annotations

        self.log.log('%d annotations\n' % len(annotations))
        self.log.log('%d part annotations\n' % len(part_annotations))

        #assert annotations, 'No annotations of specified types in file'

        for item in part_annotations:
            this_extension = self.extension
            if "max_extension" in item.attr:
                this_extension = min(this_extension,
                                     int(item.attr["max_extension"]))

            if item.strand >= 0:
                item.tail_pos = item.end
                item.end += this_extension
            else:
                item.tail_pos = item.start
                item.start -= this_extension

        for item in annotations:
            item.hits = []  # [ (tail_length, adaptor_bases) ]

        index = span_index.index_annotations(part_annotations)

        for alignment in sam.Bam_reader(workspace /
                                        'alignments_filtered_sorted.bam'):
            if alignment.is_unmapped or alignment.is_secondary or alignment.is_supplementary:
                continue

            start = alignment.reference_start
            end = alignment.reference_end
            alignment_length = end - start
            strand = -1 if alignment.flag & sam.FLAG_REVERSE else 1
            fragment_feature = annotation.Annotation(
                seqid=alignment.reference_name,
                start=start,
                end=end,
                strand=strand)

            if strand >= 0:
                tail_pos = end
            else:
                tail_pos = start

            tail_length = 0
            adaptor_bases = 0
            for item in alignment.extra:
                if item.startswith('AN:i:'):
                    tail_length = int(item[5:])
                elif item.startswith('AD:i:'):
                    adaptor_bases = int(item[5:])

            hits = index.get(fragment_feature, same_strand=True)
            if hits:
                gene = min(
                    hits,
                    key=lambda gene:
                    (abs(tail_pos - gene.tail_pos), gene.primary.get_id()))
                # Nearest by tail_pos
                # failing that, by id to ensure a deterministic choice

                gene.primary.hits.append((tail_length, adaptor_bases))

        for item in annotations:
            del item.parents
            del item.children
            del item.primary

        f = io.open_possibly_compressed_writer(self.prefix + '.pickle.gz')
        pickle.dump((workspace.name, workspace.get_tags(), annotations), f,
                    pickle.HIGHEST_PROTOCOL)
        f.close()
Example #11
    def run(self):
        working_dirs = []
        peaks_file = self.peaks_file
        for item in self.working_dirs:
            state_filename = os.path.join(item, 'analyse-polya-batch.state')
            if not os.path.exists(state_filename):
                working_dirs.append(item)
            else:
                with open(state_filename, 'rb') as f:
                    state = pickle.load(f)

                for sample in state.samples:
                    working_dirs.append(
                        os.path.join(item, 'samples', sample.output_dir))

                if not peaks_file:
                    peaks_file = os.path.join(self.pipeline_dir, "peaks",
                                              "relation-child.gff")

        sample_names = [os.path.split(dirname)[1] for dirname in working_dirs]
        workspaces = [
            working_directory.Working(dirname, must_exist=True)
            for dirname in working_dirs
        ]

        workspace = self.get_workspace()

        with open(workspace / "index.html", "wb") as f:
            web.emit(
                f, "igv.html",
                dict(
                    SAMPLES=json.dumps(sample_names),
                    HAVE_NORM=json.dumps(bool(self.norm_file)),
                    TITLE=self.title,
                ))

        bams = [item / "alignments_filtered_sorted.bam" for item in workspaces]

        for i in xrange(len(sample_names)):
            io.symbolic_link(bams[i], workspace / (sample_names[i] + ".bam"))
            io.symbolic_link(bams[i] + ".bai",
                             workspace / (sample_names[i] + ".bam.bai"))

        io.symbolic_link(peaks_file, workspace / "peaks.gff")

        if self.norm_file:
            mults = io.read_grouped_table(self.norm_file)['All']
            norm_mult = [
                float(mults[name]['Normalizing.multiplier'])
                for name in sample_names
            ]

        with nesoni.Stage() as stage:
            Bam_to_bigwig(
                workspace / "total",
                bam_files=bams,
                what="ambiguity,span,3p,polyaspan,polya3p",
            ).process_make(stage)

            for i in xrange(len(sample_names)):
                for scale_desc, scale in \
                        [("raw",1.0)] + \
                        ([("norm",norm_mult[i])] if self.norm_file else []):
                    Bam_to_bigwig(workspace /
                                  (sample_names[i] + "-" + scale_desc),
                                  bam_files=[bams[i]],
                                  what='span,3p,polyaspan,polya3p',
                                  scale=scale).process_make(stage)