Example 1
 def build_snpeff(self):
     jar = io.find_jar('snpEff.jar')
     
     with open(self/'snpeff.config','wb') as f:
         print >> f, 'data_dir = snpeff'
         print >> f, 'genomes : ' + self.name
         print >> f, self.name + '.genome : ' + self.name 
     
     snpwork = io.Workspace(self/'snpeff',must_exist=False)
     snpwork_genome = io.Workspace(snpwork/self.name,must_exist=False)
     snpwork_genomes = io.Workspace(snpwork/'genomes',must_exist=False)
     
     annotations = self.annotations_filename()
     assert annotations
     with open(snpwork_genome/'genes.gff','wb') as f:
         for record in annotation.read_annotations(annotations):
             if record.end <= record.start: continue
             if not record.attr:
                 record.attr['attributes'] = 'none'
             print >> f, record.as_gff()
     
     with open(snpwork_genomes/(self.name+'.fa'),'wb') as f:
         for name, seq in io.read_sequences(self.reference_fasta_filename()):
             io.write_fasta(f, name, seq)
             
     io.execute('java -jar JAR build NAME -gff3 -c CONFIG',
         JAR=jar, NAME=self.name, CONFIG=self/'snpeff.config')
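
All of these examples lean on the same io.Workspace conventions: a Workspace wraps an output directory (with must_exist=False it may be created on demand), the / operator joins path components (a tuple joins several at once), and .open() gives a writable file inside the directory. A minimal sketch of that pattern follows; the directory and file names are placeholders, and the behaviour is inferred from the snippets in this listing rather than from library documentation.

    from nesoni import io

    # Placeholder names; Workspace behaviour as observed in the examples in this listing.
    work = io.Workspace('output', must_exist=False)         # wraps (and may create) ./output

    single = work/'snpeff.config'                           # join one path component
    nested = work/('expression','peakwise','counts.csv')    # a tuple joins several at once

    f = work.open('notes.txt', 'wb')                        # open a file inside the workspace
    print >> f, 'written via the workspace'
    f.close()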
Example 2
 def _describe_peaks(self, r):        
     workspace = io.Workspace(self.output_dir, must_exist=False)
     
     counts = io.read_grouped_table(workspace/("expression","peakwise","counts.csv"))["Count"]
     
     peak_counts = collections.defaultdict(int)
     read_counts = collections.defaultdict(int)
     total = 0
     for item in annotation.read_annotations(workspace/("peaks","relation-child.gff")):
         peak_counts[item.attr.get("Relation","None")] += 1
         read_counts[item.attr.get("Relation","None")] += sum(int(c) for c in counts[item.get_id()].values())            
         total += 1
     
     total_reads = sum(read_counts.values())
     
     r.write("<p>\n")
     r.write("%d peaks\n" % total)
     
     for name, desc in [
             ("3'UTR", "in a 3' UTR"),
             ("Exon", "otherwise in an exon"),
             ("Downstrand", "otherwise downstrand of a non-coding RNA"),
             ("Intron", "otherwise in an intron"),
             ("Antisense", "otherwise antisense to a gene"),
             ("None", "couldn't be related to annotated genes"),
             ]:
         r.write("<br/>%d peaks and %.1f%% of reads %s\n" % (peak_counts[name], read_counts[name]*100.0/total_reads, desc))
     r.write("</p>\n")
Example 3
    def _run_peaks(self, workspace, expressionspace, reference, dirs, analyse_template, file_prefix):
        shiftspace = io.Workspace(workspace/'peak-shift')

        peaks.Call_peaks(
            workspace/'peaks',
            annotations = reference/'reference.gff',
            extension = self.extension,
            min_depth = self.peak_min_depth,
            polya = self.peak_polya,
            min_tail = self.peak_min_tail,
            peak_length = self.peak_length,
            samples = dirs,
            ).make()

        analyse_template(
            expressionspace/'peakwise',
            annotations=workspace/('peaks','relation-child.gff'), 
            extension=0,
            types='peak',
            parts='peak',
            title='Peakwise expression - ' + self.title,
            file_prefix=file_prefix+'peakwise-',
            ).make()
            
        alternative_tails.Compare_peaks(
            shiftspace/'individual',
            norm_file=expressionspace/('peakwise','norm.csv'),
            utrs=reference/'utr.gff',
            utr_only=True,
            top=2,
            reference=reference/'reference.fa',
            parents=workspace/('peaks','relation-parent.gff'),
            children=workspace/('peaks','relation-child.gff'),
            counts=expressionspace/('peakwise','counts.csv'),
            ).make()
        
        if self.groups:
            tail_lengths.Collapse_counts(
                shiftspace/'grouped-counts',
                counts=expressionspace/('peakwise','counts.csv'),
                groups=self.groups
                ).make()
            
            alternative_tails.Compare_peaks(
                shiftspace/'grouped',
                utrs=reference/'utr.gff',
                utr_only=True,
                top=2,
                reference=reference/'reference.fa',
                parents=workspace/('peaks','relation-parent.gff'),
                children=workspace/('peaks','relation-child.gff'),
                counts=shiftspace/'grouped-counts.csv',
                ).make()
Example 4
    def __init__(self, directory, title, file_prefix=''):
        self.workspace = io.Workspace(directory, must_exist=False)
        self.file_prefix = file_prefix
        if self.file_prefix: self.file_prefix += '-'

        self.f = self.workspace.open('index.html', 'wb')

        print >> self.f, '<html><head>'
        print >> self.f, '<title>%s</title>' % title
        print >> self.f, '<style>%s</style>' % STYLE
        print >> self.f, '</head><body>'
        print >> self.f, '<h1>%s</h1>' % title
        self.p(datetime.date.today().strftime('%e %B %Y'))
Example 5
    def run(self):
        assert self.pipeline is not None, "Pipeline output directory required."
        path = os.path.abspath(self.pipeline)

        workspace = io.Workspace(self.output_dir, must_exist=False)

        with open(workspace / "app.R", "wb") as f:
            print >> f, "library(tailtools)"
            print >> f, "shiny_tailtools_report(%s, species=%s, title=%s)" % (
                repr(path), "NULL" if not self.species else repr(self.species),
                repr(self.title))

        with open(workspace / "index.html", "wb") as f:
            web.emit(f, "sorry-no-shiny.html", {})
Example 6
 def _extract_raw(self):            
     work = io.Workspace(self.output_dir, must_exist=False)
     raw = io.Workspace(work/'raw', must_exist=False)
     
     for name, counts, norms in [
             ('genewise',
                 work/('expression','genewise','counts.csv'),
                 work/('expression','genewise','norm.csv'),
                 ),
             ('primarypeakwise',
                 work/('expression','primarypeakwise','counts.csv'),
                 work/('expression','primarypeakwise','norm.csv'),
                 ),
             ('peakwise',
                 work/('expression','peakwise','counts.csv'),
                 work/('expression','peakwise','norm.csv'),
                 ),
             ('pairwise',
                 work/('peak-shift','individual-pairs.csv'),
                 work/('peak-shift','individual-pairs-norm.csv'),
                 ),
             ]:
         nesoni.Vst(
             raw/(name+'-mlog2-RPM'),
             counts,
             norm_file = norms
             ).make()
         
         counts_table = io.read_grouped_table(counts)
         io.write_csv_2(raw/(name+'-info.csv'), counts_table['Annotation'])
         io.write_csv_2(raw/(name+'-count.csv'), counts_table['Count'])
         io.write_csv_2(raw/(name+'-tail.csv'), counts_table['Tail'])
         io.write_csv_2(raw/(name+'-tail-count.csv'), counts_table['Tail_count'])
         io.write_csv_2(raw/(name+'-proportion.csv'), counts_table['Proportion'])
         
         norm_table = io.read_grouped_table(norms)
         io.write_csv_2(raw/(name+'-norm.csv'), norm_table['All'])
Example 7
 def run(self):
     working = io.Workspace(self.output_dir, must_exist=False)
 
     for filename in self.files:
         reader = io.Table_reader(filename)
         
         name = os.path.splitext(os.path.split(filename)[1])[0]
         
         rname = None
         files = None
         for record in reader:
             if record['Chromosome'] != rname:
                 if files: 
                     for item in files: 
                         item.close()
                 rname = record['Chromosome']
                 grace.status('Convert '+name+' '+rname)
                 files = [
                     open(working / (
                         name + 
                         '-' + grace.filesystem_friendly_name(rname) + 
                         '-' + grace.filesystem_friendly_name(item) + '.userplot'
                     ), 'wb')
                     for item in reader.headings[4:]
                 ]
                 pos = 0
             assert int(record['Start']) == pos and int(record['End']) == pos + 1
             
             for val, f in zip(record.values()[4:], files):
                 print >> f, val
             
             pos += 1
         
         if files: 
             for item in files: 
                 item.close()
         grace.status('')
Example 8
 def _create_json(self):                    
     workspace = io.Workspace(self.output_dir, must_exist=False)
     
     samples = [ ]
     groups = [ ]
     for sample in self.samples:
         this_groups = [ ]
         for item in self.groups:
             if selection.matches(
                     selection.term_specification(item),
                     sample.tags + [ sample.output_dir ]
                     ):
                 this_groups.append(selection.term_name(item))
         group = ','.join(this_groups) if this_groups else 'ungrouped'
         if group not in groups: groups.append(group)
         
         item = {
             'name' : sample.output_dir,
             'bam' : os.path.abspath( 
                 workspace/('samples',sample.output_dir,'alignments_filtered_sorted.bam')
                 ),
             'group' : group,
             'tags' : sample.tags,
             }
         samples.append(item)
         
     obj = collections.OrderedDict()
     obj['reference'] = os.path.abspath( self.reference )
     obj['extension'] = self.extension
     obj['genes'] = os.path.abspath( workspace/('peaks','relation-parent.gff') )
     obj['peaks'] = os.path.abspath( workspace/('peaks','relation-child.gff') )
     obj['groups'] = groups
     obj['samples'] = samples
     
     with open(workspace/"plotter-config.json","wb") as f:
         json.dump(obj, f, indent=4)
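
For orientation, the object serialized to plotter-config.json above has roughly the shape sketched below; every concrete value (paths, group names, tags) is illustrative only, not taken from a real run.

    # Illustrative shape of the dict that _create_json() dumps to plotter-config.json.
    obj = {
        'reference': '/abs/path/to/reference',
        'extension': 200,                      # whatever self.extension was
        'genes': '/abs/path/to/peaks/relation-parent.gff',
        'peaks': '/abs/path/to/peaks/relation-child.gff',
        'groups': ['wildtype', 'mutant', 'ungrouped'],
        'samples': [
            {
                'name': 'sample1',
                'bam': '/abs/path/to/samples/sample1/alignments_filtered_sorted.bam',
                'group': 'wildtype',
                'tags': ['wildtype', 'rep1'],
            },
            # ... one entry per sample
        ],
    }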
Example 9
import sys, os

sys.path.insert(0, os.path.join(os.path.split(__file__)[0], '../'))

import unittest

import nesoni
from nesoni import io

data = io.Workspace('data', must_exist=True)
output = io.Workspace('output', must_exist=False)


class Test_clip(unittest.TestCase):
    def test_single(self):
        nesoni.Clip(
            output / 'clip',
            reads=[data / 'reads_1.txt.gz'],
        ).run()

    def test_paired(self):
        nesoni.Clip(
            output / 'clip',
            pairs=[[data / 'reads_1.txt.gz', data / 'reads_2.txt.gz']],
        ).run()


if __name__ == '__main__':
    unittest.main()
Example 10
def pastiche(args):
    if len(args) < 4:
        print USAGE
        return 1

    mask_only, args = grace.get_option_value(args, '--mask', grace.as_bool,
                                             False)
    min_leftover, args = grace.get_option_value(args, '--min-leftover', int,
                                                20)

    output_dir, args = args[0], args[1:]

    #, ref_filename, contig_filenames = args[0], args[1], args[2:]

    ref_filenames = []
    contig_filenames = []
    grace.execute(args,
                  {'contigs': lambda args: contig_filenames.extend(args)},
                  lambda args: ref_filenames.extend(args))

    assert ref_filenames, 'No reference sequences given'
    assert contig_filenames, 'No contig sequences given'

    contigs = dict([(name.split()[0], seq) for filename in contig_filenames
                    for name, seq in io.read_sequences(filename)])
    dir_contigs = {}
    for name in contigs:
        dir_contigs[name + '+'] = contigs[name]
        dir_contigs[name + '-'] = bio.reverse_complement(contigs[name])

    dir_contigs_used = {}
    for name in dir_contigs:
        dir_contigs_used[name] = [False] * len(dir_contigs[name])

    workspace = io.Workspace(output_dir)
    temp_prefix = workspace._object_filename('temp-pastiche')

    out_f = workspace.open('pastiche.fa', 'wb')

    for ref_filename in ref_filenames:
        for ref_name, ref_seq in io.read_sequences(ref_filename):
            ref_name = ref_name.split()[0]

            grace.status(ref_name)

            f = open(temp_prefix + '.fa', 'wb')
            io.write_fasta(f, 'ref', ref_seq)
            f.close()

            scores = [-1] * (len(ref_seq) * 2)
            strings = ['N', ''] * (len(ref_seq))
            contexts = [None for i in xrange(len(ref_seq) * 2)]

            #MAXSCORE = len(ref_seq)+1
            #for i in xrange(len(ref_seq)):
            #    if ref_seq[i].upper() != 'N':
            #        strings[i*2] = ref_seq[i]
            #        scores[i*2] = MAXSCORE
            #for i in xrange(len(ref_seq)-1):
            #    if ref_seq[i].upper() != 'N' and ref_seq[i+1].upper() != 'N':
            #        scores[i*2+1] = MAXSCORE

            if mask_only:
                for i in xrange(len(ref_seq)):
                    strings[i * 2] = ref_seq[i].lower()

            def put(position, dir_contig_name, start, end, score):
                if scores[position] < score:
                    scores[position] = score
                    strings[position] = dir_contigs[dir_contig_name][start:end]
                    contexts[position] = (dir_contig_name, start, end, score)

            for contig_filename in contig_filenames:
                execute([
                    'nucmer',
                    '--prefix',
                    temp_prefix,
                    #'--maxmatch', #Very slow
                    '--nosimplify',
                    '--minmatch',
                    '9',
                    '--mincluster',
                    '50',
                    #'--maxgap', '1000',
                    #'--breaklen', '1000', # Increasing this reduces Ns, but is slow
                    #'--diagfactor', '1.0',
                    temp_prefix + '.fa',
                    contig_filename
                ])

                for contig_name, contig_seq in io.read_sequences(
                        contig_filename):
                    contig_name = contig_name.split()[0]
                    grace.status(ref_name + ' vs ' + contig_name)
                    p = run([
                        'show-aligns', temp_prefix + '.delta', 'ref',
                        contig_name
                    ],
                            stderr=subprocess.PIPE)

                    alignments = []

                    while True:
                        line = p.stdout.readline()
                        if not line: break
                        if not line.startswith('-- BEGIN'):
                            continue

                        parts = line.split()

                        ref_start = int(parts[5])
                        ref_end = int(parts[7])
                        query_start = int(parts[10])
                        query_end = int(parts[12])

                        #assert ref_start < ref_end
                        #ref_start -= 1 #Zero based coordinates

                        al_ref = []
                        al_query = []

                        while True:
                            block = []
                            end = False
                            while True:
                                line = p.stdout.readline()
                                if line.startswith('--   END'):
                                    end = True
                                    break
                                if line == '\n':
                                    if block:
                                        break
                                    else:
                                        continue
                                block.append(line)

                            if end: break

                            al_ref.append(block[0].split()[1])
                            al_query.append(block[1].split()[1])

                        al_ref = ''.join(al_ref)
                        al_query = ''.join(al_query)

                        if ref_start > ref_end:
                            al_ref = bio.reverse_complement(al_ref)
                            al_query = bio.reverse_complement(al_query)
                            ref_start, ref_end = ref_end, ref_start
                            query_start, query_end = query_end, query_start

                        if query_start > query_end:
                            dir_contig_name = contig_name + '-'
                            query_start = len(contig_seq) + 1 - query_start
                            query_end = len(contig_seq) + 1 - query_end
                        else:
                            dir_contig_name = contig_name + '+'

                        ref_start -= 1  #Zero based coordinates
                        query_start -= 1

                        #print al_ref
                        #print al_query

                        #Pretty dumb scoring scheme
                        al_score = 0
                        for i in xrange(len(al_ref)):
                            if al_ref[i] == al_query[i]:
                                al_score += 1
                            #else:
                            #    al_score -= 1

                        #Pastiche alignment over reference
                        ref_pos = ref_start
                        query_pos = query_start
                        al_pos = 0
                        while al_pos < len(al_ref):
                            assert al_ref[al_pos] != '.'
                            if al_query[al_pos] == '.':
                                put(ref_pos * 2, dir_contig_name, query_pos,
                                    query_pos, al_score)
                            else:
                                assert al_query[al_pos].lower() == dir_contigs[
                                    dir_contig_name][query_pos].lower()
                                put(ref_pos * 2, dir_contig_name, query_pos,
                                    query_pos + 1, al_score)
                                query_pos += 1
                            al_pos += 1

                            al_pos_end = al_pos
                            query_pos_end = query_pos
                            while al_pos_end < len(al_ref) and al_ref[al_pos_end] == '.':
                                al_pos_end += 1
                                query_pos_end += 1
                            #put(ref_pos*2+1, al_query[al_pos:al_pos_end], al_score)
                            assert (al_query[al_pos:al_pos_end].lower() ==
                                    dir_contigs[dir_contig_name][query_pos:query_pos_end].lower())
                            put(ref_pos * 2 + 1, dir_contig_name, query_pos,
                                query_pos_end, al_score)
                            al_pos = al_pos_end
                            query_pos = query_pos_end
                            ref_pos += 1

                    p.wait()

            grace.status(ref_name)

            result = ''.join(strings)
            io.write_fasta(out_f, ref_name, result)

            for context in contexts:
                if context is None: continue
                name, start, end, score = context
                for i in xrange(start, end):
                    dir_contigs_used[name][i] = True

            #Interpolation
            #result = [ ]
            #i = 0
            #while i < len(ref_seq):
            #    if strings[i*2].upper() != 'N':
            #        result.append(strings[i*2])
            #        result.append(strings[i*2+1])
            #        i += 1
            #        continue
            #
            #    j = i
            #    while strings[j*2].upper() == 'N':
            #        j += 1
            #
            #    grace.status('')
            #    print >> sys.stderr, 'interpolating', i+1,'..',j
            #
            #    window = 20 #!!!!!!!!!!!
            #    left_contexts = collections.defaultdict(lambda:0)
            #    for i1 in xrange(max(0,i-window),i):
            #        for context_name, context_start, context_end, context_score in contexts[i1*2]:
            #            key = (context_name, context_end + i - i1)
            #            left_contexts[key] = max(left_contexts[key],context_score)
            #
            #    right_contexts = collections.defaultdict(lambda:0)
            #    for j1 in xrange(j,min(j+window,len(ref_seq))):
            #        for context_name, context_start, context_end, context_score in contexts[j1*2]:
            #            key = (context_name, context_start + j - j1)
            #            right_contexts[key] = max(left_contexts[key],context_score)
            #
            #    #print >> sys.stderr, left_contexts
            #    #print >> sys.stderr, right_contexts
            #
            #    options = [ ]
            #
            #    for (left_name, left_pos), left_score in left_contexts.items():
            #        for (right_name, right_pos), right_score in right_contexts.items():
            #            if left_name != right_name: continue
            #            if right_pos < left_pos: continue
            #
            #            if right_pos-left_pos > (j-i) * 4.0 + 10: continue   #!!!!!!!!!!!!!!!!!!!!!!1
            #            if right_pos-left_pos < (j-i) * 0.25 - 10: continue
            #
            #            score = float(min(right_pos-left_pos,j-i))/max(right_pos-left_pos,j-i)
            #            score *= left_score + right_score
            #            #print >> sys.stderr, left_name, right_pos-left_pos, j-i, score
            #            options.append( (score, left_name, left_pos, right_pos) )
            #
            #    if options:
            #        best = max(options, key=lambda option: option[0])
            #        print >> sys.stderr, '->', best
            #        result.append( dir_contigs[best[1]][best[2]:best[3]].lower() )
            #    else:
            #        print >> sys.stderr, '-> no good interpolation'
            #        result.append( ref_seq[i:j] )
            #
            #    i = j
            #
            #result = ''.join(result)
            #io.write_fasta(sys.stdout, ref_name, result)

            #print >> sys.stderr, len(result), result.count('N')
            #for pos, size in N_runs:
            #    out_size = len(''.join( strings[pos*2:pos*2+2] ))
            #    print >> sys.stderr, pos, size, '->', out_size

    out_f.close()

    grace.status('')

    #for name, seq in io.read_sequences(ref_filename):
    #    result = pastiche(seq, contigs_filename)
    #    io.write_fasta(sys.stdout, name, result)

    leftover_f = workspace.open('leftovers.fa', 'wb')

    for name in sorted(contigs):
        used = [
            (a or b)
            for a, b in zip(dir_contigs_used[name + '+'],
                            dir_contigs_used[name + '-'][::-1])
        ]

        i = 0
        while i < len(used):
            j = i
            while j < len(used) and not used[j]:
                j += 1
            if j - i > min_leftover:
                if i == 0 and j == len(used):
                    out_name = name
                else:
                    out_name = name + ':%d..%d' % (i + 1, j)
                io.write_fasta(leftover_f, out_name, contigs[name][i:j])

            i = j + 1

    leftover_f.close()

    for suffix in ['.fa', '.delta']:
        os.unlink(temp_prefix + suffix)
Example 11
def fill_scaffolds(args):
    max_filler_length, args = grace.get_option_value(args, '--max-filler', int, 4000)
    
    if len(args) < 2:
        print USAGE
        return 1
    
    (output_dir, graph_dir), args = args[:2], args[2:]

    scaffolds = [ ]
    
    def scaffold(args):
        circular, args = grace.get_option_value(args, '--circular', grace.as_bool, False)
        
        scaffold = [ ]
        for item in args:
            scaffold.append( ('contig', int(item)) )
            scaffold.append( ('gap', None) )
        
        if not circular: scaffold = scaffold[:-1]
        
        name = 'custom_scaffold_%d' % (len(scaffolds)+1)
        scaffolds.append( (name, scaffold) )
            
    grace.execute(args, [scaffold])
    
    custom_scaffolds = (len(scaffolds) != 0)    
    
    sequences = dict( 
        (a.split()[0], b.upper()) 
          for a,b in 
            io.read_sequences(os.path.join(
              graph_dir, '454AllContigs.fna')))
    
    sequence_names = sorted(sequences)
    sequence_ids = dict(zip(sequence_names, xrange(1,len(sequence_names)+1)))
    
    contexts = { }
    context_names = { }
    context_depths = { }
    for i in xrange(1,len(sequence_names)+1):
        seq = sequences[sequence_names[i-1]]
        contexts[ i ] = seq
        context_names[ i ] = sequence_names[i-1]+'-fwd'
        contexts[ -i ] = bio.reverse_complement(seq)
        context_names[ -i ] = sequence_names[i-1]+'-rev'
    
    links = collections.defaultdict(list)
    
    for line in open(
      os.path.join(graph_dir, '454ContigGraph.txt'),
      'rU'):
        parts = line.rstrip('\n').split('\t')
        
        if parts[0].isdigit():
            seq = sequence_ids[parts[1]]
            context_depths[ seq] = float(parts[3])
            context_depths[-seq] = float(parts[3])
        
        if parts[0] == 'C':    
            name1 = 'contig%05d' % int(parts[1])
            dir1 = {"3'" : 1, "5'" : -1 }[parts[2]]
            name2 = 'contig%05d' % int(parts[3])
            dir2 = {"5'" : 1, "3'" : -1 }[parts[4]]
            depth = int(parts[5])
            #print name1, dir1, name2, dir2, depth
            
            links[ sequence_ids[name1] * dir1 ].append( (depth, sequence_ids[name2] * dir2) )
            links[ sequence_ids[name2] * -dir2 ].append( (depth, sequence_ids[name1] * -dir1) )
    
        if parts[0] == 'S' and not custom_scaffolds:  
            name = 'scaffold%05d' % int(parts[2])  
            components = parts[3].split(';')
            scaffold = [ ]
            for component in components:
                a,b = component.split(':')
                if a == 'gap':
                    scaffold.append( ('gap',int(b)) )
                else:
                    strand = { '+': +1, '-': -1 }[ b ]
                    scaffold.append( ('contig', sequence_ids['contig%05d'%int(a)] * strand) )
            scaffolds.append( (name, scaffold) )
    
    
    
    #paths = { }
    #
    #todo = [ ]
    #for i in contexts:
    #    for depth_left, neg_left in links[-i]:
    #        left = -neg_left
    #        for depth_right, right in links[i]:
    #            todo.append( ( max(-depth_left,-depth_right,-context_depths[i]), left, right, (i,)) )
    #
    #heapq.heapify(todo)
    #while todo:
    #    score, source, dest, path = heapq.heappop(todo)
    #    if (source,dest) in paths: continue
    #    
    #    paths[(source,dest)] = path
    #    
    #    if len(contexts[dest]) > max_filler_length: continue
    #    
    #    for depth, next in links[dest]:
    #        heapq.heappush(todo,
    #            ( max(score,-depth,-context_depths[dest]), source, next, path+(dest,))
    #        )
    
    
    path_source_dest = collections.defaultdict(dict) # source -> dest -> next
    path_dest_source = collections.defaultdict(dict) # dest -> source -> next
    
    
    # Use links, in order of depth of coverage, to construct paths between contigs
    # Thus: paths have maximum minimum depth
    #       subsections of paths also have this property
    
    todo = [ ]
    for i in contexts:    
        for depth_link, right in links[i]:
            todo.append( ( depth_link, i, right) )
    todo.sort(reverse=True)
    for score, left, right in todo:
        if right in path_source_dest[left]: continue
        
        sources = [(left,right)]
        if len(contexts[left]) <= max_filler_length:
            sources += path_dest_source[left].items()
        destinations = [right]
        if len(contexts[right]) <= max_filler_length:
            destinations += path_source_dest[right].keys()
        
        for source, next in sources:
            for dest in destinations:
                if dest in path_source_dest[source]: continue
                path_source_dest[source][dest] = next
                path_dest_source[dest][source] = next
    
    
    workspace = io.Workspace(output_dir)
    scaffold_f = workspace.open('scaffolds.fa','wb')
    
    #comments = [ ]
    features = [ ]
    
    used = set()
    previous_total = 0
    
    for i, (name, scaffold) in enumerate(scaffolds):
        result = '' # Inefficient. Meh.
        n_filled = 0
        n_failed = 0
        for j, item in enumerate(scaffold):
            if item[0] == 'contig':
                result += contexts[item[1]]
                used.add(abs(item[1]))
            else:
                left = scaffold[j-1]
                right = scaffold[ (j+1) % len(scaffold) ] #If gap at end, assume circular
                assert left[0] == 'contig'
                assert right[0] == 'contig'
                
                gap_start = len(result)
    
                can_fill = right[1] in path_source_dest[left[1]]
                if can_fill:
                    n = 0
                    k = path_source_dest[left[1]][right[1]]
                    while k != right[1]:
                        n += len(contexts[k])
                        result += contexts[k].lower()
                        used.add(abs(k))
                        
                        k = path_source_dest[k][right[1]]
                    
                    n_filled += 1
                        
                    if item[1] is not None and max(n,item[1]) > min(n,item[1])*4:
                        print >> sys.stderr, 'Warning: gap size changed from %d to %d in scaffold %d' % (item[1],n,i+1)
                else:
                    n_failed += 1
                    
                    #print >> sys.stderr, 'Warning: No path to fill a gap in scaffold %d' % (i+1)
                    result += 'n' * (9 if item[1] is None else item[1])
    
                gap_end = len(result)
                
                #features.append( '%s\t%s\t%s\t%d\t%d\t%s\t%s\t%s\t%s' % (
                #    'all-scaffolds',
                #    'fill-scaffolds',
                #    'gap',
                #    previous_total + gap_start+1,
                #    previous_total + max(gap_end, gap_start+1), #Allow for zeroed out gaps. Hmm.
                #    '.', #score
                #    '+', #strand
                #    '.', #frame
                #    '' #properties
                #))
                features.append( '%s\t%s\t%s\t%d\t%d\t%s\t%s\t%s\t%s' % (
                    name,
                    'fill-scaffolds',
                    'gap',
                    gap_start+1,
                    max(gap_end, gap_start+1), #Allow for zeroed out gaps. Hmm.
                    '.', #score
                    '+', #strand
                    '.', #frame
                    '' #properties
                ))
                    
    
        io.write_fasta(scaffold_f, name, result)
        previous_total += len(result)
        #comments.append('##sequence-region    %s %d %d' % (name, 1, len(result)))
        print >> sys.stderr, 'Scaffold%05d: %d gaps filled, %d could not be filled' % (i+1, n_filled, n_failed)
    
    scaffold_f.close()
    
    gff_f = workspace.open('scaffolds.gff', 'wb')
    #print >>gff_f, '##gff-version    3'
    #for comment in comments:
    #    print >>gff_f, comment
    for feature in features:
        print >>gff_f, feature
    gff_f.close()
    
    
    leftovers_f = workspace.open('leftovers.fa', 'wb')
    for name in sequence_names:
        if sequence_ids[name] not in used:
            io.write_fasta(leftovers_f, name, sequences[name])
    leftovers_f.close()
    
    ends = { }
    for i, (name, scaffold) in enumerate(scaffolds):
        if scaffold[-1][0] == 'gap': continue
        ends[ '%s start' % name ] = scaffold[-1][1]
        ends[ '%s end  ' % name ] = -scaffold[0][1] 
    
    for end1 in sorted(ends):
        options = [ end2 for end2 in ends if -ends[end2] in path_source_dest[ends[end1]] ]
        if len(options) == 1:
            print >> sys.stderr, 'Note: from', end1, 'only', options[0], 'is reachable'
Example 12
    def run(self):
        #===============================================
        #                Sanity checks
        #===============================================
        
        assert len(set([ item.output_dir for item in self.samples ])) == len(self.samples), "Duplicate sample name."
        
        all_inputs = [ ]
        for sample in self.samples:
            all_inputs.extend(sample.reads)
        assert len(set(all_inputs)) == len(all_inputs), "Duplicate read filename."
        
        assert len(set([ item.output_dir for item in self.tests ])) == len(self.tests), "Duplicate test name."
        
        for test in self.tests:
            assert not test.analysis, "analysis parameter for tests should not be set, will be filled in automatically"
        
        #===============================================
        #                Run pipeline
        #===============================================
        
        names = [ sample.output_dir for sample in self.samples ]
        
        reference = reference_directory.Reference(self.reference, must_exist=True)
        
        workspace = io.Workspace(self.output_dir, must_exist=False)
        samplespace = io.Workspace(workspace/'samples', must_exist=False)
        expressionspace = io.Workspace(workspace/'expression', must_exist=False)
        testspace = io.Workspace(workspace/'test', must_exist=False)
        
        self._create_json()
                
        file_prefix = self.file_prefix
        if file_prefix and not file_prefix.endswith('-'):
            file_prefix += '-'


        samples = [ ]
        for sample in self.samples:
            samples.append(sample(
                samplespace / sample.output_dir,
                reference = self.reference,
                ))
        
        dirs = [ item.output_dir for item in samples ]
        
        clipper_logs = [ join(item.output_dir, 'clipped_reads_log.txt') for item in samples ]
        filter_logs = [ join(item.output_dir, 'filter_log.txt') for item in samples ]
        filter_polya_logs = [ join(item.output_dir + '-polyA', 'filter_log.txt') for item in samples ]

        analyse_template = tail_lengths.Analyse_tail_counts(
            working_dirs = dirs,
            extension = self.extension,
            annotations = reference/'reference.gff',
            types = self.types,
            parts = self.parts
            )
        
        with nesoni.Stage() as stage:        
            for item in samples:
                item.process_make(stage)

        job_gene_counts = analyse_template(
            output_dir = expressionspace/'genewise',
            extension = self.extension,
            title = 'Genewise expression - ' + self.title,
            file_prefix = file_prefix+'genewise-',
            ).make
        
        job_peaks = _call(self._run_peaks, 
            workspace=workspace, 
            expressionspace=expressionspace, 
            reference=reference, 
            dirs = dirs,
            analyse_template = analyse_template,
            file_prefix=file_prefix,
            )
        
        job_norm = nesoni.Norm_from_samples(
            workspace/'norm',
            working_dirs = dirs
            ).make
            
        job_bigwig = bigwig.Polya_bigwigs(
            workspace/'bigwigs', 
            working_dirs = dirs, 
            norm_file = workspace/"norm.csv",
            peaks_file = workspace/("peaks", "relation-child.gff"),
            title = "IGV tracks - "+self.title
            ).make
        
        job_norm_bigwig = _call(_serial, job_norm, job_bigwig)

        job_utrs = tail_tools.Call_utrs(
            workspace/('peaks','primary-peak'),
            self.reference,
            self.output_dir,
            extension=self.extension
            ).make
            
        job_primpeak_counts = analyse_template(
            expressionspace/'primarypeakwise',
            annotations=workspace/('peaks','primary-peak-peaks.gff'), 
            extension=0,
            types='peak',
            parts='peak',
            title='Primary-peakwise expression - ' + self.title,
            file_prefix=file_prefix+'primarypeakwise-',
            ).make
        
        job_primpeak = _call(_serial, job_utrs, job_primpeak_counts)
        
        job_peak_primpeak_bigwig = _call(_serial, 
            job_peaks, 
            _call(_parallel, job_norm_bigwig, job_primpeak))
        
        job_count = _call(_parallel, job_gene_counts, job_peak_primpeak_bigwig)
            
        test_jobs = [ ]
        for test in self.tests:
            test_jobs.append(test(
                output_dir = testspace/test.output_dir,
                analysis = self.output_dir,
                ).make)

        job_test = _call(_parallel, *test_jobs)

        job_raw = self._extract_raw

        job_all = _call(_serial, job_count, _call(_parallel, job_raw, job_test))        
        
        job_all()



        #===============================================
        #                   Report        
        #===============================================

        r = reporting.Reporter(workspace/'report', self.title, self.file_prefix, style=web.style())
        
        io.symbolic_link(source=workspace/'bigwigs', link_name=r.workspace/'bigwigs')
        r.write('<div style="font-size: 150%; margin-top: 1em; margin-bottom: 1em;"><a href="bigwigs/index.html">&rarr; Load tracks into IGV</a></div>')

        tail_tools.Shiny(workspace/('report','shiny'), self.output_dir, title=self.title, species=self.species).run()
        r.write('<div style="font-size: 150%; margin-top: 1em; margin-bottom: 1em;"><a href="shiny/" target="_blank">&rarr; Interactive report (shiny)</a></div>')
        
        r.heading('Alignment to reference')
        
        r.report_logs('alignment-statistics',
            #[ workspace/'stats.txt' ] +
            clipper_logs + filter_logs + #filter_polya_logs +
            [ expressionspace/('genewise','aggregate-tail-counts_log.txt') ],
            filter=lambda sample, field: (
                field not in [
                    'fragments','fragments aligned to the reference','reads kept',
                    'average depth of coverage, ambiguous',
                    'average depth of coverage, unambiguous',
                    ]
            ),
        )
        

        r.heading('Genewise expression')
        
        r.p("This is based on all reads within each gene (possibly from multiple peaks, or decay products).")
        
        io.symbolic_link(source=expressionspace/('genewise','report'),link_name=r.workspace/'genewise')
        r.p('<a href="genewise/index.html">&rarr; Genewise expression</a>')


        r.heading('Peakwise expression')
        
        r.p("This shows results from all called peaks.")
        
        peak_filename = expressionspace/('peakwise','features-with-data.gff')
        r.p(r.get(peak_filename, name='peaks.gff') + ' - peaks called')        

        self._describe_peaks(r)
        
        io.symbolic_link(source=expressionspace/('peakwise','report'),link_name=r.workspace/'peakwise')
        r.p('<a href="peakwise/index.html">&rarr; Peakwise expression</a>')


        r.subheading('Primary-peakwise expression')
        
        r.p("This is based on the most prominent peak in the 3'UTR for each gene. (Peak can be up to %d bases downstrand of the annotated 3'UTR end, but not inside another gene on the same strand.)" % self.extension)
        
        io.symbolic_link(source=expressionspace/('primarypeakwise','report'),link_name=r.workspace/'primarypeakwise')
        r.p('<a href="primarypeakwise/index.html">&rarr; Primary-peakwise expression</a>')

        r.p(r.get(workspace/('peaks','primary-peak-peaks.gff')) + ' - primary peaks for each gene.')
        r.p(r.get(workspace/('peaks','primary-peak-utrs.gff')) + ' - 3\' UTR regions, based on primary peak call.')
        r.p(r.get(workspace/('peaks','primary-peak-genes.gff')) + ' - full extent of gene, based on primary peak call.')


        if self.tests:
            r.heading('Differential tests')
            for test in self.tests:
                io.symbolic_link(source=testspace/test.output_dir,link_name=r.workspace/('test-'+test.output_dir))
                r.p('<a href="test-%s">&rarr; %s</a> '
                    % (test.output_dir, test.get_title()))


        web.Geneview_webapp(r.workspace/'view').run()        
                
        r.heading('Gene viewers')
        r.p('Having identified interesting genes from heatmaps and differential tests above, '
            'these viewers allow specific genes to be examined in detail.')
        
        if self.groups:
            r.get(workspace/('peak-shift','grouped.json'))
            r.p('<a href="view.html?json=%sgrouped.json">&rarr; Gene viewer, grouped samples</a>' % r.file_prefix)
        r.get(workspace/('peak-shift','individual.json'))
        r.p('<a href="view.html?json=%sindividual.json">&rarr; Gene viewer, individual samples</a>' % r.file_prefix)
        
        
        r.heading('Raw data')
        
        r.p(r.tar('csv-files',glob.glob(workspace/('raw','*.csv'))))
        
        r.write('<ul>\n')
        r.write('<li> -info.csv = gene name and product, etc\n')
        r.write('<li> -count.csv = read count\n')
        r.write('<li> -mlog2-RPM.csv = moderated log2 Reads Per Million\n')
        r.write('<li> -tail.csv = average poly(A) tail length\n')
        r.write('<li> -tail-count.csv = poly(A) read count\n')
        r.write('<li> -proportion.csv = proportion of reads with poly(A)\n')
        r.write('<li> -norm.csv = read count normalization used for log2 transformation, heatmaps, differential tests, etc etc\n')
        r.write('</ul>\n')

        r.p('This set of genes was used in the analysis:')
        
        r.p(r.get(reference/'reference.gff') + ' - Reference annotations in GFF3 format')
        r.p(r.get(reference/'utr.gff') + ' - 3\' UTR regions')

        r.p('<b>%d further bases 3\' extension was allowed</b> beyond the GFF files above (but not extending into the next gene on the same strand).' % self.extension)

        r.write('<p/><hr>\n')
        r.subheading('About normalization and log transformation')

        r.p('Counts are converted to '
            'log2 Reads Per Million using Anscombe\'s variance stabilizing transformation '
            'for the negative binomial distribution, implemented in '
            'R package "varistran".')
                
        r.write('<p/><hr>\n')

        r.p('Reference directory '+self.reference)
        r.p('Tail Tools version '+tail_tools.VERSION)
        r.p('Nesoni version '+nesoni.VERSION)
        
        r.close()