def run(self):
        fa = io.open_possibly_compressed_file(self.fasta_file)
        fq = io.open_possibly_compressed_file(self.qual_file)

        out_file = self.begin_output()

        while True:
            a1 = fa.readline()
            if not a1: break
            a1 = a1.strip()
            a2 = fa.readline().strip()
            q1 = fq.readline().strip()
            q2 = fq.readline().strip()

            assert a1.startswith('>')
            assert a1 == q1

            print >> out_file, '@' + a1[1:]
            print >> out_file, a2
            print >> out_file, '+'
            print >> out_file, ''.join(
                chr(33 + max(0, int(item))) for item in q2.split())

        self.end_output(out_file)

        fa.close()
        fq.close()
Example 2
    def run(self):
        fa = io.open_possibly_compressed_file(self.fasta_file)
        fq = io.open_possibly_compressed_file(self.qual_file)

        out_file = self.begin_output()

        while True:
            a1 = fa.readline()
            if not a1:
                break
            a1 = a1.strip()
            a2 = fa.readline().strip()
            q1 = fq.readline().strip()
            q2 = fq.readline().strip()

            assert a1.startswith(">")
            assert a1 == q1

            print >> out_file, "@" + a1[1:]
            print >> out_file, a2
            print >> out_file, "+"
            print >> out_file, "".join(chr(33 + max(0, int(item))) for item in q2.split())

        self.end_output(out_file)

        fa.close()
        fq.close()
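Both copies of this converter rely on the same quality encoding: a .qual file stores qualities as space-separated integers, while FASTQ stores them as single characters with ASCII code 33 + score (Phred+33). A minimal standalone sketch of just that step (the function name is illustrative, not part of nesoni):

def qual_line_to_fastq(qual_line):
    # "40 38 12" -> "IG-" : each integer score becomes the character with
    # ASCII code 33 + score; negative scores are clamped to zero.
    return ''.join(chr(33 + max(0, int(item))) for item in qual_line.split())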
Example 3
    def begin_input(self):
        from nesoni import io

        if self.input is not None:
            return io.open_possibly_compressed_file(self.input)
        else:
            return sys.stdin
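open_possibly_compressed_file is what all of these examples lean on: it returns an ordinary readable file object whether the input is plain or compressed. nesoni's actual implementation is not shown on this page; as a rough sketch, offered only as an assumption, such a helper typically sniffs the file's magic bytes and picks a decompressing opener:

import gzip
import bz2

def open_possibly_compressed_file_sketch(filename):
    # Illustrative guess, not nesoni's real code: peek at the first two
    # bytes and choose gzip, bzip2 or plain open accordingly.
    with open(filename, 'rb') as f:
        magic = f.read(2)
    if magic == b'\x1f\x8b':            # gzip magic number
        return gzip.open(filename, 'rb')
    if magic == b'BZ':                  # bzip2 magic number
        return bz2.BZ2File(filename, 'rb')
    return open(filename, 'rb')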
Example 4
def read_gff(filename, joiner=None):
    f = io.open_possibly_compressed_file(filename)
    for line in f:
        line = line.rstrip()

        if line == '##FASTA':
            break

        if not line or line.startswith('#'):
            continue

        parts = line.split('\t')
        assert len(parts) >= 8, parts

        # Be nice, ignore spaces at start or end, eg in seqid
        parts = [item.strip() for item in parts]

        result = Annotation()

        result.seqid = parts[0]
        result.source = parts[1]
        result.type = parts[2]
        result.start = int(parts[3]) - 1
        result.end = int(parts[4])
        result.score = None if parts[5] == '.' else float(parts[5])
        result.strand = strand_from_gff[parts[6]]
        result.phase = None if parts[7] == '.' else int(parts[7])
        result.attr = {} if len(parts) < 9 else split_keyvals(parts[8], joiner)

        yield result

    f.close()
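read_gff yields one Annotation per feature line, with the GFF columns mapped onto attributes and the start coordinate shifted to 0-based (so ann.end - ann.start is the feature length). A small usage sketch; the filename and the Name/ID attribute keys are illustrative:

counts = {}
for ann in read_gff('my_genes.gff.gz'):
    counts[ann.type] = counts.get(ann.type, 0) + 1
    if ann.type == 'gene':
        # ann.start is 0-based and ann.end exclusive
        print ann.seqid, ann.attr.get('Name', ann.attr.get('ID', '?')), ann.end - ann.start
print counts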
Example 5
def read_gff(filename):
    f = io.open_possibly_compressed_file(filename)
    for line in f:
        line = line.rstrip()
        
        if line == '##FASTA':
            break
        
        if not line or line.startswith('#'): 
            continue
        
        parts = line.split('\t')
        assert len(parts) >= 8, parts

        result = Annotation()
        
        result.seqid = parts[0]
        result.source = parts[1]
        result.type = parts[2]
        result.start = int(parts[3])-1
        result.end = int(parts[4])
        result.score = None if parts[5] == '.' else float(parts[5])
        result.strand = strand_from_gff[parts[6]]
        result.phase = None if parts[7] == '.' else int(parts[7])
        result.attr = { } if len(parts) < 9 else split_keyvals(parts[8])
        
        yield result        

    f.close()
Example 6
    def begin_input(self):
        from nesoni import io

        if self.input is not None:
            return io.open_possibly_compressed_file(self.input)
        else:
            return sys.stdin
Example 7
 def get_object(self, path, plain_text=False):
     from nesoni import io
     f = io.open_possibly_compressed_file(self._object_filename(path))
     if plain_text:
         result = eval(f.read())
     else:
         result = cPickle.load(f)
     f.close()
     return result
Example 8
    def run(self):
        workspace = self.get_workspace()

        reference = reference_directory.Reference(self.reference,
                                                  must_exist=True)

        reader_f = io.open_possibly_compressed_file(self.vcf)
        reader = vcf.Reader(reader_f)
        variants = collections.defaultdict(list)
        for record in reader:
            variants[record.CHROM].append(record)
        reader_f.close()

        for chrom in variants:
            variants[chrom].sort(key=lambda item: item.POS)

        filenames = [workspace / (item + '.fa') for item in reader.samples]
        for filename in filenames:
            with open(filename, 'wb'):
                pass

        for name, seq in io.read_sequences(
                reference.reference_fasta_filename()):
            for i, sample in enumerate(reader.samples):
                revised = []
                pos = 0
                for variant in variants[name]:
                    gt = variant.samples[i].data.GT
                    if gt is None: continue
                    assert gt.isdigit(), 'Unsupported genotype (can only use haploid genotypes): ' + gt
                    gt_number = int(gt)
                    if gt_number == 0:
                        var_seq = variant.REF
                    else:
                        var_seq = str(variant.ALT[gt_number - 1])
                        assert re.match('[ACGTN]*$', var_seq), 'Unsupported variant type: ' + var_seq
                    new_pos = variant.POS - 1
                    assert new_pos >= pos, 'Variants overlap.'
                    revised.append(seq[pos:new_pos])
                    pos = new_pos
                    revised.append(var_seq)
                    assert seq[pos:pos + len(variant.REF)].upper() == variant.REF, 'REF column in VCF does not match reference sequence'
                    pos += len(variant.REF)
                revised.append(seq[pos:])

                with open(filenames[i], 'ab') as f:
                    io.write_fasta(f, name, ''.join(revised))

            del variants[name]
        assert not variants, 'Chromosome names in VCF not in reference: ' + ' '.join(variants)
Example 9
def read_annotations(filename):
    f = io.open_possibly_compressed_file(filename)
    peek = f.read(1024)
    f.close()
    
    if peek.startswith('LOCUS'):
        return read_genbank(filename)
    elif peek.startswith('##gff') or peek.split('\n')[0].count('\t') in (7,8):
        return read_gff(filename)
    else:
        raise grace.Error('Not an annotation file.')
Example 10
def read_annotations(filename, joiner=None):
    f = io.open_possibly_compressed_file(filename)
    peek = f.read(1024)
    f.close()

    if peek.startswith('LOCUS'):
        return read_genbank(filename)
    elif peek.startswith('##gff') or peek.split('\n')[0].count('\t') in (7, 8):
        return read_gff(filename, joiner)
    else:
        raise grace.Error('Not an annotation file.')
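read_annotations only peeks at the first kilobyte to decide the format: GenBank records start with LOCUS, while GFF input either carries a ##gff header or has 8-9 tab-separated columns on its first line. Callers can therefore pass either format, compressed or not, through the same entry point; a usage sketch with illustrative filenames:

for filename in ['reference.gbk.gz', 'extra_features.gff']:
    for ann in read_annotations(filename):
        print ann.seqid, ann.type, ann.start, ann.end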
Example 11
 def get_object(self, path, plain_text=False):
     from nesoni import io
     f = io.open_possibly_compressed_file(self._object_filename(path))
     if plain_text:
         data = f.read()
         try:
             result = json.loads(data)
         except ValueError: #Older versions used repr instead of json.dump
             result = eval(data)
     else:
         result = cPickle.load(f)
     f.close()
     return result
Example 12
 def get_object(self, path, plain_text=False):
     from nesoni import io
     f = io.open_possibly_compressed_file(self._object_filename(path))
     if plain_text:
         data = f.read()
         try:
             result = json.loads(data)
         except ValueError:  #Older versions used repr instead of json.dump
             result = eval(data)
     else:
         result = cPickle.load(f)
     f.close()
     return result
Example 13
    def run(self):
        f = self.begin_output()

        for filename in self.filenames:
            info = io.get_file_info(filename)

            any = False

            name = os.path.splitext(os.path.split(filename)[1])[0]

            if info.matches('sequences'):
                total = 0
                total_length = 0
                for seq in io.read_sequences(filename, qualities=True):
                    total += 1
                    total_length += len(seq[1])
                print >> f, grace.datum(name, 'sequences', total)
                print >> f, grace.datum(name, 'total bases', total_length)
                if total:
                    print >> f, grace.datum(name, 'average length',
                                            float(total_length) / total)
                print >> f
                any = True

            if info.matches('annotations'):
                total = 0
                counts = {}
                for item in annotation.read_annotations(filename, "/"):
                    total += 1
                    counts[item.type] = counts.get(item.type, 0) + 1

                print >> f, grace.datum(name, 'features', total)
                for key in sorted(counts):
                    print >> f, grace.datum(name, key + ' features',
                                            counts[key])
                print >> f
                any = True

            if info.matches('type-vcf'):
                reader_f = io.open_possibly_compressed_file(filename)
                reader = vcf.Reader(reader_f)
                n = 0
                for item in reader:
                    n += 1
                print >> f, grace.datum(name, 'variants', n)
                any = True

            if not any:
                raise grace.Error('Don\'t know what to do with ' + filename)

        self.end_output(f)
Example 14
    def run(self):
        f = self.begin_output()
    
        for filename in self.filenames:
            info = io.get_file_info(filename)
            
            any = False
            
            name = os.path.splitext(os.path.split(filename)[1])[0]
            
            if info.matches('sequences'):
                total = 0
                total_length = 0
                for seq in io.read_sequences(filename, qualities=True):
                    total += 1
                    total_length += len(seq[1])
                print >> f, grace.datum(name, 'sequences', total)
                print >> f, grace.datum(name, 'total bases', total_length)
                if total:
                    print >> f, grace.datum(name, 'average length', float(total_length)/total)
                print >> f
                any = True
            
            if info.matches('annotations'):
                total = 0
                counts = { }
                for item in annotation.read_annotations(filename):
                    total += 1
                    counts[item.type] = counts.get(item.type,0)+1
                                
                print >> f, grace.datum(name, 'features', total)
                for key in sorted(counts):
                    print >> f, grace.datum(name, key + ' features', counts[key])
                print >> f
                any = True
            
            if info.matches('type-vcf'):
                reader_f = io.open_possibly_compressed_file(filename)
                reader = vcf.Reader(reader_f)
                n = 0
                for item in reader:
                    n += 1
                print >> f, grace.datum(name, 'variants', n)
                any = True
            
            if not any:
                raise grace.Error('Don\'t know what to do with ' + filename)

        self.end_output(f)
Example 15
 def __init__(self, filename):
     assert os.path.exists(filename), filename + ' does not exist'
     
     if is_bam(filename):
         self.process = io.run([
             'samtools',
             'view',
             io.abspath(filename),
         ])
         
         ## Godawful hack
         #self.process.stdout = io.process_buffer(self.process.stdout)
         self.file = self.process.stdout
     else:
         self.process = None
         self.file = io.open_possibly_compressed_file(filename)
Example 16
 def __init__(self, filename):
     assert os.path.exists(filename), filename + ' does not exist'
     
     if is_bam(filename):
         self.process = io.run([
             'samtools',
             'view',
             io.abspath(filename),
         ])
         
         ## Godawful hack
         #self.process.stdout = io.process_buffer(self.process.stdout)
         self.file = self.process.stdout
     else:
         self.process = None
         self.file = io.open_possibly_compressed_file(filename)
Example 17
    def run(self):
        workspace = self.get_workspace()
        
        reference = reference_directory.Reference(self.reference, must_exist=True)
        
        reader_f = io.open_possibly_compressed_file(self.vcf)
        reader = vcf.Reader(reader_f)
        variants = collections.defaultdict(list)
        for record in reader:
            variants[record.CHROM].append(record)
        reader_f.close()
        
        for chrom in variants:
            variants[chrom].sort(key=lambda item: item.POS)
        
        filenames = [ workspace/(item+'.fa') for item in reader.samples ]
        for filename in filenames:
            with open(filename,'wb'): pass
        
        for name, seq in io.read_sequences(reference.reference_fasta_filename()):
            for i, sample in enumerate(reader.samples):            
                revised = [ ]
                pos = 0
                for variant in variants[name]:
                    gt = variant.samples[i].data.GT
                    if gt is None: continue
                    assert gt.isdigit(), 'Unsupported genotype (can only use haploid genotypes): '+gt
                    gt_number = int(gt)
                    if gt_number == 0:
                        var_seq = variant.REF
                    else:
                        var_seq = str(variant.ALT[gt_number-1])
                        assert re.match('[ACGTN]*$', var_seq), 'Unsupported variant type: '+var_seq
                    new_pos = variant.POS-1
                    assert new_pos >= pos, 'Variants overlap.'
                    revised.append(seq[pos:new_pos])
                    pos = new_pos
                    revised.append(var_seq)
                    assert seq[pos:pos+len(variant.REF)].upper() == variant.REF, 'REF column in VCF does not match reference sequence'
                    pos += len(variant.REF)
                revised.append(seq[pos:])
                            
                with open(filenames[i],'ab') as f:
                    io.write_fasta(f, name, ''.join(revised))

            del variants[name]        
        assert not variants, 'Chromosome names in VCF not in reference: '+' '.join(variants)
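The heart of both versions of this tool is the splice loop that rewrites the reference with each sample's haploid calls: copy the reference up to the variant, emit the called allele, then skip over the REF bases. A distilled sketch of just that loop (not part of nesoni); variant_calls here are (0-based position, REF, ALT) tuples, sorted and non-overlapping:

def apply_haploid_variants(seq, variant_calls):
    revised = []
    pos = 0
    for pos0, ref, alt in variant_calls:
        assert seq[pos0:pos0 + len(ref)].upper() == ref, 'REF mismatch'
        revised.append(seq[pos:pos0])    # unchanged reference up to the variant
        revised.append(alt)              # the called allele
        pos = pos0 + len(ref)            # skip the replaced reference bases
    revised.append(seq[pos:])            # trailing reference sequence
    return ''.join(revised)

# apply_haploid_variants('ACGTACGT', [(2, 'G', 'T'), (5, 'CG', 'C')]) == 'ACTTACT'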
Example 18
def read_genbank(filename):
    from Bio import Seq, SeqIO
    f = io.open_possibly_compressed_file(filename)

    id_counter = 0

    for record in SeqIO.parse(f, 'genbank'):
        name = record.id
        if name == '' or name == 'unknown':
            name = record.name

        for root_feature in record.features:
            todo = [root_feature]
            while todo:
                feature = todo.pop()
                result = Annotation()
                result.seqid = name
                result.source = 'genbank-file'
                result.type = feature.type
                result.start = feature.location.nofuzzy_start
                result.end = feature.location.nofuzzy_end
                result.score = None
                result.strand = feature.strand
                result.phase = 0  #FIXME
                result.attr = {}
                for key in feature.qualifiers:
                    result.attr[key] = ', '.join(feature.qualifiers[key])
                yield result

                if 'ID' not in result.attr:
                    id_counter += 1
                    result.attr['ID'] = '%d' % id_counter
                for sub_feature in feature.sub_features:
                    # Record this feature's ID as the Parent of each child feature.
                    sub_feature.qualifiers['Parent'] = [result.attr['ID']]

                todo.extend(feature.sub_features[::-1])

    f.close()
Example 19
def read_genbank(filename):
    from Bio import Seq, SeqIO
    f = io.open_possibly_compressed_file(filename)
    
    id_counter = 0
    
    for record in SeqIO.parse(f,'genbank'):
        name = record.id
        if name == '' or name == 'unknown':
            name = record.name

        for root_feature in record.features:
            todo = [ root_feature ]
            while todo:
                feature = todo.pop()            
                result = Annotation()
                result.seqid = name
                result.source = 'genbank-file'
                result.type = feature.type
                result.start = feature.location.nofuzzy_start
                result.end = feature.location.nofuzzy_end
                result.score = None
                result.strand = feature.strand
                result.phase = 0 #FIXME
                result.attr = { }
                for key in feature.qualifiers:
                    result.attr[key] = ', '.join(feature.qualifiers[key])
                yield result
                
                if 'ID' not in result.attr:
                    id_counter += 1
                    result.attr['ID'] = '%d' % id_counter                 
                for sub_feature in feature.sub_features:
                    # Record this feature's ID as the Parent of each child feature.
                    sub_feature.qualifiers['Parent'] = [ result.attr['ID'] ]
                
                todo.extend(feature.sub_features[::-1])
                
    f.close()
Example 20
    def run(self):
        reader_f = io.open_possibly_compressed_file(self.vcf)
        reader = vcf.Reader(reader_f)

        tags = { }
        for item in reader.metadata.get('sampleTags',[]):
            parts = item.split(',')
            tags[parts[0]] = parts
        
        assert 'reference' not in reader.samples, 'Can\'t have a sample called reference, sorry.'

        samples = [ 'reference'] + reader.samples
        
        for sample in samples:
            if sample not in tags:
                tags[sample] = [ sample, 'all' ]

        samples = selection.select_and_sort(
            self.select, self.sort, samples, lambda sample: tags[sample])
        
        required = [ i for i, sample in enumerate(samples)
                     if selection.matches(self.require, tags[sample]) ]
        
        sample_number = dict((b,a) for a,b in enumerate(reader.samples))
        
        items = [ ]
        for record in reader:
            variants = get_variants(record)
            genotypes = [ ]
            counts = [ ]
            qualities = [ ]
            for sample in samples:
                if sample == 'reference':
                    genotypes.append([0])
                    counts.append([1])
                    qualities.append(float('inf'))
                else:
                    genotypes.append(get_genotype(record.samples[sample_number[sample]]))
                    counts.append(get_variant_counts(record.samples[sample_number[sample]]))
                    qualities.append(record.samples[sample_number[sample]].data.GQ)

            # Only output when there are at least two genotypes            
            any_interesting = False
            for i in xrange(len(genotypes)):
                for j in xrange(i):
                    if (genotypes[i] is not None and genotypes[j] is not None and
                        not genotypes_equal(genotypes[i], genotypes[j])):
                        any_interesting = True
                        break
                if any_interesting: break
            if not any_interesting:
                continue

            if any(genotypes[i] is None for i in required):
                continue
                
            if self.only_snps and any(
                genotype is not None and any(len(variants[i]) != 1 for i in genotype)
                for genotype in genotypes):
                continue
                
            snpeff = snpeff_describe(record.INFO.get('EFF',''))
            if not any( selection.matches(self.snpeff_filter, item[1]) for item in (snpeff or [('',[])]) ):
                continue

            items.append(_Nway_record(variants=variants, genotypes=genotypes, counts=counts, qualities=qualities, snpeff=snpeff, record=record))
        
        self.log.log('%d variants\n\n' % len(items))
        
        if self.as_ == 'table':
            self._write_table(samples, items)
        elif self.as_ == 'nexus':
            self._write_nexus(samples, items)
        elif self.as_ == 'splitstree':
            self._write_nexus(samples, items)
            
            io.execute(
                'SplitsTree +g -i INPUT -x COMMAND',
                no_display=True,
                INPUT=self.prefix + '.nex',
                COMMAND='UPDATE; '
                        'SAVE FILE=\'%s.nex\' REPLACE=yes; '
                        'EXPORTGRAPHICS format=svg file=\'%s.svg\' REPLACE=yes TITLE=\'NeighborNet from %d variants\'; ' 
                        'QUIT' 
                        % (self.prefix, self.prefix, len(items)),
                )
        elif self.as_ == 'vcf':
            self._write_vcf(samples, items, reader)
        
        else:
            raise grace.Error('Unknown output format: '+self.as_)
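A record is only kept when at least two samples have differing, callable genotypes (the nested loop over genotype pairs above). The same test, pulled out into a hypothetical helper for clarity and reusing the genotypes_equal function already referenced in the code:

def any_pair_differs(genotypes):
    called = [g for g in genotypes if g is not None]   # ignore uncalled samples
    return any(not genotypes_equal(called[i], called[j])
               for i in xrange(len(called)) for j in xrange(i))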
Example 21
    def run(self):
        work = self.get_workspace()

        data = []
        names = []
        sample_tags = []

        old = grace.status("Loading pickles")

        max_length = 1
        for i, item in enumerate(self.pickles):
            grace.status("Loading " + os.path.basename(item))
            f = io.open_possibly_compressed_file(item)
            name, tags, datum = pickle.load(f)
            f.close()
            data.append(datum)
            names.append(name)
            sample_tags.append(tags)

            try:
                max_length = max(
                    max_length,
                    max(item[0]  #tail_length
                        for feature in datum for item in feature.hits) + 1)
            except ValueError:
                pass

            if i == 0:
                annotations = datum

        grace.status(old)

        self.log.log("Maximum tail length %d\n" % max_length)

        for i in xrange(len(names)):
            n_alignments = 0
            for feature in data[i]:
                feature.total_count = len(feature.hits)
                feature.tail_counts = [0] * max_length

                n_alignments += feature.total_count

                for tail_length, adaptor_bases in feature.hits:
                    if adaptor_bases >= self.adaptor:
                        feature.tail_counts[tail_length] += 1

                del feature.hits

            self.log.datum(names[i], 'Alignments to features', n_alignments)

        counts = []  # [feature][sample](total_count, [taillength])

        for item in data:
            assert len(item) == len(data[0])
        for row in itertools.izip(*data):
            this_counts = [(item.total_count, item.tail_counts)
                           for item in row]
            counts.append(this_counts)

        n_features = len(counts)
        n_samples = len(data)

        sample_n = [[0] * n_samples for i in xrange(n_features)]           # [feature][sample]  Total count
        sample_n_tail = [[0] * n_samples for i in xrange(n_features)]      # [feature][sample]  Polya count
        sample_prop = [[None] * n_samples for i in xrange(n_features)]     # [feature][sample]  Proportion of reads with tail (deprecated)
        sample_tail = [[None] * n_samples for i in xrange(n_features)]     # [feature][sample]  Mean tail length in each sample
        sample_sd_tail = [[None] * n_samples for i in xrange(n_features)]  # [feature][sample]  Std dev tail length in each sample
        sample_total_tail = [[0] * n_samples for i in xrange(n_features)]

        sample_quantile_tail = collections.OrderedDict(
            (item, [[None] * n_samples for i in xrange(n_features)])
            for item in [25, 50, 75, 100])

        overall_n = [0] * n_features        # [feature]  Overall count
        overall_prop = [None] * n_features  # [feature]  Overall proportion with tail
        overall_tail = [None] * n_features  # [feature]  Overall mean tail length
        overall_n_tail = [0] * n_features   # [feature]  Overall polya count
        for i, row in enumerate(counts):
            for j, (this_this_n, item) in enumerate(row):
                sample_n[i][j] = this_this_n
                sample_n_tail[i][j] = sum(item[self.tail:])
                sample_total_tail[i][j] = sum(
                    item[k] * k for k in xrange(self.tail, max_length))

                if sample_n[i][j] >= 1:
                    sample_prop[i][j] = float(
                        sample_n_tail[i][j]) / sample_n[i][j]

                if sample_n_tail[i][j] >= 1:
                    sample_tail[i][j] = float(
                        sample_total_tail[i][j]) / sample_n_tail[i][j]

                    for quantile in sample_quantile_tail:
                        counter = sample_n_tail[i][j] * quantile / 100.0
                        for k in xrange(self.tail, max_length):
                            counter -= item[k]
                            if counter <= 0: break
                        sample_quantile_tail[quantile][i][j] = k

                if sample_n_tail[i][j] >= 2:
                    sample_sd_tail[i][j] = math.sqrt(
                        float(
                            sum(item[k] * ((k - sample_tail[i][j])**2)
                                for k in xrange(self.tail, max_length))) /
                        (sample_n_tail[i][j] - 1))

            overall_n[i] = sum(sample_n[i])
            overall_n_tail[i] = sum(sample_n_tail[i])
            if overall_n[i] >= 1:
                overall_prop[i] = float(sum(sample_n_tail[i])) / overall_n[i]
            if overall_n_tail[i] >= 1:
                overall_tail[i] = float(sum(
                    sample_total_tail[i])) / overall_n_tail[i]

        for i, name in enumerate(names):
            this_total = sum(item[i] for item in sample_total_tail)
            this_n = sum(item[i] for item in sample_n_tail)
            if this_n:
                self.log.datum(name, 'Average poly-A tail',
                               float(this_total) / this_n)

        for i, name in enumerate(names):
            this_total = sum(item[i] for item in sample_n_tail)
            this_n = sum(item[i] for item in sample_n)
            if this_n:
                self.log.datum(name, 'Average proportion of reads with tail',
                               float(this_total) / this_n)

        with open(work / 'features-with-data.gff', 'wb') as f:
            annotation.write_gff3_header(f)
            for i, item in enumerate(annotations):
                item.attr['reads'] = str(overall_n[i])
                item.attr['reads_with_tail'] = str(overall_n_tail[i])
                item.attr['mean_tail'] = '%.1f' % overall_tail[i] if overall_tail[i] else 'NA'
                item.attr['proportion_with_tail'] = '%.2f' % overall_prop[i] if overall_prop[i] else 'NA'

                if overall_tail[i] is None:
                    item.attr['color'] = '#444444'
                else:
                    a = (overall_tail[i] - self.tail) / max(1, max_length - self.tail)
                    item.attr['color'] = '#%02x%02x%02x' % (
                        int(a * 255), int((1 - abs(a * 2 - 1)) * 255), 255 - int(a * 255))
                #item.attr['color'] = ...
                print >> f, item.as_gff()

        comments = ['#Counts'] + [
            '#sampleTags=' + ','.join(tags) for tags in sample_tags
        ] + [
            '"Tail_count" group is number of reads with tail',
            '"Tail" group is mean tail per sample',
            '"Proportion" group is proportion of reads with tail',
        ]

        have_biotype = any("Biotype" in item.attr for item in annotations)
        have_parent = any("Parent" in item.attr for item in annotations)
        have_relation = any("Relation" in item.attr for item in annotations)
        have_antisense = any("Antisense_parent" in item.attr
                             for item in annotations)

        def counts_iter():
            for i in xrange(n_features):
                row = collections.OrderedDict()
                row['Feature'] = annotations[i].get_id()
                for j in xrange(n_samples):
                    row[('Count', names[j])] = '%d' % sample_n[i][j]

                row[('Annotation',
                     'Length')] = annotations[i].end - annotations[i].start
                row[('Annotation',
                     'gene')] = annotations[i].attr.get('Name', '')
                row[('Annotation',
                     'product')] = annotations[i].attr.get('Product', '')
                if have_biotype:
                    row[('Annotation',
                         'biotype')] = annotations[i].attr.get('Biotype', '')
                if have_parent:
                    row[('Annotation',
                         'parent')] = annotations[i].attr.get('Parent', '')
                if have_relation:
                    row[('Annotation', 'relation')] = annotations[i].attr.get(
                        'Relation', '')

                if have_antisense:
                    row[('Annotation',
                         'antisense_gene')] = annotations[i].attr.get(
                             'Antisense_name', '')
                    row[('Annotation',
                         'antisense_product')] = annotations[i].attr.get(
                             'Antisense_product', '')
                    row[('Annotation',
                         'antisense_biotype')] = annotations[i].attr.get(
                             'Antisense_biotype', '')
                    row[('Annotation',
                         'antisense_parent')] = annotations[i].attr.get(
                             'Antisense_parent', '')

                row[('Annotation', 'chromosome')] = str(annotations[i].seqid)
                row[('Annotation', 'strand')] = str(annotations[i].strand)
                row[('Annotation', 'start')] = str(annotations[i].start + 1)
                row[('Annotation', 'end')] = str(annotations[i].end)

                row[('Annotation', 'reads')] = str(overall_n[i])
                row[('Annotation', 'reads-with-tail')] = str(overall_n_tail[i])
                row[('Annotation', 'mean-tail')] = str_na(overall_tail[i])
                row[('Annotation',
                     'proportion-with-tail')] = str_na(overall_prop[i])
                for j in xrange(n_samples):
                    row[('Tail_count', names[j])] = '%d' % sample_n_tail[i][j]
                for j in xrange(n_samples):
                    row[('Tail', names[j])] = str_na(sample_tail[i][j])
                for j in xrange(n_samples):
                    row[('Tail_sd', names[j])] = str_na(sample_sd_tail[i][j])

                for quantile in sample_quantile_tail:
                    for j in xrange(n_samples):
                        row[('Tail_quantile_%d' % quantile,
                             names[j])] = str_na(
                                 sample_quantile_tail[quantile][i][j])

                for j in xrange(len(names)):
                    row[('Proportion', names[j])] = str_na(sample_prop[i][j])
                yield row

        io.write_csv(work / 'counts.csv', counts_iter(), comments=comments)

        def write_csv_matrix(filename, matrix):
            def emitter():
                for i in xrange(n_features):
                    row = collections.OrderedDict()
                    row["Feature"] = annotations[i].get_id()
                    for j in xrange(n_samples):
                        row[names[j]] = str_na(matrix[i][j])
                    yield row

            io.write_csv(filename, emitter())

        write_csv_matrix(work / 'read_count.csv', sample_n)
        write_csv_matrix(work / 'tail_count.csv', sample_n_tail)
        write_csv_matrix(work / 'tail.csv', sample_tail)
        write_csv_matrix(work / 'tail_sd.csv', sample_sd_tail)
        for quantile in sample_quantile_tail:
            write_csv_matrix(work / ('tail_quantile_%d.csv' % quantile),
                             sample_quantile_tail[quantile])

        #def raw_columns():
        #    for i in xrange(n_samples):
        #        row = collections.OrderedDict()
        #        row['Sample'] = names[i]
        #        for j in xrange(max_length):
        #            row['length-%d' % j] = str(i*max_length+j+1) #For R+, so 1 based
        #        yield row
        #io.write_csv(work/'raw-columns.csv', raw_columns())
        #
        ##Somewhat inefficient
        #def raw():
        #    for i in xrange(n_features):
        #        row = collections.OrderedDict()
        #        row['Feature'] = annotations[i].get_id()
        #        for j in xrange(n_samples):
        #            for k in xrange(max_length):
        #                row['%d %s' % (k,names[j])] = str( counts[i][j][1][k] )
        #        yield row
        #io.write_csv(work/'raw.csv', raw())

        def pooled():
            for i in xrange(n_features):
                row = collections.OrderedDict()
                row['Feature'] = annotations[i].get_id()
                for j in xrange(max_length):
                    row[str(j)] = str(
                        sum(counts[i][k][1][j] for k in xrange(n_samples)))
                yield row

        io.write_csv(work / 'pooled.csv', pooled())
Example 22
def nway_main(
    gbk_filename,
    use_indels,
    use_reference,
    give_evidence,
    give_consequences,
    require_all,
    require_bisect,
    full_output,
    format,
    working_dirs,
    split_a,
    split_b,
    f=sys.stdout,
):
    assert working_dirs, "Need at least one working directory."
    workspaces = [working_directory.Working(dirname, must_exist=True) for dirname in working_dirs]
    reference = workspaces[0].get_reference()
    # if not annotation_filename:
    #    annotation_filename = reference.annotations_filename() #May still be None

    if use_reference:
        names = ["reference"]
        evidence_start = 1
    else:
        names = []
        evidence_start = 0

    names.extend(norm_name(item) for item in working_dirs)

    references = io.read_sequences(reference.reference_fasta_filename())

    annotations = {}
    if gbk_filename:
        from Bio import SeqIO

        for record in SeqIO.parse(io.open_possibly_compressed_file(gbk_filename), "genbank"):
            sequence = record.seq.tostring()
            features = [item for item in record.features if item.type != "source"]
            features.sort(key=lambda item: item.location.nofuzzy_start)
            annotations[sequence] = features

    iterator = reader(working_dirs, references, use_reference, annotations)

    if not use_indels:
        iterator = itertools.ifilter(has_no_indels, iterator)

    if require_all or require_bisect or format == "counts":
        iterator = itertools.ifilter(fully_unambiguous, iterator)

    if require_bisect:
        iterator = itertools.ifilter(is_binary_partition, iterator)

    if not require_bisect:
        if full_output:
            iterator = itertools.ifilter(not_boring_insertion, iterator)
        else:
            iterator = itertools.ifilter(is_interesting, iterator)

    if split_a or split_b:
        assert len(names) == len(set(names)), "Two samples with the same name"
        try:
            split_a = [names.index(norm_name(item)) for item in split_a]
            split_b = [names.index(norm_name(item)) for item in split_b]
        except ValueError:
            raise grace.Error("Sample to be split is not amongst samples given")
        iterator = itertools.ifilter(is_split(split_a, split_b), iterator)

    # if limit:
    #    iterator = itertools.islice(iterator, limit)

    if format == "table":
        line = "Reference\tPosition\tChange type"
        line += "\t" + "\t".join(names)
        if give_evidence:
            line += "\t" + "\t".join(names[evidence_start:])
        if give_consequences:
            line += "\t" + "\t".join(names[evidence_start:])
        if annotations:
            line += "\tAnnotations"
        print >> f, line
        for calls in iterator:
            line = "%s\t%d\t%s\t%s" % (
                calls.ref_name,
                calls.ref_pos + 1,
                change_type(calls),
                "\t".join(item.consensus for item in calls.calls),
            )
            if give_evidence:
                line += "\t" + "\t".join(item.evidence for item in calls.calls[evidence_start:])
            if give_consequences:
                line += "\t" + "\t".join(item.consequences for item in calls.calls[evidence_start:])
            if annotations:
                line += "\t" + describe_features(calls.features)
            print >> f, line

    elif format == "compact":
        for line in transpose_strings(names):
            print >> f, line
        print >> f

        for calls in iterator:
            if calls.is_insertion:
                footer = "%12d.5 %s" % (calls.ref_pos, calls.ref_name)
            else:
                footer = "%12d   %s" % (calls.ref_pos + 1, calls.ref_name)

            t = transpose_strings([item.consensus for item in calls.calls], "-", 1)
            top = t[0] + " " + footer
            if give_consequences:
                consequences = []
                for call in calls.calls:
                    if call.consequences:
                        for item in call.consequences.split(", "):
                            item = " ".join(item.split()[:3])
                            if item not in consequences:
                                consequences.append(item)

                if consequences:
                    top += "  " + " / ".join(sorted(consequences))
            top += "  " + describe_features(calls.features)
            print >> f, top
            for line in t[1:]:
                print >> f, line

    elif format == "nexus":
        buckets = [[] for name in names]
        for calls in iterator:
            for i, char in enumerate(partition_string(calls)):
                buckets[i].append(char)

        print >> f, "#NEXUS"
        print >> f, "begin taxa;"
        print >> f, "dimensions ntax=%d;" % len(names)
        print >> f, "taxlabels"
        for name in names:
            print >> f, name
        print >> f, ";"
        print >> f, "end;"

        print >> f, "begin characters;"
        print >> f, "dimensions nchar=%d;" % len(buckets[0])
        print >> f, 'format datatype=STANDARD symbols="ACGT-0123456789" missing=N;'
        print >> f, "matrix"
        for name, bucket in itertools.izip(names, buckets):
            print >> f, name, "".join(bucket)
        print >> f, ";"
        print >> f, "end;"

    elif format == "counts":
        for line in transpose_strings(names):
            print >> f, line
        print >> f

        counts = {}
        for calls in iterator:
            count_str = partition_string(calls)
            if count_str not in counts:
                counts[count_str] = 1
            else:
                counts[count_str] += 1

        for count_str in sorted(counts, key=lambda x: (counts[x], x), reverse=True):
            print >> f, "%s   %d" % (transpose_strings(count_str)[0], counts[count_str])

    else:
        raise grace.Error("Unknown output format: " + format)
Example 23
    def run(self):
        workspace = self.get_workspace()
                
        header = [ "##gff-version 3\n" ]
        lengths = { }
        with io.open_possibly_compressed_file(self.features) as f:
            f.next()
            for line in f:
                if not line.startswith("#"): break
                if line.startswith("##gff-version"): continue
                header.append(line)
                parts = line.strip().split()
                if parts[0] == "##sequence-region":
                    lengths[parts[1]] = int(parts[3])
                    
        header = "".join(header)
                
        items = list(annotation.read_gff(self.features, "/"))
        annotation.link_up_annotations(items)
        for item in items:
            assert len(item.parents) < 2
            if "ID" in item.attr:
                item.attr["ID"] = item.attr["ID"].split(":")[1]
            if "Parent" in item.attr:
                item.attr["Parent"] = item.attr["Parent"].split(":")[1]
            if item.parents:
                item.parent = item.parents[0]
            
        
        def well_supported(item):
            if self.support is None: return True
            level = item.attr.get("transcript_support_level","NA").split()[0]
            if not level.isdigit(): return False
            return int(level) <= self.support
        
        exons = [ item for item in items if item.type == "exon" and well_supported(item.parent) ]
        exon_index = span_index.index_annotations(exons)
        
        utrs = [ ]
        extended_utrs = [ ]
        utr_parts = [ ]
        exons_kept = [ ]
        cds_kept = [ ]
        transcripts_kept = [ ]
        for item in items:
            this_exons = [ item2 for item2 in item.children if item2.type == "exon" ]
            if this_exons and well_supported(item):
                transcripts_kept.append(item)
                exons_kept.extend(this_exons)
                cds_kept.extend([ item2 for item2 in item.children if item2.type == "CDS" ])
        
            if self.gene_level:
                utr_bits = [ item3 for item2 in item.children  if well_supported(item2)
                                   for item3 in item2.children if item3.type == self.what ] 
            else:
                if not well_supported(item): continue
                utr_bits = [ item2 for item2 in item.children if item2.type == self.what ] 
            
            if not utr_bits:
                continue
            
            utr = utr_bits[0].copy()
            for item2 in utr_bits[1:]:
                utr = utr.span_with(item2)
            
            gene = item if self.gene_level else item.parent
            
            utr.attr = dict(
                ID=item.get_id(),
                Name=item.attr["Name"],
                gene_id=gene.get_id(),
                gene=gene.attr["Name"],
                description=gene.attr.get("description",""),
                biotype=item.attr["biotype"]
                )
        
            max_extension = 10000
            if item.strand < 0:
                max_extension = min(max_extension, utr.start)
            else:
                max_extension = min(max_extension, lengths[utr.seqid] - utr.end)
            assert max_extension >= 0, utr
            
            end = utr.three_prime()
            for hit in exon_index.get(end.shifted(0,max_extension), same_strand=True):
                #if hit.parent.get_id() == item.get_id():
                #    continue
                rel = hit.relative_to(end).start
                if rel >= 0:
                    max_extension = min(max_extension, rel)
        
            extended_utr = utr.shifted(0,max_extension)
            extended_utr.start = max(extended_utr.start, 0)
            utr.attr["max_extension"] = str(max_extension)
        
            utrs.append(utr)
            extended_utrs.append(extended_utr)
            
            for item2 in utr_bits:
                part = item2.copy()
                part.attr = dict(Parent=item.get_id())
                part.type = "part"
                utr_parts.append(part)
                            
        write_gff3(workspace/"utr.gff",utrs,header)
        write_gff3(workspace/"utr_extended.gff",extended_utrs,header)
        write_gff3(workspace/"utr_part.gff",utr_parts,header)
        write_gff3(workspace/"transcript.gff",transcripts_kept,header)
        write_gff3(workspace/"exon.gff",exons_kept,header)
        write_gff3(workspace/"cds.gff",cds_kept,header)

Example 24
    def run(self):
        references = { }
        for filename in self.reference_filenames:
            for name, seq in io.read_sequences(filename):
                references[name] = seq
        
        tail_lengths = { }
        adaptor_bases = { }
        for filename in self.clips:
            with io.open_possibly_compressed_file(filename) as f:
                for line in f:
                    if line.startswith('#'): continue
                    parts = line.rstrip('\n').split('\t')
                    name = parts[0].split()[0]
                    tail_lengths[name] = int(parts[3])-int(parts[2])
                    adaptor_bases[name] = int(parts[6])
        
        in_file = self.begin_input()
        out_file = self.begin_output()
        
        assert self.prop_a >= 0.0 and self.prop_a <= 1.0
        a_score = 1-self.prop_a
        non_a_score = -self.prop_a
        
        for line in in_file:
            line = line.rstrip()
            if line.startswith('@'):
                print >> out_file, line
                continue
            
            al = Alignment(line)
            
            if al.flag & FLAG_UNMAPPED:
                continue

            #ref = references[al.rname]

            reverse = al.flag & FLAG_REVERSE
            if reverse:
                read_bases = rev_comp(al.seq)
                read_qual = al.qual[::-1]
                cigar = cigar_decode(al.cigar)[::-1]
            else:
                read_bases = al.seq
                read_qual = al.qual
                cigar = cigar_decode(al.cigar)
            
            n_tail = tail_lengths[al.qname]
            
            #if reverse:
            #    if al.pos-1-n_tail < 0: continue #TODO: handle tail extending beyond end of reference
            #    bases_ref = rev_comp(ref[al.pos-1-n_tail:al.pos-1])    
            #else:
            #    if al.pos-1+al.length+n_tail > len(ref): continue #TODO: handle tail extending beyond end of reference
            #    bases_ref = ref[al.pos-1+al.length:al.pos-1+al.length+n_tail] .upper()#upper was missing for a long time. Bug!
            #
            #extension = 0
            #while extension < n_tail and bases_ref[extension] == 'A':
            #    extension += 1
            
            if reverse:
                feat = annotation.Annotation(al.rname, start=al.pos-1-n_tail, end=al.pos-1, strand=-1)
            else:
                feat = annotation.Annotation(al.rname, start=al.pos-1+al.length, end=al.pos-1+al.length+n_tail, strand=1)
            bases_ref = feat.get_seq(references).upper()
            
            # Allow up to 60% mismatch on As
            # Treat soft clipping as insertion for simplicity
            cigar = cigar.replace("S","I")
            assert "H" not in cigar, "Can't handle hard clipping"
            
            extension = 0
            best_score = 0.0
            score = 0.0
            
            # Soft clipping treated as a mismatch
            i = len(cigar)-1
            while i >= 0 and cigar[i] in "I":
                score += non_a_score
                i -= 1
            
            for i in xrange(n_tail):
                if bases_ref[i] == "A":
                    score += a_score
                else:
                    score += non_a_score
                    
                if score >= best_score:
                    extension = i+1
                    best_score = score
            #print >> sys.stderr, reverse!=0, n_tail, extension, bases_ref
                      
            
            if n_tail-extension > 0:
                al.extra.append('AN:i:%d' % (n_tail-extension))
                al.extra.append('AG:i:%d' % (extension))
            if adaptor_bases[al.qname]:
                al.extra.append('AD:i:%d' % adaptor_bases[al.qname])
            if n_tail-extension >= self.tail:
                #if reverse:
                #    tail_refpos = al.pos-extension
                #else:
                #    tail_refpos = al.pos+al.length+extension-1 
                #al.extra.append('AA:i:%d'%tail_refpos)
                al.extra.append('AA:i:1')
            
            cigar += 'M' * extension
            read_bases += 'N' * extension #Since mispriming is so common (and loading the original sequence here would be a pain)
            read_qual += chr(33+20) * extension #Arbitrarily give quality 20
            al.length += extension
            if reverse:
                al.pos -= extension
                al.seq = rev_comp(read_bases)
                al.qual = read_qual[::-1]
                al.cigar = cigar_encode(cigar[::-1])
            else: 
                al.seq = read_bases
                al.qual = read_qual
                al.cigar = cigar_encode(cigar)
                
            print >> out_file, al
    
        self.end_output(out_file)
        self.end_input(in_file)
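The extension step above scores each candidate poly-A run on the reference: a run of length L containing k 'A' bases scores k - prop_a * L (each A contributes 1 - prop_a, every other base -prop_a), and the longest prefix achieving the maximal running score is kept, after an initial penalty for trailing soft-clipped bases. A sketch of just the scoring loop, ignoring that soft-clip penalty; the helper name is illustrative:

def best_a_extension(bases_ref, prop_a):
    a_score, non_a_score = 1.0 - prop_a, -prop_a
    extension, best_score, score = 0, 0.0, 0.0
    for i, base in enumerate(bases_ref):
        score += a_score if base == 'A' else non_a_score
        if score >= best_score:        # ties prefer the longer extension
            extension, best_score = i + 1, score
    return extension

# best_a_extension('AAGA', 0.4) == 4   (3 A's over length 4: 3 - 0.4*4 = 1.4)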
Example 25
    def run(self):
        assert len(self.pickles) > 0, "No samples to count."
        
        work = self.get_workspace()
        
        data = [ ]
        names = [ ]
        sample_tags = [ ]
        
        old = grace.status("Loading pickles")
        
        max_length = 1
        for i, item in enumerate(self.pickles):
            grace.status("Loading "+os.path.basename(item))
            f = io.open_possibly_compressed_file(item)
            name, tags, datum = pickle.load(f)
            f.close()
            data.append(datum)
            names.append(name)
            sample_tags.append(tags)
            
            try:
                max_length = max(max_length, max( 
                    item[0] #tail_length
                    for feature in datum
                    for item in feature.hits
                    ) + 1)
            except ValueError:
                pass
            
            if i == 0:
               annotations = datum
        
        grace.status(old)
        
        self.log.log("Maximum tail length %d\n" % max_length)

        for i in xrange(len(names)):        
            n_alignments = 0
            for feature in data[i]:
                feature.total_count = len(feature.hits)
                feature.tail_counts = [ 0 ] * max_length
                
                n_alignments += feature.total_count
                
                for tail_length, adaptor_bases in feature.hits:
                    if adaptor_bases >= self.adaptor:
                        feature.tail_counts[tail_length] += 1
                
                del feature.hits

            self.log.datum(names[i], 'Alignments to features', n_alignments)
                
        
        counts = [ ]  # [feature][sample](total_count, [taillength])
        
        for item in data: 
            assert len(item) == len(data[0])
        for row in itertools.izip(*data):
            this_counts = [ (item.total_count, item.tail_counts) for item in row ]
            counts.append(this_counts)
        
        n_features = len(counts)
        n_samples = len(data)
        
        sample_n = [ [0]*n_samples for i in xrange(n_features) ]        # [feature][sample]  Total count
        sample_n_tail = [ [0]*n_samples for i in xrange(n_features) ]   # [feature][sample]  Polya count
        sample_prop = [ [None]*n_samples for i in xrange(n_features) ]    # [feature][sample]  Proportion of reads with tail (deprecated)
        sample_tail = [ [None]*n_samples for i in xrange(n_features) ]    # [feature][sample]  Mean tail length in each sample
        sample_sd_tail = [ [None]*n_samples for i in xrange(n_features) ] # [feature][sample]  Std dev tail length in each sample
        sample_total_tail = [ [0]*n_samples for i in xrange(n_features) ]
        
        sample_quantile_tail = collections.OrderedDict( 
            (item, [ [None]*n_samples for i in xrange(n_features) ]) 
            for item in [25,50,75,100]
            )
        
        overall_n = [ 0 ]*n_features       # [feature]          Overall count
        overall_prop = [ None ]*n_features   # [feature]          Overall proportion with tail
        overall_tail = [ None ]*n_features   # [feature]          Overall mean tail length
        overall_n_tail = [ 0 ]*n_features  # [feature]          Overall polya count
        for i, row in enumerate(counts):
            for j, (this_this_n, item) in enumerate(row):
                sample_n[i][j] = this_this_n
                sample_n_tail[i][j] = sum(item[self.tail:])
                sample_total_tail[i][j] = sum( item[k]*k for k in xrange(self.tail,max_length) )

                if sample_n[i][j] >= 1:
                    sample_prop[i][j] = float(sample_n_tail[i][j])/sample_n[i][j]
                
                if sample_n_tail[i][j] >= 1:
                    sample_tail[i][j] = float(sample_total_tail[i][j])/sample_n_tail[i][j]
                
                    for quantile in sample_quantile_tail:
                        counter = sample_n_tail[i][j] * quantile / 100.0
                        for k in xrange(self.tail, max_length):
                            counter -= item[k]
                            if counter <= 0: break
                        sample_quantile_tail[quantile][i][j] = k
                
                if sample_n_tail[i][j] >= 2:
                    sample_sd_tail[i][j] = math.sqrt(
                        float(sum( item[k]*((k-sample_tail[i][j])**2) for k in xrange(self.tail,max_length) ))
                        / (sample_n_tail[i][j]-1)
                        )
                    
            overall_n[i] = sum(sample_n[i])
            overall_n_tail[i] = sum(sample_n_tail[i])
            if overall_n[i] >= 1:
                overall_prop[i] = float(sum(sample_n_tail[i]))/overall_n[i]
            if overall_n_tail[i] >= 1:
                overall_tail[i] = float(sum(sample_total_tail[i]))/overall_n_tail[i]
             
        for i, name in enumerate(names):
            this_total = sum( item[i] for item in sample_total_tail )
            this_n = sum( item[i] for item in sample_n_tail )
            if this_n:
                self.log.datum(name, 'Average poly-A tail', float(this_total)/this_n)
                
        for i, name in enumerate(names):
            this_total = sum( item[i] for item in sample_n_tail )
            this_n = sum( item[i] for item in sample_n )
            if this_n:
                self.log.datum(name, 'Average proportion of reads with tail', float(this_total)/this_n)
        
        with open(work/'features-with-data.gff','wb') as f:
            annotation.write_gff3_header(f)
            for i, item in enumerate(annotations):
                item.attr['reads'] = str(overall_n[i])
                item.attr['reads_with_tail'] = str(overall_n_tail[i])
                item.attr['mean_tail'] = '%.1f'%overall_tail[i] if overall_tail[i] else 'NA'
                item.attr['proportion_with_tail'] = '%.2f'%overall_prop[i] if overall_prop[i] else 'NA'
                
                if overall_tail[i] is None:
                    item.attr['color'] = '#444444'
                else:
                    a = (overall_tail[i]-self.tail)/max(1,max_length-self.tail)
                    item.attr['color'] = '#%02x%02x%02x' % (int(a*255),int((1-abs(a*2-1))*255),255-int(a*255))
                #item.attr['color'] = ...                
                print >> f, item.as_gff()
        
        
        comments = [ '#Counts' ] + [
            '#sampleTags='+','.join(tags)
            for tags in sample_tags
            ] + [
            '"Tail_count" group is number of reads with tail',
            '"Tail" group is mean tail per sample',
            '"Proportion" group is proportion of reads with tail',
            ]
            
        have_biotype = any("Biotype" in item.attr for item in annotations)
        have_parent = any("Parent" in item.attr for item in annotations)
        have_relation = any("Relation" in item.attr for item in annotations)
        have_antisense = any("Antisense_parent" in item.attr for item in annotations)

        def counts_iter():
            for i in xrange(n_features):
                row = collections.OrderedDict()
                row['Feature'] = annotations[i].get_id()
                for j in xrange(n_samples):
                    row[('Count',names[j])] = '%d' % sample_n[i][j]

                row[('Annotation','Length')] = annotations[i].end - annotations[i].start
                row[('Annotation','gene')] = annotations[i].attr.get('Name','')
                row[('Annotation','product')] = annotations[i].attr.get('Product','')
                if have_biotype:
                    row[('Annotation','biotype')] = annotations[i].attr.get('Biotype','')
                if have_parent:
                    row[('Annotation','parent')] = annotations[i].attr.get('Parent','')
                if have_relation:
                    row[('Annotation','relation')] = annotations[i].attr.get('Relation','')
                
                if have_antisense:
                    row[('Annotation','antisense_gene')] = annotations[i].attr.get('Antisense_name','')
                    row[('Annotation','antisense_product')] = annotations[i].attr.get('Antisense_product','')
                    row[('Annotation','antisense_biotype')] = annotations[i].attr.get('Antisense_biotype','')
                    row[('Annotation','antisense_parent')] = annotations[i].attr.get('Antisense_parent','')
                
                row[('Annotation','chromosome')] = str(annotations[i].seqid)
                row[('Annotation','strand')] = str(annotations[i].strand)
                row[('Annotation','start')] = str(annotations[i].start+1)
                row[('Annotation','end')] = str(annotations[i].end)
                
                row[('Annotation','reads')] = str(overall_n[i])
                row[('Annotation','reads-with-tail')] = str(overall_n_tail[i])
                row[('Annotation','mean-tail')] = str_na(overall_tail[i])
                row[('Annotation','proportion-with-tail')] = str_na(overall_prop[i])
                for j in xrange(n_samples):
                    row[('Tail_count',names[j])] = '%d' % sample_n_tail[i][j]
                for j in xrange(n_samples):
                    row[('Tail',names[j])] = str_na(sample_tail[i][j])
                for j in xrange(n_samples):
                    row[('Tail_sd',names[j])] = str_na(sample_sd_tail[i][j])
                
                for quantile in sample_quantile_tail:
                    for j in xrange(n_samples):
                        row[('Tail_quantile_%d'%quantile,names[j])] = str_na(sample_quantile_tail[quantile][i][j])                    
                
                for j in xrange(len(names)):
                    row[('Proportion',names[j])] = str_na(sample_prop[i][j])
                yield row
        io.write_csv(work/'counts.csv', counts_iter(), comments=comments)
        
        
        def write_csv_matrix(filename, matrix):
            def emitter():
                for i in xrange(n_features):
                    row = collections.OrderedDict()
                    row["Feature"] = annotations[i].get_id()
                    for j in xrange(n_samples):
                        row[names[j]] = str_na(matrix[i][j])
                    yield row
            io.write_csv(filename, emitter())
            
        write_csv_matrix(work/'read_count.csv', sample_n)
        write_csv_matrix(work/'tail_count.csv', sample_n_tail)
        write_csv_matrix(work/'tail.csv', sample_tail)
        write_csv_matrix(work/'tail_sd.csv', sample_sd_tail)
        for quantile in sample_quantile_tail:
            write_csv_matrix(work/('tail_quantile_%d.csv'%quantile), sample_quantile_tail[quantile])


        #def raw_columns():
        #    for i in xrange(n_samples):
        #        row = collections.OrderedDict()
        #        row['Sample'] = names[i]
        #        for j in xrange(max_length):
        #            row['length-%d' % j] = str(i*max_length+j+1) #For R+, so 1 based
        #        yield row
        #io.write_csv(work/'raw-columns.csv', raw_columns())
        #
        ##Somewhat inefficient        
        #def raw():
        #    for i in xrange(n_features):
        #        row = collections.OrderedDict()
        #        row['Feature'] = annotations[i].get_id()
        #        for j in xrange(n_samples):
        #            for k in xrange(max_length):
        #                row['%d %s' % (k,names[j])] = str( counts[i][j][1][k] )
        #        yield row
        #io.write_csv(work/'raw.csv', raw())
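
        # pooled.csv below: per-feature tail-length histogram summed over all
        # samples, one column per length.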
        
        def pooled():
            for i in xrange(n_features):
                row = collections.OrderedDict()
                row['Feature'] = annotations[i].get_id()
                for j in xrange(max_length):
                    row[str(j)] = str( sum( counts[i][k][1][j] for k in xrange(n_samples) ) )
                yield row
        io.write_csv(work/'pooled.csv', pooled())
Example 26
    def run(self):
        references = {}
        for filename in self.reference_filenames:
            for name, seq in io.read_sequences(filename):
                references[name] = seq

        tail_lengths = {}
        adaptor_bases = {}
        for filename in self.clips:
            with io.open_possibly_compressed_file(filename) as f:
                for line in f:
                    if line.startswith('#'): continue
                    parts = line.rstrip('\n').split('\t')
                    name = parts[0].split()[0]
                    tail_lengths[name] = int(parts[3]) - int(parts[2])
                    adaptor_bases[name] = int(parts[6])

        in_file = self.begin_input()
        out_file = self.begin_output()

        assert self.prop_a >= 0.0 and self.prop_a <= 1.0
        a_score = 1 - self.prop_a
        non_a_score = -self.prop_a
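
        # With this scoring, a stretch of candidate tail scores >= 0 exactly when
        # its fraction of reference A's is at least prop_a.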

        for line in in_file:
            line = line.rstrip()
            if line.startswith('@'):
                print >> out_file, line
                continue

            al = Alignment(line)

            if al.flag & FLAG_UNMAPPED:
                continue

            #ref = references[al.rname]

            reverse = al.flag & FLAG_REVERSE
            if reverse:
                read_bases = rev_comp(al.seq)
                read_qual = al.qual[::-1]
                cigar = cigar_decode(al.cigar)[::-1]
            else:
                read_bases = al.seq
                read_qual = al.qual
                cigar = cigar_decode(al.cigar)

            n_tail = tail_lengths[al.qname]

            #if reverse:
            #    if al.pos-1-n_tail < 0: continue #TODO: handle tail extending beyond end of reference
            #    bases_ref = rev_comp(ref[al.pos-1-n_tail:al.pos-1])
            #else:
            #    if al.pos-1+al.length+n_tail > len(ref): continue #TODO: handle tail extending beyond end of reference
            #    bases_ref = ref[al.pos-1+al.length:al.pos-1+al.length+n_tail] .upper()#upper was missing for a long time. Bug!
            #
            #extension = 0
            #while extension < n_tail and bases_ref[extension] == 'A':
            #    extension += 1
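
            # Fetch the n_tail reference bases just past the 3' end of the aligned
            # read: before al.pos for reverse-strand hits, after the alignment
            # otherwise.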

            if reverse:
                feat = annotation.Annotation(al.rname,
                                             start=al.pos - 1 - n_tail,
                                             end=al.pos - 1,
                                             strand=-1)
            else:
                feat = annotation.Annotation(al.rname,
                                             start=al.pos - 1 + al.length,
                                             end=al.pos - 1 + al.length +
                                             n_tail,
                                             strand=1)
            bases_ref = feat.get_seq(references).upper()

            # Allow up to 60% mismatch on As
            # Treat soft clipping as insertion for simplicity
            cigar = cigar.replace("S", "I")
            assert "H" not in cigar, "Can't handle hard clipping"

            extension = 0
            best_score = 0.0
            score = 0.0

            # Soft clipping treated as a mismatch
            i = len(cigar) - 1
            while i >= 0 and cigar[i] in "I":
                score += non_a_score
                i -= 1
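
            # Grow the genomic extension one base at a time: reference A's push the
            # running score up, other bases pull it down, and the longest prefix
            # achieving the running maximum becomes the extension.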

            for i in xrange(n_tail):
                if bases_ref[i] == "A":
                    score += a_score
                else:
                    score += non_a_score

                if score >= best_score:
                    extension = i + 1
                    best_score = score
            #print >> sys.stderr, reverse!=0, n_tail, extension, bases_ref

            if n_tail - extension > 0:
                al.extra.append('AN:i:%d' % (n_tail - extension))
                al.extra.append('AG:i:%d' % (extension))
            if adaptor_bases[al.qname]:
                al.extra.append('AD:i:%d' % adaptor_bases[al.qname])
            if n_tail - extension >= self.tail:
                #if reverse:
                #    tail_refpos = al.pos-extension
                #else:
                #    tail_refpos = al.pos+al.length+extension-1
                #al.extra.append('AA:i:%d'%tail_refpos)
                al.extra.append('AA:i:1')

            cigar += 'M' * extension
            read_bases += 'N' * extension  # 'N' rather than 'A', since mispriming is so common (and loading the original read sequence here would be a pain)
            read_qual += chr(33 + 20) * extension  #Arbitrarily give quality 20
            al.length += extension
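
            # Write the extension back into the SAM record, restoring the original
            # strand orientation of SEQ, QUAL and CIGAR.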
            if reverse:
                al.pos -= extension
                al.seq = rev_comp(read_bases)
                al.qual = read_qual[::-1]
                al.cigar = cigar_encode(cigar[::-1])
            else:
                al.seq = read_bases
                al.qual = read_qual
                al.cigar = cigar_encode(cigar)

            print >> out_file, al

        self.end_output(out_file)
        self.end_input(in_file)
Example 27
def main(args):
    genbank_filename, args = grace.get_option_value(args,'--gbk',str,None)
    use_indels, args = grace.get_option_value(args,'--indels',grace.as_bool,True)
    use_reference, args = grace.get_option_value(args,'--reference',grace.as_bool,True)
    give_evidence, args = grace.get_option_value(args,'--evidence',grace.as_bool,True)
    give_consequences, args = grace.get_option_value(args,'--consequences',grace.as_bool,True)
    require_all, args = grace.get_option_value(args,'--require-all',grace.as_bool,False)
    require_bisect, args = grace.get_option_value(args,'--require-bisect',grace.as_bool,False)
    full_output, args = grace.get_option_value(args,'--full',grace.as_bool,False)
    format, args = grace.get_option_value(args,'--as',str,'table')
    
    # Secret option!
    limit, args = grace.get_option_value(args,'--limit',int,None)
    
    grace.expect_no_further_options(args)

    if len(args) < 1:
        sys.stderr.write(USAGE)
        return 1

    working_dirs = [ ]
    split_a = [ ]
    split_b = [ ]
    def default(args):
        working_dirs.extend(args)
    def splitting(args):
        split_a.extend(args)
    def splitting_from(args):
        split_b.extend(args)
        
    grace.execute(args, {
        'splitting' : splitting,
        'from' : splitting_from 
    }, default
    )
    
    if use_reference:
        names = ['reference']
        evidence_start = 1
    else:
        names = [ ]
        evidence_start = 0
        
    names.extend( norm_name(item) for item in  working_dirs )
        
    references = io.read_sequences(os.path.join(working_dirs[0], 'reference.fa'))
    
    annotations = { }
    if genbank_filename:
        from Bio import SeqIO
        for record in SeqIO.parse(io.open_possibly_compressed_file(genbank_filename),'genbank'):
            sequence = record.seq.tostring()
            features = [ item for item in record.features if item.type != 'source' ]
            features.sort(key=lambda item: item.location.nofuzzy_start)
            annotations[sequence] = features
    
    iterator = reader(working_dirs, references, use_reference, annotations)
    
    if not use_indels:
        iterator = itertools.ifilter(has_no_indels, iterator)

    if require_all or require_bisect or format == 'counts':
        iterator = itertools.ifilter(fully_unambiguous, iterator)
    
    if require_bisect:
        iterator = itertools.ifilter(is_binary_partition, iterator)

    if not require_bisect:
        if full_output:
            iterator = itertools.ifilter(not_boring_insertion, iterator)
        else:
            iterator = itertools.ifilter(is_interesting, iterator)

    if split_a or split_b:
        assert len(names) == len(set(names)), 'Two samples with the same name'
        try:
            split_a = [ names.index(norm_name(item)) for item in split_a ]
            split_b = [ names.index(norm_name(item)) for item in split_b ]
        except ValueError:
            raise grace.Error('Sample to be split is not amongst samples given')
        iterator = itertools.ifilter(is_split(split_a, split_b), iterator)

    if limit:
        iterator = itertools.islice(iterator, limit)
    
    if format == 'table':
        line = 'Reference\tPosition\tChange type'
        line +=  '\t' + '\t'.join(names)
        if give_evidence:
            line += '\t' + '\t'.join(names[evidence_start:])
        if give_consequences:
            line += '\t' + '\t'.join(names[evidence_start:])
        if annotations:
            line += '\tAnnotations'
        print line
        for calls in iterator:
            line = '%s\t%d\t%s\t%s' % (
                calls.ref_name, 
                calls.ref_pos+1, 
                change_type(calls), 
                '\t'.join(item.consensus for item in calls.calls))
            if give_evidence:
                line += '\t' + '\t'.join(item.evidence for item in calls.calls[evidence_start:])
            if give_consequences:
                line += '\t' + '\t'.join(item.consequences for item in calls.calls[evidence_start:])
            if annotations:
                line += '\t' + describe_features(calls.features)
            print line

    elif format == 'compact':
        for line in transpose_strings(names):
            print line
        print
        
        for calls in iterator:
            if calls.is_insertion:
                footer = '%12d.5 %s' % (calls.ref_pos, calls.ref_name)
            else: 
                footer = '%12d   %s' % (calls.ref_pos+1, calls.ref_name)
            
            t = transpose_strings([ item.consensus for item in calls.calls ], '-', 1)
            top = t[0] + ' ' + footer
            if give_consequences:
                consequences = [ ]
                for call in calls.calls:
                    if call.consequences:
                        for item in call.consequences.split(', '):
                            item = ' '.join(item.split()[:3])
                            if item not in consequences: consequences.append(item)
                        
                if consequences:
                    top += '  ' + ' / '.join(sorted(consequences))
            top += '  ' + describe_features(calls.features)
            print top
            for line in t[1:]:
                print line            
    
    elif format == 'nexus':
        buckets = [ [ ] for name in names ]
        for calls in iterator:
            for i, char in enumerate(partition_string(calls)):
                buckets[i].append(char)
        
        print '#NEXUS'
        print 'begin taxa;'
        print 'dimensions ntax=%d;' % len(names)
        print 'taxlabels'
        for name in names:
            print name
        print ';'
        print 'end;'

        print 'begin characters;'
        print 'dimensions nchar=%d;' % len(buckets[0])
        print 'format datatype=STANDARD symbols="ACGT-0123456789" missing=N;'
        print 'matrix'
        for name, bucket in itertools.izip(names, buckets):
            print name, ''.join(bucket)
        print ';'
        print 'end;'
    
    elif format == 'counts':
        for line in transpose_strings(names):
            print line
        print

        counts = { }
        for calls in iterator:
            count_str = partition_string(calls)
            if count_str not in counts:
                counts[count_str] = 1
            else:
                counts[count_str] += 1
        
        for count_str in sorted(counts, key=lambda x: (counts[x], x), reverse=True):
            print '%s   %d' % (transpose_strings(count_str)[0], counts[count_str])
    
    else:
        raise grace.Error('Unknown output format: ' + format)
Example 28
    def run(self):
        reader_f = io.open_possibly_compressed_file(self.vcf)
        reader = vcf.Reader(reader_f)

        tags = {}
        for item in reader.metadata.get('sampleTags', []):
            parts = item.split(',')
            tags[parts[0]] = parts

        assert 'reference' not in reader.samples, 'Can\'t have a sample called reference, sorry.'

        samples = ['reference'] + reader.samples

        for sample in samples:
            if sample not in tags:
                tags[sample] = [sample, 'all']

        samples = selection.select_and_sort(self.select, self.sort, samples,
                                            lambda sample: tags[sample])

        required = [
            i for i, sample in enumerate(samples)
            if selection.matches(self.require, tags[sample])
        ]

        sample_number = dict((b, a) for a, b in enumerate(reader.samples))

        items = []
        for record in reader:
            variants = get_variants(record)
            genotypes = []
            counts = []
            qualities = []
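
            # 'reference' acts as a pseudo-sample: genotype 0 (the REF allele), a
            # single supporting count, and infinite genotype quality.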
            for sample in samples:
                if sample == 'reference':
                    genotypes.append([0])
                    counts.append([1])
                    qualities.append(float('inf'))
                else:
                    genotypes.append(
                        get_genotype(record.samples[sample_number[sample]]))
                    counts.append(
                        get_variant_counts(
                            record.samples[sample_number[sample]]))
                    qualities.append(
                        record.samples[sample_number[sample]].data.GQ)

            # Only output when there are at least two genotypes
            any_interesting = False
            for i in xrange(len(genotypes)):
                for j in xrange(i):
                    if (genotypes[i] is not None and genotypes[j] is not None
                            and
                            not genotypes_equal(genotypes[i], genotypes[j])):
                        any_interesting = True
                        break
                if any_interesting: break
            if not any_interesting:
                continue

            if any(genotypes[i] is None for i in required):
                continue

            if self.only_snps and any(genotype is not None and any(
                    len(variants[i]) != 1 for i in genotype)
                                      for genotype in genotypes):
                continue
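
            # Keep the record only if some snpEff effect matches the filter;
            # records with no EFF annotation are tested against an empty effect.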

            snpeff = snpeff_describe(record.INFO.get('EFF', ''))
            if not any(
                    selection.matches(self.snpeff_filter, item[1])
                    for item in (snpeff or [('', [])])):
                continue

            items.append(
                _Nway_record(variants=variants,
                             genotypes=genotypes,
                             counts=counts,
                             qualities=qualities,
                             snpeff=snpeff,
                             record=record))

        self.log.log('%d variants\n\n' % len(items))

        if self.as_ == 'table':
            self._write_table(samples, items)
        elif self.as_ == 'nexus':
            self._write_nexus(samples, items)
        elif self.as_ == 'splitstree':
            self._write_nexus(samples, items)

            io.execute(
                'SplitsTree +g -i INPUT -x COMMAND',
                no_display=True,
                INPUT=self.prefix + '.nex',
                COMMAND='UPDATE; '
                'SAVE FILE=\'%s.nex\' REPLACE=yes; '
                'EXPORTGRAPHICS format=svg file=\'%s.svg\' REPLACE=yes TITLE=\'NeighborNet from %d variants\'; '
                'QUIT' % (self.prefix, self.prefix, len(items)),
            )
        elif self.as_ == 'vcf':
            self._write_vcf(samples, items, reader)

        else:
            raise grace.Error('Unknown output format: ' + self.as_)
Example 29
def main(args):
    default_transl_table, args = grace.get_option_value(args, '--transl_table', int, 11)
    use_coverage, args = grace.get_flag(args, '--use-coverage')
    coverage_cutoff, args = grace.get_option_value(args, '--coverage-cutoff', float, 0.1)
    tabular, args = grace.get_flag(args, '--tabular')
    noheader, args = grace.get_flag(args, '--noheader')
    verbose, args = grace.get_flag(args, '--verbose')
    bandwidth, args = grace.get_option_value(args, '--band', int, 20)
    grace.expect_no_further_options(args)

    if len(args) != 2:
        print USAGE
        return 1
    
    genbank_filename = args[0]
    alignment_filename = args[1]
    
    if os.path.isdir(alignment_filename):
        alignment_filename = os.path.join(alignment_filename, 'alignment.maf')
    
    working_dir = os.path.split(alignment_filename)[0]
    
    alignments = load_alignments(alignment_filename)
    
    summaries = [ ]
    details = [ ]
    
    if not noheader:
        fields = 'Sequence\tLocus tag\tOld length (aa)\tNew length (aa)\tAmino acid changes\t'
        if use_coverage: fields += 'Unambiguous coverage vs expected\t\tAmbiguous coverage vs expected\t\tAmbiguous percent with any hits\t'
        fields += 'Gene\tProduct'
        if tabular: fields += '\tChanges of note'
        print fields
    
    for record in SeqIO.parse(io.open_possibly_compressed_file(genbank_filename),'genbank'):
        sequence = record.seq.tostring()
    
        for name, seq1, seq2, alignment in alignments:
            if seq1 == sequence: break
        else:
            raise grace.Error('Genbank record %s sequence not identical to any reference sequence' % record.id)
             
        if use_coverage:       
            depth = get_graph(working_dir, name, 'depth')
            ambiguous_depth = get_graph(working_dir, name, 'ambiguous-depth')
            median_depth = numpy.median(depth)
            median_ambiguous_depth = numpy.median(ambiguous_depth)
            ambiguous_factor = float(median_ambiguous_depth) / median_depth
            depth_expect = expected_depth(name, sequence, depth, ambiguous_depth)
            
        
        for feature in record.features:
            if feature.type != 'CDS': continue
            
            if 'locus_tag' not in feature.qualifiers:
                locus_tag = '%d..%d' % (feature.location.nofuzzy_start+1,feature.location.nofuzzy_end)
            else:
                locus_tag = feature.qualifiers['locus_tag'][0]
            
            if 'transl_table' in feature.qualifiers:
                transl_table_no = int(feature.qualifiers['transl_table'][0])
            else:
                assert default_transl_table is not None, 'No /transl_table for CDS, and default transl_table not given'
                transl_table_no = default_transl_table
            
            transl_table = CodonTable.ambiguous_dna_by_id[transl_table_no]
            start_codons = transl_table.start_codons
            
            try:
                feature_alignment = alignment_from_feature(sequence, feature)
            except Weird_alignment:
                warn('%s has a location I could not handle, skipping, sorry' % locus_tag)
                continue
            
            dna = [ ]
            new_dna = [ ]
            shifts = [ ]
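
            # Walk the CDS base by base: back-project each feature coordinate onto
            # the reference, then project through the genome alignment onto the new
            # sequence; any length mismatch between the two slices is an indel shift.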
            for i in xrange(feature_alignment.end2):
                p1 = feature_alignment.back_project(i, left=False)
                p2 = feature_alignment.back_project(i+1, left=True)
                assert abs(p2-p1) < 2
                dna.append( sequence_slice(sequence,p1,p2) )
                
                p1a = alignment.project(p1, left=False)
                p2a = alignment.project(p2, left=False) #Hmm
                
                diff = (p2-p1)-(p2a-p1a)
                #if diff:
                #    if diff%3:
                #        frame_shift = True
                #    else:
                #        frame_preserving_shift = True
                new_dna.append( sequence_slice(seq2,p1a,p2a) )
                
                if diff:
                    shifts.append((i,dna[-1],new_dna[-1]))
                
            dna = ''.join(dna)
            new_dna = ''.join(new_dna)
            
            # A codon_start other than 1 usually indicates a CDS truncated at the start,
            # in which case things will probably fail some way or other down the line.
            if 'codon_start' in feature.qualifiers:
                codon_start = int(feature.qualifiers['codon_start'][0]) - 1
            else:
                codon_start = 0
            dna = dna[codon_start:]
            new_dna = new_dna[codon_start:]
            
            if len(dna) % 3 != 0:
                warn(locus_tag + ' length not a multiple of 3')
            #assert len(new_dna) % 3 == 0
            
            protein = Seq.Seq(dna).translate(table=transl_table_no).tostring()            
            # The start codon is always translated to M (http://en.wikipedia.org/wiki/Start_codon)
            protein = 'M' + protein[1:]
            
            if dna[:3] not in start_codons:
                warn(locus_tag + ' has unknown start codon: ' + dna[:3])
                                    
            original_lacks_stop_codon = not protein.endswith('*')                 
            if original_lacks_stop_codon:
                warn(locus_tag + ' lacks end codon')
            original_stops_before_end = '*' in protein[:-1] 
            if original_stops_before_end:
                warn(locus_tag + ' contains stop codon before end')
                            
            if 'translation' in feature.qualifiers:
                expect = feature.qualifiers['translation'][0]
                if protein[:-1] != expect:
                    warn(locus_tag + ' translation given in feature does not match translation from DNA')                
        
            new_protein = Seq.Seq(new_dna).translate(table=transl_table_no).tostring()            
            new_protein = 'M' + new_protein[1:]
        
            # If end codon changed, find new end                
            # Don't bother if there are unknown amino acids or 
            # the original protein lacks a stop codon
            if 'X' not in new_protein and '*' not in new_protein and not original_lacks_stop_codon:
                #This is very inefficient
                i = feature_alignment.end2
                while True:
                    p1 = feature_alignment.back_project(i, left=False)
                    p2 = feature_alignment.back_project(i+1, left=True)
                    p1a = alignment.project(p1, left=False)
                    p2a = alignment.project(p2, left=False) #Hmm
                    if p1a < 0 or p2a < 0 or p1a > len(seq2) or p2a > len(seq2):
                        break
                        
                    new_dna += sequence_slice(seq2,p1a,p2a)                        
                    new_protein = Seq.Seq(new_dna).translate(table=transl_table_no).tostring()            
                    new_protein = 'M' + new_protein[1:]
                    if 'X' in new_protein or '*' in new_protein: break
                    
                    i += 1
            
            # Is the protein shorter?
            # Don't bother checking if the original protein has extra stop codons
            if '*' in new_protein and not original_stops_before_end:
                new_protein = new_protein[:new_protein.index('*')+1] 
        
            # If indels occurred, do an alignment
            # Don't bother otherwise
            if shifts:
                # Penalize gaps with cost 2 (vs 1 for mismatch)
                # If lengths don't match, pad with spaces (won't match longer seq),
                # aligner prefers mismatch to gaps
                
                #result = pairwise2.align.globalxs(protein      + ' '*max(0,len(new_protein)-len(protein)), 
                #                                  new_protein  + ' '*max(0,len(protein)-len(new_protein)), 
                #                                  -2.001,-2.000)[0]
                # 2.001 : very slightly prefer contiguous gaps. Also much faster!
        
                result = band_limited_align(protein      + ' '*max(0,len(new_protein)-len(protein)), 
                                            new_protein  + ' '*max(0,len(protein)-len(new_protein)), 
                                            bandwidth)
                
                
                protein_ali = result[0]
                new_protein_ali = result[1]
            else:
                protein_ali = protein
                new_protein_ali = new_protein
        
            diffs = [ ]
            j = 0
            k = 0
            for i in xrange(min(len(new_protein_ali),len(protein_ali))):
                if protein_ali[i] != ' ' and new_protein_ali[i] != ' ' and (
                      protein_ali[i] == '-' or 
                      new_protein_ali[i] == '-' or 
                      not bio.might_be_same_amino(protein_ali[i], new_protein_ali[i]) ):
                    diffs.append((i,j,k))
                if protein_ali[i] != '-': 
                    j += 1
                if new_protein_ali[i] != '-': 
                    k += 1
        
            diff_start = not bio.might_be_same_base(new_dna[0],dna[0]) or \
                         not bio.might_be_same_base(new_dna[1],dna[1]) or \
                         not bio.might_be_same_base(new_dna[2],dna[2]) 
        
            interesting_coverage = False
            if use_coverage:
                cds_depth = depth[feature_alignment.start1:feature_alignment.end1] #/ median_depth
                if not feature_alignment.forward1: cds_depth = cds_depth[::-1]
                cds_ambiguous_depth = ambiguous_depth[feature_alignment.start1:feature_alignment.end1] #/ median_ambiguous_depth
                if not feature_alignment.forward1: cds_ambiguous_depth = cds_ambiguous_depth[::-1]
                
                cds_depth_expect = depth_expect[feature_alignment.start1:feature_alignment.end1]
                if not feature_alignment.forward1: cds_depth_expect = cds_depth_expect[::-1]
                
                #cds_average_depth_ratio = numpy.average(depth[feature_alignment.start1:feature_alignment.end1]) / median_depth 
                #cds_average_ambiguous_depth_ratio = numpy.average(ambiguous_depth[feature_alignment.start1:feature_alignment.end1]) / median_ambiguous_depth                        
                #line += '%.1f\t' % cds_average_depth_ratio 
                #line += '%.1f\t' % cds_average_ambiguous_depth_ratio
                
                #line += '%.1f..%.1f\t' % (numpy.minimum.reduce(cds_depth)/median_depth, numpy.maximum.reduce(cds_depth)/median_depth) 
                #line += '%.1f+/-%.1f\t' % (numpy.average(cds_depth)/median_depth, numpy.var(cds_depth)**0.5/median_depth) 
                #line += '%.1f..%.1f\t' % (numpy.minimum.reduce(cds_ambiguous_depth)/median_ambiguous_depth, numpy.maximum.reduce(cds_ambiguous_depth)/median_ambiguous_depth)
                
                avg_expect = numpy.average(cds_depth_expect)
                if avg_expect > 0.0:
                    cds_avg_depth = numpy.average(cds_depth)/avg_expect
                    cds_avg_ambiguous_depth = numpy.average(cds_ambiguous_depth)/avg_expect/ambiguous_factor
                
                strange = (
                    (cds_depth >= cds_depth_expect*1.5) |
                    (cds_ambiguous_depth <= cds_depth_expect*(0.5*ambiguous_factor))
                )
                
                interesting_coverage = numpy.average(strange) >= coverage_cutoff
                     

            if interesting_coverage or diffs or diff_start or shifts or len(new_protein) != len(protein):
                line = name + '\t' + locus_tag + '\t' + \
                      '%d\t' % (len(protein)-1) + \
                      '%d\t' % (len(new_protein)-1) + \
                      '%d\t' % len(diffs)
                

                if use_coverage:
                    if avg_expect <= 0.0:
                        line += '\t\t\t'
                    else:
                        line += '%.1f\t' % (cds_avg_depth) + graphlet(cds_depth, cds_depth_expect)+'\t' 
                        line += '%.1f\t' % (cds_avg_ambiguous_depth) + graphlet(cds_ambiguous_depth, cds_depth_expect*ambiguous_factor)+'\t'
                        line += '%.1f%%\t' % (numpy.average(cds_ambiguous_depth > 0.0)*100.0)
                
                line += '%s\t' % feature.qualifiers.get('gene',[''])[0] + \
                        '%s' % feature.qualifiers.get('product',[''])[0]
                
                notes = [ ]
                
                if use_coverage and 'X' in new_protein:
                    xs = new_protein.count('X')
                    if xs == len(new_protein)-1: #First is M, so len-1
                        notes.append('\ No consensus')
                    else:
                        notes.append('\ No consensus for %d aa' % xs)
                                   
                if len(new_protein) < len(protein):
                    notes.append('\ Shorter by %d aa' % (len(protein)-len(new_protein)))
        
                if len(new_protein) > len(protein):
                    notes.append('\ Longer by %d aa' % (len(new_protein)-len(protein)))
                
                if diff_start:
                    notes.append('\ Start changed: %s -> %s' % (dna[:3], new_dna[:3]))
                    if new_dna[:3] not in start_codons:
                        notes.append('  No longer a start codon!')
                        
                if shifts:
                    notes.append('\ Indels:')
                
                    for pos, old, new in shifts:
                        notes.append('    base %5d / codon %5d   %s -> %s' % (pos+1,(pos//3)+1,old,new or '-'))
                    
                if diffs:
                    if verbose:
                        notes.append('\ Amino acid changes:')
                        for i, j, k in diffs:
                            notes.append('    codon %5d   %s->%s   (%s->%s)' % (
                                j+1, 
                                protein_ali[i], 
                                new_protein_ali[i], 
                                dna[j*3:j*3+3] if protein_ali[i] != '-' else '-', 
                                new_dna[k*3:k*3+3] if new_protein_ali[i] != '-' else '-'
                            ))
                
                #if len(new_protein) > len(protein):
                #    print 'New protein is longer:', new_protein[len(protein):]
                #if len(new_protein) < len(protein):
                #    print 'New protein is shorter:', protein[len(new_protein):]
                #print protein
                #print new_protein
                
                if tabular:
                    print line + '\t' + ' '.join([ ' '.join(note.strip().split()) for note in notes ])
                else:
                    print line
                    for note in notes:
                        print '\t' + note
    return 0
Example 30
    def run(self):
        if self.dirichlet:
            assert self.ploidy == 1, 'Dirichlet mode is not available for ploidy > 1'
        
        reader_f = io.open_possibly_compressed_file(self.vcf)
        reader = vcf.Reader(reader_f)
        
        writer = vcf.Writer(open(self.prefix + '.vcf','wb'), reader)
        
        #print dir(reader)
        #print reader.formats
        #print 
        #print reader.infos
        #print 
        
        n = 0
        n_kept = 0
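
        # A record is written out only if, after per-sample modification, at least
        # one sample has a genotype that is neither blank nor pure reference.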
        
        for record in reader:
            n += 1
            variants = get_variants(record)
            
            any = False
            
            for sample in record.samples:
                self._modify_sample(variants, sample)
                
                any = any or (sample.data.GT != self._blank_gt() and sample.data.GT != self._reference_gt())

                #print call.sample
                #for key in call.data._fields:
                #    print key, getattr(call.data,key), reader.formats[key].desc
                #    
                #counts = [ call.data.RO ]
                #if isinstance(call.data.QA,list):
                #    counts.extend(call.data.QA)
                #else:
                #    counts.append(call.data.QA)
                #print variants, counts
                #
                #
                #if self.min_gq is not None and call.data.GQ < self.min_gq:
                #    call.data = call.data._replace(GT='.')
                #    print call.data
                #else:
                #    any = True
            
            if self.dirichlet:
                record.QUAL = min(MAX_QUALITY, sum(sample.data.GQ for sample in record.samples))
            
            if any:
                writer.write_record(record)
                n_kept += 1
                
        writer.close()
        reader_f.close()
        
        self.log.datum('variants','input', n)
        self.log.datum('variants','kept',  n_kept)
        
        index_vcf(self.prefix+'.vcf')
Example 31
def nway_main(gbk_filename, use_indels, use_reference, give_evidence, give_consequences,
              require_all, require_bisect, full_output, format, working_dirs, split_a, split_b, f=sys.stdout):
    assert working_dirs, 'Need at least one working directory.'
    workspaces = [ working_directory.Working(dirname, must_exist=True) for dirname in working_dirs ]
    reference = workspaces[0].get_reference()
    #if not annotation_filename:
    #    annotation_filename = reference.annotations_filename() #May still be None
    
    if use_reference:
        names = ['reference']
        evidence_start = 1
    else:
        names = [ ]
        evidence_start = 0
        
    names.extend( norm_name(item) for item in  working_dirs )
    
    references = io.read_sequences(reference.reference_fasta_filename())
    
    annotations = { }
    if gbk_filename:
        from Bio import SeqIO
        for record in SeqIO.parse(io.open_possibly_compressed_file(gbk_filename),'genbank'):
            sequence = record.seq.tostring()
            features = [ item for item in record.features if item.type != 'source' ]
            features.sort(key=lambda item: item.location.nofuzzy_start)
            annotations[sequence] = features
    
    iterator = reader(working_dirs, references, use_reference, annotations)
    
    if not use_indels:
        iterator = itertools.ifilter(has_no_indels, iterator)

    if require_all or require_bisect or format == 'counts':
        iterator = itertools.ifilter(fully_unambiguous, iterator)
    
    if require_bisect:
        iterator = itertools.ifilter(is_binary_partition, iterator)

    if not require_bisect:
        if full_output:
            iterator = itertools.ifilter(not_boring_insertion, iterator)
        else:
            iterator = itertools.ifilter(is_interesting, iterator)

    if split_a or split_b:
        assert len(names) == len(set(names)), 'Two samples with the same name'
        try:
            split_a = [ names.index(norm_name(item)) for item in split_a ]
            split_b = [ names.index(norm_name(item)) for item in split_b ]
        except ValueError:
            raise grace.Error('Sample to be split is not amongst samples given')
        iterator = itertools.ifilter(is_split(split_a, split_b), iterator)

    #if limit:
    #    iterator = itertools.islice(iterator, limit)
    
    if format == 'table':
        line = 'Reference\tPosition\tChange type'
        line +=  '\t' + '\t'.join(names)
        if give_evidence:
            line += '\t' + '\t'.join(names[evidence_start:])
        if give_consequences:
            line += '\t' + '\t'.join(names[evidence_start:])
        if annotations:
            line += '\tAnnotations'
        print >> f, line
        for calls in iterator:
            line = '%s\t%d\t%s\t%s' % (
                calls.ref_name, 
                calls.ref_pos+1, 
                change_type(calls), 
                '\t'.join(item.consensus for item in calls.calls))
            if give_evidence:
                line += '\t' + '\t'.join(item.evidence for item in calls.calls[evidence_start:])
            if give_consequences:
                line += '\t' + '\t'.join(item.consequences for item in calls.calls[evidence_start:])
            if annotations:
                line += '\t' + describe_features(calls.features)
            print >> f, line

    elif format == 'compact':
        for line in transpose_strings(names):
            print >> f, line
        print >> f
        
        for calls in iterator:
            if calls.is_insertion:
                footer = '%12d.5 %s' % (calls.ref_pos, calls.ref_name)
            else: 
                footer = '%12d   %s' % (calls.ref_pos+1, calls.ref_name)
            
            t = transpose_strings([ item.consensus for item in calls.calls ], '-', 1)
            top = t[0] + ' ' + footer
            if give_consequences:
                consequences = [ ]
                for call in calls.calls:
                    if call.consequences:
                        for item in call.consequences.split(', '):
                            item = ' '.join(item.split()[:3])
                            if item not in consequences: consequences.append(item)
                        
                if consequences:
                    top += '  ' + ' / '.join(sorted(consequences))
            top += '  ' + describe_features(calls.features)
            print >> f, top
            for line in t[1:]:
                print >> f, line            
    
    elif format == 'nexus':
        buckets = [ [ ] for name in names ]
        for calls in iterator:
            for i, char in enumerate(partition_string(calls)):
                buckets[i].append(char)
        
        print >> f, '#NEXUS'
        print >> f, 'begin taxa;'
        print >> f, 'dimensions ntax=%d;' % len(names)
        print >> f, 'taxlabels'
        for name in names:
            print >> f, name
        print >> f, ';'
        print >> f, 'end;'

        print >> f, 'begin characters;'
        print >> f, 'dimensions nchar=%d;' % len(buckets[0])
        print >> f, 'format datatype=STANDARD symbols="ACGT-0123456789" missing=N;'
        print >> f, 'matrix'
        for name, bucket in itertools.izip(names, buckets):
            print >> f, name, ''.join(bucket)
        print >> f, ';'
        print >> f, 'end;'
    
    elif format == 'counts':
        for line in transpose_strings(names):
            print >> f, line
        print >> f

        counts = { }
        for calls in iterator:
            count_str = partition_string(calls)
            if count_str not in counts:
                counts[count_str] = 1
            else:
                counts[count_str] += 1
        
        for count_str in sorted(counts, key=lambda x: (counts[x], x), reverse=True):
            print >> f, '%s   %d' % (transpose_strings(count_str)[0], counts[count_str])
    
    else:
        raise grace.Error('Unknown output format: ' + format)
Example 32
    def run(self):
        workspace = self.get_workspace()

        header = ["##gff-version 3\n"]
        lengths = {}
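
        # Collect the GFF header lines (re-adding the ##gff-version line ourselves)
        # and note each ##sequence-region length for later bounds checking.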
        with io.open_possibly_compressed_file(self.features) as f:
            f.next()
            for line in f:
                if not line.startswith("#"): break
                if line.startswith("##gff-version"): continue
                header.append(line)
                parts = line.strip().split()
                if parts[0] == "##sequence-region":
                    lengths[parts[1]] = int(parts[3])

        header = "".join(header)

        items = list(annotation.read_gff(self.features, "/"))
        annotation.link_up_annotations(items)
        for item in items:
            assert len(item.parents) < 2
            if "ID" in item.attr:
                item.attr["ID"] = item.attr["ID"].split(":")[1]
            if "Parent" in item.attr:
                item.attr["Parent"] = item.attr["Parent"].split(":")[1]
            if item.parents:
                item.parent = item.parents[0]

        def well_supported(item):
            if self.support is None: return True
            level = item.attr.get("transcript_support_level", "NA").split()[0]
            if not level.isdigit(): return False
            return int(level) <= self.support

        exons = [
            item for item in items
            if item.type == "exon" and well_supported(item.parent)
        ]
        exon_index = span_index.index_annotations(exons)

        utrs = []
        extended_utrs = []
        utr_parts = []
        exons_kept = []
        cds_kept = []
        transcripts_kept = []
        for item in items:
            this_exons = [
                item2 for item2 in item.children if item2.type == "exon"
            ]
            if this_exons and well_supported(item):
                transcripts_kept.append(item)
                exons_kept.extend(this_exons)
                cds_kept.extend(
                    [item2 for item2 in item.children if item2.type == "CDS"])

            if self.gene_level:
                utr_bits = [
                    item3 for item2 in item.children if well_supported(item2)
                    for item3 in item2.children if item3.type == self.what
                ]
            else:
                if not well_supported(item): continue
                utr_bits = [
                    item2 for item2 in item.children if item2.type == self.what
                ]

            if not utr_bits:
                continue

            utr = utr_bits[0].copy()
            for item2 in utr_bits[1:]:
                utr = utr.span_with(item2)

            gene = item if self.gene_level else item.parent

            utr.attr = dict(ID=item.get_id(),
                            Name=item.attr["Name"],
                            gene_id=gene.get_id(),
                            gene=gene.attr["Name"],
                            description=gene.attr.get("description", ""),
                            biotype=item.attr["biotype"])
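
            # Work out how far the UTR can be extended in the 3' direction: start
            # at 10 kb, clamp to the sequence bounds, then trim back to the start of
            # any same-strand exon found within that window.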

            max_extension = 10000
            if item.strand < 0:
                max_extension = min(max_extension, utr.start)
            else:
                max_extension = min(max_extension,
                                    lengths[utr.seqid] - utr.end)
            assert max_extension >= 0, utr

            end = utr.three_prime()
            for hit in exon_index.get(end.shifted(0, max_extension),
                                      same_strand=True):
                #if hit.parent.get_id() == item.get_id():
                #    continue
                rel = hit.relative_to(end).start
                if rel >= 0:
                    max_extension = min(max_extension, rel)

            extended_utr = utr.shifted(0, max_extension)
            extended_utr.start = max(extended_utr.start, 0)
            utr.attr["max_extension"] = str(max_extension)

            utrs.append(utr)
            extended_utrs.append(extended_utr)

            for item2 in utr_bits:
                part = item2.copy()
                part.attr = dict(Parent=item.get_id())
                part.type = "part"
                utr_parts.append(part)

        write_gff3(workspace / "utr.gff", utrs, header)
        write_gff3(workspace / "utr_extended.gff", extended_utrs, header)
        write_gff3(workspace / "utr_part.gff", utr_parts, header)
        write_gff3(workspace / "transcript.gff", transcripts_kept, header)
        write_gff3(workspace / "exon.gff", exons_kept, header)
        write_gff3(workspace / "cds.gff", cds_kept, header)
Example 33
def main(args):
    default_transl_table, args = grace.get_option_value(
        args, '--transl_table', int, 11)
    use_coverage, args = grace.get_flag(args, '--use-coverage')
    coverage_cutoff, args = grace.get_option_value(args, '--coverage-cutoff',
                                                   float, 0.1)
    tabular, args = grace.get_flag(args, '--tabular')
    noheader, args = grace.get_flag(args, '--noheader')
    verbose, args = grace.get_flag(args, '--verbose')
    bandwidth, args = grace.get_option_value(args, '--band', int, 20)
    grace.expect_no_further_options(args)

    if len(args) != 2:
        print USAGE
        return 1

    genbank_filename = args[0]
    alignment_filename = args[1]

    if os.path.isdir(alignment_filename):
        alignment_filename = os.path.join(alignment_filename, 'alignment.maf')

    working_dir = os.path.split(alignment_filename)[0]

    alignments = load_alignments(alignment_filename)

    summaries = []
    details = []

    if not noheader:
        fields = 'Sequence\tLocus tag\tOld length (aa)\tNew length (aa)\tAmino acid changes\t'
        if use_coverage:
            fields += 'Unambiguous coverage vs expected\t\tAmbiguous coverage vs expected\t\tAmbiguous percent with any hits\t'
        fields += 'Gene\tProduct'
        if tabular: fields += '\tChanges of note'
        print fields

    for record in SeqIO.parse(
            io.open_possibly_compressed_file(genbank_filename), 'genbank'):
        sequence = record.seq.tostring()

        for name, seq1, seq2, alignment in alignments:
            if seq1 == sequence: break
        else:
            raise grace.Error(
                'Genbank record %s sequence not identical to any reference sequence'
                % record.id)

        if use_coverage:
            depth = get_graph(working_dir, name, 'depth')
            ambiguous_depth = get_graph(working_dir, name, 'ambiguous-depth')
            median_depth = numpy.median(depth)
            median_ambiguous_depth = numpy.median(ambiguous_depth)
            ambiguous_factor = float(median_ambiguous_depth) / median_depth
            depth_expect = expected_depth(name, sequence, depth,
                                          ambiguous_depth)

        for feature in record.features:
            if feature.type != 'CDS': continue

            if 'locus_tag' not in feature.qualifiers:
                locus_tag = '%d..%d' % (feature.location.nofuzzy_start + 1,
                                        feature.location.nofuzzy_end)
            else:
                locus_tag = feature.qualifiers['locus_tag'][0]

            if 'transl_table' in feature.qualifiers:
                transl_table_no = int(feature.qualifiers['transl_table'][0])
            else:
                assert default_transl_table is not None, 'No /transl_table for CDS, and default transl_table not given'
                transl_table_no = default_transl_table

            transl_table = CodonTable.ambiguous_dna_by_id[transl_table_no]
            start_codons = transl_table.start_codons

            try:
                feature_alignment = alignment_from_feature(sequence, feature)
            except Weird_alignment:
                warn('%s has a location I could not handle, skipping, sorry' %
                     locus_tag)
                continue

            dna = []
            new_dna = []
            shifts = []
            for i in xrange(feature_alignment.end2):
                p1 = feature_alignment.back_project(i, left=False)
                p2 = feature_alignment.back_project(i + 1, left=True)
                assert abs(p2 - p1) < 2
                dna.append(sequence_slice(sequence, p1, p2))

                p1a = alignment.project(p1, left=False)
                p2a = alignment.project(p2, left=False)  #Hmm

                diff = (p2 - p1) - (p2a - p1a)
                #if diff:
                #    if diff%3:
                #        frame_shift = True
                #    else:
                #        frame_preserving_shift = True
                new_dna.append(sequence_slice(seq2, p1a, p2a))

                if diff:
                    shifts.append((i, dna[-1], new_dna[-1]))

            dna = ''.join(dna)
            new_dna = ''.join(new_dna)

            # A codon_start other than 1 usually indicates a CDS truncated at the start,
            # in which case things will probably fail some way or other down the line.
            if 'codon_start' in feature.qualifiers:
                codon_start = int(feature.qualifiers['codon_start'][0]) - 1
            else:
                codon_start = 0
            dna = dna[codon_start:]
            new_dna = new_dna[codon_start:]

            if len(dna) % 3 != 0:
                warn(locus_tag + ' length not a multiple of 3')
            #assert len(new_dna) % 3 == 0

            protein = Seq.Seq(dna).translate(table=transl_table_no).tostring()
            # The start codon is always translated to M (http://en.wikipedia.org/wiki/Start_codon)
            protein = 'M' + protein[1:]

            if dna[:3] not in start_codons:
                warn(locus_tag + ' has unknown start codon: ' + dna[:3])

            original_lacks_stop_codon = not protein.endswith('*')
            if original_lacks_stop_codon:
                warn(locus_tag + ' lacks end codon')
            original_stops_before_end = '*' in protein[:-1]
            if original_stops_before_end:
                warn(locus_tag + ' contains stop codon before end')

            if 'translation' in feature.qualifiers:
                expect = feature.qualifiers['translation'][0]
                if protein[:-1] != expect:
                    warn(
                        locus_tag +
                        ' translation given in feature does not match translation from DNA'
                    )

            new_protein = Seq.Seq(new_dna).translate(
                table=transl_table_no).tostring()
            new_protein = 'M' + new_protein[1:]

            # If end codon changed, find new end
            # Don't bother if there are unknown amino acids or
            # the original protein lacks a stop codon
            if 'X' not in new_protein and '*' not in new_protein and not original_lacks_stop_codon:
                #This is very inefficient
                i = feature_alignment.end2
                while True:
                    p1 = feature_alignment.back_project(i, left=False)
                    p2 = feature_alignment.back_project(i + 1, left=True)
                    p1a = alignment.project(p1, left=False)
                    p2a = alignment.project(p2, left=False)  #Hmm
                    if p1a < 0 or p2a < 0 or p1a > len(seq2) or p2a > len(
                            seq2):
                        break

                    new_dna += sequence_slice(seq2, p1a, p2a)
                    new_protein = Seq.Seq(new_dna).translate(
                        table=transl_table_no).tostring()
                    new_protein = 'M' + new_protein[1:]
                    if 'X' in new_protein or '*' in new_protein: break

                    i += 1

            # Is the protein shorter?
            # Don't bother checking if the original protein has extra stop codons
            if '*' in new_protein and not original_stops_before_end:
                new_protein = new_protein[:new_protein.index('*') + 1]

            # If indels occurred, do an alignment
            # Don't bother otherwise
            if shifts:
                # Penalize gaps with cost 2 (vs 1 for mismatch)
                # If lengths don't match, pad with spaces (won't match longer seq),
                # aligner prefers mismatch to gaps

                #result = pairwise2.align.globalxs(protein      + ' '*max(0,len(new_protein)-len(protein)),
                #                                  new_protein  + ' '*max(0,len(protein)-len(new_protein)),
                #                                  -2.001,-2.000)[0]
                # 2.001 : very slightly prefer contiguous gaps. Also much faster!

                result = band_limited_align(
                    protein + ' ' * max(0,
                                        len(new_protein) - len(protein)),
                    new_protein + ' ' * max(0,
                                            len(protein) - len(new_protein)),
                    bandwidth)

                protein_ali = result[0]
                new_protein_ali = result[1]
            else:
                protein_ali = protein
                new_protein_ali = new_protein

            diffs = []
            j = 0
            k = 0
            for i in xrange(min(len(new_protein_ali), len(protein_ali))):
                if (protein_ali[i] != ' ' and new_protein_ali[i] != ' ' and
                        (protein_ali[i] == '-' or new_protein_ali[i] == '-' or
                         not bio.might_be_same_amino(protein_ali[i], new_protein_ali[i]))):
                    diffs.append((i, j, k))
                if protein_ali[i] != '-':
                    j += 1
                if new_protein_ali[i] != '-':
                    k += 1

            diff_start = not bio.might_be_same_base(new_dna[0],dna[0]) or \
                         not bio.might_be_same_base(new_dna[1],dna[1]) or \
                         not bio.might_be_same_base(new_dna[2],dna[2])

            interesting_coverage = False
            if use_coverage:
                cds_depth = depth[feature_alignment.start1:feature_alignment.end1]  #/ median_depth
                if not feature_alignment.forward1:
                    cds_depth = cds_depth[::-1]
                cds_ambiguous_depth = ambiguous_depth[feature_alignment.start1:feature_alignment.end1]  #/ median_ambiguous_depth
                if not feature_alignment.forward1:
                    cds_ambiguous_depth = cds_ambiguous_depth[::-1]

                cds_depth_expect = depth_expect[feature_alignment.start1:feature_alignment.end1]
                if not feature_alignment.forward1:
                    cds_depth_expect = cds_depth_expect[::-1]

                #cds_average_depth_ratio = numpy.average(depth[feature_alignment.start1:feature_alignment.end1]) / median_depth
                #cds_average_ambiguous_depth_ratio = numpy.average(ambiguous_depth[feature_alignment.start1:feature_alignment.end1]) / median_ambiguous_depth
                #line += '%.1f\t' % cds_average_depth_ratio
                #line += '%.1f\t' % cds_average_ambiguous_depth_ratio

                #line += '%.1f..%.1f\t' % (numpy.minimum.reduce(cds_depth)/median_depth, numpy.maximum.reduce(cds_depth)/median_depth)
                #line += '%.1f+/-%.1f\t' % (numpy.average(cds_depth)/median_depth, numpy.var(cds_depth)**0.5/median_depth)
                #line += '%.1f..%.1f\t' % (numpy.minimum.reduce(cds_ambiguous_depth)/median_ambiguous_depth, numpy.maximum.reduce(cds_ambiguous_depth)/median_ambiguous_depth)

                avg_expect = numpy.average(cds_depth_expect)
                if avg_expect > 0.0:
                    cds_avg_depth = numpy.average(cds_depth) / avg_expect
                    cds_avg_ambiguous_depth = numpy.average(cds_ambiguous_depth) / avg_expect / ambiguous_factor

                strange = (
                    (cds_depth >= cds_depth_expect * 1.5) |
                    (cds_ambiguous_depth <= cds_depth_expect * (0.5 * ambiguous_factor))
                )

                interesting_coverage = numpy.average(strange) >= coverage_cutoff

            if interesting_coverage or diffs or diff_start or shifts or len(new_protein) != len(protein):
                line = name + '\t' + locus_tag + '\t' + \
                       '%d\t' % (len(protein)-1) + \
                       '%d\t' % (len(new_protein)-1) + \
                       '%d\t' % len(diffs)

                if use_coverage:
                    if avg_expect <= 0.0:
                        line += '\t\t\t'
                    else:
                        line += '%.1f\t' % cds_avg_depth + graphlet(cds_depth, cds_depth_expect) + '\t'
                        line += '%.1f\t' % cds_avg_ambiguous_depth + graphlet(cds_ambiguous_depth, cds_depth_expect * ambiguous_factor) + '\t'
                        line += '%.1f%%\t' % (numpy.average(cds_ambiguous_depth > 0.0) * 100.0)

                line += '%s\t' % feature.qualifiers.get('gene',[''])[0] + \
                        '%s' % feature.qualifiers.get('product',[''])[0]

                notes = []

                if use_coverage and 'X' in new_protein:
                    xs = new_protein.count('X')
                    if xs == len(new_protein) - 1:  #First is M, so len-1
                        notes.append('\ No consensus')
                    else:
                        notes.append('\ No consensus for %d aa' % xs)

                if len(new_protein) < len(protein):
                    notes.append('\ Shorter by %d aa' %
                                 (len(protein) - len(new_protein)))

                if len(new_protein) > len(protein):
                    notes.append('\ Longer by %d aa' %
                                 (len(new_protein) - len(protein)))

                if diff_start:
                    notes.append('\ Start changed: %s -> %s' %
                                 (dna[:3], new_dna[:3]))
                    if new_dna[:3] not in start_codons:
                        notes.append('  No longer a start codon!')

                if shifts:
                    notes.append('\ Indels:')

                    for pos, old, new in shifts:
                        notes.append('    base %5d / codon %5d   %s -> %s' %
                                     (pos + 1, (pos // 3) + 1, old, new or '-'))

                if diffs:
                    if verbose:
                        notes.append('\ Amino acid changes:')
                        for i, j, k in diffs:
                            old_codon = dna[j * 3:j * 3 + 3] if protein_ali[i] != '-' else '-'
                            new_codon = new_dna[k * 3:k * 3 + 3] if new_protein_ali[i] != '-' else '-'
                            notes.append('    codon %5d   %s->%s   (%s->%s)' %
                                         (j + 1, protein_ali[i], new_protein_ali[i], old_codon, new_codon))

                #if len(new_protein) > len(protein):
                #    print 'New protein is longer:', new_protein[len(protein):]
                #if len(new_protein) < len(protein):
                #    print 'New protein is shorter:', protein[len(new_protein):]
                #print protein
                #print new_protein

                if tabular:
                    print line + '\t' + ' '.join([' '.join(note.strip().split()) for note in notes])
                else:
                    print line
                    for note in notes:
                        print '\t' + note
    return 0
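
A quick aside on the padding trick above: because the two proteins may differ in length, the shorter one is padded with spaces (which match nothing), so the aligner prefers a trailing mismatch over opening a gap. The commented-out code shows the same idea with Biopython's pairwise2; below is a minimal, self-contained sketch of that variant using made-up toy proteins (band_limited_align itself is the project's own faster routine and is not reproduced here):

# Sketch only: Bio.pairwise2 standing in for band_limited_align.
from Bio import pairwise2

protein     = 'MKTAYIAK*'        # hypothetical original translation
new_protein = 'MKTAYIARPK*'      # hypothetical translation after an indel

# Pad the shorter sequence with spaces; gap open -2.001, gap extend -2.000,
# so contiguous gaps are very slightly preferred.
a = protein     + ' ' * max(0, len(new_protein) - len(protein))
b = new_protein + ' ' * max(0, len(protein) - len(new_protein))
result = pairwise2.align.globalxs(a, b, -2.001, -2.000)[0]
protein_ali, new_protein_ali = result[0], result[1]

print protein_ali
print new_protein_ali
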
Esempio n. 34
0
    def run(self):
        references = { }
        for filename in self.reference_filenames:
            for name, seq in io.read_sequences(filename):
                references[name] = seq
        
        tail_lengths = { }
        adaptor_bases = { }
        for filename in self.clips:
            with io.open_possibly_compressed_file(filename) as f:
                for line in f:
                    if line.startswith('#'): continue
                    parts = line.rstrip('\n').split('\t')
                    name = parts[0].split()[0]
                    tail_lengths[name] = int(parts[3])-int(parts[2])
                    adaptor_bases[name] = int(parts[6])
        
        in_file = self.begin_input()
        out_file = self.begin_output()
        
        for line in in_file:
            line = line.rstrip()
            if line.startswith('@'):
                print >> out_file, line
                continue
            
            al = Alignment(line)
            
            if al.flag & FLAG_UNMAPPED:
                continue

            ref = references[al.rname]

            reverse = al.flag & FLAG_REVERSE
            if reverse:
                read_bases = rev_comp(al.seq)
                read_qual = al.qual[::-1]
                cigar = cigar_decode(al.cigar)[::-1]
            else:
                read_bases = al.seq
                read_qual = al.qual
                cigar = cigar_decode(al.cigar)
            
            n_tail = tail_lengths[al.qname]
            
            if reverse:
                if al.pos-1-n_tail < 0: continue #TODO: handle tail extending beyond end of reference
                bases_ref = rev_comp(ref[al.pos-1-n_tail:al.pos-1])    
            else:
                if al.pos-1+al.length+n_tail > len(ref): continue #TODO: handle tail extending beyond end of reference
                bases_ref = ref[al.pos-1+al.length:al.pos-1+al.length+n_tail]
            
            extension = 0
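            # Count how many genomic bases immediately past the alignment are 'A';
            # these are absorbed back into the alignment below rather than being
            # counted as non-templated poly(A) tail.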
            while extension < n_tail and bases_ref[extension] == 'A':
                extension += 1
            
            if n_tail-extension > 0:
                al.extra.append('AN:i:%d' % (n_tail-extension))
            if adaptor_bases[al.qname]:
                al.extra.append('AD:i:%d' % adaptor_bases[al.qname])
            if n_tail-extension >= self.tail:
                #if reverse:
                #    tail_refpos = al.pos-extension
                #else:
                #    tail_refpos = al.pos+al.length+extension-1 
                #al.extra.append('AA:i:%d'%tail_refpos)
                al.extra.append('AA:i:1')
            
            cigar += 'M' * extension
            read_bases += 'A' * extension
            read_qual += chr(33+20) * extension #Arbitrarily give quality 20
            al.length += extension
            if reverse:
                al.pos -= extension
                al.seq = rev_comp(read_bases)
                al.qual = read_qual[::-1]
                al.cigar = cigar_encode(cigar[::-1])
            else: 
                al.seq = read_bases
                al.qual = read_qual
                al.cigar = cigar_encode(cigar)

            print >> out_file, al
    
        self.end_output(out_file)
        self.end_input(in_file)
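
A small note on the quality characters appended above: the extension bases are written with a fixed Phred quality of 20, encoded in the usual Phred+33 scheme (quality q becomes the character chr(33 + q)). A tiny illustration, plain Python with made-up values:

# Phred+33: quality 20 is stored as chr(33 + 20), i.e. '5'.
q = 20
print chr(33 + q)        # '5', the character appended for each added 'A'
print ord('5') - 33      # 20, decoding it back
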
Esempio n. 35
0
    def run(self):
        work = self.get_workspace()
        
        data = [ ]
        names = [ ]
        sample_tags = [ ]
        
        for item in self.pickles:
            f = io.open_possibly_compressed_file(item)
            name, tags, datum = pickle.load(f)
            f.close()
            data.append(datum)
            names.append(name)
            sample_tags.append(tags)
        
        annotations = data[0]
        
        all_lengths = [ 
            #tail_length
            item[2]
            for sample in data
            for feature in sample
            #for rel_start,rel_end,tail_length in feature.hits
            for item in feature.hits
            ]
        if all_lengths: 
            max_length = max(all_lengths)+1
        else:
            max_length = 1
        del all_lengths
        
        for i, sample in enumerate(data):
            n_alignments = 0
            n_duplicates = 0
            n_good = 0
            for feature in sample:
                feature.tail_counts = [ 0.0 ] * max_length
                
                buckets = collections.defaultdict(list)
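                # Group hits by (start, end) position; buckets with more than
                # self.saturation alignments are treated as likely duplicates and
                # down-weighted so each bucket contributes self.saturation in total.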
                for item in feature.hits:
                    rel_start,rel_end,tail_length = item[:3]
                    buckets[ (rel_start,rel_end) ].append(tail_length)
                for item in buckets.values():
                    n_alignments += len(item)
                    n_good += 1
                    if self.saturation < 1 or len(item) <= self.saturation:
                        weight = 1.0
                    else:
                        weight = float(self.saturation) / len(item)
                        n_duplicates += len(item)
                    for item2 in item:
                        feature.tail_counts[item2] += weight                

            self.log.datum(names[i], 'Alignments to features', n_alignments)
            if self.saturation >= 1:
                self.log.datum(names[i], 'Proportion of alignments with duplicate start and end position', float(n_duplicates)/max(1,n_alignments))
                self.log.datum(names[i], 'Alignments to features after deduplication', n_good)
                
        
        counts = [ ]  # [feature][sample][taillength]
        
        for item in data: 
            assert len(item) == len(data[0])
        for row in itertools.izip(*data):
            this_counts = [ item.tail_counts for item in row ]
            counts.append(this_counts)
        
        sample_n = [ ]        # [feature][sample]  Total count
        sample_n_tail = [ ]   # [feature][sample]  Polya count
        sample_prop = [ ]     # [feature][sample]  Proportion of reads with tail
        sample_tail = [ ]     # [feature][sample]  Mean tail length in each sample
        sample_total_tail = [ ]
        overall_n = [ ]
        overall_prop = [ ]    # [feature]          Overall proportion with tail
        overall_tail = [ ]    # [feature]          Overall mean tail length
        overall_n_tail = [ ]  # [feature]          Overall polya count
        overall_total_tail = [ ]
        for row in counts:
            this_n = [ ]
            this_n_tail = [ ]
            this_prop = [ ]
            this_tail = [ ]
            this_total_tail = [ ]
            for item in row:
                this_this_n = sum(item)
                this_n.append( this_this_n )

                this_this_n_tail = sum(item[self.tail:])
                this_n_tail.append( this_this_n_tail )

                this_this_total_tail = sum( item[i]*i for i in xrange(self.tail,max_length) )
                this_total_tail.append( this_this_total_tail )

                if this_this_n < 1:
                    this_prop.append(None)
                else:
                    this_prop.append(float(this_this_n_tail)/this_this_n)
                if this_this_n_tail < 1:
                    this_tail.append(None)
                else:
                    this_tail.append(this_this_total_tail/this_this_n_tail)

            sample_n.append(this_n)
            sample_n_tail.append(this_n_tail)
            sample_prop.append(this_prop)
            sample_tail.append(this_tail)
            sample_total_tail.append(this_total_tail)
            overall_n.append(sum(this_n))
            overall_n_tail.append(sum(this_n_tail))
            overall_total_tail.append(sum(this_total_tail))
            if sum(this_n) < 1:
                overall_prop.append(None)
            else:
                overall_prop.append(float(sum(this_n_tail))/sum(this_n))
            if sum(this_n_tail) < 1:
                overall_tail.append(None)
            else:
                overall_tail.append(float(sum(this_total_tail))/sum(this_n_tail))
             
        for i, name in enumerate(names):
            this_total = sum( item[i] for item in sample_total_tail )
            this_n = sum( item[i] for item in sample_n_tail )
            if this_n:
                self.log.datum(name, 'Average poly-A tail', float(this_total)/this_n)
                
        for i, name in enumerate(names):
            this_total = sum( item[i] for item in sample_n_tail )
            this_n = sum( item[i] for item in sample_n )
            if this_n:
                self.log.datum(name, 'Average proportion of reads with tail', float(this_total)/this_n)
            
        
        #max_length = max(max(len(item) for item in row) for row in counts)
        #
        #for row in counts:
        #    for item in row:
        #        while len(item) < max_length:
        #            item.append(0)
                
        
        with open(work/'features-with-data.gff','wb') as f:
            annotation.write_gff3_header(f)
            for i, item in enumerate(annotations):
                item.attr['reads'] = str(overall_n[i])
                item.attr['reads_with_tail'] = str(overall_n_tail[i])
                item.attr['mean_tail'] = '%.1f'%overall_tail[i] if overall_tail[i] else 'NA'
                item.attr['proportion_with_tail'] = '%.2f'%overall_prop[i] if overall_prop[i] else 'NA'
                
                if overall_tail[i] is None:
                    item.attr['color'] = '#444444'
                else:
                    a = (overall_tail[i]-self.tail)/max(1,max_length-self.tail)
                    item.attr['color'] = '#%02x%02x%02x' % (int(a*255),int((1-abs(a*2-1))*255),255-int(a*255))
                #item.attr['color'] = ...                
                print >> f, item.as_gff()
        
        
        comments = [ '#Counts' ] + [
            '#sampleTags='+','.join(tags)
            for tags in sample_tags
            ] + [
            '"Tail_count" group is number of reads with tail',
            '"Tail" group is mean tail per sample',
            '"Proportion" group is proportion of reads with tail',
            ]

        def counts_iter():
            for i in xrange(len(counts)):
                row = collections.OrderedDict()
                row['Feature'] = annotations[i].get_id()
                for j in xrange(len(names)):
                    row[('Count',names[j])] = '%d' % sample_n[i][j]

                row[('Annotation','Length')] = annotations[i].end - annotations[i].start
                row[('Annotation','gene')] = annotations[i].attr.get('Name','')
                row[('Annotation','product')] = annotations[i].attr.get('Product','')
                #row[('Annotation','Strand')] = str(annotations[i].strand)
                row[('Annotation','reads')] = str(overall_n[i])
                row[('Annotation','reads-with-tail')] = str(overall_n_tail[i])
                row[('Annotation','mean-tail')] = str(overall_tail[i]) if overall_tail[i] is not None else 'NA'
                row[('Annotation','proportion-with-tail')] = str(overall_prop[i]) if overall_prop[i] is not None else 'NA'
                for j in xrange(len(names)):
                    row[('Tail_count',names[j])] = '%d' % sample_n_tail[i][j]
                for j in xrange(len(names)):
                    row[('Tail',names[j])] = str(sample_tail[i][j]) if sample_tail[i][j] is not None else 'NA'
                for j in xrange(len(names)):
                    row[('Proportion',names[j])] = str(sample_prop[i][j]) if sample_prop[i][j] is not None else 'NA'
                yield row
        io.write_csv(work/'counts.csv', counts_iter(), comments=comments)

        def raw_columns():
            for i in xrange(len(names)):
                row = collections.OrderedDict()
                row['Sample'] = names[i]
                for j in xrange(max_length):
                    row['length-%d' % j] = str(i*max_length+j+1)  # column index for R, so 1-based
                yield row
        io.write_csv(work/'raw-columns.csv', raw_columns())

        #Somewhat inefficient        
        def raw():
            for i in xrange(len(counts)):
                row = collections.OrderedDict()
                row['Feature'] = annotations[i].get_id()
                for j in xrange(len(names)):
                    for k in xrange(max_length):
                        row['%d %s' % (k,names[j])] = str( counts[i][j][k] )
                yield row
        io.write_csv(work/'raw.csv', raw())
        
        def pooled():
            for i in xrange(len(counts)):
                row = collections.OrderedDict()
                row['Feature'] = annotations[i].get_id()
                for j in xrange(max_length):
                    row[str(j)] = str( sum( counts[i][k][j] for k in xrange(len(names)) ) )
                yield row
        io.write_csv(work/'pooled.csv', pooled())
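
For readers following the arithmetic, each sample's tail-length histogram is reduced above to a total count, a with-tail count, a proportion with tail and a mean tail length. A stand-alone sketch of that reduction, using a made-up histogram and threshold (not part of the tool itself):

# Sketch: summarise a tail-length histogram the same way as the loop above.
def summarise_tail_counts(tail_counts, tail):
    n = sum(tail_counts)                       # total reads
    n_tail = sum(tail_counts[tail:])           # reads with a tail of at least 'tail'
    total_tail = sum(tail_counts[i] * i for i in xrange(tail, len(tail_counts)))
    prop = float(n_tail) / n if n >= 1 else None
    mean_tail = float(total_tail) / n_tail if n_tail >= 1 else None
    return n, n_tail, prop, mean_tail

# Hypothetical histogram: 5 reads with no tail, 3 with length 10, 2 with length 20.
counts = [5.0] + [0.0] * 9 + [3.0] + [0.0] * 9 + [2.0]
print summarise_tail_counts(counts, tail=4)    # (10.0, 5.0, 0.5, 14.0)
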
Esempio n. 36
0
    def run(self):
        if self.dirichlet:
            assert self.ploidy == 1, 'Dirichlet mode is not available for ploidy > 1'

        reader_f = io.open_possibly_compressed_file(self.vcf)
        reader = vcf.Reader(reader_f)

        writer = vcf.Writer(open(self.prefix + '.vcf', 'wb'), reader)
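        # The reader is passed as the template, so the output inherits the input's header.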

        #print dir(reader)
        #print reader.formats
        #print
        #print reader.infos
        #print

        n = 0
        n_kept = 0

        for record in reader:
            n += 1
            variants = get_variants(record)

            any = False

            for sample in record.samples:
                self._modify_sample(variants, sample)

                any = any or (sample.data.GT != self._blank_gt()
                              and sample.data.GT != self._reference_gt())

                #print call.sample
                #for key in call.data._fields:
                #    print key, getattr(call.data,key), reader.formats[key].desc
                #
                #counts = [ call.data.RO ]
                #if isinstance(call.data.QA,list):
                #    counts.extend(call.data.QA)
                #else:
                #    counts.append(call.data.QA)
                #print variants, counts
                #
                #
                #if self.min_gq is not None and call.data.GQ < self.min_gq:
                #    call.data = call.data._replace(GT='.')
                #    print call.data
                #else:
                #    any = True

            if self.dirichlet:
                record.QUAL = min(
                    MAX_QUALITY,
                    sum(sample.data.GQ for sample in record.samples))

            if any:
                writer.write_record(record)
                n_kept += 1

        writer.close()
        reader_f.close()

        self.log.datum('variants', 'input', n)
        self.log.datum('variants', 'kept', n_kept)

        index_vcf(self.prefix + '.vcf')
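
The reader/writer plumbing above is standard PyVCF usage: the input file handle is wrapped in vcf.Reader, and the same reader object is handed to vcf.Writer as the header template. A stripped-down sketch of that read-filter-write pattern, with placeholder file names and a simpler keep condition than the tool's own:

# Sketch of the PyVCF read/filter/write pattern used above (placeholder paths).
import vcf

reader = vcf.Reader(open('input.vcf', 'rb'))
writer = vcf.Writer(open('filtered.vcf', 'wb'), reader)

kept = 0
for record in reader:
    # Keep records where at least one sample has a called genotype.
    if any(sample.data.GT not in (None, '.') for sample in record.samples):
        writer.write_record(record)
        kept += 1

writer.close()
print 'kept', kept, 'records'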