コード例 #1
0
    def __init__(self, fhand, reference_name, grouping=None):
        '''It inits the class

        fhand -- file-like object with a .name attribute. The file found at
                 that path is emptied and reopened in append mode.
        reference_name -- it is handed to _get_pre_header to build the header.
        grouping -- key used to group the genotypes, 'read_groups' by default.
        '''
        # The fhand is as it arrives, so the file is emptied using its name.
        # The truncating handle is closed at once instead of being leaked.
        open(fhand.name, 'w').close()
        self.fhand = open(fhand.name, 'a')
        self._namer = SnvNamer()

        # the snv lines are kept here
        self._temp_fhand = NamedTemporaryFile(mode='a')
        self._filter_descriptions = {}
        self._header = []
        if grouping is None:
            grouping = 'read_groups'
        self._genotype_grouping_key = grouping
        self._genotype_groups = OrderedDict()
        self._get_pre_header(reference_name)
        self.num_features = 0
コード例 #2
0
class VariantCallFormatWriter(object):
    '''It writes variant call format files for the snvs.

    The snv lines are stored in a temporary file while they are written,
    because the genotype group columns and the FILTER header lines are
    collected as the snvs arrive. close() writes the header followed by the
    stored snv lines into the output file.
    '''
    def __init__(self, fhand, reference_name, grouping=None):
        '''It inits the class

        fhand -- file-like object with a .name attribute. The file found at
                 that path is emptied and reopened in append mode.
        reference_name -- text written in the ##reference header line.
        grouping -- key used to group the genotypes, 'read_groups' by default.
        '''
        # The fhand is as it arrives, so the file is emptied using its name.
        # The truncating handle is closed at once instead of being leaked.
        open(fhand.name, 'w').close()
        self.fhand = open(fhand.name, 'a')
        self._namer = SnvNamer()

        # the snv lines are kept here until close() is called
        self._temp_fhand = NamedTemporaryFile(mode='a')
        # (filter name, parameters) -> (short name, description)
        self._filter_descriptions = {}
        self._header = []
        if grouping is None:
            grouping = 'read_groups'
        self._genotype_grouping_key = grouping
        # the groups in the order in which they have been found
        self._genotype_groups = OrderedDict()
        self._get_pre_header(reference_name)
        self.num_features = 0

    def _get_pre_header(self, reference_name):
        '''It adds to the header the lines that do not depend on the snvs

        reference_name -- text written in the ##reference line.
        '''
        header = self._header
        header.append('##fileformat=VCFv4.1')
        header.append('##fileDate=%s' %
                                      datetime.date.today().strftime('%Y%m%d'))
        header.append('##source=franklin')
        header.append('##reference=%s' % reference_name)
        header.append('##INFO=<ID=NS,Number=1,Type=Integer,Description="Number of Samples With Data">')
        header.append('##INFO=<ID=AF,Number=A,Type=Float,Description="Allele Frequency">')
        header.append('##INFO=<ID=RC,Number=A,Type=Integer,Description="Read count of the alt alleles">')
        header.append('##INFO=<ID=MQ,Number=1,Type=Float,Description="RMS Mapping Quality">')
        header.append('##INFO=<ID=BQ,Number=1,Type=Float,Description="RMS Base Quality">')
        header.append('##INFO=<ID=GC,Number=.,Type=String,Description="Genotype Counts: Num. genotypes in which every alleles has been detected">')
        header.append('##INFO=<ID=GP,Number=1,Type=String,Description="Genotype polimorphism">')
        header.append('##INFO=<ID=EZ,Number=1,Type=String,Description="CAP enzymes">')
        header.append('##FORMAT=<ID=GT,Number=1,Type=String,Description="Read group Genotype">')
        header.append('##FORMAT=<ID=EC,Number=.,Type=Integer,Description="Allele count for the ref and alt alleles in the order listed">')

    def close(self):
        '''It merges the header and the snv data

        The FILTER lines and the column names line are appended to the header,
        the header is written and the stored snv lines are copied after it.
        The output file is flushed, but it is not closed.
        '''
        # Append the data spec to the header
        fhand = self.fhand
        self._add_filters_to_header()
        line_items = ['#CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER',
                      'INFO', 'FORMAT']
        line_items.extend([group.upper() for group in self._genotype_groups])
        num_items_per_line = len(line_items)
        self._header.append('%s\n' % '\t'.join(line_items))
        fhand.write('\n'.join(self._header))
        # the reading handle is closed as soon as the lines are copied
        with open(self._temp_fhand.name) as snv_lines:
            for line in snv_lines:
                # fix the missing genotype groups: an snv written before a
                # group was first found has fewer columns than the header
                items = line.strip().split()
                items.extend(['.:.'] * (num_items_per_line - len(items)))
                fhand.write('\t'.join(items) + '\n')
        fhand.flush()

    def _add_filters_to_header(self):
        'It adds a FILTER line to the header for every filter tag used'
        for name, desc in self._filter_descriptions.values():
            filter_desc = '##FILTER=<ID=%s,Description="%s">' % (name, desc)
            self._header.append(filter_desc)

    def write(self, sequence):
        '''It writes the snvs present in the given sequence as SeqFeatures

        Every feature of kind snv is written and counted in num_features.
        '''
        for snv in sequence.get_features(kind='snv'):
            self.num_features += 1
            self._write_snv(sequence, snv)

    @staticmethod
    def _create_alternative_alleles(alleles):
        '''It returns the ALT part on the vcf

        alleles -- iterable of alleles, each one an (allele, kind) pair.

        It returns a tuple with the ALT text and the list with the alleles
        that are not INVARIANT. The text has the alleles, without the '-',
        joined by commas or a '.' when there is no alternative allele.
        '''
        str_alleles = []
        alternative_alleles = []
        for allele in alleles:
            kind = allele[1]
            if kind == INVARIANT:
                continue

            str_alleles.append(allele[0].replace('-', ''))
            alternative_alleles.append(allele)
        if str_alleles:
            str_alleles = ','.join(str_alleles)
        else:
            str_alleles = '.'
        return str_alleles, alternative_alleles

    def _create_filters(self, qualifiers):
        '''It returns the FILTER part on the vcf

        It returns '.' when the snv has no 'filters' qualifier, 'PASS' when
        no filter has a true result and otherwise the upper case short names
        of the filters with a true result joined by ';'.
        The filters used are remembered to be added to the header.
        '''
        filter_strs = []
        if 'filters' not in qualifiers:
            return '.'
        for name, filters_data in qualifiers['filters'].items():
            for parameters, result in filters_data.items():
                if not result:
                    continue
                short_name, description = self._namer.get_filter_description(
                                                      name, parameters,
                                                      self._filter_descriptions)
                short_name = short_name.upper()
                filter_strs.append(short_name)
                self._filter_descriptions[name, parameters] = (short_name,
                                                               description)
        if not filter_strs:
            return 'PASS'
        else:
            return ';'.join(filter_strs)

    def _create_info(self, qualifiers, alternative_alleles):
        '''It creates the INFO bit on the vcf

        qualifiers -- the snv qualifiers. The keys 'alleles',
                      'reference_allele', 'mapping_quality', 'quality' and
                      'read_groups' are read, 'cap_enzymes' is optional.
        alternative_alleles -- the alleles listed in ALT, in the same order.

        It returns the INFO items joined by ';'.
        '''
        toprint_items = []

        alleles = qualifiers['alleles']

        #RC read count for each ALT allele, in the same order as listed
        acounts = [_allele_count(allele, alleles, group_kind='read_groups')
                   for allele in alternative_alleles]
        if acounts:
            toprint_items.append('RC=%s' % ','.join([str(acount)
                                                     for acount in acounts]))

        #AF allele frequency for each ALT allele in the same order as listed:
        reference_allele = qualifiers['reference_allele'], INVARIANT
        if reference_allele in alleles:
            ref_count = _allele_count(reference_allele, alleles,
                                      group_kind='read_groups')
        else:
            ref_count = 0
        total_count = float(sum(acounts) + ref_count)
        afreqs = [acount / total_count for acount in acounts]
        if afreqs:
            toprint_items.append('AF=%s' % ','.join(['%.1f' % afreq
                                                     for afreq in afreqs]))

        #MQ RMS mapping quality, e.g. MQ=52
        #BQ RMS base quality at this position
        for kind, strfmt in (('mapping_quality', 'MQ=%.2f'),
                             ('quality', 'BQ=%.2f')):
            qual = qualifiers[kind]
            if qual is not None:
                toprint_items.append(strfmt % qual)

        #genotype count
        #we count in how many genotypes every allele has been found.
        allele_counts = self._allele_count_by_group(alleles=alleles,
                                           reference_allele=reference_allele[0],
                                        alternative_alleles=alternative_alleles,
                                          read_groups=qualifiers['read_groups'],
                                          count_reads=False)
        #if some allele is missing there are 0 counts of it
        n_als = max(allele_counts) + 1
        genotype_counts = [(al, len(allele_counts.get(al, [])))
                           for al in range(n_als)]

        #now we print
        counts = [str(count[1]) for count in genotype_counts]
        toprint_items.append('GC=%s' % (','.join(counts)))
        #genotype polymorphism
        #1 - (number of groups with the allele 0, the reference one /
        #     sum of the number of groups of every allele)
        number_of_groups = sum([count[1] for count in genotype_counts])

        genotype_polymorphism = 1 - genotype_counts[0][1] / float(number_of_groups)
        toprint_items.append('GP=%.2f' % genotype_polymorphism)

        #cap enzymes
        if 'cap_enzymes' in qualifiers and qualifiers['cap_enzymes']:
            to_print = 'EZ=%s' % ','.join(qualifiers['cap_enzymes'])
            toprint_items.append(to_print)

        if toprint_items:
            return ';'.join(toprint_items)
        else:
            return '.'

    @staticmethod
    def _create_quality(alleles, alternative_alleles):
        '''It returns the quality for this snv

        QUAL phred-scaled quality score for the assertion made in ALT. i.e. give
        -10log_10 prob(call in ALT is wrong). If ALT is "." (no variant) then
        this is -10log_10 p(variant), and if ALT is not "." this is -10log_10
        p(no variant). High QUAL scores indicate high confidence calls.
        Although traditionally people use integer phred scores, this field is
        permitted to be a floating point so to enable higher resolution for low
        confidence calls if desired. (Numeric, Missing Value: -1)

        alleles -- dict indexed by allele, every value has a 'quality' item.
        alternative_alleles -- the alleles listed in ALT.

        With one alternative allele its quality is used, with several ones
        the qualities of the first two are combined and without alternative
        alleles the quality of the first allele found in alleles is used.
        The quality is returned as text, truncated to an integer.
        '''

        if alternative_alleles:
            phreds = [alleles[allele]['quality'] for allele in alternative_alleles]
            if len(phreds) == 1:
                phred = phreds[0]
            else:
                # The combined error probability is the product of the first
                # two probabilities, 10^(-q1/10) * 10^(-q2/10), and its phred
                # score is just q1 + q2. Adding the scores avoids the integer
                # division that truncated -phred / 10 for integer qualities.
                phred = phreds[0] + phreds[1]
        else:
            # list() is required to index the values in python 3
            phred = list(alleles.values())[0]['quality']
        return '%i' % phred

    @staticmethod
    def _numbers_for_alleles(reference_allele, alternative_alleles):
        '''It returns a dict with the numbers for the alleles

        The reference allele, as an INVARIANT, has the index 0 and the
        alternative alleles follow in the given order.
        '''
        #a map from alleles to allele index (0 for reference, etc)
        alleles_index = [(reference_allele, INVARIANT)]
        alleles_index.extend(alternative_alleles)
        return dict((allele, index)
                    for index, allele in enumerate(alleles_index))

    def _allele_count_by_group(self, alleles, reference_allele,
                               alternative_alleles, read_groups, count_reads):
        '''It returns the allele counts by group

        It can answer two questions:
            - How many times has the allele been found in every group?
              (count_reads = True)
              It returns a dict indexed by group and by allele
              (with the vcf number coding)
            - In which groups has the allele been found?
              (count_reads = False)
              It returns a dict indexed by allele (with the vcf number coding)
              with the set of groups as value
        It takes into account the grouping key given to the writer.
        It requires the dict with the information about the read_groups.
        The groups found are added to the genotype groups of the writer.
        It raises a KeyError, after printing some debugging information, when
        an allele is neither the reference nor one of the alternative ones.
        '''
        #a map from alleles to allele index (0 for reference, etc)
        alleles_index = self._numbers_for_alleles(reference_allele,
                                                  alternative_alleles)
        grouping_key = self._genotype_grouping_key

        alleles_by_group = {}
        for allele, allele_info in alleles.items():
            #we need the index for the allele
            try:
                allele_index = alleles_index[allele]
            except KeyError:
                # one argument print calls do the same in python 2 and 3
                print('allele %s' % (allele,))
                print('allele index %s' % (alleles_index,))
                print('ref_allele %s' % (reference_allele,))
                print('alternative_alleles %s' % (alternative_alleles,))
                print('alleles %s' % (alleles,))
                raise

            for read_group in allele_info['read_groups']:
                group = _get_group(read_group, grouping_key, read_groups)
                if group not in self._genotype_groups:
                    self._genotype_groups[group] = True
                if count_reads:
                    group_counts = alleles_by_group.setdefault(group, {})
                    count = allele_info['read_groups'][read_group]
                    group_counts[allele_index] = (
                                    group_counts.get(allele_index, 0) + count)
                else:
                    alleles_by_group.setdefault(allele_index, set()).add(group)

        return alleles_by_group

    def _create_genotypes(self, qualifiers, alternative_alleles):
        '''It returns the genotype section for this snv

        The section has the FORMAT column, 'GT:EC', and one column for every
        genotype group known so far, all of them joined by tabs. Every group
        column has the indexes of its alleles joined by '|' and, after a ':',
        their read counts joined by ','. A group without alleles gets '.:.'.
        '''

        alleles = qualifiers['alleles']
        reference_allele = qualifiers['reference_allele']
        read_groups = qualifiers['read_groups']

        #the format
        items = ['GT:EC']

        #now we need the alleles for every sample
        alleles_by_group = self._allele_count_by_group(alleles=alleles,
                                              reference_allele=reference_allele,
                                        alternative_alleles=alternative_alleles,
                                                        read_groups=read_groups,
                                                               count_reads=True)

        #now we can build the info for every sample
        for group in self._genotype_groups:
            allele_counts = alleles_by_group.get(group, {None: None})
            group_alleles, group_counts = [], []
            for allele, count in allele_counts.items():
                group_alleles.append(str(allele) if allele is not None else '.')
                group_counts.append(str(count) if count is not None else '.')
            items.append('%s:%s' % ('|'.join(group_alleles),
                                    ','.join(group_counts)))

        return '\t'.join(items)

    def _write_snv(self, sequence, snv):
        '''Given an snv feature it writes a line in the vcf

        The line is appended to the temporary file, that is flushed.
        When the INFO column fails with a KeyError the sequence name and the
        position are printed and the error is raised again.
        '''
        items = [] #items to write
        items.append(get_seq_name(sequence))
        # the vcf positions start at 1
        items.append(str(int(snv.location.start.position) + 1))
        id_ = snv.id
        if id_ == "<unknown id>":
            id_ = '.'
        items.append(id_)
        qualifiers = snv.qualifiers
        ref_seq = qualifiers['reference_allele'].replace('-', '')
        items.append(ref_seq)
        toprint_af, alternative_alleles = self._create_alternative_alleles(
                                                          qualifiers['alleles'])
        items.append(toprint_af)
        items.append(self._create_quality(qualifiers['alleles'],
                                          alternative_alleles))
        filters = self._create_filters(qualifiers)
        items.append(filters)
        try:
            items.append(self._create_info(qualifiers, alternative_alleles))
        except KeyError:
            # one argument print calls do the same in python 2 and 3
            print('sequence %s' % (get_seq_name(sequence),))
            print('position %s' % (str(int(snv.location.start.position)),))
            raise

        items.append(self._create_genotypes(qualifiers, alternative_alleles))

        self._temp_fhand.write('%s\n' % '\t'.join(items))
        self._temp_fhand.flush()
コード例 #3
0
    def test_get_filter_description():
        'It checks the names and descriptions given to every kind of filter'

        namer = SnvNamer()

        # closeness to an intron
        name, desc = namer.get_filter_description('close_to_intron', 30, {})
        assert name == 'I30'
        assert desc == 'An intron is located closer than 30 base pairs'

        # maf with just the frequency as parameter
        name, desc = namer.get_filter_description('maf', 0.6, {})
        assert name == 'maf1'
        assert desc == 'The most frequent allele in All: All. frequency greater than 0.60'

        # maf restricted to some read groups
        maf_parameters = (0.6, ('1', '2'), 'read_group')
        name, desc = namer.get_filter_description('maf', maf_parameters, {})
        assert name == 'maf2'
        assert desc == 'The most frequent allele in read_group: 1,2. frequency greater than 0.60'

        # by kind, the call is done but its result is not checked
        name, desc = namer.get_filter_description('by_kind', SNP, {})

        # is variable, asked twice with the same descriptions dict
        # the parameters are: kind, groups, in_union, in_all_groups,
        # reference_free, maf and min_num_reads
        variable_parameters = ('read_groups', ('rg1', 'rg2'), True, True, True,
                               0.6, 3)
        variable_descriptions = {}
        name, desc = namer.get_filter_description('is_variable',
                                                  variable_parameters,
                                                  variable_descriptions)
        assert name[:3] == 'vrg'
        assert desc == ("It is not variable, or no data, in the read_groups : rg1,rg2."
                        ' All together: True. maf:0.600000. min_num_reads:3')

        name, desc = namer.get_filter_description('is_variable',
                                                  variable_parameters,
                                                  variable_descriptions)
        assert name[:3] == 'vrg'

        # is not variable with maf and min_num_reads
        not_variable_parameters = ('read_groups', ('rg1', 'rg2'), True, True,
                                   True, 0.7, 2)
        name, desc = namer.get_filter_description('is_not_variable',
                                                  not_variable_parameters, {})
        assert name[:4] == 'nvrg'
        assert desc == ("It is variable, or no data, in the read_groups : rg1,rg2."
                        ' All together: True. maf:0.700000. min_num_reads:2')

        # is not variable without maf and min_num_reads
        # this descriptions dict is shared with the calls that follow
        shared_descriptions = {}
        not_variable_parameters = ('read_groups', ('rg1', 'rg2'), True, True,
                                   True, None, None)
        name, desc = namer.get_filter_description('is_not_variable',
                                                  not_variable_parameters,
                                                  shared_descriptions)
        assert name[:4] == 'nvrg'
        assert desc == ("It is variable, or no data, in the read_groups : rg1,rg2."
                        ' All together: True')

        # minimum number of groups
        name, desc = namer.get_filter_description('min_groups',
                                                  (4, 'read_groups'),
                                                  shared_descriptions)
        assert name == 'mr4'
        assert desc == 'SNV read in less than 4 read_groups'

        # cap enzymes, all of them
        name, desc = namer.get_filter_description('cap_enzymes', True,
                                                  shared_descriptions)
        assert name == 'cet'
        assert desc == 'SNV is not a CAP detectable by the enzymes: all'

        # cap enzymes, only the cheap ones
        name, desc = namer.get_filter_description('cap_enzymes', False,
                                                  shared_descriptions)
        assert name == 'cef'
        assert desc == 'SNV is not a CAP detectable by the enzymes: cheap ones'