def __init__(self, fhand, reference_name, grouping=None):
    """Prepare the writer state and the static part of the vcf header.

    fhand -- file-like object; only its ``name`` attribute is used, the
             file is truncated and then reopened here in append mode.
    reference_name -- value written in the ``##reference`` header line.
    grouping -- key used to group the read groups into genotype columns;
                defaults to 'read_groups' when None.
    """
    # Empty the output file before appending to it. The truncating handle
    # is closed explicitly instead of being left to the garbage collector.
    open(fhand.name, 'w').close()
    self.fhand = open(fhand.name, 'a')

    self._namer = SnvNamer()
    # The snv lines are accumulated here until the header can be completed.
    self._temp_fhand = NamedTemporaryFile(mode='a')
    self._filter_descriptions = {}
    self._header = []

    if grouping is None:
        grouping = 'read_groups'
    self._genotype_grouping_key = grouping
    # Used as an ordered set: the keys are the groups seen so far.
    self._genotype_groups = OrderedDict()

    self._get_pre_header(reference_name)
    self.num_features = 0
class VariantCallFormatWriter(object):
    """It writes variant call format files for the snvs.

    The snv lines are first written to a temporary file by write(). close()
    then writes the complete header followed by those lines to the output
    file, because the FILTER header lines and the genotype group columns
    are only known once every snv has been processed.
    """

    def __init__(self, fhand, reference_name, grouping=None):
        """Prepare the writer state and the static part of the vcf header.

        fhand -- file-like object; only its ``name`` attribute is used, the
                 file is truncated and then reopened here in append mode.
        reference_name -- value written in the ``##reference`` header line.
        grouping -- key used to group the read groups into genotype
                    columns; defaults to 'read_groups' when None.
        """
        # Empty the output file before appending to it. The truncating
        # handle is closed explicitly instead of being left to the garbage
        # collector.
        open(fhand.name, 'w').close()
        self.fhand = open(fhand.name, 'a')
        self._namer = SnvNamer()
        self._temp_fhand = NamedTemporaryFile(mode='a')
        self._filter_descriptions = {}
        self._header = []
        if grouping is None:
            grouping = 'read_groups'
        self._genotype_grouping_key = grouping
        # Used as an ordered set: the keys are the groups seen so far.
        self._genotype_groups = OrderedDict()
        self._get_pre_header(reference_name)
        self.num_features = 0

    def _get_pre_header(self, reference_name):
        """Append to the header the lines that do not depend on the snvs."""
        today = datetime.date.today().strftime('%Y%m%d')
        self._header.extend([
            '##fileformat=VCFv4.1',
            '##fileDate=%s' % today,
            '##source=franklin',
            '##reference=%s' % reference_name,
            '##INFO=<ID=NS,Number=1,Type=Integer,Description="Number of Samples With Data">',
            '##INFO=<ID=AF,Number=A,Type=Float,Description="Allele Frequency">',
            '##INFO=<ID=RC,Number=A,Type=Integer,Description="Read count of the alt alleles">',
            '##INFO=<ID=MQ,Number=1,Type=Float,Description="RMS Mapping Quality">',
            '##INFO=<ID=BQ,Number=1,Type=Float,Description="RMS Base Quality">',
            '##INFO=<ID=GC,Number=.,Type=String,Description="Genotype Counts: Num. genotypes in which every alleles has been detected">',
            '##INFO=<ID=GP,Number=1,Type=String,Description="Genotype polimorphism">',
            '##INFO=<ID=EZ,Number=1,Type=String,Description="CAP enzymes">',
            '##FORMAT=<ID=GT,Number=1,Type=String,Description="Read group Genotype">',
            '##FORMAT=<ID=EC,Number=.,Type=Integer,Description="Allele count for the ref and alt alleles in the order listed">',
        ])

    def close(self):
        """Write the finished header and the stored snv lines to the output.

        Lines written before a genotype group was first seen have fewer
        columns than the final header; they are padded with '.:.'.
        The output file is flushed, not closed.
        """
        fhand = self.fhand
        self._add_filters_to_header()
        line_items = ['#CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER',
                      'INFO', 'FORMAT']
        line_items.extend(group.upper() for group in self._genotype_groups)
        num_items_per_line = len(line_items)
        self._header.append('%s\n' % '\t'.join(line_items))
        fhand.write('\n'.join(self._header))

        # The with block guarantees that the read handle gets closed.
        with open(self._temp_fhand.name) as snv_lines:
            for line in snv_lines:
                # The columns are tab separated. Splitting on any whitespace
                # would break fields that contain spaces and would drop
                # empty fields, shifting the remaining columns.
                fields = line.rstrip('\n').split('\t')
                missing = num_items_per_line - len(fields)
                fields.extend(['.:.'] * missing)
                fhand.write('\t'.join(fields) + '\n')
        fhand.flush()

    def _add_filters_to_header(self):
        """Append one ##FILTER line per filter used while writing the snvs."""
        for name, desc in self._filter_descriptions.values():
            filter_desc = '##FILTER=<ID=%s,Description="%s">' % (name, desc)
            self._header.append(filter_desc)

    def write(self, sequence):
        """Write every 'snv' feature of the given sequence.

        num_features is increased by one for each snv found.
        """
        for snv in sequence.get_features(kind='snv'):
            self.num_features += 1
            self._write_snv(sequence, snv)

    @staticmethod
    def _create_alternative_alleles(alleles):
        """Return the ALT column and the list of non INVARIANT alleles.

        Every allele is an (allele, kind) tuple. The dashes are removed from
        the alleles to build the ALT string; '.' is returned when there is
        no alternative allele.
        """
        alternative_alleles = [allele for allele in alleles
                               if allele[1] != INVARIANT]
        if alternative_alleles:
            str_alleles = ','.join(allele[0].replace('-', '')
                                   for allele in alternative_alleles)
        else:
            str_alleles = '.'
        return str_alleles, alternative_alleles

    def _create_filters(self, qualifiers):
        """Return the FILTER column for the given snv qualifiers.

        '.' when no filter has been run, 'PASS' when no filter result is
        true, otherwise the upper-cased short names joined by ';'. The
        filters used are remembered to be described in the header.
        """
        if 'filters' not in qualifiers:
            return '.'
        filter_strs = []
        for name, filters_data in qualifiers['filters'].items():
            for parameters, result in filters_data.items():
                if not result:
                    continue
                short_name, description = self._namer.get_filter_description(
                                    name, parameters, self._filter_descriptions)
                short_name = short_name.upper()
                filter_strs.append(short_name)
                self._filter_descriptions[name, parameters] = (short_name,
                                                               description)
        if not filter_strs:
            return 'PASS'
        return ';'.join(filter_strs)

    def _create_info(self, qualifiers, alternative_alleles):
        """Return the INFO column (RC, AF, MQ, BQ, GC, GP and EZ items).

        A KeyError is raised if qualifiers lacks one of the required keys.
        """
        toprint_items = []
        alleles = qualifiers['alleles']

        # RC: read count for each ALT allele, in the same order as listed
        acounts = [_allele_count(allele, alleles, group_kind='read_groups')
                   for allele in alternative_alleles]
        if acounts:
            toprint_items.append('RC=%s' % ','.join(map(str, acounts)))

        # AF: allele frequency for each ALT allele, in the same order
        reference_allele = qualifiers['reference_allele'], INVARIANT
        if reference_allele in alleles:
            ref_count = _allele_count(reference_allele, alleles,
                                      group_kind='read_groups')
        else:
            ref_count = 0
        total_count = float(sum(acounts) + ref_count)
        afreqs = [acount / total_count for acount in acounts]
        if afreqs:
            toprint_items.append('AF=%s' % ','.join('%.1f' % afreq
                                                    for afreq in afreqs))

        # MQ: RMS mapping quality, BQ: RMS base quality at this position
        for kind, strfmt in (('mapping_quality', 'MQ=%.2f'),
                             ('quality', 'BQ=%.2f')):
            qual = qualifiers[kind]
            if qual is not None:
                toprint_items.append(strfmt % qual)

        # GC: in how many groups every allele has been found
        allele_counts = self._allele_count_by_group(
                                    alleles=alleles,
                                    reference_allele=reference_allele[0],
                                    alternative_alleles=alternative_alleles,
                                    read_groups=qualifiers['read_groups'],
                                    count_reads=False)
        # an allele number missing from the dict has 0 counts
        n_als = max(allele_counts) + 1
        genotype_counts = [len(allele_counts.get(al, []))
                           for al in range(n_als)]
        toprint_items.append('GC=%s' % ','.join(map(str, genotype_counts)))

        # GP: genotype polymorphism
        # NOTE(review): the original comment defined it as
        # 1 - (number_groups_for_the_allele_with_more_groups / number_groups)
        # but the count used is the one of allele 0 (the reference), not the
        # biggest one. Confirm which one is intended.
        number_of_groups = sum(genotype_counts)
        genotype_polymorphism = 1 - genotype_counts[0] / float(number_of_groups)
        toprint_items.append('GP=%.2f' % genotype_polymorphism)

        # EZ: cap enzymes
        if 'cap_enzymes' in qualifiers and qualifiers['cap_enzymes']:
            toprint_items.append('EZ=%s' % ','.join(qualifiers['cap_enzymes']))

        if toprint_items:
            return ';'.join(toprint_items)
        return '.'

    @staticmethod
    def _create_quality(alleles, alternative_alleles):
        '''It returns the quality for this snv

        QUAL phred-scaled quality score for the assertion made in ALT. i.e.
        give -10log_10 prob(call in ALT is wrong). If ALT is "." (no variant)
        then this is -10log_10 p(variant), and if ALT is not "." this is
        -10log_10 p(no variant). High QUAL scores indicate high confidence
        calls. Although traditionally people use integer phred scores, this
        field is permitted to be a floating point so to enable higher
        resolution for low confidence calls if desired.
        (Numeric, Missing Value: -1)

        alleles maps every allele to a dict with a 'quality' item. With one
        alternative allele its quality is used, with several the qualities of
        the first two are combined and with none the quality of the first
        allele in alleles is used. The result is a string with the quality
        truncated to an integer.
        '''
        if alternative_alleles:
            phreds = [alleles[allele]['quality']
                      for allele in alternative_alleles]
            if len(phreds) == 1:
                phred = phreds[0]
            else:
                # The probability of both calls being wrong is p0 * p1 and
                # -10 * log10(10 ** (-a / 10) * 10 ** (-b / 10)) == a + b.
                # Adding the phreds gives that value exactly; going through
                # the probabilities truncated integer phreds (-35 / 10 is -4
                # in python 2) and was exposed to float rounding.
                phred = phreds[0] + phreds[1]
        else:
            phred = list(alleles.values())[0]['quality']
        return '%i' % phred

    @staticmethod
    def _numbers_for_alleles(reference_allele, alternative_alleles):
        """Return a dict from allele to its vcf number.

        The reference allele, as (reference_allele, INVARIANT), gets the 0
        and the alternative alleles get 1, 2... in the order given.
        """
        ordered_alleles = [(reference_allele, INVARIANT)]
        ordered_alleles.extend(alternative_alleles)
        return dict((allele, index)
                    for index, allele in enumerate(ordered_alleles))

    def _allele_count_by_group(self, alleles, reference_allele,
                               alternative_alleles, read_groups, count_reads):
        '''It returns the allele counts by group

        It can answer to two questions:
            - How many times have the allele been found in every group?
              (count_reads = True)
              It returns a dict indexed by group and the alleles (with the
              vcf number coding)
            - in which groups the allele have been found?
              (count_reads= False)
              It returns a dict indexed by the alleles (with the vcf number
              coding)
        It takes into account the grouping_key (read_group, sample or
        library)
        It requires the dict with the information about the read_groups.

        Every group found is also registered in self._genotype_groups. A
        KeyError is raised, after printing the data involved, when an allele
        is neither the reference nor one of the alternative alleles.
        '''
        # a map from alleles to allele index (0 for reference, etc)
        alleles_index = self._numbers_for_alleles(reference_allele,
                                                  alternative_alleles)
        grouping_key = self._genotype_grouping_key
        alleles_by_group = {}
        for allele, allele_info in alleles.items():
            try:
                allele_index = alleles_index[allele]
            except KeyError:
                # Single argument prints write the same text as the former
                # print statements and are valid in python 2 and 3.
                print('allele %s' % (allele,))
                print('allele index %s' % (alleles_index,))
                print('ref_allele %s' % (reference_allele,))
                print('alternative_alleles %s' % (alternative_alleles,))
                print('alleles %s' % (alleles,))
                raise
            for read_group, count in allele_info['read_groups'].items():
                group = _get_group(read_group, grouping_key, read_groups)
                self._genotype_groups.setdefault(group, True)
                if count_reads:
                    group_counts = alleles_by_group.setdefault(group, {})
                    group_counts[allele_index] = (
                                    group_counts.get(allele_index, 0) + count)
                else:
                    alleles_by_group.setdefault(allele_index,
                                                set()).add(group)
        return alleles_by_group

    def _create_genotypes(self, qualifiers, alternative_alleles):
        """Return the FORMAT column followed by one column per group.

        Every group column is 'allele numbers:read counts', with the allele
        numbers joined by '|' and the counts by ','. A group already known
        to the writer but without reads in this snv gets '.:.'.
        """
        items = ['GT:EC']   # the format

        # This call also registers in self._genotype_groups the groups seen
        # for the first time, so it has to be done before going through them.
        alleles_by_group = self._allele_count_by_group(
                            alleles=qualifiers['alleles'],
                            reference_allele=qualifiers['reference_allele'],
                            alternative_alleles=alternative_alleles,
                            read_groups=qualifiers['read_groups'],
                            count_reads=True)

        for group in self._genotype_groups:
            allele_counts = alleles_by_group.get(group)
            if allele_counts is None:
                items.append('.:.')
                continue
            # Sorted so the output does not depend on the dict ordering.
            allele_numbers = sorted(allele_counts)
            mix_genotype = '|'.join(str(number) for number in allele_numbers)
            counts = ','.join(str(allele_counts[number])
                              for number in allele_numbers)
            items.append('%s:%s' % (mix_genotype, counts))
        return '\t'.join(items)

    def _write_snv(self, sequence, snv):
        """Write the vcf line for the given snv feature to the temp file.

        The position written is the snv start plus one. A KeyError raised
        while creating the INFO column is re-raised after printing the
        sequence name and the position.
        """
        qualifiers = snv.qualifiers
        items = []  # items to write
        items.append(get_seq_name(sequence))
        items.append(str(int(snv.location.start.position) + 1))
        id_ = snv.id
        if id_ == "<unknown id>":
            id_ = '.'
        items.append(id_)
        items.append(qualifiers['reference_allele'].replace('-', ''))
        toprint_af, alternative_alleles = self._create_alternative_alleles(
                                                        qualifiers['alleles'])
        items.append(toprint_af)
        items.append(self._create_quality(qualifiers['alleles'],
                                          alternative_alleles))
        items.append(self._create_filters(qualifiers))
        try:
            items.append(self._create_info(qualifiers, alternative_alleles))
        except KeyError:
            print('sequence %s' % (get_seq_name(sequence),))
            print('position %s' % str(int(snv.location.start.position)))
            raise
        items.append(self._create_genotypes(qualifiers, alternative_alleles))
        self._temp_fhand.write('%s\n' % '\t'.join(items))
        self._temp_fhand.flush()
def test_get_filter_description():
    """Check the short names and descriptions built by SnvNamer.

    The calls are kept in their original order and reuse or reset the
    filter_descriptions dict exactly where the original did, as the names
    returned may depend on what has been requested before.
    """
    namer = SnvNamer()

    def describe(filter_name, parameters, filter_descriptions):
        'It returns the (name, description) for the given filter'
        return namer.get_filter_description(filter_name, parameters,
                                            filter_descriptions)

    def variability_parameters(maf, min_num_reads):
        'It builds the parameters shared by the variability filters'
        groups = ['rg1', 'rg2']
        return ('read_groups', tuple(groups), True, True, True, maf,
                min_num_reads)

    # close to intron
    name, desc = describe('close_to_intron', 30, {})
    assert name == 'I30'
    assert desc == 'An intron is located closer than 30 base pairs'

    # maf, first with just the frequency and then restricted to some groups
    name, desc = describe('maf', 0.6, {})
    assert name == 'maf1'
    assert desc == 'The most frequent allele in All: All. frequency greater than 0.60'

    name, desc = describe('maf', (0.6, ('1', '2'), 'read_group'), {})
    assert name == 'maf2'
    assert desc == 'The most frequent allele in read_group: 1,2. frequency greater than 0.60'

    # by kind, it only has to run
    name, desc = describe('by_kind', SNP, {})

    # is variable, asked twice with the same descriptions dict
    parameters = variability_parameters(maf=0.6, min_num_reads=3)
    filter_descriptions = {}
    name, desc = describe('is_variable', parameters, filter_descriptions)
    assert name[:3] == 'vrg'
    expected = ("It is not variable, or no data, in the read_groups : rg1,rg2."
                + ' All together: True. maf:0.600000. min_num_reads:3')
    assert desc == expected
    name, desc = describe('is_variable', parameters, filter_descriptions)
    assert name[:3] == 'vrg'

    # is not variable, with maf and min_num_reads
    parameters = variability_parameters(maf=0.7, min_num_reads=2)
    name, desc = describe('is_not_variable', parameters, {})
    assert name[:4] == 'nvrg'
    expected = ("It is variable, or no data, in the read_groups : rg1,rg2."
                + ' All together: True. maf:0.700000. min_num_reads:2')
    assert desc == expected

    # is not variable, without maf and min_num_reads
    parameters = variability_parameters(maf=None, min_num_reads=None)
    filter_descriptions = {}
    name, desc = describe('is_not_variable', parameters, filter_descriptions)
    assert name[:4] == 'nvrg'
    expected = ("It is variable, or no data, in the read_groups : rg1,rg2."
                + ' All together: True')
    assert desc == expected

    # from here on the last descriptions dict is reused
    name, desc = describe('min_groups', (4, 'read_groups'),
                          filter_descriptions)
    assert name == 'mr4'
    assert desc == 'SNV read in less than 4 read_groups'

    name, desc = describe('cap_enzymes', True, filter_descriptions)
    assert name == 'cet'
    assert desc == 'SNV is not a CAP detectable by the enzymes: all'

    name, desc = describe('cap_enzymes', False, filter_descriptions)
    assert name == 'cef'
    assert desc == 'SNV is not a CAP detectable by the enzymes: cheap ones'