def add_log2ratios_to_records(vcf_reader, segments_dict):
    """Yield every record of `vcf_reader` with an ``LR`` genotype field.

    For each sample whose name is a key of `segments_dict`, the value is
    whatever ``logratio_at_site(record, segments)`` returns; samples that
    are not in the dict get ``None``.  Each record is modified in place
    (FORMAT and ``samples``) before it is yielded.
    """
    for record in vcf_reader:
        fields = record.FORMAT.split(':')
        if fields[-1] != 'LR':
            # The usual case: no LR column yet, so extend the FORMAT.
            CallData = make_calldata_tuple(fields + ['LR'])
            record.add_format("LR")
            kept_fields = fields
        else:
            # LR is already the last column -- its old values get replaced.
            CallData = make_calldata_tuple(fields)
            kept_fields = fields[:-1]

        rebuilt = []
        for genotype in record.samples:
            name = genotype.sample
            site_log2 = (logratio_at_site(record, segments_dict[name])
                         if name in segments_dict else None)
            # Carry the existing values over and put LR last.
            values = [getattr(genotype.data, fmt, None) for fmt in kept_fields]
            values.append(site_log2)
            rebuilt.append(Call(record, name, CallData(*values)))
        record.samples = rebuilt
        yield record
def _grid_item_to_vcf_record(info_dict, obj, sample_ids, sample_names):
    """Build a VCF ``_Record`` from one grid row.

    Args:
        info_dict: mapping of INFO id -> dict holding a
            ``'column__variant_column'`` entry that names the key of `obj`
            to read the value from.
        obj: mapping with the row values (locus, alt, annotation and the
            per-sample ``"<sample_id>_samples_*"`` entries).
        sample_ids: ids used to build the per-sample keys; when falsy the
            record is created without FORMAT and without calls.
        sample_names: names paired positionally with `sample_ids` and used
            as the call sample names.

    Returns:
        The ``_Record``, with ``samples`` set to the list of created calls.
    """
    chrom = obj.get("locus__contig__name", ".")
    pos = obj.get("locus__position", ".")
    variant_id = obj.get("variantannotation__dbsnp_rs_id")
    ref = obj.get("locus__ref__seq", ".")
    alt_seq = obj.get("alt__seq", ".")
    qual = '.'
    record_filter = None

    # Only truthy values make it into INFO.
    info = {}
    for info_id, column_data in info_dict.items():
        value = obj.get(column_data['column__variant_column'])
        if value:
            info[info_id] = value

    format_keys = ['GT', 'AD', 'AF', 'PL', 'DP', 'GQ']
    CallData = make_calldata_tuple(format_keys)
    sample_indexes = {}
    calls = []
    record_format = ':'.join(format_keys) if sample_ids else None

    record = _Record(chrom, pos, variant_id, ref, [_Substitution(alt_seq)],
                     qual, record_filter, info, record_format, sample_indexes)

    if sample_ids:
        for index, (sample_id, sample) in enumerate(zip(sample_ids, sample_names)):
            ad = obj[f"{sample_id}_samples_allele_depth"]
            zygosity = obj[f"{sample_id}_samples_zygosity"]
            gt = Zygosity.get_genotype_from_expanded_zygosity(zygosity)
            dp = obj[f"{sample_id}_samples_read_depth"]
            af = obj[f"{sample_id}_samples_allele_frequency"]
            # GQ/PL are optional, so they fall back to "." when absent.
            # TODO: Ideally, we'd not write them out
            pl = obj.get(f"{sample_id}_samples_phred_likelihood", ".")
            gq = obj.get(f"{sample_id}_samples_genotype_quality", ".")

            # TODO: Need the reference-base information to fill in the
            # leading "." placeholders properly.
            call_data = CallData(
                GT=gt,
                AD=['.', ad],
                AF=['.', af],
                PL=['.', pl],
                DP=['.', dp],
                GQ=['.', gq],
            )
            calls.append(_Call(record, sample, call_data))
            # sample_indexes is the same dict object that was handed to _Record.
            sample_indexes[sample] = index

    record.samples = calls
    return record
def _update_sample(self, record):
    """Set the record's QUAL and return call data limited to the new fields.

    QUAL becomes ``float(self._get_min_value(...))`` of the first sample's
    ``CGA_CEHQ`` value.  The returned namedtuple holds, in the order of
    ``self.new_format_fields``, the matching attributes of the first
    sample's call data.
    """
    first_call = record.samples[0]
    record.QUAL = float(self._get_min_value(first_call.data.CGA_CEHQ))

    calldata_cls = make_calldata_tuple(self.new_format_fields)
    values = [getattr(first_call.data, name) for name in self.new_format_fields]
    return calldata_cls._make(values)
def write_record(self, marker):
    '''Write the marker data to outstream.

    Copies ``marker.info`` into the record's INFO (floats rounded to three
    decimals, ``False``/``None`` skipped), makes sure the FORMAT keys
    ``NF``, ``INS`` and ``DC`` exist, appends the matching per-sample
    values (``None`` when a value is unavailable) and hands the record to
    ``self.writer``.

    Args:
        marker: object exposing ``info`` (dict), ``calls`` (items with
            ``sample``, ``NF`` and ``INS``), ``record`` and, for the
            dropout count, ``param``.

    Returns:
        None.
    '''
    record = marker.record

    # Update the vcf INFO field.
    for info_id, val in marker.info.items():
        if isinstance(val, float):
            record.add_info(info_id, round(val, 3))
        elif isinstance(val, bool):
            # Flags: only a set flag is written, False is dropped.
            if val is True:
                record.add_info(info_id, val)
        elif val is not None:
            record.add_info(info_id, val)

    # Hash normalization factors and insert sizes, keyed by sample name.
    nf = {}
    ins = {}
    for call in marker.calls:
        nf[call.sample] = call.NF
        ins[call.sample] = call.INS

    # Update the vcf FORMAT field.  FORMAT is a colon-separated string, so
    # compare against whole keys: a plain substring test would treat a key
    # such as "XNF" or "ADC" as if "NF"/"DC" were already present.
    for key in ('NF', 'INS', 'DC'):
        if key not in record.FORMAT.split(':'):
            record.add_format(key)

    # Update the vcf sample data.
    for sample in record.samples:
        ids = list(sample.data._fields)
        vals = list(sample.data)

        if 'NF' not in ids:
            ids.append('NF')
            try:
                value = round(nf[sample.sample], 3)
            except (KeyError, TypeError):
                # No call for this sample, or a non-numeric factor.
                value = None
            vals.append(value)

        if 'INS' not in ids:
            ids.append('INS')
            try:
                value = int(ins[sample.sample])
            except (KeyError, TypeError, ValueError, OverflowError):
                # No call for this sample, or a value int() cannot convert.
                value = None
            vals.append(value)

        if 'DC' not in ids:
            ids.append('DC')
            try:
                dropout_count = marker.param['H1'][-1]['exp_phi'][sample.sample][2]
                value = round(dropout_count, 3)
            except (AttributeError, KeyError, IndexError, TypeError):
                # Best effort: the parameter structure is missing or incomplete.
                value = None
            vals.append(value)

        new_cls = make_calldata_tuple(ids)
        sample.data = new_cls._make(vals)

    # Write record to outstream.
    self.writer.write_record(record)
    return None
def write_record(self, marker):
    '''Write the marker data to outstream.

    The record's INFO is filled from ``marker.info`` (floats are rounded
    to three decimals; ``False`` and ``None`` values are not written), the
    FORMAT keys ``NF``, ``INS`` and ``DC`` are added when missing, every
    sample gets the corresponding values appended (``None`` when a value
    cannot be obtained) and the record is passed to ``self.writer``.

    Args:
        marker: object exposing ``info`` (dict), ``calls`` (items with
            ``sample``, ``NF`` and ``INS``), ``record`` and, for the
            dropout count, ``param``.

    Returns:
        None.
    '''
    vcf_record = marker.record

    # Update the vcf INFO field.
    for info_id, val in marker.info.items():
        if isinstance(val, float):
            vcf_record.add_info(info_id, round(val, 3))
        elif isinstance(val, bool):
            # Only flags that are set are written.
            if val is True:
                vcf_record.add_info(info_id, val)
        elif val is not None:
            vcf_record.add_info(info_id, val)

    # Hash normalization factors and insert sizes, keyed by sample name.
    nf = {}
    ins = {}
    for call in marker.calls:
        nf[call.sample] = call.NF
        ins[call.sample] = call.INS

    # Update the vcf FORMAT field.  Membership is tested on the split key
    # list; testing the raw colon-joined string would match substrings
    # (e.g. "NF" inside "XNF") and skip keys that are really missing.
    for format_key in ('NF', 'INS', 'DC'):
        if format_key not in vcf_record.FORMAT.split(':'):
            vcf_record.add_format(format_key)

    # Update the vcf sample data.
    for sample in vcf_record.samples:
        ids = list(sample.data._fields)
        vals = list(sample.data)

        if 'NF' not in ids:
            ids.append('NF')
            try:
                norm_factor = round(nf[sample.sample], 3)
            except (KeyError, TypeError):
                # Sample has no call, or its factor is not a number.
                norm_factor = None
            vals.append(norm_factor)

        if 'INS' not in ids:
            ids.append('INS')
            try:
                insert_size = int(ins[sample.sample])
            except (KeyError, TypeError, ValueError, OverflowError):
                # Sample has no call, or the value cannot become an int.
                insert_size = None
            vals.append(insert_size)

        if 'DC' not in ids:
            ids.append('DC')
            try:
                dropout_count = marker.param['H1'][-1]['exp_phi'][sample.sample][2]
                dropout = round(dropout_count, 3)
            except (AttributeError, KeyError, IndexError, TypeError):
                # Best effort: parameters absent or shaped differently.
                dropout = None
            vals.append(dropout)

        new_cls = make_calldata_tuple(ids)
        sample.data = new_cls._make(vals)

    # Write record to outstream.
    self.writer.write_record(vcf_record)
    return None
def make_call_data(formats, gtypes, caller_flags):
    """Create a pyvcf-compatible namedtuple of a sample's genotype values.

    Args:
        formats: list of FORMAT keys; must contain ``"GT"`` (otherwise
            ``list.index`` raises ValueError).
        gtypes: genotype data handed to ``consensus_of_gtype`` once per
            FORMAT key.
        caller_flags: mapping whose values say whether each caller called
            the variant.  The values are appended after the FORMAT values
            in the mapping's iteration order -- presumably an ordered
            mapping aligned with ``CALLER_CODES``; verify against caller.

    Returns:
        An instance of the call-data class built for
        ``formats + CALLER_CODES``.
    """
    CallData = make_calldata_tuple(formats + CALLER_CODES)
    call_vals = [consensus_of_gtype(gtypes, fmt) for fmt in formats]

    # Set GT field by whether any caller called the variant
    idx = formats.index("GT")
    if idx != 0:
        print("*** oddly ordered FORMAT:", formats, file=sys.stderr)
    orig_gt = call_vals[idx] or "0/1"

    # Materialise the flags once: dict.viewvalues() exists only on
    # Python 2, and on Python 3 dict.values() is a view that cannot be
    # concatenated to a list.
    flag_values = list(caller_flags.values())
    is_called = int(any(flag_values))
    call_vals[idx] = SWITCH_GT[orig_gt][is_called]

    # Rebuild the genotype data structure
    return CallData(*(call_vals + flag_values))
def _parse_sample_format(self, samp_fmt):
    """Turn a colon-separated FORMAT string into a call-data class.

    The returned class gets one entry appended to ``_types`` and ``_nums``
    per field: taken from ``self.formats`` when the field is declared
    there, otherwise the type comes from ``RESERVED_FORMAT`` (falling back
    to ``'String'``) and the number is ``None``.
    """
    calldata_cls = make_calldata_tuple(samp_fmt.split(':'))

    for name in calldata_cls._fields:
        try:
            declared = self.formats[name]
        except KeyError:
            # Field not declared in the header.
            field_num = None
            try:
                field_type = RESERVED_FORMAT[name]
            except KeyError:
                field_type = 'String'
        else:
            field_type = declared.type
            field_num = declared.num
        calldata_cls._types.append(field_type)
        calldata_cls._nums.append(field_num)

    return calldata_cls
def filter_variants(self, keep_only_snps=False, only_good=False):
    """Filter the VCF records.

    Registers every filter in ``self.filters`` in the reader's header
    (dropping a previously recorded filter with the same decoded keys),
    walks the records while padding position gaps with ``GT=./.`` records,
    applies ``self._filter_record`` to each one and collects the results
    in ``self._variants``.

    Parameters
    ----------
    keep_only_snps: bool, optional
        Retain only SNP variants (default: False).
    only_good: bool, optional
        When False, records that carry a FILTER value are also stored in
        ``self._variants`` (default: False).

    Returns
    -------
    list
        The records in ``self._variants`` whose FILTER is empty or None.
    """
    if self._reader is None:
        # Create a reader class from input VCF.
        self._reader = vcf.Reader(filename=self.vcf_in)

    # Get the list of existing filters, keyed by the tuple of their decoded keys.
    existing_filters = {}
    removed_filters = []
    for filter_id in self._reader.filters:
        conf = PHEFilterBase.decode(filter_id)
        # NOTE(review): the next statement discards its result and has no effect.
        tuple(conf.keys())
        existing_filters.update({tuple(conf.keys()):filter_id})

    # Add each filter we are going to use to the record.
    # This is needed for writing out proper #FILTER header in VCF.
    for record_filter in self.filters:
        # We know that each filter has short description method.
        short_doc = record_filter.short_desc()
        # Keep only the first line of the description, without leading whitespace.
        short_doc = short_doc.split('\n')[0].lstrip()
        filter_name = PHEFilterBase.decode(record_filter.filter_name())
        # Check if the sample has been filtered for this type of filter
        # in the past. If so, remove it, because it is going to be refiltered.
        if tuple(filter_name) in existing_filters:
            logging.info("Removing existing filter: %s", existing_filters[tuple(filter_name)])
            removed_filters.append(existing_filters[tuple(filter_name)])
            del self._reader.filters[existing_filters[tuple(filter_name)]]
        self._reader.filters[record_filter.filter_name()] = _Filter(record_filter.filter_name(), short_doc)

    # Update the filters for output.
    self._update_filters(self._reader.filters)

    _pos = 1
    _chrom = None
    # For each record (POSITION) apply set of filters.
    for record in self._reader:
        # Restart position counting whenever the chromosome changes.
        if _chrom != record.CHROM:
            _pos, _chrom = 1, record.CHROM

        # Fill in any missing consecutive data with GT=./. records.
        while _pos <= record.POS:
            if _pos == record.POS:
                _record = record
            else:
                # This is a padding "N" record when records do not follow each other,
                # and there is a gap, e.g. 1,2,3,5,6 -> at 4 an "N" will be inserted.
                _ref = self._get_reference_base(record.CHROM, _pos)
                _record = vcf.model._Record(record.CHROM, _pos, ".", _ref, [None], 0, [], {}, 'GT', None)
                _calls = []
                # Same samples, in the same index order, as the real record.
                sorted_samples = sorted(record._sample_indexes.items(), key=operator.itemgetter(1))
                for sample, i in sorted_samples:
                    # A GT-only call-data class is built for every sample.
                    _data = make_calldata_tuple(["GT"])
                    _data._types = ["String"]
                    _data._nums = [1]
                    d = ["./."]
                    _calls.append(vcf.model._Call(_record, sample=sample, data=_data(*d)))
                _record.samples = _calls
                _record._sample_indexes = dict(sorted_samples)

            self._filter_record(_record, removed_filters)

            # After applying all filters, check if FILTER is None.
            # If it is, then record PASSED all filters.
            # NOTE(review): the nesting of the branches below was inferred from the
            # comments and should be confirmed; also note that the monomorphic test
            # reads `record` (the real record), not `_record`.
            if _record.FILTER is None or _record.FILTER == []:
                if not record.is_monomorphic:
                    _record.FILTER = []
                    if not keep_only_snps or (_record.is_snp and keep_only_snps):
                        self._variants.append(_record)
            elif not only_good:
                self._variants.append(_record)

            _pos += 1

        # NOTE(review): _chrom is already assigned at the top of this loop body,
        # so this branch looks unreachable -- confirm.
        if _chrom is None:
            _chrom = record.CHROM

    # Only records with an empty FILTER are returned.
    return [ variant for variant in self._variants if not variant.FILTER]
def calldata_class(self):
    """Return the call-data class for the colon-separated FORMAT keys of ``self.record``."""
    format_keys = self.record.FORMAT.split(':')
    return make_calldata_tuple(format_keys)
def create_call(sample_id, **kwargs):
    # Builds a call-data class whose fields are the keyword-argument names.
    # NOTE(review): `sample_id` and `nt` are not used and nothing is returned
    # in the code visible here -- the rest of this function may be missing;
    # confirm against the full file.
    keys = list(kwargs.keys())
    nt = make_calldata_tuple(keys)
def annotate_variants(vcf_iter, vep_cols, sample_names, annotated_snp_pos_dict,
                      cosmic_pos_dict, snp_threshold, hotspot_codons):
    """Generator yielding annotated records ready for being outputted to a file.

    Combine samples together.

    Args:
        vcf_iter: iterable of record groups; each group holds one record
            (or ``None``) per entry of `sample_names`.
        vep_cols: column names used to encode the top effect.
        sample_names: sample names, positionally matching each group.
        annotated_snp_pos_dict: container of position ids flagged
            ``ANN_SNP_POS``.
        cosmic_pos_dict: container of position ids flagged ``COSMIC``.
        snp_threshold: minimum number of affected samples for ``SNP``.
        hotspot_codons: mapping transcript -> mapping codon number ->
            truthy when the codon is a hotspot.  Looked-up codons are
            inserted with ``False`` via ``setdefault``.

    Yields:
        One combined record per group, with FORMAT reduced to the fields
        shared by all records, one call per sample, the original INFO
        removed and the annotation INFO tags set.
    """
    for records in vcf_iter:
        # Find one exemplary record/sample carrying variant
        one_record = obtain_one_record(records)

        # Find common denominator of format fields
        all_fmt_fields = [set(r.FORMAT.split(":")) for r in records if r is not None]
        shared_fields = set.intersection(*all_fmt_fields)
        # Sort so the column order does not depend on set iteration order,
        # and ensure that GT is first when it is present.
        if "GT" in shared_fields:
            common_fmt_fields = ["GT"] + sorted(shared_fields - {"GT"})
        else:
            common_fmt_fields = sorted(shared_fields)
        one_record.FORMAT = ":".join(common_fmt_fields)
        format_tuple = model.make_calldata_tuple(common_fmt_fields)
        # Placeholder call for samples without a record: "0/0" in the first
        # field and "0" in all the others.
        blanks = ["0"] * (len(common_fmt_fields) - 1)
        uncalled_format = format_tuple("0/0", *blanks)

        # Obtain top effect
        top_effect = obtain_top_effect(one_record, vep_cols)

        # Generate calls for all records
        combined_samples = []
        num_affected_samples = 0
        for sample_name, record in zip(sample_names, records):
            if record is None:
                new_sample = vcf.model._Call(site=one_record, sample=sample_name,
                                             data=uncalled_format)
            else:
                # Reuse the first call, restricted to the shared fields.
                new_sample = record.samples[0]
                fmt_dict = new_sample.data._asdict()
                new_sample.data = format_tuple(
                    *[fmt_dict[field] for field in common_fmt_fields])
                num_affected_samples += 1
            combined_samples.append(new_sample)
        one_record.samples = combined_samples

        # Remove unnecessary INFO tags.  Iterate over a snapshot of the
        # keys: deleting while iterating the live dict raises RuntimeError
        # on Python 3.
        kept_info_tags = ()  # no original INFO tag is retained
        for tag in list(one_record.INFO):
            if tag not in kept_info_tags:
                del one_record.INFO[tag]

        # Clear QUAL col
        one_record.QUAL = "."

        # Annotate variant
        variant_id = create_pos_id(one_record.CHROM, one_record.POS)
        if variant_id in annotated_snp_pos_dict:
            one_record.INFO["ANN_SNP_POS"] = True
        if variant_id in cosmic_pos_dict:
            one_record.INFO["COSMIC"] = True
        one_record.INFO["NUM_SAMPLES"] = num_affected_samples
        one_record.INFO["TOP_CSQ"] = "|".join([top_effect[col] for col in vep_cols])
        if num_affected_samples >= snp_threshold:
            one_record.INFO["SNP"] = True
        if top_effect["HGVSp"] != "" and "synonymous_variant" not in top_effect["Consequence"]:
            one_record.INFO["PROTEIN_CHANGE"] = True
            # NOTE(review): the hotspot check is assumed to apply only to
            # protein-changing variants -- confirm the intended nesting.
            if one_record.is_snp:
                transcript, codon_num, _alt_codon = obtain_mutated_codon_info(top_effect)
                if hotspot_codons[transcript].setdefault(codon_num, False):
                    one_record.INFO["HOTSPOT"] = True

        # Yield variant
        yield one_record