Example 1
def add_log2ratios_to_records(vcf_reader, segments_dict):
    """Add the CNV log2 ratio to each record's genotype for a sample."""
    for record in vcf_reader:
        orig_formats = record.FORMAT.split(':')
        if orig_formats[-1] == 'LR':
            # log2 ratios were already added to this file -- replace them
            CallData = make_calldata_tuple(orig_formats)
            orig_formats = orig_formats[:-1]
        else:
            # No existing log2 ratios (the common case)
            CallData = make_calldata_tuple(orig_formats + ['LR'])
            record.add_format("LR")
        out_samples = []
        for genotype in record.samples:
            if genotype.sample in segments_dict:
                segs = segments_dict[genotype.sample]
                site_log2 = logratio_at_site(record, segs)
            else:
                site_log2 = None
            # Create a new genotype field with LR added for this sample
            call_vals = [getattr(genotype.data, fmt, None)
                         for fmt in orig_formats]
            calldata = CallData(*(call_vals + [site_log2]))
            out_samples.append(Call(record, genotype.sample, calldata))
        record.samples = out_samples
        yield record
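These examples all revolve around PyVCF's make_calldata_tuple, which builds a namedtuple-like CallData class from a list of FORMAT keys. A minimal standalone sketch (the field names and values are illustrative, not taken from the example above):

from vcf.model import make_calldata_tuple

# The returned class behaves like a namedtuple whose fields mirror the FORMAT keys.
CallData = make_calldata_tuple(['GT', 'DP', 'LR'])
data = CallData(GT='0/1', DP=42, LR=-0.35)
print(data.LR)        # -0.35
print(data._fields)   # ('GT', 'DP', 'LR')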
Example 2
def _grid_item_to_vcf_record(info_dict, obj, sample_ids, sample_names):  # , get_genotype_from_expanded_zygosity):
    CHROM = obj.get("locus__contig__name", ".")
    POS = obj.get("locus__position", ".")
    ID = obj.get("variantannotation__dbsnp_rs_id")
    REF = obj.get("locus__ref__seq", ".")
    ALT = obj.get("alt__seq", ".")
    QUAL = '.'  # QUAL = obj.get("annotation__quality", ".")
    FILTER = None
    INFO = {}

    for info_id, data in info_dict.items():
        col = data['column__variant_column']
        val = obj.get(col)
        if val:
            INFO[info_id] = val

    FORMAT = None
    MY_FORMAT = ['GT', 'AD', 'AF', 'PL', 'DP', 'GQ']
    CallData = make_calldata_tuple(MY_FORMAT)
    sample_indexes = {}
    samples = []

    if sample_ids:
        FORMAT = ':'.join(MY_FORMAT)

    alts = [_Substitution(ALT)]
    ALT = alts
    record = _Record(CHROM, POS, ID, REF, ALT, QUAL, FILTER, INFO, FORMAT, sample_indexes)

    if sample_ids:
        for i, (sample_id, sample) in enumerate(zip(sample_ids, sample_names)):
            ad = obj[f"{sample_id}_samples_allele_depth"]
            zygosity = obj[f"{sample_id}_samples_zygosity"]
            gt = Zygosity.get_genotype_from_expanded_zygosity(zygosity)
            dp = obj[f"{sample_id}_samples_read_depth"]
            af = obj[f"{sample_id}_samples_allele_frequency"]
            # GQ/PL/FT are optional now
            # TODO: Ideally, we'd not write them out
            pl = obj.get(f"{sample_id}_samples_phred_likelihood", ".")
            gq = obj.get(f"{sample_id}_samples_genotype_quality", ".")
            # TODO: Need to grab information for reference base to be able to properly fill in this data.
            data_args = {'AD': ['.', ad],
                         'GT': gt,
                         'PL': ['.', pl],
                         'DP': ['.', dp],
                         'GQ': ['.', gq],
                         'AF': ['.', af]}

            data = CallData(**data_args)
            call = _Call(record, sample, data)
            samples.append(call)
            sample_indexes[sample] = i

        record.samples = samples

    return record
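A hedged sketch of how records built this way could be written out with PyVCF's Writer, which copies its header metadata from a template Reader. The file name and the grid_rows iterable are assumptions for illustration; the template's header would need to declare the GT/AD/AF/PL/DP/GQ FORMAT fields:

import sys
import vcf

template = vcf.Reader(filename="template.vcf")   # assumed template VCF with matching header
writer = vcf.Writer(sys.stdout, template)
for obj in grid_rows:                            # assumed iterable of grid-item dicts
    record = _grid_item_to_vcf_record(info_dict, obj, sample_ids, sample_names)
    writer.write_record(record)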
Example 3
 def _update_sample(self, record):
     call = record.samples[0]
     data = call.data
     record.QUAL = float(self._get_min_value(data.CGA_CEHQ))
     data_tuple = make_calldata_tuple(self.new_format_fields)
     od = OrderedDict()
     for k in self.new_format_fields:
         od[k] = getattr(call.data, k)
     data = data_tuple._make(od.values())
     return data
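The _make call above is the standard namedtuple classmethod: the class returned by make_calldata_tuple builds an instance from any iterable of values in field order. A small sketch with assumed field names:

from vcf.model import make_calldata_tuple

CallData = make_calldata_tuple(['GT', 'GQ', 'CGA_CEHQ'])
data = CallData._make(['0/1', 99, [45, 45]])
print(data.CGA_CEHQ)   # [45, 45]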
Example 4
 def write_record(self, marker):
     '''Write the marker data to outstream.'''
     # Update the vcf INFO field.
     for info_id, val in marker.info.items():
         if isinstance(val, float):
             marker.record.add_info(info_id, round(val, 3))
         elif isinstance(val, bool):
             if val is True:
                 marker.record.add_info(info_id, val)
         elif val is not None:
             marker.record.add_info(info_id, val)
     # Hash normalization factors and insert sizes, keyed by sample name.
     nf = {}
     ins = {}
     for call in marker.calls:
         nf[call.sample] = call.NF
         ins[call.sample] = call.INS
     # Update the vcf FORMAT field.
     if 'NF' not in marker.record.FORMAT:
         marker.record.add_format('NF')
     if 'INS' not in marker.record.FORMAT:
         marker.record.add_format('INS')
     if 'DC' not in marker.record.FORMAT:
         marker.record.add_format('DC')
     # Update the vcf sample data.
     for sample in marker.record.samples:
         ids = list(sample.data._fields)
         vals = list(iter(sample.data))
         if 'NF' not in ids:
             ids.append('NF')
             try:
                 vals.append(round(nf[sample.sample], 3))
             except Exception:
                 vals.append(None)
         if 'INS' not in ids:
             ids.append('INS')
             try:
                 vals.append(int(ins[sample.sample]))
             except Exception:
                 vals.append(None)
         if 'DC' not in ids:
             ids.append('DC')
             try:
                 dropout_count = marker.param['H1'][-1]['exp_phi'][
                     sample.sample][2]
                 vals.append(round(dropout_count, 3))
             except Exception:
                 vals.append(None)
         new_cls = make_calldata_tuple(ids)
         sample.data = new_cls._make(vals)
     # Write record to outstream.
     self.writer.write_record(marker.record)
     return None
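For the output of write_record to be a valid VCF, the Writer's template also needs header definitions for the new NF/INS/DC FORMAT fields. A sketch of one way to add them to the template Reader before constructing the Writer (the Number/Type/Description values here are assumptions):

from vcf.parser import _Format

# reader: the vcf.Reader used as the Writer template (assumed)
for fid, num, ftype, desc in [('NF', 1, 'Float', 'Normalization factor'),
                              ('INS', 1, 'Integer', 'Insert size'),
                              ('DC', 1, 'Float', 'Dropout count')]:
    reader.formats[fid] = _Format(fid, num, ftype, desc)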
Example 5
def make_call_data(formats, gtypes, caller_flags):
    """Create a pyvcf-compatible namedtuple of a sample's genotype values."""
    CallData = make_calldata_tuple(formats + CALLER_CODES)
    call_vals = [consensus_of_gtype(gtypes, fmt) for fmt in formats]
    # Set GT field by whether any caller called the variant
    idx = formats.index("GT")
    if idx != 0:
        print("*** oddly ordered FORMAT:", formats, file=sys.stderr)
    orig_gt = call_vals[idx] or "0/1"
    is_called = int(any(caller_flags.values()))
    call_vals[idx] = SWITCH_GT[orig_gt][is_called]
    # Rebuild the genotype data structure
    return CallData(*(call_vals + list(caller_flags.values())))
Example 6
    def _parse_sample_format(self, samp_fmt):
        """ Parse the format of the calls in this _Record """
        samp_fmt = make_calldata_tuple(samp_fmt.split(':'))

        for fmt in samp_fmt._fields:
            try:
                entry_type = self.formats[fmt].type
                entry_num = self.formats[fmt].num
            except KeyError:
                entry_num = None
                try:
                    entry_type = RESERVED_FORMAT[fmt]
                except KeyError:
                    entry_type = 'String'
            samp_fmt._types.append(entry_type)
            samp_fmt._nums.append(entry_num)
        return samp_fmt
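This appears to be PyVCF's own reader code: the class returned by make_calldata_tuple carries mutable _types and _nums class attributes, and _parse_sample_format fills them with one entry per FORMAT key so that sample values can later be coerced to the right Python types. A rough illustration of the resulting class (the types depend on the file's ##FORMAT header lines and on RESERVED_FORMAT):

samp_fmt = reader._parse_sample_format('GT:DP:GQ')   # reader: an open vcf.Reader (assumed)
print(samp_fmt._fields)   # ('GT', 'DP', 'GQ')
print(samp_fmt._types)    # e.g. ['String', 'Integer', 'Integer']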
Example 7
    def filter_variants(self, keep_only_snps=False, only_good=False):
        """Filter the VCF records.

        Parameters
        ----------
        keep_only_snps: bool, optional
            Retain only SNP variants (default: False).
        only_good: bool, optional
            If True, output only records that PASS all filters (default: False).

        Returns
        -------
        list
            Records that passed all filters.
        """

        if self._reader is None:
            # Create a reader class from input VCF.
            self._reader = vcf.Reader(filename=self.vcf_in)

        # get list of existing filters.
        existing_filters = {}
        removed_filters = []

        for filter_id in self._reader.filters:
            conf = PHEFilterBase.decode(filter_id)
            existing_filters[tuple(conf.keys())] = filter_id

        # Add each filter we are going to use to the record.
        # This is needed for writing out proper #FILTER header in VCF.
        for record_filter in self.filters:
            # We know that each filter has a short description method.
            short_doc = record_filter.short_desc()
            short_doc = short_doc.split('\n')[0].lstrip()

            filter_name = PHEFilterBase.decode(record_filter.filter_name())

            # Check if the sample has been filtered for this type of filter
            #    in the past. If so, remove it, because it is going to be refiltered.
            if tuple(filter_name) in existing_filters:
                logging.info("Removing existing filter: %s", existing_filters[tuple(filter_name)])
                removed_filters.append(existing_filters[tuple(filter_name)])
                del self._reader.filters[existing_filters[tuple(filter_name)]]

            self._reader.filters[record_filter.filter_name()] = _Filter(record_filter.filter_name(), short_doc)

        # Update the filters for output.
        self._update_filters(self._reader.filters)

        _pos = 1
        _chrom = None
        # For each record (POSITION) apply set of filters.
        for record in self._reader:

            if _chrom != record.CHROM:
                _pos, _chrom = 1, record.CHROM

            # Fill in any missing consecutive data with GT=./. records.
            while _pos <= record.POS:
                if _pos == record.POS:
                    _record = record
                else:
                    # This is a padding "N" record inserted when records do not follow
                    #    each other and there is a gap, e.g. 1,2,3,5,6 -> an "N" record is inserted at 4.

                    _ref = self._get_reference_base(record.CHROM, _pos)

                    _record = vcf.model._Record(record.CHROM, _pos, ".", _ref, [None], 0, [], {}, 'GT', None)
                    _calls = []
                    sorted_samples = sorted(record._sample_indexes.items(), key=operator.itemgetter(1))
                    for sample, i in sorted_samples:

                        _data = make_calldata_tuple(["GT"])
                        _data._types = ["String"]
                        _data._nums = [1]
                        d = ["./."]
                        _calls.append(vcf.model._Call(_record, sample=sample, data=_data(*d)))

                    _record.samples = _calls
                    _record._sample_indexes = dict(sorted_samples)


                self._filter_record(_record, removed_filters)

                # After applying all filters, check if FILTER is None.
                # If it is, then record PASSED all filters.
                if _record.FILTER is None or _record.FILTER == []:
                    if not record.is_monomorphic:
                        _record.FILTER = []

                        if not keep_only_snps or (_record.is_snp and keep_only_snps):

                            self._variants.append(_record)

                elif not only_good:
                    self._variants.append(_record)

                _pos += 1
                if _chrom is None:
                    _chrom = record.CHROM
        return [variant for variant in self._variants if not variant.FILTER]
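The padding trick above can be reproduced standalone: a GT-only record whose single call is "./.", built directly from PyVCF model classes (chromosome, position and sample name are placeholders):

import vcf.model

rec = vcf.model._Record('chr1', 4, '.', 'N', [None], 0, [], {}, 'GT', None)
CallData = vcf.model.make_calldata_tuple(['GT'])
CallData._types, CallData._nums = ['String'], [1]
rec.samples = [vcf.model._Call(rec, sample='sample1', data=CallData(GT='./.'))]
rec._sample_indexes = {'sample1': 0}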
Example 8
 def calldata_class(self):
     return make_calldata_tuple(self.record.FORMAT.split(':'))
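The fields of the returned class simply mirror whatever record.FORMAT contains, for example (illustrative FORMAT string):

from vcf.model import make_calldata_tuple

CallData = make_calldata_tuple('GT:DP:GQ'.split(':'))
print(CallData._fields)   # ('GT', 'DP', 'GQ')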
Example 9
def create_call(sample_id, **kwargs):
    keys = list(kwargs.keys())
    nt = make_calldata_tuple(keys)
Example 10
def annotate_variants(vcf_iter, vep_cols, sample_names, annotated_snp_pos_dict, cosmic_pos_dict, snp_threshold, hotspot_codons):
    """Generator yielding annotated records ready
    for being outputted to a file. Combine samples
    together.
    """
    for records in vcf_iter:
        # Find one exemplary record/sample carrying variant
        one_record = obtain_one_record(records)
        # Otherwise, continue
        # Find common denominator of format fields
        all_fmt_fields = [set(r.FORMAT.split(":")) for r in records if r is not None]
        common_fmt_fields = set.intersection(*all_fmt_fields)
        # Ensure that GT is first
        if "GT" in common_fmt_fields:
            common_fmt_fields = ["GT"] + list(common_fmt_fields - set(["GT"]))
        common_fmt_string = ":".join(common_fmt_fields)
        one_record.FORMAT = common_fmt_string
        format_tuple = model.make_calldata_tuple(common_fmt_fields)
        blanks = ["0"] * (len(common_fmt_fields) - 1)
        uncalled_format = format_tuple("0/0", *blanks)
        # Obtain top effect
        top_effect = obtain_top_effect(one_record, vep_cols)
        # Generate calls for all records
        combined_samples = []
        num_affected_samples = 0
        for sample_name, record in zip(sample_names, records):
            if record is None:
                new_sample = vcf.model._Call(site=one_record, sample=sample_name, data=uncalled_format)
            else:
                new_sample = record.samples[0]
                fmt_dict = record.samples[0].data._asdict()
                fmt_data = format_tuple(*[fmt_dict[field] for field in common_fmt_fields])
                new_sample.data = fmt_data
                num_affected_samples += 1
            combined_samples.append(new_sample)
        one_record.samples = combined_samples
        # Remove unnecessary INFO tags
        for k in list(one_record.INFO.keys()):
            if k not in []:
                del one_record.INFO[k]
        # Clear QUAL col
        one_record.QUAL = "."
        # Annotate variant
        variant_id = create_pos_id(one_record.CHROM, one_record.POS)
        if variant_id in annotated_snp_pos_dict:
            one_record.INFO["ANN_SNP_POS"] = True
        if variant_id in cosmic_pos_dict:
            one_record.INFO["COSMIC"] = True
        one_record.INFO["NUM_SAMPLES"] = num_affected_samples
        top_effect_encoded = "|".join([top_effect[col] for col in vep_cols])
        one_record.INFO["TOP_CSQ"] = top_effect_encoded
        if num_affected_samples >= snp_threshold:
            one_record.INFO["SNP"] = True
        if top_effect["HGVSp"] != "" and "synonymous_variant" not in top_effect["Consequence"]:
            one_record.INFO["PROTEIN_CHANGE"] = True
            if one_record.is_snp:
                transcript, codon_num, alt_codon = obtain_mutated_codon_info(top_effect)
                if hotspot_codons[transcript].setdefault(codon_num, False):
                    one_record.INFO["HOTSPOT"] = True
        # Yield variant
        yield one_record
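A hedged sketch of how this generator might be driven: PyVCF's vcf.utils.walk_together yields one list of co-located records per site (with None where a sample's VCF lacks the variant), which matches the records lists consumed above. File names and the annotation inputs are placeholders:

import vcf
from vcf.utils import walk_together

readers = [vcf.Reader(filename=f) for f in ("s1.vcf", "s2.vcf", "s3.vcf")]
writer = vcf.Writer(open("combined.vcf", "w"), readers[0])
for record in annotate_variants(walk_together(*readers), vep_cols,
                                ["s1", "s2", "s3"], annotated_snp_pos_dict,
                                cosmic_pos_dict, snp_threshold=2,
                                hotspot_codons=hotspot_codons):
    writer.write_record(record)
writer.close()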