def insertion_variant():
    """Build a homozygous insertion variant (T -> TG) at 1:10122622.

    The variant points at ``some_indels.bam`` inside the folder returned by
    ``get_data_folder()``.

    Returns:
        Variant instance of type ``VariantType.INSERTION``.
    """
    indel_bam_path = os.path.join(get_data_folder(), "some_indels.bam")
    return Variant(
        chrom="1",
        pos=10122622,
        id="rs57037935",
        ref='T',
        allele='TG',
        quality=50,
        filter=[],
        info={},
        format=['GT', 'PS', 'DP', 'ADALL', 'AD', 'GQ'],
        samples=[['1/1', None, 546, [0, 246], [25, 25], 330]],
        zygosity=[VariantZygosity.HOMOZYGOUS],
        type=VariantType.INSERTION,
        vcf='null.vcf',
        bams=[indel_bam_path],
    )
def deletion_variant():
    """Build a heterozygous deletion variant (CTTTA -> C) at 1:10163457.

    The variant points at ``some_indels.bam`` inside the folder returned by
    ``get_data_folder()`` and carries no ID.

    Returns:
        Variant instance of type ``VariantType.DELETION``.
    """
    indel_bam_path = os.path.join(get_data_folder(), "some_indels.bam")
    return Variant(
        chrom="1",
        pos=10163457,
        id=None,
        ref='CTTTA',
        allele='C',
        quality=50,
        filter=[],
        info={},
        format=['GT', 'PS', 'DP', 'ADALL', 'AD', 'GQ'],
        samples=[['1/0', None, 177, [0, 0, 0], [0, 0, 0], 160]],
        zygosity=[VariantZygosity.HETEROZYGOUS],
        type=VariantType.DELETION,
        vcf='null.vcf',
        bams=[indel_bam_path],
    )
def snp_variant():
    """Build a homozygous SNP variant (T -> A) at 1:240000.

    The variant points at ``small_bam.bam`` inside the folder returned by
    ``get_data_folder()``.

    Returns:
        Variant instance of type ``VariantType.SNP``.
    """
    small_bam_path = os.path.join(get_data_folder(), "small_bam.bam")
    return Variant(
        chrom="1",
        pos=240000,
        id="GL000235",
        ref='T',
        allele='A',
        quality=60,
        filter=None,
        info={'DP': 35, 'AF': 0.0185714},
        format=['GT', 'GQ'],
        samples=[['1/1', '50']],
        zygosity=[VariantZygosity.HOMOZYGOUS],
        type=VariantType.SNP,
        vcf='null.vcf',
        bams=[small_bam_path],
    )
def _df_row_to_variant(dataframe, idx, format_vcf_key_counts, sample_names,
                       filter_vcf_keys, info_vcf_key_counts, header_type,
                       vcf, bams):
    """Rebuild a ``Variant`` from one row of a flattened variant dataframe.

    Args:
        dataframe: Frame whose row ``idx`` is read positionally via ``iloc``.
        idx: Positional index of the row to convert.
        format_vcf_key_counts: Mapping of FORMAT key to its value count.
        sample_names: Sample (call) names used as column prefixes.
        filter_vcf_keys: FILTER keys, each stored in a ``FILTER_<key>`` column.
        info_vcf_key_counts: Mapping of INFO key to its value count.
        header_type: Mapping of key to the callable applied to each raw value.
        vcf: Value forwarded as the variant's ``vcf`` field.
        bams: Value forwarded as the variant's ``bams`` field.

    Returns:
        Variant populated from the row.
    """
    # A plain dict is cheaper to index repeatedly than a Series.
    row = dataframe.iloc[idx].to_dict()
    format_keys = sorted(format_vcf_key_counts.keys())

    # FORMAT data: one list of typed values per sample, in sorted key order.
    samples = []
    zygosities = []
    for sample in sample_names:
        fields = []
        for key in format_keys:
            n_values = format_vcf_key_counts[key]
            if n_values != 1:
                # Multi-valued keys are spread over "<sample>_<key>-<i>".
                fields.append([
                    header_type[key](
                        row["{}_{}-{}".format(sample, key, pos)])
                    for pos in range(n_values)
                ])
            else:
                fields.append(
                    header_type[key](row["{}_{}".format(sample, key)]))
        samples.append(fields)

        gt_column = "{}_GT".format(sample)
        if gt_column in row:
            zygosities.append(VariantZygosity(row[gt_column]))

    # FILTER data: keep every key whose column holds a truthy value.
    var_filter = [
        key for key in filter_vcf_keys if row["FILTER_{}".format(key)]
    ]

    # INFO data: scalar for single-valued keys, list otherwise.
    info = {}
    for key, n_values in info_vcf_key_counts.items():
        if n_values != 1:
            info[key] = [
                header_type[key](row["{}-{}".format(key, pos)])
                for pos in range(n_values)
            ]
        else:
            info[key] = header_type[key](row["{}".format(key)])

    return Variant(
        chrom=row["chrom"],
        pos=row["start_pos"],
        id=row.get("id", "."),
        ref=row["ref"],
        allele=row["alt"],
        quality=row.get("quality", "."),
        filter=var_filter or None,
        info=info,
        format=format_keys,
        type=VariantType(row["variant_type"]),
        samples=samples,
        zygosity=zygosities,
        vcf=vcf,
        bams=bams,
    )
def _create_variant_tuple_from_record(self, record, vcf_file, bam, is_fp):
    """Create variant records from a pyVCF record.

    This is a generator: one ``Variant`` is yielded per entry in
    ``record.ALT``, so multi-allelic records are split into several
    variants sharing the same zygosity and type.

    Args:
        record : pyVCF record
        vcf_file : Path to VCF file
        bam : Path to corresponding BAM file
        is_fp : Boolean indicating whether entry is a false positive
            variant or not.

    Yields:
        Variant dataclass record, one per alternate allele.

    Raises:
        RuntimeError: If the FORMAT field cannot be split for a record that
            is not a false positive, or if constructing the ``Variant``
            fails. The original exception is attached as ``__cause__``.
    """
    var_zyg = self._get_variant_zygosity(record, is_fp)
    var_type = self._get_variant_type(record)

    # Split multi alleles into multiple entries
    for alt in record.ALT:
        var_allele = alt.sequence

        try:
            var_format = record.FORMAT.split(':')
        except AttributeError as err:
            # False-positive entries are allowed to have no FORMAT field.
            if not is_fp:
                raise RuntimeError(
                    "Could not parse format field for entry - {}".format(
                        record)) from err
            var_format = []

        # Only construction is guarded: keeping the ``yield`` outside the
        # ``try`` stops exceptions thrown into the generator by its consumer
        # from being misreported as a parse failure.
        try:
            # NOTE(review): this passes ``bam=`` while other call sites in
            # this code base pass ``bams=[...]`` — confirm against the
            # Variant definition this method is paired with.
            variant = Variant(
                chrom=record.CHROM,
                pos=record.POS,
                id=record.ID,
                ref=record.REF,
                allele=var_allele,
                quality=record.QUAL,
                filter=record.FILTER,
                info=record.INFO,
                format=var_format,
                samples=[[field_value for field_value in sample.data]
                         for sample in record.samples],
                zygosity=var_zyg,
                type=var_type,
                vcf=vcf_file,
                bam=bam)
        except Exception as err:
            raise RuntimeError(
                "Could not parse variant from entry - {}".format(
                    record)) from err

        yield variant
def test_zygosity_encoder():
    """Check the labels ``ZygosityLabelEncoder`` assigns to each zygosity.

    Homozygous must encode to 1 (as a scalar tensor), no-variant to 0 and
    heterozygous to 2.
    """
    encoder = ZygosityLabelEncoder()
    bam = os.path.join(get_data_folder(), "small_bam.bam")

    def _snp_with(genotype, zygosity):
        # Same SNP every time; only the genotype call and zygosity differ.
        return Variant(
            chrom="1",
            pos=240000,
            id="GL000235",
            ref='T',
            allele='A',
            quality=60,
            filter=None,
            info={'DP': 35, 'AF': 0.0185714},
            format=['GT', 'GQ'],
            samples=[[genotype, '50']],
            zygosity=[zygosity],
            type=VariantType.SNP,
            vcf='null.vcf',
            bams=[bam],
        )

    homozygous_label = encoder(_snp_with('1/1', VariantZygosity.HOMOZYGOUS))
    # Since it should return a scalar
    assert homozygous_label.size() == torch.Size([])
    assert homozygous_label == 1

    no_variant_label = encoder(_snp_with('0/0', VariantZygosity.NO_VARIANT))
    assert no_variant_label == 0

    heterozygous_label = encoder(
        _snp_with('0/1', VariantZygosity.HETEROZYGOUS))
    assert heterozygous_label == 2
def __getitem__(self, idx):
    """Get Variant instance in location.

    Args:
        idx: Variant index

    Returns:
        Variant instance
    """
    row = self._dataframe.iloc[idx]
    format_keys = sorted(self._format_vcf_key_counts.keys())

    # FORMAT data: one flat list of values per sample, in sorted key order.
    samples = []
    zygosities = []
    for sample in self._sample_names:
        fields = []
        for key in format_keys:
            n_values = self._format_vcf_key_counts[key]
            if n_values != 1:
                # Multi-valued keys are appended inline, not nested.
                fields.extend(
                    row["{}_{}_{}".format(sample, key, pos)]
                    for pos in range(n_values))
            else:
                fields.append(row["{}_{}".format(sample, key)])
        samples.append(fields)

        zygosity_column = "{}_zyg".format(sample)
        if zygosity_column in row:
            zygosities.append(VariantZygosity(row[zygosity_column]))

    # FILTER data: keep every key whose column holds a truthy value.
    var_filter = [
        key for key in self._filter_vcf_keys
        if row["FILTER_{}".format(key)]
    ]

    # INFO data: scalar for single-valued keys, list otherwise.
    info = {}
    for key, n_values in self._info_vcf_key_counts.items():
        if n_values != 1:
            info[key] = [
                row["{}_{}".format(key, pos)] for pos in range(n_values)
            ]
        else:
            info[key] = row["{}".format(key)]

    return Variant(
        chrom=row["chrom"],
        pos=row["start_pos"],
        id=row["id"],
        ref=row["ref"],
        allele=row["alt"],
        quality=row["quality"],
        filter=var_filter or None,
        info=info,
        format=format_keys,
        type=VariantType(row["variant_type"]),
        samples=samples,
        zygosity=zygosities,
        vcf=self._vcf,
        bams=self._bams,
    )