コード例 #1
0
def insertion_variant():
    """Build a homozygous insertion ``Variant`` fixture (T -> TG).

    The variant points at ``some_indels.bam`` inside the folder returned by
    ``get_data_folder()``.

    Returns:
        Variant instance describing the insertion.
    """
    bam_path = os.path.join(get_data_folder(), "some_indels.bam")
    format_keys = ['GT', 'PS', 'DP', 'ADALL', 'AD', 'GQ']
    # One sample call, with values listed in the same order as format_keys.
    sample_calls = [['1/1', None, 546, [0, 246], [25, 25], 330]]
    return Variant(
        chrom="1",
        pos=10122622,
        id="rs57037935",
        ref='T',
        allele='TG',
        quality=50,
        filter=[],
        info={},
        format=format_keys,
        samples=sample_calls,
        zygosity=[VariantZygosity.HOMOZYGOUS],
        type=VariantType.INSERTION,
        vcf='null.vcf',
        bams=[bam_path],
    )
コード例 #2
0
def deletion_variant():
    """Build a heterozygous deletion ``Variant`` fixture (CTTTA -> C).

    The variant points at ``some_indels.bam`` inside the folder returned by
    ``get_data_folder()``.

    Returns:
        Variant instance describing the deletion.
    """
    bam_path = os.path.join(get_data_folder(), "some_indels.bam")
    format_keys = ['GT', 'PS', 'DP', 'ADALL', 'AD', 'GQ']
    # One sample call, with values listed in the same order as format_keys.
    sample_calls = [['1/0', None, 177, [0, 0, 0], [0, 0, 0], 160]]
    return Variant(
        chrom="1",
        pos=10163457,
        id=None,
        ref='CTTTA',
        allele='C',
        quality=50,
        filter=[],
        info={},
        format=format_keys,
        samples=sample_calls,
        zygosity=[VariantZygosity.HETEROZYGOUS],
        type=VariantType.DELETION,
        vcf='null.vcf',
        bams=[bam_path],
    )
コード例 #3
0
def snp_variant():
    """Build a homozygous SNP ``Variant`` fixture (T -> A).

    The variant points at ``small_bam.bam`` inside the folder returned by
    ``get_data_folder()``.

    Returns:
        Variant instance describing the SNP.
    """
    bam_path = os.path.join(get_data_folder(), "small_bam.bam")
    info_fields = {'DP': 35, 'AF': 0.0185714}
    return Variant(
        chrom="1",
        pos=240000,
        id="GL000235",
        ref='T',
        allele='A',
        quality=60,
        filter=None,
        info=info_fields,
        format=['GT', 'GQ'],
        samples=[['1/1', '50']],
        zygosity=[VariantZygosity.HOMOZYGOUS],
        type=VariantType.SNP,
        vcf='null.vcf',
        bams=[bam_path],
    )
コード例 #4
0
def _df_row_to_variant(dataframe, idx, format_vcf_key_counts, sample_names,
                       filter_vcf_keys, info_vcf_key_counts, header_type, vcf,
                       bams):
    """Rebuild a ``Variant`` from one row of a flattened dataframe.

    Args:
        dataframe : Dataframe holding one variant per row.
        idx : Positional index of the row to convert.
        format_vcf_key_counts : Mapping of FORMAT key to its number of values.
        sample_names : Names of the samples; used as column prefixes.
        filter_vcf_keys : FILTER keys; each is read from column ``FILTER_<key>``.
        info_vcf_key_counts : Mapping of INFO key to its number of values.
        header_type : Mapping of key to the callable applied to its raw value.
        vcf : Value forwarded as the variant's ``vcf`` field.
        bams : Value forwarded as the variant's ``bams`` field.

    Returns:
        Variant instance built from the row.
    """
    # A plain dict gives cheaper repeated lookups than a row Series.
    record = dataframe.iloc[idx].to_dict()

    def _typed(key, column):
        # Convert the raw cell with the callable registered for this key.
        return header_type[key](record[column])

    # FORMAT data: one list per sample, keys visited in sorted order.
    format_keys = sorted(format_vcf_key_counts.keys())
    samples = []
    zygosities = []
    for name in sample_names:
        fields = []
        for key in format_keys:
            n_values = format_vcf_key_counts[key]
            if n_values == 1:
                fields.append(_typed(key, "{}_{}".format(name, key)))
            else:
                # Multi-valued keys are spread over "<sample>_<key>-<i>" columns.
                fields.append([
                    _typed(key, "{}_{}-{}".format(name, key, j))
                    for j in range(n_values)
                ])
        samples.append(fields)
        gt_column = "{}_GT".format(name)
        if gt_column in record:
            zygosities.append(VariantZygosity(record[gt_column]))

    # FILTER data: keep every key whose column holds a truthy value.
    var_filter = [
        key for key in filter_vcf_keys if record["FILTER_{}".format(key)]
    ]

    # INFO data: scalar keys map to a value, multi-valued keys to a list.
    info = {}
    for key, n_values in info_vcf_key_counts.items():
        if n_values == 1:
            info[key] = _typed(key, "{}".format(key))
        else:
            info[key] = [
                _typed(key, "{}-{}".format(key, j)) for j in range(n_values)
            ]

    return Variant(chrom=record["chrom"],
                   pos=record["start_pos"],
                   id=record.get("id", "."),
                   ref=record["ref"],
                   allele=record["alt"],
                   quality=record.get("quality", "."),
                   filter=var_filter or None,
                   info=info,
                   format=format_keys,
                   type=VariantType(record["variant_type"]),
                   samples=samples,
                   zygosity=zygosities,
                   vcf=vcf,
                   bams=bams)
コード例 #5
0
    def _create_variant_tuple_from_record(self, record, vcf_file, bam, is_fp):
        """Yield one variant record per alternate allele of a pyVCF record.

        Args:
            record : pyVCF record
            vcf_file : Path to VCF file
            bam : Path to corresponding BAM file
            is_fp : Boolean indicating whether entry is a false positive variant or not.

        Yields:
           Variant dataclass record, one for each entry in ``record.ALT``.

        Raises:
            RuntimeError: If the FORMAT field cannot be split for an entry
                that is not a false positive, or if constructing the Variant
                fails. The underlying exception is chained as ``__cause__``.
        """
        var_zyg = self._get_variant_zygosity(record, is_fp)
        var_type = self._get_variant_type(record)
        # Split multi alleles into multiple entries
        for alt in record.ALT:
            var_allele = alt.sequence
            try:
                var_format = record.FORMAT.split(':')
            except AttributeError as err:
                # A FORMAT value without ``split`` is tolerated only for
                # false positive entries.
                if not is_fp:
                    raise RuntimeError(
                        "Could not parse format field for entry - {}".format(
                            record)) from err
                var_format = []

            try:
                variant = Variant(
                    chrom=record.CHROM,
                    pos=record.POS,
                    id=record.ID,
                    ref=record.REF,
                    allele=var_allele,
                    quality=record.QUAL,
                    filter=record.FILTER,
                    info=record.INFO,
                    format=var_format,
                    samples=[list(sample.data) for sample in record.samples],
                    zygosity=var_zyg,
                    type=var_type,
                    vcf=vcf_file,
                    bam=bam)
            except Exception as err:
                raise RuntimeError(
                    "Could not parse variant from entry - {}".format(
                        record)) from err
            # Yield outside the try block so that an exception thrown into the
            # generator by the consumer is not reported as a parsing failure.
            yield variant
コード例 #6
0
def test_zygosity_encoder():
    """Check the label ``ZygosityLabelEncoder`` returns for each zygosity."""
    encoder = ZygosityLabelEncoder()
    bam = os.path.join(get_data_folder(), "small_bam.bam")

    def make_variant(genotype, zygosity):
        # All cases share the same SNP; only the genotype call and the
        # zygosity differ.
        return Variant(chrom="1",
                       pos=240000,
                       id="GL000235",
                       ref='T',
                       allele='A',
                       quality=60,
                       filter=None,
                       info={
                           'DP': 35,
                           'AF': 0.0185714
                       },
                       format=['GT', 'GQ'],
                       samples=[[genotype, '50']],
                       zygosity=[zygosity],
                       type=VariantType.SNP,
                       vcf='null.vcf',
                       bams=[bam])

    label = encoder(make_variant('1/1', VariantZygosity.HOMOZYGOUS))
    # The encoding should be a scalar, i.e. have an empty size.
    assert (label.size() == torch.Size([]))
    assert (label == 1)

    label = encoder(make_variant('0/0', VariantZygosity.NO_VARIANT))
    assert (label == 0)

    label = encoder(make_variant('0/1', VariantZygosity.HETEROZYGOUS))
    assert (label == 2)
コード例 #7
0
    def __getitem__(self, idx):
        """Build the Variant stored at a given row of the dataframe.

        Args:
            idx: Variant index

        Returns:
            Variant instance
        """
        record = self._dataframe.iloc[idx]

        # FORMAT data: one flat list per sample, keys visited in sorted order.
        format_keys = sorted(self._format_vcf_key_counts.keys())
        samples = []
        zygosities = []
        for name in self._sample_names:
            fields = []
            for key in format_keys:
                n_values = self._format_vcf_key_counts[key]
                if n_values == 1:
                    fields.append(record["{}_{}".format(name, key)])
                else:
                    # Multi-valued keys are appended element by element, so
                    # the sample list stays flat.
                    fields.extend(
                        record["{}_{}_{}".format(name, key, j)]
                        for j in range(n_values))
            samples.append(fields)
            zygosity_column = "{}_zyg".format(name)
            if zygosity_column in record:
                zygosities.append(VariantZygosity(record[zygosity_column]))

        # FILTER data: keep every key whose column holds a truthy value.
        var_filter = [
            key for key in self._filter_vcf_keys
            if record["FILTER_{}".format(key)]
        ]

        # INFO data: scalar keys map to a value, multi-valued keys to a list.
        info = {}
        for key, n_values in self._info_vcf_key_counts.items():
            if n_values == 1:
                info[key] = record["{}".format(key)]
            else:
                info[key] = [
                    record["{}_{}".format(key, j)] for j in range(n_values)
                ]

        return Variant(chrom=record["chrom"],
                       pos=record["start_pos"],
                       id=record["id"],
                       ref=record["ref"],
                       allele=record["alt"],
                       quality=record["quality"],
                       filter=var_filter or None,
                       info=info,
                       format=format_keys,
                       type=VariantType(record["variant_type"]),
                       samples=samples,
                       zygosity=zygosities,
                       vcf=self._vcf,
                       bams=self._bams)