Example #1
0
 def create_record(self, values):
     """Write one dbVar SV record as a tab-separated line to ``self.fh_tsv``.

     The UCSC bin is computed from the zero-based half-open interval
     ``(outermost_start - 1, outermost_stop)``.
     """
     ucsc_bin = binning.assign_bin(
         int(values["outermost_start"]) - 1, int(values["outermost_stop"])
     )
     columns = [
         self.genome_release,
         values["chr"],
         values["outermost_start"],
         values["outermost_stop"],
         str(ucsc_bin),
         values["variant_count"],
         values["variant_type"],
         values["method"],
         values["analysis"],
         values["platform"],
         values["study"],
         list_to_str(values.get("clinical_assertion", "").split(";")),
         list_to_str(values.get("clinvar_accessions", "").split(";")),
         values["bin_size"],
         values.get("min_insertion_length", ""),
         values.get("max_insertion_length", ""),
     ]
     self.fh_tsv.write("\t".join(columns) + "\n")
Example #2
0
 def convert(self):
     """Convert the track-annotated BED-like input file into TSV output.

     ``track`` lines switch the current variant type (deletion/duplication);
     every other line is converted to one output row.
     """
     self.fh_tsv.write(self.header + "\n")
     with open(self.path, "rt") as inputf:
         chrom = None
         var_type = None
         for line in inputf:
             # Drop a single trailing newline, if present.
             if line.endswith("\n"):
                 line = line[:-1]
             if line.startswith("track"):
                 token = line.split()[1]
                 if token.startswith("name=delControls"):
                     var_type = "deletion"
                 elif token.startswith("name=dupControls"):
                     var_type = "duplication"
                 else:
                     raise Exception("Unexpected track line: {}".format(line))
                 continue
             arr = line.split()
             row = [
                 self.genome_release,
                 arr[0][len("chr") :],
                 arr[1],
                 arr[2],
                 str(binning.assign_bin(int(arr[1]) - 1, int(arr[2]))),
                 var_type,
                 arr[3].split("-")[1],
                 arr[4],
             ]
             self.fh_tsv.write("\t".join(row) + "\n")
             # Announce whenever a new contig starts.
             if chrom != arr[0]:
                 print(
                     "Starting sv type {} on contig {}".format(var_type, arr[0])
                 )
             chrom = arr[0]
Example #3
0
 def _import_elements(self, element_types, reg_map, path_bed):
     """Import regulatory elements from the TSV file at ``path_bed``.

     The first line is treated as a header and skipped; optional JSON
     extra data may follow in column 6.
     """
     header = None
     with path_bed.open("rt") as inputf:
         for raw_line in inputf:
             arr = raw_line.strip().split("\t")
             # Remember and skip the header line.
             if not header:
                 header = arr
                 continue
             chrom, begin, end, et_slug, score = arr[:5]
             begin = int(begin)
             end = int(end)
             # Missing-value markers become NaN.
             if score in ("", ".", "-"):
                 score_value = float("NaN")
             else:
                 score_value = float(score)
             extra_data = json.loads(arr[5]) if len(arr) > 5 else None
             RegElement.objects.create(
                 reg_map=reg_map,
                 elem_type=element_types[et_slug],
                 release=reg_map.collection.release,
                 chromosome=chrom,
                 start=begin + 1,
                 end=end,
                 bin=binning.assign_bin(begin, end),
                 score=score_value,
                 extra_data=extra_data,
             )
Example #4
0
class RegInteractionFactory(factory.django.DjangoModelFactory):
    """Factory for ``RegInteraction`` records."""
    class Meta:
        model = RegInteraction

    reg_map = factory.SubFactory(RegMapFactory)
    release = "GRCh37"
    # Cycle through the canonical human chromosome names 1-22, X, Y.
    chromosome = factory.Iterator(list(map(str, range(1, 23))) + ["X", "Y"])
    start = factory.Sequence(lambda n: (n + 1) * 1000)
    end = factory.Sequence(lambda n: (n + 1) * 1500 + 100)
    # Initial bin from (start, end); recomputed in ``fix_bins`` below after
    # creation using the zero-based interval (start - 1, end).
    bin = factory.LazyAttribute(
        lambda obj: binning.assign_bin(obj.start, obj.end))
    score = 1.0

    # Coordinates of the two interacting intervals.
    chromosome1 = factory.Iterator(list(map(str, range(1, 23))) + ["X", "Y"])
    start1 = factory.Sequence(lambda n: (n + 1) * 1000)
    end1 = factory.Sequence(lambda n: (n + 1) * 1000 + 100)
    chromosome2 = factory.Iterator(list(map(str, range(1, 23))) + ["X", "Y"])
    start2 = factory.Sequence(lambda n: (n + 1) * 1500)
    end2 = factory.Sequence(lambda n: (n + 1) * 1500 + 100)

    extra_data = None

    @factory.post_generation
    def fix_bins(obj, *args, **kwargs):
        # Overwrite the bin with the value for (start - 1, end) and persist.
        obj.bin = binning.assign_bin(obj.start - 1, obj.end)
        obj.save()
Example #5
0
 def __init__(self, chromosome, reference_type, accession, gene, orientation,
              start, stop, exon_starts, exon_stops, source, transcript=1,
              cds=None, select_transcript=False, version=None):
     """Store the transcript mapping attributes and assign its UCSC bin.

     The bin is computed for the zero-based half-open interval
     ``(start - 1, stop)``.
     """
     self.chromosome = chromosome
     self.reference_type = reference_type
     self.accession = accession
     self.gene = gene
     self.orientation = orientation
     self.start = start
     self.stop = stop
     self.exon_starts = exon_starts
     self.exon_stops = exon_stops
     self.source = source
     self.transcript = transcript
     self.cds = cds
     self.select_transcript = select_transcript
     self.version = version
     self.bin = binning.assign_bin(start - 1, stop)
Example #6
0
class StructuralVariantFactory(factory.django.DjangoModelFactory):
    """Factory for ``StructuralVariant`` records.

    ``variant_set`` and ``case`` are factory-only pseudo-attributes; the
    database rows store ``set_id`` and ``case_id`` instead.
    """

    class Meta:
        model = StructuralVariant
        exclude = ["case", "variant_set"]

    class Params:
        #: The genotypes to create, by default only first is het. the rest is wild-type.
        genotypes = default_genotypes

    release = "GRCh37"
    chromosome = factory.Iterator(list(map(str, range(1, 23))) + ["X", "Y"])
    chromosome_no = factory.Iterator(list(range(1, 25)))
    start = factory.Sequence(lambda n: (n + 1) * 100)
    end = factory.Sequence(lambda n: (n + 1) * 100 + 100)

    # NOTE(review): computed from (start, end); several siblings use the
    # zero-based interval (start - 1, end) for UCSC binning -- confirm.
    bin = factory.LazyAttribute(lambda obj: binning.assign_bin(obj.start, obj.end))

    start_ci_left = -100
    start_ci_right = 100
    end_ci_left = -100
    end_ci_right = 100

    #: Model pseudo-attribute, not stored in database.  Instead, ``set_id`` is stored.
    variant_set = factory.SubFactory(StructuralVariantSetFactory)
    #: The actual reference to the ``StructuralVariantSet``.
    set_id = factory.LazyAttribute(lambda o: o.variant_set.id)
    #: Model pseudo-attribute, not stored in database.  Instead ``case_id`` is stored.
    case = factory.LazyAttribute(lambda obj: Case.objects.get(id=obj.case_id))
    #: The actual foreign key to the ``Case``.
    case_id = factory.SelfAttribute("variant_set.case.id")

    caller = "DELLYv4001"
    sv_type = "DEL"
    sv_sub_type = "DEL"

    # One genotype entry per pedigree member, paired with ``genotypes``.
    genotype = factory.LazyAttribute(
        lambda obj: {
            line["patient"]: {"gt": gt, "gq": 10, "src": 10, "srv": 5, "pec": 10, "pev": 5}
            for line, gt in zip(obj.case.pedigree, obj.genotypes())
        }
    )

    @factory.lazy_attribute
    def info(self):
        """Return carrier counts over the pedigree for this variant."""
        num_affected = 0
        num_unaffected = 0
        for line, gt in zip(self.case.pedigree, self.genotypes()):
            if "1" in gt:  # member carries at least one alternate allele
                if line.get("affected") == 2:
                    num_affected += 1
                else:
                    # BUG FIX: this branch previously incremented
                    # ``num_affected`` as well, leaving ``num_unaffected``
                    # (and thus "unaffectedCarriers") always 0.
                    num_unaffected += 1
        return {
            "affectedCarriers": num_affected,
            "unaffectedCarriers": num_unaffected,
            "backgroundCarriers": 0,
        }
def test_containing(intervals, interval):
    """Bin pre-selection must keep every interval fully containing the query."""
    start, stop = interval

    # Intervals completely containing the query interval.
    containing = {(x, y) for x, y in intervals if x <= start and stop <= y}

    # Pre-selection of intervals using binning.
    candidate_bins = binning.containing_bins(start, stop)
    binned = {
        (x, y) for x, y in intervals
        if binning.assign_bin(x, y) in candidate_bins
    }

    assert binned.issuperset(containing)
def test_contained(intervals, interval):
    """Bin pre-selection must keep every interval fully inside the query."""
    start, stop = interval

    # Intervals completely contained by the query interval.
    contained = {(x, y) for x, y in intervals if start <= x and y <= stop}

    # Pre-selection of intervals using binning.
    candidate_bins = binning.contained_bins(start, stop)
    binned = {
        (x, y) for x, y in intervals
        if binning.assign_bin(x, y) in candidate_bins
    }

    assert binned.issuperset(contained)
def test_overlapping(intervals, interval):
    """Bin pre-selection must keep every interval overlapping the query."""
    start, stop = interval

    # Intervals overlapping the query interval.
    overlapping = {(x, y) for x, y in intervals if x < stop and start < y}

    # Pre-selection of intervals using binning.
    candidate_bins = binning.overlapping_bins(start, stop)
    binned = {
        (x, y) for x, y in intervals
        if binning.assign_bin(x, y) in candidate_bins
    }

    assert binned.issuperset(overlapping)
Example #10
0
def upgrade():
    """Add the NOT NULL ``bin`` column to ``transcript_mappings`` and backfill it."""
    # We want to add a NOT NULL column without default value. So we first add
    # the column without the constraint, then populate it, then add the
    # constraint.
    # Unfortunately, SQLite doesn't support adding the constraint on an
    # existing column. We use batch_alter_table to workaround this. Of course
    # this makes the entire migration horribly awkward on SQLite, but I can't
    # really be bothered to improve it. This works.
    # Also, the downgrade will fail on SQLite, but we don't support downgrades
    # anyway, so I'm not fixing it.
    connection = op.get_bind()

    op.add_column('transcript_mappings',
                  sa.Column('bin', sa.Integer(), nullable=True))

    # Lightweight table construct so we can SELECT/UPDATE without the ORM.
    transcript_mappings = sql.table('transcript_mappings',
                                    sql.column('id', sa.Integer()),
                                    sql.column('start', sa.Integer()),
                                    sql.column('stop', sa.Integer()),
                                    sql.column('bin', sa.Integer()))

    result = connection.execute(transcript_mappings.select().with_only_columns(
        [
            transcript_mappings.c.id, transcript_mappings.c.start,
            transcript_mappings.c.stop
        ]))

    # Backfill ``bin`` in chunks of 1000 rows to bound memory usage.
    while True:
        chunk = result.fetchmany(1000)
        if not chunk:
            break

        statement = transcript_mappings.update().where(
            transcript_mappings.c.id == sql.bindparam('m_id')).values(
                {'bin': sql.bindparam('m_bin')})

        # Bin computed for the zero-based half-open interval (start - 1, stop).
        connection.execute(
            statement, [{
                'm_id': m.id,
                'm_bin': binning.assign_bin(m.start - 1, m.stop)
            } for m in chunk])

    # See note above.
    with op.batch_alter_table('transcript_mappings') as batch_op:
        batch_op.alter_column('bin',
                              nullable=False,
                              existing_type=sa.Integer())

    op.create_index(op.f('ix_transcript_mappings_bin'),
                    'transcript_mappings', ['bin'],
                    unique=False)
Example #11
0
 def test_clinvar_query_fail(self):
     """Querying with shifted coordinates must not find the created record."""
     created, query = self.create(ClinvarFactory)
     params = dict(query)
     params["start"] = created.start + 1
     params["end"] = created.end + 1
     params["bin"] = binning.assign_bin(created.start, created.end + 1)
     self.run_get_query(Clinvar, params, Clinvar.DoesNotExist)
Example #12
0
 def __init__(self, variation, chromosome, position, reference, observed,
              zygosity=None, support=1):
     """Store an observed variation and assign its UCSC bin.

     An insertion (empty reference) is treated as covering the single base
     next to it, hence the ``max(1, len(reference))`` span below.
     """
     self.variation = variation
     self.chromosome = chromosome
     self.position = position
     self.reference = reference
     self.observed = observed
     self.zygosity = zygosity
     self.support = support
     # Covered region on the reference: always at least one base.
     covered = max(1, len(reference))
     self.bin = binning.assign_bin(position - 1, position + covered - 1)
def run(args):
    """Copy a TSV stream, filling missing ``end`` values and the ``bin`` column."""
    header = next(args.input)
    args.output.write(header)
    for record in tsv_reader(args.input, header):
        if record["end"] == "":
            # Derive end from start and the reference allele length.
            try:
                end_pos = int(record["start"]) + len(record["reference"]) - 1
            except KeyError:
                raise KeyError(
                    "Please make sure `end` column is filled when not providing `alternative` column."
                )
            record["end"] = str(end_pos)
        record["bin"] = str(
            binning.assign_bin(int(record["start"]) - 1, int(record["end"]))
        )
        args.output.write("%s\n" % "\t".join(record.values()))
Example #14
0
class ExacCnvFactory(factory.django.DjangoModelFactory):
    """Factory for ``ExacCnv`` records."""

    class Meta:
        model = ExacCnv

    release = "GRCh37"
    # Cycle through the canonical human chromosome names 1-22, X, Y.
    chromosome = factory.Iterator(list(map(str, range(1, 23))) + ["X", "Y"])
    start = factory.Sequence(lambda n: (n + 1) * 100)
    end = factory.Sequence(lambda n: (n + 1) * 100 + 100)

    # Bin computed from the same sequence values as start/end above.
    bin = factory.Sequence(lambda n: binning.assign_bin((n + 1) * 100,
                                                        (n + 1) * 100 + 100))

    sv_type = "DEL"
    population = factory.Iterator([x[0] for x in EXAC_POP_CHOICES])
    phred_score = factory.Iterator(list(range(30)))
Example #15
0
class _UserAnnotationFactory(factory.django.DjangoModelFactory):
    """Abstract base factory for user annotation records."""

    class Meta:
        abstract = True

    # Bin computed from the same sequence values as start/end below.
    bin = factory.Sequence(lambda n: binning.assign_bin((n + 1) * 100, (n + 1) * 100 + 100))

    release = "GRCh37"
    # Cycle through the canonical human chromosome names 1-22, X, Y.
    chromosome = factory.Iterator(list(map(str, range(1, 23))) + ["X", "Y"])
    start = factory.Sequence(lambda n: (n + 1) * 100)
    end = factory.Sequence(lambda n: (n + 1) * 100 + 100)

    sv_type = "DEL"
    sv_sub_type = "DEL"

    # user = factory.SubFactory(UserFactory)  # TODO
    case = factory.SubFactory(CaseFactory)
def upgrade():
    """Add the NOT NULL ``bin`` column to ``transcript_mappings`` and backfill it."""
    # We want to add a NOT NULL column without default value. So we first add
    # the column without the constraint, then populate it, then add the
    # constraint.
    # Unfortunately, SQLite doesn't support adding the constraint on an
    # existing column. We use batch_alter_table to workaround this. Of course
    # this makes the entire migration horribly awkward on SQLite, but I can't
    # really be bothered to improve it. This works.
    # Also, the downgrade will fail on SQLite, but we don't support downgrades
    # anyway, so I'm not fixing it.
    connection = op.get_bind()

    op.add_column('transcript_mappings', sa.Column('bin', sa.Integer(), nullable=True))

    # Lightweight table construct so we can SELECT/UPDATE without the ORM.
    transcript_mappings = sql.table('transcript_mappings',
                                    sql.column('id', sa.Integer()),
                                    sql.column('start', sa.Integer()),
                                    sql.column('stop', sa.Integer()),
                                    sql.column('bin', sa.Integer()))

    result = connection.execute(
        transcript_mappings.select().with_only_columns([
            transcript_mappings.c.id,
            transcript_mappings.c.start,
            transcript_mappings.c.stop]))

    # Backfill ``bin`` in chunks of 1000 rows to bound memory usage.
    while True:
        chunk = result.fetchmany(1000)
        if not chunk:
            break

        statement = transcript_mappings.update().where(
            transcript_mappings.c.id == sql.bindparam('m_id')
        ).values({'bin': sql.bindparam('m_bin')})

        # Bin computed for the zero-based half-open interval (start - 1, stop).
        connection.execute(statement, [
            {'m_id': m.id, 'm_bin': binning.assign_bin(m.start - 1, m.stop)}
            for m in chunk])

    # See note above.
    with op.batch_alter_table('transcript_mappings') as batch_op:
        batch_op.alter_column('bin', nullable=False, existing_type=sa.Integer())

    op.create_index(op.f('ix_transcript_mappings_bin'), 'transcript_mappings', ['bin'], unique=False)
Example #17
0
 def write_to_tsv(self, values):
     """Write one DGV SV record as a tab-separated line to ``self.fh_tsv``.

     The UCSC bin is computed from the zero-based half-open interval
     ``(start - 1, end)``.
     """
     ucsc_bin = binning.assign_bin(int(values["start"]) - 1, int(values["end"]))
     row = [
         self.genome_release,
         values["chr"],
         values["start"],
         values["end"],
         str(ucsc_bin),
         values["variantaccession"],
         values["varianttype"],
         values["variantsubtype"],
         values["reference"],
         list_to_str(values["platform"].split(",")),
         values["samplesize"] or "0",
         values["observedgains"] or "0",
         values["observedlosses"] or "0",
     ]
     self.fh_tsv.write("\t".join(row) + "\n")
Example #18
0
 def __init__(self, chromosome, reference_type, accession, gene,
              orientation, start, stop, exon_starts, exon_stops, source,
              transcript=1, cds=None, select_transcript=False,
              version=None):
     """Store the transcript mapping attributes and assign its UCSC bin.

     The bin is computed for the zero-based half-open interval
     ``(start - 1, stop)``.
     """
     self.chromosome = chromosome
     self.reference_type = reference_type
     self.accession = accession
     self.gene = gene
     self.orientation = orientation
     self.start = start
     self.stop = stop
     self.exon_starts = exon_starts
     self.exon_stops = exon_stops
     self.source = source
     self.transcript = transcript
     self.cds = cds
     self.select_transcript = select_transcript
     self.version = version
     self.bin = binning.assign_bin(self.start - 1, self.stop)
Example #19
0
 def write_to_tsv(self, values):
     """Write one DGV gold-standard SV record as a TSV line to ``self.fh_tsv``."""
     attributes = values["attributes"]
     # PopulationSummary looks like "African 1:Asian 2:..." -- split into
     # population -> count pairs.
     pop_sum = {
         key: value
         for key, value in [x.split(" ") for x in attributes["PopulationSummary"].split(":")]
     }
     self.fh_tsv.write(
         "\t".join([
             self.genome_release,
             values["seqid"],
             attributes["outer_start"],
             attributes["inner_start"],
             attributes["inner_end"],
             attributes["outer_end"],
             # UCSC bin for the zero-based interval (outer_start - 1, outer_end).
             str(binning.assign_bin(
                 int(attributes["outer_start"]) - 1, int(attributes["outer_end"])
             )),
             attributes["ID"],
             attributes["variant_type"],
             attributes["variant_sub_type"],
             attributes["num_studies"],
             list_to_str(attributes["Studies"].split(",")),
             attributes["num_platforms"],
             list_to_str(attributes["Platforms"].split(",")),
             attributes["number_of_algorithms"],
             list_to_str(attributes["algorithms"].split(",")),
             attributes["num_variants"],
             attributes["num_samples"],
             attributes["num_unique_samples_tested"],
             pop_sum["African"],
             pop_sum["Asian"],
             pop_sum["European"],
             pop_sum["Mexican"],
             pop_sum["MiddleEast"],
             pop_sum["NativeAmerican"],
             pop_sum["NorthAmerican"],
             pop_sum["Oceania"],
             pop_sum["SouthAmerican"],
             pop_sum["Admixed"],
             pop_sum["Unknown"],
         ]) + "\n"
     )
Example #20
0
 def _import_interactions(self, reg_map, path_bed):
     """Import regulatory interactions from the TSV file at ``path_bed``.

     The first line is treated as a header and skipped.  Each data row has
     ten coordinate/score columns, optionally followed by a JSON blob with
     extra data in column 11.
     """
     header = None
     with path_bed.open("rt") as inputf:
         for line in inputf:
             line = line.strip()
             arr = line.split("\t")
             if not header:
                 header = arr  # remember and skip the header line
                 continue
             (chrom, begin, end, score, chrom1, begin1, end1,
              chrom2, begin2, end2) = arr[:10]
             begin = int(begin)
             end = int(end)
             begin1 = int(begin1)
             end1 = int(end1)
             begin2 = int(begin2)
             end2 = int(end2)
             # Missing-value markers become NaN.
             score = float("NaN") if score in ("", ".", "-") else float(score)
             # BUG FIX: the optional JSON blob lives in column 11 (arr[10]),
             # so the guard must be ``len(arr) > 10``; the previous check of
             # ``len(arr) > 5`` raised IndexError for rows with 6-10 columns.
             if len(arr) > 10:
                 extra_data = json.loads(arr[10])
             else:
                 extra_data = None
             RegInteraction.objects.create(
                 reg_map=reg_map,
                 release=reg_map.collection.release,
                 chromosome=chrom,
                 start=begin + 1,
                 end=end,
                 bin=binning.assign_bin(begin, end),
                 chromosome1=chrom1,
                 start1=begin1 + 1,
                 end1=end1,
                 chromosome2=chrom2,
                 start2=begin2 + 1,
                 end2=end2,
                 score=score,
                 extra_data=extra_data,
             )
Example #21
0
class DgvSvsFactory(factory.django.DjangoModelFactory):
    """Factory for ``DgvSvs`` records."""

    class Meta:
        model = DgvSvs

    release = "GRCh37"
    # Cycle through the canonical human chromosome names 1-22, X, Y.
    chromosome = factory.Iterator(list(map(str, range(1, 23))) + ["X", "Y"])
    start = factory.Sequence(lambda n: (n + 1) * 100)
    end = factory.Sequence(lambda n: (n + 1) * 100 + 100)

    # Bin computed from the same sequence values as start/end above.
    bin = factory.Sequence(lambda n: binning.assign_bin((n + 1) * 100,
                                                        (n + 1) * 100 + 100))

    accession = factory.Sequence(lambda n: "DGV-%d" % n)
    sv_type = "DEL"
    sv_sub_type = "DEL"

    study = factory.Sequence(lambda n: "DGV-STUDY-%d" % n)
    platform = factory.Sequence(lambda n: ["DGV-PLATFORM-%d" % n])

    num_samples = 1
    observed_gains = 0
    observed_losses = 1
Example #22
0
class RegElementFactory(factory.django.DjangoModelFactory):
    """Factory for ``RegElement`` records."""
    class Meta:
        model = RegElement

    reg_map = factory.SubFactory(RegMapFactory)
    # Element type must belong to the same collection as the reg map.
    elem_type = factory.LazyAttribute(
        lambda o: RegElementTypeFactory(collection=o.reg_map.collection))

    release = "GRCh37"
    # Cycle through the canonical human chromosome names 1-22, X, Y.
    chromosome = factory.Iterator(list(map(str, range(1, 23))) + ["X", "Y"])
    start = factory.Sequence(lambda n: (n + 1) * 100)
    end = factory.Sequence(lambda n: (n + 1) * 100 + 100)
    # Initial bin from (start, end); recomputed in ``fix_bins`` below after
    # creation using the zero-based interval (start - 1, end).
    bin = factory.LazyAttribute(
        lambda obj: binning.assign_bin(obj.start, obj.end))
    score = 1.0
    extra_data = None

    @factory.post_generation
    def fix_bins(obj, *args, **kwargs):
        # Overwrite the bin with the value for (start - 1, end) and persist.
        obj.bin = binning.assign_bin(obj.start - 1, obj.end)
        obj.save()
Example #23
0
class DbVarSvFactory(factory.django.DjangoModelFactory):
    """Factory for ``DbVarSv`` records."""

    class Meta:
        model = DbVarSv

    release = "GRCh37"
    # Cycle through the canonical human chromosome names 1-22, X, Y.
    chromosome = factory.Iterator(list(map(str, range(1, 23))) + ["X", "Y"])
    start = factory.Sequence(lambda n: (n + 1) * 100)
    end = factory.Sequence(lambda n: (n + 1) * 100 + 100)

    # Bin computed from the same sequence values as start/end above.
    bin = factory.Sequence(lambda n: binning.assign_bin((n + 1) * 100,
                                                        (n + 1) * 100 + 100))

    num_carriers = 1
    sv_type = "DEL"
    method = "Sequencing"
    analysis = "Read_depth"
    platform = factory.Sequence(lambda n: "DBVAR-PLATFORM-%d" % n)
    study = factory.Sequence(lambda n: "DBVAR-STUDY-%d" % n)
    clinical_assertions = []
    clinvar_accessions = []
    bin_size = "large"
    min_ins_length = None
    max_ins_length = None
Example #24
0
class DgvGoldStandardSvsFactory(factory.django.DjangoModelFactory):
    """Factory for ``DgvGoldStandardSvs`` records."""

    class Meta:
        model = DgvGoldStandardSvs

    release = "GRCh37"
    # Cycle through the canonical human chromosome names 1-22, X, Y.
    chromosome = factory.Iterator(list(map(str, range(1, 23))) + ["X", "Y"])
    start_outer = factory.Sequence(lambda n: (n + 1) * 100 - 10)
    start_inner = factory.Sequence(lambda n: (n + 1) * 100 + 10)
    end_inner = factory.Sequence(lambda n: (n + 1) * 100 + 90)
    end_outer = factory.Sequence(lambda n: (n + 1) * 100 + 110)

    # Bin for the zero-based interval (start_outer - 1, end_outer), using the
    # same sequence values as the coordinates above.
    bin = factory.Sequence(lambda n: binning.assign_bin((n + 1) * 100 - 11,
                                                        (n + 1) * 100 + 110))

    accession = factory.Sequence(lambda n: "DGV-GS-%d" % n)
    sv_type = "DEL"
    sv_sub_type = "DEL"
    num_studies = 1
    studies = factory.Sequence(lambda n: ["DGV-GS-STUDY-%d" % n])
    num_platforms = 1
    platforms = factory.Sequence(lambda n: ["DGV-GS-PLATFORM-%d" % n])
    num_algorithms = 1
    algorithms = factory.Sequence(lambda n: ["DGV-GS-ALGO-%d" % n])
    num_variants = 1
    num_carriers = 1
    num_unique_samples = 1
    # Per-population carrier counts.
    num_carriers_african = 0
    num_carriers_asian = 0
    num_carriers_european = 0
    num_carriers_mexican = 0
    num_carriers_middle_east = 1
    num_carriers_native_american = 0
    num_carriers_north_american = 0
    num_carriers_oceania = 0
    num_carriers_south_american = 0
    num_carriers_admixed = 0
    num_carriers_unknown = 0
Example #25
0
class ThousandGenomesSvFactory(factory.django.DjangoModelFactory):
    """Factory for ``ThousandGenomesSv`` records."""

    class Meta:
        model = ThousandGenomesSv

    release = "GRCh37"
    # Cycle through the canonical human chromosome names 1-22, X, Y.
    chromosome = factory.Iterator(list(map(str, range(1, 23))) + ["X", "Y"])
    start = factory.Sequence(lambda n: (n + 1) * 100)
    end = factory.Sequence(lambda n: (n + 1) * 100 + 100)

    # Bin computed from the same sequence values as start/end above.
    bin = factory.Sequence(lambda n: binning.assign_bin((n + 1) * 100,
                                                        (n + 1) * 100 + 100))

    # Confidence intervals around the breakpoints.
    start_ci_left = -100
    start_ci_right = 100
    end_ci_left = -100
    end_ci_right = 100

    sv_type = "DEL"
    source_call_set = "DEL_delly"

    mobile_element_info = []

    num_samples = 1
    num_alleles = 2
    num_var_alleles = 1

    # Per-population allele counts (all variant alleles placed in AFR).
    num_alleles_afr = 2
    num_var_alleles_afr = 1
    num_alleles_amr = 0
    num_var_alleles_amr = 0
    num_alleles_eas = 0
    num_var_alleles_eas = 0
    num_alleles_eur = 0
    num_var_alleles_eur = 0
    num_alleles_sas = 0
    num_var_alleles_sas = 0
Example #26
0
 def __init__(self, coverage, chromosome, begin, end):
     """Store the coverage interval and assign its UCSC bin."""
     self.coverage = coverage
     self.chromosome = chromosome
     self.begin = begin
     self.end = end
     # Bin for the zero-based half-open interval (begin - 1, end).
     self.bin = binning.assign_bin(begin - 1, end)
def test_assign_bin_covered_interval(start, stop):
    """The covered interval of an assigned bin must span the query interval."""
    assigned = binning.assign_bin(start, stop)
    bin_start, bin_stop = binning.covered_interval(assigned)
    assert bin_start <= start
    assert stop <= bin_stop
def test_assign_bin(start, stop, expected):
    """``assign_bin`` returns the expected bin for the given interval."""
    result = binning.assign_bin(start, stop)
    assert result == expected
Example #29
0
class GnomAdSvFactory(factory.django.DjangoModelFactory):
    """Factory for ``GnomAdSv`` records."""

    class Meta:
        model = GnomAdSv

    release = "GRCh37"
    # Cycle through the canonical human chromosome names 1-22, X, Y.
    chromosome = factory.Iterator(list(map(str, range(1, 23))) + ["X", "Y"])
    start = factory.Sequence(lambda n: (n + 1) * 100)
    end = factory.Sequence(lambda n: (n + 1) * 100 + 100)

    # Bin computed from the same sequence values as start/end above.
    bin = factory.Sequence(lambda n: binning.assign_bin((n + 1) * 100,
                                                        (n + 1) * 100 + 100))

    ref = "N"
    alt = ["<DUP>"]

    # BUG FIX: this was ``[factory.Sequence(...)]`` -- factory_boy does not
    # resolve declarations nested inside plain list literals, so the raw
    # ``Sequence`` object would have been stored.  Follow the convention used
    # by the sibling factories and let the Sequence produce the list.
    name = factory.Sequence(lambda n: ["DBVAR-SV-%d" % n])
    svtype = "DEL"
    svlen = 100
    filter = ["PASS"]
    evidence = ["BAF", "RD"]
    algorithms = ["depth"]
    chr2 = factory.Iterator(list(map(str, range(1, 23))) + ["X", "Y"])
    cpx_type = None
    cpx_intervals = []
    source = None
    strands = None
    unresolved_type = None
    pcrplus_depleted = False
    pesr_gt_overdispersion = False
    protein_coding_lof = []
    protein_coding_dup_lof = []
    protein_coding_copy_gain = []
    protein_coding_dup_partial = []
    protein_coding_msv_exon_ovr = []
    protein_coding_intronic = []
    protein_coding_inv_span = []
    protein_coding_utr = []
    protein_coding_nearest_tss = []
    protein_coding_intergenic = False
    protein_coding_promoter = []
    # Overall allele numbers/counts/frequencies and genotype counts.
    an = 2
    ac = [1]
    af = [0.5]
    n_bi_genos = 1
    n_homref = 0
    n_het = 1
    n_homalt = 0
    freq_homref = 0.5
    freq_het = 0.5
    freq_homalt = 0.0
    popmax_af = 0.5
    # Per-population counterparts (AFR carries the variant allele).
    afr_an = 1
    afr_ac = [1]
    afr_af = [0.5]
    afr_n_bi_genos = 0
    afr_n_homref = 0
    afr_n_het = 0
    afr_n_homalt = 0
    afr_freq_homref = 0.0
    afr_freq_het = 0.0
    afr_freq_homalt = 0.0
    amr_an = 0
    amr_ac = [0]
    amr_af = [0.0]
    amr_n_bi_genos = 0
    amr_n_homref = 0
    amr_n_het = 0
    amr_n_homalt = 0
    amr_freq_homref = 0.0
    amr_freq_het = 0.0
    amr_freq_homalt = 0.0
    eas_an = 0
    eas_ac = [0]
    eas_af = [0.0]
    eas_n_bi_genos = 0
    eas_n_homref = 0
    eas_n_het = 0
    eas_n_homalt = 0
    eas_freq_homref = 0.0
    eas_freq_het = 0.0
    eas_freq_homalt = 0.0
    eur_an = 0
    eur_ac = [0]
    eur_af = [0.0]
    eur_n_bi_genos = 0
    eur_n_homref = 0
    eur_n_het = 0
    eur_n_homalt = 0
    eur_freq_homref = 0.0
    eur_freq_het = 0.0
    eur_freq_homalt = 0.0
    oth_an = 0
    oth_ac = [0]
    oth_af = [0.0]
    oth_n_bi_genos = 0
    oth_n_homref = 0
    oth_n_het = 0
    oth_n_homalt = 0
    oth_freq_homref = 0.0
    oth_freq_het = 0.0
    oth_freq_homalt = 0.0
def test_assign__bin_range(start, stop):
    """Out-of-range intervals must raise ``OutOfRangeError``."""
    expected_error = binning.OutOfRangeError
    with pytest.raises(expected_error):
        binning.assign_bin(start, stop)
def test_covered_interval_assign_bin(bin):
    """Round-trip: the bin assigned to a bin's own covered interval is that bin."""
    start, stop = binning.covered_interval(bin)
    assert binning.assign_bin(start, stop) == bin
Example #32
0
 def _create_record(self, record):
     """Write one gnomAD SV VCF record to ``self.fh_tsv`` as a TSV row.

     Columns follow the gnomAD SV table schema: genome release, interval
     (CHROM/POS/END) plus its UCSC-style bin, REF/ALT/ID, SV annotations,
     protein-coding consequence lists, and the overall and per-population
     (AFR/AMR/EAS/EUR/OTH) allele-count/frequency statistics.

     :param record: a parsed VCF record (vcfpy-style: ``CHROM``, ``POS``,
         ``INFO``, ``ALT`` with ``serialize()``) -- TODO confirm reader lib.
     """
     self.fh_tsv.write(
         "\t".join(
             [
                 self.genome_release,
                 record.CHROM,
                 str(record.POS),
                 str(record.INFO.get("END")),
                 # BUGFIX: assign_bin expects (0-based start, end); the
                 # original passed (END - 1, POS), i.e. the interval
                 # reversed.  All other call sites in this file use
                 # assign_bin(start - 1, end).
                 str(binning.assign_bin(record.POS - 1, record.INFO.get("END"))),
                 record.REF,
                 list_to_str([alt.serialize() for alt in record.ALT]),
                 list_to_str(record.ID),
                 record.INFO.get("SVTYPE"),
                 str(record.INFO.get("SVLEN")),
                 list_to_str(record.FILTER),
                 list_to_str(record.INFO.get("EVIDENCE")),
                 list_to_str(record.INFO.get("ALGORITHMS")),
                 record.INFO.get("CHR2"),
                 record.INFO.get("CPX_TYPE", ""),
                 list_to_str(record.INFO.get("CPX_INTERVALS", [])),
                 record.INFO.get("SOURCE", ""),
                 record.INFO.get("STRANDS", ""),
                 record.INFO.get("UNRESOLVED_TYPE", ""),
                 str(record.INFO.get("PCRPLUS_DEPLETED", False)),
                 str(record.INFO.get("PESR_GT_OVERDISPERSION", False)),
                 # NOTE: gnomAD SV uses a double underscore in these INFO
                 # keys except for the first one -- intentional, not a typo.
                 list_to_str(record.INFO.get("PROTEIN_CODING_LOF", [])),
                 list_to_str(record.INFO.get("PROTEIN_CODING__DUP_LOF", [])),
                 list_to_str(record.INFO.get("PROTEIN_CODING__COPY_GAIN", [])),
                 list_to_str(record.INFO.get("PROTEIN_CODING__DUP_PARTIAL", [])),
                 list_to_str(record.INFO.get("PROTEIN_CODING__MSV_EXON_OVR", [])),
                 list_to_str(record.INFO.get("PROTEIN_CODING__INTRONIC", [])),
                 list_to_str(record.INFO.get("PROTEIN_CODING__INV_SPAN", [])),
                 list_to_str(record.INFO.get("PROTEIN_CODING__UTR", [])),
                 list_to_str(record.INFO.get("PROTEIN_CODING__NEAREST_TSS", [])),
                 str(record.INFO.get("PROTEIN_CODING__INTERGENIC", False)),
                 list_to_str(record.INFO.get("PROTEIN_CODING__PROMOTER", [])),
                 # Overall allele statistics.
                 str(record.INFO.get("AN")),
                 list_to_str(record.INFO.get("AC", [])),
                 list_to_str(record.INFO.get("AF", [])),
                 str(record.INFO.get("N_BI_GENOS", 0)),
                 str(record.INFO.get("N_HOMREF", 0)),
                 str(record.INFO.get("N_HET", 0)),
                 str(record.INFO.get("N_HOMALT", 0)),
                 str(record.INFO.get("FREQ_HOMREF", 0.0)),
                 str(record.INFO.get("FREQ_HET", 0.0)),
                 str(record.INFO.get("FREQ_HOMALT", 0.0)),
                 str(record.INFO.get("POPMAX_AF", 0.0)),
                 # Per-population statistics (same layout for each group).
                 str(record.INFO.get("AFR_AN")),
                 list_to_str(record.INFO.get("AFR_AC", [])),
                 list_to_str(record.INFO.get("AFR_AF", [])),
                 str(record.INFO.get("AFR_N_BI_GENOS", 0)),
                 str(record.INFO.get("AFR_N_HOMREF", 0)),
                 str(record.INFO.get("AFR_N_HET", 0)),
                 str(record.INFO.get("AFR_N_HOMALT", 0)),
                 str(record.INFO.get("AFR_FREQ_HOMREF", 0.0)),
                 str(record.INFO.get("AFR_FREQ_HET", 0.0)),
                 str(record.INFO.get("AFR_FREQ_HOMALT", 0.0)),
                 str(record.INFO.get("AMR_AN")),
                 list_to_str(record.INFO.get("AMR_AC", [])),
                 list_to_str(record.INFO.get("AMR_AF", [])),
                 str(record.INFO.get("AMR_N_BI_GENOS", 0)),
                 str(record.INFO.get("AMR_N_HOMREF", 0)),
                 str(record.INFO.get("AMR_N_HET", 0)),
                 str(record.INFO.get("AMR_N_HOMALT", 0)),
                 str(record.INFO.get("AMR_FREQ_HOMREF", 0.0)),
                 str(record.INFO.get("AMR_FREQ_HET", 0.0)),
                 str(record.INFO.get("AMR_FREQ_HOMALT", 0.0)),
                 str(record.INFO.get("EAS_AN")),
                 list_to_str(record.INFO.get("EAS_AC", [])),
                 list_to_str(record.INFO.get("EAS_AF", [])),
                 str(record.INFO.get("EAS_N_BI_GENOS", 0)),
                 str(record.INFO.get("EAS_N_HOMREF", 0)),
                 str(record.INFO.get("EAS_N_HET", 0)),
                 str(record.INFO.get("EAS_N_HOMALT", 0)),
                 str(record.INFO.get("EAS_FREQ_HOMREF", 0.0)),
                 str(record.INFO.get("EAS_FREQ_HET", 0.0)),
                 str(record.INFO.get("EAS_FREQ_HOMALT", 0.0)),
                 str(record.INFO.get("EUR_AN")),
                 list_to_str(record.INFO.get("EUR_AC", [])),
                 list_to_str(record.INFO.get("EUR_AF", [])),
                 str(record.INFO.get("EUR_N_BI_GENOS", 0)),
                 str(record.INFO.get("EUR_N_HOMREF", 0)),
                 str(record.INFO.get("EUR_N_HET", 0)),
                 str(record.INFO.get("EUR_N_HOMALT", 0)),
                 str(record.INFO.get("EUR_FREQ_HOMREF", 0.0)),
                 str(record.INFO.get("EUR_FREQ_HET", 0.0)),
                 str(record.INFO.get("EUR_FREQ_HOMALT", 0.0)),
                 str(record.INFO.get("OTH_AN")),
                 list_to_str(record.INFO.get("OTH_AC", [])),
                 list_to_str(record.INFO.get("OTH_AF", [])),
                 str(record.INFO.get("OTH_N_BI_GENOS", 0)),
                 str(record.INFO.get("OTH_N_HOMREF", 0)),
                 str(record.INFO.get("OTH_N_HET", 0)),
                 str(record.INFO.get("OTH_N_HOMALT", 0)),
                 str(record.INFO.get("OTH_FREQ_HOMREF", 0.0)),
                 str(record.INFO.get("OTH_FREQ_HET", 0.0)),
                 str(record.INFO.get("OTH_FREQ_HOMALT", 0.0)),
             ]
         ) + "\n"
     )
Example #33
0
 def fix_bins(obj, *args, **kwargs):
     """Recompute the interval bin for *obj* from its 1-based inclusive
     ``(start, end)`` coordinates and persist the change via ``obj.save()``."""
     new_bin = binning.assign_bin(obj.start - 1, obj.end)
     obj.bin = new_bin
     obj.save()
Example #34
0
    def import_sv_vcf_record(self, panel_map, record):
        """Import the SV VCF file into the database.

        Counts samples, total alleles, and variant alleles overall and per
        super-population, then writes one TSV row for ``record``.

        :param panel_map: dict mapping sample name to a dict with at least
            the keys ``"super_pop"`` and ``"sex"``.
        :param record: a parsed VCF record exposing ``calls``, ``CHROM``,
            ``POS``, and ``INFO``.
        """
        # Counters
        super_pops = ("All", "AFR", "AMR", "EAS", "EUR", "SAS")
        num_samples = 0
        num_alleles = {key: 0 for key in super_pops}
        num_var_alleles = {key: 0 for key in super_pops}

        # Count statistics
        for call in record.calls:
            sample = call.sample
            gt = call.data.get("GT", ".")
            super_pop = panel_map[sample]["super_pop"]
            sex = panel_map[sample]["sex"]
            # Skip if genotype is no-call
            if gt == ".":
                continue
            # Alleles contributed by this individual: males are hemizygous
            # on X and Y; females contribute no Y alleles.
            if record.CHROM == "X":
                this_alleles = 1 if sex == "male" else 2
            elif record.CHROM == "Y":
                this_alleles = 1 if sex == "male" else 0
            else:
                this_alleles = 2
            if this_alleles == 0:
                continue  # no alleles contributed by this individual
            # Increment allele counters
            num_alleles["All"] += this_alleles
            num_alleles[super_pop] += this_alleles
            num_samples += 1
            # NOTE(review): a hemizygous reference call reported as plain
            # "0" would not match the tuple below and would be counted as a
            # variant allele -- confirm genotypes are always diploid
            # strings or ".".
            if gt in ("0|0", "0/0"):
                continue  # non-variant allele
            elif this_alleles == 1:  # hemizygous variant
                # BUGFIX: the original incremented num_alleles[super_pop]
                # here (double-counting the denominator) and left the
                # per-population num_var_alleles entries permanently 0.
                num_var_alleles["All"] += 1
                num_var_alleles[super_pop] += 1
            elif "0" in gt:  # heterozygous, even if multiallelic (-> CNV)
                num_var_alleles["All"] += 1
                num_var_alleles[super_pop] += 1
            else:  # homozygous non-ref, even if multiallelic (-> CNV)
                num_var_alleles["All"] += 2
                num_var_alleles[super_pop] += 2

        # Perform the record creation
        self.fh_tsv.write(
            "\t".join(
                [
                    self.genome_release,
                    record.CHROM,
                    str(record.POS),
                    str(record.INFO.get("END", record.POS)),
                    str(binning.assign_bin(record.POS - 1, record.INFO.get("END", record.POS))),
                    str(record.INFO.get("CIPOS", (0, 0))[0]),
                    str(record.INFO.get("CIPOS", (0, 0))[1]),
                    str(record.INFO.get("CIEND", (0, 0))[0]),
                    str(record.INFO.get("CIEND", (0, 0))[1]),
                    record.INFO.get("SVTYPE"),
                    record.INFO.get("CS"),
                    list_to_str(record.INFO.get("MEINFO", [])),
                    str(num_samples),
                    str(num_alleles["All"]),
                    str(num_var_alleles["All"])
                ] +
                # Per-population columns, in super_pops order minus "All".
                [str(num_alleles[key]) for key in super_pops if key != "All"] +
                [str(num_var_alleles[key]) for key in super_pops if key != "All"]
            ) + "\n"
        )
Example #35
0
def fixture_setup_case1_simple():
    """Setup test case 1 -- a singleton with one variant only."""
    project = Project.objects.create(**PROJECT_DICT)
    pedigree_entry = {
        "sex": 1,
        "father": "0",
        "mother": "0",
        "patient": "A",
        "affected": 1,
        "has_gt_entries": True,
    }
    case = project.case_set.create(
        sodar_uuid="9b90556b-041e-47f1-bdc7-4d5a4f8357e3",
        name="A",
        index="A",
        pedigree=[pedigree_entry],
    )
    # Coordinates shared by the variant and its summary record.
    coordinates = dict(
        release="GRCh37",
        chromosome="1",
        start=100,
        end=100,
        bin=binning.assign_bin(99, 100),
        reference="A",
        alternative="G",
    )
    # All four frequency databases get identical dummy values.
    frequencies = {}
    for db in ("exac", "thousand_genomes", "gnomad_exomes", "gnomad_genomes"):
        frequencies["%s_frequency" % db] = 0.01
        frequencies["%s_homozygous" % db] = 0
        frequencies["%s_heterozygous" % db] = 0
        frequencies["%s_hemizygous" % db] = 0
    SmallVariant.objects.create(
        case_id=case.pk,
        var_type="snv",
        genotype={"A": {"ad": 15, "dp": 30, "gq": 99, "gt": "0/1"}},
        in_clinvar=True,
        # RefSeq
        refseq_gene_id="1234",
        refseq_transcript_id="NR_00001.1",
        refseq_transcript_coding=False,
        refseq_hgvs_c="n.111+2T>C",
        refseq_hgvs_p="p.=",
        refseq_effect=["synonymous_variant"],
        # ENSEMBL
        ensembl_gene_id="ENGS00001",
        ensembl_transcript_id="ENST00001",
        ensembl_transcript_coding=False,
        ensembl_hgvs_c="n.111+2T>C",
        ensembl_hgvs_p="p.=",
        ensembl_effect=["synonymous_variant"],
        **coordinates,
        **frequencies,
    )
    SmallVariantSummary.objects.create(
        count_hom_ref=0,
        count_het=1,
        count_hom_alt=1,
        count_hemi_ref=0,
        count_hemi_alt=0,
        **coordinates,
    )
    Hgnc.objects.create(hgnc_id="HGNC:1", symbol="AAA")
    RefseqToHgnc.objects.create(entrez_id="1234", hgnc_id="HGNC:1")

    rebuild_case_variant_stats(SQLALCHEMY_ENGINE, case)
Example #36
0
 def run(self):
     """Parse the ClinVar XML at ``self.input`` and write TSV rows.

     One output file per genome build (GRCh37/GRCh38); each row describes
     one sequence location of one measure, with the full ClinVarSet
     attached as an escaped JSON blob in the last column.  Stops early
     after ``self.max_rcvs`` ClinVarSet elements when that limit is set.
     """
     logger.info("Parsing elements...")
     out_files = {"GRCh37": self.out_b37, "GRCh38": self.out_b38}
     for out_file in out_files.values():
         print(TSV_HEADER, file=out_file)
     with tqdm.tqdm(unit="rcvs") as progress:
         # Incremental parse; only fully-read ClinVarSet elements are handled.
         for event, elem in ET.iterparse(self.input):
             if elem.tag == "ClinVarSet" and event == "end":
                 self.rcvs += 1
                 clinvar_set = ClinVarSet.from_element(elem)
                 # Origin comes from the reference assertion's observation,
                 # with "." as the missing-value placeholder.
                 if clinvar_set.ref_cv_assertion.observed_in:
                     origin = clinvar_set.ref_cv_assertion.observed_in.origin
                 else:
                     origin = "."
                 for genotype_set in clinvar_set.ref_cv_assertion.genotype_sets:
                     for measure_set in genotype_set.measure_sets:
                         for measure in measure_set.measures:
                             for build, location in measure.sequence_locations.items(
                             ):
                                 if build not in out_files:
                                     continue
                                 # Only locations with both REF and ALT are
                                 # emitted (i.e. small sequence variants).
                                 elif location.ref is not None and location.alt is not None:
                                     # Classify by allele lengths: 1/1 -> snv,
                                     # equal lengths -> mnv, otherwise indel.
                                     if len(location.ref) == 1 and len(
                                             location.alt) == 1:
                                         variation_type = "snv"
                                     elif len(location.ref) == len(
                                             location.alt):
                                         variation_type = "mnv"
                                     else:
                                         variation_type = "indel"
                                     row = [
                                         build,
                                         location.chrom,
                                         location.start,
                                         location.stop,
                                         # assign_bin takes (0-based start, end).
                                         binning.assign_bin(
                                             location.start - 1,
                                             location.stop),
                                         location.ref,
                                         location.alt,
                                         variation_type,
                                         as_pg_list(measure.symbols),
                                         as_pg_list(measure.hgnc_ids),
                                         clinvar_set.ref_cv_assertion.id_no,
                                         clinvar_set.ref_cv_assertion.
                                         clinvar_accession,
                                         clinvar_set.ref_cv_assertion.
                                         gold_stars,
                                         clinvar_set.ref_cv_assertion.
                                         review_status,
                                         clinvar_set.ref_cv_assertion.
                                         pathogenicity,
                                         origin,
                                         measure_set.accession,
                                         # Escape quotes for downstream TSV/DB
                                         # import: \" -> ' and " -> """.
                                         # NOTE(review): lossy (escaped quotes
                                         # become apostrophes) -- presumably a
                                         # deliberate import-format workaround;
                                         # confirm against the loader.
                                         json.dumps(
                                             cattr.unstructure(clinvar_set),
                                             cls=DateTimeEncoder).replace(
                                                 r"\"",
                                                 "'").replace('"', '"""'),
                                     ]
                                     print("\t".join(map(str, row)),
                                           file=out_files[build])
                     # NOTE(review): updates once per genotype_set, not once
                     # per ClinVarSet, despite unit="rcvs" -- confirm intent.
                     progress.update()
                 # Free the element's memory; essential for large XML inputs.
                 elem.clear()
             if self.max_rcvs and self.rcvs >= self.max_rcvs:
                 logger.info(
                     "Breaking out after processing %d RCVs (as configured)",
                     self.rcvs)
                 break
     logger.info("Done parsing elements")