def random_variants(count, ensembl_release=MAX_ENSEMBL_RELEASE, deletions=True, insertions=True, random_seed=None): """ Generate a VariantCollection with random variants that overlap at least one complete coding transcript. """ rng = random.Random(random_seed) ensembl = EnsemblRelease(ensembl_release) if ensembl_release in _transcript_ids_cache: transcript_ids = _transcript_ids_cache[ensembl_release] else: transcript_ids = ensembl.transcript_ids() _transcript_ids_cache[ensembl_release] = transcript_ids variants = [] while len(variants) < count: transcript_id = rng.choice(transcript_ids) transcript = ensembl.transcript_by_id(transcript_id) if not transcript.complete: continue exon = rng.choice(transcript.exons) base1_genomic_position = rng.randint(exon.start, exon.end) transcript_offset = transcript.spliced_offset(base1_genomic_position) try: seq = transcript.sequence except ValueError as e: logging.warn(e) # can't get sequence for non-coding transcripts continue ref = str(seq[transcript_offset]) if transcript.on_backward_strand: ref = reverse_complement(ref) alt_nucleotides = [x for x in STANDARD_NUCLEOTIDES if x != ref] if insertions: nucleotide_pairs = [ x + y for x in STANDARD_NUCLEOTIDES for y in STANDARD_NUCLEOTIDES ] alt_nucleotides.extend(nucleotide_pairs) if deletions: alt_nucleotides.append("") alt = rng.choice(alt_nucleotides) variant = Variant(transcript.contig, base1_genomic_position, ref=ref, alt=alt, ensembl=ensembl) variants.append(variant) return VariantCollection(variants)
def random_variants(count, ensembl_release=MAX_ENSEMBL_RELEASE, deletions=True, insertions=True, random_seed=None): """ Generate a VariantCollection with random variants that overlap at least one complete coding transcript. """ rng = random.Random(random_seed) ensembl = EnsemblRelease(ensembl_release) if ensembl_release in _transcript_ids_cache: transcript_ids = _transcript_ids_cache[ensembl_release] else: transcript_ids = ensembl.transcript_ids() _transcript_ids_cache[ensembl_release] = transcript_ids variants = [] # we should finish way before this loop is over but just in case # something is wrong with PyEnsembl we want to avoid an infinite loop for _ in range(count * 100): if len(variants) < count: transcript_id = rng.choice(transcript_ids) transcript = ensembl.transcript_by_id(transcript_id) if not transcript.complete: continue exon = rng.choice(transcript.exons) base1_genomic_position = rng.randint(exon.start, exon.end) transcript_offset = transcript.spliced_offset( base1_genomic_position) seq = transcript.sequence ref = str(seq[transcript_offset]) if transcript.on_backward_strand: ref = reverse_complement(ref) alt_nucleotides = [x for x in STANDARD_NUCLEOTIDES if x != ref] if insertions: nucleotide_pairs = [ x + y for x in STANDARD_NUCLEOTIDES for y in STANDARD_NUCLEOTIDES ] alt_nucleotides.extend(nucleotide_pairs) if deletions: alt_nucleotides.append("") alt = rng.choice(alt_nucleotides) variant = Variant(transcript.contig, base1_genomic_position, ref=ref, alt=alt, ensembl=ensembl) variants.append(variant) else: return VariantCollection(variants) raise ValueError(("Unable to generate %d random variants, " "there may be a problem with PyEnsembl") % count)
def random_variants( count, ensembl_release=MAX_ENSEMBL_RELEASE, deletions=True, insertions=True, random_seed=None): """ Generate a VariantCollection with random variants that overlap at least one complete coding transcript. """ rng = random.Random(random_seed) ensembl = EnsemblRelease(ensembl_release) if ensembl_release in _transcript_ids_cache: transcript_ids = _transcript_ids_cache[ensembl_release] else: transcript_ids = ensembl.transcript_ids() _transcript_ids_cache[ensembl_release] = transcript_ids variants = [] while len(variants) < count: transcript_id = rng.choice(transcript_ids) transcript = ensembl.transcript_by_id(transcript_id) if not transcript.complete: continue exon = rng.choice(transcript.exons) base1_genomic_position = rng.randint(exon.start, exon.end) transcript_offset = transcript.spliced_offset(base1_genomic_position) try: seq = transcript.sequence except ValueError as e: logging.warn(e) # can't get sequence for non-coding transcripts continue ref = str(seq[transcript_offset]) if transcript.on_backward_strand: ref = reverse_complement(ref) alt_nucleotides = [x for x in STANDARD_NUCLEOTIDES if x != ref] if insertions: nucleotide_pairs = [ x + y for x in STANDARD_NUCLEOTIDES for y in STANDARD_NUCLEOTIDES ] alt_nucleotides.extend(nucleotide_pairs) if deletions: alt_nucleotides.append("") alt = rng.choice(alt_nucleotides) variant = Variant( transcript.contig, base1_genomic_position, ref=ref, alt=alt, ensembl=ensembl) variants.append(variant) return VariantCollection(variants)
def random_variants( count, ensembl_release=MAX_ENSEMBL_RELEASE, deletions=True, insertions=True, random_seed=None): """ Generate a VariantCollection with random variants that overlap at least one complete coding transcript. """ rng = random.Random(random_seed) ensembl = EnsemblRelease(ensembl_release) if ensembl_release in _transcript_ids_cache: transcript_ids = _transcript_ids_cache[ensembl_release] else: transcript_ids = ensembl.transcript_ids() _transcript_ids_cache[ensembl_release] = transcript_ids variants = [] # we should finish way before this loop is over but just in case # something is wrong with PyEnsembl we want to avoid an infinite loop for _ in range(count * 100): if len(variants) < count: transcript_id = rng.choice(transcript_ids) transcript = ensembl.transcript_by_id(transcript_id) if not transcript.complete: continue exon = rng.choice(transcript.exons) base1_genomic_position = rng.randint(exon.start, exon.end) transcript_offset = transcript.spliced_offset(base1_genomic_position) seq = transcript.sequence ref = str(seq[transcript_offset]) if transcript.on_backward_strand: ref = reverse_complement(ref) alt_nucleotides = [x for x in STANDARD_NUCLEOTIDES if x != ref] if insertions: nucleotide_pairs = [ x + y for x in STANDARD_NUCLEOTIDES for y in STANDARD_NUCLEOTIDES ] alt_nucleotides.extend(nucleotide_pairs) if deletions: alt_nucleotides.append("") alt = rng.choice(alt_nucleotides) variant = Variant( transcript.contig, base1_genomic_position, ref=ref, alt=alt, ensembl=ensembl) variants.append(variant) else: return VariantCollection(variants) raise ValueError( ("Unable to generate %d random variants, " "there may be a problem with PyEnsembl") % count)