def test1(self):
    """
    Test 1: an allele combining an insertion, a deletion, a substitution
    and a duplication.
    """
    reference = 'ATGATGATCAGATACAGTGTGATACAGGTAGTTAGACAA'
    sample = 'ATGATTTGATCAGATACATGTGATACCGGTAGTTAGGACAA'

    allele = describe_dna(reference, sample)

    assert str(allele) == '[5_6insTT;17del;26A>C;35dup]'
def _filter(variants, ref_seq1, ref_seq2):
    """
    Remove the variants that are explained by the reference differences.

    The two reference sequences are compared with the description
    extractor; any entry of `variants` that also occurs among those
    sequence-level variants is dropped.

    :arg variants: Variants to filter.
    :arg ref_seq1: Sequence used as the reference in the comparison.
    :arg ref_seq2: Sequence used as the observed in the comparison.
    :returns: New list with the remaining variants, in their original order.
    """
    extracted = extractor.describe_dna(ref_seq1, ref_seq2)
    sequences = {"reference": ref_seq1, "observed": ref_seq2}
    seq_variants = de_to_hgvs(extracted, sequences)

    remaining = []
    for variant in variants:
        if variant in seq_variants:
            continue
        remaining.append(variant)
    return remaining
def test2(self):
    """
    Test 2: three substitutions in one allele.
    """
    reference = 'TAAGCACCAGGAGTCCATGAAGAAGATGGCTCCTGCCATGGAATCCCCTACTCTACTGTG'
    sample = 'TAAGCACCAGGAGTCCATGAAGAAGCTGGATCCTCCCATGGAATCCCCTACTCTACTGTG'

    allele = describe_dna(reference, sample)

    assert str(allele) == '[26A>C;30C>A;35G>C]'
def test4(self):
    """
    Test 4: an inversion, a substitution and a deletion in one allele.
    """
    reference = 'TAAGCACCAGGAGTCCATGAAGAAGATGGCTCCTGCCATGGAATCCCCTACTCTA'
    sample = 'TAAGCACCAGGAGTCCATGAAGAAGCCATGTCCTGCCATGAATCCCCTACTCTA'

    allele = describe_dna(reference, sample)

    assert str(allele) == '[26_29inv;30C>G;41del]'
def _extract_hgvs_internal_model(obs_seq, r_model):
    """
    Describe `obs_seq` relative to the sequence stored in `r_model`.

    :arg obs_seq: The observed sequence.
    :arg r_model: Reference model; its sequence is read from
        ``r_model["sequence"]["seq"]``.
    :returns: The extractor output converted with `de_to_hgvs`.
    """
    reference = r_model["sequence"]["seq"]
    extracted = extractor.describe_dna(reference, obs_seq)
    sequences = {"reference": reference, "observed": obs_seq}
    return de_to_hgvs(extracted, sequences)
def test1(self):
    """
    Test 1: check the full allele description for a mixed set of changes.
    """
    expected = '[5_6insTT;17del;26A>C;35dup]'

    described = str(describe_dna(
        'ATGATGATCAGATACAGTGTGATACAGGTAGTTAGACAA',
        'ATGATTTGATCAGATACATGTGATACCGGTAGTTAGGACAA',
    ))

    assert described == expected
def _extract(self):
    """
    Build `self.de_model` from the reference and observed sequences.

    The model holds a deep copy of the internal indexing model's
    reference, the coordinate system marker ``"i"`` and the variants
    returned by the description extractor.
    """
    reference_model = copy.deepcopy(
        self.internal_indexing_model["reference"])

    sequences = self.references
    extracted_variants = describe_dna(
        sequences["reference"]["sequence"]["seq"],
        sequences["observed"]["sequence"]["seq"],
    )

    self.de_model = {
        "reference": reference_model,
        "coordinate_system": "i",
        "variants": extracted_variants,
    }
def description_extractor(reference, observed):
    """
    Extract a description of `observed` relative to `reference`.

    The raw extractor output is converted with `de_to_hgvs`, moved to
    internal indexing, mapped to HGVS locations with a ``"g"`` crossmap
    and finally rendered with `variants_to_description`.

    :arg reference: The reference sequence.
    :arg observed: The observed sequence.
    :returns: The result of `variants_to_description`.
    """
    raw_variants = extractor.describe_dna(reference, observed)
    sequences = {"reference": reference, "observed": observed}

    internal_variants = variants_to_internal_indexing(
        de_to_hgvs(raw_variants, sequences))

    located = locations_to_hgvs_locations(
        {"variants": internal_variants},
        crossmap_to_hgvs_setup("g"),
    )
    return variants_to_description(located["variants"])
def map_description(
    description,
    reference_id,
    selector_id=None,
    slice_to=None,
    clean=False,
):
    """
    Map a variant description onto another reference.

    The description is normalized to obtain its observed sequence, which
    is then described again relative to the reference identified by
    `reference_id`.

    :arg description: The input variant description.
    :arg reference_id: Id of the reference to map onto.
    :arg selector_id: Optional selector id in the target reference; when
        the selector is inverted both sequences taken from the input
        description are reverse complemented.
    :arg slice_to: Optional slice option, forwarded to
        `_get_reference_model`.
    :arg clean: When true, variants that stem from differences between
        the two reference sequences themselves are removed from the
        result.
    :returns: The output of `_get_description`, or a dictionary with an
        ``"errors"`` list when something went wrong.
    """
    # Get the observed sequence
    d = Description(description)
    d.normalize()
    if d.errors:
        return {"errors": d.errors}
    # Either condition alone means there is no observed sequence to work
    # with. Using `and` here let a references dictionary without an
    # "observed" entry fall through to a KeyError on the next statement.
    if not d.references or not d.references.get("observed"):
        return {"errors": [{"details": "No observed sequence or other error occured."}]}
    obs_seq = d.references["observed"]["sequence"]["seq"]

    r_model = retrieve_reference(reference_id)
    if r_model is None:
        return {"errors": [reference_not_retrieved(reference_id, [])]}

    ref_seq2 = d.references["reference"]["sequence"]["seq"]

    if selector_id:
        s_model = get_selector_model(r_model["annotations"], selector_id, True)
        if s_model is None:
            return {"errors": [no_selector_found(reference_id, selector_id, [])]}
        if s_model["inverted"]:
            obs_seq = reverse_complement(obs_seq)
            ref_seq2 = reverse_complement(ref_seq2)

    if slice_to:
        r_model = _get_reference_model(r_model, selector_id, slice_to)

    ref_seq1 = r_model["sequence"]["seq"]

    # Get the description extractor hgvs internal indexing variants
    variants = _extract_hgvs_internal_model(obs_seq, r_model)

    if clean:
        # Variants between the two reference sequences themselves.
        raw_de_variants = extractor.describe_dna(ref_seq1, ref_seq2)
        seq_variants = de_to_hgvs(
            raw_de_variants,
            {"reference": ref_seq1, "observed": ref_seq2},
        )
        # Filtering is only accepted when every reference-level variant
        # is present among the extracted ones.
        if [v for v in seq_variants if v not in variants]:
            return {
                "errors": [{"code": "EMAPFILTER", "details": "Unsuccessful filtering."}]
            }
        variants = [v for v in variants if v not in seq_variants]

    return _get_description(variants, r_model, selector_id)
def _single_variant(self, sample, expected):
    """
    General single variant test.

    Describes `sample` against a fixed reference and compares the first
    variant with `expected`, indexed as: type, start, end, sample_start,
    sample_end, deleted sequence, inserted sequence and string form.
    """
    reference = 'ACGTCGATTCGCTAGCTTCGGGGGATAGATAGAGATATAGAGAT'

    variant = describe_dna(reference, sample)[0]

    assert variant.type == expected[0]
    assert variant.start == expected[1]
    assert variant.end == expected[2]
    assert variant.sample_start == expected[3]
    assert variant.sample_end == expected[4]
    assert variant.deleted[0].sequence == expected[5]
    assert variant.inserted[0].sequence == expected[6]
    assert str(variant) == expected[7]
def description_extractor_submit():
    """
    The Variant Description Extractor (experimental service).

    There are multiple ways for the user to provide two sequences,
    corresponding to the values for the `reference_method` and
    `sample_method` fields, each requiring some additional fields to be
    defined:

    `raw_method`
      The reference and sample sequences are pasted into the form fields.

      - `reference_sequence`: The reference sequence.
      - `sample_sequence`: The sample sequence.

    `file_method`
      The reference and sample sequences are uploaded.

      - `reference_file`: The reference file.
      - `sample_file`: The sample file.

    `refseq_method`
      The reference and sample sequences are given by RefSeq accession
      numbers.

      - `reference_accession_number`: RefSeq accession number for the
        reference sequence.
      - `sample_accession_number`: RefSeq accession number for the sample
        sequence.
    """
    output = Output(__file__)
    output.addMessage(__file__, -1, 'INFO',
                      'Received Description Extract request from %s'
                      % request.remote_addr)

    stats.increment_counter('description-extractor/website')

    reference_method = request.form.get('reference_method')
    sample_method = request.form.get('sample_method')
    reference_sequence = request.form.get('reference_sequence')
    sample_sequence = request.form.get('sample_sequence')
    reference_file = request.files.get('reference_file')
    sample_file = request.files.get('sample_file')
    reference_accession_number = request.form.get('reference_accession_number')
    sample_accession_number = request.form.get('sample_accession_number')

    # The reference is read before the sample so that messages are added
    # to `output` in the same order as before.
    r, reference_filename = _read_extractor_input(
        output, 'Reference', reference_method, reference_accession_number,
        reference_file, reference_sequence)
    s, sample_filename = _read_extractor_input(
        output, 'Sample', sample_method, sample_accession_number,
        sample_file, sample_sequence)

    # Todo: Move this to the describe module.
    if not r or not util.is_dna(r):
        output.addMessage(__file__, 3, 'ENODNA',
                          'Reference sequence is not DNA.')
    if not s or not util.is_dna(s):
        output.addMessage(__file__, 3, 'ENODNA',
                          'Sample sequence is not DNA.')

    raw_vars = None
    if r and s:
        if (len(r) > settings.EXTRACTOR_MAX_INPUT_LENGTH or
                len(s) > settings.EXTRACTOR_MAX_INPUT_LENGTH):
            output.addMessage(__file__, 3, 'EMAXSIZE',
                              'Input sequences are restricted to {:,} bp.'
                              .format(settings.EXTRACTOR_MAX_INPUT_LENGTH))
        else:
            raw_vars = extractor.describe_dna(r, s)

    errors, warnings, summary = output.Summary()
    # The messages are collected before the closing INFO message is added.
    messages = map(util.message_info, output.getMessages())

    output.addMessage(__file__, -1, 'INFO',
                      'Finished Description Extract request')

    return render_template(
        'description-extractor.html',
        extractor_max_input_length=settings.EXTRACTOR_MAX_INPUT_LENGTH,
        reference_sequence=reference_sequence or '',
        sample_sequence=sample_sequence or '',
        reference_accession_number=reference_accession_number or '',
        sample_accession_number=sample_accession_number or '',
        reference_filename=reference_filename or '',
        sample_filename=sample_filename or '',
        raw_vars=raw_vars,
        errors=errors,
        summary=summary,
        messages=messages,
        reference_method=reference_method,
        sample_method=sample_method)


def _read_extractor_input(output, label, method, accession_number, upload,
                          raw_sequence):
    """
    Read one input sequence for the description extractor form.

    :arg output: Output object that receives an `EEMPTYFIELD` message when
        the field required by `method` is empty.
    :arg label: Capitalized name used in the messages (`Reference` or
        `Sample`).
    :arg method: Input method; anything other than `refseq_method` or
        `file_method` is handled as `raw_method`.
    :arg accession_number: Accession number, used for `refseq_method`.
    :arg upload: Uploaded file, used for `file_method`.
    :arg raw_sequence: Pasted sequence, used for `raw_method`.
    :returns: Tuple `(sequence, filename)`; both are empty strings when
        nothing could be read, and `filename` is only set for uploads.
    """
    sequence = ''
    filename = ''

    if method == 'refseq_method':
        if accession_number:
            retriever = Retriever.GenBankRetriever(output)
            genbank_record = retriever.loadrecord(accession_number)
            if genbank_record:
                sequence = unicode(genbank_record.seq)
        else:
            output.addMessage(__file__, 3, 'EEMPTYFIELD',
                              '%s accession number input fields is empty.'
                              % label)
    elif method == 'file_method':
        if upload:
            filename = upload.filename
            sequence = util.read_dna(upload)
        else:
            output.addMessage(__file__, 3, 'EEMPTYFIELD',
                              'No %s file provided.' % label.lower())
    else:  # raw_method
        if raw_sequence:
            sequence = util.read_dna(StringIO.StringIO(raw_sequence))
        else:
            output.addMessage(__file__, 3, 'EEMPTYFIELD',
                              '%s sequence number input fields is empty.'
                              % label)

    return sequence, filename
def name_checker():
    """
    Name checker.

    Handles the legacy query arguments with a redirect, runs the variant
    checker on the requested description and renders the result page.
    """
    query = request.args

    # For backwards compatibility, the `name` argument is supported as an
    # alias for `description`.
    if 'name' in query:
        target = url_for('.name_checker',
                         description=query['name'],
                         standalone=query.get('standalone'))
        return redirect(target, code=301)

    # For backwards compatibility with older LOVD versions, the
    # `mutationName` argument is supported. If present, we redirect and
    # add `standalone=1`.
    if 'mutationName' in query:
        target = url_for('.name_checker',
                         description=query['mutationName'],
                         standalone=1)
        return redirect(target, code=301)

    description = query.get('description')
    if not description:
        return render_template('name-checker.html')

    output = Output(__file__)
    output.addMessage(__file__, -1, 'INFO', 'Received variant %s from %s'
                      % (description, request.remote_addr))

    stats.increment_counter('name-checker/website')

    variantchecker.check_variant(description, output)

    errors, warnings, summary = output.Summary()
    parse_error = output.getOutput('parseError')

    record_type = output.getIndexedOutput('recordType', 0, '')
    reference = output.getIndexedOutput('reference', 0, '')

    reference_filename = None
    if reference:
        extension = '.xml' if record_type == 'LRG' else '.gb'
        reference_filename = reference + extension

    genomic_dna = output.getIndexedOutput('molType', 0) != 'n'
    genomic_description = output.getIndexedOutput('genomicDescription', 0, '')

    # Create a link to the UCSC Genome Browser.
    browser_link = None
    raw_variants = output.getIndexedOutput('rawVariantsChromosomal', 0)
    if raw_variants:
        positions = []
        for _descr, (first, last) in raw_variants[2]:
            positions.extend((first, last))
        bed_url = url_for('.bed', description=description, _external=True)
        link_template = ('http://genome.ucsc.edu/cgi-bin/hgTracks?db=hg19&'
                         'position={chromosome}:{start}-{stop}&hgt.customText='
                         '{bed_file}')
        browser_link = link_template.format(
            chromosome=raw_variants[0],
            start=min(positions) - 10,
            stop=max(positions) + 10,
            bed_file=urllib.quote(bed_url))

    # Experimental description extractor.
    extracted = ''
    if (output.getIndexedOutput('original', 0) and
            output.getIndexedOutput('mutated', 0)):
        allele = extractor.describe_dna(
            output.getIndexedOutput('original', 0),
            output.getIndexedOutput('mutated', 0))
        extracted = unicode(allele) if allele else '(skipped)'

    first_of = output.getIndexedOutput
    all_of = output.getOutput

    # Todo: Generate the fancy HTML views for the proteins here instead of in
    #   `mutalyzer.variantchecker`.
    arguments = {
        'description': description,
        'messages': map(util.message_info, output.getMessages()),
        'summary': summary,
        'parse_error': parse_error,
        'errors': errors,
        'genomicDescription': genomic_description,
        'chromDescription': first_of('genomicChromDescription', 0),
        'genomicDNA': genomic_dna,
        'visualisation': all_of('visualisation'),
        'descriptions': all_of('descriptions'),
        'protDescriptions': all_of('protDescriptions'),
        'oldProtein': all_of('oldProteinFancy'),
        'altStart': first_of('altStart', 0),
        'altProtein': all_of('altProteinFancy'),
        'newProtein': all_of('newProteinFancy'),
        'transcriptInfo': first_of('hasTranscriptInfo', 0, False),
        'transcriptCoding': first_of('transcriptCoding', 0, False),
        'exonInfo': all_of('exonInfo'),
        'cdsStart_g': first_of('cdsStart_g', 0),
        'cdsStart_c': first_of('cdsStart_c', 0),
        'cdsStop_g': first_of('cdsStop_g', 0),
        'cdsStop_c': first_of('cdsStop_c', 0),
        'restrictionSites': all_of('restrictionSites'),
        'legends': all_of('legends'),
        # Todo: Download link is not shown...
        'reference_filename': reference_filename,
        'browserLink': browser_link,
        'extractedDescription': extracted,
        'standalone': bool(query.get('standalone')),
    }

    output.addMessage(__file__, -1, 'INFO',
                      'Finished variant %s' % description)

    return render_template('name-checker.html', **arguments)
def check_name(description): """ Run the name checker. """ O = output.Output(__file__) O.addMessage(__file__, -1, "INFO", "Received variant " + description) RD = variantchecker.check_variant(description, O) O.addMessage(__file__, -1, "INFO", "Finished processing variant " + description) ### OUTPUT BLOCK ### gn = O.getOutput("genename") if gn : print "Gene Name: " + gn[0] tv = O.getOutput("transcriptvariant") if tv : print "Transcript variant: " + tv[0] print #if for i in O.getMessages() : print i errors, warnings, summary = O.Summary() print summary print if not errors: print "Overview of the raw variants:" for i in O.getOutput("visualisation"): for j in range(len(i)): print i[j] print #for print "Genomic description:" print O.getIndexedOutput('genomicDescription', 0, '') print "\nChromosomal description:" print O.getOutput("genomicChromDescription") print "\nAffected transcripts:" for i in O.getOutput('descriptions'): print i print "\nAffected proteins:" for i in O.getOutput('protDescriptions'): print i print "\nOld protein:" for i in O.getOutput("oldProteinFancyText"): print i print "\nNew protein:" for i in O.getOutput("newProteinFancyText"): print i print "\nAlternative protein:" for i in O.getOutput("altProteinFancyText"): print i print "\nExon information:" for i in O.getOutput("exonInfo") : print i print "\nCDS information:" print O.getOutput("cdsStart_c"), O.getOutput("cdsStop_c") print O.getOutput("cdsStart_g"), O.getOutput("cdsStop_g") print "\nEffect on Restriction sites:" for i in O.getOutput("restrictionSites") : print i print "\nLegend:" for i in O.getOutput("legends") : print i reference_sequence = O.getIndexedOutput("original", 0) sample_sequence = O.getIndexedOutput("mutated", 0) described_allele = extractor.describe_dna(reference_sequence, sample_sequence) #described_protein_allele = describe.describe( # O.getIndexedOutput("oldprotein", 0), # O.getIndexedOutput("newprotein", 0, default=""), # DNA=False) described_protein_allele = "" described = 
described_protein = '(skipped)' if described_allele: described = described_allele if described_protein_allele: described_protein = described_protein_allele print "\nExperimental services:" print described print described_protein #print "+++ %s" % O.getOutput("myTranscriptDescription") print json.dumps({ #"reference_sequence": reference_sequence, #"sample_sequence": sample_sequence, "allele_description": described_allele}, cls=AlleleEncoder)
def check_name(description): """ Run the name checker. """ O = output.Output(__file__) O.addMessage(__file__, -1, "INFO", "Received variant " + description) RD = variantchecker.check_variant(description, O) O.addMessage(__file__, -1, "INFO", "Finished processing variant " + description) ### OUTPUT BLOCK ### gn = O.getOutput("genename") if gn: print "Gene Name: " + gn[0] tv = O.getOutput("transcriptvariant") if tv: print "Transcript variant: " + tv[0] print #if for i in O.getMessages(): print i errors, warnings, summary = O.Summary() print summary print if not errors: print "Overview of the raw variants:" for i in O.getOutput("visualisation"): for j in range(len(i)): print i[j] print #for print "Genomic description:" print O.getIndexedOutput('genomicDescription', 0, '') print "\nChromosomal description:" print O.getOutput("genomicChromDescription") print "\nAffected transcripts:" for i in O.getOutput('descriptions'): print i print "\nAffected proteins:" for i in O.getOutput('protDescriptions'): print i print "\nOld protein:" for i in O.getOutput("oldProteinFancyText"): print i print "\nNew protein:" for i in O.getOutput("newProteinFancyText"): print i print "\nAlternative protein:" for i in O.getOutput("altProteinFancyText"): print i print "\nExon information:" for i in O.getOutput("exonInfo"): print i print "\nCDS information:" print O.getOutput("cdsStart_c"), O.getOutput("cdsStop_c") print O.getOutput("cdsStart_g"), O.getOutput("cdsStop_g") print "\nEffect on Restriction sites:" for i in O.getOutput("restrictionSites"): print i print "\nLegend:" for i in O.getOutput("legends"): print i reference_sequence = O.getIndexedOutput("original", 0) sample_sequence = O.getIndexedOutput("mutated", 0) described_allele = extractor.describe_dna(reference_sequence, sample_sequence) #described_protein_allele = describe.describe( # O.getIndexedOutput("oldProtein", 0), # O.getIndexedOutput("newProtein", 0, default=""), # DNA=False) described_protein_allele = "" described = 
described_protein = '(skipped)' if described_allele: described = described_allele if described_protein_allele: described_protein = described_protein_allele print "\nExperimental services:" print described print described_protein #print "+++ %s" % O.getOutput("myTranscriptDescription") print json.dumps( { #"reference_sequence": reference_sequence, #"sample_sequence": sample_sequence, "allele_description": described_allele }, cls=AlleleEncoder)
def rsid_search(tile_index, variant, suppress_output=True):
    """
    Look up candidate RSIDs for the mutations of one tile variant.

    The selected variant of the tile is compared with the tile's first
    (common) variant using the description extractor. Every resulting
    change is turned into a mutation, offset by the tile start position
    and queried for matching RSIDs.

    :arg tile_index: Index passed to `application.Tile`.
    :arg variant: Index of the tile variant to inspect; converted with
        `int`.
    :arg suppress_output: When false, progress and results are printed.
    :returns: Dictionary with the keys `tile_sequence`, `common_sequence`
        (the full variant records), `variant`, `index` and `mutations`.
        The latter maps `repr(mutation)` to a list of
        `(rsid, location, ref, alt)` tuples.
    :raises SystemExit: When the selected variant equals the common one.
    """
    results_dict = {}
    tile = application.Tile(tile_index)
    varval = int(variant)

    # retrieve tile information using getTileVariants script
    info_tile = getTileVariants.tile_iteration(tile, suppress_output=True,
                                               all_functionality=True)

    # break up output into common variant (.000) and specifically chosen
    # variant
    variant_records = info_tile.variants.split('\n')
    tile_variant = variant_records[varval]
    common_variant = variant_records[0]
    if tile_variant == common_variant:
        print("No difference between tiles. Exiting...")
        sys.exit()

    # retrieve sequence, ignoring hash + id
    tile_seq = tile_variant.split(',')[2]
    common_seq = common_variant.split(',')[2]

    results_dict['tile_sequence'] = tile_variant
    results_dict['common_sequence'] = common_variant
    results_dict['variant'] = variant
    results_dict['index'] = tile_index
    results_dict['mutations'] = {}

    if not suppress_output:
        print("Common sequence: {}".format(common_seq))
        print("Variant sequence: {}".format(tile_seq))

    # delete spanning tile parts if necessary
    if len(tile_seq) - len(common_seq) >= 24:
        tile_seq = tile_seq[:len(common_seq)]
        if not suppress_output:
            print("Detected spanning tile. Deleting extra part...")
    elif len(common_seq) - len(tile_seq) >= 24:
        common_seq = common_seq[:len(tile_seq)]
        if not suppress_output:
            print("Detected spanning tile. Deleting extra part...")

    # run mutalyzer description extractor for alignment data
    allele = describe_dna(common_seq, tile_seq)

    # break up into list if needed
    changes = str(allele).replace('[', '').replace(']', '').split(';')

    # Build a real list: a lazy `map` object (Python 3) would be exhausted
    # after its first traversal, leaving nothing to report below.
    mutations = [get_mutation(change, info_tile, common_seq)
                 for change in changes]

    # offset all mutations; an explicit loop guarantees the calls are
    # actually executed, which a discarded lazy `map` does not.
    position_start = info_tile.to_dict()['position_start']
    for mutation in mutations:
        mutation.offset(position_start)

    # store all the queries, keeping each one paired with its mutation so
    # that dropping empty results cannot shift queries onto the wrong
    # mutation.
    # NOTE(review): rsid_query() presumably returns a list of
    # space-separated record strings -- confirm against its definition.
    mutations_lst = [(mutation, mutation.rsid_query())
                     for mutation in mutations]
    mutations_lst = [(mutation, rsid_lst)
                     for mutation, rsid_lst in mutations_lst
                     if rsid_lst != '']

    # print results
    for mutation, rsid_lst in mutations_lst:
        if not suppress_output:
            print('---')
            print("Mutation: {}".format(mutation))
            print("Representation: {}".format(repr(mutation)))
        mutation_info = []
        if len(rsid_lst) > 0 and rsid_lst[0] != '':
            if not suppress_output:
                print("Possible SNP RSIDS:")
            for rsid_query in rsid_lst:
                chrom, rsid, location, ref, alt = rsid_query.split(" ")
                result_str = "RSID: {}; Location: {}, REF: {}, ALT: {}".format(
                    rsid, location, ref, alt)
                if not suppress_output:
                    print(result_str)
                mutation_info.append((rsid, location, ref, alt))
        else:
            if not suppress_output:
                print("No possible SNPs found")
        results_dict['mutations'][repr(mutation)] = mutation_info
    return results_dict