def get_transcript_ids_for_gene(gene_name):
    """Return a JSON response describing the transcripts of a gene.

    For every transcript the repository returns, the response lists its
    sequence length (``aa_length``), its GENCODE transcription id, the
    comma-separated RefSeq ``NM`` identifiers and whether a protein id is
    attached. A human-readable ``message`` states whether anything was found.
    """
    _log.debug('get_transcript_ids_for_gene')

    # look up every transcript stored for this gene
    found = GeneRepository.retrieve_all_transcript_ids(gene_name)

    # phrase the message according to whether the lookup yielded anything
    if len(found) == 0:
        message = ("No transcripts available in database for gene '" +
                   gene_name + "'")
    else:
        message = ("Retrieved transcripts for gene '" + found[0].gene_name +
                   "'")

    transcript_results = []
    for transcript in found:
        # collect the RefSeq identifiers that match this transcript
        refseq_ids = retrieve_refseq_identifiers_for_transcript(
            transcript.gencode_transcription_id)
        nm_numbers = ", ".join(refseq_ids['NM'])

        transcript_results.append({
            'aa_length': transcript.sequence_length,
            'gencode_id': transcript.gencode_transcription_id,
            'refseq_nm_numbers': nm_numbers,
            'has_protein_data': transcript.protein_id is not None,
        })

    # NOTE: the keyword 'trancript_ids' (sic) is the key emitted in the JSON
    # payload, so its spelling has to stay as it is.
    return jsonify(trancript_ids=transcript_results, message=message)
def test_session_always_removed(mock_create_session):
    """Check that the repository removes its session on failure and on success.

    ``mock_create_session`` is the patched session factory; its return value
    is swapped for small fake sessions that record whether ``remove()`` was
    called.
    """

    class FailSession:
        """Fake session whose query always raises."""

        def __init__(self):
            self.removed = False

        def query(self, id_):
            raise Exception("test fail")

        def remove(self):
            self.removed = True

    class Allable:
        def all(self):
            return []

    class Filterable:
        def filter(self, id_):
            return Allable()

    class SuccessSession:
        """Fake session whose query chain yields an empty result."""

        def __init__(self):
            self.removed = False

        def query(self, id_):
            return Filterable()

        def remove(self):
            self.removed = True

    # Test that it removes upon failure:
    session = FailSession()
    mock_create_session.return_value = session
    try:
        GeneRepository.retrieve_all_transcript_ids_with_mappings()
    except Exception:
        # The failure itself is expected here; only the clean-up is under test.
        pass
    # Asserted outside the handler so the check cannot be skipped when the
    # repository does not propagate the exception.
    ok_(session.removed)

    # Test that it removes upon success:
    session = SuccessSession()
    mock_create_session.return_value = session
    GeneRepository.retrieve_all_transcript_ids_with_mappings()
    ok_(session.removed)
def test_raises_recoverable_error(mock_create_session):
    """Run the repository lookup against a session whose query raises
    ``OperationalError``.

    The body asserts nothing itself; presumably the expected exception is
    declared by a decorator outside this block (confirm against the file).
    """

    class BrokenSession:
        """Fake session that fails on every query."""

        def query(self, id_):
            raise OperationalError('test fail')

        def remove(self):
            pass

    mock_create_session.return_value = BrokenSession()
    GeneRepository.retrieve_all_transcript_ids_with_mappings()
def write_all_genes_names_to_disk():
    """Dump every gene name from the database into ``GENE_NAMES_FILE``.

    The names are written in sorted order, one per line. Any existing file is
    removed first; a file that cannot be removed (e.g. because it does not
    exist) is silently tolerated.
    """
    # fetch the names first, so a failing lookup leaves the old file untouched
    ordered_names = list(GeneRepository.retrieve_all_gene_names_from_db())
    ordered_names.sort()

    # best-effort removal of the previous file
    try:
        os.remove(GENE_NAMES_FILE)
    except OSError:
        pass

    # emit one gene name per line
    with open(GENE_NAMES_FILE, 'w') as out_file:
        out_file.writelines("%s\n" % name for name in ordered_names)
def test_logs_error(mock_log_error, mock_create_session):
    """Check that a failing session query gets logged by the repository.

    ``mock_create_session`` is the patched session factory and
    ``mock_log_error`` the patched error-logging callable.
    """
    error_message = "test fail"

    class FailSession:
        """Fake session that fails on every query."""

        def query(self, id_):
            raise Exception(error_message)

        def remove(self):
            # Present for parity with the other fake sessions in this file,
            # which all provide remove().
            pass

    # Install the failing session; without this the failure path is never
    # exercised and the test passes vacuously.
    mock_create_session.return_value = FailSession()

    try:
        GeneRepository.retrieve_all_transcript_ids_with_mappings()
    except Exception as e:
        eq_(str(e), error_message)

    # `assert_called` is a bound method and therefore always truthy; the
    # `called` flag is what actually tells whether the logger was invoked.
    ok_(mock_log_error.called)
def generate_pfam_aligned_codons(pfam_id):
    """
    Generates a dictionary (meta_codons_per_consensus_pos) that maps each
    domain consensus position to the list of codons aligned to it.

    Returns a tuple (meta_codons_per_consensus_pos, consensus_length,
    n_instances). When the alignment file yields no output, the tuple is
    ({}, 0, 0).

    Raises MalformedMappingsForAlignedCodonsPosition when a codon cannot be
    built from the mappings of a position.
    """
    _log.info("Started a meta-domain based on the alignment of all '" +
              pfam_id + "' Pfam domains in the human genome")
    # time.clock() was removed in Python 3.8; perf_counter() is the
    # replacement for measuring elapsed time
    start_time = time.perf_counter()

    # the consensus length
    consensus_length = 0
    # the meta_domain that is to be returned
    meta_codons_per_consensus_pos = {}
    # the mapping of the protein
    # {uniprot_ac: {uniprot_position: [consensus_position, ...]}}
    consensus_pos_per_protein = {}
    # the amount of domain occurrences found
    n_instances = 0

    # retrieve the alignment
    hmmeralign_output = interpret_hmm_alignment_file(
        METADOMAIN_DIR + pfam_id + '/' + METADOMAIN_ALIGNMENT_FILE_NAME)

    if not len(hmmeralign_output) == 0:
        consensus = hmmeralign_output['consensus']

        # update the consensus length
        consensus_length = len(consensus['sequence'])
        # update the number of instances
        n_instances = len(hmmeralign_output['alignments'])
        _log.debug(
            "Creating the alignment of mappings for '" + str(n_instances) +
            "' '" + pfam_id +
            "' domain occurrences based on the HMM alignment to consensus and original domain sequence"
        )

        # ensure we can map consensus residues back to consensus positions
        consensus['aligned_sequence'] = \
            convert_pfam_fasta_alignment_to_original_aligned_sequence(
                consensus['alignment'])
        consensus['mapping_consensus_alignment_to_positions'] = \
            map_sequence_to_aligned_sequence(consensus['sequence'],
                                             consensus['aligned_sequence'])

        # create mappings between domain occurrences and the domain
        # consensus sequence
        for _alignment in hmmeralign_output['alignments']:
            # Create a mapping from the aligned domain sequence to the
            # domain sequence
            aligned_sequence = \
                convert_pfam_fasta_alignment_to_original_aligned_sequence(
                    _alignment['alignment'])
            original_sequence = \
                convert_pfam_fasta_alignment_to_strict_sequence(
                    aligned_sequence)
            mapping_domain_alignment_to_sequence_positions = \
                map_sequence_to_aligned_sequence(original_sequence,
                                                 aligned_sequence)

            # Generate the strict sequence for this domain; leaving only
            # residues that were aligned to the domain consensus
            strict_aligned_sequence = \
                convert_pfam_fasta_alignment_to_strict_fasta(
                    _alignment['alignment'])

            # create the mapping between the strict alignments and the
            # original consensus sequence
            mapping_aligned_domain_to_domain_consensus = \
                createAlignedSequenceMapping(strict_aligned_sequence,
                                             consensus['aligned_sequence'],
                                             False)

            # all mapping positions, including insertions: the union of the
            # keys of both mappings
            mapping_positions = (
                set(mapping_domain_alignment_to_sequence_positions.keys()) |
                set(mapping_aligned_domain_to_domain_consensus.keys()))

            # add each aligned residue mapping
            for mapping_pos in sorted(mapping_positions):
                if consensus['aligned_sequence'][mapping_pos] == '-':
                    # gap in the aligned consensus: no consensus position
                    # to attach this residue to
                    continue

                # retrieve the position in the domain consensus
                domain_consensus_pos = consensus[
                    'mapping_consensus_alignment_to_positions'][mapping_pos]

                # retrieve the position in the domain sequence
                ref_pos = mapping_domain_alignment_to_sequence_positions[
                    mapping_pos]

                # convert the position in the domain sequence to the
                # uniprot position
                uniprot_pos = int(_alignment['start_pos']) + ref_pos - 1

                # Add the consensus pos to the protein; entries are only
                # created once there is a position to record
                consensus_pos_per_protein.setdefault(
                    _alignment['uniprot_ac'], {}).setdefault(
                        uniprot_pos, []).append(domain_consensus_pos)

        # now incorporate the alignment data into our domain model in form
        # of mappings
        # First get the protein ids for the uniprot acs
        uniprot_acs_to_ids = \
            ProteinRepository.retrieve_protein_id_for_multiple_protein_acs(
                list(consensus_pos_per_protein.keys()))
        protein_ids = [
            int(y) for y in np.unique(list(uniprot_acs_to_ids.values()))
        ]

        # Second, get all mappings for these proteins
        protein_mappings = \
            MappingRepository.get_mappings_for_multiple_protein_ids(
                protein_ids)

        # retrieve all transcripts mapped to these protein_ids
        gene_ids = \
            GeneRepository.retrieve_transcript_id_for_multiple_protein_ids(
                protein_ids)

        # create all aligned codons
        meta_codons_per_consensus_pos = {}
        for uniprot_ac, positions in consensus_pos_per_protein.items():
            for uniprot_pos, consensus_positions in positions.items():
                for domain_consensus_pos in consensus_positions:
                    # Retrieve the mapping for the corresponding
                    # uniprot_position
                    mappings_for_uniprot_pos = [
                        x for x in protein_mappings[
                            uniprot_acs_to_ids[uniprot_ac]]
                        if x.uniprot_position == uniprot_pos
                    ]

                    # Separate the mappings per gene_id
                    mapping_per_gene_id = {}
                    for mapping in mappings_for_uniprot_pos:
                        mapping_per_gene_id.setdefault(
                            mapping.gene_id, []).append(mapping)

                    for gene_id, mappings in mapping_per_gene_id.items():
                        try:
                            # create a codon
                            codon = Codon.initializeFromMapping(
                                mappings, gene_ids[gene_id], uniprot_ac)
                        except MalformedCodonException as e:
                            raise MalformedMappingsForAlignedCodonsPosition(
                                "Encountered a malformed codon mapping for domain '"
                                + str(pfam_id) + "' in gene '" + str(gene_id) +
                                "', at amino_acid_position '" +
                                str(uniprot_pos) + "':" + str(e)) from e

                        # Add the codon to the consensus positions
                        meta_codons_per_consensus_pos.setdefault(
                            domain_consensus_pos, []).append(codon)

    time_step = time.perf_counter()
    _log.info("Finished the alignment of mappings for '" + str(n_instances) +
              "' instances '" + pfam_id + "' domain occurrences in " +
              str(time_step - start_time) + " seconds")
    return meta_codons_per_consensus_pos, consensus_length, n_instances
def analyse_transcript(transcript_id):
    """Build the per-position annotation of a transcript.

    Returns a dictionary with the keys ``transcript_id``, ``refseq_ids``,
    ``protein_ac``, ``gene_name``, ``positional_annotation`` and ``domains``.
    When the gene cannot be retrieved (or no gene region is available) a
    dictionary holding a single ``error`` message is returned instead.
    """
    # Retrieve the gene from the database
    try:
        gene = GeneRepository.retrieve_gene(transcript_id)
    except RepositoryException as e:
        return {
            'error':
            'No gene region could be build for transcript {}, reason: {}'.
            format(transcript_id, e)
        }

    # build the gene region
    gene_region = GeneRegion(gene)

    # Retrieve the refseq ids
    refseq_ids = retrieve_refseq_identifiers_for_transcript(transcript_id)

    # Defensive guard kept from the original code; presumably GeneRegion is a
    # class, in which case this never triggers (TODO confirm).
    if gene_region is None:
        return {
            'error':
            'No gene region could be build for transcript ' +
            str(transcript_id)
        }

    # generate the positional annotation for this gene by first computing
    # the tolerance landscape
    region_positional_annotation = compute_tolerance_landscape(
        gene_region, flask_app.config['SLIDING_WINDOW_SIZE'],
        flask_app.config['ALLELE_FREQUENCY_CUTOFF'])

    # Annotate Pfam domains
    Pfam_domains = []
    meta_domains = {}
    for domain in gene_region.interpro_domains:
        if not domain.ext_db_id.startswith('PF'):
            # not a Pfam domain
            continue

        pfam_domain = {
            "ID": domain.ext_db_id,
            "Name": domain.region_name,
            "start": domain.uniprot_start,
            "stop": domain.uniprot_stop,
        }
        domain_id = pfam_domain['ID']

        try:
            if domain_id not in meta_domains:
                # construct a meta-domain if possible
                temp_meta_domain = MetaDomain.initializeFromDomainID(
                    domain.ext_db_id)

                # Ensure there are enough instances to actually perform the
                # metadomain trick
                if temp_meta_domain.n_instances < 2:
                    pfam_domain["metadomain"] = False
                    meta_domains[domain_id] = None
                else:
                    pfam_domain["metadomain"] = True
                    meta_domains[domain_id] = temp_meta_domain
                    pfam_domain['meta_domain_alignment_depth'] = \
                        temp_meta_domain.get_max_alignment_depth()
            else:
                # same domain id seen before: reuse the earlier outcome
                pfam_domain["metadomain"] = meta_domains[
                    domain_id] is not None
        except UnsupportedMetaDomainIdentifier as e:
            _log.error(str(e))
            # meta domain is not possible; set the flag explicitly so every
            # domain entry carries the "metadomain" key
            pfam_domain["metadomain"] = False
            meta_domains[domain_id] = None

        # Add the domain to the domain list
        Pfam_domains.append(pfam_domain)

    # retrieve the mappings per chromosome position once; they serve both
    # the ClinVar annotation and the position lookup below
    mappings_per_chromosome = gene_region.retrieve_mappings_per_chromosome()

    # Annotate the clinvar variants for the current gene
    ClinVar_annotation = annotateSNVs(
        annotateTranscriptWithClinvarData,
        mappings_per_chr_pos=mappings_per_chromosome,
        strand=gene_region.strand,
        chromosome=gene_region.chr,
        regions=gene_region.regions)

    for chrom_pos, variants in ClinVar_annotation.items():
        for variant in variants:
            protein_pos = mappings_per_chromosome[chrom_pos][
                'amino_acid_position']
            position_entry = region_positional_annotation[protein_pos]
            if 'ClinVar' not in position_entry:
                position_entry['ClinVar'] = []

            codon = gene_region.retrieve_codon_for_protein_position(
                protein_pos)

            # create new entry for this variant
            variant_entry = SingleNucleotideVariant.initializeFromVariant(
                _codon=codon,
                _chr_position=chrom_pos,
                _alt_nucleotide=variant['ALT'],
                _variant_source='ClinVar').toClinVarJson(
                    ClinVar_id=variant['ID'])

            position_entry['ClinVar'].append(variant_entry)

    # annotate the positions further
    for d in region_positional_annotation:
        # retrieve the position as is in the database
        db_position = d['protein_pos']

        # update the positions to abide the users' expectation (start at 1,
        # not zero)
        d['protein_pos'] = db_position + 1

        # add domain and meta domain information per position
        d['domains'] = {}
        for domain in Pfam_domains:
            if not domain["start"] <= d['protein_pos'] <= domain["stop"]:
                continue

            # add the domain id for this position
            d['domains'][domain['ID']] = None

            meta_domain = meta_domains[domain['ID']]
            if meta_domain is None:
                continue

            # retrieve the context for this protein
            consensus_positions = \
                meta_domain.get_consensus_positions_for_uniprot_position(
                    uniprot_ac=gene_region.uniprot_ac,
                    uniprot_position=db_position)

            if domain["metadomain"] and len(consensus_positions) > 0:
                d['domains'][domain['ID']] = create_meta_domain_entry(
                    gene_region, meta_domain, consensus_positions,
                    db_position)

    return {
        "transcript_id": transcript_id,
        "refseq_ids": refseq_ids['NM'],
        "protein_ac": gene_region.uniprot_ac,
        "gene_name": gene_region.gene_name,
        "positional_annotation": region_positional_annotation,
        "domains": Pfam_domains
    }
def retrieve_metadomain_annotation(transcript_id, protein_position,
                                   domain_positions):
    """Collect the variants found at homologous meta-domain positions.

    ``domain_positions`` maps a domain id to the consensus positions of
    interest. Both ``protein_position`` and those consensus positions are
    decremented by one before being used for lookups.

    Returns a dictionary keyed by domain id; each value holds the lists
    ``pathogenic_variants`` (ClinVar) and ``normal_variants`` (gnomAD) and the
    summed ``alignment_depth``.
    """
    # shift the incoming protein position down by one before any lookup
    protein_position -= 1

    domain_results = {}
    for domain_id, requested_positions in domain_positions.items():
        # accumulators for this domain
        gnomad_entries = []
        clinvar_entries = []
        depth = 0

        # load the meta-domain and the codon of the queried position
        meta_domain = MetaDomain.initializeFromDomainID(domain_id)
        current_codon = meta_domain.get_codon_for_transcript_and_position(
            transcript_id, protein_position)

        for requested_position in requested_positions:
            # shift the consensus position down by one as well
            consensus_position = requested_position - 1

            # codons aligned to this consensus position add to the depth
            aligned_codons = \
                meta_domain.get_codons_aligned_to_consensus_position(
                    consensus_position)
            depth += len(aligned_codons)

            # annotated SNVs for this consensus position
            meta_snvs = meta_domain.get_annotated_SNVs_for_consensus_position(
                consensus_position)

            # resolve gene names for the transcripts carrying those SNVs
            transcript_ids = [
                snv_group[0]['gencode_transcription_id']
                for snv_group in meta_snvs.values()
            ]
            transcripts_to_gene = \
                GeneRepository.retrieve_gene_names_for_multiple_transcript_ids(
                    transcript_ids)

            for meta_snv_repr, snv_group in meta_snvs.items():
                # skip SNVs whose representation contains the queried codon
                if current_codon.unique_str_representation() in meta_snv_repr:
                    continue

                # unique variant at homologous position: the first of the
                # group is taken
                meta_snv = snv_group[0]
                snv_variant = SingleNucleotideVariant.initializeFromDict(
                    meta_snv)

                # on the minus strand both nucleotides are converted
                if snv_variant.strand == Strand.minus:
                    snv_variant.ref_nucleotide = convertNucleotide(
                        snv_variant.ref_nucleotide)
                    snv_variant.alt_nucleotide = convertNucleotide(
                        snv_variant.alt_nucleotide)

                # codon-based information plus the gene name
                variant_entry = snv_variant.toCodonJson()
                variant_entry['gene_name'] = transcripts_to_gene[
                    snv_variant.gencode_transcription_id]

                # source-specific information
                source = meta_snv['variant_source']
                if source == 'gnomAD':
                    variant_entry.update(
                        snv_variant.toGnommADJson(
                            allele_number=meta_snv['allele_number'],
                            allele_count=meta_snv['allele_count']))
                    gnomad_entries.append(variant_entry)
                elif source == 'ClinVar':
                    # NOTE(review): the variant is rebuilt from the raw dict
                    # here, so the strand conversion above is not carried
                    # into the ClinVar JSON; confirm this is intended
                    variant_entry.update(
                        snv_variant.initializeFromDict(meta_snv).toClinVarJson(
                            ClinVar_id=meta_snv['clinvar_ID']))
                    clinvar_entries.append(variant_entry)

        domain_results[domain_id] = {
            "pathogenic_variants": clinvar_entries,
            "normal_variants": gnomad_entries,
            "alignment_depth": depth,
        }

    return domain_results
def dashboard():
    """Render the dashboard page with every gene name JSON-encoded.

    The gene names are read through
    ``GeneRepository.retrieve_all_gene_names_from_file`` and handed to the
    ``dashboard.html`` template as ``data``.
    """
    gene_names = GeneRepository.retrieve_all_gene_names_from_file()
    # Pass a real list: in Python 3 map() yields a one-shot iterator, which
    # a template can neither iterate twice nor take the length of.
    encoded_names = [json.dumps(gene_name) for gene_name in gene_names]
    return render_template('dashboard.html', data=encoded_names)
domain_id: initialize_metadomain.delay(domain_id) for domain_id in os.listdir(settings.METADOMAIN_DIR) } _log.debug("waiting for results") try: monitor(results) except: _log.debug("revoking all jobs") for result in results.values(): result.revoke() raise _log.debug("getting transcript ids") with app.app_context(): transcripts = GeneRepository.retrieve_all_transcript_ids_with_mappings() transcript_ids = filter( not_created, [transcript.gencode_transcription_id for transcript in transcripts]) _log.debug("submitting visualization jobs") results = { transcript_id: create_prebuild_visualization.delay(transcript_id) for transcript_id in transcript_ids } _log.debug("waiting for results") try: monitor(results) except: