Beispiel #1
0
def get_transcript_ids_for_gene(gene_name):
    """Build a JSON response describing the transcripts stored for a gene.

    Every transcript returned by
    ``GeneRepository.retrieve_all_transcript_ids`` is summarised as a
    dictionary with the keys ``aa_length``, ``gencode_id``,
    ``refseq_nm_numbers`` (the 'NM' RefSeq identifiers joined by ", ") and
    ``has_protein_data``.

    The result of ``jsonify`` is returned, carrying the list of summaries
    under ``trancript_ids`` (spelling as emitted) and a status ``message``.
    """
    _log.debug('get_transcript_ids_for_gene')

    gene_transcripts = GeneRepository.retrieve_all_transcript_ids(gene_name)

    # With results, the message uses the gene name stored on the first
    # transcript; without results it echoes the name that was queried.
    if len(gene_transcripts) == 0:
        message = ("No transcripts available in database for gene '" +
                   gene_name + "'")
    else:
        message = ("Retrieved transcripts for gene '" +
                   gene_transcripts[0].gene_name + "'")

    transcript_results = []
    for transcript in gene_transcripts:
        # look up the RefSeq identifiers that match this transcript
        refseq_ids = retrieve_refseq_identifiers_for_transcript(
            transcript.gencode_transcription_id)
        refseq_nm_numbers = ", ".join(refseq_ids['NM'])

        transcript_results.append({
            'aa_length': transcript.sequence_length,
            'gencode_id': transcript.gencode_transcription_id,
            'refseq_nm_numbers': refseq_nm_numbers,
            'has_protein_data': transcript.protein_id is not None,
        })

    return jsonify(trancript_ids=transcript_results, message=message)
Beispiel #2
0
def test_session_always_removed(mock_create_session):
    """Check that the repository removes its session on failure and success.

    ``mock_create_session`` is configured to hand out stub sessions that
    record whether ``remove()`` was called. The first stub raises from
    ``query``; the second returns an empty result chain.
    """

    class FailSession:
        """Session stub whose ``query`` always raises."""

        def __init__(self):
            self.removed = False

        def query(self, id_):
            raise Exception("test fail")

        def remove(self):
            self.removed = True

    class Allable:
        """End of the query chain: ``all()`` yields no rows."""

        def all(self):
            return []

    class Filterable:
        """Middle of the query chain: ``filter()`` yields an ``Allable``."""

        def filter(self, id_):
            return Allable()

    class SuccessSession:
        """Session stub whose ``query`` succeeds with an empty result."""

        def __init__(self):
            self.removed = False

        def query(self, id_):
            return Filterable()

        def remove(self):
            self.removed = True

    # Test that it removes upon failure:
    session = FailSession()
    mock_create_session.return_value = session

    try:
        GeneRepository.retrieve_all_transcript_ids_with_mappings()
    except Exception:
        # Whether the failure propagates is not what this test is about;
        # only the clean-up below matters.
        pass
    # Asserted unconditionally so the check cannot be skipped when the
    # call does not raise.
    ok_(session.removed)

    # Test that it removes upon success:
    session = SuccessSession()
    mock_create_session.return_value = session

    GeneRepository.retrieve_all_transcript_ids_with_mappings()
    ok_(session.removed)
Beispiel #3
0
def test_raises_recoverable_error(mock_create_session):
    """Run the repository query against a session that fails operationally.

    The stub session raises ``OperationalError`` from ``query``. The call is
    made without a surrounding ``try``, so any resulting exception
    propagates out of this function.
    NOTE(review): the expected exception type is presumably declared by a
    decorator outside this excerpt - confirm.
    """

    class FailSession:
        """Session stub: ``query`` raises, ``remove`` is a no-op."""

        def remove(self):
            pass

        def query(self, id_):
            raise OperationalError('test fail')

    mock_create_session.return_value = FailSession()

    GeneRepository.retrieve_all_transcript_ids_with_mappings()
Beispiel #4
0
def write_all_genes_names_to_disk():
    """Rewrite GENE_NAMES_FILE with all gene names from the database.

    The names are sorted and written one per line. Any previous file is
    removed first; a failure to remove it (``OSError``) is ignored.
    """
    # fetch before touching the file, so a failing query leaves it intact
    all_names = list(GeneRepository.retrieve_all_gene_names_from_db())
    all_names.sort()

    # best-effort removal of the previous file
    try:
        os.remove(GENE_NAMES_FILE)
    except OSError:
        pass

    with open(GENE_NAMES_FILE, 'w') as gene_names_file:
        gene_names_file.writelines("%s\n" % name for name in all_names)
Beispiel #5
0
def test_logs_error(mock_log_error, mock_create_session):
    """Check that a failing query is reported through the error logger.

    A stub session whose ``query`` raises is installed on
    ``mock_create_session``. If the exception propagates, its text must
    equal the original message. In every case ``mock_log_error`` must have
    been called.
    """
    error_message = "test fail"

    class FailSession:
        """Session stub: ``query`` raises, ``remove`` is a no-op."""

        def query(self, id_):
            raise Exception(error_message)

        def remove(self):
            # present so session clean-up does not mask the query failure
            pass

    # Without this the stub is never used and no failure is triggered.
    mock_create_session.return_value = FailSession()

    try:
        GeneRepository.retrieve_all_transcript_ids_with_mappings()
    except Exception as e:
        eq_(str(e), error_message)

    # `assert_called` is a bound method and therefore always truthy;
    # `called` is the boolean that records whether a call happened.
    ok_(mock_log_error.called)
Beispiel #6
0
def generate_pfam_aligned_codons(pfam_id):
    """
    Aligns the codons of all occurrences of a Pfam domain to the positions
    of the domain consensus.

    The HMM alignment file for `pfam_id` is read from METADOMAIN_DIR. For
    every aligned domain occurrence, each residue that is aligned to a
    consensus residue is translated to its protein position. Then, for
    every such protein position and every gene mapped to it, a codon is
    created and filed under the consensus position it aligns to.

    Returns a tuple of:
      - meta_codons_per_consensus_pos: dictionary
        {consensus_position: [codon, ...]}
      - consensus_length: length of the consensus sequence
      - n_instances: number of alignments in the alignment file
    When the alignment output is empty, the dictionary is empty and both
    numbers are 0.

    Raises MalformedMappingsForAlignedCodonsPosition when a codon cannot be
    created from the mappings (MalformedCodonException).
    """
    _log.info("Started a meta-domain based on the alignment of all '" +
              pfam_id + "' Pfam domains in the human genome")
    # time.clock() was removed in Python 3.8; perf_counter() is the
    # replacement for measuring elapsed time
    start_time = time.perf_counter()

    # the consensus length
    consensus_length = 0
    # the meta_domain that is to be returned
    meta_codons_per_consensus_pos = {}
    # the mapping of the protein {uniprot_ac: {protein_position: [consensus_position, ...]}}
    consensus_pos_per_protein = {}
    # the amount of domain occurrences found
    n_instances = 0

    # retrieve the alignment
    hmmeralign_output = interpret_hmm_alignment_file(
        METADOMAIN_DIR + pfam_id + '/' + METADOMAIN_ALIGNMENT_FILE_NAME)
    if len(hmmeralign_output) != 0:
        consensus = hmmeralign_output['consensus']
        alignments = hmmeralign_output['alignments']

        # update the consensus length and the number of instances
        consensus_length = len(consensus['sequence'])
        n_instances = len(alignments)
        _log.debug(
            "Creating the alignment of mappings for '" + str(n_instances) +
            "' '" + pfam_id +
            "' domain occurrences based on the HMM alignment to consensus and original domain sequence"
        )

        # ensure we can map consensus residues back to consensus positions;
        # both entries are stored on the consensus dictionary itself
        consensus[
            'aligned_sequence'] = convert_pfam_fasta_alignment_to_original_aligned_sequence(
                consensus['alignment'])
        consensus[
            'mapping_consensus_alignment_to_positions'] = map_sequence_to_aligned_sequence(
                consensus['sequence'], consensus['aligned_sequence'])

        # create mappings between domain occurrences and the domain consensus sequence
        for _alignment in alignments:
            # Create a mapping from the aligned domain sequence to the domain sequence
            aligned_sequence = convert_pfam_fasta_alignment_to_original_aligned_sequence(
                _alignment['alignment'])
            original_sequence = convert_pfam_fasta_alignment_to_strict_sequence(
                aligned_sequence)
            mapping_domain_alignment_to_sequence_positions = map_sequence_to_aligned_sequence(
                original_sequence, aligned_sequence)

            # Generate the strict sequence for this domain; leaving only residues that were aligned to the domain consensus
            strict_aligned_sequence = convert_pfam_fasta_alignment_to_strict_fasta(
                _alignment['alignment'])

            # create the mapping between the strict alignments and the original consensus sequence
            mapping_aligned_domain_to_domain_consensus = createAlignedSequenceMapping(
                strict_aligned_sequence, consensus['aligned_sequence'], False)

            # all alignment positions known to either mapping, in order
            mapping_positions = sorted(
                set(mapping_domain_alignment_to_sequence_positions.keys())
                | set(mapping_aligned_domain_to_domain_consensus.keys()))

            for mapping_pos in mapping_positions:
                # a gap in the aligned consensus has no consensus position
                # to map to, so the position is skipped
                if consensus['aligned_sequence'][mapping_pos] == '-':
                    continue

                # retrieve the position in the domain consensus
                domain_consensus_pos = consensus[
                    'mapping_consensus_alignment_to_positions'][mapping_pos]

                # retrieve the position in the domain sequence
                ref_pos = mapping_domain_alignment_to_sequence_positions[
                    mapping_pos]
                # offset by the start of the domain occurrence to get the
                # position in the protein
                uniprot_pos = int(_alignment['start_pos']) + ref_pos - 1

                # Add the consensus pos to the protein
                consensus_pos_per_protein.setdefault(
                    _alignment['uniprot_ac'], {}).setdefault(
                        uniprot_pos, []).append(domain_consensus_pos)

        # now incorporate the alignment data into our domain model in form of mappings
        # First get the protein ids for the uniprot acs
        uniprot_acs_to_ids = ProteinRepository.retrieve_protein_id_for_multiple_protein_acs(
            list(consensus_pos_per_protein.keys()))
        protein_ids = [
            int(y) for y in np.unique(list(uniprot_acs_to_ids.values()))
        ]

        # Second, get all mappings for these proteins
        protein_mappings = MappingRepository.get_mappings_for_multiple_protein_ids(
            protein_ids)

        # retrieve all transcripts mapped to these protein_ids
        gene_ids = GeneRepository.retrieve_transcript_id_for_multiple_protein_ids(
            protein_ids)

        # create all aligned codons
        for uniprot_ac, consensus_pos_per_uniprot_pos in consensus_pos_per_protein.items(
        ):
            for uniprot_pos, domain_consensus_positions in consensus_pos_per_uniprot_pos.items(
            ):
                for domain_consensus_pos in domain_consensus_positions:
                    # Collect the mappings at this uniprot position,
                    # separated per gene_id
                    mapping_per_gene_id = {}
                    for mapping in protein_mappings[
                            uniprot_acs_to_ids[uniprot_ac]]:
                        if mapping.uniprot_position == uniprot_pos:
                            mapping_per_gene_id.setdefault(
                                mapping.gene_id, []).append(mapping)

                    for gene_id, mappings in mapping_per_gene_id.items():
                        try:
                            # create a codon
                            codon = Codon.initializeFromMapping(
                                mappings, gene_ids[gene_id], uniprot_ac)
                        except MalformedCodonException as e:
                            raise MalformedMappingsForAlignedCodonsPosition(
                                "Encountered a malformed codon mapping for domain '"
                                + str(pfam_id) + "' in gene '" + str(gene_id) +
                                "', at amino_acid_position '" +
                                str(uniprot_pos) + "':" + str(e)) from e

                        # Add the codon to the consensus positions
                        meta_codons_per_consensus_pos.setdefault(
                            domain_consensus_pos, []).append(codon)

    time_step = time.perf_counter()
    _log.info("Finished the alignment of mappings for '" + str(n_instances) +
              "' instances '" + pfam_id + "' domain occurrences in " +
              str(time_step - start_time) + " seconds")
    return meta_codons_per_consensus_pos, consensus_length, n_instances
Beispiel #7
0
def analyse_transcript(transcript_id):
    """Build the per-position annotation of a transcript.

    Steps, as performed below:
      1. retrieve the gene for `transcript_id` and build its GeneRegion;
      2. compute the tolerance landscape as the positional annotation;
      3. collect the Pfam domains (InterPro domains whose external id
         starts with 'PF') and try to construct a meta-domain for each;
      4. attach ClinVar variants to the positions they map to;
      5. shift every 'protein_pos' by +1 and add per-position domain and
         meta-domain information.

    Returns a dictionary with the keys "transcript_id", "refseq_ids",
    "protein_ac", "gene_name", "positional_annotation" and "domains", or a
    dictionary with the single key 'error' when the gene cannot be
    retrieved (RepositoryException) or no gene region is available.
    """
    # Retrieve the gene from the database
    try:
        gene = GeneRepository.retrieve_gene(transcript_id)
    except RepositoryException as e:
        return {
            'error':
            'No gene region could be build for transcript {}, reason: {}'.
            format(transcript_id, e)
        }

    # build the gene region
    gene_region = GeneRegion(gene)

    # Retrieve the refseq ids
    refseq_ids = retrieve_refseq_identifiers_for_transcript(transcript_id)

    # guard kept from the original flow; everything below needs a region
    if gene_region is None:
        return {
            'error':
            'No gene region could be build for transcript ' +
            str(transcript_id)
        }

    # generate the positional annotation for this gene by first computing the tolerance landscape
    region_positional_annotation = compute_tolerance_landscape(
        gene_region, flask_app.config['SLIDING_WINDOW_SIZE'],
        flask_app.config['ALLELE_FREQUENCY_CUTOFF'])

    # Annotate Pfam domains
    pfam_domains = []
    # {Pfam id: MetaDomain, or None when no meta-domain is usable}
    meta_domains = {}
    for domain in gene_region.interpro_domains:
        if not domain.ext_db_id.startswith('PF'):
            continue

        # we have a Pfam domain
        pfam_domain = {
            "ID": domain.ext_db_id,
            "Name": domain.region_name,
            "start": domain.uniprot_start,
            "stop": domain.uniprot_stop,
        }
        domain_id = pfam_domain['ID']

        try:
            if domain_id not in meta_domains:
                # construct a meta-domain if possible
                temp_meta_domain = MetaDomain.initializeFromDomainID(
                    domain.ext_db_id)

                # Ensure there are enough instances to actually perform the metadomain trick
                if temp_meta_domain.n_instances < 2:
                    pfam_domain["metadomain"] = False
                    meta_domains[domain_id] = None
                else:
                    pfam_domain["metadomain"] = True
                    meta_domains[domain_id] = temp_meta_domain
                    pfam_domain[
                        'meta_domain_alignment_depth'] = temp_meta_domain.get_max_alignment_depth(
                        )
            else:
                # same domain id seen before: reuse the earlier verdict
                pfam_domain["metadomain"] = meta_domains[
                    domain_id] is not None
        except UnsupportedMetaDomainIdentifier as e:
            _log.error(str(e))
            # meta domain is not possible; keep the flag on the domain
            # entry in agreement with the meta_domains registry, so every
            # entry in the result carries a "metadomain" key
            pfam_domain["metadomain"] = False
            meta_domains[domain_id] = None

        # Add the domain to the domain list
        pfam_domains.append(pfam_domain)

    # Annotate the clinvar variants for the current gene
    clinvar_annotation = annotateSNVs(
        annotateTranscriptWithClinvarData,
        mappings_per_chr_pos=gene_region.retrieve_mappings_per_chromosome(),
        strand=gene_region.strand,
        chromosome=gene_region.chr,
        regions=gene_region.regions)

    # retrieve the mappings per chromosome position
    _mappings_per_chromosome = gene_region.retrieve_mappings_per_chromosome()

    for chrom_pos, variants in clinvar_annotation.items():
        for variant in variants:
            protein_pos = _mappings_per_chromosome[chrom_pos][
                'amino_acid_position']

            # the ClinVar list of a position is created on first use
            clinvar_entries = region_positional_annotation[
                protein_pos].setdefault('ClinVar', [])

            codon = gene_region.retrieve_codon_for_protein_position(
                protein_pos)

            # create new entry for this variant
            variant_entry = SingleNucleotideVariant.initializeFromVariant(
                _codon=codon,
                _chr_position=chrom_pos,
                _alt_nucleotide=variant['ALT'],
                _variant_source='ClinVar').toClinVarJson(
                    ClinVar_id=variant['ID'])
            clinvar_entries.append(variant_entry)

    # annotate the positions further
    for d in region_positional_annotation:
        # retrieve the position as is in the database
        db_position = d['protein_pos']

        # update the positions to abide the users' expectation (start at 1, not zero)
        d['protein_pos'] = db_position + 1

        # add domain and meta domain information per position
        d['domains'] = {}
        for domain in pfam_domains:
            if not (domain["start"] <= d['protein_pos'] <= domain["stop"]):
                continue

            # add the domain id for this position
            d['domains'][domain['ID']] = None

            meta_domain = meta_domains[domain['ID']]
            if meta_domain is None:
                continue

            # retrieve the context for this protein; the meta-domain is
            # queried with the unshifted database position
            consensus_positions = meta_domain.get_consensus_positions_for_uniprot_position(
                uniprot_ac=gene_region.uniprot_ac,
                uniprot_position=db_position)

            if domain["metadomain"] and len(consensus_positions) > 0:
                d['domains'][domain['ID']] = create_meta_domain_entry(
                    gene_region, meta_domain, consensus_positions,
                    db_position)

    return {
        "transcript_id": transcript_id,
        "refseq_ids": refseq_ids['NM'],
        "protein_ac": gene_region.uniprot_ac,
        "gene_name": gene_region.gene_name,
        "positional_annotation": region_positional_annotation,
        "domains": pfam_domains
    }
Beispiel #8
0
def retrieve_metadomain_annotation(transcript_id, protein_position,
                                   domain_positions):
    """Collect the variants found at homologous meta-domain positions.

    `domain_positions` maps a domain id to the consensus positions of
    interest. For every domain, the meta-domain is loaded and, per
    consensus position, the annotated SNVs are gathered. SNVs whose key
    contains the unique representation of the codon at
    (`transcript_id`, `protein_position`) are left out.

    Both `protein_position` and each consensus position are decremented by
    one before use (presumably one-based input for zero-based lookups -
    confirm against the caller).

    Returns {domain_id: {"pathogenic_variants": [...],
    "normal_variants": [...], "alignment_depth": int}}, where ClinVar
    sourced variants go to "pathogenic_variants", gnomAD sourced variants
    to "normal_variants", and the depth is the summed number of codons
    aligned to the requested consensus positions.
    """
    # first correct the protein_position
    lookup_protein_position = protein_position - 1

    domain_results = {}

    for domain_id, requested_positions in domain_positions.items():
        gnomad_variants = []
        clinvar_variants = []
        alignment_depth = 0

        # retrieve the metadomain and the codon the query refers to
        meta_domain = MetaDomain.initializeFromDomainID(domain_id)
        current_codon = meta_domain.get_codon_for_transcript_and_position(
            transcript_id, lookup_protein_position)

        for requested_position in requested_positions:
            # first correct the consensus_position
            consensus_index = requested_position - 1

            # count the codons aligned to this position
            alignment_depth += len(
                meta_domain.get_codons_aligned_to_consensus_position(
                    consensus_index))

            # Retrieve the meta SNVs for this position
            meta_snvs = meta_domain.get_annotated_SNVs_for_consensus_position(
                consensus_index)

            # Retrieve the matching gene names for the transcripts
            transcripts_to_gene = GeneRepository.retrieve_gene_names_for_multiple_transcript_ids(
                [
                    snv_group[0]['gencode_transcription_id']
                    for snv_group in meta_snvs.values()
                ])

            for snv_repr, snv_group in meta_snvs.items():
                # skip entries that belong to the queried codon itself
                if current_codon.unique_str_representation() in snv_repr:
                    continue

                # unique variant at homologous position, can just take the first from the list
                meta_snv = snv_group[0]
                snv_variant = SingleNucleotideVariant.initializeFromDict(
                    meta_snv)

                # Convert to the original nucleotide
                if snv_variant.strand == Strand.minus:
                    snv_variant.ref_nucleotide = convertNucleotide(
                        snv_variant.ref_nucleotide)
                    snv_variant.alt_nucleotide = convertNucleotide(
                        snv_variant.alt_nucleotide)

                # start the variant entry with the codon based information
                # and the gene name
                variant_entry = snv_variant.toCodonJson()
                variant_entry['gene_name'] = transcripts_to_gene[
                    snv_variant.gencode_transcription_id]

                # Add the variant specific information
                variant_source = meta_snv['variant_source']
                if variant_source == 'gnomAD':
                    variant_entry.update(
                        snv_variant.toGnommADJson(
                            allele_number=meta_snv['allele_number'],
                            allele_count=meta_snv['allele_count']))
                    gnomad_variants.append(variant_entry)
                elif variant_source == 'ClinVar':
                    # NOTE(review): this builds a fresh variant from the
                    # dict, so the strand conversion applied above is not
                    # part of the ClinVar json - confirm this is intended
                    variant_entry.update(
                        snv_variant.initializeFromDict(meta_snv).toClinVarJson(
                            ClinVar_id=meta_snv['clinvar_ID']))
                    clinvar_variants.append(variant_entry)

        domain_results[domain_id] = {
            "pathogenic_variants": clinvar_variants,
            "normal_variants": gnomad_variants,
            "alignment_depth": alignment_depth,
        }

    return domain_results
Beispiel #9
0
def dashboard():
    """Render 'dashboard.html' with every gene name from file as JSON text.

    The names are passed to the template as ``data``, a lazy ``map`` that
    applies ``json.dumps`` to each name.
    """
    serialized_names = map(
        json.dumps, GeneRepository.retrieve_all_gene_names_from_file())
    return render_template('dashboard.html', data=serialized_names)
Beispiel #10
0
    domain_id: initialize_metadomain.delay(domain_id)
    for domain_id in os.listdir(settings.METADOMAIN_DIR)
}

_log.debug("waiting for results")
try:
    monitor(results)
except:
    _log.debug("revoking all jobs")
    for result in results.values():
        result.revoke()
    raise

_log.debug("getting transcript ids")
with app.app_context():
    transcripts = GeneRepository.retrieve_all_transcript_ids_with_mappings()

    transcript_ids = filter(
        not_created,
        [transcript.gencode_transcription_id for transcript in transcripts])

_log.debug("submitting visualization jobs")
results = {
    transcript_id: create_prebuild_visualization.delay(transcript_id)
    for transcript_id in transcript_ids
}

_log.debug("waiting for results")
try:
    monitor(results)
except: