Beispiel #1
0
def get_genes(variant):
    """Get the gene information in the mongoengine format.

    The transcripts returned by ``get_transcripts`` are grouped by their
    HGNC symbol.  For every gene the functional annotation with the lowest
    ``SO_TERMS`` rank seen among its transcripts is recorded as the most
    severe one, together with the transcript that carries it.  Each gene is
    then annotated with its clinical description, OMIM gene id, OMIM
    phenotype terms and reduced-penetrance flag and turned into a ``Gene``
    object.

    Args:
        variant (dict): A Variant dictionary.  It is passed on to the helper
            functions, and ``variant['info_dict']`` is read directly for the
            optional ``'Reduced_penetrance'`` entry.

    Returns:
        mongo_genes (list): One ``Gene`` object per HGNC symbol found among
            the transcripts.  Transcripts without a symbol are ignored.

    """
    genes = {}
    mongo_genes = []

    transcripts = get_transcripts(variant)
    # A dictionary with clinical gene descriptions, looked up by HGNC symbol
    gene_descriptions = get_gene_descriptions(variant)

    # Group all transcripts under their gene symbol and track the most
    # severe functional annotation for each gene.
    for transcript in transcripts:
        hgnc_symbol = transcript.hgnc_symbol
        if not hgnc_symbol:
            continue

        gene = genes.get(hgnc_symbol)
        if gene is None:
            # The first transcript of a gene is the most severe one until a
            # transcript with a lower ranked annotation shows up.
            # NOTE(review): 100 is the starting rank; an annotation only
            # counts if its SO_TERMS rank is below it -- confirm that 100
            # exceeds every rank in SO_TERMS.
            gene = genes[hgnc_symbol] = {
                'transcripts': {},
                'most_severe_transcript': transcript,
                'omim_gene_id': None,
                'phenotypic_terms': [],
                'best_rank': 100,
                'ensembl_id': transcript.ensembl_id,
            }
        gene['transcripts'][transcript.transcript_id] = transcript

        # A lower rank replaces the current best annotation of the gene.
        for functional_annotation in transcript.functional_annotations:
            new_rank = SO_TERMS[functional_annotation]['rank']
            if new_rank < gene['best_rank']:
                gene['best_rank'] = new_rank
                gene['most_severe_transcript'] = transcript
                gene['most_severe_function'] = functional_annotation

    # There are two kinds of OMIM information: the OMIM gene entry and the
    # OMIM phenotype terms.  Both are only attached to genes that have at
    # least one transcript in this variant.

    # Fill the mim ids for the genes:
    mim_ids = get_omim_gene_ids(variant)
    for hgnc_symbol in mim_ids:
        if hgnc_symbol in genes:
            mim_id = mim_ids[hgnc_symbol]
            logger.debug("Adding mim id {0} to gene {1}".format(
                mim_id, hgnc_symbol))
            genes[hgnc_symbol]['omim_gene_id'] = mim_id

    # Fill the omim phenotype terms:
    phenotype_mim_ids = get_omim_phenotype_ids(variant)
    for hgnc_symbol in phenotype_mim_ids:
        if hgnc_symbol in genes:
            genes[hgnc_symbol]['phenotypic_terms'].extend(
                phenotype_mim_ids[hgnc_symbol])

    reduced_penetrance = set(variant['info_dict'].get('Reduced_penetrance', []))

    for hgnc_symbol, gene_info in genes.items():
        most_severe = gene_info['most_severe_transcript']
        # Create a mongo engine gene object for each gene found in the variant
        mongo_gene = Gene(hgnc_symbol=hgnc_symbol)
        if hgnc_symbol in reduced_penetrance:
            mongo_gene.reduced_penetrance = True

        mongo_gene.description = gene_descriptions.get(hgnc_symbol)
        mongo_gene.ensembl_gene_id = gene_info['ensembl_id']
        mongo_gene.omim_gene_entry = gene_info['omim_gene_id']
        mongo_gene.omim_phenotypes = gene_info['phenotypic_terms']
        mongo_gene.transcripts = list(gene_info['transcripts'].values())

        # 'most_severe_function' is only present when some annotation had a
        # rank below the starting rank, so look it up without raising.
        most_severe_function = gene_info.get('most_severe_function')
        if most_severe_function is not None:
            mongo_gene.functional_annotation = most_severe_function
            mongo_gene.region_annotation = SO_TERMS[most_severe_function]['region']

        # The predictions are copied only when the transcript has them.
        try:
            mongo_gene.sift_prediction = most_severe.sift_prediction
        except AttributeError:
            pass
        try:
            mongo_gene.polyphen_prediction = most_severe.polyphen_prediction
        except AttributeError:
            pass

        mongo_genes.append(mongo_gene)

    return mongo_genes
    
Beispiel #2
0
def get_genes(variant):
    """Build one ``Gene`` object per HGNC symbol affected by a variant.

    The VEP entries for the variant's ALT allele are converted to
    transcripts with ``get_transcript`` and grouped by gene symbol.  Each
    gene is then annotated with its clinical description, its OMIM gene id,
    its OMIM phenotype terms and the functional annotation with the lowest
    ``SO_TERMS`` rank seen among its transcripts.

    Annotation records that lack the expected ``:`` separator, and OMIM
    records for genes without any transcript in this variant, are skipped.

    Args:
        variant (dict): A Variant dictionary.  The keys read here are
            ``'info_dict'`` (annotation name -> list of strings),
            ``'vep_info'`` (allele -> list of VEP entries) and ``'ALT'``.

    Returns:
        list: ``Gene`` objects, one per gene symbol found in the VEP
        entries.  Empty when no entry carries a gene symbol.
    """
    info_dict = variant['info_dict']
    genes = {}
    mongo_genes = []

    # Conversion from ensembl to refseq: ensembl transcript id -> list of
    # refseq ids.  Records look like 'X:ENST1>NM_1/NM_2|ENST2>NM_3'; the part
    # before ':' is not used.
    ensembl_to_refseq = {}
    for record in info_dict.get('Ensembl_transcript_to_refseq_transcript', []):
        fields = record.split(':')
        if len(fields) < 2:
            continue
        for mapping in fields[1].split('|'):
            pair = mapping.split('>')
            if len(pair) > 1:
                ensembl_to_refseq[pair[0]] = pair[1].split('/')

    # Clinical gene descriptions by HGNC symbol ('SYMBOL:description').
    gene_descriptions = {}
    for record in info_dict.get('Gene_description', []):
        fields = record.split(':')
        if len(fields) > 1:
            gene_descriptions[fields[0]] = fields[1]

    # Group all VEP entries under their gene symbol and track the most
    # severe functional annotation for each gene.
    for vep_entry in variant['vep_info'].get(variant['ALT'], []):
        transcript = get_transcript(vep_entry, ensembl_to_refseq)
        hgnc_symbol = transcript.hgnc_symbol
        if not hgnc_symbol:
            continue

        gene = genes.get(hgnc_symbol)
        if gene is None:
            # The first transcript of a gene is the most severe one until a
            # transcript with a lower ranked annotation shows up.
            # NOTE(review): 40 is the starting rank; an annotation only
            # counts if its SO_TERMS rank is below it -- confirm that 40
            # exceeds every rank in SO_TERMS.
            gene = genes[hgnc_symbol] = {
                'transcripts': {},
                'most_severe_transcript': transcript,
                'omim_gene_id': None,
                'phenotypic_terms': [],
                'best_rank': 40,
                'ensembl_id': transcript.ensembl_id,
            }
        gene['transcripts'][transcript.transcript_id] = transcript

        # A lower rank replaces the current best annotation of the gene.
        for functional_annotation in transcript.functional_annotations:
            new_rank = SO_TERMS[functional_annotation]['rank']
            if new_rank < gene['best_rank']:
                gene['best_rank'] = new_rank
                gene['most_severe_transcript'] = transcript
                gene['most_severe_function'] = functional_annotation

    # There are two kinds of OMIM information: the OMIM gene entry and the
    # OMIM phenotype terms.  Both are only attached to genes that have at
    # least one transcript in this variant.

    # Fill the omim gene ids ('SYMBOL:omim_id'):
    for record in info_dict.get('OMIM_morbid', []):
        if not record:
            continue
        fields = record.split(':')
        if len(fields) > 1 and fields[0] in genes:
            genes[fields[0]]['omim_gene_id'] = fields[1]

    # Fill the omim phenotype terms
    # ('SYMBOL:phenotype_id>model/model|phenotype_id'):
    for record in info_dict.get('Phenotypic_disease_model', []):
        if not record:
            continue
        fields = record.split(':')
        if len(fields) < 2 or fields[0] not in genes:
            continue
        phenotypic_terms = genes[fields[0]]['phenotypic_terms']
        for omim_entry in fields[1].split('|'):
            entry_fields = omim_entry.split('>')
            # The inheritance patterns after '>' are optional.
            inheritance_patterns = []
            if len(entry_fields) > 1:
                inheritance_patterns = entry_fields[1].split('/')
            phenotypic_terms.append(PhenotypeTerm(
                phenotype_id=entry_fields[0],
                disease_models=inheritance_patterns
            ))

    for hgnc_symbol, gene_info in genes.items():
        most_severe = gene_info['most_severe_transcript']
        # Create a mongo engine gene object for each gene found in the variant
        mongo_gene = Gene(hgnc_symbol=hgnc_symbol)
        mongo_gene.description = gene_descriptions.get(hgnc_symbol)
        mongo_gene.ensembl_gene_id = gene_info['ensembl_id']
        mongo_gene.omim_gene_entry = gene_info['omim_gene_id']
        mongo_gene.omim_phenotypes = gene_info['phenotypic_terms']
        mongo_gene.transcripts = list(gene_info['transcripts'].values())

        # 'most_severe_function' is only present when some annotation had a
        # rank below the starting rank, so look it up without raising.
        most_severe_function = gene_info.get('most_severe_function')
        if most_severe_function is not None:
            mongo_gene.functional_annotation = most_severe_function
            mongo_gene.region_annotation = SO_TERMS[most_severe_function]['region']

        # The predictions are copied only when the transcript has them.
        try:
            mongo_gene.sift_prediction = most_severe.sift_prediction
        except AttributeError:
            pass
        try:
            mongo_gene.polyphen_prediction = most_severe.polyphen_prediction
        except AttributeError:
            pass

        mongo_genes.append(mongo_gene)

    return mongo_genes
Beispiel #3
0
def get_genes(variant):
    """Build one ``Gene`` object per HGNC symbol affected by a variant.

    The VEP entries for the variant's ALT allele are converted to
    transcripts with ``get_transcript`` and grouped by gene symbol.  Each
    gene is then annotated with its clinical description, its OMIM gene id,
    its OMIM phenotype terms and the functional annotation with the lowest
    ``SO_TERMS`` rank seen among its transcripts.

    Annotation records that lack the expected ``:`` separator, and OMIM
    records for genes without any transcript in this variant, are skipped.

    Args:
        variant (dict): A Variant dictionary.  The keys read here are
            ``'info_dict'`` (annotation name -> list of strings),
            ``'vep_info'`` (allele -> list of VEP entries) and ``'ALT'``.

    Returns:
        list: ``Gene`` objects, one per gene symbol found in the VEP
        entries.  Empty when no entry carries a gene symbol.
    """
    info_dict = variant['info_dict']
    genes = {}
    mongo_genes = []

    # Conversion from ensembl to refseq: ensembl transcript id -> list of
    # refseq ids.  Records look like 'X:ENST1>NM_1/NM_2|ENST2>NM_3'; the part
    # before ':' is not used.
    ensembl_to_refseq = {}
    for record in info_dict.get('Ensembl_transcript_to_refseq_transcript', []):
        fields = record.split(':')
        if len(fields) < 2:
            continue
        for mapping in fields[1].split('|'):
            pair = mapping.split('>')
            if len(pair) > 1:
                ensembl_to_refseq[pair[0]] = pair[1].split('/')

    # Clinical gene descriptions by HGNC symbol ('SYMBOL:description').
    gene_descriptions = {}
    for record in info_dict.get('Gene_description', []):
        fields = record.split(':')
        if len(fields) > 1:
            gene_descriptions[fields[0]] = fields[1]

    # Group all VEP entries under their gene symbol and track the most
    # severe functional annotation for each gene.
    for vep_entry in variant['vep_info'].get(variant['ALT'], []):
        transcript = get_transcript(vep_entry, ensembl_to_refseq)
        hgnc_symbol = transcript.hgnc_symbol
        if not hgnc_symbol:
            continue

        gene = genes.get(hgnc_symbol)
        if gene is None:
            # The first transcript of a gene is the most severe one until a
            # transcript with a lower ranked annotation shows up.
            # NOTE(review): 40 is the starting rank; an annotation only
            # counts if its SO_TERMS rank is below it -- confirm that 40
            # exceeds every rank in SO_TERMS.
            gene = genes[hgnc_symbol] = {
                'transcripts': {},
                'most_severe_transcript': transcript,
                'omim_gene_id': None,
                'phenotypic_terms': [],
                'best_rank': 40,
                'ensembl_id': transcript.ensembl_id,
            }
        gene['transcripts'][transcript.transcript_id] = transcript

        # A lower rank replaces the current best annotation of the gene.
        for functional_annotation in transcript.functional_annotations:
            new_rank = SO_TERMS[functional_annotation]['rank']
            if new_rank < gene['best_rank']:
                gene['best_rank'] = new_rank
                gene['most_severe_transcript'] = transcript
                gene['most_severe_function'] = functional_annotation

    # There are two kinds of OMIM information: the OMIM gene entry and the
    # OMIM phenotype terms.  Both are only attached to genes that have at
    # least one transcript in this variant.

    # Fill the omim gene ids ('SYMBOL:omim_id'):
    for record in info_dict.get('OMIM_morbid', []):
        if not record:
            continue
        fields = record.split(':')
        if len(fields) > 1 and fields[0] in genes:
            genes[fields[0]]['omim_gene_id'] = fields[1]

    # Fill the omim phenotype terms
    # ('SYMBOL:phenotype_id>model/model|phenotype_id'):
    for record in info_dict.get('Phenotypic_disease_model', []):
        if not record:
            continue
        fields = record.split(':')
        if len(fields) < 2 or fields[0] not in genes:
            continue
        phenotypic_terms = genes[fields[0]]['phenotypic_terms']
        for omim_entry in fields[1].split('|'):
            entry_fields = omim_entry.split('>')
            # The inheritance patterns after '>' are optional.
            inheritance_patterns = []
            if len(entry_fields) > 1:
                inheritance_patterns = entry_fields[1].split('/')
            phenotypic_terms.append(PhenotypeTerm(
                phenotype_id=entry_fields[0],
                disease_models=inheritance_patterns
            ))

    for hgnc_symbol, gene_info in genes.items():
        most_severe = gene_info['most_severe_transcript']
        # Create a mongo engine gene object for each gene found in the variant
        mongo_gene = Gene(hgnc_symbol=hgnc_symbol)
        mongo_gene.description = gene_descriptions.get(hgnc_symbol)
        mongo_gene.ensembl_gene_id = gene_info['ensembl_id']
        mongo_gene.omim_gene_entry = gene_info['omim_gene_id']
        mongo_gene.omim_phenotypes = gene_info['phenotypic_terms']
        mongo_gene.transcripts = list(gene_info['transcripts'].values())

        # 'most_severe_function' is only present when some annotation had a
        # rank below the starting rank, so look it up without raising.
        most_severe_function = gene_info.get('most_severe_function')
        if most_severe_function is not None:
            mongo_gene.functional_annotation = most_severe_function
            mongo_gene.region_annotation = SO_TERMS[most_severe_function]['region']

        # The predictions are copied only when the transcript has them.
        try:
            mongo_gene.sift_prediction = most_severe.sift_prediction
        except AttributeError:
            pass
        try:
            mongo_gene.polyphen_prediction = most_severe.polyphen_prediction
        except AttributeError:
            pass

        mongo_genes.append(mongo_gene)

    return mongo_genes