class EnsemblAnnotation(object):
    """
    Class for building an annotation file for MAVIS in json format.
    Args:
        species (str): species of interest
        release (int): Ensembl release to use
        output (str): path to output file
        best_file (str): path to file of "best transcripts"
        alias_file (str): path to file with gene aliases
    """
    def __init__(self,
                 release,
                 species,
                 output,
                 best_file=None,
                 alias_file=None,
                 custom_cache=None):
        """
        Store the run settings, prepare the caches and build the annotation.

        All of the work happens during construction: the pyensembl data is
        downloaded and indexed, the domain cache is set up, the set of best
        transcripts is obtained and ``build_json`` is run, so
        ``self.annotation`` is filled in when the constructor returns.
        Args:
            release (int): Ensembl release to use
            species (str): species of interest
            output (str): path to output file
            best_file (str): path to file of "best transcripts"; when omitted
                             the transcripts are chosen by ``choose_best_transcripts``
            alias_file (str): path to file with gene aliases
            custom_cache (str): directory for the pyensembl cache files
        """
        # Run settings
        self.release = release
        self.species = species
        self.output = output
        self.best_file = best_file
        self.alias_file = alias_file
        self.custom_cache = custom_cache

        # State filled in by the steps below
        self.annotation = {}
        self.cache_prefix = None
        self.gen_time = get_date()

        # Without an alias file every gene simply has an empty alias set
        self.alias = (parse_alias_file(self.alias_file)
                      if self.alias_file else defaultdict(set))

        self.data = EnsemblRelease(release, species)
        self.download_pyensembl_cache()
        self.get_domain_cache()

        # The best transcripts come from the user's file when one is given
        self.best = (parse_best_file(self.best_file)
                     if self.best_file else self.choose_best_transcripts())

        self.build_json()

    def download_pyensembl_cache(self):
        """
        Method download and index the pyensembl data for this release.

        When ``self.custom_cache`` is set it is exported as the
        ``PYENSEMBL_CACHE_DIR`` environment variable before the download is
        requested. Afterwards ``self.cache_prefix`` holds the part of the GTF
        path that precedes "gtf.gz"; the other cache helpers of this class
        build their file names from that prefix.
        """
        cache_dir = self.custom_cache
        if cache_dir:
            os.environ["PYENSEMBL_CACHE_DIR"] = cache_dir

        release_data = self.data
        release_data.download()
        release_data.index()

        # Keep everything before the first "gtf.gz" (the whole path if absent)
        prefix, _, _ = release_data.gtf_path.partition("gtf.gz")
        self.cache_prefix = prefix

    def get_domain_cache(self):
        """
        Method point the module-level domain cache at this release.

        Rebinds the global ``DOMAIN_CACHE_PATH`` to "<cache_prefix>domain.tsv"
        and then calls ``parse_cached_domains()`` (defined elsewhere in the
        module; presumably it reads that file -- confirm there).
        """
        global DOMAIN_CACHE_PATH
        domain_tsv = self.cache_prefix + "domain.tsv"
        DOMAIN_CACHE_PATH = domain_tsv
        parse_cached_domains()

    def get_genes(self, eid):
        """
        Method look up one gene in the EnsemblRelease and format it for json.
        Args:
            eid (str): Ensembl gene ID
        Returns:
            dict: gene info formatted for json; "transcripts" starts out
                  empty and is filled in by the caller
        """
        gene = self.data.gene_by_id(eid)

        record = {}
        record["name"] = str(gene.gene_id)
        record["chr"] = str(gene.contig)
        record["start"] = int(gene.start)
        record["end"] = int(gene.end)
        record["strand"] = str(gene.strand)

        # Gene name first, followed by the aliases registered for this gene ID
        names = [str(gene.gene_name)]
        names.extend(self.alias[gene.gene_id])
        record["aliases"] = names

        record["transcripts"] = []
        return record

    def get_transcripts(self, eid):
        """
        Method look up one transcript in the EnsemblRelease and format it for json.
               Transcripts without a protein ID (non-coding) are ignored.
        Args:
            eid (str): Ensembl transcript ID
        Returns:
            dict: transcript info formatted for json, or None when the
                  transcript has no protein ID
        """
        transcript = self.data.transcript_by_id(eid)
        if not transcript.protein_id:
            return None

        transcript_id = str(transcript.transcript_id)
        record = {
            "name": transcript_id,
            "start": int(transcript.start),
            "end": int(transcript.end),
            "aliases": [str(transcript.transcript_name)],
            "is_best_transcript": transcript_id in self.best,
            "protein_id": transcript.protein_id,
            "exons": [],
            "domains": [],
        }

        # The coding ranges are genomic coordinates; pick the two genomic
        # positions that bound the coding sequence for this strand, then
        # convert them to 1-based positions along the spliced transcript.
        ranges = transcript.coding_sequence_position_ranges
        strand = transcript.strand
        if strand in ("+", "1"):
            first_pos, last_pos = ranges[0][0], ranges[-1][1]
        elif strand in ("-", "-1"):
            first_pos, last_pos = ranges[0][1], ranges[-1][0]
        else:
            # Unrecognised strand value: no cdna coordinates are added
            return record

        record["cdna_coding_start"] = transcript.spliced_offset(first_pos) + 1
        record["cdna_coding_end"] = transcript.spliced_offset(last_pos) + 1
        return record

    def get_exons(self, eid):
        """
        Method look up one exon in the EnsemblRelease and format it for json.
        Args:
            eid (str): Ensembl exon ID
        Returns:
            dict: exon info formatted for json ("name", "start", "end")
        """
        exon = self.data.exon_by_id(eid)
        return dict(
            name=str(exon.exon_id),
            start=int(exon.start),
            end=int(exon.end),
        )

    @cached_domains
    def get_domains(self, eid):
        """
        Method request domain info from Ensembl and group it by domain name.

        Every record returned by ``request_ensembl_protein`` contributes one
        region; regions that share a domain ID are collected under a single
        entry, which keeps the description of the first record seen. Records
        whose description is empty (after quote removal) are skipped.
        Args:
            eid (str): Ensembl protein ID
        Returns:
            list: a list of domains formatted for json, each with "name",
                  "desc" and "regions"
        """
        by_name = {}

        for hit in request_ensembl_protein(eid):
            name = str(hit["id"])
            desc = str(hit["description"])
            # quotes causing errors when mavis loads json
            for quote in ('"', "'"):
                desc = desc.replace(quote, "")
            region = {"start": int(hit["start"]), "end": int(hit["end"])}
            if not desc:
                continue
            entry = by_name.setdefault(
                name, {"name": name, "desc": desc, "regions": []})
            entry["regions"].append(region)

        return list(by_name.values())

    def build_json(self):
        """
        Method fill ``self.annotation`` with run metadata and every gene that
               has at least one transcript with a protein ID.

        Progress is printed for each gene and a summary of the counts is
        printed at the end.
        Returns:
            dict: a json-formatted set of annotations for use with MAVIS
                  (the same object as ``self.annotation``)
        """
        count = {"gene": 0, "transcript": 0, "non_coding": 0}

        self.annotation.update({
            "script": SCRIPT,
            "script_version": VERSION,
            "gene_alias_file": self.alias_file,
            "best_transcript_file": self.best_file,
            "ensembl_version": self.release,
            "generation_time": self.gen_time,
            "genes": [],
        })
        kept_genes = self.annotation["genes"]

        gene_ids = self.data.gene_ids()
        total = len(gene_ids)

        for index, gid in enumerate(gene_ids):
            print("{}/{} genes".format(index, total))
            gened = self.get_genes(gid)
            count["gene"] += 1

            for tid in self.data.transcript_ids_of_gene_id(gid):
                transd = self.get_transcripts(tid)
                count["transcript"] += 1
                if not transd:
                    # get_transcripts returned nothing: counted, not exported
                    count["non_coding"] += 1
                    continue
                transd["exons"].extend(
                    self.get_exons(eid)
                    for eid in self.data.exon_ids_of_transcript_id(tid))
                transd["domains"] = self.get_domains(transd["protein_id"])
                gened["transcripts"].append(transd)

            # Genes left without any exported transcript are dropped
            if gened["transcripts"]:
                kept_genes.append(gened)

        summary = "{gene:,} genes, {transcript:,} transcripts ({non_coding:,} non-coding transcripts, ignored)"
        print(summary.format(**count))
        return self.annotation

    def dump_json(self):
        """
        Method write ``self.annotation`` as json to the path in ``self.output``,
               replacing any existing file.
        """
        destination = self.output
        with open(destination, mode="w") as handle:
            json.dump(self.annotation, handle)

    def delete_cache(self):
        """
        Method remove every file whose path starts with ``self.cache_prefix``
               (the pyensembl and domain cache files), announcing each one.
        """
        pattern = self.cache_prefix + "*"
        stale_files = glob(pattern)
        for stale in stale_files:
            print("Removing cache file", stale)
            os.remove(stale)

    def choose_best_transcripts(self):
        """
        Select a canonical transcript for each human gene using Ensembl rules.

        For human, the canonical transcript for a gene is set according to the following hierarchy:
        - 1. Longest CCDS translation with no stop codons.
        - 2. If no (1), choose the longest Ensembl/Havana merged translation with no stop codons.
        - 3. If no (2), choose the longest translation with no stop codons.
        - 4. If no translation, choose the longest non-protein-coding transcript.

        See: http://uswest.ensembl.org/Help/Glossary?id=346

        This implementation approximates that hierarchy with three passes per
        gene: the longest protein sequence among protein-coding transcripts,
        then the longest span from start codon to translation end, then the
        longest genomic span. CCDS and Ensembl/Havana membership are not
        checked here.

        Returns:
            set: canonical Ensembl transcript IDs (at most one per gene); an
                 empty set when the species is not homo_sapiens
        """
        def pick_longest(scored):
            """Return the first transcript with the largest positive length, else None."""
            longest = None
            longest_len = 0
            for candidate, length in scored:
                if length > longest_len:
                    longest = candidate
                    longest_len = length
            return longest

        def longest_ccds(transcripts):
            """Longest protein sequence among the protein-coding transcripts."""
            # A missing (None) protein sequence counts as length 0 instead of
            # making len() fail.
            return pick_longest((t, len(t.protein_sequence or ""))
                                for t in transcripts if t.is_protein_coding)

        def translation_span(t):
            """Genomic distance from the start codon to the end of translation."""
            start = t.start_codon_positions[0]
            if t.contains_stop_codon:
                stop = t.stop_codon_positions[2]
            elif t.strand in ("-", "-1"):
                # Reverse strand: without a stop codon, run to the low end
                stop = t.start
            else:
                stop = t.end
            # abs(): on the reverse strand the stop lies below the start
            return abs(stop - start)

        def longest_translation(transcripts):
            """Longest start-codon-to-translation-end span."""
            return pick_longest((t, translation_span(t))
                                for t in transcripts if t.contains_start_codon)

        def longest_transcript(transcripts):
            """Longest transcript by genomic span."""
            return pick_longest((t, t.end - t.start) for t in transcripts)

        best = set()

        # Ensembl rules for canonical transcripts only apply to humans
        species = self.data.species.latin_name
        if species != "homo_sapiens":
            print(
                "Unable to choose canonical transcripts for {}. You can specify canonical transcripts with '--best-transcript-file'"
                .format(species))
            return best

        print("Selecting a canoncial transcript for each gene")

        for gene_id in self.data.gene_ids():
            transcripts = [
                self.data.transcript_by_id(transcript_id) for transcript_id in
                self.data.transcript_ids_of_gene_id(gene_id)
            ]
            selected = (longest_ccds(transcripts)
                        or longest_translation(transcripts)
                        or longest_transcript(transcripts))
            if selected:
                best.add(selected.transcript_id)

        return best
# Example no. 2
# 0
class LlamaEnsembl(object):
    """ Ensembl tools """
    def __init__(self, genome='hg19'):
        """
        Pick the Ensembl release and REST server for ``genome``.

        'hg19' selects release 75 and the GRCh37 REST server; any other value
        selects release 77 and the default REST server. ``self.db`` is the
        ``EnsemblRelease`` object for the chosen release.
        """
        use_grch37 = genome == 'hg19'
        self.version = 75 if use_grch37 else 77
        self.rest_url = ("http://grch37.rest.ensembl.org"
                         if use_grch37 else "http://rest.ensembl.org")
        self.db = EnsemblRelease(self.version)

    def rest_call(self, ext, data=None):
        """
        Send one request to the Ensembl REST server and decode the JSON reply.

        Args:
            ext (str): endpoint path appended to ``self.rest_url``
            data (str): optional request body; when truthy the request is a
                        POST carrying this body, otherwise it is a GET
        Returns:
            the JSON-decoded response body
        Raises:
            requests.exceptions.HTTPError: when the server answers with an
                error status
        """
        url = self.rest_url + ext
        headers = {"Content-Type": "application/json"}
        if data:
            headers["Accept"] = "application/json"
            r = requests.post(url, headers=headers, data=data)
        else:
            r = requests.get(url, headers=headers)

        # raise_for_status() does nothing on success and raises HTTPError on
        # an error status, so it needs no ``r.ok`` guard; the sys.exit() that
        # used to follow it could never run.
        r.raise_for_status()
        return r.json()

    def load_ensembl_ref(self, rid=None):
        """ Download, load, and index ensembl data """
        self.db.download(self.version)
        self.db.index()
        if rid is not None:
            return self.db.transcript_by_id(rid)
        else:
            return None

    def get_exon_numbers(self, gene):
        """ This creates exon areas from the biggest transcript """
        dct = {'start': [], 'id': [], 'stop': [], 'transcript': []}
        gene_id = self.db.gene_ids_of_gene_name(gene)[0]
        transcripts = self.db.transcript_ids_of_gene_id(gene_id)
        longest = 0
        e = None
        for trans in transcripts:
            tsc = self.db.exon_ids_of_transcript_id(trans)
            tsize = len(tsc)
            if tsize > longest:
                longest = tsize
                e = tsc
                longest_transcript = trans
        for exid in e:
            exon = self.db.exon_by_id(exid)
            dct['start'].append(exon.start)
            dct['stop'].append(exon.end)
            dct['id'].append(exid)
            dct['transcript'].append(longest_transcript)
        df = pd.DataFrame(dct)
        df['number'] = df.index + 1
        return df

    def get_genes(self, chrom, start, stop):
        if isinstance(chrom, str):
            chrom = chrom.replace('chr', '')
        return [
            gobj.gene_name
            for gobj in self.db.genes_at_locus(chrom, start, stop)
        ]

    def get_gene_pos(self, gene):
        gene_id = self.db.gene_ids_of_gene_name(gene)[0]
        result = self.db.gene_by_id(gene_id)
        return result.contig, result.start, result.end

    # Rest client calls
    def get_rsids(self, rsids):
        ext = "/variation/homo_sapiens"
        data = {"ids": rsids}
        return self.rest_call(ext, json.dumps(data))

    def get_cds_region(self, transcript, position):
        """ get location of variant to """
        ext = "/variation/human/{}:{}?".format(transcript, position)
        try:
            mappings = self.rest_call(ext)['mappings'][0]
        except requests.exceptions.HTTPError:
            return '', '', ''
        return mappings['seq_region_name'], mappings['start'], mappings['end']

    def parse_ref_exons(self, chrom, start, stop, gene=None, tx_col=None):
        """ Return fasta reference with only the sequences needed"""
        ens_db = self.db
        if isinstance(chrom, str):
            chrom = chrom.replace('chr', '')
        try:
            exons = ens_db.exons_at_locus(chrom, start, stop)
        except ValueError as e:
            # Load pyensembl db
            raise e
        if not len(exons):
            return '', ''
        exon_numbers = self.get_exon_numbers(exons[0].gene_name)
        transcript = exon_numbers['transcript'].values[0]
        trx_exons = []
        for ex in exons:
            nrow = exon_numbers[exon_numbers['id'] == ex.exon_id]
            if nrow.shape[0] > 0:
                trx_exons.extend(nrow['number'].values)
        return transcript, ','.join([str(number) for number in trx_exons])

    # Annotate DataFrames
    def annotate_dataframe(self,
                           df,
                           chrom_col='CHROM',
                           start_col='START',
                           end_col='END',
                           gene_col=None,
                           tx_col=None):
        genes = []
        exons = []
        transcripts = []
        for i, row in df.iterrows():
            genes_row = self.get_genes(row[chrom_col], row[start_col],
                                       row[end_col])
            if gene_col:
                if row[gene_col] in genes_row:
                    genes_row = [row[gene_col]]
                else:
                    print(
                        'Warning!! {} not found for {}:{}-{} in row {}'.format(
                            row[gene_col], row[chrom_col], row[start_col],
                            row[end_col], i))
            genes.append(','.join(genes_row))
            if len(genes_row) == 1 or tx_col:
                trans_row, exons_row = self.parse_ref_exons(
                    row[chrom_col],
                    row[start_col],
                    row[end_col],
                    gene=genes_row[0],
                    tx_col=tx_col
                )  # TODO - add fucntionality to choose gene and transcript
            elif len(genes_row) == 0:
                trans_row, exons_row = self.parse_ref_exons(row[chrom_col],
                                                            row[start_col],
                                                            row[end_col],
                                                            tx_col=tx_col)
            else:
                trans_row = ''
                exons_row = ''
            exons.append(exons_row)
            transcripts.append(trans_row)
        new_df = pd.DataFrame(
            {
                'genes': genes,
                'exons': exons,
                'transcript': transcripts
            },
            index=df.index)
        return new_df

    def annotate_variants(self, rsid_array, extra_cols=[]):
        """ Get chom:start-end for a list of variants """
        result = {
            'chrom': [],
            'start': [],
            'end': [],
            'rsid': [],
            'allele': [],
            'vartype': [],
            'consequence': []
        }
        for extra in extra_cols:
            result[extra] = []
        response = self.get_rsids(rsid_array)
        for var in rsid_array:
            if var not in response:
                continue
            mapping = response[var]['mappings'][0]
            result['chrom'].append(mapping['seq_region_name'])
            result['start'].append(mapping['start'])
            result['end'].append(mapping['end'])
            result['rsid'].append(var)
            result['allele'].append(mapping['allele_string'])
            result['vartype'].append(response[var]['var_class'])
            result['consequence'].append(
                response[var]['most_severe_consequence'])
            for extra in extra_cols:
                result[extra].append(response[var][extra])
        return pd.DataFrame(result)

    def annotate_cds_regions(self, df, tx_col='NM', cds_col='MutationName'):
        chroms = []
        starts = []
        ends = []
        for _, row in df.iterrows():
            location = self.get_cds_region(row[tx_col], row[cds_col])
            chroms.append(location[0])
            starts.append(location[1])
            ends.append(location[2])
        df['chrom'] = chroms
        df['start'] = starts
        df['end'] = ends
        return df