def main(opts):
    """Write protein sequences for the partner transcripts of each fusion.

    Reads the tab-separated fusion table at ``opts['input']``, looks up the
    5' and 3' transcript of every row in Ensembl release 95 and writes one
    record per distinct (gene, transcript, protein) to ``opts['output']``
    as a tab-separated file without an index column.
    """
    ensembl = EnsemblRelease(95)
    fusions = pd.read_csv(opts['input'], sep='\t')

    header = ['gene', 'transcript_id', 'protein_id', 'protein_sequence']
    records = []
    for _, fusion in fusions.iterrows():
        # 5' partner first, then 3' partner, so rows keep their original order
        partners = [
            (fusion["5'_gene"], fusion["5'_transcript"]),
            (fusion["3'_gene"], fusion["3'_transcript"]),
        ]
        transcripts = [ensembl.transcript_by_id(tx_id) for _, tx_id in partners]
        for (gene_name, tx_id), transcript in zip(partners, transcripts):
            records.append([
                gene_name,
                tx_id,
                transcript.protein_id,
                transcript.protein_sequence,
            ])

    table = pd.DataFrame(records, columns=header)
    # the sequence column is deliberately not part of the duplicate key
    unique_rows = table.drop_duplicates(subset=header[:3])
    unique_rows.to_csv(opts['output'], sep='\t', index=False)
Exemple #2
0
def random_variants(count,
                    ensembl_release=MAX_ENSEMBL_RELEASE,
                    deletions=True,
                    insertions=True,
                    random_seed=None):
    """
    Generate a VariantCollection with random variants that overlap
    at least one complete coding transcript.

    Parameters
    ----------
    count : int
        Number of variants to generate.
    ensembl_release : int
        Ensembl release used to pick transcripts and annotate the variants.
    deletions : bool
        If true, the empty string is a candidate alternate allele.
    insertions : bool
        If true, every two-nucleotide string is a candidate alternate allele.
    random_seed : optional
        Seed handed to random.Random.

    Returns a VariantCollection holding `count` variants.
    """
    rng = random.Random(random_seed)
    ensembl = EnsemblRelease(ensembl_release)

    # the list of transcript IDs is cached per release at module level
    if ensembl_release in _transcript_ids_cache:
        transcript_ids = _transcript_ids_cache[ensembl_release]
    else:
        transcript_ids = ensembl.transcript_ids()
        _transcript_ids_cache[ensembl_release] = transcript_ids

    variants = []

    while len(variants) < count:
        transcript_id = rng.choice(transcript_ids)
        transcript = ensembl.transcript_by_id(transcript_id)

        if not transcript.complete:
            continue

        # pick a random position inside a random exon of the transcript
        exon = rng.choice(transcript.exons)
        base1_genomic_position = rng.randint(exon.start, exon.end)
        transcript_offset = transcript.spliced_offset(base1_genomic_position)

        try:
            seq = transcript.sequence
        except ValueError as e:
            # logging.warn is a deprecated alias that was removed in
            # Python 3.13; logging.warning is the supported spelling
            logging.warning(e)
            # can't get sequence for non-coding transcripts
            continue

        ref = str(seq[transcript_offset])
        if transcript.on_backward_strand:
            ref = reverse_complement(ref)

        # substitutions: every standard nucleotide other than the reference
        alt_nucleotides = [x for x in STANDARD_NUCLEOTIDES if x != ref]

        if insertions:
            nucleotide_pairs = [
                x + y for x in STANDARD_NUCLEOTIDES
                for y in STANDARD_NUCLEOTIDES
            ]
            alt_nucleotides.extend(nucleotide_pairs)
        if deletions:
            alt_nucleotides.append("")
        alt = rng.choice(alt_nucleotides)
        variant = Variant(transcript.contig,
                          base1_genomic_position,
                          ref=ref,
                          alt=alt,
                          ensembl=ensembl)
        variants.append(variant)
    return VariantCollection(variants)
Exemple #3
0
class ScrapeEnsembl():
    ''' Look up gene and transcript information in a pyensembl release.

    Attributes:
        query: the query string with every "chr" substring removed
        hg_version: Ensembl release number looked up from the genome build
            name (None when the build name is not a key of ``genome``)
        hg: EnsemblRelease object built from ``hg_version``
    '''

    # genome build name -> Ensembl release number
    genome = {"hg19": 75, "hg38": 83}

    def __init__(self, query, hg_version):
        self.query = query.replace("chr","")
        self.hg_version = ScrapeEnsembl.genome.get(hg_version) # convert to ensembl release
        self.hg = EnsemblRelease(self.hg_version) # convert to ensembl release object

    def get_gene_info(self):
        ''' Get the gene information at a given genomic position or range

        The query must look like "<contig>:<position>" or
        "<contig>:<start>-<end>"; the contig may be non-numeric (e.g. X, MT).

        Returns:
            None when the query is not a locus, a message string when no
            gene overlaps the locus, otherwise the tuple
            (gene name, gene id, biotype, "contig:start-end") of the first
            gene name reported at the locus.
        '''
        locus = re.fullmatch(r"([\w.]+):(\d+)(?:-(\d+))?", self.query)
        if locus is None:
            return None

        contig, start, end = locus.groups()
        gene_names = self.hg.gene_names_at_locus(
            contig=contig,
            position=int(start),
            end=int(end) if end is not None else None)
        if not gene_names:
            msg = " ".join(("No gene found at",self.query,"for genome version",
                            str(self.hg_version)))
            return msg

        gene = self.hg.genes_by_name(gene_names[0])[0]
        # build the location from the locus attributes instead of scraping
        # str(gene), whose layout is not a stable interface
        gene_location = "{}:{}-{}".format(gene.contig, gene.start, gene.end)

        return (gene.name, gene.id, gene.biotype, gene_location)

    def get_canonical_transcript(self, gene_name):
        ''' Determine and return the canonical transcript of the given gene

        The canonical transcript is taken to be the protein coding
        transcript with the largest genomic span (end - start); ties are
        broken by the greater transcript ID.

        Returns:
            the transcript ID, or None when the gene has no protein coding
            transcript.
        '''
        protein_coding_transcripts = []
        for transcript_id in self.hg.transcript_ids_of_gene_name(gene_name):
            transcript = self.hg.transcript_by_id(transcript_id)
            if transcript.biotype == "protein_coding":
                size = transcript.end - transcript.start
                protein_coding_transcripts.append(
                    (size, transcript.transcript_id))

        if not protein_coding_transcripts:
            return None
        return max(protein_coding_transcripts)[1]
Exemple #4
0
def random_variants(count,
                    ensembl_release=MAX_ENSEMBL_RELEASE,
                    deletions=True,
                    insertions=True,
                    random_seed=None):
    """
    Generate a VariantCollection with random variants that overlap
    at least one complete coding transcript.

    Parameters
    ----------
    count : int
        Number of variants to generate.
    ensembl_release : int
        Ensembl release used to pick transcripts and annotate the variants.
    deletions : bool
        If true, the empty string is a candidate alternate allele.
    insertions : bool
        If true, every two-nucleotide string is a candidate alternate allele.
    random_seed : optional
        Seed handed to random.Random.

    Returns a VariantCollection holding `count` variants.

    Raises ValueError if `count` variants could not be produced within
    `count * 100` transcript draws.
    """
    rng = random.Random(random_seed)
    ensembl = EnsemblRelease(ensembl_release)

    # the list of transcript IDs is cached per release at module level
    if ensembl_release in _transcript_ids_cache:
        transcript_ids = _transcript_ids_cache[ensembl_release]
    else:
        transcript_ids = ensembl.transcript_ids()
        _transcript_ids_cache[ensembl_release] = transcript_ids

    variants = []

    # we should finish way before the attempt budget is used up but just in
    # case something is wrong with PyEnsembl we want to avoid an infinite loop
    max_attempts = count * 100
    attempts = 0

    # Testing the length first means count == 0 yields an empty collection
    # and a variant completed on the very last attempt is still returned.
    while len(variants) < count:
        if attempts >= max_attempts:
            raise ValueError(("Unable to generate %d random variants, "
                              "there may be a problem with PyEnsembl") % count)
        attempts += 1

        transcript_id = rng.choice(transcript_ids)
        transcript = ensembl.transcript_by_id(transcript_id)

        if not transcript.complete:
            continue

        # pick a random position inside a random exon of the transcript
        exon = rng.choice(transcript.exons)
        base1_genomic_position = rng.randint(exon.start, exon.end)
        transcript_offset = transcript.spliced_offset(
            base1_genomic_position)
        seq = transcript.sequence

        ref = str(seq[transcript_offset])
        if transcript.on_backward_strand:
            ref = reverse_complement(ref)

        # substitutions: every standard nucleotide other than the reference
        alt_nucleotides = [x for x in STANDARD_NUCLEOTIDES if x != ref]

        if insertions:
            nucleotide_pairs = [
                x + y for x in STANDARD_NUCLEOTIDES
                for y in STANDARD_NUCLEOTIDES
            ]
            alt_nucleotides.extend(nucleotide_pairs)
        if deletions:
            alt_nucleotides.append("")
        alt = rng.choice(alt_nucleotides)
        variant = Variant(transcript.contig,
                          base1_genomic_position,
                          ref=ref,
                          alt=alt,
                          ensembl=ensembl)
        variants.append(variant)

    return VariantCollection(variants)
class EnsemblAnnotation(object):
    """
    Class for building an annotation file for MAVIS in json format.

    Constructing an instance downloads/indexes the pyensembl data, loads the
    domain cache, selects the best transcripts and builds the annotation
    (see ``build_json``); ``dump_json`` writes it to ``output``.

    Args:
        release (int): Ensembl release to use
        species (str): species of interest
        output (str): path to output file
        best_file (str): path to file of "best transcripts"
        alias_file (str): path to file with gene aliases
        custom_cache (str): path to directory to cache pyensembl files
    """
    def __init__(self,
                 release,
                 species,
                 output,
                 best_file=None,
                 alias_file=None,
                 custom_cache=None):

        self.annotation = {}

        self.custom_cache = custom_cache
        self.cache_prefix = None
        self.gen_time = get_date()
        self.release = release
        self.species = species
        self.output = output

        self.best_file = best_file
        self.alias_file = alias_file

        if self.alias_file:
            self.alias = parse_alias_file(self.alias_file)
        else:
            self.alias = defaultdict(set)

        # Export the cache location before the release object exists, in case
        # pyensembl resolves its cache directory when the object is built.
        # NOTE(review): confirm against the installed pyensembl version.
        if self.custom_cache:
            os.environ["PYENSEMBL_CACHE_DIR"] = self.custom_cache

        self.data = EnsemblRelease(release, species)
        self.download_pyensembl_cache()
        self.get_domain_cache()

        if self.best_file:
            self.best = parse_best_file(self.best_file)
        else:
            self.best = self.choose_best_transcripts()

        self.build_json()

    def download_pyensembl_cache(self):
        """
        Method download the pyensembl cache files for this release if not already there.

        Uses ``self.data`` (the EnsemblRelease object) and ``self.custom_cache``
        (optional path to the directory used to cache pyensembl files), and
        records the GTF path up to "gtf.gz" in ``self.cache_prefix``.
        """
        if self.custom_cache:
            os.environ["PYENSEMBL_CACHE_DIR"] = self.custom_cache
        self.data.download()
        self.data.index()
        self.cache_prefix = self.data.gtf_path.split("gtf.gz")[0]

    def get_domain_cache(self):
        """
        Method point the module-level DOMAIN_CACHE_PATH at the domain cache
               file next to the pyensembl cache and load the cached domains.
        """
        global DOMAIN_CACHE_PATH
        DOMAIN_CACHE_PATH = self.cache_prefix + "domain.tsv"
        parse_cached_domains()

    def get_genes(self, eid):
        """
        Method parse gene info in the EnsemblRelease into json format.
        Args:
            eid (str): Ensembl gene ID
        Returns:
            dict: gene info formatted for json
        """
        gene = self.data.gene_by_id(eid)
        # .get() works whether the alias table is a defaultdict or a plain
        # dict and does not insert empty entries for genes without aliases
        extra_aliases = self.alias.get(gene.gene_id, ())
        result = {
            "name": str(gene.gene_id),
            "chr": str(gene.contig),
            "start": int(gene.start),
            "end": int(gene.end),
            "strand": str(gene.strand),
            "aliases": [str(gene.gene_name)] + list(extra_aliases),
            "transcripts": [],
        }

        return result

    def get_transcripts(self, eid):
        """
        Method parse transcript info in the EnsemblRelease into json format.
               Ignore non-coding transcripts.
        Args:
            eid (str): Ensembl transcript ID
        Returns:
            dict: transcript info formatted for json, or None when the
                  transcript has no protein ID
        """
        transcript = self.data.transcript_by_id(eid)
        protein_id = transcript.protein_id
        if not protein_id:
            return None

        result = {
            "name": str(transcript.transcript_id),
            "start": int(transcript.start),
            "end": int(transcript.end),
            "aliases": [str(transcript.transcript_name)],
            "is_best_transcript": str(transcript.transcript_id) in self.best,
            "protein_id": protein_id,
            "exons": [],
            "domains": [],
        }

        # start/end are absolute genomic positions, so calculate positions relative to the mRNA start
        cpos = transcript.coding_sequence_position_ranges
        if transcript.strand in ("+", "1"):
            result["cdna_coding_start"] = transcript.spliced_offset(
                cpos[0][0]) + 1
            result["cdna_coding_end"] = transcript.spliced_offset(
                cpos[-1][1]) + 1
        elif transcript.strand in ("-", "-1"):
            result["cdna_coding_start"] = transcript.spliced_offset(
                cpos[0][1]) + 1
            result["cdna_coding_end"] = transcript.spliced_offset(
                cpos[-1][0]) + 1

        return result

    def get_exons(self, eid):
        """
        Method parse exon info in the EnsemblRelease into json format.
        Args:
            eid (str): Ensembl exon ID
        Returns:
            dict: exon info formatted for json
        """
        exon = self.data.exon_by_id(eid)
        result = {
            "name": str(exon.exon_id),
            "start": int(exon.start),
            "end": int(exon.end)
        }

        return result

    @cached_domains
    def get_domains(self, eid):
        """
        Method request domain info from Ensembl and parse into json format.
               Domains without a description are dropped; repeated domain
               IDs are merged into one entry with several regions.
        Args:
            eid (str): Ensembl protein ID
        Returns:
            list: a list of domains formatted for json
        """
        temp = {}

        protein = request_ensembl_protein(eid)
        for domain in protein:
            name = str(domain["id"])
            desc = (str(domain["description"]).replace('"',
                                                       "").replace("'", "")
                    )  # quotes causing errors when mavis loads json
            region = {"start": int(domain["start"]), "end": int(domain["end"])}
            if desc == "":
                continue
            if name in temp:
                temp[name]["regions"].append(region)
            else:
                temp[name] = {"name": name, "desc": desc, "regions": [region]}

        domain_list = list(temp.values())
        return domain_list

    def build_json(self):
        """
        Method compile a json object for MAVIS of all protein coding genes and
               associated info for the indicated species. Genes without any
               coding transcript are left out.
        Returns:
            dict: a json-formatted set of annotations for use with MAVIS
        """
        count = {"gene": 0, "transcript": 0, "non_coding": 0}

        self.annotation["script"] = SCRIPT
        self.annotation["script_version"] = VERSION
        self.annotation["gene_alias_file"] = self.alias_file
        self.annotation["best_transcript_file"] = self.best_file
        self.annotation["ensembl_version"] = self.release
        self.annotation["generation_time"] = self.gen_time
        self.annotation["genes"] = []

        gene_ids = self.data.gene_ids()

        for index, gid in enumerate(gene_ids):
            print("{}/{} genes".format(index, len(gene_ids)))
            gened = self.get_genes(gid)
            count["gene"] += 1
            for tid in self.data.transcript_ids_of_gene_id(gid):
                transd = self.get_transcripts(tid)
                count["transcript"] += 1
                if transd:
                    for eid in self.data.exon_ids_of_transcript_id(tid):
                        exond = self.get_exons(eid)
                        transd["exons"].append(exond)
                    domains = self.get_domains(transd["protein_id"])
                    transd["domains"] = domains
                    gened["transcripts"].append(transd)
                else:
                    count["non_coding"] += 1
            if gened["transcripts"]:
                self.annotation["genes"].append(gened)

        print(
            "{gene:,} genes, {transcript:,} transcripts ({non_coding:,} non-coding transcripts, ignored)"
            .format(**count))
        return self.annotation

    def dump_json(self):
        """
        Method dump the annotations in json-format to the specified output file.
        """
        with open(self.output, "w") as fh:
            json.dump(self.annotation, fh)

    def delete_cache(self):
        """
        Method delete both the pyensembl and domain cache files
               (every file whose path starts with the cache prefix).
        """
        for cache_file in glob(self.cache_prefix + "*"):
            print("Removing cache file", cache_file)
            os.remove(cache_file)

    def choose_best_transcripts(self):
        """
        Select a canonical transcript for each human gene using Ensembl rules.

        For human, the canonical transcript for a gene is set according to the following hierarchy: 
        - 1. Longest CCDS translation with no stop codons. 
        - 2. If no (1), choose the longest Ensembl/Havana merged translation with no stop codons. 
        - 3. If no (2), choose the longest translation with no stop codons. 
        - 4. If no translation, choose the longest non-protein-coding transcript.

        See: http://uswest.ensembl.org/Help/Glossary?id=346

        The helpers below approximate these rules: (1) is the protein coding
        transcript with the longest protein sequence, (3) the transcript with
        the largest genomic span from start codon to stop codon (or to the
        transcript's 3' end), (4) the transcript with the largest genomic span.

        Returns:
            set: canonical Ensembl transcript IDs, at most one per gene
                 (empty for non-human species)
        """
        def longest_ccds(transcripts):
            """Protein coding transcript with the longest protein sequence."""
            longest = None
            longest_len = 0
            for t in transcripts:
                if not t.is_protein_coding:
                    continue
                # the sequence can be missing; treat that as length zero
                # instead of failing on len(None)
                sequence = t.protein_sequence
                if sequence and len(sequence) > longest_len:
                    longest = t
                    longest_len = len(sequence)
            return longest

        def longest_translation(transcripts):
            """Transcript with the largest start-codon-to-stop genomic span."""
            longest = None
            longest_len = 0
            for t in transcripts:
                if t.contains_start_codon:
                    start = t.start_codon_positions[0]
                    if t.contains_stop_codon:
                        stop = t.stop_codon_positions[2]
                    elif t.strand in ("-", "-1"):
                        # on the reverse strand the 3' end of the transcript
                        # is its lowest genomic coordinate
                        stop = t.start
                    else:
                        stop = t.end
                    # abs(): the stop lies below the start in genomic
                    # coordinates for reverse-strand transcripts
                    span = abs(stop - start)
                    if span > longest_len:
                        longest = t
                        longest_len = span
            return longest

        def longest_transcript(transcripts):
            """Longest transcript."""
            longest = None
            longest_len = 0
            for t in transcripts:
                if t.end - t.start > longest_len:
                    longest = t
                    longest_len = t.end - t.start
            return longest

        best = set()

        # Ensembl rules for canonical transcripts only apply to humans
        species = self.data.species.latin_name
        if species != "homo_sapiens":
            print(
                "Unable to choose canonical transcripts for {}. You can specify canonical transcripts with '--best-transcript-file'"
                .format(species))
            return best
        else:
            print("Selecting a canoncial transcript for each gene")

        for gene_id in self.data.gene_ids():
            transcripts = [
                self.data.transcript_by_id(transcript_id) for transcript_id in
                self.data.transcript_ids_of_gene_id(gene_id)
            ]
            selected = (longest_ccds(transcripts)
                        or longest_translation(transcripts)
                        or longest_transcript(transcripts))
            if selected:
                best.add(selected.transcript_id)

        return best
Exemple #6
0
                    mstrg_list = []
                    mstrg_list.append(g_id_fresh)

                else:
                    mstrg_list.append(g_id_fresh)

            ## Check if our gene id is already in the MSTRG tracker
            if g_id_fresh in mstrg_list:

                ## If subsequent entries for the MSTRG gene have an ensembl transcript, print out the correct info
                if "ENS" in t_id:
                    mstrg = g_id_fresh.split("\"")[1]
                    g_name = g_name.split("\"")[1]
                    t_id = t_id.split("\"")[1]

                    ensembl = data.transcript_by_id(t_id)
                    ensembl_id = ensembl.gene_id
                    ensembl_name = ensembl.transcript_name
                    print(mstrg, ensembl_id, t_id, g_name, sep="\t")

                    ##  kick future genes with this id out of the loop
                    g_id.append(g_id_fresh)
                    ## and squash the mstrg list
                    mstrg_list = []

                ## if no ENS id is found, save the info for the next round
                elif "MSTRG" in t_id:
                    line2 = line

            # if the id hasn't been found yet, and is NOT in the mstrg_list, then:
            else:
Exemple #7
0
def random_variants(
        count,
        ensembl_release=MAX_ENSEMBL_RELEASE,
        deletions=True,
        insertions=True,
        random_seed=None):
    """
    Generate a VariantCollection with random variants that overlap
    at least one complete coding transcript.

    Parameters
    ----------
    count : int
        Number of variants to generate.
    ensembl_release : int
        Ensembl release used to pick transcripts and annotate the variants.
    deletions : bool
        If true, the empty string is a candidate alternate allele.
    insertions : bool
        If true, every two-nucleotide string is a candidate alternate allele.
    random_seed : optional
        Seed handed to random.Random.

    Returns a VariantCollection holding `count` variants.
    """
    rng = random.Random(random_seed)
    ensembl = EnsemblRelease(ensembl_release)

    # the list of transcript IDs is cached per release at module level
    if ensembl_release in _transcript_ids_cache:
        transcript_ids = _transcript_ids_cache[ensembl_release]
    else:
        transcript_ids = ensembl.transcript_ids()
        _transcript_ids_cache[ensembl_release] = transcript_ids

    variants = []

    while len(variants) < count:
        transcript_id = rng.choice(transcript_ids)
        transcript = ensembl.transcript_by_id(transcript_id)

        if not transcript.complete:
            continue

        # pick a random position inside a random exon of the transcript
        exon = rng.choice(transcript.exons)
        base1_genomic_position = rng.randint(exon.start, exon.end)
        transcript_offset = transcript.spliced_offset(base1_genomic_position)

        try:
            seq = transcript.sequence
        except ValueError as e:
            # logging.warn is a deprecated alias that was removed in
            # Python 3.13; logging.warning is the supported spelling
            logging.warning(e)
            # can't get sequence for non-coding transcripts
            continue

        ref = str(seq[transcript_offset])
        if transcript.on_backward_strand:
            ref = reverse_complement(ref)

        # substitutions: every standard nucleotide other than the reference
        alt_nucleotides = [x for x in STANDARD_NUCLEOTIDES if x != ref]

        if insertions:
            nucleotide_pairs = [
                x + y
                for x in STANDARD_NUCLEOTIDES
                for y in STANDARD_NUCLEOTIDES
            ]
            alt_nucleotides.extend(nucleotide_pairs)
        if deletions:
            alt_nucleotides.append("")
        alt = rng.choice(alt_nucleotides)
        variant = Variant(
            transcript.contig,
            base1_genomic_position,
            ref=ref,
            alt=alt,
            ensembl=ensembl)
        variants.append(variant)
    return VariantCollection(variants)
Exemple #8
0
def random_variants(
        count,
        ensembl_release=MAX_ENSEMBL_RELEASE,
        deletions=True,
        insertions=True,
        random_seed=None):
    """
    Generate a VariantCollection with random variants that overlap
    at least one complete coding transcript.

    Parameters
    ----------
    count : int
        Number of variants to generate.
    ensembl_release : int
        Ensembl release used to pick transcripts and annotate the variants.
    deletions : bool
        If true, the empty string is a candidate alternate allele.
    insertions : bool
        If true, every two-nucleotide string is a candidate alternate allele.
    random_seed : optional
        Seed handed to random.Random.

    Returns a VariantCollection holding `count` variants.

    Raises ValueError if `count` variants could not be produced within
    `count * 100` transcript draws.
    """
    rng = random.Random(random_seed)
    ensembl = EnsemblRelease(ensembl_release)

    # the list of transcript IDs is cached per release at module level
    if ensembl_release in _transcript_ids_cache:
        transcript_ids = _transcript_ids_cache[ensembl_release]
    else:
        transcript_ids = ensembl.transcript_ids()
        _transcript_ids_cache[ensembl_release] = transcript_ids

    variants = []

    # we should finish way before the attempt budget is used up but just in
    # case something is wrong with PyEnsembl we want to avoid an infinite loop
    max_attempts = count * 100
    attempts = 0

    # Testing the length first means count == 0 yields an empty collection
    # and a variant completed on the very last attempt is still returned.
    while len(variants) < count:
        if attempts >= max_attempts:
            raise ValueError(
                ("Unable to generate %d random variants, "
                 "there may be a problem with PyEnsembl") % count)
        attempts += 1

        transcript_id = rng.choice(transcript_ids)
        transcript = ensembl.transcript_by_id(transcript_id)

        if not transcript.complete:
            continue

        # pick a random position inside a random exon of the transcript
        exon = rng.choice(transcript.exons)
        base1_genomic_position = rng.randint(exon.start, exon.end)
        transcript_offset = transcript.spliced_offset(base1_genomic_position)
        seq = transcript.sequence

        ref = str(seq[transcript_offset])
        if transcript.on_backward_strand:
            ref = reverse_complement(ref)

        # substitutions: every standard nucleotide other than the reference
        alt_nucleotides = [x for x in STANDARD_NUCLEOTIDES if x != ref]

        if insertions:
            nucleotide_pairs = [
                x + y
                for x in STANDARD_NUCLEOTIDES
                for y in STANDARD_NUCLEOTIDES
            ]
            alt_nucleotides.extend(nucleotide_pairs)
        if deletions:
            alt_nucleotides.append("")
        alt = rng.choice(alt_nucleotides)
        variant = Variant(
            transcript.contig,
            base1_genomic_position,
            ref=ref,
            alt=alt,
            ensembl=ensembl)
        variants.append(variant)

    return VariantCollection(variants)
Exemple #9
0
class LlamaEnsembl(object):
    """ Ensembl tools

    Combines a local pyensembl release (``self.db``) with calls to the
    Ensembl REST service at ``self.rest_url``.
    """
    def __init__(self, genome='hg19'):
        """ Use release 75 and the GRCh37 REST server for 'hg19', otherwise
        release 77 and the default REST server. """
        if genome == 'hg19':
            self.version = 75
            self.rest_url = "http://grch37.rest.ensembl.org"
        else:
            self.version = 77
            self.rest_url = "http://rest.ensembl.org"
        self.db = EnsemblRelease(self.version)

    def rest_call(self, ext, data=None):
        """ Call the REST endpoint `ext` and return the decoded JSON.

        Sends a POST with `data` as the body when `data` is given, otherwise
        a GET. Raises requests.exceptions.HTTPError on an error status.
        """
        url = self.rest_url + ext
        if data:
            headers = {
                "Content-Type": "application/json",
                "Accept": "application/json"
            }
            r = requests.post(url, headers=headers, data=data)
        else:
            headers = {"Content-Type": "application/json"}
            r = requests.get(url, headers=headers)

        # raise_for_status() raises for every response that is not ok, so
        # nothing after it would run in the failure case
        r.raise_for_status()

        return r.json()

    def load_ensembl_ref(self, rid=None):
        """ Download, load, and index ensembl data

        Returns the transcript with ID `rid` when given, otherwise None.
        """
        # download() takes an overwrite flag, not a release number; passing
        # the version would force a fresh download on every call
        self.db.download()
        self.db.index()
        if rid is not None:
            return self.db.transcript_by_id(rid)
        return None

    def get_exon_numbers(self, gene):
        """ This creates exon areas from the biggest transcript

        The biggest transcript is the one with the most exons (the first one
        wins a tie) of the first gene ID found for the gene name.

        Returns a DataFrame with columns start, id, stop, transcript and
        number (1-based position of the exon in the transcript's exon list);
        the frame is empty when the gene has no transcript with exons.
        """
        dct = {'start': [], 'id': [], 'stop': [], 'transcript': []}
        gene_id = self.db.gene_ids_of_gene_name(gene)[0]
        longest = 0
        exon_ids = []
        longest_transcript = None
        for trans in self.db.transcript_ids_of_gene_id(gene_id):
            tsc = self.db.exon_ids_of_transcript_id(trans)
            if len(tsc) > longest:
                longest = len(tsc)
                exon_ids = tsc
                longest_transcript = trans
        for exid in exon_ids:
            exon = self.db.exon_by_id(exid)
            dct['start'].append(exon.start)
            dct['stop'].append(exon.end)
            dct['id'].append(exid)
            dct['transcript'].append(longest_transcript)
        df = pd.DataFrame(dct)
        df['number'] = df.index + 1
        return df

    def get_genes(self, chrom, start, stop):
        """ Names of the genes overlapping chrom:start-stop ('chr' is
        stripped from string chromosomes). """
        if isinstance(chrom, str):
            chrom = chrom.replace('chr', '')
        return [
            gobj.gene_name
            for gobj in self.db.genes_at_locus(chrom, start, stop)
        ]

    def get_gene_pos(self, gene):
        """ Return (contig, start, end) of the first gene ID found for the
        gene name. """
        gene_id = self.db.gene_ids_of_gene_name(gene)[0]
        result = self.db.gene_by_id(gene_id)
        return result.contig, result.start, result.end

    # Rest client calls
    def get_rsids(self, rsids):
        """ POST the variant IDs to the variation endpoint and return the
        decoded response. """
        ext = "/variation/homo_sapiens"
        data = {"ids": rsids}
        return self.rest_call(ext, json.dumps(data))

    def get_cds_region(self, transcript, position):
        """ Get the genomic location of a variant given as transcript:position

        Returns (seq_region_name, start, end) of the first mapping, or three
        empty strings when the request fails or no mapping is reported.
        """
        ext = "/variation/human/{}:{}?".format(transcript, position)
        try:
            mappings = self.rest_call(ext)['mappings'][0]
        except (requests.exceptions.HTTPError, KeyError, IndexError):
            return '', '', ''
        return mappings['seq_region_name'], mappings['start'], mappings['end']

    def parse_ref_exons(self, chrom, start, stop, gene=None, tx_col=None):
        """ Return the transcript and exon numbers overlapping a region

        The transcript is the one chosen by get_exon_numbers for the gene of
        the first exon found at the locus. `gene` and `tx_col` are accepted
        but not used yet.

        Returns (transcript ID, comma-separated exon numbers), or two empty
        strings when nothing is found.
        """
        if isinstance(chrom, str):
            chrom = chrom.replace('chr', '')
        exons = self.db.exons_at_locus(chrom, start, stop)
        if not exons:
            return '', ''
        exon_numbers = self.get_exon_numbers(exons[0].gene_name)
        if exon_numbers.empty:
            return '', ''
        transcript = exon_numbers['transcript'].values[0]
        trx_exons = []
        for ex in exons:
            nrow = exon_numbers[exon_numbers['id'] == ex.exon_id]
            if nrow.shape[0] > 0:
                trx_exons.extend(nrow['number'].values)
        return transcript, ','.join([str(number) for number in trx_exons])

    # Annotate DataFrames
    def annotate_dataframe(self,
                           df,
                           chrom_col='CHROM',
                           start_col='START',
                           end_col='END',
                           gene_col=None,
                           tx_col=None):
        """ Annotate every region of `df` with genes, exons and transcript

        When `gene_col` is given and its gene overlaps the region, only that
        gene is kept; otherwise a warning is printed. Exons and transcript
        are looked up when at most one gene remains or `tx_col` is set, and
        are left empty otherwise.

        Returns a new DataFrame with columns genes, exons and transcript
        sharing the index of `df`.
        """
        genes = []
        exons = []
        transcripts = []
        for i, row in df.iterrows():
            genes_row = self.get_genes(row[chrom_col], row[start_col],
                                       row[end_col])
            if gene_col:
                if row[gene_col] in genes_row:
                    genes_row = [row[gene_col]]
                else:
                    print(
                        'Warning!! {} not found for {}:{}-{} in row {}'.format(
                            row[gene_col], row[chrom_col], row[start_col],
                            row[end_col], i))
            genes.append(','.join(genes_row))
            if len(genes_row) <= 1 or tx_col:
                # genes_row may be empty here, so never index it blindly
                trans_row, exons_row = self.parse_ref_exons(
                    row[chrom_col],
                    row[start_col],
                    row[end_col],
                    gene=genes_row[0] if genes_row else None,
                    tx_col=tx_col
                )  # TODO - add functionality to choose gene and transcript
            else:
                trans_row = ''
                exons_row = ''
            exons.append(exons_row)
            transcripts.append(trans_row)
        new_df = pd.DataFrame(
            {
                'genes': genes,
                'exons': exons,
                'transcript': transcripts
            },
            index=df.index)
        return new_df

    def annotate_variants(self, rsid_array, extra_cols=None):
        """ Get chrom:start-end for a list of variants

        Variants missing from the response or without any mapping are
        skipped. `extra_cols` names additional response fields to copy into
        columns of the same name.
        """
        extra_cols = [] if extra_cols is None else extra_cols
        result = {
            'chrom': [],
            'start': [],
            'end': [],
            'rsid': [],
            'allele': [],
            'vartype': [],
            'consequence': []
        }
        for extra in extra_cols:
            result[extra] = []
        response = self.get_rsids(rsid_array)
        for var in rsid_array:
            if var not in response:
                continue
            mappings = response[var]['mappings']
            if not mappings:
                continue
            mapping = mappings[0]
            result['chrom'].append(mapping['seq_region_name'])
            result['start'].append(mapping['start'])
            result['end'].append(mapping['end'])
            result['rsid'].append(var)
            result['allele'].append(mapping['allele_string'])
            result['vartype'].append(response[var]['var_class'])
            result['consequence'].append(
                response[var]['most_severe_consequence'])
            for extra in extra_cols:
                result[extra].append(response[var][extra])
        return pd.DataFrame(result)

    def annotate_cds_regions(self, df, tx_col='NM', cds_col='MutationName'):
        """ Add chrom, start and end columns to `df` (modified in place and
        returned) from get_cds_region of each row's transcript and change. """
        chroms = []
        starts = []
        ends = []
        for _, row in df.iterrows():
            location = self.get_cds_region(row[tx_col], row[cds_col])
            chroms.append(location[0])
            starts.append(location[1])
            ends.append(location[2])
        df['chrom'] = chroms
        df['start'] = starts
        df['end'] = ends
        return df