Ejemplo n.º 1
0
def symbol2ensg(*args):
    """Return the first gene ID reported for a gene name.

    All positional arguments are forwarded unchanged to
    ``gene_ids_of_gene_name`` on an ``EnsemblRelease`` built for
    ``ENSEMBL_RELEASE_VERSION``; only the first ID of the result is returned.
    """
    release = EnsemblRelease(ENSEMBL_RELEASE_VERSION)
    matching_ids = release.gene_ids_of_gene_name(*args)
    return matching_ids[0]
Ejemplo n.º 2
0
class LlamaEnsembl(object):
    """Ensembl helpers built on a local pyensembl release and the REST API.

    Instance attributes (all set in ``__init__``):
        version: Ensembl release number (75 for ``'hg19'``, 77 otherwise).
        rest_url: Base URL of the Ensembl REST server matching ``version``.
        db: ``EnsemblRelease`` instance created for ``version``.
    """

    def __init__(self, genome='hg19'):
        """Pick the release number and REST server for ``genome``.

        ``'hg19'`` selects release 75 and the GRCh37 REST server; any other
        value selects release 77 and the default REST server.
        """
        if genome == 'hg19':
            self.version = 75
            self.rest_url = "http://grch37.rest.ensembl.org"
        else:
            self.version = 77
            self.rest_url = "http://rest.ensembl.org"
        self.db = EnsemblRelease(self.version)

    def rest_call(self, ext, data=None):
        """Call the REST endpoint ``ext`` and return the decoded JSON body.

        Args:
            ext: Path appended to ``self.rest_url``.
            data: Request body. When truthy the request is a POST carrying
                this body; otherwise a GET is issued.

        Returns:
            The JSON-decoded response.

        Raises:
            requests.exceptions.HTTPError: If the server answers with an
                error status.
        """
        if data:
            headers = {
                "Content-Type": "application/json",
                "Accept": "application/json"
            }
            r = requests.post(self.rest_url + ext, headers=headers, data=data)
        else:
            headers = {"Content-Type": "application/json"}
            r = requests.get(self.rest_url + ext, headers=headers)

        # raise_for_status() does nothing on success and raises HTTPError on
        # failure, so no code after it can run in the failure case (the old
        # sys.exit() that followed it was unreachable).
        r.raise_for_status()
        return r.json()

    def load_ensembl_ref(self, rid=None):
        """Download and index the release data, optionally fetch a transcript.

        Args:
            rid: Transcript ID to look up once the data is indexed, or None.

        Returns:
            ``self.db.transcript_by_id(rid)`` when ``rid`` is given, else
            None.
        """
        # download() accepts an overwrite flag, not a release number. Passing
        # self.version there was always truthy and forced a fresh download on
        # every call; the release is already fixed by self.db itself.
        self.db.download()
        self.db.index()
        if rid is None:
            return None
        return self.db.transcript_by_id(rid)

    def get_exon_numbers(self, gene):
        """Number the exons of the transcript of ``gene`` with the most exons.

        The first gene ID found for ``gene`` is used. Among its transcripts
        the one with the highest exon count wins; ties keep the earliest.

        Args:
            gene: Gene name understood by ``self.db``.

        Returns:
            DataFrame with columns ``start``, ``id``, ``stop``,
            ``transcript`` and ``number`` (1-based position of the exon in
            the chosen transcript's exon list). The frame is empty when no
            transcript of the gene has any exon.
        """
        gene_id = self.db.gene_ids_of_gene_name(gene)[0]
        longest_transcript = None
        longest_exon_ids = []
        for transcript_id in self.db.transcript_ids_of_gene_id(gene_id):
            exon_ids = self.db.exon_ids_of_transcript_id(transcript_id)
            # Strict '>' keeps the first transcript when counts are equal.
            if len(exon_ids) > len(longest_exon_ids):
                longest_exon_ids = exon_ids
                longest_transcript = transcript_id

        # Starting from an empty exon list (instead of None) means a gene
        # without usable transcripts yields an empty frame, not a TypeError.
        dct = {'start': [], 'id': [], 'stop': [], 'transcript': []}
        for exon_id in longest_exon_ids:
            exon = self.db.exon_by_id(exon_id)
            dct['start'].append(exon.start)
            dct['stop'].append(exon.end)
            dct['id'].append(exon_id)
            dct['transcript'].append(longest_transcript)
        df = pd.DataFrame(dct)
        df['number'] = df.index + 1
        return df

    def get_genes(self, chrom, start, stop):
        """Return the names of the genes overlapping ``chrom:start-stop``.

        A string ``chrom`` has every ``'chr'`` substring removed before the
        lookup; other types are passed through unchanged.
        """
        if isinstance(chrom, str):
            chrom = chrom.replace('chr', '')
        return [
            gobj.gene_name
            for gobj in self.db.genes_at_locus(chrom, start, stop)
        ]

    def get_gene_pos(self, gene):
        """Return ``(contig, start, end)`` of the first gene ID for ``gene``."""
        gene_id = self.db.gene_ids_of_gene_name(gene)[0]
        result = self.db.gene_by_id(gene_id)
        return result.contig, result.start, result.end

    # Rest client calls
    def get_rsids(self, rsids):
        """POST ``rsids`` to ``/variation/homo_sapiens`` and return the JSON."""
        ext = "/variation/homo_sapiens"
        data = {"ids": rsids}
        return self.rest_call(ext, json.dumps(data))

    def get_cds_region(self, transcript, position):
        """Look up the genomic location of a transcript-relative variant.

        Queries ``/variation/human/<transcript>:<position>?`` and reads the
        first entry of the response's ``mappings``.

        Returns:
            ``(seq_region_name, start, end)`` of the first mapping, or
            ``('', '', '')`` when the request fails with an HTTP error or the
            response carries no mapping.
        """
        ext = "/variation/human/{}:{}?".format(transcript, position)
        try:
            mappings = self.rest_call(ext)['mappings'][0]
        except (requests.exceptions.HTTPError, KeyError, IndexError):
            # A missing or empty 'mappings' list is treated like a failed
            # lookup so one unresolved variant does not abort a batch.
            return '', '', ''
        return mappings['seq_region_name'], mappings['start'], mappings['end']

    def parse_ref_exons(self, chrom, start, stop, gene=None, tx_col=None):
        """Return the transcript and exon numbers overlapping a locus.

        The gene is taken from the first exon found at the locus; its exon
        numbering comes from ``get_exon_numbers``.

        Args:
            chrom: Contig; a string has every ``'chr'`` substring removed.
            start: Locus start.
            stop: Locus end.
            gene: Accepted for API compatibility; currently unused.
            tx_col: Accepted for API compatibility; currently unused.

        Returns:
            ``(transcript_id, exon_numbers)`` where ``exon_numbers`` is a
            comma-separated string, or ``('', '')`` when no exon overlaps
            the locus or the gene has no numbered exons.
        """
        if isinstance(chrom, str):
            chrom = chrom.replace('chr', '')
        # Lookup errors (e.g. ValueError) propagate unchanged to the caller.
        exons = self.db.exons_at_locus(chrom, start, stop)
        if not exons:
            return '', ''
        exon_numbers = self.get_exon_numbers(exons[0].gene_name)
        if exon_numbers.empty:
            return '', ''
        transcript = exon_numbers['transcript'].values[0]
        trx_exons = []
        for ex in exons:
            nrow = exon_numbers[exon_numbers['id'] == ex.exon_id]
            if nrow.shape[0] > 0:
                trx_exons.extend(nrow['number'].values)
        return transcript, ','.join(str(number) for number in trx_exons)

    # Annotate DataFrames
    def annotate_dataframe(self,
                           df,
                           chrom_col='CHROM',
                           start_col='START',
                           end_col='END',
                           gene_col=None,
                           tx_col=None):
        """Annotate each interval of ``df`` with genes, transcript and exons.

        Args:
            df: DataFrame holding one interval per row.
            chrom_col: Name of the contig column.
            start_col: Name of the interval start column.
            end_col: Name of the interval end column.
            gene_col: Optional column naming the expected gene. When that
                gene overlaps the interval it replaces the overlapping gene
                list; otherwise a warning is printed.
            tx_col: Forwarded to ``parse_ref_exons``; when truthy, exons are
                resolved even if several genes overlap the interval.

        Returns:
            New DataFrame indexed like ``df`` with columns ``genes``
            (comma-separated names), ``exons`` and ``transcript``. Rows with
            several overlapping genes and no ``tx_col`` get empty
            ``exons``/``transcript`` values.
        """
        genes = []
        exons = []
        transcripts = []
        for i, row in df.iterrows():
            chrom, start, end = row[chrom_col], row[start_col], row[end_col]
            genes_row = self.get_genes(chrom, start, end)
            if gene_col:
                if row[gene_col] in genes_row:
                    genes_row = [row[gene_col]]
                else:
                    print(
                        'Warning!! {} not found for {}:{}-{} in row {}'.format(
                            row[gene_col], chrom, start, end, i))
            genes.append(','.join(genes_row))
            if len(genes_row) <= 1 or tx_col:
                # genes_row can be empty here (no overlapping gene), so the
                # first gene is only picked when one exists.
                trans_row, exons_row = self.parse_ref_exons(
                    chrom,
                    start,
                    end,
                    gene=genes_row[0] if genes_row else None,
                    tx_col=tx_col
                )  # TODO - add functionality to choose gene and transcript
            else:
                trans_row = ''
                exons_row = ''
            exons.append(exons_row)
            transcripts.append(trans_row)
        new_df = pd.DataFrame(
            {
                'genes': genes,
                'exons': exons,
                'transcript': transcripts
            },
            index=df.index)
        return new_df

    def annotate_variants(self, rsid_array, extra_cols=None):
        """Get chrom:start-end and basic annotation for a list of variants.

        Args:
            rsid_array: Variant IDs sent to ``get_rsids``.
            extra_cols: Optional extra keys copied from each variant's
                response entry into same-named columns.

        Returns:
            DataFrame with columns ``chrom``, ``start``, ``end``, ``rsid``,
            ``allele``, ``vartype``, ``consequence`` plus ``extra_cols``.
            IDs missing from the response, or returned without any mapping,
            are skipped.
        """
        extra_cols = [] if extra_cols is None else list(extra_cols)
        result = {
            'chrom': [],
            'start': [],
            'end': [],
            'rsid': [],
            'allele': [],
            'vartype': [],
            'consequence': []
        }
        for extra in extra_cols:
            result[extra] = []
        response = self.get_rsids(rsid_array)
        for var in rsid_array:
            if var not in response:
                continue
            entry = response[var]
            if not entry['mappings']:
                # No genomic mapping: skip like an ID absent from the reply.
                continue
            mapping = entry['mappings'][0]
            result['chrom'].append(mapping['seq_region_name'])
            result['start'].append(mapping['start'])
            result['end'].append(mapping['end'])
            result['rsid'].append(var)
            result['allele'].append(mapping['allele_string'])
            result['vartype'].append(entry['var_class'])
            result['consequence'].append(entry['most_severe_consequence'])
            for extra in extra_cols:
                result[extra].append(entry[extra])
        return pd.DataFrame(result)

    def annotate_cds_regions(self, df, tx_col='NM', cds_col='MutationName'):
        """Add ``chrom``, ``start`` and ``end`` columns to ``df`` in place.

        Each row's ``tx_col`` and ``cds_col`` values are resolved with
        ``get_cds_region``. The same (mutated) ``df`` is returned.
        """
        locations = [
            self.get_cds_region(row[tx_col], row[cds_col])
            for _, row in df.iterrows()
        ]
        df['chrom'] = [location[0] for location in locations]
        df['start'] = [location[1] for location in locations]
        df['end'] = [location[2] for location in locations]
        return df
    # Fragment of a larger loop whose header is not visible here; `chr`,
    # `snps` and `list_index` come from that surrounding code.
    print(chr)
    for f in glob.glob(chr+'/*.switchAndError.txt'):
        with open(f) as input_file:
            # The first line is the tab-separated header; list_index
            # presumably maps each column name to its position (verify
            # against its definition).
            header_index = list_index(input_file.readline().strip().split('\t'))
            for line in input_file:
                line = line.strip().split('\t')
                # Each of these columns holds a comma-separated SNP list;
                # every non-blank entry is added to `snps` as written.
                for column in ('overlapping_snps',
                               'snps_only_chunkVCF',
                               'snps_only_refVCF'):
                    for snp in line[header_index[column]].split(','):
                        if snp.strip():
                            snps.add(snp)

# Write one tab-separated line per SNP: the SNP, the gene names at its
# position and the gene IDs of those names (both comma-separated).
with open('snp_gene_locations.txt','w') as out:
    number_of_snps = len(snps)
    for index, snp in enumerate(snps):
        # Progress report every 5000 SNPs.
        if index % 5000 == 0:
            print('{}/{}'.format(index, number_of_snps))
        # The position is the second '_'-separated field of the SNP name.
        # NOTE(review): the contig comes from `chr` as left by earlier code,
        # not from the SNP itself - confirm that is intended.
        position = int(snp.split('_')[1])
        gene_names = data.gene_names_at_locus(contig=chr.replace("chr",""), position=position)
        gene_ids = [
            gene_id
            for gene_name in gene_names
            for gene_id in data.gene_ids_of_gene_name(gene_name)
        ]
        out.write('\t'.join([snp, ','.join(gene_names), ','.join(gene_ids)]) + '\n')