Ejemplo n.º 1
0
class ScrapeEnsembl():
    ''' Look up gene and transcript information in an Ensembl release.

    The query is a locus string such as "chr17:7577121" or "17:7577121";
    a leading "chr" is removed on construction. The genome build name
    (e.g. "hg19") is translated to an Ensembl release number through the
    ``genome`` class attribute.
    '''

    # genome build name -> Ensembl release number handed to EnsemblRelease
    genome = {"hg19": 75, "hg38": 83}

    def __init__(self, query, hg_version):
        ''' Store the normalised query and open the matching Ensembl release.

        query      -- locus string; every "chr" substring is stripped
        hg_version -- key of ``ScrapeEnsembl.genome`` ("hg19" or "hg38");
                      an unknown key yields None, which is passed on to
                      EnsemblRelease unchanged
        '''
        self.query = query.replace("chr","")
        self.hg_version = ScrapeEnsembl.genome.get(hg_version) # convert to ensembl release
        self.hg = EnsemblRelease(self.hg_version) # convert to ensembl release object

    def get_gene_info(self):
        ''' Get the gene information at a given genomic position or range

        Accepts "<contig>:<position>" and "<contig>:<start>-<end>". The
        contig is kept as a string so that non-numeric contigs (X, Y, MT)
        are looked up as well.

        Returns
            (name, id, biotype, "contig:start-end") of the first gene found,
            a "No gene found ..." message string if the locus has no gene,
            None if the query is not a locus.
        '''
        locus = re.match(r"^([^:\s]+):(\d+)(?:-(\d+))?$", self.query)
        if locus is None:
            return None

        chrom, start, end = locus.groups()
        lookup = {"contig": chrom, "position": int(start)}
        # only pass ``end`` for a real range so single positions are queried
        # exactly as before
        if end is not None:
            lookup["end"] = int(end)

        gene_names = self.hg.gene_names_at_locus(**lookup)
        if not gene_names:
            msg = " ".join(("No gene found at",self.query,"for genome version",
                            str(self.hg_version)))
            return msg

        gene = self.hg.genes_by_name(gene_names[0])[0]
        # build the location from the locus attributes instead of slicing it
        # out of str(gene), whose layout is not a stable interface
        gene_location = "{}:{}-{}".format(gene.contig, gene.start, gene.end)

        return (gene.name, gene.id, gene.biotype, gene_location)

    def get_canonical_transcript(self, gene_name):
        ''' Determine and return the canonical transcript of the given gene

        "Canonical" here means the protein coding transcript with the
        largest genomic span (end - start); ties are broken by the highest
        transcript id. Returns the transcript id, or None when the gene has
        no protein coding transcript.
        '''
        protein_coding_transcripts = []
        for transcript_id in self.hg.transcript_ids_of_gene_name(gene_name):
            transcript = self.hg.transcript_by_id(transcript_id)
            if transcript.biotype == "protein_coding":
                size = transcript.end - transcript.start
                protein_coding_transcripts.append((size, transcript_id))

        if not protein_coding_transcripts:
            return None

        # tuples compare by size first, then by transcript id
        return max(protein_coding_transcripts)[1]
Ejemplo n.º 2
0
def make_the_gene_with_exon_and_map(chr,
                                    gene_name,
                                    ax,
                                    exon_lib,
                                    x,
                                    y,
                                    locus,
                                    frequency,
                                    width=30.0,
                                    height=5.0,
                                    scale=1.0):
    """Draw one gene as a horizontal bar with its exons and mapped reads.

    The gene is looked up by name in Ensembl release 75 and drawn on ``ax``
    as a rectangle ``width`` long and ``height`` tall whose left edge is at
    ``x`` and whose vertical centre is at ``y``. Exons are overlaid as darker
    rectangles, and one vertical bar is drawn above the gene for every entry
    of ``locus``.

    Args:
        chr: Label printed under the gene name (string; concatenated as is).
        gene_name: Gene name used for the Ensembl lookup and as the label.
        ax: Axes-like object providing ``add_patch``, ``plot`` and ``text``.
        exon_lib: Mapping from exon id to a pair whose items 0 and 1 are the
            exon start and end (converted with ``int``).
        x: Plot x coordinate of the gene's left edge.
        y: Plot y coordinate of the gene's vertical centre.
        locus: Genomic positions of the mapped reads.
        frequency: One frequency per entry of ``locus``; values below 0.0001
            get a fixed bar length of 1 (presumably percentages - confirm
            against the caller).
        width: Plotted length of the gene.
        height: Plotted height of the gene.
        scale: Scales the vertical margins around the gene bar.

    Returns:
        None. The function only draws on ``ax``.
    """
    genome = EnsemblRelease(75)
    gene = genome.genes_by_name(gene_name)[0]
    gene_start = gene.start
    gene_end = gene.end
    gene_length = float(gene_end - gene_start)
    # draw the gene
    ax.add_patch(
        patches.Rectangle(
            (x, y - height / 2),
            width,
            height,
            alpha=0.5,
            color="dodgerblue",
            linewidth=0,
        ))
    # plot the gene with exons
    for exon_id in genome.exon_ids_of_gene_name(gene_name):
        position = exon_lib[exon_id]
        exon_start = int(position[0])
        exon_end = int(position[1])
        # exon length and offset are scaled from genomic to plot coordinates
        plot_length = (exon_end - exon_start) / gene_length * width
        plot_x = (exon_start - gene_start) / gene_length * width
        ax.add_patch(
            patches.Rectangle(
                (x + plot_x, y - height / 2 - 0.5 * scale),
                plot_length,
                height + 1 * scale,
                color="black",
                alpha=0.4,
                linewidth=0,
            ))
    # map the reads to the plot: each read is placed at its OWN position
    # (the previous code used locus[0] for every bar)
    bar_base = y + height / 2 + scale * 0.5
    for read_position, read_frequency in zip(locus, frequency):
        plot_x = (read_position - gene_start) / gene_length * width
        if read_frequency < 0.0001:
            bar_length = 1
        else:
            bar_length = abs(math.log(read_frequency / 0.01) * 2)
        ax.plot(
            (x + plot_x, x + plot_x),
            (bar_base, bar_base + bar_length),
            color="darkblue",
            alpha=0.5,
            linewidth=2,
        )
    # gene name at the left
    ax.text(x - 0.5,
            y,
            gene_name + '\n' + chr,
            horizontalalignment='right',
            verticalalignment='center',
            fontsize=25,
            fontweight="bold")
    # gene start and gene end coordinates below the two ends of the bar;
    # the integer coordinates are converted to text explicitly
    label_y = y - height / 2 - 0.8 * scale
    for label_x, coordinate in ((x, gene_start), (x + width, gene_end)):
        ax.text(
            label_x,
            label_y,
            str(coordinate),
            horizontalalignment='center',
            verticalalignment='top',
            fontsize=12,
        )
Ejemplo n.º 3
0
def plot_the_genes_mapping(lib,
                           lib_char_y,
                           ax,
                           total_read,
                           length=50.0,
                           x_default=10.0,
                           frequency_parameter=0.05):
    """Draw every mapped gene / non-coding site as a circle on its chromosome.

    Args:
        lib: Mapping whose keys are indexable with item 0 = chromosome name
            without the "chr" prefix and item 1 = gene name (or the literal
            "non-coding"). Each value is an iterable of records whose item 1
            is a read count and item 2 a genomic position.
        lib_char_y: Mapping from "chr<name>" to the y coordinate of that
            chromosome in the plot.
        ax: Axes-like object providing ``add_patch`` and ``text``.
        total_read: Total number of reads; counts are converted to
            ``count * 100 / total_read``.
        length: Plotted length used together with the module-level
            ``chr_relative_size`` table to scale positions.
        x_default: x offset added to every plotted position.
        frequency_parameter: Reference frequency for the circle radius.
            Frequencies below it get the minimum radius 0.5, others get
            ``abs(log(frequency / frequency_parameter))``.

    Returns:
        List of ``(chr, gene_name_or_None, frequencies, position)`` tuples in
        plotting order. Named genes carry lists of frequencies and
        positions; every non-coding record becomes its own entry with a
        one-element frequency list and a single integer position.
    """
    # parse the data
    l_gene = []
    for key in lib:
        chrom = "chr" + key[0]
        gene_name = key[1]
        records = lib[key]
        if gene_name == "non-coding":
            for record in records:
                frequency = [
                    float(record[1]) * 100 / total_read,
                ]
                # converted like the coding branch below; a raw string
                # position would break the x-coordinate arithmetic later
                position = int(record[2])
                l_gene.append((chrom, None, frequency, position))
        else:
            frequency = [float(r[1]) * 100 / total_read for r in records]
            position = [int(r[2]) for r in records]
            l_gene.append((chrom, gene_name, frequency, position))

    # created on first use and shared by all genes instead of once per gene
    genome = None

    # draw the genes
    for chrom, gene_name, frequencies, position in l_gene:
        gene_y = lib_char_y[chrom]
        frequency = sum(frequencies)
        relative_size = chr_relative_size[chrom]
        chr_length = relative_size * chr_relative_size["chr1_length"]
        # find the x coordinate of the gene in the corresponding chromosome
        if gene_name is not None:
            if genome is None:
                genome = EnsemblRelease(75)
            gene = genome.genes_by_name(gene_name)[0]
            # a named gene is placed at the midpoint of its span
            position_in_chr = (gene.start + gene.end) / 2
        else:
            position_in_chr = position
        gene_x = length * relative_size * position_in_chr / chr_length

        # size of the circle stands for the frequency; the floor uses the
        # same reference value as the logarithm
        if frequency < frequency_parameter:
            radius = 0.5
        else:
            radius = abs(math.log(frequency / frequency_parameter))
        color = "deeppink" if gene_name is None else "dodgerblue"

        centre = (gene_x + x_default, gene_y)
        # gene with frequency related radius
        ax.add_patch(
            patches.Circle(
                centre,
                radius,
                color=color,
                alpha=0.5,
                linewidth=0,
            ))
        # gene position
        ax.add_patch(
            patches.Circle(
                centre,
                0.5,
                color=color,
                linewidth=0.5,
                edgecolor="black",
            ))

        if gene_name is not None:
            ax.text(
                gene_x + x_default,
                gene_y - radius - 0.8,
                gene_name,
                horizontalalignment='center',
                verticalalignment='center',
                fontsize=18,
                fontweight="bold",
            )
        # very small frequencies are printed in scientific notation
        if frequency < 0.0001:
            number = "%.2e" % float(frequency)
        else:
            number = "%.4f" % float(frequency)
        ax.text(
            gene_x + x_default,
            gene_y + radius + 0.5,
            number,
            horizontalalignment='center',
            verticalalignment='center',
            fontsize=12,
        )
    return l_gene
Ejemplo n.º 4
0
def genes_by_name(*args, **kwargs):
    """Call ``genes_by_name`` on a new ``EnsemblRelease(ENSEMBL_RELEASE_VERSION)``.

    All positional and keyword arguments are forwarded unchanged and the
    result of the call is returned as is.
    """
    return EnsemblRelease(ENSEMBL_RELEASE_VERSION).genes_by_name(*args, **kwargs)