def geneTranslation(var):
    """Translates variants to genes
	
	Arguments:
		var {list} -- List of chromosome regions
	"""
    data = pyensembl.Genome(
        reference_name='GRCh37',
        annotation_name='my_genome_features',
        gtf_path_or_url='../../data/datasets/Original/ensembl_v37.gtf')
    data.index()
    genes, variants = [], []
    bar = Bar('Processing', max=len(var), suffix='%(percent)d%%')
    for v in var:
        locus = v.split(':')
        contig, pos = locus[0][3:], int(float(locus[1]))  # strip the 'chr' prefix
        gene = data.genes_at_locus(contig, pos)
        # Keep the gene name only for protein-coding hits; otherwise record 'None'
        if len(gene) > 0 and gene[0].biotype == 'protein_coding':
            genes.append(gene[0].gene_name)
        else:
            genes.append('None')
        variants.append(v)
        bar.next()
    bar.finish()
    result = zip(genes, variants)  # Pairs each gene with its variant region
    print('>>> Writing csv...')
    with open('../../data/genes/geneList.csv', mode='w', newline='') as myfile:
        wr = csv.writer(myfile)
        wr.writerow(("Genes", "Variants"))
        for row in result:
            wr.writerow(list(row))
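A minimal usage sketch for the function above (not part of the original snippet), assuming csv, pyensembl and progress.bar.Bar are imported and the hard-coded GTF/output paths exist; the region strings are illustrative:

# Illustrative call: region strings follow the 'chr<contig>:<position>' form
# that geneTranslation() parses.
variant_regions = ['chr7:140453136', 'chr17:7577120']
geneTranslation(variant_regions)  # writes ../../data/genes/geneList.csv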
Example #2
def load_ensembl_gene_ids(mouse_gtf=None):
    data = pyensembl.Genome(
        reference_name='GRCm38',
        gtf_path_or_url=
        'ftp://ftp.ensembl.org/pub/release-81/gtf/mus_musculus/Mus_musculus.GRCm38.81.gtf.gz',
        transcript_fasta_path_or_url=
        'ftp://ftp.ensembl.org/pub/release-81/fasta/mus_musculus/cdna/Mus_musculus.GRCm38.cdna.all.fa.gz',
        protein_fasta_path_or_url=
        'ftp://ftp.ensembl.org/pub/release-81/fasta/mus_musculus/pep/Mus_musculus.GRCm38.pep.all.fa.gz'
    )
    return data
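A possible usage sketch (assumed, not shown in the original example); the returned Genome still has to be downloaded and indexed before it can be queried:

mouse = load_ensembl_gene_ids()
mouse.download()  # fetch the remote GTF/FASTA files if they are not cached yet
mouse.index()     # build the local database so lookups work
print(len(mouse.gene_ids()))  # number of annotated mouse genes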
Example #3
def main():
    sys.stderr = open(snakemake.log[0], "w")
    data = pyensembl.Genome(
        reference_name='GRCm38',
        annotation_name='mus_musculus',
        gtf_path_or_url=snakemake.params.gtf)
    data.index()

    df_cts = annotate(snakemake.input.counts, data)
    df_dge = annotate(snakemake.input.dge, data)
    df = df_dge.merge(df_cts, how='inner', left_index=True, right_index=True)
    df = df.sort_values(by='padj')

    df.to_csv(snakemake.output.table, sep='\t')
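The annotate helper is not included in this snippet; a minimal sketch of what it could look like, assuming the count and DGE tables are TSV files indexed by Ensembl gene ID:

import pandas as pd

def annotate(path, genome):
    """Hypothetical helper: map each Ensembl gene ID in the index to a symbol."""
    df = pd.read_csv(path, sep='\t', index_col=0)

    def symbol(gene_id):
        try:
            return genome.gene_name_of_gene_id(gene_id)
        except ValueError:
            return gene_id  # fall back to the raw ID if it is not annotated

    df['gene_symbol'] = [symbol(g) for g in df.index]
    return df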
Example #4
def get_genome(reference_name,
               gtf,
               transcript_fasta=None,
               logging_args=None,
               annotation_name='ensembl',
               **kwargs):
    """ Retrieve the pyensembl annotations associated with the given reference.

    The script also creates the database (with the "index" method) if it does
    not already exist.

    This function is largely a wrapper around the pyensembl.Genome constructor,
    so please see it for more details.

    Parameters
    ----------
    reference_name: string
        The identifier for the reference

    gtf: string
        The path to the GTF annotation file

    transcript_fasta: string (optional)
        The path to the fasta file with transcript sequences. The fasta keys
        must match the transcript_id in the GTF file

    logging_args: argparse.Namespace
        pyensembl sometimes changes several logging levels while opening the
        annotation database. If the logging arguments are given, they will be
        restored after opening the database.

    annotation_name, kwargs:
        Other options to pass to the pyensembl constructor
    """
    ensembl = pyensembl.Genome(reference_name=reference_name,
                               gtf_path_or_url=gtf,
                               transcript_fasta_paths_or_urls=transcript_fasta,
                               annotation_name=annotation_name,
                               **kwargs)

    # this will create the database if needed
    ensembl.index()

    if logging_args is not None:
        logging_utils.update_logging(logging_args)

    return ensembl
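A hedged usage sketch of the wrapper above; the reference name and GTF path are placeholders:

ensembl = get_genome('GRCh37', '/path/to/annotation.gtf')
print(ensembl.gene_names_at_locus(contig='1', position=1000000))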
Example #5
def main(tsv_path):
    """
    annotate.py

    Manually annotate a file with gene symbols looked up from Ensembl gene IDs.
    :param tsv_path: Path for TSV to annotate
    """
    data = pyensembl.Genome(reference_name='GRCm38',
                            annotation_name='mus_musculus',
                            gtf_path_or_url='resources/genome.gtf')
    data.index()

    df = annotate(tsv_path, data)

    parent_dir = os.path.dirname(tsv_path)
    name, ext = os.path.splitext(os.path.basename(tsv_path))
    out_path = os.path.join(parent_dir, "{}_anno{}".format(name, ext))

    df.to_csv(out_path, sep='\t')
Example #6
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Given a meme motif file, extract the gene names and map "
        "them to ensembl identifiers using pyensembl. The pyensembl database "
        "information can be given either in a yaml config file or as command "
        "line options. The yaml config file values have precedence over the "
        "command line options.")

    parser.add_argument('meme', help="The meme file")
    parser.add_argument('out', help="The output file")

    parser.add_argument(
        '-c',
        '--config',
        help="The yaml config file. If "
        "given, this should include keys 'genome_name' and 'gtf'. Otherwise, "
        "they may be specified using the respective command line options.",
        default=None)

    parser.add_argument('-n',
                        '--genome-name',
                        help="The genome_parameter for "
                        "retrieving the pyensembl database",
                        default=None)
    parser.add_argument('-g',
                        '--gtf',
                        help="The gtf file for pyensembl",
                        default=None)

    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    # if the config file was given, use any values in it to replace those
    # passed on the command line
    if args.config is not None:
        msg = "Reading config file"
        logger.info(msg)
        config = yaml.safe_load(open(args.config))

        args.genome_name = config.get('genome_name', args.genome_name)
        args.gtf = config.get('gtf', args.gtf)

    msg = "genome_name: {}".format(args.genome_name)
    logger.debug(msg)

    msg = "gtf: {}".format(args.gtf)
    logger.debug(msg)

    msg = "Loading pyensembl database"
    logger.info(msg)

    ensembl = pyensembl.Genome(reference_name=args.genome_name,
                               annotation_name="ensembl",
                               gtf_path_or_url=args.gtf)

    # this will create the database if needed
    ensembl.index()

    msg = "Parsing motif gene names"
    logger.info(msg)

    # a line from CISBP looks like:
    #   MOTIF M002_0.6 (Ankhd1)_(Homo_sapiens)_(RBD_1.00)

    all_motifs = []

    motif_re = ("\((?P<gene_name>[^\)]+)\)_\((?P<species>[^\)]+)\)_"
                "\((?P<rbd_score>[^\)]+)\)")
    motif_re = re.compile(motif_re)

    with open(args.meme) as meme_f:
        for line in meme_f:
            if line.startswith("MOTIF"):
                (key, motif_name, info) = line.split()

                m = motif_re.match(info)

                if m is None:
                    msg = ("Could not parse gene name. Guessing the entire "
                           "string is the gene name: '{}'.".format(info))
                    logger.warning(msg)
                    gene_name = info
                else:
                    gene_name = m.group("gene_name")

                try:
                    ensembl_ids = ensembl.gene_ids_of_gene_name(gene_name)
                except ValueError:
                    msg = ("Could not find Ensembl identifier for gene_name: "
                           "'{}'".format(gene_name))
                    logger.warning(msg)
                    ensembl_ids = [gene_name]

                for ensembl_id in ensembl_ids:
                    motif = {
                        "motif_name": motif_name,
                        "gene_name": gene_name,
                        "ensembl_id": ensembl_id
                    }

                    all_motifs.append(motif)

    msg = "Joining motif gene names into large data frame"
    logger.info(msg)
    all_motifs_df = pd.DataFrame(all_motifs)

    msg = "Writing motifs to disk"
    logger.info(msg)
    utils.write_df(all_motifs_df, args.out, index=False)
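For reference, the script only reads two keys from the config file, so a minimal config.yaml would contain just genome_name: GRCm38 and gtf: /path/to/annotation.gtf (placeholder values shown for illustration).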
Example #7
from __future__ import print_function
import sys
import pyensembl


myfile = open(sys.argv[1])
gtffile = sys.argv[2]
outfile = open(sys.argv[1][:-4] + '_geneanno.sam', 'w')
data = pyensembl.Genome(reference_name='GRCh37',
                        annotation_name='my_genome_features',
                        gtf_path_or_url=gtffile)
data.index()
with myfile:
    for line in myfile:
        if line[0] == '@':
            continue
        info = line.split('\t')
        chr = info[2]
        pos = int(info[3])
        nameresult = data.gene_names_at_locus(contig=chr, position=pos)
        k = 20
        if nameresult == []:
            nameresult = data.gene_names_at_locus(contig=chr, position=pos+k)
        if nameresult == []:
            nameresult = data.gene_names_at_locus(contig=chr, position=pos - k)
        genename = ';'.join(nameresult)
        outfile.write(genename + '\t' + line)
outfile.close()
Example #8
"""
Plot location of Ccl3 and Ccl4
"""
mouse_genome = pyensembl.Genome(
    reference_name='NCBIM37',
    gtf_path_or_url=
    'ftp://ftp.ensembl.org/pub/release-67/gtf/mus_musculus/Mus_musculus.NCBIM37.67.gtf.gz',
    transcript_fasta_path_or_url=
    'ftp://ftp.ensembl.org/pub/release-67/fasta/mus_musculus/cdna/Mus_musculus.NCBIM37.67.cdna.all.fa.gz',
    protein_fasta_path_or_url=
    'ftp://ftp.ensembl.org/pub/release-67/fasta/mus_musculus/pep/Mus_musculus.NCBIM37.67.pep.all.fa.gz'
)
list_of_genes = ['Ccl3', 'Ccl4']
transcript_dict = {}
gene_object_dict = {}

for gene in list_of_genes:
    transcript_dict[gene] = mouse_genome.transcript_ids_of_gene_name(gene)

for gene in list_of_genes:
    gene_object_dict[gene] = mouse_genome.genes_by_name(gene)[0]

# Gene locations
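A sketch of the location lookup hinted at by the comment above (not in the original source); pyensembl Gene objects expose contig, start, end and strand attributes:

for gene, gene_obj in gene_object_dict.items():
    print(gene, gene_obj.contig, gene_obj.start, gene_obj.end, gene_obj.strand)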
Example #9
def count_rsem_files(direc_name,
                     bammed_direc='aligned_star',
                     quant_direc='quant_rsem',
                     counted_direc='counted_rsem',
                     spikeids=[]):
    quant_path = os.path.join(direc_name, quant_direc)
    counted_path = os.path.join(direc_name, counted_direc)
    bammed_path = os.path.join(direc_name, bammed_direc)

    file_list = os.listdir(quant_path)

    mouse_genome = pyensembl.Genome(
        reference_name='GRCm38',
        gtf_path_or_url=
        'ftp://ftp.ensembl.org/pub/release-81/gtf/mus_musculus/Mus_musculus.GRCm38.81.gtf.gz',
        transcript_fasta_path_or_url=
        'ftp://ftp.ensembl.org/pub/release-81/fasta/mus_musculus/cdna/Mus_musculus.GRCm38.cdna.all.fa.gz',
        protein_fasta_path_or_url=
        'ftp://ftp.ensembl.org/pub/release-81/fasta/mus_musculus/pep/Mus_musculus.GRCm38.pep.all.fa.gz'
    )

    for seq_file_temp in file_list:
        if fnmatch.fnmatch(seq_file_temp, r'*.isoforms.results'):
            seq_file = seq_file_temp

    gene_names = mouse_genome.gene_names()

    transcript_dict = {}
    counter = 0
    for gene in gene_names:
        transcript_dict[gene] = mouse_genome.transcript_ids_of_gene_name(gene)
        counter += 1
        print(counter)

    for seq_file in file_list:
        print(seq_file, fnmatch.fnmatch(seq_file, r'*.isoforms.results'))
        if fnmatch.fnmatch(seq_file, r'*.isoforms.results'):
            bfs = seq_file.split('.')
            bam_file_sorted_by_loc = bfs[0] + '.transcript.sorted.bam'
            num_mapped, num_unmapped = bam_read_count(
                os.path.join(bammed_path, bam_file_sorted_by_loc))
            transcripts, spikeins = load_sequence_counts_rsem(
                rsem_file=os.path.join(quant_path, seq_file),
                spikeids=spikeids)

            filename_save = os.path.join(counted_path, seq_file + '.h5')
            print('Saving ' + filename_save)

            store = pd.HDFStore(filename_save)

            d = {'num_mapped': num_mapped, 'num_unmapped': num_unmapped}
            quality_control = pd.DataFrame(d, index=[0])

            # Create a transcript table indexed by gene name - sum over all isoforms
            counts_list = []
            fpkm_list = []
            tpm_list = []
            for gene in gene_names:
                transcript_list = transcript_dict[gene]
                counts = 0
                fpkm = 0
                tpm = 0
                for transcript in transcript_list:
                    if transcript in transcripts.index:
                        counts += transcripts.loc[transcript]['est_counts']
                        fpkm += transcripts.loc[transcript]['fpkm']
                        tpm += transcripts.loc[transcript]['tpm']
                counts_list += [counts]
                fpkm_list += [fpkm]
                tpm_list += [tpm]

            d = {
                'gene_name': gene_names,
                'est_counts': counts_list,
                'fpkm': fpkm_list,
                'tpm': tpm_list
            }
            gene_counts = pd.DataFrame(d)
            gene_counts.set_index('gene_name', inplace=True)

            # Store data frames in HDF5 format
            store['quality_control'] = quality_control
            store['transcripts'] = transcripts
            store['gene_counts'] = gene_counts
            store['spikeins'] = spikeins
            store.close()
Example #10
def count_hdf5_files(direc_name,
                     bammed_direc='bammed',
                     aligned_direc='aligned',
                     counted_direc='counted',
                     spikeids=[]):
    bammed_path = os.path.join(direc_name, bammed_direc)
    aligned_path = os.path.join(direc_name, aligned_direc)
    counted_path = os.path.join(direc_name, counted_direc)

    file_list = os.listdir(aligned_path)

    mouse_genome = pyensembl.Genome(
        reference_name='GRCm38',
        gtf_path_or_url=
        'ftp://ftp.ensembl.org/pub/release-81/gtf/mus_musculus/Mus_musculus.GRCm38.81.gtf.gz',
        transcript_fasta_path_or_url=
        'ftp://ftp.ensembl.org/pub/release-81/fasta/mus_musculus/cdna/Mus_musculus.GRCm38.cdna.all.fa.gz',
        protein_fasta_path_or_url=
        'ftp://ftp.ensembl.org/pub/release-81/fasta/mus_musculus/pep/Mus_musculus.GRCm38.pep.all.fa.gz'
    )

    for seq_file_temp in file_list:
        if fnmatch.fnmatch(seq_file_temp, r'*.h5'):
            seq_file = seq_file_temp
    transcripts, spikeins = load_sequence_counts_kallisto(
        h5file_name=os.path.join(aligned_path, seq_file), spikeids=spikeids)

    transcript_names = transcripts.index
    gene_names = []
    counter = 0
    for transcript in transcript_names:
        counter += 1
        print(counter)
        gene_names += [mouse_genome.gene_name_of_transcript_id(transcript)]
    unique_gene_names = list(set(gene_names))

    transcript_dict = {}
    for gene in unique_gene_names:
        transcript_dict[gene] = []

    for transcript in transcript_names:
        gene_name = mouse_genome.gene_name_of_transcript_id(transcript)
        transcript_dict[gene_name] += [transcript]

    for seq_file in file_list:
        print(seq_file, fnmatch.fnmatch(seq_file, r'*.h5'))
        if fnmatch.fnmatch(seq_file, r'*.h5'):
            bfs = seq_file.split('.')
            bam_file_sorted_by_loc = bfs[0] + '_sorted_by_location.bam'
            num_mapped, num_unmapped = bam_read_count(
                os.path.join(bammed_path, bam_file_sorted_by_loc))
            transcripts, spikeins = load_sequence_counts_kallisto(
                h5file_name=os.path.join(aligned_path, seq_file),
                spikeids=spikeids)

            filename_save = os.path.join(counted_path, seq_file[:-3] + '.h5')
            print('Saving ' + filename_save)

            store = pd.HDFStore(filename_save)

            d = {'num_mapped': num_mapped, 'num_unmapped': num_unmapped}
            quality_control = pd.DataFrame(d, index=[0])

            # Create a transcript table indexed by gene name
            counts_list = []
            fpkm_list = []
            tpm_list = []
            length_list = []
            for gene in unique_gene_names:
                transcript_list = transcript_dict[gene]
                counts = 0
                fpkm = 0
                tpm = 0
                mean_eff_length = 0
                for transcript in transcript_list:
                    counts += transcripts.loc[transcript]['est_counts']
                    fpkm += transcripts.loc[transcript]['fpkm']
                    tpm += transcripts.loc[transcript]['tpm']
                    mean_eff_length += transcripts.loc[transcript][
                        'eff_length'] / len(transcript_list)
                counts_list += [counts]
                fpkm_list += [fpkm]
                tpm_list += [tpm]
                length_list += [mean_eff_length]

            d = {
                'gene_name': unique_gene_names,
                'est_counts': counts_list,
                'fpkm': fpkm_list,
                'tpm': tpm_list,
                'mean_eff_length': length_list
            }
            gene_counts = pd.DataFrame(d)
            gene_counts.set_index('gene_name', inplace=True)

            # Store data frames in HDF5 format
            store['quality_control'] = quality_control
            store['transcripts'] = transcripts
            store['gene_counts'] = gene_counts
            store['spikeins'] = spikeins
            store.close()