def fetch_rRNA_genes_from_biomart(org,host='www.ensembl.org'):
    '''
    Fetch the symbols of all rRNA genes of an organism from BioMart.

    The query asks for the organism specific gene symbol and the chromosome
    name of every gene whose biotype is "rRNA".

    :param org: Organism to query, either 'hsapiens' or 'mmusculus'
    :param host: BioMart host to query, default 'www.ensembl.org'
    :returns: The index of gene symbols (named 'symbol') with repeated
              symbols removed, keeping the first occurrence
    :raises ValueError: If the organism is not supported
    '''
    # (dataset name, gene symbol attribute) for every supported organism
    org_settings = {
        'hsapiens': ('hsapiens_gene_ensembl', 'hgnc_symbol'),
        'mmusculus': ('mmusculus_gene_ensembl', 'mgi_symbol'),
    }
    # validate before the BioMart service is created
    if org not in org_settings:
        raise ValueError('Organism {0} not supported for biomart lookup'.\
                         format(org))
    dataset, symbol_attribute = org_settings[org]

    s = biomart.BioMart(host=host)
    s.new_query()
    s.add_dataset_to_xml(dataset)
    s.add_attribute_to_xml(symbol_attribute)                                    # fetch gene names
    s.add_filter_to_xml("biotype", "rRNA")                                      # filter query for rRNA genes
    s.add_attribute_to_xml('chromosome_name')                                   # fetch chr names
    xml = s.get_xml()                                                           # fetch xml

    # the response is parsed as tab separated text without a header row
    res = pd.read_csv(StringIO(s.query(xml)), sep='\t', header=None)
    res.columns = ['symbol', 'chromosome_name']                                 # reformat dataframe
    res.set_index('symbol', inplace=True)
    res = res[~res.index.duplicated(keep='first')]                              # one entry per symbol
    return res.index
Exemple #2
0
def biomart_query_all(verbose=False, extra_fields=None, force_download=False):
    """
    Pull down all entries from BioMart for human: symbol, transcript, gene,
    length, type.

    The post-processed table is written to ``biomart_all.csv.gz`` next to this
    module and reused on later calls. The cached file is only used when it
    contains every name requested via `extra_fields`; otherwise the data is
    downloaded again so that the requested fields are actually returned.

    :param verbose: print the query XML before it is sent
    :param extra_fields: optional additional BioMart attribute names to fetch
        on top of the default fields
    :param force_download: ignore an existing cache file and query BioMart
    :return: the result of `_biomart_df_postprocess` applied to the table
    """

    THE_FILE = pathlib.Path(__file__).parent / 'biomart_all.csv.gz'
    extra_fields = list(extra_fields) if extra_fields else []

    if not force_download and THE_FILE.exists():
        cached = pd.read_csv(THE_FILE, index_col=0)
        cached_names = set(cached.columns) | {cached.index.name}
        if all(f in cached_names for f in extra_fields):
            # NOTE(review): the file was written after post-processing and is
            # post-processed again here, as before -- confirm that
            # `_biomart_df_postprocess` is safe to apply twice.
            return _biomart_df_postprocess(cached)
        # the cache lacks at least one requested field: fall through and
        # download instead of silently returning a table without it

    s = biomart.BioMart(host=HOST)
    s.new_query()
    s.add_dataset_to_xml('hsapiens_gene_ensembl')

    # what we want to get back
    fields = [
        'hgnc_symbol',
        'ensembl_gene_id',
        'ensembl_gene_id_version',
        'transcript_length',
        'ensembl_transcript_id',
        'ensembl_transcript_id_version',
        'transcript_biotype',
        'chromosome_name',
        'start_position',
        'end_position',
        'external_synonym',
    ] + extra_fields

    for f in fields:
        s.add_attribute_to_xml(f)

    xml = s.get_xml()

    if verbose:
        print(xml)

    res = s.query(xml)

    # the response is parsed as tab separated text without a header row, so
    # the column names are taken from the requested fields (same order)
    df = pd.read_csv(io.StringIO(res), sep='\t', header=None)
    df.columns = fields
    df = df.drop_duplicates()

    df = _biomart_df_postprocess(df)

    # refresh the cache for later calls
    df.to_csv(THE_FILE)

    return df
Exemple #3
0
def gene_coordinates(host, org, gene, chr_exclude=None) -> pd.DataFrame:
    """Retrieve gene coordinates for specific organism through BioMart.

    Parameters
    ----------
    host :
        A valid BioMart host URL, e.g. 'www.ensembl.org'. Can be used to
        control genome build.
    org :
        Organism to query. Currently available are human ('hsapiens'), mouse
        ('mmusculus') and zebrafish ('drerio').
    gene :
        The gene symbol (e.g. 'hgnc_symbol' for human) for which to retrieve
        coordinates.
    chr_exclude :
        A list of chromosomes to exclude from query. `None` (the default)
        excludes nothing.

    Returns
    -------
    A `pd.DataFrame` indexed by 'symbol' with the columns 'chromosome_name',
    'start' and 'end', restricted to the rows of the specified gene symbol.
    `None` is returned (after a debug log message) if `org` is not one of the
    available organisms.

    Raises
    ------
    ImportError
        If the `bioservices` module is not installed.
    """
    try:
        from bioservices import biomart
    except ImportError:
        raise ImportError(
            'You need to install the `bioservices` module.')
    from io import StringIO

    # a fresh list per call instead of a shared mutable default argument
    if chr_exclude is None:
        chr_exclude = []

    # (dataset name, gene symbol attribute) for every available organism
    org_settings = {
        'hsapiens': ('hsapiens_gene_ensembl', 'hgnc_symbol'),
        'mmusculus': ('mmusculus_gene_ensembl', 'mgi_symbol'),
        'drerio': ('drerio_gene_ensembl', 'zfin_id_symbol'),
    }

    s = biomart.BioMart(host=host)

    # building query
    s.new_query()
    settings = org_settings.get(org)
    if settings is None:
        logg.debug('organism ', str(org), ' is unavailable', no_indent=True)
        return None
    dataset, symbol_attribute = settings
    s.add_dataset_to_xml(dataset)
    s.add_attribute_to_xml(symbol_attribute)
    s.add_attribute_to_xml('chromosome_name')
    s.add_attribute_to_xml('start_position')
    s.add_attribute_to_xml('end_position')
    xml = s.get_xml()

    # parsing gene coordinates: tab separated text without a header row,
    # columns in the order the attributes were requested
    res = pd.read_csv(StringIO(s.query(xml)), sep='\t', header=None)
    res.columns = ['symbol', 'chromosome_name', 'start', 'end']
    res = res.dropna()
    res = res[~res['chromosome_name'].isin(chr_exclude)]
    res = res.set_index('symbol')

    # list indexer: the result is always a DataFrame, one row per match
    return res.loc[[gene], :]
Exemple #4
0
def mitochondrial_genes(host, org) -> pd.Index:
    """Look up the mitochondrial gene symbols of an organism via BioMart.

    Parameters
    ----------
    host :
        A valid BioMart host URL, e.g. 'www.ensembl.org'.
    org :
        Organism to query. Currently available are human ('hsapiens'), mouse
        ('mmusculus') and zebrafish ('drerio').

    Returns
    -------
    A :class:`pandas.Index` named 'symbol' holding the symbols of all genes
    whose chromosome name is 'MT', each symbol listed once. `None` is
    returned (after a debug log message) for any other organism.
    """
    try:
        from bioservices import biomart
    except ImportError:
        raise ImportError(
            'You need to install the `bioservices` module.')
    from io import StringIO

    # (organism, dataset name, gene symbol attribute)
    available = (
        ('hsapiens', 'hsapiens_gene_ensembl', 'hgnc_symbol'),
        ('mmusculus', 'mmusculus_gene_ensembl', 'mgi_symbol'),
        ('drerio', 'drerio_gene_ensembl', 'zfin_id_symbol'),
    )

    service = biomart.BioMart(host=host)

    # assemble the query
    service.new_query()
    for name, dataset, symbol_attribute in available:
        if org == name:
            service.add_dataset_to_xml(dataset)
            service.add_attribute_to_xml(symbol_attribute)
            break
    else:
        # no entry matched the requested organism
        logg.debug('organism ', str(org), ' is unavailable', no_indent=True)
        return None
    service.add_attribute_to_xml('chromosome_name')
    query_xml = service.get_xml()

    # read the tab separated response (no header row) and keep 'MT' rows
    table = pd.read_csv(StringIO(service.query(query_xml)),
                        sep='\t', header=None)
    table.columns = ['symbol', 'chromosome_name']
    complete = table.dropna()
    on_mt = complete[complete['chromosome_name'] == 'MT']
    symbols = on_mt.set_index('symbol').index

    # first occurrence wins when a symbol is listed more than once
    return symbols[~symbols.duplicated(keep='first')]
Exemple #5
0
def biomart_query_new(attributes: list,
                      filter: str,
                      filter_values: list,
                      batchsize=10000,
                      verbose=False):
    """
    Query BioMart similar to the R function getBM().

    The filter values are sent in batches of `batchsize`; the per-batch
    results are concatenated and de-duplicated.

    :param attributes: which attributes to retrieve from biomart
    :param filter: field which to filter on
    :param filter_values: return only entries whose `filter` field is one of
        these values
    :param batchsize: maximum number of filter values sent per query
    :param verbose: print the XML of every query before it is sent
    :return: DataFrame with one column per requested attribute; empty (but
        with those columns) if there is nothing to query or nothing matched
    """
    # Nothing to ask for: pd.concat would fail on an empty list of results.
    # len() rather than truthiness so array-like inputs work as well.
    if len(filter_values) == 0:
        return pd.DataFrame(columns=attributes)

    s = biomart.BioMart(host=HOST)

    batch_results = []
    n_batches = int(np.ceil(len(filter_values) / batchsize))
    for id_batch in tqdm.tqdm(batch(filter_values, batchsize=batchsize),
                              total=n_batches):

        s.new_query()
        s.add_dataset_to_xml('hsapiens_gene_ensembl')

        # what we want to get back
        for a in attributes:
            s.add_attribute_to_xml(a)

        # the query should be comma separated
        # better make sure there is no whitespace
        query = ",".join(value.strip() for value in id_batch)

        s.add_filter_to_xml(filter, query)
        xml = s.get_xml()

        if verbose:
            print(xml)

        res = s.query(xml)

        # an empty response cannot be parsed by read_csv (EmptyDataError);
        # treat it as "no hits in this batch"
        if not res.strip():
            continue

        df = pd.read_csv(io.StringIO(res), sep='\t', header=None)
        df.columns = attributes
        df = df.drop_duplicates()
        batch_results.append(df)

    if not batch_results:
        return pd.DataFrame(columns=attributes)

    # the same row can come back from more than one batch
    return pd.concat(batch_results, axis=0).drop_duplicates()
Exemple #6
0
 def __init__(self, host='www.ensembl.org'):
     """Create the BioMart service object and keep it on the instance.

     :param host: BioMart host handed to ``biomart.BioMart``
     """
     service = biomart.BioMart(host=host)
     self.s = service
Exemple #7
0
def print_attributes():
    """
    List the names of all attributes BioMart reports for the
    'hsapiens_gene_ensembl' dataset.

    Despite the function name nothing is printed; the names are returned
    as a list.
    """
    service = biomart.BioMart(host=HOST)
    attribute_map = service.attributes('hsapiens_gene_ensembl')
    return list(attribute_map.keys())
Exemple #8
0
import sys
from Bio import SeqIO

# Command line arguments used below:
#   sys.argv[1]  path of the gene annotation table (CSV)
#   sys.argv[2]  number of genes to be analyzed at a time
# NOTE(review): `pd` and `biomart` are used below but are not imported in the
# visible import lines -- confirm they are imported elsewhere.

# Load annotation and index it by the 'acc' column
gene_table_file = sys.argv[1]
gene_table = pd.read_csv(gene_table_file)
gene_table.set_index('acc', inplace=True)
# Define column data types: chromosome names as strings, the rest as integers
gene_table['chromosome'] = gene_table['chromosome'].astype(str)
for col in ['start', 'end', 'length', 'strand']:
    gene_table[col] = gene_table[col].astype(int)

# Number of genes to be analyzed at a time
chunk_size = int(sys.argv[2])
# Setup server: the host is assigned after the service object is created
b = biomart.BioMart()
b.host = 'www.ensembl.org'

# Datasets of the "ENSEMBL_MART_ENSEMBL" mart (not used in the lines visible
# here)
datasets = b.datasets("ENSEMBL_MART_ENSEMBL")

# Set sequence file name
seq_file = 'all_seqs.fa'
# Append mode keeps the existing content and creates the file when it does not
# exist yet, so the parse below also works on a first run.
# NOTE(review): the handle is not closed in the visible part of the script --
# confirm it is closed further down.
fileO = open(seq_file, 'a')
# Record downloaded sequences: the third '|'-separated field of each FASTA
# record name (presumably the 'acc' identifier used as table index -- TODO
# confirm against the code that writes the records)
downloaded = [
    gene.name.split("|")[2] for gene in SeqIO.parse(seq_file, 'fasta')
]

# Filter table so that only non downloaded genes are processed
gene_table = gene_table.loc[~(gene_table.index.isin(downloaded))]
# Group table by species and chromosome