def fetch_rRNA_genes_from_biomart(org, host='www.ensembl.org'):
    '''
    Fetch the gene symbols of all rRNA genes of an organism from BioMart.

    :param org: Organism to query, either 'hsapiens' or 'mmusculus'
    :param host: BioMart host to query, default 'www.ensembl.org'
    :returns: A pandas Index named 'symbol' holding the gene symbols, with
              repeated symbols removed (first occurrence kept)
    :raises ValueError: If org is not one of the supported organisms
    '''
    # Validate before anything touches the network
    if org not in ['hsapiens', 'mmusculus']:
        raise ValueError(
            'Organism {0} not supported for biomart lookup'.format(org))

    if org == 'hsapiens':
        dataset = 'hsapiens_gene_ensembl'
        symbol_attribute = 'hgnc_symbol'        # human gene names
    else:
        # only 'mmusculus' can reach this branch after the check above
        dataset = 'mmusculus_gene_ensembl'
        symbol_attribute = 'mgi_symbol'         # mouse gene names

    s = biomart.BioMart(host=host)
    s.new_query()
    s.add_dataset_to_xml(dataset)
    s.add_attribute_to_xml(symbol_attribute)
    s.add_filter_to_xml("biotype", "rRNA")      # restrict query to rRNA genes
    s.add_attribute_to_xml('chromosome_name')   # fetch chr names
    xml = s.get_xml()

    # The response is parsed as header-less, tab-separated text; the column
    # order follows the order in which the attributes were added above.
    res = pd.read_csv(StringIO(s.query(xml)), sep='\t', header=None)
    res.columns = ['symbol', 'chromosome_name']
    res.set_index('symbol', inplace=True)
    # NOTE(review): rows without a symbol are not dropped here, so the index
    # may contain one missing value - confirm whether callers expect that.
    res = res[~res.index.duplicated(keep='first')]
    return res.index
def biomart_query_all(verbose=False, extra_fields=None, force_download=False):
    """
    pulls down all entries from BIOMART for Human:
    symbol, transcript, gene, length, type

    The result is cached in 'biomart_all.csv.gz' next to this module. When
    that file exists and `force_download` is False, the cached table is read
    and returned without contacting BioMart.

    NOTE: the cache check does not look at `extra_fields`; a cached table is
    returned as-is even if extra fields are requested, and a fresh download
    overwrites the cache with whatever fields were requested.

    :param verbose: print the XML of the query before sending it
    :param extra_fields: additional BioMart attributes to retrieve
    :param force_download: ignore an existing cache file and query BioMart
    :return: the table after `_biomart_df_postprocess` has been applied
    """
    cache_path = pathlib.Path(__file__).parent / 'biomart_all.csv.gz'

    cache_usable = (not force_download) and os.path.exists(cache_path)
    if cache_usable:
        cached = pd.read_csv(cache_path, index_col=0)
        return _biomart_df_postprocess(cached)

    server = biomart.BioMart(host=HOST)
    server.new_query()
    server.add_dataset_to_xml('hsapiens_gene_ensembl')

    # attributes requested from BioMart; they double as the column names
    fields = [
        'hgnc_symbol',
        'ensembl_gene_id',
        'ensembl_gene_id_version',
        'transcript_length',
        'ensembl_transcript_id',
        'ensembl_transcript_id_version',
        'transcript_biotype',
        'chromosome_name',
        'start_position',
        'end_position',
        'external_synonym',
    ]
    if extra_fields:
        fields += extra_fields
    for attribute in fields:
        server.add_attribute_to_xml(attribute)

    xml = server.get_xml()
    if verbose:
        print(xml)

    # the response is parsed as header-less, tab-separated text
    response = server.query(xml)
    table = pd.read_csv(io.StringIO(response), sep='\t', header=None)
    table.columns = fields
    table = _biomart_df_postprocess(table.drop_duplicates())

    # store the postprocessed table for later calls
    table.to_csv(cache_path)
    return table
def gene_coordinates(host, org, gene, chr_exclude=()) -> pd.DataFrame:
    """Retrieve gene coordinates for specific organism through BioMart.

    Parameters
    ----------
    host
        A valid BioMart host URL, e.g. 'www.ensembl.org'. Can be used to
        control genome build.
    org
        Organism to query. Currently available are human ('hsapiens'),
        mouse ('mmusculus') and zebrafish ('drerio').
    gene
        The gene symbol (e.g. 'hgnc_symbol' for human) for which to
        retrieve coordinates.
    chr_exclude
        A list-like of chromosome names to exclude from the result.
        Defaults to excluding nothing.

    Returns
    -------
    A `pd.DataFrame` indexed by 'symbol' with the columns
    'chromosome_name', 'start' and 'end', restricted to the rows of `gene`.
    `None` is returned (after a debug log message) if `org` is not one of
    the available organisms.

    Raises
    ------
    ImportError
        If the `bioservices` module is not installed.
    KeyError
        From the final label lookup when `gene` is not among the retrieved
        symbols (behavior of current pandas versions).
    """
    try:
        from bioservices import biomart
    except ImportError:
        raise ImportError(
            'You need to install the `bioservices` module.')
    from io import StringIO

    s = biomart.BioMart(host=host)

    # building query
    s.new_query()
    if org == 'hsapiens':
        dataset, symbol_attribute = 'hsapiens_gene_ensembl', 'hgnc_symbol'
    elif org == 'mmusculus':
        dataset, symbol_attribute = 'mmusculus_gene_ensembl', 'mgi_symbol'
    elif org == 'drerio':
        dataset, symbol_attribute = 'drerio_gene_ensembl', 'zfin_id_symbol'
    else:
        logg.debug('organism ', str(org), ' is unavailable', no_indent=True)
        return None
    s.add_dataset_to_xml(dataset)
    # the attribute order fixes the column order of the response
    for attribute in (symbol_attribute, 'chromosome_name',
                      'start_position', 'end_position'):
        s.add_attribute_to_xml(attribute)
    xml = s.get_xml()

    # parsing gene coordinates (header-less, tab-separated response)
    res = pd.read_csv(StringIO(s.query(xml)), sep='\t', header=None)
    res.columns = ['symbol', 'chromosome_name', 'start', 'end']
    res = res.dropna()
    res = res[~res['chromosome_name'].isin(chr_exclude)]
    res = res.set_index('symbol')

    # a list selector keeps the result a DataFrame even for a single row
    return res.loc[[gene], :]
def mitochondrial_genes(host, org) -> pd.Index:
    """Mitochondrial gene symbols for specific organism through BioMart.

    Parameters
    ----------
    host
        A valid BioMart host URL, e.g. 'www.ensembl.org'.
    org
        Organism to query. Currently available are human ('hsapiens'),
        mouse ('mmusculus') and zebrafish ('drerio').

    Returns
    -------
    A :class:`pandas.Index` named 'symbol' containing the symbols of all
    genes whose chromosome name is 'MT', without repeated entries.
    `None` is returned (after a debug log message) if `org` is not one of
    the available organisms.
    """
    try:
        from bioservices import biomart
    except ImportError:
        raise ImportError(
            'You need to install the `bioservices` module.')
    from io import StringIO

    server = biomart.BioMart(host=host)

    # building query
    server.new_query()
    if org == 'hsapiens':
        dataset, symbol_attribute = 'hsapiens_gene_ensembl', 'hgnc_symbol'
    elif org == 'mmusculus':
        dataset, symbol_attribute = 'mmusculus_gene_ensembl', 'mgi_symbol'
    elif org == 'drerio':
        dataset, symbol_attribute = 'drerio_gene_ensembl', 'zfin_id_symbol'
    else:
        logg.debug('organism ', str(org), ' is unavailable', no_indent=True)
        return None
    server.add_dataset_to_xml(dataset)
    server.add_attribute_to_xml(symbol_attribute)
    server.add_attribute_to_xml('chromosome_name')
    xml = server.get_xml()

    # parsing mitochondrial gene symbols (header-less, tab-separated response)
    table = pd.read_csv(StringIO(server.query(xml)), sep='\t', header=None)
    table.columns = ['symbol', 'chromosome_name']
    table = table.dropna()
    mito = table[table['chromosome_name'] == 'MT'].set_index('symbol')
    # keep only the first row of every symbol
    mito = mito[~mito.index.duplicated(keep='first')]
    return mito.index
def biomart_query_new(attributes: list, filter: str, filter_values: list, batchsize=10000, verbose=False):
    """
    query biomart similar to the R-function getBM()

    The filter values are sent in batches of at most `batchsize` values,
    one query per batch, and the per-batch results are concatenated.

    :param attributes: which attributes to retrieve from biomart
    :param filter: field which to filter on
    :param filter_values: return only entries whose `filter` field is one of
        these values (surrounding whitespace of each value is stripped)
    :param batchsize: maximum number of filter values per query
    :param verbose: print the XML of every query before sending it
    :return: DataFrame with one column per attribute, duplicate rows dropped;
        an empty DataFrame with those columns if `filter_values` is empty
    """
    # Nothing to look up: return early instead of failing in pd.concat on an
    # empty list (and without contacting the server at all).
    if len(filter_values) == 0:
        return pd.DataFrame(columns=attributes)

    s = biomart.BioMart(host=HOST)

    batch_results = []
    n_batches = int(np.ceil(len(filter_values) / batchsize))
    for id_batch in tqdm.tqdm(batch(filter_values, batchsize=batchsize), total=n_batches):
        s.new_query()
        s.add_dataset_to_xml('hsapiens_gene_ensembl')

        # what we want to get back
        for a in attributes:
            s.add_attribute_to_xml(a)

        # the query should be comma separated
        # better make sure there's no whitespace
        query = ",".join(value.strip() for value in id_batch)
        s.add_filter_to_xml(filter, query)

        xml = s.get_xml()
        if verbose:
            print(xml)

        # the response is parsed as header-less, tab-separated text
        res = s.query(xml)
        df = pd.read_csv(io.StringIO(res), sep='\t', header=None)
        df.columns = attributes
        batch_results.append(df.drop_duplicates())

    # the same row may come back from several batches, hence the second pass
    return pd.concat(batch_results, axis=0).drop_duplicates()
def __init__(self, host='www.ensembl.org'):
    """
    Create the BioMart client used by this object and keep it in `self.s`.

    :param host: host passed on to `biomart.BioMart`
    """
    client = biomart.BioMart(host=host)
    self.s = client
def print_attributes():
    """
    those are all the attributes that we can get from biomart
    for a gene/transcript of the 'hsapiens_gene_ensembl' dataset

    Despite its name nothing is printed; the attribute names are returned.

    :return: list of attribute names
    """
    server = biomart.BioMart(host=HOST)
    available = server.attributes('hsapiens_gene_ensembl')
    return list(available.keys())
import sys
from Bio import SeqIO

# Script fragment: prepares a gene table and a FASTA output file for a
# chunked sequence download through BioMart.
# Command line arguments read below:
#   argv[1]  path of the gene table (CSV)
#   argv[2]  number of genes to be analyzed at a time
# NOTE(review): `pd` and `biomart` are used below but are not imported in
# this part of the file - confirm they are imported elsewhere.

# Load annotation
gene_table_file = sys.argv[1]
gene_table = pd.read_csv(gene_table_file)
# the 'acc' column becomes the index used for all later lookups
gene_table.set_index('acc', inplace=True)

# Define column data types
gene_table['chromosome'] = gene_table['chromosome'].astype(str)
for col in ['start', 'end', 'length', 'strand']:
    gene_table[col] = gene_table[col].astype(int)

# Number of genes to be analyzed at a time
chunk_size = int(sys.argv[2])

# Setup server
b = biomart.BioMart()
b.host = 'www.ensembl.org'
# NOTE(review): `datasets` is not used in the visible part of the script
datasets = b.datasets("ENSEMBL_MART_ENSEMBL")

# Set sequence file name
seq_file = 'all_seqs.fa'
# Append mode: sequences already in the file are kept, and the file is
# created if missing, so the parse below always finds a file.
# The handle is not closed in the visible part of the script.
fileO = open(seq_file, 'a')

# Record downloaded sequences
# The third '|'-separated field of every record name is collected; it is
# compared against the table index ('acc') below.
downloaded = [
    gene.name.split("|")[2]
    for gene in SeqIO.parse(seq_file, 'fasta')
]

# Filter table so that only non downloaded genes are processed
gene_table = gene_table.loc[~(gene_table.index.isin(downloaded))]

# Group table by species and chromosome