def yield_mouse_orthologs(hs_genes):
    """
    Yield mouse ortholog gene ids for the given human Ensembl gene ids.

    A single biomart query is built against the 'hsapiens_gene_ensembl'
    dataset, requesting the 'ensembl_gene_id' and 'mouse_ensembl_gene'
    attributes. The query is re-issued once per chunk of at most 50 gene ids,
    with the 'ensembl_gene_id' filter value set to that chunk.

    @param hs_genes: The human gene ids to map. They are joined with ','
        so they must be strings.
    @return: Yields the second column of every result row where it is
        non-empty (presumably the 'mouse_ensembl_gene' attribute, assuming
        columns come back in the order the attributes were added).
    """
    # B is the biomart helper module bound elsewhere in this file.
    query = B.new_query()
    dataset = B.add_dataset(query, 'hsapiens_gene_ensembl')
    B.add_attribute(dataset, 'ensembl_gene_id')
    B.add_attribute(dataset, 'mouse_ensembl_gene')
    gene_filter = B.add_filter(dataset, name='ensembl_gene_id', value='')

    # Query in chunks of 50 ids: the filter value is overwritten for each
    # chunk before the query is issued.
    for chunk in B.split_big_list(hs_genes, 50):
        gene_filter.set('value', ','.join(chunk))
        for row in B.yield_csv_query_results(query):
            # Skip rows whose ortholog column is empty.
            if row[1]:
                yield row[1]
def get_ensembl_go_annotations(genes):
    """
    Look up GO annotations for the given genes in Ensembl biomart.

    One query is built per GO attribute pair (biological process, cellular
    component, molecular function) against the 'mmusculus_gene_ensembl'
    dataset. Each query is issued once per chunk of at most 50 gene ids.
    Rows whose third column is in options.go_evidence_codes_to_ignore are
    dropped; for every other row the second column is appended to the entry
    keyed by the first column.

    @param genes: A sized collection of genes; each is converted with str()
        before being used as a filter value.
    @return: A cookbook.DictOfLists keyed by the first column of each result
        row (presumably the Ensembl gene id) holding the GO ids found for it.
    """
    import biopsy.identifiers.biomart as biomart

    logging.info('Querying Ensembl biomart for GO annotations of %d genes', len(genes))
    annotations = cookbook.DictOfLists()

    # (GO id attribute, evidence/linkage-type attribute) per query.
    attribute_pairs = (
        ('go_biological_process_id', 'go_biological_process_linkage_type'),
        ('go_cellular_component_id', 'go_cellular_component_linkage_type'),
        ('go_molecular_function_id', 'go_molecular_function_linkage_type'),
    )

    for go_id_attribute, linkage_attribute in attribute_pairs:
        query = biomart.new_query()
        dataset = biomart.add_dataset(query, 'mmusculus_gene_ensembl')
        for attribute in ('ensembl_gene_id', go_id_attribute, linkage_attribute):
            biomart.add_attribute(dataset, attribute)
        gene_filter = biomart.add_filter(dataset, name='ensembl_gene_id', value='')

        gene_ids = (str(gene) for gene in genes)
        for chunk in biomart.split_big_list(gene_ids, 50):
            # Re-point the filter at this chunk before running the query.
            gene_filter.set('value', ','.join(chunk))
            for row in biomart.yield_csv_query_results(query):
                if row[2] in options.go_evidence_codes_to_ignore:
                    continue
                annotations[row[0]].append(row[1])

    logging.info('Found %d go annotations', sum(len(go_ids) for go_ids in annotations.values()))
    return annotations