def test_list_filters(self):
    """Check list_filters() error reporting and the filters of 'fcatus_gene_ensembl'.

    list_filters() is expected to return MART_SET_ERR under 'err' before a
    mart is chosen and DATASET_SET_ERR once a mart, but no dataset, is set.
    With both set, the returned table is inspected by position.
    """
    biomart = SciBiomart()

    # Nothing configured yet.
    response = biomart.list_filters()
    assert response['err'] == MART_SET_ERR

    # Mart configured, dataset still missing.
    biomart.set_mart('ENSEMBL_MART_ENSEMBL')
    response = biomart.list_filters()
    assert response['err'] == DATASET_SET_ERR

    biomart.set_dataset('fcatus_gene_ensembl')
    # NOTE(review): the False argument presumably suppresses printing - confirm.
    filters = biomart.list_filters(False)

    first_name = filters['name'].values[0]
    assert first_name == 'chromosome_name'
    fourth_id = filters['id'].values[3]
    assert fourth_id == 'seq_region_strand_1020'

    self.sb = biomart
def test_list_attributes(self):
    """Check list_attributes() error reporting and a known attribute row.

    list_attributes() is expected to return MART_SET_ERR under 'err' before
    a mart is chosen and DATASET_SET_ERR once a mart, but no dataset, is set.
    With 'fcatus_gene_ensembl' selected, at least one attribute must be named
    'chromosome_name' and one of those rows must carry the id 'name_1059'.
    """
    sb = SciBiomart()

    err = sb.list_attributes()
    assert err['err'] == MART_SET_ERR

    sb.set_mart('ENSEMBL_MART_ENSEMBL')
    err = sb.list_attributes()
    assert err['err'] == DATASET_SET_ERR

    sb.set_dataset('fcatus_gene_ensembl')
    df = sb.list_attributes(False)

    # Filter first, then count: the length of the boolean mask itself is
    # just the number of rows in df and would pass even with no match.
    chromosome_rows = df[df['name'] == 'chromosome_name']
    assert len(chromosome_rows) > 0
    assert 'name_1059' in chromosome_rows['id'].values

    self.sb = sb
def test_set_dataset(self):
    """Check that set_dataset() populates dataset_version for the human dataset.

    Also confirms list_configs() returns MART_SET_ERR and then
    DATASET_SET_ERR under 'err' while the mart / dataset are still unset.
    """
    client = SciBiomart()

    mart_missing = client.list_configs()
    assert mart_missing['err'] == MART_SET_ERR

    client.set_mart('ENSEMBL_MART_ENSEMBL')
    dataset_missing = client.list_configs()
    assert dataset_missing['err'] == DATASET_SET_ERR

    client.set_dataset('hsapiens_gene_ensembl')
    expected_version = 'hsapiens_gene_ensembl-GRCh38.p13'
    assert client.dataset_version == expected_version

    self.sb = client
def test_run_query(self):
    """Run a two-gene query against 'hsapiens_gene_ensembl' and check the rows.

    Both requested gene ids must appear in the result, an id that was not
    requested must not, and 'P07954' must be among the 'uniprotswissprot'
    values.
    """
    biomart = SciBiomart()

    response = biomart.list_filters()
    assert response['err'] == MART_SET_ERR

    biomart.set_mart('ENSEMBL_MART_ENSEMBL')
    response = biomart.list_filters()
    assert response['err'] == DATASET_SET_ERR

    biomart.set_dataset('hsapiens_gene_ensembl')
    gene_filter = {'ensembl_gene_id': 'ENSG00000139618,ENSG00000091483'}
    wanted_attributes = ['ensembl_gene_id', 'hgnc_symbol', 'uniprotswissprot']
    table = biomart.run_query(gene_filter, wanted_attributes)

    gene_ids = table['ensembl_gene_id'].values
    assert 'ENSG00000139618' in gene_ids
    assert 'ENSG00000091483' in gene_ids
    # This id was not part of the filter, so it must be absent.
    assert 'ENSG00000091422' not in gene_ids
    assert 'P07954' in table['uniprotswissprot'].values

    self.sb = biomart
def test_list_configs(self):
    """Check list_configs() error reporting and the configs of 'fcatus_gene_ensembl'.

    The returned collection must contain 'Exportable', 'Importable' and
    'MainTable', and must have exactly 23 entries.
    """
    client = SciBiomart()

    outcome = client.list_configs()
    assert outcome['err'] == MART_SET_ERR

    client.set_mart('ENSEMBL_MART_ENSEMBL')
    outcome = client.list_configs()
    assert outcome['err'] == DATASET_SET_ERR

    client.set_dataset('fcatus_gene_ensembl')
    configs = client.list_configs(True)

    required = ['Exportable', 'Importable', 'MainTable']
    present = [entry for entry in configs if entry in required]

    print(len(configs))
    assert len(present) == len(required)
    assert len(configs) == 23

    self.sb = client
def test_human_sequence(self):
    """Query mouse-homolog attributes for two human genes and print the result.

    No assertions are made on the returned data; the test only exercises
    run_query() with the homolog attributes.
    """
    client = SciBiomart()
    client.set_mart('ENSEMBL_MART_ENSEMBL')
    client.set_dataset('hsapiens_gene_ensembl')

    gene_filter = {'ensembl_gene_id': 'ENSG00000139618,ENSG00000091483'}
    homolog_attributes = [
        'ensembl_gene_id',
        'mmusculus_homolog_ensembl_gene',
        'mmusculus_homolog_perc_id_r1',
    ]
    homologs = client.run_query(gene_filter, homolog_attributes)
    print(homologs)

    self.sb = client
def test_list_marts(self):
    """Check that list_marts() reports exactly the expected mart names.

    Every '@name' value found in the returned entries must be one of the
    expected marts, and the number of names found must equal the number of
    expected marts.
    """
    client = SciBiomart()
    expected_marts = [
        'ENSEMBL_MART_ENSEMBL',
        'ENSEMBL_MART_MOUSE',
        'ENSEMBL_MART_SEQUENCE',
        'ENSEMBL_MART_ONTOLOGY',
        'ENSEMBL_MART_GENOMIC',
        'ENSEMBL_MART_SNP',
        'ENSEMBL_MART_FUNCGEN',
    ]
    marts = client.list_marts()

    # Collect each entry's '@name' value, rejecting unknown marts on sight.
    seen_names = []
    for entry in marts:
        for key in entry:
            if key != '@name':
                continue
            mart_name = entry[key]
            assert mart_name in expected_marts
            seen_names.append(mart_name)

    # Now make sure none of the expected marts was left out.
    total_seen = len(seen_names)
    print(seen_names)
    print(total_seen, len(expected_marts))
    assert total_seen == len(expected_marts)

    self.sb = client
def test_get_gene_flank(self):
    """Query 'gene_flank' with an upstream flank of 500 for two human genes.

    The result is printed and written to 'results_df.csv' in the current
    working directory; no assertions are made on its contents.
    """
    client = SciBiomart()
    client.set_mart('ENSEMBL_MART_ENSEMBL')
    client.set_dataset('hsapiens_gene_ensembl')

    query_filters = {
        'ensembl_gene_id': 'ENSG00000139618,ENSG00000091483',
        'upstream_flank': 500,
    }
    flank_df = client.run_query(query_filters, ['ensembl_gene_id', 'gene_flank'])

    print(flank_df)
    flank_df.to_csv('results_df.csv', index=False)

    self.sb = client
def test_hg19(self):
    """Check that the human and mouse datasets are listed, then select human.

    list_datasets() is expected to return MART_SET_ERR under 'err' before a
    mart is chosen. After selecting 'ENSEMBL_MART_ENSEMBL', both
    'hsapiens_gene_ensembl' and 'mmusculus_gene_ensembl' must appear in the
    'name' column of the dataset listing.
    """
    sb = SciBiomart()

    # Expect an error if we haven't set a mart.
    err = sb.list_datasets()
    assert err['err'] == MART_SET_ERR

    sb.set_mart('ENSEMBL_MART_ENSEMBL')
    datasets = sb.list_datasets()

    check_datasets_exist = ['hsapiens_gene_ensembl', 'mmusculus_gene_ensembl']
    available = set(datasets['name'].values)
    missing = [d for d in check_datasets_exist if d not in available]
    # Without this assertion the lookup result would be computed and then
    # silently discarded, so the test could never fail on a missing dataset.
    assert not missing, 'Datasets missing from mart: {}'.format(missing)

    sb.set_dataset('hsapiens_gene_ensembl')
    self.sb = sb
def test_grch37(self):
    """Exercise mart and dataset listing against the GRCh37 martservice URL.

    Only checks that the calls complete and that the dataset listing has a
    'name' column; nothing is asserted about its contents.
    """
    client = SciBiomart('http://grch37.ensembl.org/biomart/martservice/')
    client.list_marts()  # return value is not inspected
    self.sb = client

    client.set_mart('ENSEMBL_MART_ENSEMBL')
    listing = client.list_datasets()

    wanted = ['hsapiens_gene_ensembl', 'mmusculus_gene_ensembl']
    # NOTE(review): the matches are collected but never asserted on - confirm
    # which datasets this endpoint is expected to expose before adding a check.
    matched = [name for name in listing['name'].values if name in wanted]
def test_list_datasets(self):
    """Check list_datasets() error reporting and the size of the listing.

    list_datasets() is expected to return MART_SET_ERR under 'err' before a
    mart is chosen. With 'ENSEMBL_MART_ENSEMBL' selected, the listing must
    contain more than 203 entries.
    """
    client = SciBiomart()

    # Expect an error if we haven't set a mart.
    outcome = client.list_datasets()
    assert outcome['err'] == MART_SET_ERR

    client.set_mart('ENSEMBL_MART_ENSEMBL')
    listing = client.list_datasets()

    probes = [
        'fcatus_gene_ensembl',
        'pcoquereli_gene_ensembl',
        'lsdomestica_gene_ensembl',
    ]
    hits = [name for name in listing['name'].values if name in probes]
    print(len(hits), len(probes), len(listing))

    # The set of available datasets has changed over time, so the number of
    # probe hits is only printed, not asserted; a better check is needed.
    assert len(listing) > 203

    self.sb = client
def run(args):
    """Drive SciBiomart from parsed command-line arguments.

    The listing flags are checked in order and each one returns early:
    ``args.marts`` (before any mart is set), ``args.datasets`` (after the
    mart ``args.m`` is set), then ``args.filters``, ``args.attrs`` and
    ``args.configs`` (after the dataset ``args.d`` is set). If none is
    given, a query is run and its result saved.

    Args:
        args: Object exposing the attributes ``marts``, ``m``, ``datasets``,
            ``d``, ``filters``, ``attrs``, ``configs``, ``f`` (JSON string
            of filters), ``a`` (comma-separated attribute names), ``s``
            (``'t'`` requests sorting) and ``o`` (passed to save_as_csv).

    Returns:
        None.
    """
    biomart = SciBiomart()

    # Listing the marts needs no further configuration.
    if args.marts:
        biomart.list_marts(True)
        return

    biomart.set_mart(args.m)
    if args.datasets:
        biomart.list_datasets(True)
        return

    biomart.set_dataset(args.d)
    if args.filters:
        biomart.list_filters(True)
        return
    if args.attrs:
        biomart.list_attributes(True)
        return
    if args.configs:
        biomart.list_configs(True)
        return

    # No listing requested: build the query from the filter / attribute options.
    filters = json.loads(args.f) if args.f else None
    attrs = args.a.split(",") if args.a else None

    # NOTE(review): any truthy args.s triggers these defaults, while sorting
    # below requires args.s == 't' - confirm this difference is intended.
    if not attrs and args.s:
        # Sorting needs at least the gene name, chromosome, start and end.
        attrs = ['external_gene_name', 'chromosome_name', 'start_position',
                 'end_position', 'strand']

    biomart.u.dp(['Running query on:', '\nMart: ', biomart.mart,
                  '\nDataset: ', biomart.dataset_version,
                  '\nFilters: ', filters, '\nAttributes: ', attrs])
    results_df = biomart.run_query(filters, attrs)

    if args.s == 't':
        column_types = {
            'start_position': int,
            'end_position': int,
            'strand': int,
            'chromosome_name': str,
        }
        biomart.u.warn_p(['Removing any genes with no gene name... Required for sorting.'])
        has_name = ~results_df['external_gene_name'].isnull()
        typed_df = results_df[has_name].astype(column_types)
        # The start / end columns must have been selected for this to work.
        results_df = biomart.sort_df_on_starts(typed_df)

    saved_file = biomart.save_as_csv(results_df, args.o)
    biomart.u.dp(['Saved the output to:', saved_file])