Esempio n. 1
0
 def test_list_filters(self):
     """Verify list_filters reports setup errors, then spot-check the filter table.

     The call is expected to return an ``'err'`` entry until both a mart and
     a dataset have been selected.
     """
     biomart = SciBiomart()
     # Nothing configured yet: the mart error is expected.
     assert biomart.list_filters()['err'] == MART_SET_ERR
     biomart.set_mart('ENSEMBL_MART_ENSEMBL')
     # Mart chosen but no dataset: the dataset error is expected.
     assert biomart.list_filters()['err'] == DATASET_SET_ERR
     biomart.set_dataset('fcatus_gene_ensembl')
     # NOTE(review): the False argument presumably suppresses printing -- confirm.
     filter_table = biomart.list_filters(False)
     first_filter_name = filter_table['name'].values[0]
     assert first_filter_name == 'chromosome_name'
     fourth_filter_id = filter_table['id'].values[3]
     assert fourth_filter_id == 'seq_region_strand_1020'
     # Keep the client on the test instance (presumably for fixture teardown).
     self.sb = biomart
Esempio n. 2
0
 def test_list_attributes(self):
     """Verify list_attributes reports setup errors, then check a known attribute.

     The call is expected to return an ``'err'`` entry until both a mart and
     a dataset have been selected. Afterwards the attribute table must
     contain at least one ``chromosome_name`` row, one of which has the id
     ``name_1059``.
     """
     sb = SciBiomart()
     # Nothing configured yet: the mart error is expected.
     err = sb.list_attributes()
     assert err['err'] == MART_SET_ERR
     sb.set_mart('ENSEMBL_MART_ENSEMBL')
     # Mart chosen but no dataset: the dataset error is expected.
     err = sb.list_attributes()
     assert err['err'] == DATASET_SET_ERR
     sb.set_dataset('fcatus_gene_ensembl')
     df = sb.list_attributes(False)
     # Select the matching rows first: len() of the boolean mask itself is
     # always len(df), which made the previous check pass for any non-empty
     # table regardless of whether 'chromosome_name' was present.
     chromosome_rows = df[df['name'] == 'chromosome_name']
     assert len(chromosome_rows) > 0
     assert 'name_1059' in chromosome_rows['id'].values
     # Keep the client on the test instance (presumably for fixture teardown).
     self.sb = sb
Esempio n. 3
0
 def test_set_dataset(self):
     """Verify that selecting a dataset populates ``dataset_version``.

     list_configs is used as the probe for the "mart not set" and
     "dataset not set" error states before the dataset is chosen.
     """
     biomart = SciBiomart()
     # Nothing configured yet: the mart error is expected.
     assert biomart.list_configs()['err'] == MART_SET_ERR
     biomart.set_mart('ENSEMBL_MART_ENSEMBL')
     # Mart chosen but no dataset: the dataset error is expected.
     assert biomart.list_configs()['err'] == DATASET_SET_ERR
     biomart.set_dataset('hsapiens_gene_ensembl')
     expected_version = 'hsapiens_gene_ensembl-GRCh38.p13'
     assert biomart.dataset_version == expected_version
     # Keep the client on the test instance (presumably for fixture teardown).
     self.sb = biomart
Esempio n. 4
0
 def test_run_query(self):
     """Run a two-gene query and check the returned ids and a UniProt accession."""
     biomart = SciBiomart()
     # Nothing configured yet: the mart error is expected.
     assert biomart.list_filters()['err'] == MART_SET_ERR
     biomart.set_mart('ENSEMBL_MART_ENSEMBL')
     # Mart chosen but no dataset: the dataset error is expected.
     assert biomart.list_filters()['err'] == DATASET_SET_ERR
     biomart.set_dataset('hsapiens_gene_ensembl')

     query_filters = {'ensembl_gene_id': 'ENSG00000139618,ENSG00000091483'}
     query_attributes = ['ensembl_gene_id', 'hgnc_symbol', 'uniprotswissprot']
     results = biomart.run_query(query_filters, query_attributes)

     # Both requested genes must come back ...
     for requested_id in ('ENSG00000139618', 'ENSG00000091483'):
         assert requested_id in results['ensembl_gene_id'].values
     # ... and a gene that was not requested must not.
     assert 'ENSG00000091422' not in results['ensembl_gene_id'].values
     assert 'P07954' in results['uniprotswissprot'].values
     # Keep the client on the test instance (presumably for fixture teardown).
     self.sb = biomart
Esempio n. 5
0
 def test_list_configs(self):
     """Verify list_configs reports setup errors, then check the config entries.

     After mart and dataset are set, the result must contain the three
     required entries and have exactly 23 entries in total.
     """
     biomart = SciBiomart()
     # Nothing configured yet: the mart error is expected.
     assert biomart.list_configs()['err'] == MART_SET_ERR
     biomart.set_mart('ENSEMBL_MART_ENSEMBL')
     # Mart chosen but no dataset: the dataset error is expected.
     assert biomart.list_configs()['err'] == DATASET_SET_ERR
     biomart.set_dataset('fcatus_gene_ensembl')

     configs = biomart.list_configs(True)
     required_configs = ['Exportable', 'Importable', 'MainTable']
     # Entries of the result that are among the required ones.
     present_configs = [
         entry for entry in configs if entry in required_configs
     ]
     print(len(configs))
     assert len(present_configs) == len(required_configs)
     assert len(configs) == 23
     # Keep the client on the test instance (presumably for fixture teardown).
     self.sb = biomart
Esempio n. 6
0
 def test_human_sequence(self):
     """Query mouse homolog attributes for two human genes and print the result.

     No assertion is made on the returned data; the test only checks that
     the query completes.
     """
     biomart = SciBiomart()
     biomart.set_mart('ENSEMBL_MART_ENSEMBL')
     biomart.set_dataset('hsapiens_gene_ensembl')

     gene_filter = {'ensembl_gene_id': 'ENSG00000139618,ENSG00000091483'}
     homolog_attributes = [
         'ensembl_gene_id',
         'mmusculus_homolog_ensembl_gene',
         'mmusculus_homolog_perc_id_r1',
     ]
     print(biomart.run_query(gene_filter, homolog_attributes))
     # Keep the client on the test instance (presumably for fixture teardown).
     self.sb = biomart
Esempio n. 7
0
    def test_list_marts(self):
        """Check that every listed mart is expected and that the counts match."""
        biomart = SciBiomart()
        marts = biomart.list_marts()
        expected_marts = [
            'ENSEMBL_MART_ENSEMBL',
            'ENSEMBL_MART_MOUSE',
            'ENSEMBL_MART_SEQUENCE',
            'ENSEMBL_MART_ONTOLOGY',
            'ENSEMBL_MART_GENOMIC',
            'ENSEMBL_MART_SNP',
            'ENSEMBL_MART_FUNCGEN',
        ]

        # Pull the '@name' value out of every mart entry.
        found_marts = [
            entry[key] for entry in marts for key in entry if key == '@name'
        ]
        # Each reported mart must be one of the expected ones.
        for mart_name in found_marts:
            assert mart_name in expected_marts
        count_marts = len(found_marts)

        # Then make sure none of the expected marts is missing by count.
        print(found_marts)
        print(count_marts, len(expected_marts))
        assert count_marts == len(expected_marts)
        # Keep the client on the test instance (presumably for fixture teardown).
        self.sb = biomart
Esempio n. 8
0
 def test_get_gene_flank(self):
     """Query the 500-base upstream gene flank for two human genes.

     The result is printed and written to ``results_df.csv`` in the current
     working directory; no assertion is made on its contents.
     """
     biomart = SciBiomart()
     biomart.set_mart('ENSEMBL_MART_ENSEMBL')
     biomart.set_dataset('hsapiens_gene_ensembl')

     flank_filters = {
         'ensembl_gene_id': 'ENSG00000139618,ENSG00000091483',
         'upstream_flank': 500,
     }
     flank_attributes = ['ensembl_gene_id', 'gene_flank']
     flank_df = biomart.run_query(flank_filters, flank_attributes)
     print(flank_df)
     flank_df.to_csv('results_df.csv', index=False)
     # Keep the client on the test instance (presumably for fixture teardown).
     self.sb = biomart
Esempio n. 9
0
    def test_hg19(self):
        """Check the mart error on list_datasets, then select the human dataset.

        The human and mouse dataset names are looked up in the listing, but
        no assertion is made on the matches.
        """
        biomart = SciBiomart()
        # Listing datasets before a mart is chosen must report the mart error.
        assert biomart.list_datasets()['err'] == MART_SET_ERR
        biomart.set_mart('ENSEMBL_MART_ENSEMBL')
        datasets = biomart.list_datasets()

        wanted_datasets = ['hsapiens_gene_ensembl', 'mmusculus_gene_ensembl']
        # Collected for inspection only; reading the 'name' column also
        # fails loudly if the listing is not a table.
        found_datasets = [
            name for name in datasets['name'].values
            if name in wanted_datasets
        ]

        biomart.set_dataset('hsapiens_gene_ensembl')
        # Keep the client on the test instance (presumably for fixture teardown).
        self.sb = biomart
Esempio n. 10
0
    def test_grch37(self):
        """List marts and datasets against the GRCh37 BioMart service URL.

        The human and mouse dataset names are looked up in the listing, but
        no assertion is made on the matches.
        """
        biomart = SciBiomart('http://grch37.ensembl.org/biomart/martservice/')
        # The mart listing is requested but its result is not inspected.
        biomart.list_marts()
        # Keep the client on the test instance (presumably for fixture teardown).
        self.sb = biomart
        biomart.set_mart('ENSEMBL_MART_ENSEMBL')
        datasets = biomart.list_datasets()

        wanted_datasets = ['hsapiens_gene_ensembl', 'mmusculus_gene_ensembl']
        # Collected for inspection only; reading the 'name' column also
        # fails loudly if the listing is not a table.
        found_datasets = [
            name for name in datasets['name'].values
            if name in wanted_datasets
        ]
Esempio n. 11
0
    def test_list_datasets(self):
        """Check the mart error on list_datasets, then the size of the listing."""
        biomart = SciBiomart()
        # Listing datasets before a mart is chosen must report the mart error.
        assert biomart.list_datasets()['err'] == MART_SET_ERR
        biomart.set_mart('ENSEMBL_MART_ENSEMBL')
        datasets = biomart.list_datasets()

        wanted_datasets = [
            'fcatus_gene_ensembl',
            'pcoquereli_gene_ensembl',
            'lsdomestica_gene_ensembl',
        ]
        found_datasets = [
            name for name in datasets['name'].values
            if name in wanted_datasets
        ]
        print(len(found_datasets), len(wanted_datasets), len(datasets))
        # The set of available datasets has changed over time, so the found
        # and wanted counts are only printed, not compared; a better test is
        # still needed. Only a lower bound on the listing size is asserted.
        assert len(datasets) > 203
        # Keep the client on the test instance (presumably for fixture teardown).
        self.sb = biomart
Esempio n. 12
0
def run(args):
    """Execute the command-line workflow described by the parsed arguments.

    Listing requests are honoured in a fixed order and each one returns
    immediately after printing: marts, then (once the mart ``args.m`` is
    set) datasets, then (once the dataset ``args.d`` is set) filters,
    attributes and configs. If no listing was requested, a query is run
    with the filters from ``args.f`` (a JSON string) and the attributes
    from ``args.a`` (comma separated), optionally sorted when ``args.s`` is
    ``'t'``, and the result is saved via ``save_as_csv`` using ``args.o``.

    Args:
        args: Parsed arguments exposing the attributes ``marts``, ``m``,
            ``datasets``, ``d``, ``filters``, ``attrs``, ``configs``, ``f``,
            ``a``, ``s`` and ``o``.
    """
    biomart = SciBiomart()

    # The mart listing needs no prior setup.
    if args.marts:
        biomart.list_marts(True)
        return
    biomart.set_mart(args.m)

    # The dataset listing needs a mart.
    if args.datasets:
        biomart.list_datasets(True)
        return
    biomart.set_dataset(args.d)

    # Remaining listings need mart and dataset; the first requested one wins.
    # Flags are looked up lazily so later ones are untouched after a match.
    dataset_listings = (
        ('filters', 'list_filters'),
        ('attrs', 'list_attributes'),
        ('configs', 'list_configs'),
    )
    for flag_name, method_name in dataset_listings:
        if getattr(args, flag_name):
            getattr(biomart, method_name)(True)
            return

    # No listing requested, so this is an actual query.
    # The filters arrive as a JSON string and are decoded to a dict.
    filters = json.loads(args.f) if args.f else None
    attrs = args.a.split(",") if args.a else None
    if not attrs and args.s:
        # Sorting needs at least the gene name, location and strand columns.
        attrs = [
            'external_gene_name',
            'chromosome_name',
            'start_position',
            'end_position',
            'strand',
        ]

    biomart.u.dp(['Running query on:',
                  '\nMart: ', biomart.mart,
                  '\nDataset: ', biomart.dataset_version,
                  '\nFilters: ', filters,
                  '\nAttributes: ', attrs])
    results_df = biomart.run_query(filters, attrs)

    if args.s == 't':
        # Sorting was requested: drop unnamed genes, coerce the column
        # types, then sort. The caller must have selected the start and
        # end columns for this to work.
        column_types = {
            'start_position': int,
            'end_position': int,
            'strand': int,
            'chromosome_name': str,
        }
        biomart.u.warn_p(['Removing any genes with no gene name... Required for sorting.'])
        has_gene_name = ~results_df['external_gene_name'].isnull()
        results_df = results_df[has_gene_name]
        results_df = results_df.astype(column_types)
        results_df = biomart.sort_df_on_starts(results_df)

    saved_file = biomart.save_as_csv(results_df, args.o)
    biomart.u.dp(['Saved the output to:', saved_file])