Ejemplo n.º 1
0
 def test_set_dataset(self):
     """Walk through mart and dataset selection on a fresh SciBiomart.

     list_configs is expected to report MART_SET_ERR before a mart is set
     and DATASET_SET_ERR before a dataset is set; after set_dataset the
     dataset_version attribute is compared against a pinned value.
     """
     biomart = SciBiomart()

     # Nothing selected yet.
     no_mart_reply = biomart.list_configs()
     assert no_mart_reply['err'] == MART_SET_ERR

     # Mart selected, dataset still missing.
     biomart.set_mart('ENSEMBL_MART_ENSEMBL')
     no_dataset_reply = biomart.list_configs()
     assert no_dataset_reply['err'] == DATASET_SET_ERR

     biomart.set_dataset('hsapiens_gene_ensembl')
     expected_version = 'hsapiens_gene_ensembl-GRCh38.p13'
     assert biomart.dataset_version == expected_version

     self.sb = biomart
Ejemplo n.º 2
0
 def test_human_sequence(self):
     """Query two Ensembl gene IDs for their mmusculus homolog attributes.

     The result is only printed; no assertion is made on its content.
     """
     biomart = SciBiomart()
     biomart.set_mart('ENSEMBL_MART_ENSEMBL')
     biomart.set_dataset('hsapiens_gene_ensembl')

     wanted_attributes = [
         'ensembl_gene_id',
         'mmusculus_homolog_ensembl_gene',
         'mmusculus_homolog_perc_id_r1',
     ]
     gene_filter = {'ensembl_gene_id': 'ENSG00000139618,ENSG00000091483'}

     homologs = biomart.run_query(gene_filter, wanted_attributes)
     print(homologs)

     self.sb = biomart
Ejemplo n.º 3
0
 def test_list_filters(self):
     """Check list_filters error replies and two pinned filter entries.

     list_filters is expected to report MART_SET_ERR before a mart is set
     and DATASET_SET_ERR before a dataset is set. For the
     fcatus_gene_ensembl dataset the first filter name and the fourth
     filter id are compared against fixed values.
     """
     biomart = SciBiomart()

     # Nothing selected yet.
     no_mart_reply = biomart.list_filters()
     assert no_mart_reply['err'] == MART_SET_ERR

     # Mart selected, dataset still missing.
     biomart.set_mart('ENSEMBL_MART_ENSEMBL')
     no_dataset_reply = biomart.list_filters()
     assert no_dataset_reply['err'] == DATASET_SET_ERR

     biomart.set_dataset('fcatus_gene_ensembl')
     filter_table = biomart.list_filters(False)

     first_name = filter_table['name'].values[0]
     assert first_name == 'chromosome_name'
     fourth_id = filter_table['id'].values[3]
     assert fourth_id == 'seq_region_strand_1020'

     self.sb = biomart
Ejemplo n.º 4
0
 def test_list_attributes(self):
     """Check list_attributes error replies and the chromosome_name entry.

     list_attributes is expected to report MART_SET_ERR before a mart is
     set and DATASET_SET_ERR before a dataset is set. For the
     fcatus_gene_ensembl dataset the attribute table must contain at least
     one row named 'chromosome_name', and 'name_1059' must be among the ids
     of those rows.
     """
     sb = SciBiomart()
     # Nothing selected yet.
     err = sb.list_attributes()
     assert err['err'] == MART_SET_ERR
     sb.set_mart('ENSEMBL_MART_ENSEMBL')
     # Mart selected, dataset still missing.
     err = sb.list_attributes()
     assert err['err'] == DATASET_SET_ERR
     sb.set_dataset('fcatus_gene_ensembl')
     df = sb.list_attributes(False)
     # Use .any() rather than len(): the boolean mask has one entry per row,
     # so its length says nothing about whether any row actually matched.
     is_chromosome_name = df['name'] == 'chromosome_name'
     assert is_chromosome_name.any()
     assert 'name_1059' in df[is_chromosome_name]['id'].values
     self.sb = sb
Ejemplo n.º 5
0
def run(args):
    """Drive a SciBiomart session from parsed command-line arguments.

    The listing flags are checked in order and each one returns early after
    printing: ``args.marts``, then (after setting the mart from ``args.m``)
    ``args.datasets``, then (after setting the dataset from ``args.d``)
    ``args.filters``, ``args.attrs`` and ``args.configs``.

    If no listing flag is set, a query is run with filters parsed from the
    JSON string ``args.f`` and attributes split from the comma-separated
    string ``args.a``. When ``args.s`` equals ``'t'`` the rows without a gene
    name are dropped and the result is sorted before being saved to
    ``args.o``.
    """
    biomart = SciBiomart()

    # Listing marts needs no further setup.
    if args.marts:
        biomart.list_marts(True)
        return

    biomart.set_mart(args.m)
    # Listing datasets only needs the mart.
    if args.datasets:
        biomart.list_datasets(True)
        return

    biomart.set_dataset(args.d)
    # Listing filters, attributes or configs needs mart and dataset.
    if args.filters:
        biomart.list_filters(True)
        return
    if args.attrs:
        biomart.list_attributes(True)
        return
    if args.configs:
        biomart.list_configs(True)
        return

    # No listing requested, so this is a real query.
    # The filters arrive as a JSON string and become a dict.
    filters = json.loads(args.f) if args.f else None
    attrs = args.a.split(",") if args.a else None
    if not attrs and args.s:
        # Sorting needs at least the gene name and its coordinates.
        attrs = [
            'external_gene_name',
            'chromosome_name',
            'start_position',
            'end_position',
            'strand',
        ]

    biomart.u.dp([
        'Running query on:',
        '\nMart: ', biomart.mart,
        '\nDataset: ', biomart.dataset_version,
        '\nFilters: ', filters,
        '\nAttributes: ', attrs,
    ])
    results_df = biomart.run_query(filters, attrs)

    if args.s == 't':
        biomart.u.warn_p(['Removing any genes with no gene name... Required for sorting.'])
        name_is_missing = results_df['external_gene_name'].isnull()
        results_df = results_df[~name_is_missing]

        column_types = {
            'start_position': int,
            'end_position': int,
            'strand': int,
            'chromosome_name': str,
        }
        # The start and end columns must be among the selected attributes.
        results_df = biomart.sort_df_on_starts(results_df.astype(column_types))

    saved_file = biomart.save_as_csv(results_df, args.o)
    biomart.u.dp(['Saved the output to:', saved_file])
Ejemplo n.º 6
0
 def test_get_gene_flank(self):
     """Query gene_flank for two Ensembl gene IDs with a 500 upstream flank.

     The result is printed and written to 'results_df.csv' in the current
     working directory; no assertion is made on its content.
     """
     biomart = SciBiomart()
     biomart.set_mart('ENSEMBL_MART_ENSEMBL')
     biomart.set_dataset('hsapiens_gene_ensembl')

     wanted_attributes = ['ensembl_gene_id', 'gene_flank']
     flank_filter = {
         'ensembl_gene_id': 'ENSG00000139618,ENSG00000091483',
         'upstream_flank': 500,
     }

     flanks = biomart.run_query(flank_filter, wanted_attributes)
     print(flanks)
     flanks.to_csv('results_df.csv', index=False)

     self.sb = biomart
Ejemplo n.º 7
0
 def test_run_query(self):
     """Run a two-gene query and check which IDs and accessions come back.

     list_filters is expected to report MART_SET_ERR before a mart is set
     and DATASET_SET_ERR before a dataset is set. The query result must
     contain both requested gene IDs, must not contain an unrequested one,
     and must list 'P07954' in the uniprotswissprot column.
     """
     biomart = SciBiomart()

     # Nothing selected yet.
     no_mart_reply = biomart.list_filters()
     assert no_mart_reply['err'] == MART_SET_ERR

     # Mart selected, dataset still missing.
     biomart.set_mart('ENSEMBL_MART_ENSEMBL')
     no_dataset_reply = biomart.list_filters()
     assert no_dataset_reply['err'] == DATASET_SET_ERR

     biomart.set_dataset('hsapiens_gene_ensembl')
     gene_filter = {'ensembl_gene_id': 'ENSG00000139618,ENSG00000091483'}
     wanted_attributes = ['ensembl_gene_id', 'hgnc_symbol', 'uniprotswissprot']
     hits = biomart.run_query(gene_filter, wanted_attributes)

     assert 'ENSG00000139618' in hits['ensembl_gene_id'].values
     assert 'ENSG00000091483' in hits['ensembl_gene_id'].values
     # An ID that was not requested must not appear.
     assert 'ENSG00000091422' not in hits['ensembl_gene_id'].values
     assert 'P07954' in hits['uniprotswissprot'].values

     self.sb = biomart
Ejemplo n.º 8
0
 def test_list_configs(self):
     """Check list_configs error replies and the configs of one dataset.

     list_configs is expected to report MART_SET_ERR before a mart is set
     and DATASET_SET_ERR before a dataset is set. For the
     fcatus_gene_ensembl dataset the returned collection must yield
     'Exportable', 'Importable' and 'MainTable' and have 23 entries.
     """
     biomart = SciBiomart()

     # Nothing selected yet.
     no_mart_reply = biomart.list_configs()
     assert no_mart_reply['err'] == MART_SET_ERR

     # Mart selected, dataset still missing.
     biomart.set_mart('ENSEMBL_MART_ENSEMBL')
     no_dataset_reply = biomart.list_configs()
     assert no_dataset_reply['err'] == DATASET_SET_ERR

     biomart.set_dataset('fcatus_gene_ensembl')
     configs = biomart.list_configs(True)

     required = ['Exportable', 'Importable', 'MainTable']
     present = [entry for entry in configs if entry in required]

     print(len(configs))
     assert len(present) == len(required)
     assert len(configs) == 23

     self.sb = biomart
Ejemplo n.º 9
0
    def test_hg19(self):
        """Check that the Ensembl mart lists the human and mouse datasets.

        list_datasets is expected to report MART_SET_ERR before a mart is
        set. Once the mart is set, both 'hsapiens_gene_ensembl' and
        'mmusculus_gene_ensembl' must appear in the 'name' column of the
        listing, after which the human dataset is selected.
        """
        sb = SciBiomart()
        err = sb.list_datasets()
        # Expect an error if we haven't set a mart.
        assert err['err'] == MART_SET_ERR
        sb.set_mart('ENSEMBL_MART_ENSEMBL')
        datasets = sb.list_datasets()

        check_datasets_exist = [
            'hsapiens_gene_ensembl', 'mmusculus_gene_ensembl'
        ]
        listed_names = set(datasets['name'].values)
        missing_datasets = [
            name for name in check_datasets_exist if name not in listed_names
        ]
        # Without this assertion the listing would be collected but never
        # checked; the missing names are used as the failure message.
        assert not missing_datasets, missing_datasets

        sb.set_dataset('hsapiens_gene_ensembl')
        self.sb = sb