def sync_to_db(cls, name, cluster, merge):
    """Pull the latest metadata for one Druid datasource into the Superset db.

    The datasource row named ``name`` is looked up (and created when it does
    not exist yet), attached to ``cluster`` and given ``merge`` as its
    ``merge_flag``.  Every column reported by ``latest_metadata()`` except
    ``__time`` is then created or updated, flagged according to its Druid
    type, linked to the datasource, and asked to generate its metrics.

    Returns ``None``.  When no metadata can be fetched an error is logged and
    the columns are left untouched.
    """
    logging.info("Syncing Druid datasource [{}]".format(name))
    db_session = get_session()

    ds = db_session.query(cls).filter_by(datasource_name=name).first()
    if ds:
        flasher("Refreshing datasource [{}]".format(name), "info")
    else:
        ds = cls(datasource_name=name)
        db_session.add(ds)
        flasher("Adding new datasource [{}]".format(name), "success")
    db_session.flush()

    ds.cluster = cluster
    ds.merge_flag = merge
    db_session.flush()

    column_meta = ds.latest_metadata()
    if not column_meta:
        logging.error("Failed at fetching the latest segment")
        return

    for column_name in column_meta:
        # The time column is never stored as a regular column.
        if column_name == "__time":
            continue

        lookup = db_session.query(DruidColumn).filter_by(
            datasource_name=name, column_name=column_name)
        column = lookup.first()
        druid_type = column_meta[column_name]['type']

        if not column:
            column = DruidColumn(
                datasource_name=name, column_name=column_name)
            db_session.add(column)

        # Flags are only ever switched on here, never reset.
        if druid_type == "STRING":
            column.groupby = column.filterable = True
        if druid_type in ("hyperUnique", "thetaSketch"):
            column.count_distinct = True
        if druid_type in ("LONG", "DOUBLE"):
            # Numeric columns may be aggregated with sum/min/max.
            column.sum = column.min = column.max = True

        # NOTE(review): this guard looks always true at this point; kept
        # as-is because DruidColumn's truthiness is not visible here.
        if column:
            column.type = druid_type
        db_session.flush()

        column.datasource = ds
        column.generate_metrics()
        db_session.flush()
def refresh(self, datasource_names, merge_flag, refreshAll):
    """Fetch metadata for the given datasources and merge it into Superset.

    Datasources that are not in the database yet are created.  Datasources
    that already exist are refreshed only when ``refreshAll`` is truthy;
    otherwise they are skipped.  Metadata for every datasource that is
    being synced is fetched through a worker pool, after which the columns
    are created or updated and the session is committed.

    :param datasource_names: names of the datasources to sync; iterated
        more than once, so it must be re-iterable (e.g. a list)
    :param merge_flag: value stored on ``merge_flag`` of every synced
        datasource
    :param refreshAll: when falsy, datasources already present in the
        database are left untouched
    """
    session = db.session
    ds_list = (
        session.query(DruidDatasource)
        .filter(or_(DruidDatasource.datasource_name == name
                    for name in datasource_names))
    )
    ds_map = {ds.name: ds for ds in ds_list}

    for ds_name in datasource_names:
        datasource = ds_map.get(ds_name, None)
        if not datasource:
            datasource = DruidDatasource(datasource_name=ds_name)
            # Stage the new row; a single flush follows after the loop.
            with session.no_autoflush:
                session.add(datasource)
            flasher(
                'Adding new datasource [{}]'.format(ds_name), 'success')
            ds_map[ds_name] = datasource
        elif refreshAll:
            flasher(
                'Refreshing datasource [{}]'.format(ds_name), 'info')
        else:
            # Existing datasource and no full refresh requested: drop it
            # from the set whose metadata will be fetched.
            del ds_map[ds_name]
            continue
        datasource.cluster = self
        datasource.merge_flag = merge_flag
    session.flush()

    # Fetch the metadata of all remaining datasources through the pool.
    # The pool is closed and joined even when a fetch raises, so its
    # workers are not left behind.
    ds_refresh = list(ds_map.values())
    pool = Pool()
    try:
        metadata = pool.map(_fetch_metadata_for, ds_refresh)
    finally:
        pool.close()
        pool.join()

    # pool.map keeps input order, so results line up with ds_refresh.
    for datasource, cols in zip(ds_refresh, metadata):
        if not cols:
            # Nothing was fetched for this datasource; leave its columns.
            continue

        col_objs_list = (
            session.query(DruidColumn)
            .filter(
                DruidColumn.datasource_name == datasource.datasource_name)
            .filter(or_(DruidColumn.column_name == col for col in cols))
        )
        col_objs = {col.column_name: col for col in col_objs_list}

        for col in cols:
            if col == '__time':  # skip the time column
                continue
            col_obj = col_objs.get(col, None)
            if not col_obj:
                col_obj = DruidColumn(
                    datasource_name=datasource.datasource_name,
                    column_name=col)
                with session.no_autoflush:
                    session.add(col_obj)

            datatype = cols[col]['type']
            if datatype == 'STRING':
                col_obj.groupby = True
                col_obj.filterable = True
            if datatype in ('hyperUnique', 'thetaSketch'):
                col_obj.count_distinct = True
            # Allow sum/min/max for long or double
            if datatype in ('LONG', 'DOUBLE'):
                col_obj.sum = True
                col_obj.min = True
                col_obj.max = True
            col_obj.type = datatype
            col_obj.datasource = datasource

        datasource.generate_metrics_for(col_objs_list)

    session.commit()
def refresh_async(self, datasource_names, merge_flag, refreshAll):
    """Fetch metadata for the given datasources and merge it into Superset.

    Datasources that are not in the database yet are created.  Datasources
    that already exist are refreshed only when ``refreshAll`` is truthy;
    otherwise they are skipped.  Metadata is fetched through a worker
    pool; this method returns only after the pool has been joined, the
    columns have been created or updated, and the session is committed.

    Datasources for which no metadata came back are skipped rather than
    processed.

    :param datasource_names: names of the datasources to sync; iterated
        more than once, so it must be re-iterable (e.g. a list)
    :param merge_flag: value stored on ``merge_flag`` of every synced
        datasource
    :param refreshAll: when falsy, datasources already present in the
        database are left untouched
    """
    session = db.session
    ds_list = (
        session.query(DruidDatasource)
        .filter(or_(DruidDatasource.datasource_name == name
                    for name in datasource_names))
    )
    ds_map = {ds.name: ds for ds in ds_list}

    for ds_name in datasource_names:
        datasource = ds_map.get(ds_name, None)
        if not datasource:
            datasource = DruidDatasource(datasource_name=ds_name)
            # Stage the new row; a single flush follows after the loop.
            with session.no_autoflush:
                session.add(datasource)
            flasher(
                "Adding new datasource [{}]".format(ds_name), 'success')
            ds_map[ds_name] = datasource
        elif refreshAll:
            flasher(
                "Refreshing datasource [{}]".format(ds_name), 'info')
        else:
            # Existing datasource and no full refresh requested: drop it
            # from the set whose metadata will be fetched.
            del ds_map[ds_name]
            continue
        datasource.cluster = self
        datasource.merge_flag = merge_flag
    session.flush()

    # Fetch the metadata of all remaining datasources through the pool.
    # The pool is closed and joined even when a fetch raises, so its
    # workers are not left behind.
    ds_refresh = list(ds_map.values())
    pool = Pool()
    try:
        metadata = pool.map(_fetch_metadata_for, ds_refresh)
    finally:
        pool.close()
        pool.join()

    # pool.map keeps input order, so results line up with ds_refresh.
    for datasource, cols in zip(ds_refresh, metadata):
        if not cols:
            # No metadata for this datasource.  Iterating None would raise
            # and an empty or_() would not restrict the column query, so
            # skip it and leave its columns untouched.
            continue

        col_objs_list = (
            session.query(DruidColumn)
            .filter(
                DruidColumn.datasource_name == datasource.datasource_name)
            .filter(or_(DruidColumn.column_name == col for col in cols))
        )
        col_objs = {col.column_name: col for col in col_objs_list}

        for col in cols:
            if col == '__time':  # skip the time column
                continue
            col_obj = col_objs.get(col, None)
            if not col_obj:
                col_obj = DruidColumn(
                    datasource_name=datasource.datasource_name,
                    column_name=col)
                with session.no_autoflush:
                    session.add(col_obj)

            datatype = cols[col]['type']
            if datatype == 'STRING':
                col_obj.groupby = True
                col_obj.filterable = True
            if datatype in ('hyperUnique', 'thetaSketch'):
                col_obj.count_distinct = True
            # Allow sum/min/max for long or double
            if datatype in ('LONG', 'DOUBLE'):
                col_obj.sum = True
                col_obj.min = True
                col_obj.max = True
            col_obj.type = datatype
            col_obj.datasource = datasource

        datasource.generate_metrics_for(col_objs_list)

    session.commit()