Example #1
    def sync_to_db(cls, name, cluster, merge):
        """Fetches metadata for that datasource and merges the Superset db"""
        logging.info("Syncing Druid datasource [{}]".format(name))
        session = get_session()
        datasource = session.query(cls).filter_by(datasource_name=name).first()
        if not datasource:
            datasource = cls(datasource_name=name)
            session.add(datasource)
            flasher("Adding new datasource [{}]".format(name), "success")
        else:
            flasher("Refreshing datasource [{}]".format(name), "info")
        session.flush()
        datasource.cluster = cluster
        datasource.merge_flag = merge
        session.flush()

        cols = datasource.latest_metadata()
        if not cols:
            logging.error("Failed at fetching the latest segment")
            return
        for col in cols:
            # Skip the time column
            if col == "__time":
                continue
            col_obj = (
                session
                .query(DruidColumn)
                .filter_by(datasource_name=name, column_name=col)
                .first()
            )
            datatype = cols[col]['type']
            if not col_obj:
                col_obj = DruidColumn(datasource_name=name, column_name=col)
                session.add(col_obj)
            if datatype == "STRING":
                col_obj.groupby = True
                col_obj.filterable = True
            if datatype == "hyperUnique" or datatype == "thetaSketch":
                col_obj.count_distinct = True
            # If long or double, allow sum/min/max
            if datatype == "LONG" or datatype == "DOUBLE":
                col_obj.sum = True
                col_obj.min = True
                col_obj.max = True
            if col_obj:
                col_obj.type = cols[col]['type']
            session.flush()
            col_obj.datasource = datasource
            col_obj.generate_metrics()
            session.flush()
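
The column loop in this first version drives off the dict returned by
datasource.latest_metadata(), which maps each column name to a descriptor
carrying at least a 'type' key. A minimal sketch of that expected shape, with
made-up column names, and the flags the loop would set for each type:

    # Hypothetical shape of the metadata dict; only the 'type' key is read above.
    cols = {
        "__time": {"type": "LONG"},                 # skipped by the loop
        "country": {"type": "STRING"},              # -> groupby, filterable
        "user_id_sketch": {"type": "hyperUnique"},  # -> count_distinct
        "revenue": {"type": "DOUBLE"},              # -> sum, min, max
    }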
Example #2
    def refresh(self, datasource_names, merge_flag, refreshAll):
        """
        Fetches metadata for the specified datasources and
        merges it into the Superset database
        """
        session = db.session
        ds_list = (session.query(DruidDatasource).filter(
            or_(DruidDatasource.datasource_name == name
                for name in datasource_names)))

        ds_map = {ds.name: ds for ds in ds_list}
        for ds_name in datasource_names:
            datasource = ds_map.get(ds_name, None)
            if not datasource:
                datasource = DruidDatasource(datasource_name=ds_name)
                with session.no_autoflush:
                    session.add(datasource)
                flasher('Adding new datasource [{}]'.format(ds_name),
                        'success')
                ds_map[ds_name] = datasource
            elif refreshAll:
                flasher('Refreshing datasource [{}]'.format(ds_name), 'info')
            else:
                del ds_map[ds_name]
                continue
            datasource.cluster = self
            datasource.merge_flag = merge_flag
        session.flush()

        # Prepare multithreaded execution
        pool = Pool()
        ds_refresh = list(ds_map.values())
        metadata = pool.map(_fetch_metadata_for, ds_refresh)
        pool.close()
        pool.join()

        for i in range(0, len(ds_refresh)):
            datasource = ds_refresh[i]
            cols = metadata[i]
            if cols:
                col_objs_list = (session.query(DruidColumn).filter(
                    DruidColumn.datasource_name ==
                    datasource.datasource_name).filter(
                        or_(DruidColumn.column_name == col for col in cols)))
                col_objs = {col.column_name: col for col in col_objs_list}
                for col in cols:
                    if col == '__time':  # skip the time column
                        continue
                    col_obj = col_objs.get(col, None)
                    if not col_obj:
                        col_obj = DruidColumn(
                            datasource_name=datasource.datasource_name,
                            column_name=col)
                        with session.no_autoflush:
                            session.add(col_obj)
                    datatype = cols[col]['type']
                    if datatype == 'STRING':
                        col_obj.groupby = True
                        col_obj.filterable = True
                    if datatype == 'hyperUnique' or datatype == 'thetaSketch':
                        col_obj.count_distinct = True
                    # Allow sum/min/max for long or double
                    if datatype == 'LONG' or datatype == 'DOUBLE':
                        col_obj.sum = True
                        col_obj.min = True
                        col_obj.max = True
                    col_obj.type = datatype
                    col_obj.datasource = datasource
                datasource.generate_metrics_for(col_objs_list)
        session.commit()
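
The Pool used for the fan-out is not shown in the snippet. Assuming it is a
thread pool such as multiprocessing.dummy.Pool, and that _fetch_metadata_for
simply wraps latest_metadata() (both assumptions, not the exact Superset
definitions), the concurrent fetch step looks roughly like this:

    # Sketch of the fan-out step; the import and helper are assumptions.
    from multiprocessing.dummy import Pool  # thread-based Pool, same API as multiprocessing.Pool

    def _fetch_metadata_for(datasource):
        # Fetch the latest segment metadata for a single datasource.
        return datasource.latest_metadata()

    ds_refresh = list(ds_map.values())        # as in the snippet above
    pool = Pool()                             # defaults to one worker per CPU
    metadata = pool.map(_fetch_metadata_for, ds_refresh)  # results keep input order
    pool.close()
    pool.join()

Because pool.map preserves input order, metadata[i] lines up with ds_refresh[i]
in the merge loop that follows.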
Example #3
    def refresh_async(self, datasource_names, merge_flag, refreshAll):
        """
        Fetches metadata for the specified datasources and
        merges it into the Superset database
        """
        session = db.session
        ds_list = (
            session.query(DruidDatasource)
            .filter(or_(DruidDatasource.datasource_name == name
                    for name in datasource_names))
        )

        ds_map = {ds.name: ds for ds in ds_list}
        for ds_name in datasource_names:
            datasource = ds_map.get(ds_name, None)
            if not datasource:
                datasource = DruidDatasource(datasource_name=ds_name)
                with session.no_autoflush:
                    session.add(datasource)
                flasher(
                    "Adding new datasource [{}]".format(ds_name), 'success')
                ds_map[ds_name] = datasource
            elif refreshAll:
                flasher(
                    "Refreshing datasource [{}]".format(ds_name), 'info')
            else:
                del ds_map[ds_name]
                continue
            datasource.cluster = self
            datasource.merge_flag = merge_flag
        session.flush()

        # Prepare multithreaded execution
        pool = Pool()
        ds_refresh = list(ds_map.values())
        metadata = pool.map(_fetch_metadata_for, ds_refresh)
        pool.close()
        pool.join()

        for i in range(0, len(ds_refresh)):
            datasource = ds_refresh[i]
            cols = metadata[i]
            col_objs_list = (
                session.query(DruidColumn)
                .filter(DruidColumn.datasource_name == datasource.datasource_name)
                .filter(or_(DruidColumn.column_name == col for col in cols))
            )
            col_objs = {col.column_name: col for col in col_objs_list}
            for col in cols:
                if col == '__time':  # skip the time column
                    continue
                col_obj = col_objs.get(col, None)
                if not col_obj:
                    col_obj = DruidColumn(
                        datasource_name=datasource.datasource_name,
                        column_name=col)
                    with session.no_autoflush:
                        session.add(col_obj)
                datatype = cols[col]['type']
                if datatype == 'STRING':
                    col_obj.groupby = True
                    col_obj.filterable = True
                if datatype == 'hyperUnique' or datatype == 'thetaSketch':
                    col_obj.count_distinct = True
                # Allow sum/min/max for long or double
                if datatype == 'LONG' or datatype == 'DOUBLE':
                    col_obj.sum = True
                    col_obj.min = True
                    col_obj.max = True
                col_obj.type = datatype
                col_obj.datasource = datasource
            datasource.generate_metrics_for(col_objs_list)
        session.commit()
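
As a usage sketch, the asynchronous refresh would be driven roughly like this
(the cluster object and datasource names are hypothetical):

    # Hypothetical invocation; cluster is the DruidCluster-like object that owns refresh_async.
    cluster.refresh_async(
        datasource_names=["wikipedia", "clickstream"],
        merge_flag=True,
        refreshAll=False,  # with False, existing datasources are skipped and only new ones are added
    )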