def _load_intermediate_data(job_ids):
    """Load and concatenate JSON results from intermediate jobs, skipping empty ones."""
    jobs_data = [io_helper.get_results(job_id).data for job_id in job_ids]
    # chain all results together, ignore empty results
    data = list(itertools.chain(*[json.loads(d) for d in jobs_data if d]))

    if not data:
        raise errors.UserError('Intermediate jobs {} do not have any data.'.format(job_ids))

    return data
def aggregate_stats(job_ids):
    """Get all partial statistics from all nodes and aggregate them.
    :input job_ids: list of job_ids with intermediate results
    """
    # Read intermediate inputs from jobs
    logging.info("Fetching intermediate data...")
    results = [json.loads(io_helper.get_results(str(job_id)).data) for job_id in job_ids]

    corr, columns = _aggregate_results(results)

    _save_corr_heatmap(corr, columns)
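
For reference, here is a minimal sketch of what _aggregate_results could do, assuming each intermediate result carries the per-node row count, per-column sums and the matrix of cross-product sums for the same ordered columns; the payload keys 'n', 'sum', 'cross' and 'columns' below are assumptions, not the project's actual schema.

import numpy as np


def _aggregate_results_sketch(results):
    """Pool per-node sufficient statistics into a global correlation matrix."""
    columns = results[0]['columns']
    n = sum(r['n'] for r in results)                                # total row count
    s = np.sum([np.array(r['sum']) for r in results], axis=0)       # per-column sums
    sxy = np.sum([np.array(r['cross']) for r in results], axis=0)   # sums of x_i * x_j
    cov = sxy / n - np.outer(s, s) / n ** 2                         # pooled covariance
    std = np.sqrt(np.diag(cov))
    return cov / np.outer(std, std), columns                        # correlation matrix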
def _load_intermediate_data(job_ids):
    """Load intermediate job results into a single pandas DataFrame."""
    jobs_data = [io_helper.get_results(job_id).data for job_id in job_ids]
    # chain all results together, ignore empty results
    data = list(
        itertools.chain(*[json.loads(d)['data'] for d in jobs_data if d]))

    if not data:
        raise errors.UserError(
            'Intermediate jobs {} do not have any data.'.format(job_ids))

    df = pd.DataFrame(data)
    # convert the 'group' lists to tuples so the column values are hashable
    df['group'] = df['group'].map(tuple)
    return df
Example 4
def _load_intermediate_data(job_ids):
    """Load PFA documents from intermediate jobs, logging (but not re-raising) per-job errors."""
    data = []
    for job_id in job_ids:
        job_result = io_helper.get_results(job_id)

        # log errors (e.g. about missing data), but do not reraise them
        if job_result.error:
            logging.warning(job_result.error)
        else:
            pfa = json.loads(job_result.data)
            data.append(pfa)

    if not data:
        raise errors.UserError('All jobs {} returned an error.'.format(job_ids))

    return data
def aggregate_kmeans(job_ids):
    """Compute merging of clusters according to least merging error (e.g. smallest distance betweeen centroids)
    :input job_ids: list of job_ids with intermediate results
    """
    # Read intermediate inputs from jobs
    logging.info("Fetching intermediate data...")
    data = [
        json.loads(io_helper.get_results(str(job_id)).data)
        for job_id in job_ids
    ]

    local_centroids = [
        np.array(x['centroids']) for x in data if x['centroids']
    ]
    indep_vars = data[0]['indep_vars']

    # Aggregate clusters remotely
    remote_centroids = remote.aggregate_clusters(local_centroids)
    logging.info("Centroids:\n{}".format(remote_centroids))

    # Create fake KMeans estimator and assign it our centroids
    estimator = KMeans()
    estimator.cluster_centers_ = np.array(remote_centroids)

    # Generate PFA for kmeans and add centroids to metadata
    featurizer = _create_featurizer(indep_vars)
    types = [(var['name'], var['type']['name']) for var in indep_vars]
    pfa = sklearn_to_pfa(estimator, types, featurizer.generate_pretty_pfa())

    # Add serialized model as metadata
    pfa['metadata'] = {
        'centroids': json.dumps(np.array(remote_centroids).tolist())
    }

    # Save or update job_result
    logging.info('Saving PFA to job_results table')
    pfa = json.dumps(pfa)
    io_helper.save_results(pfa, shapes.Shapes.PFA)
    logging.info("DONE")
Example 6
def main(job_id, generate_pfa):
    """Fit or incrementally update an estimator on local data and save either its PFA document or the serialized estimator."""
    inputs = io_helper.fetch_data()
    dep_var = inputs["data"]["dependent"][0]
    indep_vars = inputs["data"]["independent"]

    if dep_var['type']['name'] in ('polynominal', 'binominal'):
        job_type = 'classification'
    else:
        job_type = 'regression'

    # Get existing results with partial model if they exist
    if job_id:
        job_result = io_helper.get_results(job_id=str(job_id))

        logging.info('Loading existing estimator')
        estimator = deserialize_sklearn_estimator(json.loads(job_result.data)['estimator'])
    else:
        logging.info('Creating new estimator')
        estimator = _create_estimator(job_type)

    # featurization
    featurizer = _create_featurizer(indep_vars, estimator)

    # convert variables into dataframe
    X = io_helper.fetch_dataframe(variables=[dep_var] + indep_vars)
    X = utils.remove_nulls(X, errors='ignore')
    y = X.pop(dep_var['name'])

    X = featurizer.transform(X)

    if len(X) == 0:
        # log error, but still save the estimator
        logging.warning("All data are NULL, cannot fit model")
    else:
        # Train single step
        if hasattr(estimator, 'partial_fit'):
            if job_type == 'classification':
                estimator.partial_fit(X, y, classes=dep_var['type']['enumeration'])
            else:
                estimator.partial_fit(X, y)
        else:
            if not generate_pfa:
                logging.warning('{} does not support partial fit.'.format(estimator))
            if isinstance(estimator, GradientBoostingClassifier) and len(set(y)) == 1:
                raise errors.UserError(
                    'All outputs have a single category ({}); gradient boosting cannot fit that.'.format(y.iloc[0])
                )
            estimator.fit(X, y)

    if generate_pfa:
        # Create PFA from the estimator
        types = [(var['name'], var['type']['name']) for var in indep_vars]

        # Estimator was not trained on any data
        if not _is_fitted(estimator):
            raise errors.UserError('Model was not fitted on any data, cannot generate PFA.')

        pfa = sklearn_to_pfa(estimator, types, featurizer.generate_pretty_pfa())

        # Add serialized model as metadata
        pfa['metadata'] = _estimator_metadata(estimator, X, y, featurizer)

        model_type = parameters.get_parameter('type', str, 'linear_model')
        pfa['name'] = model_type

        # Save or update job_result
        logging.info('Saving PFA to job_results table')
        pfa = json.dumps(pfa)
        io_helper.save_results(pfa, shapes.Shapes.PFA)
    else:
        # Save or update job_result
        logging.info('Saving serialized estimator into job_results table')
        io_helper.save_results(json.dumps(_estimator_metadata(estimator, X, y, featurizer)), shapes.Shapes.JSON)