Example #1
def main():
    # configure logging
    logging.basicConfig(level=logging.INFO)
    logging.info(cf_netSDM)
    # Read inputs
    data = io_helper.fetch_data()['data']
    X = io_helper.fetch_dataframe(data['independent'])
    y = io_helper.fetch_dataframe(data['dependent']).iloc[:, 0]

    if len(X) >= 2000:
        logging.warning(
            'HINMine runs in quadratic time; processing {} samples could be very slow.'
            .format(len(X)))

    normalize = parameters.get_param('normalize', bool, 'True')
    damping = parameters.get_param('damping', float, '0.85')

    if normalize:
        X = X.apply(lambda x: x / np.linalg.norm(x))

    network = construct_adjacency_graph(range(len(X)), X.values, y.values)
    propositionalized = timecall(cf_netSDM.hinmine_propositionalize)(
        network, damping)['train_features']['data']

    results_dict = _construct_results(propositionalized)

    io_helper.save_results(json.dumps(results_dict),
                           shapes.Shapes.TABULAR_DATA_RESOURCE)
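
For reference, a minimal standalone sketch (not from the original source) of the column-wise normalization step used above, assuming only numpy and pandas with toy data; the real inputs come from io_helper.fetch_dataframe():

# Sketch only: scale each column to unit L2 norm, as in the `normalize` branch above.
import numpy as np
import pandas as pd

X = pd.DataFrame({'a': [3.0, 4.0], 'b': [1.0, 1.0]})
X_norm = X.apply(lambda col: col / np.linalg.norm(col))

print(X_norm)
# Column 'a' becomes [0.6, 0.8]; every column now has unit Euclidean norm.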
def compute():
    """Create PFA for kNN."""
    # Read intermediate inputs from jobs
    logging.info("Fetching intermediate data...")

    inputs = io_helper.fetch_data()
    indep_vars = inputs["data"]["independent"]

    # Extract hyperparameters from ENV variables
    k = parameters.get_param('n_clusters', int, DEFAULT_N_CLUSTERS)

    # featurization
    featurizer = _create_featurizer(indep_vars)

    # convert variables into dataframe
    X = io_helper.fetch_dataframe(variables=indep_vars)
    X = utils.remove_nulls(X, errors='ignore')
    X = featurizer.transform(X)

    estimator = KMeans(n_clusters=k)
    estimator.fit(X)

    # Generate PFA for kmeans
    types = [(var['name'], var['type']['name']) for var in indep_vars]
    pfa = sklearn_to_pfa(estimator, types, featurizer.generate_pretty_pfa())

    # Add centroids as metadata
    pfa['metadata'] = {
        'centroids': json.dumps(estimator.cluster_centers_.tolist())
    }

    # Save or update job_result
    logging.info('Saving PFA to job_results table')
    io_helper.save_results(json.dumps(pfa), shapes.Shapes.PFA)
    logging.info("DONE")
Example #3
def compute():
    """Create PFA for kNN."""
    inputs = io_helper.fetch_data()
    dep_var = inputs["data"]["dependent"][0]
    indep_vars = inputs["data"]["independent"]
    params = parameters.fetch_parameters()

    if dep_var['type']['name'] in ('polynominal', 'binominal'):
        job_type = 'classification'
    else:
        job_type = 'regression'

    logging.info('Creating new estimator')
    estimator = _create_estimator(job_type, params)
    featurizer = _create_featurizer(indep_vars)

    # convert variables into dataframe
    X = io_helper.fetch_dataframe(variables=[dep_var] + indep_vars)
    X = utils.remove_nulls(X)
    y = X.pop(dep_var['name'])
    X = featurizer.transform(X)

    # Train the estimator (nulls were already removed above)
    estimator.fit(X, y)

    # Create PFA from the estimator
    types = [(var['name'], var['type']['name']) for var in indep_vars]
    pfa = sklearn_to_pfa(estimator, types, featurizer.generate_pretty_pfa())
    pfa['name'] = "kNN"

    # Save or update job_result
    logging.info('Saving PFA to job_results table...')
    pfa = json.dumps(pfa)
    io_helper.save_results(pfa, shapes.Shapes.PFA)
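
A hedged sketch of the classification-vs-regression branch, assuming the project's `_create_estimator` wraps scikit-learn's k-nearest-neighbours models; the helper below is hypothetical and only illustrates the idea:

# Hypothetical _create_estimator-style helper for kNN (not the project's actual code).
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor

def make_knn_estimator(job_type, n_neighbors=5):
    if job_type == 'classification':
        return KNeighborsClassifier(n_neighbors=n_neighbors)
    return KNeighborsRegressor(n_neighbors=n_neighbors)

estimator = make_knn_estimator('classification', n_neighbors=3)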
def compute(graph_type=None):
    """Perform both intermediate step and aggregation at once."""
    # Read inputs
    logging.info("Fetching data...")
    inputs = io_helper.fetch_data()
    dep_var = inputs["data"]["dependent"][0]
    indep_vars = inputs["data"]["independent"]

    result = _compute_intermediate_result(inputs)
    corr, columns, crosstab = _aggregate_results([result])

    graph_type = graph_type or parameters.get_parameter(
        'graph', str, 'correlation_heatmap')

    if graph_type == 'correlation_heatmap':
        fig = _fig_corr_heatmap(corr, columns, crosstab)
    elif graph_type == 'pca':
        X = io_helper.fetch_dataframe([dep_var] + indep_vars)
        fig = _fig_pca(corr, columns, X)
    else:
        raise errors.UserError(
            'MODEL_PARAM_graph only supports values `correlation_heatmap` and `pca`'
        )

    logging.info("Results:\n{}".format(fig))
    io_helper.save_results(json.dumps(fig), shapes.Shapes.PLOTLY)
    logging.info("DONE")
Example #5
def intermediate():
    # Read inputs
    inputs = io_helper.fetch_data()
    dep_var = inputs["data"]["dependent"][0]
    indep_vars = inputs["data"]["independent"]

    data = io_helper.fetch_dataframe(variables=[dep_var] + indep_vars)
    data = utils.remove_nulls(data, errors='ignore')
    y = data.pop(dep_var['name'])

    featurizer = _create_featurizer(indep_vars)
    X = pd.DataFrame(featurizer.transform(data),
                     columns=featurizer.columns,
                     index=data.index)

    if not indep_vars:
        raise errors.UserError('No covariables selected.')

    # Distributed linear regression only works for continuous variables
    if utils.is_nominal(dep_var):
        raise errors.UserError(
            'Dependent variable must be continuous in distributed mode. Use SGD Regression for '
            'nominal variables instead.')

    if data.empty:
        logging.warning('All values are NAN, returning zero values')
        result = {
            'summary': {},
            'columns': [],
            'means': 0,
            'X^T * X': 0,
            'count': 0,
            'scale': 0,
        }

    else:
        # Compute linear-regression
        X.insert(loc=0, column='intercept', value=1.)
        lm = OLS(y, X)
        flm = lm.fit()
        logging.info(flm.summary())
        output = format_output(flm)

        result = {
            'summary': output,
            'columns': list(X.columns),
            'means': X.mean().values,
            'X^T * X': X.T.values.dot(X.values),
            'count': len(X),
            'scale': flm.scale,
        }

    # Store results
    io_helper.save_results(json.dumps(result), 'application/json')
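
For context, a small sketch of the OLS fit used above, with the manually inserted intercept column and the `scale` attribute that is shipped in the intermediate result. Data is synthetic; `format_output` is project-specific:

# Sketch: ordinary least squares with an explicit intercept column, as above.
import pandas as pd
from statsmodels.regression.linear_model import OLS

X = pd.DataFrame({'x1': [1.0, 2.0, 3.0, 4.0], 'x2': [0.5, 0.1, 0.9, 0.3]})
y = pd.Series([2.1, 3.9, 6.2, 7.8])

X.insert(loc=0, column='intercept', value=1.)
flm = OLS(y, X).fit()

print(flm.params)   # coefficients, including the intercept
print(flm.scale)    # residual variance estimate, stored as 'scale' above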
def _compute_intermediate_result(inputs):
    dep_var = inputs["data"]["dependent"][0]
    indep_vars = inputs["data"]["independent"]

    nominal_vars = []
    numeric_vars = []
    for var in [dep_var] + indep_vars:
        if utils.is_nominal(var):
            nominal_vars.append(var['name'])
        else:
            numeric_vars.append(var['name'])

    # Load data into a Pandas dataframe
    logging.info("Loading data...")
    X = io_helper.fetch_dataframe(variables=[dep_var] + indep_vars)

    logging.info('Dropping NULL values')
    X = utils.remove_nulls(X, errors='ignore')

    # Generate results
    logging.info("Generating results...")
    result = {
        'columns': numeric_vars,
        'nominal_columns': nominal_vars,
    }
    if len(X):
        result.update({
            'means': X[numeric_vars].mean().values,
            'X^T * X': X[numeric_vars].T.dot(X[numeric_vars].values).values,
            'count': len(X),
        })
        if nominal_vars:
            result['crosstab'] = X[nominal_vars].groupby(nominal_vars).size()\
                                                .reset_index()\
                                                .rename(columns={0: 'count'})\
                                                .to_dict(orient='records')
        else:
            result['crosstab'] = []
    else:
        logging.warning('All values are NAN, returning zero values')
        k = len(result['columns'])
        result.update({
            'means': np.zeros(k),
            'X^T * X': np.zeros((k, k)),
            'count': 0,
            'crosstab': [],
        })
    return result
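
A minimal sketch of the nominal-variable crosstab built above with the groupby/size pattern, on toy data:

# Sketch: count combinations of nominal columns, as in result['crosstab'] above.
import pandas as pd

X = pd.DataFrame({'gender': ['m', 'f', 'f', 'm', 'f'],
                  'diagnosis': ['ad', 'ad', 'cn', 'cn', 'ad']})
nominal_vars = ['gender', 'diagnosis']

crosstab = X[nominal_vars].groupby(nominal_vars).size()\
                          .reset_index()\
                          .rename(columns={0: 'count'})\
                          .to_dict(orient='records')
print(crosstab)
# e.g. [{'gender': 'f', 'diagnosis': 'ad', 'count': 2}, ...]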
Example #7
def intermediate_stats():
    """Calculate summary statistics for single node."""
    # Read inputs
    logging.info("Fetching data...")
    inputs = io_helper.fetch_data()
    dep_var = inputs["data"]["dependent"][0]
    indep_vars = inputs["data"]["independent"]
    labels = _get_labels(indep_vars + [dep_var])
    types = _get_types(indep_vars + [dep_var])

    if len(dep_var['series']) == 0:
        logging.warning(
            'Dependent variable has no values, check your SQL query.')

    # Load data into a Pandas dataframe
    logging.info("Loading data...")
    df = io_helper.fetch_dataframe(variables=[dep_var] + indep_vars)

    # Generate results
    logging.info("Generating results...")

    group_variables = [
        var['name'] for var in indep_vars if utils.is_nominal(var)
    ]

    # grouped statistics
    data = []
    if group_variables:
        for group_name, group in df.groupby(group_variables):
            # if there's only one nominal column
            if not isinstance(group_name, tuple):
                group_name = (group_name, )

            data += _calc_stats(group, group_name, group_variables, labels,
                                types)

    # overall statistics
    data += _calc_stats(df, ('all', ), [], labels, types)

    logging.info("Results:\n{}".format(data))
    table = {
        'schema': OUTPUT_SCHEMA_INTERMEDIATE,
        'data': data,
    }
    io_helper.save_results(pd.io.json.dumps(table),
                           shapes.Shapes.TABULAR_DATA_RESOURCE)
    logging.info("DONE")
def test_fetch_dataframe(mock_read_sql_query):
    data = pd.DataFrame({
        'lefthippocampus': [1., 2.],
        'subjectageyears': [20, 30],
    })
    mock_read_sql_query.return_value = data

    with mock_engine():
        df = fetch_dataframe()

    assert df.to_dict(orient='records') == [{
        'subjectageyears': 20.0,
        'lefthippocampus': 1.0
    }, {
        'subjectageyears': 30.0,
        'lefthippocampus': 2.0
    }]
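
The `mock_read_sql_query` argument above is supplied by a patch decorator; a hedged sketch of that pattern, assuming the helper ultimately reads data with `pandas.read_sql_query` (the patch target below is illustrative, the real module path depends on the project):

# Sketch of the patching pattern behind the test above.
from unittest import mock
import pandas as pd

@mock.patch('pandas.read_sql_query')
def test_read(mock_read_sql_query):
    mock_read_sql_query.return_value = pd.DataFrame({'a': [1, 2]})
    df = pd.read_sql_query('SELECT a FROM t', con=None)
    assert df['a'].tolist() == [1, 2]

test_read()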
def _compute_intermediate_result(inputs):
    dep_var = inputs["data"]["dependent"][0]
    indep_vars = inputs["data"]["independent"]

    # Use only numeric variables
    variables = []
    for var in [dep_var] + indep_vars:
        if utils.is_nominal(var):
            logging.warning(
                'Correlation heatmap works only with numerical types ({} is {})'
                .format(var['name'], var['type']['name']))
        else:
            variables.append(var)

    # Load data into a Pandas dataframe
    logging.info("Loading data...")
    X = io_helper.fetch_dataframe(variables=variables)

    logging.info('Dropping NULL values')
    X = utils.remove_nulls(X, errors='ignore')

    # Generate results
    logging.info("Generating results...")
    if len(X):
        result = {
            'columns': list(X.columns),
            'means': X.mean().values,
            'X^T * X': X.T.dot(X.values).values,
            'count': len(X),
        }
    else:
        logging.warning('All values are NAN, returning zero values')
        k = X.shape[1]
        result = {
            'columns': list(X.columns),
            'means': np.zeros(k),
            'X^T * X': np.zeros((k, k)),
            'count': 0,
        }
    return result
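
These sufficient statistics are enough to rebuild a covariance and correlation matrix during aggregation; a sketch of that reconstruction, assuming the unnormalized X^T X, per-column means and row count returned above:

# Sketch: recover covariance/correlation from the sufficient statistics above.
# cov = (X^T X - n * mean mean^T) / (n - 1), corr = cov / (std std^T)
import numpy as np

X = np.array([[1.0, 2.0], [2.0, 4.1], [3.0, 5.9]])
xtx = X.T.dot(X)
means = X.mean(axis=0)
count = len(X)

cov = (xtx - count * np.outer(means, means)) / (count - 1)
std = np.sqrt(np.diag(cov))
corr = cov / np.outer(std, std)

print(np.allclose(corr, np.corrcoef(X, rowvar=False)))  # True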
def intermediate_kmeans():
    """Calculate k-Means locally."""
    # Read inputs
    logging.info("Fetching data...")
    inputs = io_helper.fetch_data()
    indep_vars = inputs["data"]["independent"]

    # Extract hyperparameters from ENV variables
    k = parameters.get_param('n_clusters', int, DEFAULT_N_CLUSTERS)

    # Load data into a Pandas dataframe
    logging.info("Loading data...")
    X = io_helper.fetch_dataframe(variables=indep_vars)

    # Return variables info, but remove actual data points
    results = {'indep_vars': []}
    for var in indep_vars:
        if var['type']['name'] in ('integer', 'real'):
            new_var = {k: v for k, v in var.items() if k != 'series'}
            mean, std = _get_moments(var)
            new_var['mean'] = mean
            new_var['std'] = std
        else:
            new_var = var

        results['indep_vars'].append(new_var)

    # Drop NaN values
    X = utils.remove_nulls(X, errors='ignore')
    if len(X) == 0:
        logging.warning("All data are NULL, returning empty centroids.")
        results['centroids'] = []
        io_helper.save_results(json.dumps(results), shapes.Shapes.JSON)
        return

    # Generate results
    logging.info("Generating results...")

    # featurization
    featurizer = _create_featurizer(indep_vars)
    X = featurizer.transform(X)

    m, n = X.shape
    num_iter = 0
    not_converged = True

    # Run k-Means locally
    # Have each site compute k initial clusters locally
    local_centroids = local.initialize_own_centroids(X, k)

    # Local Optimization Loop
    while not_converged:
        # Each local site computes its cluster
        cluster_labels = local.compute_clustering(X, local_centroids)
        if OPTIMIZATION == 'lloyd':
            # Computes its local mean if doing lloyd, and updates centroids
            local_means = local.compute_mean(X, cluster_labels, k)
            local_centroids, previous_centroids = local.mean_step(
                local_means, local_centroids)
        elif OPTIMIZATION == 'gradient':
            # Computes the local gradient if doing GD, and takes a GD step
            local_grad = local.compute_gradient(X, cluster_labels,
                                                local_centroids, LR)
            local_centroids, previous_centroids = local.gradient_step(
                local_grad, local_centroids)

        # Check local stopping conditions
        not_converged, local_delta = local.check_stopping(
            local_centroids, previous_centroids, EPSILON)

        num_iter += 1
        logging.info("Single-Shot {} ; iter : {} delta : {}".format(
            OPTIMIZATION, num_iter, local_delta))

    results['centroids'] = [lc.tolist() for lc in local_centroids]

    logging.info("Results:\n{}".format(results))
    io_helper.save_results(json.dumps(results), shapes.Shapes.JSON)
    logging.info("DONE")
Example #11
def main(job_id, generate_pfa):
    inputs = io_helper.fetch_data()
    dep_var = inputs["data"]["dependent"][0]
    indep_vars = inputs["data"]["independent"]

    if dep_var['type']['name'] in ('polynominal', 'binominal'):
        job_type = 'classification'
    else:
        job_type = 'regression'

    # Get existing results with partial model if they exist
    if job_id:
        job_result = io_helper.get_results(job_id=str(job_id))

        logging.info('Loading existing estimator')
        estimator = deserialize_sklearn_estimator(json.loads(job_result.data)['estimator'])
    else:
        logging.info('Creating new estimator')
        estimator = _create_estimator(job_type)

    # featurization
    featurizer = _create_featurizer(indep_vars, estimator)

    # convert variables into dataframe
    X = io_helper.fetch_dataframe(variables=[dep_var] + indep_vars)
    X = utils.remove_nulls(X, errors='ignore')
    y = X.pop(dep_var['name'])

    X = featurizer.transform(X)

    if len(X) == 0:
        # log error, but still save the estimator
        logging.warning("All data are NULL, cannot fit model")
    else:
        # Train single step
        if hasattr(estimator, 'partial_fit'):
            if job_type == 'classification':
                estimator.partial_fit(X, y, classes=dep_var['type']['enumeration'])
            else:
                estimator.partial_fit(X, y)
        else:
            if not generate_pfa:
                logging.warning('{} does not support partial fit.'.format(estimator))
            if isinstance(estimator, GradientBoostingClassifier) and len(set(y)) == 1:
                raise errors.UserError(
                    'All outputs have single category ({}), Gradient boosting cannot fit that.'.format(y.iloc[0])
                )
            estimator.fit(X, y)

    if generate_pfa:
        # Create PFA from the estimator
        types = [(var['name'], var['type']['name']) for var in indep_vars]

        # Estimator was not trained on any data
        if not _is_fitted(estimator):
            raise errors.UserError('Model was not fitted on any data, cannot generate PFA.')

        pfa = sklearn_to_pfa(estimator, types, featurizer.generate_pretty_pfa())

        # Add serialized model as metadata
        pfa['metadata'] = _estimator_metadata(estimator, X, y, featurizer)

        model_type = parameters.get_parameter('type', str, 'linear_model')
        pfa['name'] = model_type

        # Save or update job_result
        logging.info('Saving PFA to job_results table')
        pfa = json.dumps(pfa)
        io_helper.save_results(pfa, shapes.Shapes.PFA)
    else:
        # Save or update job_result
        logging.info('Saving serialized estimator into job_results table')
        io_helper.save_results(
            json.dumps(_estimator_metadata(estimator, X, y, featurizer)),
            shapes.Shapes.JSON)
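
A brief sketch of the `partial_fit` branch above with a scikit-learn estimator that supports it; note that the `classes` argument must list all possible labels on the first call. Data and labels below are synthetic:

# Sketch: incremental training with partial_fit, as in the classification branch above.
import numpy as np
from sklearn.linear_model import SGDClassifier

classes = ['AD', 'CN']
estimator = SGDClassifier(random_state=0)

X_batch = np.array([[0.1, 1.2], [0.9, 0.3]])
y_batch = np.array(['AD', 'CN'])

# The first call needs the full set of classes; later calls can omit it.
estimator.partial_fit(X_batch, y_batch, classes=classes)
print(estimator.predict(X_batch))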
Example #12
def main():
    # Read inputs
    inputs = io_helper.fetch_data()
    dep_var = inputs["data"]["dependent"][0]
    indep_vars = inputs["data"]["independent"]

    if not indep_vars:
        raise errors.UserError('No covariables selected.')

    if utils.is_nominal(dep_var):
        job_type = 'classification'
    else:
        job_type = 'regression'

    data = io_helper.fetch_dataframe(variables=[dep_var] + indep_vars)
    data = utils.remove_nulls(data, errors='ignore')
    y = data.pop(dep_var['name'])

    featurizer = _create_featurizer(indep_vars)
    X = pd.DataFrame(featurizer.transform(data), columns=featurizer.columns)

    if X.empty:
        logging.warning('All values are NAN, returning zero values')
        result = {}
        pfa = None

    else:
        # Add intercept
        X.insert(loc=0, column='intercept', value=1.)

        # Remove linearly dependent columns
        X = X.iloc[:, _independent_columns(X)]

        # Fit regression
        if job_type == 'regression':
            result, metadata = _fit_regression(X, y)

            # Generate PFA for predictions
            pfa = _generate_pfa_regressor(result, indep_vars, featurizer)

        elif job_type == 'classification':
            # Run one-vs-others for each class
            result = {}
            metadata = {}
            for cat in y.cat.categories:
                r, m = _fit_logit(X, y == cat)
                result[cat] = r
                metadata[cat] = m

            if all(result[cat]['intercept']['coef'] is None
                   for cat in y.cat.categories):
                raise errors.UserError(
                    'Not enough data to apply logistic regression.')

            # Generate PFA for predictions
            pfa = _generate_pfa_classifier(result, indep_vars, featurizer,
                                           y.cat.categories)

        # Add metadata from model
        pfa['metadata'] = metadata

        # TODO: save multiple outputs - PFA and coefficients

    # Store results
    io_helper.save_results(json.dumps(result), 'application/json')
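
A minimal sketch of the one-vs-rest loop above, fitting one binary logistic regression per category; it uses statsmodels Logit on toy data, while `_fit_logit` itself is project-specific:

# Sketch: one-vs-rest logistic regression, one binary fit per category.
import pandas as pd
import statsmodels.api as sm

X = pd.DataFrame({'intercept': 1.0,
                  'x1': [0.1, 0.4, 0.9, 1.2, 0.2, 1.1]})
y = pd.Series(['a', 'a', 'b', 'a', 'b', 'b'], dtype='category')

coefs = {}
for cat in y.cat.categories:
    # Binary target: current category vs. all others
    model = sm.Logit((y == cat).astype(float), X).fit(disp=0)
    coefs[cat] = model.params.to_dict()
print(coefs)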