def aggregate_stats(job_ids, graph_type=None):
    """Get all partial statistics from all nodes and aggregate them.
    :input job_ids: list of job_ids with intermediate results
    """
    # Read intermediate inputs from jobs
    logging.info("Fetching intermediate data...")
    results = io_helper.load_intermediate_json_results(map(str, job_ids))

    corr, columns, crosstab = _aggregate_results(results)

    graph_type = graph_type or parameters.get_parameter(
        'graph', str, 'correlation_heatmap')

    if graph_type == 'correlation_heatmap':
        fig = _fig_corr_heatmap(corr, columns, crosstab)
    elif graph_type == 'pca':
        # save PCA graphs, but leave out the one with PCA scores
        logging.warning(
            'Sample scores graph is not yet implemented for distributed PCA.')
        fig = _fig_pca(corr, columns, X=None)
    else:
        raise errors.UserError(
            'MODEL_PARAM_graph only supports values `correlation_heatmap` and `pca`'
        )

    logging.info("Results:\n{}".format(fig))
    io_helper.save_results(json.dumps(fig), shapes.Shapes.PLOTLY)
    logging.info("DONE")
def compute():
    """Create PFA for kNN."""
    inputs = io_helper.fetch_data()
    dep_var = inputs["data"]["dependent"][0]
    indep_vars = inputs["data"]["independent"]
    params = parameters.fetch_parameters()

    if dep_var['type']['name'] in ('polynominal', 'binominal'):
        job_type = 'classification'
    else:
        job_type = 'regression'

    logging.info('Creating new estimator')
    estimator = _create_estimator(job_type, params)
    featurizer = _create_featurizer(indep_vars)

    # convert variables into dataframe
    X = io_helper.fetch_dataframe(variables=[dep_var] + indep_vars)
    X = utils.remove_nulls(X)
    y = X.pop(dep_var['name'])
    X = featurizer.transform(X)

    # Fit the estimator on the featurized data
    estimator.fit(X, y)

    # Create PFA from the estimator
    types = [(var['name'], var['type']['name']) for var in indep_vars]
    pfa = sklearn_to_pfa(estimator, types, featurizer.generate_pretty_pfa())
    pfa['name'] = "kNN"

    # Save or update job_result
    logging.info('Saving PFA to job_results table...')
    pfa = json.dumps(pfa)
    io_helper.save_results(pfa, shapes.Shapes.PFA)
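
`_create_featurizer` is referenced throughout but never defined in these examples. The sketch below shows one plausible shape for it, assuming nominal variables ('polynominal'/'binominal') are one-hot encoded and continuous ones are passed through; the `generate_pretty_pfa` body is only a placeholder for whatever PrettyPFA snippet the real helper emits.

import pandas as pd

class SketchFeaturizer:
    """Illustrative featurizer: one-hot encodes nominal variables, keeps numeric ones as-is."""

    def __init__(self, variables):
        self.variables = variables
        self.columns = []
        for var in variables:
            if var['type']['name'] in ('polynominal', 'binominal'):
                self.columns += ['{}_{}'.format(var['name'], c)
                                 for c in var['type']['enumeration']]
            else:
                self.columns.append(var['name'])

    def transform(self, df):
        parts = []
        for var in self.variables:
            col = df[var['name']]
            if var['type']['name'] in ('polynominal', 'binominal'):
                dummies = pd.get_dummies(col).reindex(
                    columns=var['type']['enumeration'], fill_value=0)
                dummies.columns = ['{}_{}'.format(var['name'], c) for c in dummies.columns]
                parts.append(dummies)
            else:
                parts.append(col)
        return pd.concat(parts, axis=1).values

    def generate_pretty_pfa(self):
        # placeholder: the real helper emits a PrettyPFA description of this transformation
        return 'new (array of double, {})'.format(', '.join(self.columns))

def _create_featurizer_sketch(indep_vars):
    return SketchFeaturizer(indep_vars)
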
def compute(graph_type=None):
    """Perform both intermediate step and aggregation at once."""
    # Read inputs
    logging.info("Fetching data...")
    inputs = io_helper.fetch_data()
    dep_var = inputs["data"]["dependent"][0]
    indep_vars = inputs["data"]["independent"]

    result = _compute_intermediate_result(inputs)
    corr, columns, crosstab = _aggregate_results([result])

    graph_type = graph_type or parameters.get_parameter(
        'graph', str, 'correlation_heatmap')

    if graph_type == 'correlation_heatmap':
        fig = _fig_corr_heatmap(corr, columns, crosstab)
    elif graph_type == 'pca':
        X = io_helper.fetch_dataframe([dep_var] + indep_vars)
        fig = _fig_pca(corr, columns, X)
    else:
        raise errors.UserError(
            'MODEL_PARAM_graph only supports values `correlation_heatmap` and `pca`'
        )

    logging.info("Results:\n{}".format(fig))
    io_helper.save_results(json.dumps(fig), shapes.Shapes.PLOTLY)
    logging.info("DONE")
def compute():
    """Create PFA for kNN."""
    # Read intermediate inputs from jobs
    logging.info("Fetching intermediate data...")

    inputs = io_helper.fetch_data()
    indep_vars = inputs["data"]["independent"]

    # Extract hyperparameters from ENV variables
    k = parameters.get_param('n_clusters', int, DEFAULT_N_CLUSTERS)

    # featurization
    featurizer = _create_featurizer(indep_vars)

    # convert variables into dataframe
    X = io_helper.fetch_dataframe(variables=indep_vars)
    X = utils.remove_nulls(X, errors='ignore')
    X = featurizer.transform(X)

    estimator = KMeans(n_clusters=k)
    estimator.fit(X)

    # Generate PFA for kmeans
    types = [(var['name'], var['type']['name']) for var in indep_vars]
    pfa = sklearn_to_pfa(estimator, types, featurizer.generate_pretty_pfa())

    # Add centroids as metadata
    pfa['metadata'] = {
        'centroids': json.dumps(estimator.cluster_centers_.tolist())
    }

    # Save or update job_result
    logging.info('Saving PFA to job_results table')
    io_helper.save_results(json.dumps(pfa), shapes.Shapes.PFA)
    logging.info("DONE")
def main():
    # configure logging
    logging.basicConfig(level=logging.INFO)
    logging.info(cf_netSDM)
    # Read inputs
    data = io_helper.fetch_data()['data']
    X = io_helper.fetch_dataframe(data['independent'])
    y = io_helper.fetch_dataframe(data['dependent']).iloc[:, 0]

    if len(X) >= 2000:
        logging.warning(
            'HINMine runs in quadratic time, processing {} samples could be very slow.'
            .format(len(X)))

    normalize = parameters.get_param('normalize', bool, 'True')
    damping = parameters.get_param('damping', float, '0.85')

    if normalize:
        X = X.apply(lambda x: x / np.linalg.norm(x))

    network = construct_adjacency_graph(range(len(X)), X.values, y.values)
    propositionalized = timecall(cf_netSDM.hinmine_propositionalize)(
        network, damping)['train_features']['data']

    results_dict = _construct_results(propositionalized)

    io_helper.save_results(json.dumps(results_dict),
                           shapes.Shapes.TABULAR_DATA_RESOURCE)
def intermediate_stats():
    """Calculate X*X^T, means and count for single node that will be later used to construct covariance matrix."""
    # Read inputs
    logging.info("Fetching data...")
    inputs = io_helper.fetch_data()

    result = _compute_intermediate_result(inputs)
    io_helper.save_results(json.dumps(result), shapes.Shapes.JSON)
    logging.info("DONE")
def _save_corr_heatmap(corr, columns):
    """Generate heatmap from correlation matrix and return it in plotly format"""
    trace = go.Heatmap(z=corr,
                       x=columns,
                       y=columns)
    data = [trace]

    logging.info("Results:\n{}".format(data))
    io_helper.save_results(json.dumps(data), shapes.Shapes.PLOTLY)
    logging.info("DONE")
def intermediate():
    # Read inputs
    inputs = io_helper.fetch_data()
    dep_var = inputs["data"]["dependent"][0]
    indep_vars = inputs["data"]["independent"]

    if not indep_vars:
        raise errors.UserError('No covariables selected.')

    data = io_helper.fetch_dataframe(variables=[dep_var] + indep_vars)
    data = utils.remove_nulls(data, errors='ignore')
    y = data.pop(dep_var['name'])

    featurizer = _create_featurizer(indep_vars)
    X = pd.DataFrame(featurizer.transform(data),
                     columns=featurizer.columns,
                     index=data.index)

    # Distributed linear regression only works for continuous variables
    if utils.is_nominal(dep_var):
        raise errors.UserError(
            'Dependent variable must be continuous in distributed mode. Use SGD Regression for '
            'nominal variables instead.')

    if data.empty:
        logging.warning('All values are NAN, returning zero values')
        result = {
            'summary': {},
            'columns': [],
            'means': 0,
            'X^T * X': 0,
            'count': 0,
            'scale': 0,
        }

    else:
        # Compute linear-regression
        X.insert(loc=0, column='intercept', value=1.)
        lm = OLS(y, X)
        flm = lm.fit()
        logging.info(flm.summary())
        output = format_output(flm)

        result = {
            'summary': output,
            'columns': list(X.columns),
            'means': X.mean().values.tolist(),
            'X^T * X': X.T.values.dot(X.values).tolist(),
            'count': len(X),
            'scale': flm.scale,
        }

    # Store results
    io_helper.save_results(json.dumps(result), 'application/json')
def main():
    # configure logging
    logging.basicConfig(level=logging.INFO)
    logging.info(cf_netSDM)
    # Read inputs
    inputs = io_helper.fetch_data()
    data = inputs['data']
    normalize = parameters.get_param('normalize', bool, 'True')
    damping = parameters.get_param('damping', float, '0.85')
    # Build a (samples x variables) matrix from the independent variables' series
    data_array = np.array([var['series'] for var in data['independent']], dtype=float).T
    if normalize:
        # normalize each column to unit Euclidean norm
        data_array = data_array / np.linalg.norm(data_array, axis=0)
    network = construct_adjacency_graph(range(data_array.shape[0]), data_array,
                                        data['dependent'][0]['series'])
    propositionalized = cf_netSDM.hinmine_propositionalize(
        network, damping)['train_features']['data']
    results_dict = {
        'profile': 'tabular-data-resource',
        'name': 'hinmine-features',
        'data': [],
        'schema': {
            'fields': [],
            'primaryKey': 'id'
        }
    }
    n = propositionalized.shape[0]
    for row_index in range(n):
        instance = {"id": row_index}
        for col_index in range(n):
            instance["feature_%i" %
                     (col_index + 1)] = propositionalized[row_index, col_index]
        results_dict['data'].append(instance)
    for col_index in range(n):
        results_dict['schema']['fields'].append({
            'name': 'feature_%i' % (col_index + 1),
            'type': 'float'
        })
    io_helper.save_results(json.dumps(results_dict), 'text/plain')
def aggregate_knn(job_ids):
    """Get all kNN from all nodes and create one model from them.
    :input job_ids: list of job_ids with intermediate results
    """
    # Read intermediate inputs from jobs
    logging.info("Fetching intermediate data...")
    pfas = io_helper.load_intermediate_json_results(job_ids)

    # Put all PFAs together by combining `points`
    pfa = _combine_knn_pfas(pfas)

    # Save job_result
    logging.info('Saving PFA to job_results table...')
    pfa = json.dumps(pfa)
    logging.info("Results:\n{}".format(pfa))
    io_helper.save_results(pfa, shapes.Shapes.PFA)
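
`_combine_knn_pfas` is not shown; a sketch, assuming each kNN PFA keeps its training samples in a cell named `points` (the actual cell name depends on how sklearn_to_pfa serializes the model):

import copy

def _combine_knn_pfas_sketch(pfas):
    """Merge kNN PFAs by concatenating their stored training points (illustrative sketch only)."""
    combined = copy.deepcopy(pfas[0])
    for pfa in pfas[1:]:
        # hypothetical cell layout: {'cells': {'points': {'init': [...]}}}
        combined['cells']['points']['init'] += pfa['cells']['points']['init']
    return combined
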
def intermediate_stats():
    """Calculate summary statistics for single node."""
    # Read inputs
    logging.info("Fetching data...")
    inputs = io_helper.fetch_data()
    dep_var = inputs["data"]["dependent"][0]
    indep_vars = inputs["data"]["independent"]
    labels = _get_labels(indep_vars + [dep_var])
    types = _get_types(indep_vars + [dep_var])

    if len(dep_var['series']) == 0:
        logging.warning(
            'Dependent variable has no values, check your SQL query.')

    # Load data into a Pandas dataframe
    logging.info("Loading data...")
    df = io_helper.fetch_dataframe(variables=[dep_var] + indep_vars)

    # Generate results
    logging.info("Generating results...")

    group_variables = [
        var['name'] for var in indep_vars if utils.is_nominal(var)
    ]

    # grouped statistics
    data = []
    if group_variables:
        for group_name, group in df.groupby(group_variables):
            # if there's only one nominal column
            if not isinstance(group_name, tuple):
                group_name = (group_name, )

            data += _calc_stats(group, group_name, group_variables, labels,
                                types)

    # overall statistics
    data += _calc_stats(df, ('all', ), [], labels, types)

    logging.info("Results:\n{}".format(data))
    table = {
        'schema': OUTPUT_SCHEMA_INTERMEDIATE,
        'data': data,
    }
    io_helper.save_results(pd.io.json.dumps(table),
                           shapes.Shapes.TABULAR_DATA_RESOURCE)
    logging.info("DONE")
def test_save_results():
    results = json.dumps({'a': 'b'})

    with mock_engine() as engine:
        save_results(results=results, shape=Shapes.JSON)

    assert engine.execute.call_args[1] == {
        'job_id': '1',
        'node': 'test',
        'timestamp': datetime.datetime(2018, 1, 1, 0, 0),
        'data': '{"a": "b"}',
        'error': None,
        'shape': 'application/json',
        'function': 'unit-test',
        'result_name': '',
        'result_title': None,
        'parameters': PARAMETERS
    }
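
The `mock_engine` fixture used above is not shown; one way to write it is a context manager that patches whatever factory `save_results` uses to obtain its SQLAlchemy engine with a `MagicMock`. The patch target `io_helper._get_engine` below is hypothetical.

import contextlib
from unittest import mock

@contextlib.contextmanager
def mock_engine():
    engine = mock.MagicMock()
    # hypothetical patch target; the real path depends on how io_helper creates its engine
    with mock.patch('io_helper._get_engine', return_value=engine):
        yield engine
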
def aggregate_histograms(job_ids):
    """Get all histograms from all nodes and sum them together.
    :input job_ids: list of job_ids with intermediate results
    """
    # Read intermediate inputs from jobs
    logging.info("Fetching intermediate data...")
    data = _load_intermediate_data(job_ids)

    # group by label (e.g. `Histogram - agegroup`)
    results = []
    data = sorted(data, key=lambda d: d['label'])
    for key, hists in itertools.groupby(data, key=lambda d: d['label']):
        hists = list(hists)

        # add data from other histograms
        result = hists[0]
        for hist in hists[1:]:
            hist, result = _align_categories(hist, result)
            assert hist['xAxis']['categories'] == result['xAxis']['categories']

            # use pandas for easier manipulation
            series = {s['name']: np.array(s['data']) for s in result['series']}

            for s in hist['series']:
                if s['name'] not in series:
                    series[s['name']] = s['data']
                else:
                    series[s['name']] += s['data']

            # turn series into original form
            result['series'] = [{
                'name': k,
                'data': list(v)
            } for k, v in series.items()]

        if not INCLUDE_NO_DATA:
            result = _remove_no_data(result)

        results.append(result)

    logging.info("Results:\n{}".format(results))
    io_helper.save_results(json.dumps(results), shapes.Shapes.HIGHCHARTS)
def main():
    """Calculate histogram of dependent variable in a single-node mode and return output in highcharts JSON."""
    try:
        # Read inputs
        inputs = io_helper.fetch_data()
        try:
            dep_var = inputs["data"]["dependent"][0]
        except KeyError:
            logging.warning("Cannot find dependent variables data")
            dep_var = []
        try:
            indep_vars = inputs["data"]["independent"]
        except KeyError:
            logging.warning("Cannot find independent variables data")
            indep_vars = []
        nb_bins = parameters.get_param(BINS_PARAM, int, DEFAULT_BINS)

        # Compute histograms (JSON formatted for HighCharts)
        histograms_results = compute_histograms(dep_var, indep_vars, nb_bins)

        if not INCLUDE_NO_DATA:
            histograms_results = [
                _remove_no_data(hist) for hist in histograms_results
            ]

        # Store results
        io_helper.save_results(json.dumps(histograms_results),
                               shapes.Shapes.HIGHCHARTS)
    except errors.UserError as e:
        logging.error(e)
        strict = parameters.get_boolean_param(STRICT_PARAM, DEFAULT_STRICT)
        if strict:
            # Will be handled by catch_user_error
            raise e
        else:
            # Display something to the user and then exit
            histograms_results = error_histograms(dep_var, indep_vars)
            io_helper.save_results(histograms_results,
                                   shapes.Shapes.HIGHCHARTS)
            utils.exit_on_error()
def aggregate_stats(job_ids):
    """Get all partial statistics from all nodes and aggregate them.
    :input job_ids: list of job_ids with intermediate results
    """
    # Read intermediate inputs from jobs
    logging.info("Fetching intermediate data...")
    df = _load_intermediate_data(job_ids)

    # Aggregate summary statistics
    logging.info("Aggregating results...")
    data = []
    for (group_name, index), gf in df.groupby(['group', 'index']):
        data.append(_agg_stats(gf, group_name, index))

    logging.info("Results:\n{}".format(data))
    table = {
        'schema': OUTPUT_SCHEMA_AGGREGATE,
        'data': data,
    }
    io_helper.save_results(pd.io.json.dumps(table),
                           shapes.Shapes.TABULAR_DATA_RESOURCE)
    logging.info("DONE")
def aggregate_kmeans(job_ids):
    """Compute merging of clusters according to least merging error (e.g. smallest distance betweeen centroids)
    :input job_ids: list of job_ids with intermediate results
    """
    # Read intermediate inputs from jobs
    logging.info("Fetching intermediate data...")
    data = [
        json.loads(io_helper.get_results(str(job_id)).data)
        for job_id in job_ids
    ]

    local_centroids = [
        np.array(x['centroids']) for x in data if x['centroids']
    ]
    indep_vars = data[0]['indep_vars']

    # Aggregate clusters remotely
    remote_centroids = remote.aggregate_clusters(local_centroids)
    logging.info("Centroids:\n{}".format(remote_centroids))

    # Create fake KMeans estimator and assign it our centroids
    estimator = KMeans()
    estimator.cluster_centers_ = np.array(remote_centroids)

    # Generate PFA for kmeans and add centroids to metadata
    featurizer = _create_featurizer(indep_vars)
    types = [(var['name'], var['type']['name']) for var in indep_vars]
    pfa = sklearn_to_pfa(estimator, types, featurizer.generate_pretty_pfa())

    # Add serialized model as metadata
    pfa['metadata'] = {
        'centroids': json.dumps(np.array(remote_centroids).tolist())
    }

    # Save or update job_result
    logging.info('Saving PFA to job_results table')
    pfa = json.dumps(pfa)
    io_helper.save_results(pfa, shapes.Shapes.PFA)
    logging.info("DONE")
def aggregate(job_ids):
    """Get partial regression coefficients together with covaraince matrix from all nodes and combine them into
    single estimate.
    :input job_ids: list of job_ids with intermediate results
    """
    # Read inputs
    inputs = io_helper.fetch_data()
    indep_vars = inputs["data"]["independent"]

    # Read intermediate inputs from jobs
    logging.info("Fetching intermediate data...")
    results = _load_intermediate_data(job_ids)

    # Pool results
    result = _combine_estimates(results)

    # Generate PFA from coefficients
    featurizer = _create_featurizer(indep_vars)
    pfa = _generate_pfa_regressor(result, indep_vars, featurizer)

    # Save job_result
    logging.info('Saving PFA to job_results table...')
    io_helper.save_results(json.dumps(pfa), shapes.Shapes.PFA)
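
`_combine_estimates` is not shown; a common way to pool per-node regression estimates is fixed-effect inverse-variance weighting. The sketch below assumes a hypothetical summary layout in which each node reports `{'coef': ..., 'std_err': ...}` per column.

import numpy as np

def _combine_estimates_sketch(results):
    """Inverse-variance weighted pooling of per-node coefficients (illustrative sketch only)."""
    columns = results[0]['columns']
    combined = {}
    for col in columns:
        coefs = np.array([r['summary'][col]['coef'] for r in results])
        variances = np.array([r['summary'][col]['std_err'] for r in results]) ** 2
        weights = 1.0 / variances
        combined[col] = {
            'coef': float(np.sum(weights * coefs) / np.sum(weights)),
            'std_err': float(np.sqrt(1.0 / np.sum(weights))),
        }
    return combined
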
def main():
    # Configure logging
    logging.basicConfig(level=logging.INFO)

    # Read inputs
    inputs = io_helper.fetch_data()
    dep_var = inputs["data"]["dependent"][0]
    indep_vars = inputs["data"]["independent"]
    design = get_parameter(inputs["parameters"], DESIGN_PARAM)

    # Check dependent variable type (should be continuous)
    if dep_var["type"]["name"] not in ["integer", "real"]:
        logging.warning("Dependent variable should be continuous !")
        return None

    # Extract data and parameters from inputs
    data = format_data(inputs["data"])

    # Compute anova and generate PFA output
    anova_results = format_output(compute_anova(dep_var, indep_vars, data, design).to_dict())

    # Store results
    io_helper.save_results(anova_results, Shapes.JSON)
def main():
    # Configure logging
    logging.basicConfig(level=logging.INFO)

    # Read inputs
    inputs = io_helper.fetch_data()
    dep_var = inputs["data"]["dependent"][0]
    indep_vars = inputs["data"]["independent"]

    # Check dependent variable type (should be continuous)
    if dep_var["type"]["name"] not in ["integer", "real"]:
        logging.warning("Dependent variable should be continuous !")
        return None

    # Extract data and parameters from inputs
    data = format_data(inputs["data"])

    # Compute linear-regression and generate PFA output
    linear_regression_results = format_output(
        compute_linear_regression(dep_var, indep_vars, data))

    # Store results
    io_helper.save_results(linear_regression_results, 'application/json')
def main(clean_files=False):
    """
    :param clean_files: if True, clean files afterwards
    """
    # Read inputs
    inputs = io_helper.fetch_data()
    data = inputs["data"]

    beam = parameters.get_parameter('beam', int, 10)
    support = parameters.get_parameter('support', float, '0.00001')
    out_file = 'input.csv'
    rules_out_file = 'rules.txt'

    matrix, attributes = preprocess.to_matrix(data)
    preprocess.dump_to_csv(matrix, attributes, out_file)

    # Call hedwig with sensible defaults
    examples_file = out_file

    empty_bk = tempfile.mkdtemp()
    call([
        'python', '-m', 'hedwig.__main__', empty_bk, examples_file,
        '--beam', str(beam), '--support', str(support),
        '-f', 'csv', '-l', '-o', rules_out_file, '--nocache'
    ])

    with open(rules_out_file) as f:
        results = f.read()

    if clean_files:
        os.remove(out_file)
        os.remove(rules_out_file)

    io_helper.save_results(results.replace('less_than', '<'),
                           shapes.Shapes.TEXT)
def main():
    # Configure logging
    logging.basicConfig(level=logging.INFO)

    # Read inputs
    inputs = io_helper.fetch_data()
    data = inputs["data"]

    beam = parameters.get_param('beam', int, 10)
    support = parameters.get_param('support', float, '0.00001')
    out_file = 'input.csv'
    rules_out_file = 'rules.txt'

    matrix, attributes = preprocess.to_matrix(data)
    preprocess.dump_to_csv(matrix, attributes, out_file)

    # Call hedwig with sensible defaults
    examples_file = out_file

    empty_bk = tempfile.mkdtemp()
    call([
        'python', '-m', 'hedwig.__main__', empty_bk, examples_file,
        '--beam', str(beam), '--support', str(support),
        '-f', 'csv', '-l', '-o', rules_out_file, '--nocache'
    ])

    with open(rules_out_file) as f:
        results = f.read()
    # TODO: add text/plain to mime types in shapes.Shapes
    io_helper.save_results(results.replace('less_than', '<'), 'text/plain')
def intermediate_kmeans():
    """Calculate k-Means locally."""
    # Read inputs
    logging.info("Fetching data...")
    inputs = io_helper.fetch_data()
    indep_vars = inputs["data"]["independent"]

    # Extract hyperparameters from ENV variables
    k = parameters.get_param('n_clusters', int, DEFAULT_N_CLUSTERS)

    # Load data into a Pandas dataframe
    logging.info("Loading data...")
    X = io_helper.fetch_dataframe(variables=indep_vars)

    # Return variables info, but remove actual data points
    results = {'indep_vars': []}
    for var in indep_vars:
        if var['type']['name'] in ('integer', 'real'):
            new_var = {k: v for k, v in var.items() if k != 'series'}
            mean, std = _get_moments(var)
            new_var['mean'] = mean
            new_var['std'] = std
        else:
            new_var = var

        results['indep_vars'].append(new_var)

    # Drop NaN values
    X = utils.remove_nulls(X, errors='ignore')
    if len(X) == 0:
        logging.warning("All data are NULL, returning empty centroids.")
        results['centroids'] = []
        io_helper.save_results(json.dumps(results), shapes.Shapes.JSON)
        return

    # Generate results
    logging.info("Generating results...")

    # featurization
    featurizer = _create_featurizer(indep_vars)
    X = featurizer.transform(X)

    m, n = X.shape
    num_iter = 0
    not_converged = True

    # Run k-Means locally
    # Have each site compute k initial clusters locally
    local_centroids = local.initialize_own_centroids(X, k)

    # Local Optimization Loop
    while not_converged:
        # Each local site computes its cluster
        cluster_labels = local.compute_clustering(X, local_centroids)
        if OPTIMIZATION == 'lloyd':
            # Computes its local mean if doing lloyd, and updates centroids
            local_means = local.compute_mean(X, cluster_labels, k)
            local_centroids, previous_centroids = local.mean_step(
                local_means, local_centroids)
        elif OPTIMIZATION == 'gradient':
            # Computes the local gradient if doing GD, and takes a GD step
            local_grad = local.compute_gradient(X, cluster_labels,
                                                local_centroids, LR)
            local_centroids, previous_centroids = local.gradient_step(
                local_grad, local_centroids)

        # Check local stopping conditions
        not_converged, local_delta = local.check_stopping(
            local_centroids, previous_centroids, EPSILON)

        num_iter += 1
        logging.info("Single-Shot {} ; iter : {} delta : {}".format(
            OPTIMIZATION, num_iter, local_delta))

    results['centroids'] = [lc.tolist() for lc in local_centroids]

    logging.info("Results:\n{}".format(results))
    io_helper.save_results(json.dumps(results), shapes.Shapes.JSON)
    logging.info("DONE")
def main(job_id, generate_pfa):
    inputs = io_helper.fetch_data()
    dep_var = inputs["data"]["dependent"][0]
    indep_vars = inputs["data"]["independent"]

    if dep_var['type']['name'] in ('polynominal', 'binominal'):
        job_type = 'classification'
    else:
        job_type = 'regression'

    # Get existing results with partial model if they exist
    if job_id:
        job_result = io_helper.get_results(job_id=str(job_id))

        logging.info('Loading existing estimator')
        estimator = deserialize_sklearn_estimator(json.loads(job_result.data)['estimator'])
    else:
        logging.info('Creating new estimator')
        estimator = _create_estimator(job_type)

    # featurization
    featurizer = _create_featurizer(indep_vars, estimator)

    # convert variables into dataframe
    X = io_helper.fetch_dataframe(variables=[dep_var] + indep_vars)
    X = utils.remove_nulls(X, errors='ignore')
    y = X.pop(dep_var['name'])

    X = featurizer.transform(X)

    if len(X) == 0:
        # log error, but still save the estimator
        logging.warning("All data are NULL, cannot fit model")
    else:
        # Train single step
        if hasattr(estimator, 'partial_fit'):
            if job_type == 'classification':
                estimator.partial_fit(X, y, classes=dep_var['type']['enumeration'])
            else:
                estimator.partial_fit(X, y)
        else:
            if not generate_pfa:
                logging.warning('{} does not support partial fit.'.format(estimator))
            if isinstance(estimator, GradientBoostingClassifier) and len(set(y)) == 1:
                raise errors.UserError(
                    'All outputs have single category ({}), Gradient boosting cannot fit that.'.format(y.iloc[0])
                )
            estimator.fit(X, y)

    if generate_pfa:
        # Create PFA from the estimator
        types = [(var['name'], var['type']['name']) for var in indep_vars]

        # Estimator was not trained on any data
        if not _is_fitted(estimator):
            raise errors.UserError('Model was not fitted on any data, cannot generate PFA.')

        pfa = sklearn_to_pfa(estimator, types, featurizer.generate_pretty_pfa())

        # Add serialized model as metadata
        pfa['metadata'] = _estimator_metadata(estimator, X, y, featurizer)

        model_type = parameters.get_parameter('type', str, 'linear_model')
        pfa['name'] = model_type

        # Save or update job_result
        logging.info('Saving PFA to job_results table')
        pfa = json.dumps(pfa)
        io_helper.save_results(pfa, shapes.Shapes.PFA)
    else:
        # Save or update job_result
        logging.info('Saving serialized estimator into job_results table')
        io_helper.save_results(json.dumps(_estimator_metadata(estimator, X, y, featurizer)), shapes.Shapes.JSON)
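
`deserialize_sklearn_estimator` (and the matching serializer used inside `_estimator_metadata`) is not defined in these snippets; one simple realization is to pickle the estimator and base64-encode the bytes so they fit in a JSON field, as sketched below.

import base64
import pickle

def serialize_sklearn_estimator_sketch(estimator):
    # illustrative only; assumes results are stored and read back by trusted code
    return base64.b64encode(pickle.dumps(estimator)).decode('ascii')

def deserialize_sklearn_estimator_sketch(serialized):
    return pickle.loads(base64.b64decode(serialized))
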
def main():
    # Read inputs
    inputs = io_helper.fetch_data()
    dep_var = inputs["data"]["dependent"][0]
    indep_vars = inputs["data"]["independent"]

    if not indep_vars:
        raise errors.UserError('No covariables selected.')

    if utils.is_nominal(dep_var):
        job_type = 'classification'
    else:
        job_type = 'regression'

    data = io_helper.fetch_dataframe(variables=[dep_var] + indep_vars)
    data = utils.remove_nulls(data, errors='ignore')
    y = data.pop(dep_var['name'])

    featurizer = _create_featurizer(indep_vars)
    X = pd.DataFrame(featurizer.transform(data), columns=featurizer.columns)

    if X.empty:
        logging.warning('All values are NAN, returning zero values')
        result = {}
        pfa = None

    else:
        # Add intercept
        X.insert(loc=0, column='intercept', value=1.)

        # Remove linearly dependent columns
        X = X.iloc[:, _independent_columns(X)]

        # Fit regression
        if job_type == 'regression':
            result, metadata = _fit_regression(X, y)

            # Generate PFA for predictions
            pfa = _generate_pfa_regressor(result, indep_vars, featurizer)

        elif job_type == 'classification':
            # Run one-vs-others for each class
            result = {}
            metadata = {}
            for cat in y.cat.categories:
                r, m = _fit_logit(X, y == cat)
                result[cat] = r
                metadata[cat] = m

            if all(result[cat]['intercept']['coef'] is None
                   for cat in y.cat.categories):
                raise errors.UserError(
                    'Not enough data to apply logistic regression.')

            # Generate PFA for predictions
            pfa = _generate_pfa_classifier(result, indep_vars, featurizer,
                                           y.cat.categories)

        # Add metadata from model
        pfa['metadata'] = metadata

        # TODO: save multiple outputs - PFA and coefficients

    # Store results
    io_helper.save_results(json.dumps(result), 'application/json')
def main():
    logging.basicConfig(level=logging.INFO)

    inputs = io_helper.fetch_data()

    # Dependent variable for tsne this might be the labels - this is optional
    labels = None
    dependent = inputs["data"].get("dependent", [])
    indep_vars = inputs["data"]["independent"]  # For tsne the data dimensions

    if not data_types_in_allowed(indep_vars, ["integer", "real"]):
        logging.warning("Independent variables should be continuous !")
        return None
    #
    data = format_independent_data(inputs["data"])
    df = pd.DataFrame.from_dict(data)
    source_dimensions = df.shape[1]  # number of columns
    num_points = df.shape[0]  # number of samples/points

    convdf = df.apply(lambda x: pd.to_numeric(x))
    # Write the data to a temporary file
    f = tempfile.NamedTemporaryFile(delete=False)
    input = convdf.values.astype(np.float32)
    logging.debug('input {}'.format(input))

    # Get the parameters (optional)
    perplexity = 30
    theta = 0.5
    target_dimensions = 2
    iterations = 1000
    do_zscore = True
    dependent_is_label = True

    try:
        perplexity = get_parameter(inputs['parameters'], 'perplexity',
                                   perplexity)
        theta = get_parameter(inputs['parameters'], 'theta', theta)
        target_dimensions = get_parameter(inputs['parameters'],
                                          'target_dimensions',
                                          target_dimensions)
        iterations = get_parameter(inputs['parameters'], 'iterations',
                                   iterations)
        do_zscore_str = get_parameter(inputs['parameters'], 'do_zscore',
                                      str(do_zscore))
        if do_zscore_str == 'True':
            do_zscore = True
        elif do_zscore_str == 'False':
            do_zscore = False
        else:
            raise ValueError
        dependent_is_label_str = get_parameter(inputs['parameters'],
                                               'dependent_is_label',
                                               str(dependent_is_label))
        if dependent_is_label_str == 'True':
            dependent_is_label = True
        elif dependent_is_label_str == 'False':
            dependent_is_label = False
        else:
            raise ValueError

    except ValueError as e:
        logging.error("Could not convert supplied parameter to value, error: ",
                      e)
        raise
    except Exception:
        logging.error(" Unexpected error:", sys.exc_info()[0])
        raise
    # Compute results

    if do_zscore:
        input = scipy.stats.zscore(input)

    if len(dependent) > 0 and dependent_is_label:
        dep_var = dependent[0]
        labels = dep_var["series"]

    input_file_path = f.name
    input.tofile(input_file_path)
    f.close()

    f = tempfile.NamedTemporaryFile(delete=False)
    output_file_path = f.name
    f.close()
    output = a_tsne(input_file_path, output_file_path, num_points,
                    source_dimensions, target_dimensions, perplexity, theta,
                    iterations)

    logging.debug('output shape {}'.format(output.shape))
    logging.debug('output {}'.format(output))
    chart = generate_scatterchart(output, indep_vars, labels, perplexity,
                                  theta, iterations)

    logging.debug("Highchart: %s", chart)
    io_helper.save_results(chart, '', shapes.Shapes.HIGHCHARTS)
    logging.info("Highchart output saved to database.")