Example no. 1
def main():
    # configure logging
    logging.basicConfig(level=logging.INFO)
    logging.info(cf_netSDM)
    # Read inputs
    data = io_helper.fetch_data()['data']
    X = io_helper.fetch_dataframe(data['independent'])
    y = io_helper.fetch_dataframe(data['dependent']).iloc[:, 0]

    if len(X) >= 2000:
        logging.warning(
            'HINMine runs in quadratic time, processing {} samples could be very slow.'
            .format(len(X)))

    normalize = parameters.get_param('normalize', bool, 'True')
    damping = parameters.get_param('damping', float, '0.85')

    if normalize:
        X = X.apply(lambda x: x / np.linalg.norm(x))

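    # Build a network over the samples and propositionalize it with HINMine: damping
    # is the PageRank-style damping factor used during propagation, and each sample
    # ends up described by its propagation scores w.r.t. all samples (an n x n matrix).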
    network = construct_adjacency_graph(range(len(X)), X.values, y.values)
    propositionalized = timecall(cf_netSDM.hinmine_propositionalize)(
        network, damping)['train_features']['data']

    results_dict = _construct_results(propositionalized)

    io_helper.save_results(json.dumps(results_dict),
                           shapes.Shapes.TABULAR_DATA_RESOURCE)
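

# `_construct_results` is not shown above; a minimal sketch of what it is assumed to
# do, based on the inline construction in the variant below (a tabular-data-resource
# dict with one "feature_i" column per propositionalized dimension):
def _construct_results(propositionalized):
    results_dict = {
        'profile': 'tabular-data-resource',
        'name': 'hinmine-features',
        'data': [],
        'schema': {'fields': [], 'primaryKey': 'id'}
    }
    n_rows, n_cols = propositionalized.shape
    for row_index in range(n_rows):
        instance = {'id': row_index}
        for col_index in range(n_cols):
            instance['feature_%i' % (col_index + 1)] = propositionalized[row_index, col_index]
        results_dict['data'].append(instance)
    for col_index in range(n_cols):
        results_dict['schema']['fields'].append({
            'name': 'feature_%i' % (col_index + 1),
            'type': 'float'
        })
    return results_dict
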
def main():
    # configure logging
    logging.basicConfig(level=logging.INFO)
    logging.info(cf_netSDM)
    # Read inputs
    inputs = io_helper.fetch_data()
    data = inputs['data']
    normalize = parameters.get_param('normalize', bool, 'True')
    damping = parameters.get_param('damping', float, '0.85')
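    # Assemble a samples x variables matrix: one column per independent variable,
    # filled from that variable's 'series' values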
    data_array = np.zeros(
        (len(data['independent'][0]['series']), len(data['independent'])))
    col_number = 0
    row_number = 0
    for var in data['independent']:
        for value in var['series']:
            data_array[row_number, col_number] = value
            row_number += 1
        col_number += 1
        row_number = 0
    if normalize:
        # Scale each column (one independent variable) to unit Euclidean norm
        for col_number in range(data_array.shape[1]):
            data_array[:, col_number] /= np.linalg.norm(data_array[:, col_number])
    network = construct_adjacency_graph(range(data_array.shape[0]), data_array,
                                        data['dependent'][0]['series'])
    propositionalized = cf_netSDM.hinmine_propositionalize(
        network, damping)['train_features']['data']
    results_dict = {
        'profile': 'tabular-data-resource',
        'name': 'hinmine-features',
        'data': [],
        'schema': {
            'fields': [],
            'primaryKey': 'id'
        }
    }
    n = propositionalized.shape[0]
    for row_index in range(n):
        instance = {"id": row_index}
        for col_index in range(n):
            instance["feature_%i" %
                     (col_index + 1)] = propositionalized[row_index, col_index]
        results_dict['data'].append(instance)
    for col_index in range(n):
        results_dict['schema']['fields'].append({
            'name': 'feature_%i' % (col_index + 1),
            'type': 'float'
        })
    io_helper.save_results(json.dumps(results_dict), 'text/plain')
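
# For illustration only (hypothetical two-sample run), the saved JSON has this shape:
# {"profile": "tabular-data-resource", "name": "hinmine-features",
#  "data": [{"id": 0, "feature_1": ..., "feature_2": ...},
#           {"id": 1, "feature_1": ..., "feature_2": ...}],
#  "schema": {"fields": [{"name": "feature_1", "type": "float"},
#                        {"name": "feature_2", "type": "float"}],
#             "primaryKey": "id"}}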
def compute():
    """Create PFA for kNN."""
    # Read intermediate inputs from jobs
    logging.info("Fetching intermediate data...")

    inputs = io_helper.fetch_data()
    indep_vars = inputs["data"]["independent"]

    # Extract hyperparameters from ENV variables
    k = parameters.get_param('n_clusters', int, DEFAULT_N_CLUSTERS)

    # featurization
    featurizer = _create_featurizer(indep_vars)

    # convert variables into dataframe
    X = io_helper.fetch_dataframe(variables=indep_vars)
    X = utils.remove_nulls(X, errors='ignore')
    X = featurizer.transform(X)

    estimator = KMeans(n_clusters=k)
    estimator.fit(X)

    # Generate PFA for kmeans
    types = [(var['name'], var['type']['name']) for var in indep_vars]
    pfa = sklearn_to_pfa(estimator, types, featurizer.generate_pretty_pfa())

    # Add centroids as metadata
    pfa['metadata'] = {
        'centroids': json.dumps(estimator.cluster_centers_.tolist())
    }

    # Save or update job_result
    logging.info('Saving PFA to job_results table')
    io_helper.save_results(json.dumps(pfa), shapes.Shapes.PFA)
    logging.info("DONE")
def main():
    """Calculate histogram of dependent variable in a single-node mode and return output in highcharts JSON."""
    # Defaults so the fallback error handling below still has something to work
    # with if fetching the inputs fails
    dep_var = []
    indep_vars = []
    try:
        # Read inputs
        inputs = io_helper.fetch_data()
        try:
            dep_var = inputs["data"]["dependent"][0]
        except KeyError:
            logging.warning("Cannot find dependent variables data")
            dep_var = []
        try:
            indep_vars = inputs["data"]["independent"]
        except KeyError:
            logging.warning("Cannot find independent variables data")
            indep_vars = []
        nb_bins = parameters.get_param(BINS_PARAM, int, DEFAULT_BINS)

        # Compute histograms (JSON formatted for HighCharts)
        histograms_results = compute_histograms(dep_var, indep_vars, nb_bins)

        if not INCLUDE_NO_DATA:
            histograms_results = [
                _remove_no_data(hist) for hist in histograms_results
            ]

        # Store results
        io_helper.save_results(json.dumps(histograms_results),
                               shapes.Shapes.HIGHCHARTS)
    except errors.UserError as e:
        logging.error(e)
        strict = parameters.get_boolean_param(STRICT_PARAM, DEFAULT_STRICT)
        if strict:
            # Will be handled by catch_user_error
            raise e
        else:
            # Display something to the user and then exit
            histograms_results = error_histograms(dep_var, indep_vars)
            io_helper.save_results(histograms_results,
                                   shapes.Shapes.HIGHCHARTS)
            utils.exit_on_error()
def intermediate_kmeans():
    """Calculate k-Means locally."""
    # Read inputs
    logging.info("Fetching data...")
    inputs = io_helper.fetch_data()
    indep_vars = inputs["data"]["independent"]

    # Extract hyperparameters from ENV variables
    k = parameters.get_param('n_clusters', int, DEFAULT_N_CLUSTERS)

    # Load data into a Pandas dataframe
    logging.info("Loading data...")
    X = io_helper.fetch_dataframe(variables=indep_vars)

    # Return variables info, but remove actual data points
    results = {'indep_vars': []}
    for var in indep_vars:
        if var['type']['name'] in ('integer', 'real'):
            new_var = {k: v for k, v in var.items() if k != 'series'}
            mean, std = _get_moments(var)
            new_var['mean'] = mean
            new_var['std'] = std
        else:
            new_var = var

        results['indep_vars'].append(new_var)

    # Drop NaN values
    X = utils.remove_nulls(X, errors='ignore')
    if len(X) == 0:
        logging.warning("All data are NULL, returning empty centroids.")
        results['centroids'] = []
        io_helper.save_results(json.dumps(results), shapes.Shapes.JSON)
        return

    # Generate results
    logging.info("Generating results...")

    # featurization
    featurizer = _create_featurizer(indep_vars)
    X = featurizer.transform(X)

    m, n = X.shape
    num_iter = 0
    not_converged = True

    # Run k-Means locally
    # Have each site compute k initial clusters locally
    local_centroids = local.initialize_own_centroids(X, k)

    # Local Optimization Loop
    while not_converged:
        # Each local site computes its cluster
        cluster_labels = local.compute_clustering(X, local_centroids)
        if OPTIMIZATION == 'lloyd':
            # Computes its local mean if doing lloyd, and updates centroids
            local_means = local.compute_mean(X, cluster_labels, k)
            local_centroids, previous_centroids = local.mean_step(
                local_means, local_centroids)
        elif OPTIMIZATION == 'gradient':
            # Computes the local gradient if doing GD, and takes a GD step
            local_grad = local.compute_gradient(X, cluster_labels,
                                                local_centroids, LR)
            local_centroids, previous_centroids = local.gradient_step(
                local_grad, local_centroids)

        # Check local stopping conditions
        not_converged, local_delta = local.check_stopping(
            local_centroids, previous_centroids, EPSILON)

        num_iter += 1
        logging.info("Single-Shot {} ; iter : {} delta : {}".format(
            OPTIMIZATION, num_iter, local_delta))

    results['centroids'] = [lc.tolist() for lc in local_centroids]

    logging.info("Results:\n{}".format(results))
    io_helper.save_results(json.dumps(results), shapes.Shapes.JSON)
    logging.info("DONE")
import logging
import tempfile
from subprocess import call

from mip_helper import io_helper, parameters

import preprocess

DEFAULT_DOCKER_IMAGE = 'python-jsi-hedwig'

if __name__ == '__main__':
    # Configure logging
    logging.basicConfig(level=logging.INFO)

    # Read inputs
    inputs = io_helper.fetch_data()
    data = inputs["data"]

    beam = parameters.get_param('beam', int, 10)
    support = parameters.get_param('support', float, '0.00001')
    out_file = 'input.csv'
    rules_out_file = 'rules.txt'

    matrix, attributes = preprocess.to_matrix(data)
    preprocess.dump_to_csv(matrix, attributes, out_file)

    # Call hedwig with sensible defaults
    examples_file = out_file

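    # hedwig takes a background-knowledge directory as its first argument; an empty
    # temporary directory is used here to mean "no background knowledge"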
    empty_bk = tempfile.mkdtemp()
    call([
        'python', '-m', 'hedwig.__main__', empty_bk, examples_file, '--beam',
        str(beam), '--support',