Example #1
def serialize_to_json(data):
    info(f'Serializing type {type(data)} to json')

    if isinstance(data, (pd.DataFrame, pd.Series)):
        return _serialize_pandas(data)

    return _default_serialization(data)
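A minimal illustration of what the two branches produce, assuming `_serialize_pandas` and `_default_serialization` behave as in the helpers shown further below (pandas `to_json` for frames and series, `json.dumps` otherwise):

import json

import pandas as pd

df = pd.DataFrame({'age': [34, 51]})

# pandas branch: DataFrames and Series are serialized via .to_json()
print(df.to_json().encode())                 # b'{"age":{"0":34,"1":51}}'

# default branch: plain Python objects go through json.dumps
print(json.dumps({'mean': 42.5}).encode())   # b'{"mean": 42.5}'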
Example #2
def column_names(client: ContainerClient,
                 data,
                 *args,
                 exclude_orgs=(),
                 **kwargs):
    """Master algoritm.

    Ask all nodes for their column names and combines them in one set.
    """

    info(
        f'Calling column names tasks on all organizations within collaboration except {exclude_orgs}'
    )
    results = _dispatch_tasks(client,
                              data,
                              *args,
                              method='column_names',
                              exclude_orgs=exclude_orgs,
                              **kwargs)

    # Chain all per-node column lists together and turn them into a set
    # to remove duplicates
    column_set = set(chain.from_iterable(results))

    info("master algorithm complete")

    # return the combined set of column names
    return column_set
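A small sketch of the combination step, with made-up per-node results standing in for what the `column_names` node tasks return:

from itertools import chain

# Hypothetical per-node results: each node returns its own list of column names
results = [
    ['id', 'age', 'sex'],
    ['id', 'age', 'diagnosis'],
]

# chain.from_iterable flattens the per-node lists; set() removes duplicates
column_set = set(chain.from_iterable(results))
print(column_set)  # {'id', 'age', 'sex', 'diagnosis'} (order not guaranteed)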
Example #3
def RPC_get_data(data: pd.DataFrame, *args, **kwargs):
    """
    Return the raw data.
    TODO: This function should not exist in the final version of the code! The data should be pseudonymized at the very
        least!

    """
    info(f'Returning raw data with {len(data)} records')
    return data
Example #4
def RPC_column_names(data: pd.DataFrame, *args, **kwargs):
    """Column names

    List the names of the table columns
    """
    info("Retrieving column names")

    # what you return here is sent to the central server. So make sure
    # no privacy sensitive data is shared
    return data.columns.to_list()
Example #5
def RPC_some_example_method(data, *args, **kwargs):
    """Some_example_method.

    Do a computation on the data local to this node and send it back to the
    central server for further processing.

    In this case, take mean `Age` on groups of different `Sex`
    """
    info("Computing mean age for males and females")
    result = data.groupby("Sex").Age.aggregate(['count', 'mean'])

    # what you return here is sent to the central server. So make sure
    # no privacy sensitive data is shared
    return result.to_dict()
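A standalone sketch of the node-side computation on a made-up DataFrame, to show the shape of the dictionary that is sent back:

import pandas as pd

# Toy data standing in for a node's local dataset
data = pd.DataFrame({
    'Sex': ['M', 'F', 'M', 'F'],
    'Age': [40, 35, 50, 45],
})

result = data.groupby('Sex').Age.aggregate(['count', 'mean'])
print(result.to_dict())
# {'count': {'F': 2, 'M': 2}, 'mean': {'F': 40.0, 'M': 45.0}}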
Example #6
def _combine_all_node_data(client, data, merge_keys, *args,
                           **kwargs) -> pd.DataFrame:
    results = _dispatch_tasks(client, data, *args, method='get_data', **kwargs)

    for r in results:
        info(f'Retrieved node data with shape {r.shape}')

    combined_df = _merge_multiple_dfs(results, on=merge_keys)
    info(','.join(combined_df.columns))

    info(f'Joined table has shape {combined_df.shape}')

    # Drop rows with duplicate identifiers. These occur when the identifying
    # columns contain duplicate values (i.e. multiple entries for the same
    # person, or different persons with the same name & birthdate)
    len_before_drop = len(combined_df)
    combined_df = combined_df.drop_duplicates(keep=False, subset=merge_keys)
    n_dropped_rows = len_before_drop - len(combined_df)
    info(f'Dropped {n_dropped_rows} rows with duplicate identifiers')

    if len(combined_df.index) < MIN_RECORDS:
        raise ValueError(
            f'Only {len(combined_df.index)} records available for analysis! Privacy is not ensured.'
        )

    return combined_df
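Note that `drop_duplicates(keep=False, ...)` removes every row whose identifiers occur more than once, rather than keeping one copy. A minimal sketch with made-up identifiers:

import pandas as pd

merge_keys = ['name', 'birthdate']
combined_df = pd.DataFrame({
    'name':      ['alice', 'alice', 'bob'],
    'birthdate': ['1970-01-01', '1970-01-01', '1980-02-02'],
    'value':     [1, 2, 3],
})

# keep=False drops both 'alice' rows: ambiguous identifiers are removed entirely
print(combined_df.drop_duplicates(keep=False, subset=merge_keys))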
Example #7
def fit_pipeline(client: ContainerClient,
                 data,
                 pipe: Pipeline,
                 features: List[str],
                 target: str,
                 merge_keys=None,
                 *args,
                 **kwargs):
    """
    Retrieve data from nodes and train data analysis pipeline on it. Returns the performance of the resulting model.
    TODO: How and where do we save our model?

    :param client: Client for accessing Vantage6 proxy server. Is a parameter for all master algorithms
    :param data: Data from datastation as Pandas DataFrame. Is handled by wrapper
    :param pipe: A sklearn pipeline containing one or multiple data transformations. Should have a `fit` and
                        `predict` method.
    :param features: The features that should be used in the fitting of the pipeline.
    :param target: The field that should be used as target for the machine learning algorithm.
    :param merge_keys: The identifying fields for joining datasets.
    :param args:
    :param kwargs:
    :return:
    """
    pipe = pipeline.reconstruct_pipeline(pipe)
    try:
        info(f'Training pipeline with the following steps: {pipe.named_steps}')
        results = _combine_all_node_data(client, data, merge_keys, *args,
                                         **kwargs)

        X = results[features].values
        y = results[target].values

        # Split data
        # TODO: Make splitting of dataset controllable from client-side
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, random_state=RANDOM_SEED)

        pipe.fit(X_train, y_train)
        predictions = pipe.predict(X_test)

        # TODO: Make metrics configurable
        score = metrics.mean_absolute_error(y_test, predictions)

        return score
    except Exception:
        # Log the stack trace; the function will return None in this case
        traceback.print_exc()
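The body of the `try` block is plain scikit-learn; here is a standalone sketch of the same train/evaluate flow on synthetic data (the pipeline steps and the `RANDOM_SEED` value are arbitrary assumptions, not the module's own constants):

import numpy as np
from sklearn import metrics
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

RANDOM_SEED = 42  # placeholder value

# Synthetic regression data standing in for the combined node data
rng = np.random.default_rng(RANDOM_SEED)
X = rng.normal(size=(100, 3))
y = X @ np.array([1.0, -2.0, 0.5]) + rng.normal(scale=0.1, size=100)

pipe = Pipeline([('scale', StandardScaler()), ('model', LinearRegression())])

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=RANDOM_SEED)
pipe.fit(X_train, y_train)
predictions = pipe.predict(X_test)

print(metrics.mean_absolute_error(y_test, predictions))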
Example #8
def dispact_rpc(data, input_data, module, token):
    """Import the algorithm module and run the requested method."""

    # import algorithm module
    try:
        lib = importlib.import_module(module)
        info(f"Module '{module}' imported!")
    except ModuleNotFoundError:
        warn(f"Module '{module}' cannot be imported!")
        exit(1)

    # in case of a master container, we have to do a little extra
    master = input_data.get("master")
    if master:
        info("Running a master-container")
        # read env
        host = os.environ["HOST"]
        port = os.environ["PORT"]
        api_path = os.environ["API_PATH"]

        # init Docker Client
        client = ContainerClient(token=token,
                                 host=host,
                                 port=port,
                                 path=api_path)

        # read the JWT token to log the collaboration id. The
        # ContainerClient automatically sets the collaboration_id

        claims = jwt.decode(token, verify=False)
        id_ = claims["identity"]["collaboration_id"]
        info(f"Working with collaboration_id <{id_}>")

        method_name = input_data["method"]

    else:
        info("Running a regular container")
        method_name = f"RPC_{input_data['method']}"

    # attempt to load the method
    try:
        method = getattr(lib, method_name)
    except AttributeError:
        warn(f"method '{method_name}' not found!\n")
        exit(1)

    # get the args and kwargs input for this function.
    args = input_data.get("args", [])
    kwargs = input_data.get("kwargs", {})

    # try to run the method
    try:
        result = method(client, data, *args, **kwargs) if master else \
                 method(data, *args, **kwargs)
    except Exception as e:
        warn(f"Error encountered while calling {method_name}: {e}")
        exit(1)

    return result
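A minimal sketch of the name-resolution step: node tasks are looked up with an `RPC_` prefix, master tasks by their plain name. The module here is created on the fly purely for illustration:

import types

# Stand-in for an imported algorithm module
lib = types.ModuleType('my_algorithm')
lib.RPC_column_names = lambda data, *args, **kwargs: list(data)

input_data = {'method': 'column_names', 'master': False}

method_name = input_data['method'] if input_data.get('master') \
    else f"RPC_{input_data['method']}"
method = getattr(lib, method_name)

print(method(['age', 'sex']))  # ['age', 'sex']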
Example #9
def load_input(input_file):
    """
    Try to read the specified data format and deserialize the rest of the
    stream accordingly. If this fails, assume the data format is pickle.
    :param input_file:
    :return:
    """
    with open(input_file, "rb") as fp:
        try:
            input_data = _read_formatted(fp)
        except DeserializationException:
            info('No data format specified. '
                 'Assuming input data is pickle format')
            fp.seek(0)
            try:
                input_data = pickle.load(fp)
            except pickle.UnpicklingError:
                raise DeserializationException('Could not deserialize input')
    return input_data
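The legacy fallback path can be exercised with a plain pickled file; the format-prefix handling lives in `_read_formatted`, which is not shown here, so this sketch only covers the pickle branch:

import pickle
import tempfile

input_data = {'method': 'column_names', 'args': [], 'kwargs': {}}

# Write a legacy input file: pickled payload, no format prefix
with tempfile.NamedTemporaryFile(delete=False) as fp:
    pickle.dump(input_data, fp)
    input_file = fp.name

# load_input() would fail to find a format marker, seek back to the start of
# the stream and fall back to pickle.load, recovering the original dict
with open(input_file, 'rb') as fp:
    print(pickle.load(fp))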
Example #10
def _dispatch_tasks(client: ContainerClient,
                    data,
                    method,
                    *args,
                    exclude_orgs=(),
                    **kwargs):
    """
    Generic master algorithm
    """
    tries = kwargs.get('tries', NUM_TRIES)

    # Get all organizations (ids) that are within the collaboration
    # FlaskIO knows the collaboration to which the container belongs
    # as this is encoded in the JWT (Bearer token)
    organizations = client.get_organizations_in_my_collaboration()

    info(f'Organizations in my collaboration: {organizations}')

    ids = map(lambda x: x['id'], organizations)
    ids = filter(lambda x: x not in exclude_orgs, ids)
    ids = list(ids)

    info(
        f'Dispatching task to organizations with ids {ids}.\n{exclude_orgs} will be excluded.'
    )

    # The input for the algorithm is the same for all organizations
    # in this case
    info("Defining input parameters")
    input_ = {
        "method": method,
    }

    # create a new task for all organizations in the collaboration.
    info("Dispatching node-tasks")
    task = client.create_new_task(input_=input_, organization_ids=list(ids))

    return _get_results(client, tries, task)
Example #11
def _get_results(client, tries, task):
    """
    Check up to n times if a task has completed, return the results if possible. Otherwise, raise an exception.
    """
    # Wait for node to return results. Instead of polling it is also
    # possible to subscribe to a websocket channel to get status
    # updates
    info("Waiting for results")
    task_id = task.get("id")
    for _ in range(tries):
        task = client.get_task(task_id)
        if task.get('complete'):
            break

        info("Waiting for results")
        time.sleep(1)
    # Raise an exception if the task has still not completed
    if not task.get('complete'):
        raise Exception(f'Task timeout\ntask id: {task_id}')
    info("Obtaining results")
    results = client.get_results(task_id=task.get("id"))
    return results
Example #12
def _default_serialization(data):
    info('Using default json serialization')
    return json.dumps(data).encode()
Example #13
def docker_wrapper(module: str):
    """
    Wrap an algorithm module to provide input and output handling for the
    vantage6 infrastructure.

    Data is received in the form of files, whose location should be specified
    in the following environment variables:
    - `INPUT_FILE`: input arguments for the algorithm
    - `OUTPUT_FILE`: location where the results of the algorithm should be
      stored
    - `TOKEN_FILE`: access token for the vantage6 server REST api
    - `DATABASE_URI`: either a database endpoint or path to a csv file.

    The wrapper is able to parse a number of input file formats. The available
    formats can be found in `vantage6.tools.data_format.DataFormat`. When the
    input is not pickle (legacy), the format should be specified in the first
    bytes of the input file, followed by a '.'.

    It is also possible to specify the desired output format. This is done by
    including the parameter 'output_format' in the input parameters. Again, the
    list of possible output formats can be found in
    `vantage6.tools.data_format.DataFormat`.

    It is still possible that output serialization will fail even if the
    specified format is listed in the DataFormat enum. Algorithms can in
    principle return any python object, but not every serialization format will
    support arbitrary python objects. When dealing with unsupported algorithm
    output, the user should use 'pickle' as output format, which is the
    default.

    The other serialization formats support the following algorithm output:
    - built-in primitives (int, float, str, etc.)
    - built-in collections (list, dict, tuple, etc.)
    - pandas DataFrames

    :param module: module that contains the vantage6 algorithms
    :return:
    """
    info(f"wrapper for {module}")

    # read input from the mounted inputfile.
    input_file = os.environ["INPUT_FILE"]
    info(f"Reading input file {input_file}")

    input_data = load_input(input_file)

    # All containers receive a token. It is usually only used by the master
    # method, but regular containers can use it as well, for example to find
    # out the node_id.
    token_file = os.environ["TOKEN_FILE"]
    info(f"Reading token file '{token_file}'")
    with open(token_file) as fp:
        token = fp.read().strip()

    data_file = os.environ["DATABASE_URI"]
    info(f"Using '{data_file}' as database")
    data = pandas.read_csv(data_file)

    # make the actual call to the method/function
    info("Dispatching ...")
    output = dispact_rpc(data, input_data, module, token)

    # Write the output from the method to the mounted output file, which will
    # be transferred back to the server by the node instance.
    output_file = os.environ["OUTPUT_FILE"]
    info(f"Writing output to {output_file}")

    output_format = input_data.get('output_format', None)
    write_output(output_format, output, output_file)
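A sketch of the format marker described in the docstring: the format name occupies the first bytes of the file, terminated by a '.', and the serialized payload follows. How the wrapper itself reads and writes this marker is assumed here, not copied from the library:

import json

# Writing: prefix the serialized payload with its format name and a '.'
payload = json.dumps({'method': 'column_names'}).encode()
blob = b'json.' + payload

# Reading: split on the first '.' to recover the format, then deserialize
marker, _, body = blob.partition(b'.')
print(marker.decode().lower())  # 'json'
print(json.loads(body))         # {'method': 'column_names'}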
Example #14
def serialize_to_pickle(data):
    info('Serializing to pickle')
    return pickle.dumps(data)
Example #15
def master(client, db_client, columns, functions):
    """
    Master algorithm to compute a summary of the federated datasets.

    Parameters
    ----------
    client : ContainerClient
        Interface to the central server. This is supplied by the wrapper.
    db_client : DBClient
        The database client.
    columns : List
        List containing the columns and information needed.

    Returns
    -------
    Dict
        A dictionary containing summary statistics for the chosen columns of the
        dataset.
    """
    # Validating the input
    info("Validating the input arguments")
    if isinstance(columns, list):
        for column in columns:
            if not all(parameter in column
                       for parameter in [VARIABLE, TABLE]):
                warn("Missing information in the input argument")
                return None
            # check which functions to run
            if FUNCTIONS not in column:
                if functions:
                    column[FUNCTIONS] = functions
                else:
                    column[FUNCTIONS] = list(AGGREGATORS.keys())
            # Check if it supports all functions
            unsupported_functions = [
                function for function in column[FUNCTIONS]
                if function not in AGGREGATORS.keys()
            ]
            if len(unsupported_functions) > 0:
                warn(
                    f"Unsupported functions: {', '.join(unsupported_functions)}"
                )
                return None

            column[REQUIRED_FUNCTIONS] = {
                r_function for function in column[FUNCTIONS]
                for r_function in FUNCTION_MAPPING[function]
            }
    else:
        warn("Invalid format for the input argument")
        return None

    # define the input for the summary algorithm
    info("Defining input parameters")
    input_ = {"method": "summary", "args": [], "kwargs": {"columns": columns}}

    # obtain organizations that are within my collaboration
    info("Obtaining the organizations in the collaboration")
    organizations = client.get_organizations_in_my_collaboration()
    ids = [organization.get("id") for organization in organizations]

    # The collaboration and image are stored in the key, so we do not need
    # to specify them
    info("Creating node tasks")
    task = client.create_new_task(input_, organization_ids=ids)

    # wait for all results
    # TODO subscribe to websocket, to avoid polling
    task_id = task.get("id")
    task = client.request(f"task/{task_id}")
    while not task.get("complete"):
        task = client.request(f"task/{task_id}")
        info("Waiting for results")
        time.sleep(1)

    info("Obtaining results")
    results = client.get_results(task_id=task.get("id"))

    info("Check if any exception occurred")
    if any([ERROR in result for result in results]):
        warn("Encountered an error, please review the parameters")
        return [result[ERROR] for result in results if ERROR in result]

    # process the output
    info("Process the node results")
    summary = {}

    for column in columns:
        summary[column[VARIABLE]] = {}
        nodes_summary = [result[column[VARIABLE]] for result in results]
        for function in column[FUNCTIONS]:
            summary[column[VARIABLE]][function] = AGGREGATORS[function](
                nodes_summary)

    return summary
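A sketch of what the `columns` argument might look like. The key constants (`VARIABLE`, `TABLE`, `FUNCTIONS`) are defined elsewhere in the package; the string values below are placeholders for illustration only:

# Placeholder values; the real constants live elsewhere in the package
VARIABLE, TABLE, FUNCTIONS = 'variable', 'table', 'functions'

columns = [
    {
        VARIABLE: 'age',            # column to summarize
        TABLE: 'patients',          # table the column lives in
        FUNCTIONS: ['min', 'max'],  # subset of the supported aggregators
    },
    # A column without FUNCTIONS falls back to the `functions` argument,
    # or to every supported aggregator when that is empty
    {VARIABLE: 'length_of_stay', TABLE: 'admissions'},
]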
Example #16
def master(client, data, *args, **kwargs):
    """Master algoritm.

    The master algorithm is the chair of the Round Robin, which makes
    sure everyone waits for their turn to identify themselfs.
    """

    # get all organizations (ids) that are within the collaboration
    # FlaskIO knows the collaboration to which the container belongs
    # as this is encoded in the JWT (Bearer token)
    organizations = client.get_organizations_in_my_collaboration()
    ids = [organization.get("id") for organization in organizations]

    # The input for the algorithm is the same for all organizations
    # in this case
    info("Defining input parameters")
    input_ = {
        "method": "some_example_method",
    }

    # create a new task for all organizations in the collaboration.
    info("Dispatching node-tasks")
    task = client.create_new_task(input_=input_, organization_ids=ids)

    # wait for node to return results. Instead of polling it is also
    # possible to subscribe to a websocket channel to get status
    # updates
    info("Waiting for resuls")
    task_id = task.get("id")
    task = client.get_task(task_id)
    while not task.get("complete"):
        task = client.get_task(task_id)
        info("Waiting for results")
        time.sleep(1)

    info("Obtaining results")
    results = client.get_results(task_id=task.get("id"))
    print(results)

    # combine all results into one dataframe
    dfs = [pd.DataFrame.from_dict(res) for res in results]
    res_df = pd.concat(dfs, keys=range(len(results)))

    # Calculate overall mean over all nodes
    res_df['total_mean'] = res_df['count'] * res_df['mean']
    res_total = pd.DataFrame(res_df.groupby(level=1).sum())
    res_total['mean'] = res_total['total_mean'] / res_total['count']

    info("master algorithm complete")

    # return the aggregated mean per group
    return res_total['mean'].to_dict()
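The aggregation at the end is a count-weighted mean. A standalone sketch with two made-up node results of the shape returned by `RPC_some_example_method`:

import pandas as pd

# Hypothetical per-node results
results = [
    {'count': {'F': 2, 'M': 2}, 'mean': {'F': 40.0, 'M': 45.0}},
    {'count': {'F': 1, 'M': 3}, 'mean': {'F': 30.0, 'M': 55.0}},
]

dfs = [pd.DataFrame.from_dict(res) for res in results]
res_df = pd.concat(dfs, keys=range(len(results)))

# Weight each node's mean by its count, sum per group, divide by total count
res_df['total_mean'] = res_df['count'] * res_df['mean']
res_total = res_df.groupby(level=1).sum()
res_total['mean'] = res_total['total_mean'] / res_total['count']
print(res_total['mean'].to_dict())  # {'F': 36.666..., 'M': 51.0}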
Example #17
def master(client, data):
    """Master algorithm.
    The master algorithm is the chair of the Round Robin, which makes
    sure everyone waits for their turn to identify themselves.
    """
    # Info messages can help you when an algorithm crashes. These info
    # messages are stored in a log file which is sent to the server when
    # a task either finishes or crashes.
    info('Collecting participating organizations')

    # Collect all organizations that participate in this collaboration.
    # These organizations will receive the task to compute the partial.
    organizations = client.get_organizations_in_my_collaboration()
    ids = [organization.get("id") for organization in organizations]

    # Determine the device to train on
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    # clear cuda memory
    torch.cuda.empty_cache()

    # Initialize the model and send the server's parameters to all workers
    model = Net().to(device)

    # Train without federated averaging
    info('Train_test')
    task = client.create_new_task(
        input_={
            'method': 'train_test',
            'kwargs': {
                'parameters': model.parameters(),
                'model': model,
                'device': device,
                'log_interval': 10,
                'local_dp': True,
                'return_params': True,
                'epoch': 2,
                # 'round': 1,
                'delta': 1e-5,
                'if_test': False
            }
        },
        organization_ids=ids)

    info("Waiting for parameters")
    task_id = task.get("id")
    task = client.get_task(task_id)
    while not task.get("complete"):
        task = client.get_task(task_id)
        info("Waiting for results")
        time.sleep(1)

    # Once we know the partials are complete, we can collect them.
    info("Obtaining parameters from all nodes")

    results_train = client.get_results(task_id=task.get("id"))

    global_sum = 0
    global_count = 0

    # Sum the parameters returned by each node
    for output in results_train:
        global_sum += output["params"]
        global_count += len(global_sum)

    averaged_parameters = global_sum / global_count

    # info("Averaged parameters")
    # for parameters in averaged_parameters:
    #     print(parameters)
    """
    in order to not have the optimizer see the new parameters as a non-leaf tensor, .clone().detach() needs
    to be applied in order to turn turn "grad_fn=<DivBackward0>" into "grad_fn=True"
    """

    averaged_parameters = [averaged_parameters.clone().detach()]

    torch.cuda.empty_cache()

    info('Federated averaging w/ averaged_parameters')
    task = client.create_new_task(
        input_={
            'method': 'train_test',
            'kwargs': {
                'parameters': averaged_parameters,
                'model': output['model'],
                'device': device,
                'log_interval': 10,
                'local_dp': False,
                'return_params': True,
                'epoch': 1,
                # 'round': 1,
                'delta': 1e-5,
                'if_test': True
            }
        },
        organization_ids=ids)

    # Wait for the evaluation task to complete before collecting its results
    task_id = task.get("id")
    while not client.get_task(task_id).get("complete"):
        info("Waiting for results")
        time.sleep(1)

    results = client.get_results(task_id=task_id)
    for output in results:
        acc = output["test_accuracy"]
    return acc
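A minimal sketch of the parameter-averaging idea with plain tensors; it assumes, as the loop above does, that every node returns equally shaped parameters under the "params" key:

import torch

# Hypothetical parameter tensors returned by two nodes
node_params = [
    torch.tensor([0.2, 0.4, 0.6]),
    torch.tensor([0.4, 0.6, 0.8]),
]

# Element-wise average across nodes
averaged_parameters = sum(node_params) / len(node_params)

# Detach so the optimizer sees a leaf tensor instead of a division result
averaged_parameters = averaged_parameters.clone().detach()
print(averaged_parameters)  # tensor([0.3000, 0.5000, 0.7000])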
Example #18
def docker_wrapper(module: str):
    """Wrap an algorithm module using pickle for input and output."""
    info(f"wrapper for {module}")

    # read input from the mounted inputfile.
    input_file = os.environ["INPUT_FILE"]
    info(f"Reading input file {input_file}")

    with open(input_file, "rb") as fp:
        input_data = pickle.load(fp)

    # All containers receive a token. It is usually only used by the master
    # method, but regular containers can use it as well, for example to find
    # out the node_id.
    token_file = os.environ["TOKEN_FILE"]
    info(f"Reading token file '{token_file}'")
    with open(token_file) as fp:
        token = fp.read().strip()

    data_file = os.environ["DATABASE_URI"]
    info(f"Using '{data_file}' as database")
    data = pandas.read_csv(data_file)

    # make the actual call to the method/function
    info("Dispatching ...")
    output = dispact_rpc(data, input_data, module, token)

    # Write the output from the method to the mounted output file, which will
    # be transferred back to the server by the node instance.
    output_file = os.environ["OUTPUT_FILE"]
    info(f"Writing output to {output_file}")
    with open(output_file, 'wb') as fp:
        fp.write(pickle.dumps(output))
Example #19
def _serialize_pandas(data):
    info('Running pandas json serialization')
    return data.to_json().encode()
Example #20
def sparql_wrapper(module: str):
    """
    Wrapper for a vantage6 algorithm module that will query a SPARQL endpoint stored in the DATABASE_URI environment
    variable. It will then pass the result as a pandas DataFrame to a method implemented in `module`.

    In the vantage6 infrastructure information is passed to algorithm containers through the use of environment
    variables.

    Required environment variables:

    - `INPUT_FILE`: Path to the file containing the input arguments as a python dict.
    - `DATABASE_URI`: URI to a SPARQL endpoint
    - `TOKEN_FILE`: Path to a file containing a vantage6 authentication token
    - `OUTPUT_FILE`: Path where algorithm output should be stored

    The file indicated by the `INPUT_FILE` environment variable requires the field `query` in order to use this wrapper.
    The value should be a SPARQL `SELECT` query string.

    Example
    ======
    Given the following input parameters:
    ```
    {'method': 'column_names',
     'query': '''
         PREFIX foaf: <http://xmlns.com/foaf/0.1/>
         SELECT ?person ?name ?email
         WHERE {
             ?person foaf:name ?name .
             ?person foaf:mbox ?email .
         }
     '''}
    ```

    the wrapper will provide the `column_names` algorithm with a pandas
    DataFrame with the columns `person`, `name` and `email`.

    :param module: the name of a package that contains vantage6 algorithms
    :return:
    """
    info(f"wrapper for {module}")

    # read input from the mounted inputfile.
    input_file = os.environ["INPUT_FILE"]
    info(f"Reading input file {input_file}")

    # TODO: _load_data handles input deserialization. It should be a public function
    input_data = docker_wrapper._load_data(input_file)

    query = input_data['query']

    # All containers receive a token. It is usually only used by the master
    # method, but regular containers can use it as well, for example to find
    # out the node_id.
    token_file = os.environ["TOKEN_FILE"]
    info(f"Reading token file '{token_file}'")
    with open(token_file) as fp:
        token = fp.read().strip()

    endpoint = os.environ["DATABASE_URI"]

    endpoint = _fix_endpoint(endpoint)

    info(f"Using '{endpoint}' as triplestore endpoint")

    data = query_triplestore(endpoint, query)

    # make the actual call to the method/function
    info("Dispatching ...")
    output = dispact_rpc(data, input_data, module, token)

    # Write the output from the method to the mounted output file, which will
    # be transferred back to the server by the node instance.
    output_file = os.environ["OUTPUT_FILE"]
    info(f"Writing output to {output_file}")
    with open(output_file, 'wb') as fp:
        if 'output_format' in input_data:
            output_format = input_data['output_format']

            # Indicate output format
            fp.write(output_format.encode() + b'.')

            # Write actual data
            output_format = DataFormat(output_format.lower())
            serialized = serialization.serialize(output, output_format)
            fp.write(serialized)
        else:
            # No output format specified, use legacy method
            fp.write(pickle.dumps(output))
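Following the docstring's example, a sketch of the DataFrame the `column_names` method would receive for that query, with made-up rows (how `query_triplestore` builds the frame internally is not shown here):

import pandas as pd

# Hypothetical rows, as they might come back from the SELECT query above
bindings = [
    {'person': 'http://example.org/p/1', 'name': 'Alice', 'email': 'mailto:alice@example.org'},
    {'person': 'http://example.org/p/2', 'name': 'Bob', 'email': 'mailto:bob@example.org'},
]

# The SELECT variables become the DataFrame columns handed to the algorithm
data = pd.DataFrame(bindings)
print(data.columns.to_list())  # ['person', 'name', 'email']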