Beispiel #1
0
def _es_linker(project_id, data_params, module_params):
    '''
    Run the "es_linker" module on a link project and write the resulting data.

    ARGUMENTS (GET):
        project_id: ID for "link" project

    ARGUMENTS (POST):
        - data_params: not read by this function
        - module_params: forwarded unchanged to ESLinker.linker {
                "index_name": name of the Elasticsearch index to fetch from
                "query_template":
                "threshold": minimum value of score for this query_template for a match
                "must": terms to filter by field (AND: will include ONLY IF ALL are in text)
                "must_not": terms to exclude by field from search (OR: will exclude if ANY is found)
                "exact_pairs": (optional)
                "non_matching_pairs": (optional)
                }

    RETURNS:
        The second element of the pair returned by ESLinker.linker.
    '''
    # Problem: what project are we talking about? what ID?
    link_project = ESLinker(project_id)

    # No paths are passed to the linker for this module (second argument)
    _unused, run_report = link_project.linker('es_linker', None, module_params)
    link_project.write_data()
    return run_report
def update_labeller(project_id):
    '''
    Forward a user answer to the labeller and return the updated state.

    GET:
        project_id: ID for "link" project

    POST:
        module_params:
            user_input: answer handed to the labeller #TODO: document

    Raises ValueError when the labeller does not accept the answer.
    '''
    _data_params, request_params = _parse_request()
    logging.info(request_params)
    answer = request_params['user_input']

    link_project = ESLinker(project_id=project_id)
    labeller = link_project.labeller_from_json()

    # Guard clause: nothing is updated or saved for an invalid answer
    if not labeller.answer_is_valid(answer):
        raise ValueError('Answer received "{0}" is not valid'.format(answer))
    labeller.update(answer)

    # Persist the labeller before reporting its new state
    link_project.labeller_to_json(labeller)

    emitted = MyEncoder().encode(labeller.to_emit())
    return jsonify(error=False, result=emitted)
def update_filters_labeller(project_id):
    '''
    Update filters for a labeller and receive the updated labeller state.

    GET:
        project_id: ID for "link" project

    POST:
        module_params:
            must: passed as first argument to labeller.update_musts #TODO: document
            must_not: passed as second argument to labeller.update_musts #TODO: document

    Raises KeyError if "must" or "must_not" is missing from module_params.
    '''
    _, module_params = _parse_request()

    # The payload is passed as a lazy %-argument; without a placeholder the
    # logging module fails to format the record.
    logging.info('update_musts got: %s', module_params)
    must = module_params['must']
    must_not = module_params['must_not']

    proj = ESLinker(project_id=project_id)
    labeller = proj.labeller_from_json()

    labeller.update_musts(must, must_not)

    # Persist the labeller before reporting its new state
    proj.labeller_to_json(labeller)

    encoder = MyEncoder()
    return jsonify(error=False,
                   result=encoder.encode(labeller.to_emit()))
def add_search(project_id):
    '''
    Add a custom search on user-specified terms to the labeller and return
    the updated labeller state.

    GET:
        project_id: ID for "link" project

    POST:
        module_params:
            search: list of dictionaries giving the column(s) to search in
                and the values to search for
                Ex: [{'columns': ['col1', 'col2'], 'values_to_search': ['val1']}, ...]
            max_num_results: (optional, defaults to 15) Max number of results for search
    '''
    _, module_params = _parse_request()

    link_project = ESLinker(project_id)
    labeller = link_project.labeller_from_json()

    # TODO: change this hack
    # Lists of columns are turned into tuples so they can be used as
    # dictionary keys; a value that is neither a str nor a list maps to None.
    custom_search = dict()
    for item in module_params['search']:
        columns = item['columns']
        if isinstance(columns, str):
            key = columns
        elif isinstance(columns, list):
            key = tuple(columns)
        else:
            key = None
        custom_search[key] = item['values_to_search']

    max_num_results = module_params.get('max_num_results', 15)
    labeller.add_custom_search(custom_search, max_num_results)

    link_project.labeller_to_json(labeller)

    emitted = MyEncoder().encode(labeller.to_emit())
    return jsonify(error=False, result=emitted)
def update_targets_labeller(project_id):
    '''
    Update the precision/recall targets of a labeller and receive the updated
    labeller state.

    GET:
        project_id: ID for "link" project

    POST:
        module_params:
            target_precision: value between 0 and 1 representing the minimal
                acceptable value for precision (results with lower values will
                be strongly penalised)
            target_recall: value between 0 and 1 representing the minimal
                acceptable value for recall (results with lower values will
                be strongly penalised)

    Raises KeyError if either target is missing from module_params.
    '''
    _, module_params = _parse_request()

    # The payload is passed as a lazy %-argument; without a placeholder the
    # logging module fails to format the record. The message now names this
    # endpoint instead of the one it was copied from.
    logging.info('update_targets got: %s', module_params)
    t_p = module_params['target_precision']
    t_r = module_params['target_recall']

    proj = ESLinker(project_id=project_id)
    labeller = proj.labeller_from_json()

    labeller.update_targets(t_p, t_r)

    # Persist the labeller before reporting its new state
    proj.labeller_to_json(labeller)

    encoder = MyEncoder()
    return jsonify(error=False,
                   result=encoder.encode(labeller.to_emit()))
def current_state(project_id):
    '''
    Return the current state of the ES labeller stored in a project.

    GET:
        project_id: ID for "link" project
    '''
    # Load the labeller saved in the project
    labeller = ESLinker(project_id=project_id).labeller_from_json()

    # Encode what the labeller emits with MyEncoder before sending it back
    emitted = MyEncoder().encode(labeller.to_emit())
    return jsonify(error=False, result=emitted)
def _create_es_labeller(project_id, *argv):
    '''
    Generate an "es" labeller for the project and store it through
    ESLinker.labeller_to_json.

    ARGUMENTS (GET):
        - project_id

    ARGUMENTS (POST):
        - data_params: none
        - module_params: none

    Extra positional arguments are accepted and ignored. Returns None.
    '''
    link_project = ESLinker(project_id=project_id)
    link_project.labeller_to_json(link_project._gen_es_labeller())
Beispiel #8
0
    def list_projects(self, project_type, project_access='all'):
        '''
        Return the list of metadata of the projects of a given type.

        Arguments:
            project_type: "link" loads projects with ESLinker, any other
                accepted value loads them with ESNormalizer
            project_access: "all" keeps every project that loads; otherwise
                a project is kept when its "public" metadata flag (False if
                absent) equals (project_access == "public")

        Projects that fail to load are reported on stdout and skipped.
        '''
        _check_project_type(project_type)
        _check_project_access(project_access)

        project_class = ESLinker if project_type == 'link' else ESNormalizer
        want_public = (project_access == 'public')

        list_of_metadatas = []
        for id_ in self.list_dirs(project_type):
            # Best effort: one broken project must not hide the others, but
            # KeyboardInterrupt / SystemExit are allowed to propagate.
            try:
                proj = project_class(id_)
            except Exception:
                print('Could not load {0}: {1}'.format(project_type, id_))
                continue

            if (project_access == 'all'
                    or proj.metadata.get('public', False) == want_public):
                list_of_metadatas.append(proj.metadata)
        return list_of_metadatas
def new_project(project_type):
    '''
    Create a new project and return its ID.

    GET:
        - project_type: "link" or "normalize"

    POST:
        - (description): project description (defaults to '')
        - (display_name): name to show to user (defaults to '')
        - (public): make project freely available (defaults to False)

    Raises Exception when a public project is requested without description.
    '''
    _check_project_type(project_type)

    settings = {
        'description': request.json.get('description', ''),
        'display_name': request.json.get('display_name', ''),
        'public': request.json.get('public', False),
    }

    if settings['public'] and not settings['description']:
        raise Exception('Public projects should have a description')

    # "normalize" builds a normalizer; every other accepted type a linker
    project_class = ESNormalizer if project_type == 'normalize' else ESLinker
    created = project_class(create_new=True, **settings)

    return jsonify(error=False, project_id=created.project_id)
def add_column_matches(project_id):
    """
    Register pairs of columns to compare for linking.

    Thin wrapper around ESLinker.add_col_matches.

    GET:
        - project_id: ID for the "link" project

    POST:
        - column_matches: [list object] column matches (see doc in original function)
    """
    # Read the payload first so a missing key fails before the project loads
    pairs = request.json['column_matches']
    ESLinker(project_id=project_id).add_col_matches(pairs)
    return jsonify(error=False)
def clear_search(project_id):
    '''
    Clear the custom (user) search of the labeller and return the updated
    labeller state.

    GET:
        project_id: ID for "link" project
    '''
    # The request is still parsed although none of its content is used
    _data_params, _module_params = _parse_request()

    link_project = ESLinker(project_id)
    labeller = link_project.labeller_from_json()
    labeller.clear_custom_search()
    link_project.labeller_to_json(labeller)

    emitted = MyEncoder().encode(labeller.to_emit())
    return jsonify(error=False, result=emitted)
Beispiel #12
0
def _perform_restriction(project_id, _, module_params):
    '''
    Create a restricted version of the file set as reference in the link
    project and write it in the link project.

    Thin wrapper around ESLinker.perform_restriction, whose result is returned.

    ARGUMENTS (GET):
        project_id: ID for "link" project

    ARGUMENTS (POST):
        - data_params: none (second positional argument, ignored)
        - module_params: same as result of infer_mvs
    '''
    return ESLinker(project_id=project_id).perform_restriction(module_params)
Beispiel #13
0
def _link_results_analyzer(project_id, data_params, module_params):
    '''
    Load a file of a link project and analyze the link results.

    Wrapper around ESLinker.analyze_results.

    ARGUMENTS (GET):
        project_id: ID of the project (loaded with ESLinker)

    ARGUMENTS (POST):
        - data_params: {
                "module_name": module to fetch from
                "file_name": file to fetch
                }
        - (module_params): optional, an empty dict is used when None {
                "col_matches": dict like {"source": col_source, "ref": col_ref}
                    A pair of columns that can be used a joining key:
                "lower": bool (defaults to False)
                    Whether or not the values of the joining should be lowercased
                    before joining
                }

    RETURNS:
        Whatever ESLinker.analyze_results returns.
    '''
    params = dict() if module_params is None else module_params

    link_project = ESLinker(project_id=project_id)
    link_project.load_data(data_params['module_name'], data_params['file_name'])
    analysis = link_project.analyze_results(params)

    # Write log
    link_project._write_log_buffer(False)
    return analysis
def complete_training(project_id):
    '''
    Export the best parameters found by the labeller and store them in the
    project as learned settings.

    # TODO: SOON deprecated

    GET:
        project_id: ID for "link" project
    '''
    link_project = ESLinker(project_id)

    logging.info('Writing train')
    best_params = link_project.labeller_from_json().export_best_params()
    link_project.add_es_learned_settings(best_params)
    logging.info('Wrote train')

    return jsonify(error=False)
def add_column_certain_matches(project_id):
    '''
    Specify certain column matches (exact match on a subset of columns
    equivalent to entity identity). This is used to test performances.

    Thin wrapper around ESLinker.add_col_certain_matches.

    GET:
        - project_id: ID for "link" project

    POST:
        - column_certain_matches: {dict object}: (see doc in original function)
    '''
    # Read the payload first so a missing key fails before the project loads
    certain_matches = request.json['column_certain_matches']
    ESLinker(project_id=project_id).add_col_certain_matches(certain_matches)
    return jsonify(error=False)
def select_file(project_id):
    '''
    Choose a project whose file is used as source or referential for merging.
    Send {file_role: "source", project_id: "ABCYOUANDME", public: False}

    GET:
        - project_id: ID for the "link" project

    POST:
        - file_role: "ref" or "source". Role of the normalized file for linking
        - project_id: ID of the "normalize" project to use for linking
        - (public): optional, defaults to False
    '''
    link_project = ESLinker(project_id)
    payload = request.json

    # Note: the "project_id" in the payload is the selected project, not the
    # link project given in the URL.
    role = payload['file_role']
    is_public = payload.get('public', False)
    selected_id = payload['project_id']

    link_project.add_selected_project(file_role=role,
                                      public=is_public,
                                      project_id=selected_id)
    return jsonify(error=False)
Beispiel #17
0
def _dedupe_linker(project_id, *argv):
    '''
    Run the "dedupe_linker" module and write the resulting data. Contrary to
    other modules, linker modules take paths as input (in addition to module
    parameters).

    ARGUMENTS (GET):
        project_id: ID for "link" project

    ARGUMENTS (POST):
        - data_params: none
        - module_params: none

    Extra positional arguments are accepted and ignored. Always returns an
    empty dict.

    # Todo: deprecate
    '''
    # Ref and source are loaded by default
    link_project = ESLinker(project_id=project_id)

    dedupe_paths = link_project._gen_paths_dedupe()

    # The variable definition is derived from the stored column matches
    variable_definition = link_project._gen_dedupe_variable_definition(
        link_project.read_col_matches())

    linker_params = dict(
        variable_definition=variable_definition,
        selected_columns_from_source=None,
        selected_columns_from_ref=None,
    )

    # TODO: This should probably be moved
    logging.info('Performing linking')
    link_project.linker('dedupe_linker', dedupe_paths, linker_params)

    # Write transformations and log
    logging.info('Writing data')
    link_project.write_data()

    data_info = link_project.mem_data_info
    written_path = link_project.path_to(data_info['module_name'],
                                        data_info['file_name'])
    logging.info('Wrote data to: {0}'.format(written_path))

    return {}
Beispiel #18
0
def _infer_restriction(project_id, _, module_params):
    '''
    Use the training data to infer possible restrictions that can be made on
    the referential.

    ARGUMENTS (GET):
        project_id: ID for "link" project

    ARGUMENTS (POST):
        - data_params: none (second positional argument, ignored)
        - module_params: {#TODO: fill with params from restrict}
            A "training" key is added to this dict in place.

    Raises Exception when no training data is found in the project.
    Returns the result of ESLinker.infer for "infer_restriction".
    '''
    module_params = dict() if module_params is None else module_params

    link_project = ESLinker(project_id=project_id)

    # Training data is read from the configuration of the dedupe_linker module
    training_data = link_project.read_config_data('dedupe_linker',
                                                  'training.json')
    if not training_data:
        raise Exception('No training file was found in this project')
    module_params['training'] = training_data

    inferred = link_project.infer('infer_restriction', module_params)

    # Write log
    link_project._write_log_buffer(False)
    return inferred
 def remove_project(self, project_type, project_id):
     '''
     Delete the project with the given type and ID.

     Arguments:
         project_type: "normalize" or "link"
         project_id: ID of the project to delete (must be non-empty)

     Raises ValueError if project_id is empty or project_type is not
     handled; raises Exception if no directory exists for the project.
     '''
     # Explicit check instead of `assert`: assertions are stripped when
     # Python runs with -O, and this guards a destructive operation.
     if not project_id:
         raise ValueError('A non-empty project_id is required')
     _check_project_type(project_type)
     dir_path = self.path_to(project_type, project_id)
     if not os.path.isdir(dir_path):
         raise Exception('No project found with the following ID: {0}'.format(project_id))

     if project_type == 'normalize':
         proj = ESNormalizer(project_id)
     elif project_type == 'link':
         proj = ESLinker(project_id)
     else:
         # Avoid reaching delete_project() with `proj` unbound
         raise ValueError('Unsupported project type: {0}'.format(project_type))

     proj.delete_project()
     print('Deleted project:', project_type, project_id)
def delete_project(project_type, project_id):
    """
    Delete an existing project (including all configuration, data and metadata).

    GET:
        - project_type: "link" or "normalize"
        - project_id
    """
    _check_project_type(project_type)

    # TODO: replace by _init_project
    # "normalize" loads a normalizer; every other accepted type a linker
    project_class = ESNormalizer if project_type == 'normalize' else ESLinker
    project_class(project_id=project_id).delete_project()
    return jsonify(error=False)
Beispiel #21
0
    def list_projects(self, project_type, project_access='all'):
        '''Return a list of project metadatas.

        Parameters
        ----------
        project_type: str (either "normalize" or "link")
        project_access: str (either "all", "public", or "private")
            Whether to list all projects or only those that are public or
            non-public (private)

        Returns
        -------
        list_of_metadatas: list of dict
            Metadata for the selected project type and access permission.
            Projects that fail to load are reported on stdout and skipped.
        '''
        _check_project_type(project_type)
        _check_project_access(project_access)

        project_class = ESLinker if project_type == 'link' else ESNormalizer
        want_public = (project_access == 'public')

        list_of_metadatas = []
        for id_ in self.list_dirs(project_type):
            # Best effort: one broken project must not hide the others, but
            # KeyboardInterrupt / SystemExit are allowed to propagate.
            try:
                proj = project_class(id_)
            except Exception:
                print('Could not load {0}: {1}'.format(project_type, id_))
                continue

            # A missing "public" flag counts as private
            if (project_access == 'all'
                    or proj.metadata.get('public', False) == want_public):
                list_of_metadatas.append(proj.metadata)
        return list_of_metadatas
def label_pair(project_id):
    '''
    Assign a label to a (source_id, ref_id) pair.

    NOTE: not implemented yet. The request is parsed, the project is loaded
    and the three parameters are read, but nothing is stored and None is
    returned.

    GET:
        project_id: ID for "link" project

    POST:
        source_id: ID of the source element within source
        ref_id: ID of ref element in elasticsearch
        label: label to assign to pair ('yes', 'no', 'forget')
    '''
    _, module_params = _parse_request()
    link_project = ESLinker(project_id=project_id)

    # Reading the keys makes a missing parameter fail with KeyError
    pair_source, pair_ref, pair_label = (
        module_params[key] for key in ('source_id', 'ref_id', 'label'))

    # TODO: add label_pair
def _link_results_analyzer(project_id, data_params, *argv):
    '''
    Load a file of a link project and run the "link_results_analyzer"
    inference on it with empty parameters.

    Wrapper around ESLinker.infer.

    ARGUMENTS (GET):
        project_id: ID of the project (loaded with ESLinker)

    ARGUMENTS (POST):
        - data_params: {
                "module_name": module to fetch from
                "file_name": file to fetch
                }

    Extra positional arguments are accepted and ignored.
    '''
    link_project = ESLinker(project_id=project_id)

    module_name = data_params['module_name']
    file_name = data_params['file_name']
    link_project.load_data(module_name, file_name)

    analysis = link_project.infer('link_results_analyzer', {})

    # Write log
    link_project._write_log_buffer(False)
    return analysis
def _init_project(project_type,
                  project_id=None,
                  create_new=False,
                  display_name=None,
                  description=None):
    '''
    Build an ESLinker ("link") or an ESNormalizer (any other accepted type)
    with the given constructor arguments and return it.

    DEV NOTE: Use this in api calls that have project_type as a variable
    '''
    _check_project_type(project_type)

    project_class = ESLinker if project_type == 'link' else ESNormalizer
    return project_class(project_id=project_id,
                         create_new=create_new,
                         display_name=display_name,
                         description=description)
Beispiel #25
0
def _create_es_labeller(project_id, _, module_params):
    '''
    Generate an "es" labeller for the project and store it through
    ESLinker.labeller_to_json, unless the project already has one.

    ARGUMENTS (GET):
        - project_id

    ARGUMENTS (POST):
        - data_params: none (second positional argument, ignored)
        - module_params:
            force: (false) Set to true to generate a new labeller even if
                the project already has one

    Returns None.
    '''
    link_project = ESLinker(project_id=project_id)
    params = {} if module_params is None else module_params

    print('ES_labeller module_params:', params)

    # Keep the existing labeller unless a regeneration is forced
    if link_project._has_labeller() and not params.get('force', False):
        return

    print('yes here')
    link_project.labeller_to_json(link_project._gen_es_labeller())
def _create_es_index(project_id, data_params, module_params):
    '''
    Create an Elasticsearch index for the selected file

    GET:
        - project_id: Link project_id
    POST:
        - data_params: (optional)
                        {
                        project_type: (optional) defaults to link
                        module_name:
                        file_name:
                        }
        - module_params: (optional) {
                            columns_to_index: (optional) must be left out when
                                for_linking is False
                            for_linking: (defaults to True) create index to use
                                as referential (instead of storage)
                            force: (defaults to False) force recreation of
                                index even if existant
                        }

    Raises ValueError if columns_to_index is given while for_linking is False,
    or if for_linking is set and project_type is neither "link" nor
    "normalize".
    '''

    if module_params is None:
        module_params = {}

    print(module_params)
    columns_to_index = module_params.get('columns_to_index')
    force = module_params.get('force', False)
    for_linking = module_params.get('for_linking', True)

    if (not for_linking) and (columns_to_index is not None):
        raise ValueError(
            'columns_to_index and for_linking cannot be not None and False')

    project_type = 'link'
    if data_params is not None:
        module_name = data_params['module_name']
        file_name = data_params['file_name']
        project_type = data_params.get('project_type', 'link')

    # TODO: dirty fix for linking and normalization
    if for_linking:
        if project_type == 'link':
            # The index is created in the normalize project used as referential
            proj_link = ESLinker(project_id)
            proj = ESNormalizer(proj_link.ref.project_id)

            if data_params is None:
                module_name = proj_link.metadata['files']['ref']['module_name']
                file_name = proj_link.metadata['files']['ref']['file_name']

        elif project_type == 'normalize':
            proj = ESNormalizer(project_id)

        else:
            # Fail explicitly instead of hitting an unbound `proj` below
            raise ValueError(
                'Unsupported project_type: {0}'.format(project_type))

        # Generate default columns_to_index
        if columns_to_index is None:
            columns_to_index = proj.gen_default_columns_to_index(for_linking)

    else:
        proj = ESLinker(project_id)
        if data_params is None:
            module_name, file_name = proj.get_last_written()

        # columns_to_index is always None here (checked above): index every
        # column of the file with empty settings
        if columns_to_index is None:
            columns_to_index = {
                col: {}
                for col in proj._get_header(module_name, file_name)
            }

    file_path = proj.path_to(module_name, file_name)
    proj.create_index(file_path, columns_to_index, force)
    time.sleep(5)  # TODO: why is this necessary?
    return
Beispiel #27
0
def _create_es_index(project_id, data_params, module_params):
    '''
    Create an Elasticsearch index for the selected file

    GET:
        - project_id: Link project_id
    POST:
        - data_params: (optional)
                        {
                            link_project_id: (optional) ID of the associated link project
                            project_type: (optional) defaults to link
                            module_name:
                            file_name:
                        }
        - module_params: (optional) {
                            columns_to_index: required when for_linking is set
                                and project_type is "normalize"; must be left
                                out when for_linking is False
                            for_linking: (defaults to True) create index to use
                                as referential (instead of storage)
                            force: (defaults to False) force recreation of
                                index even if existant
                        }

    Raises ValueError if columns_to_index is given while for_linking is False,
    if it is missing for a "normalize" project used for linking, or if
    for_linking is set and project_type is neither "link" nor "normalize".
    '''

    if module_params is None:
        module_params = {}

    print(module_params)
    columns_to_index = module_params.get('columns_to_index')
    force = module_params.get('force', False)
    for_linking = module_params.get('for_linking', True)

    if (not for_linking) and (columns_to_index is not None):
        raise ValueError(
            'columns_to_index and for_linking cannot be NOT None and False')

    project_type = 'link'
    if data_params is not None:
        module_name = data_params['module_name']
        file_name = data_params['file_name']
        project_type = data_params.get('project_type', 'link')

    # TODO: dirty fix for linking and normalization
    if for_linking:
        if project_type == 'link':
            proj_link = ESLinker(project_id)
            # NOTE(review): this replaces any columns_to_index received in
            # module_params with the link project's defaults - confirm that
            # this is intended.
            columns_to_index = proj_link.gen_default_columns_to_index()

            if data_params is None:
                module_name = proj_link.metadata['files']['ref']['module_name']
                file_name = proj_link.metadata['files']['ref']['file_name']

            # The index is created in the normalize project used as referential
            proj = ESNormalizer(proj_link.ref.project_id)

        elif project_type == 'normalize':
            proj = ESNormalizer(project_id)
            # Explicit check instead of `assert`, which is stripped under -O
            if columns_to_index is None:
                raise ValueError(
                    'columns_to_index is required to index a normalize '
                    'project for linking')

        else:
            # Fail explicitly instead of hitting an unbound `proj` below
            raise ValueError(
                'Unsupported project_type: {0}'.format(project_type))

    else:
        proj = ESLinker(project_id)
        if data_params is None:
            module_name, file_name = proj.get_last_written()

        # Type non str columns or use the default string analyzer
        types_dict = {float: 'float', bool: 'boolean', int: 'integer'}
        columns_to_index = {
            col: types_dict.get(proj._choose_dtype(col), {})
            for col in proj._get_header(module_name, file_name)
        }

    file_path = proj.path_to(module_name, file_name)
    proj.create_index(file_path, columns_to_index, force,
                      proj.metadata.get('public', False))
    return