def _es_linker(project_id, data_params, module_params):
    '''
    Runs the recoding module.

    ARGUMENTS (GET):
        project_id: ID for "link" project

    ARGUMENTS (POST):
        - data_params: none
            {
                "module_name": module to fetch from (source)
                "file_name": file to fetch (source)
            }
        - module_params: {
                "index_name": name of the Elasticsearch index to fetch from
                "query_template":
                "threshold": minimum value of score for this query_template
                    for a match
                "must": terms to filter by field (AND: will include ONLY IF
                    ALL are in text)
                "must_not": terms to exclude by field from search (OR: will
                    exclude if ANY is found)
                "exact_pairs": (optional)
                "non_matching_pairs": (optional)
            }
    '''
    # Problem: what project are we talking about? what ID?
    project = ESLinker(project_id)

    # Run the ES linker; transformed data stays in project memory until
    # write_data persists it.
    _, run_info = project.linker('es_linker', None, module_params)
    project.write_data()
    return run_info
def update_labeller(project_id):
    '''
    Send a user input to the labeller and receive the updated labeller state.

    GET:
        project_id: ID for "link" project
    POST:
        module_params:
            user_input: #TODO: document
    '''
    _, module_params = _parse_request()
    logging.info(module_params)
    answer = module_params['user_input']

    project = ESLinker(project_id=project_id)
    current_labeller = project.labeller_from_json()

    # Reject invalid answers before mutating any state.
    if not current_labeller.answer_is_valid(answer):
        raise ValueError('Answer received "{0}" is not valid'.format(answer))

    current_labeller.update(answer)
    project.labeller_to_json(current_labeller)

    return jsonify(error=False,
                   result=MyEncoder().encode(current_labeller.to_emit()))
def update_filters_labeller(project_id):
    '''
    Update filters for a labeller and receive the updated labeller state.

    GET:
        project_id: ID for "link" project
    POST:
        module_params:
            must: #TODO: document
            must_not: #TODO: document
    '''
    _, module_params = _parse_request()
    # BUG FIX: the original `logging.info('update_musts got:', module_params)`
    # passed the dict as a %-format argument with no placeholder, so it was
    # never rendered; use lazy %-formatting instead.
    logging.info('update_musts got: %s', module_params)
    must = module_params['must']
    must_not = module_params['must_not']

    proj = ESLinker(project_id=project_id)
    labeller = proj.labeller_from_json()
    labeller.update_musts(must, must_not)
    proj.labeller_to_json(labeller)

    encoder = MyEncoder()
    return jsonify(error=False, result=encoder.encode(labeller.to_emit()))
def add_search(project_id):
    '''
    Perform search on specific user-specified terms.

    GET:
        project_id: ID for "link" project
    POST:
        module_params:
            search: list of searches, each mapping columns to query strings
                Ex: [{'columns': ['col1', 'col2'],
                      'values_to_search': ['val1']}, ...]
                (DOC FIX: the original docstring named this "col_to_search"
                but the code reads module_params['search'])
            max_num_results: (optional) Max number of results for search
                (defaults to 15)
    '''
    _, module_params = _parse_request()

    proj = ESLinker(project_id)
    labeller = proj.labeller_from_json()

    # TODO: change this hack
    def _as_key(cols):
        '''Normalize a column spec to a hashable dict key (str or tuple).'''
        if isinstance(cols, str):
            return cols
        if isinstance(cols, list):
            return tuple(cols)
        # NOTE(review): kept original leniency (None for other types);
        # this probably deserves a ValueError — confirm with callers.
        return None

    pms = {_as_key(search['columns']): search['values_to_search']
           for search in module_params['search']}
    labeller.add_custom_search(pms, module_params.get('max_num_results', 15))
    proj.labeller_to_json(labeller)

    encoder = MyEncoder()
    return jsonify(error=False, result=encoder.encode(labeller.to_emit()))
def update_targets_labeller(project_id):
    '''
    Update precision/recall targets for a labeller and receive the updated
    labeller state.

    GET:
        project_id: ID for "link" project
    POST:
        module_params:
            target_precision: value between 0 and 1 representing the minimal
                acceptable value for precision (results with lower values
                will be strongly penalised)
            target_recall: value between 0 and 1 representing the minimal
                acceptable value for recall (results with lower values will
                be strongly penalised)
    '''
    _, module_params = _parse_request()
    # BUG FIX: the original logged 'update_musts got:' (copy-paste from the
    # filters endpoint) and passed the dict as a %-arg with no placeholder,
    # so it was never rendered; fixed label and lazy %-formatting.
    logging.info('update_targets got: %s', module_params)
    t_p = module_params['target_precision']
    t_r = module_params['target_recall']

    proj = ESLinker(project_id=project_id)
    labeller = proj.labeller_from_json()
    labeller.update_targets(t_p, t_r)
    proj.labeller_to_json(labeller)

    encoder = MyEncoder()
    return jsonify(error=False, result=encoder.encode(labeller.to_emit()))
def current_state(project_id):
    '''
    Get the current state for an ES labeller.

    GET:
        project_id: ID for "link" project
    '''
    # Load the labeller persisted for this link project and serialize its
    # emit-able state for the client.
    project = ESLinker(project_id=project_id)
    state = project.labeller_from_json().to_emit()
    return jsonify(error=False, result=MyEncoder().encode(state))
def _create_es_labeller(project_id, *argv):
    '''
    Create an "es" labeller and pickle to project.

    ARGUMENTS (GET):
        - project_id
    ARGUMENTS (POST):
        - data_params: none
        - module_params: none
    '''
    project = ESLinker(project_id=project_id)
    # Generate a fresh labeller and persist it as JSON in the project.
    project.labeller_to_json(project._gen_es_labeller())
    return
def list_projects(self, project_type, project_access='all'):
    '''Returns a list of project_metadatas'''
    _check_project_type(project_type)
    _check_project_access(project_access)

    list_of_metadatas = []
    for id_ in self.list_dirs(project_type):
        try:
            if project_type == 'link':
                proj = ESLinker(id_)
            else:
                proj = ESNormalizer(id_)
        # BUG FIX: bare `except:` also swallowed KeyboardInterrupt and
        # SystemExit; narrow to Exception so interrupts still propagate.
        except Exception:
            print('Could not load {0}: {1}'.format(project_type, id_))
            continue
        # Keep everything for 'all'; otherwise filter on the `public`
        # metadata flag (missing flag defaults to non-public).
        if project_access == 'all' \
                or proj.metadata.get('public', False) == (project_access == 'public'):
            list_of_metadatas.append(proj.metadata)
    return list_of_metadatas
def new_project(project_type):
    '''
    Create a new project.

    GET:
        - project_type: "link" or "normalize"
    POST:
        - (description): project description
        - (display_name): name to show to user
        - (public): make project freely available
    '''
    _check_project_type(project_type)

    description = request.json.get('description', '')
    display_name = request.json.get('display_name', '')
    public = request.json.get('public', False)

    # Public projects must be described so users can tell what they contain.
    # FIX: raise ValueError (a subclass of Exception, so existing broad
    # handlers still catch it) instead of a bare Exception.
    if public and (not description):
        raise ValueError('Public projects should have a description')

    if project_type == 'normalize':
        proj = ESNormalizer(create_new=True, description=description,
                            display_name=display_name, public=public)
    else:
        proj = ESLinker(create_new=True, description=description,
                        display_name=display_name, public=public)
    return jsonify(error=False, project_id=proj.project_id)
def add_column_matches(project_id):
    """
    Add pairs of columns to compare for linking.

    Wrapper around ESLinker.add_col_matches

    GET:
        - project_id: ID for the "link" project
    POST:
        - column_matches: [list object] column matches (see doc in original
          function)
    """
    matches = request.json['column_matches']
    project = ESLinker(project_id=project_id)
    project.add_col_matches(matches)
    return jsonify(error=False)
def clear_search(project_id):
    '''
    Remove user search items from the list of next labeller proposals.
    '''
    # Request body is parsed for consistency with other endpoints, even
    # though no parameters are used here.
    _, _unused_params = _parse_request()

    project = ESLinker(project_id)
    current_labeller = project.labeller_from_json()
    current_labeller.clear_custom_search()
    project.labeller_to_json(current_labeller)

    return jsonify(error=False,
                   result=MyEncoder().encode(current_labeller.to_emit()))
def _perform_restriction(project_id, _, module_params):
    '''
    Creates a restricted version of the file set as reference in the link
    project and writes it in the link project.

    ARGUMENTS (GET):
        project_id: ID for "link" project
    ARGUMENTS (POST):
        - data_params: none
        - module_params: same as result of infer_mvs
    '''
    # The restriction itself is delegated entirely to the project object.
    return ESLinker(project_id=project_id).perform_restriction(module_params)
def _link_results_analyzer(project_id, data_params, module_params):
    '''
    Runs the link results analyzer module.

    wrapper around ESNormalizer.infer ?

    ARGUMENTS (GET):
        project_id: ID for "normalize" project
    ARGUMENTS (POST):
        - data_params: {
                "module_name": module to fetch from
                "file_name": file to fetch
            }
        - (module_params): {
                "col_matches": dict like {"source": col_source, "ref": col_ref}
                    A pair of columns that can be used a joining key:
                "lower": bool (defaults to False) Whether or not the values
                    of the joining should be lowercased before joining
            }
    '''
    params = module_params if module_params is not None else dict()

    project = ESLinker(project_id=project_id)
    project.load_data(data_params['module_name'], data_params['file_name'])
    result = project.analyze_results(params)

    # Write log
    project._write_log_buffer(False)
    return result
def complete_training(project_id):
    '''
    # TODO: SOON deprecated

    GET:
        project_id: ID for "link" project
    '''
    project = ESLinker(project_id)

    logging.info('Writing train')
    # Export the labeller's best learned parameters and store them as the
    # project's ES learned settings.
    best_params = project.labeller_from_json().export_best_params()
    project.add_es_learned_settings(best_params)
    logging.info('Wrote train')

    return jsonify(error=False)
def add_column_certain_matches(project_id):
    '''
    Specify certain column matches (exact match on a subset of columns
    equivalent to entity identity). This is used to test performances.

    Wrapper around ESLinker.add_col_certain_matches

    GET:
        - project_id: ID for "link" project
    POST:
        - column_certain_matches: {dict object}: (see doc in original
          function)
    '''
    certain_matches = request.json['column_certain_matches']
    project = ESLinker(project_id=project_id)
    project.add_col_certain_matches(certain_matches)
    return jsonify(error=False)
def select_file(project_id):
    '''
    Choose a file to use as source or referential for merging.

    send {file_role: "source", project_id: "ABCYOUANDME", public: False}

    GET:
        - project_id: ID for the "link" project
    POST:
        - file_role: "ref" or "source". Role of the normalized file for
          linking
        - project_id: ID of the "normalize" project to use for linking
    '''
    body = request.json
    project = ESLinker(project_id)
    # Register the selected normalize project under the given role.
    project.add_selected_project(file_role=body['file_role'],
                                 public=body.get('public', False),
                                 project_id=body['project_id'])
    return jsonify(error=False)
def _dedupe_linker(project_id, *argv):
    '''
    Runs deduper module. Contrary to other modules, linker modules take
    paths as input (in addition to module parameters).

    ARGUMENTS (GET):
        project_id: ID for "link" project
    ARGUMENTS (POST):
        - data_params: none
        - module_params: none # Todo: deprecate
    '''
    # Ref and source are loaded by default
    project = ESLinker(project_id=project_id)

    paths = project._gen_paths_dedupe()
    variable_definition = project._gen_dedupe_variable_definition(
        project.read_col_matches())

    # TODO: This should probably be moved
    module_params = {
        'variable_definition': variable_definition,
        'selected_columns_from_source': None,
        'selected_columns_from_ref': None
    }

    # Perform linking
    logging.info('Performing linking')
    project.linker('dedupe_linker', paths, module_params)

    # Write transformations and log
    logging.info('Writing data')
    project.write_data()

    file_path = project.path_to(project.mem_data_info['module_name'],
                                project.mem_data_info['file_name'])
    logging.info('Wrote data to: {0}'.format(file_path))
    return {}
def _infer_restriction(project_id, _, module_params):
    '''
    Runs the training data and infers possible restrictions that can be
    made on the referential.

    ARGUMENTS (GET):
        project_id: ID for "link" project
    ARGUMENTS (POST):
        - data_params: none
        - module_params: {#TODO: fill with params from restrict}
    '''
    params = dict() if module_params is None else module_params

    project = ESLinker(project_id=project_id)

    # Restriction inference requires an existing dedupe training file.
    training = project.read_config_data('dedupe_linker', 'training.json')
    if not training:
        raise Exception('No training file was found in this project')
    params['training'] = training

    result = project.infer('infer_restriction', params)

    # Write log
    project._write_log_buffer(False)
    return result
def remove_project(self, project_type, project_id):
    '''Delete the project of the given type and ID from disk.

    Raises if the project ID is empty or no matching directory exists.
    '''
    # FIX: the original used `assert project_id and (project_id is not None)`
    # — the second clause is redundant (truthiness already implies not-None)
    # and asserts are stripped under `python -O`; validate explicitly.
    if not project_id:
        raise ValueError('project_id must be a non-empty value')
    _check_project_type(project_type)

    dir_path = self.path_to(project_type, project_id)
    if not os.path.isdir(dir_path):
        raise Exception('No project found with the following ID: {0}'.format(project_id))

    # _check_project_type guarantees project_type is one of these two values.
    if project_type == 'normalize':
        proj = ESNormalizer(project_id)
    elif project_type == 'link':
        proj = ESLinker(project_id)
    proj.delete_project()
    print('Deleted project:', project_type, project_id)
def delete_project(project_type, project_id):
    """
    Delete an existing project (including all configuration, data and
    metadata).

    GET:
        - project_type: "link" or "normalize"
        - project_id
    """
    _check_project_type(project_type)

    # TODO: replace by _init_project
    constructor = ESNormalizer if project_type == 'normalize' else ESLinker
    constructor(project_id=project_id).delete_project()
    return jsonify(error=False)
def list_projects(self, project_type, project_access='all'):
    '''Return a list of project metadatas.

    Parameters
    ----------
    project_type: str (either "normalize" or "link")
    project_access: str (either "all", "public", or "private")
        Whether to list all projects or only those that are public or
        non-public (private)

    Returns
    -------
    list_of_metadatas: list of dict
        Metadata for the selected project type and access permission.
    '''
    _check_project_type(project_type)
    _check_project_access(project_access)

    list_of_metadatas = []
    for id_ in self.list_dirs(project_type):
        try:
            if project_type == 'link':
                proj = ESLinker(id_)
            else:
                proj = ESNormalizer(id_)
        # BUG FIX: bare `except:` also caught KeyboardInterrupt and
        # SystemExit; narrow to Exception so interrupts still propagate.
        except Exception:
            print('Could not load {0}: {1}'.format(project_type, id_))
            continue
        # Keep everything for 'all'; otherwise filter on the `public`
        # metadata flag (missing flag defaults to non-public).
        if project_access == 'all' \
                or proj.metadata.get('public', False) == (project_access == 'public'):
            list_of_metadatas.append(proj.metadata)
    return list_of_metadatas
def label_pair(project_id):
    '''
    Assign a label to a (source_id, ref_id) pair.

    NOTE(review): this endpoint is an unimplemented stub — it parses the
    request and loads the project, but never applies the label and returns
    None (no JSON response).

    GET:
        project_id: ID for "link" project
    POST:
        source_id: ID of the source element within source
        ref_id: ID of ref element in elasticsearch
        label: label to assign to pair ('yes', 'no', 'forget')
    '''
    _, module_params = _parse_request()
    proj = ESLinker(project_id=project_id)
    # Parameters are extracted but currently unused (see TODO below).
    source_id = module_params['source_id']
    ref_id = module_params['ref_id']
    label = module_params['label']
    # TODO: add label_pair
    pass
def _link_results_analyzer(project_id, data_params, *argv):
    '''
    Runs the link results analyzer module.

    wrapper around ESNormalizer.infer ?

    ARGUMENTS (GET):
        project_id: ID for "normalize" project
    ARGUMENTS (POST):
        - data_params: {
                "module_name": module to fetch from
                "file_name": file to fetch
            }
    '''
    project = ESLinker(project_id=project_id)
    project.load_data(data_params['module_name'], data_params['file_name'])

    # Run the analyzer with no extra parameters.
    result = project.infer('link_results_analyzer', {})

    # Write log
    project._write_log_buffer(False)
    return result
def _init_project(project_type, project_id=None, create_new=False,
                  display_name=None, description=None):
    '''
    Runs the appropriate constructor for Linker or Normalizer projects.

    DEV NOTE: Use this in api calls that have project_type as a variable
    '''
    _check_project_type(project_type)
    # Dispatch on project type; both constructors accept the same keywords.
    constructor = ESLinker if project_type == 'link' else ESNormalizer
    return constructor(project_id=project_id, create_new=create_new,
                       display_name=display_name, description=description)
def _create_es_labeller(project_id, _, module_params):
    '''
    Create an "es" labeller and pickle to project.

    ARGUMENTS (GET):
        - project_id
    ARGUMENTS (POST):
        - data_params: none
        - module_params:
            force: (false) Set to true to re-create the labeller even when
                one already exists
    '''
    proj = ESLinker(project_id=project_id)
    if module_params is None:
        module_params = {}
    # FIX: removed leftover debug prints ("yes here" and a raw dump of the
    # params dict); use lazy logging instead, as the rest of the file does.
    logging.info('ES_labeller module_params: %s', module_params)

    # Only (re-)generate when no labeller exists yet, or when forced.
    if not proj._has_labeller() or module_params.get('force', False):
        labeller = proj._gen_es_labeller()
        proj.labeller_to_json(labeller)
    return
def _create_es_index(project_id, data_params, module_params):
    '''
    Create an Elasticsearch index for the selected file.

    GET:
        - project_id: Link project_id
    POST:
        - data_params: {
                project_type: (optional) defaults to link
                module_name:
                file_name:
            }
        - module_params: {
                columns_to_index:
                for_linking: create index to use as referential (instead of
                    storage)
                force: force recreation of index even if existant
            }
    '''
    if module_params is None:
        module_params = {}
    logging.info('_create_es_index module_params: %s', module_params)

    columns_to_index = module_params.get('columns_to_index')
    force = module_params.get('force', False)
    for_linking = module_params.get('for_linking', True)

    # Explicit columns are only valid when indexing as a referential.
    if (not for_linking) and (columns_to_index is not None):
        raise ValueError(
            'columns_to_index cannot be set when for_linking is False')

    # BUG FIX (dead code): the original conditionally read
    # data_params['project_type'] and then unconditionally clobbered the
    # value with 'link'; only the data_params.get(...) below ever mattered.
    # Net behavior is unchanged: default 'link', overridden by data_params.
    project_type = 'link'
    if data_params is not None:
        module_name = data_params['module_name']
        file_name = data_params['file_name']
        project_type = data_params.get('project_type', 'link')

    # TODO: dirty fix for linking and normalization
    if for_linking:
        if project_type == 'link':
            # Index the referential attached to this link project.
            proj_link = ESLinker(project_id)
            proj = ESNormalizer(proj_link.ref.project_id)
            if data_params is None:
                module_name = proj_link.metadata['files']['ref']['module_name']
                file_name = proj_link.metadata['files']['ref']['file_name']
        elif project_type == 'normalize':
            proj = ESNormalizer(project_id)

        # Generate default columns_to_index
        if columns_to_index is None:
            columns_to_index = proj.gen_default_columns_to_index(for_linking)
    else:
        proj = ESLinker(project_id)
        if data_params is None:
            module_name, file_name = proj.get_last_written()
        if columns_to_index is None:
            # Default: index every column with no special mapping.
            columns_to_index = {
                col: {} for col in proj._get_header(module_name, file_name)
            }

    file_path = proj.path_to(module_name, file_name)
    proj.create_index(file_path, columns_to_index, force)
    time.sleep(5)  # TODO: why is this necessary?
    return
def _create_es_index(project_id, data_params, module_params):
    '''
    Create an Elasticsearch index for the selected file.

    GET:
        - project_id: Link project_id
    POST:
        - data_params: {
                link_project_id: (optional) ID of the associated link project
                project_type: (optional) defaults to link
                module_name:
                file_name:
            }
        - module_params: {
                columns_to_index:
                for_linking: create index to use as referential (instead of
                    storage)
                force: force recreation of index even if existant
            }
    '''
    if module_params is None:
        module_params = {}
    logging.info('_create_es_index module_params: %s', module_params)

    columns_to_index = module_params.get('columns_to_index')
    force = module_params.get('force', False)
    for_linking = module_params.get('for_linking', True)

    # Explicit columns are only valid when indexing as a referential.
    if (not for_linking) and (columns_to_index is not None):
        raise ValueError(
            'columns_to_index cannot be set when for_linking is False')

    # BUG FIX (dead code): the original conditionally read
    # data_params['project_type'] and then unconditionally clobbered the
    # value with 'link'; only the data_params.get(...) below ever mattered.
    # Net behavior is unchanged: default 'link', overridden by data_params.
    project_type = 'link'
    if data_params is not None:
        module_name = data_params['module_name']
        file_name = data_params['file_name']
        project_type = data_params.get('project_type', 'link')

    # TODO: dirty fix for linking and normalization
    if for_linking:
        if project_type == 'link':
            proj_link = ESLinker(project_id)
            # NOTE(review): this overwrites any user-supplied
            # columns_to_index with the link project's defaults — confirm
            # this is intended.
            columns_to_index = proj_link.gen_default_columns_to_index()
            if data_params is None:
                module_name = proj_link.metadata['files']['ref']['module_name']
                file_name = proj_link.metadata['files']['ref']['file_name']
            proj = ESNormalizer(proj_link.ref.project_id)
        elif project_type == 'normalize':
            proj = ESNormalizer(project_id)
            assert columns_to_index is not None
    else:
        proj = ESLinker(project_id)
        if data_params is None:
            module_name, file_name = proj.get_last_written()
        # Type non str columns or use the default string analyzer
        types_dict = {float: 'float', bool: 'boolean', int: 'integer'}
        columns_to_index = {col: types_dict.get(proj._choose_dtype(col), {})
                            for col in proj._get_header(module_name, file_name)}

    file_path = proj.path_to(module_name, file_name)
    proj.create_index(file_path, columns_to_index, force,
                      proj.metadata.get('public', False))
    return