def _es_linker(project_id, data_params, module_params):
    '''
    Run the Elasticsearch linker module ('es_linker') on a "link" project
    and persist the result.

    ARGUMENTS (GET):
        project_id: ID for "link" project

    ARGUMENTS (POST):
        - data_params: accepted for signature compatibility but not read by
          this function (None is passed to the linker instead). Documented
          shape:
                {
                    "module_name": module to fetch from (source)
                    "file_name": file to fetch (source)
                }
        - module_params:
                {
                    "index_name": name of the Elasticsearch index to fetch from
                    "query_template":
                    "threshold": minimum value of score for this
                                 query_template for a match
                    "must": terms to filter by field (AND: will include
                            ONLY IF ALL are in text)
                    "must_not": terms to exclude by field from search
                                (OR: will exclude if ANY is found)
                    "exact_pairs": (optional)
                    "non_matching_pairs": (optional)
                }

    RETURNS:
        run_info: the second element of the pair returned by
        ESLinker.linker.
    '''
    # NOTE(review): open question carried over from the original author --
    # which project / which ID is this meant to refer to?
    linker_project = ESLinker(project_id)

    # linker() returns a pair; only the second element is needed here.
    _discarded, run_info = linker_project.linker(
        'es_linker',
        None,
        module_params,
    )

    linker_project.write_data()
    return run_info
def _dedupe_linker(project_id, *argv):
    '''
    Run the deduper linker module ('dedupe_linker') on a "link" project and
    write the result.

    Contrary to other modules, linker modules take paths as input (in
    addition to module parameters). Both are derived from the project here
    rather than supplied by the caller; any extra positional arguments
    (*argv) are ignored.

    ARGUMENTS (GET):
        project_id: ID for "link" project

    ARGUMENTS (POST):
        - data_params: none
        - module_params: none

    RETURNS:
        An empty dict.

    # Todo: deprecate
    '''
    # Ref and source are loaded by default
    link_project = ESLinker(project_id=project_id)

    # Inputs for the linker are generated from the project itself.
    dedupe_paths = link_project._gen_paths_dedupe()
    column_matches = link_project.read_col_matches()
    variable_definition = link_project._gen_dedupe_variable_definition(
        column_matches
    )

    # TODO: This should probably be moved
    linker_params = {
        'variable_definition': variable_definition,
        'selected_columns_from_source': None,
        'selected_columns_from_ref': None,
    }

    # Perform linking
    logging.info('Performing linking')
    link_project.linker('dedupe_linker', dedupe_paths, linker_params)

    # Write transformations and log
    logging.info('Writing data')
    link_project.write_data()

    # Report where the output was written.
    module_name = link_project.mem_data_info['module_name']
    file_name = link_project.mem_data_info['file_name']
    written_path = link_project.path_to(module_name, file_name)
    logging.info('Wrote data to: {0}'.format(written_path))

    return {}