def _create_es_index(project_id, data_params, module_params): ''' Create an Elasticsearch index for the selected file GET: - project_id: Link project_id POST: - data_params: { project_type: (optional) defaults to link module_name: file_name: } - module_params: { columns_to_index: for_linking: create index to use as referential (instead of storage) force: force recreation of index even if existant } ''' if module_params is None: module_params = {} print(module_params) columns_to_index = module_params.get('columns_to_index') force = module_params.get('force', False) for_linking = module_params.get('for_linking', True) if (not for_linking) and (columns_to_index is not None): raise ValueError( 'columns_to_index and for_linking cannot be not None and False') if (data_params is not None) and ('project_type' in data_params): project_type = data_params['project_type'] project_type = 'link' if data_params is not None: module_name = data_params['module_name'] file_name = data_params['file_name'] project_type = data_params.get('project_type', 'link') # TODO: dirty fix for linking and normalization if for_linking: if project_type == 'link': proj_link = ESLinker(project_id) proj = ESNormalizer(proj_link.ref.project_id) if data_params is None: module_name = proj_link.metadata['files']['ref']['module_name'] file_name = proj_link.metadata['files']['ref']['file_name'] elif project_type == 'normalize': proj = ESNormalizer(project_id) # Generate default columns_to_index if columns_to_index is None: columns_to_index = proj.gen_default_columns_to_index(for_linking) else: proj = ESLinker(project_id) if data_params is None: module_name, file_name = proj.get_last_written() if columns_to_index is None: columns_to_index = { col: {} for col in proj._get_header(module_name, file_name) } file_path = proj.path_to(module_name, file_name) proj.create_index(file_path, columns_to_index, force) time.sleep(5) # TODO: why is this necessary? return
def _create_es_index(project_id, data_params, module_params): ''' Create an Elasticsearch index for the selected file GET: - project_id: Link project_id POST: - data_params: { link_project_id: (optional) ID of the associated link project project_type: (optional) defaults to link module_name: file_name: } - module_params: { columns_to_index: for_linking: create index to use as referential (instead of storage) force: force recreation of index even if existant } ''' if module_params is None: module_params = {} print(module_params) columns_to_index = module_params.get('columns_to_index') force = module_params.get('force', False) for_linking = module_params.get('for_linking', True) if (not for_linking) and (columns_to_index is not None): raise ValueError( 'columns_to_index and for_linking cannot be NOT None and False') if (data_params is not None) and ('project_type' in data_params): project_type = data_params['project_type'] project_type = 'link' if data_params is not None: module_name = data_params['module_name'] file_name = data_params['file_name'] project_type = data_params.get('project_type', 'link') # TODO: dirty fix for linking and normalization if for_linking: if project_type == 'link': proj_link = ESLinker(project_id) columns_to_index = proj_link.gen_default_columns_to_index() if data_params is None: module_name = proj_link.metadata['files']['ref']['module_name'] file_name = proj_link.metadata['files']['ref']['file_name'] proj = ESNormalizer(proj_link.ref.project_id) elif project_type == 'normalize': proj = ESNormalizer(project_id) assert columns_to_index is not None else: proj = ESLinker(project_id) if data_params is None: module_name, file_name = proj.get_last_written() # Type non str columns or use the default string analyzer types_dict = {float: 'float', bool: 'boolean', int: 'integer'} columns_to_index = {col: types_dict.get(proj._choose_dtype(col), {}) \ for col in proj._get_header(module_name, file_name)} file_path = proj.path_to(module_name, file_name) 
proj.create_index(file_path, columns_to_index, force, proj.metadata.get('public', False)) return
# Index the last written file of the referential project.
ref_path = ref.path_to_last_written()

# Columns flagged True get the full set of text analyzers; the others are
# indexed with empty settings. Each column receives its own fresh object.
columns_to_index = {
    column: ({'french', 'whitespace', 'integers', 'n_grams'} if analyzed else {})
    for column, analyzed in (
        ('numero_uai', False),
        ('denomination_principale_uai', True),
        ('patronyme_uai', True),
        ('adresse_uai', True),
        ('localite_acheminement_uai', True),
        ('departement', True),
        ('code_postal_uai', False),
        ('full_name', True),
    )
}
ref.create_index(ref_path, columns_to_index, force=False)

# Link
# The index is looked up under the project_id of the link project's "ref" file.
index_name = proj.metadata['files']['ref']['project_id']

# NOTE(review): each entry looks like (bool level, source column,
# referential column, analyzer suffix, boost) -- confirm against the linker.
query_template = (
    ('must', 'commune', 'localite_acheminement_uai', '.french', 1),
    ('must', 'lycees_sources', 'full_name', '.french', 1),
)
threshold = 3.5
must = {'full_name': ['lycee']}
must_not = {'full_name': ['ass', 'association', 'sportive', 'foyer']}

params = {
    'index_name': index_name,
    'query_template': query_template,
    'thresh': threshold,
    'must': must,
}