def _create_es_index(project_id, data_params, module_params):
    '''
    Create an Elasticsearch index for the selected file
    
    GET:
        - project_id: Link project_id
    POST:
        - data_params: 
                        {
                        project_type: (optional) defaults to link
                        module_name:
                        file_name: 
                        }
        - module_params: {
                            columns_to_index: 
                            for_linking: create index to use as referential (instead of storage)
                            force: force recreation of index even if existant
                        }
    '''

    if module_params is None:
        module_params = {}

    print(module_params)
    columns_to_index = module_params.get('columns_to_index')
    force = module_params.get('force', False)
    for_linking = module_params.get('for_linking', True)

    if (not for_linking) and (columns_to_index is not None):
        raise ValueError(
            'columns_to_index and for_linking cannot be not None and False')

    if (data_params is not None) and ('project_type' in data_params):
        project_type = data_params['project_type']

    project_type = 'link'
    if data_params is not None:
        module_name = data_params['module_name']
        file_name = data_params['file_name']
        project_type = data_params.get('project_type', 'link')

    # TODO: dirty fix for linking and normalization
    if for_linking:
        if project_type == 'link':
            proj_link = ESLinker(project_id)
            proj = ESNormalizer(proj_link.ref.project_id)

            if data_params is None:
                module_name = proj_link.metadata['files']['ref']['module_name']
                file_name = proj_link.metadata['files']['ref']['file_name']

        elif project_type == 'normalize':
            proj = ESNormalizer(project_id)

        # Generate default columns_to_index
        if columns_to_index is None:
            columns_to_index = proj.gen_default_columns_to_index(for_linking)

    else:
        proj = ESLinker(project_id)
        if data_params is None:
            module_name, file_name = proj.get_last_written()

        if columns_to_index is None:
            columns_to_index = {
                col: {}
                for col in proj._get_header(module_name, file_name)
            }

    file_path = proj.path_to(module_name, file_name)
    proj.create_index(file_path, columns_to_index, force)
    time.sleep(5)  # TODO: why is this necessary?
    return
Ejemplo n.º 2
0
def _create_es_index(project_id, data_params, module_params):
    '''
    Create an Elasticsearch index for the selected file
    
    GET:
        - project_id: Link project_id
    POST:
        - data_params: 
                        {
                            link_project_id: (optional) ID of the associated link project
                            project_type: (optional) defaults to link
                            module_name:
                            file_name: 
                        }
        - module_params: {
                            columns_to_index: 
                            for_linking: create index to use as referential (instead of storage)
                            force: force recreation of index even if existant
                        }
    '''

    if module_params is None:
        module_params = {}

    print(module_params)
    columns_to_index = module_params.get('columns_to_index')
    force = module_params.get('force', False)
    for_linking = module_params.get('for_linking', True)

    if (not for_linking) and (columns_to_index is not None):
        raise ValueError(
            'columns_to_index and for_linking cannot be NOT None and False')

    if (data_params is not None) and ('project_type' in data_params):
        project_type = data_params['project_type']

    project_type = 'link'
    if data_params is not None:
        module_name = data_params['module_name']
        file_name = data_params['file_name']
        project_type = data_params.get('project_type', 'link')

    # TODO: dirty fix for linking and normalization
    if for_linking:
        if project_type == 'link':
            proj_link = ESLinker(project_id)
            columns_to_index = proj_link.gen_default_columns_to_index()

            if data_params is None:
                module_name = proj_link.metadata['files']['ref']['module_name']
                file_name = proj_link.metadata['files']['ref']['file_name']

            proj = ESNormalizer(proj_link.ref.project_id)

        elif project_type == 'normalize':
            proj = ESNormalizer(project_id)
            assert columns_to_index is not None

    else:
        proj = ESLinker(project_id)
        if data_params is None:
            module_name, file_name = proj.get_last_written()

        # Type non str columns or use the default string analyzer
        types_dict = {float: 'float', bool: 'boolean', int: 'integer'}
        columns_to_index = {col: types_dict.get(proj._choose_dtype(col), {}) \
                            for col in proj._get_header(module_name, file_name)}

    file_path = proj.path_to(module_name, file_name)
    proj.create_index(file_path, columns_to_index, force,
                      proj.metadata.get('public', False))
    return
Ejemplo n.º 3
0
    ref_path = ref.path_to_last_written()

    columns_to_index = {
        'numero_uai': {},
        'denomination_principale_uai':
        {'french', 'whitespace', 'integers', 'n_grams'},
        'patronyme_uai': {'french', 'whitespace', 'integers', 'n_grams'},
        'adresse_uai': {'french', 'whitespace', 'integers', 'n_grams'},
        'localite_acheminement_uai':
        {'french', 'whitespace', 'integers', 'n_grams'},
        'departement': {'french', 'whitespace', 'integers', 'n_grams'},
        'code_postal_uai': {},
        'full_name': {'french', 'whitespace', 'integers', 'n_grams'}
    }

    ref.create_index(ref_path, columns_to_index, force=False)

    # Link
    index_name = proj.metadata['files']['ref']['project_id']
    query_template = (('must', 'commune', 'localite_acheminement_uai',
                       '.french', 1), ('must', 'lycees_sources', 'full_name',
                                       '.french', 1))
    threshold = 3.5
    must = {'full_name': ['lycee']}
    must_not = {'full_name': ['ass', 'association', 'sportive', 'foyer']}

    params = dict()
    params['index_name'] = index_name
    params['query_template'] = query_template
    params['thresh'] = threshold
    params['must'] = must